import datetime

from django.conf import settings
from django.contrib.sites.models import Site
from django.core.cache import cache
from django.db.models.options import get_verbose_name as convert_camelcase
from django.utils import simplejson as json
from django.utils.http import urlquote_plus
from django.utils.safestring import mark_safe
from django.utils.text import capfirst
from django.template import loader, Context, Template

from philo.contrib.sobol.utils import make_tracking_querydict

# Prefer eventlet's non-blocking urllib2 when it is available; fall back to
# the standard library otherwise.
try:
    from eventlet.green import urllib2
except ImportError:
    import urllib2


__all__ = (
    'Result', 'BaseSearch', 'DatabaseSearch', 'URLSearch', 'JSONSearch', 'GoogleSearch', 'registry'
)


SEARCH_CACHE_KEY = 'philo_sobol_search_results'
DEFAULT_RESULT_TEMPLATE_STRING = "{% if url %}<a href='{{ url }}'>{% endif %}{{ title }}{% if url %}</a>{% endif %}"

# Determines the timeout (in seconds) on the entire result cache.
MAX_CACHE_TIMEOUT = 60*60*24*7


class RegistrationError(Exception):
    pass


class SearchRegistry(object):
    # Holds a registry of search types by slug.
    def __init__(self):
        self._registry = {}

    def register(self, search, slug=None):
        slug = slug or search.slug
        if slug in self._registry:
            if self._registry[slug] != search:
                raise RegistrationError("A different search is already registered as `%s`" % slug)
        else:
            self._registry[slug] = search

    def unregister(self, search, slug=None):
        if slug is not None:
            if slug in self._registry and self._registry[slug] == search:
                del self._registry[slug]
            else:
                raise RegistrationError("`%s` is not registered as `%s`" % (search, slug))
        else:
            # With no slug given, remove every registration of this search.
            for slug, registered in self._registry.items():
                if registered == search:
                    del self._registry[slug]

    def items(self):
        return self._registry.items()

    def iteritems(self):
        return self._registry.iteritems()

    def iterchoices(self):
        for slug, search in self.iteritems():
            yield slug, search.verbose_name

    def __getitem__(self, key):
        return self._registry[key]


registry = SearchRegistry()
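
# Usage sketch (hypothetical names; not part of this module): a custom search
# class can be registered with the shared registry and looked up by slug::
#
#     registry.register(MyAppSearch, slug='myapp')
#     search_class = registry['myapp']
#     choices = list(registry.iterchoices())  # e.g. for a form field's choices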


class Result(object):
    """
    A Result is instantiated with a search instance and a single raw result
    returned by that search. The raw result is expected to provide a `title`
    and optionally a `url`; any other data it carries will be made available
    to the result template through the search's get_result_extra_context.
    """
    def __init__(self, search, result):
        self.search = search
        self.result = result

    def get_title(self):
        return self.search.get_result_title(self.result)

    def get_url(self):
        return self.search.get_result_querydict(self.result).urlencode()

    def get_template(self):
        return self.search.get_result_template(self.result)

    def get_extra_context(self):
        return self.search.get_result_extra_context(self.result)

    def render(self):
        t = self.get_template()
        c = Context(self.get_extra_context())
        c.update({
            'title': self.get_title(),
            'url': self.get_url()
        })
        return t.render(c)

    def __unicode__(self):
        return self.render()


class BaseSearchMetaclass(type):
    def __new__(cls, name, bases, attrs):
        # Derive a human-readable name and a slug from the class name when
        # the subclass doesn't define them explicitly.
        if 'verbose_name' not in attrs:
            attrs['verbose_name'] = capfirst(convert_camelcase(name))
        if 'slug' not in attrs:
            attrs['slug'] = name.lower()
        return super(BaseSearchMetaclass, cls).__new__(cls, name, bases, attrs)


class BaseSearch(object):
    """
    Defines a generic search interface. Accessing self.results will
    attempt to retrieve cached results and, if that fails, will
    initiate a new search and store the results in the cache.
    """
    __metaclass__ = BaseSearchMetaclass
    # Default maximum number of results to display; subclasses may override.
    result_limit = 10
    # Cached results are considered fresh for this many minutes.
    _cache_timeout = 60*48

    def __init__(self, search_arg):
        self.search_arg = search_arg

    def _get_cached_results(self):
        """Return the cached results if the results haven't timed out. Otherwise return None."""
        result_cache = cache.get(SEARCH_CACHE_KEY)
        if result_cache and self.__class__ in result_cache and self.search_arg.lower() in result_cache[self.__class__]:
            cached = result_cache[self.__class__][self.search_arg.lower()]
            if cached['timeout'] >= datetime.datetime.now():
                return cached['results']
        return None

    def _set_cached_results(self, results, timeout):
        """Sets the results to the cache for <timeout> minutes."""
        result_cache = cache.get(SEARCH_CACHE_KEY) or {}
        cached = result_cache.setdefault(self.__class__, {}).setdefault(self.search_arg.lower(), {})
        cached.update({
            'results': results,
            'timeout': datetime.datetime.now() + datetime.timedelta(minutes=timeout)
        })
        cache.set(SEARCH_CACHE_KEY, result_cache, MAX_CACHE_TIMEOUT)
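
    # The cache entry stored under SEARCH_CACHE_KEY is a nested dict keyed
    # first by search class and then by the lowercased search argument,
    # e.g. (illustrative values only):
    #
    #     {GoogleSearch: {'django': {'results': [...],
    #                                'timeout': datetime.datetime(...)}}}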

    @property
    def results(self):
        if not hasattr(self, '_results'):
            results = self._get_cached_results()
            if results is None:
                try:
                    # Cache one extra result so we can see if there are
                    # more results to be had.
                    limit = self.result_limit
                    if limit is not None:
                        limit += 1
                    results = self.get_results(limit)
                except Exception:
                    if settings.DEBUG:
                        raise
                    # On exceptions, don't set any cache; just return.
                    return []

                self._set_cached_results(results, self._cache_timeout)
            self._results = results

        return self._results

    def get_results(self, limit=None, result_class=Result):
        """
        Calls self.search() and parses the return value into Result objects.
        """
        results = self.search(limit)
        return [result_class(self, result) for result in results]

    def search(self, limit=None):
        """
        Returns an iterable of up to <limit> results. The
        get_result_title, get_result_url, get_result_template, and
        get_result_extra_context methods will be used to interpret the
        individual items that this function returns, so a result can be an
        object with attributes just as easily as a dictionary with keys.
        The only restriction is that the objects be picklable so that they
        can be used with Django's cache system.
        """
        raise NotImplementedError

    def get_result_title(self, result):
        "Subclasses override this to provide the title for the result."
        raise NotImplementedError

    def get_result_url(self, result):
        "Subclasses override this to provide the actual URL for the result."
        raise NotImplementedError

    def get_result_querydict(self, result):
        return make_tracking_querydict(self.search_arg, self.get_result_url(result))

    def get_result_template(self, result):
        if hasattr(self, 'result_template'):
            return loader.get_template(self.result_template)
        if not hasattr(self, '_result_template'):
            self._result_template = Template(DEFAULT_RESULT_TEMPLATE_STRING)
        return self._result_template

    def get_ajax_result_template(self, result):
        return getattr(self, 'ajax_result_template', DEFAULT_RESULT_TEMPLATE_STRING)

    def get_result_extra_context(self, result):
        return {}

    def has_more_results(self):
        """Useful to determine whether to display a `view more results` link."""
        return len(self.results) > self.result_limit

    @property
    def more_results_url(self):
        """
        Returns the actual URL for more results. This will be encoded
        into a querystring for tracking purposes.
        """
        raise NotImplementedError

    @property
    def more_results_querydict(self):
        return make_tracking_querydict(self.search_arg, self.more_results_url)

    def __unicode__(self):
        return ' '.join(self.__class__.verbose_name.rsplit(' ', 1)[:-1]) + ' results'
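
# A minimal sketch of a BaseSearch subclass (hypothetical; not part of this
# module). It searches a hard-coded list of dictionaries, which satisfies the
# picklability requirement described in BaseSearch.search::
#
#     class StaticSearch(BaseSearch):
#         _pages = [
#             {'title': 'Home', 'url': '/'},
#             {'title': 'About', 'url': '/about/'},
#         ]
#
#         def search(self, limit=None):
#             matches = [p for p in self._pages
#                        if self.search_arg.lower() in p['title'].lower()]
#             return matches[:limit]
#
#         def get_result_title(self, result):
#             return result['title']
#
#         def get_result_url(self, result):
#             return result['url']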


class DatabaseSearch(BaseSearch):
    model = None

    def has_more_results(self):
        return self.get_queryset().count() > self.result_limit

    def search(self, limit=None):
        if not hasattr(self, '_qs'):
            self._qs = self.get_queryset()
            if limit is not None:
                self._qs = self._qs[:limit]

        return self._qs

    def get_queryset(self):
        return self.model._default_manager.all()
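
# A sketch of a DatabaseSearch subclass (hypothetical model and fields)::
#
#     class NewsArticleSearch(DatabaseSearch):
#         model = NewsArticle
#
#         def get_queryset(self):
#             return NewsArticle.objects.filter(title__icontains=self.search_arg)
#
#         def get_result_title(self, result):
#             return result.title
#
#         def get_result_url(self, result):
#             return result.get_absolute_url()
#
#     registry.register(NewsArticleSearch, slug='news')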


class URLSearch(BaseSearch):
    """
    Defines a generic interface for searches that require accessing a
    certain url to get search results.
    """
    search_url = ''
    query_format_str = "%s"

    @property
    def url(self):
        "The URL where the search gets its results."
        return self.search_url + self.query_format_str % urlquote_plus(self.search_arg)

    @property
    def more_results_url(self):
        "The URL where the users would go to get more results."
        return self.url

    def parse_response(self, response, limit=None):
        raise NotImplementedError

    def search(self, limit=None):
        return self.parse_response(urllib2.urlopen(self.url), limit=limit)


class JSONSearch(URLSearch):
    """
    Makes a GET request and parses the results as JSON. The default
    behavior assumes that the return value is a list of results.
    """
    def parse_response(self, response, limit=None):
        return json.loads(response.read())[:limit]


class GoogleSearch(JSONSearch):
    search_url = "http://ajax.googleapis.com/ajax/services/search/web"
    query_format_str = "?v=1.0&q=%s"
    # TODO: Change this template to reflect the app's actual name.
    result_template = 'search/googlesearch.html'
    _more_results_url = None

    def parse_response(self, response, limit=None):
        responseData = json.loads(response.read())['responseData']
        results, cursor = responseData['results'], responseData['cursor']

        if results:
            self._more_results_url = cursor['moreResultsUrl']
            # estimatedResultCount is returned as a string by the API.
            self._estimated_result_count = int(cursor['estimatedResultCount'])

        return results[:limit]

    @property
    def url(self):
        # Google requires that an ajax request have a proper Referer header.
        return urllib2.Request(
            super(GoogleSearch, self).url,
            None,
            {'Referer': "http://%s" % Site.objects.get_current().domain}
        )

    def has_more_results(self):
        if self.results and len(self.results) < self._estimated_result_count:
            return True
        return False

    @property
    def more_results_url(self):
        return self._more_results_url

    def get_result_title(self, result):
        return result['titleNoFormatting']

    def get_result_url(self, result):
        return result['unescapedUrl']

    def get_result_extra_context(self, result):
        return result


registry.register(GoogleSearch)
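
# Usage sketch (not part of this module): a view or template tag could run a
# registered search and render its results roughly like this::
#
#     search = GoogleSearch("django philo")
#     for result in search.results:   # hits the cache on repeated lookups
#         html = result.render()
#     if search.has_more_results():
#         more_url = "?%s" % search.more_results_querydict.urlencode()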


# ScrapeSearch and XMLSearch are only available if BeautifulSoup is installed.
try:
    from BeautifulSoup import BeautifulSoup, SoupStrainer, BeautifulStoneSoup
except ImportError:
    pass
else:
    __all__ += ('ScrapeSearch', 'XMLSearch',)

    class ScrapeSearch(URLSearch):
        _strainer_args = []
        _strainer_kwargs = {}

        @property
        def strainer(self):
            if not hasattr(self, '_strainer'):
                self._strainer = SoupStrainer(*self._strainer_args, **self._strainer_kwargs)
            return self._strainer

        def parse_response(self, response, limit=None):
            strainer = self.strainer
            soup = BeautifulSoup(response, parseOnlyThese=strainer)
            return self.parse_results(soup[:limit])

        def parse_results(self, results):
            """
            Provides a hook for parsing the results of straining. This
            has no default behavior because the results absolutely
            must be parsed to properly extract the information.
            For more information, see
            http://www.crummy.com/software/BeautifulSoup/documentation.html#Improving%20Memory%20Usage%20with%20extract
            """
            raise NotImplementedError
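
    # A sketch of a ScrapeSearch subclass (hypothetical site and markup). The
    # strainer restricts parsing to the interesting tags, and parse_results
    # turns each tag into a picklable dict::
    #
    #     class ExampleScrapeSearch(ScrapeSearch):
    #         search_url = "http://example.com/search"
    #         query_format_str = "?q=%s"
    #         _strainer_args = ['a']
    #         _strainer_kwargs = {'attrs': {'class': 'result'}}
    #
    #         def parse_results(self, results):
    #             return [{'title': a.string, 'url': a['href']} for a in results]
    #
    #         def get_result_title(self, result):
    #             return result['title']
    #
    #         def get_result_url(self, result):
    #             return result['url']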

    class XMLSearch(ScrapeSearch):
        _self_closing_tags = []

        def parse_response(self, response, limit=None):
            strainer = self.strainer
            soup = BeautifulStoneSoup(response, selfClosingTags=self._self_closing_tags, parseOnlyThese=strainer)
            return self.parse_results(soup[:limit])
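
    # A sketch of an XMLSearch subclass for a hypothetical feed (names and
    # markup are illustrative only)::
    #
    #     class ExampleFeedSearch(XMLSearch):
    #         search_url = "http://example.com/feed/"
    #         query_format_str = "?q=%s"
    #         _strainer_args = ['item']
    #
    #         def parse_results(self, results):
    #             return [{'title': item.title.string, 'url': item.link.string}
    #                     for item in results]
    #
    #         def get_result_title(self, result):
    #             return result['title']
    #
    #         def get_result_url(self, result):
    #             return result['url']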