3 from hashlib import sha1
5 from django.conf import settings
6 from django.contrib.sites.models import Site
7 from django.core.cache import cache
8 from django.db.models.options import get_verbose_name as convert_camelcase
9 from django.utils import simplejson as json
10 from django.utils.http import urlquote_plus
11 from django.utils.safestring import mark_safe
12 from django.utils.text import capfirst
13 from django.template import loader, Context, Template, TemplateDoesNotExist
15 from philo.contrib.sobol.utils import make_tracking_querydict, RegistryIterator
18 if getattr(settings, 'SOBOL_USE_EVENTLET', False):
20 from eventlet.green import urllib2
28 'Result', 'BaseSearch', 'DatabaseSearch', 'URLSearch', 'JSONSearch', 'GoogleSearch', 'SearchRegistry', 'registry', 'get_search_instance'
32 SEARCH_CACHE_SEED = 'philo_sobol_search_results'
33 USE_CACHE = getattr(settings, 'SOBOL_USE_CACHE', True)
class RegistrationError(Exception):
    """Signals that a search could not be registered with, or unregistered from, a :class:`SearchRegistry`."""
41 class SearchRegistry(object):
42 """Holds a registry of search types by slug."""
def register(self, search, slug=None):
    """
    Add ``search`` to the registry.

    :param search: The search class to register - generally a subclass of :class:`BaseSearch`
    :param slug: The key the search class is stored under. If ``slug`` is ``None`` (or otherwise falsy), ``search.slug`` is used instead.
    :raises: :class:`RegistrationError` if the slug is already taken by a search that was defined in a different module.

    """
    slug = slug or search.slug
    if slug not in self._registry:
        self._registry[slug] = search
        return

    # The slug is taken. Only a search coming from another module counts as
    # a conflict; otherwise the existing entry is left as it is.
    current = self._registry[slug]
    if current.__module__ != search.__module__:
        raise RegistrationError("A different search is already registered as `%s`" % slug)
def unregister(self, search, slug=None):
    """
    Remove ``search`` from the registry.

    :param search: The search class to unregister - generally a subclass of :class:`BaseSearch`
    :param slug: If provided, the search is only removed if it is registered with ``slug``. If not provided, the search class is removed from every slug it is registered with.
    :raises: :class:`RegistrationError` if a slug is provided but the search registered with that slug is not ``search``.

    """
    if slug is not None:
        if slug in self._registry and self._registry[slug] == search:
            del self._registry[slug]
            # Success: return here so the error below is only raised for
            # a genuine mismatch.
            return
        raise RegistrationError("`%s` is not registered as `%s`" % (search, slug))

    # Collect the matching slugs first. The loop variables must not reuse
    # the name ``search`` (that would hide the class being removed), and the
    # registry must not change size while it is being iterated.
    matching_slugs = [
        registered_slug
        for registered_slug, registered in self._registry.items()
        if registered == search
    ]
    for registered_slug in matching_slugs:
        del self._registry[registered_slug]
83 """Returns a list of (slug, search) items in the registry."""
84 return self._registry.items()
87 """Returns an iterator over the (slug, search) pairs in the registry."""
88 return RegistryIterator(self._registry, 'iteritems')
def iterchoices(self):
    """Returns an iterator over (slug, search.verbose_name) pairs for the registry, suitable for use as choices."""
    def as_choice(item):
        # ``item`` is one (slug, search) pair from the registry.
        return (item[0], item[1].verbose_name)

    return RegistryIterator(self._registry, 'iteritems', as_choice)
94 def __getitem__(self, key):
95 """Returns the search registered with ``key``."""
96 return self._registry[key]
99 """Returns an iterator over the keys in the registry."""
100 return self._registry.__iter__()
103 registry = SearchRegistry()
def _make_cache_key(search, search_arg):
    """
    Builds the cache key for a search/search argument combination.

    The key is the SHA-1 hex digest of ``SEARCH_CACHE_SEED``, ``search.slug`` and ``search_arg`` concatenated in that order.

    :param search: A search class or instance; only its ``slug`` is used.
    :param search_arg: The string which is being searched for.
    :returns: The hexadecimal digest as a string.

    """
    raw_key = SEARCH_CACHE_SEED + search.slug + search_arg
    # sha1 only accepts bytes. Text is encoded explicitly so that search
    # arguments with non-ASCII characters do not fail; UTF-8 leaves the
    # digest of pure-ASCII input unchanged.
    if not isinstance(raw_key, bytes):
        raw_key = raw_key.encode('utf-8')
    return sha1(raw_key).hexdigest()
110 def get_search_instance(slug, search_arg):
111 """Returns a search instance for the given slug, either from the cache or newly-instantiated."""
112 search = registry[slug]
113 search_arg = search_arg.lower()
115 key = _make_cache_key(search, search_arg)
116 cached = cache.get(key)
119 instance = search(search_arg)
125 class Result(object):
127 :class:`Result` is a helper class that, given a search and a result of that search, is able to correctly render itself with a template defined by the search. Every :class:`Result` will pass a ``title``, a ``url`` (if applicable), and the raw ``result`` returned by the search into the template context when rendering.
129 :param search: An instance of a :class:`BaseSearch` subclass or an object that implements the same API.
130 :param result: An arbitrary result from the ``search``.
133 def __init__(self, search, result):
138 """Returns the title of the result by calling :meth:`BaseSearch.get_result_title` on the raw result."""
139 return self.search.get_result_title(self.result)
142 """Returns the url of the result or ``None`` by calling :meth:`BaseSearch.get_result_url` on the raw result. This url will contain a querystring which, if used, will track a :class:`.Click` for the actual url."""
143 return self.search.get_result_url(self.result)
def get_actual_url(self):
    """Asks the search for the actual url of the raw result via :meth:`BaseSearch.get_actual_result_url` and returns it."""
    raw_result = self.result
    return self.search.get_actual_result_url(raw_result)
def get_content(self):
    """Asks the search for the content of the raw result via :meth:`BaseSearch.get_result_content` and returns it."""
    raw_result = self.result
    return self.search.get_result_content(raw_result)
def get_template(self):
    """Asks the search for the template used to render this :class:`Result` via :meth:`BaseSearch.get_result_template` and returns it."""
    raw_result = self.result
    return self.search.get_result_template(raw_result)
157 def get_context(self):
159 Returns the context dictionary for the result. This is used both in rendering the result and in the AJAX return value for :meth:`.SearchView.ajax_api_view`. The context will contain the following keys:
162 The result of calling :meth:`get_title`
164 The result of calling :meth:`get_url`
166 The result of calling :meth:`get_content`
169 if not hasattr(self, '_context'):
171 'title': self.get_title(),
172 'url': self.get_url(),
173 'actual_url': self.get_actual_url(),
174 'content': self.get_content()
179 """Returns the template from :meth:`get_template` rendered with the context from :meth:`get_context`."""
180 t = self.get_template()
181 c = Context(self.get_context())
184 def __unicode__(self):
185 """Returns :meth:`render`"""
class BaseSearchMetaclass(type):
    """
    Metaclass which supplies a default ``verbose_name`` and ``slug`` for classes that do not define them.

    Both defaults are derived from the class name. A trailing ``Search`` is dropped first, but only if something is left afterwards, so a name without the suffix (or a class named just ``Search``) never ends up with an empty ``verbose_name`` or ``slug``.
    """
    def __new__(cls, name, bases, attrs):
        suffix = "Search"
        # Strip the suffix only when the name has more to offer than the
        # suffix itself; the same rule is applied to both defaults.
        has_suffix = name.endswith(suffix) and len(name) > len(suffix)

        if 'verbose_name' not in attrs:
            # convert_camelcase is expected to yield space-separated words.
            words = convert_camelcase(name)
            if has_suffix:
                words = words.rsplit(' ', 1)[0]
            attrs['verbose_name'] = capfirst(words)

        if 'slug' not in attrs:
            base_name = name[:-len(suffix)] if has_suffix else name
            attrs['slug'] = base_name.lower()

        return super(BaseSearchMetaclass, cls).__new__(cls, name, bases, attrs)
198 class BaseSearch(object):
200 Defines a generic search api. Accessing :attr:`results` will attempt to retrieve cached results and, if that fails, will initiate a new search and store the results in the cache. Each search has a ``verbose_name`` and a ``slug``. If these are not provided as attributes, they will be automatically generated based on the name of the class.
202 :param search_arg: The string which is being searched for.
205 __metaclass__ = BaseSearchMetaclass
206 #: The number of results to return from the complete list. Default: 5
#: How long the items for the search should be cached, in seconds (the value is passed as the timeout to ``cache.set``). Default: 48 hours.
_cache_timeout = 60 * 60 * 48
210 #: The path to the template which will be used to render the :class:`Result`\ s for this search. If this is ``None``, then the framework will try ``sobol/search/<slug>/result.html`` and ``sobol/search/result.html``.
211 result_template = None
212 #: The path to the template which will be used to generate the title of the :class:`Result`\ s for this search. If this is ``None``, then the framework will try ``sobol/search/<slug>/title.html`` and ``sobol/search/title.html``.
213 title_template = None
214 #: The path to the template which will be used to generate the content of the :class:`Result`\ s for this search. If this is ``None``, then the framework will try ``sobol/search/<slug>/content.html`` and ``sobol/search/content.html``.
215 content_template = None
217 def __init__(self, search_arg):
218 self.search_arg = search_arg
222 """Retrieves cached results or initiates a new search via :meth:`get_results` and caches the results."""
223 if not hasattr(self, '_results'):
225 # Cache one extra result so we can see if there are
226 # more results to be had.
227 limit = self.result_limit
228 if limit is not None:
230 results = self.get_results(limit)
234 # On exceptions, don't set any cache; just return.
237 self._results = results
240 for result in results:
242 key = _make_cache_key(self, self.search_arg)
243 cache.set(key, self, self._cache_timeout)
def get_results(self, limit=None, result_class=Result):
    """
    Runs :meth:`search` and wraps every raw result in a ``result_class`` instance.

    :param limit: Passed directly to :meth:`search`.
    :param result_class: The class used to represent the results. It is instantiated with this search instance and the raw result from the search.
    :returns: A list of ``result_class`` instances, in the order :meth:`search` produced the raw results.

    """
    wrapped = []
    for raw_result in self.search(limit):
        wrapped.append(result_class(self, raw_result))
    return wrapped
def search(self, limit=None):
    """
    Returns an iterable of up to ``limit`` results. Must be implemented by subclasses.

    The individual items are interpreted through :meth:`get_result_title`, :meth:`get_result_url`, :meth:`get_result_template`, and :meth:`get_result_extra_context`, so an item can be an object with attributes as easily as a dictionary with keys. Keep in mind, however, that the raw results will be stored with django's caching mechanisms and will be converted to JSON.

    """
    raise NotImplementedError
def get_actual_result_url(self, result):
    """
    Returns the actual URL for the ``result``, or ``None`` if the result has no URL.

    This base implementation always raises :exc:`NotImplementedError`; subclasses must provide it.

    """
    raise NotImplementedError
266 def get_result_querydict(self, result):
267 """Returns a querydict for tracking selection of the result, or ``None`` if there is no URL for the result."""
268 url = self.get_actual_result_url(result)
271 return make_tracking_querydict(self.search_arg, url)
273 def get_result_url(self, result):
274 """Returns ``None`` or a url which, when accessed, will register a :class:`.Click` for that url."""
275 qd = self.get_result_querydict(result)
278 return "?%s" % qd.urlencode()
def get_result_title(self, result):
    """
    Returns the title of the ``result``, rendered from a template with the result in the context as ``result``.

    If :attr:`title_template` is set, it is used; otherwise ``sobol/search/<slug>/title.html`` and ``sobol/search/title.html`` are tried. Subclasses may also simply override this method. If no template can be found, this will raise :exc:`TemplateDoesNotExist`.

    """
    template_names = self.title_template or [
        'sobol/search/%s/title.html' % self.slug,
        'sobol/search/title.html',
    ]
    context = {'result': result}
    return loader.render_to_string(template_names, context)
def get_result_content(self, result):
    """
    Returns the content for the ``result``, rendered from a template with the result in the context as ``result``.

    If :attr:`content_template` is set, it is used; otherwise ``sobol/search/<slug>/content.html`` and ``sobol/search/content.html`` are tried. Subclasses may also simply override this method. If no template is found, this will return an empty string.

    """
    context = {'result': result}
    try:
        template_names = self.content_template or [
            'sobol/search/%s/content.html' % self.slug,
            'sobol/search/content.html',
        ]
        return loader.render_to_string(template_names, context)
    except TemplateDoesNotExist:
        # Content is optional, so a missing template is not an error.
        return ""
def get_result_template(self, result):
    """
    Returns the template to be used for rendering the ``result``.

    If :attr:`result_template` is set, that template is loaded. Otherwise ``sobol/search/<slug>/result.html`` is tried first, falling back on ``sobol/search/result.html`` - for a search with slug ``google``, that is ``sobol/search/google/result.html`` followed by ``sobol/search/result.html``.

    """
    if not self.result_template:
        candidates = [
            'sobol/search/%s/result.html' % self.slug,
            'sobol/search/result.html',
        ]
        return loader.select_template(candidates)
    return loader.get_template(self.result_template)
def has_more_results(self):
    """
    Returns ``True`` if there are more results than :attr:`result_limit` and ``False`` otherwise.

    When :attr:`result_limit` is ``None`` there is no limit to exceed, so the answer is always ``False``.

    """
    limit = self.result_limit
    if limit is None:
        # Comparing a length against None is always true on Python 2 and a
        # TypeError on Python 3; without a limit there cannot be "more".
        return False
    return len(self.results) > limit
311 def get_actual_more_results_url(self):
312 """Returns the actual url for more results. By default, simply returns ``None``."""
315 def get_more_results_querydict(self):
316 """Returns a :class:`QueryDict` for tracking whether people click on a 'more results' link."""
317 url = self.get_actual_more_results_url()
319 return make_tracking_querydict(self.search_arg, url)
323 def more_results_url(self):
324 """Returns a URL which consists of a querystring which, when accessed, will log a :class:`.Click` for the actual URL."""
325 qd = self.get_more_results_querydict()
328 return "?%s" % qd.urlencode()
330 def __unicode__(self):
331 return self.verbose_name
334 class DatabaseSearch(BaseSearch):
335 """Implements :meth:`~BaseSearch.search` and :meth:`get_queryset` methods to handle database queries."""
336 #: The model which should be searched by the :class:`DatabaseSearch`.
339 def search(self, limit=None):
340 if not hasattr(self, '_qs'):
341 self._qs = self.get_queryset()
342 if limit is not None:
343 self._qs = self._qs[:limit]
def get_queryset(self):
    """
    Returns a :class:`QuerySet` of all instances of :attr:`model`, taken from the model's default manager.

    Subclasses should override this method to specify how the search is actually carried out for the model.

    """
    manager = self.model._default_manager
    return manager.all()
352 class URLSearch(BaseSearch):
353 """Defines a generic interface for searches that require accessing a certain url to get search results."""
354 #: The base URL which will be accessed to get the search results.
356 #: The url-encoded query string to be used for fetching search results from :attr:`search_url`. Must have one ``%s`` to contain the search argument.
357 query_format_str = "%s"
361 """The URL where the search gets its results. Composed from :attr:`search_url` and :attr:`query_format_str`."""
362 return self.search_url + self.query_format_str % urlquote_plus(self.search_arg)
364 def get_actual_more_results_url(self):
def parse_response(self, response, limit=None):
    """
    Turns the ``response`` obtained by opening :attr:`url` (with :func:`urllib2.urlopen`) into a list of up to ``limit`` results.

    This base implementation always raises :exc:`NotImplementedError`; subclasses must provide the parsing.

    """
    raise NotImplementedError
def search(self, limit=None):
    """
    Opens :attr:`url` and returns the results that :meth:`parse_response` extracts from the response.

    :param limit: Passed on to :meth:`parse_response` as the maximum number of results.

    """
    response = urllib2.urlopen(self.url)
    try:
        return self.parse_response(response, limit=limit)
    finally:
        # Release the connection even when parsing fails.
        response.close()
class JSONSearch(URLSearch):
    """Makes a GET request and decodes the response body as JSON. By default the decoded document is assumed to be a list of results, of which the first ``limit`` are returned."""
    def parse_response(self, response, limit=None):
        body = response.read()
        decoded = json.loads(body)
        return decoded[:limit]
381 class GoogleSearch(JSONSearch):
382 """An example implementation of a :class:`JSONSearch`."""
383 search_url = "http://ajax.googleapis.com/ajax/services/search/web"
385 verbose_name = "Google search (current site)"
386 _more_results_url = None
389 def query_format_str(self):
390 default_args = self.default_args
393 return "?v=1.0&q=%s%%s" % urlquote_plus(default_args).replace('%', '%%')
396 def default_args(self):
397 """Unquoted default arguments for the :class:`GoogleSearch`."""
398 return "site:%s" % Site.objects.get_current().domain
400 def parse_response(self, response, limit=None):
401 responseData = json.loads(response.read())['responseData']
402 results, cursor = responseData['results'], responseData['cursor']
405 self._more_results_url = cursor['moreResultsUrl']
406 self._estimated_result_count = cursor['estimatedResultCount']
408 return results[:limit]
412 # Google requires that an ajax request have a proper Referer header.
413 return urllib2.Request(
414 super(GoogleSearch, self).url,
416 {'Referer': "http://%s" % Site.objects.get_current().domain}
420 def has_more_results(self):
421 if self.results and len(self.results) < self._estimated_result_count:
def get_actual_more_results_url(self):
    """Returns the value currently stored in ``_more_results_url``."""
    more_url = self._more_results_url
    return more_url
def get_actual_result_url(self, result):
    """Returns the ``unescapedUrl`` entry of the raw ``result``."""
    actual_url = result['unescapedUrl']
    return actual_url
def get_result_title(self, result):
    """Returns the ``titleNoFormatting`` entry of the raw ``result``, passed through ``mark_safe``."""
    title = result['titleNoFormatting']
    return mark_safe(title)
def get_result_content(self, result):
    """Returns the ``content`` entry of the raw ``result``, passed through ``mark_safe``."""
    content = result['content']
    return mark_safe(content)
438 registry.register(GoogleSearch)
442 from BeautifulSoup import BeautifulSoup, SoupStrainer, BeautifulStoneSoup
446 __all__ += ('ScrapeSearch', 'XMLSearch',)
447 class ScrapeSearch(URLSearch):
448 """A base class for scrape-style searching, available if :mod:`BeautifulSoup` is installed."""
449 #: Arguments to be passed into a :class:`SoupStrainer`.
451 #: Keyword arguments to be passed into a :class:`SoupStrainer`.
457 Caches and returns a :class:`SoupStrainer` initialized with :attr:`strainer_args` and :attr:`strainer_kwargs`. This strainer will be used to parse only certain parts of the document.
459 .. seealso:: `BeautifulSoup: Improving Performance by Parsing Only Part of the Document <http://www.crummy.com/software/BeautifulSoup/documentation.html#Improving%20Performance%20by%20Parsing%20Only%20Part%20of%20the%20Document>`_
462 if not hasattr(self, '_strainer'):
463 self._strainer = SoupStrainer(*self.strainer_args, **self.strainer_kwargs)
464 return self._strainer
def parse_response(self, response, limit=None):
    """Parses the ``response`` with :class:`BeautifulSoup`, restricted by :attr:`strainer`, and passes up to ``limit`` top-level matches to :meth:`parse_results`."""
    only_these = self.strainer
    soup = BeautifulSoup(response, parseOnlyThese=only_these)
    top_level_matches = soup.findAll(recursive=False, limit=limit)
    return self.parse_results(top_level_matches)
def parse_results(self, results):
    """
    Hook for turning the strained ``results`` into the search's actual results.

    There is no default behavior: the results absolutely must be parsed to properly extract the information, so subclasses must implement this. The base implementation raises :exc:`NotImplementedError`.

    .. seealso:: `BeautifulSoup: Improving Memory Usage with extract <http://www.crummy.com/software/BeautifulSoup/documentation.html#Improving%20Memory%20Usage%20with%20extract>`_

    """
    raise NotImplementedError
class XMLSearch(ScrapeSearch):
    """A base class for searches whose results come back as XML."""
    #: Self-closing tag names to be used when interpreting the XML document
    #:
    #: .. seealso:: `BeautifulSoup: Parsing XML <http://www.crummy.com/software/BeautifulSoup/documentation.html#Parsing%20XML>`_
    self_closing_tags = []

    def parse_response(self, response, limit=None):
        """Parses the ``response`` with :class:`BeautifulStoneSoup`, restricted by :attr:`strainer`, and passes up to ``limit`` top-level matches to :meth:`parse_results`."""
        only_these = self.strainer
        soup = BeautifulStoneSoup(
            response,
            selfClosingTags=self.self_closing_tags,
            parseOnlyThese=only_these
        )
        top_level_matches = soup.findAll(recursive=False, limit=limit)
        return self.parse_results(top_level_matches)