3 from hashlib import sha1
5 from django.conf import settings
6 from django.contrib.sites.models import Site
7 from django.core.cache import cache
8 from django.db.models.options import get_verbose_name as convert_camelcase
9 from django.utils import simplejson as json
10 from django.utils.http import urlquote_plus
11 from django.utils.safestring import mark_safe
12 from django.utils.text import capfirst
13 from django.template import loader, Context, Template
15 from philo.contrib.sobol.utils import make_tracking_querydict, RegistryIterator
# Select the urllib2 implementation. The eventlet variant is only attempted
# when the project opts in through ``settings.SOBOL_USE_EVENTLET``.
# NOTE(review): the fallback imports are not visible in the extracted listing;
# they are restored because ``urllib2`` is used unconditionally further down
# (``URLSearch.search``, ``GoogleSearch.url``) - confirm against the original.
if getattr(settings, 'SOBOL_USE_EVENTLET', False):
    try:
        from eventlet.green import urllib2
    except ImportError:
        import urllib2
else:
    import urllib2


__all__ = (
    'Result', 'BaseSearch', 'DatabaseSearch', 'URLSearch', 'JSONSearch', 'GoogleSearch', 'SearchRegistry', 'registry', 'get_search_instance'
)


#: Constant prefix mixed into every cache key built by :func:`_make_cache_key`.
SEARCH_CACHE_SEED = 'philo_sobol_search_results'
#: Read from the ``SOBOL_USE_SEARCH`` setting; defaults to ``True``.
#: NOTE(review): presumably toggles caching of search instances, yet the setting
#: is named "SEARCH" rather than "CACHE" - confirm the setting name is intended.
USE_CACHE = getattr(settings, 'SOBOL_USE_SEARCH', True)
class RegistrationError(Exception):
    """Raised if there is a problem registering a search with, or unregistering it from, a :class:`SearchRegistry`."""
class SearchRegistry(object):
    """Holds a registry of search types by slug."""

    def __init__(self):
        # Maps each slug to the search class registered under it.
        self._registry = {}

    def register(self, search, slug=None):
        """
        Register a search with the registry.

        :param search: The search class to register - generally a subclass of :class:`BaseSearch`
        :param slug: The slug which will be used to register the search class. If ``slug`` is ``None``, the search's default slug will be used.
        :raises: :class:`RegistrationError` if a different search is already registered with ``slug``.

        """
        slug = slug or search.slug
        if slug in self._registry:
            registered = self._registry[slug]
            # "Different" is decided by defining module only.
            if registered.__module__ != search.__module__:
                raise RegistrationError("A different search is already registered as `%s`" % slug)
            # NOTE(review): a search from the same module is treated as already
            # registered and the existing entry is kept - confirm this is the
            # intended handling of repeated registration.
        else:
            self._registry[slug] = search

    def unregister(self, search, slug=None):
        """
        Unregister a search from the registry.

        :param search: The search class to unregister - generally a subclass of :class:`BaseSearch`
        :param slug: If provided, the search will only be removed if it was registered with ``slug``. If not provided, the search class will be unregistered no matter what slug it was registered with.
        :raises: :class:`RegistrationError` if a slug is provided but the search registered with that slug is not ``search``.

        """
        if slug is not None:
            if slug in self._registry and self._registry[slug] == search:
                del self._registry[slug]
                # Return here so a successful removal does not reach the raise.
                return
            raise RegistrationError("`%s` is not registered as `%s`" % (search, slug))

        # Collect the matching slugs first: the loop variables must not shadow
        # the ``search`` argument, and the dict must not change while iterated.
        matching_slugs = [
            registered_slug
            for registered_slug, registered in list(self._registry.items())
            if registered == search
        ]
        for registered_slug in matching_slugs:
            del self._registry[registered_slug]

    def items(self):
        """Returns a list of (slug, search) items in the registry."""
        return self._registry.items()

    def iteritems(self):
        """Returns an iterator over the (slug, search) pairs in the registry."""
        return RegistryIterator(self._registry, 'iteritems')

    def iterchoices(self):
        """Returns an iterator over (slug, search.verbose_name) pairs for the registry."""
        return RegistryIterator(self._registry, 'iteritems', lambda x: (x[0], x[1].verbose_name))

    def __getitem__(self, key):
        """Returns the search registered with ``key``."""
        return self._registry[key]

    def __iter__(self):
        """Returns an iterator over the keys in the registry."""
        return self._registry.__iter__()
#: The module-level :class:`SearchRegistry`; :func:`get_search_instance` looks searches up here by slug.
registry = SearchRegistry()
def _make_cache_key(search, search_arg):
    """
    Returns the hexadecimal SHA-1 digest used as the cache key for ``search`` run with ``search_arg``.

    :param search: A search class or instance; only its ``slug`` attribute is read.
    :param search_arg: The string which is being searched for.

    """
    raw_key = SEARCH_CACHE_SEED + search.slug + search_arg
    try:
        digest = sha1(raw_key)
    except (TypeError, UnicodeEncodeError):
        # Text that cannot be hashed directly (e.g. a search term with
        # non-ASCII characters) is hashed as UTF-8 bytes instead. Keys that
        # hashed successfully before keep exactly the same digest.
        digest = sha1(raw_key.encode('utf-8'))
    return digest.hexdigest()
def get_search_instance(slug, search_arg):
    """
    Returns a search instance for the given slug, either from the cache or newly-instantiated.

    :param slug: The slug the search class is registered under in :data:`registry`.
    :param search_arg: The string which is being searched for. It is lower-cased before it is used for the cache key and passed to the search class.

    """
    search = registry[slug]
    search_arg = search_arg.lower()
    # NOTE(review): the listing omits the short lines of this function; the
    # ``USE_CACHE`` guard, the cache-hit branch and the final return are
    # restored from context. The original may contain further statements
    # after the instantiation - confirm against the original file.
    if USE_CACHE:
        key = _make_cache_key(search, search_arg)
        cached = cache.get(key)
        if cached:
            return cached
    instance = search(search_arg)
    return instance
class Result(object):
    """
    :class:`Result` is a helper class that, given a search and a result of that search, is able to correctly render itself with a template defined by the search. Every :class:`Result` will pass a ``title``, a ``url`` (if applicable), and the raw ``result`` returned by the search into the template context when rendering.

    :param search: An instance of a :class:`BaseSearch` subclass or an object that implements the same API.
    :param result: An arbitrary result from the ``search``.

    """
    def __init__(self, search, result):
        # NOTE(review): these assignments are not visible in the extracted
        # listing; restored because every method reads both attributes.
        self.search = search
        self.result = result

    def get_title(self):
        """Returns the title of the result by calling :meth:`BaseSearch.get_result_title` on the raw result."""
        return self.search.get_result_title(self.result)

    def get_url(self):
        """Returns the url of the result or an empty string by calling :meth:`BaseSearch.get_result_querydict` on the raw result and then encoding the querydict returned."""
        qd = self.search.get_result_querydict(self.result)
        # NOTE(review): the guard is restored from the docstring ("or an empty
        # string"); its exact condition is not visible in the listing.
        if qd is None:
            return ""
        return "?%s" % qd.urlencode()

    def get_template(self):
        """Returns the template for the result by calling :meth:`BaseSearch.get_result_template` on the raw result."""
        return self.search.get_result_template(self.result)

    def get_extra_context(self):
        """Returns any extra context for the result by calling :meth:`BaseSearch.get_result_extra_context` on the raw result."""
        return self.search.get_result_extra_context(self.result)

    def get_context(self):
        """
        Returns the context dictionary for the result. This is used both in rendering the result and in the AJAX return value for :meth:`.SearchView.ajax_api_view`. The context will contain everything from :meth:`get_extra_context` as well as the following keys:

        title
            The result of calling :meth:`get_title`
        url
            The result of calling :meth:`get_url`
        result
            The raw result which the :class:`Result` was instantiated with.

        """
        context = self.get_extra_context()
        # The fixed keys are applied last, so they win over extra context.
        context.update({
            'title': self.get_title(),
            'url': self.get_url(),
            'result': self.result
        })
        return context

    def render(self):
        """Returns the template from :meth:`get_template` rendered with the context from :meth:`get_context`."""
        t = self.get_template()
        c = Context(self.get_context())
        return t.render(c)

    def __unicode__(self):
        """Returns :meth:`render`"""
        return self.render()
class BaseSearchMetaclass(type):
    """
    Metaclass which supplies default ``verbose_name`` and ``slug`` class attributes when a search class does not define them itself.

    * ``verbose_name`` defaults to the class name run through ``convert_camelcase``, minus its last space-separated word, with the first letter capitalized.
    * ``slug`` defaults to the lower-cased class name, without a trailing ``Search`` if the name ends with it.

    """
    def __new__(cls, name, bases, attrs):
        if 'verbose_name' not in attrs:
            words = convert_camelcase(name).rsplit(' ', 1)
            # Everything except the final word makes up the verbose name.
            attrs['verbose_name'] = capfirst(' '.join(words[:-1]))
        if 'slug' not in attrs:
            if name.endswith("Search"):
                default_slug = name[:-6].lower()
            else:
                default_slug = name.lower()
            attrs['slug'] = default_slug
        return super(BaseSearchMetaclass, cls).__new__(cls, name, bases, attrs)
class BaseSearch(object):
    """
    Defines a generic search api. Accessing :attr:`results` will attempt to retrieve cached results and, if that fails, will initiate a new search and store the results in the cache. Each search has a ``verbose_name`` and a ``slug``. If these are not provided as attributes, they will be automatically generated based on the name of the class.

    :param search_arg: The string which is being searched for.

    """
    __metaclass__ = BaseSearchMetaclass
    #: The number of results to return from the complete list. Default: 5
    result_limit = 5
    #: How long the items for the search should be cached (in minutes). Default: 48 hours.
    _cache_timeout = 60*48
    #: The path to the template which will be used to render the :class:`Result`\ s for this search.
    result_template = "sobol/search/basesearch.html"

    def __init__(self, search_arg):
        self.search_arg = search_arg

    @property
    def results(self):
        """Retrieves cached results or initiates a new search via :meth:`get_results` and caches the results."""
        if not hasattr(self, '_results'):
            try:
                # Cache one extra result so we can see if there are
                # more results to be had.
                limit = self.result_limit
                if limit is not None:
                    limit += 1
                results = self.get_results(limit)
            except Exception:
                # NOTE(review): the listing omits the short lines of this
                # handler; the original may do more here (e.g. re-raise in
                # some configurations) - confirm against the original file.
                # On exceptions, don't set any cache; just return.
                return []

            self._results = results

            if USE_CACHE:
                key = _make_cache_key(self, self.search_arg)
                # ``_cache_timeout`` is documented in minutes, while the cache
                # backend expects seconds, so convert before storing.
                cache.set(key, self, self._cache_timeout * 60)

        return self._results

    def get_results(self, limit=None, result_class=Result):
        """
        Calls :meth:`search` and parses the return value into :class:`Result` instances.

        :param limit: Passed directly to :meth:`search`.
        :param result_class: The class used to represent the results. This will be instantiated with the :class:`BaseSearch` instance and the raw result from the search.

        """
        results = self.search(limit)
        return [result_class(self, result) for result in results]

    def search(self, limit=None):
        """Returns an iterable of up to ``limit`` results. The :meth:`get_result_title`, :meth:`get_result_url`, :meth:`get_result_template`, and :meth:`get_result_extra_context` methods will be used to interpret the individual items that this function returns, so the result can be an object with attributes as easily as a dictionary with keys. However, keep in mind that the raw results will be stored with django's caching mechanisms and will be converted to JSON."""
        raise NotImplementedError

    def get_result_title(self, result):
        """Returns the title of the ``result``. Must be implemented by subclasses."""
        raise NotImplementedError

    def get_result_url(self, result):
        """Returns the actual URL for the ``result`` or ``None`` if there is no URL. Must be implemented by subclasses."""
        raise NotImplementedError

    def get_result_querydict(self, result):
        """Returns a querydict for tracking selection of the result, or ``None`` if there is no URL for the result."""
        url = self.get_result_url(result)
        if url is None:
            return None
        return make_tracking_querydict(self.search_arg, url)

    def get_result_template(self, result):
        """Returns the template to be used for rendering the ``result``."""
        return loader.get_template(self.result_template)

    def get_result_extra_context(self, result):
        """Returns any extra context to be used when rendering the ``result``."""
        return {}

    @property
    def has_more_results(self):
        """Returns ``True`` if there are more results than :attr:`result_limit` and ``False`` otherwise."""
        # Without a limit every result is fetched, so nothing is left over.
        if self.result_limit is None:
            return False
        return len(self.results) > self.result_limit

    @property
    def more_results_url(self):
        """Returns the actual url for more results. This should be accessed through :attr:`more_results_querydict` in the template so that the click can be tracked. By default, simply returns ``None``."""
        return None

    @property
    def more_results_querydict(self):
        """Returns a :class:`QueryDict` for tracking whether people click on a 'more results' link."""
        url = self.more_results_url
        if url:
            return make_tracking_querydict(self.search_arg, url)
        return None

    def __unicode__(self):
        return self.verbose_name
class DatabaseSearch(BaseSearch):
    """Implements :meth:`~BaseSearch.search` and :meth:`get_queryset` methods to handle database queries."""
    #: The model which should be searched by the :class:`DatabaseSearch`.
    model = None

    def search(self, limit=None):
        """Returns the queryset from :meth:`get_queryset`, sliced to ``limit`` if one is given. The queryset is stored on the instance as ``_qs`` and reused on later calls."""
        # NOTE(review): the nesting of the ``limit`` check is reconstructed
        # (indentation is lost in the listing) - confirm against the original.
        if not hasattr(self, '_qs'):
            self._qs = self.get_queryset()
            if limit is not None:
                self._qs = self._qs[:limit]

        return self._qs

    def get_queryset(self):
        """Returns a :class:`QuerySet` of all instances of :attr:`model`. This method should be overridden by subclasses to specify how the search should actually be implemented for the model."""
        return self.model._default_manager.all()
class URLSearch(BaseSearch):
    """Defines a generic interface for searches that require accessing a certain url to get search results."""
    #: The base URL which will be accessed to get the search results.
    search_url = ""
    #: The url-encoded query string to be used for fetching search results from :attr:`search_url`. Must have one ``%s`` to contain the search argument.
    query_format_str = "%s"

    @property
    def url(self):
        """The URL where the search gets its results. Composed from :attr:`search_url` and :attr:`query_format_str`."""
        return self.search_url + self.query_format_str % urlquote_plus(self.search_arg)

    @property
    def more_results_url(self):
        # NOTE(review): the return line is not visible in the extracted
        # listing; returning :attr:`url` is a reconstruction - confirm.
        return self.url

    def parse_response(self, response, limit=None):
        """Handles the ``response`` from accessing :attr:`url` (with :func:`urllib2.urlopen`) and returns a list of up to ``limit`` results."""
        raise NotImplementedError

    def search(self, limit=None):
        """
        Opens :attr:`url` and returns whatever :meth:`parse_response` makes of the response.

        :param limit: Passed on to :meth:`parse_response`.

        """
        response = urllib2.urlopen(self.url)
        try:
            return self.parse_response(response, limit=limit)
        finally:
            # Release the connection even if parsing fails; parse_response is
            # expected to return a list, so nothing reads the response later.
            response.close()
class JSONSearch(URLSearch):
    """Makes a GET request and parses the results as JSON. The default behavior assumes that the response contains a list of results."""
    def parse_response(self, response, limit=None):
        """Decodes the body of ``response`` as JSON and returns its first ``limit`` items (all of them if ``limit`` is ``None``)."""
        decoded = json.loads(response.read())
        return decoded[:limit]
class GoogleSearch(JSONSearch):
    """An example implementation of a :class:`JSONSearch`."""
    search_url = "http://ajax.googleapis.com/ajax/services/search/web"
    # NOTE(review): the listing skips one short line here (possibly another
    # class attribute override) - check the original file.
    verbose_name = "Google search (current site)"
    result_template = "sobol/search/googlesearch.html"
    #: Set by :meth:`parse_response` from the response's ``cursor``.
    _more_results_url = None

    @property
    def query_format_str(self):
        """The query string template: the quoted :attr:`default_args` are baked in and one ``%s`` is left for the search argument."""
        default_args = self.default_args
        # NOTE(review): restored from context - separates the default
        # arguments from the search argument that follows them.
        if default_args:
            default_args += " "
        # Percent signs produced by quoting are doubled so that they survive
        # the later ``%`` substitution of the search argument.
        return "?v=1.0&q=%s%%s" % urlquote_plus(default_args).replace('%', '%%')

    @property
    def default_args(self):
        """Unquoted default arguments for the :class:`GoogleSearch`."""
        return "site:%s" % Site.objects.get_current().domain

    def parse_response(self, response, limit=None):
        """Reads ``responseData`` from the JSON body, remembers the cursor's ``moreResultsUrl`` and ``estimatedResultCount``, and returns up to ``limit`` results."""
        responseData = json.loads(response.read())['responseData']
        results, cursor = responseData['results'], responseData['cursor']

        # NOTE(review): the guard line is not visible in the listing; its
        # exact condition is a reconstruction.
        if results:
            self._more_results_url = cursor['moreResultsUrl']
            self._estimated_result_count = cursor['estimatedResultCount']

        return results[:limit]

    @property
    def url(self):
        # Google requires that an ajax request have a proper Referer header.
        return urllib2.Request(
            super(GoogleSearch, self).url,
            None,
            {'Referer': "http://%s" % Site.objects.get_current().domain}
        )

    @property
    def has_more_results(self):
        # NOTE(review): ``_estimated_result_count`` is stored exactly as
        # decoded from JSON; confirm it is numeric before relying on ``<``.
        if self.results and len(self.results) < self._estimated_result_count:
            return True
        return False

    @property
    def more_results_url(self):
        return self._more_results_url

    def get_result_title(self, result):
        return result['titleNoFormatting']

    def get_result_url(self, result):
        return result['unescapedUrl']
# Register the example Google search in the module-level registry under its default slug.
registry.register(GoogleSearch)
# The scrape-style searches are only defined when BeautifulSoup is importable.
# NOTE(review): the try/except/else scaffolding is not visible in the
# extracted listing; it is restored from the "available if BeautifulSoup is
# installed" wording below - confirm the exception clause against the original.
try:
    from BeautifulSoup import BeautifulSoup, SoupStrainer, BeautifulStoneSoup
except ImportError:
    pass
else:
    __all__ += ('ScrapeSearch', 'XMLSearch',)
    class ScrapeSearch(URLSearch):
        """A base class for scrape-style searching, available if :mod:`BeautifulSoup` is installed."""
        #: Arguments to be passed into a :class:`SoupStrainer`.
        strainer_args = []
        #: Keyword arguments to be passed into a :class:`SoupStrainer`.
        strainer_kwargs = {}

        @property
        def strainer(self):
            """
            Caches and returns a :class:`SoupStrainer` initialized with :attr:`strainer_args` and :attr:`strainer_kwargs`. This strainer will be used to parse only certain parts of the document.

            .. seealso:: `BeautifulSoup: Improving Performance by Parsing Only Part of the Document <http://www.crummy.com/software/BeautifulSoup/documentation.html#Improving%20Performance%20by%20Parsing%20Only%20Part%20of%20the%20Document>`_

            """
            if not hasattr(self, '_strainer'):
                self._strainer = SoupStrainer(*self.strainer_args, **self.strainer_kwargs)
            return self._strainer

        def parse_response(self, response, limit=None):
            """Parses ``response`` with :attr:`strainer` and hands up to ``limit`` top-level matches to :meth:`parse_results`."""
            strainer = self.strainer
            soup = BeautifulSoup(response, parseOnlyThese=strainer)
            return self.parse_results(soup.findAll(recursive=False, limit=limit))

        def parse_results(self, results):
            """
            Provides a hook for parsing the results of straining. This has no default behavior and must be implemented by subclasses because the results absolutely must be parsed to properly extract the information.

            .. seealso:: `BeautifulSoup: Improving Memory Usage with extract <http://www.crummy.com/software/BeautifulSoup/documentation.html#Improving%20Memory%20Usage%20with%20extract>`_

            """
            raise NotImplementedError


    class XMLSearch(ScrapeSearch):
        """A base class for searching XML results."""
        #: Self-closing tag names to be used when interpreting the XML document
        #:
        #: .. seealso:: `BeautifulSoup: Parsing XML <http://www.crummy.com/software/BeautifulSoup/documentation.html#Parsing%20XML>`_
        self_closing_tags = []

        def parse_response(self, response, limit=None):
            """Same as :meth:`ScrapeSearch.parse_response`, but parses with :class:`BeautifulStoneSoup` and :attr:`self_closing_tags`."""
            strainer = self.strainer
            soup = BeautifulStoneSoup(response, selfClosingTags=self.self_closing_tags, parseOnlyThese=strainer)
            return self.parse_results(soup.findAll(recursive=False, limit=limit))