3 from hashlib import sha1
5 from django.conf import settings
6 from django.contrib.sites.models import Site
7 from django.core.cache import cache
8 from django.db.models.options import get_verbose_name as convert_camelcase
9 from django.utils import simplejson as json
10 from django.utils.http import urlquote_plus
11 from django.utils.safestring import mark_safe
12 from django.utils.text import capfirst
13 from django.template import loader, Context, Template
15 from philo.contrib.sobol.utils import make_tracking_querydict, RegistryIterator
18 if getattr(settings, 'SOBOL_USE_EVENTLET', False):
20 from eventlet.green import urllib2
28 'Result', 'BaseSearch', 'DatabaseSearch', 'URLSearch', 'JSONSearch', 'GoogleSearch', 'SearchRegistry', 'registry', 'get_search_instance'
#: Constant prefix mixed into every cache key built by ``_make_cache_key``.
SEARCH_CACHE_SEED = 'philo_sobol_search_results'
#: Module-wide caching switch read from the django settings (defaults to ``True``).
#: NOTE(review): the setting consulted is ``SOBOL_USE_SEARCH``; judging by the
#: constant's name, ``SOBOL_USE_CACHE`` may have been intended -- confirm
#: against the project documentation before renaming.
USE_CACHE = getattr(settings, 'SOBOL_USE_SEARCH', True)
class RegistrationError(Exception):
    """Raised if there is a problem registering a search with a :class:`SearchRegistry`"""
    pass
class SearchRegistry(object):
    """Holds a registry of search types by slug."""

    def __init__(self):
        # Maps slug -> registered search class.
        self._registry = {}

    def register(self, search, slug=None):
        """
        Register a search with the registry.

        :param search: The search class to register - generally a subclass of :class:`BaseSearch`
        :param slug: The slug which will be used to register the search class. If ``slug`` is ``None``, the search's default slug will be used.
        :raises: :class:`RegistrationError` if a different search is already registered with ``slug``.

        """
        slug = slug or search.slug
        if slug not in self._registry:
            self._registry[slug] = search
            return

        # The slug is taken. A search coming from the same module as the
        # registered one is treated as a repeated registration and ignored;
        # anything else is a genuine conflict.
        registered = self._registry[slug]
        if registered.__module__ != search.__module__:
            raise RegistrationError("A different search is already registered as `%s`" % slug)

    def unregister(self, search, slug=None):
        """
        Unregister a search from the registry.

        :param search: The search class to unregister - generally a subclass of :class:`BaseSearch`
        :param slug: If provided, the search will only be removed if it was registered with ``slug``. If not provided, the search class will be unregistered no matter what slug it was registered with.
        :raises: :class:`RegistrationError` if a slug is provided but the search registered with that slug is not ``search``.

        """
        if slug is not None:
            if slug in self._registry and self._registry[slug] == search:
                del self._registry[slug]
                # Successful removal: do not fall through to the error.
                return
            raise RegistrationError("`%s` is not registered as `%s`" % (search, slug))

        # Iterate over a snapshot so entries can be deleted safely, and use
        # distinct loop names so the ``search`` argument is not shadowed.
        for registered_slug, registered_search in list(self._registry.items()):
            if registered_search == search:
                del self._registry[registered_slug]

    def items(self):
        """Returns a list of (slug, search) items in the registry."""
        return self._registry.items()

    def iteritems(self):
        """Returns an iterator over the (slug, search) pairs in the registry."""
        return RegistryIterator(self._registry, 'iteritems')

    def iterchoices(self):
        """Returns an iterator over (slug, search.verbose_name) pairs for the registry."""
        return RegistryIterator(self._registry, 'iteritems', lambda x: (x[0], x[1].verbose_name))

    def __getitem__(self, key):
        """Returns the search registered with ``key``."""
        return self._registry[key]

    def __iter__(self):
        """Returns an iterator over the keys in the registry."""
        return self._registry.__iter__()


#: The module-level registry instance that searches are registered with.
registry = SearchRegistry()
def _make_cache_key(search, search_arg):
    """
    Returns the cache key for ``search`` (anything with a ``slug``) and ``search_arg``.

    The key is the SHA-1 hex digest of the module seed, the search's slug and the search argument concatenated.

    """
    raw_key = SEARCH_CACHE_SEED + search.slug + search_arg
    # sha1() needs bytes. Encode text explicitly so that non-ASCII search
    # terms do not raise; for pure-ASCII input the digest is unchanged.
    if not isinstance(raw_key, bytes):
        raw_key = raw_key.encode('utf-8')
    return sha1(raw_key).hexdigest()
def get_search_instance(slug, search_arg):
    """Returns a search instance for the given slug, either from the cache or newly-instantiated."""
    search = registry[slug]
    # Search arguments are normalized to lower case, so differently-cased
    # queries share one cache entry.
    search_arg = search_arg.lower()

    if USE_CACHE:
        key = _make_cache_key(search, search_arg)
        cached = cache.get(key)
        if cached is not None:
            return cached

    return search(search_arg)
class Result(object):
    """
    :class:`Result` is a helper class that, given a search and a result of that search, is able to correctly render itself with a template defined by the search. Every :class:`Result` will pass a ``title``, a ``url`` (if applicable), and the raw ``result`` returned by the search into the template context when rendering.

    :param search: An instance of a :class:`BaseSearch` subclass or an object that implements the same API.
    :param result: An arbitrary result from the ``search``.

    """
    def __init__(self, search, result):
        self.search = search
        self.result = result

    def get_title(self):
        """Returns the title of the result by calling :meth:`BaseSearch.get_result_title` on the raw result."""
        return self.search.get_result_title(self.result)

    def get_url(self):
        """Returns the url of the result or an empty string by calling :meth:`BaseSearch.get_result_querydict` on the raw result and then encoding the querydict returned."""
        qd = self.search.get_result_querydict(self.result)
        if qd is None:
            return ""
        return "?%s" % qd.urlencode()

    def get_template(self):
        """Returns the template for the result by calling :meth:`BaseSearch.get_result_template` on the raw result."""
        return self.search.get_result_template(self.result)

    def get_extra_context(self):
        """Returns any extra context for the result by calling :meth:`BaseSearch.get_result_extra_context` on the raw result."""
        return self.search.get_result_extra_context(self.result)

    def get_context(self):
        """
        Returns the context dictionary for the result. This is used both in rendering the result and in the AJAX return value for :meth:`.SearchView.ajax_api_view`. The context will contain everything from :meth:`get_extra_context` as well as the following keys:

        title
            The result of calling :meth:`get_title`
        url
            The result of calling :meth:`get_url`
        result
            The raw result which the :class:`Result` was instantiated with.

        """
        # Work on a copy: the search may hand back a dictionary it shares
        # between results, and that must not be modified here.
        context = dict(self.get_extra_context())
        context.update({
            'title': self.get_title(),
            'url': self.get_url(),
            'result': self.result
        })
        return context

    def render(self):
        """Returns the template from :meth:`get_template` rendered with the context from :meth:`get_context`."""
        t = self.get_template()
        c = Context(self.get_context())
        return t.render(c)

    def __unicode__(self):
        """Returns :meth:`render`"""
        return self.render()
class BaseSearchMetaclass(type):
    """Metaclass which generates ``verbose_name`` and ``slug`` from the class name for search classes that do not define them."""
    def __new__(cls, name, bases, attrs):
        if 'verbose_name' not in attrs:
            # De-camel-case the class name and drop a trailing "search" word,
            # mirroring the slug rule below. Names that do not end in
            # "search" (or consist of one word only) are kept whole rather
            # than losing their last word.
            words = convert_camelcase(name).split(' ')
            if len(words) > 1 and words[-1] == 'search':
                words = words[:-1]
            attrs['verbose_name'] = capfirst(' '.join(words))
        if 'slug' not in attrs:
            # "GoogleSearch" -> "google"; any other name is lower-cased as-is.
            if name.endswith("Search"):
                attrs['slug'] = name[:-6].lower()
            else:
                attrs['slug'] = name.lower()
        return super(BaseSearchMetaclass, cls).__new__(cls, name, bases, attrs)
class BaseSearch(object):
    """
    Defines a generic search api. Accessing :attr:`results` will attempt to retrieve cached results and, if that fails, will initiate a new search and store the results in the cache. Each search has a ``verbose_name`` and a ``slug``. If these are not provided as attributes, they will be automatically generated based on the name of the class.

    :param search_arg: The string which is being searched for.

    """
    __metaclass__ = BaseSearchMetaclass
    #: The number of results to return from the complete list. Default: 5
    result_limit = 5
    #: How long the items for the search should be cached, in seconds (the unit django's cache API expects). Default: 48 hours.
    _cache_timeout = 60 * 60 * 48
    #: The path to the template which will be used to render the :class:`Result`\ s for this search.
    result_template = "sobol/search/basesearch.html"

    def __init__(self, search_arg):
        self.search_arg = search_arg

    @property
    def results(self):
        """Retrieves cached results or initiates a new search via :meth:`get_results` and caches the results."""
        if not hasattr(self, '_results'):
            try:
                # Cache one extra result so we can see if there are
                # more results to be had.
                limit = self.result_limit
                if limit is not None:
                    limit += 1
                results = self.get_results(limit)
            except Exception:
                # Searching is best-effort: failures surface while debugging
                # but are otherwise turned into an empty result list.
                if settings.DEBUG:
                    raise
                # On exceptions, don't set any cache; just return.
                return []

            self._results = results

            if USE_CACHE:
                # The whole search instance is cached, results included.
                key = _make_cache_key(self, self.search_arg)
                cache.set(key, self, self._cache_timeout)

        return self._results

    def get_results(self, limit=None, result_class=Result):
        """
        Calls :meth:`search` and parses the return value into :class:`Result` instances.

        :param limit: Passed directly to :meth:`search`.
        :param result_class: The class used to represent the results. This will be instantiated with the :class:`BaseSearch` instance and the raw result from the search.

        """
        results = self.search(limit)
        return [result_class(self, result) for result in results]

    def search(self, limit=None):
        """Returns an iterable of up to ``limit`` results. The :meth:`get_result_title`, :meth:`get_result_url`, :meth:`get_result_template`, and :meth:`get_result_extra_context` methods will be used to interpret the individual items that this function returns, so the result can be an object with attributes as easily as a dictionary with keys. However, keep in mind that the raw results will be stored with django's caching mechanisms and will be converted to JSON."""
        raise NotImplementedError

    def get_result_title(self, result):
        """Returns the title of the ``result``. Must be implemented by subclasses."""
        raise NotImplementedError

    def get_result_url(self, result):
        """Returns the actual URL for the ``result`` or ``None`` if there is no URL. Must be implemented by subclasses."""
        raise NotImplementedError

    def get_result_querydict(self, result):
        """Returns a querydict for tracking selection of the result, or ``None`` if there is no URL for the result."""
        url = self.get_result_url(result)
        if url is None:
            return None
        return make_tracking_querydict(self.search_arg, url)

    def get_result_template(self, result):
        """Returns the template to be used for rendering the ``result``."""
        return loader.get_template(self.result_template)

    def get_result_extra_context(self, result):
        """Returns any extra context to be used when rendering the ``result``."""
        return {}

    @property
    def has_more_results(self):
        """Returns ``True`` if there are more results than :attr:`result_limit` and ``False`` otherwise."""
        limit = self.result_limit
        if limit is None:
            # Without a limit every result was fetched; nothing is left over.
            return False
        return len(self.results) > limit

    @property
    def more_results_url(self):
        """Returns the actual url for more results. This should be accessed through :attr:`more_results_querydict` in the template so that the click can be tracked. By default, simply returns ``None``."""
        return None

    @property
    def more_results_querydict(self):
        """Returns a :class:`QueryDict` for tracking whether people click on a 'more results' link."""
        url = self.more_results_url
        if url:
            return make_tracking_querydict(self.search_arg, url)
        return None

    def __unicode__(self):
        return self.verbose_name
class DatabaseSearch(BaseSearch):
    """Implements :meth:`~BaseSearch.search` and :meth:`get_queryset` methods to handle database queries."""
    #: The model which should be searched by the :class:`DatabaseSearch`.
    model = None

    def search(self, limit=None):
        # Remember the unsliced queryset and apply ``limit`` on every call,
        # so a later call with a different limit is not stuck with the
        # slice taken by the first one.
        if not hasattr(self, '_qs'):
            self._qs = self.get_queryset()

        if limit is not None:
            return self._qs[:limit]
        return self._qs

    def get_queryset(self):
        """Returns a :class:`QuerySet` of all instances of :attr:`model`. This method should be overridden by subclasses to specify how the search should actually be implemented for the model."""
        return self.model._default_manager.all()
class URLSearch(BaseSearch):
    """Defines a generic interface for searches that require accessing a certain url to get search results."""
    #: The base URL which will be accessed to get the search results.
    search_url = ''
    #: The url-encoded query string to be used for fetching search results from :attr:`search_url`. Must have one ``%s`` to contain the search argument.
    query_format_str = "%s"

    @property
    def url(self):
        """The URL where the search gets its results. Composed from :attr:`search_url` and :attr:`query_format_str`."""
        return self.search_url + self.query_format_str % urlquote_plus(self.search_arg)

    @property
    def more_results_url(self):
        return self.url

    def parse_response(self, response, limit=None):
        """Handles the ``response`` from accessing :attr:`url` (with :func:`urllib2.urlopen`) and returns a list of up to ``limit`` results."""
        raise NotImplementedError

    def search(self, limit=None):
        response = urllib2.urlopen(self.url)
        try:
            return self.parse_response(response, limit=limit)
        finally:
            # Release the connection even if parsing fails.
            response.close()
class JSONSearch(URLSearch):
    """Makes a GET request and parses the results as JSON. The default behavior assumes that the response contains a list of results."""
    def parse_response(self, response, limit=None):
        # A ``limit`` of ``None`` makes the slice return the entire list.
        decoded = json.loads(response.read())
        return decoded[:limit]
class GoogleSearch(JSONSearch):
    """An example implementation of a :class:`JSONSearch`."""
    search_url = "http://ajax.googleapis.com/ajax/services/search/web"
    verbose_name = "Google search (current site)"
    result_template = "sobol/search/googlesearch.html"
    _more_results_url = None
    # Fallback so the attribute exists before any response has been parsed.
    _estimated_result_count = 0

    @property
    def query_format_str(self):
        default_args = self.default_args
        if default_args:
            # Keep the default arguments apart from the user's search terms.
            default_args += " "
        # The quoted default arguments end up inside a format string that is
        # interpolated once more with the search argument, so their "%"
        # characters are doubled while the trailing "%s" is left in place.
        return "?v=1.0&q=%s%%s" % urlquote_plus(default_args).replace('%', '%%')

    @property
    def default_args(self):
        """Unquoted default arguments for the :class:`GoogleSearch`."""
        return "site:%s" % Site.objects.get_current().domain

    def parse_response(self, response, limit=None):
        responseData = json.loads(response.read())['responseData']
        results, cursor = responseData['results'], responseData['cursor']

        if results:
            self._more_results_url = cursor['moreResultsUrl']
            # Coerce to int so the comparison in has_more_results is numeric
            # even if the service delivers the count as a string.
            try:
                self._estimated_result_count = int(cursor['estimatedResultCount'])
            except (TypeError, ValueError):
                self._estimated_result_count = 0

        return results[:limit]

    @property
    def url(self):
        # Google requires that an ajax request have a proper Referer header.
        return urllib2.Request(
            super(GoogleSearch, self).url,
            None,
            {'Referer': "http://%s" % Site.objects.get_current().domain}
        )

    @property
    def has_more_results(self):
        if self.results and len(self.results) < self._estimated_result_count:
            return True
        return False

    @property
    def more_results_url(self):
        return self._more_results_url

    def get_result_title(self, result):
        return result['titleNoFormatting']

    def get_result_url(self, result):
        return result['unescapedUrl']


registry.register(GoogleSearch)
405 from BeautifulSoup import BeautifulSoup, SoupStrainer, BeautifulStoneSoup
409 __all__ += ('ScrapeSearch', 'XMLSearch',)
class ScrapeSearch(URLSearch):
    """A base class for scrape-style searching, available if :mod:`BeautifulSoup` is installed."""
    #: Arguments to be passed into a :class:`SoupStrainer`.
    strainer_args = []
    #: Keyword arguments to be passed into a :class:`SoupStrainer`.
    strainer_kwargs = {}

    @property
    def strainer(self):
        """
        Caches and returns a :class:`SoupStrainer` initialized with :attr:`strainer_args` and :attr:`strainer_kwargs`. This strainer will be used to parse only certain parts of the document.

        .. seealso:: `BeautifulSoup: Improving Performance by Parsing Only Part of the Document <http://www.crummy.com/software/BeautifulSoup/documentation.html#Improving%20Performance%20by%20Parsing%20Only%20Part%20of%20the%20Document>`_

        """
        if not hasattr(self, '_strainer'):
            self._strainer = SoupStrainer(*self.strainer_args, **self.strainer_kwargs)
        return self._strainer

    def parse_response(self, response, limit=None):
        strainer = self.strainer
        soup = BeautifulSoup(response, parseOnlyThese=strainer)
        # Only the top-level strained nodes are handed on, at most ``limit`` of them.
        return self.parse_results(soup.findAll(recursive=False, limit=limit))

    def parse_results(self, results):
        """
        Provides a hook for parsing the results of straining. This has no default behavior and must be implemented by subclasses because the results absolutely must be parsed to properly extract the information.

        .. seealso:: `BeautifulSoup: Improving Memory Usage with extract <http://www.crummy.com/software/BeautifulSoup/documentation.html#Improving%20Memory%20Usage%20with%20extract>`_

        """
        raise NotImplementedError
class XMLSearch(ScrapeSearch):
    """A base class for searching XML results."""
    #: Self-closing tag names to be used when interpreting the XML document
    #:
    #: .. seealso:: `BeautifulSoup: Parsing XML <http://www.crummy.com/software/BeautifulSoup/documentation.html#Parsing%20XML>`_
    self_closing_tags = []

    def parse_response(self, response, limit=None):
        strainer = self.strainer
        # Same flow as the parent class, but parsed with the XML-oriented
        # parser and this class's self-closing tag list.
        soup = BeautifulStoneSoup(response, selfClosingTags=self.self_closing_tags, parseOnlyThese=strainer)
        return self.parse_results(soup.findAll(recursive=False, limit=limit))