4 from django.conf import settings
5 from django.contrib.sites.models import Site
6 from django.core.cache import cache
7 from django.db.models.options import get_verbose_name as convert_camelcase
8 from django.utils import simplejson as json
9 from django.utils.http import urlquote_plus
10 from django.utils.safestring import mark_safe
11 from django.utils.text import capfirst
12 from django.template import loader, Context, Template
14 from philo.contrib.sobol.utils import make_tracking_querydict, RegistryIterator
17 if getattr(settings, 'SOBOL_USE_EVENTLET', False):
19 from eventlet.green import urllib2
27 'Result', 'BaseSearch', 'DatabaseSearch', 'URLSearch', 'JSONSearch', 'GoogleSearch', 'SearchRegistry', 'registry'
# Key under which the shared search-result cache is stored in Django's cache.
SEARCH_CACHE_KEY = 'philo_sobol_search_results'
# Fallback markup for a result: the title, wrapped in a link when a url is present.
DEFAULT_RESULT_TEMPLATE_STRING = "{% if url %}<a href='{{ url }}'>{% endif %}{{ title }}{% if url %}</a>{% endif %}"
# Compiled once at import time; handed out by BaseSearch.get_result_template
# when a search does not define ``result_template``.
DEFAULT_RESULT_TEMPLATE = Template(DEFAULT_RESULT_TEMPLATE_STRING)

# Determines the timeout on the entire result cache.
# NOTE(review): this value is passed unchanged to ``cache.set`` as its timeout.
# 60*24*7 is the number of minutes in a week, while the per-search timeouts in
# this module are expressed in minutes -- confirm which unit ``cache.set``
# expects, since the whole cache may expire earlier than individual entries.
MAX_CACHE_TIMEOUT = 60*24*7
class RegistrationError(Exception):
    """Signals that a search could not be registered with, or removed from, a :class:`SearchRegistry`."""
class SearchRegistry(object):
    """Holds a registry of search types by slug."""

    def __init__(self):
        # Maps slug -> registered search class.
        self._registry = {}

    def register(self, search, slug=None):
        """
        Register a search with the registry.

        :param search: The search class to register - generally a subclass of :class:`BaseSearch`
        :param slug: The slug which will be used to register the search class. If ``slug`` is ``None``, the search's default slug will be used.
        :raises: :class:`RegistrationError` if a different search is already registered with ``slug``.

        """
        slug = slug or search.slug
        try:
            registered = self._registry[slug]
        except KeyError:
            self._registry[slug] = search
            return
        # A search from the same module re-using the slug is treated as a
        # repeat registration: the existing entry is kept and nothing is raised.
        if registered.__module__ != search.__module__:
            raise RegistrationError("A different search is already registered as `%s`" % slug)

    def unregister(self, search, slug=None):
        """
        Unregister a search from the registry.

        :param search: The search class to unregister - generally a subclass of :class:`BaseSearch`
        :param slug: If provided, the search will only be removed if it was registered with ``slug``. If not provided, the search class will be unregistered no matter what slug it was registered with.
        :raises: :class:`RegistrationError` if a slug is provided but the search registered with that slug is not ``search``.

        """
        if slug is not None:
            # Only complain when the slug really does not map to ``search``;
            # a successful removal must return quietly.
            if slug not in self._registry or self._registry[slug] != search:
                raise RegistrationError("`%s` is not registered as `%s`" % (search, slug))
            del self._registry[slug]
            return

        # Iterate over a snapshot so entries can be deleted while scanning, and
        # use distinct loop names so the comparison is against the ``search``
        # argument rather than the loop variable.
        for registered_slug, registered in list(self._registry.items()):
            if registered == search:
                del self._registry[registered_slug]

    def items(self):
        """Returns a list of (slug, search) items in the registry."""
        return self._registry.items()

    def iteritems(self):
        """Returns an iterator over the (slug, search) pairs in the registry."""
        return RegistryIterator(self._registry, 'iteritems')

    def iterchoices(self):
        """Returns an iterator over (slug, search.verbose_name) pairs for the registry."""
        return RegistryIterator(self._registry, 'iteritems', lambda x: (x[0], x[1].verbose_name))

    def __getitem__(self, key):
        """Returns the search registered with ``key``."""
        return self._registry[key]

    def __iter__(self):
        """Returns an iterator over the keys in the registry."""
        return self._registry.__iter__()
#: Module-level default registry; searches defined in this module register themselves on it.
registry = SearchRegistry()
class Result(object):
    """
    :class:`Result` is a helper class that, given a search and a result of that search, is able to correctly render itself with a template defined by the search. Every :class:`Result` will pass a ``title``, a ``url`` (if applicable), and the raw ``result`` returned by the search into the template context when rendering.

    :param search: An instance of a :class:`BaseSearch` subclass or an object that implements the same API.
    :param result: An arbitrary result from the ``search``.

    """
    def __init__(self, search, result):
        self.search = search
        self.result = result

    def get_title(self):
        """Returns the title of the result by calling :meth:`BaseSearch.get_result_title` on the raw result."""
        return self.search.get_result_title(self.result)

    def get_url(self):
        """Returns the url of the result or an empty string by calling :meth:`BaseSearch.get_result_querydict` on the raw result and then encoding the querydict returned."""
        qd = self.search.get_result_querydict(self.result)
        if qd is None:
            # The search reported no URL for this result.
            return ""
        return "?%s" % qd.urlencode()

    def get_template(self):
        """Returns the template for the result by calling :meth:`BaseSearch.get_result_template` on the raw result."""
        return self.search.get_result_template(self.result)

    def get_extra_context(self):
        """Returns any extra context for the result by calling :meth:`BaseSearch.get_result_extra_context` on the raw result."""
        return self.search.get_result_extra_context(self.result)

    def get_context(self):
        """
        Returns the context dictionary for the result. This is used both in rendering the result and in the AJAX return value for :meth:`.SearchView.ajax_api_view`. The context will contain everything from :meth:`get_extra_context` as well as the following keys:

        title
            The result of calling :meth:`get_title`
        url
            The result of calling :meth:`get_url`
        result
            The raw result which the :class:`Result` was instantiated with.

        """
        # Work on a copy: a search may hand back a dict it shares between
        # results, and the per-result keys must not leak into it.
        context = dict(self.get_extra_context())
        context.update({
            'title': self.get_title(),
            'url': self.get_url(),
            'result': self.result
        })
        return context

    def render(self):
        """Returns the template from :meth:`get_template` rendered with the context from :meth:`get_context`."""
        t = self.get_template()
        c = Context(self.get_context())
        return t.render(c)

    def __unicode__(self):
        """Returns :meth:`render`"""
        return self.render()
class BaseSearchMetaclass(type):
    """Metaclass that supplies ``verbose_name`` and ``slug`` for classes whose body does not define them."""

    def __new__(cls, name, bases, attrs):
        """
        Create the class, filling in missing naming attributes first.

        ``verbose_name`` defaults to the converted class name with its last word dropped and the first letter capitalized; ``slug`` defaults to the lower-cased class name.
        """
        if 'verbose_name' not in attrs:
            # Split off the final word of the converted name and keep the rest.
            leading_words = convert_camelcase(name).rsplit(' ', 1)[:-1]
            attrs['verbose_name'] = capfirst(' '.join(leading_words))
        attrs.setdefault('slug', name.lower())
        return super(BaseSearchMetaclass, cls).__new__(cls, name, bases, attrs)
class BaseSearch(object):
    """
    Defines a generic search api. Accessing :attr:`results` will attempt to retrieve cached results and, if that fails, will initiate a new search and store the results in the cache. Each search has a ``verbose_name`` and a ``slug``. If these are not provided as attributes, they will be automatically generated based on the name of the class.

    :param search_arg: The string which is being searched for.

    """
    __metaclass__ = BaseSearchMetaclass
    #: The number of results to return from the complete list. Default: 10
    result_limit = 10
    #: How long the items for the search should be cached (in minutes). Default: 48 hours.
    _cache_timeout = 60*48

    def __init__(self, search_arg):
        self.search_arg = search_arg

    def _get_cached_results(self):
        """Return the cached results if the results haven't timed out. Otherwise return None."""
        result_cache = cache.get(SEARCH_CACHE_KEY)
        if not result_cache:
            return None
        # The cache is keyed first by search class, then by lower-cased search string.
        cached = result_cache.get(self.__class__, {}).get(self.search_arg.lower())
        if cached is not None and cached['timeout'] >= datetime.datetime.now():
            return cached['results']
        return None

    def _set_cached_results(self, results, timeout):
        """Sets the results to the cache for <timeout> minutes."""
        result_cache = cache.get(SEARCH_CACHE_KEY) or {}
        cached = result_cache.setdefault(self.__class__, {}).setdefault(self.search_arg.lower(), {})
        cached.update({
            'results': results,
            'timeout': datetime.datetime.now() + datetime.timedelta(minutes=timeout)
        })
        # The whole structure is written back under a single key.
        cache.set(SEARCH_CACHE_KEY, result_cache, MAX_CACHE_TIMEOUT)

    @property
    def results(self):
        """Retrieves cached results or initiates a new search via :meth:`get_results` and caches the results."""
        if not hasattr(self, '_results'):
            results = self._get_cached_results()
            if results is None:
                # Cache one extra result so we can see if there are
                # more results to be had.
                limit = self.result_limit
                if limit is not None:
                    limit += 1
                try:
                    results = self.get_results(limit)
                except Exception:
                    # Surface the failure while developing.
                    if settings.DEBUG:
                        raise
                    # On exceptions, don't set any cache; just return.
                    return []
                self._set_cached_results(results, self._cache_timeout)
            self._results = results
        return self._results

    def get_results(self, limit=None, result_class=Result):
        """
        Calls :meth:`search` and parses the return value into :class:`Result` instances.

        :param limit: Passed directly to :meth:`search`.
        :param result_class: The class used to represent the results. This will be instantiated with the :class:`BaseSearch` instance and the raw result from the search.

        """
        results = self.search(limit)
        return [result_class(self, result) for result in results]

    def search(self, limit=None):
        """Returns an iterable of up to ``limit`` results. The :meth:`get_result_title`, :meth:`get_result_url`, :meth:`get_result_template`, and :meth:`get_result_extra_context` methods will be used to interpret the individual items that this function returns, so the result can be an object with attributes as easily as a dictionary with keys. However, keep in mind that the raw results will be stored with django's caching mechanisms and will be converted to JSON."""
        raise NotImplementedError

    def get_result_title(self, result):
        """Returns the title of the ``result``. Must be implemented by subclasses."""
        raise NotImplementedError

    def get_result_url(self, result):
        """Returns the actual URL for the ``result`` or ``None`` if there is no URL. Must be implemented by subclasses."""
        raise NotImplementedError

    def get_result_querydict(self, result):
        """Returns a querydict for tracking selection of the result, or ``None`` if there is no URL for the result."""
        url = self.get_result_url(result)
        if url is None:
            return None
        return make_tracking_querydict(self.search_arg, url)

    def get_result_template(self, result):
        """Returns the template to be used for rendering the ``result``."""
        if hasattr(self, 'result_template'):
            return loader.get_template(self.result_template)
        if not hasattr(self, '_result_template'):
            self._result_template = DEFAULT_RESULT_TEMPLATE
        return self._result_template

    def get_result_extra_context(self, result):
        """Returns any extra context to be used when rendering the ``result``."""
        return {}

    @property
    def has_more_results(self):
        """Returns ``True`` if there are more results than :attr:`result_limit` and ``False`` otherwise."""
        limit = self.result_limit
        if limit is None:
            # Without a limit every result is fetched, so nothing is left
            # over (and comparing a length against None is not meaningful).
            return False
        return len(self.results) > limit

    @property
    def more_results_url(self):
        """Returns the actual url for more results. This should be accessed through :attr:`more_results_querydict` in the template so that the click can be tracked."""
        raise NotImplementedError

    @property
    def more_results_querydict(self):
        """Returns a :class:`QueryDict` for tracking whether people click on a 'more results' link."""
        return make_tracking_querydict(self.search_arg, self.more_results_url)

    def __unicode__(self):
        return self.verbose_name
class DatabaseSearch(BaseSearch):
    """Implements :meth:`~BaseSearch.search` and :meth:`get_queryset` methods to handle database queries."""
    #: The model which should be searched by the :class:`DatabaseSearch`.
    model = None

    def search(self, limit=None):
        """
        Returns the queryset from :meth:`get_queryset`, sliced to ``limit`` when one is given.

        The unsliced queryset is built once and kept on the instance; the slice is applied per call so that a later call with a different ``limit`` is honoured instead of returning the first call's slice.
        """
        if not hasattr(self, '_qs'):
            self._qs = self.get_queryset()
        if limit is None:
            return self._qs
        return self._qs[:limit]

    def get_queryset(self):
        """Returns a :class:`QuerySet` of all instances of :attr:`model`. This method should be overridden by subclasses to specify how the search should actually be implemented for the model."""
        return self.model._default_manager.all()
class URLSearch(BaseSearch):
    """Defines a generic interface for searches that require accessing a certain url to get search results."""
    #: The base URL which will be accessed to get the search results.
    search_url = ''
    #: The url-encoded query string to be used for fetching search results from :attr:`search_url`. Must have one ``%s`` to contain the search argument.
    query_format_str = "%s"

    @property
    def url(self):
        """The URL where the search gets its results. Composed from :attr:`search_url` and :attr:`query_format_str`."""
        return self.search_url + self.query_format_str % urlquote_plus(self.search_arg)

    @property
    def more_results_url(self):
        """The url for more results; for a plain URL search this is :attr:`url` itself."""
        return self.url

    def parse_response(self, response, limit=None):
        """Handles the ``response`` from accessing :attr:`url` (with :func:`urllib2.urlopen`) and returns a list of up to ``limit`` results."""
        raise NotImplementedError

    def search(self, limit=None):
        """Opens :attr:`url` and returns whatever :meth:`parse_response` makes of the response."""
        response = urllib2.urlopen(self.url)
        try:
            return self.parse_response(response, limit=limit)
        finally:
            # Release the connection even when parsing fails.
            response.close()
class JSONSearch(URLSearch):
    """Fetches results with a GET request and decodes the body as JSON. By default the decoded document is expected to be a list of results."""

    def parse_response(self, response, limit=None):
        """Decode the response body as JSON and return its first ``limit`` items (all of them when ``limit`` is ``None``)."""
        body = response.read()
        decoded = json.loads(body)
        return decoded[:limit]
class GoogleSearch(JSONSearch):
    """An example implementation of a :class:`JSONSearch`."""
    search_url = "http://ajax.googleapis.com/ajax/services/search/web"
    result_template = 'search/googlesearch.html'
    verbose_name = "Google search (current site)"
    # Class-level fallbacks: both values are normally filled in by
    # parse_response, which does not run when results are served from the
    # cache. Without them has_more_results / more_results_url would raise
    # AttributeError for cached results.
    _more_results_url = None
    _estimated_result_count = 0

    @property
    def query_format_str(self):
        """The query string format, with :attr:`default_args` baked in ahead of the ``%s`` placeholder for the search argument."""
        default_args = self.default_args
        if default_args:
            # Keep the default arguments separate from the user's search terms.
            default_args += " "
        # Escape literal percent signs produced by quoting so that the later
        # ``%`` interpolation of the search argument leaves them alone.
        return "?v=1.0&q=%s%%s" % urlquote_plus(default_args).replace('%', '%%')

    @property
    def default_args(self):
        """Unquoted default arguments for the :class:`GoogleSearch`."""
        return "site:%s" % Site.objects.get_current().domain

    def parse_response(self, response, limit=None):
        """Extracts the result list from the JSON response and records the paging information from its ``cursor``."""
        response_data = json.loads(response.read())['responseData']
        results, cursor = response_data['results'], response_data['cursor']

        if results:
            self._more_results_url = cursor['moreResultsUrl']
            self._estimated_result_count = cursor['estimatedResultCount']

        return results[:limit]

    @property
    def url(self):
        """A :class:`urllib2.Request` for the search url carrying a ``Referer`` header for the current site."""
        # Google requires that an ajax request have a proper Referer header.
        return urllib2.Request(
            super(GoogleSearch, self).url,
            None,
            {'Referer': "http://%s" % Site.objects.get_current().domain}
        )

    @property
    def has_more_results(self):
        """Returns ``True`` if there are results and fewer of them than the estimated result count recorded by :meth:`parse_response`."""
        if self.results and len(self.results) < self._estimated_result_count:
            return True
        return False

    @property
    def more_results_url(self):
        """The 'more results' url recorded by :meth:`parse_response`, or ``None`` if none was recorded."""
        return self._more_results_url

    def get_result_title(self, result):
        """Returns the ``titleNoFormatting`` entry of the ``result``."""
        return result['titleNoFormatting']

    def get_result_url(self, result):
        """Returns the ``unescapedUrl`` entry of the ``result``."""
        return result['unescapedUrl']
# Make GoogleSearch available through the module-level registry under its default slug.
registry.register(GoogleSearch)
406 from BeautifulSoup import BeautifulSoup, SoupStrainer, BeautifulStoneSoup
410 __all__ += ('ScrapeSearch', 'XMLSearch',)
class ScrapeSearch(URLSearch):
    """Base class for searches that scrape a fetched document; only available when :mod:`BeautifulSoup` is installed."""
    #: Arguments to be passed into a :class:`SoupStrainer`.
    strainer_args = []
    #: Keyword arguments to be passed into a :class:`SoupStrainer`.
    strainer_kwargs = {}

    @property
    def strainer(self):
        """
        Builds a :class:`SoupStrainer` from :attr:`strainer_args` and :attr:`strainer_kwargs` on first access and reuses it afterwards. The strainer restricts parsing to certain parts of the document.

        .. seealso:: `BeautifulSoup: Improving Performance by Parsing Only Part of the Document <http://www.crummy.com/software/BeautifulSoup/documentation.html#Improving%20Performance%20by%20Parsing%20Only%20Part%20of%20the%20Document>`_

        """
        try:
            return self._strainer
        except AttributeError:
            self._strainer = SoupStrainer(*self.strainer_args, **self.strainer_kwargs)
            return self._strainer

    def parse_response(self, response, limit=None):
        """Parses ``response`` through :attr:`strainer` and passes up to ``limit`` top-level matches to :meth:`parse_results`."""
        soup = BeautifulSoup(response, parseOnlyThese=self.strainer)
        matches = soup.findAll(recursive=False, limit=limit)
        return self.parse_results(matches)

    def parse_results(self, results):
        """
        Hook for turning the strained elements into results. There is no default behavior; subclasses must implement it, because the strained elements have to be parsed to extract the information.

        .. seealso:: `BeautifulSoup: Improving Memory Usage with extract <http://www.crummy.com/software/BeautifulSoup/documentation.html#Improving%20Memory%20Usage%20with%20extract>`_

        """
        raise NotImplementedError
class XMLSearch(ScrapeSearch):
    """Base class for searches whose results come back as XML."""
    #: Self-closing tag names to be used when interpreting the XML document
    #:
    #: .. seealso:: `BeautifulSoup: Parsing XML <http://www.crummy.com/software/BeautifulSoup/documentation.html#Parsing%20XML>`_
    self_closing_tags = []

    def parse_response(self, response, limit=None):
        """Parses ``response`` as XML through :attr:`strainer`, honouring :attr:`self_closing_tags`, and passes up to ``limit`` top-level matches to :meth:`parse_results`."""
        soup = BeautifulStoneSoup(
            response,
            selfClosingTags=self.self_closing_tags,
            parseOnlyThese=self.strainer
        )
        matches = soup.findAll(recursive=False, limit=limit)
        return self.parse_results(matches)