3 from hashlib import sha1
5 from django.conf import settings
6 from django.contrib.sites.models import Site
7 from django.core.cache import cache
8 from django.db.models.options import get_verbose_name as convert_camelcase
9 from django.utils import simplejson as json
10 from django.utils.http import urlquote_plus
11 from django.utils.safestring import mark_safe
12 from django.utils.text import capfirst
13 from django.template import loader, Context, Template, TemplateDoesNotExist
15 from philo.contrib.sobol.utils import make_tracking_querydict
16 from philo.utils.registry import Registry
19 if getattr(settings, 'SOBOL_USE_EVENTLET', False):
21 from eventlet.green import urllib2
29 'Result', 'BaseSearch', 'DatabaseSearch', 'URLSearch', 'JSONSearch', 'GoogleSearch', 'registry', 'get_search_instance'
33 SEARCH_CACHE_SEED = 'philo_sobol_search_results'
34 USE_CACHE = getattr(settings, 'SOBOL_USE_CACHE', True)
37 #: A registry for :class:`BaseSearch` subclasses that should be available in the admin.
def _make_cache_key(search, search_arg):
    """Return the cache key under which the results of a search are stored.

    The key is the SHA-1 hex digest of :data:`SEARCH_CACHE_SEED`, the
    search's ``slug`` and the search argument, concatenated in that order.

    :param search: A search class or instance exposing a ``slug`` attribute.
    :param search_arg: The string which is being searched for.
    :returns: A 40-character hexadecimal digest string.

    """
    raw_key = SEARCH_CACHE_SEED + search.slug + search_arg
    # hashlib only hashes bytes. Text containing non-ASCII characters (for
    # example a search for "café") has to be encoded explicitly, otherwise
    # hashing raises. ASCII-only input produces the same digest as before.
    if not isinstance(raw_key, bytes):
        raw_key = raw_key.encode('utf-8')
    return sha1(raw_key).hexdigest()
45 def get_search_instance(slug, search_arg):
46 """Returns a search instance for the given slug, either from the cache or newly-instantiated."""
47 search = registry[slug]
48 search_arg = search_arg.lower()
50 key = _make_cache_key(search, search_arg)
51 cached = cache.get(key)
54 instance = search(search_arg)
61 :class:`Result` is a helper class that, given a search and a result of that search, is able to correctly render itself with a template defined by the search. Every :class:`Result` will pass a ``title``, a ``url`` (if applicable), and the raw ``result`` returned by the search into the template context when rendering.
63 :param search: An instance of a :class:`BaseSearch` subclass or an object that implements the same API.
64 :param result: An arbitrary result from the ``search``.
67 def __init__(self, search, result):
72 """Returns the title of the result by calling :meth:`BaseSearch.get_result_title` on the raw result."""
73 return self.search.get_result_title(self.result)
76 """Returns the url of the result or ``None`` by calling :meth:`BaseSearch.get_result_url` on the raw result. This url will contain a querystring which, if used, will track a :class:`.Click` for the actual url."""
77 return self.search.get_result_url(self.result)
def get_actual_url(self):
    """Returns the real destination url of the result, obtained by handing the raw result to :meth:`BaseSearch.get_actual_result_url`."""
    resolve_url = self.search.get_actual_result_url
    return resolve_url(self.result)
def get_content(self):
    """Returns the content of the result, obtained by handing the raw result to :meth:`BaseSearch.get_result_content`."""
    build_content = self.search.get_result_content
    return build_content(self.result)
def get_template(self):
    """Returns the template used to render this :class:`Result`, obtained by handing the raw result to :meth:`BaseSearch.get_result_template`."""
    pick_template = self.search.get_result_template
    return pick_template(self.result)
91 def get_context(self):
93 Returns the context dictionary for the result. This is used both in rendering the result and in the AJAX return value for :meth:`.SearchView.ajax_api_view`. The context will contain the following keys:
96 The result of calling :meth:`get_title`
98 The result of calling :meth:`get_url`
100 The result of calling :meth:`get_content`
103 if not hasattr(self, '_context'):
105 'title': self.get_title(),
106 'url': self.get_url(),
107 'actual_url': self.get_actual_url(),
108 'content': self.get_content()
113 """Returns the template from :meth:`get_template` rendered with the context from :meth:`get_context`."""
114 t = self.get_template()
115 c = Context(self.get_context())
118 def __unicode__(self):
119 """Returns :meth:`render`"""
class BaseSearchMetaclass(type):
    """
    Metaclass which supplies default ``verbose_name`` and ``slug`` attributes.

    Either attribute is only generated when the class body does not define it:

    ``verbose_name``
        The class name converted from camel case, with its last word dropped
        and the first letter capitalized. A single-word name is kept whole so
        that the verbose name is never empty.
    ``slug``
        The lowercased class name with a trailing ``Search`` removed. A class
        named exactly ``Search`` keeps its full name so that the slug is
        never empty.

    """
    def __new__(cls, name, bases, attrs):
        if 'verbose_name' not in attrs:
            readable = convert_camelcase(name)
            # Drop the final word, but only when something remains before it.
            head, separator, _last_word = readable.rpartition(' ')
            attrs['verbose_name'] = capfirst(head if separator else readable)
        if 'slug' not in attrs:
            suffix = "Search"
            if name.endswith(suffix) and len(name) > len(suffix):
                attrs['slug'] = name[:-len(suffix)].lower()
            else:
                attrs['slug'] = name.lower()
        return super(BaseSearchMetaclass, cls).__new__(cls, name, bases, attrs)
132 class BaseSearch(object):
134 Defines a generic search api. Accessing :attr:`results` will attempt to retrieve cached results and, if that fails, will initiate a new search and store the results in the cache. Each search has a ``verbose_name`` and a ``slug``. If these are not provided as attributes, they will be automatically generated based on the name of the class.
136 :param search_arg: The string which is being searched for.
139 __metaclass__ = BaseSearchMetaclass
140 #: The number of results to return from the complete list. Default: 5
#: How long the items for the search should be cached, in seconds (the unit in which the timeout is passed to ``cache.set``). Default: 48 hours.
_cache_timeout = 60 * 60 * 48
144 #: The path to the template which will be used to render the :class:`Result`\ s for this search. If this is ``None``, then the framework will try ``sobol/search/<slug>/result.html`` and ``sobol/search/result.html``.
145 result_template = None
146 #: The path to the template which will be used to generate the title of the :class:`Result`\ s for this search. If this is ``None``, then the framework will try ``sobol/search/<slug>/title.html`` and ``sobol/search/title.html``.
147 title_template = None
148 #: The path to the template which will be used to generate the content of the :class:`Result`\ s for this search. If this is ``None``, then the framework will try ``sobol/search/<slug>/content.html`` and ``sobol/search/content.html``.
149 content_template = None
def __init__(self, search_arg):
    """Remembers the string which is being searched for as :attr:`search_arg`."""
    setattr(self, 'search_arg', search_arg)
156 """Retrieves cached results or initiates a new search via :meth:`get_results` and caches the results."""
157 if not hasattr(self, '_results'):
159 # Cache one extra result so we can see if there are
160 # more results to be had.
161 limit = self.result_limit
162 if limit is not None:
164 results = self.get_results(limit)
168 # On exceptions, don't set any cache; just return.
171 self._results = results
174 for result in results:
176 key = _make_cache_key(self, self.search_arg)
177 cache.set(key, self, self._cache_timeout)
def get_results(self, limit=None, result_class=Result):
    """
    Runs :meth:`search` and wraps each raw result it yields in ``result_class``.

    :param limit: Handed to :meth:`search` unchanged.
    :param result_class: The class used to represent the results. It is called with this search instance and the raw result from the search.
    :returns: A list with one ``result_class`` instance per raw result.

    """
    wrapped = []
    for raw_result in self.search(limit):
        wrapped.append(result_class(self, raw_result))
    return wrapped
def search(self, limit=None):
    """
    Hook which performs the actual search; this base version only raises :exc:`NotImplementedError`.

    Implementations return an iterable of up to ``limit`` results. The individual items are interpreted by :meth:`get_result_title`, :meth:`get_result_url`, :meth:`get_result_template`, and :meth:`get_result_extra_context`, so an item may be an object with attributes just as well as a dictionary with keys. Keep in mind that the raw results will be stored with django's caching mechanisms and will be converted to JSON.

    """
    raise NotImplementedError()
def get_actual_result_url(self, result):
    """
    Hook which maps a raw ``result`` to its actual URL, or to ``None`` if the result has no URL.

    Subclasses must implement this; the base version only raises :exc:`NotImplementedError`.

    """
    raise NotImplementedError()
200 def get_result_querydict(self, result):
201 """Returns a querydict for tracking selection of the result, or ``None`` if there is no URL for the result."""
202 url = self.get_actual_result_url(result)
205 return make_tracking_querydict(self.search_arg, url)
207 def get_result_url(self, result):
208 """Returns ``None`` or a url which, when accessed, will register a :class:`.Click` for that url."""
209 qd = self.get_result_querydict(result)
212 return "?%s" % qd.urlencode()
def get_result_title(self, result):
    """
    Renders and returns the title of the ``result``.

    :attr:`title_template` is used when it is set; otherwise ``sobol/search/<slug>/title.html`` and ``sobol/search/title.html`` are tried in that order. The template receives the raw result as ``result``. If no template can be found, :exc:`TemplateDoesNotExist` is raised. Subclasses may also simply override this method.

    """
    template_names = self.title_template
    if not template_names:
        template_names = [
            'sobol/search/%s/title.html' % self.slug,
            'sobol/search/title.html',
        ]
    return loader.render_to_string(template_names, {'result': result})
221 def get_result_content(self, result):
222 """Returns the content for the ``result``. By default, renders ``sobol/search/<slug>/content.html`` or ``sobol/search/content.html`` with the result in the context. This can be overridden by setting :attr:`content_template` or simply overriding :meth:`get_result_content`. If no template is found, this will return an empty string."""
224 return loader.render_to_string(self.content_template or [
225 'sobol/search/%s/content.html' % self.slug,
226 'sobol/search/content.html'
227 ], {'result': result})
228 except TemplateDoesNotExist:
def get_result_template(self, result):
    """
    Returns the template to be used for rendering the ``result``.

    When :attr:`result_template` is set, that template is loaded. Otherwise ``sobol/search/<slug>/result.html`` is tried first, with ``sobol/search/result.html`` as the fallback; for a search with slug ``google`` the first candidate is ``sobol/search/google/result.html``.

    """
    custom_template = self.result_template
    if not custom_template:
        candidates = [
            'sobol/search/%s/result.html' % self.slug,
            'sobol/search/result.html',
        ]
        return loader.select_template(candidates)
    return loader.get_template(custom_template)
def has_more_results(self):
    """
    Returns ``True`` if there are more results than :attr:`result_limit` and ``False`` otherwise.

    A :attr:`result_limit` of ``None`` means the search was not limited, so there can be no further results and ``False`` is returned.

    """
    limit = self.result_limit
    # Comparing a length with None is always true on Python 2 and raises on
    # Python 3, so the unlimited case needs its own branch.
    if limit is None:
        return False
    return len(self.results) > limit
245 def get_actual_more_results_url(self):
246 """Returns the actual url for more results. By default, simply returns ``None``."""
249 def get_more_results_querydict(self):
250 """Returns a :class:`QueryDict` for tracking whether people click on a 'more results' link."""
251 url = self.get_actual_more_results_url()
253 return make_tracking_querydict(self.search_arg, url)
257 def more_results_url(self):
258 """Returns a URL which consists of a querystring which, when accessed, will log a :class:`.Click` for the actual URL."""
259 qd = self.get_more_results_querydict()
262 return "?%s" % qd.urlencode()
def __unicode__(self):
    """Represents the search by its ``verbose_name``."""
    label = self.verbose_name
    return label
268 class DatabaseSearch(BaseSearch):
269 """Implements :meth:`~BaseSearch.search` and :meth:`get_queryset` methods to handle database queries."""
270 #: The model which should be searched by the :class:`DatabaseSearch`.
273 def search(self, limit=None):
274 if not hasattr(self, '_qs'):
275 self._qs = self.get_queryset()
276 if limit is not None:
277 self._qs = self._qs[:limit]
def get_queryset(self):
    """
    Returns everything the default manager of :attr:`model` yields from ``all()``.

    Subclasses should override this method to specify how the search is actually carried out for the model.

    """
    default_manager = self.model._default_manager
    return default_manager.all()
286 class URLSearch(BaseSearch):
287 """Defines a generic interface for searches that require accessing a certain url to get search results."""
288 #: The base URL which will be accessed to get the search results.
290 #: The url-encoded query string to be used for fetching search results from :attr:`search_url`. Must have one ``%s`` to contain the search argument.
291 query_format_str = "%s"
295 """The URL where the search gets its results. Composed from :attr:`search_url` and :attr:`query_format_str`."""
296 return self.search_url + self.query_format_str % urlquote_plus(self.search_arg)
298 def get_actual_more_results_url(self):
def parse_response(self, response, limit=None):
    """
    Hook which turns the ``response`` obtained by opening :attr:`url` (with :func:`urllib2.urlopen`) into a list of up to ``limit`` results.

    Subclasses must implement this; the base version only raises :exc:`NotImplementedError`.

    """
    raise NotImplementedError()
def search(self, limit=None):
    """
    Opens :attr:`url` and returns whatever :meth:`parse_response` extracts from the response.

    :param limit: Passed on to :meth:`parse_response`.

    """
    response = urllib2.urlopen(self.url)
    try:
        return self.parse_response(response, limit=limit)
    finally:
        # Release the connection even when parsing fails, so that repeated
        # searches do not leak open sockets.
        response.close()
class JSONSearch(URLSearch):
    """A :class:`URLSearch` which decodes the response body as JSON. By default the decoded document is assumed to be a list of results."""
    def parse_response(self, response, limit=None):
        """Decodes the response body as JSON and returns at most ``limit`` leading items (all of them when ``limit`` is ``None``)."""
        body = response.read()
        decoded = json.loads(body)
        return decoded[:limit]
315 class GoogleSearch(JSONSearch):
316 """An example implementation of a :class:`JSONSearch`."""
317 search_url = "http://ajax.googleapis.com/ajax/services/search/web"
319 verbose_name = "Google search (current site)"
320 _more_results_url = None
323 def query_format_str(self):
324 default_args = self.default_args
327 return "?v=1.0&q=%s%%s" % urlquote_plus(default_args).replace('%', '%%')
def default_args(self):
    """Unquoted default arguments for the :class:`GoogleSearch`: a ``site:`` restriction to the domain of the current :class:`Site`."""
    current_domain = Site.objects.get_current().domain
    return "site:%s" % current_domain
334 def parse_response(self, response, limit=None):
335 responseData = json.loads(response.read())['responseData']
336 results, cursor = responseData['results'], responseData['cursor']
339 self._more_results_url = cursor['moreResultsUrl']
340 self._estimated_result_count = cursor['estimatedResultCount']
342 return results[:limit]
346 # Google requires that an ajax request have a proper Referer header.
347 return urllib2.Request(
348 super(GoogleSearch, self).url,
350 {'Referer': "http://%s" % Site.objects.get_current().domain}
354 def has_more_results(self):
355 if self.results and len(self.results) < self._estimated_result_count:
def get_actual_more_results_url(self):
    """Returns the value currently stored in ``_more_results_url``."""
    more_url = self._more_results_url
    return more_url
def get_actual_result_url(self, result):
    """Returns the ``unescapedUrl`` entry of the raw ``result``."""
    url = result['unescapedUrl']
    return url
def get_result_title(self, result):
    """Returns the ``titleNoFormatting`` entry of the raw ``result``, marked as safe for output without further escaping."""
    plain_title = result['titleNoFormatting']
    return mark_safe(plain_title)
def get_result_content(self, result):
    """Returns the ``content`` entry of the raw ``result``, marked as safe for output without further escaping."""
    snippet = result['content']
    return mark_safe(snippet)
372 registry.register(GoogleSearch)
376 from BeautifulSoup import BeautifulSoup, SoupStrainer, BeautifulStoneSoup
380 __all__ += ('ScrapeSearch', 'XMLSearch',)
381 class ScrapeSearch(URLSearch):
382 """A base class for scrape-style searching, available if :mod:`BeautifulSoup` is installed."""
383 #: Arguments to be passed into a :class:`SoupStrainer`.
385 #: Keyword arguments to be passed into a :class:`SoupStrainer`.
391 Caches and returns a :class:`SoupStrainer` initialized with :attr:`strainer_args` and :attr:`strainer_kwargs`. This strainer will be used to parse only certain parts of the document.
393 .. seealso:: `BeautifulSoup: Improving Performance by Parsing Only Part of the Document <http://www.crummy.com/software/BeautifulSoup/documentation.html#Improving%20Performance%20by%20Parsing%20Only%20Part%20of%20the%20Document>`_
396 if not hasattr(self, '_strainer'):
397 self._strainer = SoupStrainer(*self.strainer_args, **self.strainer_kwargs)
398 return self._strainer
def parse_response(self, response, limit=None):
    """Parses the ``response`` with :class:`BeautifulSoup`, restricted by :attr:`strainer`, and passes up to ``limit`` top-level matches on to :meth:`parse_results`."""
    soup = BeautifulSoup(response, parseOnlyThese=self.strainer)
    top_level_matches = soup.findAll(recursive=False, limit=limit)
    return self.parse_results(top_level_matches)
def parse_results(self, results):
    """
    Hook for interpreting the results of straining. There is no default behavior: subclasses must implement this, because the strained results have to be parsed to properly extract the information. The base version only raises :exc:`NotImplementedError`.

    .. seealso:: `BeautifulSoup: Improving Memory Usage with extract <http://www.crummy.com/software/BeautifulSoup/documentation.html#Improving%20Memory%20Usage%20with%20extract>`_

    """
    raise NotImplementedError()
414 class XMLSearch(ScrapeSearch):
415 """A base class for searching XML results."""
416 #: Self-closing tag names to be used when interpreting the XML document
418 #: .. seealso:: `BeautifulSoup: Parsing XML <http://www.crummy.com/software/BeautifulSoup/documentation.html#Parsing%20XML>`_
419 self_closing_tags = []
def parse_response(self, response, limit=None):
    """Parses the ``response`` with :class:`BeautifulStoneSoup`, using :attr:`self_closing_tags` and restricted by :attr:`strainer`, and passes up to ``limit`` top-level matches on to :meth:`parse_results`."""
    soup = BeautifulStoneSoup(
        response,
        selfClosingTags=self.self_closing_tags,
        parseOnlyThese=self.strainer,
    )
    top_level_matches = soup.findAll(recursive=False, limit=limit)
    return self.parse_results(top_level_matches)