4 from django.conf import settings
5 from django.contrib.sites.models import Site
6 from django.core.cache import cache
7 from django.db.models.options import get_verbose_name as convert_camelcase
8 from django.utils import simplejson as json
9 from django.utils.http import urlquote_plus
10 from django.utils.safestring import mark_safe
11 from django.utils.text import capfirst
12 from django.template import loader, Context, Template
14 from philo.contrib.sobol.utils import make_tracking_querydict
15 from philo.utils.registry import Registry
18 if getattr(settings, 'SOBOL_USE_EVENTLET', False):
20 from eventlet.green import urllib2
28 'Result', 'BaseSearch', 'DatabaseSearch', 'URLSearch', 'JSONSearch', 'GoogleSearch', 'registry'
# Key under which the search results are kept in Django's cache.
SEARCH_CACHE_KEY = 'philo_sobol_search_results'
# Fallback markup for one result: the title, wrapped in a link when a url is present.
DEFAULT_RESULT_TEMPLATE_STRING = (
    "{% if url %}<a href='{{ url }}'>{% endif %}"
    "{{ title }}"
    "{% if url %}</a>{% endif %}"
)
# Compiled once at import time and used when a search defines no template of its own.
DEFAULT_RESULT_TEMPLATE = Template(DEFAULT_RESULT_TEMPLATE_STRING)
# Determines the timeout on the entire result cache. This value is handed
# straight to ``cache.set``, whose timeout is measured in seconds, so one week
# is spelled out in seconds here. (The per-search ``_cache_timeout`` is a
# separate value that is tracked in minutes.)
MAX_CACHE_TIMEOUT = 60*60*24*7
40 #: A registry for :class:`BaseSearch` subclasses that should be available in the admin.
46 :class:`Result` is a helper class that, given a search and a result of that search, is able to correctly render itself with a template defined by the search. Every :class:`Result` will pass a ``title``, a ``url`` (if applicable), and the raw ``result`` returned by the search into the template context when rendering.
48 :param search: An instance of a :class:`BaseSearch` subclass or an object that implements the same API.
49 :param result: An arbitrary result from the ``search``.
52 def __init__(self, search, result):
57 """Returns the title of the result by calling :meth:`BaseSearch.get_result_title` on the raw result."""
58 return self.search.get_result_title(self.result)
61 """Returns the url of the result or an empty string by calling :meth:`BaseSearch.get_result_querydict` on the raw result and then encoding the querydict returned."""
62 qd = self.search.get_result_querydict(self.result)
65 return "?%s" % qd.urlencode()
def get_template(self):
    """Asks the owning search for the template of this result by passing the raw result to :meth:`BaseSearch.get_result_template`."""
    search, raw_result = self.search, self.result
    return search.get_result_template(raw_result)
def get_extra_context(self):
    """Asks the owning search for extra context for this result by passing the raw result to :meth:`BaseSearch.get_result_extra_context`."""
    search, raw_result = self.search, self.result
    return search.get_result_extra_context(raw_result)
75 def get_context(self):
77 Returns the context dictionary for the result. This is used both in rendering the result and in the AJAX return value for :meth:`.SearchView.ajax_api_view`. The context will contain everything from :meth:`get_extra_context` as well as the following keys:
80 The result of calling :meth:`get_title`
82 The result of calling :meth:`get_url`
84 The raw result which the :class:`Result` was instantiated with.
87 context = self.get_extra_context()
89 'title': self.get_title(),
90 'url': self.get_url(),
96 """Returns the template from :meth:`get_template` rendered with the context from :meth:`get_context`."""
97 t = self.get_template()
98 c = Context(self.get_context())
101 def __unicode__(self):
102 """Returns :meth:`render`"""
class BaseSearchMetaclass(type):
    """Metaclass that fills in ``verbose_name`` and ``slug`` for classes whose body does not define them."""
    def __new__(cls, name, bases, attrs):
        if 'verbose_name' not in attrs:
            # Split off the last word of the converted class name and keep
            # only what precedes it, with the first letter capitalised.
            words = convert_camelcase(name).rsplit(' ', 1)
            leading_words = words[:-1]
            attrs['verbose_name'] = capfirst(' '.join(leading_words))
        if 'slug' not in attrs:
            # The slug defaults to the lower-cased class name.
            attrs['slug'] = name.lower()
        new_class = super(BaseSearchMetaclass, cls).__new__(cls, name, bases, attrs)
        return new_class
115 class BaseSearch(object):
117 Defines a generic search api. Accessing :attr:`results` will attempt to retrieve cached results and, if that fails, will initiate a new search and store the results in the cache. Each search has a ``verbose_name`` and a ``slug``. If these are not provided as attributes, they will be automatically generated based on the name of the class.
119 :param search_arg: The string which is being searched for.
122 __metaclass__ = BaseSearchMetaclass
123 #: The number of results to return from the complete list. Default: 10
125 #: How long the items for the search should be cached (in minutes). Default: 48 hours.
126 _cache_timeout = 60*48
128 def __init__(self, search_arg):
129 self.search_arg = search_arg
def _get_cached_results(self):
    """Looks this search up in the shared result cache and returns the stored results while their recorded timeout has not passed. Returns ``None`` in every other case."""
    all_cached = cache.get(SEARCH_CACHE_KEY)
    if not all_cached:
        return None
    search_class = self.__class__
    if search_class not in all_cached:
        return None
    # Entries are keyed by the lower-cased search argument.
    key = self.search_arg.lower()
    entries = all_cached[search_class]
    if key not in entries:
        return None
    entry = entries[key]
    if entry['timeout'] < datetime.datetime.now():
        # The entry has expired.
        return None
    return entry['results']
140 def _set_cached_results(self, results, timeout):
141 """Sets the results to the cache for <timeout> minutes."""
142 result_cache = cache.get(SEARCH_CACHE_KEY) or {}
143 cached = result_cache.setdefault(self.__class__, {}).setdefault(self.search_arg.lower(), {})
146 'timeout': datetime.datetime.now() + datetime.timedelta(minutes=timeout)
148 cache.set(SEARCH_CACHE_KEY, result_cache, MAX_CACHE_TIMEOUT)
152 """Retrieves cached results or initiates a new search via :meth:`get_results` and caches the results."""
153 if not hasattr(self, '_results'):
154 results = self._get_cached_results()
157 # Cache one extra result so we can see if there are
158 # more results to be had.
159 limit = self.result_limit
160 if limit is not None:
162 results = self.get_results(limit)
166 # On exceptions, don't set any cache; just return.
169 self._set_cached_results(results, self._cache_timeout)
170 self._results = results
def get_results(self, limit=None, result_class=Result):
    """
    Runs :meth:`search` and wraps each raw result it yields in a ``result_class`` instance.

    :param limit: Passed unchanged to :meth:`search`.
    :param result_class: The class used to represent the results. Each instance is created from this search and one raw result.
    :returns: A list of ``result_class`` instances in the order the search produced them.

    """
    wrapped = []
    for raw_result in self.search(limit):
        wrapped.append(result_class(self, raw_result))
    return wrapped
def search(self, limit=None):
    """
    Returns an iterable of up to ``limit`` results. Must be implemented by subclasses.

    The individual items are interpreted through :meth:`get_result_title`, :meth:`get_result_url`, :meth:`get_result_template`, and :meth:`get_result_extra_context`, so a result can be an object with attributes as easily as a dictionary with keys. However, keep in mind that the raw results will be stored with Django's caching mechanisms and will be converted to JSON.

    """
    raise NotImplementedError()
def get_result_title(self, result):
    """Returns the title of the ``result``. There is no default behavior: subclasses must override this method."""
    raise NotImplementedError()
def get_result_url(self, result):
    """Returns the actual URL for the ``result``, or ``None`` if there is no URL. There is no default behavior: subclasses must override this method."""
    raise NotImplementedError()
def get_result_querydict(self, result):
    """
    Returns a querydict for tracking selection of the ``result``, or ``None`` if there is no URL for the result.

    :param result: A raw result as produced by :meth:`search`.
    :returns: The value of ``make_tracking_querydict(self.search_arg, url)``, where ``url`` comes from :meth:`get_result_url`, or ``None`` when that URL is ``None``.

    """
    url = self.get_result_url(result)
    # Honour the documented contract: with no URL there is nothing to track,
    # so no querydict is built.
    if url is None:
        return None
    return make_tracking_querydict(self.search_arg, url)
def get_result_template(self, result):
    """Returns the template used to render the ``result``: the one loaded from ``result_template`` when the search defines that attribute, otherwise the default result template, which is remembered on ``_result_template``."""
    if hasattr(self, 'result_template'):
        template_name = self.result_template
        return loader.get_template(template_name)
    needs_default = not hasattr(self, '_result_template')
    if needs_default:
        self._result_template = DEFAULT_RESULT_TEMPLATE
    return self._result_template
def get_result_extra_context(self, result):
    """
    Returns any extra context to be used when rendering the ``result``.

    The default implementation contributes nothing and returns a new, empty dictionary on every call, so callers may safely add their own keys to the returned value.

    :param result: A raw result as produced by :meth:`search`.
    :returns: A dictionary of extra context.

    """
    return {}
def has_more_results(self):
    """
    Returns ``True`` if there are more results than :attr:`result_limit` and ``False`` otherwise.

    When :attr:`result_limit` is ``None`` no limit applies, so the answer is always ``False``.

    """
    limit = self.result_limit
    if limit is None:
        # Comparing a length against None is always True on Python 2 and a
        # TypeError on Python 3; neither is a meaningful answer here.
        return False
    return len(self.results) > limit
def more_results_url(self):
    """Returns the actual url for more results. Templates should go through :attr:`more_results_querydict` instead so that the click can be tracked. There is no default behavior: subclasses must override this."""
    raise NotImplementedError()
def more_results_querydict(self):
    """Returns a :class:`QueryDict`, built from the search argument and :attr:`more_results_url`, for tracking whether people click on a 'more results' link."""
    search_arg = self.search_arg
    target_url = self.more_results_url
    return make_tracking_querydict(search_arg, target_url)
230 def __unicode__(self):
231 return self.verbose_name
234 class DatabaseSearch(BaseSearch):
235 """Implements :meth:`~BaseSearch.search` and :meth:`get_queryset` methods to handle database queries."""
236 #: The model which should be searched by the :class:`DatabaseSearch`.
def search(self, limit=None):
    """
    Returns the queryset from :meth:`get_queryset`, sliced to at most ``limit`` items.

    The queryset is built once per instance and remembered on ``_qs``; later calls return the remembered value.

    :param limit: The maximum number of results, or ``None`` to leave the queryset unsliced.
    :returns: The (possibly sliced) queryset.

    """
    if not hasattr(self, '_qs'):
        queryset = self.get_queryset()
        # NOTE(review): the limit is applied only when the queryset is first
        # built; confirm this matches the intended nesting of the original.
        if limit is not None:
            queryset = queryset[:limit]
        self._qs = queryset
    # The search must hand its results back: get_results iterates over the
    # value returned here.
    return self._qs
def get_queryset(self):
    """Returns a :class:`QuerySet` of all instances of :attr:`model`, obtained from the model's default manager. Subclasses should override this method to specify how the search is actually carried out for the model."""
    manager = self.model._default_manager
    return manager.all()
252 class URLSearch(BaseSearch):
253 """Defines a generic interface for searches that require accessing a certain url to get search results."""
254 #: The base URL which will be accessed to get the search results.
256 #: The url-encoded query string to be used for fetching search results from :attr:`search_url`. Must have one ``%s`` to contain the search argument.
257 query_format_str = "%s"
261 """The URL where the search gets its results. Composed from :attr:`search_url` and :attr:`query_format_str`."""
262 return self.search_url + self.query_format_str % urlquote_plus(self.search_arg)
265 def more_results_url(self):
def parse_response(self, response, limit=None):
    """Turns the ``response`` obtained by opening :attr:`url` (with :func:`urllib2.urlopen`) into a list of up to ``limit`` results. There is no default behavior: subclasses must override this method."""
    raise NotImplementedError()
def search(self, limit=None):
    """Opens :attr:`url` with :func:`urllib2.urlopen` and returns whatever :meth:`parse_response` makes of the response, passing ``limit`` along."""
    response = urllib2.urlopen(self.url)
    return self.parse_response(response, limit=limit)
class JSONSearch(URLSearch):
    """A :class:`URLSearch` whose response body is JSON. By default the decoded response is assumed to be a list of results."""
    def parse_response(self, response, limit=None):
        """Decodes the body of ``response`` as JSON and returns its first ``limit`` items (all of them when ``limit`` is ``None``)."""
        body = response.read()
        decoded = json.loads(body)
        return decoded[:limit]
282 class GoogleSearch(JSONSearch):
283 """An example implementation of a :class:`JSONSearch`."""
284 search_url = "http://ajax.googleapis.com/ajax/services/search/web"
285 result_template = 'search/googlesearch.html'
287 verbose_name = "Google search (current site)"
def query_format_str(self):
    """
    Returns the query string format for the Google request.

    The default arguments are url-quoted and placed in front of a single ``%s`` placeholder, which later receives the search argument.

    :returns: A format string containing exactly one ``%s``.

    """
    default_args = self.default_args
    if default_args:
        # Keep the default arguments and the user's search term apart.
        # Without a separator the two run together into a single term.
        default_args += " "
    # Percent signs produced by quoting are doubled so that they survive the
    # later ``%`` formatting that fills in the search argument.
    return "?v=1.0&q=%s%%s" % urlquote_plus(default_args).replace('%', '%%')
def default_args(self):
    """Unquoted default arguments for the :class:`GoogleSearch`: a ``site:`` restriction naming the domain of the current :class:`Site`."""
    current_site = Site.objects.get_current()
    return "site:%s" % current_site.domain
def parse_response(self, response, limit=None):
    """Decodes the JSON body of ``response``, remembers the cursor's ``moreResultsUrl`` and ``estimatedResultCount`` on the instance, and returns the first ``limit`` entries of ``responseData['results']``."""
    payload = json.loads(response.read())
    response_data = payload['responseData']
    results = response_data['results']
    cursor = response_data['cursor']
    # Kept on the instance for later use (e.g. by the more-results url).
    self._more_results_url = cursor['moreResultsUrl']
    self._estimated_result_count = cursor['estimatedResultCount']
    return results[:limit]
313 # Google requires that an ajax request have a proper Referer header.
314 return urllib2.Request(
315 super(GoogleSearch, self).url,
317 {'Referer': "http://%s" % Site.objects.get_current().domain}
321 def has_more_results(self):
322 if self.results and len(self.results) < self._estimated_result_count:
def more_results_url(self):
    """Returns the more-results url stored on the instance as ``_more_results_url``."""
    stored_url = self._more_results_url
    return stored_url
def get_result_title(self, result):
    """Returns the ``titleNoFormatting`` entry of the ``result``."""
    title = result['titleNoFormatting']
    return title
def get_result_url(self, result):
    """Returns the ``unescapedUrl`` entry of the ``result``."""
    url = result['unescapedUrl']
    return url
337 registry.register(GoogleSearch)
341 from BeautifulSoup import BeautifulSoup, SoupStrainer, BeautifulStoneSoup
345 __all__ += ('ScrapeSearch', 'XMLSearch',)
346 class ScrapeSearch(URLSearch):
347 """A base class for scrape-style searching, available if :mod:`BeautifulSoup` is installed."""
348 #: Arguments to be passed into a :class:`SoupStrainer`.
350 #: Keyword arguments to be passed into a :class:`SoupStrainer`.
356 Caches and returns a :class:`SoupStrainer` initialized with :attr:`strainer_args` and :attr:`strainer_kwargs`. This strainer will be used to parse only certain parts of the document.
358 .. seealso:: `BeautifulSoup: Improving Performance by Parsing Only Part of the Document <http://www.crummy.com/software/BeautifulSoup/documentation.html#Improving%20Performance%20by%20Parsing%20Only%20Part%20of%20the%20Document>`_
361 if not hasattr(self, '_strainer'):
362 self._strainer = SoupStrainer(*self.strainer_args, **self.strainer_kwargs)
363 return self._strainer
def parse_response(self, response, limit=None):
    """Parses the ``response`` with :class:`BeautifulSoup`, restricted by :attr:`strainer`, and passes up to ``limit`` top-level matches to :meth:`parse_results`."""
    only_these = self.strainer
    soup = BeautifulSoup(response, parseOnlyThese=only_these)
    # recursive=False keeps the search to the top level of the parsed soup.
    top_level_matches = soup.findAll(recursive=False, limit=limit)
    return self.parse_results(top_level_matches)
def parse_results(self, results):
    """
    A hook for parsing the results of straining. There is no default behavior: subclasses must implement this, because the results absolutely must be parsed to properly extract the information.

    .. seealso:: `BeautifulSoup: Improving Memory Usage with extract <http://www.crummy.com/software/BeautifulSoup/documentation.html#Improving%20Memory%20Usage%20with%20extract>`_

    """
    raise NotImplementedError()
class XMLSearch(ScrapeSearch):
    """A base class for searches whose responses are XML documents."""
    #: Self-closing tag names to be used when interpreting the XML document
    #:
    #: .. seealso:: `BeautifulSoup: Parsing XML <http://www.crummy.com/software/BeautifulSoup/documentation.html#Parsing%20XML>`_
    self_closing_tags = []

    def parse_response(self, response, limit=None):
        """Parses the ``response`` with :class:`BeautifulStoneSoup`, using :attr:`self_closing_tags` and restricted by :attr:`strainer`, and passes up to ``limit`` top-level matches to :meth:`parse_results`."""
        only_these = self.strainer
        soup = BeautifulStoneSoup(
            response,
            selfClosingTags=self.self_closing_tags,
            parseOnlyThese=only_these
        )
        top_level_matches = soup.findAll(recursive=False, limit=limit)
        return self.parse_results(top_level_matches)