philo/contrib/sobol/search.py

   1 #encoding: utf-8
   2 import datetime
   3 from hashlib import sha1
   4
   5 from django.conf import settings
   6 from django.contrib.sites.models import Site
   7 from django.core.cache import cache
   8 from django.db.models.options import get_verbose_name as convert_camelcase
   9 from django.utils import simplejson as json
  10 from django.utils.http import urlquote_plus
  11 from django.utils.safestring import mark_safe
  12 from django.utils.text import capfirst
  13 from django.template import loader, Context, Template, TemplateDoesNotExist
  14
  15 from philo.contrib.sobol.utils import make_tracking_querydict
  16 from philo.utils.registry import Registry
  17
  18
  19 if getattr(settings, 'SOBOL_USE_EVENTLET', False):
  20         try:
  21                 from eventlet.green import urllib2
  22         except:
  23                 import urllib2
  24 else:
  25         import urllib2
  26
  27
  28 __all__ = (
  29         'Result', 'BaseSearch', 'DatabaseSearch', 'URLSearch', 'JSONSearch', 'GoogleSearch', 'registry', 'get_search_instance'
  30 )
  31
  32
  33 SEARCH_CACHE_SEED = 'philo_sobol_search_results'
  34 USE_CACHE = getattr(settings, 'SOBOL_USE_CACHE', True)
  35
  36
  37 #: A registry for :class:`BaseSearch` subclasses that should be available in the admin.
  38 registry = Registry()
  39
  40
  41 def _make_cache_key(search, search_arg):
  42         return sha1(SEARCH_CACHE_SEED + search.slug + search_arg).hexdigest()
  43
  44
  45 def get_search_instance(slug, search_arg):
  46         """Returns a search instance for the given slug, either from the cache or newly-instantiated."""
  47         search = registry[slug]
  48         search_arg = search_arg.lower()
  49         if USE_CACHE:
  50                 key = _make_cache_key(search, search_arg)
  51                 cached = cache.get(key)
  52                 if cached:
  53                         return cached
  54         instance = search(search_arg)
  55         instance.slug = slug
  56         return instance
  57
  58
  59 class Result(object):
  60         """
  61         :class:`Result` is a helper class that, given a search and a result of that search, is able to correctly render itself with a template defined by the search. Every :class:`Result` will pass a ``title``, a ``url`` (if applicable), and the raw ``result`` returned by the search into the template context when rendering.
  62
  63         :param search: An instance of a :class:`BaseSearch` subclass or an object that implements the same API.
  64         :param result: An arbitrary result from the ``search``.
  65
  66         """
  67         def __init__(self, search, result):
  68                 self.search = search
  69                 self.result = result
  70
  71         def get_title(self):
  72                 """Returns the title of the result by calling :meth:`BaseSearch.get_result_title` on the raw result."""
  73                 return self.search.get_result_title(self.result)
  74
  75         def get_url(self):
  76                 """Returns the url of the result or ``None`` by calling :meth:`BaseSearch.get_result_url` on the raw result. This url will contain a querystring which, if used, will track a :class:`.Click` for the actual url."""
  77                 return self.search.get_result_url(self.result)
  78
  79         def get_actual_url(self):
  80                 """Returns the actual url of the result by calling :meth:`BaseSearch.get_actual_result_url` on the raw result."""
  81                 return self.search.get_actual_result_url(self.result)
  82
  83         def get_content(self):
  84                 """Returns the content of the result by calling :meth:`BaseSearch.get_result_content` on the raw result."""
  85                 return self.search.get_result_content(self.result)
  86
  87         def get_template(self):
  88                 """Returns the template which will be used to render the :class:`Result` by calling :meth:`BaseSearch.get_result_template` on the raw result."""
  89                 return self.search.get_result_template(self.result)
  90
  91         def get_context(self):
  92                 """
  93                 Returns the context dictionary for the result. This is used both in rendering the result and in the AJAX return value for :meth:`.SearchView.ajax_api_view`. The context will contain the following keys:
  94
  95                 title
  96                         The result of calling :meth:`get_title`
  97                 url
  98                         The result of calling :meth:`get_url`
  99                 content
 100                         The result of calling :meth:`get_content`
 101
 102                 """
 103                 if not hasattr(self, '_context'):
 104                         self._context = {
 105                                 'title': self.get_title(),
 106                                 'url': self.get_url(),
 107                                 'actual_url': self.get_actual_url(),
 108                                 'content': self.get_content()
 109                         }
 110                 return self._context
 111
 112         def render(self):
 113                 """Returns the template from :meth:`get_template` rendered with the context from :meth:`get_context`."""
 114                 t = self.get_template()
 115                 c = Context(self.get_context())
 116                 return t.render(c)
 117
 118         def __unicode__(self):
 119                 """Returns :meth:`render`"""
 120                 return self.render()
 121
 122
 123 class BaseSearchMetaclass(type):
 124         def __new__(cls, name, bases, attrs):
 125                 if 'verbose_name' not in attrs:
 126                         attrs['verbose_name'] = capfirst(' '.join(convert_camelcase(name).rsplit(' ', 1)[:-1]))
 127                 if 'slug' not in attrs:
 128                         attrs['slug'] = name[:-6].lower() if name.endswith("Search") else name.lower()
 129                 return super(BaseSearchMetaclass, cls).__new__(cls, name, bases, attrs)
 130
 131
 132 class BaseSearch(object):
 133         """
 134         Defines a generic search api. Accessing :attr:`results` will attempt to retrieve cached results and, if that fails, will initiate a new search and store the results in the cache. Each search has a ``verbose_name`` and a ``slug``. If these are not provided as attributes, they will be automatically generated based on the name of the class.
 135
 136         :param search_arg: The string which is being searched for.
 137
 138         """
 139         __metaclass__ = BaseSearchMetaclass
 140         #: The number of results to return from the complete list. Default: 5
 141         result_limit = 5
 142         #: How long the items for the search should be cached (in minutes). Default: 48 hours.
 143         _cache_timeout = 60*48
 144         #: The path to the template which will be used to render the :class:`Result`\ s for this search. If this is ``None``, then the framework will try ``sobol/search/<slug>/result.html`` and ``sobol/search/result.html``.
 145         result_template = None
 146         #: The path to the template which will be used to generate the title of the :class:`Result`\ s for this search. If this is ``None``, then the framework will try ``sobol/search/<slug>/title.html`` and ``sobol/search/title.html``.
 147         title_template = None
 148         #: The path to the template which will be used to generate the content of the :class:`Result`\ s for this search. If this is ``None``, then the framework will try ``sobol/search/<slug>/content.html`` and ``sobol/search/content.html``.
 149         content_template = None
 150
 151         def __init__(self, search_arg):
 152                 self.search_arg = search_arg
 153
 154         @property
 155         def results(self):
 156                 """Retrieves cached results or initiates a new search via :meth:`get_results` and caches the results."""
 157                 if not hasattr(self, '_results'):
 158                         try:
 159                                 # Cache one extra result so we can see if there are
 160                                 # more results to be had.
 161                                 limit = self.result_limit
 162                                 if limit is not None:
 163                                         limit += 1
 164                                 results = self.get_results(limit)
 165                         except:
 166                                 if settings.DEBUG:
 167                                         raise
 168                                 #  On exceptions, don't set any cache; just return.
 169                                 return []
 170
 171                         self._results = results
 172
 173                         if USE_CACHE:
 174                                 for result in results:
 175                                         result.get_context()
 176                                 key = _make_cache_key(self, self.search_arg)
 177                                 cache.set(key, self, self._cache_timeout)
 178
 179                 return self._results
 180
 181         def get_results(self, limit=None, result_class=Result):
 182                 """
 183                 Calls :meth:`search` and parses the return value into :class:`Result` instances.
 184
 185                 :param limit: Passed directly to :meth:`search`.
 186                 :param result_class: The class used to represent the results. This will be instantiated with the :class:`BaseSearch` instance and the raw result from the search.
 187
 188                 """
 189                 results = self.search(limit)
 190                 return [result_class(self, result) for result in results]
 191
 192         def search(self, limit=None):
 193                 """Returns an iterable of up to ``limit`` results. The :meth:`get_result_title`, :meth:`get_result_url`, :meth:`get_result_template`, and :meth:`get_result_extra_context` methods will be used to interpret the individual items that this function returns, so the result can be an object with attributes as easily as a dictionary with keys. However, keep in mind that the raw results will be stored with django's caching mechanisms and will be converted to JSON."""
 194                 raise NotImplementedError
 195
 196         def get_actual_result_url(self, result):
 197                 """Returns the actual URL for the ``result`` or ``None`` if there is no URL. Must be implemented by subclasses."""
 198                 raise NotImplementedError
 199
 200         def get_result_querydict(self, result):
 201                 """Returns a querydict for tracking selection of the result, or ``None`` if there is no URL for the result."""
 202                 url = self.get_actual_result_url(result)
 203                 if url is None:
 204                         return None
 205                 return make_tracking_querydict(self.search_arg, url)
 206
 207         def get_result_url(self, result):
 208                 """Returns ``None`` or a url which, when accessed, will register a :class:`.Click` for that url."""
 209                 qd = self.get_result_querydict(result)
 210                 if qd is None:
 211                         return None
 212                 return "?%s" % qd.urlencode()
 213
 214         def get_result_title(self, result):
 215                 """Returns the title of the ``result``. By default, renders ``sobol/search/<slug>/title.html`` or ``sobol/search/title.html`` with the result in the context. This can be overridden by setting :attr:`title_template` or simply overriding :meth:`get_result_title`. If no template can be found, this will raise :exc:`TemplateDoesNotExist`."""
 216                 return loader.render_to_string(self.title_template or [
 217                         'sobol/search/%s/title.html' % self.slug,
 218                         'sobol/search/title.html'
 219                 ], {'result': result})
 220
 221         def get_result_content(self, result):
 222                 """Returns the content for the ``result``. By default, renders ``sobol/search/<slug>/content.html`` or ``sobol/search/content.html`` with the result in the context. This can be overridden by setting :attr:`content_template` or simply overriding :meth:`get_result_content`. If no template is found, this will return an empty string."""
 223                 try:
 224                         return loader.render_to_string(self.content_template or [
 225                                 'sobol/search/%s/content.html' % self.slug,
 226                                 'sobol/search/content.html'
 227                         ], {'result': result})
 228                 except TemplateDoesNotExist:
 229                         return ""
 230
 231         def get_result_template(self, result):
 232                 """Returns the template to be used for rendering the ``result``. For a search with slug ``google``, this would first try ``sobol/search/google/result.html``, then fall back on ``sobol/search/result.html``. Subclasses can override this by setting :attr:`result_template` to the path of another template."""
 233                 if self.result_template:
 234                         return loader.get_template(self.result_template)
 235                 return loader.select_template([
 236                         'sobol/search/%s/result.html' % self.slug,
 237                         'sobol/search/result.html'
 238                 ])
 239
 240         @property
 241         def has_more_results(self):
 242                 """Returns ``True`` if there are more results than :attr:`result_limit` and ``False`` otherwise."""
 243                 return len(self.results) > self.result_limit
 244
 245         def get_actual_more_results_url(self):
 246                 """Returns the actual url for more results. By default, simply returns ``None``."""
 247                 return None
 248
 249         def get_more_results_querydict(self):
 250                 """Returns a :class:`QueryDict` for tracking whether people click on a 'more results' link."""
 251                 url = self.get_actual_more_results_url()
 252                 if url:
 253                         return make_tracking_querydict(self.search_arg, url)
 254                 return None
 255
 256         @property
 257         def more_results_url(self):
 258                 """Returns a URL which consists of a querystring which, when accessed, will log a :class:`.Click` for the actual URL."""
 259                 qd = self.get_more_results_querydict()
 260                 if qd is None:
 261                         return None
 262                 return "?%s" % qd.urlencode()
 263
 264         def __unicode__(self):
 265                 return self.verbose_name
 266
 267
 268 class DatabaseSearch(BaseSearch):
 269         """Implements :meth:`~BaseSearch.search` and :meth:`get_queryset` methods to handle database queries."""
 270         #: The model which should be searched by the :class:`DatabaseSearch`.
 271         model = None
 272
 273         def search(self, limit=None):
 274                 if not hasattr(self, '_qs'):
 275                         self._qs = self.get_queryset()
 276                         if limit is not None:
 277                                 self._qs = self._qs[:limit]
 278
 279                 return self._qs
 280
 281         def get_queryset(self):
 282                 """Returns a :class:`QuerySet` of all instances of :attr:`model`. This method should be overridden by subclasses to specify how the search should actually be implemented for the model."""
 283                 return self.model._default_manager.all()
 284
 285
 286 class URLSearch(BaseSearch):
 287         """Defines a generic interface for searches that require accessing a certain url to get search results."""
 288         #: The base URL which will be accessed to get the search results.
 289         search_url = ''
 290         #: The url-encoded query string to be used for fetching search results from :attr:`search_url`. Must have one ``%s`` to contain the search argument.
 291         query_format_str = "%s"
 292
 293         @property
 294         def url(self):
 295                 """The URL where the search gets its results. Composed from :attr:`search_url` and :attr:`query_format_str`."""
 296                 return self.search_url + self.query_format_str % urlquote_plus(self.search_arg)
 297
 298         def get_actual_more_results_url(self):
 299                 return self.url
 300
 301         def parse_response(self, response, limit=None):
 302                 """Handles the ``response`` from accessing :attr:`url` (with :func:`urllib2.urlopen`) and returns a list of up to ``limit`` results."""
 303                 raise NotImplementedError
 304
 305         def search(self, limit=None):
 306                 return self.parse_response(urllib2.urlopen(self.url), limit=limit)
 307
 308
 309 class JSONSearch(URLSearch):
 310         """Makes a GET request and parses the results as JSON. The default behavior assumes that the response contains a list of results."""
 311         def parse_response(self, response, limit=None):
 312                 return json.loads(response.read())[:limit]
 313
 314
 315 class GoogleSearch(JSONSearch):
 316         """An example implementation of a :class:`JSONSearch`."""
 317         search_url = "http://ajax.googleapis.com/ajax/services/search/web"
 318         _cache_timeout = 60
 319         verbose_name = "Google search (current site)"
 320         _more_results_url = None
 321
 322         @property
 323         def query_format_str(self):
 324                 default_args = self.default_args
 325                 if default_args:
 326                         default_args += " "
 327                 return "?v=1.0&q=%s%%s" % urlquote_plus(default_args).replace('%', '%%')
 328
 329         @property
 330         def default_args(self):
 331                 """Unquoted default arguments for the :class:`GoogleSearch`."""
 332                 return "site:%s" % Site.objects.get_current().domain
 333
 334         def parse_response(self, response, limit=None):
 335                 responseData = json.loads(response.read())['responseData']
 336                 results, cursor = responseData['results'], responseData['cursor']
 337
 338                 if results:
 339                         self._more_results_url = cursor['moreResultsUrl']
 340                         self._estimated_result_count = cursor['estimatedResultCount']
 341
 342                 return results[:limit]
 343
 344         @property
 345         def url(self):
 346                 # Google requires that an ajax request have a proper Referer header.
 347                 return urllib2.Request(
 348                         super(GoogleSearch, self).url,
 349                         None,
 350                         {'Referer': "http://%s" % Site.objects.get_current().domain}
 351                 )
 352
 353         @property
 354         def has_more_results(self):
 355                 if self.results and len(self.results) < self._estimated_result_count:
 356                         return True
 357                 return False
 358
 359         def get_actual_more_results_url(self):
 360                 return self._more_results_url
 361
 362         def get_actual_result_url(self, result):
 363                 return result['unescapedUrl']
 364
 365         def get_result_title(self, result):
 366                 return mark_safe(result['titleNoFormatting'])
 367
 368         def get_result_content(self, result):
 369                 return mark_safe(result['content'])
 370
 371
 372 registry.register(GoogleSearch)
 373
 374
 375 try:
 376         from BeautifulSoup import BeautifulSoup, SoupStrainer, BeautifulStoneSoup
 377 except:
 378         pass
 379 else:
 380         __all__ += ('ScrapeSearch', 'XMLSearch',)
 381         class ScrapeSearch(URLSearch):
 382                 """A base class for scrape-style searching, available if :mod:`BeautifulSoup` is installed."""
 383                 #: Arguments to be passed into a :class:`SoupStrainer`.
 384                 strainer_args = []
 385                 #: Keyword arguments to be passed into a :class:`SoupStrainer`.
 386                 strainer_kwargs = {}
 387
 388                 @property
 389                 def strainer(self):
 390                         """
 391                         Caches and returns a :class:`SoupStrainer` initialized with :attr:`strainer_args` and :attr:`strainer_kwargs`. This strainer will be used to parse only certain parts of the document.
 392
 393                         .. seealso:: `BeautifulSoup: Improving Performance by Parsing Only Part of the Document <http://www.crummy.com/software/BeautifulSoup/documentation.html#Improving%20Performance%20by%20Parsing%20Only%20Part%20of%20the%20Document>`_
 394
 395                         """
 396                         if not hasattr(self, '_strainer'):
 397                                 self._strainer = SoupStrainer(*self.strainer_args, **self.strainer_kwargs)
 398                         return self._strainer
 399
 400                 def parse_response(self, response, limit=None):
 401                         strainer = self.strainer
 402                         soup = BeautifulSoup(response, parseOnlyThese=strainer)
 403                         return self.parse_results(soup.findAll(recursive=False, limit=limit))
 404
 405                 def parse_results(self, results):
 406                         """
 407                         Provides a hook for parsing the results of straining. This has no default behavior and must be implemented by subclasses because the results absolutely must be parsed to properly extract the information.
 408
 409                         .. seealso:: `BeautifulSoup: Improving Memory Usage with extract <http://www.crummy.com/software/BeautifulSoup/documentation.html#Improving%20Memory%20Usage%20with%20extract>`_
 410                         """
 411                         raise NotImplementedError
 412
 413
 414         class XMLSearch(ScrapeSearch):
 415                 """A base class for searching XML results."""
 416                 #: Self-closing tag names to be used when interpreting the XML document
 417                 #:
 418                 #: .. seealso:: `BeautifulSoup: Parsing XML <http://www.crummy.com/software/BeautifulSoup/documentation.html#Parsing%20XML>`_
 419                 self_closing_tags = []
 420
 421                 def parse_response(self, response, limit=None):
 422                         strainer = self.strainer
 423                         soup = BeautifulStoneSoup(response, selfClosingTags=self.self_closing_tags, parseOnlyThese=strainer)
 424                         return self.parse_results(soup.findAll(recursive=False, limit=limit))