philo/contrib/sobol/search.py

   1 #encoding: utf-8
   2 import datetime
   3 from hashlib import sha1
   4
   5 from django.conf import settings
   6 from django.contrib.sites.models import Site
   7 from django.core.cache import cache
   8 from django.db.models.options import get_verbose_name as convert_camelcase
   9 from django.utils import simplejson as json
  10 from django.utils.http import urlquote_plus
  11 from django.utils.safestring import mark_safe
  12 from django.utils.text import capfirst
  13 from django.template import loader, Context, Template, TemplateDoesNotExist
  14
  15 from philo.contrib.sobol.utils import make_tracking_querydict, RegistryIterator
  16
  17
  18 if getattr(settings, 'SOBOL_USE_EVENTLET', False):
  19         try:
  20                 from eventlet.green import urllib2
  21         except:
  22                 import urllib2
  23 else:
  24         import urllib2
  25
  26
  27 __all__ = (
  28         'Result', 'BaseSearch', 'DatabaseSearch', 'URLSearch', 'JSONSearch', 'GoogleSearch', 'SearchRegistry', 'registry', 'get_search_instance'
  29 )
  30
  31
  32 SEARCH_CACHE_SEED = 'philo_sobol_search_results'
  33 USE_CACHE = getattr(settings, 'SOBOL_USE_CACHE', True)
  34
  35
  36 class RegistrationError(Exception):
  37         """Raised if there is a problem registering a search with a :class:`SearchRegistry`"""
  38         pass
  39
  40
  41 class SearchRegistry(object):
  42         """Holds a registry of search types by slug."""
  43
  44         def __init__(self):
  45                 self._registry = {}
  46
  47         def register(self, search, slug=None):
  48                 """
  49                 Register a search with the registry.
  50
  51                 :param search: The search class to register - generally a subclass of :class:`BaseSearch`
  52                 :param slug: The slug which will be used to register the search class. If ``slug`` is ``None``, the search's default slug will be used.
  53                 :raises: :class:`RegistrationError` if a different search is already registered with ``slug``.
  54
  55                 """
  56                 slug = slug or search.slug
  57                 if slug in self._registry:
  58                         registered = self._registry[slug]
  59                         if registered.__module__ != search.__module__:
  60                                 raise RegistrationError("A different search is already registered as `%s`" % slug)
  61                 else:
  62                         self._registry[slug] = search
  63
  64         def unregister(self, search, slug=None):
  65                 """
  66                 Unregister a search from the registry.
  67
  68                 :param search: The search class to unregister - generally a subclass of :class:`BaseSearch`
  69                 :param slug: If provided, the search will only be removed if it was registered with ``slug``. If not provided, the search class will be unregistered no matter what slug it was registered with.
  70                 :raises: :class:`RegistrationError` if a slug is provided but the search registered with that slug is not ``search``.
  71
  72                 """
  73                 if slug is not None:
  74                         if slug in self._registry and self._registry[slug] == search:
  75                                 del self._registry[slug]
  76                         raise RegistrationError("`%s` is not registered as `%s`" % (search, slug))
  77                 else:
  78                         for slug, search in self._registry.items():
  79                                 if search == search:
  80                                         del self._registry[slug]
  81
  82         def items(self):
  83                 """Returns a list of (slug, search) items in the registry."""
  84                 return self._registry.items()
  85
  86         def iteritems(self):
  87                 """Returns an iterator over the (slug, search) pairs in the registry."""
  88                 return RegistryIterator(self._registry, 'iteritems')
  89
  90         def iterchoices(self):
  91                 """Returns an iterator over (slug, search.verbose_name) pairs for the registry."""
  92                 return RegistryIterator(self._registry, 'iteritems', lambda x: (x[0], x[1].verbose_name))
  93
  94         def __getitem__(self, key):
  95                 """Returns the search registered with ``key``."""
  96                 return self._registry[key]
  97
  98         def __iter__(self):
  99                 """Returns an iterator over the keys in the registry."""
 100                 return self._registry.__iter__()
 101
 102
#: The module-level :class:`SearchRegistry` instance. :func:`get_search_instance` looks searches up in it by slug.
registry = SearchRegistry()
 104
 105
 106 def _make_cache_key(search, search_arg):
 107         return sha1(SEARCH_CACHE_SEED + search.slug + search_arg).hexdigest()
 108
 109
 110 def get_search_instance(slug, search_arg):
 111         """Returns a search instance for the given slug, either from the cache or newly-instantiated."""
 112         search = registry[slug]
 113         search_arg = search_arg.lower()
 114         if USE_CACHE:
 115                 key = _make_cache_key(search, search_arg)
 116                 cached = cache.get(key)
 117                 if cached:
 118                         return cached
 119         instance = search(search_arg)
 120         instance.slug = slug
 121         return instance
 122
 123
 124
 125 class Result(object):
 126         """
 127         :class:`Result` is a helper class that, given a search and a result of that search, is able to correctly render itself with a template defined by the search. Every :class:`Result` will pass a ``title``, a ``url`` (if applicable), and the raw ``result`` returned by the search into the template context when rendering.
 128
 129         :param search: An instance of a :class:`BaseSearch` subclass or an object that implements the same API.
 130         :param result: An arbitrary result from the ``search``.
 131
 132         """
 133         def __init__(self, search, result):
 134                 self.search = search
 135                 self.result = result
 136
 137         def get_title(self):
 138                 """Returns the title of the result by calling :meth:`BaseSearch.get_result_title` on the raw result."""
 139                 return self.search.get_result_title(self.result)
 140
 141         def get_url(self):
 142                 """Returns the url of the result or ``None`` by calling :meth:`BaseSearch.get_result_url` on the raw result. This url will contain a querystring which, if used, will track a :class:`.Click` for the actual url."""
 143                 return self.search.get_result_url(self.result)
 144
 145         def get_actual_url(self):
 146                 """Returns the actual url of the result by calling :meth:`BaseSearch.get_actual_result_url` on the raw result."""
 147                 return self.search.get_actual_result_url(self.result)
 148
 149         def get_content(self):
 150                 """Returns the content of the result by calling :meth:`BaseSearch.get_result_content` on the raw result."""
 151                 return self.search.get_result_content(self.result)
 152
 153         def get_template(self):
 154                 """Returns the template which will be used to render the :class:`Result` by calling :meth:`BaseSearch.get_result_template` on the raw result."""
 155                 return self.search.get_result_template(self.result)
 156
 157         def get_context(self):
 158                 """
 159                 Returns the context dictionary for the result. This is used both in rendering the result and in the AJAX return value for :meth:`.SearchView.ajax_api_view`. The context will contain the following keys:
 160
 161                 title
 162                         The result of calling :meth:`get_title`
 163                 url
 164                         The result of calling :meth:`get_url`
 165                 content
 166                         The result of calling :meth:`get_content`
 167
 168                 """
 169                 if not hasattr(self, '_context'):
 170                         self._context = {
 171                                 'title': self.get_title(),
 172                                 'url': self.get_url(),
 173                                 'actual_url': self.get_actual_url(),
 174                                 'content': self.get_content()
 175                         }
 176                 return self._context
 177
 178         def render(self):
 179                 """Returns the template from :meth:`get_template` rendered with the context from :meth:`get_context`."""
 180                 t = self.get_template()
 181                 c = Context(self.get_context())
 182                 return t.render(c)
 183
 184         def __unicode__(self):
 185                 """Returns :meth:`render`"""
 186                 return self.render()
 187
 188
 189 class BaseSearchMetaclass(type):
 190         def __new__(cls, name, bases, attrs):
 191                 if 'verbose_name' not in attrs:
 192                         attrs['verbose_name'] = capfirst(' '.join(convert_camelcase(name).rsplit(' ', 1)[:-1]))
 193                 if 'slug' not in attrs:
 194                         attrs['slug'] = name[:-6].lower() if name.endswith("Search") else name.lower()
 195                 return super(BaseSearchMetaclass, cls).__new__(cls, name, bases, attrs)
 196
 197
 198 class BaseSearch(object):
 199         """
 200         Defines a generic search api. Accessing :attr:`results` will attempt to retrieve cached results and, if that fails, will initiate a new search and store the results in the cache. Each search has a ``verbose_name`` and a ``slug``. If these are not provided as attributes, they will be automatically generated based on the name of the class.
 201
 202         :param search_arg: The string which is being searched for.
 203
 204         """
 205         __metaclass__ = BaseSearchMetaclass
 206         #: The number of results to return from the complete list. Default: 5
 207         result_limit = 5
 208         #: How long the items for the search should be cached (in minutes). Default: 48 hours.
 209         _cache_timeout = 60*48
 210         #: The path to the template which will be used to render the :class:`Result`\ s for this search. If this is ``None``, then the framework will try ``sobol/search/<slug>/result.html`` and ``sobol/search/result.html``.
 211         result_template = None
 212         #: The path to the template which will be used to generate the title of the :class:`Result`\ s for this search. If this is ``None``, then the framework will try ``sobol/search/<slug>/title.html`` and ``sobol/search/title.html``.
 213         title_template = None
 214         #: The path to the template which will be used to generate the content of the :class:`Result`\ s for this search. If this is ``None``, then the framework will try ``sobol/search/<slug>/content.html`` and ``sobol/search/content.html``.
 215         content_template = None
 216
 217         def __init__(self, search_arg):
 218                 self.search_arg = search_arg
 219
 220         @property
 221         def results(self):
 222                 """Retrieves cached results or initiates a new search via :meth:`get_results` and caches the results."""
 223                 if not hasattr(self, '_results'):
 224                         try:
 225                                 # Cache one extra result so we can see if there are
 226                                 # more results to be had.
 227                                 limit = self.result_limit
 228                                 if limit is not None:
 229                                         limit += 1
 230                                 results = self.get_results(limit)
 231                         except:
 232                                 if settings.DEBUG:
 233                                         raise
 234                                 #  On exceptions, don't set any cache; just return.
 235                                 return []
 236
 237                         self._results = results
 238
 239                         if USE_CACHE:
 240                                 for result in results:
 241                                         result.get_context()
 242                                 key = _make_cache_key(self, self.search_arg)
 243                                 cache.set(key, self, self._cache_timeout)
 244
 245                 return self._results
 246
 247         def get_results(self, limit=None, result_class=Result):
 248                 """
 249                 Calls :meth:`search` and parses the return value into :class:`Result` instances.
 250
 251                 :param limit: Passed directly to :meth:`search`.
 252                 :param result_class: The class used to represent the results. This will be instantiated with the :class:`BaseSearch` instance and the raw result from the search.
 253
 254                 """
 255                 results = self.search(limit)
 256                 return [result_class(self, result) for result in results]
 257
 258         def search(self, limit=None):
 259                 """Returns an iterable of up to ``limit`` results. The :meth:`get_result_title`, :meth:`get_result_url`, :meth:`get_result_template`, and :meth:`get_result_extra_context` methods will be used to interpret the individual items that this function returns, so the result can be an object with attributes as easily as a dictionary with keys. However, keep in mind that the raw results will be stored with django's caching mechanisms and will be converted to JSON."""
 260                 raise NotImplementedError
 261
 262         def get_actual_result_url(self, result):
 263                 """Returns the actual URL for the ``result`` or ``None`` if there is no URL. Must be implemented by subclasses."""
 264                 raise NotImplementedError
 265
 266         def get_result_querydict(self, result):
 267                 """Returns a querydict for tracking selection of the result, or ``None`` if there is no URL for the result."""
 268                 url = self.get_actual_result_url(result)
 269                 if url is None:
 270                         return None
 271                 return make_tracking_querydict(self.search_arg, url)
 272
 273         def get_result_url(self, result):
 274                 """Returns ``None`` or a url which, when accessed, will register a :class:`.Click` for that url."""
 275                 qd = self.get_result_querydict(result)
 276                 if qd is None:
 277                         return None
 278                 return "?%s" % qd.urlencode()
 279
 280         def get_result_title(self, result):
 281                 """Returns the title of the ``result``. By default, renders ``sobol/search/<slug>/title.html`` or ``sobol/search/title.html`` with the result in the context. This can be overridden by setting :attr:`title_template` or simply overriding :meth:`get_result_title`. If no template can be found, this will raise :exc:`TemplateDoesNotExist`."""
 282                 return loader.render_to_string(self.title_template or [
 283                         'sobol/search/%s/title.html' % self.slug,
 284                         'sobol/search/title.html'
 285                 ], {'result': result})
 286
 287         def get_result_content(self, result):
 288                 """Returns the content for the ``result``. By default, renders ``sobol/search/<slug>/content.html`` or ``sobol/search/content.html`` with the result in the context. This can be overridden by setting :attr:`content_template` or simply overriding :meth:`get_result_content`. If no template is found, this will return an empty string."""
 289                 try:
 290                         return loader.render_to_string(self.content_template or [
 291                                 'sobol/search/%s/content.html' % self.slug,
 292                                 'sobol/search/content.html'
 293                         ], {'result': result})
 294                 except TemplateDoesNotExist:
 295                         return ""
 296
 297         def get_result_template(self, result):
 298                 """Returns the template to be used for rendering the ``result``. For a search with slug ``google``, this would first try ``sobol/search/google/result.html``, then fall back on ``sobol/search/result.html``. Subclasses can override this by setting :attr:`result_template` to the path of another template."""
 299                 if self.result_template:
 300                         return loader.get_template(self.result_template)
 301                 return loader.select_template([
 302                         'sobol/search/%s/result.html' % self.slug,
 303                         'sobol/search/result.html'
 304                 ])
 305
 306         @property
 307         def has_more_results(self):
 308                 """Returns ``True`` if there are more results than :attr:`result_limit` and ``False`` otherwise."""
 309                 return len(self.results) > self.result_limit
 310
 311         def get_actual_more_results_url(self):
 312                 """Returns the actual url for more results. By default, simply returns ``None``."""
 313                 return None
 314
 315         def get_more_results_querydict(self):
 316                 """Returns a :class:`QueryDict` for tracking whether people click on a 'more results' link."""
 317                 url = self.get_actual_more_results_url()
 318                 if url:
 319                         return make_tracking_querydict(self.search_arg, url)
 320                 return None
 321
 322         @property
 323         def more_results_url(self):
 324                 """Returns a URL which consists of a querystring which, when accessed, will log a :class:`.Click` for the actual URL."""
 325                 qd = self.get_more_results_querydict()
 326                 if qd is None:
 327                         return None
 328                 return "?%s" % qd.urlencode()
 329
 330         def __unicode__(self):
 331                 return self.verbose_name
 332
 333
 334 class DatabaseSearch(BaseSearch):
 335         """Implements :meth:`~BaseSearch.search` and :meth:`get_queryset` methods to handle database queries."""
 336         #: The model which should be searched by the :class:`DatabaseSearch`.
 337         model = None
 338
 339         def search(self, limit=None):
 340                 if not hasattr(self, '_qs'):
 341                         self._qs = self.get_queryset()
 342                         if limit is not None:
 343                                 self._qs = self._qs[:limit]
 344
 345                 return self._qs
 346
 347         def get_queryset(self):
 348                 """Returns a :class:`QuerySet` of all instances of :attr:`model`. This method should be overridden by subclasses to specify how the search should actually be implemented for the model."""
 349                 return self.model._default_manager.all()
 350
 351
 352 class URLSearch(BaseSearch):
 353         """Defines a generic interface for searches that require accessing a certain url to get search results."""
 354         #: The base URL which will be accessed to get the search results.
 355         search_url = ''
 356         #: The url-encoded query string to be used for fetching search results from :attr:`search_url`. Must have one ``%s`` to contain the search argument.
 357         query_format_str = "%s"
 358
 359         @property
 360         def url(self):
 361                 """The URL where the search gets its results. Composed from :attr:`search_url` and :attr:`query_format_str`."""
 362                 return self.search_url + self.query_format_str % urlquote_plus(self.search_arg)
 363
 364         def get_actual_more_results_url(self):
 365                 return self.url
 366
 367         def parse_response(self, response, limit=None):
 368                 """Handles the ``response`` from accessing :attr:`url` (with :func:`urllib2.urlopen`) and returns a list of up to ``limit`` results."""
 369                 raise NotImplementedError
 370
 371         def search(self, limit=None):
 372                 return self.parse_response(urllib2.urlopen(self.url), limit=limit)
 373
 374
 375 class JSONSearch(URLSearch):
 376         """Makes a GET request and parses the results as JSON. The default behavior assumes that the response contains a list of results."""
 377         def parse_response(self, response, limit=None):
 378                 return json.loads(response.read())[:limit]
 379
 380
 381 class GoogleSearch(JSONSearch):
 382         """An example implementation of a :class:`JSONSearch`."""
 383         search_url = "http://ajax.googleapis.com/ajax/services/search/web"
 384         _cache_timeout = 60
 385         verbose_name = "Google search (current site)"
 386         _more_results_url = None
 387
 388         @property
 389         def query_format_str(self):
 390                 default_args = self.default_args
 391                 if default_args:
 392                         default_args += " "
 393                 return "?v=1.0&q=%s%%s" % urlquote_plus(default_args).replace('%', '%%')
 394
 395         @property
 396         def default_args(self):
 397                 """Unquoted default arguments for the :class:`GoogleSearch`."""
 398                 return "site:%s" % Site.objects.get_current().domain
 399
 400         def parse_response(self, response, limit=None):
 401                 responseData = json.loads(response.read())['responseData']
 402                 results, cursor = responseData['results'], responseData['cursor']
 403
 404                 if results:
 405                         self._more_results_url = cursor['moreResultsUrl']
 406                         self._estimated_result_count = cursor['estimatedResultCount']
 407
 408                 return results[:limit]
 409
 410         @property
 411         def url(self):
 412                 # Google requires that an ajax request have a proper Referer header.
 413                 return urllib2.Request(
 414                         super(GoogleSearch, self).url,
 415                         None,
 416                         {'Referer': "http://%s" % Site.objects.get_current().domain}
 417                 )
 418
 419         @property
 420         def has_more_results(self):
 421                 if self.results and len(self.results) < self._estimated_result_count:
 422                         return True
 423                 return False
 424
 425         def get_actual_more_results_url(self):
 426                 return self._more_results_url
 427
 428         def get_actual_result_url(self, result):
 429                 return result['unescapedUrl']
 430
 431         def get_result_title(self, result):
 432                 return mark_safe(result['titleNoFormatting'])
 433
 434         def get_result_content(self, result):
 435                 return mark_safe(result['content'])
 436
 437
# Register the example Google search in the module-level registry. No slug is passed, so the class's own ``slug`` attribute is used.
registry.register(GoogleSearch)
 439
 440
 441 try:
 442         from BeautifulSoup import BeautifulSoup, SoupStrainer, BeautifulStoneSoup
 443 except:
 444         pass
 445 else:
 446         __all__ += ('ScrapeSearch', 'XMLSearch',)
 447         class ScrapeSearch(URLSearch):
 448                 """A base class for scrape-style searching, available if :mod:`BeautifulSoup` is installed."""
 449                 #: Arguments to be passed into a :class:`SoupStrainer`.
 450                 strainer_args = []
 451                 #: Keyword arguments to be passed into a :class:`SoupStrainer`.
 452                 strainer_kwargs = {}
 453
 454                 @property
 455                 def strainer(self):
 456                         """
 457                         Caches and returns a :class:`SoupStrainer` initialized with :attr:`strainer_args` and :attr:`strainer_kwargs`. This strainer will be used to parse only certain parts of the document.
 458
 459                         .. seealso:: `BeautifulSoup: Improving Performance by Parsing Only Part of the Document <http://www.crummy.com/software/BeautifulSoup/documentation.html#Improving%20Performance%20by%20Parsing%20Only%20Part%20of%20the%20Document>`_
 460
 461                         """
 462                         if not hasattr(self, '_strainer'):
 463                                 self._strainer = SoupStrainer(*self.strainer_args, **self.strainer_kwargs)
 464                         return self._strainer
 465
 466                 def parse_response(self, response, limit=None):
 467                         strainer = self.strainer
 468                         soup = BeautifulSoup(response, parseOnlyThese=strainer)
 469                         return self.parse_results(soup.findAll(recursive=False, limit=limit))
 470
 471                 def parse_results(self, results):
 472                         """
 473                         Provides a hook for parsing the results of straining. This has no default behavior and must be implemented by subclasses because the results absolutely must be parsed to properly extract the information.
 474
 475                         .. seealso:: `BeautifulSoup: Improving Memory Usage with extract <http://www.crummy.com/software/BeautifulSoup/documentation.html#Improving%20Memory%20Usage%20with%20extract>`_
 476                         """
 477                         raise NotImplementedError
 478
 479
 480         class XMLSearch(ScrapeSearch):
 481                 """A base class for searching XML results."""
 482                 #: Self-closing tag names to be used when interpreting the XML document
 483                 #:
 484                 #: .. seealso:: `BeautifulSoup: Parsing XML <http://www.crummy.com/software/BeautifulSoup/documentation.html#Parsing%20XML>`_
 485                 self_closing_tags = []
 486
 487                 def parse_response(self, response, limit=None):
 488                         strainer = self.strainer
 489                         soup = BeautifulStoneSoup(response, selfClosingTags=self.self_closing_tags, parseOnlyThese=strainer)
 490                         return self.parse_results(soup.findAll(recursive=False, limit=limit))