philo/contrib/sobol/search.py

   1 #encoding: utf-8
   2 import datetime
   3 from hashlib import sha1
   4
   5 from django.conf import settings
   6 from django.contrib.sites.models import Site
   7 from django.core.cache import cache
   8 from django.db.models.options import get_verbose_name as convert_camelcase
   9 from django.utils import simplejson as json
  10 from django.utils.http import urlquote_plus
  11 from django.utils.safestring import mark_safe
  12 from django.utils.text import capfirst
  13 from django.template import loader, Context, Template
  14
  15 from philo.contrib.sobol.utils import make_tracking_querydict, RegistryIterator
  16
  17
  18 if getattr(settings, 'SOBOL_USE_EVENTLET', False):
  19         try:
  20                 from eventlet.green import urllib2
  21         except:
  22                 import urllib2
  23 else:
  24         import urllib2
  25
  26
  27 __all__ = (
  28         'Result', 'BaseSearch', 'DatabaseSearch', 'URLSearch', 'JSONSearch', 'GoogleSearch', 'SearchRegistry', 'registry', 'get_search_instance'
  29 )
  30
  31
  32 SEARCH_CACHE_SEED = 'philo_sobol_search_results'
  33 USE_CACHE = getattr(settings, 'SOBOL_USE_SEARCH', True)
  34
  35
  36 class RegistrationError(Exception):
  37         """Raised if there is a problem registering a search with a :class:`SearchRegistry`"""
  38         pass
  39
  40
  41 class SearchRegistry(object):
  42         """Holds a registry of search types by slug."""
  43
  44         def __init__(self):
  45                 self._registry = {}
  46
  47         def register(self, search, slug=None):
  48                 """
  49                 Register a search with the registry.
  50
  51                 :param search: The search class to register - generally a subclass of :class:`BaseSearch`
  52                 :param slug: The slug which will be used to register the search class. If ``slug`` is ``None``, the search's default slug will be used.
  53                 :raises: :class:`RegistrationError` if a different search is already registered with ``slug``.
  54
  55                 """
  56                 slug = slug or search.slug
  57                 if slug in self._registry:
  58                         registered = self._registry[slug]
  59                         if registered.__module__ != search.__module__:
  60                                 raise RegistrationError("A different search is already registered as `%s`" % slug)
  61                 else:
  62                         self._registry[slug] = search
  63
  64         def unregister(self, search, slug=None):
  65                 """
  66                 Unregister a search from the registry.
  67
  68                 :param search: The search class to unregister - generally a subclass of :class:`BaseSearch`
  69                 :param slug: If provided, the search will only be removed if it was registered with ``slug``. If not provided, the search class will be unregistered no matter what slug it was registered with.
  70                 :raises: :class:`RegistrationError` if a slug is provided but the search registered with that slug is not ``search``.
  71
  72                 """
  73                 if slug is not None:
  74                         if slug in self._registry and self._registry[slug] == search:
  75                                 del self._registry[slug]
  76                         raise RegistrationError("`%s` is not registered as `%s`" % (search, slug))
  77                 else:
  78                         for slug, search in self._registry.items():
  79                                 if search == search:
  80                                         del self._registry[slug]
  81
  82         def items(self):
  83                 """Returns a list of (slug, search) items in the registry."""
  84                 return self._registry.items()
  85
  86         def iteritems(self):
  87                 """Returns an iterator over the (slug, search) pairs in the registry."""
  88                 return RegistryIterator(self._registry, 'iteritems')
  89
  90         def iterchoices(self):
  91                 """Returns an iterator over (slug, search.verbose_name) pairs for the registry."""
  92                 return RegistryIterator(self._registry, 'iteritems', lambda x: (x[0], x[1].verbose_name))
  93
  94         def __getitem__(self, key):
  95                 """Returns the search registered with ``key``."""
  96                 return self._registry[key]
  97
  98         def __iter__(self):
  99                 """Returns an iterator over the keys in the registry."""
 100                 return self._registry.__iter__()
 101
 102
#: Module-level :class:`SearchRegistry` consulted by :func:`get_search_instance`.
registry = SearchRegistry()
 104
 105
 106 def _make_cache_key(search, search_arg):
 107         return sha1(SEARCH_CACHE_SEED + search.slug + search_arg).hexdigest()
 108
 109
 110 def get_search_instance(slug, search_arg):
 111         """Returns a search instance for the given slug, either from the cache or newly-instantiated."""
 112         search = registry[slug]
 113         search_arg = search_arg.lower()
 114         if USE_CACHE:
 115                 key = _make_cache_key(search, search_arg)
 116                 cached = cache.get(key)
 117                 if cached:
 118                         return cached
 119         instance = search(search_arg)
 120         instance.slug = slug
 121         return instance
 122
 123
 124
 125 class Result(object):
 126         """
 127         :class:`Result` is a helper class that, given a search and a result of that search, is able to correctly render itself with a template defined by the search. Every :class:`Result` will pass a ``title``, a ``url`` (if applicable), and the raw ``result`` returned by the search into the template context when rendering.
 128
 129         :param search: An instance of a :class:`BaseSearch` subclass or an object that implements the same API.
 130         :param result: An arbitrary result from the ``search``.
 131
 132         """
 133         def __init__(self, search, result):
 134                 self.search = search
 135                 self.result = result
 136
 137         def get_title(self):
 138                 """Returns the title of the result by calling :meth:`BaseSearch.get_result_title` on the raw result."""
 139                 return self.search.get_result_title(self.result)
 140
 141         def get_url(self):
 142                 """Returns the url of the result or an empty string by calling :meth:`BaseSearch.get_result_url` on the raw result."""
 143                 return self.search.get_result_url(self.result)
 144
 145         def get_content(self):
 146                 """Returns the content of the result by calling :meth:`BaseSearch.get_result_content` on the raw result."""
 147                 return self.search.get_result_content(self.result)
 148
 149         def get_extra_context(self):
 150                 """Returns any extra context for the result by calling :meth:`BaseSearch.get_result_extra_context` on the raw result."""
 151                 return self.search.get_result_extra_context(self.result)
 152
 153         def get_template(self):
 154                 """Returns the template which will be used to render the :class:`Result` by calling :meth:`BaseSearch.get_result_template` on the raw result."""
 155                 return self.search.get_result_template(self.result)
 156
 157         def get_context(self):
 158                 """
 159                 Returns the context dictionary for the result. This is used both in rendering the result and in the AJAX return value for :meth:`.SearchView.ajax_api_view`. The context will contain everything from :meth:`get_extra_context` as well as the following keys:
 160
 161                 title
 162                         The result of calling :meth:`get_title`
 163                 url
 164                         The result of calling :meth:`get_url`
 165                 content
 166                         The result of calling :meth:`get_content`
 167
 168                 """
 169                 context = self.get_extra_context()
 170                 context.update({
 171                         'title': self.get_title(),
 172                         'url': self.get_url(),
 173                         'content': self.get_content()
 174                 })
 175                 return context
 176
 177         def render(self):
 178                 """Returns the template from :meth:`get_template` rendered with the context from :meth:`get_context`."""
 179                 t = self.get_template()
 180                 c = Context(self.get_context())
 181                 return t.render(c)
 182
 183         def __unicode__(self):
 184                 """Returns :meth:`render`"""
 185                 return self.render()
 186
 187
 188 class BaseSearchMetaclass(type):
 189         def __new__(cls, name, bases, attrs):
 190                 if 'verbose_name' not in attrs:
 191                         attrs['verbose_name'] = capfirst(' '.join(convert_camelcase(name).rsplit(' ', 1)[:-1]))
 192                 if 'slug' not in attrs:
 193                         attrs['slug'] = name[:-6].lower() if name.endswith("Search") else name.lower()
 194                 return super(BaseSearchMetaclass, cls).__new__(cls, name, bases, attrs)
 195
 196
 197 class BaseSearch(object):
 198         """
 199         Defines a generic search api. Accessing :attr:`results` will attempt to retrieve cached results and, if that fails, will initiate a new search and store the results in the cache. Each search has a ``verbose_name`` and a ``slug``. If these are not provided as attributes, they will be automatically generated based on the name of the class.
 200
 201         :param search_arg: The string which is being searched for.
 202
 203         """
 204         __metaclass__ = BaseSearchMetaclass
 205         #: The number of results to return from the complete list. Default: 5
 206         result_limit = 5
 207         #: How long the items for the search should be cached (in minutes). Default: 48 hours.
 208         _cache_timeout = 60*48
 209         #: The path to the template which will be used to render the :class:`Result`\ s for this search. If this is ``None``, then the framework will try "sobol/search/<slug>/result.html" and "sobol/search/result.html".
 210         result_template = None
 211
 212         def __init__(self, search_arg):
 213                 self.search_arg = search_arg
 214
 215         @property
 216         def results(self):
 217                 """Retrieves cached results or initiates a new search via :meth:`get_results` and caches the results."""
 218                 if not hasattr(self, '_results'):
 219                         try:
 220                                 # Cache one extra result so we can see if there are
 221                                 # more results to be had.
 222                                 limit = self.result_limit
 223                                 if limit is not None:
 224                                         limit += 1
 225                                 results = self.get_results(limit)
 226                         except:
 227                                 if settings.DEBUG:
 228                                         raise
 229                                 #  On exceptions, don't set any cache; just return.
 230                                 return []
 231
 232                         self._results = results
 233
 234                         if USE_CACHE:
 235                                 key = _make_cache_key(self, self.search_arg)
 236                                 cache.set(key, self, self._cache_timeout)
 237
 238                 return self._results
 239
 240         def get_results(self, limit=None, result_class=Result):
 241                 """
 242                 Calls :meth:`search` and parses the return value into :class:`Result` instances.
 243
 244                 :param limit: Passed directly to :meth:`search`.
 245                 :param result_class: The class used to represent the results. This will be instantiated with the :class:`BaseSearch` instance and the raw result from the search.
 246
 247                 """
 248                 results = self.search(limit)
 249                 return [result_class(self, result) for result in results]
 250
 251         def search(self, limit=None):
 252                 """Returns an iterable of up to ``limit`` results. The :meth:`get_result_title`, :meth:`get_result_url`, :meth:`get_result_template`, and :meth:`get_result_extra_context` methods will be used to interpret the individual items that this function returns, so the result can be an object with attributes as easily as a dictionary with keys. However, keep in mind that the raw results will be stored with django's caching mechanisms and will be converted to JSON."""
 253                 raise NotImplementedError
 254
 255         def get_result_title(self, result):
 256                 """Returns the title of the ``result``. Must be implemented by subclasses."""
 257                 raise NotImplementedError
 258
 259         def get_actual_result_url(self, result):
 260                 """Returns the actual URL for the ``result`` or ``None`` if there is no URL. Must be implemented by subclasses."""
 261                 raise NotImplementedError
 262
 263         def get_result_querydict(self, result):
 264                 """Returns a querydict for tracking selection of the result, or ``None`` if there is no URL for the result."""
 265                 url = self.get_result_url(result)
 266                 if url is None:
 267                         return None
 268                 return make_tracking_querydict(self.search_arg, url)
 269
 270         def get_result_url(self, result):
 271                 """Returns ``None`` or a url which, when accessed, will register a :class:`.Click` for that url."""
 272                 qd = self.get_result_querydict(result)
 273                 if qd is None:
 274                         return None
 275                 return "?%s" % qd.urlencode()
 276
 277         def get_result_content(self, result):
 278                 """Returns the content for the ``result`` or ``None`` if there is no content. Must be implemented by subclasses."""
 279                 raise NotImplementedError
 280
 281         def get_result_extra_context(self, result):
 282                 """Returns any extra context to be used when rendering the ``result``. Make sure that any extra context can be serialized as JSON."""
 283                 return {}
 284
 285         def get_result_template(self, result):
 286                 """Returns the template to be used for rendering the ``result``. For a search with slug ``google``, this would first try ``sobol/search/google/result.html``, then fall back on ``sobol/search/result.html``. Subclasses can override this by setting :attr:`result_template` to the path of another template."""
 287                 if self.result_template:
 288                         return loader.get_template(self.result_template)
 289                 return loader.select_template([
 290                         'sobol/search/%s/result.html' % self.slug,
 291                         'sobol/search/result.html'
 292                 ])
 293
 294         @property
 295         def has_more_results(self):
 296                 """Returns ``True`` if there are more results than :attr:`result_limit` and ``False`` otherwise."""
 297                 return len(self.results) > self.result_limit
 298
 299         def get_actual_more_results_url(self):
 300                 """Returns the actual url for more results. By default, simply returns ``None``."""
 301                 return None
 302
 303         def get_more_results_querydict(self):
 304                 """Returns a :class:`QueryDict` for tracking whether people click on a 'more results' link."""
 305                 url = self.get_actual_more_results_url()
 306                 if url:
 307                         return make_tracking_querydict(self.search_arg, url)
 308                 return None
 309
 310         @property
 311         def more_results_url(self):
 312                 """Returns a URL which consists of a querystring which, when accessed, will log a :class:`.Click` for the actual URL."""
 313                 qd = self.get_more_results_querydict()
 314                 if qd is None:
 315                         return None
 316                 return "?%s" % qd.urlencode()
 317
 318         def __unicode__(self):
 319                 return self.verbose_name
 320
 321
 322 class DatabaseSearch(BaseSearch):
 323         """Implements :meth:`~BaseSearch.search` and :meth:`get_queryset` methods to handle database queries."""
 324         #: The model which should be searched by the :class:`DatabaseSearch`.
 325         model = None
 326
 327         def search(self, limit=None):
 328                 if not hasattr(self, '_qs'):
 329                         self._qs = self.get_queryset()
 330                         if limit is not None:
 331                                 self._qs = self._qs[:limit]
 332
 333                 return self._qs
 334
 335         def get_queryset(self):
 336                 """Returns a :class:`QuerySet` of all instances of :attr:`model`. This method should be overridden by subclasses to specify how the search should actually be implemented for the model."""
 337                 return self.model._default_manager.all()
 338
 339
 340 class URLSearch(BaseSearch):
 341         """Defines a generic interface for searches that require accessing a certain url to get search results."""
 342         #: The base URL which will be accessed to get the search results.
 343         search_url = ''
 344         #: The url-encoded query string to be used for fetching search results from :attr:`search_url`. Must have one ``%s`` to contain the search argument.
 345         query_format_str = "%s"
 346
 347         @property
 348         def url(self):
 349                 """The URL where the search gets its results. Composed from :attr:`search_url` and :attr:`query_format_str`."""
 350                 return self.search_url + self.query_format_str % urlquote_plus(self.search_arg)
 351
 352         def get_actual_more_results_url(self):
 353                 return self.url
 354
 355         def parse_response(self, response, limit=None):
 356                 """Handles the ``response`` from accessing :attr:`url` (with :func:`urllib2.urlopen`) and returns a list of up to ``limit`` results."""
 357                 raise NotImplementedError
 358
 359         def search(self, limit=None):
 360                 return self.parse_response(urllib2.urlopen(self.url), limit=limit)
 361
 362
 363 class JSONSearch(URLSearch):
 364         """Makes a GET request and parses the results as JSON. The default behavior assumes that the response contains a list of results."""
 365         def parse_response(self, response, limit=None):
 366                 return json.loads(response.read())[:limit]
 367
 368
 369 class GoogleSearch(JSONSearch):
 370         """An example implementation of a :class:`JSONSearch`."""
 371         search_url = "http://ajax.googleapis.com/ajax/services/search/web"
 372         _cache_timeout = 60
 373         verbose_name = "Google search (current site)"
 374         _more_results_url = None
 375
 376         @property
 377         def query_format_str(self):
 378                 default_args = self.default_args
 379                 if default_args:
 380                         default_args += " "
 381                 return "?v=1.0&q=%s%%s" % urlquote_plus(default_args).replace('%', '%%')
 382
 383         @property
 384         def default_args(self):
 385                 """Unquoted default arguments for the :class:`GoogleSearch`."""
 386                 return "site:%s" % Site.objects.get_current().domain
 387
 388         def parse_response(self, response, limit=None):
 389                 responseData = json.loads(response.read())['responseData']
 390                 results, cursor = responseData['results'], responseData['cursor']
 391
 392                 if results:
 393                         self._more_results_url = cursor['moreResultsUrl']
 394                         self._estimated_result_count = cursor['estimatedResultCount']
 395
 396                 return results[:limit]
 397
 398         @property
 399         def url(self):
 400                 # Google requires that an ajax request have a proper Referer header.
 401                 return urllib2.Request(
 402                         super(GoogleSearch, self).url,
 403                         None,
 404                         {'Referer': "http://%s" % Site.objects.get_current().domain}
 405                 )
 406
 407         @property
 408         def has_more_results(self):
 409                 if self.results and len(self.results) < self._estimated_result_count:
 410                         return True
 411                 return False
 412
 413         def get_actual_more_results_url(self):
 414                 return self._more_results_url
 415
 416         def get_result_title(self, result):
 417                 return result['titleNoFormatting']
 418
 419         def get_result_url(self, result):
 420                 return result['unescapedUrl']
 421
 422         def get_result_content(self, result):
 423                 return result['content']
 424
 425
# Register the example Google search under its default slug.
registry.register(GoogleSearch)
 427
 428
 429 try:
 430         from BeautifulSoup import BeautifulSoup, SoupStrainer, BeautifulStoneSoup
 431 except:
 432         pass
 433 else:
 434         __all__ += ('ScrapeSearch', 'XMLSearch',)
 435         class ScrapeSearch(URLSearch):
 436                 """A base class for scrape-style searching, available if :mod:`BeautifulSoup` is installed."""
 437                 #: Arguments to be passed into a :class:`SoupStrainer`.
 438                 strainer_args = []
 439                 #: Keyword arguments to be passed into a :class:`SoupStrainer`.
 440                 strainer_kwargs = {}
 441
 442                 @property
 443                 def strainer(self):
 444                         """
 445                         Caches and returns a :class:`SoupStrainer` initialized with :attr:`strainer_args` and :attr:`strainer_kwargs`. This strainer will be used to parse only certain parts of the document.
 446
 447                         .. seealso:: `BeautifulSoup: Improving Performance by Parsing Only Part of the Document <http://www.crummy.com/software/BeautifulSoup/documentation.html#Improving%20Performance%20by%20Parsing%20Only%20Part%20of%20the%20Document>`_
 448
 449                         """
 450                         if not hasattr(self, '_strainer'):
 451                                 self._strainer = SoupStrainer(*self.strainer_args, **self.strainer_kwargs)
 452                         return self._strainer
 453
 454                 def parse_response(self, response, limit=None):
 455                         strainer = self.strainer
 456                         soup = BeautifulSoup(response, parseOnlyThese=strainer)
 457                         return self.parse_results(soup.findAll(recursive=False, limit=limit))
 458
 459                 def parse_results(self, results):
 460                         """
 461                         Provides a hook for parsing the results of straining. This has no default behavior and must be implemented by subclasses because the results absolutely must be parsed to properly extract the information.
 462
 463                         .. seealso:: `BeautifulSoup: Improving Memory Usage with extract <http://www.crummy.com/software/BeautifulSoup/documentation.html#Improving%20Memory%20Usage%20with%20extract>`_
 464                         """
 465                         raise NotImplementedError
 466
 467
 468         class XMLSearch(ScrapeSearch):
 469                 """A base class for searching XML results."""
 470                 #: Self-closing tag names to be used when interpreting the XML document
 471                 #:
 472                 #: .. seealso:: `BeautifulSoup: Parsing XML <http://www.crummy.com/software/BeautifulSoup/documentation.html#Parsing%20XML>`_
 473                 self_closing_tags = []
 474
 475                 def parse_response(self, response, limit=None):
 476                         strainer = self.strainer
 477                         soup = BeautifulStoneSoup(response, selfClosingTags=self.self_closing_tags, parseOnlyThese=strainer)
 478                         return self.parse_results(soup.findAll(recursive=False, limit=limit))