philo/contrib/sobol/search.py

   1 #encoding: utf-8
   2 import datetime
   3 from hashlib import sha1
   4
   5 from django.conf import settings
   6 from django.contrib.sites.models import Site
   7 from django.core.cache import cache
   8 from django.db.models.options import get_verbose_name as convert_camelcase
   9 from django.utils import simplejson as json
  10 from django.utils.http import urlquote_plus
  11 from django.utils.safestring import mark_safe
  12 from django.utils.text import capfirst
  13 from django.template import loader, Context, Template
  14
  15 from philo.contrib.sobol.utils import make_tracking_querydict, RegistryIterator
  16
  17
  18 if getattr(settings, 'SOBOL_USE_EVENTLET', False):
  19         try:
  20                 from eventlet.green import urllib2
  21         except:
  22                 import urllib2
  23 else:
  24         import urllib2
  25
  26
#: Public names exported by this module. Extended further down the file
#: when BeautifulSoup can be imported.
__all__ = (
        'Result', 'BaseSearch', 'DatabaseSearch', 'URLSearch', 'JSONSearch', 'GoogleSearch', 'SearchRegistry', 'registry', 'get_search_instance'
)


#: Seed string prepended to the data hashed by :func:`_make_cache_key`.
SEARCH_CACHE_SEED = 'philo_sobol_search_results'
#: Whether search instances are looked up in / stored to django's cache.
#: NOTE(review): the setting consulted is ``SOBOL_USE_SEARCH`` even though it
#: toggles caching -- confirm that this setting name is intended.
USE_CACHE = getattr(settings, 'SOBOL_USE_SEARCH', True)
  34
  35
  36 class RegistrationError(Exception):
  37         """Raised if there is a problem registering a search with a :class:`SearchRegistry`"""
  38         pass
  39
  40
  41 class SearchRegistry(object):
  42         """Holds a registry of search types by slug."""
  43
  44         def __init__(self):
  45                 self._registry = {}
  46
  47         def register(self, search, slug=None):
  48                 """
  49                 Register a search with the registry.
  50
  51                 :param search: The search class to register - generally a subclass of :class:`BaseSearch`
  52                 :param slug: The slug which will be used to register the search class. If ``slug`` is ``None``, the search's default slug will be used.
  53                 :raises: :class:`RegistrationError` if a different search is already registered with ``slug``.
  54
  55                 """
  56                 slug = slug or search.slug
  57                 if slug in self._registry:
  58                         registered = self._registry[slug]
  59                         if registered.__module__ != search.__module__:
  60                                 raise RegistrationError("A different search is already registered as `%s`" % slug)
  61                 else:
  62                         self._registry[slug] = search
  63
  64         def unregister(self, search, slug=None):
  65                 """
  66                 Unregister a search from the registry.
  67
  68                 :param search: The search class to unregister - generally a subclass of :class:`BaseSearch`
  69                 :param slug: If provided, the search will only be removed if it was registered with ``slug``. If not provided, the search class will be unregistered no matter what slug it was registered with.
  70                 :raises: :class:`RegistrationError` if a slug is provided but the search registered with that slug is not ``search``.
  71
  72                 """
  73                 if slug is not None:
  74                         if slug in self._registry and self._registry[slug] == search:
  75                                 del self._registry[slug]
  76                         raise RegistrationError("`%s` is not registered as `%s`" % (search, slug))
  77                 else:
  78                         for slug, search in self._registry.items():
  79                                 if search == search:
  80                                         del self._registry[slug]
  81
  82         def items(self):
  83                 """Returns a list of (slug, search) items in the registry."""
  84                 return self._registry.items()
  85
  86         def iteritems(self):
  87                 """Returns an iterator over the (slug, search) pairs in the registry."""
  88                 return RegistryIterator(self._registry, 'iteritems')
  89
  90         def iterchoices(self):
  91                 """Returns an iterator over (slug, search.verbose_name) pairs for the registry."""
  92                 return RegistryIterator(self._registry, 'iteritems', lambda x: (x[0], x[1].verbose_name))
  93
  94         def __getitem__(self, key):
  95                 """Returns the search registered with ``key``."""
  96                 return self._registry[key]
  97
  98         def __iter__(self):
  99                 """Returns an iterator over the keys in the registry."""
 100                 return self._registry.__iter__()
 101
 102
#: Module-level :class:`SearchRegistry` instance; :func:`get_search_instance` looks searches up in it.
registry = SearchRegistry()
 104
 105
 106 def _make_cache_key(search, search_arg):
 107         return sha1(SEARCH_CACHE_SEED + search.slug + search_arg).hexdigest()
 108
 109
 110 def get_search_instance(slug, search_arg):
 111         """Returns a search instance for the given slug, either from the cache or newly-instantiated."""
 112         search = registry[slug]
 113         search_arg = search_arg.lower()
 114         if USE_CACHE:
 115                 key = _make_cache_key(search, search_arg)
 116                 cached = cache.get(key)
 117                 if cached:
 118                         return cached
 119         return search(search_arg)
 120
 121
 122
 123 class Result(object):
 124         """
 125         :class:`Result` is a helper class that, given a search and a result of that search, is able to correctly render itself with a template defined by the search. Every :class:`Result` will pass a ``title``, a ``url`` (if applicable), and the raw ``result`` returned by the search into the template context when rendering.
 126
 127         :param search: An instance of a :class:`BaseSearch` subclass or an object that implements the same API.
 128         :param result: An arbitrary result from the ``search``.
 129
 130         """
 131         def __init__(self, search, result):
 132                 self.search = search
 133                 self.result = result
 134
 135         def get_title(self):
 136                 """Returns the title of the result by calling :meth:`BaseSearch.get_result_title` on the raw result."""
 137                 return self.search.get_result_title(self.result)
 138
 139         def get_url(self):
 140                 """Returns the url of the result or an empty string by calling :meth:`BaseSearch.get_result_querydict` on the raw result and then encoding the querydict returned."""
 141                 qd = self.search.get_result_querydict(self.result)
 142                 if qd is None:
 143                         return ""
 144                 return "?%s" % qd.urlencode()
 145
 146         def get_template(self):
 147                 """Returns the template for the result by calling :meth:`BaseSearch.get_result_template` on the raw result."""
 148                 return self.search.get_result_template(self.result)
 149
 150         def get_extra_context(self):
 151                 """Returns any extra context for the result by calling :meth:`BaseSearch.get_result_extra_context` on the raw result."""
 152                 return self.search.get_result_extra_context(self.result)
 153
 154         def get_context(self):
 155                 """
 156                 Returns the context dictionary for the result. This is used both in rendering the result and in the AJAX return value for :meth:`.SearchView.ajax_api_view`. The context will contain everything from :meth:`get_extra_context` as well as the following keys:
 157
 158                 title
 159                         The result of calling :meth:`get_title`
 160                 url
 161                         The result of calling :meth:`get_url`
 162                 result
 163                         The raw result which the :class:`Result` was instantiated with.
 164
 165                 """
 166                 context = self.get_extra_context()
 167                 context.update({
 168                         'title': self.get_title(),
 169                         'url': self.get_url(),
 170                         'result': self.result
 171                 })
 172                 return context
 173
 174         def render(self):
 175                 """Returns the template from :meth:`get_template` rendered with the context from :meth:`get_context`."""
 176                 t = self.get_template()
 177                 c = Context(self.get_context())
 178                 return t.render(c)
 179
 180         def __unicode__(self):
 181                 """Returns :meth:`render`"""
 182                 return self.render()
 183
 184
 185 class BaseSearchMetaclass(type):
 186         def __new__(cls, name, bases, attrs):
 187                 if 'verbose_name' not in attrs:
 188                         attrs['verbose_name'] = capfirst(' '.join(convert_camelcase(name).rsplit(' ', 1)[:-1]))
 189                 if 'slug' not in attrs:
 190                         attrs['slug'] = name[:-6].lower() if name.endswith("Search") else name.lower()
 191                 return super(BaseSearchMetaclass, cls).__new__(cls, name, bases, attrs)
 192
 193
 194 class BaseSearch(object):
 195         """
 196         Defines a generic search api. Accessing :attr:`results` will attempt to retrieve cached results and, if that fails, will initiate a new search and store the results in the cache. Each search has a ``verbose_name`` and a ``slug``. If these are not provided as attributes, they will be automatically generated based on the name of the class.
 197
 198         :param search_arg: The string which is being searched for.
 199
 200         """
 201         __metaclass__ = BaseSearchMetaclass
 202         #: The number of results to return from the complete list. Default: 5
 203         result_limit = 5
 204         #: How long the items for the search should be cached (in minutes). Default: 48 hours.
 205         _cache_timeout = 60*48
 206         #: The path to the template which will be used to render the :class:`Result`\ s for this search.
 207         result_template = "sobol/search/basesearch.html"
 208
 209         def __init__(self, search_arg):
 210                 self.search_arg = search_arg
 211
 212         @property
 213         def results(self):
 214                 """Retrieves cached results or initiates a new search via :meth:`get_results` and caches the results."""
 215                 if not hasattr(self, '_results'):
 216                         try:
 217                                 # Cache one extra result so we can see if there are
 218                                 # more results to be had.
 219                                 limit = self.result_limit
 220                                 if limit is not None:
 221                                         limit += 1
 222                                 results = self.get_results(limit)
 223                         except:
 224                                 if settings.DEBUG:
 225                                         raise
 226                                 #  On exceptions, don't set any cache; just return.
 227                                 return []
 228
 229                         self._results = results
 230
 231                         if USE_CACHE:
 232                                 key = _make_cache_key(self, self.search_arg)
 233                                 cache.set(key, self, self._cache_timeout)
 234
 235                 return self._results
 236
 237         def get_results(self, limit=None, result_class=Result):
 238                 """
 239                 Calls :meth:`search` and parses the return value into :class:`Result` instances.
 240
 241                 :param limit: Passed directly to :meth:`search`.
 242                 :param result_class: The class used to represent the results. This will be instantiated with the :class:`BaseSearch` instance and the raw result from the search.
 243
 244                 """
 245                 results = self.search(limit)
 246                 return [result_class(self, result) for result in results]
 247
 248         def search(self, limit=None):
 249                 """Returns an iterable of up to ``limit`` results. The :meth:`get_result_title`, :meth:`get_result_url`, :meth:`get_result_template`, and :meth:`get_result_extra_context` methods will be used to interpret the individual items that this function returns, so the result can be an object with attributes as easily as a dictionary with keys. However, keep in mind that the raw results will be stored with django's caching mechanisms and will be converted to JSON."""
 250                 raise NotImplementedError
 251
 252         def get_result_title(self, result):
 253                 """Returns the title of the ``result``. Must be implemented by subclasses."""
 254                 raise NotImplementedError
 255
 256         def get_result_url(self, result):
 257                 """Returns the actual URL for the ``result`` or ``None`` if there is no URL. Must be implemented by subclasses."""
 258                 raise NotImplementedError
 259
 260         def get_result_querydict(self, result):
 261                 """Returns a querydict for tracking selection of the result, or ``None`` if there is no URL for the result."""
 262                 url = self.get_result_url(result)
 263                 if url is None:
 264                         return None
 265                 return make_tracking_querydict(self.search_arg, url)
 266
 267         def get_result_template(self, result):
 268                 """Returns the template to be used for rendering the ``result``."""
 269                 return loader.get_template(self.result_template)
 270
 271         def get_result_extra_context(self, result):
 272                 """Returns any extra context to be used when rendering the ``result``."""
 273                 return {}
 274
 275         @property
 276         def has_more_results(self):
 277                 """Returns ``True`` if there are more results than :attr:`result_limit` and ``False`` otherwise."""
 278                 return len(self.results) > self.result_limit
 279
 280         @property
 281         def more_results_url(self):
 282                 """Returns the actual url for more results. This should be accessed through :attr:`more_results_querydict` in the template so that the click can be tracked. By default, simply returns ``None``."""
 283                 return None
 284
 285         @property
 286         def more_results_querydict(self):
 287                 """Returns a :class:`QueryDict` for tracking whether people click on a 'more results' link."""
 288                 url = self.more_results_url
 289                 if url:
 290                         return make_tracking_querydict(self.search_arg, url)
 291                 return None
 292
 293         def __unicode__(self):
 294                 return self.verbose_name
 295
 296
 297 class DatabaseSearch(BaseSearch):
 298         """Implements :meth:`~BaseSearch.search` and :meth:`get_queryset` methods to handle database queries."""
 299         #: The model which should be searched by the :class:`DatabaseSearch`.
 300         model = None
 301
 302         def search(self, limit=None):
 303                 if not hasattr(self, '_qs'):
 304                         self._qs = self.get_queryset()
 305                         if limit is not None:
 306                                 self._qs = self._qs[:limit]
 307
 308                 return self._qs
 309
 310         def get_queryset(self):
 311                 """Returns a :class:`QuerySet` of all instances of :attr:`model`. This method should be overridden by subclasses to specify how the search should actually be implemented for the model."""
 312                 return self.model._default_manager.all()
 313
 314
 315 class URLSearch(BaseSearch):
 316         """Defines a generic interface for searches that require accessing a certain url to get search results."""
 317         #: The base URL which will be accessed to get the search results.
 318         search_url = ''
 319         #: The url-encoded query string to be used for fetching search results from :attr:`search_url`. Must have one ``%s`` to contain the search argument.
 320         query_format_str = "%s"
 321
 322         @property
 323         def url(self):
 324                 """The URL where the search gets its results. Composed from :attr:`search_url` and :attr:`query_format_str`."""
 325                 return self.search_url + self.query_format_str % urlquote_plus(self.search_arg)
 326
 327         @property
 328         def more_results_url(self):
 329                 return self.url
 330
 331         def parse_response(self, response, limit=None):
 332                 """Handles the ``response`` from accessing :attr:`url` (with :func:`urllib2.urlopen`) and returns a list of up to ``limit`` results."""
 333                 raise NotImplementedError
 334
 335         def search(self, limit=None):
 336                 return self.parse_response(urllib2.urlopen(self.url), limit=limit)
 337
 338
 339 class JSONSearch(URLSearch):
 340         """Makes a GET request and parses the results as JSON. The default behavior assumes that the response contains a list of results."""
 341         def parse_response(self, response, limit=None):
 342                 return json.loads(response.read())[:limit]
 343
 344
 345 class GoogleSearch(JSONSearch):
 346         """An example implementation of a :class:`JSONSearch`."""
 347         search_url = "http://ajax.googleapis.com/ajax/services/search/web"
 348         _cache_timeout = 60
 349         verbose_name = "Google search (current site)"
 350         result_template = "sobol/search/googlesearch.html"
 351         _more_results_url = None
 352
 353         @property
 354         def query_format_str(self):
 355                 default_args = self.default_args
 356                 if default_args:
 357                         default_args += " "
 358                 return "?v=1.0&q=%s%%s" % urlquote_plus(default_args).replace('%', '%%')
 359
 360         @property
 361         def default_args(self):
 362                 """Unquoted default arguments for the :class:`GoogleSearch`."""
 363                 return "site:%s" % Site.objects.get_current().domain
 364
 365         def parse_response(self, response, limit=None):
 366                 responseData = json.loads(response.read())['responseData']
 367                 results, cursor = responseData['results'], responseData['cursor']
 368
 369                 if results:
 370                         self._more_results_url = cursor['moreResultsUrl']
 371                         self._estimated_result_count = cursor['estimatedResultCount']
 372
 373                 return results[:limit]
 374
 375         @property
 376         def url(self):
 377                 # Google requires that an ajax request have a proper Referer header.
 378                 return urllib2.Request(
 379                         super(GoogleSearch, self).url,
 380                         None,
 381                         {'Referer': "http://%s" % Site.objects.get_current().domain}
 382                 )
 383
 384         @property
 385         def has_more_results(self):
 386                 if self.results and len(self.results) < self._estimated_result_count:
 387                         return True
 388                 return False
 389
 390         @property
 391         def more_results_url(self):
 392                 return self._more_results_url
 393
 394         def get_result_title(self, result):
 395                 return result['titleNoFormatting']
 396
 397         def get_result_url(self, result):
 398                 return result['unescapedUrl']
 399
 400
# Register the example search under its default slug at import time.
registry.register(GoogleSearch)
 402
 403
 404 try:
 405         from BeautifulSoup import BeautifulSoup, SoupStrainer, BeautifulStoneSoup
 406 except:
 407         pass
 408 else:
 409         __all__ += ('ScrapeSearch', 'XMLSearch',)
 410         class ScrapeSearch(URLSearch):
 411                 """A base class for scrape-style searching, available if :mod:`BeautifulSoup` is installed."""
 412                 #: Arguments to be passed into a :class:`SoupStrainer`.
 413                 strainer_args = []
 414                 #: Keyword arguments to be passed into a :class:`SoupStrainer`.
 415                 strainer_kwargs = {}
 416
 417                 @property
 418                 def strainer(self):
 419                         """
 420                         Caches and returns a :class:`SoupStrainer` initialized with :attr:`strainer_args` and :attr:`strainer_kwargs`. This strainer will be used to parse only certain parts of the document.
 421
 422                         .. seealso:: `BeautifulSoup: Improving Performance by Parsing Only Part of the Document <http://www.crummy.com/software/BeautifulSoup/documentation.html#Improving%20Performance%20by%20Parsing%20Only%20Part%20of%20the%20Document>`_
 423
 424                         """
 425                         if not hasattr(self, '_strainer'):
 426                                 self._strainer = SoupStrainer(*self.strainer_args, **self.strainer_kwargs)
 427                         return self._strainer
 428
 429                 def parse_response(self, response, limit=None):
 430                         strainer = self.strainer
 431                         soup = BeautifulSoup(response, parseOnlyThese=strainer)
 432                         return self.parse_results(soup.findAll(recursive=False, limit=limit))
 433
 434                 def parse_results(self, results):
 435                         """
 436                         Provides a hook for parsing the results of straining. This has no default behavior and must be implemented by subclasses because the results absolutely must be parsed to properly extract the information.
 437
 438                         .. seealso:: `BeautifulSoup: Improving Memory Usage with extract <http://www.crummy.com/software/BeautifulSoup/documentation.html#Improving%20Memory%20Usage%20with%20extract>`_
 439                         """
 440                         raise NotImplementedError
 441
 442
 443         class XMLSearch(ScrapeSearch):
 444                 """A base class for searching XML results."""
 445                 #: Self-closing tag names to be used when interpreting the XML document
 446                 #:
 447                 #: .. seealso:: `BeautifulSoup: Parsing XML <http://www.crummy.com/software/BeautifulSoup/documentation.html#Parsing%20XML>`_
 448                 self_closing_tags = []
 449
 450                 def parse_response(self, response, limit=None):
 451                         strainer = self.strainer
 452                         soup = BeautifulStoneSoup(response, selfClosingTags=self.self_closing_tags, parseOnlyThese=strainer)
 453                         return self.parse_results(soup.findAll(recursive=False, limit=limit))