philo/contrib/sobol/search.py

   1 #encoding: utf-8
   2 import datetime
   3 from hashlib import sha1
   4
   5 from django.conf import settings
   6 from django.contrib.sites.models import Site
   7 from django.core.cache import cache
   8 from django.db.models.options import get_verbose_name as convert_camelcase
   9 from django.utils import simplejson as json
  10 from django.utils.http import urlquote_plus
  11 from django.utils.safestring import mark_safe
  12 from django.utils.text import capfirst
  13 from django.template import loader, Context, Template
  14
  15 from philo.contrib.sobol.utils import make_tracking_querydict, RegistryIterator
  16
  17
  18 if getattr(settings, 'SOBOL_USE_EVENTLET', False):
  19         try:
  20                 from eventlet.green import urllib2
  21         except:
  22                 import urllib2
  23 else:
  24         import urllib2
  25
  26
  27 __all__ = (
  28         'Result', 'BaseSearch', 'DatabaseSearch', 'URLSearch', 'JSONSearch', 'GoogleSearch', 'SearchRegistry', 'registry', 'get_search_instance'
  29 )
  30
  31
#: Constant string mixed into every search cache key (see ``_make_cache_key``).
SEARCH_CACHE_SEED = 'philo_sobol_search_results'
#: Whether search instances are read from and written to django's cache.
#: Defaults to ``True`` when the setting is absent.
#: NOTE(review): the setting read here is named ``SOBOL_USE_SEARCH`` although
#: it toggles caching -- confirm that the name is intentional.
USE_CACHE = getattr(settings, 'SOBOL_USE_SEARCH', True)
  34
  35
  36 class RegistrationError(Exception):
  37         """Raised if there is a problem registering a search with a :class:`SearchRegistry`"""
  38         pass
  39
  40
  41 class SearchRegistry(object):
  42         """Holds a registry of search types by slug."""
  43
  44         def __init__(self):
  45                 self._registry = {}
  46
  47         def register(self, search, slug=None):
  48                 """
  49                 Register a search with the registry.
  50
  51                 :param search: The search class to register - generally a subclass of :class:`BaseSearch`
  52                 :param slug: The slug which will be used to register the search class. If ``slug`` is ``None``, the search's default slug will be used.
  53                 :raises: :class:`RegistrationError` if a different search is already registered with ``slug``.
  54
  55                 """
  56                 slug = slug or search.slug
  57                 if slug in self._registry:
  58                         registered = self._registry[slug]
  59                         if registered.__module__ != search.__module__:
  60                                 raise RegistrationError("A different search is already registered as `%s`" % slug)
  61                 else:
  62                         self._registry[slug] = search
  63
  64         def unregister(self, search, slug=None):
  65                 """
  66                 Unregister a search from the registry.
  67
  68                 :param search: The search class to unregister - generally a subclass of :class:`BaseSearch`
  69                 :param slug: If provided, the search will only be removed if it was registered with ``slug``. If not provided, the search class will be unregistered no matter what slug it was registered with.
  70                 :raises: :class:`RegistrationError` if a slug is provided but the search registered with that slug is not ``search``.
  71
  72                 """
  73                 if slug is not None:
  74                         if slug in self._registry and self._registry[slug] == search:
  75                                 del self._registry[slug]
  76                         raise RegistrationError("`%s` is not registered as `%s`" % (search, slug))
  77                 else:
  78                         for slug, search in self._registry.items():
  79                                 if search == search:
  80                                         del self._registry[slug]
  81
  82         def items(self):
  83                 """Returns a list of (slug, search) items in the registry."""
  84                 return self._registry.items()
  85
  86         def iteritems(self):
  87                 """Returns an iterator over the (slug, search) pairs in the registry."""
  88                 return RegistryIterator(self._registry, 'iteritems')
  89
  90         def iterchoices(self):
  91                 """Returns an iterator over (slug, search.verbose_name) pairs for the registry."""
  92                 return RegistryIterator(self._registry, 'iteritems', lambda x: (x[0], x[1].verbose_name))
  93
  94         def __getitem__(self, key):
  95                 """Returns the search registered with ``key``."""
  96                 return self._registry[key]
  97
  98         def __iter__(self):
  99                 """Returns an iterator over the keys in the registry."""
 100                 return self._registry.__iter__()
 101
 102
#: The module-level :class:`SearchRegistry` instance; :func:`get_search_instance` looks searches up in it.
registry = SearchRegistry()
 104
 105
 106 def _make_cache_key(search, search_arg):
 107         return sha1(SEARCH_CACHE_SEED + search.slug + search_arg).hexdigest()
 108
 109
 110 def get_search_instance(slug, search_arg):
 111         """Returns a search instance for the given slug, either from the cache or newly-instantiated."""
 112         search = registry[slug]
 113         search_arg = search_arg.lower()
 114         if USE_CACHE:
 115                 key = _make_cache_key(search, search_arg)
 116                 cached = cache.get(key)
 117                 if cached:
 118                         return cached
 119         return search(search_arg)
 120
 121
 122
 123 class Result(object):
 124         """
 125         :class:`Result` is a helper class that, given a search and a result of that search, is able to correctly render itself with a template defined by the search. Every :class:`Result` will pass a ``title``, a ``url`` (if applicable), and the raw ``result`` returned by the search into the template context when rendering.
 126
 127         :param search: An instance of a :class:`BaseSearch` subclass or an object that implements the same API.
 128         :param result: An arbitrary result from the ``search``.
 129
 130         """
 131         def __init__(self, search, result):
 132                 self.search = search
 133                 self.result = result
 134
 135         def get_title(self):
 136                 """Returns the title of the result by calling :meth:`BaseSearch.get_result_title` on the raw result."""
 137                 return self.search.get_result_title(self.result)
 138
 139         def get_url(self):
 140                 """Returns the url of the result or an empty string by calling :meth:`BaseSearch.get_result_querydict` on the raw result and then encoding the querydict returned."""
 141                 qd = self.search.get_result_querydict(self.result)
 142                 if qd is None:
 143                         return ""
 144                 return "?%s" % qd.urlencode()
 145
 146         def get_template(self):
 147                 """Returns the template for the result by calling :meth:`BaseSearch.get_result_template` on the raw result."""
 148                 return self.search.get_result_template(self.result)
 149
 150         def get_extra_context(self):
 151                 """Returns any extra context for the result by calling :meth:`BaseSearch.get_result_extra_context` on the raw result."""
 152                 return self.search.get_result_extra_context(self.result)
 153
 154         def get_context(self):
 155                 """
 156                 Returns the context dictionary for the result. This is used both in rendering the result and in the AJAX return value for :meth:`.SearchView.ajax_api_view`. The context will contain everything from :meth:`get_extra_context` as well as the following keys:
 157
 158                 title
 159                         The result of calling :meth:`get_title`
 160                 url
 161                         The result of calling :meth:`get_url`
 162                 result
 163                         The raw result which the :class:`Result` was instantiated with.
 164
 165                 """
 166                 context = self.get_extra_context()
 167                 context.update({
 168                         'title': self.get_title(),
 169                         'url': self.get_url(),
 170                         'result': self.result
 171                 })
 172                 return context
 173
 174         def render(self):
 175                 """Returns the template from :meth:`get_template` rendered with the context from :meth:`get_context`."""
 176                 t = self.get_template()
 177                 c = Context(self.get_context())
 178                 return t.render(c)
 179
 180         def __unicode__(self):
 181                 """Returns :meth:`render`"""
 182                 return self.render()
 183
 184
 185 class BaseSearchMetaclass(type):
 186         def __new__(cls, name, bases, attrs):
 187                 if 'verbose_name' not in attrs:
 188                         attrs['verbose_name'] = capfirst(' '.join(convert_camelcase(name).rsplit(' ', 1)[:-1]))
 189                 if 'slug' not in attrs:
 190                         attrs['slug'] = name.lower()
 191                 return super(BaseSearchMetaclass, cls).__new__(cls, name, bases, attrs)
 192
 193
 194 class BaseSearch(object):
 195         """
 196         Defines a generic search api. Accessing :attr:`results` will attempt to retrieve cached results and, if that fails, will initiate a new search and store the results in the cache. Each search has a ``verbose_name`` and a ``slug``. If these are not provided as attributes, they will be automatically generated based on the name of the class.
 197
 198         :param search_arg: The string which is being searched for.
 199
 200         """
 201         __metaclass__ = BaseSearchMetaclass
 202         #: The number of results to return from the complete list. Default: 10
 203         result_limit = 10
 204         #: How long the items for the search should be cached (in minutes). Default: 48 hours.
 205         _cache_timeout = 60*48
 206         #: The path to the template which will be used to render the :class:`Result`\ s for this search.
 207         result_template = "sobol/search/basesearch.html"
 208
 209         def __init__(self, search_arg):
 210                 self.search_arg = search_arg
 211
 212         @property
 213         def results(self):
 214                 """Retrieves cached results or initiates a new search via :meth:`get_results` and caches the results."""
 215                 if not hasattr(self, '_results'):
 216                         try:
 217                                 # Cache one extra result so we can see if there are
 218                                 # more results to be had.
 219                                 limit = self.result_limit
 220                                 if limit is not None:
 221                                         limit += 1
 222                                 results = self.get_results(limit)
 223                         except:
 224                                 if settings.DEBUG:
 225                                         raise
 226                                 #  On exceptions, don't set any cache; just return.
 227                                 return []
 228
 229                         self._results = results
 230
 231                         if USE_CACHE:
 232                                 key = _make_cache_key(self, self.search_arg)
 233                                 cache.set(key, self, self._cache_timeout)
 234
 235                 return self._results
 236
 237         def get_results(self, limit=None, result_class=Result):
 238                 """
 239                 Calls :meth:`search` and parses the return value into :class:`Result` instances.
 240
 241                 :param limit: Passed directly to :meth:`search`.
 242                 :param result_class: The class used to represent the results. This will be instantiated with the :class:`BaseSearch` instance and the raw result from the search.
 243
 244                 """
 245                 results = self.search(limit)
 246                 return [result_class(self, result) for result in results]
 247
 248         def search(self, limit=None):
 249                 """Returns an iterable of up to ``limit`` results. The :meth:`get_result_title`, :meth:`get_result_url`, :meth:`get_result_template`, and :meth:`get_result_extra_context` methods will be used to interpret the individual items that this function returns, so the result can be an object with attributes as easily as a dictionary with keys. However, keep in mind that the raw results will be stored with django's caching mechanisms and will be converted to JSON."""
 250                 raise NotImplementedError
 251
 252         def get_result_title(self, result):
 253                 """Returns the title of the ``result``. Must be implemented by subclasses."""
 254                 raise NotImplementedError
 255
 256         def get_result_url(self, result):
 257                 """Returns the actual URL for the ``result`` or ``None`` if there is no URL. Must be implemented by subclasses."""
 258                 raise NotImplementedError
 259
 260         def get_result_querydict(self, result):
 261                 """Returns a querydict for tracking selection of the result, or ``None`` if there is no URL for the result."""
 262                 url = self.get_result_url(result)
 263                 if url is None:
 264                         return None
 265                 return make_tracking_querydict(self.search_arg, url)
 266
 267         def get_result_template(self, result):
 268                 """Returns the template to be used for rendering the ``result``."""
 269                 return loader.get_template(self.result_template)
 270
 271         def get_result_extra_context(self, result):
 272                 """Returns any extra context to be used when rendering the ``result``."""
 273                 return {}
 274
 275         def has_more_results(self):
 276                 """Returns ``True`` if there are more results than :attr:`result_limit` and ``False`` otherwise."""
 277                 return len(self.results) > self.result_limit
 278
 279         @property
 280         def more_results_url(self):
 281                 """Returns the actual url for more results. This should be accessed through :attr:`more_results_querydict` in the template so that the click can be tracked."""
 282                 raise NotImplementedError
 283
 284         @property
 285         def more_results_querydict(self):
 286                 """Returns a :class:`QueryDict` for tracking whether people click on a 'more results' link."""
 287                 return make_tracking_querydict(self.search_arg, self.more_results_url)
 288
 289         def __unicode__(self):
 290                 return self.verbose_name
 291
 292
 293 class DatabaseSearch(BaseSearch):
 294         """Implements :meth:`~BaseSearch.search` and :meth:`get_queryset` methods to handle database queries."""
 295         #: The model which should be searched by the :class:`DatabaseSearch`.
 296         model = None
 297
 298         def search(self, limit=None):
 299                 if not hasattr(self, '_qs'):
 300                         self._qs = self.get_queryset()
 301                         if limit is not None:
 302                                 self._qs = self._qs[:limit]
 303
 304                 return self._qs
 305
 306         def get_queryset(self):
 307                 """Returns a :class:`QuerySet` of all instances of :attr:`model`. This method should be overridden by subclasses to specify how the search should actually be implemented for the model."""
 308                 return self.model._default_manager.all()
 309
 310
 311 class URLSearch(BaseSearch):
 312         """Defines a generic interface for searches that require accessing a certain url to get search results."""
 313         #: The base URL which will be accessed to get the search results.
 314         search_url = ''
 315         #: The url-encoded query string to be used for fetching search results from :attr:`search_url`. Must have one ``%s`` to contain the search argument.
 316         query_format_str = "%s"
 317
 318         @property
 319         def url(self):
 320                 """The URL where the search gets its results. Composed from :attr:`search_url` and :attr:`query_format_str`."""
 321                 return self.search_url + self.query_format_str % urlquote_plus(self.search_arg)
 322
 323         @property
 324         def more_results_url(self):
 325                 return self.url
 326
 327         def parse_response(self, response, limit=None):
 328                 """Handles the ``response`` from accessing :attr:`url` (with :func:`urllib2.urlopen`) and returns a list of up to ``limit`` results."""
 329                 raise NotImplementedError
 330
 331         def search(self, limit=None):
 332                 return self.parse_response(urllib2.urlopen(self.url), limit=limit)
 333
 334
 335 class JSONSearch(URLSearch):
 336         """Makes a GET request and parses the results as JSON. The default behavior assumes that the response contains a list of results."""
 337         def parse_response(self, response, limit=None):
 338                 return json.loads(response.read())[:limit]
 339
 340
 341 class GoogleSearch(JSONSearch):
 342         """An example implementation of a :class:`JSONSearch`."""
 343         search_url = "http://ajax.googleapis.com/ajax/services/search/web"
 344         _cache_timeout = 60
 345         verbose_name = "Google search (current site)"
 346         result_template = "sobol/search/googlesearch.html"
 347
 348         @property
 349         def query_format_str(self):
 350                 default_args = self.default_args
 351                 if default_args:
 352                         default_args += " "
 353                 return "?v=1.0&q=%s%%s" % urlquote_plus(default_args).replace('%', '%%')
 354
 355         @property
 356         def default_args(self):
 357                 """Unquoted default arguments for the :class:`GoogleSearch`."""
 358                 return "site:%s" % Site.objects.get_current().domain
 359
 360         def parse_response(self, response, limit=None):
 361                 responseData = json.loads(response.read())['responseData']
 362                 results, cursor = responseData['results'], responseData['cursor']
 363
 364                 if results:
 365                         self._more_results_url = cursor['moreResultsUrl']
 366                         self._estimated_result_count = cursor['estimatedResultCount']
 367
 368                 return results[:limit]
 369
 370         @property
 371         def url(self):
 372                 # Google requires that an ajax request have a proper Referer header.
 373                 return urllib2.Request(
 374                         super(GoogleSearch, self).url,
 375                         None,
 376                         {'Referer': "http://%s" % Site.objects.get_current().domain}
 377                 )
 378
 379         @property
 380         def has_more_results(self):
 381                 if self.results and len(self.results) < self._estimated_result_count:
 382                         return True
 383                 return False
 384
 385         @property
 386         def more_results_url(self):
 387                 return self._more_results_url
 388
 389         def get_result_title(self, result):
 390                 return result['titleNoFormatting']
 391
 392         def get_result_url(self, result):
 393                 return result['unescapedUrl']
 394
 395
# Register the example Google search in the module-level registry under its
# default slug.
registry.register(GoogleSearch)
 397
 398
 399 try:
 400         from BeautifulSoup import BeautifulSoup, SoupStrainer, BeautifulStoneSoup
 401 except:
 402         pass
 403 else:
 404         __all__ += ('ScrapeSearch', 'XMLSearch',)
 405         class ScrapeSearch(URLSearch):
 406                 """A base class for scrape-style searching, available if :mod:`BeautifulSoup` is installed."""
 407                 #: Arguments to be passed into a :class:`SoupStrainer`.
 408                 strainer_args = []
 409                 #: Keyword arguments to be passed into a :class:`SoupStrainer`.
 410                 strainer_kwargs = {}
 411
 412                 @property
 413                 def strainer(self):
 414                         """
 415                         Caches and returns a :class:`SoupStrainer` initialized with :attr:`strainer_args` and :attr:`strainer_kwargs`. This strainer will be used to parse only certain parts of the document.
 416
 417                         .. seealso:: `BeautifulSoup: Improving Performance by Parsing Only Part of the Document <http://www.crummy.com/software/BeautifulSoup/documentation.html#Improving%20Performance%20by%20Parsing%20Only%20Part%20of%20the%20Document>`_
 418
 419                         """
 420                         if not hasattr(self, '_strainer'):
 421                                 self._strainer = SoupStrainer(*self.strainer_args, **self.strainer_kwargs)
 422                         return self._strainer
 423
 424                 def parse_response(self, response, limit=None):
 425                         strainer = self.strainer
 426                         soup = BeautifulSoup(response, parseOnlyThese=strainer)
 427                         return self.parse_results(soup.findAll(recursive=False, limit=limit))
 428
 429                 def parse_results(self, results):
 430                         """
 431                         Provides a hook for parsing the results of straining. This has no default behavior and must be implemented by subclasses because the results absolutely must be parsed to properly extract the information.
 432
 433                         .. seealso:: `BeautifulSoup: Improving Memory Usage with extract <http://www.crummy.com/software/BeautifulSoup/documentation.html#Improving%20Memory%20Usage%20with%20extract>`_
 434                         """
 435                         raise NotImplementedError
 436
 437
 438         class XMLSearch(ScrapeSearch):
 439                 """A base class for searching XML results."""
 440                 #: Self-closing tag names to be used when interpreting the XML document
 441                 #:
 442                 #: .. seealso:: `BeautifulSoup: Parsing XML <http://www.crummy.com/software/BeautifulSoup/documentation.html#Parsing%20XML>`_
 443                 self_closing_tags = []
 444
 445                 def parse_response(self, response, limit=None):
 446                         strainer = self.strainer
 447                         soup = BeautifulStoneSoup(response, selfClosingTags=self.self_closing_tags, parseOnlyThese=strainer)
 448                         return self.parse_results(soup.findAll(recursive=False, limit=limit))