philo/contrib/sobol/search.py

   1 #encoding: utf-8
   2 import datetime
   3 from hashlib import sha1
   4
   5 from django.conf import settings
   6 from django.contrib.sites.models import Site
   7 from django.core.cache import cache
   8 from django.db.models.options import get_verbose_name as convert_camelcase
   9 from django.utils import simplejson as json
  10 from django.utils.http import urlquote_plus
  11 from django.utils.safestring import mark_safe
  12 from django.utils.text import capfirst
  13 from django.template import loader, Context, Template
  14
  15 from philo.contrib.sobol.utils import make_tracking_querydict, RegistryIterator
  16
  17
  18 if getattr(settings, 'SOBOL_USE_EVENTLET', False):
  19         try:
  20                 from eventlet.green import urllib2
  21         except:
  22                 import urllib2
  23 else:
  24         import urllib2
  25
  26
  27 __all__ = (
  28         'Result', 'BaseSearch', 'DatabaseSearch', 'URLSearch', 'JSONSearch', 'GoogleSearch', 'SearchRegistry', 'registry', 'get_search_instance'
  29 )
  30
  31
  32 SEARCH_CACHE_SEED = 'philo_sobol_search_results'
  33 USE_CACHE = getattr(settings, 'SOBOL_USE_SEARCH', True)
  34
  35
  36 class RegistrationError(Exception):
  37         """Raised if there is a problem registering a search with a :class:`SearchRegistry`"""
  38         pass
  39
  40
  41 class SearchRegistry(object):
  42         """Holds a registry of search types by slug."""
  43
  44         def __init__(self):
  45                 self._registry = {}
  46
  47         def register(self, search, slug=None):
  48                 """
  49                 Register a search with the registry.
  50
  51                 :param search: The search class to register - generally a subclass of :class:`BaseSearch`
  52                 :param slug: The slug which will be used to register the search class. If ``slug`` is ``None``, the search's default slug will be used.
  53                 :raises: :class:`RegistrationError` if a different search is already registered with ``slug``.
  54
  55                 """
  56                 slug = slug or search.slug
  57                 if slug in self._registry:
  58                         registered = self._registry[slug]
  59                         if registered.__module__ != search.__module__:
  60                                 raise RegistrationError("A different search is already registered as `%s`" % slug)
  61                 else:
  62                         self._registry[slug] = search
  63
  64         def unregister(self, search, slug=None):
  65                 """
  66                 Unregister a search from the registry.
  67
  68                 :param search: The search class to unregister - generally a subclass of :class:`BaseSearch`
  69                 :param slug: If provided, the search will only be removed if it was registered with ``slug``. If not provided, the search class will be unregistered no matter what slug it was registered with.
  70                 :raises: :class:`RegistrationError` if a slug is provided but the search registered with that slug is not ``search``.
  71
  72                 """
  73                 if slug is not None:
  74                         if slug in self._registry and self._registry[slug] == search:
  75                                 del self._registry[slug]
  76                         raise RegistrationError("`%s` is not registered as `%s`" % (search, slug))
  77                 else:
  78                         for slug, search in self._registry.items():
  79                                 if search == search:
  80                                         del self._registry[slug]
  81
  82         def items(self):
  83                 """Returns a list of (slug, search) items in the registry."""
  84                 return self._registry.items()
  85
  86         def iteritems(self):
  87                 """Returns an iterator over the (slug, search) pairs in the registry."""
  88                 return RegistryIterator(self._registry, 'iteritems')
  89
  90         def iterchoices(self):
  91                 """Returns an iterator over (slug, search.verbose_name) pairs for the registry."""
  92                 return RegistryIterator(self._registry, 'iteritems', lambda x: (x[0], x[1].verbose_name))
  93
  94         def __getitem__(self, key):
  95                 """Returns the search registered with ``key``."""
  96                 return self._registry[key]
  97
  98         def __iter__(self):
  99                 """Returns an iterator over the keys in the registry."""
 100                 return self._registry.__iter__()
 101
 102
 103 registry = SearchRegistry()  # Module-level registry; get_search_instance() looks slugs up in it.
 104
 105
 106 def _make_cache_key(search, search_arg):
 107         return sha1(SEARCH_CACHE_SEED + search.slug + search_arg).hexdigest()
 108
 109
 110 def get_search_instance(slug, search_arg):
 111         """Returns a search instance for the given slug, either from the cache or newly-instantiated."""
 112         search = registry[slug]
 113         search_arg = search_arg.lower()
 114         if USE_CACHE:
 115                 key = _make_cache_key(search, search_arg)
 116                 cached = cache.get(key)
 117                 if cached:
 118                         return cached
 119         instance = search(search_arg)
 120         instance.slug = slug
 121         return instance
 122
 123
 124
 125 class Result(object):
 126         """
 127         :class:`Result` is a helper class that, given a search and a result of that search, is able to correctly render itself with a template defined by the search. Every :class:`Result` will pass a ``title``, a ``url`` (if applicable), and the raw ``result`` returned by the search into the template context when rendering.
 128
 129         :param search: An instance of a :class:`BaseSearch` subclass or an object that implements the same API.
 130         :param result: An arbitrary result from the ``search``.
 131
 132         """
 133         def __init__(self, search, result):
 134                 self.search = search
 135                 self.result = result
 136
 137         def get_title(self):
 138                 """Returns the title of the result by calling :meth:`BaseSearch.get_result_title` on the raw result."""
 139                 return self.search.get_result_title(self.result)
 140
 141         def get_url(self):
 142                 """Returns the url of the result or an empty string by calling :meth:`BaseSearch.get_result_querydict` on the raw result and then encoding the querydict returned."""
 143                 qd = self.search.get_result_querydict(self.result)
 144                 if qd is None:
 145                         return ""
 146                 return "?%s" % qd.urlencode()
 147
 148         def get_template(self):
 149                 """Returns the template for the result by calling :meth:`BaseSearch.get_result_template` on the raw result."""
 150                 return self.search.get_result_template(self.result)
 151
 152         def get_extra_context(self):
 153                 """Returns any extra context for the result by calling :meth:`BaseSearch.get_result_extra_context` on the raw result."""
 154                 return self.search.get_result_extra_context(self.result)
 155
 156         def get_context(self):
 157                 """
 158                 Returns the context dictionary for the result. This is used both in rendering the result and in the AJAX return value for :meth:`.SearchView.ajax_api_view`. The context will contain everything from :meth:`get_extra_context` as well as the following keys:
 159
 160                 title
 161                         The result of calling :meth:`get_title`
 162                 url
 163                         The result of calling :meth:`get_url`
 164                 result
 165                         The raw result which the :class:`Result` was instantiated with.
 166
 167                 """
 168                 context = self.get_extra_context()
 169                 context.update({
 170                         'title': self.get_title(),
 171                         'url': self.get_url(),
 172                         'result': self.result
 173                 })
 174                 return context
 175
 176         def render(self):
 177                 """Returns the template from :meth:`get_template` rendered with the context from :meth:`get_context`."""
 178                 t = self.get_template()
 179                 c = Context(self.get_context())
 180                 return t.render(c)
 181
 182         def __unicode__(self):
 183                 """Returns :meth:`render`"""
 184                 return self.render()
 185
 186
 187 class BaseSearchMetaclass(type):
 188         def __new__(cls, name, bases, attrs):
 189                 if 'verbose_name' not in attrs:
 190                         attrs['verbose_name'] = capfirst(' '.join(convert_camelcase(name).rsplit(' ', 1)[:-1]))
 191                 if 'slug' not in attrs:
 192                         attrs['slug'] = name[:-6].lower() if name.endswith("Search") else name.lower()
 193                 return super(BaseSearchMetaclass, cls).__new__(cls, name, bases, attrs)
 194
 195
 196 class BaseSearch(object):
 197         """
 198         Defines a generic search api. Accessing :attr:`results` will attempt to retrieve cached results and, if that fails, will initiate a new search and store the results in the cache. Each search has a ``verbose_name`` and a ``slug``. If these are not provided as attributes, they will be automatically generated based on the name of the class.
 199
 200         :param search_arg: The string which is being searched for.
 201
 202         """
 203         __metaclass__ = BaseSearchMetaclass
 204         #: The number of results to return from the complete list. Default: 5
 205         result_limit = 5
 206         #: How long the items for the search should be cached (in minutes). Default: 48 hours.
 207         _cache_timeout = 60*48
 208         #: The path to the template which will be used to render the :class:`Result`\ s for this search.
 209         result_template = "sobol/search/basesearch.html"
 210
 211         def __init__(self, search_arg):
 212                 self.search_arg = search_arg
 213
 214         @property
 215         def results(self):
 216                 """Retrieves cached results or initiates a new search via :meth:`get_results` and caches the results."""
 217                 if not hasattr(self, '_results'):
 218                         try:
 219                                 # Cache one extra result so we can see if there are
 220                                 # more results to be had.
 221                                 limit = self.result_limit
 222                                 if limit is not None:
 223                                         limit += 1
 224                                 results = self.get_results(limit)
 225                         except:
 226                                 if settings.DEBUG:
 227                                         raise
 228                                 #  On exceptions, don't set any cache; just return.
 229                                 return []
 230
 231                         self._results = results
 232
 233                         if USE_CACHE:
 234                                 key = _make_cache_key(self, self.search_arg)
 235                                 cache.set(key, self, self._cache_timeout)
 236
 237                 return self._results
 238
 239         def get_results(self, limit=None, result_class=Result):
 240                 """
 241                 Calls :meth:`search` and parses the return value into :class:`Result` instances.
 242
 243                 :param limit: Passed directly to :meth:`search`.
 244                 :param result_class: The class used to represent the results. This will be instantiated with the :class:`BaseSearch` instance and the raw result from the search.
 245
 246                 """
 247                 results = self.search(limit)
 248                 return [result_class(self, result) for result in results]
 249
 250         def search(self, limit=None):
 251                 """Returns an iterable of up to ``limit`` results. The :meth:`get_result_title`, :meth:`get_result_url`, :meth:`get_result_template`, and :meth:`get_result_extra_context` methods will be used to interpret the individual items that this function returns, so the result can be an object with attributes as easily as a dictionary with keys. However, keep in mind that the raw results will be stored with django's caching mechanisms and will be converted to JSON."""
 252                 raise NotImplementedError
 253
 254         def get_result_title(self, result):
 255                 """Returns the title of the ``result``. Must be implemented by subclasses."""
 256                 raise NotImplementedError
 257
 258         def get_result_url(self, result):
 259                 """Returns the actual URL for the ``result`` or ``None`` if there is no URL. Must be implemented by subclasses."""
 260                 raise NotImplementedError
 261
 262         def get_result_querydict(self, result):
 263                 """Returns a querydict for tracking selection of the result, or ``None`` if there is no URL for the result."""
 264                 url = self.get_result_url(result)
 265                 if url is None:
 266                         return None
 267                 return make_tracking_querydict(self.search_arg, url)
 268
 269         def get_result_template(self, result):
 270                 """Returns the template to be used for rendering the ``result``."""
 271                 return loader.get_template(self.result_template)
 272
 273         def get_result_extra_context(self, result):
 274                 """Returns any extra context to be used when rendering the ``result``."""
 275                 return {}
 276
 277         @property
 278         def has_more_results(self):
 279                 """Returns ``True`` if there are more results than :attr:`result_limit` and ``False`` otherwise."""
 280                 return len(self.results) > self.result_limit
 281
 282         @property
 283         def more_results_url(self):
 284                 """Returns the actual url for more results. This should be accessed through :attr:`more_results_querydict` in the template so that the click can be tracked. By default, simply returns ``None``."""
 285                 return None
 286
 287         @property
 288         def more_results_querydict(self):
 289                 """Returns a :class:`QueryDict` for tracking whether people click on a 'more results' link."""
 290                 url = self.more_results_url
 291                 if url:
 292                         return make_tracking_querydict(self.search_arg, url)
 293                 return None
 294
 295         def __unicode__(self):
 296                 return self.verbose_name
 297
 298
 299 class DatabaseSearch(BaseSearch):
 300         """Implements :meth:`~BaseSearch.search` and :meth:`get_queryset` methods to handle database queries."""
 301         #: The model which should be searched by the :class:`DatabaseSearch`.
 302         model = None
 303
 304         def search(self, limit=None):
 305                 if not hasattr(self, '_qs'):
 306                         self._qs = self.get_queryset()
 307                         if limit is not None:
 308                                 self._qs = self._qs[:limit]
 309
 310                 return self._qs
 311
 312         def get_queryset(self):
 313                 """Returns a :class:`QuerySet` of all instances of :attr:`model`. This method should be overridden by subclasses to specify how the search should actually be implemented for the model."""
 314                 return self.model._default_manager.all()
 315
 316
 317 class URLSearch(BaseSearch):
 318         """Defines a generic interface for searches that require accessing a certain url to get search results."""
 319         #: The base URL which will be accessed to get the search results.
 320         search_url = ''
 321         #: The url-encoded query string to be used for fetching search results from :attr:`search_url`. Must have one ``%s`` to contain the search argument.
 322         query_format_str = "%s"
 323
 324         @property
 325         def url(self):
 326                 """The URL where the search gets its results. Composed from :attr:`search_url` and :attr:`query_format_str`."""
 327                 return self.search_url + self.query_format_str % urlquote_plus(self.search_arg)
 328
 329         @property
 330         def more_results_url(self):
 331                 return self.url
 332
 333         def parse_response(self, response, limit=None):
 334                 """Handles the ``response`` from accessing :attr:`url` (with :func:`urllib2.urlopen`) and returns a list of up to ``limit`` results."""
 335                 raise NotImplementedError
 336
 337         def search(self, limit=None):
 338                 return self.parse_response(urllib2.urlopen(self.url), limit=limit)
 339
 340
 341 class JSONSearch(URLSearch):
 342         """Makes a GET request and parses the results as JSON. The default behavior assumes that the response contains a list of results."""
 343         def parse_response(self, response, limit=None):
 344                 return json.loads(response.read())[:limit]
 345
 346
 347 class GoogleSearch(JSONSearch):
 348         """An example implementation of a :class:`JSONSearch`."""
 349         search_url = "http://ajax.googleapis.com/ajax/services/search/web"
 350         _cache_timeout = 60
 351         verbose_name = "Google search (current site)"
 352         result_template = "sobol/search/googlesearch.html"
 353         _more_results_url = None
 354
 355         @property
 356         def query_format_str(self):
 357                 default_args = self.default_args
 358                 if default_args:
 359                         default_args += " "
 360                 return "?v=1.0&q=%s%%s" % urlquote_plus(default_args).replace('%', '%%')
 361
 362         @property
 363         def default_args(self):
 364                 """Unquoted default arguments for the :class:`GoogleSearch`."""
 365                 return "site:%s" % Site.objects.get_current().domain
 366
 367         def parse_response(self, response, limit=None):
 368                 responseData = json.loads(response.read())['responseData']
 369                 results, cursor = responseData['results'], responseData['cursor']
 370
 371                 if results:
 372                         self._more_results_url = cursor['moreResultsUrl']
 373                         self._estimated_result_count = cursor['estimatedResultCount']
 374
 375                 return results[:limit]
 376
 377         @property
 378         def url(self):
 379                 # Google requires that an ajax request have a proper Referer header.
 380                 return urllib2.Request(
 381                         super(GoogleSearch, self).url,
 382                         None,
 383                         {'Referer': "http://%s" % Site.objects.get_current().domain}
 384                 )
 385
 386         @property
 387         def has_more_results(self):
 388                 if self.results and len(self.results) < self._estimated_result_count:
 389                         return True
 390                 return False
 391
 392         @property
 393         def more_results_url(self):
 394                 return self._more_results_url
 395
 396         def get_result_title(self, result):
 397                 return result['titleNoFormatting']
 398
 399         def get_result_url(self, result):
 400                 return result['unescapedUrl']
 401
 402
 403 registry.register(GoogleSearch)  # No slug passed, so GoogleSearch is registered under its own ``slug`` attribute.
 404
 405
 406 try:
 407         from BeautifulSoup import BeautifulSoup, SoupStrainer, BeautifulStoneSoup
 408 except:
 409         pass
 410 else:
 411         __all__ += ('ScrapeSearch', 'XMLSearch',)
 412         class ScrapeSearch(URLSearch):
 413                 """A base class for scrape-style searching, available if :mod:`BeautifulSoup` is installed."""
 414                 #: Arguments to be passed into a :class:`SoupStrainer`.
 415                 strainer_args = []
 416                 #: Keyword arguments to be passed into a :class:`SoupStrainer`.
 417                 strainer_kwargs = {}
 418
 419                 @property
 420                 def strainer(self):
 421                         """
 422                         Caches and returns a :class:`SoupStrainer` initialized with :attr:`strainer_args` and :attr:`strainer_kwargs`. This strainer will be used to parse only certain parts of the document.
 423
 424                         .. seealso:: `BeautifulSoup: Improving Performance by Parsing Only Part of the Document <http://www.crummy.com/software/BeautifulSoup/documentation.html#Improving%20Performance%20by%20Parsing%20Only%20Part%20of%20the%20Document>`_
 425
 426                         """
 427                         if not hasattr(self, '_strainer'):
 428                                 self._strainer = SoupStrainer(*self.strainer_args, **self.strainer_kwargs)
 429                         return self._strainer
 430
 431                 def parse_response(self, response, limit=None):
 432                         strainer = self.strainer
 433                         soup = BeautifulSoup(response, parseOnlyThese=strainer)
 434                         return self.parse_results(soup.findAll(recursive=False, limit=limit))
 435
 436                 def parse_results(self, results):
 437                         """
 438                         Provides a hook for parsing the results of straining. This has no default behavior and must be implemented by subclasses because the results absolutely must be parsed to properly extract the information.
 439
 440                         .. seealso:: `BeautifulSoup: Improving Memory Usage with extract <http://www.crummy.com/software/BeautifulSoup/documentation.html#Improving%20Memory%20Usage%20with%20extract>`_
 441                         """
 442                         raise NotImplementedError
 443
 444
 445         class XMLSearch(ScrapeSearch):
 446                 """A base class for searching XML results."""
 447                 #: Self-closing tag names to be used when interpreting the XML document
 448                 #:
 449                 #: .. seealso:: `BeautifulSoup: Parsing XML <http://www.crummy.com/software/BeautifulSoup/documentation.html#Parsing%20XML>`_
 450                 self_closing_tags = []
 451
 452                 def parse_response(self, response, limit=None):
 453                         strainer = self.strainer
 454                         soup = BeautifulStoneSoup(response, selfClosingTags=self.self_closing_tags, parseOnlyThese=strainer)
 455                         return self.parse_results(soup.findAll(recursive=False, limit=limit))