philo/contrib/sobol/search.py

   1 #encoding: utf-8
   2 import datetime
   3
   4 from django.conf import settings
   5 from django.contrib.sites.models import Site
   6 from django.core.cache import cache
   7 from django.db.models.options import get_verbose_name as convert_camelcase
   8 from django.utils import simplejson as json
   9 from django.utils.http import urlquote_plus
  10 from django.utils.safestring import mark_safe
  11 from django.utils.text import capfirst
  12 from django.template import loader, Context, Template
  13
  14 from philo.contrib.sobol.utils import make_tracking_querydict, RegistryIterator
  15
  16
  17 if getattr(settings, 'SOBOL_USE_EVENTLET', False):
  18         try:
  19                 from eventlet.green import urllib2
  20         except:
  21                 import urllib2
  22 else:
  23         import urllib2
  24
  25
  26 __all__ = (
  27         'Result', 'BaseSearch', 'DatabaseSearch', 'URLSearch', 'JSONSearch', 'GoogleSearch', 'SearchRegistry', 'registry'
  28 )
  29
  30
  31 SEARCH_CACHE_KEY = 'philo_sobol_search_results'
  32 DEFAULT_RESULT_TEMPLATE_STRING = "{% if url %}<a href='{{ url }}'>{% endif %}{{ title }}{% if url %}</a>{% endif %}"
  33 DEFAULT_RESULT_TEMPLATE = Template(DEFAULT_RESULT_TEMPLATE_STRING)
  34
  35 # Determines the timeout on the entire result cache.
  36 MAX_CACHE_TIMEOUT = 60*24*7
  37
  38
  39 class RegistrationError(Exception):
  40         """Raised if there is a problem registering a search with a :class:`SearchRegistry`"""
  41         pass
  42
  43
  44 class SearchRegistry(object):
  45         """Holds a registry of search types by slug."""
  46
  47         def __init__(self):
  48                 self._registry = {}
  49
  50         def register(self, search, slug=None):
  51                 """
  52                 Register a search with the registry.
  53
  54                 :param search: The search class to register - generally a subclass of :class:`BaseSearch`
  55                 :param slug: The slug which will be used to register the search class. If ``slug`` is ``None``, the search's default slug will be used.
  56                 :raises: :class:`RegistrationError` if a different search is already registered with ``slug``.
  57
  58                 """
  59                 slug = slug or search.slug
  60                 if slug in self._registry:
  61                         registered = self._registry[slug]
  62                         if registered.__module__ != search.__module__:
  63                                 raise RegistrationError("A different search is already registered as `%s`" % slug)
  64                 else:
  65                         self._registry[slug] = search
  66
  67         def unregister(self, search, slug=None):
  68                 """
  69                 Unregister a search from the registry.
  70
  71                 :param search: The search class to unregister - generally a subclass of :class:`BaseSearch`
  72                 :param slug: If provided, the search will only be removed if it was registered with ``slug``. If not provided, the search class will be unregistered no matter what slug it was registered with.
  73                 :raises: :class:`RegistrationError` if a slug is provided but the search registered with that slug is not ``search``.
  74
  75                 """
  76                 if slug is not None:
  77                         if slug in self._registry and self._registry[slug] == search:
  78                                 del self._registry[slug]
  79                         raise RegistrationError("`%s` is not registered as `%s`" % (search, slug))
  80                 else:
  81                         for slug, search in self._registry.items():
  82                                 if search == search:
  83                                         del self._registry[slug]
  84
  85         def items(self):
  86                 """Returns a list of (slug, search) items in the registry."""
  87                 return self._registry.items()
  88
  89         def iteritems(self):
  90                 """Returns an iterator over the (slug, search) pairs in the registry."""
  91                 return RegistryIterator(self._registry, 'iteritems')
  92
  93         def iterchoices(self):
  94                 """Returns an iterator over (slug, search.verbose_name) pairs for the registry."""
  95                 return RegistryIterator(self._registry, 'iteritems', lambda x: (x[0], x[1].verbose_name))
  96
  97         def __getitem__(self, key):
  98                 """Returns the search registered with ``key``."""
  99                 return self._registry[key]
 100
 101         def __iter__(self):
 102                 """Returns an iterator over the keys in the registry."""
 103                 return self._registry.__iter__()
 104
 105
 106 registry = SearchRegistry()
 107
 108
 109 class Result(object):
 110         """
 111         :class:`Result` is a helper class that, given a search and a result of that search, is able to correctly render itself with a template defined by the search. Every :class:`Result` will pass a ``title``, a ``url`` (if applicable), and the raw ``result`` returned by the search into the template context when rendering.
 112
 113         :param search: An instance of a :class:`BaseSearch` subclass or an object that implements the same API.
 114         :param result: An arbitrary result from the ``search``.
 115
 116         """
 117         def __init__(self, search, result):
 118                 self.search = search
 119                 self.result = result
 120
 121         def get_title(self):
 122                 """Returns the title of the result by calling :meth:`BaseSearch.get_result_title` on the raw result."""
 123                 return self.search.get_result_title(self.result)
 124
 125         def get_url(self):
 126                 """Returns the url of the result or an empty string by calling :meth:`BaseSearch.get_result_querydict` on the raw result and then encoding the querydict returned."""
 127                 qd = self.search.get_result_querydict(self.result)
 128                 if qd is None:
 129                         return ""
 130                 return "?%s" % qd.urlencode()
 131
 132         def get_template(self):
 133                 """Returns the template for the result by calling :meth:`BaseSearch.get_result_template` on the raw result."""
 134                 return self.search.get_result_template(self.result)
 135
 136         def get_extra_context(self):
 137                 """Returns any extra context for the result by calling :meth:`BaseSearch.get_result_extra_context` on the raw result."""
 138                 return self.search.get_result_extra_context(self.result)
 139
 140         def get_context(self):
 141                 """
 142                 Returns the context dictionary for the result. This is used both in rendering the result and in the AJAX return value for :meth:`.SearchView.ajax_api_view`. The context will contain everything from :meth:`get_extra_context` as well as the following keys:
 143
 144                 title
 145                         The result of calling :meth:`get_title`
 146                 url
 147                         The result of calling :meth:`get_url`
 148                 result
 149                         The raw result which the :class:`Result` was instantiated with.
 150
 151                 """
 152                 context = self.get_extra_context()
 153                 context.update({
 154                         'title': self.get_title(),
 155                         'url': self.get_url(),
 156                         'result': self.result
 157                 })
 158                 return context
 159
 160         def render(self):
 161                 """Returns the template from :meth:`get_template` rendered with the context from :meth:`get_context`."""
 162                 t = self.get_template()
 163                 c = Context(self.get_context())
 164                 return t.render(c)
 165
 166         def __unicode__(self):
 167                 """Returns :meth:`render`"""
 168                 return self.render()
 169
 170
 171 class BaseSearchMetaclass(type):
 172         def __new__(cls, name, bases, attrs):
 173                 if 'verbose_name' not in attrs:
 174                         attrs['verbose_name'] = capfirst(' '.join(convert_camelcase(name).rsplit(' ', 1)[:-1]))
 175                 if 'slug' not in attrs:
 176                         attrs['slug'] = name.lower()
 177                 return super(BaseSearchMetaclass, cls).__new__(cls, name, bases, attrs)
 178
 179
 180 class BaseSearch(object):
 181         """
 182         Defines a generic search api. Accessing :attr:`results` will attempt to retrieve cached results and, if that fails, will initiate a new search and store the results in the cache. Each search has a ``verbose_name`` and a ``slug``. If these are not provided as attributes, they will be automatically generated based on the name of the class.
 183
 184         :param search_arg: The string which is being searched for.
 185
 186         """
 187         __metaclass__ = BaseSearchMetaclass
 188         #: The number of results to return from the complete list. Default: 10
 189         result_limit = 10
 190         #: How long the items for the search should be cached (in minutes). Default: 48 hours.
 191         _cache_timeout = 60*48
 192
 193         def __init__(self, search_arg):
 194                 self.search_arg = search_arg
 195
 196         def _get_cached_results(self):
 197                 """Return the cached results if the results haven't timed out. Otherwise return None."""
 198                 result_cache = cache.get(SEARCH_CACHE_KEY)
 199                 if result_cache and self.__class__ in result_cache and self.search_arg.lower() in result_cache[self.__class__]:
 200                         cached = result_cache[self.__class__][self.search_arg.lower()]
 201                         if cached['timeout'] >= datetime.datetime.now():
 202                                 return cached['results']
 203                 return None
 204
 205         def _set_cached_results(self, results, timeout):
 206                 """Sets the results to the cache for <timeout> minutes."""
 207                 result_cache = cache.get(SEARCH_CACHE_KEY) or {}
 208                 cached = result_cache.setdefault(self.__class__, {}).setdefault(self.search_arg.lower(), {})
 209                 cached.update({
 210                         'results': results,
 211                         'timeout': datetime.datetime.now() + datetime.timedelta(minutes=timeout)
 212                 })
 213                 cache.set(SEARCH_CACHE_KEY, result_cache, MAX_CACHE_TIMEOUT)
 214
 215         @property
 216         def results(self):
 217                 """Retrieves cached results or initiates a new search via :meth:`get_results` and caches the results."""
 218                 if not hasattr(self, '_results'):
 219                         results = self._get_cached_results()
 220                         if results is None:
 221                                 try:
 222                                         # Cache one extra result so we can see if there are
 223                                         # more results to be had.
 224                                         limit = self.result_limit
 225                                         if limit is not None:
 226                                                 limit += 1
 227                                         results = self.get_results(limit)
 228                                 except:
 229                                         if settings.DEBUG:
 230                                                 raise
 231                                         #  On exceptions, don't set any cache; just return.
 232                                         return []
 233
 234                                 self._set_cached_results(results, self._cache_timeout)
 235                         self._results = results
 236
 237                 return self._results
 238
 239         def get_results(self, limit=None, result_class=Result):
 240                 """
 241                 Calls :meth:`search` and parses the return value into :class:`Result` instances.
 242
 243                 :param limit: Passed directly to :meth:`search`.
 244                 :param result_class: The class used to represent the results. This will be instantiated with the :class:`BaseSearch` instance and the raw result from the search.
 245
 246                 """
 247                 results = self.search(limit)
 248                 return [result_class(self, result) for result in results]
 249
 250         def search(self, limit=None):
 251                 """Returns an iterable of up to ``limit`` results. The :meth:`get_result_title`, :meth:`get_result_url`, :meth:`get_result_template`, and :meth:`get_result_extra_context` methods will be used to interpret the individual items that this function returns, so the result can be an object with attributes as easily as a dictionary with keys. However, keep in mind that the raw results will be stored with django's caching mechanisms and will be converted to JSON."""
 252                 raise NotImplementedError
 253
 254         def get_result_title(self, result):
 255                 """Returns the title of the ``result``. Must be implemented by subclasses."""
 256                 raise NotImplementedError
 257
 258         def get_result_url(self, result):
 259                 """Returns the actual URL for the ``result`` or ``None`` if there is no URL. Must be implemented by subclasses."""
 260                 raise NotImplementedError
 261
 262         def get_result_querydict(self, result):
 263                 """Returns a querydict for tracking selection of the result, or ``None`` if there is no URL for the result."""
 264                 url = self.get_result_url(result)
 265                 if url is None:
 266                         return None
 267                 return make_tracking_querydict(self.search_arg, url)
 268
 269         def get_result_template(self, result):
 270                 """Returns the template to be used for rendering the ``result``."""
 271                 if hasattr(self, 'result_template'):
 272                         return loader.get_template(self.result_template)
 273                 if not hasattr(self, '_result_template'):
 274                         self._result_template = DEFAULT_RESULT_TEMPLATE
 275                 return self._result_template
 276
 277         def get_result_extra_context(self, result):
 278                 """Returns any extra context to be used when rendering the ``result``."""
 279                 return {}
 280
 281         def has_more_results(self):
 282                 """Returns ``True`` if there are more results than :attr:`result_limit` and ``False`` otherwise."""
 283                 return len(self.results) > self.result_limit
 284
 285         @property
 286         def more_results_url(self):
 287                 """Returns the actual url for more results. This should be accessed through :attr:`more_results_querydict` in the template so that the click can be tracked."""
 288                 raise NotImplementedError
 289
 290         @property
 291         def more_results_querydict(self):
 292                 """Returns a :class:`QueryDict` for tracking whether people click on a 'more results' link."""
 293                 return make_tracking_querydict(self.search_arg, self.more_results_url)
 294
 295         def __unicode__(self):
 296                 return self.verbose_name
 297
 298
 299 class DatabaseSearch(BaseSearch):
 300         """Implements :meth:`~BaseSearch.search` and :meth:`get_queryset` methods to handle database queries."""
 301         #: The model which should be searched by the :class:`DatabaseSearch`.
 302         model = None
 303
 304         def search(self, limit=None):
 305                 if not hasattr(self, '_qs'):
 306                         self._qs = self.get_queryset()
 307                         if limit is not None:
 308                                 self._qs = self._qs[:limit]
 309
 310                 return self._qs
 311
 312         def get_queryset(self):
 313                 """Returns a :class:`QuerySet` of all instances of :attr:`model`. This method should be overridden by subclasses to specify how the search should actually be implemented for the model."""
 314                 return self.model._default_manager.all()
 315
 316
 317 class URLSearch(BaseSearch):
 318         """Defines a generic interface for searches that require accessing a certain url to get search results."""
 319         #: The base URL which will be accessed to get the search results.
 320         search_url = ''
 321         #: The url-encoded query string to be used for fetching search results from :attr:`search_url`. Must have one ``%s`` to contain the search argument.
 322         query_format_str = "%s"
 323
 324         @property
 325         def url(self):
 326                 """The URL where the search gets its results. Composed from :attr:`search_url` and :attr:`query_format_str`."""
 327                 return self.search_url + self.query_format_str % urlquote_plus(self.search_arg)
 328
 329         @property
 330         def more_results_url(self):
 331                 return self.url
 332
 333         def parse_response(self, response, limit=None):
 334                 """Handles the ``response`` from accessing :attr:`url` (with :func:`urllib2.urlopen`) and returns a list of up to ``limit`` results."""
 335                 raise NotImplementedError
 336
 337         def search(self, limit=None):
 338                 return self.parse_response(urllib2.urlopen(self.url), limit=limit)
 339
 340
 341 class JSONSearch(URLSearch):
 342         """Makes a GET request and parses the results as JSON. The default behavior assumes that the response contains a list of results."""
 343         def parse_response(self, response, limit=None):
 344                 return json.loads(response.read())[:limit]
 345
 346
 347 class GoogleSearch(JSONSearch):
 348         """An example implementation of a :class:`JSONSearch`."""
 349         search_url = "http://ajax.googleapis.com/ajax/services/search/web"
 350         result_template = 'search/googlesearch.html'
 351         _cache_timeout = 60
 352         verbose_name = "Google search (current site)"
 353
 354         @property
 355         def query_format_str(self):
 356                 default_args = self.default_args
 357                 if default_args:
 358                         default_args += " "
 359                 return "?v=1.0&q=%s%%s" % urlquote_plus(default_args).replace('%', '%%')
 360
 361         @property
 362         def default_args(self):
 363                 """Unquoted default arguments for the :class:`GoogleSearch`."""
 364                 return "site:%s" % Site.objects.get_current().domain
 365
 366         def parse_response(self, response, limit=None):
 367                 responseData = json.loads(response.read())['responseData']
 368                 results, cursor = responseData['results'], responseData['cursor']
 369
 370                 if results:
 371                         self._more_results_url = cursor['moreResultsUrl']
 372                         self._estimated_result_count = cursor['estimatedResultCount']
 373
 374                 return results[:limit]
 375
 376         @property
 377         def url(self):
 378                 # Google requires that an ajax request have a proper Referer header.
 379                 return urllib2.Request(
 380                         super(GoogleSearch, self).url,
 381                         None,
 382                         {'Referer': "http://%s" % Site.objects.get_current().domain}
 383                 )
 384
 385         @property
 386         def has_more_results(self):
 387                 if self.results and len(self.results) < self._estimated_result_count:
 388                         return True
 389                 return False
 390
 391         @property
 392         def more_results_url(self):
 393                 return self._more_results_url
 394
 395         def get_result_title(self, result):
 396                 return result['titleNoFormatting']
 397
 398         def get_result_url(self, result):
 399                 return result['unescapedUrl']
 400
 401
 402 registry.register(GoogleSearch)
 403
 404
 405 try:
 406         from BeautifulSoup import BeautifulSoup, SoupStrainer, BeautifulStoneSoup
 407 except:
 408         pass
 409 else:
 410         __all__ += ('ScrapeSearch', 'XMLSearch',)
 411         class ScrapeSearch(URLSearch):
 412                 """A base class for scrape-style searching, available if :mod:`BeautifulSoup` is installed."""
 413                 #: Arguments to be passed into a :class:`SoupStrainer`.
 414                 strainer_args = []
 415                 #: Keyword arguments to be passed into a :class:`SoupStrainer`.
 416                 strainer_kwargs = {}
 417
 418                 @property
 419                 def strainer(self):
 420                         """
 421                         Caches and returns a :class:`SoupStrainer` initialized with :attr:`strainer_args` and :attr:`strainer_kwargs`. This strainer will be used to parse only certain parts of the document.
 422
 423                         .. seealso:: `BeautifulSoup: Improving Performance by Parsing Only Part of the Document <http://www.crummy.com/software/BeautifulSoup/documentation.html#Improving%20Performance%20by%20Parsing%20Only%20Part%20of%20the%20Document>`_
 424
 425                         """
 426                         if not hasattr(self, '_strainer'):
 427                                 self._strainer = SoupStrainer(*self.strainer_args, **self.strainer_kwargs)
 428                         return self._strainer
 429
 430                 def parse_response(self, response, limit=None):
 431                         strainer = self.strainer
 432                         soup = BeautifulSoup(response, parseOnlyThese=strainer)
 433                         return self.parse_results(soup.findAll(recursive=False, limit=limit))
 434
 435                 def parse_results(self, results):
 436                         """
 437                         Provides a hook for parsing the results of straining. This has no default behavior and must be implemented by subclasses because the results absolutely must be parsed to properly extract the information.
 438
 439                         .. seealso:: `BeautifulSoup: Improving Memory Usage with extract <http://www.crummy.com/software/BeautifulSoup/documentation.html#Improving%20Memory%20Usage%20with%20extract>`_
 440                         """
 441                         raise NotImplementedError
 442
 443
 444         class XMLSearch(ScrapeSearch):
 445                 """A base class for searching XML results."""
 446                 #: Self-closing tag names to be used when interpreting the XML document
 447                 #:
 448                 #: .. seealso:: `BeautifulSoup: Parsing XML <http://www.crummy.com/software/BeautifulSoup/documentation.html#Parsing%20XML>`_
 449                 self_closing_tags = []
 450
 451                 def parse_response(self, response, limit=None):
 452                         strainer = self.strainer
 453                         soup = BeautifulStoneSoup(response, selfClosingTags=self.self_closing_tags, parseOnlyThese=strainer)
 454                         return self.parse_results(soup.findAll(recursive=False, limit=limit))