philo/contrib/sobol/search.py

   1 #encoding: utf-8
   2 import datetime
   3
   4 from django.conf import settings
   5 from django.contrib.sites.models import Site
   6 from django.core.cache import cache
   7 from django.db.models.options import get_verbose_name as convert_camelcase
   8 from django.utils import simplejson as json
   9 from django.utils.http import urlquote_plus
  10 from django.utils.safestring import mark_safe
  11 from django.utils.text import capfirst
  12 from django.template import loader, Context, Template
  13
  14 from philo.contrib.sobol.utils import make_tracking_querydict
  15 from philo.utils.registry import Registry
  16
  17
  18 if getattr(settings, 'SOBOL_USE_EVENTLET', False):
  19         try:
  20                 from eventlet.green import urllib2
  21         except:
  22                 import urllib2
  23 else:
  24         import urllib2
  25
  26
  27 __all__ = (
  28         'Result', 'BaseSearch', 'DatabaseSearch', 'URLSearch', 'JSONSearch', 'GoogleSearch', 'registry'
  29 )
  30
  31
  32 SEARCH_CACHE_KEY = 'philo_sobol_search_results'
  33 DEFAULT_RESULT_TEMPLATE_STRING = "{% if url %}<a href='{{ url }}'>{% endif %}{{ title }}{% if url %}</a>{% endif %}"
  34 DEFAULT_RESULT_TEMPLATE = Template(DEFAULT_RESULT_TEMPLATE_STRING)
  35
  36 # Determines the timeout on the entire result cache.
  37 MAX_CACHE_TIMEOUT = 60*24*7
  38
  39
#: A registry for :class:`BaseSearch` subclasses that should be available in the admin.
#: Search classes are added with ``registry.register(SearchClass)``.
registry = Registry()
  42
  43
  44 class Result(object):
  45         """
  46         :class:`Result` is a helper class that, given a search and a result of that search, is able to correctly render itself with a template defined by the search. Every :class:`Result` will pass a ``title``, a ``url`` (if applicable), and the raw ``result`` returned by the search into the template context when rendering.
  47
  48         :param search: An instance of a :class:`BaseSearch` subclass or an object that implements the same API.
  49         :param result: An arbitrary result from the ``search``.
  50
  51         """
  52         def __init__(self, search, result):
  53                 self.search = search
  54                 self.result = result
  55
  56         def get_title(self):
  57                 """Returns the title of the result by calling :meth:`BaseSearch.get_result_title` on the raw result."""
  58                 return self.search.get_result_title(self.result)
  59
  60         def get_url(self):
  61                 """Returns the url of the result or an empty string by calling :meth:`BaseSearch.get_result_querydict` on the raw result and then encoding the querydict returned."""
  62                 qd = self.search.get_result_querydict(self.result)
  63                 if qd is None:
  64                         return ""
  65                 return "?%s" % qd.urlencode()
  66
  67         def get_template(self):
  68                 """Returns the template for the result by calling :meth:`BaseSearch.get_result_template` on the raw result."""
  69                 return self.search.get_result_template(self.result)
  70
  71         def get_extra_context(self):
  72                 """Returns any extra context for the result by calling :meth:`BaseSearch.get_result_extra_context` on the raw result."""
  73                 return self.search.get_result_extra_context(self.result)
  74
  75         def get_context(self):
  76                 """
  77                 Returns the context dictionary for the result. This is used both in rendering the result and in the AJAX return value for :meth:`.SearchView.ajax_api_view`. The context will contain everything from :meth:`get_extra_context` as well as the following keys:
  78
  79                 title
  80                         The result of calling :meth:`get_title`
  81                 url
  82                         The result of calling :meth:`get_url`
  83                 result
  84                         The raw result which the :class:`Result` was instantiated with.
  85
  86                 """
  87                 context = self.get_extra_context()
  88                 context.update({
  89                         'title': self.get_title(),
  90                         'url': self.get_url(),
  91                         'result': self.result
  92                 })
  93                 return context
  94
  95         def render(self):
  96                 """Returns the template from :meth:`get_template` rendered with the context from :meth:`get_context`."""
  97                 t = self.get_template()
  98                 c = Context(self.get_context())
  99                 return t.render(c)
 100
 101         def __unicode__(self):
 102                 """Returns :meth:`render`"""
 103                 return self.render()
 104
 105
 106 class BaseSearchMetaclass(type):
 107         def __new__(cls, name, bases, attrs):
 108                 if 'verbose_name' not in attrs:
 109                         attrs['verbose_name'] = capfirst(' '.join(convert_camelcase(name).rsplit(' ', 1)[:-1]))
 110                 if 'slug' not in attrs:
 111                         attrs['slug'] = name.lower()
 112                 return super(BaseSearchMetaclass, cls).__new__(cls, name, bases, attrs)
 113
 114
 115 class BaseSearch(object):
 116         """
 117         Defines a generic search api. Accessing :attr:`results` will attempt to retrieve cached results and, if that fails, will initiate a new search and store the results in the cache. Each search has a ``verbose_name`` and a ``slug``. If these are not provided as attributes, they will be automatically generated based on the name of the class.
 118
 119         :param search_arg: The string which is being searched for.
 120
 121         """
 122         __metaclass__ = BaseSearchMetaclass
 123         #: The number of results to return from the complete list. Default: 10
 124         result_limit = 10
 125         #: How long the items for the search should be cached (in minutes). Default: 48 hours.
 126         _cache_timeout = 60*48
 127
 128         def __init__(self, search_arg):
 129                 self.search_arg = search_arg
 130
 131         def _get_cached_results(self):
 132                 """Return the cached results if the results haven't timed out. Otherwise return None."""
 133                 result_cache = cache.get(SEARCH_CACHE_KEY)
 134                 if result_cache and self.__class__ in result_cache and self.search_arg.lower() in result_cache[self.__class__]:
 135                         cached = result_cache[self.__class__][self.search_arg.lower()]
 136                         if cached['timeout'] >= datetime.datetime.now():
 137                                 return cached['results']
 138                 return None
 139
 140         def _set_cached_results(self, results, timeout):
 141                 """Sets the results to the cache for <timeout> minutes."""
 142                 result_cache = cache.get(SEARCH_CACHE_KEY) or {}
 143                 cached = result_cache.setdefault(self.__class__, {}).setdefault(self.search_arg.lower(), {})
 144                 cached.update({
 145                         'results': results,
 146                         'timeout': datetime.datetime.now() + datetime.timedelta(minutes=timeout)
 147                 })
 148                 cache.set(SEARCH_CACHE_KEY, result_cache, MAX_CACHE_TIMEOUT)
 149
 150         @property
 151         def results(self):
 152                 """Retrieves cached results or initiates a new search via :meth:`get_results` and caches the results."""
 153                 if not hasattr(self, '_results'):
 154                         results = self._get_cached_results()
 155                         if results is None:
 156                                 try:
 157                                         # Cache one extra result so we can see if there are
 158                                         # more results to be had.
 159                                         limit = self.result_limit
 160                                         if limit is not None:
 161                                                 limit += 1
 162                                         results = self.get_results(limit)
 163                                 except:
 164                                         if settings.DEBUG:
 165                                                 raise
 166                                         #  On exceptions, don't set any cache; just return.
 167                                         return []
 168
 169                                 self._set_cached_results(results, self._cache_timeout)
 170                         self._results = results
 171
 172                 return self._results
 173
 174         def get_results(self, limit=None, result_class=Result):
 175                 """
 176                 Calls :meth:`search` and parses the return value into :class:`Result` instances.
 177
 178                 :param limit: Passed directly to :meth:`search`.
 179                 :param result_class: The class used to represent the results. This will be instantiated with the :class:`BaseSearch` instance and the raw result from the search.
 180
 181                 """
 182                 results = self.search(limit)
 183                 return [result_class(self, result) for result in results]
 184
 185         def search(self, limit=None):
 186                 """Returns an iterable of up to ``limit`` results. The :meth:`get_result_title`, :meth:`get_result_url`, :meth:`get_result_template`, and :meth:`get_result_extra_context` methods will be used to interpret the individual items that this function returns, so the result can be an object with attributes as easily as a dictionary with keys. However, keep in mind that the raw results will be stored with django's caching mechanisms and will be converted to JSON."""
 187                 raise NotImplementedError
 188
 189         def get_result_title(self, result):
 190                 """Returns the title of the ``result``. Must be implemented by subclasses."""
 191                 raise NotImplementedError
 192
 193         def get_result_url(self, result):
 194                 """Returns the actual URL for the ``result`` or ``None`` if there is no URL. Must be implemented by subclasses."""
 195                 raise NotImplementedError
 196
 197         def get_result_querydict(self, result):
 198                 """Returns a querydict for tracking selection of the result, or ``None`` if there is no URL for the result."""
 199                 url = self.get_result_url(result)
 200                 if url is None:
 201                         return None
 202                 return make_tracking_querydict(self.search_arg, url)
 203
 204         def get_result_template(self, result):
 205                 """Returns the template to be used for rendering the ``result``."""
 206                 if hasattr(self, 'result_template'):
 207                         return loader.get_template(self.result_template)
 208                 if not hasattr(self, '_result_template'):
 209                         self._result_template = DEFAULT_RESULT_TEMPLATE
 210                 return self._result_template
 211
 212         def get_result_extra_context(self, result):
 213                 """Returns any extra context to be used when rendering the ``result``."""
 214                 return {}
 215
 216         def has_more_results(self):
 217                 """Returns ``True`` if there are more results than :attr:`result_limit` and ``False`` otherwise."""
 218                 return len(self.results) > self.result_limit
 219
 220         @property
 221         def more_results_url(self):
 222                 """Returns the actual url for more results. This should be accessed through :attr:`more_results_querydict` in the template so that the click can be tracked."""
 223                 raise NotImplementedError
 224
 225         @property
 226         def more_results_querydict(self):
 227                 """Returns a :class:`QueryDict` for tracking whether people click on a 'more results' link."""
 228                 return make_tracking_querydict(self.search_arg, self.more_results_url)
 229
 230         def __unicode__(self):
 231                 return self.verbose_name
 232
 233
 234 class DatabaseSearch(BaseSearch):
 235         """Implements :meth:`~BaseSearch.search` and :meth:`get_queryset` methods to handle database queries."""
 236         #: The model which should be searched by the :class:`DatabaseSearch`.
 237         model = None
 238
 239         def search(self, limit=None):
 240                 if not hasattr(self, '_qs'):
 241                         self._qs = self.get_queryset()
 242                         if limit is not None:
 243                                 self._qs = self._qs[:limit]
 244
 245                 return self._qs
 246
 247         def get_queryset(self):
 248                 """Returns a :class:`QuerySet` of all instances of :attr:`model`. This method should be overridden by subclasses to specify how the search should actually be implemented for the model."""
 249                 return self.model._default_manager.all()
 250
 251
 252 class URLSearch(BaseSearch):
 253         """Defines a generic interface for searches that require accessing a certain url to get search results."""
 254         #: The base URL which will be accessed to get the search results.
 255         search_url = ''
 256         #: The url-encoded query string to be used for fetching search results from :attr:`search_url`. Must have one ``%s`` to contain the search argument.
 257         query_format_str = "%s"
 258
 259         @property
 260         def url(self):
 261                 """The URL where the search gets its results. Composed from :attr:`search_url` and :attr:`query_format_str`."""
 262                 return self.search_url + self.query_format_str % urlquote_plus(self.search_arg)
 263
 264         @property
 265         def more_results_url(self):
 266                 return self.url
 267
 268         def parse_response(self, response, limit=None):
 269                 """Handles the ``response`` from accessing :attr:`url` (with :func:`urllib2.urlopen`) and returns a list of up to ``limit`` results."""
 270                 raise NotImplementedError
 271
 272         def search(self, limit=None):
 273                 return self.parse_response(urllib2.urlopen(self.url), limit=limit)
 274
 275
 276 class JSONSearch(URLSearch):
 277         """Makes a GET request and parses the results as JSON. The default behavior assumes that the response contains a list of results."""
 278         def parse_response(self, response, limit=None):
 279                 return json.loads(response.read())[:limit]
 280
 281
 282 class GoogleSearch(JSONSearch):
 283         """An example implementation of a :class:`JSONSearch`."""
 284         search_url = "http://ajax.googleapis.com/ajax/services/search/web"
 285         result_template = 'search/googlesearch.html'
 286         _cache_timeout = 60
 287         verbose_name = "Google search (current site)"
 288
 289         @property
 290         def query_format_str(self):
 291                 default_args = self.default_args
 292                 if default_args:
 293                         default_args += " "
 294                 return "?v=1.0&q=%s%%s" % urlquote_plus(default_args).replace('%', '%%')
 295
 296         @property
 297         def default_args(self):
 298                 """Unquoted default arguments for the :class:`GoogleSearch`."""
 299                 return "site:%s" % Site.objects.get_current().domain
 300
 301         def parse_response(self, response, limit=None):
 302                 responseData = json.loads(response.read())['responseData']
 303                 results, cursor = responseData['results'], responseData['cursor']
 304
 305                 if results:
 306                         self._more_results_url = cursor['moreResultsUrl']
 307                         self._estimated_result_count = cursor['estimatedResultCount']
 308
 309                 return results[:limit]
 310
 311         @property
 312         def url(self):
 313                 # Google requires that an ajax request have a proper Referer header.
 314                 return urllib2.Request(
 315                         super(GoogleSearch, self).url,
 316                         None,
 317                         {'Referer': "http://%s" % Site.objects.get_current().domain}
 318                 )
 319
 320         @property
 321         def has_more_results(self):
 322                 if self.results and len(self.results) < self._estimated_result_count:
 323                         return True
 324                 return False
 325
 326         @property
 327         def more_results_url(self):
 328                 return self._more_results_url
 329
 330         def get_result_title(self, result):
 331                 return result['titleNoFormatting']
 332
 333         def get_result_url(self, result):
 334                 return result['unescapedUrl']
 335
 336
 337 registry.register(GoogleSearch)
 338
 339
 340 try:
 341         from BeautifulSoup import BeautifulSoup, SoupStrainer, BeautifulStoneSoup
 342 except:
 343         pass
 344 else:
 345         __all__ += ('ScrapeSearch', 'XMLSearch',)
 346         class ScrapeSearch(URLSearch):
 347                 """A base class for scrape-style searching, available if :mod:`BeautifulSoup` is installed."""
 348                 #: Arguments to be passed into a :class:`SoupStrainer`.
 349                 strainer_args = []
 350                 #: Keyword arguments to be passed into a :class:`SoupStrainer`.
 351                 strainer_kwargs = {}
 352
 353                 @property
 354                 def strainer(self):
 355                         """
 356                         Caches and returns a :class:`SoupStrainer` initialized with :attr:`strainer_args` and :attr:`strainer_kwargs`. This strainer will be used to parse only certain parts of the document.
 357
 358                         .. seealso:: `BeautifulSoup: Improving Performance by Parsing Only Part of the Document <http://www.crummy.com/software/BeautifulSoup/documentation.html#Improving%20Performance%20by%20Parsing%20Only%20Part%20of%20the%20Document>`_
 359
 360                         """
 361                         if not hasattr(self, '_strainer'):
 362                                 self._strainer = SoupStrainer(*self.strainer_args, **self.strainer_kwargs)
 363                         return self._strainer
 364
 365                 def parse_response(self, response, limit=None):
 366                         strainer = self.strainer
 367                         soup = BeautifulSoup(response, parseOnlyThese=strainer)
 368                         return self.parse_results(soup.findAll(recursive=False, limit=limit))
 369
 370                 def parse_results(self, results):
 371                         """
 372                         Provides a hook for parsing the results of straining. This has no default behavior and must be implemented by subclasses because the results absolutely must be parsed to properly extract the information.
 373
 374                         .. seealso:: `BeautifulSoup: Improving Memory Usage with extract <http://www.crummy.com/software/BeautifulSoup/documentation.html#Improving%20Memory%20Usage%20with%20extract>`_
 375                         """
 376                         raise NotImplementedError
 377
 378
 379         class XMLSearch(ScrapeSearch):
 380                 """A base class for searching XML results."""
 381                 #: Self-closing tag names to be used when interpreting the XML document
 382                 #:
 383                 #: .. seealso:: `BeautifulSoup: Parsing XML <http://www.crummy.com/software/BeautifulSoup/documentation.html#Parsing%20XML>`_
 384                 self_closing_tags = []
 385
 386                 def parse_response(self, response, limit=None):
 387                         strainer = self.strainer
 388                         soup = BeautifulStoneSoup(response, selfClosingTags=self.self_closing_tags, parseOnlyThese=strainer)
 389                         return self.parse_results(soup.findAll(recursive=False, limit=limit))