contrib/sobol/search.py

   1 #encoding: utf-8
   2
   3 from django.conf import settings
   4 from django.contrib.sites.models import Site
   5 from django.core.cache import cache
   6 from django.db.models.options import get_verbose_name as convert_camelcase
   7 from django.utils import simplejson as json
   8 from django.utils.http import urlquote_plus
   9 from django.utils.safestring import mark_safe
  10 from django.utils.text import capfirst
  11 from django.template import loader, Context, Template
  12 import datetime
  13 from philo.contrib.sobol.utils import make_tracking_querydict
  14
  15 try:
  16         from eventlet.green import urllib2
  17 except:
  18         import urllib2
  19
  20
  21 __all__ = (
  22         'Result', 'BaseSearch', 'DatabaseSearch', 'URLSearch', 'JSONSearch', 'GoogleSearch', 'registry'
  23 )
  24
  25
  26 SEARCH_CACHE_KEY = 'philo_sobol_search_results'
  27 DEFAULT_RESULT_TEMPLATE_STRING = "{% if url %}<a href='{{ url }}'>{% endif %}{{ title }}{% if url %}</a>{% endif %}"
  28 DEFAULT_RESULT_TEMPLATE = Template(DEFAULT_RESULT_TEMPLATE_STRING)
  29
  30 # Determines the timeout on the entire result cache.
  31 MAX_CACHE_TIMEOUT = 60*24*7
  32
  33
  34 class RegistrationError(Exception):
  35         pass
  36
  37
  38 class SearchRegistry(object):
  39         # Holds a registry of search types by slug.
  40         def __init__(self):
  41                 self._registry = {}
  42
  43         def register(self, search, slug=None):
  44                 slug = slug or search.slug
  45                 if slug in self._registry:
  46                         registered = self._registry[slug]
  47                         if registered.__module__ != search.__module__:
  48                                 raise RegistrationError("A different search is already registered as `%s`" % slug)
  49                 else:
  50                         self._registry[slug] = search
  51
  52         def unregister(self, search, slug=None):
  53                 if slug is not None:
  54                         if slug in self._registry and self._registry[slug] == search:
  55                                 del self._registry[slug]
  56                         raise RegistrationError("`%s` is not registered as `%s`" % (search, slug))
  57                 else:
  58                         for slug, search in self._registry.items():
  59                                 if search == search:
  60                                         del self._registry[slug]
  61
  62         def items(self):
  63                 return self._registry.items()
  64
  65         def iteritems(self):
  66                 return self._registry.iteritems()
  67
  68         def iterchoices(self):
  69                 for slug, search in self.iteritems():
  70                         yield slug, search.verbose_name
  71
  72         def __getitem__(self, key):
  73                 return self._registry[key]
  74
  75         def __iter__(self):
  76                 return self._registry.__iter__()
  77
  78
  79 registry = SearchRegistry()
  80
  81
  82 class Result(object):
  83         """
  84         A result is instantiated with a configuration dictionary, a search,
  85         and a template name. The configuration dictionary is expected to
  86         define a `title` and optionally a `url`. Any other variables may be
  87         defined; they will be made available through the result object in
  88         the template, if one is defined.
  89         """
  90         def __init__(self, search, result):
  91                 self.search = search
  92                 self.result = result
  93
  94         def get_title(self):
  95                 return self.search.get_result_title(self.result)
  96
  97         def get_url(self):
  98                 qd = self.search.get_result_querydict(self.result)
  99                 if qd is None:
 100                         return ""
 101                 return "?%s" % qd.urlencode()
 102
 103         def get_template(self):
 104                 return self.search.get_result_template(self.result)
 105
 106         def get_extra_context(self):
 107                 return self.search.get_result_extra_context(self.result)
 108
 109         def get_context(self):
 110                 context = self.get_extra_context()
 111                 context.update({
 112                         'title': self.get_title(),
 113                         'url': self.get_url()
 114                 })
 115                 return context
 116
 117         def render(self):
 118                 t = self.get_template()
 119                 c = Context(self.get_context())
 120                 return t.render(c)
 121
 122         def __unicode__(self):
 123                 return self.render()
 124
 125
 126 class BaseSearchMetaclass(type):
 127         def __new__(cls, name, bases, attrs):
 128                 if 'verbose_name' not in attrs:
 129                         attrs['verbose_name'] = capfirst(convert_camelcase(name))
 130                 if 'slug' not in attrs:
 131                         attrs['slug'] = name.lower()
 132                 return super(BaseSearchMetaclass, cls).__new__(cls, name, bases, attrs)
 133
 134
 135 class BaseSearch(object):
 136         """
 137         Defines a generic search interface. Accessing self.results will
 138         attempt to retrieve cached results and, if that fails, will
 139         initiate a new search and store the results in the cache.
 140         """
 141         __metaclass__ = BaseSearchMetaclass
 142         result_limit = 10
 143         _cache_timeout = 60*48
 144
 145         def __init__(self, search_arg):
 146                 self.search_arg = search_arg
 147
 148         def _get_cached_results(self):
 149                 """Return the cached results if the results haven't timed out. Otherwise return None."""
 150                 result_cache = cache.get(SEARCH_CACHE_KEY)
 151                 if result_cache and self.__class__ in result_cache and self.search_arg.lower() in result_cache[self.__class__]:
 152                         cached = result_cache[self.__class__][self.search_arg.lower()]
 153                         if cached['timeout'] >= datetime.datetime.now():
 154                                 return cached['results']
 155                 return None
 156
 157         def _set_cached_results(self, results, timeout):
 158                 """Sets the results to the cache for <timeout> minutes."""
 159                 result_cache = cache.get(SEARCH_CACHE_KEY) or {}
 160                 cached = result_cache.setdefault(self.__class__, {}).setdefault(self.search_arg.lower(), {})
 161                 cached.update({
 162                         'results': results,
 163                         'timeout': datetime.datetime.now() + datetime.timedelta(minutes=timeout)
 164                 })
 165                 cache.set(SEARCH_CACHE_KEY, result_cache, MAX_CACHE_TIMEOUT)
 166
 167         @property
 168         def results(self):
 169                 if not hasattr(self, '_results'):
 170                         results = self._get_cached_results()
 171                         if results is None:
 172                                 try:
 173                                         # Cache one extra result so we can see if there are
 174                                         # more results to be had.
 175                                         limit = self.result_limit
 176                                         if limit is not None:
 177                                                 limit += 1
 178                                         results = self.get_results(self.result_limit)
 179                                 except:
 180                                         if settings.DEBUG:
 181                                                 raise
 182                                         #  On exceptions, don't set any cache; just return.
 183                                         return []
 184
 185                                 self._set_cached_results(results, self._cache_timeout)
 186                         self._results = results
 187
 188                 return self._results
 189
 190         def get_results(self, limit=None, result_class=Result):
 191                 """
 192                 Calls self.search() and parses the return value into Result objects.
 193                 """
 194                 results = self.search(limit)
 195                 return [result_class(self, result) for result in results]
 196
 197         def search(self, limit=None):
 198                 """
 199                 Returns an iterable of up to <limit> results. The
 200                 get_result_title, get_result_url, get_result_template, and
 201                 get_result_extra_context methods will be used to interpret the
 202                 individual items that this function returns, so the result can
 203                 be an object with attributes as easily as a dictionary
 204                 with keys. The only restriction is that the objects be
 205                 pickleable so that they can be used with django's cache system.
 206                 """
 207                 raise NotImplementedError
 208
 209         def get_result_title(self, result):
 210                 raise NotImplementedError
 211
 212         def get_result_url(self, result):
 213                 "Subclasses override this to provide the actual URL for the result."
 214                 raise NotImplementedError
 215
 216         def get_result_querydict(self, result):
 217                 url = self.get_result_url(result)
 218                 if url is None:
 219                         return None
 220                 return make_tracking_querydict(self.search_arg, url)
 221
 222         def get_result_template(self, result):
 223                 if hasattr(self, 'result_template'):
 224                         return loader.get_template(self.result_template)
 225                 if not hasattr(self, '_result_template'):
 226                         self._result_template = DEFAULT_RESULT_TEMPLATE
 227                 return self._result_template
 228
 229         def get_result_extra_context(self, result):
 230                 return {}
 231
 232         def has_more_results(self):
 233                 """Useful to determine whether to display a `view more results` link."""
 234                 return len(self.results) > self.result_limit
 235
 236         @property
 237         def more_results_url(self):
 238                 """
 239                 Returns the actual url for more results. This will be encoded
 240                 into a querystring for tracking purposes.
 241                 """
 242                 raise NotImplementedError
 243
 244         @property
 245         def more_results_querydict(self):
 246                 return make_tracking_querydict(self.search_arg, self.more_results_url)
 247
 248         def __unicode__(self):
 249                 return ' '.join(self.__class__.verbose_name.rsplit(' ', 1)[:-1]) + ' results'
 250
 251
 252 class DatabaseSearch(BaseSearch):
 253         model = None
 254
 255         def has_more_results(self):
 256                 return self.get_queryset().count() > self.result_limit
 257
 258         def search(self, limit=None):
 259                 if not hasattr(self, '_qs'):
 260                         self._qs = self.get_queryset()
 261                         if limit is not None:
 262                                 self._qs = self._qs[:limit]
 263
 264                 return self._qs
 265
 266         def get_queryset(self):
 267                 return self.model._default_manager.all()
 268
 269
 270 class URLSearch(BaseSearch):
 271         """
 272         Defines a generic interface for searches that require accessing a
 273         certain url to get search results.
 274         """
 275         search_url = ''
 276         query_format_str = "%s"
 277
 278         @property
 279         def url(self):
 280                 "The URL where the search gets its results."
 281                 return self.search_url + self.query_format_str % urlquote_plus(self.search_arg)
 282
 283         @property
 284         def more_results_url(self):
 285                 "The URL where the users would go to get more results."
 286                 return self.url
 287
 288         def parse_response(self, response, limit=None):
 289                 raise NotImplementedError
 290
 291         def search(self, limit=None):
 292                 return self.parse_response(urllib2.urlopen(self.url), limit=limit)
 293
 294
 295 class JSONSearch(URLSearch):
 296         """
 297         Makes a GET request and parses the results as JSON. The default
 298         behavior assumes that the return value is a list of results.
 299         """
 300         def parse_response(self, response, limit=None):
 301                 return json.loads(response.read())[:limit]
 302
 303
 304 class GoogleSearch(JSONSearch):
 305         search_url = "http://ajax.googleapis.com/ajax/services/search/web"
 306         query_format_str = "?v=1.0&q=%s"
 307         # TODO: Change this template to reflect the app's actual name.
 308         result_template = 'search/googlesearch.html'
 309         _cache_timeout = 60
 310
 311         def parse_response(self, response, limit=None):
 312                 responseData = json.loads(response.read())['responseData']
 313                 results, cursor = responseData['results'], responseData['cursor']
 314
 315                 if results:
 316                         self._more_results_url = cursor['moreResultsUrl']
 317                         self._estimated_result_count = cursor['estimatedResultCount']
 318
 319                 return results[:limit]
 320
 321         @property
 322         def url(self):
 323                 # Google requires that an ajax request have a proper Referer header.
 324                 return urllib2.Request(
 325                         super(GoogleSearch, self).url,
 326                         None,
 327                         {'Referer': "http://%s" % Site.objects.get_current().domain}
 328                 )
 329
 330         @property
 331         def has_more_results(self):
 332                 if self.results and len(self.results) < self._estimated_result_count:
 333                         return True
 334                 return False
 335
 336         @property
 337         def more_results_url(self):
 338                 return self._more_results_url
 339
 340         def get_result_title(self, result):
 341                 return result['titleNoFormatting']
 342
 343         def get_result_url(self, result):
 344                 return result['unescapedUrl']
 345
 346         def get_result_extra_context(self, result):
 347                 return result
 348
 349
 350 registry.register(GoogleSearch)
 351
 352
 353 try:
 354         from BeautifulSoup import BeautifulSoup, SoupStrainer, BeautifulStoneSoup
 355 except:
 356         pass
 357 else:
 358         __all__ += ('ScrapeSearch', 'XMLSearch',)
 359         class ScrapeSearch(URLSearch):
 360                 _strainer_args = []
 361                 _strainer_kwargs = {}
 362
 363                 @property
 364                 def strainer(self):
 365                         if not hasattr(self, '_strainer'):
 366                                 self._strainer = SoupStrainer(*self._strainer_args, **self._strainer_kwargs)
 367                         return self._strainer
 368
 369                 def parse_response(self, response, limit=None):
 370                         strainer = self.strainer
 371                         soup = BeautifulSoup(response, parseOnlyThese=strainer)
 372                         return self.parse_results(soup.findAll(recursive=False, limit=limit))
 373
 374                 def parse_results(self, results):
 375                         """
 376                         Provides a hook for parsing the results of straining. This
 377                         has no default behavior because the results absolutely
 378                         must be parsed to properly extract the information.
 379                         For more information, see http://www.crummy.com/software/BeautifulSoup/documentation.html#Improving%20Memory%20Usage%20with%20extract
 380                         """
 381                         raise NotImplementedError
 382
 383
 384         class XMLSearch(ScrapeSearch):
 385                 _self_closing_tags = []
 386
 387                 def parse_response(self, response, limit=None):
 388                         strainer = self.strainer
 389                         soup = BeautifulStoneSoup(response, selfClosingTags=self._self_closing_tags, parseOnlyThese=strainer)
 390                         return self.parse_results(soup.findAll(recursive=False, limit=limit))