contrib/sobol/search.py

   1 #encoding: utf-8
   2
   3 from django.conf import settings
   4 from django.contrib.sites.models import Site
   5 from django.core.cache import cache
   6 from django.db.models.options import get_verbose_name as convert_camelcase
   7 from django.utils import simplejson as json
   8 from django.utils.http import urlquote_plus
   9 from django.utils.safestring import mark_safe
  10 from django.utils.text import capfirst
  11 from django.template import loader, Context, Template
  12 import datetime
  13 from philo.contrib.sobol.utils import make_tracking_querydict
  14
  15 try:
  16         from eventlet.green import urllib2
  17 except:
  18         import urllib2
  19
  20
  21 __all__ = (
  22         'Result', 'BaseSearch', 'DatabaseSearch', 'URLSearch', 'JSONSearch', 'GoogleSearch', 'registry'
  23 )
  24
  25
  26 SEARCH_CACHE_KEY = 'philo_sobol_search_results'
  27 DEFAULT_RESULT_TEMPLATE_STRING = "{% if url %}<a href='{{ url }}'>{% endif %}{{ title }}{% if url %}</a>{% endif %}"
  28
  29 # Determines the timeout on the entire result cache.
  30 MAX_CACHE_TIMEOUT = 60*60*24*7
  31
  32
  33 class RegistrationError(Exception):
  34         pass
  35
  36
  37 class SearchRegistry(object):
  38         # Holds a registry of search types by slug.
  39         def __init__(self):
  40                 self._registry = {}
  41
  42         def register(self, search, slug=None):
  43                 slug = slug or search.slug
  44                 if slug in self._registry:
  45                         if self._registry[slug] != search:
  46                                 raise RegistrationError("A different search is already registered as `%s`")
  47                 else:
  48                         self._registry[slug] = search
  49
  50         def unregister(self, search, slug=None):
  51                 if slug is not None:
  52                         if slug in self._registry and self._registry[slug] == search:
  53                                 del self._registry[slug]
  54                         raise RegistrationError("`%s` is not registered as `%s`" % (search, slug))
  55                 else:
  56                         for slug, search in self._registry.items():
  57                                 if search == search:
  58                                         del self._registry[slug]
  59
  60         def items(self):
  61                 return self._registry.items()
  62
  63         def iteritems(self):
  64                 return self._registry.iteritems()
  65
  66         def iterchoices(self):
  67                 for slug, search in self.iteritems():
  68                         yield slug, search.verbose_name
  69
  70         def __getitem__(self, key):
  71                 return self._registry[key]
  72
  73         def __iter__(self):
  74                 return self._registry.__iter__()
  75
  76
  77 registry = SearchRegistry()
  78
  79
  80 class Result(object):
  81         """
  82         A result is instantiated with a configuration dictionary, a search,
  83         and a template name. The configuration dictionary is expected to
  84         define a `title` and optionally a `url`. Any other variables may be
  85         defined; they will be made available through the result object in
  86         the template, if one is defined.
  87         """
  88         def __init__(self, search, result):
  89                 self.search = search
  90                 self.result = result
  91
  92         def get_title(self):
  93                 return self.search.get_result_title(self.result)
  94
  95         def get_url(self):
  96                 return "?%s" % self.search.get_result_querydict(self.result).urlencode()
  97
  98         def get_template(self):
  99                 return self.search.get_result_template(self.result)
 100
 101         def get_extra_context(self):
 102                 return self.search.get_result_extra_context(self.result)
 103
 104         def get_context(self):
 105                 context = self.get_extra_context()
 106                 context.update({
 107                         'title': self.get_title(),
 108                         'url': self.get_url()
 109                 })
 110                 return context
 111
 112         def render(self):
 113                 t = self.get_template()
 114                 c = Context(self.get_context())
 115                 return t.render(c)
 116
 117         def __unicode__(self):
 118                 return self.render()
 119
 120
 121 class BaseSearchMetaclass(type):
 122         def __new__(cls, name, bases, attrs):
 123                 if 'verbose_name' not in attrs:
 124                         attrs['verbose_name'] = capfirst(convert_camelcase(name))
 125                 if 'slug' not in attrs:
 126                         attrs['slug'] = name.lower()
 127                 return super(BaseSearchMetaclass, cls).__new__(cls, name, bases, attrs)
 128
 129
 130 class BaseSearch(object):
 131         """
 132         Defines a generic search interface. Accessing self.results will
 133         attempt to retrieve cached results and, if that fails, will
 134         initiate a new search and store the results in the cache.
 135         """
 136         __metaclass__ = BaseSearchMetaclass
 137         result_limit = 10
 138         _cache_timeout = 60*48
 139
 140         def __init__(self, search_arg):
 141                 self.search_arg = search_arg
 142
 143         def _get_cached_results(self):
 144                 """Return the cached results if the results haven't timed out. Otherwise return None."""
 145                 result_cache = cache.get(SEARCH_CACHE_KEY)
 146                 if result_cache and self.__class__ in result_cache and self.search_arg.lower() in result_cache[self.__class__]:
 147                         cached = result_cache[self.__class__][self.search_arg.lower()]
 148                         if cached['timeout'] >= datetime.datetime.now():
 149                                 return cached['results']
 150                 return None
 151
 152         def _set_cached_results(self, results, timeout):
 153                 """Sets the results to the cache for <timeout> minutes."""
 154                 result_cache = cache.get(SEARCH_CACHE_KEY) or {}
 155                 cached = result_cache.setdefault(self.__class__, {}).setdefault(self.search_arg.lower(), {})
 156                 cached.update({
 157                         'results': results,
 158                         'timeout': datetime.datetime.now() + datetime.timedelta(minutes=timeout)
 159                 })
 160                 cache.set(SEARCH_CACHE_KEY, result_cache, MAX_CACHE_TIMEOUT)
 161
 162         @property
 163         def results(self):
 164                 if not hasattr(self, '_results'):
 165                         results = self._get_cached_results()
 166                         if results is None:
 167                                 try:
 168                                         # Cache one extra result so we can see if there are
 169                                         # more results to be had.
 170                                         limit = self.result_limit
 171                                         if limit is not None:
 172                                                 limit += 1
 173                                         results = self.get_results(self.result_limit)
 174                                 except:
 175                                         if settings.DEBUG:
 176                                                 raise
 177                                         #  On exceptions, don't set any cache; just return.
 178                                         return []
 179
 180                                 self._set_cached_results(results, self._cache_timeout)
 181                         self._results = results
 182
 183                 return self._results
 184
 185         def get_results(self, limit=None, result_class=Result):
 186                 """
 187                 Calls self.search() and parses the return value into Result objects.
 188                 """
 189                 results = self.search(limit)
 190                 return [result_class(self, result) for result in results]
 191
 192         def search(self, limit=None):
 193                 """
 194                 Returns an iterable of up to <limit> results. The
 195                 get_result_title, get_result_url, get_result_template, and
 196                 get_result_extra_context methods will be used to interpret the
 197                 individual items that this function returns, so the result can
 198                 be an object with attributes as easily as a dictionary
 199                 with keys. The only restriction is that the objects be
 200                 pickleable so that they can be used with django's cache system.
 201                 """
 202                 raise NotImplementedError
 203
 204         def get_result_title(self, result):
 205                 raise NotImplementedError
 206
 207         def get_result_url(self, result):
 208                 "Subclasses override this to provide the actual URL for the result."
 209                 raise NotImplementedError
 210
 211         def get_result_querydict(self, result):
 212                 return make_tracking_querydict(self.search_arg, self.get_result_url(result))
 213
 214         def get_result_template(self, result):
 215                 if hasattr(self, 'result_template'):
 216                         return loader.get_template(self.result_template)
 217                 if not hasattr(self, '_result_template'):
 218                         self._result_template = Template(DEFAULT_RESULT_TEMPLATE_STRING)
 219                 return self._result_template
 220
 221         def get_result_extra_context(self, result):
 222                 return {}
 223
 224         def has_more_results(self):
 225                 """Useful to determine whether to display a `view more results` link."""
 226                 return len(self.results) > self.result_limit
 227
 228         @property
 229         def more_results_url(self):
 230                 """
 231                 Returns the actual url for more results. This will be encoded
 232                 into a querystring for tracking purposes.
 233                 """
 234                 raise NotImplementedError
 235
 236         @property
 237         def more_results_querydict(self):
 238                 return make_tracking_querydict(self.search_arg, self.more_results_url)
 239
 240         def __unicode__(self):
 241                 return ' '.join(self.__class__.verbose_name.rsplit(' ', 1)[:-1]) + ' results'
 242
 243
 244 class DatabaseSearch(BaseSearch):
 245         model = None
 246
 247         def has_more_results(self):
 248                 return self.get_queryset().count() > self.result_limit
 249
 250         def search(self, limit=None):
 251                 if not hasattr(self, '_qs'):
 252                         self._qs = self.get_queryset()
 253                         if limit is not None:
 254                                 self._qs = self._qs[:limit]
 255
 256                 return self._qs
 257
 258         def get_queryset(self):
 259                 return self.model._default_manager.all()
 260
 261
 262 class URLSearch(BaseSearch):
 263         """
 264         Defines a generic interface for searches that require accessing a
 265         certain url to get search results.
 266         """
 267         search_url = ''
 268         query_format_str = "%s"
 269
 270         @property
 271         def url(self):
 272                 "The URL where the search gets its results."
 273                 return self.search_url + self.query_format_str % urlquote_plus(self.search_arg)
 274
 275         @property
 276         def more_results_url(self):
 277                 "The URL where the users would go to get more results."
 278                 return self.url
 279
 280         def parse_response(self, response, limit=None):
 281                 raise NotImplementedError
 282
 283         def search(self, limit=None):
 284                 return self.parse_response(urllib2.urlopen(self.url), limit=limit)
 285
 286
 287 class JSONSearch(URLSearch):
 288         """
 289         Makes a GET request and parses the results as JSON. The default
 290         behavior assumes that the return value is a list of results.
 291         """
 292         def parse_response(self, response, limit=None):
 293                 return json.loads(response.read())[:limit]
 294
 295
 296 class GoogleSearch(JSONSearch):
 297         search_url = "http://ajax.googleapis.com/ajax/services/search/web"
 298         query_format_str = "?v=1.0&q=%s"
 299         # TODO: Change this template to reflect the app's actual name.
 300         result_template = 'search/googlesearch.html'
 301         timeout = 60
 302
 303         def parse_response(self, response, limit=None):
 304                 responseData = json.loads(response.read())['responseData']
 305                 results, cursor = responseData['results'], responseData['cursor']
 306
 307                 if results:
 308                         self._more_results_url = cursor['moreResultsUrl']
 309                         self._estimated_result_count = cursor['estimatedResultCount']
 310
 311                 return results[:limit]
 312
 313         @property
 314         def url(self):
 315                 # Google requires that an ajax request have a proper Referer header.
 316                 return urllib2.Request(
 317                         super(GoogleSearch, self).url,
 318                         None,
 319                         {'Referer': "http://%s" % Site.objects.get_current().domain}
 320                 )
 321
 322         @property
 323         def has_more_results(self):
 324                 if self.results and len(self.results) < self._estimated_result_count:
 325                         return True
 326                 return False
 327
 328         @property
 329         def more_results_url(self):
 330                 return self._more_results_url
 331
 332         def get_result_title(self, result):
 333                 return result['titleNoFormatting']
 334
 335         def get_result_url(self, result):
 336                 return result['unescapedUrl']
 337
 338         def get_result_extra_context(self, result):
 339                 return result
 340
 341
 342 registry.register(GoogleSearch)
 343
 344
 345 try:
 346         from BeautifulSoup import BeautifulSoup, SoupStrainer, BeautifulStoneSoup
 347 except:
 348         pass
 349 else:
 350         __all__ += ('ScrapeSearch', 'XMLSearch',)
 351         class ScrapeSearch(URLSearch):
 352                 _strainer_args = []
 353                 _strainer_kwargs = {}
 354
 355                 @property
 356                 def strainer(self):
 357                         if not hasattr(self, '_strainer'):
 358                                 self._strainer = SoupStrainer(*self._strainer_args, **self._strainer_kwargs)
 359                         return self._strainer
 360
 361                 def parse_response(self, response, limit=None):
 362                         strainer = self.strainer
 363                         soup = BeautifulSoup(response, parseOnlyThese=strainer)
 364                         return self.parse_results(soup[:limit])
 365
 366                 def parse_results(self, results):
 367                         """
 368                         Provides a hook for parsing the results of straining. This
 369                         has no default behavior because the results absolutely
 370                         must be parsed to properly extract the information.
 371                         For more information, see http://www.crummy.com/software/BeautifulSoup/documentation.html#Improving%20Memory%20Usage%20with%20extract
 372                         """
 373                         raise NotImplementedError
 374
 375
 376         class XMLSearch(ScrapeSearch):
 377                 _self_closing_tags = []
 378
 379                 def parse_response(self, response, limit=None):
 380                         strainer = self.strainer
 381                         soup = BeautifulStoneSoup(page, selfClosingTags=self._self_closing_tags, parseOnlyThese=strainer)
 382                         return self.parse_results(soup[:limit])