contrib/sobol/search.py

   1 #encoding: utf-8
   2
   3 from django.conf import settings
   4 from django.contrib.sites.models import Site
   5 from django.core.cache import cache
   6 from django.db.models.options import get_verbose_name as convert_camelcase
   7 from django.utils import simplejson as json
   8 from django.utils.http import urlquote_plus
   9 from django.utils.safestring import mark_safe
  10 from django.utils.text import capfirst
  11 from django.template import loader, Context, Template
  12 import datetime
  13 from philo.contrib.sobol.utils import make_tracking_querydict
  14
  15 try:
  16         from eventlet.green import urllib2
  17 except:
  18         import urllib2
  19
  20
# Names exported by ``from <this module> import *``.
__all__ = (
        'Result', 'BaseSearch', 'DatabaseSearch', 'URLSearch', 'JSONSearch', 'GoogleSearch', 'registry'
)


# Single cache key under which BaseSearch stores the results of every search
# (a dict keyed by search class, then by lower-cased search string).
SEARCH_CACHE_KEY = 'philo_sobol_search_results'
# Template source used to render a result when the search defines no
# `result_template` of its own.
DEFAULT_RESULT_TEMPLATE_STRING = "{% if url %}<a href='{{ url }}'>{% endif %}{{ title }}{% if url %}</a>{% endif %}"

# Determines the timeout on the entire result cache. This is the value handed
# to cache.set(); individual searches carry their own, shorter expiry inside
# the cached dict.
MAX_CACHE_TIMEOUT = 60*60*24*7
  31
  32
  33 class RegistrationError(Exception):
  34         pass
  35
  36
  37 class SearchRegistry(object):
  38         # Holds a registry of search types by slug.
  39         def __init__(self):
  40                 self._registry = {}
  41
  42         def register(self, search, slug=None):
  43                 slug = slug or search.slug
  44                 if slug in self._registry:
  45                         if self._registry[slug] != search:
  46                                 raise RegistrationError("A different search is already registered as `%s`")
  47                 else:
  48                         self._registry[slug] = search
  49
  50         def unregister(self, search, slug=None):
  51                 if slug is not None:
  52                         if slug in self._registry and self._registry[slug] == search:
  53                                 del self._registry[slug]
  54                         raise RegistrationError("`%s` is not registered as `%s`" % (search, slug))
  55                 else:
  56                         for slug, search in self._registry.items():
  57                                 if search == search:
  58                                         del self._registry[slug]
  59
  60         def items(self):
  61                 return self._registry.items()
  62
  63         def iteritems(self):
  64                 return self._registry.iteritems()
  65
  66         def iterchoices(self):
  67                 for slug, search in self.iteritems():
  68                         yield slug, search.verbose_name
  69
  70         def __getitem__(self, key):
  71                 return self._registry[key]
  72
  73
# Module-level registry instance; search classes are added to it with
# registry.register(...).
registry = SearchRegistry()
  75
  76
  77 class Result(object):
  78         """
  79         A result is instantiated with a configuration dictionary, a search,
  80         and a template name. The configuration dictionary is expected to
  81         define a `title` and optionally a `url`. Any other variables may be
  82         defined; they will be made available through the result object in
  83         the template, if one is defined.
  84         """
  85         def __init__(self, search, result):
  86                 self.search = search
  87                 self.result = result
  88
  89         def get_title(self):
  90                 return self.search.get_result_title(self.result)
  91
  92         def get_url(self):
  93                 return self.search.get_result_querydict(self.result).urlencode()
  94
  95         def get_template(self):
  96                 return self.search.get_result_template(self.result)
  97
  98         def get_extra_context(self):
  99                 return self.search.get_result_extra_context(self.result)
 100
 101         def render(self):
 102                 t = self.get_template()
 103                 c = Context(self.get_extra_context())
 104                 c.update({
 105                         'title': self.get_title(),
 106                         'url': self.get_url()
 107                 })
 108                 return t.render(c)
 109
 110         def __unicode__(self):
 111                 return self.render()
 112
 113
 114 class BaseSearchMetaclass(type):
 115         def __new__(cls, name, bases, attrs):
 116                 if 'verbose_name' not in attrs:
 117                         attrs['verbose_name'] = capfirst(convert_camelcase(name))
 118                 if 'slug' not in attrs:
 119                         attrs['slug'] = name.lower()
 120                 return super(BaseSearchMetaclass, cls).__new__(cls, name, bases, attrs)
 121
 122
 123 class BaseSearch(object):
 124         """
 125         Defines a generic search interface. Accessing self.results will
 126         attempt to retrieve cached results and, if that fails, will
 127         initiate a new search and store the results in the cache.
 128         """
 129         __metaclass__ = BaseSearchMetaclass
 130         result_limit = 10
 131         _cache_timeout = 60*48
 132
 133         def __init__(self, search_arg):
 134                 self.search_arg = search_arg
 135
 136         def _get_cached_results(self):
 137                 """Return the cached results if the results haven't timed out. Otherwise return None."""
 138                 result_cache = cache.get(SEARCH_CACHE_KEY)
 139                 if result_cache and self.__class__ in result_cache and self.search_arg.lower() in result_cache[self.__class__]:
 140                         cached = result_cache[self.__class__][self.search_arg.lower()]
 141                         if cached['timeout'] >= datetime.datetime.now():
 142                                 return cached['results']
 143                 return None
 144
 145         def _set_cached_results(self, results, timeout):
 146                 """Sets the results to the cache for <timeout> minutes."""
 147                 result_cache = cache.get(SEARCH_CACHE_KEY) or {}
 148                 cached = result_cache.setdefault(self.__class__, {}).setdefault(self.search_arg.lower(), {})
 149                 cached.update({
 150                         'results': results,
 151                         'timeout': datetime.datetime.now() + datetime.timedelta(minutes=timeout)
 152                 })
 153                 cache.set(SEARCH_CACHE_KEY, result_cache, MAX_CACHE_TIMEOUT)
 154
 155         @property
 156         def results(self):
 157                 if not hasattr(self, '_results'):
 158                         results = self._get_cached_results()
 159                         if results is None:
 160                                 try:
 161                                         # Cache one extra result so we can see if there are
 162                                         # more results to be had.
 163                                         limit = self.result_limit
 164                                         if limit is not None:
 165                                                 limit += 1
 166                                         results = self.get_results(self.result_limit)
 167                                 except:
 168                                         if settings.DEBUG:
 169                                                 raise
 170                                         #  On exceptions, don't set any cache; just return.
 171                                         return []
 172
 173                                 self._set_cached_results(results, self._cache_timeout)
 174                         self._results = results
 175
 176                 return self._results
 177
 178         def get_results(self, limit=None, result_class=Result):
 179                 """
 180                 Calls self.search() and parses the return value into Result objects.
 181                 """
 182                 results = self.search(limit)
 183                 return [result_class(self, result) for result in results]
 184
 185         def search(self, limit=None):
 186                 """
 187                 Returns an iterable of up to <limit> results. The
 188                 get_result_title, get_result_url, get_result_template, and
 189                 get_result_extra_context methods will be used to interpret the
 190                 individual items that this function returns, so the result can
 191                 be an object with attributes as easily as a dictionary
 192                 with keys. The only restriction is that the objects be
 193                 pickleable so that they can be used with django's cache system.
 194                 """
 195                 raise NotImplementedError
 196
 197         def get_result_title(self, result):
 198                 raise NotImplementedError
 199
 200         def get_result_url(self, result):
 201                 "Subclasses override this to provide the actual URL for the result."
 202                 raise NotImplementedError
 203
 204         def get_result_querydict(self, result):
 205                 return make_tracking_querydict(self.search_arg, self.get_result_url(result))
 206
 207         def get_result_template(self, result):
 208                 if hasattr(self, 'result_template'):
 209                         return loader.get_template(self.result_template)
 210                 if not hasattr(self, '_result_template'):
 211                         self._result_template = Template(DEFAULT_RESULT_TEMPLATE_STRING)
 212                 return self._result_template
 213
 214         def get_ajax_result_template(self, result):
 215                 return getattr(self, 'ajax_result_template', DEFAULT_RESULT_TEMPLATE_STRING)
 216
 217         def get_result_extra_context(self, result):
 218                 return {}
 219
 220         def has_more_results(self):
 221                 """Useful to determine whether to display a `view more results` link."""
 222                 return len(self.results) > self.result_limit
 223
 224         @property
 225         def more_results_url(self):
 226                 """
 227                 Returns the actual url for more results. This will be encoded
 228                 into a querystring for tracking purposes.
 229                 """
 230                 raise NotImplementedError
 231
 232         @property
 233         def more_results_querydict(self):
 234                 return make_tracking_querydict(self.search_arg, self.more_results_url)
 235
 236         def __unicode__(self):
 237                 return ' '.join(self.__class__.verbose_name.rsplit(' ', 1)[:-1]) + ' results'
 238
 239
 240 class DatabaseSearch(BaseSearch):
 241         model = None
 242
 243         def has_more_results(self):
 244                 return self.get_queryset().count() > self.result_limit
 245
 246         def search(self, limit=None):
 247                 if not hasattr(self, '_qs'):
 248                         self._qs = self.get_queryset()
 249                         if limit is not None:
 250                                 self._qs = self._qs[:limit]
 251
 252                 return self._qs
 253
 254         def get_queryset(self):
 255                 return self.model._default_manager.all()
 256
 257
 258 class URLSearch(BaseSearch):
 259         """
 260         Defines a generic interface for searches that require accessing a
 261         certain url to get search results.
 262         """
 263         search_url = ''
 264         query_format_str = "%s"
 265
 266         @property
 267         def url(self):
 268                 "The URL where the search gets its results."
 269                 return self.search_url + self.query_format_str % urlquote_plus(self.search_arg)
 270
 271         @property
 272         def more_results_url(self):
 273                 "The URL where the users would go to get more results."
 274                 return self.url
 275
 276         def parse_response(self, response, limit=None):
 277                 raise NotImplementedError
 278
 279         def search(self, limit=None):
 280                 return self.parse_response(urllib2.urlopen(self.url), limit=limit)
 281
 282
 283 class JSONSearch(URLSearch):
 284         """
 285         Makes a GET request and parses the results as JSON. The default
 286         behavior assumes that the return value is a list of results.
 287         """
 288         def parse_response(self, response, limit=None):
 289                 return json.loads(response.read())[:limit]
 290
 291
 292 class GoogleSearch(JSONSearch):
 293         search_url = "http://ajax.googleapis.com/ajax/services/search/web"
 294         query_format_str = "?v=1.0&q=%s"
 295         # TODO: Change this template to reflect the app's actual name.
 296         result_template = 'search/googlesearch.html'
 297         timeout = 60
 298
 299         def parse_response(self, response, limit=None):
 300                 responseData = json.loads(response.read())['responseData']
 301                 results, cursor = responseData['results'], responseData['cursor']
 302
 303                 if results:
 304                         self._more_results_url = cursor['moreResultsUrl']
 305                         self._estimated_result_count = cursor['estimatedResultCount']
 306
 307                 return results[:limit]
 308
 309         @property
 310         def url(self):
 311                 # Google requires that an ajax request have a proper Referer header.
 312                 return urllib2.Request(
 313                         super(GoogleSearch, self).url,
 314                         None,
 315                         {'Referer': "http://%s" % Site.objects.get_current().domain}
 316                 )
 317
 318         @property
 319         def has_more_results(self):
 320                 if self.results and len(self.results) < self._estimated_result_count:
 321                         return True
 322                 return False
 323
 324         @property
 325         def more_results_url(self):
 326                 return self._more_results_url
 327
 328         def get_result_title(self, result):
 329                 return result['titleNoFormatting']
 330
 331         def get_result_url(self, result):
 332                 return result['unescapedUrl']
 333
 334         def get_result_extra_context(self, result):
 335                 return result
 336
 337
# Make GoogleSearch available through the module-level registry; with no
# explicit slug it is registered under the class's own `slug` attribute.
registry.register(GoogleSearch)
 339
 340
 341 try:
 342         from BeautifulSoup import BeautifulSoup, SoupStrainer, BeautifulStoneSoup
 343 except:
 344         pass
 345 else:
 346         __all__ += ('ScrapeSearch', 'XMLSearch',)
 347         class ScrapeSearch(URLSearch):
 348                 _strainer_args = []
 349                 _strainer_kwargs = {}
 350
 351                 @property
 352                 def strainer(self):
 353                         if not hasattr(self, '_strainer'):
 354                                 self._strainer = SoupStrainer(*self._strainer_args, **self._strainer_kwargs)
 355                         return self._strainer
 356
 357                 def parse_response(self, response, limit=None):
 358                         strainer = self.strainer
 359                         soup = BeautifulSoup(response, parseOnlyThese=strainer)
 360                         return self.parse_results(soup[:limit])
 361
 362                 def parse_results(self, results):
 363                         """
 364                         Provides a hook for parsing the results of straining. This
 365                         has no default behavior because the results absolutely
 366                         must be parsed to properly extract the information.
 367                         For more information, see http://www.crummy.com/software/BeautifulSoup/documentation.html#Improving%20Memory%20Usage%20with%20extract
 368                         """
 369                         raise NotImplementedError
 370
 371
 372         class XMLSearch(ScrapeSearch):
 373                 _self_closing_tags = []
 374
 375                 def parse_response(self, response, limit=None):
 376                         strainer = self.strainer
 377                         soup = BeautifulStoneSoup(page, selfClosingTags=self._self_closing_tags, parseOnlyThese=strainer)
 378                         return self.parse_results(soup[:limit])