contrib/sobol/search.py

   1 #encoding: utf-8
   2
   3 from django.conf import settings
   4 from django.contrib.sites.models import Site
   5 from django.core.cache import cache
   6 from django.db.models.options import get_verbose_name as convert_camelcase
   7 from django.utils import simplejson as json
   8 from django.utils.http import urlquote_plus
   9 from django.utils.safestring import mark_safe
  10 from django.utils.text import capfirst
  11 from django.template import loader, Context, Template
  12 import datetime
  13 from philo.contrib.sobol.utils import make_tracking_querydict
  14
  15 try:
  16         from eventlet.green import urllib2
  17 except:
  18         import urllib2
  19
  20
  21 __all__ = (
  22         'Result', 'BaseSearch', 'DatabaseSearch', 'URLSearch', 'JSONSearch', 'GoogleSearch', 'registry'
  23 )
  24
  25
  26 SEARCH_CACHE_KEY = 'philo_sobol_search_results'
  27 DEFAULT_RESULT_TEMPLATE_STRING = "{% if url %}<a href='{{ url }}'>{% endif %}{{ title }}{% if url %}</a>{% endif %}"
  28
  29 # Determines the timeout on the entire result cache.
  30 MAX_CACHE_TIMEOUT = 60*60*24*7
  31
  32
  33 class RegistrationError(Exception):
  34         pass
  35
  36
  37 class SearchRegistry(object):
  38         # Holds a registry of search types by slug.
  39         def __init__(self):
  40                 self._registry = {}
  41
  42         def register(self, search, slug=None):
  43                 slug = slug or search.slug
  44                 if slug in self._registry:
  45                         registered = self._registry[slug]
  46                         if registered.__module__ != search.__module__:
  47                                 raise RegistrationError("A different search is already registered as `%s`" % slug)
  48                 else:
  49                         self._registry[slug] = search
  50
  51         def unregister(self, search, slug=None):
  52                 if slug is not None:
  53                         if slug in self._registry and self._registry[slug] == search:
  54                                 del self._registry[slug]
  55                         raise RegistrationError("`%s` is not registered as `%s`" % (search, slug))
  56                 else:
  57                         for slug, search in self._registry.items():
  58                                 if search == search:
  59                                         del self._registry[slug]
  60
  61         def items(self):
  62                 return self._registry.items()
  63
  64         def iteritems(self):
  65                 return self._registry.iteritems()
  66
  67         def iterchoices(self):
  68                 for slug, search in self.iteritems():
  69                         yield slug, search.verbose_name
  70
  71         def __getitem__(self, key):
  72                 return self._registry[key]
  73
  74         def __iter__(self):
  75                 return self._registry.__iter__()
  76
  77
# Module-level registry instance; searches are added to it with
# ``registry.register(...)``.
registry = SearchRegistry()
  79
  80
  81 class Result(object):
  82         """
  83         A result is instantiated with a configuration dictionary, a search,
  84         and a template name. The configuration dictionary is expected to
  85         define a `title` and optionally a `url`. Any other variables may be
  86         defined; they will be made available through the result object in
  87         the template, if one is defined.
  88         """
  89         def __init__(self, search, result):
  90                 self.search = search
  91                 self.result = result
  92
  93         def get_title(self):
  94                 return self.search.get_result_title(self.result)
  95
  96         def get_url(self):
  97                 return "?%s" % self.search.get_result_querydict(self.result).urlencode()
  98
  99         def get_template(self):
 100                 return self.search.get_result_template(self.result)
 101
 102         def get_extra_context(self):
 103                 return self.search.get_result_extra_context(self.result)
 104
 105         def get_context(self):
 106                 context = self.get_extra_context()
 107                 context.update({
 108                         'title': self.get_title(),
 109                         'url': self.get_url()
 110                 })
 111                 return context
 112
 113         def render(self):
 114                 t = self.get_template()
 115                 c = Context(self.get_context())
 116                 return t.render(c)
 117
 118         def __unicode__(self):
 119                 return self.render()
 120
 121
 122 class BaseSearchMetaclass(type):
 123         def __new__(cls, name, bases, attrs):
 124                 if 'verbose_name' not in attrs:
 125                         attrs['verbose_name'] = capfirst(convert_camelcase(name))
 126                 if 'slug' not in attrs:
 127                         attrs['slug'] = name.lower()
 128                 return super(BaseSearchMetaclass, cls).__new__(cls, name, bases, attrs)
 129
 130
 131 class BaseSearch(object):
 132         """
 133         Defines a generic search interface. Accessing self.results will
 134         attempt to retrieve cached results and, if that fails, will
 135         initiate a new search and store the results in the cache.
 136         """
 137         __metaclass__ = BaseSearchMetaclass
 138         result_limit = 10
 139         _cache_timeout = 60*48
 140
 141         def __init__(self, search_arg):
 142                 self.search_arg = search_arg
 143
 144         def _get_cached_results(self):
 145                 """Return the cached results if the results haven't timed out. Otherwise return None."""
 146                 result_cache = cache.get(SEARCH_CACHE_KEY)
 147                 if result_cache and self.__class__ in result_cache and self.search_arg.lower() in result_cache[self.__class__]:
 148                         cached = result_cache[self.__class__][self.search_arg.lower()]
 149                         if cached['timeout'] >= datetime.datetime.now():
 150                                 return cached['results']
 151                 return None
 152
 153         def _set_cached_results(self, results, timeout):
 154                 """Sets the results to the cache for <timeout> minutes."""
 155                 result_cache = cache.get(SEARCH_CACHE_KEY) or {}
 156                 cached = result_cache.setdefault(self.__class__, {}).setdefault(self.search_arg.lower(), {})
 157                 cached.update({
 158                         'results': results,
 159                         'timeout': datetime.datetime.now() + datetime.timedelta(minutes=timeout)
 160                 })
 161                 cache.set(SEARCH_CACHE_KEY, result_cache, MAX_CACHE_TIMEOUT)
 162
 163         @property
 164         def results(self):
 165                 if not hasattr(self, '_results'):
 166                         results = self._get_cached_results()
 167                         if results is None:
 168                                 try:
 169                                         # Cache one extra result so we can see if there are
 170                                         # more results to be had.
 171                                         limit = self.result_limit
 172                                         if limit is not None:
 173                                                 limit += 1
 174                                         results = self.get_results(self.result_limit)
 175                                 except:
 176                                         if settings.DEBUG:
 177                                                 raise
 178                                         #  On exceptions, don't set any cache; just return.
 179                                         return []
 180
 181                                 self._set_cached_results(results, self._cache_timeout)
 182                         self._results = results
 183
 184                 return self._results
 185
 186         def get_results(self, limit=None, result_class=Result):
 187                 """
 188                 Calls self.search() and parses the return value into Result objects.
 189                 """
 190                 results = self.search(limit)
 191                 return [result_class(self, result) for result in results]
 192
 193         def search(self, limit=None):
 194                 """
 195                 Returns an iterable of up to <limit> results. The
 196                 get_result_title, get_result_url, get_result_template, and
 197                 get_result_extra_context methods will be used to interpret the
 198                 individual items that this function returns, so the result can
 199                 be an object with attributes as easily as a dictionary
 200                 with keys. The only restriction is that the objects be
 201                 pickleable so that they can be used with django's cache system.
 202                 """
 203                 raise NotImplementedError
 204
 205         def get_result_title(self, result):
 206                 raise NotImplementedError
 207
 208         def get_result_url(self, result):
 209                 "Subclasses override this to provide the actual URL for the result."
 210                 raise NotImplementedError
 211
 212         def get_result_querydict(self, result):
 213                 return make_tracking_querydict(self.search_arg, self.get_result_url(result))
 214
 215         def get_result_template(self, result):
 216                 if hasattr(self, 'result_template'):
 217                         return loader.get_template(self.result_template)
 218                 if not hasattr(self, '_result_template'):
 219                         self._result_template = Template(DEFAULT_RESULT_TEMPLATE_STRING)
 220                 return self._result_template
 221
 222         def get_result_extra_context(self, result):
 223                 return {}
 224
 225         def has_more_results(self):
 226                 """Useful to determine whether to display a `view more results` link."""
 227                 return len(self.results) > self.result_limit
 228
 229         @property
 230         def more_results_url(self):
 231                 """
 232                 Returns the actual url for more results. This will be encoded
 233                 into a querystring for tracking purposes.
 234                 """
 235                 raise NotImplementedError
 236
 237         @property
 238         def more_results_querydict(self):
 239                 return make_tracking_querydict(self.search_arg, self.more_results_url)
 240
 241         def __unicode__(self):
 242                 return ' '.join(self.__class__.verbose_name.rsplit(' ', 1)[:-1]) + ' results'
 243
 244
 245 class DatabaseSearch(BaseSearch):
 246         model = None
 247
 248         def has_more_results(self):
 249                 return self.get_queryset().count() > self.result_limit
 250
 251         def search(self, limit=None):
 252                 if not hasattr(self, '_qs'):
 253                         self._qs = self.get_queryset()
 254                         if limit is not None:
 255                                 self._qs = self._qs[:limit]
 256
 257                 return self._qs
 258
 259         def get_queryset(self):
 260                 return self.model._default_manager.all()
 261
 262
 263 class URLSearch(BaseSearch):
 264         """
 265         Defines a generic interface for searches that require accessing a
 266         certain url to get search results.
 267         """
 268         search_url = ''
 269         query_format_str = "%s"
 270
 271         @property
 272         def url(self):
 273                 "The URL where the search gets its results."
 274                 return self.search_url + self.query_format_str % urlquote_plus(self.search_arg)
 275
 276         @property
 277         def more_results_url(self):
 278                 "The URL where the users would go to get more results."
 279                 return self.url
 280
 281         def parse_response(self, response, limit=None):
 282                 raise NotImplementedError
 283
 284         def search(self, limit=None):
 285                 return self.parse_response(urllib2.urlopen(self.url), limit=limit)
 286
 287
 288 class JSONSearch(URLSearch):
 289         """
 290         Makes a GET request and parses the results as JSON. The default
 291         behavior assumes that the return value is a list of results.
 292         """
 293         def parse_response(self, response, limit=None):
 294                 return json.loads(response.read())[:limit]
 295
 296
 297 class GoogleSearch(JSONSearch):
 298         search_url = "http://ajax.googleapis.com/ajax/services/search/web"
 299         query_format_str = "?v=1.0&q=%s"
 300         # TODO: Change this template to reflect the app's actual name.
 301         result_template = 'search/googlesearch.html'
 302         timeout = 60
 303
 304         def parse_response(self, response, limit=None):
 305                 responseData = json.loads(response.read())['responseData']
 306                 results, cursor = responseData['results'], responseData['cursor']
 307
 308                 if results:
 309                         self._more_results_url = cursor['moreResultsUrl']
 310                         self._estimated_result_count = cursor['estimatedResultCount']
 311
 312                 return results[:limit]
 313
 314         @property
 315         def url(self):
 316                 # Google requires that an ajax request have a proper Referer header.
 317                 return urllib2.Request(
 318                         super(GoogleSearch, self).url,
 319                         None,
 320                         {'Referer': "http://%s" % Site.objects.get_current().domain}
 321                 )
 322
 323         @property
 324         def has_more_results(self):
 325                 if self.results and len(self.results) < self._estimated_result_count:
 326                         return True
 327                 return False
 328
 329         @property
 330         def more_results_url(self):
 331                 return self._more_results_url
 332
 333         def get_result_title(self, result):
 334                 return result['titleNoFormatting']
 335
 336         def get_result_url(self, result):
 337                 return result['unescapedUrl']
 338
 339         def get_result_extra_context(self, result):
 340                 return result
 341
 342
# Add GoogleSearch to the module-level registry under its slug.
registry.register(GoogleSearch)
 344
 345
 346 try:
 347         from BeautifulSoup import BeautifulSoup, SoupStrainer, BeautifulStoneSoup
 348 except:
 349         pass
 350 else:
 351         __all__ += ('ScrapeSearch', 'XMLSearch',)
 352         class ScrapeSearch(URLSearch):
 353                 _strainer_args = []
 354                 _strainer_kwargs = {}
 355
 356                 @property
 357                 def strainer(self):
 358                         if not hasattr(self, '_strainer'):
 359                                 self._strainer = SoupStrainer(*self._strainer_args, **self._strainer_kwargs)
 360                         return self._strainer
 361
 362                 def parse_response(self, response, limit=None):
 363                         strainer = self.strainer
 364                         soup = BeautifulSoup(response, parseOnlyThese=strainer)
 365                         return self.parse_results(soup.findAll(recursive=False, limit=limit))
 366
 367                 def parse_results(self, results):
 368                         """
 369                         Provides a hook for parsing the results of straining. This
 370                         has no default behavior because the results absolutely
 371                         must be parsed to properly extract the information.
 372                         For more information, see http://www.crummy.com/software/BeautifulSoup/documentation.html#Improving%20Memory%20Usage%20with%20extract
 373                         """
 374                         raise NotImplementedError
 375
 376
 377         class XMLSearch(ScrapeSearch):
 378                 _self_closing_tags = []
 379
 380                 def parse_response(self, response, limit=None):
 381                         strainer = self.strainer
 382                         soup = BeautifulStoneSoup(page, selfClosingTags=self._self_closing_tags, parseOnlyThese=strainer)
 383                         return self.parse_results(soup[:limit])