philo/contrib/sobol/search.py

   1 #encoding: utf-8
   2 import datetime
   3
   4 from django.conf import settings
   5 from django.contrib.sites.models import Site
   6 from django.core.cache import cache
   7 from django.db.models.options import get_verbose_name as convert_camelcase
   8 from django.utils import simplejson as json
   9 from django.utils.http import urlquote_plus
  10 from django.utils.safestring import mark_safe
  11 from django.utils.text import capfirst
  12 from django.template import loader, Context, Template
  13
  14 from philo.contrib.sobol.utils import make_tracking_querydict
  15
  16
  17 if getattr(settings, 'SOBOL_USE_EVENTLET', False):
  18         try:
  19                 from eventlet.green import urllib2
  20         except:
  21                 import urllib2
  22 else:
  23         import urllib2
  24
  25
  26 __all__ = (
  27         'Result', 'BaseSearch', 'DatabaseSearch', 'URLSearch', 'JSONSearch', 'GoogleSearch', 'registry'
  28 )
  29
  30
  31 SEARCH_CACHE_KEY = 'philo_sobol_search_results'
  32 DEFAULT_RESULT_TEMPLATE_STRING = "{% if url %}<a href='{{ url }}'>{% endif %}{{ title }}{% if url %}</a>{% endif %}"
  33 DEFAULT_RESULT_TEMPLATE = Template(DEFAULT_RESULT_TEMPLATE_STRING)
  34
  35 # Determines the timeout on the entire result cache.
  36 MAX_CACHE_TIMEOUT = 60*24*7
  37
  38
  39 class RegistrationError(Exception):
  40         pass
  41
  42
  43 class SearchRegistry(object):
  44         # Holds a registry of search types by slug.
  45         def __init__(self):
  46                 self._registry = {}
  47
  48         def register(self, search, slug=None):
  49                 slug = slug or search.slug
  50                 if slug in self._registry:
  51                         registered = self._registry[slug]
  52                         if registered.__module__ != search.__module__:
  53                                 raise RegistrationError("A different search is already registered as `%s`" % slug)
  54                 else:
  55                         self._registry[slug] = search
  56
  57         def unregister(self, search, slug=None):
  58                 if slug is not None:
  59                         if slug in self._registry and self._registry[slug] == search:
  60                                 del self._registry[slug]
  61                         raise RegistrationError("`%s` is not registered as `%s`" % (search, slug))
  62                 else:
  63                         for slug, search in self._registry.items():
  64                                 if search == search:
  65                                         del self._registry[slug]
  66
  67         def items(self):
  68                 return self._registry.items()
  69
  70         def iteritems(self):
  71                 return self._registry.iteritems()
  72
  73         def iterchoices(self):
  74                 for slug, search in self.iteritems():
  75                         yield slug, search.verbose_name
  76
  77         def __getitem__(self, key):
  78                 return self._registry[key]
  79
  80         def __iter__(self):
  81                 return self._registry.__iter__()
  82
  83
  84 registry = SearchRegistry()
  85
  86
  87 class Result(object):
  88         """
  89         A result is instantiated with a configuration dictionary, a search,
  90         and a template name. The configuration dictionary is expected to
  91         define a `title` and optionally a `url`. Any other variables may be
  92         defined; they will be made available through the result object in
  93         the template, if one is defined.
  94         """
  95         def __init__(self, search, result):
  96                 self.search = search
  97                 self.result = result
  98
  99         def get_title(self):
 100                 return self.search.get_result_title(self.result)
 101
 102         def get_url(self):
 103                 qd = self.search.get_result_querydict(self.result)
 104                 if qd is None:
 105                         return ""
 106                 return "?%s" % qd.urlencode()
 107
 108         def get_template(self):
 109                 return self.search.get_result_template(self.result)
 110
 111         def get_extra_context(self):
 112                 return self.search.get_result_extra_context(self.result)
 113
 114         def get_context(self):
 115                 context = self.get_extra_context()
 116                 context.update({
 117                         'title': self.get_title(),
 118                         'url': self.get_url()
 119                 })
 120                 return context
 121
 122         def render(self):
 123                 t = self.get_template()
 124                 c = Context(self.get_context())
 125                 return t.render(c)
 126
 127         def __unicode__(self):
 128                 return self.render()
 129
 130
 131 class BaseSearchMetaclass(type):
 132         def __new__(cls, name, bases, attrs):
 133                 if 'verbose_name' not in attrs:
 134                         attrs['verbose_name'] = capfirst(convert_camelcase(name))
 135                 if 'slug' not in attrs:
 136                         attrs['slug'] = name.lower()
 137                 return super(BaseSearchMetaclass, cls).__new__(cls, name, bases, attrs)
 138
 139
 140 class BaseSearch(object):
 141         """
 142         Defines a generic search interface. Accessing self.results will
 143         attempt to retrieve cached results and, if that fails, will
 144         initiate a new search and store the results in the cache.
 145         """
 146         __metaclass__ = BaseSearchMetaclass
 147         result_limit = 10
 148         _cache_timeout = 60*48
 149
 150         def __init__(self, search_arg):
 151                 self.search_arg = search_arg
 152
 153         def _get_cached_results(self):
 154                 """Return the cached results if the results haven't timed out. Otherwise return None."""
 155                 result_cache = cache.get(SEARCH_CACHE_KEY)
 156                 if result_cache and self.__class__ in result_cache and self.search_arg.lower() in result_cache[self.__class__]:
 157                         cached = result_cache[self.__class__][self.search_arg.lower()]
 158                         if cached['timeout'] >= datetime.datetime.now():
 159                                 return cached['results']
 160                 return None
 161
 162         def _set_cached_results(self, results, timeout):
 163                 """Sets the results to the cache for <timeout> minutes."""
 164                 result_cache = cache.get(SEARCH_CACHE_KEY) or {}
 165                 cached = result_cache.setdefault(self.__class__, {}).setdefault(self.search_arg.lower(), {})
 166                 cached.update({
 167                         'results': results,
 168                         'timeout': datetime.datetime.now() + datetime.timedelta(minutes=timeout)
 169                 })
 170                 cache.set(SEARCH_CACHE_KEY, result_cache, MAX_CACHE_TIMEOUT)
 171
 172         @property
 173         def results(self):
 174                 if not hasattr(self, '_results'):
 175                         results = self._get_cached_results()
 176                         if results is None:
 177                                 try:
 178                                         # Cache one extra result so we can see if there are
 179                                         # more results to be had.
 180                                         limit = self.result_limit
 181                                         if limit is not None:
 182                                                 limit += 1
 183                                         results = self.get_results(limit)
 184                                 except:
 185                                         if settings.DEBUG:
 186                                                 raise
 187                                         #  On exceptions, don't set any cache; just return.
 188                                         return []
 189
 190                                 self._set_cached_results(results, self._cache_timeout)
 191                         self._results = results
 192
 193                 return self._results
 194
 195         def get_results(self, limit=None, result_class=Result):
 196                 """
 197                 Calls self.search() and parses the return value into Result objects.
 198                 """
 199                 results = self.search(limit)
 200                 return [result_class(self, result) for result in results]
 201
 202         def search(self, limit=None):
 203                 """
 204                 Returns an iterable of up to <limit> results. The
 205                 get_result_title, get_result_url, get_result_template, and
 206                 get_result_extra_context methods will be used to interpret the
 207                 individual items that this function returns, so the result can
 208                 be an object with attributes as easily as a dictionary
 209                 with keys. The only restriction is that the objects be
 210                 pickleable so that they can be used with django's cache system.
 211                 """
 212                 raise NotImplementedError
 213
 214         def get_result_title(self, result):
 215                 raise NotImplementedError
 216
 217         def get_result_url(self, result):
 218                 "Subclasses override this to provide the actual URL for the result."
 219                 raise NotImplementedError
 220
 221         def get_result_querydict(self, result):
 222                 url = self.get_result_url(result)
 223                 if url is None:
 224                         return None
 225                 return make_tracking_querydict(self.search_arg, url)
 226
 227         def get_result_template(self, result):
 228                 if hasattr(self, 'result_template'):
 229                         return loader.get_template(self.result_template)
 230                 if not hasattr(self, '_result_template'):
 231                         self._result_template = DEFAULT_RESULT_TEMPLATE
 232                 return self._result_template
 233
 234         def get_result_extra_context(self, result):
 235                 return {}
 236
 237         def has_more_results(self):
 238                 """Useful to determine whether to display a `view more results` link."""
 239                 return len(self.results) > self.result_limit
 240
 241         @property
 242         def more_results_url(self):
 243                 """
 244                 Returns the actual url for more results. This will be encoded
 245                 into a querystring for tracking purposes.
 246                 """
 247                 raise NotImplementedError
 248
 249         @property
 250         def more_results_querydict(self):
 251                 return make_tracking_querydict(self.search_arg, self.more_results_url)
 252
 253         def __unicode__(self):
 254                 return ' '.join(self.__class__.verbose_name.rsplit(' ', 1)[:-1]) + ' results'
 255
 256
 257 class DatabaseSearch(BaseSearch):
 258         model = None
 259
 260         def search(self, limit=None):
 261                 if not hasattr(self, '_qs'):
 262                         self._qs = self.get_queryset()
 263                         if limit is not None:
 264                                 self._qs = self._qs[:limit]
 265
 266                 return self._qs
 267
 268         def get_queryset(self):
 269                 return self.model._default_manager.all()
 270
 271
 272 class URLSearch(BaseSearch):
 273         """
 274         Defines a generic interface for searches that require accessing a
 275         certain url to get search results.
 276         """
 277         search_url = ''
 278         query_format_str = "%s"
 279
 280         @property
 281         def url(self):
 282                 "The URL where the search gets its results."
 283                 return self.search_url + self.query_format_str % urlquote_plus(self.search_arg)
 284
 285         @property
 286         def more_results_url(self):
 287                 "The URL where the users would go to get more results."
 288                 return self.url
 289
 290         def parse_response(self, response, limit=None):
 291                 raise NotImplementedError
 292
 293         def search(self, limit=None):
 294                 return self.parse_response(urllib2.urlopen(self.url), limit=limit)
 295
 296
 297 class JSONSearch(URLSearch):
 298         """
 299         Makes a GET request and parses the results as JSON. The default
 300         behavior assumes that the return value is a list of results.
 301         """
 302         def parse_response(self, response, limit=None):
 303                 return json.loads(response.read())[:limit]
 304
 305
 306 class GoogleSearch(JSONSearch):
 307         search_url = "http://ajax.googleapis.com/ajax/services/search/web"
 308         # TODO: Change this template to reflect the app's actual name.
 309         result_template = 'search/googlesearch.html'
 310         _cache_timeout = 60
 311         verbose_name = "Google search (current site)"
 312
 313         @property
 314         def query_format_str(self):
 315                 default_args = self.default_args
 316                 if default_args:
 317                         default_args += " "
 318                 return "?v=1.0&q=%s%%s" % urlquote_plus(default_args).replace('%', '%%')
 319
 320         @property
 321         def default_args(self):
 322                 return "site:%s" % Site.objects.get_current().domain
 323
 324         def parse_response(self, response, limit=None):
 325                 responseData = json.loads(response.read())['responseData']
 326                 results, cursor = responseData['results'], responseData['cursor']
 327
 328                 if results:
 329                         self._more_results_url = cursor['moreResultsUrl']
 330                         self._estimated_result_count = cursor['estimatedResultCount']
 331
 332                 return results[:limit]
 333
 334         @property
 335         def url(self):
 336                 # Google requires that an ajax request have a proper Referer header.
 337                 return urllib2.Request(
 338                         super(GoogleSearch, self).url,
 339                         None,
 340                         {'Referer': "http://%s" % Site.objects.get_current().domain}
 341                 )
 342
 343         @property
 344         def has_more_results(self):
 345                 if self.results and len(self.results) < self._estimated_result_count:
 346                         return True
 347                 return False
 348
 349         @property
 350         def more_results_url(self):
 351                 return self._more_results_url
 352
 353         def get_result_title(self, result):
 354                 return result['titleNoFormatting']
 355
 356         def get_result_url(self, result):
 357                 return result['unescapedUrl']
 358
 359         def get_result_extra_context(self, result):
 360                 return result
 361
 362
 363 registry.register(GoogleSearch)
 364
 365
 366 try:
 367         from BeautifulSoup import BeautifulSoup, SoupStrainer, BeautifulStoneSoup
 368 except:
 369         pass
 370 else:
 371         __all__ += ('ScrapeSearch', 'XMLSearch',)
 372         class ScrapeSearch(URLSearch):
 373                 _strainer_args = []
 374                 _strainer_kwargs = {}
 375
 376                 @property
 377                 def strainer(self):
 378                         if not hasattr(self, '_strainer'):
 379                                 self._strainer = SoupStrainer(*self._strainer_args, **self._strainer_kwargs)
 380                         return self._strainer
 381
 382                 def parse_response(self, response, limit=None):
 383                         strainer = self.strainer
 384                         soup = BeautifulSoup(response, parseOnlyThese=strainer)
 385                         return self.parse_results(soup.findAll(recursive=False, limit=limit))
 386
 387                 def parse_results(self, results):
 388                         """
 389                         Provides a hook for parsing the results of straining. This
 390                         has no default behavior because the results absolutely
 391                         must be parsed to properly extract the information.
 392                         For more information, see http://www.crummy.com/software/BeautifulSoup/documentation.html#Improving%20Memory%20Usage%20with%20extract
 393                         """
 394                         raise NotImplementedError
 395
 396
 397         class XMLSearch(ScrapeSearch):
 398                 _self_closing_tags = []
 399
 400                 def parse_response(self, response, limit=None):
 401                         strainer = self.strainer
 402                         soup = BeautifulStoneSoup(response, selfClosingTags=self._self_closing_tags, parseOnlyThese=strainer)
 403                         return self.parse_results(soup.findAll(recursive=False, limit=limit))