4 from django.conf import settings
5 from django.contrib.sites.models import Site
6 from django.core.cache import cache
7 from django.db.models.options import get_verbose_name as convert_camelcase
8 from django.utils import simplejson as json
9 from django.utils.http import urlquote_plus
10 from django.utils.safestring import mark_safe
11 from django.utils.text import capfirst
12 from django.template import loader, Context, Template
14 from philo.contrib.sobol.utils import make_tracking_querydict, RegistryIterator
17 if getattr(settings, 'SOBOL_USE_EVENTLET', False):
19 from eventlet.green import urllib2
27 'Result', 'BaseSearch', 'DatabaseSearch', 'URLSearch', 'JSONSearch', 'GoogleSearch', 'SearchRegistry', 'registry'
# Cache key under which the whole sobol result cache dict is stored.
SEARCH_CACHE_KEY = 'philo_sobol_search_results'
# Fallback template: render the result title, linked when a url is present.
DEFAULT_RESULT_TEMPLATE_STRING = "{% if url %}<a href='{{ url }}'>{% endif %}{{ title }}{% if url %}</a>{% endif %}"
DEFAULT_RESULT_TEMPLATE = Template(DEFAULT_RESULT_TEMPLATE_STRING)
# Determines the timeout on the entire result cache.
# NOTE(review): 60*24*7 reads like "minutes in a week", but Django's
# cache.set() timeout is in seconds — confirm the intended unit.
MAX_CACHE_TIMEOUT = 60*24*7
39 class RegistrationError(Exception):
class SearchRegistry(object):
    """Holds a registry of search types by slug."""

    def register(self, search, slug=None):
        # Register `search` under `slug` (defaults to the search's own slug).
        # Registering the same search module twice is a no-op; a *different*
        # search under an existing slug is an error.
        slug = slug or search.slug
        if slug in self._registry:
            registered = self._registry[slug]
            if registered.__module__ != search.__module__:
                raise RegistrationError("A different search is already registered as `%s`" % slug)
        self._registry[slug] = search

    def unregister(self, search, slug=None):
        # Remove `search` if it is registered under `slug`.
        if slug in self._registry and self._registry[slug] == search:
            del self._registry[slug]
        # NOTE(review): several lines are elided from this view here — the
        # raise below presumably sits on an else/fallthrough branch, and the
        # following loop and returns belong to elided method definitions.
        raise RegistrationError("`%s` is not registered as `%s`" % (search, slug))
        for slug, search in self._registry.items():
            del self._registry[slug]
        return self._registry.items()
        return RegistryIterator(self._registry, 'iteritems')

    def iterchoices(self):
        # Lazy (slug, verbose_name) pairs, e.g. for a field's `choices`.
        return RegistryIterator(self._registry, 'iteritems', lambda x: (x[0], x[1].verbose_name))

    def __getitem__(self, key):
        # Dict-style lookup of a registered search class by slug.
        return self._registry[key]
        return self._registry.__iter__()
84 registry = SearchRegistry()
    A result is instantiated with a configuration dictionary, a search,
    and a template name. The configuration dictionary is expected to
    define a `title` and optionally a `url`. Any other variables may be
    defined; they will be made available through the result object in
    the template, if one is defined.
    def __init__(self, search, result):
        return self.search.get_result_title(self.result)
        qd = self.search.get_result_querydict(self.result)
        return "?%s" % qd.urlencode()

    def get_template(self):
        # Delegate template selection to the owning search.
        return self.search.get_result_template(self.result)

    def get_extra_context(self):
        # Delegate extra template context to the owning search.
        return self.search.get_result_extra_context(self.result)

    def get_context(self):
        # NOTE(review): lines are elided from this view below — the
        # title/url/result entries presumably populate a dict merged into
        # `context`, and the trailing statements belong to an elided
        # render method.
        context = self.get_extra_context()
        'title': self.get_title(),
        'url': self.get_url(),
        'result': self.result
        t = self.get_template()
        c = Context(self.get_context())

    def __unicode__(self):
class BaseSearchMetaclass(type):
    """Metaclass supplying default `verbose_name` and `slug` class attributes.

    `verbose_name` is derived from the CamelCase class name with its last
    word dropped (e.g. a class named "FooSearch" yields "Foo"); `slug` is
    simply the lowercased class name.
    """
    def __new__(cls, name, bases, attrs):
        if 'verbose_name' not in attrs:
            # convert_camelcase splits the CamelCase name into words;
            # rsplit(...)[: -1] discards the trailing word before joining.
            words = convert_camelcase(name).rsplit(' ', 1)[:-1]
            attrs['verbose_name'] = capfirst(' '.join(words))
        attrs.setdefault('slug', name.lower())
        return super(BaseSearchMetaclass, cls).__new__(cls, name, bases, attrs)
class BaseSearch(object):
    Defines a generic search interface. Accessing self.results will
    attempt to retrieve cached results and, if that fails, will
    initiate a new search and store the results in the cache.
    __metaclass__ = BaseSearchMetaclass
    # Per-search cache lifetime; _set_cached_results treats it as minutes
    # (48 hours here).
    _cache_timeout = 60*48

    def __init__(self, search_arg):
        # `search_arg` is the raw search string entered by the user.
        self.search_arg = search_arg

    def _get_cached_results(self):
        """Return the cached results if the results haven't timed out. Otherwise return None."""
        result_cache = cache.get(SEARCH_CACHE_KEY)
        # Results are keyed first by search class, then by lowercased arg.
        if result_cache and self.__class__ in result_cache and self.search_arg.lower() in result_cache[self.__class__]:
            cached = result_cache[self.__class__][self.search_arg.lower()]
            if cached['timeout'] >= datetime.datetime.now():
                return cached['results']

    def _set_cached_results(self, results, timeout):
        """Sets the results to the cache for <timeout> minutes."""
        result_cache = cache.get(SEARCH_CACHE_KEY) or {}
        cached = result_cache.setdefault(self.__class__, {}).setdefault(self.search_arg.lower(), {})
        # NOTE(review): the update of `cached` around the next line is
        # elided in this view; the timeout entry is an absolute expiry time.
        'timeout': datetime.datetime.now() + datetime.timedelta(minutes=timeout)
        cache.set(SEARCH_CACHE_KEY, result_cache, MAX_CACHE_TIMEOUT)

        # NOTE(review): this appears to be the body of a `results` property
        # whose decorator/def lines are elided in this view.
        if not hasattr(self, '_results'):
            results = self._get_cached_results()
            # Cache one extra result so we can see if there are
            # more results to be had.
            limit = self.result_limit
            if limit is not None:
                results = self.get_results(limit)
                # On exceptions, don't set any cache; just return.
                self._set_cached_results(results, self._cache_timeout)
            self._results = results

    def get_results(self, limit=None, result_class=Result):
        Calls self.search() and parses the return value into Result objects.
        results = self.search(limit)
        return [result_class(self, result) for result in results]

    def search(self, limit=None):
        Returns an iterable of up to <limit> results. The
        get_result_title, get_result_url, get_result_template, and
        get_result_extra_context methods will be used to interpret the
        individual items that this function returns, so the result can
        be an object with attributes as easily as a dictionary
        with keys. The only restriction is that the objects be
        pickleable so that they can be used with django's cache system.
        raise NotImplementedError

    def get_result_title(self, result):
        raise NotImplementedError

    def get_result_url(self, result):
        "Subclasses override this to provide the actual URL for the result."
        raise NotImplementedError

    def get_result_querydict(self, result):
        # Wrap the result URL in a tracking querystring.
        url = self.get_result_url(result)
        return make_tracking_querydict(self.search_arg, url)

    def get_result_template(self, result):
        # Prefer a class-specified template; otherwise fall back to the
        # module-level default, memoized on the instance.
        if hasattr(self, 'result_template'):
            return loader.get_template(self.result_template)
        if not hasattr(self, '_result_template'):
            self._result_template = DEFAULT_RESULT_TEMPLATE
        return self._result_template

    def get_result_extra_context(self, result):

    def has_more_results(self):
        """Useful to determine whether to display a `view more results` link."""
        # Works because `results` fetches result_limit + 1 items.
        return len(self.results) > self.result_limit

    def more_results_url(self):
        Returns the actual url for more results. This will be encoded
        into a querystring for tracking purposes.
        raise NotImplementedError

    def more_results_querydict(self):
        return make_tracking_querydict(self.search_arg, self.more_results_url)

    def __unicode__(self):
        return self.verbose_name
class DatabaseSearch(BaseSearch):
    # Searches a Django model; subclasses are expected to supply `model`.
    def search(self, limit=None):
        # Build and memoize the (possibly sliced) queryset on first use.
        if not hasattr(self, '_qs'):
            self._qs = self.get_queryset()
            if limit is not None:
                self._qs = self._qs[:limit]
        # NOTE(review): the return statement is elided from this view;
        # presumably `return self._qs`.

    def get_queryset(self):
        # Default: everything from the model's default manager.
        return self.model._default_manager.all()
class URLSearch(BaseSearch):
    Defines a generic interface for searches that require accessing a
    certain url to get search results.
    # Format string appended to `search_url`; subclasses override to add
    # query parameters around the quoted search term.
    query_format_str = "%s"

        # NOTE(review): this is the body of a `url` property whose
        # decorator/def lines are elided in this view.
        "The URL where the search gets its results."
        return self.search_url + self.query_format_str % urlquote_plus(self.search_arg)

    def more_results_url(self):
        "The URL where the users would go to get more results."

    def parse_response(self, response, limit=None):
        # Subclasses turn the HTTP response into an iterable of raw results.
        raise NotImplementedError

    def search(self, limit=None):
        # Fetch the search URL and delegate parsing to parse_response().
        return self.parse_response(urllib2.urlopen(self.url), limit=limit)
class JSONSearch(URLSearch):
    Makes a GET request and parses the results as JSON. The default
    behavior assumes that the return value is a list of results.
    def parse_response(self, response, limit=None):
        # [:None] is a no-op slice, so limit=None returns everything.
        return json.loads(response.read())[:limit]
class GoogleSearch(JSONSearch):
    search_url = "http://ajax.googleapis.com/ajax/services/search/web"
    # TODO: Change this template to reflect the app's actual name.
    result_template = 'search/googlesearch.html'
    verbose_name = "Google search (current site)"

    def query_format_str(self):
        # Build the AJAX-API query string. Literal %'s in the quoted default
        # args are doubled so the trailing %%s survives formatting as a %s
        # placeholder for the search term.
        default_args = self.default_args
        return "?v=1.0&q=%s%%s" % urlquote_plus(default_args).replace('%', '%%')

    def default_args(self):
        # Restrict results to the current Django site's domain.
        return "site:%s" % Site.objects.get_current().domain

    def parse_response(self, response, limit=None):
        responseData = json.loads(response.read())['responseData']
        results, cursor = responseData['results'], responseData['cursor']
        # Stash Google's pagination metadata for has_more_results /
        # more_results_url. NOTE(review): guard lines around these
        # assignments are elided in this view.
        self._more_results_url = cursor['moreResultsUrl']
        self._estimated_result_count = cursor['estimatedResultCount']
        return results[:limit]

        # NOTE(review): body of an overriding `url` property whose
        # decorator/def lines are elided in this view.
        # Google requires that an ajax request have a proper Referer header.
        return urllib2.Request(
            super(GoogleSearch, self).url,
            {'Referer': "http://%s" % Site.objects.get_current().domain}

    def has_more_results(self):
        # NOTE(review): the return statements are elided in this view.
        if self.results and len(self.results) < self._estimated_result_count:

    def more_results_url(self):
        return self._more_results_url

    def get_result_title(self, result):
        return result['titleNoFormatting']

    def get_result_url(self, result):
        return result['unescapedUrl']
361 registry.register(GoogleSearch)
365 from BeautifulSoup import BeautifulSoup, SoupStrainer, BeautifulStoneSoup
369 __all__ += ('ScrapeSearch', 'XMLSearch',)
class ScrapeSearch(URLSearch):
    # Keyword arguments passed to SoupStrainer alongside `_strainer_args`
    # (the args attribute is elided from this view).
    _strainer_kwargs = {}

        # NOTE(review): body of a `strainer` property whose decorator/def
        # lines are elided in this view. Lazily builds and caches the
        # SoupStrainer used to limit how much of the page is parsed.
        if not hasattr(self, '_strainer'):
            self._strainer = SoupStrainer(*self._strainer_args, **self._strainer_kwargs)
        return self._strainer

    def parse_response(self, response, limit=None):
        # Parse only the strained portion of the document, then hand the
        # top-level matches to parse_results().
        strainer = self.strainer
        soup = BeautifulSoup(response, parseOnlyThese=strainer)
        return self.parse_results(soup.findAll(recursive=False, limit=limit))

    def parse_results(self, results):
        Provides a hook for parsing the results of straining. This
        has no default behavior because the results absolutely
        must be parsed to properly extract the information.
        For more information, see http://www.crummy.com/software/BeautifulSoup/documentation.html#Improving%20Memory%20Usage%20with%20extract
        raise NotImplementedError
class XMLSearch(ScrapeSearch):
    # Tag names BeautifulStoneSoup should treat as self-closing when
    # parsing XML documents.
    _self_closing_tags = []

    def parse_response(self, response, limit=None):
        # XML flavor of ScrapeSearch.parse_response: same straining and
        # parse_results() hand-off, but via BeautifulStoneSoup.
        strainer = self.strainer
        soup = BeautifulStoneSoup(response, selfClosingTags=self._self_closing_tags, parseOnlyThese=strainer)
        return self.parse_results(soup.findAll(recursive=False, limit=limit))