Tweaks to sobol to make its use simpler: added a prerendered version of the results...
[philo.git] / philo / contrib / sobol / search.py
1 #encoding: utf-8
2 import datetime
3
4 from django.conf import settings
5 from django.contrib.sites.models import Site
6 from django.core.cache import cache
7 from django.db.models.options import get_verbose_name as convert_camelcase
8 from django.utils import simplejson as json
9 from django.utils.http import urlquote_plus
10 from django.utils.safestring import mark_safe
11 from django.utils.text import capfirst
12 from django.template import loader, Context, Template
13
14 from philo.contrib.sobol.utils import make_tracking_querydict
15
16
# Pick the urllib2 implementation. When SOBOL_USE_EVENTLET is enabled, prefer
# eventlet's green (cooperative) urllib2 and fall back to the standard
# library module if eventlet is not importable.
if getattr(settings, 'SOBOL_USE_EVENTLET', False):
        try:
                from eventlet.green import urllib2
        except ImportError:
                # Only a missing/unimportable eventlet should trigger the
                # fallback; a bare ``except`` would also hide things like
                # KeyboardInterrupt.
                import urllib2
else:
        import urllib2
24
25
# Public names of this module. Kept as a tuple because optional searches are
# appended to it further down with ``+=``.
__all__ = (
        'Result',
        'BaseSearch',
        'DatabaseSearch',
        'URLSearch',
        'JSONSearch',
        'GoogleSearch',
        'registry',
)


# Cache key under which the results of every search are stored.
SEARCH_CACHE_KEY = 'philo_sobol_search_results'

# Fallback markup for a single result: the title, wrapped in a link when the
# result has a url.
DEFAULT_RESULT_TEMPLATE_STRING = (
        "{% if url %}<a href='{{ url }}'>{% endif %}"
        "{{ title }}"
        "{% if url %}</a>{% endif %}"
)
DEFAULT_RESULT_TEMPLATE = Template(DEFAULT_RESULT_TEMPLATE_STRING)

# Determines the timeout on the entire result cache. The value works out to
# the number of minutes in one week (7 days * 24 hours * 60 minutes).
# NOTE(review): Django's cache backends count timeouts in seconds - confirm
# the unit is what is expected at the point of use.
MAX_CACHE_TIMEOUT = 7 * 24 * 60
37
38
class RegistrationError(Exception):
        """Raised when a search cannot be registered or unregistered as requested."""
        pass


class SearchRegistry(object):
        """Holds a registry of search types by slug."""

        def __init__(self):
                self._registry = {}

        def register(self, search, slug=None):
                """
                Register ``search`` under ``slug`` (default: ``search.slug``).

                If the slug is already taken by a search defined in the same
                module, the existing registration is kept and nothing happens.
                If it is taken by a search from a different module,
                RegistrationError is raised.
                """
                slug = slug or search.slug
                if slug in self._registry:
                        registered = self._registry[slug]
                        if registered.__module__ != search.__module__:
                                raise RegistrationError("A different search is already registered as `%s`" % slug)
                else:
                        self._registry[slug] = search

        def unregister(self, search, slug=None):
                """
                Remove ``search`` from the registry.

                With a ``slug``, only that registration is removed, and
                RegistrationError is raised if ``search`` is not what is
                registered under it. Without a ``slug``, every registration of
                ``search`` is removed; nothing is raised if there is none.
                """
                if slug is not None:
                        if slug in self._registry and self._registry[slug] == search:
                                del self._registry[slug]
                                # Success: do not fall through to the error below.
                                return
                        raise RegistrationError("`%s` is not registered as `%s`" % (search, slug))

                # Distinct loop names keep the ``search`` argument intact, and
                # the snapshot makes it safe to delete while looping.
                for registered_slug, registered_search in list(self._registry.items()):
                        if registered_search == search:
                                del self._registry[registered_slug]

        def items(self):
                """Return the (slug, search) pairs of the registry."""
                return self._registry.items()

        def iteritems(self):
                """Return an iterator over the (slug, search) pairs of the registry."""
                return self._registry.iteritems()

        def iterchoices(self):
                """Yield (slug, verbose_name) pairs for every registered search."""
                for slug, search in self.iteritems():
                        yield slug, search.verbose_name

        def __getitem__(self, key):
                """Return the search registered under ``key``; raises KeyError if absent."""
                return self._registry[key]

        def __iter__(self):
                """Iterate over the registered slugs."""
                return self._registry.__iter__()


# Module-level registry shared by all searches.
registry = SearchRegistry()
85
86
class Result(object):
        """
        Pairs a raw search result with the search that produced it.

        Everything needed to display the result (title, url, template and
        extra template context) is obtained by delegating to the search's
        ``get_result_*`` methods, passing the raw result along.
        """
        def __init__(self, search, result):
                self.search, self.result = search, result

        def get_title(self):
                """Return the title the search derives from the raw result."""
                owner = self.search
                return owner.get_result_title(self.result)

        def get_url(self):
                """
                Return the result's tracking querystring (prefixed with ``?``),
                or an empty string when the search provides no querydict.
                """
                querydict = self.search.get_result_querydict(self.result)
                if querydict is not None:
                        return "?%s" % querydict.urlencode()
                return ""

        def get_template(self):
                """Return the template the search selects for the raw result."""
                owner = self.search
                return owner.get_result_template(self.result)

        def get_extra_context(self):
                """Return the additional context the search supplies for the raw result."""
                owner = self.search
                return owner.get_result_extra_context(self.result)

        def get_context(self):
                """
                Return the extra context with ``title``, ``url`` and ``result``
                added on top (these three override same-named extra keys).
                """
                context = self.get_extra_context()
                context.update(
                        title=self.get_title(),
                        url=self.get_url(),
                        result=self.result
                )
                return context

        def render(self):
                """Render the result's template with the result's context."""
                template = self.get_template()
                context = Context(self.get_context())
                return template.render(context)

        def __unicode__(self):
                return self.render()
130
131
class BaseSearchMetaclass(type):
        """
        Fills in ``verbose_name`` and ``slug`` on search classes that do not
        define them in their own class body: the verbose name is derived from
        the class name, the slug is the lowercased class name.
        """
        def __new__(cls, name, bases, attrs):
                # Factories are only invoked for attributes that are missing.
                fallbacks = (
                        ('verbose_name', lambda: capfirst(convert_camelcase(name))),
                        ('slug', name.lower),
                )
                for attr_name, make_default in fallbacks:
                        if attr_name not in attrs:
                                attrs[attr_name] = make_default()
                return super(BaseSearchMetaclass, cls).__new__(cls, name, bases, attrs)
139
140
class BaseSearch(object):
        """
        Defines a generic search interface. Accessing self.results will
        attempt to retrieve cached results and, if that fails, will
        initiate a new search and store the results in the cache.
        """
        __metaclass__ = BaseSearchMetaclass
        # Maximum number of results to expose; ``None`` means no limit.
        result_limit = 10
        # How long, in minutes, this search's cached results stay valid.
        _cache_timeout = 60*48

        def __init__(self, search_arg):
                self.search_arg = search_arg

        def _get_cached_results(self):
                """Return the cached results if the results haven't timed out. Otherwise return None."""
                result_cache = cache.get(SEARCH_CACHE_KEY)
                if result_cache and self.__class__ in result_cache and self.search_arg.lower() in result_cache[self.__class__]:
                        cached = result_cache[self.__class__][self.search_arg.lower()]
                        if cached['timeout'] >= datetime.datetime.now():
                                return cached['results']
                return None

        def _set_cached_results(self, results, timeout):
                """Sets the results to the cache for <timeout> minutes."""
                result_cache = cache.get(SEARCH_CACHE_KEY) or {}
                # Entries are grouped per search class, then per lowercased query.
                cached = result_cache.setdefault(self.__class__, {}).setdefault(self.search_arg.lower(), {})
                cached.update({
                        'results': results,
                        'timeout': datetime.datetime.now() + datetime.timedelta(minutes=timeout)
                })
                # MAX_CACHE_TIMEOUT is counted in minutes, like the per-entry
                # timeouts above, but the cache backend expects seconds. Without
                # the conversion the whole cache would expire long before the
                # individual entries are meant to.
                cache.set(SEARCH_CACHE_KEY, result_cache, MAX_CACHE_TIMEOUT * 60)

        @property
        def results(self):
                """
                The list of results for this search, taken from the cache when
                possible and memoized on the instance.

                If the search itself fails, the exception is re-raised when
                settings.DEBUG is on; otherwise an empty list is returned and
                nothing is cached or memoized, so a later access tries again.
                """
                if not hasattr(self, '_results'):
                        results = self._get_cached_results()
                        if results is None:
                                try:
                                        # Cache one extra result so we can see if there are
                                        # more results to be had.
                                        limit = self.result_limit
                                        if limit is not None:
                                                limit += 1
                                        results = self.get_results(limit)
                                except Exception:
                                        # ``Exception`` rather than a bare except, so that
                                        # KeyboardInterrupt/SystemExit still propagate.
                                        if settings.DEBUG:
                                                raise
                                        #  On exceptions, don't set any cache; just return.
                                        return []

                                self._set_cached_results(results, self._cache_timeout)
                        self._results = results

                return self._results

        def get_results(self, limit=None, result_class=Result):
                """
                Calls self.search() and parses the return value into Result objects.
                """
                results = self.search(limit)
                return [result_class(self, result) for result in results]

        def search(self, limit=None):
                """
                Returns an iterable of up to <limit> results. The
                get_result_title, get_result_url, get_result_template, and
                get_result_extra_context methods will be used to interpret the
                individual items that this function returns, so the result can
                be an object with attributes as easily as a dictionary
                with keys. The only restriction is that the objects be
                pickleable so that they can be used with django's cache system.
                """
                raise NotImplementedError

        def get_result_title(self, result):
                "Subclasses override this to provide the title for the result."
                raise NotImplementedError

        def get_result_url(self, result):
                "Subclasses override this to provide the actual URL for the result."
                raise NotImplementedError

        def get_result_querydict(self, result):
                """
                Return the tracking querydict for the result's URL, or None
                when the result has no URL.
                """
                url = self.get_result_url(result)
                if url is None:
                        return None
                return make_tracking_querydict(self.search_arg, url)

        def get_result_template(self, result):
                """
                Return the template used to render a result: the one named by
                ``result_template`` if the search defines it, otherwise the
                module's default result template.
                """
                if hasattr(self, 'result_template'):
                        return loader.get_template(self.result_template)
                if not hasattr(self, '_result_template'):
                        self._result_template = DEFAULT_RESULT_TEMPLATE
                return self._result_template

        def get_result_extra_context(self, result):
                "Subclasses override this to add variables to a result's template context."
                return {}

        def has_more_results(self):
                """Useful to determine whether to display a `view more results` link."""
                if self.result_limit is None:
                        # Without a limit nothing was held back, and comparing a
                        # length with None is meaningless.
                        return False
                return len(self.results) > self.result_limit

        @property
        def more_results_url(self):
                """
                Returns the actual url for more results. This will be encoded
                into a querystring for tracking purposes.
                """
                raise NotImplementedError

        @property
        def more_results_querydict(self):
                "The tracking querydict for more_results_url."
                return make_tracking_querydict(self.search_arg, self.more_results_url)

        def __unicode__(self):
                # Drops the last word of the verbose name and appends "results".
                return ' '.join(self.__class__.verbose_name.rsplit(' ', 1)[:-1]) + ' results'
256
257
class DatabaseSearch(BaseSearch):
        """
        Search whose results come from a queryset on ``model``. The queryset
        is built once per instance and reused by later calls to search().
        """
        model = None

        def search(self, limit=None):
                """Return the (memoized) queryset, sliced to ``limit`` when one was given."""
                if hasattr(self, '_qs'):
                        return self._qs

                queryset = self.get_queryset()
                self._qs = queryset
                if limit is not None:
                        self._qs = queryset[:limit]
                return self._qs

        def get_queryset(self):
                """Return the base queryset: every object of ``model``'s default manager."""
                manager = self.model._default_manager
                return manager.all()
271
272
class URLSearch(BaseSearch):
        """
        Generic base for searches that obtain their results by requesting a
        url. Subclasses supply ``search_url``, optionally
        ``query_format_str``, and implement parse_response().
        """
        search_url = ''
        query_format_str = "%s"

        @property
        def url(self):
                "The URL where the search gets its results."
                base = self.search_url
                # The url-quoted search argument is interpolated into the format string.
                query = self.query_format_str % urlquote_plus(self.search_arg)
                return base + query

        @property
        def more_results_url(self):
                "The URL where the users would go to get more results."
                return self.url

        def parse_response(self, response, limit=None):
                "Subclasses override this to turn the opened response into up to <limit> results."
                raise NotImplementedError

        def search(self, limit=None):
                "Open self.url and hand the response to parse_response()."
                response = urllib2.urlopen(self.url)
                return self.parse_response(response, limit=limit)
296
297
class JSONSearch(URLSearch):
        """
        Issues a GET request and decodes the response body as JSON. By
        default the decoded value is assumed to be a list of results.
        """
        def parse_response(self, response, limit=None):
                "Decode the response body and keep at most <limit> items."
                body = response.read()
                decoded = json.loads(body)
                return decoded[:limit]
305
306
class GoogleSearch(JSONSearch):
        """
        Searches through Google's AJAX web search endpoint, restricting the
        query to the current site's domain by default.
        """
        search_url = "http://ajax.googleapis.com/ajax/services/search/web"
        # TODO: Change this template to reflect the app's actual name.
        result_template = 'search/googlesearch.html'
        _cache_timeout = 60
        verbose_name = "Google search (current site)"

        @property
        def query_format_str(self):
                """
                Format string for the querystring: the quoted default arguments
                followed by a ``%s`` placeholder for the user's query.
                """
                default_args = self.default_args
                if default_args:
                        default_args += " "
                # Percent signs produced by quoting are doubled so they survive
                # the later ``%`` interpolation of the search argument.
                return "?v=1.0&q=%s%%s" % urlquote_plus(default_args).replace('%', '%%')

        @property
        def default_args(self):
                "Query terms prepended to every search: a site: filter for the current site."
                return "site:%s" % Site.objects.get_current().domain

        def parse_response(self, response, limit=None):
                """
                Decode the response and return up to <limit> raw results. When
                there are results, the cursor's more-results url and estimated
                result count are remembered on the instance.
                """
                responseData = json.loads(response.read())['responseData']
                results, cursor = responseData['results'], responseData['cursor']

                if results:
                        self._more_results_url = cursor['moreResultsUrl']
                        self._estimated_result_count = cursor['estimatedResultCount']

                return results[:limit]

        @property
        def url(self):
                # Google requires that an ajax request have a proper Referer header.
                return urllib2.Request(
                        super(GoogleSearch, self).url,
                        None,
                        {'Referer': "http://%s" % Site.objects.get_current().domain}
                )

        @property
        def has_more_results(self):
                """
                Whether the estimated result count exceeds the number of results
                at hand. Note that this is a property here, whereas BaseSearch
                defines has_more_results as a method.

                Returns False when the estimate is unknown, which is the case
                when the results were served from the cache (parse_response did
                not run on this instance) or the value is not a usable number.
                """
                results = self.results
                if not results:
                        return False
                try:
                        # The estimate is coerced to int because the API may
                        # deliver it as a string, and comparing an int with a
                        # string does not compare the numbers.
                        estimated = int(getattr(self, '_estimated_result_count', None))
                except (TypeError, ValueError):
                        return False
                return len(results) < estimated

        @property
        def more_results_url(self):
                # Only available after parse_response() has seen results on this
                # instance; otherwise this raises AttributeError.
                return self._more_results_url

        def get_result_title(self, result):
                return result['titleNoFormatting']

        def get_result_url(self, result):
                return result['unescapedUrl']


registry.register(GoogleSearch)
362
363
# The scraping searches are optional: they are only defined (and exported)
# when BeautifulSoup can be imported.
try:
        from BeautifulSoup import BeautifulSoup, SoupStrainer, BeautifulStoneSoup
except ImportError:
        # Only a missing BeautifulSoup should be ignored; a bare ``except``
        # would also hide unrelated failures and KeyboardInterrupt.
        pass
else:
        __all__ += ('ScrapeSearch', 'XMLSearch',)
        class ScrapeSearch(URLSearch):
                """
                Search that parses the fetched document with BeautifulSoup,
                restricted to the parts matched by a SoupStrainer built from
                ``_strainer_args`` and ``_strainer_kwargs``.
                """
                # Class-level configuration for the strainer; subclasses are
                # expected to override these rather than mutate them.
                _strainer_args = []
                _strainer_kwargs = {}

                @property
                def strainer(self):
                        "The SoupStrainer for this search, created once per instance."
                        if not hasattr(self, '_strainer'):
                                self._strainer = SoupStrainer(*self._strainer_args, **self._strainer_kwargs)
                        return self._strainer

                def parse_response(self, response, limit=None):
                        """
                        Parse only the strained parts of the response and pass up
                        to <limit> top-level matches to parse_results().
                        """
                        strainer = self.strainer
                        soup = BeautifulSoup(response, parseOnlyThese=strainer)
                        return self.parse_results(soup.findAll(recursive=False, limit=limit))

                def parse_results(self, results):
                        """
                        Provides a hook for parsing the results of straining. This
                        has no default behavior because the results absolutely
                        must be parsed to properly extract the information.
                        For more information, see http://www.crummy.com/software/BeautifulSoup/documentation.html#Improving%20Memory%20Usage%20with%20extract
                        """
                        raise NotImplementedError


        class XMLSearch(ScrapeSearch):
                """
                Variant of ScrapeSearch that parses the response with
                BeautifulStoneSoup, passing ``_self_closing_tags`` as the
                parser's self-closing tags.
                """
                _self_closing_tags = []

                def parse_response(self, response, limit=None):
                        """
                        Parse only the strained parts of the response and pass up
                        to <limit> top-level matches to parse_results().
                        """
                        strainer = self.strainer
                        soup = BeautifulStoneSoup(response, selfClosingTags=self._self_closing_tags, parseOnlyThese=strainer)
                        return self.parse_results(soup.findAll(recursive=False, limit=limit))