Removed Sobol's automatic use of eventlet. Adjusted imports project-wide to conform...
[philo.git] / philo / contrib / sobol / search.py
1 #encoding: utf-8
2 import datetime
3
4 from django.conf import settings
5 from django.contrib.sites.models import Site
6 from django.core.cache import cache
7 from django.db.models.options import get_verbose_name as convert_camelcase
8 from django.utils import simplejson as json
9 from django.utils.http import urlquote_plus
10 from django.utils.safestring import mark_safe
11 from django.utils.text import capfirst
12 from django.template import loader, Context, Template
13
14 from philo.contrib.sobol.utils import make_tracking_querydict
15
16
# Pick the urllib2 implementation used for outgoing search requests.
# eventlet's green urllib2 is used only when the project opts in through the
# SOBOL_USE_EVENTLET setting; if eventlet cannot be imported, or the setting
# is off, the standard library module is used instead.
if getattr(settings, 'SOBOL_USE_EVENTLET', False):
        try:
                from eventlet.green import urllib2
        except ImportError:
                # Only a missing/unimportable eventlet should trigger the
                # fallback; anything else is a real error worth surfacing.
                import urllib2
else:
        import urllib2
24
25
# Names exported by a star-import of this module.
__all__ = (
        'Result',
        'BaseSearch',
        'DatabaseSearch',
        'URLSearch',
        'JSONSearch',
        'GoogleSearch',
        'registry',
)


# Single cache key under which the results of every search are stored.
SEARCH_CACHE_KEY = 'philo_sobol_search_results'

# Fallback markup for one result: the title, wrapped in a link when a url is set.
DEFAULT_RESULT_TEMPLATE_STRING = "{% if url %}<a href='{{ url }}'>{% endif %}{{ title }}{% if url %}</a>{% endif %}"

# Compiled once at import time and reused for every result without its own template.
DEFAULT_RESULT_TEMPLATE = Template(DEFAULT_RESULT_TEMPLATE_STRING)
34
# Determines the timeout on the entire result cache. This value is handed
# straight to Django's cache.set(), which expects seconds, so it is one week
# expressed in seconds. (The per-search timeouts are tracked separately, in
# minutes, inside the cached structure.)
MAX_CACHE_TIMEOUT = 60*60*24*7
37
38
class RegistrationError(Exception):
        """Raised when a search cannot be registered or unregistered under a slug."""
41
42
class SearchRegistry(object):
        """
        Holds a registry of search types by slug.

        The registry can be indexed by slug, and iterating over it yields the
        registered slugs.
        """
        def __init__(self):
                self._registry = {}

        def register(self, search, slug=None):
                """
                Register ``search`` under ``slug`` (``search.slug`` by default).

                If the slug is already taken by a search from the same module the
                call is a silent no-op and the existing entry is kept.

                :raises RegistrationError: if the slug is already taken by a
                        search that lives in a different module.
                """
                slug = slug or search.slug
                if slug in self._registry:
                        registered = self._registry[slug]
                        if registered.__module__ != search.__module__:
                                raise RegistrationError("A different search is already registered as `%s`" % slug)
                else:
                        self._registry[slug] = search

        def unregister(self, search, slug=None):
                """
                Remove ``search`` from the registry.

                With a ``slug``, only that one entry is removed, and it must
                currently map to ``search``. Without a ``slug``, every entry that
                maps to ``search`` is removed; nothing happens if there is none.

                :raises RegistrationError: if ``slug`` is given but ``search`` is
                        not registered under it.
                """
                if slug is not None:
                        if slug in self._registry and self._registry[slug] == search:
                                del self._registry[slug]
                        else:
                                raise RegistrationError("`%s` is not registered as `%s`" % (search, slug))
                else:
                        # Collect the matching slugs first so the dict is never
                        # modified while it is being iterated, and use distinct
                        # loop names so the ``search`` argument is not shadowed.
                        matching = [
                                key for key, registered in self._registry.items()
                                if registered == search
                        ]
                        for key in matching:
                                del self._registry[key]

        def items(self):
                """Return the ``(slug, search)`` pairs of the registry."""
                return self._registry.items()

        def iteritems(self):
                """Return an iterator over the ``(slug, search)`` pairs."""
                return self._registry.iteritems()

        def iterchoices(self):
                """Yield ``(slug, verbose_name)`` pairs, one per registered search."""
                for slug, search in self.iteritems():
                        yield slug, search.verbose_name

        def __getitem__(self, key):
                return self._registry[key]

        def __iter__(self):
                return self._registry.__iter__()


# Module-level registry instance exported through ``__all__``.
registry = SearchRegistry()
85
86
class Result(object):
        """
        Pairs one raw search result with the search that produced it.

        A result is instantiated with the search instance and the raw result
        object. Everything needed for display (title, url, template and extra
        template context) is obtained by asking the search to interpret the
        raw result, so the raw result may be any kind of object.
        """
        def __init__(self, search, result):
                self.search = search
                self.result = result

        def get_title(self):
                """Return the title the search reports for this result."""
                return self.search.get_result_title(self.result)

        def get_url(self):
                """
                Return the result's link as a ``?``-prefixed querystring, or an
                empty string when the search supplies no querydict.
                """
                qd = self.search.get_result_querydict(self.result)
                if qd is None:
                        return ""
                return "?%s" % qd.urlencode()

        def get_template(self):
                """Return the template the search wants this result rendered with."""
                return self.search.get_result_template(self.result)

        def get_extra_context(self):
                """Return the additional template variables supplied by the search."""
                return self.search.get_result_extra_context(self.result)

        def get_context(self):
                """
                Build the template context: the extra context plus ``title`` and
                ``url`` (which take precedence over same-named extra keys).
                """
                # Copy into a fresh dict: a search may hand back the raw result
                # itself as extra context, and that object must not be modified.
                context = {}
                context.update(self.get_extra_context())
                context.update({
                        'title': self.get_title(),
                        'url': self.get_url()
                })
                return context

        def render(self):
                """Render this result with its template and context."""
                t = self.get_template()
                c = Context(self.get_context())
                return t.render(c)

        def __unicode__(self):
                return self.render()
129
130
class BaseSearchMetaclass(type):
        """
        Metaclass that supplies ``slug`` and ``verbose_name`` for search
        classes whose own class body does not declare them.
        """
        def __new__(cls, name, bases, attrs):
                # Default slug: the class name, lowercased.
                attrs.setdefault('slug', name.lower())
                # Default verbose name: the class name passed through
                # convert_camelcase and then capfirst.
                if 'verbose_name' not in attrs:
                        attrs['verbose_name'] = capfirst(convert_camelcase(name))
                return super(BaseSearchMetaclass, cls).__new__(cls, name, bases, attrs)
138
139
class BaseSearch(object):
        """
        Defines a generic search interface. Accessing self.results will
        attempt to retrieve cached results and, if that fails, will
        initiate a new search and store the results in the cache.
        """
        __metaclass__ = BaseSearchMetaclass
        # Maximum number of results to show; None means no limit.
        result_limit = 10
        # How long (in minutes) this search's results stay valid in the cache.
        _cache_timeout = 60*48

        def __init__(self, search_arg):
                self.search_arg = search_arg

        def _get_cached_results(self):
                """Return the cached results if the results haven't timed out. Otherwise return None."""
                result_cache = cache.get(SEARCH_CACHE_KEY)
                if result_cache and self.__class__ in result_cache and self.search_arg.lower() in result_cache[self.__class__]:
                        cached = result_cache[self.__class__][self.search_arg.lower()]
                        if cached['timeout'] >= datetime.datetime.now():
                                return cached['results']
                return None

        def _set_cached_results(self, results, timeout):
                """Sets the results to the cache for <timeout> minutes."""
                result_cache = cache.get(SEARCH_CACHE_KEY) or {}
                # Entries are keyed by search class, then by lowercased search string.
                cached = result_cache.setdefault(self.__class__, {}).setdefault(self.search_arg.lower(), {})
                cached.update({
                        'results': results,
                        'timeout': datetime.datetime.now() + datetime.timedelta(minutes=timeout)
                })
                cache.set(SEARCH_CACHE_KEY, result_cache, MAX_CACHE_TIMEOUT)

        @property
        def results(self):
                """
                The list of results for this search, memoized on the instance.

                Cached results are used when available; otherwise the search is
                run and its results are cached. If the search fails, the error is
                re-raised when settings.DEBUG is on; otherwise an empty list is
                returned and nothing is cached or memoized.
                """
                if not hasattr(self, '_results'):
                        results = self._get_cached_results()
                        if results is None:
                                try:
                                        # Cache one extra result so we can see if there are
                                        # more results to be had.
                                        limit = self.result_limit
                                        if limit is not None:
                                                limit += 1
                                        results = self.get_results(limit)
                                except Exception:
                                        # Exception (not a bare except) so that
                                        # KeyboardInterrupt/SystemExit still propagate.
                                        if settings.DEBUG:
                                                raise
                                        #  On exceptions, don't set any cache; just return.
                                        return []

                                self._set_cached_results(results, self._cache_timeout)
                        self._results = results

                return self._results

        def get_results(self, limit=None, result_class=Result):
                """
                Calls self.search() and parses the return value into Result objects.
                """
                results = self.search(limit)
                return [result_class(self, result) for result in results]

        def search(self, limit=None):
                """
                Returns an iterable of up to <limit> results. The
                get_result_title, get_result_url, get_result_template, and
                get_result_extra_context methods will be used to interpret the
                individual items that this function returns, so the result can
                be an object with attributes as easily as a dictionary
                with keys. The only restriction is that the objects be
                pickleable so that they can be used with django's cache system.
                """
                raise NotImplementedError

        def get_result_title(self, result):
                "Subclasses override this to provide the title for the result."
                raise NotImplementedError

        def get_result_url(self, result):
                "Subclasses override this to provide the actual URL for the result."
                raise NotImplementedError

        def get_result_querydict(self, result):
                """
                Return the tracking querydict for the result's URL, or None when
                the result has no URL.
                """
                url = self.get_result_url(result)
                if url is None:
                        return None
                return make_tracking_querydict(self.search_arg, url)

        def get_result_template(self, result):
                """
                Return the template used to render a result: the one named by
                ``result_template`` if the search defines it, otherwise the
                module's default result template.
                """
                if hasattr(self, 'result_template'):
                        return loader.get_template(self.result_template)
                if not hasattr(self, '_result_template'):
                        self._result_template = DEFAULT_RESULT_TEMPLATE
                return self._result_template

        def get_result_extra_context(self, result):
                """Return extra template variables for a result; none by default."""
                return {}

        def has_more_results(self):
                """Useful to determine whether to display a `view more results` link."""
                limit = self.result_limit
                if limit is None:
                        # Without a limit nothing was held back, so there can be
                        # no further results (and an int/None comparison is avoided).
                        return False
                return len(self.results) > limit

        @property
        def more_results_url(self):
                """
                Returns the actual url for more results. This will be encoded
                into a querystring for tracking purposes.
                """
                raise NotImplementedError

        @property
        def more_results_querydict(self):
                """The tracking querydict for ``more_results_url``."""
                return make_tracking_querydict(self.search_arg, self.more_results_url)

        def __unicode__(self):
                # The verbose name with its last word dropped, followed by ' results'.
                return ' '.join(self.__class__.verbose_name.rsplit(' ', 1)[:-1]) + ' results'
255
256
class DatabaseSearch(BaseSearch):
        """
        Base class for searches whose results come from a queryset on
        ``model``.
        """
        model = None

        def search(self, limit=None):
                """
                Return the queryset of results, sliced to ``limit`` when one is
                given. The queryset is built on the first call and reused after.
                """
                if hasattr(self, '_qs'):
                        return self._qs

                self._qs = self.get_queryset()
                if limit is not None:
                        self._qs = self._qs[:limit]
                return self._qs

        def get_queryset(self):
                """Return every object available through the model's default manager."""
                manager = self.model._default_manager
                return manager.all()
270
271
class URLSearch(BaseSearch):
        """
        Generic base for searches that fetch their results from a URL.
        Subclasses supply ``search_url`` and implement ``parse_response``.
        """
        search_url = ''
        query_format_str = "%s"

        @property
        def url(self):
                "The URL where the search gets its results."
                base = self.search_url
                query_format = self.query_format_str
                # The url-quoted search string is substituted into the format
                # string and the result is appended to the base url.
                return base + query_format % urlquote_plus(self.search_arg)

        @property
        def more_results_url(self):
                "The URL where the users would go to get more results."
                return self.url

        def parse_response(self, response, limit=None):
                "Turn the opened response into an iterable of up to <limit> results."
                raise NotImplementedError

        def search(self, limit=None):
                "Open the search url and hand the response to parse_response."
                response = urllib2.urlopen(self.url)
                return self.parse_response(response, limit=limit)
295
296
class JSONSearch(URLSearch):
        """
        URL search whose response body is JSON. By default the decoded
        document is expected to be a list of results.
        """
        def parse_response(self, response, limit=None):
                "Decode the response body as JSON and keep at most <limit> items."
                body = response.read()
                decoded = json.loads(body)
                return decoded[:limit]
304
305
class GoogleSearch(JSONSearch):
        """
        Searches the current site through Google's AJAX web search endpoint,
        restricting the query with a ``site:<domain>`` term.
        """
        search_url = "http://ajax.googleapis.com/ajax/services/search/web"
        # TODO: Change this template to reflect the app's actual name.
        result_template = 'search/googlesearch.html'
        _cache_timeout = 60
        verbose_name = "Google search (current site)"

        # Defaults for the values recorded by parse_response(). That method is
        # not called when results come from the cache or when the response has
        # no results, so these attributes must exist regardless.
        _more_results_url = None
        _estimated_result_count = 0

        @property
        def query_format_str(self):
                """
                Format string for the querystring: the default arguments are
                baked in (with any ``%`` doubled so they survive the later
                ``%`` substitution) and one ``%s`` is left for the search term.
                """
                default_args = self.default_args
                if default_args:
                        default_args += " "
                return "?v=1.0&q=%s%%s" % urlquote_plus(default_args).replace('%', '%%')

        @property
        def default_args(self):
                "Query terms prepended to every search: a site: filter for the current site."
                return "site:%s" % Site.objects.get_current().domain

        def parse_response(self, response, limit=None):
                """
                Decode the response, remember the cursor's "more results" url and
                estimated result count, and return up to <limit> results.
                """
                responseData = json.loads(response.read())['responseData']
                results, cursor = responseData['results'], responseData['cursor']

                if results:
                        self._more_results_url = cursor['moreResultsUrl']
                        # NOTE(review): the count may arrive as a string - confirm
                        # against the API; int() keeps the comparison in
                        # has_more_results numeric either way.
                        self._estimated_result_count = int(cursor['estimatedResultCount'])

                return results[:limit]

        @property
        def url(self):
                # Google requires that an ajax request have a proper Referer header.
                return urllib2.Request(
                        super(GoogleSearch, self).url,
                        None,
                        {'Referer': "http://%s" % Site.objects.get_current().domain}
                )

        @property
        def has_more_results(self):
                """
                True when there are results and the recorded estimated result
                count exceeds the number of results held. False when no estimate
                was recorded.
                """
                if self.results and len(self.results) < self._estimated_result_count:
                        return True
                return False

        @property
        def more_results_url(self):
                "The url recorded from the last parsed response, or None if there is none."
                return self._more_results_url

        def get_result_title(self, result):
                return result['titleNoFormatting']

        def get_result_url(self, result):
                return result['unescapedUrl']

        def get_result_extra_context(self, result):
                # The raw result dict is exposed to the template as-is.
                return result


registry.register(GoogleSearch)
364
365
# ScrapeSearch and XMLSearch depend on the optional BeautifulSoup package and
# are only defined (and exported) when it can be imported.
try:
        from BeautifulSoup import BeautifulSoup, SoupStrainer, BeautifulStoneSoup
except ImportError:
        # BeautifulSoup is not available: skip the scraping searches. Any other
        # error raised during the import is a real problem and propagates.
        pass
else:
        __all__ += ('ScrapeSearch', 'XMLSearch',)
        class ScrapeSearch(URLSearch):
                """
                URL search that parses the response with BeautifulSoup, keeping
                only what the configured SoupStrainer matches.
                """
                # Positional and keyword arguments used to build the SoupStrainer.
                _strainer_args = []
                _strainer_kwargs = {}

                @property
                def strainer(self):
                        "The SoupStrainer for this search, built on first access."
                        if not hasattr(self, '_strainer'):
                                self._strainer = SoupStrainer(*self._strainer_args, **self._strainer_kwargs)
                        return self._strainer

                def parse_response(self, response, limit=None):
                        "Strain the response and pass up to <limit> top-level matches to parse_results."
                        strainer = self.strainer
                        soup = BeautifulSoup(response, parseOnlyThese=strainer)
                        return self.parse_results(soup.findAll(recursive=False, limit=limit))

                def parse_results(self, results):
                        """
                        Provides a hook for parsing the results of straining. This
                        has no default behavior because the results absolutely
                        must be parsed to properly extract the information.
                        For more information, see http://www.crummy.com/software/BeautifulSoup/documentation.html#Improving%20Memory%20Usage%20with%20extract
                        """
                        raise NotImplementedError


        class XMLSearch(ScrapeSearch):
                """
                ScrapeSearch variant that parses the response with
                BeautifulStoneSoup instead of BeautifulSoup.
                """
                # Tag names passed to the parser as selfClosingTags.
                _self_closing_tags = []

                def parse_response(self, response, limit=None):
                        "Strain the response and pass up to <limit> top-level matches to parse_results."
                        strainer = self.strainer
                        soup = BeautifulStoneSoup(response, selfClosingTags=self._self_closing_tags, parseOnlyThese=strainer)
                        return self.parse_results(soup.findAll(recursive=False, limit=limit))
402                         soup = BeautifulStoneSoup(response, selfClosingTags=self._self_closing_tags, parseOnlyThese=strainer)
403                         return self.parse_results(soup.findAll(recursive=False, limit=limit))