Merge branch 'shipherd_hotfix'
[philo.git] / contrib / sobol / search.py
1 #encoding: utf-8
2
3 from django.conf import settings
4 from django.contrib.sites.models import Site
5 from django.core.cache import cache
6 from django.db.models.options import get_verbose_name as convert_camelcase
7 from django.utils import simplejson as json
8 from django.utils.http import urlquote_plus
9 from django.utils.safestring import mark_safe
10 from django.utils.text import capfirst
11 from django.template import loader, Context, Template
12 import datetime
13 from philo.contrib.sobol.utils import make_tracking_querydict
14
try:
    # Use eventlet's "green" urllib2 when eventlet is installed.
    from eventlet.green import urllib2
except ImportError:
    # Only a missing eventlet should trigger the fallback; any other error
    # raised while importing it is a real problem and must not be hidden.
    import urllib2
19
20
# Public names of this module. Kept as a tuple so that optional search
# classes can be appended to it further down with ``+=``.
__all__ = (
    'Result',
    'BaseSearch',
    'DatabaseSearch',
    'URLSearch',
    'JSONSearch',
    'GoogleSearch',
    'registry',
)


# Key under which the whole result cache is stored in django's cache.
SEARCH_CACHE_KEY = 'philo_sobol_search_results'

# Fallback markup for a single result: the title, wrapped in a link when a
# url is available.
DEFAULT_RESULT_TEMPLATE_STRING = (
    "{% if url %}<a href='{{ url }}'>{% endif %}"
    "{{ title }}"
    "{% if url %}</a>{% endif %}"
)
DEFAULT_RESULT_TEMPLATE = Template(DEFAULT_RESULT_TEMPLATE_STRING)
29
# Determines the timeout on the entire result cache. The value is handed to
# django's cache.set(), which expects seconds, so this is one week expressed
# in seconds. (The previous value, 60*24*7, was a week in *minutes* and made
# the whole cache expire after less than three hours.)
MAX_CACHE_TIMEOUT = 60 * 60 * 24 * 7
32
33
class RegistrationError(Exception):
    """Raised when a search cannot be registered or unregistered as requested."""
36
37
class SearchRegistry(object):
    """Holds a registry of search types by slug."""

    def __init__(self):
        # Maps slug -> search class.
        self._registry = {}

    def register(self, search, slug=None):
        """
        Register ``search`` under ``slug`` (defaults to ``search.slug``).

        If the slug is already taken by a search whose ``__module__``
        differs from that of ``search``, a RegistrationError is raised. If
        it is taken by a search from the same module, the existing
        registration is left untouched.
        """
        slug = slug or search.slug
        if slug in self._registry:
            registered = self._registry[slug]
            if registered.__module__ != search.__module__:
                raise RegistrationError("A different search is already registered as `%s`" % slug)
        else:
            self._registry[slug] = search

    def unregister(self, search, slug=None):
        """
        Remove ``search`` from the registry.

        With a ``slug``, only that registration is removed, and a
        RegistrationError is raised if ``search`` is not registered under
        it. Without a ``slug``, every registration of ``search`` is removed;
        nothing happens if there is none.
        """
        if slug is not None:
            if slug in self._registry and self._registry[slug] == search:
                del self._registry[slug]
                return
            raise RegistrationError("`%s` is not registered as `%s`" % (search, slug))

        # Iterate over a snapshot so entries can be deleted while looping,
        # and use distinct loop names so the ``search`` argument is not
        # shadowed (which would make the comparison always true).
        for registered_slug, registered_search in list(self._registry.items()):
            if registered_search == search:
                del self._registry[registered_slug]

    def items(self):
        """Return the registered (slug, search) pairs."""
        return self._registry.items()

    def iteritems(self):
        """Return an iterator over the registered (slug, search) pairs."""
        return iter(self._registry.items())

    def iterchoices(self):
        """Yield (slug, verbose_name) pairs for every registered search."""
        for slug, search in self.iteritems():
            yield slug, search.verbose_name

    def __getitem__(self, key):
        return self._registry[key]

    def __iter__(self):
        # Iterates over the registered slugs.
        return self._registry.__iter__()


# Module-level registry instance that searches are registered with.
registry = SearchRegistry()
80
81
class Result(object):
    """
    Wraps a single raw result together with the search that produced it.

    The title, url, template and extra context are all obtained by passing
    the raw result to the search's ``get_result_*`` methods. When rendered,
    the template context contains ``title`` and ``url`` in addition to
    whatever extra context the search supplies.
    """
    def __init__(self, search, result):
        self.search = search
        self.result = result

    def get_title(self):
        return self.search.get_result_title(self.result)

    def get_url(self):
        """
        Return the result's url as a querystring ("?..."), or an empty
        string if the search provides no querydict for this result.
        """
        qd = self.search.get_result_querydict(self.result)
        if qd is None:
            return ""
        return "?%s" % qd.urlencode()

    def get_template(self):
        return self.search.get_result_template(self.result)

    def get_extra_context(self):
        return self.search.get_result_extra_context(self.result)

    def get_context(self):
        """Return the extra context plus the ``title`` and ``url`` keys."""
        # Work on a copy: a search may hand back a dict it still owns
        # (GoogleSearch returns the raw result itself), and that dict must
        # not gain ``title``/``url`` keys as a side effect of rendering.
        context = dict(self.get_extra_context())
        context.update({
            'title': self.get_title(),
            'url': self.get_url()
        })
        return context

    def render(self):
        """Render the result's template with the result's context."""
        t = self.get_template()
        c = Context(self.get_context())
        return t.render(c)

    def __unicode__(self):
        return self.render()
124
125
class BaseSearchMetaclass(type):
    """
    Supplies default ``verbose_name`` and ``slug`` attributes for classes
    that do not define them in their own body: the verbose name is derived
    from the class name, the slug is the lowercased class name.
    """
    def __new__(cls, name, bases, attrs):
        needs_verbose_name = 'verbose_name' not in attrs
        needs_slug = 'slug' not in attrs

        if needs_verbose_name:
            readable_name = convert_camelcase(name)
            attrs['verbose_name'] = capfirst(readable_name)
        if needs_slug:
            attrs['slug'] = name.lower()

        parent = super(BaseSearchMetaclass, cls)
        return parent.__new__(cls, name, bases, attrs)
133
134
class BaseSearch(object):
    """
    Defines a generic search interface. Accessing self.results will
    attempt to retrieve cached results and, if that fails, will
    initiate a new search and store the results in the cache.
    """
    __metaclass__ = BaseSearchMetaclass
    # Number of results wanted; None means no limit is passed to search().
    result_limit = 10
    # Lifetime of this search's cached results, in minutes.
    _cache_timeout = 60*48

    def __init__(self, search_arg):
        self.search_arg = search_arg

    def _get_cached_results(self):
        """Return the cached results if the results haven't timed out. Otherwise return None."""
        result_cache = cache.get(SEARCH_CACHE_KEY)
        # Entries are stored per search class and per lowercased search string.
        arg_key = self.search_arg.lower()
        if result_cache and self.__class__ in result_cache and arg_key in result_cache[self.__class__]:
            cached = result_cache[self.__class__][arg_key]
            if cached['timeout'] >= datetime.datetime.now():
                return cached['results']
        return None

    def _set_cached_results(self, results, timeout):
        """Sets the results to the cache for <timeout> minutes."""
        result_cache = cache.get(SEARCH_CACHE_KEY) or {}
        cached = result_cache.setdefault(self.__class__, {}).setdefault(self.search_arg.lower(), {})
        cached.update({
            'results': results,
            'timeout': datetime.datetime.now() + datetime.timedelta(minutes=timeout)
        })
        # The whole result cache lives under a single key with its own timeout.
        cache.set(SEARCH_CACHE_KEY, result_cache, MAX_CACHE_TIMEOUT)

    @property
    def results(self):
        """
        The list of results for this search, taken from the cache when
        possible and otherwise fetched with get_results() and cached.

        If fetching fails, the exception is re-raised when settings.DEBUG is
        on; otherwise an empty list is returned and nothing is cached or
        remembered, so a later access will try again.
        """
        if not hasattr(self, '_results'):
            results = self._get_cached_results()
            if results is None:
                try:
                    # Cache one extra result so we can see if there are
                    # more results to be had.
                    limit = self.result_limit
                    if limit is not None:
                        limit += 1
                    results = self.get_results(limit)
                except Exception:
                    # Only ordinary errors are swallowed; KeyboardInterrupt
                    # and SystemExit always propagate.
                    if settings.DEBUG:
                        raise
                    #  On exceptions, don't set any cache; just return.
                    return []

                self._set_cached_results(results, self._cache_timeout)
            self._results = results

        return self._results

    def get_results(self, limit=None, result_class=Result):
        """
        Calls self.search() and parses the return value into Result objects.
        """
        results = self.search(limit)
        return [result_class(self, result) for result in results]

    def search(self, limit=None):
        """
        Returns an iterable of up to <limit> results. The
        get_result_title, get_result_url, get_result_template, and
        get_result_extra_context methods will be used to interpret the
        individual items that this function returns, so the result can
        be an object with attributes as easily as a dictionary
        with keys. The only restriction is that the objects be
        pickleable so that they can be used with django's cache system.
        """
        raise NotImplementedError

    def get_result_title(self, result):
        "Subclasses override this to provide the title for the result."
        raise NotImplementedError

    def get_result_url(self, result):
        "Subclasses override this to provide the actual URL for the result."
        raise NotImplementedError

    def get_result_querydict(self, result):
        """
        Return the tracking querydict for the result's url, or None if the
        result has no url.
        """
        url = self.get_result_url(result)
        if url is None:
            return None
        return make_tracking_querydict(self.search_arg, url)

    def get_result_template(self, result):
        """
        Return the template used to render a result: the template named by
        ``result_template`` if the search defines one, otherwise the default
        result template.
        """
        if hasattr(self, 'result_template'):
            return loader.get_template(self.result_template)
        if not hasattr(self, '_result_template'):
            self._result_template = DEFAULT_RESULT_TEMPLATE
        return self._result_template

    def get_result_extra_context(self, result):
        "Subclasses may override this to add variables to the result's template context."
        return {}

    def has_more_results(self):
        """Useful to determine whether to display a `view more results` link."""
        limit = self.result_limit
        if limit is None:
            # Without a limit nothing was cut off, so there is nothing more
            # to show (and comparing a length against None is meaningless).
            return False
        return len(self.results) > limit

    @property
    def more_results_url(self):
        """
        Returns the actual url for more results. This will be encoded
        into a querystring for tracking purposes.
        """
        raise NotImplementedError

    @property
    def more_results_querydict(self):
        return make_tracking_querydict(self.search_arg, self.more_results_url)

    def __unicode__(self):
        # Drops the last space-separated word of the verbose name and
        # appends ' results'.
        return ' '.join(self.__class__.verbose_name.rsplit(' ', 1)[:-1]) + ' results'
250
251
class DatabaseSearch(BaseSearch):
    """
    A search whose results come from a queryset. By default that is every
    object of ``model``; override get_queryset() to narrow it down.
    """
    model = None

    def search(self, limit=None):
        """
        Return the queryset of results, sliced to ``limit`` if one is given.
        The queryset is built on the first call and reused afterwards.
        """
        if hasattr(self, '_qs'):
            return self._qs

        self._qs = self.get_queryset()
        if limit is not None:
            self._qs = self._qs[:limit]
        return self._qs

    def get_queryset(self):
        """Return the queryset that search() draws its results from."""
        manager = self.model._default_manager
        return manager.all()
265
266
class URLSearch(BaseSearch):
    """
    Base class for searches that fetch their results from a url. The url is
    built from ``search_url`` and ``query_format_str``; subclasses turn the
    response into results in parse_response().
    """
    search_url = ''
    query_format_str = "%s"

    @property
    def url(self):
        "The URL where the search gets its results."
        base = self.search_url
        query_format = self.query_format_str
        # The search string is url-quoted before being substituted in.
        return base + query_format % urlquote_plus(self.search_arg)

    @property
    def more_results_url(self):
        "The URL where the users would go to get more results."
        return self.url

    def parse_response(self, response, limit=None):
        "Subclasses override this to turn the response into up to <limit> results."
        raise NotImplementedError

    def search(self, limit=None):
        "Open the search's url and return the parsed response."
        response = urllib2.urlopen(self.url)
        return self.parse_response(response, limit=limit)
290
291
class JSONSearch(URLSearch):
    """
    A url search whose response body is JSON. By default the decoded
    document is expected to be a list of results.
    """
    def parse_response(self, response, limit=None):
        "Decode the response body as JSON and return its first <limit> items."
        body = response.read()
        decoded = json.loads(body)
        return decoded[:limit]
299
300
class GoogleSearch(JSONSearch):
    """
    Searches the current site through Google's AJAX web search service.
    """
    search_url = "http://ajax.googleapis.com/ajax/services/search/web"
    # TODO: Change this template to reflect the app's actual name.
    result_template = 'search/googlesearch.html'
    _cache_timeout = 60
    verbose_name = "Google search (current site)"

    @property
    def query_format_str(self):
        """
        The query part of the request url, with the default arguments
        already filled in and a ``%s`` left for the search string.
        """
        default_args = self.default_args
        if default_args:
            default_args += " "
        # Percent signs produced by quoting are doubled so that they survive
        # the later ``%`` substitution of the search string.
        return "?v=1.0&q=%s%%s" % urlquote_plus(default_args).replace('%', '%%')

    @property
    def default_args(self):
        "Search terms always sent along: restrict results to the current site's domain."
        return "site:%s" % Site.objects.get_current().domain

    def parse_response(self, response, limit=None):
        """
        Return up to <limit> results from the JSON response. When there are
        results, the more-results url and the estimated result count are
        remembered on the instance.
        """
        responseData = json.loads(response.read())['responseData']
        results, cursor = responseData['results'], responseData['cursor']

        if results:
            self._more_results_url = cursor['moreResultsUrl']
            self._estimated_result_count = cursor['estimatedResultCount']

        return results[:limit]

    @property
    def url(self):
        # Google requires that an ajax request have a proper Referer header.
        return urllib2.Request(
            super(GoogleSearch, self).url,
            None,
            {'Referer': "http://%s" % Site.objects.get_current().domain}
        )

    @property
    def has_more_results(self):
        """
        True if Google's estimated result count exceeds the number of
        results at hand. False when there are no results or when no usable
        estimate is available.
        """
        # Access the results first: a live search is what sets the estimate.
        results = self.results
        if not results:
            return False
        # The estimate is missing when the results came from the cache, since
        # only parse_response() sets it.
        estimated = getattr(self, '_estimated_result_count', None)
        try:
            # The service's JSON may deliver the count as a string, so it is
            # converted before the numeric comparison.
            estimated = int(estimated)
        except (TypeError, ValueError):
            return False
        return len(results) < estimated

    @property
    def more_results_url(self):
        # Only available after a live search that returned results; see
        # parse_response().
        return self._more_results_url

    def get_result_title(self, result):
        return result['titleNoFormatting']

    def get_result_url(self, result):
        return result['unescapedUrl']

    def get_result_extra_context(self, result):
        # The raw result dict is exposed to the template as-is.
        return result


registry.register(GoogleSearch)
359
360
try:
    from BeautifulSoup import BeautifulSoup, SoupStrainer, BeautifulStoneSoup
except ImportError:
    # BeautifulSoup is optional: without it the scraping searches below are
    # simply not defined. Any other error raised by the import propagates.
    pass
else:
    __all__ += ('ScrapeSearch', 'XMLSearch',)

    class ScrapeSearch(URLSearch):
        """
        A url search that parses the response with BeautifulSoup, keeping
        only what the strainer built from ``_strainer_args`` and
        ``_strainer_kwargs`` lets through.
        """
        # Positional and keyword arguments for the SoupStrainer. These are
        # shared class attributes; override them, don't mutate them.
        _strainer_args = []
        _strainer_kwargs = {}

        @property
        def strainer(self):
            "The SoupStrainer for this search, created on first access."
            if not hasattr(self, '_strainer'):
                self._strainer = SoupStrainer(*self._strainer_args, **self._strainer_kwargs)
            return self._strainer

        def parse_response(self, response, limit=None):
            "Strain the response and pass up to <limit> top-level matches to parse_results()."
            strainer = self.strainer
            soup = BeautifulSoup(response, parseOnlyThese=strainer)
            return self.parse_results(soup.findAll(recursive=False, limit=limit))

        def parse_results(self, results):
            """
            Provides a hook for parsing the results of straining. This
            has no default behavior because the results absolutely
            must be parsed to properly extract the information.
            For more information, see http://www.crummy.com/software/BeautifulSoup/documentation.html#Improving%20Memory%20Usage%20with%20extract
            """
            raise NotImplementedError


    class XMLSearch(ScrapeSearch):
        """
        A scrape search for XML responses, parsed with BeautifulStoneSoup.
        """
        # Tag names passed to BeautifulStoneSoup as ``selfClosingTags``.
        _self_closing_tags = []

        def parse_response(self, response, limit=None):
            "Strain the XML response and pass up to <limit> top-level matches to parse_results()."
            strainer = self.strainer
            soup = BeautifulStoneSoup(response, selfClosingTags=self._self_closing_tags, parseOnlyThese=strainer)
            return self.parse_results(soup.findAll(recursive=False, limit=limit))