3 from django.conf import settings
4 from django.contrib.sites.models import Site
5 from django.core.cache import cache
6 from django.db.models.options import get_verbose_name as convert_camelcase
7 from django.utils import simplejson as json
8 from django.utils.http import urlquote_plus
9 from django.utils.safestring import mark_safe
10 from django.utils.text import capfirst
11 from django.template import loader, Context, Template
13 from philo.contrib.sobol.utils import make_tracking_querydict
16 from eventlet.green import urllib2
22 'Result', 'BaseSearch', 'DatabaseSearch', 'URLSearch', 'JSONSearch', 'GoogleSearch', 'registry'
# Cache key under which the entire two-level result cache is stored
# (see BaseSearch._get_cached_results / _set_cached_results).
SEARCH_CACHE_KEY = 'philo_sobol_search_results'
# Fallback template for rendering a single result when a search class does not
# declare `result_template`: a link when the result has a URL, bare text otherwise.
DEFAULT_RESULT_TEMPLATE_STRING = "{% if url %}<a href='{{ url }}'>{% endif %}{{ title }}{% if url %}</a>{% endif %}"
DEFAULT_RESULT_TEMPLATE = Template(DEFAULT_RESULT_TEMPLATE_STRING)

# Determines the timeout on the entire result cache.
# NOTE(review): 60*24*7 (= 10080) reads like "minutes in a week", but django's
# cache.set() takes its timeout in *seconds* -- confirm the intended duration.
MAX_CACHE_TIMEOUT = 60*24*7
class RegistrationError(Exception):
    """Raised when a search cannot be registered with, or removed from, a SearchRegistry."""
    pass
class SearchRegistry(object):
    # Holds a registry of search types by slug.
    def __init__(self):
        self._registry = {}

    def register(self, search, slug=None):
        """Register `search` under `slug` (defaults to `search.slug`).

        Registering a search from a *different* module under an
        already-taken slug raises RegistrationError; re-registering from
        the same module silently overwrites.
        """
        slug = slug or search.slug
        if slug in self._registry:
            registered = self._registry[slug]
            if registered.__module__ != search.__module__:
                raise RegistrationError("A different search is already registered as `%s`" % slug)
        self._registry[slug] = search

    def unregister(self, search, slug=None):
        """Remove `search` from the registry.

        With an explicit `slug`, remove that single entry (raising
        RegistrationError if `search` is not registered under it). With no
        slug, remove every entry mapping to `search`.
        """
        if slug is not None:
            if slug in self._registry and self._registry[slug] == search:
                del self._registry[slug]
            else:
                # BUGFIX: this raise was previously unconditional, so even a
                # successful unregistration raised RegistrationError.
                raise RegistrationError("`%s` is not registered as `%s`" % (search, slug))
        else:
            # Iterate over a copy so deletion can't disturb iteration.
            for slug, registered in list(self._registry.items()):
                if registered == search:
                    del self._registry[slug]

    def items(self):
        return self._registry.items()

    def iteritems(self):
        return self._registry.iteritems()

    def iterchoices(self):
        """Yield (slug, verbose_name) pairs, e.g. for form-field choices."""
        for slug, search in self.iteritems():
            yield slug, search.verbose_name

    def __getitem__(self, key):
        return self._registry[key]

    def __iter__(self):
        return self._registry.__iter__()
# Module-level singleton registry; search classes are registered here on import
# (see registry.register(GoogleSearch) below).
registry = SearchRegistry()
"""
A result is instantiated with a configuration dictionary, a search,
and a template name. The configuration dictionary is expected to
define a `title` and optionally a `url`. Any other variables may be
defined; they will be made available through the result object in
the template, if one is defined.
"""
def __init__(self, search, result):
    # NOTE(review): assignment lines are missing from this chunk; the methods
    # below read self.search and self.result, so presumably both are stored here.

# --- get_title (its `def` line is missing from this chunk) ---
# Delegates title extraction to the owning search.
    return self.search.get_result_title(self.result)

# --- get_url (its `def` line is missing; likely also a None-guard for qd) ---
# Renders the tracking querydict as a relative querystring URL.
    qd = self.search.get_result_querydict(self.result)
    return "?%s" % qd.urlencode()

def get_template(self):
    # Delegate template selection to the owning search.
    return self.search.get_result_template(self.result)

def get_extra_context(self):
    # Delegate extra template context to the owning search.
    return self.search.get_result_extra_context(self.result)

def get_context(self):
    # Build the rendering context: extra context plus title/url.
    context = self.get_extra_context()
    # NOTE(review): the surrounding `context.update({ ... })` / `return context`
    # lines are missing from this chunk; only the dict entries are visible.
    'title': self.get_title(),
    'url': self.get_url()

# --- render (its `def` line is missing from this chunk) ---
# Renders the chosen template against the built context.
    t = self.get_template()
    c = Context(self.get_context())

def __unicode__(self):
    # (body missing from this chunk; presumably returns the rendered result)
class BaseSearchMetaclass(type):
    """Metaclass that supplies `verbose_name` and `slug` defaults for search classes."""
    def __new__(cls, name, bases, attrs):
        # Derive a human-readable name from the CamelCase class name unless the
        # class declares its own.
        if 'verbose_name' not in attrs:
            attrs['verbose_name'] = capfirst(convert_camelcase(name))
        # The default slug is simply the lowercased class name.
        attrs.setdefault('slug', name.lower())
        return super(BaseSearchMetaclass, cls).__new__(cls, name, bases, attrs)
class BaseSearch(object):
    """
    Defines a generic search interface. Accessing self.results will
    attempt to retrieve cached results and, if that fails, will
    initiate a new search and store the results in the cache.
    """
    __metaclass__ = BaseSearchMetaclass
    # NOTE(review): a `result_limit` attribute is read by the results property
    # and has_more_results, but its assignment is not visible in this chunk.
    # Per-instance cache lifetime, in minutes (consumed by _set_cached_results).
    _cache_timeout = 60*48
def __init__(self, search_arg):
    # The raw search string; lowercased elsewhere when used as a cache key.
    self.search_arg = search_arg
def _get_cached_results(self):
    """Return the cached results if the results haven't timed out. Otherwise return None."""
    result_cache = cache.get(SEARCH_CACHE_KEY)
    # The cache value is a two-level dict: {search class: {lowercased arg: entry}}.
    if result_cache and self.__class__ in result_cache and self.search_arg.lower() in result_cache[self.__class__]:
        cached = result_cache[self.__class__][self.search_arg.lower()]
        # Each entry carries its own expiry timestamp (set by _set_cached_results).
        if cached['timeout'] >= datetime.datetime.now():
            return cached['results']
    # Implicitly returns None when the entry is missing or expired.
def _set_cached_results(self, results, timeout):
    """Sets the results to the cache for <timeout> minutes."""
    result_cache = cache.get(SEARCH_CACHE_KEY) or {}
    cached = result_cache.setdefault(self.__class__, {}).setdefault(self.search_arg.lower(), {})
    # NOTE(review): the surrounding `cached.update({ ... })` lines (including the
    # 'results' entry) are missing from this chunk; only the expiry entry is visible.
    'timeout': datetime.datetime.now() + datetime.timedelta(minutes=timeout)
    cache.set(SEARCH_CACHE_KEY, result_cache, MAX_CACHE_TIMEOUT)
# --- results property (its @property/def lines are missing from this chunk) ---
# Lazily computes and memoizes search results on first access.
if not hasattr(self, '_results'):
    results = self._get_cached_results()
    # (lines missing here -- presumably `if results is None:` plus a try block)
    # Cache one extra result so we can see if there are
    # more results to be had.
    limit = self.result_limit
    if limit is not None:
        # (line missing here -- presumably `limit += 1`)
        results = self.get_results(limit)
    # On exceptions, don't set any cache; just return.
    self._set_cached_results(results, self._cache_timeout)
    self._results = results
# (line missing here -- presumably `return self._results`)
def get_results(self, limit=None, result_class=Result):
    """
    Calls self.search() and parses the return value into Result objects.
    """
    results = self.search(limit)
    return [result_class(self, result) for result in results]
def search(self, limit=None):
    """
    Returns an iterable of up to <limit> results. The
    get_result_title, get_result_url, get_result_template, and
    get_result_extra_context methods will be used to interpret the
    individual items that this function returns, so the result can
    be an object with attributes as easily as a dictionary
    with keys. The only restriction is that the objects be
    pickleable so that they can be used with django's cache system.
    """
    raise NotImplementedError
def get_result_title(self, result):
    # Subclasses override this to extract a title from a raw result.
    raise NotImplementedError

def get_result_url(self, result):
    "Subclasses override this to provide the actual URL for the result."
    raise NotImplementedError
def get_result_querydict(self, result):
    # Build the click-tracking querydict for a result's URL.
    url = self.get_result_url(result)
    # (lines missing here -- presumably returns None when url is None)
    return make_tracking_querydict(self.search_arg, url)
def get_result_template(self, result):
    """Return the template used to render a single result.

    A search class may declare `result_template` (a template name) to have
    it loaded on every call; otherwise the module-wide default template is
    used, memoized on the instance.
    """
    if hasattr(self, 'result_template'):
        return loader.get_template(self.result_template)
    try:
        return self._result_template
    except AttributeError:
        self._result_template = DEFAULT_RESULT_TEMPLATE
        return self._result_template
def get_result_extra_context(self, result):
    # (body missing from this chunk; presumably returns a dict of extra context)

def has_more_results(self):
    """Useful to determine whether to display a `view more results` link."""
    # self.results is fetched with one extra result beyond result_limit (see the
    # results property), so a strict > means more results exist upstream.
    return len(self.results) > self.result_limit
# (decorator line missing from this chunk -- presumably `@property`)
def more_results_url(self):
    """
    Returns the actual url for more results. This will be encoded
    into a querystring for tracking purposes.
    """
    raise NotImplementedError
# (decorator line missing from this chunk -- presumably `@property`; note that
#  self.more_results_url is accessed below without being called)
def more_results_querydict(self):
    return make_tracking_querydict(self.search_arg, self.more_results_url)
def __unicode__(self):
    """Human-readable label: the verbose_name minus its last word, plus 'results'."""
    words = self.__class__.verbose_name.rsplit(' ', 1)
    return ' '.join(words[:-1]) + ' results'
class DatabaseSearch(BaseSearch):
    # NOTE(review): a `model` attribute is expected (see get_queryset), but its
    # declaration is not visible in this chunk.
    def search(self, limit=None):
        # Build and memoize the queryset on first call, slicing to `limit` if given.
        if not hasattr(self, '_qs'):
            self._qs = self.get_queryset()
            if limit is not None:
                self._qs = self._qs[:limit]
        # (return line missing from this chunk; presumably `return self._qs`)

    def get_queryset(self):
        # Default queryset: all instances of self.model.
        return self.model._default_manager.all()
class URLSearch(BaseSearch):
    """
    Defines a generic interface for searches that require accessing a
    certain url to get search results.
    """
    # NOTE(review): a `search_url` attribute is used below, but its declaration
    # is not visible in this chunk.
    # Format string interpolated with the URL-quoted search argument.
    query_format_str = "%s"

    # (decorator/def lines missing from this chunk -- presumably `@property def url(self):`)
        "The URL where the search gets its results."
        return self.search_url + self.query_format_str % urlquote_plus(self.search_arg)

    # (decorator/def lines missing -- presumably `@property def more_results_url(self):`)
        "The URL where the users would go to get more results."
        # (body missing from this chunk)

    def parse_response(self, response, limit=None):
        # Hook: subclasses parse the HTTP response into raw results.
        raise NotImplementedError
def search(self, limit=None):
    """Fetch self.url and hand the open response off to parse_response."""
    response = urllib2.urlopen(self.url)
    return self.parse_response(response, limit=limit)
class JSONSearch(URLSearch):
    """
    Makes a GET request and parses the results as JSON. The default
    behavior assumes that the return value is a list of results.
    """
def parse_response(self, response, limit=None):
    """Decode the response body as a JSON list, truncated to `limit` items."""
    parsed = json.loads(response.read())
    return parsed[:limit]
class GoogleSearch(JSONSearch):
    # Searches Google via its AJAX web-search JSON API.
    search_url = "http://ajax.googleapis.com/ajax/services/search/web"
    query_format_str = "?v=1.0&q=%s"
    # TODO: Change this template to reflect the app's actual name.
    result_template = 'search/googlesearch.html'

    def parse_response(self, response, limit=None):
        # The API wraps its payload in a 'responseData' envelope.
        responseData = json.loads(response.read())['responseData']
        results, cursor = responseData['results'], responseData['cursor']
        # (guard lines missing from this chunk -- the cursor keys below may be optional)
        self._more_results_url = cursor['moreResultsUrl']
        self._estimated_result_count = cursor['estimatedResultCount']
        return results[:limit]

    # (decorator/def lines missing from this chunk -- presumably
    #  `@property def url(self):` overriding URLSearch.url)
        # Google requires that an ajax request have a proper Referer header.
        return urllib2.Request(
            super(GoogleSearch, self).url,
            # (data-argument line missing from this chunk)
            {'Referer': "http://%s" % Site.objects.get_current().domain}
        # (closing lines missing from this chunk)

    def has_more_results(self):
        if self.results and len(self.results) < self._estimated_result_count:
        # (return lines missing from this chunk)

    # (decorator line missing from this chunk -- presumably `@property`)
    def more_results_url(self):
        return self._more_results_url

    def get_result_title(self, result):
        # Google supplies a ready-made unformatted title field.
        return result['titleNoFormatting']

    def get_result_url(self, result):
        return result['unescapedUrl']

    def get_result_extra_context(self, result):
        # (body missing from this chunk)

registry.register(GoogleSearch)
351 from BeautifulSoup import BeautifulSoup, SoupStrainer, BeautifulStoneSoup
355 __all__ += ('ScrapeSearch', 'XMLSearch',)
class ScrapeSearch(URLSearch):
    # URLSearch that scrapes its results out of markup with BeautifulSoup.
    # NOTE(review): `_strainer_args` is used below, but its declaration is not
    # visible in this chunk.
    # Keyword arguments passed to SoupStrainer to limit what gets parsed.
    _strainer_kwargs = {}

    # (decorator/def lines missing from this chunk -- presumably
    #  `@property def strainer(self):`)
        # Build and memoize the SoupStrainer on first access.
        if not hasattr(self, '_strainer'):
            self._strainer = SoupStrainer(*self._strainer_args, **self._strainer_kwargs)
        return self._strainer
def parse_response(self, response, limit=None):
    """Soup-parse the response, restricted by self.strainer, and hand the
    top-level matches to parse_results."""
    soup = BeautifulSoup(response, parseOnlyThese=self.strainer)
    matches = soup.findAll(recursive=False, limit=limit)
    return self.parse_results(matches)
def parse_results(self, results):
    """
    Provides a hook for parsing the results of straining. This
    has no default behavior because the results absolutely
    must be parsed to properly extract the information.
    For more information, see http://www.crummy.com/software/BeautifulSoup/documentation.html#Improving%20Memory%20Usage%20with%20extract
    """
    raise NotImplementedError
class XMLSearch(ScrapeSearch):
    # ScrapeSearch variant that parses XML with BeautifulStoneSoup.
    # Tags BeautifulStoneSoup should treat as self-closing while parsing.
    _self_closing_tags = []
def parse_response(self, response, limit=None):
    """Parse the response as XML via BeautifulStoneSoup (honoring the
    configured self-closing tags and strainer) and delegate to parse_results."""
    soup = BeautifulStoneSoup(response, selfClosingTags=self._self_closing_tags, parseOnlyThese=self.strainer)
    return self.parse_results(soup.findAll(recursive=False, limit=limit))