import datetime

from django.conf import settings
from django.contrib.sites.models import Site
from django.core.cache import cache
from django.db.models.options import get_verbose_name as convert_camelcase
from django.template import loader, Context, Template
from django.utils import simplejson as json
from django.utils.http import urlquote_plus
from django.utils.safestring import mark_safe
from django.utils.text import capfirst

from eventlet.green import urllib2

from philo.contrib.sobol.utils import make_tracking_querydict
__all__ = (
    'Result', 'BaseSearch', 'DatabaseSearch', 'URLSearch', 'JSONSearch', 'GoogleSearch', 'registry'
)


# Key under which the entire search-result cache dictionary is stored.
SEARCH_CACHE_KEY = 'philo_sobol_search_results'
# Fallback template used when a search class declares no `result_template`.
DEFAULT_RESULT_TEMPLATE_STRING = "{% if url %}<a href='{{ url }}'>{% endif %}{{ title }}{% if url %}</a>{% endif %}"

# Determines the timeout on the entire result cache.
MAX_CACHE_TIMEOUT = 60*60*24*7
class RegistrationError(Exception):
    """Raised when a search cannot be registered or unregistered."""
class SearchRegistry(object):
    """
    Holds a registry of search types by slug.

    Registration is idempotent for the same (slug, search) pair; registering
    a *different* search under an occupied slug raises RegistrationError.
    """
    def __init__(self):
        # slug -> search class mapping.
        self._registry = {}

    def register(self, search, slug=None):
        """Register `search` under `slug` (defaults to `search.slug`)."""
        slug = slug or search.slug
        if slug in self._registry and self._registry[slug] != search:
            # BUG FIX: the original raised a message containing a literal
            # `%s` — the slug was never interpolated.
            raise RegistrationError("A different search is already registered as `%s`" % slug)
        self._registry[slug] = search

    def unregister(self, search, slug=None):
        """
        Remove `search` from the registry.

        With a `slug`, remove only that registration, raising
        RegistrationError if `search` is not registered under it. Without a
        slug, silently remove every registration of `search`.
        """
        if slug is not None:
            if slug in self._registry and self._registry[slug] == search:
                del self._registry[slug]
            else:
                # BUG FIX: the original raised unconditionally, even after a
                # successful removal; only raise when nothing was removed.
                raise RegistrationError("`%s` is not registered as `%s`" % (search, slug))
        else:
            # list() so we can delete while iterating (no-op list copy in py2).
            for slug, registered in list(self._registry.items()):
                if registered == search:
                    del self._registry[slug]

    def items(self):
        return self._registry.items()

    def iteritems(self):
        return self._registry.iteritems()

    def iterchoices(self):
        """Yield (slug, verbose_name) pairs, e.g. for a form field's choices."""
        for slug, search in self.iteritems():
            yield slug, search.verbose_name

    def __getitem__(self, key):
        return self._registry[key]

    def __iter__(self):
        return self._registry.__iter__()
# Module-level singleton registry; searches register themselves against it below.
registry = SearchRegistry()
class Result(object):
    """
    Wraps one raw result produced by a search.

    A Result delegates all interpretation of the raw ``result`` object to
    the ``search`` that produced it: the title, tracking URL, template, and
    extra context are obtained via the search's ``get_result_*`` hooks, so
    the raw result may be a dictionary, an object with attributes, or
    anything else the search understands.
    """
    def __init__(self, search, result):
        self.search = search
        self.result = result

    def get_title(self):
        """Title for this result, as interpreted by the owning search."""
        return self.search.get_result_title(self.result)

    def get_url(self):
        """Tracking querystring URL for this result."""
        qd = self.search.get_result_querydict(self.result)
        return "?%s" % qd.urlencode()

    def get_template(self):
        """Template used to render this result."""
        return self.search.get_result_template(self.result)

    def get_extra_context(self):
        """Any additional template context the search supplies."""
        return self.search.get_result_extra_context(self.result)

    def get_context(self):
        """Full template context: the extra context plus `title` and `url`."""
        context = self.get_extra_context()
        context.update({
            'title': self.get_title(),
            'url': self.get_url()
        })
        return context

    def render(self):
        """Render this result with its template and context."""
        template = self.get_template()
        return template.render(Context(self.get_context()))

    def __unicode__(self):
        return self.render()
class BaseSearchMetaclass(type):
    """
    Ensures every search class carries a `verbose_name` and a `slug`,
    deriving them from the class name when the class does not declare them.
    """
    def __new__(cls, name, bases, attrs):
        # Kept lazy on purpose: the django helpers are only invoked when
        # `verbose_name` is actually absent.
        if 'verbose_name' not in attrs:
            attrs['verbose_name'] = capfirst(convert_camelcase(name))
        attrs.setdefault('slug', name.lower())
        return super(BaseSearchMetaclass, cls).__new__(cls, name, bases, attrs)
class BaseSearch(object):
    """
    Defines a generic search interface. Accessing self.results will
    attempt to retrieve cached results and, if that fails, will
    initiate a new search and store the results in the cache.
    """
    __metaclass__ = BaseSearchMetaclass
    # Maximum number of results to display; None means no limit.
    # NOTE(review): default value reconstructed — confirm against upstream.
    result_limit = 10
    # Per-class cache lifetime, in minutes (48 hours).
    _cache_timeout = 60*48

    def __init__(self, search_arg):
        # search_arg: the raw query string the user searched for.
        self.search_arg = search_arg

    def _get_cached_results(self):
        """Return the cached results if the results haven't timed out. Otherwise return None."""
        result_cache = cache.get(SEARCH_CACHE_KEY)
        if result_cache and self.__class__ in result_cache and self.search_arg.lower() in result_cache[self.__class__]:
            cached = result_cache[self.__class__][self.search_arg.lower()]
            if cached['timeout'] >= datetime.datetime.now():
                return cached['results']
        return None

    def _set_cached_results(self, results, timeout):
        """Sets the results to the cache for <timeout> minutes."""
        result_cache = cache.get(SEARCH_CACHE_KEY) or {}
        cached = result_cache.setdefault(self.__class__, {}).setdefault(self.search_arg.lower(), {})
        cached.update({
            'results': results,
            'timeout': datetime.datetime.now() + datetime.timedelta(minutes=timeout)
        })
        cache.set(SEARCH_CACHE_KEY, result_cache, MAX_CACHE_TIMEOUT)

    @property
    def results(self):
        """Cached-or-freshly-fetched results; returns [] on search failure."""
        if not hasattr(self, '_results'):
            results = self._get_cached_results()
            if results is None:
                try:
                    # Cache one extra result so we can see if there are
                    # more results to be had.
                    limit = self.result_limit
                    if limit is not None:
                        limit += 1
                    # BUG FIX: pass the incremented `limit`; the original
                    # passed `self.result_limit`, never fetching the extra
                    # result the comment above promises.
                    results = self.get_results(limit)
                except Exception:
                    if settings.DEBUG:
                        raise
                    # On exceptions, don't set any cache; just return.
                    return []
                self._set_cached_results(results, self._cache_timeout)
            self._results = results
        return self._results

    def get_results(self, limit=None, result_class=Result):
        """
        Calls self.search() and parses the return value into Result objects.
        """
        results = self.search(limit)
        return [result_class(self, result) for result in results]

    def search(self, limit=None):
        """
        Returns an iterable of up to <limit> results. The
        get_result_title, get_result_url, get_result_template, and
        get_result_extra_context methods will be used to interpret the
        individual items that this function returns, so the result can
        be an object with attributes as easily as a dictionary
        with keys. The only restriction is that the objects be
        pickleable so that they can be used with django's cache system.
        """
        raise NotImplementedError

    def get_result_title(self, result):
        raise NotImplementedError

    def get_result_url(self, result):
        "Subclasses override this to provide the actual URL for the result."
        raise NotImplementedError

    def get_result_querydict(self, result):
        """Tracking querydict pointing at this result's real URL."""
        return make_tracking_querydict(self.search_arg, self.get_result_url(result))

    def get_result_template(self, result):
        # Prefer a class-declared template; otherwise fall back to a
        # lazily-compiled default template, memoized on the instance.
        if hasattr(self, 'result_template'):
            return loader.get_template(self.result_template)
        if not hasattr(self, '_result_template'):
            self._result_template = Template(DEFAULT_RESULT_TEMPLATE_STRING)
        return self._result_template

    def get_result_extra_context(self, result):
        return {}

    def has_more_results(self):
        """Useful to determine whether to display a `view more results` link."""
        # Works because `results` fetches one result beyond result_limit.
        return len(self.results) > self.result_limit

    @property
    def more_results_url(self):
        """
        Returns the actual url for more results. This will be encoded
        into a querystring for tracking purposes.
        """
        raise NotImplementedError

    @property
    def more_results_querydict(self):
        return make_tracking_querydict(self.search_arg, self.more_results_url)

    def __unicode__(self):
        # e.g. verbose_name "Google search" -> "Google results".
        return ' '.join(self.__class__.verbose_name.rsplit(' ', 1)[:-1]) + ' results'
class DatabaseSearch(BaseSearch):
    """Searches a queryset from a database model; subclasses set `model`."""
    def has_more_results(self):
        # Count the full queryset rather than relying on the fetched slice.
        return self.get_queryset().count() > self.result_limit

    def search(self, limit=None):
        # Build (and optionally slice) the queryset once, then reuse it.
        if not hasattr(self, '_qs'):
            qs = self.get_queryset()
            if limit is not None:
                qs = qs[:limit]
            self._qs = qs
        return self._qs

    def get_queryset(self):
        return self.model._default_manager.all()
class URLSearch(BaseSearch):
    """
    Defines a generic interface for searches that require accessing a
    certain url to get search results.
    """
    # Format string the (quoted) search term is substituted into.
    query_format_str = "%s"

    @property
    def url(self):
        "The URL where the search gets its results."
        return self.search_url + self.query_format_str % urlquote_plus(self.search_arg)

    @property
    def more_results_url(self):
        "The URL where the users would go to get more results."
        return self.url

    def parse_response(self, response, limit=None):
        raise NotImplementedError

    def search(self, limit=None):
        # Fetch the URL and hand the open response to the parser.
        return self.parse_response(urllib2.urlopen(self.url), limit=limit)
class JSONSearch(URLSearch):
    """
    Makes a GET request and parses the results as JSON. The default
    behavior assumes that the return value is a list of results.
    """
    def parse_response(self, response, limit=None):
        parsed = json.loads(response.read())
        return parsed[:limit]
class GoogleSearch(JSONSearch):
    """Searches the web with Google's AJAX web-search API."""
    search_url = "http://ajax.googleapis.com/ajax/services/search/web"
    query_format_str = "?v=1.0&q=%s"
    # TODO: Change this template to reflect the app's actual name.
    result_template = 'search/googlesearch.html'

    def parse_response(self, response, limit=None):
        responseData = json.loads(response.read())['responseData']
        results, cursor = responseData['results'], responseData['cursor']

        if results:
            # Stash cursor info for has_more_results / more_results_url.
            self._more_results_url = cursor['moreResultsUrl']
            self._estimated_result_count = cursor['estimatedResultCount']

        return results[:limit]

    @property
    def url(self):
        # Google requires that an ajax request have a proper Referer header.
        return urllib2.Request(
            super(GoogleSearch, self).url,
            None,
            {'Referer': "http://%s" % Site.objects.get_current().domain}
        )

    def has_more_results(self):
        # BUG FIX: the API returns estimatedResultCount as a JSON *string*;
        # in python 2, `int < str` is always True, so the original claimed
        # more results unconditionally. Compare as integers.
        if self.results and len(self.results) < int(self._estimated_result_count):
            return True
        return False

    @property
    def more_results_url(self):
        return self._more_results_url

    def get_result_title(self, result):
        return result['titleNoFormatting']

    def get_result_url(self, result):
        return result['unescapedUrl']

    def get_result_extra_context(self, result):
        # Expose the whole raw result dict to the template.
        return result
# Make GoogleSearch available by default under its auto-generated slug.
registry.register(GoogleSearch)
# NOTE(review): in the original source this import almost certainly sits in a
# try/except guard so BeautifulSoup stays an optional dependency (the scraping
# searches below only exist when it imports) — confirm against upstream.
from BeautifulSoup import BeautifulSoup, SoupStrainer, BeautifulStoneSoup

# Export the scraping searches only when BeautifulSoup is available.
__all__ += ('ScrapeSearch', 'XMLSearch',)
class ScrapeSearch(URLSearch):
    """A URLSearch that scrapes its results out of fetched HTML via BeautifulSoup."""
    # Arguments used to construct the SoupStrainer parse filter.
    # NOTE(review): `_strainer_args` is referenced below but its definition was
    # lost in extraction — reinstated here; confirm against upstream.
    _strainer_args = []
    _strainer_kwargs = {}

    @property
    def strainer(self):
        """Lazily build and memoize the SoupStrainer that limits parsing."""
        if not hasattr(self, '_strainer'):
            self._strainer = SoupStrainer(*self._strainer_args, **self._strainer_kwargs)
        return self._strainer

    def parse_response(self, response, limit=None):
        soup = BeautifulSoup(response, parseOnlyThese=self.strainer)
        return self.parse_results(soup[:limit])

    def parse_results(self, results):
        """
        Provides a hook for parsing the results of straining. This
        has no default behavior because the results absolutely
        must be parsed to properly extract the information.
        For more information, see http://www.crummy.com/software/BeautifulSoup/documentation.html#Improving%20Memory%20Usage%20with%20extract
        """
        raise NotImplementedError
class XMLSearch(ScrapeSearch):
    """A ScrapeSearch that parses XML responses with BeautifulStoneSoup."""
    # Tag names that must be treated as self-closing when parsing the XML.
    _self_closing_tags = []

    def parse_response(self, response, limit=None):
        strainer = self.strainer
        # BUG FIX: the original passed an undefined name `page` here, which
        # raised NameError on every call; the parameter is `response`.
        soup = BeautifulStoneSoup(response, selfClosingTags=self._self_closing_tags, parseOnlyThese=strainer)
        return self.parse_results(soup[:limit])