import datetime

from django.conf import settings
from django.contrib.sites.models import Site
from django.core.cache import cache
from django.db.models.options import get_verbose_name as convert_camelcase
from django.utils import simplejson as json
from django.utils.http import urlquote_plus
from django.utils.safestring import mark_safe
from django.utils.text import capfirst
from django.template import loader, Context, Template

from philo.contrib.sobol.utils import make_tracking_querydict

try:
    # Prefer eventlet's non-blocking urllib2 when it's available.
    from eventlet.green import urllib2
except ImportError:
    import urllib2

__all__ = (
    'Result', 'BaseSearch', 'DatabaseSearch', 'URLSearch', 'JSONSearch', 'GoogleSearch', 'registry'
)

SEARCH_CACHE_KEY = 'philo_sobol_search_results'
DEFAULT_RESULT_TEMPLATE_STRING = "{% if url %}<a href='{{ url }}'>{% endif %}{{ title }}{% if url %}</a>{% endif %}"

# Determines the timeout on the entire result cache.
MAX_CACHE_TIMEOUT = 60*60*24*7

class RegistrationError(Exception):
    pass

class SearchRegistry(object):
    # Holds a registry of search types by slug.
    def __init__(self):
        self._registry = {}
    
    def register(self, search, slug=None):
        slug = slug or search.slug
        if slug in self._registry:
            registered = self._registry[slug]
            if registered.__module__ != search.__module__:
                raise RegistrationError("A different search is already registered as `%s`" % slug)
        else:
            self._registry[slug] = search
    
    def unregister(self, search, slug=None):
        if slug is not None:
            if slug in self._registry and self._registry[slug] == search:
                del self._registry[slug]
            else:
                raise RegistrationError("`%s` is not registered as `%s`" % (search, slug))
        else:
            for slug, registered in self._registry.items():
                if registered == search:
                    del self._registry[slug]
    
    def items(self):
        return self._registry.items()
    
    def iteritems(self):
        return self._registry.iteritems()
    
    def iterchoices(self):
        for slug, search in self.iteritems():
            yield slug, search.verbose_name
    
    def __getitem__(self, key):
        return self._registry[key]
    
    def __iter__(self):
        return self._registry.__iter__()


registry = SearchRegistry()

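
# Usage sketch (illustrative; not part of the original module): searches are
# registered under a slug and can be looked up by key or iterated to build
# form choices. `MySearch` here is a hypothetical BaseSearch subclass.
#
#     registry.register(MySearch)                 # registers under MySearch.slug
#     registry.register(MySearch, slug='custom')  # or under an explicit slug
#     search_class = registry['custom']
#     choices = list(registry.iterchoices())      # [(slug, verbose_name), ...]
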
class Result(object):
    """
    A result is instantiated with a search instance and a raw result
    (for example a dictionary or model instance). The search's
    get_result_* methods are used to interpret the raw result: it must
    provide a `title` and may provide a `url`, a template, and any
    extra context, all of which are made available when the result is
    rendered.
    """
    def __init__(self, search, result):
        self.search = search
        self.result = result
    
    def get_title(self):
        return self.search.get_result_title(self.result)
    
    def get_url(self):
        return "?%s" % self.search.get_result_querydict(self.result).urlencode()
    
    def get_template(self):
        return self.search.get_result_template(self.result)
    
    def get_extra_context(self):
        return self.search.get_result_extra_context(self.result)
    
    def get_context(self):
        context = self.get_extra_context()
        context.update({
            'title': self.get_title(),
            'url': self.get_url()
        })
        return context
    
    def render(self):
        t = self.get_template()
        c = Context(self.get_context())
        return t.render(c)
    
    def __unicode__(self):
        return self.render()

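
# Usage sketch (illustrative; not part of the original module): Result objects
# are normally created by BaseSearch.get_results(). Rendering one produces the
# DEFAULT_RESULT_TEMPLATE_STRING markup unless the owning search defines a
# `result_template` of its own.
#
#     for result in some_search.results:   # `some_search` is hypothetical
#         html = result.render()           # e.g. u"<a href='?url=...'>Title</a>"
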
class BaseSearchMetaclass(type):
    def __new__(cls, name, bases, attrs):
        if 'verbose_name' not in attrs:
            attrs['verbose_name'] = capfirst(convert_camelcase(name))
        if 'slug' not in attrs:
            attrs['slug'] = name.lower()
        return super(BaseSearchMetaclass, cls).__new__(cls, name, bases, attrs)

class BaseSearch(object):
    """
    Defines a generic search interface. Accessing self.results will
    attempt to retrieve cached results and, if that fails, will
    initiate a new search and store the results in the cache.
    """
    __metaclass__ = BaseSearchMetaclass
    result_limit = 10
    _cache_timeout = 60*48
    
    def __init__(self, search_arg):
        self.search_arg = search_arg
    def _get_cached_results(self):
        """Return the cached results if the results haven't timed out. Otherwise return None."""
        result_cache = cache.get(SEARCH_CACHE_KEY)
        if result_cache and self.__class__ in result_cache and self.search_arg.lower() in result_cache[self.__class__]:
            cached = result_cache[self.__class__][self.search_arg.lower()]
            if cached['timeout'] >= datetime.datetime.now():
                return cached['results']
        return None
    def _set_cached_results(self, results, timeout):
        """Sets the results to the cache for <timeout> minutes."""
        result_cache = cache.get(SEARCH_CACHE_KEY) or {}
        cached = result_cache.setdefault(self.__class__, {}).setdefault(self.search_arg.lower(), {})
        cached.update({
            'results': results,
            'timeout': datetime.datetime.now() + datetime.timedelta(minutes=timeout)
        })
        cache.set(SEARCH_CACHE_KEY, result_cache, MAX_CACHE_TIMEOUT)
    @property
    def results(self):
        if not hasattr(self, '_results'):
            results = self._get_cached_results()
            if results is None:
                try:
                    # Cache one extra result so we can see if there are
                    # more results to be had.
                    limit = self.result_limit
                    if limit is not None:
                        limit += 1
                    results = self.get_results(limit)
                except Exception:
                    if settings.DEBUG:
                        raise
                    # On exceptions, don't set any cache; just return.
                    return []
                self._set_cached_results(results, self._cache_timeout)
            self._results = results
        return self._results
    def get_results(self, limit=None, result_class=Result):
        """
        Calls self.search() and parses the return value into Result objects.
        """
        results = self.search(limit)
        return [result_class(self, result) for result in results]
    
    def search(self, limit=None):
        """
        Returns an iterable of up to <limit> results. The
        get_result_title, get_result_url, get_result_template, and
        get_result_extra_context methods will be used to interpret the
        individual items that this function returns, so the result can
        be an object with attributes as easily as a dictionary
        with keys. The only restriction is that the objects be
        pickleable so that they can be used with django's cache system.
        """
        raise NotImplementedError
    def get_result_title(self, result):
        raise NotImplementedError
    
    def get_result_url(self, result):
        "Subclasses override this to provide the actual URL for the result."
        raise NotImplementedError
    
    def get_result_querydict(self, result):
        return make_tracking_querydict(self.search_arg, self.get_result_url(result))
    
    def get_result_template(self, result):
        if hasattr(self, 'result_template'):
            return loader.get_template(self.result_template)
        if not hasattr(self, '_result_template'):
            self._result_template = Template(DEFAULT_RESULT_TEMPLATE_STRING)
        return self._result_template
    def get_result_extra_context(self, result):
        return {}
    
    def has_more_results(self):
        """Useful to determine whether to display a `view more results` link."""
        return len(self.results) > self.result_limit
    
    @property
    def more_results_url(self):
        """
        Returns the actual url for more results. This will be encoded
        into a querystring for tracking purposes.
        """
        raise NotImplementedError
    
    @property
    def more_results_querydict(self):
        return make_tracking_querydict(self.search_arg, self.more_results_url)
    
    def __unicode__(self):
        return ' '.join(self.__class__.verbose_name.rsplit(' ', 1)[:-1]) + ' results'

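
# Illustrative sketch (not part of the original module): a minimal BaseSearch
# subclass that searches a hard-coded list. A subclass only needs to implement
# search() and whichever get_result_* hooks its results require; caching,
# Result wrapping, and templating are inherited from BaseSearch.
class ExampleListSearch(BaseSearch):
    _haystack = (
        ('Home', 'http://example.com/'),
        ('About', 'http://example.com/about/'),
    )
    
    def search(self, limit=None):
        # Results are (title, url) tuples; anything pickleable works.
        matches = [item for item in self._haystack if self.search_arg.lower() in item[0].lower()]
        return matches[:limit]
    
    def get_result_title(self, result):
        return result[0]
    
    def get_result_url(self, result):
        return result[1]
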
class DatabaseSearch(BaseSearch):
    model = None
    
    def has_more_results(self):
        return self.get_queryset().count() > self.result_limit
    
    def search(self, limit=None):
        if not hasattr(self, '_qs'):
            self._qs = self.get_queryset()
            if limit is not None:
                self._qs = self._qs[:limit]
        return self._qs
    
    def get_queryset(self):
        return self.model._default_manager.all()

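
# Illustrative sketch (not part of the original module): a DatabaseSearch
# subclass only has to point at a model and say how to title and link each
# result. `NewsArticle` and its fields are hypothetical.
#
#     class NewsArticleSearch(DatabaseSearch):
#         model = NewsArticle
#         
#         def get_queryset(self):
#             return NewsArticle.objects.filter(title__icontains=self.search_arg)
#         
#         def get_result_title(self, result):
#             return result.title
#         
#         def get_result_url(self, result):
#             return result.get_absolute_url()
#     
#     registry.register(NewsArticleSearch, slug='news')
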
class URLSearch(BaseSearch):
    """
    Defines a generic interface for searches that require accessing a
    certain url to get search results.
    """
    search_url = ''
    query_format_str = "%s"
    
    @property
    def url(self):
        "The URL where the search gets its results."
        return self.search_url + self.query_format_str % urlquote_plus(self.search_arg)
    
    @property
    def more_results_url(self):
        "The URL where the users would go to get more results."
        return self.url
    
    def parse_response(self, response, limit=None):
        raise NotImplementedError
    
    def search(self, limit=None):
        return self.parse_response(urllib2.urlopen(self.url), limit=limit)

class JSONSearch(URLSearch):
    """
    Makes a GET request and parses the results as JSON. The default
    behavior assumes that the return value is a list of results.
    """
    def parse_response(self, response, limit=None):
        return json.loads(response.read())[:limit]

class GoogleSearch(JSONSearch):
    search_url = "http://ajax.googleapis.com/ajax/services/search/web"
    query_format_str = "?v=1.0&q=%s"
    # TODO: Change this template to reflect the app's actual name.
    result_template = 'search/googlesearch.html'
    
    def parse_response(self, response, limit=None):
        responseData = json.loads(response.read())['responseData']
        results, cursor = responseData['results'], responseData['cursor']
        
        if results:
            self._more_results_url = cursor['moreResultsUrl']
            # The API returns the estimated count as a string; store it as an
            # int so it compares correctly against len(self.results).
            self._estimated_result_count = int(cursor['estimatedResultCount'])
        
        return results[:limit]
    @property
    def url(self):
        # Google requires that an ajax request have a proper Referer header.
        return urllib2.Request(
            super(GoogleSearch, self).url,
            None,
            {'Referer': "http://%s" % Site.objects.get_current().domain}
        )
    
    def has_more_results(self):
        if self.results and len(self.results) < self._estimated_result_count:
            return True
        return False
    
    @property
    def more_results_url(self):
        return self._more_results_url
    
    def get_result_title(self, result):
        return result['titleNoFormatting']
    
    def get_result_url(self, result):
        return result['unescapedUrl']
    
    def get_result_extra_context(self, result):
        return result

registry.register(GoogleSearch)

try:
    from BeautifulSoup import BeautifulSoup, SoupStrainer, BeautifulStoneSoup
except ImportError:
    pass
else:
    __all__ += ('ScrapeSearch', 'XMLSearch',)
    
    class ScrapeSearch(URLSearch):
        _strainer_args = []
        _strainer_kwargs = {}
        
        @property
        def strainer(self):
            if not hasattr(self, '_strainer'):
                self._strainer = SoupStrainer(*self._strainer_args, **self._strainer_kwargs)
            return self._strainer
        
        def parse_response(self, response, limit=None):
            strainer = self.strainer
            soup = BeautifulSoup(response, parseOnlyThese=strainer)
            return self.parse_results(soup[:limit])
        
        def parse_results(self, results):
            """
            Provides a hook for parsing the results of straining. This
            has no default behavior because the results absolutely
            must be parsed to properly extract the information.
            For more information, see http://www.crummy.com/software/BeautifulSoup/documentation.html#Improving%20Memory%20Usage%20with%20extract
            """
            raise NotImplementedError
    
    class XMLSearch(ScrapeSearch):
        _self_closing_tags = []
        
        def parse_response(self, response, limit=None):
            strainer = self.strainer
            soup = BeautifulStoneSoup(response, selfClosingTags=self._self_closing_tags, parseOnlyThese=strainer)
            return self.parse_results(soup[:limit])
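
# Illustrative sketch (not part of the original module): a ScrapeSearch subclass
# strains a hypothetical results page down to its anchor tags and turns each one
# into a {'title': ..., 'url': ...} dict in parse_results(). The endpoint and
# markup below are assumptions.
#
#     class ExampleScrapeSearch(ScrapeSearch):
#         search_url = "http://example.com/search"
#         query_format_str = "?q=%s"
#         _strainer_args = ['a']
#         _strainer_kwargs = {'attrs': {'class': 'result'}}
#         
#         def parse_results(self, results):
#             return [{'title': a.string, 'url': a['href']} for a in results]
#         
#         def get_result_title(self, result):
#             return result['title']
#         
#         def get_result_url(self, result):
#             return result['url']
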