3 from django.conf import settings
4 from django.contrib.sites.models import Site
5 from django.core.cache import cache
6 from django.db.models.options import get_verbose_name as convert_camelcase
7 from django.utils import simplejson as json
8 from django.utils.http import urlquote_plus
9 from django.utils.safestring import mark_safe
10 from django.utils.text import capfirst
11 from django.template import loader, Context, Template
13 from philo.contrib.sobol.utils import make_tracking_querydict
16 from eventlet.green import urllib2
22 'Result', 'BaseSearch', 'DatabaseSearch', 'URLSearch', 'JSONSearch', 'GoogleSearch', 'registry'
# Single cache key under which every search's cached results are stored.
SEARCH_CACHE_KEY = 'philo_sobol_search_results'
# Minimal fallback markup: the result title, wrapped in a link when a url is present.
DEFAULT_RESULT_TEMPLATE_STRING = "{% if url %}<a href='{{ url }}'>{% endif %}{{ title }}{% if url %}</a>{% endif %}"
DEFAULT_RESULT_TEMPLATE = Template(DEFAULT_RESULT_TEMPLATE_STRING)

# Determines the timeout on the entire result cache.
# NOTE(review): this value is passed to cache.set(), which takes seconds, but
# 60*24*7 reads like the number of minutes in a week — confirm the intended unit.
MAX_CACHE_TIMEOUT = 60*24*7
class RegistrationError(Exception):
    """Raised by SearchRegistry when a slug conflicts or an unregistration fails."""
class SearchRegistry(object):
    # Holds a registry of search types by slug.
    def register(self, search, slug=None):
        """Register `search` under `slug`, defaulting to the search's own slug.

        Raises RegistrationError when the slug is already held by a search
        from a different module.
        """
        slug = slug or search.slug
        if slug in self._registry:
            registered = self._registry[slug]
            # A same-slug search from another module is a genuine conflict.
            if registered.__module__ != search.__module__:
                raise RegistrationError("A different search is already registered as `%s`" % slug)
        # NOTE(review): a line appears to be elided just above (possibly an
        # `else:`) — confirm whether same-module collisions should overwrite.
        self._registry[slug] = search
    def unregister(self, search, slug=None):
        """Remove `search` (registered as `slug`) from the registry.

        NOTE(review): as shown, the raise below is unconditional, so even a
        successful removal raises RegistrationError, and the loop after it is
        unreachable. Lines appear to be elided here (e.g. an
        `if slug is not None:` guard and an `else:` around the loop) —
        recover the original from version control.
        """
        if slug in self._registry and self._registry[slug] == search:
            del self._registry[slug]
        raise RegistrationError("`%s` is not registered as `%s`" % (search, slug))
        # Unreachable as shown (see NOTE above).
        for slug, search in self._registry.items():
            del self._registry[slug]
        # NOTE(review): the two returns below look like the bodies of `items`
        # and `iteritems` accessors whose `def` lines were lost (iterchoices
        # below calls self.iteritems()) — recover from version control.
        return self._registry.items()
        return self._registry.iteritems()
68 def iterchoices(self):
69 for slug, search in self.iteritems():
70 yield slug, search.verbose_name
72 def __getitem__(self, key):
73 return self._registry[key]
        # NOTE(review): body of __iter__ — its `def` line is not visible here.
        return self._registry.__iter__()
# Module-level singleton registry; searches are registered here by slug
# (GoogleSearch is registered below).
registry = SearchRegistry()
84 A result is instantiated with a configuration dictionary, a search,
85 and a template name. The configuration dictionary is expected to
86 define a `title` and optionally a `url`. Any other variables may be
87 defined; they will be made available through the result object in
88 the template, if one is defined.
90 def __init__(self, search, result):
95 return self.search.get_result_title(self.result)
98 qd = self.search.get_result_querydict(self.result)
101 return "?%s" % qd.urlencode()
103 def get_template(self):
104 return self.search.get_result_template(self.result)
106 def get_extra_context(self):
107 return self.search.get_result_extra_context(self.result)
    def get_context(self):
        """Build the template context: extra context plus `title` and `url`."""
        context = self.get_extra_context()
        # NOTE(review): the enclosing `context.update({` ... `})` and a final
        # `return context` appear to be elided here — recover from VCS.
        'title': self.get_title(),
        'url': self.get_url()
118 t = self.get_template()
119 c = Context(self.get_context())
122 def __unicode__(self):
class BaseSearchMetaclass(type):
    """Metaclass ensuring every search class has `verbose_name` and `slug` attributes."""
    def __new__(cls, name, bases, attrs):
        has_verbose_name = 'verbose_name' in attrs
        has_slug = 'slug' in attrs
        # Only derive the fallback when the class body didn't declare one;
        # convert_camelcase is needless work otherwise.
        if not has_verbose_name:
            attrs['verbose_name'] = capfirst(convert_camelcase(name))
        if not has_slug:
            attrs['slug'] = name.lower()
        return super(BaseSearchMetaclass, cls).__new__(cls, name, bases, attrs)
class BaseSearch(object):
    """
    Defines a generic search interface. Accessing self.results will
    attempt to retrieve cached results and, if that fails, will
    initiate a new search and store the results in the cache.
    """
    __metaclass__ = BaseSearchMetaclass  # supplies default verbose_name/slug
    # Minutes a cached result set stays fresh (48 hours); consumed by
    # _set_cached_results via timedelta(minutes=...).
    # NOTE(review): a `result_limit` attribute is referenced below but its
    # definition is not visible in this chunk — confirm.
    _cache_timeout = 60*48
    def __init__(self, search_arg):
        # The raw query string this search was instantiated for.
        self.search_arg = search_arg
    def _get_cached_results(self):
        """Return the cached results if the results haven't timed out. Otherwise return None."""
        # Cache layout (see _set_cached_results):
        # {search_class: {lowercased search_arg: {'results': [...], 'timeout': datetime}}}
        result_cache = cache.get(SEARCH_CACHE_KEY)
        if result_cache and self.__class__ in result_cache and self.search_arg.lower() in result_cache[self.__class__]:
            cached = result_cache[self.__class__][self.search_arg.lower()]
            # 'timeout' is the expiry instant; stale entries fall through to None.
            if cached['timeout'] >= datetime.datetime.now():
                return cached['results']
    def _set_cached_results(self, results, timeout):
        """Sets the results to the cache for <timeout> minutes."""
        result_cache = cache.get(SEARCH_CACHE_KEY) or {}
        cached = result_cache.setdefault(self.__class__, {}).setdefault(self.search_arg.lower(), {})
        # NOTE(review): lines appear elided around the entry below (likely
        # `cached.update({'results': results,` ... `})`) — recover from VCS.
        'timeout': datetime.datetime.now() + datetime.timedelta(minutes=timeout)
        # All searches share one cache entry, capped by MAX_CACHE_TIMEOUT.
        cache.set(SEARCH_CACHE_KEY, result_cache, MAX_CACHE_TIMEOUT)
        # NOTE(review): fragment of the `results` property — its decorator/def
        # lines and the surrounding try/except appear to be elided here.
        if not hasattr(self, '_results'):
            results = self._get_cached_results()
            # Cache one extra result so we can see if there are
            # more results to be had.
            limit = self.result_limit
            if limit is not None:
                # NOTE(review): `limit` (presumably result_limit + 1 per the
                # comment above) is computed but get_results is called with
                # self.result_limit — looks like a bug or an elided line; confirm.
                results = self.get_results(self.result_limit)
            # On exceptions, don't set any cache; just return.
            self._set_cached_results(results, self._cache_timeout)
            self._results = results
    def get_results(self, limit=None, result_class=Result):
        """
        Calls self.search() and parses the return value into Result objects.
        """
        results = self.search(limit)
        return [result_class(self, result) for result in results]
    def search(self, limit=None):
        """
        Returns an iterable of up to <limit> results. The
        get_result_title, get_result_url, get_result_template, and
        get_result_extra_context methods will be used to interpret the
        individual items that this function returns, so the result can
        be an object with attributes as easily as a dictionary
        with keys. The only restriction is that the objects be
        pickleable so that they can be used with django's cache system.
        """
        raise NotImplementedError
    def get_result_title(self, result):
        """Subclasses return the display title for a raw result item."""
        raise NotImplementedError
    def get_result_url(self, result):
        "Subclasses override this to provide the actual URL for the result."
        # Consumed by get_result_querydict to build the tracking redirect.
        raise NotImplementedError
    def get_result_querydict(self, result):
        """Tracking querydict for `result`, built from its URL and the search term."""
        url = self.get_result_url(result)
        # NOTE(review): lines appear elided here (perhaps a None-url guard) —
        # confirm make_tracking_querydict's expectations.
        return make_tracking_querydict(self.search_arg, url)
222 def get_result_template(self, result):
223 if hasattr(self, 'result_template'):
224 return loader.get_template(self.result_template)
225 if not hasattr(self, '_result_template'):
226 self._result_template = DEFAULT_RESULT_TEMPLATE
227 return self._result_template
229 def get_result_extra_context(self, result):
232 def has_more_results(self):
233 """Useful to determine whether to display a `view more results` link."""
234 return len(self.results) > self.result_limit
    def more_results_url(self):
        """
        Returns the actual url for more results. This will be encoded
        into a querystring for tracking purposes.
        """
        # NOTE(review): accessed as an attribute in more_results_querydict —
        # presumably decorated with @property (decorator line not visible here).
        raise NotImplementedError
    def more_results_querydict(self):
        """Tracking querydict pointing at more_results_url."""
        # NOTE(review): presumably also an @property — decorator line not visible.
        return make_tracking_querydict(self.search_arg, self.more_results_url)
248 def __unicode__(self):
249 return ' '.join(self.__class__.verbose_name.rsplit(' ', 1)[:-1]) + ' results'
class DatabaseSearch(BaseSearch):
    # NOTE(review): lines elided after the class header (likely `model = ...`);
    # self.model is referenced by get_queryset below.
    def has_more_results(self):
        """True when the full queryset holds more rows than result_limit."""
        return self.get_queryset().count() > self.result_limit
    def search(self, limit=None):
        """Build (once) and slice the queryset for this search."""
        if not hasattr(self, '_qs'):
            self._qs = self.get_queryset()
            if limit is not None:
                self._qs = self._qs[:limit]
        # NOTE(review): a `return self._qs` appears to be elided here — as
        # shown the method returns None; recover from VCS.
266 def get_queryset(self):
267 return self.model._default_manager.all()
class URLSearch(BaseSearch):
    """
    Defines a generic interface for searches that require accessing a
    certain url to get search results.
    """
    # NOTE(review): a `search_url` attribute appears to be elided here; it is
    # referenced by the `url` property below.
    query_format_str = "%s"  # applied to the url-quoted search term
280 "The URL where the search gets its results."
281 return self.search_url + self.query_format_str % urlquote_plus(self.search_arg)
284 def more_results_url(self):
285 "The URL where the users would go to get more results."
    def parse_response(self, response, limit=None):
        """Hook for subclasses: turn the opened response into raw results."""
        raise NotImplementedError
291 def search(self, limit=None):
292 return self.parse_response(urllib2.urlopen(self.url), limit=limit)
class JSONSearch(URLSearch):
    """
    Makes a GET request and parses the results as JSON. The default
    behavior assumes that the return value is a list of results.
    """
    def parse_response(self, response, limit=None):
        """Decode the response body as JSON and truncate to `limit` items."""
        parsed = json.loads(response.read())
        return parsed[:limit]
class GoogleSearch(JSONSearch):
    # Google AJAX Search API endpoint and query format (v1.0, q=<term>).
    search_url = "http://ajax.googleapis.com/ajax/services/search/web"
    query_format_str = "?v=1.0&q=%s"
    # TODO: Change this template to reflect the app's actual name.
    result_template = 'search/googlesearch.html'
    def parse_response(self, response, limit=None):
        """Unwrap the API envelope, capture cursor info, and return the results."""
        responseData = json.loads(response.read())['responseData']
        results, cursor = responseData['results'], responseData['cursor']
        # NOTE(review): lines appear elided above these assignments (likely a
        # guard for cursors lacking these keys) — confirm against VCS.
        self._more_results_url = cursor['moreResultsUrl']
        self._estimated_result_count = cursor['estimatedResultCount']
        return results[:limit]
323 # Google requires that an ajax request have a proper Referer header.
324 return urllib2.Request(
325 super(GoogleSearch, self).url,
327 {'Referer': "http://%s" % Site.objects.get_current().domain}
331 def has_more_results(self):
332 if self.results and len(self.results) < self._estimated_result_count:
    def more_results_url(self):
        """URL Google supplied for the next page (captured in parse_response)."""
        # NOTE(review): presumably an @property — decorator line not visible here.
        return self._more_results_url
340 def get_result_title(self, result):
341 return result['titleNoFormatting']
343 def get_result_url(self, result):
344 return result['unescapedUrl']
346 def get_result_extra_context(self, result):
# Register the built-in Google search; its slug defaults to 'googlesearch'
# via BaseSearchMetaclass.
registry.register(GoogleSearch)
354 from BeautifulSoup import BeautifulSoup, SoupStrainer, BeautifulStoneSoup
358 __all__ += ('ScrapeSearch', 'XMLSearch',)
class ScrapeSearch(URLSearch):
    # Extra kwargs for SoupStrainer. NOTE(review): a companion `_strainer_args`
    # sequence is referenced below but its definition is not visible here.
    _strainer_kwargs = {}
        # NOTE(review): body of a memoizing `strainer` accessor (presumably an
        # @property) whose def line is not visible here.
        if not hasattr(self, '_strainer'):
            self._strainer = SoupStrainer(*self._strainer_args, **self._strainer_kwargs)
        return self._strainer
369 def parse_response(self, response, limit=None):
370 strainer = self.strainer
371 soup = BeautifulSoup(response, parseOnlyThese=strainer)
372 return self.parse_results(soup.findAll(recursive=False, limit=limit))
    def parse_results(self, results):
        """
        Provides a hook for parsing the results of straining. This
        has no default behavior because the results absolutely
        must be parsed to properly extract the information.
        For more information, see http://www.crummy.com/software/BeautifulSoup/documentation.html#Improving%20Memory%20Usage%20with%20extract
        """
        raise NotImplementedError
class XMLSearch(ScrapeSearch):
    """A ScrapeSearch whose responses are XML, parsed with BeautifulStoneSoup."""
    # Tag names the XML parser should treat as self-closing.
    _self_closing_tags = []

    def parse_response(self, response, limit=None):
        """Parse the XML response, strain it, and delegate to parse_results."""
        soup = BeautifulStoneSoup(response, selfClosingTags=self._self_closing_tags, parseOnlyThese=self.strainer)
        matches = soup.findAll(recursive=False, limit=limit)
        return self.parse_results(matches)