import datetime

from django.conf import settings
from django.contrib.sites.models import Site
from django.core.cache import cache
from django.db.models.options import get_verbose_name as convert_camelcase
from django.utils import simplejson as json
from django.utils.http import urlquote_plus
from django.utils.safestring import mark_safe
from django.utils.text import capfirst
from django.template import loader, Context, Template

from philo.contrib.sobol.utils import make_tracking_querydict

try:
    from eventlet.green import urllib2
except ImportError:
    import urllib2
__all__ = (
    'Result', 'BaseSearch', 'DatabaseSearch', 'URLSearch', 'JSONSearch', 'GoogleSearch', 'registry'
)
SEARCH_CACHE_KEY = 'philo_sobol_search_results'
DEFAULT_RESULT_TEMPLATE_STRING = "{% if url %}<a href='{{ url }}'>{% endif %}{{ title }}{% if url %}</a>{% endif %}"
DEFAULT_RESULT_TEMPLATE = Template(DEFAULT_RESULT_TEMPLATE_STRING)

# Determines the timeout, in seconds, on the entire result cache.
MAX_CACHE_TIMEOUT = 60*24*7
class RegistrationError(Exception):
    pass


class SearchRegistry(object):
    # Holds a registry of search types by slug.
    def __init__(self):
        self._registry = {}
    def register(self, search, slug=None):
        slug = slug or search.slug
        if slug in self._registry:
            registered = self._registry[slug]
            if registered.__module__ != search.__module__:
                raise RegistrationError("A different search is already registered as `%s`" % slug)
        else:
            self._registry[slug] = search
    def unregister(self, search, slug=None):
        if slug is not None:
            if slug in self._registry and self._registry[slug] == search:
                del self._registry[slug]
            else:
                raise RegistrationError("`%s` is not registered as `%s`" % (search, slug))
        else:
            for slug, registered in self._registry.items():
                if registered == search:
                    del self._registry[slug]
    def items(self):
        return self._registry.items()
    
    def iteritems(self):
        return self._registry.iteritems()
    
    def iterchoices(self):
        for slug, search in self.iteritems():
            yield slug, search.verbose_name
    
    def __getitem__(self, key):
        return self._registry[key]
    
    def __iter__(self):
        return self._registry.__iter__()
registry = SearchRegistry()
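# Example usage (a sketch, not part of the original module; `EventSearch`
# is a hypothetical BaseSearch subclass defined elsewhere):
#
#     registry.register(EventSearch)
#     registry['eventsearch']          # -> EventSearch
#     list(registry.iterchoices())     # -> roughly [('eventsearch', u'Event search')]
#     registry.unregister(EventSearch, 'eventsearch')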
class Result(object):
    """
    A result is instantiated with a search instance and a raw result
    as returned by that search. The search's get_result_title,
    get_result_url, get_result_template, and get_result_extra_context
    methods are used to interpret the raw result; any extra variables
    they supply will be made available in the result's template.
    """
    def __init__(self, search, result):
        self.search = search
        self.result = result
    def get_title(self):
        return self.search.get_result_title(self.result)
    
    def get_url(self):
        qd = self.search.get_result_querydict(self.result)
        if qd is None:
            return ""
        return "?%s" % qd.urlencode()
    
    def get_template(self):
        return self.search.get_result_template(self.result)
    
    def get_extra_context(self):
        return self.search.get_result_extra_context(self.result)
    
    def get_context(self):
        context = self.get_extra_context()
        context.update({
            'title': self.get_title(),
            'url': self.get_url()
        })
        return context
    
    def render(self):
        t = self.get_template()
        c = Context(self.get_context())
        return t.render(c)
    
    def __unicode__(self):
        return self.render()
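# Illustration (not in the original source): a search with no
# result_template falls through to DEFAULT_RESULT_TEMPLATE, so a result
# whose title is "Foo" and whose tracking URL is "?..." renders roughly as:
#
#     <a href='?...'>Foo</a>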
class BaseSearchMetaclass(type):
    def __new__(cls, name, bases, attrs):
        if 'verbose_name' not in attrs:
            attrs['verbose_name'] = capfirst(convert_camelcase(name))
        if 'slug' not in attrs:
            attrs['slug'] = name.lower()
        return super(BaseSearchMetaclass, cls).__new__(cls, name, bases, attrs)
class BaseSearch(object):
    """
    Defines a generic search interface. Accessing self.results will
    attempt to retrieve cached results and, if that fails, will
    initiate a new search and store the results in the cache.
    """
    __metaclass__ = BaseSearchMetaclass
    result_limit = 10
    _cache_timeout = 60*48
    
    def __init__(self, search_arg):
        self.search_arg = search_arg
    def _get_cached_results(self):
        """Return the cached results if the results haven't timed out. Otherwise return None."""
        result_cache = cache.get(SEARCH_CACHE_KEY)
        if result_cache and self.__class__ in result_cache and self.search_arg.lower() in result_cache[self.__class__]:
            cached = result_cache[self.__class__][self.search_arg.lower()]
            if cached['timeout'] >= datetime.datetime.now():
                return cached['results']
        return None
    def _set_cached_results(self, results, timeout):
        """Caches the results for <timeout> minutes."""
        result_cache = cache.get(SEARCH_CACHE_KEY) or {}
        cached = result_cache.setdefault(self.__class__, {}).setdefault(self.search_arg.lower(), {})
        cached.update({
            'results': results,
            'timeout': datetime.datetime.now() + datetime.timedelta(minutes=timeout)
        })
        cache.set(SEARCH_CACHE_KEY, result_cache, MAX_CACHE_TIMEOUT)
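    # Illustration (not in the original source): the structure stored under
    # SEARCH_CACHE_KEY ends up shaped like
    #     {SearchClass: {'search arg': {'results': [...], 'timeout': datetime}}}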
    @property
    def results(self):
        if not hasattr(self, '_results'):
            results = self._get_cached_results()
            if results is None:
                try:
                    # Cache one extra result so we can see if there are
                    # more results to be had.
                    limit = self.result_limit
                    if limit is not None:
                        limit += 1
                    results = self.get_results(limit)
                except Exception:
                    # On exceptions, don't set any cache; just return.
                    return None
                self._set_cached_results(results, self._cache_timeout)
            self._results = results
        return self._results
    def get_results(self, limit=None, result_class=Result):
        """
        Calls self.search() and parses the return value into Result objects.
        """
        results = self.search(limit)
        return [result_class(self, result) for result in results]
    def search(self, limit=None):
        """
        Returns an iterable of up to <limit> results. The
        get_result_title, get_result_url, get_result_template, and
        get_result_extra_context methods will be used to interpret the
        individual items that this function returns, so a result can
        be an object with attributes as easily as a dictionary with
        keys. The only restriction is that the objects be pickleable
        so that they can be used with Django's cache system.
        """
        raise NotImplementedError
    
    def get_result_title(self, result):
        "Subclasses override this to provide the title for a result."
        raise NotImplementedError
    
    def get_result_url(self, result):
        "Subclasses override this to provide the actual URL for the result."
        raise NotImplementedError
    def get_result_querydict(self, result):
        url = self.get_result_url(result)
        if url is None:
            return None
        return make_tracking_querydict(self.search_arg, url)
    
    def get_result_template(self, result):
        if hasattr(self, 'result_template'):
            return loader.get_template(self.result_template)
        if not hasattr(self, '_result_template'):
            self._result_template = DEFAULT_RESULT_TEMPLATE
        return self._result_template
    def get_result_extra_context(self, result):
        return {}
    
    def has_more_results(self):
        """Useful to determine whether to display a `view more results` link."""
        return len(self.results) > self.result_limit
    
    @property
    def more_results_url(self):
        """
        Returns the actual url for more results. This will be encoded
        into a querystring for tracking purposes.
        """
        raise NotImplementedError
    
    @property
    def more_results_querydict(self):
        return make_tracking_querydict(self.search_arg, self.more_results_url)
    
    def __unicode__(self):
        return ' '.join(self.__class__.verbose_name.rsplit(' ', 1)[:-1]) + ' results'
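# A minimal sketch of a BaseSearch subclass (not part of the original
# module). search() may return any pickleable items; the get_result_*
# hooks tell the framework how to read them.
#
#     class StaticSearch(BaseSearch):
#         def search(self, limit=None):
#             items = [{'title': 'Example', 'url': 'http://example.com/'}]
#             return items[:limit]
#         
#         def get_result_title(self, result):
#             return result['title']
#         
#         def get_result_url(self, result):
#             return result['url']
#
#     registry.register(StaticSearch)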
class DatabaseSearch(BaseSearch):
    model = None
    
    def search(self, limit=None):
        if not hasattr(self, '_qs'):
            self._qs = self.get_queryset()
            if limit is not None:
                self._qs = self._qs[:limit]
        return self._qs
    
    def get_queryset(self):
        return self.model._default_manager.all()
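# A sketch of a DatabaseSearch subclass (the `Event` model and its fields
# are hypothetical; not part of the original module):
#
#     class EventSearch(DatabaseSearch):
#         model = Event
#         
#         def get_queryset(self):
#             return Event.objects.filter(title__icontains=self.search_arg)
#         
#         def get_result_title(self, result):
#             return result.title
#         
#         def get_result_url(self, result):
#             return result.get_absolute_url()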
class URLSearch(BaseSearch):
    """
    Defines a generic interface for searches that require accessing a
    certain url to get search results.
    """
    search_url = ''
    query_format_str = "%s"
    
    @property
    def url(self):
        "The URL where the search gets its results."
        return self.search_url + self.query_format_str % urlquote_plus(self.search_arg)
    
    @property
    def more_results_url(self):
        "The URL where the users would go to get more results."
        return self.url
    
    def parse_response(self, response, limit=None):
        raise NotImplementedError
    
    def search(self, limit=None):
        return self.parse_response(urllib2.urlopen(self.url), limit=limit)
class JSONSearch(URLSearch):
    """
    Makes a GET request and parses the results as JSON. The default
    behavior assumes that the return value is a list of results.
    """
    def parse_response(self, response, limit=None):
        return json.loads(response.read())[:limit]
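# A sketch of a JSONSearch subclass (the endpoint and JSON keys here are
# hypothetical; not part of the original module):
#
#     class SimpleJSONSearch(JSONSearch):
#         search_url = "http://api.example.com/search"  # hypothetical endpoint
#         query_format_str = "?q=%s"
#         
#         def get_result_title(self, result):
#             return result['name']
#         
#         def get_result_url(self, result):
#             return result['permalink']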
class GoogleSearch(JSONSearch):
    search_url = "http://ajax.googleapis.com/ajax/services/search/web"
    # TODO: Change this template to reflect the app's actual name.
    result_template = 'search/googlesearch.html'
    _cache_timeout = 60
    verbose_name = "Google search (current site)"
    
    @property
    def query_format_str(self):
        default_args = self.default_args
        if default_args:
            default_args += " "
        return "?v=1.0&q=%s%%s" % urlquote_plus(default_args).replace('%', '%%')
    
    @property
    def default_args(self):
        return "site:%s" % Site.objects.get_current().domain
    def parse_response(self, response, limit=None):
        responseData = json.loads(response.read())['responseData']
        results, cursor = responseData['results'], responseData['cursor']
        
        if results:
            self._more_results_url = cursor['moreResultsUrl']
            self._estimated_result_count = cursor['estimatedResultCount']
        
        return results[:limit]
    @property
    def url(self):
        # Google requires that an ajax request have a proper Referer header.
        return urllib2.Request(
            super(GoogleSearch, self).url,
            None,
            {'Referer': "http://%s" % Site.objects.get_current().domain}
        )
    
    def has_more_results(self):
        # The API returns estimatedResultCount as a string, so cast it
        # before comparing.
        if self.results and len(self.results) < int(self._estimated_result_count):
            return True
        return False
    
    @property
    def more_results_url(self):
        return self._more_results_url
    
    def get_result_title(self, result):
        return result['titleNoFormatting']
    
    def get_result_url(self, result):
        return result['unescapedUrl']
    
    def get_result_extra_context(self, result):
        return result


registry.register(GoogleSearch)
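# Example usage (a sketch, not part of the original module):
#
#     search = GoogleSearch("philo")
#     for result in search.results or []:
#         print result.render()
#     if search.has_more_results():
#         print search.more_results_querydict.urlencode()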
try:
    from BeautifulSoup import BeautifulSoup, SoupStrainer, BeautifulStoneSoup
except ImportError:
    pass
else:
    __all__ += ('ScrapeSearch', 'XMLSearch',)
    
    class ScrapeSearch(URLSearch):
        _strainer_args = []
        _strainer_kwargs = {}
        
        @property
        def strainer(self):
            if not hasattr(self, '_strainer'):
                self._strainer = SoupStrainer(*self._strainer_args, **self._strainer_kwargs)
            return self._strainer
        
        def parse_response(self, response, limit=None):
            strainer = self.strainer
            soup = BeautifulSoup(response, parseOnlyThese=strainer)
            return self.parse_results(soup.findAll(recursive=False, limit=limit))
        
        def parse_results(self, results):
            """
            Provides a hook for parsing the results of straining. This
            has no default behavior because the results absolutely
            must be parsed to properly extract the information.
            For more information, see http://www.crummy.com/software/BeautifulSoup/documentation.html#Improving%20Memory%20Usage%20with%20extract
            """
            raise NotImplementedError
    
    
    class XMLSearch(ScrapeSearch):
        _self_closing_tags = []
        
        def parse_response(self, response, limit=None):
            strainer = self.strainer
            soup = BeautifulStoneSoup(response, selfClosingTags=self._self_closing_tags, parseOnlyThese=strainer)
            return self.parse_results(soup.findAll(recursive=False, limit=limit))
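# A sketch of a ScrapeSearch subclass (the target page and its markup are
# hypothetical; not part of the original module). parse_results extracts
# plain dictionaries so the results stay pickleable for the cache.
#
#     class HeadlineSearch(ScrapeSearch):
#         search_url = "http://example.com/search"  # hypothetical
#         query_format_str = "?q=%s"
#         _strainer_args = ['a']
#         _strainer_kwargs = {'href': True}
#         
#         def parse_results(self, results):
#             return [{'title': ''.join(tag.findAll(text=True)), 'url': tag['href']}
#                     for tag in results]
#         
#         def get_result_title(self, result):
#             return result['title']
#         
#         def get_result_url(self, result):
#             return result['url']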