import datetime

from django.conf import settings
from django.contrib.sites.models import Site
from django.core.cache import cache
from django.db.models.options import get_verbose_name as convert_camelcase
from django.utils import simplejson as json
from django.utils.http import urlquote_plus
from django.utils.safestring import mark_safe
from django.utils.text import capfirst
from django.template import loader, Context, Template

from philo.contrib.sobol.utils import make_tracking_querydict


if getattr(settings, 'SOBOL_USE_EVENTLET', False):
    from eventlet.green import urllib2
else:
    import urllib2


__all__ = (
    'Result', 'BaseSearch', 'DatabaseSearch', 'URLSearch', 'JSONSearch', 'GoogleSearch', 'registry'
)


SEARCH_CACHE_KEY = 'philo_sobol_search_results'
DEFAULT_RESULT_TEMPLATE_STRING = "{% if url %}<a href='{{ url }}'>{% endif %}{{ title }}{% if url %}</a>{% endif %}"
DEFAULT_RESULT_TEMPLATE = Template(DEFAULT_RESULT_TEMPLATE_STRING)

# Determines the timeout on the entire result cache.
MAX_CACHE_TIMEOUT = 60*24*7


class RegistrationError(Exception):
    pass


class SearchRegistry(object):
    # Holds a registry of search types by slug.
    def __init__(self):
        self._registry = {}

    def register(self, search, slug=None):
        slug = slug or search.slug
        if slug in self._registry:
            registered = self._registry[slug]
            if registered.__module__ != search.__module__:
                raise RegistrationError("A different search is already registered as `%s`" % slug)
        else:
            self._registry[slug] = search

    def unregister(self, search, slug=None):
        if slug is not None:
            if slug in self._registry and self._registry[slug] == search:
                del self._registry[slug]
            else:
                raise RegistrationError("`%s` is not registered as `%s`" % (search, slug))
        else:
            for slug, registered in self._registry.items():
                if registered == search:
                    del self._registry[slug]

    def items(self):
        return self._registry.items()

    def iteritems(self):
        return self._registry.iteritems()

    def iterchoices(self):
        for slug, search in self.iteritems():
            yield slug, search.verbose_name

    def __getitem__(self, key):
        return self._registry[key]

    def __iter__(self):
        return self._registry.__iter__()


registry = SearchRegistry()
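
# Usage sketch: registering a search with the module-level registry.
# `TwitterSearch` is hypothetical; any BaseSearch subclass (defined
# below) would be registered the same way.
#
#     class TwitterSearch(BaseSearch):
#         ...
#
#     registry.register(TwitterSearch)            # registered as 'twittersearch'
#     registry.register(TwitterSearch, 'tweets')  # or under an explicit slug
#     dict(registry.iterchoices())                # {'tweets': u'Twitter search', ...}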


class Result(object):
    """
    A result is instantiated with a search instance and a single raw
    result returned by that search. The search is used to interpret
    the raw result: it provides the result's `title`, optionally its
    `url`, and any extra context, all of which are made available to
    the result's template when the result is rendered.
    """
    def __init__(self, search, result):
        self.search = search
        self.result = result

    def get_title(self):
        return self.search.get_result_title(self.result)

    def get_url(self):
        qd = self.search.get_result_querydict(self.result)
        if qd is None:
            return ""
        return "?%s" % qd.urlencode()

    def get_template(self):
        return self.search.get_result_template(self.result)

    def get_extra_context(self):
        return self.search.get_result_extra_context(self.result)

    def get_context(self):
        context = self.get_extra_context()
        context.update({
            'title': self.get_title(),
            'url': self.get_url()
        })
        return context

    def render(self):
        t = self.get_template()
        c = Context(self.get_context())
        return t.render(c)

    def __unicode__(self):
        return self.render()
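
# Rendered with DEFAULT_RESULT_TEMPLATE, a result whose context is
# {'title': u'Spam', 'url': u'?…'} would produce (values illustrative):
#
#     <a href='?…'>Spam</a>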


class BaseSearchMetaclass(type):
    def __new__(cls, name, bases, attrs):
        if 'verbose_name' not in attrs:
            attrs['verbose_name'] = capfirst(convert_camelcase(name))
        if 'slug' not in attrs:
            attrs['slug'] = name.lower()
        return super(BaseSearchMetaclass, cls).__new__(cls, name, bases, attrs)
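
# For example, a hypothetical subclass that defines neither attribute
# would get both derived from its class name:
#
#     class ImageFeedSearch(BaseSearch):
#         pass
#
#     ImageFeedSearch.verbose_name  # u'Image feed search'
#     ImageFeedSearch.slug          # 'imagefeedsearch'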


class BaseSearch(object):
    """
    Defines a generic search interface. Accessing self.results will
    attempt to retrieve cached results and, if that fails, will
    initiate a new search and store the results in the cache.
    """
    __metaclass__ = BaseSearchMetaclass
    result_limit = 10
    _cache_timeout = 60*48

    def __init__(self, search_arg):
        self.search_arg = search_arg

    def _get_cached_results(self):
        """Return the cached results if the results haven't timed out. Otherwise return None."""
        result_cache = cache.get(SEARCH_CACHE_KEY)
        if result_cache and self.__class__ in result_cache and self.search_arg.lower() in result_cache[self.__class__]:
            cached = result_cache[self.__class__][self.search_arg.lower()]
            if cached['timeout'] >= datetime.datetime.now():
                return cached['results']
        return None

    def _set_cached_results(self, results, timeout):
        """Sets the results in the cache for <timeout> minutes."""
        result_cache = cache.get(SEARCH_CACHE_KEY) or {}
        cached = result_cache.setdefault(self.__class__, {}).setdefault(self.search_arg.lower(), {})
        cached.update({
            'results': results,
            'timeout': datetime.datetime.now() + datetime.timedelta(minutes=timeout)
        })
        cache.set(SEARCH_CACHE_KEY, result_cache, MAX_CACHE_TIMEOUT)
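
    # The two methods above share a single nested dictionary stored
    # under SEARCH_CACHE_KEY, keyed first by search class and then by
    # lowercased search argument:
    #
    #     {SomeSearch: {'query': {'results': [...], 'timeout': datetime(...)}}}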

    @property
    def results(self):
        if not hasattr(self, '_results'):
            results = self._get_cached_results()
            if results is None:
                try:
                    # Cache one extra result so we can see if there are
                    # more results to be had.
                    limit = self.result_limit
                    if limit is not None:
                        limit += 1
                    results = self.get_results(limit)
                except:
                    if settings.DEBUG:
                        raise
                    # On exceptions, don't set any cache; just return.
                    return []

                self._set_cached_results(results, self._cache_timeout)
            self._results = results

        return self._results

    def get_results(self, limit=None, result_class=Result):
        """
        Calls self.search() and wraps each returned item in a Result object.
        """
        results = self.search(limit)
        return [result_class(self, result) for result in results]

    def search(self, limit=None):
        """
        Returns an iterable of up to <limit> results. The
        get_result_title, get_result_url, get_result_template, and
        get_result_extra_context methods will be used to interpret the
        individual items that this function returns, so a result can be
        an object with attributes as easily as a dictionary with keys.
        The only restriction is that the objects be pickleable so that
        they can be used with django's cache system.
        """
        raise NotImplementedError
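
    # A minimal sketch of a concrete search (hypothetical; not part of
    # this module). It returns dictionaries, which the get_result_*
    # hooks below then interpret; `fetch_matches` stands in for
    # whatever actually produces the matches.
    #
    #     class NewsSearch(BaseSearch):
    #         def search(self, limit=None):
    #             return [{'title': m.title, 'url': m.url}
    #                     for m in fetch_matches(self.search_arg)[:limit]]
    #
    #         def get_result_title(self, result):
    #             return result['title']
    #
    #         def get_result_url(self, result):
    #             return result['url']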

    def get_result_title(self, result):
        raise NotImplementedError

    def get_result_url(self, result):
        "Subclasses override this to provide the actual URL for the result."
        raise NotImplementedError

    def get_result_querydict(self, result):
        url = self.get_result_url(result)
        if url is None:
            return None
        return make_tracking_querydict(self.search_arg, url)

    def get_result_template(self, result):
        if hasattr(self, 'result_template'):
            return loader.get_template(self.result_template)
        if not hasattr(self, '_result_template'):
            self._result_template = DEFAULT_RESULT_TEMPLATE
        return self._result_template

    def get_result_extra_context(self, result):
        return {}

    def has_more_results(self):
        """Useful to determine whether to display a `view more results` link."""
        return len(self.results) > self.result_limit

    @property
    def more_results_url(self):
        """
        Returns the actual url for more results. This will be encoded
        into a querystring for tracking purposes.
        """
        raise NotImplementedError

    @property
    def more_results_querydict(self):
        return make_tracking_querydict(self.search_arg, self.more_results_url)

    def __unicode__(self):
        return ' '.join(self.__class__.verbose_name.rsplit(' ', 1)[:-1]) + ' results'


class DatabaseSearch(BaseSearch):
    model = None

    def search(self, limit=None):
        if not hasattr(self, '_qs'):
            self._qs = self.get_queryset()
            if limit is not None:
                self._qs = self._qs[:limit]

        return self._qs

    def get_queryset(self):
        return self.model._default_manager.all()
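
# Usage sketch for a database-backed search. `Recipe` is a
# hypothetical model; note that get_queryset is where the search_arg
# would typically be applied as a filter:
#
#     class RecipeSearch(DatabaseSearch):
#         model = Recipe
#
#         def get_queryset(self):
#             return Recipe.objects.filter(title__icontains=self.search_arg)
#
#     registry.register(RecipeSearch)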


class URLSearch(BaseSearch):
    """
    Defines a generic interface for searches that require accessing a
    certain url to get search results.
    """
    search_url = ''
    query_format_str = "%s"

    @property
    def url(self):
        "The URL where the search gets its results."
        return self.search_url + self.query_format_str % urlquote_plus(self.search_arg)

    @property
    def more_results_url(self):
        "The URL where the user would go to get more results."
        return self.url

    def parse_response(self, response, limit=None):
        raise NotImplementedError

    def search(self, limit=None):
        return self.parse_response(urllib2.urlopen(self.url), limit=limit)


class JSONSearch(URLSearch):
    """
    Makes a GET request and parses the results as JSON. The default
    behavior assumes that the return value is a list of results.
    """
    def parse_response(self, response, limit=None):
        return json.loads(response.read())[:limit]
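
# A minimal sketch of a JSONSearch subclass; the endpoint and field
# names are hypothetical. The default parse_response above already
# handles a plain JSON list, so only the result hooks are needed:
#
#     class BookSearch(JSONSearch):
#         search_url = "http://example.com/api/books"
#         query_format_str = "?q=%s"
#
#         def get_result_title(self, result):
#             return result['name']
#
#         def get_result_url(self, result):
#             return result['link']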


class GoogleSearch(JSONSearch):
    search_url = "http://ajax.googleapis.com/ajax/services/search/web"
    # TODO: Change this template to reflect the app's actual name.
    result_template = 'search/googlesearch.html'
    verbose_name = "Google search (current site)"

    @property
    def query_format_str(self):
        default_args = self.default_args
        if default_args:
            default_args += " "
        return "?v=1.0&q=%s%%s" % urlquote_plus(default_args).replace('%', '%%')
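
    # For a current site domain of example.com (so default_args is
    # "site:example.com"), query_format_str evaluates to
    # "?v=1.0&q=site%%3Aexample.com+%s", and a search for "philo"
    # would hit:
    #
    #     http://ajax.googleapis.com/ajax/services/search/web?v=1.0&q=site%3Aexample.com+philo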

    @property
    def default_args(self):
        return "site:%s" % Site.objects.get_current().domain

    def parse_response(self, response, limit=None):
        responseData = json.loads(response.read())['responseData']
        results, cursor = responseData['results'], responseData['cursor']

        if results:
            self._more_results_url = cursor['moreResultsUrl']
            self._estimated_result_count = cursor['estimatedResultCount']

        return results[:limit]

    @property
    def url(self):
        # Google requires that an ajax request have a proper Referer header.
        return urllib2.Request(
            super(GoogleSearch, self).url,
            None,
            {'Referer': "http://%s" % Site.objects.get_current().domain}
        )

    def has_more_results(self):
        # estimatedResultCount is returned by the API as a string.
        if self.results and len(self.results) < int(self._estimated_result_count):
            return True
        return False

    @property
    def more_results_url(self):
        return self._more_results_url

    def get_result_title(self, result):
        return result['titleNoFormatting']

    def get_result_url(self, result):
        return result['unescapedUrl']

    def get_result_extra_context(self, result):
        return result


registry.register(GoogleSearch)
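
# Usage sketch: running a Google search for the current site and
# rendering its results (output illustrative):
#
#     search = registry['googlesearch']('philo')
#     for result in search.results:
#         print result.render()
#     if search.has_more_results():
#         print search.more_results_querydict.urlencode()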


try:
    from BeautifulSoup import BeautifulSoup, SoupStrainer, BeautifulStoneSoup
except ImportError:
    pass
else:
    __all__ += ('ScrapeSearch', 'XMLSearch',)
    class ScrapeSearch(URLSearch):
        _strainer_args = []
        _strainer_kwargs = {}

        @property
        def strainer(self):
            if not hasattr(self, '_strainer'):
                self._strainer = SoupStrainer(*self._strainer_args, **self._strainer_kwargs)
            return self._strainer

        def parse_response(self, response, limit=None):
            strainer = self.strainer
            soup = BeautifulSoup(response, parseOnlyThese=strainer)
            return self.parse_results(soup.findAll(recursive=False, limit=limit))

        def parse_results(self, results):
            """
            Provides a hook for parsing the results of straining. This
            has no default behavior because the results absolutely
            must be parsed to properly extract the information.
            For more information, see
            http://www.crummy.com/software/BeautifulSoup/documentation.html#Improving%20Memory%20Usage%20with%20extract
            """
            raise NotImplementedError


    class XMLSearch(ScrapeSearch):
        _self_closing_tags = []

        def parse_response(self, response, limit=None):
            strainer = self.strainer
            soup = BeautifulStoneSoup(response, selfClosingTags=self._self_closing_tags, parseOnlyThese=strainer)
            return self.parse_results(soup.findAll(recursive=False, limit=limit))
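
    # A minimal sketch of a ScrapeSearch subclass; the site, markup,
    # and strainer arguments are hypothetical:
    #
    #     class HeadlineSearch(ScrapeSearch):
    #         search_url = "http://example.com/search"
    #         query_format_str = "?q=%s"
    #         _strainer_args = ['h2']
    #         _strainer_kwargs = {'attrs': {'class': 'headline'}}
    #
    #         def parse_results(self, results):
    #             return [{'title': h2.a.string, 'url': h2.a['href']}
    #                     for h2 in results]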