4 from django.conf import settings
5 from django.contrib.sites.models import Site
6 from django.core.cache import cache
7 from django.db.models.options import get_verbose_name as convert_camelcase
8 from django.utils import simplejson as json
9 from django.utils.http import urlquote_plus
10 from django.utils.safestring import mark_safe
11 from django.utils.text import capfirst
12 from django.template import loader, Context, Template
14 from philo.contrib.sobol.utils import make_tracking_querydict, RegistryIterator
17 if getattr(settings, 'SOBOL_USE_EVENTLET', False):
19 from eventlet.green import urllib2
27 'Result', 'BaseSearch', 'DatabaseSearch', 'URLSearch', 'JSONSearch', 'GoogleSearch', 'SearchRegistry', 'registry'
# Cache key under which the whole sobol result cache dict is stored.
SEARCH_CACHE_KEY = 'philo_sobol_search_results'
# Fallback template: render the result title, linked when a url is present.
DEFAULT_RESULT_TEMPLATE_STRING = "{% if url %}<a href='{{ url }}'>{% endif %}{{ title }}{% if url %}</a>{% endif %}"
DEFAULT_RESULT_TEMPLATE = Template(DEFAULT_RESULT_TEMPLATE_STRING)
# Determines the timeout on the entire result cache.
# NOTE(review): 60*24*7 reads like "minutes in a week", but Django's
# cache.set() timeout is in seconds — confirm the intended unit.
MAX_CACHE_TIMEOUT = 60*24*7
39 class RegistrationError(Exception):
class SearchRegistry(object):
    """Holds a registry of search types by slug."""

    def register(self, search, slug=None):
        # Register `search` under `slug` (defaults to the search's own slug).
        # Registering the same search module twice is a no-op; a *different*
        # search under an existing slug is an error.
        slug = slug or search.slug
        if slug in self._registry:
            registered = self._registry[slug]
            if registered.__module__ != search.__module__:
                raise RegistrationError("A different search is already registered as `%s`" % slug)
        self._registry[slug] = search

    def unregister(self, search, slug=None):
        # Remove `search` if it is registered under `slug`.
        if slug in self._registry and self._registry[slug] == search:
            del self._registry[slug]
        # NOTE(review): several lines are elided from this view here — the
        # raise below presumably sits on an else/fallthrough branch, and the
        # following loop and returns belong to elided method definitions.
        raise RegistrationError("`%s` is not registered as `%s`" % (search, slug))
        for slug, search in self._registry.items():
            del self._registry[slug]
        return self._registry.items()
        return RegistryIterator(self._registry, 'iteritems')

    def iterchoices(self):
        # Lazy (slug, verbose_name) pairs, e.g. for a field's `choices`.
        return RegistryIterator(self._registry, 'iteritems', lambda x: (x[0], x[1].verbose_name))

    def __getitem__(self, key):
        # Dict-style lookup of a registered search class by slug.
        return self._registry[key]
        return self._registry.__iter__()
84 registry = SearchRegistry()
    A result is instantiated with a configuration dictionary, a search,
    and a template name. The configuration dictionary is expected to
    define a `title` and optionally a `url`. Any other variables may be
    defined; they will be made available through the result object in
    the template, if one is defined.
    def __init__(self, search, result):
        return self.search.get_result_title(self.result)
        qd = self.search.get_result_querydict(self.result)
        return "?%s" % qd.urlencode()

    def get_template(self):
        # Delegate template selection to the owning search.
        return self.search.get_result_template(self.result)

    def get_extra_context(self):
        # Delegate extra template context to the owning search.
        return self.search.get_result_extra_context(self.result)

    def get_context(self):
        # NOTE(review): lines are elided from this view below — the
        # title/url/result entries presumably populate a dict merged into
        # `context`, and the trailing statements belong to an elided
        # render method.
        context = self.get_extra_context()
        'title': self.get_title(),
        'url': self.get_url(),
        'result': self.result
        t = self.get_template()
        c = Context(self.get_context())

    def __unicode__(self):
class BaseSearchMetaclass(type):
    """Metaclass supplying default `verbose_name` and `slug` class attributes.

    `verbose_name` is derived from the CamelCase class name with its last
    word dropped (e.g. a class named "FooSearch" yields "Foo"); `slug` is
    simply the lowercased class name.
    """
    def __new__(cls, name, bases, attrs):
        if 'verbose_name' not in attrs:
            # convert_camelcase splits the CamelCase name into words;
            # rsplit(...)[: -1] discards the trailing word before joining.
            words = convert_camelcase(name).rsplit(' ', 1)[:-1]
            attrs['verbose_name'] = capfirst(' '.join(words))
        attrs.setdefault('slug', name.lower())
        return super(BaseSearchMetaclass, cls).__new__(cls, name, bases, attrs)
class BaseSearch(object):
    Defines a generic search interface. Accessing self.results will
    attempt to retrieve cached results and, if that fails, will
    initiate a new search and store the results in the cache.
    __metaclass__ = BaseSearchMetaclass
    # Per-search cache lifetime; _set_cached_results treats it as minutes
    # (48 hours here).
    _cache_timeout = 60*48

    def __init__(self, search_arg):
        # `search_arg` is the raw search string entered by the user.
        self.search_arg = search_arg

    def _get_cached_results(self):
        """Return the cached results if the results haven't timed out. Otherwise return None."""
        result_cache = cache.get(SEARCH_CACHE_KEY)
        # Results are keyed first by search class, then by lowercased arg.
        if result_cache and self.__class__ in result_cache and self.search_arg.lower() in result_cache[self.__class__]:
            cached = result_cache[self.__class__][self.search_arg.lower()]
            if cached['timeout'] >= datetime.datetime.now():
                return cached['results']

    def _set_cached_results(self, results, timeout):
        """Sets the results to the cache for <timeout> minutes."""
        result_cache = cache.get(SEARCH_CACHE_KEY) or {}
        cached = result_cache.setdefault(self.__class__, {}).setdefault(self.search_arg.lower(), {})
        # NOTE(review): the update of `cached` around the next line is
        # elided in this view; the timeout entry is an absolute expiry time.
        'timeout': datetime.datetime.now() + datetime.timedelta(minutes=timeout)
        cache.set(SEARCH_CACHE_KEY, result_cache, MAX_CACHE_TIMEOUT)

        # NOTE(review): this appears to be the body of a `results` property
        # whose decorator/def lines are elided in this view.
        if not hasattr(self, '_results'):
            results = self._get_cached_results()
            # Cache one extra result so we can see if there are
            # more results to be had.
            limit = self.result_limit
            if limit is not None:
                results = self.get_results(limit)
                # On exceptions, don't set any cache; just return.
                self._set_cached_results(results, self._cache_timeout)
            self._results = results

    def get_results(self, limit=None, result_class=Result):
        Calls self.search() and parses the return value into Result objects.
        results = self.search(limit)
        return [result_class(self, result) for result in results]

    def search(self, limit=None):
        Returns an iterable of up to <limit> results. The
        get_result_title, get_result_url, get_result_template, and
        get_result_extra_context methods will be used to interpret the
        individual items that this function returns, so the result can
        be an object with attributes as easily as a dictionary
        with keys. The only restriction is that the objects be
        pickleable so that they can be used with django's cache system.
        raise NotImplementedError

    def get_result_title(self, result):
        raise NotImplementedError

    def get_result_url(self, result):
        "Subclasses override this to provide the actual URL for the result."
        raise NotImplementedError

    def get_result_querydict(self, result):
        # Wrap the result URL in a tracking querystring.
        url = self.get_result_url(result)
        return make_tracking_querydict(self.search_arg, url)

    def get_result_template(self, result):
        # Prefer a class-specified template; otherwise fall back to the
        # module-level default, memoized on the instance.
        if hasattr(self, 'result_template'):
            return loader.get_template(self.result_template)
        if not hasattr(self, '_result_template'):
            self._result_template = DEFAULT_RESULT_TEMPLATE
        return self._result_template

    def get_result_extra_context(self, result):

    def has_more_results(self):
        """Useful to determine whether to display a `view more results` link."""
        # Works because `results` fetches result_limit + 1 items.
        return len(self.results) > self.result_limit

    def more_results_url(self):
        Returns the actual url for more results. This will be encoded
        into a querystring for tracking purposes.
        raise NotImplementedError

    def more_results_querydict(self):
        return make_tracking_querydict(self.search_arg, self.more_results_url)

    def __unicode__(self):
        return self.verbose_name
class DatabaseSearch(BaseSearch):
    # Searches a Django model; subclasses are expected to supply `model`.
    def search(self, limit=None):
        # Build and memoize the (possibly sliced) queryset on first use.
        if not hasattr(self, '_qs'):
            self._qs = self.get_queryset()
            if limit is not None:
                self._qs = self._qs[:limit]
        # NOTE(review): the return statement is elided from this view;
        # presumably `return self._qs`.

    def get_queryset(self):
        # Default: everything from the model's default manager.
        return self.model._default_manager.all()
class URLSearch(BaseSearch):
    Defines a generic interface for searches that require accessing a
    certain url to get search results.
    # Format string appended to `search_url`; subclasses override to add
    # query parameters around the quoted search term.
    query_format_str = "%s"

        # NOTE(review): this is the body of a `url` property whose
        # decorator/def lines are elided in this view.
        "The URL where the search gets its results."
        return self.search_url + self.query_format_str % urlquote_plus(self.search_arg)

    def more_results_url(self):
        "The URL where the users would go to get more results."

    def parse_response(self, response, limit=None):
        # Subclasses turn the HTTP response into an iterable of raw results.
        raise NotImplementedError

    def search(self, limit=None):
        # Fetch the search URL and delegate parsing to parse_response().
        return self.parse_response(urllib2.urlopen(self.url), limit=limit)
class JSONSearch(URLSearch):
    Makes a GET request and parses the results as JSON. The default
    behavior assumes that the return value is a list of results.
    def parse_response(self, response, limit=None):
        # [:None] is a no-op slice, so limit=None returns everything.
        return json.loads(response.read())[:limit]
class GoogleSearch(JSONSearch):
    search_url = "http://ajax.googleapis.com/ajax/services/search/web"
    # TODO: Change this template to reflect the app's actual name.
    result_template = 'search/googlesearch.html'
    verbose_name = "Google search (current site)"

    def query_format_str(self):
        # Build the AJAX-API query string. Literal %'s in the quoted default
        # args are doubled so the trailing %%s survives formatting as a %s
        # placeholder for the search term.
        default_args = self.default_args
        return "?v=1.0&q=%s%%s" % urlquote_plus(default_args).replace('%', '%%')

    def default_args(self):
        # Restrict results to the current Django site's domain.
        return "site:%s" % Site.objects.get_current().domain

    def parse_response(self, response, limit=None):
        responseData = json.loads(response.read())['responseData']
        results, cursor = responseData['results'], responseData['cursor']
        # Stash Google's pagination metadata for has_more_results /
        # more_results_url. NOTE(review): guard lines around these
        # assignments are elided in this view.
        self._more_results_url = cursor['moreResultsUrl']
        self._estimated_result_count = cursor['estimatedResultCount']
        return results[:limit]

        # NOTE(review): body of an overriding `url` property whose
        # decorator/def lines are elided in this view.
        # Google requires that an ajax request have a proper Referer header.
        return urllib2.Request(
            super(GoogleSearch, self).url,
            {'Referer': "http://%s" % Site.objects.get_current().domain}

    def has_more_results(self):
        # NOTE(review): the return statements are elided in this view.
        if self.results and len(self.results) < self._estimated_result_count:

    def more_results_url(self):
        return self._more_results_url

    def get_result_title(self, result):
        return result['titleNoFormatting']

    def get_result_url(self, result):
        return result['unescapedUrl']
361 registry.register(GoogleSearch)
365 from BeautifulSoup import BeautifulSoup, SoupStrainer, BeautifulStoneSoup
369 __all__ += ('ScrapeSearch', 'XMLSearch',)
class ScrapeSearch(URLSearch):
    # Keyword arguments passed to SoupStrainer alongside `_strainer_args`
    # (the args attribute is elided from this view).
    _strainer_kwargs = {}

        # NOTE(review): body of a `strainer` property whose decorator/def
        # lines are elided in this view. Lazily builds and caches the
        # SoupStrainer used to limit how much of the page is parsed.
        if not hasattr(self, '_strainer'):
            self._strainer = SoupStrainer(*self._strainer_args, **self._strainer_kwargs)
        return self._strainer

    def parse_response(self, response, limit=None):
        # Parse only the strained portion of the document, then hand the
        # top-level matches to parse_results().
        strainer = self.strainer
        soup = BeautifulSoup(response, parseOnlyThese=strainer)
        return self.parse_results(soup.findAll(recursive=False, limit=limit))

    def parse_results(self, results):
        Provides a hook for parsing the results of straining. This
        has no default behavior because the results absolutely
        must be parsed to properly extract the information.
        For more information, see http://www.crummy.com/software/BeautifulSoup/documentation.html#Improving%20Memory%20Usage%20with%20extract
        raise NotImplementedError
class XMLSearch(ScrapeSearch):
    # Tag names BeautifulStoneSoup should treat as self-closing when
    # parsing XML documents.
    _self_closing_tags = []

    def parse_response(self, response, limit=None):
        # XML flavor of ScrapeSearch.parse_response: same straining and
        # parse_results() hand-off, but via BeautifulStoneSoup.
        strainer = self.strainer
        soup = BeautifulStoneSoup(response, selfClosingTags=self._self_closing_tags, parseOnlyThese=strainer)
        return self.parse_results(soup.findAll(recursive=False, limit=limit))