From: Stephen Burrows <stephen.r.burrows@gmail.com>
Date: Thu, 24 Feb 2011 19:07:40 +0000 (-0500)
Subject: Merge branch 'master' into search
X-Git-Tag: philo-0.9~16^2~15^2~4
X-Git-Url: http://git.ithinksw.org/philo.git/commitdiff_plain/62a418ca34a8d4e8323e1c998c21b5e957739fd9?hp=1ad3c023974f619f977d1cced84e1251846fb7a2

Merge branch 'master' into search
---

diff --git a/contrib/sobol/__init__.py b/contrib/sobol/__init__.py
new file mode 100644
index 0000000..90eaf18
--- /dev/null
+++ b/contrib/sobol/__init__.py
@@ -0,0 +1 @@
+from philo.contrib.sobol.search import *
\ No newline at end of file
diff --git a/contrib/sobol/admin.py b/contrib/sobol/admin.py
new file mode 100644
index 0000000..eb6fd03
--- /dev/null
+++ b/contrib/sobol/admin.py
@@ -0,0 +1,40 @@
+from django.contrib import admin
+from django.db.models import Count
+from philo.admin import EntityAdmin
+from philo.contrib.sobol.models import Search, ResultURL, SearchView
+
+
+class ResultURLInline(admin.TabularInline):
+	model = ResultURL
+	template = 'search/admin/chosen_result_inline.html'
+	readonly_fields = ('url',)
+	can_delete = False
+	extra = 0
+	max_num = 0
+
+
+class SearchAdmin(admin.ModelAdmin):
+	readonly_fields = ('string',)
+	inlines = [ResultURLInline]
+	list_display = ['string', 'unique_urls', 'total_clicks']
+	search_fields = ['string', 'result_urls__url']
+	
+	def unique_urls(self, obj):
+		return obj.unique_urls
+	unique_urls.admin_order_field = 'unique_urls'
+	
+	def total_clicks(self, obj):
+		return obj.total_clicks
+	total_clicks.admin_order_field = 'total_clicks'
+	
+	def queryset(self, request):
+		qs = super(SearchAdmin, self).queryset(request)
+		return qs.annotate(total_clicks=Count('result_urls__clicks', distinct=True), unique_urls=Count('result_urls', distinct=True))
+
+
+class SearchViewAdmin(EntityAdmin):
+	pass
+
+
+admin.site.register(Search, SearchAdmin)
+admin.site.register(SearchView, SearchViewAdmin)
\ No newline at end of file
diff --git a/contrib/sobol/forms.py b/contrib/sobol/forms.py
new file mode 100644
index 0000000..e79d9e7
--- /dev/null
+++ b/contrib/sobol/forms.py
@@ -0,0 +1,12 @@
+from django import forms
+from philo.contrib.sobol.utils import SEARCH_ARG_GET_KEY
+
+
+class BaseSearchForm(forms.BaseForm):
+	base_fields = {
+		SEARCH_ARG_GET_KEY: forms.CharField()
+	}
+
+
+class SearchForm(forms.Form, BaseSearchForm):
+	pass
\ No newline at end of file
diff --git a/contrib/sobol/models.py b/contrib/sobol/models.py
new file mode 100644
index 0000000..8113750
--- /dev/null
+++ b/contrib/sobol/models.py
@@ -0,0 +1,138 @@
+from django.conf.urls.defaults import patterns, url
+from django.contrib import messages
+from django.db import models
+from django.http import HttpResponseRedirect, Http404
+from django.utils import simplejson as json
+from philo.contrib.sobol import registry
+from philo.contrib.sobol.forms import SearchForm
+from philo.contrib.sobol.utils import HASH_REDIRECT_GET_KEY, URL_REDIRECT_GET_KEY, SEARCH_ARG_GET_KEY, check_redirect_hash
+from philo.exceptions import ViewCanNotProvideSubpath
+from philo.models import MultiView, Page, SlugMultipleChoiceField
+from philo.validators import RedirectValidator
+import datetime
+try:
+	import eventlet
+except ImportError:
+	eventlet = False  # falsy sentinel: SearchView then runs its searches sequentially
+
+
+class Search(models.Model):
+	string = models.TextField()  # the search term; SearchView.results_view stores it lower-cased
+	
+	def __unicode__(self):
+		return self.string
+	
+	class Meta:
+		ordering = ['string']
+		verbose_name_plural = 'searches'
+
+
+class ResultURL(models.Model):
+	search = models.ForeignKey(Search, related_name='result_urls')
+	url = models.TextField(validators=[RedirectValidator()])
+	
+	def __unicode__(self):
+		return self.url
+	
+	class Meta:
+		ordering = ['url']
+
+
+class Click(models.Model):
+	result = models.ForeignKey(ResultURL, related_name='clicks')
+	datetime = models.DateTimeField()
+	
+	def __unicode__(self):
+		return self.datetime.strftime('%B %d, %Y %H:%M:%S')
+	
+	class Meta:
+		ordering = ['datetime']
+		get_latest_by = 'datetime'
+
+
+class SearchView(MultiView):
+	results_page = models.ForeignKey(Page, related_name='search_results_related')
+	searches = SlugMultipleChoiceField(choices=registry.iterchoices())
+	allow_partial_loading = models.BooleanField(default=True)
+	placeholder_text = models.CharField(max_length=75, default="Search")
+	
+	def get_reverse_params(self, obj):
+		raise ViewCanNotProvideSubpath
+	
+	@property
+	def urlpatterns(self):
+		urlpatterns = patterns('',
+			url(r'^$', self.results_view, name='results'),
+		)
+		if self.allow_partial_loading:
+			urlpatterns += patterns('',
+				url(r'^(?P<slug>[\w-]+)/?', self.partial_ajax_results_view, name='partial_ajax_results_view')
+			)
+		return urlpatterns
+	
+	def results_view(self, request, extra_context=None):
+		"""Render the results page for the GET search term; if a signed result url accompanies it, record the click and redirect there."""
+		from django.utils.http import urlquote_plus  # local import: this module does not import it at the top
+		context = self.get_context()
+		context.update(extra_context or {})
+		
+		if SEARCH_ARG_GET_KEY in request.GET:
+			form = SearchForm(request.GET)
+			
+			if form.is_valid():
+				search_string = request.GET[SEARCH_ARG_GET_KEY].lower()
+				url = request.GET.get(URL_REDIRECT_GET_KEY)
+				hash = request.GET.get(HASH_REDIRECT_GET_KEY)
+				
+				if url and hash:
+					if check_redirect_hash(hash, search_string, url):
+						# Create the necessary models
+						search = Search.objects.get_or_create(string=search_string)[0]
+						result_url = search.result_urls.get_or_create(url=url)[0]
+						result_url.clicks.create(datetime=datetime.datetime.now())
+						return HttpResponseRedirect(url)
+					else:
+						messages.add_message(request, messages.INFO, "The link you followed had been tampered with. Here are all the results for your search term instead!")
+						# Quote the term so characters such as '&', '#' or spaces cannot corrupt the redirect's query string.
+						return HttpResponseRedirect("%s?%s=%s" % (request.path, SEARCH_ARG_GET_KEY, urlquote_plus(search_string)))
+				if not self.allow_partial_loading:
+					search_instances = []
+					if eventlet:
+						pool = eventlet.GreenPool()
+					for slug in self.searches:
+						search = registry[slug]
+						search_instance = search(search_string)
+						search_instances.append(search_instance)
+						if eventlet:
+							pool.spawn_n(self.make_result_cache, search_instance)
+						else:
+							self.make_result_cache(search_instance)
+					if eventlet:
+						pool.waitall()
+					context.update({
+						'searches': search_instances
+					})
+		else:
+			form = SearchForm()
+		context.update({
+			'form': form
+		})
+		return self.results_page.render_to_response(request, extra_context=context)
+	
+	def make_result_cache(self, search_instance):
+		search_instance.results
+	
+	def partial_ajax_results_view(self, request, slug, extra_context=None):
+		"""Return the results of the registered search <slug> as a JSON response; raise Http404 unless this is a valid AJAX request."""
+		from django.http import HttpResponse  # local import: only HttpResponseRedirect and Http404 are imported at module level
+		search_string = request.GET.get(SEARCH_ARG_GET_KEY)
+		if not request.is_ajax() or not self.allow_partial_loading or slug not in self.searches or search_string is None:
+			raise Http404
+		search = registry[slug]
+		search_instance = search(search_string.lower())
+		# NOTE(review): `results` is a list of Result objects -- confirm json.dumps can serialize them.
+		response = json.dumps({
+			'results': search_instance.results,
+			'template': search_instance.get_ajax_result_template()
+		})
+		return HttpResponse(response, content_type='application/json')  # a view must return an HttpResponse, not a bare string
\ No newline at end of file
diff --git a/contrib/sobol/search.py b/contrib/sobol/search.py
new file mode 100644
index 0000000..cce3ec6
--- /dev/null
+++ b/contrib/sobol/search.py
@@ -0,0 +1,378 @@
+#encoding: utf-8
+
+from django.conf import settings
+from django.contrib.sites.models import Site
+from django.core.cache import cache
+from django.db.models.options import get_verbose_name as convert_camelcase
+from django.utils import simplejson as json
+from django.utils.http import urlquote_plus
+from django.utils.safestring import mark_safe
+from django.utils.text import capfirst
+from django.template import loader, Context, Template
+import datetime
+from philo.contrib.sobol.utils import make_tracking_querydict
+
+try:
+	from eventlet.green import urllib2  # eventlet's "green" urllib2, preferred when eventlet is installed
+except ImportError:
+	import urllib2
+
+
+__all__ = (
+	'Result', 'BaseSearch', 'DatabaseSearch', 'URLSearch', 'JSONSearch', 'GoogleSearch', 'registry'
+)
+
+
+SEARCH_CACHE_KEY = 'philo_sobol_search_results'
+DEFAULT_RESULT_TEMPLATE_STRING = "{% if url %}<a href='{{ url }}'>{% endif %}{{ title }}{% if url %}</a>{% endif %}"
+
+# Determines the timeout on the entire result cache.
+MAX_CACHE_TIMEOUT = 60*60*24*7
+
+
+class RegistrationError(Exception):
+	pass
+
+
+class SearchRegistry(object):
+	"""Holds a registry of search types (classes), keyed by slug."""
+	def __init__(self):
+		self._registry = {}
+	
+	def register(self, search, slug=None):
+		slug = slug or search.slug
+		if slug in self._registry:
+			if self._registry[slug] != search:
+				raise RegistrationError("A different search is already registered as `%s`" % slug)
+		else:
+			self._registry[slug] = search
+	
+	def unregister(self, search, slug=None):
+		if slug is not None:
+			if slug not in self._registry or self._registry[slug] != search:
+				raise RegistrationError("`%s` is not registered as `%s`" % (search, slug))
+			del self._registry[slug]
+		else:
+			# No slug given: remove every slug mapped to this search (snapshot the keys before deleting).
+			for key in [k for k, v in self._registry.items() if v == search]:
+				del self._registry[key]
+	
+	def items(self):
+		return self._registry.items()
+	
+	def iteritems(self):
+		return self._registry.iteritems()
+	
+	def iterchoices(self):
+		for slug, search in self.iteritems():
+			yield slug, search.verbose_name
+	
+	def __getitem__(self, key):
+		return self._registry[key]
+
+
+registry = SearchRegistry()
+
+
+class Result(object):
+	"""
+	Wraps a single raw result item together with the search instance
+	that produced it. The title, tracking url, template, and extra
+	template context are all obtained by delegating to the search's
+	get_result_* methods; render() adds `title` and `url` to the
+	context and renders the result's template with it.
+	"""
+	def __init__(self, search, result):
+		self.search = search
+		self.result = result
+	
+	def get_title(self):
+		return self.search.get_result_title(self.result)
+	
+	def get_url(self):
+		return self.search.get_result_querydict(self.result).urlencode()
+	
+	def get_template(self):
+		return self.search.get_result_template(self.result)
+	
+	def get_extra_context(self):
+		return self.search.get_result_extra_context(self.result)
+	
+	def render(self):
+		t = self.get_template()
+		c = Context(self.get_extra_context())
+		c.update({
+			'title': self.get_title(),
+			'url': self.get_url()
+		})
+		return t.render(c)
+	
+	def __unicode__(self):
+		return self.render()
+
+
+class BaseSearchMetaclass(type):
+	def __new__(cls, name, bases, attrs):
+		if 'verbose_name' not in attrs:
+			attrs['verbose_name'] = capfirst(convert_camelcase(name))
+		if 'slug' not in attrs:
+			attrs['slug'] = name.lower()
+		return super(BaseSearchMetaclass, cls).__new__(cls, name, bases, attrs)
+
+
+class BaseSearch(object):
+	"""
+	Defines a generic search interface. Accessing self.results will
+	attempt to retrieve cached results and, if that fails, will
+	initiate a new search and store the results in the cache.
+	"""
+	__metaclass__ = BaseSearchMetaclass
+	result_limit = 10
+	_cache_timeout = 60*48
+	
+	def __init__(self, search_arg):
+		self.search_arg = search_arg
+	
+	def _get_cached_results(self):
+		"""Return the cached results if the results haven't timed out. Otherwise return None."""
+		result_cache = cache.get(SEARCH_CACHE_KEY)
+		if result_cache and self.__class__ in result_cache and self.search_arg.lower() in result_cache[self.__class__]:
+			cached = result_cache[self.__class__][self.search_arg.lower()]
+			if cached['timeout'] >= datetime.datetime.now():
+				return cached['results']
+		return None
+	
+	def _set_cached_results(self, results, timeout):
+		"""Sets the results to the cache for <timeout> minutes."""
+		result_cache = cache.get(SEARCH_CACHE_KEY) or {}
+		cached = result_cache.setdefault(self.__class__, {}).setdefault(self.search_arg.lower(), {})
+		cached.update({
+			'results': results,
+			'timeout': datetime.datetime.now() + datetime.timedelta(minutes=timeout)
+		})
+		cache.set(SEARCH_CACHE_KEY, result_cache, MAX_CACHE_TIMEOUT)
+	
+	@property
+	def results(self):
+		"""List of Result objects: memoized on the instance, else taken from the shared cache, else fetched by a new search."""
+		if not hasattr(self, '_results'):
+			results = self._get_cached_results()
+			if results is None:
+				try:
+					# Fetch one extra result so has_more_results() can tell whether more exist.
+					limit = self.result_limit
+					if limit is not None:
+						limit += 1
+					results = self.get_results(limit)
+				except Exception:
+					if settings.DEBUG:
+						raise
+					#  On exceptions, don't set any cache; just return.
+					return []
+			
+				self._set_cached_results(results, self._cache_timeout)
+			self._results = results
+		
+		return self._results
+	
+	def get_results(self, limit=None, result_class=Result):
+		"""
+		Calls self.search() and parses the return value into Result objects.
+		"""
+		results = self.search(limit)
+		return [result_class(self, result) for result in results]
+	
+	def search(self, limit=None):
+		"""
+		Returns an iterable of up to <limit> results. The
+		get_result_title, get_result_url, get_result_template, and
+		get_result_extra_context methods will be used to interpret the
+		individual items that this function returns, so the result can
+		be an object with attributes as easily as a dictionary
+ 		with keys. The only restriction is that the objects be
+		pickleable so that they can be used with django's cache system.
+		"""
+		raise NotImplementedError
+	
+	def get_result_title(self, result):
+		raise NotImplementedError
+	
+	def get_result_url(self, result):
+		"Subclasses override this to provide the actual URL for the result."
+		raise NotImplementedError
+	
+	def get_result_querydict(self, result):
+		return make_tracking_querydict(self.search_arg, self.get_result_url(result))
+	
+	def get_result_template(self, result):
+		if hasattr(self, 'result_template'):
+			return loader.get_template(self.result_template)
+		if not hasattr(self, '_result_template'):
+			self._result_template = Template(DEFAULT_RESULT_TEMPLATE_STRING)
+		return self._result_template
+	
+	def get_ajax_result_template(self, result=None):  # `result` is unused; optional because SearchView calls this with no arguments
+		return getattr(self, 'ajax_result_template', DEFAULT_RESULT_TEMPLATE_STRING)
+	
+	def get_result_extra_context(self, result):
+		return {}
+	
+	def has_more_results(self):
+		"""Useful to determine whether to display a `view more results` link; always False when result_limit is None."""
+		return self.result_limit is not None and len(self.results) > self.result_limit
+	
+	@property
+	def more_results_url(self):
+		"""
+		Returns the actual url for more results. This will be encoded
+		into a querystring for tracking purposes.
+		"""
+		raise NotImplementedError
+	
+	@property
+	def more_results_querydict(self):
+		return make_tracking_querydict(self.search_arg, self.more_results_url)
+	
+	def __unicode__(self):
+		return ' '.join(self.__class__.verbose_name.rsplit(' ', 1)[:-1]) + ' results'
+
+
+class DatabaseSearch(BaseSearch):
+	model = None  # subclasses set this to the model whose default manager is queried
+	
+	def has_more_results(self):
+		return self.result_limit is not None and self.get_queryset().count() > self.result_limit  # no limit: nothing more to show
+	
+	def search(self, limit=None):
+		if not hasattr(self, '_qs'):
+			self._qs = self.get_queryset()
+			if limit is not None:
+				self._qs = self._qs[:limit]
+		
+		return self._qs
+	
+	def get_queryset(self):
+		return self.model._default_manager.all()
+
+
+class URLSearch(BaseSearch):
+	"""
+	Defines a generic interface for searches that require accessing a
+	certain url to get search results.
+	"""
+	search_url = ''
+	query_format_str = "%s"
+
+	@property
+	def url(self):
+		"The URL where the search gets its results."
+		return self.search_url + self.query_format_str % urlquote_plus(self.search_arg)
+
+	@property
+	def more_results_url(self):
+		"The URL where the users would go to get more results."
+		return self.url
+	
+	def parse_response(self, response, limit=None):
+		raise NotImplementedError
+	
+	def search(self, limit=None):
+		return self.parse_response(urllib2.urlopen(self.url), limit=limit)
+
+
+class JSONSearch(URLSearch):
+	"""
+	Makes a GET request and parses the results as JSON. The default
+	behavior assumes that the return value is a list of results.
+	"""
+	def parse_response(self, response, limit=None):
+		return json.loads(response.read())[:limit]
+
+
+class GoogleSearch(JSONSearch):
+	search_url = "http://ajax.googleapis.com/ajax/services/search/web"
+	query_format_str = "?v=1.0&q=%s"
+	# TODO: Change this template to reflect the app's actual name.
+	result_template = 'search/googlesearch.html'
+	timeout = 60
+	
+	def parse_response(self, response, limit=None):
+		responseData = json.loads(response.read())['responseData']
+		results, cursor = responseData['results'], responseData['cursor']
+		
+		if results:
+			self._more_results_url = cursor['moreResultsUrl']
+			self._estimated_result_count = cursor['estimatedResultCount']
+		
+		return results[:limit]
+	
+	@property
+	def url(self):
+		# Google requires that an ajax request have a proper Referer header.
+		return urllib2.Request(
+			super(GoogleSearch, self).url,
+			None,
+			{'Referer': "http://%s" % Site.objects.get_current().domain}
+		)
+	
+	@property
+	def has_more_results(self):
+		if self.results and len(self.results) < self._estimated_result_count:
+			return True
+		return False
+	
+	@property
+	def more_results_url(self):
+		return self._more_results_url
+	
+	def get_result_title(self, result):
+		return result['titleNoFormatting']
+	
+	def get_result_url(self, result):
+		return result['unescapedUrl']
+	
+	def get_result_extra_context(self, result):
+		return result
+
+
+registry.register(GoogleSearch)
+
+
+try:
+	from BeautifulSoup import BeautifulSoup, SoupStrainer, BeautifulStoneSoup
+except ImportError:
+	pass  # BeautifulSoup is optional; without it the scraping searches below are not defined
+else:
+	__all__ += ('ScrapeSearch', 'XMLSearch',)
+	class ScrapeSearch(URLSearch):
+		_strainer_args = []
+		_strainer_kwargs = {}
+		
+		@property
+		def strainer(self):
+			if not hasattr(self, '_strainer'):
+				self._strainer = SoupStrainer(*self._strainer_args, **self._strainer_kwargs)
+			return self._strainer
+		
+		def parse_response(self, response, limit=None):
+			strainer = self.strainer
+			soup = BeautifulSoup(response, parseOnlyThese=strainer)
+			return self.parse_results(soup[:limit])
+		
+		def parse_results(self, results):
+			"""
+			Provides a hook for parsing the results of straining. This
+			has no default behavior because the results absolutely
+			must be parsed to properly extract the information.
+			For more information, see http://www.crummy.com/software/BeautifulSoup/documentation.html#Improving%20Memory%20Usage%20with%20extract
+			"""
+			raise NotImplementedError
+	
+	
+	class XMLSearch(ScrapeSearch):
+		_self_closing_tags = []
+		
+		def parse_response(self, response, limit=None):
+			strainer = self.strainer
+			soup = BeautifulStoneSoup(response, selfClosingTags=self._self_closing_tags, parseOnlyThese=strainer)  # parse the `response` argument
+			return self.parse_results(soup[:limit])
\ No newline at end of file
diff --git a/contrib/sobol/templates/search/googlesearch.html b/contrib/sobol/templates/search/googlesearch.html
new file mode 100644
index 0000000..2938214
--- /dev/null
+++ b/contrib/sobol/templates/search/googlesearch.html
@@ -0,0 +1,4 @@
+<article>
+	<h1><a href="?{{ url }}">{{ title|safe }}</a></h1>
+	<p>{{ content|safe }}</p>
+</article>
\ No newline at end of file
diff --git a/contrib/sobol/utils.py b/contrib/sobol/utils.py
new file mode 100644
index 0000000..723a463
--- /dev/null
+++ b/contrib/sobol/utils.py
@@ -0,0 +1,32 @@
+from django.conf import settings
+from django.http import QueryDict
+from django.utils.encoding import smart_str
+from django.utils.hashcompat import sha_constructor
+from django.utils.http import urlquote_plus, urlquote
+
+
+SEARCH_ARG_GET_KEY = 'q'
+URL_REDIRECT_GET_KEY = 'url'
+HASH_REDIRECT_GET_KEY = 's'
+
+
+def make_redirect_hash(search_arg, url):
+	return sha_constructor(smart_str(search_arg + url + settings.SECRET_KEY)).hexdigest()[::2]
+
+
+def check_redirect_hash(hash, search_arg, url):
+	return hash == make_redirect_hash(search_arg, url)
+
+
+def make_tracking_querydict(search_arg, url):
+	"""
+	Build the QueryDict used to track a click on ``url``. It carries the
+	search term, the target url, and the hash computed from the two.
+	
+	The term and the url are quoted before the query string is parsed.
+	"""
+	query_string = "%s=%s&%s=%s&%s=%s" % (
+		SEARCH_ARG_GET_KEY, urlquote_plus(search_arg),
+		URL_REDIRECT_GET_KEY, urlquote(url),
+		HASH_REDIRECT_GET_KEY, make_redirect_hash(search_arg, url))
+	return QueryDict(query_string)
\ No newline at end of file
diff --git a/models/fields/__init__.py b/models/fields/__init__.py
index d8ed839..25af832 100644
--- a/models/fields/__init__.py
+++ b/models/fields/__init__.py
@@ -1,6 +1,10 @@
 from django import forms
+from django.core.exceptions import ValidationError
+from django.core.validators import validate_slug
 from django.db import models
 from django.utils import simplejson as json
+from django.utils.text import capfirst
+from django.utils.translation import ugettext_lazy as _
 from philo.forms.fields import JSONFormField
 from philo.validators import TemplateValidator, json_validator
 #from philo.models.fields.entities import *
@@ -55,10 +59,69 @@ class JSONField(models.TextField):
 		return super(JSONField, self).formfield(*args, **kwargs)
 
 
+class SlugMultipleChoiceField(models.Field):
+	__metaclass__ = models.SubfieldBase
+	description = _("Comma-separated slug field")
+	
+	def get_internal_type(self):
+		return "TextField"
+	
+	def to_python(self, value):
+		if not value:
+			return []
+		
+		if isinstance(value, list):
+			return value
+		
+		return value.split(',')
+	
+	def get_prep_value(self, value):
+		return ','.join(value)
+	
+	def formfield(self, **kwargs):
+		# This is necessary because django hard-codes TypedChoiceField for things with choices.
+		defaults = {
+			'widget': forms.CheckboxSelectMultiple,
+			'choices': self.get_choices(include_blank=False),
+			'label': capfirst(self.verbose_name),
+			'required': not self.blank,
+			'help_text': self.help_text
+		}
+		if self.has_default():
+			if callable(self.default):
+				defaults['initial'] = self.default
+				defaults['show_hidden_initial'] = True
+			else:
+				defaults['initial'] = self.get_default()
+		
+		for k in kwargs.keys():
+			if k not in ('coerce', 'empty_value', 'choices', 'required',
+						 'widget', 'label', 'initial', 'help_text',
+						 'error_messages', 'show_hidden_initial'):
+				del kwargs[k]
+		
+		defaults.update(kwargs)
+		form_class = forms.TypedMultipleChoiceField
+		return form_class(**defaults)
+	
+	def validate(self, value, model_instance):
+		invalid_values = []
+		for val in value:
+			try:
+				validate_slug(val)
+			except ValidationError:
+				invalid_values.append(val)
+		
+		if invalid_values:
+			# should really make a custom message.
+			raise ValidationError(self.error_messages['invalid_choice'] % invalid_values)
+
+
 try:
 	from south.modelsinspector import add_introspection_rules
 except ImportError:
 	pass
 else:
+	add_introspection_rules([], ["^philo\.models\.fields\.SlugMultipleChoiceField"])
 	add_introspection_rules([], ["^philo\.models\.fields\.TemplateField"])
 	add_introspection_rules([], ["^philo\.models\.fields\.JSONField"])
\ No newline at end of file