# Copyright (C) 2009 David Sauve
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

import datetime
import cPickle as pickle
import os
import re
import shutil
import sys
import warnings

from django.conf import settings
from django.core.exceptions import ImproperlyConfigured
from django.utils.encoding import smart_unicode, force_unicode

from haystack.backends import BaseSearchBackend, BaseSearchQuery
from haystack.exceptions import MissingDependency, MoreLikeThisError
from haystack.models import SearchResult

try:
    import xapian
except ImportError:
    raise MissingDependency("The 'xapian' backend requires the installation of 'xapian'. Please refer to the documentation.")


DEFAULT_MAX_RESULTS = 100000

DOCUMENT_ID_TERM_PREFIX = 'Q'
DOCUMENT_CUSTOM_TERM_PREFIX = 'X'
DOCUMENT_CT_TERM_PREFIX = DOCUMENT_CUSTOM_TERM_PREFIX + 'CONTENTTYPE'
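
# Illustrative note (not in the original source): with the prefixes above, a
# model instance from app 'foo', model 'bar', pk 1 is indexed with a unique ID
# term 'Qfoo.bar.1' and a content-type term 'XCONTENTTYPEfoo.bar', while a
# field named 'title' gets its terms prefixed with 'XTITLE'.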

field_re = re.compile(r'(?<=(?<!Z)X)([A-Z_]+)(\w+)')
gap_re = re.compile(r'(?P<type>year|month|day|hour|minute|second+)s?=?(?P<value>\d*)', re.IGNORECASE)


class XHValueRangeProcessor(xapian.ValueRangeProcessor):
    def __init__(self, sb):
        self.sb = sb
        xapian.ValueRangeProcessor.__init__(self)

    def __call__(self, begin, end):
        """
        Construct a tuple for value range processing.

        `begin` -- a string in the format '<field_name>:[low_range]'
                   If 'low_range' is omitted, assume the smallest possible value.
        `end` -- a string in the format '[high_range|*]'. If '*', assume
                 the highest possible value.

        Return a tuple of three strings: (column, low, high)
        """
        colon = begin.find(':')
        field_name = begin[:colon]
        begin = begin[colon + 1:len(begin)]
        for field_dict in self.sb.schema:
            if field_dict['field_name'] == field_name:
                if not begin:
                    if field_dict['type'] == 'text':
                        begin = u'a'  # TODO: A better way of getting a min text value?
                    elif field_dict['type'] == 'long' or field_dict['type'] == 'float':
                        begin = float('-inf')
                    elif field_dict['type'] == 'date' or field_dict['type'] == 'datetime':
                        begin = u'00010101000000'
                elif end == '*':
                    if field_dict['type'] == 'text':
                        end = u'z' * 100  # TODO: A better way of getting a max text value?
                    elif field_dict['type'] == 'long' or field_dict['type'] == 'float':
                        end = float('inf')
                    elif field_dict['type'] == 'date' or field_dict['type'] == 'datetime':
                        end = u'99990101000000'
                if field_dict['type'] == 'long' or field_dict['type'] == 'float':
                    begin = xapian.sortable_serialise(float(begin))
                    end = xapian.sortable_serialise(float(end))
                return field_dict['column'], str(begin), str(end)
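
# Illustrative sketch (field name is hypothetical, not from the original
# source): a range clause such as 'pub_date:20090101000000..20091231000000'
# reaches the processor above as begin='pub_date:20090101000000' and
# end='20091231000000', and is mapped to the value column for 'pub_date' plus
# serialised low/high bounds.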


class SearchBackend(BaseSearchBackend):
    """
    `SearchBackend` defines the Xapian search backend for use with the Haystack
    API for Django search.

    It uses the Xapian Python bindings to interface with Xapian, and as
    such is subject to this bug: <http://trac.xapian.org/ticket/364> when
    Django is running with mod_python or mod_wsgi under Apache.

    Until this issue has been fixed by Xapian, it is necessary to set
    `WSGIApplicationGroup to %{GLOBAL}` when using mod_wsgi, or
    `PythonInterpreter main_interpreter` when using mod_python.

    In order to use this backend, `HAYSTACK_XAPIAN_PATH` must be set in
    your settings. This should point to a location where you would like your
    indexes to reside.
    """
    RESERVED_WORDS = (
        'AND',
        'NOT',
        'OR',
        'XOR',
        'NEAR',
        'ADJ',
    )

    RESERVED_CHARACTERS = (
        '\\', '+', '-', '&&', '||', '!', '(', ')', '{', '}',
        '[', ']', '^', '"', '~', '*', '?', ':',
    )
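
    # Minimal settings sketch (illustrative; the path below is a placeholder):
    #
    #   HAYSTACK_XAPIAN_PATH = '/var/lib/myproject/xapian_index'
    #   HAYSTACK_INCLUDE_SPELLING = True  # optional, enables spelling suggestions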

    def __init__(self, site=None, stemming_language='english'):
        """
        Instantiates an instance of `SearchBackend`.

        Optional arguments:
            `site` -- The site to associate the backend with (default = None)
            `stemming_language` -- The stemming language (default = 'english')

        Also sets the stemming language to be used to `stemming_language`.
        """
        super(SearchBackend, self).__init__(site)

        if not hasattr(settings, 'HAYSTACK_XAPIAN_PATH'):
            raise ImproperlyConfigured('You must specify a HAYSTACK_XAPIAN_PATH in your settings.')

        if not os.path.exists(settings.HAYSTACK_XAPIAN_PATH):
            os.makedirs(settings.HAYSTACK_XAPIAN_PATH)

        self.stemmer = xapian.Stem(stemming_language)

    def get_identifier(self, obj_or_string):
        return DOCUMENT_ID_TERM_PREFIX + super(SearchBackend, self).get_identifier(obj_or_string)

    def update(self, index, iterable):
        """
        Updates the `index` with any objects in `iterable` by adding/updating
        the database as needed.

        Required arguments:
            `index` -- The `SearchIndex` to process
            `iterable` -- An iterable of model instances to index

        For each object in `iterable`, a document is created containing all
        of the terms extracted from `index.prepare(obj)` with stemming prefixes,
        field prefixes, and 'as-is'.

        eg. `content:Testing` ==> `testing, Ztest, ZXCONTENTtest`

        Each document also contains an extra term in the format:

        `XCONTENTTYPE<app_name>.<model_name>`

        As well as a unique identifier in the format:

        `Q<app_name>.<model_name>.<pk>`

        eg.: foo.bar (pk=1) ==> `Qfoo.bar.1`, `XCONTENTTYPEfoo.bar`

        This is useful for querying for a specific document corresponding to
        a model instance.

        The document also contains a pickled version of the object itself and
        the document ID in the document data field.

        Finally, we also store field values to be used for sorting data. We
        store these in the document value slots (position zero is reserved
        for the document ID). All values are stored as unicode strings with
        conversion of float, int and double values being done by Xapian itself
        through the use of the :method:`xapian.sortable_serialise` method.
        """
        database = self._database(writable=True)
        try:
            for obj in iterable:
                document = xapian.Document()
                term_generator = self._term_generator(database, document)
                document_id = self.get_identifier(obj)
                model_data = index.prepare(obj)

                for field in self.schema:
                    if field['field_name'] in model_data.keys():
                        prefix = DOCUMENT_CUSTOM_TERM_PREFIX + field['field_name'].upper()
                        value = model_data[field['field_name']]
                        term_generator.index_text(force_unicode(value))
                        term_generator.index_text(force_unicode(value), 1, prefix)
                        document.add_value(field['column'], self._marshal_value(value))

                document.set_data(pickle.dumps(
                    (obj._meta.app_label, obj._meta.module_name, obj.pk, model_data),
                    pickle.HIGHEST_PROTOCOL
                ))
                document.add_term(document_id)
                document.add_term(
                    DOCUMENT_CT_TERM_PREFIX + u'%s.%s' %
                    (obj._meta.app_label, obj._meta.module_name)
                )
                database.replace_document(document_id, document)

        except UnicodeDecodeError:
            sys.stderr.write('Chunk failed.\n')
            pass
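
    # Usage sketch (illustrative; `NoteIndex` and `Note` are hypothetical names):
    #
    #   backend = SearchBackend()
    #   backend.update(NoteIndex(Note), Note.objects.all())
    #
    # Each Note instance is prepared via NoteIndex.prepare() and written to the
    # Xapian database as a single document.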

    def remove(self, obj):
        """
        Remove indexes for `obj` from the database.

        We delete all instances of `Q<app_name>.<model_name>.<pk>` which
        should be unique to this object.
        """
        database = self._database(writable=True)
        database.delete_document(self.get_identifier(obj))

    def clear(self, models=[]):
        """
        Clear all instances of `models` from the database or all models, if
        not specified.

        Optional Arguments:
            `models` -- Models to clear from the database (default = [])

        If `models` is empty, an empty query is executed which matches all
        documents in the database. Afterwards, each match is deleted.

        Otherwise, for each model, a `delete_document` call is issued with
        the term `XCONTENTTYPE<app_name>.<model_name>`. This will delete
        all documents with the specified model type.
        """
        database = self._database(writable=True)
        if not models:
            query, __unused__ = self._query(database, '*')
            enquire = self._enquire(database, query)
            for match in enquire.get_mset(0, DEFAULT_MAX_RESULTS):
                database.delete_document(match.get_docid())
        else:
            for model in models:
                database.delete_document(
                    DOCUMENT_CT_TERM_PREFIX + '%s.%s' %
                    (model._meta.app_label, model._meta.module_name)
                )

    def search(self, query_string, sort_by=None, start_offset=0, end_offset=DEFAULT_MAX_RESULTS,
               fields='', highlight=False, facets=None, date_facets=None, query_facets=None,
               narrow_queries=None, boost=None, **kwargs):
        """
        Executes the search as defined in `query_string`.

        Required arguments:
            `query_string` -- Search query to execute

        Optional arguments:
            `sort_by` -- Sort results by specified field (default = None)
            `start_offset` -- Slice results from `start_offset` (default = 0)
            `end_offset` -- Slice results at `end_offset` (default = 100,000)
            `fields` -- Filter results on `fields` (default = '')
            `highlight` -- Highlight terms in results (default = False)
            `facets` -- Facet results on fields (default = None)
            `date_facets` -- Facet results on date ranges (default = None)
            `query_facets` -- Facet results on queries (default = None)
            `narrow_queries` -- Narrow queries (default = None)
            `boost` -- Dictionary of terms and weights to boost results

        Returns:
            A dictionary with the following keys:
                `results` -- A list of `SearchResult`
                `hits` -- The total available results
                `facets` -- A dictionary of facets with the following keys:
                    `fields` -- A list of field facets
                    `dates` -- A list of date facets
                    `queries` -- A list of query facets
            If faceting was not used, the `facets` key will not be present.

        If `query_string` is empty, returns no results.

        Otherwise, loads the available fields from the database meta data schema
        and sets up prefixes for each one along with a prefix for `django_ct`,
        used to filter by model, and loads the current stemmer instance.

        Afterwards, executes the Xapian query parser to create a query from
        `query_string` that is then passed to a new `enquire` instance.

        The resulting match set is processed further (facets, highlighting)
        prior to returning a dictionary with the results.

        If `HAYSTACK_INCLUDE_SPELLING` was enabled in `settings.py`, the
        extra flag `FLAG_SPELLING_CORRECTION` will be passed to the query parser
        and any suggestions for spell correction will be returned as well as
        the results.
        """
        if not query_string:
            return {
                'results': [],
                'hits': 0,
            }

        if query_facets is not None:
            warnings.warn("Query faceting has not been implemented yet.", Warning, stacklevel=2)

        database = self._database()
        query, spelling_suggestion = self._query(
            database, query_string, narrow_queries, boost
        )
        enquire = self._enquire(database, query)

        if sort_by:
            sorter = self._sorter(sort_by)
            enquire.set_sort_by_key_then_relevance(sorter, True)

        results = []
        facets_dict = {
            'fields': {},
            'dates': {},
            'queries': {},
        }
        matches = enquire.get_mset(start_offset, end_offset)

        for match in matches:
            document = match.get_document()
            app_label, module_name, pk, model_data = pickle.loads(document.get_data())
            if facets:
                facets_dict['fields'] = self._do_field_facets(
                    document, facets, facets_dict['fields']
                )
            if highlight and (len(query_string) > 0):
                model_data['highlighted'] = {
                    self.content_field_name: self._do_highlight(
                        model_data.get(self.content_field_name), query_string
                    )
                }
            results.append(
                SearchResult(app_label, module_name, pk, match.weight, **model_data)
            )

        if date_facets:
            facets_dict['dates'] = self._do_date_facets(results, date_facets)

        return {
            'results': results,
            'hits': matches.get_matches_estimated(),
            'facets': facets_dict,
            'spelling_suggestion': spelling_suggestion,
        }

    def delete_index(self):
        """
        Delete the index.

        This removes all index files and the `HAYSTACK_XAPIAN_PATH` folder.
        """
        if os.path.exists(settings.HAYSTACK_XAPIAN_PATH):
            shutil.rmtree(settings.HAYSTACK_XAPIAN_PATH)

    def document_count(self):
        """
        Retrieves the total document count for the search index.
        """
        try:
            database = self._database()
        except xapian.DatabaseOpeningError:
            return 0
        return database.get_doccount()

    def more_like_this(self, model_instance):
        """
        Given a model instance, returns a result set of similar documents.

        Required arguments:
            `model_instance` -- The model instance to use as a basis for
                                retrieving similar documents.

        Returns:
            A dictionary with the following keys:
                `results` -- A list of `SearchResult`
                `hits` -- The total available results

        Opens a database connection, then builds a simple query using the
        `model_instance` to build the unique identifier.

        For each document retrieved (should always be one), adds an entry into
        an RSet (relevance set) with the document id, then uses the RSet
        to query for an ESet (a set of terms that can be used to suggest
        expansions to the original query), omitting any document that was in
        the original query.

        Finally, processes the resulting matches and returns.
        """
        database = self._database()
        query = xapian.Query(self.get_identifier(model_instance))
        enquire = self._enquire(database, query)
        rset = xapian.RSet()
        for match in enquire.get_mset(0, DEFAULT_MAX_RESULTS):
            rset.add_document(match.get_docid())
        query = xapian.Query(xapian.Query.OP_OR,
            [expand.term for expand in enquire.get_eset(DEFAULT_MAX_RESULTS, rset)]
        )
        query = xapian.Query(
            xapian.Query.OP_AND_NOT, [query, self.get_identifier(model_instance)]
        )
        enquire.set_query(query)

        results = []
        matches = enquire.get_mset(0, DEFAULT_MAX_RESULTS)

        for match in matches:
            document = match.get_document()
            app_label, module_name, pk, model_data = pickle.loads(document.get_data())
            results.append(
                SearchResult(app_label, module_name, pk, match.weight, **model_data)
            )

        return {
            'results': results,
            'hits': matches.get_matches_estimated(),
            'facets': {
                'fields': {},
                'dates': {},
                'queries': {},
            },
            'spelling_suggestion': None,
        }

    def _do_highlight(self, content, text, tag='em'):
        """
        Highlight `text` in `content` with html `tag`.

        This method assumes that the input text (`content`) does not contain
        any special formatting. That is, it does not contain any html tags
        or similar markup that could be screwed up by the highlighting.

        Required arguments:
            `content` -- Content to search for instances of `text`
            `text` -- The text to be highlighted
        """
        for term in [term.replace('*', '') for term in text.split()]:
            term_re = re.compile(re.escape(term), re.IGNORECASE)
            content = term_re.sub('<%s>%s</%s>' % (tag, term, tag), content)
        return content
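
    # Illustrative example (not from the original source):
    #
    #   _do_highlight(u'The quick brown fox', u'fox')
    #   ==> u'The quick brown <em>fox</em>'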

    def _do_field_facets(self, document, facets, fields):
        """
        Private method that facets a document by field name.

        Required arguments:
            `document` -- The document to parse
            `facets` -- A list of facets to use when faceting
            `fields` -- A list of fields that have already been faceted. This
                        will be extended with any new field names and counts
                        found in the `document`.

        For each term in the document, extract the field name and determine
        if it is one of the `facets` we want. If so, verify whether it is already
        in the `fields` list. If it is, update the count, otherwise, add it and
        set the count to 1.
        """
        for term in [(term.term, term.termfreq) for term in document]:
            match = field_re.search(term[0])
            if match and match.group(1).lower() in facets:
                if match.group(1).lower() in fields:
                    fields[match.group(1).lower()] += [(match.group(2), term[1])]
                else:
                    fields[match.group(1).lower()] = [(match.group(2), term[1])]
        return fields

    def _do_date_facets(self, results, date_facets):
        """
        Private method that facets a document by date ranges.

        Required arguments:
            `results` -- A list of SearchResults to facet
            `date_facets` -- A dictionary containing facet parameters:
                {'field': {'start_date': ..., 'end_date': ..., 'gap': '...'}}
                nb., gap must satisfy the regex:
                    (?P<type>year|month|day|hour|minute|second+)s?=?(?P<value>\d*)

        For each date facet field in `date_facets`, generates a list
        of date ranges (from `start_date` to `end_date` by `gap`) then
        iterates through `results` and tallies the count for each date_facet.

        Returns a dictionary of date facets (fields) containing a list with
        entries for each range and a count of documents matching the range.

        eg. {
            'pub_date': [
                ('2009-01-01T00:00:00Z', 5),
                ('2009-02-01T00:00:00Z', 0),
                ('2009-03-01T00:00:00Z', 0),
                ('2009-04-01T00:00:00Z', 1),
                ('2009-05-01T00:00:00Z', 2),
            ],
        }
        """
        facet_dict = {}

        for date_facet, facet_params in date_facets.iteritems():
            match = gap_re.search(facet_params['gap']).groupdict()
            gap_type = match['type']
            gap_value = match.get('value', 1) or 1  # groupdict() gives '' when no explicit value is supplied
            date_range = facet_params['start_date']
            facet_list = []
            while date_range < facet_params['end_date']:
                facet_list.append((date_range.isoformat(), 0))
                if gap_type == 'year':
                    date_range = date_range.replace(
                        year=date_range.year + int(gap_value)
                    )
                elif gap_type == 'month':
                    if date_range.month == 12:
                        date_range = date_range.replace(
                            month=1, year=date_range.year + int(gap_value)
                        )
                    else:
                        date_range = date_range.replace(
                            month=date_range.month + int(gap_value)
                        )
                elif gap_type == 'day':
                    date_range += datetime.timedelta(days=int(gap_value))
                elif gap_type == 'hour':
                    date_range += datetime.timedelta(hours=int(gap_value))
                elif gap_type == 'minute':
                    date_range += datetime.timedelta(minutes=int(gap_value))
                elif gap_type == 'second':
                    date_range += datetime.timedelta(seconds=int(gap_value))

            facet_list = sorted(facet_list, key=lambda n: n[0], reverse=True)

            for result in results:
                result_date = getattr(result, date_facet)
                if result_date:
                    if not isinstance(result_date, datetime.datetime):
                        result_date = datetime.datetime(
                            year=result_date.year,
                            month=result_date.month,
                            day=result_date.day,
                        )
                    for n, facet_date in enumerate(facet_list):
                        if result_date > datetime.datetime.strptime(facet_date[0], '%Y-%m-%dT%H:%M:%S'):
                            facet_list[n] = (facet_list[n][0], (facet_list[n][1] + 1))
                            break

            facet_dict[date_facet] = facet_list

        return facet_dict
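
    # Example argument for _do_date_facets above (illustrative; the field name
    # 'pub_date' is hypothetical):
    #
    #   {'pub_date': {'start_date': datetime.datetime(2009, 1, 1),
    #                 'end_date': datetime.datetime(2009, 6, 1),
    #                 'gap': 'months=1'}}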

    def _marshal_value(self, value):
        """
        Private method that converts Python values to a string for Xapian values.
        """
        if isinstance(value, datetime.datetime):
            if value.microsecond:
                value = u'%04d%02d%02d%02d%02d%02d%06d' % (
                    value.year, value.month, value.day, value.hour,
                    value.minute, value.second, value.microsecond
                )
            else:
                value = u'%04d%02d%02d%02d%02d%02d' % (
                    value.year, value.month, value.day, value.hour,
                    value.minute, value.second
                )
        elif isinstance(value, datetime.date):
            value = u'%04d%02d%02d000000' % (value.year, value.month, value.day)
        elif isinstance(value, bool):
            if value:
                value = u't'
            else:
                value = u'f'
        elif isinstance(value, (int, long, float)):
            value = xapian.sortable_serialise(value)
        else:
            value = force_unicode(value)
        return value
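
    # Illustrative conversions performed by _marshal_value above:
    #
    #   datetime.datetime(2009, 7, 4, 10, 30) ==> u'20090704103000'
    #   datetime.date(2009, 7, 4)             ==> u'20090704000000'
    #   True                                  ==> u't'
    #   3.5                                   ==> xapian.sortable_serialise(3.5)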

    def _database(self, writable=False):
        """
        Private method that returns a xapian.Database for use and sets up
        schema and content_field definitions.

        Optional arguments:
            ``writable`` -- Open the database in read/write mode (default=False)

        Returns an instance of a xapian.Database or xapian.WritableDatabase
        """
        if writable:
            self.content_field_name, fields = self.site.build_unified_schema()
            self.schema = self._build_schema(fields)

            database = xapian.WritableDatabase(settings.HAYSTACK_XAPIAN_PATH, xapian.DB_CREATE_OR_OPEN)
            database.set_metadata('schema', pickle.dumps(self.schema, pickle.HIGHEST_PROTOCOL))
            database.set_metadata('content', pickle.dumps(self.content_field_name, pickle.HIGHEST_PROTOCOL))
        else:
            database = xapian.Database(settings.HAYSTACK_XAPIAN_PATH)

            self.schema = pickle.loads(database.get_metadata('schema'))
            self.content_field_name = pickle.loads(database.get_metadata('content'))

        return database

    def _term_generator(self, database, document):
        """
        Private method that returns a Xapian.TermGenerator

        Required Arguments:
            `database` -- The database the document will be indexed into
            `document` -- The document to be indexed

        Returns a Xapian.TermGenerator instance. If `HAYSTACK_INCLUDE_SPELLING`
        is True, then the term generator will have spell-checking enabled.
        """
        term_generator = xapian.TermGenerator()
        term_generator.set_database(database)
        term_generator.set_stemmer(self.stemmer)
        if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False) is True:
            term_generator.set_flags(xapian.TermGenerator.FLAG_SPELLING)
        term_generator.set_document(document)
        return term_generator

    def _query(self, database, query_string, narrow_queries=None, boost=None):
        """
        Private method that takes a query string and returns a xapian.Query.

        Required arguments:
            `database` -- The database to be queried
            `query_string` -- The query string to parse

        Optional arguments:
            `narrow_queries` -- A list of queries to narrow the query with
            `boost` -- A dictionary of terms to boost with values

        Returns a xapian.Query instance with prefixes and ranges properly
        set up as pulled from the `query_string`.
        """
        spelling_suggestion = None

        # Build the flags and query parser up front so that `narrow_queries`
        # can still be parsed when `query_string` is '*'.
        flags = self._flags()
        qp = self._query_parser(database)

        if query_string == '*':
            query = xapian.Query('')  # Make '*' match everything
        else:
            vrp = XHValueRangeProcessor(self)
            qp.add_valuerangeprocessor(vrp)
            query = qp.parse_query(query_string, flags)
            if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False) is True:
                spelling_suggestion = qp.get_corrected_query_string()

        if narrow_queries:
            subqueries = [
                qp.parse_query(narrow_query, flags) for narrow_query in narrow_queries
            ]
            query = xapian.Query(
                xapian.Query.OP_FILTER,
                query, xapian.Query(xapian.Query.OP_AND, subqueries)
            )
        if boost:
            subqueries = [
                xapian.Query(
                    xapian.Query.OP_SCALE_WEIGHT, xapian.Query(term), value
                ) for term, value in boost.iteritems()
            ]
            query = xapian.Query(
                xapian.Query.OP_OR, query,
                xapian.Query(xapian.Query.OP_AND, subqueries)
            )

        return query, spelling_suggestion
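
    # Illustrative query strings handled by _query above (field names are
    # hypothetical): a field-prefixed term such as 'title:django', a model
    # filter such as 'django_ct:blog.entry', and a value range such as
    # 'pub_date:20090101000000..20091231000000' (resolved by
    # XHValueRangeProcessor).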

    def _sorter(self, sort_by):
        """
        Private method that takes a list of fields to sort by and returns a
        xapian.MultiValueSorter

        Required Arguments:
            `sort_by` -- A list of fields to sort by

        Returns a xapian.MultiValueSorter instance
        """
        sorter = xapian.MultiValueSorter()

        for sort_field in sort_by:
            if sort_field.startswith('-'):
                reverse = True
                sort_field = sort_field[1:]  # Strip the '-'
            else:
                reverse = False  # Reverse is inverted in Xapian -- http://trac.xapian.org/ticket/311
            sorter.add(self._value_column(sort_field), reverse)

        return sorter

    def _flags(self):
        """
        Returns the commonly used Xapian.QueryParser flags
        """
        flags = xapian.QueryParser.FLAG_PARTIAL \
              | xapian.QueryParser.FLAG_PHRASE \
              | xapian.QueryParser.FLAG_BOOLEAN \
              | xapian.QueryParser.FLAG_LOVEHATE \
              | xapian.QueryParser.FLAG_WILDCARD \
              | xapian.QueryParser.FLAG_PURE_NOT
        if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False) is True:
            flags = flags | xapian.QueryParser.FLAG_SPELLING_CORRECTION
        return flags

    def _query_parser(self, database):
        """
        Private method that returns a Xapian.QueryParser instance.

        Required arguments:
            `database` -- The database to be queried

        The query parser returned will have stemming enabled, a boolean prefix
        for `django_ct`, and prefixes for all of the fields in the `self.schema`.
        """
        qp = xapian.QueryParser()
        qp.set_database(database)
        qp.set_stemmer(self.stemmer)
        qp.set_stemming_strategy(xapian.QueryParser.STEM_SOME)
        qp.add_boolean_prefix('django_ct', DOCUMENT_CT_TERM_PREFIX)
        for field_dict in self.schema:
            qp.add_prefix(
                field_dict['field_name'],
                DOCUMENT_CUSTOM_TERM_PREFIX + field_dict['field_name'].upper()
            )
        return qp

    def _enquire(self, database, query):
        """
        Private method that returns a Xapian.Enquire instance for use with
        the specified `query`.

        Required Arguments:
            `database` -- The database to be queried
            `query` -- The query to run

        Returns a xapian.Enquire instance
        """
        enquire = xapian.Enquire(database)
        enquire.set_query(query)
        enquire.set_docid_order(enquire.ASCENDING)

        return enquire

    def _build_schema(self, fields):
        """
        Private method to build a schema.

        Required arguments:
            ``fields`` -- A list of fields in the index

        Returns a list of fields in dictionary format ready for inclusion in
        the index meta-data.
        """
        schema = []
        n = 0
        for field in fields:
            if field['indexed'] == 'true':
                field['column'] = n
                n += 1
                schema.append(field)
        return schema
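
    # Illustrative schema entry produced by _build_schema above (field name is
    # hypothetical); 'column' is the value slot used for sorting and ranges:
    #
    #   {'field_name': 'pub_date', 'type': 'date', 'indexed': 'true', 'column': 0}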

    def _value_column(self, field):
        """
        Private method that returns the column value slot in the database
        for a given field.

        Required arguments:
            `field` -- The field to lookup

        Returns an integer with the column location (0 indexed).
        """
        for field_dict in self.schema:
            if field_dict['field_name'] == field:
                return field_dict['column']
        return 0


class SearchQuery(BaseSearchQuery):
    """
    `SearchQuery` is responsible for converting search queries into a format
    that Xapian can understand.

    Most of the work is done by the :method:`build_query`.
    """
    def __init__(self, backend=None):
        """
        Create a new instance of the SearchQuery setting the backend as
        specified. If no backend is set, will use the Xapian `SearchBackend`.

        Optional arguments:
            `backend` -- The `SearchBackend` to use (default = None)
        """
        super(SearchQuery, self).__init__(backend=backend)
        self.backend = backend or SearchBackend()

    def build_query(self):
        """
        Builds a search query from previously set values, returning a query
        string in a format ready for use by the Xapian `SearchBackend`.

        Returns:
            A query string suitable for parsing by Xapian.
        """
        query = ''

        if not self.query_filters:
            query = '*'
        else:
            query_chunks = []

            for the_filter in self.query_filters:
                if the_filter.is_and():
                    query_chunks.append('AND')

                if the_filter.is_not():
                    query_chunks.append('NOT')

                if the_filter.is_or():
                    query_chunks.append('OR')

                value = the_filter.value

                if not isinstance(value, (list, tuple)):
                    # Convert whatever we find to what xapian wants.
                    value = self.backend._marshal_value(value)

                # Check to see if it's a phrase for an exact match.
                if ' ' in value:
                    value = '"%s"' % value

                # 'content' is a special reserved word, much like 'pk' in
                # Django's ORM layer. It indicates 'no special field'.
                if the_filter.field == 'content':
                    query_chunks.append(value)
                else:
                    filter_types = {
                        'exact': "%s:%s",
                        'gte': "%s:%s..*",
                        'gt': "NOT %s:..%s",
                        'lte': "%s:..%s",
                        'lt': "NOT %s:%s..*",
                        'startswith': "%s:%s*",
                    }

                    if the_filter.filter_type != 'in':
                        query_chunks.append(filter_types[the_filter.filter_type] % (the_filter.field, value))
                    else:
                        in_options = []

                        for possible_value in value:
                            in_options.append("%s:%s" % (the_filter.field, possible_value))

                        query_chunks.append("(%s)" % " OR ".join(in_options))

            if query_chunks[0] in ('AND', 'OR'):
                # Pull off an undesirable leading "AND" or "OR".
                del(query_chunks[0])

            query = " ".join(query_chunks)

        if len(self.models):
            models = ['django_ct:%s.%s' % (model._meta.app_label, model._meta.module_name) for model in self.models]
            models_clause = ' '.join(models)
            final_query = '(%s) %s' % (query, models_clause)

        else:
            final_query = query

        return final_query
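
    # Illustrative mapping from filters to query chunks in build_query above
    # (field names and values are hypothetical):
    #
    #   title__exact='foo'                       ==> 'title:foo'
    #   pub_date__gte=datetime.date(2009, 1, 1)  ==> 'pub_date:20090101000000..*'
    #   title__startswith='f'                    ==> 'title:f*'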

    def run(self):
        """
        Builds and executes the query. Returns a list of search results.
        """
        final_query = self.build_query()
        kwargs = {
            'start_offset': self.start_offset,
        }

        if self.order_by:
            kwargs['sort_by'] = self.order_by

        if self.end_offset is not None:
            kwargs['end_offset'] = self.end_offset - self.start_offset

        if self.highlight:
            kwargs['highlight'] = self.highlight

        if self.facets:
            kwargs['facets'] = list(self.facets)

        if self.date_facets:
            kwargs['date_facets'] = self.date_facets

        if self.query_facets:
            kwargs['query_facets'] = self.query_facets

        if self.narrow_queries:
            kwargs['narrow_queries'] = self.narrow_queries

        if self.boost:
            kwargs['boost'] = self.boost

        results = self.backend.search(final_query, **kwargs)
        self._results = results.get('results', [])
        self._hit_count = results.get('hits', 0)
        self._facet_counts = results.get('facets', {})
        self._spelling_suggestion = results.get('spelling_suggestion', None)

    def run_mlt(self):
        """
        Builds and executes the query. Returns a list of search results.
        """
        if self._more_like_this is False or self._mlt_instance is None:
            raise MoreLikeThisError("No instance was provided to determine 'More Like This' results.")

        additional_query_string = self.build_query()
        kwargs = {
            'start_offset': self.start_offset,
        }

        if self.end_offset is not None:
            kwargs['end_offset'] = self.end_offset - self.start_offset

        results = self.backend.more_like_this(self._mlt_instance, additional_query_string, **kwargs)
        self._results = results.get('results', [])
        self._hit_count = results.get('hits', 0)