2009-07-30 13:01:38 +00:00
|
|
|
# Copyright (C) 2009 David Sauve
|
2009-07-30 12:57:35 +00:00
|
|
|
#
|
|
|
|
# This program is free software; you can redistribute it and/or modify
|
|
|
|
# it under the terms of the GNU General Public License as published by
|
|
|
|
# the Free Software Foundation; either version 2 of the License, or
|
|
|
|
# (at your option) any later version.
|
|
|
|
#
|
|
|
|
# This program is distributed in the hope that it will be useful,
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
# GNU General Public License for more details.
|
|
|
|
#
|
|
|
|
# You should have received a copy of the GNU General Public License along
|
|
|
|
# with this program; if not, write to the Free Software Foundation, Inc.,
|
|
|
|
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
|
|
|
2009-06-16 18:48:11 +00:00
|
|
|
import cPickle as pickle
import datetime
import os
import re
import sys
import warnings

from django.conf import settings
from django.core.exceptions import ImproperlyConfigured
from django.utils.encoding import smart_unicode, force_unicode

from haystack.backends import BaseSearchBackend, BaseSearchQuery
from haystack.exceptions import MissingDependency
from haystack.models import SearchResult
|
|
|
|
|
|
|
|
try:
|
|
|
|
import xapian
|
|
|
|
except ImportError:
|
|
|
|
raise MissingDependency("The 'xapian' backend requires the installation of 'xapian'. Please refer to the documentation.")
|
|
|
|
|
|
|
|
|
|
|
|
DEFAULT_MAX_RESULTS = 100000
|
|
|
|
|
|
|
|
DOCUMENT_ID_TERM_PREFIX = 'Q'
|
|
|
|
DOCUMENT_CUSTOM_TERM_PREFIX = 'X'
|
|
|
|
DOCUMENT_CT_TERM_PREFIX = DOCUMENT_CUSTOM_TERM_PREFIX + 'CONTENTTYPE'
|
|
|
|
|
2009-06-18 16:15:13 +00:00
|
|
|
field_re = re.compile(r'(?<=(?<!Z)X)([A-Z_]+)(\w+)')
|
|
|
|
|
|
|
|
|
2009-06-16 18:48:11 +00:00
|
|
|
class SearchBackend(BaseSearchBackend):
    """
    Xapian search backend for the Haystack search API for Django.

    The backend talks to Xapian through its Python bindings and is
    therefore subject to <http://trac.xapian.org/ticket/364> when Django
    runs under mod_python or mod_wsgi on Apache. Until Xapian resolves
    that ticket, set `WSGIApplicationGroup to %{GLOBAL}` (mod_wsgi) or
    `PythonInterpreter main_interpreter` (mod_python).

    `HAYSTACK_XAPIAN_PATH` must be defined in your settings and point to
    the directory where the indexes should live.
    """
    # Query-parser keywords that must be treated as reserved in user input.
    RESERVED_WORDS = (
        'AND',
        'NOT',
        'OR',
        'XOR',
        'NEAR',
        'ADJ',
    )

    # Characters with special meaning to the Xapian query parser.
    RESERVED_CHARACTERS = (
        '\\', '+', '-', '&&', '||', '!', '(', ')', '{', '}',
        '[', ']', '^', '"', '~', '*', '?', ':',
    )
2009-07-29 19:34:46 +00:00
|
|
|
def __init__(self, site=None, stemming_language='english'):
|
2009-06-18 16:15:13 +00:00
|
|
|
"""
|
|
|
|
Instantiates an instance of `SearchBackend`.
|
|
|
|
|
|
|
|
Optional arguments:
|
|
|
|
`site` -- The site to associate the backend with (default = None)
|
2009-07-29 19:34:46 +00:00
|
|
|
`stemming_language` -- The stemming language (default = 'english')
|
2009-06-18 16:15:13 +00:00
|
|
|
|
2009-07-29 19:34:46 +00:00
|
|
|
Also sets the stemming language to be used to `stemming_language`.
|
2009-06-18 16:15:13 +00:00
|
|
|
"""
|
2009-06-16 18:48:11 +00:00
|
|
|
super(SearchBackend, self).__init__(site)
|
2009-07-21 13:43:55 +00:00
|
|
|
|
2009-06-16 18:48:11 +00:00
|
|
|
if not hasattr(settings, 'HAYSTACK_XAPIAN_PATH'):
|
|
|
|
raise ImproperlyConfigured('You must specify a HAYSTACK_XAPIAN_PATH in your settings.')
|
|
|
|
|
2009-07-29 19:34:46 +00:00
|
|
|
if not os.path.exists(settings.HAYSTACK_XAPIAN_PATH):
|
|
|
|
os.makedirs(settings.HAYSTACK_XAPIAN_PATH)
|
|
|
|
|
2009-07-31 01:18:31 +00:00
|
|
|
self.stemmer = xapian.Stem(stemming_language)
|
2009-07-31 15:39:58 +00:00
|
|
|
|
|
|
|
def get_identifier(self, obj_or_string):
|
|
|
|
return DOCUMENT_ID_TERM_PREFIX + super(SearchBackend, self).get_identifier(obj_or_string)
|
2009-06-16 18:48:11 +00:00
|
|
|
|
|
|
|
def update(self, index, iterable):
|
2009-06-18 16:15:13 +00:00
|
|
|
"""
|
|
|
|
Updates the `index` with any objects in `iterable` by adding/updating
|
|
|
|
the database as needed.
|
|
|
|
|
|
|
|
Required arguments:
|
|
|
|
`index` -- The `SearchIndex` to process
|
|
|
|
`iterable` -- An iterable of model instances to index
|
|
|
|
|
|
|
|
For each object in `iterable`, a document is created containing all
|
|
|
|
of the terms extracted from `index.prepare(obj)` with stemming prefixes,
|
|
|
|
field prefixes, and 'as-is'.
|
|
|
|
|
|
|
|
eg. `content:Testing` ==> `testing, Ztest, ZXCONTENTtest`
|
|
|
|
|
2009-07-31 01:18:31 +00:00
|
|
|
Each document also contains an extra term in the format:
|
2009-06-18 16:15:13 +00:00
|
|
|
|
|
|
|
`XCONTENTTYPE<app_name>.<model_name>`
|
|
|
|
|
|
|
|
As well as a unique identifier in the the format:
|
|
|
|
|
|
|
|
`Q<app_name>.<model_name>.<pk>`
|
|
|
|
|
|
|
|
eg.: foo.bar (pk=1) ==> `Qfoo.bar.1`, `XCONTENTTYPEfoo.bar`
|
|
|
|
|
|
|
|
This is useful for querying for a specific document corresponding to
|
2009-07-31 01:18:31 +00:00
|
|
|
a model instance.
|
2009-06-18 16:15:13 +00:00
|
|
|
|
2009-07-31 01:18:31 +00:00
|
|
|
The document also contains a pickled version of the object itself and
|
|
|
|
the document ID in the document data field.
|
2009-06-18 16:15:13 +00:00
|
|
|
|
2009-07-09 18:04:06 +00:00
|
|
|
Finally, we also store field values to be used for sorting data. We
|
|
|
|
store these in the document value slots (position zero is reserver
|
|
|
|
for the document ID). All values are stored as unicode strings with
|
|
|
|
conversion of float, int, double, values being done by Xapian itself
|
|
|
|
through the use of the :method:xapian.sortable_serialise method.
|
2009-06-18 16:15:13 +00:00
|
|
|
"""
|
2009-07-31 01:18:31 +00:00
|
|
|
database = self._database(writable=True)
|
2009-06-16 18:48:11 +00:00
|
|
|
try:
|
|
|
|
for obj in iterable:
|
|
|
|
document = xapian.Document()
|
2009-07-31 01:18:31 +00:00
|
|
|
term_generator = self._term_generator(database, document)
|
|
|
|
document_id = self.get_identifier(obj)
|
|
|
|
model_data = index.prepare(obj)
|
2009-06-16 18:48:11 +00:00
|
|
|
|
2009-07-31 01:18:31 +00:00
|
|
|
for field in self.schema:
|
|
|
|
if field['field_name'] in model_data.keys():
|
2009-07-31 19:17:22 +00:00
|
|
|
prefix = DOCUMENT_CUSTOM_TERM_PREFIX + field['field_name'].upper()
|
2009-07-31 01:18:31 +00:00
|
|
|
value = model_data[field['field_name']]
|
2009-07-03 19:18:23 +00:00
|
|
|
data = self._from_python(value)
|
2009-07-31 01:18:31 +00:00
|
|
|
term_generator.index_text(data)
|
|
|
|
term_generator.index_text(data, 1, prefix)
|
2009-07-09 18:04:06 +00:00
|
|
|
if isinstance(value, (int, long, float)):
|
2009-07-31 01:18:31 +00:00
|
|
|
document.add_value(field['column'], xapian.sortable_serialise(value))
|
2009-07-09 18:04:06 +00:00
|
|
|
else:
|
2009-07-31 01:18:31 +00:00
|
|
|
document.add_value(field['column'], data)
|
2009-06-16 18:48:11 +00:00
|
|
|
|
2009-07-31 15:39:58 +00:00
|
|
|
document.set_data(pickle.dumps(
|
|
|
|
(obj._meta.app_label, obj._meta.module_name, obj.pk, model_data),
|
|
|
|
pickle.HIGHEST_PROTOCOL
|
|
|
|
))
|
|
|
|
document.add_term(document_id)
|
2009-06-18 16:15:13 +00:00
|
|
|
document.add_term(
|
|
|
|
DOCUMENT_CT_TERM_PREFIX + u'%s.%s' %
|
|
|
|
(obj._meta.app_label, obj._meta.module_name)
|
|
|
|
)
|
2009-07-31 15:39:58 +00:00
|
|
|
database.replace_document(document_id, document)
|
2009-06-16 18:48:11 +00:00
|
|
|
|
|
|
|
except UnicodeDecodeError:
|
|
|
|
sys.stderr.write('Chunk failed.\n')
|
|
|
|
pass
|
|
|
|
|
|
|
|
def remove(self, obj):
|
2009-06-18 16:15:13 +00:00
|
|
|
"""
|
|
|
|
Remove indexes for `obj` from the database.
|
|
|
|
|
|
|
|
We delete all instances of `Q<app_name>.<model_name>.<pk>` which
|
|
|
|
should be unique to this object.
|
|
|
|
"""
|
2009-07-31 15:39:58 +00:00
|
|
|
database = self._database(writable=True)
|
|
|
|
database.delete_document(self.get_identifier(obj))
|
2009-06-16 18:48:11 +00:00
|
|
|
|
|
|
|
def clear(self, models=[]):
|
2009-06-18 16:15:13 +00:00
|
|
|
"""
|
|
|
|
Clear all instances of `models` from the database or all models, if
|
|
|
|
not specified.
|
|
|
|
|
|
|
|
Optional Arguments:
|
|
|
|
`models` -- Models to clear from the database (default = [])
|
|
|
|
|
|
|
|
If `models` is empty, an empty query is executed which matches all
|
|
|
|
documents in the database. Afterwards, each match is deleted.
|
|
|
|
|
|
|
|
Otherwise, for each model, a `delete_document` call is issued with
|
|
|
|
the term `XCONTENTTYPE<app_name>.<model_name>`. This will delete
|
|
|
|
all documents with the specified model type.
|
|
|
|
"""
|
2009-07-31 15:39:58 +00:00
|
|
|
database = self._database(writable=True)
|
2009-06-16 18:48:11 +00:00
|
|
|
if not models:
|
2009-07-31 15:39:58 +00:00
|
|
|
query, __unused__ = self._query(database, '*')
|
|
|
|
enquire = self._enquire(database, query)
|
2009-06-16 18:48:11 +00:00
|
|
|
for match in enquire.get_mset(0, DEFAULT_MAX_RESULTS):
|
2009-07-31 15:39:58 +00:00
|
|
|
database.delete_document(match.get_docid())
|
2009-06-16 18:48:11 +00:00
|
|
|
else:
|
|
|
|
for model in models:
|
2009-07-31 15:39:58 +00:00
|
|
|
database.delete_document(
|
2009-06-18 16:15:13 +00:00
|
|
|
DOCUMENT_CT_TERM_PREFIX + '%s.%s' %
|
|
|
|
(model._meta.app_label, model._meta.module_name)
|
|
|
|
)
|
2009-06-16 18:48:11 +00:00
|
|
|
|
|
|
|
def search(self, query_string, sort_by=None, start_offset=0, end_offset=DEFAULT_MAX_RESULTS,
|
|
|
|
fields='', highlight=False, facets=None, date_facets=None, query_facets=None,
|
|
|
|
narrow_queries=None, **kwargs):
|
2009-06-18 16:15:13 +00:00
|
|
|
"""
|
|
|
|
Executes the search as defined in `query_string`.
|
|
|
|
|
|
|
|
Required arguments:
|
|
|
|
`query_string` -- Search query to execute
|
|
|
|
|
|
|
|
Optional arguments:
|
|
|
|
`sort_by` -- Sort results by specified field (default = None)
|
|
|
|
`start_offset` -- Slice results from `start_offset` (default = 0)
|
|
|
|
`end_offset` -- Slice results at `end_offset` (default = 10,000)
|
|
|
|
`fields` -- Filter results on `fields` (default = '')
|
|
|
|
`highlight` -- Highlight terms in results (default = False)
|
|
|
|
`facets` -- Facet results on fields (default = None)
|
|
|
|
`date_facets` -- Facet results on date ranges (default = None)
|
|
|
|
`query_facets` -- Facet results on queries (default = None)
|
|
|
|
`narrow_queries` -- Narrow queries (default = None)
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
A dictionary with the following keys:
|
|
|
|
`results` -- A list of `SearchResult`
|
|
|
|
`hits` -- The total available results
|
|
|
|
`facets` - A dictionary of facets with the following keys:
|
|
|
|
`fields` -- A list of field facets
|
|
|
|
`dates` -- A list of date facets
|
|
|
|
`queries` -- A list of query facets
|
|
|
|
If faceting was not used, the `facets` key will not be present
|
|
|
|
|
|
|
|
If `query_string` is empty, returns no results.
|
|
|
|
|
2009-07-03 19:44:01 +00:00
|
|
|
Otherwise, loads the available fields from the database meta data schema
|
2009-06-18 16:15:13 +00:00
|
|
|
and sets up prefixes for each one along with a prefix for `django_ct`,
|
|
|
|
used to filter by model, and loads the current stemmer instance.
|
|
|
|
|
|
|
|
Afterwards, executes the Xapian query parser to create a query from
|
|
|
|
`query_string` that is then passed to a new `enquire` instance.
|
|
|
|
|
|
|
|
The resulting match set is passed to :method:`_process_results` for
|
|
|
|
further processing prior to returning a dictionary with the results.
|
2009-06-20 11:32:18 +00:00
|
|
|
|
|
|
|
If `HAYSTACK_INCLUDE_SPELLING` was enabled in `settings.py`, the
|
|
|
|
extra flag `FLAG_SPELLING_CORRECTION` will be passed to the query parser
|
|
|
|
and any suggestions for spell correction will be returned as well as
|
|
|
|
the results.
|
2009-06-18 16:15:13 +00:00
|
|
|
"""
|
2009-06-16 18:48:11 +00:00
|
|
|
if not query_string:
|
|
|
|
return {
|
|
|
|
'results': [],
|
|
|
|
'hits': 0,
|
|
|
|
}
|
|
|
|
|
|
|
|
if date_facets is not None:
|
|
|
|
warnings.warn("Date faceting has not been implemented yet.", Warning, stacklevel=2)
|
|
|
|
|
|
|
|
if query_facets is not None:
|
|
|
|
warnings.warn("Query faceting has not been implemented yet.", Warning, stacklevel=2)
|
|
|
|
|
2009-07-31 15:39:58 +00:00
|
|
|
database = self._database()
|
|
|
|
query, spelling_suggestion = self._query(database, query_string, narrow_queries)
|
|
|
|
enquire = self._enquire(database, query)
|
2009-07-24 20:15:34 +00:00
|
|
|
|
|
|
|
if sort_by:
|
2009-07-31 15:39:58 +00:00
|
|
|
sorter = self._sorter(sort_by)
|
2009-07-24 20:15:34 +00:00
|
|
|
enquire.set_sort_by_key_then_relevance(sorter, True)
|
|
|
|
|
2009-07-31 15:39:58 +00:00
|
|
|
results = []
|
|
|
|
facets_dict = {
|
|
|
|
'fields': {},
|
|
|
|
'dates': {},
|
|
|
|
'queries': {},
|
|
|
|
}
|
2009-06-16 18:48:11 +00:00
|
|
|
matches = enquire.get_mset(start_offset, end_offset)
|
2009-06-20 11:32:18 +00:00
|
|
|
|
2009-07-31 15:39:58 +00:00
|
|
|
for match in matches:
|
|
|
|
document = match.get_document()
|
|
|
|
app_label, module_name, pk, model_data = pickle.loads(document.get_data())
|
|
|
|
results.append(
|
|
|
|
SearchResult(app_label, module_name, pk, match.weight, **model_data)
|
|
|
|
)
|
|
|
|
if facets:
|
|
|
|
facets_dict['fields'] = self._do_field_facets(
|
|
|
|
document, facets, facets_dict['fields']
|
|
|
|
)
|
|
|
|
if highlight and (len(query_string) > 0):
|
|
|
|
model_data['highlighted'] = {
|
|
|
|
self.content_field_name: self._do_highlight(
|
|
|
|
model_data.get(self.content_field_name), query_string
|
|
|
|
)
|
|
|
|
}
|
2009-06-16 18:48:11 +00:00
|
|
|
|
2009-07-31 15:39:58 +00:00
|
|
|
return {
|
|
|
|
'results': results,
|
|
|
|
'hits': matches.get_matches_estimated(),
|
|
|
|
'facets': facets_dict,
|
|
|
|
'spelling_suggestion': spelling_suggestion,
|
|
|
|
}
|
2009-06-16 18:48:11 +00:00
|
|
|
|
|
|
|
def delete_index(self):
|
2009-06-18 16:15:13 +00:00
|
|
|
"""
|
|
|
|
Delete the index.
|
|
|
|
|
|
|
|
This removes all indexes files and the `HAYSTACK_XAPIAN_PATH` folder.
|
|
|
|
"""
|
2009-07-29 19:34:46 +00:00
|
|
|
if os.path.exists(settings.HAYSTACK_XAPIAN_PATH):
|
|
|
|
index_files = os.listdir(settings.HAYSTACK_XAPIAN_PATH)
|
2009-06-16 18:48:11 +00:00
|
|
|
for index_file in index_files:
|
2009-07-29 19:34:46 +00:00
|
|
|
os.remove(os.path.join(settings.HAYSTACK_XAPIAN_PATH, index_file))
|
|
|
|
os.removedirs(settings.HAYSTACK_XAPIAN_PATH)
|
2009-06-16 18:48:11 +00:00
|
|
|
|
|
|
|
def document_count(self):
|
2009-06-18 16:15:13 +00:00
|
|
|
"""
|
|
|
|
Retrieves the total document count for the search index.
|
|
|
|
"""
|
2009-06-16 18:48:11 +00:00
|
|
|
try:
|
2009-07-31 15:39:58 +00:00
|
|
|
database = self._database()
|
2009-06-16 18:48:11 +00:00
|
|
|
except xapian.DatabaseOpeningError:
|
|
|
|
return 0
|
2009-07-31 15:39:58 +00:00
|
|
|
return database.get_doccount()
|
2009-06-16 18:48:11 +00:00
|
|
|
|
2009-06-17 20:54:39 +00:00
|
|
|
def more_like_this(self, model_instance):
|
2009-06-18 16:15:13 +00:00
|
|
|
"""
|
|
|
|
Given a model instance, returns a result set of similar documents.
|
|
|
|
|
|
|
|
Required arguments:
|
|
|
|
`model_instance` -- The model instance to use as a basis for
|
|
|
|
retrieving similar documents.
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
A dictionary with the following keys:
|
|
|
|
`results` -- A list of `SearchResult`
|
|
|
|
`hits` -- The total available results
|
|
|
|
|
|
|
|
Opens a database connection, then builds a simple query using the
|
|
|
|
`model_instance` to build the unique identifier.
|
|
|
|
|
|
|
|
For each document retrieved(should always be one), adds an entry into
|
|
|
|
an RSet (relevance set) with the document id, then, uses the RSet
|
|
|
|
to query for an ESet (A set of terms that can be used to suggest
|
|
|
|
expansions to the original query), omitting any document that was in
|
|
|
|
the original query.
|
|
|
|
|
|
|
|
Finally, processes the resulting matches and returns.
|
|
|
|
"""
|
2009-07-31 15:39:58 +00:00
|
|
|
database = self._database()
|
2009-06-18 16:15:13 +00:00
|
|
|
query = xapian.Query(
|
|
|
|
DOCUMENT_ID_TERM_PREFIX + self.get_identifier(model_instance)
|
|
|
|
)
|
2009-07-31 15:39:58 +00:00
|
|
|
enquire = self._enquire(database, query)
|
2009-06-17 20:54:39 +00:00
|
|
|
rset = xapian.RSet()
|
|
|
|
for match in enquire.get_mset(0, DEFAULT_MAX_RESULTS):
|
|
|
|
rset.add_document(match.get_docid())
|
2009-06-17 23:30:25 +00:00
|
|
|
query = xapian.Query(xapian.Query.OP_OR,
|
2009-06-17 20:54:39 +00:00
|
|
|
[expand.term for expand in enquire.get_eset(DEFAULT_MAX_RESULTS, rset)]
|
|
|
|
)
|
2009-06-17 23:30:25 +00:00
|
|
|
query = xapian.Query(xapian.Query.OP_AND_NOT,
|
|
|
|
[query, DOCUMENT_ID_TERM_PREFIX + self.get_identifier(model_instance)]
|
|
|
|
)
|
2009-06-17 20:54:39 +00:00
|
|
|
enquire.set_query(query)
|
|
|
|
matches = enquire.get_mset(0, DEFAULT_MAX_RESULTS)
|
|
|
|
return self._process_results(matches)
|
|
|
|
|
2009-07-27 19:12:20 +00:00
|
|
|
def _process_results(self, matches, query_string='', highlight=False, facets=None):
|
2009-06-18 16:15:13 +00:00
|
|
|
"""
|
|
|
|
Private method for processing an MSet (match set).
|
|
|
|
|
|
|
|
Required arguments:
|
|
|
|
`matches` -- An MSet of matches
|
|
|
|
|
|
|
|
Optional arguments:
|
2009-07-27 19:12:20 +00:00
|
|
|
`query_string` -- The query string that generated the matches
|
|
|
|
`highlight` -- Add highlighting to results? (default=False)
|
2009-06-18 16:15:13 +00:00
|
|
|
`facets` -- Fields to facet (default = None)
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
A dictionary with the following keys:
|
|
|
|
`results` -- A list of `SearchResult`
|
|
|
|
`hits` -- The total available results
|
|
|
|
`facets` - A dictionary of facets with the following keys:
|
|
|
|
`fields` -- A list of field facets
|
|
|
|
`dates` -- A list of date facets
|
|
|
|
`queries` -- A list of query facets
|
|
|
|
If faceting was not used, the `facets` key will not be present
|
|
|
|
|
|
|
|
For each match in the `matches`, retrieves the corresponding document
|
|
|
|
and extracts the `app_name`, `model_name`, and `pk` from the information
|
|
|
|
at value position 0, and :method:pickle.loads the remaining model
|
|
|
|
values from the document data area.
|
|
|
|
|
|
|
|
For each match, one `SearchResult` will be appended to the `results`
|
|
|
|
list.
|
|
|
|
"""
|
2009-06-16 18:48:11 +00:00
|
|
|
facets_dict = {
|
|
|
|
'fields': {},
|
|
|
|
'dates': {},
|
|
|
|
'queries': {},
|
|
|
|
}
|
|
|
|
results = []
|
|
|
|
hits = matches.get_matches_estimated()
|
|
|
|
|
|
|
|
for match in matches:
|
|
|
|
document = match.get_document()
|
|
|
|
app_label, module_name, pk = document.get_value(0).split('.')
|
|
|
|
additional_fields = pickle.loads(document.get_data())
|
2009-07-27 19:12:20 +00:00
|
|
|
if highlight and (len(query_string) > 0):
|
|
|
|
additional_fields['highlighted'] = {
|
|
|
|
self.content_field_name: self._do_highlight(
|
|
|
|
additional_fields.get(self.content_field_name), query_string
|
|
|
|
)
|
|
|
|
}
|
2009-06-16 18:48:11 +00:00
|
|
|
result = SearchResult(
|
|
|
|
app_label, module_name, pk, match.weight, **additional_fields
|
|
|
|
)
|
|
|
|
results.append(result)
|
|
|
|
|
|
|
|
if facets:
|
2009-06-18 16:15:13 +00:00
|
|
|
facets_dict['fields'] = self._do_field_facets(
|
|
|
|
document, facets, facets_dict['fields']
|
|
|
|
)
|
2009-06-20 11:32:18 +00:00
|
|
|
|
2009-06-16 18:48:11 +00:00
|
|
|
return {
|
|
|
|
'results': results,
|
|
|
|
'hits': hits,
|
|
|
|
'facets': facets_dict,
|
|
|
|
}
|
|
|
|
|
2009-07-27 19:12:20 +00:00
|
|
|
def _do_highlight(self, content, text, tag='em'):
|
|
|
|
"""
|
|
|
|
Highlight `text` in `content` with html `tag`.
|
|
|
|
|
|
|
|
This method assumes that the input text (`content`) does not contain
|
|
|
|
any special formatting. That is, it does not contain any html tags
|
|
|
|
or similar markup that could be screwed up by the highlighting.
|
|
|
|
|
|
|
|
Required arguments:
|
|
|
|
`content` -- Content to search for instances of `text`
|
|
|
|
`text` -- The text to be highlighted
|
|
|
|
"""
|
|
|
|
for term in [term.replace('*', '') for term in text.split()]:
|
|
|
|
term_re = re.compile(re.escape(term), re.IGNORECASE)
|
|
|
|
content = term_re.sub('<%s>%s</%s>' % (tag, term, tag), content)
|
|
|
|
return content
|
|
|
|
|
2009-06-16 18:48:11 +00:00
|
|
|
def _do_field_facets(self, document, facets, fields):
|
2009-06-18 16:15:13 +00:00
|
|
|
"""
|
|
|
|
Private method that facets a document by field name.
|
|
|
|
|
|
|
|
Required arguments:
|
|
|
|
`document` -- The document to parse
|
|
|
|
`facets` -- A list of facets to use when faceting
|
|
|
|
`fields` -- A list of fields that have already been faceted. This
|
|
|
|
will be extended with any new field names and counts
|
|
|
|
found in the `document`.
|
|
|
|
|
|
|
|
For each term in the document, extract the field name and determine
|
|
|
|
if it is one of the `facets` we want. If so, verify if it already in
|
|
|
|
the `fields` list. If it is, update the count, otherwise, add it and
|
|
|
|
set the count to 1.
|
|
|
|
"""
|
|
|
|
for term in [(term.term, term.termfreq) for term in document]:
|
2009-06-16 18:48:11 +00:00
|
|
|
match = field_re.search(term[0])
|
|
|
|
if match and match.group(1).lower() in facets:
|
|
|
|
if match.group(1).lower() in fields:
|
|
|
|
fields[match.group(1).lower()] += [(match.group(2), term[1])]
|
|
|
|
else:
|
|
|
|
fields[match.group(1).lower()] = [(match.group(2), term[1])]
|
|
|
|
return fields
|
|
|
|
|
|
|
|
def _from_python(self, value):
|
|
|
|
"""
|
2009-07-31 15:39:58 +00:00
|
|
|
Private method that converts Python values to a string for Xapian.
|
2009-06-16 18:48:11 +00:00
|
|
|
"""
|
|
|
|
if isinstance(value, datetime.datetime):
|
2009-07-22 15:30:22 +00:00
|
|
|
if value.microsecond:
|
|
|
|
value = u'%04d%02d%02d%02d%02d%02d%06d' % (
|
|
|
|
value.year, value.month, value.day, value.hour,
|
|
|
|
value.minute, value.second, value.microsecond
|
|
|
|
)
|
|
|
|
else:
|
|
|
|
value = u'%04d%02d%02d%02d%02d%02d' % (
|
|
|
|
value.year, value.month, value.day, value.hour,
|
|
|
|
value.minute, value.second
|
|
|
|
)
|
2009-06-16 18:48:11 +00:00
|
|
|
elif isinstance(value, datetime.date):
|
2009-07-22 15:30:22 +00:00
|
|
|
value = u'%04d%02d%02d000000' % (value.year, value.month, value.day)
|
2009-06-16 18:48:11 +00:00
|
|
|
elif isinstance(value, bool):
|
|
|
|
if value:
|
2009-07-21 17:47:13 +00:00
|
|
|
value = u't'
|
2009-06-16 18:48:11 +00:00
|
|
|
else:
|
2009-07-21 17:47:13 +00:00
|
|
|
value = u'f'
|
2009-06-16 18:48:11 +00:00
|
|
|
else:
|
|
|
|
value = force_unicode(value)
|
|
|
|
return value
|
|
|
|
|
2009-07-31 01:18:31 +00:00
|
|
|
def _database(self, writable=False):
|
2009-07-21 17:11:33 +00:00
|
|
|
"""
|
2009-07-31 01:18:31 +00:00
|
|
|
Private method that returns a xapian.Database for use and sets up
|
|
|
|
schema and content_field definitions.
|
|
|
|
|
|
|
|
Optional arguments:
|
|
|
|
``writable`` -- Open the database in read/write mode (default=False)
|
|
|
|
|
|
|
|
Returns an instance of a xapian.Database or xapian.WritableDatabase
|
|
|
|
"""
|
|
|
|
if writable:
|
|
|
|
self.content_field_name, fields = self.site.build_unified_schema()
|
|
|
|
self.schema = self._build_schema(fields)
|
|
|
|
|
|
|
|
database = xapian.WritableDatabase(settings.HAYSTACK_XAPIAN_PATH, xapian.DB_CREATE_OR_OPEN)
|
|
|
|
database.set_metadata('schema', pickle.dumps(self.schema, pickle.HIGHEST_PROTOCOL))
|
|
|
|
database.set_metadata('content', pickle.dumps(self.content_field_name, pickle.HIGHEST_PROTOCOL))
|
|
|
|
else:
|
|
|
|
database = xapian.Database(settings.HAYSTACK_XAPIAN_PATH)
|
|
|
|
|
|
|
|
self.schema = pickle.loads(database.get_metadata('schema'))
|
|
|
|
self.content_field_name = pickle.loads(database.get_metadata('content'))
|
|
|
|
|
|
|
|
return database
|
|
|
|
|
|
|
|
def _term_generator(self, database, document):
|
|
|
|
"""
|
|
|
|
Private method that returns a Xapian.TermGenerator
|
2009-07-21 17:44:50 +00:00
|
|
|
|
|
|
|
Required Argument:
|
|
|
|
`document` -- The document to be indexed
|
|
|
|
|
2009-07-31 15:39:58 +00:00
|
|
|
Returns a Xapian.TermGenerator instance. If `HAYSTACK_INCLUDE_SPELLING`
|
|
|
|
is True, then the term generator will have spell-checking enabled.
|
2009-07-21 17:44:50 +00:00
|
|
|
"""
|
2009-07-31 01:18:31 +00:00
|
|
|
term_generator = xapian.TermGenerator()
|
|
|
|
term_generator.set_database(database)
|
|
|
|
term_generator.set_stemmer(self.stemmer)
|
|
|
|
if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False) is True:
|
|
|
|
term_generator.set_flags(xapian.TermGenerator.FLAG_SPELLING)
|
|
|
|
term_generator.set_document(document)
|
|
|
|
return term_generator
|
2009-07-21 17:44:50 +00:00
|
|
|
|
2009-07-31 15:39:58 +00:00
|
|
|
def _query(self, database, query_string, narrow_queries=None):
|
2009-07-22 20:47:30 +00:00
|
|
|
"""
|
2009-07-31 15:39:58 +00:00
|
|
|
Private method that takes a query string and returns a xapian.Query
|
|
|
|
|
|
|
|
Required arguments:
|
|
|
|
`query_string` -- The query string to parse
|
|
|
|
|
|
|
|
Optional arguments:
|
|
|
|
`narrow_queries` -- A list of queries to narrow the query with
|
|
|
|
|
|
|
|
Returns a xapian.Query instance
|
|
|
|
"""
|
|
|
|
spelling_suggestion = None
|
|
|
|
|
|
|
|
if query_string == '*':
|
|
|
|
query = xapian.Query('') # Make '*' match everything
|
|
|
|
else:
|
|
|
|
flags = self._flags()
|
|
|
|
qp = self._query_parser(database)
|
|
|
|
query = qp.parse_query(query_string, flags)
|
|
|
|
if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False) is True:
|
|
|
|
spelling_suggestion = qp.get_corrected_query_string()
|
|
|
|
|
|
|
|
if narrow_queries:
|
|
|
|
subqueries = [qp.parse_query(narrow_query, flags) for narrow_query in narrow_queries]
|
|
|
|
query = xapian.Query(xapian.Query.OP_FILTER, query, xapian.Query(xapian.Query.OP_AND, subqueries))
|
|
|
|
|
|
|
|
return query, spelling_suggestion
|
|
|
|
|
|
|
|
def _sorter(self, sort_by):
|
|
|
|
"""
|
|
|
|
Private methos that takes a list of fields to sort by and returns a
|
|
|
|
xapian.MultiValueSorter
|
2009-07-22 20:47:30 +00:00
|
|
|
|
|
|
|
Required Arguments:
|
|
|
|
`sort_by` -- A list of fields to sort by
|
|
|
|
|
|
|
|
Returns a xapian.MultiValueSorter instance
|
|
|
|
"""
|
|
|
|
sorter = xapian.MultiValueSorter()
|
2009-07-31 15:39:58 +00:00
|
|
|
|
2009-07-22 20:47:30 +00:00
|
|
|
for sort_field in sort_by:
|
|
|
|
if sort_field.startswith('-'):
|
|
|
|
reverse = False
|
|
|
|
sort_field = sort_field[1:] # Strip the '-'
|
|
|
|
else:
|
|
|
|
reverse = True # Reverse is inverted in Xapian -- http://trac.xapian.org/ticket/311
|
2009-07-31 15:39:58 +00:00
|
|
|
sorter.add(self._value_column(sort_field), reverse)
|
|
|
|
|
2009-07-22 20:47:30 +00:00
|
|
|
return sorter
|
|
|
|
|
2009-07-31 15:39:58 +00:00
|
|
|
def _flags(self):
|
2009-07-29 19:34:46 +00:00
|
|
|
"""
|
|
|
|
Returns the commonly used Xapian.QueryParser flags
|
|
|
|
"""
|
2009-07-22 20:47:30 +00:00
|
|
|
flags = xapian.QueryParser.FLAG_PARTIAL \
|
|
|
|
| xapian.QueryParser.FLAG_PHRASE \
|
|
|
|
| xapian.QueryParser.FLAG_BOOLEAN \
|
|
|
|
| xapian.QueryParser.FLAG_LOVEHATE \
|
2009-07-30 12:54:46 +00:00
|
|
|
| xapian.QueryParser.FLAG_WILDCARD \
|
|
|
|
| xapian.QueryParser.FLAG_PURE_NOT
|
2009-07-22 20:47:30 +00:00
|
|
|
if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False) is True:
|
|
|
|
flags = flags | xapian.QueryParser.FLAG_SPELLING_CORRECTION
|
|
|
|
return flags
|
|
|
|
|
2009-07-31 15:39:58 +00:00
|
|
|
def _query_parser(self, database):
|
2009-07-24 19:27:43 +00:00
|
|
|
"""
|
2009-07-31 15:39:58 +00:00
|
|
|
Private method that returns a Xapian.QueryParser instance.
|
|
|
|
|
|
|
|
Required arguments:
|
|
|
|
`database` -- The database to be queried
|
2009-07-24 20:05:31 +00:00
|
|
|
|
2009-07-24 19:27:43 +00:00
|
|
|
The query parser returned will have stemming enabled, a boolean prefix
|
2009-07-31 15:39:58 +00:00
|
|
|
for `django_ct`, and prefixes for all of the fields in the `self.schema`.
|
2009-07-24 19:27:43 +00:00
|
|
|
"""
|
|
|
|
qp = xapian.QueryParser()
|
2009-07-31 15:39:58 +00:00
|
|
|
qp.set_database(database)
|
2009-07-24 19:27:43 +00:00
|
|
|
qp.set_stemmer(self.stemmer)
|
|
|
|
qp.set_stemming_strategy(xapian.QueryParser.STEM_SOME)
|
|
|
|
qp.add_boolean_prefix('django_ct', DOCUMENT_CT_TERM_PREFIX)
|
2009-07-31 01:18:31 +00:00
|
|
|
for field_dict in self.schema:
|
|
|
|
qp.add_prefix(field_dict['field_name'], DOCUMENT_CUSTOM_TERM_PREFIX + field_dict['field_name'].upper())
|
2009-07-24 19:27:43 +00:00
|
|
|
return qp
|
2009-06-16 18:48:11 +00:00
|
|
|
|
2009-07-31 15:39:58 +00:00
|
|
|
def _enquire(self, database, query):
|
2009-07-24 20:05:31 +00:00
|
|
|
"""
|
2009-07-31 15:39:58 +00:00
|
|
|
Private method that that returns a Xapian.Enquire instance for use with
|
|
|
|
the specifed `query`.
|
2009-07-24 20:05:31 +00:00
|
|
|
|
|
|
|
Required Arguments:
|
|
|
|
`query` -- The query to run
|
|
|
|
|
|
|
|
Returns a xapian.Enquire instance
|
|
|
|
"""
|
2009-07-31 15:39:58 +00:00
|
|
|
enquire = xapian.Enquire(database)
|
2009-07-24 20:05:31 +00:00
|
|
|
enquire.set_query(query)
|
|
|
|
enquire.set_docid_order(enquire.ASCENDING)
|
2009-07-31 15:39:58 +00:00
|
|
|
|
2009-07-24 20:05:31 +00:00
|
|
|
return enquire
|
|
|
|
|
2009-07-29 19:34:46 +00:00
|
|
|
def _build_schema(self, fields):
|
|
|
|
"""
|
|
|
|
Private method to build a schema.
|
|
|
|
|
|
|
|
Required arguments:
|
|
|
|
``fields`` -- A list of fields in the index
|
|
|
|
|
|
|
|
Returns a list of fields in dictionary format ready for inclusion in
|
|
|
|
an indexe meta-data.
|
|
|
|
"""
|
|
|
|
for i, field in enumerate(fields):
|
|
|
|
if field['indexed'] == 'true':
|
|
|
|
field['column'] = i
|
|
|
|
else:
|
|
|
|
del field
|
|
|
|
return fields
|
|
|
|
|
2009-07-31 15:39:58 +00:00
|
|
|
def _value_column(self, field):
|
|
|
|
"""
|
|
|
|
Private method that returns the column value slot in the database
|
|
|
|
for a given field.
|
|
|
|
|
|
|
|
Required arguemnts:
|
|
|
|
`field` -- The field to lookup
|
|
|
|
|
|
|
|
Returns an integer with the column location (0 indexed).
|
|
|
|
"""
|
|
|
|
for field_dict in self.schema:
|
|
|
|
if field_dict['field_name'] == field:
|
|
|
|
return field_dict['column']
|
|
|
|
return 0
|
|
|
|
|
|
|
|
|
2009-07-27 19:12:20 +00:00
|
|
|
|
2009-06-16 18:48:11 +00:00
|
|
|
class SearchQuery(BaseSearchQuery):
    """
    Converts Haystack search queries into a format that Xapian can
    understand.

    Most of the work is done by :method:`build_query`.
    """
|
2009-06-16 18:48:11 +00:00
|
|
|
def __init__(self, backend=None):
|
2009-06-18 16:15:13 +00:00
|
|
|
"""
|
|
|
|
Create a new instance of the SearchQuery setting the backend as
|
|
|
|
specified. If no backend is set, will use the Xapian `SearchBackend`.
|
|
|
|
|
|
|
|
Optional arguments:
|
|
|
|
`backend` -- The `SearchBackend` to use (default = None)
|
|
|
|
"""
|
2009-06-16 18:48:11 +00:00
|
|
|
super(SearchQuery, self).__init__(backend=backend)
|
|
|
|
self.backend = backend or SearchBackend()
|
|
|
|
|
|
|
|
def build_query(self):
|
2009-06-18 16:15:13 +00:00
|
|
|
"""
|
|
|
|
Builds a search query from previously set values, returning a query
|
|
|
|
string in a format ready for use by the Xapian `SearchBackend`.
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
A query string suitable for parsing by Xapian.
|
|
|
|
"""
|
2009-06-16 18:48:11 +00:00
|
|
|
query = ''
|
|
|
|
|
|
|
|
if not self.query_filters:
|
|
|
|
query = '*'
|
|
|
|
else:
|
|
|
|
query_chunks = []
|
|
|
|
|
|
|
|
for the_filter in self.query_filters:
|
|
|
|
if the_filter.is_and():
|
|
|
|
query_chunks.append('AND')
|
|
|
|
|
|
|
|
if the_filter.is_not():
|
|
|
|
query_chunks.append('NOT')
|
|
|
|
|
|
|
|
if the_filter.is_or():
|
|
|
|
query_chunks.append('OR')
|
|
|
|
|
|
|
|
value = the_filter.value
|
|
|
|
|
|
|
|
if not isinstance(value, (list, tuple)):
|
|
|
|
# Convert whatever we find to what xapian wants.
|
|
|
|
value = self.backend._from_python(value)
|
|
|
|
|
|
|
|
# Check to see if it's a phrase for an exact match.
|
|
|
|
if ' ' in value:
|
|
|
|
value = '"%s"' % value
|
|
|
|
|
|
|
|
# 'content' is a special reserved word, much like 'pk' in
|
|
|
|
# Django's ORM layer. It indicates 'no special field'.
|
|
|
|
if the_filter.field == 'content':
|
|
|
|
query_chunks.append(value)
|
|
|
|
else:
|
|
|
|
filter_types = {
|
|
|
|
'exact': "%s:%s",
|
|
|
|
'gt': "%s:%s..*",
|
|
|
|
'gte': "NOT %s:*..%s",
|
|
|
|
'lt': "%s:*..%s",
|
|
|
|
'lte': "NOT %s:%s..*",
|
|
|
|
'startswith': "%s:%s*",
|
|
|
|
}
|
|
|
|
|
|
|
|
if the_filter.filter_type != 'in':
|
|
|
|
query_chunks.append(filter_types[the_filter.filter_type] % (the_filter.field, value))
|
|
|
|
else:
|
|
|
|
in_options = []
|
|
|
|
|
|
|
|
for possible_value in value:
|
|
|
|
in_options.append("%s:%s" % (the_filter.field, possible_value))
|
|
|
|
|
|
|
|
query_chunks.append("(%s)" % " OR ".join(in_options))
|
|
|
|
|
|
|
|
if query_chunks[0] in ('AND', 'OR'):
|
|
|
|
# Pull off an undesirable leading "AND" or "OR".
|
|
|
|
del(query_chunks[0])
|
|
|
|
|
|
|
|
query = " ".join(query_chunks)
|
|
|
|
|
|
|
|
if len(self.models):
|
|
|
|
models = ['django_ct:%s.%s' % (model._meta.app_label, model._meta.module_name) for model in self.models]
|
|
|
|
models_clause = ' '.join(models)
|
|
|
|
final_query = '(%s) %s' % (query, models_clause)
|
|
|
|
|
|
|
|
else:
|
|
|
|
final_query = query
|
|
|
|
|
|
|
|
# print final_query
|
|
|
|
|
|
|
|
# TODO: Implement boost
|
|
|
|
# if self.boost:
|
|
|
|
# boost_list = []
|
|
|
|
#
|
|
|
|
# for boost_word, boost_value in self.boost.items():
|
|
|
|
# boost_list.append("%s^%s" % (boost_word, boost_value))
|
|
|
|
#
|
|
|
|
# final_query = "%s %s" % (final_query, " ".join(boost_list))
|
|
|
|
|
2009-06-25 12:51:31 +00:00
|
|
|
return final_query
|