2009-06-16 18:48:11 +00:00
|
|
|
import datetime
import functools
import os
import pickle
import re
import shutil
import sys
from pathlib import Path

from django.conf import settings
from django.core.exceptions import ImproperlyConfigured
from filelock import FileLock

from haystack import connections
from haystack.backends import BaseEngine, BaseSearchBackend, BaseSearchQuery, SearchNode, log_query
from haystack.constants import ID, DJANGO_ID, DJANGO_CT, DEFAULT_OPERATOR
from haystack.exceptions import HaystackError, MissingDependency
from haystack.inputs import AutoQuery
from haystack.models import SearchResult
from haystack.utils import get_identifier, get_model_ct
|
2009-06-16 18:48:11 +00:00
|
|
|
|
2014-09-25 03:08:31 +00:00
|
|
|
# n-gram size bounds used by the 'ngram' and 'edge_ngram' field types.
NGRAM_MIN_LENGTH = 2
NGRAM_MAX_LENGTH = 15

# Matches runs of 239+ non-whitespace bytes, i.e. "long terms"
# (presumably to stay below Xapian's term-length limit -- TODO confirm).
# Raw bytes literal: ``\s`` is a regex escape, not a string escape
# (``b'[^\s]{239,}'`` triggers an invalid-escape SyntaxWarning on
# modern Python even though the resulting pattern is the same).
LONG_TERM = re.compile(rb'[^\s]{239,}')
# how long terms are handled (default: truncated to LONG_TERM_LENGTH).
LONG_TERM_METHOD = getattr(settings, 'XAPIAN_LONG_TERM_METHOD', 'truncate')
LONG_TERM_LENGTH = getattr(settings, 'XAPIAN_LONG_TERM_LENGTH', 240)
|
|
|
|
|
2009-06-16 18:48:11 +00:00
|
|
|
try:
    import xapian
except ImportError:
    raise MissingDependency("The 'xapian' backend requires the installation of 'Xapian'. "
                            "Please refer to the documentation.")


# this maps the different reserved fields to prefixes used to
# create the database:
# id str: unique document id.
# django_id int: id of the django model instance.
# django_ct str: of the content type of the django model.
# field str: name of the field of the index.
TERM_PREFIXES = {
    ID: 'Q',
    DJANGO_ID: 'QQ',
    DJANGO_CT: 'CONTENTTYPE',
    'field': 'X'
}

# sentinel `PATH` value selecting an in-memory (non-persistent) database.
MEMORY_DB_NAME = ':memory:'

# default QueryParser flags; can be overridden per-connection via 'FLAGS'.
DEFAULT_XAPIAN_FLAGS = (
    xapian.QueryParser.FLAG_PHRASE |
    xapian.QueryParser.FLAG_BOOLEAN |
    xapian.QueryParser.FLAG_LOVEHATE |
    xapian.QueryParser.FLAG_WILDCARD |
    xapian.QueryParser.FLAG_PURE_NOT
)

# Mapping from `HAYSTACK_DEFAULT_OPERATOR` to Xapian operators
XAPIAN_OPTS = {'AND': xapian.Query.OP_AND,
               'OR': xapian.Query.OP_OR,
               'PHRASE': xapian.Query.OP_PHRASE,
               'NEAR': xapian.Query.OP_NEAR
               }

# number of documents checked by default when building facets
# this must be improved to be relative to the total number of docs.
DEFAULT_CHECK_AT_LEAST = 1000

# field types accepted to be serialized as values in Xapian
FIELD_TYPES = {'text', 'integer', 'date', 'datetime', 'float', 'boolean',
               'edge_ngram', 'ngram'}

# defines the format used to store types in Xapian
# this format ensures datetimes are sorted correctly
DATETIME_FORMAT = '%Y%m%d%H%M%S'
INTEGER_FORMAT = '%012d'

# defines the distance given between
# texts with positional information
TERMPOS_DISTANCE = 100
|
2009-06-18 16:15:13 +00:00
|
|
|
|
2022-02-10 19:44:21 +00:00
|
|
|
|
|
|
|
def filelocked(func):
    """Decorator serialising a XapianSearchBackend write method behind
    the backend's file lock.

    Locking is skipped for in-memory databases and when the backend was
    configured with ``use_lockfile = False``.  Otherwise the lock file
    (and any missing parent directories) is created before acquiring it.

    Fixes over the previous version: the wrapped function's return value
    is propagated instead of being discarded, and `functools.wraps`
    preserves the decorated method's name/docstring.
    """
    @functools.wraps(func)
    def wrapper(self, *args, **kwargs):
        """Run `func` inside the file lock, if one is required."""
        if self.path == MEMORY_DB_NAME or not self.use_lockfile:
            # nothing on disk to protect -- run directly
            return func(self, *args, **kwargs)

        lockfile = Path(self.filelock.lock_file)
        # ensure the lock file exists before trying to acquire it
        lockfile.parent.mkdir(parents=True, exist_ok=True)
        lockfile.touch()
        with self.filelock:
            return func(self, *args, **kwargs)

    return wrapper
|
|
|
|
|
|
|
|
|
2009-12-08 14:56:57 +00:00
|
|
|
class InvalidIndexError(HaystackError):
    """Raised when an index cannot be opened."""
|
|
|
|
|
|
|
|
|
2009-12-04 19:42:06 +00:00
|
|
|
class XHValueRangeProcessor(xapian.ValueRangeProcessor):
    """
    A Processor to construct ranges of values
    """
    def __init__(self, backend):
        # backend is kept to look up field types/columns in its schema
        self.backend = backend
        xapian.ValueRangeProcessor.__init__(self)

    def __call__(self, begin, end):
        """
        Construct a tuple for value range processing.
        `begin` -- a string in the format '<field_name>:[low_range]'
        If 'low_range' is omitted, assume the smallest possible value.
        `end` -- a string in the format '[high_range|*]'. If '*', assume
        the highest possible value.
        Return a tuple of three strings: (column, low, high)
        """
        colon = begin.find(':')
        field_name = begin[:colon]
        begin = begin[colon + 1:len(begin)]
        for field_dict in self.backend.schema:
            if field_dict['field_name'] == field_name:
                field_type = field_dict['type']

                # no lower bound given: substitute the smallest
                # representable value for this field type
                if not begin:
                    if field_type == 'text':
                        begin = 'a'  # TODO: A better way of getting a min text value?
                    elif field_type == 'integer':
                        begin = -sys.maxsize - 1
                    elif field_type == 'float':
                        begin = float('-inf')
                    elif field_type == 'date' or field_type == 'datetime':
                        begin = '00010101000000'
                # '*' upper bound: substitute the largest
                # representable value for this field type
                elif end == '*':
                    if field_type == 'text':
                        end = 'z' * 100  # TODO: A better way of getting a max text value?
                    elif field_type == 'integer':
                        end = sys.maxsize
                    elif field_type == 'float':
                        end = float('inf')
                    elif field_type == 'date' or field_type == 'datetime':
                        end = '99990101000000'

                # serialise numeric bounds the same way indexed values are
                # stored, so lexicographic comparison matches numeric order
                if field_type == 'float':
                    begin = _term_to_xapian_value(float(begin), field_type)
                    end = _term_to_xapian_value(float(end), field_type)
                elif field_type == 'integer':
                    begin = _term_to_xapian_value(int(begin), field_type)
                    end = _term_to_xapian_value(int(end), field_type)
                # returns on the first schema entry matching `field_name`
                return field_dict['column'], str(begin), str(end)
        # NOTE(review): implicitly returns None when `field_name` is not
        # in the schema -- confirm the Xapian query parser tolerates this.
|
|
|
|
|
|
|
|
|
2009-08-18 13:49:20 +00:00
|
|
|
class XHExpandDecider(xapian.ExpandDecider):
    def __call__(self, term):
        """
        Decide whether `term` may be used to expand the search query.

        Terms carrying the content-type prefix (i.e. terms that identify
        the Django model of a document) are never used for expansion.
        """
        decoded = term.decode('utf-8')
        return not decoded.startswith(TERM_PREFIXES[DJANGO_CT])
|
|
|
|
|
|
|
|
|
2011-05-09 04:21:14 +00:00
|
|
|
class XapianSearchBackend(BaseSearchBackend):
|
2009-06-18 16:15:13 +00:00
|
|
|
"""
|
|
|
|
`SearchBackend` defines the Xapian search backend for use with the Haystack
|
|
|
|
API for Django search.
|
2012-04-20 18:44:40 +00:00
|
|
|
|
2022-02-05 09:56:21 +00:00
|
|
|
It uses the Xapian Python bindings to interface with Xapian.
|
2012-04-20 18:44:40 +00:00
|
|
|
|
2011-05-09 04:21:14 +00:00
|
|
|
In order to use this backend, `PATH` must be included in the
|
|
|
|
`connection_options`. This should point to a location where you would your
|
2009-06-18 16:15:13 +00:00
|
|
|
indexes to reside.
|
|
|
|
"""
|
2011-07-17 15:50:35 +00:00
|
|
|
inmemory_db = None
|
|
|
|
|
2012-04-20 20:10:31 +00:00
|
|
|
def __init__(self, connection_alias, **connection_options):
|
2009-06-18 16:15:13 +00:00
|
|
|
"""
|
|
|
|
Instantiates an instance of `SearchBackend`.
|
2012-04-20 18:44:40 +00:00
|
|
|
|
2009-06-18 16:15:13 +00:00
|
|
|
Optional arguments:
|
2011-05-09 04:21:14 +00:00
|
|
|
`connection_alias` -- The name of the connection
|
|
|
|
`language` -- The stemming language (default = 'english')
|
|
|
|
`**connection_options` -- The various options needed to setup
|
|
|
|
the backend.
|
2012-04-20 18:44:40 +00:00
|
|
|
|
2011-05-09 04:21:14 +00:00
|
|
|
Also sets the stemming language to be used to `language`.
|
2009-06-18 16:15:13 +00:00
|
|
|
"""
|
2022-02-10 19:44:21 +00:00
|
|
|
self.use_lockfile = bool(
|
|
|
|
getattr(settings, 'HAYSTACK_XAPIAN_USE_LOCKFILE', True)
|
|
|
|
)
|
2021-08-09 18:09:22 +00:00
|
|
|
super().__init__(connection_alias, **connection_options)
|
2012-04-20 18:44:40 +00:00
|
|
|
|
2011-05-09 04:21:14 +00:00
|
|
|
if not 'PATH' in connection_options:
|
2014-05-10 19:47:23 +00:00
|
|
|
raise ImproperlyConfigured("You must specify a 'PATH' in your settings for connection '%s'."
|
|
|
|
% connection_alias)
|
2012-04-20 18:44:40 +00:00
|
|
|
|
2011-05-09 04:21:14 +00:00
|
|
|
self.path = connection_options.get('PATH')
|
2012-04-20 18:44:40 +00:00
|
|
|
|
2017-10-27 13:04:39 +00:00
|
|
|
if self.path != MEMORY_DB_NAME:
|
|
|
|
try:
|
|
|
|
os.makedirs(self.path)
|
2021-08-09 15:41:03 +00:00
|
|
|
except FileExistsError:
|
2017-10-27 13:04:39 +00:00
|
|
|
pass
|
2012-04-20 18:44:40 +00:00
|
|
|
|
2022-02-10 19:44:21 +00:00
|
|
|
if self.use_lockfile:
|
|
|
|
lockfile = Path(self.path) / "lockfile"
|
|
|
|
self.filelock = FileLock(lockfile)
|
|
|
|
|
2011-05-09 04:21:14 +00:00
|
|
|
self.flags = connection_options.get('FLAGS', DEFAULT_XAPIAN_FLAGS)
|
2011-08-22 16:14:47 +00:00
|
|
|
self.language = getattr(settings, 'HAYSTACK_XAPIAN_LANGUAGE', 'english')
|
2014-05-10 19:47:23 +00:00
|
|
|
|
2015-01-12 19:37:44 +00:00
|
|
|
stemming_strategy_string = getattr(settings, 'HAYSTACK_XAPIAN_STEMMING_STRATEGY', 'STEM_SOME')
|
|
|
|
self.stemming_strategy = getattr(xapian.QueryParser, stemming_strategy_string, xapian.QueryParser.STEM_SOME)
|
|
|
|
|
2014-05-11 05:48:55 +00:00
|
|
|
# these 4 attributes are caches populated in `build_schema`
|
|
|
|
# they are checked in `_update_cache`
|
2014-05-20 04:35:05 +00:00
|
|
|
# use property to retrieve them
|
|
|
|
self._fields = {}
|
|
|
|
self._schema = []
|
2009-12-08 18:18:17 +00:00
|
|
|
self._content_field_name = None
|
2014-05-11 05:48:55 +00:00
|
|
|
self._columns = {}
|
2012-04-20 18:44:40 +00:00
|
|
|
|
2014-05-11 05:48:55 +00:00
|
|
|
def _update_cache(self):
|
2014-05-20 04:35:05 +00:00
|
|
|
"""
|
|
|
|
To avoid build_schema every time, we cache
|
|
|
|
some values: they only change when a SearchIndex
|
|
|
|
changes, which typically restarts the Python.
|
|
|
|
"""
|
2014-05-10 19:49:16 +00:00
|
|
|
fields = connections[self.connection_alias].get_unified_index().all_searchfields()
|
|
|
|
if self._fields != fields:
|
|
|
|
self._fields = fields
|
|
|
|
self._content_field_name, self._schema = self.build_schema(self._fields)
|
|
|
|
|
2009-12-08 18:18:17 +00:00
|
|
|
@property
|
|
|
|
def schema(self):
|
2014-05-11 05:48:55 +00:00
|
|
|
self._update_cache()
|
2009-12-08 18:18:17 +00:00
|
|
|
return self._schema
|
|
|
|
|
|
|
|
@property
|
|
|
|
def content_field_name(self):
|
2014-05-11 05:48:55 +00:00
|
|
|
self._update_cache()
|
2009-12-08 18:18:17 +00:00
|
|
|
return self._content_field_name
|
2012-04-20 18:44:40 +00:00
|
|
|
|
2014-05-18 14:36:53 +00:00
|
|
|
@property
|
|
|
|
def column(self):
|
2014-05-11 05:48:55 +00:00
|
|
|
"""
|
|
|
|
Returns the column in the database of a given field name.
|
|
|
|
"""
|
|
|
|
self._update_cache()
|
2014-05-18 14:36:53 +00:00
|
|
|
return self._columns
|
2014-05-11 05:48:55 +00:00
|
|
|
|
2022-02-10 19:44:21 +00:00
|
|
|
    @filelocked
    def update(self, index, iterable, commit=True):
        """
        Updates the `index` with any objects in `iterable` by adding/updating
        the database as needed.

        Required arguments:
            `index` -- The `SearchIndex` to process
            `iterable` -- An iterable of model instances to index

        Optional arguments:
            `commit` -- ignored

        For each object in `iterable`, a document is created containing all
        of the terms extracted from `index.full_prepare(obj)` with field prefixes,
        and 'as-is' as needed. Also, if the field type is 'text' it will be
        stemmed and stored with the 'Z' prefix as well.

        eg. `content:Testing` ==> `testing, Ztest, ZXCONTENTtest, XCONTENTtest`

        Each document also contains an extra term in the format:

        `XCONTENTTYPE<app_name>.<model_name>`

        As well as a unique identifier in the format:

        `Q<app_name>.<model_name>.<pk>`

        eg.: foo.bar (pk=1) ==> `Qfoo.bar.1`, `XCONTENTTYPEfoo.bar`

        This is useful for querying for a specific document corresponding to
        a model instance.

        The document also contains a pickled version of the object itself and
        the document ID in the document data field.

        Finally, we also store field values to be used for sorting data. We
        store these in the document value slots (position zero is reserved
        for the document ID). All values are stored as unicode strings with
        conversion of float, int, double, values being done by Xapian itself
        through the use of the :method:xapian.sortable_serialise method.
        """
        database = self._database(writable=True)

        try:
            # one TermGenerator is shared by all documents in this batch
            term_generator = xapian.TermGenerator()
            term_generator.set_database(database)
            term_generator.set_stemmer(xapian.Stem(self.language))
            term_generator.set_stemming_strategy(self.stemming_strategy)
            if self.include_spelling is True:
                term_generator.set_flags(xapian.TermGenerator.FLAG_SPELLING)

            def _add_text(termpos, text, weight, prefix=''):
                """
                indexes text appending 2 extra terms
                to identify beginning and ending of the text.
                """
                term_generator.set_termpos(termpos)

                start_term = '%s^' % prefix
                end_term = '%s$' % prefix
                # add begin
                document.add_posting(start_term, termpos, weight)
                # add text
                term_generator.index_text(text, weight, prefix)
                termpos = term_generator.get_termpos()
                # add ending
                termpos += 1
                document.add_posting(end_term, termpos, weight)

                # increase termpos so that consecutive texts are kept
                # apart by TERMPOS_DISTANCE positions
                term_generator.set_termpos(termpos)
                term_generator.increase_termpos(TERMPOS_DISTANCE)

                return term_generator.get_termpos()

            def _add_literal_text(termpos, text, weight, prefix=''):
                """
                Adds sentence to the document with positional information
                but without processing.

                The sentence is bounded by "^" "$" to allow exact matches.
                """
                text = '^ %s $' % text
                for word in text.split():
                    term = '%s%s' % (prefix, word)
                    document.add_posting(term, termpos, weight)
                    termpos += 1
                termpos += TERMPOS_DISTANCE
                return termpos

            def add_text(termpos, prefix, text, weight):
                """
                Adds text to the document with positional information
                and processing (e.g. stemming).

                Indexes both the prefixed and unprefixed form, each in a
                processed (stemmed) and a literal variant.
                """
                termpos = _add_text(termpos, text, weight, prefix=prefix)
                termpos = _add_text(termpos, text, weight, prefix='')
                termpos = _add_literal_text(termpos, text, weight, prefix=prefix)
                termpos = _add_literal_text(termpos, text, weight, prefix='')
                return termpos

            def _get_ngram_lengths(value):
                # yields every (word, n) pair for n in
                # [NGRAM_MIN_LENGTH, NGRAM_MAX_LENGTH]
                values = value.split()
                for item in values:
                    for ngram_length in range(NGRAM_MIN_LENGTH, NGRAM_MAX_LENGTH + 1):
                        yield item, ngram_length

            for obj in iterable:
                document = xapian.Document()
                term_generator.set_document(document)

                # NOTE: the helpers below are (re)defined inside the loop
                # because they close over the current `document`.

                def ngram_terms(value):
                    # all substrings of each word with lengths between
                    # NGRAM_MIN_LENGTH and NGRAM_MAX_LENGTH
                    for item, length in _get_ngram_lengths(value):
                        item_length = len(item)
                        for start in range(0, item_length - length + 1):
                            for size in range(length, length + 1):
                                end = start + size
                                if end > item_length:
                                    continue
                                yield _to_xapian_term(item[start:end])

                def edge_ngram_terms(value):
                    # leading substrings (prefixes) of each word only
                    for item, length in _get_ngram_lengths(value):
                        yield _to_xapian_term(item[0:length])

                def add_edge_ngram_to_document(prefix, value, weight):
                    """
                    Splits the term in ngrams and adds each ngram to the index.
                    The minimum and maximum size of the ngram is respectively
                    NGRAM_MIN_LENGTH and NGRAM_MAX_LENGTH.
                    """
                    for term in edge_ngram_terms(value):
                        document.add_term(term, weight)
                        document.add_term(prefix + term, weight)

                def add_ngram_to_document(prefix, value, weight):
                    """
                    Splits the term in ngrams and adds each ngram to the index.
                    The minimum and maximum size of the ngram is respectively
                    NGRAM_MIN_LENGTH and NGRAM_MAX_LENGTH.
                    """
                    for term in ngram_terms(value):
                        document.add_term(term, weight)
                        document.add_term(prefix + term, weight)

                def add_non_text_to_document(prefix, term, weight):
                    """
                    Adds term to the document without positional information
                    and without processing.

                    If the term is alone, also adds it as "^<term>$"
                    to allow exact matches on single terms.
                    """
                    document.add_term(term, weight)
                    document.add_term(prefix + term, weight)

                def add_datetime_to_document(termpos, prefix, term, weight):
                    """
                    Adds a datetime to document with positional order
                    to allow exact matches on it.
                    """
                    # `term` is '<date> <time>' as produced by _to_xapian_term
                    date, time = term.split()
                    document.add_posting(date, termpos, weight)
                    termpos += 1
                    document.add_posting(time, termpos, weight)
                    termpos += 1
                    document.add_posting(prefix + date, termpos, weight)
                    termpos += 1
                    document.add_posting(prefix + time, termpos, weight)
                    termpos += TERMPOS_DISTANCE + 1
                    return termpos

                data = index.full_prepare(obj)
                weights = index.get_field_weights()

                termpos = term_generator.get_termpos()  # identifies the current position in the document.
                for field in self.schema:
                    if field['field_name'] not in list(data.keys()):
                        # not supported fields are ignored.
                        continue

                    if field['field_name'] in weights:
                        weight = int(weights[field['field_name']])
                    else:
                        weight = 1

                    value = data[field['field_name']]

                    if field['field_name'] in (ID, DJANGO_ID, DJANGO_CT):
                        # Private fields are indexed in a different way:
                        # `django_id` is an int and `django_ct` is text;
                        # besides, they are indexed by their (unstemmed) value.
                        if field['field_name'] == DJANGO_ID:
                            value = int(value)
                        value = _term_to_xapian_value(value, field['type'])

                        document.add_term(TERM_PREFIXES[field['field_name']] + value, weight)
                        document.add_value(field['column'], value)
                        continue
                    else:
                        prefix = TERM_PREFIXES['field'] + field['field_name'].upper()

                        # if not multi_valued, we add as a document value
                        # for sorting and facets
                        if field['multi_valued'] == 'false':
                            document.add_value(field['column'], _term_to_xapian_value(value, field['type']))
                        else:
                            for t in value:
                                # add the exact match of each value
                                term = _to_xapian_term(t)
                                termpos = add_text(termpos, prefix, term, weight)
                            continue

                        term = _to_xapian_term(value)
                        if term == '':
                            continue
                        # from here on the term is a string;
                        # we now decide how it is indexed

                        if field['type'] == 'text':
                            # text is indexed with positional information
                            termpos = add_text(termpos, prefix, term, weight)
                        elif field['type'] == 'datetime':
                            termpos = add_datetime_to_document(termpos, prefix, term, weight)
                        elif field['type'] == 'ngram':
                            add_ngram_to_document(prefix, value, weight)
                        elif field['type'] == 'edge_ngram':
                            add_edge_ngram_to_document(prefix, value, weight)
                        else:
                            # all other terms are added without positional information
                            add_non_text_to_document(prefix, term, weight)

                # store data without indexing it
                document.set_data(pickle.dumps(
                    (obj._meta.app_label, obj._meta.model_name, obj.pk, data),
                    pickle.HIGHEST_PROTOCOL
                ))

                # add the id of the document
                document_id = TERM_PREFIXES[ID] + get_identifier(obj)
                document.add_term(document_id)

                # finally, replace or add the document to the database
                database.replace_document(document_id, document)

        except UnicodeDecodeError:
            sys.stderr.write('Chunk failed.\n')
            pass

        finally:
            database.close()
|
2012-04-20 18:44:40 +00:00
|
|
|
|
2022-02-10 19:44:21 +00:00
|
|
|
@filelocked
|
2017-05-18 06:49:11 +00:00
|
|
|
def remove(self, obj, commit=True):
|
2009-06-18 16:15:13 +00:00
|
|
|
"""
|
|
|
|
Remove indexes for `obj` from the database.
|
2012-04-20 18:44:40 +00:00
|
|
|
|
2009-06-18 16:15:13 +00:00
|
|
|
We delete all instances of `Q<app_name>.<model_name>.<pk>` which
|
|
|
|
should be unique to this object.
|
2017-05-18 06:49:11 +00:00
|
|
|
|
|
|
|
Optional arguments:
|
|
|
|
`commit` -- ignored
|
2009-06-18 16:15:13 +00:00
|
|
|
"""
|
2009-07-31 15:39:58 +00:00
|
|
|
database = self._database(writable=True)
|
2017-01-10 21:03:46 +00:00
|
|
|
database.delete_document(TERM_PREFIXES[ID] + get_identifier(obj))
|
2012-04-20 19:57:56 +00:00
|
|
|
database.close()
|
2012-04-20 18:44:40 +00:00
|
|
|
|
2014-05-11 19:02:47 +00:00
|
|
|
def clear(self, models=(), commit=True):
|
2009-06-18 16:15:13 +00:00
|
|
|
"""
|
|
|
|
Clear all instances of `models` from the database or all models, if
|
|
|
|
not specified.
|
2012-04-20 18:44:40 +00:00
|
|
|
|
2009-06-18 16:15:13 +00:00
|
|
|
Optional Arguments:
|
|
|
|
`models` -- Models to clear from the database (default = [])
|
2012-04-20 18:44:40 +00:00
|
|
|
|
2009-06-18 16:15:13 +00:00
|
|
|
If `models` is empty, an empty query is executed which matches all
|
|
|
|
documents in the database. Afterwards, each match is deleted.
|
2012-04-20 18:44:40 +00:00
|
|
|
|
2009-06-18 16:15:13 +00:00
|
|
|
Otherwise, for each model, a `delete_document` call is issued with
|
|
|
|
the term `XCONTENTTYPE<app_name>.<model_name>`. This will delete
|
|
|
|
all documents with the specified model type.
|
|
|
|
"""
|
2009-06-16 18:48:11 +00:00
|
|
|
if not models:
|
2010-10-27 17:26:50 +00:00
|
|
|
# Because there does not appear to be a "clear all" method,
|
2011-05-09 04:21:14 +00:00
|
|
|
# it's much quicker to remove the contents of the `self.path`
|
2010-10-27 17:26:50 +00:00
|
|
|
# folder than it is to remove each document one at a time.
|
2011-05-09 04:21:14 +00:00
|
|
|
if os.path.exists(self.path):
|
|
|
|
shutil.rmtree(self.path)
|
2009-06-16 18:48:11 +00:00
|
|
|
else:
|
2012-06-05 16:57:24 +00:00
|
|
|
database = self._database(writable=True)
|
2009-06-16 18:48:11 +00:00
|
|
|
for model in models:
|
2017-01-10 21:03:46 +00:00
|
|
|
database.delete_document(TERM_PREFIXES[DJANGO_CT] + get_model_ct(model))
|
2012-06-05 16:57:24 +00:00
|
|
|
database.close()
|
2010-10-27 17:26:50 +00:00
|
|
|
|
|
|
|
def document_count(self):
|
|
|
|
try:
|
|
|
|
return self._database().get_doccount()
|
|
|
|
except InvalidIndexError:
|
|
|
|
return 0
|
|
|
|
|
2014-05-11 14:20:57 +00:00
|
|
|
def _build_models_query(self, query):
|
|
|
|
"""
|
|
|
|
Builds a query from `query` that filters to documents only from registered models.
|
|
|
|
"""
|
|
|
|
registered_models_ct = self.build_models_list()
|
|
|
|
if registered_models_ct:
|
2017-01-10 21:03:46 +00:00
|
|
|
restrictions = [xapian.Query('%s%s' % (TERM_PREFIXES[DJANGO_CT], model_ct))
|
2014-05-11 14:20:57 +00:00
|
|
|
for model_ct in registered_models_ct]
|
|
|
|
limit_query = xapian.Query(xapian.Query.OP_OR, restrictions)
|
|
|
|
|
|
|
|
query = xapian.Query(xapian.Query.OP_AND, query, limit_query)
|
|
|
|
|
|
|
|
return query
|
|
|
|
|
2014-05-17 21:43:29 +00:00
|
|
|
def _check_field_names(self, field_names):
|
|
|
|
"""
|
|
|
|
Raises InvalidIndexError if any of a field_name in field_names is
|
|
|
|
not indexed.
|
|
|
|
"""
|
|
|
|
if field_names:
|
|
|
|
for field_name in field_names:
|
|
|
|
try:
|
2014-05-18 14:36:53 +00:00
|
|
|
self.column[field_name]
|
2014-05-17 21:43:29 +00:00
|
|
|
except KeyError:
|
|
|
|
raise InvalidIndexError('Trying to use non indexed field "%s"' % field_name)
|
|
|
|
|
2009-10-08 18:42:58 +00:00
|
|
|
    @log_query
    def search(self, query, sort_by=None, start_offset=0, end_offset=None,
               fields='', highlight=False, facets=None, date_facets=None,
               query_facets=None, narrow_queries=None, spelling_query=None,
               limit_to_registered_models=None, result_class=None, **kwargs):
        """
        Executes the Xapian::query as defined in `query`.

        Required arguments:
            `query` -- Search query to execute

        Optional arguments:
            `sort_by` -- Sort results by specified field (default = None)
            `start_offset` -- Slice results from `start_offset` (default = 0)
            `end_offset` -- Slice results at `end_offset` (default = None), if None, then all documents
            `fields` -- Filter results on `fields` (default = '')
            `highlight` -- Highlight terms in results (default = False)
            `facets` -- Facet results on fields (default = None)
            `date_facets` -- Facet results on date ranges (default = None)
            `query_facets` -- Facet results on queries (default = None)
            `narrow_queries` -- Narrow queries (default = None)
            `spelling_query` -- An optional query to execute spelling suggestion on
            `limit_to_registered_models` -- Limit returned results to models registered in
            the current `SearchSite` (default = True)

        Returns:
            A dictionary with the following keys:
                `results` -- A list of `SearchResult`
                `hits` -- The total available results
                `facets` - A dictionary of facets with the following keys:
                    `fields` -- A list of field facets
                    `dates` -- A list of date facets
                    `queries` -- A list of query facets
            If faceting was not used, the `facets` key will not be present

        If `query` is None, returns no results.

        If `INCLUDE_SPELLING` was enabled in the connection options, the
        extra flag `FLAG_SPELLING_CORRECTION` will be passed to the query parser
        and any suggestions for spell correction will be returned as well as
        the results.
        """
        # An empty query matches nothing; short-circuit before touching
        # the database.
        if xapian.Query.empty(query):
            return {
                'results': [],
                'hits': 0,
            }

        # Fail early if a facet refers to a field that is not indexed.
        self._check_field_names(facets)
        self._check_field_names(date_facets)
        self._check_field_names(query_facets)

        database = self._database()

        if limit_to_registered_models is None:
            limit_to_registered_models = getattr(settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)

        if result_class is None:
            result_class = SearchResult

        # Spelling suggestion must be computed against the *original* query,
        # before narrowing/model filters are AND-ed in below.
        if self.include_spelling is True:
            spelling_suggestion = self._do_spelling_suggestion(database, query, spelling_query)
        else:
            spelling_suggestion = ''

        # AND in every narrow query (drill-down filters).
        if narrow_queries is not None:
            query = xapian.Query(
                xapian.Query.OP_AND, query, xapian.Query(
                    xapian.Query.OP_AND, [self.parse_query(narrow_query) for narrow_query in narrow_queries]
                )
            )

        # Restrict results to registered models, if requested.
        if limit_to_registered_models:
            query = self._build_models_query(query)

        enquire = xapian.Enquire(database)
        # Optional BM25 tuning via settings; the tuple is splatted into
        # xapian.BM25Weight's constructor.
        if hasattr(settings, 'HAYSTACK_XAPIAN_WEIGHTING_SCHEME'):
            enquire.set_weighting_scheme(xapian.BM25Weight(*settings.HAYSTACK_XAPIAN_WEIGHTING_SCHEME))
        enquire.set_query(query)

        if sort_by:
            _xapian_sort(enquire, sort_by, self.column)

        results = []
        facets_dict = {
            'fields': {},
            'dates': {},
            'queries': {},
        }

        # No explicit end: fetch everything after `start_offset`.
        if not end_offset:
            end_offset = database.get_doccount() - start_offset

        # Prepare match spies so single-valued field facets are counted
        # while the mset is built.
        if facets:
            facets_spies = self._prepare_facet_field_spies(facets)
            for spy in facets_spies:
                enquire.add_matchspy(spy)

        matches = self._get_enquire_mset(database, enquire, start_offset, end_offset)

        for match in matches:
            # Stored document data is a pickled (app_label, model_name, pk, fields) tuple.
            app_label, model_name, pk, model_data = pickle.loads(self._get_document_data(database, match.document))
            if highlight:
                model_data['highlighted'] = {
                    self.content_field_name: self._do_highlight(
                        model_data.get(self.content_field_name), query
                    )
                }
            results.append(
                result_class(app_label, model_name, pk, match.percent, **model_data)
            )

        if facets:
            # pick single valued facets from spies
            single_facets_dict = self._process_facet_field_spies(facets_spies)

            # pick multivalued valued facets from results
            multi_facets_dict = self._do_multivalued_field_facets(results, facets)

            # merge both results (http://stackoverflow.com/a/38990/931303)
            facets_dict['fields'] = dict(list(single_facets_dict.items()) + list(multi_facets_dict.items()))

        if date_facets:
            facets_dict['dates'] = self._do_date_facets(results, date_facets)

        if query_facets:
            facets_dict['queries'] = self._do_query_facets(results, query_facets)

        return {
            'results': results,
            'hits': self._get_hit_count(database, enquire),
            'facets': facets_dict,
            'spelling_suggestion': spelling_suggestion,
        }
|
2012-04-20 18:44:40 +00:00
|
|
|
|
2009-12-03 21:40:33 +00:00
|
|
|
    def more_like_this(self, model_instance, additional_query=None,
                       start_offset=0, end_offset=None,
                       limit_to_registered_models=True, result_class=None, **kwargs):
        """
        Given a model instance, returns a result set of similar documents.

        Required arguments:
            `model_instance` -- The model instance to use as a basis for
                                retrieving similar documents.

        Optional arguments:
            `additional_query` -- An additional query to narrow results
            `start_offset` -- The starting offset (default=0)
            `end_offset` -- The ending offset (default=None), if None, then all documents
            `limit_to_registered_models` -- Limit returned results to models registered in the search (default = True)

        Returns:
            A dictionary with the following keys:
                `results` -- A list of `SearchResult`
                `hits` -- The total available results

        Opens a database connection, then builds a simple query using the
        `model_instance` to build the unique identifier.

        For each document retrieved(should always be one), adds an entry into
        an RSet (relevance set) with the document id, then, uses the RSet
        to query for an ESet (A set of terms that can be used to suggest
        expansions to the original query), omitting any document that was in
        the original query.

        Finally, processes the resulting matches and returns.
        """
        database = self._database()

        if result_class is None:
            result_class = SearchResult

        # Look the instance's own document up by its unique identifier term.
        query = xapian.Query(TERM_PREFIXES[ID] + get_identifier(model_instance))

        enquire = xapian.Enquire(database)
        enquire.set_query(query)

        rset = xapian.RSet()

        if not end_offset:
            end_offset = database.get_doccount()

        # `match` doubles as a sentinel: it stays None when the instance's
        # document was never indexed.
        match = None
        for match in self._get_enquire_mset(database, enquire, 0, end_offset):
            rset.add_document(match.docid)

        if match is None:
            if not self.silently_fail:
                raise InvalidIndexError('Instance %s with id "%d" not indexed' %
                                        (get_identifier(model_instance), model_instance.id))
            else:
                return {'results': [],
                        'hits': 0}

        # Expand the relevance set into its most informative terms and
        # OR them together (elite set keeps only the best ones).
        query = xapian.Query(
            xapian.Query.OP_ELITE_SET,
            [expand.term for expand in enquire.get_eset(match.document.termlist_count(), rset, XHExpandDecider())],
            match.document.termlist_count()
        )
        # Exclude the source document itself from the similar results.
        query = xapian.Query(
            xapian.Query.OP_AND_NOT, [query, TERM_PREFIXES[ID] + get_identifier(model_instance)]
        )

        if limit_to_registered_models:
            query = self._build_models_query(query)

        if additional_query:
            query = xapian.Query(
                xapian.Query.OP_AND, query, additional_query
            )

        enquire.set_query(query)

        results = []
        matches = self._get_enquire_mset(database, enquire, start_offset, end_offset)

        for match in matches:
            # Stored document data is a pickled (app_label, model_name, pk, fields) tuple.
            app_label, model_name, pk, model_data = pickle.loads(self._get_document_data(database, match.document))
            results.append(
                result_class(app_label, model_name, pk, match.percent, **model_data)
            )

        return {
            'results': results,
            'hits': self._get_hit_count(database, enquire),
            'facets': {
                'fields': {},
                'dates': {},
                'queries': {},
            },
            'spelling_suggestion': None,
        }
|
2012-04-20 18:44:40 +00:00
|
|
|
|
2009-12-04 19:42:06 +00:00
|
|
|
def parse_query(self, query_string):
|
|
|
|
"""
|
|
|
|
Given a `query_string`, will attempt to return a xapian.Query
|
2012-04-20 18:44:40 +00:00
|
|
|
|
2009-12-04 19:42:06 +00:00
|
|
|
Required arguments:
|
|
|
|
``query_string`` -- A query string to parse
|
2012-04-20 18:44:40 +00:00
|
|
|
|
2009-12-04 19:42:06 +00:00
|
|
|
Returns a xapian.Query
|
|
|
|
"""
|
2009-12-04 20:41:32 +00:00
|
|
|
if query_string == '*':
|
2012-04-20 18:44:40 +00:00
|
|
|
return xapian.Query('') # Match everything
|
2009-12-04 20:41:32 +00:00
|
|
|
elif query_string == '':
|
2012-04-20 18:44:40 +00:00
|
|
|
return xapian.Query() # Match nothing
|
|
|
|
|
2009-12-04 19:42:06 +00:00
|
|
|
qp = xapian.QueryParser()
|
|
|
|
qp.set_database(self._database())
|
|
|
|
qp.set_stemmer(xapian.Stem(self.language))
|
2015-01-12 19:37:44 +00:00
|
|
|
qp.set_stemming_strategy(self.stemming_strategy)
|
2014-06-13 17:51:07 +00:00
|
|
|
qp.set_default_op(XAPIAN_OPTS[DEFAULT_OPERATOR])
|
2017-01-10 21:03:46 +00:00
|
|
|
qp.add_boolean_prefix(DJANGO_CT, TERM_PREFIXES[DJANGO_CT])
|
2010-02-09 19:04:53 +00:00
|
|
|
|
2009-12-04 19:42:06 +00:00
|
|
|
for field_dict in self.schema:
|
2014-05-14 19:34:39 +00:00
|
|
|
# since 'django_ct' has a boolean_prefix,
|
|
|
|
# we ignore it here.
|
2017-01-10 21:03:46 +00:00
|
|
|
if field_dict['field_name'] == DJANGO_CT:
|
2014-05-14 19:34:39 +00:00
|
|
|
continue
|
|
|
|
|
2009-12-04 19:42:06 +00:00
|
|
|
qp.add_prefix(
|
|
|
|
field_dict['field_name'],
|
2014-05-14 19:34:39 +00:00
|
|
|
TERM_PREFIXES['field'] + field_dict['field_name'].upper()
|
2009-12-04 19:42:06 +00:00
|
|
|
)
|
2012-04-20 18:44:40 +00:00
|
|
|
|
2009-12-04 19:42:06 +00:00
|
|
|
vrp = XHValueRangeProcessor(self)
|
|
|
|
qp.add_valuerangeprocessor(vrp)
|
2012-04-20 18:44:40 +00:00
|
|
|
|
2011-05-09 04:21:14 +00:00
|
|
|
return qp.parse_query(query_string, self.flags)
|
2012-04-20 18:44:40 +00:00
|
|
|
|
2009-08-16 15:40:47 +00:00
|
|
|
def build_schema(self, fields):
|
|
|
|
"""
|
|
|
|
Build the schema from fields.
|
2012-04-20 18:44:40 +00:00
|
|
|
|
2014-05-10 19:47:23 +00:00
|
|
|
:param fields: A list of fields in the index
|
|
|
|
:returns: list of dictionaries
|
2012-04-20 18:44:40 +00:00
|
|
|
|
2014-05-10 19:47:23 +00:00
|
|
|
Each dictionary has the keys
|
|
|
|
field_name: The name of the field index
|
|
|
|
type: what type of value it is
|
|
|
|
'multi_valued': if it allows more than one value
|
|
|
|
'column': a number identifying it
|
|
|
|
'type': the type of the field
|
|
|
|
'multi_valued': 'false', 'column': 0}
|
2009-08-16 15:40:47 +00:00
|
|
|
"""
|
|
|
|
content_field_name = ''
|
2011-08-09 02:54:58 +00:00
|
|
|
schema_fields = [
|
2014-05-14 19:34:39 +00:00
|
|
|
{'field_name': ID,
|
|
|
|
'type': 'text',
|
|
|
|
'multi_valued': 'false',
|
|
|
|
'column': 0},
|
|
|
|
{'field_name': DJANGO_ID,
|
2014-05-18 12:14:34 +00:00
|
|
|
'type': 'integer',
|
2014-05-14 19:34:39 +00:00
|
|
|
'multi_valued': 'false',
|
|
|
|
'column': 1},
|
|
|
|
{'field_name': DJANGO_CT,
|
|
|
|
'type': 'text',
|
|
|
|
'multi_valued': 'false',
|
|
|
|
'column': 2},
|
2011-08-09 02:54:58 +00:00
|
|
|
]
|
2014-05-11 05:48:55 +00:00
|
|
|
self._columns[ID] = 0
|
2014-05-14 19:34:39 +00:00
|
|
|
self._columns[DJANGO_ID] = 1
|
|
|
|
self._columns[DJANGO_CT] = 2
|
2014-05-11 05:48:55 +00:00
|
|
|
|
2011-08-09 02:54:58 +00:00
|
|
|
column = len(schema_fields)
|
2012-04-20 18:44:40 +00:00
|
|
|
|
2014-05-18 16:25:27 +00:00
|
|
|
for field_name, field_class in sorted(list(fields.items()), key=lambda n: n[0]):
|
2009-08-16 15:40:47 +00:00
|
|
|
if field_class.document is True:
|
2010-02-09 01:28:51 +00:00
|
|
|
content_field_name = field_class.index_fieldname
|
2012-04-20 18:44:40 +00:00
|
|
|
|
2009-08-16 15:40:47 +00:00
|
|
|
if field_class.indexed is True:
|
|
|
|
field_data = {
|
2010-02-09 01:28:51 +00:00
|
|
|
'field_name': field_class.index_fieldname,
|
2009-08-16 15:40:47 +00:00
|
|
|
'type': 'text',
|
|
|
|
'multi_valued': 'false',
|
|
|
|
'column': column,
|
|
|
|
}
|
2012-04-20 18:44:40 +00:00
|
|
|
|
2014-05-18 12:14:34 +00:00
|
|
|
if field_class.field_type == 'date':
|
2009-08-16 15:40:47 +00:00
|
|
|
field_data['type'] = 'date'
|
2014-05-18 12:14:34 +00:00
|
|
|
elif field_class.field_type == 'datetime':
|
|
|
|
field_data['type'] = 'datetime'
|
2010-11-10 17:46:04 +00:00
|
|
|
elif field_class.field_type == 'integer':
|
2014-05-18 12:14:34 +00:00
|
|
|
field_data['type'] = 'integer'
|
2010-11-10 17:46:04 +00:00
|
|
|
elif field_class.field_type == 'float':
|
2009-08-29 22:04:28 +00:00
|
|
|
field_data['type'] = 'float'
|
2010-11-10 17:46:04 +00:00
|
|
|
elif field_class.field_type == 'boolean':
|
2009-08-16 15:40:47 +00:00
|
|
|
field_data['type'] = 'boolean'
|
2014-09-25 03:08:31 +00:00
|
|
|
elif field_class.field_type == 'ngram':
|
|
|
|
field_data['type'] = 'ngram'
|
|
|
|
elif field_class.field_type == 'edge_ngram':
|
|
|
|
field_data['type'] = 'edge_ngram'
|
2012-04-20 18:44:40 +00:00
|
|
|
|
2010-11-10 17:46:04 +00:00
|
|
|
if field_class.is_multivalued:
|
2009-08-16 15:40:47 +00:00
|
|
|
field_data['multi_valued'] = 'true'
|
2012-04-20 18:44:40 +00:00
|
|
|
|
2009-08-16 15:40:47 +00:00
|
|
|
schema_fields.append(field_data)
|
2014-05-11 05:48:55 +00:00
|
|
|
self._columns[field_data['field_name']] = column
|
2009-08-16 15:40:47 +00:00
|
|
|
column += 1
|
2010-02-19 14:47:58 +00:00
|
|
|
|
2014-05-10 19:47:23 +00:00
|
|
|
return content_field_name, schema_fields
|
2012-04-20 18:44:40 +00:00
|
|
|
|
2014-05-10 19:47:23 +00:00
|
|
|
@staticmethod
|
|
|
|
def _do_highlight(content, query, tag='em'):
|
2009-07-27 19:12:20 +00:00
|
|
|
"""
|
2009-12-03 18:49:26 +00:00
|
|
|
Highlight `query` terms in `content` with html `tag`.
|
2012-04-20 18:44:40 +00:00
|
|
|
|
2009-07-27 19:12:20 +00:00
|
|
|
This method assumes that the input text (`content`) does not contain
|
|
|
|
any special formatting. That is, it does not contain any html tags
|
|
|
|
or similar markup that could be screwed up by the highlighting.
|
2012-04-20 18:44:40 +00:00
|
|
|
|
2009-07-27 19:12:20 +00:00
|
|
|
Required arguments:
|
|
|
|
`content` -- Content to search for instances of `text`
|
|
|
|
`text` -- The text to be highlighted
|
|
|
|
"""
|
2009-12-03 18:49:26 +00:00
|
|
|
for term in query:
|
2015-11-14 09:07:28 +00:00
|
|
|
term = term.decode('utf-8')
|
2012-04-20 18:44:40 +00:00
|
|
|
for match in re.findall('[^A-Z]+', term): # Ignore field identifiers
|
2009-12-03 18:49:26 +00:00
|
|
|
match_re = re.compile(match, re.I)
|
|
|
|
content = match_re.sub('<%s>%s</%s>' % (tag, term, tag), content)
|
2012-04-20 18:44:40 +00:00
|
|
|
|
2009-07-27 19:12:20 +00:00
|
|
|
return content
|
2012-04-20 18:44:40 +00:00
|
|
|
|
2014-05-17 21:18:22 +00:00
|
|
|
def _prepare_facet_field_spies(self, facets):
|
2009-06-18 16:15:13 +00:00
|
|
|
"""
|
2014-05-17 21:18:22 +00:00
|
|
|
Returns a list of spies based on the facets
|
|
|
|
used to count frequencies.
|
|
|
|
"""
|
|
|
|
spies = []
|
|
|
|
for facet in facets:
|
2014-05-18 14:36:53 +00:00
|
|
|
slot = self.column[facet]
|
2014-05-17 21:18:22 +00:00
|
|
|
spy = xapian.ValueCountMatchSpy(slot)
|
|
|
|
# add attribute "slot" to know which column this spy is targeting.
|
|
|
|
spy.slot = slot
|
|
|
|
spies.append(spy)
|
|
|
|
return spies
|
2012-04-20 18:44:40 +00:00
|
|
|
|
2014-05-17 21:18:22 +00:00
|
|
|
def _process_facet_field_spies(self, spies):
|
|
|
|
"""
|
|
|
|
Returns a dict of facet names with lists of
|
|
|
|
tuples of the form (term, term_frequency)
|
|
|
|
from a list of spies that observed the enquire.
|
|
|
|
"""
|
|
|
|
facet_dict = {}
|
|
|
|
for spy in spies:
|
|
|
|
field = self.schema[spy.slot]
|
2014-05-18 12:07:53 +00:00
|
|
|
field_name, field_type = field['field_name'], field['type']
|
|
|
|
|
2014-05-17 21:18:22 +00:00
|
|
|
facet_dict[field_name] = []
|
2014-05-18 16:25:27 +00:00
|
|
|
for facet in list(spy.values()):
|
2015-11-14 09:07:28 +00:00
|
|
|
if field_type == 'float':
|
|
|
|
# the float term is a Xapian serialized object, which is
|
|
|
|
# in bytes.
|
|
|
|
term = facet.term
|
|
|
|
else:
|
|
|
|
term = facet.term.decode('utf-8')
|
|
|
|
|
|
|
|
facet_dict[field_name].append((_from_xapian_value(term, field_type),
|
2014-05-18 12:07:53 +00:00
|
|
|
facet.termfreq))
|
2014-05-17 21:18:22 +00:00
|
|
|
return facet_dict
|
2012-04-20 18:44:40 +00:00
|
|
|
|
2014-05-17 21:18:22 +00:00
|
|
|
def _do_multivalued_field_facets(self, results, field_facets):
|
|
|
|
"""
|
|
|
|
Implements a multivalued field facet on the results.
|
|
|
|
|
|
|
|
This is implemented using brute force - O(N^2) -
|
|
|
|
because Xapian does not have it implemented yet
|
|
|
|
(see http://trac.xapian.org/ticket/199)
|
2009-06-18 16:15:13 +00:00
|
|
|
"""
|
2009-08-11 13:28:15 +00:00
|
|
|
facet_dict = {}
|
2012-04-20 18:44:40 +00:00
|
|
|
|
2009-08-11 13:28:15 +00:00
|
|
|
for field in field_facets:
|
|
|
|
facet_list = {}
|
2014-05-17 21:18:22 +00:00
|
|
|
if not self._multi_value_field(field):
|
|
|
|
continue
|
2012-04-20 18:44:40 +00:00
|
|
|
|
2009-08-11 13:28:15 +00:00
|
|
|
for result in results:
|
|
|
|
field_value = getattr(result, field)
|
2014-05-17 21:18:22 +00:00
|
|
|
for item in field_value: # Facet each item in a MultiValueField
|
|
|
|
facet_list[item] = facet_list.get(item, 0) + 1
|
2012-04-20 18:44:40 +00:00
|
|
|
|
2014-05-18 16:25:27 +00:00
|
|
|
facet_dict[field] = list(facet_list.items())
|
2009-08-11 13:28:15 +00:00
|
|
|
return facet_dict
|
2012-04-20 18:44:40 +00:00
|
|
|
|
2014-05-10 19:47:23 +00:00
|
|
|
@staticmethod
|
|
|
|
def _do_date_facets(results, date_facets):
|
2009-08-07 18:22:40 +00:00
|
|
|
"""
|
|
|
|
Private method that facets a document by date ranges
|
2012-04-20 18:44:40 +00:00
|
|
|
|
2009-08-07 18:22:40 +00:00
|
|
|
Required arguments:
|
2009-08-10 20:12:59 +00:00
|
|
|
`results` -- A list SearchResults to facet
|
2009-08-11 12:49:45 +00:00
|
|
|
`date_facets` -- A dictionary containing facet parameters:
|
2009-08-16 18:03:19 +00:00
|
|
|
{'field': {'start_date': ..., 'end_date': ...: 'gap_by': '...', 'gap_amount': n}}
|
|
|
|
nb., gap must be one of the following:
|
|
|
|
year|month|day|hour|minute|second
|
2012-04-20 18:44:40 +00:00
|
|
|
|
2009-08-10 20:12:59 +00:00
|
|
|
For each date facet field in `date_facets`, generates a list
|
2009-08-16 18:03:19 +00:00
|
|
|
of date ranges (from `start_date` to `end_date` by `gap_by`) then
|
2009-08-10 20:12:59 +00:00
|
|
|
iterates through `results` and tallies the count for each date_facet.
|
2012-04-20 18:44:40 +00:00
|
|
|
|
2009-08-10 20:12:59 +00:00
|
|
|
Returns a dictionary of date facets (fields) containing a list with
|
|
|
|
entries for each range and a count of documents matching the range.
|
2012-04-20 18:44:40 +00:00
|
|
|
|
2009-08-10 20:12:59 +00:00
|
|
|
eg. {
|
|
|
|
'pub_date': [
|
2015-11-11 22:38:30 +00:00
|
|
|
(datetime.datetime(2009, 1, 1, 0, 0), 5),
|
|
|
|
(datetime.datetime(2009, 2, 1, 0, 0), 0),
|
|
|
|
(datetime.datetime(2009, 3, 1, 0, 0), 0),
|
|
|
|
(datetime.datetime(2008, 4, 1, 0, 0), 1),
|
|
|
|
(datetime.datetime(2008, 5, 1, 0, 0), 2),
|
2009-08-10 20:12:59 +00:00
|
|
|
],
|
|
|
|
}
|
2009-08-07 18:22:40 +00:00
|
|
|
"""
|
2015-11-11 22:38:30 +00:00
|
|
|
def next_datetime(previous, gap_value, gap_type):
|
|
|
|
year = previous.year
|
|
|
|
month = previous.month
|
|
|
|
|
|
|
|
if gap_type == 'year':
|
|
|
|
next = previous.replace(year=year + gap_value)
|
|
|
|
elif gap_type == 'month':
|
|
|
|
if month + gap_value <= 12:
|
|
|
|
next = previous.replace(month=month + gap_value)
|
|
|
|
else:
|
|
|
|
next = previous.replace(
|
|
|
|
month=((month + gap_value) % 12),
|
2015-11-14 09:07:28 +00:00
|
|
|
year=(year + (month + gap_value) // 12)
|
2015-11-11 22:38:30 +00:00
|
|
|
)
|
|
|
|
elif gap_type == 'day':
|
|
|
|
next = previous + datetime.timedelta(days=gap_value)
|
|
|
|
elif gap_type == 'hour':
|
|
|
|
return previous + datetime.timedelta(hours=gap_value)
|
|
|
|
elif gap_type == 'minute':
|
|
|
|
next = previous + datetime.timedelta(minutes=gap_value)
|
|
|
|
elif gap_type == 'second':
|
|
|
|
next = previous + datetime.timedelta(seconds=gap_value)
|
|
|
|
else:
|
|
|
|
raise TypeError('\'gap_by\' must be '
|
|
|
|
'{second, minute, day, month, year}')
|
|
|
|
return next
|
|
|
|
|
2009-08-10 20:12:59 +00:00
|
|
|
facet_dict = {}
|
2012-04-20 18:44:40 +00:00
|
|
|
|
2014-05-18 16:25:27 +00:00
|
|
|
for date_facet, facet_params in list(date_facets.items()):
|
2009-08-16 18:03:19 +00:00
|
|
|
gap_type = facet_params.get('gap_by')
|
|
|
|
gap_value = facet_params.get('gap_amount', 1)
|
2009-08-10 20:12:59 +00:00
|
|
|
date_range = facet_params['start_date']
|
2015-11-11 22:38:30 +00:00
|
|
|
|
|
|
|
# construct the bins of the histogram
|
2009-08-10 20:12:59 +00:00
|
|
|
facet_list = []
|
2009-08-10 20:34:29 +00:00
|
|
|
while date_range < facet_params['end_date']:
|
2015-11-11 22:38:30 +00:00
|
|
|
facet_list.append((date_range, 0))
|
|
|
|
date_range = next_datetime(date_range, gap_value, gap_type)
|
2012-04-20 18:44:40 +00:00
|
|
|
|
2014-05-10 19:47:23 +00:00
|
|
|
facet_list = sorted(facet_list, key=lambda x: x[0], reverse=True)
|
2012-04-20 18:44:40 +00:00
|
|
|
|
2009-08-10 20:12:59 +00:00
|
|
|
for result in results:
|
|
|
|
result_date = getattr(result, date_facet)
|
2015-11-11 22:38:30 +00:00
|
|
|
|
|
|
|
# convert date to datetime
|
|
|
|
if not isinstance(result_date, datetime.datetime):
|
|
|
|
result_date = datetime.datetime(result_date.year,
|
|
|
|
result_date.month,
|
|
|
|
result_date.day)
|
|
|
|
|
|
|
|
# ignore results outside the boundaries.
|
|
|
|
if facet_list[0][0] < result_date < facet_list[-1][0]:
|
|
|
|
continue
|
|
|
|
|
|
|
|
# populate the histogram by putting the result on the right bin.
|
|
|
|
for n, facet_date in enumerate(facet_list):
|
|
|
|
if result_date > facet_date[0]:
|
|
|
|
# equal to facet_list[n][1] += 1, but for a tuple
|
|
|
|
facet_list[n] = (facet_list[n][0], (facet_list[n][1] + 1))
|
|
|
|
break # bin found; go to next result
|
2012-04-20 18:44:40 +00:00
|
|
|
|
2009-08-10 20:12:59 +00:00
|
|
|
facet_dict[date_facet] = facet_list
|
2012-04-20 18:44:40 +00:00
|
|
|
|
2009-08-10 20:12:59 +00:00
|
|
|
return facet_dict
|
2012-04-20 18:44:40 +00:00
|
|
|
|
2009-08-11 12:42:42 +00:00
|
|
|
def _do_query_facets(self, results, query_facets):
|
2009-08-11 12:49:45 +00:00
|
|
|
"""
|
|
|
|
Private method that facets a document by query
|
2012-04-20 18:44:40 +00:00
|
|
|
|
2009-08-11 12:49:45 +00:00
|
|
|
Required arguments:
|
|
|
|
`results` -- A list SearchResults to facet
|
|
|
|
`query_facets` -- A dictionary containing facet parameters:
|
|
|
|
{'field': 'query', [...]}
|
2012-04-20 18:44:40 +00:00
|
|
|
|
2009-08-11 12:49:45 +00:00
|
|
|
For each query in `query_facets`, generates a dictionary entry with
|
|
|
|
the field name as the key and a tuple with the query and result count
|
|
|
|
as the value.
|
2012-04-20 18:44:40 +00:00
|
|
|
|
2009-08-11 12:49:45 +00:00
|
|
|
eg. {'name': ('a*', 5)}
|
|
|
|
"""
|
2009-08-11 12:42:42 +00:00
|
|
|
facet_dict = {}
|
2014-05-18 16:25:27 +00:00
|
|
|
for field, query in list(dict(query_facets).items()):
|
2009-12-05 15:43:52 +00:00
|
|
|
facet_dict[field] = (query, self.search(self.parse_query(query))['hits'])
|
|
|
|
|
2009-08-11 12:42:42 +00:00
|
|
|
return facet_dict
|
2012-04-20 18:44:40 +00:00
|
|
|
|
2014-05-10 19:47:23 +00:00
|
|
|
@staticmethod
|
|
|
|
def _do_spelling_suggestion(database, query, spelling_query):
|
2009-12-03 17:20:04 +00:00
|
|
|
"""
|
|
|
|
Private method that returns a single spelling suggestion based on
|
|
|
|
`spelling_query` or `query`.
|
2012-04-20 18:44:40 +00:00
|
|
|
|
2009-12-03 17:20:04 +00:00
|
|
|
Required arguments:
|
|
|
|
`database` -- The database to check spelling against
|
|
|
|
`query` -- The query to check
|
|
|
|
`spelling_query` -- If not None, this will be checked instead of `query`
|
2012-04-20 18:44:40 +00:00
|
|
|
|
2009-12-03 17:20:04 +00:00
|
|
|
Returns a string with a suggested spelling
|
|
|
|
"""
|
|
|
|
if spelling_query:
|
|
|
|
if ' ' in spelling_query:
|
2015-11-14 09:07:28 +00:00
|
|
|
return ' '.join([database.get_spelling_suggestion(term).decode('utf-8') for term in spelling_query.split()])
|
2009-12-03 17:20:04 +00:00
|
|
|
else:
|
2015-11-14 09:07:28 +00:00
|
|
|
return database.get_spelling_suggestion(spelling_query).decode('utf-8')
|
2012-04-20 18:44:40 +00:00
|
|
|
|
2009-12-05 16:32:29 +00:00
|
|
|
term_set = set()
|
2009-12-03 18:49:26 +00:00
|
|
|
for term in query:
|
2015-11-14 09:07:28 +00:00
|
|
|
for match in re.findall('[^A-Z]+', term.decode('utf-8')): # Ignore field identifiers
|
|
|
|
term_set.add(database.get_spelling_suggestion(match).decode('utf-8'))
|
2012-04-20 18:44:40 +00:00
|
|
|
|
2009-12-05 16:32:29 +00:00
|
|
|
return ' '.join(term_set)
|
2012-04-20 18:44:40 +00:00
|
|
|
|
2009-07-31 01:18:31 +00:00
|
|
|
def _database(self, writable=False):
|
2009-07-21 17:11:33 +00:00
|
|
|
"""
|
2009-12-08 18:18:17 +00:00
|
|
|
Private method that returns a xapian.Database for use.
|
|
|
|
|
2009-07-31 01:18:31 +00:00
|
|
|
Optional arguments:
|
|
|
|
``writable`` -- Open the database in read/write mode (default=False)
|
2009-12-08 18:18:17 +00:00
|
|
|
|
2009-07-31 01:18:31 +00:00
|
|
|
Returns an instance of a xapian.Database or xapian.WritableDatabase
|
|
|
|
"""
|
2012-04-20 19:57:56 +00:00
|
|
|
if self.path == MEMORY_DB_NAME:
|
|
|
|
if not self.inmemory_db:
|
|
|
|
self.inmemory_db = xapian.inmemory_open()
|
|
|
|
return self.inmemory_db
|
2009-07-31 01:18:31 +00:00
|
|
|
if writable:
|
2011-05-09 04:21:14 +00:00
|
|
|
database = xapian.WritableDatabase(self.path, xapian.DB_CREATE_OR_OPEN)
|
2009-07-31 01:18:31 +00:00
|
|
|
else:
|
2009-12-08 14:56:57 +00:00
|
|
|
try:
|
2011-05-09 04:21:14 +00:00
|
|
|
database = xapian.Database(self.path)
|
2009-12-08 14:56:57 +00:00
|
|
|
except xapian.DatabaseOpeningError:
|
2014-05-11 15:50:48 +00:00
|
|
|
raise InvalidIndexError('Unable to open index at %s' % self.path)
|
2009-12-08 18:18:17 +00:00
|
|
|
|
2009-07-31 01:18:31 +00:00
|
|
|
return database
|
2009-12-08 18:18:17 +00:00
|
|
|
|
2014-05-10 19:47:23 +00:00
|
|
|
    @staticmethod
    def _get_enquire_mset(database, enquire, start_offset, end_offset, checkatleast=DEFAULT_CHECK_AT_LEAST):
        """
        A safer version of Xapian.enquire.get_mset

        Simply wraps the Xapian version and catches any `Xapian.DatabaseModifiedError`,
        attempting a `database.reopen` as needed.

        Required arguments:
            `database` -- The database to be read
            `enquire` -- An instance of an Xapian.enquire object
            `start_offset` -- The start offset to pass to `enquire.get_mset`
            `end_offset` -- The end offset to pass to `enquire.get_mset`
        """
        try:
            return enquire.get_mset(start_offset, end_offset, checkatleast)
        except xapian.DatabaseModifiedError:
            # The index changed under us; reopen the database and retry once.
            database.reopen()
            return enquire.get_mset(start_offset, end_offset, checkatleast)
|
2010-02-06 15:45:26 +00:00
|
|
|
|
2014-05-10 19:47:23 +00:00
|
|
|
@staticmethod
|
|
|
|
def _get_document_data(database, document):
|
2010-02-19 14:47:58 +00:00
|
|
|
"""
|
|
|
|
A safer version of Xapian.document.get_data
|
|
|
|
|
|
|
|
Simply wraps the Xapian version and catches any `Xapian.DatabaseModifiedError`,
|
|
|
|
attempting a `database.reopen` as needed.
|
|
|
|
|
|
|
|
Required arguments:
|
|
|
|
`database` -- The database to be read
|
|
|
|
`document` -- An instance of an Xapian.document object
|
|
|
|
"""
|
|
|
|
try:
|
|
|
|
return document.get_data()
|
|
|
|
except xapian.DatabaseModifiedError:
|
|
|
|
database.reopen()
|
|
|
|
return document.get_data()
|
|
|
|
|
2010-11-10 18:55:34 +00:00
|
|
|
def _get_hit_count(self, database, enquire):
|
|
|
|
"""
|
|
|
|
Given a database and enquire instance, returns the estimated number
|
|
|
|
of matches.
|
2012-04-20 18:44:40 +00:00
|
|
|
|
2010-11-10 18:55:34 +00:00
|
|
|
Required arguments:
|
|
|
|
`database` -- The database to be queried
|
|
|
|
`enquire` -- The enquire instance
|
|
|
|
"""
|
|
|
|
return self._get_enquire_mset(
|
|
|
|
database, enquire, 0, database.get_doccount()
|
2010-11-10 20:36:02 +00:00
|
|
|
).size()
|
2010-11-10 18:55:34 +00:00
|
|
|
|
2009-09-16 18:07:24 +00:00
|
|
|
def _multi_value_field(self, field):
|
|
|
|
"""
|
|
|
|
Private method that returns `True` if a field is multi-valued, else
|
|
|
|
`False`.
|
2012-04-20 18:44:40 +00:00
|
|
|
|
2009-09-16 18:07:24 +00:00
|
|
|
Required arguemnts:
|
|
|
|
`field` -- The field to lookup
|
2012-04-20 18:44:40 +00:00
|
|
|
|
2009-09-16 18:07:24 +00:00
|
|
|
Returns a boolean value indicating whether the field is multi-valued.
|
|
|
|
"""
|
|
|
|
for field_dict in self.schema:
|
|
|
|
if field_dict['field_name'] == field:
|
|
|
|
return field_dict['multi_valued'] == 'true'
|
|
|
|
return False
|
|
|
|
|
2009-07-31 15:39:58 +00:00
|
|
|
|
2011-05-09 04:21:14 +00:00
|
|
|
class XapianSearchQuery(BaseSearchQuery):
    """
    This class is the Xapian specific version of the SearchQuery class.

    It acts as an intermediary between the ``SearchQuerySet`` and the
    ``SearchBackend`` itself.
    """
    def build_params(self, *args, **kwargs):
        # Translate haystack's absolute [start_offset, end_offset) slice into
        # the relative result count the Xapian backend expects.
        kwargs = super().build_params(*args, **kwargs)

        if self.end_offset is not None:
            kwargs['end_offset'] = self.end_offset - self.start_offset

        return kwargs

    def build_query(self):
        # Base query: match-all when there are no filters, otherwise the
        # query tree built from the filter expression.
        if not self.query_filter:
            query = xapian.Query('')
        else:
            query = self._query_from_search_node(self.query_filter)

        if self.models:
            # Restrict to the requested models: AND the query with an OR of
            # content-type terms, each scaled to weight 0 so model filtering
            # never affects ranking.
            subqueries = [
                xapian.Query(
                    xapian.Query.OP_SCALE_WEIGHT,
                    xapian.Query('%s%s' % (TERM_PREFIXES[DJANGO_CT], get_model_ct(model))),
                    0  # Pure boolean sub-query
                ) for model in self.models
            ]
            query = xapian.Query(
                xapian.Query.OP_AND, query,
                xapian.Query(xapian.Query.OP_OR, subqueries)
            )

        if self.boost:
            # OP_AND_MAYBE leaves the match set unchanged but lets the
            # boosted terms raise the weight of documents that contain them.
            subqueries = [
                xapian.Query(
                    xapian.Query.OP_SCALE_WEIGHT,
                    self._term_query(term, None, None), value
                ) for term, value in list(self.boost.items())
            ]
            query = xapian.Query(
                xapian.Query.OP_AND_MAYBE, query,
                xapian.Query(xapian.Query.OP_OR, subqueries)
            )

        return query

    def _query_from_search_node(self, search_node, is_not=False):
        # Recursively converts a haystack SearchNode tree into a single
        # xapian.Query, combining children with the node's connector.
        query_list = []

        for child in search_node.children:
            if isinstance(child, SearchNode):
                query_list.append(
                    self._query_from_search_node(child, child.negated)
                )
            else:
                # Leaf: a ("field__filter", term) pair.
                expression, term = child
                field_name, filter_type = search_node.split_expression(expression)

                constructed_query_list = self._query_from_term(term, field_name, filter_type, is_not)
                query_list.extend(constructed_query_list)

        if search_node.connector == 'OR':
            return xapian.Query(xapian.Query.OP_OR, query_list)
        else:
            return xapian.Query(xapian.Query.OP_AND, query_list)

    def _query_from_term(self, term, field_name, filter_type, is_not):
        """
        Uses arguments to construct a list of xapian.Query's.
        """
        if field_name != 'content' and field_name not in self.backend.column:
            raise InvalidIndexError('field "%s" not indexed' % field_name)

        # If it is an AutoQuery, it has no filters
        # or others, thus we short-circuit the procedure.
        if isinstance(term, AutoQuery):
            if field_name != 'content':
                query = '%s:%s' % (field_name, term.prepare(self))
            else:
                query = term.prepare(self)
            return [self.backend.parse_query(query)]
        query_list = []

        # Handle `ValuesListQuerySet`.
        if hasattr(term, 'values_list'):
            term = list(term)

        if field_name == 'content':
            # content is the generic search:
            # force no field_name search
            # and the field_type to be 'text'.
            field_name = None
            field_type = 'text'

            # we don't know what is the type(term), so we parse it.
            # Ideally this would not be required, but
            # some filters currently depend on the term to make decisions.
            term = _to_xapian_term(term)

            query_list.append(self._filter_contains(term, field_name, field_type, is_not))
            # when filter has no filter_type, haystack uses
            # filter_type = 'content'. Here we remove it
            # since the above query is already doing this
            if filter_type == 'content':
                filter_type = None
        else:
            # get the field_type from the backend
            field_type = self.backend.schema[self.backend.column[field_name]]['type']

        # private fields don't accept 'contains' or 'startswith'
        # since they have no meaning.
        if filter_type in ('contains', 'startswith') and field_name in (ID, DJANGO_ID, DJANGO_CT):
            filter_type = 'exact'

        if field_type == 'text':
            # we don't know what type "term" is, but we know we are searching as text
            # so we parse it like that.
            # Ideally this would not be required since _term_query does it, but
            # some filters currently depend on the term to make decisions.
            if isinstance(term, list):
                term = [_to_xapian_term(term) for term in term]
            else:
                term = _to_xapian_term(term)

        # todo: we should check that the filter is valid for this field_type or raise InvalidIndexError
        if filter_type == 'contains':
            query_list.append(self._filter_contains(term, field_name, field_type, is_not))
        elif filter_type in ('content', 'exact'):
            query_list.append(self._filter_exact(term, field_name, field_type, is_not))
        elif filter_type == 'in':
            query_list.append(self._filter_in(term, field_name, field_type, is_not))
        elif filter_type == 'startswith':
            query_list.append(self._filter_startswith(term, field_name, field_type, is_not))
        elif filter_type == 'endswith':
            raise NotImplementedError("The Xapian search backend doesn't support endswith queries.")
        elif filter_type == 'gt':
            query_list.append(self._filter_gt(term, field_name, field_type, is_not))
        elif filter_type == 'gte':
            query_list.append(self._filter_gte(term, field_name, field_type, is_not))
        elif filter_type == 'lt':
            query_list.append(self._filter_lt(term, field_name, field_type, is_not))
        elif filter_type == 'lte':
            query_list.append(self._filter_lte(term, field_name, field_type, is_not))
        elif filter_type == 'range':
            query_list.append(self._filter_range(term, field_name, field_type, is_not))
        return query_list

    def _all_query(self):
        """
        Returns a match all query.
        """
        return xapian.Query('')

    def _filter_contains(self, term, field_name, field_type, is_not):
        """
        Splits the sentence in terms and join them with OR,
        using stemmed and un-stemmed.

        Assumes term is not a list.
        """
        if field_type == 'text':
            term_list = term.split()
        else:
            term_list = [term]

        query = self._or_query(term_list, field_name, field_type)
        if is_not:
            # Negation is expressed as (match-all AND NOT query).
            return xapian.Query(xapian.Query.OP_AND_NOT, self._all_query(), query)
        else:
            return query

    def _filter_in(self, term_list, field_name, field_type, is_not):
        """
        Returns a query that matches exactly ANY term in term_list.

        Notice that:
            A in {B,C} <=> (A = B or A = C)
            ~(A in {B,C}) <=> ~(A = B or A = C)
        Because OP_AND_NOT(C, D) <=> (C and ~D), then D=(A in {B,C}) requires `is_not=False`.

        Assumes term is a list.
        """
        # Each member is matched exactly; negation is applied once, around
        # the whole OR (see docstring).
        query_list = [self._filter_exact(term, field_name, field_type, is_not=False)
                      for term in term_list]

        if is_not:
            return xapian.Query(xapian.Query.OP_AND_NOT, self._all_query(),
                                xapian.Query(xapian.Query.OP_OR, query_list))
        else:
            return xapian.Query(xapian.Query.OP_OR, query_list)

    def _filter_exact(self, term, field_name, field_type, is_not):
        """
        Returns a query that matches exactly the un-stemmed term
        with positional order.

        Assumes term is not a list.
        """
        if field_type == 'text' and field_name not in (DJANGO_CT,):
            # '^'/'$' are the sentinel terms indexed at the start/end of a
            # text value, so this phrase matches the whole value exactly.
            term = '^ %s $' % term
            query = self._phrase_query(term.split(), field_name, field_type)
        else:
            query = self._term_query(term, field_name, field_type, stemmed=False)

        if is_not:
            return xapian.Query(xapian.Query.OP_AND_NOT, self._all_query(), query)
        else:
            return query

    def _filter_startswith(self, term, field_name, field_type, is_not):
        """
        Returns a startswith query on the un-stemmed term.

        Assumes term is not a list.
        """
        if field_type == 'text':
            if len(term.split()) == 1:
                # Single word: anchor at the value start and use a wildcard.
                term = '^ %s*' % term
                query = self.backend.parse_query(term)
            else:
                # Multiple words: anchor the phrase at the value start.
                term = '^ %s' % term
                query = self._phrase_query(term.split(), field_name, field_type)
        else:
            term = '^%s*' % term
            query = self.backend.parse_query(term)

        if is_not:
            return xapian.Query(xapian.Query.OP_AND_NOT, self._all_query(), query)
        return query

    def _or_query(self, term_list, field, field_type):
        """
        Joins each item of term_list decorated by _term_query with an OR.
        """
        term_list = [self._term_query(term, field, field_type) for term in term_list]
        return xapian.Query(xapian.Query.OP_OR, term_list)

    def _phrase_query(self, term_list, field_name, field_type):
        """
        Returns a query that matches exact terms with
        positional order (i.e. ["this", "thing"] != ["thing", "this"])
        and no stem.

        If `field_name` is not `None`, restrict to the field.
        """
        term_list = [self._term_query(term, field_name, field_type,
                                      stemmed=False) for term in term_list]

        query = xapian.Query(xapian.Query.OP_PHRASE, term_list)
        return query

    def _term_query(self, term, field_name, field_type, stemmed=True):
        """
        Constructs a query of a single term.

        If `field_name` is not `None`, the term is search on that field only.
        If `stemmed` is False, only the un-stemmed form of the term is matched;
        otherwise the stemmed and un-stemmed forms are OR-ed together.
        """
        constructor = '{prefix}{term}'

        # construct the prefix to be used.
        prefix = ''
        if field_name:
            prefix = TERM_PREFIXES['field'] + field_name.upper()
            term = _to_xapian_term(term)

        if field_name in (ID, DJANGO_ID, DJANGO_CT):
            # to ensure the value is serialized correctly.
            if field_name == DJANGO_ID:
                term = int(term)
            term = _term_to_xapian_value(term, field_type)
            return xapian.Query('%s%s' % (TERM_PREFIXES[field_name], term))

        # we construct the query dates in a slightly different way
        if field_type == 'datetime':
            # The serialized datetime is "<date> <time>"; AND_MAYBE ranks
            # documents matching the full timestamp above date-only matches.
            date, time = term.split()
            return xapian.Query(xapian.Query.OP_AND_MAYBE,
                                constructor.format(prefix=prefix, term=date),
                                constructor.format(prefix=prefix, term=time)
                                )

        # only use stem if field is text or "None"
        if field_type not in ('text', None):
            stemmed = False

        unstemmed_term = constructor.format(prefix=prefix, term=term)
        if stemmed:
            stem = xapian.Stem(self.backend.language)
            # 'Z' is xapian's conventional prefix for stemmed terms.
            stemmed_term = 'Z' + constructor.format(prefix=prefix, term=stem(term).decode('utf-8'))

            return xapian.Query(xapian.Query.OP_OR,
                                xapian.Query(stemmed_term),
                                xapian.Query(unstemmed_term)
                                )
        else:
            return xapian.Query(unstemmed_term)

    def _filter_gt(self, term, field_name, field_type, is_not):
        # gt == NOT lte; implemented by flipping the negation flag.
        return self._filter_lte(term, field_name, field_type, is_not=not is_not)

    def _filter_lt(self, term, field_name, field_type, is_not):
        # lt == NOT gte; implemented by flipping the negation flag.
        return self._filter_gte(term, field_name, field_type, is_not=not is_not)

    def _filter_gte(self, term, field_name, field_type, is_not):
        """
        Private method that returns a xapian.Query that searches for any term
        that is greater than `term` in a specified `field`.
        """
        vrp = XHValueRangeProcessor(self.backend)
        # '*' as the upper bound means "no upper limit".
        pos, begin, end = vrp('%s:%s' % (field_name, _term_to_xapian_value(term, field_type)), '*')
        if is_not:
            return xapian.Query(xapian.Query.OP_AND_NOT,
                                self._all_query(),
                                xapian.Query(xapian.Query.OP_VALUE_RANGE, pos, begin, end)
                                )
        return xapian.Query(xapian.Query.OP_VALUE_RANGE, pos, begin, end)

    def _filter_lte(self, term, field_name, field_type, is_not):
        """
        Private method that returns a xapian.Query that searches for any term
        that is less than `term` in a specified `field`.
        """
        vrp = XHValueRangeProcessor(self.backend)
        # Empty lower bound means "no lower limit".
        pos, begin, end = vrp('%s:' % field_name, '%s' % _term_to_xapian_value(term, field_type))
        if is_not:
            return xapian.Query(xapian.Query.OP_AND_NOT,
                                self._all_query(),
                                xapian.Query(xapian.Query.OP_VALUE_RANGE, pos, begin, end)
                                )
        return xapian.Query(xapian.Query.OP_VALUE_RANGE, pos, begin, end)

    def _filter_range(self, term, field_name, field_type, is_not):
        """
        Private method that returns a xapian.Query that searches for any term
        that is between the values from the `term` list.
        """
        vrp = XHValueRangeProcessor(self.backend)
        pos, begin, end = vrp('%s:%s' % (field_name, _term_to_xapian_value(term[0], field_type)),
                              '%s' % _term_to_xapian_value(term[1], field_type))
        if is_not:
            return xapian.Query(xapian.Query.OP_AND_NOT,
                                self._all_query(),
                                xapian.Query(xapian.Query.OP_VALUE_RANGE, pos, begin, end)
                                )
        return xapian.Query(xapian.Query.OP_VALUE_RANGE, pos, begin, end)
|
|
|
|
|
2009-11-28 23:55:11 +00:00
|
|
|
|
2014-05-18 12:07:53 +00:00
|
|
|
def _term_to_xapian_value(term, field_type):
    """
    Converts a term to a serialized
    Xapian value based on the field_type.
    """
    assert field_type in FIELD_TYPES

    def strf(dt):
        """
        Equivalent to dt.strftime(DATETIME_FORMAT)
        but accepts years below 1900 (see http://stackoverflow.com/q/10263956/931303)
        """
        return '%04d%02d%02d%02d%02d%02d' % (
            dt.year, dt.month, dt.day, dt.hour, dt.minute, dt.second)

    if field_type == 'boolean':
        # Booleans are stored as single characters 't'/'f'.
        assert isinstance(term, bool)
        if term:
            value = 't'
        else:
            value = 'f'

    elif field_type == 'integer':
        # Zero-padded fixed width so lexicographic order equals numeric order.
        value = INTEGER_FORMAT % term
    elif field_type == 'float':
        # xapian provides an order-preserving serialization for floats.
        value = xapian.sortable_serialise(term)
    elif field_type == 'date' or field_type == 'datetime':
        if field_type == 'date':
            # Promote the date to midnight so both types share one format.
            # http://stackoverflow.com/a/1937636/931303 and comments
            term = datetime.datetime.combine(term, datetime.time())
        value = strf(term)
    else:  # field_type == 'text'
        value = _to_xapian_term(term)

    return value
|
|
|
|
|
|
|
|
|
2014-05-18 12:07:53 +00:00
|
|
|
def _to_xapian_term(term):
    """
    Converts a Python type to a
    Xapian term that can be indexed.
    """
    # Terms are indexed in lower case; over-long terms are shortened when a
    # long-term handling method is configured.
    normalized = str(term).lower()
    return _ensure_term_length(normalized) if LONG_TERM_METHOD else normalized
|
2009-12-03 14:38:49 +00:00
|
|
|
|
2018-12-19 04:15:49 +00:00
|
|
|
def _ensure_term_length(text):
    """
    Ensures that terms are not too long, this helps protect against long urls
    and CJK terms which are not tokenised by Xapian (and so are unsupported)
    """
    # Text must operate on bytes, not unicode, because xapian's term limit is
    # a byte restriction length, not a char limit length.
    text = text.encode('utf8')

    # Iterate matches in reverse so the earlier match offsets remain valid
    # while replacements are spliced into `text`.
    for match in reversed(list(LONG_TERM.finditer(text))):
        hole = text[match.start():match.end()]
        # There are two options available in xapian's omega project. We re-create
        # these two options here using python code.
        if LONG_TERM_METHOD == 'truncate':
            hole = hole[:LONG_TERM_LENGTH]
        elif LONG_TERM_METHOD == 'hash':
            from hashlib import sha224
            # Bug fix: `hole` is already bytes, so the previous
            # `hole.encode('utf8')` raised AttributeError, and the str
            # hexdigest could not be concatenated with bytes below.
            # Hash the bytes directly and keep the digest as bytes.
            hole = sha224(hole).hexdigest().encode('utf8')
        text = text[:match.start()] + hole + text[match.end():]

    # We ignore any errors because truncate may have chopped a unicode in half.
    return text.decode('utf8', 'ignore')
|
2009-12-03 14:38:49 +00:00
|
|
|
|
2014-05-18 12:07:53 +00:00
|
|
|
def _from_xapian_value(value, field_type):
    """
    Converts a serialized Xapian value
    to Python equivalent based on the field_type.

    Doesn't accept multivalued fields.

    Raises InvalidIndexError when a boolean value is neither 't' nor 'f'.
    """
    assert field_type in FIELD_TYPES
    if field_type == 'boolean':
        if value == 't':
            return True
        elif value == 'f':
            return False
        else:
            # Bug fix: the exception was previously constructed but never
            # raised, and its message used '%d' on a string field_type
            # (which would itself raise TypeError).
            raise InvalidIndexError('Field type "%s" does not accept value "%s"' % (field_type, value))
    elif field_type == 'integer':
        return int(value)
    elif field_type == 'float':
        return xapian.sortable_unserialise(value)
    elif field_type == 'date' or field_type == 'datetime':
        datetime_value = datetime.datetime.strptime(value, DATETIME_FORMAT)
        if field_type == 'datetime':
            return datetime_value
        else:
            # dates are stored as midnight datetimes; strip the time part.
            return datetime_value.date()
    else:  # field_type == 'text'
        return value
|
|
|
|
|
|
|
|
|
2015-11-13 20:03:53 +00:00
|
|
|
def _xapian_sort(enquire, sort_by, column):
    """
    Configures `enquire` to sort results by the value slots named in
    `sort_by` (mapped through `column`), breaking ties by relevance.
    """
    key_maker = xapian.MultiValueKeyMaker()

    for field in sort_by:
        wants_descending = field.startswith('-')
        if wants_descending:
            field = field[1:]  # drop the leading '-'
        # NOTE(review): the reverse flag handed to add_value is the inverse
        # of the '-' prefix ('-field' -> False, plain -> True). This mirrors
        # the original behavior exactly — presumably compensating for
        # xapian's key ordering; confirm against the xapian sorter docs.
        key_maker.add_value(column[field], not wants_descending)

    enquire.set_sort_by_key_then_relevance(key_maker, True)
|
|
|
|
|
|
|
|
|
2011-05-09 04:21:14 +00:00
|
|
|
class XapianEngine(BaseEngine):
    """
    Haystack engine entry point for Xapian: wires the Xapian-specific
    backend and query classes into haystack's connection machinery.
    """
    backend = XapianSearchBackend
    query = XapianSearchQuery
|