98 lines
2.4 KiB
Python

import os.path
import shutil
from typing import Any, Optional

from django.conf import settings
from whoosh.analysis import StemmingAnalyzer, CharsetFilter
from whoosh.fields import Schema, TEXT, KEYWORD, NUMERIC
from whoosh.index import create_in, open_dir, EmptyIndexError, Index
from whoosh.qparser import QueryParser
from whoosh.query import Term, NullQuery, FuzzyTerm
from whoosh.support.charset import accent_map
# Analyzer for the full-text field: whoosh's StemmingAnalyzer piped through a
# CharsetFilter built from accent_map (per the whoosh docs, this folds accented
# characters to their unaccented equivalents).
stemming_analyzer = StemmingAnalyzer() | CharsetFilter(accent_map)
# Index schema. Only fields declared with stored=True can be read back from a
# search hit; `tag` and `text` are indexed but not stored.
schema = Schema(
    # Unique numeric key of the work; update_document() uses unique fields to
    # replace an existing document.
    work=NUMERIC(stored=True, unique=True),
    # Numeric collection value; search() filters on this field.
    collection=NUMERIC(stored=True),
    name=TEXT(stored=True),
    composer=TEXT(stored=True),
    edition=TEXT(stored=True),
    # Keyword field configured with commas=True (comma-separated keywords).
    tag=KEYWORD(commas=True),
    # Default field queried by search(); analyzed with stemming_analyzer.
    text=TEXT(analyzer=stemming_analyzer),
)
# Filesystem directory holding the index, taken from Django settings.
index_path = settings.WHOOSH_INDEX
def create_index() -> Index:
    """Create a new index in ``index_path`` using the module-level ``schema``.

    The index directory is created first when it does not exist yet,
    including any missing parent directories.

    Returns:
        The index object returned by whoosh's ``create_in``.
    """
    # makedirs(exist_ok=True) replaces the former exists()/mkdir() pair: it
    # also creates missing parents and does not fail when another process
    # creates the directory between the check and the mkdir call.
    os.makedirs(index_path, exist_ok=True)
    return create_in(index_path, schema)
def get_index() -> Index:
    """Return the index stored in ``index_path``, creating it when absent.

    Opening is attempted first; if whoosh reports ``EmptyIndexError`` a new
    index is created via ``create_index()`` and returned instead.
    """
    try:
        existing = open_dir(index_path)
    except EmptyIndexError:
        # No usable index in the directory yet: build a fresh one.
        return create_index()
    return existing
def reset_index() -> Index:
    """Delete the index directory and return a newly created index.

    A missing index directory is not an error: resetting before any index
    has been created simply creates one.

    Returns:
        The index returned by ``create_index()``.
    """
    try:
        shutil.rmtree(index_path)
    except FileNotFoundError:
        # Nothing to remove; other failures (e.g. permissions) still propagate.
        pass
    return create_index()
def index_docs(works: list[dict]):
    """Write the given work documents to the index.

    Each dict in ``works`` is expanded into keyword arguments for the
    writer's ``update_document`` call, so its keys are expected to be schema
    field names. The writer is used as a context manager, which per the
    whoosh documentation commits the changes when the block exits normally.
    """
    index = get_index()
    with index.writer() as doc_writer:
        for fields in works:
            doc_writer.update_document(**fields)
def search(
    query: str,
    collections: Optional[list[int]] = None,
    page: int = 1,
    pagesize: int = 20,
) -> tuple[list[dict], dict[str, Any]]:
    """Search the index and return one page of hits plus query metadata.

    The query string is lower-cased and parsed against the ``text`` field.
    If the first search finds nothing, the query is parsed again with
    ``FuzzyTerm`` as the parser's term class and the search is retried.

    Args:
        query: User query string in whoosh query syntax.
        collections: Collection values to restrict the search to. ``None`` or
            an empty list adds no ``collection`` terms to the query.
        page: Page number passed to ``search_page``.
        pagesize: Page length passed to ``search_page``.

    Returns:
        A ``(hits, meta)`` tuple. ``hits`` is a list of dicts with the keys
        ``pk``, ``name``, ``composer``, ``edition`` and ``collection_id``.
        ``meta`` holds ``query`` (string form of the executed query) and
        ``total`` (``len()`` of the results page).
    """
    query = query.lower()
    qp = QueryParser("text", schema=schema)
    q = qp.parse(query)

    # OR together one Term per requested collection, starting from NullQuery.
    # NOTE(review): with no collections this relies on whoosh normalising
    # ``q & NullQuery`` down to ``q`` -- confirm against the whoosh version
    # in use.
    limit = NullQuery
    if collections is not None:
        for c in collections:
            limit |= Term("collection", c)

    ix = get_index()
    with ix.searcher() as searcher:
        results = searcher.search_page(q & limit, page, pagesize)
        # If nothing matched, retry with fuzzy term matching.
        if results.results.is_empty():
            qp.termclass = FuzzyTerm
            q = qp.parse(query)
            results = searcher.search_page(q & limit, page, pagesize)
        # Copy stored fields out while the searcher is still open.
        hits = [
            dict(
                pk=result["work"],
                name=result["name"],
                composer=result["composer"],
                edition=result["edition"],
                collection_id=int(result["collection"]),
            )
            for result in results
        ]
        meta: dict[str, Any] = {
            "query": str(q & limit),
            "total": len(results),
        }
    return hits, meta