"""Whoosh-backed full-text search index for works.

The index lives in an ``index`` directory located in the parent directory of
``settings.BASE_DIR``.  This module offers helpers to create, open and reset
that index, to (re)index documents, and to run paginated searches.
"""

import os.path
import shutil
from typing import Any, Optional

from django.conf import settings
from whoosh.analysis import CharsetFilter, StemmingAnalyzer
from whoosh.fields import KEYWORD, NUMERIC, TEXT, Schema
from whoosh.index import EmptyIndexError, Index, create_in, open_dir
from whoosh.qparser import QueryParser
from whoosh.query import NullQuery, Term
from whoosh.support.charset import accent_map

# Stemming analyzer whose output is passed through a charset filter built
# from Whoosh's ``accent_map``.
stemming_analyzer = StemmingAnalyzer() | CharsetFilter(accent_map)

schema = Schema(
    # Unique, stored numeric key of the document.
    work=NUMERIC(stored=True, unique=True),
    collection=NUMERIC(stored=True),
    name=TEXT(stored=True),
    composer=TEXT(stored=True),
    edition=TEXT(stored=True),
    # Comma-separated keywords; indexed but not stored.
    tag=KEYWORD(commas=True),
    # Searchable body text, analyzed with ``stemming_analyzer``; not stored.
    text=TEXT(analyzer=stemming_analyzer),
)

index_path = os.path.join(os.path.dirname(settings.BASE_DIR), "index")


def create_index() -> Index:
    """Create a fresh index at ``index_path`` and return it.

    The directory is created first when it does not exist yet.
    """
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(index_path, exist_ok=True)
    return create_in(index_path, schema)


def get_index() -> Index:
    """Open the index at ``index_path``, creating it if none exists there."""
    try:
        return open_dir(index_path)
    except EmptyIndexError:
        return create_index()


def reset_index() -> Index:
    """Delete the index directory and return a newly created empty index."""
    try:
        shutil.rmtree(index_path)
    except FileNotFoundError:
        # Nothing to delete: the index was never created (or already removed).
        pass
    return create_index()


def index_docs(works: list[dict]) -> None:
    """Add or update the given documents in the index.

    Args:
        works: One dict per document; each dict is expanded into keyword
            arguments for ``writer.update_document`` and so must use field
            names from ``schema``.
    """
    ix = get_index()
    # The writer is used as a context manager so it is finalized on exit.
    with ix.writer() as writer:
        for work in works:
            writer.update_document(**work)


def search(
    query: str,
    collections: Optional[list[int]] = None,
    page: int = 1,
    pagesize: int = 20,
) -> tuple[list[dict], dict[str, Any]]:
    """Run a paginated search against the ``text`` field.

    Args:
        query: Query string; it is lower-cased before being parsed.
        collections: Collection ids to restrict the search to.  ``None`` or
            an empty list applies no collection restriction.
        page: Page number passed to ``searcher.search_page``.
        pagesize: Page length passed to ``searcher.search_page``.

    Returns:
        A ``(hits, meta)`` tuple.  ``hits`` is a list of dicts with the keys
        ``pk``, ``name``, ``composer``, ``edition`` and ``collection_id``.
        ``meta`` holds ``"query"`` (string form of the parsed text query,
        before the collection filter is applied) and ``"total"``
        (``len()`` of the results page).
    """
    meta: dict[str, Any] = {}

    parser = QueryParser("text", schema=schema)
    q = parser.parse(query.lower())
    meta["query"] = str(q)

    # OR together one Term per requested collection, starting from NullQuery.
    # With no collections the filter stays NullQuery, which Whoosh drops when
    # the combined query is normalized.
    collection_filter = NullQuery
    for collection_id in collections or ():
        collection_filter = collection_filter | Term("collection", collection_id)
    q = q & collection_filter

    ix = get_index()
    with ix.searcher() as searcher:
        results = searcher.search_page(q, page, pagesize)
        # Stored fields are read while the searcher is still open.
        hits = [
            {
                "pk": result["work"],
                "name": result["name"],
                "composer": result["composer"],
                "edition": result["edition"],
                "collection_id": int(result["collection"]),
            }
            for result in results
        ]
        # NOTE(review): len() of a Whoosh results page is expected to be the
        # total match count, not the page size -- confirm against Whoosh docs.
        meta["total"] = len(results)

    return hits, meta