# polyphonic/app/library/indexer/whoosh.py

import os.path
import shutil
from typing import Any, Optional

from django.conf import settings
from whoosh.analysis import StemmingAnalyzer, CharsetFilter
from whoosh.fields import Schema, TEXT, KEYWORD, NUMERIC
from whoosh.index import create_in, open_dir, EmptyIndexError, Index
from whoosh.qparser import QueryParser
from whoosh.query import Term, NullQuery
from whoosh.support.charset import accent_map

# Analyzer for the full-text field: whoosh's StemmingAnalyzer with its token
# stream piped through a CharsetFilter built from whoosh's `accent_map`
# (presumably folding accented characters -- confirm against whoosh docs).
stemming_analyzer = StemmingAnalyzer() | CharsetFilter(accent_map)

# Index schema. Fields marked `stored=True` can be read back from a hit;
# `search()` below reads work, name, composer, edition and collection.
schema = Schema(
    # Numeric key of the work; declared unique so `update_document()` can
    # replace an existing document instead of adding a duplicate.
    work=NUMERIC(stored=True, unique=True),
    # Numeric collection id; `search()` builds Term filters on this field.
    collection=NUMERIC(stored=True),
    name=TEXT(stored=True),
    composer=TEXT(stored=True),
    edition=TEXT(stored=True),
    # Keyword field configured with `commas=True`; not stored.
    tag=KEYWORD(commas=True),
    # Full-text body analyzed with `stemming_analyzer`; not stored. This is
    # the default field used by the query parser in `search()`.
    text=TEXT(analyzer=stemming_analyzer),
)


# The index lives in an "index" directory inside the parent directory of
# settings.BASE_DIR (i.e. alongside BASE_DIR, not within it).
index_path = os.path.join(os.path.dirname(settings.BASE_DIR), "index")


def create_index() -> Index:
    """Create a whoosh index for ``schema`` inside ``index_path``.

    The index directory, including any missing parent directories, is
    created first if it does not exist yet.

    Returns:
        The index object returned by ``create_in``.
    """
    # makedirs(exist_ok=True) replaces the exists()/mkdir() pair: it has no
    # check-then-create race and also works when the parent is missing.
    os.makedirs(index_path, exist_ok=True)
    return create_in(index_path, schema)


def get_index() -> Index:
    """Return the index stored at ``index_path``, creating it when absent.

    An ``EmptyIndexError`` from ``open_dir`` is treated as "no index yet"
    and answered by building a new one with ``create_index``.
    """
    try:
        existing = open_dir(index_path)
    except EmptyIndexError:
        return create_index()
    return existing


def reset_index() -> Index:
    """Delete the index directory and create a new index in its place.

    Returns:
        The index object returned by ``create_index``.
    """
    try:
        shutil.rmtree(index_path)
    except FileNotFoundError:
        # No index directory yet: nothing to delete, so a reset simply
        # amounts to creating the index. Other errors still propagate.
        pass
    return create_index()


def index_docs(works: list[dict]):
    """Add or replace one index document per entry in ``works``.

    Each dict is expanded into keyword arguments for the writer's
    ``update_document``; the writer is used as a context manager around
    the whole batch.
    """
    with get_index().writer() as batch:
        for fields in works:
            batch.update_document(**fields)


def search(
    query: str,
    collections: Optional[list[int]] = None,
    page: int = 1,
    pagesize: int = 20,
) -> tuple[list[dict], dict[str, Any]]:
    """Run a full-text query against the index and return one page of hits.

    Args:
        query: Query string. It is lower-cased and parsed with the "text"
            field as the default field.
        collections: Collection ids to search in. A ``Term`` on the
            "collection" field is OR-ed together for each id and the result
            is AND-ed with the parsed query. ``None`` or an empty list adds
            no collection terms.
        page: Page number handed to ``searcher.search_page``.
        pagesize: Page length handed to ``searcher.search_page``.

    Returns:
        A ``(hits, meta)`` tuple. ``hits`` is a list of dicts with the keys
        ``pk``, ``name``, ``composer``, ``edition`` and ``collection_id``.
        ``meta`` holds ``"query"`` (string form of the parsed query, before
        the collection filter is applied) and ``"total"`` (``len()`` of the
        result page object).
    """
    meta: dict[str, Any] = {}

    # NOTE(review): lower-casing the raw query may also lower-case query
    # operators written in upper case -- confirm this is intended.
    parsed = QueryParser("text", schema=schema).parse(query.lower())
    meta["query"] = str(parsed)

    # NOTE(review): relies on whoosh's `|` / `&` handling of NullQuery so that
    # an empty filter leaves the parsed query unrestricted -- confirm.
    collection_filter = NullQuery
    for collection_id in collections or ():
        collection_filter = collection_filter | Term("collection", collection_id)
    combined = parsed & collection_filter

    with get_index().searcher() as searcher:
        results = searcher.search_page(combined, page, pagesize)
        hits = [
            dict(
                pk=result["work"],
                name=result["name"],
                composer=result["composer"],
                edition=result["edition"],
                collection_id=int(result["collection"]),
            )
            for result in results
        ]
        # Read the count while the searcher is still open rather than
        # touching the results object after its searcher has been closed.
        meta["total"] = len(results)

    return hits, meta