polyphonic/app/library/indexer/whoosh.py

import os.path
import shutil
from typing import Any, Optional

from django.conf import settings
from django.db.models import QuerySet
from whoosh.analysis import StemmingAnalyzer, CharsetFilter
from whoosh.fields import Schema, ID, TEXT, KEYWORD, STORED, NUMERIC
from whoosh.index import create_in, open_dir, EmptyIndexError, Index
from whoosh.qparser import QueryParser
from whoosh.query import Term, NullQuery
from whoosh.support.charset import accent_map

from library.models import Work

# Analyzer used for the full-text field: Whoosh's StemmingAnalyzer piped
# through a CharsetFilter built from accent_map (presumably so accented and
# unaccented spellings match each other — confirm against the Whoosh docs).
stemming_analyzer = StemmingAnalyzer() | CharsetFilter(accent_map)

# Index schema. Only the fields declared with stored=True are read back from
# hits by search(); `tag` and `text` are declared without stored=True.
schema = Schema(
    work=NUMERIC(stored=True, unique=True),  # returned as "pk" by search()
    collection=NUMERIC(stored=True),  # matched by search()'s collection terms
    name=TEXT(stored=True),
    composer=TEXT(stored=True),
    edition=TEXT(stored=True),
    tag=KEYWORD(commas=True),  # keyword field configured with commas=True
    text=TEXT(analyzer=stemming_analyzer),  # default field parsed by search()
)


# On-disk location of the index: a directory named "index" inside the parent
# directory of settings.BASE_DIR.
index_path = os.path.join(os.path.dirname(settings.BASE_DIR), "index")


def create_index() -> Index:
    """Create a new index at ``index_path`` using the module ``schema``.

    The directory is created first if it does not exist yet.

    NOTE(review): Whoosh documents ``create_in`` as clearing any index that
    already lives in the directory, so calling this on a populated index is
    expected to discard its contents — confirm before calling it directly.

    Returns:
        The newly created ``Index``.
    """
    # makedirs(exist_ok=True) replaces the exists()/mkdir() pair: it has no
    # check-then-create race and also creates missing parent directories.
    os.makedirs(index_path, exist_ok=True)
    return create_in(index_path, schema)


def get_index() -> Index:
    """Return the index stored at ``index_path``, creating it when needed.

    ``open_dir`` is tried first; if it raises ``EmptyIndexError`` a new
    index is built through ``create_index`` and returned instead.
    """
    try:
        index = open_dir(index_path)
    except EmptyIndexError:
        index = create_index()
    return index


def reset_index() -> Index:
    """Delete the on-disk index directory and create a fresh, empty index.

    Returns:
        The newly created ``Index``.

    Raises:
        OSError: if the directory exists but cannot be removed (for example
            because of permissions). A missing directory is not an error.
    """
    try:
        shutil.rmtree(index_path)
    except FileNotFoundError:
        # Nothing to remove: resetting an index that was never created
        # should still succeed. Other OS errors are left to propagate.
        pass
    return create_index()


def index_docs(works: list[dict]):
    """Write each entry of ``works`` to the index via ``update_document``.

    Every dict is expanded into keyword arguments, so its keys are expected
    to be field names of the module ``schema``. All documents are written
    through a single writer used as a context manager.
    """
    index = get_index()
    with index.writer() as batch:
        for fields in works:
            batch.update_document(**fields)


def search(
    query: str,
    collections: Optional[list[int]] = None,
    page: int = 1,
    pagesize: int = 20,
) -> tuple[list[dict], dict[str, Any]]:
    """Search the index and return one page of hits plus metadata.

    Args:
        query: Query string. It is lower-cased and parsed against the
            ``text`` field of the schema.
        collections: Collection ids to restrict the search to; one
            ``collection`` term is OR-ed in per id. ``None`` or an empty
            list adds no collection terms.
        page: Page number, passed straight to ``search_page``.
        pagesize: Page length, passed straight to ``search_page``.

    Returns:
        A ``(hits, meta)`` tuple. ``hits`` is a list of dicts with the keys
        ``pk``, ``name``, ``composer``, ``edition`` and ``collection_id``.
        ``meta`` holds ``"query"`` (the string form of the parsed text
        query, before the collection restriction is applied) and
        ``"total"`` (``len()`` of the results page).
    """
    meta: dict[str, Any] = {}

    # NOTE(review): lower-casing the raw string also lower-cases any
    # upper-case query-language keywords the user typed — confirm that
    # this is intended.
    parsed = QueryParser("text", schema=schema).parse(query.lower())
    meta["query"] = str(parsed)

    # OR together one term per requested collection, starting from the
    # null query.
    # NOTE(review): with no collections the filter stays NullQuery; Whoosh's
    # query normalisation is expected to drop it from the conjunction so the
    # text query runs unrestricted — confirm that this is the desired
    # meaning of "no collections".
    collection_filter = NullQuery
    for collection_id in collections or ():
        collection_filter = collection_filter | Term("collection", collection_id)
    combined = parsed & collection_filter

    ix = get_index()
    with ix.searcher() as searcher:
        results = searcher.search_page(combined, page, pagesize)
        hits = [
            {
                "pk": result["work"],
                "name": result["name"],
                "composer": result["composer"],
                "edition": result["edition"],
                "collection_id": int(result["collection"]),
            }
            for result in results
        ]
        # Read the total while the searcher is still open rather than
        # touching the results object after its searcher has been closed.
        meta["total"] = len(results)

    return hits, meta