polyphonic/app/library/indexer/whoosh.py

import os.path
import shutil
from typing import Any, Optional

from django.conf import settings
from whoosh.analysis import StemmingAnalyzer, CharsetFilter
from whoosh.fields import Schema, TEXT, KEYWORD, NUMERIC
from whoosh.index import create_in, open_dir, EmptyIndexError, Index
from whoosh.qparser import QueryParser
from whoosh.query import Term, NullQuery, FuzzyTerm
from whoosh.support.charset import accent_map

# Analyzer for the free-text field: Whoosh's stemming analyzer piped through a
# charset filter built from ``accent_map`` (presumably so accented and
# unaccented spellings index to the same terms -- confirm against Whoosh docs).
stemming_analyzer = StemmingAnalyzer() | CharsetFilter(accent_map)

# Index schema. Only fields declared with ``stored=True`` can be read back
# from a hit; ``tag`` and ``text`` are indexed but not stored.
schema = Schema(
    # Numeric id of the work; stored and marked unique.
    work=NUMERIC(stored=True, unique=True),
    # Numeric id of the collection the work belongs to; search() filters on it.
    collection=NUMERIC(stored=True),
    # Display fields returned with every hit by search().
    name=TEXT(stored=True),
    composer=TEXT(stored=True),
    edition=TEXT(stored=True),
    # Keyword field configured with commas=True (comma-separated values).
    tag=KEYWORD(commas=True),
    # Free-text field; this is the default field search() parses queries against.
    text=TEXT(analyzer=stemming_analyzer),
)


# Filesystem location of the index directory, taken from Django settings.
index_path = settings.WHOOSH_INDEX


def create_index() -> Index:
    """Create a new index in ``index_path`` and return it.

    The index directory (including any missing parent directories) is
    created first if needed, then ``create_in`` initialises an index there
    using the module-level ``schema``.

    NOTE(review): per the Whoosh documentation ``create_in`` starts a fresh
    index in the directory, replacing an existing one -- confirm before
    calling this against a populated index.

    Returns:
        The newly created Whoosh index.
    """
    # makedirs(exist_ok=True) replaces the exists()/mkdir() pair: it has no
    # check-then-create race and also works when a parent directory of
    # index_path does not exist yet.
    os.makedirs(index_path, exist_ok=True)
    return create_in(index_path, schema)


def get_index() -> Index:
    """Return the index stored at ``index_path``.

    The existing index is opened when possible; if Whoosh raises
    ``EmptyIndexError`` a new index is created via ``create_index`` and
    returned instead.
    """
    try:
        opened = open_dir(index_path)
    except EmptyIndexError:
        # No usable index at that location yet: build one.
        opened = create_index()
    return opened


def reset_index() -> Index:
    """Delete the index directory and create a new, empty index.

    Returns:
        The freshly created index, as returned by ``create_index``.

    Raises:
        OSError: If the directory exists but cannot be removed (for example
            because of permissions). A directory that is simply missing is
            not an error.
    """
    try:
        shutil.rmtree(index_path)
    except FileNotFoundError:
        # Nothing to remove (e.g. first run before any index was built);
        # resetting should still leave a usable empty index behind.
        pass
    return create_index()


def index_docs(works: list[dict]):
    """Add or update the given documents in the index.

    Each dict in ``works`` is expanded into keyword arguments for the
    writer's ``update_document``, so its keys are expected to be field names
    of the module-level schema. All documents go through a single writer
    that is used as a context manager.
    """
    index_writer = get_index().writer()
    with index_writer as w:
        for document in works:
            w.update_document(**document)


def search(
    query: str,
    collections: Optional[list[int]] = None,
    page: int = 1,
    pagesize: int = 20,
) -> tuple[list[dict], dict[str, Any]]:
    """Search the index and return one page of hits plus metadata.

    The query string is lower-cased and parsed against the ``text`` field.
    If that search matches nothing, the same string is parsed again with
    ``FuzzyTerm`` as the parser's term class and the search is retried once.

    Args:
        query: Query string to parse.
        collections: Collection ids to restrict the search to. ``None`` or
            an empty list adds no collection restriction.
        page: Page number handed to Whoosh's ``search_page``.
        pagesize: Number of hits per page.

    Returns:
        A ``(hits, meta)`` tuple. ``hits`` is a list of dicts with the keys
        ``pk``, ``name``, ``composer``, ``edition`` and ``collection_id``.
        ``meta`` holds ``query`` (string form of the query that was executed
        last) and ``total`` (``len()`` of the returned results page).
    """
    query = query.lower()

    parser = QueryParser("text", schema=schema)
    parsed = parser.parse(query)

    # OR together one Term per requested collection. Starting from NullQuery
    # means nothing is added when no collections were requested.
    limit = NullQuery
    for collection_id in collections or ():
        limit |= Term("collection", collection_id)

    ix = get_index()
    with ix.searcher() as searcher:
        final_query = parsed & limit
        results = searcher.search_page(final_query, page, pagesize)

        # Nothing matched: retry once with fuzzy term matching.
        if results.results.is_empty():
            parser.termclass = FuzzyTerm
            final_query = parser.parse(query) & limit
            results = searcher.search_page(final_query, page, pagesize)

        hits = [
            dict(
                pk=hit["work"],
                name=hit["name"],
                composer=hit["composer"],
                edition=hit["edition"],
                collection_id=int(hit["collection"]),
            )
            for hit in results
        ]
        # Read the total while the searcher backing the results is still open.
        total = len(results)

    meta: dict[str, Any] = {"query": str(final_query), "total": total}
    return hits, meta