Coverage for src/lilbee/retrieval/query/tokenize.py: 100% (14 statements)
1"""Token utilities for the RAG query pipeline."""
3from __future__ import annotations
5import math
6import re
8_MIN_TOKEN_LEN = 2
9_TOKEN_SPLIT_RE = re.compile(r"\W+")


def _tokenize(text: str) -> list[str]:
    """Lowercase word tokens, split on any run of non-word characters."""
    return [word for word in _TOKEN_SPLIT_RE.split(text.lower()) if len(word) >= _MIN_TOKEN_LEN]
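

# A quick illustration of _tokenize (a sketch, not part of the covered module;
# the sample string is made up). Punctuation and hyphens split tokens, and the
# single-character fragments of "a.k.a." fall below _MIN_TOKEN_LEN:
#
#   >>> _tokenize("Retrieval-augmented generation, a.k.a. RAG!")
#   ['retrieval', 'augmented', 'generation', 'rag']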


def _idf_weights(
    question_terms: set[str],
    chunk_tokens: list[set[str]],
) -> dict[str, float]:
    """Inverse document frequency weight per query term over the candidate chunks.

    Classical IDF per Spärck Jones (1972), "A Statistical Interpretation
    of Term Specificity and Its Application in Retrieval", Journal of
    Documentation 28:11-21. Terms that appear in every chunk collapse to
    zero weight, so corpus-specific stopwords are filtered out automatically.
    """
    n = len(chunk_tokens)
    # Document frequency: the number of candidate chunks containing each query term.
    df: dict[str, int] = {}
    for tokens in chunk_tokens:
        for term in tokens & question_terms:
            df[term] = df.get(term, 0) + 1
    # The +1 smoothing keeps the ratio finite for terms absent from every chunk,
    # and the max() clamp zeroes terms whose document frequency reaches n.
    # Assumes at least one candidate chunk; n == 0 would make log(0.0) raise.
    return {t: max(0.0, math.log(n / (1 + df.get(t, 0)))) for t in question_terms}
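

# Worked example of _idf_weights (a sketch, not part of the covered module;
# the chunk sets are made up). With three chunks, "bee" appears in all of
# them and clamps to zero, while "queen", found in one chunk, gets
# log(3 / (1 + 1)) = log(1.5) ≈ 0.405:
#
#   >>> chunks = [{"bee", "hive"}, {"bee", "nectar"}, {"bee", "queen"}]
#   >>> _idf_weights({"bee", "queen"}, chunks)
#   {'bee': 0.0, 'queen': 0.4054651081081644}  # key order may vary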