Coverage for src/lilbee/retrieval/query/tokenize.py: 100% (14 statements)
1"""Token utilities for the RAG query pipeline."""
3from __future__ import annotations
5import math
6import re
8_MIN_TOKEN_LEN = 2
9_TOKEN_SPLIT_RE = re.compile(r"\W+")


def _tokenize(text: str) -> list[str]:
    """Lowercase word tokens, split on any run of non-word characters."""
    return [word for word in _TOKEN_SPLIT_RE.split(text.lower()) if len(word) >= _MIN_TOKEN_LEN]
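

# A quick illustration of _tokenize (a sketch, not part of the covered module;
# the sample string is made up). Punctuation and hyphens split tokens, and the
# single-character fragments of "a.k.a." fall below _MIN_TOKEN_LEN:
#
#   >>> _tokenize("Retrieval-augmented generation, a.k.a. RAG!")
#   ['retrieval', 'augmented', 'generation', 'rag']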


def _idf_weights(
    question_terms: set[str],
    chunk_tokens: list[set[str]],
) -> dict[str, float]:
    """Inverse document frequency weight per query term over the candidate chunks.

    Classical IDF per Spärck Jones (1972), "A Statistical Interpretation
    of Term Specificity and Its Application in Retrieval", Journal of
    Documentation 28:11-21. Terms that appear in every chunk collapse to
    zero weight, so corpus-specific stopwords are filtered out automatically.
    """
    n = len(chunk_tokens)
    # Document frequency: the number of candidate chunks containing each query term.
    df: dict[str, int] = {}
    for tokens in chunk_tokens:
        for term in tokens & question_terms:
            df[term] = df.get(term, 0) + 1
    # The +1 smoothing keeps the ratio finite for terms absent from every chunk,
    # and the max() clamp zeroes terms whose document frequency reaches n.
    # Assumes at least one candidate chunk; n == 0 would make log(0.0) raise.
    return {t: max(0.0, math.log(n / (1 + df.get(t, 0)))) for t in question_terms}
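

# Worked example of _idf_weights (a sketch, not part of the covered module;
# the chunk sets are made up). With three chunks, "bee" appears in all of
# them and clamps to zero, while "queen", found in one chunk, gets
# log(3 / (1 + 1)) = log(1.5) ≈ 0.405:
#
#   >>> chunks = [{"bee", "hive"}, {"bee", "nectar"}, {"bee", "queen"}]
#   >>> _idf_weights({"bee", "queen"}, chunks)
#   {'bee': 0.0, 'queen': 0.4054651081081644}  # key order may vary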