Coverage for src / lilbee / retrieval / query / tokenize.py: 100%

14 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-05-15 20:55 +0000

1"""Token utilities for the RAG query pipeline.""" 

2 

3from __future__ import annotations 

4 

5import math 

6import re 

7 

8_MIN_TOKEN_LEN = 2 

9_TOKEN_SPLIT_RE = re.compile(r"\W+") 

10 

11 

12def _tokenize(text: str) -> list[str]: 

13 """Lowercase alphanumeric tokens, split on any non-alnum run.""" 

14 return [word for word in _TOKEN_SPLIT_RE.split(text.lower()) if len(word) >= _MIN_TOKEN_LEN] 

15 

16 

17def _idf_weights( 

18 question_terms: set[str], 

19 chunk_tokens: list[set[str]], 

20) -> dict[str, float]: 

21 """Inverse Document Frequency weight per query term over the candidate chunks. 

22 

23 Classical IDF per Spärck Jones (1972), "A Statistical Interpretation 

24 of Term Specificity and Its Application in Retrieval", Journal of 

25 Documentation 28:11-21. Terms that appear in every chunk collapse to 

26 zero weight, so corpus-specific stopwords are filtered automatically. 

27 """ 

28 n = len(chunk_tokens) 

29 df: dict[str, int] = {} 

30 for tokens in chunk_tokens: 

31 for term in tokens & question_terms: 

32 df[term] = df.get(term, 0) + 1 

33 return {t: max(0.0, math.log(n / (1 + df.get(t, 0)))) for t in question_terms}