Coverage for src / lilbee / wiki / cache.py: 100%

24 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-05-15 20:55 +0000

1"""Incremental-rebuild cache helpers for wiki page generation. 

2 

3Provides ``_leaf_hash`` (SHA-256 over chunk content as cache key) and 

4``_find_cached_leaf`` (look up a previously-written page whose 

5``leaf_hash`` frontmatter matches), plus ``normalize_whitespace`` for 

6robust excerpt comparison across PDF line wrapping. 

7""" 

8 

9from __future__ import annotations 

10 

11import hashlib 

12import re 

13from pathlib import Path 

14 

15from lilbee.data.store import SearchChunk 

16from lilbee.wiki.shared import WikiSubdir, parse_frontmatter 

17 

# Matches one or more consecutive whitespace characters; compiled once at
# import time so ``normalize_whitespace`` can reuse it on every call.
_WHITESPACE_RE = re.compile(r"\s+")

19 

20 

def _leaf_hash(chunks: list[SearchChunk]) -> str:
    """Return the SHA-256 hex digest identifying an ordered list of chunks.

    Every chunk's text is UTF-8 encoded and followed by a single NUL byte,
    so both the order of the chunks and where one chunk ends and the next
    begins feed into the digest.

    The digest is the incremental-rebuild cache key: a page whose
    ``leaf_hash`` frontmatter equals it was produced from the same input,
    so it can be reused instead of issuing another LLM call.
    """
    digest = hashlib.sha256()
    for item in chunks:
        # Text plus its NUL terminator are fed together; the byte stream is
        # the same as hashing them in two separate updates.
        digest.update(item.chunk.encode("utf-8") + b"\0")
    return digest.hexdigest()

33 

34 

def _find_cached_leaf(wiki_root: Path, slug: str, leaf_hash: str) -> Path | None:
    """Locate a previously written page for ``slug`` built from the same input.

    ``summaries/`` is checked first, then ``drafts/``; the first
    ``<slug>.md`` whose ``leaf_hash`` frontmatter equals ``leaf_hash`` is
    returned. Drafts are included so that an unchanged draft remains a
    draft instead of being speculatively regenerated.

    Returns ``None`` when no such file exists or no stored hash matches.
    """
    filename = f"{slug}.md"
    for subdir in (WikiSubdir.SUMMARIES, WikiSubdir.DRAFTS):
        page = wiki_root / subdir / filename
        if page.is_file():
            frontmatter = parse_frontmatter(page.read_text(encoding="utf-8"))
            if frontmatter.get("leaf_hash") == leaf_hash:
                return page
    return None

49 

50 

def normalize_whitespace(text: str) -> str:
    """Return ``text`` with each whitespace run squeezed to one space, edges stripped.

    Text extracted from PDFs keeps line breaks in the middle of sentences
    (``vehicle,\\nthe greater``), whereas an LLM quoting the same passage
    tends to emit it single-spaced (``vehicle, the greater``). Normalizing
    both sides before a substring comparison keeps a faithful citation from
    being rejected purely because of whitespace differences.
    """
    collapsed = _WHITESPACE_RE.sub(" ", text)
    return collapsed.strip()