Coverage for src / lilbee / wiki / cache.py: 100%
24 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-05-15 20:55 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-05-15 20:55 +0000
1"""Incremental-rebuild cache helpers for wiki page generation.
3Provides ``_leaf_hash`` (SHA-256 over chunk content as cache key) and
4``_find_cached_leaf`` (look up a previously-written page whose
5``leaf_hash`` frontmatter matches), plus ``normalize_whitespace`` for
6robust excerpt comparison across PDF line wrapping.
7"""
9from __future__ import annotations
11import hashlib
12import re
13from pathlib import Path
15from lilbee.data.store import SearchChunk
16from lilbee.wiki.shared import WikiSubdir, parse_frontmatter
# One or more consecutive whitespace characters. Compiled once at import time
# so ``normalize_whitespace`` does not look the pattern up on every call.
_WHITESPACE_RE = re.compile(r"\s+")
def _leaf_hash(chunks: list[SearchChunk]) -> str:
    """Return the SHA-256 hex digest of the chunks' text, in the given order.

    Each chunk's ``chunk`` text is UTF-8 encoded and followed by a single NUL
    byte before being fed to the hash, so the digest depends on both the
    content and the order of ``chunks``.

    The digest is the cache key for incremental rebuild: an existing page
    whose frontmatter ``leaf_hash`` equals this value has already summarized
    the exact same input and can be reused without a new LLM call.

    Args:
        chunks: Chunks whose ``chunk`` attribute holds the text to hash.

    Returns:
        The lowercase hexadecimal SHA-256 digest. An empty list yields the
        digest of empty input.

    Note:
        The NUL terminator is not escaped, so text that itself contains
        ``"\\0"`` can alias a different split of the same characters. The
        encoding is kept as is because changing it would change every
        digest already stored in page frontmatter.
    """
    digest = hashlib.sha256()
    for item in chunks:
        # Terminate every chunk (including the last) with NUL so that
        # adjacent chunks cannot silently run together.
        digest.update(item.chunk.encode("utf-8") + b"\0")
    return digest.hexdigest()
def _find_cached_leaf(wiki_root: Path, slug: str, leaf_hash: str) -> Path | None:
    """Return an existing page whose ``leaf_hash`` frontmatter matches, or ``None``.

    Looks for ``<slug>.md`` under ``summaries/`` first and ``drafts/`` second,
    and returns the first file whose parsed frontmatter carries a
    ``leaf_hash`` equal to the one supplied. Drafts are included so that an
    unchanged draft stays in drafts rather than triggering a speculative
    regeneration.

    Args:
        wiki_root: Directory that contains the wiki subdirectories.
        slug: Page name without the ``.md`` suffix.
        leaf_hash: Digest to compare against the page's frontmatter.

    Returns:
        Path of the matching page, or ``None`` when no candidate exists or
        none of the existing candidates has a matching ``leaf_hash``.
    """
    filename = f"{slug}.md"
    for section in (WikiSubdir.SUMMARIES, WikiSubdir.DRAFTS):
        page = wiki_root / section / filename
        if page.is_file():
            frontmatter = parse_frontmatter(page.read_text(encoding="utf-8"))
            if frontmatter.get("leaf_hash") == leaf_hash:
                return page
    return None
def normalize_whitespace(text: str) -> str:
    """Collapse each run of whitespace to one space and strip both ends.

    PDF extractors preserve line breaks mid-sentence (``vehicle,\\nthe greater``)
    while LLMs paraphrase the same quote as a single-spaced string
    (``vehicle, the greater``). A strict substring check would reject a
    faithful citation on whitespace alone, so both sides are passed through
    this function before comparison.

    Args:
        text: The string to normalize.

    Returns:
        ``text`` with every whitespace run replaced by a single space and
        with leading and trailing whitespace removed.
    """
    collapsed = _WHITESPACE_RE.sub(" ", text)
    return collapsed.strip()