Coverage for src/lilbee/wiki/quality.py: 100% (80 statements)

1"""Faithfulness scoring and drift heuristics for wiki page bodies.
3Holds the deterministic body-vs-source cosine score, the title/body
4coherence pre-check that gates a page on a structurally valid heading,
5and the unified-diff helpers used when an existing page's content
6changed by more than the configured threshold (drift detection).
7"""
9from __future__ import annotations
11import difflib
12import logging
14import numpy as np
16from lilbee.app.services import get_services
17from lilbee.core.config import Config
18from lilbee.core.text import clean_label_for_display, is_valid_label
19from lilbee.data.store import SearchChunk
20from lilbee.wiki.citation import strip_citation_block
22log = logging.getLogger(__name__)
24_MAX_DIFF_PREVIEW_LINES = 20 # lines of unified diff shown in drift warnings


def content_change_ratio(old_text: str, new_text: str) -> float:
    """Fraction of lines that changed between two texts (0.0 = identical, 1.0 = total rewrite)."""
    old_lines = old_text.splitlines()
    new_lines = new_text.splitlines()
    if not old_lines and not new_lines:
        return 0.0
    total = max(len(old_lines), len(new_lines))
    matcher = difflib.SequenceMatcher(None, old_lines, new_lines)
    changed = total - sum(block.size for block in matcher.get_matching_blocks())
    return changed / total
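
# Usage sketch (illustrative values): with three-line texts where a single
# line differs, two of the three lines match, so the ratio is 1/3.
#
#   >>> content_change_ratio("a\nb\nc", "a\nb\nd")
#   0.3333333333333333
#   >>> content_change_ratio("same", "same")
#   0.0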


def diff_summary(old_text: str, new_text: str) -> str:
    """Human-readable unified diff summary (first 20 diff lines)."""
    diff = difflib.unified_diff(
        old_text.splitlines(),
        new_text.splitlines(),
        lineterm="",
        fromfile="old",
        tofile="new",
    )
    lines = list(diff)
    if len(lines) > _MAX_DIFF_PREVIEW_LINES:
        extra = len(lines) - _MAX_DIFF_PREVIEW_LINES
        return "\n".join(lines[:_MAX_DIFF_PREVIEW_LINES]) + f"\n... ({extra} more lines)"
    return "\n".join(lines)


def _title_content_coherence(wiki_text: str, label: str) -> bool:
    """Deterministic pre-check: title and body must reference the concept.

    The LLM faithfulness score evaluates whether the prose reflects
    the source chunks but does not penalize structural noise in the
    title (bb-8b7s: ``| | designer`` passed at 0.90 because the body
    was coherent). This pre-check asserts three invariants:

    1. The first ``# `` heading must be a sanity-valid label per
       :func:`is_valid_label`. A heading like ``| | designer`` fails
       the structural-char gate even though it contains the cleaned
       display name as a substring.
    2. The cleaned display name must appear in the heading as a
       case-insensitive substring. Covers LLM drift where the
       heading names a different concept than requested.
    3. The body must mention the display name at least once outside
       the heading. Covers the "LLM talked about something adjacent
       but never named the concept" regression.

    Returns True when all three hold, False otherwise.
    """
    display = clean_label_for_display(label).lower()
    if not display:
        return False
    heading: str | None = None
    body_parts: list[str] = []
    for line in wiki_text.splitlines():
        if heading is None and line.startswith("# "):
            heading = line[2:].strip()
            continue
        body_parts.append(line)
    if heading is None:
        return False
    if not is_valid_label(heading):
        return False
    if display not in heading.lower():
        return False
    body = "\n".join(body_parts).lower()
    return display in body
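
# Usage sketch (illustrative; assumes clean_label_for_display("designer")
# returns "designer" and that is_valid_label rejects structural characters
# such as "|", per the docstring above):
#
#   _title_content_coherence("# Designer\n\nThe designer shaped it.", "designer")
#   # -> True: valid heading, heading names the concept, body names it again.
#   _title_content_coherence("# | | designer\n\nThe designer shaped it.", "designer")
#   # -> False: heading fails the is_valid_label structural gate.
#   _title_content_coherence("# Designer\n\nSomething adjacent, never named.", "designer")
#   # -> False: body never mentions the display name.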


def _mean_vector(vectors: list[list[float]]) -> list[float]:
    """Compute the element-wise mean of a non-empty vector list.

    Empty input returns an empty list; callers must check before any
    downstream dot-product so we do not leak a shape mismatch.

    Routes through numpy so the inner loop runs in C: for the typical
    ``D=768``, ``N=10`` case this cuts per-call cost from ~8k Python
    ops to a single SIMD-backed reduction.
    """
    if not vectors:
        return []
    result: list[float] = np.asarray(vectors, dtype=np.float32).mean(axis=0).tolist()
    return result
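
# Usage sketch (illustrative values):
#
#   >>> _mean_vector([[1.0, 0.0], [0.0, 1.0]])
#   [0.5, 0.5]
#   >>> _mean_vector([])
#   []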


def _embedding_faithfulness_score(
    body_vec: list[float],
    source_vectors: list[list[float]],
) -> float:
    """Cosine-similarity score between the body and the mean source vector.

    Assumes L2-normalized vectors (both the embedder and the store
    return normalized vectors), so cosine reduces to a dot product.
    Falls through to :func:`cosine_sim` so that a non-normalized
    vector does not silently produce an out-of-range value. The
    result is clamped at zero because a negative cosine means the
    body vector points the other way from the mean of the sources;
    for threshold purposes we treat that the same as uncorrelated.

    Returns 0.0 on a dimension mismatch between the body vector and
    the source-vector mean. That is not expected in production (the
    embedder and the chunk vectors come from the same model), but a
    stub-driven test may hand in off-shape vectors, and crashing the
    whole pipeline on the shape check would hide the real assertion.
    """
    from lilbee.data.store import cosine_sim

    mean_vec = _mean_vector(source_vectors)
    if not mean_vec or not body_vec:
        return 0.0
    if len(mean_vec) != len(body_vec):
        log.warning(
            "Body vector dim %d does not match source vector dim %d; scoring 0.0",
            len(body_vec),
            len(mean_vec),
        )
        return 0.0
    return max(0.0, cosine_sim(body_vec, mean_vec))
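
# Usage sketch (illustrative; assumes cosine_sim in lilbee.data.store is a
# standard normalized cosine, which this module does not define):
#
#   _embedding_faithfulness_score([1.0, 0.0], [[1.0, 0.0], [0.0, 1.0]])
#   # mean source vector is [0.5, 0.5]; cosine against [1, 0] is ~0.71
#   _embedding_faithfulness_score([1.0, 0.0], [[0.5, 0.5, 0.5]])
#   # -> 0.0 plus a warning: body dim 2 vs source dim 3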


def check_faithfulness(
    chunks: list[SearchChunk],
    wiki_text: str,
    label: str,
    config: Config | None = None,
) -> float:
    """Score the wiki body's similarity to its source chunks, 0.0 on failure.

    Faithfulness is a deterministic cosine-similarity score between
    the page body and the mean of its source chunk vectors. The B3
    title/body coherence pre-check still runs first as a hard gate: a
    garbage H1 returns 0.0 regardless of embedding similarity, so
    structurally broken pages route to drafts even when the prose
    happens to be coherent.

    ``chunks`` carries ``.vector`` populated by LanceDB (see
    ``SearchChunk`` in ``lilbee.data.store``), so no extra embedder call is
    needed for the source side. The body is embedded once via the
    shared services embedder. Any exception in the embedder (model
    missing, network issue, invalid config) is caught and reported as
    0.0 so a single faulty page drops to drafts instead of aborting
    the whole build.
    """
    if not _title_content_coherence(wiki_text, label):
        log.info(
            "Faithfulness title/body coherence failed for %r; scoring 0.0",
            label,
        )
        return 0.0
    source_vectors = [c.vector for c in chunks if c.vector]
    if not source_vectors:
        log.warning("No source vectors for %s; scoring 0.0", label)
        return 0.0

    # Strip the frontmatter + citation block so we embed only the body
    # prose. render_citation_block may not have run yet when the score
    # is computed (it is appended later), but strip_citation_block is
    # idempotent on missing trailers.
    body_text = strip_citation_block(wiki_text).strip()
    if not body_text:
        log.warning("Empty body for %s; scoring 0.0", label)
        return 0.0

    try:
        body_vectors = get_services().embedder.embed_batch([body_text])
    except Exception as exc:
        log.warning("Body embedding failed for %s: %s", label, exc)
        return 0.0
    if not body_vectors:
        return 0.0
    return _embedding_faithfulness_score(body_vectors[0], source_vectors)
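
# Usage sketch (illustrative; ``faithfulness_threshold`` and
# ``route_to_drafts`` are hypothetical caller-side names, not defined here):
#
#   score = check_faithfulness(chunks, wiki_text, label, config)
#   if score < 0.75:  # e.g. config.faithfulness_threshold
#       route_to_drafts(page)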