Coverage for src / lilbee / wiki / quality.py: 100%

80 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-05-15 20:55 +0000

"""Faithfulness scoring and drift heuristics for wiki page bodies.

Holds the deterministic body-vs-source cosine score, the title/body
coherence pre-check that gates a page on a structurally valid heading,
and the unified-diff helpers used when an existing page's content
changed by more than the configured threshold (drift detection).
"""

8 

9from __future__ import annotations 

10 

11import difflib 

12import logging 

13 

14import numpy as np 

15 

16from lilbee.app.services import get_services 

17from lilbee.core.config import Config 

18from lilbee.core.text import clean_label_for_display, is_valid_label 

19from lilbee.data.store import SearchChunk 

20from lilbee.wiki.citation import strip_citation_block 

21 

22log = logging.getLogger(__name__) 

23 

# Cap on unified-diff lines embedded in a drift-warning log message;
# diff_summary() replaces anything beyond this with a "... (N more lines)" trailer.
_MAX_DIFF_PREVIEW_LINES = 20

def content_change_ratio(old_text: str, new_text: str) -> float:
    """Return the fraction of lines that differ between two texts.

    0.0 means the texts are line-for-line identical; 1.0 means no line
    of one text could be matched against the other (total rewrite).
    """
    before = old_text.splitlines()
    after = new_text.splitlines()
    if not before and not after:
        # Two empty texts are identical by definition; also avoids a
        # zero division below.
        return 0.0
    line_count = max(len(before), len(after))
    matched = sum(
        block.size
        for block in difflib.SequenceMatcher(None, before, after).get_matching_blocks()
    )
    return (line_count - matched) / line_count

def diff_summary(old_text: str, new_text: str) -> str:
    """Render a human-readable unified diff of the two texts.

    Output is truncated to ``_MAX_DIFF_PREVIEW_LINES`` diff lines; when
    truncated, a trailer notes how many lines were omitted.
    """
    diff_lines = list(
        difflib.unified_diff(
            old_text.splitlines(),
            new_text.splitlines(),
            lineterm="",
            fromfile="old",
            tofile="new",
        )
    )
    if len(diff_lines) <= _MAX_DIFF_PREVIEW_LINES:
        return "\n".join(diff_lines)
    omitted = len(diff_lines) - _MAX_DIFF_PREVIEW_LINES
    preview = "\n".join(diff_lines[:_MAX_DIFF_PREVIEW_LINES])
    return preview + f"\n... ({omitted} more lines)"

def _title_content_coherence(wiki_text: str, label: str) -> bool:
    """Deterministic pre-check that title and body both name the concept.

    The LLM faithfulness score judges the prose against the source
    chunks but does not penalize structural noise in the title
    (bb-8b7s: ``| | designer`` passed at 0.90 because the body was
    coherent).  Three invariants are asserted here instead:

    1. The first ``# `` heading is a sanity-valid label per
       :func:`is_valid_label` — a heading like ``| | designer`` fails
       the structural-char gate even when it contains the cleaned
       display name as a substring.
    2. The cleaned display name appears in the heading
       case-insensitively, catching LLM drift where the heading names
       a different concept than requested.
    3. The body mentions the display name at least once outside the
       heading, catching the "LLM talked about something adjacent but
       never named the concept" regression.

    Returns True only when all three invariants hold.
    """
    wanted = clean_label_for_display(label).lower()
    if not wanted:
        return False

    heading: str | None = None
    rest: list[str] = []
    for line in wiki_text.splitlines():
        # Only the first "# " line counts as the heading; every other
        # line (including later headings) belongs to the body.
        if heading is None and line.startswith("# "):
            heading = line[2:].strip()
        else:
            rest.append(line)

    if heading is None or not is_valid_label(heading):
        return False
    if wanted not in heading.lower():
        return False
    return wanted in "\n".join(rest).lower()

96def _mean_vector(vectors: list[list[float]]) -> list[float]: 

97 """Compute the element-wise mean of a non-empty vector list. 

98 

99 Empty input returns an empty list; callers must check before any 

100 downstream dot-product so we do not leak a shape mismatch. 

101 

102 Routes through numpy so the inner loop runs in C: for the typical 

103 ``D=768``, ``N=10`` case this cuts per-call cost from ~8k Python 

104 ops to a single SIMD-backed reduction. 

105 """ 

106 if not vectors: 

107 return [] 

108 result: list[float] = np.asarray(vectors, dtype=np.float32).mean(axis=0).tolist() 

109 return result 

110 

111 

def _embedding_faithfulness_score(
    body_vec: list[float],
    source_vectors: list[list[float]],
) -> float:
    """Cosine similarity between the body and the mean source vector.

    Both sides are assumed L2-normalized (the embedder and the store
    both return normalized vectors), so cosine reduces to a dot
    product; the call still routes through :func:`cosine_sim` so a
    non-normalized vector cannot silently yield an out-of-range value.
    A negative cosine — the body pointing away from the source mean —
    is clamped to zero and treated the same as "uncorrelated" for
    threshold purposes.

    A dimension mismatch between the body vector and the source mean
    scores 0.0 rather than raising.  Production never hits this path
    (embedder and chunk vectors share one model), but stub-driven
    tests may hand in off-shape vectors, and crashing the pipeline on
    the shape-check would hide the real assertion.
    """
    from lilbee.data.store import cosine_sim

    centroid = _mean_vector(source_vectors)
    if not centroid or not body_vec:
        return 0.0
    if len(centroid) != len(body_vec):
        log.warning(
            "Body vector dim %d does not match source vector dim %d; scoring 0.0",
            len(body_vec),
            len(centroid),
        )
        return 0.0
    score = cosine_sim(body_vec, centroid)
    return score if score > 0.0 else 0.0

def check_faithfulness(
    chunks: list[SearchChunk],
    wiki_text: str,
    label: str,
    config: Config | None = None,
) -> float:
    """Score the wiki body against its source chunks; 0.0 on any failure.

    Faithfulness is a deterministic cosine-similarity score between the
    page body and the mean of its source chunk vectors.  The B3
    title/body coherence pre-check runs first as a hard gate: a garbage
    H1 scores 0.0 no matter how similar the embeddings are, so
    structurally broken pages route to drafts even when the prose
    happens to be coherent.

    ``chunks`` already carries ``.vector`` populated by LanceDB (see
    ``SearchChunk`` in ``lilbee.data.store``), so the source side needs
    no extra embedder call; only the body is embedded, once, via the
    shared services embedder.  Any embedder exception (model missing,
    network issue, invalid config) is caught and reported as 0.0 so a
    single faulty page drops to drafts instead of aborting the whole
    build.
    """
    if not _title_content_coherence(wiki_text, label):
        log.info(
            "Faithfulness title/body coherence failed for %r; scoring 0.0",
            label,
        )
        return 0.0

    vectors = [chunk.vector for chunk in chunks if chunk.vector]
    if not vectors:
        log.warning("No source vectors for %s; scoring 0.0", label)
        return 0.0

    # Embed only the body prose: strip the frontmatter + citation
    # trailer.  render_citation_block may not have run yet when this
    # score is computed (it is appended later), but
    # strip_citation_block is idempotent on missing trailers.
    body = strip_citation_block(wiki_text).strip()
    if not body:
        log.warning("Empty body for %s; scoring 0.0", label)
        return 0.0

    try:
        embedded = get_services().embedder.embed_batch([body])
    except Exception as exc:
        log.warning("Body embedding failed for %s: %s", label, exc)
        return 0.0
    if not embedded:
        return 0.0
    return _embedding_faithfulness_score(embedded[0], vectors)

197 return _embedding_faithfulness_score(body_vectors[0], source_vectors)