Coverage for src / lilbee / wiki / quality.py: 100%

80 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-05-15 20:55 +0000

"""Faithfulness scoring and drift heuristics for wiki page bodies.

Holds the deterministic body-vs-source cosine score, the title/body
coherence pre-check that gates a page on a structurally valid heading,
and the unified-diff helpers used when an existing page's content
changed by more than the configured threshold (drift detection).
"""

8 

9from __future__ import annotations 

10 

11import difflib 

12import logging 

13 

14import numpy as np 

15 

16from lilbee.app.services import get_services 

17from lilbee.core.config import Config 

18from lilbee.core.text import clean_label_for_display, is_valid_label 

19from lilbee.data.store import SearchChunk 

20from lilbee.wiki.citation import strip_citation_block 

21 

22log = logging.getLogger(__name__) 

23 

# Cap on unified-diff lines embedded in a drift-warning log message;
# diff_summary() replaces anything beyond this with a "... (N more lines)" trailer.
_MAX_DIFF_PREVIEW_LINES = 20

def content_change_ratio(old_text: str, new_text: str) -> float:
    """Return the fraction of lines that differ between two texts.

    0.0 means the texts are line-for-line identical; 1.0 means no line
    of one text could be matched against the other (total rewrite).
    """
    before = old_text.splitlines()
    after = new_text.splitlines()
    if not before and not after:
        # Two empty texts are identical by definition; also avoids a
        # zero division below.
        return 0.0
    line_count = max(len(before), len(after))
    matched = sum(
        block.size
        for block in difflib.SequenceMatcher(None, before, after).get_matching_blocks()
    )
    return (line_count - matched) / line_count

def diff_summary(old_text: str, new_text: str) -> str:
    """Render a human-readable unified diff of the two texts.

    Output is truncated to ``_MAX_DIFF_PREVIEW_LINES`` diff lines; when
    truncated, a trailer notes how many lines were omitted.
    """
    diff_lines = list(
        difflib.unified_diff(
            old_text.splitlines(),
            new_text.splitlines(),
            lineterm="",
            fromfile="old",
            tofile="new",
        )
    )
    if len(diff_lines) <= _MAX_DIFF_PREVIEW_LINES:
        return "\n".join(diff_lines)
    omitted = len(diff_lines) - _MAX_DIFF_PREVIEW_LINES
    preview = "\n".join(diff_lines[:_MAX_DIFF_PREVIEW_LINES])
    return preview + f"\n... ({omitted} more lines)"

def _title_content_coherence(wiki_text: str, label: str) -> bool:
    """Deterministic pre-check that title and body both name the concept.

    The LLM faithfulness score judges the prose against the source
    chunks but does not penalize structural noise in the title
    (bb-8b7s: ``| | designer`` passed at 0.90 because the body was
    coherent).  Three invariants are asserted here instead:

    1. The first ``# `` heading is a sanity-valid label per
       :func:`is_valid_label` — a heading like ``| | designer`` fails
       the structural-char gate even when it contains the cleaned
       display name as a substring.
    2. The cleaned display name appears in the heading
       case-insensitively, catching LLM drift where the heading names
       a different concept than requested.
    3. The body mentions the display name at least once outside the
       heading, catching the "LLM talked about something adjacent but
       never named the concept" regression.

    Returns True only when all three invariants hold.
    """
    wanted = clean_label_for_display(label).lower()
    if not wanted:
        return False

    heading: str | None = None
    rest: list[str] = []
    for line in wiki_text.splitlines():
        # Only the first "# " line counts as the heading; every other
        # line (including later headings) belongs to the body.
        if heading is None and line.startswith("# "):
            heading = line[2:].strip()
        else:
            rest.append(line)

    if heading is None or not is_valid_label(heading):
        return False
    if wanted not in heading.lower():
        return False
    return wanted in "\n".join(rest).lower()

96def _mean_vector(vectors: list[list[float]]) -> list[float]: 

97 """Compute the element-wise mean of a non-empty vector list. 

98 

99 Empty input returns an empty list; callers must check before any 

100 downstream dot-product so we do not leak a shape mismatch. 

101 

102 Routes through numpy so the inner loop runs in C: for the typical 

103 ``D=768``, ``N=10`` case this cuts per-call cost from ~8k Python 

104 ops to a single SIMD-backed reduction. 

105 """ 

106 if not vectors: 

107 return [] 

108 result: list[float] = np.asarray(vectors, dtype=np.float32).mean(axis=0).tolist() 

109 return result 

110 

111 

def _embedding_faithfulness_score(
    body_vec: list[float],
    source_vectors: list[list[float]],
) -> float:
    """Cosine similarity between the body and the mean source vector.

    Both sides are assumed L2-normalized (the embedder and the store
    both return normalized vectors), so cosine reduces to a dot
    product; the call still routes through :func:`cosine_sim` so a
    non-normalized vector cannot silently yield an out-of-range value.
    A negative cosine — the body pointing away from the source mean —
    is clamped to zero and treated the same as "uncorrelated" for
    threshold purposes.

    A dimension mismatch between the body vector and the source mean
    scores 0.0 rather than raising.  Production never hits this path
    (embedder and chunk vectors share one model), but stub-driven
    tests may hand in off-shape vectors, and crashing the pipeline on
    the shape-check would hide the real assertion.
    """
    from lilbee.data.store import cosine_sim

    centroid = _mean_vector(source_vectors)
    if not centroid or not body_vec:
        return 0.0
    if len(centroid) != len(body_vec):
        log.warning(
            "Body vector dim %d does not match source vector dim %d; scoring 0.0",
            len(body_vec),
            len(centroid),
        )
        return 0.0
    score = cosine_sim(body_vec, centroid)
    return score if score > 0.0 else 0.0

def check_faithfulness(
    chunks: list[SearchChunk],
    wiki_text: str,
    label: str,
    config: Config | None = None,
) -> float:
    """Score the wiki body against its source chunks; 0.0 on any failure.

    Faithfulness is a deterministic cosine-similarity score between the
    page body and the mean of its source chunk vectors.  The B3
    title/body coherence pre-check runs first as a hard gate: a garbage
    H1 scores 0.0 no matter how similar the embeddings are, so
    structurally broken pages route to drafts even when the prose
    happens to be coherent.

    ``chunks`` already carries ``.vector`` populated by LanceDB (see
    ``SearchChunk`` in ``lilbee.data.store``), so the source side needs
    no extra embedder call; only the body is embedded, once, via the
    shared services embedder.  Any embedder exception (model missing,
    network issue, invalid config) is caught and reported as 0.0 so a
    single faulty page drops to drafts instead of aborting the whole
    build.
    """
    if not _title_content_coherence(wiki_text, label):
        log.info(
            "Faithfulness title/body coherence failed for %r; scoring 0.0",
            label,
        )
        return 0.0

    vectors = [chunk.vector for chunk in chunks if chunk.vector]
    if not vectors:
        log.warning("No source vectors for %s; scoring 0.0", label)
        return 0.0

    # Embed only the body prose: strip the frontmatter + citation
    # trailer.  render_citation_block may not have run yet when this
    # score is computed (it is appended later), but
    # strip_citation_block is idempotent on missing trailers.
    body = strip_citation_block(wiki_text).strip()
    if not body:
        log.warning("Empty body for %s; scoring 0.0", label)
        return 0.0

    try:
        embedded = get_services().embedder.embed_batch([body])
    except Exception as exc:
        log.warning("Body embedding failed for %s: %s", label, exc)
        return 0.0
    if not embedded:
        return 0.0
    return _embedding_faithfulness_score(embedded[0], vectors)

197 return _embedding_faithfulness_score(body_vectors[0], source_vectors)