Coverage for src/lilbee/wiki/page.py: 100%

134 statements  

coverage.py v7.13.4, created at 2026-05-15 20:55 +0000

1"""Single-page generation pipeline for wiki summary and synthesis pages. 

2 

3Given a label, prompt, and grounding chunks, drives the LLM call, 

4parses + verifies citations, scores faithfulness, builds the 

5frontmatter / body / citation block, and lands the page on disk via 

6:mod:`lilbee.wiki.persistence`. Also owns ``index_wiki_page``: the 

7post-write step that chunks, embeds, and stores the wiki body itself 

8so wiki content participates in retrieval. 

9""" 

10 

11from __future__ import annotations 

12 

13import logging 

14from collections.abc import Callable 

15from datetime import UTC, datetime 

16from pathlib import Path 

17from typing import cast 

18 

19from lilbee.app.services import get_services 

20from lilbee.core.config import CHUNKS_TABLE, DEFAULT_NUM_CTX, Config 

21from lilbee.data.chunk import chunk_text 

22from lilbee.data.store import ( 

23 CHUNK_TYPE_WIKI, 

24 CitationRecord, 

25 SearchChunk, 

26 Store, 

27 escape_sql_string, 

28) 

29from lilbee.providers.base import LLMProvider 

30from lilbee.retrieval.reasoning import strip_reasoning 

31from lilbee.wiki.citation import ( 

32 ParsedCitation, 

33 extract_body, 

34 parse_wiki_citations, 

35 render_citation_block, 

36 strip_citation_block, 

37) 

38from lilbee.wiki.citations import ( 

39 render_provenance, 

40 verify_citations, 

41) 

42from lilbee.wiki.persistence import ( 

43 divert_to_drafts, 

44 persist_and_finalize, 

45 subdir_from_wiki_source, 

46) 

47from lilbee.wiki.quality import check_faithfulness, content_change_ratio, diff_summary 

48from lilbee.wiki.shared import ( 

49 WIKI_CONTENT_SUBDIRS, 

50 PageTarget, 

51 WikiSubdir, 

52) 

53 

54log = logging.getLogger(__name__) 

55 

56WikiProgressCallback = Callable[[str, dict[str, object]], None] 

57"""Callback for wiki generation progress: (stage, data) -> None.""" 

58 

59# Fraction of context window reserved for chunks. The remainder leaves 

60# room for the system/user prompt template and generation output. 

61_CONTEXT_BUDGET_FRACTION = 0.75 

62 

63# Approximate characters per token for budget estimation. 4 chars/token 

64# is a widely used heuristic for English text. 

65_CHARS_PER_TOKEN = 4 
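# Worked example of the budget math (illustrative; 8192 is a hypothetical
# window size, not necessarily DEFAULT_NUM_CTX): budget_tokens =
# int(8192 * 0.75) = 6144, so budget_chars = 6144 * 4 = 24576 characters
# of chunk text.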


# Directive recognized by chat templates that support a reasoning mode
# (Qwen3, DeepSeek-R1, etc.). Wiki generation is a summarization task
# where chain-of-thought adds wall-clock cost without improving output,
# so we suppress it whenever the provider reports the capability.
_NO_THINK_DIRECTIVE = "/no_think"

# Capability string returned by llama-cpp providers for reasoning models
# (Qwen3, DeepSeek-R1). Defined locally so this module doesn't depend
# on a specific provider-layer constant name.
_CAPABILITY_THINKING = "thinking"


def build_wiki_messages(prompt: str, provider: LLMProvider, config: Config) -> list[dict[str, str]]:
    """Build the chat messages list for a wiki-gen call.

    When the provider reports the ``thinking`` capability for the active
    chat model, prepends ``/no_think`` so the chat template disables the
    reasoning mode. Otherwise the prompt passes through unchanged.
    """
    capabilities = provider.get_capabilities(config.chat_model)
    if _CAPABILITY_THINKING in capabilities:
        prompt = f"{_NO_THINK_DIRECTIVE}\n\n{prompt}"
    return [{"role": "user", "content": prompt}]
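
# Illustrative shapes (hypothetical prompt text): for a thinking-capable
# model build_wiki_messages returns
#   [{"role": "user", "content": "/no_think\n\nSummarize the manual..."}]
# and for any other model
#   [{"role": "user", "content": "Summarize the manual..."}]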


def truncate_chunks_to_budget(
    chunks: list[SearchChunk],
    config: Config,
) -> list[SearchChunk]:
    """Drop trailing chunks so the total text fits within the model's context budget.

    Uses a chars/4 heuristic for token estimation. Returns the original list
    unchanged when all chunks fit.
    """
    context_window = config.num_ctx or DEFAULT_NUM_CTX
    budget_tokens = int(context_window * _CONTEXT_BUDGET_FRACTION)
    budget_chars = budget_tokens * _CHARS_PER_TOKEN

    total_chars = 0
    kept: list[SearchChunk] = []
    for chunk in chunks:
        chunk_chars = len(chunk.chunk)
        if total_chars + chunk_chars > budget_chars and kept:
            break
        kept.append(chunk)
        total_chars += chunk_chars

    if len(kept) < len(chunks):
        log.warning(
            "Truncated chunks from %d to %d to fit context window (%d tokens)",
            len(chunks),
            len(kept),
            context_window,
        )
    return kept
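
# Illustrative truncation (hypothetical sizes against a 24576-char budget):
# three chunks of 10000 chars each keep the first two and drop the third.
# The ``and kept`` guard means a single oversized first chunk is still
# kept, so the prompt is never left with zero chunks.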


def chunks_to_text(chunks: list[SearchChunk]) -> str:
    """Format chunks as numbered text blocks for the LLM prompt."""
    parts: list[str] = []
    for i, chunk in enumerate(chunks):
        location = ""
        if chunk.page_start:
            location = f" (page {chunk.page_start})"
        elif chunk.line_start:
            location = f" (lines {chunk.line_start}-{chunk.line_end})"
        parts.append(f"[Chunk {i + 1}]{location}:\n{chunk.chunk}")
    return "\n\n".join(parts)
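
# Example of the rendered prompt text for two chunks (illustrative):
#   [Chunk 1] (page 3):
#   <chunk text>
#
#   [Chunk 2] (lines 10-24):
#   <chunk text>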


def build_frontmatter(
    config: Config,
    source_names: list[str],
    score: float,
    leaf_hash: str = "",
    chunks: list[SearchChunk] | None = None,
) -> str:
    """Build YAML frontmatter for a wiki page.

    When ``leaf_hash`` is non-empty it is written so incremental rebuild
    can skip regeneration on a subsequent sync whose chunks produce the
    same hash. When ``chunks`` is provided the frontmatter carries a
    ``provenance`` block naming the source/chunk-index pairs that fed
    the generator and the extraction method from config, so a bad page
    is auditable without re-running the pipeline.
    """
    sources_yaml = ", ".join(f'"{s}"' for s in sorted(source_names))
    hash_line = f"leaf_hash: {leaf_hash}\n" if leaf_hash else ""
    provenance_block = render_provenance(config, chunks) if chunks is not None else ""
    return (
        f"---\n"
        f"generated_by: {config.chat_model}\n"
        f"generated_at: {datetime.now(UTC).isoformat()}\n"
        f"sources: [{sources_yaml}]\n"
        f"faithfulness_score: {score:.2f}\n"
        f"{hash_line}"
        f"{provenance_block}"
        f"---\n\n"
    )
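
# Example frontmatter (illustrative values; leaf_hash and provenance lines
# appear only when their inputs are provided):
#   ---
#   generated_by: <config.chat_model>
#   generated_at: 2026-05-15T20:55:00+00:00
#   sources: ["manual.pdf", "notes.md"]
#   faithfulness_score: 0.91
#   ---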


def write_page(
    wiki_root: Path,
    subdir: str,
    slug: str,
    full_content: str,
    drift_threshold: float,
) -> Path:
    """Write page to disk with drift detection. Returns path written to.

    ``slug`` may contain forward slashes (e.g. ``cv-manual/page-0042``);
    any intermediate directories are created before writing.
    """
    page_path = wiki_root / subdir / f"{slug}.md"
    page_path.parent.mkdir(parents=True, exist_ok=True)

    if page_path.exists():
        old_content = page_path.read_text(encoding="utf-8")
        ratio = content_change_ratio(old_content, full_content)
        if ratio > drift_threshold:
            drafts_dir = wiki_root / WikiSubdir.DRAFTS
            diff_text = diff_summary(old_content, full_content)
            return divert_to_drafts(full_content, drafts_dir, slug, ratio, diff_text)

    page_path.write_text(full_content, encoding="utf-8")
    return page_path
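
# Illustrative drift check (hypothetical threshold 0.5): rewriting an
# existing page so that content_change_ratio(old, new) = 0.6 exceeds the
# threshold, so the new text is diverted to drafts/ with a diff summary
# instead of overwriting the page in place.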


def assemble_content(
    frontmatter: str,
    wiki_text: str,
    citation_block: str,
) -> str:
    """Combine frontmatter, body, and citations into the full page content."""
    full = frontmatter + wiki_text
    if citation_block:
        full += "\n\n" + citation_block
    return full


def index_wiki_page(content: str, wiki_source: str, store: Store) -> int:
    """Chunk a wiki page body, embed it, and write rows with ``chunk_type="wiki"``.

    ``wiki_source`` must follow the ``<wiki_dir>/<subdir>/<slug>.md``
    shape (see :attr:`PageTarget.wiki_source`). Three branches:

    - subdir in :data:`WIKI_CONTENT_SUBDIRS`: clear stale rows, chunk,
      embed, write. Returns the row count.
    - subdir is ``drafts/`` or ``archive/``: skip without touching the
      store. Returns 0.
    - malformed ``wiki_source`` (no subdir component): log.warning and
      return 0. Does not raise because the caller set is narrow (only
      internal wiki paths reach here) and surfacing the bad input in
      the log is sufficient triage.

    Record shape matches the markdown-ingest convention in
    ``lilbee.data.ingest``: ``content_type="text"``, all four page/line
    positions ``0`` (wiki pages are not paginated).
    """
    subdir = subdir_from_wiki_source(wiki_source)
    if subdir is None:
        log.warning("index_wiki_page: malformed wiki_source %r (no subdir)", wiki_source)
        return 0
    if subdir not in WIKI_CONTENT_SUBDIRS:
        return 0

    body = extract_body(content).strip()
    store.clear_table(
        CHUNKS_TABLE,
        f"source = '{escape_sql_string(wiki_source)}' AND chunk_type = '{CHUNK_TYPE_WIKI}'",
    )
    if not body:
        return 0

    chunks = chunk_text(body, mime_type="text/markdown", use_semantic=True)
    if not chunks:
        return 0

    vectors = get_services().embedder.embed_batch(chunks)
    records = [
        {
            "source": wiki_source,
            "content_type": "text",
            "chunk_type": CHUNK_TYPE_WIKI,
            "page_start": 0,
            "page_end": 0,
            "line_start": 0,
            "line_end": 0,
            "chunk": text,
            "chunk_index": idx,
            "vector": vector,
        }
        for idx, (text, vector) in enumerate(zip(chunks, vectors, strict=True))
    ]
    store.add_chunks(records)
    return len(records)
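
# Illustrative routing ("summaries" is a hypothetical content subdir):
#   "wiki/summaries/foo.md" -> stale wiki rows cleared, body chunked,
#                              embedded, stored; returns the row count
#   "wiki/drafts/foo.md"    -> skipped without touching the store; returns 0
#   "foo.md"                -> malformed (no subdir); warning logged, returns 0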


def generate_page(
    label: str,
    prompt: str,
    chunks: list[SearchChunk],
    citation_resolver: Callable[[list[ParsedCitation]], list[CitationRecord]],
    page_type: str,
    slug: str,
    source_names: list[str],
    provider: LLMProvider,
    store: Store,
    config: Config,
    on_progress: WikiProgressCallback | None = None,
    leaf_hash: str = "",
) -> Path | None:
    """Core generation pipeline shared by summary and synthesis pages."""

    def _emit(stage: str, **data: object) -> None:
        if on_progress is not None:
            on_progress(stage, data)

    _emit("preparing", chunks=len(chunks), source=label)

    messages = build_wiki_messages(prompt, provider, config)
    _emit("generating", source=label)
    options = config.generation_options(
        temperature=config.wiki_temperature,
        max_tokens=config.wiki_summary_max_tokens,
    )
    try:
        response = provider.chat(messages, stream=False, options=options)
        wiki_text = strip_reasoning(cast(str, response)).strip()
    except Exception as exc:
        log.warning("LLM failed to generate wiki page for %s: %s", label, exc)
        _emit("failed", error=str(exc))
        return None

    if not wiki_text:
        log.warning("LLM returned empty response for wiki page %s", label)
        _emit("failed", error="Model returned empty response")
        return None

    parsed_citations = parse_wiki_citations(wiki_text)
    verified = verify_citations(citation_resolver(parsed_citations), chunks, label, config)
    if not verified:
        log.warning("No valid citations for %s, skipping", label)
        _emit("failed", error="No valid citations found")
        return None

    _emit("faithfulness_check")
    score = check_faithfulness(chunks, wiki_text, label, config)
    threshold = config.wiki_embedding_faithfulness_threshold
    subdir = page_type if score >= threshold else WikiSubdir.DRAFTS
    if subdir == WikiSubdir.DRAFTS:
        log.info("Wiki page %s scored %.2f (< %.2f), sending to drafts", label, score, threshold)

    wiki_text = strip_citation_block(wiki_text)
    frontmatter = build_frontmatter(config, source_names, score, leaf_hash, chunks=chunks)
    citation_block = render_citation_block(verified)
    full_content = assemble_content(frontmatter, wiki_text, citation_block)

    wiki_root = config.data_root / config.wiki_dir
    target = PageTarget(
        wiki_root=wiki_root,
        subdir=subdir,
        slug=slug,
        wiki_source=f"{config.wiki_dir}/{subdir}/{slug}.md",
        page_type=page_type,
        label=label,
    )
    page_path = persist_and_finalize(full_content, target, verified, source_names, store, config)

    log.info(
        "Generated wiki page for %s -> %s (score=%.2f, citations=%d)",
        label,
        target.subdir,
        score,
        len(verified),
    )
    return page_path
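
# Illustrative call (hypothetical argument values; page_type must name a
# wiki content subdir for the page to land outside drafts/):
#   generate_page(
#       label="manual.pdf",
#       prompt=prompt,
#       chunks=chunks,
#       citation_resolver=resolver,
#       page_type="summaries",
#       slug="manual",
#       source_names=["manual.pdf"],
#       provider=provider,
#       store=store,
#       config=config,
#   )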