Coverage for src / lilbee / wiki / page.py: 100%
134 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-05-15 20:55 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-05-15 20:55 +0000
"""Single-page generation pipeline for wiki summary and synthesis pages.

Given a label, prompt, and grounding chunks, drives the LLM call,
parses + verifies citations, scores faithfulness, builds the
frontmatter / body / citation block, and lands the page on disk via
:mod:`lilbee.wiki.persistence`. Also owns ``index_wiki_page``: the
post-write step that chunks, embeds, and stores the wiki body itself
so wiki content participates in retrieval.
"""
11from __future__ import annotations
13import logging
14from collections.abc import Callable
15from datetime import UTC, datetime
16from pathlib import Path
17from typing import cast
19from lilbee.app.services import get_services
20from lilbee.core.config import CHUNKS_TABLE, DEFAULT_NUM_CTX, Config
21from lilbee.data.chunk import chunk_text
22from lilbee.data.store import (
23 CHUNK_TYPE_WIKI,
24 CitationRecord,
25 SearchChunk,
26 Store,
27 escape_sql_string,
28)
29from lilbee.providers.base import LLMProvider
30from lilbee.retrieval.reasoning import strip_reasoning
31from lilbee.wiki.citation import (
32 ParsedCitation,
33 extract_body,
34 parse_wiki_citations,
35 render_citation_block,
36 strip_citation_block,
37)
38from lilbee.wiki.citations import (
39 render_provenance,
40 verify_citations,
41)
42from lilbee.wiki.persistence import (
43 divert_to_drafts,
44 persist_and_finalize,
45 subdir_from_wiki_source,
46)
47from lilbee.wiki.quality import check_faithfulness, content_change_ratio, diff_summary
48from lilbee.wiki.shared import (
49 WIKI_CONTENT_SUBDIRS,
50 PageTarget,
51 WikiSubdir,
52)
log = logging.getLogger(__name__)

WikiProgressCallback = Callable[[str, dict[str, object]], None]
"""Callback for wiki generation progress: (stage, data) -> None."""

# Fraction of context window reserved for chunks. The remainder leaves
# room for the system/user prompt template and generation output.
_CONTEXT_BUDGET_FRACTION = 0.75

# Approximate characters per token for budget estimation. 4 chars/token
# is a widely used heuristic for English text.
_CHARS_PER_TOKEN = 4

# Directive recognized by chat templates that support a reasoning mode
# (Qwen3, DeepSeek-R1, etc.). Wiki generation is a summarization task
# where chain-of-thought adds wall-clock cost without improving output,
# so we suppress it whenever the provider reports the capability.
_NO_THINK_DIRECTIVE = "/no_think"

# Capability string returned by llama-cpp providers for reasoning models
# (Qwen3, DeepSeek-R1). Defined locally so wiki.generation doesn't
# depend on a specific provider-layer constant name.
_CAPABILITY_THINKING = "thinking"
def build_wiki_messages(prompt: str, provider: LLMProvider, config: Config) -> list[dict[str, str]]:
    """Assemble the chat message list for a wiki-generation request.

    When the provider reports the ``thinking`` capability for the active
    chat model, the prompt is prefixed with ``/no_think`` so the chat
    template disables the reasoning mode; otherwise the prompt is used
    verbatim.
    """
    model_caps = provider.get_capabilities(config.chat_model)
    user_content = (
        f"{_NO_THINK_DIRECTIVE}\n\n{prompt}"
        if _CAPABILITY_THINKING in model_caps
        else prompt
    )
    return [{"role": "user", "content": user_content}]
def truncate_chunks_to_budget(
    chunks: list[SearchChunk],
    config: Config,
) -> list[SearchChunk]:
    """Drop trailing chunks so the total text fits within the model's context budget.

    Uses a chars/4 heuristic for token estimation. Returns the original list
    unchanged when all chunks fit.
    """
    window = config.num_ctx or DEFAULT_NUM_CTX
    char_budget = int(window * _CONTEXT_BUDGET_FRACTION) * _CHARS_PER_TOKEN

    kept: list[SearchChunk] = []
    used_chars = 0
    for candidate in chunks:
        size = len(candidate.chunk)
        # Always keep at least the first chunk, even if it alone
        # exceeds the budget; otherwise stop at the first overflow.
        if kept and used_chars + size > char_budget:
            break
        kept.append(candidate)
        used_chars += size

    if len(kept) != len(chunks):
        log.warning(
            "Truncated chunks from %d to %d to fit context window (%d tokens)",
            len(chunks),
            len(kept),
            window,
        )
    return kept
124def chunks_to_text(chunks: list[SearchChunk]) -> str:
125 """Format chunks as numbered text blocks for the LLM prompt."""
126 parts: list[str] = []
127 for i, chunk in enumerate(chunks):
128 location = ""
129 if chunk.page_start:
130 location = f" (page {chunk.page_start})"
131 elif chunk.line_start:
132 location = f" (lines {chunk.line_start}-{chunk.line_end})"
133 parts.append(f"[Chunk {i + 1}]{location}:\n{chunk.chunk}")
134 return "\n\n".join(parts)
137def build_frontmatter(
138 config: Config,
139 source_names: list[str],
140 score: float,
141 leaf_hash: str = "",
142 chunks: list[SearchChunk] | None = None,
143) -> str:
144 """Build YAML frontmatter for a wiki page.
146 When ``leaf_hash`` is non-empty it is written so incremental rebuild
147 can skip regeneration on a subsequent sync whose chunks produce the
148 same hash. When ``chunks`` is provided the frontmatter carries a
149 ``provenance`` block naming the source/chunk-index pairs that fed
150 the generator and the extraction method from config, so a bad page
151 is auditable without re-running the pipeline.
152 """
153 sources_yaml = ", ".join(f'"{s}"' for s in sorted(source_names))
154 hash_line = f"leaf_hash: {leaf_hash}\n" if leaf_hash else ""
155 provenance_block = render_provenance(config, chunks) if chunks is not None else ""
156 return (
157 f"---\n"
158 f"generated_by: {config.chat_model}\n"
159 f"generated_at: {datetime.now(UTC).isoformat()}\n"
160 f"sources: [{sources_yaml}]\n"
161 f"faithfulness_score: {score:.2f}\n"
162 f"{hash_line}"
163 f"{provenance_block}"
164 f"---\n\n"
165 )
def write_page(
    wiki_root: Path,
    subdir: str,
    slug: str,
    full_content: str,
    drift_threshold: float,
) -> Path:
    """Write page to disk with drift detection. Returns path written to.

    ``slug`` may contain forward slashes (e.g. ``cv-manual/page-0042``);
    any intermediate directories are created before writing.
    """
    destination = wiki_root / subdir / f"{slug}.md"
    destination.parent.mkdir(parents=True, exist_ok=True)

    if destination.exists():
        previous = destination.read_text(encoding="utf-8")
        change = content_change_ratio(previous, full_content)
        # Too much churn relative to the existing page: park the new
        # version in drafts for review instead of clobbering it.
        if change > drift_threshold:
            return divert_to_drafts(
                full_content,
                wiki_root / WikiSubdir.DRAFTS,
                slug,
                change,
                diff_summary(previous, full_content),
            )

    destination.write_text(full_content, encoding="utf-8")
    return destination
def assemble_content(
    frontmatter: str,
    wiki_text: str,
    citation_block: str,
) -> str:
    """Combine frontmatter, body, and citations into the full page content."""
    if not citation_block:
        return frontmatter + wiki_text
    return f"{frontmatter}{wiki_text}\n\n{citation_block}"
def index_wiki_page(content: str, wiki_source: str, store: Store) -> int:
    """Chunk a wiki page body, embed it, and write rows with ``chunk_type="wiki"``.

    ``wiki_source`` must follow the ``<wiki_dir>/<subdir>/<slug>.md``
    shape (see :attr:`PageTarget.wiki_source`). Three branches:

    - subdir in :data:`WIKI_CONTENT_SUBDIRS`: clear stale rows, chunk,
      embed, write. Returns the row count.
    - subdir is ``drafts/`` or ``archive/``: skip without touching the
      store. Returns 0.
    - malformed ``wiki_source`` (no subdir component): log.warning and
      return 0. Does not raise because the caller set is narrow (only
      internal wiki paths reach here) and surfacing the bad input in
      the log is sufficient triage.

    Record shape matches the markdown-ingest convention in
    ``lilbee.data.ingest``: ``content_type="text"``, all four page/line
    positions ``0`` (wiki pages are not paginated).
    """
    subdir = subdir_from_wiki_source(wiki_source)
    if subdir is None:
        log.warning("index_wiki_page: malformed wiki_source %r (no subdir)", wiki_source)
        return 0
    if subdir not in WIKI_CONTENT_SUBDIRS:
        # drafts/ and archive/ pages never participate in retrieval.
        return 0

    body = extract_body(content).strip()
    # Clear stale rows first so a now-empty page leaves nothing behind.
    store.clear_table(
        CHUNKS_TABLE,
        f"source = '{escape_sql_string(wiki_source)}' AND chunk_type = '{CHUNK_TYPE_WIKI}'",
    )
    if not body:
        return 0

    pieces = chunk_text(body, mime_type="text/markdown", use_semantic=True)
    if not pieces:
        return 0

    embeddings = get_services().embedder.embed_batch(pieces)
    rows: list[dict[str, object]] = []
    for position, (piece, embedding) in enumerate(zip(pieces, embeddings, strict=True)):
        rows.append(
            {
                "source": wiki_source,
                "content_type": "text",
                "chunk_type": CHUNK_TYPE_WIKI,
                "page_start": 0,
                "page_end": 0,
                "line_start": 0,
                "line_end": 0,
                "chunk": piece,
                "chunk_index": position,
                "vector": embedding,
            }
        )
    store.add_chunks(rows)
    return len(rows)
def generate_page(
    label: str,
    prompt: str,
    chunks: list[SearchChunk],
    citation_resolver: Callable[[list[ParsedCitation]], list[CitationRecord]],
    page_type: str,
    slug: str,
    source_names: list[str],
    provider: LLMProvider,
    store: Store,
    config: Config,
    on_progress: WikiProgressCallback | None = None,
    leaf_hash: str = "",
) -> Path | None:
    """Core generation pipeline shared by summary and synthesis pages.

    Stages (each surfaced through ``on_progress`` when provided):
    build the chat messages, call the LLM, parse and verify citations
    against ``chunks``, score faithfulness, assemble the
    frontmatter/body/citation block, then persist the page via
    :func:`persist_and_finalize`.

    Args:
        label: Human-readable identifier used in logs and progress events.
        prompt: Fully rendered prompt text for the chat model.
        chunks: Grounding chunks the generated text is verified against.
        citation_resolver: Maps parsed citations to citation records.
        page_type: Content subdir used when the page passes the
            faithfulness threshold.
        slug: Output filename stem; may contain forward slashes.
        source_names: Source identifiers recorded in the frontmatter.
        provider: LLM provider used for the chat call.
        store: Chunk store handed to the persistence step.
        config: Active configuration (model, thresholds, paths).
        on_progress: Optional ``(stage, data)`` progress callback.
        leaf_hash: Optional hash written to frontmatter so incremental
            rebuilds can skip regeneration of unchanged pages.

    Returns:
        Path of the written page, or ``None`` when generation failed
        (LLM error, empty response, or no verified citations).
    """

    def _emit(stage: str, **data: object) -> None:
        # Progress events are best-effort; no-op when no callback is set.
        if on_progress is not None:
            on_progress(stage, data)

    _emit("preparing", chunks=len(chunks), source=label)

    messages = build_wiki_messages(prompt, provider, config)
    _emit("generating", source=label)
    options = config.generation_options(
        temperature=config.wiki_temperature,
        max_tokens=config.wiki_summary_max_tokens,
    )
    try:
        response = provider.chat(messages, stream=False, options=options)
        # Reasoning models may still wrap output in think tags; strip them.
        wiki_text = strip_reasoning(cast(str, response)).strip()
    except Exception as exc:
        # Broad catch is deliberate: one failed page must not abort a
        # whole wiki build. The failure is logged and surfaced as an event.
        log.warning("LLM failed to generate wiki page for %s: %s", label, exc)
        _emit("failed", error=str(exc))
        return None

    if not wiki_text:
        log.warning("LLM returned empty response for wiki page %s", label)
        _emit("failed", error="Model returned empty response")
        return None

    parsed_citations = parse_wiki_citations(wiki_text)
    verified = verify_citations(citation_resolver(parsed_citations), chunks, label, config)
    if not verified:
        # An uncited page cannot be audited, so it is dropped entirely.
        log.warning("No valid citations for %s, skipping", label)
        _emit("failed", error="No valid citations found")
        return None

    _emit("faithfulness_check")
    score = check_faithfulness(chunks, wiki_text, label, config)
    threshold = config.wiki_embedding_faithfulness_threshold
    # Low-scoring pages are still written, but parked under drafts/ for
    # human review rather than published to the content subdir.
    subdir = page_type if score >= threshold else WikiSubdir.DRAFTS
    if subdir == WikiSubdir.DRAFTS:
        log.info("Wiki page %s scored %.2f (< %.2f), sending to drafts", label, score, threshold)

    # The model's own citation block is discarded; a canonical block is
    # re-rendered from the verified citations instead.
    wiki_text = strip_citation_block(wiki_text)
    frontmatter = build_frontmatter(config, source_names, score, leaf_hash, chunks=chunks)
    citation_block = render_citation_block(verified)
    full_content = assemble_content(frontmatter, wiki_text, citation_block)

    wiki_root = config.data_root / config.wiki_dir
    target = PageTarget(
        wiki_root=wiki_root,
        subdir=subdir,
        slug=slug,
        wiki_source=f"{config.wiki_dir}/{subdir}/{slug}.md",
        page_type=page_type,
        label=label,
    )
    page_path = persist_and_finalize(full_content, target, verified, source_names, store, config)

    log.info(
        "Generated wiki page for %s -> %s (score=%.2f, citations=%d)",
        label,
        target.subdir,
        score,
        len(verified),
    )
    return page_path