Coverage for src/lilbee/wiki/page.py: 100%

134 statements  

coverage.py v7.13.4, created at 2026-05-15 20:55 +0000

1"""Single-page generation pipeline for wiki summary and synthesis pages. 

2 

3Given a label, prompt, and grounding chunks, drives the LLM call, 

4parses + verifies citations, scores faithfulness, builds the 

5frontmatter / body / citation block, and lands the page on disk via 

6:mod:`lilbee.wiki.persistence`. Also owns ``index_wiki_page``: the 

7post-write step that chunks, embeds, and stores the wiki body itself 

8so wiki content participates in retrieval. 

9""" 

10 

11from __future__ import annotations 

12 

13import logging 

14from collections.abc import Callable 

15from datetime import UTC, datetime 

16from pathlib import Path 

17from typing import cast 

18 

19from lilbee.app.services import get_services 

20from lilbee.core.config import CHUNKS_TABLE, DEFAULT_NUM_CTX, Config 

21from lilbee.data.chunk import chunk_text 

22from lilbee.data.store import ( 

23 CHUNK_TYPE_WIKI, 

24 CitationRecord, 

25 SearchChunk, 

26 Store, 

27 escape_sql_string, 

28) 

29from lilbee.providers.base import LLMProvider 

30from lilbee.retrieval.reasoning import strip_reasoning 

31from lilbee.wiki.citation import ( 

32 ParsedCitation, 

33 extract_body, 

34 parse_wiki_citations, 

35 render_citation_block, 

36 strip_citation_block, 

37) 

38from lilbee.wiki.citations import ( 

39 render_provenance, 

40 verify_citations, 

41) 

42from lilbee.wiki.persistence import ( 

43 divert_to_drafts, 

44 persist_and_finalize, 

45 subdir_from_wiki_source, 

46) 

47from lilbee.wiki.quality import check_faithfulness, content_change_ratio, diff_summary 

48from lilbee.wiki.shared import ( 

49 WIKI_CONTENT_SUBDIRS, 

50 PageTarget, 

51 WikiSubdir, 

52) 

53 

54log = logging.getLogger(__name__) 

55 

56WikiProgressCallback = Callable[[str, dict[str, object]], None] 

57"""Callback for wiki generation progress: (stage, data) -> None.""" 

58 

59# Fraction of context window reserved for chunks. The remainder leaves 

60# room for the system/user prompt template and generation output. 

61_CONTEXT_BUDGET_FRACTION = 0.75 

62 

63# Approximate characters per token for budget estimation. 4 chars/token 

64# is a widely used heuristic for English text. 

65_CHARS_PER_TOKEN = 4 
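# Worked example of the budget math (illustrative; 8192 is a hypothetical
# window size, not necessarily DEFAULT_NUM_CTX): budget_tokens =
# int(8192 * 0.75) = 6144, so budget_chars = 6144 * 4 = 24576 characters
# of chunk text.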


# Directive recognized by chat templates that support a reasoning mode
# (Qwen3, DeepSeek-R1, etc.). Wiki generation is a summarization task
# where chain-of-thought adds wall-clock cost without improving output,
# so we suppress it whenever the provider reports the capability.
_NO_THINK_DIRECTIVE = "/no_think"

# Capability string returned by llama-cpp providers for reasoning models
# (Qwen3, DeepSeek-R1). Defined locally so this module doesn't depend
# on a specific provider-layer constant name.
_CAPABILITY_THINKING = "thinking"


def build_wiki_messages(prompt: str, provider: LLMProvider, config: Config) -> list[dict[str, str]]:
    """Build the chat messages list for a wiki-gen call.

    When the provider reports the ``thinking`` capability for the active
    chat model, prepends ``/no_think`` so the chat template disables the
    reasoning mode. Otherwise the prompt passes through unchanged.
    """
    capabilities = provider.get_capabilities(config.chat_model)
    if _CAPABILITY_THINKING in capabilities:
        prompt = f"{_NO_THINK_DIRECTIVE}\n\n{prompt}"
    return [{"role": "user", "content": prompt}]
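
# Illustrative shapes (hypothetical prompt text): for a thinking-capable
# model build_wiki_messages returns
#   [{"role": "user", "content": "/no_think\n\nSummarize the manual..."}]
# and for any other model
#   [{"role": "user", "content": "Summarize the manual..."}]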


def truncate_chunks_to_budget(
    chunks: list[SearchChunk],
    config: Config,
) -> list[SearchChunk]:
    """Drop trailing chunks so the total text fits within the model's context budget.

    Uses a chars/4 heuristic for token estimation. Returns the original list
    unchanged when all chunks fit.
    """
    context_window = config.num_ctx or DEFAULT_NUM_CTX
    budget_tokens = int(context_window * _CONTEXT_BUDGET_FRACTION)
    budget_chars = budget_tokens * _CHARS_PER_TOKEN

    total_chars = 0
    kept: list[SearchChunk] = []
    for chunk in chunks:
        chunk_chars = len(chunk.chunk)
        if total_chars + chunk_chars > budget_chars and kept:
            break
        kept.append(chunk)
        total_chars += chunk_chars

    if len(kept) < len(chunks):
        log.warning(
            "Truncated chunks from %d to %d to fit context window (%d tokens)",
            len(chunks),
            len(kept),
            context_window,
        )
    return kept
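
# Illustrative truncation (hypothetical sizes against a 24576-char budget):
# three chunks of 10000 chars each keep the first two and drop the third.
# The ``and kept`` guard means a single oversized first chunk is still
# kept, so the prompt is never left with zero chunks.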


def chunks_to_text(chunks: list[SearchChunk]) -> str:
    """Format chunks as numbered text blocks for the LLM prompt."""
    parts: list[str] = []
    for i, chunk in enumerate(chunks):
        location = ""
        if chunk.page_start:
            location = f" (page {chunk.page_start})"
        elif chunk.line_start:
            location = f" (lines {chunk.line_start}-{chunk.line_end})"
        parts.append(f"[Chunk {i + 1}]{location}:\n{chunk.chunk}")
    return "\n\n".join(parts)
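
# Example of the rendered prompt text for two chunks (illustrative):
#   [Chunk 1] (page 3):
#   <chunk text>
#
#   [Chunk 2] (lines 10-24):
#   <chunk text>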


def build_frontmatter(
    config: Config,
    source_names: list[str],
    score: float,
    leaf_hash: str = "",
    chunks: list[SearchChunk] | None = None,
) -> str:
    """Build YAML frontmatter for a wiki page.

    When ``leaf_hash`` is non-empty it is written so incremental rebuild
    can skip regeneration on a subsequent sync whose chunks produce the
    same hash. When ``chunks`` is provided the frontmatter carries a
    ``provenance`` block naming the source/chunk-index pairs that fed
    the generator and the extraction method from config, so a bad page
    is auditable without re-running the pipeline.
    """
    sources_yaml = ", ".join(f'"{s}"' for s in sorted(source_names))
    hash_line = f"leaf_hash: {leaf_hash}\n" if leaf_hash else ""
    provenance_block = render_provenance(config, chunks) if chunks is not None else ""
    return (
        f"---\n"
        f"generated_by: {config.chat_model}\n"
        f"generated_at: {datetime.now(UTC).isoformat()}\n"
        f"sources: [{sources_yaml}]\n"
        f"faithfulness_score: {score:.2f}\n"
        f"{hash_line}"
        f"{provenance_block}"
        f"---\n\n"
    )
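
# Example frontmatter (illustrative values; leaf_hash and provenance lines
# appear only when their inputs are provided):
#   ---
#   generated_by: <config.chat_model>
#   generated_at: 2026-05-15T20:55:00+00:00
#   sources: ["manual.pdf", "notes.md"]
#   faithfulness_score: 0.91
#   ---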


def write_page(
    wiki_root: Path,
    subdir: str,
    slug: str,
    full_content: str,
    drift_threshold: float,
) -> Path:
    """Write page to disk with drift detection. Returns path written to.

    ``slug`` may contain forward slashes (e.g. ``cv-manual/page-0042``);
    any intermediate directories are created before writing.
    """
    page_path = wiki_root / subdir / f"{slug}.md"
    page_path.parent.mkdir(parents=True, exist_ok=True)

    if page_path.exists():
        old_content = page_path.read_text(encoding="utf-8")
        ratio = content_change_ratio(old_content, full_content)
        if ratio > drift_threshold:
            drafts_dir = wiki_root / WikiSubdir.DRAFTS
            diff_text = diff_summary(old_content, full_content)
            return divert_to_drafts(full_content, drafts_dir, slug, ratio, diff_text)

    page_path.write_text(full_content, encoding="utf-8")
    return page_path
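
# Illustrative drift check (hypothetical threshold 0.5): rewriting an
# existing page so that content_change_ratio(old, new) = 0.6 exceeds the
# threshold, so the new text is diverted to drafts/ with a diff summary
# instead of overwriting the page in place.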


def assemble_content(
    frontmatter: str,
    wiki_text: str,
    citation_block: str,
) -> str:
    """Combine frontmatter, body, and citations into the full page content."""
    full = frontmatter + wiki_text
    if citation_block:
        full += "\n\n" + citation_block
    return full


def index_wiki_page(content: str, wiki_source: str, store: Store) -> int:
    """Chunk a wiki page body, embed it, and write rows with ``chunk_type="wiki"``.

    ``wiki_source`` must follow the ``<wiki_dir>/<subdir>/<slug>.md``
    shape (see :attr:`PageTarget.wiki_source`). Three branches:

    - subdir in :data:`WIKI_CONTENT_SUBDIRS`: clear stale rows, chunk,
      embed, write. Returns the row count.
    - subdir is ``drafts/`` or ``archive/``: skip without touching the
      store. Returns 0.
    - malformed ``wiki_source`` (no subdir component): log.warning and
      return 0. Does not raise because the caller set is narrow (only
      internal wiki paths reach here) and surfacing the bad input in
      the log is sufficient triage.

    Record shape matches the markdown-ingest convention in
    ``lilbee.data.ingest``: ``content_type="text"``, all four page/line
    positions ``0`` (wiki pages are not paginated).
    """
    subdir = subdir_from_wiki_source(wiki_source)
    if subdir is None:
        log.warning("index_wiki_page: malformed wiki_source %r (no subdir)", wiki_source)
        return 0
    if subdir not in WIKI_CONTENT_SUBDIRS:
        return 0

    body = extract_body(content).strip()
    store.clear_table(
        CHUNKS_TABLE,
        f"source = '{escape_sql_string(wiki_source)}' AND chunk_type = '{CHUNK_TYPE_WIKI}'",
    )
    if not body:
        return 0

    chunks = chunk_text(body, mime_type="text/markdown", use_semantic=True)
    if not chunks:
        return 0

    vectors = get_services().embedder.embed_batch(chunks)
    records = [
        {
            "source": wiki_source,
            "content_type": "text",
            "chunk_type": CHUNK_TYPE_WIKI,
            "page_start": 0,
            "page_end": 0,
            "line_start": 0,
            "line_end": 0,
            "chunk": text,
            "chunk_index": idx,
            "vector": vector,
        }
        for idx, (text, vector) in enumerate(zip(chunks, vectors, strict=True))
    ]
    store.add_chunks(records)
    return len(records)
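
# Illustrative routing ("summaries" is a hypothetical content subdir):
#   "wiki/summaries/foo.md" -> stale wiki rows cleared, body chunked,
#                              embedded, stored; returns the row count
#   "wiki/drafts/foo.md"    -> skipped without touching the store; returns 0
#   "foo.md"                -> malformed (no subdir); warning logged, returns 0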


def generate_page(
    label: str,
    prompt: str,
    chunks: list[SearchChunk],
    citation_resolver: Callable[[list[ParsedCitation]], list[CitationRecord]],
    page_type: str,
    slug: str,
    source_names: list[str],
    provider: LLMProvider,
    store: Store,
    config: Config,
    on_progress: WikiProgressCallback | None = None,
    leaf_hash: str = "",
) -> Path | None:
    """Core generation pipeline shared by summary and synthesis pages."""

    def _emit(stage: str, **data: object) -> None:
        if on_progress is not None:
            on_progress(stage, data)

    _emit("preparing", chunks=len(chunks), source=label)

    messages = build_wiki_messages(prompt, provider, config)
    _emit("generating", source=label)
    options = config.generation_options(
        temperature=config.wiki_temperature,
        max_tokens=config.wiki_summary_max_tokens,
    )
    try:
        response = provider.chat(messages, stream=False, options=options)
        wiki_text = strip_reasoning(cast(str, response)).strip()
    except Exception as exc:
        log.warning("LLM failed to generate wiki page for %s: %s", label, exc)
        _emit("failed", error=str(exc))
        return None

    if not wiki_text:
        log.warning("LLM returned empty response for wiki page %s", label)
        _emit("failed", error="Model returned empty response")
        return None

    parsed_citations = parse_wiki_citations(wiki_text)
    verified = verify_citations(citation_resolver(parsed_citations), chunks, label, config)
    if not verified:
        log.warning("No valid citations for %s, skipping", label)
        _emit("failed", error="No valid citations found")
        return None

    _emit("faithfulness_check")
    score = check_faithfulness(chunks, wiki_text, label, config)
    threshold = config.wiki_embedding_faithfulness_threshold
    subdir = page_type if score >= threshold else WikiSubdir.DRAFTS
    if subdir == WikiSubdir.DRAFTS:
        log.info("Wiki page %s scored %.2f (< %.2f), sending to drafts", label, score, threshold)

    wiki_text = strip_citation_block(wiki_text)
    frontmatter = build_frontmatter(config, source_names, score, leaf_hash, chunks=chunks)
    citation_block = render_citation_block(verified)
    full_content = assemble_content(frontmatter, wiki_text, citation_block)

    wiki_root = config.data_root / config.wiki_dir
    target = PageTarget(
        wiki_root=wiki_root,
        subdir=subdir,
        slug=slug,
        wiki_source=f"{config.wiki_dir}/{subdir}/{slug}.md",
        page_type=page_type,
        label=label,
    )
    page_path = persist_and_finalize(full_content, target, verified, source_names, store, config)

    log.info(
        "Generated wiki page for %s -> %s (score=%.2f, citations=%d)",
        label,
        target.subdir,
        score,
        len(verified),
    )
    return page_path
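
# Illustrative call (hypothetical argument values; page_type must name a
# wiki content subdir for the page to land outside drafts/):
#   generate_page(
#       label="manual.pdf",
#       prompt=prompt,
#       chunks=chunks,
#       citation_resolver=resolver,
#       page_type="summaries",
#       slug="manual",
#       source_names=["manual.pdf"],
#       provider=provider,
#       store=store,
#       config=config,
#   )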