Coverage for src / lilbee / wiki / drafts.py: 100%

141 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-05-15 20:55 +0000

1"""Draft review surface. List, diff, accept, reject wiki drafts. 

2 

3Wiki generation routes pages to ``wiki/drafts/`` when the content 

4drift against an existing page exceeds the configured threshold or 

5when the faithfulness score falls below it. Without a review 

6surface drafts accumulate with no exit ramp, so this module exposes 

7the four operations a reviewer needs: see what is pending, diff 

8against the published version, accept (overwrite the published 

9page and re-index its chunks), or reject (delete the draft file). 

10""" 

11 

12from __future__ import annotations 

13 

14import difflib 

15import logging 

16import re 

17from dataclasses import dataclass 

18from pathlib import Path 

19from typing import Any 

20 

21from lilbee.data.store import Store 

22from lilbee.wiki.page import index_wiki_page 

23from lilbee.wiki.shared import ( 

24 PENDING_MARKER_KEYWORD_COLLISION, 

25 PENDING_MARKER_KEYWORD_PARSE, 

26 PendingKind, 

27 WikiSubdir, 

28 parse_frontmatter, 

29) 

30 

# Public review-surface API. ``PendingKind`` is re-exported from
# ``wiki.shared`` so callers can branch on ``DraftInfo.pending_kind``
# without a second import.
__all__ = [
    "AcceptResult",
    "DraftInfo",
    "PendingKind",
    "accept_draft",
    "diff_draft",
    "list_drafts",
    "reject_draft",
]

# Module-level logger, per stdlib logging convention.
log = logging.getLogger(__name__)

42 

# Leading HTML-comment marker written by drift detection, e.g.
# ``<!-- DRIFT: 42% content changed ... -->``. The captured ``pct``
# group is an integer percentage; ``_parse_drift_ratio`` divides by 100.
_DRIFT_MARKER_RE = re.compile(
    r"<!--\s*DRIFT:\s*(?P<pct>\d+)%\s*content changed[^>]*-->",
    re.IGNORECASE,
)

# Batched-generation pending markers. The per-source batched call
# writes one of these when the parser could not recover a requested
# section, or when two sources proposed the same concept slug and the
# second write lost the race. The keyword phrases live in
# ``wiki.shared`` so writer (generation) and reader (drafts) agree on
# the exact wording; this regex adds the ``<!--`` wrapper plus ``\s+``
# in place of each literal space, so the reader tolerates double-space
# variations in cached markers. Keywords carry no regex metacharacters
# so ``re.escape`` is unnecessary.
_PARSE_KEYWORD_PATTERN = PENDING_MARKER_KEYWORD_PARSE.replace(" ", r"\s+")
_COLLISION_KEYWORD_PATTERN = PENDING_MARKER_KEYWORD_COLLISION.replace(" ", r"\s+")
_PENDING_PARSE_MARKER_RE = re.compile(
    rf"<!--\s*{_PARSE_KEYWORD_PATTERN}[^>]*-->",
    re.IGNORECASE,
)
_PENDING_COLLISION_MARKER_RE = re.compile(
    rf"<!--\s*{_COLLISION_KEYWORD_PATTERN}[^>]*-->",
    re.IGNORECASE,
)

# Published wiki subdirs searched in priority order when pairing a
# draft slug with its counterpart. Summaries and synthesis come first
# because they are the subdirs most drafts originate from (drift
# detection runs on regen of an existing source or cluster page).
_PUBLISHED_SUBDIRS: tuple[str, ...] = (
    WikiSubdir.SUMMARIES,
    WikiSubdir.SYNTHESIS,
    WikiSubdir.CONCEPTS,
    WikiSubdir.ENTITIES,
)

78 

79 

@dataclass
class DraftInfo:
    """One pending draft, as surfaced by ``wiki drafts list``.

    A ``pending_kind`` of ``None`` means an ordinary drift draft;
    ``"parse"`` and ``"collision"`` mark batched-generation pending
    markers. Callers render the kind in the list view and branch on it
    when deciding how to surface the draft (a collision needs the
    winning-source context; a parse marker just needs a rerun).
    """

    slug: str
    path: Path
    drift_ratio: float | None
    faithfulness_score: float | None
    bad_title: bool
    published_path: Path | None
    mtime: float
    pending_kind: str | None = None

    @property
    def published_exists(self) -> bool:
        """True when a matching published page exists for this draft."""
        return self.published_path is not None

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a JSON-friendly dict."""
        published = None if self.published_path is None else str(self.published_path)
        return {
            "slug": self.slug,
            "path": str(self.path),
            "drift_ratio": self.drift_ratio,
            "faithfulness_score": self.faithfulness_score,
            "bad_title": self.bad_title,
            "published_path": published,
            "published_exists": self.published_exists,
            "mtime": self.mtime,
            "pending_kind": self.pending_kind,
        }

118 

119 

@dataclass
class AcceptResult:
    """Confirmation payload returned by :func:`accept_draft`.

    ``requested_slug`` is always the slug the caller asked to accept
    (for PENDING-COLLISION drafts this looks like
    ``brakes-collision-abc12345``). ``slug`` is where the content
    landed (the de-collisioned base slug, so ``brakes``). For
    non-collision drafts the two match. HTTP clients that round-trip
    accept→list-refresh can compare both fields to track the rename.
    """

    slug: str
    requested_slug: str
    moved_to: Path
    reindexed_chunks: int

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a JSON-friendly dict for HTTP/MCP/CLI responses."""
        payload: dict[str, Any] = {
            "slug": self.slug,
            "requested_slug": self.requested_slug,
            "moved_to": self.moved_to.as_posix(),
            "reindexed_chunks": self.reindexed_chunks,
        }
        return payload

145 

146 

def _draft_path(wiki_root: Path, slug: str) -> Path:
    """Return the on-disk path of the draft markdown file for *slug*."""
    return wiki_root.joinpath(WikiSubdir.DRAFTS, f"{slug}.md")

149 

150 

def _find_published(wiki_root: Path, slug: str) -> Path | None:
    """Return the first published page matching *slug*, or None.

    Checks summaries, synthesis, concepts, and entities subdirs in
    priority order so a draft regenerated from an existing summary
    page pairs with its original rather than the same slug under a
    different page type.
    """
    candidates = (wiki_root / subdir / f"{slug}.md" for subdir in _PUBLISHED_SUBDIRS)
    return next((page for page in candidates if page.is_file()), None)

164 

165 

def _parse_drift_ratio(text: str) -> float | None:
    """Extract the drift percentage from a draft's leading marker as a ratio."""
    if (match := _DRIFT_MARKER_RE.search(text)) is not None:
        return int(match.group("pct")) / 100.0
    return None

172 

173 

def _parse_pending_kind(text: str) -> str | None:
    """Classify *text* as a PENDING-PARSE, PENDING-COLLISION, or neither.

    Returns ``None`` when no pending marker is present (e.g. a plain
    drift draft). Parse markers are checked before collision markers,
    matching the original precedence, so a body containing both is
    reported as ``parse``.
    """
    # Ordered table: first matching pattern wins.
    marker_table = (
        (_PENDING_PARSE_MARKER_RE, PendingKind.PARSE),
        (_PENDING_COLLISION_MARKER_RE, PendingKind.COLLISION),
    )
    for pattern, kind in marker_table:
        if pattern.search(text):
            return kind
    return None

187 

188 

def _strip_drift_marker(text: str) -> str:
    """Remove the drift-review marker so accepted content lands clean."""
    without_marker = _DRIFT_MARKER_RE.sub("", text, count=1)
    return without_marker.lstrip()

192 

193 

def _strip_pending_markers(text: str) -> str:
    """Drop PENDING-PARSE/COLLISION markers on the way into a published page."""
    for marker_re in (_PENDING_PARSE_MARKER_RE, _PENDING_COLLISION_MARKER_RE):
        text = marker_re.sub("", text, count=1)
    return text.lstrip()

199 

200 

def _classify_and_strip_markers(text: str) -> tuple[str | None, float | None, str]:
    """Classify markers and strip them in one read of the draft text.

    Returns ``(pending_kind, drift_ratio, stripped_body)`` together so
    callers don't reparse the body once per attribute. Delegates the
    stripping to the single-purpose helpers; the markers are
    independent and none of the regexes is anchored, so the
    intermediate ``lstrip`` inside each helper cannot change which
    markers match.
    """
    kind = _parse_pending_kind(text)
    ratio = _parse_drift_ratio(text)
    body = _strip_pending_markers(_strip_drift_marker(text))
    return kind, ratio, body

215 

216 

def list_drafts(wiki_root: Path) -> list[DraftInfo]:
    """Return one ``DraftInfo`` per draft markdown file under ``drafts/``.

    Recurses so per-source draft nesting (``drafts/<source>/page.md``)
    is covered. Reads each draft's full text once, classifies any
    pending marker and drift ratio, strips the markers, then parses
    frontmatter on the stripped body (so frontmatter parsing works
    uniformly whether or not a marker shifted it down).
    """
    drafts_dir = wiki_root / WikiSubdir.DRAFTS
    if not drafts_dir.is_dir():
        return []
    infos: list[DraftInfo] = []
    for path in sorted(drafts_dir.rglob("*.md")):
        text = path.read_text(encoding="utf-8")
        pending_kind, drift, stripped = _classify_and_strip_markers(text)
        fm = parse_frontmatter(stripped)
        # as_posix() yields forward slashes on every platform — the
        # pathlib-native form of the former str() + backslash replace.
        slug = path.relative_to(drafts_dir).with_suffix("").as_posix()
        infos.append(
            DraftInfo(
                slug=slug,
                path=path,
                drift_ratio=drift,
                faithfulness_score=_coerce_float(fm.get("faithfulness_score")),
                bad_title=bool(fm.get("bad_title", False)),
                published_path=_find_published(wiki_root, slug),
                mtime=path.stat().st_mtime,
                pending_kind=pending_kind,
            )
        )
    return infos

248 

249 

def diff_draft(slug: str, wiki_root: Path) -> str:
    """Return a unified diff of the draft against its published counterpart.

    Raises :class:`FileNotFoundError` when the draft does not exist.
    When no published counterpart exists the diff shows the draft as
    all-new (baseline empty), which is useful for reviewing drafts
    that originated from a fresh low-faithfulness generation.
    """
    draft_file = _draft_path(wiki_root, slug)
    if not draft_file.is_file():
        raise FileNotFoundError(f"draft not found: {slug}")
    new_text = draft_file.read_text(encoding="utf-8")
    counterpart = _find_published(wiki_root, slug)
    old_text = "" if counterpart is None else counterpart.read_text(encoding="utf-8")
    from_label = "(new draft)" if counterpart is None else str(counterpart)
    diff_lines = difflib.unified_diff(
        old_text.splitlines(),
        new_text.splitlines(),
        fromfile=from_label,
        tofile=str(draft_file),
        lineterm="",
    )
    return "\n".join(diff_lines)

272 

273 

274_COLLISION_SUFFIX_RE = re.compile(r"-collision-[0-9a-f]{8}$") 

275 

276 

277def _base_slug_for_collision(slug: str) -> str: 

278 """Strip the ``-collision-<hash>`` suffix so accept lands on the winning slug.""" 

279 return _COLLISION_SUFFIX_RE.sub("", slug) 

280 

281 

def accept_draft(slug: str, wiki_root: Path, store: Store) -> AcceptResult:
    """Move the draft into its published subdir and re-index its chunks.

    Behavior branches on the draft's pending kind:

    - **Drift draft** (default): write the accepted body to its
      published counterpart (or ``summaries/`` when unpaired),
      re-index, delete the draft.
    - **PENDING-PARSE** (batched-generation parser could not recover
      a section): the marker has no body to accept, so the published
      side is untouched; the marker file is deleted and the user is
      told to run ``wiki build``. Returns ``reindexed_chunks=0`` with
      ``moved_to`` pointing at the deleted marker.
    - **PENDING-COLLISION** (two sources proposed the same concept
      slug): strips the ``-collision-<hash>`` suffix to find the
      winning slug, overwrites the winning page with this draft's
      body, re-indexes, deletes the collision marker.

    Raises :class:`FileNotFoundError` when the draft does not exist.
    """
    draft_file = _draft_path(wiki_root, slug)
    if not draft_file.is_file():
        raise FileNotFoundError(f"draft not found: {slug}")
    raw_text = draft_file.read_text(encoding="utf-8")
    kind = _parse_pending_kind(raw_text)

    if kind == PendingKind.PARSE:
        # Nothing to publish for a parse marker; accepting clears it.
        draft_file.unlink()
        log.info(
            "Accepted PENDING-PARSE marker %s; run `lilbee wiki build` "
            "to regenerate the missing section.",
            slug,
        )
        return AcceptResult(slug=slug, requested_slug=slug, moved_to=draft_file, reindexed_chunks=0)

    body = _strip_pending_markers(_strip_drift_marker(raw_text))

    # Collisions land on the de-suffixed winning slug; everything else
    # lands on the requested slug.
    target_slug = _base_slug_for_collision(slug) if kind == PendingKind.COLLISION else slug
    target = _find_published(wiki_root, target_slug)
    if target is None:
        # Unpaired draft: default destination is the summaries subdir.
        target = wiki_root / WikiSubdir.SUMMARIES / f"{target_slug}.md"
        log.info(
            "Draft %s has no published counterpart; accepting into %s",
            slug,
            WikiSubdir.SUMMARIES,
        )
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(body, encoding="utf-8")

    # Publish first, re-index second, delete the draft last: if the
    # re-index raises (chunker, embedder, LanceDB contention), the
    # draft stays on disk so the user can retry ``accept`` —
    # ``index_wiki_page`` is idempotent on the same ``wiki_source``
    # (``clear_table`` + re-write).
    chunk_count = _reindex_accepted_page(target, wiki_root, store)
    draft_file.unlink()
    log.info("Accepted draft %s -> %s (%d chunks indexed)", slug, target, chunk_count)
    return AcceptResult(
        slug=target_slug,
        requested_slug=slug,
        moved_to=target,
        reindexed_chunks=chunk_count,
    )

350 

351 

def reject_draft(slug: str, wiki_root: Path) -> None:
    """Delete the draft file; the published page and the index are untouched.

    Raises :class:`FileNotFoundError` when the draft does not exist.
    """
    target = _draft_path(wiki_root, slug)
    if not target.is_file():
        raise FileNotFoundError(f"draft not found: {slug}")
    target.unlink()
    log.info("Rejected draft %s", slug)

359 

360 

def _reindex_accepted_page(target: Path, wiki_root: Path, store: Store) -> int:
    """Re-index *target* via :func:`lilbee.wiki.page.index_wiki_page`.

    Returns the number of ``chunk_type="wiki"`` rows written. Routes
    through the same chunk / embed / clear-and-rewrite path as initial
    page generation, so an accepted draft is indexed identically to a
    fresh page and no bespoke accept-time code path exists.
    """
    source_id = _wiki_source_for(target, wiki_root)
    page_text = target.read_text(encoding="utf-8")
    return index_wiki_page(page_text, source_id, store)

372 

373 

374def _wiki_source_for(target: Path, wiki_root: Path) -> str: 

375 """Build the ``wiki_source`` identifier used in the chunks table. 

376 

377 Shape matches :attr:`PageTarget.wiki_source`: 

378 ``<wiki_dir>/<subdir>/<slug>.md``. 

379 """ 

380 wiki_dir_name = wiki_root.name 

381 relative = target.relative_to(wiki_root) 

382 return f"{wiki_dir_name}/{relative.as_posix()}" 

383 

384 

385def _coerce_float(value: Any) -> float | None: 

386 """Return *value* as a float, or None when conversion is not sensible.""" 

387 if value is None: 

388 return None 

389 try: 

390 return float(value) 

391 except (TypeError, ValueError): 

392 return None