Coverage for src / lilbee / wiki / drafts.py: 100%
141 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-05-15 20:55 +0000
"""Draft review surface. List, diff, accept, reject wiki drafts.

Wiki generation routes pages to ``wiki/drafts/`` when the content
drift against an existing page exceeds the configured threshold or
when the faithfulness score falls below its threshold. Without a
review surface drafts accumulate with no exit ramp, so this module
exposes the four operations a reviewer needs: see what is pending,
diff against the published version, accept (overwrite the published
page and re-index its chunks), or reject (delete the draft file).
"""
12from __future__ import annotations
14import difflib
15import logging
16import re
17from dataclasses import dataclass
18from pathlib import Path
19from typing import Any
21from lilbee.data.store import Store
22from lilbee.wiki.page import index_wiki_page
23from lilbee.wiki.shared import (
24 PENDING_MARKER_KEYWORD_COLLISION,
25 PENDING_MARKER_KEYWORD_PARSE,
26 PendingKind,
27 WikiSubdir,
28 parse_frontmatter,
29)
# Public surface of the review module. ``PendingKind`` is re-exported
# from ``wiki.shared`` so callers of the drafts surface get the enum
# alongside the four reviewer operations without a second import.
__all__ = [
    "AcceptResult",
    "DraftInfo",
    "PendingKind",
    "accept_draft",
    "diff_draft",
    "list_drafts",
    "reject_draft",
]

# Module-level logger; handler/level configuration is the caller's concern.
log = logging.getLogger(__name__)
# Leading HTML-comment marker found in drift drafts, shaped like
# ``<!-- DRIFT: 37% content changed ... -->`` (presumably written by
# drift detection — see module docstring). ``pct`` captures the
# integer percentage; ``[^>]*`` tolerates trailing annotation text
# inside the comment.
_DRIFT_MARKER_RE = re.compile(
    r"<!--\s*DRIFT:\s*(?P<pct>\d+)%\s*content changed[^>]*-->",
    re.IGNORECASE,
)
# Batched-generation pending markers. The per-source batched call
# writes one of these when the parser could not recover a requested
# section, or when two sources proposed the same concept slug and the
# second write lost the race. The keyword phrases live in
# ``wiki.shared`` so writer (generation) and reader (drafts) agree on
# the exact wording; this regex adds the ``<!--`` wrapper plus ``\s+``
# in place of each literal space, so the reader tolerates double-space
# variations in cached markers. Keywords carry no regex metacharacters
# so ``re.escape`` is unnecessary.
_PARSE_KEYWORD_PATTERN = PENDING_MARKER_KEYWORD_PARSE.replace(" ", r"\s+")
_COLLISION_KEYWORD_PATTERN = PENDING_MARKER_KEYWORD_COLLISION.replace(" ", r"\s+")
# Matches the PENDING-PARSE comment anywhere in the text; ``[^>]*``
# allows trailing detail text inside the comment.
_PENDING_PARSE_MARKER_RE = re.compile(
    rf"<!--\s*{_PARSE_KEYWORD_PATTERN}[^>]*-->",
    re.IGNORECASE,
)
# Same shape for the PENDING-COLLISION comment.
_PENDING_COLLISION_MARKER_RE = re.compile(
    rf"<!--\s*{_COLLISION_KEYWORD_PATTERN}[^>]*-->",
    re.IGNORECASE,
)
# Published wiki subdirs searched in priority order when pairing a
# draft slug with its counterpart. Summaries and synthesis come first
# because they are the subdirs most drafts originate from (drift
# detection runs on regen of an existing source or cluster page).
# Consumed by ``_find_published``; the ``tuple[str, ...]`` annotation
# implies WikiSubdir values are strings (presumably a StrEnum —
# confirm in ``wiki.shared``).
_PUBLISHED_SUBDIRS: tuple[str, ...] = (
    WikiSubdir.SUMMARIES,
    WikiSubdir.SYNTHESIS,
    WikiSubdir.CONCEPTS,
    WikiSubdir.ENTITIES,
)
@dataclass
class DraftInfo:
    """One pending draft as surfaced by ``wiki drafts list``.

    ``pending_kind`` is ``None`` for drift drafts and ``"parse"`` /
    ``"collision"`` for batched-generation markers; callers can show
    the kind in the list view and branch on it when presenting the
    draft (a collision wants the winning-source context, a parse
    marker only needs a rerun).
    """

    slug: str  # drafts-relative slug, forward-slash separated
    path: Path  # filesystem path of the draft markdown file
    drift_ratio: float | None  # parsed from the DRIFT marker, when present
    faithfulness_score: float | None  # from frontmatter, when present
    bad_title: bool  # frontmatter flag
    published_path: Path | None  # matching published page, when one exists
    mtime: float  # draft file modification time (epoch seconds)
    pending_kind: str | None = None  # "parse", "collision", or None (drift)

    @property
    def published_exists(self) -> bool:
        """Whether a published counterpart was found for this slug."""
        return self.published_path is not None

    def to_dict(self) -> dict[str, Any]:
        """Render the record using only JSON-friendly value types."""
        published = self.published_path
        return {
            "slug": self.slug,
            "path": str(self.path),
            "drift_ratio": self.drift_ratio,
            "faithfulness_score": self.faithfulness_score,
            "bad_title": self.bad_title,
            "published_path": None if published is None else str(published),
            "published_exists": published is not None,
            "mtime": self.mtime,
            "pending_kind": self.pending_kind,
        }
@dataclass
class AcceptResult:
    """What an ``accept_draft`` call did, returned for confirmation.

    ``requested_slug`` echoes the slug the caller asked to accept
    (for PENDING-COLLISION drafts that looks like
    ``brakes-collision-abc12345``); ``slug`` is where the content
    actually landed (the de-collisioned base, e.g. ``brakes``). The
    two match for non-collision drafts, so HTTP clients doing an
    accept -> list-refresh round trip can detect the rename by
    comparing both fields.
    """

    slug: str  # slug the accepted content landed on
    requested_slug: str  # slug the caller passed in
    moved_to: Path  # published file written (or the deleted marker path)
    reindexed_chunks: int  # number of wiki chunk rows written on re-index

    def to_dict(self) -> dict[str, Any]:
        """Serialize with JSON-friendly values for HTTP/MCP/CLI responses."""
        return dict(
            slug=self.slug,
            requested_slug=self.requested_slug,
            moved_to=self.moved_to.as_posix(),
            reindexed_chunks=self.reindexed_chunks,
        )
def _draft_path(wiki_root: Path, slug: str) -> Path:
    """Path of the markdown file for *slug* under the drafts subdir."""
    drafts_dir = wiki_root / WikiSubdir.DRAFTS
    return drafts_dir / f"{slug}.md"
def _find_published(wiki_root: Path, slug: str) -> Path | None:
    """Locate the published page paired with *slug*, or ``None``.

    Probes summaries, synthesis, concepts, then entities (the
    ``_PUBLISHED_SUBDIRS`` priority order) so a draft regenerated
    from an existing summary pairs with its original page rather
    than a same-slug page of another type.
    """
    candidates = (
        wiki_root / subdir / f"{slug}.md" for subdir in _PUBLISHED_SUBDIRS
    )
    return next((page for page in candidates if page.is_file()), None)
def _parse_drift_ratio(text: str) -> float | None:
    """Read the DRIFT marker's percentage out of *text* as a 0-1 ratio."""
    found = _DRIFT_MARKER_RE.search(text)
    return None if found is None else int(found.group("pct")) / 100.0
def _parse_pending_kind(text: str) -> str | None:
    """Classify *text* by its batched-generation pending marker, if any.

    Returns ``PendingKind.PARSE`` when a PENDING-PARSE comment appears
    anywhere in *text*, ``PendingKind.COLLISION`` for a collision
    comment (parse wins when both occur), and ``None`` for drift-only
    or marker-free drafts. NOTE(review): this is a whole-text
    ``search``, so a body quoting a marker comment verbatim would also
    classify — fine as long as generated markers only ever lead the
    file; confirm against the writer in ``wiki`` generation.
    """
    if _PENDING_PARSE_MARKER_RE.search(text) is not None:
        return PendingKind.PARSE
    if _PENDING_COLLISION_MARKER_RE.search(text) is not None:
        return PendingKind.COLLISION
    return None
def _strip_drift_marker(text: str) -> str:
    """Drop the leading drift-review comment so accepted content lands clean."""
    without_marker = _DRIFT_MARKER_RE.sub("", text, count=1)
    return without_marker.lstrip()
def _strip_pending_markers(text: str) -> str:
    """Erase PENDING-PARSE/COLLISION comments on the way into a published page."""
    for marker in (_PENDING_PARSE_MARKER_RE, _PENDING_COLLISION_MARKER_RE):
        text = marker.sub("", text, count=1)
    return text.lstrip()
def _classify_and_strip_markers(text: str) -> tuple[str | None, float | None, str]:
    """Classify a draft body and strip its markers in one call.

    Returns ``(pending_kind, drift_ratio, stripped_body)`` so a caller
    that reads the draft once does not have to re-scan it per
    attribute. Each marker regex is removed at most once
    (``count=1``), then leading whitespace is trimmed.
    """
    kind = _parse_pending_kind(text)
    ratio = _parse_drift_ratio(text)
    body = text
    for marker in (
        _PENDING_PARSE_MARKER_RE,
        _PENDING_COLLISION_MARKER_RE,
        _DRIFT_MARKER_RE,
    ):
        body = marker.sub("", body, count=1)
    return kind, ratio, body.lstrip()
def list_drafts(wiki_root: Path) -> list[DraftInfo]:
    """Return one ``DraftInfo`` per markdown file under ``drafts/``.

    Recurses so per-source nesting (``drafts/<source>/page.md``) is
    covered. Each draft is read exactly once: markers are classified
    and stripped, then frontmatter is parsed on the stripped body so
    parsing behaves the same whether or not a marker shifted it down.
    """
    drafts_dir = wiki_root / WikiSubdir.DRAFTS
    if not drafts_dir.is_dir():
        return []
    collected: list[DraftInfo] = []
    for md_file in sorted(drafts_dir.rglob("*.md")):
        raw = md_file.read_text(encoding="utf-8")
        kind, drift, body = _classify_and_strip_markers(raw)
        meta = parse_frontmatter(body)
        # Slug is the drafts-relative path, suffix dropped, posix-style.
        slug = md_file.relative_to(drafts_dir).with_suffix("").as_posix()
        collected.append(
            DraftInfo(
                slug=slug,
                path=md_file,
                drift_ratio=drift,
                faithfulness_score=_coerce_float(meta.get("faithfulness_score")),
                bad_title=bool(meta.get("bad_title", False)),
                published_path=_find_published(wiki_root, slug),
                mtime=md_file.stat().st_mtime,
                pending_kind=kind,
            )
        )
    return collected
def diff_draft(slug: str, wiki_root: Path) -> str:
    """Unified diff between the draft for *slug* and its published page.

    Raises :class:`FileNotFoundError` when the draft is missing. When
    no published counterpart exists the baseline is empty, so the
    whole draft shows as added — useful when reviewing drafts that
    originated from a fresh low-faithfulness generation.
    """
    draft_file = _draft_path(wiki_root, slug)
    if not draft_file.is_file():
        raise FileNotFoundError(f"draft not found: {slug}")
    new_text = draft_file.read_text(encoding="utf-8")
    counterpart = _find_published(wiki_root, slug)
    old_text = counterpart.read_text(encoding="utf-8") if counterpart else ""
    diff_lines = difflib.unified_diff(
        old_text.splitlines(),
        new_text.splitlines(),
        fromfile=str(counterpart) if counterpart else "(new draft)",
        tofile=str(draft_file),
        lineterm="",
    )
    return "\n".join(diff_lines)
274_COLLISION_SUFFIX_RE = re.compile(r"-collision-[0-9a-f]{8}$")
277def _base_slug_for_collision(slug: str) -> str:
278 """Strip the ``-collision-<hash>`` suffix so accept lands on the winning slug."""
279 return _COLLISION_SUFFIX_RE.sub("", slug)
def accept_draft(slug: str, wiki_root: Path, store: Store) -> AcceptResult:
    """Move the draft into its published subdir and re-index its chunks.

    Behavior branches on the draft's pending kind:

    - **Drift draft** (default): write the accepted body to its
      published counterpart (or ``summaries/`` when unpaired),
      re-index, delete the draft.
    - **PENDING-PARSE** (batched-generation parser could not recover
      a section): accepting is a no-op on the published side: the
      marker has no body to accept. The marker is deleted and the
      user is told to run ``wiki build`` to regenerate. Returns an
      ``AcceptResult`` with ``reindexed_chunks=0`` and ``moved_to``
      pointing at the deleted marker.
    - **PENDING-COLLISION** (two sources proposed the same concept
      slug): strips the ``-collision-<hash>`` suffix to find the
      winning slug, overwrites the winning page with this draft's
      body, re-indexes, deletes the collision marker.

    Sequence for drift/collision: write the published file first,
    re-index next, delete the draft last. If the re-index raises
    (chunker, embedder, LanceDB contention), the draft file stays
    on disk so the user can retry ``accept``: ``index_wiki_page``
    is idempotent on the same ``wiki_source`` (``clear_table`` +
    re-write).

    Raises :class:`FileNotFoundError` when the draft does not exist.
    """
    draft = _draft_path(wiki_root, slug)
    if not draft.is_file():
        raise FileNotFoundError(f"draft not found: {slug}")
    raw = draft.read_text(encoding="utf-8")
    # Classify on the raw text: the branch below depends on the kind.
    pending_kind = _parse_pending_kind(raw)

    if pending_kind == PendingKind.PARSE:
        # Nothing to publish — the marker is the whole draft.
        draft.unlink()
        log.info(
            "Accepted PENDING-PARSE marker %s; run `lilbee wiki build` "
            "to regenerate the missing section.",
            slug,
        )
        # moved_to points at the now-deleted marker path by design.
        return AcceptResult(slug=slug, requested_slug=slug, moved_to=draft, reindexed_chunks=0)

    # Strip every marker so the published page carries no review residue.
    clean = _strip_pending_markers(_strip_drift_marker(raw))

    # Collision drafts land on the de-collisioned base slug.
    target_slug = _base_slug_for_collision(slug) if pending_kind == PendingKind.COLLISION else slug
    published = _find_published(wiki_root, target_slug)
    if published is not None:
        target = published
    else:
        # No counterpart: default the accepted page into summaries/.
        target = wiki_root / WikiSubdir.SUMMARIES / f"{target_slug}.md"
        log.info(
            "Draft %s has no published counterpart; accepting into %s",
            slug,
            WikiSubdir.SUMMARIES,
        )
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(clean, encoding="utf-8")

    # Re-index BEFORE unlinking: if indexing raises, the draft survives
    # on disk and accept can simply be retried.
    reindexed = _reindex_accepted_page(target, wiki_root, store)
    draft.unlink()
    log.info("Accepted draft %s -> %s (%d chunks indexed)", slug, target, reindexed)
    return AcceptResult(
        slug=target_slug,
        requested_slug=slug,
        moved_to=target,
        reindexed_chunks=reindexed,
    )
def reject_draft(slug: str, wiki_root: Path) -> None:
    """Delete the draft file; the published page and the index are untouched.

    Raises :class:`FileNotFoundError` when the draft does not exist.
    """
    candidate = _draft_path(wiki_root, slug)
    if candidate.is_file():
        candidate.unlink()
        log.info("Rejected draft %s", slug)
        return
    raise FileNotFoundError(f"draft not found: {slug}")
def _reindex_accepted_page(target: Path, wiki_root: Path, store: Store) -> int:
    """Re-index *target* through :func:`lilbee.wiki.page.index_wiki_page`.

    Returns the count of ``chunk_type="wiki"`` rows written. Uses the
    same chunk / embed / clear-and-rewrite path as initial page
    generation, so an accepted draft is indexed exactly like a fresh
    page — no bespoke accept-time indexing exists.
    """
    source_id = _wiki_source_for(target, wiki_root)
    body = target.read_text(encoding="utf-8")
    return index_wiki_page(body, source_id, store)
374def _wiki_source_for(target: Path, wiki_root: Path) -> str:
375 """Build the ``wiki_source`` identifier used in the chunks table.
377 Shape matches :attr:`PageTarget.wiki_source`:
378 ``<wiki_dir>/<subdir>/<slug>.md``.
379 """
380 wiki_dir_name = wiki_root.name
381 relative = target.relative_to(wiki_root)
382 return f"{wiki_dir_name}/{relative.as_posix()}"
385def _coerce_float(value: Any) -> float | None:
386 """Return *value* as a float, or None when conversion is not sensible."""
387 if value is None:
388 return None
389 try:
390 return float(value)
391 except (TypeError, ValueError):
392 return None