Coverage for src / lilbee / wiki / drafts.py: 100%

141 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-05-15 20:55 +0000

1"""Draft review surface. List, diff, accept, reject wiki drafts. 

2 

3Wiki generation routes pages to ``wiki/drafts/`` when the content 

4drift against an existing page exceeds the configured threshold or 

5when the faithfulness score falls below it. Without a review 

6surface drafts accumulate with no exit ramp, so this module exposes 

7the four operations a reviewer needs: see what is pending, diff 

8against the published version, accept (overwrite the published 

9page and re-index its chunks), or reject (delete the draft file). 

10""" 

11 

12from __future__ import annotations 

13 

14import difflib 

15import logging 

16import re 

17from dataclasses import dataclass 

18from pathlib import Path 

19from typing import Any 

20 

21from lilbee.data.store import Store 

22from lilbee.wiki.page import index_wiki_page 

23from lilbee.wiki.shared import ( 

24 PENDING_MARKER_KEYWORD_COLLISION, 

25 PENDING_MARKER_KEYWORD_PARSE, 

26 PendingKind, 

27 WikiSubdir, 

28 parse_frontmatter, 

29) 

30 

# Public review-surface API. ``PendingKind`` is re-exported from
# ``wiki.shared`` so callers can branch on ``DraftInfo.pending_kind``
# without a second import.
__all__ = [
    "AcceptResult",
    "DraftInfo",
    "PendingKind",
    "accept_draft",
    "diff_draft",
    "list_drafts",
    "reject_draft",
]

# Module-level logger, per stdlib logging convention.
log = logging.getLogger(__name__)

42 

# Leading HTML-comment marker written by drift detection, e.g.
# ``<!-- DRIFT: 42% content changed ... -->``. The captured ``pct``
# group is an integer percentage; ``_parse_drift_ratio`` divides by 100.
_DRIFT_MARKER_RE = re.compile(
    r"<!--\s*DRIFT:\s*(?P<pct>\d+)%\s*content changed[^>]*-->",
    re.IGNORECASE,
)

# Batched-generation pending markers. The per-source batched call
# writes one of these when the parser could not recover a requested
# section, or when two sources proposed the same concept slug and the
# second write lost the race. The keyword phrases live in
# ``wiki.shared`` so writer (generation) and reader (drafts) agree on
# the exact wording; this regex adds the ``<!--`` wrapper plus ``\s+``
# in place of each literal space, so the reader tolerates double-space
# variations in cached markers. Keywords carry no regex metacharacters
# so ``re.escape`` is unnecessary.
_PARSE_KEYWORD_PATTERN = PENDING_MARKER_KEYWORD_PARSE.replace(" ", r"\s+")
_COLLISION_KEYWORD_PATTERN = PENDING_MARKER_KEYWORD_COLLISION.replace(" ", r"\s+")
_PENDING_PARSE_MARKER_RE = re.compile(
    rf"<!--\s*{_PARSE_KEYWORD_PATTERN}[^>]*-->",
    re.IGNORECASE,
)
_PENDING_COLLISION_MARKER_RE = re.compile(
    rf"<!--\s*{_COLLISION_KEYWORD_PATTERN}[^>]*-->",
    re.IGNORECASE,
)

# Published wiki subdirs searched in priority order when pairing a
# draft slug with its counterpart. Summaries and synthesis come first
# because they are the subdirs most drafts originate from (drift
# detection runs on regen of an existing source or cluster page).
_PUBLISHED_SUBDIRS: tuple[str, ...] = (
    WikiSubdir.SUMMARIES,
    WikiSubdir.SYNTHESIS,
    WikiSubdir.CONCEPTS,
    WikiSubdir.ENTITIES,
)

78 

79 

@dataclass
class DraftInfo:
    """One pending draft, as surfaced by ``wiki drafts list``.

    A ``pending_kind`` of ``None`` means an ordinary drift draft;
    ``"parse"`` and ``"collision"`` mark batched-generation pending
    markers. Callers render the kind in the list view and branch on it
    when deciding how to surface the draft (a collision needs the
    winning-source context; a parse marker just needs a rerun).
    """

    slug: str
    path: Path
    drift_ratio: float | None
    faithfulness_score: float | None
    bad_title: bool
    published_path: Path | None
    mtime: float
    pending_kind: str | None = None

    @property
    def published_exists(self) -> bool:
        """True when a matching published page exists for this draft."""
        return self.published_path is not None

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a JSON-friendly dict."""
        published = None if self.published_path is None else str(self.published_path)
        return {
            "slug": self.slug,
            "path": str(self.path),
            "drift_ratio": self.drift_ratio,
            "faithfulness_score": self.faithfulness_score,
            "bad_title": self.bad_title,
            "published_path": published,
            "published_exists": self.published_exists,
            "mtime": self.mtime,
            "pending_kind": self.pending_kind,
        }

118 

119 

@dataclass
class AcceptResult:
    """Confirmation payload returned by :func:`accept_draft`.

    ``requested_slug`` is always the slug the caller asked to accept
    (for PENDING-COLLISION drafts this looks like
    ``brakes-collision-abc12345``). ``slug`` is where the content
    landed (the de-collisioned base slug, so ``brakes``). For
    non-collision drafts the two match. HTTP clients that round-trip
    accept→list-refresh can compare both fields to track the rename.
    """

    slug: str
    requested_slug: str
    moved_to: Path
    reindexed_chunks: int

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a JSON-friendly dict for HTTP/MCP/CLI responses."""
        payload: dict[str, Any] = {
            "slug": self.slug,
            "requested_slug": self.requested_slug,
            "moved_to": self.moved_to.as_posix(),
            "reindexed_chunks": self.reindexed_chunks,
        }
        return payload

145 

146 

def _draft_path(wiki_root: Path, slug: str) -> Path:
    """Return the on-disk path of the draft markdown file for *slug*."""
    return wiki_root.joinpath(WikiSubdir.DRAFTS, f"{slug}.md")

149 

150 

def _find_published(wiki_root: Path, slug: str) -> Path | None:
    """Return the first published page matching *slug*, or None.

    Checks summaries, synthesis, concepts, and entities subdirs in
    priority order so a draft regenerated from an existing summary
    page pairs with its original rather than the same slug under a
    different page type.
    """
    candidates = (wiki_root / subdir / f"{slug}.md" for subdir in _PUBLISHED_SUBDIRS)
    return next((page for page in candidates if page.is_file()), None)

164 

165 

def _parse_drift_ratio(text: str) -> float | None:
    """Extract the drift percentage from a draft's leading marker as a ratio."""
    if (match := _DRIFT_MARKER_RE.search(text)) is not None:
        return int(match.group("pct")) / 100.0
    return None

172 

173 

def _parse_pending_kind(text: str) -> str | None:
    """Classify *text* as a PENDING-PARSE, PENDING-COLLISION, or neither.

    Returns ``None`` when no pending marker is present (e.g. a plain
    drift draft). Parse markers are checked before collision markers,
    matching the original precedence, so a body containing both is
    reported as ``parse``.
    """
    # Ordered table: first matching pattern wins.
    marker_table = (
        (_PENDING_PARSE_MARKER_RE, PendingKind.PARSE),
        (_PENDING_COLLISION_MARKER_RE, PendingKind.COLLISION),
    )
    for pattern, kind in marker_table:
        if pattern.search(text):
            return kind
    return None

187 

188 

def _strip_drift_marker(text: str) -> str:
    """Remove the drift-review marker so accepted content lands clean."""
    without_marker = _DRIFT_MARKER_RE.sub("", text, count=1)
    return without_marker.lstrip()

192 

193 

def _strip_pending_markers(text: str) -> str:
    """Drop PENDING-PARSE/COLLISION markers on the way into a published page."""
    for marker_re in (_PENDING_PARSE_MARKER_RE, _PENDING_COLLISION_MARKER_RE):
        text = marker_re.sub("", text, count=1)
    return text.lstrip()

199 

200 

def _classify_and_strip_markers(text: str) -> tuple[str | None, float | None, str]:
    """Classify markers and strip them in one read of the draft text.

    Returns ``(pending_kind, drift_ratio, stripped_body)`` together so
    callers don't reparse the body once per attribute. Delegates the
    stripping to the single-purpose helpers; the markers are
    independent and none of the regexes is anchored, so the
    intermediate ``lstrip`` inside each helper cannot change which
    markers match.
    """
    kind = _parse_pending_kind(text)
    ratio = _parse_drift_ratio(text)
    body = _strip_pending_markers(_strip_drift_marker(text))
    return kind, ratio, body

215 

216 

def list_drafts(wiki_root: Path) -> list[DraftInfo]:
    """Return one ``DraftInfo`` per draft markdown file under ``drafts/``.

    Recurses so per-source draft nesting (``drafts/<source>/page.md``)
    is covered. Reads each draft's full text once, classifies any
    pending marker and drift ratio, strips the markers, then parses
    frontmatter on the stripped body (so frontmatter parsing works
    uniformly whether or not a marker shifted it down).
    """
    drafts_dir = wiki_root / WikiSubdir.DRAFTS
    if not drafts_dir.is_dir():
        return []
    infos: list[DraftInfo] = []
    for path in sorted(drafts_dir.rglob("*.md")):
        text = path.read_text(encoding="utf-8")
        pending_kind, drift, stripped = _classify_and_strip_markers(text)
        fm = parse_frontmatter(stripped)
        # as_posix() yields forward slashes on every platform — the
        # pathlib-native form of the former str() + backslash replace.
        slug = path.relative_to(drafts_dir).with_suffix("").as_posix()
        infos.append(
            DraftInfo(
                slug=slug,
                path=path,
                drift_ratio=drift,
                faithfulness_score=_coerce_float(fm.get("faithfulness_score")),
                bad_title=bool(fm.get("bad_title", False)),
                published_path=_find_published(wiki_root, slug),
                mtime=path.stat().st_mtime,
                pending_kind=pending_kind,
            )
        )
    return infos

248 

249 

def diff_draft(slug: str, wiki_root: Path) -> str:
    """Return a unified diff of the draft against its published counterpart.

    Raises :class:`FileNotFoundError` when the draft does not exist.
    When no published counterpart exists the diff shows the draft as
    all-new (baseline empty), which is useful for reviewing drafts
    that originated from a fresh low-faithfulness generation.
    """
    draft_file = _draft_path(wiki_root, slug)
    if not draft_file.is_file():
        raise FileNotFoundError(f"draft not found: {slug}")
    new_text = draft_file.read_text(encoding="utf-8")
    counterpart = _find_published(wiki_root, slug)
    old_text = "" if counterpart is None else counterpart.read_text(encoding="utf-8")
    from_label = "(new draft)" if counterpart is None else str(counterpart)
    diff_lines = difflib.unified_diff(
        old_text.splitlines(),
        new_text.splitlines(),
        fromfile=from_label,
        tofile=str(draft_file),
        lineterm="",
    )
    return "\n".join(diff_lines)

272 

273 

274_COLLISION_SUFFIX_RE = re.compile(r"-collision-[0-9a-f]{8}$") 

275 

276 

277def _base_slug_for_collision(slug: str) -> str: 

278 """Strip the ``-collision-<hash>`` suffix so accept lands on the winning slug.""" 

279 return _COLLISION_SUFFIX_RE.sub("", slug) 

280 

281 

def accept_draft(slug: str, wiki_root: Path, store: Store) -> AcceptResult:
    """Move the draft into its published subdir and re-index its chunks.

    Behavior branches on the draft's pending kind:

    - **Drift draft** (default): write the accepted body to its
      published counterpart (or ``summaries/`` when unpaired),
      re-index, delete the draft.
    - **PENDING-PARSE** (batched-generation parser could not recover
      a section): the marker has no body to accept, so the published
      side is untouched; the marker file is deleted and the user is
      told to run ``wiki build``. Returns ``reindexed_chunks=0`` with
      ``moved_to`` pointing at the deleted marker.
    - **PENDING-COLLISION** (two sources proposed the same concept
      slug): strips the ``-collision-<hash>`` suffix to find the
      winning slug, overwrites the winning page with this draft's
      body, re-indexes, deletes the collision marker.

    Raises :class:`FileNotFoundError` when the draft does not exist.
    """
    draft_file = _draft_path(wiki_root, slug)
    if not draft_file.is_file():
        raise FileNotFoundError(f"draft not found: {slug}")
    raw_text = draft_file.read_text(encoding="utf-8")
    kind = _parse_pending_kind(raw_text)

    if kind == PendingKind.PARSE:
        # Nothing to publish for a parse marker; accepting clears it.
        draft_file.unlink()
        log.info(
            "Accepted PENDING-PARSE marker %s; run `lilbee wiki build` "
            "to regenerate the missing section.",
            slug,
        )
        return AcceptResult(slug=slug, requested_slug=slug, moved_to=draft_file, reindexed_chunks=0)

    body = _strip_pending_markers(_strip_drift_marker(raw_text))

    # Collisions land on the de-suffixed winning slug; everything else
    # lands on the requested slug.
    target_slug = _base_slug_for_collision(slug) if kind == PendingKind.COLLISION else slug
    target = _find_published(wiki_root, target_slug)
    if target is None:
        # Unpaired draft: default destination is the summaries subdir.
        target = wiki_root / WikiSubdir.SUMMARIES / f"{target_slug}.md"
        log.info(
            "Draft %s has no published counterpart; accepting into %s",
            slug,
            WikiSubdir.SUMMARIES,
        )
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(body, encoding="utf-8")

    # Publish first, re-index second, delete the draft last: if the
    # re-index raises (chunker, embedder, LanceDB contention), the
    # draft stays on disk so the user can retry ``accept`` —
    # ``index_wiki_page`` is idempotent on the same ``wiki_source``
    # (``clear_table`` + re-write).
    chunk_count = _reindex_accepted_page(target, wiki_root, store)
    draft_file.unlink()
    log.info("Accepted draft %s -> %s (%d chunks indexed)", slug, target, chunk_count)
    return AcceptResult(
        slug=target_slug,
        requested_slug=slug,
        moved_to=target,
        reindexed_chunks=chunk_count,
    )

350 

351 

def reject_draft(slug: str, wiki_root: Path) -> None:
    """Delete the draft file; the published page and the index are untouched.

    Raises :class:`FileNotFoundError` when the draft does not exist.
    """
    target = _draft_path(wiki_root, slug)
    if not target.is_file():
        raise FileNotFoundError(f"draft not found: {slug}")
    target.unlink()
    log.info("Rejected draft %s", slug)

359 

360 

def _reindex_accepted_page(target: Path, wiki_root: Path, store: Store) -> int:
    """Re-index *target* via :func:`lilbee.wiki.page.index_wiki_page`.

    Returns the number of ``chunk_type="wiki"`` rows written. Routes
    through the same chunk / embed / clear-and-rewrite path as initial
    page generation, so an accepted draft is indexed identically to a
    fresh page and no bespoke accept-time code path exists.
    """
    source_id = _wiki_source_for(target, wiki_root)
    page_text = target.read_text(encoding="utf-8")
    return index_wiki_page(page_text, source_id, store)

372 

373 

374def _wiki_source_for(target: Path, wiki_root: Path) -> str: 

375 """Build the ``wiki_source`` identifier used in the chunks table. 

376 

377 Shape matches :attr:`PageTarget.wiki_source`: 

378 ``<wiki_dir>/<subdir>/<slug>.md``. 

379 """ 

380 wiki_dir_name = wiki_root.name 

381 relative = target.relative_to(wiki_root) 

382 return f"{wiki_dir_name}/{relative.as_posix()}" 

383 

384 

385def _coerce_float(value: Any) -> float | None: 

386 """Return *value* as a float, or None when conversion is not sensible.""" 

387 if value is None: 

388 return None 

389 try: 

390 return float(value) 

391 except (TypeError, ValueError): 

392 return None