Coverage for src / lilbee / wiki / lint.py: 100%
144 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-05-15 20:55 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-05-15 20:55 +0000
1"""Lint wiki pages for citation staleness, missing sources, and unmarked claims.
3Two modes:
4- lightweight: runs automatically after sync, checks only pages whose sources changed
5- full: manual ``lilbee wiki lint``, checks all wiki pages
6"""
8from __future__ import annotations
10import logging
11from dataclasses import dataclass, field
12from enum import Enum
13from pathlib import Path
15from lilbee.core.config import Config, cfg
16from lilbee.core.security import validate_path_within
17from lilbee.data.ingest import file_hash
18from lilbee.data.store import CitationRecord, Store
19from lilbee.wiki.citation import (
20 CitationStatus,
21 find_unmarked_claims,
22 verify_citation,
23)
24from lilbee.wiki.grammar import WIKI_LINK_RE
25from lilbee.wiki.index import append_wiki_log
26from lilbee.wiki.shared import (
27 WIKI_CONTENT_SUBDIRS,
28 WikiLogAction,
29 WikiSubdir,
30 parse_frontmatter,
31)
33_ORPHAN_CANDIDATE_SUBDIRS: tuple[str, ...] = (WikiSubdir.CONCEPTS, WikiSubdir.ENTITIES)
35log = logging.getLogger(__name__)
class IssueSeverity(Enum):
    """Severity attached to a lint finding."""

    WARNING = "warning"
    ERROR = "error"
class IssueType(Enum):
    """Machine-readable category for a lint finding; prune filters on these."""

    PATH_TRAVERSAL = "path_traversal"
    SOURCE_MISSING = "source_missing"
    STALE_HASH = "stale_hash"
    EXCERPT_MISSING = "excerpt_missing"
    MODEL_CHANGED = "model_changed"
    UNMARKED_CLAIM = "unmarked_claim"
    ORPHAN = "orphan"
@dataclass(frozen=True)
class LintIssue:
    """One lint finding attached to a single wiki page."""

    # Wiki-relative page identifier, e.g. "wiki/summaries/doc.md".
    wiki_source: str
    severity: IssueSeverity
    message: str
    # None for findings that predate issue classification.
    issue_type: IssueType | None = None

    def to_dict(self) -> dict[str, str]:
        """Serialize to a plain dict suitable for JSON output."""
        type_value = "" if self.issue_type is None else self.issue_type.value
        return {
            "wiki_source": self.wiki_source,
            "severity": self.severity.value,
            "message": self.message,
            "issue_type": type_value,
        }
@dataclass
class LintReport:
    """Aggregated results from linting one or more wiki pages."""

    issues: list[LintIssue] = field(default_factory=list)

    @property
    def error_count(self) -> int:
        """Number of error-severity issues collected so far."""
        # bools sum as 0/1, so this counts matching issues.
        return sum(issue.severity == IssueSeverity.ERROR for issue in self.issues)

    @property
    def warning_count(self) -> int:
        """Number of warning-severity issues collected so far."""
        return sum(issue.severity == IssueSeverity.WARNING for issue in self.issues)
def _lint_citation(
    rec: CitationRecord,
    documents_dir: Path,
) -> LintIssue | None:
    """Validate one citation record against the files on disk.

    Checks run in order: path containment, source existence, content
    hash, excerpt presence. Returns the first failing check as a
    LintIssue, or None when the citation is still valid.
    """
    filename = rec["source_filename"]
    page = rec["wiki_source"]
    source_path = documents_dir / filename

    # Refuse records whose filename would resolve outside documents_dir.
    try:
        validate_path_within(source_path, documents_dir)
    except ValueError:
        return LintIssue(
            wiki_source=page,
            severity=IssueSeverity.ERROR,
            message=f"Source path escapes documents dir: {filename}",
            issue_type=IssueType.PATH_TRAVERSAL,
        )

    if not source_path.exists():
        return LintIssue(
            wiki_source=page,
            severity=IssueSeverity.ERROR,
            message=f"Source deleted: {filename}",
            issue_type=IssueType.SOURCE_MISSING,
        )

    # Content changed since the citation was recorded.
    if file_hash(source_path) != rec["source_hash"]:
        return LintIssue(
            wiki_source=page,
            severity=IssueSeverity.WARNING,
            message=f"Stale hash for {filename} (citation: {rec['citation_key']})",
            issue_type=IssueType.STALE_HASH,
        )

    source_text = source_path.read_text(encoding="utf-8", errors="replace")
    if verify_citation(rec, source_text) == CitationStatus.EXCERPT_MISSING:
        return LintIssue(
            wiki_source=page,
            severity=IssueSeverity.WARNING,
            message=f"Excerpt not found in source for {rec['citation_key']}",
            issue_type=IssueType.EXCERPT_MISSING,
        )
    return None
def _lint_model_changed(wiki_source: str, text: str, config: Config) -> LintIssue | None:
    """Warn when the page's ``generated_by`` frontmatter differs from the current chat model."""
    generated_by = parse_frontmatter(text).get("generated_by", "")
    # Pages without provenance, or generated by the current model, are fine.
    if not generated_by or generated_by == config.chat_model:
        return None
    return LintIssue(
        wiki_source=wiki_source,
        severity=IssueSeverity.WARNING,
        issue_type=IssueType.MODEL_CHANGED,
        message=(
            f"model_changed: page generated by {generated_by!r}, "
            f"current model is {config.chat_model!r}"
        ),
    )
def _lint_unmarked(wiki_source: str, text: str) -> list[LintIssue]:
    """Emit one warning per unmarked claim detected in the page text."""
    issues: list[LintIssue] = []
    for claim in find_unmarked_claims(text):
        issues.append(
            LintIssue(
                wiki_source=wiki_source,
                severity=IssueSeverity.WARNING,
                # Truncate long claims so messages stay readable.
                message=f"Unmarked claim: {claim[:80]}",
                issue_type=IssueType.UNMARKED_CLAIM,
            )
        )
    return issues
def lint_wiki_page(
    wiki_source: str,
    store: Store,
    config: Config | None = None,
) -> list[LintIssue]:
    """Lint a single wiki page: replay its stored citations, then scan the
    page text for unmarked claims and a changed generation model.
    """
    if config is None:
        config = cfg

    # Replay every stored citation for the page against the filesystem.
    checked = (
        _lint_citation(rec, config.documents_dir)
        for rec in store.get_citations_for_wiki(wiki_source)
    )
    issues: list[LintIssue] = [issue for issue in checked if issue is not None]

    wiki_root = config.data_root / config.wiki_dir
    # wiki_source is like "wiki/summaries/doc.md": strip the wiki_dir prefix
    relative = str(wiki_source).removeprefix(str(config.wiki_dir) + "/")
    wiki_path = wiki_root / relative
    if wiki_path.exists():
        text = wiki_path.read_text(encoding="utf-8", errors="replace")
        issues.extend(_lint_unmarked(wiki_source, text))
        if (model_issue := _lint_model_changed(wiki_source, text, config)) is not None:
            issues.append(model_issue)

    return issues
def lint_changed_sources(
    changed_sources: list[str],
    store: Store,
    config: Config | None = None,
) -> LintReport:
    """Lightweight lint for wiki pages citing changed or removed sources.

    Callable from tools that already know the set of changed sources
    (e.g. a future `lilbee wiki check <source>` command); the sync
    pipeline uses `lilbee.wiki.ingest.incremental_update` instead, which runs full
    extraction rather than citation replay.
    """
    if config is None:
        config = cfg
    report = LintReport()

    # Dedup pages while preserving first-encounter order.
    visited: set[str] = set()
    for source_name in changed_sources:
        for rec in store.get_citations_for_source(source_name):
            page = rec["wiki_source"]
            if page not in visited:
                visited.add(page)
                report.issues.extend(lint_wiki_page(page, store, config))

    if report.issues:
        log.info(
            "Wiki lint: %d error(s), %d warning(s)",
            report.error_count,
            report.warning_count,
        )
    return report
def lint_all(
    store: Store,
    config: Config | None = None,
) -> LintReport:
    """Full lint: check every wiki page in the store, then orphans.

    Appends a summary entry to the wiki log when the wiki root exists.
    """
    if config is None:
        config = cfg
    report = LintReport()

    wiki_root = config.data_root / config.wiki_dir
    if not wiki_root.exists():
        # Nothing generated yet: empty report, no log entry.
        return report

    for subdir in WIKI_CONTENT_SUBDIRS:
        subdir_path = wiki_root / subdir
        if subdir_path.is_dir():
            for md_path in sorted(subdir_path.rglob("*.md")):
                rel = md_path.relative_to(wiki_root).as_posix()
                report.issues.extend(
                    lint_wiki_page(f"{config.wiki_dir}/{rel}", store, config)
                )

    report.issues.extend(_lint_orphans(wiki_root, config))
    append_wiki_log(
        WikiLogAction.LINT,
        f"{report.error_count} error(s), {report.warning_count} warning(s)",
        config,
    )
    return report
def _lint_orphans(wiki_root: Path, config: Config) -> list[LintIssue]:
    """Flag concept/entity pages that no *other* page links back to.

    Single-pass over the wiki tree: we collect every inbound
    ``[[slug]]`` reference (keyed by which page contains it) and the set
    of orphan candidates in one ``rglob`` walk, then subtract. The
    per-page bookkeeping matters: a page linking to its own slug must
    not rescue itself from orphanhood — previously a self-reference
    counted as inbound and silently suppressed the warning.
    """
    # slug -> set of pages whose text contains a [[slug]] link
    referencers: dict[str, set[Path]] = {}
    candidates: list[Path] = []
    candidate_roots = {wiki_root / sub for sub in _ORPHAN_CANDIDATE_SUBDIRS}
    for md_path in wiki_root.rglob("*.md"):
        text = md_path.read_text(encoding="utf-8", errors="replace")
        for match in WIKI_LINK_RE.finditer(text):
            # Links may carry a display label: [[slug|label]].
            slug = match.group(1).split("|", 1)[0].strip().lower()
            if slug:
                referencers.setdefault(slug, set()).add(md_path)
        if any(root in md_path.parents for root in candidate_roots):
            candidates.append(md_path)

    issues: list[LintIssue] = []
    for md_path in sorted(candidates):
        slug = md_path.stem.lower()
        # Referenced by at least one page other than itself -> not an orphan.
        if referencers.get(slug, set()) - {md_path}:
            continue
        relative = md_path.relative_to(wiki_root)
        wiki_source = f"{config.wiki_dir}/{relative.as_posix()}"
        issues.append(
            LintIssue(
                wiki_source=wiki_source,
                severity=IssueSeverity.WARNING,
                issue_type=IssueType.ORPHAN,
                message=f"Orphan: no inbound [[{slug}]] links from any other page",
            )
        )
    return issues