Coverage for src / lilbee / wiki / lint.py: 100%

144 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-05-15 20:55 +0000

"""Lint wiki pages for citation staleness, missing sources, and unmarked claims.

Two modes:

- lightweight: runs automatically after sync, checks only pages whose sources changed
- full: manual ``lilbee wiki lint``, checks all wiki pages
"""

7 

8from __future__ import annotations 

9 

10import logging 

11from dataclasses import dataclass, field 

12from enum import Enum 

13from pathlib import Path 

14 

15from lilbee.core.config import Config, cfg 

16from lilbee.core.security import validate_path_within 

17from lilbee.data.ingest import file_hash 

18from lilbee.data.store import CitationRecord, Store 

19from lilbee.wiki.citation import ( 

20 CitationStatus, 

21 find_unmarked_claims, 

22 verify_citation, 

23) 

24from lilbee.wiki.grammar import WIKI_LINK_RE 

25from lilbee.wiki.index import append_wiki_log 

26from lilbee.wiki.shared import ( 

27 WIKI_CONTENT_SUBDIRS, 

28 WikiLogAction, 

29 WikiSubdir, 

30 parse_frontmatter, 

31) 

32 

# Subdirectories whose pages are checked for missing inbound [[slug]] links
# by _lint_orphans; pages under other subdirs are never flagged as orphans.
_ORPHAN_CANDIDATE_SUBDIRS: tuple[str, ...] = (WikiSubdir.CONCEPTS, WikiSubdir.ENTITIES)

# Module-level logger, named after this module.
log = logging.getLogger(__name__)

36 

37 

class IssueSeverity(Enum):
    """How serious a lint finding is.

    ``ERROR`` marks broken state (e.g. a deleted or escaping source path);
    ``WARNING`` marks advisory findings (stale hashes, unmarked claims).
    """

    WARNING = "warning"
    ERROR = "error"

43 

44 

class IssueType(Enum):
    """Machine-readable category of a lint finding.

    Downstream tooling (e.g. prune) filters on this value instead of
    parsing the human-readable message.
    """

    PATH_TRAVERSAL = "path_traversal"
    SOURCE_MISSING = "source_missing"
    STALE_HASH = "stale_hash"
    EXCERPT_MISSING = "excerpt_missing"
    MODEL_CHANGED = "model_changed"
    UNMARKED_CLAIM = "unmarked_claim"
    ORPHAN = "orphan"

55 

56 

@dataclass(frozen=True)
class LintIssue:
    """One immutable lint finding attached to a wiki page."""

    wiki_source: str  # page identifier, e.g. "wiki/summaries/doc.md"
    severity: IssueSeverity
    message: str
    issue_type: IssueType | None = None  # None when no machine category applies

    def to_dict(self) -> dict[str, str]:
        """Serialize to a flat, JSON-friendly dict of strings."""
        kind = "" if self.issue_type is None else self.issue_type.value
        return {
            "wiki_source": self.wiki_source,
            "severity": self.severity.value,
            "message": self.message,
            "issue_type": kind,
        }

74 

75 

@dataclass
class LintReport:
    """Collected lint findings for one or more wiki pages."""

    # All findings, in the order they were discovered.
    issues: list[LintIssue] = field(default_factory=list)

    @property
    def error_count(self) -> int:
        """Number of ERROR-severity findings."""
        return sum(issue.severity == IssueSeverity.ERROR for issue in self.issues)

    @property
    def warning_count(self) -> int:
        """Number of WARNING-severity findings."""
        return sum(issue.severity == IssueSeverity.WARNING for issue in self.issues)

89 

90 

def _lint_citation(
    rec: CitationRecord,
    documents_dir: Path,
) -> LintIssue | None:
    """Check a single citation record against the filesystem.

    Runs four checks in order -- path containment, source existence,
    content hash, excerpt presence -- and returns a ``LintIssue`` for the
    first failure, or ``None`` when the citation is still valid.
    """
    filename = rec["source_filename"]
    page = rec["wiki_source"]
    source_path = documents_dir / filename

    # Reject records whose filename would resolve outside documents_dir.
    try:
        validate_path_within(source_path, documents_dir)
    except ValueError:
        return LintIssue(
            wiki_source=page,
            severity=IssueSeverity.ERROR,
            message=f"Source path escapes documents dir: {filename}",
            issue_type=IssueType.PATH_TRAVERSAL,
        )

    if not source_path.exists():
        return LintIssue(
            wiki_source=page,
            severity=IssueSeverity.ERROR,
            message=f"Source deleted: {filename}",
            issue_type=IssueType.SOURCE_MISSING,
        )

    # A hash mismatch means the source changed after the citation was made.
    if file_hash(source_path) != rec["source_hash"]:
        return LintIssue(
            wiki_source=page,
            severity=IssueSeverity.WARNING,
            message=f"Stale hash for {filename} (citation: {rec['citation_key']})",
            issue_type=IssueType.STALE_HASH,
        )

    text = source_path.read_text(encoding="utf-8", errors="replace")
    if verify_citation(rec, text) == CitationStatus.EXCERPT_MISSING:
        return LintIssue(
            wiki_source=page,
            severity=IssueSeverity.WARNING,
            message=f"Excerpt not found in source for {rec['citation_key']}",
            issue_type=IssueType.EXCERPT_MISSING,
        )
    return None

138 

139 

def _lint_model_changed(wiki_source: str, text: str, config: Config) -> LintIssue | None:
    """Warn when the page's generated_by model differs from the configured chat model.

    Pages without a ``generated_by`` frontmatter entry are skipped.
    """
    page_model = parse_frontmatter(text).get("generated_by", "")
    if not page_model or page_model == config.chat_model:
        return None
    return LintIssue(
        wiki_source=wiki_source,
        severity=IssueSeverity.WARNING,
        issue_type=IssueType.MODEL_CHANGED,
        message=(
            f"model_changed: page generated by {page_model!r}, "
            f"current model is {config.chat_model!r}"
        ),
    )

156 

157 

def _lint_unmarked(wiki_source: str, text: str) -> list[LintIssue]:
    """Return one WARNING issue per unmarked claim found in the page text."""
    issues: list[LintIssue] = []
    for claim in find_unmarked_claims(text):
        issues.append(
            LintIssue(
                wiki_source=wiki_source,
                severity=IssueSeverity.WARNING,
                # Truncate long claims to keep messages readable.
                message=f"Unmarked claim: {claim[:80]}",
                issue_type=IssueType.UNMARKED_CLAIM,
            )
        )
    return issues

170 

171 

def lint_wiki_page(
    wiki_source: str,
    store: Store,
    config: Config | None = None,
) -> list[LintIssue]:
    """Lint one wiki page: citation health, unmarked claims, and model drift."""
    if config is None:
        config = cfg

    # Citation checks come from the store, independent of the page file.
    issues = [
        issue
        for rec in store.get_citations_for_wiki(wiki_source)
        if (issue := _lint_citation(rec, config.documents_dir)) is not None
    ]

    # wiki_source looks like "wiki/summaries/doc.md": strip the wiki_dir
    # prefix to resolve the page beneath the wiki root on disk.
    relative = str(wiki_source).removeprefix(str(config.wiki_dir) + "/")
    page_path = config.data_root / config.wiki_dir / relative
    if page_path.exists():
        text = page_path.read_text(encoding="utf-8", errors="replace")
        issues += _lint_unmarked(wiki_source, text)
        drift = _lint_model_changed(wiki_source, text, config)
        if drift is not None:
            issues.append(drift)

    return issues

200 

201 

def lint_changed_sources(
    changed_sources: list[str],
    store: Store,
    config: Config | None = None,
) -> LintReport:
    """Lightweight lint covering only wiki pages that cite the given sources.

    Intended for callers that already know which sources changed or were
    removed (e.g. a future ``lilbee wiki check <source>`` command).  The
    sync pipeline does not use this; it goes through
    ``lilbee.wiki.ingest.incremental_update``, which runs full extraction
    rather than citation replay.
    """
    if config is None:
        config = cfg
    report = LintReport()

    # Dedupe pages while preserving first-seen order (dict keys keep
    # insertion order), so each page is linted at most once.
    pages: dict[str, None] = {}
    for source_name in changed_sources:
        for rec in store.get_citations_for_source(source_name):
            pages.setdefault(rec["wiki_source"], None)
    for page in pages:
        report.issues.extend(lint_wiki_page(page, store, config))

    if report.issues:
        log.info(
            "Wiki lint: %d error(s), %d warning(s)",
            report.error_count,
            report.warning_count,
        )
    return report

235 

236 

def lint_all(
    store: Store,
    config: Config | None = None,
) -> LintReport:
    """Full lint: walk every wiki page under the wiki root and check it.

    Also runs the orphan check and records a summary line in the wiki log.
    """
    if config is None:
        config = cfg
    report = LintReport()

    wiki_root = config.data_root / config.wiki_dir
    if not wiki_root.exists():
        return report

    for subdir in WIKI_CONTENT_SUBDIRS:
        base = wiki_root / subdir
        if not base.is_dir():
            continue
        # Sort for a deterministic issue order across runs.
        for page in sorted(base.rglob("*.md")):
            rel = page.relative_to(wiki_root).as_posix()
            report.issues.extend(lint_wiki_page(f"{config.wiki_dir}/{rel}", store, config))

    report.issues.extend(_lint_orphans(wiki_root, config))
    append_wiki_log(
        WikiLogAction.LINT,
        f"{report.error_count} error(s), {report.warning_count} warning(s)",
        config,
    )
    return report

266 

267 

def _lint_orphans(wiki_root: Path, config: Config) -> list[LintIssue]:
    """Flag concept/entity pages that no *other* page links back to.

    Single-pass over the wiki tree: we collect every inbound ``[[slug]]``
    reference (remembering which page made it) and the set of orphan
    candidates in one ``rglob`` walk, then subtract.

    NOTE: references are tracked per linking page so that a page linking
    to itself does not count as inbound.  The earlier version pooled all
    slugs into one set, so a self-link let a page escape the orphan check
    even when no other page referenced it -- contradicting the emitted
    "from any other page" message.
    """
    # slug -> set of pages containing an [[slug]] link to it
    referenced_from: dict[str, set[Path]] = {}
    candidates: list[Path] = []
    candidate_roots = {wiki_root / sub for sub in _ORPHAN_CANDIDATE_SUBDIRS}
    for md_path in wiki_root.rglob("*.md"):
        text = md_path.read_text(encoding="utf-8", errors="replace")
        for match in WIKI_LINK_RE.finditer(text):
            # "[[slug|label]]" -> "slug"; slugs compare case-insensitively.
            slug = match.group(1).split("|", 1)[0].strip().lower()
            if slug:
                referenced_from.setdefault(slug, set()).add(md_path)
        if any(root in md_path.parents for root in candidate_roots):
            candidates.append(md_path)

    issues: list[LintIssue] = []
    for md_path in sorted(candidates):
        slug = md_path.stem.lower()
        # Inbound links from any page other than the candidate itself.
        if referenced_from.get(slug, set()) - {md_path}:
            continue
        relative = md_path.relative_to(wiki_root)
        wiki_source = f"{config.wiki_dir}/{relative.as_posix()}"
        issues.append(
            LintIssue(
                wiki_source=wiki_source,
                severity=IssueSeverity.WARNING,
                issue_type=IssueType.ORPHAN,
                message=f"Orphan: no inbound [[{slug}]] links from any other page",
            )
        )
    return issues