Coverage for src / lilbee / wiki / prune.py: 100%

105 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-05-15 20:55 +0000

1"""Prune stale and orphaned wiki pages. 

2 

3Pruning rules: 

41. All cited sources deleted -> archive the page 

52. Concept cluster shrinks below 3 sources -> archive synthesis page 

63. >50% of citations are stale (stale_hash or excerpt_missing) -> flag for regeneration 

7 

8Archived pages are moved to wiki/archive/ and removed from the vector store. 

9""" 

10 

11from __future__ import annotations 

12 

13import logging 

14import shutil 

15from dataclasses import dataclass, field 

16from enum import Enum 

17from pathlib import Path 

18 

19from lilbee.core.config import Config, cfg 

20from lilbee.data.store import Store 

21from lilbee.wiki.index import append_wiki_log, update_wiki_index 

22from lilbee.wiki.lint import IssueType, lint_wiki_page 

23from lilbee.wiki.shared import ( 

24 MIN_CLUSTER_SOURCES, 

25 WIKI_CONTENT_SUBDIRS, 

26 WikiSubdir, 

27) 

28 

# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Lint issue types that are counted as "stale" when deciding whether a page
# should be flagged (see _check_stale_majority).
_STALE_TYPES = {IssueType.STALE_HASH, IssueType.EXCERPT_MISSING}

32 

33 

class PruneAction(Enum):
    """Outcome applied to a wiki page by a prune pass."""

    # The page was moved to the archive and its store data was removed.
    ARCHIVED = "archived"
    # The page was left in place but reported as needing attention.
    FLAGGED = "flagged"

39 

40 

@dataclass(frozen=True)
class PruneRecord:
    """Immutable description of one action the pruner took on one wiki page."""

    # Source identifier of the wiki page the action applies to.
    wiki_source: str
    # Which action was taken.
    action: PruneAction
    # Human-readable explanation of why the action was taken.
    reason: str

    def to_dict(self) -> dict[str, str]:
        """Return the record as a mapping of plain strings, ready for JSON.

        The action is represented by its enum value rather than the enum
        member itself.
        """
        return dict(
            wiki_source=self.wiki_source,
            action=self.action.value,
            reason=self.reason,
        )

56 

57 

@dataclass
class PruneReport:
    """Collected outcome of a prune run: one record per affected page."""

    # Every action taken, in the order the actions were recorded.
    records: list[PruneRecord] = field(default_factory=list)

    @property
    def archived_count(self) -> int:
        """Number of records whose action is ARCHIVED."""
        total = 0
        for rec in self.records:
            if rec.action == PruneAction.ARCHIVED:
                total += 1
        return total

    @property
    def flagged_count(self) -> int:
        """Number of records whose action is FLAGGED."""
        total = 0
        for rec in self.records:
            if rec.action == PruneAction.FLAGGED:
                total += 1
        return total

71 

72 

def _archive_page(
    wiki_source: str,
    wiki_root: Path,
    store: Store,
    config: Config,
) -> None:
    """Relocate a wiki page into the archive folder and purge it from the store.

    Only the page's file name is kept, so the archived copy lands directly
    inside the archive directory regardless of which sub-directory the page
    came from. The store deletions run whether or not the file was found on
    disk; a missing file is only logged as a warning.
    """
    # Strip the leading "<wiki_dir>/" so the remainder is relative to wiki_root.
    prefix = config.wiki_dir + "/"
    page_path = wiki_root / wiki_source.removeprefix(prefix)

    destination_dir = wiki_root / WikiSubdir.ARCHIVE
    destination_dir.mkdir(parents=True, exist_ok=True)
    destination = destination_dir / page_path.name

    if not page_path.exists():
        log.warning("Wiki page file not found for archival: %s", page_path)
    else:
        shutil.move(page_path, destination)
        log.info("Archived wiki page %s -> %s", page_path, destination)

    store.delete_by_source(wiki_source)
    store.delete_citations_for_wiki(wiki_source)

95 

96 

def _check_all_sources_deleted(
    wiki_source: str,
    store: Store,
    documents_dir: Path,
) -> bool:
    """Tell whether none of the files cited by a page still exist on disk.

    A page with no citations at all is never reported as orphaned, so the
    result is False in that case.
    """
    citations = store.get_citations_for_wiki(wiki_source)
    if not citations:
        return False
    # De-duplicate: several citations may point at the same source file.
    cited_names = {row["source_filename"] for row in citations}
    for name in cited_names:
        if (documents_dir / name).exists():
            # One surviving source is enough to keep the page.
            return False
    return True

108 

109 

def _check_cluster_below_threshold(
    wiki_source: str,
    store: Store,
    documents_dir: Path,
    min_sources: int = MIN_CLUSTER_SOURCES,
) -> bool:
    """Tell whether a synthesis page no longer has enough surviving sources.

    Pages whose source path does not contain the synthesis sub-directory, and
    pages without any citations, always yield False. Otherwise the distinct
    cited files that still exist under ``documents_dir`` are counted and the
    result is True when that count is lower than ``min_sources``.
    """
    synthesis_marker = f"/{WikiSubdir.SYNTHESIS}/"
    if synthesis_marker not in wiki_source:
        return False
    citations = store.get_citations_for_wiki(wiki_source)
    if not citations:
        return False
    distinct_names = {row["source_filename"] for row in citations}
    surviving = [name for name in distinct_names if (documents_dir / name).exists()]
    return len(surviving) < min_sources

125 

126 

def _check_stale_majority(
    wiki_source: str,
    store: Store,
    config: Config,
) -> bool:
    """Tell whether the share of stale citations exceeds the configured limit.

    The page is linted first; issues whose type is in ``_STALE_TYPES``
    (stale_hash or excerpt_missing) are counted and divided by the page's
    number of citations. The result is True when that ratio is strictly
    greater than ``config.wiki_stale_citation_threshold``. A page with no
    lint issues or no citations yields False.
    """
    lint_issues = lint_wiki_page(wiki_source, store, config)
    if not lint_issues:
        return False
    citations = store.get_citations_for_wiki(wiki_source)
    if not citations:
        return False
    stale_issues = [issue for issue in lint_issues if issue.issue_type in _STALE_TYPES]
    stale_ratio = len(stale_issues) / len(citations)
    return stale_ratio > config.wiki_stale_citation_threshold

141 

142 

def _archive_and_record(
    wiki_source: str,
    wiki_root: Path,
    store: Store,
    config: Config,
    reason: str,
) -> PruneRecord:
    """Archive the page, then describe that action as a PruneRecord.

    The returned record always carries the ARCHIVED action together with the
    caller-supplied ``reason``.
    """
    _archive_page(wiki_source, wiki_root, store, config)
    record = PruneRecord(
        wiki_source=wiki_source,
        action=PruneAction.ARCHIVED,
        reason=reason,
    )
    return record

153 

154 

def _evaluate_page(
    wiki_source: str, wiki_root: Path, store: Store, config: Config
) -> PruneRecord | None:
    """Run one wiki page through the pruning rules, first match wins.

    Order of evaluation:
      1. every cited source is gone          -> archive
      2. synthesis cluster has too few sources -> archive
      3. too many stale citations            -> flag only (page is not moved)

    Returns the record for the rule that fired, or None when no rule applies.
    """
    documents_dir = config.documents_dir

    # Rules 1 and 2 both end in archival; they differ only in the reason text.
    archive_reason = None
    if _check_all_sources_deleted(wiki_source, store, documents_dir):
        archive_reason = "all cited sources deleted"
    elif _check_cluster_below_threshold(wiki_source, store, documents_dir):
        archive_reason = f"concept cluster below {MIN_CLUSTER_SOURCES} live sources"
    if archive_reason is not None:
        return _archive_and_record(wiki_source, wiki_root, store, config, archive_reason)

    if not _check_stale_majority(wiki_source, store, config):
        return None
    return PruneRecord(
        wiki_source=wiki_source,
        action=PruneAction.FLAGGED,
        reason="majority of citations stale",
    )

178 

179 

def _finalize_prune(report: PruneReport, config: Config) -> None:
    """Refresh the wiki index and write one log entry per prune record.

    Does nothing when the report contains no records.
    """
    records = report.records
    if not records:
        return
    log.info(
        "Wiki prune: %d archived, %d flagged",
        report.archived_count,
        report.flagged_count,
    )
    update_wiki_index(config)
    for record in records:
        event = f"pruned ({record.action.value})"
        detail = f"{record.wiki_source}: {record.reason}"
        append_wiki_log(event, detail, config)

192 

193 

def prune_wiki(store: Store, config: Config | None = None) -> PruneReport:
    """Walk every wiki content page and apply the pruning rules to each.

    Falls back to the module-level ``cfg`` when ``config`` is None. Markdown
    files are visited per content sub-directory in sorted path order. When
    the wiki root does not exist an empty report is returned and nothing else
    happens; otherwise the index and log are finalized before returning.
    """
    active = cfg if config is None else config
    wiki_root = active.data_root / active.wiki_dir
    report = PruneReport()
    if not wiki_root.exists():
        return report

    for subdir in WIKI_CONTENT_SUBDIRS:
        content_dir = wiki_root / subdir
        if content_dir.exists():
            # sorted() materializes the listing up front, so pages archived
            # during the loop do not disturb the iteration.
            for page_path in sorted(content_dir.rglob("*.md")):
                page_rel = page_path.relative_to(wiki_root).as_posix()
                outcome = _evaluate_page(
                    f"{active.wiki_dir}/{page_rel}", wiki_root, store, active
                )
                if outcome is not None:
                    report.records.append(outcome)

    _finalize_prune(report, active)
    return report