Coverage for src / lilbee / wiki / prune.py: 100%
105 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-05-15 20:55 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-05-15 20:55 +0000
1"""Prune stale and orphaned wiki pages.
3Pruning rules:
41. All cited sources deleted -> archive the page
52. Concept cluster shrinks below 3 sources -> archive synthesis page
63. >50% of citations are stale (stale_hash or excerpt_missing) -> flag for regeneration
8Archived pages are moved to wiki/archive/ and removed from the vector store.
9"""
11from __future__ import annotations
13import logging
14import shutil
15from dataclasses import dataclass, field
16from enum import Enum
17from pathlib import Path
19from lilbee.core.config import Config, cfg
20from lilbee.data.store import Store
21from lilbee.wiki.index import append_wiki_log, update_wiki_index
22from lilbee.wiki.lint import IssueType, lint_wiki_page
23from lilbee.wiki.shared import (
24 MIN_CLUSTER_SOURCES,
25 WIKI_CONTENT_SUBDIRS,
26 WikiSubdir,
27)
# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Lint issue types that are counted as "stale" when deciding whether a page
# should be flagged (see _check_stale_majority).
_STALE_TYPES = {IssueType.STALE_HASH, IssueType.EXCERPT_MISSING}
class PruneAction(Enum):
    """What happened to a wiki page during pruning.

    The string values are what ``PruneRecord.to_dict`` emits and what
    ``_finalize_prune`` writes into the wiki log.
    """

    # The page was archived (file moved to the archive dir, store data removed).
    ARCHIVED = "archived"
    # The page was left in place but reported because its citations are stale.
    FLAGGED = "flagged"
@dataclass(frozen=True)
class PruneRecord:
    """Immutable description of one pruning decision for one wiki page."""

    # Store identifier of the page, e.g. "<wiki_dir>/<subdir>/<name>.md".
    wiki_source: str
    # Which action was taken on the page.
    action: PruneAction
    # Human-readable explanation of why the action was taken.
    reason: str

    def to_dict(self) -> dict[str, str]:
        """Return a JSON-friendly dict; the action is emitted as its string value."""
        return dict(
            wiki_source=self.wiki_source,
            action=self.action.value,
            reason=self.reason,
        )
@dataclass
class PruneReport:
    """Collects every PruneRecord produced by one pruning run."""

    # Records in the order the pages were evaluated; empty when nothing was pruned.
    records: list[PruneRecord] = field(default_factory=list)

    @property
    def archived_count(self) -> int:
        """Number of records whose action is ARCHIVED."""
        return len([rec for rec in self.records if rec.action == PruneAction.ARCHIVED])

    @property
    def flagged_count(self) -> int:
        """Number of records whose action is FLAGGED."""
        return len([rec for rec in self.records if rec.action == PruneAction.FLAGGED])
def _archive_page(
    wiki_source: str,
    wiki_root: Path,
    store: Store,
    config: Config,
) -> None:
    """Move a wiki page to wiki/archive/ and clean up store data.

    The page file is moved into the (flat) archive directory under its own
    file name. If the archive already holds a file with that name -- e.g. a
    same-named page from another content subdirectory, or an earlier archival
    of a page that was later regenerated -- a numeric suffix ("name-1.md",
    "name-2.md", ...) is used instead so the earlier archive is preserved.

    The store rows and citations for ``wiki_source`` are deleted whether or
    not the page file was found on disk.

    Args:
        wiki_source: Store identifier of the page, prefixed with ``config.wiki_dir``.
        wiki_root: Directory on disk that contains the wiki subdirectories.
        store: Store whose chunks and citations for the page are removed.
        config: Supplies ``wiki_dir``, used to strip the prefix from ``wiki_source``.
    """
    relative = wiki_source.removeprefix(config.wiki_dir + "/")
    source_path = wiki_root / relative

    archive_dir = wiki_root / WikiSubdir.ARCHIVE
    archive_dir.mkdir(parents=True, exist_ok=True)
    archive_path = archive_dir / source_path.name

    if source_path.exists():
        # shutil.move may overwrite an existing destination file, so pick a
        # free name rather than clobbering a previously archived page.
        suffix_no = 1
        while archive_path.exists():
            archive_path = archive_dir / f"{source_path.stem}-{suffix_no}{source_path.suffix}"
            suffix_no += 1
        shutil.move(source_path, archive_path)
        log.info("Archived wiki page %s -> %s", source_path, archive_path)
    else:
        log.warning("Wiki page file not found for archival: %s", source_path)

    store.delete_by_source(wiki_source)
    store.delete_citations_for_wiki(wiki_source)
97def _check_all_sources_deleted(
98 wiki_source: str,
99 store: Store,
100 documents_dir: Path,
101) -> bool:
102 """Return True if every cited source file has been deleted from disk."""
103 citations = store.get_citations_for_wiki(wiki_source)
104 if not citations:
105 return False
106 source_files = {c["source_filename"] for c in citations}
107 return all(not (documents_dir / f).exists() for f in source_files)
def _check_cluster_below_threshold(
    wiki_source: str,
    store: Store,
    documents_dir: Path,
    min_sources: int = MIN_CLUSTER_SOURCES,
) -> bool:
    """Tell whether a synthesis page has fewer than ``min_sources`` cited files left on disk.

    Only pages whose ``wiki_source`` contains the synthesis subdirectory
    segment are considered; any other page, and any page without citations,
    yields False. Each distinct cited file name is counted once.
    """
    synthesis_segment = f"/{WikiSubdir.SYNTHESIS}/"
    if synthesis_segment not in wiki_source:
        return False

    rows = store.get_citations_for_wiki(wiki_source)
    if not rows:
        return False

    cited_names = {row["source_filename"] for row in rows}
    still_present = [name for name in cited_names if (documents_dir / name).exists()]
    return len(still_present) < min_sources
def _check_stale_majority(
    wiki_source: str,
    store: Store,
    config: Config,
) -> bool:
    """Tell whether the page's share of stale citations exceeds the configured threshold.

    The share is the number of lint issues of a stale type (stale_hash or
    excerpt_missing) divided by the number of citations stored for the page,
    compared strictly against ``config.wiki_stale_citation_threshold``.
    Pages with no lint issues or no citations yield False.
    """
    findings = lint_wiki_page(wiki_source, store, config)
    if not findings:
        return False

    cited = store.get_citations_for_wiki(wiki_source)
    if not cited:
        # Also guards the division below.
        return False

    stale = [finding for finding in findings if finding.issue_type in _STALE_TYPES]
    stale_ratio = len(stale) / len(cited)
    return stale_ratio > config.wiki_stale_citation_threshold
def _archive_and_record(
    wiki_source: str,
    wiki_root: Path,
    store: Store,
    config: Config,
    reason: str,
) -> PruneRecord:
    """Archive the page, then describe that action as an ARCHIVED PruneRecord.

    ``reason`` is stored verbatim on the returned record.
    """
    _archive_page(wiki_source, wiki_root, store, config)
    archived = PruneRecord(wiki_source, PruneAction.ARCHIVED, reason)
    return archived
def _evaluate_page(
    wiki_source: str, wiki_root: Path, store: Store, config: Config
) -> PruneRecord | None:
    """Apply the pruning rules to one page, in order, and act on the first that matches.

    1. Every cited source file is gone      -> archive the page.
    2. Synthesis page with too few sources  -> archive the page.
    3. Too many stale citations             -> flag the page (no file changes).

    Returns the record for the action taken, or None when no rule matched.
    """
    docs_dir = config.documents_dir

    # Rules 1 and 2 both end in archival; only the recorded reason differs.
    archive_reason: str | None = None
    if _check_all_sources_deleted(wiki_source, store, docs_dir):
        archive_reason = "all cited sources deleted"
    elif _check_cluster_below_threshold(wiki_source, store, docs_dir):
        archive_reason = f"concept cluster below {MIN_CLUSTER_SOURCES} live sources"

    if archive_reason is not None:
        return _archive_and_record(wiki_source, wiki_root, store, config, archive_reason)

    if not _check_stale_majority(wiki_source, store, config):
        return None

    return PruneRecord(
        wiki_source=wiki_source,
        action=PruneAction.FLAGGED,
        reason="majority of citations stale",
    )
def _finalize_prune(report: PruneReport, config: Config) -> None:
    """Log a summary, refresh the wiki index and append one wiki-log entry per record.

    Does nothing when the report holds no records.
    """
    entries = report.records
    if entries:
        log.info(
            "Wiki prune: %d archived, %d flagged",
            report.archived_count,
            report.flagged_count,
        )
        update_wiki_index(config)
        for entry in entries:
            heading = f"pruned ({entry.action.value})"
            detail = f"{entry.wiki_source}: {entry.reason}"
            append_wiki_log(heading, detail, config)
def prune_wiki(store: Store, config: Config | None = None) -> PruneReport:
    """Evaluate every Markdown page under the wiki content subdirectories and prune it if needed.

    Args:
        store: Store passed through to the per-page checks and archival.
        config: Configuration to use; the module-level ``cfg`` is used when None.

    Returns:
        A PruneReport with one record per archived or flagged page. The report
        is empty when the wiki root does not exist or no page matched a rule.
    """
    active = cfg if config is None else config
    root = active.data_root / active.wiki_dir
    outcome = PruneReport()
    if not root.exists():
        return outcome

    for subdir_name in WIKI_CONTENT_SUBDIRS:
        content_dir = root / subdir_name
        if content_dir.exists():
            # sorted() builds the full listing up front, giving a stable order.
            for page_path in sorted(content_dir.rglob("*.md")):
                page_id = f"{active.wiki_dir}/{page_path.relative_to(root).as_posix()}"
                result = _evaluate_page(page_id, root, store, active)
                if result:
                    outcome.records.append(result)

    _finalize_prune(outcome, active)
    return outcome