Coverage for src/lilbee/wiki/generation.py: 100%

144 statements  

coverage.py v7.13.4, created at 2026-05-15 20:55 +0000

1"""Top-level wiki build orchestrators. 

2 

3Two public entry points live here: 

4 

5- :func:`build_wiki` produces entity and LLM-curated concept pages 

6 per source, runs the one-time legacy-concept-page archival first, 

7 then rewrites ``[[link]]`` slugs across all wiki content subdirs. 

8- :func:`generate_synthesis_pages` produces cross-source synthesis 

9 pages from concept clusters spanning 3+ documents. 

10 

11Both reuse the per-source batch path and the single-page pipeline 

12from :mod:`lilbee.wiki.synthesis` and :mod:`lilbee.wiki.page`. 

13""" 

from __future__ import annotations

import logging
from pathlib import Path
from typing import TypedDict

from lilbee.app.services import get_services
from lilbee.core.config import Config, cfg
from lilbee.data.store import SearchChunk, Store
from lilbee.providers.base import LLMProvider
from lilbee.retrieval.clustering import SourceClusterer
from lilbee.wiki.batch import archive_legacy_concept_pages
from lilbee.wiki.entity_extractor import ExtractedEntity, get_entity_extractor
from lilbee.wiki.index import append_wiki_log, update_wiki_index
from lilbee.wiki.links import apply_rewriter, compile_rewriter
from lilbee.wiki.shared import (
    MIN_CLUSTER_SOURCES,
    WIKI_CONTENT_SUBDIRS,
    WikiLogAction,
    WikiSubdir,
)
from lilbee.wiki.synthesis import (
    generate_source_batch,
    generate_synthesis_page,
    group_entities_by_primary_source,
)

log = logging.getLogger(__name__)

_ENTITY_LIKE_SUBDIRS: tuple[str, ...] = (WikiSubdir.CONCEPTS, WikiSubdir.ENTITIES)


def _generate_for_cluster(
    label: str,
    sources: frozenset[str],
    provider: LLMProvider,
    store: Store,
    config: Config,
) -> Path | None:
    """Gather chunks for a cluster and generate a synthesis page."""
    source_names = sorted(sources)
    chunks_by_source: dict[str, list[SearchChunk]] = {}
    for name in source_names:
        chunks = store.get_chunks_by_source(name)
        if chunks:
            chunks_by_source[name] = chunks

    if len(chunks_by_source) < MIN_CLUSTER_SOURCES:
        return None

    return generate_synthesis_page(label, source_names, chunks_by_source, provider, store, config)
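
# Illustration (hypothetical filenames): for a cluster labeled "supply chains"
# over sources {"ford.md", "toyota.md", "logistics.md"}, if only two of the
# three still have chunks in the store, len(chunks_by_source) falls below
# MIN_CLUSTER_SOURCES and the cluster is skipped rather than synthesized thin.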

def generate_synthesis_pages(
    provider: LLMProvider,
    store: Store,
    clusterer: SourceClusterer,
    config: Config | None = None,
) -> list[Path]:
    """Generate synthesis pages for source clusters spanning 3+ documents."""
    if config is None:
        config = cfg

    clusters = clusterer.get_clusters(min_sources=MIN_CLUSTER_SOURCES)
    if not clusters:
        log.info("No source clusters span %d+ sources, skipping synthesis", MIN_CLUSTER_SOURCES)
        return []

    pages: list[Path] = []
    for cluster in clusters:
        page = _generate_for_cluster(cluster.label, cluster.sources, provider, store, config)
        if page is not None:
            pages.append(page)

    log.info("Generated %d synthesis pages", len(pages))
    return pages
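
# A minimal call sketch (mirrors the wiring run_full_synthesize below performs;
# with config omitted, the module-level ``cfg`` is used):
#
#     svc = get_services()
#     pages = generate_synthesis_pages(svc.provider, svc.store, svc.clusterer)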

def _all_sources_in_scope(
    entities: list[ExtractedEntity],
    grouped: dict[str, list[ExtractedEntity]],
    store: Store,
    config: Config,
    extract_concepts: bool,
) -> set[str]:
    """Union of sources that have entities plus, when enabled, sources eligible for concept curation.

    Seed the union with every entity's primary source. When
    ``extract_concepts`` is True AND ``wiki_batch_min_chunks`` is
    satisfied, add any source in the store that passes the floor.
    This gives concept-only sources (no extracted entities) their
    chance at curation while keeping zero-entity short sources
    skipped entirely.
    """
    sources: set[str] = set(grouped)
    if not extract_concepts:
        return sources
    try:
        records = store.get_sources()
    except Exception as exc:
        log.warning("get_sources failed; falling back to entity-grouped sources: %s", exc)
        return sources
    for record in records:
        name = record.get("filename", "") if isinstance(record, dict) else ""
        if not name:
            continue
        if name in sources:
            continue
        chunk_count = record.get("chunk_count", 0) if isinstance(record, dict) else 0
        if chunk_count >= config.wiki_batch_min_chunks:
            sources.add(name)
    _ = entities  # unused pass-through kept for doc clarity; the assignment silences linters
    return sources
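
# Worked example (hypothetical data): if ``grouped`` maps only "ford.md" to its
# entities while the store also holds "toyota.md" (12 chunks) and "memo.md"
# (2 chunks) under wiki_batch_min_chunks=5, the result is
# {"ford.md", "toyota.md"}; "memo.md" stays skipped as a zero-entity short source.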

def _entity_surface_map(entities: list[ExtractedEntity]) -> dict[str, str]:
    """Build the surface-form -> slug map for the ``[[link]]`` rewriter.

    Includes both the entity's human label (e.g. *"Henry Ford"*) and
    the slug-with-hyphens-as-spaces variant (*"henry ford"*) so the
    rewriter catches either form in body text.
    """
    mapping: dict[str, str] = {}
    for entity in entities:
        mapping[entity.label] = entity.slug
        spaced = entity.slug.replace("-", " ")
        if spaced and spaced != entity.label:
            mapping[spaced] = entity.slug
    return mapping
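
# Example (from the docstring above): an entity with label "Henry Ford" and
# slug "henry-ford" yields {"Henry Ford": "henry-ford", "henry ford": "henry-ford"}.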

def _augment_surface_map_with_existing_pages(
    surface_to_slug: dict[str, str], wiki_root: Path
) -> None:
    """Add slugs for pages already on disk so an incremental rebuild of
    one concept still links to its unchanged neighbors. **Mutates
    surface_to_slug in place.** Only enriches the map with the
    hyphen-to-space surface form because frontmatter labels aren't
    read here; body prose typically uses the spaced form so this
    covers the common case.
    """
    for subdir in _ENTITY_LIKE_SUBDIRS:
        subdir_path = wiki_root / subdir
        if not subdir_path.is_dir():
            continue
        for md_path in subdir_path.rglob("*.md"):
            slug = md_path.stem
            spaced = slug.replace("-", " ")
            surface_to_slug.setdefault(spaced, slug)
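
# Example (hypothetical page): an existing wiki/concepts/assembly-line.md adds
# {"assembly line": "assembly-line"}; setdefault means entries derived from
# freshly extracted entities are never overwritten by on-disk slugs.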

def _rewrite_links_across_wiki(entities: list[ExtractedEntity], config: Config) -> None:
    """Rewrite ``[[slug]]`` links on every page under ``wiki/`` content subdirs.

    A page never receives a link to itself: the rewriter takes the
    owning slug and drops it inside its match callback, so the
    surface map is shared unmodified across every page in the walk
    (no O(M) dict rebuild per file). The map is augmented with
    slugs from the existing on-disk corpus so a touched page still
    links to untouched neighbors. The alternation regex + lookup are
    compiled once per build and reused across pages.
    """
    surface_to_slug = _entity_surface_map(entities)
    wiki_root = config.data_root / config.wiki_dir
    _augment_surface_map_with_existing_pages(surface_to_slug, wiki_root)
    rewriter = compile_rewriter(surface_to_slug)
    if rewriter is None:
        return

    for subdir in WIKI_CONTENT_SUBDIRS:
        subdir_path = wiki_root / subdir
        if not subdir_path.is_dir():
            continue
        is_entity_subdir = subdir in _ENTITY_LIKE_SUBDIRS
        for md_path in subdir_path.rglob("*.md"):
            owning_slug = md_path.stem if is_entity_subdir else None
            original = md_path.read_text(encoding="utf-8")
            rewritten = apply_rewriter(original, rewriter, skip_slug=owning_slug)
            if rewritten != original:
                md_path.write_text(rewritten, encoding="utf-8")
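
# Self-link example (hypothetical page): while rewriting
# wiki/entities/henry-ford.md, skip_slug="henry-ford" keeps mentions of
# "Henry Ford" on that page as plain text instead of linking the page to
# itself. Pages outside the entity-like subdirs pass skip_slug=None, so every
# surface form remains linkable there.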

def build_wiki(
    entities: list[ExtractedEntity],
    provider: LLMProvider,
    store: Store,
    config: Config | None = None,
    *,
    extract_concepts: bool = True,
) -> list[Path]:
    """Produce entity and LLM-curated concept pages per source.

    Per-entity / per-concept fan-out is collapsed into a per-source
    batched call: for each source in ``entities``' chunk refs, one LLM
    call identifies 3-5 concepts AND writes a wiki section for every
    pre-extracted entity belonging to that source. Output sections are
    split, citation-verified, embedding-scored, and landed under
    ``wiki/entities/`` or ``wiki/concepts/`` depending on kind.

    ``extract_concepts=False`` (used by the incremental-ingest hook)
    drops the concept-curation paragraph from the prompt so a
    touched source does not churn concept slugs.

    A one-time archive migration runs first (idempotently, gated by
    ``{data_dir}/.phase-d-migrated``), moving legacy concept pages
    under ``wiki/archive/concepts/`` and unwrapping stale
    ``[[archived-slug]]`` links across the remaining pages.
    """
    if config is None:
        config = cfg
    wiki_root = config.data_root / config.wiki_dir
    archive_legacy_concept_pages(wiki_root, config.data_dir)

    grouped = group_entities_by_primary_source(entities)
    all_sources = _all_sources_in_scope(entities, grouped, store, config, extract_concepts)
    written_concept_slugs: dict[str, str] = {}
    pages: list[Path] = []

    for source in sorted(all_sources):
        source_entities = grouped.get(source, [])
        chunks = store.get_chunks_by_source(source)
        chunk_count = len(chunks)
        source_extract = extract_concepts and chunk_count >= config.wiki_batch_min_chunks
        if not source_entities and not source_extract:
            log.info(
                "Skipping source %s: %d entities, %d chunks, min=%d, extract=%s",
                source,
                len(source_entities),
                chunk_count,
                config.wiki_batch_min_chunks,
                source_extract,
            )
            continue
        source_pages = generate_source_batch(
            source=source,
            entities=source_entities,
            chunks=chunks,
            provider=provider,
            store=store,
            config=config,
            extract_concepts=source_extract,
            written_concept_slugs=written_concept_slugs,
        )
        pages.extend(source_pages)

    _rewrite_links_across_wiki(entities, config)
    log.info("Generated %d batched wiki pages", len(pages))
    return pages
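
# Minimal call sketch (hedged; entity extraction normally happens upstream, as
# run_full_build below does, and ``chunks`` stands in for the SearchChunk list
# gathered from the store):
#
#     svc = get_services()
#     extractor = get_entity_extractor(cfg.wiki_entity_mode, svc.provider, cfg)
#     pages = build_wiki(extractor.extract(chunks), svc.provider, svc.store)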

class WikiBuildSummary(TypedDict):
    """Result of a full wiki build/update."""

    paths: list[str]
    entities: int
    count: int
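
# Shape illustration (hypothetical values):
# {"paths": ["wiki/entities/henry-ford.md"], "entities": 14, "count": 1}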

def run_full_build(config: Config | None = None) -> WikiBuildSummary:
    """Extract entities and build wiki pages for every ingested source."""
    if config is None:
        config = cfg
    svc = get_services()
    chunks: list[SearchChunk] = []
    for record in svc.store.get_sources():
        chunks.extend(svc.store.get_chunks_by_source(record["filename"]))

    extractor = get_entity_extractor(config.wiki_entity_mode, svc.provider, config)
    entities = extractor.extract(chunks)
    pages = build_wiki(
        entities,
        svc.provider,
        svc.store,
        config,
        extract_concepts=config.wiki_extract_concepts,
    )
    update_wiki_index()
    append_wiki_log(WikiLogAction.BUILD, f"{len(pages)} pages from {len(entities)} entities")
    return {
        "paths": [str(p) for p in pages],
        "entities": len(entities),
        "count": len(pages),
    }
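
# Typical invocation sketch; config=None (the default) falls back to the
# module-level ``cfg``:
#
#     summary = run_full_build()
#     log.info("built %d pages from %d entities", summary["count"], summary["entities"])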

class WikiSynthesizeSummary(TypedDict):
    """Result of running synthesis-page generation."""

    paths: list[str]
    count: int

def run_full_synthesize(config: Config | None = None) -> WikiSynthesizeSummary:
    """Generate synthesis pages for cross-source clusters."""
    if config is None:
        config = cfg
    svc = get_services()
    paths = generate_synthesis_pages(svc.provider, svc.store, svc.clusterer, config)
    return {
        "paths": [str(p) for p in paths],
        "count": len(paths),
    }