1"""Top-level wiki build orchestrators.
3Two public entry points live here:
5- :func:`build_wiki` produces entity and LLM-curated concept pages
6 per source, runs the one-time legacy-concept-page archival first,
7 then rewrites ``[[link]]`` slugs across all wiki content subdirs.
8- :func:`generate_synthesis_pages` produces cross-source synthesis
9 pages from concept clusters spanning 3+ documents.
11Both reuse the per-source batch path and the single-page pipeline
12from :mod:`lilbee.wiki.synthesis` and :mod:`lilbee.wiki.page`.
13"""

from __future__ import annotations

import logging
from pathlib import Path
from typing import TypedDict

from lilbee.app.services import get_services
from lilbee.core.config import Config, cfg
from lilbee.data.store import SearchChunk, Store
from lilbee.providers.base import LLMProvider
from lilbee.retrieval.clustering import SourceClusterer
from lilbee.wiki.batch import archive_legacy_concept_pages
from lilbee.wiki.entity_extractor import ExtractedEntity, get_entity_extractor
from lilbee.wiki.index import append_wiki_log, update_wiki_index
from lilbee.wiki.links import apply_rewriter, compile_rewriter
from lilbee.wiki.shared import (
    MIN_CLUSTER_SOURCES,
    WIKI_CONTENT_SUBDIRS,
    WikiLogAction,
    WikiSubdir,
)
from lilbee.wiki.synthesis import (
    generate_source_batch,
    generate_synthesis_page,
    group_entities_by_primary_source,
)

log = logging.getLogger(__name__)

_ENTITY_LIKE_SUBDIRS: tuple[str, ...] = (WikiSubdir.CONCEPTS, WikiSubdir.ENTITIES)


def _generate_for_cluster(
    label: str,
    sources: frozenset[str],
    provider: LLMProvider,
    store: Store,
    config: Config,
) -> Path | None:
    """Gather chunks for a cluster and generate a synthesis page."""
    source_names = sorted(sources)
    chunks_by_source: dict[str, list] = {}
    for name in source_names:
        chunks = store.get_chunks_by_source(name)
        if chunks:
            chunks_by_source[name] = chunks
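
    # The floor is re-checked here even though the clusterer already filters
    # by min_sources: a cluster labeled for, say, three sources can still
    # come up short when one of them has no stored chunks, and such clusters
    # are skipped rather than synthesized from too few documents.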
    if len(chunks_by_source) < MIN_CLUSTER_SOURCES:
        return None

    return generate_synthesis_page(label, source_names, chunks_by_source, provider, store, config)


def generate_synthesis_pages(
    provider: LLMProvider,
    store: Store,
    clusterer: SourceClusterer,
    config: Config | None = None,
) -> list[Path]:
    """Generate synthesis pages for source clusters spanning 3+ documents."""
    if config is None:
        config = cfg

    clusters = clusterer.get_clusters(min_sources=MIN_CLUSTER_SOURCES)
    if not clusters:
        log.info("No source clusters span %d+ sources, skipping synthesis", MIN_CLUSTER_SOURCES)
        return []

    pages: list[Path] = []
    for cluster in clusters:
        page = _generate_for_cluster(cluster.label, cluster.sources, provider, store, config)
        if page is not None:
            pages.append(page)

    log.info("Generated %d synthesis pages", len(pages))
    return pages


def _all_sources_in_scope(
    entities: list[ExtractedEntity],
    grouped: dict[str, list[ExtractedEntity]],
    store: Store,
    config: Config,
    extract_concepts: bool,
) -> set[str]:
100 """Union of sources with entities and (when enabled) eligible for concept curation.
102 Seed the union with every entity's primary source. When
103 ``extract_concepts`` is True AND ``wiki_batch_min_chunks`` is
104 satisfied, add any source in the store that passes the floor.
105 This gives concept-only sources (no extracted entities) their
106 chance at curation while keeping zero-entity short sources
107 skipped entirely.
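
    A worked example (hypothetical names and counts): with entities
    grouped under ``{"ford.md"}`` and store records for ``ford.md``
    (12 chunks), ``notes.md`` (8 chunks), and ``stub.md`` (1 chunk),
    a floor of 4 yields ``{"ford.md", "notes.md"}``; ``stub.md`` has
    no entities and misses the floor, so it stays out of scope.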
108 """
109 sources: set[str] = set(grouped)
110 if not extract_concepts:
111 return sources
112 try:
113 records = store.get_sources()
114 except Exception as exc:
115 log.warning("get_sources failed; sticking to entity-grouped sources: %s", exc)
116 return sources
117 for record in records:
118 name = record.get("filename", "") if isinstance(record, dict) else ""
119 if not name:
120 continue
121 if name in sources:
122 continue
123 chunk_count = record.get("chunk_count", 0) if isinstance(record, dict) else 0
124 if chunk_count >= config.wiki_batch_min_chunks:
125 sources.add(name)
126 _ = entities # silences linters on unused pass-through; kept for doc clarity
127 return sources


def _entity_surface_map(entities: list[ExtractedEntity]) -> dict[str, str]:
    """Build the surface-form -> slug map for the ``[[link]]`` rewriter.

    Includes both the entity's human label (e.g. *"Henry Ford"*) and
    the slug-with-hyphens-as-spaces variant (*"henry ford"*) so the
    rewriter catches either form in body text.
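
    A minimal doctest-style sketch; ``SimpleNamespace`` stands in for
    :class:`ExtractedEntity` since only ``label`` and ``slug`` are read:

        >>> from types import SimpleNamespace
        >>> e = SimpleNamespace(label="Henry Ford", slug="henry-ford")
        >>> _entity_surface_map([e])
        {'Henry Ford': 'henry-ford', 'henry ford': 'henry-ford'}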
136 """
137 mapping: dict[str, str] = {}
138 for entity in entities:
139 mapping[entity.label] = entity.slug
140 spaced = entity.slug.replace("-", " ")
141 if spaced and spaced != entity.label:
142 mapping[spaced] = entity.slug
143 return mapping


def _augment_surface_map_with_existing_pages(
    surface_to_slug: dict[str, str], wiki_root: Path
) -> None:
    """Add slugs for pages already on disk so incremental rebuilds keep linking.

    **Mutates ``surface_to_slug`` in place.** An incremental rebuild of
    one concept can then still link to its unchanged neighbors. Only the
    hyphen-to-space surface form is added, because frontmatter labels
    aren't read here; body prose typically uses the spaced form, so this
    covers the common case.
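
    For example (hypothetical layout): a page at
    ``wiki/concepts/assembly-line.md`` contributes
    ``"assembly line" -> "assembly-line"``, but only via
    :meth:`dict.setdefault`, so entries derived from freshly
    extracted entities always take precedence.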
155 """
156 for subdir in _ENTITY_LIKE_SUBDIRS:
157 subdir_path = wiki_root / subdir
158 if not subdir_path.is_dir():
159 continue
160 for md_path in subdir_path.rglob("*.md"):
161 slug = md_path.stem
162 spaced = slug.replace("-", " ")
163 surface_to_slug.setdefault(spaced, slug)


def _rewrite_links_across_wiki(entities: list[ExtractedEntity], config: Config) -> None:
    """Rewrite ``[[slug]]`` links on every page under ``wiki/`` content subdirs.

    A page never receives a link to itself: the rewriter takes the
    owning slug and drops it inside its match callback, so the surface
    map is shared unmodified across every page in the walk (no O(M)
    dict rebuild per file). The map is augmented with slugs from the
    existing on-disk corpus so a touched page still links to untouched
    neighbors. The alternation regex + lookup are compiled once per
    build and reused across pages.
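
    In miniature, the compile-once / apply-many shape (the slug and
    ``text`` here are hypothetical)::

        rewriter = compile_rewriter({"henry ford": "henry-ford"})
        if rewriter is not None:
            # On henry-ford.md itself, skip_slug suppresses the self-link.
            text = apply_rewriter(text, rewriter, skip_slug="henry-ford")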
176 """
177 surface_to_slug = _entity_surface_map(entities)
178 wiki_root = config.data_root / config.wiki_dir
179 _augment_surface_map_with_existing_pages(surface_to_slug, wiki_root)
180 rewriter = compile_rewriter(surface_to_slug)
181 if rewriter is None:
182 return
184 for subdir in WIKI_CONTENT_SUBDIRS:
185 subdir_path = wiki_root / subdir
186 if not subdir_path.is_dir():
187 continue
188 is_entity_subdir = subdir in _ENTITY_LIKE_SUBDIRS
189 for md_path in subdir_path.rglob("*.md"):
190 owning_slug = md_path.stem if is_entity_subdir else None
191 original = md_path.read_text(encoding="utf-8")
192 rewritten = apply_rewriter(original, rewriter, skip_slug=owning_slug)
193 if rewritten != original:
194 md_path.write_text(rewritten, encoding="utf-8")


def build_wiki(
    entities: list[ExtractedEntity],
    provider: LLMProvider,
    store: Store,
    config: Config | None = None,
    *,
    extract_concepts: bool = True,
) -> list[Path]:
205 """Produce entity and LLM-curated concept pages per source.
207 Per-entity / per-concept fan-out is collapsed into a per-source
208 batched call: for each source in ``entities``' chunk refs, one LLM
209 call identifies 3-5 concepts AND writes a wiki section for every
210 pre-extracted entity belonging to that source. Output sections are
211 split, citation-verified, embedding-scored, and landed under
212 ``wiki/entities/`` or ``wiki/concepts/`` depending on kind.
214 ``extract_concepts=False`` (used by the incremental-ingest hook)
215 drops the concept-curation paragraph from the prompt so a
216 touched source does not churn concept slugs.
218 A one-time archive migration runs first (idempotently, gated by
219 ``{data_dir}/.phase-d-migrated``), moving legacy concept pages
220 under ``wiki/archive/concepts/`` and unwrapping stale
221 ``[[archived-slug]]`` links across the remaining pages.
222 """
    if config is None:
        config = cfg
    wiki_root = config.data_root / config.wiki_dir
    archive_legacy_concept_pages(wiki_root, config.data_dir)

    grouped = group_entities_by_primary_source(entities)
    all_sources = _all_sources_in_scope(entities, grouped, store, config, extract_concepts)
    written_concept_slugs: dict[str, str] = {}
    pages: list[Path] = []

    for source in sorted(all_sources):
        source_entities = grouped.get(source, [])
        chunks = store.get_chunks_by_source(source)
        chunk_count = len(chunks)
        source_extract = extract_concepts and chunk_count >= config.wiki_batch_min_chunks
        if not source_entities and not source_extract:
            log.info(
                "Skipping source %s: %d entities, %d chunks, min=%d, extract=%s",
                source,
                len(source_entities),
                chunk_count,
                config.wiki_batch_min_chunks,
                source_extract,
            )
            continue
        source_pages = generate_source_batch(
            source=source,
            entities=source_entities,
            chunks=chunks,
            provider=provider,
            store=store,
            config=config,
            extract_concepts=source_extract,
            written_concept_slugs=written_concept_slugs,
        )
        pages.extend(source_pages)

    _rewrite_links_across_wiki(entities, config)
    log.info("Generated %d batched wiki pages", len(pages))
    return pages


class WikiBuildSummary(TypedDict):
    """Result of a full wiki build/update."""

    paths: list[str]
    entities: int
    count: int
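
# Shape sketch for WikiBuildSummary (hypothetical values):
# {"paths": ["wiki/entities/henry-ford.md"], "entities": 12, "count": 9}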


def run_full_build(config: Config | None = None) -> WikiBuildSummary:
    """Extract entities and build wiki pages for every ingested source."""
    if config is None:
        config = cfg
    svc = get_services()
    chunks: list[SearchChunk] = []
    for record in svc.store.get_sources():
        chunks.extend(svc.store.get_chunks_by_source(record["filename"]))

    extractor = get_entity_extractor(config.wiki_entity_mode, svc.provider, config)
    entities = extractor.extract(chunks)
    pages = build_wiki(
        entities,
        svc.provider,
        svc.store,
        config,
        extract_concepts=config.wiki_extract_concepts,
    )
    update_wiki_index()
    append_wiki_log(WikiLogAction.BUILD, f"{len(pages)} pages from {len(entities)} entities")
    return {
        "paths": [str(p) for p in pages],
        "entities": len(entities),
        "count": len(pages),
    }


class WikiSynthesizeSummary(TypedDict):
    """Result of running synthesis-page generation."""

    paths: list[str]
    count: int


def run_full_synthesize(config: Config | None = None) -> WikiSynthesizeSummary:
    """Generate synthesis pages for cross-source clusters."""
    if config is None:
        config = cfg
    svc = get_services()
    paths = generate_synthesis_pages(svc.provider, svc.store, svc.clusterer, config)
    return {
        "paths": [str(p) for p in paths],
        "count": len(paths),
    }