Coverage for src / lilbee / wiki / ingest.py: 100%

38 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-05-15 20:55 +0000

1"""Wiki post-ingest hook: regenerate pages touched by a recent sync.""" 

2 

3from __future__ import annotations 

4 

5import asyncio 

6import logging 

7 

8from lilbee.app.services import get_services 

9from lilbee.core.config import cfg 

10 

11log = logging.getLogger(__name__) 

12 

13 

async def incremental_update(changed_sources: set[str]) -> None:
    """Refresh the wiki pages affected by a sync that changed *changed_sources*.

    Entities are re-extracted from every chunk currently in the store.
    An entity is scheduled for regeneration when its page file does not
    exist yet, or when any of its chunk references points at one of the
    changed sources. If more than ``cfg.wiki_ingest_update_cap`` pages
    would be regenerated, nothing is rebuilt; a warning and a wiki-log
    entry tell the user to run the update manually instead.

    Does nothing when the wiki feature is off (``cfg.wiki`` falsy) or
    *changed_sources* is empty.
    """
    if not (cfg.wiki and changed_sources):
        return

    # Function-scope imports, kept in their original order.
    from lilbee.data.store import SearchChunk
    from lilbee.wiki import append_wiki_log, build_wiki, update_wiki_index
    from lilbee.wiki.entity_extractor import EntityKind, get_entity_extractor
    from lilbee.wiki.shared import WikiLogAction, WikiSubdir

    svc = get_services()
    extractor = get_entity_extractor(cfg.wiki_entity_mode, svc.provider, cfg)

    # Gather every chunk of every known source; extraction runs over the
    # whole corpus, not only over the changed files.
    corpus: list[SearchChunk] = [
        chunk
        for record in svc.store.get_sources()
        for chunk in svc.store.get_chunks_by_source(record["filename"])
    ]
    entities = await asyncio.to_thread(extractor.extract, corpus)

    wiki_root = cfg.data_root / cfg.wiki_dir

    def _needs_regeneration(candidate) -> bool:
        # Only ENTITY kind comes out of the extractor today; CONCEPT pages
        # are LLM-curated inside the batched call. Routing on kind anyway
        # keeps this correct if a later extractor emits CONCEPT again.
        if candidate.kind is EntityKind.CONCEPT:
            section = WikiSubdir.CONCEPTS
        else:
            section = WikiSubdir.ENTITIES
        target = wiki_root / section / f"{candidate.slug}.md"
        # A missing page always qualifies; an existing one only when it
        # cites at least one changed source.
        return not target.exists() or any(
            ref.source in changed_sources for ref in candidate.chunk_refs
        )

    touched = [candidate for candidate in entities if _needs_regeneration(candidate)]
    if not touched:
        return

    touched_count = len(touched)
    cap = cfg.wiki_ingest_update_cap
    if touched_count > cap:
        # Logged at WARNING on purpose: the default LILBEE_LOG_LEVEL is
        # WARNING, so an INFO record would be dropped and `lilbee sync`
        # would give the user no hint that the cap tripped.
        log.warning(
            "Wiki auto-update skipped: %d pages touched (cap %d). "
            "Run 'lilbee wiki update' to refresh.",
            touched_count,
            cap,
        )
        append_wiki_log(
            WikiLogAction.INGEST,
            f"skipped: {touched_count} pages exceeds cap {cap}",
        )
        return

    # Concept extraction stays off here so a routine sync does not churn
    # concept slugs; curating concepts is left to an explicit, full
    # `lilbee wiki build`.
    pages = await asyncio.to_thread(
        build_wiki,
        touched,
        svc.provider,
        svc.store,
        cfg,
        extract_concepts=False,
    )
    update_wiki_index()
    source_list = ", ".join(sorted(changed_sources))
    append_wiki_log(
        WikiLogAction.INGEST,
        f"{len(pages)} pages regenerated for {source_list}",
    )