Coverage for src/lilbee/wiki/entity_extractor/ner_concepts.py: 100%

89 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-05-15 20:55 +0000

1"""spaCy NER entity extractor (default strategy). 

2 

3Produces typed NER entities only. LLM-curated concept pages are 

4proposed downstream by the per-source batched call in 

5:mod:`lilbee.wiki.generation`. 

6""" 

7 

8from __future__ import annotations 

9 

10import functools 

11import logging 

12import re 

13from typing import TYPE_CHECKING, Any 

14 

15from lilbee.core.text import is_valid_label, make_slug 

16from lilbee.wiki.entity_extractor.base import ( 

17 ChunkRef, 

18 EntityKind, 

19 ExtractedEntity, 

20) 

21 

22if TYPE_CHECKING: 

23 from lilbee.core.config import Config 

24 from lilbee.data.store import SearchChunk 

25 from lilbee.providers.base import LLMProvider 

26 

27log = logging.getLogger(__name__) 

28 

29_WHITESPACE_RE = re.compile(r"\s+") 

30 

31# Pre-spaCy markdown-noise strippers. Compiled once at module scope so 

32# the extractor's hot path does not recompile them per chunk. Match on 

33# line boundaries via re.MULTILINE; each sub() empties the matched 

34# line so downstream line-joins collapse the hole to a single newline. 

35_TABLE_ROW_RE = re.compile(r"^\|.*\|\s*$", re.MULTILINE) 

36_PAGE_NUMBER_RE = re.compile(r"^\s*\d{1,4}\s*$", re.MULTILINE) 

37_NAV_CHROME_RE = re.compile( 

38 r"^\s*(?:Home|Menu|Navigation|Edit this page|Jump to navigation|Jump to search)\s*$", 

39 re.MULTILINE, 

40) 

41 

42 

43def _normalize(text: str) -> str: 

44 """Lowercase, strip, and collapse internal whitespace for dedup keys.""" 

45 return _WHITESPACE_RE.sub(" ", text.strip().lower()) 

46 

47 

48def pre_clean_for_ner(text: str) -> str: 

49 """Strip markdown-structural noise before handing text to spaCy. 

50 

51 Removes whole-line markdown-table rows (``| Designer | Irv ... |``), 

52 standalone page-number lines from PDF extraction (``42``), and 

53 Wikipedia / CMS navigation chrome (``Edit this page``). Leaves 

54 prose untouched: every regex anchors to a full line and emits an 

55 empty line in place of the match, which spaCy treats as a sentence 

56 break. 

57 

58 Only targets the noise patterns actually observed in the bb-8b7s 

59 QA corpus. Fuller markdown parsing is deferred; a regex pre-clean 

60 is sufficient for the current signal-to-noise ratio. 

61 """ 

62 text = _TABLE_ROW_RE.sub("", text) 

63 text = _PAGE_NUMBER_RE.sub("", text) 

64 return _NAV_CHROME_RE.sub("", text) 

65 

66 

class NerConceptsExtractor:
    """Emit typed NER entities (``EntityKind.ENTITY`` only).

    LLM-curated concept pages are produced downstream by the per-source
    batched call in :mod:`lilbee.wiki.generation`.
    """

    def __init__(self, provider: LLMProvider, config: Config) -> None:
        self._provider = provider
        self._config = config

    def extract(self, chunks: list[SearchChunk]) -> list[ExtractedEntity]:
        """Run spaCy NER over *chunks* and return deduplicated, sorted entities."""
        if not chunks:
            return []
        pipeline = _load_spacy()
        if pipeline is None:
            # spaCy (or the shared pipeline module) is unavailable; the
            # cached loader has already logged a warning.
            return []

        aggregates: dict[str, _Aggregate] = {}
        allowed_types = self._config.concept_allowed_ent_types
        debug_enabled = log.isEnabledFor(logging.DEBUG)

        # Drop-reason counters for the debug funnel log below. Keys must
        # stay in sync with the %(...)d placeholders in the log line.
        funnel = {
            "raw_ents": 0,
            "type_filter_dropped": 0,
            "label_sanity_dropped_entities": 0,
            "kept_entity_surfaces": 0,
        }

        # Lazily pre-clean each chunk; nlp.pipe streams the generator.
        cleaned = (pre_clean_for_ner(chunk.chunk) for chunk in chunks)
        for source_chunk, doc in zip(chunks, pipeline.pipe(cleaned), strict=True):
            ref = ChunkRef(source=source_chunk.source, chunk_index=source_chunk.chunk_index)
            _accumulate_doc_entities(
                doc, ref, aggregates, allowed_types, funnel, debug_enabled
            )

        if debug_enabled:
            log.debug(
                "ner funnel: raw_ents=%(raw_ents)d "
                "type_filter_dropped=%(type_filter_dropped)d "
                "label_sanity_dropped_entities=%(label_sanity_dropped_entities)d "
                "kept_entity_surfaces=%(kept_entity_surfaces)d",
                funnel,
            )

        min_mentions = self._config.wiki_entity_min_mentions
        results = [
            record
            for agg in aggregates.values()
            if (record := _make_record(agg, EntityKind.ENTITY, min_mentions)) is not None
        ]
        # Deterministic output order: kind first, then slug.
        results.sort(key=lambda entity: (entity.kind.value, entity.slug))
        return results

119 

120 

def _accumulate_doc_entities(
    doc: Any,
    ref: ChunkRef,
    entity_records: dict[str, _Aggregate],
    allowed_ent_types: set[str] | frozenset[str],
    funnel: dict[str, int],
    debug_enabled: bool,
) -> None:
    """Fold one spaCy doc's entities into ``entity_records`` (mutated in place)."""
    for span in doc.ents:
        funnel["raw_ents"] += 1
        ent_type = span.label_
        if ent_type not in allowed_ent_types:
            # Entity type not configured for extraction (e.g. CARDINAL).
            funnel["type_filter_dropped"] += 1
            continue
        surface = span.text.strip()
        if not is_valid_label(surface):
            funnel["label_sanity_dropped_entities"] += 1
            if debug_enabled:
                log.debug("label-sanity: rejected entity %r", surface)
            continue
        # Dedup on the normalized surface; the first-seen casing wins as label.
        agg = entity_records.setdefault(
            _normalize(surface), _Aggregate(label=surface, type_hint=ent_type)
        )
        agg.refs.add(ref)
        funnel["kept_entity_surfaces"] += 1

145 

146 

147class _Aggregate: 

148 """Mutable accumulator used only while folding per-chunk hits.""" 

149 

150 __slots__ = ("label", "refs", "type_hint") 

151 

152 def __init__(self, label: str, type_hint: str) -> None: 

153 self.label = label 

154 self.type_hint = type_hint 

155 self.refs: set[ChunkRef] = set() 

156 

157 

158def _sorted_refs(refs: set[ChunkRef]) -> tuple[ChunkRef, ...]: 

159 return tuple(sorted(refs, key=lambda r: (r.source, r.chunk_index))) 

160 

161 

def _make_record(agg: _Aggregate, kind: EntityKind, min_mentions: int) -> ExtractedEntity | None:
    """Turn an aggregate into an ``ExtractedEntity`` or drop it.

    Filters records below the mention threshold and records whose label
    slug-cleans to an empty string (e.g. labels of only punctuation);
    without the empty-slug guard those would try to write files named
    just ``.md`` on disk.
    """
    mention_count = len(agg.refs)
    if mention_count < min_mentions:
        return None
    slug = make_slug(agg.label)
    if not slug:
        # Label slugged away to nothing (punctuation-only, etc.).
        return None
    return ExtractedEntity(
        slug=slug,
        kind=kind,
        label=agg.label,
        type_hint=agg.type_hint,
        chunk_refs=_sorted_refs(agg.refs),
    )

182 

183 

@functools.cache
def _load_spacy() -> Any | None:
    """Load the shared spaCy pipeline, or return None if unavailable.

    Cached so the "spaCy unavailable" warning fires at most once per process.
    Without the cache, every chunk-extract call repeats the warning; on a
    corpus with 1 000 chunks the user got 1 000 identical lines.
    """
    try:
        from lilbee.retrieval.concepts import load_spacy_pipeline
    except ImportError:
        # Name the module path actually imported above; the previous
        # message pointed at "lilbee.concepts", which is not what failed.
        log.warning("Entity extraction disabled: lilbee.retrieval.concepts unavailable")
        return None
    try:
        return load_spacy_pipeline()
    except ImportError:
        log.warning("Entity extraction disabled: spaCy model unavailable")
        return None