Coverage for src / lilbee / wiki / entity_extractor / ner_concepts.py: 100%
89 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-05-15 20:55 +0000
"""spaCy NER entity extractor (default strategy).

Produces typed NER entities only. LLM-curated concept pages are
proposed downstream by the per-source batched call in
:mod:`lilbee.wiki.generation`.
"""
8from __future__ import annotations
10import functools
11import logging
12import re
13from typing import TYPE_CHECKING, Any
15from lilbee.core.text import is_valid_label, make_slug
16from lilbee.wiki.entity_extractor.base import (
17 ChunkRef,
18 EntityKind,
19 ExtractedEntity,
20)
22if TYPE_CHECKING:
23 from lilbee.core.config import Config
24 from lilbee.data.store import SearchChunk
25 from lilbee.providers.base import LLMProvider
27log = logging.getLogger(__name__)
29_WHITESPACE_RE = re.compile(r"\s+")
31# Pre-spaCy markdown-noise strippers. Compiled once at module scope so
32# the extractor's hot path does not recompile them per chunk. Match on
33# line boundaries via re.MULTILINE; each sub() empties the matched
34# line so downstream line-joins collapse the hole to a single newline.
35_TABLE_ROW_RE = re.compile(r"^\|.*\|\s*$", re.MULTILINE)
36_PAGE_NUMBER_RE = re.compile(r"^\s*\d{1,4}\s*$", re.MULTILINE)
37_NAV_CHROME_RE = re.compile(
38 r"^\s*(?:Home|Menu|Navigation|Edit this page|Jump to navigation|Jump to search)\s*$",
39 re.MULTILINE,
40)
def _normalize(text: str) -> str:
    """Return the dedup key for *text*: trimmed, lowercased, single-spaced."""
    key = text.strip().lower()
    return _WHITESPACE_RE.sub(" ", key)
def pre_clean_for_ner(text: str) -> str:
    """Scrub markdown-structural noise out of *text* before spaCy sees it.

    Drops whole-line markdown-table rows (``| Designer | Irv ... |``),
    standalone page-number lines left over from PDF extraction (``42``),
    and Wikipedia / CMS navigation chrome (``Edit this page``). Prose is
    never modified: each pattern anchors to a complete line and leaves an
    empty line behind, which spaCy interprets as a sentence boundary.

    Only the noise shapes actually seen in the bb-8b7s QA corpus are
    targeted here; a real markdown parser is deferred until the simple
    regex pass stops being good enough.
    """
    for pattern in (_TABLE_ROW_RE, _PAGE_NUMBER_RE, _NAV_CHROME_RE):
        text = pattern.sub("", text)
    return text
class NerConceptsExtractor:
    """Emit typed NER entities (``EntityKind.ENTITY`` only).

    LLM-curated concept pages are produced downstream by the per-source
    batched call in :mod:`lilbee.wiki.generation`.
    """

    def __init__(self, provider: LLMProvider, config: Config) -> None:
        self._provider = provider
        self._config = config

    def extract(self, chunks: list[SearchChunk]) -> list[ExtractedEntity]:
        """Run spaCy NER over *chunks* and return deduplicated entities."""
        if not chunks:
            return []
        nlp = _load_spacy()
        if nlp is None:
            # spaCy missing: extraction is disabled, not an error.
            return []

        aggregates: dict[str, _Aggregate] = {}
        allowed = self._config.concept_allowed_ent_types

        debug_on = log.isEnabledFor(logging.DEBUG)
        # Per-stage drop counters, reported once per extract() call.
        funnel: dict[str, int] = {
            "raw_ents": 0,
            "type_filter_dropped": 0,
            "label_sanity_dropped_entities": 0,
            "kept_entity_surfaces": 0,
        }
        # nlp.pipe streams the pre-cleaned texts; strict zip keeps the
        # doc/chunk pairing honest if spaCy ever drops a document.
        docs = nlp.pipe(pre_clean_for_ner(c.chunk) for c in chunks)
        for chunk, doc in zip(chunks, docs, strict=True):
            where = ChunkRef(source=chunk.source, chunk_index=chunk.chunk_index)
            _accumulate_doc_entities(doc, where, aggregates, allowed, funnel, debug_on)

        if debug_on:
            log.debug(
                "ner funnel: raw_ents=%(raw_ents)d "
                "type_filter_dropped=%(type_filter_dropped)d "
                "label_sanity_dropped_entities=%(label_sanity_dropped_entities)d "
                "kept_entity_surfaces=%(kept_entity_surfaces)d",
                funnel,
            )

        min_mentions = self._config.wiki_entity_min_mentions
        kept = [
            record
            for agg in aggregates.values()
            if (record := _make_record(agg, EntityKind.ENTITY, min_mentions)) is not None
        ]
        kept.sort(key=lambda e: (e.kind.value, e.slug))
        return kept
def _accumulate_doc_entities(
    doc: Any,
    ref: ChunkRef,
    entity_records: dict[str, _Aggregate],
    allowed_ent_types: set[str] | frozenset[str],
    funnel: dict[str, int],
    debug_enabled: bool,
) -> None:
    """Fold one spaCy doc's entities into ``entity_records`` (mutated in place)."""
    for span in doc.ents:
        funnel["raw_ents"] += 1
        ent_type = span.label_
        if ent_type not in allowed_ent_types:
            funnel["type_filter_dropped"] += 1
            continue
        surface = span.text.strip()
        if not is_valid_label(surface):
            funnel["label_sanity_dropped_entities"] += 1
            if debug_enabled:
                log.debug("label-sanity: rejected entity %r", surface)
            continue
        # Case/whitespace variants of the same surface share one aggregate;
        # the first-seen spelling wins as the display label.
        agg = entity_records.setdefault(
            _normalize(surface), _Aggregate(label=surface, type_hint=ent_type)
        )
        agg.refs.add(ref)
        funnel["kept_entity_surfaces"] += 1
147class _Aggregate:
148 """Mutable accumulator used only while folding per-chunk hits."""
150 __slots__ = ("label", "refs", "type_hint")
152 def __init__(self, label: str, type_hint: str) -> None:
153 self.label = label
154 self.type_hint = type_hint
155 self.refs: set[ChunkRef] = set()
158def _sorted_refs(refs: set[ChunkRef]) -> tuple[ChunkRef, ...]:
159 return tuple(sorted(refs, key=lambda r: (r.source, r.chunk_index)))
def _make_record(agg: _Aggregate, kind: EntityKind, min_mentions: int) -> ExtractedEntity | None:
    """Promote an aggregate to an ``ExtractedEntity``, or reject it.

    Aggregates seen in fewer than *min_mentions* chunks are dropped, as
    are labels whose slug cleans to the empty string (e.g. punctuation-only
    labels) — without the empty-slug guard the wiki writer would try to
    create a file literally named ``.md`` on disk.
    """
    if len(agg.refs) < min_mentions:
        return None
    if not (slug := make_slug(agg.label)):
        return None
    return ExtractedEntity(
        slug=slug,
        kind=kind,
        label=agg.label,
        type_hint=agg.type_hint,
        chunk_refs=_sorted_refs(agg.refs),
    )
@functools.cache
def _load_spacy() -> Any | None:
    """Load the shared spaCy pipeline, or return None if unavailable.

    Cached so the "spaCy unavailable" warning fires at most once per process.
    Without the cache, every chunk-extract call repeats the warning; on a
    corpus with 1 000 chunks the user got 1 000 identical lines.
    """
    try:
        from lilbee.retrieval.concepts import load_spacy_pipeline
    except ImportError:
        # Name the module actually imported (lilbee.retrieval.concepts),
        # not "lilbee.concepts", so operators grep the right package.
        log.warning("Entity extraction disabled: lilbee.retrieval.concepts unavailable")
        return None
    try:
        return load_spacy_pipeline()
    except ImportError:
        log.warning("Entity extraction disabled: spaCy model unavailable")
        return None