Coverage for src/lilbee/wiki/entity_extractor/ner_concepts.py: 100%

89 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-05-15 20:55 +0000

1"""spaCy NER entity extractor (default strategy). 

2 

3Produces typed NER entities only. LLM-curated concept pages are 

4proposed downstream by the per-source batched call in 

5:mod:`lilbee.wiki.generation`. 

6""" 

7 

8from __future__ import annotations 

9 

10import functools 

11import logging 

12import re 

13from typing import TYPE_CHECKING, Any 

14 

15from lilbee.core.text import is_valid_label, make_slug 

16from lilbee.wiki.entity_extractor.base import ( 

17 ChunkRef, 

18 EntityKind, 

19 ExtractedEntity, 

20) 

21 

22if TYPE_CHECKING: 

23 from lilbee.core.config import Config 

24 from lilbee.data.store import SearchChunk 

25 from lilbee.providers.base import LLMProvider 

26 

27log = logging.getLogger(__name__) 

28 

29_WHITESPACE_RE = re.compile(r"\s+") 

30 

31# Pre-spaCy markdown-noise strippers. Compiled once at module scope so 

32# the extractor's hot path does not recompile them per chunk. Match on 

33# line boundaries via re.MULTILINE; each sub() empties the matched 

34# line so downstream line-joins collapse the hole to a single newline. 

35_TABLE_ROW_RE = re.compile(r"^\|.*\|\s*$", re.MULTILINE) 

36_PAGE_NUMBER_RE = re.compile(r"^\s*\d{1,4}\s*$", re.MULTILINE) 

37_NAV_CHROME_RE = re.compile( 

38 r"^\s*(?:Home|Menu|Navigation|Edit this page|Jump to navigation|Jump to search)\s*$", 

39 re.MULTILINE, 

40) 

41 

42 

43def _normalize(text: str) -> str: 

44 """Lowercase, strip, and collapse internal whitespace for dedup keys.""" 

45 return _WHITESPACE_RE.sub(" ", text.strip().lower()) 

46 

47 

48def pre_clean_for_ner(text: str) -> str: 

49 """Strip markdown-structural noise before handing text to spaCy. 

50 

51 Removes whole-line markdown-table rows (``| Designer | Irv ... |``), 

52 standalone page-number lines from PDF extraction (``42``), and 

53 Wikipedia / CMS navigation chrome (``Edit this page``). Leaves 

54 prose untouched: every regex anchors to a full line and emits an 

55 empty line in place of the match, which spaCy treats as a sentence 

56 break. 

57 

58 Only targets the noise patterns actually observed in the bb-8b7s 

59 QA corpus. Fuller markdown parsing is deferred; a regex pre-clean 

60 is sufficient for the current signal-to-noise ratio. 

61 """ 

62 text = _TABLE_ROW_RE.sub("", text) 

63 text = _PAGE_NUMBER_RE.sub("", text) 

64 return _NAV_CHROME_RE.sub("", text) 

65 

66 

class NerConceptsExtractor:
    """Emit typed NER entities (``EntityKind.ENTITY`` only).

    LLM-curated concept pages are produced downstream by the per-source
    batched call in :mod:`lilbee.wiki.generation`.
    """

    def __init__(self, provider: LLMProvider, config: Config) -> None:
        self._provider = provider
        self._config = config

    def extract(self, chunks: list[SearchChunk]) -> list[ExtractedEntity]:
        """Run spaCy NER over *chunks* and return deduplicated, sorted entities."""
        if not chunks:
            return []
        pipeline = _load_spacy()
        if pipeline is None:
            # spaCy (or the shared pipeline module) is unavailable; the
            # cached loader has already logged a warning.
            return []

        aggregates: dict[str, _Aggregate] = {}
        allowed_types = self._config.concept_allowed_ent_types
        debug_enabled = log.isEnabledFor(logging.DEBUG)

        # Drop-reason counters for the debug funnel log below. Keys must
        # stay in sync with the %(...)d placeholders in the log line.
        funnel = {
            "raw_ents": 0,
            "type_filter_dropped": 0,
            "label_sanity_dropped_entities": 0,
            "kept_entity_surfaces": 0,
        }

        # Lazily pre-clean each chunk; nlp.pipe streams the generator.
        cleaned = (pre_clean_for_ner(chunk.chunk) for chunk in chunks)
        for source_chunk, doc in zip(chunks, pipeline.pipe(cleaned), strict=True):
            ref = ChunkRef(source=source_chunk.source, chunk_index=source_chunk.chunk_index)
            _accumulate_doc_entities(
                doc, ref, aggregates, allowed_types, funnel, debug_enabled
            )

        if debug_enabled:
            log.debug(
                "ner funnel: raw_ents=%(raw_ents)d "
                "type_filter_dropped=%(type_filter_dropped)d "
                "label_sanity_dropped_entities=%(label_sanity_dropped_entities)d "
                "kept_entity_surfaces=%(kept_entity_surfaces)d",
                funnel,
            )

        min_mentions = self._config.wiki_entity_min_mentions
        results = [
            record
            for agg in aggregates.values()
            if (record := _make_record(agg, EntityKind.ENTITY, min_mentions)) is not None
        ]
        # Deterministic output order: kind first, then slug.
        results.sort(key=lambda entity: (entity.kind.value, entity.slug))
        return results

119 

120 

def _accumulate_doc_entities(
    doc: Any,
    ref: ChunkRef,
    entity_records: dict[str, _Aggregate],
    allowed_ent_types: set[str] | frozenset[str],
    funnel: dict[str, int],
    debug_enabled: bool,
) -> None:
    """Fold one spaCy doc's entities into ``entity_records`` (mutated in place)."""
    for span in doc.ents:
        funnel["raw_ents"] += 1
        ent_type = span.label_
        if ent_type not in allowed_ent_types:
            # Entity type not configured for extraction (e.g. CARDINAL).
            funnel["type_filter_dropped"] += 1
            continue
        surface = span.text.strip()
        if not is_valid_label(surface):
            funnel["label_sanity_dropped_entities"] += 1
            if debug_enabled:
                log.debug("label-sanity: rejected entity %r", surface)
            continue
        # Dedup on the normalized surface; the first-seen casing wins as label.
        agg = entity_records.setdefault(
            _normalize(surface), _Aggregate(label=surface, type_hint=ent_type)
        )
        agg.refs.add(ref)
        funnel["kept_entity_surfaces"] += 1

145 

146 

147class _Aggregate: 

148 """Mutable accumulator used only while folding per-chunk hits.""" 

149 

150 __slots__ = ("label", "refs", "type_hint") 

151 

152 def __init__(self, label: str, type_hint: str) -> None: 

153 self.label = label 

154 self.type_hint = type_hint 

155 self.refs: set[ChunkRef] = set() 

156 

157 

158def _sorted_refs(refs: set[ChunkRef]) -> tuple[ChunkRef, ...]: 

159 return tuple(sorted(refs, key=lambda r: (r.source, r.chunk_index))) 

160 

161 

def _make_record(agg: _Aggregate, kind: EntityKind, min_mentions: int) -> ExtractedEntity | None:
    """Turn an aggregate into an ``ExtractedEntity`` or drop it.

    Filters records below the mention threshold and records whose label
    slug-cleans to an empty string (e.g. labels of only punctuation);
    without the empty-slug guard those would try to write files named
    just ``.md`` on disk.
    """
    mention_count = len(agg.refs)
    if mention_count < min_mentions:
        return None
    slug = make_slug(agg.label)
    if not slug:
        # Label slugged away to nothing (punctuation-only, etc.).
        return None
    return ExtractedEntity(
        slug=slug,
        kind=kind,
        label=agg.label,
        type_hint=agg.type_hint,
        chunk_refs=_sorted_refs(agg.refs),
    )

182 

183 

@functools.cache
def _load_spacy() -> Any | None:
    """Load the shared spaCy pipeline, or return None if unavailable.

    Cached so the "spaCy unavailable" warning fires at most once per process.
    Without the cache, every chunk-extract call repeats the warning; on a
    corpus with 1 000 chunks the user got 1 000 identical lines.
    """
    try:
        from lilbee.retrieval.concepts import load_spacy_pipeline
    except ImportError:
        # Name the module path actually imported above; the previous
        # message pointed at "lilbee.concepts", which is not what failed.
        log.warning("Entity extraction disabled: lilbee.retrieval.concepts unavailable")
        return None
    try:
        return load_spacy_pipeline()
    except ImportError:
        log.warning("Entity extraction disabled: spaCy model unavailable")
        return None