Coverage for src / lilbee / retrieval / concepts / nlp.py: 100%
35 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-05-15 20:55 +0000
1"""spaCy-backed NLP helpers for the concept graph."""
3from __future__ import annotations
5import logging
6from typing import Any
8from lilbee.core.text import is_valid_label
10log = logging.getLogger(__name__)
def concepts_available() -> bool:
    """Check if concept graph dependencies (spacy, graspologic) are installed."""
    try:
        import graspologic_native  # noqa: F401
        import spacy  # noqa: F401
    except ImportError:
        return False
    return True
24def _ensure_spacy_model() -> Any:
25 """Load the spaCy NER model; raise ImportError with an install hint if missing."""
26 import spacy
28 model_name = "en_core_web_sm"
29 try:
30 return spacy.load(model_name)
31 except OSError as exc:
32 raise ImportError(
33 f"spaCy model {model_name!r} not installed. Run: python -m spacy download {model_name}"
34 ) from exc
def load_spacy_pipeline() -> Any:
    """Return the shared spaCy NER + noun-chunk pipeline (public entry point).

    Delegates to the private loader; propagates its ``ImportError`` when
    spaCy or the ``en_core_web_sm`` model is unavailable.
    """
    pipeline = _ensure_spacy_model()
    return pipeline
def _filter_noun_chunks(doc: Any, max_concepts: int) -> list[str]:
    """Collect up to ``max_concepts`` unique, validated noun chunks from *doc*.

    Each chunk is lowercased and stripped, then passed through the same
    :func:`is_valid_label` gate the wiki entity extractor applies, so
    structural noise (markdown table delimiters, page-number-prefixed
    tokens, sub-three-char fragments) is kept out of the co-occurrence
    graph and can never surface as a synthesis-page cluster label.

    The gate here sees the lowercased form while the NER extractor gates
    on the original-cased surface; both decisions agree because
    ``is_valid_label`` is case-agnostic today. Any future case-sensitive
    rule must land in both call sites together.
    """
    already_seen: set[str] = set()
    kept: list[str] = []
    for chunk in doc.noun_chunks:
        label = chunk.text.lower().strip()
        # Gate first (matching the original call order), then dedupe.
        if not is_valid_label(label) or label in already_seen:
            continue
        already_seen.add(label)
        kept.append(label)
        # >= (not ==) so a non-positive cap still stops after one append.
        if len(kept) >= max_concepts:
            break
    return kept