Coverage for src / lilbee / retrieval / concepts / nlp.py: 100%

35 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-05-15 20:55 +0000

1"""spaCy-backed NLP helpers for the concept graph.""" 

2 

3from __future__ import annotations 

4 

5import logging 

6from typing import Any 

7 

8from lilbee.core.text import is_valid_label 

9 

10log = logging.getLogger(__name__) 

11 

12 

def concepts_available() -> bool:
    """Report whether the optional concept-graph dependencies are importable.

    Returns ``True`` only when both ``graspologic_native`` and ``spacy``
    can be imported; ``False`` otherwise.
    """
    try:
        import graspologic_native  # noqa: F401
        import spacy  # noqa: F401
    except ImportError:
        return False
    else:
        return True

22 

23 

24def _ensure_spacy_model() -> Any: 

25 """Load the spaCy NER model; raise ImportError with an install hint if missing.""" 

26 import spacy 

27 

28 model_name = "en_core_web_sm" 

29 try: 

30 return spacy.load(model_name) 

31 except OSError as exc: 

32 raise ImportError( 

33 f"spaCy model {model_name!r} not installed. Run: python -m spacy download {model_name}" 

34 ) from exc 

35 

36 

def load_spacy_pipeline() -> Any:
    """Return the shared spaCy NER + noun-chunk pipeline.

    Thin public wrapper over the module-private loader. Raises
    ``ImportError`` if spaCy or the ``en_core_web_sm`` model is not
    installed.
    """
    pipeline = _ensure_spacy_model()
    return pipeline

44 

45 

def _filter_noun_chunks(doc: Any, max_concepts: int) -> list[str]:
    """Extract deduplicated, filtered noun chunks from a spaCy doc.

    Applies the same :func:`is_valid_label` gate the wiki entity
    extractor uses, so structural-noise concepts (markdown table
    delimiters, page-number-prefixed tokens, sub-three-char fragments)
    never enter the co-occurrence graph and therefore never become a
    synthesis-page cluster label.

    The gate runs on the lowercased form here while the NER extractor
    gates on the original-cased surface; the two decisions match
    because ``is_valid_label`` is case-agnostic today. Any future
    case-sensitive rule must land in both call sites together.

    Args:
        doc: A parsed spaCy ``Doc`` exposing ``noun_chunks``.
        max_concepts: Upper bound on the number of concepts returned.

    Returns:
        Up to ``max_concepts`` unique, lowercased noun-chunk strings,
        in document order.
    """
    seen: set[str] = set()
    concepts: list[str] = []
    for chunk in doc.noun_chunks:
        # Check the cap BEFORE appending: the original post-append check
        # let one concept slip through when max_concepts <= 0.
        if len(concepts) >= max_concepts:
            break
        concept = chunk.text.lower().strip()
        if not is_valid_label(concept):
            continue
        if concept in seen:
            continue
        seen.add(concept)
        concepts.append(concept)
    return concepts