Coverage for src / lilbee / retrieval / concepts / nlp.py: 100%

35 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-05-15 20:55 +0000

1"""spaCy-backed NLP helpers for the concept graph.""" 

2 

3from __future__ import annotations 

4 

5import logging 

6from typing import Any 

7 

8from lilbee.core.text import is_valid_label 

9 

10log = logging.getLogger(__name__) 

11 

12 

def concepts_available() -> bool:
    """Report whether the optional concept-graph dependencies are importable.

    Returns ``True`` only when both ``graspologic_native`` and ``spacy``
    can be imported; ``False`` otherwise.
    """
    try:
        import graspologic_native  # noqa: F401
        import spacy  # noqa: F401
    except ImportError:
        return False
    else:
        return True

22 

23 

24def _ensure_spacy_model() -> Any: 

25 """Load the spaCy NER model; raise ImportError with an install hint if missing.""" 

26 import spacy 

27 

28 model_name = "en_core_web_sm" 

29 try: 

30 return spacy.load(model_name) 

31 except OSError as exc: 

32 raise ImportError( 

33 f"spaCy model {model_name!r} not installed. Run: python -m spacy download {model_name}" 

34 ) from exc 

35 

36 

def load_spacy_pipeline() -> Any:
    """Return the shared spaCy NER + noun-chunk pipeline.

    Thin public wrapper over the module-private loader. Raises
    ``ImportError`` if spaCy or the ``en_core_web_sm`` model is not
    installed.
    """
    pipeline = _ensure_spacy_model()
    return pipeline

44 

45 

def _filter_noun_chunks(doc: Any, max_concepts: int) -> list[str]:
    """Extract deduplicated, filtered noun chunks from a spaCy doc.

    Applies the same :func:`is_valid_label` gate the wiki entity
    extractor uses, so structural-noise concepts (markdown table
    delimiters, page-number-prefixed tokens, sub-three-char fragments)
    never enter the co-occurrence graph and therefore never become a
    synthesis-page cluster label.

    The gate runs on the lowercased form here while the NER extractor
    gates on the original-cased surface; the two decisions match
    because ``is_valid_label`` is case-agnostic today. Any future
    case-sensitive rule must land in both call sites together.

    Args:
        doc: A parsed spaCy ``Doc`` exposing ``noun_chunks``.
        max_concepts: Upper bound on the number of concepts returned.

    Returns:
        Up to ``max_concepts`` unique, lowercased noun-chunk strings,
        in document order.
    """
    seen: set[str] = set()
    concepts: list[str] = []
    for chunk in doc.noun_chunks:
        # Check the cap BEFORE appending: the original post-append check
        # let one concept slip through when max_concepts <= 0.
        if len(concepts) >= max_concepts:
            break
        concept = chunk.text.lower().strip()
        if not is_valid_label(concept):
            continue
        if concept in seen:
            continue
        seen.add(concept)
        concepts.append(concept)
    return concepts