Coverage for src / lilbee / core / text.py: 100%

25 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-05-15 20:55 +0000

1"""Label sanity checks and slug formatting.""" 

2 

3from __future__ import annotations 

4 

5import re 

6 

# Matches every character that is not allowed in a slug. ``make_slug``
# deletes these after the label has been lowercased and hyphenated.
_SLUG_CLEAN_RE = re.compile(r"[^a-z0-9-]")

# Characters that signal markdown-structural noise in a concept label.
# Single source of truth for both ``is_valid_label`` (membership check)
# and ``clean_label_for_display`` (regex strip).
_STRUCTURAL_CHARS = frozenset("|#>")
# One or more consecutive structural characters. ``re.escape`` keeps the
# character class valid whatever characters the set above contains.
_DISPLAY_STRUCTURAL_RE = re.compile(f"[{re.escape(''.join(_STRUCTURAL_CHARS))}]+")
# Any run of whitespace; ``clean_label_for_display`` replaces it with one space.
_DISPLAY_WHITESPACE_RE = re.compile(r"\s+")

# Thresholds ``is_valid_label`` applies to the whitespace-stripped label:
# the minimum length, and the minimum fraction of characters that must be
# alphanumeric (``str.isalnum``).
LABEL_SANITY_MIN_LEN = 3
LABEL_SANITY_MIN_ALNUM_RATIO = 0.5

18 

19 

def make_slug(label: str) -> str:
    """Turn a concept label into a filesystem-safe slug.

    Lowercases, collapses every run of whitespace (spaces, tabs,
    newlines) into a single hyphen, maps slashes to double hyphens
    (path encoding), strips anything outside ``[a-z0-9-]``, and trims
    leading and trailing hyphens. Returns ``""`` when no sluggable
    characters remain; callers must treat an empty slug as "skip this
    entity" so the generator never writes a file called ``.md``.

    Internal hyphen runs from the ``/`` path encoding are preserved;
    only leading and trailing hyphens (e.g. ``--body`` from a stripped
    ``| | Body``) are removed.
    """
    # ``str.split()`` with no separator splits on runs of any whitespace, so
    # "a  b" and "a\tb" both become "a-b". A plain ``replace(" ", "-")`` would
    # turn a double space into "--" (indistinguishable from the "/" encoding)
    # and leave tabs/newlines for the cleanup regex to delete, gluing the
    # neighbouring words together.
    hyphenated = "-".join(label.lower().split())
    slug = _SLUG_CLEAN_RE.sub("", hyphenated.replace("/", "--"))
    return slug.strip("-")

36 

37 

def is_valid_label(label: str) -> bool:
    """Return ``True`` when *label* looks like a real concept label.

    Acts as the sanity gate that runs before aggregation. It filters
    out the structural-noise patterns observed in QA (bb-8b7s):

    - empty strings and fragments shorter than three characters,
    - markdown table delimiters (``| | designer``),
    - page-number-prefixed tokens (``158 vehicle``),
    - paren-prefixed numerics (``(7.0 l)``, which would otherwise slug
      to ``70-l`` once punctuation is cleaned up),
    - hyphen-prefixed fragments (``-answers``, the trailing text left
      over from markdown bracket-link extraction).

    The first non-whitespace character must be a Unicode letter, which
    rejects every non-alpha prefix (digit, bracket, hyphen,
    punctuation) in one check while still accepting legitimate labels
    such as ``E-mail`` or ``iPhone``. Three-character fragments like
    ``cro`` / ``fus`` are deliberately let through; A3's entity-type
    filter and ``wiki_entity_min_mentions`` catch those downstream.
    """
    candidate = label.strip()
    total = len(candidate)

    # The length test runs first so an empty candidate is never indexed.
    if total < LABEL_SANITY_MIN_LEN or not candidate[0].isalpha():
        return False

    # Any markdown-structural character anywhere in the label disqualifies it.
    if not _STRUCTURAL_CHARS.isdisjoint(candidate):
        return False

    # Require a minimum share of alphanumeric characters overall.
    alnum_count = sum(map(str.isalnum, candidate))
    return alnum_count / total >= LABEL_SANITY_MIN_ALNUM_RATIO

67 

68 

def clean_label_for_display(label: str) -> str:
    """Produce a prompt-safe rendering of *label* for the ``{topic}`` slot.

    This is defense-in-depth behind :func:`is_valid_label`. A concept
    or entity label arriving here has normally passed that sanity gate
    and should not contain ``|#>``; the structural-character strip
    exists for code paths that might bypass the gate in future
    (synthesis cluster labels sourced from ``concept_nodes``,
    user-supplied topics, tests). The part that always matters is
    whitespace normalization, because spaCy surface forms can carry
    internal runs of whitespace that would otherwise reach the H1
    verbatim.

    Capitalization is left untouched so proper nouns
    (``Chevrolet Caprice``, ``iPhone``) survive intact; the model
    title-cases lowercase common nouns on its own.
    """
    # Step 1: drop markdown-structural characters outright.
    without_structure = _DISPLAY_STRUCTURAL_RE.sub("", label)
    # Step 2: squeeze each whitespace run down to one space.
    single_spaced = _DISPLAY_WHITESPACE_RE.sub(" ", without_structure)
    # Step 3: trim the edges, which may have been exposed by step 1.
    return single_spaced.strip()