Coverage for src/lilbee/core/text.py: 100%

1"""Label sanity checks and slug formatting."""

3from __future__ import annotations

5import re

# Matches every character that is not allowed in a slug.
_SLUG_CLEAN_RE = re.compile(r"[^a-z0-9-]")

# Markdown-structural noise characters for concept labels. This set is the
# single source of truth: ``is_valid_label`` tests membership against it and
# ``clean_label_for_display`` strips the same characters through the regex
# derived from it below.
_STRUCTURAL_CHARS = frozenset({"|", "#", ">"})
# Character class built from the set above; sorting only makes the pattern
# text reproducible, order inside a character class does not affect matching.
_DISPLAY_STRUCTURAL_RE = re.compile(
    "[" + re.escape("".join(sorted(_STRUCTURAL_CHARS))) + "]+"
)
# One or more consecutive whitespace characters.
_DISPLAY_WHITESPACE_RE = re.compile(r"\s+")

# Labels shorter than this (after stripping) are rejected by ``is_valid_label``.
LABEL_SANITY_MIN_LEN = 3
# Minimum fraction of alphanumeric characters a label must contain.
LABEL_SANITY_MIN_ALNUM_RATIO = 0.5

def make_slug(label: str) -> str:
    """Turn a concept label into a filesystem-safe slug.

    Lowercases the label, collapses every run of whitespace (spaces,
    tabs, newlines) into a single hyphen, encodes each ``/`` as a double
    hyphen (path encoding), strips anything outside ``[a-z0-9-]``, and
    trims leading and trailing hyphens. Returns ``""`` when no sluggable
    characters remain; callers must treat an empty slug as "skip this
    entity" so the generator never writes a file called ``.md``.

    Collapsing whitespace runs keeps ``--`` reserved for the ``/`` path
    encoding: ``"a  b"`` slugs to ``a-b`` instead of colliding with
    ``"a/b"`` (``a--b``), and tab- or newline-separated words stay
    separated instead of being fused when the whitespace is stripped.

    Internal hyphen runs from the ``/`` path encoding are preserved;
    only leading and trailing hyphens (e.g. ``--body`` from a stripped
    ``| | Body``) are removed.
    """
    # ``str.split()`` without a separator splits on any whitespace run and
    # ignores leading/trailing whitespace, so every gap becomes one hyphen.
    hyphenated = "-".join(label.lower().split())
    slug = _SLUG_CLEAN_RE.sub("", hyphenated.replace("/", "--"))
    return slug.strip("-")

def is_valid_label(label: str) -> bool:
    """Return ``True`` when *label* passes the structural-noise sanity gate.

    Applied before aggregation to drop the noise patterns observed in
    QA (bb-8b7s):

    - empty or sub-three-char fragments,
    - markdown table delimiters (``| | designer``),
    - page-number-prefixed tokens (``158 vehicle``),
    - paren-prefixed numerics (``(7.0 l)``, which would otherwise slug
      to ``70-l`` after punctuation cleanup),
    - hyphen-prefixed fragments (``-answers``: trailing text from
      markdown bracket-link extraction).

    The label is stripped of surrounding whitespace first. Its first
    character must then be a Unicode letter, so any non-alpha prefix
    (digit, bracket, hyphen, punctuation) fails immediately, while
    labels such as ``E-mail`` or ``iPhone`` pass. Three-char fragments
    like ``cro`` / ``fus`` are still accepted here; per the original
    notes, A3's entity-type filter and ``wiki_entity_min_mentions``
    catch those downstream.
    """
    candidate = label.strip()

    # Length is checked first so indexing ``candidate[0]`` is safe.
    too_short = len(candidate) < LABEL_SANITY_MIN_LEN
    if too_short or not candidate[0].isalpha():
        return False

    # Any markdown-structural character anywhere disqualifies the label.
    if not _STRUCTURAL_CHARS.isdisjoint(candidate):
        return False

    # Require a minimum share of alphanumeric characters.
    alnum_count = sum(map(str.isalnum, candidate))
    return alnum_count / len(candidate) >= LABEL_SANITY_MIN_ALNUM_RATIO

def clean_label_for_display(label: str) -> str:
    """Produce a prompt-safe rendering of *label* for the ``{topic}`` slot.

    Two passes are applied, in order:

    1. Every run of structural characters (``|#>``) is removed. This is
       defense-in-depth behind :func:`is_valid_label`: labels that went
       through that gate should not contain these characters, but a
       code path that bypasses it (synthesis cluster labels sourced
       from ``concept_nodes``, user-supplied topics, tests) might.
    2. Every run of whitespace is collapsed to a single space and the
       result is trimmed. This is the always-useful part: spaCy surface
       forms can carry internal whitespace runs that would otherwise
       reach the H1 verbatim.

    Capitalization is left untouched so proper nouns
    (``Chevrolet Caprice``, ``iPhone``) survive intact; the model
    title-cases lowercase common nouns on its own.
    """
    without_structural = _DISPLAY_STRUCTURAL_RE.sub("", label)
    single_spaced = _DISPLAY_WHITESPACE_RE.sub(" ", without_structural)
    return single_spaced.strip()

Coverage for src / lilbee / core / text.py: 100%

25 statements