Coverage for src / lilbee / core / text.py: 100%
25 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-05-15 20:55 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-05-15 20:55 +0000
1"""Label sanity checks and slug formatting."""
3from __future__ import annotations
5import re
# Matches any character outside the slug alphabet (lowercase ASCII letters,
# digits, hyphen); ``make_slug`` deletes every match.
_SLUG_CLEAN_RE = re.compile(r"[^a-z0-9-]")

# Characters that signal markdown-structural noise in a concept label.
# Single source of truth for both ``is_valid_label`` (membership check)
# and ``clean_label_for_display`` (regex strip).
_STRUCTURAL_CHARS = frozenset("|#>")
# Character class derived from ``_STRUCTURAL_CHARS``. The members are sorted
# before joining so the pattern text is the same on every run; iterating a
# frozenset of strings directly follows the per-process hash seed.
_DISPLAY_STRUCTURAL_RE = re.compile(
    f"[{re.escape(''.join(sorted(_STRUCTURAL_CHARS)))}]+"
)
# One or more whitespace characters; ``clean_label_for_display`` replaces
# each run with a single space.
_DISPLAY_WHITESPACE_RE = re.compile(r"\s+")

# Shortest stripped label that ``is_valid_label`` accepts.
LABEL_SANITY_MIN_LEN = 3
# Minimum share of alphanumeric characters in the stripped label.
LABEL_SANITY_MIN_ALNUM_RATIO = 0.5
def make_slug(label: str) -> str:
    """Turn a concept label into a filesystem-safe slug.

    Lowercases, maps each run of whitespace (spaces, tabs, newlines) to
    a single hyphen and each slash to a double hyphen (path encoding),
    strips anything outside ``[a-z0-9-]``, and trims leading and
    trailing hyphens. Returns ``""`` when no sluggable characters
    remain; callers must treat an empty slug as "skip this entity" so
    the generator never writes a file called ``.md``.

    Whitespace is collapsed per run so that ``"a  b"`` (two spaces)
    slugs to ``a-b`` rather than ``a--b``, which is the encoding of
    ``a/b``. Tabs and newlines act as separators instead of being
    deleted by the cleanup step and fusing the neighbouring words.

    Internal hyphen runs from the ``/`` path encoding are preserved;
    only leading and trailing hyphens (e.g. ``--body`` from a stripped
    ``| | Body``) are removed. The cleanup step deletes characters
    without touching the hyphens around them, so ``"a & b"`` still
    yields ``a--b``.
    """
    # str.split() with no separator splits on whitespace runs and ignores
    # leading/trailing whitespace, so the join emits one hyphen per gap.
    hyphenated = "-".join(label.lower().split())
    encoded = hyphenated.replace("/", "--")
    return _SLUG_CLEAN_RE.sub("", encoded).strip("-")
def is_valid_label(label: str) -> bool:
    """Tell whether *label* is clean enough to enter aggregation.

    Filters out the structural-noise patterns observed in QA (bb-8b7s):

    - empty or sub-three-char fragments,
    - markdown table delimiters (``| | designer``),
    - page-number-prefixed tokens (``158 vehicle``),
    - paren-prefixed numerics (``(7.0 l)``: would otherwise slug to
      ``70-l`` after punctuation cleanup),
    - hyphen-prefixed fragments (``-answers``: trailing text from
      markdown bracket-link extraction).

    The label is judged after surrounding whitespace is stripped. Its
    first character must be a Unicode letter, which rejects every
    non-alpha prefix (digit, bracket, hyphen, punctuation) in one
    check while letting labels such as ``E-mail`` or ``iPhone``
    through. Any character from ``_STRUCTURAL_CHARS`` anywhere in the
    label is disqualifying, and at least
    ``LABEL_SANITY_MIN_ALNUM_RATIO`` of the characters must be
    alphanumeric.

    Three-char fragments like ``cro`` / ``fus`` are deliberately still
    accepted; A3's entity-type filter and ``wiki_entity_min_mentions``
    catch those downstream.
    """
    candidate = label.strip()
    length = len(candidate)
    # Length is tested first so the index below never sees an empty string.
    if length < LABEL_SANITY_MIN_LEN or not candidate[0].isalpha():
        return False
    if not _STRUCTURAL_CHARS.isdisjoint(candidate):
        return False
    alnum_count = sum(map(str.isalnum, candidate))
    return alnum_count / length >= LABEL_SANITY_MIN_ALNUM_RATIO
def clean_label_for_display(label: str) -> str:
    """Return a prompt-safe version of *label* for the ``{topic}`` slot.

    Two passes: delete every run of structural characters (``|#>``),
    then collapse each whitespace run to one space and trim both ends.

    The structural-char pass is defense-in-depth behind
    :func:`is_valid_label`. A concept or entity label that reaches this
    function has normally passed that gate already and should not
    contain those characters; the strip guards against a future code
    path that bypasses the gate (synthesis cluster labels sourced from
    ``concept_nodes``, user-supplied topics, tests). The whitespace
    pass is the always-useful part: spaCy surface forms can carry
    internal runs of whitespace that would otherwise reach the H1
    verbatim.

    Capitalization is left untouched so proper nouns
    (``Chevrolet Caprice``, ``iPhone``) survive intact; the model
    title-cases lowercase common nouns on its own.
    """
    without_structural = _DISPLAY_STRUCTURAL_RE.sub("", label)
    single_spaced = _DISPLAY_WHITESPACE_RE.sub(" ", without_structural)
    return single_spaced.strip()