Coverage for src / lilbee / wiki / citations.py: 100%
96 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-05-15 20:55 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-05-15 20:55 +0000
1"""Citation resolution and verification for wiki pages.
3Builds :class:`CitationRecord` rows from parsed ``[^srcN]`` markers,
4matches citation excerpts back to the source chunks they came from
5(single-source and multi-source variants), verifies the excerpts are
6substring-present in the chunk pool, and renders the YAML provenance
7block written into a wiki page's frontmatter.
8"""
10from __future__ import annotations
12import logging
13from datetime import UTC, datetime
15import yaml
17from lilbee.core.config import Config
18from lilbee.data.store import CitationRecord, SearchChunk
19from lilbee.wiki.cache import normalize_whitespace
20from lilbee.wiki.citation import ParsedCitation
22log = logging.getLogger(__name__)
24# JSON-style escape sequences that may appear inside quoted excerpts the
25# model emits. Any backslash-prefixed character not in this map stays
26# verbatim (e.g. ``\\x`` passes through unchanged).
27_EXCERPT_ESCAPES: dict[str, str] = {"n": "\n", "t": "\t", '"': '"', "\\": "\\"}
30def _extract_excerpt(source_ref: str) -> str:
31 """Extract the quoted excerpt from a citation source_ref string.
32 e.g. 'doc.md, excerpt: "Python supports typing."' → 'Python supports typing.'
34 Common JSON-style escape sequences inside the quoted span (``\\n``,
35 ``\\t``, ``\\"``, ``\\\\``) are decoded to their literal characters so
36 they round-trip against the source text. Some models "helpfully"
37 encode real newlines as ``\\n`` when emitting a quoted excerpt; the
38 source chunk they came from has real newlines, so skipping this
39 step leaves otherwise-faithful citations unverifiable.
40 """
41 marker = 'excerpt: "'
42 idx = source_ref.find(marker)
43 if idx == -1:
44 return ""
45 start = idx + len(marker)
46 end = source_ref.find('"', start)
47 raw = source_ref[start:].strip() if end == -1 else source_ref[start:end].strip()
48 return _decode_excerpt_escapes(raw)
51def _decode_excerpt_escapes(raw: str) -> str:
52 """Decode the JSON-style escapes models commonly emit inside quoted strings."""
53 if "\\" not in raw:
54 return raw
55 result: list[str] = []
56 i = 0
57 while i < len(raw):
58 ch = raw[i]
59 mapped = _EXCERPT_ESCAPES.get(raw[i + 1]) if ch == "\\" and i + 1 < len(raw) else None
60 if mapped is not None:
61 result.append(mapped)
62 i += 2
63 else:
64 result.append(ch)
65 i += 1
66 return "".join(result)
69def _find_excerpt_location(
70 excerpt: str,
71 chunks: list[SearchChunk],
72) -> tuple[int, int, int, int]:
73 """Find page/line location of an excerpt within chunks."""
74 if excerpt:
75 for chunk in chunks:
76 if excerpt in chunk.chunk:
77 return chunk.page_start, chunk.page_end, chunk.line_start, chunk.line_end
78 return 0, 0, 0, 0
def _build_citation_record(
    citation_key: str,
    excerpt: str,
    source_filename: str,
    source_hash: str,
    page_start: int,
    page_end: int,
    line_start: int,
    line_end: int,
    created_at: str,
) -> CitationRecord:
    """Assemble one CitationRecord from already-resolved citation fields.

    ``wiki_source`` and ``wiki_chunk_index`` are set to placeholder values
    (``""`` and ``0``); the caller is expected to fill in ``wiki_source``.
    The claim type is ``"fact"`` when an excerpt is present and
    ``"inference"`` when it is empty.
    """
    kind = "inference" if not excerpt else "fact"
    return CitationRecord(
        wiki_source="",  # filled by caller
        wiki_chunk_index=0,
        citation_key=citation_key,
        claim_type=kind,
        source_filename=source_filename,
        source_hash=source_hash,
        page_start=page_start,
        page_end=page_end,
        line_start=line_start,
        line_end=line_end,
        excerpt=excerpt,
        created_at=created_at,
    )
def _resolve_citations(
    parsed_citations: list[ParsedCitation],
    source_name: str,
    source_hash: str,
    chunks: list[SearchChunk],
) -> list[CitationRecord]:
    """Turn parsed citation refs for a single source into CitationRecords.

    For every citation the quoted excerpt is pulled out of its
    ``source_ref`` and looked up in *chunks* to obtain page/line numbers
    (zeros when it is not found). All records share one UTC ``created_at``
    timestamp taken once at the start of the call.
    """
    timestamp = datetime.now(UTC).isoformat()
    resolved: list[CitationRecord] = []
    for citation in parsed_citations:
        quoted = _extract_excerpt(citation.source_ref)
        # 4-tuple: (page_start, page_end, line_start, line_end)
        location = _find_excerpt_location(quoted, chunks)
        record = _build_citation_record(
            citation.citation_key,
            quoted,
            source_name,
            source_hash,
            *location,
            timestamp,
        )
        resolved.append(record)
    return resolved
def verify_citations(
    citation_records: list[CitationRecord],
    chunks: list[SearchChunk],
    label: str,
    config: Config,
) -> list[CitationRecord]:
    """Return the citation records that survive verification against *chunks*.

    Records whose ``source_filename`` lives under ``config.wiki_dir`` are
    skipped. Inference records and records without an excerpt are kept
    as-is. Every other record is kept only if its whitespace-normalized
    excerpt occurs in the whitespace-normalized concatenation of all chunk
    texts; otherwise it is dropped. *label* is only used in the debug log
    message for dropped records.
    """
    wiki_root = config.wiki_dir + "/"
    haystack = normalize_whitespace(" ".join(piece.chunk for piece in chunks))
    kept: list[CitationRecord] = []
    for record in citation_records:
        if record["source_filename"].startswith(wiki_root):
            log.debug("Skipping wiki-sourced citation %s", record["citation_key"])
            continue
        # Only fact-type records that actually carry an excerpt need checking.
        must_check = record["claim_type"] != "inference" and bool(record["excerpt"])
        if not must_check or normalize_whitespace(record["excerpt"]) in haystack:
            kept.append(record)
        else:
            log.debug(
                "Citation %s excerpt not found in %s, dropping", record["citation_key"], label
            )
    return kept
def render_provenance(config: Config, chunks: list[SearchChunk]) -> str:
    """Render the provenance block: extraction method + chunk references.

    The block is serialized with ``yaml.safe_dump`` instead of manual
    string formatting, so a chunk source containing a quote, backslash,
    colon, or newline cannot yield invalid YAML that
    ``parse_frontmatter`` would silently drop on read. Key order is
    preserved (``sort_keys=False``).
    """
    method = config.wiki_entity_mode.value
    chunk_refs = [{"source": item.source, "chunk_index": item.chunk_index} for item in chunks]
    provenance = {"extraction_method": method, "chunks": chunk_refs}
    return yaml.safe_dump({"provenance": provenance}, sort_keys=False)
def resolve_multi_source_citations(
    parsed_citations: list[ParsedCitation],
    source_names: list[str],
    source_hashes: dict[str, str],
    chunks_by_source: dict[str, list[SearchChunk]],
) -> list[CitationRecord]:
    """Build CitationRecords for a synthesis page that cites several sources.

    The owning source of each citation is chosen in this order: a source
    name that appears in the citation's ``source_ref``; else the source
    whose chunks contain the excerpt; else the first entry of
    *source_names* (with a warning). Page/line numbers come from that
    source's chunks, or from all chunks when the source has no entry in
    *chunks_by_source*. A missing hash is recorded as ``""``.
    """
    timestamp = datetime.now(UTC).isoformat()
    # Fallback search pool for citations whose source has no chunk list.
    pooled = [chunk for group in chunks_by_source.values() for chunk in group]
    resolved: list[CitationRecord] = []

    for citation in parsed_citations:
        quoted = _extract_excerpt(citation.source_ref)

        owner = _match_citation_source(
            citation.source_ref, source_names
        ) or _find_excerpt_source(quoted, chunks_by_source)
        if not owner and source_names:
            # No citation match found; default to first listed source
            log.warning(
                "No citation match for chunk: defaulting to first source: %s",
                source_names[0],
            )
            owner = source_names[0]

        location = _find_excerpt_location(quoted, chunks_by_source.get(owner, pooled))
        record = _build_citation_record(
            citation.citation_key,
            quoted,
            owner,
            source_hashes.get(owner, ""),
            *location,
            timestamp,
        )
        resolved.append(record)
    return resolved
229def _match_citation_source(source_ref: str, source_names: list[str]) -> str:
230 """Find which source a citation references by matching filenames in the ref."""
231 for name in source_names:
232 if name in source_ref:
233 return name
234 return ""
237def _find_excerpt_source(excerpt: str, chunks_by_source: dict[str, list[SearchChunk]]) -> str:
238 """Find which source contains a given excerpt by searching chunks."""
239 if not excerpt:
240 return ""
241 for source, chunks in chunks_by_source.items():
242 for chunk in chunks:
243 if excerpt in chunk.chunk:
244 return source
245 return ""