Coverage for src / lilbee / wiki / citations.py: 100%

96 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-05-15 20:55 +0000

1"""Citation resolution and verification for wiki pages. 

2 

3Builds :class:`CitationRecord` rows from parsed ``[^srcN]`` markers, 

4matches citation excerpts back to the source chunks they came from 

5(single-source and multi-source variants), verifies the excerpts are 

6substring-present in the chunk pool, and renders the YAML provenance 

7block written into a wiki page's frontmatter. 

8""" 

9 

10from __future__ import annotations 

11 

12import logging 

13from datetime import UTC, datetime 

14 

15import yaml 

16 

17from lilbee.core.config import Config 

18from lilbee.data.store import CitationRecord, SearchChunk 

19from lilbee.wiki.cache import normalize_whitespace 

20from lilbee.wiki.citation import ParsedCitation 

21 

22log = logging.getLogger(__name__) 

23 

24# JSON-style escape sequences that may appear inside quoted excerpts the 

25# model emits. Any backslash-prefixed character not in this map stays 

26# verbatim (e.g. ``\\x`` passes through unchanged). 

27_EXCERPT_ESCAPES: dict[str, str] = {"n": "\n", "t": "\t", '"': '"', "\\": "\\"} 

28 

29 

30def _extract_excerpt(source_ref: str) -> str: 

31 """Extract the quoted excerpt from a citation source_ref string. 

32 e.g. 'doc.md, excerpt: "Python supports typing."' → 'Python supports typing.' 

33 

34 Common JSON-style escape sequences inside the quoted span (``\\n``, 

35 ``\\t``, ``\\"``, ``\\\\``) are decoded to their literal characters so 

36 they round-trip against the source text. Some models "helpfully" 

37 encode real newlines as ``\\n`` when emitting a quoted excerpt; the 

38 source chunk they came from has real newlines, so skipping this 

39 step leaves otherwise-faithful citations unverifiable. 

40 """ 

41 marker = 'excerpt: "' 

42 idx = source_ref.find(marker) 

43 if idx == -1: 

44 return "" 

45 start = idx + len(marker) 

46 end = source_ref.find('"', start) 

47 raw = source_ref[start:].strip() if end == -1 else source_ref[start:end].strip() 

48 return _decode_excerpt_escapes(raw) 

49 

50 

51def _decode_excerpt_escapes(raw: str) -> str: 

52 """Decode the JSON-style escapes models commonly emit inside quoted strings.""" 

53 if "\\" not in raw: 

54 return raw 

55 result: list[str] = [] 

56 i = 0 

57 while i < len(raw): 

58 ch = raw[i] 

59 mapped = _EXCERPT_ESCAPES.get(raw[i + 1]) if ch == "\\" and i + 1 < len(raw) else None 

60 if mapped is not None: 

61 result.append(mapped) 

62 i += 2 

63 else: 

64 result.append(ch) 

65 i += 1 

66 return "".join(result) 

67 

68 

69def _find_excerpt_location( 

70 excerpt: str, 

71 chunks: list[SearchChunk], 

72) -> tuple[int, int, int, int]: 

73 """Find page/line location of an excerpt within chunks.""" 

74 if excerpt: 

75 for chunk in chunks: 

76 if excerpt in chunk.chunk: 

77 return chunk.page_start, chunk.page_end, chunk.line_start, chunk.line_end 

78 return 0, 0, 0, 0 

79 

80 

def _build_citation_record(
    citation_key: str,
    excerpt: str,
    source_filename: str,
    source_hash: str,
    page_start: int,
    page_end: int,
    line_start: int,
    line_end: int,
    created_at: str,
) -> CitationRecord:
    """Assemble one CitationRecord from already-resolved citation fields.

    ``wiki_source`` is left empty for the caller to fill in and
    ``wiki_chunk_index`` is set to 0. The claim type is derived from the
    excerpt: a non-empty excerpt is recorded as ``"fact"``, an empty one
    as ``"inference"``.
    """
    if excerpt:
        claim_type = "fact"
    else:
        claim_type = "inference"

    return CitationRecord(
        wiki_source="",  # filled by caller
        wiki_chunk_index=0,
        citation_key=citation_key,
        claim_type=claim_type,
        source_filename=source_filename,
        source_hash=source_hash,
        page_start=page_start,
        page_end=page_end,
        line_start=line_start,
        line_end=line_end,
        excerpt=excerpt,
        created_at=created_at,
    )

107 

108 

def _resolve_citations(
    parsed_citations: list[ParsedCitation],
    source_name: str,
    source_hash: str,
    chunks: list[SearchChunk],
) -> list[CitationRecord]:
    """Turn parsed citation refs for a single source into CitationRecords.

    For every parsed citation the quoted excerpt is pulled out of its
    ``source_ref`` and looked up in *chunks* to obtain page/line numbers.
    All records produced by one call share the same UTC ``created_at``
    timestamp and are attributed to *source_name* / *source_hash*.
    """
    created_at = datetime.now(UTC).isoformat()

    resolved: list[CitationRecord] = []
    for citation in parsed_citations:
        excerpt = _extract_excerpt(citation.source_ref)
        # (page_start, page_end, line_start, line_end); zeros when not found.
        location = _find_excerpt_location(excerpt, chunks)
        record = _build_citation_record(
            citation.citation_key,
            excerpt,
            source_name,
            source_hash,
            *location,
            created_at,
        )
        resolved.append(record)
    return resolved

139 

140 

def verify_citations(
    citation_records: list[CitationRecord],
    chunks: list[SearchChunk],
    label: str,
    config: Config,
) -> list[CitationRecord]:
    """Return the citation records that survive verification against *chunks*.

    A record is dropped when its ``source_filename`` lives under the wiki
    directory, or when it is a non-inference claim whose
    whitespace-normalized excerpt does not occur in the whitespace-normalized
    concatenation of all chunk texts. Inference claims and records with an
    empty excerpt are kept without a text check. *label* is used only in
    the debug log message for dropped records.
    """
    wiki_prefix = config.wiki_dir + "/"
    haystack = normalize_whitespace(" ".join(piece.chunk for piece in chunks))

    kept: list[CitationRecord] = []
    for rec in citation_records:
        if rec["source_filename"].startswith(wiki_prefix):
            log.debug("Skipping wiki-sourced citation %s", rec["citation_key"])
            continue

        # Only fact claims that carry an excerpt need a substring check.
        needs_check = rec["claim_type"] != "inference" and bool(rec["excerpt"])
        if needs_check and normalize_whitespace(rec["excerpt"]) not in haystack:
            log.debug("Citation %s excerpt not found in %s, dropping", rec["citation_key"], label)
            continue

        kept.append(rec)
    return kept

163 

164 

def render_provenance(config: Config, chunks: list[SearchChunk]) -> str:
    """Render the YAML provenance block for a wiki page.

    The block records the extraction method (``config.wiki_entity_mode.value``)
    followed by one ``{source, chunk_index}`` entry per chunk, in that key
    order.

    Serialization goes through ``yaml.safe_dump`` instead of manual string
    building, so a chunk source containing a quote, backslash, colon, or
    newline does not produce invalid YAML that ``parse_frontmatter`` would
    silently drop on read.
    """
    extraction_method = config.wiki_entity_mode.value
    chunk_refs = [
        {"source": piece.source, "chunk_index": piece.chunk_index} for piece in chunks
    ]
    provenance = {
        "extraction_method": extraction_method,
        "chunks": chunk_refs,
    }
    # sort_keys=False keeps the insertion order above in the output.
    return yaml.safe_dump({"provenance": provenance}, sort_keys=False)

180 

181 

def resolve_multi_source_citations(
    parsed_citations: list[ParsedCitation],
    source_names: list[str],
    source_hashes: dict[str, str],
    chunks_by_source: dict[str, list[SearchChunk]],
) -> list[CitationRecord]:
    """Resolve citations from a synthesis page that draws on several sources.

    The owning source of each citation is determined in three steps:
    a source name appearing in the citation's ``source_ref``; otherwise the
    source whose chunks contain the excerpt; otherwise, when *source_names*
    is non-empty, the first listed source (with a warning logged).
    Page/line numbers are then looked up in that source's chunks, or in the
    chunks of all sources when the chosen source has no entry in
    *chunks_by_source*. A source missing from *source_hashes* gets an empty
    hash. All records from one call share the same UTC timestamp.
    """
    created_at = datetime.now(UTC).isoformat()
    pooled_chunks = [chunk for group in chunks_by_source.values() for chunk in group]

    resolved: list[CitationRecord] = []
    for citation in parsed_citations:
        excerpt = _extract_excerpt(citation.source_ref)

        owner = _match_citation_source(citation.source_ref, source_names)
        if not owner:
            owner = _find_excerpt_source(excerpt, chunks_by_source)
        if not owner and source_names:
            # No citation match found; default to first listed source
            log.warning(
                "No citation match for chunk: defaulting to first source: %s",
                source_names[0],
            )
            owner = source_names[0]

        candidates = chunks_by_source.get(owner, pooled_chunks)
        location = _find_excerpt_location(excerpt, candidates)
        resolved.append(
            _build_citation_record(
                citation.citation_key,
                excerpt,
                owner,
                source_hashes.get(owner, ""),
                *location,
                created_at,
            )
        )
    return resolved

227 

228 

229def _match_citation_source(source_ref: str, source_names: list[str]) -> str: 

230 """Find which source a citation references by matching filenames in the ref.""" 

231 for name in source_names: 

232 if name in source_ref: 

233 return name 

234 return "" 

235 

236 

237def _find_excerpt_source(excerpt: str, chunks_by_source: dict[str, list[SearchChunk]]) -> str: 

238 """Find which source contains a given excerpt by searching chunks.""" 

239 if not excerpt: 

240 return "" 

241 for source, chunks in chunks_by_source.items(): 

242 for chunk in chunks: 

243 if excerpt in chunk.chunk: 

244 return source 

245 return ""