Coverage for src / lilbee / retrieval / query / formatting.py: 100%
52 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-05-15 20:55 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-05-15 20:55 +0000
1"""Source formatting, context templating, and LLM citation extraction."""
3from __future__ import annotations
5import re
6from pathlib import Path
8from lilbee.core.config import cfg
9from lilbee.data.store import CHUNK_TYPE_WIKI, CitationRecord, SearchChunk
# Template that places the context block ahead of the question. It exposes two
# named placeholders: ``{context}`` and ``{question}``.
CONTEXT_TEMPLATE = """Context:
{context}

Question: {question}"""
# Matches a bracketed numeric citation marker such as "[3]"; group 1 captures
# the digits without the brackets.
_CITE_REF_RE = re.compile(r"\[(\d+)\]")
# Matches trailing LLM-generated citation blocks like "Key sources:", "Sources:",
# "References:", "Bibliography:", "Citations:" (with optional markdown heading).
# The heading must be preceded by a newline and followed (after an optional
# colon and whitespace) by a newline. Matching is case-insensitive, and because
# of DOTALL the final ``.*`` extends the match to the end of the text.
_LLM_CITATION_BLOCK_RE = re.compile(
    r"\n{1,3}(?:#+\s*)?(?:(?:Key\s+)?Sources|References|Bibliography|Citations)\s*:?\s*\n.*",
    re.IGNORECASE | re.DOTALL,
)
def display_source_path(source: str) -> str:
    """Render a chunk's source as an absolute path, abbreviating home to ``~``.

    Source values in the store are stored relative to ``documents_dir`` so the
    database is portable across machines. For display the value is joined onto
    ``cfg.documents_dir`` and resolved; when the result lies under the user's
    home directory it is shown as ``~/<path relative to home>``, otherwise the
    resolved path is shown in full.

    Resolution is non-strict, so a file that no longer exists on disk still
    produces a resolved path. The raw ``source`` string is returned only when
    resolution itself fails.

    Args:
        source: Source value as recorded in the store.

    Returns:
        The display string: ``~/...`` for paths under the home directory, the
        resolved absolute path otherwise, or ``source`` unchanged when the
        path cannot be resolved.
    """
    candidate = cfg.documents_dir / source
    try:
        resolved = candidate.resolve(strict=False)
    except (OSError, RuntimeError):
        # Before Python 3.13 a symlink loop raises RuntimeError rather than
        # OSError; either way there is no resolved path to display.
        return source
    try:
        return f"~/{resolved.relative_to(Path.home())}"
    except (ValueError, RuntimeError):
        # ValueError: the path is not under the home directory.
        # RuntimeError: Path.home() could not determine a home directory.
        return str(resolved)
def _format_citation(citation: CitationRecord) -> str:
    """Render one citation record as an attribution line.

    The line always names the display path of the cited file. A page span is
    appended when either page bound is truthy; failing that, a line span is
    appended when either line bound is truthy. A span whose two bounds are
    equal is written in the singular form.
    """
    location = display_source_path(citation["source_filename"])

    first_page, last_page = citation["page_start"], citation["page_end"]
    if first_page or last_page:
        if first_page == last_page:
            span = f"page {first_page}"
        else:
            span = f"pages {first_page}-{last_page}"
        return f" → {location}, {span}"

    first_line, last_line = citation["line_start"], citation["line_end"]
    if first_line or last_line:
        if first_line == last_line:
            span = f"line {first_line}"
        else:
            span = f"lines {first_line}-{last_line}"
        return f" → {location}, {span}"

    return f" → {location}"
def format_source(result: SearchChunk, citations: list[CitationRecord] | None = None) -> str:
    """Render a search result as a source attribution.

    A wiki chunk that comes with citations yields the wiki page line followed
    by one line per citation, joined with newlines. Otherwise a single line is
    produced: PDF results carry their page span, code results their line span,
    and anything else just the display path.
    """
    location = display_source_path(result.source)
    head = f" → {location}"

    if result.chunk_type == CHUNK_TYPE_WIKI and citations:
        return "\n".join([head, *(_format_citation(entry) for entry in citations)])

    kind = result.content_type
    if kind == "pdf":
        first, last = result.page_start, result.page_end
        if first == last:
            span = f"page {first}"
        else:
            span = f"pages {first}-{last}"
    elif kind == "code":
        first, last = result.line_start, result.line_end
        if first == last:
            span = f"line {first}"
        else:
            span = f"lines {first}-{last}"
    else:
        return head

    return f"{head}, {span}"
def build_context(results: list[SearchChunk]) -> str:
    """Assemble the context block for a list of search results.

    Each result's ``chunk`` text is prefixed with its 1-based position in
    square brackets, and the entries are separated by a blank line.
    """
    entries = [f"[{position}] {hit.chunk}" for position, hit in enumerate(results, start=1)]
    return "\n\n".join(entries)
def _extract_cited_indices(text: str) -> set[int]:
    """Collect the distinct numbers N that appear as ``[N]`` markers in *text*."""
    digit_runs = _CITE_REF_RE.findall(text)
    return set(map(int, digit_runs))
def strip_llm_citations(text: str) -> str:
    """Drop an LLM-written citation block from answer text.

    Everything matched by ``_LLM_CITATION_BLOCK_RE`` is removed, after which
    trailing whitespace is trimmed from what remains.
    """
    without_block = _LLM_CITATION_BLOCK_RE.sub("", text)
    return without_block.rstrip()