Coverage for src / lilbee / retrieval / query / formatting.py: 100%

52 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-05-15 20:55 +0000

1"""Source formatting, context templating, and LLM citation extraction.""" 

2 

3from __future__ import annotations 

4 

5import re 

6from pathlib import Path 

7 

8from lilbee.core.config import cfg 

9from lilbee.data.store import CHUNK_TYPE_WIKI, CitationRecord, SearchChunk 

10 

# Prompt skeleton: the context block comes first, then the question.
# Exposes two named placeholders, ``{context}`` and ``{question}``.
CONTEXT_TEMPLATE = (
    "Context:\n"
    "{context}\n"
    "\n"
    "Question: {question}"
)

15 

16 

# Numeric citation markers such as "[3]"; group 1 captures the digits.
_CITE_REF_RE = re.compile(r"\[(\d+)\]")

# Trailing citation block an LLM may append to its answer: a heading such as
# "Key sources:", "Sources:", "References:", "Bibliography:" or "Citations:"
# (optionally written as a markdown heading), plus everything that follows it.
_LLM_CITATION_BLOCK_RE = re.compile(
    r"\n{1,3}"  # one to three newlines before the heading
    r"(?:#+\s*)?"  # optional markdown heading marker
    r"(?:(?:Key\s+)?Sources|References|Bibliography|Citations)"  # heading word
    r"\s*:?\s*\n"  # optional colon, then the end of the heading line
    r".*",  # the rest of the text (DOTALL lets "." span newlines)
    flags=re.IGNORECASE | re.DOTALL,
)

25 

26 

def display_source_path(source: str) -> str:
    """Render a chunk's source as an absolute path, abbreviating home to ``~``.

    Source values in the store are stored relative to ``documents_dir`` so the
    database is portable across machines. For display, ``source`` is joined
    onto ``cfg.documents_dir`` and resolved; when the result lies under the
    user's home directory it is returned as ``~/<rest>``, otherwise the
    absolute path is returned as a string.

    Resolution is non-strict, so the path does not need to exist on disk: a
    missing file still yields a resolved path. The raw ``source`` string is
    returned only when resolution itself fails.
    """
    candidate = cfg.documents_dir / source
    try:
        resolved = candidate.resolve(strict=False)
    except (OSError, RuntimeError):
        # RuntimeError covers symlink loops on Python < 3.13, where
        # ``resolve`` raises it regardless of ``strict``.
        return source
    try:
        return f"~/{resolved.relative_to(Path.home())}"
    except (ValueError, RuntimeError):
        # ValueError: the path is not under the home directory.
        # RuntimeError: the home directory cannot be determined, so there
        # is nothing to abbreviate against.
        return str(resolved)

48 

49 

def _format_citation(citation: CitationRecord) -> str:
    """Render one citation record as a single attribution line.

    The line is the displayed source path, followed by a page or page range
    when either page field is set, else by a line or line range when either
    line field is set, else by nothing.
    """
    shown = display_source_path(citation["source_filename"])

    first_page = citation["page_start"]
    last_page = citation["page_end"]
    if first_page or last_page:
        if first_page == last_page:
            return f"{shown}, page {first_page}"
        return f"{shown}, pages {first_page}-{last_page}"

    first_line = citation["line_start"]
    last_line = citation["line_end"]
    if first_line or last_line:
        if first_line == last_line:
            return f"{shown}, line {first_line}"
        return f"{shown}, lines {first_line}-{last_line}"

    return shown

62 

63 

def format_source(result: SearchChunk, citations: list[CitationRecord] | None = None) -> str:
    """Build the citation text shown for a single search result.

    Wiki chunks that come with citations produce several newline-separated
    lines: the wiki page path first, then one line per citation. PDF results
    append their page or page range, code results append their line or line
    range, and every other result is just the displayed source path.
    """
    shown = display_source_path(result.source)

    if result.chunk_type == CHUNK_TYPE_WIKI and citations:
        return "\n".join([shown, *(_format_citation(entry) for entry in citations)])

    content_type = result.content_type

    if content_type == "pdf":
        first_page, last_page = result.page_start, result.page_end
        if first_page == last_page:
            return f"{shown}, page {first_page}"
        return f"{shown}, pages {first_page}-{last_page}"

    if content_type == "code":
        first_line, last_line = result.line_start, result.line_end
        if first_line == last_line:
            return f"{shown}, line {first_line}"
        return f"{shown}, lines {first_line}-{last_line}"

    return shown

86 

87 

def build_context(results: list[SearchChunk]) -> str:
    """Join the chunks of *results* into a numbered context block.

    Each chunk is prefixed with its 1-based position in square brackets
    (``[1]``, ``[2]``, ...) and entries are separated by a blank line. An
    empty list yields an empty string.
    """
    numbered = []
    for position, hit in enumerate(results, start=1):
        numbered.append(f"[{position}] {hit.chunk}")
    return "\n\n".join(numbered)

91 

92 

def _extract_cited_indices(text: str) -> set[int]:
    """Collect the distinct numbers cited as ``[N]`` in LLM answer text."""
    # The pattern has a single capture group, so ``findall`` yields the
    # digit strings directly.
    cited_numbers = _CITE_REF_RE.findall(text)
    return set(map(int, cited_numbers))

96 

97 

def strip_llm_citations(text: str) -> str:
    """Drop an LLM-written trailing citation block from answer text.

    Whatever ``_LLM_CITATION_BLOCK_RE`` matches is removed, then trailing
    whitespace is stripped from the remainder.
    """
    without_block = _LLM_CITATION_BLOCK_RE.sub("", text)
    return without_block.rstrip()

100 return _LLM_CITATION_BLOCK_RE.sub("", text).rstrip()