Coverage for src / lilbee / data / code_chunker.py: 100%

84 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-05-15 20:55 +0000

1"""Code chunking via tree-sitter AST analysis. 

2 

3Extracts structured symbol information (functions, classes, imports) 

4and builds enriched chunk headers with symbol metadata. 

5""" 

6 

7import logging 

8from dataclasses import dataclass 

9from pathlib import Path 

10from typing import Any 

11 

12from tree_sitter_language_pack import ( 

13 PackConfig, 

14 ProcessConfig, 

15 detect_language, 

16 has_language, 

17 init, 

18 process, 

19) 

20 

21from lilbee.core.config import cfg 

22from lilbee.data.chunk import chunk_text 

23 

24log = logging.getLogger(__name__) 

25 

26 

@dataclass
class SymbolInfo:
    """One structural symbol reported by tree-sitter's process() call.

    Instances are built by ``_extract_symbols`` from the entries of a
    process() result and consumed by ``chunk_code``.
    """

    # Symbol identifier, stringified from the structure entry's ``name``.
    name: str
    # Lower-cased string form of the structure entry's ``kind``.
    kind: str
    # First line of the symbol (the span's ``start_line`` plus one).
    line_start: int
    # Last line of the symbol (the span's ``end_line`` plus one).
    line_end: int
    # Source text covered by the symbol's span.
    text: str

36 

37 

@dataclass
class CodeChunk:
    """A piece of source code together with where it sits in the file."""

    # The chunk body (may be prefixed with a metadata header by chunk_code).
    chunk: str
    # Line number on which the chunk starts.
    line_start: int
    # Line number on which the chunk ends.
    line_end: int
    # Position of this chunk within the list it was returned in.
    chunk_index: int

46 

47 

def _detect_language(file_path: Path) -> str | None:
    """Return the language tree-sitter-language-pack reports for *file_path*.

    The path is passed as a string to ``detect_language``; ``None`` means
    no language was detected.
    """
    # The annotated local pins the return type of the third-party call.
    detected: str | None = detect_language(str(file_path))
    return detected

52 

53 

def _ensure_language(lang: str) -> bool:
    """Download language parser if not already available.

    Args:
        lang: Language name to look up via ``has_language``.

    Returns:
        True when ``has_language(lang)`` holds, either immediately or after
        ``init()`` has been asked to fetch the parser; False otherwise,
        including when any of those calls raises.
    """
    try:
        if has_language(lang):
            return True
        # tslp 1.8.0 mistypes init() against _native.PackConfig, but the
        # public re-export is options.PackConfig (a dataclass). Runtime is
        # fine. Both share the same fields.
        init(PackConfig(languages=[lang]))  # type: ignore[arg-type]
        return has_language(lang)
    except Exception:
        # Deliberately best-effort: the caller falls back to plain text
        # chunking. Keep the traceback in the debug log (as chunk_code's
        # handler does) so the failure cause is not lost.
        log.debug(
            "Failed to download tree-sitter language: %s", lang, exc_info=True
        )
        return False

67 

68 

def find_line(needle: str, lines: list[str], start: int) -> int:
    """Locate *needle* in *lines*, scanning from index *start*.

    Args:
        needle: Substring to look for; an empty needle never matches.
        lines: Lines to scan.
        start: 0-based index at which scanning begins.

    Returns:
        The 1-based number of the first line at or after *start* that
        contains *needle*, or ``start + 1`` when nothing matches.
    """
    not_found = start + 1
    if not needle:
        return not_found
    matches = (idx + 1 for idx in range(start, len(lines)) if needle in lines[idx])
    return next(matches, not_found)

75 

76 

def _fallback_chunks(text: str) -> list[CodeChunk]:
    """Chunk *text* with ``chunk_text`` and estimate each chunk's line range.

    Line numbers are approximate: each chunk is located by searching for
    the first 80 characters of its first line, and the end line is the
    start line plus the number of newlines in the chunk, capped at the
    number of lines in *text*.
    """
    source_lines = text.split("\n")
    total_lines = len(source_lines)
    fallback: list[CodeChunk] = []
    # Index at which the next search starts. It is set to the previous
    # chunk's 1-based start line, i.e. the line right after that start.
    cursor = 0

    for position, piece in enumerate(chunk_text(text)):
        probe = piece.partition("\n")[0][:80]
        begin = find_line(probe, source_lines, cursor)
        finish = min(begin + piece.count("\n"), total_lines)
        fallback.append(
            CodeChunk(
                chunk=piece,
                line_start=begin,
                line_end=finish,
                chunk_index=position,
            )
        )
        cursor = begin

    return fallback

99 

100 

def _extract_symbols(result: Any, source_text: str) -> list[SymbolInfo]:
    """Parse process() result into typed SymbolInfo objects.

    Args:
        result: Object returned by ``process()``. Only ``result.structure``
            is read; each entry must expose ``name``, ``kind`` and a
            ``span`` with ``start_line``, ``end_line``, ``start_byte`` and
            ``end_byte``.
        source_text: The text that was handed to ``process()``.

    Returns:
        One SymbolInfo per structure entry, in the order reported.
    """
    # Span offsets are UTF-8 byte positions, while ``str`` slicing counts
    # code points. Slicing the text directly drifts as soon as a non-ASCII
    # character precedes the symbol, so slice the encoded bytes instead.
    source_bytes = source_text.encode("utf-8", errors="replace")

    symbols: list[SymbolInfo] = []
    for entry in result.structure:
        span = entry.span
        raw = source_bytes[span.start_byte : span.end_byte]
        symbols.append(
            SymbolInfo(
                name=str(entry.name),
                kind=str(entry.kind).lower(),
                # Span lines are offset by one to give 1-based numbers.
                line_start=span.start_line + 1,
                line_end=span.end_line + 1,
                # "replace" keeps extraction best-effort if a span ever
                # lands inside a multi-byte sequence.
                text=raw.decode("utf-8", errors="replace"),
            )
        )
    return symbols

116 

117 

def chunk_code(file_path: Path) -> list[CodeChunk]:
    """Split a source file into one chunk per structural symbol.

    The file is read as UTF-8 (undecodable bytes are replaced) and handed
    to tree-sitter-language-pack's process() API. Every symbol it reports
    becomes a chunk prefixed with a ``# File: ...`` header that names the
    symbol and its line range.

    Args:
        file_path: Path of the source file to chunk.

    Returns:
        An empty list for a blank file; otherwise the symbol chunks, or the
        result of ``_fallback_chunks`` when no language is detected, the
        parser is unavailable, process() raises, or no symbols are found.
    """
    source_text = file_path.read_text(encoding="utf-8", errors="replace")
    if not source_text.strip():
        return []

    lang = _detect_language(file_path)
    if not lang:
        return _fallback_chunks(source_text)

    try:
        if not _ensure_language(lang):
            return _fallback_chunks(source_text)
        options = ProcessConfig(
            lang,
            structure=True,
            symbols=True,
            docstrings=True,
            chunk_max_size=cfg.chunk_size,
        )
        # tslp 1.8.0 typing bug on process(), same as for init().
        parsed = process(source_text, options)  # type: ignore[arg-type]
    except Exception:
        log.debug("tree-sitter process() failed for %s", file_path, exc_info=True)
        return _fallback_chunks(source_text)

    found = _extract_symbols(parsed, source_text)
    if not found:
        return _fallback_chunks(source_text)

    def _header(symbol: SymbolInfo) -> str:
        # The "kind: name" label is only included when both parts are non-empty.
        label = f" | {symbol.kind}: {symbol.name}" if symbol.name and symbol.kind else ""
        return f"# File: {file_path}{label} (lines {symbol.line_start}-{symbol.line_end})"

    return [
        CodeChunk(
            chunk=f"{_header(symbol)}\n\n{symbol.text}",
            line_start=symbol.line_start,
            line_end=symbol.line_end,
            chunk_index=position,
        )
        for position, symbol in enumerate(found)
    ]

168 

169 

def is_code_file(file_path: Path) -> bool:
    """Report whether tree-sitter-language-pack detects a language for the file.

    Returns True when ``detect_language`` yields anything other than None
    for the stringified path.
    """
    language = detect_language(str(file_path))
    return language is not None