Coverage for src / lilbee / data / code_chunker.py: 100%
84 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-05-15 20:55 +0000
1"""Code chunking via tree-sitter AST analysis.
3Extracts structured symbol information (functions, classes, imports)
4and builds enriched chunk headers with symbol metadata.
5"""
7import logging
8from dataclasses import dataclass
9from pathlib import Path
10from typing import Any
12from tree_sitter_language_pack import (
13 PackConfig,
14 ProcessConfig,
15 detect_language,
16 has_language,
17 init,
18 process,
19)
21from lilbee.core.config import cfg
22from lilbee.data.chunk import chunk_text
24log = logging.getLogger(__name__)
@dataclass
class SymbolInfo:
    """Extracted symbol metadata from tree-sitter process()."""
    name: str  # symbol identifier (function/class name); may be empty
    kind: str  # symbol kind, lowercased by the extractor (e.g. "function")
    line_start: int  # 1-based first source line of the symbol
    line_end: int  # 1-based last source line of the symbol
    text: str  # raw source text of the symbol, sliced by its byte span
@dataclass
class CodeChunk:
    """A chunk of source code with line location metadata."""
    chunk: str  # chunk text, possibly prefixed with a metadata header
    line_start: int  # 1-based first source line covered by the chunk
    line_end: int  # 1-based last source line covered by the chunk
    chunk_index: int  # 0-based position of this chunk within the file
def _detect_language(file_path: Path) -> str | None:
    """Return the tree-sitter language name for *file_path*, or None."""
    detected: str | None = detect_language(str(file_path))
    return detected
def _ensure_language(lang: str) -> bool:
    """Download the parser for *lang* on demand; True if it is usable."""
    try:
        available = has_language(lang)
        if available:
            return True
        # tslp 1.8.0 mistypes init() against _native.PackConfig, but the
        # public re-export options.PackConfig (a dataclass) shares the same
        # fields, so runtime behaviour is fine.
        init(PackConfig(languages=[lang]))  # type: ignore[arg-type]
        return has_language(lang)
    except Exception:
        log.debug("Failed to download tree-sitter language: %s", lang)
        return False
def find_line(needle: str, lines: list[str], start: int) -> int:
    """Find the first line index (1-based) containing needle, from start."""
    if needle:
        for offset, line in enumerate(lines[start:]):
            if needle in line:
                return start + offset + 1
    # No match (or empty needle): report the line just past the start index.
    return start + 1
def _fallback_chunks(text: str) -> list[CodeChunk]:
    """Fallback text chunking with approximate line tracking."""
    source_lines = text.split("\n")
    chunks: list[CodeChunk] = []
    cursor = 0  # 0-based line index to resume searching from
    for index, piece in enumerate(chunk_text(text)):
        # Anchor each chunk by locating (a prefix of) its first line.
        anchor = piece.split("\n")[0][:80]
        start = find_line(anchor, source_lines, cursor)
        end = min(start + piece.count("\n"), len(source_lines))
        chunks.append(
            CodeChunk(
                chunk=piece,
                line_start=start,
                line_end=end,
                chunk_index=index,
            )
        )
        cursor = start
    return chunks
def _extract_symbols(result: Any, source_text: str) -> list[SymbolInfo]:
    """Parse process() result into typed SymbolInfo objects."""
    return [
        SymbolInfo(
            name=str(entry.name),
            kind=str(entry.kind).lower(),
            # tree-sitter spans are 0-based; convert to 1-based lines.
            line_start=entry.span.start_line + 1,
            line_end=entry.span.end_line + 1,
            text=source_text[entry.span.start_byte : entry.span.end_byte],
        )
        for entry in result.structure
    ]
def chunk_code(file_path: Path) -> list[CodeChunk]:
    """Chunk a source file using tree-sitter-language-pack's process() API.
    Extracts structural symbols (functions, classes) and builds enriched
    chunks with metadata headers. Falls back to token-based chunking
    if the language isn't supported or parsing fails.
    """
    text = file_path.read_text(encoding="utf-8", errors="replace")
    if not text.strip():
        return []

    language = _detect_language(file_path)
    if not language:
        return _fallback_chunks(text)

    try:
        if not _ensure_language(language):
            return _fallback_chunks(text)
        result = process(  # type: ignore[arg-type]  # tslp 1.8.0 typing bug, see init() above
            text,
            ProcessConfig(
                language,
                structure=True,
                symbols=True,
                docstrings=True,
                chunk_max_size=cfg.chunk_size,
            ),
        )
    except Exception:
        log.debug("tree-sitter process() failed for %s", file_path, exc_info=True)
        return _fallback_chunks(text)

    symbols = _extract_symbols(result, text)
    if not symbols:
        return _fallback_chunks(text)

    enriched: list[CodeChunk] = []
    for index, symbol in enumerate(symbols):
        # Header format: "# File: path | kind: name (lines a-b)".
        parts = [f"# File: {file_path}"]
        if symbol.name and symbol.kind:
            parts.append(f"{symbol.kind}: {symbol.name}")
        header = " | ".join(parts) + f" (lines {symbol.line_start}-{symbol.line_end})"
        enriched.append(
            CodeChunk(
                chunk=f"{header}\n\n{symbol.text}",
                line_start=symbol.line_start,
                line_end=symbol.line_end,
                chunk_index=index,
            )
        )
    return enriched
def is_code_file(file_path: Path) -> bool:
    """Check if a file is supported by tree-sitter chunking."""
    detected = detect_language(str(file_path))
    return detected is not None