Coverage for src/lilbee/providers/llama_cpp/log_dispatch.py: 100%

119 statements  


1"""llama.cpp log dispatcher and native-stderr suppression. 

2 

3Routes llama.cpp's C-stream log messages through Python logging, coalescing 

4CONT continuation chunks until a newline. The pending buffer is owned by the 

5``_LogDispatcher`` singleton because it tracks request-scoped state for a 

6ctypes callback. 

7""" 

8 

9from __future__ import annotations 

10 

11import logging 

12import os 

13import sys 

14import threading 

15from collections.abc import Iterator 

16from contextlib import contextmanager 

17from dataclasses import dataclass 

18from typing import Any 

19 

20from lilbee.providers.base import ProviderError 

21 

22_llama_log = logging.getLogger("lilbee.llama_cpp") 

23log = logging.getLogger(__name__) 

24 

25# ggml.h log levels (not exposed by llama-cpp-python). 

26_GGML_LOG_LEVEL_INFO = 1 

27_GGML_LOG_LEVEL_WARN = 2 

28_GGML_LOG_LEVEL_ERROR = 3 

29_GGML_LOG_LEVEL_DEBUG = 4 

30_GGML_LOG_LEVEL_CONT = 5 

31 

32# WARN demotes to INFO so noisy auto-corrections stay silent at the default WARNING level. 

33_GGML_TO_PY_LEVEL = { 

34 _GGML_LOG_LEVEL_INFO: logging.DEBUG, 

35 _GGML_LOG_LEVEL_WARN: logging.INFO, 

36 _GGML_LOG_LEVEL_ERROR: logging.ERROR, 

37 _GGML_LOG_LEVEL_DEBUG: logging.DEBUG, 

38} 

39 

40# llama.cpp emits these at GGML_LOG_LEVEL_ERROR but they're advisory: the model 

41# still loads correctly. Demote to WARNING so users don't think the setup is 

42# broken. 

43_GGML_ERROR_SOFT_DEMOTE = ( 

44 "special_eos_id is not in special_eog_ids", 

45 "embeddings required but some input tokens were not marked as outputs", 

46 # 'n_ctx_seq (X) > n_ctx_train (Y)' -- our embed clamp prevents misuse. 

47 "n_ctx_seq", 

48 "tokenizer config may be incorrect", 

49 # Reported at ERROR every model load on pre-M5/A19 macs (i.e. most macs); 

50 # cosmetic. 

51 "tensor API disabled for pre-M5", 

52 # 'control-looking token: ... was not control-type'. Tokenizer-config 

53 # quirk; llama.cpp auto-corrects. 

54 "was not control-type", 

55 # 'special_eog_ids contains "<|tool_response>", removing "</s>"'. 

56 # Auto-correction of EOG list. 

57 "removing '</s>' token from EOG list", 

58 # Format-string bug in upstream tensor-name logging; harmless. 

59 "cannot properly format tensor name", 

60 # SWA full-size cache fallback for iSWA models; informational. 

61 "using full-size SWA cache", 

62) 

63 

64_STDERR_LOCK = threading.Lock() 

65 

66 

67def _resolve_ggml_level(ggml_level: int, text: str) -> int: 

68 """Translate ggml log level to Python, demoting known-advisory ERRORs to WARNING.""" 

69 py_level = _GGML_TO_PY_LEVEL.get(ggml_level, logging.DEBUG) 

70 if py_level == logging.ERROR and any(s in text for s in _GGML_ERROR_SOFT_DEMOTE): 

71 return logging.WARNING 

72 return py_level 

73 

74 
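
# Illustrative sketch, not part of the original module: how a few representative
# llama.cpp lines resolve under the mapping above. The message strings are
# made-up examples.
def _example_resolve_ggml_level() -> None:
    # An ERROR matching a soft-demote substring drops to WARNING.
    assert (
        _resolve_ggml_level(_GGML_LOG_LEVEL_ERROR, "n_ctx_seq (8192) > n_ctx_train (4096)")
        == logging.WARNING
    )
    # Any other ERROR stays an ERROR.
    assert _resolve_ggml_level(_GGML_LOG_LEVEL_ERROR, "failed to read tensor data") == logging.ERROR
    # WARN demotes to INFO, so it is quiet at the default WARNING level.
    assert _resolve_ggml_level(_GGML_LOG_LEVEL_WARN, "fixing malformed token") == logging.INFO
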

@dataclass
class _LogDispatchSnapshot:
    """Frozen snapshot of dispatcher state for test save/restore."""

    callback: Any
    installed: bool
    pending: dict[int, str]
    pending_level: int


class _LogDispatcher:
    """Owns the mutable state for routing llama.cpp logs through Python logging.

    All four pieces of state (callback handle, installed flag, pending
    buffer, pending-level) live here as instance fields so tests can
    snapshot / restore the whole dispatcher in one call instead of
    poking module globals directly.
    """

    def __init__(self) -> None:
        # ctypes does not retain a Python reference to the wrapped callback;
        # hanging it off the dispatcher keeps it alive for process lifetime.
        self.callback: Any = None
        self.installed: bool = False
        # Request-scoped buffer for a ctypes callback; not on Services because
        # the C side has no notion of dependency injection.
        self.pending: dict[int, str] = {}
        self.pending_level: int = _GGML_LOG_LEVEL_INFO

    def dispatch(self, level: int, text_bytes: bytes, _user_data: Any) -> None:
        """Dispatch one llama.cpp log message; CONT chunks coalesce on newline."""
        try:
            text = text_bytes.decode("utf-8", errors="replace") if text_bytes else ""
        except Exception:  # pragma: no cover
            return

        if level == _GGML_LOG_LEVEL_CONT:
            self.pending[0] = self.pending.get(0, "") + text
        else:
            if 0 in self.pending:
                buffered = self.pending.pop(0).rstrip()
                if buffered:
                    _llama_log.log(_resolve_ggml_level(self.pending_level, buffered), buffered)
            self.pending_level = level
            self.pending[0] = text

        if "\n" in self.pending.get(0, ""):
            full = self.pending.pop(0).rstrip()
            if full:
                _llama_log.log(_resolve_ggml_level(self.pending_level, full), full)

    def install(self) -> None:
        """Route llama.cpp logs through Python logging. Idempotent."""
        if self.installed:
            return
        llama_cpp = import_llama_cpp()
        self.callback = llama_cpp.llama_log_callback(self.dispatch)
        llama_cpp.llama_log_set(self.callback, None)
        self.installed = True

    def snapshot(self) -> _LogDispatchSnapshot:
        """Return a frozen snapshot of dispatcher state for test save/restore."""
        return _LogDispatchSnapshot(
            callback=self.callback,
            installed=self.installed,
            pending=dict(self.pending),
            pending_level=self.pending_level,
        )

    def restore(self, snap: _LogDispatchSnapshot) -> None:
        """Reset dispatcher state to *snap*."""
        self.callback = snap.callback
        self.installed = snap.installed
        self.pending = dict(snap.pending)
        self.pending_level = snap.pending_level

    def reset(self) -> None:
        """Clear dispatcher state (for tests that want a known-clean baseline)."""
        self.callback = None
        self.installed = False
        self.pending.clear()
        self.pending_level = _GGML_LOG_LEVEL_INFO

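
# Illustrative sketch, not part of the original module: how dispatch() coalesces
# CONT continuation chunks into a single log record once a newline arrives.
# The message text is a made-up example.
def _example_dispatch_coalescing() -> None:
    d = _LogDispatcher()
    d.dispatch(_GGML_LOG_LEVEL_INFO, b"load: tensor ", None)   # buffered, no newline yet
    d.dispatch(_GGML_LOG_LEVEL_CONT, b"blk.0.attn_q ", None)   # appended to the pending buffer
    d.dispatch(_GGML_LOG_LEVEL_CONT, b"loaded\n", None)        # newline -> one DEBUG record emitted
    assert d.pending == {}                                      # buffer flushed
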

# Process-lifetime singleton. Tests treat this as the dispatcher boundary.
_dispatcher = _LogDispatcher()

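
# Illustrative sketch, not part of the original module: the save/restore pattern
# snapshot()/restore() exist for. A test takes a snapshot, mutates the singleton
# freely, then puts everything back in one call.
def _example_snapshot_restore() -> None:
    snap = _dispatcher.snapshot()
    try:
        _dispatcher.reset()   # known-clean baseline for the test body
        _dispatcher.dispatch(_GGML_LOG_LEVEL_WARN, b"example warning\n", None)
    finally:
        _dispatcher.restore(snap)   # singleton back exactly as it was
    assert _dispatcher.snapshot() == snap   # dataclass equality: state fully restored
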

_MISSING_VULKAN_HINT = (
    "lilbee's Linux wheel currently links against libvulkan.so.1 at runtime. "
    "Install your distro's Vulkan loader package (Arch: vulkan-icd-loader, "
    "Fedora: vulkan-loader, Debian/Ubuntu: libvulkan1) and try again. See "
    "https://github.com/tobocop2/lilbee#linux-runtime-requirements."
)


def import_llama_cpp() -> Any:
    """Import ``llama_cpp`` with an actionable error when libvulkan is missing.

    Bare Arch / Fedora installs lack the Vulkan loader and the raw
    ``OSError: libvulkan.so.1: cannot open shared object file`` is opaque.
    Re-raise as :class:`ProviderError` with install guidance. Idempotent
    after the first successful import (Python caches it in ``sys.modules``),
    so callers can sprinkle it at every llama_cpp entry point cheaply.
    """
    _apply_gpu_device_env()
    try:
        import llama_cpp

        return llama_cpp
    except OSError as exc:
        if "libvulkan" in str(exc):
            raise ProviderError(_MISSING_VULKAN_HINT, provider="llama-cpp") from exc
        raise

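
# Illustrative sketch, not part of the original module: a typical call site. The
# import is repeated cheaply at each llama_cpp entry point; on a host without
# the Vulkan loader the ProviderError carries the install hint above.
def _example_entry_point() -> None:
    try:
        llama_cpp = import_llama_cpp()
    except ProviderError as exc:
        log.error("llama.cpp backend unavailable: %s", exc)
        raise
    log.debug("llama_cpp %s loaded", getattr(llama_cpp, "__version__", "?"))
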

# Backend env vars set from ``cfg.gpu_devices`` before the first llama_cpp
# import. Vulkan, CUDA, and ROCm each read their own; a user-set
# ``cfg.gpu_devices`` is applied to all four because the user is
# specifying their own indexes and opting in to all wheel flavors.
_GPU_VISIBLE_ENV_VARS = (
    "GGML_VK_VISIBLE_DEVICES",
    "CUDA_VISIBLE_DEVICES",
    "HIP_VISIBLE_DEVICES",
    "ROCR_VISIBLE_DEVICES",
)

# Subset of the above that the Vulkan autodetect applies to. CUDA and
# HIP/ROCm enumerate single-vendor adapter sets (NVIDIA-only,
# AMD-only); a Vulkan device index doesn't translate, so writing the
# autodetect result to ``CUDA_VISIBLE_DEVICES`` on a CUDA wheel +
# dual-GPU host would hide the only NVIDIA card and silently fall
# back to CPU.
_VULKAN_AUTODETECT_ENV_VARS = ("GGML_VK_VISIBLE_DEVICES",)


_VK_LOADER_LAYERS_DISABLE_ENV_VAR = "VK_LOADER_LAYERS_DISABLE"

# Layers with documented crashes against multi-VkDevice apps:
# https://github.com/ggml-org/llama.cpp/issues/18109 (RTSS / OBS / HudSight)
# https://github.com/ValveSoftware/steam-for-linux/issues/9120 (Steam overlay)
# https://alegruz.github.io/graphics/2025/03/22/galaxyoverlayvklayer-issue.html (Galaxy)
# Vendor-dispatch layers (NV_optimus, AMD_switchable_graphics, MESA_device_select)
# and user-opt-in overlays (MangoHud) are intentionally absent so GPU routing
# stays identical to what every other Vulkan app on the host sees.
_VK_LOADER_LAYERS_DISABLE_GLOBS: tuple[str, ...] = (
    "VK_LAYER_VALVE_steam_overlay*",
    "VK_LAYER_VALVE_steam_fossilize*",
    "VK_LAYER_RTSS*",
    "VK_LAYER_OBS_HOOK*",
    "VK_LAYER_HudSight*",
    "GalaxyOverlayVkLayer*",
    "VK_LAYER_GalaxyOverlay*",
    "VK_LAYER_DISCORD_overlay*",
    "VK_LAYER_EOS_Overlay*",
    "VK_LAYER_RESHADE*",
    "VK_LAYER_VKBASALT*",
)
_VK_LOADER_LAYERS_DISABLE_VALUE = ",".join(_VK_LOADER_LAYERS_DISABLE_GLOBS)


def _apply_gpu_device_env() -> None:
    """Apply ``cfg.gpu_devices`` (or autodetect) to backend visibility env vars.

    Must run before ``import llama_cpp``: the Vulkan / CUDA / ROCm
    runtimes snapshot their visible-device lists at library load time
    and ignore subsequent changes. Resolution order:

    1. Explicit env var (``GGML_VK_VISIBLE_DEVICES`` etc.) -- always
       wins, including when the user intentionally set it empty.
    2. ``cfg.gpu_devices`` -- the user's lilbee-level pin. Applied to
       every backend (the user is opting in and naming indexes that
       match their wheel's enumeration).
    3. Vulkan autodetection -- pick the highest-ranked Vulkan device
       (discrete > integrated). Applied **only** to
       ``GGML_VK_VISIBLE_DEVICES``; the Vulkan index doesn't translate
       to CUDA / HIP / ROCm enumeration order, so writing it there
       could mask the only visible NVIDIA / AMD card on a single-
       vendor wheel.

    Steps 2 and 3 only set vars that aren't already in the
    environment, so step 1 is preserved.
    """
    from lilbee.core.config import cfg
    from lilbee.providers.llama_cpp.gpu_select import (
        VulkanIcdEnvVar,
        autoselect_best_gpu_index,
        disable_conflicting_vulkan_icds,
    )

    # Suppress known-crashing third-party Vulkan overlay layers on
    # Windows + Linux. Must precede every subsequent vkCreateInstance
    # call (our own probe in autoselect plus llama.cpp's), otherwise the
    # overlay piggy-backs on the first instance and stays resident even
    # if disabled later. setdefault preserves a user-set
    # VK_LOADER_LAYERS_DISABLE, and the loader composes our globs with
    # the user's own ENABLE token per spec.
    if sys.platform == "win32" or sys.platform.startswith("linux"):
        os.environ.setdefault(_VK_LOADER_LAYERS_DISABLE_ENV_VAR, _VK_LOADER_LAYERS_DISABLE_VALUE)

    # Dual-vendor Vulkan crash mitigation runs first and unconditionally:
    # the loader loads every registered ICD at vkCreateInstance, before
    # GGML_VK_VISIBLE_DEVICES is consulted, so device pinning alone cannot
    # prevent a buggy second-vendor ICD from corrupting the heap.
    disable_glob = disable_conflicting_vulkan_icds()
    if disable_glob is not None:
        os.environ.setdefault(VulkanIcdEnvVar.LOADER_DRIVERS_DISABLE, disable_glob)
        log.info("Disabling conflicting Vulkan ICDs: %s", disable_glob)

    if cfg.gpu_devices:
        for name in _GPU_VISIBLE_ENV_VARS:
            os.environ.setdefault(name, cfg.gpu_devices)
        return
    autoselected = autoselect_best_gpu_index()
    if not autoselected:
        return
    for name in _VULKAN_AUTODETECT_ENV_VARS:
        os.environ.setdefault(name, autoselected)

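
# Illustrative sketch, not part of the original module: what each step of the
# resolution order above ends up writing, using made-up device indexes.
def _example_gpu_env_outcomes() -> None:
    # Step 1 -- user exported GGML_VK_VISIBLE_DEVICES=1: setdefault leaves it alone.
    # Step 2 -- cfg.gpu_devices == "0,1": every var in _GPU_VISIBLE_ENV_VARS
    #           defaults to "0,1" (CUDA_VISIBLE_DEVICES, HIP_VISIBLE_DEVICES, ...).
    # Step 3 -- nothing set and autodetect returns "1": only
    #           GGML_VK_VISIBLE_DEVICES defaults to "1"; CUDA / ROCm vars stay untouched.
    for name in _GPU_VISIBLE_ENV_VARS:
        log.debug("%s=%s", name, os.environ.get(name, "<unset>"))
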

def install_llama_log_handler() -> None:
    """Route llama.cpp logs through Python logging. Idempotent."""
    _dispatcher.install()


@contextmanager
def stderr_suppressed() -> Iterator[None]:
    """Redirect fd 2 to /dev/null for the duration of the block.

    Holds ``_STDERR_LOCK`` so concurrent fd-2 manipulation can't corrupt
    each other's saved-stderr value. Use this around the *full* embed or
    rerank loop, not per call: per-call wrapping serializes hundreds of
    syscalls + lock acquires inside the inner loop and starves the TUI
    threads of CPU.
    """
    with _STDERR_LOCK:
        devnull = os.open(os.devnull, os.O_WRONLY)
        old_stderr = os.dup(2)
        os.dup2(devnull, 2)
        try:
            yield
        finally:
            os.dup2(old_stderr, 2)
            os.close(devnull)
            os.close(old_stderr)

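
# Illustrative sketch, not part of the original module: wrap the whole loop, not
# each call. ``embedder`` and its ``embed`` method are hypothetical stand-ins
# for the real embed/rerank call sites.
def _example_embed_loop(embedder: Any, texts: list[str]) -> list[Any]:
    vectors: list[Any] = []
    with stderr_suppressed():   # one dup2 pair + one lock acquire for the whole batch
        for text in texts:
            vectors.append(embedder.embed(text))
    return vectors
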

def suppress_native_stderr(fn: Any, *args: Any, **kwargs: Any) -> Any:
    """Call *fn* with C-level stderr suppressed.

    Thin wrapper around :func:`stderr_suppressed` for one-shot call sites
    (Llama() construction, GGUF metadata read).
    """
    with stderr_suppressed():
        return fn(*args, **kwargs)
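

# Illustrative sketch, not part of the original module: the one-shot pattern the
# wrapper exists for (Llama construction with C-level stderr silenced). The
# model path is a made-up example.
def _example_one_shot_load() -> Any:
    llama_cpp = import_llama_cpp()
    return suppress_native_stderr(llama_cpp.Llama, model_path="/models/example.gguf")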