Coverage for src/lilbee/providers/llama_cpp/log_dispatch.py: 100%

119 statements  


1"""llama.cpp log dispatcher and native-stderr suppression. 

2 

3Routes llama.cpp's C-stream log messages through Python logging, coalescing 

4CONT continuation chunks until a newline. The pending buffer is owned by the 

5``_LogDispatcher`` singleton because it tracks request-scoped state for a 

6ctypes callback. 

7""" 

8 

9from __future__ import annotations 

10 

11import logging 

12import os 

13import sys 

14import threading 

15from collections.abc import Iterator 

16from contextlib import contextmanager 

17from dataclasses import dataclass 

18from typing import Any 

19 

20from lilbee.providers.base import ProviderError 

21 

22_llama_log = logging.getLogger("lilbee.llama_cpp") 

23log = logging.getLogger(__name__) 

24 

25# ggml.h log levels (not exposed by llama-cpp-python). 

26_GGML_LOG_LEVEL_INFO = 1 

27_GGML_LOG_LEVEL_WARN = 2 

28_GGML_LOG_LEVEL_ERROR = 3 

29_GGML_LOG_LEVEL_DEBUG = 4 

30_GGML_LOG_LEVEL_CONT = 5 

31 

32# WARN demotes to INFO so noisy auto-corrections stay silent at the default WARNING level. 

33_GGML_TO_PY_LEVEL = { 

34 _GGML_LOG_LEVEL_INFO: logging.DEBUG, 

35 _GGML_LOG_LEVEL_WARN: logging.INFO, 

36 _GGML_LOG_LEVEL_ERROR: logging.ERROR, 

37 _GGML_LOG_LEVEL_DEBUG: logging.DEBUG, 

38} 

39 

40# llama.cpp emits these at GGML_LOG_LEVEL_ERROR but they're advisory: the model 

41# still loads correctly. Demote to WARNING so users don't think the setup is 

42# broken. 

43_GGML_ERROR_SOFT_DEMOTE = ( 

44 "special_eos_id is not in special_eog_ids", 

45 "embeddings required but some input tokens were not marked as outputs", 

46 # 'n_ctx_seq (X) > n_ctx_train (Y)' -- our embed clamp prevents misuse. 

47 "n_ctx_seq", 

48 "tokenizer config may be incorrect", 

49 # Reported at ERROR every model load on pre-M5/A19 macs (i.e. most macs); 

50 # cosmetic. 

51 "tensor API disabled for pre-M5", 

52 # 'control-looking token: ... was not control-type'. Tokenizer-config 

53 # quirk; llama.cpp auto-corrects. 

54 "was not control-type", 

55 # 'special_eog_ids contains "<|tool_response>", removing "</s>"'. 

56 # Auto-correction of EOG list. 

57 "removing '</s>' token from EOG list", 

58 # Format-string bug in upstream tensor-name logging; harmless. 

59 "cannot properly format tensor name", 

60 # SWA full-size cache fallback for iSWA models; informational. 

61 "using full-size SWA cache", 

62) 

63 

64_STDERR_LOCK = threading.Lock() 

65 

66 

67def _resolve_ggml_level(ggml_level: int, text: str) -> int: 

68 """Translate ggml log level to Python, demoting known-advisory ERRORs to WARNING.""" 

69 py_level = _GGML_TO_PY_LEVEL.get(ggml_level, logging.DEBUG) 

70 if py_level == logging.ERROR and any(s in text for s in _GGML_ERROR_SOFT_DEMOTE): 

71 return logging.WARNING 

72 return py_level 

73 

74 
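
# Illustrative sketch, not part of the original module: how a few representative
# llama.cpp lines resolve under the mapping above. The message strings are
# made-up examples.
def _example_resolve_ggml_level() -> None:
    # An ERROR matching a soft-demote substring drops to WARNING.
    assert (
        _resolve_ggml_level(_GGML_LOG_LEVEL_ERROR, "n_ctx_seq (8192) > n_ctx_train (4096)")
        == logging.WARNING
    )
    # Any other ERROR stays an ERROR.
    assert _resolve_ggml_level(_GGML_LOG_LEVEL_ERROR, "failed to read tensor data") == logging.ERROR
    # WARN demotes to INFO, so it is quiet at the default WARNING level.
    assert _resolve_ggml_level(_GGML_LOG_LEVEL_WARN, "fixing malformed token") == logging.INFO
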

@dataclass
class _LogDispatchSnapshot:
    """Frozen snapshot of dispatcher state for test save/restore."""

    callback: Any
    installed: bool
    pending: dict[int, str]
    pending_level: int


class _LogDispatcher:
    """Owns the mutable state for routing llama.cpp logs through Python logging.

    All four pieces of state (callback handle, installed flag, pending
    buffer, pending-level) live here as instance fields so tests can
    snapshot / restore the whole dispatcher in one call instead of
    poking module globals directly.
    """

    def __init__(self) -> None:
        # ctypes does not retain a Python reference to the wrapped callback;
        # hanging it off the dispatcher keeps it alive for process lifetime.
        self.callback: Any = None
        self.installed: bool = False
        # Request-scoped buffer for a ctypes callback; not on Services because
        # the C side has no notion of dependency injection.
        self.pending: dict[int, str] = {}
        self.pending_level: int = _GGML_LOG_LEVEL_INFO

    def dispatch(self, level: int, text_bytes: bytes, _user_data: Any) -> None:
        """Dispatch one llama.cpp log message; CONT chunks coalesce on newline."""
        try:
            text = text_bytes.decode("utf-8", errors="replace") if text_bytes else ""
        except Exception:  # pragma: no cover
            return

        if level == _GGML_LOG_LEVEL_CONT:
            self.pending[0] = self.pending.get(0, "") + text
        else:
            if 0 in self.pending:
                buffered = self.pending.pop(0).rstrip()
                if buffered:
                    _llama_log.log(_resolve_ggml_level(self.pending_level, buffered), buffered)
            self.pending_level = level
            self.pending[0] = text

        if "\n" in self.pending.get(0, ""):
            full = self.pending.pop(0).rstrip()
            if full:
                _llama_log.log(_resolve_ggml_level(self.pending_level, full), full)

    def install(self) -> None:
        """Route llama.cpp logs through Python logging. Idempotent."""
        if self.installed:
            return
        llama_cpp = import_llama_cpp()
        self.callback = llama_cpp.llama_log_callback(self.dispatch)
        llama_cpp.llama_log_set(self.callback, None)
        self.installed = True

    def snapshot(self) -> _LogDispatchSnapshot:
        """Return a frozen snapshot of dispatcher state for test save/restore."""
        return _LogDispatchSnapshot(
            callback=self.callback,
            installed=self.installed,
            pending=dict(self.pending),
            pending_level=self.pending_level,
        )

    def restore(self, snap: _LogDispatchSnapshot) -> None:
        """Reset dispatcher state to *snap*."""
        self.callback = snap.callback
        self.installed = snap.installed
        self.pending = dict(snap.pending)
        self.pending_level = snap.pending_level

    def reset(self) -> None:
        """Clear dispatcher state (for tests that want a known-clean baseline)."""
        self.callback = None
        self.installed = False
        self.pending.clear()
        self.pending_level = _GGML_LOG_LEVEL_INFO

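
# Illustrative sketch, not part of the original module: how dispatch() coalesces
# CONT continuation chunks into a single log record once a newline arrives.
# The message text is a made-up example.
def _example_dispatch_coalescing() -> None:
    d = _LogDispatcher()
    d.dispatch(_GGML_LOG_LEVEL_INFO, b"load: tensor ", None)   # buffered, no newline yet
    d.dispatch(_GGML_LOG_LEVEL_CONT, b"blk.0.attn_q ", None)   # appended to the pending buffer
    d.dispatch(_GGML_LOG_LEVEL_CONT, b"loaded\n", None)        # newline -> one DEBUG record emitted
    assert d.pending == {}                                      # buffer flushed
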

# Process-lifetime singleton. Tests treat this as the dispatcher boundary.
_dispatcher = _LogDispatcher()

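
# Illustrative sketch, not part of the original module: the save/restore pattern
# snapshot()/restore() exist for. A test takes a snapshot, mutates the singleton
# freely, then puts everything back in one call.
def _example_snapshot_restore() -> None:
    snap = _dispatcher.snapshot()
    try:
        _dispatcher.reset()   # known-clean baseline for the test body
        _dispatcher.dispatch(_GGML_LOG_LEVEL_WARN, b"example warning\n", None)
    finally:
        _dispatcher.restore(snap)   # singleton back exactly as it was
    assert _dispatcher.snapshot() == snap   # dataclass equality: state fully restored
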

_MISSING_VULKAN_HINT = (
    "lilbee's Linux wheel currently links against libvulkan.so.1 at runtime. "
    "Install your distro's Vulkan loader package (Arch: vulkan-icd-loader, "
    "Fedora: vulkan-loader, Debian/Ubuntu: libvulkan1) and try again. See "
    "https://github.com/tobocop2/lilbee#linux-runtime-requirements."
)


def import_llama_cpp() -> Any:
    """Import ``llama_cpp`` with an actionable error when libvulkan is missing.

    Bare Arch / Fedora installs lack the Vulkan loader and the raw
    ``OSError: libvulkan.so.1: cannot open shared object file`` is opaque.
    Re-raise as :class:`ProviderError` with install guidance. Idempotent
    after the first successful import (Python caches it in ``sys.modules``),
    so callers can sprinkle it at every llama_cpp entry point cheaply.
    """
    _apply_gpu_device_env()
    try:
        import llama_cpp

        return llama_cpp
    except OSError as exc:
        if "libvulkan" in str(exc):
            raise ProviderError(_MISSING_VULKAN_HINT, provider="llama-cpp") from exc
        raise

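
# Illustrative sketch, not part of the original module: a typical call site. The
# import is repeated cheaply at each llama_cpp entry point; on a host without
# the Vulkan loader the ProviderError carries the install hint above.
def _example_entry_point() -> None:
    try:
        llama_cpp = import_llama_cpp()
    except ProviderError as exc:
        log.error("llama.cpp backend unavailable: %s", exc)
        raise
    log.debug("llama_cpp %s loaded", getattr(llama_cpp, "__version__", "?"))
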

# Backend env vars set from ``cfg.gpu_devices`` before the first llama_cpp
# import. Vulkan, CUDA, and ROCm each read their own; a user-set
# ``cfg.gpu_devices`` is applied to all four because the user is
# specifying their own indexes and opting in to all wheel flavors.
_GPU_VISIBLE_ENV_VARS = (
    "GGML_VK_VISIBLE_DEVICES",
    "CUDA_VISIBLE_DEVICES",
    "HIP_VISIBLE_DEVICES",
    "ROCR_VISIBLE_DEVICES",
)

# Subset of the above that the Vulkan autodetect applies to. CUDA and
# HIP/ROCm enumerate single-vendor adapter sets (NVIDIA-only,
# AMD-only); a Vulkan device index doesn't translate, so writing the
# autodetect result to ``CUDA_VISIBLE_DEVICES`` on a CUDA wheel +
# dual-GPU host would hide the only NVIDIA card and silently fall
# back to CPU.
_VULKAN_AUTODETECT_ENV_VARS = ("GGML_VK_VISIBLE_DEVICES",)


_VK_LOADER_LAYERS_DISABLE_ENV_VAR = "VK_LOADER_LAYERS_DISABLE"

# Layers with documented crashes against multi-VkDevice apps:
# https://github.com/ggml-org/llama.cpp/issues/18109 (RTSS / OBS / HudSight)
# https://github.com/ValveSoftware/steam-for-linux/issues/9120 (Steam overlay)
# https://alegruz.github.io/graphics/2025/03/22/galaxyoverlayvklayer-issue.html (Galaxy)
# Vendor-dispatch layers (NV_optimus, AMD_switchable_graphics, MESA_device_select)
# and user-opt-in overlays (MangoHud) are intentionally absent so GPU routing
# stays identical to what every other Vulkan app on the host sees.
_VK_LOADER_LAYERS_DISABLE_GLOBS: tuple[str, ...] = (
    "VK_LAYER_VALVE_steam_overlay*",
    "VK_LAYER_VALVE_steam_fossilize*",
    "VK_LAYER_RTSS*",
    "VK_LAYER_OBS_HOOK*",
    "VK_LAYER_HudSight*",
    "GalaxyOverlayVkLayer*",
    "VK_LAYER_GalaxyOverlay*",
    "VK_LAYER_DISCORD_overlay*",
    "VK_LAYER_EOS_Overlay*",
    "VK_LAYER_RESHADE*",
    "VK_LAYER_VKBASALT*",
)
_VK_LOADER_LAYERS_DISABLE_VALUE = ",".join(_VK_LOADER_LAYERS_DISABLE_GLOBS)


def _apply_gpu_device_env() -> None:
    """Apply ``cfg.gpu_devices`` (or autodetect) to backend visibility env vars.

    Must run before ``import llama_cpp``: the Vulkan / CUDA / ROCm
    runtimes snapshot their visible-device lists at library load time
    and ignore subsequent changes. Resolution order:

    1. Explicit env var (``GGML_VK_VISIBLE_DEVICES`` etc.) -- always
       wins, including when the user intentionally set it empty.
    2. ``cfg.gpu_devices`` -- the user's lilbee-level pin. Applied to
       every backend (the user is opting in and naming indexes that
       match their wheel's enumeration).
    3. Vulkan autodetection -- pick the highest-ranked Vulkan device
       (discrete > integrated). Applied **only** to
       ``GGML_VK_VISIBLE_DEVICES``; the Vulkan index doesn't translate
       to CUDA / HIP / ROCm enumeration order, so writing it there
       could mask the only visible NVIDIA / AMD card on a single-
       vendor wheel.

    Steps 2 and 3 only set vars that aren't already in the
    environment, so step 1 is preserved.
    """
    from lilbee.core.config import cfg
    from lilbee.providers.llama_cpp.gpu_select import (
        VulkanIcdEnvVar,
        autoselect_best_gpu_index,
        disable_conflicting_vulkan_icds,
    )

    # Suppress known-crashing third-party Vulkan overlay layers on
    # Windows + Linux. Must precede every subsequent vkCreateInstance
    # call (our own probe in autoselect plus llama.cpp's), otherwise the
    # overlay piggy-backs on the first instance and stays resident even
    # if disabled later. setdefault preserves a user-set
    # VK_LOADER_LAYERS_DISABLE, and the loader composes our globs with
    # the user's own ENABLE token per spec.
    if sys.platform == "win32" or sys.platform.startswith("linux"):
        os.environ.setdefault(_VK_LOADER_LAYERS_DISABLE_ENV_VAR, _VK_LOADER_LAYERS_DISABLE_VALUE)

    # Dual-vendor Vulkan crash mitigation runs first and unconditionally:
    # the loader loads every registered ICD at vkCreateInstance, before
    # GGML_VK_VISIBLE_DEVICES is consulted, so device pinning alone cannot
    # prevent a buggy second-vendor ICD from corrupting the heap.
    disable_glob = disable_conflicting_vulkan_icds()
    if disable_glob is not None:
        os.environ.setdefault(VulkanIcdEnvVar.LOADER_DRIVERS_DISABLE, disable_glob)
        log.info("Disabling conflicting Vulkan ICDs: %s", disable_glob)

    if cfg.gpu_devices:
        for name in _GPU_VISIBLE_ENV_VARS:
            os.environ.setdefault(name, cfg.gpu_devices)
        return
    autoselected = autoselect_best_gpu_index()
    if not autoselected:
        return
    for name in _VULKAN_AUTODETECT_ENV_VARS:
        os.environ.setdefault(name, autoselected)

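
# Illustrative sketch, not part of the original module: what each step of the
# resolution order above ends up writing, using made-up device indexes.
def _example_gpu_env_outcomes() -> None:
    # Step 1 -- user exported GGML_VK_VISIBLE_DEVICES=1: setdefault leaves it alone.
    # Step 2 -- cfg.gpu_devices == "0,1": every var in _GPU_VISIBLE_ENV_VARS
    #           defaults to "0,1" (CUDA_VISIBLE_DEVICES, HIP_VISIBLE_DEVICES, ...).
    # Step 3 -- nothing set and autodetect returns "1": only
    #           GGML_VK_VISIBLE_DEVICES defaults to "1"; CUDA / ROCm vars stay untouched.
    for name in _GPU_VISIBLE_ENV_VARS:
        log.debug("%s=%s", name, os.environ.get(name, "<unset>"))
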

def install_llama_log_handler() -> None:
    """Route llama.cpp logs through Python logging. Idempotent."""
    _dispatcher.install()


@contextmanager
def stderr_suppressed() -> Iterator[None]:
    """Redirect fd 2 to /dev/null for the duration of the block.

    Holds ``_STDERR_LOCK`` so concurrent fd-2 manipulation can't corrupt
    each other's saved-stderr value. Use this around the *full* embed or
    rerank loop, not per call: per-call wrapping serializes hundreds of
    syscalls + lock acquires inside the inner loop and starves the TUI
    threads of CPU.
    """
    with _STDERR_LOCK:
        devnull = os.open(os.devnull, os.O_WRONLY)
        old_stderr = os.dup(2)
        os.dup2(devnull, 2)
        try:
            yield
        finally:
            os.dup2(old_stderr, 2)
            os.close(devnull)
            os.close(old_stderr)

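
# Illustrative sketch, not part of the original module: wrap the whole loop, not
# each call. ``embedder`` and its ``embed`` method are hypothetical stand-ins
# for the real embed/rerank call sites.
def _example_embed_loop(embedder: Any, texts: list[str]) -> list[Any]:
    vectors: list[Any] = []
    with stderr_suppressed():   # one dup2 pair + one lock acquire for the whole batch
        for text in texts:
            vectors.append(embedder.embed(text))
    return vectors
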

def suppress_native_stderr(fn: Any, *args: Any, **kwargs: Any) -> Any:
    """Call *fn* with C-level stderr suppressed.

    Thin wrapper around :func:`stderr_suppressed` for one-shot call sites
    (Llama() construction, GGUF metadata read).
    """
    with stderr_suppressed():
        return fn(*args, **kwargs)
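

# Illustrative sketch, not part of the original module: the one-shot pattern the
# wrapper exists for (Llama construction with C-level stderr silenced). The
# model path is a made-up example.
def _example_one_shot_load() -> Any:
    llama_cpp = import_llama_cpp()
    return suppress_native_stderr(llama_cpp.Llama, model_path="/models/example.gguf")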