1"""llama.cpp log dispatcher and native-stderr suppression.
3Routes llama.cpp's C-stream log messages through Python logging, coalescing
4CONT continuation chunks until a newline. The pending buffer is owned by the
5``_LogDispatcher`` singleton because it tracks request-scoped state for a
6ctypes callback.
7"""
9from __future__ import annotations
11import logging
12import os
13import sys
14import threading
15from collections.abc import Iterator
16from contextlib import contextmanager
17from dataclasses import dataclass
18from typing import Any
20from lilbee.providers.base import ProviderError
22_llama_log = logging.getLogger("lilbee.llama_cpp")
23log = logging.getLogger(__name__)
25# ggml.h log levels (not exposed by llama-cpp-python).
26_GGML_LOG_LEVEL_INFO = 1
27_GGML_LOG_LEVEL_WARN = 2
28_GGML_LOG_LEVEL_ERROR = 3
29_GGML_LOG_LEVEL_DEBUG = 4
30_GGML_LOG_LEVEL_CONT = 5
32# WARN demotes to INFO so noisy auto-corrections stay silent at the default WARNING level.
33_GGML_TO_PY_LEVEL = {
34 _GGML_LOG_LEVEL_INFO: logging.DEBUG,
35 _GGML_LOG_LEVEL_WARN: logging.INFO,
36 _GGML_LOG_LEVEL_ERROR: logging.ERROR,
37 _GGML_LOG_LEVEL_DEBUG: logging.DEBUG,
38}
40# llama.cpp emits these at GGML_LOG_LEVEL_ERROR but they're advisory: the model
41# still loads correctly. Demote to WARNING so users don't think the setup is
42# broken.
43_GGML_ERROR_SOFT_DEMOTE = (
44 "special_eos_id is not in special_eog_ids",
45 "embeddings required but some input tokens were not marked as outputs",
46 # 'n_ctx_seq (X) > n_ctx_train (Y)' -- our embed clamp prevents misuse.
47 "n_ctx_seq",
48 "tokenizer config may be incorrect",
49 # Reported at ERROR every model load on pre-M5/A19 macs (i.e. most macs);
50 # cosmetic.
51 "tensor API disabled for pre-M5",
52 # 'control-looking token: ... was not control-type'. Tokenizer-config
53 # quirk; llama.cpp auto-corrects.
54 "was not control-type",
55 # 'special_eog_ids contains "<|tool_response>", removing "</s>"'.
56 # Auto-correction of EOG list.
57 "removing '</s>' token from EOG list",
58 # Format-string bug in upstream tensor-name logging; harmless.
59 "cannot properly format tensor name",
60 # SWA full-size cache fallback for iSWA models; informational.
61 "using full-size SWA cache",
62)
64_STDERR_LOCK = threading.Lock()
67def _resolve_ggml_level(ggml_level: int, text: str) -> int:
68 """Translate ggml log level to Python, demoting known-advisory ERRORs to WARNING."""
69 py_level = _GGML_TO_PY_LEVEL.get(ggml_level, logging.DEBUG)
70 if py_level == logging.ERROR and any(s in text for s in _GGML_ERROR_SOFT_DEMOTE):
71 return logging.WARNING
72 return py_level
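
# Level-resolution sketch (illustrative; message text and numbers are made up,
# numeric results are the stdlib logging constants): a known-advisory ERROR
# demotes to WARNING, a WARN demotes to INFO.
#
#     _resolve_ggml_level(_GGML_LOG_LEVEL_ERROR, "n_ctx_seq (8192) > n_ctx_train (4096)")
#     # -> logging.WARNING (30): "n_ctx_seq" is in _GGML_ERROR_SOFT_DEMOTE
#     _resolve_ggml_level(_GGML_LOG_LEVEL_WARN, "some harmless auto-correction")
#     # -> logging.INFO (20)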


@dataclass
class _LogDispatchSnapshot:
    """Frozen snapshot of dispatcher state for test save/restore."""

    callback: Any
    installed: bool
    pending: dict[int, str]
    pending_level: int


class _LogDispatcher:
    """Owns the mutable state for routing llama.cpp logs through Python logging.

    All four pieces of state (callback handle, installed flag, pending
    buffer, pending-level) live here as instance fields so tests can
    snapshot / restore the whole dispatcher in one call instead of
    poking module globals directly.
    """

    def __init__(self) -> None:
        # ctypes does not retain a Python reference to the wrapped callback;
        # hanging it off the dispatcher keeps it alive for process lifetime.
        self.callback: Any = None
        self.installed: bool = False
        # Request-scoped buffer for a ctypes callback; not on Services because
        # the C side has no notion of dependency injection.
        self.pending: dict[int, str] = {}
        self.pending_level: int = _GGML_LOG_LEVEL_INFO

    def dispatch(self, level: int, text_bytes: bytes, _user_data: Any) -> None:
        """Dispatch one llama.cpp log message; CONT chunks coalesce on newline."""
        try:
            text = text_bytes.decode("utf-8", errors="replace") if text_bytes else ""
        except Exception:  # pragma: no cover
            return

        if level == _GGML_LOG_LEVEL_CONT:
            self.pending[0] = self.pending.get(0, "") + text
        else:
            if 0 in self.pending:
                buffered = self.pending.pop(0).rstrip()
                if buffered:
                    _llama_log.log(_resolve_ggml_level(self.pending_level, buffered), buffered)
            self.pending_level = level
            self.pending[0] = text

        if "\n" in self.pending.get(0, ""):
            full = self.pending.pop(0).rstrip()
            if full:
                _llama_log.log(_resolve_ggml_level(self.pending_level, full), full)

    def install(self) -> None:
        """Route llama.cpp logs through Python logging. Idempotent."""
        if self.installed:
            return
        llama_cpp = import_llama_cpp()
        self.callback = llama_cpp.llama_log_callback(self.dispatch)
        llama_cpp.llama_log_set(self.callback, None)
        self.installed = True

    def snapshot(self) -> _LogDispatchSnapshot:
        """Return a frozen snapshot of dispatcher state for test save/restore."""
        return _LogDispatchSnapshot(
            callback=self.callback,
            installed=self.installed,
            pending=dict(self.pending),
            pending_level=self.pending_level,
        )

    def restore(self, snap: _LogDispatchSnapshot) -> None:
        """Reset dispatcher state to *snap*."""
        self.callback = snap.callback
        self.installed = snap.installed
        self.pending = dict(snap.pending)
        self.pending_level = snap.pending_level

    def reset(self) -> None:
        """Clear dispatcher state (for tests that want a known-clean baseline)."""
        self.callback = None
        self.installed = False
        self.pending.clear()
        self.pending_level = _GGML_LOG_LEVEL_INFO
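
# Coalescing sketch (illustrative, not executed at import): three callback
# invocations that form a single Python log record, assuming llama.cpp splits
# one line across CONT chunks.
#
#     d = _LogDispatcher()
#     d.dispatch(_GGML_LOG_LEVEL_INFO, b"load: ", None)
#     d.dispatch(_GGML_LOG_LEVEL_CONT, b"model loaded", None)
#     d.dispatch(_GGML_LOG_LEVEL_CONT, b"\n", None)
#     # -> one DEBUG record "load: model loaded" on the "lilbee.llama_cpp" logger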


# Process-lifetime singleton. Tests treat this as the dispatcher boundary.
_dispatcher = _LogDispatcher()
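
# Test-fixture sketch (illustrative; assumes pytest as the test runner): save and
# restore the singleton around a test instead of poking module globals.
#
#     @pytest.fixture
#     def clean_dispatcher():
#         snap = _dispatcher.snapshot()
#         _dispatcher.reset()
#         yield _dispatcher
#         _dispatcher.restore(snap)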


_MISSING_VULKAN_HINT = (
    "lilbee's Linux wheel currently links against libvulkan.so.1 at runtime. "
    "Install your distro's Vulkan loader package (Arch: vulkan-icd-loader, "
    "Fedora: vulkan-loader, Debian/Ubuntu: libvulkan1) and try again. See "
    "https://github.com/tobocop2/lilbee#linux-runtime-requirements."
)


def import_llama_cpp() -> Any:
    """Import ``llama_cpp`` with an actionable error when libvulkan is missing.

    Bare Arch / Fedora installs lack the Vulkan loader and the raw
    ``OSError: libvulkan.so.1: cannot open shared object file`` is opaque.
    Re-raise as :class:`ProviderError` with install guidance. Idempotent
    after the first successful import (Python caches it in ``sys.modules``),
    so callers can sprinkle it at every llama_cpp entry point cheaply.
    """
    _apply_gpu_device_env()
    try:
        import llama_cpp

        return llama_cpp
    except OSError as exc:
        if "libvulkan" in str(exc):
            raise ProviderError(_MISSING_VULKAN_HINT, provider="llama-cpp") from exc
        raise
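
# Call-site sketch (illustrative): import lazily at each llama_cpp entry point
# and let the ProviderError carry the libvulkan install guidance.
#
#     try:
#         llama_cpp = import_llama_cpp()
#     except ProviderError as exc:
#         ...  # surface str(exc) to the user; it names the distro package to install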


# Backend env vars set from ``cfg.gpu_devices`` before the first llama_cpp
# import. Vulkan, CUDA, and ROCm each read their own; a user-set
# ``cfg.gpu_devices`` is applied to all four because the user is
# specifying their own indexes and opting in to all wheel flavors.
_GPU_VISIBLE_ENV_VARS = (
    "GGML_VK_VISIBLE_DEVICES",
    "CUDA_VISIBLE_DEVICES",
    "HIP_VISIBLE_DEVICES",
    "ROCR_VISIBLE_DEVICES",
)

# Subset of the above that the Vulkan autodetect applies to. CUDA and
# HIP/ROCm enumerate single-vendor adapter sets (NVIDIA-only,
# AMD-only); a Vulkan device index doesn't translate, so writing the
# autodetect result to ``CUDA_VISIBLE_DEVICES`` on a CUDA wheel +
# dual-GPU host would hide the only NVIDIA card and silently fall
# back to CPU.
_VULKAN_AUTODETECT_ENV_VARS = ("GGML_VK_VISIBLE_DEVICES",)


_VK_LOADER_LAYERS_DISABLE_ENV_VAR = "VK_LOADER_LAYERS_DISABLE"

# Layers with documented crashes against multi-VkDevice apps:
# https://github.com/ggml-org/llama.cpp/issues/18109 (RTSS / OBS / HudSight)
# https://github.com/ValveSoftware/steam-for-linux/issues/9120 (Steam overlay)
# https://alegruz.github.io/graphics/2025/03/22/galaxyoverlayvklayer-issue.html (Galaxy)
# Vendor-dispatch layers (NV_optimus, AMD_switchable_graphics, MESA_device_select)
# and user-opt-in overlays (MangoHud) are intentionally absent so GPU routing
# stays identical to what every other Vulkan app on the host sees.
_VK_LOADER_LAYERS_DISABLE_GLOBS: tuple[str, ...] = (
    "VK_LAYER_VALVE_steam_overlay*",
    "VK_LAYER_VALVE_steam_fossilize*",
    "VK_LAYER_RTSS*",
    "VK_LAYER_OBS_HOOK*",
    "VK_LAYER_HudSight*",
    "GalaxyOverlayVkLayer*",
    "VK_LAYER_GalaxyOverlay*",
    "VK_LAYER_DISCORD_overlay*",
    "VK_LAYER_EOS_Overlay*",
    "VK_LAYER_RESHADE*",
    "VK_LAYER_VKBASALT*",
)
_VK_LOADER_LAYERS_DISABLE_VALUE = ",".join(_VK_LOADER_LAYERS_DISABLE_GLOBS)


def _apply_gpu_device_env() -> None:
    """Apply ``cfg.gpu_devices`` (or autodetect) to backend visibility env vars.

    Must run before ``import llama_cpp``: the Vulkan / CUDA / ROCm
    runtimes snapshot their visible-device lists at library load time
    and ignore subsequent changes. Resolution order:

    1. Explicit env var (``GGML_VK_VISIBLE_DEVICES`` etc.) -- always
       wins, including when the user intentionally set it empty.
    2. ``cfg.gpu_devices`` -- the user's lilbee-level pin. Applied to
       every backend (the user is opting in and naming indexes that
       match their wheel's enumeration).
    3. Vulkan autodetection -- pick the highest-ranked Vulkan device
       (discrete > integrated). Applied **only** to
       ``GGML_VK_VISIBLE_DEVICES``; the Vulkan index doesn't translate
       to CUDA / HIP / ROCm enumeration order, so writing it there
       could mask the only visible NVIDIA / AMD card on a single-
       vendor wheel.

    Steps 2 and 3 only set vars that aren't already in the
    environment, so step 1 is preserved.
    """
    from lilbee.core.config import cfg
    from lilbee.providers.llama_cpp.gpu_select import (
        VulkanIcdEnvVar,
        autoselect_best_gpu_index,
        disable_conflicting_vulkan_icds,
    )

    # Suppress known-crashing third-party Vulkan overlay layers on
    # Windows + Linux. Must precede every subsequent vkCreateInstance
    # call (our own probe in autoselect plus llama.cpp's), otherwise the
    # overlay piggy-backs on the first instance and stays resident even
    # if disabled later. setdefault preserves a user-set
    # VK_LOADER_LAYERS_DISABLE, and the loader composes our globs with
    # the user's own ENABLE token per spec.
    if sys.platform == "win32" or sys.platform.startswith("linux"):
        os.environ.setdefault(_VK_LOADER_LAYERS_DISABLE_ENV_VAR, _VK_LOADER_LAYERS_DISABLE_VALUE)

    # Dual-vendor Vulkan crash mitigation runs first and unconditionally:
    # the loader loads every registered ICD at vkCreateInstance, before
    # GGML_VK_VISIBLE_DEVICES is consulted, so device pinning alone cannot
    # prevent a buggy second-vendor ICD from corrupting the heap.
    disable_glob = disable_conflicting_vulkan_icds()
    if disable_glob is not None:
        os.environ.setdefault(VulkanIcdEnvVar.LOADER_DRIVERS_DISABLE, disable_glob)
        log.info("Disabling conflicting Vulkan ICDs: %s", disable_glob)

    if cfg.gpu_devices:
        for name in _GPU_VISIBLE_ENV_VARS:
            os.environ.setdefault(name, cfg.gpu_devices)
        return
    autoselected = autoselect_best_gpu_index()
    if not autoselected:
        return
    for name in _VULKAN_AUTODETECT_ENV_VARS:
        os.environ.setdefault(name, autoselected)
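
# Resolution-order sketch (illustrative values, not executed):
#
#     GGML_VK_VISIBLE_DEVICES=1 lilbee ...   # step 1: explicit env var wins, left untouched
#     cfg.gpu_devices = "0"                  # step 2: written to all four backend vars
#     (neither set)                          # step 3: autodetect fills GGML_VK_VISIBLE_DEVICES only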


def install_llama_log_handler() -> None:
    """Route llama.cpp logs through Python logging. Idempotent."""
    _dispatcher.install()
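
# Usage sketch (illustrative): install once at startup; the "lilbee.llama_cpp"
# logger's level then controls what reaches the user (soft-demoted advisories
# land at WARNING, routine load chatter at DEBUG/INFO).
#
#     install_llama_log_handler()
#     logging.getLogger("lilbee.llama_cpp").setLevel(logging.WARNING)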


@contextmanager
def stderr_suppressed() -> Iterator[None]:
    """Redirect fd 2 to /dev/null for the duration of the block.

    Holds ``_STDERR_LOCK`` so concurrent fd-2 manipulation can't corrupt
    each other's saved-stderr value. Use this around the *full* embed or
    rerank loop, not per call: per-call wrapping serializes hundreds of
    syscalls + lock acquires inside the inner loop and starves the TUI
    threads of CPU.
    """
    with _STDERR_LOCK:
        devnull = os.open(os.devnull, os.O_WRONLY)
        old_stderr = os.dup(2)
        os.dup2(devnull, 2)
        try:
            yield
        finally:
            os.dup2(old_stderr, 2)
            os.close(devnull)
            os.close(old_stderr)
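
# Usage sketch (illustrative; ``model`` and ``batches`` are placeholder names):
# wrap the whole loop once rather than each call.
#
#     with stderr_suppressed():
#         for batch in batches:
#             vectors.extend(model.embed(batch))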


def suppress_native_stderr(fn: Any, *args: Any, **kwargs: Any) -> Any:
    """Call *fn* with C-level stderr suppressed.

    Thin wrapper around :func:`stderr_suppressed` for one-shot call sites
    (Llama() construction, GGUF metadata read).
    """
    with stderr_suppressed():
        return fn(*args, **kwargs)
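
# One-shot sketch (illustrative; ``read_gguf_metadata`` and ``model_path`` are
# placeholder names):
#
#     metadata = suppress_native_stderr(read_gguf_metadata, model_path)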