Coverage for src/lilbee/providers/mtmd_backend.py: 100% (69 statements)
1"""Vision OCR loader that drives llama.cpp's mtmd pipeline with the GGUF's
2own chat template, so there's no projector-type-to-handler lookup table.
3"""
5from __future__ import annotations
7import logging
8from pathlib import Path
9from typing import Any
11from gguf import GGUFReader
13from lilbee.core.config import cfg
14from lilbee.providers.llama_cpp.abort_signal import abort_callback
15from lilbee.providers.llama_cpp.gguf_meta import (
16 find_mmproj_for_model,
17 read_gguf_metadata,
18 train_ctx_from_meta,
19)
20from lilbee.providers.llama_cpp.log_dispatch import (
21 import_llama_cpp,
22 install_llama_log_handler,
23 suppress_native_stderr,
24)
26log = logging.getLogger(__name__)


# Image-placeholder tokens seen in GGUF chat templates. The upstream
# mtmd pipeline substitutes image URLs with mtmd's media marker, so
# these get rewritten to {{ content.image_url.url }} before rendering.
# Case matters: GGUF templates are machine-emitted and stable, so a
# case-insensitive replace would risk corrupting unrelated Jinja
# identifiers.
_GGUF_IMAGE_TOKENS: tuple[str, ...] = (
    "<|image_pad|>",
    "<image>",
    "<IMAGE>",
    "<__media__>",
    "<__image__>",
)
_IMAGE_URL_JINJA = "{{ content.image_url.url }}"

_TOKENIZER_CHAT_TEMPLATE_KEY = "tokenizer.chat_template"

_VISION_FALLBACK_N_CTX = 4096
"""n_ctx for a vision load when the GGUF has no ``context_length`` in metadata.

Most vision GGUFs report their training context (typical values: 4096, 8192,
32768); this covers the rare missing/unreadable-metadata case so the loader
still gets a sensible explicit n_ctx.
"""


def read_chat_template(model_path: Path) -> str | None:
    """Return the Jinja chat template embedded in a GGUF model, or None."""
    try:
        reader = GGUFReader(str(model_path))
        field = reader.get_field(_TOKENIZER_CHAT_TEMPLATE_KEY)
    except (OSError, ValueError, IndexError, KeyError):
        log.debug("Failed to read chat template from %s", model_path, exc_info=True)
        return None
    if field is None:
        return None
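    # ``GGUFReader`` keeps a string field's payload as a byte array in
    # ``field.parts``; ``field.data[0]`` indexes the value part, so the line
    # below decodes the raw template text.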
    return bytes(field.parts[field.data[0]]).decode("utf-8", errors="replace")


def adapt_gguf_template_for_mtmd(template: str) -> str:
    """Rewrite known image-placeholder tokens to ``{{ content.image_url.url }}``."""
    for token in _GGUF_IMAGE_TOKENS:
        if token in template:
            template = template.replace(token, _IMAGE_URL_JINJA)
    return template
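
# Illustration (hypothetical fragment, not taken from any specific GGUF): a
# template that emits the placeholder literally, e.g.
#
#     <|vision_start|><|image_pad|><|vision_end|>
#
# comes out of adapt_gguf_template_for_mtmd() as
#
#     <|vision_start|>{{ content.image_url.url }}<|vision_end|>
#
# so the handler can splice each message's image URL in at render time.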


def build_vision_chat_handler(model_path: Path, mmproj_path: Path) -> Any:
    """Return the mtmd chat handler configured with the GGUF's embedded template.

    ``DEFAULT_SYSTEM_MESSAGE`` is set to ``None`` so no stray system turn
    is injected. Falls back to the upstream default template when the
    GGUF has no ``tokenizer.chat_template``.
    """
    # Surface the libvulkan-missing hint before submodule import, since
    # importing llama_cpp.llama_chat_format triggers the parent package's
    # native loader as a side effect.
    import_llama_cpp()
    from llama_cpp.llama_chat_format import Llava15ChatHandler

    # Defined per call so each loaded model binds its own ``CHAT_FORMAT``
    # (set below) to a fresh class; hoisting this to module scope would let
    # one load's template leak into the next (a model with no embedded
    # template would inherit whatever the previous load set).
    class _GgufTemplateChatHandler(Llava15ChatHandler):
        DEFAULT_SYSTEM_MESSAGE = None

    handler_cls: type[Llava15ChatHandler] = _GgufTemplateChatHandler

    template = read_chat_template(model_path)
    if template is not None:
        handler_cls.CHAT_FORMAT = adapt_gguf_template_for_mtmd(template)
        log.info(
            "Vision chat handler: using GGUF-embedded template (%d chars) from %s",
            len(template),
            model_path.name,
        )
    else:
        log.info(
            "Vision chat handler: no GGUF-embedded chat template for %s; using upstream default",
            model_path.name,
        )

    return handler_cls(str(mmproj_path), verbose=False)


def load_vision_llama(
    model_path: Path,
    mmproj_path: Path | None = None,
    *,
    abort_callback_override: Any = None,
) -> Any:
    """Load a vision-capable ``Llama`` using the GGUF-templated chat handler.

    ``abort_callback_override`` lets pool workers bind a callback that
    reads the worker's shared ``mp.Value`` abort flag.
    """
    Llama = import_llama_cpp().Llama  # noqa: N806 # heavy native lib; keep import lazy

    install_llama_log_handler()
    if mmproj_path is None:
        mmproj_path = find_mmproj_for_model(model_path)

    chat_handler = build_vision_chat_handler(model_path, mmproj_path)

    import os

    # llama-cpp-python defaults n_threads to ~cpu_count()//2 which leaves the
    # GPU starved on prompt-eval work (image projection through the vision
    # adapter is CPU-bound on the encode side even with all layers on GPU).
    # Ollama runs full-core. Match that here for perf parity.
    n_threads = os.cpu_count() or 4
    kwargs: dict[str, Any] = {
        "model_path": str(model_path),
        "chat_handler": chat_handler,
        "verbose": False,
        "n_gpu_layers": -1,
        "n_ctx": _resolve_vision_n_ctx(model_path),
        "n_threads": n_threads,
        "n_threads_batch": n_threads,
    }
    if cfg.main_gpu is not None:
        kwargs["main_gpu"] = cfg.main_gpu
    if abort_callback_override is not None:
        kwargs["abort_callback"] = abort_callback_override
    else:
        kwargs.setdefault("abort_callback", abort_callback)

    llama = suppress_native_stderr(Llama, **kwargs)
    metadata = getattr(llama, "metadata", {}) or {}
    n_ctx_fn = getattr(llama, "n_ctx", None)
    n_ctx = n_ctx_fn() if callable(n_ctx_fn) else "?"
    log.info(
        "Vision model loaded: model=%s mmproj=%s n_ctx=%s arch=%s",
        model_path.name,
        mmproj_path.name,
        n_ctx,
        metadata.get("general.architecture", "?"),
    )
    return llama
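
# Hypothetical sketch of how a pool worker might bind the override (the shared
# flag and worker wiring live elsewhere; the callback shape shown here is an
# assumption, not part of this module):
#
#     import multiprocessing as mp
#
#     abort_flag = mp.Value("b", False)
#
#     def _worker_abort_cb(*_args) -> bool:
#         # Polled during eval; returning True asks llama.cpp to abort the call.
#         return bool(abort_flag.value)
#
#     llama = load_vision_llama(model_path, abort_callback_override=_worker_abort_cb)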


def _resolve_vision_n_ctx(model_path: Path) -> int:
    """Pick n_ctx for a vision load using the model's training context.

    Reads ``<arch>.context_length`` from the GGUF metadata and uses it
    directly. The chat-tuned ``cfg.num_ctx`` is not propagated: a vision pass
    packs image-token embeddings plus the prompt (often hundreds to a few
    thousand tokens per page), and clamping to a small chat ctx truncates OCR
    output. An explicit value (rather than 0) keeps the OOM-retry path
    working since ``_halve_ctx_for_retry`` cannot bisect from 0.
    """
    try:
        meta = read_gguf_metadata(model_path)
    except Exception:
        log.debug("read_gguf_metadata failed for vision %s", model_path, exc_info=True)
        meta = None
    return train_ctx_from_meta(meta, fallback=_VISION_FALLBACK_N_CTX, model_path=model_path)
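

# A minimal usage sketch (paths, prompt, and image data are hypothetical; the
# OpenAI-style multimodal message shape is what llama-cpp-python's Llava-family
# chat handlers accept):
#
#     llama = load_vision_llama(Path("models/some-vision-model-q4_k_m.gguf"))
#     result = llama.create_chat_completion(
#         messages=[
#             {
#                 "role": "user",
#                 "content": [
#                     {"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}},
#                     {"type": "text", "text": "Transcribe the text on this page."},
#                 ],
#             }
#         ],
#     )
#     print(result["choices"][0]["message"]["content"])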