Coverage for src/lilbee/providers/mtmd_backend.py: 100%

69 statements  

coverage.py v7.13.4, created at 2026-05-15 20:55 +0000

1"""Vision OCR loader that drives llama.cpp's mtmd pipeline with the GGUF's 

2own chat template, so there's no projector-type-to-handler lookup table. 

3""" 

4 

5from __future__ import annotations 

6 

7import logging 

8from pathlib import Path 

9from typing import Any 

10 

11from gguf import GGUFReader 

12 

13from lilbee.core.config import cfg 

14from lilbee.providers.llama_cpp.abort_signal import abort_callback 

15from lilbee.providers.llama_cpp.gguf_meta import ( 

16 find_mmproj_for_model, 

17 read_gguf_metadata, 

18 train_ctx_from_meta, 

19) 

20from lilbee.providers.llama_cpp.log_dispatch import ( 

21 import_llama_cpp, 

22 install_llama_log_handler, 

23 suppress_native_stderr, 

24) 

25 

26log = logging.getLogger(__name__) 

27 

28 

29# Image-placeholder tokens seen in GGUF chat templates. The upstream 

30# mtmd pipeline substitutes image URLs with mtmd's media marker, so 

31# these get rewritten to {{ content.image_url.url }} before rendering. 

32# Case matters: GGUF templates are machine-emitted and stable, so a 

33# case-insensitive replace would risk corrupting unrelated Jinja 

34# identifiers. 

35_GGUF_IMAGE_TOKENS: tuple[str, ...] = ( 

36 "<|image_pad|>", 

37 "<image>", 

38 "<IMAGE>", 

39 "<__media__>", 

40 "<__image__>", 

41) 

42_IMAGE_URL_JINJA = "{{ content.image_url.url }}" 

43 

44_TOKENIZER_CHAT_TEMPLATE_KEY = "tokenizer.chat_template" 

45 

46_VISION_FALLBACK_N_CTX = 4096 

47"""n_ctx for a vision load when the GGUF has no ``context_length`` in metadata. 

48 

49Most vision GGUFs report their training context (typical values: 4096, 8192, 

5032768); this covers the rare missing/unreadable-metadata case so the loader 

51still gets a sensible explicit n_ctx. 

52""" 

53 

54 

def read_chat_template(model_path: Path) -> str | None:
    """Return the Jinja chat template embedded in a GGUF model, or None."""
    try:
        reader = GGUFReader(str(model_path))
        field = reader.get_field(_TOKENIZER_CHAT_TEMPLATE_KEY)
    except (OSError, ValueError, IndexError, KeyError):
        log.debug("Failed to read chat template from %s", model_path, exc_info=True)
        return None
    if field is None:
        return None
    return bytes(field.parts[field.data[0]]).decode("utf-8", errors="replace")

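# Illustrative call of read_chat_template() (the file name and template snippet
# below are hypothetical; the returned string is whatever the model author
# embedded under ``tokenizer.chat_template``, or None when the key is absent):
#
#   read_chat_template(Path("models/minicpm-v-2_6-q4_k_m.gguf"))
#   # -> '{% for message in messages %}...{% endfor %}'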

def adapt_gguf_template_for_mtmd(template: str) -> str:
    """Rewrite known image-placeholder tokens to ``{{ content.image_url.url }}``."""
    for token in _GGUF_IMAGE_TOKENS:
        if token in template:
            template = template.replace(token, _IMAGE_URL_JINJA)
    return template

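# Illustrative rewrite by adapt_gguf_template_for_mtmd() (the template fragment
# is hypothetical, not taken from a specific GGUF):
#
#   adapt_gguf_template_for_mtmd("<|vision_start|><|image_pad|><|vision_end|>{{ text }}")
#   # -> "<|vision_start|>{{ content.image_url.url }}<|vision_end|>{{ text }}"
#
# Only the known placeholder token is touched; the rest of the Jinja template is
# left byte-for-byte intact, which is why the replacement is exact-case only.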

def build_vision_chat_handler(model_path: Path, mmproj_path: Path) -> Any:
    """Return the mtmd chat handler configured with the GGUF's embedded template.

    ``DEFAULT_SYSTEM_MESSAGE`` is set to ``None`` so no stray system turn
    is injected. Falls back to the upstream default template when the
    GGUF has no ``tokenizer.chat_template``.
    """
    # Surface the libvulkan-missing hint before submodule import, since
    # importing llama_cpp.llama_chat_format triggers the parent package's
    # native loader as a side effect.
    import_llama_cpp()
    from llama_cpp.llama_chat_format import Llava15ChatHandler

    # Defined per call so each loaded model binds its own ``CHAT_FORMAT``
    # (set below) to a fresh class; hoisting this to module scope would
    # make the first loaded model's template leak into every subsequent
    # one.
    class _GgufTemplateChatHandler(Llava15ChatHandler):
        DEFAULT_SYSTEM_MESSAGE = None

    handler_cls: type[Llava15ChatHandler] = _GgufTemplateChatHandler

    template = read_chat_template(model_path)
    if template is not None:
        handler_cls.CHAT_FORMAT = adapt_gguf_template_for_mtmd(template)
        log.info(
            "Vision chat handler: using GGUF-embedded template (%d bytes) from %s",
            len(template),
            model_path.name,
        )
    else:
        log.info(
            "Vision chat handler: no GGUF-embedded chat template for %s; using upstream default",
            model_path.name,
        )

    return handler_cls(str(mmproj_path), verbose=False)

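# Sketch of how the returned handler is consumed downstream (the message content
# is a placeholder; handlers derived from ``Llava15ChatHandler`` take OpenAI-style
# multimodal messages, so page images arrive as ``image_url`` content parts and
# land on the ``{{ content.image_url.url }}`` slot rewritten above):
#
#   llama.create_chat_completion(messages=[
#       {"role": "user", "content": [
#           {"type": "image_url", "image_url": {"url": "data:image/png;base64,<page bytes>"}},
#           {"type": "text", "text": "Transcribe this page."},
#       ]},
#   ])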

def load_vision_llama(
    model_path: Path,
    mmproj_path: Path | None = None,
    *,
    abort_callback_override: Any = None,
) -> Any:
    """Load a vision-capable ``Llama`` using the GGUF-templated chat handler.

    ``abort_callback_override`` lets pool workers bind a callback that
    reads the worker's shared ``mp.Value`` abort flag.
    """
    Llama = import_llama_cpp().Llama  # noqa: N806  # heavy native lib; keep import lazy

    install_llama_log_handler()
    if mmproj_path is None:
        mmproj_path = find_mmproj_for_model(model_path)

    chat_handler = build_vision_chat_handler(model_path, mmproj_path)

    import os

    # llama-cpp-python defaults n_threads to ~cpu_count()//2 which leaves the
    # GPU starved on prompt-eval work (image projection through the vision
    # adapter is CPU-bound on the encode side even with all layers on GPU).
    # Ollama runs full-core. Match that here for perf parity.
    n_threads = os.cpu_count() or 4
    kwargs: dict[str, Any] = {
        "model_path": str(model_path),
        "chat_handler": chat_handler,
        "verbose": False,
        "n_gpu_layers": -1,
        "n_ctx": _resolve_vision_n_ctx(model_path),
        "n_threads": n_threads,
        "n_threads_batch": n_threads,
    }
    if cfg.main_gpu is not None:
        kwargs["main_gpu"] = cfg.main_gpu
    if abort_callback_override is not None:
        kwargs["abort_callback"] = abort_callback_override
    else:
        kwargs.setdefault("abort_callback", abort_callback)

    llama = suppress_native_stderr(Llama, **kwargs)
    metadata = getattr(llama, "metadata", {}) or {}
    n_ctx_fn = getattr(llama, "n_ctx", None)
    n_ctx = n_ctx_fn() if callable(n_ctx_fn) else "?"
    log.info(
        "Vision model loaded: model=%s mmproj=%s n_ctx=%s arch=%s",
        model_path.name,
        mmproj_path.name,
        n_ctx,
        metadata.get("general.architecture", "?"),
    )
    return llama

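# Usage sketch for load_vision_llama() (the path and worker callback names are
# hypothetical; a matching mmproj is located via find_mmproj_for_model() when
# none is passed):
#
#   llama = load_vision_llama(Path("models/ocr-vision-q4_k_m.gguf"))
#
#   # pool workers bind their shared abort flag instead of the module-level one:
#   llama = load_vision_llama(model_path, abort_callback_override=worker_abort_cb)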

def _resolve_vision_n_ctx(model_path: Path) -> int:
    """Pick n_ctx for a vision load using the model's training context.

    Reads ``<arch>.context_length`` from the GGUF metadata and uses it
    directly. The chat-tuned ``cfg.num_ctx`` is not propagated: a vision pass
    packs image-token embeddings plus the prompt (often hundreds to a few
    thousand tokens per page), and clamping to a small chat ctx truncates OCR
    output. An explicit value (rather than 0) keeps the OOM-retry path
    working since ``_halve_ctx_for_retry`` cannot bisect from 0.
    """
    try:
        meta = read_gguf_metadata(model_path)
    except Exception:
        log.debug("read_gguf_metadata failed for vision %s", model_path, exc_info=True)
        meta = None
    return train_ctx_from_meta(meta, fallback=_VISION_FALLBACK_N_CTX, model_path=model_path)
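# Behaviour sketch for _resolve_vision_n_ctx() (metadata key and path names are
# illustrative only, following the ``<arch>.context_length`` convention above):
#
#   _resolve_vision_n_ctx(gguf_with_meta)     # e.g. qwen2vl.context_length=32768 -> 32768
#   _resolve_vision_n_ctx(gguf_without_meta)  # missing/unreadable -> _VISION_FALLBACK_N_CTX (4096)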