Coverage for src/lilbee/providers/mtmd_backend.py: 100%

69 statements  

coverage.py v7.13.4, created at 2026-05-15 20:55 +0000

1"""Vision OCR loader that drives llama.cpp's mtmd pipeline with the GGUF's 

2own chat template, so there's no projector-type-to-handler lookup table. 

3""" 

4 

5from __future__ import annotations 

6 

7import logging 

8from pathlib import Path 

9from typing import Any 

10 

11from gguf import GGUFReader 

12 

13from lilbee.core.config import cfg 

14from lilbee.providers.llama_cpp.abort_signal import abort_callback 

15from lilbee.providers.llama_cpp.gguf_meta import ( 

16 find_mmproj_for_model, 

17 read_gguf_metadata, 

18 train_ctx_from_meta, 

19) 

20from lilbee.providers.llama_cpp.log_dispatch import ( 

21 import_llama_cpp, 

22 install_llama_log_handler, 

23 suppress_native_stderr, 

24) 

25 

26log = logging.getLogger(__name__) 

27 

28 

29# Image-placeholder tokens seen in GGUF chat templates. The upstream 

30# mtmd pipeline substitutes image URLs with mtmd's media marker, so 

31# these get rewritten to {{ content.image_url.url }} before rendering. 

32# Case matters: GGUF templates are machine-emitted and stable, so a 

33# case-insensitive replace would risk corrupting unrelated Jinja 

34# identifiers. 

35_GGUF_IMAGE_TOKENS: tuple[str, ...] = ( 

36 "<|image_pad|>", 

37 "<image>", 

38 "<IMAGE>", 

39 "<__media__>", 

40 "<__image__>", 

41) 

42_IMAGE_URL_JINJA = "{{ content.image_url.url }}" 

43 

44_TOKENIZER_CHAT_TEMPLATE_KEY = "tokenizer.chat_template" 

45 

46_VISION_FALLBACK_N_CTX = 4096 

47"""n_ctx for a vision load when the GGUF has no ``context_length`` in metadata. 

48 

49Most vision GGUFs report their training context (typical values: 4096, 8192, 

5032768); this covers the rare missing/unreadable-metadata case so the loader 

51still gets a sensible explicit n_ctx. 

52""" 

53 

54 

def read_chat_template(model_path: Path) -> str | None:
    """Return the Jinja chat template embedded in a GGUF model, or None."""
    try:
        reader = GGUFReader(str(model_path))
        field = reader.get_field(_TOKENIZER_CHAT_TEMPLATE_KEY)
    except (OSError, ValueError, IndexError, KeyError):
        log.debug("Failed to read chat template from %s", model_path, exc_info=True)
        return None
    if field is None:
        return None
    return bytes(field.parts[field.data[0]]).decode("utf-8", errors="replace")

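# Illustrative call of read_chat_template() (the file name and template snippet
# below are hypothetical; the returned string is whatever the model author
# embedded under ``tokenizer.chat_template``, or None when the key is absent):
#
#   read_chat_template(Path("models/minicpm-v-2_6-q4_k_m.gguf"))
#   # -> '{% for message in messages %}...{% endfor %}'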

def adapt_gguf_template_for_mtmd(template: str) -> str:
    """Rewrite known image-placeholder tokens to ``{{ content.image_url.url }}``."""
    for token in _GGUF_IMAGE_TOKENS:
        if token in template:
            template = template.replace(token, _IMAGE_URL_JINJA)
    return template

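# Illustrative rewrite by adapt_gguf_template_for_mtmd() (the template fragment
# is hypothetical, not taken from a specific GGUF):
#
#   adapt_gguf_template_for_mtmd("<|vision_start|><|image_pad|><|vision_end|>{{ text }}")
#   # -> "<|vision_start|>{{ content.image_url.url }}<|vision_end|>{{ text }}"
#
# Only the known placeholder token is touched; the rest of the Jinja template is
# left byte-for-byte intact, which is why the replacement is exact-case only.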

def build_vision_chat_handler(model_path: Path, mmproj_path: Path) -> Any:
    """Return the mtmd chat handler configured with the GGUF's embedded template.

    ``DEFAULT_SYSTEM_MESSAGE`` is set to ``None`` so no stray system turn
    is injected. Falls back to the upstream default template when the
    GGUF has no ``tokenizer.chat_template``.
    """
    # Surface the libvulkan-missing hint before submodule import, since
    # importing llama_cpp.llama_chat_format triggers the parent package's
    # native loader as a side effect.
    import_llama_cpp()
    from llama_cpp.llama_chat_format import Llava15ChatHandler

    # Defined per call so each loaded model binds its own ``CHAT_FORMAT``
    # (set below) to a fresh class; hoisting this to module scope would
    # make the first loaded model's template leak into every subsequent
    # one.
    class _GgufTemplateChatHandler(Llava15ChatHandler):
        DEFAULT_SYSTEM_MESSAGE = None

    handler_cls: type[Llava15ChatHandler] = _GgufTemplateChatHandler

    template = read_chat_template(model_path)
    if template is not None:
        handler_cls.CHAT_FORMAT = adapt_gguf_template_for_mtmd(template)
        log.info(
            "Vision chat handler: using GGUF-embedded template (%d bytes) from %s",
            len(template),
            model_path.name,
        )
    else:
        log.info(
            "Vision chat handler: no GGUF-embedded chat template for %s; using upstream default",
            model_path.name,
        )

    return handler_cls(str(mmproj_path), verbose=False)

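# Sketch of how the returned handler is consumed downstream (the message content
# is a placeholder; handlers derived from ``Llava15ChatHandler`` take OpenAI-style
# multimodal messages, so page images arrive as ``image_url`` content parts and
# land on the ``{{ content.image_url.url }}`` slot rewritten above):
#
#   llama.create_chat_completion(messages=[
#       {"role": "user", "content": [
#           {"type": "image_url", "image_url": {"url": "data:image/png;base64,<page bytes>"}},
#           {"type": "text", "text": "Transcribe this page."},
#       ]},
#   ])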

def load_vision_llama(
    model_path: Path,
    mmproj_path: Path | None = None,
    *,
    abort_callback_override: Any = None,
) -> Any:
    """Load a vision-capable ``Llama`` using the GGUF-templated chat handler.

    ``abort_callback_override`` lets pool workers bind a callback that
    reads the worker's shared ``mp.Value`` abort flag.
    """
    Llama = import_llama_cpp().Llama  # noqa: N806  # heavy native lib; keep import lazy

    install_llama_log_handler()
    if mmproj_path is None:
        mmproj_path = find_mmproj_for_model(model_path)

    chat_handler = build_vision_chat_handler(model_path, mmproj_path)

    import os

    # llama-cpp-python defaults n_threads to ~cpu_count()//2 which leaves the
    # GPU starved on prompt-eval work (image projection through the vision
    # adapter is CPU-bound on the encode side even with all layers on GPU).
    # Ollama runs full-core. Match that here for perf parity.
    n_threads = os.cpu_count() or 4
    kwargs: dict[str, Any] = {
        "model_path": str(model_path),
        "chat_handler": chat_handler,
        "verbose": False,
        "n_gpu_layers": -1,
        "n_ctx": _resolve_vision_n_ctx(model_path),
        "n_threads": n_threads,
        "n_threads_batch": n_threads,
    }
    if cfg.main_gpu is not None:
        kwargs["main_gpu"] = cfg.main_gpu
    if abort_callback_override is not None:
        kwargs["abort_callback"] = abort_callback_override
    else:
        kwargs.setdefault("abort_callback", abort_callback)

    llama = suppress_native_stderr(Llama, **kwargs)
    metadata = getattr(llama, "metadata", {}) or {}
    n_ctx_fn = getattr(llama, "n_ctx", None)
    n_ctx = n_ctx_fn() if callable(n_ctx_fn) else "?"
    log.info(
        "Vision model loaded: model=%s mmproj=%s n_ctx=%s arch=%s",
        model_path.name,
        mmproj_path.name,
        n_ctx,
        metadata.get("general.architecture", "?"),
    )
    return llama

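# Usage sketch for load_vision_llama() (the path and worker callback names are
# hypothetical; a matching mmproj is located via find_mmproj_for_model() when
# none is passed):
#
#   llama = load_vision_llama(Path("models/ocr-vision-q4_k_m.gguf"))
#
#   # pool workers bind their shared abort flag instead of the module-level one:
#   llama = load_vision_llama(model_path, abort_callback_override=worker_abort_cb)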

def _resolve_vision_n_ctx(model_path: Path) -> int:
    """Pick n_ctx for a vision load using the model's training context.

    Reads ``<arch>.context_length`` from the GGUF metadata and uses it
    directly. The chat-tuned ``cfg.num_ctx`` is not propagated: a vision pass
    packs image-token embeddings plus the prompt (often hundreds to a few
    thousand tokens per page), and clamping to a small chat ctx truncates OCR
    output. An explicit value (rather than 0) keeps the OOM-retry path
    working since ``_halve_ctx_for_retry`` cannot bisect from 0.
    """
    try:
        meta = read_gguf_metadata(model_path)
    except Exception:
        log.debug("read_gguf_metadata failed for vision %s", model_path, exc_info=True)
        meta = None
    return train_ctx_from_meta(meta, fallback=_VISION_FALLBACK_N_CTX, model_path=model_path)
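# Behaviour sketch for _resolve_vision_n_ctx() (metadata key and path names are
# illustrative only, following the ``<arch>.context_length`` convention above):
#
#   _resolve_vision_n_ctx(gguf_with_meta)     # e.g. qwen2vl.context_length=32768 -> 32768
#   _resolve_vision_n_ctx(gguf_without_meta)  # missing/unreadable -> _VISION_FALLBACK_N_CTX (4096)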