Coverage for src / lilbee / vision.py: 100%
28 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-05-15 20:55 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-05-15 20:55 +0000
1"""Helpers for PDF rasterisation and vision-model OCR.
3Multi-page vision OCR runs through ``LlamaCppProvider.pdf_ocr`` and the
4persistent worker pool; this module hosts the small helpers (page count,
5rasterisation, prompt + chat-message construction, and the shared
6:class:`PageText` / :class:`PdfOcrChunk` types) that both the worker and
7the parent need.
8"""
10import logging
11from collections.abc import Iterator
12from pathlib import Path
13from typing import NamedTuple
# Module-level logger, named after this module's import path.
log = logging.getLogger(__name__)
class PageText(NamedTuple):
    """Text recovered from one PDF page, paired with that page's number."""

    # Integer identifying which page the text belongs to.
    page: int
    # The text extracted for that page.
    text: str
class PdfOcrChunk(NamedTuple):
    """A single frame streamed by the PDF-OCR worker.

    Carries the page index, the total number of pages, and the text for
    that page.
    """

    # Index of the page this frame describes.
    page: int
    # Total number of pages in the PDF being processed.
    total: int
    # Text produced for this page.
    text: str
# Prompt text asking a vision model to transcribe a page as markdown,
# keeping tables in markdown table syntax.
OCR_PROMPT = (
    "Extract ALL text from this page as clean markdown. "
    "Preserve table structure using markdown table syntax. "
    "Include all rows, columns, headers, and page text exactly as shown."
)

# Resolution (dots per inch) handed to the PDF page iterator.
_RASTER_DPI = 150
def pdf_page_count(path: Path) -> int:
    """Return the number of pages in a PDF without rasterizing.

    Args:
        path: Location of the PDF file to inspect.

    Returns:
        The page count, obtained via ``len()`` on kreuzberg's
        ``PdfPageIterator``.
    """
    from kreuzberg import PdfPageIterator  # lazy: heavy dependency

    it = PdfPageIterator(str(path), dpi=_RASTER_DPI)
    # Enter the iterator as a context manager, as rasterize_pdf does, so it is
    # released deterministically rather than whenever it is garbage-collected.
    with it:
        return len(it)
def rasterize_pdf(path: Path) -> Iterator[tuple[int, bytes]]:
    """Lazily rasterise a PDF, yielding one ``(index, png_bytes)`` pair per page.

    The index is 0-based and the bytes are the PNG rendering of that page.
    The underlying page iterator is held open as a context manager for as
    long as this generator is being consumed.
    """
    from kreuzberg import PdfPageIterator  # deferred: kreuzberg is a heavy dependency

    pdf_location = str(path)
    with PdfPageIterator(pdf_location, dpi=_RASTER_DPI) as rendered_pages:
        yield from rendered_pages
def _png_to_data_url(png_bytes: bytes) -> str:
    """Encode raw PNG bytes as a ``data:image/png;base64,...`` URL.

    The result is the form used for image content in OpenAI-compatible
    chat messages.
    """
    from base64 import b64encode

    encoded_payload = b64encode(png_bytes).decode("ascii")
    return "data:image/png;base64," + encoded_payload
def build_vision_messages(prompt: str, png_bytes: bytes) -> list[dict]:
    """Assemble an OpenAI-compatible message list carrying an image and a prompt.

    The result is a single ``user`` message whose content has two parts, in
    order: the page image as a base64 data URL, then the prompt text. This is
    the multipart content format expected by llama.cpp's mtmd pipeline.
    """
    image_part = {"type": "image_url", "image_url": {"url": _png_to_data_url(png_bytes)}}
    text_part = {"type": "text", "text": prompt}
    user_message = {"role": "user", "content": [image_part, text_part]}
    return [user_message]