Coverage for src / lilbee / vision.py: 100%

28 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-05-15 20:55 +0000

1"""Helpers for PDF rasterisation and vision-model OCR. 

2 

3Multi-page vision OCR runs through ``LlamaCppProvider.pdf_ocr`` and the 

4persistent worker pool; this module hosts the small helpers (page count, 

5rasterisation, prompt + chat-message construction, and the shared 

6:class:`PageText` / :class:`PdfOcrChunk` types) that both the worker and 

7the parent need. 

8""" 

9 

10import logging 

11from collections.abc import Iterator 

12from pathlib import Path 

13from typing import NamedTuple 

14 

15log = logging.getLogger(__name__) 

16 

17 

class PageText(NamedTuple):
    """Extracted text for a single PDF page.

    An immutable ``(page, text)`` record; fields are readable by name or by
    position.
    """

    # Page number the text belongs to.
    # NOTE(review): whether this is 0- or 1-based is not fixed in this module -- confirm with producers.
    page: int
    # Text extracted for that page.
    text: str

23 

24 

class PdfOcrChunk(NamedTuple):
    """One streaming PDF-OCR worker frame: page index, total pages, page text.

    An immutable ``(page, total, text)`` record; fields are readable by name
    or by position.
    """

    # Index of the page this frame carries.
    # NOTE(review): presumably matches the index yielded by rasterize_pdf -- confirm in the worker.
    page: int
    # Total number of pages in the document being processed.
    total: int
    # Text produced for this page.
    text: str

31 

32 

# Instruction text asking a vision model to transcribe one page as markdown.
# The three adjacent literals are concatenated into a single string; the
# trailing space on the first two keeps the sentences separated.
# NOTE(review): presumably passed as the ``prompt`` argument of
# build_vision_messages -- confirm at the call sites.
OCR_PROMPT = (
    "Extract ALL text from this page as clean markdown. "
    "Preserve table structure using markdown table syntax. "
    "Include all rows, columns, headers, and page text exactly as shown."
)

# Value passed as ``dpi=`` to every PdfPageIterator created in this module.
_RASTER_DPI = 150

40 

41 

def pdf_page_count(path: Path) -> int:
    """Return the number of pages in a PDF without rasterizing.

    Only the iterator's length is queried; no page is pulled from it.

    Args:
        path: Location of the PDF file.

    Returns:
        The page count reported by ``len()`` on the page iterator.
    """
    from kreuzberg import PdfPageIterator  # lazy: heavy dependency

    pages = PdfPageIterator(str(path), dpi=_RASTER_DPI)
    # Enter the iterator as a context manager, exactly as rasterize_pdf does,
    # so whatever it holds open is released when we return instead of waiting
    # for garbage collection. ``len()`` is taken on the object we constructed,
    # so this does not depend on what ``__enter__`` returns.
    with pages:
        return len(pages)

48 

49 

def rasterize_pdf(path: Path) -> Iterator[tuple[int, bytes]]:
    """Lazily produce one ``(0-based index, PNG bytes)`` pair per PDF page.

    The page iterator stays open for as long as the generator is being
    consumed and is closed when the ``with`` block exits.
    """
    from kreuzberg import PdfPageIterator  # lazy: heavy dependency

    with PdfPageIterator(str(path), dpi=_RASTER_DPI) as page_stream:
        # Hand each item through untouched, one page at a time.
        for rendered_page in page_stream:
            yield rendered_page

56 

57 

58def _png_to_data_url(png_bytes: bytes) -> str: 

59 """Convert raw PNG bytes to a base64 data URL for OpenAI-compatible messages.""" 

60 import base64 

61 

62 b64 = base64.b64encode(png_bytes).decode("ascii") 

63 return f"data:image/png;base64,{b64}" 

64 

65 

def build_vision_messages(prompt: str, png_bytes: bytes) -> list[dict]:
    """Assemble an OpenAI-compatible chat payload pairing one image with a prompt.

    The result is a single ``user`` message whose content is a two-part list
    (image first, then text) -- the multipart content format expected by
    llama.cpp's mtmd pipeline.
    """
    image_part = {
        "type": "image_url",
        "image_url": {"url": _png_to_data_url(png_bytes)},
    }
    text_part = {"type": "text", "text": prompt}

    # Part order is preserved: the image precedes the instruction text.
    user_turn = {"role": "user", "content": [image_part, text_part]}
    return [user_turn]