# Coverage for src/lilbee/data/ingest/types.py: 100% (59 statements)
# Report generated by coverage.py v7.13.4, created at 2026-06-28 01:01 +0000
1"""Shared ingest types and constants."""
3from __future__ import annotations
5from dataclasses import dataclass
6from enum import StrEnum
7from pathlib import Path
8from typing import NamedTuple, TypedDict
10from pydantic import BaseModel
12from lilbee.data.store import ChunkType
class FileToProcess(NamedTuple):
    """A file queued for ingestion with its metadata.

    Being a ``NamedTuple``, instances are immutable and can be unpacked
    positionally in field order: ``name, path, content_type, file_hash,
    needs_cleanup``.
    """

    name: str
    path: Path
    # NOTE(review): presumably one of the DOCUMENT_EXTENSION_MAP values
    # ("text", "pdf", ...) — confirm against the code that builds these.
    content_type: str
    file_hash: str
    # NOTE(review): presumably marks a temporary file that must be removed
    # after ingestion — confirm against the caller.
    needs_cleanup: bool
# Minimum total chars for extracted text to be considered meaningful.
# 50 chars ≈ 12 words: if a PDF yields less, it's almost certainly a scanned
# document with no embedded text layer. Text PDFs with even just a title page
# easily exceed this threshold; blank/scan-only PDFs yield 0 chars.
MIN_MEANINGFUL_CHARS = 50

# Content-type label for PDF files; DOCUMENT_EXTENSION_MAP maps ".pdf" to it.
PDF_CONTENT_TYPE = "pdf"
# NOTE(review): presumably the output-format and OCR-backend names handed to
# the extraction library — confirm against the call sites.
MARKDOWN_OUTPUT = "markdown"
TESSERACT_BACKEND = "tesseract"
36class ExtractMode(StrEnum):
37 """Extraction topology: pagination / OCR / output format."""
39 MARKDOWN = "markdown"
40 PAGINATED = "paginated"
41 PAGINATED_OCR = "paginated_ocr"
class ChunkRecord(TypedDict):
    """A single store-ready chunk record matching store.CHUNKS_SCHEMA.

    This is a ``TypedDict``: at runtime a record is an ordinary ``dict`` and
    nothing is validated; the declared keys exist for static type checkers.
    All keys are required.
    """

    source: str
    content_type: str
    chunk_type: ChunkType
    # NOTE(review): page_* and line_* presumably delimit where the chunk sits
    # in the source document; whether the ends are inclusive is not visible
    # here — confirm against the chunker.
    page_start: int
    page_end: int
    line_start: int
    line_end: int
    chunk: str
    chunk_index: int
    vector: list[float]
class SyncResult(BaseModel):
    """Summary of a sync operation.

    The list fields carry one string per affected item; ``unchanged`` and
    ``truncated`` are plain counters.
    """

    # Plain ``[]`` defaults are safe on a pydantic model: pydantic copies a
    # mutable default for every new instance instead of sharing one list.
    added: list[str] = []
    updated: list[str] = []
    removed: list[str] = []
    unchanged: int = 0
    failed: list[str] = []
    skipped: list[str] = []
    # Chunks whose text exceeded the embedder's char budget and were truncated
    # before embedding. Non-zero means some tail content did not reach the index.
    truncated: int = 0

    def __str__(self) -> str:
        """Return a multi-line, human-readable report.

        One ``Label: count`` line per category comes first, followed by one
        line per skipped entry (tagged ``[yellow]``) and one per failed entry
        (tagged ``[red]``). The tags are only emitted as text here; they are
        interpreted as markup when the string is rendered via ``__rich__``.
        """
        lines = [
            f"Added: {len(self.added)}",
            f"Updated: {len(self.updated)}",
            f"Removed: {len(self.removed)}",
            f"Unchanged: {self.unchanged}",
            f"Skipped: {len(self.skipped)}",
            f"Failed: {len(self.failed)}",
            f"Truncated: {self.truncated}",
        ]
        # Skipped entries are listed before failed ones, after all the counts.
        lines.extend(f" [yellow]{entry}[/yellow]" for entry in self.skipped)
        lines.extend(f" [red]{entry}[/red]" for entry in self.failed)
        return "\n".join(lines)

    def __repr__(self) -> str:
        """Return a compact one-line form showing counts only, never the entries."""
        return (
            f"SyncResult(added={len(self.added)}, updated={len(self.updated)}, "
            f"removed={len(self.removed)}, unchanged={self.unchanged}, "
            f"skipped={len(self.skipped)}, failed={len(self.failed)}, "
            f"truncated={self.truncated})"
        )

    def __rich__(self) -> str:
        """Return the ``__str__`` report for Rich to render as console markup."""
        return str(self)
100@dataclass
101class _IngestResult:
102 """Outcome of a single file ingestion attempt."""
104 name: str
105 path: Path
106 chunk_count: int
107 error: Exception | None
108 file_hash: str = ""
# Extension → content_type string for document formats handled by kreuzberg.
# Keys are lowercase suffixes including the leading dot. The Office formats
# (.docx/.xlsx/.pptx) use their own extension, minus the dot, as content type.
DOCUMENT_EXTENSION_MAP: dict[str, str] = {
    **{ext: "text" for ext in (".md", ".txt", ".html", ".rst", ".yaml", ".yml")},
    ".pdf": PDF_CONTENT_TYPE,
    **{ext: ext.lstrip(".") for ext in (".docx", ".xlsx", ".pptx")},
    ".epub": "epub",
    **{ext: "image" for ext in (".png", ".jpg", ".jpeg", ".tiff", ".tif", ".bmp", ".webp")},
    **{ext: "data" for ext in (".csv", ".tsv")},
    ".xml": "xml",
    **{ext: "json" for ext in (".json", ".jsonl")},
}