Coverage for src / lilbee / data / ingest / types.py: 100%

59 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-06-28 01:01 +0000

1"""Shared ingest types and constants.""" 

2 

3from __future__ import annotations 

4 

5from dataclasses import dataclass 

6from enum import StrEnum 

7from pathlib import Path 

8from typing import NamedTuple, TypedDict 

9 

10from pydantic import BaseModel 

11 

12from lilbee.data.store import ChunkType 

13 

14 

class FileToProcess(NamedTuple):
    """Immutable descriptor of one file waiting in the ingestion queue.

    Being a ``NamedTuple``, instances unpack positionally in declaration
    order: ``name, path, content_type, file_hash, needs_cleanup``.
    """

    # Name used to identify the file.
    name: str
    # Filesystem location of the file.
    path: Path
    # Content-type label for the file (a plain string).
    content_type: str
    # Hash associated with the file's contents.
    file_hash: str
    # Flag indicating the file needs cleanup (presumably after processing;
    # the consumer of this flag is not visible in this module).
    needs_cleanup: bool

23 

24 

# Threshold (in total characters) below which extracted text is treated as
# not meaningful. 50 chars is roughly 12 words: a PDF that yields less is
# almost certainly a scan without an embedded text layer. A text PDF with
# nothing but a title page clears this easily, while blank or scan-only
# PDFs yield 0 chars.
MIN_MEANINGFUL_CHARS = 50

# String identifiers shared across the ingest code.
PDF_CONTENT_TYPE = "pdf"
MARKDOWN_OUTPUT = "markdown"
TESSERACT_BACKEND = "tesseract"

34 

35 

36class ExtractMode(StrEnum): 

37 """Extraction topology: pagination / OCR / output format.""" 

38 

39 MARKDOWN = "markdown" 

40 PAGINATED = "paginated" 

41 PAGINATED_OCR = "paginated_ocr" 

42 

43 

class ChunkRecord(TypedDict):
    """One chunk row in store-ready form, matching store.CHUNKS_SCHEMA.

    All keys are required (the default ``total=True`` TypedDict behaviour).
    """

    # Where the chunk came from, and how that source is classified.
    source: str
    content_type: str
    chunk_type: ChunkType

    # Page and line span of the chunk within its source.
    page_start: int
    page_end: int
    line_start: int
    line_end: int

    # Chunk text, its index, and its embedding vector.
    chunk: str
    chunk_index: int
    vector: list[float]

57 

58 

class SyncResult(BaseModel):
    """Tallies and per-file name lists describing the outcome of one sync.

    ``added``, ``updated``, ``removed``, ``failed`` and ``skipped`` hold file
    names; ``unchanged`` and ``truncated`` are plain counters.
    """

    # The list defaults are safe on a pydantic model: pydantic copies
    # mutable defaults for each new instance instead of sharing them.
    added: list[str] = []
    updated: list[str] = []
    removed: list[str] = []
    unchanged: int = 0
    failed: list[str] = []
    skipped: list[str] = []
    # Count of chunks whose text was longer than the embedder's char budget
    # and got truncated before embedding. A non-zero value means some tail
    # content never reached the index.
    truncated: int = 0

    def __str__(self) -> str:
        """Return a multi-line summary, then the skipped and failed names.

        Skipped names are wrapped in ``[yellow]`` tags and failed names in
        ``[red]`` tags (Rich-style markup).
        """
        report = [
            f"Added: {len(self.added)}",
            f"Updated: {len(self.updated)}",
            f"Removed: {len(self.removed)}",
            f"Unchanged: {self.unchanged}",
            f"Skipped: {len(self.skipped)}",
            f"Failed: {len(self.failed)}",
            f"Truncated: {self.truncated}",
        ]
        report.extend(f" [yellow]{name}[/yellow]" for name in self.skipped)
        report.extend(f" [red]{name}[/red]" for name in self.failed)
        return "\n".join(report)

    def __repr__(self) -> str:
        """Return a compact one-line form showing counts, not file names."""
        # Dict insertion order fixes the order the counts appear in.
        counts = {
            "added": len(self.added),
            "updated": len(self.updated),
            "removed": len(self.removed),
            "unchanged": self.unchanged,
            "skipped": len(self.skipped),
            "failed": len(self.failed),
            "truncated": self.truncated,
        }
        rendered = ", ".join(f"{key}={value}" for key, value in counts.items())
        return f"SyncResult({rendered})"

    def __rich__(self) -> str:
        """Return the same text as ``__str__`` for Rich's ``__rich__`` hook."""
        return self.__str__()

98 

99 

@dataclass
class _IngestResult:
    """Record of what happened when ingesting a single file."""

    # Name and filesystem path of the file.
    name: str
    path: Path
    # Chunk count recorded for this file.
    chunk_count: int
    # Exception recorded for the attempt, or None.
    error: Exception | None
    # Hash of the file; defaults to the empty string when not supplied.
    file_hash: str = ""

109 

110 

# Lookup table: file extension (with leading dot) → content_type string, for
# the document formats handled by kreuzberg. Groups are spliced in a fixed
# order so the resulting dict's key order is stable.
DOCUMENT_EXTENSION_MAP: dict[str, str] = {
    **dict.fromkeys((".md", ".txt", ".html", ".rst", ".yaml", ".yml"), "text"),
    ".pdf": PDF_CONTENT_TYPE,
    # Office formats use their own extension, minus the dot, as content type.
    **{suffix: suffix[1:] for suffix in (".docx", ".xlsx", ".pptx")},
    ".epub": "epub",
    **dict.fromkeys(
        (".png", ".jpg", ".jpeg", ".tiff", ".tif", ".bmp", ".webp"),
        "image",
    ),
    **dict.fromkeys((".csv", ".tsv"), "data"),
    ".xml": "xml",
    **dict.fromkeys((".json", ".jsonl"), "json"),
}