Coverage for src/lilbee/data/store/types.py: 100%

69 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-05-15 20:55 +0000

1"""Public dataclasses, TypedDicts, enums, and constants for the store package.""" 

2 

3from __future__ import annotations 

4 

5from dataclasses import dataclass 

6from datetime import timedelta 

7from enum import StrEnum 

8from typing import TypedDict 

9 

10from pydantic import BaseModel, ConfigDict, Field, field_validator 

11 

# Interval at which readers re-check the manifest for versions written by other
# processes. A zero interval means strong consistency (every read checks); larger
# values cut disk I/O on slow media (HDD) but may serve slightly stale data.
READ_CONSISTENCY_INTERVAL = timedelta(seconds=5)

# Allowed values of the ``chunk_type`` column. Only wiki pages written by the wiki
# producer are stored as wiki; everything else goes in as raw. Callers filter with
# ``Store.search(chunk_type=...)``.
CHUNK_TYPE_RAW = "raw"
CHUNK_TYPE_WIKI = "wiki"

# ``schema_version`` is kept as an integer for forward compatibility. Bump it only
# when a meta column must be added or renamed without forcing every store to
# drop_all.
META_SCHEMA_VERSION = 1

# Predicate that is true for every row; used to empty the single-row ``_meta``
# table before re-inserting. Lance's ``Table.delete`` requires a SQL where clause,
# and this one matches all rows without tying the deletion to any particular
# column's value domain.
META_DELETE_ALL_PREDICATE = "schema_version IS NOT NULL"

30 

31 

class SearchScope(StrEnum):
    """User-facing selector for which chunks a search should cover.

    Member values are passed through unchanged on CLI flags, MCP params, and
    HTTP query strings. ``RAW`` and ``WIKI`` map 1:1 to the chunks-table
    values, while ``BOTH`` resolves to a ``None`` ``chunk_type`` (no filter).
    """

    # Reuse the chunks-table constants so the enum and the column cannot drift.
    RAW = CHUNK_TYPE_RAW
    WIKI = CHUNK_TYPE_WIKI
    # No chunks-table counterpart: selecting it means "do not filter".
    BOTH = "both"

43 

44 

def scope_to_chunk_type(scope: SearchScope | str | None) -> str | None:
    """Convert a user-facing scope into the ``chunk_type`` arg of ``Store.search``.

    Args:
        scope: A ``SearchScope`` member, its string value, or ``None``.

    Returns:
        ``None`` (no filter) for ``None`` or ``"both"``; otherwise the matching
        chunks-table value for ``"raw"`` / ``"wiki"``.

    Raises:
        ValueError: If ``scope`` is a string that is not a ``SearchScope`` value.
    """
    if scope is None:
        return None
    # The enum constructor both validates plain strings and accepts members.
    resolved = SearchScope(scope)
    return None if resolved is SearchScope.BOTH else resolved.value

57 

58 

class SearchChunk(BaseModel):
    """One search hit returned from LanceDB.

    Hybrid results have ``relevance_score`` set (higher = better).
    Vector-only results have ``distance`` set (lower = better).
    """

    # Accept both the field names and the underscore-prefixed aliases below.
    model_config = ConfigDict(populate_by_name=True)

    source: str
    content_type: str
    chunk_type: str = CHUNK_TYPE_RAW

    page_start: int
    page_end: int
    line_start: int
    line_end: int

    chunk: str
    chunk_index: int
    # Excluded from repr() output.
    vector: list[float] = Field(repr=False)

    # Score fields are populated through their underscore-prefixed aliases.
    distance: float | None = Field(default=None, alias="_distance")
    relevance_score: float | None = Field(default=None, alias="_relevance_score")

    @field_validator("chunk_type", mode="before")
    @classmethod
    def _coerce_none_chunk_type(cls, v: str | None) -> str:
        """Map ``None`` to the raw chunk type.

        LanceDB rows from before the chunk_type column was added return None.
        """
        if v is None:
            return CHUNK_TYPE_RAW
        return v

86 

87 

class SourceRecord(TypedDict):
    """Record describing one tracked source document."""

    # Identity of the document.
    filename: str
    file_hash: str

    # Ingestion bookkeeping; ``ingested_at`` is stored as a string.
    ingested_at: str
    chunk_count: int
    source_type: str

96 

97 

class CitationRecord(TypedDict):
    """Citation that ties a wiki chunk to a specific location in a source."""

    # Wiki side of the link.
    wiki_source: str
    wiki_chunk_index: int
    citation_key: str
    claim_type: str

    # Source side of the link: which document, and where inside it.
    source_filename: str
    source_hash: str
    page_start: int
    page_end: int
    line_start: int
    line_end: int
    excerpt: str

    # Creation time, stored as a string.
    created_at: str

113 

114 

class StoreMeta(TypedDict):
    """Single-row store metadata naming the embedding model the store was built with.

    Compatibility is checked before every read and write. If ``cfg.embedding_model``
    or ``cfg.embedding_dim`` drifts from the persisted row, the store refuses to
    serve until the chunks are rewritten under the new model, either via
    ``lilbee rebuild`` (CLI) or ``POST /api/sync {"force_rebuild": true}`` (HTTP).

    ``updated_at`` holds an ISO 8601 UTC timestamp produced by
    ``datetime.isoformat()``; it stays a ``str`` to match the LanceDB ``utf8``
    schema column.
    """

    # Compared against ``cfg`` for the compatibility check described above.
    embedding_model: str
    embedding_dim: int

    schema_version: int
    updated_at: str

131 

132 

class EmbeddingModelMismatchError(RuntimeError):
    """Signals that stored vectors come from a different embedding model than ``cfg``.

    The message is user-facing: it names both the persisted and the configured
    model and points at the two recovery paths, ``lilbee rebuild`` and
    ``POST /api/sync`` with ``force_rebuild=true``.
    """

140 

141 

@dataclass
class RemoveResult:
    """Outcome of a remove_documents operation."""

    # presumably the names that were removed — confirm against remove_documents
    removed: list[str]
    # presumably the requested names that had no match — confirm against remove_documents
    not_found: list[str]