Coverage for src / lilbee / data / store / types.py: 100%

116 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-06-28 01:01 +0000

1"""Public dataclasses, TypedDicts, enums, and constants for the store package.""" 

2 

3from __future__ import annotations 

4 

5from dataclasses import dataclass 

6from datetime import timedelta 

7from enum import StrEnum 

8from typing import TypedDict 

9 

10from pydantic import BaseModel, ConfigDict, Field, field_validator 

11 

# Interval between reader re-checks of the manifest for versions written by other
# processes. A zero interval means strong consistency (every read checks); a larger
# one cuts disk I/O on slow media (HDD) in exchange for serving slightly stale data.
READ_CONSISTENCY_INTERVAL = timedelta(seconds=5)

16 

17 

18class ChunkType(StrEnum): 

19 """Values for the ``chunk_type`` column. 

20 

21 Everything ingests as ``RAW`` except wiki pages written by the wiki 

22 producer; callers filter with ``Store.search(chunk_type=...)``. 

23 """ 

24 

25 RAW = "raw" 

26 WIKI = "wiki" 

27 

28 

29class SourceType(StrEnum): 

30 """Values for the ``_sources.source_type`` column. 

31 

32 ``DOCUMENT`` mirrors a file under ``documents/`` and is managed by the 

33 file-driven sync. ``IMPORTED`` is detached: it came from ``lilbee import`` 

34 and has no backing file, so sync must not treat it as a missing document. 

35 """ 

36 

37 DOCUMENT = "document" 

38 IMPORTED = "imported" 

39 

40 

# Integer ``schema_version`` kept for forward compatibility. Only bump it if a meta
# column ever has to be added or renamed without forcing every store to drop_all.
META_SCHEMA_VERSION = 1

# Predicate that is true for every row; used to empty the single-row ``_meta`` table
# before it is re-inserted. Lance's ``Table.delete`` requires a SQL where clause, and
# this one matches all rows without tying the deletion to any column's value domain.
META_DELETE_ALL_PREDICATE = "schema_version IS NOT NULL"

49 

50 

class SearchScope(StrEnum):
    """The slice of the store a user asks to search.

    The member values appear verbatim on CLI flags, MCP params, and HTTP query
    strings. ``RAW`` and ``WIKI`` map 1:1 to the chunks-table values, while
    ``BOTH`` resolves to a ``None`` ``chunk_type`` (no filter).
    """

    # Reuse the chunks-table strings so the two enums cannot drift apart.
    RAW = ChunkType.RAW.value
    WIKI = ChunkType.WIKI.value
    BOTH = "both"

62 

63 

def scope_to_chunk_type(scope: SearchScope | str | None) -> ChunkType | None:
    """Map a user-facing scope onto the ``chunk_type`` argument of ``Store.search``.

    ``None`` and ``"both"`` mean "no filter" and yield ``None``; ``"raw"`` and
    ``"wiki"`` yield the matching ``ChunkType``. Any other string raises
    ``ValueError``.
    """
    if scope is not None:
        # Enum lookup doubles as validation: unknown strings raise ValueError here.
        resolved = SearchScope(scope)
        if resolved is not SearchScope.BOTH:
            return ChunkType(resolved.value)
    return None

76 

77 

class SearchChunk(BaseModel):
    """One search hit returned from LanceDB.

    Which score is populated depends on how the hit was produced:

    * hybrid results set ``relevance_score`` (higher = better);
    * vector-only results set ``distance`` (lower = better);
    * reranked results set ``rerank_score`` (higher = better).
    """

    model_config = ConfigDict(populate_by_name=True)

    source: str
    content_type: str
    chunk_type: ChunkType = ChunkType.RAW
    page_start: int
    page_end: int
    line_start: int
    line_end: int
    chunk: str
    chunk_index: int
    vector: list[float] = Field(repr=False)
    distance: float | None = Field(None, alias="_distance")
    relevance_score: float | None = Field(None, alias="_relevance_score")
    rerank_score: float | None = None

    @field_validator("chunk_type", mode="before")
    @classmethod
    def _coerce_none_chunk_type(cls, v: str | None) -> str:
        """Substitute ``RAW`` for a missing chunk type.

        LanceDB rows from before the chunk_type column was added return None.
        """
        if v is None:
            return ChunkType.RAW
        return v

107 

108 

class SourceRecord(TypedDict):
    """Typed mapping describing one tracked source document."""

    filename: str
    file_hash: str
    ingested_at: str
    chunk_count: int
    source_type: str

117 

118 

class PageTextRecord(TypedDict):
    """A single row of the per-page text dataset; mirrors ``_page_texts``."""

    source: str
    page: int
    text: str
    content_type: str

126 

127 

class CitationRecord(TypedDict):
    """Typed mapping for a citation that ties a wiki chunk to a source location."""

    # Wiki chunk side of the link.
    wiki_source: str
    wiki_chunk_index: int
    citation_key: str
    claim_type: str

    # Source location side of the link.
    source_filename: str
    source_hash: str
    page_start: int
    page_end: int
    line_start: int
    line_end: int
    excerpt: str

    created_at: str

143 

144 

145class MemoryKind(StrEnum): 

146 """Whether a memory is an always-injected preference or a similarity-recalled fact.""" 

147 

148 PREFERENCE = "preference" 

149 FACT = "fact" 

150 

151 

152class MemorySource(StrEnum): 

153 """Provenance of a memory: user-typed, LLM-extracted, or agent-written.""" 

154 

155 MANUAL = "manual" 

156 EXTRACTED = "extracted" 

157 AGENT = "agent" 

158 

159 

# Owner namespaces for memories. The single human user (TUI/CLI/REST) is ``"local"``;
# each agent owns an ``"agent:<id>"`` namespace. The prefix is defined only here so
# that it is never hand-spliced elsewhere.
LOCAL_OWNER = "local"
AGENT_OWNER_PREFIX = "agent:"

164 

165 

def agent_owner(agent_id: str) -> str:
    """Build the owner string of an agent (``"opencode"`` -> ``"agent:opencode"``)."""
    return "{}{}".format(AGENT_OWNER_PREFIX, agent_id)

169 

170 

def is_agent_owner(owner: str) -> bool:
    """Report whether *owner* names an agent namespace instead of the local human."""
    agent_prefix = AGENT_OWNER_PREFIX
    return owner.startswith(agent_prefix)

174 

175 

class MemoryRow(BaseModel):
    """One long-term memory entry of the per-library ``_memories`` table.

    Construct it from a LanceDB row with ``MemoryRow(**row)``, which coerces the
    ``kind`` and ``source`` strings to their enums, and serialize it back with
    ``model_dump(mode="json")``. Keys the model does not declare, such as a
    search ``_distance``, are ignored on construction.
    """

    # extra="ignore" is what lets raw search rows be splatted into the constructor.
    model_config = ConfigDict(extra="ignore")

    id: str
    owner: str
    shared: bool
    kind: MemoryKind
    source: MemorySource
    text: str
    vector: list[float] = Field(repr=False)
    created_at: str
    updated_at: str

195 

196 

class StoreMeta(TypedDict):
    """Single-row store metadata naming the embedding model the store was built with.

    Compatibility is checked before every read and write. If ``cfg.embedding_model``
    or ``cfg.embedding_dim`` drifts from the persisted row, the store refuses to
    serve until the chunks are rewritten under the new model, either by
    ``lilbee rebuild`` (CLI) or ``POST /api/sync {"force_rebuild": true}`` (HTTP).

    ``updated_at`` holds an ISO 8601 UTC timestamp produced by
    ``datetime.isoformat()``; it stays a ``str`` to match the LanceDB ``utf8``
    schema column.
    """

    embedding_model: str
    embedding_dim: int
    schema_version: int
    updated_at: str

213 

214 

class EmbeddingModelMismatchError(RuntimeError):
    """Raised when stored vectors were built with a different embedder than ``cfg``.

    Carries the persisted and configured refs and dims so each surface renders its
    own recovery affordance (TUI prompt, CLI command, REST body) from the facts.

    Instances survive ``copy`` and ``pickle`` round-trips: ``__reduce__`` rebuilds
    the error from its four fields, because the default exception reduction would
    re-call the keyword-only constructor with the message as a positional argument
    and fail with ``TypeError``.

    Attributes:
        persisted_model: Embedding model ref the index was built with.
        persisted_dim: Vector dimension the index was built with.
        current_model: Embedding model ref lilbee is configured to use.
        current_dim: Vector dimension of the configured model.
    """

    def __init__(
        self,
        *,
        persisted_model: str,
        persisted_dim: int,
        current_model: str,
        current_dim: int,
    ) -> None:
        # The attributes must exist before the message is built from them.
        self.persisted_model = persisted_model
        self.persisted_dim = persisted_dim
        self.current_model = current_model
        self.current_dim = current_dim
        super().__init__(self._build_message())

    @classmethod
    def _from_fields(
        cls,
        persisted_model: str,
        persisted_dim: int,
        current_model: str,
        current_dim: int,
    ) -> "EmbeddingModelMismatchError":
        """Positional factory used by ``__reduce__`` to rebuild an instance."""
        return cls(
            persisted_model=persisted_model,
            persisted_dim=persisted_dim,
            current_model=current_model,
            current_dim=current_dim,
        )

    def __reduce__(self) -> tuple[object, ...]:
        """Tell ``pickle`` and ``copy`` how to rebuild this error.

        Returns the positional factory, the four fields it needs, and the instance
        ``__dict__`` as state so extra attributes (for example ``__notes__``) are
        carried over as well.
        """
        return (
            type(self)._from_fields,
            (
                self.persisted_model,
                self.persisted_dim,
                self.current_model,
                self.current_dim,
            ),
            vars(self),
        )

    @property
    def dims_match(self) -> bool:
        """True when the index is adoptable by switching embedder alone (same dim)."""
        return self.persisted_dim == self.current_dim

    def _build_message(self) -> str:
        """Compose the human-readable message for the same-dim or different-dim case."""
        if self.dims_match:
            # Same dimension: either switch the configured model or rebuild.
            return (
                f"This index was built with embedding model '{self.persisted_model}', "
                f"but lilbee is configured to use '{self.current_model}'. Configure lilbee "
                f"to use '{self.persisted_model}' to search this index, or rebuild it under "
                f"'{self.current_model}'."
            )
        # Different dimension: the message offers a rebuild only.
        return (
            f"This index was built with embedding model '{self.persisted_model}' "
            f"(dim {self.persisted_dim}), which differs from the current "
            f"'{self.current_model}' (dim {self.current_dim}). The dimensions differ, "
            f"so rebuild the index under '{self.current_model}' to use it."
        )

255 

256 

@dataclass
class RemoveResult:
    """Outcome of a remove_documents operation, as ``removed`` and ``not_found`` lists."""

    removed: list[str]
    not_found: list[str]