Coverage for src / lilbee / data / store / types.py: 100%
116 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-06-28 01:01 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-06-28 01:01 +0000
1"""Public dataclasses, TypedDicts, enums, and constants for the store package."""
3from __future__ import annotations
5from dataclasses import dataclass
6from datetime import timedelta
7from enum import StrEnum
8from typing import TypedDict
10from pydantic import BaseModel, ConfigDict, Field, field_validator
# Interval between manifest re-checks that let a reader notice new versions
# written by other processes. A zero interval means strong consistency (every
# read checks the manifest); larger intervals reduce disk I/O on slow media
# such as HDDs, at the cost of serving slightly stale data.
READ_CONSISTENCY_INTERVAL = timedelta(seconds=5)
18class ChunkType(StrEnum):
19 """Values for the ``chunk_type`` column.
21 Everything ingests as ``RAW`` except wiki pages written by the wiki
22 producer; callers filter with ``Store.search(chunk_type=...)``.
23 """
25 RAW = "raw"
26 WIKI = "wiki"
29class SourceType(StrEnum):
30 """Values for the ``_sources.source_type`` column.
32 ``DOCUMENT`` mirrors a file under ``documents/`` and is managed by the
33 file-driven sync. ``IMPORTED`` is detached: it came from ``lilbee import``
34 and has no backing file, so sync must not treat it as a missing document.
35 """
37 DOCUMENT = "document"
38 IMPORTED = "imported"
# ``schema_version`` is stored as an integer for forward compatibility. Bump it
# only if a meta column ever has to be added or renamed without forcing every
# store through drop_all.
META_SCHEMA_VERSION = 1

# Predicate that holds for every row; it empties the single-row ``_meta`` table
# before the row is re-inserted. Lance's ``Table.delete`` requires a SQL where
# clause, and this one matches all rows without tying the deletion to the value
# domain of any particular column.
META_DELETE_ALL_PREDICATE = "schema_version IS NOT NULL"
class SearchScope(StrEnum):
    """User-facing choice of which chunks a search covers.

    The values appear verbatim on CLI flags, MCP params, and HTTP query
    strings. ``RAW`` and ``WIKI`` map 1:1 to the chunks-table values, while
    ``BOTH`` resolves to a ``None`` ``chunk_type`` (no filter).
    """

    RAW = ChunkType.RAW
    WIKI = ChunkType.WIKI
    BOTH = "both"
def scope_to_chunk_type(scope: SearchScope | str | None) -> ChunkType | None:
    """Convert a user-facing scope into the ``chunk_type`` arg of ``Store.search``.

    ``None`` and ``"both"`` mean "no filter" and yield ``None``; ``"raw"`` and
    ``"wiki"`` yield the matching ``ChunkType``. Any other string raises
    ``ValueError`` from the ``SearchScope`` lookup.
    """
    if scope is None:
        return None
    # Building the enum validates plain strings and passes members through.
    resolved = SearchScope(scope)
    return None if resolved is SearchScope.BOTH else ChunkType(resolved.value)
class SearchChunk(BaseModel):
    """One search hit returned from LanceDB.

    Which score is populated depends on how the hit was produced:

    - hybrid results set ``relevance_score`` (higher = better);
    - vector-only results set ``distance`` (lower = better);
    - reranked results set ``rerank_score`` (higher = better).
    """

    # Accept both the field names and the underscore-prefixed aliases below.
    model_config = ConfigDict(populate_by_name=True)

    source: str
    content_type: str
    chunk_type: ChunkType = ChunkType.RAW
    page_start: int
    page_end: int
    line_start: int
    line_end: int
    chunk: str
    chunk_index: int
    vector: list[float] = Field(repr=False)
    distance: float | None = Field(default=None, alias="_distance")
    relevance_score: float | None = Field(default=None, alias="_relevance_score")
    rerank_score: float | None = None

    @field_validator("chunk_type", mode="before")
    @classmethod
    def _coerce_none_chunk_type(cls, value: str | None) -> str:
        """Map ``None`` to ``RAW``: rows predating the chunk_type column return None."""
        return ChunkType.RAW if value is None else value
class SourceRecord(TypedDict):
    """Record describing one tracked source document."""

    filename: str
    file_hash: str
    ingested_at: str
    chunk_count: int
    # NOTE(review): presumably holds a ``SourceType`` value — confirm with writers.
    source_type: str
class PageTextRecord(TypedDict):
    """A single row of the per-page text dataset; mirrors ``_page_texts``."""

    source: str
    page: int
    text: str
    content_type: str
class CitationRecord(TypedDict):
    """Citation that ties a wiki chunk to a specific location in a source."""

    # The citing wiki chunk.
    wiki_source: str
    wiki_chunk_index: int
    citation_key: str
    claim_type: str
    # The cited source and the page/line span within it.
    source_filename: str
    source_hash: str
    page_start: int
    page_end: int
    line_start: int
    line_end: int
    excerpt: str
    created_at: str
145class MemoryKind(StrEnum):
146 """Whether a memory is an always-injected preference or a similarity-recalled fact."""
148 PREFERENCE = "preference"
149 FACT = "fact"
152class MemorySource(StrEnum):
153 """Provenance of a memory: user-typed, LLM-extracted, or agent-written."""
155 MANUAL = "manual"
156 EXTRACTED = "extracted"
157 AGENT = "agent"
# Owner namespaces for memories. The single human user (TUI/CLI/REST) is
# ``"local"``; each agent owns an ``"agent:<id>"`` namespace. The prefix is
# defined only here so that it is never hand-spliced elsewhere.
LOCAL_OWNER = "local"
AGENT_OWNER_PREFIX = "agent:"
def agent_owner(agent_id: str) -> str:
    """Build the owner string for an agent id (``"opencode"`` -> ``"agent:opencode"``)."""
    namespace = f"{AGENT_OWNER_PREFIX}{agent_id}"
    return namespace
def is_agent_owner(owner: str) -> bool:
    """Report whether *owner* names an agent namespace instead of the local human."""
    has_agent_prefix = owner.startswith(AGENT_OWNER_PREFIX)
    return has_agent_prefix
class MemoryRow(BaseModel):
    """One long-term memory entry from the per-library ``_memories`` table.

    Construct it from a LanceDB row with ``MemoryRow(**row)``, which coerces
    the ``kind`` and ``source`` strings to their enums, and serialise it back
    with ``model_dump(mode="json")``.
    """

    # Extra keys on the row, such as a search ``_distance``, are dropped.
    model_config = ConfigDict(extra="ignore")

    id: str
    owner: str
    shared: bool
    kind: MemoryKind
    source: MemorySource
    text: str
    vector: list[float] = Field(repr=False)
    created_at: str
    updated_at: str
class StoreMeta(TypedDict):
    """Single-row metadata naming the embedding model the store was built with.

    The row is checked for compatibility before every read and write. If
    ``cfg.embedding_model`` or ``cfg.embedding_dim`` drifts from the persisted
    row, the store refuses to serve until the chunks are rewritten under the
    new model, via ``lilbee rebuild`` (CLI) or
    ``POST /api/sync {"force_rebuild": true}`` (HTTP).

    ``updated_at`` holds an ISO 8601 UTC timestamp from
    ``datetime.isoformat()``. It stays a ``str`` to match the LanceDB ``utf8``
    schema column.
    """

    embedding_model: str
    embedding_dim: int
    schema_version: int
    updated_at: str
class EmbeddingModelMismatchError(RuntimeError):
    """Raised when stored vectors were built with a different embedder than ``cfg``.

    The persisted and configured model refs and dims are kept as attributes so
    each surface (TUI prompt, CLI command, REST body) can render its own
    recovery affordance from the facts.

    Instances survive ``pickle`` and ``copy`` round trips. The default
    ``BaseException`` reduction re-calls the constructor with the message as a
    single positional argument, which the keyword-only ``__init__`` rejects, so
    ``__reduce__`` is overridden to rebuild from the four fields instead.
    """

    def __init__(
        self,
        *,
        persisted_model: str,
        persisted_dim: int,
        current_model: str,
        current_dim: int,
    ) -> None:
        """Store the mismatch facts and derive the human-readable message.

        Args:
            persisted_model: Embedding model recorded in the store.
            persisted_dim: Vector dimension recorded in the store.
            current_model: Embedding model currently configured.
            current_dim: Vector dimension currently configured.
        """
        # The attributes must be set first: the message is built from them.
        self.persisted_model = persisted_model
        self.persisted_dim = persisted_dim
        self.current_model = current_model
        self.current_dim = current_dim
        super().__init__(self._build_message())

    @classmethod
    def _from_fields(
        cls,
        persisted_model: str,
        persisted_dim: int,
        current_model: str,
        current_dim: int,
    ) -> "EmbeddingModelMismatchError":
        """Positional constructor used by ``__reduce__`` to rebuild an instance."""
        return cls(
            persisted_model=persisted_model,
            persisted_dim=persisted_dim,
            current_model=current_model,
            current_dim=current_dim,
        )

    def __reduce__(self) -> tuple[object, ...]:
        """Describe how ``pickle``/``copy`` rebuild this exception.

        Returns a ``(callable, args, state)`` triple: the positional
        constructor, the four fields, and the instance ``__dict__`` so any
        extra attributes are restored as well.
        """
        fields = (
            self.persisted_model,
            self.persisted_dim,
            self.current_model,
            self.current_dim,
        )
        return (type(self)._from_fields, fields, dict(vars(self)))

    @property
    def dims_match(self) -> bool:
        """True when the index is adoptable by switching embedder alone (same dim)."""
        return self.persisted_dim == self.current_dim

    def _build_message(self) -> str:
        """Return the message for the same-dim or differing-dim case."""
        if self.dims_match:
            # Same dimension: either switch back to the old model or rebuild.
            return (
                f"This index was built with embedding model '{self.persisted_model}', "
                f"but lilbee is configured to use '{self.current_model}'. Configure lilbee "
                f"to use '{self.persisted_model}' to search this index, or rebuild it under "
                f"'{self.current_model}'."
            )
        # Different dimension: the message offers rebuilding as the only option.
        return (
            f"This index was built with embedding model '{self.persisted_model}' "
            f"(dim {self.persisted_dim}), which differs from the current "
            f"'{self.current_model}' (dim {self.current_dim}). The dimensions differ, "
            f"so rebuild the index under '{self.current_model}' to use it."
        )
@dataclass
class RemoveResult:
    """Outcome of a remove_documents operation."""

    # NOTE(review): presumably the requested names that were deleted and those
    # that had no match, respectively — confirm against remove_documents.
    removed: list[str]
    not_found: list[str]