Coverage for src / lilbee / data / store / types.py: 100%
69 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-05-15 20:55 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-05-15 20:55 +0000
1"""Public dataclasses, TypedDicts, enums, and constants for the store package."""
3from __future__ import annotations
5from dataclasses import dataclass
6from datetime import timedelta
7from enum import StrEnum
8from typing import TypedDict
10from pydantic import BaseModel, ConfigDict, Field, field_validator
# How often readers re-check the manifest for new versions from other processes.
# Zero means strong consistency (every read checks); higher values reduce disk I/O
# on slow media (HDD) at the cost of serving slightly stale data.
READ_CONSISTENCY_INTERVAL = timedelta(seconds=5)

# Values for the ``chunk_type`` column. Everything goes in as raw except wiki
# pages written by the wiki producer; callers filter with ``Store.search(chunk_type=...)``.
CHUNK_TYPE_RAW = "raw"
CHUNK_TYPE_WIKI = "wiki"

# ``schema_version`` is an integer for forward-compat. Bump only if we ever need to
# add or rename a meta column without forcing every store to drop_all.
META_SCHEMA_VERSION = 1

# Always-true predicate used to clear the single-row ``_meta`` table before re-insert.
# Lance's ``Table.delete`` requires a SQL where clause; this matches every row without
# coupling the deletion to any specific column's value domain.
# NOTE(review): this relies on ``schema_version`` never being NULL in a stored row.
META_DELETE_ALL_PREDICATE = "schema_version IS NOT NULL"
class SearchScope(StrEnum):
    """What the user wants to search over.

    Values are used as-is on CLI flags, MCP params, and HTTP query strings.
    ``BOTH`` resolves to a ``None`` ``chunk_type`` (no filter); the two
    others map 1:1 to the chunks-table values.
    """

    # RAW and WIKI reuse the chunks-table constants so the enum values and the
    # stored column values cannot drift apart.
    RAW = CHUNK_TYPE_RAW
    WIKI = CHUNK_TYPE_WIKI
    BOTH = "both"
def scope_to_chunk_type(scope: SearchScope | str | None) -> str | None:
    """Translate a user-facing scope into a ``Store.search`` ``chunk_type`` arg.

    ``None``/``"both"`` → no filter. ``"raw"`` / ``"wiki"`` → the matching
    chunks-table value. Raises ``ValueError`` on any other string.
    """
    if scope is None:
        return None
    # Constructing the enum both normalizes plain strings and rejects unknown
    # values with ``ValueError``.
    normalized = SearchScope(scope)
    if normalized is SearchScope.BOTH:
        return None
    return normalized.value
class SearchChunk(BaseModel):
    """A search result from LanceDB.

    Hybrid results have ``relevance_score`` set (higher = better).
    Vector-only results have ``distance`` set (lower = better).
    """

    # Allow population by field name as well as by the underscore-prefixed aliases
    # declared on ``distance`` and ``relevance_score``.
    model_config = ConfigDict(populate_by_name=True)

    source: str
    content_type: str
    chunk_type: str = CHUNK_TYPE_RAW

    @field_validator("chunk_type", mode="before")
    @classmethod
    def _coerce_none_chunk_type(cls, v: str | None) -> str:
        """LanceDB rows from before the chunk_type column was added return None."""
        return v if v is not None else CHUNK_TYPE_RAW

    page_start: int
    page_end: int
    line_start: int
    line_end: int
    chunk: str
    chunk_index: int
    # Excluded from ``repr`` so printed models are not dominated by the embedding.
    vector: list[float] = Field(repr=False)
    distance: float | None = Field(None, alias="_distance")
    relevance_score: float | None = Field(None, alias="_relevance_score")
class SourceRecord(TypedDict):
    """A tracked source document record."""

    filename: str
    file_hash: str
    ingested_at: str
    chunk_count: int
    source_type: str
class CitationRecord(TypedDict):
    """A citation linking a wiki chunk to a specific source location."""

    wiki_source: str
    wiki_chunk_index: int
    citation_key: str
    claim_type: str
    source_filename: str
    source_hash: str
    page_start: int
    page_end: int
    line_start: int
    line_end: int
    excerpt: str
    created_at: str
class StoreMeta(TypedDict):
    """Single-row store metadata recording the embedding model used to build the store.

    Compatibility is checked before every read and write. When ``cfg.embedding_model``
    or ``cfg.embedding_dim`` drifts from the persisted row, the store refuses to serve
    until ``lilbee rebuild`` (CLI) or ``POST /api/sync {"force_rebuild": true}`` (HTTP)
    rewrites the chunks under the new model.

    ``updated_at`` is an ISO 8601 UTC timestamp produced by ``datetime.isoformat()``;
    kept as ``str`` to match the LanceDB ``utf8`` schema column.
    """

    embedding_model: str
    embedding_dim: int
    schema_version: int
    updated_at: str
class EmbeddingModelMismatchError(RuntimeError):
    """Raised when stored vectors were built with a different embedding model than ``cfg``.

    Carries a user-facing message naming both the persisted and the configured model and
    pointing at the two recovery paths (``lilbee rebuild`` and ``POST /api/sync`` with
    ``force_rebuild=true``).
    """
@dataclass
class RemoveResult:
    """Result of a remove_documents operation."""

    removed: list[str]
    not_found: list[str]