Coverage for src/lilbee/core/config/model.py: 100% (392 statements)

coverage.py v7.13.4, created at 2026-05-15 20:55 +0000

1"""The :class:`Config` dataclass and the ``cfg`` singleton. 

2 

3The settings sources, TOML parser, and the resilient builder that falls 

4back to defaults on stale-config validation failures live here too. Every 

5``from lilbee.core.config import cfg`` resolves through ``lilbee.core.config.__init__`` 

6to the same instance defined at module bottom. 

7""" 

8 

9import logging 

10import os 

11from pathlib import Path 

12from typing import Any, ClassVar 

13 

14from pydantic import Field, ValidationInfo, field_validator, model_validator 

15from pydantic_settings import BaseSettings, SettingsConfigDict 

16 

17from .defaults import ( 

18 DEFAULT_ALLOWED_NER_LABELS, 

19 DEFAULT_CORS_ORIGIN_REGEX, 

20 DEFAULT_CRAWL_EXCLUDE_PATTERNS, 

21 DEFAULT_GENERAL_SYSTEM_PROMPT, 

22 DEFAULT_IGNORE_DIRS, 

23 DEFAULT_RAG_SYSTEM_PROMPT, 

24) 

25from .enums import ChatMode, ClustererBackend, KvCacheType, WikiEntityMode 

26from .parsing import parse_bool 

27from .validators import ConfigField 

28 

29log = logging.getLogger(__name__) 

30 

31# Sentinel for unset Path-typed fields. ``Field(default=Path())`` produces an 

32# instance equal to this, so the model_validator can distinguish "user passed 

33# the default" from "user explicitly set a value". 

34_UNSET_PATH = Path() 

35 

36 

37class Config(BaseSettings): 

38 """Runtime configuration: one singleton instance, mutated by CLI overrides.""" 

39 

40 model_config = SettingsConfigDict( 

41 env_prefix="LILBEE_", 

42 validate_assignment=True, 

43 arbitrary_types_allowed=True, 

44 extra="ignore", 

45 ) 

46 

47 # Paths: resolved from env/defaults in model_validator(mode='before') 

48 data_root: Path = Field(default=Path()) 

49 # Writable so plugin-managed servers can pivot storage to a vault path on 

50 # first boot; rebuild the index after migrating. 

51 documents_dir: Path = ConfigField(default=Path(), writable=True) 

52 data_dir: Path = Field(default=Path()) 

53 lancedb_dir: Path = Field(default=Path()) 

54 models_dir: Path = Field(default=Path()) 

55 # Markdown vault root; when set, search results carry a vault-relative 

56 # ``vault_path`` so a host UI can deep-link into the vault. 

57 vault_base: Path | None = ConfigField(default=None, writable=True) 

58 

59 chat_model: str = Field(default="Qwen/Qwen3-0.6B-GGUF/Qwen3-0.6B-Q8_0.gguf", min_length=1) 

60 embedding_model: str = Field( 

61 default="nomic-ai/nomic-embed-text-v1.5-GGUF/nomic-embed-text-v1.5.Q4_K_M.gguf", 

62 min_length=1, 

63 ) 

64 # Vision OCR model for scanned PDFs and image-only pages. Empty = disabled; 

65 # there is no cross-role fallback onto the chat model even if multimodal. 

66 vision_model: str = ConfigField(default="", public=True) 

67 embedding_dim: int = Field(default=768, ge=1) 

68 chunk_size: int = ConfigField(default=512, ge=64, writable=True, reindex=True) 

69 chunk_overlap: int = ConfigField(default=100, ge=0, writable=True, reindex=True) 

70 max_embed_chars: int = Field(default=2000, ge=1) 

71 top_k: int = ConfigField(default=8, ge=1, writable=True) 

72 max_distance: float = ConfigField(default=0.65, ge=0.0, writable=True) 

73 # Floor for hybrid-search relevance scores (0.0 = no filtering). lilbee 

74 # surfaces LanceDB's raw RRF sum, not a normalized score: with K=60 a 

75 # chunk ranked first in both the vector and FTS lists tops out near 

76 # 1/61 + 1/61 ~= 0.033, so any positive floor above that silently drops 

77 # every result. Keep this at 0.0 unless the RRF scores are normalized first. 

78 min_relevance_score: float = ConfigField(default=0.0, ge=0.0, writable=True) 
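    # A minimal sketch of the RRF sum described above (hypothetical helper,
    # assuming LanceDB's default K=60 and 1-based ranks in the vector and
    # FTS lists):
    #
    #     def rrf_sum(ranks: list[int], k: int = 60) -> float:
    #         return sum(1.0 / (k + r) for r in ranks)
    #
    #     rrf_sum([1, 1])  # ~= 0.0328, the practical ceiling of the raw score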

    adaptive_threshold: bool = Field(default=False)
    rag_system_prompt: str = ConfigField(
        default=DEFAULT_RAG_SYSTEM_PROMPT, min_length=1, writable=True
    )
    general_system_prompt: str = ConfigField(
        default=DEFAULT_GENERAL_SYSTEM_PROMPT, min_length=1, writable=True
    )
    chat_mode: str = ConfigField(default=ChatMode.SEARCH.value, writable=True)
    ignore_dirs: frozenset[str] = Field(default=DEFAULT_IGNORE_DIRS)
    # OCR for scanned PDFs via vision-capable chat model.
    # None = auto-detect (use OCR if chat model is vision-capable).
    # True = force OCR regardless of detection.
    # False = disable OCR entirely.
    enable_ocr: bool | None = ConfigField(default=None, writable=True)
    # Per-page timeout in seconds for vision OCR (0 = no limit).
    ocr_timeout: float = ConfigField(default=120.0, ge=0.0, writable=True)
    # Outer wall-clock budget for the streamed pool drain: load grace plus
    # per_page * pages. Tune up for slow hardware (M1 Pro vision is
    # ~5min/page) or down for fast hardware. ocr_timeout still governs the
    # per-page expectation that drives the total budget.
    vision_load_budget_s: float = ConfigField(default=300.0, ge=0.0, writable=True)

    # Tesseract fallback wall-clock timeout per file, seconds. 0 = no cap.
    tesseract_timeout: float = ConfigField(default=60.0, ge=0.0, writable=True)
    semantic_chunking: bool = ConfigField(default=False, writable=True)
    topic_threshold: float = ConfigField(default=0.75, ge=0.0, le=1.0, writable=True)
    server_host: str = "127.0.0.1"
    server_port: int = Field(default=0, ge=0, le=65535)
    cors_origins: list[str] = Field(default_factory=list)
    cors_origin_regex: str = Field(default=DEFAULT_CORS_ORIGIN_REGEX)
    # Seconds between SSE heartbeat events when the producer queue is idle.
    # Must stay well below the plugin's STREAM_IDLE_TIMEOUT_MS (120s) so a
    # single long-running vision OCR page can't starve the client into aborting.
    sse_heartbeat_interval: float = ConfigField(default=30.0, ge=0.0, writable=True)
    json_mode: bool = False
    temperature: float | None = ConfigField(default=0.1, ge=0.0, writable=True)
    top_p: float | None = ConfigField(default=0.9, ge=0.0, le=1.0, writable=True)
    top_k_sampling: int | None = ConfigField(default=40, ge=1, writable=True)
    # 1.1 is llama.cpp's default. Leaving this at None caused n-gram loops
    # ("tire tire tire...") on some open-weights models.
    repeat_penalty: float | None = ConfigField(default=1.1, ge=0.0, writable=True)
    num_ctx: int | None = ConfigField(default=None, ge=1, writable=True)
    max_tokens: int | None = ConfigField(default=4096, ge=1, writable=True)
    seed: int | None = ConfigField(default=None, writable=True)
    llm_provider: str = ConfigField(default="auto", writable=True)
    remote_base_url: str = ConfigField(default="http://localhost:11434", writable=True)
    llm_api_key: str = ConfigField(default="", writable=True, write_only=True)
    openrouter_api_key: str = ConfigField(default="", writable=True, write_only=True)
    gemini_api_key: str = ConfigField(default="", writable=True, write_only=True)
    anthropic_api_key: str = ConfigField(default="", writable=True, write_only=True)
    openai_api_key: str = ConfigField(default="", writable=True, write_only=True)
    mistral_api_key: str = ConfigField(default="", writable=True, write_only=True)
    deepseek_api_key: str = ConfigField(default="", writable=True, write_only=True)

    # Retrieval quality knobs.

    # Max chunks per source in top-k; prevents one large file monopolizing results.
    diversity_max_per_source: int = ConfigField(default=3, ge=1, writable=True)

    # MMR relevance/diversity tradeoff; 0 = max diversity, 1 = pure relevance
    # (Carbonell & Goldstein 1998).
    mmr_lambda: float = ConfigField(default=0.5, ge=0.0, le=1.0, writable=True)

    # Extra candidates retrieved for MMR reranking (multiplies top_k).
    candidate_multiplier: int = ConfigField(default=3, ge=1, writable=True)

    # LLM-generated alternative queries for expansion. 0 disables.
    query_expansion_count: int = ConfigField(default=3, ge=0, writable=True)

    # Skip LLM expansion when tokenized query length ≤ this. The LLM round-trip
    # dominates latency on small local models; short queries already have strong
    # BM25/vector signal. Concept-graph expansion still runs. 0 disables the skip.
    expansion_short_query_tokens: int = ConfigField(default=2, ge=0, writable=True)

    # Cosine-distance step when adaptive-widening retry kicks in.
    adaptive_threshold_step: float = ConfigField(default=0.2, gt=0.0, writable=True)

    # Reject expansion variants below expansion_similarity_threshold.
    expansion_guardrails: bool = ConfigField(default=True, writable=True)

    # Min cosine similarity between question and variant embeddings.
    expansion_similarity_threshold: float = ConfigField(default=0.5, ge=0.0, le=1.0, writable=True)

    # Sigmoid-normalized BM25 score above which query expansion is skipped.
    expansion_skip_threshold: float = Field(default=0.8, ge=0.0, le=1.0)

    # Min BM25 top-1 vs top-2 gap to skip expansion.
    expansion_skip_gap: float = Field(default=0.15, ge=0.0, le=1.0)

    # Chunks included in LLM context after adaptive selection.
    max_context_sources: int = ConfigField(default=6, ge=1, writable=True)

    # HyDE (Gao et al. 2022): hypothetical-answer embedding search. +~500ms.
    hyde: bool = ConfigField(default=False, writable=True)

    # HyDE result weight relative to real-doc search (0.0-1.0).
    hyde_weight: float = ConfigField(default=0.7, ge=0.0, le=1.0, writable=True)

    # HyDE prompt template. Must contain {question} placeholder.
    hyde_prompt: str = (
        "Write a 50-100 word passage that directly answers this question as if "
        "it were an excerpt from a real document. Do not include any preamble, "
        "just write the passage.\n\nQuestion: {question}"
    )
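    # Formatting sketch (hypothetical call site; the real consumer lives
    # elsewhere): the template is filled with str.format, so it must keep
    # the {question} placeholder intact:
    #
    #     prompt = cfg.hyde_prompt.format(question="What changed in Q3?")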

    # Reranker model ref. Empty disables reranking. Native GGUFs use
    # llama-cpp rank pooling; hosted refs (cohere/voyage/jina/together/hf-tei)
    # need the backend extra.
    reranker_model: str = ConfigField(default="", public=True)

    # Candidate count sent to the reranker.
    rerank_candidates: int = ConfigField(default=60, ge=1, writable=True, public=True)

    # Date-range filter; only fires when a temporal keyword is detected.
    temporal_filtering: bool = ConfigField(default=True, writable=True)

    # If True, emit <think>…</think> content as separate SSE reasoning events;
    # if False, strip it silently.
    show_reasoning: bool = ConfigField(default=False, writable=True)

    # Maximum reasoning characters before lilbee forces the model to answer.
    # Per-model overrides apply on top of this default. Approx N/4 tokens.
    # 0 disables the cap (unlimited reasoning; accept the runaway-loop risk).
    max_reasoning_chars: int = ConfigField(default=64_000, ge=0, writable=True)

    # Web crawling.

    # Optional global ceilings. None = no ceiling.
    crawl_max_depth: int | None = ConfigField(default=None, ge=0, writable=True)
    crawl_max_pages: int | None = ConfigField(default=None, ge=1, writable=True)

    # Per-URL fetch timeout, seconds.
    crawl_timeout: int = ConfigField(default=30, ge=1, writable=True)

    # 0 = unlimited, default = CPU count.
    crawl_max_concurrent: int = Field(default=0, ge=0)

    # Seconds between periodic syncs during crawl. 0 = sync only at end.
    crawl_sync_interval: int = ConfigField(default=30, ge=0, writable=True)

    # Per-request delay + jitter (defaults chosen to be gentler than crawl4ai's).
    crawl_mean_delay: float = ConfigField(default=0.5, ge=0.0, writable=True)
    crawl_max_delay_range: float = ConfigField(default=0.5, ge=0.0, writable=True)

    # In-flight requests per crawl.
    crawl_concurrent_requests: int = ConfigField(default=3, ge=1, writable=True)

    # Per-domain rate-limiter that backs off on HTTP 429/503 and retries.
    crawl_retry_on_rate_limit: bool = ConfigField(default=True, writable=True)
    crawl_retry_base_delay_min: float = ConfigField(default=1.0, ge=0.0, writable=True)
    crawl_retry_base_delay_max: float = ConfigField(default=3.0, ge=0.0, writable=True)
    crawl_retry_max_backoff: float = ConfigField(default=30.0, ge=0.0, writable=True)
    crawl_retry_max_attempts: int = ConfigField(default=3, ge=0, writable=True)

    # Regex patterns dropped at link-discovery time. Defaults block CMS
    # scaffolding (WordPress admin, archives, tracking params, etc.).
    crawl_exclude_patterns: list[str] = ConfigField(
        default_factory=lambda: list(DEFAULT_CRAWL_EXCLUDE_PATTERNS),
        writable=True,
    )

    # Fraction of GPU/unified memory reserved for loaded models.
    gpu_memory_fraction: float = ConfigField(default=0.75, ge=0.1, le=1.0, writable=True)

    # Seconds a model stays loaded after last use. 0 = unload immediately.
    model_keep_alive: int = ConfigField(default=300, ge=0, writable=True)

    # Per-call deadline for one pool round-trip (send + recv). Embed batches
    # larger than this on slow machines surface as TimeoutError; raise for
    # heavy ingest jobs.
    worker_pool_call_timeout_s: float = ConfigField(default=300.0, gt=0.0, writable=True)

    # Spawn every configured role at startup instead of on first use. Trades
    # a slower TUI mount (~1-3s per worker, cold-started in parallel) for a
    # responsive first interaction. Roles whose model is unset are skipped,
    # so a setup with only chat + embed never spawns rerank or vision.
    # Set to false for headless / scripted use where the first call doesn't
    # need to be fast.
    worker_pool_eager_start: bool = ConfigField(default=True, writable=True)

    # Idle worker reap. A worker that has been quiet for this many seconds
    # is shut down to free RAM/VRAM; the next request respawns it.
    # ``0`` disables reaping (workers stay up until TUI exit).
    worker_pool_max_idle_s: float = ConfigField(default=300.0, ge=0.0, writable=True)

    # Upper bound for the dynamic n_ctx picker. The picker chooses the
    # largest 256-multiple ctx that fits in available memory and the
    # model's training window; this caps it at a sane ceiling.
    num_ctx_max: int = ConfigField(default=16384, ge=512, writable=True)

    # Flash attention. None (default) = on with TypeError fallback for
    # older llama-cpp-python builds, True = force on, False = off.
    # Resolves the 'padding V cache to 1024' warning on models with
    # uneven per-layer V dims (e.g. Gemma3) and saves ~25% KV memory.
    flash_attention: bool | None = ConfigField(default=None, writable=True)

    # KV cache element type. q8_0 / q4_0 halve or quarter cache memory
    # but require flash attention to be enabled.
    kv_cache_type: KvCacheType = ConfigField(default=KvCacheType.F16, writable=True)

    # Number of model layers to offload to GPU. None (default) = all
    # layers, 0 = CPU only, positive int = partial offload. Useful when a
    # discrete GPU has less VRAM than the model needs.
    n_gpu_layers: int | None = ConfigField(default=None, writable=True)

    # GPU device picker for dual-GPU machines (typical laptop case:
    # discrete NVIDIA + integrated Intel/AMD). The Vulkan backend
    # enumerates every adapter the system exposes and may pick the
    # integrated one first, producing stalls or OOMs that look like
    # llama.cpp bugs. Setting ``gpu_devices`` constrains visibility
    # before llama_cpp loads, pinning inference to the chosen device(s).
    #
    # Accepts a comma-separated list of device indexes ("0", "1",
    # "0,1") and applies it to every backend simultaneously:
    # ``GGML_VK_VISIBLE_DEVICES`` for Vulkan, ``CUDA_VISIBLE_DEVICES``
    # for CUDA, ``HIP_VISIBLE_DEVICES`` / ``ROCR_VISIBLE_DEVICES`` for
    # ROCm. Setting one variable that the active backend ignores is
    # harmless, so we set all four rather than detecting the build.
    #
    # Must be set before the first llama.cpp call; in practice that
    # means via ``LILBEE_GPU_DEVICES`` or ``config.toml`` (TUI edits
    # only take effect after a restart). ``None`` (default) hands off
    # to the autodetect in ``providers/llama_cpp/gpu_select.py``,
    # which parses ``vulkaninfo --summary`` and pins the discrete
    # adapter when one is present. The autodetect is silent on failure
    # (no vulkaninfo, single device, parse error), leaving the
    # Vulkan-loader's default ordering in place.
    gpu_devices: str | None = ConfigField(default=None, writable=True)
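    # Example (hypothetical shell session): pin inference to adapter 0
    # before the process starts, since the value must land before the
    # first llama.cpp call:
    #
    #     LILBEE_GPU_DEVICES=0 lilbee
    #
    # which exports GGML_VK_VISIBLE_DEVICES, CUDA_VISIBLE_DEVICES,
    # HIP_VISIBLE_DEVICES, and ROCR_VISIBLE_DEVICES as "0".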

    # Primary GPU index passed to ``Llama(main_gpu=...)``. Only matters
    # when multiple devices remain visible after ``gpu_devices``; with
    # a single visible device, llama.cpp ignores this. ``None``
    # (default) lets llama.cpp pick (index 0).
    main_gpu: int | None = ConfigField(default=None, writable=True)

    # True = Markdown widget for chat; False = plain Static (faster).
    markdown_rendering: bool = True

    # TUI theme name; persists the last Ctrl+T pick across sessions.
    theme: str = ConfigField(default="rose-pine", writable=True)

    # Per-model generation defaults set via apply_model_defaults().
    _model_defaults: Any = None

    # Wiki layer. LLM-maintained synthesis pages with citation provenance.
    # Off by default; flip to True (or set LILBEE_WIKI=1) to enable. When off,
    # the Wiki view tab and the chat ModelBar's scope picker are both hidden.
    wiki: bool = ConfigField(default=False, writable=True)
    wiki_dir: str = "wiki"
    wiki_prune_raw: bool = ConfigField(default=False, writable=True)

    # Minimum cosine similarity between a page body and the mean of its
    # source chunk vectors before a page is published (below → drafts).
    # Replaces the old LLM-based faithfulness score: mean-of-chunks is a
    # deterministic, zero-LLM-call signal that routes topic-drifted
    # pages to drafts without the 0.0 to 1.0 ambiguity of a model-emitted
    # number. Tuning knob: swap to per-chunk max or top-K-mean if the
    # default 0.5 produces false drafts.
    wiki_embedding_faithfulness_threshold: float = ConfigField(
        default=0.5, ge=0.0, le=1.0, writable=True
    )
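    # Gate sketch (assumed shapes and helper name; not the actual
    # implementation): cosine similarity between the page-body embedding
    # and the mean of its source chunk vectors, compared to the threshold:
    #
    #     import numpy as np
    #
    #     def publishable(page_vec: np.ndarray, chunk_vecs: np.ndarray, thr: float) -> bool:
    #         mean = chunk_vecs.mean(axis=0)
    #         cos = float(page_vec @ mean) / (
    #             float(np.linalg.norm(page_vec)) * float(np.linalg.norm(mean))
    #         )
    #         return cos >= thr  # below the threshold, the page goes to drafts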

    # Per-call output token cap for wiki generation. Without this a
    # reasoning model (Qwen3, DeepSeek-R1) can burn the full context
    # window emitting <think> tokens before the actual answer, taking
    # minutes per page. Default leaves headroom for a typical reasoning
    # budget plus a real response (~1000 output + ~1000 slack).
    wiki_summary_max_tokens: int = ConfigField(default=2048, ge=256, writable=True)

    # Wiki generation is a structured-output task: the model must emit the
    # block separators, the citation footnotes, and verbatim quotes. The
    # usual chat default (~0.8) is too creative for that. Lowering the
    # sampling temperature makes the model stick to the template and quote
    # more faithfully. 0.1 leaves just enough slack to avoid hard loops.
    wiki_temperature: float = ConfigField(default=0.1, ge=0.0, le=2.0, writable=True)

    # Fraction of citations that must be stale before a wiki page is flagged.
    wiki_stale_citation_threshold: float = Field(default=0.5, ge=0.0, le=1.0)

    # Fraction of content changed that triggers human-review drift guard.
    wiki_drift_threshold: float = Field(default=0.3, ge=0.0, le=1.0)

    # LLM prompt templates for wiki page generation. Writable so advanced
    # users can override them from /settings, config.toml, or
    # ``LILBEE_WIKI_*_PROMPT`` env vars. Templates must keep the expected
    # ``{placeholders}``. If you remove one, the generator will crash on
    # first use. The defaults below are the only reason the pipeline
    # works out of the box.
    wiki_summary_prompt: str = ConfigField(
        writable=True,
        default=(
            "You are a knowledge compiler. Given the source chunks below from a single "
            "document, write a concise wiki summary page in markdown.\n\n"
            "Rules:\n"
            "1. Every factual claim MUST have an inline citation [^src1], [^src2], etc.\n"
            "2. Cite the EXACT text from the source that supports each claim by quoting it.\n"
            "3. For interpretations or connections not directly stated in the source, "
            "mark with [*inference*].\n"
            "4. Use blockquotes (>) for directly cited facts.\n"
            "5. End with a citation block in this format:\n\n"
            "---\n"
            "<!-- citations (auto-generated from _citations table -- do not edit) -->\n"
            '[^src1]: {source_name}, excerpt: "exact quoted text"\n'
            '[^src2]: {source_name}, excerpt: "exact quoted text"\n\n'
            "Source document: {source_name}\n\n"
            "Chunks:\n{chunks_text}\n\n"
            "Write the wiki summary page now. Start with a heading."
        ),
    )
    wiki_synthesis_prompt: str = ConfigField(
        writable=True,
        default=(
            "You are a knowledge compiler. Given source chunks from MULTIPLE documents "
            "about related concepts, write a synthesis wiki page in markdown that connects "
            "ideas across sources.\n\n"
            "Rules:\n"
            "1. Every factual claim MUST have an inline citation [^src1], [^src2], etc.\n"
            "2. Cite the EXACT text from the source that supports each claim by quoting it.\n"
            "3. For connections, interpretations, or patterns you identify across sources, "
            "mark with [*inference*].\n"
            "4. Use blockquotes (>) for directly cited facts.\n"
            "5. Reference each source by its filename when drawing connections.\n"
            "6. End with a citation block in this format:\n\n"
            "---\n"
            "<!-- citations (auto-generated from _citations table -- do not edit) -->\n"
            '[^src1]: {{source_name}}, excerpt: "exact quoted text"\n'
            '[^src2]: {{source_name}}, excerpt: "exact quoted text"\n\n'
            "Topic: {topic}\n\n"
            "Sources:\n{source_list}\n\n"
            "Chunks:\n{chunks_text}\n\n"
            "Write the synthesis page now. Start with a heading."
        ),
    )

    # Wiki synthesis clusterer backend. CONCEPTS requires the [graph] extra
    # and falls back to EMBEDDING when unavailable.
    wiki_clusterer: ClustererBackend = ConfigField(
        default=ClustererBackend.EMBEDDING, writable=True
    )

    # Neighborhood size for the mutual-kNN graph. 0 = auto-scale from corpus size.
    wiki_clusterer_k: int = ConfigField(default=0, ge=0, writable=True)

    # LazyGraphRAG-style concept graph. Requires the [graph] extra.
    concept_graph: bool = ConfigField(default=True, writable=True)

    # Weight of concept overlap boost relative to vector similarity.
    concept_boost_weight: float = ConfigField(default=0.3, ge=0.0, le=1.0, writable=True)

    # Floor on post-boost distance to stop weak boosts from promoting marginal hits.
    concept_boost_floor: float = ConfigField(default=0.05, ge=0.0, writable=True)

    # Max noun-phrase concepts extracted per chunk.
    concept_max_per_chunk: int = ConfigField(default=5, ge=1, writable=True)

    # spaCy NER labels kept by the wiki entity extractor. Anything not
    # in this set (QUANTITY, CARDINAL, DATE, TIME, MONEY, PERCENT,
    # ORDINAL, ...) is dropped before aggregation. Override via
    # LILBEE_CONCEPT_ALLOWED_ENT_TYPES as a comma-separated list.
    concept_allowed_ent_types: frozenset[str] = Field(default=DEFAULT_ALLOWED_NER_LABELS)

    # Strategy used to extract entities for the concept/entity wiki.
    # NER_ENTITIES (default) pulls typed NER entities with spaCy; concept
    # pages are proposed by the LLM inside the per-source batched call,
    # not by the extractor. NER_CONCEPTS_PLUS_LLM_TYPES layers an
    # LLM-proposed domain schema on top. LLM_TAGGED asks the LLM to tag
    # every chunk (most expensive). Unimplemented modes fall back to
    # NER_ENTITIES.
    wiki_entity_mode: WikiEntityMode = ConfigField(
        default=WikiEntityMode.NER_ENTITIES, writable=True
    )

    # Minimum distinct chunk mentions before an entity or concept earns
    # its own wiki page. Filters one-off noise.
    wiki_entity_min_mentions: int = ConfigField(default=3, ge=1, writable=True)

    # Maximum chunks passed into each concept or entity page generation
    # call. Caps context size so one page does not blow the context
    # window on a prolific topic.
    wiki_concept_max_chunks_per_page: int = ConfigField(default=25, ge=1, writable=True)

    # Maximum number of related concepts the model is asked to list in
    # the `## Related` section of each page.
    wiki_related_max: int = ConfigField(default=8, ge=0, writable=True)

    # Auto-update cap: if a single sync touches more than this many
    # concept or entity pages, skip the per-slug regeneration and tell
    # the user to run `lilbee wiki update` explicitly. Keeps a surprise
    # bulk import from firing hundreds of LLM calls.
    wiki_ingest_update_cap: int = ConfigField(default=20, ge=1, writable=True)

    # Whether the per-source batched call asks the LLM to curate
    # concept pages alongside the pre-extracted entity list. False →
    # entity sections only, no concept curation (the incremental ingest
    # path uses this to avoid churning concept slugs per source-touch).
    wiki_extract_concepts: bool = ConfigField(default=True, writable=True)

    # Minimum chunk count a source must contribute before it is eligible
    # for concept curation. Sources below the floor still get a batched
    # call when they have entities (the prompt writes entity-only
    # sections); sources below the floor with zero entities are skipped
    # entirely. Prevents boilerplate / TOC / appendix documents from
    # burning an LLM call to invent "concepts".
    wiki_batch_min_chunks: int = ConfigField(default=3, ge=1, writable=True)

    # Prompt template for the per-source batched call. Placeholders:
    # {source}, {entity_list}, {chunks_text}, {concept_instruction}.
    # {concept_instruction} is filled with a concept-curation paragraph
    # when concepts are requested, or the empty string otherwise.
    wiki_entity_batch_prompt: str = ConfigField(
        writable=True,
        default=(
            "You are writing wiki sections based on these chunks from {source}.\n\n"
            "{concept_instruction}"
            "Write a wiki section for each of these NER ENTITIES: {entity_list}\n\n"
            "Format each section exactly as:\n"
            "## Name\n"
            "{{content with [^src1]-style citations}}\n\n"
            "Rules:\n"
            "1. Every factual claim MUST have an inline citation [^src1], [^src2], etc.\n"
            "2. Cite the EXACT text from the source that supports each claim by quoting it.\n"
            "3. For interpretations or connections not directly stated, mark with [*inference*].\n"
            "4. Use blockquotes (>) for directly cited facts.\n"
            "5. End the response with a citation block in this format:\n\n"
            "---\n"
            "<!-- citations (auto-generated from _citations table -- do not edit) -->\n"
            '[^src1]: {{source_name}}, excerpt: "exact quoted text"\n'
            '[^src2]: {{source_name}}, excerpt: "exact quoted text"\n\n'
            "Source chunks:\n{chunks_text}\n"
        ),
    )
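    # Formatting sketch (hypothetical values): single-braced placeholders
    # are filled by str.format, while double-braced spans such as
    # {{source_name}} survive as literal {source_name} text in the
    # rendered prompt:
    #
    #     prompt = cfg.wiki_entity_batch_prompt.format(
    #         source="notes.md",
    #         entity_list="Ada Lovelace, Analytical Engine",
    #         chunks_text="[chunk 1] ...",
    #         concept_instruction="",
    #     )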

    # Class variable: not a settings field
    _toml_cache: ClassVar[dict[str, Any]] = {}

    @field_validator(
        "temperature",
        "top_p",
        "repeat_penalty",
        "top_k_sampling",
        "num_ctx",
        "seed",
        mode="before",
    )
    @classmethod
    def _empty_string_to_none(cls, v: Any) -> Any:
        if isinstance(v, str) and v.strip() == "":
            return None
        return v
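    # e.g. a settings form that clears the field and persists
    # LILBEE_TEMPERATURE="" now yields temperature=None instead of a
    # ValidationError (hypothetical scenario, shown for illustration).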

    @field_validator("chat_mode", mode="before")
    @classmethod
    def _normalize_chat_mode(cls, v: Any) -> str:
        """Coerce chat_mode to a ChatMode value; default ChatMode.SEARCH."""
        if v is None or v == "":
            return ChatMode.SEARCH.value
        candidate = str(v).strip().lower()
        try:
            return ChatMode(candidate).value
        except ValueError as exc:
            valid = ", ".join(repr(m.value) for m in ChatMode)
            raise ValueError(f"chat_mode must be one of {{{valid}}}, got {v!r}") from exc

    @field_validator("enable_ocr", mode="before")
    @classmethod
    def _parse_enable_ocr(cls, v: Any) -> bool | None:
        """Parse enable_ocr from env var string or direct value.

        Accepts: true/false/1/0/yes/no (case-insensitive), empty string
        or None for auto-detect.
        """
        if v is None:
            return None
        if isinstance(v, bool):
            return v
        if isinstance(v, str):
            if v.strip().lower() in ("", "auto", "none"):
                return None
            try:
                return parse_bool(v)
            except ValueError:
                pass  # fall through to bool() coercion below for unrecognised strings
        return bool(v)

    @field_validator("flash_attention", mode="before")
    @classmethod
    def _parse_flash_attention(cls, v: Any) -> bool | None:
        """Auto/on/off tri-state: empty/auto/none -> None, else parse bool."""
        if v is None:
            return None
        if isinstance(v, bool):
            return v
        if isinstance(v, str):
            if v.strip().lower() in ("", "auto", "none"):
                return None
            try:
                return parse_bool(v)
            except ValueError:
                return None
        return bool(v)

    @field_validator("n_gpu_layers", mode="before")
    @classmethod
    def _parse_n_gpu_layers(cls, v: Any) -> int | None:
        """Auto -> None, ``cpu`` alias -> 0, integers parsed verbatim."""
        if v is None:
            return None
        if isinstance(v, str):
            label = v.strip().lower()
            if label in ("", "auto", "none"):
                return None
            if label == "cpu":
                return 0
            try:
                return int(label)
            except ValueError:
                log.warning("Invalid LILBEE_N_GPU_LAYERS=%r, using auto", v)
                return None
        return int(v)

    @field_validator("main_gpu", mode="before")
    @classmethod
    def _parse_main_gpu(cls, v: Any) -> int | None:
        """Empty/auto strings -> None, integers parsed verbatim."""
        if v is None:
            return None
        if isinstance(v, str):
            label = v.strip().lower()
            if label in ("", "auto", "none"):
                return None
            try:
                return int(label)
            except ValueError:
                log.warning("Invalid LILBEE_MAIN_GPU=%r, using auto", v)
                return None
        return int(v)

    @field_validator("gpu_devices", mode="before")
    @classmethod
    def _parse_gpu_devices(cls, v: Any) -> str | None:
        """Normalize device list: strip whitespace, drop empties, keep order."""
        if v is None:
            return None
        if isinstance(v, str):
            label = v.strip().lower()
            if label in ("", "auto", "all", "none"):
                return None
            parts = [p.strip() for p in v.split(",") if p.strip()]
            if not parts:
                return None
            for part in parts:
                if not part.lstrip("-").isdigit():
                    log.warning("Invalid LILBEE_GPU_DEVICES=%r, ignoring", v)
                    return None
            return ",".join(parts)
        return str(v)

    @field_validator("semantic_chunking", mode="before")
    @classmethod
    def _parse_semantic_chunking(cls, v: Any) -> bool:
        """Parse from env string; invalid values warn and fall back to False."""
        if isinstance(v, bool):
            return v
        if isinstance(v, str):
            try:
                return parse_bool(v)
            except ValueError:
                log.warning("Invalid LILBEE_SEMANTIC_CHUNKING=%r, using default False", v)
                return False
        return bool(v)

    @field_validator(
        "chat_model", "embedding_model", "vision_model", "reranker_model", mode="after"
    )
    @classmethod
    def _normalize_model_tag(cls, v: str, info: ValidationInfo) -> str:
        """Validate and canonicalize a model ref; blank clears optional roles."""
        if not v or not v.strip():
            if info.field_name in {"chat_model", "embedding_model"}:
                raise ValueError(f"{info.field_name} must not be blank")
            return ""
        from lilbee.providers.model_ref import parse_model_ref

        return parse_model_ref(v).for_openai_prefix()

    @field_validator("cors_origins", mode="before")
    @classmethod
    def _split_cors_origins(cls, v: Any) -> Any:
        if isinstance(v, str):
            return [o.strip() for o in v.split(",") if o.strip()]
        return v

    @field_validator("crawl_exclude_patterns", mode="before")
    @classmethod
    def _split_crawl_exclude_patterns(cls, v: Any) -> Any:
        """Accept newline-separated strings from env vars / plain-text config.

        Regex commonly uses commas (e.g. `{2,4}`) and pipes (alternation), so
        newline is the only separator safe to use for this field. TOML lists
        and JSON arrays pass through unchanged.
        """
        if isinstance(v, str):
            return [p.strip() for p in v.splitlines() if p.strip()]
        return v
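    # e.g. a hypothetical env value -- newline-separated because patterns
    # such as r"/page/\d{2,4}" legitimately contain commas:
    #
    #     export LILBEE_CRAWL_EXCLUDE_PATTERNS=$'/wp-admin/\n[?&]utm_'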

    @field_validator("crawl_exclude_patterns", mode="after")
    @classmethod
    def _validate_crawl_exclude_patterns(cls, v: list[str]) -> list[str]:
        """Reject any entry that isn't a valid Python regex.

        These patterns are compiled at crawl time. An invalid pattern there
        surfaces as an opaque mid-crawl error; catching it at PATCH time gives
        the user a 400 with a pointer to the bad entry.
        """
        import re

        bad: list[str] = []
        for i, pattern in enumerate(v):
            try:
                re.compile(pattern)
            except re.error as exc:
                bad.append(f"[{i}] {pattern!r}: {exc}")
        if bad:
            raise ValueError("invalid regex in crawl_exclude_patterns:\n " + "\n ".join(bad))
        return v

    @field_validator("ignore_dirs", mode="before")
    @classmethod
    def _merge_ignore_dirs(cls, v: Any) -> frozenset[str]:
        if isinstance(v, str):
            extra = frozenset(name.strip() for name in v.split(",") if name.strip())
            return DEFAULT_IGNORE_DIRS | extra
        if isinstance(v, (set, frozenset, list)):
            return DEFAULT_IGNORE_DIRS | frozenset(v)
        return DEFAULT_IGNORE_DIRS
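    # e.g. LILBEE_IGNORE_DIRS="build,dist" (hypothetical value) yields
    # DEFAULT_IGNORE_DIRS | {"build", "dist"}: union semantics, unlike the
    # replace semantics of concept_allowed_ent_types below.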

    @field_validator("concept_allowed_ent_types", mode="before")
    @classmethod
    def _parse_ent_types(cls, v: Any) -> frozenset[str]:
        """Replace-semantics override: a narrowed set is used as-is,
        not unioned with defaults. A user asking for ``PERSON,ORG``
        wants exactly those kinds. Accepts comma-separated strings
        from env and list / set / frozenset from code. Empty input
        falls back to :data:`DEFAULT_ALLOWED_NER_LABELS` so an empty
        env var does not silently disable the gate.
        """
        if isinstance(v, str):
            parts = frozenset(name.strip().upper() for name in v.split(",") if name.strip())
            return parts or DEFAULT_ALLOWED_NER_LABELS
        if isinstance(v, (set, frozenset, list)):
            parts = frozenset(str(x).upper() for x in v)
            return parts or DEFAULT_ALLOWED_NER_LABELS
        return DEFAULT_ALLOWED_NER_LABELS

    @model_validator(mode="before")
    @classmethod
    def _resolve_defaults(cls, data: Any) -> Any:
        from lilbee.core.system import canonical_models_dir, default_data_dir, find_local_root

        if not isinstance(data, dict):  # pragma: no cover
            return data

        if data.get("data_root") in (None, _UNSET_PATH):
            data_env = os.environ.get("LILBEE_DATA", "").strip()
            if data_env:
                data["data_root"] = Path(data_env)
            else:
                local = find_local_root()
                data["data_root"] = local if local is not None else default_data_dir()
        root = data["data_root"]
        if data.get("documents_dir") in (None, _UNSET_PATH):
            data["documents_dir"] = root / "documents"
        if data.get("data_dir") in (None, _UNSET_PATH):
            data["data_dir"] = root / "data"
        if data.get("lancedb_dir") in (None, _UNSET_PATH):
            data["lancedb_dir"] = root / "data" / "lancedb"
        if data.get("models_dir") in (None, _UNSET_PATH):
            data["models_dir"] = canonical_models_dir()

        return data
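    # Resolution sketch (hypothetical root): with LILBEE_DATA=/srv/lilbee
    # and no per-field overrides, this validator yields
    # data_root=/srv/lilbee, documents_dir=/srv/lilbee/documents,
    # data_dir=/srv/lilbee/data, lancedb_dir=/srv/lilbee/data/lancedb,
    # and models_dir from canonical_models_dir().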

    @classmethod
    def settings_customise_sources(
        cls,
        settings_cls: type[BaseSettings],
        init_settings: Any,
        env_settings: Any,
        dotenv_settings: Any,
        file_secret_settings: Any,
    ) -> tuple[Any, ...]:
        from lilbee.core.system import default_data_dir, find_local_root

        data_env = os.environ.get("LILBEE_DATA", "")
        if data_env:
            toml_dir = Path(data_env)
        else:
            local = find_local_root()
            toml_dir = local if local else default_data_dir()
        toml_path = toml_dir / "config.toml"

        plain_env = _PlainEnvSource(settings_cls, env_prefix="LILBEE_", env_ignore_empty=True)
        sources: list[Any] = [init_settings, plain_env]
        if toml_path.exists() and os.environ.get("LILBEE_SKIP_TOML_CONFIG") != "1":
            sources.append(_TomlSource(settings_cls, toml_path))
        return tuple(sources)
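    # Resulting precedence, highest first: init kwargs, then LILBEE_* env
    # vars, then config.toml (pydantic-settings consults sources in order).
    # E.g. Config(top_k=4) beats LILBEE_TOP_K=12, which beats top_k = 8 in
    # config.toml (hypothetical values).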

    @property
    def model_defaults(self) -> Any:
        """Per-model generation defaults (read-only). Set via apply_model_defaults()."""
        return self._model_defaults

    def apply_model_defaults(self, defaults: Any) -> None:
        """Store per-model generation defaults for 3-layer merge."""
        object.__setattr__(self, "_model_defaults", defaults)

    def clear_model_defaults(self) -> None:
        """Reset per-model defaults to None."""
        object.__setattr__(self, "_model_defaults", None)

    def generation_options(self, **overrides: Any) -> dict[str, Any]:
        """Merge model defaults, user config, and per-call overrides, dropping None."""
        result = _model_defaults_dict(self._model_defaults)
        user_fields: dict[str, Any] = {
            "temperature": self.temperature,
            "top_p": self.top_p,
            "top_k": self.top_k_sampling,
            "repeat_penalty": self.repeat_penalty,
            "num_ctx": self.num_ctx,
            "seed": self.seed,
            "max_tokens": self.max_tokens,
        }
        for k, v in user_fields.items():
            if v is not None:
                result[k] = v
        for k, v in overrides.items():
            if v is not None:
                result[k] = v
        return result
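    # Merge sketch (hypothetical values): with per-model defaults of
    # temperature=0.7 and a user config temperature of 0.1:
    #
    #     cfg.generation_options()                  # {"temperature": 0.1, ...}
    #     cfg.generation_options(temperature=0.9)   # per-call override wins
    #     cfg.generation_options(temperature=None)  # None never clobbers: 0.1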


def _model_defaults_dict(defaults: Any) -> dict[str, Any]:
    """Non-None fields of a ModelDefaults instance as a dict."""
    if defaults is None:
        return {}
    from dataclasses import fields as dc_fields

    return {
        f.name: getattr(defaults, f.name)
        for f in dc_fields(defaults)
        if getattr(defaults, f.name) is not None
    }


class _PlainEnvSource:
    """Reads LILBEE_* env vars as plain strings so field validators handle parsing."""

    def __init__(
        self,
        settings_cls: type[BaseSettings],
        env_prefix: str,
        env_ignore_empty: bool = True,
    ) -> None:
        self._prefix = env_prefix
        self._ignore_empty = env_ignore_empty
        self._fields = set(settings_cls.model_fields)

    def __call__(self) -> dict[str, Any]:
        result: dict[str, Any] = {}
        for field_name in self._fields:
            env_key = f"{self._prefix}{field_name.upper()}"
            raw = os.environ.get(env_key)
            if raw is None:
                continue
            if self._ignore_empty and raw == "":
                continue
            result[field_name] = raw
        return result
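    # e.g. LILBEE_TOP_K=12 (hypothetical) yields {"top_k": "12"}; the raw
    # string is handed to pydantic, so the field's ge=1 bound and type
    # coercion still apply.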


class _TomlSource:
    """Custom pydantic-settings source that reads config.toml."""

    def __init__(self, settings_cls: type[BaseSettings], path: Path) -> None:
        self._path = path

    def __call__(self) -> dict[str, Any]:
        import tomllib

        try:
            with self._path.open("rb") as f:
                data = tomllib.load(f)
        except (ValueError, OSError):
            log.warning("Failed to read %s, ignoring", self._path)
            return {}
        # Empty strings represent "no persisted value" for nullable scalar
        # fields (legacy from set_setting writing "" for None). Pydantic
        # can't coerce "" to int|None, so dropping them here lets the field
        # default apply rather than crashing the whole Config load. Values
        # are passed through unconverted so TOML lists and JSON arrays reach
        # the field validators intact.
        return {k: v for k, v in data.items() if v != ""}


def _build_cfg() -> tuple[Config, Exception | None]:
    """Build cfg; on stale-config validation failure, fall back to defaults.

    A persisted ``config.toml`` from before a breaking schema change can
    contain values the new validators reject. Crashing at module import
    means every command (``lilbee --help`` included) emits a Python
    traceback. Falling back to env+defaults lets the package load; the
    CLI / TUI surfaces the original error before doing real work.
    """
    try:
        return Config(), None
    except Exception as exc:
        os.environ["LILBEE_SKIP_TOML_CONFIG"] = "1"
        try:
            return Config(), exc
        finally:
            os.environ.pop("LILBEE_SKIP_TOML_CONFIG", None)


cfg, config_load_error = _build_cfg()

# Canonicalize LILBEE_DATA at the cfg.data_root resolution boundary so
# spawn-context worker subprocesses inherit the same data root.
# ``setdefault`` preserves a user-set value.
os.environ.setdefault("LILBEE_DATA", str(cfg.data_root))
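
# Consumer sketch (hypothetical call site): a CLI entry point can surface
# the swallowed stale-config error before doing real work, as the
# _build_cfg docstring describes:
#
#     from lilbee.core.config import cfg, config_load_error
#
#     if config_load_error is not None:
#         log.warning("config.toml ignored (stale schema?): %s", config_load_error)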