Coverage for src / lilbee / core / config / defaults.py: 100%
27 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-06-28 01:01 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-06-28 01:01 +0000
"""Default values and constants for :mod:`lilbee.config`.

Holds frozen literal data: directory ignore lists, NER label allow-list,
LanceDB table names, the crawl URL exclusion patterns (grouped per
category), the default system prompts, and the CORS origin regex.
"""
# Directory names in the default ignore list. Stored as a frozenset so the
# default cannot be mutated in place and membership tests are O(1).
DEFAULT_IGNORE_DIRS: frozenset[str] = frozenset(
    {
        "node_modules",
        "__pycache__",
        "venv",
        "build",
        "dist",
        "target",
        "vendor",
        "_build",
        "coverage",
        "htmlcov",
    }
)
# spaCy NER labels that map onto something wiki-shaped. Excludes
# QUANTITY / ORDINAL / CARDINAL / DATE / TIME / MONEY / PERCENT /
# LANGUAGE / LAW because pages for "42" or "2021" are never useful.
# FAC (buildings / airports) and NORP (nationalities / political /
# religious groups) are included because corpora routinely surface
# them as wiki-worthy topics.
DEFAULT_ALLOWED_NER_LABELS: frozenset[str] = frozenset(
    {
        "PERSON",
        "ORG",
        "GPE",
        "LOC",
        "EVENT",
        "WORK_OF_ART",
        "PRODUCT",
        "FAC",
        "NORP",
    }
)
# Timeout for backend catalog / management HTTP calls.
# NOTE(review): unit is presumably seconds -- confirm against the HTTP client.
DEFAULT_HTTP_TIMEOUT: float = 30.0
# Safe default + cap for chat-mode n_ctx; full 128K+ training contexts OOM laptops.
DEFAULT_NUM_CTX: int = 8192
# Table names (the module docstring describes these as LanceDB table names).
CHUNKS_TABLE: str = "chunks"
SOURCES_TABLE: str = "_sources"
CITATIONS_TABLE: str = "_citations"
MEMORIES_TABLE: str = "_memories"
META_TABLE: str = "_meta"
PAGE_TEXTS_TABLE: str = "_page_texts"
CONCEPT_NODES_TABLE: str = "concept_nodes"
CONCEPT_EDGES_TABLE: str = "concept_edges"
CHUNK_CONCEPTS_TABLE: str = "chunk_concepts"
# Default URL-exclusion regexes for recursive crawls. Grouped by source
# CMS / category. User overrides come from LILBEE_CRAWL_EXCLUDE_PATTERNS
# (newline-separated) or config.toml.
# WordPress scaffolding: admin UIs, APIs, RPC, numeric permalinks, Elementor.
# Entries are unanchored regex fragments, except where a pattern carries its
# own `$`.
_WP_EXCLUDE: tuple[str, ...] = (
    # Admin, login and machine endpoints.
    r"/wp-admin/",
    r"/wp-login(\.php)?",
    r"/wp-json/",
    r"/xmlrpc\.php",
    r"/wp-cron\.php",
    # Core assets and uploaded media.
    r"/wp-includes/",
    r"/wp-content/uploads/",
    # Numeric query-string permalinks.
    r"\?p=\d+",
    r"\?page_id=\d+",
    r"\?cat=\d+",
    # Elementor builder artefacts.
    r"/elementor-\d+",
    r"\?elementor_library",
)
# Pagination and archive permalinks (WP + other CMSes share this shape).
_ARCHIVE_EXCLUDE: tuple[str, ...] = (
    # Paged listings, as a path segment or a query parameter.
    r"/page/\d+/?$",
    r"\?paged?=\d+",
    # Date archives: /YYYY, /YYYY/MM or /YYYY/MM/DD at the end of the URL
    # (years 2000-2099 only).
    r"/20\d{2}(/\d{2}(/\d{2})?)?/?$",
    # Taxonomy, author and archive index pages.
    r"/tag/",
    r"/category/",
    r"/author/",
    r"/archives?/?$",
    r"/comment-page-\d+",
)
# Syndication feeds (content-duplicated in HTML pages).
# Every pattern is end-anchored, so only URLs that terminate in the feed
# segment (with or without a trailing slash) match.
_FEED_EXCLUDE: tuple[str, ...] = (
    r"/feed/?$",
    r"/feed/atom/?$",
    r"/feed/rdf/?$",
    r"/comments/feed/?$",
    r"/rss/?$",
)
# Duplicate views of the same canonical page (AMP, print, preview).
_DUPLICATE_VIEW_EXCLUDE: tuple[str, ...] = (
    r"/amp/?$",
    r"\?amp=",
    r"\?print=",
    r"/print/?$",
    r"\?preview=",
)
# WP attachment URLs (point at media, not content pages).
_ATTACHMENT_EXCLUDE: tuple[str, ...] = (
    r"/attachment/",
    r"\?attachment_id=",
)
# Auth and account flows (generic across CMSes and e-commerce platforms).
# Patterns are unanchored prefixes: e.g. r"/account" also matches
# "/accounting" and r"/login" also matches "/login-help".
_AUTH_EXCLUDE: tuple[str, ...] = (
    r"/login",
    r"/logout",
    r"/register",
    r"/signup",
    r"/signin",
    r"/account",
    r"/my-account/",
    r"/profile",
    r"/password-reset",
    r"/forgot-password",
)
# E-commerce transactional flows (cart / checkout / compare / etc.).
_ECOMMERCE_EXCLUDE: tuple[str, ...] = (
    r"/cart",
    r"/checkout",
    r"/wishlist",
    r"/orders?",
    r"/compare",
    # Product JSON endpoint and paginated product pages nested under a
    # collection path.
    r"/products\.json",
    r"/collections/.+/products/.+\?page=",
)
# Marketing / tracking query parameters (utm_*, fbclid, gclid, etc.).
# A single alternation: the parameter name must start right after `?` or `&`
# and be followed by `=`, so `ref=` matches but `pref=` does not. The
# adjacent raw-string fragments are joined into one regex by implicit string
# concatenation.
_TRACKING_EXCLUDE: tuple[str, ...] = (
    (
        r"[?&]("
        r"utm_[a-z_]+"
        r"|fbclid|gclid|msclkid|yclid"
        r"|mc_cid|mc_eid"
        r"|_hsenc|_hsmi|hsCtaTracking"
        r"|mkt_tok|mkt_[a-z_]+"
        r"|trk|trkInfo"
        r"|dm_i"
        r"|vero_id|vero_conv"
        r"|oly_anon_id|oly_enc_id"
        r"|igshid"
        r"|pk_campaign|pk_source|pk_medium|pk_[a-z_]+"
        r"|_ga"
        r"|ref|referrer"
        r"|affiliate|aff_id|aff_ref|aff|partner"
        r"|srsltid"
        r"|share|replytocom"
        r")="
    ),
)
# Site-meta URLs and non-HTML resources; skipped before fetch.
_META_EXCLUDE: tuple[str, ...] = (
    r"/sitemap[^/]*\.xml",
    r"/robots\.txt",
    r"/humans\.txt",
    r"/favicon\.ico",
    r"/\.well-known/",
    # File-extension filter: the extension must end the URL, optionally
    # followed by a query string.
    r"\.(jpe?g|png|gif|webp|avif|svg|ico|pdf|docx?|xlsx?|pptx?|zip|tar|gz|mp3|mp4|webm|ogg|ttf|woff2?|css|js|map|json|xml)(\?.*)?$",
)
# Mediawiki/Wikipedia navlinks that dominate BFS before the article body.
_MEDIAWIKI_EXCLUDE: tuple[str, ...] = (
    r"/wiki/Main_Page$",
    # Non-article namespaces.
    r"/wiki/Wikipedia:",
    r"/wiki/Portal:",
    r"/wiki/Help:",
    r"/wiki/Special:",
    r"/wiki/Category:",
    r"/wiki/Template:",
    r"/wiki/Template_talk:",
    r"/wiki/Talk:",
    r"/wiki/File:",
    r"/wiki/File_talk:",
    r"/wiki/User:",
    r"/wiki/User_talk:",
    # Script entry point URLs.
    r"/w/index\.php",
)
# Flat default exclusion list: each per-category tuple unpacked in order
# into one tuple of regex strings.
DEFAULT_CRAWL_EXCLUDE_PATTERNS: tuple[str, ...] = (
    *_WP_EXCLUDE,
    *_ARCHIVE_EXCLUDE,
    *_FEED_EXCLUDE,
    *_DUPLICATE_VIEW_EXCLUDE,
    *_ATTACHMENT_EXCLUDE,
    *_AUTH_EXCLUDE,
    *_ECOMMERCE_EXCLUDE,
    *_TRACKING_EXCLUDE,
    *_META_EXCLUDE,
    *_MEDIAWIKI_EXCLUDE,
)
# Default system prompt for retrieval-grounded answers: restricts the model
# to the supplied context and asks for numbered inline citations.
DEFAULT_RAG_SYSTEM_PROMPT: str = (
    "You are a precise, direct assistant grounded in the provided context. "
    "Answer using only the context: if it doesn't contain enough information, "
    "say so rather than guessing. Be specific: quote relevant passages and "
    "reference context by number (e.g. [1], [2]) inline. Prefer exact values "
    "over approximations. For code, prefer working examples over abstract "
    "explanations. Keep responses concise unless asked to elaborate."
)
# Default system prompt for answers drawn from general knowledge rather than
# supplied context.
DEFAULT_GENERAL_SYSTEM_PROMPT: str = (
    "You are a helpful, direct assistant. Answer the user's question from "
    "general knowledge. Keep responses concise unless asked to elaborate. "
    "For code, prefer working examples over abstract explanations."
)
# CORS allow-origin regex: Obsidian (desktop + iOS) and localhost loopback.
# Mutating endpoints still require auth regardless of origin.
# Anchored with ^...$ around the whole alternation so a permitted origin
# cannot be a mere prefix or suffix of a longer host; the port is optional
# on the three loopback forms.
DEFAULT_CORS_ORIGIN_REGEX: str = (
    r"^(app://obsidian\.md"
    r"|capacitor://localhost"
    r"|https?://localhost(:\d+)?"
    r"|https?://127\.0\.0\.1(:\d+)?"
    r"|https?://\[::1\](:\d+)?)$"
)