Coverage for src / lilbee / core / config / defaults.py: 100%
25 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-05-15 20:55 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-05-15 20:55 +0000
1"""Default values and constants for :mod:`lilbee.config`.
Holds frozen literal data: directory ignore lists, NER label allow-list,
LanceDB table names, the crawl URL exclusion patterns (grouped per
category), the default system prompts, and the CORS allow-origin regex.
6"""
# Default directory ignore list. Held in a frozenset so the collection
# cannot be mutated after import; entries are listed alphabetically.
DEFAULT_IGNORE_DIRS = frozenset(
    (
        "__pycache__",
        "_build",
        "build",
        "coverage",
        "dist",
        "htmlcov",
        "node_modules",
        "target",
        "vendor",
        "venv",
    )
)
# Allow-list of spaCy NER labels that correspond to something wiki-shaped.
# Left out on purpose: QUANTITY, ORDINAL, CARDINAL, DATE, TIME, MONEY,
# PERCENT, LANGUAGE and LAW -- a page for "42" or "2021" is never useful.
# FAC (buildings / airports) and NORP (nationalities / political /
# religious groups) stay in because corpora routinely surface them as
# wiki-worthy topics.
DEFAULT_ALLOWED_NER_LABELS = frozenset(
    (
        "PERSON",
        "ORG",
        "GPE",
        "LOC",
        "EVENT",
        "WORK_OF_ART",
        "PRODUCT",
        "FAC",
        "NORP",
    )
)
# Timeout applied to backend catalog / management HTTP calls.
DEFAULT_HTTP_TIMEOUT: float = 30.0
# Default and upper bound for the chat-mode n_ctx. Kept small on purpose:
# full 128K+ training contexts OOM laptops.
DEFAULT_NUM_CTX: int = 8 * 1024
# LanceDB table names.
CHUNKS_TABLE: str = "chunks"
SOURCES_TABLE: str = "_sources"
CITATIONS_TABLE: str = "_citations"
META_TABLE: str = "_meta"
CONCEPT_NODES_TABLE: str = "concept_nodes"
CONCEPT_EDGES_TABLE: str = "concept_edges"
CHUNK_CONCEPTS_TABLE: str = "chunk_concepts"
47# Default URL-exclusion regexes for recursive crawls. Grouped by source
48# CMS / category. User overrides come from LILBEE_CRAWL_EXCLUDE_PATTERNS
49# (newline-separated) or config.toml.
# WordPress scaffolding: admin UIs, APIs, RPC, numeric permalinks, Elementor.
_WP_EXCLUDE: tuple[str, ...] = (
    # Admin and login screens.
    r"/wp-admin/",
    r"/wp-login(\.php)?",
    # JSON API, XML-RPC and cron endpoints.
    r"/wp-json/",
    r"/xmlrpc\.php",
    r"/wp-cron\.php",
    # Core include and upload directories.
    r"/wp-includes/",
    r"/wp-content/uploads/",
    # Numeric query-string permalinks.
    r"\?p=\d+",
    r"\?page_id=\d+",
    r"\?cat=\d+",
    # Elementor URLs.
    r"/elementor-\d+",
    r"\?elementor_library",
)
# Pagination and archive permalinks; WordPress and other CMSes share this shape.
_ARCHIVE_EXCLUDE: tuple[str, ...] = (
    # Paginated listings.
    r"/page/\d+/?$",
    r"\?paged?=\d+",
    # Date archives at the end of the URL: /20YY, /20YY/MM or /20YY/MM/DD.
    r"/20\d{2}(/\d{2}(/\d{2})?)?/?$",
    # Tag, category, author and archive index pages.
    r"/tag/",
    r"/category/",
    r"/author/",
    r"/archives?/?$",
    # Paginated comment pages.
    r"/comment-page-\d+",
)
# Syndication feeds; their content duplicates what the HTML pages carry.
_FEED_EXCLUDE: tuple[str, ...] = (
    # Feed URLs, each anchored to the end of the URL.
    r"/feed/?$",
    r"/feed/atom/?$",
    r"/feed/rdf/?$",
    r"/comments/feed/?$",
    r"/rss/?$",
)
# Alternate renderings of one canonical page: AMP, print and preview views.
_DUPLICATE_VIEW_EXCLUDE: tuple[str, ...] = (
    # AMP, as a path suffix or as a query parameter.
    r"/amp/?$",
    r"\?amp=",
    # Print views.
    r"\?print=",
    r"/print/?$",
    # Previews.
    r"\?preview=",
)
# WordPress attachment URLs; they point at media rather than content pages.
_ATTACHMENT_EXCLUDE: tuple[str, ...] = (
    r"/attachment/",  # path form
    r"\?attachment_id=",  # query-string form
)
# Authentication and account flows, generic across CMSes and e-commerce
# platforms. The patterns are unanchored substrings.
_AUTH_EXCLUDE: tuple[str, ...] = (
    # Signing in and out.
    r"/login",
    r"/logout",
    r"/signin",
    # Creating an account.
    r"/register",
    r"/signup",
    # Account and profile pages.
    r"/account",
    r"/my-account/",
    r"/profile",
    # Password recovery.
    r"/password-reset",
    r"/forgot-password",
)
# Transactional e-commerce flows: cart, checkout, wishlist, orders, compare.
_ECOMMERCE_EXCLUDE: tuple[str, ...] = (
    # Shopping and ordering pages.
    r"/cart",
    r"/checkout",
    r"/wishlist",
    r"/orders?",
    r"/compare",
    # Product JSON endpoint and paginated product URLs under a collection.
    r"/products\.json",
    r"/collections/.+/products/.+\?page=",
)
# Marketing / tracking query parameters (utm_*, fbclid, gclid, etc.).
# A single pattern: "?" or "&", then one of the parameter names below,
# then "=". The names are joined into one regex alternation.
_TRACKING_EXCLUDE: tuple[str, ...] = (
    r"[?&]("
    + "|".join(
        (
            r"utm_[a-z_]+",
            "fbclid",
            "gclid",
            "msclkid",
            "yclid",
            "mc_cid",
            "mc_eid",
            "_hsenc",
            "_hsmi",
            "hsCtaTracking",
            "mkt_tok",
            r"mkt_[a-z_]+",
            "trk",
            "trkInfo",
            "dm_i",
            "vero_id",
            "vero_conv",
            "oly_anon_id",
            "oly_enc_id",
            "igshid",
            "pk_campaign",
            "pk_source",
            "pk_medium",
            r"pk_[a-z_]+",
            "_ga",
            "ref",
            "referrer",
            "affiliate",
            "aff_id",
            "aff_ref",
            "aff",
            "partner",
            "srsltid",
            "share",
            "replytocom",
        )
    )
    + r")=",
)
# Site-meta URLs and non-HTML resources; skipped before fetch.
_META_EXCLUDE: tuple[str, ...] = (
    # Well-known site-meta files and directories.
    r"/sitemap[^/]*\.xml",
    r"/robots\.txt",
    r"/humans\.txt",
    r"/favicon\.ico",
    r"/\.well-known/",
    # File extension at the end of the URL, optionally followed by a query
    # string. The extensions are joined into one regex alternation.
    r"\.("
    + "|".join(
        (
            "jpe?g",
            "png",
            "gif",
            "webp",
            "avif",
            "svg",
            "ico",
            "pdf",
            "docx?",
            "xlsx?",
            "pptx?",
            "zip",
            "tar",
            "gz",
            "mp3",
            "mp4",
            "webm",
            "ogg",
            "ttf",
            "woff2?",
            "css",
            "js",
            "map",
            "json",
            "xml",
        )
    )
    + r")(\?.*)?$",
)
# MediaWiki / Wikipedia navigation links that dominate BFS before the
# article body is reached.
_MEDIAWIKI_EXCLUDE: tuple[str, ...] = (
    r"/wiki/Main_Page$",
    # One "/wiki/<Namespace>:" prefix pattern per namespace listed here.
    *(
        f"/wiki/{namespace}:"
        for namespace in (
            "Wikipedia",
            "Portal",
            "Help",
            "Special",
            "Category",
            "Template",
            "Template_talk",
            "Talk",
            "File",
            "File_talk",
            "User",
            "User_talk",
        )
    ),
    r"/w/index\.php",
)
# Every per-category tuple concatenated, in declaration order, into the
# single default exclusion list.
DEFAULT_CRAWL_EXCLUDE_PATTERNS: tuple[str, ...] = (
    _WP_EXCLUDE
    + _ARCHIVE_EXCLUDE
    + _FEED_EXCLUDE
    + _DUPLICATE_VIEW_EXCLUDE
    + _ATTACHMENT_EXCLUDE
    + _AUTH_EXCLUDE
    + _ECOMMERCE_EXCLUDE
    + _TRACKING_EXCLUDE
    + _META_EXCLUDE
    + _MEDIAWIKI_EXCLUDE
)
# Default system prompt for answers grounded in the provided context.
# Wrapped at sentence / clause boundaries; the adjacent literals concatenate
# into one string with single spaces between sentences.
DEFAULT_RAG_SYSTEM_PROMPT = (
    "You are a precise, direct assistant grounded in the provided context. "
    "Answer using only the context: "
    "if it doesn't contain enough information, say so rather than guessing. "
    "Be specific: "
    "quote relevant passages and reference context by number (e.g. [1], [2]) inline. "
    "Prefer exact values over approximations. "
    "For code, prefer working examples over abstract explanations. "
    "Keep responses concise unless asked to elaborate."
)
# Default system prompt for answers from general knowledge.
# One sentence per literal; the adjacent literals concatenate into one
# string with single spaces between sentences.
DEFAULT_GENERAL_SYSTEM_PROMPT = (
    "You are a helpful, direct assistant. "
    "Answer the user's question from general knowledge. "
    "Keep responses concise unless asked to elaborate. "
    "For code, prefer working examples over abstract explanations."
)
# CORS allow-origin regex: Obsidian (desktop + iOS) and localhost loopback.
# Mutating endpoints still require auth regardless of origin.
# The accepted origins are joined into one alternation anchored with ^ and $.
DEFAULT_CORS_ORIGIN_REGEX = (
    r"^("
    + "|".join(
        (
            r"app://obsidian\.md",
            r"capacitor://localhost",
            r"https?://localhost(:\d+)?",  # optional port
            r"https?://127\.0\.0\.1(:\d+)?",  # optional port
            r"https?://\[::1\](:\d+)?",  # optional port
        )
    )
    + r")$"
)