Coverage for src / lilbee / core / config / defaults.py: 100%

27 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-06-28 01:01 +0000

1"""Default values and constants for :mod:`lilbee.config`. 

2 

3Holds frozen literal data: directory ignore lists, NER label allow-list, 

4LanceDB table names, the crawl URL exclusion patterns (grouped per 

5category), the default system prompts, and the CORS allow-origin regex. 

6""" 

7 

# Directory names ignored by default: dependency trees, virtualenvs,
# build / packaging output and coverage artefacts. Stored as a frozenset
# so the shared default cannot be mutated in place.
DEFAULT_IGNORE_DIRS = frozenset(
    {
        "node_modules",
        "__pycache__",
        "venv",
        "build",
        "dist",
        "target",
        "vendor",
        "_build",
        "coverage",
        "htmlcov",
    }
)

22 

# spaCy NER labels that map onto something wiki-shaped. Excludes
# QUANTITY / ORDINAL / CARDINAL / DATE / TIME / MONEY / PERCENT /
# LANGUAGE / LAW because pages for "42" or "2021" are never useful.
# FAC (buildings / airports) and NORP (nationalities / political /
# religious groups) are included because corpora routinely surface
# them as wiki-worthy topics.
DEFAULT_ALLOWED_NER_LABELS = frozenset(
    {
        "PERSON",
        "ORG",
        "GPE",
        "LOC",
        "EVENT",
        "WORK_OF_ART",
        "PRODUCT",
        "FAC",
        "NORP",
    }
)

32 

# Timeout for backend catalog / management HTTP calls.
# NOTE(review): unit is presumably seconds -- confirm against the HTTP client.
DEFAULT_HTTP_TIMEOUT = 30.0

35 

# Safe default + cap for chat-mode n_ctx; full 128K+ training contexts
# OOM laptops.
DEFAULT_NUM_CTX = 8192

38 

# LanceDB table names. Kept as individual module-level constants so
# callers can import exactly the name they need.
CHUNKS_TABLE = "chunks"
SOURCES_TABLE = "_sources"
CITATIONS_TABLE = "_citations"
MEMORIES_TABLE = "_memories"
META_TABLE = "_meta"
PAGE_TEXTS_TABLE = "_page_texts"
CONCEPT_NODES_TABLE = "concept_nodes"
CONCEPT_EDGES_TABLE = "concept_edges"
CHUNK_CONCEPTS_TABLE = "chunk_concepts"

48 

# Default URL-exclusion regexes for recursive crawls. Grouped by source
# CMS / category. User overrides come from LILBEE_CRAWL_EXCLUDE_PATTERNS
# (newline-separated) or config.toml.

# WordPress scaffolding: admin UIs, APIs, RPC, numeric permalinks, Elementor.
# NOTE(review): the patterns are unanchored URL fragments, so they are
# presumably applied with a search-style match -- confirm at the call site.
_WP_EXCLUDE: tuple[str, ...] = (
    r"/wp-admin/",
    r"/wp-login(\.php)?",
    r"/wp-json/",
    r"/xmlrpc\.php",
    r"/wp-cron\.php",
    r"/wp-includes/",
    r"/wp-content/uploads/",
    r"\?p=\d+",
    r"\?page_id=\d+",
    r"\?cat=\d+",
    r"/elementor-\d+",
    r"\?elementor_library",
)

68 

# Pagination and archive permalinks (WP + other CMSes share this shape).
# The date pattern matches year, year/month and year/month/day listings
# for 20xx only, and only at the end of the URL.
_ARCHIVE_EXCLUDE: tuple[str, ...] = (
    r"/page/\d+/?$",
    r"\?paged?=\d+",
    r"/20\d{2}(/\d{2}(/\d{2})?)?/?$",
    r"/tag/",
    r"/category/",
    r"/author/",
    r"/archives?/?$",
    r"/comment-page-\d+",
)

80 

# Syndication feeds (content-duplicated in HTML pages). Every pattern is
# end-anchored, so a path that merely starts with "/feed" (e.g.
# "/feedback") is not excluded.
_FEED_EXCLUDE: tuple[str, ...] = (
    r"/feed/?$",
    r"/feed/atom/?$",
    r"/feed/rdf/?$",
    r"/comments/feed/?$",
    r"/rss/?$",
)

89 

# Duplicate views of the same canonical page (AMP, print, preview).
# The query-string forms only match when the parameter comes first
# (directly after "?").
_DUPLICATE_VIEW_EXCLUDE: tuple[str, ...] = (
    r"/amp/?$",
    r"\?amp=",
    r"\?print=",
    r"/print/?$",
    r"\?preview=",
)

98 

# WP attachment URLs (point at media, not content pages).
_ATTACHMENT_EXCLUDE: tuple[str, ...] = (
    r"/attachment/",
    r"\?attachment_id=",
)

104 

# Auth and account flows (generic across CMSes and e-commerce platforms).
# NOTE(review): most entries are unanchored prefixes, so under a
# search-style match "/login" also excludes e.g. "/login-help" and
# "/account" excludes "/accounting" -- confirm this breadth is intended.
_AUTH_EXCLUDE: tuple[str, ...] = (
    r"/login",
    r"/logout",
    r"/register",
    r"/signup",
    r"/signin",
    r"/account",
    r"/my-account/",
    r"/profile",
    r"/password-reset",
    r"/forgot-password",
)

118 

# E-commerce transactional flows (cart / checkout / compare / etc.).
# NOTE(review): entries are unanchored, so under a search-style match
# "/cart" also hits "/cartoons" and "/orders?" hits "/order-of-play" --
# confirm this breadth is intended.
_ECOMMERCE_EXCLUDE: tuple[str, ...] = (
    r"/cart",
    r"/checkout",
    r"/wishlist",
    r"/orders?",
    r"/compare",
    r"/products\.json",
    r"/collections/.+/products/.+\?page=",
)

129 

# Marketing / tracking query parameters (utm_*, fbclid, gclid, etc.).
# A single alternation regex: the parameter name must directly follow
# "?" or "&" and be followed by "=", so a name that merely contains one of
# these tokens (e.g. "preference") does not match.
_TRACKING_EXCLUDE: tuple[str, ...] = (
    (
        r"[?&]("
        r"utm_[a-z_]+"
        r"|fbclid|gclid|msclkid|yclid"
        r"|mc_cid|mc_eid"
        r"|_hsenc|_hsmi|hsCtaTracking"
        r"|mkt_tok|mkt_[a-z_]+"
        r"|trk|trkInfo"
        r"|dm_i"
        r"|vero_id|vero_conv"
        r"|oly_anon_id|oly_enc_id"
        r"|igshid"
        r"|pk_campaign|pk_source|pk_medium|pk_[a-z_]+"
        r"|_ga"
        r"|ref|referrer"
        r"|affiliate|aff_id|aff_ref|aff|partner"
        r"|srsltid"
        r"|share|replytocom"
        r")="
    ),
)

153 

# Site-meta URLs and non-HTML resources; skipped before fetch.
# The last pattern matches on file extension at the end of the URL,
# optionally followed by a query string.
_META_EXCLUDE: tuple[str, ...] = (
    r"/sitemap[^/]*\.xml",
    r"/robots\.txt",
    r"/humans\.txt",
    r"/favicon\.ico",
    r"/\.well-known/",
    r"\.(jpe?g|png|gif|webp|avif|svg|ico|pdf|docx?|xlsx?|pptx?|zip|tar|gz|mp3|mp4|webm|ogg|ttf|woff2?|css|js|map|json|xml)(\?.*)?$",
)

163 

# Mediawiki/Wikipedia navlinks that dominate BFS before the article body:
# the main page, non-article namespaces and the index.php entry point.
_MEDIAWIKI_EXCLUDE: tuple[str, ...] = (
    r"/wiki/Main_Page$",
    r"/wiki/Wikipedia:",
    r"/wiki/Portal:",
    r"/wiki/Help:",
    r"/wiki/Special:",
    r"/wiki/Category:",
    r"/wiki/Template:",
    r"/wiki/Template_talk:",
    r"/wiki/Talk:",
    r"/wiki/File:",
    r"/wiki/File_talk:",
    r"/wiki/User:",
    r"/wiki/User_talk:",
    r"/w/index\.php",
)

181 

# Flat default exclusion list: every per-category tuple unpacked in a
# fixed order into one immutable tuple of regex strings.
DEFAULT_CRAWL_EXCLUDE_PATTERNS: tuple[str, ...] = (
    *_WP_EXCLUDE,
    *_ARCHIVE_EXCLUDE,
    *_FEED_EXCLUDE,
    *_DUPLICATE_VIEW_EXCLUDE,
    *_ATTACHMENT_EXCLUDE,
    *_AUTH_EXCLUDE,
    *_ECOMMERCE_EXCLUDE,
    *_TRACKING_EXCLUDE,
    *_META_EXCLUDE,
    *_MEDIAWIKI_EXCLUDE,
)

194 

195 

# Default system prompt for context-grounded (RAG) answers: restricts the
# model to the supplied context and asks for numbered inline citations.
# Built by implicit string concatenation; each fragment ends with a space
# so the sentences join cleanly.
DEFAULT_RAG_SYSTEM_PROMPT = (
    "You are a precise, direct assistant grounded in the provided context. "
    "Answer using only the context: if it doesn't contain enough information, "
    "say so rather than guessing. Be specific: quote relevant passages and "
    "reference context by number (e.g. [1], [2]) inline. Prefer exact values "
    "over approximations. For code, prefer working examples over abstract "
    "explanations. Keep responses concise unless asked to elaborate."
)

204 

# Default system prompt for answers from general knowledge, with no
# retrieved context to ground on.
DEFAULT_GENERAL_SYSTEM_PROMPT = (
    "You are a helpful, direct assistant. Answer the user's question from "
    "general knowledge. Keep responses concise unless asked to elaborate. "
    "For code, prefer working examples over abstract explanations."
)

210 

# CORS allow-origin regex: Obsidian (desktop + iOS) and localhost loopback.
# Mutating endpoints still require auth regardless of origin.
# The alternation is wrapped in ^( ... )$ so an origin must match in full;
# a host such as "localhost.evil.com" is rejected. Loopback origins may
# carry an optional port.
DEFAULT_CORS_ORIGIN_REGEX = (
    r"^(app://obsidian\.md"
    r"|capacitor://localhost"
    r"|https?://localhost(:\d+)?"
    r"|https?://127\.0\.0\.1(:\d+)?"
    r"|https?://\[::1\](:\d+)?)$"
)

219)