Coverage for src / lilbee / core / config / defaults.py: 100%

25 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-05-15 20:55 +0000

1"""Default values and constants for :mod:`lilbee.config`. 

2 

3Holds frozen literal data: directory ignore lists, NER label allow-list, 

4LanceDB table names, the crawl URL exclusion patterns (grouped per 

5category), and the default system / CORS prompts. 

6""" 

7 

# Directory names on the default ignore list. Stored as a frozenset, so the
# grouping below is for readers only; ordering carries no meaning.
DEFAULT_IGNORE_DIRS = frozenset(
    (
        # Third-party dependencies and virtual environments.
        "node_modules",
        "vendor",
        "venv",
        # Bytecode cache.
        "__pycache__",
        # Build output.
        "build",
        "_build",
        "dist",
        "target",
        # Coverage reports.
        "coverage",
        "htmlcov",
    )
)

22 

# Allow-list of spaCy NER labels that map onto something wiki-shaped.
#
# Left out on purpose: QUANTITY / ORDINAL / CARDINAL / DATE / TIME / MONEY /
# PERCENT / LANGUAGE / LAW, because pages for "42" or "2021" are never useful.
# FAC and NORP are kept because corpora routinely surface them as
# wiki-worthy topics.
DEFAULT_ALLOWED_NER_LABELS = frozenset(
    (
        "PERSON",
        "ORG",
        "GPE",
        "LOC",
        "EVENT",
        "WORK_OF_ART",
        "PRODUCT",
        "FAC",  # buildings / airports
        "NORP",  # nationalities / political / religious groups
    )
)

32 

# Timeout for backend catalog / management HTTP calls.
# NOTE(review): the unit is not stated here -- presumably seconds; confirm
# against the HTTP client that consumes this value.
DEFAULT_HTTP_TIMEOUT: float = 30.0

# Safe default + cap for chat-mode n_ctx; full 128K+ training contexts OOM laptops.
# The one number serves as both the default and the upper bound.
DEFAULT_NUM_CTX: int = 8192

38 

# LanceDB table names. Kept as module constants so each table name is
# spelled in exactly one place.
CHUNKS_TABLE: str = "chunks"
SOURCES_TABLE: str = "_sources"
CITATIONS_TABLE: str = "_citations"
META_TABLE: str = "_meta"
CONCEPT_NODES_TABLE: str = "concept_nodes"
CONCEPT_EDGES_TABLE: str = "concept_edges"
CHUNK_CONCEPTS_TABLE: str = "chunk_concepts"

46 

# Default URL-exclusion regexes for recursive crawls, grouped by source
# CMS / category. User overrides come from LILBEE_CRAWL_EXCLUDE_PATTERNS
# (newline-separated) or config.toml.

50 

51# WordPress scaffolding: admin UIs, APIs, RPC, numeric permalinks, Elementor. 

52_WP_EXCLUDE: tuple[str, ...] = ( 

53 r"/wp-admin/", 

54 r"/wp-login(\.php)?", 

55 r"/wp-json/", 

56 r"/xmlrpc\.php", 

57 r"/wp-cron\.php", 

58 r"/wp-includes/", 

59 r"/wp-content/uploads/", 

60 r"\?p=\d+", 

61 r"\?page_id=\d+", 

62 r"\?cat=\d+", 

63 r"/elementor-\d+", 

64 r"\?elementor_library", 

65) 

66 

67# Pagination and archive permalinks (WP + other CMSes share this shape). 

68_ARCHIVE_EXCLUDE: tuple[str, ...] = ( 

69 r"/page/\d+/?$", 

70 r"\?paged?=\d+", 

71 r"/20\d{2}(/\d{2}(/\d{2})?)?/?$", 

72 r"/tag/", 

73 r"/category/", 

74 r"/author/", 

75 r"/archives?/?$", 

76 r"/comment-page-\d+", 

77) 

78 

79# Syndication feeds (content-duplicated in HTML pages). 

80_FEED_EXCLUDE: tuple[str, ...] = ( 

81 r"/feed/?$", 

82 r"/feed/atom/?$", 

83 r"/feed/rdf/?$", 

84 r"/comments/feed/?$", 

85 r"/rss/?$", 

86) 

87 

88# Duplicate views of the same canonical page (AMP, print, preview). 

89_DUPLICATE_VIEW_EXCLUDE: tuple[str, ...] = ( 

90 r"/amp/?$", 

91 r"\?amp=", 

92 r"\?print=", 

93 r"/print/?$", 

94 r"\?preview=", 

95) 

96 

97# WP attachment URLs (point at media, not content pages). 

98_ATTACHMENT_EXCLUDE: tuple[str, ...] = ( 

99 r"/attachment/", 

100 r"\?attachment_id=", 

101) 

102 

103# Auth and account flows (generic across CMSes and e-commerce platforms). 

104_AUTH_EXCLUDE: tuple[str, ...] = ( 

105 r"/login", 

106 r"/logout", 

107 r"/register", 

108 r"/signup", 

109 r"/signin", 

110 r"/account", 

111 r"/my-account/", 

112 r"/profile", 

113 r"/password-reset", 

114 r"/forgot-password", 

115) 

116 

117# E-commerce transactional flows (cart / checkout / compare / etc.). 

118_ECOMMERCE_EXCLUDE: tuple[str, ...] = ( 

119 r"/cart", 

120 r"/checkout", 

121 r"/wishlist", 

122 r"/orders?", 

123 r"/compare", 

124 r"/products\.json", 

125 r"/collections/.+/products/.+\?page=", 

126) 

127 

128# Marketing / tracking query parameters (utm_*, fbclid, gclid, etc.). 

129_TRACKING_EXCLUDE: tuple[str, ...] = ( 

130 ( 

131 r"[?&](" 

132 r"utm_[a-z_]+" 

133 r"|fbclid|gclid|msclkid|yclid" 

134 r"|mc_cid|mc_eid" 

135 r"|_hsenc|_hsmi|hsCtaTracking" 

136 r"|mkt_tok|mkt_[a-z_]+" 

137 r"|trk|trkInfo" 

138 r"|dm_i" 

139 r"|vero_id|vero_conv" 

140 r"|oly_anon_id|oly_enc_id" 

141 r"|igshid" 

142 r"|pk_campaign|pk_source|pk_medium|pk_[a-z_]+" 

143 r"|_ga" 

144 r"|ref|referrer" 

145 r"|affiliate|aff_id|aff_ref|aff|partner" 

146 r"|srsltid" 

147 r"|share|replytocom" 

148 r")=" 

149 ), 

150) 

151 

152# Site-meta URLs and non-HTML resources; skipped before fetch. 

153_META_EXCLUDE: tuple[str, ...] = ( 

154 r"/sitemap[^/]*\.xml", 

155 r"/robots\.txt", 

156 r"/humans\.txt", 

157 r"/favicon\.ico", 

158 r"/\.well-known/", 

159 r"\.(jpe?g|png|gif|webp|avif|svg|ico|pdf|docx?|xlsx?|pptx?|zip|tar|gz|mp3|mp4|webm|ogg|ttf|woff2?|css|js|map|json|xml)(\?.*)?$", 

160) 

161 

162# Mediawiki/Wikipedia navlinks that dominate BFS before the article body. 

163_MEDIAWIKI_EXCLUDE: tuple[str, ...] = ( 

164 r"/wiki/Main_Page$", 

165 r"/wiki/Wikipedia:", 

166 r"/wiki/Portal:", 

167 r"/wiki/Help:", 

168 r"/wiki/Special:", 

169 r"/wiki/Category:", 

170 r"/wiki/Template:", 

171 r"/wiki/Template_talk:", 

172 r"/wiki/Talk:", 

173 r"/wiki/File:", 

174 r"/wiki/File_talk:", 

175 r"/wiki/User:", 

176 r"/wiki/User_talk:", 

177 r"/w/index\.php", 

178) 

179 

# Every per-category group concatenated into one flat tuple, in the order
# the groups are declared.
DEFAULT_CRAWL_EXCLUDE_PATTERNS: tuple[str, ...] = (
    _WP_EXCLUDE
    + _ARCHIVE_EXCLUDE
    + _FEED_EXCLUDE
    + _DUPLICATE_VIEW_EXCLUDE
    + _ATTACHMENT_EXCLUDE
    + _AUTH_EXCLUDE
    + _ECOMMERCE_EXCLUDE
    + _TRACKING_EXCLUDE
    + _META_EXCLUDE
    + _MEDIAWIKI_EXCLUDE
)

192 

193 

# Default system prompt for answers grounded in provided context. The
# literal is split at sentence boundaries; adjacent pieces concatenate into
# one string, so the trailing space inside each piece is significant.
DEFAULT_RAG_SYSTEM_PROMPT = (
    "You are a precise, direct assistant grounded in the provided context. "
    "Answer using only the context: "
    "if it doesn't contain enough information, say so rather than guessing. "
    "Be specific: "
    "quote relevant passages and reference context by number (e.g. [1], [2]) inline. "
    "Prefer exact values over approximations. "
    "For code, prefer working examples over abstract explanations. "
    "Keep responses concise unless asked to elaborate."
)

202 

# Default system prompt for answers from general knowledge. One sentence per
# literal; adjacent pieces concatenate into a single string.
DEFAULT_GENERAL_SYSTEM_PROMPT = (
    "You are a helpful, direct assistant. "
    "Answer the user's question from general knowledge. "
    "Keep responses concise unless asked to elaborate. "
    "For code, prefer working examples over abstract explanations."
)

208 

# CORS allow-origin regex: Obsidian (desktop + iOS) and localhost loopback.
# Mutating endpoints still require auth regardless of origin.
# The anchored alternation is assembled from one entry per accepted origin
# shape; the loopback entries accept http or https and an optional port.
DEFAULT_CORS_ORIGIN_REGEX = (
    "^("
    + "|".join(
        (
            r"app://obsidian\.md",
            r"capacitor://localhost",
            r"https?://localhost(:\d+)?",
            r"https?://127\.0\.0\.1(:\d+)?",
            r"https?://\[::1\](:\d+)?",
        )
    )
    + ")$"
)

217)