Coverage for src / lilbee / catalog / hf_client.py: 100%

115 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-05-15 20:55 +0000

1"""HuggingFace API client with TTL cache.""" 

2 

3from __future__ import annotations 

4 

5import logging 

6import os 

7import threading 

8import time 

9from http import HTTPStatus 

10 

11import httpx 

12from huggingface_hub import ModelInfo 

13from huggingface_hub.hf_api import RepoSibling 

14 

15from lilbee.catalog.models import CatalogModel, HfGgufMeta, HfPage 

16 

log = logging.getLogger(__name__)

# Substrings dropped from huggingface_hub's request / file-download loggers.
# These advisories aren't actionable in a local TUI: HF prints an
# unauthenticated-requests notice on every public pull, and the file_download
# logger re-warns on every retry the library schedules. The catalog surfaces
# the final download failure with a clear message, so per-attempt warnings
# are noise.
_HF_SUPPRESS_SUBSTRINGS: tuple[str, ...] = (
    "unauthenticated requests to the HF Hub",
    "Error while downloading from",
    "Trying to resume download",
)

# The huggingface_hub loggers the substring filter is attached to
# (see install_hf_log_filter below).
_HF_FILTERED_LOGGER_NAMES: tuple[str, ...] = (
    "huggingface_hub.utils._http",
    "huggingface_hub.file_download",
)

35 

36 

37class _HfSubstringFilter(logging.Filter): 

38 """Drop huggingface_hub log records whose message contains a suppressed substring.""" 

39 

40 def __init__(self, needles: tuple[str, ...]) -> None: 

41 super().__init__() 

42 self._needles = needles 

43 

44 def filter(self, record: logging.LogRecord) -> bool: 

45 return not any(n in record.getMessage() for n in self._needles) 

46 

47 

def install_hf_log_filter() -> None:
    """Attach the substring filter to huggingface_hub's chatty loggers.

    Called automatically when this module is imported (see the module-top
    invocation below) so the filter is in place before any catalog HTTP
    call can emit a warning. Exposed as a function so tests can re-apply.
    """
    # One shared filter instance is fine: it holds only an immutable tuple.
    shared_filter = _HfSubstringFilter(_HF_SUPPRESS_SUBSTRINGS)
    for logger_name in _HF_FILTERED_LOGGER_NAMES:
        noisy_logger = logging.getLogger(logger_name)
        noisy_logger.addFilter(shared_filter)

58 

59 

# Install the filter at module import. All HF HTTP traffic in lilbee
# routes through this module, so installing here always beats the first
# huggingface_hub warning to the punch.
install_hf_log_filter()

# Listing endpoint of the HF model catalog API.
HF_API_URL = "https://huggingface.co/api/models"

# Per-request timeout (seconds) for catalog HTTP calls.
DEFAULT_TIMEOUT: float = 30.0

# Fields requested from the HF listing API via ``?expand=``. Without this
# expand, the default response omits siblings, cardData, and gguf.
_HF_EXPAND_FIELDS: list[str] = ["gguf", "siblings", "downloads", "pipeline_tag", "cardData"]

# HF ``?search=`` is a single space-tokenized substring match on the model id.
# Multiple ``search=`` params are silently ignored, so the user's query is
# space-joined onto the GGUF filter into one param value.
_HF_GGUF_SEARCH_TERM = "GGUF"

# Shared sentinel page returned on any fetch failure.
_EMPTY_HF_PAGE = HfPage(models=[], has_more=False)

# GiB divisor used to convert API byte counts to display sizes.
_BYTES_PER_GB = 1024**3

81 

82 

83def hf_token() -> str | None: 

84 """Read HuggingFace token from env vars or huggingface_hub login cache.""" 

85 token = os.environ.get("LILBEE_HF_TOKEN") or os.environ.get("HF_TOKEN") or None 

86 if token: 

87 return token 

88 try: 

89 from huggingface_hub import get_token 

90 

91 return get_token() 

92 except Exception: 

93 return None 

94 

95 

def hf_headers() -> dict[str, str]:
    """Build HTTP headers for HuggingFace API requests.

    Returns a bearer Authorization header when a token is available,
    otherwise an empty dict (anonymous request).
    """
    token = hf_token()
    return {"Authorization": f"Bearer {token}"} if token else {}

102 

103 

def _hf_search_value(search: str) -> str:
    """Build the HF ``search=`` value: the GGUF term plus the user's tokens.

    The user's query is whitespace-split and appended after the GGUF
    filter term so everything fits in a single ``search=`` parameter.
    """
    parts = [_HF_GGUF_SEARCH_TERM]
    parts.extend(search.split())
    return " ".join(parts)

108 

109 

110def _has_gguf_siblings(siblings: list[RepoSibling]) -> bool: 

111 """Return True if the sibling list contains at least one .gguf file.""" 

112 return any(s.rfilename.endswith(".gguf") for s in siblings) 

113 

114 

def _estimate_size_from_siblings(siblings: list[RepoSibling]) -> float:
    """Estimate model size in GB from the largest GGUF file in siblings.

    Missing per-file sizes count as 0; when no sized .gguf sibling
    exists the sentinel 0.0 is returned.
    """
    gguf_bytes = [
        sibling.size or 0
        for sibling in siblings
        if sibling.rfilename.endswith(".gguf")
    ]
    largest = max(gguf_bytes, default=0)
    if largest <= 0:
        return 0.0  # unknown: display as "?" in UI
    return round(largest / _BYTES_PER_GB, 1)

124 

125 

class HfClient:
    """HuggingFace catalog API client with a per-instance TTL cache.

    Holds the per-process cache of catalog pages keyed by query
    parameters. The cache TTL and capacity are class-level so tests can
    override them via subclassing if needed; the cache state itself is
    per-instance so ``reset_services()`` discards a stale instance
    along with its cache.
    """

    # Seconds a cached page stays valid before a refetch is required.
    CACHE_TTL: float = 300.0
    # Upper bound on distinct query pages kept; oldest entry is evicted past this.
    CACHE_MAX_ENTRIES: int = 50
    # Rate-limit the "Failed to fetch models" warning so an offline user
    # doesn't see one line per UI tick. First failure surfaces immediately;
    # repeats within the window stay at DEBUG.
    FETCH_FAILURE_WARN_INTERVAL_S: float = 300.0

    def __init__(self) -> None:
        # cache_key -> (monotonic timestamp, page); guarded by _cache_lock.
        self._cache: dict[str, tuple[float, HfPage]] = {}
        self._cache_lock = threading.Lock()
        # -inf, not 0.0: on a freshly booted machine ``time.monotonic()`` can be
        # smaller than the window, which would push the first failure to DEBUG.
        self._last_fetch_failure_warn: float = float("-inf")

    def fetch_models(
        self,
        pipeline_tag: str = "text-generation",
        sort: str = "downloads",
        limit: int = 50,
        offset: int = 0,
        library: str | None = None,
        search: str = "",
    ) -> HfPage:
        """Fetch GGUF models from HuggingFace API with TTL cache.

        Returns an ``HfPage`` with a ``has_more`` flag derived from the
        ``Link: <...>; rel="next"`` response header (RFC 5988), the same
        mechanism the ``huggingface_hub`` library uses internally.

        On an HTTP error status, a transport failure, or a non-JSON body
        this returns the shared empty page instead of raising.
        """
        # Local import to avoid a cycle: query imports hf_client (this
        # module), and hf_client uses pipeline_to_task from query.
        from lilbee.catalog.query import pipeline_to_task

        search_value = _hf_search_value(search)
        cache_key = f"{pipeline_tag}:{sort}:{limit}:{offset}:{library}:{search_value}"
        # One timestamp serves both the expiry sweep and the later insert,
        # so an entry's age is measured from when its fetch began.
        now = time.monotonic()
        with self._cache_lock:
            # Sweep every expired key, not just this one, so dead pages
            # can't accumulate between capacity evictions.
            expired = [k for k, (ts, _) in self._cache.items() if now - ts >= self.CACHE_TTL]
            for k in expired:
                del self._cache[k]

            cached = self._cache.get(cache_key)
            if cached and now - cached[0] < self.CACHE_TTL:
                return cached[1]

        params = httpx.QueryParams(
            pipeline_tag=pipeline_tag,
            search=search_value,
            sort=sort,
            limit=limit,
            skip=offset,
            expand=_HF_EXPAND_FIELDS,
        )
        if library:
            # QueryParams is immutable — ``add`` returns a new instance,
            # hence the reassignment.
            params = params.add("library", library)
        try:
            resp = httpx.get(
                HF_API_URL, params=params, timeout=DEFAULT_TIMEOUT, headers=hf_headers()
            )
            if resp.status_code >= HTTPStatus.BAD_REQUEST:
                log.warning("HuggingFace API returned HTTP %d", resp.status_code)
                return _EMPTY_HF_PAGE
            data = resp.json()
        except (httpx.HTTPError, ValueError) as exc:
            # ValueError covers a non-JSON body from resp.json().
            self._log_fetch_failure(exc)
            return _EMPTY_HF_PAGE

        has_more = "next" in resp.links

        models: list[CatalogModel] = []
        for raw in data:
            # Defensive: skip malformed listing entries without an id.
            if not raw.get("id"):
                continue
            item = ModelInfo(**raw)
            card_desc = item.card_data.get("description", "") if item.card_data else ""
            gguf_meta = HfGgufMeta(**(item.gguf or {}))
            if gguf_meta.total > 0:
                # Prefer the API's own gguf size metadata when present...
                size_gb = round(gguf_meta.total / _BYTES_PER_GB, 1)
            else:
                # ...otherwise estimate from the largest .gguf sibling file.
                size_gb = _estimate_size_from_siblings(item.siblings or [])
            task = pipeline_to_task(item.pipeline_tag or "")
            models.append(
                CatalogModel(
                    hf_repo=item.id,
                    # Wildcard placeholder; presumably resolved to a concrete
                    # file at download time — verify against the download path.
                    gguf_filename="*.gguf",
                    size_gb=size_gb,
                    # Heuristic: 1.5x the model size with a 2 GB floor.
                    min_ram_gb=round(max(2.0, size_gb * 1.5), 1),
                    # Card description truncated to 120 chars for display.
                    description=card_desc[:120] if card_desc else "",
                    featured=False,
                    downloads=item.downloads or 0,
                    task=task,
                )
            )
        page = HfPage(models=models, has_more=has_more)
        with self._cache_lock:
            self._cache[cache_key] = (now, page)
            if len(self._cache) > self.CACHE_MAX_ENTRIES:
                # Evict the entry with the oldest timestamp. O(n) scan is
                # fine at CACHE_MAX_ENTRIES = 50.
                oldest_key = min(self._cache, key=lambda k: self._cache[k][0])
                del self._cache[oldest_key]
        return page

    def _log_fetch_failure(self, exc: Exception) -> None:
        """Log an HF fetch failure, rate-limited so offline use doesn't spam.

        First failure of each ``FETCH_FAILURE_WARN_INTERVAL_S`` window logs
        at WARNING; repeats within the window log at DEBUG. The interval
        starts from the last WARNING so a flapping network produces one
        line every five minutes, not one per UI tick.
        """
        now = time.monotonic()
        if now - self._last_fetch_failure_warn >= self.FETCH_FAILURE_WARN_INTERVAL_S:
            log.warning("Failed to fetch models from HuggingFace: %s", exc)
            self._last_fetch_failure_warn = now
        else:
            log.debug("Suppressed repeat HF fetch failure: %s", exc)