Coverage for src / lilbee / crawler / bootstrap.py: 100%

124 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-06-28 01:01 +0000

1"""Playwright Chromium detection and install.""" 

2 

3from __future__ import annotations 

4 

5import asyncio 

6import os 

7import re 

8import sys 

9from pathlib import Path 

10 

11from lilbee._frozen import is_frozen 

12from lilbee.runtime.progress import ( 

13 DetailedProgressCallback, 

14 EventType, 

15 SetupDoneEvent, 

16 SetupProgressEvent, 

17 SetupStartEvent, 

18) 

19 

20 

class CrawlerBrowserError(RuntimeError):
    """Raised when Playwright's Chromium browser binary is unavailable.

    Within this module it signals that the Playwright driver could not be
    imported or that the Chromium install subprocess failed.
    """

23 

24 

class CrawlerBackendError(RuntimeError):
    """Raised when the optional ``crawler`` extra (crawl4ai) is not installed."""

27 

28 

# Component name: used in setup events, matched against the ``name`` field of
# browsers.json entries, and used as the prefix of the on-disk
# ``chromium-<revision>`` directory.
_CHROMIUM_COMPONENT = "chromium"

# Approximate size of the Chromium download. The real bundle differs a little
# per platform; this only gives the UI a usable denominator until the actual
# total can be parsed out of the installer's stdout.
_CHROMIUM_ESTIMATE_MB = 180
_CHROMIUM_SIZE_ESTIMATE_BYTES = _CHROMIUM_ESTIMATE_MB * (1024**2)

35 

# Lower-cased unit suffix from a Playwright progress line -> bytes per unit.
# The "kb"/"mb" spellings share the 1024-based factor of "kib"/"mib".
_BYTE_UNIT_SCALE: dict[str, int] = {
    "b": 1,
    **dict.fromkeys(("kb", "kib"), 1024),
    **dict.fromkeys(("mb", "mib"), 1024**2),
}

44 

# During the chromium download Playwright 1.58 writes progress lines shaped
# like ``|■■■■■■■■ | 10% of 162.3 MiB``: the percentage first, then the word
# "of", then the total size and its unit. Groups: (percent, total, unit).
_PROGRESS_LINE_RE = re.compile(
    r"(\d+)\s*%\s*of\s*(\d+(?:\.\d+)?)\s*(MiB|Mb|MB|KiB|KB|B)",
    flags=re.IGNORECASE,
)

52 

53 

def _browsers_cache_path() -> Path:
    """Return the root directory where Playwright stores browser binaries.

    The lookup order follows the one Playwright's browser registry uses, so
    the directory inspected here is the one ``playwright install`` writes to:

    1. ``PLAYWRIGHT_BROWSERS_PATH`` when set to a non-empty value (``~`` is
       expanded).
    2. macOS: ``~/Library/Caches/ms-playwright``.
    3. Windows: ``ms-playwright`` under ``LOCALAPPDATA``, or under
       ``~/AppData/Local`` when that variable is unset or empty.
    4. Any other platform: ``ms-playwright`` under ``XDG_CACHE_HOME``, or
       under ``~/.cache`` when that variable is unset or empty.

    NOTE(review): Playwright gives ``PLAYWRIGHT_BROWSERS_PATH=0`` a special
    meaning (browsers stored inside the package); that value is still treated
    here as a literal path.
    """
    override = os.environ.get("PLAYWRIGHT_BROWSERS_PATH")
    if override:
        return Path(override).expanduser()

    # Path.home() is only evaluated on the branches that need it.
    if sys.platform == "darwin":
        cache_root = Path.home() / "Library" / "Caches"
    elif sys.platform == "win32":
        # An empty LOCALAPPDATA must fall back too; otherwise the result
        # would be the relative path "ms-playwright".
        local = os.environ.get("LOCALAPPDATA")
        cache_root = Path(local) if local else Path.home() / "AppData" / "Local"
    else:
        # Without this, a user with XDG_CACHE_HOME set gets browsers
        # installed in one directory while this module checks another.
        xdg_cache = os.environ.get("XDG_CACHE_HOME")
        cache_root = Path(xdg_cache) if xdg_cache else Path.home() / ".cache"
    return cache_root / "ms-playwright"

65 

66 

def _read_chromium_revision(browsers_json: Path) -> str | None:
    """Pull the ``chromium`` revision out of a Playwright ``browsers.json``.

    Args:
        browsers_json: Path to the JSON manifest to read.

    Returns:
        The revision of the first entry named ``chromium`` as a string, or
        ``None`` when the file cannot be read, is not valid JSON, does not
        have the expected ``{"browsers": [{...}, ...]}`` shape, or has no
        chromium entry with a revision.
    """
    import json

    try:
        data = json.loads(browsers_json.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # ValueError covers both JSONDecodeError and UnicodeDecodeError.
        return None

    # Valid JSON of the wrong shape (top-level list, "browsers": null, scalar
    # entries) means "unknown", not an AttributeError/TypeError for the caller.
    if not isinstance(data, dict):
        return None
    browsers = data.get("browsers")
    if not isinstance(browsers, list):
        return None

    for browser in browsers:
        if not isinstance(browser, dict):
            continue
        if browser.get("name") == _CHROMIUM_COMPONENT:
            revision = browser.get("revision")
            return str(revision) if revision is not None else None
    return None

80 

81 

def _expected_chromium_revision() -> str | None:
    """Return the Chromium revision the importable Playwright declares.

    The revision (for example ``"1217"``) is read from the first
    ``browsers.json`` found beneath the ``playwright`` package directory.

    ``None`` means "unknown": playwright is not importable, no
    ``browsers.json`` was found, or the first one found yielded no chromium
    revision.
    """
    try:
        import playwright as _pw
    except ImportError:
        return None
    package_dir = Path(_pw.__file__).parent
    # Only the first manifest encountered is consulted.
    manifest = next(package_dir.rglob("browsers.json"), None)
    return None if manifest is None else _read_chromium_revision(manifest)

95 

96 

def chromium_installed() -> bool:
    """Report whether the Chromium revision Playwright expects is on disk.

    Finding just any ``chromium-*`` directory is not sufficient: if the
    cache holds chromium-1217 while the bundled Playwright driver wants
    chromium-1208, launching fails with ``Executable doesn't exist`` even
    though a looser check would have reported success. The loose check is
    used only when the expected revision is unknown.
    """
    cache_root = _browsers_cache_path()
    if not cache_root.exists():
        return False

    revision = _expected_chromium_revision()
    if revision is not None:
        return (cache_root / f"{_CHROMIUM_COMPONENT}-{revision}").is_dir()

    # Revision unknown: accept any chromium-* directory.
    for entry in cache_root.iterdir():
        if entry.is_dir() and entry.name.startswith("chromium-"):
            return True
    return False

112 

113 

def crawler_browsers_path() -> Path:
    """Return the crawler browser cache root (public accessor).

    The HTTP status endpoint uses this to tell plugins where Chromium
    lives. The resolver itself is kept private so callers do not come to
    depend on the Playwright-specific directory layout.
    """
    browsers_root = _browsers_cache_path()
    return browsers_root

122 

123 

def _bytes_from_stdout(line: str) -> tuple[int, int] | None:
    """Parse ``(downloaded_bytes, total_bytes)`` from a Playwright stdout line.

    Recognizes the ``NN% of N.N MiB`` form that Playwright 1.58+ prints while
    downloading Chromium and returns ``None`` for any other line. Because the
    percentage and the total come from the same line, a result always
    carries both values; the downloaded figure is derived from them.
    """
    found = _PROGRESS_LINE_RE.search(line)
    if found is None:
        return None

    pct_text, total_text, unit_text = found.groups()
    factor = _BYTE_UNIT_SCALE.get(unit_text.lower(), 1)
    total_bytes = int(float(total_text) * factor)
    return int(total_bytes * int(pct_text) / 100), total_bytes

142 

143 

def _emit_setup_start(on_progress: DetailedProgressCallback | None) -> None:
    """Send a ``SETUP_START`` event for Chromium; do nothing without a callback.

    The event carries the component name and the estimated download size.
    """
    if on_progress is not None:
        start_event = SetupStartEvent(
            component=_CHROMIUM_COMPONENT,
            size_estimate_bytes=_CHROMIUM_SIZE_ESTIMATE_BYTES,
        )
        on_progress(EventType.SETUP_START, start_event)

154 

155 

def _emit_setup_done(
    on_progress: DetailedProgressCallback | None,
    *,
    success: bool,
    error: str | None,
) -> None:
    """Send a ``SETUP_DONE`` event for Chromium; do nothing without a callback.

    Args:
        on_progress: Callback receiving ``(event_type, event)``, or ``None``.
        success: Outcome to record on the event.
        error: Error text to record on the event, or ``None``.
    """
    if on_progress is not None:
        done_event = SetupDoneEvent(
            component=_CHROMIUM_COMPONENT,
            success=success,
            error=error,
        )
        on_progress(EventType.SETUP_DONE, done_event)

168 

169 

async def _drain_stdout_to_progress(
    stream: asyncio.StreamReader,
    on_progress: DetailedProgressCallback | None,
) -> None:
    """Read ``stream`` to EOF, forwarding recognizable progress lines.

    Each line is decoded (undecodable bytes are replaced) and stripped of
    trailing whitespace. Lines that parse as download progress are reported
    as ``SETUP_PROGRESS`` events with the raw line as ``detail``; all other
    lines are discarded. Without a callback the stream is still drained.
    """
    while raw := await stream.readline():
        text = raw.decode(errors="replace").rstrip()
        progress = _bytes_from_stdout(text)
        if on_progress is not None and progress is not None:
            done_bytes, all_bytes = progress
            on_progress(
                EventType.SETUP_PROGRESS,
                SetupProgressEvent(
                    component=_CHROMIUM_COMPONENT,
                    downloaded_bytes=done_bytes,
                    total_bytes=all_bytes,
                    detail=text,
                ),
            )

192 

193 

async def _drain_stderr(stream: asyncio.StreamReader, tail: list[str]) -> None:
    """Read ``stream`` to EOF, appending every decoded line to ``tail``.

    Lines are decoded with undecodable bytes replaced and have trailing
    whitespace removed. ``tail`` is mutated in place.
    """
    while raw := await stream.readline():
        tail.append(raw.decode(errors="replace").rstrip())

200 

201 

# Message for CrawlerBrowserError when the Playwright driver module cannot be
# imported.
_PLAYWRIGHT_MISSING_HINT = (
    "Chromium bootstrap requires the playwright Python package, "
    "which is bundled with the release binary and the lilbee[crawler] extra. "
    "Reinstall with 'pip install lilbee[crawler]' "
    "or download a fresh release binary."
)

208 

209 

def _resolve_playwright_runner() -> tuple[list[str], dict[str, str]]:
    """Return ``(argv_prefix, env)`` for invoking ``playwright install chromium``.

    The preferred runner is Playwright's own bundled driver, invoked
    directly, so the same call works from a pip install, a
    ``uv tool install``, or a frozen (Nuitka onefile) binary.

    If locating the driver fails, unfrozen builds fall back to
    ``[sys.executable, '-m', 'playwright']`` with the current environment.
    Frozen builds re-raise instead: there ``sys.executable`` is the lilbee
    executable and ``-m playwright`` would leak into typer.

    Raises:
        CrawlerBrowserError: The Playwright driver module cannot be imported.
        Exception: Whatever the driver lookup raised, in frozen builds only.
    """
    try:
        from playwright._impl._driver import compute_driver_executable, get_driver_env
    except ImportError as exc:
        raise CrawlerBrowserError(_PLAYWRIGHT_MISSING_HINT) from exc

    try:
        # Unpacking stays inside the try so an unexpected return shape also
        # takes the fallback path.
        exe_path, cli_path = compute_driver_executable()
    except Exception:
        if is_frozen():
            raise
        fallback_argv = [sys.executable, "-m", "playwright"]
        return fallback_argv, dict(os.environ)

    return [str(exe_path), str(cli_path)], dict(get_driver_env())

230 

231 

async def bootstrap_chromium(
    on_progress: DetailedProgressCallback | None = None,
) -> None:
    """Run ``playwright install chromium`` as a subprocess, emitting events.

    Short-circuits with a successful ``setup_done`` when
    ``chromium_installed()`` is already True. Otherwise emits ``setup_start``,
    then ``setup_progress`` for each recognizable progress line on stdout,
    and finally ``setup_done``. On failure ``setup_done`` carries
    ``success=False`` and an error string: the runner-resolution error, the
    spawn error, or the last stderr lines of the subprocess.

    Args:
        on_progress: Optional callback receiving ``(event_type, event)``.

    Raises:
        CrawlerBrowserError: Playwright is missing, the installer could not
            be started, or it exited with a non-zero status (the message
            includes the stderr tail), so task workers route to FAILED
            cleanly.
        Exception: A driver-lookup error re-raised by
            ``_resolve_playwright_runner`` (frozen builds) propagates
            unchanged after ``setup_done`` has been emitted.

    If the coroutine is cancelled, or a drain or callback raises while the
    installer is running, the child process is killed rather than left
    running; no ``setup_done`` is emitted in that case.
    """
    if chromium_installed():
        _emit_setup_done(on_progress, success=True, error=None)
        return

    _emit_setup_start(on_progress)

    try:
        runner, runner_env = _resolve_playwright_runner()
    except Exception as exc:
        # Covers CrawlerBrowserError and the driver-lookup error that frozen
        # builds re-raise, so every setup_start is paired with a setup_done.
        _emit_setup_done(on_progress, success=False, error=str(exc))
        raise

    try:
        proc = await asyncio.create_subprocess_exec(
            *runner,
            "install",
            "chromium",
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
            env=runner_env,
        )
    except OSError as exc:
        # E.g. the driver executable is missing or not executable. Report it
        # like any other bootstrap failure instead of leaking a bare OSError
        # with no setup_done.
        detail = f"could not start the Playwright installer: {exc}"
        _emit_setup_done(on_progress, success=False, error=detail)
        raise CrawlerBrowserError(f"Chromium bootstrap failed: {detail}") from exc
    # mypy narrowing: asyncio.create_subprocess_exec with PIPE guarantees
    # non-None streams at runtime; the asserts only satisfy the type checker.
    assert proc.stdout is not None  # noqa: S101
    assert proc.stderr is not None  # noqa: S101

    stderr_tail: list[str] = []
    try:
        # Both pipes are drained concurrently so neither can fill up and
        # stall the child.
        await asyncio.gather(
            _drain_stdout_to_progress(proc.stdout, on_progress),
            _drain_stderr(proc.stderr, stderr_tail),
        )
        returncode = await proc.wait()
    finally:
        # Only reached with a live child on cancellation or when a drain or
        # callback raised: don't orphan the download.
        if proc.returncode is None:
            try:
                proc.kill()
            except ProcessLookupError:
                pass  # the child exited between the check and the kill
            await proc.wait()

    if returncode != 0:
        tail = "\n".join(stderr_tail[-10:]) or f"exit code {returncode}"
        _emit_setup_done(on_progress, success=False, error=tail)
        raise CrawlerBrowserError(f"Chromium bootstrap failed (exit {returncode}): {tail}")

    _emit_setup_done(on_progress, success=True, error=None)