Coverage for src / lilbee / crawler / bootstrap.py: 100%

96 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-05-15 20:55 +0000

1"""Playwright Chromium detection and install.""" 

2 

3from __future__ import annotations 

4 

5import asyncio 

6import os 

7import re 

8import sys 

9from pathlib import Path 

10 

11from lilbee.runtime.progress import ( 

12 DetailedProgressCallback, 

13 EventType, 

14 SetupDoneEvent, 

15 SetupProgressEvent, 

16 SetupStartEvent, 

17) 

18 

19 

class CrawlerBrowserError(RuntimeError):
    """Signal that the Playwright Chromium browser is unavailable.

    Within this module it is raised when the ``playwright`` package cannot
    be imported and when the ``playwright install chromium`` subprocess
    exits with a non-zero status.
    """

22 

23 

class CrawlerBackendError(RuntimeError):
    """Signal that the optional ``crawler`` extra (crawl4ai) is not installed."""

26 

27 

# Component identifier attached to every setup event emitted by this module.
_CHROMIUM_COMPONENT = "chromium"

# Approximate size of the Chromium download. Playwright bundles differ a
# little from platform to platform; this figure only seeds the UI with a
# usable denominator until the real total is parsed from installer stdout.
_CHROMIUM_ESTIMATE_MB = 180
_CHROMIUM_SIZE_ESTIMATE_BYTES = _CHROMIUM_ESTIMATE_MB * (1 << 20)

34 

# Multipliers that turn the size units found in Playwright progress output
# into bytes. Keys are lower-cased unit suffixes; the "kb"/"kib" and
# "mb"/"mib" spellings deliberately share the same 1024-based factor.
_BYTE_UNIT_SCALE: dict[str, int] = {
    "b": 1,
    "kb": 1 << 10,
    "kib": 1 << 10,
    "mb": 1 << 20,
    "mib": 1 << 20,
}

43 

# While Chromium downloads, Playwright 1.58 writes progress lines such as
# ``|■■■■■■■■ | 10% of 162.3 MiB``: the percentage first, followed by
# "of <total> <unit>". Capture groups are (percent, total, unit); matching
# is case-insensitive so any casing of the unit suffix is accepted.
_PROGRESS_LINE_RE = re.compile(
    r"(\d+)\s*%\s*of\s*(\d+(?:\.\d+)?)\s*(MiB|Mb|MB|KiB|KB|B)",
    flags=re.IGNORECASE,
)

51 

52 

def _browsers_cache_path() -> Path:
    """Locate the directory under which Playwright keeps browser binaries.

    A non-empty ``PLAYWRIGHT_BROWSERS_PATH`` environment variable takes
    precedence (with ``~`` expanded). Otherwise the location depends on
    ``sys.platform``:

    * ``darwin``: ``~/Library/Caches/ms-playwright``
    * ``win32``: ``%LOCALAPPDATA%/ms-playwright`` (``~/AppData/Local`` when
      ``LOCALAPPDATA`` is unset)
    * anything else: ``~/.cache/ms-playwright``
    """
    custom_root = os.environ.get("PLAYWRIGHT_BROWSERS_PATH")
    if custom_root:
        return Path(custom_root).expanduser()

    platform = sys.platform
    if platform == "darwin":
        return Path.home().joinpath("Library", "Caches", "ms-playwright")
    if platform == "win32":
        fallback_local = str(Path.home().joinpath("AppData", "Local"))
        local_app_data = os.environ.get("LOCALAPPDATA", fallback_local)
        return Path(local_app_data).joinpath("ms-playwright")
    return Path.home().joinpath(".cache", "ms-playwright")

64 

65 

def chromium_installed() -> bool:
    """Return True if at least one ``chromium-*`` install directory exists.

    Only the immediate children of the Playwright browser cache root are
    inspected. A root that is missing, or that exists but is not a
    directory (for example ``PLAYWRIGHT_BROWSERS_PATH`` pointing at a
    regular file), is reported as "not installed" rather than raising.
    """
    root = _browsers_cache_path()
    # is_dir() instead of exists(): iterdir() on a regular file raises
    # NotADirectoryError, which would turn a simple probe into a crash.
    if not root.is_dir():
        return False
    return any(
        entry.name.startswith("chromium-") and entry.is_dir() for entry in root.iterdir()
    )

72 

73 

def crawler_browsers_path() -> Path:
    """Expose the crawler's browser cache root to code outside this module.

    The HTTP status endpoint uses this to tell plugins where Chromium
    lives. It simply forwards to the private resolver, which is kept
    private so callers do not come to rely on the Playwright-specific
    directory layout.
    """
    cache_root = _browsers_cache_path()
    return cache_root

82 

83 

def _bytes_from_stdout(line: str) -> tuple[int, int] | None:
    """Parse one Playwright stdout line into ``(downloaded_bytes, total_bytes)``.

    Recognises the ``NN% of N.N MiB`` shape that Playwright 1.58+ prints
    during the Chromium download. The total is scaled to bytes through the
    unit table and the downloaded figure is derived from the percentage, so
    a matching line always yields both numbers. Lines that do not match
    produce ``None``.
    """
    found = _PROGRESS_LINE_RE.search(line)
    if found is None:
        return None

    percent_text, total_text, unit_text = found.groups()
    percent = int(percent_text)
    bytes_per_unit = _BYTE_UNIT_SCALE.get(unit_text.lower(), 1)
    total = int(float(total_text) * bytes_per_unit)
    return int(total * percent / 100), total

102 

103 

def _emit_setup_start(on_progress: DetailedProgressCallback | None) -> None:
    """Send a ``SETUP_START`` event for the Chromium component.

    The event carries the module's size estimate. Nothing happens when
    ``on_progress`` is ``None``.
    """
    if on_progress is not None:
        event_type = EventType.SETUP_START
        payload = SetupStartEvent(
            component=_CHROMIUM_COMPONENT,
            size_estimate_bytes=_CHROMIUM_SIZE_ESTIMATE_BYTES,
        )
        on_progress(event_type, payload)

114 

115 

def _emit_setup_done(
    on_progress: DetailedProgressCallback | None,
    *,
    success: bool,
    error: str | None,
) -> None:
    """Send a ``SETUP_DONE`` event for the Chromium component.

    ``success`` and ``error`` are forwarded verbatim into the event.
    Nothing happens when ``on_progress`` is ``None``.
    """
    if on_progress is not None:
        event_type = EventType.SETUP_DONE
        payload = SetupDoneEvent(
            component=_CHROMIUM_COMPONENT,
            success=success,
            error=error,
        )
        on_progress(event_type, payload)

128 

129 

async def _drain_stdout_to_progress(
    stream: asyncio.StreamReader,
    on_progress: DetailedProgressCallback | None,
) -> None:
    """Consume ``stream`` to EOF, turning progress lines into events.

    Every line is decoded (undecodable bytes replaced), stripped of
    trailing whitespace and run through the progress parser. Lines that
    parse are forwarded as ``SETUP_PROGRESS`` events with the raw line as
    ``detail``. The stream is still read to the end when ``on_progress``
    is ``None``; only the callback is skipped.
    """
    while raw := await stream.readline():
        text = raw.decode(errors="replace").rstrip()
        progress = _bytes_from_stdout(text)
        if progress is not None and on_progress is not None:
            done_bytes, total_bytes = progress
            on_progress(
                EventType.SETUP_PROGRESS,
                SetupProgressEvent(
                    component=_CHROMIUM_COMPONENT,
                    downloaded_bytes=done_bytes,
                    total_bytes=total_bytes,
                    detail=text,
                ),
            )

152 

153 

async def _drain_stderr(stream: asyncio.StreamReader, tail: list[str]) -> None:
    """Read ``stream`` to EOF, appending each decoded line to ``tail``.

    Lines are decoded with undecodable bytes replaced and have trailing
    whitespace removed. ``tail`` is mutated in place so the caller can
    inspect the collected output after the stream closes.
    """
    while raw := await stream.readline():
        text = raw.decode(errors="replace")
        tail.append(text.rstrip())

160 

161 

# Remediation text used as the error message when the playwright package
# cannot be imported.
_PLAYWRIGHT_MISSING_HINT = " ".join(
    (
        "Chromium bootstrap requires the playwright Python package, which is",
        "bundled with the release binary and the lilbee[crawler] extra.",
        "Reinstall with 'pip install lilbee[crawler]' or download a fresh",
        "release binary.",
    )
)

168 

169 

def _resolve_playwright_runner() -> tuple[list[str], dict[str, str]]:
    """Work out how to invoke the Playwright CLI: ``(argv_prefix, env)``.

    The preferred route is Playwright's bundled Node driver, launched
    directly so that the call works from a pip install, a
    ``uv tool install`` or a frozen (Nuitka onefile) binary. If locating
    the driver fails, unfrozen interpreters fall back to
    ``[sys.executable, '-m', 'playwright']`` with a copy of the current
    environment; frozen builds re-raise the lookup error unchanged.

    Raises:
        CrawlerBrowserError: the ``playwright`` package cannot be imported.
    """
    try:
        from playwright._impl._driver import compute_driver_executable, get_driver_env
    except ImportError as exc:
        raise CrawlerBrowserError(_PLAYWRIGHT_MISSING_HINT) from exc

    try:
        driver_exe, driver_cli = compute_driver_executable()
    except Exception:
        # A frozen binary has no interpreter able to run ``-m playwright``,
        # so there the original failure is surfaced as-is.
        if getattr(sys, "frozen", False):
            raise
        return [sys.executable, "-m", "playwright"], dict(os.environ)

    argv_prefix = [str(driver_exe), str(driver_cli)]
    return argv_prefix, dict(get_driver_env())

189 

190 

async def bootstrap_chromium(
    on_progress: DetailedProgressCallback | None = None,
) -> None:
    """Run ``playwright install chromium`` as a subprocess, emitting events.

    Short-circuits with a successful ``setup_done`` when
    ``chromium_installed()`` is already True. Otherwise emits
    ``setup_start`` before spawning, ``setup_progress`` for each
    recognizable progress line on stdout, and ``setup_done`` on exit.
    Every failure that this function reports emits ``setup_done`` with
    ``success=False`` first, so a listener never sees a start without a
    matching done for runner-resolution, spawn, or non-zero-exit failures.

    Args:
        on_progress: Optional callback receiving ``(EventType, event)``
            pairs. ``None`` disables event emission.

    Raises:
        CrawlerBrowserError: the playwright package is missing, the
            installer process could not be started, or it exited with a
            non-zero status (the message then carries the last lines of
            its stderr) so task workers route to FAILED cleanly.
        Exception: any other error raised while resolving the runner is
            propagated unchanged after ``setup_done`` has been emitted.
    """
    if chromium_installed():
        _emit_setup_done(on_progress, success=True, error=None)
        return

    _emit_setup_start(on_progress)

    try:
        runner, runner_env = _resolve_playwright_runner()
    except Exception as exc:
        # The resolver re-raises the raw driver-lookup error in frozen
        # builds, so catch broadly: setup_start must always be paired with
        # setup_done. The exception itself propagates untouched.
        _emit_setup_done(on_progress, success=False, error=str(exc) or repr(exc))
        raise

    try:
        proc = await asyncio.create_subprocess_exec(
            *runner,
            "install",
            "chromium",
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
            env=runner_env,
        )
    except OSError as exc:
        # Missing or non-executable driver: report it like any other
        # bootstrap failure instead of leaking a bare OSError.
        message = f"Chromium bootstrap could not start {runner[0]!r}: {exc}"
        _emit_setup_done(on_progress, success=False, error=message)
        raise CrawlerBrowserError(message) from exc

    # mypy narrowing: asyncio.create_subprocess_exec with PIPE guarantees
    # non-None streams at runtime; the asserts only satisfy the type checker.
    assert proc.stdout is not None  # noqa: S101
    assert proc.stderr is not None  # noqa: S101

    stderr_tail: list[str] = []
    try:
        # Both pipes are drained concurrently so a chatty stderr cannot
        # block the child while stdout is being read (and vice versa).
        await asyncio.gather(
            _drain_stdout_to_progress(proc.stdout, on_progress),
            _drain_stderr(proc.stderr, stderr_tail),
        )
        returncode = await proc.wait()
    finally:
        # Reached with a live child only on cancellation or when a callback
        # raised: terminate the installer rather than orphaning it.
        if proc.returncode is None:
            try:
                proc.kill()
            except ProcessLookupError:
                pass  # The child exited between the check and the kill.
            await proc.wait()

    if returncode != 0:
        # Only the last few stderr lines are reported to keep the message short.
        tail = "\n".join(stderr_tail[-10:]) or f"exit code {returncode}"
        _emit_setup_done(on_progress, success=False, error=tail)
        raise CrawlerBrowserError(f"Chromium bootstrap failed (exit {returncode}): {tail}")

    _emit_setup_done(on_progress, success=True, error=None)