Coverage for src / lilbee / crawler / bootstrap.py: 100%
124 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-06-28 01:01 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-06-28 01:01 +0000
1"""Playwright Chromium detection and install."""
3from __future__ import annotations
5import asyncio
6import os
7import re
8import sys
9from pathlib import Path
11from lilbee._frozen import is_frozen
12from lilbee.runtime.progress import (
13 DetailedProgressCallback,
14 EventType,
15 SetupDoneEvent,
16 SetupProgressEvent,
17 SetupStartEvent,
18)
class CrawlerBrowserError(RuntimeError):
    """Signal that the Chromium browser Playwright needs is not usable.

    Raised in this module when the Playwright driver module cannot be
    imported and when ``playwright install chromium`` exits with a non-zero
    status.
    """
class CrawlerBackendError(RuntimeError):
    """Signal that the optional ``crawler`` extra (crawl4ai) was never installed."""
# Component name used in progress events and in the cache directory name
# (``chromium-REVISION``).
_CHROMIUM_COMPONENT = "chromium"

# Approximate size of the Chromium download. Playwright bundles differ a
# little per platform; this only seeds the UI's progress denominator until a
# real total is parsed from the installer's stdout.
_CHROMIUM_ESTIMATE_MB = 180
_CHROMIUM_SIZE_ESTIMATE_BYTES = _CHROMIUM_ESTIMATE_MB * 2**20

# Lower-cased size unit -> number of bytes, for Playwright stdout progress
# lines. Both the "kb"/"mb" and "kib"/"mib" spellings use power-of-two scales.
_BYTE_UNIT_SCALE: dict[str, int] = {
    "b": 1,
    "kb": 2**10,
    "kib": 2**10,
    "mb": 2**20,
    "mib": 2**20,
}

# Playwright 1.58 reports the chromium download with lines such as
# ``|■■■■■■■■ | 10% of 162.3 MiB``: the percentage comes first, followed by
# "of", the total size, and its unit.
_PROGRESS_LINE_RE = re.compile(
    r"(\d+)\s*%\s*of\s*(\d+(?:\.\d+)?)\s*(MiB|Mb|MB|KiB|KB|B)",
    flags=re.IGNORECASE,
)
def _browsers_cache_path() -> Path:
    """Return the root directory where Playwright stores browser binaries.

    Resolution order:

    1. ``PLAYWRIGHT_BROWSERS_PATH`` when set to a non-empty value (``~`` is
       expanded).
    2. Otherwise the per-platform cache directory joined with
       ``ms-playwright``:

       * macOS: ``~/Library/Caches``
       * Windows: ``%LOCALAPPDATA%``, or ``~/AppData/Local`` when that
         variable is unset or empty
       * everything else: ``$XDG_CACHE_HOME``, or ``~/.cache`` when that
         variable is unset or empty

    Honouring ``XDG_CACHE_HOME`` matters because Playwright itself installs
    under it on Linux; ignoring it would make the detection look in a
    directory the installer never writes to.
    """
    override = os.environ.get("PLAYWRIGHT_BROWSERS_PATH")
    if override:
        # NOTE(review): Playwright documents the value "0" as a request for a
        # "hermetic" install inside the package directory. That case is not
        # special-cased here -- confirm whether it needs support.
        return Path(override).expanduser()
    if sys.platform == "darwin":
        return Path.home() / "Library" / "Caches" / "ms-playwright"
    if sys.platform == "win32":
        # ``or``-style fallback: an empty LOCALAPPDATA must not produce a
        # relative "ms-playwright" path.
        local = os.environ.get("LOCALAPPDATA")
        base = Path(local) if local else Path.home() / "AppData" / "Local"
        return base / "ms-playwright"
    xdg_cache = os.environ.get("XDG_CACHE_HOME")
    base = Path(xdg_cache) if xdg_cache else Path.home() / ".cache"
    return base / "ms-playwright"
def _read_chromium_revision(browsers_json: Path) -> str | None:
    """Pull the ``chromium`` revision out of a Playwright ``browsers.json``.

    Args:
        browsers_json: Path to the manifest file to read (UTF-8 JSON).

    Returns:
        The revision of the entry whose ``name`` equals the chromium
        component, converted to ``str``. ``None`` when the file cannot be
        read, is not valid JSON, does not have the expected
        ``{"browsers": [{...}, ...]}`` shape, or has no chromium entry with a
        revision.
    """
    import json

    try:
        data = json.loads(browsers_json.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # Unreadable file, undecodable bytes, or malformed JSON.
        return None
    # Syntactically valid JSON can still have the wrong shape (top-level
    # list/null, "browsers": null, ...). Treat that as "unknown" instead of
    # letting AttributeError/TypeError escape to the caller.
    browsers = data.get("browsers") if isinstance(data, dict) else None
    if not isinstance(browsers, list):
        return None
    for browser in browsers:
        if not isinstance(browser, dict):
            continue
        if browser.get("name") == _CHROMIUM_COMPONENT:
            revision = browser.get("revision")
            return str(revision) if revision is not None else None
    return None
def _expected_chromium_revision() -> str | None:
    """Return the Chromium revision the installed Playwright expects (e.g. ``"1217"``).

    The revision is read from the first ``browsers.json`` found anywhere
    under the ``playwright`` package directory.

    ``None`` means the revision is unknown: ``playwright`` is not importable,
    no ``browsers.json`` was found, or the one found yielded no chromium
    revision.
    """
    try:
        import playwright as _pw
    except ImportError:
        return None
    package_dir = Path(_pw.__file__).parent
    # Only the first match is consulted.
    manifest = next(package_dir.rglob("browsers.json"), None)
    if manifest is None:
        return None
    return _read_chromium_revision(manifest)
def chromium_installed() -> bool:
    """Return True if the Chromium revision Playwright expects is on disk.

    A bare "some ``chromium-*`` directory exists" check is too weak: if the
    cache holds chromium-1217 while the bundled Playwright driver wants
    chromium-1208, launching fails with ``Executable doesn't exist`` even
    though the bootstrap check considered everything ready. So when the
    expected revision is known, only the exactly matching directory counts.
    Any ``chromium-*`` directory is accepted only as a fallback when the
    expected revision cannot be determined.
    """
    cache_root = _browsers_cache_path()
    if not cache_root.exists():
        return False
    revision = _expected_chromium_revision()
    if revision is not None:
        return (cache_root / f"{_CHROMIUM_COMPONENT}-{revision}").is_dir()
    # Revision unknown: fall back to "any chromium build is present".
    for entry in cache_root.iterdir():
        if entry.is_dir() and entry.name.startswith("chromium-"):
            return True
    return False
def crawler_browsers_path() -> Path:
    """Return the crawler browser cache root (public accessor).

    Thin public wrapper over the private resolver, so that callers outside
    this module (the original notes cite the HTTP status endpoint, which
    tells plugins where Chromium lives) can obtain the location without
    depending on the Playwright-specific directory layout.
    """
    browsers_root = _browsers_cache_path()
    return browsers_root
def _bytes_from_stdout(line: str) -> tuple[int, int] | None:
    """Parse ``(downloaded_bytes, total_bytes)`` out of a Playwright stdout line.

    Recognises the ``NN% of N.N MiB`` shape Playwright 1.58+ prints during
    the Chromium download. Percentage and total come from the same line, so
    a successful parse always carries a total.

    Returns:
        ``None`` when the line does not match; otherwise the downloaded byte
        count (the percentage applied to the total, truncated to an int) and
        the total byte count.
    """
    found = _PROGRESS_LINE_RE.search(line)
    if found is None:
        return None
    percent_text, total_text, unit_text = found.groups()
    # An unrecognised unit falls back to a scale of 1 (plain bytes).
    bytes_per_unit = _BYTE_UNIT_SCALE.get(unit_text.lower(), 1)
    total_bytes = int(float(total_text) * bytes_per_unit)
    downloaded_bytes = int(total_bytes * int(percent_text) / 100)
    return downloaded_bytes, total_bytes
def _emit_setup_start(on_progress: DetailedProgressCallback | None) -> None:
    """Send a ``SETUP_START`` event for Chromium; do nothing without a callback.

    The event carries the component name and the rough download-size
    estimate.
    """
    if on_progress is not None:
        start_event = SetupStartEvent(
            component=_CHROMIUM_COMPONENT,
            size_estimate_bytes=_CHROMIUM_SIZE_ESTIMATE_BYTES,
        )
        on_progress(EventType.SETUP_START, start_event)
def _emit_setup_done(
    on_progress: DetailedProgressCallback | None,
    *,
    success: bool,
    error: str | None,
) -> None:
    """Send a ``SETUP_DONE`` event for Chromium; do nothing without a callback.

    Args:
        on_progress: Callback receiving ``(event_type, event)``, or ``None``.
        success: Outcome flag copied into the event.
        error: Error text copied into the event (``None`` when there is none).
    """
    if on_progress is not None:
        done_event = SetupDoneEvent(
            component=_CHROMIUM_COMPONENT,
            success=success,
            error=error,
        )
        on_progress(EventType.SETUP_DONE, done_event)
async def _drain_stdout_to_progress(
    stream: asyncio.StreamReader,
    on_progress: DetailedProgressCallback | None,
) -> None:
    """Read *stream* line by line until EOF, forwarding progress lines.

    Every line is decoded (undecodable bytes are replaced) and stripped of
    trailing whitespace. Lines that parse as download progress are reported
    to *on_progress* as ``SETUP_PROGRESS`` events with the raw line as
    ``detail``. The stream is consumed to EOF even when *on_progress* is
    ``None``.
    """
    # ``readline`` yields b"" at EOF, which ends the loop.
    while raw := await stream.readline():
        text = raw.decode(errors="replace").rstrip()
        progress = _bytes_from_stdout(text)
        if progress is not None and on_progress is not None:
            done_bytes, all_bytes = progress
            event = SetupProgressEvent(
                component=_CHROMIUM_COMPONENT,
                downloaded_bytes=done_bytes,
                total_bytes=all_bytes,
                detail=text,
            )
            on_progress(EventType.SETUP_PROGRESS, event)
async def _drain_stderr(stream: asyncio.StreamReader, tail: list[str]) -> None:
    """Read *stream* to EOF, appending each decoded line to *tail*.

    Lines are decoded with undecodable bytes replaced and trailing
    whitespace (including the newline) removed. *tail* is mutated in place.
    """
    # ``readline`` yields b"" at EOF, which ends the loop.
    while raw := await stream.readline():
        text = raw.decode(errors="replace")
        tail.append(text.rstrip())
# Message used for the CrawlerBrowserError raised when the Playwright driver
# module cannot be imported.
_PLAYWRIGHT_MISSING_HINT = (
    "Chromium bootstrap requires the playwright Python package, "
    "which is bundled with the release binary and the lilbee[crawler] extra. "
    "Reinstall with 'pip install lilbee[crawler]' "
    "or download a fresh release binary."
)
def _resolve_playwright_runner() -> tuple[list[str], dict[str, str]]:
    """Return ``(argv_prefix, env)`` for invoking ``playwright install chromium``.

    The preferred runner is Playwright's bundled Node driver, invoked
    directly, so the call works under a pip install, ``uv tool install``, or
    a frozen (Nuitka onefile) binary. If computing the driver executable
    fails:

    * unfrozen builds fall back to ``[sys.executable, '-m', 'playwright']``
      with a copy of the current environment;
    * frozen builds re-raise, because there ``sys.executable`` is the lilbee
      exe and ``-m playwright`` would leak into typer.

    Raises:
        CrawlerBrowserError: The Playwright driver module cannot be imported.
        Exception: Whatever the driver lookup raised, in a frozen build.
    """
    try:
        from playwright._impl._driver import compute_driver_executable, get_driver_env
    except ImportError as exc:
        raise CrawlerBrowserError(_PLAYWRIGHT_MISSING_HINT) from exc

    try:
        driver_exe, driver_cli = compute_driver_executable()
    except Exception:
        if is_frozen():
            raise
        module_runner = [sys.executable, "-m", "playwright"]
        return module_runner, dict(os.environ)

    argv_prefix = [str(driver_exe), str(driver_cli)]
    return argv_prefix, dict(get_driver_env())
async def bootstrap_chromium(
    on_progress: DetailedProgressCallback | None = None,
) -> None:
    """Run ``playwright install chromium`` as a subprocess, emitting events.

    Short-circuits (emitting a successful ``setup_done``) when
    ``chromium_installed()`` is already True. Otherwise emits ``setup_start``
    before spawning, ``setup_progress`` for each recognizable progress line
    on stdout, and ``setup_done`` when the installer exits.

    Args:
        on_progress: Optional callback receiving ``(event_type, event)``.

    Raises:
        CrawlerBrowserError: The Playwright driver is unavailable, or the
            installer exited non-zero (the message carries the last stderr
            lines) so task workers route to FAILED cleanly.
        Exception: Any other failure while resolving the runner or spawning
            the subprocess is re-raised unchanged, after a failed
            ``setup_done`` has been emitted.
    """
    if chromium_installed():
        _emit_setup_done(on_progress, success=True, error=None)
        return

    _emit_setup_start(on_progress)

    try:
        runner, runner_env = _resolve_playwright_runner()
        proc = await asyncio.create_subprocess_exec(
            *runner,
            "install",
            "chromium",
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
            env=runner_env,
        )
    except Exception as exc:
        # Every ``setup_start`` must be paired with a ``setup_done``, also
        # when the runner cannot be resolved or the process cannot be spawned
        # (e.g. missing executable). The original exception still propagates.
        _emit_setup_done(on_progress, success=False, error=str(exc) or type(exc).__name__)
        raise
    # mypy narrowing: asyncio.create_subprocess_exec with PIPE guarantees
    # non-None streams at runtime; the asserts only satisfy the type checker.
    assert proc.stdout is not None  # noqa: S101
    assert proc.stderr is not None  # noqa: S101

    stderr_tail: list[str] = []
    try:
        # Drain both pipes concurrently so neither can fill up and stall the
        # child while the other is being read.
        await asyncio.gather(
            _drain_stdout_to_progress(proc.stdout, on_progress),
            _drain_stderr(proc.stderr, stderr_tail),
        )
        returncode = await proc.wait()
    finally:
        # Cancellation or a raising progress callback must not leave the
        # installer running in the background.
        if proc.returncode is None:
            try:
                proc.kill()
            except ProcessLookupError:
                pass  # The child exited between the check and the kill.

    if returncode != 0:
        tail = "\n".join(stderr_tail[-10:]) or f"exit code {returncode}"
        _emit_setup_done(on_progress, success=False, error=tail)
        raise CrawlerBrowserError(f"Chromium bootstrap failed (exit {returncode}): {tail}")

    _emit_setup_done(on_progress, success=True, error=None)