Coverage for src / lilbee / crawler / bootstrap.py: 100%
96 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-05-15 20:55 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-05-15 20:55 +0000
1"""Playwright Chromium detection and install."""
3from __future__ import annotations
5import asyncio
6import os
7import re
8import sys
9from pathlib import Path
11from lilbee.runtime.progress import (
12 DetailedProgressCallback,
13 EventType,
14 SetupDoneEvent,
15 SetupProgressEvent,
16 SetupStartEvent,
17)
class CrawlerBrowserError(RuntimeError):
    """Raised when Playwright is present but its Chromium binary is missing."""
class CrawlerBackendError(RuntimeError):
    """Raised when the optional ``crawler`` extra (crawl4ai) is not installed."""
# Component name attached to every setup event this module emits.
_CHROMIUM_COMPONENT = "chromium"

# Ballpark size of the Chromium download. Playwright bundles differ a little
# per platform, but this hands the UI a usable denominator until the real
# total is parsed out of stdout.
_CHROMIUM_ESTIMATE_MB = 180
_CHROMIUM_SIZE_ESTIMATE_BYTES = _CHROMIUM_ESTIMATE_MB << 20  # MiB -> bytes

# Lower-cased unit suffix -> number of bytes, for Playwright progress lines.
# Decimal-looking suffixes ("kb", "mb") are treated as binary multiples too.
_BYTE_UNIT_SCALE: dict[str, int] = {
    "b": 1,
    "kb": 1 << 10,
    "kib": 1 << 10,
    "mb": 1 << 20,
    "mib": 1 << 20,
}

# Playwright 1.58 reports the chromium download with lines such as
# ``|■■■■■■■■ | 10% of 162.3 MiB``: the percentage first, followed by
# "of <total> <unit>". Groups: (percent, total, unit).
_PROGRESS_LINE_RE = re.compile(
    r"(\d+)\s*%\s*of\s*(\d+(?:\.\d+)?)\s*(MiB|Mb|MB|KiB|KB|B)",
    flags=re.IGNORECASE,
)
def _browsers_cache_path() -> Path:
    """Resolve the directory under which Playwright keeps browser binaries.

    A non-empty ``PLAYWRIGHT_BROWSERS_PATH`` takes precedence (with ``~``
    expanded). Otherwise the platform's ``ms-playwright`` cache folder is
    returned: ``~/Library/Caches`` on macOS, ``%LOCALAPPDATA%`` (falling back
    to ``~/AppData/Local``) on Windows, and ``~/.cache`` everywhere else.
    """
    env_root = os.environ.get("PLAYWRIGHT_BROWSERS_PATH")
    if env_root:
        return Path(env_root).expanduser()

    home = Path.home()
    if sys.platform == "darwin":
        cache_parent = home / "Library" / "Caches"
    elif sys.platform == "win32":
        cache_parent = Path(os.environ.get("LOCALAPPDATA", str(home / "AppData" / "Local")))
    else:
        cache_parent = home / ".cache"
    return cache_parent / "ms-playwright"
def chromium_installed() -> bool:
    """Return True if at least one ``chromium-*`` install directory exists.

    Returns False when the browser cache root is missing or is not a
    directory (for example ``PLAYWRIGHT_BROWSERS_PATH`` pointing at a regular
    file), rather than failing while trying to list it.
    """
    root = _browsers_cache_path()
    # ``is_dir`` instead of ``exists``: listing a path that exists but is a
    # regular file would raise NotADirectoryError from ``iterdir``.
    if not root.is_dir():
        return False
    return any(p.is_dir() and p.name.startswith("chromium-") for p in root.iterdir())
def crawler_browsers_path() -> Path:
    """Return the root directory of the crawler's browser cache.

    This is the public face of the private path resolver: the HTTP status
    endpoint uses it to tell plugins where Chromium lives. Callers get the
    path without depending on Playwright's own directory layout, which is
    why the resolver itself is not exported.
    """
    cache_root = _browsers_cache_path()
    return cache_root
def _bytes_from_stdout(line: str) -> tuple[int, int] | None:
    """Parse a Playwright progress line into ``(downloaded_bytes, total_bytes)``.

    Recognizes the ``NN% of N.N MiB`` form that Playwright 1.58+ prints while
    downloading Chromium and returns None for anything else. Percentage and
    total come from the same line, so a result always carries both values.
    """
    found = _PROGRESS_LINE_RE.search(line)
    if found is None:
        return None
    percent_text, total_text, unit_text = found.groups()
    bytes_per_unit = _BYTE_UNIT_SCALE.get(unit_text.lower(), 1)
    total_bytes = int(float(total_text) * bytes_per_unit)
    # Downloaded amount is derived from the percentage and truncated to whole bytes.
    return int(total_bytes * int(percent_text) / 100), total_bytes
def _emit_setup_start(on_progress: DetailedProgressCallback | None) -> None:
    """Send the ``SETUP_START`` event for Chromium; no-op without a callback."""
    if on_progress is not None:
        start_event = SetupStartEvent(
            component=_CHROMIUM_COMPONENT,
            size_estimate_bytes=_CHROMIUM_SIZE_ESTIMATE_BYTES,
        )
        on_progress(EventType.SETUP_START, start_event)
def _emit_setup_done(
    on_progress: DetailedProgressCallback | None,
    *,
    success: bool,
    error: str | None,
) -> None:
    """Send the ``SETUP_DONE`` event for Chromium; no-op without a callback.

    ``success`` and ``error`` are forwarded unchanged into the event.
    """
    if on_progress is not None:
        done_event = SetupDoneEvent(
            component=_CHROMIUM_COMPONENT,
            success=success,
            error=error,
        )
        on_progress(EventType.SETUP_DONE, done_event)
async def _drain_stdout_to_progress(
    stream: asyncio.StreamReader,
    on_progress: DetailedProgressCallback | None,
) -> None:
    """Read ``stream`` to EOF, turning progress lines into ``SETUP_PROGRESS`` events.

    Every line is consumed so the pipe keeps draining even when it is not a
    progress line or when no callback was supplied; only lines recognized by
    ``_bytes_from_stdout`` are reported.
    """
    while raw_line := await stream.readline():
        text = raw_line.decode(errors="replace").rstrip()
        progress = _bytes_from_stdout(text)
        if progress is not None and on_progress is not None:
            done_bytes, all_bytes = progress
            event = SetupProgressEvent(
                component=_CHROMIUM_COMPONENT,
                downloaded_bytes=done_bytes,
                total_bytes=all_bytes,
                detail=text,
            )
            on_progress(EventType.SETUP_PROGRESS, event)
async def _drain_stderr(stream: asyncio.StreamReader, tail: list[str]) -> None:
    """Read ``stream`` to EOF, appending each decoded, right-stripped line to ``tail``.

    Undecodable bytes are replaced rather than raising.
    """
    while raw_line := await stream.readline():
        text = raw_line.decode(errors="replace")
        tail.append(text.rstrip())
# User-facing remediation text used when the playwright package cannot be imported.
_PLAYWRIGHT_MISSING_HINT = (
    "Chromium bootstrap requires the playwright Python package, "
    "which is bundled with the release binary and the lilbee[crawler] extra. "
    "Reinstall with 'pip install lilbee[crawler]' "
    "or download a fresh release binary."
)
def _resolve_playwright_runner() -> tuple[list[str], dict[str, str]]:
    """Return ``(argv_prefix, env)`` for invoking ``playwright install chromium``.

    The preferred runner is Playwright's bundled Node driver, invoked
    directly, so the command works from a pip install, ``uv tool install``,
    or a frozen (Nuitka onefile) binary. If locating the driver fails in an
    unfrozen build, ``[sys.executable, '-m', 'playwright']`` with the current
    environment is returned instead; a frozen build re-raises the failure.

    Raises:
        CrawlerBrowserError: The playwright package cannot be imported.
    """
    try:
        from playwright._impl._driver import compute_driver_executable, get_driver_env
    except ImportError as exc:
        raise CrawlerBrowserError(_PLAYWRIGHT_MISSING_HINT) from exc

    try:
        executable, cli_script = compute_driver_executable()
    except Exception:
        # A frozen binary has no usable ``python -m playwright`` to fall back on.
        if getattr(sys, "frozen", False):
            raise
        return [sys.executable, "-m", "playwright"], dict(os.environ)
    else:
        argv_prefix = [str(executable), str(cli_script)]
        return argv_prefix, dict(get_driver_env())
async def bootstrap_chromium(
    on_progress: DetailedProgressCallback | None = None,
) -> None:
    """Run ``playwright install chromium`` as a subprocess, emitting events.

    Short-circuits (emitting only a successful ``setup_done``) when
    ``chromium_installed()`` is already True. Otherwise emits ``setup_start``
    before spawning, ``setup_progress`` for each recognizable progress line
    on stdout, and ``setup_done`` on exit (``success=False`` plus the
    subprocess stderr tail on failure).

    Args:
        on_progress: Optional callback invoked as ``on_progress(event_type,
            event)``; ``None`` disables event reporting.

    Raises:
        CrawlerBrowserError: Playwright cannot be imported, or the install
            subprocess exits non-zero (the message carries the stderr tail),
            so task workers route to FAILED cleanly.
        Exception: Any other failure while resolving the runner or spawning
            the subprocess is re-raised unchanged, after a failed
            ``setup_done`` has been emitted to pair with ``setup_start``.
    """
    if chromium_installed():
        _emit_setup_done(on_progress, success=True, error=None)
        return

    _emit_setup_start(on_progress)

    try:
        runner, runner_env = _resolve_playwright_runner()
        proc = await asyncio.create_subprocess_exec(
            *runner,
            "install",
            "chromium",
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
            env=runner_env,
        )
    except Exception as exc:
        # Covers CrawlerBrowserError, a frozen-build driver lookup failure and
        # OSError from the spawn itself: listeners that saw ``setup_start``
        # always get a matching failed ``setup_done`` before the error
        # propagates with its original type.
        _emit_setup_done(on_progress, success=False, error=str(exc) or type(exc).__name__)
        raise
    # mypy narrowing: asyncio.create_subprocess_exec with PIPE guarantees
    # non-None streams at runtime; the asserts only satisfy the type checker.
    assert proc.stdout is not None  # noqa: S101
    assert proc.stderr is not None  # noqa: S101

    stderr_tail: list[str] = []
    try:
        # Drain both pipes concurrently so a full stderr pipe cannot stall
        # the child while stdout is being read (and vice versa).
        await asyncio.gather(
            _drain_stdout_to_progress(proc.stdout, on_progress),
            _drain_stderr(proc.stderr, stderr_tail),
        )
        returncode = await proc.wait()
    finally:
        # Reached with the child still running only on cancellation or when a
        # drain/callback raised: stop the download instead of orphaning it.
        if proc.returncode is None:
            try:
                proc.kill()
            except ProcessLookupError:
                # The child exited between the check and the kill.
                pass
            await proc.wait()

    if returncode != 0:
        tail = "\n".join(stderr_tail[-10:]) or f"exit code {returncode}"
        _emit_setup_done(on_progress, success=False, error=tail)
        raise CrawlerBrowserError(f"Chromium bootstrap failed (exit {returncode}): {tail}")

    _emit_setup_done(on_progress, success=True, error=None)