Coverage for src/lilbee/crawler/fetcher.py: 100% (4 statements)
coverage.py v7.13.4, created at 2026-05-15 20:55 +0000
1"""Protocol contract for web-fetcher backends consumed by the crawl runner."""
3from __future__ import annotations
5from collections.abc import AsyncGenerator
6from typing import Any, Protocol, runtime_checkable
8from lilbee.crawler.models import (
9 CancelToken,
10 ConcurrencySpec,
11 FetchedPage,
12 FilterSpec,
13)
@runtime_checkable
class WebFetcher(Protocol):
    """Backend contract for fetching web pages as markdown.

    Implementations must honour ``CancelToken`` promptly inside
    ``fetch_recursive`` so the streaming loop in ``runner.py`` can
    abort without waiting for an in-flight batch to drain.

    Lifecycle ordering:

    1. ``__aenter__`` is called before any fetch method. Adapters with
       per-operation setup (e.g. crawl4ai opens a fresh
       ``AsyncWebCrawler`` inside each fetch method) may no-op here.
    2. ``fetch_single`` and ``fetch_recursive`` may be called multiple
       times during the same context; they must not assume fresh state.
    3. ``fetch_recursive`` returns an async generator; callers are
       expected to ``.aclose()`` it deterministically on early break.
    4. ``__aexit__`` tears the backend down and must succeed even if
       a prior fetch raised.
    """
    async def __aenter__(self) -> WebFetcher: ...

    async def __aexit__(
        self,
        exc_type: type[BaseException] | None,
        exc: BaseException | None,
        tb: Any,
    ) -> None: ...

    async def fetch_single(self, url: str, *, timeout: float) -> FetchedPage:
        """Fetch one URL and return its markdown + link set."""
        ...
    def fetch_recursive(
        self,
        seed_url: str,
        *,
        depth: int | None,
        max_pages: int | None,
        timeout: float,
        concurrency: ConcurrencySpec,
        filters: FilterSpec,
        cancel: CancelToken | None = None,
    ) -> AsyncGenerator[FetchedPage, None]:
        """Stream pages discovered by BFS from ``seed_url``.

        ``depth`` / ``max_pages``: positive int caps, or ``None`` for
        unbounded. Adapters translate ``None`` into whatever sentinel the
        underlying SDK wants (crawl4ai uses ``math.inf``).

        Returns an async generator so the orchestration layer can
        react per page (progress events, save-to-disk, cancel) and
        deterministically ``.aclose()`` the stream when it breaks
        out early (e.g. on ``max_pages`` hard cap).
        """
        ...
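
A minimal conforming backend makes the lifecycle contract above concrete. The sketch below is illustrative and not part of the module: it reuses the imports at the top of the file, and the ``FetchedPage(url=..., markdown=..., links=...)`` constructor and the ``cancel.is_set()`` predicate are assumed shapes for ``lilbee.crawler.models``, not confirmed APIs.

class FakeFetcher:
    """In-memory WebFetcher sketch; FetchedPage field names are assumed."""

    async def __aenter__(self) -> FakeFetcher:
        return self  # no per-context setup, as lifecycle item 1 allows

    async def __aexit__(self, exc_type, exc, tb) -> None:
        return None  # teardown succeeds even if a prior fetch raised (item 4)

    async def fetch_single(self, url: str, *, timeout: float) -> FetchedPage:
        return FetchedPage(url=url, markdown="# stub", links=set())  # assumed fields

    async def fetch_recursive(
        self,
        seed_url: str,
        *,
        depth: int | None,
        max_pages: int | None,
        timeout: float,
        concurrency: ConcurrencySpec,
        filters: FilterSpec,
        cancel: CancelToken | None = None,
    ) -> AsyncGenerator[FetchedPage, None]:
        # An async-generator function satisfies the sync ``def`` in the
        # Protocol: calling it returns an AsyncGenerator without awaiting.
        urls = [f"{seed_url}/a", f"{seed_url}/b", f"{seed_url}/c"]
        cap = len(urls) if max_pages is None else max_pages
        for url in urls[:cap]:
            if cancel is not None and cancel.is_set():  # assumed CancelToken API
                return  # honour cancellation promptly, per the class docstring
            yield await self.fetch_single(url, timeout=timeout)

assert isinstance(FakeFetcher(), WebFetcher)  # @runtime_checkable structural check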
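
On the consuming side, a runner-style loop drives the stream and closes it deterministically on early break, exactly as the ``fetch_recursive`` docstring requires. Also a sketch: the no-argument ``ConcurrencySpec()`` / ``FilterSpec()`` construction is an assumption about the models module, and ``take`` is a hypothetical helper, not code from ``runner.py``.

import asyncio

async def take(fetcher: WebFetcher, seed: str, n: int) -> list[FetchedPage]:
    pages: list[FetchedPage] = []
    async with fetcher:  # __aenter__ runs before any fetch method (item 1)
        stream = fetcher.fetch_recursive(
            seed,
            depth=None,      # None == unbounded, per the docstring
            max_pages=None,  # this caller enforces its own cap instead
            timeout=30.0,
            concurrency=ConcurrencySpec(),  # assumed no-arg construction
            filters=FilterSpec(),           # assumed no-arg construction
        )
        try:
            async for page in stream:
                pages.append(page)
                if len(pages) >= n:
                    break  # early break out of the stream...
        finally:
            await stream.aclose()  # ...so close the generator deterministically (item 3)
    return pages  # __aexit__ has run by now, even if a fetch raised (item 4)

pages = asyncio.run(take(FakeFetcher(), "https://example.test", n=2))
assert len(pages) == 2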