Coverage for src/lilbee/crawler/fetcher.py: 100% (4 statements)
coverage.py v7.13.4, created at 2026-05-15 20:55 +0000
1"""Protocol contract for web-fetcher backends consumed by the crawl runner."""
3from __future__ import annotations
5from collections.abc import AsyncGenerator
6from typing import Any, Protocol, runtime_checkable
8from lilbee.crawler.models import (
9 CancelToken,
10 ConcurrencySpec,
11 FetchedPage,
12 FilterSpec,
13)
@runtime_checkable
class WebFetcher(Protocol):
    """Backend contract for fetching web pages as markdown.

    Implementations must honour ``CancelToken`` promptly inside
    ``fetch_recursive`` so the streaming loop in ``runner.py`` can
    abort without waiting for an in-flight batch to drain.

    Lifecycle ordering:

    1. ``__aenter__`` is called before any fetch method. Adapters with
       per-operation setup (e.g. crawl4ai opens a fresh
       ``AsyncWebCrawler`` inside each fetch method) may no-op here.
    2. ``fetch_single`` and ``fetch_recursive`` may be called multiple
       times during the same context; they must not assume fresh state.
    3. ``fetch_recursive`` returns an async generator; callers are
       expected to ``.aclose()`` it deterministically on early break.
    4. ``__aexit__`` tears the backend down and must succeed even if
       a prior fetch raised.
    """
    async def __aenter__(self) -> WebFetcher: ...

    async def __aexit__(
        self,
        exc_type: type[BaseException] | None,
        exc: BaseException | None,
        tb: Any,
    ) -> None: ...

    async def fetch_single(self, url: str, *, timeout: float) -> FetchedPage:
        """Fetch one URL and return its markdown + link set."""
        ...
    def fetch_recursive(
        self,
        seed_url: str,
        *,
        depth: int | None,
        max_pages: int | None,
        timeout: float,
        concurrency: ConcurrencySpec,
        filters: FilterSpec,
        cancel: CancelToken | None = None,
    ) -> AsyncGenerator[FetchedPage, None]:
        """Stream pages discovered by BFS from ``seed_url``.

        ``depth`` / ``max_pages``: positive int caps, or ``None`` for
        unbounded. Adapters translate ``None`` into whatever sentinel the
        underlying SDK wants (crawl4ai uses ``math.inf``).

        Returns an async generator so the orchestration layer can
        react per page (progress events, save-to-disk, cancel) and
        deterministically ``.aclose()`` the stream when it breaks
        out early (e.g. on ``max_pages`` hard cap).
        """
        ...
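
A minimal conforming backend makes the lifecycle contract above concrete. The sketch below is illustrative and not part of the module: it reuses the imports at the top of the file, and the ``FetchedPage(url=..., markdown=..., links=...)`` constructor and the ``cancel.is_set()`` predicate are assumed shapes for ``lilbee.crawler.models``, not confirmed APIs.

class FakeFetcher:
    """In-memory WebFetcher sketch; FetchedPage field names are assumed."""

    async def __aenter__(self) -> FakeFetcher:
        return self  # no per-context setup, as lifecycle item 1 allows

    async def __aexit__(self, exc_type, exc, tb) -> None:
        return None  # teardown succeeds even if a prior fetch raised (item 4)

    async def fetch_single(self, url: str, *, timeout: float) -> FetchedPage:
        return FetchedPage(url=url, markdown="# stub", links=set())  # assumed fields

    async def fetch_recursive(
        self,
        seed_url: str,
        *,
        depth: int | None,
        max_pages: int | None,
        timeout: float,
        concurrency: ConcurrencySpec,
        filters: FilterSpec,
        cancel: CancelToken | None = None,
    ) -> AsyncGenerator[FetchedPage, None]:
        # An async-generator function satisfies the sync ``def`` in the
        # Protocol: calling it returns an AsyncGenerator without awaiting.
        urls = [f"{seed_url}/a", f"{seed_url}/b", f"{seed_url}/c"]
        cap = len(urls) if max_pages is None else max_pages
        for url in urls[:cap]:
            if cancel is not None and cancel.is_set():  # assumed CancelToken API
                return  # honour cancellation promptly, per the class docstring
            yield await self.fetch_single(url, timeout=timeout)

assert isinstance(FakeFetcher(), WebFetcher)  # @runtime_checkable structural check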
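
On the consuming side, a runner-style loop drives the stream and closes it deterministically on early break, exactly as the ``fetch_recursive`` docstring requires. Also a sketch: the no-argument ``ConcurrencySpec()`` / ``FilterSpec()`` construction is an assumption about the models module, and ``take`` is a hypothetical helper, not code from ``runner.py``.

import asyncio

async def take(fetcher: WebFetcher, seed: str, n: int) -> list[FetchedPage]:
    pages: list[FetchedPage] = []
    async with fetcher:  # __aenter__ runs before any fetch method (item 1)
        stream = fetcher.fetch_recursive(
            seed,
            depth=None,      # None == unbounded, per the docstring
            max_pages=None,  # this caller enforces its own cap instead
            timeout=30.0,
            concurrency=ConcurrencySpec(),  # assumed no-arg construction
            filters=FilterSpec(),           # assumed no-arg construction
        )
        try:
            async for page in stream:
                pages.append(page)
                if len(pages) >= n:
                    break  # early break out of the stream...
        finally:
            await stream.aclose()  # ...so close the generator deterministically (item 3)
    return pages  # __aexit__ has run by now, even if a fetch raised (item 4)

pages = asyncio.run(take(FakeFetcher(), "https://example.test", n=2))
assert len(pages) == 2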