
"""Protocol contract for web-fetcher backends consumed by the crawl runner."""

from __future__ import annotations

from collections.abc import AsyncGenerator
from typing import Any, Protocol, runtime_checkable

from lilbee.crawler.models import (
    CancelToken,
    ConcurrencySpec,
    FetchedPage,
    FilterSpec,
)


@runtime_checkable
class WebFetcher(Protocol):
    """Backend contract for fetching web pages as markdown.

    Implementations must honour ``CancelToken`` promptly inside
    ``fetch_recursive`` so the streaming loop in ``runner.py`` can
    abort without waiting for an in-flight batch to drain.

    Lifecycle ordering:

    1. ``__aenter__`` is called before any fetch method. Adapters with
       per-operation setup (e.g. crawl4ai opens a fresh
       ``AsyncWebCrawler`` inside each fetch method) may no-op here.
    2. ``fetch_single`` and ``fetch_recursive`` may be called multiple
       times during the same context; they must not assume fresh state.
    3. ``fetch_recursive`` returns an async generator; callers are
       expected to ``.aclose()`` it deterministically on early break.
    4. ``__aexit__`` tears the backend down and must succeed even if
       a prior fetch raised.
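
    A minimal caller-side sketch of that ordering, run inside a
    coroutine (``make_fetcher`` and the URLs are illustrative
    placeholders, not part of this contract)::

        async with make_fetcher() as fetcher:  # 1. __aenter__ first
            a = await fetcher.fetch_single("https://example.com", timeout=10.0)
            b = await fetcher.fetch_single("https://example.org", timeout=10.0)
            # 2. same context, multiple calls, no fresh-state assumption
        # 4. __aexit__ has torn the backend down, even if a fetch raised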

    """

36 

37 async def __aenter__(self) -> WebFetcher: ... 

38 

39 async def __aexit__( 

40 self, 

41 exc_type: type[BaseException] | None, 

42 exc: BaseException | None, 

43 tb: Any, 

44 ) -> None: ... 

45 

46 async def fetch_single(self, url: str, *, timeout: float) -> FetchedPage: 

47 """Fetch one URL and return its markdown + link set.""" 

48 ... 

49 

50 def fetch_recursive( 

51 self, 

52 seed_url: str, 

53 *, 

54 depth: int | None, 

55 max_pages: int | None, 

56 timeout: float, 

57 concurrency: ConcurrencySpec, 

58 filters: FilterSpec, 

59 cancel: CancelToken | None = None, 

60 ) -> AsyncGenerator[FetchedPage, None]: 

61 """Stream pages discovered by BFS from ``seed_url``. 

62 

63 ``depth`` / ``max_pages``: positive int caps, or ``None`` for 

64 unbounded. Adapters translate ``None`` into whatever sentinel the 

65 underlying SDK wants (crawl4ai uses ``math.inf``). 

66 

67 Returns an async generator so the orchestration layer can 

68 react per page (progress events, save-to-disk, cancel) and 

69 deterministically ``.aclose()`` the stream when it breaks 

70 out early (e.g. on ``max_pages`` hard cap). 
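
        A consumption sketch using ``contextlib.aclosing`` to get the
        deterministic cleanup described above (``fetcher``,
        ``concurrency_spec``, ``filter_spec``, and ``keep_going`` are
        illustrative placeholders)::

            from contextlib import aclosing

            async with aclosing(
                fetcher.fetch_recursive(
                    "https://example.com",
                    depth=2,
                    max_pages=50,
                    timeout=30.0,
                    concurrency=concurrency_spec,
                    filters=filter_spec,
                )
            ) as pages:
                async for page in pages:
                    if not keep_going(page):
                        break  # aclosing() guarantees .aclose() here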

        """
        ...