Coverage for src / lilbee / crawler / models.py: 100%

34 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-06-28 01:01 +0000

1"""Backend-agnostic value types crossing the runner/fetcher seam.""" 

2 

3from __future__ import annotations 

4 

5import threading 

6from dataclasses import dataclass, field 

7from typing import TypeAlias 

8 

# Explicit "no page limit" for a crawl. Distinct from None, which means
# "unspecified, use the protective default cfg.crawl_safety_max_pages".
CRAWL_PAGES_UNLIMITED = 0

12 

13 

14@dataclass 

15class CrawlResult: 

16 """Outcome of crawling a single URL. 

17 

18 This is the high-level result surfaced to lilbee callers 

19 (CLI, MCP, HTTP, TUI). The adapter produces ``FetchedPage`` 

20 and the orchestration layer converts it to ``CrawlResult`` 

21 when returning up to the caller. 

22 """ 

23 

24 url: str 

25 markdown: str = "" 

26 success: bool = True 

27 error: str | None = None 

28 

29 

30@dataclass 

31class FetchedPage: 

32 """Single page produced by a ``WebFetcher`` backend. 

33 

34 Distinct from :class:`CrawlResult` so the adapter surface 

35 stays narrow and neutral: just the bytes we needed out of 

36 the underlying SDK's response object. 

37 """ 

38 

39 url: str 

40 markdown: str = "" 

41 success: bool = True 

42 error: str | None = None 

43 links: list[str] = field(default_factory=list) 

44 

45 

@dataclass
class ConcurrencySpec:
    """Backend-agnostic concurrency + rate-limit knobs.

    The crawl4ai adapter translates these into ``RateLimiter`` and
    ``SemaphoreDispatcher`` calls; a future adapter with its own
    BFS loop maps them onto ``asyncio.Semaphore`` + retry logic.

    Every field has a default, so ``ConcurrencySpec()`` is valid:
    ``semaphore_count`` is 1, ``retry_on_rate_limit`` is ``False`` and
    every delay, backoff and attempt value is zero.
    """

    # NOTE(review): units for the delay/backoff fields are not stated in
    # this module -- presumably seconds; confirm against the adapter.
    semaphore_count: int = 1
    mean_delay: float = 0.0
    max_delay_range: float = 0.0
    retry_on_rate_limit: bool = False
    retry_base_delay_min: float = 0.0
    retry_base_delay_max: float = 0.0
    retry_max_backoff: float = 0.0
    retry_max_attempts: int = 0

63 

64 

@dataclass
class FilterSpec:
    """Backend-agnostic filter settings applied to discovered links.

    Pure Python data; each adapter decides how to plug the settings
    into its own filter pipeline.

    Both fields are optional: ``exclude_patterns`` defaults to a fresh
    empty list per instance and ``include_subdomains`` to ``False``.
    """

    # NOTE(review): the pattern syntax (glob vs. regex) is not defined in
    # this module -- confirm against the adapter that consumes it.
    exclude_patterns: list[str] = field(default_factory=list)
    include_subdomains: bool = False

75 

76 

77CancelToken: TypeAlias = threading.Event 

78"""Cancellation handle the orchestration layer passes to a fetcher. 

79 

80An already-``set()`` event means "stop as soon as you can". The 

81crawl4ai adapter polls it in both its streaming loop and its BFS 

82strategy's ``should_cancel`` hook; a future adapter can poll it 

83in whatever granularity it supports. 

84"""