Coverage for src / lilbee / crawler / models.py: 100%
34 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-06-28 01:01 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-06-28 01:01 +0000
"""Backend-agnostic value types crossing the runner/fetcher seam."""
from __future__ import annotations

import threading
from dataclasses import dataclass, field
from typing import TypeAlias
# Explicit "no page limit" for a crawl. Distinct from None, which means
# "unspecified, use the protective default cfg.crawl_safety_max_pages".
CRAWL_PAGES_UNLIMITED = 0
14@dataclass
15class CrawlResult:
16 """Outcome of crawling a single URL.
18 This is the high-level result surfaced to lilbee callers
19 (CLI, MCP, HTTP, TUI). The adapter produces ``FetchedPage``
20 and the orchestration layer converts it to ``CrawlResult``
21 when returning up to the caller.
22 """
24 url: str
25 markdown: str = ""
26 success: bool = True
27 error: str | None = None
30@dataclass
31class FetchedPage:
32 """Single page produced by a ``WebFetcher`` backend.
34 Distinct from :class:`CrawlResult` so the adapter surface
35 stays narrow and neutral: just the bytes we needed out of
36 the underlying SDK's response object.
37 """
39 url: str
40 markdown: str = ""
41 success: bool = True
42 error: str | None = None
43 links: list[str] = field(default_factory=list)
@dataclass
class ConcurrencySpec:
    """Backend-agnostic concurrency + rate-limit knobs.

    The crawl4ai adapter translates these into ``RateLimiter`` and
    ``SemaphoreDispatcher`` calls; a future adapter with its own
    BFS loop maps them onto ``asyncio.Semaphore`` + retry logic.

    Every knob has a default, so ``ConcurrencySpec()`` can be built
    with no arguments. This class performs no validation of the values.
    """

    # Defaults: a count of 1, every delay at 0.0, the retry flag off
    # and zero retry attempts.
    semaphore_count: int = 1
    mean_delay: float = 0.0
    max_delay_range: float = 0.0
    retry_on_rate_limit: bool = False
    retry_base_delay_min: float = 0.0
    retry_base_delay_max: float = 0.0
    retry_max_backoff: float = 0.0
    retry_max_attempts: int = 0
@dataclass
class FilterSpec:
    """Backend-agnostic filter settings applied to discovered links.

    Pure Python data; each adapter decides how to plug the settings
    into its own filter pipeline.

    ``exclude_patterns`` uses a per-instance default list, so specs
    never share one list object. ``include_subdomains`` defaults to
    ``False``.
    """

    exclude_patterns: list[str] = field(default_factory=list)
    include_subdomains: bool = False
77CancelToken: TypeAlias = threading.Event
78"""Cancellation handle the orchestration layer passes to a fetcher.
80An already-``set()`` event means "stop as soon as you can". The
81crawl4ai adapter polls it in both its streaming loop and its BFS
82strategy's ``should_cancel`` hook; a future adapter can poll it
83in whatever granularity it supports.
84"""