Coverage for src/lilbee/crawler/__init__.py: 100%

11 statements  

coverage.py v7.13.4, created at 2026-05-15 20:55 +0000

1"""Web crawling: fetch pages as markdown and save them to the documents directory.""" 

2 

3from __future__ import annotations 

4 

5import os 

6 

7from lilbee.crawler.bootstrap import ( 

8 CrawlerBackendError, 

9 CrawlerBrowserError, 

10 bootstrap_chromium, 

11 chromium_installed, 

12 crawler_browsers_path, 

13) 

14from lilbee.crawler.crawl4ai_fetcher import crawler_available 

15from lilbee.crawler.fetcher import WebFetcher 

16from lilbee.crawler.models import ( 

17 CancelToken, 

18 ConcurrencySpec, 

19 CrawlResult, 

20 FetchedPage, 

21 FilterSpec, 

22) 

23from lilbee.crawler.runner import ( 

24 crawl_and_save, 

25 crawl_recursive, 

26 crawl_single, 

27) 

28from lilbee.crawler.save import ( 

29 METADATA_FLUSH_INTERVAL, 

30 CrawlMeta, 

31 content_hash, 

32 load_crawl_metadata, 

33 save_crawl_metadata, 

34 url_to_filename, 

35) 

36from lilbee.crawler.url_filter import ( 

37 get_blocked_networks, 

38 is_url, 

39 require_valid_crawl_url, 

40 validate_crawl_url, 

41) 

42 

43__all__ = [ 

44 "METADATA_FLUSH_INTERVAL", 

45 "CancelToken", 

46 "ConcurrencySpec", 

47 "CrawlMeta", 

48 "CrawlResult", 

49 "CrawlerBackendError", 

50 "CrawlerBrowserError", 

51 "FetchedPage", 

52 "FilterSpec", 

53 "WebFetcher", 

54 "bootstrap_chromium", 

55 "chromium_installed", 

56 "content_hash", 

57 "crawl_and_save", 

58 "crawl_recursive", 

59 "crawl_single", 

60 "crawler_available", 

61 "crawler_browsers_path", 

62 "get_blocked_networks", 

63 "is_url", 

64 "load_crawl_metadata", 

65 "require_valid_crawl_url", 

66 "save_crawl_metadata", 

67 "url_to_filename", 

68 "validate_crawl_url", 

69] 

70 

71# Pin Playwright's browser cache so install and launch agree on Chromium's 

72# location, regardless of wheel vs frozen-binary layout. 

73os.environ.setdefault("PLAYWRIGHT_BROWSERS_PATH", str(crawler_browsers_path()))
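
The pin above relies on os.environ.setdefault only filling in a value when the variable is unset, so an explicitly exported PLAYWRIGHT_BROWSERS_PATH always wins over the package default. A minimal sketch of that behavior, using placeholder paths rather than the actual crawler_browsers_path() value:

import os

# With nothing set, setdefault installs the module's pin.
os.environ.pop("PLAYWRIGHT_BROWSERS_PATH", None)
os.environ.setdefault("PLAYWRIGHT_BROWSERS_PATH", "/opt/lilbee/browsers")  # placeholder path
assert os.environ["PLAYWRIGHT_BROWSERS_PATH"] == "/opt/lilbee/browsers"

# A value exported by the user or a parent process is left untouched,
# so the pin never overrides an explicit browser-cache location.
os.environ["PLAYWRIGHT_BROWSERS_PATH"] = "/home/user/.cache/ms-playwright"
os.environ.setdefault("PLAYWRIGHT_BROWSERS_PATH", "/opt/lilbee/browsers")
assert os.environ["PLAYWRIGHT_BROWSERS_PATH"] == "/home/user/.cache/ms-playwright"

Because the statement runs at package import time, the variable is in place before any Playwright install or launch goes through this package, which is presumably why it lives in __init__.py rather than deeper in the bootstrap module.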