Coverage for src/lilbee/crawler/url_filter.py: 100%

36 statements  


1"""URL validation, blocked-network checks, and host-scope helpers.""" 

2 

3from __future__ import annotations 

4 

5import ipaddress 

6import socket 

7from urllib.parse import urlparse 

8 

9_BLOCKED_NETWORKS: tuple[ipaddress.IPv4Network | ipaddress.IPv6Network, ...] = ( 

10 ipaddress.ip_network("127.0.0.0/8"), 

11 ipaddress.ip_network("10.0.0.0/8"), 

12 ipaddress.ip_network("172.16.0.0/12"), 

13 ipaddress.ip_network("192.168.0.0/16"), 

14 ipaddress.ip_network("169.254.0.0/16"), 

15 ipaddress.ip_network("::1/128"), 

16) 

17 

18 

19def get_blocked_networks() -> tuple[ipaddress.IPv4Network | ipaddress.IPv6Network, ...]: 

20 """Return blocked network list. Override in tests via monkeypatch.""" 

21 return _BLOCKED_NETWORKS 
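
# Illustrative only, not part of the module: because the tuple is reached through
# get_blocked_networks(), a test can shrink or extend the blocked list without
# touching the constant, e.g. with pytest's monkeypatch fixture (the fixture
# wiring below is an assumption, not project code):
#
#     monkeypatch.setattr(url_filter, "get_blocked_networks", lambda: ())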



def is_url(value: str) -> bool:
    """Check if a string is an HTTP/HTTPS URL."""
    return value.startswith(("http://", "https://"))


def validate_crawl_url(url: str) -> None:
    """Validate a URL for crawling. Raises ValueError for unsafe URLs.

    Rejects private IPs, loopback, link-local, and non-HTTP schemes.
    """
    parsed = urlparse(url)
    scheme = parsed.scheme.lower()
    if scheme not in ("http", "https"):
        raise ValueError(f"Only http:// and https:// URLs are allowed, got {scheme}://")

    hostname = parsed.hostname
    if not hostname:
        raise ValueError("URL has no hostname")

    try:
        addr_infos = socket.getaddrinfo(hostname, None)
    except socket.gaierror as exc:
        raise ValueError(f"Cannot resolve hostname: {hostname}") from exc

    for _family, _type, _proto, _canonname, sockaddr in addr_infos:
        ip = ipaddress.ip_address(sockaddr[0])
        for network in get_blocked_networks():
            if ip in network:
                raise ValueError(f"Crawling private/reserved IP {ip} is not allowed")
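
# Illustrative behaviour, not part of the module. The first call assumes
# example.com resolves to a public address at lookup time:
#
#     validate_crawl_url("https://example.com/docs")  # passes
#     validate_crawl_url("http://127.0.0.1:8080/")    # ValueError: private/reserved IP
#     validate_crawl_url("ftp://example.com/")        # ValueError: scheme not allowed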



def require_valid_crawl_url(url: str) -> None:
    """Validate URL for crawling. Raises ValueError if invalid."""
    if not is_url(url):
        raise ValueError("URL must start with http:// or https://")
    validate_crawl_url(url)
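
# Illustrative behaviour, not part of the module. The is_url() pre-check gives a
# clearer error for schemeless input than the urlparse-based scheme check alone:
#
#     require_valid_crawl_url("example.com")  # ValueError: URL must start with http:// or https://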



def host_in_scope(link_host: str, host: str, *, include_subdomains: bool) -> bool:
    """Return True when ``link_host`` should be followed during a whole-site crawl."""
    if not link_host:
        return False
    if link_host == host:
        return True
    return include_subdomains and link_host.endswith(f".{host}")
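
# Illustrative behaviour, not part of the module. The leading dot in the suffix
# check keeps look-alike hosts such as "notexample.com" out of scope:
#
#     host_in_scope("example.com", "example.com", include_subdomains=False)       # True
#     host_in_scope("docs.example.com", "example.com", include_subdomains=True)   # True
#     host_in_scope("docs.example.com", "example.com", include_subdomains=False)  # False
#     host_in_scope("notexample.com", "example.com", include_subdomains=True)     # False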