Coverage for src / lilbee / crawler / url_filter.py: 100%

36 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-06-28 01:01 +0000

1"""URL validation, blocked-network checks, and host-scope helpers.""" 

2 

3from __future__ import annotations 

4 

5import ipaddress 

6import socket 

7from urllib.parse import urlparse 

8 

# Address ranges the crawler refuses to contact. Every address a hostname
# resolves to is tested against each entry by ``validate_crawl_url``.
_BLOCKED_NETWORKS: tuple[ipaddress.IPv4Network | ipaddress.IPv6Network, ...] = (
    # IPv4 "this network"; 0.0.0.0 is commonly treated as the local host.
    ipaddress.ip_network("0.0.0.0/8"),
    ipaddress.ip_network("127.0.0.0/8"),  # IPv4 loopback
    ipaddress.ip_network("10.0.0.0/8"),  # IPv4 private
    ipaddress.ip_network("172.16.0.0/12"),  # IPv4 private
    ipaddress.ip_network("192.168.0.0/16"),  # IPv4 private
    ipaddress.ip_network("169.254.0.0/16"),  # IPv4 link-local
    ipaddress.ip_network("::/128"),  # IPv6 unspecified address
    ipaddress.ip_network("::1/128"),  # IPv6 loopback
    ipaddress.ip_network("fe80::/10"),  # IPv6 link-local
    ipaddress.ip_network("fc00::/7"),  # IPv6 unique-local (ULA)
    ipaddress.ip_network("ff00::/8"),  # IPv6 multicast
)

20 

21 

def get_blocked_networks() -> tuple[ipaddress.IPv4Network | ipaddress.IPv6Network, ...]:
    """Return the tuple of networks the crawler must not contact.

    This accessor is the override point for tests: monkeypatch it to
    substitute a different set of networks.
    """
    networks = _BLOCKED_NETWORKS
    return networks

25 

26 

def is_url(value: str) -> bool:
    """Return True if *value* begins with an HTTP or HTTPS URL scheme.

    The scheme is matched case-insensitively (``HTTP://`` counts), which
    agrees with ``validate_crawl_url`` lowercasing the parsed scheme before
    checking it.
    """
    # Only the first len("https://") == 8 characters can affect the result,
    # so lowercase just that prefix instead of the whole string.
    return value[:8].lower().startswith(("http://", "https://"))

30 

31 

def validate_crawl_url(url: str) -> None:
    """Validate a URL for crawling; raise ``ValueError`` if it is unsafe.

    The URL must use the ``http`` or ``https`` scheme and contain a hostname.
    The hostname is resolved with ``socket.getaddrinfo`` and every returned
    address is tested against ``get_blocked_networks()``. IPv4-mapped IPv6
    addresses (``::ffff:a.b.c.d``) are additionally tested in their IPv4
    form, so they cannot slip past the IPv4 entries.

    Raises:
        ValueError: if the scheme is not http/https, the URL has no
            hostname, the hostname cannot be resolved, or any resolved
            address lies inside a blocked network.
    """
    parsed = urlparse(url)
    scheme = parsed.scheme.lower()
    if scheme not in ("http", "https"):
        raise ValueError(f"Only http:// and https:// URLs are allowed, got {scheme}://")

    hostname = parsed.hostname
    if not hostname:
        raise ValueError("URL has no hostname")

    try:
        addr_infos = socket.getaddrinfo(hostname, None)
    except socket.gaierror as exc:
        raise ValueError(f"Cannot resolve hostname: {hostname}") from exc

    # Fetch the block list once per validation rather than once per address.
    blocked = get_blocked_networks()
    for *_ignored, sockaddr in addr_infos:
        ip = ipaddress.ip_address(sockaddr[0])
        candidates = [ip]
        # An IPv6 address is never "in" an IPv4 network, so an IPv4-mapped
        # address such as ::ffff:127.0.0.1 must also be checked as 127.0.0.1.
        # IPv4Address objects have no ``ipv4_mapped`` attribute, hence getattr.
        mapped = getattr(ip, "ipv4_mapped", None)
        if mapped is not None:
            candidates.append(mapped)
        if any(addr in network for addr in candidates for network in blocked):
            raise ValueError(f"Crawling private/reserved IP {ip} is not allowed")

55 

56 

def require_valid_crawl_url(url: str) -> None:
    """Ensure *url* is an acceptable crawl target.

    The cheap prefix check (``is_url``) runs first; only URLs that pass it
    are handed to ``validate_crawl_url`` for the full validation.

    Raises:
        ValueError: if the prefix check fails, or if ``validate_crawl_url``
            rejects the URL.
    """
    if is_url(url):
        validate_crawl_url(url)
        return
    raise ValueError("URL must start with http:// or https://")

62 

63 

def host_in_scope(link_host: str, host: str, *, include_subdomains: bool) -> bool:
    """Return True when ``link_host`` should be followed during a whole-site crawl.

    A link is in scope when its host equals ``host`` or, if
    ``include_subdomains`` is true, when it is a subdomain of ``host``.
    Host names are compared case-insensitively and a trailing root dot
    (``example.com.``) is ignored. An empty ``link_host`` or ``host`` is
    never in scope.
    """
    if not link_host or not host:
        return False

    # Normalise both names so equivalent spellings of one host compare equal.
    link_name = link_host.lower().rstrip(".")
    site_name = host.lower().rstrip(".")
    if not link_name or not site_name:
        return False

    if link_name == site_name:
        return True
    # The leading "." keeps "badexample.com" from matching "example.com".
    return bool(include_subdomains) and link_name.endswith(f".{site_name}")