Coverage for src / lilbee / crawler / url_filter.py: 100%
36 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-06-28 01:01 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-06-28 01:01 +0000
1"""URL validation, blocked-network checks, and host-scope helpers."""
3from __future__ import annotations
5import ipaddress
6import socket
7from urllib.parse import urlparse
# Networks the crawler must never contact. Membership is tested per resolved
# address in validate_crawl_url(); an address only matches networks of its own
# IP version.
_BLOCKED_NETWORKS: tuple[ipaddress.IPv4Network | ipaddress.IPv6Network, ...] = (
    # IPv4 "this network"; 0.0.0.0 is commonly routed to the local host.
    ipaddress.ip_network("0.0.0.0/8"),
    ipaddress.ip_network("127.0.0.0/8"),  # IPv4 loopback
    ipaddress.ip_network("10.0.0.0/8"),  # IPv4 private (RFC 1918)
    ipaddress.ip_network("172.16.0.0/12"),  # IPv4 private (RFC 1918)
    ipaddress.ip_network("192.168.0.0/16"),  # IPv4 private (RFC 1918)
    ipaddress.ip_network("169.254.0.0/16"),  # IPv4 link-local
    ipaddress.ip_network("224.0.0.0/4"),  # IPv4 multicast (parity with ff00::/8)
    ipaddress.ip_network("::/128"),  # IPv6 unspecified address
    ipaddress.ip_network("::1/128"),  # IPv6 loopback
    ipaddress.ip_network("fe80::/10"),  # IPv6 link-local
    ipaddress.ip_network("fc00::/7"),  # IPv6 unique-local (ULA)
    ipaddress.ip_network("ff00::/8"),  # IPv6 multicast
)


def get_blocked_networks() -> tuple[ipaddress.IPv4Network | ipaddress.IPv6Network, ...]:
    """Return blocked network list. Override in tests via monkeypatch.

    Returns:
        The module-level tuple of IPv4/IPv6 networks that must not be crawled.
    """
    return _BLOCKED_NETWORKS
def is_url(value: str) -> bool:
    """Check if a string is an HTTP/HTTPS URL.

    The scheme is matched case-insensitively (``HTTP://`` is accepted), since
    URL schemes are case-insensitive; the rest of the string is not inspected.

    Args:
        value: Candidate string.

    Returns:
        True when ``value`` begins with ``http://`` or ``https://`` in any
        letter case, otherwise False.
    """
    # Only the first 8 characters ("https://") can matter; lowering just that
    # slice avoids copying a long string.
    return value[:8].lower().startswith(("http://", "https://"))
def validate_crawl_url(url: str) -> None:
    """Validate a URL for crawling. Raises ValueError for unsafe URLs.

    Rejects non-HTTP schemes and any host that resolves to an address inside
    one of the networks returned by ``get_blocked_networks()``. IPv4-mapped
    IPv6 addresses (``::ffff:a.b.c.d``) are also checked as their embedded
    IPv4 address, so they cannot be used to slip past an IPv4 entry.

    NOTE: this is a point-in-time check; the function only validates and does
    not return the resolved addresses to the caller.

    Args:
        url: The URL to validate.

    Raises:
        ValueError: If the scheme is not http/https, the URL has no hostname,
            the hostname cannot be resolved, or any resolved address falls in
            a blocked network.
    """
    parsed = urlparse(url)
    scheme = parsed.scheme.lower()
    if scheme not in ("http", "https"):
        raise ValueError(f"Only http:// and https:// URLs are allowed, got {scheme}://")

    hostname = parsed.hostname
    if not hostname:
        raise ValueError("URL has no hostname")

    try:
        addr_infos = socket.getaddrinfo(hostname, None)
    except socket.gaierror as exc:
        raise ValueError(f"Cannot resolve hostname: {hostname}") from exc

    # Looked up once per call (still through the accessor so tests can
    # monkeypatch it) instead of once per resolved address.
    blocked_networks = get_blocked_networks()

    for _family, _type, _proto, _canonname, sockaddr in addr_infos:
        # Link-local IPv6 results may carry a zone id ("fe80::1%eth0"); drop it
        # so parsing does not depend on the interpreter's zone-id support.
        ip = ipaddress.ip_address(str(sockaddr[0]).split("%", 1)[0])

        # An address only matches networks of its own IP version, so an
        # IPv4-mapped IPv6 address must also be tested as plain IPv4.
        candidates = [ip]
        mapped = getattr(ip, "ipv4_mapped", None)  # attribute exists on IPv6 only
        if mapped is not None:
            candidates.append(mapped)

        for network in blocked_networks:
            if any(candidate in network for candidate in candidates):
                raise ValueError(f"Crawling private/reserved IP {ip} is not allowed")
def require_valid_crawl_url(url: str) -> None:
    """Ensure ``url`` is an acceptable crawl target.

    First applies the ``is_url`` prefix check, then delegates to
    ``validate_crawl_url`` for the full validation.

    Raises:
        ValueError: If ``is_url`` rejects the string, or if
            ``validate_crawl_url`` raises.
    """
    if is_url(url):
        validate_crawl_url(url)
        return
    raise ValueError("URL must start with http:// or https://")
def host_in_scope(link_host: str, host: str, *, include_subdomains: bool) -> bool:
    """Return True when ``link_host`` should be followed during a whole-site crawl.

    Host names are compared case-insensitively. A link is in scope when its
    host equals ``host`` or, if ``include_subdomains`` is set, when it is a
    subdomain of ``host`` (ends with ``"." + host``).

    Args:
        link_host: Host of the discovered link.
        host: Host the crawl is anchored to.
        include_subdomains: Whether subdomains of ``host`` are in scope.

    Returns:
        True if the link's host is in scope, otherwise False. An empty
        ``link_host`` or ``host`` is never in scope.
    """
    # An empty crawl host would turn the suffix test into endswith("."),
    # matching any dot-terminated name; treat it as out of scope.
    if not link_host or not host:
        return False

    link_name = link_host.lower()
    root_name = host.lower()
    if link_name == root_name:
        return True
    # The leading dot keeps "badexample.com" from matching "example.com".
    return bool(include_subdomains) and link_name.endswith(f".{root_name}")