Coverage for src / lilbee / crawler / url_filter.py: 100%
36 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-05-15 20:55 +0000
1"""URL validation, blocked-network checks, and host-scope helpers."""
3from __future__ import annotations
5import ipaddress
6import socket
7from urllib.parse import urlparse
# Networks that must never be crawled: loopback, RFC 1918 private ranges,
# link-local, CGNAT, "this network", and their IPv6 equivalents. Used by
# validate_crawl_url() via get_blocked_networks() to reject SSRF targets.
_BLOCKED_NETWORKS: tuple[ipaddress.IPv4Network | ipaddress.IPv6Network, ...] = (
    ipaddress.ip_network("0.0.0.0/8"),        # "this network" (0.0.0.0 hits localhost)
    ipaddress.ip_network("127.0.0.0/8"),      # IPv4 loopback
    ipaddress.ip_network("10.0.0.0/8"),       # RFC 1918 private
    ipaddress.ip_network("172.16.0.0/12"),    # RFC 1918 private
    ipaddress.ip_network("192.168.0.0/16"),   # RFC 1918 private
    ipaddress.ip_network("169.254.0.0/16"),   # IPv4 link-local (incl. cloud metadata)
    ipaddress.ip_network("100.64.0.0/10"),    # CGNAT shared address space (RFC 6598)
    ipaddress.ip_network("::1/128"),          # IPv6 loopback
    ipaddress.ip_network("fe80::/10"),        # IPv6 link-local (docstring promises this)
    ipaddress.ip_network("fc00::/7"),         # IPv6 unique-local (RFC 4193)
)
def get_blocked_networks() -> tuple[ipaddress.IPv4Network | ipaddress.IPv6Network, ...]:
    """Return the module's blocked-network tuple.

    Kept as a function (rather than exposing the constant directly) so
    tests can monkeypatch it with a substitute list.
    """
    networks = _BLOCKED_NETWORKS
    return networks
def is_url(value: str) -> bool:
    """Return True when *value* begins with an http:// or https:// prefix."""
    for prefix in ("http://", "https://"):
        if value.startswith(prefix):
            return True
    return False
def validate_crawl_url(url: str) -> None:
    """Validate a URL for crawling. Raises ValueError for unsafe URLs.

    Rejects private IPs, loopback, link-local, and non-HTTP schemes.

    Args:
        url: Absolute URL to check.

    Raises:
        ValueError: if the scheme is not http/https, the hostname is
            missing or unresolvable, or any resolved address falls in a
            blocked network.
    """
    parsed = urlparse(url)
    scheme = parsed.scheme.lower()
    if scheme not in ("http", "https"):
        raise ValueError(f"Only http:// and https:// URLs are allowed, got {scheme}://")

    hostname = parsed.hostname
    if not hostname:
        raise ValueError("URL has no hostname")

    try:
        addr_infos = socket.getaddrinfo(hostname, None)
    except socket.gaierror as exc:
        raise ValueError(f"Cannot resolve hostname: {hostname}") from exc

    # NOTE(review): this resolution and the eventual fetch are separate DNS
    # lookups, so a rebinding attacker could swap records between them;
    # pinning the resolved IP at fetch time would close that gap.
    for _family, _type, _proto, _canonname, sockaddr in addr_infos:
        ip = ipaddress.ip_address(sockaddr[0])
        # Unwrap IPv4-mapped IPv6 (e.g. ::ffff:127.0.0.1): as an IPv6Address
        # it matches none of the IPv4 blocked networks, which would let
        # mapped forms of loopback/private ranges slip through.
        if isinstance(ip, ipaddress.IPv6Address) and ip.ipv4_mapped is not None:
            ip = ip.ipv4_mapped
        for network in get_blocked_networks():
            if ip in network:
                raise ValueError(f"Crawling private/reserved IP {ip} is not allowed")
def require_valid_crawl_url(url: str) -> None:
    """Ensure *url* is a safe, crawlable HTTP(S) URL.

    Raises:
        ValueError: when the URL lacks an http(s) prefix, or when
            ``validate_crawl_url`` rejects it as unsafe.
    """
    if is_url(url):
        validate_crawl_url(url)
        return
    raise ValueError("URL must start with http:// or https://")
def host_in_scope(link_host: str, host: str, *, include_subdomains: bool) -> bool:
    """Return True when ``link_host`` should be followed during a whole-site crawl."""
    if not link_host:
        return False
    if link_host == host:
        return True
    if not include_subdomains:
        return False
    # Require a dot boundary so "evilhost.com" never matches host "host.com".
    return link_host.endswith("." + host)