Coverage for src / lilbee / crawler / sitemap.py: 100%
39 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-06-28 01:01 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-06-28 01:01 +0000
1"""Best-effort ``/sitemap.xml`` lookup used as a progress-hint denominator."""
3from __future__ import annotations
5import re
6from http import HTTPStatus
7from urllib.parse import urlparse
9from lilbee.crawler.url_filter import host_in_scope, require_valid_crawl_url
10from lilbee.runtime.progress import CRAWL_TOTAL_UNKNOWN
12# Sitemap lookups are best-effort progress hints; never block the actual crawl.
13_SITEMAP_FETCH_TIMEOUT_SECONDS = 5.0
14_SITEMAP_MAX_URLS = 10_000
15_SITEMAP_URL_TAG_RE = re.compile(r"<loc>\s*([^<]+?)\s*</loc>", re.IGNORECASE)
def _fetch_sitemap_text(start_url: str) -> str | None:
    """Return the body of the host's ``/sitemap.xml``, or None on any failure.

    The sitemap URL is built from the scheme and netloc of ``start_url``; any
    path, query or fragment on ``start_url`` is ignored.

    Returns None when ``start_url`` cannot be parsed, when the request fails
    or times out, when the final response status is 400 or above, or when the
    final URL (after redirects) is rejected by ``require_valid_crawl_url``.
    """
    import httpx

    try:
        parsed = urlparse(start_url)
    except ValueError:
        # urlparse rejects some malformed netlocs (e.g. unbalanced IPv6
        # brackets); treat that like any other failed lookup.
        return None
    sitemap_url = f"{parsed.scheme}://{parsed.netloc}/sitemap.xml"
    try:
        resp = httpx.get(sitemap_url, timeout=_SITEMAP_FETCH_TIMEOUT_SECONDS, follow_redirects=True)
    except (httpx.HTTPError, httpx.InvalidURL, OSError):
        # httpx.InvalidURL does not derive from httpx.HTTPError, so it has to
        # be listed explicitly to keep this lookup best-effort.
        return None
    if resp.status_code >= HTTPStatus.BAD_REQUEST:
        return None
    # Redirects can steer the fetch to a private/metadata host (SSRF), so
    # re-validate the final resolved URL before trusting the body.
    try:
        require_valid_crawl_url(str(resp.url))
    except ValueError:
        return None
    return resp.text
def _count_sitemap_urls(start_url: str, *, include_subdomains: bool) -> int:
    """Best-effort count of URLs in the host's /sitemap.xml that match the crawl scope.

    Returns ``CRAWL_TOTAL_UNKNOWN`` when ``start_url`` has no usable host,
    when ``_fetch_sitemap_text`` yields no body, or when no ``<loc>`` entry is
    in scope. This is purely a progress-hint denominator, so correctness is
    not load-bearing; the count is capped at ``_SITEMAP_MAX_URLS``.

    Only fetches sitemap.xml directly at the root of the starting host; does
    not follow robots.txt references or nested sitemap indexes.

    Args:
        start_url: URL the crawl starts from; its hostname defines the scope.
        include_subdomains: Forwarded to ``host_in_scope`` for each entry.
    """
    try:
        host = (urlparse(start_url).hostname or "").lower()
    except ValueError:
        # An unparseable start URL simply means there is no hint to give.
        return CRAWL_TOTAL_UNKNOWN
    if not host:
        return CRAWL_TOTAL_UNKNOWN
    text = _fetch_sitemap_text(start_url)
    if text is None:
        return CRAWL_TOTAL_UNKNOWN

    count = 0
    for match in _SITEMAP_URL_TAG_RE.finditer(text):
        try:
            link_host = (urlparse(match.group(1).strip()).hostname or "").lower()
        except ValueError:
            # Sitemap content is untrusted; one malformed <loc> must not
            # abort the whole estimate, so skip it.
            continue
        if host_in_scope(link_host, host, include_subdomains=include_subdomains):
            count += 1
            if count >= _SITEMAP_MAX_URLS:
                break
    return count if count > 0 else CRAWL_TOTAL_UNKNOWN