Coverage for src/lilbee/crawler/sitemap.py: 100%

1"""Best-effort ``/sitemap.xml`` lookup used as a progress-hint denominator."""

3from __future__ import annotations

5import re

6from http import HTTPStatus

7from urllib.parse import urlparse

9from lilbee.crawler.url_filter import host_in_scope

10from lilbee.runtime.progress import CRAWL_TOTAL_UNKNOWN

12# Sitemap lookups are best-effort progress hints; never block the actual crawl.

13_SITEMAP_FETCH_TIMEOUT_SECONDS = 5.0

14_SITEMAP_MAX_URLS = 10_000

15_SITEMAP_URL_TAG_RE = re.compile(r"<loc>\s*([^<]+?)\s*</loc>", re.IGNORECASE)

18def _fetch_sitemap_text(start_url: str) -> str | None:

19 """Return sitemap.xml body or None on any fetch/status failure."""

20 import httpx

22 parsed = urlparse(start_url)

23 sitemap_url = f"{parsed.scheme}://{parsed.netloc}/sitemap.xml"

24 try:

25 resp = httpx.get(sitemap_url, timeout=_SITEMAP_FETCH_TIMEOUT_SECONDS, follow_redirects=True)

26 except (httpx.HTTPError, OSError):

27 return None

28 if resp.status_code >= HTTPStatus.BAD_REQUEST:

29 return None

30 return resp.text

def _hostname_of(url: str) -> str:
    """Return the lowercased hostname of *url*, or "" if absent or unparsable."""
    try:
        return (urlparse(url).hostname or "").lower()
    except ValueError:
        # urlparse raises on some malformed netlocs (e.g. unbalanced "[").
        return ""


def _count_sitemap_urls(start_url: str, *, include_subdomains: bool) -> int:
    """Best-effort count of URLs in the host's /sitemap.xml that match the crawl scope.

    Returns ``CRAWL_TOTAL_UNKNOWN`` when ``start_url`` has no usable hostname,
    when the sitemap cannot be fetched, or when no ``<loc>`` entry is in
    scope. This is purely a progress-hint denominator, so correctness is not
    load-bearing.

    Only fetches sitemap.xml directly at the root of the starting host; does
    not follow robots.txt references or nested sitemap indexes.

    Args:
        start_url: URL the crawl starts from; its hostname defines the scope.
        include_subdomains: Forwarded to ``host_in_scope`` for each entry.

    Returns:
        The number of in-scope ``<loc>`` entries, capped at
        ``_SITEMAP_MAX_URLS``, or ``CRAWL_TOTAL_UNKNOWN``.
    """
    host = _hostname_of(start_url)
    if not host:
        return CRAWL_TOTAL_UNKNOWN
    text = _fetch_sitemap_text(start_url)
    if text is None:
        return CRAWL_TOTAL_UNKNOWN

    count = 0
    for match in _SITEMAP_URL_TAG_RE.finditer(text):
        # Sitemap content is untrusted: a malformed <loc> yields an empty
        # hostname here instead of raising and aborting the whole count.
        link_host = _hostname_of(match.group(1).strip())
        if host_in_scope(link_host, host, include_subdomains=include_subdomains):
            count += 1
            if count >= _SITEMAP_MAX_URLS:
                break
    return count if count > 0 else CRAWL_TOTAL_UNKNOWN

Coverage for src/lilbee/crawler/sitemap.py: 100%