Coverage for src / lilbee / crawler / sitemap.py: 100%

39 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-06-28 01:01 +0000

1"""Best-effort ``/sitemap.xml`` lookup used as a progress-hint denominator.""" 

2 

3from __future__ import annotations 

4 

5import re 

6from http import HTTPStatus 

7from urllib.parse import urlparse 

8 

9from lilbee.crawler.url_filter import host_in_scope, require_valid_crawl_url 

10from lilbee.runtime.progress import CRAWL_TOTAL_UNKNOWN 

11 

# Sitemap lookups are best-effort progress hints; never block the actual crawl.
# Timeout, in seconds, handed to the sitemap HTTP request in _fetch_sitemap_text.
_SITEMAP_FETCH_TIMEOUT_SECONDS = 5.0
# Counting in _count_sitemap_urls stops once this many in-scope URLs are seen.
_SITEMAP_MAX_URLS = 10_000
# Captures the text inside each <loc>...</loc> element (tag match is
# case-insensitive, surrounding whitespace is left out of the capture group).
# This is a plain regex scan of the body, not an XML parse.
_SITEMAP_URL_TAG_RE = re.compile(r"<loc>\s*([^<]+?)\s*</loc>", re.IGNORECASE)

16 

17 

def _fetch_sitemap_text(start_url: str) -> str | None:
    """Return the ``/sitemap.xml`` body for *start_url*'s origin, or None on failure.

    The sitemap URL is built from the scheme and netloc of *start_url*; the
    path, query and fragment of *start_url* are ignored.

    Returns None when:
      * *start_url* cannot be parsed, or httpx rejects the derived URL;
      * the request fails (transport error, timeout, too many redirects, ...);
      * the final response status is 400 or above;
      * the final URL (after redirects) fails ``require_valid_crawl_url``.

    Never raises for these conditions: callers treat the sitemap purely as an
    optional hint.
    """
    # Function-local import, kept as in the original (presumably to defer
    # loading httpx until a sitemap lookup actually happens).
    import httpx

    try:
        parsed = urlparse(start_url)
    except ValueError:
        # urlparse rejects some malformed netlocs (e.g. unbalanced IPv6 brackets).
        return None
    sitemap_url = f"{parsed.scheme}://{parsed.netloc}/sitemap.xml"
    try:
        resp = httpx.get(sitemap_url, timeout=_SITEMAP_FETCH_TIMEOUT_SECONDS, follow_redirects=True)
    except (httpx.HTTPError, httpx.InvalidURL, OSError):
        # httpx.InvalidURL is not a subclass of httpx.HTTPError, so it has to be
        # listed explicitly for the "None on any fetch failure" contract to hold.
        return None
    if resp.status_code >= HTTPStatus.BAD_REQUEST:
        return None
    # Redirects can steer the fetch to a private/metadata host (SSRF), so
    # re-validate the final resolved URL before trusting the body.
    try:
        require_valid_crawl_url(str(resp.url))
    except ValueError:
        return None
    return resp.text

37 

38 

def _count_sitemap_urls(start_url: str, *, include_subdomains: bool) -> int:
    """Best-effort count of URLs in the host's /sitemap.xml that match the crawl scope.

    Returns ``CRAWL_TOTAL_UNKNOWN`` on any failure: unparseable or host-less
    *start_url*, missing sitemap, timeout, error status, a redirect whose
    final URL fails crawl-URL validation, or a sitemap with no in-scope
    entries. This is purely a progress-hint denominator, so correctness is
    not load-bearing.

    Only fetches sitemap.xml directly at the root of the starting host; does
    not follow robots.txt references or nested sitemap indexes.

    Args:
        start_url: URL the crawl starts from; its hostname defines the scope.
        include_subdomains: Forwarded to ``host_in_scope`` for every entry.

    Returns:
        The number of ``<loc>`` entries accepted by ``host_in_scope``, capped
        at ``_SITEMAP_MAX_URLS``, or ``CRAWL_TOTAL_UNKNOWN``.
    """
    try:
        host = (urlparse(start_url).hostname or "").lower()
    except ValueError:
        # urlparse rejects some malformed netlocs (e.g. unbalanced IPv6 brackets).
        return CRAWL_TOTAL_UNKNOWN
    if not host:
        return CRAWL_TOTAL_UNKNOWN
    text = _fetch_sitemap_text(start_url)
    if text is None:
        return CRAWL_TOTAL_UNKNOWN

    count = 0
    for match in _SITEMAP_URL_TAG_RE.finditer(text):
        try:
            link_host = (urlparse(match.group(1).strip()).hostname or "").lower()
        except ValueError:
            # <loc> values come from the remote server; one malformed entry
            # must not abort the whole hint, so skip it and keep counting.
            continue
        if host_in_scope(link_host, host, include_subdomains=include_subdomains):
            count += 1
            if count >= _SITEMAP_MAX_URLS:
                # Enough for a progress denominator; stop scanning large sitemaps.
                break
    return count if count > 0 else CRAWL_TOTAL_UNKNOWN