Coverage for src / lilbee / data / ingest / skip_marker.py: 100%

39 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-05-15 20:55 +0000

1"""Sidecar record of files that produced no chunks, so a sync can skip them. 

2 

3A file that yields zero chunks (Tesseract timeout, decode failure, no usable 

4text) gets a marker here keyed by the file hash that failed. 

5``_plan_file_changes`` treats a file whose current hash matches its marker as 

6unchanged, so the per-file extract cost (30-60s for a stubborn scanned PDF) is 

7paid once, not on every sync. The marker is a small JSON file in 

8``cfg.data_root``; editing the file changes its hash and re-arms it, and 

9``retry_skipped`` / ``force_rebuild`` drop the file from the marker set. 

10""" 

11 

12from __future__ import annotations 

13 

14import contextlib 

15import json 

16import logging 

17import os 

18from pathlib import Path 

19 

# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# File name of the JSON sidecar; _marker_path() joins it onto the data root.
SKIP_MARKER_FILENAME = "skipped_sources.json"

23 

24 

def _marker_path(data_root: Path) -> Path:
    """Return the location of the skip-marker file inside *data_root*."""
    return data_root.joinpath(SKIP_MARKER_FILENAME)

27 

28 

def load_skip_markers(data_root: Path) -> dict[str, str]:
    """Load the filename -> failed-hash map from the marker file.

    Best-effort: a missing, unreadable, undecodable, or malformed marker file
    yields an empty dict rather than an exception.

    Args:
        data_root: Directory that holds the marker file.

    Returns:
        The string-valued entries of the stored JSON object; entries whose
        value is not a string are dropped. Empty when the file is absent,
        cannot be read or parsed, or does not hold a JSON object.
    """
    path = _marker_path(data_root)
    if not path.exists():
        return {}
    try:
        raw = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError) as exc:
        # ValueError covers both json.JSONDecodeError and the
        # UnicodeDecodeError that read_text raises on invalid UTF-8 bytes.
        log.debug("Skip-marker file unreadable, treating as empty: %s", exc)
        return {}
    if not isinstance(raw, dict):
        return {}
    # JSON object keys always decode to str, so only the values need checking.
    return {name: digest for name, digest in raw.items() if isinstance(digest, str)}

42 

43 

def write_skip_markers(data_root: Path, markers: dict[str, str]) -> None:
    """Persist *markers* as the new marker file, swapping it in atomically.

    The JSON is written to a sibling ``.tmp`` file first and then moved over
    the real marker file, so a reader never sees a half-written file.
    Best-effort: an ``OSError`` is logged as a warning and the temporary file
    is cleaned up; nothing is raised for it.
    """
    target = _marker_path(data_root)
    staging = target.with_name(target.name + ".tmp")
    try:
        data_root.mkdir(parents=True, exist_ok=True)
        payload = json.dumps(markers, sort_keys=True)
        staging.write_text(payload, encoding="utf-8")
        staging.replace(target)
    except OSError as exc:
        log.warning("Failed to persist skip markers to %s: %s", target, exc)
        # The staging file may be missing or undeletable; ignore either case.
        try:
            staging.unlink()
        except OSError:
            pass

56 

57 

def clear_skip_markers(data_root: Path) -> None:
    """Remove the marker file if it exists.

    A missing file is silently accepted; any other ``OSError`` is logged at
    debug level and not raised.
    """
    target = _marker_path(data_root)
    try:
        target.unlink()
    except FileNotFoundError:
        # Nothing to delete: already absent.
        return
    except OSError as exc:
        log.debug("Could not remove skip-marker file %s: %s", target, exc)