Coverage for src / lilbee / data / ingest / skip_marker.py: 100%
39 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-05-15 20:55 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-05-15 20:55 +0000
"""Sidecar record of files that produced no chunks, so a sync can skip them.

A file that yields zero chunks (Tesseract timeout, decode failure, no usable
text) gets a marker here keyed by the file hash that failed.
``_plan_file_changes`` treats a file whose current hash matches its marker as
unchanged, so the per-file extract cost (30-60s for a stubborn scanned PDF) is
paid once, not on every sync. The marker is a small JSON file in
``cfg.data_root``; editing the file changes its hash and re-arms it, and
``retry_skipped`` / ``force_rebuild`` drop the file from the marker set.
"""
12from __future__ import annotations
14import contextlib
15import json
16import logging
17import os
18from pathlib import Path
# Module-scoped logger, named after this module's import path.
log = logging.getLogger(__name__)

# Base name of the JSON sidecar that stores the skip markers.
SKIP_MARKER_FILENAME = "skipped_sources.json"
def _marker_path(data_root: Path) -> Path:
    """Return the location of the skip-marker file inside ``data_root``."""
    return data_root.joinpath(SKIP_MARKER_FILENAME)
def load_skip_markers(data_root: Path) -> dict[str, str]:
    """Load the filename → failed-hash map, or an empty dict on any read error.

    Args:
        data_root: Directory that holds the skip-marker sidecar file.

    Returns:
        The mapping stored in the marker file, keeping only entries whose
        value is a string. An empty dict is returned when the file is
        missing, cannot be read, is not valid UTF-8, is not valid JSON, or
        does not hold a JSON object at the top level.
    """
    path = _marker_path(data_root)
    if not path.exists():
        return {}
    try:
        raw = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError) as exc:
        # ValueError is the common parent of json.JSONDecodeError and of the
        # UnicodeDecodeError that read_text raises on non-UTF-8 bytes, so a
        # corrupted marker file degrades to "no markers" instead of crashing.
        log.debug("Skip-marker file unreadable, treating as empty: %s", exc)
        return {}
    if not isinstance(raw, dict):
        return {}
    # Values are already known to be str here; only the key is coerced.
    return {str(k): v for k, v in raw.items() if isinstance(v, str)}
def write_skip_markers(data_root: Path, markers: dict[str, str]) -> None:
    """Atomically overwrite the marker file with ``markers``.

    The JSON is written to a sibling ``.tmp`` file and then moved over the
    real file with ``os.replace``. This is best-effort: an ``OSError`` at any
    step is logged as a warning and swallowed, never raised to the caller.
    """
    target = _marker_path(data_root)
    scratch = target.with_name(target.name + ".tmp")
    try:
        data_root.mkdir(parents=True, exist_ok=True)
        payload = json.dumps(markers, sort_keys=True)
        scratch.write_text(payload, encoding="utf-8")
        os.replace(scratch, target)
    except OSError as err:
        log.warning("Failed to persist skip markers to %s: %s", target, err)
        # Do not leave a half-written temp file behind; ignore cleanup errors.
        try:
            scratch.unlink()
        except OSError:
            pass
def clear_skip_markers(data_root: Path) -> None:
    """Remove the marker file, doing nothing if it is not there.

    Any other ``OSError`` raised while deleting is logged at debug level and
    swallowed.
    """
    target = _marker_path(data_root)
    try:
        target.unlink()
    except FileNotFoundError:
        # Already absent: nothing to clear.
        return
    except OSError as err:
        log.debug("Could not remove skip-marker file %s: %s", target, err)