Coverage for src / lilbee / data / ingest / discovery.py: 100%
45 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-05-15 20:55 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-05-15 20:55 +0000
1"""File discovery, classification, and hashing."""
3from __future__ import annotations
5import hashlib
6import logging
7import os
8from pathlib import Path
10from lilbee.core.config import cfg
11from lilbee.core.security import validate_path_within
12from lilbee.core.system import is_ignored_dir
13from lilbee.data.code_chunker import is_code_file
14from lilbee.data.ingest.types import DOCUMENT_EXTENSION_MAP
16log = logging.getLogger(__name__)
def file_hash(path: Path) -> str:
    """Return the SHA-256 hex digest of the file at *path*.

    The file is read in binary mode in fixed-size chunks, so the whole
    content is never held in memory at once.
    """
    digest = hashlib.sha256()
    with open(path, "rb") as stream:
        # read() returns b"" at end of file, which ends the loop.
        while chunk := stream.read(8192):
            digest.update(chunk)
    return digest.hexdigest()
def _relative_name(path: Path) -> str:
    """Return *path* relative to the configured documents dir.

    The result always uses forward slashes, so the name is the same
    string on every operating system.
    """
    relative = path.relative_to(cfg.documents_dir)
    return relative.as_posix()
def classify_file(path: Path) -> str | None:
    """Return the content type for *path*, or None if it is unsupported.

    The lower-cased file extension is looked up in the document extension
    map first; only when that lookup misses is the file tested as code.
    """
    extension = path.suffix.lower()
    mapped_type = DOCUMENT_EXTENSION_MAP.get(extension)
    if mapped_type is not None:
        return mapped_type
    return "code" if is_code_file(path) else None
def discover_files() -> dict[str, Path]:
    """Walk the documents dir and collect every supported file.

    Returns a mapping of {relative_name: path}, where relative_name is the
    forward-slash path relative to the documents dir. Returns an empty
    mapping when the documents dir does not exist.

    Skipped along the way: ignored directories (never descended into),
    files whose name starts with a dot, files that fail the
    within-documents-dir validation, and files of an unsupported type.
    """
    if not cfg.documents_dir.exists():
        return {}

    root_resolved = cfg.documents_dir.resolve()
    found: dict[str, Path] = {}
    for current_dir, subdirs, names in os.walk(cfg.documents_dir, topdown=True):
        # Slice-assign so os.walk sees the pruned list and skips ignored dirs.
        subdirs[:] = [sub for sub in subdirs if not is_ignored_dir(sub, cfg.ignore_dirs)]
        parent = Path(current_dir)
        for name in names:
            if name.startswith("."):
                continue
            candidate = parent / name
            try:
                validate_path_within(candidate, root_resolved)
            except ValueError:
                log.warning("Symlink escapes documents dir, skipping: %s", candidate)
            else:
                if classify_file(candidate) is not None:
                    found[_relative_name(candidate)] = candidate
    return found