Coverage for src / lilbee / data / ingest / discovery.py: 100%

45 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-05-15 20:55 +0000

1"""File discovery, classification, and hashing.""" 

2 

3from __future__ import annotations 

4 

5import hashlib 

6import logging 

7import os 

8from pathlib import Path 

9 

10from lilbee.core.config import cfg 

11from lilbee.core.security import validate_path_within 

12from lilbee.core.system import is_ignored_dir 

13from lilbee.data.code_chunker import is_code_file 

14from lilbee.data.ingest.types import DOCUMENT_EXTENSION_MAP 

15 

16log = logging.getLogger(__name__) 

17 

18 

def file_hash(path: Path) -> str:
    """Return the SHA-256 hex digest of the file at *path*.

    The file is opened in binary mode and consumed in fixed-size chunks, so
    the whole content is never held in memory at once.
    """
    chunk_size = 8192
    digest = hashlib.sha256()
    with open(path, "rb") as stream:
        while True:
            chunk = stream.read(chunk_size)
            if chunk == b"":
                # An empty read marks end of file.
                break
            digest.update(chunk)
    return digest.hexdigest()

26 

27 

def _relative_name(path: Path) -> str:
    """Return *path* relative to ``cfg.documents_dir`` using forward slashes.

    The POSIX-style rendering keeps the resulting name identical regardless
    of the operating system's native path separator.
    """
    relative = path.relative_to(cfg.documents_dir)
    return relative.as_posix()

31 

32 

def classify_file(path: Path) -> str | None:
    """Map a file to its content type based on its extension.

    The lower-cased suffix is looked up in ``DOCUMENT_EXTENSION_MAP`` first.
    When that yields nothing, the file is reported as ``"code"`` if
    ``is_code_file`` accepts it. Returns ``None`` for unsupported files.
    """
    extension = path.suffix.lower()
    content_type = DOCUMENT_EXTENSION_MAP.get(extension)
    if content_type is None:
        # Not a known document extension: fall back to the code-file check.
        return "code" if is_code_file(path) else None
    return content_type

41 

42 

def discover_files() -> dict[str, Path]:
    """Walk the documents directory and collect every supported file.

    Returns a mapping from each file's forward-slash name relative to the
    documents directory to the path built from the walk root. An empty
    mapping is returned when the documents directory does not exist.

    Skipped along the way: directories rejected by ``is_ignored_dir``,
    file names starting with a dot, paths that ``validate_path_within``
    rejects with ``ValueError``, and files ``classify_file`` cannot classify.
    """
    base_dir = cfg.documents_dir
    if not base_dir.exists():
        return {}

    resolved_base = base_dir.resolve()
    found: dict[str, Path] = {}
    for dirpath, subdirs, names in os.walk(base_dir, topdown=True):
        # Prune in place: with topdown=True, os.walk only descends into the
        # entries that remain in this list.
        subdirs[:] = [
            subdir for subdir in subdirs if not is_ignored_dir(subdir, cfg.ignore_dirs)
        ]
        parent = Path(dirpath)
        for name in names:
            if name.startswith("."):
                continue
            candidate = parent / name
            try:
                validate_path_within(candidate, resolved_base)
            except ValueError:
                log.warning("Symlink escapes documents dir, skipping: %s", candidate)
                continue
            if classify_file(candidate) is None:
                continue
            found[_relative_name(candidate)] = candidate
    return found

62 return files