Coverage for src / lilbee / cli / commands / dataset.py: 100%

40 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-06-28 01:01 +0000

1"""Export and import the per-page text dataset.""" 

2 

3from __future__ import annotations 

4 

5import asyncio 

6from pathlib import Path 

7from typing import NoReturn 

8 

9import typer 

10 

11from lilbee.cli import theme 

12from lilbee.cli.app import apply_overrides, console, data_dir_option, global_option 

13from lilbee.cli.helpers import json_output 

14from lilbee.core.config import cfg 

15 

# Shared Typer parameter declarations, defined once at module level so the
# command signatures below can use them as defaults.

# Positional output path for the export command; falls back to pages.parquet.
_export_output_argument = typer.Argument(
    default=Path("pages.parquet"),
    help="Output file (suffix sets the format unless --format is given).",
)

# Positional dataset path for the import command; `...` marks it as required.
_import_dataset_argument = typer.Argument(
    default=...,
    help="Dataset file to import (parquet or jsonl).",
)

# --format is used by both commands; the empty-string default is passed
# through unchanged to the dataset layer.
_format_option = typer.Option(
    "",
    "--format",
    help="Dataset format: parquet or jsonl. Inferred from the file suffix when omitted.",
)

# --source narrows an export to a single source; None is the default.
_export_source_option = typer.Option(
    None,
    "--source",
    help="Export only this source (default: every source).",
)

34 

35 

def _fail(message: str) -> NoReturn:
    """Report *message* as an error and terminate with exit status 1.

    When ``cfg.json_mode`` is set the error is handed to ``json_output`` as
    ``{"error": message}``; otherwise it is printed on the console with an
    ``Error:`` prefix styled by ``theme.ERROR``.

    Raises:
        SystemExit: always, with code 1.
    """
    if not cfg.json_mode:
        tag = theme.ERROR
        console.print(f"[{tag}]Error:[/{tag}] {message}")
    else:
        json_output({"error": message})
    raise SystemExit(1)

43 

44 

def export_cmd(
    output: Path = _export_output_argument,
    fmt: str = _format_option,
    source: str | None = _export_source_option,
    data_dir: Path | None = data_dir_option,
    use_global: bool = global_option,
) -> None:
    """Write a per-page {source, page, text} dataset (drops vectors)."""
    # The docstring above is left untouched: CLI frameworks commonly surface
    # it as the command's help text.

    # Apply the data-dir / global overrides before touching the dataset layer.
    apply_overrides(data_dir=data_dir, use_global=use_global)

    # Function-scope import, kept after apply_overrides as in the original.
    # NOTE(review): presumably deferred to keep CLI start-up light - confirm.
    from lilbee.app.dataset import DatasetError, export_to_path

    try:
        result = export_to_path(output, fmt, source)
    except DatasetError as err:
        # _fail never returns (it raises SystemExit), so `result` is always
        # bound past this point.
        _fail(str(err))

    if not cfg.json_mode:
        label, accent = theme.LABEL, theme.ACCENT
        console.print(
            f"Wrote [{label}]{result.pages}[/{label}] pages from "
            f"[{label}]{result.sources}[/{label}] source(s) to "
            f"[{accent}]{output}[/{accent}]"
        )
        return

    # JSON mode: emit the summary's model_dump() instead of the styled line.
    json_output(result.model_dump())

69 

70 

def import_cmd(
    dataset: Path = _import_dataset_argument,
    fmt: str = _format_option,
    data_dir: Path | None = data_dir_option,
    use_global: bool = global_option,
) -> None:
    """Import a per-page text dataset, re-embedding it with the current model."""
    # The docstring above is left untouched: CLI frameworks commonly surface
    # it as the command's help text.

    # Apply the data-dir / global overrides before touching the dataset layer.
    apply_overrides(data_dir=data_dir, use_global=use_global)

    # Function-scope import, kept after apply_overrides as in the original.
    # NOTE(review): presumably deferred to keep CLI start-up light - confirm.
    from lilbee.app.dataset import DatasetError, import_from_path

    try:
        # import_from_path returns an awaitable; run it to completion here.
        result = asyncio.run(import_from_path(dataset, fmt))
    except DatasetError as err:
        # _fail never returns (it raises SystemExit), so `result` is always
        # bound past this point.
        _fail(str(err))

    if not cfg.json_mode:
        label = theme.LABEL
        console.print(
            f"Imported [{label}]{len(result.sources)}[/{label}] source(s) "
            f"([{label}]{result.pages}[/{label}] pages, "
            f"[{label}]{result.chunks}[/{label}] chunks)"
        )
        return

    # JSON mode: emit the summary's model_dump() instead of the styled line.
    json_output(result.model_dump())