Coverage for src / lilbee / cli / commands / dataset.py: 100%
40 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-06-28 01:01 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-06-28 01:01 +0000
1"""Export and import the per-page text dataset."""
3from __future__ import annotations
5import asyncio
6from pathlib import Path
7from typing import NoReturn
9import typer
11from lilbee.cli import theme
12from lilbee.cli.app import apply_overrides, console, data_dir_option, global_option
13from lilbee.cli.helpers import json_output
14from lilbee.core.config import cfg
# Shared Typer parameter declarations, defined once at module level and used
# as the default values in the command signatures below.

# Positional OUTPUT for the export command; defaults to ./pages.parquet.
_export_output_argument = typer.Argument(
    default=Path("pages.parquet"),
    help="Output file (suffix sets the format unless --format is given).",
)

# Positional DATASET for the import command. The Ellipsis default is Typer's
# marker for a required argument.
_import_dataset_argument = typer.Argument(
    default=...,
    help="Dataset file to import (parquet or jsonl).",
)

# --format is shared by both commands. Its default is the empty string, which
# is passed through unchanged to the dataset layer.
_format_option = typer.Option(
    "",
    "--format",
    help="Dataset format: parquet or jsonl. Inferred from the file suffix when omitted.",
)

# --source narrows an export to a single source; None leaves it unrestricted.
_export_source_option = typer.Option(
    None,
    "--source",
    help="Export only this source (default: every source).",
)
def _fail(message: str) -> NoReturn:
    """Report *message* as an error and terminate with exit status 1.

    When ``cfg.json_mode`` is set, the error is emitted through
    ``json_output`` as ``{"error": message}``; otherwise an ``Error:`` line
    styled with ``theme.ERROR`` is printed to the shared console.

    Raises:
        SystemExit: always, with code 1.
    """
    if not cfg.json_mode:
        error_style = theme.ERROR
        console.print(f"[{error_style}]Error:[/{error_style}] {message}")
    else:
        json_output({"error": message})
    raise SystemExit(1)
def export_cmd(
    output: Path = _export_output_argument,
    fmt: str = _format_option,
    source: str | None = _export_source_option,
    data_dir: Path | None = data_dir_option,
    use_global: bool = global_option,
) -> None:
    """Write a per-page {source, page, text} dataset (drops vectors)."""
    # NOTE(review): docstring left verbatim -- presumably this function is
    # registered as a Typer command elsewhere, in which case the docstring is
    # the user-visible --help text.

    # Apply the data-dir / global overrides before the dataset layer is
    # imported and used.
    apply_overrides(data_dir=data_dir, use_global=use_global)
    from lilbee.app.dataset import DatasetError, export_to_path

    try:
        result = export_to_path(output, fmt, source)
    except DatasetError as err:
        # _fail never returns, so `result` is always bound past this point.
        _fail(str(err))

    if not cfg.json_mode:
        label, accent = theme.LABEL, theme.ACCENT
        console.print(
            f"Wrote [{label}]{result.pages}[/{label}] pages from "
            f"[{label}]{result.sources}[/{label}] source(s) to "
            f"[{accent}]{output}[/{accent}]"
        )
        return

    # JSON mode: emit the summary's model_dump() instead of the styled line.
    json_output(result.model_dump())
def import_cmd(
    dataset: Path = _import_dataset_argument,
    fmt: str = _format_option,
    data_dir: Path | None = data_dir_option,
    use_global: bool = global_option,
) -> None:
    """Import a per-page text dataset, re-embedding it with the current model."""
    # NOTE(review): docstring left verbatim -- presumably this function is
    # registered as a Typer command elsewhere, in which case the docstring is
    # the user-visible --help text.

    # Apply the data-dir / global overrides before the dataset layer is
    # imported and used.
    apply_overrides(data_dir=data_dir, use_global=use_global)
    from lilbee.app.dataset import DatasetError, import_from_path

    try:
        # import_from_path is awaited via asyncio.run, so it is driven to
        # completion on a fresh event loop from this synchronous entry point.
        pending = import_from_path(dataset, fmt)
        result = asyncio.run(pending)
    except DatasetError as err:
        # _fail never returns, so `result` is always bound past this point.
        _fail(str(err))

    if not cfg.json_mode:
        label = theme.LABEL
        source_count = len(result.sources)
        console.print(
            f"Imported [{label}]{source_count}[/{label}] source(s) "
            f"([{label}]{result.pages}[/{label}] pages, "
            f"[{label}]{result.chunks}[/{label}] chunks)"
        )
        return

    # JSON mode: emit the summary's model_dump() instead of the styled line.
    json_output(result.model_dump())