Coverage for src / lilbee / server / handlers / documents.py: 100%
53 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-06-28 01:01 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-06-28 01:01 +0000
1"""Document listing, deletion, and source-content handlers."""
3from __future__ import annotations
5import mimetypes
7from lilbee.app.services import get_services
8from lilbee.core.config import cfg
9from lilbee.core.security import validate_path_within
10from lilbee.server.models import (
11 DocumentInfo,
12 DocumentListResponse,
13 DocumentRemoveResponse,
14 SourceContentResponse,
15)
17# Windows mimetypes reads from the registry, which may not define ``.md``
18# as ``text/markdown``. Pin the mapping at import time; ``add_type`` is
19# idempotent so repeated imports are safe.
20mimetypes.add_type("text/markdown", ".md")
23# Types that can carry script even within an "inline-rendered" category.
24# Keep the deny narrow and explicit. Broadening this set is a security-relevant
25# change: file an issue with the ``security`` label before adding entries.
26_RAW_INLINE_RENDER_DENY: frozenset[str] = frozenset(
27 {
28 "text/html",
29 "text/javascript",
30 "application/javascript",
31 "application/xhtml+xml",
32 "text/css",
33 "image/svg+xml",
34 }
35)
38def _is_safe_for_inline_render(content_type: str) -> bool:
39 """Whether ``raw=1`` may serve this Content-Type as-is.
41 Trusted categories (``text/*``, ``image/*``, ``application/pdf``) pass
42 through, with named exceptions for types that embed executable script.
43 Everything else degrades to ``application/octet-stream`` so an attacker-
44 renamed file (e.g. ``evil.html``) cannot trick a browser into rendering
45 it inline within the plugin origin.
46 """
47 if content_type in _RAW_INLINE_RENDER_DENY:
48 return False
49 if content_type == "application/pdf":
50 return True
51 return content_type.startswith("text/") or content_type.startswith("image/")
def _imported_source_markdown(source: str) -> str | None:
    """Join the stored page texts for *source* into one markdown string.

    Rows come from the store's ``get_page_texts``; each row is read via its
    ``"page"`` and ``"text"`` keys. Texts are concatenated in ascending
    ``"page"`` order, separated by a blank line.

    Returns:
        The joined text, or ``None`` when the store has no rows for *source*.
    """
    page_rows = get_services().store.get_page_texts(source)
    if page_rows:
        in_page_order = sorted(page_rows, key=lambda entry: entry["page"])
        texts = [entry["text"] for entry in in_page_order]
        return "\n\n".join(texts)
    return None
async def delete_documents(
    names: list[str], *, delete_files: bool = False
) -> DocumentRemoveResponse:
    """Remove the named sources from the knowledge base.

    Delegates to the store's ``remove_documents`` and repackages its result.

    Args:
        names: Source names to remove.
        delete_files: Forwarded unchanged to the store's ``remove_documents``.

    Returns:
        A response carrying the store's ``removed`` and ``not_found`` values.
    """
    store = get_services().store
    outcome = store.remove_documents(names, delete_files=delete_files)
    return DocumentRemoveResponse(
        removed=outcome.removed,
        not_found=outcome.not_found,
    )
async def list_documents(
    search: str = "",
    limit: int = 50,
    offset: int = 0,
) -> DocumentListResponse:
    """Return one page of indexed documents plus pagination metadata.

    The filename filter and the paging window are handed to
    ``Store.get_sources(search=..., limit=..., offset=...)``, and the total
    is taken from ``Store.count_sources(search=...)``, so this handler never
    filters or slices the source list itself.

    Args:
        search: Filename filter; an empty string means "no filter" and is
            passed to the store as ``None``.
        limit: Page size, forwarded to the store and echoed in the response.
        offset: Page start, forwarded to the store and echoed in the response.

    Returns:
        The page of documents, the filtered total, the echoed ``limit`` and
        ``offset``, and ``has_more`` — true only when this page is non-empty
        and ends before the total.
    """
    name_filter = search if search else None
    backend = get_services().store
    rows = backend.get_sources(search=name_filter, limit=limit, offset=offset)
    matching = backend.count_sources(search=name_filter)

    infos = []
    for row in rows:
        infos.append(
            DocumentInfo(
                filename=row["filename"],
                chunk_count=row.get("chunk_count", 0),
                ingested_at=row.get("ingested_at", ""),
            )
        )

    fetched = len(rows)
    # An empty page never reports more results, even if the offset is
    # still below the total.
    more_available = fetched > 0 and offset + fetched < matching
    return DocumentListResponse(
        documents=infos,
        total=matching,
        limit=limit,
        offset=offset,
        has_more=more_available,
    )
async def get_source_content(
    source: str, raw: bool = False
) -> SourceContentResponse | tuple[bytes, str]:
    """Return the content of a stored source, as JSON or as raw bytes.

    With *raw* false the result is a ``SourceContentResponse``: text types
    carry the file's text as markdown, while non-text types carry empty
    markdown so clients know to re-request with ``raw=1``. With *raw* true
    the result is a ``(bytes, content_type)`` tuple.

    Sources with no file on disk fall back to the page-text store and are
    always reported as ``text/markdown``.

    Raises:
        ValueError: *source* is empty or only whitespace.
        FileNotFoundError: no file exists at the resolved path and the
            page-text store has nothing for *source*.
    """
    from lilbee.wiki.index import parse_title

    if not (source and source.strip()):
        raise ValueError("source must not be empty")

    root = cfg.documents_dir
    # NOTE(review): validate_path_within presumably rejects paths escaping
    # the documents directory — confirm against lilbee.core.security.
    target = validate_path_within(root / source, root)

    if target.is_file():
        guessed, _encoding = mimetypes.guess_type(target.name)
        mime = "application/octet-stream" if guessed is None else guessed

        if raw:
            # Cap raw responses to inline-render-safe categories; anything
            # else degrades to a binary download so attacker-renamed files
            # (e.g. evil.html) can't trick the embedding browser into
            # running script under our origin.
            if _is_safe_for_inline_render(mime):
                served_as = mime
            else:
                served_as = "application/octet-stream"
            return target.read_bytes(), served_as

        if mime.startswith("text/"):
            body = target.read_text(encoding="utf-8", errors="replace")
            heading = parse_title(body) or None
            return SourceContentResponse(markdown=body, content_type=mime, title=heading)

        return SourceContentResponse(markdown="", content_type=mime, title=None)

    # Imported sources have no file on disk; their text lives in the
    # page-text store.
    stored = _imported_source_markdown(source)
    if stored is None:
        raise FileNotFoundError(source)
    if raw:
        return stored.encode("utf-8"), "text/markdown"
    heading = parse_title(stored) or None
    return SourceContentResponse(markdown=stored, content_type="text/markdown", title=heading)