Coverage for src / lilbee / server / handlers / documents.py: 100%
42 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-05-15 20:55 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-05-15 20:55 +0000
1"""Document listing, deletion, and source-content handlers."""
3from __future__ import annotations
5import mimetypes
7from lilbee.app.services import get_services
8from lilbee.core.config import cfg
9from lilbee.core.security import validate_path_within
10from lilbee.server.models import (
11 DocumentInfo,
12 DocumentListResponse,
13 DocumentRemoveResponse,
14 SourceContentResponse,
15)
# On Windows, ``mimetypes`` is seeded from the registry, which does not always
# map ``.md`` to ``text/markdown``. Register the mapping once at import time;
# ``add_type`` is idempotent, so importing this module repeatedly is harmless.
mimetypes.add_type(type="text/markdown", ext=".md")
# Types that can carry script even within an "inline-rendered" category.
# Keep the deny narrow and explicit. Broadening this set is a security-relevant
# change: file an issue with the ``security`` label before adding entries.
_RAW_INLINE_RENDER_DENY: frozenset[str] = frozenset(
    {
        "text/html",
        "text/javascript",
        "application/javascript",
        "application/xhtml+xml",
        "text/css",
        "image/svg+xml",
    }
)


def _is_safe_for_inline_render(content_type: str) -> bool:
    """Whether ``raw=1`` may serve this Content-Type as-is.

    Trusted categories (``text/*``, ``image/*``, ``application/pdf``) pass
    through, with named exceptions for types that embed executable script.
    Everything else degrades to ``application/octet-stream`` so an attacker-
    renamed file (e.g. ``evil.html``) cannot trick a browser into rendering
    it inline within the plugin origin.

    The decision is made on the bare media type: any ``;parameter`` suffix
    and surrounding whitespace are dropped and the value is lower-cased
    first, so ``"text/html; charset=utf-8"`` or ``"Text/HTML"`` cannot slip
    past the deny set.

    Args:
        content_type: A Content-Type value, with or without parameters.

    Returns:
        ``True`` if the type may be served inline unchanged, else ``False``.
    """
    # Media types are case-insensitive and may carry parameters; matching the
    # raw string would let "text/html; charset=utf-8" miss the deny set and
    # then pass the "text/" prefix check below.
    media_type = content_type.partition(";")[0].strip().lower()
    if media_type in _RAW_INLINE_RENDER_DENY:
        return False
    if media_type == "application/pdf":
        return True
    return media_type.startswith(("text/", "image/"))
async def delete_documents(
    names: list[str], *, delete_files: bool = False
) -> DocumentRemoveResponse:
    """Drop the named sources from the knowledge base.

    Delegates to the store's ``remove_documents`` and reports back which
    names it removed and which it did not find. ``delete_files`` is handed
    to the store unchanged.
    """
    store = get_services().store
    outcome = store.remove_documents(names, delete_files=delete_files)
    return DocumentRemoveResponse(
        removed=outcome.removed,
        not_found=outcome.not_found,
    )
async def list_documents(
    search: str = "",
    limit: int = 50,
    offset: int = 0,
) -> DocumentListResponse:
    """List indexed documents one page at a time, optionally filtered.

    The filename filter and the pagination window are both handed to
    ``Store.get_sources(search=..., limit=..., offset=...)``, and the
    overall count is taken from ``Store.count_sources(search=...)``, so
    neither call has to materialize the full SOURCES table per request.
    """
    store = get_services().store
    # An empty search string means "no filter"; the store is given ``None``.
    name_filter = search if search else None
    rows = store.get_sources(search=name_filter, limit=limit, offset=offset)
    matching = store.count_sources(search=name_filter)

    documents = [
        DocumentInfo(
            filename=row["filename"],
            chunk_count=row.get("chunk_count", 0),
            ingested_at=row.get("ingested_at", ""),
        )
        for row in rows
    ]

    # More pages exist only if this page returned something and the window
    # has not yet reached the total number of matching sources.
    returned = len(rows)
    more_available = returned > 0 and (offset + returned) < matching

    return DocumentListResponse(
        documents=documents,
        total=matching,
        limit=limit,
        offset=offset,
        has_more=more_available,
    )
async def get_source_content(
    source: str, raw: bool = False
) -> SourceContentResponse | tuple[bytes, str]:
    """Fetch a stored source file from the configured documents directory.

    With *raw* set, the result is ``(bytes, content_type)``. Otherwise it is
    a JSON-style response whose ``markdown`` holds the file text for
    ``text/*`` types; for any other type ``markdown`` is empty, signalling
    that the client should re-request with ``raw=1``.

    Raises:
        ValueError: If *source* is empty or only whitespace.
        FileNotFoundError: If the resolved path is not an existing file.
    """
    # Imported at call time rather than at module import time.
    from lilbee.wiki.index import parse_title

    if not (source and source.strip()):
        raise ValueError("source must not be empty")

    root = cfg.documents_dir
    # NOTE(review): presumably rejects paths that escape ``root`` — confirm
    # against ``validate_path_within``.
    target = validate_path_within(root / source, root)
    if not target.is_file():
        raise FileNotFoundError(source)

    guessed, _encoding = mimetypes.guess_type(target.name)
    content_type = "application/octet-stream" if guessed is None else guessed

    if raw:
        # Only inline-render-safe categories keep their real type; anything
        # else is downgraded to a binary download so an attacker-renamed file
        # (e.g. evil.html) can't get the embedding browser to run script
        # under our origin.
        if _is_safe_for_inline_render(content_type):
            served_type = content_type
        else:
            served_type = "application/octet-stream"
        return target.read_bytes(), served_type

    if content_type.startswith("text/"):
        text = target.read_text(encoding="utf-8", errors="replace")
        return SourceContentResponse(
            markdown=text,
            content_type=content_type,
            title=parse_title(text) or None,
        )

    # Non-text file: no inline text is returned.
    return SourceContentResponse(markdown="", content_type=content_type, title=None)