Coverage for src/lilbee/server/handlers/documents.py: 100%

1"""Document listing, deletion, and source-content handlers."""

3from __future__ import annotations

5import mimetypes

7from lilbee.app.services import get_services

8from lilbee.core.config import cfg

9from lilbee.core.security import validate_path_within

10from lilbee.server.models import (

11 DocumentInfo,

12 DocumentListResponse,

13 DocumentRemoveResponse,

14 SourceContentResponse,

15)

# On Windows, ``mimetypes`` reads its table from the registry, which may not
# define ``.md`` as ``text/markdown``. Pin the mapping at import time;
# ``add_type`` is idempotent, so repeated imports are safe.
mimetypes.add_type("text/markdown", ".md")

23# Types that can carry script even within an "inline-rendered" category.

24# Keep the deny narrow and explicit. Broadening this set is a security-relevant

25# change: file an issue with the ``security`` label before adding entries.

26_RAW_INLINE_RENDER_DENY: frozenset[str] = frozenset(

27 {

28 "text/html",

29 "text/javascript",

30 "application/javascript",

31 "application/xhtml+xml",

32 "text/css",

33 "image/svg+xml",

34 }

35)

38def _is_safe_for_inline_render(content_type: str) -> bool:

39 """Whether ``raw=1`` may serve this Content-Type as-is.

41 Trusted categories (``text/*``, ``image/*``, ``application/pdf``) pass

42 through, with named exceptions for types that embed executable script.

43 Everything else degrades to ``application/octet-stream`` so an attacker-

44 renamed file (e.g. ``evil.html``) cannot trick a browser into rendering

45 it inline within the plugin origin.

46 """

47 if content_type in _RAW_INLINE_RENDER_DENY:

48 return False

49 if content_type == "application/pdf":

50 return True

51 return content_type.startswith("text/") or content_type.startswith("image/")

async def delete_documents(
    names: list[str], *, delete_files: bool = False
) -> DocumentRemoveResponse:
    """Remove documents from the knowledge base by source name.

    Args:
        names: Source names to remove.
        delete_files: Forwarded unchanged to the store's
            ``remove_documents`` call.

    Returns:
        A response carrying the store's ``removed`` and ``not_found`` results.
    """
    store = get_services().store
    outcome = store.remove_documents(names, delete_files=delete_files)
    return DocumentRemoveResponse(
        removed=outcome.removed,
        not_found=outcome.not_found,
    )

async def list_documents(
    search: str = "",
    limit: int = 50,
    offset: int = 0,
) -> DocumentListResponse:
    """Return one page of indexed documents together with the overall total.

    The filename filter and the paging window are handed to
    ``Store.get_sources(search=..., limit=..., offset=...)``, and the total
    is taken from ``Store.count_sources(search=...)``, so neither call has
    to materialize the full SOURCES table per request.

    Args:
        search: Filename filter; an empty string is passed on as ``None``.
        limit: Page size forwarded to the store and echoed in the response.
        offset: Page start forwarded to the store and echoed in the response.

    Returns:
        The page of documents plus ``total``, ``limit``, ``offset`` and a
        ``has_more`` flag.
    """
    store = get_services().store
    name_filter = search if search else None
    rows = store.get_sources(search=name_filter, limit=limit, offset=offset)
    matching = store.count_sources(search=name_filter)

    documents = []
    for row in rows:
        documents.append(
            DocumentInfo(
                filename=row["filename"],
                chunk_count=row.get("chunk_count", 0),
                ingested_at=row.get("ingested_at", ""),
            )
        )

    # An empty page never advertises more results, even when the total is
    # larger than the requested offset.
    fetched = len(rows)
    more_available = fetched > 0 and (offset + fetched) < matching

    return DocumentListResponse(
        documents=documents,
        total=matching,
        limit=limit,
        offset=offset,
        has_more=more_available,
    )

async def get_source_content(
    source: str, raw: bool = False
) -> SourceContentResponse | tuple[bytes, str]:
    """Return a stored source file, either as JSON-ready text or as raw bytes.

    With *raw* false, text types come back as a ``SourceContentResponse``
    holding the file's text and parsed title; non-text types come back with
    empty markdown so clients know to re-request with ``raw=1``. With *raw*
    true the result is ``(bytes, content_type)``.

    Args:
        source: Path of the file relative to ``cfg.documents_dir``.
        raw: When true, return the file bytes and the Content-Type to serve.

    Raises:
        ValueError: If *source* is empty or whitespace-only.
        FileNotFoundError: If the resolved path is not a regular file.
    """
    from lilbee.wiki.index import parse_title

    if not (source and source.strip()):
        raise ValueError("source must not be empty")

    root = cfg.documents_dir
    # NOTE(review): presumably rejects paths that escape ``root`` — confirm
    # against ``validate_path_within``.
    target = validate_path_within(root / source, root)
    if not target.is_file():
        raise FileNotFoundError(source)

    guessed = mimetypes.guess_type(target.name)[0]
    mime = guessed if guessed is not None else "application/octet-stream"

    if raw:
        # Only inline-render-safe categories keep their real type; anything
        # else is served as a binary download so an attacker-renamed file
        # (e.g. evil.html) can't get the embedding browser to run script
        # under our origin.
        if _is_safe_for_inline_render(mime):
            return target.read_bytes(), mime
        return target.read_bytes(), "application/octet-stream"

    if mime.startswith("text/"):
        body = target.read_text(encoding="utf-8", errors="replace")
        return SourceContentResponse(
            markdown=body,
            content_type=mime,
            title=parse_title(body) or None,
        )

    return SourceContentResponse(markdown="", content_type=mime, title=None)

Coverage for src/lilbee/server/handlers/documents.py: 100%

42 statements