Coverage for src / lilbee / server / handlers / documents.py: 100%

53 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-06-28 01:01 +0000

1"""Document listing, deletion, and source-content handlers.""" 

2 

3from __future__ import annotations 

4 

5import mimetypes 

6 

7from lilbee.app.services import get_services 

8from lilbee.core.config import cfg 

9from lilbee.core.security import validate_path_within 

10from lilbee.server.models import ( 

11 DocumentInfo, 

12 DocumentListResponse, 

13 DocumentRemoveResponse, 

14 SourceContentResponse, 

15) 

16 

# On Windows the ``mimetypes`` tables are populated from the registry, which
# may not map ``.md`` to ``text/markdown``. Register the pair explicitly when
# this module is imported; ``add_type`` is idempotent, so importing the module
# more than once is harmless.
mimetypes.add_type("text/markdown", ".md", strict=True)

21 

22 

# Types that can carry script even within an "inline-rendered" category.
# Keep the deny narrow and explicit. Broadening this set is a security-relevant
# change: file an issue with the ``security`` label before adding entries.
# Entries must be lowercase and parameter-free: candidates are reduced to that
# form before they are looked up here.
_RAW_INLINE_RENDER_DENY: frozenset[str] = frozenset(
    {
        "text/html",
        "text/javascript",
        "application/javascript",
        "application/xhtml+xml",
        "text/css",
        "image/svg+xml",
    }
)


def _is_safe_for_inline_render(content_type: str) -> bool:
    """Whether ``raw=1`` may serve this Content-Type as-is.

    Trusted categories (``text/*``, ``image/*``, ``application/pdf``) pass
    through, with named exceptions for types that embed executable script.
    Everything else degrades to ``application/octet-stream`` so an attacker-
    renamed file (e.g. ``evil.html``) cannot trick a browser into rendering
    it inline within the plugin origin.

    The comparison is made on the lowercase ``type/subtype`` part only, so a
    value such as ``Text/HTML`` or ``text/html; charset=utf-8`` is denied
    exactly like ``text/html``.

    Args:
        content_type: MIME type string, optionally followed by ``;``
            parameters.

    Returns:
        ``True`` when the type may be served inline unchanged, ``False``
        when the caller should fall back to a binary download.
    """
    # MIME type and subtype names are case-insensitive and parameters are not
    # part of the type's identity; an exact-match lookup on the raw string
    # would let "text/HTML" slip past the deny set via the "text/" prefix.
    essence = content_type.split(";", 1)[0].strip().lower()
    if essence in _RAW_INLINE_RENDER_DENY:
        return False
    if essence == "application/pdf":
        return True
    return essence.startswith(("text/", "image/"))

52 

53 

def _imported_source_markdown(source: str) -> str | None:
    """Concatenate the stored page texts of *source* in ascending page order.

    Pages are separated by a blank line. Returns ``None`` when the page-text
    store yields nothing for *source*.
    """
    page_rows = get_services().store.get_page_texts(source)
    if page_rows:
        in_page_order = sorted(page_rows, key=lambda entry: entry["page"])
        return "\n\n".join(entry["text"] for entry in in_page_order)
    return None

61 

62 

async def delete_documents(
    names: list[str], *, delete_files: bool = False
) -> DocumentRemoveResponse:
    """Remove the named sources from the knowledge base.

    *names* and *delete_files* are forwarded unchanged to the store's
    ``remove_documents``; the response carries that call's ``removed`` and
    ``not_found`` values.
    """
    store = get_services().store
    outcome = store.remove_documents(names, delete_files=delete_files)
    return DocumentRemoveResponse(
        removed=outcome.removed,
        not_found=outcome.not_found,
    )

69 

70 

async def list_documents(
    search: str = "",
    limit: int = 50,
    offset: int = 0,
) -> DocumentListResponse:
    """List indexed documents, one page at a time, optionally filtered.

    Both the filename filter and the paging window are handed to
    ``Store.get_sources(search=..., limit=..., offset=...)``, and the overall
    count is taken from ``Store.count_sources(search=...)``, so that neither
    call materializes the full SOURCES table per request. An empty *search*
    is passed to the store as ``None``.

    ``has_more`` is true only when this page is non-empty and its last row
    falls short of the reported total.
    """
    store = get_services().store
    name_filter = search if search else None
    rows = store.get_sources(search=name_filter, limit=limit, offset=offset)
    total = store.count_sources(search=name_filter)

    documents = [
        DocumentInfo(
            filename=row["filename"],
            chunk_count=row.get("chunk_count", 0),
            ingested_at=row.get("ingested_at", ""),
        )
        for row in rows
    ]
    returned = len(rows)
    more_available = returned > 0 and offset + returned < total

    return DocumentListResponse(
        documents=documents,
        total=total,
        limit=limit,
        offset=offset,
        has_more=more_available,
    )

101 

102 

async def get_source_content(
    source: str, raw: bool = False
) -> SourceContentResponse | tuple[bytes, str]:
    """Fetch a stored source, as JSON-ready markdown or as raw bytes.

    With *raw* false the result is a ``SourceContentResponse``: text types
    carry the file's text and a parsed title, while any other type carries
    empty markdown so clients know to re-request with ``raw=1``. With *raw*
    true the result is a ``(bytes, content_type)`` pair.

    Sources with no file on disk are looked up in the page-text store and
    served as ``text/markdown``.

    Raises:
        ValueError: *source* is empty or only whitespace.
        FileNotFoundError: there is neither a file nor stored page text for
            *source*.
    """
    from lilbee.wiki.index import parse_title

    if not (source and source.strip()):
        raise ValueError("source must not be empty")

    root = cfg.documents_dir
    # The joined path is checked against the documents directory before use.
    target = validate_path_within(root / source, root)

    if not target.is_file():
        # Imported sources have no file on disk; their text lives in the
        # page-text store.
        imported = _imported_source_markdown(source)
        if imported is None:
            raise FileNotFoundError(source)
        if not raw:
            return SourceContentResponse(
                markdown=imported,
                content_type="text/markdown",
                title=parse_title(imported) or None,
            )
        return imported.encode("utf-8"), "text/markdown"

    guessed, _encoding = mimetypes.guess_type(target.name)
    mime = "application/octet-stream" if guessed is None else guessed

    if raw:
        # Only inline-render-safe types keep their own Content-Type; the rest
        # are handed out as a binary download so an attacker-renamed file
        # (e.g. evil.html) can't get the embedding browser to run script
        # under our origin.
        if _is_safe_for_inline_render(mime):
            return target.read_bytes(), mime
        return target.read_bytes(), "application/octet-stream"

    if mime.startswith("text/"):
        body = target.read_text(encoding="utf-8", errors="replace")
        return SourceContentResponse(
            markdown=body,
            content_type=mime,
            title=parse_title(body) or None,
        )

    return SourceContentResponse(markdown="", content_type=mime, title=None)

145 title = parse_title(text) or None 

146 return SourceContentResponse(markdown=text, content_type=content_type, title=title)