Coverage for src / lilbee / server / handlers / documents.py: 100%

42 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-05-15 20:55 +0000

1"""Document listing, deletion, and source-content handlers.""" 

2 

3from __future__ import annotations 

4 

5import mimetypes 

6 

7from lilbee.app.services import get_services 

8from lilbee.core.config import cfg 

9from lilbee.core.security import validate_path_within 

10from lilbee.server.models import ( 

11 DocumentInfo, 

12 DocumentListResponse, 

13 DocumentRemoveResponse, 

14 SourceContentResponse, 

15) 

16 

# On Windows, ``mimetypes`` is populated from the registry, which may lack a
# ``.md`` -> ``text/markdown`` entry. Register the pair explicitly at import
# time; registering the same pair again on a repeated import is harmless.
mimetypes.add_type("text/markdown", ".md", strict=True)

21 

22 

# Types that can carry script even within an "inline-rendered" category.
# Keep the deny narrow and explicit. Broadening this set is a security-relevant
# change: file an issue with the ``security`` label before adding entries.
# Entries must be lowercase and parameter-free: lookups are performed on the
# normalized media type (see ``_is_safe_for_inline_render``).
_RAW_INLINE_RENDER_DENY: frozenset[str] = frozenset(
    {
        "text/html",
        "text/javascript",
        "application/javascript",
        "application/xhtml+xml",
        "text/css",
        "image/svg+xml",
    }
)


def _is_safe_for_inline_render(content_type: str) -> bool:
    """Whether ``raw=1`` may serve this Content-Type as-is.

    Trusted categories (``text/*``, ``image/*``, ``application/pdf``) pass
    through, with named exceptions for types that embed executable script.
    Everything else is reported as unsafe so the caller can degrade it to
    ``application/octet-stream``; an attacker-renamed file (e.g.
    ``evil.html``) then cannot trick a browser into rendering it inline
    within the plugin origin.

    Args:
        content_type: A media type, optionally with parameters
            (``"text/plain; charset=utf-8"``) and in any letter case.

    Returns:
        ``True`` when the type may be served unchanged, ``False`` otherwise
        (including for an empty string).
    """
    # Media types compare case-insensitively and may carry ``;``-separated
    # parameters. Reduce to the bare lowercase ``type/subtype`` first so that
    # ``text/HTML`` or ``text/html; charset=utf-8`` cannot slip past the
    # exact-match deny list and then be accepted by the ``text/`` prefix rule.
    essence = content_type.partition(";")[0].strip().lower()
    if essence in _RAW_INLINE_RENDER_DENY:
        return False
    if essence == "application/pdf":
        return True
    return essence.startswith(("text/", "image/"))

52 

53 

async def delete_documents(
    names: list[str], *, delete_files: bool = False
) -> DocumentRemoveResponse:
    """Drop the named sources from the knowledge base and report the outcome.

    Args:
        names: Source names to remove.
        delete_files: Forwarded unchanged to the store's ``remove_documents``.

    Returns:
        A response carrying the store result's ``removed`` and ``not_found``
        values.
    """
    store = get_services().store
    outcome = store.remove_documents(names, delete_files=delete_files)
    return DocumentRemoveResponse(
        removed=outcome.removed,
        not_found=outcome.not_found,
    )

60 

61 

async def list_documents(
    search: str = "",
    limit: int = 50,
    offset: int = 0,
) -> DocumentListResponse:
    """Return one page of indexed documents plus pagination metadata.

    Filtering and paging are delegated to the store: the page comes from
    ``get_sources(search=..., limit=..., offset=...)`` and the total from
    ``count_sources(search=...)``, so this handler never loads the whole
    source listing itself.

    Args:
        search: Filename filter; an empty string is passed on as ``None``.
        limit: Page size forwarded to the store and echoed in the response.
        offset: Page start forwarded to the store and echoed in the response.

    Returns:
        The page of documents, the matching total, the echoed ``limit`` and
        ``offset``, and ``has_more``.
    """
    backend = get_services().store
    # The store receives ``None`` rather than "" when no filter is given.
    needle = search if search else None
    rows = backend.get_sources(search=needle, limit=limit, offset=offset)
    matching = backend.count_sources(search=needle)

    documents = []
    for row in rows:
        info = DocumentInfo(
            filename=row["filename"],
            chunk_count=row.get("chunk_count", 0),
            ingested_at=row.get("ingested_at", ""),
        )
        documents.append(info)

    # An empty page never advertises more results, whatever the total says.
    fetched = len(rows)
    more_available = fetched > 0 and (offset + fetched) < matching
    return DocumentListResponse(
        documents=documents,
        total=matching,
        limit=limit,
        offset=offset,
        has_more=more_available,
    )

92 

93 

async def get_source_content(
    source: str, raw: bool = False
) -> SourceContentResponse | tuple[bytes, str]:
    """Return a stored source file as JSON text or as raw bytes.

    Without *raw*, text types come back as a ``SourceContentResponse`` holding
    the file's text and parsed title. Non-text types come back with empty
    markdown so clients know to re-request with ``raw=1``. With *raw*, the
    result is ``(bytes, content_type)``.

    Args:
        source: Path of the file relative to ``cfg.documents_dir``.
        raw: When true, return the file bytes and the Content-Type to serve.

    Returns:
        A ``SourceContentResponse`` or a ``(bytes, content_type)`` tuple.

    Raises:
        ValueError: If *source* is empty or whitespace-only.
        FileNotFoundError: If the resolved path is not a regular file.

    NOTE(review): ``validate_path_within`` presumably raises when the path
    escapes ``documents_dir``; confirm the exception type with its definition.
    """
    # Function-scope import kept from the original (likely to avoid an import
    # cycle or import-time cost; confirm before hoisting it to module level).
    from lilbee.wiki.index import parse_title

    if not source or not source.strip():
        raise ValueError("source must not be empty")
    documents_dir = cfg.documents_dir
    resolved = validate_path_within(documents_dir / source, documents_dir)
    if not resolved.is_file():
        raise FileNotFoundError(source)

    content_type, encoding = mimetypes.guess_type(resolved.name)
    # ``guess_type`` reports compression separately: ``notes.md.gz`` yields
    # ("text/markdown", "gzip"). The bytes on disk are then NOT markdown, so
    # treat any encoded file as opaque binary rather than decoding compressed
    # data as UTF-8 or serving it inline under a text/image type.
    if content_type is None or encoding is not None:
        content_type = "application/octet-stream"

    if raw:
        # Cap raw responses to inline-render-safe categories; anything else
        # degrades to a binary download so attacker-renamed files (e.g.
        # evil.html) can't trick the embedding browser into running script
        # under our origin.
        served_type = (
            content_type if _is_safe_for_inline_render(content_type) else "application/octet-stream"
        )
        return resolved.read_bytes(), served_type

    if not content_type.startswith("text/"):
        return SourceContentResponse(markdown="", content_type=content_type, title=None)

    # ``errors="replace"`` keeps the request from failing on stray non-UTF-8
    # bytes in an otherwise textual file.
    text = resolved.read_text(encoding="utf-8", errors="replace")
    title = parse_title(text) or None
    return SourceContentResponse(markdown=text, content_type=content_type, title=title)

129 return SourceContentResponse(markdown=text, content_type=content_type, title=title)