Coverage for src/lilbee/server/handlers/crawl.py: 100%

23 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-06-28 01:01 +0000

1"""Crawl streaming handler.""" 

2 

3from __future__ import annotations 

4 

5import asyncio 

6from collections.abc import AsyncGenerator 

7from pathlib import Path 

8 

9from lilbee.core.config.enums import CrawlRenderMode 

10from lilbee.server.handlers.sse import SseStream, sse_done, sse_error 

11 

12 

async def crawl_stream(
    url: str,
    depth: int | None = None,
    max_pages: int | None = None,
    render_mode: CrawlRenderMode | None = None,
) -> AsyncGenerator[str, None]:
    """Run a crawl in a background task and yield its progress as SSE strings.

    The crawl itself is delegated to ``crawl_and_save``; every progress
    notification it reports through the stream's callback (setup_* events for
    the first-use Chromium bootstrap, followed by the crawl_* events) is
    forwarded to the consumer in the order it was queued.

    Once the crawl task has finished, exactly one terminal event follows:

    * an error event built by ``sse_error`` carrying ``str(exc)`` if the task
      raised, or
    * a done event built by ``sse_done`` whose ``files_written`` entry lists
      the written paths as strings.

    No terminal event is emitted when the stream's cancel event is set, or
    when the task is unfinished or was cancelled. The same cancel event is
    handed to the crawler so that it can stop early.

    Args:
        url: Address to start crawling from.
        depth: Optional crawl depth, passed through to ``crawl_and_save``.
        max_pages: Optional page limit, passed through to ``crawl_and_save``.
        render_mode: Optional render mode, passed through to
            ``crawl_and_save``.

    Yields:
        SSE-formatted event strings.
    """
    stream = SseStream()

    async def _crawl_worker() -> list[Path]:
        # Imported lazily, inside the worker, rather than at module import.
        from lilbee.crawler import crawl_and_save

        # crawl_and_save performs the Chromium bootstrap on first use and
        # reports its setup_* events through the same on_progress callback,
        # so they reach the consumer ahead of any crawl_* events.
        try:
            written = await crawl_and_save(
                url,
                depth=depth,
                max_pages=max_pages,
                on_progress=stream.callback,
                cancel=stream.cancel,
                render_mode=render_mode,
            )
        finally:
            # Always enqueue the None sentinel, whether the crawl succeeded
            # or raised.
            stream.queue.put_nowait(None)
        return written

    worker = asyncio.create_task(_crawl_worker())
    async for chunk in stream.drain(worker, "Crawl stream"):
        yield chunk

    # A terminal event is only meaningful for a crawl that ran to completion
    # (successfully or not) without the stream having been cancelled.
    if stream.cancel.is_set() or not worker.done() or worker.cancelled():
        return

    failure = worker.exception()
    if failure is not None:
        yield sse_error(str(failure))
        return

    yield sse_done({"files_written": [str(path) for path in worker.result()]})

57 yield sse_done({"files_written": [str(p) for p in paths]})