Coverage for src/lilbee/server/handlers/crawl.py: 100%

22 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-05-15 20:55 +0000

1"""Crawl streaming handler.""" 

2 

3from __future__ import annotations 

4 

5import asyncio 

6from collections.abc import AsyncGenerator 

7from pathlib import Path 

8 

9from lilbee.server.handlers.sse import SseStream, sse_done, sse_error 

10 

11 

async def crawl_stream(
    url: str, depth: int | None = None, max_pages: int | None = None
) -> AsyncGenerator[str, None]:
    """Stream crawl progress to the client as SSE events.

    Emits crawl_start, crawl_page, crawl_done events, followed by a final
    done event carrying the list of files written. On failure a crawl_error
    event is emitted instead. A cancel event is set when the client
    disconnects so the crawl stops between pages.

    On first use, Chromium isn't installed yet. The stream inlines
    setup_start/progress/done events before the crawl begins so a stream
    consumer can render a matching 'setup' progress indicator.
    """
    stream = SseStream()

    async def _run() -> list[Path]:
        from lilbee.crawler import crawl_and_save

        # crawl_and_save performs the Chromium bootstrap itself on first
        # use, relaying setup_* events through the same on_progress
        # callback so they precede any crawl_* events on the stream.
        try:
            return await crawl_and_save(
                url,
                depth=depth,
                max_pages=max_pages,
                on_progress=stream.callback,
                cancel=stream.cancel,
            )
        finally:
            # Sentinel wakes the drain loop even when the crawl fails
            # or is cancelled.
            stream.queue.put_nowait(None)

    crawl = asyncio.create_task(_run())
    async for event in stream.drain(crawl, "Crawl stream"):
        yield event

    # Emit a terminal event only if the client is still connected and the
    # crawl actually ran to completion (guard clauses mirror the original
    # combined condition).
    if stream.cancel.is_set() or not crawl.done() or crawl.cancelled():
        return
    err = crawl.exception()
    if err is not None:
        yield sse_error(str(err))
        return
    files = crawl.result()
    yield sse_done({"files_written": [str(f) for f in files]})