Coverage for src / lilbee / retrieval / query / history_window.py: 100%

22 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-06-28 01:01 +0000

1"""Token-budget history windowing for chat conversations.""" 

2 

3from __future__ import annotations 

4 

5from collections.abc import Callable 

6from typing import TYPE_CHECKING 

7 

8if TYPE_CHECKING: 

9 from lilbee.retrieval.query.searcher import ChatMessage 

10 

# Conservative char->token estimator. Matches OpenAI's "4 chars ~= 1 token"
# rule of thumb for English; under-counts non-ASCII slightly but the
# budget already leaves headroom for that.
# Used as the integer divisor in estimate_text_tokens().
_CHARS_PER_TOKEN = 4

15 

16 

def estimate_text_tokens(text: str) -> int:
    """Approximate the token count of *text* from its character length.

    The length is integer-divided by ``_CHARS_PER_TOKEN`` and the result is
    floored at 1, so even an empty string is counted as one token.
    """
    approx = len(text) // _CHARS_PER_TOKEN
    return approx if approx >= 1 else 1

20 

21 

def estimate_tokens(message: ChatMessage) -> int:
    """Approximate the token count of a single chat message.

    Only the message's ``"content"`` value is measured; it is passed to
    ``estimate_text_tokens`` unchanged.
    """
    content = message["content"]
    return estimate_text_tokens(content)

25 

26 

def windowed_history(
    messages: list[ChatMessage],
    *,
    max_tokens: int,
    estimator: Callable[[ChatMessage], int] = estimate_tokens,
) -> list[ChatMessage]:
    """Return the suffix of *messages* whose token cost fits in *max_tokens*.

    Drops messages from the front in pairs so the window starts at a user
    message; never strands an orphan assistant reply with no preceding user
    turn for the model to anchor to. If a drop leaves a non-user message at
    the front (malformed input such as two consecutive user messages),
    trimming continues one message at a time until the front is a user
    message again, even when the budget is already satisfied.

    A drop is only started while more than two messages remain, so the
    result is never empty and the newest pair is kept even if it exceeds the
    budget on its own (caller decides what to do then).

    Args:
        messages: Conversation history, oldest first. Not mutated.
        max_tokens: Token budget. A value ``<= 0`` disables windowing and
            the full history is returned.
        estimator: Callable returning the token cost of one message.

    Returns:
        A new list containing the retained suffix of *messages*.
    """
    if max_tokens <= 0 or not messages:
        return list(messages)
    sizes = [estimator(m) for m in messages]
    total = sum(sizes)
    if total <= max_tokens:
        return list(messages)
    # No drop starts at or past this index. That keeps the newest
    # user/assistant pair even when it exceeds the budget on its own. The
    # caller decides what to do if the final pair is over-sized (typically:
    # send it anyway and let llama-cpp error if it must, rather than send
    # nothing at all).
    last_droppable = len(messages) - 2
    start = 0
    # Keep trimming while over budget OR while the window would open on a
    # non-user message; the second clause upholds the "starts at a user
    # message" guarantee on malformed input. On the first iteration the
    # budget clause is always true because of the early return above.
    while start < last_droppable and (
        total > max_tokens or messages[start]["role"] != "user"
    ):
        # Drop the front pair (user + assistant). If the front isn't a user
        # message (malformed input), drop one to realign.
        drop = 2 if messages[start]["role"] == "user" else 1
        # Slicing clamps at the end of the list, so no explicit bound check.
        total -= sum(sizes[start : start + drop])
        start += drop
    return list(messages[start:])