Coverage for src / lilbee / retrieval / query / history_window.py: 100%
22 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-06-28 01:01 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-06-28 01:01 +0000
1"""Token-budget history windowing for chat conversations."""
3from __future__ import annotations
5from collections.abc import Callable
6from typing import TYPE_CHECKING
8if TYPE_CHECKING:
9 from lilbee.retrieval.query.searcher import ChatMessage
# Rough characters-per-token ratio used by the estimators in this module.
# It follows the "4 chars ~= 1 token" rule of thumb OpenAI quotes for
# English text. Non-ASCII text is under-counted somewhat; the budget is
# assumed to leave headroom for that.
_CHARS_PER_TOKEN = 4


def estimate_text_tokens(text: str) -> int:
    """Approximate the token count of *text* from its character length.

    The length is integer-divided by ``_CHARS_PER_TOKEN`` and the result is
    floored at 1, so even an empty string is reported as one token.
    """
    approx = len(text) // _CHARS_PER_TOKEN
    return approx if approx >= 1 else 1
def estimate_tokens(message: ChatMessage) -> int:
    """Approximate the token cost of a single chat message.

    Only the message's ``"content"`` entry is measured; it is handed to
    :func:`estimate_text_tokens`. No other entry of the message contributes
    to the estimate.
    """
    content = message["content"]
    return estimate_text_tokens(content)
def windowed_history(
    messages: list[ChatMessage],
    *,
    max_tokens: int,
    estimator: Callable[[ChatMessage], int] = estimate_tokens,
) -> list[ChatMessage]:
    """Trim old messages off the front until the rest fits *max_tokens*.

    A new list holding a suffix of *messages* is returned; the input list is
    never modified. Trimming works from the oldest message forward:

    * a front message whose role is ``"user"`` is removed together with the
      message that follows it (for alternating history, its assistant reply),
      so an alternating user/assistant history still begins with a user
      message after trimming;
    * a front message with any other role (malformed input) is removed on
      its own, which realigns the window.

    Trimming never starts at the last two messages, so the newest exchange
    survives even when it alone is over budget -- the caller decides what to
    do in that case. A non-positive *max_tokens* or an empty *messages*
    disables trimming and yields a plain copy.

    Args:
        messages: Conversation history, oldest first.
        max_tokens: Token budget for the returned window.
        estimator: Callable giving the token cost of one message.

    Returns:
        The retained tail of *messages* as a fresh list.
    """
    if not messages or max_tokens <= 0:
        return list(messages)

    costs = [estimator(msg) for msg in messages]
    remaining = sum(costs)

    # Trimming may only begin at indices below this bound. That protects the
    # newest user/assistant pair: if it is over-sized by itself it is still
    # returned (typically the caller sends it anyway and lets llama-cpp
    # error if it must, rather than sending nothing at all).
    trim_limit = len(messages) - 2

    head = 0
    # When the history already fits, the loop body never runs and the whole
    # list is copied out unchanged.
    while remaining > max_tokens and head < trim_limit:
        # A user message leaves with its successor; anything else at the
        # front is treated as misaligned input and leaves alone.
        step = 1 if messages[head]["role"] != "user" else 2
        for cost in costs[head : head + step]:
            remaining -= cost
        head += step
    return list(messages[head:])