From 29820e4cac6c5f226ea20a95de2dd03db12339ae Mon Sep 17 00:00:00 2001
From: young1lin <2550110827@qq.com>
Date: Mon, 25 May 2026 13:10:57 +0800
Subject: [PATCH] docs(chat): clarify cached-token semantics for explicit-cache providers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`cached_tokens` is reported by every OpenAI-compatible provider that
supports prompt caching, but how it becomes non-zero differs by mode:

- Implicit caching (OpenAI, Azure OpenAI, DeepSeek, …) populates the
  field automatically whenever a prompt prefix matches a previous
  request within the provider's cache TTL. No client-side opt-in.

- Explicit caching (Qwen on Aliyun, Anthropic Claude, …) only populates
  the field after the caller attaches `cache_control: {"type":
  "ephemeral"}` to the relevant message / content block. Until that
  opt-in is applied upstream of the request, the field stays zero even
  when the prefix is otherwise byte-stable.

Without this distinction documented, the previous commit reads as if
`TokenUsage.CachedTokens` will show non-zero values for Qwen / Claude
once this PR lands — which is not the case. The plumbing here is a
prerequisite (stable prefix via sorted tools) and a meter (visibility of
the field), but the explicit-cache opt-in itself is out of scope and
lives elsewhere.

Document this on `TokenUsage.CachedTokens` and the `cachedTokens` helper
so callers do not mistake observability for activation.
--- internal/models/chat/remote_api.go | 10 ++++++++++ internal/types/chat.go | 22 ++++++++++++++++++---- 2 files changed, 28 insertions(+), 4 deletions(-) diff --git a/internal/models/chat/remote_api.go b/internal/models/chat/remote_api.go index 10e0f319..e4ae8fba 100644 --- a/internal/models/chat/remote_api.go +++ b/internal/models/chat/remote_api.go @@ -1176,6 +1176,16 @@ func (c *RemoteAPIChat) GetAPIKey() string { // cachedTokens returns the cached prompt-token count from an OpenAI-compatible // usage detail block, or zero when the provider did not report one. Some // providers omit PromptTokensDetails entirely, so the nil guard is required. +// +// Note on provider semantics: +// - Implicit-cache providers (OpenAI, Azure OpenAI, DeepSeek, …) populate +// `cached_tokens` automatically whenever the prompt prefix matches a +// previous request — no caller opt-in is required. +// - Explicit-cache providers (Qwen on Aliyun, Anthropic Claude, …) only +// populate `cached_tokens` after the caller attaches `cache_control: +// {"type": "ephemeral"}` to the relevant message / content block. This +// helper still returns zero for those providers until that opt-in is +// applied upstream of the request. func cachedTokens(d *openai.PromptTokensDetails) int { if d == nil { return 0 diff --git a/internal/types/chat.go b/internal/types/chat.go index 412f5bb1..e13166ea 100644 --- a/internal/types/chat.go +++ b/internal/types/chat.go @@ -11,10 +11,24 @@ type TokenUsage struct { CompletionTokens int `json:"completion_tokens"` TotalTokens int `json:"total_tokens"` // CachedTokens is the subset of PromptTokens that hit a provider-side - // prompt cache (OpenAI prompt_tokens_details.cached_tokens, Qwen explicit - // caching, etc.). Zero when the provider does not report cache hits or - // when no cache was hit. Omitted from JSON when zero to keep payloads - // quiet for providers that never populate it. + // prompt cache. 
Populated from `usage.prompt_tokens_details.cached_tokens` + // in OpenAI-compatible responses. + // + // Whether this field is non-zero depends on the provider's caching mode: + // + // - Implicit caching (OpenAI, Azure OpenAI, DeepSeek, …) — automatic. + // The field populates whenever the prompt prefix matches a previous + // request within the provider's cache TTL. No client-side opt-in. + // + // - Explicit caching (Qwen on Aliyun, Anthropic Claude, …) — opt-in + // required. The caller must attach `cache_control: {"type": + // "ephemeral"}` to the relevant message or content block to make + // the provider create and read the cache. Until that opt-in is + // applied, CachedTokens stays zero even when the prompt prefix is + // otherwise byte-stable. + // + // Omitted from JSON when zero so payloads stay quiet for providers + // that never populate it. CachedTokens int `json:"cached_tokens,omitempty"` }