From 29820e4cac6c5f226ea20a95de2dd03db12339ae Mon Sep 17 00:00:00 2001
From: young1lin <2550110827@qq.com>
Date: Mon, 25 May 2026 13:10:57 +0800
Subject: [PATCH] docs(chat): clarify cached-token semantics for explicit-cache
 providers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`cached_tokens` is reported by every OpenAI-compatible provider that
supports prompt caching, but how it becomes non-zero differs by mode:

- Implicit caching (OpenAI, Azure OpenAI, DeepSeek, …) populates the
  field automatically whenever a prompt prefix matches a previous
  request within the provider's cache TTL. No client-side opt-in.

- Explicit caching (Qwen on Aliyun, Anthropic Claude, …) only
  populates the field after the caller attaches
  `cache_control: {"type": "ephemeral"}` to the relevant message or
  content block. Until that opt-in is applied to the outgoing request,
  the field stays zero even when the prefix is otherwise byte-stable.

Without this distinction documented, the previous commit reads as if
`TokenUsage.CachedTokens` will show non-zero values for Qwen / Claude
once this PR lands — which is not the case. The plumbing here is a
prerequisite (stable prefix via sorted tools) and a meter (visibility
of the field), but the explicit-cache opt-in itself is out of scope
and lives elsewhere.

Document this on `TokenUsage.CachedTokens` and the `cachedTokens`
helper so callers do not mistake observability for activation.
---
 internal/models/chat/remote_api.go | 10 ++++++++++
 internal/types/chat.go             | 22 ++++++++++++++++++----
 2 files changed, 28 insertions(+), 4 deletions(-)

diff --git a/internal/models/chat/remote_api.go b/internal/models/chat/remote_api.go
index 10e0f319..e4ae8fba 100644
--- a/internal/models/chat/remote_api.go
+++ b/internal/models/chat/remote_api.go
@@ -1176,6 +1176,16 @@ func (c *RemoteAPIChat) GetAPIKey() string {
 // cachedTokens returns the cached prompt-token count from an OpenAI-compatible
 // usage detail block, or zero when the provider did not report one. Some
 // providers omit PromptTokensDetails entirely, so the nil guard is required.
+//
+// Note on provider semantics:
+//   - Implicit-cache providers (OpenAI, Azure OpenAI, DeepSeek, …) populate
+//     `cached_tokens` automatically whenever the prompt prefix matches a
+//     previous request — no caller opt-in is required.
+//   - Explicit-cache providers (Qwen on Aliyun, Anthropic Claude, …) only
+//     populate `cached_tokens` after the caller attaches
+//     `cache_control: {"type": "ephemeral"}` to the relevant message or
+//     content block. Until that opt-in is applied to the outgoing request,
+//     this helper keeps returning zero for those providers.
 func cachedTokens(d *openai.PromptTokensDetails) int {
 	if d == nil {
 		return 0
diff --git a/internal/types/chat.go b/internal/types/chat.go
index 412f5bb1..e13166ea 100644
--- a/internal/types/chat.go
+++ b/internal/types/chat.go
@@ -11,10 +11,24 @@ type TokenUsage struct {
 	CompletionTokens int `json:"completion_tokens"`
 	TotalTokens      int `json:"total_tokens"`
 	// CachedTokens is the subset of PromptTokens that hit a provider-side
-	// prompt cache (OpenAI prompt_tokens_details.cached_tokens, Qwen explicit
-	// caching, etc.). Zero when the provider does not report cache hits or
-	// when no cache was hit. Omitted from JSON when zero to keep payloads
-	// quiet for providers that never populate it.
+	// prompt cache. Populated from `usage.prompt_tokens_details.cached_tokens`
+	// in OpenAI-compatible responses.
+	//
+	// Whether this field is non-zero depends on the provider's caching mode:
+	//
+	//   - Implicit caching (OpenAI, Azure OpenAI, DeepSeek, …) — automatic.
+	//     The field populates whenever the prompt prefix matches a previous
+	//     request within the provider's cache TTL. No client-side opt-in.
+	//
+	//   - Explicit caching (Qwen on Aliyun, Anthropic Claude, …) — opt-in
+	//     required. The caller must attach
+	//     `cache_control: {"type": "ephemeral"}` to the relevant message or
+	//     content block to make the provider create and read the cache.
+	//     Until that opt-in is applied, CachedTokens stays zero even when
+	//     the prompt prefix is otherwise byte-stable.
+	//
+	// Zero when nothing was cached or the provider reports no cache data;
+	// omitted from JSON when zero to keep payloads quiet for such providers.
 	CachedTokens int `json:"cached_tokens,omitempty"`
 }