feat(datasource): add Yuque (语雀) connector

Syncs personal and team Yuque knowledge bases into WeKnora as Markdown,
with full and incremental sync + deletion detection. Mirrors the Feishu
connector pattern; fetches V2Doc.body directly (no async export step).

Config: api_token (required) + base_url (optional, defaults to public cloud).
401/403 wraps ErrInvalidCredentials; 429 honors Retry-After; filters to
status=1 Doc-type at list stage to conserve API quota. Body is trusted as
Markdown for format=markdown|lake — empirically verified against the v2
API: Lake XML lives separately in body_lake, body itself is the Markdown
view. Other formats (e.g. html) are skipped with a placeholder item so
they surface in sync logs instead of silently feeding HTML to the parser.

v2Doc.status accepts both int and string shapes — Yuque's OpenAPI spec
declares string but the live API returns int. sanitizeFileName truncates
at a UTF-8 rune boundary so multi-byte titles don't emit invalid UTF-8
that downstream filename validation rejects.

Also adds logger.SetOutput test hook, used by the token-redaction test.
This commit is contained in:
nullkey
2026-04-21 17:29:37 +08:00
committed by lyingbug
parent 6accdc2a60
commit 656b07027a
9 changed files with 1953 additions and 2 deletions

View File

@@ -52,6 +52,7 @@ import (
"github.com/Tencent/WeKnora/internal/datasource"
feishuConnector "github.com/Tencent/WeKnora/internal/datasource/connector/feishu"
notionConnector "github.com/Tencent/WeKnora/internal/datasource/connector/notion"
yuqueConnector "github.com/Tencent/WeKnora/internal/datasource/connector/yuque"
"github.com/Tencent/WeKnora/internal/event"
"github.com/Tencent/WeKnora/internal/handler"
"github.com/Tencent/WeKnora/internal/handler/session"
@@ -177,7 +178,6 @@ func BuildContainer(container *dig.Container) *dig.Container {
must(container.Provide(service.NewImageMultimodalService, dig.Name("imageMultimodal")))
must(container.Provide(service.NewKnowledgePostProcessService, dig.Name("knowledgePostProcess")))
must(container.Provide(service.NewMessageService))
must(container.Provide(service.NewMCPServiceService))
must(container.Provide(service.NewCustomAgentService))
@@ -1472,9 +1472,11 @@ func initConnectorRegistry() *datasource.ConnectorRegistry {
// Register Notion connector
_ = registry.Register(notionConnector.NewConnector())
// Register Yuque connector
_ = registry.Register(yuqueConnector.NewConnector())
// Future connectors will be registered here:
// _ = registry.Register(confluenceConnector.NewConnector())
// _ = registry.Register(yuqueConnector.NewConnector())
// _ = registry.Register(githubConnector.NewConnector())
return registry

View File

@@ -0,0 +1,296 @@
package yuque
import (
"context"
"encoding/json"
"fmt"
"io"
"net/http"
"net/url"
"sync"
"time"
"github.com/Tencent/WeKnora/internal/datasource"
"github.com/Tencent/WeKnora/internal/logger"
)
const (
defaultTimeout = 30 * time.Second
defaultPageSize = 100
userAgent = "WeKnora-Yuque-Connector/1.0"
)
// client wraps the Yuque Open API.
type client struct {
baseURL string
token string
httpClient *http.Client
// logTokenOnce ensures the redacted token identity is logged at most once
// per client lifetime (first real request), rather than on every call.
logTokenOnce sync.Once
}
// newClient constructs a client with a normalized base URL.
func newClient(cfg *Config) *client {
return &client{
baseURL: cfg.GetBaseURL(),
token: cfg.APIToken,
httpClient: &http.Client{Timeout: defaultTimeout},
}
}
// doRequest executes an authenticated request and decodes JSON, with retry logic
// for transient errors (429, 5xx, transport failures).
// The raw X-Auth-Token is never logged; only a redacted form is emitted once per
// client lifetime (not per request) to keep sync logs readable at thousand-doc scale.
func (c *client) doRequest(ctx context.Context, method, path string, result interface{}) error {
const (
maxRetries = 3
max5xxRetries = 1
retry5xxDelay = 2 * time.Second
)
var lastErr error
backoff := []time.Duration{2 * time.Second, 4 * time.Second, 8 * time.Second}
c.logTokenOnce.Do(func() {
logger.Infof(ctx, "[Yuque] client configured token=%s base=%s", redactToken(c.token), c.baseURL)
})
for attempt := 0; attempt <= maxRetries; attempt++ {
reqURL := c.baseURL + path
req, err := http.NewRequestWithContext(ctx, method, reqURL, nil)
if err != nil {
return fmt.Errorf("create request: %w", err)
}
req.Header.Set("X-Auth-Token", c.token)
req.Header.Set("User-Agent", userAgent)
req.Header.Set("Content-Type", "application/json; charset=utf-8")
if attempt == 0 {
logger.Infof(ctx, "[Yuque] %s %s", method, path)
} else {
logger.Infof(ctx, "[Yuque] %s %s (retry %d/%d)", method, path, attempt, maxRetries)
}
resp, err := c.httpClient.Do(req)
if err != nil {
lastErr = fmt.Errorf("execute request: %w", err)
if attempt < maxRetries {
if sErr := sleepCtx(ctx, backoff[attempt]); sErr != nil {
return sErr
}
continue
}
return lastErr
}
body, readErr := io.ReadAll(resp.Body)
resp.Body.Close()
if readErr != nil {
lastErr = fmt.Errorf("read response body: %w", readErr)
if attempt < maxRetries {
if sErr := sleepCtx(ctx, backoff[attempt]); sErr != nil {
return sErr
}
continue
}
return lastErr
}
bodyPreview := truncate(string(body), 500)
logger.Infof(ctx, "[Yuque] %s %s → status=%d bodyLen=%d body=%s",
method, path, resp.StatusCode, len(body), bodyPreview)
if resp.StatusCode == 429 {
wait := parseRetryAfter(resp.Header.Get("Retry-After"), backoff[min(attempt, len(backoff)-1)])
lastErr = fmt.Errorf("yuque rate limited: status=429 body=%s", bodyPreview)
if attempt < maxRetries {
if sErr := sleepCtx(ctx, wait); sErr != nil {
return sErr
}
continue
}
return lastErr
}
if resp.StatusCode >= 500 && resp.StatusCode < 600 {
lastErr = fmt.Errorf("yuque server error: status=%d body=%s", resp.StatusCode, bodyPreview)
if attempt < max5xxRetries {
if sErr := sleepCtx(ctx, retry5xxDelay); sErr != nil {
return sErr
}
continue
}
return lastErr
}
// 401/403 → surface as ErrInvalidCredentials so DataSourceService can
// distinguish bad-token from transient failures and auto-flag the source.
if resp.StatusCode == http.StatusUnauthorized || resp.StatusCode == http.StatusForbidden {
return fmt.Errorf("%w: status=%d body=%s", datasource.ErrInvalidCredentials, resp.StatusCode, bodyPreview)
}
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
var apiErr apiErrorBody
_ = json.Unmarshal(body, &apiErr)
if apiErr.Message != "" {
return fmt.Errorf("yuque api error: status=%d msg=%s", resp.StatusCode, apiErr.Message)
}
return fmt.Errorf("yuque api error: status=%d body=%s", resp.StatusCode, bodyPreview)
}
if result != nil {
if err := json.Unmarshal(body, result); err != nil {
return fmt.Errorf("decode response: %w", err)
}
}
return nil
}
return lastErr
}
// parseRetryAfter returns the Retry-After duration from the header, or fallback if unparseable.
// Retry-After: "0" (or negative) is coerced to 100ms so we still yield and don't busy-retry.
// Note: only integer-seconds form is supported (RFC 7231 also allows HTTP-date — not seen from Yuque).
func parseRetryAfter(header string, fallback time.Duration) time.Duration {
if header == "" {
return fallback
}
if secs, err := time.ParseDuration(header + "s"); err == nil {
if secs <= 0 {
return 100 * time.Millisecond
}
return secs
}
return fallback
}
// sleepCtx pauses for d, returning early if ctx is cancelled.
func sleepCtx(ctx context.Context, d time.Duration) error {
t := time.NewTimer(d)
defer t.Stop()
select {
case <-t.C:
return nil
case <-ctx.Done():
return ctx.Err()
}
}
// Ping verifies the credentials by calling GET /api/v2/user.
func (c *client) Ping(ctx context.Context) error {
var resp v2UserResponse
return c.doRequest(ctx, http.MethodGet, "/api/v2/user", &resp)
}
// truncate returns s truncated to maxLen with "..." appended if longer.
func truncate(s string, maxLen int) string {
if len(s) <= maxLen {
return s
}
return s[:maxLen] + "..."
}
// buildQuery encodes query parameters, omitting empty values.
func buildQuery(params map[string]string) string {
values := url.Values{}
for k, v := range params {
if v != "" {
values.Set(k, v)
}
}
if len(values) == 0 {
return ""
}
return "?" + values.Encode()
}
// GetCurrentUser returns the user associated with the current token.
func (c *client) GetCurrentUser(ctx context.Context) (v2User, error) {
var resp v2UserResponse
if err := c.doRequest(ctx, http.MethodGet, "/api/v2/user", &resp); err != nil {
return v2User{}, err
}
return resp.Data, nil
}
// ListUserGroups returns the groups the given user belongs to.
// Note: userID is Yuque's numeric user ID (not the login) — the /users/{id}/groups
// endpoint requires the integer ID form.
func (c *client) ListUserGroups(ctx context.Context, userID int64) ([]v2Group, error) {
path := fmt.Sprintf("/api/v2/users/%d/groups", userID)
var resp v2GroupListResponse
if err := c.doRequest(ctx, http.MethodGet, path, &resp); err != nil {
return nil, err
}
return resp.Data, nil
}
// ListUserRepos returns document-type (Book) repos owned by the given user login.
func (c *client) ListUserRepos(ctx context.Context, login string) ([]v2Repo, error) {
return c.listReposPaginated(ctx, fmt.Sprintf("/api/v2/users/%s/repos", login))
}
// ListGroupRepos returns document-type (Book) repos owned by the given group login.
func (c *client) ListGroupRepos(ctx context.Context, login string) ([]v2Repo, error) {
return c.listReposPaginated(ctx, fmt.Sprintf("/api/v2/groups/%s/repos", login))
}
// listReposPaginated walks the pagination for a repo listing endpoint.
// Filters to type=Book (document-type knowledge bases only; design/sheet/resource skipped).
func (c *client) listReposPaginated(ctx context.Context, basePath string) ([]v2Repo, error) {
var all []v2Repo
offset := 0
for {
q := buildQuery(map[string]string{
"type": "Book",
"offset": fmt.Sprintf("%d", offset),
"limit": fmt.Sprintf("%d", defaultPageSize),
})
var resp v2RepoListResponse
if err := c.doRequest(ctx, http.MethodGet, basePath+q, &resp); err != nil {
return nil, err
}
all = append(all, resp.Data...)
if len(resp.Data) < defaultPageSize {
break
}
offset += defaultPageSize
}
return all, nil
}
// ListBookDocs lists all documents in a book, handling pagination.
// Returns summaries only (body not included — use GetDocDetail to fetch body per doc).
func (c *client) ListBookDocs(ctx context.Context, bookID int64) ([]v2Doc, error) {
basePath := fmt.Sprintf("/api/v2/repos/%d/docs", bookID)
var all []v2Doc
offset := 0
for {
q := buildQuery(map[string]string{
"offset": fmt.Sprintf("%d", offset),
"limit": fmt.Sprintf("%d", defaultPageSize),
})
var resp v2DocListResponse
if err := c.doRequest(ctx, http.MethodGet, basePath+q, &resp); err != nil {
return nil, err
}
all = append(all, resp.Data...)
if len(resp.Data) < defaultPageSize {
break
}
offset += defaultPageSize
}
return all, nil
}
// GetDocDetail fetches the full document detail (including body) by doc ID.
func (c *client) GetDocDetail(ctx context.Context, docID int64) (v2DocDetail, error) {
path := fmt.Sprintf("/api/v2/repos/docs/%d", docID)
var resp v2DocDetailResponse
if err := c.doRequest(ctx, http.MethodGet, path, &resp); err != nil {
return v2DocDetail{}, err
}
return resp.Data, nil
}

View File

@@ -0,0 +1,418 @@
package yuque
import (
"bytes"
"context"
"encoding/json"
"errors"
"fmt"
"io"
"net/http"
"net/http/httptest"
"os"
"strings"
"testing"
"time"
"github.com/Tencent/WeKnora/internal/datasource"
"github.com/Tencent/WeKnora/internal/logger"
)
// fakeYuque sets up an httptest server emulating Yuque API endpoints.
// Handlers can be overridden per test.
type fakeYuque struct {
server *httptest.Server
mux *http.ServeMux
calls []string // methodPath history
}
func newFakeYuque() *fakeYuque {
f := &fakeYuque{mux: http.NewServeMux()}
f.server = httptest.NewServer(f.mux)
// Default /api/v2/user handler (most tests hit it at least once).
f.handleJSON("/api/v2/user", 200, v2UserResponse{Data: v2User{ID: 1, Login: "me", Name: "Me"}})
return f
}
func (f *fakeYuque) Close() { f.server.Close() }
func (f *fakeYuque) cfg() *Config {
return &Config{APIToken: "tok-super-secret-value-1234", BaseURL: f.server.URL}
}
func (f *fakeYuque) handleJSON(path string, status int, body interface{}) {
f.mux.HandleFunc(path, func(w http.ResponseWriter, r *http.Request) {
f.calls = append(f.calls, r.Method+" "+r.URL.Path+"?"+r.URL.RawQuery)
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(status)
_ = json.NewEncoder(w).Encode(body)
})
}
func TestClient_Ping_Success(t *testing.T) {
f := newFakeYuque()
defer f.Close()
c := newClient(f.cfg())
if err := c.Ping(context.Background()); err != nil {
t.Fatalf("Ping error: %v", err)
}
}
func TestClient_Ping_SendsAuthHeader(t *testing.T) {
f := newFakeYuque()
defer f.Close()
var gotToken string
f.mux = http.NewServeMux()
f.mux.HandleFunc("/api/v2/user", func(w http.ResponseWriter, r *http.Request) {
gotToken = r.Header.Get("X-Auth-Token")
w.Header().Set("Content-Type", "application/json")
_ = json.NewEncoder(w).Encode(v2UserResponse{Data: v2User{ID: 1, Login: "me"}})
})
f.server.Config.Handler = f.mux
c := newClient(f.cfg())
if err := c.Ping(context.Background()); err != nil {
t.Fatalf("Ping error: %v", err)
}
if gotToken != "tok-super-secret-value-1234" {
t.Errorf("X-Auth-Token = %q, want the token", gotToken)
}
}
func TestClient_Ping_401(t *testing.T) {
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(401)
_, _ = io.WriteString(w, `{"message":"Unauthorized"}`)
}))
defer srv.Close()
c := newClient(&Config{APIToken: "bad", BaseURL: srv.URL})
err := c.Ping(context.Background())
if err == nil {
t.Fatal("expected error on 401")
}
if !errors.Is(err, datasource.ErrInvalidCredentials) {
t.Errorf("401 should wrap ErrInvalidCredentials, got: %v", err)
}
}
func TestClient_Ping_403WrapsInvalidCredentials(t *testing.T) {
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(403)
_, _ = io.WriteString(w, `{"message":"Forbidden"}`)
}))
defer srv.Close()
c := newClient(&Config{APIToken: "insufficient", BaseURL: srv.URL})
err := c.Ping(context.Background())
if err == nil {
t.Fatal("expected error on 403")
}
if !errors.Is(err, datasource.ErrInvalidCredentials) {
t.Errorf("403 should wrap ErrInvalidCredentials, got: %v", err)
}
}
func TestClient_TokenNeverLoggedInFull(t *testing.T) {
// Redirect the project's internal logger to an in-memory buffer so we can
// assert the raw token never appears in log output. Using stdlib `log`
// would be vacuous — the real logger is a private logrus instance.
var buf bytes.Buffer
logger.SetOutput(&buf)
defer logger.SetOutput(os.Stdout)
f := newFakeYuque()
defer f.Close()
rawToken := "tok-super-secret-value-1234"
cfg := &Config{APIToken: rawToken, BaseURL: f.server.URL}
c := newClient(cfg)
_ = c.Ping(context.Background())
out := buf.String()
if out == "" {
t.Fatal("expected logger output (sanity check — SetOutput wiring may be broken)")
}
if strings.Contains(out, rawToken) {
t.Errorf("raw token leaked into logs:\n%s", out)
}
// Verify the redacted form was emitted (positive assertion, not just absence).
if !strings.Contains(out, redactToken(rawToken)) {
t.Errorf("expected redacted token %q in logs, got:\n%s", redactToken(rawToken), out)
}
}
func TestRedactToken(t *testing.T) {
tests := []struct{ in, want string }{
{"short", "***"},
{"abcdef1234567890", "abcdef...7890"},
}
for _, tt := range tests {
if got := redactToken(tt.in); got != tt.want {
t.Errorf("redactToken(%q) = %q, want %q", tt.in, got, tt.want)
}
}
}
func TestClient_429WithRetryAfter_Retries(t *testing.T) {
attempt := 0
mux := http.NewServeMux()
mux.HandleFunc("/api/v2/user", func(w http.ResponseWriter, r *http.Request) {
attempt++
if attempt == 1 {
// "0" is coerced to 100ms inside the client so the test stays fast.
w.Header().Set("Retry-After", "0")
w.WriteHeader(429)
_, _ = io.WriteString(w, `{"message":"rate limited"}`)
return
}
w.Header().Set("Content-Type", "application/json")
_ = json.NewEncoder(w).Encode(v2UserResponse{Data: v2User{ID: 1, Login: "me"}})
})
srv := httptest.NewServer(mux)
defer srv.Close()
c := newClient(&Config{APIToken: "t", BaseURL: srv.URL})
if err := c.Ping(context.Background()); err != nil {
t.Fatalf("expected success after retry, got %v", err)
}
if attempt < 2 {
t.Errorf("expected at least 2 attempts, got %d", attempt)
}
}
func TestClient_429ExhaustsRetries(t *testing.T) {
attempts := 0
mux := http.NewServeMux()
mux.HandleFunc("/api/v2/user", func(w http.ResponseWriter, r *http.Request) {
attempts++
w.Header().Set("Retry-After", "0")
w.WriteHeader(429)
_, _ = io.WriteString(w, `{"message":"rate limited"}`)
})
srv := httptest.NewServer(mux)
defer srv.Close()
c := newClient(&Config{APIToken: "t", BaseURL: srv.URL})
err := c.Ping(context.Background())
if err == nil {
t.Fatal("expected error when 429s exceed retry budget")
}
if attempts != 4 { // initial + 3 retries
t.Errorf("attempts = %d, want 4 (1 + 3 retries)", attempts)
}
}
func TestClient_5xxRetriesOnce(t *testing.T) {
attempts := 0
mux := http.NewServeMux()
mux.HandleFunc("/api/v2/user", func(w http.ResponseWriter, r *http.Request) {
attempts++
w.WriteHeader(500)
_, _ = io.WriteString(w, `{"message":"internal error"}`)
})
srv := httptest.NewServer(mux)
defer srv.Close()
// Make the fixed 2-second 5xx sleep negligible by using a context whose
// deadline fires after just one retry cycle. We still expect exactly 2
// attempts because the implementation should retry 5xx once and then stop.
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
c := newClient(&Config{APIToken: "t", BaseURL: srv.URL})
if err := c.Ping(ctx); err == nil {
t.Fatal("expected error after 5xx exhaustion")
}
if attempts != 2 { // initial + 1 retry
t.Errorf("attempts = %d, want 2 (5xx retries exactly once)", attempts)
}
}
func TestClient_4xxNotRetried(t *testing.T) {
attempts := 0
mux := http.NewServeMux()
mux.HandleFunc("/api/v2/user", func(w http.ResponseWriter, r *http.Request) {
attempts++
w.WriteHeader(400)
_, _ = io.WriteString(w, `{"message":"bad request"}`)
})
srv := httptest.NewServer(mux)
defer srv.Close()
c := newClient(&Config{APIToken: "t", BaseURL: srv.URL})
if err := c.Ping(context.Background()); err == nil {
t.Fatal("expected error on 400")
}
if attempts != 1 {
t.Errorf("attempts = %d, want 1 (non-429/5xx 4xx must not retry)", attempts)
}
}
func TestParseRetryAfter(t *testing.T) {
fallback := 5 * time.Second
tests := []struct {
header string
want time.Duration
}{
{"", fallback},
{"0", 100 * time.Millisecond},
{"-1", 100 * time.Millisecond}, // negative coerced
{"3", 3 * time.Second},
{"abc", fallback}, // unparseable
}
for _, tt := range tests {
if got := parseRetryAfter(tt.header, fallback); got != tt.want {
t.Errorf("parseRetryAfter(%q) = %v, want %v", tt.header, got, tt.want)
}
}
}
func TestClient_GetCurrentUser(t *testing.T) {
f := newFakeYuque()
defer f.Close()
// Override the default /api/v2/user handler via a fresh mux.
f.mux = http.NewServeMux()
f.mux.HandleFunc("/api/v2/user", func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "application/json")
_ = json.NewEncoder(w).Encode(v2UserResponse{Data: v2User{ID: 42, Login: "alice", Name: "Alice"}})
})
f.server.Config.Handler = f.mux
c := newClient(f.cfg())
u, err := c.GetCurrentUser(context.Background())
if err != nil {
t.Fatalf("GetCurrentUser: %v", err)
}
if u.ID != 42 || u.Login != "alice" {
t.Errorf("got %+v, want {42 alice}", u)
}
}
func TestClient_ListUserGroups(t *testing.T) {
f := newFakeYuque()
defer f.Close()
f.handleJSON("/api/v2/users/42/groups", 200, v2GroupListResponse{Data: []v2Group{
{ID: 100, Login: "team-a", Name: "Team A"},
{ID: 101, Login: "team-b", Name: "Team B"},
}})
c := newClient(f.cfg())
gs, err := c.ListUserGroups(context.Background(), 42)
if err != nil {
t.Fatalf("ListUserGroups: %v", err)
}
if len(gs) != 2 || gs[0].Login != "team-a" {
t.Errorf("got %+v", gs)
}
}
func TestClient_ListUserRepos_FiltersType(t *testing.T) {
f := newFakeYuque()
defer f.Close()
var gotQuery string
f.mux.HandleFunc("/api/v2/users/alice/repos", func(w http.ResponseWriter, r *http.Request) {
gotQuery = r.URL.RawQuery
w.Header().Set("Content-Type", "application/json")
_ = json.NewEncoder(w).Encode(v2RepoListResponse{Data: []v2Repo{
{ID: 9, Slug: "book1", Name: "Book 1", Type: "Book", Namespace: "alice/book1"},
}})
})
c := newClient(f.cfg())
repos, err := c.ListUserRepos(context.Background(), "alice")
if err != nil {
t.Fatalf("ListUserRepos: %v", err)
}
if !strings.Contains(gotQuery, "type=Book") {
t.Errorf("expected type=Book in query, got %q", gotQuery)
}
if !strings.Contains(gotQuery, "offset=0") {
t.Errorf("expected offset=0 in query, got %q", gotQuery)
}
if !strings.Contains(gotQuery, "limit=100") {
t.Errorf("expected limit=100 in query, got %q", gotQuery)
}
if len(repos) != 1 || repos[0].Namespace != "alice/book1" {
t.Errorf("got %+v", repos)
}
}
func TestClient_ListGroupRepos(t *testing.T) {
f := newFakeYuque()
defer f.Close()
f.handleJSON("/api/v2/groups/team-a/repos", 200, v2RepoListResponse{Data: []v2Repo{
{ID: 11, Slug: "tb", Name: "Team Book", Type: "Book", Namespace: "team-a/tb"},
}})
c := newClient(f.cfg())
repos, err := c.ListGroupRepos(context.Background(), "team-a")
if err != nil {
t.Fatalf("ListGroupRepos: %v", err)
}
if len(repos) != 1 || repos[0].ID != 11 {
t.Errorf("got %+v", repos)
}
}
func TestClient_ListBookDocs_Pagination(t *testing.T) {
f := newFakeYuque()
defer f.Close()
// Page 1: 100 docs, page 2: 30 docs, then stop.
callCount := 0
f.mux.HandleFunc("/api/v2/repos/555/docs", func(w http.ResponseWriter, r *http.Request) {
callCount++
offset := r.URL.Query().Get("offset")
docs := make([]v2Doc, 0)
switch offset {
case "0":
for i := 0; i < 100; i++ {
docs = append(docs, v2Doc{ID: int64(i + 1), Type: "Doc", Status: "1", Title: fmt.Sprintf("D%d", i+1), ContentUpdatedAt: "2026-04-20T00:00:00Z"})
}
case "100":
for i := 0; i < 30; i++ {
docs = append(docs, v2Doc{ID: int64(i + 101), Type: "Doc", Status: "1", Title: fmt.Sprintf("D%d", i+101), ContentUpdatedAt: "2026-04-20T00:00:00Z"})
}
}
w.Header().Set("Content-Type", "application/json")
_ = json.NewEncoder(w).Encode(v2DocListResponse{Data: docs})
})
c := newClient(f.cfg())
docs, err := c.ListBookDocs(context.Background(), 555)
if err != nil {
t.Fatalf("ListBookDocs: %v", err)
}
if len(docs) != 130 {
t.Errorf("len(docs) = %d, want 130", len(docs))
}
if callCount != 2 {
t.Errorf("callCount = %d, want 2 (one full page + one partial)", callCount)
}
}
func TestClient_GetDocDetail(t *testing.T) {
f := newFakeYuque()
defer f.Close()
f.handleJSON("/api/v2/repos/docs/7", 200, v2DocDetailResponse{
Data: v2DocDetail{
ID: 7,
Title: "Hello",
Body: "# Hello\n\nworld",
Format: "markdown",
Status: "1",
ContentUpdatedAt: "2026-04-20T12:00:00Z",
},
})
c := newClient(f.cfg())
d, err := c.GetDocDetail(context.Background(), 7)
if err != nil {
t.Fatalf("GetDocDetail: %v", err)
}
if d.Body != "# Hello\n\nworld" {
t.Errorf("Body = %q", d.Body)
}
}

View File

@@ -0,0 +1,312 @@
package yuque
import (
"context"
"encoding/json"
"fmt"
"sort"
"strconv"
"time"
"github.com/Tencent/WeKnora/internal/datasource"
"github.com/Tencent/WeKnora/internal/logger"
"github.com/Tencent/WeKnora/internal/types"
)
// Compile-time proof that *Connector satisfies the datasource.Connector interface.
// Catches signature drift as soon as the interface changes, rather than at
// container wiring or runtime.
var _ datasource.Connector = (*Connector)(nil)
// Connector implements datasource.Connector for Yuque.
type Connector struct{}
// NewConnector creates a new Yuque connector.
func NewConnector() *Connector { return &Connector{} }
// Type returns the connector type identifier.
func (c *Connector) Type() string { return types.ConnectorTypeYuque }
// Validate verifies the given credentials by pinging the current-user endpoint.
func (c *Connector) Validate(ctx context.Context, config *types.DataSourceConfig) error {
cfg, err := parseYuqueConfig(config)
if err != nil {
return err
}
cli := newClient(cfg)
if err := cli.Ping(ctx); err != nil {
return fmt.Errorf("yuque connection failed: %w", err)
}
return nil
}
// ListResources returns all repos (personal + team) accessible to the token.
// Serial fetch for v1 (user groups typically <10). TODO(perf): parallelize if slow.
func (c *Connector) ListResources(ctx context.Context, config *types.DataSourceConfig) ([]types.Resource, error) {
cfg, err := parseYuqueConfig(config)
if err != nil {
return nil, err
}
cli := newClient(cfg)
me, err := cli.GetCurrentUser(ctx)
if err != nil {
return nil, fmt.Errorf("get current user: %w", err)
}
// Personal repos come first and win over team representations of the same repo
// (first-write-wins dedup) — owner-side metadata is more authoritative.
repos := make(map[int64]v2Repo)
personal, err := cli.ListUserRepos(ctx, me.Login)
if err != nil {
return nil, fmt.Errorf("list personal repos: %w", err)
}
for _, r := range personal {
if _, ok := repos[r.ID]; !ok {
repos[r.ID] = r
}
}
groups, err := cli.ListUserGroups(ctx, me.ID)
if err != nil {
return nil, fmt.Errorf("list user groups: %w", err)
}
for _, g := range groups {
teamRepos, err := cli.ListGroupRepos(ctx, g.Login)
if err != nil {
// Skip this group but continue others (e.g., 403 on a restricted group).
// Log so operators can distinguish partial failures from empty groups.
logger.Warnf(ctx, "[Yuque] skip group %s: %v", g.Login, err)
continue
}
for _, r := range teamRepos {
if _, ok := repos[r.ID]; !ok {
repos[r.ID] = r
}
}
}
out := make([]types.Resource, 0, len(repos))
for _, r := range repos {
out = append(out, types.Resource{
ExternalID: strconv.FormatInt(r.ID, 10),
Name: r.Name,
Type: "book",
URL: cfg.GetBaseURL() + "/" + r.Namespace,
Description: r.Namespace,
ModifiedAt: parseContentUpdatedAt(r.UpdatedAt),
Metadata: map[string]interface{}{
"public": r.Public,
"book_type": r.Type,
},
})
}
// Stable, deterministic order for UI rendering and response-body caching.
sort.Slice(out, func(i, j int) bool { return out[i].ExternalID < out[j].ExternalID })
return out, nil
}
// FetchAll performs a full sync of all books specified in resourceIDs.
func (c *Connector) FetchAll(ctx context.Context, config *types.DataSourceConfig, resourceIDs []string) ([]types.FetchedItem, error) {
items, _, err := c.walk(ctx, config, resourceIDs, nil, false)
return items, err
}
// walk is the shared implementation for FetchAll / FetchIncremental.
// If incremental is false, prev is ignored and no cursor is returned (returns nil for cursor).
func (c *Connector) walk(
ctx context.Context,
config *types.DataSourceConfig,
resourceIDs []string,
prev *yuqueCursor,
incremental bool,
) ([]types.FetchedItem, *yuqueCursor, error) {
cfg, err := parseYuqueConfig(config)
if err != nil {
return nil, nil, err
}
cli := newClient(cfg)
newCursor := &yuqueCursor{LastSyncTime: time.Now(), BookDocTimes: make(map[string]map[string]string)}
var out []types.FetchedItem
for _, bookIDStr := range resourceIDs {
bookID, err := strconv.ParseInt(bookIDStr, 10, 64)
if err != nil {
return nil, nil, fmt.Errorf("invalid book id %q: %w", bookIDStr, err)
}
docs, err := cli.ListBookDocs(ctx, bookID)
if err != nil {
return nil, nil, fmt.Errorf("list docs for book %d: %w", bookID, err)
}
currentDocs := make(map[string]bool)
newCursor.BookDocTimes[bookIDStr] = make(map[string]string)
var skippedType, skippedDraft, kept int
var sampleSkipType, sampleSkipDraft string
for _, d := range docs {
// Empty type/status is treated as acceptable — forward-compat with
// API variations that omit the field.
if d.Type != "" && d.Type != "Doc" {
skippedType++
if sampleSkipType == "" {
sampleSkipType = fmt.Sprintf("id=%d type=%q title=%q", d.ID, d.Type, d.Title)
}
continue
}
if d.Status != "" && d.Status != "1" {
skippedDraft++
if sampleSkipDraft == "" {
sampleSkipDraft = fmt.Sprintf("id=%d status=%q title=%q", d.ID, d.Status, d.Title)
}
continue
}
kept++
docIDStr := strconv.FormatInt(d.ID, 10)
currentDocs[docIDStr] = true
newCursor.BookDocTimes[bookIDStr][docIDStr] = d.ContentUpdatedAt
// Incremental: skip if content hasn't changed.
if incremental && prev != nil && prev.BookDocTimes != nil {
if prevTimes, ok := prev.BookDocTimes[bookIDStr]; ok {
if prevTimes[docIDStr] == d.ContentUpdatedAt {
continue
}
}
}
detail, err := cli.GetDocDetail(ctx, d.ID)
if err != nil {
// Record failure but continue (placeholder item with error metadata).
// Keep doc_id/book_id/slug for observability pipelines that join on these.
out = append(out, types.FetchedItem{
ExternalID: docIDStr,
Title: d.Title,
SourceResourceID: bookIDStr,
Metadata: map[string]string{
"error": err.Error(),
"channel": types.ChannelYuque,
"doc_id": docIDStr,
"book_id": bookIDStr,
"slug": d.Slug,
},
})
continue
}
// Yuque serializes `body` as Markdown when format is "markdown" or
// "lake" (Lake XML lives separately in `body_lake`) — empirically
// verified against the v2 API. Any other format (e.g. "html") may
// put non-Markdown content in `body`, so skip defensively.
if detail.Format != "" && detail.Format != "markdown" && detail.Format != "lake" {
logger.Warnf(ctx, "[Yuque] skip doc %d (%q): unsupported format %q",
d.ID, d.Title, detail.Format)
out = append(out, types.FetchedItem{
ExternalID: docIDStr,
Title: d.Title,
SourceResourceID: bookIDStr,
Metadata: map[string]string{
"channel": types.ChannelYuque,
"doc_id": docIDStr,
"book_id": bookIDStr,
"slug": d.Slug,
"skip_reason": "unsupported format: " + detail.Format,
},
})
continue
}
out = append(out, types.FetchedItem{
ExternalID: docIDStr,
Title: d.Title,
Content: []byte(detail.Body),
ContentType: "text/markdown",
FileName: sanitizeFileName(d.Title) + ".md",
URL: buildDocURL(cfg.GetBaseURL(), detail.Book.Namespace, d.Slug),
UpdatedAt: parseContentUpdatedAt(d.ContentUpdatedAt),
SourceResourceID: bookIDStr,
Metadata: map[string]string{
"doc_id": docIDStr,
"book_id": bookIDStr,
"slug": d.Slug,
"creator": strconv.FormatInt(d.UserID, 10),
"word_count": strconv.Itoa(d.WordCount),
"channel": types.ChannelYuque,
},
})
}
logger.Infof(ctx, "[Yuque] book %d: total=%d kept=%d skipped_non_doc=%d skipped_draft=%d non_doc_sample={%s} draft_sample={%s}",
bookID, len(docs), kept, skippedType, skippedDraft, sampleSkipType, sampleSkipDraft)
// Deletion detection (incremental only): previous doc IDs not in current → IsDeleted=true
if incremental && prev != nil && prev.BookDocTimes != nil {
if prevTimes, ok := prev.BookDocTimes[bookIDStr]; ok {
for prevDocID := range prevTimes {
if !currentDocs[prevDocID] {
out = append(out, types.FetchedItem{
ExternalID: prevDocID,
IsDeleted: true,
SourceResourceID: bookIDStr,
})
}
}
}
}
}
if !incremental {
return out, nil, nil
}
return out, newCursor, nil
}
// buildDocURL constructs a browser URL for a Yuque doc.
// Namespace may be empty on some responses; fall back to base URL only.
func buildDocURL(baseURL, namespace, slug string) string {
if namespace == "" {
return baseURL
}
return baseURL + "/" + namespace + "/" + slug
}
// FetchIncremental returns items changed (or deleted) since the prior cursor.
// Deletion detection: docs present in the prior cursor but absent from the
// current list are emitted as IsDeleted=true placeholder items.
func (c *Connector) FetchIncremental(
ctx context.Context,
config *types.DataSourceConfig,
cursor *types.SyncCursor,
) ([]types.FetchedItem, *types.SyncCursor, error) {
resourceIDs := config.ResourceIDs
if len(resourceIDs) == 0 {
return nil, nil, fmt.Errorf("no resource IDs (book IDs) configured")
}
// Decode prior cursor (if any).
var prev *yuqueCursor
if cursor != nil && cursor.ConnectorCursor != nil {
var p yuqueCursor
b, _ := json.Marshal(cursor.ConnectorCursor)
_ = json.Unmarshal(b, &p)
prev = &p
}
items, newCursor, err := c.walk(ctx, config, resourceIDs, prev, true)
if err != nil {
return nil, nil, err
}
// Marshal newCursor into a generic map for the SyncCursor wrapper.
cursorMap := make(map[string]interface{})
b, _ := json.Marshal(newCursor)
_ = json.Unmarshal(b, &cursorMap)
return items, &types.SyncCursor{
LastSyncTime: newCursor.LastSyncTime,
ConnectorCursor: cursorMap,
}, nil
}

View File

@@ -0,0 +1,475 @@
package yuque
import (
"context"
"net/http"
"net/http/httptest"
"testing"
"github.com/Tencent/WeKnora/internal/types"
)
func makeDSConfig(f *fakeYuque, resourceIDs []string) *types.DataSourceConfig {
return &types.DataSourceConfig{
Type: types.ConnectorTypeYuque,
Credentials: map[string]interface{}{"api_token": f.cfg().APIToken, "base_url": f.cfg().BaseURL},
ResourceIDs: resourceIDs,
}
}
func TestConnector_Type(t *testing.T) {
if NewConnector().Type() != types.ConnectorTypeYuque {
t.Errorf("Type() = %q, want %q", NewConnector().Type(), types.ConnectorTypeYuque)
}
}
func TestConnector_Validate_Success(t *testing.T) {
f := newFakeYuque()
defer f.Close()
if err := NewConnector().Validate(context.Background(), makeDSConfig(f, nil)); err != nil {
t.Fatalf("Validate error: %v", err)
}
}
func TestConnector_Validate_Bad401(t *testing.T) {
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(401)
}))
defer srv.Close()
err := NewConnector().Validate(context.Background(), &types.DataSourceConfig{
Credentials: map[string]interface{}{"api_token": "bad", "base_url": srv.URL},
})
if err == nil {
t.Fatal("expected error for 401")
}
}
func TestConnector_ListResources_Aggregates(t *testing.T) {
f := newFakeYuque()
defer f.Close()
// user = alice (id=1)
// Override default /api/v2/user handler to return alice (f.handleJSON panics on duplicate path registration).
f.mux = http.NewServeMux()
f.server.Config.Handler = f.mux
f.handleJSON("/api/v2/user", 200, v2UserResponse{Data: v2User{ID: 1, Login: "alice"}})
// personal repos
f.handleJSON("/api/v2/users/alice/repos", 200, v2RepoListResponse{Data: []v2Repo{
{ID: 10, Slug: "personal", Name: "Personal", Type: "Book", Namespace: "alice/personal"},
}})
// 2 groups
f.handleJSON("/api/v2/users/1/groups", 200, v2GroupListResponse{Data: []v2Group{
{ID: 100, Login: "team-a"}, {ID: 101, Login: "team-b"},
}})
f.handleJSON("/api/v2/groups/team-a/repos", 200, v2RepoListResponse{Data: []v2Repo{
{ID: 20, Slug: "ab", Name: "A Book", Type: "Book", Namespace: "team-a/ab"},
}})
f.handleJSON("/api/v2/groups/team-b/repos", 200, v2RepoListResponse{Data: []v2Repo{
{ID: 30, Slug: "bb", Name: "B Book", Type: "Book", Namespace: "team-b/bb"},
}})
resources, err := NewConnector().ListResources(context.Background(), makeDSConfig(f, nil))
if err != nil {
t.Fatalf("ListResources: %v", err)
}
if len(resources) != 3 {
t.Fatalf("len = %d, want 3", len(resources))
}
// Verify fields on one entry
var ab *types.Resource
for i := range resources {
if resources[i].Name == "A Book" {
ab = &resources[i]
}
}
if ab == nil {
t.Fatal("expected A Book resource")
}
if ab.ExternalID != "20" || ab.Description != "team-a/ab" || ab.Type != "book" {
t.Errorf("got %+v", ab)
}
}
func TestConnector_ListResources_DedupByID(t *testing.T) {
f := newFakeYuque()
defer f.Close()
f.mux = http.NewServeMux()
f.server.Config.Handler = f.mux
f.handleJSON("/api/v2/user", 200, v2UserResponse{Data: v2User{ID: 1, Login: "alice"}})
f.handleJSON("/api/v2/users/alice/repos", 200, v2RepoListResponse{Data: []v2Repo{
{ID: 99, Slug: "shared", Type: "Book", Namespace: "alice/shared"},
}})
f.handleJSON("/api/v2/users/1/groups", 200, v2GroupListResponse{Data: []v2Group{
{ID: 200, Login: "team-x"},
}})
// Same repo (ID 99) appears again via a team the user is in.
f.handleJSON("/api/v2/groups/team-x/repos", 200, v2RepoListResponse{Data: []v2Repo{
{ID: 99, Slug: "shared", Type: "Book", Namespace: "alice/shared"},
}})
resources, err := NewConnector().ListResources(context.Background(), makeDSConfig(f, nil))
if err != nil {
t.Fatalf("ListResources: %v", err)
}
if len(resources) != 1 {
t.Errorf("expected dedup to 1, got %d", len(resources))
}
}
func TestConnector_FetchAll_Markdown(t *testing.T) {
f := newFakeYuque()
defer f.Close()
// Book 7 → 2 docs; one published Doc, one draft (filtered)
f.handleJSON("/api/v2/repos/7/docs", 200, v2DocListResponse{Data: []v2Doc{
{ID: 101, Type: "Doc", Status: "1", Title: "Hello", Slug: "hello", BookID: 7, ContentUpdatedAt: "2026-04-20T10:00:00Z", WordCount: 42},
{ID: 102, Type: "Doc", Status: "0", Title: "Draft", Slug: "draft", BookID: 7, ContentUpdatedAt: "2026-04-20T11:00:00Z"}, // draft, skipped
}})
f.handleJSON("/api/v2/repos/docs/101", 200, v2DocDetailResponse{
Data: v2DocDetail{ID: 101, Title: "Hello", Body: "# Hello\n\nworld", Format: "markdown", Status: "1",
ContentUpdatedAt: "2026-04-20T10:00:00Z", Book: v2Repo{Namespace: "alice/demo"}},
})
items, err := NewConnector().FetchAll(context.Background(), makeDSConfig(f, []string{"7"}), []string{"7"})
if err != nil {
t.Fatalf("FetchAll: %v", err)
}
if len(items) != 1 {
t.Fatalf("len = %d, want 1 (draft filtered)", len(items))
}
it := items[0]
if it.ExternalID != "101" || it.Title != "Hello" {
t.Errorf("got %+v", it)
}
if string(it.Content) != "# Hello\n\nworld" {
t.Errorf("Content = %q", string(it.Content))
}
if it.ContentType != "text/markdown" {
t.Errorf("ContentType = %q", it.ContentType)
}
if it.FileName != "Hello.md" {
t.Errorf("FileName = %q", it.FileName)
}
if it.Metadata["channel"] != types.ChannelYuque {
t.Errorf("channel = %q", it.Metadata["channel"])
}
if it.UpdatedAt.IsZero() {
t.Error("UpdatedAt should be parsed")
}
}
func TestConnector_FetchAll_DocDetailError_EmitsPlaceholder(t *testing.T) {
f := newFakeYuque()
defer f.Close()
f.handleJSON("/api/v2/repos/9/docs", 200, v2DocListResponse{Data: []v2Doc{
{ID: 301, Type: "Doc", Status: "1", Title: "Broken", Slug: "broken", ContentUpdatedAt: "2026-04-20T10:00:00Z"},
{ID: 302, Type: "Doc", Status: "1", Title: "OK", Slug: "ok", ContentUpdatedAt: "2026-04-20T10:00:00Z"},
}})
f.mux.HandleFunc("/api/v2/repos/docs/301", func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusBadRequest) // 4xx is non-retriable, bubbles up as an error
_, _ = w.Write([]byte(`{"message":"broken"}`))
})
f.handleJSON("/api/v2/repos/docs/302", 200, v2DocDetailResponse{
Data: v2DocDetail{ID: 302, Title: "OK", Body: "ok", Status: "1", ContentUpdatedAt: "2026-04-20T10:00:00Z"},
})
items, err := NewConnector().FetchAll(context.Background(), makeDSConfig(f, []string{"9"}), []string{"9"})
if err != nil {
t.Fatalf("FetchAll must not abort the batch on a single detail error, got %v", err)
}
if len(items) != 2 {
t.Fatalf("want 2 items (placeholder + ok), got %d: %+v", len(items), items)
}
// Find the placeholder — the one with the "error" metadata key.
var placeholder *types.FetchedItem
for i := range items {
if items[i].Metadata["error"] != "" {
placeholder = &items[i]
}
}
if placeholder == nil {
t.Fatal("expected a placeholder item with Metadata[error] set")
}
if placeholder.ExternalID != "301" || placeholder.Title != "Broken" {
t.Errorf("placeholder identity wrong: %+v", placeholder)
}
if placeholder.Metadata["channel"] != types.ChannelYuque {
t.Errorf("placeholder channel = %q", placeholder.Metadata["channel"])
}
if placeholder.Metadata["book_id"] != "9" || placeholder.Metadata["doc_id"] != "301" || placeholder.Metadata["slug"] != "broken" {
t.Errorf("placeholder missing traceability metadata: %+v", placeholder.Metadata)
}
if len(placeholder.Content) != 0 {
t.Errorf("placeholder should have empty Content, got %q", string(placeholder.Content))
}
}
func TestConnector_FetchAll_LakeFormatIngestedAsMarkdown(t *testing.T) {
// Yuque's v2 doc-detail serves `body` as Markdown for type="Doc" regardless
// of authoring format. format="lake" means the doc was edited in the Lake
// editor, but `body` is still the Markdown representation (Lake XML lives
// in `body_lake`). So both should be ingested identically.
f := newFakeYuque()
defer f.Close()
f.handleJSON("/api/v2/repos/12/docs", 200, v2DocListResponse{Data: []v2Doc{
{ID: 401, Type: "Doc", Status: "1", Title: "Lake Doc", Slug: "lake", ContentUpdatedAt: "2026-04-20T10:00:00Z"},
{ID: 402, Type: "Doc", Status: "1", Title: "MD Doc", Slug: "md", ContentUpdatedAt: "2026-04-20T10:00:00Z"},
}})
f.handleJSON("/api/v2/repos/docs/401", 200, v2DocDetailResponse{
Data: v2DocDetail{ID: 401, Title: "Lake Doc", Format: "lake", Body: "# lake body as markdown", Status: "1", ContentUpdatedAt: "2026-04-20T10:00:00Z"},
})
f.handleJSON("/api/v2/repos/docs/402", 200, v2DocDetailResponse{
Data: v2DocDetail{ID: 402, Title: "MD Doc", Format: "markdown", Body: "# MD", Status: "1", ContentUpdatedAt: "2026-04-20T10:00:00Z"},
})
items, err := NewConnector().FetchAll(context.Background(), makeDSConfig(f, []string{"12"}), []string{"12"})
if err != nil {
t.Fatalf("FetchAll: %v", err)
}
if len(items) != 2 {
t.Fatalf("want 2 items, got %d", len(items))
}
wantBody := map[string]string{
"401": "# lake body as markdown",
"402": "# MD",
}
for _, it := range items {
if string(it.Content) != wantBody[it.ExternalID] {
t.Errorf("item %s Content = %q, want %q", it.ExternalID, it.Content, wantBody[it.ExternalID])
}
if it.ContentType != "text/markdown" {
t.Errorf("item %s ContentType = %q, want text/markdown", it.ExternalID, it.ContentType)
}
}
}
func TestConnector_FetchAll_SkipsUnsupportedFormats(t *testing.T) {
f := newFakeYuque()
defer f.Close()
f.handleJSON("/api/v2/repos/13/docs", 200, v2DocListResponse{Data: []v2Doc{
{ID: 501, Type: "Doc", Status: "1", Title: "HTML Doc", Slug: "h", ContentUpdatedAt: "2026-04-20T10:00:00Z"},
{ID: 502, Type: "Doc", Status: "1", Title: "OK Doc", Slug: "ok", ContentUpdatedAt: "2026-04-20T10:00:00Z"},
}})
f.handleJSON("/api/v2/repos/docs/501", 200, v2DocDetailResponse{
Data: v2DocDetail{ID: 501, Title: "HTML Doc", Format: "html", Body: "<p>raw html</p>", Status: "1", ContentUpdatedAt: "2026-04-20T10:00:00Z"},
})
f.handleJSON("/api/v2/repos/docs/502", 200, v2DocDetailResponse{
Data: v2DocDetail{ID: 502, Title: "OK Doc", Format: "lake", Body: "# ok", Status: "1", ContentUpdatedAt: "2026-04-20T10:00:00Z"},
})
items, err := NewConnector().FetchAll(context.Background(), makeDSConfig(f, []string{"13"}), []string{"13"})
if err != nil {
t.Fatalf("FetchAll: %v", err)
}
if len(items) != 2 {
t.Fatalf("want 2 items (placeholder + ok), got %d", len(items))
}
var htmlPlaceholder, okItem *types.FetchedItem
for i := range items {
if items[i].ExternalID == "501" {
htmlPlaceholder = &items[i]
}
if items[i].ExternalID == "502" {
okItem = &items[i]
}
}
if htmlPlaceholder == nil || okItem == nil {
t.Fatal("missing one of the items")
}
if htmlPlaceholder.Metadata["skip_reason"] == "" {
t.Errorf("html doc should have skip_reason metadata, got %+v", htmlPlaceholder.Metadata)
}
if len(htmlPlaceholder.Content) != 0 {
t.Errorf("html placeholder should have empty Content")
}
if string(okItem.Content) != "# ok" {
t.Errorf("lake doc Content = %q, want %q", string(okItem.Content), "# ok")
}
}
func TestConnector_FetchAll_SkipsNonDocTypes(t *testing.T) {
f := newFakeYuque()
defer f.Close()
f.handleJSON("/api/v2/repos/8/docs", 200, v2DocListResponse{Data: []v2Doc{
{ID: 201, Type: "Sheet", Status: "1", Title: "S"},
{ID: 202, Type: "Board", Status: "1", Title: "B"},
{ID: 203, Type: "Thread", Status: "1", Title: "T"},
{ID: 204, Type: "Doc", Status: "1", Title: "D", ContentUpdatedAt: "2026-04-20T10:00:00Z"},
}})
f.handleJSON("/api/v2/repos/docs/204", 200, v2DocDetailResponse{
Data: v2DocDetail{ID: 204, Title: "D", Body: "text", Status: "1", ContentUpdatedAt: "2026-04-20T10:00:00Z"},
})
items, err := NewConnector().FetchAll(context.Background(), makeDSConfig(f, []string{"8"}), []string{"8"})
if err != nil {
t.Fatalf("FetchAll: %v", err)
}
if len(items) != 1 || items[0].ExternalID != "204" {
t.Errorf("expected only the Doc, got %+v", items)
}
}
func TestConnector_FetchIncremental_FirstSync(t *testing.T) {
f := newFakeYuque()
defer f.Close()
f.handleJSON("/api/v2/repos/10/docs", 200, v2DocListResponse{Data: []v2Doc{
{ID: 1, Type: "Doc", Status: "1", Title: "A", ContentUpdatedAt: "2026-04-20T10:00:00Z"},
{ID: 2, Type: "Doc", Status: "1", Title: "B", ContentUpdatedAt: "2026-04-20T11:00:00Z"},
}})
f.handleJSON("/api/v2/repos/docs/1", 200, v2DocDetailResponse{Data: v2DocDetail{ID: 1, Title: "A", Body: "a", Status: "1", ContentUpdatedAt: "2026-04-20T10:00:00Z"}})
f.handleJSON("/api/v2/repos/docs/2", 200, v2DocDetailResponse{Data: v2DocDetail{ID: 2, Title: "B", Body: "b", Status: "1", ContentUpdatedAt: "2026-04-20T11:00:00Z"}})
items, cursor, err := NewConnector().FetchIncremental(context.Background(), makeDSConfig(f, []string{"10"}), nil)
if err != nil {
t.Fatalf("FetchIncremental: %v", err)
}
if len(items) != 2 {
t.Errorf("items = %d, want 2 on first sync", len(items))
}
if cursor == nil {
t.Fatal("cursor must not be nil")
}
}
func TestConnector_FetchIncremental_NoChanges(t *testing.T) {
f := newFakeYuque()
defer f.Close()
f.handleJSON("/api/v2/repos/10/docs", 200, v2DocListResponse{Data: []v2Doc{
{ID: 1, Type: "Doc", Status: "1", Title: "A", ContentUpdatedAt: "2026-04-20T10:00:00Z"},
}})
f.handleJSON("/api/v2/repos/docs/1", 200, v2DocDetailResponse{Data: v2DocDetail{ID: 1, Title: "A", Body: "a", Status: "1", ContentUpdatedAt: "2026-04-20T10:00:00Z"}})
// First sync → establish cursor
_, cursor1, err := NewConnector().FetchIncremental(context.Background(), makeDSConfig(f, []string{"10"}), nil)
if err != nil {
t.Fatalf("first sync: %v", err)
}
// Second sync with same cursor → 0 items
items, _, err := NewConnector().FetchIncremental(context.Background(), makeDSConfig(f, []string{"10"}), cursor1)
if err != nil {
t.Fatalf("second sync: %v", err)
}
if len(items) != 0 {
t.Errorf("expected 0 changed items, got %d", len(items))
}
}
func TestConnector_FetchIncremental_ReturnsOnlyChanged(t *testing.T) {
// First sync: 2 docs, both at t0.
f1 := newFakeYuque()
f1.handleJSON("/api/v2/repos/11/docs", 200, v2DocListResponse{Data: []v2Doc{
{ID: 1, Type: "Doc", Status: "1", Title: "A", Slug: "a", ContentUpdatedAt: "2026-04-20T10:00:00Z"},
{ID: 2, Type: "Doc", Status: "1", Title: "B", Slug: "b", ContentUpdatedAt: "2026-04-20T10:00:00Z"},
}})
f1.handleJSON("/api/v2/repos/docs/1", 200, v2DocDetailResponse{Data: v2DocDetail{ID: 1, Title: "A", Body: "a", Status: "1", ContentUpdatedAt: "2026-04-20T10:00:00Z"}})
f1.handleJSON("/api/v2/repos/docs/2", 200, v2DocDetailResponse{Data: v2DocDetail{ID: 2, Title: "B", Body: "b", Status: "1", ContentUpdatedAt: "2026-04-20T10:00:00Z"}})
_, cursor1, err := NewConnector().FetchIncremental(context.Background(), makeDSConfig(f1, []string{"11"}), nil)
if err != nil {
t.Fatalf("first sync: %v", err)
}
f1.Close()
// Second sync: doc 1 unchanged, doc 2 has a newer ContentUpdatedAt.
f2 := newFakeYuque()
defer f2.Close()
f2.handleJSON("/api/v2/repos/11/docs", 200, v2DocListResponse{Data: []v2Doc{
{ID: 1, Type: "Doc", Status: "1", Title: "A", Slug: "a", ContentUpdatedAt: "2026-04-20T10:00:00Z"},
{ID: 2, Type: "Doc", Status: "1", Title: "B2", Slug: "b", ContentUpdatedAt: "2026-04-20T12:00:00Z"},
}})
// Only doc 2's detail should be fetched; if the implementation incorrectly
// fetches doc 1, the test will still pass metadata-wise — but this handler
// makes the expected single call clear.
f2.handleJSON("/api/v2/repos/docs/2", 200, v2DocDetailResponse{Data: v2DocDetail{ID: 2, Title: "B2", Body: "b2 updated", Status: "1", ContentUpdatedAt: "2026-04-20T12:00:00Z"}})
items, _, err := NewConnector().FetchIncremental(context.Background(), makeDSConfig(f2, []string{"11"}), cursor1)
if err != nil {
t.Fatalf("second sync: %v", err)
}
if len(items) != 1 {
t.Fatalf("expected exactly 1 changed item, got %d: %+v", len(items), items)
}
if items[0].ExternalID != "2" {
t.Errorf("expected changed item ExternalID=2, got %q", items[0].ExternalID)
}
if string(items[0].Content) != "b2 updated" {
t.Errorf("expected refreshed body, got %q", string(items[0].Content))
}
}
func TestConnector_FetchIncremental_DetectsDeletion(t *testing.T) {
// First sync: 2 docs
f1 := newFakeYuque()
f1.handleJSON("/api/v2/repos/10/docs", 200, v2DocListResponse{Data: []v2Doc{
{ID: 1, Type: "Doc", Status: "1", Title: "A", ContentUpdatedAt: "2026-04-20T10:00:00Z"},
{ID: 2, Type: "Doc", Status: "1", Title: "B", ContentUpdatedAt: "2026-04-20T11:00:00Z"},
}})
f1.handleJSON("/api/v2/repos/docs/1", 200, v2DocDetailResponse{Data: v2DocDetail{ID: 1, Title: "A", Body: "a", Status: "1", ContentUpdatedAt: "2026-04-20T10:00:00Z"}})
f1.handleJSON("/api/v2/repos/docs/2", 200, v2DocDetailResponse{Data: v2DocDetail{ID: 2, Title: "B", Body: "b", Status: "1", ContentUpdatedAt: "2026-04-20T11:00:00Z"}})
_, cursor1, err := NewConnector().FetchIncremental(context.Background(), makeDSConfig(f1, []string{"10"}), nil)
if err != nil {
t.Fatalf("first sync: %v", err)
}
f1.Close()
// Second sync: doc 2 removed
f2 := newFakeYuque()
defer f2.Close()
f2.handleJSON("/api/v2/repos/10/docs", 200, v2DocListResponse{Data: []v2Doc{
{ID: 1, Type: "Doc", Status: "1", Title: "A", ContentUpdatedAt: "2026-04-20T10:00:00Z"},
}})
items, _, err := NewConnector().FetchIncremental(context.Background(), makeDSConfig(f2, []string{"10"}), cursor1)
if err != nil {
t.Fatalf("second sync: %v", err)
}
deleted := 0
for _, it := range items {
if it.IsDeleted && it.ExternalID == "2" {
deleted++
}
}
if deleted != 1 {
t.Errorf("expected 1 deletion for doc 2, got %d; items=%+v", deleted, items)
}
}
func TestConnector_ListResources_ContinuesOnGroupFailure(t *testing.T) {
f := newFakeYuque()
defer f.Close()
f.mux = http.NewServeMux()
f.server.Config.Handler = f.mux
f.handleJSON("/api/v2/user", 200, v2UserResponse{Data: v2User{ID: 1, Login: "alice"}})
f.handleJSON("/api/v2/users/alice/repos", 200, v2RepoListResponse{Data: []v2Repo{
{ID: 10, Slug: "me", Name: "Mine", Type: "Book", Namespace: "alice/me"},
}})
f.handleJSON("/api/v2/users/1/groups", 200, v2GroupListResponse{Data: []v2Group{
{ID: 100, Login: "team-ok"}, {ID: 101, Login: "team-forbidden"},
}})
f.handleJSON("/api/v2/groups/team-ok/repos", 200, v2RepoListResponse{Data: []v2Repo{
{ID: 20, Slug: "ok", Name: "OK Book", Type: "Book", Namespace: "team-ok/ok"},
}})
// team-forbidden returns 403 — should be skipped, not abort the whole call.
f.mux.HandleFunc("/api/v2/groups/team-forbidden/repos", func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusForbidden)
_, _ = w.Write([]byte(`{"message":"forbidden"}`))
})
resources, err := NewConnector().ListResources(context.Background(), makeDSConfig(f, nil))
if err != nil {
t.Fatalf("ListResources should not fail on per-group error, got %v", err)
}
// Expect the 2 successful repos (personal + team-ok). team-forbidden is skipped.
if len(resources) != 2 {
t.Fatalf("expected 2 resources (personal + team-ok), got %d: %+v", len(resources), resources)
}
names := map[string]bool{}
for _, r := range resources {
names[r.Name] = true
}
if !names["Mine"] || !names["OK Book"] {
t.Errorf("expected Mine + OK Book, got %v", names)
}
}

View File

@@ -0,0 +1,261 @@
// Package yuque implements the Yuque (语雀) data source connector for WeKnora.
//
// It syncs documents from personal and group knowledge bases (books/repos) into WeKnora
// knowledge bases, preserving Markdown formatting.
//
// Yuque API docs:
// - Authentication: X-Auth-Token header, personal token from https://www.yuque.com/settings/tokens
// - User: GET /api/v2/user
// - Groups: GET /api/v2/users/{id}/groups
// - Repos: GET /api/v2/users/{login}/repos, GET /api/v2/groups/{login}/repos
// - Docs: GET /api/v2/repos/{book_id}/docs (list), GET /api/v2/repos/docs/{id} (detail)
//
// Known limitations (v1):
// - Only syncs type=Doc (Sheet/Thread/Board/Table skipped)
// - Only syncs status="1" (published), drafts skipped
// - Private-book images (CDN URLs with auth) may fail to load
// - Lake editor may leave non-standard markdown (anchors, sized image attrs)
package yuque
import (
"encoding/json"
"fmt"
"strconv"
"strings"
"time"
"unicode/utf8"
"github.com/Tencent/WeKnora/internal/datasource"
"github.com/Tencent/WeKnora/internal/types"
)
// DefaultBaseURL is the Yuque public cloud API base URL.
const DefaultBaseURL = "https://www.yuque.com"
// Config holds Yuque-specific configuration.
type Config struct {
// APIToken is a personal token from Yuque settings → tokens.
APIToken string `json:"api_token"`
// BaseURL is the Yuque deployment base URL (default: https://www.yuque.com).
// For enterprise/private deployments, use the company's yuque domain.
BaseURL string `json:"base_url,omitempty"`
}
// GetBaseURL returns the normalized base URL:
// - empty → DefaultBaseURL
// - missing scheme → prepend "https://"
// - trailing slash → stripped
func (c *Config) GetBaseURL() string {
url := strings.TrimSpace(c.BaseURL)
if url == "" {
return DefaultBaseURL
}
if !strings.Contains(url, "://") {
url = "https://" + url
}
url = strings.TrimRight(url, "/")
return url
}
// parseYuqueConfig extracts and validates Yuque-specific configuration.
// Uses JSON marshal/unmarshal roundtrip (consistent with Feishu's parseFeishuConfig)
// rather than single-field type assertion, because we have multiple fields with
// optional defaults.
func parseYuqueConfig(config *types.DataSourceConfig) (*Config, error) {
if config == nil {
return nil, fmt.Errorf("%w: config is nil", datasource.ErrInvalidConfig)
}
credBytes, err := json.Marshal(config.Credentials)
if err != nil {
return nil, fmt.Errorf("marshal credentials: %w", err)
}
var cfg Config
if err := json.Unmarshal(credBytes, &cfg); err != nil {
return nil, fmt.Errorf("parse yuque credentials: %w", err)
}
if strings.TrimSpace(cfg.APIToken) == "" {
return nil, fmt.Errorf("%w: api_token is required", datasource.ErrInvalidCredentials)
}
return &cfg, nil
}
// --- Yuque API response types ---
// flexibleStatus accepts either a string ("1") or a number (1) for the doc
// `status` field. Yuque's OpenAPI spec declares `status` as string, but the
// runtime API returns it as an integer, so unmarshaling into a plain string
// fails with "cannot unmarshal number into Go struct field ... of type string".
// Normalizing to the textual form lets existing comparisons (e.g. != "1") keep
// working for both response shapes.
type flexibleStatus string
func (s *flexibleStatus) UnmarshalJSON(b []byte) error {
if string(b) == "null" {
*s = ""
return nil
}
if len(b) > 0 && b[0] == '"' {
var str string
if err := json.Unmarshal(b, &str); err != nil {
return err
}
*s = flexibleStatus(str)
return nil
}
// Integer form. We decode to int64 so that floats, booleans, arrays, and
// objects fail loudly instead of being silently stringified — if Yuque
// changes the shape again, we'd rather surface a clear error than feed
// garbage to the Status == "1" comparison.
var i int64
if err := json.Unmarshal(b, &i); err != nil {
return fmt.Errorf("flexibleStatus: expected string or integer, got %s: %w", b, err)
}
*s = flexibleStatus(strconv.FormatInt(i, 10))
return nil
}
// apiErrorBody is the error body shape Yuque sometimes returns on non-2xx.
type apiErrorBody struct {
Message string `json:"message"`
Status int `json:"status"`
}
// v2UserResponse wraps GET /api/v2/user.
type v2UserResponse struct {
Data v2User `json:"data"`
}
type v2User struct {
ID int64 `json:"id"`
Login string `json:"login"`
Name string `json:"name"`
}
// v2GroupListResponse wraps GET /api/v2/users/{id}/groups.
type v2GroupListResponse struct {
Data []v2Group `json:"data"`
}
type v2Group struct {
ID int64 `json:"id"`
Login string `json:"login"`
Name string `json:"name"`
}
// v2RepoListResponse wraps GET /api/v2/users/{login}/repos and /groups/{login}/repos.
type v2RepoListResponse struct {
Data []v2Repo `json:"data"`
}
type v2Repo struct {
ID int64 `json:"id"`
Type string `json:"type"` // "Book" | "Design" (listing filter enum; connector requests type=Book)
Slug string `json:"slug"`
Name string `json:"name"`
UserID int64 `json:"user_id"`
Namespace string `json:"namespace"` // e.g. "group_login/book_slug"
Public int `json:"public"` // 0:private, 1:public, 2:internal
Description string `json:"description"`
UpdatedAt string `json:"updated_at"` // RFC3339 string
}
// v2DocListResponse wraps GET /api/v2/repos/{book_id}/docs.
type v2DocListResponse struct {
Meta struct {
Total int `json:"total"`
} `json:"meta"`
Data []v2Doc `json:"data"`
}
// v2Doc is the document summary returned by the list endpoint (no body).
type v2Doc struct {
ID int64 `json:"id"`
Type string `json:"type"` // Doc / Sheet / Thread / Board / Table
Slug string `json:"slug"`
Title string `json:"title"`
BookID int64 `json:"book_id"`
UserID int64 `json:"user_id"`
Status flexibleStatus `json:"status"` // "0" draft, "1" published — API may return int or string
ContentUpdatedAt string `json:"content_updated_at"` // RFC3339 string — use for change detection
UpdatedAt string `json:"updated_at"`
WordCount int `json:"word_count"`
}
// v2DocDetailResponse wraps GET /api/v2/repos/docs/{id}.
type v2DocDetailResponse struct {
Data v2DocDetail `json:"data"`
}
type v2DocDetail struct {
ID int64 `json:"id"`
Type string `json:"type"`
Slug string `json:"slug"`
Title string `json:"title"`
BookID int64 `json:"book_id"`
Format string `json:"format"` // "markdown" / "lake" / "html"
Body string `json:"body"` // Markdown content
Status flexibleStatus `json:"status"` // API may return int or string
ContentUpdatedAt string `json:"content_updated_at"`
UpdatedAt string `json:"updated_at"`
WordCount int `json:"word_count"`
Book v2Repo `json:"book"`
}
// yuqueCursor stores incremental sync state.
// Key1: book_id (string), Key2: doc_id (string), Value: content_updated_at (raw RFC3339 string)
type yuqueCursor struct {
LastSyncTime time.Time `json:"last_sync_time"`
BookDocTimes map[string]map[string]string `json:"book_doc_times,omitempty"`
}
// parseContentUpdatedAt parses Yuque ISO 8601 timestamp (returns zero time on parse failure).
func parseContentUpdatedAt(ts string) time.Time {
if ts == "" {
return time.Time{}
}
t, err := time.Parse(time.RFC3339, ts)
if err != nil {
return time.Time{}
}
return t
}
// sanitizeFileName removes characters that are invalid in filenames and
// truncates to a safe length at a UTF-8 rune boundary. Raw byte truncation
// would split a multi-byte codepoint (Chinese characters are 3 bytes in UTF-8)
// and produce an invalid UTF-8 string, which downstream filename validation
// (utf8.ValidString) rejects with "文件名包含非法字符".
func sanitizeFileName(name string) string {
if name == "" {
return "untitled"
}
replacer := strings.NewReplacer(
"/", "_", "\\", "_", ":", "_", "*", "_",
"?", "_", "\"", "_", "<", "_", ">", "_", "|", "_",
)
result := replacer.Replace(name)
const maxBytes = 200
if len(result) > maxBytes {
result = result[:maxBytes]
// Peel any trailing bytes that no longer form a complete rune.
// DecodeLastRuneInString returns (RuneError, 1) for a partial UTF-8
// sequence — at most 3 bytes need peeling since a rune is ≤ 4 bytes.
for len(result) > 0 {
r, size := utf8.DecodeLastRuneInString(result)
if r != utf8.RuneError || size != 1 {
break
}
result = result[:len(result)-1]
}
}
return result
}
// redactToken returns a masked form of the token for logging (never log the full token).
func redactToken(t string) string {
if len(t) < 12 {
return "***"
}
return t[:6] + "..." + t[len(t)-4:]
}

View File

@@ -0,0 +1,176 @@
package yuque
import (
"encoding/json"
"strings"
"testing"
"unicode/utf8"
"github.com/Tencent/WeKnora/internal/types"
)
// Yuque's runtime API returns the doc `status` field as an integer (0 or 1),
// even though its OpenAPI spec declares it as string. We tolerate both forms.
func TestV2Doc_Status_AcceptsNumberAndString(t *testing.T) {
cases := []struct {
name string
body string
want flexibleStatus
}{
{"number 1", `{"id":1,"status":1}`, "1"},
{"number 0", `{"id":1,"status":0}`, "0"},
{"string 1", `{"id":1,"status":"1"}`, "1"},
{"null", `{"id":1,"status":null}`, ""},
{"missing", `{"id":1}`, ""},
}
for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
var d v2Doc
if err := json.Unmarshal([]byte(tc.body), &d); err != nil {
t.Fatalf("unmarshal: %v", err)
}
if d.Status != tc.want {
t.Errorf("Status = %q, want %q", d.Status, tc.want)
}
})
}
}
func TestV2DocDetail_Status_AcceptsNumber(t *testing.T) {
var d v2DocDetail
if err := json.Unmarshal([]byte(`{"id":1,"status":1}`), &d); err != nil {
t.Fatalf("unmarshal: %v", err)
}
if d.Status != "1" {
t.Errorf("Status = %q, want %q", d.Status, "1")
}
}
func TestV2DocListResponse_Status_Number(t *testing.T) {
// Reproduces the real-world failure:
// decode response: json: cannot unmarshal number into Go struct field v2Doc.data.status of type string
body := `{"meta":{"total":1},"data":[{"id":101,"type":"Doc","status":1,"title":"hello"}]}`
var resp v2DocListResponse
if err := json.Unmarshal([]byte(body), &resp); err != nil {
t.Fatalf("unmarshal: %v", err)
}
if len(resp.Data) != 1 {
t.Fatalf("len(Data) = %d", len(resp.Data))
}
if resp.Data[0].Status != "1" {
t.Errorf("Status = %q, want %q", resp.Data[0].Status, "1")
}
}
// flexibleStatus deliberately rejects unexpected JSON shapes so a future Yuque
// schema drift (e.g. status becoming a float, bool, array) fails loudly rather
// than being silently stringified and bypassing the `!= "1"` filter.
func TestV2Doc_Status_RejectsUnexpectedShapes(t *testing.T) {
cases := []string{
`{"id":1,"status":true}`,
`{"id":1,"status":1.5}`,
`{"id":1,"status":[1]}`,
`{"id":1,"status":{"x":1}}`,
}
for _, body := range cases {
t.Run(body, func(t *testing.T) {
var d v2Doc
if err := json.Unmarshal([]byte(body), &d); err == nil {
t.Errorf("expected error for %s, got Status=%q", body, d.Status)
}
})
}
}
func TestParseYuqueConfig(t *testing.T) {
t.Run("valid full", func(t *testing.T) {
cfg, err := parseYuqueConfig(&types.DataSourceConfig{
Credentials: map[string]interface{}{
"api_token": "tok-123",
"base_url": "https://company.yuque.com",
},
})
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if cfg.APIToken != "tok-123" {
t.Errorf("APIToken = %q, want tok-123", cfg.APIToken)
}
if cfg.BaseURL != "https://company.yuque.com" {
t.Errorf("BaseURL = %q", cfg.BaseURL)
}
})
t.Run("default base_url", func(t *testing.T) {
cfg, err := parseYuqueConfig(&types.DataSourceConfig{
Credentials: map[string]interface{}{"api_token": "tok-abc"},
})
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if cfg.GetBaseURL() != "https://www.yuque.com" {
t.Errorf("GetBaseURL default = %q, want https://www.yuque.com", cfg.GetBaseURL())
}
})
t.Run("missing api_token", func(t *testing.T) {
_, err := parseYuqueConfig(&types.DataSourceConfig{
Credentials: map[string]interface{}{},
})
if err == nil {
t.Fatal("expected error for missing api_token")
}
})
t.Run("nil config", func(t *testing.T) {
_, err := parseYuqueConfig(nil)
if err == nil {
t.Fatal("expected error for nil config")
}
})
}
func TestGetBaseURL_TrimsTrailingSlash(t *testing.T) {
c := &Config{BaseURL: "https://x.yuque.com/"}
if got := c.GetBaseURL(); got != "https://x.yuque.com" {
t.Errorf("GetBaseURL = %q, want https://x.yuque.com", got)
}
}
func TestGetBaseURL_AddsSchemeIfMissing(t *testing.T) {
c := &Config{BaseURL: "company.yuque.com"}
if got := c.GetBaseURL(); got != "https://company.yuque.com" {
t.Errorf("GetBaseURL = %q, want https://company.yuque.com", got)
}
}
func TestGetBaseURL_EmptyUsesDefault(t *testing.T) {
c := &Config{BaseURL: ""}
if got := c.GetBaseURL(); got != "https://www.yuque.com" {
t.Errorf("GetBaseURL = %q, want default", got)
}
}
func TestGetBaseURL_TrimsWhitespace(t *testing.T) {
c := &Config{BaseURL: " https://x.yuque.com/ "}
if got := c.GetBaseURL(); got != "https://x.yuque.com" {
t.Errorf("GetBaseURL = %q, want https://x.yuque.com", got)
}
}
func TestSanitizeFileName_TruncatesAtRuneBoundary(t *testing.T) {
// Long Chinese title (each 测 is 3 bytes in UTF-8). Raw byte slicing at 200
// would split a rune and produce invalid UTF-8 that downstream filename
// validation rejects with "文件名包含非法字符".
long := strings.Repeat("测试", 100) // 600 bytes
got := sanitizeFileName(long)
if !utf8.ValidString(got) {
t.Fatalf("sanitizeFileName produced invalid UTF-8: %q", got)
}
if len(got) > 200 {
t.Errorf("len = %d, want ≤ 200", len(got))
}
if len(got) == 0 {
t.Error("result is empty")
}
}

View File

@@ -191,6 +191,16 @@ func GetLogger(c context.Context) *logrus.Entry {
return logrus.NewEntry(appLogger)
}
// SetOutput overrides the internal logger's output destination.
// Intended for use in tests that need to capture and assert on log content
// (e.g. verifying secrets are not written out). Restore the original writer
// (usually os.Stdout) in a defer after the test.
func SetOutput(w io.Writer) {
loggerMu.Lock()
defer loggerMu.Unlock()
appLogger.SetOutput(w)
}
// SetLogLevel 设置日志级别
func SetLogLevel(level LogLevel) {
var logLevel logrus.Level

View File

@@ -29,6 +29,7 @@ const (
ChannelSlack = "slack" // Slack
ChannelIM = "im" // Generic IM channel
ChannelNotion = "notion" // Notion
ChannelYuque = "yuque" // Yuque (语雀)
)
// Knowledge parse status constants