fix: deep-copy stored files and images when cloning a knowledge base

Cloning a knowledge base previously copied only the storage path strings
(knowledge.FilePath and chunk.ImageInfo.URL), so the source and the clone
shared the same physical objects in the storage backend. Because removing the
source deletes the original file and its extracted images, the clone was left
with dangling references and its document and images became unreadable —
data loss that occurred even for same-store clones.

Add a CopyFile primitive to the FileService interface and implement it in
every backend: server-side CopyObject on the object stores
(s3/obs/cos/oss/tos/ks3/minio), io.Copy on local, and a no-op on dummy.
Destinations use the knowledge-owned layout and reuse the existing
path/object-key guards; a sentinel ErrCrossBackendCopy is returned when the
source scheme does not match the backend.

Use CopyFile to deep-copy the document file in cloneKnowledge and the
extracted images in CloneChunk and cloneFAQKnowledgeBase via a shared
cloneChunkImageInfo helper that deduplicates identical image URLs per clone
and rewrites them to the new objects. Copied objects are cleaned up
best-effort if a clone fails partway through. A clone-time preflight rejects
cloning into a target bound to a different storage backend when the tenant
pins providers via StorageEngineConfig.

Add unit tests for local CopyFile (independent copy survives source
deletion, traversal rejection, cross-backend rejection), cloneChunkImageInfo
(empty/multi/dedup/parse-failure/OriginalURL handling), and the storage
provider preflight.
This commit is contained in:
ochan.kwon
2026-06-02 20:25:38 +09:00
committed by lyingbug
parent f4af9cca97
commit e9980c6011
25 changed files with 918 additions and 12 deletions

View File

@@ -41,6 +41,9 @@ func (f *fakeFileService) GetFileURL(ctx context.Context, filePath string) (stri
return "local://" + strings.TrimPrefix(filePath, "/"), nil
}
// DeleteFile is a no-op stub: it ignores the path and always reports success.
func (f *fakeFileService) DeleteFile(ctx context.Context, _ string) error { return nil }
// CopyFile is a no-op stub: it ignores all arguments and returns an empty
// destination path with a nil error. No object is copied.
func (f *fakeFileService) CopyFile(ctx context.Context, _ string, _ uint64, _ string) (string, error) {
return "", nil
}
// TestMaterializeKnowledgeFile_HandlesLocalScheme is the regression guard
// for the dev-mode failure where DuckDB was handed a local:// URL it can't

View File

@@ -175,6 +175,35 @@ func (s *cosFileService) parseCosObjectName(filePath string) (string, error) {
return strings.TrimPrefix(filePath, s.bucketURL), nil
}
// CopyFile copies an existing COS object to a new knowledge-owned object using a
// server-side Object.Copy (no data leaves COS). The destination uses the same
// layout as SaveFile: {cosPathPrefix}/{tenantID}/{knowledgeID}/{uuid}{ext}.
//
// It returns the cos:// path of the new object. When srcPath cannot be parsed
// as a path of this COS service the error wraps ErrCrossBackendCopy (detect it
// with errors.Is) and includes the underlying parse failure for diagnosis.
func (s *cosFileService) CopyFile(ctx context.Context,
	srcPath string, tenantID uint64, knowledgeID string,
) (string, error) {
	srcObjectKey, err := s.parseCosObjectName(srcPath)
	if err != nil {
		// Keep the sentinel for errors.Is while surfacing why parsing failed.
		return "", fmt.Errorf("cos copy rejected source %q: %w (%v)", srcPath, ErrCrossBackendCopy, err)
	}
	if err := utils.SafeObjectKey(srcObjectKey); err != nil {
		return "", fmt.Errorf("invalid source path: %w", err)
	}

	// Preserve the source extension so the copy keeps its file type.
	ext := filepath.Ext(srcPath)
	destKey := fmt.Sprintf("%s/%d/%s/%s%s", s.cosPathPrefix, tenantID, knowledgeID, uuid.New().String(), ext)

	// sourceURL is the host + object key WITHOUT a scheme, per the COS SDK contract.
	sourceURL := fmt.Sprintf("%s.cos.%s.myqcloud.com/%s", s.bucketName, s.region, srcObjectKey)
	if _, _, err = s.client.Object.Copy(ctx, destKey, sourceURL, nil); err != nil {
		return "", fmt.Errorf("failed to copy file in COS: %w", err)
	}

	newPath := fmt.Sprintf("cos://%s/%s/%s", s.bucketName, s.region, destKey)
	logger.Infof(ctx, "Copied COS object %s to %s", srcPath, newPath)
	return newPath, nil
}
// SaveBytes saves bytes data to COS
// If temp is true and temp bucket is configured, saves to temp bucket (with lifecycle auto-expiration)
// Otherwise saves to main bucket

View File

@@ -6,6 +6,7 @@ import (
"io"
"mime/multipart"
"github.com/Tencent/WeKnora/internal/logger"
"github.com/Tencent/WeKnora/internal/types/interfaces"
"github.com/google/uuid"
)
@@ -47,6 +48,13 @@ func (s *DummyFileService) SaveBytes(ctx context.Context, data []byte, tenantID
return uuid.New().String(), nil
}
// CopyFile is a no-op for the dummy service: it logs a warning and returns the
// source path unchanged with a nil error (the shared reference is intentional
// in this stub). tenantID and knowledgeID are ignored, so the returned path
// still refers to the source rather than to a new knowledge-owned object.
func (s *DummyFileService) CopyFile(ctx context.Context, srcPath string, tenantID uint64, knowledgeID string) (string, error) {
logger.Warnf(ctx, "[dummy] CopyFile no-op: returning source path %q unchanged (no real copy performed)", srcPath)
return srcPath, nil
}
// GetFileURL returns the file path as URL (dummy implementation)
func (s *DummyFileService) GetFileURL(ctx context.Context, filePath string) (string, error) {
return filePath, nil

View File

@@ -0,0 +1,9 @@
package file
import "errors"
// ErrCrossBackendCopy is returned by CopyFile implementations when the source
// path belongs to a different storage provider than the destination service.
// Only same-backend copies are supported at present; cross-backend streaming
// copy is intentionally not implemented yet. Implementations wrap this value
// with extra context, so callers should detect it with errors.Is.
var ErrCrossBackendCopy = errors.New("file: cross-backend copy not supported")

View File

@@ -10,6 +10,7 @@ import (
"strings"
"time"
"github.com/Tencent/WeKnora/internal/logger"
"github.com/Tencent/WeKnora/internal/types/interfaces"
"github.com/Tencent/WeKnora/internal/utils"
"github.com/google/uuid"
@@ -177,6 +178,38 @@ func (s *ks3FileService) SaveBytes(ctx context.Context, data []byte, tenantID ui
return fmt.Sprintf("%s%s/%s", ks3Scheme, s.bucketName, objectKey), nil
}
// CopyFile copies an existing KS3 object to a new knowledge-owned object using a
// server-side CopyObject (no data leaves KS3). The destination uses the same
// layout as SaveFile: {pathPrefix}/{tenantID}/{knowledgeID}/{uuid}{ext}, always
// written to this service's bucket.
//
// It returns the ks3:// path of the new object. When srcPath cannot be parsed
// as a ks3:// path the error wraps ErrCrossBackendCopy (detect it with
// errors.Is) and includes the underlying parse failure for diagnosis.
func (s *ks3FileService) CopyFile(ctx context.Context,
	srcPath string, tenantID uint64, knowledgeID string,
) (string, error) {
	srcBucket, srcKey, err := parseKS3FilePath(srcPath)
	if err != nil {
		// Keep the sentinel for errors.Is while surfacing why parsing failed.
		return "", fmt.Errorf("ks3 copy rejected source %q: %w (%v)", srcPath, ErrCrossBackendCopy, err)
	}
	if err := utils.SafeObjectKey(srcKey); err != nil {
		return "", fmt.Errorf("invalid source path: %w", err)
	}

	// Preserve the source extension so the copy keeps its file type.
	ext := filepath.Ext(srcPath)
	destKey := joinKS3Key(s.pathPrefix, fmt.Sprintf("%d", tenantID), knowledgeID, uuid.New().String()+ext)

	_, err = s.client.CopyObject(&ks3s3.CopyObjectInput{
		Bucket:       ks3aws.String(s.bucketName),
		Key:          ks3aws.String(destKey),
		SourceBucket: ks3aws.String(srcBucket),
		SourceKey:    ks3aws.String(srcKey),
	})
	if err != nil {
		return "", fmt.Errorf("failed to copy file in KS3: %w", err)
	}

	newPath := fmt.Sprintf("%s%s/%s", ks3Scheme, s.bucketName, destKey)
	logger.Infof(ctx, "Copied KS3 object %s to %s", srcPath, newPath)
	return newPath, nil
}
func (s *ks3FileService) GetFile(ctx context.Context, filePath string) (io.ReadCloser, error) {
_, objectKey, err := parseKS3FilePath(filePath)
if err != nil {

View File

@@ -151,6 +151,64 @@ func (s *localFileService) DeleteFile(ctx context.Context, filePath string) erro
return nil
}
// CopyFile copies an existing local object to a new knowledge-owned object.
//
// The destination uses the same layout as SaveFile
// (baseDir/{tenantID}/{knowledgeID}/{unique}{ext}), and the copy is a real
// byte-for-byte copy (no hardlink) so deleting the source never affects it.
//
// It returns the local:// path of the new object. Errors:
//   - a wrapped ErrCrossBackendCopy when srcPath carries a provider scheme
//     other than local://;
//   - a path error when the source or destination escapes baseDir;
//   - an I/O error when opening, creating, copying or finalizing fails. In that
//     case any partially written destination file is removed again.
func (s *localFileService) CopyFile(ctx context.Context,
	srcPath string, tenantID uint64, knowledgeID string,
) (string, error) {
	// Number of distinct destination names to try before giving up on a collision.
	const maxNameAttempts = 5

	// Only local paths are accepted. A provider scheme other than local://
	// (e.g. s3://, minio://) means a cross-backend copy, which this service
	// does not support. Legacy bare/absolute paths have no scheme and pass.
	if i := strings.Index(srcPath, "://"); i >= 0 && srcPath[:i+3] != localScheme {
		return "", fmt.Errorf("local file service cannot copy %q: %w", srcPath, ErrCrossBackendCopy)
	}

	// Validate and resolve the source path under baseDir (same guard as GetFile).
	srcCandidate := s.normalizePathForBase(srcPath)
	srcResolved, err := secutils.SafePathUnderBase(s.baseDir, srcCandidate)
	if err != nil {
		logger.Errorf(ctx, "Path traversal denied for CopyFile src: %v", err)
		return "", fmt.Errorf("invalid source path: %w", err)
	}

	// Build destination directory with the knowledge-owned layout.
	dir := filepath.Join(s.baseDir, fmt.Sprintf("%d", tenantID), knowledgeID)
	if _, err := secutils.SafePathUnderBase(s.baseDir, dir); err != nil {
		logger.Errorf(ctx, "Path traversal denied for CopyFile dir: %v", err)
		return "", fmt.Errorf("invalid path: %w", err)
	}

	// Open the source before touching the destination tree so that a missing
	// source does not leave an empty directory behind.
	src, err := os.Open(srcResolved)
	if err != nil {
		return "", fmt.Errorf("failed to open source file: %w", err)
	}
	defer src.Close()

	if err := os.MkdirAll(dir, 0o755); err != nil {
		return "", fmt.Errorf("failed to create directory: %w", err)
	}

	// Create the destination exclusively: a timestamp collision must never
	// truncate an object that already belongs to someone else. On collision
	// the attempt index is added to the timestamp so every retry uses a
	// different name even on a coarse clock.
	ext := filepath.Ext(srcPath)
	var (
		dst     *os.File
		dstPath string
	)
	for attempt := 0; attempt < maxNameAttempts; attempt++ {
		name := fmt.Sprintf("%d%s", time.Now().UnixNano()+int64(attempt), ext)
		dstPath = filepath.Join(dir, name)
		dst, err = os.OpenFile(dstPath, os.O_WRONLY|os.O_CREATE|os.O_EXCL, 0o666)
		if err == nil || !os.IsExist(err) {
			break
		}
	}
	if err != nil {
		return "", fmt.Errorf("failed to create destination file: %w", err)
	}

	// discard removes the destination after a failure so no truncated or
	// unreferenced object is left in storage.
	discard := func() {
		if rmErr := os.Remove(dstPath); rmErr != nil {
			logger.Errorf(ctx, "Failed to remove partial copy %s: %v", dstPath, rmErr)
		}
	}

	// Close explicitly (not via defer): a failed close can mean the data never
	// reached disk, which must be reported as a failed copy.
	_, copyErr := io.Copy(dst, src)
	closeErr := dst.Close()
	if copyErr != nil {
		discard()
		return "", fmt.Errorf("failed to copy file content: %w", copyErr)
	}
	if closeErr != nil {
		discard()
		return "", fmt.Errorf("failed to finalize destination file: %w", closeErr)
	}

	relPath, err := filepath.Rel(s.baseDir, dstPath)
	if err != nil {
		discard()
		return "", fmt.Errorf("failed to build relative path: %w", err)
	}

	newPath := localScheme + filepath.ToSlash(relPath)
	logger.Infof(ctx, "Copied local file %s to %s", srcPath, newPath)
	return newPath, nil
}
// SaveBytes saves bytes data to a file and returns the file path
// temp parameter is ignored for local storage (no auto-expiration support)
// fileName 仅允许安全文件名,禁止路径遍历(如 ../../

View File

@@ -0,0 +1,90 @@
package file
import (
	"context"
	"errors"
	"io"
	"os"
	"path/filepath"
	"strconv"
	"strings"
	"testing"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)
// seedLocalObject writes data to baseDir/{tenantID}/src-knowledge/{name} and
// returns the corresponding local:// path (relative to baseDir, with forward
// slashes). Any filesystem error fails the test immediately.
func seedLocalObject(t *testing.T, baseDir string, tenantID uint64, name string, data []byte) string {
	t.Helper()

	// Honor the tenantID argument instead of a hard-coded tenant directory.
	dir := filepath.Join(baseDir, strconv.FormatUint(tenantID, 10), "src-knowledge")
	require.NoError(t, os.MkdirAll(dir, 0o755))

	full := filepath.Join(dir, name)
	require.NoError(t, os.WriteFile(full, data, 0o644))

	rel, err := filepath.Rel(baseDir, full)
	require.NoError(t, err)
	return localScheme + filepath.ToSlash(rel)
}
// readLocal fetches path through svc.GetFile and returns the complete
// contents. Any open or read error fails the test immediately.
func readLocal(t *testing.T, svc interface {
	GetFile(context.Context, string) (io.ReadCloser, error)
}, path string) []byte {
	t.Helper()

	reader, openErr := svc.GetFile(context.Background(), path)
	require.NoError(t, openErr)
	defer reader.Close()

	contents, readErr := io.ReadAll(reader)
	require.NoError(t, readErr)
	return contents
}
// TestLocalCopyFile_IndependentCopy checks that CopyFile produces a real,
// independent copy: the new object lives under the destination's
// knowledge-owned path, keeps the source extension, is byte-identical, and is
// still readable after the source object has been deleted.
func TestLocalCopyFile_IndependentCopy(t *testing.T) {
	ctx := context.Background()
	root := t.TempDir()
	fileSvc := NewLocalFileService(root, "")

	payload := []byte("hello deep copy")
	source := seedLocalObject(t, root, 0, "doc.txt", payload)

	copied, err := fileSvc.CopyFile(ctx, source, 42, "dst-knowledge")
	require.NoError(t, err)

	// The new path must be knowledge-owned: local://42/dst-knowledge/<unique>.txt
	wantPrefix := localScheme + "42/dst-knowledge/"
	require.True(t, strings.HasPrefix(copied, wantPrefix),
		"unexpected dst path: %s", copied)
	assert.Equal(t, ".txt", filepath.Ext(copied))

	// The copy is readable and byte-identical.
	assert.Equal(t, payload, readLocal(t, fileSvc, copied))

	// Removing the source must leave the copy intact.
	require.NoError(t, fileSvc.DeleteFile(ctx, source))
	assert.Equal(t, payload, readLocal(t, fileSvc, copied),
		"copy should survive deletion of source")
}
// TestLocalCopyFile_CrossBackend checks that the local service refuses a
// source path carrying a non-local provider scheme and reports it as
// ErrCrossBackendCopy.
func TestLocalCopyFile_CrossBackend(t *testing.T) {
	fileSvc := NewLocalFileService(t.TempDir(), "")

	_, copyErr := fileSvc.CopyFile(context.Background(), "s3://bucket/10/exports/a.png", 7, "k")

	require.Error(t, copyErr)
	isCrossBackend := errors.Is(copyErr, ErrCrossBackendCopy)
	assert.True(t, isCrossBackend,
		"expected ErrCrossBackendCopy, got %v", copyErr)
}
// TestLocalCopyFile_TraversalRejected checks that a local:// source path that
// climbs out of the base directory is rejected, and that the rejection is a
// path error rather than ErrCrossBackendCopy.
func TestLocalCopyFile_TraversalRejected(t *testing.T) {
	fileSvc := NewLocalFileService(t.TempDir(), "")
	escaping := localScheme + "../../etc/passwd"

	_, copyErr := fileSvc.CopyFile(context.Background(), escaping, 7, "k")

	require.Error(t, copyErr)
	isCrossBackend := errors.Is(copyErr, ErrCrossBackendCopy)
	assert.False(t, isCrossBackend,
		"traversal should be a path error, not cross-backend")
}

View File

@@ -10,6 +10,7 @@ import (
"strings"
"time"
"github.com/Tencent/WeKnora/internal/logger"
"github.com/Tencent/WeKnora/internal/types/interfaces"
"github.com/Tencent/WeKnora/internal/utils"
"github.com/google/uuid"
@@ -164,6 +165,33 @@ func (s *minioFileService) DeleteFile(ctx context.Context, filePath string) erro
return nil
}
// CopyFile copies an existing MinIO object to a new knowledge-owned object using a
// server-side CopyObject (no data leaves MinIO). The destination uses the same
// layout as SaveFile: {tenantID}/{knowledgeID}/{uuid}{ext} in this service's
// bucket.
//
// It returns the minio:// path of the new object. When srcPath cannot be parsed
// as a path of this MinIO service the error wraps ErrCrossBackendCopy (detect
// it with errors.Is) and includes the underlying parse failure for diagnosis.
func (s *minioFileService) CopyFile(ctx context.Context,
	srcPath string, tenantID uint64, knowledgeID string,
) (string, error) {
	srcKey, err := s.parseMinioFilePath(srcPath)
	if err != nil {
		// Keep the sentinel for errors.Is while surfacing why parsing failed.
		return "", fmt.Errorf("minio copy rejected source %q: %w (%v)", srcPath, ErrCrossBackendCopy, err)
	}
	// Apply the same object-key guard the other object-store backends run on
	// the source key before issuing a copy.
	if err := utils.SafeObjectKey(srcKey); err != nil {
		return "", fmt.Errorf("invalid source path: %w", err)
	}

	// Preserve the source extension so the copy keeps its file type.
	ext := filepath.Ext(srcPath)
	destKey := fmt.Sprintf("%d/%s/%s%s", tenantID, knowledgeID, uuid.New().String(), ext)

	_, err = s.client.CopyObject(ctx,
		minio.CopyDestOptions{Bucket: s.bucketName, Object: destKey},
		minio.CopySrcOptions{Bucket: s.bucketName, Object: srcKey},
	)
	if err != nil {
		return "", fmt.Errorf("failed to copy file in MinIO: %w", err)
	}

	newPath := fmt.Sprintf("minio://%s/%s", s.bucketName, destKey)
	logger.Infof(ctx, "Copied MinIO object %s to %s", srcPath, newPath)
	return newPath, nil
}
// SaveBytes saves bytes data to MinIO and returns the file path
// temp parameter is ignored for MinIO (no auto-expiration support in this implementation)
func (s *minioFileService) SaveBytes(ctx context.Context, data []byte, tenantID uint64, fileName string, temp bool) (string, error) {

View File

@@ -10,6 +10,7 @@ import (
"strings"
"time"
"github.com/Tencent/WeKnora/internal/logger"
"github.com/Tencent/WeKnora/internal/types/interfaces"
"github.com/aws/aws-sdk-go-v2/aws"
"github.com/aws/aws-sdk-go-v2/credentials"
@@ -229,6 +230,54 @@ func (s *obsFileService) GetFileURL(ctx context.Context, filePath string) (strin
return fmt.Sprintf("%s/%s/%s", s.endpoint, s.bucketName, strings.TrimPrefix(objectKey, "/")), nil
}
// CopyFile copies an existing OBS object to a new knowledge-owned object using a
// server-side CopyObject (OBS is S3-compatible). The destination uses the same
// layout as SaveFile: [{pathPrefix}/]{tenantID}/{knowledgeID}/{uuid}{ext}.
//
// It returns the path of the new object, built from this service's prefix
// (the bucket name is included only when no proxy domain is configured). When
// srcPath does not belong to this OBS service the error wraps
// ErrCrossBackendCopy (detect it with errors.Is); a parse failure is included
// in the message for diagnosis.
func (s *obsFileService) CopyFile(ctx context.Context,
	srcPath string, tenantID uint64, knowledgeID string,
) (string, error) {
	prefix := s.getPrifix()

	// Reject paths that do not use this service's prefix (proxy domain or obs://).
	// parseObsFilePath falls back to returning the raw input for unknown prefixes,
	// so guard explicitly here to detect cross-backend sources.
	if !strings.HasPrefix(srcPath, prefix) {
		return "", fmt.Errorf("obs copy rejected source %q: %w", srcPath, ErrCrossBackendCopy)
	}
	srcKey, err := s.parseObsFilePath(srcPath)
	if err != nil {
		// Keep the sentinel for errors.Is while surfacing why parsing failed.
		return "", fmt.Errorf("obs copy rejected source %q: %w (%v)", srcPath, ErrCrossBackendCopy, err)
	}

	// Preserve the source extension so the copy keeps its file type.
	ext := filepath.Ext(srcPath)
	destKey := fmt.Sprintf("%d/%s/%s%s", tenantID, knowledgeID, uuid.New().String(), ext)
	if s.pathPrefix != "" {
		destKey = s.pathPrefix + "/" + destKey
	}

	// CopySource is "bucket/key"; the '/' separators must NOT be percent-encoded
	// (url.PathEscape would turn them into %2F and break the bucket/key split).
	// NOTE(review): keys containing characters that are not header-safe may
	// still need per-segment URL-encoding — confirm against the CopyObject contract.
	_, err = s.client.CopyObject(ctx, &s3.CopyObjectInput{
		Bucket:     aws.String(s.bucketName),
		CopySource: aws.String(s.bucketName + "/" + srcKey),
		Key:        aws.String(destKey),
	})
	if err != nil {
		return "", fmt.Errorf("failed to copy file in OBS: %w", err)
	}

	var newPath string
	if s.proxyDomain != "" {
		newPath = fmt.Sprintf("%s%s", prefix, destKey)
	} else {
		newPath = fmt.Sprintf("%s%s/%s", prefix, s.bucketName, destKey)
	}
	logger.Infof(ctx, "Copied OBS object %s to %s", srcPath, newPath)
	return newPath, nil
}
func (s *obsFileService) SaveBytes(ctx context.Context, data []byte, tenantID uint64, fileName string, temp bool) (string, error) {
ext := filepath.Ext(fileName)

View File

@@ -12,6 +12,7 @@ import (
"strings"
"time"
"github.com/Tencent/WeKnora/internal/logger"
"github.com/Tencent/WeKnora/internal/types/interfaces"
"github.com/Tencent/WeKnora/internal/utils"
"github.com/aliyun/alibabacloud-oss-go-sdk-v2/oss"
@@ -242,6 +243,38 @@ func (s *ossFileService) SaveBytes(ctx context.Context, data []byte, tenantID ui
return fmt.Sprintf("oss://%s/%s", targetBucket, objectName), nil
}
// CopyFile copies an existing OSS object to a new knowledge-owned object using a
// server-side CopyObject (no data leaves OSS). The destination uses the same
// layout as SaveFile: {pathPrefix}{tenantID}/{knowledgeID}/{uuid}{ext}, always
// written to this service's bucket.
//
// It returns the oss:// path of the new object. When srcPath cannot be parsed
// as an oss:// path the error wraps ErrCrossBackendCopy (detect it with
// errors.Is) and includes the underlying parse failure for diagnosis.
func (s *ossFileService) CopyFile(ctx context.Context,
	srcPath string, tenantID uint64, knowledgeID string,
) (string, error) {
	srcBucket, srcKey, err := parseOssFilePath(srcPath)
	if err != nil {
		// Keep the sentinel for errors.Is while surfacing why parsing failed.
		return "", fmt.Errorf("oss copy rejected source %q: %w (%v)", srcPath, ErrCrossBackendCopy, err)
	}
	if err := utils.SafeObjectKey(srcKey); err != nil {
		return "", fmt.Errorf("invalid source path: %w", err)
	}

	// Preserve the source extension so the copy keeps its file type.
	ext := filepath.Ext(srcPath)
	destKey := fmt.Sprintf("%s%d/%s/%s%s", s.pathPrefix, tenantID, knowledgeID, uuid.New().String(), ext)

	_, err = s.client.CopyObject(ctx, &oss.CopyObjectRequest{
		Bucket:       oss.Ptr(s.bucketName),
		Key:          oss.Ptr(destKey),
		SourceBucket: oss.Ptr(srcBucket),
		SourceKey:    oss.Ptr(srcKey),
	})
	if err != nil {
		return "", fmt.Errorf("failed to copy file in OSS: %w", err)
	}

	newPath := fmt.Sprintf("oss://%s/%s", s.bucketName, destKey)
	logger.Infof(ctx, "Copied OSS object %s to %s", srcPath, newPath)
	return newPath, nil
}
// GetFile retrieves a file from OSS by its path.
func (s *ossFileService) GetFile(ctx context.Context, filePath string) (io.ReadCloser, error) {
bucketName, objectName, err := parseOssFilePath(filePath)

View File

@@ -11,6 +11,7 @@ import (
"strings"
"time"
"github.com/Tencent/WeKnora/internal/logger"
"github.com/Tencent/WeKnora/internal/types/interfaces"
"github.com/Tencent/WeKnora/internal/utils"
"github.com/aws/aws-sdk-go-v2/aws"
@@ -244,6 +245,37 @@ func (s *s3FileService) DeleteFile(ctx context.Context, filePath string) error {
return nil
}
// CopyFile copies an existing S3 object to a new knowledge-owned object using a
// server-side CopyObject (no data leaves S3). The destination uses the same
// layout as SaveFile: {pathPrefix}{tenantID}/{knowledgeID}/{uuid}{ext} in this
// service's bucket.
//
// It returns the s3:// path of the new object. When srcPath cannot be parsed
// as a path of this S3 service the error wraps ErrCrossBackendCopy (detect it
// with errors.Is) and includes the underlying parse failure for diagnosis.
func (s *s3FileService) CopyFile(ctx context.Context,
	srcPath string, tenantID uint64, knowledgeID string,
) (string, error) {
	srcKey, err := s.parseS3FilePath(srcPath)
	if err != nil {
		// Keep the sentinel for errors.Is while surfacing why parsing failed
		// (the parser also rejects unsafe object keys, not only foreign schemes).
		return "", fmt.Errorf("s3 copy rejected source %q: %w (%v)", srcPath, ErrCrossBackendCopy, err)
	}

	// Preserve the source extension so the copy keeps its file type.
	ext := filepath.Ext(srcPath)
	destKey := fmt.Sprintf("%s%d/%s/%s%s", s.pathPrefix, tenantID, knowledgeID, uuid.New().String(), ext)

	// CopySource is "bucket/key"; the '/' separators must NOT be percent-encoded
	// (url.PathEscape would turn them into %2F and break the bucket/key split).
	// srcKey is already validated by parseS3FilePath -> SafeObjectKey.
	// NOTE(review): keys containing characters that are not header-safe may
	// still need per-segment URL-encoding — confirm against the CopyObject contract.
	_, err = s.client.CopyObject(ctx, &s3.CopyObjectInput{
		Bucket:     aws.String(s.bucketName),
		CopySource: aws.String(s.bucketName + "/" + srcKey),
		Key:        aws.String(destKey),
	})
	if err != nil {
		return "", fmt.Errorf("failed to copy file in S3: %w", err)
	}

	newPath := fmt.Sprintf("s3://%s/%s", s.bucketName, destKey)
	logger.Infof(ctx, "Copied S3 object %s to %s", srcPath, newPath)
	return newPath, nil
}
// SaveBytes saves bytes data to S3 and returns the file path
// temp parameter is ignored for S3 (no auto-expiration support in this implementation)
func (s *s3FileService) SaveBytes(ctx context.Context, data []byte, tenantID uint64, fileName string, temp bool) (string, error) {

View File

@@ -11,6 +11,7 @@ import (
"strings"
"time"
"github.com/Tencent/WeKnora/internal/logger"
"github.com/Tencent/WeKnora/internal/types/interfaces"
"github.com/Tencent/WeKnora/internal/utils"
"github.com/google/uuid"
@@ -225,6 +226,43 @@ func (s *tosFileService) SaveBytes(ctx context.Context, data []byte, tenantID ui
return fmt.Sprintf("tos://%s/%s", targetBucket, objectName), nil
}
// CopyFile copies an existing TOS object to a new knowledge-owned object using a
// server-side CopyObject (no data leaves TOS). The destination uses the same
// layout as SaveFile: {pathPrefix}/{tenantID}/{knowledgeID}/{uuid}{ext}, always
// written to this service's bucket.
//
// It returns the tos:// path of the new object. When srcPath cannot be parsed
// as a tos:// path the error wraps ErrCrossBackendCopy (detect it with
// errors.Is) and includes the underlying parse failure for diagnosis.
func (s *tosFileService) CopyFile(ctx context.Context,
	srcPath string, tenantID uint64, knowledgeID string,
) (string, error) {
	srcBucket, srcKey, err := parseTOSFilePath(srcPath)
	if err != nil {
		// Keep the sentinel for errors.Is while surfacing why parsing failed.
		return "", fmt.Errorf("tos copy rejected source %q: %w (%v)", srcPath, ErrCrossBackendCopy, err)
	}
	if err := utils.SafeObjectKey(srcKey); err != nil {
		return "", fmt.Errorf("invalid source path: %w", err)
	}

	// Preserve the source extension so the copy keeps its file type.
	ext := filepath.Ext(srcPath)
	destKey := joinTOSObjectKey(
		s.pathPrefix,
		fmt.Sprintf("%d", tenantID),
		knowledgeID,
		uuid.New().String()+ext,
	)

	_, err = s.client.CopyObject(ctx, &tos.CopyObjectInput{
		Bucket:    s.bucketName,
		Key:       destKey,
		SrcBucket: srcBucket,
		SrcKey:    srcKey,
	})
	if err != nil {
		return "", fmt.Errorf("failed to copy file in TOS: %w", err)
	}

	newPath := fmt.Sprintf("tos://%s/%s", s.bucketName, destKey)
	logger.Infof(ctx, "Copied TOS object %s to %s", srcPath, newPath)
	return newPath, nil
}
func (s *tosFileService) GetFile(ctx context.Context, filePath string) (io.ReadCloser, error) {
bucketName, objectName, err := parseTOSFilePath(filePath)
if err != nil {

View File

@@ -0,0 +1,207 @@
package service
import (
"context"
"encoding/json"
"errors"
"fmt"
"io"
"mime/multipart"
"testing"
"github.com/Tencent/WeKnora/internal/types"
)
// countingFileService is a minimal FileService stub for cloneChunkImageInfo tests.
// CopyFile records each invocation and returns a deterministic destination path
// derived from (tenantID, knowledgeID, srcPath) so dedup and rewrite behaviour
// are verifiable. DeleteFile only counts calls; every other method is inert.
type countingFileService struct {
// copyCalls counts CopyFile invocations that succeeded (simulated failures are not counted).
copyCalls int
// copiedFrom lists the source paths of successful CopyFile calls, in call order.
copiedFrom []string
failOnURL string // when non-empty, CopyFile returns an error for this srcPath
// deleteCalls counts DeleteFile invocations.
deleteCalls int
}
// CheckConnectivity always reports success; the stub has no backend to probe.
func (c *countingFileService) CheckConnectivity(ctx context.Context) error { return nil }
// SaveFile is an inert stub: it stores nothing and returns an empty path with a nil error.
func (c *countingFileService) SaveFile(ctx context.Context, file *multipart.FileHeader, tenantID uint64, knowledgeID string) (string, error) {
return "", nil
}
// SaveBytes is an inert stub: it stores nothing and returns an empty path with a nil error.
func (c *countingFileService) SaveBytes(ctx context.Context, data []byte, tenantID uint64, fileName string, temp bool) (string, error) {
return "", nil
}
// GetFile is unsupported by this stub and always returns a "not implemented" error.
func (c *countingFileService) GetFile(ctx context.Context, filePath string) (io.ReadCloser, error) {
return nil, errors.New("not implemented")
}
// GetFileURL echoes filePath back unchanged with a nil error.
func (c *countingFileService) GetFileURL(ctx context.Context, filePath string) (string, error) {
return filePath, nil
}
// DeleteFile records the call in deleteCalls and always succeeds; the path is
// not inspected and nothing is actually removed.
func (c *countingFileService) DeleteFile(ctx context.Context, filePath string) error {
c.deleteCalls++
return nil
}
// CopyFile simulates a copy. When failOnURL is set and equals srcPath it
// returns an error without recording anything. Otherwise it records the call
// (copyCalls, copiedFrom) and returns a deterministic destination path of the
// form local://{tenantID}/{knowledgeID}/copy-of-{srcPath}.
func (c *countingFileService) CopyFile(ctx context.Context, srcPath string, tenantID uint64, knowledgeID string) (string, error) {
	simulateFailure := c.failOnURL != "" && srcPath == c.failOnURL
	if simulateFailure {
		return "", fmt.Errorf("simulated copy failure for %s", srcPath)
	}

	c.copiedFrom = append(c.copiedFrom, srcPath)
	c.copyCalls++

	destination := fmt.Sprintf("local://%d/%s/copy-of-%s", tenantID, knowledgeID, srcPath)
	return destination, nil
}
// mustImageInfoJSON serializes imgs to a JSON string, aborting the test if
// marshalling fails.
func mustImageInfoJSON(t *testing.T, imgs []types.ImageInfo) string {
	t.Helper()

	encoded, marshalErr := json.Marshal(imgs)
	if marshalErr == nil {
		return string(encoded)
	}
	t.Fatalf("marshal image_info: %v", marshalErr)
	return string(encoded)
}
// TestCloneChunkImageInfo_Empty checks that an empty image_info string is
// passed through untouched: empty output, nil copied list, no CopyFile calls.
func TestCloneChunkImageInfo_Empty(t *testing.T) {
	stub := &countingFileService{}
	cache := map[string]string{}

	out, copied, err := cloneChunkImageInfo(context.Background(), stub, "", 1, "kb-1", cache)

	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if !(out == "" && copied == nil) {
		t.Fatalf("expected empty result, got out=%q copied=%v", out, copied)
	}
	if calls := stub.copyCalls; calls != 0 {
		t.Fatalf("expected 0 copies, got %d", calls)
	}
}
// TestCloneChunkImageInfo_RewritesURLAndMatchedOriginal checks that a single
// image is copied exactly once, that its URL is rewritten to the new object,
// that an OriginalURL equal to the URL is rewritten too, and that unrelated
// fields (Caption) are left untouched.
func TestCloneChunkImageInfo_RewritesURLAndMatchedOriginal(t *testing.T) {
	svc := &countingFileService{}
	src := mustImageInfoJSON(t, []types.ImageInfo{
		{URL: "local://1/k0/a.png", OriginalURL: "local://1/k0/a.png", Caption: "cap"},
	})

	out, copied, err := cloneChunkImageInfo(context.Background(), svc, src, 7, "k-dst", map[string]string{})
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if svc.copyCalls != 1 || len(copied) != 1 {
		t.Fatalf("expected exactly 1 copy, got calls=%d copied=%v", svc.copyCalls, copied)
	}

	var got []types.ImageInfo
	if err := json.Unmarshal([]byte(out), &got); err != nil {
		t.Fatalf("unmarshal out: %v", err)
	}
	// Fail cleanly instead of panicking on got[0] if the output lost the entry.
	if len(got) != 1 {
		t.Fatalf("expected 1 image in output, got %d (%s)", len(got), out)
	}

	want := "local://7/k-dst/copy-of-local://1/k0/a.png"
	if got[0].URL != want {
		t.Errorf("URL not rewritten: got %q want %q", got[0].URL, want)
	}
	// OriginalURL equalled URL -> must also be rewritten to the new object.
	if got[0].OriginalURL != want {
		t.Errorf("matched OriginalURL not rewritten: got %q want %q", got[0].OriginalURL, want)
	}
	if got[0].Caption != "cap" {
		t.Errorf("Caption mutated: got %q", got[0].Caption)
	}
}
// TestCloneChunkImageInfo_PreservesUnmatchedOriginalURL checks that an
// OriginalURL which differs from the image URL (here an external https URL) is
// carried over unchanged when the image itself is copied.
func TestCloneChunkImageInfo_PreservesUnmatchedOriginalURL(t *testing.T) {
	svc := &countingFileService{}
	src := mustImageInfoJSON(t, []types.ImageInfo{
		{URL: "local://1/k0/a.png", OriginalURL: "https://external.example.com/a.png"},
	})

	out, _, err := cloneChunkImageInfo(context.Background(), svc, src, 1, "k-dst", map[string]string{})
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}

	var got []types.ImageInfo
	if err := json.Unmarshal([]byte(out), &got); err != nil {
		t.Fatalf("unmarshal out: %v", err)
	}
	// Fail cleanly instead of panicking on got[0] if the output lost the entry.
	if len(got) != 1 {
		t.Fatalf("expected 1 image in output, got %d (%s)", len(got), out)
	}
	if got[0].OriginalURL != "https://external.example.com/a.png" {
		t.Errorf("external OriginalURL must be preserved, got %q", got[0].OriginalURL)
	}
}
// TestCloneChunkImageInfo_DedupsIdenticalURLs checks that two entries sharing
// one source URL trigger a single copy, so three entries with two distinct
// URLs yield exactly two copies and two reported new objects.
func TestCloneChunkImageInfo_DedupsIdenticalURLs(t *testing.T) {
	stub := &countingFileService{}
	images := []types.ImageInfo{
		{URL: "local://1/k0/same.png"},
		{URL: "local://1/k0/same.png"},
		{URL: "local://1/k0/other.png"},
	}
	payload := mustImageInfoJSON(t, images)

	_, copied, err := cloneChunkImageInfo(context.Background(), stub, payload, 1, "k-dst", map[string]string{})

	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if calls := stub.copyCalls; calls != 2 {
		t.Fatalf("expected 2 unique copies (dedup), got %d", calls)
	}
	if len(copied) != 2 {
		t.Fatalf("expected 2 copied URLs, got %v", copied)
	}
}
// TestCloneChunkImageInfo_DedupsAcrossCallsViaSharedCache checks that a URL
// cache shared between two calls prevents the second call from copying an
// image the first call already copied.
func TestCloneChunkImageInfo_DedupsAcrossCallsViaSharedCache(t *testing.T) {
	ctx := context.Background()
	stub := &countingFileService{}
	shared := map[string]string{}
	payload := mustImageInfoJSON(t, []types.ImageInfo{{URL: "local://1/k0/shared.png"}})

	_, _, firstErr := cloneChunkImageInfo(ctx, stub, payload, 1, "k-dst", shared)
	if firstErr != nil {
		t.Fatalf("first call error: %v", firstErr)
	}

	_, secondCopied, secondErr := cloneChunkImageInfo(ctx, stub, payload, 1, "k-dst", shared)
	if secondErr != nil {
		t.Fatalf("second call error: %v", secondErr)
	}
	if len(secondCopied) != 0 {
		t.Fatalf("second call should reuse cache (0 new copies), got %v", secondCopied)
	}

	if stub.copyCalls != 1 {
		t.Fatalf("expected 1 copy total across calls, got %d", stub.copyCalls)
	}
}
// TestCloneChunkImageInfo_ParseFailureAbortsClone checks that malformed
// image_info JSON produces an error and that nothing is copied.
func TestCloneChunkImageInfo_ParseFailureAbortsClone(t *testing.T) {
	stub := &countingFileService{}
	const malformed = "{not valid json"

	_, _, parseErr := cloneChunkImageInfo(context.Background(), stub, malformed, 1, "k-dst", map[string]string{})

	if parseErr == nil {
		t.Fatal("expected error on invalid image_info JSON, got nil")
	}
	if calls := stub.copyCalls; calls != 0 {
		t.Fatalf("expected no copies on parse failure, got %d", calls)
	}
}
// TestCloneChunkImageInfo_CopyFailureReturnsPartialForCleanup checks that when
// a later image fails to copy, the call errors out but still reports the
// objects it had already created, so the caller can delete them.
func TestCloneChunkImageInfo_CopyFailureReturnsPartialForCleanup(t *testing.T) {
	const failing = "local://1/k0/bad.png"
	stub := &countingFileService{failOnURL: failing}
	payload := mustImageInfoJSON(t, []types.ImageInfo{
		{URL: "local://1/k0/good.png"},
		{URL: failing},
	})

	_, copied, copyErr := cloneChunkImageInfo(context.Background(), stub, payload, 1, "k-dst", map[string]string{})

	if copyErr == nil {
		t.Fatal("expected error when an image copy fails")
	}
	// The already-copied "good.png" must be returned so the caller can clean it up.
	if n := len(copied); n != 1 {
		t.Fatalf("expected 1 already-copied URL for rollback, got %v", copied)
	}
}
// TestCloneChunkImageInfo_SkipsEmptyURL checks that an image entry with an
// empty URL is not copied and is emitted with its URL still empty.
func TestCloneChunkImageInfo_SkipsEmptyURL(t *testing.T) {
	svc := &countingFileService{}
	src := mustImageInfoJSON(t, []types.ImageInfo{{URL: "", Caption: "no-image"}})

	out, copied, err := cloneChunkImageInfo(context.Background(), svc, src, 1, "k-dst", map[string]string{})
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if svc.copyCalls != 0 || len(copied) != 0 {
		t.Fatalf("empty URL must be skipped, calls=%d copied=%v", svc.copyCalls, copied)
	}

	var got []types.ImageInfo
	if err := json.Unmarshal([]byte(out), &got); err != nil {
		t.Fatalf("unmarshal out: %v", err)
	}
	// Fail cleanly instead of panicking on got[0] if the output lost the entry.
	if len(got) != 1 {
		t.Fatalf("expected 1 image in output, got %d (%s)", len(got), out)
	}
	if got[0].URL != "" {
		t.Errorf("empty URL should stay empty, got %q", got[0].URL)
	}
}

View File

@@ -13,12 +13,106 @@ import (
"github.com/Tencent/WeKnora/internal/logger"
"github.com/Tencent/WeKnora/internal/tracing/langfuse"
"github.com/Tencent/WeKnora/internal/types"
"github.com/Tencent/WeKnora/internal/types/interfaces"
"github.com/google/uuid"
"github.com/hibiken/asynq"
"github.com/redis/go-redis/v9"
"golang.org/x/sync/errgroup"
)
// copyOwnedObject asks the destination FileService to copy srcPath into a new
// object owned by (tenantID, knowledgeID) and returns the new provider:// path.
//
// The same-backend check is delegated to dstSvc.CopyFile, which returns
// file.ErrCrossBackendCopy when srcPath belongs to another provider; that
// error is handed back unchanged so callers can fail the clone explicitly.
// srcSvc exists only for symmetry with the read side and is currently unused,
// because the copy is issued by the destination service.
func copyOwnedObject(
	ctx context.Context,
	srcSvc, dstSvc interfaces.FileService,
	srcPath string,
	tenantID uint64,
	knowledgeID string,
) (string, error) {
	// Reserved for a future cross-backend streaming fallback.
	_ = srcSvc

	ownedPath, copyErr := dstSvc.CopyFile(ctx, srcPath, tenantID, knowledgeID)
	return ownedPath, copyErr
}
// cloneChunkImageInfo parses a chunk's image_info JSON, copies every referenced
// object into a NEW object owned by (tenantID, knowledgeID), and returns the
// re-serialized image_info plus the list of newly-created object URLs (for
// rollback on failure).
//
// urlCache maps source URL -> new URL and dedups identical source objects
// across chunks, so the same source image is copied at most once per clone.
// Entries served from the cache are not reported in copiedURLs. A nil
// urlCache is tolerated: dedup then applies within this call only.
//
// An empty srcImageInfo yields ("", nil, nil). A JSON parse failure returns an
// error (the clone fails) rather than silently inheriting the shared-reference
// bug. When a copy fails, the objects already created by this call are still
// returned in copiedURLs so the caller can delete them. Entries that are nil
// or have an empty URL are skipped and emitted unchanged.
//
// When an image's OriginalURL points at the same object as its URL (the
// common case for extracted images), OriginalURL is rewritten to the new path
// too; an OriginalURL from a different/external source is preserved.
func cloneChunkImageInfo(
	ctx context.Context,
	dstSvc interfaces.FileService,
	srcImageInfo string,
	tenantID uint64,
	knowledgeID string,
	urlCache map[string]string,
) (newImageInfo string, copiedURLs []string, err error) {
	if srcImageInfo == "" {
		return "", nil, nil
	}
	// Writing to a nil map panics; fall back to a call-local cache instead.
	if urlCache == nil {
		urlCache = make(map[string]string)
	}

	var images []*types.ImageInfo
	if err = json.Unmarshal([]byte(srcImageInfo), &images); err != nil {
		return "", nil, fmt.Errorf("failed to parse chunk image_info JSON: %w", err)
	}

	for _, img := range images {
		if img == nil || img.URL == "" {
			continue
		}
		// Decide before img.URL is overwritten below.
		originalMatchedURL := img.OriginalURL == img.URL

		newURL, cached := urlCache[img.URL]
		if !cached {
			newURL, err = copyOwnedObject(ctx, dstSvc, dstSvc, img.URL, tenantID, knowledgeID)
			if err != nil {
				return "", copiedURLs, fmt.Errorf("failed to copy chunk image %q: %w", img.URL, err)
			}
			urlCache[img.URL] = newURL
			copiedURLs = append(copiedURLs, newURL)
		}

		if originalMatchedURL {
			img.OriginalURL = newURL
		}
		img.URL = newURL
	}

	out, err := json.Marshal(images)
	if err != nil {
		return "", copiedURLs, fmt.Errorf("failed to re-serialize chunk image_info: %w", err)
	}
	return string(out), copiedURLs, nil
}
// cleanupCopiedObjects removes objects that a failed clone had already created,
// so the storage backend is not left with orphans. The cleanup is best-effort:
// a failed delete is logged and skipped, never returned, so the original clone
// error remains the one the caller reports. Empty paths are ignored, and the
// call is a no-op when svc is nil or paths is empty.
func cleanupCopiedObjects(ctx context.Context, svc interfaces.FileService, paths []string) {
	if svc == nil || len(paths) == 0 {
		return
	}

	logger.Infof(ctx, "Cleaning up %d copied objects after clone failure", len(paths))
	for i := range paths {
		objectPath := paths[i]
		if objectPath != "" {
			if delErr := svc.DeleteFile(ctx, objectPath); delErr != nil {
				logger.Errorf(ctx, "Failed to clean up copied object %s: %v", objectPath, delErr)
			}
		}
	}
}
func (s *knowledgeService) CloneKnowledgeBase(ctx context.Context, srcID, dstID string) error {
srcKB, dstKB, err := s.kbService.CopyKnowledgeBase(ctx, srcID, dstID)
if err != nil {
@@ -98,7 +192,7 @@ func (s *knowledgeService) CloneKnowledgeBase(ctx context.Context, srcID, dstID
// and updating the vector database representation of the moved chunks.
// It also ensures that the chunk's relationships (like pre and next chunk IDs) are maintained
// by mapping the source chunk IDs to the new target chunk IDs.
func (s *knowledgeService) CloneChunk(ctx context.Context, src, dst *types.Knowledge) error {
func (s *knowledgeService) CloneChunk(ctx context.Context, src, dst *types.Knowledge) (err error) {
chunkPage := 1
chunkPageSize := 100
srcTodst := map[string]string{}
@@ -108,6 +202,24 @@ func (s *knowledgeService) CloneChunk(ctx context.Context, src, dst *types.Knowl
types.ChunkTypeText, types.ChunkTypeParentText, types.ChunkTypeSummary,
types.ChunkTypeImageCaption, types.ChunkTypeImageOCR,
}
// Resolve the destination FileService so extracted images can be copied
// into objects owned by the destination knowledge. urlCache dedups identical
// source images across chunks; copiedURLs accumulates new objects so they can
// be cleaned up if the clone fails partway through.
dstKB, dstKBErr := s.kbService.GetKnowledgeBaseByID(ctx, dst.KnowledgeBaseID)
if dstKBErr != nil {
return fmt.Errorf("failed to load destination knowledge base for image copy: %w", dstKBErr)
}
dstSvc := s.resolveFileService(ctx, dstKB)
urlCache := map[string]string{}
var copiedURLs []string
defer func() {
if err != nil {
cleanupCopiedObjects(ctx, dstSvc, copiedURLs)
}
}()
for {
sourceChunks, _, err := s.chunkRepo.ListPagedChunksByKnowledgeID(ctx,
src.TenantID,
@@ -143,6 +255,16 @@ func (s *knowledgeService) CloneChunk(ctx context.Context, src, dst *types.Knowl
}
}
// Deep-copy extracted images into objects owned by the destination
// knowledge so deleting the source never breaks this clone.
newImageInfo, copied, copyErr := cloneChunkImageInfo(
ctx, dstSvc, sourceChunk.ImageInfo, dst.TenantID, dst.ID, urlCache)
if copyErr != nil {
err = fmt.Errorf("clone chunk image copy failed: %w", copyErr)
return err
}
copiedURLs = append(copiedURLs, copied...)
targetChunk := &types.Chunk{
ID: uuid.New().String(),
TenantID: dst.TenantID,
@@ -162,7 +284,7 @@ func (s *knowledgeService) CloneChunk(ctx context.Context, src, dst *types.Knowl
ParentChunkID: sourceChunk.ParentChunkID,
Metadata: sourceChunk.Metadata,
ContentHash: sourceChunk.ContentHash,
ImageInfo: sourceChunk.ImageInfo,
ImageInfo: newImageInfo,
CreatedAt: now,
UpdatedAt: now,
}
@@ -406,7 +528,19 @@ func (s *knowledgeService) cloneFAQKnowledgeBase(
srcKB, dstKB *types.KnowledgeBase,
progress *types.KBCloneProgress,
handleError func(*types.KBCloneProgress, error, string),
) error {
) (retErr error) {
// Deep-copy extracted FAQ images into objects owned by the destination KB.
// urlCache dedups identical source images across chunks; copiedURLs tracks
// new objects for best-effort cleanup if the clone fails partway through.
dstSvc := s.resolveFileService(ctx, dstKB)
imageURLCache := map[string]string{}
var copiedImageURLs []string
defer func() {
if retErr != nil {
cleanupCopiedObjects(ctx, dstSvc, copiedImageURLs)
}
}()
// Get source FAQ knowledge first (FAQ KB has exactly one Knowledge entry)
srcKnowledgeList, err := s.repo.ListKnowledgeByKnowledgeBaseID(ctx, srcKB.TenantID, srcKB.ID)
if err != nil {
@@ -540,6 +674,18 @@ func (s *knowledgeService) cloneFAQKnowledgeBase(
}
}
// Deep-copy extracted images into objects owned by the destination
// FAQ knowledge so deleting the source never breaks this clone.
newImageInfo, copied, copyErr := cloneChunkImageInfo(
ctx, dstSvc, srcChunk.ImageInfo, dstKB.TenantID, dstKnowledge.ID, imageURLCache)
if copyErr != nil {
logger.Errorf(ctx, "Failed to copy FAQ chunk images: %v", copyErr)
handleError(progress, copyErr, "Failed to copy FAQ entry images")
retErr = copyErr
return retErr
}
copiedImageURLs = append(copiedImageURLs, copied...)
newChunk := &types.Chunk{
ID: uuid.New().String(),
TenantID: dstKB.TenantID,
@@ -553,7 +699,7 @@ func (s *knowledgeService) cloneFAQKnowledgeBase(
ChunkType: types.ChunkTypeFAQ,
Metadata: srcChunk.Metadata,
ContentHash: srcChunk.ContentHash,
ImageInfo: srcChunk.ImageInfo,
ImageInfo: newImageInfo,
Status: int(types.ChunkStatusStored), // Initially stored, will be indexed
CreatedAt: time.Now(),
UpdatedAt: time.Now(),

View File

@@ -102,6 +102,10 @@ func (s *createKnowledgeFileServiceStub) DeleteFile(ctx context.Context, filePat
return nil
}
// CopyFile matches the FileService.CopyFile signature for this test stub. It
// performs no copy and always returns an empty path together with a
// "not implemented" error.
func (s *createKnowledgeFileServiceStub) CopyFile(ctx context.Context, srcPath string, tenantID uint64, knowledgeID string) (string, error) {
return "", errors.New("not implemented")
}
// createKnowledgeTaskEnqueuerStub is a test stub that carries a call counter.
type createKnowledgeTaskEnqueuerStub struct {
// calls is presumably incremented once per enqueue request — confirm against
// the stub's methods, which are outside this view.
calls int
}

View File

@@ -59,8 +59,32 @@ func (s *knowledgeService) cloneKnowledge(
StorageSize: src.StorageSize,
Metadata: src.Metadata,
}
// Deep-copy the source document file into an object owned by the destination
// knowledge. Without this the clone only shares the source's storage path, so
// deleting the source knowledge would destroy the clone's file too. The new
// object is tracked for cleanup if the clone fails downstream.
var copiedFilePaths []string
if src.FilePath != "" {
srcKB, kbErr := s.kbService.GetKnowledgeBaseByID(ctx, src.KnowledgeBaseID)
if kbErr != nil {
return fmt.Errorf("clone knowledge: failed to load source knowledge base: %w", kbErr)
}
srcSvc := s.resolveFileServiceForPath(ctx, srcKB, src.FilePath)
dstSvc := s.resolveFileService(ctx, targetKB)
newPath, copyErr := copyOwnedObject(ctx, srcSvc, dstSvc, src.FilePath, targetKB.TenantID, dst.ID)
if copyErr != nil {
return fmt.Errorf("clone knowledge file copy failed: %w", copyErr)
}
dst.FilePath = newPath
copiedFilePaths = append(copiedFilePaths, newPath)
}
defer func() {
if err != nil {
if len(copiedFilePaths) > 0 {
cleanupCopiedObjects(ctx, s.resolveFileService(ctx, targetKB), copiedFilePaths)
}
dst.ParseStatus = "failed"
dst.ErrorMessage = err.Error()
_ = s.repo.UpdateKnowledge(ctx, dst)

View File

@@ -4,6 +4,7 @@ import (
"context"
"encoding/json"
"errors"
"fmt"
"sort"
"strings"
"time"
@@ -921,6 +922,25 @@ func (s *knowledgeBaseService) CopyKnowledgeBase(ctx context.Context,
"source and target knowledge bases are bound to different vector stores; " +
"cross-store cloning is not yet supported")
}
// Defense 3: storage backend must match — only meaningful when the
// tenant has a StorageEngineConfig. Without it, resolveFileService
// ignores per-KB provider pins and routes ALL KBs to the global
// storage service, so a clone can never span two real backends and
// the pins must NOT be used to reject (that would be a false positive).
// When a tenant config exists, pins are honored, so compare effective
// providers and reject a genuine cross-backend clone up front (it would
// otherwise fail mid-clone with ErrCrossBackendCopy).
if tenant, _ := ctx.Value(types.TenantInfoContextKey).(*types.Tenant); tenant != nil && tenant.StorageEngineConfig != nil {
tenantDefault := tenant.StorageEngineConfig.DefaultProvider
srcProvider := sourceKB.EffectiveStorageProvider(tenantDefault)
dstProvider := targetKB.EffectiveStorageProvider(tenantDefault)
if srcProvider != "" && dstProvider != "" && srcProvider != dstProvider {
return nil, nil, apperrors.NewBadRequestError(fmt.Sprintf(
"source and target knowledge bases use different storage backends (%s vs %s); "+
"cross-storage-backend cloning is not supported", srcProvider, dstProvider))
}
}
} else {
var faqConfig *types.FAQConfig
if sourceKB.FAQConfig != nil {

View File

@@ -1003,6 +1003,24 @@ func (h *KnowledgeBaseHandler) CopyKnowledgeBase(c *gin.Context) {
"cross-store cloning is not yet supported"))
return
}
// Pre-flight defense 3: storage backend must match — only meaningful
// when the tenant has a StorageEngineConfig. Without it,
// resolveFileService ignores per-KB provider pins and routes ALL KBs to
// the global storage service, so a clone can never span two real
// backends and the pins must NOT be used to reject (false positive).
// When a tenant config exists, pins are honored, so reject a genuine
// cross-backend clone before enqueueing.
if tenant, _ := ctx.Value(types.TenantInfoContextKey).(*types.Tenant); tenant != nil && tenant.StorageEngineConfig != nil {
tenantDefault := tenant.StorageEngineConfig.DefaultProvider
srcProvider := sourceKB.EffectiveStorageProvider(tenantDefault)
dstProvider := targetKB.EffectiveStorageProvider(tenantDefault)
if srcProvider != "" && dstProvider != "" && srcProvider != dstProvider {
c.Error(apperrors.NewBadRequestError(
"source and target knowledge bases use different storage backends (" +
srcProvider + " vs " + dstProvider + "); cross-storage-backend cloning is not supported"))
return
}
}
}
// Generate task ID if not provided

View File

@@ -39,6 +39,10 @@ func (s *stubIMFileService) GetFileURL(ctx context.Context, filePath string) (st
// DeleteFile is a no-op for the stub: it ignores its arguments and always
// reports success.
func (s *stubIMFileService) DeleteFile(context.Context, string) error { return nil }
// CopyFile is a no-op for the stub: it ignores its arguments and returns an
// empty path with a nil error.
func (s *stubIMFileService) CopyFile(context.Context, string, uint64, string) (string, error) {
return "", nil
}
func TestBuildIMFileServiceForProvider_FallbackToGlobal(t *testing.T) {
stub := &stubIMFileService{}
tenant := &types.Tenant{

View File

@@ -104,6 +104,10 @@ func (c *captureSaveBytes) GetFileURL(context.Context, string) (string, error) {
// DeleteFile is a no-op for this test double: it ignores its arguments and
// always reports success.
func (c *captureSaveBytes) DeleteFile(context.Context, string) error { return nil }
// CopyFile is a no-op for this test double: it ignores its arguments and
// returns an empty path with a nil error.
func (c *captureSaveBytes) CopyFile(context.Context, string, uint64, string) (string, error) {
return "", nil
}
var _ interfaces.FileService = (*captureSaveBytes)(nil)
func TestResolveDataURIImages(t *testing.T) {

View File

@@ -37,6 +37,9 @@ func (m *mockFileService) GetFileURL(ctx context.Context, filePath string) (stri
return filePath, nil
}
// DeleteFile is a no-op for the mock: it ignores filePath and always reports
// success.
func (m *mockFileService) DeleteFile(ctx context.Context, filePath string) error { return nil }
// CopyFile is a no-op for the mock: it ignores its arguments and returns an
// empty path with a nil error.
func (m *mockFileService) CopyFile(ctx context.Context, srcPath string, tenantID uint64, knowledgeID string) (string, error) {
return "", nil
}
func TestResolveRemoteImages_NormalDownload(t *testing.T) {
// Whitelist localhost for this test so the test server is reachable

View File

@@ -49,6 +49,10 @@ func (s *stubFileService) DeleteFile(ctx context.Context, filePath string) error
panic("unexpected call to DeleteFile")
}
// CopyFile panics unconditionally: the tests using this stub are not expected
// to copy files, so any call is surfaced as an immediate failure.
func (s *stubFileService) CopyFile(ctx context.Context, srcPath string, tenantID uint64, knowledgeID string) (string, error) {
panic("unexpected call to CopyFile")
}
func TestServeFilesFallsBackToGlobalFileService(t *testing.T) {
gin.SetMode(gin.TestMode)
t.Setenv("STORAGE_TYPE", "local")

View File

@@ -23,4 +23,9 @@ type FileService interface {
GetFileURL(ctx context.Context, filePath string) (string, error)
// DeleteFile deletes a file.
DeleteFile(ctx context.Context, filePath string) error
// CopyFile copies an existing stored object to a NEW object owned by
// (tenantID, knowledgeID), returning the new provider:// path. The copy is
// independent: deleting the source never affects it. Returns ErrCrossBackendCopy
// when srcPath belongs to a different storage provider than this service.
CopyFile(ctx context.Context, srcPath string, tenantID uint64, knowledgeID string) (string, error)
}

View File

@@ -296,6 +296,17 @@ func (kb *KnowledgeBase) GetStorageProvider() string {
return strings.ToLower(strings.TrimSpace(kb.StorageConfig.Provider))
}
// EffectiveStorageProvider resolves the storage provider this KB effectively
// uses: the provider pinned on the KB when there is one, otherwise the given
// tenant default normalized by trimming surrounding whitespace and
// lower-casing. The result is "" when neither is set.
//
// It is intended to mirror the selection logic in resolveFileService and is
// used by the clone preflight checks to detect cross-storage-backend clones
// (which are not supported).
func (kb *KnowledgeBase) EffectiveStorageProvider(tenantDefault string) string {
	pinned := kb.GetStorageProvider()
	if pinned == "" {
		// No per-KB pin: inherit the tenant-wide default.
		return strings.ToLower(strings.TrimSpace(tenantDefault))
	}
	return pinned
}
// SetStorageProvider writes the provider to the new StorageProviderConfig field.
func (kb *KnowledgeBase) SetStorageProvider(provider string) {
if kb == nil {

View File

@@ -157,8 +157,6 @@ func TestKnowledgeBase_UnmarshalJSON_WithVectorStoreID(t *testing.T) {
// If a future change introduces such a shadow, the value above would fail to populate.
}
// TestKnowledgeBase_HasVectorStore covers the nil-safe binding accessor.
func TestKnowledgeBase_HasVectorStore(t *testing.T) {
t.Run("nil receiver returns false", func(t *testing.T) {
@@ -269,3 +267,51 @@ func TestKnowledgeBase_SharesStoreWith(t *testing.T) {
})
}
}
// TestEffectiveStorageProvider is a table-driven check of provider resolution:
// a KB-level pin wins, an unpinned KB inherits the tenant default, and the
// tenant default is trimmed and lower-cased before being returned.
func TestEffectiveStorageProvider(t *testing.T) {
	type testCase struct {
		name          string
		kbProvider    string
		tenantDefault string
		want          string
	}
	cases := []testCase{
		{name: "kb pins provider", kbProvider: "minio", tenantDefault: "cos", want: "minio"},
		{name: "kb empty falls back to tenant default", kbProvider: "", tenantDefault: "cos", want: "cos"},
		{name: "both empty", kbProvider: "", tenantDefault: "", want: ""},
		{name: "tenant default cased", kbProvider: "", tenantDefault: "  COS ", want: "cos"},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			kb := &KnowledgeBase{}
			// Only attach a provider config when the case pins one; an empty
			// kbProvider models a KB without any pin.
			if tc.kbProvider != "" {
				kb.StorageProviderConfig = &StorageProviderConfig{Provider: tc.kbProvider}
			}

			got := kb.EffectiveStorageProvider(tc.tenantDefault)
			if got == tc.want {
				return
			}
			t.Errorf("EffectiveStorageProvider(%q) with kb=%q = %q, want %q",
				tc.tenantDefault, tc.kbProvider, got, tc.want)
		})
	}
}
// TestEffectiveStorageProvider_CrossBackendDetection pins down the comparison
// used by the clone preflight: two knowledge bases count as cross-backend only
// when both effective providers are non-empty and different from each other.
func TestEffectiveStorageProvider_CrossBackendDetection(t *testing.T) {
	const tenantDefault = "minio"

	inheriting := &KnowledgeBase{} // no pin, so it resolves to the tenant default
	pinned := &KnowledgeBase{StorageProviderConfig: &StorageProviderConfig{Provider: "cos"}}

	srcProvider := inheriting.EffectiveStorageProvider(tenantDefault)
	dstProvider := pinned.EffectiveStorageProvider(tenantDefault)

	mismatch := srcProvider != "" && dstProvider != "" && srcProvider != dstProvider
	if !mismatch {
		t.Fatalf("expected cross-backend mismatch, got src=%q dst=%q", srcProvider, dstProvider)
	}

	// A second unpinned KB inherits the same tenant default and must therefore
	// NOT be flagged as a different backend.
	alsoInheriting := &KnowledgeBase{}
	if got := alsoInheriting.EffectiveStorageProvider(tenantDefault); got != srcProvider {
		t.Errorf("same tenant default should match: got %q vs %q", got, srcProvider)
	}
}