Files
WeKnora/internal/application/service/file/local_copy_test.go
ochan.kwon e9980c6011 fix: deep-copy stored files and images when cloning a knowledge base
Cloning a knowledge base previously copied only the storage path strings
(knowledge.FilePath and chunk.ImageInfo.URL), so the source and the clone
shared the same physical objects in the storage backend. Once the original
file and extracted images are deleted on source removal, the clone is left
with dangling references and its document and images become unreadable —
data loss that occurs even for same-store clones.

Add a CopyFile primitive to the FileService interface and implement it in
every backend: server-side CopyObject on the object stores
(s3/obs/cos/oss/tos/ks3/minio), io.Copy on local, and a no-op on dummy.
Destinations use the knowledge-owned layout and reuse the existing
path/object-key guards; a sentinel ErrCrossBackendCopy is returned when the
source scheme does not match the backend.

Use CopyFile to deep-copy the document file in cloneKnowledge and the
extracted images in CloneChunk and cloneFAQKnowledgeBase via a shared
cloneChunkImageInfo helper that deduplicates identical image URLs per clone
and rewrites them to the new objects. Copied objects are cleaned up
best-effort if a clone fails partway through. A clone-time preflight rejects
cloning into a target bound to a different storage backend when the tenant
pins providers via StorageEngineConfig.

Add unit tests for local CopyFile (independent copy survives source
deletion, traversal rejection, cross-backend rejection), cloneChunkImageInfo
(empty/multi/dedup/parse-failure/OriginalURL handling), and the storage
provider preflight.
2026-06-03 14:45:59 +08:00

91 lines
3.0 KiB
Go

package file
import (
	"context"
	"errors"
	"io"
	"os"
	"path/filepath"
	"strconv"
	"strings"
	"testing"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)
// seedLocalObject writes data to <baseDir>/<tenantID>/src-knowledge/<name>,
// creating the directory if it does not exist, and returns the object's path
// as localScheme followed by the slash-separated path relative to baseDir.
// Any filesystem error fails the test immediately.
func seedLocalObject(t *testing.T, baseDir string, tenantID uint64, name string, data []byte) string {
	t.Helper()

	// Key the directory by the requested tenant instead of a fixed "0", so the
	// tenantID argument is honoured and seeds for different tenants do not
	// land in the same directory.
	dir := filepath.Join(baseDir, strconv.FormatUint(tenantID, 10), "src-knowledge")
	require.NoError(t, os.MkdirAll(dir, 0o755))

	full := filepath.Join(dir, name)
	require.NoError(t, os.WriteFile(full, data, 0o644))

	rel, err := filepath.Rel(baseDir, full)
	require.NoError(t, err)

	// Storage paths use forward slashes regardless of the host OS separator.
	return localScheme + filepath.ToSlash(rel)
}
// readLocal opens path through svc.GetFile and returns everything the
// resulting reader yields. A failure to open or to read fails the test
// immediately; the reader is closed before returning.
func readLocal(t *testing.T, svc interface {
	GetFile(context.Context, string) (io.ReadCloser, error)
}, path string) []byte {
	t.Helper()

	reader, openErr := svc.GetFile(context.Background(), path)
	require.NoError(t, openErr)
	defer reader.Close()

	contents, readErr := io.ReadAll(reader)
	require.NoError(t, readErr)

	return contents
}
// TestLocalCopyFile_IndependentCopy checks that CopyFile produces a separate
// object rather than a second reference to the source: the returned path sits
// under the destination tenant/knowledge prefix, keeps the source extension,
// reads back byte-for-byte, and is still readable after the source is deleted.
func TestLocalCopyFile_IndependentCopy(t *testing.T) {
	var (
		ctx     = context.Background()
		root    = t.TempDir()
		payload = []byte("hello deep copy")
	)
	store := NewLocalFileService(root, "")
	source := seedLocalObject(t, root, 0, "doc.txt", payload)

	copied, copyErr := store.CopyFile(ctx, source, 42, "dst-knowledge")
	require.NoError(t, copyErr)

	// The destination must live under <scheme>42/dst-knowledge/ and keep ".txt".
	wantPrefix := localScheme + "42/dst-knowledge/"
	require.True(t, strings.HasPrefix(copied, wantPrefix),
		"unexpected dst path: %s", copied)
	assert.Equal(t, ".txt", filepath.Ext(copied))

	// Freshly copied object has the same bytes as the source.
	assert.Equal(t, payload, readLocal(t, store, copied))

	// Removing the source must not affect the copy.
	require.NoError(t, store.DeleteFile(ctx, source))
	assert.Equal(t, payload, readLocal(t, store, copied),
		"copy should survive deletion of source")
}
// TestLocalCopyFile_CrossBackend checks that the local service refuses a
// source path carrying an s3:// scheme and reports ErrCrossBackendCopy.
func TestLocalCopyFile_CrossBackend(t *testing.T) {
	store := NewLocalFileService(t.TempDir(), "")
	const foreignSrc = "s3://bucket/10/exports/a.png"

	_, copyErr := store.CopyFile(context.Background(), foreignSrc, 7, "k")
	require.Error(t, copyErr)

	isCrossBackend := errors.Is(copyErr, ErrCrossBackendCopy)
	assert.True(t, isCrossBackend,
		"expected ErrCrossBackendCopy, got %v", copyErr)
}
// TestLocalCopyFile_TraversalRejected checks that a local-scheme source path
// containing "../" segments makes CopyFile fail, and that the failure is not
// reported as ErrCrossBackendCopy (the scheme itself is the local one).
func TestLocalCopyFile_TraversalRejected(t *testing.T) {
	store := NewLocalFileService(t.TempDir(), "")
	escaping := localScheme + "../../etc/passwd"

	_, copyErr := store.CopyFile(context.Background(), escaping, 7, "k")
	require.Error(t, copyErr)

	isCrossBackend := errors.Is(copyErr, ErrCrossBackendCopy)
	assert.False(t, isCrossBackend,
		"traversal should be a path error, not cross-backend")
}