mirror of
https://github.com/Tencent/WeKnora.git
synced 2026-06-04 13:30:32 +08:00
Cloning a knowledge base previously copied only the storage path strings (knowledge.FilePath and chunk.ImageInfo.URL), so the source and the clone shared the same physical objects in the storage backend. Once the original file and extracted images are deleted on source removal, the clone is left with dangling references and its document and images become unreadable — data loss that occurs even for same-store clones. Add a CopyFile primitive to the FileService interface and implement it in every backend: server-side CopyObject on the object stores (s3/obs/cos/oss/tos/ks3/minio), io.Copy on local, and a no-op on dummy. Destinations use the knowledge-owned layout and reuse the existing path/object-key guards; a sentinel ErrCrossBackendCopy is returned when the source scheme does not match the backend. Use CopyFile to deep-copy the document file in cloneKnowledge and the extracted images in CloneChunk and cloneFAQKnowledgeBase via a shared cloneChunkImageInfo helper that deduplicates identical image URLs per clone and rewrites them to the new objects. Copied objects are cleaned up best-effort if a clone fails partway through. A clone-time preflight rejects cloning into a target bound to a different storage backend when the tenant pins providers via StorageEngineConfig. Adds unit tests for local CopyFile (independent copy survives source deletion, traversal rejection, cross-backend rejection), cloneChunkImageInfo (empty/multi/dedup/parse-failure/OriginalURL handling), and the storage provider preflight.
165 lines
5.0 KiB
Go
165 lines
5.0 KiB
Go
package tools
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"errors"
|
|
"io"
|
|
"mime/multipart"
|
|
"os"
|
|
"path/filepath"
|
|
"strings"
|
|
"testing"
|
|
|
|
"github.com/Tencent/WeKnora/internal/types"
|
|
)
|
|
|
|
// errFakeNotImplemented is returned by every fakeFileService method that the
// tests in this file do not expect to be reached.
var errFakeNotImplemented = errors.New("not implemented in fake")

// fakeFileService implements interfaces.FileService just enough to drive
// materializeKnowledgeFile. Methods whose result a caller would consume
// (SaveFile, SaveBytes, CopyFile) fail with errFakeNotImplemented so that
// accidental usage is loud; CheckConnectivity and DeleteFile are harmless
// no-ops because they return nothing a caller could misuse.
type fakeFileService struct {
	// readers maps a storage path to a factory that produces the reader (or
	// the error) GetFile returns for that path.
	readers map[string]func() (io.ReadCloser, error)
}

// CheckConnectivity always reports success.
func (f *fakeFileService) CheckConnectivity(ctx context.Context) error { return nil }

// SaveFile is not supported by the fake and always fails.
func (f *fakeFileService) SaveFile(ctx context.Context, _ *multipart.FileHeader, _ uint64, _ string) (string, error) {
	return "", errFakeNotImplemented
}

// SaveBytes is not supported by the fake and always fails.
func (f *fakeFileService) SaveBytes(ctx context.Context, _ []byte, _ uint64, _ string, _ bool) (string, error) {
	return "", errFakeNotImplemented
}

// GetFile invokes the reader factory registered for filePath and returns its
// result unchanged. Paths without a registered factory yield an error that
// names the path.
func (f *fakeFileService) GetFile(ctx context.Context, filePath string) (io.ReadCloser, error) {
	open, ok := f.readers[filePath]
	if !ok {
		return nil, errors.New("unknown path: " + filePath)
	}
	return open()
}

// GetFileURL returns filePath, minus one leading slash, behind a local://
// scheme.
func (f *fakeFileService) GetFileURL(ctx context.Context, filePath string) (string, error) {
	// Return a URL that DuckDB would NOT be able to open on its own; the
	// production code must *not* pass this through to DuckDB.
	return "local://" + strings.TrimPrefix(filePath, "/"), nil
}

// DeleteFile is a no-op that always reports success.
func (f *fakeFileService) DeleteFile(ctx context.Context, _ string) error { return nil }

// CopyFile is not supported by the fake and always fails. It previously
// returned ("", nil), which would have let an accidental call succeed
// silently with an empty destination path.
func (f *fakeFileService) CopyFile(ctx context.Context, _ string, _ uint64, _ string) (string, error) {
	return "", errFakeNotImplemented
}
|
|
|
|
// TestMaterializeKnowledgeFile_HandlesLocalScheme is the regression guard
|
|
// for the dev-mode failure where DuckDB was handed a local:// URL it can't
|
|
// resolve. The tool must pull bytes via FileService.GetFile and hand DuckDB
|
|
// a concrete filesystem path with the right extension.
|
|
func TestMaterializeKnowledgeFile_HandlesLocalScheme(t *testing.T) {
|
|
payload := []byte("col1,col2\n1,2\n3,4\n")
|
|
|
|
fs := &fakeFileService{
|
|
readers: map[string]func() (io.ReadCloser, error){
|
|
"tenants/42/data.csv": func() (io.ReadCloser, error) {
|
|
return io.NopCloser(bytes.NewReader(payload)), nil
|
|
},
|
|
},
|
|
}
|
|
|
|
tool := &DataAnalysisTool{
|
|
fileService: fs,
|
|
sessionID: "test-materialize",
|
|
}
|
|
|
|
k := &types.Knowledge{
|
|
ID: "k-abc",
|
|
FileType: "csv",
|
|
FilePath: "tenants/42/data.csv",
|
|
}
|
|
|
|
path, cleanup, err := tool.materializeKnowledgeFile(context.Background(), k)
|
|
if err != nil {
|
|
t.Fatalf("materialize: %v", err)
|
|
}
|
|
t.Cleanup(cleanup)
|
|
|
|
// Path must be a real filesystem path (no provider scheme) so DuckDB
|
|
// can st_read / read_xlsx / read_csv_auto it directly.
|
|
if strings.Contains(path, "://") {
|
|
t.Fatalf("expected bare filesystem path, got scheme URL: %q", path)
|
|
}
|
|
if filepath.Ext(path) != ".csv" {
|
|
t.Errorf("expected .csv suffix preserved for DuckDB format detection, got %q", path)
|
|
}
|
|
|
|
// File must actually exist and have the exact bytes we fed in.
|
|
got, err := os.ReadFile(path)
|
|
if err != nil {
|
|
t.Fatalf("read temp file: %v", err)
|
|
}
|
|
if !bytes.Equal(got, payload) {
|
|
t.Errorf("payload roundtrip mismatch: got %q, want %q", got, payload)
|
|
}
|
|
|
|
// cleanup() must remove the temp file without error.
|
|
cleanup()
|
|
if _, err := os.Stat(path); !os.IsNotExist(err) {
|
|
t.Errorf("expected temp file to be removed after cleanup, stat err=%v", err)
|
|
}
|
|
// Double-cleanup must be safe (test harness may invoke it again).
|
|
cleanup()
|
|
}
|
|
|
|
func TestMaterializeKnowledgeFile_PropagatesGetFileError(t *testing.T) {
|
|
fs := &fakeFileService{
|
|
readers: map[string]func() (io.ReadCloser, error){
|
|
"bad/path": func() (io.ReadCloser, error) {
|
|
return nil, errors.New("boom")
|
|
},
|
|
},
|
|
}
|
|
|
|
tool := &DataAnalysisTool{fileService: fs, sessionID: "test-error"}
|
|
|
|
_, cleanup, err := tool.materializeKnowledgeFile(
|
|
context.Background(),
|
|
&types.Knowledge{ID: "k-err", FileType: "xlsx", FilePath: "bad/path"},
|
|
)
|
|
if err == nil {
|
|
defer cleanup()
|
|
t.Fatal("expected error from underlying GetFile, got nil")
|
|
}
|
|
if !strings.Contains(err.Error(), "boom") {
|
|
t.Errorf("expected wrapped 'boom' error, got: %v", err)
|
|
}
|
|
}
|
|
|
|
func TestMaterializeKnowledgeFile_PreservesExtension(t *testing.T) {
|
|
cases := []struct {
|
|
fileType string
|
|
wantExt string
|
|
}{
|
|
{"xlsx", ".xlsx"},
|
|
{"XLSX", ".xlsx"}, // normalized lowercase
|
|
{"csv", ".csv"},
|
|
{"xls", ".xls"},
|
|
}
|
|
for _, c := range cases {
|
|
t.Run(c.fileType, func(t *testing.T) {
|
|
fs := &fakeFileService{
|
|
readers: map[string]func() (io.ReadCloser, error){
|
|
"p": func() (io.ReadCloser, error) {
|
|
return io.NopCloser(bytes.NewReader([]byte("x"))), nil
|
|
},
|
|
},
|
|
}
|
|
tool := &DataAnalysisTool{fileService: fs, sessionID: "ext"}
|
|
path, cleanup, err := tool.materializeKnowledgeFile(
|
|
context.Background(),
|
|
&types.Knowledge{ID: "k", FileType: c.fileType, FilePath: "p"},
|
|
)
|
|
if err != nil {
|
|
t.Fatalf("materialize: %v", err)
|
|
}
|
|
defer cleanup()
|
|
if filepath.Ext(path) != c.wantExt {
|
|
t.Errorf("fileType=%q: extension mismatch got %q want %q", c.fileType, filepath.Ext(path), c.wantExt)
|
|
}
|
|
})
|
|
}
|
|
}
|