mirror of
https://github.com/Tencent/WeKnora.git
synced 2026-06-04 13:30:32 +08:00
Cloning a knowledge base previously copied only the storage path strings (knowledge.FilePath and chunk.ImageInfo.URL), so the source and the clone shared the same physical objects in the storage backend. Once the original file and extracted images are deleted on source removal, the clone is left with dangling references and its document and images become unreadable — data loss that occurs even for same-store clones. Add a CopyFile primitive to the FileService interface and implement it in every backend: server-side CopyObject on the object stores (s3/obs/cos/oss/tos/ks3/minio), io.Copy on local, and a no-op on dummy. Destinations use the knowledge-owned layout and reuse the existing path/object-key guards; a sentinel ErrCrossBackendCopy is returned when the source scheme does not match the backend. Use CopyFile to deep-copy the document file in cloneKnowledge and the extracted images in CloneChunk and cloneFAQKnowledgeBase via a shared cloneChunkImageInfo helper that deduplicates identical image URLs per clone and rewrites them to the new objects. Copied objects are cleaned up best-effort if a clone fails partway through. A clone-time preflight rejects cloning into a target bound to a different storage backend when the tenant pins providers via StorageEngineConfig. Adds unit tests for local CopyFile (independent copy survives source deletion, traversal rejection, cross-backend rejection), cloneChunkImageInfo (empty/multi/dedup/parse-failure/OriginalURL handling), and the storage provider preflight.
324 lines
9.1 KiB
Go
324 lines
9.1 KiB
Go
package file
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"errors"
|
|
"fmt"
|
|
"io"
|
|
"mime/multipart"
|
|
"path/filepath"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/Tencent/WeKnora/internal/logger"
|
|
"github.com/Tencent/WeKnora/internal/types/interfaces"
|
|
"github.com/Tencent/WeKnora/internal/utils"
|
|
"github.com/google/uuid"
|
|
"github.com/volcengine/ve-tos-golang-sdk/v2/tos"
|
|
"github.com/volcengine/ve-tos-golang-sdk/v2/tos/enum"
|
|
)
|
|
|
|
// tosFileService implements the FileService interface for Volcengine TOS.
|
|
type tosFileService struct {
|
|
client *tos.ClientV2
|
|
pathPrefix string
|
|
bucketName string
|
|
tempBucketName string
|
|
}
|
|
|
|
const tosScheme = "tos://"
|
|
|
|
// NewTosFileService creates a TOS file service.
|
|
func NewTosFileService(endpoint, region, accessKey, secretKey, bucketName, pathPrefix string) (interfaces.FileService, error) {
|
|
return NewTosFileServiceWithTempBucket(endpoint, region, accessKey, secretKey, bucketName, pathPrefix, "", "")
|
|
}
|
|
|
|
// NewTosFileServiceWithTempBucket creates a TOS file service with optional temp bucket.
|
|
func NewTosFileServiceWithTempBucket(endpoint, region, accessKey, secretKey, bucketName, pathPrefix, tempBucketName, tempRegion string) (interfaces.FileService, error) {
|
|
client, err := tos.NewClientV2(
|
|
endpoint,
|
|
tos.WithRegion(region),
|
|
tos.WithCredentials(tos.NewStaticCredentials(accessKey, secretKey)),
|
|
)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to initialize TOS client: %w", err)
|
|
}
|
|
|
|
if err := ensureTOSBucket(client, bucketName); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if tempBucketName != "" {
|
|
if tempRegion == "" {
|
|
tempRegion = region
|
|
}
|
|
// Temporary bucket may belong to another region, so probe with a short-lived client.
|
|
tempClient, err := tos.NewClientV2(
|
|
endpoint,
|
|
tos.WithRegion(tempRegion),
|
|
tos.WithCredentials(tos.NewStaticCredentials(accessKey, secretKey)),
|
|
)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to initialize TOS temp client: %w", err)
|
|
}
|
|
if err := ensureTOSBucket(tempClient, tempBucketName); err != nil {
|
|
return nil, err
|
|
}
|
|
}
|
|
|
|
return &tosFileService{
|
|
client: client,
|
|
pathPrefix: strings.Trim(pathPrefix, "/"),
|
|
bucketName: bucketName,
|
|
tempBucketName: tempBucketName,
|
|
}, nil
|
|
}
|
|
|
|
// CheckConnectivity verifies TOS is reachable by performing a HeadBucket request.
|
|
func (s *tosFileService) CheckConnectivity(ctx context.Context) error {
|
|
checkCtx, cancel := context.WithTimeout(ctx, 10*time.Second)
|
|
defer cancel()
|
|
_, err := s.client.HeadBucket(checkCtx, &tos.HeadBucketInput{
|
|
Bucket: s.bucketName,
|
|
})
|
|
return err
|
|
}
|
|
|
|
// CheckTosConnectivity tests TOS connectivity using the provided credentials.
|
|
func CheckTosConnectivity(ctx context.Context, endpoint, region, accessKey, secretKey, bucketName string) error {
|
|
client, err := tos.NewClientV2(
|
|
endpoint,
|
|
tos.WithRegion(region),
|
|
tos.WithCredentials(tos.NewStaticCredentials(accessKey, secretKey)),
|
|
)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to initialize TOS client: %w", err)
|
|
}
|
|
checkCtx, cancel := context.WithTimeout(ctx, 10*time.Second)
|
|
defer cancel()
|
|
_, err = client.HeadBucket(checkCtx, &tos.HeadBucketInput{
|
|
Bucket: bucketName,
|
|
})
|
|
return err
|
|
}
|
|
|
|
func ensureTOSBucket(client *tos.ClientV2, bucketName string) error {
|
|
_, err := client.HeadBucket(context.Background(), &tos.HeadBucketInput{
|
|
Bucket: bucketName,
|
|
})
|
|
if err == nil {
|
|
return nil
|
|
}
|
|
|
|
var serverErr *tos.TosServerError
|
|
if errors.As(err, &serverErr) && serverErr.StatusCode == 404 {
|
|
_, createErr := client.CreateBucketV2(context.Background(), &tos.CreateBucketV2Input{
|
|
Bucket: bucketName,
|
|
})
|
|
if createErr == nil {
|
|
return nil
|
|
}
|
|
if errors.As(createErr, &serverErr) && serverErr.StatusCode == 409 {
|
|
return nil
|
|
}
|
|
return fmt.Errorf("failed to create TOS bucket: %w", createErr)
|
|
}
|
|
|
|
return fmt.Errorf("failed to check TOS bucket: %w", err)
|
|
}
|
|
|
|
func joinTOSObjectKey(parts ...string) string {
|
|
filtered := make([]string, 0, len(parts))
|
|
for _, part := range parts {
|
|
part = strings.Trim(part, "/")
|
|
if part != "" {
|
|
filtered = append(filtered, part)
|
|
}
|
|
}
|
|
return strings.Join(filtered, "/")
|
|
}
|
|
|
|
func parseTOSFilePath(filePath string) (bucketName string, objectKey string, err error) {
|
|
if !strings.HasPrefix(filePath, tosScheme) {
|
|
return "", "", fmt.Errorf("invalid TOS file path: %s", filePath)
|
|
}
|
|
|
|
rest := strings.TrimPrefix(filePath, tosScheme)
|
|
parts := strings.SplitN(rest, "/", 2)
|
|
if len(parts) != 2 || parts[0] == "" || parts[1] == "" {
|
|
return "", "", fmt.Errorf("invalid TOS file path: %s", filePath)
|
|
}
|
|
return parts[0], parts[1], nil
|
|
}
|
|
|
|
func (s *tosFileService) SaveFile(ctx context.Context, file *multipart.FileHeader, tenantID uint64, knowledgeID string) (string, error) {
|
|
ext := filepath.Ext(file.Filename)
|
|
objectName := joinTOSObjectKey(
|
|
s.pathPrefix,
|
|
fmt.Sprintf("%d", tenantID),
|
|
knowledgeID,
|
|
uuid.New().String()+ext,
|
|
)
|
|
|
|
src, err := file.Open()
|
|
if err != nil {
|
|
return "", fmt.Errorf("failed to open file: %w", err)
|
|
}
|
|
defer src.Close()
|
|
|
|
contentType := file.Header.Get("Content-Type")
|
|
if contentType == "" {
|
|
contentType = utils.GetContentTypeByExt(ext)
|
|
}
|
|
_, err = s.client.PutObjectV2(ctx, &tos.PutObjectV2Input{
|
|
PutObjectBasicInput: tos.PutObjectBasicInput{
|
|
Bucket: s.bucketName,
|
|
Key: objectName,
|
|
ContentType: contentType,
|
|
},
|
|
Content: src,
|
|
})
|
|
if err != nil {
|
|
return "", fmt.Errorf("failed to upload file to TOS: %w", err)
|
|
}
|
|
|
|
return fmt.Sprintf("tos://%s/%s", s.bucketName, objectName), nil
|
|
}
|
|
|
|
func (s *tosFileService) SaveBytes(ctx context.Context, data []byte, tenantID uint64, fileName string, temp bool) (string, error) {
|
|
safeName, err := utils.SafeFileName(fileName)
|
|
if err != nil {
|
|
return "", fmt.Errorf("invalid file name: %w", err)
|
|
}
|
|
ext := filepath.Ext(safeName)
|
|
reader := bytes.NewReader(data)
|
|
|
|
targetBucket := s.bucketName
|
|
objectName := joinTOSObjectKey(
|
|
s.pathPrefix,
|
|
fmt.Sprintf("%d", tenantID),
|
|
"exports",
|
|
uuid.New().String()+ext,
|
|
)
|
|
|
|
if temp && s.tempBucketName != "" {
|
|
targetBucket = s.tempBucketName
|
|
objectName = joinTOSObjectKey(
|
|
"exports",
|
|
fmt.Sprintf("%d", tenantID),
|
|
uuid.New().String()+ext,
|
|
)
|
|
}
|
|
|
|
_, err = s.client.PutObjectV2(ctx, &tos.PutObjectV2Input{
|
|
PutObjectBasicInput: tos.PutObjectBasicInput{
|
|
Bucket: targetBucket,
|
|
Key: objectName,
|
|
ContentType: utils.GetContentTypeByExt(ext),
|
|
},
|
|
Content: reader,
|
|
})
|
|
if err != nil {
|
|
return "", fmt.Errorf("failed to upload bytes to TOS: %w", err)
|
|
}
|
|
|
|
return fmt.Sprintf("tos://%s/%s", targetBucket, objectName), nil
|
|
}
|
|
|
|
// CopyFile copies an existing TOS object to a new knowledge-owned object using a
|
|
// server-side CopyObject (no data leaves TOS). The destination uses the same
|
|
// layout as SaveFile. Returns ErrCrossBackendCopy when srcPath is not a tos:// path.
|
|
func (s *tosFileService) CopyFile(ctx context.Context,
|
|
srcPath string, tenantID uint64, knowledgeID string,
|
|
) (string, error) {
|
|
srcBucket, srcKey, err := parseTOSFilePath(srcPath)
|
|
if err != nil {
|
|
return "", fmt.Errorf("tos copy rejected source %q: %w", srcPath, ErrCrossBackendCopy)
|
|
}
|
|
if err := utils.SafeObjectKey(srcKey); err != nil {
|
|
return "", fmt.Errorf("invalid source path: %w", err)
|
|
}
|
|
|
|
ext := filepath.Ext(srcPath)
|
|
destKey := joinTOSObjectKey(
|
|
s.pathPrefix,
|
|
fmt.Sprintf("%d", tenantID),
|
|
knowledgeID,
|
|
uuid.New().String()+ext,
|
|
)
|
|
|
|
_, err = s.client.CopyObject(ctx, &tos.CopyObjectInput{
|
|
Bucket: s.bucketName,
|
|
Key: destKey,
|
|
SrcBucket: srcBucket,
|
|
SrcKey: srcKey,
|
|
})
|
|
if err != nil {
|
|
return "", fmt.Errorf("failed to copy file in TOS: %w", err)
|
|
}
|
|
|
|
newPath := fmt.Sprintf("tos://%s/%s", s.bucketName, destKey)
|
|
logger.Infof(ctx, "Copied TOS object %s to %s", srcPath, newPath)
|
|
return newPath, nil
|
|
}
|
|
|
|
func (s *tosFileService) GetFile(ctx context.Context, filePath string) (io.ReadCloser, error) {
|
|
bucketName, objectName, err := parseTOSFilePath(filePath)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if err := utils.SafeObjectKey(objectName); err != nil {
|
|
return nil, fmt.Errorf("invalid file path: %w", err)
|
|
}
|
|
|
|
output, err := s.client.GetObjectV2(ctx, &tos.GetObjectV2Input{
|
|
Bucket: bucketName,
|
|
Key: objectName,
|
|
})
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to get file from TOS: %w", err)
|
|
}
|
|
return output.Content, nil
|
|
}
|
|
|
|
func (s *tosFileService) DeleteFile(ctx context.Context, filePath string) error {
|
|
bucketName, objectName, err := parseTOSFilePath(filePath)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if err := utils.SafeObjectKey(objectName); err != nil {
|
|
return fmt.Errorf("invalid file path: %w", err)
|
|
}
|
|
|
|
_, err = s.client.DeleteObjectV2(ctx, &tos.DeleteObjectV2Input{
|
|
Bucket: bucketName,
|
|
Key: objectName,
|
|
})
|
|
if err != nil {
|
|
return fmt.Errorf("failed to delete file from TOS: %w", err)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (s *tosFileService) GetFileURL(ctx context.Context, filePath string) (string, error) {
|
|
bucketName, objectName, err := parseTOSFilePath(filePath)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
if err := utils.SafeObjectKey(objectName); err != nil {
|
|
return "", fmt.Errorf("invalid file path: %w", err)
|
|
}
|
|
|
|
output, err := s.client.PreSignedURL(&tos.PreSignedURLInput{
|
|
HTTPMethod: enum.HttpMethodGet,
|
|
Bucket: bucketName,
|
|
Key: objectName,
|
|
Expires: int64((24 * time.Hour).Seconds()),
|
|
})
|
|
if err != nil {
|
|
return "", fmt.Errorf("failed to generate TOS presigned URL: %w", err)
|
|
}
|
|
return output.SignedUrl, nil
|
|
}
|