mirror of
https://github.com/Tencent/WeKnora.git
synced 2026-06-04 13:30:32 +08:00
Cloning a knowledge base previously copied only the storage path strings (knowledge.FilePath and chunk.ImageInfo.URL), so the source and the clone shared the same physical objects in the storage backend. Once the original file and extracted images are deleted on source removal, the clone is left with dangling references and its document and images become unreadable — data loss that occurs even for same-store clones. Add a CopyFile primitive to the FileService interface and implement it in every backend: server-side CopyObject on the object stores (s3/obs/cos/oss/tos/ks3/minio), io.Copy on local, and a no-op on dummy. Destinations use the knowledge-owned layout and reuse the existing path/object-key guards; a sentinel ErrCrossBackendCopy is returned when the source scheme does not match the backend. Use CopyFile to deep-copy the document file in cloneKnowledge and the extracted images in CloneChunk and cloneFAQKnowledgeBase via a shared cloneChunkImageInfo helper that deduplicates identical image URLs per clone and rewrites them to the new objects. Copied objects are cleaned up best-effort if a clone fails partway through. A clone-time preflight rejects cloning into a target bound to a different storage backend when the tenant pins providers via StorageEngineConfig. Adds unit tests for local CopyFile (independent copy survives source deletion, traversal rejection, cross-backend rejection), cloneChunkImageInfo (empty/multi/dedup/parse-failure/OriginalURL handling), and the storage provider preflight.
270 lines
9.8 KiB
Go
270 lines
9.8 KiB
Go
package file
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"fmt"
|
|
"io"
|
|
"mime/multipart"
|
|
"net/http"
|
|
"net/url"
|
|
"path/filepath"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/Tencent/WeKnora/internal/logger"
|
|
"github.com/Tencent/WeKnora/internal/types/interfaces"
|
|
"github.com/Tencent/WeKnora/internal/utils"
|
|
"github.com/google/uuid"
|
|
"github.com/tencentyun/cos-go-sdk-v5"
|
|
)
|
|
|
|
// cosFileService implements the FileService interface for Tencent Cloud COS
|
|
type cosFileService struct {
|
|
client *cos.Client
|
|
bucketURL string
|
|
cosPathPrefix string
|
|
tempClient *cos.Client
|
|
tempBucketURL string
|
|
bucketName string
|
|
region string
|
|
}
|
|
|
|
const cosScheme = "cos://"
|
|
|
|
// newCosClient creates a bare cosFileService with just the SDK client initialised.
|
|
// Shared by NewCosFileService* constructors and CheckCosConnectivity.
|
|
func newCosClient(bucketName, region, secretID, secretKey string) (*cosFileService, error) {
|
|
bucketURL := fmt.Sprintf("https://%s.cos.%s.myqcloud.com/", bucketName, region)
|
|
u, err := url.Parse(bucketURL)
|
|
logger.Infof(context.Background(), "newCosClient: bucketURL: %s", bucketURL)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to parse bucketURL: %w", err)
|
|
}
|
|
client := cos.NewClient(&cos.BaseURL{BucketURL: u}, &http.Client{
|
|
Transport: &cos.AuthorizationTransport{
|
|
SecretID: secretID,
|
|
SecretKey: secretKey,
|
|
},
|
|
})
|
|
return &cosFileService{client: client, bucketURL: bucketURL, bucketName: bucketName, region: region}, nil
|
|
}
|
|
|
|
// NewCosFileService creates a new COS file service instance
|
|
func NewCosFileService(bucketName, region, secretId, secretKey, cosPathPrefix string) (interfaces.FileService, error) {
|
|
return NewCosFileServiceWithTempBucket(bucketName, region, secretId, secretKey, cosPathPrefix, "", "")
|
|
}
|
|
|
|
// NewCosFileServiceWithTempBucket creates a new COS file service instance with optional temp bucket
|
|
func NewCosFileServiceWithTempBucket(bucketName, region, secretId, secretKey, cosPathPrefix, tempBucketName, tempRegion string) (interfaces.FileService, error) {
|
|
svc, err := newCosClient(bucketName, region, secretId, secretKey)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
svc.cosPathPrefix = cosPathPrefix
|
|
|
|
if tempBucketName != "" {
|
|
if tempRegion == "" {
|
|
tempRegion = region
|
|
}
|
|
tempBucketURL := fmt.Sprintf("https://%s.cos.%s.myqcloud.com/", tempBucketName, tempRegion)
|
|
tempU, err := url.Parse(tempBucketURL)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to parse temp bucketURL: %w", err)
|
|
}
|
|
svc.tempClient = cos.NewClient(&cos.BaseURL{BucketURL: tempU}, &http.Client{
|
|
Transport: &cos.AuthorizationTransport{
|
|
SecretID: secretId,
|
|
SecretKey: secretKey,
|
|
},
|
|
})
|
|
svc.tempBucketURL = tempBucketURL
|
|
}
|
|
|
|
return svc, nil
|
|
}
|
|
|
|
// CheckConnectivity verifies COS is reachable by performing a HEAD request on the bucket.
|
|
func (s *cosFileService) CheckConnectivity(ctx context.Context) error {
|
|
checkCtx, cancel := context.WithTimeout(ctx, 10*time.Second)
|
|
defer cancel()
|
|
_, err := s.client.Bucket.Head(checkCtx)
|
|
return err
|
|
}
|
|
|
|
// CheckCosConnectivity tests COS connectivity using the provided credentials.
|
|
// It creates a temporary service instance internally and delegates to CheckConnectivity.
|
|
func CheckCosConnectivity(ctx context.Context, bucketName, region, secretID, secretKey string) error {
|
|
svc, err := newCosClient(bucketName, region, secretID, secretKey)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
return svc.CheckConnectivity(ctx)
|
|
}
|
|
|
|
// SaveFile saves a file to COS storage
|
|
// It generates a unique name for the file and organizes it by tenant and knowledge ID
|
|
func (s *cosFileService) SaveFile(ctx context.Context,
|
|
file *multipart.FileHeader, tenantID uint64, knowledgeID string,
|
|
) (string, error) {
|
|
ext := filepath.Ext(file.Filename)
|
|
objectName := fmt.Sprintf("%s/%d/%s/%s%s", s.cosPathPrefix, tenantID, knowledgeID, uuid.New().String(), ext)
|
|
src, err := file.Open()
|
|
if err != nil {
|
|
return "", fmt.Errorf("failed to open file: %w", err)
|
|
}
|
|
defer src.Close()
|
|
_, err = s.client.Object.Put(ctx, objectName, src, nil)
|
|
if err != nil {
|
|
return "", fmt.Errorf("failed to upload file to COS: %w", err)
|
|
}
|
|
return fmt.Sprintf("cos://%s/%s/%s", s.bucketName, s.region, objectName), nil
|
|
}
|
|
|
|
// GetFile retrieves a file from COS storage by its path URL
|
|
func (s *cosFileService) GetFile(ctx context.Context, filePathUrl string) (io.ReadCloser, error) {
|
|
objectName, err := s.parseCosObjectName(filePathUrl)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if err := utils.SafeObjectKey(objectName); err != nil {
|
|
return nil, fmt.Errorf("invalid file path: %w", err)
|
|
}
|
|
resp, err := s.client.Object.Get(ctx, objectName, nil)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to get file from COS: %w", err)
|
|
}
|
|
return resp.Body, nil
|
|
}
|
|
|
|
// DeleteFile removes a file from COS storage
|
|
func (s *cosFileService) DeleteFile(ctx context.Context, filePath string) error {
|
|
objectName, err := s.parseCosObjectName(filePath)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if err := utils.SafeObjectKey(objectName); err != nil {
|
|
return fmt.Errorf("invalid file path: %w", err)
|
|
}
|
|
_, err = s.client.Object.Delete(ctx, objectName)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to delete file: %w", err)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// parseCosObjectName extracts the object name from:
|
|
// - provider scheme: cos://{bucket}/{region}/{objectKey}
|
|
// - legacy URL: https://bucket.cos.region.myqcloud.com/{objectKey}
|
|
func (s *cosFileService) parseCosObjectName(filePath string) (string, error) {
|
|
for _, other := range []string{"local://", "minio://", "s3://", "tos://", "oss://", "ks3://", "obs://"} {
|
|
if strings.HasPrefix(filePath, other) {
|
|
return "", fmt.Errorf("cos file service cannot resolve %s path", strings.Split(other, "://")[0])
|
|
}
|
|
}
|
|
// Provider scheme format: cos://{bucket}/{region}/{objectKey}
|
|
if strings.HasPrefix(filePath, cosScheme) {
|
|
rest := strings.TrimPrefix(filePath, cosScheme)
|
|
parts := strings.SplitN(rest, "/", 3)
|
|
if len(parts) == 3 {
|
|
return parts[2], nil
|
|
}
|
|
return rest, nil
|
|
}
|
|
// Legacy format: https://bucket.cos.region.myqcloud.com/{objectKey}
|
|
return strings.TrimPrefix(filePath, s.bucketURL), nil
|
|
}
|
|
|
|
// CopyFile copies an existing COS object to a new knowledge-owned object using a
|
|
// server-side Object.Copy (no data leaves COS). The destination uses the same
|
|
// layout as SaveFile. Returns ErrCrossBackendCopy when srcPath is not a cos:// path.
|
|
func (s *cosFileService) CopyFile(ctx context.Context,
|
|
srcPath string, tenantID uint64, knowledgeID string,
|
|
) (string, error) {
|
|
srcObjectKey, err := s.parseCosObjectName(srcPath)
|
|
if err != nil {
|
|
return "", fmt.Errorf("cos copy rejected source %q: %w", srcPath, ErrCrossBackendCopy)
|
|
}
|
|
if err := utils.SafeObjectKey(srcObjectKey); err != nil {
|
|
return "", fmt.Errorf("invalid source path: %w", err)
|
|
}
|
|
|
|
ext := filepath.Ext(srcPath)
|
|
destKey := fmt.Sprintf("%s/%d/%s/%s%s", s.cosPathPrefix, tenantID, knowledgeID, uuid.New().String(), ext)
|
|
|
|
// sourceURL is the host + object key WITHOUT a scheme, per the COS SDK contract.
|
|
sourceURL := fmt.Sprintf("%s.cos.%s.myqcloud.com/%s", s.bucketName, s.region, srcObjectKey)
|
|
_, _, err = s.client.Object.Copy(ctx, destKey, sourceURL, nil)
|
|
if err != nil {
|
|
return "", fmt.Errorf("failed to copy file in COS: %w", err)
|
|
}
|
|
|
|
newPath := fmt.Sprintf("cos://%s/%s/%s", s.bucketName, s.region, destKey)
|
|
logger.Infof(ctx, "Copied COS object %s to %s", srcPath, newPath)
|
|
return newPath, nil
|
|
}
|
|
|
|
// SaveBytes saves bytes data to COS
|
|
// If temp is true and temp bucket is configured, saves to temp bucket (with lifecycle auto-expiration)
|
|
// Otherwise saves to main bucket
|
|
func (s *cosFileService) SaveBytes(ctx context.Context, data []byte, tenantID uint64, fileName string, temp bool) (string, error) {
|
|
safeName, err := utils.SafeFileName(fileName)
|
|
if err != nil {
|
|
return "", fmt.Errorf("invalid file name: %w", err)
|
|
}
|
|
ext := filepath.Ext(safeName)
|
|
reader := bytes.NewReader(data)
|
|
|
|
// 如果请求写入临时桶且临时桶已配置
|
|
if temp && s.tempClient != nil {
|
|
objectName := fmt.Sprintf("exports/%d/%s%s", tenantID, uuid.New().String(), ext)
|
|
_, err := s.tempClient.Object.Put(ctx, objectName, reader, nil)
|
|
if err != nil {
|
|
return "", fmt.Errorf("failed to upload bytes to COS temp bucket: %w", err)
|
|
}
|
|
// Temp bucket still uses legacy URL format for backward compat (auto-expiring)
|
|
return fmt.Sprintf("%s%s", s.tempBucketURL, objectName), nil
|
|
}
|
|
|
|
// 写入主桶
|
|
objectName := fmt.Sprintf("%s/%d/exports/%s%s", s.cosPathPrefix, tenantID, uuid.New().String(), ext)
|
|
_, err = s.client.Object.Put(ctx, objectName, reader, nil)
|
|
if err != nil {
|
|
return "", fmt.Errorf("failed to upload bytes to COS: %w", err)
|
|
}
|
|
|
|
return fmt.Sprintf("cos://%s/%s/%s", s.bucketName, s.region, objectName), nil
|
|
}
|
|
|
|
// GetFileURL returns a presigned download URL for the file
|
|
func (s *cosFileService) GetFileURL(ctx context.Context, filePath string) (string, error) {
|
|
// 判断文件属于哪个桶
|
|
if s.tempClient != nil && strings.HasPrefix(filePath, s.tempBucketURL) {
|
|
objectName := strings.TrimPrefix(filePath, s.tempBucketURL)
|
|
if err := utils.SafeObjectKey(objectName); err != nil {
|
|
return "", fmt.Errorf("invalid file path: %w", err)
|
|
}
|
|
// Generate presigned URL (valid for 24 hours)
|
|
presignedURL, err := s.tempClient.Object.GetPresignedURL(ctx, http.MethodGet, objectName, s.tempClient.GetCredential().SecretID, s.tempClient.GetCredential().SecretKey, 24*time.Hour, nil)
|
|
if err != nil {
|
|
return "", fmt.Errorf("failed to generate presigned URL for temp bucket: %w", err)
|
|
}
|
|
return presignedURL.String(), nil
|
|
}
|
|
|
|
objectName, err := s.parseCosObjectName(filePath)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
if err := utils.SafeObjectKey(objectName); err != nil {
|
|
return "", fmt.Errorf("invalid file path: %w", err)
|
|
}
|
|
// Generate presigned URL (valid for 24 hours)
|
|
presignedURL, err := s.client.Object.GetPresignedURL(ctx, http.MethodGet, objectName, s.client.GetCredential().SecretID, s.client.GetCredential().SecretKey, 24*time.Hour, nil)
|
|
if err != nil {
|
|
return "", fmt.Errorf("failed to generate presigned URL: %w", err)
|
|
}
|
|
|
|
return presignedURL.String(), nil
|
|
}
|