Files
WeKnora/internal/application/service/file/cos.go
ochan.kwon e9980c6011 fix: deep-copy stored files and images when cloning a knowledge base
Cloning a knowledge base previously copied only the storage path strings
(knowledge.FilePath and chunk.ImageInfo.URL), so the source and the clone
shared the same physical objects in the storage backend. Once the original
file and extracted images are deleted on source removal, the clone is left
with dangling references and its document and images become unreadable —
data loss that occurs even for same-store clones.

Add a CopyFile primitive to the FileService interface and implement it in
every backend: server-side CopyObject on the object stores
(s3/obs/cos/oss/tos/ks3/minio), io.Copy on local, and a no-op on dummy.
Destinations use the knowledge-owned layout and reuse the existing
path/object-key guards; a sentinel ErrCrossBackendCopy is returned when the
source scheme does not match the backend.

Use CopyFile to deep-copy the document file in cloneKnowledge and the
extracted images in CloneChunk and cloneFAQKnowledgeBase via a shared
cloneChunkImageInfo helper that deduplicates identical image URLs per clone
and rewrites them to the new objects. Copied objects are cleaned up
best-effort if a clone fails partway through. A clone-time preflight rejects
cloning into a target bound to a different storage backend when the tenant
pins providers via StorageEngineConfig.

Adds unit tests for local CopyFile (independent copy survives source
deletion, traversal rejection, cross-backend rejection), cloneChunkImageInfo
(empty/multi/dedup/parse-failure/OriginalURL handling), and the storage
provider preflight.
2026-06-03 14:45:59 +08:00

270 lines
9.8 KiB
Go

package file
import (
"bytes"
"context"
"fmt"
"io"
"mime/multipart"
"net/http"
"net/url"
"path/filepath"
"strings"
"time"
"github.com/Tencent/WeKnora/internal/logger"
"github.com/Tencent/WeKnora/internal/types/interfaces"
"github.com/Tencent/WeKnora/internal/utils"
"github.com/google/uuid"
"github.com/tencentyun/cos-go-sdk-v5"
)
// cosFileService implements the FileService interface for Tencent Cloud COS.
// It always has a client for the main bucket and may additionally carry a
// client for a separate temp bucket.
type cosFileService struct {
// client is the SDK client bound to the main bucket URL.
client *cos.Client
// bucketURL is the main bucket's https base URL (with trailing slash); it is
// also the prefix stripped from legacy https-style file paths.
bucketURL string
// cosPathPrefix is the leading segment of every object key written to the main bucket.
cosPathPrefix string
// tempClient is the SDK client for the temp bucket; nil when no temp bucket is configured.
tempClient *cos.Client
// tempBucketURL is the temp bucket's https base URL (with trailing slash); empty when unset.
tempBucketURL string
// bucketName and region identify the main bucket and are embedded in the
// cos://{bucket}/{region}/{objectKey} paths returned to callers.
bucketName string
region string
}
// cosScheme is the prefix that marks a stored path as a COS provider path
// of the form cos://{bucket}/{region}/{objectKey}.
const cosScheme = "cos://"
// newCosClient builds a cosFileService that carries only the main-bucket SDK
// client together with the bucket URL, bucket name and region. The path prefix
// and the temp-bucket fields are left at their zero values.
// It is shared by the NewCosFileService* constructors and CheckCosConnectivity.
//
// An error is returned only when the derived bucket URL cannot be parsed.
func newCosClient(bucketName, region, secretID, secretKey string) (*cosFileService, error) {
	bucketURL := fmt.Sprintf("https://%s.cos.%s.myqcloud.com/", bucketName, region)
	// Logged unconditionally so the URL is recorded even when parsing rejects it.
	logger.Infof(context.Background(), "newCosClient: bucketURL: %s", bucketURL)

	parsed, parseErr := url.Parse(bucketURL)
	if parseErr != nil {
		return nil, fmt.Errorf("failed to parse bucketURL: %w", parseErr)
	}

	// Every request made through this HTTP client is signed with the given key pair.
	signer := &cos.AuthorizationTransport{
		SecretID:  secretID,
		SecretKey: secretKey,
	}
	sdkClient := cos.NewClient(
		&cos.BaseURL{BucketURL: parsed},
		&http.Client{Transport: signer},
	)

	svc := &cosFileService{
		client:     sdkClient,
		bucketURL:  bucketURL,
		bucketName: bucketName,
		region:     region,
	}
	return svc, nil
}
// NewCosFileService creates a COS-backed FileService that uses only the main
// bucket. It delegates to NewCosFileServiceWithTempBucket with an empty temp
// bucket name and temp region.
func NewCosFileService(bucketName, region, secretId, secretKey, cosPathPrefix string) (interfaces.FileService, error) {
	const (
		noTempBucket = ""
		noTempRegion = ""
	)
	return NewCosFileServiceWithTempBucket(
		bucketName, region, secretId, secretKey, cosPathPrefix,
		noTempBucket, noTempRegion,
	)
}
// NewCosFileServiceWithTempBucket creates a COS-backed FileService for the main
// bucket and, when tempBucketName is non-empty, attaches a second client for a
// temp bucket. An empty tempRegion falls back to the main bucket's region. The
// temp client is signed with the same secretId/secretKey as the main client.
func NewCosFileServiceWithTempBucket(bucketName, region, secretId, secretKey, cosPathPrefix, tempBucketName, tempRegion string) (interfaces.FileService, error) {
	svc, err := newCosClient(bucketName, region, secretId, secretKey)
	if err != nil {
		return nil, err
	}
	svc.cosPathPrefix = cosPathPrefix

	// No temp bucket requested: the main-bucket service is complete.
	if tempBucketName == "" {
		return svc, nil
	}

	if tempRegion == "" {
		tempRegion = region
	}
	tempBucketURL := fmt.Sprintf("https://%s.cos.%s.myqcloud.com/", tempBucketName, tempRegion)
	tempParsed, err := url.Parse(tempBucketURL)
	if err != nil {
		return nil, fmt.Errorf("failed to parse temp bucketURL: %w", err)
	}

	tempSigner := &cos.AuthorizationTransport{
		SecretID:  secretId,
		SecretKey: secretKey,
	}
	svc.tempClient = cos.NewClient(
		&cos.BaseURL{BucketURL: tempParsed},
		&http.Client{Transport: tempSigner},
	)
	svc.tempBucketURL = tempBucketURL
	return svc, nil
}
// CheckConnectivity issues a HEAD request against the main bucket and returns
// the resulting error, if any. The request is bounded by a 10-second timeout
// derived from ctx.
func (s *cosFileService) CheckConnectivity(ctx context.Context) error {
	const headTimeout = 10 * time.Second

	headCtx, release := context.WithTimeout(ctx, headTimeout)
	defer release()

	if _, err := s.client.Bucket.Head(headCtx); err != nil {
		return err
	}
	return nil
}
// CheckCosConnectivity probes COS with the supplied credentials. It builds a
// throwaway service via newCosClient and runs CheckConnectivity on it; a
// failure to build the client is returned unchanged.
func CheckCosConnectivity(ctx context.Context, bucketName, region, secretID, secretKey string) error {
	probe, buildErr := newCosClient(bucketName, region, secretID, secretKey)
	if buildErr != nil {
		return buildErr
	}
	return probe.CheckConnectivity(ctx)
}
// SaveFile uploads a multipart file to the main bucket under
// {cosPathPrefix}/{tenantID}/{knowledgeID}/{uuid}{ext}, where ext is taken
// from the uploaded file's name. It returns the stored location as a
// cos://{bucket}/{region}/{objectKey} path.
func (s *cosFileService) SaveFile(ctx context.Context,
	file *multipart.FileHeader, tenantID uint64, knowledgeID string,
) (string, error) {
	extension := filepath.Ext(file.Filename)
	key := fmt.Sprintf("%s/%d/%s/%s%s",
		s.cosPathPrefix, tenantID, knowledgeID, uuid.New().String(), extension)

	upload, openErr := file.Open()
	if openErr != nil {
		return "", fmt.Errorf("failed to open file: %w", openErr)
	}
	defer upload.Close()

	if _, putErr := s.client.Object.Put(ctx, key, upload, nil); putErr != nil {
		return "", fmt.Errorf("failed to upload file to COS: %w", putErr)
	}

	stored := fmt.Sprintf("cos://%s/%s/%s", s.bucketName, s.region, key)
	return stored, nil
}
// GetFile opens the object referenced by filePathUrl in the main bucket and
// returns its body. The path is first resolved to an object key and checked
// with utils.SafeObjectKey. The caller is responsible for closing the
// returned reader.
func (s *cosFileService) GetFile(ctx context.Context, filePathUrl string) (io.ReadCloser, error) {
	key, err := s.parseCosObjectName(filePathUrl)
	if err != nil {
		return nil, err
	}
	if keyErr := utils.SafeObjectKey(key); keyErr != nil {
		return nil, fmt.Errorf("invalid file path: %w", keyErr)
	}

	object, getErr := s.client.Object.Get(ctx, key, nil)
	if getErr != nil {
		return nil, fmt.Errorf("failed to get file from COS: %w", getErr)
	}
	return object.Body, nil
}
// DeleteFile deletes the object referenced by filePath from the main bucket.
// The path is resolved to an object key and checked with utils.SafeObjectKey
// before the delete request is sent.
func (s *cosFileService) DeleteFile(ctx context.Context, filePath string) error {
	key, err := s.parseCosObjectName(filePath)
	if err != nil {
		return err
	}
	if keyErr := utils.SafeObjectKey(key); keyErr != nil {
		return fmt.Errorf("invalid file path: %w", keyErr)
	}

	if _, delErr := s.client.Object.Delete(ctx, key); delErr != nil {
		return fmt.Errorf("failed to delete file: %w", delErr)
	}
	return nil
}
// parseCosObjectName resolves a stored file path to a COS object key.
//
// Accepted inputs:
//   - provider scheme: cos://{bucket}/{region}/{objectKey} -> {objectKey}
//   - legacy URL: https://bucket.cos.region.myqcloud.com/{objectKey} -> {objectKey}
//
// A cos:// path with fewer than three slash-separated segments is returned
// with only the scheme removed. Any path that is neither cos:// nor prefixed
// with the main bucket URL is returned unchanged. Paths that carry another
// provider's scheme are rejected with an error.
func (s *cosFileService) parseCosObjectName(filePath string) (string, error) {
	foreignSchemes := [...]string{"local://", "minio://", "s3://", "tos://", "oss://", "ks3://", "obs://"}
	for _, scheme := range foreignSchemes {
		if strings.HasPrefix(filePath, scheme) {
			provider := strings.TrimSuffix(scheme, "://")
			return "", fmt.Errorf("cos file service cannot resolve %s path", provider)
		}
	}

	// Legacy format: strip the main bucket URL when present.
	if !strings.HasPrefix(filePath, cosScheme) {
		return strings.TrimPrefix(filePath, s.bucketURL), nil
	}

	// Provider scheme format: drop the bucket and region segments.
	remainder := filePath[len(cosScheme):]
	if segments := strings.SplitN(remainder, "/", 3); len(segments) == 3 {
		return segments[2], nil
	}
	return remainder, nil
}
// CopyFile duplicates an existing COS object into a new knowledge-owned object
// using the SDK's Object.Copy call, so the data is copied by COS rather than
// streamed through this process.
//
// srcPath is resolved with parseCosObjectName, so both cos:// paths and legacy
// bucket-URL paths are accepted. The source object is always read from this
// service's configured bucket and region, matching how GetFile and DeleteFile
// resolve paths. The destination key uses the same layout as SaveFile:
// {cosPathPrefix}/{tenantID}/{knowledgeID}/{uuid}{ext}, with ext taken from
// srcPath.
//
// Returns the new cos://{bucket}/{region}/{objectKey} path. When srcPath
// carries another provider's scheme the returned error wraps
// ErrCrossBackendCopy (detectable with errors.Is) and also includes the
// underlying reason. An unsafe source key or a failed copy request yields a
// wrapped error as well.
func (s *cosFileService) CopyFile(ctx context.Context,
	srcPath string, tenantID uint64, knowledgeID string,
) (string, error) {
	srcObjectKey, err := s.parseCosObjectName(srcPath)
	if err != nil {
		// Keep the sentinel as the wrapped error for errors.Is, but append the
		// parser's message so the rejection reason is not lost.
		return "", fmt.Errorf("cos copy rejected source %q: %w: %v", srcPath, ErrCrossBackendCopy, err)
	}
	if err := utils.SafeObjectKey(srcObjectKey); err != nil {
		return "", fmt.Errorf("invalid source path: %w", err)
	}

	ext := filepath.Ext(srcPath)
	destKey := fmt.Sprintf("%s/%d/%s/%s%s", s.cosPathPrefix, tenantID, knowledgeID, uuid.New().String(), ext)

	// sourceURL is the host + object key WITHOUT a scheme, per the COS SDK contract.
	sourceURL := fmt.Sprintf("%s.cos.%s.myqcloud.com/%s", s.bucketName, s.region, srcObjectKey)
	if _, _, err := s.client.Object.Copy(ctx, destKey, sourceURL, nil); err != nil {
		return "", fmt.Errorf("failed to copy file in COS: %w", err)
	}

	newPath := fmt.Sprintf("cos://%s/%s/%s", s.bucketName, s.region, destKey)
	logger.Infof(ctx, "Copied COS object %s to %s", srcPath, newPath)
	return newPath, nil
}
// SaveBytes uploads raw bytes to COS and returns the stored location.
//
// When temp is true and a temp bucket is configured, the data is written to
// the temp bucket under exports/{tenantID}/{uuid}{ext} and the legacy
// https-style URL is returned. Otherwise the data is written to the main
// bucket under {cosPathPrefix}/{tenantID}/exports/{uuid}{ext} and a
// cos://{bucket}/{region}/{objectKey} path is returned. The extension comes
// from fileName after it has passed utils.SafeFileName.
func (s *cosFileService) SaveBytes(ctx context.Context, data []byte, tenantID uint64, fileName string, temp bool) (string, error) {
	safeName, err := utils.SafeFileName(fileName)
	if err != nil {
		return "", fmt.Errorf("invalid file name: %w", err)
	}
	extension := filepath.Ext(safeName)
	payload := bytes.NewReader(data)

	// Main bucket: used whenever the temp bucket is not requested or not configured.
	useTempBucket := temp && s.tempClient != nil
	if !useTempBucket {
		key := fmt.Sprintf("%s/%d/exports/%s%s", s.cosPathPrefix, tenantID, uuid.New().String(), extension)
		if _, putErr := s.client.Object.Put(ctx, key, payload, nil); putErr != nil {
			return "", fmt.Errorf("failed to upload bytes to COS: %w", putErr)
		}
		return fmt.Sprintf("cos://%s/%s/%s", s.bucketName, s.region, key), nil
	}

	// Temp bucket: requested and configured.
	key := fmt.Sprintf("exports/%d/%s%s", tenantID, uuid.New().String(), extension)
	if _, putErr := s.tempClient.Object.Put(ctx, key, payload, nil); putErr != nil {
		return "", fmt.Errorf("failed to upload bytes to COS temp bucket: %w", putErr)
	}
	// Temp bucket still uses legacy URL format for backward compat (auto-expiring)
	return fmt.Sprintf("%s%s", s.tempBucketURL, key), nil
}
// GetFileURL returns a presigned GET URL, valid for 24 hours, for the given
// stored path. Paths that start with the temp bucket URL (when a temp bucket
// is configured) are signed with the temp client; every other path is
// resolved with parseCosObjectName and signed with the main client. The
// object key is checked with utils.SafeObjectKey before signing.
func (s *cosFileService) GetFileURL(ctx context.Context, filePath string) (string, error) {
	const validity = 24 * time.Hour

	// Decide which bucket the file belongs to, and derive its object key.
	signer := s.client
	inTempBucket := s.tempClient != nil && strings.HasPrefix(filePath, s.tempBucketURL)

	var key string
	if inTempBucket {
		signer = s.tempClient
		key = strings.TrimPrefix(filePath, s.tempBucketURL)
	} else {
		parsed, err := s.parseCosObjectName(filePath)
		if err != nil {
			return "", err
		}
		key = parsed
	}

	if err := utils.SafeObjectKey(key); err != nil {
		return "", fmt.Errorf("invalid file path: %w", err)
	}

	credential := signer.GetCredential()
	signed, err := signer.Object.GetPresignedURL(ctx, http.MethodGet, key,
		credential.SecretID, credential.SecretKey, validity, nil)
	if err != nil {
		if inTempBucket {
			return "", fmt.Errorf("failed to generate presigned URL for temp bucket: %w", err)
		}
		return "", fmt.Errorf("failed to generate presigned URL: %w", err)
	}
	return signed.String(), nil
}