mirror of
https://github.com/Tencent/WeKnora.git
synced 2026-06-04 13:30:32 +08:00
fix(agent): load all sheets from Excel files in data analysis (#1007)
DuckDB's st_read (spatial) only reads the first layer/sheet of a .xlsx workbook, so every sheet beyond Sheet1 was silently dropped from the DuckDB table the Data Analysis tool builds. Users trying to analyse multi-sheet workbooks could only see the first sheet. Switch to DuckDB's dedicated 'excel' extension (read_xlsx) for the actual data load, enumerate sheets via the spatial extension's st_read_meta, and UNION ALL BY NAME the rows of every sheet into one table. A synthetic __sheet_name column records the source so the LLM can still filter/aggregate per sheet; schema drift between sheets is tolerated via UNION BY NAME. If enumeration fails (older DuckDB, local filesystem errors, …) we fall back to reading the first sheet so the tool stays usable. - Install & LOAD the 'excel' extension alongside 'spatial' both at startup (internal/container) and in the offline prefetch binary (cmd/download/duckdb). - Harden sheet/path handling against single quotes. - Update the tool description so the agent knows about __sheet_name. - Add unit tests for the CREATE TABLE SQL builder covering 0 / 1 / N sheets and quote escaping. Refs: https://github.com/Tencent/WeKnora/issues/1007
This commit is contained in:
@@ -3,11 +3,18 @@ package main
|
||||
import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"fmt"
|
||||
|
||||
_ "github.com/duckdb/duckdb-go/v2"
|
||||
)
|
||||
|
||||
func downloadSpatial() {
|
||||
// duckdbExtensions is the list of DuckDB extensions required by WeKnora's
|
||||
// data analysis tool. `spatial` is used for layer metadata (st_read_meta)
|
||||
// so we can enumerate sheet names from Excel files, while `excel` provides
|
||||
// the dedicated read_xlsx reader with proper type inference.
|
||||
var duckdbExtensions = []string{"spatial", "excel"}
|
||||
|
||||
func downloadExtensions() {
|
||||
ctx := context.Background()
|
||||
|
||||
sqlDB, err := sql.Open("duckdb", ":memory:")
|
||||
@@ -16,19 +23,16 @@ func downloadSpatial() {
|
||||
}
|
||||
defer sqlDB.Close()
|
||||
|
||||
// Try to install spatial extension (may already be installed or network unavailable)
|
||||
installSQL := "INSTALL spatial;"
|
||||
if _, err := sqlDB.ExecContext(ctx, installSQL); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
// Try to load spatial extension
|
||||
loadSQL := "LOAD spatial;"
|
||||
if _, err := sqlDB.ExecContext(ctx, loadSQL); err != nil {
|
||||
panic(err)
|
||||
for _, ext := range duckdbExtensions {
|
||||
if _, err := sqlDB.ExecContext(ctx, fmt.Sprintf("INSTALL %s;", ext)); err != nil {
|
||||
panic(fmt.Errorf("failed to install %s extension: %w", ext, err))
|
||||
}
|
||||
if _, err := sqlDB.ExecContext(ctx, fmt.Sprintf("LOAD %s;", ext)); err != nil {
|
||||
panic(fmt.Errorf("failed to load %s extension: %w", ext, err))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func main() {
|
||||
downloadSpatial()
|
||||
downloadExtensions()
|
||||
}
|
||||
|
||||
@@ -14,9 +14,21 @@ import (
|
||||
)
|
||||
|
||||
var dataAnalysisTool = BaseTool{
|
||||
name: ToolDataAnalysis,
|
||||
description: "Use this tool when the knowledge is CSV or Excel files. It loads the data into memory and executes SQL for data analysis. If the user's question requires data statistics, convert the question into SQL and execute it.",
|
||||
schema: utils.GenerateSchema[DataAnalysisInput](),
|
||||
name: ToolDataAnalysis,
|
||||
description: "Use this tool when the knowledge is CSV or Excel files. It loads the data into memory and executes SQL for data analysis. " +
|
||||
"For Excel files with multiple sheets, every sheet is loaded into the same table and the source sheet name is exposed as a '__sheet_name' column so you can filter/aggregate per sheet. " +
|
||||
"If the user's question requires data statistics, convert the question into SQL and execute it.",
|
||||
schema: utils.GenerateSchema[DataAnalysisInput](),
|
||||
}
|
||||
|
||||
// excelSheetNameColumn is the name of the synthetic column that identifies
|
||||
// which Excel sheet a row came from when multiple sheets are unioned together.
|
||||
const excelSheetNameColumn = "__sheet_name"
|
||||
|
||||
// sqlSingleQuoteEscape escapes single quotes in a string so it can be safely
|
||||
// embedded inside a single-quoted SQL literal.
|
||||
func sqlSingleQuoteEscape(s string) string {
|
||||
return strings.ReplaceAll(s, "'", "''")
|
||||
}
|
||||
|
||||
type DataAnalysisInput struct {
|
||||
@@ -300,7 +312,14 @@ func (t *DataAnalysisTool) LoadFromCSV(ctx context.Context, filename string, tab
|
||||
return t.LoadFromTable(ctx, tableName)
|
||||
}
|
||||
|
||||
// LoadFromExcel loads data from an Excel file into a DuckDB table and returns the table schema
|
||||
// LoadFromExcel loads data from an Excel file into a DuckDB table and returns the table schema.
|
||||
//
|
||||
// Multi-sheet workbooks are fully supported: every sheet in the workbook is
|
||||
// loaded and the rows from all sheets are unioned (UNION ALL BY NAME) into a
|
||||
// single table. A synthetic '__sheet_name' column is added so downstream SQL
|
||||
// can filter / aggregate per sheet. If sheet enumeration fails for any
|
||||
// reason, we fall back to reading just the first sheet (original behavior).
|
||||
//
|
||||
// Parameters:
|
||||
// - ctx: context for cancellation and timeout
|
||||
// - filename: path to the Excel file
|
||||
@@ -310,29 +329,109 @@ func (t *DataAnalysisTool) LoadFromCSV(ctx context.Context, filename string, tab
|
||||
// - *TableSchema: schema information of the created table
|
||||
// - error: any error that occurred during the operation
|
||||
//
|
||||
// Note: This function requires the spatial extension to be installed in DuckDB
|
||||
// Note: requires the DuckDB 'excel' extension (for read_xlsx) and the
|
||||
// 'spatial' extension (for st_read_meta used to enumerate sheets).
|
||||
func (t *DataAnalysisTool) LoadFromExcel(ctx context.Context, filename string, tableName string) (*TableSchema, error) {
|
||||
logger.Infof(ctx, "[Tool][DataAnalysis] Loading Excel file '%s' into table '%s' for session %s", filename, tableName, t.sessionID)
|
||||
|
||||
// Record the created table for cleanup. If already exists, skip creation
|
||||
// Record the created table for cleanup. If already exists, skip creation.
|
||||
if t.recordCreatedTable(tableName) {
|
||||
// Try to read Excel file using st_read (from spatial extension)
|
||||
// If spatial extension doesn't support Excel, we'll need to convert to CSV first
|
||||
createTableSQL := fmt.Sprintf("CREATE TABLE \"%s\" AS SELECT * FROM st_read('%s')", tableName, filename)
|
||||
|
||||
_, err := t.db.ExecContext(ctx, createTableSQL)
|
||||
if err != nil {
|
||||
logger.Errorf(ctx, "[Tool][DataAnalysis] Failed to create table from Excel: %v", err)
|
||||
return nil, fmt.Errorf("failed to create table from Excel file. Consider converting to CSV first: %w", err)
|
||||
sheetNames, enumErr := t.listExcelSheets(ctx, filename)
|
||||
if enumErr != nil {
|
||||
logger.Warnf(ctx,
|
||||
"[Tool][DataAnalysis] Could not enumerate sheets for '%s' (session=%s): %v. Falling back to first sheet only.",
|
||||
filename, t.sessionID, enumErr,
|
||||
)
|
||||
}
|
||||
|
||||
logger.Infof(ctx, "[Tool][DataAnalysis] Successfully created table '%s' from Excel file in session %s", tableName, t.sessionID)
|
||||
createTableSQL := buildExcelCreateTableSQL(tableName, filename, sheetNames)
|
||||
|
||||
if _, err := t.db.ExecContext(ctx, createTableSQL); err != nil {
|
||||
logger.Errorf(ctx, "[Tool][DataAnalysis] Failed to create table from Excel (sheets=%v): %v", sheetNames, err)
|
||||
return nil, fmt.Errorf("failed to create table from Excel file (sheets=%v): %w", sheetNames, err)
|
||||
}
|
||||
|
||||
logger.Infof(ctx,
|
||||
"[Tool][DataAnalysis] Successfully created table '%s' from Excel file in session %s (sheets=%v)",
|
||||
tableName, t.sessionID, sheetNames,
|
||||
)
|
||||
}
|
||||
|
||||
// Get and return the table schema
|
||||
return t.LoadFromTable(ctx, tableName)
|
||||
}
|
||||
|
||||
// listExcelSheets returns the names of every sheet (layer) inside the given
|
||||
// Excel workbook by querying DuckDB's spatial st_read_meta table function.
|
||||
// The returned slice preserves the on-disk order of sheets.
|
||||
func (t *DataAnalysisTool) listExcelSheets(ctx context.Context, filename string) ([]string, error) {
|
||||
metaSQL := fmt.Sprintf("SELECT layer_name FROM st_read_meta('%s')", sqlSingleQuoteEscape(filename))
|
||||
|
||||
rows, err := t.db.QueryContext(ctx, metaSQL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to query sheet metadata: %w", err)
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
var names []string
|
||||
for rows.Next() {
|
||||
var name string
|
||||
if err := rows.Scan(&name); err != nil {
|
||||
return nil, fmt.Errorf("failed to scan sheet name: %w", err)
|
||||
}
|
||||
if strings.TrimSpace(name) == "" {
|
||||
continue
|
||||
}
|
||||
names = append(names, name)
|
||||
}
|
||||
if err := rows.Err(); err != nil {
|
||||
return nil, fmt.Errorf("error iterating sheet metadata rows: %w", err)
|
||||
}
|
||||
return names, nil
|
||||
}
|
||||
|
||||
// buildExcelCreateTableSQL assembles the CREATE TABLE statement used by
|
||||
// LoadFromExcel. Exposed at package level (lower-case) to make it trivially
|
||||
// testable without a live DuckDB connection.
|
||||
func buildExcelCreateTableSQL(tableName, filename string, sheetNames []string) string {
|
||||
escFile := sqlSingleQuoteEscape(filename)
|
||||
|
||||
// No sheet info (enumeration failed or empty): read the first sheet only.
|
||||
if len(sheetNames) == 0 {
|
||||
return fmt.Sprintf(
|
||||
"CREATE TABLE \"%s\" AS SELECT * FROM read_xlsx('%s')",
|
||||
tableName, escFile,
|
||||
)
|
||||
}
|
||||
|
||||
// Single sheet: keep it simple but still tag the source for consistency
|
||||
// with the multi-sheet path.
|
||||
if len(sheetNames) == 1 {
|
||||
escSheet := sqlSingleQuoteEscape(sheetNames[0])
|
||||
return fmt.Sprintf(
|
||||
"CREATE TABLE \"%s\" AS SELECT *, '%s' AS %s FROM read_xlsx('%s', sheet = '%s')",
|
||||
tableName, escSheet, excelSheetNameColumn, escFile, escSheet,
|
||||
)
|
||||
}
|
||||
|
||||
// Multiple sheets: UNION ALL BY NAME tolerates schema differences
|
||||
// between sheets (missing columns become NULL, conflicting types are
|
||||
// widened).
|
||||
parts := make([]string, 0, len(sheetNames))
|
||||
for _, sheet := range sheetNames {
|
||||
escSheet := sqlSingleQuoteEscape(sheet)
|
||||
parts = append(parts, fmt.Sprintf(
|
||||
"SELECT *, '%s' AS %s FROM read_xlsx('%s', sheet = '%s')",
|
||||
escSheet, excelSheetNameColumn, escFile, escSheet,
|
||||
))
|
||||
}
|
||||
return fmt.Sprintf(
|
||||
"CREATE TABLE \"%s\" AS %s",
|
||||
tableName,
|
||||
strings.Join(parts, "\nUNION ALL BY NAME\n"),
|
||||
)
|
||||
}
|
||||
|
||||
// LoadFromKnowledge loads data from a Knowledge entity into a DuckDB table and returns the table schema
|
||||
// It automatically determines the file type and calls the appropriate loading method
|
||||
// Parameters:
|
||||
|
||||
90
internal/agent/tools/data_analysis_test.go
Normal file
90
internal/agent/tools/data_analysis_test.go
Normal file
@@ -0,0 +1,90 @@
|
||||
package tools
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestBuildExcelCreateTableSQL_NoSheets(t *testing.T) {
|
||||
got := buildExcelCreateTableSQL("tbl", "/tmp/data.xlsx", nil)
|
||||
want := `CREATE TABLE "tbl" AS SELECT * FROM read_xlsx('/tmp/data.xlsx')`
|
||||
if got != want {
|
||||
t.Fatalf("mismatch.\n got: %s\nwant: %s", got, want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildExcelCreateTableSQL_SingleSheetTagsSource(t *testing.T) {
|
||||
got := buildExcelCreateTableSQL("tbl", "/tmp/data.xlsx", []string{"Sheet1"})
|
||||
|
||||
// Must use read_xlsx (excel extension) with explicit sheet param.
|
||||
if !strings.Contains(got, "FROM read_xlsx('/tmp/data.xlsx', sheet = 'Sheet1')") {
|
||||
t.Fatalf("expected read_xlsx with sheet param, got: %s", got)
|
||||
}
|
||||
// Must tag the source sheet name via the synthetic column so downstream
|
||||
// SQL behaves consistently between single- and multi-sheet workbooks.
|
||||
if !strings.Contains(got, "'Sheet1' AS "+excelSheetNameColumn) {
|
||||
t.Fatalf("expected sheet-name column, got: %s", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildExcelCreateTableSQL_MultiSheetUsesUnionAllByName(t *testing.T) {
|
||||
got := buildExcelCreateTableSQL("tbl", "/tmp/data.xlsx", []string{"Sheet1", "Sheet2", "报表"})
|
||||
|
||||
// Each sheet must appear as a SELECT reading that specific sheet, and
|
||||
// the __sheet_name column must carry its name for per-sheet filtering.
|
||||
for _, sheet := range []string{"Sheet1", "Sheet2", "报表"} {
|
||||
needleRead := "FROM read_xlsx('/tmp/data.xlsx', sheet = '" + sheet + "')"
|
||||
needleTag := "'" + sheet + "' AS " + excelSheetNameColumn
|
||||
if !strings.Contains(got, needleRead) {
|
||||
t.Fatalf("missing read_xlsx for sheet %q in:\n%s", sheet, got)
|
||||
}
|
||||
if !strings.Contains(got, needleTag) {
|
||||
t.Fatalf("missing __sheet_name tag for sheet %q in:\n%s", sheet, got)
|
||||
}
|
||||
}
|
||||
|
||||
// Must combine with UNION ALL BY NAME so schema drift between sheets is
|
||||
// tolerated.
|
||||
if !strings.Contains(got, "UNION ALL BY NAME") {
|
||||
t.Fatalf("expected UNION ALL BY NAME in multi-sheet SQL, got:\n%s", got)
|
||||
}
|
||||
|
||||
// Exactly N-1 UNIONs for N sheets.
|
||||
if strings.Count(got, "UNION ALL BY NAME") != 2 {
|
||||
t.Fatalf("expected 2 UNION ALL BY NAME separators, got %d in:\n%s",
|
||||
strings.Count(got, "UNION ALL BY NAME"), got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildExcelCreateTableSQL_EscapesSingleQuotes(t *testing.T) {
|
||||
// Sheet name and file path both contain single quotes, which must be
|
||||
// doubled to produce a valid SQL literal.
|
||||
sheets := []string{"Jo's data"}
|
||||
got := buildExcelCreateTableSQL("tbl", "/tmp/O'Brien/data.xlsx", sheets)
|
||||
|
||||
if !strings.Contains(got, "sheet = 'Jo''s data'") {
|
||||
t.Fatalf("sheet name was not escaped, got:\n%s", got)
|
||||
}
|
||||
if !strings.Contains(got, "read_xlsx('/tmp/O''Brien/data.xlsx'") {
|
||||
t.Fatalf("file path was not escaped, got:\n%s", got)
|
||||
}
|
||||
if !strings.Contains(got, "'Jo''s data' AS "+excelSheetNameColumn) {
|
||||
t.Fatalf("sheet-name literal was not escaped, got:\n%s", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSqlSingleQuoteEscape(t *testing.T) {
|
||||
cases := map[string]string{
|
||||
"": "",
|
||||
"no_quote": "no_quote",
|
||||
"a'b": "a''b",
|
||||
"''": "''''",
|
||||
"mix'ed'quote": "mix''ed''quote",
|
||||
"中文 with 'quote": "中文 with ''quote",
|
||||
}
|
||||
for in, want := range cases {
|
||||
if got := sqlSingleQuoteEscape(in); got != want {
|
||||
t.Errorf("sqlSingleQuoteEscape(%q) = %q, want %q", in, got, want)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1132,16 +1132,17 @@ func NewDuckDB() (*sql.DB, error) {
|
||||
return nil, fmt.Errorf("failed to open duckdb: %w", err)
|
||||
}
|
||||
|
||||
// Try to install and load spatial extension
|
||||
installSQL := "INSTALL spatial;"
|
||||
if _, err := sqlDB.ExecContext(context.Background(), installSQL); err != nil {
|
||||
logger.Warnf(context.Background(), "[DuckDB] Failed to install spatial extension: %v", err)
|
||||
}
|
||||
|
||||
// Try to load spatial extension
|
||||
loadSQL := "LOAD spatial;"
|
||||
if _, err := sqlDB.ExecContext(context.Background(), loadSQL); err != nil {
|
||||
logger.Warnf(context.Background(), "[DuckDB] Failed to load spatial extension: %v", err)
|
||||
// Try to install and load required extensions.
|
||||
// - spatial: used for st_read_meta() to enumerate layer (sheet) names from .xlsx/.xls
|
||||
// - excel: used for read_xlsx() which gives proper type inference per sheet
|
||||
bgCtx := context.Background()
|
||||
for _, ext := range []string{"spatial", "excel"} {
|
||||
if _, err := sqlDB.ExecContext(bgCtx, fmt.Sprintf("INSTALL %s;", ext)); err != nil {
|
||||
logger.Warnf(bgCtx, "[DuckDB] Failed to install %s extension: %v", ext, err)
|
||||
}
|
||||
if _, err := sqlDB.ExecContext(bgCtx, fmt.Sprintf("LOAD %s;", ext)); err != nil {
|
||||
logger.Warnf(bgCtx, "[DuckDB] Failed to load %s extension: %v", ext, err)
|
||||
}
|
||||
}
|
||||
|
||||
return sqlDB, nil
|
||||
|
||||
Reference in New Issue
Block a user