fix(agent): load all sheets from Excel files in data analysis (#1007)

DuckDB's st_read (spatial) only reads the first layer/sheet of a .xlsx
workbook, so every sheet beyond Sheet1 was silently dropped from the
DuckDB table the Data Analysis tool builds. Users trying to analyse
multi-sheet workbooks could only see the first sheet.

Switch to DuckDB's dedicated 'excel' extension (read_xlsx) for the
actual data load, enumerate sheets via the spatial extension's
st_read_meta, and UNION ALL BY NAME the rows of every sheet into one
table. A synthetic __sheet_name column records the source so the LLM
can still filter/aggregate per sheet; schema drift between sheets is
tolerated via UNION BY NAME. If enumeration fails (older DuckDB, local
filesystem errors, …) we fall back to reading the first sheet so the
tool stays usable.

- Install & LOAD the 'excel' extension alongside 'spatial' both at
  startup (internal/container) and in the offline prefetch binary
  (cmd/download/duckdb).
- Harden sheet/path handling against single quotes.
- Update the tool description so the agent knows about __sheet_name.
- Add unit tests for the CREATE TABLE SQL builder covering 0 / 1 / N
  sheets and quote escaping.

Refs: https://github.com/Tencent/WeKnora/issues/1007
This commit is contained in:
wizardchen
2026-04-24 19:32:51 +08:00
committed by lyingbug
parent beca2b89a3
commit 11811f9bf8
4 changed files with 231 additions and 37 deletions

View File

@@ -3,11 +3,18 @@ package main
import (
"context"
"database/sql"
"fmt"
_ "github.com/duckdb/duckdb-go/v2"
)
func downloadSpatial() {
// duckdbExtensions is the list of DuckDB extensions required by WeKnora's
// data analysis tool. `spatial` is used for layer metadata (st_read_meta)
// so we can enumerate sheet names from Excel files, while `excel` provides
// the dedicated read_xlsx reader with proper type inference.
var duckdbExtensions = []string{"spatial", "excel"}
func downloadExtensions() {
ctx := context.Background()
sqlDB, err := sql.Open("duckdb", ":memory:")
@@ -16,19 +23,16 @@ func downloadSpatial() {
}
defer sqlDB.Close()
// Try to install spatial extension (may already be installed or network unavailable)
installSQL := "INSTALL spatial;"
if _, err := sqlDB.ExecContext(ctx, installSQL); err != nil {
panic(err)
}
// Try to load spatial extension
loadSQL := "LOAD spatial;"
if _, err := sqlDB.ExecContext(ctx, loadSQL); err != nil {
panic(err)
for _, ext := range duckdbExtensions {
if _, err := sqlDB.ExecContext(ctx, fmt.Sprintf("INSTALL %s;", ext)); err != nil {
panic(fmt.Errorf("failed to install %s extension: %w", ext, err))
}
if _, err := sqlDB.ExecContext(ctx, fmt.Sprintf("LOAD %s;", ext)); err != nil {
panic(fmt.Errorf("failed to load %s extension: %w", ext, err))
}
}
}
func main() {
downloadSpatial()
downloadExtensions()
}

View File

@@ -14,9 +14,21 @@ import (
)
var dataAnalysisTool = BaseTool{
name: ToolDataAnalysis,
description: "Use this tool when the knowledge is CSV or Excel files. It loads the data into memory and executes SQL for data analysis. If the user's question requires data statistics, convert the question into SQL and execute it.",
schema: utils.GenerateSchema[DataAnalysisInput](),
name: ToolDataAnalysis,
description: "Use this tool when the knowledge is CSV or Excel files. It loads the data into memory and executes SQL for data analysis. " +
"For Excel files with multiple sheets, every sheet is loaded into the same table and the source sheet name is exposed as a '__sheet_name' column so you can filter/aggregate per sheet. " +
"If the user's question requires data statistics, convert the question into SQL and execute it.",
schema: utils.GenerateSchema[DataAnalysisInput](),
}
// excelSheetNameColumn is the name of the synthetic column that identifies
// which Excel sheet a row came from when multiple sheets are unioned together.
const excelSheetNameColumn = "__sheet_name"
// sqlSingleQuoteEscape escapes single quotes in a string so it can be safely
// embedded inside a single-quoted SQL literal.
func sqlSingleQuoteEscape(s string) string {
return strings.ReplaceAll(s, "'", "''")
}
type DataAnalysisInput struct {
@@ -300,7 +312,14 @@ func (t *DataAnalysisTool) LoadFromCSV(ctx context.Context, filename string, tab
return t.LoadFromTable(ctx, tableName)
}
// LoadFromExcel loads data from an Excel file into a DuckDB table and returns the table schema
// LoadFromExcel loads data from an Excel file into a DuckDB table and returns the table schema.
//
// Multi-sheet workbooks are fully supported: every sheet in the workbook is
// loaded and the rows from all sheets are unioned (UNION ALL BY NAME) into a
// single table. A synthetic '__sheet_name' column is added so downstream SQL
// can filter / aggregate per sheet. If sheet enumeration fails for any
// reason, we fall back to reading just the first sheet (original behavior).
//
// Parameters:
// - ctx: context for cancellation and timeout
// - filename: path to the Excel file
@@ -310,29 +329,109 @@ func (t *DataAnalysisTool) LoadFromCSV(ctx context.Context, filename string, tab
// - *TableSchema: schema information of the created table
// - error: any error that occurred during the operation
//
// Note: This function requires the spatial extension to be installed in DuckDB
// Note: requires the DuckDB 'excel' extension (for read_xlsx) and the
// 'spatial' extension (for st_read_meta used to enumerate sheets).
func (t *DataAnalysisTool) LoadFromExcel(ctx context.Context, filename string, tableName string) (*TableSchema, error) {
logger.Infof(ctx, "[Tool][DataAnalysis] Loading Excel file '%s' into table '%s' for session %s", filename, tableName, t.sessionID)
// Record the created table for cleanup. If already exists, skip creation
// Record the created table for cleanup. If already exists, skip creation.
if t.recordCreatedTable(tableName) {
// Try to read Excel file using st_read (from spatial extension)
// If spatial extension doesn't support Excel, we'll need to convert to CSV first
createTableSQL := fmt.Sprintf("CREATE TABLE \"%s\" AS SELECT * FROM st_read('%s')", tableName, filename)
_, err := t.db.ExecContext(ctx, createTableSQL)
if err != nil {
logger.Errorf(ctx, "[Tool][DataAnalysis] Failed to create table from Excel: %v", err)
return nil, fmt.Errorf("failed to create table from Excel file. Consider converting to CSV first: %w", err)
sheetNames, enumErr := t.listExcelSheets(ctx, filename)
if enumErr != nil {
logger.Warnf(ctx,
"[Tool][DataAnalysis] Could not enumerate sheets for '%s' (session=%s): %v. Falling back to first sheet only.",
filename, t.sessionID, enumErr,
)
}
logger.Infof(ctx, "[Tool][DataAnalysis] Successfully created table '%s' from Excel file in session %s", tableName, t.sessionID)
createTableSQL := buildExcelCreateTableSQL(tableName, filename, sheetNames)
if _, err := t.db.ExecContext(ctx, createTableSQL); err != nil {
logger.Errorf(ctx, "[Tool][DataAnalysis] Failed to create table from Excel (sheets=%v): %v", sheetNames, err)
return nil, fmt.Errorf("failed to create table from Excel file (sheets=%v): %w", sheetNames, err)
}
logger.Infof(ctx,
"[Tool][DataAnalysis] Successfully created table '%s' from Excel file in session %s (sheets=%v)",
tableName, t.sessionID, sheetNames,
)
}
// Get and return the table schema
return t.LoadFromTable(ctx, tableName)
}
// listExcelSheets returns the names of every sheet (layer) inside the given
// Excel workbook by querying DuckDB's spatial st_read_meta table function.
// The returned slice preserves the on-disk order of sheets.
func (t *DataAnalysisTool) listExcelSheets(ctx context.Context, filename string) ([]string, error) {
metaSQL := fmt.Sprintf("SELECT layer_name FROM st_read_meta('%s')", sqlSingleQuoteEscape(filename))
rows, err := t.db.QueryContext(ctx, metaSQL)
if err != nil {
return nil, fmt.Errorf("failed to query sheet metadata: %w", err)
}
defer rows.Close()
var names []string
for rows.Next() {
var name string
if err := rows.Scan(&name); err != nil {
return nil, fmt.Errorf("failed to scan sheet name: %w", err)
}
if strings.TrimSpace(name) == "" {
continue
}
names = append(names, name)
}
if err := rows.Err(); err != nil {
return nil, fmt.Errorf("error iterating sheet metadata rows: %w", err)
}
return names, nil
}
// buildExcelCreateTableSQL assembles the CREATE TABLE statement used by
// LoadFromExcel. Exposed at package level (lower-case) to make it trivially
// testable without a live DuckDB connection.
func buildExcelCreateTableSQL(tableName, filename string, sheetNames []string) string {
escFile := sqlSingleQuoteEscape(filename)
// No sheet info (enumeration failed or empty): read the first sheet only.
if len(sheetNames) == 0 {
return fmt.Sprintf(
"CREATE TABLE \"%s\" AS SELECT * FROM read_xlsx('%s')",
tableName, escFile,
)
}
// Single sheet: keep it simple but still tag the source for consistency
// with the multi-sheet path.
if len(sheetNames) == 1 {
escSheet := sqlSingleQuoteEscape(sheetNames[0])
return fmt.Sprintf(
"CREATE TABLE \"%s\" AS SELECT *, '%s' AS %s FROM read_xlsx('%s', sheet = '%s')",
tableName, escSheet, excelSheetNameColumn, escFile, escSheet,
)
}
// Multiple sheets: UNION ALL BY NAME tolerates schema differences
// between sheets (missing columns become NULL, conflicting types are
// widened).
parts := make([]string, 0, len(sheetNames))
for _, sheet := range sheetNames {
escSheet := sqlSingleQuoteEscape(sheet)
parts = append(parts, fmt.Sprintf(
"SELECT *, '%s' AS %s FROM read_xlsx('%s', sheet = '%s')",
escSheet, excelSheetNameColumn, escFile, escSheet,
))
}
return fmt.Sprintf(
"CREATE TABLE \"%s\" AS %s",
tableName,
strings.Join(parts, "\nUNION ALL BY NAME\n"),
)
}
// LoadFromKnowledge loads data from a Knowledge entity into a DuckDB table and returns the table schema
// It automatically determines the file type and calls the appropriate loading method
// Parameters:

View File

@@ -0,0 +1,90 @@
package tools
import (
"strings"
"testing"
)
func TestBuildExcelCreateTableSQL_NoSheets(t *testing.T) {
got := buildExcelCreateTableSQL("tbl", "/tmp/data.xlsx", nil)
want := `CREATE TABLE "tbl" AS SELECT * FROM read_xlsx('/tmp/data.xlsx')`
if got != want {
t.Fatalf("mismatch.\n got: %s\nwant: %s", got, want)
}
}
func TestBuildExcelCreateTableSQL_SingleSheetTagsSource(t *testing.T) {
got := buildExcelCreateTableSQL("tbl", "/tmp/data.xlsx", []string{"Sheet1"})
// Must use read_xlsx (excel extension) with explicit sheet param.
if !strings.Contains(got, "FROM read_xlsx('/tmp/data.xlsx', sheet = 'Sheet1')") {
t.Fatalf("expected read_xlsx with sheet param, got: %s", got)
}
// Must tag the source sheet name via the synthetic column so downstream
// SQL behaves consistently between single- and multi-sheet workbooks.
if !strings.Contains(got, "'Sheet1' AS "+excelSheetNameColumn) {
t.Fatalf("expected sheet-name column, got: %s", got)
}
}
func TestBuildExcelCreateTableSQL_MultiSheetUsesUnionAllByName(t *testing.T) {
got := buildExcelCreateTableSQL("tbl", "/tmp/data.xlsx", []string{"Sheet1", "Sheet2", "报表"})
// Each sheet must appear as a SELECT reading that specific sheet, and
// the __sheet_name column must carry its name for per-sheet filtering.
for _, sheet := range []string{"Sheet1", "Sheet2", "报表"} {
needleRead := "FROM read_xlsx('/tmp/data.xlsx', sheet = '" + sheet + "')"
needleTag := "'" + sheet + "' AS " + excelSheetNameColumn
if !strings.Contains(got, needleRead) {
t.Fatalf("missing read_xlsx for sheet %q in:\n%s", sheet, got)
}
if !strings.Contains(got, needleTag) {
t.Fatalf("missing __sheet_name tag for sheet %q in:\n%s", sheet, got)
}
}
// Must combine with UNION ALL BY NAME so schema drift between sheets is
// tolerated.
if !strings.Contains(got, "UNION ALL BY NAME") {
t.Fatalf("expected UNION ALL BY NAME in multi-sheet SQL, got:\n%s", got)
}
// Exactly N-1 UNIONs for N sheets.
if strings.Count(got, "UNION ALL BY NAME") != 2 {
t.Fatalf("expected 2 UNION ALL BY NAME separators, got %d in:\n%s",
strings.Count(got, "UNION ALL BY NAME"), got)
}
}
func TestBuildExcelCreateTableSQL_EscapesSingleQuotes(t *testing.T) {
// Sheet name and file path both contain single quotes, which must be
// doubled to produce a valid SQL literal.
sheets := []string{"Jo's data"}
got := buildExcelCreateTableSQL("tbl", "/tmp/O'Brien/data.xlsx", sheets)
if !strings.Contains(got, "sheet = 'Jo''s data'") {
t.Fatalf("sheet name was not escaped, got:\n%s", got)
}
if !strings.Contains(got, "read_xlsx('/tmp/O''Brien/data.xlsx'") {
t.Fatalf("file path was not escaped, got:\n%s", got)
}
if !strings.Contains(got, "'Jo''s data' AS "+excelSheetNameColumn) {
t.Fatalf("sheet-name literal was not escaped, got:\n%s", got)
}
}
func TestSqlSingleQuoteEscape(t *testing.T) {
cases := map[string]string{
"": "",
"no_quote": "no_quote",
"a'b": "a''b",
"''": "''''",
"mix'ed'quote": "mix''ed''quote",
"中文 with 'quote": "中文 with ''quote",
}
for in, want := range cases {
if got := sqlSingleQuoteEscape(in); got != want {
t.Errorf("sqlSingleQuoteEscape(%q) = %q, want %q", in, got, want)
}
}
}

View File

@@ -1132,16 +1132,17 @@ func NewDuckDB() (*sql.DB, error) {
return nil, fmt.Errorf("failed to open duckdb: %w", err)
}
// Try to install and load spatial extension
installSQL := "INSTALL spatial;"
if _, err := sqlDB.ExecContext(context.Background(), installSQL); err != nil {
logger.Warnf(context.Background(), "[DuckDB] Failed to install spatial extension: %v", err)
}
// Try to load spatial extension
loadSQL := "LOAD spatial;"
if _, err := sqlDB.ExecContext(context.Background(), loadSQL); err != nil {
logger.Warnf(context.Background(), "[DuckDB] Failed to load spatial extension: %v", err)
// Try to install and load required extensions.
// - spatial: used for st_read_meta() to enumerate layer (sheet) names from .xlsx/.xls
// - excel: used for read_xlsx() which gives proper type inference per sheet
bgCtx := context.Background()
for _, ext := range []string{"spatial", "excel"} {
if _, err := sqlDB.ExecContext(bgCtx, fmt.Sprintf("INSTALL %s;", ext)); err != nil {
logger.Warnf(bgCtx, "[DuckDB] Failed to install %s extension: %v", ext, err)
}
if _, err := sqlDB.ExecContext(bgCtx, fmt.Sprintf("LOAD %s;", ext)); err != nil {
logger.Warnf(bgCtx, "[DuckDB] Failed to load %s extension: %v", ext, err)
}
}
return sqlDB, nil