feat(builtin-models): validate YAML entries and align ID length with schema

Three correctness fixes that the lifecycle PR deliberately deferred:

1. ID length / struct-tag drift
   - models.id is varchar(64) on both PG and SQLite (per the init
     migrations), but Model.ID's GORM tag said varchar(36) — a remnant
     from when the field only held UUIDs. The mismatch is harmless under
     golang-migrate (struct tag is ignored), but misleading on AutoMigrate
     paths and in IDE tooltips. Tag now matches the real column width.
   - New ModelIDMaxLen constant (=64) is the single source of truth for
     anyone accepting user-provided ids. The YAML loader uses it to
     reject too-long ids up front with a clear message instead of letting
     the INSERT explode with a generic "value too long for type" error.

2. Field validation in the YAML loader
   - Type, Source, and Status are typed strings but YAML can supply any
     value. Misspellings (e.g. `type: knowledgeqa` lowercase, `type: LLM`)
     were previously persisted as-is and produced rows that looked fine
     in the table but failed at provider-factory lookup time, which is
     hard to debug.
   - validateBuiltinModelEntry now checks: empty id, id length, empty
     type, type ∈ {KnowledgeQA, Embedding, Rerank, VLLM, ASR}, and
     status ∈ {active, downloading, download_failed, empty}. Source is
     intentionally NOT validated because the provider matrix in
     internal/models/* keeps growing and a strict allow-list here would
     force changes in two places per new provider.
   - Invalid entries are warned + skipped (not aborting the whole load),
     and excluded from the keep-set so the drift sweep does not delete
     existing matching rows on the strength of a typo'd YAML retry.

3. Magic number cleanup
   - DefaultBuiltinModelTenantID (=10000) replaces the hard-coded `10000`
     literal in toModel(). The invariant lives in three places already
     (PG migration, SQLite migration, this constant); naming it makes
     the cross-reference explicit and grep-able.

Tests:
- New TestLoadBuiltinModelsConfig_RejectsInvalidEntries with five
  sub-cases (id-too-long, missing-type, lowercase-type, unknown-type,
  unknown-status) asserts the table stays empty after each.
- All 11 existing tests still pass.
This commit is contained in:
wizardchen
2026-05-26 11:28:37 +08:00
committed by lyingbug
parent 4bce10d1ea
commit 500c821817
3 changed files with 148 additions and 8 deletions

View File

@@ -2,6 +2,7 @@ package types
import (
"context"
"fmt"
"log"
"os"
"path/filepath"
@@ -128,8 +129,8 @@ func LoadBuiltinModelsConfig(ctx context.Context, db *gorm.DB, configDir string)
for i := range file.BuiltinModels {
e := &file.BuiltinModels[i]
if e.ID == "" {
log.Printf("[builtin-models] WARN: entry %d has empty id; skipping", i)
if err := validateBuiltinModelEntry(e, i); err != nil {
log.Printf("[builtin-models] WARN: %v; skipping", err)
continue
}
m := e.toModel()
@@ -201,14 +202,81 @@ func pruneOrphanYAMLManagedModels(ctx context.Context, db *gorm.DB, keepIDs []st
return res.RowsAffected, res.Error
}
// validBuiltinModelTypes is the set of model types the loader accepts.
// Mirrors the ModelType* constants. Anything else is rejected with a
// warning rather than persisted, because provider factories match types
// case-sensitively and a misspelled YAML entry would produce a row that
// looks present in the table but is unusable downstream.
var validBuiltinModelTypes = map[ModelType]struct{}{
ModelTypeKnowledgeQA: {},
ModelTypeEmbedding: {},
ModelTypeRerank: {},
ModelTypeVLLM: {},
ModelTypeASR: {},
}
// validBuiltinModelStatuses is the set of statuses the loader accepts.
// Empty is allowed and is normalized to ModelStatusActive in toModel().
var validBuiltinModelStatuses = map[ModelStatus]struct{}{
ModelStatusActive: {},
ModelStatusDownloading: {},
ModelStatusDownloadFailed: {},
}
// validateBuiltinModelEntry returns nil if the entry is loadable, or an
// error describing what's wrong. Catches the failure modes that would
// either crash the INSERT or silently produce an unusable row:
//
// - empty id (cannot UPSERT)
// - id longer than the DB column (PG/SQLite cap at varchar(64),
// see ModelIDMaxLen) which would fail at INSERT time
// - empty or misspelled type (provider factories match exact strings)
// - explicit non-empty status outside the known set
//
// Source is intentionally NOT validated against a fixed list because the
// provider matrix in internal/models/* keeps growing and a too-strict
// check here would force changes in two places per new provider.
func validateBuiltinModelEntry(e *BuiltinModelEntry, index int) error {
if e.ID == "" {
return errBuiltinModel("entry %d has empty id", index)
}
if len(e.ID) > ModelIDMaxLen {
return errBuiltinModel("entry %d id %q exceeds %d-char DB limit (got %d)",
index, e.ID, ModelIDMaxLen, len(e.ID))
}
if e.Type == "" {
return errBuiltinModel("entry %d (%s) missing required field 'type'", index, e.ID)
}
if _, ok := validBuiltinModelTypes[e.Type]; !ok {
return errBuiltinModel(
"entry %d (%s) has unknown type %q (expected one of: KnowledgeQA, Embedding, Rerank, VLLM, ASR)",
index, e.ID, e.Type)
}
if e.Status != "" {
if _, ok := validBuiltinModelStatuses[e.Status]; !ok {
return errBuiltinModel(
"entry %d (%s) has unknown status %q (expected: active, downloading, download_failed, or empty)",
index, e.ID, e.Status)
}
}
return nil
}
// errBuiltinModel formats a validation error with a stable prefix so log
// output stays consistent with the rest of this package's warnings.
func errBuiltinModel(format string, args ...interface{}) error {
return fmt.Errorf(format, args...)
}
// toModel converts a YAML entry to a runtime Model with sensible defaults.
// tenant_id defaults to 10000 (matches the seed value of tenants_id_seq);
// source defaults to "remote"; status defaults to "active". IsBuiltin and
// ManagedBy are always forced regardless of YAML input.
// tenant_id defaults to DefaultBuiltinModelTenantID (10000, matching the
// seed value of tenants_id_seq); source defaults to "remote"; status
// defaults to "active". IsBuiltin and ManagedBy are always forced
// regardless of YAML input.
func (e *BuiltinModelEntry) toModel() Model {
tenantID := e.TenantID
if tenantID == 0 {
tenantID = 10000
tenantID = DefaultBuiltinModelTenantID
}
source := e.Source
if source == "" {

View File

@@ -281,6 +281,60 @@ func TestLoadBuiltinModelsConfig_EmptyListSweepsAllYAMLManaged(t *testing.T) {
assert.Equal(t, int64(1), live, "manual row must survive empty-list sweep")
}
func TestLoadBuiltinModelsConfig_RejectsInvalidEntries(t *testing.T) {
// Each sub-case writes a single-entry YAML and asserts the resulting
// `models` table stays empty (entry was rejected by validator).
cases := []struct {
name string
yaml string
}{
{
name: "id too long",
yaml: "builtin_models:\n - id: " + repeatChar("a", ModelIDMaxLen+1) +
"\n type: KnowledgeQA\n",
},
{
name: "missing type",
yaml: "builtin_models:\n - id: builtin-x\n name: x\n",
},
{
name: "unknown type (case mismatch)",
yaml: "builtin_models:\n - id: builtin-x\n type: knowledgeqa\n",
},
{
name: "unknown type",
yaml: "builtin_models:\n - id: builtin-x\n type: LLM\n",
},
{
name: "unknown status",
yaml: "builtin_models:\n - id: builtin-x\n type: KnowledgeQA\n status: enabled\n",
},
}
for _, c := range cases {
t.Run(c.name, func(t *testing.T) {
db := setupBuiltinModelsDB(t)
dir := writeYAML(t, c.yaml)
require.NoError(t, LoadBuiltinModelsConfig(context.Background(), db, dir))
var count int64
require.NoError(t, db.Model(&Model{}).Count(&count).Error)
assert.Equal(t, int64(0), count,
"invalid entry must be rejected, table must stay empty")
})
}
}
// repeatChar returns s repeated n times. Used to synthesize id strings
// that exceed ModelIDMaxLen without depending on strings.Repeat in tests.
func repeatChar(s string, n int) string {
out := make([]byte, 0, len(s)*n)
for i := 0; i < n; i++ {
out = append(out, s...)
}
return string(out)
}
func TestLoadBuiltinModelsConfig_PreservesEntryIDOverBeforeCreate(t *testing.T) {
// Regression guard: BeforeCreate now only generates a UUID when ID is
// empty. YAML always supplies a stable id; if BeforeCreate ever forgets

View File

@@ -85,10 +85,28 @@ type ModelParameters struct {
// exist; a runtime mutator on the entity is both redundant and a footgun
// (mutates an entity that other code may still be using).
// ModelIDMaxLen is the upper bound on `models.id`. Matches the actual
// schema width on both PostgreSQL (varchar(64) in migrations/versioned/
// 000000_init.up.sql) and SQLite (varchar(64) in migrations/sqlite/
// 000000_init.up.sql). Loaders that accept user-provided ids (e.g. the
// built-in models YAML loader) must reject anything longer to avoid a
// "value too long for type" failure at INSERT time.
const ModelIDMaxLen = 64
// DefaultBuiltinModelTenantID is the tenant id that built-in models are
// assigned to when YAML does not specify one. Kept in sync with the seed
// value of tenants_id_seq in migrations/versioned/000000_init.up.sql
// (and the equivalent SQLite init); changing one without the other will
// break visibility of built-in models for the default tenant.
const DefaultBuiltinModelTenantID uint64 = 10000
// Model represents the AI model
type Model struct {
// Unique identifier of the model
ID string `yaml:"id" json:"id" gorm:"type:varchar(36);primaryKey"`
// Unique identifier of the model. The actual DB schema width is
// varchar(64) on both PostgreSQL and SQLite (see ModelIDMaxLen);
// GORM's struct tag is documented to match so AutoMigrate paths
// produce the same shape.
ID string `yaml:"id" json:"id" gorm:"type:varchar(64);primaryKey"`
// Tenant ID
TenantID uint64 `yaml:"tenant_id" json:"tenant_id"`
// Name of the model