mirror of
https://github.com/molstar/molstar.git
synced 2026-06-04 13:30:24 +08:00
improved schema generation from mmcif dic
This commit is contained in:
88
data/bird-field-names.csv
Normal file
88
data/bird-field-names.csv
Normal file
@@ -0,0 +1,88 @@
|
||||
pdbx_reference_molecule.prd_id
|
||||
pdbx_reference_molecule.name
|
||||
pdbx_reference_molecule.represent_as
|
||||
pdbx_reference_molecule.type
|
||||
pdbx_reference_molecule.type_evidence_code
|
||||
pdbx_reference_molecule.class
|
||||
pdbx_reference_molecule.class_evidence_code
|
||||
pdbx_reference_molecule.formula
|
||||
pdbx_reference_molecule.chem_comp_id
|
||||
pdbx_reference_molecule.formula_weight
|
||||
pdbx_reference_molecule.release_status
|
||||
pdbx_reference_molecule.replaces
|
||||
pdbx_reference_molecule.replaced_by
|
||||
pdbx_reference_molecule.compound_detail
|
||||
pdbx_reference_molecule.description
|
||||
pdbx_reference_molecule.representative_PDB_id_code
|
||||
|
||||
pdbx_reference_entity_list.prd_id
|
||||
pdbx_reference_entity_list.ref_entity_id
|
||||
pdbx_reference_entity_list.component_id
|
||||
pdbx_reference_entity_list.type
|
||||
pdbx_reference_entity_list.details
|
||||
|
||||
pdbx_reference_entity_nonpoly.prd_id
|
||||
pdbx_reference_entity_nonpoly.ref_entity_id
|
||||
pdbx_reference_entity_nonpoly.name
|
||||
pdbx_reference_entity_nonpoly.chem_comp_id
|
||||
|
||||
pdbx_reference_entity_link.prd_id
|
||||
pdbx_reference_entity_link.link_id
|
||||
pdbx_reference_entity_link.link_class
|
||||
pdbx_reference_entity_link.ref_entity_id_1
|
||||
pdbx_reference_entity_link.entity_seq_num_1
|
||||
pdbx_reference_entity_link.comp_id_1
|
||||
pdbx_reference_entity_link.atom_id_1
|
||||
pdbx_reference_entity_link.ref_entity_id_2
|
||||
pdbx_reference_entity_link.entity_seq_num_2
|
||||
pdbx_reference_entity_link.comp_id_2
|
||||
pdbx_reference_entity_link.atom_id_2
|
||||
pdbx_reference_entity_link.value_order
|
||||
pdbx_reference_entity_link.component_1
|
||||
pdbx_reference_entity_link.component_2
|
||||
pdbx_reference_entity_link.details
|
||||
|
||||
pdbx_reference_entity_poly_link.prd_id
|
||||
pdbx_reference_entity_poly_link.ref_entity_id
|
||||
pdbx_reference_entity_poly_link.link_id
|
||||
pdbx_reference_entity_poly_link.atom_id_1
|
||||
pdbx_reference_entity_poly_link.comp_id_1
|
||||
pdbx_reference_entity_poly_link.entity_seq_num_1
|
||||
pdbx_reference_entity_poly_link.atom_id_2
|
||||
pdbx_reference_entity_poly_link.comp_id_2
|
||||
pdbx_reference_entity_poly_link.entity_seq_num_2
|
||||
pdbx_reference_entity_poly_link.value_order
|
||||
pdbx_reference_entity_poly_link.component_id
|
||||
|
||||
pdbx_reference_entity_poly.prd_id
|
||||
pdbx_reference_entity_poly.ref_entity_id
|
||||
pdbx_reference_entity_poly.db_code
|
||||
pdbx_reference_entity_poly.db_name
|
||||
pdbx_reference_entity_poly.type
|
||||
|
||||
pdbx_reference_entity_sequence.prd_id
|
||||
pdbx_reference_entity_sequence.ref_entity_id
|
||||
pdbx_reference_entity_sequence.type
|
||||
pdbx_reference_entity_sequence.NRP_flag
|
||||
pdbx_reference_entity_sequence.one_letter_codes
|
||||
|
||||
pdbx_reference_entity_poly_seq.prd_id
|
||||
pdbx_reference_entity_poly_seq.ref_entity_id
|
||||
pdbx_reference_entity_poly_seq.num
|
||||
pdbx_reference_entity_poly_seq.mon_id
|
||||
pdbx_reference_entity_poly_seq.parent_mon_id
|
||||
pdbx_reference_entity_poly_seq.hetero
|
||||
pdbx_reference_entity_poly_seq.observed
|
||||
|
||||
pdbx_reference_entity_src_nat.prd_id
|
||||
pdbx_reference_entity_src_nat.ref_entity_id
|
||||
pdbx_reference_entity_src_nat.ordinal
|
||||
pdbx_reference_entity_src_nat.taxid
|
||||
pdbx_reference_entity_src_nat.organism_scientific
|
||||
pdbx_reference_entity_src_nat.db_code
|
||||
pdbx_reference_entity_src_nat.db_name
|
||||
|
||||
pdbx_prd_audit.prd_id
|
||||
pdbx_prd_audit.date
|
||||
pdbx_prd_audit.processing_site
|
||||
pdbx_prd_audit.action_type
|
||||
|
@@ -21,7 +21,7 @@ async function runGenerateSchema(name: string, fieldNamesPath?: string, minCount
|
||||
const parsed = await comp();
|
||||
if (parsed.isError) throw parsed
|
||||
|
||||
console.log(fieldNamesPath, minCount)
|
||||
// console.log(fieldNamesPath, minCount)
|
||||
|
||||
let filter: Filter | undefined
|
||||
if (minCount && fieldNamesPath) {
|
||||
@@ -32,7 +32,6 @@ async function runGenerateSchema(name: string, fieldNamesPath?: string, minCount
|
||||
} else if (minCount) {
|
||||
filter = await getUsageCountsFilter(minCount)
|
||||
} else if (fieldNamesPath) {
|
||||
console.log('MOIN')
|
||||
filter = await getFieldNamesFilter(fieldNamesPath)
|
||||
}
|
||||
|
||||
@@ -59,11 +58,11 @@ async function getFieldNamesFilter(fieldNamesPath: string): Promise<Filter> {
|
||||
const filter: Filter = {}
|
||||
fieldNames.forEach((name, i) => {
|
||||
const [ category, field ] = name.split('.')
|
||||
console.log(category, field)
|
||||
// console.log(category, field)
|
||||
if (!filter[ category ]) filter[ category ] = {}
|
||||
filter[ category ][ field ] = true
|
||||
})
|
||||
console.log(filter)
|
||||
// console.log(filter)
|
||||
return filter
|
||||
}
|
||||
|
||||
@@ -93,7 +92,7 @@ async function ensureMmcifDicAvailable() {
|
||||
if (FORCE_MMCIF_DOWNLOAD || !fs.existsSync(MMCIF_DIC_PATH)) {
|
||||
console.log('downloading mmcif dic...')
|
||||
const data = await fetch(MMCIF_DIC_URL)
|
||||
if (!fs.existsSync(MMCIF_DIC_DIR)){
|
||||
if (!fs.existsSync(MMCIF_DIC_DIR)) {
|
||||
fs.mkdirSync(MMCIF_DIC_DIR);
|
||||
}
|
||||
fs.writeFileSync(MMCIF_DIC_PATH, await data.text())
|
||||
|
||||
@@ -4,22 +4,25 @@
|
||||
* @author Alexander Rose <alexander.rose@weirdbyte.de>
|
||||
*/
|
||||
|
||||
import { Database, Column } from './json-schema'
|
||||
import { Database, ValueColumn, ListColumn } from './json-schema'
|
||||
import * as Data from 'mol-io/reader/cif/data-model'
|
||||
|
||||
export function getFieldType (type: string, values?: string[]): Column {
|
||||
export function getFieldType (type: string, values?: string[]): ValueColumn|ListColumn {
|
||||
switch (type) {
|
||||
case 'code':
|
||||
case 'ucode':
|
||||
if (values && values.length) {
|
||||
return { 'enum': values }
|
||||
} else {
|
||||
return 'str'
|
||||
}
|
||||
case 'line':
|
||||
case 'uline':
|
||||
case 'text':
|
||||
case 'char':
|
||||
case 'uchar3':
|
||||
case 'uchar1':
|
||||
case 'boolean':
|
||||
if (values && values.length) {
|
||||
return { enum: [ 'str', values ] }
|
||||
} else {
|
||||
return 'str'
|
||||
}
|
||||
case 'aliasname':
|
||||
case 'name':
|
||||
case 'idname':
|
||||
@@ -29,7 +32,6 @@ export function getFieldType (type: string, values?: string[]): Column {
|
||||
case 'phone':
|
||||
case 'email':
|
||||
case 'code30':
|
||||
case 'ec-type':
|
||||
case 'seq-one-letter-code':
|
||||
case 'author':
|
||||
case 'orcid_id':
|
||||
@@ -44,27 +46,30 @@ export function getFieldType (type: string, values?: string[]): Column {
|
||||
case 'float-range':
|
||||
case 'binary':
|
||||
case 'operation_expression':
|
||||
case 'ucode-alphanum-csv':
|
||||
case 'point_symmetry':
|
||||
case 'id_list':
|
||||
case '4x3_matrix':
|
||||
case '3x4_matrices':
|
||||
case 'point_group':
|
||||
case 'point_group_helical':
|
||||
case 'boolean':
|
||||
case 'symmetry_operation':
|
||||
case 'date_dep':
|
||||
case 'uchar3':
|
||||
case 'uchar1':
|
||||
case 'url':
|
||||
case 'symop':
|
||||
return 'str'
|
||||
case 'int':
|
||||
case 'non_negative_int':
|
||||
case 'positive_int':
|
||||
if (values && values.length) {
|
||||
return { enum: [ 'int', values ] }
|
||||
} else {
|
||||
return 'int'
|
||||
}
|
||||
case 'float':
|
||||
return 'float'
|
||||
case 'ec-type':
|
||||
case 'ucode-alphanum-csv':
|
||||
case 'id_list':
|
||||
return { list: [ 'str', ',' ] }
|
||||
}
|
||||
console.log(`unknown type '${type}'`)
|
||||
return 'str'
|
||||
@@ -94,10 +99,10 @@ function getField ( category: string, field: string, d: Data.Frame, ctx: FrameDa
|
||||
}
|
||||
}
|
||||
|
||||
function getEnums (d: Data.Frame, ctx: FrameData): string[]|undefined {
|
||||
function getEnums (d: Data.Frame, ctx: FrameData) {
|
||||
const value = getField('item_enumeration', 'value', d, ctx)
|
||||
if (value) {
|
||||
const enums: string[] = []
|
||||
if (value) {
|
||||
for (let i = 0; i < value.rowCount; ++i) {
|
||||
enums.push(value.str(i))
|
||||
// console.log(value.str(i))
|
||||
@@ -108,16 +113,10 @@ function getEnums (d: Data.Frame, ctx: FrameData): string[]|undefined {
|
||||
}
|
||||
}
|
||||
|
||||
function getCode (d: Data.Frame, ctx: FrameData): [string, string[]]|undefined {
|
||||
function getCode (d: Data.Frame, ctx: FrameData): [string, string[]|undefined]|undefined {
|
||||
const code = getField('item_type', 'code', d, ctx)
|
||||
if (code) {
|
||||
let c = code.str(0)
|
||||
let e = []
|
||||
if (c === 'ucode') {
|
||||
const enums = getEnums(d, ctx)
|
||||
if (enums) e.push(...enums)
|
||||
}
|
||||
return [c, e]
|
||||
return [ code.str(0), getEnums(d, ctx) ]
|
||||
} else {
|
||||
console.log(`item_type.code not found for '${d.header}'`)
|
||||
}
|
||||
@@ -131,15 +130,46 @@ function getSubCategory (d: Data.Frame, ctx: FrameData): string|undefined {
|
||||
}
|
||||
|
||||
// Fully qualified mmCIF item names (`_category.field`) grouped under FORCE_INT.
// NOTE(review): the consumer of this list is outside the visible hunks — presumably
// generateSchema emits 'int' for these items regardless of the dictionary's
// item_type code; confirm against the call site.
// Entries are unique and sorted by category so additions are easy to diff.
const FORCE_INT_FIELDS = [
    '_atom_site.id',
    '_atom_site.auth_seq_id',
    '_pdbx_struct_mod_residue.auth_seq_id',
    '_struct_conf.beg_auth_seq_id',
    '_struct_conf.end_auth_seq_id',
    '_struct_conn.ptnr1_auth_seq_id',
    '_struct_conn.ptnr2_auth_seq_id',
    '_struct_sheet_range.beg_auth_seq_id',
    '_struct_sheet_range.end_auth_seq_id',
];
|
||||
|
||||
// Item names whose values are comma separated lists. When the dictionary type of
// one of these items resolves to a plain string column type, generateSchema
// replaces it with `{ list: [ 'str', ',' ] }`.
// Trailing comments show example values taken from real entries.
const COMMA_SEPARATED_LIST_FIELDS = [
    '_atom_site.pdbx_struct_group_id',
    '_chem_comp.mon_nstd_parent_comp_id',
    '_diffrn_radiation.pdbx_wavelength_list',
    '_diffrn_source.pdbx_wavelength_list',
    '_em_diffraction.tilt_angle_list', // 20,40,50,55
    '_em_entity_assembly.entity_id_list',
    '_entity.pdbx_ec',
    '_pdbx_depui_entry_details.experimental_methods',
    '_pdbx_depui_entry_details.requested_accession_types',
    '_pdbx_soln_scatter_model.software_list', // INSIGHT II, HOMOLOGY, DISCOVERY, BIOPOLYMER, DELPHI
    '_pdbx_soln_scatter_model.software_author_list', // MSI
    '_pdbx_soln_scatter_model.entry_fitting_list', // Odd example: 'PDB CODE 1HFI, 1HCC, 1HFH, 1VCC'
    '_pdbx_struct_assembly_gen.entity_inst_id',
    '_pdbx_struct_assembly_gen.asym_id_list',
    '_pdbx_struct_assembly_gen.auth_asym_id_list',
    '_pdbx_struct_assembly_gen_depositor_info.asym_id_list',
    '_pdbx_struct_assembly_gen_depositor_info.chain_id_list',
    '_pdbx_struct_group_list.group_enumeration_type',
    '_reflns.pdbx_diffrn_id',
    '_refine.pdbx_diffrn_id',
    '_reflns_shell.pdbx_diffrn_id',
    '_struct_keywords.text',
];
|
||||
|
||||
// Item names whose values are space separated lists. When the dictionary type of
// one of these items resolves to a plain string column type, generateSchema
// replaces it with `{ list: [ 'str', ' ' ] }`.
// Trailing comments show example values.
const SPACE_SEPARATED_LIST_FIELDS = [
    '_chem_comp.pdbx_subcomponent_list', // TSM DPH HIS CHF EMR
    '_pdbx_soln_scatter.data_reduction_software_list', // OTOKO
    '_pdbx_soln_scatter.data_analysis_software_list', // SCTPL5 GNOM
];
|
||||
|
||||
export function generateSchema (dic: Data.Block) {
|
||||
@@ -195,14 +225,24 @@ export function generateSchema (dic: Data.Block) {
|
||||
} else {
|
||||
if (itemName.match(/\[[1-3]\]\[[1-3]\]/)) {
|
||||
fields[itemName.replace(/\[[1-3]\]\[[1-3]\]/, '')] = { 'matrix': [ 3, 3 ] }
|
||||
// console.log(`${d.header} should have 'matrix' _item_sub_category.id`)
|
||||
console.log(`${d.header} should have 'matrix' _item_sub_category.id`)
|
||||
} else if (itemName.match(/\[[1-3]\]/)) {
|
||||
fields[itemName.replace(/\[[1-3]\]/, '')] = { 'vector': [ 3 ] }
|
||||
// console.log(`${d.header} should have 'vector' _item_sub_category.id`)
|
||||
console.log(`${d.header} should have 'vector' _item_sub_category.id`)
|
||||
} else {
|
||||
const code = getCode(d, ctx)
|
||||
if (code) {
|
||||
fields[itemName] = getFieldType(code[0], code[1])
|
||||
let fieldType = getFieldType(code[0], code[1]);
|
||||
if (typeof fieldType === 'string') {
|
||||
if (COMMA_SEPARATED_LIST_FIELDS.includes(d.header)) {
|
||||
fieldType = { 'list': [ 'str', ',' ] };
|
||||
console.log(`comma separated: ${d.header}`)
|
||||
} else if (SPACE_SEPARATED_LIST_FIELDS.includes(d.header)) {
|
||||
fieldType = { 'list': [ 'str', ' ' ] };
|
||||
console.log(`space separated: ${d.header}`)
|
||||
}
|
||||
}
|
||||
fields[itemName] = fieldType
|
||||
} else {
|
||||
console.log(`could not determine code for '${d.header}'`)
|
||||
}
|
||||
|
||||
@@ -27,7 +27,8 @@ const coord = Schema.coord;
|
||||
|
||||
const Aliased = Schema.Aliased;
|
||||
const Matrix = Schema.Matrix;
|
||||
const Vector = Schema.Vector;`
|
||||
const Vector = Schema.Vector;
|
||||
const List = Schema.List;`
|
||||
}
|
||||
|
||||
function footer (name: string) {
|
||||
@@ -37,14 +38,23 @@ export interface ${name}_Database extends Database<${name}_Schema> { }`
|
||||
}
|
||||
|
||||
/**
 * Source-text generators for complex column types, keyed by the complex type
 * name ('enum', 'matrix', 'vector', 'list'). Each function receives the args
 * stored under that key in the JSON schema and returns the expression text
 * for the column definition (using the `Aliased`, `Matrix`, `Vector` and
 * `List` helpers).
 */
const value: { [k: string]: (...args: any[]) => string } = {
    /**
     * @param type base column type the alias wraps (e.g. `str` or `int`)
     * @param values allowed values, emitted as a union of string literal types
     */
    enum: function (type: string, values: string[]) {
        return `Aliased<'${values.join(`' | '`)}'>(${type})`
    },
    /** @returns a `Matrix(rows, cols)` expression */
    matrix: function (rows: number, cols: number) {
        return `Matrix(${rows}, ${cols})`
    },
    /** @returns a `Vector(dim)` expression */
    vector: function (dim: number) {
        return `Vector(${dim})`
    },
    /**
     * @param type element type; selects the per-item converter in the emitted code
     * @param separator string the raw value is split on; emitted verbatim inside
     *     single quotes, so it must not itself contain a quote or backslash
     */
    list: function (type: 'str'|'int'|'float', separator: string) {
        if (type === 'int') {
            return `List('${separator}', x => parseInt(x, 10))`
        } else if (type === 'float') {
            return `List('${separator}', x => parseFloat(x))`
        } else {
            return `List('${separator}', x => x)`
        }
    }
}
|
||||
|
||||
@@ -64,7 +74,7 @@ export function generate (name: string, schema: Database, fields?: Filter, impor
|
||||
codeLines.push(`export const ${name}_Schema = {`)
|
||||
Object.keys(schema).forEach(table => {
|
||||
if (fields && !fields[ table ]) return
|
||||
codeLines.push(`\t${safePropertyString(table)}: {`)
|
||||
codeLines.push(` ${safePropertyString(table)}: {`)
|
||||
const columns = schema[ table ]
|
||||
Object.keys(columns).forEach(columnName => {
|
||||
if (fields && !fields[ table ][ columnName ]) return
|
||||
@@ -76,9 +86,9 @@ export function generate (name: string, schema: Database, fields?: Filter, impor
|
||||
} else {
|
||||
typeDef = fieldType
|
||||
}
|
||||
codeLines.push(`\t\t${safePropertyString(columnName)}: ${typeDef},`)
|
||||
codeLines.push(` ${safePropertyString(columnName)}: ${typeDef},`)
|
||||
})
|
||||
codeLines.push('\t},')
|
||||
codeLines.push(' },')
|
||||
})
|
||||
codeLines.push('}')
|
||||
|
||||
|
||||
@@ -12,7 +12,8 @@ export interface Table {
|
||||
[ columnName: string ]: Column
|
||||
}
|
||||
|
||||
/** Simple and enum column types, i.e. every column type except vector, matrix and list. */
export type ValueColumn = IntCol | StrCol | FloatCol | CoordCol | EnumCol
/** Any column definition that may appear in a table of the JSON schema. */
export type Column = ValueColumn | VectorCol | MatrixCol | ListColumn
|
||||
|
||||
type IntCol = 'int'
|
||||
type StrCol = 'str'
|
||||
@@ -24,7 +25,7 @@ interface ComplexColumn {
|
||||
}
|
||||
|
||||
/**
 * Enumeration column. `enum[0]` is the base column type ('int' or 'str'),
 * `enum[1]` the list of allowed values.
 */
interface EnumCol extends ComplexColumn {
    enum: [ IntCol | StrCol, string[] ]
}
|
||||
|
||||
interface VectorCol extends ComplexColumn {
|
||||
@@ -35,6 +36,10 @@ interface MatrixCol extends ComplexColumn {
|
||||
matrix: [ number, number ]
|
||||
}
|
||||
|
||||
/**
 * List column: `list[0]` is the element column type, `list[1]` the separator
 * string the raw value is split on.
 * NOTE(review): the type allows any ValueColumn as element type, while the
 * schema validator only accepts two string args for 'list' — confirm whether
 * enum element types are meant to be supported.
 */
export interface ListColumn extends ComplexColumn {
    list: [ ValueColumn, string ]
}
|
||||
|
||||
export function getTypeAndArgs (column: ComplexColumn) {
|
||||
const type = Object.keys(column)[0] as string
|
||||
const args = column[ type ]
|
||||
|
||||
@@ -7,12 +7,16 @@
|
||||
import { Database, Table, Column } from './json-schema'
|
||||
|
||||
const SimpleColumnTypes = [ 'str', 'int', 'float', 'coord' ]
|
||||
const ComplexColumnTypes = [ 'enum', 'vector', 'matrix' ]
|
||||
const ComplexColumnTypes = [ 'enum', 'vector', 'matrix', 'list' ]
|
||||
|
||||
/**
 * Tests whether `fn` holds for every element of `list`.
 *
 * @param list elements to test
 * @param fn predicate, called with the element only and not called again after
 *     the first failure
 * @returns true when no element fails the predicate (so true for an empty list)
 */
function allTrue<T> (list: T[], fn: (e: T) => boolean) {
    // wrap fn so it never receives the index/array arguments `every` passes along
    return list.every(e => fn(e))
}
|
||||
|
||||
/**
 * Tests whether every element of `list` is a string at runtime.
 *
 * @param list elements to test; typed as string[] but checked with `typeof`
 *     because the values come from schema data that has not been validated yet
 * @returns true when all elements are strings (so true for an empty list)
 */
function allString (list: string[]) {
    return list.every(v => typeof v === 'string')
}
|
||||
|
||||
function validateColumn (column: Column): true|Error {
|
||||
if (typeof column === 'string') {
|
||||
if (!SimpleColumnTypes.includes(column)) {
|
||||
@@ -31,8 +35,8 @@ function validateColumn (column: Column): true|Error {
|
||||
}
|
||||
switch (type) {
|
||||
case 'enum':
|
||||
if (!args.reduce((a, v) => a && typeof v === 'string', true)) {
|
||||
return new Error(`enum column must have string args`)
|
||||
if (args.length !== 2 && (!allString(args[1]) && !allTrue(args[1], Number.isInteger))) {
|
||||
return new Error(`enum column must have all string or all integer args ${args}`)
|
||||
}
|
||||
break;
|
||||
case 'vector':
|
||||
@@ -45,6 +49,11 @@ function validateColumn (column: Column): true|Error {
|
||||
return new Error(`matrix column must have two integer args`)
|
||||
}
|
||||
break;
|
||||
case 'list':
|
||||
if (args.length !== 2 || !allString(args)) {
|
||||
return new Error(`list column must have two string args`)
|
||||
}
|
||||
break;
|
||||
default:
|
||||
return new Error(`complex column types must be one of '${ComplexColumnTypes.join(', ')}' not '${type}'`)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user