improved schema generation from mmcif dic

This commit is contained in:
Alexander Rose
2018-03-14 11:48:06 -07:00
parent 75e1dac484
commit e9b57bb89b
6 changed files with 199 additions and 48 deletions

88
data/bird-field-names.csv Normal file
View File

@@ -0,0 +1,88 @@
pdbx_reference_molecule.prd_id
pdbx_reference_molecule.name
pdbx_reference_molecule.represent_as
pdbx_reference_molecule.type
pdbx_reference_molecule.type_evidence_code
pdbx_reference_molecule.class
pdbx_reference_molecule.class_evidence_code
pdbx_reference_molecule.formula
pdbx_reference_molecule.chem_comp_id
pdbx_reference_molecule.formula_weight
pdbx_reference_molecule.release_status
pdbx_reference_molecule.replaces
pdbx_reference_molecule.replaced_by
pdbx_reference_molecule.compound_detail
pdbx_reference_molecule.description
pdbx_reference_molecule.representative_PDB_id_code
pdbx_reference_entity_list.prd_id
pdbx_reference_entity_list.ref_entity_id
pdbx_reference_entity_list.component_id
pdbx_reference_entity_list.type
pdbx_reference_entity_list.details
pdbx_reference_entity_nonpoly.prd_id
pdbx_reference_entity_nonpoly.ref_entity_id
pdbx_reference_entity_nonpoly.name
pdbx_reference_entity_nonpoly.chem_comp_id
pdbx_reference_entity_link.prd_id
pdbx_reference_entity_link.link_id
pdbx_reference_entity_link.link_class
pdbx_reference_entity_link.ref_entity_id_1
pdbx_reference_entity_link.entity_seq_num_1
pdbx_reference_entity_link.comp_id_1
pdbx_reference_entity_link.atom_id_1
pdbx_reference_entity_link.ref_entity_id_2
pdbx_reference_entity_link.entity_seq_num_2
pdbx_reference_entity_link.comp_id_2
pdbx_reference_entity_link.atom_id_2
pdbx_reference_entity_link.value_order
pdbx_reference_entity_link.component_1
pdbx_reference_entity_link.component_2
pdbx_reference_entity_link.details
pdbx_reference_entity_poly_link.prd_id
pdbx_reference_entity_poly_link.ref_entity_id
pdbx_reference_entity_poly_link.link_id
pdbx_reference_entity_poly_link.atom_id_1
pdbx_reference_entity_poly_link.comp_id_1
pdbx_reference_entity_poly_link.entity_seq_num_1
pdbx_reference_entity_poly_link.atom_id_2
pdbx_reference_entity_poly_link.comp_id_2
pdbx_reference_entity_poly_link.entity_seq_num_2
pdbx_reference_entity_poly_link.value_order
pdbx_reference_entity_poly_link.component_id
pdbx_reference_entity_poly.prd_id
pdbx_reference_entity_poly.ref_entity_id
pdbx_reference_entity_poly.db_code
pdbx_reference_entity_poly.db_name
pdbx_reference_entity_poly.type
pdbx_reference_entity_sequence.prd_id
pdbx_reference_entity_sequence.ref_entity_id
pdbx_reference_entity_sequence.type
pdbx_reference_entity_sequence.NRP_flag
pdbx_reference_entity_sequence.one_letter_codes
pdbx_reference_entity_poly_seq.prd_id
pdbx_reference_entity_poly_seq.ref_entity_id
pdbx_reference_entity_poly_seq.num
pdbx_reference_entity_poly_seq.mon_id
pdbx_reference_entity_poly_seq.parent_mon_id
pdbx_reference_entity_poly_seq.hetero
pdbx_reference_entity_poly_seq.observed
pdbx_reference_entity_src_nat.prd_id
pdbx_reference_entity_src_nat.ref_entity_id
pdbx_reference_entity_src_nat.ordinal
pdbx_reference_entity_src_nat.taxid
pdbx_reference_entity_src_nat.organism_scientific
pdbx_reference_entity_src_nat.db_code
pdbx_reference_entity_src_nat.db_name
pdbx_prd_audit.prd_id
pdbx_prd_audit.date
pdbx_prd_audit.processing_site
pdbx_prd_audit.action_type
1 pdbx_reference_molecule.prd_id
2 pdbx_reference_molecule.name
3 pdbx_reference_molecule.represent_as
4 pdbx_reference_molecule.type
5 pdbx_reference_molecule.type_evidence_code
6 pdbx_reference_molecule.class
7 pdbx_reference_molecule.class_evidence_code
8 pdbx_reference_molecule.formula
9 pdbx_reference_molecule.chem_comp_id
10 pdbx_reference_molecule.formula_weight
11 pdbx_reference_molecule.release_status
12 pdbx_reference_molecule.replaces
13 pdbx_reference_molecule.replaced_by
14 pdbx_reference_molecule.compound_detail
15 pdbx_reference_molecule.description
16 pdbx_reference_molecule.representative_PDB_id_code
17 pdbx_reference_entity_list.prd_id
18 pdbx_reference_entity_list.ref_entity_id
19 pdbx_reference_entity_list.component_id
20 pdbx_reference_entity_list.type
21 pdbx_reference_entity_list.details
22 pdbx_reference_entity_nonpoly.prd_id
23 pdbx_reference_entity_nonpoly.ref_entity_id
24 pdbx_reference_entity_nonpoly.name
25 pdbx_reference_entity_nonpoly.chem_comp_id
26 pdbx_reference_entity_link.prd_id
27 pdbx_reference_entity_link.link_id
28 pdbx_reference_entity_link.link_class
29 pdbx_reference_entity_link.ref_entity_id_1
30 pdbx_reference_entity_link.entity_seq_num_1
31 pdbx_reference_entity_link.comp_id_1
32 pdbx_reference_entity_link.atom_id_1
33 pdbx_reference_entity_link.ref_entity_id_2
34 pdbx_reference_entity_link.entity_seq_num_2
35 pdbx_reference_entity_link.comp_id_2
36 pdbx_reference_entity_link.atom_id_2
37 pdbx_reference_entity_link.value_order
38 pdbx_reference_entity_link.component_1
39 pdbx_reference_entity_link.component_2
40 pdbx_reference_entity_link.details
41 pdbx_reference_entity_poly_link.prd_id
42 pdbx_reference_entity_poly_link.ref_entity_id
43 pdbx_reference_entity_poly_link.link_id
44 pdbx_reference_entity_poly_link.atom_id_1
45 pdbx_reference_entity_poly_link.comp_id_1
46 pdbx_reference_entity_poly_link.entity_seq_num_1
47 pdbx_reference_entity_poly_link.atom_id_2
48 pdbx_reference_entity_poly_link.comp_id_2
49 pdbx_reference_entity_poly_link.entity_seq_num_2
50 pdbx_reference_entity_poly_link.value_order
51 pdbx_reference_entity_poly_link.component_id
52 pdbx_reference_entity_poly.prd_id
53 pdbx_reference_entity_poly.ref_entity_id
54 pdbx_reference_entity_poly.db_code
55 pdbx_reference_entity_poly.db_name
56 pdbx_reference_entity_poly.type
57 pdbx_reference_entity_sequence.prd_id
58 pdbx_reference_entity_sequence.ref_entity_id
59 pdbx_reference_entity_sequence.type
60 pdbx_reference_entity_sequence.NRP_flag
61 pdbx_reference_entity_sequence.one_letter_codes
62 pdbx_reference_entity_poly_seq.prd_id
63 pdbx_reference_entity_poly_seq.ref_entity_id
64 pdbx_reference_entity_poly_seq.num
65 pdbx_reference_entity_poly_seq.mon_id
66 pdbx_reference_entity_poly_seq.parent_mon_id
67 pdbx_reference_entity_poly_seq.hetero
68 pdbx_reference_entity_poly_seq.observed
69 pdbx_reference_entity_src_nat.prd_id
70 pdbx_reference_entity_src_nat.ref_entity_id
71 pdbx_reference_entity_src_nat.ordinal
72 pdbx_reference_entity_src_nat.taxid
73 pdbx_reference_entity_src_nat.organism_scientific
74 pdbx_reference_entity_src_nat.db_code
75 pdbx_reference_entity_src_nat.db_name
76 pdbx_prd_audit.prd_id
77 pdbx_prd_audit.date
78 pdbx_prd_audit.processing_site
79 pdbx_prd_audit.action_type

View File

@@ -21,7 +21,7 @@ async function runGenerateSchema(name: string, fieldNamesPath?: string, minCount
const parsed = await comp();
if (parsed.isError) throw parsed
console.log(fieldNamesPath, minCount)
// console.log(fieldNamesPath, minCount)
let filter: Filter | undefined
if (minCount && fieldNamesPath) {
@@ -32,7 +32,6 @@ async function runGenerateSchema(name: string, fieldNamesPath?: string, minCount
} else if (minCount) {
filter = await getUsageCountsFilter(minCount)
} else if (fieldNamesPath) {
console.log('MOIN')
filter = await getFieldNamesFilter(fieldNamesPath)
}
@@ -59,11 +58,11 @@ async function getFieldNamesFilter(fieldNamesPath: string): Promise<Filter> {
const filter: Filter = {}
fieldNames.forEach((name, i) => {
const [ category, field ] = name.split('.')
console.log(category, field)
// console.log(category, field)
if (!filter[ category ]) filter[ category ] = {}
filter[ category ][ field ] = true
})
console.log(filter)
// console.log(filter)
return filter
}

View File

@@ -4,22 +4,25 @@
* @author Alexander Rose <alexander.rose@weirdbyte.de>
*/
import { Database, Column } from './json-schema'
import { Database, ValueColumn, ListColumn } from './json-schema'
import * as Data from 'mol-io/reader/cif/data-model'
export function getFieldType (type: string, values?: string[]): Column {
export function getFieldType (type: string, values?: string[]): ValueColumn|ListColumn {
switch (type) {
case 'code':
case 'ucode':
if (values && values.length) {
return { 'enum': values }
} else {
return 'str'
}
case 'line':
case 'uline':
case 'text':
case 'char':
case 'uchar3':
case 'uchar1':
case 'boolean':
if (values && values.length) {
return { enum: [ 'str', values ] }
} else {
return 'str'
}
case 'aliasname':
case 'name':
case 'idname':
@@ -29,7 +32,6 @@ export function getFieldType (type: string, values?: string[]): Column {
case 'phone':
case 'email':
case 'code30':
case 'ec-type':
case 'seq-one-letter-code':
case 'author':
case 'orcid_id':
@@ -44,27 +46,30 @@ export function getFieldType (type: string, values?: string[]): Column {
case 'float-range':
case 'binary':
case 'operation_expression':
case 'ucode-alphanum-csv':
case 'point_symmetry':
case 'id_list':
case '4x3_matrix':
case '3x4_matrices':
case 'point_group':
case 'point_group_helical':
case 'boolean':
case 'symmetry_operation':
case 'date_dep':
case 'uchar3':
case 'uchar1':
case 'url':
case 'symop':
return 'str'
case 'int':
case 'non_negative_int':
case 'positive_int':
if (values && values.length) {
return { enum: [ 'int', values ] }
} else {
return 'int'
}
case 'float':
return 'float'
case 'ec-type':
case 'ucode-alphanum-csv':
case 'id_list':
return { list: [ 'str', ',' ] }
}
console.log(`unknown type '${type}'`)
return 'str'
@@ -94,10 +99,10 @@ function getField ( category: string, field: string, d: Data.Frame, ctx: FrameDa
}
}
function getEnums (d: Data.Frame, ctx: FrameData): string[]|undefined {
function getEnums (d: Data.Frame, ctx: FrameData) {
const value = getField('item_enumeration', 'value', d, ctx)
if (value) {
const enums: string[] = []
if (value) {
for (let i = 0; i < value.rowCount; ++i) {
enums.push(value.str(i))
// console.log(value.str(i))
@@ -108,16 +113,10 @@ function getEnums (d: Data.Frame, ctx: FrameData): string[]|undefined {
}
}
function getCode (d: Data.Frame, ctx: FrameData): [string, string[]]|undefined {
function getCode (d: Data.Frame, ctx: FrameData): [string, string[]|undefined]|undefined {
const code = getField('item_type', 'code', d, ctx)
if (code) {
let c = code.str(0)
let e = []
if (c === 'ucode') {
const enums = getEnums(d, ctx)
if (enums) e.push(...enums)
}
return [c, e]
return [ code.str(0), getEnums(d, ctx) ]
} else {
console.log(`item_type.code not found for '${d.header}'`)
}
@@ -131,15 +130,46 @@ function getSubCategory (d: Data.Frame, ctx: FrameData): string|undefined {
}
const FORCE_INT_FIELDS = [
'_atom_site.id',
'_atom_site.auth_seq_id',
'_pdbx_struct_mod_residue.auth_seq_id',
'_struct_conf.beg_auth_seq_id',
'_struct_conf.end_auth_seq_id',
'_struct_sheet_range.beg_auth_seq_id',
'_struct_sheet_range.end_auth_seq_id',
'_struct_conn.ptnr1_auth_seq_id',
'_struct_conn.ptnr2_auth_seq_id',
'_pdbx_struct_mod_residue.auth_seq_id',
'_atom_site.id',
'_atom_site.auth_seq_id'
'_struct_sheet_range.beg_auth_seq_id',
'_struct_sheet_range.end_auth_seq_id',
];
/**
 * Dictionary item names (written with the leading underscore) that are
 * treated as comma separated lists.
 *
 * In the visible schema generation code an item's header is looked up here
 * with `includes(d.header)`; on a match, a field whose derived type is a
 * plain type name (a string such as 'str') is replaced by
 * `{ 'list': [ 'str', ',' ] }`. The element type is always 'str' there,
 * regardless of the type name that was derived.
 *
 * NOTE(review): the trailing comments on some entries look like sample
 * values taken from real data - confirm before relying on them.
 */
const COMMA_SEPARATED_LIST_FIELDS = [
'_atom_site.pdbx_struct_group_id',
'_chem_comp.mon_nstd_parent_comp_id',
'_diffrn_radiation.pdbx_wavelength_list',
'_diffrn_source.pdbx_wavelength_list',
'_em_diffraction.tilt_angle_list', // 20,40,50,55
'_em_entity_assembly.entity_id_list',
'_entity.pdbx_ec',
'_pdbx_depui_entry_details.experimental_methods',
'_pdbx_depui_entry_details.requested_accession_types',
'_pdbx_soln_scatter_model.software_list', // INSIGHT II, HOMOLOGY, DISCOVERY, BIOPOLYMER, DELPHI
'_pdbx_soln_scatter_model.software_author_list', // MSI
'_pdbx_soln_scatter_model.entry_fitting_list', // Odd example: 'PDB CODE 1HFI, 1HCC, 1HFH, 1VCC'
'_pdbx_struct_assembly_gen.entity_inst_id',
'_pdbx_struct_assembly_gen.asym_id_list',
'_pdbx_struct_assembly_gen.auth_asym_id_list',
'_pdbx_struct_assembly_gen_depositor_info.asym_id_list',
'_pdbx_struct_assembly_gen_depositor_info.chain_id_list',
'_pdbx_struct_group_list.group_enumeration_type',
'_reflns.pdbx_diffrn_id',
'_refine.pdbx_diffrn_id',
'_reflns_shell.pdbx_diffrn_id',
'_struct_keywords.text',
];
/**
 * Dictionary item names (written with the leading underscore) that are
 * treated as space separated lists.
 *
 * In the visible schema generation code this array is consulted with
 * `includes(d.header)` only after the comma separated lookup did not match;
 * on a match, a field whose derived type is a plain type name is replaced
 * by `{ 'list': [ 'str', ' ' ] }`.
 *
 * NOTE(review): the trailing comments look like sample values - confirm.
 */
const SPACE_SEPARATED_LIST_FIELDS = [
'_chem_comp.pdbx_subcomponent_list', // TSM DPH HIS CHF EMR
'_pdbx_soln_scatter.data_reduction_software_list', // OTOKO
'_pdbx_soln_scatter.data_analysis_software_list', // SCTPL5 GNOM
];
export function generateSchema (dic: Data.Block) {
@@ -195,14 +225,24 @@ export function generateSchema (dic: Data.Block) {
} else {
if (itemName.match(/\[[1-3]\]\[[1-3]\]/)) {
fields[itemName.replace(/\[[1-3]\]\[[1-3]\]/, '')] = { 'matrix': [ 3, 3 ] }
// console.log(`${d.header} should have 'matrix' _item_sub_category.id`)
console.log(`${d.header} should have 'matrix' _item_sub_category.id`)
} else if (itemName.match(/\[[1-3]\]/)) {
fields[itemName.replace(/\[[1-3]\]/, '')] = { 'vector': [ 3 ] }
// console.log(`${d.header} should have 'vector' _item_sub_category.id`)
console.log(`${d.header} should have 'vector' _item_sub_category.id`)
} else {
const code = getCode(d, ctx)
if (code) {
fields[itemName] = getFieldType(code[0], code[1])
let fieldType = getFieldType(code[0], code[1]);
if (typeof fieldType === 'string') {
if (COMMA_SEPARATED_LIST_FIELDS.includes(d.header)) {
fieldType = { 'list': [ 'str', ',' ] };
console.log(`comma separated: ${d.header}`)
} else if (SPACE_SEPARATED_LIST_FIELDS.includes(d.header)) {
fieldType = { 'list': [ 'str', ' ' ] };
console.log(`space separated: ${d.header}`)
}
}
fields[itemName] = fieldType
} else {
console.log(`could not determine code for '${d.header}'`)
}

View File

@@ -27,7 +27,8 @@ const coord = Schema.coord;
const Aliased = Schema.Aliased;
const Matrix = Schema.Matrix;
const Vector = Schema.Vector;`
const Vector = Schema.Vector;
const List = Schema.List;`
}
function footer (name: string) {
@@ -37,14 +38,23 @@ export interface ${name}_Database extends Database<${name}_Schema> { }`
}
const value: { [k: string]: (...args: any[]) => string } = {
enum: function (...values: string[]) {
return `Aliased<'${values.join(`' | '`)}'>(str)`
enum: function (type: string, values: string[]) {
return `Aliased<'${values.join(`' | '`)}'>(${type})`
},
matrix: function (rows: number, cols: number) {
return `Matrix(${rows}, ${cols})`
},
vector: function (dim: number) {
return `Vector(${dim})`
},
list: function (type: 'str'|'int'|'float', separator: string) {
if (type === 'int') {
return `List('${separator}', x => parseInt(x, 10))`
} else if (type === 'float') {
return `List('${separator}', x => parseFloat(x))`
} else {
return `List('${separator}', x => x)`
}
}
}
@@ -64,7 +74,7 @@ export function generate (name: string, schema: Database, fields?: Filter, impor
codeLines.push(`export const ${name}_Schema = {`)
Object.keys(schema).forEach(table => {
if (fields && !fields[ table ]) return
codeLines.push(`\t${safePropertyString(table)}: {`)
codeLines.push(` ${safePropertyString(table)}: {`)
const columns = schema[ table ]
Object.keys(columns).forEach(columnName => {
if (fields && !fields[ table ][ columnName ]) return
@@ -76,9 +86,9 @@ export function generate (name: string, schema: Database, fields?: Filter, impor
} else {
typeDef = fieldType
}
codeLines.push(`\t\t${safePropertyString(columnName)}: ${typeDef},`)
codeLines.push(` ${safePropertyString(columnName)}: ${typeDef},`)
})
codeLines.push('\t},')
codeLines.push(' },')
})
codeLines.push('}')

View File

@@ -12,7 +12,8 @@ export interface Table {
[ columnName: string ]: Column
}
export type Column = IntCol | StrCol | FloatCol | CoordCol | EnumCol | VectorCol | MatrixCol
export type ValueColumn = IntCol | StrCol | FloatCol | CoordCol | EnumCol
export type Column = ValueColumn | VectorCol | MatrixCol | ListColumn
type IntCol = 'int'
type StrCol = 'str'
@@ -24,7 +25,7 @@ interface ComplexColumn {
}
interface EnumCol extends ComplexColumn {
enum: string[]
enum: [ IntCol | StrCol, string[] ]
}
interface VectorCol extends ComplexColumn {
@@ -35,6 +36,10 @@ interface MatrixCol extends ComplexColumn {
matrix: [ number, number ]
}
/**
 * Complex column describing a list of values stored in a single cell.
 *
 * `list[0]` is the element type and `list[1]` is the separator string,
 * e.g. `{ list: [ 'str', ',' ] }`.
 *
 * NOTE(review): the element type is declared as `ValueColumn`, which also
 * admits 'coord' and enum columns, while the visible validator requires two
 * string args and the visible code generator only distinguishes
 * 'str' / 'int' / 'float' - confirm whether the declared type should be
 * narrowed.
 */
export interface ListColumn extends ComplexColumn {
list: [ ValueColumn, string ]
}
export function getTypeAndArgs (column: ComplexColumn) {
const type = Object.keys(column)[0] as string
const args = column[ type ]

View File

@@ -7,12 +7,16 @@
import { Database, Table, Column } from './json-schema'
const SimpleColumnTypes = [ 'str', 'int', 'float', 'coord' ]
const ComplexColumnTypes = [ 'enum', 'vector', 'matrix' ]
const ComplexColumnTypes = [ 'enum', 'vector', 'matrix', 'list' ]
/**
 * Tests whether a predicate holds for every element of a list.
 *
 * @param list Elements to check; an empty list yields `true`.
 * @param fn Predicate called with one element at a time, in order.
 * @returns `true` when `fn` returned `true` for all elements, else `false`.
 */
function allTrue<T> (list: readonly T[], fn: (e: T) => boolean): boolean {
    // `every` stops at the first failing element instead of walking the
    // remainder of the array. `fn` is wrapped so it keeps receiving exactly
    // one argument (no index/array leaking in from `every`).
    return list.every(e => fn(e))
}
/**
 * Tests whether every element of a list is a string primitive.
 *
 * The parameter is typed as `unknown[]` because this is a runtime check on
 * values whose type is not trusted.
 *
 * @param list Values to check; an empty list yields `true`.
 * @returns `true` when `typeof` of every element is 'string', else `false`.
 */
function allString (list: readonly unknown[]): boolean {
    return list.every(v => typeof v === 'string')
}
function validateColumn (column: Column): true|Error {
if (typeof column === 'string') {
if (!SimpleColumnTypes.includes(column)) {
@@ -31,8 +35,8 @@ function validateColumn (column: Column): true|Error {
}
switch (type) {
case 'enum':
if (!args.reduce((a, v) => a && typeof v === 'string', true)) {
return new Error(`enum column must have string args`)
// Expected args: [ baseType, values ]. Reject anything that is not a pair
// whose second entry is an array of all strings or all integers.
// The checks are joined with `||` so that a wrong length or a non-array
// second entry is reported as an error before `args[1]` is iterated; with
// `&&` a well-formed length skipped the value check entirely and a wrong
// length passed a possibly non-array `args[1]` on to the helpers.
if (args.length !== 2 || !Array.isArray(args[1]) || (!allString(args[1]) && !allTrue(args[1], Number.isInteger))) {
    return new Error(`enum column must have all string or all integer args ${args}`)
}
break;
case 'vector':
@@ -45,6 +49,11 @@ function validateColumn (column: Column): true|Error {
return new Error(`matrix column must have two integer args`)
}
break;
case 'list':
if (args.length !== 2 || !allString(args)) {
return new Error(`list column must have two string args`)
}
break;
default:
return new Error(`complex column types must be one of '${ComplexColumnTypes.join(', ')}' not '${type}'`)
}