Files
rdkit/Code/GraphMol/FileParsers/SequenceParsers.cpp
Greg Landrum f4a02f5f13 Expanded sequence support (#1140)
* patch from Roger Sayle to support nucleic acids as well as amino acids

* add some tests which do not yet pass

* fix a bug I introduced in residue name parsing

* basic sequence testing

* add some more tests
2016-11-04 04:29:26 +01:00

1670 lines
56 KiB
C++

//
// Copyright (C) 2015,2016 Greg Landrum and NextMove Software
//
// @@ All Rights Reserved @@
// This file is part of the RDKit.
// The contents are covered by the terms of the BSD license
// which is included in the file license.txt, found at the root
// of the RDKit source tree.
//
#include <string.h>
#include <string>
#include <vector>
#include <map>
#include <GraphMol/GraphMol.h>
#include <GraphMol/MolOps.h>
#include <GraphMol/MonomerInfo.h>
namespace RDKit {
static Atom *CreateAAAtom(RWMol *mol, const char *name,
AtomPDBResidueInfo &info) {
Atom *atom = (Atom *)0;
if (name[0] == ' ' && name[1] == 'C') {
atom = new Atom(6);
} else if (name[0] == ' ' && name[1] == 'N') {
atom = new Atom(7);
} else if (name[0] == ' ' && name[1] == 'O') {
atom = new Atom(8);
} else if (name[0] == ' ' && name[1] == 'P') {
atom = new Atom(15);
} else if (name[0] == ' ' && name[1] == 'S') {
atom = new Atom(16);
} else if (name[0] == 'S' && name[1] == 'E') {
atom = new Atom(34);
} else
atom = new Atom(0);
mol->addAtom(atom, true, true);
AtomPDBResidueInfo *copy = (AtomPDBResidueInfo *)info.copy();
copy->setName(name);
atom->setMonomerInfo(copy);
unsigned int serno = info.getSerialNumber();
info.setSerialNumber(serno + 1);
return atom;
}
static void CreateAABond(RWMol *mol, Atom *beg, Atom *end, unsigned int order) {
Bond *bond;
if (order == 2)
bond = new Bond(Bond::DOUBLE);
else
bond = new Bond(Bond::SINGLE);
bond->setOwningMol(mol);
bond->setBeginAtom(beg);
bond->setEndAtom(end);
mol->addBond(bond, true);
}
static void CreateAABackbone(RWMol *mol, Atom *&r1, Atom *&r2, Atom *&cb,
AtomPDBResidueInfo &info, int ldstereo) {
r1 = CreateAAAtom(mol, " N ", info);
Atom *ca = CreateAAAtom(mol, " CA ", info);
r2 = CreateAAAtom(mol, " C ", info);
Atom *o = CreateAAAtom(mol, " O ", info);
cb = CreateAAAtom(mol, " CB ", info);
CreateAABond(mol, r1, ca, 1);
CreateAABond(mol, ca, r2, 1);
CreateAABond(mol, r2, o, 2);
CreateAABond(mol, ca, cb, 1);
if (ldstereo > 0) // L-stereo
ca->setChiralTag(Atom::CHI_TETRAHEDRAL_CCW);
else if (ldstereo < 0) // D-stereo
ca->setChiralTag(Atom::CHI_TETRAHEDRAL_CW);
}
// aa is a three letter PDB residue code
static void CreateAminoAcid(RWMol *mol, const char *aa, Atom *&r1, Atom *&r2,
Atom *&r3, AtomPDBResidueInfo &info) {
Atom *atom[10];
r1 = (Atom *)0;
r2 = (Atom *)0;
r3 = (Atom *)0;
int resno = info.getResidueNumber();
info.setResidueNumber(resno + 1);
info.setIsHeteroAtom(false);
info.setResidueName(aa);
// Standard amino acids before non-standard, in PDB code alphabetical order
switch (aa[0]) {
case 'A':
if (!strcmp(aa, "ALA")) {
CreateAABackbone(mol, r1, r2, atom[0], info, 1);
} else if (!strcmp(aa, "ARG")) {
CreateAABackbone(mol, r1, r2, atom[0], info, 1);
atom[1] = CreateAAAtom(mol, " CG ", info);
atom[2] = CreateAAAtom(mol, " CD ", info);
atom[3] = CreateAAAtom(mol, " NE ", info);
atom[4] = CreateAAAtom(mol, " CZ ", info);
atom[5] = CreateAAAtom(mol, " NH1", info);
atom[6] = CreateAAAtom(mol, " NH2", info);
CreateAABond(mol, atom[0], atom[1], 1);
CreateAABond(mol, atom[1], atom[2], 1);
CreateAABond(mol, atom[2], atom[3], 1);
CreateAABond(mol, atom[3], atom[4], 1);
CreateAABond(mol, atom[4], atom[5], 2);
CreateAABond(mol, atom[4], atom[6], 1);
} else if (!strcmp(aa, "ASP")) {
CreateAABackbone(mol, r1, r2, atom[0], info, 1);
atom[1] = CreateAAAtom(mol, " CG ", info);
atom[2] = CreateAAAtom(mol, " OD1", info);
atom[3] = CreateAAAtom(mol, " OD2", info);
CreateAABond(mol, atom[0], atom[1], 1);
CreateAABond(mol, atom[1], atom[2], 2);
CreateAABond(mol, atom[1], atom[3], 1);
} else if (!strcmp(aa, "ASN")) {
CreateAABackbone(mol, r1, r2, atom[0], info, 1);
atom[1] = CreateAAAtom(mol, " CG ", info);
atom[2] = CreateAAAtom(mol, " OD1", info);
atom[3] = CreateAAAtom(mol, " ND2", info);
CreateAABond(mol, atom[0], atom[1], 1);
CreateAABond(mol, atom[1], atom[2], 2);
CreateAABond(mol, atom[1], atom[3], 1);
} else if (!strcmp(aa, "ABA")) {
info.setIsHeteroAtom(true);
CreateAABackbone(mol, r1, r2, atom[0], info, 1);
atom[1] = CreateAAAtom(mol, " CG ", info);
CreateAABond(mol, atom[0], atom[1], 1);
} else if (!strcmp(aa, "ACE")) {
info.setIsHeteroAtom(true);
r2 = CreateAAAtom(mol, " C ", info);
atom[0] = CreateAAAtom(mol, " O ", info);
atom[1] = CreateAAAtom(mol, " CH3", info);
CreateAABond(mol, r2, atom[1], 1);
CreateAABond(mol, r2, atom[0], 2);
}
break;
case 'C':
if (!strcmp(aa, "CYS")) {
CreateAABackbone(mol, r1, r2, atom[0], info, 1);
r3 = CreateAAAtom(mol, " SG ", info);
CreateAABond(mol, atom[0], r3, 1);
}
break;
case 'D':
if (!strcmp(aa, "DAL")) {
info.setIsHeteroAtom(true);
CreateAABackbone(mol, r1, r2, atom[0], info, -1);
} else if (!strcmp(aa, "DAR")) {
CreateAABackbone(mol, r1, r2, atom[0], info, -1);
atom[1] = CreateAAAtom(mol, " CG ", info);
atom[2] = CreateAAAtom(mol, " CD ", info);
atom[3] = CreateAAAtom(mol, " NE ", info);
atom[4] = CreateAAAtom(mol, " CZ ", info);
atom[5] = CreateAAAtom(mol, " NH1", info);
atom[6] = CreateAAAtom(mol, " NH2", info);
CreateAABond(mol, atom[0], atom[1], 1);
CreateAABond(mol, atom[1], atom[2], 1);
CreateAABond(mol, atom[2], atom[3], 1);
CreateAABond(mol, atom[3], atom[4], 1);
CreateAABond(mol, atom[4], atom[5], 2);
CreateAABond(mol, atom[4], atom[6], 1);
} else if (!strcmp(aa, "DAS")) {
CreateAABackbone(mol, r1, r2, atom[0], info, -1);
atom[1] = CreateAAAtom(mol, " CG ", info);
atom[2] = CreateAAAtom(mol, " OD1", info);
atom[3] = CreateAAAtom(mol, " OD2", info);
CreateAABond(mol, atom[0], atom[1], 1);
CreateAABond(mol, atom[1], atom[2], 2);
CreateAABond(mol, atom[1], atom[3], 1);
} else if (!strcmp(aa, "DBB")) {
info.setIsHeteroAtom(true);
CreateAABackbone(mol, r1, r2, atom[0], info, 1);
atom[1] = CreateAAAtom(mol, " CG ", info);
CreateAABond(mol, atom[0], atom[1], 1);
} else if (!strcmp(aa, "DCY")) {
info.setIsHeteroAtom(true);
CreateAABackbone(mol, r1, r2, atom[0], info, -1);
r3 = CreateAAAtom(mol, " SG ", info);
CreateAABond(mol, atom[0], r3, 1);
} else if (!strcmp(aa, "DGL")) {
CreateAABackbone(mol, r1, r2, atom[0], info, -1);
atom[1] = CreateAAAtom(mol, " CG ", info);
atom[2] = CreateAAAtom(mol, " CD ", info);
atom[3] = CreateAAAtom(mol, " OE1", info);
atom[4] = CreateAAAtom(mol, " OE2", info);
CreateAABond(mol, atom[0], atom[1], 1);
CreateAABond(mol, atom[1], atom[2], 1);
CreateAABond(mol, atom[2], atom[3], 2);
CreateAABond(mol, atom[2], atom[4], 1);
} else if (!strcmp(aa, "DGN")) {
CreateAABackbone(mol, r1, r2, atom[0], info, -1);
atom[1] = CreateAAAtom(mol, " CG ", info);
atom[2] = CreateAAAtom(mol, " CD ", info);
atom[3] = CreateAAAtom(mol, " OE1", info);
atom[4] = CreateAAAtom(mol, " NE2", info);
CreateAABond(mol, atom[0], atom[1], 1);
CreateAABond(mol, atom[1], atom[2], 1);
CreateAABond(mol, atom[2], atom[3], 2);
CreateAABond(mol, atom[2], atom[4], 1);
} else if (!strcmp(aa, "DHI")) {
info.setIsHeteroAtom(true);
CreateAABackbone(mol, r1, r2, atom[0], info, -1);
atom[1] = CreateAAAtom(mol, " CG ", info);
atom[2] = CreateAAAtom(mol, " ND1", info);
atom[3] = CreateAAAtom(mol, " CD2", info);
atom[4] = CreateAAAtom(mol, " CE1", info);
atom[5] = CreateAAAtom(mol, " NE2", info);
CreateAABond(mol, atom[0], atom[1], 1);
CreateAABond(mol, atom[1], atom[2], 1);
CreateAABond(mol, atom[2], atom[4], 2);
CreateAABond(mol, atom[4], atom[5], 1);
CreateAABond(mol, atom[5], atom[3], 1);
CreateAABond(mol, atom[3], atom[1], 2);
} else if (!strcmp(aa, "DIL")) {
CreateAABackbone(mol, r1, r2, atom[0], info, -1);
atom[1] = CreateAAAtom(mol, " CG1", info);
atom[2] = CreateAAAtom(mol, " CG2", info);
atom[3] = CreateAAAtom(mol, " CD1", info);
CreateAABond(mol, atom[0], atom[1], 1);
CreateAABond(mol, atom[0], atom[2], 1);
CreateAABond(mol, atom[2], atom[3], 1);
atom[0]->setChiralTag(Atom::CHI_TETRAHEDRAL_CCW);
} else if (!strcmp(aa, "DLE")) {
CreateAABackbone(mol, r1, r2, atom[0], info, -1);
atom[1] = CreateAAAtom(mol, " CG ", info);
atom[2] = CreateAAAtom(mol, " CD1", info);
atom[3] = CreateAAAtom(mol, " CD2", info);
CreateAABond(mol, atom[0], atom[1], 1);
CreateAABond(mol, atom[1], atom[2], 1);
CreateAABond(mol, atom[1], atom[3], 1);
} else if (!strcmp(aa, "DLY")) {
CreateAABackbone(mol, r1, r2, atom[0], info, -1);
atom[1] = CreateAAAtom(mol, " CG ", info);
atom[2] = CreateAAAtom(mol, " CD ", info);
atom[3] = CreateAAAtom(mol, " CE ", info);
atom[4] = CreateAAAtom(mol, " NZ ", info);
CreateAABond(mol, atom[0], atom[1], 1);
CreateAABond(mol, atom[1], atom[2], 1);
CreateAABond(mol, atom[2], atom[3], 1);
CreateAABond(mol, atom[3], atom[4], 1);
} else if (!strcmp(aa, "DPN")) {
info.setIsHeteroAtom(true);
CreateAABackbone(mol, r1, r2, atom[0], info, -1);
atom[1] = CreateAAAtom(mol, " CG ", info);
atom[2] = CreateAAAtom(mol, " CD1", info);
atom[3] = CreateAAAtom(mol, " CD2", info);
atom[4] = CreateAAAtom(mol, " CE1", info);
atom[5] = CreateAAAtom(mol, " CE2", info);
atom[6] = CreateAAAtom(mol, " CZ ", info);
CreateAABond(mol, atom[0], atom[1], 1);
CreateAABond(mol, atom[1], atom[2], 2);
CreateAABond(mol, atom[2], atom[4], 1);
CreateAABond(mol, atom[4], atom[6], 2);
CreateAABond(mol, atom[6], atom[5], 1);
CreateAABond(mol, atom[5], atom[3], 2);
CreateAABond(mol, atom[3], atom[1], 1);
} else if (!strcmp(aa, "DPR")) {
CreateAABackbone(mol, r1, r2, atom[0], info, -1);
atom[1] = CreateAAAtom(mol, " CG ", info);
atom[2] = CreateAAAtom(mol, " CD ", info);
CreateAABond(mol, atom[0], atom[1], 1);
CreateAABond(mol, atom[1], atom[2], 1);
CreateAABond(mol, atom[2], r1, 1);
} else if (!strcmp(aa, "DSG")) {
CreateAABackbone(mol, r1, r2, atom[0], info, -1);
atom[1] = CreateAAAtom(mol, " CG ", info);
atom[2] = CreateAAAtom(mol, " OD1", info);
atom[3] = CreateAAAtom(mol, " ND2", info);
CreateAABond(mol, atom[0], atom[1], 1);
CreateAABond(mol, atom[1], atom[2], 2);
CreateAABond(mol, atom[1], atom[3], 1);
} else if (!strcmp(aa, "DTH")) {
CreateAABackbone(mol, r1, r2, atom[0], info, -1);
atom[1] = CreateAAAtom(mol, " OG1", info);
atom[2] = CreateAAAtom(mol, " CG2", info);
CreateAABond(mol, atom[0], atom[1], 1);
CreateAABond(mol, atom[0], atom[2], 1);
atom[0]->setChiralTag(Atom::CHI_TETRAHEDRAL_CW);
} else if (!strcmp(aa, "DTR")) {
CreateAABackbone(mol, r1, r2, atom[0], info, -1);
atom[1] = CreateAAAtom(mol, " CG ", info);
atom[2] = CreateAAAtom(mol, " CD1", info);
atom[3] = CreateAAAtom(mol, " CD2", info);
atom[4] = CreateAAAtom(mol, " NE1", info);
atom[5] = CreateAAAtom(mol, " CE2", info);
atom[6] = CreateAAAtom(mol, " CE3", info);
atom[7] = CreateAAAtom(mol, " CZ2", info);
atom[8] = CreateAAAtom(mol, " CZ3", info);
atom[9] = CreateAAAtom(mol, " CH2", info);
CreateAABond(mol, atom[0], atom[1], 1);
CreateAABond(mol, atom[1], atom[2], 2);
CreateAABond(mol, atom[1], atom[3], 1);
CreateAABond(mol, atom[2], atom[4], 1);
CreateAABond(mol, atom[3], atom[5], 2);
CreateAABond(mol, atom[3], atom[6], 1);
CreateAABond(mol, atom[4], atom[5], 1);
CreateAABond(mol, atom[5], atom[7], 1);
CreateAABond(mol, atom[6], atom[8], 2);
CreateAABond(mol, atom[7], atom[9], 2);
CreateAABond(mol, atom[8], atom[9], 1);
} else if (!strcmp(aa, "DVA")) {
info.setIsHeteroAtom(true);
CreateAABackbone(mol, r1, r2, atom[0], info, -1);
atom[1] = CreateAAAtom(mol, " CG1", info);
atom[2] = CreateAAAtom(mol, " CG2", info);
CreateAABond(mol, atom[0], atom[1], 1);
CreateAABond(mol, atom[0], atom[2], 1);
}
break;
case 'G':
if (!strcmp(aa, "GLN")) {
CreateAABackbone(mol, r1, r2, atom[0], info, 1);
atom[1] = CreateAAAtom(mol, " CG ", info);
atom[2] = CreateAAAtom(mol, " CD ", info);
atom[3] = CreateAAAtom(mol, " OE1", info);
atom[4] = CreateAAAtom(mol, " NE2", info);
CreateAABond(mol, atom[0], atom[1], 1);
CreateAABond(mol, atom[1], atom[2], 1);
CreateAABond(mol, atom[2], atom[3], 2);
CreateAABond(mol, atom[2], atom[4], 1);
} else if (!strcmp(aa, "GLU")) {
CreateAABackbone(mol, r1, r2, atom[0], info, 1);
atom[1] = CreateAAAtom(mol, " CG ", info);
atom[2] = CreateAAAtom(mol, " CD ", info);
atom[3] = CreateAAAtom(mol, " OE1", info);
atom[4] = CreateAAAtom(mol, " OE2", info);
CreateAABond(mol, atom[0], atom[1], 1);
CreateAABond(mol, atom[1], atom[2], 1);
CreateAABond(mol, atom[2], atom[3], 2);
CreateAABond(mol, atom[2], atom[4], 1);
} else if (!strcmp(aa, "GLY")) {
r1 = CreateAAAtom(mol, " N ", info);
atom[0] = CreateAAAtom(mol, " CA ", info);
r2 = CreateAAAtom(mol, " C ", info);
atom[1] = CreateAAAtom(mol, " O ", info);
CreateAABond(mol, r1, atom[0], 1);
CreateAABond(mol, atom[0], r2, 1);
CreateAABond(mol, r2, atom[1], 2);
}
break;
case 'H':
if (!strcmp(aa, "HIS")) {
CreateAABackbone(mol, r1, r2, atom[0], info, 1);
atom[1] = CreateAAAtom(mol, " CG ", info);
atom[2] = CreateAAAtom(mol, " ND1", info);
atom[3] = CreateAAAtom(mol, " CD2", info);
atom[4] = CreateAAAtom(mol, " CE1", info);
atom[5] = CreateAAAtom(mol, " NE2", info);
CreateAABond(mol, atom[0], atom[1], 1);
CreateAABond(mol, atom[1], atom[2], 1);
CreateAABond(mol, atom[2], atom[4], 2);
CreateAABond(mol, atom[4], atom[5], 1);
CreateAABond(mol, atom[5], atom[3], 1);
CreateAABond(mol, atom[3], atom[1], 2);
}
break;
case 'I':
if (!strcmp(aa, "ILE")) {
CreateAABackbone(mol, r1, r2, atom[0], info, 1);
atom[1] = CreateAAAtom(mol, " CG1", info);
atom[2] = CreateAAAtom(mol, " CG2", info);
atom[3] = CreateAAAtom(mol, " CD1", info);
CreateAABond(mol, atom[0], atom[1], 1);
CreateAABond(mol, atom[0], atom[2], 1);
CreateAABond(mol, atom[2], atom[3], 1);
atom[0]->setChiralTag(Atom::CHI_TETRAHEDRAL_CW);
}
break;
case 'L':
if (!strcmp(aa, "LEU")) {
CreateAABackbone(mol, r1, r2, atom[0], info, 1);
atom[1] = CreateAAAtom(mol, " CG ", info);
atom[2] = CreateAAAtom(mol, " CD1", info);
atom[3] = CreateAAAtom(mol, " CD2", info);
CreateAABond(mol, atom[0], atom[1], 1);
CreateAABond(mol, atom[1], atom[2], 1);
CreateAABond(mol, atom[1], atom[3], 1);
} else if (!strcmp(aa, "LYS")) {
CreateAABackbone(mol, r1, r2, atom[0], info, 1);
atom[1] = CreateAAAtom(mol, " CG ", info);
atom[2] = CreateAAAtom(mol, " CD ", info);
atom[3] = CreateAAAtom(mol, " CE ", info);
atom[4] = CreateAAAtom(mol, " NZ ", info);
CreateAABond(mol, atom[0], atom[1], 1);
CreateAABond(mol, atom[1], atom[2], 1);
CreateAABond(mol, atom[2], atom[3], 1);
CreateAABond(mol, atom[3], atom[4], 1);
}
break;
case 'M':
if (!strcmp(aa, "MET")) {
CreateAABackbone(mol, r1, r2, atom[0], info, 1);
atom[1] = CreateAAAtom(mol, " CG ", info);
atom[2] = CreateAAAtom(mol, " SD ", info);
atom[3] = CreateAAAtom(mol, " CE ", info);
CreateAABond(mol, atom[0], atom[1], 1);
CreateAABond(mol, atom[1], atom[2], 1);
CreateAABond(mol, atom[2], atom[3], 1);
} else if (!strcmp(aa, "MED")) {
info.setIsHeteroAtom(true);
CreateAABackbone(mol, r1, r2, atom[0], info, -1);
atom[1] = CreateAAAtom(mol, " CG ", info);
atom[2] = CreateAAAtom(mol, " SD ", info);
atom[3] = CreateAAAtom(mol, " CE ", info);
CreateAABond(mol, atom[0], atom[1], 1);
CreateAABond(mol, atom[1], atom[2], 1);
CreateAABond(mol, atom[2], atom[3], 1);
} else if (!strcmp(aa, "MSE")) {
info.setIsHeteroAtom(true);
CreateAABackbone(mol, r1, r2, atom[0], info, 1);
atom[1] = CreateAAAtom(mol, " CG ", info);
atom[2] = CreateAAAtom(mol, "SE ", info);
atom[3] = CreateAAAtom(mol, " CE ", info);
CreateAABond(mol, atom[0], atom[1], 1);
CreateAABond(mol, atom[1], atom[2], 1);
CreateAABond(mol, atom[2], atom[3], 1);
}
break;
case 'N':
if (!strcmp(aa, "NLE")) {
info.setIsHeteroAtom(true);
CreateAABackbone(mol, r1, r2, atom[0], info, 1);
atom[1] = CreateAAAtom(mol, " CG ", info);
atom[2] = CreateAAAtom(mol, " CD ", info);
atom[3] = CreateAAAtom(mol, " CE ", info);
CreateAABond(mol, atom[0], atom[1], 1);
CreateAABond(mol, atom[1], atom[2], 1);
CreateAABond(mol, atom[2], atom[3], 1);
} else if (!strcmp(aa, "NVA")) {
info.setIsHeteroAtom(true);
CreateAABackbone(mol, r1, r2, atom[0], info, 1);
atom[1] = CreateAAAtom(mol, " CG ", info);
atom[2] = CreateAAAtom(mol, " CD ", info);
CreateAABond(mol, atom[0], atom[1], 1);
CreateAABond(mol, atom[1], atom[2], 1);
}
break;
case 'O':
if (!strcmp(aa, "ORN")) {
info.setIsHeteroAtom(true);
CreateAABackbone(mol, r1, r2, atom[0], info, 1);
atom[1] = CreateAAAtom(mol, " CG ", info);
atom[2] = CreateAAAtom(mol, " CD ", info);
atom[3] = CreateAAAtom(mol, " NE ", info);
CreateAABond(mol, atom[0], atom[1], 1);
CreateAABond(mol, atom[1], atom[2], 1);
CreateAABond(mol, atom[2], atom[3], 1);
}
break;
case 'P':
if (!strcmp(aa, "PHE")) {
CreateAABackbone(mol, r1, r2, atom[0], info, 1);
atom[1] = CreateAAAtom(mol, " CG ", info);
atom[2] = CreateAAAtom(mol, " CD1", info);
atom[3] = CreateAAAtom(mol, " CD2", info);
atom[4] = CreateAAAtom(mol, " CE1", info);
atom[5] = CreateAAAtom(mol, " CE2", info);
atom[6] = CreateAAAtom(mol, " CZ ", info);
CreateAABond(mol, atom[0], atom[1], 1);
CreateAABond(mol, atom[1], atom[2], 2);
CreateAABond(mol, atom[2], atom[4], 1);
CreateAABond(mol, atom[4], atom[6], 2);
CreateAABond(mol, atom[6], atom[5], 1);
CreateAABond(mol, atom[5], atom[3], 2);
CreateAABond(mol, atom[3], atom[1], 1);
} else if (!strcmp(aa, "PRO")) {
CreateAABackbone(mol, r1, r2, atom[0], info, 1);
atom[1] = CreateAAAtom(mol, " CG ", info);
atom[2] = CreateAAAtom(mol, " CD ", info);
CreateAABond(mol, atom[0], atom[1], 1);
CreateAABond(mol, atom[1], atom[2], 1);
CreateAABond(mol, atom[2], r1, 1);
} else if (!strcmp(aa, "PCA")) {
info.setIsHeteroAtom(true);
CreateAABackbone(mol, r1, r2, atom[0], info, 1);
atom[1] = CreateAAAtom(mol, " CG ", info);
atom[2] = CreateAAAtom(mol, " CD ", info);
atom[3] = CreateAAAtom(mol, " OE ", info);
CreateAABond(mol, atom[0], atom[1], 1);
CreateAABond(mol, atom[1], atom[2], 1);
CreateAABond(mol, atom[2], r1, 1);
CreateAABond(mol, atom[2], atom[3], 2);
}
break;
case 'S':
if (!strcmp(aa, "SER")) {
CreateAABackbone(mol, r1, r2, atom[0], info, 1);
atom[1] = CreateAAAtom(mol, " OG ", info);
CreateAABond(mol, atom[0], atom[1], 1);
} else if (!strcmp(aa, "SAR")) {
info.setIsHeteroAtom(true);
r1 = CreateAAAtom(mol, " N ", info);
atom[0] = CreateAAAtom(mol, " CA ", info);
r2 = CreateAAAtom(mol, " C ", info);
atom[1] = CreateAAAtom(mol, " O ", info);
atom[2] = CreateAAAtom(mol, " CN ", info);
CreateAABond(mol, r1, atom[0], 1);
CreateAABond(mol, atom[0], r2, 1);
CreateAABond(mol, r2, atom[1], 2);
CreateAABond(mol, r1, atom[2], 1);
}
break;
case 'T':
if (!strcmp(aa, "THR")) {
CreateAABackbone(mol, r1, r2, atom[0], info, 1);
atom[1] = CreateAAAtom(mol, " OG1", info);
atom[2] = CreateAAAtom(mol, " CG2", info);
CreateAABond(mol, atom[0], atom[1], 1);
CreateAABond(mol, atom[0], atom[2], 1);
atom[0]->setChiralTag(Atom::CHI_TETRAHEDRAL_CCW);
} else if (!strcmp(aa, "TRP")) {
CreateAABackbone(mol, r1, r2, atom[0], info, 1);
atom[1] = CreateAAAtom(mol, " CG ", info);
atom[2] = CreateAAAtom(mol, " CD1", info);
atom[3] = CreateAAAtom(mol, " CD2", info);
atom[4] = CreateAAAtom(mol, " NE1", info);
atom[5] = CreateAAAtom(mol, " CE2", info);
atom[6] = CreateAAAtom(mol, " CE3", info);
atom[7] = CreateAAAtom(mol, " CZ2", info);
atom[8] = CreateAAAtom(mol, " CZ3", info);
atom[9] = CreateAAAtom(mol, " CH2", info);
CreateAABond(mol, atom[0], atom[1], 1);
CreateAABond(mol, atom[1], atom[2], 2);
CreateAABond(mol, atom[1], atom[3], 1);
CreateAABond(mol, atom[2], atom[4], 1);
CreateAABond(mol, atom[3], atom[5], 2);
CreateAABond(mol, atom[3], atom[6], 1);
CreateAABond(mol, atom[4], atom[5], 1);
CreateAABond(mol, atom[5], atom[7], 1);
CreateAABond(mol, atom[6], atom[8], 2);
CreateAABond(mol, atom[7], atom[9], 2);
CreateAABond(mol, atom[8], atom[9], 1);
} else if (!strcmp(aa, "TYR")) {
CreateAABackbone(mol, r1, r2, atom[0], info, 1);
atom[1] = CreateAAAtom(mol, " CG ", info);
atom[2] = CreateAAAtom(mol, " CD1", info);
atom[3] = CreateAAAtom(mol, " CD2", info);
atom[4] = CreateAAAtom(mol, " CE1", info);
atom[5] = CreateAAAtom(mol, " CE2", info);
atom[6] = CreateAAAtom(mol, " CZ ", info);
atom[7] = CreateAAAtom(mol, " OH ", info);
CreateAABond(mol, atom[0], atom[1], 1);
CreateAABond(mol, atom[1], atom[2], 2);
CreateAABond(mol, atom[2], atom[4], 1);
CreateAABond(mol, atom[4], atom[6], 2);
CreateAABond(mol, atom[6], atom[5], 1);
CreateAABond(mol, atom[5], atom[3], 2);
CreateAABond(mol, atom[3], atom[1], 1);
CreateAABond(mol, atom[6], atom[7], 1);
}
break;
case 'V':
if (!strcmp(aa, "VAL")) {
CreateAABackbone(mol, r1, r2, atom[0], info, 1);
atom[1] = CreateAAAtom(mol, " CG1", info);
atom[2] = CreateAAAtom(mol, " CG2", info);
CreateAABond(mol, atom[0], atom[1], 1);
CreateAABond(mol, atom[0], atom[2], 1);
}
break;
}
}
static RWMol *AASequenceToMol(const char *seq, bool lowerD) {
AtomPDBResidueInfo info;
Atom *prev = (Atom *)0;
char chain[2];
chain[0] = 'A';
chain[1] = '\0';
info.setSerialNumber(1);
info.setAltLoc(" ");
info.setResidueNumber(0);
info.setInsertionCode(" ");
info.setChainId(chain);
RWMol *mol = new RWMol();
while (*seq) {
Atom *r1 = (Atom *)0;
Atom *r2 = (Atom *)0;
Atom *r3 = (Atom *)0;
switch (*seq) {
case '\n':
case '\r':
case '-':
seq++;
continue;
case ' ':
case '\t':
case '\0':
break;
case '.':
if (prev) {
Atom *oxt = CreateAAAtom(mol, " OXT", info);
CreateAABond(mol, prev, oxt, 1);
if (chain[0] < 'Z') {
chain[0]++;
info.setChainId(chain);
}
info.setResidueNumber(0);
prev = (Atom *)0;
}
seq++;
continue;
default:
delete mol;
return (RWMol *)0;
case 'A':
CreateAminoAcid(mol, "ALA", r1, r2, r3, info);
break;
case 'C':
CreateAminoAcid(mol, "CYS", r1, r2, r3, info);
break;
case 'D':
CreateAminoAcid(mol, "ASP", r1, r2, r3, info);
break;
case 'E':
CreateAminoAcid(mol, "GLU", r1, r2, r3, info);
break;
case 'F':
CreateAminoAcid(mol, "PHE", r1, r2, r3, info);
break;
case 'G':
case 'g':
CreateAminoAcid(mol, "GLY", r1, r2, r3, info);
break;
case 'H':
CreateAminoAcid(mol, "HIS", r1, r2, r3, info);
break;
case 'I':
CreateAminoAcid(mol, "ILE", r1, r2, r3, info);
break;
case 'K':
CreateAminoAcid(mol, "LYS", r1, r2, r3, info);
break;
case 'L':
CreateAminoAcid(mol, "LEU", r1, r2, r3, info);
break;
case 'M':
CreateAminoAcid(mol, "MET", r1, r2, r3, info);
break;
case 'N':
CreateAminoAcid(mol, "ASN", r1, r2, r3, info);
break;
case 'P':
CreateAminoAcid(mol, "PRO", r1, r2, r3, info);
break;
case 'Q':
CreateAminoAcid(mol, "GLN", r1, r2, r3, info);
break;
case 'R':
CreateAminoAcid(mol, "ARG", r1, r2, r3, info);
break;
case 'S':
CreateAminoAcid(mol, "SER", r1, r2, r3, info);
break;
case 'T':
CreateAminoAcid(mol, "THR", r1, r2, r3, info);
break;
case 'V':
CreateAminoAcid(mol, "VAL", r1, r2, r3, info);
break;
case 'W':
CreateAminoAcid(mol, "TRP", r1, r2, r3, info);
break;
case 'Y':
CreateAminoAcid(mol, "TYR", r1, r2, r3, info);
break;
case 'a':
CreateAminoAcid(mol, lowerD ? "DAL" : "ALA", r1, r2, r3, info);
break;
case 'c':
CreateAminoAcid(mol, lowerD ? "DCY" : "CYS", r1, r2, r3, info);
break;
case 'd':
CreateAminoAcid(mol, lowerD ? "DAS" : "ASP", r1, r2, r3, info);
break;
case 'e':
CreateAminoAcid(mol, lowerD ? "DGL" : "GLU", r1, r2, r3, info);
break;
case 'f':
CreateAminoAcid(mol, lowerD ? "DPN" : "PHE", r1, r2, r3, info);
break;
case 'h':
CreateAminoAcid(mol, lowerD ? "DHI" : "HIS", r1, r2, r3, info);
break;
case 'i':
CreateAminoAcid(mol, lowerD ? "DIL" : "ILE", r1, r2, r3, info);
break;
case 'k':
CreateAminoAcid(mol, lowerD ? "DLY" : "LYS", r1, r2, r3, info);
break;
case 'l':
CreateAminoAcid(mol, lowerD ? "DLE" : "LEU", r1, r2, r3, info);
break;
case 'm':
CreateAminoAcid(mol, lowerD ? "MED" : "MET", r1, r2, r3, info);
break;
case 'n':
CreateAminoAcid(mol, lowerD ? "DSG" : "ASN", r1, r2, r3, info);
break;
case 'p':
CreateAminoAcid(mol, lowerD ? "DPR" : "PRO", r1, r2, r3, info);
break;
case 'q':
CreateAminoAcid(mol, lowerD ? "DGN" : "GLN", r1, r2, r3, info);
break;
case 'r':
CreateAminoAcid(mol, lowerD ? "DAR" : "ARG", r1, r2, r3, info);
break;
case 's':
CreateAminoAcid(mol, lowerD ? "DSN" : "SER", r1, r2, r3, info);
break;
case 't':
CreateAminoAcid(mol, lowerD ? "DTH" : "THR", r1, r2, r3, info);
break;
case 'v':
CreateAminoAcid(mol, lowerD ? "DVA" : "VAL", r1, r2, r3, info);
break;
case 'w':
CreateAminoAcid(mol, lowerD ? "DTR" : "TRP", r1, r2, r3, info);
break;
case 'y':
CreateAminoAcid(mol, lowerD ? "DTY" : "TYR", r1, r2, r3, info);
break;
}
if (prev && r1) CreateAABond(mol, prev, r1, 1);
prev = r2;
seq++;
}
if (prev) {
Atom *oxt = CreateAAAtom(mol, " OXT", info);
CreateAABond(mol, prev, oxt, 1);
}
return mol;
}
static void CreateNucleicAcid(RWMol *mol, const char *na, Atom *&r1, Atom *&r2,
AtomPDBResidueInfo &info, bool PCap5)
{
Atom *atom[32];
r1 = (Atom *)0;
r2 = (Atom *)0;
int resno = info.getResidueNumber();
info.setResidueNumber(resno + 1);
info.setIsHeteroAtom(false);
info.setResidueName(na);
if (resno) {
atom[1] = CreateAAAtom(mol, " P ", info);
atom[2] = CreateAAAtom(mol, " OP1", info);
atom[3] = CreateAAAtom(mol, " OP2", info);
atom[4] = CreateAAAtom(mol, " O5'", info);
CreateAABond(mol, atom[1], atom[2], 2);
CreateAABond(mol, atom[1], atom[3], 1);
CreateAABond(mol, atom[1], atom[4], 1);
r1 = atom[1];
} else if (PCap5) {
atom[0] = CreateAAAtom(mol, " OP3", info);
atom[1] = CreateAAAtom(mol, " P ", info);
atom[2] = CreateAAAtom(mol, " OP1", info);
atom[3] = CreateAAAtom(mol, " OP2", info);
atom[4] = CreateAAAtom(mol, " O5'", info);
CreateAABond(mol, atom[0], atom[1], 1);
CreateAABond(mol, atom[1], atom[2], 2);
CreateAABond(mol, atom[1], atom[3], 1);
CreateAABond(mol, atom[1], atom[4], 1);
r1 = atom[0];
} else {
atom[4] = CreateAAAtom(mol, " O5'", info);
r1 = atom[4];
}
atom[5] = CreateAAAtom(mol, " C5'", info);
atom[6] = CreateAAAtom(mol, " C4'", info);
atom[7] = CreateAAAtom(mol, " O4'", info);
atom[8] = CreateAAAtom(mol, " C3'", info);
atom[9] = CreateAAAtom(mol, " O3'", info);
atom[10] = CreateAAAtom(mol, " C2'", info);
CreateAABond(mol, atom[4], atom[5], 1);
CreateAABond(mol, atom[5], atom[6], 1);
CreateAABond(mol, atom[6], atom[7], 1);
CreateAABond(mol, atom[6], atom[8], 1);
CreateAABond(mol, atom[8], atom[9], 1);
CreateAABond(mol, atom[8], atom[10], 1);
atom[6]->setChiralTag(Atom::CHI_TETRAHEDRAL_CW);
atom[8]->setChiralTag(Atom::CHI_TETRAHEDRAL_CW);
if (na[1]==' ') {
atom[11] = CreateAAAtom(mol, " O2'", info);
atom[12] = CreateAAAtom(mol, " C1'", info);
CreateAABond(mol, atom[10], atom[11], 1);
CreateAABond(mol, atom[7], atom[12], 1);
CreateAABond(mol, atom[10], atom[12], 1);
atom[10]->setChiralTag(Atom::CHI_TETRAHEDRAL_CW);
} else {
atom[12] = CreateAAAtom(mol, " C1'", info);
CreateAABond(mol, atom[7], atom[12], 1);
CreateAABond(mol, atom[10], atom[12], 1);
}
r2 = atom[9];
switch (na[2]) {
case 'A':
atom[13] = CreateAAAtom(mol, " N9 ", info);
atom[14] = CreateAAAtom(mol, " C8 ", info);
atom[15] = CreateAAAtom(mol, " N7 ", info);
atom[16] = CreateAAAtom(mol, " C5 ", info);
atom[17] = CreateAAAtom(mol, " C6 ", info);
atom[18] = CreateAAAtom(mol, " N6 ", info);
atom[19] = CreateAAAtom(mol, " N1 ", info);
atom[20] = CreateAAAtom(mol, " C2 ", info);
atom[21] = CreateAAAtom(mol, " N3 ", info);
atom[22] = CreateAAAtom(mol, " C4 ", info);
CreateAABond(mol, atom[12], atom[13], 1);
CreateAABond(mol, atom[13], atom[14], 1);
CreateAABond(mol, atom[13], atom[22], 1);
CreateAABond(mol, atom[14], atom[15], 2);
CreateAABond(mol, atom[15], atom[16], 1);
CreateAABond(mol, atom[16], atom[17], 1);
CreateAABond(mol, atom[16], atom[22], 2);
CreateAABond(mol, atom[17], atom[18], 1);
CreateAABond(mol, atom[17], atom[19], 2);
CreateAABond(mol, atom[19], atom[20], 1);
CreateAABond(mol, atom[20], atom[21], 2);
CreateAABond(mol, atom[21], atom[22], 1);
atom[12]->setChiralTag(Atom::CHI_TETRAHEDRAL_CCW);
break;
case 'C':
atom[13] = CreateAAAtom(mol, " N1 ", info);
atom[14] = CreateAAAtom(mol, " C2 ", info);
atom[15] = CreateAAAtom(mol, " O2 ", info);
atom[16] = CreateAAAtom(mol, " N3 ", info);
atom[17] = CreateAAAtom(mol, " C4 ", info);
atom[18] = CreateAAAtom(mol, " N4 ", info);
atom[19] = CreateAAAtom(mol, " C5 ", info);
atom[20] = CreateAAAtom(mol, " C6 ", info);
CreateAABond(mol, atom[12], atom[13], 1);
CreateAABond(mol, atom[13], atom[14], 1);
CreateAABond(mol, atom[13], atom[20], 1);
CreateAABond(mol, atom[14], atom[15], 2);
CreateAABond(mol, atom[14], atom[16], 1);
CreateAABond(mol, atom[16], atom[17], 2);
CreateAABond(mol, atom[17], atom[18], 1);
CreateAABond(mol, atom[17], atom[19], 1);
CreateAABond(mol, atom[19], atom[20], 2);
atom[12]->setChiralTag(Atom::CHI_TETRAHEDRAL_CCW);
break;
case 'G':
atom[13] = CreateAAAtom(mol, " N9 ", info);
atom[14] = CreateAAAtom(mol, " C8 ", info);
atom[15] = CreateAAAtom(mol, " N7 ", info);
atom[16] = CreateAAAtom(mol, " C5 ", info);
atom[17] = CreateAAAtom(mol, " C6 ", info);
atom[18] = CreateAAAtom(mol, " O6 ", info);
atom[19] = CreateAAAtom(mol, " N1 ", info);
atom[20] = CreateAAAtom(mol, " C2 ", info);
atom[21] = CreateAAAtom(mol, " N2 ", info);
atom[22] = CreateAAAtom(mol, " N3 ", info);
atom[23] = CreateAAAtom(mol, " C4 ", info);
CreateAABond(mol, atom[12], atom[13], 1);
CreateAABond(mol, atom[13], atom[14], 1);
CreateAABond(mol, atom[13], atom[23], 1);
CreateAABond(mol, atom[14], atom[15], 2);
CreateAABond(mol, atom[15], atom[16], 1);
CreateAABond(mol, atom[16], atom[17], 1);
CreateAABond(mol, atom[16], atom[23], 2);
CreateAABond(mol, atom[17], atom[18], 2);
CreateAABond(mol, atom[17], atom[19], 1);
CreateAABond(mol, atom[19], atom[20], 1);
CreateAABond(mol, atom[20], atom[21], 1);
CreateAABond(mol, atom[20], atom[22], 2);
CreateAABond(mol, atom[22], atom[23], 1);
atom[12]->setChiralTag(Atom::CHI_TETRAHEDRAL_CCW);
break;
case 'T':
atom[13] = CreateAAAtom(mol, " N1 ", info);
atom[14] = CreateAAAtom(mol, " C2 ", info);
atom[15] = CreateAAAtom(mol, " O2 ", info);
atom[16] = CreateAAAtom(mol, " N3 ", info);
atom[17] = CreateAAAtom(mol, " C4 ", info);
atom[18] = CreateAAAtom(mol, " O4 ", info);
atom[19] = CreateAAAtom(mol, " C5 ", info);
atom[20] = CreateAAAtom(mol, " C7 ", info);
atom[21] = CreateAAAtom(mol, " C6 ", info);
CreateAABond(mol, atom[12], atom[13], 1);
CreateAABond(mol, atom[13], atom[14], 1);
CreateAABond(mol, atom[13], atom[21], 1);
CreateAABond(mol, atom[14], atom[15], 2);
CreateAABond(mol, atom[14], atom[16], 1);
CreateAABond(mol, atom[16], atom[17], 1);
CreateAABond(mol, atom[17], atom[18], 2);
CreateAABond(mol, atom[17], atom[19], 1);
CreateAABond(mol, atom[19], atom[20], 1);
CreateAABond(mol, atom[19], atom[21], 2);
atom[12]->setChiralTag(Atom::CHI_TETRAHEDRAL_CCW);
break;
case 'U':
atom[13] = CreateAAAtom(mol, " N1 ", info);
atom[14] = CreateAAAtom(mol, " C2 ", info);
atom[15] = CreateAAAtom(mol, " O2 ", info);
atom[16] = CreateAAAtom(mol, " N3 ", info);
atom[17] = CreateAAAtom(mol, " C4 ", info);
atom[18] = CreateAAAtom(mol, " O4 ", info);
atom[19] = CreateAAAtom(mol, " C5 ", info);
atom[20] = CreateAAAtom(mol, " C6 ", info);
CreateAABond(mol, atom[12], atom[13], 1);
CreateAABond(mol, atom[13], atom[14], 1);
CreateAABond(mol, atom[13], atom[20], 1);
CreateAABond(mol, atom[14], atom[15], 2);
CreateAABond(mol, atom[14], atom[16], 1);
CreateAABond(mol, atom[16], atom[17], 1);
CreateAABond(mol, atom[17], atom[18], 2);
CreateAABond(mol, atom[17], atom[19], 1);
CreateAABond(mol, atom[19], atom[20], 2);
atom[12]->setChiralTag(Atom::CHI_TETRAHEDRAL_CCW);
break;
}
}
static void CreatePCap3(RWMol *mol, Atom *prev, AtomPDBResidueInfo &info) {
Atom *atom[4];
int resno = info.getResidueNumber();
info.setResidueNumber(resno + 1);
info.setIsHeteroAtom(true);
info.setResidueName("2PO");
atom[0] = CreateAAAtom(mol, " P ", info);
atom[1] = CreateAAAtom(mol, " OP1", info);
atom[2] = CreateAAAtom(mol, " OP2", info);
atom[3] = CreateAAAtom(mol, " OP3", info);
CreateAABond(mol, prev, atom[0], 1);
CreateAABond(mol, atom[0], atom[1], 2);
CreateAABond(mol, atom[0], atom[2], 1);
CreateAABond(mol, atom[0], atom[3], 1);
}
static RWMol *NASequenceToMol(const char *seq, bool Dna,
bool PCap5, bool PCap3) {
char chain[2];
chain[0] = 'A';
chain[1] = '\0';
Atom *prev = (Atom *)0;
AtomPDBResidueInfo info;
info.setSerialNumber(1);
info.setAltLoc(" ");
info.setResidueNumber(0);
info.setInsertionCode(" ");
info.setChainId(chain);
RWMol *mol = new RWMol();
while (*seq) {
Atom *r1 = (Atom *)0;
Atom *r2 = (Atom *)0;
switch (*seq) {
case '\n':
case '\r':
case '-':
seq++;
continue;
case ' ':
case '\t':
case '\0':
break;
case '.':
if (prev) {
if (PCap3)
CreatePCap3(mol, prev, info);
if (chain[0] < 'Z') {
chain[0]++;
info.setChainId(chain);
}
info.setResidueNumber(0);
}
prev = (Atom *)0;
seq++;
continue;
default:
delete mol;
return (RWMol *)0;
case 'A':
case 'a':
CreateNucleicAcid(mol, Dna ? " DA" : " A", r1, r2, info, PCap5);
break;
case 'C':
case 'c':
CreateNucleicAcid(mol, Dna ? " DC" : " C", r1, r2, info, PCap5);
break;
case 'G':
case 'g':
CreateNucleicAcid(mol, Dna ? " DG" : " G", r1, r2, info, PCap5);
break;
case 'T':
case 't':
CreateNucleicAcid(mol, Dna ? " DT" : " T", r1, r2, info, PCap5);
break;
case 'U':
case 'u':
CreateNucleicAcid(mol, Dna ? " DU" : " U", r1, r2, info, PCap5);
break;
}
if (prev && r1) CreateAABond(mol, prev, r1, 1);
prev = r2;
seq++;
}
if (prev && PCap3)
CreatePCap3(mol, prev, info);
return mol;
}
RWMol *SequenceToMol(const char *seq, bool sanitize, int flavor) {
if (!seq) return (RWMol *)0;
RWMol *mol;
switch (flavor) {
/* Protein */
case 0: mol = AASequenceToMol(seq,false); break;
case 1: mol = AASequenceToMol(seq,true); break;
/* RNA */
case 2: mol = NASequenceToMol(seq,false,false,false); break;
case 3: mol = NASequenceToMol(seq,false,true,false); break;
case 4: mol = NASequenceToMol(seq,false,false,true); break;
case 5: mol = NASequenceToMol(seq,false,true,true); break;
/* DNA */
case 6: mol = NASequenceToMol(seq,true,false,false); break;
case 7: mol = NASequenceToMol(seq,true,true,false); break;
case 8: mol = NASequenceToMol(seq,true,false,true); break;
case 9: mol = NASequenceToMol(seq,true,true,true); break;
default:
return (RWMol*)0;
}
if (sanitize && mol)
MolOps::sanitizeMol(*mol);
return mol;
}
RWMol *SequenceToMol(const std::string &seq, bool sanitize, int flavor) {
return SequenceToMol(seq.c_str(), sanitize, flavor);
}
RWMol *SequenceToMol(const char *seq, bool sanitize, bool lowerD) {
return SequenceToMol(seq, sanitize, lowerD ? 1 : 0);
}
RWMol *SequenceToMol(const std::string &seq, bool sanitize, bool lowerD) {
return SequenceToMol(seq.c_str(), sanitize, lowerD ? 1 : 0);
}
RWMol *FASTAToMol(const char *seq, bool sanitize, int flavor) {
if (!seq) return (RWMol *)0;
std::string title;
if (seq[0] == '>') {
seq++;
while (*seq && *seq != '\n' && *seq != '\r') title += *seq++;
}
RWMol *mol = SequenceToMol(seq, sanitize, flavor);
if (!title.empty()) mol->setProp(common_properties::_Name, title);
return mol;
}
RWMol *FASTAToMol(const std::string &seq, bool sanitize, int flavor) {
return FASTAToMol(seq.c_str(), sanitize, flavor);
}
RWMol *FASTAToMol(const char *seq, bool sanitize, bool lowerD) {
return FASTAToMol(seq, sanitize, lowerD ? 1 : 0);
}
RWMol *FASTAToMol(const std::string &seq, bool sanitize, bool lowerD) {
return FASTAToMol(seq.c_str(), sanitize, lowerD ? 1 : 0);
}
struct HELMMonomer {
Atom *r1;
Atom *r2;
Atom *r3;
Atom *oxt;
HELMMonomer() : r1(0), r2(0), r3(0), oxt(0) {}
HELMMonomer(Atom *x, Atom *y, Atom *z) : r1(x), r2(y), r3(z), oxt(0) {}
};
static const char *GetHELMOneLetterCode(char ch) {
switch (ch) {
case 'A':
return "ALA";
case 'C':
return "CYS";
case 'D':
return "ASP";
case 'E':
return "GLU";
case 'F':
return "PHE";
case 'G':
return "GLY";
case 'H':
return "HIS";
case 'I':
return "ILE";
case 'K':
return "LYS";
case 'L':
return "LEU";
case 'M':
return "MET";
case 'N':
return "ASN";
case 'P':
return "PRO";
case 'Q':
return "GLN";
case 'R':
return "ARG";
case 'S':
return "SER";
case 'T':
return "THR";
case 'V':
return "VAL";
case 'W':
return "TRP";
case 'Y':
return "TYR";
}
return (char *)0;
}
static bool IsHELMMonomerIDChar(char ch) {
if (ch >= 'A' && ch <= 'Z') return true;
if (ch >= 'a' && ch <= 'z') return true;
if (ch >= '0' && ch <= '9') return true;
return false;
}
static const char *LookupHELMPeptideMonomer(const char *ptr) {
switch (ptr[0]) {
case 'A':
if (ptr[1] == '\0') return "ALA";
if (ptr[1] == 'b' && ptr[2] == 'u' && ptr[3] == '\0') return "ABA";
break;
case 'C':
if (ptr[1] == '\0') return "CYS";
break;
case 'D':
if (ptr[1] == '\0') return "ASP";
case 'E':
if (ptr[1] == '\0') return "GLU";
break;
case 'F':
if (ptr[1] == '\0') return "PHE";
break;
case 'G':
if (ptr[1] == '\0') return "GLY";
if (ptr[1] == 'l' && ptr[2] == 'p' && ptr[3] == '\0') return "PCA";
break;
case 'H':
if (ptr[1] == '\0') return "HIS";
break;
case 'I':
if (ptr[1] == '\0') return "ILE";
break;
case 'K':
if (ptr[1] == '\0') return "LYS";
break;
case 'L':
if (ptr[1] == '\0') return "LEU";
break;
case 'M':
if (ptr[1] == '\0') return "MET";
break;
case 'N':
if (ptr[1] == '\0') return "ASN";
if (ptr[1] == 'a' && ptr[2] == 'l' && ptr[3] == '\0') return "NAL";
if (ptr[1] == 'l' && ptr[2] == 'e' && ptr[3] == '\0') return "NLE";
if (ptr[1] == 'v' && ptr[2] == 'a' && ptr[3] == '\0') return "NVA";
break;
case 'O':
if (ptr[1] == 'r' && ptr[2] == 'n' && ptr[3] == '\0') return "ORN";
break;
case 'P':
if (ptr[1] == '\0') return "PRO";
break;
case 'Q':
if (ptr[1] == '\0') return "GLN";
break;
case 'R':
if (ptr[1] == '\0') return "ARG";
break;
case 'S':
if (ptr[1] == '\0') return "SER";
if (ptr[1] == 'a' && ptr[2] == 'r' && ptr[3] == '\0') return "SAR";
break;
case 'T':
if (ptr[1] == '\0') return "THR";
break;
case 'V':
if (ptr[1] == '\0') return "VAL";
break;
case 'W':
if (ptr[1] == '\0') return "TRP";
break;
case 'Y':
if (ptr[1] == '\0') return "TYR";
break;
case 'd':
switch (ptr[1]) {
case 'A':
if (ptr[2] == '\0') return "DAL";
break;
case 'C':
if (ptr[2] == '\0') return "DCY";
break;
case 'D':
if (ptr[2] == '\0') return "DAS";
break;
case 'E':
if (ptr[2] == '\0') return "DGL";
break;
case 'F':
if (ptr[2] == '\0') return "DPN";
break;
case 'H':
if (ptr[2] == '\0') return "DHI";
break;
case 'I':
if (ptr[2] == '\0') return "DIL";
break;
case 'K':
if (ptr[2] == '\0') return "DLY";
break;
case 'L':
if (ptr[2] == '\0') return "DLE";
break;
case 'M':
if (ptr[2] == '\0') return "MED";
break;
case 'N':
if (ptr[2] == '\0') return "DSG";
break;
case 'P':
if (ptr[2] == '\0') return "DPR";
break;
case 'Q':
if (ptr[2] == '\0') return "DGN";
break;
case 'R':
if (ptr[2] == '\0') return "DAR";
break;
case 'S':
if (ptr[2] == '\0') return "DSN";
break;
case 'T':
if (ptr[2] == '\0') return "DTH";
break;
case 'V':
if (ptr[2] == '\0') return "DVA";
break;
case 'W':
if (ptr[2] == '\0') return "DTR";
break;
case 'Y':
if (ptr[2] == '\0') return "DTY";
break;
}
break;
case 's':
if (ptr[1] == 'e' && ptr[2] == 'C' && ptr[3] == '\0') return "MSE";
break;
}
return (const char *)0;
}
static const char *ParseHELMPeptide(RWMol *mol, const char *ptr,
const char *chain,
std::vector<HELMMonomer> &vseq) {
unsigned int len = 0;
HELMMonomer curr;
vseq.clear();
if (ptr[0] == '}') return ptr;
AtomPDBResidueInfo info;
info.setSerialNumber(1);
info.setAltLoc(" ");
info.setResidueNumber(0);
info.setInsertionCode(" ");
info.setChainId(chain);
if (ptr[0] == '[' && ptr[1] == 'a' && ptr[2] == 'c' && ptr[3] == ']') {
if (ptr[4] != '.') return (const char *)0;
info.setResidueNumber(-2);
CreateAminoAcid(mol, "ACE", curr.r1, curr.r2, curr.r3, info);
vseq.push_back(curr);
info.setResidueNumber(0);
ptr += 5;
len = 1;
}
for (;;) {
const char *name = 0;
if (*ptr == '[') {
std::string tmp;
ptr++;
while (IsHELMMonomerIDChar(*ptr)) tmp += *ptr++;
if (*ptr != ']') return (char *)0;
name = LookupHELMPeptideMonomer(tmp.c_str());
} else
name = GetHELMOneLetterCode(*ptr);
if (!name) return (const char *)0;
ptr++;
CreateAminoAcid(mol, name, curr.r1, curr.r2, curr.r3, info);
if (len && vseq[len - 1].r2 && curr.r1) {
CreateAABond(mol, vseq[len - 1].r2, curr.r1, 1);
vseq[len - 1].r2 = 0;
}
vseq.push_back(curr);
len++;
if (*ptr == '.') {
if (ptr[1] == '[' && ptr[2] == 'a' && ptr[3] == 'm' && ptr[4] == ']' &&
ptr[5] == '}') {
if (!vseq[len - 1].r2) return (const char *)0;
int resno = info.getResidueNumber();
info.setResidueNumber(resno + 1);
info.setIsHeteroAtom(true);
info.setResidueName("NH2");
Atom *n = CreateAAAtom(mol, " N ", info);
CreateAABond(mol, vseq[len - 1].r2, n, 1);
vseq[len - 1].r2 = (Atom *)0;
vseq.push_back(HELMMonomer());
len++;
return ptr + 5;
}
ptr++;
} else if (*ptr == '}') {
if (!vseq[len - 1].r2) return (const char *)0;
Atom *oxt = CreateAAAtom(mol, " OXT", info);
CreateAABond(mol, vseq[len - 1].r2, oxt, 1);
vseq[len - 1].oxt = oxt;
return ptr;
} else
return (const char *)0;
}
}
static const char *ParseHELMNucleic(RWMol *mol, const char *ptr,
const char *chain) {
if (ptr[0] == '}')
return ptr;
bool PCap5 = false;
Atom *prev = 0;
Atom *r1 = 0;
Atom *r2 = 0;
AtomPDBResidueInfo info;
info.setSerialNumber(1);
info.setAltLoc(" ");
info.setResidueNumber(0);
info.setInsertionCode(" ");
info.setChainId(chain);
if (*ptr=='P') {
PCap5 = true;
ptr++;
if (*ptr=='.')
ptr++;
}
for (;;) {
const char *name = 0;
if (*ptr == 'R' && ptr[1]=='(') {
if (ptr[2]=='A') {
if (ptr[3]==')') {
name = " A";
ptr += 4;
}
} else if (ptr[2]=='C') {
if (ptr[3]==')') {
name = " C";
ptr += 4;
}
} else if (ptr[2]=='G') {
if (ptr[3]==')') {
name = " G";
ptr += 4;
}
} else if (ptr[2]=='T') {
if (ptr[3]==')') {
name = " T";
ptr += 4;
}
} else if (ptr[2]=='U') {
if (ptr[3]==')') {
name = " U";
ptr += 4;
}
}
} else if (*ptr=='[' && ptr[1]=='d' && ptr[2]=='R' &&
ptr[3]==']' && ptr[4]=='(') {
if (ptr[5]=='A') {
if (ptr[6]==')') {
name = " DA";
ptr += 7;
}
} else if (ptr[5]=='C') {
if (ptr[6]==')') {
name = " DC";
ptr += 7;
}
} else if (ptr[5]=='G') {
if (ptr[6]==')') {
name = " DG";
ptr += 7;
}
} else if (ptr[5]=='T') {
if (ptr[6]==')') {
name = " DT";
ptr += 7;
}
} else if (ptr[5]=='U') {
if (ptr[6]==')') {
name = " DU";
ptr += 7;
}
}
}
if (!name)
return (const char *)0;
CreateNucleicAcid(mol,name,r1,r2,info,PCap5);
if (prev && r1)
CreateAABond(mol, prev, r1, 1);
prev = r2;
if (*ptr == '}')
return ptr;
if (*ptr == '.')
ptr++;
if (*ptr != 'P')
return (const char*)0;
ptr++;
if (*ptr == '}') {
CreatePCap3(mol,prev,info);
return ptr;
}
PCap5 = true;
if (*ptr == '.')
ptr++;
}
}
static bool ParseHELM(RWMol *mol, const char *ptr) {
std::map<std::string, std::vector<HELMMonomer> > seqs;
const char *orig;
char chain[2];
chain[0] = 'A';
chain[1] = '\0';
for (;;) {
orig = ptr;
if (ptr[0] == 'P' && ptr[1] == 'E' && ptr[2] == 'P' && ptr[3] == 'T' &&
ptr[4] == 'I' && ptr[5] == 'D' && ptr[6] == 'E' && ptr[7] >= '1' &&
ptr[7] <= '9') {
ptr += 8;
while (*ptr >= '0' && *ptr <= '9') ptr++;
if (*ptr != '{') return false;
std::string id(orig, ptr - orig);
chain[0] = 'A' + (orig[7] - '1');
ptr = ParseHELMPeptide(mol, ptr + 1, chain, seqs[id]);
if (!ptr || *ptr != '}') return false;
ptr++;
} else if (ptr[0]=='R' && ptr[1]=='N' && ptr[2]=='A' &&
ptr[3]>='1' && ptr[3]<='9') {
ptr += 4;
while (*ptr >= '0' && *ptr <= '9') ptr++;
if (*ptr != '{') return false;
chain[0] = 'A' + (orig[3] - '1');
ptr = ParseHELMNucleic(mol, ptr + 1, chain);
if (!ptr || *ptr != '}') return false;
ptr++;
} else
return false;
if (*ptr == '$') break;
if (*ptr == '\0') return true;
if (*ptr != '|') return false;
ptr++;
}
ptr++;
if (ptr[0] == '$' && ptr[1] == '$' && ptr[2] == '$') return true;
for (;;) {
orig = ptr;
if (ptr[0] == 'P' && ptr[1] == 'E' && ptr[2] == 'P' && ptr[3] == 'T' &&
ptr[4] == 'I' && ptr[5] == 'D' && ptr[6] == 'E' && ptr[7] >= '1' &&
ptr[7] <= '9') {
ptr += 8;
} else
return false;
while (*ptr >= '0' && *ptr <= '9') ptr++;
if (*ptr != ',') return false;
std::string id1(orig, ptr - orig);
ptr++;
orig = ptr;
if (ptr[0] == 'P' && ptr[1] == 'E' && ptr[2] == 'P' && ptr[3] == 'T' &&
ptr[4] == 'I' && ptr[5] == 'D' && ptr[6] == 'E' && ptr[7] >= '1' &&
ptr[7] <= '9') {
ptr += 8;
} else
return false;
while (*ptr >= '0' && *ptr <= '9') ptr++;
if (*ptr != ',') return false;
std::string id2(orig, ptr - orig);
ptr++;
unsigned int res1;
unsigned int res2;
unsigned int res1r;
unsigned int res2r;
if (*ptr >= '1' && *ptr <= '9') {
res1 = (*ptr++) - '0';
while (*ptr >= '0' && *ptr <= '9') res1 = 10 * res1 + ((*ptr++) - '0');
} else
return false;
if (ptr[0] == ':' && ptr[1] == 'R' && ptr[2] >= '1' && ptr[2] <= '9') {
res1r = ptr[2] - '0';
ptr += 3;
} else
return false;
if (*ptr != '-') return false;
ptr++;
if (*ptr >= '1' && *ptr <= '9') {
res2 = (*ptr++) - '0';
while (*ptr >= '0' && *ptr <= '9') res2 = 10 * res2 + ((*ptr++) - '0');
} else
return false;
if (ptr[0] == ':' && ptr[1] == 'R' && ptr[2] >= '1' && ptr[2] <= '9') {
res2r = ptr[2] - '0';
ptr += 3;
} else
return false;
// printf("%s:%u:R%u - %s:%u:R%u\n",id1.c_str(),res1,res1r,
// id2.c_str(),res2,res2r);
if (res1 < 1 || res2 < 1) return false;
if (seqs.find(id1) == seqs.end() || seqs.find(id2) == seqs.end())
return false;
std::vector<HELMMonomer> *vseq1 = &seqs[id1];
if (res1 > (unsigned int)vseq1->size()) return false;
std::vector<HELMMonomer> *vseq2 = &seqs[id2];
if (res2 > (unsigned int)vseq2->size()) return false;
if (res1r == 3 && res2r == 3) {
Atom *src = (*vseq1)[res1 - 1].r3;
Atom *dst = (*vseq2)[res2 - 1].r3;
if (src && dst && src != dst) {
CreateAABond(mol, src, dst, 1);
(*vseq1)[res1 - 1].r3 = (Atom *)0;
(*vseq2)[res2 - 1].r3 = (Atom *)0;
} else
return false;
} else if (res1r == 1 && res2r == 2) {
Atom *src = (*vseq1)[res1 - 1].r1;
Atom *dst = (*vseq2)[res2 - 1].r2;
Atom *oxt = (*vseq2)[res2 - 1].oxt;
if (src && dst && oxt && src != dst) {
mol->removeAtom(oxt);
CreateAABond(mol, src, dst, 1);
(*vseq1)[res1 - 1].r1 = (Atom *)0;
(*vseq2)[res2 - 1].r2 = (Atom *)0;
} else
return false;
} else if (res1r == 2 && res2r == 1) {
Atom *src = (*vseq2)[res2 - 1].r1;
Atom *dst = (*vseq1)[res1 - 1].r2;
Atom *oxt = (*vseq1)[res1 - 1].oxt;
if (src && dst && oxt && src != dst) {
mol->removeAtom(oxt);
CreateAABond(mol, dst, src, 1);
(*vseq1)[res1 - 1].r2 = (Atom *)0;
(*vseq2)[res2 - 1].r1 = (Atom *)0;
} else
return false;
} else
return false;
if (*ptr == '$') break;
if (*ptr != '|') return false;
ptr++;
}
ptr++;
return ptr[0] == '$' && ptr[1] == '$';
}
RWMol *HELMToMol(const char *helm, bool sanitize) {
RWMol *mol = new RWMol();
const char *ptr = helm;
if (ptr[0] == '$' && ptr[1] == '$' && ptr[2] == '$' && ptr[3] == '$')
return mol;
if (ParseHELM(mol, ptr)) {
if (sanitize) MolOps::sanitizeMol(*mol);
return mol;
}
delete mol;
return (RWMol *)0;
}
RWMol *HELMToMol(const std::string &helm, bool sanitize) {
return HELMToMol(helm.c_str(), sanitize);
}
} // namespace RDKit