mirror of
https://github.com/rdkit/rdkit.git
synced 2026-06-05 22:04:27 +08:00
501 lines
12 KiB
C++
501 lines
12 KiB
C++
//
|
|
// Copyright (C) 2015 Greg Landrum and NextMove Software
|
|
//
|
|
// @@ All Rights Reserved @@
|
|
// This file is part of the RDKit.
|
|
// The contents are covered by the terms of the BSD license
|
|
// which is included in the file license.txt, found at the root
|
|
// of the RDKit source tree.
|
|
//
|
|
#include <string.h>
|
|
#include <stdio.h>
|
|
#include <string>
|
|
|
|
#include <GraphMol/RDKitBase.h>
|
|
#include <GraphMol/GraphMol.h>
|
|
#include <GraphMol/MonomerInfo.h>
|
|
|
|
|
|
namespace RDKit {
|
|
|
|
|
|
static char getOneLetterCode(const AtomPDBResidueInfo *info)
|
|
{
|
|
const char *ptr = info->getResidueName().c_str();
|
|
switch (ptr[0]) {
|
|
case 'A':
|
|
if (!strcmp(ptr,"ALA"))
|
|
return 'A';
|
|
if (!strcmp(ptr,"ARG"))
|
|
return 'R';
|
|
if (!strcmp(ptr,"ASN"))
|
|
return 'N';
|
|
if (!strcmp(ptr,"ASP"))
|
|
return 'D';
|
|
break;
|
|
case 'C':
|
|
if (!strcmp(ptr,"CYS"))
|
|
return 'C';
|
|
break;
|
|
case 'D':
|
|
if (!strcmp(ptr,"DAL"))
|
|
return 'a';
|
|
if (!strcmp(ptr,"DAR"))
|
|
return 'r';
|
|
if (!strcmp(ptr,"DAS"))
|
|
return 'd';
|
|
if (!strcmp(ptr,"DCY"))
|
|
return 'c';
|
|
if (!strcmp(ptr,"DGL"))
|
|
return 'e';
|
|
if (!strcmp(ptr,"DGN"))
|
|
return 'q';
|
|
if (!strcmp(ptr,"DHI"))
|
|
return 'h';
|
|
if (!strcmp(ptr,"DIL"))
|
|
return 'i';
|
|
if (!strcmp(ptr,"DLE"))
|
|
return 'l';
|
|
if (!strcmp(ptr,"DLY"))
|
|
return 'k';
|
|
if (!strcmp(ptr,"DPN"))
|
|
return 'f';
|
|
if (!strcmp(ptr,"DPR"))
|
|
return 'p';
|
|
if (!strcmp(ptr,"DSG"))
|
|
return 'n';
|
|
if (!strcmp(ptr,"DSN"))
|
|
return 's';
|
|
if (!strcmp(ptr,"DTH"))
|
|
return 't';
|
|
if (!strcmp(ptr,"DTR"))
|
|
return 'w';
|
|
if (!strcmp(ptr,"DTY"))
|
|
return 'y';
|
|
if (!strcmp(ptr,"DVA"))
|
|
return 'v';
|
|
break;
|
|
case 'G':
|
|
if (!strcmp(ptr,"GLU"))
|
|
return 'E';
|
|
if (!strcmp(ptr,"GLN"))
|
|
return 'Q';
|
|
if (!strcmp(ptr,"GLY"))
|
|
return 'G';
|
|
break;
|
|
case 'H':
|
|
if (!strcmp(ptr,"HIS"))
|
|
return 'H';
|
|
break;
|
|
case 'I':
|
|
if (!strcmp(ptr,"ILE"))
|
|
return 'I';
|
|
break;
|
|
case 'L':
|
|
if (!strcmp(ptr,"LEU"))
|
|
return 'L';
|
|
if (!strcmp(ptr,"LYS"))
|
|
return 'K';
|
|
break;
|
|
case 'M':
|
|
if (!strcmp(ptr,"MET"))
|
|
return 'M';
|
|
if (!strcmp(ptr,"MED"))
|
|
return 'm';
|
|
break;
|
|
case 'P':
|
|
if (!strcmp(ptr,"PHE"))
|
|
return 'F';
|
|
if (!strcmp(ptr,"PRO"))
|
|
return 'P';
|
|
break;
|
|
case 'S':
|
|
if (!strcmp(ptr,"SER"))
|
|
return 'S';
|
|
break;
|
|
case 'T':
|
|
if (!strcmp(ptr,"THR"))
|
|
return 'T';
|
|
if (!strcmp(ptr,"TRP"))
|
|
return 'W';
|
|
if (!strcmp(ptr,"TYR"))
|
|
return 'Y';
|
|
break;
|
|
case 'V':
|
|
if (!strcmp(ptr,"VAL"))
|
|
return 'V';
|
|
break;
|
|
}
|
|
return 'X';
|
|
}
|
|
|
|
|
|
std::string MolToSequence(const ROMol &mol)
|
|
{
|
|
std::string result;
|
|
|
|
for (ROMol::ConstAtomIterator atomIt=mol.beginAtoms();
|
|
atomIt!=mol.endAtoms();++atomIt){
|
|
const Atom *atom = *atomIt;
|
|
AtomPDBResidueInfo *info = (AtomPDBResidueInfo*)(atom->getMonomerInfo());
|
|
if (info && info->getMonomerType()==AtomMonomerInfo::PDBRESIDUE &&
|
|
info->getName() == " CA ") {
|
|
result += getOneLetterCode(info);
|
|
}
|
|
}
|
|
return result;
|
|
}
|
|
|
|
|
|
std::string MolToFASTA(const ROMol &mol)
|
|
{
|
|
std::string seq = MolToSequence(mol);
|
|
if (seq.empty())
|
|
return "";
|
|
|
|
std::string result = ">";
|
|
std::string name;
|
|
if (mol.getPropIfPresent(common_properties::_Name,name))
|
|
result += name;
|
|
result += '\n';
|
|
result += seq;
|
|
result += '\n';
|
|
return result;
|
|
}
|
|
|
|
|
|
static const char *getHELMMonomer(const AtomPDBResidueInfo *info)
|
|
{
|
|
const char *ptr = info->getResidueName().c_str();
|
|
switch (ptr[0]) {
|
|
case 'A':
|
|
if (!strcmp(ptr,"ALA"))
|
|
return "A";
|
|
if (!strcmp(ptr,"ARG"))
|
|
return "R";
|
|
if (!strcmp(ptr,"ASN"))
|
|
return "N";
|
|
if (!strcmp(ptr,"ASP"))
|
|
return "D";
|
|
break;
|
|
case 'C':
|
|
if (!strcmp(ptr,"CYS"))
|
|
return "C";
|
|
break;
|
|
case 'D':
|
|
if (!strcmp(ptr,"DAL"))
|
|
return "[dA]";
|
|
if (!strcmp(ptr,"DAR"))
|
|
return "[dR]";
|
|
if (!strcmp(ptr,"DAS"))
|
|
return "[dD]";
|
|
if (!strcmp(ptr,"DCY"))
|
|
return "[dC]";
|
|
if (!strcmp(ptr,"DGL"))
|
|
return "[dE]";
|
|
if (!strcmp(ptr,"DGN"))
|
|
return "[dQ]";
|
|
if (!strcmp(ptr,"DHI"))
|
|
return "[dH]";
|
|
if (!strcmp(ptr,"DLE"))
|
|
return "[dL]";
|
|
if (!strcmp(ptr,"DLY"))
|
|
return "[dK]";
|
|
if (!strcmp(ptr,"DPN"))
|
|
return "[dF]";
|
|
if (!strcmp(ptr,"DPR"))
|
|
return "[dP]";
|
|
if (!strcmp(ptr,"DSG"))
|
|
return "[dN]";
|
|
if (!strcmp(ptr,"DSN"))
|
|
return "[dS]";
|
|
if (!strcmp(ptr,"DTR"))
|
|
return "[dW]";
|
|
if (!strcmp(ptr,"DTY"))
|
|
return "[dY]";
|
|
if (!strcmp(ptr,"DVA"))
|
|
return "[dV]";
|
|
break;
|
|
case 'G':
|
|
if (!strcmp(ptr,"GLU"))
|
|
return "E";
|
|
if (!strcmp(ptr,"GLN"))
|
|
return "Q";
|
|
if (!strcmp(ptr,"GLY"))
|
|
return "G";
|
|
break;
|
|
case 'H':
|
|
if (!strcmp(ptr,"HIS"))
|
|
return "H";
|
|
break;
|
|
case 'I':
|
|
if (!strcmp(ptr,"ILE"))
|
|
return "I";
|
|
break;
|
|
case 'L':
|
|
if (!strcmp(ptr,"LEU"))
|
|
return "L";
|
|
if (!strcmp(ptr,"LYS"))
|
|
return "K";
|
|
break;
|
|
case 'M':
|
|
if (!strcmp(ptr,"MET"))
|
|
return "M";
|
|
if (!strcmp(ptr,"MED"))
|
|
return "[dM]";
|
|
if (!strcmp(ptr,"MSE"))
|
|
return "[seC]";
|
|
break;
|
|
case 'N':
|
|
if (!strcmp(ptr,"NLE"))
|
|
return "[Nle]";
|
|
if (!strcmp(ptr,"NVA"))
|
|
return "[Nva]";
|
|
break;
|
|
case 'O':
|
|
if (!strcmp(ptr,"ORN"))
|
|
return "[Orn]";
|
|
break;
|
|
case 'P':
|
|
if (!strcmp(ptr,"PHE"))
|
|
return "F";
|
|
if (!strcmp(ptr,"PRO"))
|
|
return "P";
|
|
break;
|
|
case 'S':
|
|
if (!strcmp(ptr,"SER"))
|
|
return "S";
|
|
break;
|
|
case 'T':
|
|
if (!strcmp(ptr,"THR"))
|
|
return "T";
|
|
if (!strcmp(ptr,"TRP"))
|
|
return "W";
|
|
if (!strcmp(ptr,"TYR"))
|
|
return "Y";
|
|
break;
|
|
case 'V':
|
|
if (!strcmp(ptr,"VAL"))
|
|
return "V";
|
|
break;
|
|
}
|
|
return (const char*)0;
|
|
}
|
|
|
|
|
|
static bool IsEupeptideBond(AtomPDBResidueInfo *src,
|
|
AtomPDBResidueInfo *dst)
|
|
{
|
|
if (src->getChainId() != dst->getChainId())
|
|
return false;
|
|
|
|
int sresno = src->getResidueNumber();
|
|
int dresno = dst->getResidueNumber();
|
|
|
|
if (src->getName() == " C " && dst->getName() == " N ") {
|
|
if (sresno != dresno)
|
|
return dresno > sresno;
|
|
return dst->getInsertionCode() > src->getInsertionCode();
|
|
}
|
|
if (src->getName() == " N " && dst->getName() == " C ") {
|
|
if (sresno != dresno)
|
|
return dresno < sresno;
|
|
return dst->getInsertionCode() < src->getInsertionCode();
|
|
}
|
|
return false;
|
|
}
|
|
|
|
|
|
static bool IsSupportedHELMBond(AtomPDBResidueInfo *src,
|
|
AtomPDBResidueInfo *dst)
|
|
{
|
|
if (src->getName()==" SG " && dst->getName()==" SG ") {
|
|
if ((src->getResidueName()=="CYS" ||
|
|
src->getResidueName()=="DCY") &&
|
|
(dst->getResidueName()=="CYS" ||
|
|
dst->getResidueName()=="DCY"))
|
|
return true;
|
|
}
|
|
|
|
if (src->getName()==" N " && dst->getName()==" C ")
|
|
return true;
|
|
if (src->getName()==" C " && dst->getName()==" N ")
|
|
return true;
|
|
#if 0
|
|
printf("%s%d%s.%s - %s%d%s.%s\n",
|
|
src->getResidueName().c_str(),src->getResidueNumber(),
|
|
src->getChainId().c_str(),src->getName().c_str(),
|
|
dst->getResidueName().c_str(),dst->getResidueNumber(),
|
|
dst->getChainId().c_str(),dst->getName().c_str());
|
|
#endif
|
|
return false;
|
|
}
|
|
|
|
|
|
static bool FindHELMAtom(std::vector<AtomPDBResidueInfo*> *seq,
|
|
AtomPDBResidueInfo *info,
|
|
std::string &id, std::string &pos)
|
|
{
|
|
char buffer[32];
|
|
char ch;
|
|
|
|
const char *ptr = info->getName().c_str();
|
|
if (ptr[0]==' ' && ptr[1]=='S' && ptr[2]=='G' && ptr[3]==' ')
|
|
ch = '3';
|
|
else if (ptr[0]==' ' && ptr[1]=='N' && ptr[2]==' ' && ptr[3]==' ')
|
|
ch = '1';
|
|
else if (ptr[0]==' ' && ptr[1]=='C' && ptr[2]==' ' && ptr[3]==' ')
|
|
ch = '2';
|
|
else return false;
|
|
|
|
int resno = info->getResidueNumber();
|
|
for (unsigned int i=1; i<10; i++) {
|
|
unsigned int len = (unsigned int)seq[i].size();
|
|
for (unsigned int j=0; j<len; j++) {
|
|
AtomPDBResidueInfo *targ = seq[i][j];
|
|
if (targ->getResidueNumber() == resno &&
|
|
targ->getResidueName() == info->getResidueName() &&
|
|
targ->getChainId() == info->getChainId() &&
|
|
targ->getInsertionCode() == info->getInsertionCode()) {
|
|
id = "PEPTIDE";
|
|
id += (char)(i+'0');
|
|
sprintf(buffer,"%u:R%c",j+1,ch);
|
|
pos = buffer;
|
|
return true;
|
|
}
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
|
|
|
|
static std::string NameHELMBond(std::vector<AtomPDBResidueInfo*> *seq,
|
|
AtomPDBResidueInfo *src,
|
|
AtomPDBResidueInfo *dst)
|
|
{
|
|
std::string id1,pos1;
|
|
if (!FindHELMAtom(seq,src,id1,pos1))
|
|
return "";
|
|
std::string id2,pos2;
|
|
if (!FindHELMAtom(seq,dst,id2,pos2))
|
|
return "";
|
|
|
|
std::string result = id1;
|
|
result += ',';
|
|
result += id2;
|
|
result += ',';
|
|
result += pos1;
|
|
result += '-';
|
|
result += pos2;
|
|
return result;
|
|
}
|
|
|
|
|
|
std::string MolToHELM(const ROMol &mol)
|
|
{
|
|
std::vector<AtomPDBResidueInfo*> seq[10];
|
|
std::string result;
|
|
bool first = true;
|
|
std::string chain;
|
|
int id = 1;
|
|
|
|
/* First pass: Monomers */
|
|
for (ROMol::ConstAtomIterator atomIt=mol.beginAtoms();
|
|
atomIt!=mol.endAtoms();++atomIt){
|
|
const Atom *atom = *atomIt;
|
|
AtomPDBResidueInfo *info = (AtomPDBResidueInfo*)(atom->getMonomerInfo());
|
|
// We can only write HELM if all atoms have PDB residue information
|
|
if (!info || info->getMonomerType()!=AtomMonomerInfo::PDBRESIDUE)
|
|
return "";
|
|
|
|
if (info->getName() == " CA ") {
|
|
const char *mono = getHELMMonomer(info);
|
|
if (!mono)
|
|
return "";
|
|
if (first) {
|
|
chain = info->getChainId();
|
|
result = "PEPTIDE1{";
|
|
first = false;
|
|
} else if (info->getChainId() != chain) {
|
|
// Nine chains should be enough?
|
|
if (id == 9)
|
|
return "";
|
|
id++;
|
|
chain = info->getChainId();
|
|
result += "}|PEPTIDE";
|
|
result += (char)(id+'0');
|
|
result += "{";
|
|
} else result += ".";
|
|
result += mono;
|
|
seq[id].push_back(info);
|
|
} else if (info->getResidueName() == "NH2" &&
|
|
info->getName() == " N ") {
|
|
if (first)
|
|
return "";
|
|
result += ".[am]";
|
|
} else if (info->getResidueName() == "ACE" &&
|
|
info->getName() == " C ") {
|
|
if (first) {
|
|
chain = info->getChainId();
|
|
result = "PEPTIDE1{[ac]";
|
|
first = false;
|
|
} else if (info->getChainId() != chain) {
|
|
// Nine chains should be enough?
|
|
if (id == 9)
|
|
return "";
|
|
id++;
|
|
chain = info->getChainId();
|
|
result += "}|PEPTIDE";
|
|
result += (char)(id+'0');
|
|
result += "{[ac]";
|
|
} else return "";
|
|
seq[id].push_back(info);
|
|
}
|
|
}
|
|
|
|
if (first)
|
|
return "";
|
|
|
|
result += "}$";
|
|
|
|
first = true;
|
|
for (ROMol::ConstBondIterator bondIt=mol.beginBonds();
|
|
bondIt!=mol.endBonds(); ++bondIt){
|
|
const Bond *bond = *bondIt;
|
|
Atom *beg = bond->getBeginAtom();
|
|
Atom *end = bond->getEndAtom();
|
|
if (!beg || !end)
|
|
continue;
|
|
AtomPDBResidueInfo *binfo = (AtomPDBResidueInfo*)(beg->getMonomerInfo());
|
|
AtomPDBResidueInfo *einfo = (AtomPDBResidueInfo*)(end->getMonomerInfo());
|
|
if (!binfo || !einfo)
|
|
continue;
|
|
// Test if this is an uninteresting intra-residue bond
|
|
if (binfo->getResidueNumber() == einfo->getResidueNumber() &&
|
|
binfo->getResidueName() == einfo->getResidueName() &&
|
|
binfo->getChainId() == einfo->getChainId() &&
|
|
binfo->getInsertionCode() == einfo->getInsertionCode())
|
|
continue;
|
|
if (bond->getBondType() != Bond::SINGLE)
|
|
return "";
|
|
if (IsEupeptideBond(binfo,einfo))
|
|
continue;
|
|
if (!IsSupportedHELMBond(binfo,einfo))
|
|
return "";
|
|
std::string tmp = NameHELMBond(seq,binfo,einfo);
|
|
if (tmp.empty())
|
|
return "";
|
|
if (!first)
|
|
result += "|";
|
|
else first = false;
|
|
result += tmp;
|
|
}
|
|
|
|
result += "$$$";
|
|
return result;
|
|
}
|
|
|
|
|
|
} // namespace RDKit
|
|
|