Files
rdkit/Code/GraphMol/FileParsers/SequenceWriters.cpp
2015-10-12 08:43:41 -04:00

501 lines
12 KiB
C++

//
// Copyright (C) 2015 Greg Landrum and NextMove Software
//
// @@ All Rights Reserved @@
// This file is part of the RDKit.
// The contents are covered by the terms of the BSD license
// which is included in the file license.txt, found at the root
// of the RDKit source tree.
//
#include <string.h>
#include <stdio.h>
#include <string>
#include <GraphMol/RDKitBase.h>
#include <GraphMol/GraphMol.h>
#include <GraphMol/MonomerInfo.h>
namespace RDKit {
static char getOneLetterCode(const AtomPDBResidueInfo *info)
{
const char *ptr = info->getResidueName().c_str();
switch (ptr[0]) {
case 'A':
if (!strcmp(ptr,"ALA"))
return 'A';
if (!strcmp(ptr,"ARG"))
return 'R';
if (!strcmp(ptr,"ASN"))
return 'N';
if (!strcmp(ptr,"ASP"))
return 'D';
break;
case 'C':
if (!strcmp(ptr,"CYS"))
return 'C';
break;
case 'D':
if (!strcmp(ptr,"DAL"))
return 'a';
if (!strcmp(ptr,"DAR"))
return 'r';
if (!strcmp(ptr,"DAS"))
return 'd';
if (!strcmp(ptr,"DCY"))
return 'c';
if (!strcmp(ptr,"DGL"))
return 'e';
if (!strcmp(ptr,"DGN"))
return 'q';
if (!strcmp(ptr,"DHI"))
return 'h';
if (!strcmp(ptr,"DIL"))
return 'i';
if (!strcmp(ptr,"DLE"))
return 'l';
if (!strcmp(ptr,"DLY"))
return 'k';
if (!strcmp(ptr,"DPN"))
return 'f';
if (!strcmp(ptr,"DPR"))
return 'p';
if (!strcmp(ptr,"DSG"))
return 'n';
if (!strcmp(ptr,"DSN"))
return 's';
if (!strcmp(ptr,"DTH"))
return 't';
if (!strcmp(ptr,"DTR"))
return 'w';
if (!strcmp(ptr,"DTY"))
return 'y';
if (!strcmp(ptr,"DVA"))
return 'v';
break;
case 'G':
if (!strcmp(ptr,"GLU"))
return 'E';
if (!strcmp(ptr,"GLN"))
return 'Q';
if (!strcmp(ptr,"GLY"))
return 'G';
break;
case 'H':
if (!strcmp(ptr,"HIS"))
return 'H';
break;
case 'I':
if (!strcmp(ptr,"ILE"))
return 'I';
break;
case 'L':
if (!strcmp(ptr,"LEU"))
return 'L';
if (!strcmp(ptr,"LYS"))
return 'K';
break;
case 'M':
if (!strcmp(ptr,"MET"))
return 'M';
if (!strcmp(ptr,"MED"))
return 'm';
break;
case 'P':
if (!strcmp(ptr,"PHE"))
return 'F';
if (!strcmp(ptr,"PRO"))
return 'P';
break;
case 'S':
if (!strcmp(ptr,"SER"))
return 'S';
break;
case 'T':
if (!strcmp(ptr,"THR"))
return 'T';
if (!strcmp(ptr,"TRP"))
return 'W';
if (!strcmp(ptr,"TYR"))
return 'Y';
break;
case 'V':
if (!strcmp(ptr,"VAL"))
return 'V';
break;
}
return 'X';
}
std::string MolToSequence(const ROMol &mol)
{
std::string result;
for (ROMol::ConstAtomIterator atomIt=mol.beginAtoms();
atomIt!=mol.endAtoms();++atomIt){
const Atom *atom = *atomIt;
AtomPDBResidueInfo *info = (AtomPDBResidueInfo*)(atom->getMonomerInfo());
if (info && info->getMonomerType()==AtomMonomerInfo::PDBRESIDUE &&
info->getName() == " CA ") {
result += getOneLetterCode(info);
}
}
return result;
}
std::string MolToFASTA(const ROMol &mol)
{
std::string seq = MolToSequence(mol);
if (seq.empty())
return "";
std::string result = ">";
std::string name;
if (mol.getPropIfPresent(common_properties::_Name,name))
result += name;
result += '\n';
result += seq;
result += '\n';
return result;
}
static const char *getHELMMonomer(const AtomPDBResidueInfo *info)
{
const char *ptr = info->getResidueName().c_str();
switch (ptr[0]) {
case 'A':
if (!strcmp(ptr,"ALA"))
return "A";
if (!strcmp(ptr,"ARG"))
return "R";
if (!strcmp(ptr,"ASN"))
return "N";
if (!strcmp(ptr,"ASP"))
return "D";
break;
case 'C':
if (!strcmp(ptr,"CYS"))
return "C";
break;
case 'D':
if (!strcmp(ptr,"DAL"))
return "[dA]";
if (!strcmp(ptr,"DAR"))
return "[dR]";
if (!strcmp(ptr,"DAS"))
return "[dD]";
if (!strcmp(ptr,"DCY"))
return "[dC]";
if (!strcmp(ptr,"DGL"))
return "[dE]";
if (!strcmp(ptr,"DGN"))
return "[dQ]";
if (!strcmp(ptr,"DHI"))
return "[dH]";
if (!strcmp(ptr,"DLE"))
return "[dL]";
if (!strcmp(ptr,"DLY"))
return "[dK]";
if (!strcmp(ptr,"DPN"))
return "[dF]";
if (!strcmp(ptr,"DPR"))
return "[dP]";
if (!strcmp(ptr,"DSG"))
return "[dN]";
if (!strcmp(ptr,"DSN"))
return "[dS]";
if (!strcmp(ptr,"DTR"))
return "[dW]";
if (!strcmp(ptr,"DTY"))
return "[dY]";
if (!strcmp(ptr,"DVA"))
return "[dV]";
break;
case 'G':
if (!strcmp(ptr,"GLU"))
return "E";
if (!strcmp(ptr,"GLN"))
return "Q";
if (!strcmp(ptr,"GLY"))
return "G";
break;
case 'H':
if (!strcmp(ptr,"HIS"))
return "H";
break;
case 'I':
if (!strcmp(ptr,"ILE"))
return "I";
break;
case 'L':
if (!strcmp(ptr,"LEU"))
return "L";
if (!strcmp(ptr,"LYS"))
return "K";
break;
case 'M':
if (!strcmp(ptr,"MET"))
return "M";
if (!strcmp(ptr,"MED"))
return "[dM]";
if (!strcmp(ptr,"MSE"))
return "[seC]";
break;
case 'N':
if (!strcmp(ptr,"NLE"))
return "[Nle]";
if (!strcmp(ptr,"NVA"))
return "[Nva]";
break;
case 'O':
if (!strcmp(ptr,"ORN"))
return "[Orn]";
break;
case 'P':
if (!strcmp(ptr,"PHE"))
return "F";
if (!strcmp(ptr,"PRO"))
return "P";
break;
case 'S':
if (!strcmp(ptr,"SER"))
return "S";
break;
case 'T':
if (!strcmp(ptr,"THR"))
return "T";
if (!strcmp(ptr,"TRP"))
return "W";
if (!strcmp(ptr,"TYR"))
return "Y";
break;
case 'V':
if (!strcmp(ptr,"VAL"))
return "V";
break;
}
return (const char*)0;
}
static bool IsEupeptideBond(AtomPDBResidueInfo *src,
AtomPDBResidueInfo *dst)
{
if (src->getChainId() != dst->getChainId())
return false;
int sresno = src->getResidueNumber();
int dresno = dst->getResidueNumber();
if (src->getName() == " C " && dst->getName() == " N ") {
if (sresno != dresno)
return dresno > sresno;
return dst->getInsertionCode() > src->getInsertionCode();
}
if (src->getName() == " N " && dst->getName() == " C ") {
if (sresno != dresno)
return dresno < sresno;
return dst->getInsertionCode() < src->getInsertionCode();
}
return false;
}
static bool IsSupportedHELMBond(AtomPDBResidueInfo *src,
AtomPDBResidueInfo *dst)
{
if (src->getName()==" SG " && dst->getName()==" SG ") {
if ((src->getResidueName()=="CYS" ||
src->getResidueName()=="DCY") &&
(dst->getResidueName()=="CYS" ||
dst->getResidueName()=="DCY"))
return true;
}
if (src->getName()==" N " && dst->getName()==" C ")
return true;
if (src->getName()==" C " && dst->getName()==" N ")
return true;
#if 0
printf("%s%d%s.%s - %s%d%s.%s\n",
src->getResidueName().c_str(),src->getResidueNumber(),
src->getChainId().c_str(),src->getName().c_str(),
dst->getResidueName().c_str(),dst->getResidueNumber(),
dst->getChainId().c_str(),dst->getName().c_str());
#endif
return false;
}
static bool FindHELMAtom(std::vector<AtomPDBResidueInfo*> *seq,
AtomPDBResidueInfo *info,
std::string &id, std::string &pos)
{
char buffer[32];
char ch;
const char *ptr = info->getName().c_str();
if (ptr[0]==' ' && ptr[1]=='S' && ptr[2]=='G' && ptr[3]==' ')
ch = '3';
else if (ptr[0]==' ' && ptr[1]=='N' && ptr[2]==' ' && ptr[3]==' ')
ch = '1';
else if (ptr[0]==' ' && ptr[1]=='C' && ptr[2]==' ' && ptr[3]==' ')
ch = '2';
else return false;
int resno = info->getResidueNumber();
for (unsigned int i=1; i<10; i++) {
unsigned int len = (unsigned int)seq[i].size();
for (unsigned int j=0; j<len; j++) {
AtomPDBResidueInfo *targ = seq[i][j];
if (targ->getResidueNumber() == resno &&
targ->getResidueName() == info->getResidueName() &&
targ->getChainId() == info->getChainId() &&
targ->getInsertionCode() == info->getInsertionCode()) {
id = "PEPTIDE";
id += (char)(i+'0');
sprintf(buffer,"%u:R%c",j+1,ch);
pos = buffer;
return true;
}
}
}
return false;
}
static std::string NameHELMBond(std::vector<AtomPDBResidueInfo*> *seq,
AtomPDBResidueInfo *src,
AtomPDBResidueInfo *dst)
{
std::string id1,pos1;
if (!FindHELMAtom(seq,src,id1,pos1))
return "";
std::string id2,pos2;
if (!FindHELMAtom(seq,dst,id2,pos2))
return "";
std::string result = id1;
result += ',';
result += id2;
result += ',';
result += pos1;
result += '-';
result += pos2;
return result;
}
std::string MolToHELM(const ROMol &mol)
{
std::vector<AtomPDBResidueInfo*> seq[10];
std::string result;
bool first = true;
std::string chain;
int id = 1;
/* First pass: Monomers */
for (ROMol::ConstAtomIterator atomIt=mol.beginAtoms();
atomIt!=mol.endAtoms();++atomIt){
const Atom *atom = *atomIt;
AtomPDBResidueInfo *info = (AtomPDBResidueInfo*)(atom->getMonomerInfo());
// We can only write HELM if all atoms have PDB residue information
if (!info || info->getMonomerType()!=AtomMonomerInfo::PDBRESIDUE)
return "";
if (info->getName() == " CA ") {
const char *mono = getHELMMonomer(info);
if (!mono)
return "";
if (first) {
chain = info->getChainId();
result = "PEPTIDE1{";
first = false;
} else if (info->getChainId() != chain) {
// Nine chains should be enough?
if (id == 9)
return "";
id++;
chain = info->getChainId();
result += "}|PEPTIDE";
result += (char)(id+'0');
result += "{";
} else result += ".";
result += mono;
seq[id].push_back(info);
} else if (info->getResidueName() == "NH2" &&
info->getName() == " N ") {
if (first)
return "";
result += ".[am]";
} else if (info->getResidueName() == "ACE" &&
info->getName() == " C ") {
if (first) {
chain = info->getChainId();
result = "PEPTIDE1{[ac]";
first = false;
} else if (info->getChainId() != chain) {
// Nine chains should be enough?
if (id == 9)
return "";
id++;
chain = info->getChainId();
result += "}|PEPTIDE";
result += (char)(id+'0');
result += "{[ac]";
} else return "";
seq[id].push_back(info);
}
}
if (first)
return "";
result += "}$";
first = true;
for (ROMol::ConstBondIterator bondIt=mol.beginBonds();
bondIt!=mol.endBonds(); ++bondIt){
const Bond *bond = *bondIt;
Atom *beg = bond->getBeginAtom();
Atom *end = bond->getEndAtom();
if (!beg || !end)
continue;
AtomPDBResidueInfo *binfo = (AtomPDBResidueInfo*)(beg->getMonomerInfo());
AtomPDBResidueInfo *einfo = (AtomPDBResidueInfo*)(end->getMonomerInfo());
if (!binfo || !einfo)
continue;
// Test if this is an uninteresting intra-residue bond
if (binfo->getResidueNumber() == einfo->getResidueNumber() &&
binfo->getResidueName() == einfo->getResidueName() &&
binfo->getChainId() == einfo->getChainId() &&
binfo->getInsertionCode() == einfo->getInsertionCode())
continue;
if (bond->getBondType() != Bond::SINGLE)
return "";
if (IsEupeptideBond(binfo,einfo))
continue;
if (!IsSupportedHELMBond(binfo,einfo))
return "";
std::string tmp = NameHELMBond(seq,binfo,einfo);
if (tmp.empty())
return "";
if (!first)
result += "|";
else first = false;
result += tmp;
}
result += "$$$";
return result;
}
} // namespace RDKit