Files
rdkit/Code/GraphMol/FileParsers/SequenceWriters.cpp
2015-11-14 14:58:11 +01:00

379 lines
12 KiB
C++

//
// Copyright (C) 2015 Greg Landrum and NextMove Software
//
// @@ All Rights Reserved @@
// This file is part of the RDKit.
// The contents are covered by the terms of the BSD license
// which is included in the file license.txt, found at the root
// of the RDKit source tree.
//
#include <string.h>
#include <stdio.h>
#include <string>
#include <GraphMol/RDKitBase.h>
#include <GraphMol/GraphMol.h>
#include <GraphMol/MonomerInfo.h>
namespace RDKit {
static char getOneLetterCode(const AtomPDBResidueInfo *info) {
const char *ptr = info->getResidueName().c_str();
switch (ptr[0]) {
case 'A':
if (!strcmp(ptr, "ALA")) return 'A';
if (!strcmp(ptr, "ARG")) return 'R';
if (!strcmp(ptr, "ASN")) return 'N';
if (!strcmp(ptr, "ASP")) return 'D';
break;
case 'C':
if (!strcmp(ptr, "CYS")) return 'C';
break;
case 'D':
if (!strcmp(ptr, "DAL")) return 'a';
if (!strcmp(ptr, "DAR")) return 'r';
if (!strcmp(ptr, "DAS")) return 'd';
if (!strcmp(ptr, "DCY")) return 'c';
if (!strcmp(ptr, "DGL")) return 'e';
if (!strcmp(ptr, "DGN")) return 'q';
if (!strcmp(ptr, "DHI")) return 'h';
if (!strcmp(ptr, "DIL")) return 'i';
if (!strcmp(ptr, "DLE")) return 'l';
if (!strcmp(ptr, "DLY")) return 'k';
if (!strcmp(ptr, "DPN")) return 'f';
if (!strcmp(ptr, "DPR")) return 'p';
if (!strcmp(ptr, "DSG")) return 'n';
if (!strcmp(ptr, "DSN")) return 's';
if (!strcmp(ptr, "DTH")) return 't';
if (!strcmp(ptr, "DTR")) return 'w';
if (!strcmp(ptr, "DTY")) return 'y';
if (!strcmp(ptr, "DVA")) return 'v';
break;
case 'G':
if (!strcmp(ptr, "GLU")) return 'E';
if (!strcmp(ptr, "GLN")) return 'Q';
if (!strcmp(ptr, "GLY")) return 'G';
break;
case 'H':
if (!strcmp(ptr, "HIS")) return 'H';
break;
case 'I':
if (!strcmp(ptr, "ILE")) return 'I';
break;
case 'L':
if (!strcmp(ptr, "LEU")) return 'L';
if (!strcmp(ptr, "LYS")) return 'K';
break;
case 'M':
if (!strcmp(ptr, "MET")) return 'M';
if (!strcmp(ptr, "MED")) return 'm';
break;
case 'P':
if (!strcmp(ptr, "PHE")) return 'F';
if (!strcmp(ptr, "PRO")) return 'P';
break;
case 'S':
if (!strcmp(ptr, "SER")) return 'S';
break;
case 'T':
if (!strcmp(ptr, "THR")) return 'T';
if (!strcmp(ptr, "TRP")) return 'W';
if (!strcmp(ptr, "TYR")) return 'Y';
break;
case 'V':
if (!strcmp(ptr, "VAL")) return 'V';
break;
}
return 'X';
}
std::string MolToSequence(const ROMol &mol) {
std::string result;
for (ROMol::ConstAtomIterator atomIt = mol.beginAtoms();
atomIt != mol.endAtoms(); ++atomIt) {
const Atom *atom = *atomIt;
AtomPDBResidueInfo *info = (AtomPDBResidueInfo *)(atom->getMonomerInfo());
if (info && info->getMonomerType() == AtomMonomerInfo::PDBRESIDUE &&
info->getName() == " CA ") {
result += getOneLetterCode(info);
}
}
return result;
}
std::string MolToFASTA(const ROMol &mol) {
std::string seq = MolToSequence(mol);
if (seq.empty()) return "";
std::string result = ">";
std::string name;
if (mol.getPropIfPresent(common_properties::_Name, name)) result += name;
result += '\n';
result += seq;
result += '\n';
return result;
}
static const char *getHELMMonomer(const AtomPDBResidueInfo *info) {
const char *ptr = info->getResidueName().c_str();
switch (ptr[0]) {
case 'A':
if (!strcmp(ptr, "ALA")) return "A";
if (!strcmp(ptr, "ARG")) return "R";
if (!strcmp(ptr, "ASN")) return "N";
if (!strcmp(ptr, "ASP")) return "D";
break;
case 'C':
if (!strcmp(ptr, "CYS")) return "C";
break;
case 'D':
if (!strcmp(ptr, "DAL")) return "[dA]";
if (!strcmp(ptr, "DAR")) return "[dR]";
if (!strcmp(ptr, "DAS")) return "[dD]";
if (!strcmp(ptr, "DCY")) return "[dC]";
if (!strcmp(ptr, "DGL")) return "[dE]";
if (!strcmp(ptr, "DGN")) return "[dQ]";
if (!strcmp(ptr, "DHI")) return "[dH]";
if (!strcmp(ptr, "DLE")) return "[dL]";
if (!strcmp(ptr, "DLY")) return "[dK]";
if (!strcmp(ptr, "DPN")) return "[dF]";
if (!strcmp(ptr, "DPR")) return "[dP]";
if (!strcmp(ptr, "DSG")) return "[dN]";
if (!strcmp(ptr, "DSN")) return "[dS]";
if (!strcmp(ptr, "DTR")) return "[dW]";
if (!strcmp(ptr, "DTY")) return "[dY]";
if (!strcmp(ptr, "DVA")) return "[dV]";
break;
case 'G':
if (!strcmp(ptr, "GLU")) return "E";
if (!strcmp(ptr, "GLN")) return "Q";
if (!strcmp(ptr, "GLY")) return "G";
break;
case 'H':
if (!strcmp(ptr, "HIS")) return "H";
break;
case 'I':
if (!strcmp(ptr, "ILE")) return "I";
break;
case 'L':
if (!strcmp(ptr, "LEU")) return "L";
if (!strcmp(ptr, "LYS")) return "K";
break;
case 'M':
if (!strcmp(ptr, "MET")) return "M";
if (!strcmp(ptr, "MED")) return "[dM]";
if (!strcmp(ptr, "MSE")) return "[seC]";
break;
case 'N':
if (!strcmp(ptr, "NLE")) return "[Nle]";
if (!strcmp(ptr, "NVA")) return "[Nva]";
break;
case 'O':
if (!strcmp(ptr, "ORN")) return "[Orn]";
break;
case 'P':
if (!strcmp(ptr, "PHE")) return "F";
if (!strcmp(ptr, "PRO")) return "P";
break;
case 'S':
if (!strcmp(ptr, "SER")) return "S";
break;
case 'T':
if (!strcmp(ptr, "THR")) return "T";
if (!strcmp(ptr, "TRP")) return "W";
if (!strcmp(ptr, "TYR")) return "Y";
break;
case 'V':
if (!strcmp(ptr, "VAL")) return "V";
break;
}
return (const char *)0;
}
static bool IsEupeptideBond(AtomPDBResidueInfo *src, AtomPDBResidueInfo *dst) {
if (src->getChainId() != dst->getChainId()) return false;
int sresno = src->getResidueNumber();
int dresno = dst->getResidueNumber();
if (src->getName() == " C " && dst->getName() == " N ") {
if (sresno != dresno) return dresno > sresno;
return dst->getInsertionCode() > src->getInsertionCode();
}
if (src->getName() == " N " && dst->getName() == " C ") {
if (sresno != dresno) return dresno < sresno;
return dst->getInsertionCode() < src->getInsertionCode();
}
return false;
}
static bool IsSupportedHELMBond(AtomPDBResidueInfo *src,
AtomPDBResidueInfo *dst) {
if (src->getName() == " SG " && dst->getName() == " SG ") {
if ((src->getResidueName() == "CYS" || src->getResidueName() == "DCY") &&
(dst->getResidueName() == "CYS" || dst->getResidueName() == "DCY"))
return true;
}
if (src->getName() == " N " && dst->getName() == " C ") return true;
if (src->getName() == " C " && dst->getName() == " N ") return true;
#if 0
printf("%s%d%s.%s - %s%d%s.%s\n",
src->getResidueName().c_str(),src->getResidueNumber(),
src->getChainId().c_str(),src->getName().c_str(),
dst->getResidueName().c_str(),dst->getResidueNumber(),
dst->getChainId().c_str(),dst->getName().c_str());
#endif
return false;
}
static bool FindHELMAtom(std::vector<AtomPDBResidueInfo *> *seq,
AtomPDBResidueInfo *info, std::string &id,
std::string &pos) {
char buffer[32];
char ch;
const char *ptr = info->getName().c_str();
if (ptr[0] == ' ' && ptr[1] == 'S' && ptr[2] == 'G' && ptr[3] == ' ')
ch = '3';
else if (ptr[0] == ' ' && ptr[1] == 'N' && ptr[2] == ' ' && ptr[3] == ' ')
ch = '1';
else if (ptr[0] == ' ' && ptr[1] == 'C' && ptr[2] == ' ' && ptr[3] == ' ')
ch = '2';
else
return false;
int resno = info->getResidueNumber();
for (unsigned int i = 1; i < 10; i++) {
unsigned int len = (unsigned int)seq[i].size();
for (unsigned int j = 0; j < len; j++) {
AtomPDBResidueInfo *targ = seq[i][j];
if (targ->getResidueNumber() == resno &&
targ->getResidueName() == info->getResidueName() &&
targ->getChainId() == info->getChainId() &&
targ->getInsertionCode() == info->getInsertionCode()) {
id = "PEPTIDE";
id += (char)(i + '0');
sprintf(buffer, "%u:R%c", j + 1, ch);
pos = buffer;
return true;
}
}
}
return false;
}
static std::string NameHELMBond(std::vector<AtomPDBResidueInfo *> *seq,
AtomPDBResidueInfo *src,
AtomPDBResidueInfo *dst) {
std::string id1, pos1;
if (!FindHELMAtom(seq, src, id1, pos1)) return "";
std::string id2, pos2;
if (!FindHELMAtom(seq, dst, id2, pos2)) return "";
std::string result = id1;
result += ',';
result += id2;
result += ',';
result += pos1;
result += '-';
result += pos2;
return result;
}
std::string MolToHELM(const ROMol &mol) {
std::vector<AtomPDBResidueInfo *> seq[10];
std::string result;
bool first = true;
std::string chain;
int id = 1;
/* First pass: Monomers */
for (ROMol::ConstAtomIterator atomIt = mol.beginAtoms();
atomIt != mol.endAtoms(); ++atomIt) {
const Atom *atom = *atomIt;
AtomPDBResidueInfo *info = (AtomPDBResidueInfo *)(atom->getMonomerInfo());
// We can only write HELM if all atoms have PDB residue information
if (!info || info->getMonomerType() != AtomMonomerInfo::PDBRESIDUE)
return "";
if (info->getName() == " CA ") {
const char *mono = getHELMMonomer(info);
if (!mono) return "";
if (first) {
chain = info->getChainId();
result = "PEPTIDE1{";
first = false;
} else if (info->getChainId() != chain) {
// Nine chains should be enough?
if (id == 9) return "";
id++;
chain = info->getChainId();
result += "}|PEPTIDE";
result += (char)(id + '0');
result += "{";
} else
result += ".";
result += mono;
seq[id].push_back(info);
} else if (info->getResidueName() == "NH2" && info->getName() == " N ") {
if (first) return "";
result += ".[am]";
} else if (info->getResidueName() == "ACE" && info->getName() == " C ") {
if (first) {
chain = info->getChainId();
result = "PEPTIDE1{[ac]";
first = false;
} else if (info->getChainId() != chain) {
// Nine chains should be enough?
if (id == 9) return "";
id++;
chain = info->getChainId();
result += "}|PEPTIDE";
result += (char)(id + '0');
result += "{[ac]";
} else
return "";
seq[id].push_back(info);
}
}
if (first) return "";
result += "}$";
first = true;
for (ROMol::ConstBondIterator bondIt = mol.beginBonds();
bondIt != mol.endBonds(); ++bondIt) {
const Bond *bond = *bondIt;
Atom *beg = bond->getBeginAtom();
Atom *end = bond->getEndAtom();
if (!beg || !end) continue;
AtomPDBResidueInfo *binfo = (AtomPDBResidueInfo *)(beg->getMonomerInfo());
AtomPDBResidueInfo *einfo = (AtomPDBResidueInfo *)(end->getMonomerInfo());
if (!binfo || !einfo) continue;
// Test if this is an uninteresting intra-residue bond
if (binfo->getResidueNumber() == einfo->getResidueNumber() &&
binfo->getResidueName() == einfo->getResidueName() &&
binfo->getChainId() == einfo->getChainId() &&
binfo->getInsertionCode() == einfo->getInsertionCode())
continue;
if (bond->getBondType() != Bond::SINGLE) return "";
if (IsEupeptideBond(binfo, einfo)) continue;
if (!IsSupportedHELMBond(binfo, einfo)) return "";
std::string tmp = NameHELMBond(seq, binfo, einfo);
if (tmp.empty()) return "";
if (!first)
result += "|";
else
first = false;
result += tmp;
}
result += "$$$";
return result;
}
} // namespace RDKit