// // Copyright (C) 2015 Greg Landrum and NextMove Software // // @@ All Rights Reserved @@ // This file is part of the RDKit. // The contents are covered by the terms of the BSD license // which is included in the file license.txt, found at the root // of the RDKit source tree. // #include #include #include #include #include #include namespace RDKit { static char getOneLetterCode(const AtomPDBResidueInfo *info) { const char *ptr = info->getResidueName().c_str(); switch (ptr[0]) { case 'A': if (!strcmp(ptr, "ALA")) return 'A'; if (!strcmp(ptr, "ARG")) return 'R'; if (!strcmp(ptr, "ASN")) return 'N'; if (!strcmp(ptr, "ASP")) return 'D'; break; case 'C': if (!strcmp(ptr, "CYS")) return 'C'; break; case 'D': if (!strcmp(ptr, "DAL")) return 'a'; if (!strcmp(ptr, "DAR")) return 'r'; if (!strcmp(ptr, "DAS")) return 'd'; if (!strcmp(ptr, "DCY")) return 'c'; if (!strcmp(ptr, "DGL")) return 'e'; if (!strcmp(ptr, "DGN")) return 'q'; if (!strcmp(ptr, "DHI")) return 'h'; if (!strcmp(ptr, "DIL")) return 'i'; if (!strcmp(ptr, "DLE")) return 'l'; if (!strcmp(ptr, "DLY")) return 'k'; if (!strcmp(ptr, "DPN")) return 'f'; if (!strcmp(ptr, "DPR")) return 'p'; if (!strcmp(ptr, "DSG")) return 'n'; if (!strcmp(ptr, "DSN")) return 's'; if (!strcmp(ptr, "DTH")) return 't'; if (!strcmp(ptr, "DTR")) return 'w'; if (!strcmp(ptr, "DTY")) return 'y'; if (!strcmp(ptr, "DVA")) return 'v'; break; case 'G': if (!strcmp(ptr, "GLU")) return 'E'; if (!strcmp(ptr, "GLN")) return 'Q'; if (!strcmp(ptr, "GLY")) return 'G'; break; case 'H': if (!strcmp(ptr, "HIS")) return 'H'; break; case 'I': if (!strcmp(ptr, "ILE")) return 'I'; break; case 'L': if (!strcmp(ptr, "LEU")) return 'L'; if (!strcmp(ptr, "LYS")) return 'K'; break; case 'M': if (!strcmp(ptr, "MET")) return 'M'; if (!strcmp(ptr, "MED")) return 'm'; break; case 'P': if (!strcmp(ptr, "PHE")) return 'F'; if (!strcmp(ptr, "PRO")) return 'P'; break; case 'S': if (!strcmp(ptr, "SER")) return 'S'; break; case 'T': if (!strcmp(ptr, "THR")) return 'T'; if (!strcmp(ptr, "TRP")) return 'W'; if (!strcmp(ptr, "TYR")) return 'Y'; break; case 'V': if (!strcmp(ptr, "VAL")) return 'V'; break; } return 'X'; } std::string MolToSequence(const ROMol &mol) { std::string result; for (ROMol::ConstAtomIterator atomIt = mol.beginAtoms(); atomIt != mol.endAtoms(); ++atomIt) { const Atom *atom = *atomIt; AtomPDBResidueInfo *info = (AtomPDBResidueInfo *)(atom->getMonomerInfo()); if (info && info->getMonomerType() == AtomMonomerInfo::PDBRESIDUE && info->getName() == " CA ") { result += getOneLetterCode(info); } } return result; } std::string MolToFASTA(const ROMol &mol) { std::string seq = MolToSequence(mol); if (seq.empty()) return ""; std::string result = ">"; std::string name; if (mol.getPropIfPresent(common_properties::_Name, name)) result += name; result += '\n'; result += seq; result += '\n'; return result; } static const char *getHELMMonomer(const AtomPDBResidueInfo *info) { const char *ptr = info->getResidueName().c_str(); switch (ptr[0]) { case 'A': if (!strcmp(ptr, "ALA")) return "A"; if (!strcmp(ptr, "ARG")) return "R"; if (!strcmp(ptr, "ASN")) return "N"; if (!strcmp(ptr, "ASP")) return "D"; break; case 'C': if (!strcmp(ptr, "CYS")) return "C"; break; case 'D': if (!strcmp(ptr, "DAL")) return "[dA]"; if (!strcmp(ptr, "DAR")) return "[dR]"; if (!strcmp(ptr, "DAS")) return "[dD]"; if (!strcmp(ptr, "DCY")) return "[dC]"; if (!strcmp(ptr, "DGL")) return "[dE]"; if (!strcmp(ptr, "DGN")) return "[dQ]"; if (!strcmp(ptr, "DHI")) return "[dH]"; if (!strcmp(ptr, "DLE")) return "[dL]"; if (!strcmp(ptr, "DLY")) return "[dK]"; if (!strcmp(ptr, "DPN")) return "[dF]"; if (!strcmp(ptr, "DPR")) return "[dP]"; if (!strcmp(ptr, "DSG")) return "[dN]"; if (!strcmp(ptr, "DSN")) return "[dS]"; if (!strcmp(ptr, "DTR")) return "[dW]"; if (!strcmp(ptr, "DTY")) return "[dY]"; if (!strcmp(ptr, "DVA")) return "[dV]"; break; case 'G': if (!strcmp(ptr, "GLU")) return "E"; if (!strcmp(ptr, "GLN")) return "Q"; if (!strcmp(ptr, "GLY")) return "G"; break; case 'H': if (!strcmp(ptr, "HIS")) return "H"; break; case 'I': if (!strcmp(ptr, "ILE")) return "I"; break; case 'L': if (!strcmp(ptr, "LEU")) return "L"; if (!strcmp(ptr, "LYS")) return "K"; break; case 'M': if (!strcmp(ptr, "MET")) return "M"; if (!strcmp(ptr, "MED")) return "[dM]"; if (!strcmp(ptr, "MSE")) return "[seC]"; break; case 'N': if (!strcmp(ptr, "NLE")) return "[Nle]"; if (!strcmp(ptr, "NVA")) return "[Nva]"; break; case 'O': if (!strcmp(ptr, "ORN")) return "[Orn]"; break; case 'P': if (!strcmp(ptr, "PHE")) return "F"; if (!strcmp(ptr, "PRO")) return "P"; break; case 'S': if (!strcmp(ptr, "SER")) return "S"; break; case 'T': if (!strcmp(ptr, "THR")) return "T"; if (!strcmp(ptr, "TRP")) return "W"; if (!strcmp(ptr, "TYR")) return "Y"; break; case 'V': if (!strcmp(ptr, "VAL")) return "V"; break; } return (const char *)0; } static bool IsEupeptideBond(AtomPDBResidueInfo *src, AtomPDBResidueInfo *dst) { if (src->getChainId() != dst->getChainId()) return false; int sresno = src->getResidueNumber(); int dresno = dst->getResidueNumber(); if (src->getName() == " C " && dst->getName() == " N ") { if (sresno != dresno) return dresno > sresno; return dst->getInsertionCode() > src->getInsertionCode(); } if (src->getName() == " N " && dst->getName() == " C ") { if (sresno != dresno) return dresno < sresno; return dst->getInsertionCode() < src->getInsertionCode(); } return false; } static bool IsSupportedHELMBond(AtomPDBResidueInfo *src, AtomPDBResidueInfo *dst) { if (src->getName() == " SG " && dst->getName() == " SG ") { if ((src->getResidueName() == "CYS" || src->getResidueName() == "DCY") && (dst->getResidueName() == "CYS" || dst->getResidueName() == "DCY")) return true; } if (src->getName() == " N " && dst->getName() == " C ") return true; if (src->getName() == " C " && dst->getName() == " N ") return true; #if 0 printf("%s%d%s.%s - %s%d%s.%s\n", src->getResidueName().c_str(),src->getResidueNumber(), src->getChainId().c_str(),src->getName().c_str(), dst->getResidueName().c_str(),dst->getResidueNumber(), dst->getChainId().c_str(),dst->getName().c_str()); #endif return false; } static bool FindHELMAtom(std::vector *seq, AtomPDBResidueInfo *info, std::string &id, std::string &pos) { char buffer[32]; char ch; const char *ptr = info->getName().c_str(); if (ptr[0] == ' ' && ptr[1] == 'S' && ptr[2] == 'G' && ptr[3] == ' ') ch = '3'; else if (ptr[0] == ' ' && ptr[1] == 'N' && ptr[2] == ' ' && ptr[3] == ' ') ch = '1'; else if (ptr[0] == ' ' && ptr[1] == 'C' && ptr[2] == ' ' && ptr[3] == ' ') ch = '2'; else return false; int resno = info->getResidueNumber(); for (unsigned int i = 1; i < 10; i++) { unsigned int len = (unsigned int)seq[i].size(); for (unsigned int j = 0; j < len; j++) { AtomPDBResidueInfo *targ = seq[i][j]; if (targ->getResidueNumber() == resno && targ->getResidueName() == info->getResidueName() && targ->getChainId() == info->getChainId() && targ->getInsertionCode() == info->getInsertionCode()) { id = "PEPTIDE"; id += (char)(i + '0'); sprintf(buffer, "%u:R%c", j + 1, ch); pos = buffer; return true; } } } return false; } static std::string NameHELMBond(std::vector *seq, AtomPDBResidueInfo *src, AtomPDBResidueInfo *dst) { std::string id1, pos1; if (!FindHELMAtom(seq, src, id1, pos1)) return ""; std::string id2, pos2; if (!FindHELMAtom(seq, dst, id2, pos2)) return ""; std::string result = id1; result += ','; result += id2; result += ','; result += pos1; result += '-'; result += pos2; return result; } std::string MolToHELM(const ROMol &mol) { std::vector seq[10]; std::string result; bool first = true; std::string chain; int id = 1; /* First pass: Monomers */ for (ROMol::ConstAtomIterator atomIt = mol.beginAtoms(); atomIt != mol.endAtoms(); ++atomIt) { const Atom *atom = *atomIt; AtomPDBResidueInfo *info = (AtomPDBResidueInfo *)(atom->getMonomerInfo()); // We can only write HELM if all atoms have PDB residue information if (!info || info->getMonomerType() != AtomMonomerInfo::PDBRESIDUE) return ""; if (info->getName() == " CA ") { const char *mono = getHELMMonomer(info); if (!mono) return ""; if (first) { chain = info->getChainId(); result = "PEPTIDE1{"; first = false; } else if (info->getChainId() != chain) { // Nine chains should be enough? if (id == 9) return ""; id++; chain = info->getChainId(); result += "}|PEPTIDE"; result += (char)(id + '0'); result += "{"; } else result += "."; result += mono; seq[id].push_back(info); } else if (info->getResidueName() == "NH2" && info->getName() == " N ") { if (first) return ""; result += ".[am]"; } else if (info->getResidueName() == "ACE" && info->getName() == " C ") { if (first) { chain = info->getChainId(); result = "PEPTIDE1{[ac]"; first = false; } else if (info->getChainId() != chain) { // Nine chains should be enough? if (id == 9) return ""; id++; chain = info->getChainId(); result += "}|PEPTIDE"; result += (char)(id + '0'); result += "{[ac]"; } else return ""; seq[id].push_back(info); } } if (first) return ""; result += "}$"; first = true; for (ROMol::ConstBondIterator bondIt = mol.beginBonds(); bondIt != mol.endBonds(); ++bondIt) { const Bond *bond = *bondIt; Atom *beg = bond->getBeginAtom(); Atom *end = bond->getEndAtom(); if (!beg || !end) continue; AtomPDBResidueInfo *binfo = (AtomPDBResidueInfo *)(beg->getMonomerInfo()); AtomPDBResidueInfo *einfo = (AtomPDBResidueInfo *)(end->getMonomerInfo()); if (!binfo || !einfo) continue; // Test if this is an uninteresting intra-residue bond if (binfo->getResidueNumber() == einfo->getResidueNumber() && binfo->getResidueName() == einfo->getResidueName() && binfo->getChainId() == einfo->getChainId() && binfo->getInsertionCode() == einfo->getInsertionCode()) continue; if (bond->getBondType() != Bond::SINGLE) return ""; if (IsEupeptideBond(binfo, einfo)) continue; if (!IsSupportedHELMBond(binfo, einfo)) return ""; std::string tmp = NameHELMBond(seq, binfo, einfo); if (tmp.empty()) return ""; if (!first) result += "|"; else first = false; result += tmp; } result += "$$$"; return result; } } // namespace RDKit