Files
libcifpp/src/Compound.cpp
2021-11-25 16:24:55 +01:00

753 lines
19 KiB
C++

/*-
* SPDX-License-Identifier: BSD-2-Clause
*
* Copyright (c) 2020 NKI/AVL, Netherlands Cancer Institute
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <map>
#include <mutex>
#include <numeric>
#include <shared_mutex>
#include <boost/algorithm/string.hpp>
#include <filesystem>
#include <fstream>
#include "cif++/Cif++.hpp"
#include "cif++/CifParser.hpp"
#include "cif++/CifUtils.hpp"
#include "cif++/Compound.hpp"
#include "cif++/Point.hpp"
namespace ba = boost::algorithm;
namespace fs = std::filesystem;
namespace mmcif
{
// --------------------------------------------------------------------
std::string to_string(BondType bondType)
{
switch (bondType)
{
case BondType::sing: return "sing";
case BondType::doub: return "doub";
case BondType::trip: return "trip";
case BondType::quad: return "quad";
case BondType::arom: return "arom";
case BondType::poly: return "poly";
case BondType::delo: return "delo";
case BondType::pi: return "pi";
}
throw std::invalid_argument("Invalid bondType");
}
BondType from_string(const std::string &bondType)
{
if (cif::iequals(bondType, "sing"))
return BondType::sing;
if (cif::iequals(bondType, "doub"))
return BondType::doub;
if (cif::iequals(bondType, "trip"))
return BondType::trip;
if (cif::iequals(bondType, "quad"))
return BondType::quad;
if (cif::iequals(bondType, "arom"))
return BondType::arom;
if (cif::iequals(bondType, "poly"))
return BondType::poly;
if (cif::iequals(bondType, "delo"))
return BondType::delo;
if (cif::iequals(bondType, "pi"))
return BondType::pi;
throw std::invalid_argument("Invalid bondType: " + bondType);
}
// --------------------------------------------------------------------
// Compound helper classes
struct CompoundAtomLess
{
bool operator()(const CompoundAtom &a, const CompoundAtom &b) const
{
int d = a.id.compare(b.id);
if (d == 0)
d = a.typeSymbol - b.typeSymbol;
return d < 0;
}
};
struct CompoundBondLess
{
bool operator()(const CompoundBond &a, const CompoundBond &b) const
{
int d = a.atomID[0].compare(b.atomID[0]);
if (d == 0)
d = a.atomID[1].compare(b.atomID[1]);
if (d == 0)
d = static_cast<int>(a.type) - static_cast<int>(b.type);
return d < 0;
}
};
// --------------------------------------------------------------------
// Compound
Compound::Compound(cif::Datablock &db)
{
auto &chemComp = db["chem_comp"];
if (chemComp.size() != 1)
throw std::runtime_error("Invalid compound file, chem_comp should contain a single row");
cif::tie(mID, mName, mType, mFormula, mFormulaWeight, mFormalCharge) =
chemComp.front().get("id", "name", "type", "formula", "formula_weight", "pdbx_formal_charge");
// The name should not contain newline characters since that triggers validation errors later on
ba::replace_all(mName, "\n", "");
mGroup = "non-polymer";
auto &chemCompAtom = db["chem_comp_atom"];
for (auto row : chemCompAtom)
{
CompoundAtom atom;
std::string typeSymbol;
cif::tie(atom.id, typeSymbol, atom.charge, atom.aromatic, atom.leavingAtom, atom.stereoConfig, atom.x, atom.y, atom.z) =
row.get("atom_id", "type_symbol", "charge", "pdbx_aromatic_flag", "pdbx_leaving_atom_flag", "pdbx_stereo_config",
"model_Cartn_x", "model_Cartn_y", "model_Cartn_z");
atom.typeSymbol = AtomTypeTraits(typeSymbol).type();
mAtoms.push_back(std::move(atom));
}
auto &chemCompBond = db["chem_comp_bond"];
for (auto row : chemCompBond)
{
CompoundBond bond;
std::string valueOrder;
cif::tie(bond.atomID[0], bond.atomID[1], valueOrder, bond.aromatic, bond.stereoConfig) = row.get("atom_id_1", "atom_id_2", "value_order", "pdbx_aromatic_flag", "pdbx_stereo_config");
bond.type = from_string(valueOrder);
mBonds.push_back(std::move(bond));
}
}
Compound::Compound(cif::Datablock &db, const std::string &id, const std::string &name, const std::string &type, const std::string &group)
: mID(id)
, mName(name)
, mType(type)
, mGroup(group)
{
auto &chemCompAtom = db["chem_comp_atom"];
for (auto row : chemCompAtom)
{
CompoundAtom atom;
std::string typeSymbol;
cif::tie(atom.id, typeSymbol, atom.charge, atom.x, atom.y, atom.z) =
row.get("atom_id", "type_symbol", "charge", "x", "y", "z");
atom.typeSymbol = AtomTypeTraits(typeSymbol).type();
mFormalCharge += atom.charge;
mFormulaWeight += AtomTypeTraits(atom.typeSymbol).weight();
mAtoms.push_back(std::move(atom));
}
auto &chemCompBond = db["chem_comp_bond"];
for (auto row : chemCompBond)
{
CompoundBond bond;
std::string btype;
cif::tie(bond.atomID[0], bond.atomID[1], btype, bond.aromatic) = row.get("atom_id_1", "atom_id_2", "type", "aromatic");
using cif::iequals;
if (iequals(btype, "single"))
bond.type = BondType::sing;
else if (iequals(btype, "double"))
bond.type = BondType::doub;
else if (iequals(btype, "triple"))
bond.type = BondType::trip;
else if (iequals(btype, "deloc") or iequals(btype, "aromat") or iequals(btype, "aromatic"))
bond.type = BondType::delo;
else
{
if (cif::VERBOSE)
std::cerr << "Unimplemented chem_comp_bond.type " << btype << " in " << id << std::endl;
bond.type = BondType::sing;
}
mBonds.push_back(std::move(bond));
}
}
CompoundAtom Compound::getAtomByID(const std::string &atomID) const
{
CompoundAtom result = {};
for (auto &a : mAtoms)
{
if (a.id == atomID)
{
result = a;
break;
}
}
if (result.id != atomID)
throw std::out_of_range("No atom " + atomID + " in Compound " + mID);
return result;
}
bool Compound::atomsBonded(const std::string &atomId_1, const std::string &atomId_2) const
{
auto i = find_if(mBonds.begin(), mBonds.end(),
[&](const CompoundBond &b) {
return (b.atomID[0] == atomId_1 and b.atomID[1] == atomId_2) or (b.atomID[0] == atomId_2 and b.atomID[1] == atomId_1);
});
return i != mBonds.end();
}
// --------------------------------------------------------------------
// a factory class to generate compounds
CIFPP_EXPORT const std::map<std::string, char> kAAMap{
{"ALA", 'A'},
{"ARG", 'R'},
{"ASN", 'N'},
{"ASP", 'D'},
{"CYS", 'C'},
{"GLN", 'Q'},
{"GLU", 'E'},
{"GLY", 'G'},
{"HIS", 'H'},
{"ILE", 'I'},
{"LEU", 'L'},
{"LYS", 'K'},
{"MET", 'M'},
{"PHE", 'F'},
{"PRO", 'P'},
{"SER", 'S'},
{"THR", 'T'},
{"TRP", 'W'},
{"TYR", 'Y'},
{"VAL", 'V'},
{"GLX", 'Z'},
{"ASX", 'B'}};
CIFPP_EXPORT const std::map<std::string, char> kBaseMap{
{"A", 'A'},
{"C", 'C'},
{"G", 'G'},
{"T", 'T'},
{"U", 'U'},
{"DA", 'A'},
{"DC", 'C'},
{"DG", 'G'},
{"DT", 'T'}};
// --------------------------------------------------------------------
class CompoundFactoryImpl : public std::enable_shared_from_this<CompoundFactoryImpl>
{
public:
CompoundFactoryImpl(std::shared_ptr<CompoundFactoryImpl> next);
CompoundFactoryImpl(const std::filesystem::path &file, std::shared_ptr<CompoundFactoryImpl> next);
virtual ~CompoundFactoryImpl()
{
for (auto c: mCompounds)
delete c;
}
Compound *get(std::string id)
{
std::shared_lock lock(mMutex);
ba::to_upper(id);
Compound *result = nullptr;
// walk the list, see if any of us has the compound already
for (auto impl = shared_from_this(); impl; impl = impl->mNext)
{
for (auto cmp : impl->mCompounds)
{
if (cmp->id() == id)
{
result = cmp;
break;
}
}
if (result)
break;
}
if (result == nullptr and mMissing.count(id) == 0)
{
for (auto impl = shared_from_this(); impl; impl = impl->mNext)
{
result = impl->create(id);
if (result != nullptr)
break;
}
if (result == nullptr)
mMissing.insert(id);
}
return result;
}
std::shared_ptr<CompoundFactoryImpl> next() const
{
return mNext;
}
bool isKnownPeptide(const std::string &resName)
{
return mKnownPeptides.count(resName) or
(mNext and mNext->isKnownPeptide(resName));
}
bool isKnownBase(const std::string &resName)
{
return mKnownBases.count(resName) or
(mNext and mNext->isKnownBase(resName));
}
protected:
virtual Compound *create(const std::string &id)
{
// For the base class we assume every compound is preloaded
return nullptr;
}
std::shared_timed_mutex mMutex;
std::vector<Compound *> mCompounds;
std::set<std::string> mKnownPeptides;
std::set<std::string> mKnownBases;
std::set<std::string> mMissing;
std::shared_ptr<CompoundFactoryImpl> mNext;
};
// --------------------------------------------------------------------
CompoundFactoryImpl::CompoundFactoryImpl(std::shared_ptr<CompoundFactoryImpl> next)
: mNext(next)
{
for (const auto &[key, value] : kAAMap)
mKnownPeptides.insert(key);
for (const auto &[key, value] : kBaseMap)
mKnownBases.insert(key);
}
CompoundFactoryImpl::CompoundFactoryImpl(const std::filesystem::path &file, std::shared_ptr<CompoundFactoryImpl> next)
: mNext(next)
{
cif::File cifFile(file);
auto compList = cifFile.get("comp_list");
if (compList) // So this is a CCP4 restraints file, special handling
{
auto &chemComp = (*compList)["chem_comp"];
for (const auto &[id, name, group] : chemComp.rows<std::string, std::string, std::string>("id", "name", "group"))
{
std::string type;
// known groups are (counted from ccp4 monomer dictionary)
// D-pyranose
// DNA
// L-PEPTIDE LINKING
// L-SACCHARIDE
// L-peptide
// L-pyranose
// M-peptide
// NON-POLYMER
// P-peptide
// RNA
// furanose
// non-polymer
// non_polymer
// peptide
// pyranose
// saccharide
if (cif::iequals(id, "gly"))
type = "peptide linking";
else if (cif::iequals(group, "l-peptide") or cif::iequals(group, "L-peptide linking") or cif::iequals(group, "peptide"))
type = "L-peptide linking";
else if (cif::iequals(group, "DNA"))
type = "DNA linking";
else if (cif::iequals(group, "RNA"))
type = "RNA linking";
else
type = "non-polymer";
auto &db = cifFile["comp_" + id];
mCompounds.push_back(new Compound(db, id, name, type, group));
}
}
else
{
// A CCD components file, validate it first
cifFile.loadDictionary("mmcif_pdbx_v50");
if (not cifFile.isValid())
throw std::runtime_error("Invalid compound file");
for (auto &db : cifFile)
mCompounds.push_back(new Compound(db));
}
}
// --------------------------------------------------------------------
// Version for the default compounds, based on the cached components.cif file from CCD
class CCDCompoundFactoryImpl : public CompoundFactoryImpl
{
public:
CCDCompoundFactoryImpl(std::shared_ptr<CompoundFactoryImpl> next, const fs::path& file)
: CompoundFactoryImpl(next)
, mCompoundsFile(file)
{
}
CCDCompoundFactoryImpl(std::shared_ptr<CompoundFactoryImpl> next)
: CompoundFactoryImpl(next)
{
}
Compound *create(const std::string &id) override;
cif::DatablockIndex mIndex;
fs::path mCompoundsFile;
};
Compound *CCDCompoundFactoryImpl::create(const std::string &id)
{
Compound *result = nullptr;
std::unique_ptr<std::istream> ccd;
if (mCompoundsFile.empty())
{
ccd = cif::loadResource("components.cif");
if (not ccd)
throw std::runtime_error("Could not locate the CCD components.cif file, please make sure the software is installed properly and/or use the update-libcifpp-data to fetch the data.");
}
else
ccd.reset(new std::ifstream(mCompoundsFile));
cif::File file;
if (mIndex.empty())
{
if (cif::VERBOSE > 1)
{
std::cout << "Creating component index "
<< "...";
std::cout.flush();
}
cif::Parser parser(*ccd, file, false);
mIndex = parser.indexDatablocks();
if (cif::VERBOSE > 1)
std::cout << " done" << std::endl;
// reload the resource, perhaps this should be improved...
if (mCompoundsFile.empty())
{
ccd = cif::loadResource("components.cif");
if (not ccd)
throw std::runtime_error("Could not locate the CCD components.cif file, please make sure the software is installed properly and/or use the update-libcifpp-data to fetch the data.");
}
else
ccd.reset(new std::ifstream(mCompoundsFile));
}
if (cif::VERBOSE > 1)
{
std::cout << "Loading component " << id << "...";
std::cout.flush();
}
cif::Parser parser(*ccd, file, false);
parser.parseSingleDatablock(id, mIndex);
if (cif::VERBOSE > 1)
std::cout << " done" << std::endl;
if (not file.empty())
{
auto &db = file.firstDatablock();
if (db.getName() == id)
{
result = new Compound(db);
std::shared_lock lock(mMutex);
mCompounds.push_back(result);
}
}
if (result == nullptr and cif::VERBOSE)
std::cerr << "Could not locate compound " << id << " in the CCD components file" << std::endl;
return result;
}
// --------------------------------------------------------------------
// Version for the default compounds, based on the data found in CCP4's monomers lib
class CCP4CompoundFactoryImpl : public CompoundFactoryImpl
{
public:
CCP4CompoundFactoryImpl(const fs::path &clibd_mon, std::shared_ptr<CompoundFactoryImpl> next = nullptr);
Compound *create(const std::string &id) override;
private:
cif::File mFile;
fs::path mCLIBD_MON;
};
CCP4CompoundFactoryImpl::CCP4CompoundFactoryImpl(const fs::path &clibd_mon, std::shared_ptr<CompoundFactoryImpl> next)
: CompoundFactoryImpl(next)
, mFile((clibd_mon / "list" / "mon_lib_list.cif").string())
, mCLIBD_MON(clibd_mon)
{
const std::regex peptideRx("(?:[lmp]-)?peptide", std::regex::icase);
auto &chemComps = mFile["comp_list"]["chem_comp"];
for (const auto &[group, threeLetterCode] : chemComps.rows<std::string, std::string>("group", "three_letter_code"))
{
if (std::regex_match(group, peptideRx))
mKnownPeptides.insert(threeLetterCode);
else if (ba::iequals(group, "DNA") or ba::iequals(group, "RNA"))
mKnownBases.insert(threeLetterCode);
}
}
Compound *CCP4CompoundFactoryImpl::create(const std::string &id)
{
Compound *result = nullptr;
auto &cat = mFile["comp_list"]["chem_comp"];
auto rs = cat.find(cif::Key("three_letter_code") == id);
if (rs.size() == 1)
{
auto row = rs.front();
std::string name, group;
uint32_t numberAtomsAll, numberAtomsNh;
cif::tie(name, group, numberAtomsAll, numberAtomsNh) =
row.get("name", "group", "number_atoms_all", "number_atoms_nh");
fs::path resFile = mCLIBD_MON / ba::to_lower_copy(id.substr(0, 1)) / (id + ".cif");
if (not fs::exists(resFile) and (id == "COM" or id == "CON" or "PRN")) // seriously...
resFile = mCLIBD_MON / ba::to_lower_copy(id.substr(0, 1)) / (id + '_' + id + ".cif");
if (fs::exists(resFile))
{
cif::File cf(resFile.string());
// locate the datablock
auto &db = cf["comp_" + id];
std::string type;
// known groups are (counted from ccp4 monomer dictionary)
// D-pyranose
// DNA
// L-PEPTIDE LINKING
// L-SACCHARIDE
// L-peptide
// L-pyranose
// M-peptide
// NON-POLYMER
// P-peptide
// RNA
// furanose
// non-polymer
// non_polymer
// peptide
// pyranose
// saccharide
if (cif::iequals(id, "gly"))
type = "peptide linking";
else if (cif::iequals(group, "l-peptide") or cif::iequals(group, "L-peptide linking") or cif::iequals(group, "peptide"))
type = "L-peptide linking";
else if (cif::iequals(group, "DNA"))
type = "DNA linking";
else if (cif::iequals(group, "RNA"))
type = "RNA linking";
else
type = "non-polymer";
mCompounds.push_back(new Compound(db, id, name, type, group));
result = mCompounds.back();
}
}
return result;
}
// --------------------------------------------------------------------
std::unique_ptr<CompoundFactory> CompoundFactory::sInstance;
thread_local std::unique_ptr<CompoundFactory> CompoundFactory::tlInstance;
bool CompoundFactory::sUseThreadLocalInstance;
void CompoundFactory::init(bool useThreadLocalInstanceOnly)
{
sUseThreadLocalInstance = useThreadLocalInstanceOnly;
}
CompoundFactory::CompoundFactory()
: mImpl(nullptr)
{
auto ccd = cif::loadResource("components.cif");
if (ccd)
mImpl.reset(new CCDCompoundFactoryImpl(mImpl));
else if (cif::VERBOSE)
std::cerr << "CCD components.cif file was not found" << std::endl;
const char *clibd_mon = getenv("CLIBD_MON");
if (clibd_mon != nullptr and fs::is_directory(clibd_mon))
mImpl.reset(new CCP4CompoundFactoryImpl(clibd_mon));
else if (cif::VERBOSE)
std::cerr << "CCP4 monomers library not found, CLIBD_MON is not defined" << std::endl;
}
CompoundFactory::~CompoundFactory()
{
}
CompoundFactory &CompoundFactory::instance()
{
if (sUseThreadLocalInstance)
{
if (not tlInstance)
tlInstance.reset(new CompoundFactory());
return *tlInstance;
}
else
{
if (not sInstance)
sInstance.reset(new CompoundFactory());
return *sInstance;
}
}
void CompoundFactory::clear()
{
if (sUseThreadLocalInstance)
tlInstance.reset(nullptr);
else
sInstance.reset();
}
void CompoundFactory::setDefaultDictionary(const std::filesystem::path &inDictFile)
{
if (not fs::exists(inDictFile))
throw std::runtime_error("file not found: " + inDictFile.string());
try
{
mImpl.reset(new CCDCompoundFactoryImpl(mImpl, inDictFile));
}
catch (const std::exception &)
{
std::cerr << "Error loading dictionary " << inDictFile << std::endl;
throw;
}
}
void CompoundFactory::pushDictionary(const std::filesystem::path &inDictFile)
{
if (not fs::exists(inDictFile))
throw std::runtime_error("file not found: " + inDictFile.string());
// ifstream file(inDictFile);
// if (not file.is_open())
// throw std::runtime_error("Could not open peptide list " + inDictFile);
try
{
mImpl.reset(new CompoundFactoryImpl(inDictFile, mImpl));
}
catch (const std::exception &)
{
std::cerr << "Error loading dictionary " << inDictFile << std::endl;
throw;
}
}
void CompoundFactory::popDictionary()
{
if (mImpl)
mImpl = mImpl->next();
}
const Compound *CompoundFactory::create(std::string id)
{
// static bool warned = false;
// if (mImpl and warned == false)
// {
// std::cerr << "Warning: no compound information library was found, resulting data may be incorrect or incomplete" << std::endl;
// warned = true;
// }
return mImpl ? mImpl->get(id) : nullptr;
}
bool CompoundFactory::isKnownPeptide(const std::string &resName) const
{
return mImpl ? mImpl->isKnownPeptide(resName) : kAAMap.count(resName) > 0;
}
bool CompoundFactory::isKnownBase(const std::string &resName) const
{
return mImpl ? mImpl->isKnownBase(resName) : kBaseMap.count(resName) > 0;
}
} // namespace mmcif