diff --git a/CMakeLists.txt b/CMakeLists.txt index a742de5..4d5525f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,7 +25,7 @@ cmake_minimum_required(VERSION 3.16) # set the project name -project(libcifpp VERSION 5.2.5 LANGUAGES CXX) +project(libcifpp VERSION 6.0.0 LANGUAGES CXX) list(PREPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake") diff --git a/changelog b/changelog index 965bb53..8a4694f 100644 --- a/changelog +++ b/changelog @@ -1,3 +1,6 @@ +Version 6.0.0 +- Drop the use of CCP4's monomer library for compound information + Version 5.2.5 - Correctly import the Eigen3 library diff --git a/include/cif++/compound.hpp b/include/cif++/compound.hpp index 70db418..f9a574c 100644 --- a/include/cif++/compound.hpp +++ b/include/cif++/compound.hpp @@ -44,11 +44,7 @@ /// The data is loaded by default from a file called `components.cif`. This file /// is located using load_resource. (See documentation on cif::load_resource for more information) /// -/// But if the CCP4 environment is available at runtime, the compound information -/// may also be generated from the CCP4 monomer library. -/// -/// Note that the information in CCP4 and CCD is not equal. -/// +/// Note that since version 6 the CCP4 monomer library is no longer used. /// See also :doc:`/compound` for more information. @@ -157,10 +153,6 @@ class compound float formula_weight() const { return m_formula_weight; } ///< Return the formula mass of the chemical component in Daltons. int formal_charge() const { return m_formal_charge; } ///< Return the formal charge on the chemical component. - /// The group record is only available in CCP4 monomer library files. 
- /// For CCD entries this value will always contain 'non-polymer' - std::string group() const { return m_group; } - const std::vector &atoms() const { return m_atoms; } ///< Return the list of atoms for this compound const std::vector &bonds() const { return m_bonds; } ///< Return the list of bonds for this compound @@ -176,8 +168,6 @@ class compound private: friend class compound_factory_impl; - friend class CCD_compound_factory_impl; - friend class CCP4_compound_factory_impl; compound(cif::datablock &db); compound(cif::datablock &db, const std::string &id, const std::string &name, const std::string &type, const std::string &group); @@ -246,6 +236,8 @@ class compound_factory CIFPP_EXPORT static const std::map kAAMap, ///< Globally accessible static list of the default amino acids kBaseMap; ///< Globally accessible static list of the default bases + void report_missing_compound(const std::string &compound_id); + private: compound_factory(); diff --git a/include/cif++/utilities.hpp b/include/cif++/utilities.hpp index 6abac40..f75f993 100644 --- a/include/cif++/utilities.hpp +++ b/include/cif++/utilities.hpp @@ -365,6 +365,14 @@ std::unique_ptr load_resource(std::filesystem::path name); void add_file_resource(const std::string &name, std::filesystem::path dataFile); +/** + * @brief List all the file resources added with cif::add_file_resource. + * + * @param os The std::ostream to write the resource list to + */ + +void list_file_resources(std::ostream &os); + /** * @brief Add a directory to the list of search directories. This list is * searched in a last-in-first-out order. @@ -379,4 +387,12 @@ void add_file_resource(const std::string &name, std::filesystem::path dataFile); void add_data_directory(std::filesystem::path dataDir); +/** + * @brief List all the data directories, for error reporting on missing resources. 
+ * + * @param os The std::ostream to write the directories to + */ + +void list_data_directories(std::ostream &os); + } // namespace cif diff --git a/src/compound.cpp b/src/compound.cpp index e4f6c03..b150ea5 100644 --- a/src/compound.cpp +++ b/src/compound.cpp @@ -313,11 +313,10 @@ const std::map compound_factory::kBaseMap{ class compound_factory_impl : public std::enable_shared_from_this { public: - compound_factory_impl(std::shared_ptr next); - + compound_factory_impl(); compound_factory_impl(const fs::path &file, std::shared_ptr next); - virtual ~compound_factory_impl() + ~compound_factory_impl() { for (auto c : m_compounds) delete c; @@ -331,7 +330,7 @@ class compound_factory_impl : public std::enable_shared_from_thism_next) { for (auto cmp : impl->m_compounds) @@ -363,155 +362,52 @@ class compound_factory_impl : public std::enable_shared_from_this next() const + std::shared_ptr next() { return m_next; } - bool is_known_peptide(const std::string &resName) + void describe(std::ostream &os) { - return m_known_peptides.count(resName) or - (m_next and m_next->is_known_peptide(resName)); + if (m_file.empty()) + os << "CCD components.cif resource\n"; + else + os << "CCD components file: " << std::quoted(m_file.string()) << '\n'; + + if (m_next) + m_next->describe(os); } - bool is_known_base(const std::string &resName) - { - return m_known_bases.count(resName) or - (m_next and m_next->is_known_base(resName)); - } - - protected: - virtual compound *create(const std::string &id) - { - // For the base class we assume every compound is preloaded - return nullptr; - } + private: + compound *create(const std::string &id); std::shared_timed_mutex mMutex; + fs::path m_file; + cif::parser::datablock_index m_index; + std::vector m_compounds; - std::set m_known_peptides; - std::set m_known_bases; std::set m_missing; std::shared_ptr m_next; }; -// -------------------------------------------------------------------- - -compound_factory_impl::compound_factory_impl(std::shared_ptr 
next) - : m_next(next) +compound_factory_impl::compound_factory_impl() { - for (const auto &[key, value] : compound_factory::kAAMap) - m_known_peptides.insert(key); - - for (const auto &[key, value] : compound_factory::kBaseMap) - m_known_bases.insert(key); } compound_factory_impl::compound_factory_impl(const fs::path &file, std::shared_ptr next) - : m_next(next) + : m_file(file) + , m_next(next) { - cif::file cifFile(file); - - if (cifFile.contains("comp_list")) // So this is a CCP4 restraints file, special handling - { - auto &compList = cifFile["comp_list"]; - auto &chemComp = compList["chem_comp"]; - - for (const auto &[id, name, group] : chemComp.rows("id", "name", "group")) - { - std::string type; - - // known groups are (counted from ccp4 monomer dictionary) - - // D-pyranose - // DNA - // L-PEPTIDE LINKING - // L-SACCHARIDE - // L-peptide - // L-pyranose - // M-peptide - // NON-POLYMER - // P-peptide - // RNA - // furanose - // non-polymer - // non_polymer - // peptide - // pyranose - // saccharide - - if (cif::iequals(id, "gly")) - type = "peptide linking"; - else if (cif::iequals(group, "l-peptide") or cif::iequals(group, "L-peptide linking") or cif::iequals(group, "peptide") or cif::iequals(group, "p-peptide")) - type = "L-peptide linking"; - else if (cif::iequals(group, "DNA")) - type = "DNA linking"; - else if (cif::iequals(group, "RNA")) - type = "RNA linking"; - else - type = "non-polymer"; - - auto &db = cifFile["comp_" + id]; - - m_compounds.push_back(new compound(db, id, name, type, group)); - } - } - else - { - // A CCD components file, validate it first - try - { - cifFile.load_dictionary("mmcif_pdbx.dic"); - - if (not cifFile.is_valid()) - { - std::cerr << "The components file " << file << " is not valid\n"; - if (cif::VERBOSE < 1) - std::cerr << "(use --verbose to see why)\n"; - } - } - catch (const std::exception &e) - { - std::cerr << "When trying to load the components file " << file << " there was an exception:\n" - << e.what() << '\n'; - 
} - - for (auto &db : cifFile) - m_compounds.push_back(new compound(db)); - } } -// -------------------------------------------------------------------- -// Version for the default compounds, based on the cached components.cif file from CCD - -class CCD_compound_factory_impl : public compound_factory_impl -{ - public: - CCD_compound_factory_impl(std::shared_ptr next, const fs::path &file) - : compound_factory_impl(next) - , mCompoundsFile(file) - { - } - - CCD_compound_factory_impl(std::shared_ptr next) - : compound_factory_impl(next) - { - } - - compound *create(const std::string &id) override; - - cif::parser::datablock_index mIndex; - fs::path mCompoundsFile; -}; - -compound *CCD_compound_factory_impl::create(const std::string &id) +compound *compound_factory_impl::create(const std::string &id) { compound *result = nullptr; std::unique_ptr ccd; - if (mCompoundsFile.empty()) + if (m_file.empty()) { ccd = cif::load_resource("components.cif"); if (not ccd) @@ -521,11 +417,11 @@ compound *CCD_compound_factory_impl::create(const std::string &id) } } else - ccd.reset(new std::ifstream(mCompoundsFile)); + ccd.reset(new std::ifstream(m_file)); cif::file file; - if (mIndex.empty()) + if (m_index.empty()) { if (cif::VERBOSE > 1) { @@ -535,20 +431,20 @@ compound *CCD_compound_factory_impl::create(const std::string &id) } cif::parser parser(*ccd, file); - mIndex = parser.index_datablocks(); + m_index = parser.index_datablocks(); if (cif::VERBOSE > 1) std::cout << " done" << std::endl; // reload the resource, perhaps this should be improved... 
- if (mCompoundsFile.empty()) + if (m_file.empty()) { ccd = cif::load_resource("components.cif"); if (not ccd) throw std::runtime_error("Could not locate the CCD components.cif file, please make sure the software is installed properly and/or use the update-libcifpp-data to fetch the data."); } else - ccd.reset(new std::ifstream(mCompoundsFile)); + ccd.reset(new std::ifstream(m_file)); } if (cif::VERBOSE > 1) @@ -558,7 +454,7 @@ compound *CCD_compound_factory_impl::create(const std::string &id) } cif::parser parser(*ccd, file); - parser.parse_single_datablock(id, mIndex); + parser.parse_single_datablock(id, m_index); if (cif::VERBOSE > 1) std::cout << " done" << std::endl; @@ -575,107 +471,6 @@ compound *CCD_compound_factory_impl::create(const std::string &id) } } - if (result == nullptr and cif::VERBOSE > 0) - std::cerr << "Could not locate compound " << id << " in the CCD components file\n"; - - return result; -} - -// -------------------------------------------------------------------- -// Version for the default compounds, based on the data found in CCP4's monomers lib - -class CCP4_compound_factory_impl : public compound_factory_impl -{ - public: - CCP4_compound_factory_impl(const fs::path &clibd_mon, std::shared_ptr next = nullptr); - - compound *create(const std::string &id) override; - - private: - fs::path m_CLIBD_MON; -}; - -CCP4_compound_factory_impl::CCP4_compound_factory_impl(const fs::path &clibd_mon, std::shared_ptr next) - : compound_factory_impl(next) - , m_CLIBD_MON(clibd_mon) -{ - const std::regex peptideRx("(?:[lmp]-)?peptide", std::regex::icase); - - cif::file file(m_CLIBD_MON / "list" / "mon_lib_list.cif"); - auto &chemComps = file["comp_list"]["chem_comp"]; - - for (const auto &[group, comp_id] : chemComps.rows("group", "id")) - { - if (std::regex_match(group, peptideRx)) - m_known_peptides.insert(comp_id); - else if (cif::iequals(group, "DNA") or cif::iequals(group, "RNA")) - m_known_bases.insert(comp_id); - } -} - -compound 
*CCP4_compound_factory_impl::create(const std::string &id) -{ - compound *result = nullptr; - - fs::path resFile = m_CLIBD_MON / cif::to_lower_copy(id.substr(0, 1)) / (id + ".cif"); - - if (not fs::exists(resFile) and (id == "COM" or id == "CON" or "PRN")) // seriously... - resFile = m_CLIBD_MON / cif::to_lower_copy(id.substr(0, 1)) / (id + '_' + id + ".cif"); - - if (fs::exists(resFile)) - { - cif::file cf(resFile.string()); - - auto &db_list = cf["comp_list"]; - auto list = db_list["chem_comp"]; - - if (list.size() == 1) - { - std::string name, group; - uint32_t numberAtomsAll, numberAtomsNh; - cif::tie(name, group, numberAtomsAll, numberAtomsNh) = - list.front().get("name", "group", "number_atoms_all", "number_atoms_nh"); - - // locate the datablock - auto &db = cf["comp_" + id]; - - std::string type; - - // known groups are (counted from ccp4 monomer dictionary) - - // D-pyranose - // DNA - // L-PEPTIDE LINKING - // L-SACCHARIDE - // L-peptide - // L-pyranose - // M-peptide - // NON-POLYMER - // P-peptide - // RNA - // furanose - // non-polymer - // non_polymer - // peptide - // pyranose - // saccharide - - if (cif::iequals(id, "gly")) - type = "peptide linking"; - else if (cif::iequals(group, "l-peptide") or cif::iequals(group, "L-peptide linking") or cif::iequals(group, "peptide") or cif::iequals(group, "p-peptide")) - type = "L-peptide linking"; - else if (cif::iequals(group, "DNA")) - type = "DNA linking"; - else if (cif::iequals(group, "RNA")) - type = "RNA linking"; - else - type = "non-polymer"; - - m_compounds.push_back(new compound(db, id, name, type, group)); - result = m_compounds.back(); - } - } - return result; } @@ -695,15 +490,9 @@ compound_factory::compound_factory() { auto ccd = cif::load_resource("components.cif"); if (ccd) - m_impl = std::make_shared(m_impl); + m_impl = std::make_shared(); else if (cif::VERBOSE > 0) - std::cerr << "CCD components.cif file was not found\n"; - - const char *clibd_mon = getenv("CLIBD_MON"); - if (clibd_mon != 
nullptr and fs::is_directory(clibd_mon)) - m_impl = std::make_shared(clibd_mon, m_impl); - else if (cif::VERBOSE > 0) - std::cerr << "CCP4 monomers library not found, CLIBD_MON is not defined\n"; + std::cerr << "CCD components.cif resource was not found\n"; } compound_factory::~compound_factory() @@ -741,7 +530,7 @@ void compound_factory::set_default_dictionary(const fs::path &inDictFile) try { - m_impl.reset(new CCD_compound_factory_impl(m_impl, inDictFile)); + m_impl.reset(new compound_factory_impl(inDictFile, m_impl)); } catch (const std::exception &) { @@ -772,17 +561,50 @@ void compound_factory::pop_dictionary() const compound *compound_factory::create(std::string id) { - return m_impl ? m_impl->get(id) : nullptr; + auto result = m_impl ? m_impl->get(id) : nullptr; + if (not result) + report_missing_compound(id); + return result; } bool compound_factory::is_known_peptide(const std::string &resName) const { - return m_impl ? m_impl->is_known_peptide(resName) : kAAMap.count(resName) > 0; + return kAAMap.count(resName) > 0; } bool compound_factory::is_known_base(const std::string &resName) const { - return m_impl ? 
m_impl->is_known_base(resName) : kBaseMap.count(resName) > 0; + return kBaseMap.count(resName) > 0; +} + +void compound_factory::report_missing_compound(const std::string &compound_id) +{ + static bool s_reported = false; + if (std::exchange(s_reported, true) == false) + { + using namespace cif::colour; + + std::clog << "\n" << cif::coloured("Configuration error:", white, red) << "\n\n" + << "The attempt to retrieve compound information for " << std::quoted(compound_id) << " failed.\n\n" + << "This information is searched for in a CCD file called components.cif or components.cif.gz\n" + << "which should be located in one of the following directories:\n\n"; + + cif::list_data_directories(std::clog); + + std::clog << "\n(Note that you can add a directory to the search paths by setting the LIBCIFPP_DATA_DIR environmental variable)\n\n"; + + if (m_impl) + { + std::clog << "The current order of compound factory objects is:\n\n"; + m_impl->describe(std::clog); + } + else + std::clog << "No compound factory objects are created since none of the data sources is found.\n"; + + cif::list_file_resources(std::clog); + + std::clog.flush(); + } } } // namespace cif diff --git a/src/parser.cpp b/src/parser.cpp index 9aa572a..3b0b579 100644 --- a/src/parser.cpp +++ b/src/parser.cpp @@ -608,6 +608,9 @@ sac_parser::datablock_index sac_parser::index_datablocks() std::string::size_type si = 0; std::string datablock; + // Seek to beginning of file + m_source.pubseekpos(0); + for (auto ch = m_source.sbumpc(); ch != std::streambuf::traits_type::eof(); ch = m_source.sbumpc()) { switch (state) @@ -667,7 +670,7 @@ sac_parser::datablock_index sac_parser::index_datablocks() case data_name: if (is_non_blank(ch)) - datablock.insert(datablock.end(), char(ch)); + datablock.insert(datablock.end(), (char)std::toupper(ch)); else if (is_space(ch)) { if (not datablock.empty()) diff --git a/src/pdb/pdb2cif.cpp b/src/pdb/pdb2cif.cpp index 717d7d5..f41f984 100644 --- a/src/pdb/pdb2cif.cpp +++ 
b/src/pdb/pdb2cif.cpp @@ -5146,7 +5146,7 @@ void PDBFileParser::ParseConnectivtyAnnotation() getCategory("struct_conn")->emplace({ { "id", type + std::to_string(linkNr) }, - { "conn_type_id", type }, + { "conn_type_id", type }, // { "ccp4_link_id", ccp4LinkID }, diff --git a/src/utilities.cpp b/src/utilities.cpp index 25ec73e..b73d611 100644 --- a/src/utilities.cpp +++ b/src/utilities.cpp @@ -845,6 +845,9 @@ class resource_pool std::unique_ptr load(fs::path name); + const auto data_directories() { return mDirs; } + const auto file_resources() { return mLocalResources; } + private: resource_pool(); @@ -937,4 +940,22 @@ std::unique_ptr load_resource(std::filesystem::path name) return resource_pool::instance().load(name); } +void list_file_resources(std::ostream &os) +{ + auto &file_resources = resource_pool::instance().file_resources(); + + if (not file_resources.empty()) + { + os << "\nThe following named resources were loaded:\n"; + for (const auto &[name, path] : file_resources) + os << name << " -> " << std::quoted(path.string()) << '\n'; + } +} + +void list_data_directories(std::ostream &os) +{ + for (auto &p : resource_pool::instance().data_directories()) + os << p << '\n'; +} + } // namespace cif diff --git a/test/unit-v2-test.cpp b/test/unit-v2-test.cpp index c7e01b0..b1378cf 100644 --- a/test/unit-v2-test.cpp +++ b/test/unit-v2-test.cpp @@ -3483,3 +3483,11 @@ ATOM 7 CD PRO A 1 15.762 13.216 43.724 1.00 30.71 C)" auto f = cif::pdb::read(is); } + +// -------------------------------------------------------------------- + +BOOST_AUTO_TEST_CASE(compound_not_found_test_1) +{ + auto cmp = cif::compound_factory::instance().create("&&&"); + BOOST_CHECK(cmp == nullptr); +} \ No newline at end of file