diff --git a/CMakeLists.txt b/CMakeLists.txt index eba9a8d93..57dc63b6d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -54,6 +54,7 @@ option(RDK_BUILD_TEST_GZIP "Build the gzip'd stream test" OFF) option(RDK_OPTIMIZE_POPCNT "Use SSE4.2 popcount instruction while compiling." ON) option(RDK_USE_STRICT_ROTOR_DEFINITION "Use the most strict rotatable bond definition" ON) option(RDK_BUILD_DESCRIPTORS3D "Build the 3D descriptors calculators, requires Eigen3 to be installed" ON) +option(RDK_BUILD_CHEMDRAW_SUPPORT "build support for the Revvity ChemDraw document format" ON ) option(RDK_BUILD_FREESASA_SUPPORT "build the rdkit freesasa wrapper" OFF ) option(RDK_BUILD_COORDGEN_SUPPORT "build the rdkit coordgen wrapper" ON ) option(RDK_BUILD_MAEPARSER_SUPPORT "build the rdkit MAE parser wrapper" ON ) @@ -593,8 +594,6 @@ if (MSVC) add_definitions( "/wd4267" ) endif(MSVC) - - add_subdirectory(External) add_subdirectory(Code) diff --git a/Code/GraphMol/FileParsers/CDXMLParser.cpp b/Code/GraphMol/FileParsers/CDXMLParser.cpp index 4fd286309..b9d101d6c 100644 --- a/Code/GraphMol/FileParsers/CDXMLParser.cpp +++ b/Code/GraphMol/FileParsers/CDXMLParser.cpp @@ -8,6 +8,10 @@ // of the RDKit source tree. 
// #include "CDXMLParser.h" +#include +#include + +#ifndef RDK_BUILD_CHEMDRAW_SUPPORT #include #include #include @@ -832,3 +836,40 @@ std::vector> MolsFromCDXML( } // namespace CDXMLParser } // namespace v2 } // namespace RDKit +#else +#include +#include + +namespace RDKit{ +namespace v2 { +namespace CDXMLParser { + +std::vector> MolsFromCDXMLDataStream( + std::istream &inStream, const CDXMLParserParams &params) { + // forward the sanitize/removeHs options to the ChemDraw-based parser + ChemDrawParserParams chemdraw_params; + chemdraw_params.sanitize = params.sanitize; + chemdraw_params.removeHs = params.removeHs; + return MolsFromChemDrawDataStream(inStream, chemdraw_params); +} + +std::vector> MolsFromCDXMLFile( + const std::string &fileName, const CDXMLParserParams &params) { + std::ifstream ifs(fileName); + if (!ifs || ifs.bad()) { + std::ostringstream errout; + errout << "Bad input file " << fileName; + throw BadFileException(errout.str()); + } + return MolsFromCDXMLDataStream(ifs, params); +} + +std::vector> MolsFromCDXML( + const std::string &cdxml, const CDXMLParserParams &params) { + std::stringstream iss(cdxml); + return MolsFromCDXMLDataStream(iss, params); +} +} // namespace CDXMLParser +} // namespace v2 +} // namespace RDKit +#endif diff --git a/Code/GraphMol/FileParsers/CMakeLists.txt b/Code/GraphMol/FileParsers/CMakeLists.txt index ddf761ace..e282e2c97 100644 --- a/Code/GraphMol/FileParsers/CMakeLists.txt +++ b/Code/GraphMol/FileParsers/CMakeLists.txt @@ -34,7 +34,8 @@ rdkit_library(FileParsers MultithreadedMolSupplier.cpp MultithreadedSmilesMolSupplier.cpp MultithreadedSDMolSupplier.cpp - LINK_LIBRARIES GenericGroups Depictor SmilesParse ChemTransforms GraphMol SubstructMatch ${MAEPARSER_LIB}) + LINK_LIBRARIES GenericGroups Depictor SmilesParse ChemTransforms GraphMol SubstructMatch ${MAEPARSER_LIB} ${RDK_CHEMDRAW_LIBS}) + target_compile_definitions(FileParsers PRIVATE RDKIT_FILEPARSERS_BUILD) rdkit_headers(CDXMLParser.h diff --git a/Code/GraphMol/FileParsers/cdxml_parser_catch.cpp b/Code/GraphMol/FileParsers/cdxml_parser_catch.cpp index fcccaf291..18ada8af8 
100644 --- a/Code/GraphMol/FileParsers/cdxml_parser_catch.cpp +++ b/Code/GraphMol/FileParsers/cdxml_parser_catch.cpp @@ -295,14 +295,16 @@ TEST_CASE("CDXML") { E="28" BS="N" />)"; - std::stringstream iss(cdxml1); + { + std::stringstream iss(cdxml1); auto mols = MolsFromCDXMLDataStream(iss); for (auto &mol : mols) { CHECK(MolToSmiles(*mol) == "CC(C)(C)OC(=O)C1CCCCCC1"); } } { + std::stringstream iss(cdxml1); // v1 api auto mols = CDXMLDataStreamToMols(iss); for (auto &mol : mols) { @@ -544,6 +546,7 @@ TEST_CASE("CDXML") { // this was hella fun to validate the stereo-chemistry... auto fname = cdxmlbase + "chemdraw_template1.cdxml"; auto mols = MolsFromCDXMLFile(fname); +#ifndef RDK_BUILD_CHEMDRAW_SUPPORT std::vector expected = { "CCC/C=C/C=C/C(=O)O[C@H]1/C(=C/C(=O)OC)C[C@H]2C[C@H]([C@@H](C)O)OC(=O)C[C@H](O)C[C@@H]3C[C@H](OC(C)=O)C(C)(C)[C@](O)(C[C@@H]4C/C(=C/C(=O)OC)C[C@H](/C=C/C(C)(C)[C@]1(O)O2)O4)O3", "[B]", @@ -558,6 +561,27 @@ TEST_CASE("CDXML") { "Cc1cccn1[C@H](C)C(C#N)O[Si](C)(C)C", "CC1CCC2(O)C3(OC4(O)C[C@]2(C)C2(O)[C@H](OC(=O)c5ccc[nH]5)C(O)(C(C)C)C4(C)C32O)C1O", "C=C(C)[C@H]1CC(=O)CC2=C(C1)[C@H]1C(=O)O[C@H]3C[C@@](C)(O)[C@@H](C2=O)[C@@H]13"}; +#else + // the new cdxml parser handles stereo a lot better + std::vector expected = { + "CCC/C=C/C=C/C(=O)O[C@H]1/C(=C/C(=O)OC)C[C@H]2C[C@H]([C@@H](C)O)OC(=O)C[C@H](O)C[C@@H]3C[C@H](OC(C)=O)C(C)(C)[C@](O)(C[C@@H]4C/C(=C/C(=O)OC)C[C@H](/C=C/C(C)(C)[C@]1(O)O2)O4)O3", + "[B]", + "*", + "[C]", + "Cc1ccc2n1[C@@H]1[C@@H]3O[C@]([C@H](C)O)(C=C2)[C@H]1c1ccc(C)n1[C@@H]3C", + // this is may or may not be correct, but the structure is drawn + // incorrectly. 
+ // There's a test below which fixes this + "Cc1ccc2n1[C@H](C)C(=O)[C@@H]1[C@H]2C(=O)C=Cc2ccc(C)n21", + "Cc1ccc2ccc(=O)ccn12", + "Cc1cccn1[C@H](C)C=O", + "Cc1ccc2ccc([O-])cc[n+]1-2", + "Cc1ccc2ccc(=O)ccn12", + "Cc1cccn1[C@H](C)C(C#N)O[Si](C)(C)C", + "CC1CC[C@]2(O)[C@]3(C)C[C@]4(O)O[C@@]2([C@@H]1O)C1(O)C4(C)C(O)(C(C)C)[C@@H](OC(=O)c2ccc[nH]2)[C@]13O", + "C=C(C)[C@H]1CC(=O)CC2=C(C1)[C@H]1C(=O)O[C@H]3C[C@@](C)(O)[C@@H](C2=O)[C@@H]13"}; + +#endif CHECK(mols.size() == expected.size()); int i = 0; for (auto &mol : mols) { @@ -585,6 +609,7 @@ TEST_CASE("CDXML") { // the rdkit is correct here... auto fname = cdxmlbase + "chemdraw_template2.cdxml"; auto mols = MolsFromCDXMLFile(fname); +#ifndef RDK_BUILD_CHEMDRAW_SUPPORT std::vector expected = { "CCN1CC2(COC)CCC(OC)C34C5CC6C(OC)CC(O)(C(CC23)C14)C5C6O", "*", @@ -656,6 +681,81 @@ TEST_CASE("CDXML") { "C", "CC1CCC2(O)C3(OC4(O)C[C@]2(C)C2(O)[C@H](O)C(O)(C(C)C)C4(C)C32O)C1O", "[2H]"}; +#else + // The new cdxml parser handles stereo a LOT better + std::string talatisamine = "CCN1C[C@]2(COC)CCC(OC)[C@@]34[C@@H]5C[C@@H]6C(OC)C[C@@](O)([C@H]5[C@H]6O)[C@@H](C[C@H]23)[C@H]14"; + std::vector expected = { + talatisamine, //0 + "*", + "C", + "[F]", + "[B]", + "[C]", + "[2H]", + talatisamine, + "*", + "C", + "[F]", // 10 + "[B]", + "[C]", + "[2H]", + talatisamine, + "*", + "C", + "[F]", + "[B]", + "[C]", + "[2H]", // 20 + talatisamine, + "*", + "C", + "[F]", + "[B]", + "[C]", + "[2H]", + talatisamine, + "CCN1C[C@]2(COC)CC[C@H](OC)[C@]34C1C(C[C@H]23)[C@@]1(O)CC(OC)[C@H]2C[C@@H]4[C@@H]1[C@H]2O", + "*", // 30 + "[B]", + "[C]", + "[2H]", + "C", + "[F]", + "*", + "C", + "[F]", + "[B]", + "[C]", // 40 + "[2H]", + talatisamine, + "*", + "C", + "[F]", + "[B]", + "[C]", + "[2H]", + talatisamine, + "*", // 50 + "C", + "[F]", + "[B]", + "[C]", + "[2H]", + "CC1CC[C@]2(O)[C@]3(C)C[C@]4(O)O[C@@]2([C@@H]1O)C1(O)C4(C)C(O)(C(C)C)[C@@H](O)[C@]13O", + "CC1=C(C(C)C)[C@@H](O)[C@@]2(O)[C@@]3(C)CC(=O)O[C@@]4([C@H](O)C(C)CC[C@]34O)[C@@]12O", + 
"CC1=C[C@@]23OC(=O)C[C@@](C)([C@@]2(O)CC1)[C@]1(O)[C@H](O)C2(C(C)C)OC2(C)[C@@]31O", + "*", + "[B]", // 60 + "[C]", + "CC1CC[C@@H]2[C@]3(C)C[C@@H]4O[C@@]2(C1)C1[C@@H]3CC(C(C)C)C14C", + "[2H]", + "*", + "[B]", + "[C]", + "C", + "CC1CC[C@]2(O)[C@]3(C)C[C@]4(O)O[C@@]2([C@@H]1O)C1(O)C4(C)C(O)(C(C)C)[C@@H](O)[C@]13O", + "[2H]"}; +#endif CHECK(mols.size() == expected.size()); int i = 0; for (auto &mol : mols) { @@ -762,7 +862,11 @@ TEST_CASE("CDXML") { auto mols = MolsFromCDXMLFile(fname); CHECK(0); } catch (FileParseException &e) { +#ifndef RDK_BUILD_CHEMDRAW_SUPPORT CHECK(std::string(e.what()) == "expected > at line: 373"); +#else + CHECK(std::string(e.what()) == "Bad Input File"); +#endif } } SECTION("Lots of stereo") { @@ -823,7 +927,7 @@ TEST_CASE("CDXML") { } } } - SECTION("Lots of bad stereo") { + SECTION("Lots of bad molecules") { { auto fname = cdxmlbase + "bad-id.cdxml"; auto mols = MolsFromCDXMLFile(fname); @@ -837,7 +941,11 @@ TEST_CASE("CDXML") { { auto fname = cdxmlbase + "bad-bondorder.cdxml"; auto mols = MolsFromCDXMLFile(fname); +#ifndef RDK_BUILD_CHEMDRAW_SUPPORT CHECK(mols.size() == 0); +#else + CHECK(mols.size() == 1); // The original chemdraw reader makes unknowns single bonds +#endif } { auto fname = cdxmlbase + "bad-bondorder2.cdxml"; @@ -854,8 +962,15 @@ TEST_CASE("atropisomers") { SECTION("atropisomer") { { std::vector filenames = {"atrop1.cdxml"}; + +#ifndef RDK_BUILD_CHEMDRAW_SUPPORT std::vector expected = { "C[C]1[C][CH]C(Cl)C(C)=C1c1c(C)ccc(Cl)c1C |(-2.936,-0.12,;-2.936,-1.66,;-1.602,-2.43,;-1.602,-3.97,;-2.936,-4.74,;-2.93,-6.28,;-4.27,-3.97,;-5.603,-4.74,;-4.27,-2.43,;-5.603,-1.66,;-5.603,-0.12,;-4.27,0.64,;-6.937,0.64,;-8.271,-0.12,;-8.271,-1.66,;-9.604,-2.43,;-6.937,-2.43,;-6.937,-3.97,),^1:1,3,^2:2,wU:8.8|"}; +#else + std::vector expected = { + "C[C]1[C][CH]C(Cl)C(C)=C1c1c(C)ccc(Cl)c1C 
|(-2.936,-0.12,;-2.936,-1.66,;-1.602,-2.43,;-1.602,-3.97,;-2.936,-4.74,;-2.93,-6.28,;-4.27,-3.97,;-5.603,-4.74,;-4.27,-2.43,;-5.603,-1.66,;-5.603,-0.12,;-4.27,0.639999,;-6.937,0.639999,;-8.271,-0.12,;-8.271,-1.66,;-9.604,-2.43,;-6.937,-2.43,;-6.937,-3.97,),^1:1,3,^2:2,wU:8.8|"}; + +#endif for (auto i = 0u; i < filenames.size(); ++i) { auto fname = cdxmlbase + filenames[i]; auto mol = MolsFromCDXMLFile(fname); diff --git a/Code/GraphMol/FileParsers/test_data/Issue3514824.cdxml b/Code/GraphMol/FileParsers/test_data/Issue3514824.cdxml new file mode 100644 index 000000000..d9d088ece --- /dev/null +++ b/Code/GraphMol/FileParsers/test_data/Issue3514824.cdxml @@ -0,0 +1,544 @@ + + + + + + + + + + + + + + +OOOOO \ No newline at end of file diff --git a/External/CMakeLists.txt b/External/CMakeLists.txt index f4bf20d92..26cf6cbba 100644 --- a/External/CMakeLists.txt +++ b/External/CMakeLists.txt @@ -7,4 +7,8 @@ add_subdirectory(RingFamilies) add_subdirectory(GA) if(RDK_BUILD_PUBCHEMSHAPE_SUPPORT) add_subdirectory(pubchem_shape) -endif() \ No newline at end of file +endif() + +if(RDK_BUILD_CHEMDRAW_SUPPORT) + add_subdirectory(ChemDraw) +endif() diff --git a/External/ChemDraw/CMakeLists.txt b/External/ChemDraw/CMakeLists.txt new file mode 100644 index 000000000..e2d77ed5d --- /dev/null +++ b/External/ChemDraw/CMakeLists.txt @@ -0,0 +1,193 @@ +add_custom_target(chemdraw_support ALL) +include(CMakePrintHelpers) + +# The ChemDraw Library requires expat and expatpp to run. +# this has an include expat.h and expatpp.h that needs to be included +# Currently this is an OLD version of expatpp from source forge is used +# and included in this directory. 
+# +# For builds, we currently need a target_include_directories +# and will need to be fixed in the future + +if(RDK_BUILD_CHEMDRAW_SUPPORT) + if(NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/chemdraw/chemdraw/CDXIO.h" ) + set(RELEASE_NO "1.0.9") + set(MD5 "a41bb1abb2df2082274b74dccee19fb4") + downloadAndCheckMD5("https://codeload.github.com/Glysade/chemdraw/tar.gz/refs/tags/v${RELEASE_NO}" + "${CMAKE_CURRENT_SOURCE_DIR}/chemdraw-v${RELEASE_NO}.tar.gz" ${MD5}) + + execute_process(COMMAND ${CMAKE_COMMAND} -E tar zxf + ${CMAKE_CURRENT_SOURCE_DIR}/chemdraw-v${RELEASE_NO}.tar.gz + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) + file(RENAME "${CMAKE_CURRENT_SOURCE_DIR}/chemdraw-${RELEASE_NO}" "${CMAKE_CURRENT_SOURCE_DIR}/chemdraw") + endif() + + include(TestBigEndian) + message("-- Looking for endianess") + test_big_endian(WORDS_BIGENDIAN) + #/* 1234 = LIL_ENDIAN, 4321 = BIGENDIAN */ + + if(WORDS_BIGENDIAN) + message("-- CHEDRAW BIGENDIAN PLATFORM") + add_definitions("-DPLATFORM_BIGENDIAN") + else(WORDS_BIGENDIAN) + message("- CHEMDRAW LITTLEENDIAN PLATFORM") + add_definitions("-DPLATFORM_LITTLEENDIAN") + endif(WORDS_BIGENDIAN) + + + # we don't want to install expat, this is statically linked in to the ChemDraw lib + # however, we don't want to install it so use the undocumented EXCLUDE_FROM_ALL + add_subdirectory(chemdraw/expatpp EXCLUDE_FROM_ALL) + + include_directories( + ${CMAKE_CURRENT_SOURCE_DIR}/chemdraw/expatpp/expatpp-code-r6-trunk/src_pp + ${CMAKE_CURRENT_SOURCE_DIR}/chemdraw/expatpp/expatpp-code-r6-trunk/expat/lib) + + # it's way easier to use the RDKIT machinery to build and link so let's do that + file(GLOB CHEMDRAW_SOURCE "chemdraw/chemdraw/*.cpp") + rdkit_library(ChemDraw ${CHEMDRAW_SOURCE} SHARED) + target_compile_definitions(ChemDraw PRIVATE CHEMDRAW_BUILD) + target_link_libraries(ChemDraw PRIVATE expat) + + # export all the symbols for ChemDraw on MSVC + if((MSVC AND RDK_INSTALL_DLLS_MSVC) OR ((NOT MSVC) AND WIN32)) + message("== ChemDraw exporting all 
symbols") + set_target_properties(ChemDraw PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS TRUE) + endif() + # On Windows, define MYLIB_EXPORTS when building the DLL + if (WIN32) + target_compile_definitions(ChemDraw + PRIVATE CHEMDRAW_BUILD + ) + endif() + + # On Linux/macOS, hide all symbols by default and expose only our API + if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR + CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR + CMAKE_CXX_COMPILER_ID STREQUAL "Emscripten") + # Require -fvisibility=hidden + set_target_properties(ChemDraw PROPERTIES + CXX_VISIBILITY_PRESET hidden + VISIBILITY_INLINES_HIDDEN ON + ) + endif() + + +# export all the symbols for ChemDraw on MSVC +if((MSVC AND BUILD_SHARED_LIBS) OR ((NOT MSVC) AND WIN32)) + set_target_properties(ChemDraw PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS TRUE) +endif() + # Suppress warnings since we don't control the chedraw code and indicate + # we have the EXPAT_CONFIG created + if(MSVC) + ADD_DEFINITIONS("-DTARGET_API_LIB -D_WINDOWS -DTARGET_OS_WIN32 -DHAVE_EXPAT_CONFIG_H") + # we don't really control chemdraw source code, so suppress warnings + target_compile_options(ChemDraw PRIVATE "/W0") + else() + ADD_DEFINITIONS("-DTARGET_API_LIB -D__linux -DHAVE_EXPAT_CONFIG_H") + # we don't really control chemdraw source code, so suppress warnings + target_compile_options(ChemDraw PRIVATE -w -Wno-unknown-pragmas -Wno-error) + if(RDK_INSTALL_STATIC_LIBS) + if(TARGET ChemDraw_static) + target_compile_options(ChemDraw_static PRIVATE -w -Wno-unknown-pragmas -Wno-error) + endif() + endif(RDK_INSTALL_STATIC_LIBS) + endif() + + install(TARGETS ChemDraw DESTINATION ${RDKit_LibDir}) + + if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-comment -Wno-parentheses -Wno-logical-op-parentheses -Wno-pointer-bool-conversion -Wno-unused-value -Wno-unsequenced -Wno-constant-logical-operand") + endif() + + if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wformat-overflow=0 -Wformat=0 
-Wno-format-security") + endif() + + + include_directories(chemdraw) + + set(RDChemDrawLib_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR} + CACHE STRING "RDChemDrawLib Include File" FORCE) + + rdkit_library(RDChemDrawLib + bond.cpp + bracket.cpp + chemdraw.cpp + fragment.cpp + node.cpp + reaction.cpp + utils.cpp + writer.cpp + # ${EXPAT_SRC} + SHARED LINK_LIBRARIES ChemDraw + CIPLabeler ChemTransforms GraphMol RDGeneral Depictor SubstructMatch SmilesParse ) + + rdkit_library(RDChemDrawReactionLib + chemdrawreaction.cpp + # ${EXPAT_SRC} + SHARED LINK_LIBRARIES RDChemDrawLib ChemDraw + CIPLabeler ChemTransforms ChemReactions GraphMol RDGeneral Depictor SubstructMatch SmilesParse ) + + if(MSVC) + target_compile_definitions(RDChemDrawLib PRIVATE RDKIT_RDCHEMDRAWLIB_BUILD XML_USE_MSC_EXTENSIONS) + target_compile_definitions(RDChemDrawReactionLib PRIVATE RDKIT_RDCHEMDRAWREACTIONLIB_BUILD + XML_USE_MSC_EXTENSIONS) + else() + target_compile_definitions(RDChemDrawLib PRIVATE RDKIT_RDCHEMDRAWLIB_BUILD) + target_compile_definitions(RDChemDrawReactionLib PRIVATE RDKIT_RDCHEMDRAWREACTIONLIB_BUILD) + endif() + + install(TARGETS RDChemDrawLib DESTINATION ${RDKit_LibDir}) + install(TARGETS RDChemDrawReactionLib DESTINATION ${RDKit_LibDir}) + set(RDK_CHEMDRAW_LIBS RDChemDrawLib CACHE STRING "the external libraries" FORCE) + set(RDK_CHEMDRAWREACTION_LIBS RDChemDrawReactionLib CACHE STRING "the external libraries" FORCE) + + rdkit_headers(chemdraw.h DEST GraphMol) + rdkit_headers(chemdrawreaction.h DEST GraphMol) + + # all the tests + rdkit_catch_test(chemdrawCatchTest test.cpp + LINK_LIBRARIES RDChemDrawLib ChemDraw SubstructMatch ChemReactions + FileParsers SmilesParse CIPLabeler ChemTransforms GraphMol) + + rdkit_catch_test(chemdrawChiralCatchTest test-chiral.cpp + LINK_LIBRARIES RDChemDrawLib ChemDraw SubstructMatch ChemReactions + FileParsers SmilesParse CIPLabeler ChemTransforms GraphMol) + + rdkit_catch_test(chemdrawReactionsCatchTest test-reactions.cpp + LINK_LIBRARIES 
RDChemDrawReactionLib RDChemDrawLib ChemDraw SubstructMatch ChemReactions + FileParsers SmilesParse CIPLabeler ChemTransforms GraphMol) + + rdkit_catch_test(chemdraw3DCatchTest test_3d.cpp + LINK_LIBRARIES RDChemDrawLib ChemDraw SubstructMatch ChemReactions + FileParsers SmilesParse CIPLabeler ChemTransforms GraphMol) + + rdkit_catch_test(chemdraw6KCatchTest test_6k.cpp + LINK_LIBRARIES RDChemDrawLib ChemDraw SubstructMatch ChemReactions + FileParsers SmilesParse CIPLabeler ChemTransforms GraphMol) + + if(RDK_BUILD_CPP_TESTS) + if(MSVC) + # The nanotubes blow up the smiles writer stack on MSVC so increase it + set_target_properties(chemdraw6KCatchTest PROPERTIES LINK_FLAGS + "/STACK:4194304") + + # this sets everything I think + # set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /STACK:4194304") + else() + #target_compile_options(chemdrawCatchTest PRIVATE -w -Wno-unknown-pragmas -Wno-error) + #target_compile_options(chemdrawChiralCatchTest PRIVATE -w -Wno-unknown-pragmas -Wno-error) + #target_compile_options(chemdrawReactionsCatchTest PRIVATE -w -Wno-unknown-pragmas -Wno-error) + #target_compile_options(chemdraw3DCatchTest PRIVATE -w -Wno-unknown-pragmas -Wno-error) + #target_compile_options(chemdraw6KCatchTest PRIVATE -w -Wno-unknown-pragmas -Wno-error) + endif(MSVC) + endif(RDK_BUILD_CPP_TESTS) + + if(RDK_BUILD_PYTHON_WRAPPERS) + add_subdirectory(Wrap) + endif(RDK_BUILD_PYTHON_WRAPPERS) + +endif(RDK_BUILD_CHEMDRAW_SUPPORT) + diff --git a/External/ChemDraw/ChemDrawEndInclude.h b/External/ChemDraw/ChemDrawEndInclude.h new file mode 100644 index 000000000..82b916dfb --- /dev/null +++ b/External/ChemDraw/ChemDrawEndInclude.h @@ -0,0 +1,29 @@ +#if defined(__clang__) +/* Clang/LLVM. ---------------------------------------------- */ +#pragma GCC diagnostic pop + +#elif defined(__ICC) || defined(__INTEL_COMPILER) +/* Intel ICC/ICPC. 
------------------------------------------ */ + +#elif (defined(__GNUC__) || defined(__GNUG__)) && \ + (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 5)) +/* GNU GCC/G++. these pragmas only work with >v4.1 + * --------------------------------------------- */ +#pragma GCC diagnostic pop + +#elif defined(__HP_cc) || defined(__HP_aCC) +/* Hewlett-Packard C/aC++. ---------------------------------- */ + +#elif defined(__IBMC__) || defined(__IBMCPP__) +/* IBM XL C/C++. -------------------------------------------- */ + +#elif defined(_MSC_VER) +/* Microsoft Visual Studio. --------------------------------- */ +#pragma warning(pop) +#elif defined(__PGI) +/* Portland Group PGCC/PGCPP. ------------------------------- */ + +#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC) +/* Oracle Solaris Studio. ----------------------------------- */ + +#endif diff --git a/External/ChemDraw/ChemDrawStartInclude.h b/External/ChemDraw/ChemDrawStartInclude.h new file mode 100644 index 000000000..2c29c1d20 --- /dev/null +++ b/External/ChemDraw/ChemDrawStartInclude.h @@ -0,0 +1,66 @@ +#if defined(__clang__) +/* Clang/LLVM. 
---------------------------------------------- */ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wunused-value" +#pragma GCC diagnostic ignored "-Wmissing-field-initializers" +#pragma GCC diagnostic ignored "-Wsign-compare" +#pragma GCC diagnostic ignored "-Wconversion" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wtype-limits" +#pragma GCC diagnostic ignored "-Wreorder" +#pragma GCC diagnostic ignored "-Wunused" +#pragma GCC diagnostic ignored "-Wmacro-redefined" +#pragma GCC diagnostic ignored "-Wunknown-pragmas" +#pragma GCC diagnostic ignored "-Wignored-qualifiers" +#pragma GCC diagnostic ignored "-Wall" +#pragma GCC diagnostic ignored "-Wextra" +#if defined(__apple_build_version__) +#if __apple_build_version__ >= 7000072 +#pragma GCC diagnostic ignored "-Wunused-local-typedef" +#endif +#endif +#elif defined(__ICC) || defined(__INTEL_COMPILER) +/* Intel ICC/ICPC. ------------------------------------------ */ + +#elif (defined(__GNUC__) || defined(__GNUG__)) && \ + (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 1)) +/* GNU GCC/G++. 
--------------------------------------------- */ +#if (__GNUC__ > 4 || __GNUC_MINOR__ > 5) +#pragma GCC diagnostic push +#endif +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wmissing-field-initializers" +#pragma GCC diagnostic ignored "-Wsign-compare" +#pragma GCC diagnostic ignored "-Wconversion" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +//#pragma GCC diagnostic ignored "-Wmacro-redefined" +#pragma GCC diagnostic ignored "-Wunknown-pragmas" +#pragma GCC diagnostic ignored "-Wignored-qualifiers" +#pragma GCC diagnostic ignored "-Wextra" +#pragma GCC diagnostic ignored "-Wall" +#if (__GNUC__ > 4 || __GNUC_MINOR__ > 7) +#pragma GCC diagnostic ignored "-Wunused-local-typedefs" +#endif +#if (__GNUC__ > 8) +#pragma GCC diagnostic ignored "-Wdeprecated-copy" +#pragma GCC diagnostic ignored "-Wpessimizing-move" +#endif +#elif defined(__HP_cc) || defined(__HP_aCC) +/* Hewlett-Packard C/aC++. ---------------------------------- */ + +#elif defined(__IBMC__) || defined(__IBMCPP__) +/* IBM XL C/C++. -------------------------------------------- */ + +#elif defined(_MSC_VER) +/* Microsoft Visual Studio. --------------------------------- */ +#pragma warning(push) +#pragma warning(disable : 4996 4267) + +#elif defined(__PGI) +/* Portland Group PGCC/PGCPP. ------------------------------- */ + +#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC) +/* Oracle Solaris Studio. 
----------------------------------- */ + +#endif diff --git a/External/ChemDraw/Wrap/CMakeLists.txt b/External/ChemDraw/Wrap/CMakeLists.txt new file mode 100644 index 000000000..9737705d8 --- /dev/null +++ b/External/ChemDraw/Wrap/CMakeLists.txt @@ -0,0 +1,8 @@ +remove_definitions(-DRDKIT_CHEMDRAW_BUILD) +rdkit_python_extension(rdChemDraw + rdChemDraw.cpp + DEST Chem + LINK_LIBRARIES + RDChemDrawLib RDChemDrawReactionLib + ) + diff --git a/External/ChemDraw/Wrap/rdChemDraw.cpp b/External/ChemDraw/Wrap/rdChemDraw.cpp new file mode 100644 index 000000000..f24defa60 --- /dev/null +++ b/External/ChemDraw/Wrap/rdChemDraw.cpp @@ -0,0 +1,261 @@ +// +// Copyright (c) 2025, Glysade Inc. +// and other RDKit contributors +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Novartis Institutes for BioMedical Research Inc. +// nor the names of its contributors may be used to endorse or promote +// products derived from this software without specific prior written +// permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include + +namespace python = boost::python; +using namespace RDKit; +namespace { +std::string pyObjectToString(python::object input) { + python::extract ex(input); + if (ex.check()) { + return ex(); + } + std::wstring ws = python::extract(input); + return std::string(ws.begin(), ws.end()); +} + +python::object MolsFromChemDrawBlockHelper(const std::string &filename, bool sanitize, + bool removeHs) { + std::vector> mols; + try { + mols = RDKit::v2::MolsFromChemDrawBlock(filename, {sanitize, removeHs}); + } catch (RDKit::BadFileException &e) { + PyErr_SetString(PyExc_IOError, e.what()); + throw python::error_already_set(); + } catch (RDKit::FileParseException &e) { + BOOST_LOG(rdWarningLog) << e.what() << std::endl; + } catch (...) 
{ + } + python::list res; + for (auto &mol : mols) { + // take ownership of the data from the unique_ptr + ROMOL_SPTR sptr(static_cast(mol.release())); + res.append(sptr); + } + return python::tuple(res); +} + +python::tuple MolsFromChemDrawFileHelper(python::object cdxml, bool sanitize, + bool removeHs) { + auto mols = RDKit::v2::MolsFromChemDrawFile(pyObjectToString(cdxml), {sanitize, removeHs}); + python::list res; + for (auto &mol : mols) { + // take ownership of the data from the unique_ptr + ROMOL_SPTR sptr(static_cast(mol.release())); + res.append(sptr); + } + return python::tuple(res); +} + +python::object ReactionsFromChemDrawFileHelper(const char *filename, bool sanitize, + bool removeHs) { + std::vector> rxns; + try { + rxns = RDKit::v2::ChemDrawFileToChemicalReactions(filename, sanitize, removeHs); + } catch (RDKit::BadFileException &e) { + PyErr_SetString(PyExc_IOError, e.what()); + throw python::error_already_set(); + } catch (RDKit::FileParseException &e) { + BOOST_LOG(rdWarningLog) << e.what() << std::endl; + } catch (...) { + } + python::list res; + for (auto &rxn : rxns) { + // take ownership of the data from the unique_ptr + res.append(std::shared_ptr(rxn.release())); + } + return python::tuple(res); +} + +python::object ReactionsFromChemDrawBlockHelper(python::object imolBlock, bool sanitize, + bool removeHs) { + std::istringstream inStream(pyObjectToString(imolBlock)); + std::vector> rxns; + try { + rxns = RDKit::v2::ChemDrawDataStreamToChemicalReactions(inStream, sanitize, removeHs); + } catch (RDKit::FileParseException &e) { + BOOST_LOG(rdWarningLog) << e.what() << std::endl; + } catch (...) 
{ + } + python::list res; + for (auto &rxn : rxns) { + // take ownership of the data from the unique_ptr + res.append(std::shared_ptr(rxn.release())); + } + return python::tuple(res); +} +} + +BOOST_PYTHON_MODULE(rdChemDraw) { + python::scope().attr("__doc__") = + "Module containing classes and functions for working with ChemDraw files."; + + // Molecule Interface + std::string docString = + R"DOC(Extract all molecules from a ChemDraw file. + + Note that the ChemDraw format is large and complex, the RDKit doesn't support + full functionality, just the base ones required for molecule and + reaction parsing. + + ARGUMENTS: + + - filename: the chemdraw filename (.cdx/.cdxml) + + - sanitize: if True, sanitize the molecules [default True] + + - removeHs: if True, convert explicit Hs into implicit Hs. [default True] + + RETURNS: + a tuple of parsed Mol objects.)DOC"; + + python::def("MolsFromChemDrawFile", MolsFromChemDrawFileHelper, + (python::arg("filename"), python::arg("sanitize") = true, + python::arg("removeHs") = true), + docString.c_str()); + + docString = + R"DOC(Extract all molecules from a ChemDraw block. + + Note that the ChemDraw format is large and complex, the RDKit doesn't support + full functionality, just the base ones required for molecule and + reaction parsing. + + ARGUMENTS: + + - block: the CDX/CDXML block + + - sanitize: if True, sanitize the molecules [default True] + + - removeHs: if True, convert explicit Hs into implicit Hs. [default True] + + RETURNS: + a tuple of parsed Mol objects.)DOC"; + + python::def("MolsFromChemDrawBlock", MolsFromChemDrawBlockHelper, + (python::arg("block"), python::arg("sanitize") = true, + python::arg("removeHs") = true), + docString.c_str()); + + docString = + R"DOC(Extract all reactions from a ChemDraw file. + + Note that the ChemDraw format is large and complex, the RDKit doesn't support + full functionality, just the base ones required for molecule and + reaction parsing. 
+ + ARGUMENTS: + + - filename: the chemdraw filename (.cdx/.cdxml) + + - sanitize: if True, sanitize the molecules [default False] + + - removeHs: if True, convert explicit Hs into implicit Hs. [default False] + + RETURNS: + a tuple of parsed ChemicalReaction objects.)DOC"; + + // Reaction Interface + python::def("ReactionsFromChemDrawFile", ReactionsFromChemDrawFileHelper, + (python::arg("filename"), python::arg("sanitize") = false, + python::arg("removeHs") = false), + docString.c_str()); + + docString = + R"DOC(Extract all reactions from a ChemDraw text block. + + Note that the ChemDraw format is large and complex, the RDKit doesn't support + full functionality, just the base ones required for molecule and + reaction parsing. + + ARGUMENTS: + + - rxnblock: the CDX/CDXML block + + - sanitize: if True, sanitize the molecules [default False] + + - removeHs: if True, convert explicit Hs into implicit Hs. [default False] + + RETURNS: + a tuple of parsed ChemicalReaction objects.)DOC"; + + python::def( + "ReactionsFromChemDrawBlock", ReactionsFromChemDrawBlockHelper, + (python::arg("rxnblock"), python::arg("sanitize") = false, + python::arg("removeHs") = false), + docString.c_str()); + + + python::enum_("CDXFormat") + .value("CDX", v2::CDXFormat::CDX) + .value("CDXML", v2::CDXFormat::CDXML); + + docString = + R"DOC(Convert a molecule into a chemdraw string using the specified format + + ARGUMENTS: + + - mol: the molecule to convert + + - format: The ChemDraw format to use, CDXML/CDX [default CDXML] + + RETURNS: + a string containing the molecule in the requested ChemDraw format.)DOC"; + + python::def( + "MolToChemDrawBlock", v2::MolToChemDrawBlock, + (python::arg("mol"), python::arg("format")=v2::CDXFormat::CDXML), + docString.c_str()); +} diff --git a/External/ChemDraw/Wrap/testChemDraw.py b/External/ChemDraw/Wrap/testChemDraw.py new file mode 100644 index 000000000..c5142bd78 --- /dev/null +++ b/External/ChemDraw/Wrap/testChemDraw.py @@ -0,0 +1,277 @@ +# Copyright (c) 2025 
Glysade Inc +# All rights reserved. +# +# This file is part of the RDKit. +# The contents are covered by the terms of the BSD license +# which is included in the file license.txt, found at the root +# of the RDKit source tree. + +import copy +import os +import sys +import unittest + +from rdkit import Chem +from rdkit.Chem import rdChemDraw, rdChemDrawReaction + +class TestChemDraw(unittest.TestCase): + + def test_cdxml(self): + cdxml = """ + + + + + + + + + + + + + Boc""" + mols = rdChemDraw.MolsFromChemDraw(cdxml) + self.assertEqual(len(mols), 1) + self.assertEqual(Chem.MolToSmiles(mols[0]), "CC(C)(C)OC(=O)C1CCCCCC1") + + diff --git a/External/ChemDraw/bond.cpp b/External/ChemDraw/bond.cpp new file mode 100644 index 000000000..ce1e562dd --- /dev/null +++ b/External/ChemDraw/bond.cpp @@ -0,0 +1,227 @@ +// +// Copyright (c) 2024, Glysade Inc +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Novartis Institutes for BioMedical Research Inc. +// nor the names of its contributors may be used to endorse or promote +// products derived from this software without specific prior written +// permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//#include "node.h"
+#include "utils.h"
+#include "fragment.h"
+
+namespace RDKit {
+namespace ChemDraw {
+//! Translates one ChemDraw bond record into a bond of \c mol.
+/*!
+  \param mol         molecule the bond is added to
+  \param fragmentId  id of the fragment being parsed; only used in the error log
+  \param bond        the ChemDraw bond record
+  \param pagedata    page-level bookkeeping; its atomIds table is used to look
+                     up the two end atoms from the bond's node ids
+
+  \return false (after logging an error) when either end atom is unknown or
+          the bond order is not handled: hydrogen, three-center, half, or an
+          unrecognized value. true once the bond has been added.
+*/
+bool parseBond(RWMol &mol, unsigned int fragmentId, CDXBond &bond,
+               PageData &pagedata) {
+  int bond_id = bond.GetObjectID();
+  // NOTE(review): if atomIds is a std::map, operator[] inserts a null entry
+  // for an unknown node id as a side effect - confirm this is acceptable.
+  Atom *start_atom = pagedata.atomIds[bond.m_beginNodeID];
+  Atom *end_atom = pagedata.atomIds[bond.m_endNodeID];
+  if (!start_atom || !end_atom) {
+    BOOST_LOG(rdErrorLog) << "Bad bond in CDXML skipping fragment "
+                          << fragmentId << "..." << std::endl;
+    return false;
+  }
+
+  // Map the ChemDraw bond order. Query-style orders ("single or double", ...)
+  // additionally create a QueryBond that is added instead of a plain bond.
+  Bond::BondType order = Bond::UNSPECIFIED;
+  std::unique_ptr<QueryBond> qb;
+  switch (bond.m_bondOrder) {
+    case kCDXBondOrder_Single:
+      order = Bond::BondType::SINGLE;
+      break;
+    case kCDXBondOrder_Double:
+      order = Bond::BondType::DOUBLE;
+      break;
+    case kCDXBondOrder_Triple:
+      order = Bond::BondType::TRIPLE;
+      break;
+    case kCDXBondOrder_Quadruple:
+      order = Bond::BondType::QUADRUPLE;
+      break;
+    case kCDXBondOrder_Quintuple:
+      order = Bond::BondType::QUINTUPLE;
+      break;
+    case kCDXBondOrder_Sextuple:
+      order = Bond::BondType::HEXTUPLE;
+      break;
+    case kCDXBondOrder_OneHalf:
+      order = Bond::BondType::AROMATIC;
+      start_atom->setIsAromatic(true);
+      end_atom->setIsAromatic(true);
+      break;
+    case kCDXBondOrder_TwoHalf:
+      order = Bond::BondType::TWOANDAHALF;
+      break;
+    case kCDXBondOrder_ThreeHalf:
+      order = Bond::BondType::THREEANDAHALF;
+      break;
+    case kCDXBondOrder_FourHalf:
+      order = Bond::BondType::FOURANDAHALF;
+      break;
+    case kCDXBondOrder_FiveHalf:
+      order = Bond::BondType::FIVEANDAHALF;
+      break;
+    case kCDXBondOrder_Dative:
+      order = Bond::BondType::DATIVE;
+      break;
+    case kCDXBondOrder_Ionic:
+      order = Bond::BondType::IONIC;
+      break;
+    case kCDXBondOrder_SingleOrDouble:
+      order = Bond::BondType::SINGLE;
+      qb = std::make_unique<QueryBond>();
+      qb->setQuery(makeSingleOrDoubleBondQuery());
+      break;
+    case kCDXBondOrder_SingleOrAromatic:
+      order = Bond::BondType::SINGLE;
+      qb = std::make_unique<QueryBond>();
+      qb->setQuery(makeSingleOrAromaticBondQuery());
+      break;
+    case kCDXBondOrder_DoubleOrAromatic:
+      order = Bond::BondType::DOUBLE;
+      qb = std::make_unique<QueryBond>();
+      qb->setQuery(makeDoubleOrAromaticBondQuery());
+      break;
+    case kCDXBondOrder_Any:
+      // order stays UNSPECIFIED; the null query matches any bond
+      qb = std::make_unique<QueryBond>();
+      qb->setQuery(makeBondNullQuery());
+      break;
+    case kCDXBondOrder_Hydrogen:
+      BOOST_LOG(rdErrorLog)
+          << "Unhandled bond order Hydrogen, skipping fragment" << std::endl;
+      return false;
+    case kCDXBondOrder_ThreeCenter:
+      BOOST_LOG(rdErrorLog)
+          << "Unhandled bond order ThreeCenter, skipping fragment" << std::endl;
+      return false;
+    case kCDXBondOrder_Half:
+      BOOST_LOG(rdErrorLog)
+          << "Unhandled bond order Half, skipping fragment" << std::endl;
+      return false;
+    default:
+      BOOST_LOG(rdErrorLog) << "Bad bond, skipping fragment" << std::endl;
+      return false;
+  }
+
+  // The RDKit only supports one direction for wedges, so a bond that is
+  // wedged (or hash-wedged) at its end atom is added with its atoms swapped.
+  // Every other display type keeps the atom order of the ChemDraw record.
+  const bool swap_bond_ends = bond.m_display == kCDXBondDisplay_WedgedHashEnd ||
+                              bond.m_display == kCDXBondDisplay_WedgeEnd;
+
+  unsigned int bondIdx = 0;
+  auto startIdx = start_atom->getIdx();
+  auto endIdx = end_atom->getIdx();
+  if (swap_bond_ends) {
+    std::swap(startIdx, endIdx);
+  }
+
+  // addBond() returns the new number of bonds, hence the "- 1".
+  if (qb) {
+    qb->setBeginAtomIdx(startIdx);
+    qb->setEndAtomIdx(endIdx);
+    // ownership of the query bond is handed over to the molecule
+    bondIdx = mol.addBond(qb.release(), true) - 1;
+  } else {
+    bondIdx = mol.addBond(startIdx, endIdx, order) - 1;
+  }
+
+  Bond *bnd = mol.getBondWithIdx(bondIdx);
+  if (order == Bond::BondType::AROMATIC) {
+    bnd->setIsAromatic(true);
+    bnd->getBeginAtom()->setIsAromatic(true);
+    bnd->getEndAtom()->setIsAromatic(true);
+  }
+  bnd->setProp(CDX_BOND_ID, bond.GetObjectID());
+
+  // Record the drawn stereo information. After the swap above, "Begin" and
+  // "End" wedges are equivalent.
+  switch (bond.m_display) {
+    case kCDXBondDisplay_WedgedHashBegin:
+    case kCDXBondDisplay_WedgedHashEnd:
+      bnd->setBondDir(Bond::BondDir::BEGINDASH);
+      bnd->setProp(common_properties::_MolFileBondCfg, 3);
+      break;
+    case kCDXBondDisplay_WedgeBegin:
+    case kCDXBondDisplay_WedgeEnd:
+      bnd->setBondDir(Bond::BondDir::BEGINWEDGE);
+      bnd->setProp(common_properties::_MolFileBondCfg, 1);
+      break;
+    case kCDXBondDisplay_Wavy:
+      switch (order) {
+        case Bond::BondType::SINGLE:
+          bnd->setBondDir(Bond::BondDir::UNKNOWN);
+          bnd->setProp(common_properties::_MolFileBondCfg, 2);
+          break;
+        case Bond::BondType::DOUBLE:
+          bnd->setBondDir(Bond::BondDir::EITHERDOUBLE);
+          bnd->setStereo(Bond::STEREOANY);
+          break;
+        default:
+          BOOST_LOG(rdWarningLog)
+              << "ignoring Wavy bond set on a non double bond id: " << bond_id
+              << std::endl;
+      }
+      break;
+    default:
+      // all other display types carry no stereo information
+      break;
+  }
+  return true;
+}
+}
+}  // namespace RDKit
diff --git a/External/ChemDraw/bond.h b/External/ChemDraw/bond.h
new file mode 100644
index 000000000..904f69b66
--- /dev/null
+++ b/External/ChemDraw/bond.h
@@ -0,0 +1,52 @@
+//
+// Copyright (c) 2024, Glysade Inc
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following
+// disclaimer in the documentation and/or other materials provided
+// with the distribution.
+// * Neither the name of Novartis Institutes for BioMedical Research Inc.
+// nor the names of its contributors may be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +#ifndef CHEMDRAW_BOND_H +#define CHEMDRAW_BOND_H + +#include +#include +#include + +#include "ChemDrawStartInclude.h" +#include "chemdraw/CDXStdObjects.h" +#include "ChemDrawEndInclude.h" + +#include "utils.h" +#include "fragment.h" + +namespace RDKit { +namespace ChemDraw { +bool parseBond(RWMol &mol, unsigned int fragmentId, CDXBond &bond, + PageData &pagedata); +} +} +#endif diff --git a/External/ChemDraw/bracket.cpp b/External/ChemDraw/bracket.cpp new file mode 100644 index 000000000..cf5e42968 --- /dev/null +++ b/External/ChemDraw/bracket.cpp @@ -0,0 +1,106 @@ +// +// Copyright (c) 2024, Glysade Inc +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Novartis Institutes for BioMedical Research Inc. +// nor the names of its contributors may be used to endorse or promote +// products derived from this software without specific prior written +// permission. 
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//#include "node.h"
+#include "utils.h"
+#include "bracket.h"
+
+namespace RDKit {
+namespace ChemDraw {
+// Placeholder for ChemDraw bracket handling. The bracket's attachments and
+// its usage are walked, but nothing is recorded yet: the implementation is
+// waiting on full bracket support in the rdkit or on support for expansion
+// inside the RDChemDrawLib. The page data is not used and the function always
+// returns true.
+bool parseBracket(CDXBracketedGroup &bracket, PageData & /*pagedata*/) {
+  // Visit the crossing bonds of every bracket attachment.
+  for (auto &child : bracket.ContainedObjects()) {
+    const auto childTag = static_cast<CDXDatumID>(child.second->GetTag());
+    if (childTag != kCDXObj_BracketAttachment) {
+      continue;
+    }
+    auto &attachmentNode = static_cast<CDXBracketAttachment &>(*child.second);
+    for (auto &grandchild : attachmentNode.ContainedObjects()) {
+      const auto grandchildTag =
+          static_cast<CDXDatumID>(grandchild.second->GetTag());
+      if (grandchildTag == kCDXObj_CrossingBond) {
+        // Unimplemented. The crossing bond object is expected to provide
+        //   m_bondID      - the bond that crosses the brackets
+        //   m_innerAtomID - the atom within the brackets
+      }
+    }
+  }
+
+  // Every usage is currently accepted without further action; the labels are
+  // kept so that each one can get its own handling (for example filling in a
+  // SubstanceGroup) later.
+  switch (bracket.m_usage) {
+    case kCDXBracketUsage_Unspecified:
+    case kCDXBracketUsage_Anypolymer:
+    case kCDXBracketUsage_Component:
+    case kCDXBracketUsage_Copolymer:
+    case kCDXBracketUsage_CopolymerAlternating:
+    case kCDXBracketUsage_CopolymerBlock:
+    case kCDXBracketUsage_CopolymerRandom:
+    case kCDXBracketUsage_Crosslink:
+    case kCDXBracketUsage_Generic:
+    case kCDXBracketUsage_Graft:
+    case kCDXBracketUsage_Mer:
+    case kCDXBracketUsage_MixtureOrdered:
+    case kCDXBracketUsage_MixtureUnordered:
+    case kCDXBracketUsage_Modification:
+    case kCDXBracketUsage_Monomer:  // repeat head-to-tail, head-to-head
+                                    // (check flip)
+    case kCDXBracketUsage_MultipleGroup:
+    case kCDXBracketUsage_MultipleGroupOverride:
+    case kCDXBracketUsage_SRU:  // structural repeating unit, repeat pattern
+                                // head-to-tail (default), head-to-head
+                                // (check flip?)
+    case kCDXBracketUsage_Unused1:
+    case kCDXBracketUsage_Unused2:
+      break;
+  }
+  return true;
+}
+}
+}  // namespace RDKit
diff --git a/External/ChemDraw/bracket.h b/External/ChemDraw/bracket.h
new file mode 100644
index 000000000..87fe27c51
--- /dev/null
+++ b/External/ChemDraw/bracket.h
@@ -0,0 +1,52 @@
+//
+// Copyright (c) 2024, Glysade Inc
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following
+// disclaimer in the documentation and/or other materials provided
+// with the distribution.
+// * Neither the name of Novartis Institutes for BioMedical Research Inc.
+// nor the names of its contributors may be used to endorse or promote +// products derived from this software without specific prior written +// permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +#ifndef CHEMDRAW_BRACKET_H +#define CHEMDRAW_BRACKET_H + +#include +#include +#include + +#include "ChemDrawStartInclude.h" +#include "chemdraw/CDXStdObjects.h" +#include "ChemDrawEndInclude.h" + +#include "utils.h" +#include "fragment.h" + +namespace RDKit { +namespace ChemDraw { +bool parseBracket(CDXBracketedGroup &bracket, PageData &pagedata); +} +} + +#endif diff --git a/External/ChemDraw/chemdraw.cpp b/External/ChemDraw/chemdraw.cpp new file mode 100644 index 000000000..3b1892887 --- /dev/null +++ b/External/ChemDraw/chemdraw.cpp @@ -0,0 +1,359 @@ +// +// Copyright (c) 2024 Glysade Inc and other RDkit contributors +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Novartis Institutes for BioMedical Research Inc. +// nor the names of its contributors may be used to endorse or promote +// products derived from this software without specific prior written +// permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// + +#include +#include +#include +#include + +#include "ChemDrawStartInclude.h" +#include "chemdraw/CDXMLParser.h" +#include "chemdraw/CDXStdObjects.h" +#include "ChemDrawEndInclude.h" + +#include "bracket.h" +#include "chemdraw.h" +#include "chemdraw_doc.h" +#include "fragment.h" +#include "reaction.h" +#include "utils.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// #define DEBUG 1 +namespace { +using namespace RDKit; +using namespace RDKit::v2; +using namespace RDKit::ChemDraw; +// The parsing of fragments needed to be moved to a recursive function since +// they may be embedded further in the document, i.e. 
a group may hold multiple
+// fragments
+//
+// Additionally, a grouped_fragments map is included to group fragments together
+// for the purposes of reactions.
+//
+// Ungrouped fragments will end up as vectors of size 1 in the grouped_fragment
+// list. The reaction schemes in the CDXML docs appear to use the fragment id
+// for ungrouped fragments and the grouped id for grouped fragments, so the
+// grouped_fragments holds both for ease of bookkeeping.
+//
+// Fragments are parsed, fused when flagged with NEEDS_FUSE, given a conformer
+// from their CDX_ATOM_POS properties, have stereo perceived, and are sanitized
+// when requested. Fragments that fail to parse, fuse or sanitize are skipped;
+// the fuse and sanitize failures log a warning here.
+//
+// NOTE(review): fragmentLookup and groupedFragments are filled in before
+// fusing/sanitizing, so a fragment that is skipped afterwards leaves its
+// entries behind. Confirm that set_reaction_steps() tolerates that.
+void visit_children(
+    CDXObject &node, PageData &pagedata,
+    int &missing_frag_id,  // if we don't have a fragment id, start at -1 and
+                           // decrement
+    double bondLength,  // bond length of the document for assigning coordinates
+    const ChemDrawParserParams &params,  // parser parameters
+    int group_id = -1) {  // current group id for this set of subnodes
+  MolzipParams molzip_params;
+  molzip_params.label = MolzipLabel::AtomProperty;
+  molzip_params.atomProperty = FUSE_LABEL;
+  molzip_params.enforceValenceRules = false;
+
+  for (const auto &frag : node.ContainedObjects()) {
+    const auto id = static_cast<CDXDatumID>(frag.second->GetTag());
+    if (id == kCDXObj_Fragment) {
+      std::unique_ptr<RWMol> mol = std::make_unique<RWMol>();
+      if (!parseFragment(*mol, static_cast<CDXFragment &>(*frag.second),
+                         pagedata, missing_frag_id)) {
+        continue;
+      }
+      // Missing fragment ids are negative (see missing_frag_id), so the
+      // property is read as a signed int.
+      unsigned int frag_id = mol->getProp<int>(CDX_FRAG_ID);
+      pagedata.fragmentLookup[frag_id] = pagedata.mols.size();
+      if (group_id != -1) {
+        pagedata.groupedFragments[group_id].push_back(frag_id);
+      } else {
+        pagedata.groupedFragments[frag_id].push_back(frag_id);
+      }
+
+      if (mol->hasProp(NEEDS_FUSE)) {
+        mol->clearProp(NEEDS_FUSE);
+        std::unique_ptr<ROMol> fused;
+        try {
+          replaceFragments(*mol);
+          fused = molzip(*mol, molzip_params);
+        } catch (Invar::Invariant &) {
+          BOOST_LOG(rdWarningLog) << "Failed fusion of fragment skipping... "
+                                  << frag_id << std::endl;
+          // perhaps have an option to extract all fragments?
+          // mols.push_back(std::move(mol));
+          continue;
+        }
+        fused->setProp(CDX_FRAG_ID, static_cast<int>(frag_id));
+        pagedata.mols.emplace_back(dynamic_cast<RWMol *>(fused.release()));
+      } else {
+        pagedata.mols.push_back(std::move(mol));
+      }
+      RWMol *res = pagedata.mols.back().get();
+      auto conf = std::make_unique<Conformer>(res->getNumAtoms());
+      conf->set3D(false);
+
+      bool hasConf = false;
+      bool is3D = false;
+      for (auto &atm : res->atoms()) {
+        RDGeom::Point3D p{0.0, 0.0, 0.0};
+
+        if (atm->hasProp(CDX_ATOM_POS)) {
+          hasConf = true;
+          const std::vector<double> coord =
+              atm->getProp<std::vector<double>>(CDX_ATOM_POS);
+
+          p.x = coord[0];
+          p.y = -1 * coord[1];  // CDXML uses an inverted coordinate
+                                // system, so we need to reverse that
+          if (coord.size() == 2) {
+            p.z = 0.0;
+          } else {
+            p.z = coord[2];
+            is3D = true;
+          }
+        }
+        conf->setAtomPos(atm->getIdx(), p);
+        atm->clearProp(CDX_ATOM_POS);
+      }
+
+      if (hasConf) {
+        if (!is3D) {
+          scaleBonds(*res, *conf, RDKIT_DEPICT_BONDLENGTH, bondLength);
+        }
+        conf->set3D(is3D);
+
+        auto confidx = res->addConformer(conf.release());
+
+        if (is3D) {
+          res->updatePropertyCache(false);
+          MolOps::assignChiralTypesFrom3D(*res, confidx, true);
+        } else {
+          MolOps::assignChiralTypesFromBondDirs(*res, confidx, true);
+        }
+        Atropisomers::detectAtropisomerChirality(*res,
+                                                 &res->getConformer(confidx));
+      } else {  // no Conformer
+        Atropisomers::detectAtropisomerChirality(*res, nullptr);
+      }
+
+      // now that atom stereochem has been perceived, the wedging
+      // information is no longer needed, so we clear
+      // single bond dir flags:
+      MolOps::clearSingleBondDirFlags(*res);
+
+      if (params.sanitize) {
+        try {
+          if (params.removeHs) {
+            // Bond stereo detection must happen before H removal, or
+            // else we might be removing stereogenic H atoms in double
+            // bonds (e.g. imines). But before we run stereo detection,
+            // we need to run mol cleanup so don't have trouble with
+            // e.g. nitro groups. Sadly, this also means we will
+            // run both cleanup and ring finding twice (a fast find
+            // rings in bond stereo detection, and another in
+            // sanitization's SSSR symmetrization).
+            unsigned int failedOp = 0;
+            MolOps::sanitizeMol(*res, failedOp, MolOps::SANITIZE_CLEANUP);
+            MolOps::detectBondStereochemistry(*res);
+            MolOps::removeHs(*res);
+          } else {
+            MolOps::sanitizeMol(*res);
+            MolOps::detectBondStereochemistry(*res);
+          }
+
+        } catch (...) {
+          BOOST_LOG(rdWarningLog)
+              << "CDXMLParser: failed sanitizing skipping fragment " << frag_id
+              << std::endl;
+          pagedata.mols.pop_back();
+          continue;
+        }
+        MolOps::assignStereochemistry(*res, true, true, true);
+        // Sometimes ChemDraw just marks with R and S, so let's assign
+        // these as long as they were not already determined
+        checkChemDrawTetrahedralGeometries(*res);
+      } else {
+        MolOps::detectBondStereochemistry(*res);
+      }
+    } else if (id == kCDXObj_ReactionScheme) {  // get the reaction info
+      // Only the scheme is recorded here; its steps are resolved against the
+      // parsed fragments by set_reaction_steps() once the page has been read.
+      auto &scheme = static_cast<CDXReactionScheme &>(*frag.second);
+      pagedata.schemes.emplace_back(scheme);
+    } else if (id == kCDXObj_Group) {
+      // Recurse with the group's own id. The id is passed by value instead of
+      // being written into group_id, so that siblings following this group
+      // keep the id this call was entered with.
+      auto &group = static_cast<CDXGroup &>(*frag.second);
+      visit_children(group, pagedata, missing_frag_id, bondLength, params,
+                     static_cast<int>(frag.second->GetObjectID()));
+    } else if (id == kCDXObj_BracketedGroup) {
+      auto &bracketgroup = static_cast<CDXBracketedGroup &>(*frag.second);
+      parseBracket(bracketgroup, pagedata);
+    }
+  }
+}
+
+// Reads the whole stream and parses it into a ChemDraw document.
+// Only CDXFormat::CDXML is supported at the moment.
+// Throws FileParseException when the XML cannot be parsed or when any other
+// format is requested.
+std::unique_ptr<CDXDocument> streamToCDXDocument(std::istream &inStream,
+                                                 CDXFormat format) {
+  if (format != CDXFormat::CDXML) {
+    throw FileParseException("Can't handle cdx yet");
+  }
+  CDXMLParser parser;
+  // the parser is handed the complete document in one call
+  const std::string data{std::istreambuf_iterator<char>(inStream),
+                         std::istreambuf_iterator<char>()};
+  const bool HaveAllXml = true;
+  if (XML_STATUS_OK != parser.XML_Parse(data.c_str(),
+                                        static_cast<int>(data.size()),
+                                        HaveAllXml)) {
+    auto error = XML_GetErrorCode(parser);
+    BOOST_LOG(rdErrorLog) << "Failed parsing XML with error code " << error
+                          << std::endl;
+    throw FileParseException("Bad Input File");
+  }
+
+  return parser.ReleaseDocument();
+}
+
+// Parses every page of the document read from inStream and returns the
+// molecules found. Reaction schemes found on the pages are resolved against
+// the parsed fragments before the temporary CDX properties are cleared.
+// may raise FileParseException
+std::vector<std::unique_ptr<RWMol>> molsFromCDXMLDataStream(
+    std::istream &inStream, const ChemDrawParserParams &params) {
+  std::unique_ptr<CDXDocument> document =
+      streamToCDXDocument(inStream, params.format);
+  if (!document) {
+    // error
+    return std::vector<std::unique_ptr<RWMol>>();
+  }
+  PageData pagedata;
+  auto bondLength = document->m_bondLength;
+
+  int missing_frag_id = -1;
+  for (const auto &node : document->ContainedObjects()) {
+    // only pages are of interest at the top level of the document
+    if (static_cast<CDXDatumID>(node.second->GetTag()) == kCDXObj_Page) {
+      visit_children(*node.second, pagedata, missing_frag_id, bondLength,
+                     params);
+    }
+  }
+  for (auto &scheme : pagedata.schemes) {
+    scheme.set_reaction_steps(pagedata.groupedFragments, pagedata.mols);
+  }
+  pagedata.clearCDXProps();
+
+  // mols is a member of a local, so it has to be moved explicitly
+  return std::move(pagedata.mols);
+}
+}  // namespace
+
+namespace RDKit {
+namespace ChemDraw {
+// Parses the stream into a ChemDraw document; throws FileParseException on
+// failure or for an unsupported format.
+std::unique_ptr<CDXDocument> ChemDrawToDocument(std::istream &inStream,
+                                                CDXFormat format) {
+  return streamToCDXDocument(inStream, format);
+}
+
+// Parses a file into a ChemDraw document. The format is chosen from the file
+// extension (.cdxml or .cdx, case insensitive); any other extension throws
+// FileParseException.
+std::unique_ptr<CDXDocument> ChemDrawToDocument(const std::string &filename) {
+  // read-only stream: std::fstream would also request write access and so
+  // fail on read-only files
+  std::ifstream chemdrawfile(filename);
+  const std::string rawExt = std::filesystem::path(filename).extension().string();
+  std::string ext = rawExt;
+  boost::algorithm::to_lower(ext);
+  if (ext == ".cdxml") {
+    return streamToCDXDocument(chemdrawfile, CDXFormat::CDXML);
+  }
+  if (ext == ".cdx") {
+    return streamToCDXDocument(chemdrawfile, CDXFormat::CDX);
+  }
+  const std::string msg = std::string("Unknown filetype ") + rawExt;
+  throw FileParseException(msg.c_str());
+}
+}
+
+namespace v2 {
+// Parses all molecules from a stream holding a ChemDraw document.
+// May throw FileParseException.
+std::vector<std::unique_ptr<RWMol>> MolsFromChemDrawDataStream(
+    std::istream &inStream, const ChemDrawParserParams &params) {
+  auto chemdrawmols = molsFromCDXMLDataStream(inStream, params);
+  std::vector<std::unique_ptr<RWMol>> mols;
+  mols.reserve(chemdrawmols.size());
+  for (auto &mol : chemdrawmols) {
+    mols.emplace_back(static_cast<RWMol *>(mol.release()));
+  }
+  return mols;
+}
+
+// Parses all molecules from a string holding a ChemDraw document.
+// May throw FileParseException.
+std::vector<std::unique_ptr<RWMol>> MolsFromChemDrawBlock(
+    const std::string &block, const ChemDrawParserParams &params) {
+  std::stringstream ss;
+  ss << block;
+  return MolsFromChemDrawDataStream(ss, params);
+}
+
+// Parses all molecules from a ChemDraw file.
+// Throws BadFileException when the file cannot be opened for reading and may
+// throw FileParseException while parsing.
+std::vector<std::unique_ptr<RWMol>> MolsFromChemDrawFile(
+    const std::string &filename, const ChemDrawParserParams &params) {
+  // FIX ME CHECK CDX versus CDXML: the format comes from params, the file
+  // extension is not looked at here.
+  // read-only stream: std::fstream would also request write access and so
+  // report read-only files as missing
+  std::ifstream chemdrawfile(filename);
+  if (!chemdrawfile) {
+    throw BadFileException(filename + " does not exist");
+  }
+  return MolsFromChemDrawDataStream(chemdrawfile, params);
+}
+}
+}  // namespace RDKit
diff --git a/External/ChemDraw/chemdraw.h b/External/ChemDraw/chemdraw.h
new file mode 100644
index 000000000..c4e1884d9
--- /dev/null
+++ b/External/ChemDraw/chemdraw.h
@@ -0,0 +1,69 @@
+//
+// Copyright (c) 2024, Glysade Inc
+// All rights reserved.
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Novartis Institutes for BioMedical Research Inc. +// nor the names of its contributors may be used to endorse or promote +// products derived from this software without specific prior written +// permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+//
+#ifndef RDKIT_CHEMDRAW_H
+#define RDKIT_CHEMDRAW_H
+
+// NOTE(review): in this copy of the patch the include targets below, the
+// template arguments of the std::vector return types and the '&' of the
+// "params" parameters appear to have been lost - restore them before applying.
+#include
+#include
+#include
+#include
+
+namespace RDKit {
+namespace v2 {
+// Serialization flavour of a ChemDraw document.
+// The parser in chemdraw.cpp only reads CDXML; asking it for CDX throws
+// FileParseException("Can't handle cdx yet").
+enum class CDXFormat {
+  CDX = 1,
+  CDXML = 2
+};
+
+// Options for the MolsFromChemDraw* functions.
+struct RDKIT_RDCHEMDRAWLIB_EXPORT ChemDrawParserParams {
+  // sanitize each parsed fragment; a fragment that fails is dropped with a
+  // warning
+  bool sanitize = true;
+  // remove hydrogens after bond stereo detection; only acted on when
+  // sanitize is true
+  bool removeHs = true;
+  // format of the input
+  CDXFormat format = CDXFormat::CDXML;
+};
+
+// Parses all molecules from a stream holding a ChemDraw document.
+// May throw FileParseException.
+std::vector> RDKIT_RDCHEMDRAWLIB_EXPORT
+MolsFromChemDrawDataStream(std::istream &inStream,
+                           const ChemDrawParserParams ¶ms = ChemDrawParserParams());
+
+// Parses all molecules from a ChemDraw file.
+// Throws BadFileException when the file cannot be opened; may throw
+// FileParseException while parsing.
+std::vector> RDKIT_RDCHEMDRAWLIB_EXPORT
+MolsFromChemDrawFile(const std::string &filename,
+                     const ChemDrawParserParams ¶ms = ChemDrawParserParams());
+
+// Parses all molecules from a string holding a ChemDraw document.
+// May throw FileParseException.
+std::vector> RDKIT_RDCHEMDRAWLIB_EXPORT
+MolsFromChemDrawBlock(const std::string &block,
+                      const ChemDrawParserParams ¶ms = ChemDrawParserParams());
+
+// Presumably writes mol as a ChemDraw document in the requested format and
+// returns it as a string - the implementation is not part of this header,
+// TODO confirm.
+std::string RDKIT_RDCHEMDRAWLIB_EXPORT
+MolToChemDrawBlock(const ROMol &mol, CDXFormat format = CDXFormat::CDXML);
+}
+}  // namespace RDKit
+#endif
diff --git a/External/ChemDraw/chemdraw_doc.h b/External/ChemDraw/chemdraw_doc.h
new file mode 100644
index 000000000..370b0b136
--- /dev/null
+++ b/External/ChemDraw/chemdraw_doc.h
@@ -0,0 +1,51 @@
+
+//
+// Copyright (c) 2025, Glysade Inc
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following
+// disclaimer in the documentation and/or other materials provided
+// with the distribution.
+// * Neither the name of Novartis Institutes for BioMedical Research Inc.
+// nor the names of its contributors may be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +#ifndef RDKIT_CHEMDRAW_DOC_H +#define RDKIT_CHEMDRAW_DOC_H + +#include "chemdraw.h" +#include "ChemDrawStartInclude.h" +#include "chemdraw/CDXStdObjects.h" +#include "ChemDrawEndInclude.h" + +namespace RDKit { +namespace ChemDraw { +std::unique_ptr RDKIT_RDCHEMDRAWLIB_EXPORT +ChemDrawToDocument(std::istream &inStream, v2::CDXFormat format); + +std::unique_ptr RDKIT_RDCHEMDRAWLIB_EXPORT +ChemDrawToDocument(const std::string &filename); + +} +} // namespace RDKit +#endif diff --git a/External/ChemDraw/chemdrawreaction.cpp b/External/ChemDraw/chemdrawreaction.cpp new file mode 100644 index 000000000..6a45ac3e4 --- /dev/null +++ b/External/ChemDraw/chemdrawreaction.cpp @@ -0,0 +1,167 @@ +// +// Copyright (c) 2024, Glysade Inc +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Novartis Institutes for BioMedical Research Inc. +// nor the names of its contributors may be used to endorse or promote +// products derived from this software without specific prior written +// permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// + +#include "chemdraw.h" +#include "chemdrawreaction.h" +#include "reaction.h" +#include "utils.h" + +#include +#include +#include +#include + +namespace RDKit { +using namespace RDKit::v2; +using namespace RDKit::ChemDraw; + +// ChemDraw reaction API +// Convert reaction information to RDKIT reactions +namespace { +void make_query_atoms(RWMol &mol) { + for (auto &atom : mol.atoms()) { + QueryOps::replaceAtomWithQueryAtom(&mol, atom); + } +} + +void add_template(const std::string &prop, std::map &templates, + std::unique_ptr &mol) { + auto reactant_idx = mol->getProp(prop); + if (templates.find(reactant_idx) != templates.end()) { + templates[reactant_idx] = + ROMOL_SPTR(combineMols(*templates[reactant_idx], *mol)); + } else { + templates[reactant_idx] = ROMOL_SPTR(std::move(mol)); + } +} +} // namespace + +namespace v2 { +//! Parse a text stream with ChemDraw data into a ChemicalReaction +std::vector> +ChemDrawDataStreamToChemicalReactions(std::istream &inStream, bool sanitize, + bool removeHs) { + ChemDrawParserParams params; + params.sanitize = sanitize; + params.removeHs = removeHs; + auto mols = MolsFromChemDrawDataStream(inStream, params); + std::vector> result; + + std::map, std::vector> + schemes; + std::set used; + std::map reactant_templates; + std::map product_templates; + std::map agent_templates; + + for (size_t i = 0; i < mols.size(); ++i) { + unsigned int step = 0; + unsigned int scheme = 0; + if (mols[i]->getPropIfPresent(CDX_SCHEME_ID, scheme) && + mols[i]->getPropIfPresent(CDX_STEP_ID, step)) { + auto schemestep = std::pair(scheme, step); + schemes[schemestep].push_back(i); + } + } + if (schemes.empty()) { + return result; + } + for (const auto &scheme : schemes) { + // convert atoms to queries: + ChemicalReaction *res = new ChemicalReaction; + result.push_back(std::unique_ptr(res)); + for (auto idx : scheme.second) { + CHECK_INVARIANT( + used.find(idx) == used.end(), + "Fragment used in twice in one or more reactions, this shouldn't happen"); 
+ if (mols[idx]->hasProp(CDX_REAGENT_ID)) { + used.insert(idx); + make_query_atoms(*mols[idx]); + add_template(CDX_REAGENT_ID, reactant_templates, mols[idx]); + } else if (mols[idx]->hasProp(CDX_AGENT_ID)) { + used.insert(idx); + make_query_atoms(*mols[idx]); + add_template(CDX_AGENT_ID, agent_templates, mols[idx]); + } else if (mols[idx]->hasProp(CDX_PRODUCT_ID)) { + used.insert(idx); + make_query_atoms(*mols[idx]); + add_template(CDX_PRODUCT_ID, product_templates, mols[idx]); + } + } + for (auto reactant : reactant_templates) { + res->addReactantTemplate(reactant.second); + } + for (auto reactant : agent_templates) { + res->addAgentTemplate(reactant.second); + } + for (auto reactant : product_templates) { + res->addProductTemplate(reactant.second); + } + updateProductsStereochem(res); + // ChemDraw-based reactions do not have implicit properties + res->setImplicitPropertiesFlag(false); + + if (!sanitize) { // we still need to fix the reaction for smarts style + // matching + unsigned int failed; + RxnOps::sanitizeRxn( + *res, failed, + RxnOps::SANITIZE_ADJUST_REACTANTS | RxnOps::SANITIZE_ADJUST_PRODUCTS, + RxnOps::MatchOnlyAtRgroupsAdjustParams()); + } + } + return result; +} + +std::vector> ChemDrawToChemicalReactions( + const std::string &rxnBlock, bool sanitize, bool removeHs) { + std::istringstream inStream(rxnBlock); + return ChemDrawDataStreamToChemicalReactions(inStream, sanitize, removeHs); +} + +std::vector> ChemDrawFileToChemicalReactions( + const std::string &fName, bool sanitize, bool removeHs) { + std::ifstream inStream(fName.c_str()); + std::vector> res; + ; + + if (!inStream || inStream.bad()) { + return res; + } + if (!inStream.eof()) { + return ChemDrawDataStreamToChemicalReactions(inStream, sanitize, removeHs); + } + return res; +} + +} +} // namespace RDKit diff --git a/External/ChemDraw/chemdrawreaction.h b/External/ChemDraw/chemdrawreaction.h new file mode 100644 index 000000000..809a154f9 --- /dev/null +++ 
b/External/ChemDraw/chemdrawreaction.h @@ -0,0 +1,63 @@ +// +// Copyright (c) 2025, Glysade Inc +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Novartis Institutes for BioMedical Research Inc. +// nor the names of its contributors may be used to endorse or promote +// products derived from this software without specific prior written +// permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +#ifndef RDKIT_CHEMDRAW_REACTION_H +#define RDKIT_CHEMDRAW_REACTION_H + +#include +#include +#include +#include + +namespace RDKit +{ +namespace v2 { +//--------------------------------------------------------------------------- +//! \name Chemdraw rxn Support +///@{ + +//! 
Parse text in ChemDraw rxn format into a vector of ChemicalReactions +RDKIT_RDCHEMDRAWREACTIONLIB_EXPORT std::vector> +ChemDrawToChemicalReactions(const std::string &rxnBlock, bool sanitize = false, + bool removeHs = false); +//! Parse a file in ChemDraw rxn format into a vector of ChemicalReactions +RDKIT_RDCHEMDRAWREACTIONLIB_EXPORT std::vector> +ChemDrawFileToChemicalReactions(const std::string &fileName, bool sanitize = false, + bool removeHs = false); +//! Parse a text stream in ChemDraw rxn format into a vector of ChemicalReactions +RDKIT_RDCHEMDRAWREACTIONLIB_EXPORT std::vector> +ChemDrawDataStreamToChemicalReactions(std::istream &rxnStream, + bool sanitize = false, + bool removeHs = false); + +} +} // namespace RDKit +#endif diff --git a/External/ChemDraw/fragment.cpp b/External/ChemDraw/fragment.cpp new file mode 100644 index 000000000..b459a1529 --- /dev/null +++ b/External/ChemDraw/fragment.cpp @@ -0,0 +1,586 @@ +// +// Copyright (c) 2024, Glysade Inc +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Novartis Institutes for BioMedical Research Inc. +// nor the names of its contributors may be used to endorse or promote +// products derived from this software without specific prior written +// permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +#include "fragment.h" +#include "bond.h" +#include "node.h" + +namespace RDKit { +namespace ChemDraw { +namespace { +const char *sequenceTypeToName(CDXSeqType seqtype) { + switch (seqtype) { + case kCDXSeqType_Unknown: + return "Unknown"; + case kCDXSeqType_Peptide: + return "Peptide (Helm)"; // HELM peptides + case kCDXSeqType_Peptide1: + return "Peptide1 (Single Letter Amino Acid)"; // Single letter amino + // acids (Legacy biopolymer + // support) + case kCDXSeqType_Peptide3: + return "Peptide3 (Three letter amino acid)"; // Three letter amino acids + // (Legacy biopolymer + // support) + case kCDXSeqType_DNA: + return "DNA"; + case kCDXSeqType_RNA: + return "RNA"; + case kCDXSeqType_Biopolymer: + return "Biopolymer"; + default: + return ""; + } +} +} // namespace +bool parseFragment(RWMol &mol, CDXFragment &fragment, PageData &pagedata, + int &missingFragId, int externalAttachment) { + int frag_id = fragment.GetObjectID(); + if (fragment.m_sequenceType != kCDXSeqType_Unknown) { + BOOST_LOG(rdWarningLog) + << "Unhandled chemdraw sequence type " + << sequenceTypeToName(fragment.m_sequenceType) << std::endl; + return false; + } + if (frag_id == -1) { + // ChemDraw simply 
assigns a new one + BOOST_LOG(rdWarningLog) + << "Invalid or missing fragment id from CDXML fragment, assigning new one..." + << std::endl; + frag_id = missingFragId; + missingFragId--; + } + mol.setProp(CDX_FRAG_ID, frag_id); + + // for atom in frag + std::map, StereoGroupInfo> sgroups; + + // nodetypes = + // https://www.cambridgesoft.com/services/documentation/sdk/chemdraw/cdx/properties/Node_Type.htm + bool skip_fragment = + false; // is there an irrecoverable error for this fragment + + for (auto child : fragment.ContainedObjects()) { + CDXDatumID id = (CDXDatumID)child.second->GetTag(); +#ifdef DEBUG + std::cerr << "Data Type: " << id << std::endl; +#endif + switch (id) { + case kCDXObj_Node: { + CDXNode &node = (CDXNode &)(*child.second); + if (!parseNode(mol, frag_id, node, pagedata, sgroups, missingFragId, + externalAttachment)) { + skip_fragment = true; + } + break; + } + case kCDXObj_Bond: { + CDXBond &bond = (CDXBond &)(*child.second); + if (!parseBond(mol, frag_id, bond, pagedata)) { + skip_fragment = true; + break; + } + } + case kCDXProp_EndObject: break; + case kCDXProp_CreationUserName: break; + case kCDXProp_CreationDate: break; + case kCDXProp_CreationProgram: break; + case kCDXProp_ModificationUserName: break; + case kCDXProp_ModificationDate: break; + case kCDXProp_ModificationProgram: break; + case kCDXProp_Unused1: break; + case kCDXProp_Name: break; + case kCDXProp_Comment: break; + case kCDXProp_ZOrder: break; + case kCDXProp_RegistryNumber: break; + case kCDXProp_RegistryAuthority: break; + case kCDXProp_Unused2: break; + case kCDXProp_RepresentsProperty: break; + case kCDXProp_IgnoreWarnings: break; + case kCDXProp_ChemicalWarning: break; + case kCDXProp_Visible: break; + case kCDXProp_Transparent: break; + case kCDXProp_SupersededBy: break; + case kCDXProp_StructurePerspective: break; + case kCDXProp_FontTable: break; + case kCDXProp_2DPosition: break; + case kCDXProp_3DPosition: break; + case kCDXProp_2DExtent: break; + case 
kCDXProp_3DExtent: break; + case kCDXProp_BoundingBox: break; + case kCDXProp_RotationAngle: break; + case kCDXProp_BoundsInParent: break; + case kCDXProp_3DHead: break; + case kCDXProp_3DTail: break; + case kCDXProp_TopLeft: break; + case kCDXProp_TopRight: break; + case kCDXProp_BottomRight: break; + case kCDXProp_BottomLeft: break; + case kCDXProp_3DCenter: break; + case kCDXProp_3DMajorAxisEnd: break; + case kCDXProp_3DMinorAxisEnd: break; + case kCDXProp_ColorTable: break; + case kCDXProp_ForegroundColor: break; + case kCDXProp_BackgroundColor: break; + case kCDXProp_FadePercent: break; + case kCDXProp_Unused8: break; + case kCDXProp_Unused9: break; + case kCDXProp_ForegroundAlpha: break; + case kCDXProp_BackgroundAlpha: break; + case kCDXProp_HighlightColor: break; + case kCDXProp_Node_Type: break; + case kCDXProp_Node_LabelDisplay: break; + case kCDXProp_Node_Element: break; + case kCDXProp_Atom_ElementList: break; + case kCDXProp_Atom_Formula: break; + case kCDXProp_Atom_Isotope: break; + case kCDXProp_Atom_Charge: break; + case kCDXProp_Atom_Radical: break; + case kCDXProp_Atom_RestrictFreeSites: break; + case kCDXProp_Atom_RestrictImplicitHydrogens: break; + case kCDXProp_Atom_RestrictRingBondCount: break; + case kCDXProp_Atom_RestrictUnsaturatedBonds: break; + case kCDXProp_Atom_RestrictRxnChange: break; + case kCDXProp_Atom_RestrictRxnStereo: break; + case kCDXProp_Atom_AbnormalValence: break; + case kCDXProp_Unused3: break; + case kCDXProp_Atom_NumHydrogens: break; + case kCDXProp_Unused4: break; + case kCDXProp_Unused5: break; + case kCDXProp_Atom_HDot: break; + case kCDXProp_Atom_HDash: break; + case kCDXProp_Atom_Geometry: break; + case kCDXProp_Atom_BondOrdering: break; + case kCDXProp_Node_Attachments: break; + case kCDXProp_Atom_GenericNickname: break; + case kCDXProp_Atom_AltGroupID: break; + case kCDXProp_Atom_RestrictSubstituentsUpTo: break; + case kCDXProp_Atom_RestrictSubstituentsExactly: break; + case kCDXProp_Atom_CIPStereochemistry: 
break; + case kCDXProp_Atom_Translation: break; + case kCDXProp_Atom_AtomNumber: break; + case kCDXProp_Atom_ShowQuery: break; + case kCDXProp_Atom_ShowStereo: break; + case kCDXProp_Atom_ShowAtomNumber: break; + case kCDXProp_Atom_LinkCountLow: break; + case kCDXProp_Atom_LinkCountHigh: break; + case kCDXProp_Atom_IsotopicAbundance: break; + case kCDXProp_Atom_ExternalConnectionType: break; + case kCDXProp_Atom_GenericList: break; + case kCDXProp_Atom_ShowTerminalCarbonLabels: break; + case kCDXProp_Atom_ShowNonTerminalCarbonLabels: break; + case kCDXProp_Atom_HideImplicitHydrogens: break; + case kCDXProp_Atom_ShowEnhancedStereo: break; + case kCDXProp_Atom_EnhancedStereoType: break; + case kCDXProp_Atom_EnhancedStereoGroupNum: break; + case kCDXProp_Node_NeedsClean: break; + case kCDXProp_Atom_ResidueID: break; + case kCDXProp_Atom_ShowResidueID: break; + case kCDXProp_Atom_ExternalConnectionNum: break; + case kCDXProp_Atom_ShowAtomID: break; + case kCDXProp_Atom_AtomID: break; + case kCDXProp_Node_HydrogenBondAttachmentAtoms: break; + case kCDXProp_Node_HydrogenBonds: break; + case kCDXProp_Mole_Racemic: break; + case kCDXProp_Mole_Absolute: break; + case kCDXProp_Mole_Relative: break; + case kCDXProp_Mole_Formula: break; + case kCDXProp_Mole_Weight: break; + case kCDXProp_Frag_ConnectionOrder: break; + case kCDXProp_Frag_SequenceType: break; + case kCDXProp_Frag_IsFromGuidedStereo: break; + case kCDXProp_Frag_IsComplement: break; + case kCDXProp_Bond_Order: break; + case kCDXProp_Bond_Display: break; + case kCDXProp_Bond_Display2: break; + case kCDXProp_Bond_DoublePosition: break; + case kCDXProp_Bond_Begin: break; + case kCDXProp_Bond_End: break; + case kCDXProp_Bond_RestrictTopology: break; + case kCDXProp_Bond_RestrictRxnParticipation: break; + case kCDXProp_Bond_BeginAttach: break; + case kCDXProp_Bond_EndAttach: break; + case kCDXProp_Bond_CIPStereochemistry: break; + case kCDXProp_Bond_BondOrdering: break; + case kCDXProp_Bond_ShowQuery: break; + case 
kCDXProp_Bond_ShowStereo: break; + case kCDXProp_Bond_CrossingBonds: break; + case kCDXProp_Bond_ShowRxn: break; + case kCDXProp_Bond_Connectivity: break; + case kCDXProp_Bond_BeginExternalNum: break; + case kCDXProp_Bond_EndExternalNum: break; + case kCDXProp_Bond_Connectivity_Routed: break; + case kCDXProp_Text: break; + case kCDXProp_Justification: break; + case kCDXProp_LineHeight: break; + case kCDXProp_WordWrapWidth: break; + case kCDXProp_LineStarts: break; + case kCDXProp_LabelAlignment: break; + case kCDXProp_LabelLineHeight: break; + case kCDXProp_CaptionLineHeight: break; + case kCDXProp_InterpretChemically: break; + case kCDXProp_UTF8Text: break; + case kCDXProp_MacPrintInfo: break; + case kCDXProp_WinPrintInfo: break; + case kCDXProp_PrintMargins: break; + case kCDXProp_ChainAngle: break; + case kCDXProp_BondSpacing: break; + case kCDXProp_BondLength: break; + case kCDXProp_BoldWidth: break; + case kCDXProp_LineWidth: break; + case kCDXProp_MarginWidth: break; + case kCDXProp_HashSpacing: break; + case kCDXProp_LabelStyle: break; + case kCDXProp_CaptionStyle: break; + case kCDXProp_CaptionJustification: break; + case kCDXProp_FractionalWidths: break; + case kCDXProp_Magnification: break; + case kCDXProp_WidthPages: break; + case kCDXProp_HeightPages: break; + case kCDXProp_DrawingSpaceType: break; + case kCDXProp_Width: break; + case kCDXProp_Height: break; + case kCDXProp_PageOverlap: break; + case kCDXProp_Header: break; + case kCDXProp_HeaderPosition: break; + case kCDXProp_Footer: break; + case kCDXProp_FooterPosition: break; + case kCDXProp_PrintTrimMarks: break; + case kCDXProp_LabelStyleFont: break; + case kCDXProp_CaptionStyleFont: break; + case kCDXProp_LabelStyleSize: break; + case kCDXProp_CaptionStyleSize: break; + case kCDXProp_LabelStyleFace: break; + case kCDXProp_CaptionStyleFace: break; + case kCDXProp_LabelStyleColor: break; + case kCDXProp_CaptionStyleColor: break; + case kCDXProp_BondSpacingAbs: break; + case 
kCDXProp_LabelJustification: break; + case kCDXProp_FixInplaceExtent: break; + case kCDXProp_Side: break; + case kCDXProp_FixInplaceGap: break; + case kCDXProp_CartridgeData: break; + case kCDXProp_AminoAcidTermini: break; + case kCDXProp_ShowSequenceTermini: break; + case kCDXProp_ShowSequenceBonds: break; + case kCDXProp_ResidueWrapCount: break; + case kCDXProp_ResidueBlockCount: break; + case kCDXProp_Unused10: break; + case kCDXProp_Unused11: break; + case kCDXProp_BondSpacingType: break; + case kCDXProp_LabelStyleFontName: break; + case kCDXProp_CaptionStyleFontName: break; + case kCDXProp_ShowSequenceUnlinkedBranches: break; + case kCDXProp_MonomerRenderingStyle: break; + case kCDXProp_Window_IsZoomed: break; + case kCDXProp_Window_Position: break; + case kCDXProp_Window_Size: break; + case kCDXProp_Graphic_Type: break; + case kCDXProp_Line_Type: break; + case kCDXProp_Arrow_Type: break; + case kCDXProp_Rectangle_Type: break; + case kCDXProp_Oval_Type: break; + case kCDXProp_Orbital_Type: break; + case kCDXProp_Bracket_Type: break; + case kCDXProp_Symbol_Type: break; + case kCDXProp_Curve_Type: break; + case kCDXProp_Arrowhead_Size: break; + case kCDXProp_Arc_AngularSize: break; + case kCDXProp_Bracket_LipSize: break; + case kCDXProp_Curve_Points: break; + case kCDXProp_Bracket_Usage: break; + case kCDXProp_Polymer_RepeatPattern: break; + case kCDXProp_Polymer_FlipType: break; + case kCDXProp_BracketedObjects: break; + case kCDXProp_Bracket_RepeatCount: break; + case kCDXProp_Bracket_ComponentOrder: break; + case kCDXProp_Bracket_SRULabel: break; + case kCDXProp_Bracket_GraphicID: break; + case kCDXProp_Bracket_BondID: break; + case kCDXProp_Bracket_InnerAtomID: break; + case kCDXProp_Curve_Points3D: break; + case kCDXProp_Arrowhead_Type: break; + case kCDXProp_Arrowhead_CenterSize: break; + case kCDXProp_Arrowhead_Width: break; + case kCDXProp_ShadowSize: break; + case kCDXProp_Arrow_ShaftSpacing: break; + case kCDXProp_Arrow_EquilibriumRatio: break; + case 
kCDXProp_Arrowhead_Head: break; + case kCDXProp_Arrowhead_Tail: break; + case kCDXProp_Fill_Type: break; + case kCDXProp_Curve_Spacing: break; + case kCDXProp_Closed: break; + case kCDXProp_Arrow_Dipole: break; + case kCDXProp_Arrow_NoGo: break; + case kCDXProp_CornerRadius: break; + case kCDXProp_Frame_Type: break; + case kCDXProp_Arrow_SourceID: break; + case kCDXProp_Arrow_TargetID: break; + case kCDXProp_Arrow_IsSmart_Deleted: break; + case kCDXProp_Picture_Edition: break; + case kCDXProp_Picture_EditionAlias: break; + case kCDXProp_MacPICT: break; + case kCDXProp_WindowsMetafile: break; + case kCDXProp_OLEObject: break; + case kCDXProp_EnhancedMetafile: break; + case kCDXProp_Compressed_MacPICT: break; + case kCDXProp_Compressed_WindowsMetafile: break; + case kCDXProp_Compressed_OLEObject: break; + case kCDXProp_Compressed_EnhancedMetafile: break; + case kCDXProp_Uncompressed_MacPICT_Size: break; + case kCDXProp_Uncompressed_WindowsMetafile_Size: break; + case kCDXProp_Uncompressed_OLEObject_Size: break; + case kCDXProp_Uncompressed_EnhancedMetafile_Size: break; + case kCDXProp_GIF: break; + case kCDXProp_TIFF: break; + case kCDXProp_PNG: break; + case kCDXProp_JPEG: break; + case kCDXProp_BMP: break; + case kCDXProp_PDF: break; + case kCDXProp_Spectrum_XSpacing: break; + case kCDXProp_Spectrum_XLow: break; + case kCDXProp_Spectrum_XType: break; + case kCDXProp_Spectrum_YType: break; + case kCDXProp_Spectrum_XAxisLabel: break; + case kCDXProp_Spectrum_YAxisLabel: break; + case kCDXProp_Spectrum_DataPoint: break; + case kCDXProp_Spectrum_Class: break; + case kCDXProp_Spectrum_YLow: break; + case kCDXProp_Spectrum_YScale: break; + case kCDXProp_TLC_OriginFraction: break; + case kCDXProp_TLC_SolventFrontFraction: break; + case kCDXProp_TLC_ShowOrigin: break; + case kCDXProp_TLC_ShowSolventFront: break; + case kCDXProp_ShowBorders: break; + case kCDXProp_TLC_ShowSideTicks: break; + case kCDXProp_TLC_Rf: break; + case kCDXProp_TLC_Tail: break; + case 
kCDXProp_TLC_ShowRf: break; + case kCDXProp_GEP_ShowScale: break; + case kCDXProp_GEP_ScaleUnit: break; + case kCDXProp_GEP_StartRange: break; + case kCDXProp_GEP_EndRange: break; + case kCDXProp_GEP_ShowValue: break; + case kCDXProp_GEP_Value: break; + case kCDXProp_GEP_LaneLabelsAngle: break; + case kCDXProp_GEP_AxisWidth: break; + case kCDXProp_BioShape_Type: break; + case kCDXProp_1SubstrateEnzyme_ReceptorSize: break; + case kCDXProp_Receptor_NeckWidth: break; + case kCDXProp_HelixProtein_CylinderWidth: break; + case kCDXProp_HelixProtein_CylinderHeight: break; + case kCDXProp_HelixProtein_CylinderDistance: break; + case kCDXProp_HelixProtein_PipeWidth: break; + case kCDXProp_HelixProtein_Extra: break; + case kCDXProp_Membrane_ElementSize: break; + case kCDXProp_Membrane_StartAngle: break; + case kCDXProp_Membrane_EndAngle: break; + case kCDXProp_DNA_WaveLength: break; + case kCDXProp_DNA_WaveWidth: break; + case kCDXProp_DNA_Offset: break; + case kCDXProp_DNA_WaveHeight: break; + case kCDXProp_Gprotein_UpperHeight: break; + case kCDXProp_NamedAlternativeGroup_TextFrame: break; + case kCDXProp_NamedAlternativeGroup_GroupFrame: break; + case kCDXProp_NamedAlternativeGroup_Valence: break; + case kCDXProp_GeometricFeature: break; + case kCDXProp_RelationValue: break; + case kCDXProp_BasisObjects: break; + case kCDXProp_ConstraintType: break; + case kCDXProp_ConstraintMin: break; + case kCDXProp_ConstraintMax: break; + case kCDXProp_IgnoreUnconnectedAtoms: break; + case kCDXProp_DihedralIsChiral: break; + case kCDXProp_PointIsDirected: break; + case kCDXProp_ChemicalPropertyType: break; + case kCDXProp_ChemicalPropertyDisplayID: break; + case kCDXProp_ChemicalPropertyIsActive: break; + case kCDXProp_ChemicalPropertyUnknown: break; + case kCDXProp_ChemicalPropertyName: break; + case kCDXProp_ChemicalPropertyFormula: break; + case kCDXProp_ChemicalPropertyExactMass: break; + case kCDXProp_ChemicalPropertyMolWeight: break; + case kCDXProp_ChemicalPropertyMOverZ: 
break; + case kCDXProp_ChemicalPropertyAnalysis: break; + case kCDXProp_ChemicalPropertyBoilingPoint: break; + case kCDXProp_ChemicalPropertyMeltingPoint: break; + case kCDXProp_ChemicalPropertyCriticalTemp: break; + case kCDXProp_ChemicalPropertyCriticalPressure: break; + case kCDXProp_ChemicalPropertyCriticalVolume: break; + case kCDXProp_ChemicalPropertyGibbsEnergy: break; + case kCDXProp_ChemicalPropertyLogP: break; + case kCDXProp_ChemicalPropertyMR: break; + case kCDXProp_ChemicalPropertyHenrysLaw: break; + case kCDXProp_ChemicalPropertyHeatOfForm: break; + case kCDXProp_ChemicalPropertytPSA: break; + case kCDXProp_ChemicalPropertyCLogP: break; + case kCDXProp_ChemicalPropertyCMR: break; + case kCDXProp_ChemicalPropertyLogS: break; + case kCDXProp_ChemicalPropertyPKa: break; + case kCDXProp_ChemicalPropertyID: break; + case kCDXProp_ChemicalPropertyFragmentLabel: break; + case kCDXProp_ChemicalPropertyTypeIUPACAtomNumber: break; + case kCDXProp_ChemicalPropertyIsChemicallySignificant: break; + case kCDXProp_ChemicalPropertyExternalBonds: break; + case kCDXProp_ReactionStep_Atom_Map: break; + case kCDXProp_ReactionStep_Reactants: break; + case kCDXProp_ReactionStep_Products: break; + case kCDXProp_ReactionStep_Plusses: break; + case kCDXProp_ReactionStep_Arrows: break; + case kCDXProp_ReactionStep_ObjectsAboveArrow: break; + case kCDXProp_ReactionStep_ObjectsBelowArrow: break; + case kCDXProp_ReactionStep_Atom_Map_Manual: break; + case kCDXProp_ReactionStep_Atom_Map_Auto: break; + case kCDXProp_RxnAutonumber_Style: break; + case kCDXProp_RxnAutonumber_Conditions: break; + case kCDXProp_RxnAutonumber_Start: break; + case kCDXProp_RxnAutonumber_Format: break; + case kCDXProp_ObjectTag_Type: break; + case kCDXProp_Unused6: break; + case kCDXProp_Unused7: break; + case kCDXProp_ObjectTag_Tracking: break; + case kCDXProp_ObjectTag_Persistent: break; + case kCDXProp_ObjectTag_Value: break; + case kCDXProp_Positioning: break; + case kCDXProp_PositioningAngle: break; 
+ case kCDXProp_PositioningOffset: break; + case kCDXProp_Sequence_Identifier: break; + case kCDXProp_CrossReference_Container: break; + case kCDXProp_CrossReference_Document: break; + case kCDXProp_CrossReference_Identifier: break; + case kCDXProp_CrossReference_Sequence: break; + case kCDXProp_Template_PaneHeight: break; + case kCDXProp_Template_NumRows: break; + case kCDXProp_Template_NumColumns: break; + case kCDXProp_Group_Integral: break; + case kCDXProp_SG_DataType: break; + case kCDXProp_SG_PropertyType: break; + case kCDXProp_SG_DataValue: break; + case kCDXProp_SG_ComponentIsReactant: break; + case kCDXProp_SG_ComponentIsHeader: break; + case kCDXProp_IsHidden: break; + case kCDXProp_IsReadOnly: break; + case kCDXProp_IsEdited: break; + case kCDXProp_SG_ComponentReferenceID: break; + case kCDXProp_PlasmidMap_NumberBasePairs: break; + case kCDXProp_PlasmidMap_MarkerStart: break; + case kCDXProp_PlasmidMap_MarkerOffset: break; + case kCDXProp_PlasmidMap_MarkerAngle: break; + case kCDXProp_PlasmidMap_RegionStart: break; + case kCDXProp_PlasmidMap_RegionEnd: break; + case kCDXProp_PlasmidMap_RegionOffset: break; + case kCDXProp_PlasmidMap_RingRadius: break; + case kCDXProp_RLogic_Group: break; + case kCDXProp_RLogic_Occurrence: break; + case kCDXProp_RLogic_RestH: break; + case kCDXProp_RLogic_IfThenGroup: break; + case kCDXProp_Annotation_Keyword: break; + case kCDXProp_Annotation_Content: break; + case kCDXProp_SplitterPositions: break; + case kCDXProp_PageDefinition: break; + case kCDXProp_Property_Rule: break; + case kCDXProp_Property_DataType: break; + case kCDXProp_Property_Value: break; + case kCDXUser_TemporaryBegin: break; + case kCDXUser_TemporaryEnd: break; + case kCDXObj_Document: break; + case kCDXObj_Page: break; + case kCDXObj_Group: break; + case kCDXObj_Fragment: break; + case kCDXObj_Text: break; + case kCDXObj_Graphic: break; + case kCDXObj_Curve: break; + case kCDXObj_EmbeddedObject: break; + case kCDXObj_NamedAlternativeGroup: break; + 
case kCDXObj_TemplateGrid: break; + case kCDXObj_RegistryNumber: break; + case kCDXObj_ReactionScheme: break; + case kCDXObj_ReactionStep: break; + case kCDXObj_ObjectDefinition: break; + case kCDXObj_Spectrum: break; + case kCDXObj_ObjectTag: break; + case kCDXObj_OleClientItem: break; + case kCDXObj_Sequence: break; + case kCDXObj_CrossReference: break; + case kCDXObj_Splitter: break; + case kCDXObj_Table: break; + case kCDXObj_BracketedGroup: break; + case kCDXObj_BracketAttachment: break; + case kCDXObj_CrossingBond: break; + case kCDXObj_Border: break; + case kCDXObj_Geometry: break; + case kCDXObj_Constraint: break; + case kCDXObj_TLCPlate: break; + case kCDXObj_TLCLane: break; + case kCDXObj_TLCSpot: break; + case kCDXObj_ChemicalProperty: break; + case kCDXObj_Arrow: break; + case kCDXObj_StoichiometryGrid: break; + case kCDXObj_SGComponent: break; + case kCDXObj_SGDatum: break; + case kCDXObj_BioShape: break; + case kCDXObj_PlasmidMap: break; + case kCDXObj_PlasmidMarker: break; + case kCDXObj_PlasmidRegion: break; + case kCDXObj_RLogic: break; + case kCDXObj_RLogicItem: break; + case kCDXObj_Annotation: break; + case kCDXObj_GEPPlate: break; + case kCDXObj_GEPBand: break; + case kCDXObj_Marker: break; + case kCDXObj_GEPLane: break; + case kCDXObj_DocumentProperties: break; + case kCDXObj_Property: break; + case kCDXObj_ColoredMolecularArea: break; + case kCDXObj_UnknownObject: break; + } + } + + // Add the stereo groups + if (!sgroups.empty()) { + std::vector stereo_groups; + for (auto &sgroup : sgroups) { + unsigned gId = 0; + if (sgroup.second.grouptype != StereoGroupType::STEREO_ABSOLUTE && + sgroup.second.sgroup > 0) { + gId = sgroup.second.sgroup; + } + std::vector newBonds; + stereo_groups.emplace_back(sgroup.second.grouptype, sgroup.second.atoms, + newBonds, gId); + } + mol.setStereoGroups(std::move(stereo_groups)); + } + + return !skip_fragment; +} +} +} // namespace RDKit diff --git a/External/ChemDraw/fragment.h b/External/ChemDraw/fragment.h 
new file mode 100644 index 000000000..329e34e82 --- /dev/null +++ b/External/ChemDraw/fragment.h @@ -0,0 +1,96 @@ +// +// Copyright (c) 2024, Glysade Inc +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Novartis Institutes for BioMedical Research Inc. +// nor the names of its contributors may be used to endorse or promote +// products derived from this software without specific prior written +// permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +#ifndef CHEMDRAW_FRAGMENT_H +#define CHEMDRAW_FRAGMENT_H + +#include +#include +#include + +#include "ChemDrawStartInclude.h" +#include "chemdraw/CDXStdObjects.h" +#include "ChemDrawEndInclude.h" + +#include "reaction.h" +#include "utils.h" + +namespace RDKit { +namespace ChemDraw { +//! Parser bookkeeping that is handed by reference to parseFragment: the atom, bond and fragment lookups plus the molecules and reaction schemes collected so far. Copying is disabled. +struct PageData { + PageData() + : atomIds(), + bondIds(), + mols(), + fragmentLookup(), + groupedFragments(), + schemes() {} + + PageData(const PageData &) = delete; + + std::map atomIds; + std::map bondIds; + std::vector> mols; // All molecules found in the doc + std::map + fragmentLookup; // fragment.id->molecule index + std::map> + groupedFragments; // grouped.id -> [fragment.id] + std::vector schemes; // reaction schemes found + + //! Clears CDX_ATOM_ID, CDX_BOND_ORDERING and CDX_CIP from every atom and CDX_BOND_ID from every bond of each molecule in mols + void clearCDXProps() { + for (auto &mol : mols) { + for (auto atom : mol->atoms()) { + atom->clearProp(CDX_ATOM_ID); + atom->clearProp(CDX_BOND_ORDERING); + atom->clearProp(CDX_CIP); + } + for (auto bond : mol->bonds()) { + bond->clearProp(CDX_BOND_ID); + } + } + } +}; +//! Parse a CDX fragment record +//! params +//! RWMol mol : molecule to parse the fragment into +//! CDXFragment fragment : fragment to read +//! PageData pagedata : lookups (atomIds, bondIds, ...) used for bonding and fusing +//! fragments +//! int missingFragId : if the fragment id is missing, this is what to use. +//! n.b. may be obsolete, everything needs an id to be valid +//! int externalAttachment : if this fragment has an external node, this is its id, +//! otherwise -1. External nodes are normally NickNames or new Fragments +bool parseFragment(RWMol &mol, CDXFragment &fragment, PageData &pagedata, + int &missingFragId, int externalAttachment = -1); +} +} // namespace RDKit + +#endif diff --git a/External/ChemDraw/node.cpp b/External/ChemDraw/node.cpp new file mode 100644 index 000000000..a275fc5d2 --- /dev/null +++ b/External/ChemDraw/node.cpp @@ -0,0 +1,323 @@ +// +// Copyright (c) 2024, Glysade Inc +// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+//       notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+//       copyright notice, this list of conditions and the following
+//       disclaimer in the documentation and/or other materials provided
+//       with the distribution.
+//     * Neither the name of Novartis Institutes for BioMedical Research Inc.
+//       nor the names of its contributors may be used to endorse or promote
+//       products derived from this software without specific prior written
+//       permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//#include "node.h"
+#include "fragment.h"
+#include "utils.h"
+
+namespace RDKit {
+namespace ChemDraw {
+//! Convert one CDX node into an RDKit atom and append it to `mol`.
+//!
+//! Reads element, charge, hydrogen count, isotope, stereo hints, coordinates
+//! and query information from `node`, stores the CDX bookkeeping properties
+//! on the new atom, registers the atom in `pagedata.atomIds` under the node's
+//! object id and, for Nickname/Fragment nodes, parses every contained
+//! fragment into the same molecule.
+//!
+//! \param mol                 molecule that receives (and owns) the new atom
+//! \param fragmentId          written to mol's CDX_FRAG_ID after a contained
+//!                            fragment has been parsed
+//! \param node                the CDX node to convert
+//! \param pagedata            page-level lookup tables; atomIds is updated
+//! \param sgroups             enhanced-stereo groups keyed by
+//!                            (group number, group type); the atom is added
+//!                            when the node's group number is > 0
+//! \param missingFragId       forwarded unchanged to parseFragment
+//! \param externalAttachment  fuse label used for ExternalConnectionPoint
+//!                            nodes when > 0
+//! \return false as soon as parsing a contained fragment fails, true otherwise
+bool parseNode(
+    RWMol &mol, unsigned int fragmentId, CDXNode &node, PageData &pagedata,
+    std::map, StereoGroupInfo> &sgroups,
+    int &missingFragId, int externalAttachment) {
+  int atom_id = node.GetObjectID();
+  int elemno = node.m_elementNum;  // default to carbon
+  // UINT16 max is not assigned?
+  // An unspecified hydrogen count is stored as 0 explicit Hs with implicit Hs
+  // still allowed (explicitHs == false).
+  int num_hydrogens =
+      node.m_numHydrogens == kNumHydrogenUnspecified ? 0 : node.m_numHydrogens;
+  bool explicitHs = node.m_numHydrogens != kNumHydrogenUnspecified;
+  int charge = 0;
+  // When the low 24 bits are all zero the value is taken from the top byte,
+  // otherwise m_charge is used as is.
+  if ((node.m_charge & 0x00FFFFFF) == 0)
+    charge = node.m_charge >> 24;
+  else
+    charge = node.m_charge;
+  int atommap = 0;
+  int rgroup_num = -1;
+  int isotope = node.m_isotope;
+
+  bool checkForRGroup = false;
+  ;  // NOTE(review): stray empty statement
+  std::string query_label;
+  std::vector elementlist;
+
+  // position node.m_2dPosition;
+#ifdef DEBUG
+  std::cerr << NodeType(node.m_nodeType) << std::endl;
+#endif
+  // Node-type specific setup: chooses the element number, the fuse label
+  // (atommap) and, for query-like nodes, the query label.
+  switch (node.m_nodeType) {
+    case kCDXNodeType_Element: {
+      break;
+    }
+    case kCDXNodeType_ElementList: {
+      if (node.m_elementList) {
+        elementlist = *node.m_elementList;
+        query_label = "ElementList";
+      }
+      break;
+    }
+    case kCDXNodeType_Nickname: {
+      elemno = 0;
+      atommap = atom_id;
+      break;
+    }
+    case kCDXNodeType_Fragment: {
+      elemno = 0;
+      atommap = atom_id;
+      break;
+    }
+    case kCDXNodeType_ExternalConnectionPoint: {
+      if (externalAttachment <= 0) {
+        // sometimes this is a dummy atom, but I don't know when.
+        if (node.m_externalConnectionType == kCDXExternalConnection_Diamond) {
+          elemno = 0;
+        }
+        atommap = atom_id;
+      } else {
+        elemno = 0;
+        atommap = externalAttachment;
+      }
+      break;
+    }
+    case kCDXNodeType_GenericNickname: {
+      // Only the first character of the nickname selects the branch; the
+      // whole nickname becomes the query label.
+      if (node.m_genericNickname.size()) {
+        switch (node.m_genericNickname[0]) {
+          case 'R': {
+            checkForRGroup = true;
+            elemno = 0;
+            query_label = node.m_genericNickname;
+            break;
+          }
+          case 'A':
+          case 'Q':
+          case 'X':
+          case 'M': {
+            elemno = 0;
+            query_label = node.m_genericNickname;
+          } break;
+          default:
+            std::cerr << "Unhandled generic nickname: "
+                      << node.m_genericNickname << std::endl;
+        }
+      }
+      break;
+    }
+    // The remaining node types get no special handling.
+    case kCDXNodeType_Unspecified:
+      break;
+    case kCDXNodeType_ElementListNickname:
+      break;
+    case kCDXNodeType_Formula:
+      break;
+    case kCDXNodeType_AnonymousAlternativeGroup:
+      break;
+    case kCDXNodeType_NamedAlternativeGroup:
+      break;
+    case kCDXNodeType_MultiAttachment:
+      break;
+    case kCDXNodeType_VariableAttachment:
+      break;
+    case kCDXNodeType_LinkNode:
+      break;
+    case kCDXNodeType_Monomer:
+      break;
+  }
+
+  // Text children starting with 'R' carry a number: for generic 'R' nicknames
+  // it becomes the R-group number, otherwise it overwrites the isotope.
+  for (auto &child : node.ContainedObjects()) {
+    if (child.second->GetTag() == kCDXObj_Text) {
+      const std::string &text = ((CDXText *)child.second)->GetText().str();
+      if (text.size() > 0 && text[0] == 'R') {
+        try {
+          if (checkForRGroup)
+            rgroup_num = text.size() > 1 ? stoi(text.substr(1)) : 0;
+          else
+            isotope = text.size() > 1 ? stoi(text.substr(1)) : 0;
+        } catch (const std::invalid_argument &e) {
+          // NOTE(review): rgroup_num starts at -1 (non-zero), so this guard
+          // only suppresses the warning when an earlier text child already
+          // set it to 0 -- confirm whether checkForRGroup was intended.
+          if (rgroup_num)
+            BOOST_LOG(rdWarningLog)
+                << "RGroupError: Invalid argument - Cannot convert '" << text
+                << "' to an integer." << std::endl;
+        } catch (const std::out_of_range &e) {
+          // NOTE(review): same guard as above.
+          if (rgroup_num)
+            BOOST_LOG(rdWarningLog)
+                << "RGroupError: Out of range - The number '" << text
+                << "' is too large or too small." << std::endl;
+        }
+      }
+    }
+  }
+
+  // Map the CDX enhanced-stereo type onto the RDKit group type; anything
+  // unrecognised stays STEREO_ABSOLUTE.
+  StereoGroupType grouptype = StereoGroupType::STEREO_ABSOLUTE;
+  switch (node.m_enhancedStereoType) {
+    case kCDXEnhancedStereo_Absolute:
+      grouptype = StereoGroupType::STEREO_ABSOLUTE;
+      break;
+    case kCDXEnhancedStereo_And:
+      grouptype = StereoGroupType::STEREO_AND;
+      break;
+    case kCDXEnhancedStereo_Or:
+      grouptype = StereoGroupType::STEREO_OR;
+      break;
+    default:
+      break;
+  }
+
+  CHECK_INVARIANT(atom_id != -1, "Uninitialized atom id in cdxml.");
+  // Raw allocation: ownership is handed to `mol` by the addAtom call below.
+  Atom *rd_atom = new Atom(elemno);
+  rd_atom->setFormalCharge(charge);
+  rd_atom->setNumExplicitHs(num_hydrogens);
+  rd_atom->setNoImplicit(explicitHs);
+
+  rd_atom->setIsotope(isotope);
+  if (rgroup_num >= 0) {
+    rd_atom->setAtomMapNum(rgroup_num);
+  }
+  set_fuse_label(rd_atom, atommap);
+  switch (node.m_hStereo) {
+    case kCDXProp_Atom_HDot:  // this atom has an implicit hydrogen with a
+                              // wedged bond
+      rd_atom->setProp(CDX_IMPLICIT_HYDROGEN_STEREO, 'w');
+      break;
+    case kCDXProp_Atom_HDash:  // this atom has an implicit hydrogen with a
+                               // hashed bond
+      rd_atom->setProp(CDX_IMPLICIT_HYDROGEN_STEREO, 'h');
+      break;
+  }
+
+  if (node.m_bondOrdering) {
+    // This node may be completely replaced by the fragment
+    // i.e. [*:1]C[*:1].C[*:1]C => CCC
+    rd_atom->setProp>(CDX_BOND_ORDERING, *node.m_bondOrdering);
+  }
+  if (node.m_geometry == kCDXAtomGeometry_Tetrahedral) {
+    // std::cerr << "tetrahedral" << std::endl;
+    // if we have a cip type we can interpret, set it, otherwise don't
+
+    switch (node.m_CIP) {
+      case kCDXCIPAtom_R:
+      case kCDXCIPAtom_r:
+      case kCDXCIPAtom_S:
+      case kCDXCIPAtom_s:
+        rd_atom->setProp(CDX_CIP, node.m_CIP);
+        break;
+      default:
+        rd_atom->setProp(CDX_CIP, kCDXCIPAtom_Undetermined);
+        break;
+    }
+  }
+
+  // Store 3 coordinates when a 3D position is known, otherwise the 2D pair.
+  std::vector atom_coords;
+  if (node.KnownPosition3D()) {
+    atom_coords.reserve(3);
+    atom_coords.push_back(node.m_3dPosition.x);
+    atom_coords.push_back(node.m_3dPosition.y);
+    atom_coords.push_back(node.m_3dPosition.z);
+  } else {
+    atom_coords.reserve(2);
+    atom_coords.push_back(node.m_2dPosition.x);
+    atom_coords.push_back(node.m_2dPosition.y);
+  }
+  rd_atom->setProp>(CDX_ATOM_POS, atom_coords);
+  rd_atom->setProp(CDX_ATOM_ID, atom_id);
+
+  const bool updateLabels = true;
+  const bool takeOwnership = true;
+  auto idx = mol.addAtom(rd_atom, updateLabels, takeOwnership);
+  // Query-like nodes: rd_atom is re-assigned to whatever addquery returns
+  // (presumably the query atom now stored at idx -- TODO confirm in utils.h).
+  if (query_label.size()) {
+    if (query_label[0] == 'R') {
+      rd_atom = addquery(makeAtomNullQuery(), query_label, mol, idx);
+    } else if (query_label == "A") {
+      rd_atom = addquery(makeAAtomQuery(), query_label, mol, idx);
+    } else if (query_label == "Q") {
+      rd_atom = addquery(makeQAtomQuery(), query_label, mol, idx);
+    } else if (query_label == "M") {
+      rd_atom = addquery(makeMAtomQuery(), query_label, mol, idx);
+    } else if (query_label == "MH") {
+      rd_atom = addquery(makeMHAtomQuery(), query_label, mol, idx);
+    } else if (query_label == "X") {
+      rd_atom = addquery(makeXAtomQuery(), query_label, mol, idx);
+    } else if (query_label == "ElementList") {
+      if (!elementlist.size()) {
+        BOOST_LOG(rdWarningLog)
+            << "ElementList is empty, ignoring..." << std::endl;
+      } else {
+        // OR together one atomic-number query per listed element; the atom's
+        // own atomic number is set to the first entry of the list.
+        auto *q = new ATOM_OR_QUERY;
+        q->setDescription("AtomOr");
+        for (auto atNum : elementlist) {
+          q->addChild(
+              QueryAtom::QUERYATOM_QUERY::CHILD_TYPE(makeAtomNumQuery(atNum)));
+        }
+        rd_atom = addquery(q, query_label, mol, idx);
+        rd_atom->setAtomicNum(elementlist.front());
+      }
+    } else if (query_label.size()) {
+      std::cerr << "Unhandled generic nickname: " << query_label << std::endl;
+    } else {
+      // NOTE(review): unreachable -- the enclosing `if` and the branch above
+      // both test query_label.size(), so an empty label never gets here.
+      rd_atom->setProp(common_properties::atomLabel, query_label);
+    }
+  }
+
+  // NOTE(review): Singlet and Triplet both end up with 2 radical electrons.
+  switch (node.m_radical) {
+    case kCDXRadical_None:
+      break;
+    case kCDXRadical_Singlet:
+      rd_atom->setNumRadicalElectrons(2);
+      break;
+    case kCDXRadical_Doublet: {
+      rd_atom->setNumRadicalElectrons(1);
+      break;
+    }
+    case kCDXRadical_Triplet: {
+      rd_atom->setNumRadicalElectrons(2);
+      break;
+    }
+  }
+
+  // Collect the atom into its enhanced-stereo group (created on first use).
+  if (node.m_enhancedStereoGroupNum > 0) {
+    auto key = std::make_pair(node.m_enhancedStereoGroupNum, grouptype);
+    auto &stereo = sgroups[key];
+    stereo.sgroup = node.m_enhancedStereoGroupNum;
+    stereo.grouptype = grouptype;
+    stereo.atoms.push_back(rd_atom);
+  }
+
+  pagedata.atomIds[atom_id] =
+      rd_atom;  // The mol has ownership so this can't leak
+  if (node.m_nodeType == kCDXNodeType_Nickname ||
+      node.m_nodeType == kCDXNodeType_Fragment) {
+    // This fragment needs to be expanded and joined to the current one
+    // the external_id is the node's atom_id
+    for (auto fragment : node.ContainedObjects()) {
+      if (fragment.second->GetTag() == kCDXObj_Fragment) {
+        if (!parseFragment(mol, (CDXFragment &)(*fragment.second), pagedata,
+                           missingFragId, atom_id)) {
+          return false;
+        }
+        mol.setProp(NEEDS_FUSE, true);
+        // might need to reset to OUR frag_id since parse_fragment will
+        // set it to the fragment's
+        mol.setProp(CDX_FRAG_ID, fragmentId);
+      }
+    }
+  }
+  return true;
+}
+}
+}  // namespace RDKit
diff --git a/External/ChemDraw/node.h b/External/ChemDraw/node.h
new file mode 100644
index 000000000..a449f46ef
--- /dev/null
+++ b/External/ChemDraw/node.h
@@ -0,0 +1,54 @@
+//
+// Copyright (c) 2024, Glysade Inc +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Novartis Institutes for BioMedical Research Inc. +// nor the names of its contributors may be used to endorse or promote +// products derived from this software without specific prior written +// permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+//
+#ifndef CHEMDRAW_NODE_H
+#define CHEMDRAW_NODE_H
+
+#include
+#include
+#include
+
+#include "ChemDrawStartInclude.h"
+#include "chemdraw/CDXStdObjects.h"
+#include "ChemDrawEndInclude.h"
+
+#include "utils.h"
+#include "fragment.h"
+
+namespace RDKit {
+namespace ChemDraw {
+//! Parse a single CDX node into `mol`.
+//!
+//! \param mol                 molecule the node is parsed into
+//! \param fragmentId          id of the fragment that owns the node
+//! \param node                the CDX node to read
+//! \param pagedata            page-level lookup tables (see PageData)
+//! \param sgroups             stereo-group information accumulated across
+//!                            nodes
+//! \param missingFragId       passed by reference; presumably the id to use
+//!                            for fragments lacking one -- TODO confirm
+//! \param externalAttachment  id of the external attachment node; note that,
+//!                            unlike parseFragment, no default is provided
+//! \return presumably true on success and false on failure; the definition
+//!         is not part of this header -- TODO confirm
+bool parseNode(
+    RWMol &mol, unsigned int fragmentId, CDXNode &node, PageData &pagedata,
+    std::map, StereoGroupInfo> &sgroups,
+    int &missingFragId, int externalAttachment);
+}
+}
+#endif
diff --git a/External/ChemDraw/reaction.cpp b/External/ChemDraw/reaction.cpp
new file mode 100644
index 000000000..373003239
--- /dev/null
+++ b/External/ChemDraw/reaction.cpp
@@ -0,0 +1,165 @@
+//
+// Copyright (c) 2024, Glysade Inc
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+//       notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+//       copyright notice, this list of conditions and the following
+//       disclaimer in the documentation and/or other materials provided
+//       with the distribution.
+//     * Neither the name of Novartis Institutes for BioMedical Research Inc.
+//       nor the names of its contributors may be used to endorse or promote
+//       products derived from this software without specific prior written
+//       permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+
+#include "chemdraw.h"
+#include "chemdrawreaction.h"
+#include "reaction.h"
+#include "utils.h"
+
+#include
+#include
+#include
+#include
+
+namespace RDKit {
+namespace ChemDraw {
+//! Tag the molecules that play one role (reactant, product or agent) in this
+//! reaction step.
+//!
+//! Every id in `frag_ids` is looked up in `grouped_fragments`; each fragment
+//! id of that group is then resolved to a molecule index through `fragments`.
+//! The molecule receives CDX_SCHEME_ID, CDX_STEP_ID and `prop`, the latter
+//! holding the running index of the group. Groups or fragments that cannot be
+//! resolved are reported on the warning log and skipped; a missing group does
+//! not advance the running index.
+//!
+//! \param type               role name, only used in warning messages
+//! \param prop               name of the molecule property receiving the index
+//! \param frag_ids           group ids taking part in the step for this role
+//! \param fragments          fragment id -> index into `mols`
+//! \param grouped_fragments  group id -> fragment ids
+//! \param mols               all molecules of the page
+void ReactionStepInfo::set_reaction_data(
+    std::string type, std::string prop, const std::vector &frag_ids,
+    const std::map &fragments,
+    std::map> &grouped_fragments,
+    const std::vector> &mols) const {
+  unsigned int reagent_idx = 0;
+  for (auto idx : frag_ids) {
+    auto iter = grouped_fragments.find(idx);
+    if (iter == grouped_fragments.end()) {
+      BOOST_LOG(rdWarningLog) << "CDXMLParser: Schema " << scheme_id << " step "
+                              << step_id << " " << type << " reaction fragment "
+                              << idx << " not found in document." << std::endl;
+      continue;
+    }
+    for (auto reaction_fragment_id : iter->second) {
+      auto fragment = fragments.find(reaction_fragment_id);
+      if (fragment == fragments.end()) {
+        // NOTE(review): the message prints the group id (idx), not the
+        // fragment id that failed to resolve -- confirm this is intended.
+        BOOST_LOG(rdWarningLog)
+            << "CDXMLParser: Schema " << scheme_id << " step " << step_id << " "
+            << type << " fragment " << idx << " not found in document."
+            << std::endl;
+        continue;
+      }
+      auto &mol = mols[fragment->second];
+      mol->setProp(CDX_SCHEME_ID, scheme_id);
+      mol->setProp(CDX_STEP_ID, step_id);
+      mol->setProp(prop, reagent_idx);
+    }
+    reagent_idx += 1;
+  }
+}
+
+//! Apply this step to the parsed molecules: tag reactants, products and
+//! agents (objects above plus below the arrow) and number the atoms that the
+//! step's atom-atom map pairs up.
+//!
+//! \param scheme_id          id of the enclosing scheme (used in warnings)
+//! \param atoms              CDX atom id -> atom
+//! \param fragments          fragment id -> index into `mols`
+//! \param grouped_fragments  group id -> fragment ids
+//! \param mols               all molecules of the page
+void ReactionStepInfo::set_reaction_step(
+    size_t scheme_id, std::map &atoms,
+    const std::map &fragments,
+    std::map> &grouped_fragments,
+    const std::vector> &mols) const {
+  // Set the molecule properties
+  set_reaction_data("ReactionStepReactants", CDX_REAGENT_ID,
+                    ReactionStepReactants, fragments, grouped_fragments, mols);
+  set_reaction_data("ReactionStepProducts", CDX_PRODUCT_ID,
+                    ReactionStepProducts, fragments, grouped_fragments, mols);
+
+  // Agents are everything drawn above the arrow followed by everything below.
+  auto agents = ReactionStepObjectsAboveArrow;
+  agents.insert(agents.end(), ReactionStepObjectsBelowArrow.begin(),
+                ReactionStepObjectsBelowArrow.end());
+  set_reaction_data("ReactionStepAgents", CDX_AGENT_ID, agents, fragments,
+                    grouped_fragments, mols);
+
+  // Set the Atom Maps: the n-th mapped pair gets atom map number n (1-based),
+  // even when one or both atoms of the pair cannot be found.
+  int atommap = 0;
+  for (const auto &mapping : ReactionStepAtomMap) {
+    ++atommap;
+    unsigned int idx1 = mapping.first;
+    unsigned int idx2 = mapping.second;
+    // Single lookup per atom; a missing id must not insert a null entry.
+    auto first = atoms.find(idx1);
+    if (first != atoms.end()) {
+      first->second->setAtomMapNum(atommap);
+    } else {
+      BOOST_LOG(rdWarningLog)
+          << "CDXMLParser: Schema " << scheme_id << " step " << step_id
+          << " ReactionStepAtomMap cannot find atom with node id " << idx1
+          << " skipping schema..." << std::endl;
+    }
+    auto second = atoms.find(idx2);
+    if (second != atoms.end()) {
+      second->second->setAtomMapNum(atommap);
+    } else {
+      // XXX log error
+      BOOST_LOG(rdWarningLog)
+          << "CDXMLParser: Schema " << scheme_id << " step " << step_id
+          << " ReactionStepAtomMap cannot find atom with node id " << idx2
+          << " skipping schema..." << std::endl;
+    }
+  }
+}
+
+//! Collect every reaction step contained in a CDX reaction scheme.
+//!
+//! Exactly one ReactionStepInfo is stored per kCDXObj_ReactionStep child.
+ReactionInfo::ReactionInfo(CDXReactionScheme &scheme)
+    : scheme_id(static_cast(scheme.GetObjectID())) {
+  for (auto &rxnNode : scheme.ContainedObjects()) {
+    CDXDatumID type_id = (CDXDatumID)rxnNode.second->GetTag();
+    if (type_id == kCDXObj_ReactionStep) {
+      CDXReactionStep &step = (CDXReactionStep &)(*rxnNode.second);
+      auto step_id = step.GetObjectID();
+      // Fill the new entry in place. It must not be pushed a second time
+      // afterwards, otherwise each step is recorded (and later applied) twice.
+      steps.emplace_back(ReactionStepInfo());
+      ReactionStepInfo &info = steps.back();
+      info.scheme_id = scheme_id;
+      info.step_id = step_id;
+      info.ReactionStepProducts = step.m_products;
+      info.ReactionStepReactants = step.m_reactants;
+      // NOTE(review): ReactionStepObjectsAboveArrow is read by
+      // set_reaction_step but never populated here -- confirm whether the
+      // step's above-arrow objects should be copied as well.
+      info.ReactionStepObjectsBelowArrow = step.m_objectsBelowArrow;
+      info.ReactionStepAtomMap = step.m_aamap;
+    }
+  }
+}
+
+//! Apply all steps of this scheme to the parsed molecules.
+//!
+//! Builds the fragment-id -> molecule-index and atom-id -> atom lookups from
+//! the CDX_FRAG_ID / CDX_ATOM_ID properties stored on `mols`, then lets every
+//! step tag its molecules and atoms. Does nothing when the scheme has no
+//! steps.
+//!
+//! \param grouped_fragments  group id -> fragment ids
+//! \param mols               all molecules of the page
+void ReactionInfo::set_reaction_steps(
+    std::map> &grouped_fragments,
+    const std::vector> &mols) const {
+  if (steps.size()) {
+    std::map fragments;
+    std::map atoms;
+    size_t mol_idx = 0;
+    for (auto &mol : mols) {
+      auto frag_id = mol->getProp(CDX_FRAG_ID);
+      fragments[frag_id] = mol_idx++;
+      for (auto &atom : mol->atoms()) {
+        unsigned int atom_id = atom->getProp(CDX_ATOM_ID);
+        atoms[atom_id] = atom;
+      }
+    }
+
+    for (auto &step : steps) {
+      step.set_reaction_step(scheme_id, atoms, fragments, grouped_fragments,
+                             mols);
+    }
+  }
+}
+}
+}  // namespace RDKit
diff --git a/External/ChemDraw/reaction.h b/External/ChemDraw/reaction.h
new file mode 100644
index 000000000..2e1cad681
--- /dev/null
+++ b/External/ChemDraw/reaction.h
@@ -0,0 +1,87 @@
+//
+// Copyright (c) 2024, Glysade Inc
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+//       notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Novartis Institutes for BioMedical Research Inc. +// nor the names of its contributors may be used to endorse or promote +// products derived from this software without specific prior written +// permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+//
+#ifndef CHEMDRAW_REACTION_H
+#define CHEMDRAW_REACTION_H
+
+#include
+
+#include "ChemDrawStartInclude.h"
+#include "chemdraw/CDXStdObjects.h"
+#include "ChemDrawEndInclude.h"
+
+#include
+#include
+
+namespace RDKit {
+namespace ChemDraw {
+struct ReactionStepInfo {
+  // Holds the current reaction step information so that we can convert
+  // chemdraw molecules into rdkit reactions
+  unsigned int scheme_id;  // id of the scheme this step belongs to
+  unsigned int step_id;    // id of the step itself
+  // Ids of the objects taking part in the step, by role.
+  std::vector ReactionStepProducts;
+  std::vector ReactionStepReactants;
+  std::vector ReactionStepObjectsAboveArrow;
+  std::vector ReactionStepObjectsBelowArrow;
+  // Pairs of ids describing the step's atom-to-atom mapping.
+  std::vector> ReactionStepAtomMap;
+
+  //! Tag the molecules that play one role in this step.
+  //! \param type               role name
+  //! \param prop               name of the property to set on the molecules
+  //! \param frag_ids           ids of the objects playing this role
+  //! \param fragments          presumably fragment id -> index into `mols`
+  //!                           -- TODO confirm against the definition
+  //! \param grouped_fragments  presumably group id -> fragment ids
+  //!                           -- TODO confirm against the definition
+  //! \param mols               the parsed molecules
+  void set_reaction_data(
+      std::string type, std::string prop, const std::vector &frag_ids,
+      const std::map &fragments,
+      std::map> &grouped_fragments,
+      const std::vector> &mols) const;
+
+  //! Apply this step to the parsed molecules and atoms.
+  //! NOTE(review): the `scheme_id` parameter has the same name as the
+  //! `scheme_id` member.
+  void set_reaction_step(
+      size_t scheme_id, std::map &atoms,
+      const std::map &fragments,
+      std::map> &grouped_fragments,
+      const std::vector> &mols) const;
+};
+
+class ReactionInfo {
+  // Holds the information from the CDX data so that we can convert
+  // the molecules in the file to RDKit Reactions
+
+  std::vector steps;       // the reaction steps of the scheme
+  unsigned int scheme_id;  // id of the scheme
+
+ public:
+  //! Build the reaction information from a CDX reaction scheme.
+  //! NOTE(review): single-argument constructor that is not `explicit`.
+  ReactionInfo(CDXReactionScheme &scheme);
+
+  //! Apply every step of the scheme to the parsed molecules.
+  void set_reaction_steps(
+      std::map> &grouped_fragments,
+      const std::vector> &mols) const;
+};
+}
+}  // namespace RDKit
+
+#endif
diff --git a/External/ChemDraw/test-chiral.cpp b/External/ChemDraw/test-chiral.cpp
new file mode 100644
index 000000000..3cb803457
--- /dev/null
+++ b/External/ChemDraw/test-chiral.cpp
@@ -0,0 +1,96 @@
+//
+// Copyright (c) 2024 Glysade Inc and other RDkit contributors
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+//       notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Novartis Institutes for BioMedical Research Inc. +// nor the names of its contributors may be used to endorse or promote +// products derived from this software without specific prior written +// permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+//
+
+#include "chemdraw.h"
+#include
+#include "RDGeneral/test.h"
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+using namespace RDKit;
+using namespace RDKit::v2;
+
+// Checks that tetrahedral stereo read from ChemDraw files yields the same
+// canonical SMILES as the reference SMILES given inline. The input files are
+// loaded from $RDBASE/External/ChemDraw/test_data/.
+TEST_CASE("Geometry") {
+  // NOTE(review): getenv() returns a null pointer when RDBASE is unset, and
+  // constructing a std::string from a null pointer is undefined behaviour.
+  std::string path =
+      std::string(getenv("RDBASE")) + "/External/ChemDraw/test_data/";
+  SECTION("R/S Tetrahedral") {
+    //_sleep(10 * 1000);
+
+    {
+      auto fname = path + "geometry-tetrahedral.cdxml";
+      auto mols = MolsFromChemDrawFile(fname);
+      REQUIRE(mols.size());  // [C@H]1(C2)[C@@H]2C1
+      auto mol = "[C@H]1(C2)[C@@H]2C1"_smiles;
+      auto smi = MolToSmiles(*mol);
+      REQUIRE(smi == MolToSmiles(*mols[0]));
+    }
+    {
+      // Second file, same expected structure as the first one.
+      auto fname = path + "geometry-tetrahedral-2.cdxml";
+      auto mols = MolsFromChemDrawFile(fname);
+      REQUIRE(mols.size());
+      auto mol = "[C@H]1(C2)[C@@H]2C1"_smiles;
+      auto smi = MolToSmiles(*mol);
+      REQUIRE(smi == MolToSmiles(*mols[0]));
+    }
+
+    {
+      auto fname = path + "geometry-tetrahedral-3.cdxml";
+      auto mols = MolsFromChemDrawFile(fname);
+      REQUIRE(mols.size());
+      auto mol = "C1CC[C@H]2CCCC[C@@H]2C1"_smiles;
+      auto smi = MolToSmiles(*mol);
+      REQUIRE(smi == MolToSmiles(*mols[0]));
+    }
+
+    /* this one we still get wrong...
+    {
+      auto fname = path + "geometry-tetrahedral-4.cdxml";
+      auto mols = MolsFromChemDrawFile(fname);
+      REQUIRE(mols.size());
+      auto mol = "CC(S[C@@H]1CC2=C([H])C(CC[C@]2(C)[C@@]3([H])CC([H])([H])[C@]4(C)[C@](OC5=O)(CC5([H])[H])CC[C@@]4([H])[C@]13[H])=O)=O"_smiles;
+      auto smi = MolToSmiles(*mol);
+      std::cerr << "** " << smi << std::endl;
+      REQUIRE(smi == MolToSmiles(*mols[0]));
+    }
+    */
+  }
+}
diff --git a/External/ChemDraw/test-reactions.cpp b/External/ChemDraw/test-reactions.cpp
new file mode 100644
index 000000000..30b694add
--- /dev/null
+++ b/External/ChemDraw/test-reactions.cpp
@@ -0,0 +1,164 @@
+//
+// Copyright (c) 2025 Glysade Inc and other RDkit contributors
+// All rights reserved.
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Novartis Institutes for BioMedical Research Inc. +// nor the names of its contributors may be used to endorse or promote +// products derived from this software without specific prior written +// permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// + +#include "chemdraw.h" +#include "chemdrawreaction.h" +#include +#include "RDGeneral/test.h" +#include +#include +#include +#include +#include +#include + + +#include +using namespace RDKit; +using namespace RDKit::v2; + +TEST_CASE("CDXML Parser") { + std::string cdxmlbase = + std::string(getenv("RDBASE")) + "/Code/GraphMol/test_data/CDXML/"; + SECTION("CDXML REACTION") { + auto fname = cdxmlbase + "rxn2.cdxml"; + std::vector expected = { + "Cl[c:1]1[cH:4][cH:3][cH:2][cH:6][cH:5]1", + "OC(O)B[c:7]1[cH:8][cH:9][cH:10][cH:11][cH:12]1", + "[cH:1]1[cH:4][cH:3][cH:2][c:6](-[c:7]2[cH:8][cH:9][cH:10][cH:11][cH:12]2)[cH:5]1"}; + + auto rxns = ChemDrawFileToChemicalReactions(fname); + CHECK(rxns.size() == 1); + unsigned int i = 0; + int count = 0; + for (auto &mol : rxns[0]->getReactants()) { + CHECK(mol->getProp("CDX_SCHEME_ID") == 397); + CHECK(mol->getProp("CDX_STEP_ID") == 398); + CHECK(mol->getProp("CDX_REAGENT_ID") == i++); + CHECK(MolToSmiles(*mol) == expected[count++]); + } + i = 0; + for (auto &mol : rxns[0]->getProducts()) { + CHECK(mol->getProp("CDX_SCHEME_ID") == 397); + CHECK(mol->getProp("CDX_STEP_ID") == 398); + CHECK(mol->getProp("CDX_PRODUCT_ID") == i++); + CHECK(MolToSmiles(*mol) == expected[count++]); + } + + auto smarts = ChemicalReactionToRxnSmarts(*rxns[0]); + CHECK( + smarts == + "[#6&D2:2]1:[#6&D2:3]:[#6&D2:4]:[#6&D3:1](:[#6&D2:5]:[#6&D2:6]:1)-[#17&D1].[#6&D3](-[#5&D2]-[#6&D3:7]1:[#6&D2:8]:[#6&D2:9]:[#6&D2:10]:[#6&D2:11]:[#6&D2:12]:1)(-[#8&D1])-[#8&D1]>>[#6&D2:1]1:[#6&D2:5]:[#6&D3:6](:[#6&D2:2]:[#6&D2:3]:[#6&D2:4]:1)-[#6&D3:7]1:[#6&D2:8]:[#6&D2:9]:[#6&D2:10]:[#6&D2:11]:[#6&D2:12]:1"); + } + + SECTION("Github #7528 CDXML Grouped Agents in Reactions") { + // The failing case had fragments grouped with labels, ensure the grouped + // cersion and the ungrouped versions have the same results + auto fname = cdxmlbase + "github7467-grouped-fragments.cdxml"; + auto rxns = ChemDrawFileToChemicalReactions(fname); + CHECK(rxns.size() == 1); + fname = 
cdxmlbase + "github7467-ungrouped-fragments.cdxml"; + auto rxns2 = ChemDrawFileToChemicalReactions(fname); + + CHECK(ChemicalReactionToRxnSmarts(*rxns[0]) == + ChemicalReactionToRxnSmarts(*rxns2[0])); + + // Check to see if our understanding of grouped reagents in reactions is + // correct + fname = cdxmlbase + "reaction-with-grouped-templates.cdxml"; + auto rxns3 = ChemDrawFileToChemicalReactions(fname); + CHECK(rxns3.size() == 1); + std::string rxnb = R"RXN($RXN + + Mrv2004 062120241319 + + 2 0 +$MOL + + Mrv2004 06212413192D + + 5 5 0 0 0 0 999 V2000 + 2.6221 -4.6475 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 + 2.6221 -5.4725 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 + 3.4070 -5.7274 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 + 3.8918 -5.0600 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 + 3.4070 -4.3926 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 + 1 2 2 0 0 0 0 + 2 3 1 0 0 0 0 + 3 4 2 0 0 0 0 + 4 5 1 0 0 0 0 + 5 1 1 0 0 0 0 +M END +$MOL + + Mrv2004 06212413192D + + 11 11 0 0 0 0 999 V2000 + 6.9305 -4.5100 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 + 6.9305 -5.3350 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 + 7.6450 -5.7475 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 + 8.3594 -5.3350 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 + 8.3594 -4.5100 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 + 7.6450 -4.0975 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 + 8.6171 -4.4825 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 + 8.6171 -5.3075 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 + 9.4020 -5.5624 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 + 9.8868 -4.8950 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 + 9.4020 -4.2276 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 + 1 2 1 0 0 0 0 + 6 1 1 0 0 0 0 + 2 3 1 0 0 0 0 + 3 4 1 0 0 0 0 + 4 5 1 0 0 0 0 + 5 6 1 0 0 0 0 + 7 8 2 0 0 0 0 + 11 7 1 0 0 0 0 + 8 9 1 0 0 0 0 + 9 10 2 0 0 0 0 + 10 11 1 0 0 0 0 +M END +)RXN"; + std::unique_ptr rxn_mb{RxnBlockToChemicalReaction(rxnb)}; + // CDXMLToReaction is sanitized by default, this might be a mistake... 
+ unsigned int failed; + RxnOps::sanitizeRxn( + *rxn_mb, failed, + RxnOps::SANITIZE_ADJUST_REACTANTS | RxnOps::SANITIZE_ADJUST_PRODUCTS, + RxnOps::MatchOnlyAtRgroupsAdjustParams()); + + CHECK(rxns3[0]->getNumReactantTemplates() == + rxn_mb->getNumReactantTemplates()); + CHECK(ChemicalReactionToRxnSmarts(*rxns3[0]) == + ChemicalReactionToRxnSmarts(*rxn_mb)); + } +} + diff --git a/External/ChemDraw/test.cpp b/External/ChemDraw/test.cpp new file mode 100644 index 000000000..5fadb985b --- /dev/null +++ b/External/ChemDraw/test.cpp @@ -0,0 +1,1414 @@ +// +// Copyright (c) 2024 Glysade Inc and other RDkit contributors +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Novartis Institutes for BioMedical Research Inc. +// nor the names of its contributors may be used to endorse or promote +// products derived from this software without specific prior written +// permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// + +#include "chemdraw.h" +#include +#include "RDGeneral/test.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +using namespace RDKit; +using namespace RDKit::v2; + +std::string canon(const std::string &smi) { + auto *m = SmilesToMol(smi); + auto res = MolToSmiles(*m); + delete m; + return res; +} + +void check_smiles_and_roundtrip(const RWMol &m, const std::string &expected) { + CHECK(MolToSmiles(m) == expected); + // std::cout << "*********" << std::endl; + // std::cout << MolToMolBlock(m) << std::endl; + std::unique_ptr mol(MolBlockToMol(MolToMolBlock(m))); + CHECK(MolToSmiles(*mol) == expected); +} + +TEST_CASE("CDXML") { + std::string cdxmlbase = + std::string(getenv("RDBASE")) + "/Code/GraphMol/test_data/CDXML/"; + + SECTION("SIMPLE") { + std::string cdxml1 = R"( + + + + + + + + + + + + + Boc)"; + std::stringstream iss(cdxml1); + { + auto mols = MolsFromChemDrawDataStream(iss); + for (auto &mol : mols) { + CHECK(MolToSmiles(*mol) == "CC(C)(C)OC(=O)C1CCCCCC1"); + } + } + } +} + +TEST_CASE("CDXML Advanced") { + std::string cdxmlbase = + std::string(getenv("RDBASE")) + "/Code/GraphMol/test_data/CDXML/"; + SECTION("RING CHIRALITY") { + std::string fname = cdxmlbase + "ring-stereo1.cdxml"; + std::vector expected = {"C1CC[C@H]2CCCC[C@H]2C1"}; + auto mols = MolsFromChemDrawFile(fname); + CHECK(mols.size() == expected.size()); + int i = 0; + for (auto &mol : mols) { 
+ CHECK(MolToSmiles(*mol) == expected[i++]); + } + } + SECTION("SIMPLE CHIRAL") { + std::string fname = cdxmlbase + "chirality1.cdxml"; + std::vector expected = {"C[C@H](N)C[C@H](C)N"}; + auto mols = MolsFromChemDrawFile(fname); + CHECK(mols.size() == expected.size()); + int i = 0; + for (auto &mol : mols) { + CHECK(MolToSmiles(*mol) == expected[i++]); + } + } + SECTION("CDXML-CISTRANS") { + auto fname = cdxmlbase + "cistrans1.cdxml"; + std::vector expected = {"F/C(I)=C(\\Cl)Br"}; + auto mols = MolsFromChemDrawFile(fname); + CHECK(mols.size() == expected.size()); + int i = 0; + for (auto &mol : mols) { + CHECK(MolToSmiles(*mol) == expected[i++]); + } + } + SECTION("DEUTERIUM") { + auto fname = cdxmlbase + "deuterium.cdxml"; + { + std::vector expected = { + "[2H]c1c([2H])c([2H])c([2H])c([2H])c1[2H]"}; + auto mols = MolsFromChemDrawFile(fname); + CHECK(mols.size() == expected.size()); + int i = 0; + for (auto &mol : mols) { + CHECK(MolToSmiles(*mol) == expected[i++]); + } + } + { + std::vector expected = { + "[2H]C1=C([2H])C([2H])=C([2H])C([2H])=C1[2H]"}; + ChemDrawParserParams params; + params.sanitize = false; + params.removeHs = false; + auto mols = MolsFromChemDrawFile(fname, params); + CHECK(mols.size() == expected.size()); + int i = 0; + for (auto &mol : mols) { + CHECK(MolToSmiles(*mol) == expected[i++]); + } + } + { + std::vector expected = { + "[2H]C1=C([2H])C([2H])=C([2H])C([2H])=C1[2H]"}; + ChemDrawParserParams params; + params.sanitize = false; + params.removeHs = true; + auto mols = MolsFromChemDrawFile(fname, params); + CHECK(mols.size() == expected.size()); + int i = 0; + for (auto &mol : mols) { + CHECK(MolToSmiles(*mol) == expected[i++]); + } + } + } + SECTION("Queries") { + { + auto fname = cdxmlbase + "query-atoms.cdxml"; + + std::vector expected = {"*c1ccccc1", "*c1ccccc1", + "*c1ccccc1"}; + std::vector expected_smarts = { + "[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1-*", + "[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1-[!#1]", + 
"[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1-[!#6&!#1]"}; + auto mols = MolsFromChemDrawFile(fname); + CHECK(mols.size() == expected.size()); + int i = 0; + for (auto &mol : mols) { + CHECK(MolToSmarts(*mol) == expected_smarts[i]); + CHECK(MolToSmiles(*mol) == expected[i++]); + } + } + { + auto fname = cdxmlbase + "anybond.cdxml"; + auto mols = MolsFromChemDrawFile(fname); + CHECK(mols.size() == 1); + CHECK(MolToSmiles(*mols[0]) == "C1CCC~CC1"); + CHECK(MolToSmarts(*mols[0]) == "[#6]1~[#6]-[#6]-[#6]-[#6]-[#6]-1"); + } + } + SECTION("ElementList") { + auto fname = cdxmlbase + "element-list.cdxml"; + + std::vector expected = {"[C]CC"}; + std::vector expected_smarts = {"[#6]-[#6]-[#6,#7,#8,#16]"}; + auto mols = MolsFromChemDrawFile(fname); + CHECK(mols.size() == expected.size()); + int i = 0; + for (auto &mol : mols) { + CHECK(MolToSmarts(*mol) == expected_smarts[i]); + CHECK(MolToSmiles(*mol) == expected[i++]); + } + } + SECTION("Enhanced Stereo") { + auto fname = cdxmlbase + "beta-cypermethrin.cdxml"; + std::vector expected = { + "CC1(C)[C@H](C=C(Cl)Cl)[C@H]1C(=O)O[C@@H](C#N)c1cccc(Oc2ccccc2)c1"}; + std::vector expected_cx = { + "CC1(C)[C@H](C=C(Cl)Cl)[C@@H]1C(=O)O[C@H](C#N)c1cccc(Oc2ccccc2)c1 |&1:3,&2:8,12|"}; + auto mols = MolsFromChemDrawFile(fname); + CHECK(mols.size() == expected.size()); + int i = 0; + for (auto &mol : mols) { + mol.get()->clearConformers(); + CHECK(MolToSmiles(*mol) == expected[i]); + CHECK(MolToCXSmiles(*mol) == expected_cx[i++]); + } + } + SECTION("Enhanced Stereo 2") { + auto fname = cdxmlbase + "beta-cypermethrin-or-abs.cdxml"; + std::vector expected = { + "CC1(C)[C@H](C=C(Cl)Cl)[C@H]1C(=O)O[C@@H](C#N)c1cccc(Oc2ccccc2)c1"}; + std::vector expected_cx = { + "CC1(C)[C@H](C=C(Cl)Cl)[C@@H]1C(=O)O[C@H](C#N)c1cccc(Oc2ccccc2)c1 |a:3,o1:8,12|"}; + auto mols = MolsFromChemDrawFile(fname); + CHECK(mols.size() == expected.size()); + int i = 0; + SmilesWriteParams wp; + for (auto &mol : mols) { + auto tomol = std::unique_ptr((ROMol*)mol.release()); + 
tomol.get()->clearConformers(); + RDKit::canonicalizeStereoGroups(tomol); + + CHECK(MolToSmiles(*tomol) == expected[i]); + CHECK(MolToCXSmiles(*tomol, wp) == expected_cx[i++]); + } + } + + SECTION("Bad CDXML") { + auto fname = cdxmlbase + "bad-cdxml.cdxml"; + // Only one passes sanitization + { + std::vector expected = {"*c1ccccc1"}; + std::vector expected_smarts = { + "[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1-*", + }; + auto params = ChemDrawParserParams(); + auto mols = MolsFromChemDrawFile(fname, params); + for (auto &mol : mols) { + std::cerr << MolToSmarts(*mol) << std::endl; + std::cerr << MolToSmiles(*mol) << std::endl; + } + CHECK(mols.size() == expected.size()); + int i = 0; + for (auto &mol : mols) { + CHECK(MolToSmarts(*mol) == expected_smarts[i]); + CHECK(MolToSmiles(*mol) == expected[i++]); + } + } + // setting sanitization to false, we get both + std::vector expected = {"*C1=C([H])C([H])=C([H])C([H])=C1[H]", + "*C1=C([H])N([H])=C([H])C([H])=C1[H]"}; + std::vector expected_smarts = { + "[#6]1(=[#6](-[#6](=[#6](-[#6](=[#6]-1-*)-[H])-[H])-[H])-[H])-[H]", + "[#6]1(=[#6](-[#6](=[#7](-[#6](=[#6]-1-[!#1])-[H])-[H])-[H])-[H])-[H]", + }; + ChemDrawParserParams params; + params.sanitize = false; + auto mols = MolsFromChemDrawFile(fname, params); + CHECK(mols.size() == expected.size()); + int i = 0; + for (auto &mol : mols) { + CHECK(MolToSmarts(*mol) == expected_smarts[i]); + CHECK(MolToSmiles(*mol) == expected[i++]); + } + } + + SECTION("Fusion with chirality") { + auto fname = cdxmlbase + "fusion-chiral.cdxml"; + auto mols = MolsFromChemDrawFile(fname); + std::vector expected = {"C[C@@H](O)[C@@H](C)O"}; + CHECK(mols.size() == expected.size()); + int i = 0; + for (auto &mol : mols) { + CHECK(MolToSmiles(*mol) == expected[i++]); + } + } + + SECTION("deuterium atom") { + auto fname = cdxmlbase + "deuterium-atom.cdxml"; + ChemDrawParserParams params; + params.sanitize = false; + params.removeHs = false; + auto mols = MolsFromChemDrawFile(fname, params); + std::vector 
expected = {"[2H]"}; + CHECK(mols.size() == expected.size()); + int i = 0; + for (auto &mol : mols) { + CHECK(MolToSmiles(*mol) == expected[i++]); + } + } + SECTION("ChemDraw Template 2 from the synthesis-workshop") { + // this was another hella fun to validate the stereo-chemistry... + // there were so many stereo warnings in chemdraw, I'm just going to + // assume the rdkit is correct here... + auto fname = cdxmlbase + "chemdraw_template2.cdxml"; + std::string talatisamine = "CCN1C[C@]2(COC)CCC(OC)[C@@]34[C@@H]5C[C@@H]6C(OC)C[C@@](O)([C@H]5[C@H]6O)[C@@H](C[C@H]23)[C@H]14"; + auto mols = MolsFromChemDrawFile(fname); + std::vector expected = { + talatisamine, //0 + "*", + "C", + "[F]", + "[B]", + "[C]", + "[2H]", + talatisamine, + "*", + "C", + "[F]", // 10 + "[B]", + "[C]", + "[2H]", + talatisamine, + "*", + "C", + "[F]", + "[B]", + "[C]", + "[2H]", // 20 + talatisamine, + "*", + "C", + "[F]", + "[B]", + "[C]", + "[2H]", + talatisamine, + "CCN1C[C@]2(COC)CC[C@H](OC)[C@]34C1C(C[C@H]23)[C@@]1(O)CC(OC)[C@H]2C[C@@H]4[C@@H]1[C@H]2O", + "*", // 30 + "[B]", + "[C]", + "[2H]", + "C", + "[F]", + "*", + "C", + "[F]", + "[B]", + "[C]", // 40 + "[2H]", + talatisamine, + "*", + "C", + "[F]", + "[B]", + "[C]", + "[2H]", + talatisamine, + "*", // 50 + "C", + "[F]", + "[B]", + "[C]", + "[2H]", + "CC1CC[C@]2(O)[C@]3(C)C[C@]4(O)O[C@@]2([C@@H]1O)C1(O)C4(C)C(O)(C(C)C)[C@@H](O)[C@]13O", + "CC1=C(C(C)C)[C@@H](O)[C@@]2(O)[C@@]3(C)CC(=O)O[C@@]4([C@H](O)C(C)CC[C@]34O)[C@@]12O", + "CC1=C[C@@]23OC(=O)C[C@@](C)([C@@]2(O)CC1)[C@]1(O)[C@H](O)C2(C(C)C)OC2(C)[C@@]31O", + "*", + "[B]", // 60 + "[C]", + "CC1CC[C@@H]2[C@]3(C)C[C@@H]4O[C@@]2(C1)C1[C@@H]3CC(C(C)C)C14C", + "[2H]", + "*", + "[B]", + "[C]", + "C", + "CC1CC[C@]2(O)[C@]3(C)C[C@]4(O)O[C@@]2([C@@H]1O)C1(O)C4(C)C(O)(C(C)C)[C@@H](O)[C@]13O", + "[2H]"}; + CHECK(mols.size() == expected.size()); + int i = 0; + for (auto &mol : mols) { + INFO(std::to_string(i) + " " + MolToSmiles(*mol)); + CHECK(MolToSmiles(*mol) == expected[i++]); + } + } + 
SECTION("ChemDraw Template 3 from the synthesis-workshop") { + // this was another hella fun to validate the stereo-chemistry... + // there were so many stereo warnings in chemdraw, I'm just going to + // assume + // the rdkit is correct here... + auto fname = cdxmlbase + "chemdraw_template3.cdxml"; + auto mols = MolsFromChemDrawFile(fname); + std::vector expected = { + "CCC/C=C/C=C/C(=O)O[C@H]1/C(=C/C(=O)OC)C[C@H]2C[C@H]([C@@H](C)O)OC(=O)C[C@H](O)C[C@@H]3C[C@H](OC(C)=O)C(C)(C)[C@](O)(C[C@@H]4C/C(=C/C(=O)OC)C[C@H](/C=C/C(C)(C)[C@]1(O)O2)O4)O3", + "[B]", + "*", + "[C]", + "CCC/C=C/C=C/C(=O)O[C@H]1/C(=C/C(=O)OC)C[C@H]2C[C@H]([C@@H](C)O)OC(=O)C[C@H](O)C[C@@H]3C[C@H](OC(C)=O)C(C)(C)[C@](O)(C[C@@H]4C/C(=C/C(=O)OC)C[C@H](/C=C/C(C)(C)[C@]1(O)O2)O4)O3", + "[B]", + "*", + "[C]", + "CCC/C=C/C=C/C(=O)O[C@H]1/C(=C/C(=O)OC)C[C@@H](C[C@@H](O)[C@@H](C)O)O[C@@]1(O)C(C)(C)/C=C/C=O", + "*", + "[C]", + "C=C(C[C@H]([O])C[C@]1(O)O[C@H](C[C@@H](O)CC(=O)O)C[C@H](OC(C)=O)C1(C)C)C[Si](C)(C)C", + "*.CC[Si](CC)CC", + "CC[Si](C)(CC)CC", + "CC[Si](C)(CC)CC", + "CC", + "CC", + "*", + "C=C(C[C@H]([O])C[C@]1(O)O[C@H](C[C@@H](O)CC(=O)O)C[C@H](OC(C)=O)C1(C)C)C[Si](C)(C)C", + "*.CC[Si](CC)CC", + "CCC/C=C/C=C/C(=O)O[C@H]1/C(=C/C(=O)OC)C[C@@H](C[C@@H](O)[C@@H](C)O)O[C@@]1(O)C(C)(C)/C=C/C=O", + "[C]"}; + int i = 0; + for (auto &mol : mols) { + INFO(i); + check_smiles_and_roundtrip(*mol, expected[i++]); + } + } + SECTION("protecting group") { + auto fname = cdxmlbase + "protecting-groups.cdxml"; + auto mols = MolsFromChemDrawFile(fname); + std::vector expected = {"CC[Si](C)(CC)CC", "CC"}; + CHECK(mols.size() == expected.size()); + int i = 0; + for (auto &mol : mols) { + check_smiles_and_roundtrip(*mol, expected[i++]); + } + } + SECTION("protecting group 2") { + auto fname = cdxmlbase + "protecting-groups2.cdxml"; + auto mols = MolsFromChemDrawFile(fname); + std::vector expected = {"CC[Si](C)(CC)CC", "CC"}; + CHECK(mols.size() == expected.size()); + int i = 0; + for (auto &mol : mols) { + 
check_smiles_and_roundtrip(*mol, expected[i++]); + } + } + + SECTION("floating protecting group") { + auto fname = cdxmlbase + "floating-protecting-group.cdxml"; + // This is a weird one, chemdraw simply ignores the error that causes the + // bond issue, we should probably drop the floating fragment here if + // we are sanitizing + auto mols = MolsFromChemDrawFile(fname); + std::vector expected = { + "*", + "C=C(C[C@H]([O])C[C@]1(O)O[C@H](C[C@@H](O)CC(=O)O)C[C@H](OC(C)=O)C1(C)C)C[Si](C)(C)C", + "*.CC[Si](CC)CC"}; + CHECK(mols.size() == expected.size()); + int i = 0; + for (auto &mol : mols) { + INFO(i); + check_smiles_and_roundtrip(*mol, expected[i++]); + } + } + + SECTION("Missing File Name") { + try { + auto mols = MolsFromChemDrawFile("missing file"); + CHECK(0); // Bad file exception not caught + } catch (RDKit::BadFileException &) { + } + } + + SECTION("Aromatic ring (bondorder==4") { + auto fname = cdxmlbase + "aromatic.cdxml"; + auto mols = MolsFromChemDrawFile(fname); + std::vector expected = {"c1ccccc1"}; + CHECK(mols.size() == expected.size()); + int i = 0; + for (auto &mol : mols) { + check_smiles_and_roundtrip(*mol, expected[i++]); + } + } + SECTION("Malformed") { + auto fname = cdxmlbase + "malformed.cdxml"; + try { + auto mols = MolsFromChemDrawFile(fname); + CHECK(0); + } catch (FileParseException &e) { + // CHECK(std::string(e.what()) == "expected > at line: 373"); + } + } + SECTION("Lots of stereo") { + { + auto fname = cdxmlbase + "stereo.cdxml"; + std::vector expected = { + "C[C@@H](Cl)[C@H](N)O.C[C@@H](F)[C@H](N)O.C[C@H](Br)[C@@H](N)O.C[C@H](I)[C@@H](N)O"}; + auto mols = MolsFromChemDrawFile(fname); + CHECK(mols.size() == expected.size()); + int i = 0; + for (auto &mol : mols) { + check_smiles_and_roundtrip(*mol, expected[i++]); + } + } + + { // The above, but broken out for easier testing + std::vector filenames = {"stereo1.cdxml", "stereo2.cdxml", + "stereo3.cdxml", "stereo4.cdxml"}; + std::vector expected = { + "C[C@H](I)[C@@H](N)O", 
"C[C@@H](I)[C@H](N)O", "C[C@@H](Cl)[C@H](N)O", + "C[C@H](Br)[C@@H](N)O"}; + + for (auto i = 0u; i < filenames.size(); ++i) { + auto fname = cdxmlbase + filenames[i]; + auto mols = MolsFromChemDrawFile(fname); + CHECK(mols.size() == 1); + auto &m = *mols.back(); + check_smiles_and_roundtrip(m, expected[i]); + } + } + + { + auto fname = cdxmlbase + "wavy.cdxml"; + std::vector expected = {"Cc1cccc(C)c1NC(=O)N=C1CCCN1C", + "Cc1cccc(C)c1NC(=O)/N=C1\\CCCN1C"}; + auto mols = MolsFromChemDrawFile(fname); + CHECK(mols.size() == expected.size()); + int i = 0; + for (auto &mol : mols) { + if (i == 0) { + CHECK(mol->getBondWithIdx(11)->getStereo() == + Bond::BondStereo::STEREOANY); + } + check_smiles_and_roundtrip(*mol, expected[i++]); + } + } + { + auto fname = cdxmlbase + "wavy-single.cdxml"; + std::vector expected = {"CCCC"}; + auto mols = MolsFromChemDrawFile(fname); + CHECK(mols.size() == expected.size()); + int i = 0; + for (auto &mol : mols) { + CHECK(mol->getBondWithIdx(0)->getBondDir() == Bond::BondDir::NONE); + CHECK(mol->getBondWithIdx(1)->getBondDir() == Bond::BondDir::NONE); + CHECK(mol->getBondWithIdx(2)->getBondDir() == Bond::BondDir::NONE); + check_smiles_and_roundtrip(*mol, expected[i++]); + } + } + } + SECTION("Lots of bad stereo") { + { + auto fname = cdxmlbase + "bad-id.cdxml"; + auto mols = MolsFromChemDrawFile(fname); + CHECK(mols.size() == 0); + } + { + auto fname = cdxmlbase + "bad-coords.cdxml"; + auto mols = MolsFromChemDrawFile(fname); + CHECK(mols.size() == 0); + } + { + auto fname = cdxmlbase + "bad-bondorder2.cdxml"; + auto mols = MolsFromChemDrawFile(fname); + CHECK(mols.size() == 0); + } + } +} + +TEST_CASE("REACTIONS") { + std::string cdxmlbase = + std::string(getenv("RDBASE")) + "/Code/GraphMol/test_data/CDXML/"; + SECTION("REACTION") { + std::string fname = cdxmlbase + "reaction-with-boc.cdxml"; + std::vector expected = {"CC(C)(C)OC(=O)C1CCCCCC1[*:1]", + "c1ccc([*:1])cc1", "C1CC1", "C1CCC1"}; + { + auto mols = MolsFromChemDrawFile(fname); + 
CHECK(mols.size() == expected.size()); + int i = 0; + for (auto &mol : mols) { + CHECK(MolToSmiles(*mol) == expected[i++]); + } + } + { + // v1 api + auto mols = MolsFromChemDrawFile(fname); + CHECK(mols.size() == expected.size()); + int i = 0; + for (auto &mol : mols) { + CHECK(MolToSmiles(*mol) == expected[i++]); + } + } + } + SECTION("REACTION2") { + auto fname = cdxmlbase + "rxn1.cdxml"; + std::vector expected = { + "Cl[c:1]1[cH:4][cH:3][cH:2][cH:6][cH:5]1", + "OC(O)B[c:7]1[cH:8][cH:9][cH:10][cH:11][cH:12]1", + "[cH:1]1[cH:4][cH:3][cH:2][c:6](-[c:7]2[cH:8][cH:9][cH:10][cH:11][cH:12]2)[cH:5]1"}; + auto mols = MolsFromChemDrawFile(fname); + int i = 0; + for (auto &mol : mols) { + CHECK(mol->getProp("CDX_SCHEME_ID") == 397); + CHECK(mol->getProp("CDX_STEP_ID") == 398); + if (i == 0) { + CHECK(mol->getProp("CDX_REAGENT_ID") == 0); + } else if (i == 1) { + CHECK(mol->getProp("CDX_REAGENT_ID") == 1); + } else if (i == 2) { + CHECK(mol->getProp("CDX_PRODUCT_ID") == 0); + } + CHECK(MolToSmiles(*mol) == expected[i++]); + } + } +} + +TEST_CASE("atropisomers") { + std::string cdxmlbase = + std::string(getenv("RDBASE")) + "/Code/GraphMol/test_data/CDXML/"; + + SECTION("atropisomer") { + { + // XXX the rounding here is a little different from the original RDKit + // CDXML parser 0.64->0.39999 + // This is something we should probably figure out. 
+ std::vector filenames = {"atrop1.cdxml"}; + std::vector expected = { + "C[C]1[C][CH]C(Cl)C(C)=C1c1c(C)ccc(Cl)c1C |(-2.936,-0.12,;-2.936,-1.66,;-1.602,-2.43,;-1.602,-3.97,;-2.936,-4.74,;-2.93,-6.28,;-4.27,-3.97,;-5.603,-4.74,;-4.27,-2.43,;-5.603,-1.66,;-5.603,-0.12,;-4.27,0.639999,;-6.937,0.639999,;-8.271,-0.12,;-8.271,-1.66,;-9.604,-2.43,;-6.937,-2.43,;-6.937,-3.97,),^1:1,3,^2:2,wU:8.8|"}; + for (auto i = 0u; i < filenames.size(); ++i) { + auto fname = cdxmlbase + filenames[i]; + auto mol = MolsFromChemDrawFile(fname); + + SmilesWriteParams ps; + auto smi = MolToCXSmiles(*(mol[0].get()), ps, + SmilesWrite::CXSmilesFields::CX_ALL); + CHECK(smi == expected[i]); + } + } + } +} + +TEST_CASE("bad stereo in a natural product") { + std::string cdxmlbase = + std::string(getenv("RDBASE")) + "/Code/GraphMol/test_data/CDXML/"; + SECTION("case 1") { + auto fname = cdxmlbase + "stereo5.cdxml"; + auto mols = MolsFromChemDrawFile(fname); + REQUIRE(mols.size() == 1); + CHECK( + MolToSmiles(*mols[0]) == + "Cc1ccc2n1[C@@H]1[C@@H]3O[C@]([C@H](C)O)(C=C2)[C@H]1c1ccc(C)n1[C@@H]3C"); + } +} + +TEST_CASE("Github #6262: preserve bond wedging") { + std::string cdxmlbase = + std::string(getenv("RDBASE")) + "/Code/GraphMol/test_data/CDXML/"; + SECTION("case 1") { + auto fname = cdxmlbase + "stereo6.cdxml"; + auto mols = MolsFromChemDrawFile(fname); + REQUIRE(mols.size() == 1); + { + REQUIRE(mols[0]->getBondBetweenAtoms(2, 5)); + unsigned int cfg = 0; + CHECK(mols[0]->getBondBetweenAtoms(2, 5)->getPropIfPresent( + common_properties::_MolFileBondCfg, cfg)); + CHECK(cfg == 1); + } + { + REQUIRE(mols[0]->getBondBetweenAtoms(1, 4)); + unsigned int cfg = 0; + CHECK(mols[0]->getBondBetweenAtoms(1, 4)->getPropIfPresent( + common_properties::_MolFileBondCfg, cfg)); + CHECK(cfg == 3); + } + { + REQUIRE(mols[0]->getBondBetweenAtoms(3, 8)); + unsigned int cfg = 0; + CHECK(mols[0]->getBondBetweenAtoms(3, 8)->getPropIfPresent( + common_properties::_MolFileBondCfg, cfg)); + CHECK(cfg == 2); + } + } +} + 
+TEST_CASE("Github #6887: and1 or1 in same mol") {
+  SECTION("case 1") {
+    std::string cdxml1 = R"(
+
+
+
+
+
+
+
+
+
+
+
+
+
+&1or2or1OMeClBr
+)";
+    std::stringstream iss(cdxml1);
+    auto mols = MolsFromChemDrawDataStream(iss);
+    mols[0]->clearConformers();
+    CHECK(MolToCXSmiles(*mols[0]) ==
+          "CO[C@H](C)C[C@H](Cl)C[C@H](C)Br |o1:5,o2:8,&1:2|");
+  }
+}
+
+TEST_CASE("Github #7528 - read fragments in groups") {
+  std::string cdxmlbase =
+      std::string(getenv("RDBASE")) + "/Code/GraphMol/test_data/CDXML/";
+  SECTION("case 1") {
+    auto fname = cdxmlbase + "github7467-grouped-fragments.cdxml";
+    ChemDrawParserParams params;
+    params.sanitize = false;
+    auto mols = MolsFromChemDrawFile(fname, params);
+    REQUIRE(mols.size() == 2);
+  }
+}
+
+TEST_CASE("Github #7501 - dative bonds") {
+  std::string cdxmlbase =
+      std::string(getenv("RDBASE")) + "/Code/GraphMol/test_data/CDXML/";
+  SECTION("case 1") {
+    auto fname = cdxmlbase + "github7501-dative.cdxml";
+    ChemDrawParserParams params;
+    auto mols = MolsFromChemDrawFile(fname, params);
+    CHECK(MolToSmiles(*mols[0]) ==
+          "C[CH2](C)->[Os]12<-[CH3]CC[NH]->1CC=[NH]->2");  // All datives to the
+                                                            // Osmium
+  }
+}
+
+TEST_CASE("Synthesis-workshop") {
+  std::string cdxmlbase =
+      std::string(getenv("RDBASE")) + "/Code/GraphMol/test_data/CDXML/";
+  SECTION("ChemDraw Template from the synthesis-workshop") {
+    // this was hella fun to validate the stereo-chemistry...
+    auto fname = cdxmlbase + "chemdraw_template1.cdxml";
+    auto mols = MolsFromChemDrawFile(fname);
+    std::vector expected = {
+        "CCC/C=C/C=C/C(=O)O[C@H]1/C(=C/C(=O)OC)C[C@H]2C[C@H]([C@@H](C)O)OC(=O)C[C@H](O)C[C@@H]3C[C@H](OC(C)=O)C(C)(C)[C@](O)(C[C@@H]4C/C(=C/C(=O)OC)C[C@H](/C=C/C(C)(C)[C@]1(O)O2)O4)O3",
+        "[B]",
+        "*",
+        "[C]",
+        "Cc1ccc2n1[C@@H]1[C@@H]3O[C@]([C@H](C)O)(C=C2)[C@H]1c1ccc(C)n1[C@@H]3C",
+        // this is may or may not be correct, but the structure is drawn
+        // incorrectly.
+        // There's a test below which fixes this
+        "Cc1ccc2n1[C@H](C)C(=O)[C@@H]1[C@H]2C(=O)C=Cc2ccc(C)n21",
+        "Cc1ccc2ccc(=O)ccn12",
+        "Cc1cccn1[C@H](C)C=O",
+        "Cc1ccc2ccc([O-])cc[n+]1-2",
+        "Cc1ccc2ccc(=O)ccn12",
+        "Cc1cccn1[C@H](C)C(C#N)O[Si](C)(C)C",
+        "CC1CC[C@]2(O)[C@]3(C)C[C@]4(O)O[C@@]2([C@@H]1O)C1(O)C4(C)C(O)(C(C)C)[C@@H](OC(=O)c2ccc[nH]2)[C@]13O",
+        "C=C(C)[C@H]1CC(=O)CC2=C(C1)[C@H]1C(=O)O[C@H]3C[C@@](C)(O)[C@@H](C2=O)[C@@H]13"};
+    CHECK(mols.size() == expected.size());
+    int i = 0;
+    for (auto &mol : mols) {
+      INFO(i);
+      CHECK(MolToSmiles(*mol) == expected[i++]);
+    }
+  }
+}
+
+TEST_CASE("Output CDXML") {
+  SECTION("basic") {
+    auto mol =
+        "N#Cc1ccc(cc1Cl)O[C@@H]1CC[C@H](CC1)NC(=O)c1ccc(nn1)N1CCC(CC1)CN1CCN(CC1)c1cc2C(=O)N(C(=O)c2cc1F)C1CCC(=O)NC1=O"_smiles;
+    // auto mol = "[C@H](I)(F)Br"_smiles;
+    // auto res = MolToMolBlock(*mol);
+    // mol->debugMol(std::cerr);
+    std::string output = MolToChemDrawBlock(*mol);
+    std::stringstream cdxml;
+    cdxml << output;
+    auto mols = MolsFromChemDrawDataStream(cdxml);
+    CHECK(MolToSmiles(*mols[0]) == MolToSmiles(*mol));
+  }
+}
+
+TEST_CASE("Brackets") {
+  std::string cdxmlbase =
+      std::string(getenv("RDBASE")) + "/Code/GraphMol/test_data/CDXML/";
+  SECTION("MultipleGroups") {
+    auto fname = cdxmlbase + "sgroups_and_remove_atoms_4.cdxml";
+    ChemDrawParserParams params;
+    auto mols = MolsFromChemDrawFile(fname, params);
+    CHECK(MolToSmiles(*mols[0]) == "CCN(C)CC");
+  }
+}
+
+TEST_CASE("Enhanced Stereochem") {
+  SECTION("Round Trip") {
+    auto mol = "F[C@H](Cl)Br |o1:1|"_smiles;
+    CHECK(MolToCXSmiles(*mol) == "F[C@H](Cl)Br |o1:1|");
+    auto cdx = MolToChemDrawBlock(*mol);
+    std::stringstream iss(cdx);
+    auto mols = MolsFromChemDrawDataStream(iss);
+
+    SmilesWriteParams ps;
+    auto cxsmi = MolToCXSmiles(*mols[0], ps,
+                               SmilesWrite::CXSmilesFields::CX_ALL ^
+                                   SmilesWrite::CXSmilesFields::CX_COORDS);
+    CHECK(cxsmi == "F[C@H](Cl)Br |o1:1|");
+  }
+}
+
+TEST_CASE("Round TRIP") {
+  // Every .mol file found below $RDBASE is written out as a ChemDraw block and
+  // parsed back; the SMILES before and after the trip have to agree.
+  std::string code_path = std::string(getenv("RDBASE"));
+
+  SECTION("round trip") {
+    std::set<std::string> exceptions = {"stereo3d_unknown.mol", "mrv-sma.mol",
+                                        "github2040_1.mol"};
+    int failed = 0;
+    int total = 0;
+    RDLog::LogStateSetter blocker;
+    for (const auto &entry :
+         std::filesystem::recursive_directory_iterator(code_path)) {
+      if (entry.path().string().find("ChemDraw") != std::string::npos)
+        continue;  // Skip ChemDraw directory
+      if (entry.path().string().find("build") != std::string::npos) continue;
+      if (entry.is_regular_file() &&
+          entry.path().extension().string() == ".mol") {
+        if (exceptions.find(entry.path().filename().string()) !=
+            exceptions.end()) {
+          std::cerr << "Skipping exception: " << entry.path() << std::endl;
+          continue;
+        }
+        // Owning pointer: every `continue` below releases the molecule.
+        std::unique_ptr<RWMol> mol;
+        try {
+          mol.reset(MolFileToMol(entry.path().string()));
+        } catch (...) {
+          continue;
+        }
+        if (mol) {
+          // CDX doesn't support atom map numbers apparently
+          total++;
+          for (auto atom : mol->atoms()) {
+            atom->setAtomMapNum(0);
+          }
+          // CDXML doesn't support ZERO bonds
+          bool haszerobond = false;
+          for (auto bond : mol->bonds()) {
+            if (bond->getBondType() == Bond::BondType::ZERO) {
+              haszerobond = true;
+              break;
+            }
+          }
+          if (haszerobond) {
+            continue;
+          }
+
+          // std::cerr << entry.path() << std::endl;
+          std::string cdx;
+          try {
+            cdx = MolToChemDrawBlock(*mol);
+          } catch (...) {
+            std::cerr << entry.path().filename().string() << std::endl;
+            std::cerr << "FAIL (cdxml-write-exception):" << entry.path()
+                      << std::endl;
+            failed++;
+            continue;
+          }
+          std::stringstream iss(cdx);
+          decltype(MolsFromChemDrawDataStream(iss)) mols;
+          try {
+            mols = MolsFromChemDrawDataStream(iss);
+          } catch (...) {
+            std::cerr << entry.path().filename().string() << std::endl;
+            std::cerr << "FAIL (cdxml-exception):" << entry.path() << std::endl;
+            failed++;
+            continue;  // already counted; do not also report it as "nomol"
+          }
+          auto smi1 = MolToSmiles(*mol);
+
+          if (mols.empty()) {
+            std::cerr << entry.path().filename().string() << std::endl;
+            std::cerr << "FAIL (nomol):" << entry.path() << std::endl;
+            mol->debugMol(std::cerr);
+            failed++;
+            continue;
+          }
+          auto smi2 = MolToSmiles(*mols[0]);
+          if (smi1 != smi2) {
+            // std::cerr <<
+            // "**************************************************************"
+            // << std::endl;
+            std::cerr << "FAIL:" << entry.path() << " " << smi1
+                      << " != " << smi2 << std::endl;
+            failed++;
+            // std::cerr << "molfile:" << smi1 << std::endl;
+            // std::cerr << "cdx :" << smi2 << std::endl;
+            // std::cerr << cdx << std::endl;
+          } else {
+            // std::cerr << "PASS:" << entry.path() << std::endl;
+          }
+          // CHECK(smi1 == smi2);
+        }
+      }
+    }
+    std::cerr << "Failed:" << failed << " out of " << total << std::endl;
+    REQUIRE(failed == 0);
+  }
+}
+
+TEST_CASE("Fragments") {
+  std::string path =
+      std::string(getenv("RDBASE")) + "/External/ChemDraw/test_data/";
+  SECTION("Single Atom Replacements") {
+    auto fname = path + "atom-to-fragment.cdxml";
+    auto mols = MolsFromChemDrawFile(fname);
+    REQUIRE(mols.size());
+    REQUIRE("CC=C=C(C)C" == MolToSmiles(*mols[0]));
+  }
+}
+
+TEST_CASE("Geometry") {
+  std::string path =
+      std::string(getenv("RDBASE")) + "/External/ChemDraw/test_data/";
+  SECTION("Single Atom Replacements") {
+    auto fname = path + "geometry-tetrahedral.cdxml";
+    auto mols = MolsFromChemDrawFile(fname);
+    REQUIRE(mols.size());
+    REQUIRE("C1[C@H]2C[C@@H]12" == MolToSmiles(*mols[0]));
+  }
+}
diff --git a/External/ChemDraw/test_3d.cpp b/External/ChemDraw/test_3d.cpp
new file mode 100644
index 000000000..2fcf7067b
--- /dev/null
+++ b/External/ChemDraw/test_3d.cpp
@@ -0,0 +1,98 @@
+//
+// Copyright (c) 2025 Glysade Inc and other RDkit contributors
+// All rights reserved.
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Novartis Institutes for BioMedical Research Inc. +// nor the names of its contributors may be used to endorse or promote +// products derived from this software without specific prior written +// permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+//
+
+#include "chemdraw.h"
+#include
+#include "RDGeneral/test.h"
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include <memory>
+
+using namespace RDKit;
+using namespace RDKit::v2;
+
+// Diagnostic only: prints the bond lengths of a 3D mol file, of the CDXML
+// file stored next to it and of an in-memory ChemDraw round trip, so that
+// they can be compared by eye.
+TEST_CASE("Round TRIP") {
+  const std::string code_path = std::string(getenv("RDBASE"));
+
+  // Writes "<bond idx> : <bond length>" to stderr for each bond of m, using
+  // the coordinates of the conformer with id 0.
+  const auto dumpBondLengths = [](const ROMol &m) {
+    const auto &conf = m.getConformer(0);
+    for (const auto bond : m.bonds()) {
+      const auto p1 = conf.getAtomPos(bond->getBeginAtomIdx());
+      const auto p2 = conf.getAtomPos(bond->getEndAtomIdx());
+      std::cerr << bond->getIdx() << " : " << (p1 - p2).length() << std::endl;
+    }
+  };
+
+  // Eventually this catch test is to see if round tripping mol 3d -> chemdraw
+  // returns reasonable coords, however chemdraw seems to forget about the
+  // original scale and converts to pixel drawing coords, so this test is kind
+  // of meaningless
+  SECTION("3D structs") {
+    const auto fname =
+        code_path + "/Code/GraphMol/FileParsers/test_data/Issue3514824.mol";
+    // Owning pointer: the molecule is released even when a REQUIRE below
+    // aborts the section.
+    const std::unique_ptr<RWMol> mol(MolFileToMol(fname));
+    REQUIRE(mol);
+    dumpBondLengths(*mol);
+    std::cerr << "----------" << std::endl;
+    {
+      // The .cdxml file stored next to the .mol file.
+      const auto fname2 =
+          code_path + "/Code/GraphMol/FileParsers/test_data/Issue3514824.cdxml";
+      const auto mols = MolsFromChemDrawFile(fname2);
+      REQUIRE(!mols.empty());  // mols[0] is dereferenced below
+      dumpBondLengths(*mols[0]);
+    }
+    std::cerr << "----------" << std::endl;
+    {
+      // In-memory round trip: mol -> ChemDraw block -> mol.
+      const auto cdx = MolToChemDrawBlock(*mol);
+      const auto mols = MolsFromChemDrawBlock(cdx);
+      REQUIRE(!mols.empty());
+      dumpBondLengths(*mols[0]);
+    }
+  }
+}
diff --git a/External/ChemDraw/test_6k.cpp b/External/ChemDraw/test_6k.cpp
new file mode 100644
index 000000000..9b56e9a10
--- /dev/null
+++ b/External/ChemDraw/test_6k.cpp
@@ -0,0 +1,330 @@
+//
+// Copyright (c) 2025 Glysade Inc and other RDkit contributors
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+//       notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+//       copyright notice, this list of conditions and the following
+//       disclaimer in the documentation and/or other materials provided
+//       with the distribution.
+//     * Neither the name of Novartis Institutes for BioMedical Research Inc.
+//       nor the names of its contributors may be used to endorse or promote
+//       products derived from this software without specific prior written
+//       permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// + +#include "chemdraw.h" +#include "chemdraw_doc.h" +#include +#include "RDGeneral/test.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ChemDrawStartInclude.h" +#include "chemdraw/CDXStdObjects.h" +#include "ChemDrawEndInclude.h" + +#include + +using namespace RDKit; +using namespace RDKit::v2; +namespace { +std::string replace(std::string &istr, const std::string &from, + const std::string &to) { + std::string str(istr); + size_t start_pos = str.find(from); + if (start_pos == std::string::npos) return str; + str.replace(start_pos, from.length(), to); + return str; +} + +bool hasNonSupportedFeatures(CDXDocument &document, const std::string &fname) { + // check for monomers + std::ifstream ifs(fname); + std::stringstream xml; + xml << ifs.rdbuf(); + // We should be able to figure this out from the node but... + if(xml.str().find("monomerAttachmentStructure_") != std::string::npos || + xml.str().find("Name=\"monomerAttachments") != std::string::npos) { + return true; + } + + for (auto node : document.ContainedObjects()) { + CDXDatumID id = (CDXDatumID)node.second->GetTag(); + switch (id) { + case kCDXObj_Page: + for (auto frag : node.second->ContainedObjects()) { + CDXDatumID id = (CDXDatumID)frag.second->GetTag(); + if (id == kCDXObj_Fragment) { + CDXFragment &fragment = (CDXFragment &)(*frag.second); + if (fragment.m_sequenceType == kCDXSeqType_Unknown) return true; + } else if (id == kCDXObj_BracketAttachment || id == kCDXObj_BracketedGroup) { + return true; + } + } + break; + case kCDXObj_ObjectTag: { + CDXObject &object = *((CDXObject *)node.second); + id = (CDXDatumID)object.GetTag(); + // Check for monomers + break; + } + default: + break; + } + } + return false; +} + +bool hasNonSupportedFeatures(const std::string &fname) { + auto doc = ChemDraw::ChemDrawToDocument(fname); + return hasNonSupportedFeatures(*doc, fname); +} + +TEST_CASE("Round TRIP") { + std::string path = + 
std::string(getenv("RDBASE")) + "/External/ChemDraw/test_data/CDXML6K/"; + + SECTION("round trip") { + // if we can't find the CDXML6K path, then don't run the test + if(!std::filesystem::exists(path)) { + return; + } + int failed = 0; + int saniFailed = 0; + int total = 0; + int parseable = 0; + int nomol = 0; + int badparse = 0; + int success = 0; + int smimatches = 0; + int nonSupported = 0; + int no_mol_in_doc = 0; + int bad_chemdraw_mol = 0; + RDLog::LogStateSetter blocker; + std::string cdxpath = path + "CDXML/"; + std::string molpath = path + "mol/"; + std::string smipath = path + "smiles/"; + + std::string failpath = path + "FAILED/"; + std::string nomolpath = path + "NOMOL/"; + std::string badparsepath = path + "BADPARSE/"; + std::string sanitizationpath = path + "SANI/"; + + std::set known_failures{ + "INDMUMLL1117_2025-01-24-17-23-14_304.cdxml", // Dative oxygen gets set to a radical + "INDMUMLL1117_2025-01-24-17-26-06_1010.cdxml", // The next batch has a type of stereochem I don't know how to parse yet + "INDMUMLL1117_2025-01-24-17-26-06_1012.cdxml", + "INDMUMLL1117_2025-01-24-17-26-06_1022.cdxml", + "INDMUMLL1117_2025-01-24-17-26-06_1024.cdxml", + "INDMUMLL1117_2025-01-24-17-26-06_1026.cdxml", + "INDMUMLL1117_2025-01-24-17-26-06_1032.cdxml", + "INDMUMLL1117_2025-01-24-17-26-06_1034.cdxml", + "INDMUMLL1117_2025-01-24-17-26-06_1036.cdxml", + "INDMUMLL1117_2025-01-24-17-26-06_1040.cdxml", + "INDMUMLL1117_2025-01-24-17-26-06_1042.cdxml", + "INDMUMLL1117_2025-01-24-17-26-06_1048.cdxml", // Stereo chem batch ends here + "INDMUMLL1117_2025-01-24-17-26-13_1690.cdxml", // RDKit shows a radical for the dative ->[O] + "INDMUMLL1117_2025-01-24-17-27-11_6877.cdxml", // The next batch has a type of stereochem I don't know how to parse yet (same as before) + "INDMUMLL1117_2025-01-24-17-27-11_6878.cdxml", + "INDMUMLL1117_2025-01-24-17-27-11_6883.cdxml", + "INDMUMLL1117_2025-01-24-17-27-11_6884.cdxml", + "INDMUMLL1117_2025-01-24-17-27-11_6889.cdxml", + 
"INDMUMLL1117_2025-01-24-17-27-11_6896.cdxml", + "INDMUMLL1117_2025-01-24-17-27-30_8574.cdxml", // Stereo chem batch ends here + "INDMUMLL1117_2025-01-24-17-27-31_8633.cdxml", // RDkit is missing a dummy atom molecule + "INDMUMLL1117_2025-01-24-17-27-31_8651.cdxml", // RDkit is missing a dummy atom molecule + "INDMUMLL1117_2025-01-24-17-27-53_10330.cdxml",// 2D projection of 3D stereo, we fail this one + "INDMUMLL1117_2025-01-24-17-27-53_10332.cdxml",// 2D projection of 3D stereo, we fail this one + "INDMUMLL1117_2025-01-24-17-27-54_10336.cdxml",// RDKit Smiles keeps any bonds ~, ChemDraw doesn't + "INDMUMLL1117_2025-01-24-17-28-02_10942.cdxml",// Chemdraw smiles doesn't support quadruple bond $ + "INDMUMLL1117_2025-01-24-17-28-15_11666.cdxml",// RDKit Smiles keeps any bonds ~, ChemDraw doesn't + "INDMUMLL1117_2025-01-24-17-28-20_12011.cdxml",// RDKit gets stereo from the 3D data and the wedging + "INDMUMLL1117_2025-01-24-17-28-20_12012.cdxml",// RDKit gets stereo from the 3D data and the wedging + "INDMUMLL1117_2025-01-24-17-28-21_12031.cdxml",// 2D projection of 3D stereo, we fail this one + "INDMUMLL1117_2025-01-24-17-28-30_12568.cdxml",// 2D projection of 3D stereo, we fail this one + "INDMUMLL1117_2025-01-24-17-29-06_14654.cdxml",// Dative oxygen gets set to a radical + "INDMUMLL1117_2025-01-24-17-29-08_14775.cdxml",// RDKit Smiles keeps any bonds ~, ChemDraw doesn't + "INDMUMLL1117_2025-01-24-17-29-09_14896.cdxml",// We apparently do a bit of a better job than chemdraw here in parsing R/S + "INDMUMLL1117_2025-01-24-17-29-09_14897.cdxml" // RDKit just gets very different stereo chem, no idea why + }; + + for (auto p : {failpath, nomolpath, badparsepath, sanitizationpath}) { + if (std::filesystem::exists(p)) { + std::filesystem::remove_all(p); + } + std::filesystem::create_directory(p); + } + + for (const auto &entry : + std::filesystem::recursive_directory_iterator(cdxpath)) { + if (entry.is_regular_file()) { + std::string fname = 
entry.path().filename().string(); + // issue here - graphite nanotube + if (fname == "INDMUMLL1117_2025-01-24-17-28-02_10946.cdxml") + continue; // nanotube takes forever + auto molfname = molpath + replace(fname, ".cdxml", ".mol"); + auto smifname = smipath + replace(fname, ".cdxml", ".smi"); + // if chemscript couldn't make an output, ignore it + total++; + + if (!std::filesystem::exists(molfname) || + !std::filesystem::exists(smifname)) { + no_mol_in_doc++; + continue; + } + + // Get the ChemScript mol and smiles + std::unique_ptr mol; + //= nullptr; + try { + mol.reset(MolFileToMol(molfname)); + } catch (...) { + bad_chemdraw_mol++; + continue; + } + // REQUIRE(mols.size()); + std::ifstream ifs(smifname); + std::string smiles_in; + ifs >> smiles_in; + std::string smiles; + { + try { + auto smimol = SmilesToMol(smiles_in); + if (!smimol) + smiles = smiles_in; + else { + smiles = MolToSmiles(*smimol); + delete smimol; + } + } catch (...) { + smiles = smiles_in; + } + } + + parseable++; + // Read the cdxml + std::vector> mols; + bool santizationFailure = false; + try { + mols = MolsFromChemDrawFile(entry.path().string()); + if (mols.size() == 0) { + ChemDrawParserParams params; + params.sanitize = false; + mols = MolsFromChemDrawFile(entry.path().string(), params); + santizationFailure = true; + } + if (!mols.size()) { + if (smiles.size() == 0) { + // At least we match the chemscript non-mol + success++; + } + else if (hasNonSupportedFeatures(entry.path().string())) { + //std::cerr << "[NOMOL (Unsupported)]: " << entry.path().string() + // << std::endl; + nonSupported++; + } else { + std::cerr << "[NOMOL]: " << entry.path().string() + << std::endl; + std::filesystem::copy( + entry.path().string(), + nomolpath + entry.path().filename().string()); + nomol++; + } + continue; + } + } catch (...) 
{ + std::cerr << "[BADPARSE]: " << entry.path().string() << std::endl; + std::filesystem::copy( + entry.path(), badparsepath + entry.path().filename().string()); + badparse++; + continue; + } + std::unique_ptr m = std::make_unique(*mols[0]); + for (size_t i = 1; i < mols.size(); i++) { + m.reset(combineMols(*m, *mols[i])); + } + + auto rdkit_smi = MolToSmiles(*m); + auto mol_smi = mol.get() ? MolToSmiles(*mol) : ""; + + if (mol_smi != rdkit_smi) { + // Do we match chemscripts smiles output at least? + if (rdkit_smi == smiles) { + smimatches++; + continue; + } + + if (hasNonSupportedFeatures(entry.path().string())) { + nonSupported++; + continue; // has unsupported features + } + if (santizationFailure) { + std::cerr << "[SANI]: " << entry.path() << std::endl; + std::filesystem::copy( + entry.path(), + sanitizationpath + entry.path().filename().string()); + saniFailed++; + } else { + if(known_failures.find(entry.path().filename().string()) != known_failures.end()) + continue; // we know this failure and it's ok for now + + std::cerr << "[FAIL]: " << entry.path() << std::endl; + std::filesystem::copy(entry.path(), + failpath + entry.path().filename().string()); + failed++; + } + std::cerr << "rdkit: " << rdkit_smi << std::endl; + std::cerr << "chemscript (mol): " << mol_smi << std::endl; + std::cerr << "chemscript (smiles): " << smiles << std::endl; + std::cerr << molfname << std::endl; + std::cerr << smifname << std::endl; + } else { + success++; + } + } + } + std::cerr << "Total:" << total << std::endl; + std::cerr << "Parseable (has chemscript output):" << total << std::endl; + std::cerr << "Success:" << success + smimatches << std::endl; + std::cerr << "skipped (non supported features):" << nonSupported + << std::endl; + std::cerr << "skipped (no mol in doc):" << no_mol_in_doc + << std::endl; + std::cerr << "Chemscript smiles matches not chemscript mol: " << smimatches + << std::endl; + std::cerr << "Failed:" << failed << std::endl; + std::cerr << "Sanitization:" 
<< saniFailed << std::endl; + std::cerr << "Nomol:" << nomol << std::endl; + std::cerr << "Badparse:" << badparse << std::endl; + std::cerr << "Bad ChemDraw Mol:" << bad_chemdraw_mol << std::endl; + REQUIRE(failed == 0); + } +} +} // namespace diff --git a/External/ChemDraw/test_data/atom-to-fragment.cdxml b/External/ChemDraw/test_data/atom-to-fragment.cdxml new file mode 100644 index 000000000..2469fbe68 --- /dev/null +++ b/External/ChemDraw/test_data/atom-to-fragment.cdxml @@ -0,0 +1,219 @@ + + + + + + + + + + + + + + \ No newline at end of file diff --git a/External/ChemDraw/test_data/geometry-tetrahedral-2.cdxml b/External/ChemDraw/test_data/geometry-tetrahedral-2.cdxml new file mode 100644 index 000000000..f3787d143 --- /dev/null +++ b/External/ChemDraw/test_data/geometry-tetrahedral-2.cdxml @@ -0,0 +1,151 @@ + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/External/ChemDraw/test_data/geometry-tetrahedral-3.cdxml b/External/ChemDraw/test_data/geometry-tetrahedral-3.cdxml new file mode 100644 index 000000000..cff5af830 --- /dev/null +++ b/External/ChemDraw/test_data/geometry-tetrahedral-3.cdxml @@ -0,0 +1,217 @@ + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/External/ChemDraw/test_data/geometry-tetrahedral-4.cdxml b/External/ChemDraw/test_data/geometry-tetrahedral-4.cdxml new file mode 100644 index 000000000..3c2b90bc1 --- /dev/null +++ b/External/ChemDraw/test_data/geometry-tetrahedral-4.cdxml @@ -0,0 +1,1059 @@ + + + + + + + + + + + + + +OSOOOHHHHHHHH \ No newline at end of file diff --git a/External/ChemDraw/test_data/geometry-tetrahedral.cdxml b/External/ChemDraw/test_data/geometry-tetrahedral.cdxml new file mode 100644 index 000000000..d1ba5ea6e --- /dev/null +++ b/External/ChemDraw/test_data/geometry-tetrahedral.cdxml @@ -0,0 +1,162 @@ + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/External/ChemDraw/utils.cpp b/External/ChemDraw/utils.cpp new file mode 100644 index 000000000..f155945ad 
--- /dev/null +++ b/External/ChemDraw/utils.cpp @@ -0,0 +1,325 @@ +#include "utils.h" +#include +#include + +#include +#include +#include +namespace RDKit { +namespace ChemDraw { +std::string NodeType(CDXNodeType nodetype) { + switch (nodetype) { + case kCDXNodeType_Unspecified: + return "Unspecified"; + case kCDXNodeType_Element: + return "Element"; + case kCDXNodeType_ElementList: + return "ElementList"; + case kCDXNodeType_ElementListNickname: + return "ElementListNickname"; + case kCDXNodeType_Nickname: + return "Nickname"; + case kCDXNodeType_Fragment: + return "Fragment"; + case kCDXNodeType_Formula: + return "Forumla"; + case kCDXNodeType_GenericNickname: + return "GenericNickname"; + case kCDXNodeType_AnonymousAlternativeGroup: + return "Anonymous Alternative Group"; + case kCDXNodeType_NamedAlternativeGroup: + return "Named Alternative Group"; + case kCDXNodeType_MultiAttachment: + return "MultiAttachment"; + case kCDXNodeType_VariableAttachment: + return "Variable Attachment"; + case kCDXNodeType_ExternalConnectionPoint: + return "ExternalConnectionPoint"; + case kCDXNodeType_LinkNode: + return "LinkNode"; + case kCDXNodeType_Monomer: + return "Monomer"; + default: + return "?"; + } +} + +void scaleBonds(const ROMol &mol, Conformer &conf, double targetBondLength, + double bondLength) { + double avg_bond_length = 0.0; + if (bondLength < 0) { + // If we don't have a bond length for any reason, just scale the avgerage + // bond length + for (auto &bond : mol.bonds()) { + avg_bond_length += (conf.getAtomPos(bond->getBeginAtomIdx()) - + conf.getAtomPos(bond->getEndAtomIdx())) + .length(); + } + avg_bond_length /= mol.getNumBonds(); + } else { + avg_bond_length = bondLength; + } + + if (avg_bond_length > 0) { + double scale = targetBondLength / avg_bond_length; + for (auto &pos : conf.getPositions()) { + pos *= scale; + } + } +} + +unsigned int get_fuse_label(Atom *atm) { + // return atm->getAtomMapNum(); easier debugging + unsigned int label = 0; // default is 
no label + atm->getPropIfPresent(FUSE_LABEL, label); + return label; +} + +void set_fuse_label(Atom *atm, unsigned int idx) { + // atm->setAtomMapNum(idx); //for debugging + if (idx) { + atm->setProp(FUSE_LABEL, idx); + } else { + atm->clearProp(FUSE_LABEL); + } +} + +struct FragmentReplacement { + // R = Replacement + // F = Fragment + // C = Conneciton + // C R C F F + // N=*=C.*=CCC=* + // label 1 1 1 + // has bond ordering + // + // goal replace the atom R with the connections + unsigned int label = 0; + Atom *replacement_atom = nullptr; + + std::vector replacement_connection_atoms; + std::vector fragment_atoms; + + bool replace(RWMol &mol) { + if (!replacement_atom) return true; + + auto bond_ordering = + replacement_atom->getProp>(CDX_BOND_ORDERING); + + // Find the connecting atoms and and do the replacement + for (auto bond : mol.atomBonds(replacement_atom)) { + // find the position of the attachement bonds in the bond ordering + auto bond_id = bond->getProp(CDX_BOND_ID); + auto it = std::find(bond_ordering.begin(), bond_ordering.end(), bond_id); + if (it == bond_ordering.end()) return false; + + auto pos = std::distance(bond_ordering.begin(), it); + + auto &xatom = fragment_atoms[pos]; + + for (auto &xbond : mol.atomBonds(xatom)) { + // xatom is the fragment dummy atom + // xbond is the fragment bond + if (bond->getBeginAtom() == replacement_atom) { + mol.addBond(xbond->getOtherAtom(xatom), bond->getEndAtom(), + bond->getBondType()); + } else { + mol.addBond(bond->getBeginAtom(), xbond->getOtherAtom(xatom), + bond->getBondType()); + } + } + } + + mol.removeAtom(replacement_atom); + for (auto &atom : fragment_atoms) { + mol.removeAtom(atom); + } + return true; + } +}; + +// Replace fragments that are not possible with molzip +bool replaceFragments(RWMol &mol) { + // Anything with a single atom that is supposed to be replaced via a fragment + // is here + std::map replacements; + + for (auto &atom : mol.atoms()) { + auto label = get_fuse_label(atom); + if 
(label) { + if (atom->hasProp(CDX_BOND_ORDERING)) { + auto &frag = replacements[label]; + frag.label = label; + frag.replacement_atom = atom; + } else { + // The is the fragment attachment atoms that need to + // be attached to the ones connected to the atom being replaced + auto &frag = replacements[label]; + frag.fragment_atoms.push_back(atom); + } + } + } + mol.beginBatchEdit(); + for (auto &replacement : replacements) { + replacement.second.replace(mol); + } + mol.commitBatchEdit(); + return true; +} +namespace { +Atom::ChiralType getChirality(ROMol &mol, Atom *center_atom, Conformer &conf) { + if (center_atom->hasProp(CDX_BOND_ORDERING)) { + std::vector bond_ordering = + center_atom->getProp>(CDX_BOND_ORDERING); + if (bond_ordering.size() < 3) { + return Atom::ChiralType::CHI_UNSPECIFIED; + } + std::vector atoms; + + std::vector> angles; + auto center = conf.getAtomPos(center_atom->getIdx()); + + for (auto cdx_id : bond_ordering) { + if (cdx_id == 0) { + continue; + } + + for (auto bond : mol.atomBonds(center_atom)) { + int bond_id; + if (bond->getPropIfPresent(CDX_BOND_ID, bond_id)) { + } else { + return Atom::ChiralType::CHI_UNSPECIFIED; + } + if (bond_id == cdx_id) { + auto atom = bond->getOtherAtom(center_atom); + if (!atom) { + // something went really wrong + return Atom::ChiralType::CHI_UNSPECIFIED; + } + auto pos = conf.getAtomPos(atom->getIdx()) - center; + double angle = atan2(pos.x, pos.y); + angles.push_back(std::make_pair(angle, bond->getIdx())); + } + } + } + + std::sort(angles.begin(), angles.end()); + + // angles are now sorted in a clockwise rotation + INT_LIST bonds; + for (auto &angle : angles) { + bonds.push_back(angle.second); + } + + if(bonds.size() < 3) { + return Atom::ChiralType::CHI_UNSPECIFIED; + } + + auto nswaps = center_atom->getPerturbationOrder(bonds); + if (bonds.size() == 3 && center_atom->getTotalNumHs() == 1) { + ++nswaps; + } + // This is supports the HDot and HDash available in chemdraw + // one is an implicit wedged 
hydrogen and one is a dashed hydrogen + if (center_atom->hasProp(CDX_IMPLICIT_HYDROGEN_STEREO) && + center_atom->getProp(CDX_IMPLICIT_HYDROGEN_STEREO) == 'w') + nswaps++; + + if (nswaps % 2) { + return Atom::ChiralType::CHI_TETRAHEDRAL_CCW; + } + return Atom::ChiralType::CHI_TETRAHEDRAL_CW; + } + + return Atom::ChiralType::CHI_UNSPECIFIED; +} +} // namespace +void checkChemDrawTetrahedralGeometries(RWMol &mol) { + std::vector> unsetTetrahedralAtoms; + Conformer *conf = nullptr; + if (mol.getNumConformers()) { + conf = &mol.getConformer(); + } + bool chiralityChanged = false; + + for (auto atom : mol.atoms()) { + // only deal with unspecified chiralities + if (atom->getChiralTag() != Atom::ChiralType::CHI_UNSPECIFIED) { + atom->clearProp(CDX_CIP); + continue; + } + if (conf && !conf->is3D()) { + atom->setChiralTag(getChirality(mol, atom, *conf)); + if (atom->getChiralTag() != Atom::ChiralType::CHI_UNSPECIFIED) { + chiralityChanged = true; + } + } + // If we have a cip code, might as well check it too + CDXAtomCIPType cip; + if (atom->getPropIfPresent(CDX_CIP, cip)) { + // assign, possibly wrong, initial stereo. + // note: we can probably deduce this through CDX_BOND_ORDERING, but + // I currenlty don't understand that well enough. 
+ switch (cip) { + case kCDXCIPAtom_R: + if(!chiralityChanged) atom->setChiralTag(Atom::ChiralType::CHI_TETRAHEDRAL_CW); + unsetTetrahedralAtoms.push_back(std::make_pair('R', atom)); + break; + case kCDXCIPAtom_r: + if(!chiralityChanged) atom->setChiralTag(Atom::ChiralType::CHI_TETRAHEDRAL_CW); + unsetTetrahedralAtoms.push_back(std::make_pair('r', atom)); + break; + case kCDXCIPAtom_S: + if(!chiralityChanged) atom->setChiralTag(Atom::ChiralType::CHI_TETRAHEDRAL_CW); + unsetTetrahedralAtoms.push_back(std::make_pair('S', atom)); + break; + case kCDXCIPAtom_s: + if(!chiralityChanged) atom->setChiralTag(Atom::ChiralType::CHI_TETRAHEDRAL_CCW); + unsetTetrahedralAtoms.push_back(std::make_pair('s', atom)); + break; + default: + break; + } + } + + } + + // Now that we have missing chiralities, let's check the CIP codes and reset + // if necessary. + // This is an expensive way of doing this, but we only have stereo->cip not + // cip->stereo implemented currently + + for (auto cipatom : unsetTetrahedralAtoms) { + try { + CIPLabeler::assignCIPLabels(mol); + } catch (...) { + // can throw std::runtime error? 
+ break; + } + std::string cipcode; + if (cipatom.second->getPropIfPresent( + common_properties::_CIPCode, cipcode)) { + if (cipcode.size() && cipcode[0] != cipatom.first) { + // need to swap + if (cipatom.second->getChiralTag() == + Atom::ChiralType::CHI_TETRAHEDRAL_CW) { + cipatom.second->setChiralTag(Atom::ChiralType::CHI_TETRAHEDRAL_CCW); + cipatom.second->updatePropertyCache(); + chiralityChanged = true; + } else if (cipatom.second->getChiralTag() == + Atom::ChiralType::CHI_TETRAHEDRAL_CCW) { + cipatom.second->setChiralTag(Atom::ChiralType::CHI_TETRAHEDRAL_CW); + cipatom.second->updatePropertyCache(); + chiralityChanged = true; + } + } + } + } + if (chiralityChanged) { + const bool cleanIt = true; + const bool force = true; + MolOps::assignStereochemistry(mol, cleanIt, force); + } +} +} +} // namespace RDKit diff --git a/External/ChemDraw/utils.h b/External/ChemDraw/utils.h new file mode 100644 index 000000000..4ffc53086 --- /dev/null +++ b/External/ChemDraw/utils.h @@ -0,0 +1,110 @@ +// +// Copyright (c) 2024, Glysade Inc +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Novartis Institutes for BioMedical Research Inc. +// nor the names of its contributors may be used to endorse or promote +// products derived from this software without specific prior written +// permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +#ifndef CHEMDRAW_UTILS_H +#define CHEMDRAW_UTILS_H + +#include +#include +#include +#include + +#include "ChemDrawStartInclude.h" +#include "chemdraw/CDXStdObjects.h" +#include "ChemDrawEndInclude.h" + +namespace RDKit { +namespace ChemDraw { +constexpr double RDKIT_DEPICT_BONDLENGTH = 1.5; +const std::string NEEDS_FUSE("CDX_NEEDS_FUSE"); +const std::string CDX_FRAG_ID("CDX_FRAG_ID"); +const std::string CDX_GROUP_ID("CDX_GROUP_ID"); +const std::string FUSE_LABEL("CDX_NODE_ID"); +const std::string CDX_SCHEME_ID("CDX_SCHEME_ID"); +const std::string CDX_STEP_ID("CDX_STEP_ID"); +const std::string CDX_REAGENT_ID("CDX_REAGENT_ID"); +const std::string CDX_PRODUCT_ID("CDX_PRODUCT_ID"); +const std::string CDX_AGENT_ID("CDX_AGENT_ID"); +const std::string CDX_ATOM_POS("CDX_ATOM_POS"); +const std::string CDX_ATOM_ID("_CDX_ATOM_ID"); +const std::string CDX_BOND_ID("_CDX_BOND_ID"); +const std::string CDX_BOND_ORDERING("CDX_BOND_ORDERING"); +const std::string CDX_CIP("CDX_CIP"); +const std::string CDX_IMPLICIT_HYDROGEN_STEREO("CDX_ATOM_STEREO"); + +// Convert a ChemDrawNode to a string +std::string NodeType(CDXNodeType nodetype); + +// Scale the bonds to the targetBondLength. 
If bondLength is zero +// use the average bond length in the molecule +void scaleBonds(const ROMol &mol, Conformer &conf, double targetBondLength, + double bondLength); + +// Indicate which atoms should be fused together from various +// fragments in the ChemDraw file + +unsigned int get_fuse_label(Atom *atm); +void set_fuse_label(Atom *atm, unsigned int idx); + +// Replace fragments that are not possible with molzip +bool replaceFragments(RWMol &mol); + +// Add a Query to a molecule +template +Atom *addquery(Q *qry, std::string symbol, RWMol &mol, unsigned int idx) { + PRECONDITION(qry, "bad query"); + auto *atm = mol.getAtomWithIdx(idx); + auto qa = std::make_unique(*atm); + qa->setQuery(qry); + qa->setNoImplicit(true); + mol.replaceAtom(idx, qa.get()); + Atom *res = mol.getAtomWithIdx(idx); + if (symbol != "") { + res->setProp(common_properties::atomLabel, symbol); + } + return res; +} + +// Simple Structure for keeping track of Stereo Groups +struct StereoGroupInfo { + int sgroup = -1; + bool conflictingSgroupTypes = false; + StereoGroupType grouptype; + std::vector atoms; +}; + +// check to see if we have a tetrahedral flag and ChemDraw CIP set but no +// stereo assigned, if so check the bond ordering for CW and CCW +void checkChemDrawTetrahedralGeometries(RWMol &mol); +} +} // namespace RDKit + +#endif diff --git a/External/ChemDraw/writer.cpp b/External/ChemDraw/writer.cpp new file mode 100644 index 000000000..4f3f761f0 --- /dev/null +++ b/External/ChemDraw/writer.cpp @@ -0,0 +1,296 @@ +// +// Copyright (c) 2024, Glysade Inc +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Novartis Institutes for BioMedical Research Inc. +// nor the names of its contributors may be used to endorse or promote +// products derived from this software without specific prior written +// permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +#include "chemdraw.h" +#include +#include +#include + +#include "ChemDrawStartInclude.h" +#include "chemdraw/CDXStdObjects.h" +#include "ChemDrawEndInclude.h" + + +namespace RDKit { +namespace v2 { +const double DEFAULT_CDX_BOND_LENGTH = 14.4; + +namespace { +// Do we need to set explicit hs in chemdraw, this uses basically the same +// logic as SmilesWriter +bool needsExplicitHs(const Atom *atom) { + auto num = atom->getAtomicNum(); + const INT_VECT &defaultVs = PeriodicTable::getTable()->getValenceList(num); + int totalValence = atom->getTotalValence(); + bool nonStandard = false; + + if (atom->getNumRadicalElectrons()) { + nonStandard = true; + } else if ((num == 7 || num == 15) && atom->getIsAromatic() && + atom->getNumExplicitHs()) { + // another type of "nonstandard" valence is an aromatic N or P with + // explicit Hs indicated: + nonStandard = true; + } else { + nonStandard = (totalValence != defaultVs.front() && atom->getTotalNumHs()); + } + + return nonStandard; +} +} // namespace + +std::string MolToChemDrawBlock(const ROMol &mol, CDXFormat format) { + RWMol trmol(mol); + MolOps::Kekulize(trmol); + if (!trmol.getNumConformers()) { + RDDepict::compute2DCoords(trmol); + } + + CDXObjectID object_id = 1; + CDXDocument document(object_id++); + CDXPage *page = new CDXPage(object_id++); + document.m_bondLength = DEFAULT_CDX_BOND_LENGTH; + document.m_flags |= CDXDocument::CDXDocumentProperty1::has_bondLength; + CDXFragment *fragment = new CDXFragment(object_id++); + page->AddChild(fragment); + std::vector nodes; + nodes.reserve(trmol.getNumAtoms()); + + const Conformer *conf = nullptr; + if (trmol.getNumConformers() == 0) { + RDDepict::compute2DCoords(trmol); + } + conf = &trmol.getConformer(0); + bool is3D = conf->is3D(); + + // I REALLY don't know why this is 2*DEFAULT_CDX_BOND_LENGTH but it looks + // right + // when loading the CDX into ChemDraw + // We convert the average bond length into the target bond length here + double target_bond_length = 2 * 
DEFAULT_CDX_BOND_LENGTH; + double dist = 0.0; + for (auto bond : trmol.bonds()) { + auto pos1 = conf->getAtomPos(bond->getBeginAtomIdx()); + auto pos2 = conf->getAtomPos(bond->getEndAtomIdx()); + dist += (pos1 - pos2).length(); + } + dist /= trmol.getNumBonds(); + double scale = is3D ? 1. : target_bond_length / dist; + + auto wedgeBonds = Chirality::pickBondsToWedge(trmol, nullptr, conf); + + for (auto &atom : trmol.atoms()) { + CDXNode *node = new CDXNode(object_id + atom->getIdx()); + auto pos = conf->getAtomPos(atom->getIdx()); + if (is3D) { + node->Position3D(CDXPoint3D(CDXCoordinatefromPoints(pos.x), + -CDXCoordinatefromPoints(pos.y), + CDXCoordinatefromPoints(pos.z))); + } else { + node->Position(CDXPoint2D(CDXCoordinatefromPoints(scale * pos.x), + CDXCoordinatefromPoints(-scale * pos.y))); + } + node->m_nodeType = kCDXNodeType_Element; + node->m_isotope = atom->getIsotope(); + node->m_elementNum = atom->getAtomicNum(); + // Use the same logic from the smiles writer needs brackets + // + // node->m_numHydrogens = atom->getNumExplicitHs() ? + // atom->getNumExplicitHs() + // : kNumHydrogenUnspecified; + node->m_numHydrogens = + needsExplicitHs(atom) ? atom->getTotalNumHs() : kNumHydrogenUnspecified; + node->m_charge = atom->getFormalCharge() * 0x1000000; + if (atom->getFormalCharge() || atom->getNumRadicalElectrons() != 0) { + node->m_numHydrogens = + atom->getTotalNumHs(); // XXX is this right? We seem to need to set + // it with charges + } + if (atom->getNumRadicalElectrons()) { + switch (atom->getNumRadicalElectrons()) { + case 0: + break; + case 1: + node->m_radical = kCDXRadical_Singlet; + break; + case 2: + break; + case 3: + break; + } + } + // this might be a bit slow, perhaps make into a map... 
+ unsigned int sgnum = 0; + for (auto &sg : trmol.getStereoGroups()) { + sgnum++; + for (auto &sgatom : sg.getAtoms()) { + if (atom->getIdx() == sgatom->getIdx()) { + switch (sg.getGroupType()) { + case StereoGroupType::STEREO_ABSOLUTE: + node->m_enhancedStereoType = kCDXEnhancedStereo_Absolute; + break; + case StereoGroupType::STEREO_OR: + node->m_enhancedStereoType = kCDXEnhancedStereo_Or; + break; + case StereoGroupType::STEREO_AND: + node->m_enhancedStereoType = kCDXEnhancedStereo_And; + break; + } + node->m_enhancedStereoGroupNum = sgnum; + } + } + } + nodes.push_back(node); + fragment->AddChild(node); + } + + for (auto &bond : trmol.bonds()) { + CDXBond *cdxbond = + new CDXBond(object_id + mol.getNumAtoms() + bond->getIdx()); + + int dirCode = 0; + bool reverse = false; + Chirality::GetMolFileBondStereoInfo(bond, wedgeBonds, conf, dirCode, + reverse); + + switch (bond->getBondType()) { + case Bond::BondType::SINGLE: + cdxbond->m_bondOrder = kCDXBondOrder_Single; + break; + case Bond::DOUBLE: + cdxbond->m_bondOrder = kCDXBondOrder_Double; + break; + case Bond::TRIPLE: + cdxbond->m_bondOrder = kCDXBondOrder_Triple; + break; + case Bond::QUADRUPLE: + cdxbond->m_bondOrder = kCDXBondOrder_Quadruple; + break; + case Bond::QUINTUPLE: + cdxbond->m_bondOrder = kCDXBondOrder_Quintuple; + break; + case Bond::HEXTUPLE: + cdxbond->m_bondOrder = kCDXBondOrder_Sextuple; + break; + case Bond::ONEANDAHALF: + cdxbond->m_bondOrder = kCDXBondOrder_OneHalf; + break; + case Bond::TWOANDAHALF: + cdxbond->m_bondOrder = kCDXBondOrder_TwoHalf; + break; + case Bond::THREEANDAHALF: + cdxbond->m_bondOrder = kCDXBondOrder_ThreeHalf; + break; + case Bond::FOURANDAHALF: + cdxbond->m_bondOrder = kCDXBondOrder_FourHalf; + break; + case Bond::FIVEANDAHALF: + cdxbond->m_bondOrder = kCDXBondOrder_FiveHalf; + break; + case Bond::AROMATIC: + cdxbond->m_bondOrder = kCDXBondOrder_OneHalf; + break; + case Bond::IONIC: + cdxbond->m_bondOrder = kCDXBondOrder_Ionic; + break; + case Bond::HYDROGEN: + 
cdxbond->m_bondOrder = kCDXBondOrder_Hydrogen; + break; + case Bond::THREECENTER: + cdxbond->m_bondOrder = kCDXBondOrder_ThreeCenter; + break; + case Bond::DATIVE: + cdxbond->m_bondOrder = kCDXBondOrder_Dative; + break; + case Bond::UNSPECIFIED: { + auto query = describeQuery(bond); + if (query == "DoubleOrAromaticBond 1 = val\n") { + cdxbond->m_bondOrder = kCDXBondOrder_DoubleOrAromatic; + } else if (query == "SingleOrAromaticBond 1 = val\n") { + cdxbond->m_bondOrder = kCDXBondOrder_SingleOrAromatic; + } else if (query == "SingleOrDoubleBond 1 = val\n") { + cdxbond->m_bondOrder = kCDXBondOrder_SingleOrDouble; + } else { + cdxbond->m_bondOrder = kCDXBondOrder_Any; + } + } break; + case Bond::DATIVEONE: + case Bond::DATIVEL: + case Bond::DATIVER: + case Bond::OTHER: + case Bond::ZERO: + // unhandled + break; + } + + cdxbond->Connects(nodes[bond->getBeginAtomIdx()], + nodes[bond->getEndAtomIdx()]); + + switch (dirCode) { + case 6: // swap 1 and 6 due to swapped y + cdxbond->m_display = reverse ? kCDXBondDisplay_WedgedHashEnd + : kCDXBondDisplay_WedgedHashBegin; + break; + case 1: + cdxbond->m_display = + reverse ? kCDXBondDisplay_WedgeEnd : kCDXBondDisplay_WedgeBegin; + break; + case 3: + cdxbond->m_display = kCDXBondDisplay_Wavy; + break; + default: + break; + } + + if (bond->getBondDir() == Bond::BondDir::EITHERDOUBLE || + bond->getBondDir() == Bond::BondDir::UNKNOWN) + cdxbond->m_display = kCDXBondDisplay_Wavy; + + fragment->AddChild(cdxbond); + } + + document.AddChild(page); + document.m_colorTable.m_colors + .clear(); // if this isn't empty something fails. + + std::ostringstream os; + if(format == CDXFormat::CDXML) { + os << kCDXML_HeaderString; + XMLDataSink ds(os); + document.XMLWrite(ds); + } else { + CDXostream ds(os); + CDXWriteDocToStorage(&document, ds); + } + return os.str(); +} +} +} // namespace RDKit