// // Copyright (C) 2020 Gareth Jones, Glysade LLC // // @@ All Rights Reserved @@ // This file is part of the RDKit. // The contents are covered by the terms of the BSD license // which is included in the file license.txt, found at the root // of the RDKit source tree. // #include #include #include #include #include #include #include #include #include #include using namespace std; using namespace RDKit; using namespace boost::program_options; namespace options = boost::program_options; // Example systems based on Brian's MultipleCores notebook for profiling int main(int argc, char *argv[]) { RDLog::InitLogs(); boost::logging::disable_logs("rdApp.debug"); options_description desc("Allowed options"); desc.add_options()("help", "Help message")( "dataset", options::value()->default_value("rg-easy"), "Built-in dataset rg-easy, rg-stereo")( "maximumOperations", options::value()->default_value(-1), "Maximum number of GA operations")( "populationSize", options::value()->default_value(-1), "GA population size")( "numberOperationsWithoutImprovement", options::value()->default_value(-1), "number of operations without improvement before exiting")( "randomSeed", options::value()->default_value(-1), "Random number seed (-1 for default, -2 for random)")( "matchingStrategy", options::value()->default_value("GA"), "Matching strategy- GA or GreedyChunks")( "numberRuns", options::value()->default_value(1), "Number of GA runs")("start", options::value()->default_value(0), "start")( "batch", options::value()->default_value(10000000), "batch size"); options::variables_map vm; options::store(options::parse_command_line(argc, argv, desc), vm); options::notify(vm); if (vm.count("help")) { cerr << desc << endl; return 0; } auto dataset = vm["dataset"].as(); std::string rdBase(getenv("RDBASE")); std::string file; std::vector coreSmiles; if (dataset == "rg-easy") { cerr << "Using dataset rg-easy" << endl; file = rdBase + "/Docs/Notebooks/compounds.txt"; coreSmiles = {"O=C1C([*:1])C2N1C(C(O)=O)=C([*:3])CS2", "O=C1C([*:1])C2N1C(C(O)=O)=C([*:3])CC2", "O=C1C([*:1])C2N1C(C(O)=O)=C([*:3])CO2", "O=C1C([*:1])C2N1C(C(O)=O)=C([*:3])C2", "O=C1C([*:1])C2N1C(C(O)=O)C([*:3])([*:4])C2", "O=C1C([*:1])C2N1C(C(O)=O)=C([*:3])S2", "O=C1C([*:1])C2N1C(C(O)=O)C([*:3])([*:4])S2", "O=C1C([*:1])C2N1C(C(O)=O)C([*:3])([*:4])O2", "O=C1C([*:1])C([*:5])N1"}; } else if (dataset == "rg-stereo") { cerr << "Using dataset rg-stereo" << endl; file = rdBase + "/Docs/Notebooks/compounds.txt"; coreSmiles = {"O=C1C([*:2])([*:1])[C@@H]2N1C(C(O)=O)=C([*:3])CS2", "O=C1C([*:2])([*:1])[C@@H]2N1C(C(O)=O)=C([*:3])CC2", "O=C1C([*:2])([*:1])[C@@H]2N1C(C(O)=O)=C([*:3])CO2", "O=C1C([*:2])([*:1])[C@@H]2N1C(C(O)=O)=C([*:3])C2", "O=C1C([*:2])([*:1])[C@@H]2N1C(C(O)=O)C([*:3])([*:4])C2", "O=C1C([*:2])([*:1])[C@@H]2N1C(C(O)=O)=C([*:3])S2", "O=C1C([*:2])([*:1])[C@@H]2N1C(C(O)=O)C([*:3])([*:4])S2", "O=C1C([*:2])([*:1])[C@@H]2N1C(C(O)=O)C([*:3])([*:4])O2", "O=C1C([*:2])([*:1])C([*:6])([*:5])N1"}; } else if (dataset == "beacon") { cerr << "Using Beacon dataset" << endl; file = rdBase + "/Docs/Notebooks/compounds2.txt"; coreSmiles = {"N1([*:1])CCN([*:2])CC1", "C1(O[*:1])CCC(O[*:2])CC1", "C1([*:1])CCC([*:2])CC1"}; } else { cerr << "unknown dataset " << dataset; return 1; } vector> molecules; { fstream fh; fh.open(file, ios::in); string line; getline(fh, line); int count = 0; int start = vm["start"].as(); int batch = vm["batch"].as(); while (getline(fh, line)) { count++; if (count < start) { continue; } if (count > start + batch) { break; } int pos = line.find_last_of("\t"); auto smiles = line.substr(pos + 1); shared_ptr mol(SmilesToMol(smiles)); molecules.push_back(mol); } } cerr << "Read " << molecules.size() << endl; std::vector cores; for (const auto &s : coreSmiles) { cores.emplace_back(SmartsToMol(s)); } RGroupDecompositionParameters parameters; parameters.scoreMethod = FingerprintVariance; parameters.gaMaximumOperations = vm["maximumOperations"].as(); parameters.gaNumberOperationsWithoutImprovement = vm["numberOperationsWithoutImprovement"].as(); parameters.gaPopulationSize = vm["populationSize"].as(); parameters.gaRandomSeed = vm["randomSeed"].as(); parameters.gaNumberRuns = vm["numberRuns"].as(); auto strategyString = vm["matchingStrategy"].as(); if (strategyString == "GA") { parameters.matchingStrategy = GA; } else if (strategyString == "GreedyChunks") { parameters.matchingStrategy = GreedyChunks; } else { cerr << "Unknown matching strategy " << strategyString << endl; return 0; } RGroupDecomposition decomposition(cores, parameters); int numberAdded(0); for (auto &molecule : molecules) { auto added = decomposition.add(*molecule); if (added > -1) { auto smiles = MolToSmiles(*molecule); cerr << "\"" << smiles << "\", " << endl; } numberAdded = std::max(numberAdded, added); } cerr << "Added " << numberAdded << " compounds to decomposition" << endl; auto result = decomposition.processAndScore(); cerr << "Results success " << result.success << " score " << result.score << endl; }