// $Id$ // // Copyright (C) 2014 Novartis Institutes for BioMedical Research // // @@ All Rights Reserved @@ // This file is part of the RDKit. // The contents are covered by the terms of the BSD license // which is included in the file license.txt, found at the root // of the RDKit source tree. // #include #include #include #include #include #include "../RDKitBase.h" #include "../SmilesParse/SmilesParse.h" #include "MolHash.h" using namespace RDKit::MolHash; namespace RDKit { void test1() { BOOST_LOG(rdInfoLog) << "-------------------------------------" << std::endl; BOOST_LOG(rdInfoLog) << "Testing MolHash test1 DEFAULT ARGUMENTS"<< std::endl; std::cout<<"Hash size = "<< 8*sizeof(HashCodeType) <<" bits.\n"; const char* smi[] = { "CN(C)c1ccc(CC(=O)NCCCCCCCCCCNC23CC4CC(C2)CC(C3)C4)cc1", "CN(C)c1ccc(CC(=O)NCCCCCCCNC23CC4CC(C2)CC(C3)C4)cc1", "CN(C)c1ccc(CC(=O)NCCCCCCCCCCCCNC23CC4CC(C2)CC(C3)C4)cc1", "CN(C)c1ccc(CC(=O)NCCCCCCCCCNC23CC4CC(C2)CC(C3)C4)cc1", "CN(C)c1ccc(CC(=O)NCCCCCCCCNC23CC4CC(C2)CC(C3)C4)cc1", "CN(C)c1ccc(CC(=O)NCCCCCCCCCCCNC23CC4CC(C2)CC(C3)C4)cc1", "CN(C)c1ccc(CC(NCCCCCC(NO)=O)=O)cc1", "CC(C)Cc1ccc(C(C)C(=O)NC23CC4CC(C2)CC(C3)C4)cc1", "c1cc([N+]([O-])=O)ccc1CC(=O)NC1CCCCCC1", "CC1(C)NC(C)(C)CC(NC(=O)Cc2ccccc2)C1", "CC(C)CC(NC(CNC(CNC(C(Cc1c2ccccc2[nH]c1)NC(C(Cc1cnc[nH]1)NC(CNC(C(C(C)O)NC(C(C(C)(C)S)NC(C(Cc1ccccc1)NC(C(CCCNC(=N)N)NC(C(N)CCC(N)=O)=O)=O)=O)=O)=O)=O)=O)=O)=O)=O)C(NC(C(=O)N1CCCC1C(=O)NC(CS)C(NC(CC(N)=O)C(NCC(=O)N1CCCC1C(O)=O)=O)=O)Cc1ccc(O)cc1)=O", // CHEMBL527084 "CCCCCCC1C23C4=c5c6c7c8c5c5c9c%10c%11c%12c(c%108)c8c7c7c%10c%13c%14c%15c%16c%17c%18c%19c%20c(c%21c%22c%23c(c9C(C25C[N+]1(C)C)C1c2c3c3c5c9c2-c(c%231)c(c%22%19)C%18C9C1(C5=C%13C(C43)c6%10)C%14%17C[N+](C)(C)C1CCCCCC)c%21%11)c%12c(c%16%20)c8c%157", // CHEMBL439119 "CC(C)CC(NC(=O)C(Cc1ccc(NC(C)=O)cc1)NC(=O)C(Cc1ccc(NC(C)=O)cc1)NC(C(CO)NC(C(NC(c1ccncc1)=O)NC(=O)C(Cc1ccc(Cl)cc1)NC(C(NC(C)=O)Cc1cc2ccccc2cc1)=O)=O)=O)C(NC(CCCCNC(C)C)C(N1C(C(=O)NC(C)C(N)=O)CCC1)=O)=O", // CHEMBL439258 "NCCCCC(NC(CN)=O)C(NCC(NC(CC(C)C)C(=O)N1Cc2ccccc2CC1C(N1CC2CCCCC2C1C(NCC(NC(CC(C)C)C(=O)N1Cc2ccccc2CC1C(=O)N1CC2CCCCC2C1C(NCC(NC(C(N1Cc2ccccc2CC1C(N1CC2CCCCC2C1C(NCC(NC(CC(C)C)C(=O)N1Cc2ccccc2CC1C(=O)N1CC2CCCCC2C1C(NCC(NC(C(N1Cc2c(cccc2)CC1C(N1CC2CCCCC2C1C(NCC(NC(CC(C)C)C(=O)N1Cc2ccccc2CC1C(N1CC2CCCCC2C1C(NCC(NC(CC(C)C)C(NC(C(N)=O)CCCNC(=N)N)=O)=O)=O)=O)=O)=O)=O)=O)CCCCN)=O)=O)=O)=O)=O)=O)CCCCN)=O)=O)=O)=O)=O)=O)=O", // CHEMBL441746 "CC(C)CC(NC(C(C(C)C)NC(C(N)CCC(O)=O)=O)=O)C(NC(C(O)C(=O)NC(CC(O)=O)C(NCC(=O)NC(CCC(O)=O)C(NC(C(O)=O)Cc1ccccc1)=O)=O)Cc1ccccc1)=O", // CHEMBL384606 "CCC(C)C1C(=O)N2CCCC2C(=O)NC2CSSCC3NC(=O)C(C(C)C)NC(=O)C(CCCCN)NC(=O)C(CC(N)=O)NC(=O)C(CCCCN)NC(=O)C4CSSCC(C(=O)NC(C(C)C)C(=O)NC(Cc5ccccc5)C(=O)N1)NC(=O)C(CO)NC(=O)C(CCC(O)=O)NC(=O)CNC(=O)C(NC(=O)C1CCCN1C(=O)C(C(C)CC)NC(=O)CNC(=O)C(CC(N)=O)NC(=O)C(CCCNC(=N)N)NC(=O)C(Cc1ccc(O)cc1)NC3=O)CSSCC(NC(=O)CNC(=O)C(C)NC(=O)C(C(C)C)NC(=O)C(C(C)O)NC(=O)C(C(C)O)NC(=O)C(CC(C)C)NC2=O)C(=O)NC(CO)C(=O)N4", // CHEMBL526869 }; for(int i=0; i atomsToUse; std::vector bondsToUse; std::vector atomCodes(mol->getNumAtoms()); std::vector bondCodes(mol->getNumBonds()); unsigned n; n = mol->getNumAtoms(); atomsToUse.resize(n); for(unsigned i = 0; i < n; i++) atomsToUse[i] = i; n = mol->getNumBonds(); bondsToUse.resize(n); for(unsigned i = 0; i < n; i++) bondsToUse[i] = i; n = mol->getNumAtoms(); for(unsigned i = 0; i < n; i++) atomCodes[i] = 1; // + mol->getAtomWithIdx(i)->getAtomicNum(); //res0 != res1,2,3 n = mol->getNumBonds(); for(unsigned i = 0; i < n; i++) bondCodes[i] = 1; fillAtomBondCodes(*mol, CF_NO_LABELS, &atomCodes, &bondCodes); HashCodeType res0 = generateMoleculeHashCode(*mol); HashCodeType res1 = generateMoleculeHashCode(*mol, &atomsToUse, 0, &atomCodes, &bondCodes); HashCodeType res2 = generateMoleculeHashCode(*mol, 0, &bondsToUse, &atomCodes, &bondCodes); HashCodeType res3 = generateMoleculeHashCode(*mol, &atomsToUse, &bondsToUse, &atomCodes, &bondCodes); std::cout << res0 <<" = "<< encode(&res0, sizeof(res0)) << std::endl; std::cout << res1 <<" = "<< encode(&res1, sizeof(res1)) << std::endl; std::cout << res2 <<" = "<< encode(&res2, sizeof(res2)) << std::endl; std::cout << res3 <<" = "<< encode(&res3, sizeof(res3)) << std::endl << std::endl; bool passed = 0 != res0 && res0 == res1 && res0 == res2 && res0 == res3; // TEST_ASSERT(passed); } BOOST_LOG(rdInfoLog) << "\tdone" << std::endl; } void test2() { BOOST_LOG(rdInfoLog) << "-------------------------------------" << std::endl; BOOST_LOG(rdInfoLog) << "Testing MolHash test2 CHIRALITY == ATOM"<< std::endl; std::cout<<"Hash size = "<< 8*sizeof(HashCodeType) <<" bits.\n"; const char* smi[] = { //equal non-chiral hash "C[C@H](F)Cl", "C[C@@H](F)Cl", "CC(F)Cl", "[13CH3]C(F)Cl", "C[C@H](Cl)F", "C[C@@H](Cl)F", }; std::vector HashNonChiral; for(int i=0; i < sizeof(smi)/sizeof(smi[0]); i++) { ROMOL_SPTR mol = ROMOL_SPTR(SmilesToMol(smi[i])); std::vector atomCodes(mol->getNumAtoms()); std::vector bondCodes(mol->getNumBonds()); fillAtomBondCodes(*mol, CF_ELEMENT|CF_CHARGE/*|CF_VALENCE*/ | CF_ATOM_AROMATIC , &atomCodes, &bondCodes); // fillAtomBondCodes(*mol, CF_ATOM_ALL &(~(CF_BOND_CHIRALITY | CF_ATOM_CHIRALITY | CF_ISOTOPE)), &atomCodes, &bondCodes); HashCodeType res = generateMoleculeHashCode(*mol, 0, 0, &atomCodes, &bondCodes); HashNonChiral.push_back(res); std::cout << res <<" = "<< encode(&res, sizeof(res)) <<" | "<< smi[i]< HashNonChiral; for(int i=0; i < sizeof(smi)/sizeof(smi[0]); i++) { ROMOL_SPTR mol = ROMOL_SPTR(SmilesToMol(smi[i])); std::vector atomCodes(mol->getNumAtoms()); std::vector bondCodes(mol->getNumBonds()); fillAtomBondCodes(*mol, CF_BOND_ALL &(~(CF_BOND_CHIRALITY)), &atomCodes, &bondCodes); HashCodeType res = generateMoleculeHashCode(*mol, 0, 0, &atomCodes, &bondCodes); HashNonChiral.push_back(res); std::cout << res <<" = "<< encode(&res, sizeof(res)) <<" | "<< smi[i]< HashChiral; for(int i=0; i < sizeof(smi)/sizeof(smi[0]); i++) { ROMOL_SPTR mol = ROMOL_SPTR(SmilesToMol(smi[i])); std::vector atomCodes(mol->getNumAtoms()); std::vector bondCodes(mol->getNumBonds()); fillAtomBondCodes(*mol, CF_BOND_CHIRALITY | CF_ATOM_CHIRALITY | CF_ISOTOPE, &atomCodes, &bondCodes); HashCodeType resC = generateMoleculeHashCode(*mol, 0, 0, &atomCodes, &bondCodes); HashChiral.push_back(resC); std::cout << resC <<" = "<< encode(&resC, sizeof(resC)) <<" "<< smi[i]< atomCodes(mol1->getNumAtoms()); std::vector bondCodes(mol2->getNumBonds()); fillAtomBondCodes(*mol1, CF_BOND_CHIRALITY | CF_ATOM_CHIRALITY | CF_ISOTOPE, &atomCodes, &bondCodes); HashCodeType hash1 = generateMoleculeHashCode(*mol1, 0, 0, &atomCodes, &bondCodes); fillAtomBondCodes(*mol2, CF_BOND_CHIRALITY | CF_ATOM_CHIRALITY | CF_ISOTOPE, &atomCodes, &bondCodes); HashCodeType hash2 = generateMoleculeHashCode(*mol2, 0, 0, &atomCodes, &bondCodes); std::cout< atomCodes(mol1->getNumAtoms()); std::vector bondCodes(mol2->getNumBonds()); fillAtomBondCodes(*mol1, CF_BOND_CHIRALITY | CF_ATOM_CHIRALITY | CF_ISOTOPE, &atomCodes, &bondCodes); HashCodeType hash1 = generateMoleculeHashCode(*mol1, 0, 0, &atomCodes, &bondCodes); fillAtomBondCodes(*mol2, CF_BOND_CHIRALITY | CF_ATOM_CHIRALITY | CF_ISOTOPE, &atomCodes, &bondCodes); HashCodeType hash2 = generateMoleculeHashCode(*mol2, 0, 0, &atomCodes, &bondCodes); std::cout< atomCodes(mol1->getNumAtoms()); std::vector bondCodes(mol2->getNumBonds()); fillAtomBondCodes(*mol1, CF_BOND_CHIRALITY | CF_ATOM_CHIRALITY | CF_ISOTOPE, &atomCodes, &bondCodes); HashCodeType hash1 = generateMoleculeHashCode(*mol1, 0, 0, &atomCodes, &bondCodes); fillAtomBondCodes(*mol2, CF_BOND_CHIRALITY | CF_ATOM_CHIRALITY | CF_ISOTOPE, &atomCodes, &bondCodes); HashCodeType hash2 = generateMoleculeHashCode(*mol2, 0, 0, &atomCodes, &bondCodes); std::cout< atomCodes(mol1->getNumAtoms()); std::vector bondCodes(mol2->getNumBonds()); fillAtomBondCodes(*mol1, CF_BOND_CHIRALITY | CF_ATOM_CHIRALITY | CF_ISOTOPE, &atomCodes, &bondCodes); HashCodeType hash1 = generateMoleculeHashCode(*mol1, 0, 0, &atomCodes, &bondCodes); fillAtomBondCodes(*mol2, CF_BOND_CHIRALITY | CF_ATOM_CHIRALITY | CF_ISOTOPE, &atomCodes, &bondCodes); HashCodeType hash2 = generateMoleculeHashCode(*mol2, 0, 0, &atomCodes, &bondCodes); std::cout< atomCodes; std::vector bondCodes; fillAtomBondCodes(mol, flags, &atomCodes, &bondCodes); std::vector atomsToUse; std::vector bondsToUse; unsigned n = mol.getNumAtoms(); for(unsigned i=0; i < n; i++) { const Atom* atom = mol.getAtomWithIdx(i); if(1) atomsToUse.push_back(atom->getIdx()); } n = mol.getNumBonds(); for(unsigned i=0; i < n; i++) { const Bond* bond = mol.getBondWithIdx(i); if(1) bondsToUse.push_back(bond->getIdx()); } return generateMoleculeHashCode(mol, &atomsToUse, &bondsToUse, &atomCodes, &bondCodes); } // {num atoms, num bonds} - {formula hash} // - {non-chiral atom hashes} - {non-chiral bond hashes} // - {chiral information} #pragma pack(push,1) struct HashResult { unsigned Line; // molecule Id [1, ...) HashCodeType Hash; // unsigned ChiralInfo; HashResult(unsigned id = 0) : Line(id), Hash(0)//, ChiralInfo(0) { } }; #pragma pack(pop) bool lessHashResult_ALL(const HashResult& r, const HashResult& l) { return r.Hash < l.Hash; } void analyzeResults(std::list& res) { std::cerr<<"\nANALYZING "<::iterator r0=res.begin(); r0!=res.end(); r0++) { std::cerr<<"Result: "<<++rn<<"\r"; if(0 == r0->Line) // collision has been already found continue; unsigned hashCollision=0; std::vector cl; //use binary search of collision in sorted list to improve performance //........ std::list::iterator r1=r0; for(std::list::iterator r=++r1; r!=res.end(); r++) { if(0==r->Line) // collision has been already found continue; if(r->Hash == r0->Hash) // collision found { ++hashCollision; cl.push_back(r->Line); // std::cout<Id<<"="<Line<<"\n"; // TEMP TEST r->Line = 0; // mark as already processed collision to exclude duplicates } } if(0!=hashCollision) // collision found { cn += hashCollision; std::cout<<"mol line "<Line<<": "< res; std::cout<<"FILE: "< 0 && smiles[i] < ' '; i--) smiles[i] = '\0'; // remove LF std::string id; std::cerr<<"\rLine: "<< ++line <<" "; if('#' != smiles[0] && ' ' != smiles[0] && '/' != smiles[0] // commented to skip && 0 == strchr(smiles,'.')) // skip ions { ROMOL_SPTR mol; try { mol = ROMOL_SPTR(SmilesToMol(getSmilesOnly(smiles, &id))); } catch(...) // internal RDKit error: Invar::Invariant& ex { std::cerr<<" RDKit error: "<< smiles <<"/n"; continue; } res.push_back(HashResult(line)); HashResult& r = res.back(); // r.ChiralInfo = 0;//mol-(); r.Hash = computeHash(*mol, CF_ALL) & bitMask; } else std::cerr<<" skipped: "<< smiles <<"/n"; } fclose(f); std::cout<<"\nDONE. "<