mirror of
https://github.com/rdkit/rdkit.git
synced 2026-06-05 22:04:27 +08:00
565 lines
23 KiB
C++
565 lines
23 KiB
C++
// $Id$
|
|
//
|
|
// Copyright (C) 2014 Novartis Institutes for BioMedical Research
|
|
//
|
|
// @@ All Rights Reserved @@
|
|
// This file is part of the RDKit.
|
|
// The contents are covered by the terms of the BSD license
|
|
// which is included in the file license.txt, found at the root
|
|
// of the RDKit source tree.
|
|
//
|
|
#include <list>
#include <map>
#include <vector>
#include <string>
#include <stdio.h>
#include <ctype.h>
#include "../RDKitBase.h"
#include "../SmilesParse/SmilesParse.h"
#include "MolHash.h"
|
|
|
|
using namespace RDKit::MolHash;
|
|
namespace RDKit
|
|
{
|
|
|
|
void test1()
|
|
{
|
|
BOOST_LOG(rdInfoLog) << "-------------------------------------" << std::endl;
|
|
BOOST_LOG(rdInfoLog) << "Testing MolHash test1 DEFAULT ARGUMENTS"<< std::endl;
|
|
std::cout<<"Hash size = "<< 8*sizeof(HashCodeType) <<" bits.\n";
|
|
const char* smi[] =
|
|
{
|
|
"CN(C)c1ccc(CC(=O)NCCCCCCCCCCNC23CC4CC(C2)CC(C3)C4)cc1",
|
|
"CN(C)c1ccc(CC(=O)NCCCCCCCNC23CC4CC(C2)CC(C3)C4)cc1",
|
|
"CN(C)c1ccc(CC(=O)NCCCCCCCCCCCCNC23CC4CC(C2)CC(C3)C4)cc1",
|
|
"CN(C)c1ccc(CC(=O)NCCCCCCCCCNC23CC4CC(C2)CC(C3)C4)cc1",
|
|
"CN(C)c1ccc(CC(=O)NCCCCCCCCNC23CC4CC(C2)CC(C3)C4)cc1",
|
|
"CN(C)c1ccc(CC(=O)NCCCCCCCCCCCNC23CC4CC(C2)CC(C3)C4)cc1",
|
|
"CN(C)c1ccc(CC(NCCCCCC(NO)=O)=O)cc1",
|
|
"CC(C)Cc1ccc(C(C)C(=O)NC23CC4CC(C2)CC(C3)C4)cc1",
|
|
"c1cc([N+]([O-])=O)ccc1CC(=O)NC1CCCCCC1",
|
|
"CC1(C)NC(C)(C)CC(NC(=O)Cc2ccccc2)C1",
|
|
"CC(C)CC(NC(CNC(CNC(C(Cc1c2ccccc2[nH]c1)NC(C(Cc1cnc[nH]1)NC(CNC(C(C(C)O)NC(C(C(C)(C)S)NC(C(Cc1ccccc1)NC(C(CCCNC(=N)N)NC(C(N)CCC(N)=O)=O)=O)=O)=O)=O)=O)=O)=O)=O)=O)C(NC(C(=O)N1CCCC1C(=O)NC(CS)C(NC(CC(N)=O)C(NCC(=O)N1CCCC1C(O)=O)=O)=O)Cc1ccc(O)cc1)=O", // CHEMBL527084
|
|
"CCCCCCC1C23C4=c5c6c7c8c5c5c9c%10c%11c%12c(c%108)c8c7c7c%10c%13c%14c%15c%16c%17c%18c%19c%20c(c%21c%22c%23c(c9C(C25C[N+]1(C)C)C1c2c3c3c5c9c2-c(c%231)c(c%22%19)C%18C9C1(C5=C%13C(C43)c6%10)C%14%17C[N+](C)(C)C1CCCCCC)c%21%11)c%12c(c%16%20)c8c%157", // CHEMBL439119
|
|
"CC(C)CC(NC(=O)C(Cc1ccc(NC(C)=O)cc1)NC(=O)C(Cc1ccc(NC(C)=O)cc1)NC(C(CO)NC(C(NC(c1ccncc1)=O)NC(=O)C(Cc1ccc(Cl)cc1)NC(C(NC(C)=O)Cc1cc2ccccc2cc1)=O)=O)=O)C(NC(CCCCNC(C)C)C(N1C(C(=O)NC(C)C(N)=O)CCC1)=O)=O", // CHEMBL439258
|
|
"NCCCCC(NC(CN)=O)C(NCC(NC(CC(C)C)C(=O)N1Cc2ccccc2CC1C(N1CC2CCCCC2C1C(NCC(NC(CC(C)C)C(=O)N1Cc2ccccc2CC1C(=O)N1CC2CCCCC2C1C(NCC(NC(C(N1Cc2ccccc2CC1C(N1CC2CCCCC2C1C(NCC(NC(CC(C)C)C(=O)N1Cc2ccccc2CC1C(=O)N1CC2CCCCC2C1C(NCC(NC(C(N1Cc2c(cccc2)CC1C(N1CC2CCCCC2C1C(NCC(NC(CC(C)C)C(=O)N1Cc2ccccc2CC1C(N1CC2CCCCC2C1C(NCC(NC(CC(C)C)C(NC(C(N)=O)CCCNC(=N)N)=O)=O)=O)=O)=O)=O)=O)=O)CCCCN)=O)=O)=O)=O)=O)=O)CCCCN)=O)=O)=O)=O)=O)=O)=O", // CHEMBL441746
|
|
"CC(C)CC(NC(C(C(C)C)NC(C(N)CCC(O)=O)=O)=O)C(NC(C(O)C(=O)NC(CC(O)=O)C(NCC(=O)NC(CCC(O)=O)C(NC(C(O)=O)Cc1ccccc1)=O)=O)Cc1ccccc1)=O", // CHEMBL384606
|
|
"CCC(C)C1C(=O)N2CCCC2C(=O)NC2CSSCC3NC(=O)C(C(C)C)NC(=O)C(CCCCN)NC(=O)C(CC(N)=O)NC(=O)C(CCCCN)NC(=O)C4CSSCC(C(=O)NC(C(C)C)C(=O)NC(Cc5ccccc5)C(=O)N1)NC(=O)C(CO)NC(=O)C(CCC(O)=O)NC(=O)CNC(=O)C(NC(=O)C1CCCN1C(=O)C(C(C)CC)NC(=O)CNC(=O)C(CC(N)=O)NC(=O)C(CCCNC(=N)N)NC(=O)C(Cc1ccc(O)cc1)NC3=O)CSSCC(NC(=O)CNC(=O)C(C)NC(=O)C(C(C)C)NC(=O)C(C(C)O)NC(=O)C(C(C)O)NC(=O)C(CC(C)C)NC2=O)C(=O)NC(CO)C(=O)N4", // CHEMBL526869
|
|
};
|
|
for(size_t i=0; i<sizeof(smi)/sizeof(smi[0]); i++)
|
|
{
|
|
ROMOL_SPTR mol = ROMOL_SPTR(SmilesToMol(smi[i]));
|
|
std::vector<unsigned> atomsToUse;
|
|
std::vector<unsigned> bondsToUse;
|
|
std::vector<boost::uint32_t> atomCodes(mol->getNumAtoms());
|
|
std::vector<boost::uint32_t> bondCodes(mol->getNumBonds());
|
|
|
|
unsigned n;
|
|
n = mol->getNumAtoms();
|
|
atomsToUse.resize(n);
|
|
for(unsigned i = 0; i < n; i++)
|
|
atomsToUse[i] = i;
|
|
n = mol->getNumBonds();
|
|
bondsToUse.resize(n);
|
|
for(unsigned i = 0; i < n; i++)
|
|
bondsToUse[i] = i;
|
|
|
|
n = mol->getNumAtoms();
|
|
for(unsigned i = 0; i < n; i++)
|
|
atomCodes[i] = 1; // + mol->getAtomWithIdx(i)->getAtomicNum(); //res0 != res1,2,3
|
|
n = mol->getNumBonds();
|
|
for(unsigned i = 0; i < n; i++)
|
|
bondCodes[i] = 1;
|
|
|
|
fillAtomBondCodes(*mol, CF_NO_LABELS, &atomCodes, &bondCodes);
|
|
|
|
HashCodeType res0 = generateMoleculeHashCode(*mol);
|
|
HashCodeType res1 = generateMoleculeHashCode(*mol, &atomsToUse, 0, &atomCodes, &bondCodes);
|
|
HashCodeType res2 = generateMoleculeHashCode(*mol, 0, &bondsToUse, &atomCodes, &bondCodes);
|
|
HashCodeType res3 = generateMoleculeHashCode(*mol, &atomsToUse, &bondsToUse, &atomCodes, &bondCodes);
|
|
|
|
std::cout << res0 <<" = "<< encode(&res0, sizeof(res0)) << std::endl;
|
|
std::cout << res1 <<" = "<< encode(&res1, sizeof(res1)) << std::endl;
|
|
std::cout << res2 <<" = "<< encode(&res2, sizeof(res2)) << std::endl;
|
|
std::cout << res3 <<" = "<< encode(&res3, sizeof(res3)) << std::endl << std::endl;
|
|
|
|
// bool passed = 0 != res0 && res0 == res1 && res0 == res2 && res0 == res3;
|
|
// TEST_ASSERT(passed);
|
|
}
|
|
BOOST_LOG(rdInfoLog) << "\tdone" << std::endl;
|
|
}
|
|
|
|
void test2()
|
|
{
|
|
BOOST_LOG(rdInfoLog) << "-------------------------------------" << std::endl;
|
|
BOOST_LOG(rdInfoLog) << "Testing MolHash test2 CHIRALITY == ATOM"<< std::endl;
|
|
std::cout<<"Hash size = "<< 8*sizeof(HashCodeType) <<" bits.\n";
|
|
const char* smi[] =
|
|
{
|
|
//equal non-chiral hash
|
|
"C[C@H](F)Cl",
|
|
"C[C@@H](F)Cl",
|
|
"CC(F)Cl",
|
|
"[13CH3]C(F)Cl",
|
|
"C[C@H](Cl)F",
|
|
"C[C@@H](Cl)F",
|
|
};
|
|
|
|
std::vector<HashCodeType> HashNonChiral;
|
|
|
|
for(size_t i=0; i < sizeof(smi)/sizeof(smi[0]); i++)
|
|
{
|
|
ROMOL_SPTR mol = ROMOL_SPTR(SmilesToMol(smi[i]));
|
|
std::vector<boost::uint32_t> atomCodes(mol->getNumAtoms());
|
|
std::vector<boost::uint32_t> bondCodes(mol->getNumBonds());
|
|
|
|
fillAtomBondCodes(*mol, CF_ELEMENT|CF_CHARGE/*|CF_VALENCE*/
|
|
| CF_ATOM_AROMATIC
|
|
, &atomCodes, &bondCodes);
|
|
|
|
// fillAtomBondCodes(*mol, CF_ATOM_ALL &(~(CF_BOND_CHIRALITY | CF_ATOM_CHIRALITY | CF_ISOTOPE)), &atomCodes, &bondCodes);
|
|
HashCodeType res = generateMoleculeHashCode(*mol, 0, 0, &atomCodes, &bondCodes);
|
|
HashNonChiral.push_back(res);
|
|
std::cout << res <<" = "<< encode(&res, sizeof(res)) <<" | "<< smi[i]<<std::endl;
|
|
}
|
|
bool passed = true;
|
|
for(size_t i=0; i < HashNonChiral.size(); i++)
|
|
for(size_t j=0; j < HashNonChiral.size(); j++)
|
|
if(i != j && HashNonChiral[i] != HashNonChiral[j])
|
|
passed = false;
|
|
TEST_ASSERT(passed);
|
|
BOOST_LOG(rdInfoLog) << "\tdone" << std::endl;
|
|
}
|
|
|
|
void test21()
|
|
{
|
|
BOOST_LOG(rdInfoLog) << "-------------------------------------" << std::endl;
|
|
BOOST_LOG(rdInfoLog) << "Testing MolHash test21 CHIRALITY == BOND"<< std::endl;
|
|
std::cout<<"Hash size = "<< 8*sizeof(HashCodeType) <<" bits.\n";
|
|
const char* smi[] =
|
|
{
|
|
//equal non-chiral BOND hash
|
|
"C/C=C/C",
|
|
"CC=CC",
|
|
"C/C=C\\C",
|
|
};
|
|
|
|
std::vector<HashCodeType> HashNonChiral;
|
|
|
|
for(size_t i=0; i < sizeof(smi)/sizeof(smi[0]); i++)
|
|
{
|
|
ROMOL_SPTR mol = ROMOL_SPTR(SmilesToMol(smi[i]));
|
|
std::vector<boost::uint32_t> atomCodes(mol->getNumAtoms());
|
|
std::vector<boost::uint32_t> bondCodes(mol->getNumBonds());
|
|
|
|
fillAtomBondCodes(*mol, CF_BOND_ALL &(~(CF_BOND_CHIRALITY)), &atomCodes, &bondCodes);
|
|
HashCodeType res = generateMoleculeHashCode(*mol, 0, 0, &atomCodes, &bondCodes);
|
|
HashNonChiral.push_back(res);
|
|
std::cout << res <<" = "<< encode(&res, sizeof(res)) <<" | "<< smi[i]<<std::endl;
|
|
}
|
|
bool passed = true;
|
|
for(size_t i=0; i < HashNonChiral.size(); i++)
|
|
for(size_t j=0; j < HashNonChiral.size(); j++)
|
|
if(i != j && HashNonChiral[i] != HashNonChiral[j])
|
|
passed = false;
|
|
TEST_ASSERT(passed);
|
|
BOOST_LOG(rdInfoLog) << "\tdone" << std::endl;
|
|
}
|
|
|
|
void test3()
|
|
{
|
|
BOOST_LOG(rdInfoLog) << "-------------------------------------" << std::endl;
|
|
BOOST_LOG(rdInfoLog) << "Testing MolHash test3 CHIRALITY DIFF"<< std::endl;
|
|
const char* smi[] =
|
|
{
|
|
//different chiral hash
|
|
"C[C@H](F)Cl",
|
|
"C[C@@H](F)Cl",
|
|
"CC(F)Cl",
|
|
"[13CH3]C(F)Cl",
|
|
|
|
"C[C@H]1CC[C@H](C)CC1",
|
|
"C[C@H]1CC[C@@H](C)CC1",
|
|
"CC1CCC(C)CC1",
|
|
};
|
|
|
|
std::vector<HashCodeType> HashChiral;
|
|
|
|
for(size_t i=0; i < sizeof(smi)/sizeof(smi[0]); i++)
|
|
{
|
|
ROMOL_SPTR mol = ROMOL_SPTR(SmilesToMol(smi[i]));
|
|
std::vector<boost::uint32_t> atomCodes(mol->getNumAtoms());
|
|
std::vector<boost::uint32_t> bondCodes(mol->getNumBonds());
|
|
|
|
fillAtomBondCodes(*mol, CF_BOND_CHIRALITY | CF_ATOM_CHIRALITY | CF_ISOTOPE, &atomCodes, &bondCodes);
|
|
HashCodeType resC = generateMoleculeHashCode(*mol, 0, 0, &atomCodes, &bondCodes);
|
|
HashChiral.push_back(resC);
|
|
|
|
std::cout << resC <<" = "<< encode(&resC, sizeof(resC)) <<" "<< smi[i]<<std::endl;
|
|
}
|
|
|
|
bool passed = true;
|
|
for(size_t i=0; i < HashChiral.size(); i++)
|
|
for(size_t j=0; j < HashChiral.size(); j++)
|
|
if(i != j && HashChiral[i] == HashChiral[j])
|
|
passed = false;
|
|
TEST_ASSERT(passed);
|
|
BOOST_LOG(rdInfoLog) << "\tdone" << std::endl;
|
|
}
|
|
|
|
void test3a()
|
|
{
|
|
BOOST_LOG(rdInfoLog) << "-------------------------------------" << std::endl;
|
|
BOOST_LOG(rdInfoLog) << "Testing MolHash test3a CHIRALITY EQUAL"<< std::endl;
|
|
{
|
|
const char* smi[] =
|
|
{
|
|
"C[C@H](F)Cl",
|
|
"C[C@@H](Cl)F",
|
|
};
|
|
ROMOL_SPTR mol1 = ROMOL_SPTR(SmilesToMol(smi[0]));
|
|
ROMOL_SPTR mol2 = ROMOL_SPTR(SmilesToMol(smi[1]));
|
|
|
|
std::vector<boost::uint32_t> atomCodes(mol1->getNumAtoms());
|
|
std::vector<boost::uint32_t> bondCodes(mol2->getNumBonds());
|
|
|
|
fillAtomBondCodes(*mol1, CF_BOND_CHIRALITY | CF_ATOM_CHIRALITY | CF_ISOTOPE, &atomCodes, &bondCodes);
|
|
HashCodeType hash1 = generateMoleculeHashCode(*mol1, 0, 0, &atomCodes, &bondCodes);
|
|
fillAtomBondCodes(*mol2, CF_BOND_CHIRALITY | CF_ATOM_CHIRALITY | CF_ISOTOPE, &atomCodes, &bondCodes);
|
|
HashCodeType hash2 = generateMoleculeHashCode(*mol2, 0, 0, &atomCodes, &bondCodes);
|
|
std::cout<<hash1<<" "<<hash2<<std::endl;
|
|
TEST_ASSERT(hash1==hash2);
|
|
}
|
|
{
|
|
const char* smi[] =
|
|
{
|
|
"C[C@@H](F)Cl",
|
|
"C[C@H](Cl)F",
|
|
};
|
|
ROMOL_SPTR mol1 = ROMOL_SPTR(SmilesToMol(smi[0]));
|
|
ROMOL_SPTR mol2 = ROMOL_SPTR(SmilesToMol(smi[1]));
|
|
|
|
std::vector<boost::uint32_t> atomCodes(mol1->getNumAtoms());
|
|
std::vector<boost::uint32_t> bondCodes(mol2->getNumBonds());
|
|
|
|
fillAtomBondCodes(*mol1, CF_BOND_CHIRALITY | CF_ATOM_CHIRALITY | CF_ISOTOPE, &atomCodes, &bondCodes);
|
|
HashCodeType hash1 = generateMoleculeHashCode(*mol1, 0, 0, &atomCodes, &bondCodes);
|
|
fillAtomBondCodes(*mol2, CF_BOND_CHIRALITY | CF_ATOM_CHIRALITY | CF_ISOTOPE, &atomCodes, &bondCodes);
|
|
HashCodeType hash2 = generateMoleculeHashCode(*mol2, 0, 0, &atomCodes, &bondCodes);
|
|
std::cout<<hash1<<" "<<hash2<<std::endl;
|
|
TEST_ASSERT(hash1==hash2);
|
|
}
|
|
|
|
{
|
|
const char* smi[] =
|
|
{
|
|
"C/C=C/Cl",
|
|
"Cl/C=C/C",
|
|
};
|
|
ROMOL_SPTR mol1 = ROMOL_SPTR(SmilesToMol(smi[0]));
|
|
ROMOL_SPTR mol2 = ROMOL_SPTR(SmilesToMol(smi[1]));
|
|
|
|
std::vector<boost::uint32_t> atomCodes(mol1->getNumAtoms());
|
|
std::vector<boost::uint32_t> bondCodes(mol2->getNumBonds());
|
|
|
|
fillAtomBondCodes(*mol1, CF_BOND_CHIRALITY | CF_ATOM_CHIRALITY | CF_ISOTOPE, &atomCodes, &bondCodes);
|
|
HashCodeType hash1 = generateMoleculeHashCode(*mol1, 0, 0, &atomCodes, &bondCodes);
|
|
fillAtomBondCodes(*mol2, CF_BOND_CHIRALITY | CF_ATOM_CHIRALITY | CF_ISOTOPE, &atomCodes, &bondCodes);
|
|
HashCodeType hash2 = generateMoleculeHashCode(*mol2, 0, 0, &atomCodes, &bondCodes);
|
|
std::cout<<hash1<<" "<<hash2<<std::endl;
|
|
TEST_ASSERT(hash1==hash2);
|
|
}
|
|
{
|
|
const char* smi[] =
|
|
{
|
|
"C/C=C/Cl",
|
|
"C/C=C\\Cl",
|
|
};
|
|
ROMOL_SPTR mol1 = ROMOL_SPTR(SmilesToMol(smi[0]));
|
|
ROMOL_SPTR mol2 = ROMOL_SPTR(SmilesToMol(smi[1]));
|
|
|
|
std::vector<boost::uint32_t> atomCodes(mol1->getNumAtoms());
|
|
std::vector<boost::uint32_t> bondCodes(mol2->getNumBonds());
|
|
|
|
fillAtomBondCodes(*mol1, CF_BOND_CHIRALITY | CF_ATOM_CHIRALITY | CF_ISOTOPE, &atomCodes, &bondCodes);
|
|
HashCodeType hash1 = generateMoleculeHashCode(*mol1, 0, 0, &atomCodes, &bondCodes);
|
|
fillAtomBondCodes(*mol2, CF_BOND_CHIRALITY | CF_ATOM_CHIRALITY | CF_ISOTOPE, &atomCodes, &bondCodes);
|
|
HashCodeType hash2 = generateMoleculeHashCode(*mol2, 0, 0, &atomCodes, &bondCodes);
|
|
std::cout<<hash1<<" "<<hash2<<std::endl;
|
|
TEST_ASSERT(hash1!=hash2);
|
|
}
|
|
|
|
BOOST_LOG(rdInfoLog) << "\tdone" << std::endl;
|
|
}
|
|
|
|
|
|
void test4()
|
|
{
|
|
BOOST_LOG(rdInfoLog) << "-------------------------------------" << std::endl;
|
|
BOOST_LOG(rdInfoLog) << "Testing MolHash test4 STRING"<< std::endl;
|
|
const char* smi[] =
|
|
{
|
|
//different chiral hash and equal non-chiral hash
|
|
"C[C@H](F)Cl",
|
|
"C[C@@H](F)Cl",
|
|
"CC(F)Cl",
|
|
"[13CH3]C(F)Cl",
|
|
//different chiral hash
|
|
"C[C@H]1CC[C@H](C)CC1",
|
|
"C[C@H]1CC[C@@H](C)CC1",
|
|
"CC1CCC(C)CC1",
|
|
};
|
|
|
|
for(size_t i=0; i < sizeof(smi)/sizeof(smi[0]); i++)
|
|
{
|
|
ROMOL_SPTR mol = ROMOL_SPTR(SmilesToMol(smi[i]));
|
|
std::cout << generateMoleculeHashSet(*mol, 0, 0) <<" "<< smi[i] << std::endl;
|
|
}
|
|
TEST_ASSERT(true); // there is no any exseption
|
|
BOOST_LOG(rdInfoLog) << "\tdone" << std::endl;
|
|
}
|
|
|
|
void test5()
|
|
{
|
|
BOOST_LOG(rdInfoLog) << "-------------------------------------" << std::endl;
|
|
BOOST_LOG(rdInfoLog) << "Testing MolHash test5 "<< std::endl;
|
|
const char* smi[] =
|
|
{
|
|
// different chiral hash and equal non-chiral hash
|
|
// groups of 3
|
|
"C[CH](F)Cl",
|
|
"C[C@H](F)Cl",
|
|
"C[C@@H](F)Cl",
|
|
//
|
|
"c1cc(C[CH](F)Cl)cnc1",
|
|
"c1cc(C[C@H](F)Cl)cnc1",
|
|
"c1cc(C[C@@H](F)Cl)cnc1"
|
|
};
|
|
|
|
for(size_t i=0; i < sizeof(smi)/sizeof(smi[0]); i+=3)
|
|
{
|
|
ROMOL_SPTR mol1 = ROMOL_SPTR(SmilesToMol(smi[i]));
|
|
TEST_ASSERT(mol1);
|
|
ROMOL_SPTR mol2 = ROMOL_SPTR(SmilesToMol(smi[i+1]));
|
|
TEST_ASSERT(mol2);
|
|
ROMOL_SPTR mol3 = ROMOL_SPTR(SmilesToMol(smi[i+2]));
|
|
TEST_ASSERT(mol3);
|
|
{
|
|
std::string hash1=generateMoleculeHashSet(*mol1);
|
|
std::string hash2=generateMoleculeHashSet(*mol2);
|
|
std::string hash3=generateMoleculeHashSet(*mol3);
|
|
TEST_ASSERT(hash1!=hash2);
|
|
TEST_ASSERT(hash1!=hash3);
|
|
TEST_ASSERT(hash3!=hash2);
|
|
}
|
|
// {
|
|
// std::string hash1=generateMoleculeHashSet(*mol1);
|
|
// std::string hash2=generateMoleculeHashSet(*mol2);
|
|
// std::cout << hash1 <<" "<< smi[i] << std::endl;
|
|
// std::cout << hash2 <<" "<< smi[i+1] << std::endl;
|
|
// TEST_ASSERT(hash1!=hash2);
|
|
// }
|
|
}
|
|
BOOST_LOG(rdInfoLog) << "\tdone" << std::endl;
|
|
}
|
|
|
|
|
|
void doUnitTest()
|
|
{
|
|
std::cout<<"Hash size = "<< 8*sizeof(HashCodeType) <<" bits.\n";
|
|
|
|
BOOST_LOG(rdInfoLog) << "*******************************************************\n";
|
|
test1();
|
|
BOOST_LOG(rdInfoLog) << "*******************************************************\n";
|
|
test2();
|
|
BOOST_LOG(rdInfoLog) << "*******************************************************\n";
|
|
test21();
|
|
BOOST_LOG(rdInfoLog) << "*******************************************************\n";
|
|
test3();
|
|
BOOST_LOG(rdInfoLog) << "*******************************************************\n";
|
|
test3a();
|
|
BOOST_LOG(rdInfoLog) << "*******************************************************\n";
|
|
test4();
|
|
BOOST_LOG(rdInfoLog) << "*******************************************************\n";
|
|
test5();
|
|
}
|
|
|
|
//=============================================================================
|
|
// Investigative test case for estimating the probability of hash code collisions
|
|
//=============================================================================
|
|
|
|
// Splits one line of a SMILES file into the SMILES string and the trailing
// identifier.
//   smiles - NUL-terminated line of the form "<SMILES>[<sep><id>]", where
//            <sep> is a single space or tab
//   id     - optional output; receives the text after the first separator, or
//            an empty string when the line has no separator
// Returns the text before the first separator (the whole line if there is
// none). The separator itself belongs to neither part.
std::string getSmilesOnly(const char* smiles, std::string* id=0)
{
  const std::string line(smiles);
  const std::string::size_type sep = line.find_first_of(" \t");

  if(std::string::npos == sep) // no identifier on this line
  {
    if(id)
      id->clear();
    return line;
  }

  if(id)
    *id = line.substr(sep+1);
  return line.substr(0, sep);
}
|
|
|
|
// Computes the hash code of a whole molecule: atom and bond codes are
// produced by fillAtomBondCodes with the given flags, and every atom and
// every bond of the molecule is selected for hashing.
HashCodeType computeHash(const ROMol &mol, CodeFlags flags)
{
  std::vector<boost::uint32_t> atomCodes;
  std::vector<boost::uint32_t> bondCodes;
  fillAtomBondCodes(mol, flags, &atomCodes, &bondCodes);

  // select all atoms
  std::vector<unsigned> atomsToUse;
  const unsigned numAtoms = mol.getNumAtoms();
  for(unsigned a=0; a < numAtoms; ++a)
    atomsToUse.push_back(mol.getAtomWithIdx(a)->getIdx());

  // select all bonds
  std::vector<unsigned> bondsToUse;
  const unsigned numBonds = mol.getNumBonds();
  for(unsigned b=0; b < numBonds; ++b)
    bondsToUse.push_back(mol.getBondWithIdx(b)->getIdx());

  return generateMoleculeHashCode(mol, &atomsToUse, &bondsToUse, &atomCodes, &bondCodes);
}
|
|
|
|
// {num atoms, num bonds} - {formula hash}
|
|
// - {non-chiral atom hashes} - {non-chiral bond hashes}
|
|
// - {chiral information}
|
|
|
|
#pragma pack(push,1)
|
|
struct HashResult
|
|
{
|
|
unsigned Line; // molecule Id [1, ...)
|
|
HashCodeType Hash;
|
|
// unsigned ChiralInfo;
|
|
HashResult(unsigned id = 0) : Line(id), Hash(0)//, ChiralInfo(0)
|
|
{
|
|
}
|
|
};
|
|
#pragma pack(pop)
|
|
|
|
bool lessHashResult_ALL(const HashResult& r, const HashResult& l)
|
|
{
|
|
return r.Hash < l.Hash;
|
|
}
|
|
|
|
void analyzeResults(std::list<HashResult>& res)
|
|
{
|
|
std::cerr<<"\nANALYZING "<<res.size()<<" Results ...\n";
|
|
std::cout<<"Collisions found:\n";
|
|
// std::sort(res.begin(), res.end(), lessHashResult_ALL);
|
|
unsigned rn=0, cn=0;
|
|
for(std::list<HashResult>::iterator r0=res.begin(); r0!=res.end(); r0++)
|
|
{
|
|
std::cerr<<"Result: "<<++rn<<"\r";
|
|
if(0 == r0->Line) // collision has been already found
|
|
continue;
|
|
unsigned hashCollision=0;
|
|
std::vector<unsigned> cl;
|
|
//use binary search of collision in sorted list to improve performance
|
|
//........
|
|
std::list<HashResult>::iterator r1=r0;
|
|
for(std::list<HashResult>::iterator r=++r1; r!=res.end(); r++)
|
|
{
|
|
if(0==r->Line) // collision has been already found
|
|
continue;
|
|
if(r->Hash == r0->Hash) // collision found
|
|
{
|
|
++hashCollision;
|
|
cl.push_back(r->Line);
|
|
// std::cout<<r0->Id<<"="<<r->Line<<"\n"; // TEMP TEST
|
|
r->Line = 0; // mark as already processed collision to exclude duplicates
|
|
}
|
|
}
|
|
if(0!=hashCollision) // collision found
|
|
{
|
|
cn += hashCollision;
|
|
std::cout<<"mol line "<<r0->Line<<": "<<hashCollision<<" collisions with: ";
|
|
for(unsigned i=0; i < cl.size(); i++)
|
|
std::cout<<cl[i]<<" ";
|
|
std::cout<<"lines.\n";
|
|
}
|
|
}
|
|
std::cout<<"Total: "<<cn<<" hash collisions found in "<<res.size()<<" molecules.\n";
|
|
}
|
|
|
|
void testFileSMILES(const char* file, HashCodeType bitMask)
|
|
{
|
|
unsigned line=0;
|
|
std::list<HashResult> res;
|
|
std::cout<<"FILE: "<<file<<"\n";
|
|
|
|
FILE* f = fopen(file, "rt");
|
|
if(!f)
|
|
{
|
|
perror("Could not OPEN smi file");
|
|
return;
|
|
}
|
|
char smiles[4096];
|
|
while(fgets(smiles, sizeof(smiles), f) && line <= 1000999)
|
|
{
|
|
for(size_t i = strlen(smiles)-1; i > 0 && smiles[i] < ' '; i--)
|
|
smiles[i] = '\0'; // remove LF
|
|
std::string id;
|
|
std::cerr<<"\rLine: "<< ++line <<" ";
|
|
if('#' != smiles[0] && ' ' != smiles[0] && '/' != smiles[0] // commented to skip
|
|
&& 0 == strchr(smiles,'.')) // skip ions
|
|
{
|
|
ROMOL_SPTR mol;
|
|
try
|
|
{
|
|
mol = ROMOL_SPTR(SmilesToMol(getSmilesOnly(smiles, &id)));
|
|
}
|
|
catch(...) // internal RDKit error: Invar::Invariant& ex
|
|
{
|
|
std::cerr<<" RDKit error: "<< smiles <<"/n";
|
|
continue;
|
|
}
|
|
res.push_back(HashResult(line));
|
|
HashResult& r = res.back();
|
|
// r.ChiralInfo = 0;//mol-();
|
|
r.Hash = computeHash(*mol, CF_ALL) & bitMask;
|
|
}
|
|
else
|
|
std::cerr<<" skipped: "<< smiles <<"/n";
|
|
}
|
|
fclose(f);
|
|
std::cout<<"\nDONE. "<<res.size()<<" molecules processed.\n";
|
|
analyzeResults(res);
|
|
std::cout<<"Test COMPLETED.\n";
|
|
}
|
|
|
|
void checkCollisions(const char* file, boost::uint32_t bits=0)
|
|
{
|
|
HashCodeType bitMask = 0;
|
|
if(0 == bits || 8*sizeof(HashCodeType) < bits)
|
|
bits = 8*sizeof(HashCodeType);
|
|
for(unsigned i=0; i < bits; i++)
|
|
bitMask |= 1ULL << i;
|
|
std::cout<<"Hash size = "<<bits<<" bits. Mask = "<<bitMask<<"\n";
|
|
|
|
if(0==strcmp(file+strlen(file)-4, ".smi"))
|
|
testFileSMILES(file, bitMask);
|
|
else
|
|
std::cout<<"UNKNOWN File Extention.\n";
|
|
}
|
|
|
|
} //RDKit
|
|
|
|
int main(int argc, char*argv[])
|
|
{
|
|
RDKit::doUnitTest();
|
|
|
|
if(2==argc)
|
|
RDKit::checkCollisions(argv[1]);
|
|
else if(3==argc && isdigit(*argv[2]))
|
|
RDKit::checkCollisions(argv[1], atoi(argv[2]));
|
|
else if(1!=argc)
|
|
std::cout<<"UNKNOWN Argument.\n";
|
|
return 0;
|
|
}
|
|
|