// $Id$ // // Copyright (C) 2007,2008 Greg Landrum // // @@ All Rights Reserved @@ // This file is part of the RDKit. // The contents are covered by the terms of the BSD license // which is included in the file license.txt, found at the root // of the RDKit source tree. // #include SQLITE_EXTENSION_INIT1 #include #include #include #include #include #include #include #include #include #include #include #include std::string stringFromTextArg(sqlite3_value *arg) { const unsigned char *text = sqlite3_value_text(arg); int nBytes = sqlite3_value_bytes(arg); std::string res((const char *)text, nBytes); return res; } std::string stringFromBlobArg(sqlite3_value *arg) { const void *blob = sqlite3_value_blob(arg); int nBytes = sqlite3_value_bytes(arg); std::string res((const char *)blob, nBytes); return res; } RDKit::ROMol *molFromBlobArg(sqlite3_value *arg) { std::string pkl = stringFromBlobArg(arg); RDKit::ROMol *m; try { m = new RDKit::ROMol(pkl); } catch (RDKit::MolPicklerException &) { m = 0; } return m; } ExplicitBitVect *ebvFromBlobArg(sqlite3_value *arg) { std::string pkl = stringFromBlobArg(arg); ExplicitBitVect *ebv; try { ebv = new ExplicitBitVect(pkl); } catch (ValueErrorException &) { ebv = 0; } return ebv; } template RDKit::SparseIntVect *sivFromBlobArg(sqlite3_value *arg) { std::string pkl = stringFromBlobArg(arg); RDKit::SparseIntVect *siv; try { siv = new RDKit::SparseIntVect(pkl); } catch (ValueErrorException &) { siv = 0; } return siv; } /* --------------------------------- Benchmarking results. Database: 65385 pubchem compounds Simple access: select count(*) from molecules where length(molpkl)>40; 0.3s depickle : select count(*) from molecules where rdk_molNumAtoms(molpkl)>40; 11.3s substruct1 : select count(*) from molecules where rdk_molHasSubstruct(molpkl,'c1ncncn1'); 18.0s substruct2 : select count(*) from molecules where rdk_molHasSubstruct(molpkl,'[#6;r10]'); 15.8 3 Oct 2007: depickle : select count(*) from molecules where rdk_molNumAtoms(molpkl)>40; 9.4s mw : select count(*) from molecules where rdk_molAMW(molpkl)<200; 9.7s --------------------------------- */ static void numAtomsFunc(sqlite3_context *context, int argc, sqlite3_value **argv) { RDKit::ROMol *m = molFromBlobArg(argv[0]); if (m) { int res = m->getNumAtoms(); delete m; sqlite3_result_int(context, res); } else { std::string errorMsg = "BLOB could not be converted into a molecule"; sqlite3_result_error(context, errorMsg.c_str(), errorMsg.length()); } } static void molWtFunc(sqlite3_context *context, int argc, sqlite3_value **argv) { RDKit::ROMol *m = molFromBlobArg(argv[0]); if (m) { double res = RDKit::Descriptors::CalcAMW(*m); delete m; sqlite3_result_double(context, res); } else { std::string errorMsg = "BLOB could not be converted into a molecule"; sqlite3_result_error(context, errorMsg.c_str(), errorMsg.length()); } } static void molLogPFunc(sqlite3_context *context, int argc, sqlite3_value **argv) { RDKit::ROMol *m = molFromBlobArg(argv[0]); if (m) { double res, tmp; RDKit::Descriptors::CalcCrippenDescriptors(*m, res, tmp); delete m; sqlite3_result_double(context, res); } else { std::string errorMsg = "BLOB could not be converted into a molecule"; sqlite3_result_error(context, errorMsg.c_str(), errorMsg.length()); } } static void smilesToBlob(sqlite3_context *context, int argc, sqlite3_value **argv) { std::string smiles = stringFromTextArg(argv[0]); RDKit::ROMol *m = 0; try { m = RDKit::SmilesToMol(smiles); } catch (RDKit::MolSanitizeException &) { m = 0; } if (m) { std::string text; RDKit::MolPickler::pickleMol(*m, text); delete m; sqlite3_result_blob(context, text.c_str(), text.length(), SQLITE_TRANSIENT); } else { std::string errorMsg = "SMILES could not be converted into a molecule"; sqlite3_result_error(context, errorMsg.c_str(), errorMsg.length()); } } static void molHasSubstruct(sqlite3_context *context, int argc, sqlite3_value **argv) { RDKit::ROMol *m = molFromBlobArg(argv[0]); if (!m) { std::string errorMsg = "BLOB (argument 1) could not be converted into a molecule"; sqlite3_result_error(context, errorMsg.c_str(), errorMsg.length()); return; } std::string smarts = stringFromTextArg(argv[1]); std::map &molMap = *static_cast *>( sqlite3_user_data(context)); RDKit::ROMol *patt = 0; if (molMap.find(smarts) != molMap.end()) { patt = boost::any_cast(molMap[smarts]).get(); } else { patt = static_cast(RDKit::SmartsToMol(smarts)); molMap[smarts] = boost::any(RDKit::ROMOL_SPTR(patt)); } if (!patt) { std::string errorMsg = "SMARTS (argument 2) could not be converted into a molecule"; sqlite3_result_error(context, errorMsg.c_str(), errorMsg.length()); return; } RDKit::MatchVectType match; int res = RDKit::SubstructMatch(*m, *patt, match, true, false, true); delete m; sqlite3_result_int(context, res); } static void molSubstructCount(sqlite3_context *context, int argc, sqlite3_value **argv) { RDKit::ROMol *m = molFromBlobArg(argv[0]); if (!m) { std::string errorMsg = "BLOB (argument 1) could not be converted into a molecule"; sqlite3_result_error(context, errorMsg.c_str(), errorMsg.length()); return; } std::string smarts = stringFromTextArg(argv[1]); std::map &molMap = *static_cast *>( sqlite3_user_data(context)); RDKit::ROMol *patt = 0; if (molMap.find(smarts) != molMap.end()) { patt = boost::any_cast(molMap[smarts]).get(); } else { patt = static_cast(RDKit::SmartsToMol(smarts)); molMap[smarts] = boost::any(RDKit::ROMOL_SPTR(patt)); } if (!patt) { std::string errorMsg = "SMARTS (argument 2) could not be converted into a molecule"; sqlite3_result_error(context, errorMsg.c_str(), errorMsg.length()); return; } std::vector matches; int res = RDKit::SubstructMatch(*m, *patt, matches, true, true, false); delete m; sqlite3_result_int(context, res); } static void blobToRDKitFingerprint(sqlite3_context *context, int argc, sqlite3_value **argv) { RDKit::ROMol *m = molFromBlobArg(argv[0]); if (!m) { std::string errorMsg = "BLOB (argument 1) could not be converted into a molecule"; sqlite3_result_error(context, errorMsg.c_str(), errorMsg.length()); return; } ExplicitBitVect *fp = RDKit::DaylightFingerprintMol(*m, 1, 7, 2048, 4, true, 0.3, 128); std::string text = fp->toString(); delete fp; delete m; sqlite3_result_text(context, text.c_str(), text.length(), SQLITE_TRANSIENT); } static void blobToAtomPairFingerprint(sqlite3_context *context, int argc, sqlite3_value **argv) { RDKit::ROMol *m = molFromBlobArg(argv[0]); if (!m) { std::string errorMsg = "BLOB (argument 1) could not be converted into a molecule"; sqlite3_result_error(context, errorMsg.c_str(), errorMsg.length()); return; } RDKit::SparseIntVect *fp = RDKit::Descriptors::AtomPairs::getAtomPairFingerprint(*m); std::string text = fp->toString(); delete fp; delete m; sqlite3_result_text(context, text.c_str(), text.length(), SQLITE_TRANSIENT); } static void bvTanimotoSim(sqlite3_context *context, int argc, sqlite3_value **argv) { ExplicitBitVect *bv1 = ebvFromBlobArg(argv[0]); if (!bv1) { std::string errorMsg = "BLOB (argument 1) could not be converted into a bit vector"; sqlite3_result_error(context, errorMsg.c_str(), errorMsg.length()); return; } ExplicitBitVect *bv2 = ebvFromBlobArg(argv[1]); if (!bv2) { delete bv1; std::string errorMsg = "BLOB (argument 2) could not be converted into a bit vector"; sqlite3_result_error(context, errorMsg.c_str(), errorMsg.length()); return; } double res = SimilarityWrapper(*bv1, *bv2, TanimotoSimilarity); delete bv1; delete bv2; sqlite3_result_double(context, res); } static void ucvTanimotoSim(sqlite3_context *context, int argc, sqlite3_value **argv) { // table from Andrew Dalke: static const unsigned int popCounts[] = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8, }; const unsigned char *t1 = (const unsigned char *)sqlite3_value_blob(argv[0]); int nB1 = sqlite3_value_bytes(argv[0]); const unsigned char *t2 = (const unsigned char *)sqlite3_value_blob(argv[1]); int nB2 = sqlite3_value_bytes(argv[1]); if (nB1 != nB2) { std::string errorMsg = "bit vectors not ths same length"; sqlite3_result_error(context, errorMsg.c_str(), errorMsg.length()); return; } unsigned int x = 0, y = 0, z = 0; for (unsigned int i = 0; i < (unsigned int)nB1; ++i) { y += popCounts[*t1]; z += popCounts[*t2]; x += popCounts[(*t1) & (*t2)]; ++t1; ++t2; } double res = 0; if (y + z - x > 0) { res = double(x) / (y + z - x); } sqlite3_result_double(context, res); } #if 0 // Naive approach: actually construct two sparse int vects: static void sivDiceSim( sqlite3_context *context, int argc, sqlite3_value **argv ){ RDKit::SparseIntVect *v1=sivFromBlobArg(argv[0]); if(!v1){ std::string errorMsg="BLOB (argument 1) could not be converted into an int vector"; sqlite3_result_error(context,errorMsg.c_str(),errorMsg.length()); return; } RDKit::SparseIntVect *v2=sivFromBlobArg(argv[1]); if(!v2){ delete v1; std::string errorMsg="BLOB (argument 2) could not be converted into a bit vector"; sqlite3_result_error(context,errorMsg.c_str(),errorMsg.length()); return; } double res= RDKit::DiceSimilarity(*v1,*v2); delete v1; delete v2; sqlite3_result_double(context, res); } #else // faster, just parse the format directly static void sivDiceSim(sqlite3_context *context, int argc, sqlite3_value **argv) { const unsigned char *t1 = (const unsigned char *)sqlite3_value_blob(argv[0]); int nB1 = sqlite3_value_bytes(argv[0]); const unsigned char *t2 = (const unsigned char *)sqlite3_value_blob(argv[1]); int nB2 = sqlite3_value_bytes(argv[1]); // check the version flags: boost::uint32_t tmp; tmp = *(reinterpret_cast(t1)); t1 += sizeof(boost::uint32_t); if (tmp != ci_SPARSEINTVECT_VERSION) { std::string errorMsg = "BLOB (argument 1) could not be converted into an int vector"; sqlite3_result_error(context, errorMsg.c_str(), errorMsg.length()); return; } tmp = *(reinterpret_cast(t2)); t2 += sizeof(boost::uint32_t); if (tmp != ci_SPARSEINTVECT_VERSION) { std::string errorMsg = "BLOB (argument 2) could not be converted into an int vector"; sqlite3_result_error(context, errorMsg.c_str(), errorMsg.length()); return; } // check the element size: tmp = *(reinterpret_cast(t1)); t1 += sizeof(boost::uint32_t); if (tmp != sizeof(boost::uint32_t)) { std::string errorMsg = "BLOB (argument 1) could not be converted into an uint32_t vector"; sqlite3_result_error(context, errorMsg.c_str(), errorMsg.length()); return; } tmp = *(reinterpret_cast(t2)); t2 += sizeof(boost::uint32_t); if (tmp != sizeof(boost::uint32_t)) { std::string errorMsg = "BLOB (argument 2) could not be converted into an uint32_t vector"; sqlite3_result_error(context, errorMsg.c_str(), errorMsg.length()); return; } double res = 0.; // start reading: boost::uint32_t len1, len2; len1 = *(reinterpret_cast(t1)); t1 += sizeof(boost::uint32_t); len2 = *(reinterpret_cast(t2)); t2 += sizeof(boost::uint32_t); if (len1 != len2) { std::string errorMsg = "attempt to compare fingerprints of different length"; sqlite3_result_error(context, errorMsg.c_str(), errorMsg.length()); return; } boost::uint32_t nElem1, nElem2; nElem1 = *(reinterpret_cast(t1)); t1 += sizeof(boost::uint32_t); nElem2 = *(reinterpret_cast(t2)); t2 += sizeof(boost::uint32_t); if (!nElem1 || !nElem2) { res = 0.0; sqlite3_result_double(context, res); } double v1Sum = 0, v2Sum = 0, numer = 0; boost::uint32_t idx1 = 0; boost::int32_t v1; boost::uint32_t idx2 = 0; boost::int32_t v2; idx1 = *(reinterpret_cast(t1)); t1 += sizeof(boost::uint32_t); v1 = *(reinterpret_cast(t1)); t1 += sizeof(boost::int32_t); nElem1--; v1Sum += v1; idx2 = *(reinterpret_cast(t2)); t2 += sizeof(boost::uint32_t); v2 = *(reinterpret_cast(t2)); t2 += sizeof(boost::int32_t); nElem2--; v2Sum += v2; while (1) { while (nElem2 && idx2 < idx1) { idx2 = *(reinterpret_cast(t2)); t2 += sizeof(boost::uint32_t); v2 = *(reinterpret_cast(t2)); t2 += sizeof(boost::int32_t); nElem2--; v2Sum += v2; } if (idx2 == idx1) { // std::cerr<<" --- "<(t1)); t1 += sizeof(boost::uint32_t); v1 = *(reinterpret_cast(t1)); t1 += sizeof(boost::int32_t); nElem1--; v1Sum += v1; } else { break; } } while (nElem2) { idx2 = *(reinterpret_cast(t2)); t2 += sizeof(boost::uint32_t); v2 = *(reinterpret_cast(t2)); t2 += sizeof(boost::int32_t); nElem2--; v2Sum += v2; } double denom = v1Sum + v2Sum; if (fabs(denom) < 1e-6) { res = 0.0; } else { res = 2. * numer / denom; } // std::cerr<<" "< *molMap = new std::map(); sqlite3_create_function(db, "rdk_molNumAtoms", 1, SQLITE_ANY, 0, numAtomsFunc, 0, 0); sqlite3_create_function(db, "rdk_molAMW", 1, SQLITE_ANY, 0, molWtFunc, 0, 0); sqlite3_create_function(db, "rdk_smilesToBlob", 1, SQLITE_ANY, 0, smilesToBlob, 0, 0); sqlite3_create_function(db, "rdk_molToRDKitFP", 1, SQLITE_ANY, 0, blobToRDKitFingerprint, 0, 0); sqlite3_create_function(db, "rdk_bvTanimotoSim", 2, SQLITE_ANY, 0, bvTanimotoSim, 0, 0); sqlite3_create_function(db, "rdk_ucvTanimotoSim", 2, SQLITE_ANY, 0, ucvTanimotoSim, 0, 0); sqlite3_create_function(db, "rdk_molToAtomPairFP", 1, SQLITE_ANY, 0, blobToAtomPairFingerprint, 0, 0); sqlite3_create_function(db, "rdk_sivDiceSim", 2, SQLITE_ANY, 0, sivDiceSim, 0, 0); sqlite3_create_function(db, "rdk_sivDiceSim2", 2, SQLITE_ANY, 0, sivDiceSim2, 0, 0); sqlite3_create_function(db, "rdk_molHasSubstruct", 2, SQLITE_ANY, static_cast(molMap), molHasSubstruct, 0, 0); sqlite3_create_function(db, "rdk_molSubstructCount", 2, SQLITE_ANY, static_cast(molMap), molSubstructCount, 0, 0); sqlite3_create_function(db, "rdk_molLogP", 1, SQLITE_ANY, 0, molLogPFunc, 0, 0); return 0; }