From 65a5f6030c5270a2d96d767cf7e4b59485b59818 Mon Sep 17 00:00:00 2001 From: Marvin Steijaert Date: Tue, 1 Oct 2019 16:05:31 +0200 Subject: [PATCH] Updated cartridge documentation (#2635) * Updated cartridge documentation Made examples compatible with latest chembl (25) and most recent conda versions of rdkit (2019.03.4.0, python 3.6.9) + rdkit-postgresql (2019.03.4.0) * updated more query results * updated more query results --- Docs/Book/Cartridge.md | 388 +++++++++++++++++++---------------------- 1 file changed, 180 insertions(+), 208 deletions(-) diff --git a/Docs/Book/Cartridge.md b/Docs/Book/Cartridge.md index 8dcf91ef6..fdac641c1 100644 --- a/Docs/Book/Cartridge.md +++ b/Docs/Book/Cartridge.md @@ -65,29 +65,29 @@ Start by downloading and installing the postgresql dump from the ChEMBL website Connect to the database, install the cartridge, and create the schema that we'll use: - chembl_23=# create extension if not exists rdkit; - chembl_23=# create schema rdk; + chembl_25=# create extension if not exists rdkit; + chembl_25=# create schema rdk; Create the molecules and build the substructure search index: - chembl_23=# select * into rdk.mols from (select molregno,mol_from_ctab(molfile::cstring) m from compound_structures) tmp where m is not null; - SELECT 1727081 - chembl_23=# create index molidx on rdk.mols using gist(m); + chembl_25=# select * into rdk.mols from (select molregno,mol_from_ctab(molfile::cstring) m from compound_structures) tmp where m is not null; + SELECT 1870451 + chembl_25=# create index molidx on rdk.mols using gist(m); CREATE INDEX - chembl_23=# alter table rdk.mols add primary key (molregno); + chembl_25=# alter table rdk.mols add primary key (molregno); ALTER TABLE Create some fingerprints and build the similarity search index: - chembl_23=# select molregno,torsionbv_fp(m) as torsionbv,morganbv_fp(m) as mfp2,featmorganbv_fp(m) as ffp2 into rdk.fps from rdk.mols; - SELECT 1727081 - chembl_23=# create index fps_ttbv_idx on rdk.fps using 
gist(torsionbv); + chembl_25=# select molregno,torsionbv_fp(m) as torsionbv,morganbv_fp(m) as mfp2,featmorganbv_fp(m) as ffp2 into rdk.fps from rdk.mols; + SELECT 1870451 + chembl_25=# create index fps_ttbv_idx on rdk.fps using gist(torsionbv); CREATE INDEX - chembl_23=# create index fps_mfp2_idx on rdk.fps using gist(mfp2); + chembl_25=# create index fps_mfp2_idx on rdk.fps using gist(mfp2); CREATE INDEX - chembl_23=# create index fps_ffp2_idx on rdk.fps using gist(ffp2); + chembl_25=# create index fps_ffp2_idx on rdk.fps using gist(ffp2); CREATE INDEX - chembl_23=# alter table rdk.fps add primary key (molregno); + chembl_25=# alter table rdk.fps add primary key (molregno); ALTER TABLE Here is a group of the commands used here (and below) in one block so that you can just paste it in at the psql prompt: @@ -103,7 +103,7 @@ Here is a group of the commands used here (and below) in one block so that you c create index fps_ffp2_idx on rdk.fps using gist(ffp2); alter table rdk.fps add primary key (molregno); create or replace function get_mfp2_neighbors(smiles text) - returns table(molregno integer, m mol, similarity double precision) as + returns table(molregno bigint, m mol, similarity double precision) as $$ select molregno,m,tanimoto_sml(morganbv_fp(mol_from_smiles($1::cstring)),mfp2) as similarity from rdk.fps join rdk.mols using (molregno) @@ -115,52 +115,52 @@ Here is a group of the commands used here (and below) in one block so that you c Example query molecules taken from the [eMolecules home page](http://www.emolecules.com/): - chembl_23=# select count(*) from rdk.mols where m@>'c1cccc2c1nncc2' ; + chembl_25=# select count(*) from rdk.mols where m@>'c1cccc2c1nncc2' ; count ------- - 447 + 461 (1 row) Time: 107.602 ms - chembl_23=# select count(*) from rdk.mols where m@>'c1ccnc2c1nccn2' ; + chembl_25=# select count(*) from rdk.mols where m@>'c1ccnc2c1nccn2' ; count ------- - 1013 + 1124 (1 row) Time: 216.222 ms - chembl_23=# select count(*) from rdk.mols where 
m@>'c1cncc2n1ccn2' ; + chembl_25=# select count(*) from rdk.mols where m@>'c1cncc2n1ccn2' ; count ------- - 1775 + 2233 (1 row) Time: 88.266 ms - chembl_23=# select count(*) from rdk.mols where m@>'Nc1ncnc(N)n1' ; + chembl_25=# select count(*) from rdk.mols where m@>'Nc1ncnc(N)n1' ; count ------- - 5842 + 7095 (1 row) Time: 327.855 ms - chembl_23=# select count(*) from rdk.mols where m@>'c1scnn1' ; + chembl_25=# select count(*) from rdk.mols where m@>'c1scnn1' ; count ------- - 15962 + 16526 (1 row) Time: 568.675 ms - chembl_23=# select count(*) from rdk.mols where m@>'c1cccc2c1ncs2' ; + chembl_25=# select count(*) from rdk.mols where m@>'c1cccc2c1ncs2' ; count ------- - 18986 + 20745 (1 row) Time: 998.104 ms - chembl_23=# select count(*) from rdk.mols where m@>'c1cccc2c1CNCCN2' ; + chembl_25=# select count(*) from rdk.mols where m@>'c1cccc2c1CNCCN2' ; count ------- - 1613 + 1788 (1 row) Time: 1922.273 ms @@ -171,45 +171,42 @@ Given we're searching through 1.7 million compounds these search times aren't in One easy way to speed things up, particularly for queries that return a large number of results, is to only retrieve a limited number of results: - chembl_23=# select * from rdk.mols where m@>'c1cccc2c1CNCCN2' limit 100; - molregno | m - - ----------+----------------------------------------------------------------------------------------------------------------------------------------------------------- - -------------------------------- - 908048 | O=C1CN(C(=O)c2ccc(Br)o2)C(c2ccc(F)cc2)c2cc(F)ccc2N1 - 931972 | Cl.c1ccc(CC2CNc3ccccc3CN2)cc1 - 904450 | CCOC(=O)[C@H]1[C@H]2COc3ccc(Cl)cc3[C@@H]2N2C(=O)c3ccc(Cl)cc3NC(=O)[C@@]12C - 226391 | C/C=C1/CC2C(OC)Nc3cc(OC)c(OC)cc3C(=O)N2C1 - 930820 | CN1CC(=O)N(CC(=O)Nc2ccc(N(C)C)cc2)c2ccccc2C1=O - 18576 | CO[C@H]1Nc2c(ccc(C)c2O)C(=O)N2C=C(/C=C/C(N)=O)C[C@@H]12 - 249934 | O=C(c1cccc2ccccc12)N1CCN(Cc2cncn2Cc2ccccc2)c2ccccc2C1 - ... 
- 91020 | CC(C)C[C@H]1C(=O)N2c3ccccc3[C@@](O)(C[C@@H]3NC(=O)c4ccccc4N4C(=O)c5ccccc5NC34)[C@H]2N1C(=O)C(CCCNC(=O)OCc1ccccc1)NC(=O)OC(C)(C)C - 91225 | CC(C)C[C@H]1C(=O)N2c3ccccc3[C@@](O)(C[C@@H]3NC(=O)c4ccccc4N4C(=O)c5ccccc5NC34)[C@H]2N1C(=O)CCC(=O)[O-].[Na+] - 348798 | O=C(O)CN1C(=O)C(c2ccc(Cl)cc2)N(C(C(=O)O)c2ccc(Cl)cc2)C(=O)c2cc(I)ccc21 - 348972 | C[C@H](c1ccc(Cl)cc1)N1C(=O)c2cc(I)ccc2N(CCCCC(=O)O)C(=O)[C@@H]1c1ccc(C(F)(F)F)cc1 - - ...skipping 23 lines + chembl_25=# select * from rdk.mols where m@>'c1cccc2c1CNCCN2' limit 100; + molregno | m + ----------+-------------------------------------------------------------------------------------------------------------- + 1671940 | Cc1cccc(C)c1N1C(=O)c2ccccc2NC(=O)C1C(=O)NCc1ccco1 + 1318078 | COCN1C(=O)[C@@H]2C[C@@H](O)CN2C(=O)c2ccccc21 + 1318783 | O/N=C1/Nc2ccccc2C(=S)N2CSCC12 + 1318127 | CC(=O)O[C@H]1C[C@H]2C(=S)Nc3ccccc3C(=S)N2C1 + 1308578 | O=C1Nc2cc([N+](=O)[O-])ccc2C(=O)N2CCC[C@@H]12 + 1417168 | O=C(NCC(F)(F)F)C1C(=O)Nc2ccccc2C(=O)N1Cc1ccccc1 + ... 
+ 793329 | Cc1ccc2c(c1)C(c1ccccc1)N(C(=O)c1ccc(OC(C)C)cc1)CC(=O)N2 + 921215 | O=C1CN(C(=O)c2cc([N+](=O)[O-])ccc2Cl)C(c2ccc(F)cc2)c2cc(F)ccc2N1 + 790949 | CCOC(=O)[C@H]1[C@H]2COc3ccc(Cl)cc3[C@@H]2N2C(=O)c3cc(C)ccc3NC(=O)[C@@]12C + 760998 | CC(=O)N1CC(=O)Nc2ccc(Cl)cc2C1c1ccc(F)cc1 + (100 rows) + Time: 97.357 ms #### SMARTS-based queries Oxadiazole or thiadiazole: - chembl_23=# select * from rdk.mols where m@>'c1[o,s]ncn1'::qmol limit 500; - molregno | m - ----------+-------------------------------------------------------------------------------------------------------------- - 1370170 | Fc1cccc(-c2nc(NCC3COc4ccccc4O3)no2)c1F - 1370417 | COc1cc(CN2CCC(Cc3nc(-c4ccc5c(c4)CCO5)no3)C2)ccc1F - 1370526 | Cl.Cn1cc(-c2noc(/C=C3/CCN4CCCC[C@@H]4C3)n2)c2ccccc21 - 1379267 | CCC(c1ccccc1)c1noc(CCN(CC)CC)n1 - 1404150 | OC[C@H]1O[C@H](c2nc(-c3nc(-c4cccs4)no3)cs2)C[C@@H]1O - 1217463 | CC(C)(C)c1ccc(-c2noc(CCC(=O)N3CCCCC3)n2)cc1 + chembl_25=# select * from rdk.mols where m@>'c1[o,s]ncn1'::qmol limit 500; + molregno | m + ----------+--------------------------------------------------------------------------------------------------- + 1882516 | COc1cccc(CN(C)Cc2nc(C(C)C)no2)c1 + 2194441 | Cc1nc([C@](C)(O)C#Cc2ccc3c(c2)-c2nc(C(N)=O)sc2[C@@H](F)CO3)no1 + 1881742 | CCOc1ccc(C(F)(F)F)cc1NC(=O)NCc1noc(C)n1 + 1949861 | FC(F)(F)c1ccc(-c2nc(-c3ccc4nc[nH]c4c3)no2)cc1 + 1949860 | FC(F)(F)c1cccc(-c2nc(-c3ccc4nc[nH]c4c3)no2)c1 + 2172627 | O=c1[nH]cc(-c2cc(Cl)ccc2Oc2cc(F)c(S(=O)(=O)Nc3ncns3)cc2F)n2cncc12 ... 
- 1517753 | CC(C)c1noc(N2CCC(CO[C@H]3CC[C@H](c4ccc(S(C)(=O)=O)cc4F)CC3)CC2)n1 - 1263024 | COc1cc(Nc2nc3c(s2)CCCC3c2ccccc2)ccc1-c1nc(C)no1 - 1264016 | O=C(O)CCc1nc2cc(-c3noc(-c4cc(C(F)(F)F)cc(C(F)(F)F)c4)n3)ccc2[nH]1 - 1847733 | Cc1cc(-c2noc([C@H]3CCCCN3C(=O)COc3ccccc3)n2)no1 + 1848026 | O=C1CCCN1c1cccc(-c2noc([C@H]3CCCCN3C(=O)COc3ccccc3)n2)c1 + 1848027 | O=C1CN(c2cccc(-c3noc([C@H]4CCCCN4C(=O)COc4ccccc4)n3)c2)C(=O)N1 + 1848036 | CN(C)C(=O)CCC(=O)Nc1cc(F)cc(-c2noc([C@H]3CCCCN3C(=O)COc3ccccc3)n2)c1 + 1852688 | CC(Sc1nc(N)cc(N)n1)c1nc(C(C)(C)C)no1 (500 rows) Time: 761.847 ms @@ -220,64 +217,39 @@ This is slower than the pure SMILES query, this is generally true of SMARTS-base Note that by default stereochemistry is not taken into account when doing substructure queries: - chembl_23=# select * from rdk.mols where m@>'NC(=O)[C@@H]1CCCN1C=O' limit 10; - molregno | - m - - ----------+----------------------------------------------------------------------------------------------------------------------------------------------------------- - ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- - --------------------- - 87611 | CNCC(=O)N[C@@H](CCCN=C(N)N)C(=O)N1C[C@H]2C[C@H]1C(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@H](C(=O)N[C@@H](Cc1c[nH]cn1)C(=O)N1CCC[C@H]1C(=O)N[C@@H](Cc1ccccc1)C(=O - )O)CCSS2 - 88372 | CNCCCC[C@H](NC(=O)[C@H](Cc1ccccc1)NC(=O)[C@@H](CCCCNC)NC(=O)[C@H](Cc1ccc(O)cc1)NC(=O)[C@H](CO)NC(=O)[C@@H](Cc1ccccc1)NC(=O)[C@@H](Cc1ccccc1)NC(=O)[C@@H](C - c1ccc2ccccc2c1)NC(C)=O)C(=O)N1CCC[C@@H]1C(=O)N[C@H](C)C(=O)O - 88322 | CC(=O)N[C@H](Cc1ccc2ccccc2c1)C(=O)N[C@H](Cc1ccccc1)C(=O)N[C@H](Cc1ccccc1)C(=O)N[C@@H](CO)C(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@H](CCCCNC(C)C)C(=O)N[C@@H](Cc1 - ccccc1)C(=O)N[C@@H](CCCCNC(C)C)C(=O)N1CCC[C@@H]1C(=O)N[C@H](C)C(=O)O - 88168 | 
CC(=O)N[C@H](Cc1ccc2ccccc2c1)C(=O)N[C@H](Cc1ccccc1)C(=O)N[C@H](Cc1ccccc1)C(=O)N[C@@H](CO)C(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@H](CCCN=C(N)N)C(=O)N[C@@H](Cc1 - ccccc1)C(=O)N[C@@H](CCCCNC1CCCC1)C(=O)N1CCC[C@@H]1C(=O)N[C@H](C)C(=O)O - 88150 | CC(=O)N[C@H](Cc1ccc2ccccc2c1)C(=O)N[C@H](Cc1ccccc1)C(=O)N[C@H](Cc1ccccc1)C(=O)N[C@@H](CO)C(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@H](CCCN=C(N)N)C(=O)N[C@@H](Cc1 - ccccc1)C(=O)N[C@@H](CCCCNCc1ccc(C)cc1)C(=O)N1CCC[C@@H]1C(=O)N[C@H](C)C(=O)O - 88373 | CC(=O)N[C@H](Cc1ccc2ccccc2c1)C(=O)N[C@H](Cc1ccccc1)C(=O)N[C@H](Cc1ccccc1)C(=O)N[C@@H](CO)C(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@H](CCCCNC1CCCCC1)C(=O)N[C@@H]( - Cc1ccccc1)C(=O)N[C@@H](CCCCNC1CCCCC1)C(=O)N1CCC[C@@H]1C(=O)N[C@H](C)C(=O)O - 93377 | CC(=O)N[C@@H](Cc1ccc([N+](=O)[O-])cc1)C(=O)N1CCC[C@H]1C(=O)N[C@@H](CCC/N=C(/N)NS(=O)(=O)c1c(C)c(C)c2c(c1C)CCC(C)(C)O2)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H]( - CCC/N=C(/N)NS(=O)(=O)c1c(C)c(C)c2c(c1C)CCC(C)(C)O2)C(=O)N[C@H](C(=O)NCC(=O)N[C@@H](COC(C)(C)C)C(=O)N[C@@H](CCCCNC(=O)c1ccccc1N)C(=O)NCC(=O)O)[C@@H](C)OC(C)(C)C - 94493 | CC(C)C[C@@H]1NC(=O)[C@H]([C@@H](C)O)NC(=O)[C@H](Cc2c[nH]c3ccccc23)NC(=O)[C@H](C(C)C)NC(=O)[C@H](NC(=O)[C@H](CCCCN)NC(=O)[C@@H]2CCCN2C(=O)[C@H](CCC(N)=O)NC - (=O)CNC(=O)CN)CSSC[C@@H](C(=O)N[C@@H](Cc2ccc(O)cc2)C(=O)N[C@@H](CO)C(=O)N[C@H](C(=O)NCC(=O)NCC(N)=O)[C@@H](C)O)NC(=O)[C@H](Cc2c[nH]cn2)NC(=O)[C@H](Cc2ccccc2)NC(=O)CNC - (=O)[C@@H]2CCCN2C1=O - - ...skipping 1 line - 89559 | CC1(C)SSC(C)(C)[C@@H](C(=O)N[C@@H](Cc2c[nH]cn2)C(=O)N2CCC[C@H]2C(=O)N[C@@H](Cc2ccccc2)C(=O)O)NC(=O)[C@H](Cc2ccc(O)cc2)NC(=O)[C@H]1NC(=O)[C@H](CCCN=C(N)N)N - C(=O)[C@@H](N)CC(=O)O + chembl_25=# select * from rdk.mols where m@>'NC(=O)[C@@H]1CCCN1C=O' limit 10; + molregno | m + ----------+--------------------------------------------------------------------------------------------------- + 2213985 | 
CC[C@H](C)[C@@H]1NC(=O)[C@@H]2CCCN2C(=O)[C@@H]2CCCN2C(=O)[C@H]([C@@H](C)CC)NC(=O)[C@H](CO)NC(=O)[C@H](C)NC(=O)[C@H]([C@H](C)O)NC(=O)[C@@H]2CSSC[C@H](NC1=O)C(=O)N[C@@H](Cc1cnc[nH]1)C(=O)N[C@H](Cc1ccccc1)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](Cc1c[nH]c3ccccc13)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N2 + 1956682 | NC(=O)[C@@H]1CCCN1C(=O)[C@H](Cc1nc(I)[nH]c1I)NC(=O)c1cnccn1 + 2212188 | CN1C(=O)[C@H](CCCNC(=N)N)NC(=O)[C@@H](Cc2ccc(O)cc2)NC(=O)[C@@H]2CCCN2C(=O)[C@H](Cc2ccc3ccccc3c2)NC(=O)[C@@H]1CC(=O)O + 2053463 | NCCCC[C@H](NC(=O)[C@H](Cc1ccc(OP(=O)(O)O)cc1)NC(=O)Cc1ccccc1)C(=O)N1CCC[C@H]1C(=O)N[C@@H](Cc1ccccc1)C(N)=O + 2060743 | CCCCCCCCCCCCCCCCNC(=O)CN(CC(=O)NC(C)(C)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N1CCC[C@H]1C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CC(N)=O)C(N)=O)C(=O)c1cccnc1 + 2060744 | CCCCCCCCCCCCCCCCN(CCCCCCCCCCCCCCCC)CCCCCC(=O)NC(C)(C)C(=O)NC(Cc1ccccc1)C(=O)NC(CC(C)C)C(=O)NC(Cc1ccccc1)C(=O)NC(CCCNC(=N)N)C(=O)N1CCCC1C(=O)NC(CCCNC(=N)N)C(=O)NC(CC(N)=O)C(N)=O + 2077784 | CC[C@H](C)[C@@H]1NC(=O)[C@H](Cc2ccccc2)NC(=O)[C@H](CC(C)C)NC(=O)[C@@H]2CCCN2C(=O)[C@H](Cc2ccccc2)NC(=O)[C@H](C(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CCSC)NC1=O + 2077779 | CC[C@H](C)[C@@H]1NC(=O)[C@H](Cc2ccccc2)NC(=O)[C@H](CC(C)C)NC(=O)[C@@H]2CCCN2C(=O)[C@H](Cc2ccccc2)NC(=O)[C@H](C(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC[S+](C)[O-])NC1=O + 2077782 | CC[C@H](C)[C@@H]1NC(=O)[C@H](Cc2c[nH]c3ccccc23)NC(=O)[C@H](Cc2ccccc2)NC(=O)[C@H](Cc2ccccc2)NC(=O)[C@@H]2CCCN2C(=O)[C@H](CCSC)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC[S+](C)[O-])NC1=O + 2077780 | CC(C)C[C@@H]1NC(=O)[C@H](CC[S+](C)[O-])NC(=O)[C@H](C(C)C)NC(=O)[C@H](Cc2c[nH]c3ccccc23)NC(=O)[C@H](Cc2ccccc2)NC(=O)[C@H](Cc2ccccc2)NC(=O)[C@@H]2CCCN2C(=O)[C@H](CC[S+](C)[O-])NC1=O (10 rows) - + This can be changed using the rdkit.do\_chiral\_sss configuration variable: - chembl_23=# set rdkit.do_chiral_sss=true; + chembl_25=# set rdkit.do_chiral_sss=true; SET Time: 0.241 ms - chembl_23=# select * from rdk.mols where 
m@>'NC(=O)[C@@H]1CCCN1C=O' limit 10; - molregno | - m - - ----------+-------------------------------------------------------------------------------------------------------------------------------------------------------------- - ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- - --------------- - 87611 | CNCC(=O)N[C@@H](CCCN=C(N)N)C(=O)N1C[C@H]2C[C@H]1C(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@H](C(=O)N[C@@H](Cc1c[nH]cn1)C(=O)N1CCC[C@H]1C(=O)N[C@@H](Cc1ccccc1)C(=O)O) - CCSS2 - 93377 | CC(=O)N[C@@H](Cc1ccc([N+](=O)[O-])cc1)C(=O)N1CCC[C@H]1C(=O)N[C@@H](CCC/N=C(/N)NS(=O)(=O)c1c(C)c(C)c2c(c1C)CCC(C)(C)O2)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC - /N=C(/N)NS(=O)(=O)c1c(C)c(C)c2c(c1C)CCC(C)(C)O2)C(=O)N[C@H](C(=O)NCC(=O)N[C@@H](COC(C)(C)C)C(=O)N[C@@H](CCCCNC(=O)c1ccccc1N)C(=O)NCC(=O)O)[C@@H](C)OC(C)(C)C - 94493 | CC(C)C[C@@H]1NC(=O)[C@H]([C@@H](C)O)NC(=O)[C@H](Cc2c[nH]c3ccccc23)NC(=O)[C@H](C(C)C)NC(=O)[C@H](NC(=O)[C@H](CCCCN)NC(=O)[C@@H]2CCCN2C(=O)[C@H](CCC(N)=O)NC(=O - )CNC(=O)CN)CSSC[C@@H](C(=O)N[C@@H](Cc2ccc(O)cc2)C(=O)N[C@@H](CO)C(=O)N[C@H](C(=O)NCC(=O)NCC(N)=O)[C@@H](C)O)NC(=O)[C@H](Cc2c[nH]cn2)NC(=O)[C@H](Cc2ccccc2)NC(=O)CNC(=O)[C - @@H]2CCCN2C1=O - 89558 | NC(N)=NCCC[C@H](NC(=O)[C@@H](N)CC(=O)O)C(=O)N[C@H]1CCSSC[C@@H](C(=O)N[C@@H](Cc2c[nH]cn2)C(=O)N2CCC[C@H]2C(=O)N[C@@H](Cc2ccccc2)C(=O)O)NC(=O)[C@H](Cc2ccc(O)cc - 2)NC1=O - 89559 | CC1(C)SSC(C)(C)[C@@H](C(=O)N[C@@H](Cc2c[nH]cn2)C(=O)N2CCC[C@H]2C(=O)N[C@@H](Cc2ccccc2)C(=O)O)NC(=O)[C@H](Cc2ccc(O)cc2)NC(=O)[C@H]1NC(=O)[C@H](CCCN=C(N)N)NC(= - O)[C@@H](N)CC(=O)O - 126618 | NC(=O)[C@@H]1CCCN1C(=O)[C@@H]1CCCN1C(=O)[C@@H](O)[C@H](N)Cc1ccccc1 - 152339 | O=C(O)CN[C@H](CC1CCCCC1)C(=O)N1CCC[C@H]1C(=O)NCCCc1c[nH]cn1 - 152504 | N[C@H](CC1CCCCC1)C(=O)N1[C@H](C(=O)NC/C=C/c2c[nH]cn2)C[C@@H]2CCCC[C@@H]21 - 152383 | N[C@H](CC1CCCCC1)C(=O)N1CCC[C@H]1C(=O)NCCCCc1c[nH]cn1 - 151837 | 
N[C@H](CC1CCCCC1)C(=O)N1CCC[C@H]1C(=O)NC/C=C/c1c[nH]cn1 + chembl_25=# select * from rdk.mols where m@>'NC(=O)[C@@H]1CCCN1C=O' limit 10; + molregno | m + ----------+--------------------------------------------------------------------------------------------------- + 2213985 | CC[C@H](C)[C@@H]1NC(=O)[C@@H]2CCCN2C(=O)[C@@H]2CCCN2C(=O)[C@H]([C@@H](C)CC)NC(=O)[C@H](CO)NC(=O)[C@H](C)NC(=O)[C@H]([C@H](C)O)NC(=O)[C@@H]2CSSC[C@H](NC1=O)C(=O)N[C@@H](Cc1cnc[nH]1)C(=O)N[C@H](Cc1ccccc1)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](Cc1c[nH]c3ccccc13)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N2 + 1956682 | NC(=O)[C@@H]1CCCN1C(=O)[C@H](Cc1nc(I)[nH]c1I)NC(=O)c1cnccn1 + 2212188 | CN1C(=O)[C@H](CCCNC(=N)N)NC(=O)[C@@H](Cc2ccc(O)cc2)NC(=O)[C@@H]2CCCN2C(=O)[C@H](Cc2ccc3ccccc3c2)NC(=O)[C@@H]1CC(=O)O + 2053463 | NCCCC[C@H](NC(=O)[C@H](Cc1ccc(OP(=O)(O)O)cc1)NC(=O)Cc1ccccc1)C(=O)N1CCC[C@H]1C(=O)N[C@@H](Cc1ccccc1)C(N)=O + 2060743 | CCCCCCCCCCCCCCCCNC(=O)CN(CC(=O)NC(C)(C)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N1CCC[C@H]1C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CC(N)=O)C(N)=O)C(=O)c1cccnc1 + 2077784 | CC[C@H](C)[C@@H]1NC(=O)[C@H](Cc2ccccc2)NC(=O)[C@H](CC(C)C)NC(=O)[C@@H]2CCCN2C(=O)[C@H](Cc2ccccc2)NC(=O)[C@H](C(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CCSC)NC1=O + 2077779 | CC[C@H](C)[C@@H]1NC(=O)[C@H](Cc2ccccc2)NC(=O)[C@H](CC(C)C)NC(=O)[C@@H]2CCCN2C(=O)[C@H](Cc2ccccc2)NC(=O)[C@H](C(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC[S+](C)[O-])NC1=O + 2077782 | CC[C@H](C)[C@@H]1NC(=O)[C@H](Cc2c[nH]c3ccccc23)NC(=O)[C@H](Cc2ccccc2)NC(=O)[C@H](Cc2ccccc2)NC(=O)[C@@H]2CCCN2C(=O)[C@H](CCSC)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC[S+](C)[O-])NC1=O + 2077780 | CC(C)C[C@@H]1NC(=O)[C@H](CC[S+](C)[O-])NC(=O)[C@H](C(C)C)NC(=O)[C@H](Cc2c[nH]c3ccccc23)NC(=O)[C@H](Cc2ccccc2)NC(=O)[C@H](Cc2ccccc2)NC(=O)[C@@H]2CCCN2C(=O)[C@H](CC[S+](C)[O-])NC1=O + 2211488 | 
CC[C@H](C)[C@H](N)C(=O)N[C@H](C(=O)N[C@@H](CC(C)C)C(=O)N[C@H](C(=O)N1CCC[C@H]1C(=O)N1CCC[C@H]1C(=O)N[C@H](CCC(=O)N[C@@H](CCC(=O)N[C@@H](CC(C)C)C(=O)O)Cc1ccccc1)Cc1ccccc1)C(C)C)[C@@H](C)CC (10 rows) Time: 6.181 ms @@ -289,19 +261,19 @@ having to construct complex SMARTS queries. The cartridge function `mol_adjust_q can be used to do just this. Here is an example of the default behavior, using a query for 2,6 di-substituted pyridines: - chembl_23=# select molregno,m from rdk.mols where m@>mol_adjust_query_properties('*c1cccc(NC(=O)*)n1') limit 10; - molregno | m - ----------+------------------------------------------------------------------------------------------- - 1993749 | Cn1c(Nc2c(Cl)ccc(CNC(=O)C(C)(C)C)c2Cl)nc2cc(C(=O)Nc3cccc(C(F)(F)F)n3)c(N3CCC(F)(F)C3)cc21 - 1988455 | Cc1cccc(C(=O)Nc2cccc(Oc3cccnc3)n2)c1 - 1870095 | COC(=O)CN(C(=O)C(C)c1c(F)cccc1F)c1cccc(C)n1 - 1870023 | CCC(C)C(=O)N(CC(=O)OC)c1cccc(C)n1 - 1873944 | Cc1ccc(C(=O)N(C)CC(=O)Nc2cccc(C)n2)cn1 - 1873968 | Cc1cccc(NC(=O)CN(C)C(=O)c2ccc(-n3cccc3)nc2)n1 - 1882693 | Cc1cccc(NC(=O)CCNCc2c(C)nn(C)c2N(C)C)n1 - 1882711 | COc1c(CNCCC(=O)Nc2cccc(C)n2)c(C)nn1C - 1868705 | CCOc1cccc(NC(=O)c2cnc(C)cn2)n1 - 1875177 | Cc1cccc(NC(=O)[C@@H]2CCCN2Cc2nc(C)c(C)o2)n1 + chembl_25=# select molregno,m from rdk.mols where m@>mol_adjust_query_properties('*c1cccc(NC(=O)*)n1') limit 10; + molregno | m + ----------+--------------------------------------------------------------------------------------------------- + 1609520 | Cc1cccc(NC(=O)c2cc(Br)ccc2C(=O)O)n1 + 1141456 | CCN(CC)CCCn1cc(NC(=O)Nc2cccc(-c3ccccc3)n2)c2ccccc21 + 1431198 | Cc1cccc(NC(=O)c2nc(C)sc2Nc2cccnc2)n1 + 734975 | Cc1cccc(NC(=O)CN(C)S(=O)(=O)c2ccc(Cl)cc2)n1 + 760426 | Cc1cccc(NC(=O)CCCn2cc([N+](=O)[O-])cn2)n1 + 782786 | Cc1cccc(NC(=O)CN2C(=O)NC(C)(c3ccc4ccccc4c3)C2=O)n1 + 1478990 | Cc1cccc(NC(=O)Cn2c(=O)sc3cc(C(=O)c4ccccc4)ccc32)n1 + 1478787 | Cc1cccc(NC(=O)Cn2c(=O)sc3cc(C(=O)c4ccccc4F)ccc32)n1 + 1955608 | C[C@H](N)C(=O)Nc1cccc(N)n1 + 773911 | 
Cc1cccc(NC(=O)c2c(-c3ccccc3)noc2C)n1 (10 rows) Time: 11.895 ms @@ -315,20 +287,20 @@ By default `mol_adjust_query_properties()` makes the following changes to the mo We can control the behavior by providing an additional JSON argument. Here's an example where we disable the additional degree queries: - chembl_23=# select molregno,m from rdk.mols where m@>mol_adjust_query_properties('*c1cccc(NC(=O)*)n1', - chembl_23(# '{"adjustDegree":false}') limit 10; - molregno | m - ----------+------------------------------------------------------------------------------------------- - 1993749 | Cn1c(Nc2c(Cl)ccc(CNC(=O)C(C)(C)C)c2Cl)nc2cc(C(=O)Nc3cccc(C(F)(F)F)n3)c(N3CCC(F)(F)C3)cc21 - 1957849 | COc1ccc2ncc(F)c(C[C@H](O)C3CCC(NCc4nc5c(cc4F)OCC(=O)N5)CO3)c2n1 - 1959611 | O=C1COc2ccc(CNC3CCN(CCn4c(=O)ccc5ncc(OCc6cccnn6)cc54)CC3)nc2N1 - 1988455 | Cc1cccc(C(=O)Nc2cccc(Oc3cccnc3)n2)c1 - 1870095 | COC(=O)CN(C(=O)C(C)c1c(F)cccc1F)c1cccc(C)n1 - 1870023 | CCC(C)C(=O)N(CC(=O)OC)c1cccc(C)n1 - 1873944 | Cc1ccc(C(=O)N(C)CC(=O)Nc2cccc(C)n2)cn1 - 1873968 | Cc1cccc(NC(=O)CN(C)C(=O)c2ccc(-n3cccc3)nc2)n1 - 1882693 | Cc1cccc(NC(=O)CCNCc2c(C)nn(C)c2N(C)C)n1 - 1882711 | COc1c(CNCCC(=O)Nc2cccc(C)n2)c(C)nn1C + chembl_25=# select molregno,m from rdk.mols where m@>mol_adjust_query_properties('*c1cccc(NC(=O)*)n1', + chembl_25(# '{"adjustDegree":false}') limit 10; + molregno | m + ----------+--------------------------------------------------------------------------------------------------- + 2146308 | CCn1ncc2cc3nc(c21)NCCOC[C@H](c1ccccc1)NC(=O)N3 + 2137309 | CCn1ncc2cc3nc(c21)CCCO[C@@H](O)[C@H](c1ccccc1)NC(=O)N3 + 2102593 | CCn1ncc2cc3nc(c21)CCCO[C@@H]([C@@H](C)O)[C@@H](c1ccccc1)NC(=O)N3 + 2171613 | CCn1ncc2cc3nc(c21)CCCO[C@@H]([C@H](C)O)[C@@H](c1ccccc1)NC(=O)N3 + 2111904 | CCn1ncc2cc3nc(c21)C[C@H](O)COC[C@H](c1cccc(Cl)c1)NC(=O)N3 + 2173410 | CCn1ncc2cc3nc(c21)CCCOC[C@H](c1ccccc1)NC(=O)N3 + 2189450 | Cn1ncc2cc3nc(c21)CCCOC[C@H](c1ccccc1)NC(=O)N3 + 2195752 | 
CCn1ncc2cc3nc(c21)C[C@H](O)COC[C@H](c1ccccc1)NC(=O)N3 + 1609520 | Cc1cccc(NC(=O)c2cc(Br)ccc2C(=O)O)n1 + 1141456 | CCN(CC)CCCn1cc(NC(=O)Nc2cccc(-c3ccccc3)n2)c2ccccc21 (10 rows) Time: 10.780 ms @@ -336,20 +308,20 @@ where we disable the additional degree queries: or where we don't add the additional degree queries to ring atoms or dummies (they are only added to chain atoms): - chembl_23=# select molregno,m from rdk.mols where m@>mol_adjust_query_properties('*c1cccc(NC(=O)*)n1', - chembl_23(# '{"adjustDegree":true,"adjustDegreeFlags":"IGNORERINGS|IGNOREDUMMIES"}') limit 10; - molregno | m - ----------+------------------------------------------------------------------------------------------- - 1993749 | Cn1c(Nc2c(Cl)ccc(CNC(=O)C(C)(C)C)c2Cl)nc2cc(C(=O)Nc3cccc(C(F)(F)F)n3)c(N3CCC(F)(F)C3)cc21 - 1957849 | COc1ccc2ncc(F)c(C[C@H](O)C3CCC(NCc4nc5c(cc4F)OCC(=O)N5)CO3)c2n1 - 1959611 | O=C1COc2ccc(CNC3CCN(CCn4c(=O)ccc5ncc(OCc6cccnn6)cc54)CC3)nc2N1 - 1988455 | Cc1cccc(C(=O)Nc2cccc(Oc3cccnc3)n2)c1 - 1873944 | Cc1ccc(C(=O)N(C)CC(=O)Nc2cccc(C)n2)cn1 - 1873968 | Cc1cccc(NC(=O)CN(C)C(=O)c2ccc(-n3cccc3)nc2)n1 - 1882693 | Cc1cccc(NC(=O)CCNCc2c(C)nn(C)c2N(C)C)n1 - 1882711 | COc1c(CNCCC(=O)Nc2cccc(C)n2)c(C)nn1C - 1884388 | Cc1noc(COCC(=O)Nc2ccc(Br)c(C)n2)n1 - 1868705 | CCOc1cccc(NC(=O)c2cnc(C)cn2)n1 + chembl_25=# select molregno,m from rdk.mols where m@>mol_adjust_query_properties('*c1cccc(NC(=O)*)n1', + chembl_25(# '{"adjustDegree":true,"adjustDegreeFlags":"IGNORERINGS|IGNOREDUMMIES"}') limit 10; + molregno | m + ----------+--------------------------------------------------------------------------------------------------- + 2146308 | CCn1ncc2cc3nc(c21)NCCOC[C@H](c1ccccc1)NC(=O)N3 + 2137309 | CCn1ncc2cc3nc(c21)CCCO[C@@H](O)[C@H](c1ccccc1)NC(=O)N3 + 2102593 | CCn1ncc2cc3nc(c21)CCCO[C@@H]([C@@H](C)O)[C@@H](c1ccccc1)NC(=O)N3 + 2171613 | CCn1ncc2cc3nc(c21)CCCO[C@@H]([C@H](C)O)[C@@H](c1ccccc1)NC(=O)N3 + 2111904 | CCn1ncc2cc3nc(c21)C[C@H](O)COC[C@H](c1cccc(Cl)c1)NC(=O)N3 + 2173410 | 
CCn1ncc2cc3nc(c21)CCCOC[C@H](c1ccccc1)NC(=O)N3 + 2189450 | Cn1ncc2cc3nc(c21)CCCOC[C@H](c1ccccc1)NC(=O)N3 + 2195752 | CCn1ncc2cc3nc(c21)C[C@H](O)COC[C@H](c1ccccc1)NC(=O)N3 + 1609520 | Cc1cccc(NC(=O)c2cc(Br)ccc2C(=O)O)n1 + 1141456 | CCN(CC)CCCn1cc(NC(=O)Nc2cccc(-c3ccccc3)n2)c2ccccc21 (10 rows) Time: 12.827 ms @@ -381,7 +353,7 @@ are constructed by combining operations from the list below with the `|` charact Basic similarity searching: - chembl_23=# select count(*) from rdk.fps where mfp2%morganbv_fp('Cc1ccc2nc(-c3ccc(NC(C4N(C(c5cccs5)=O)CCC4)=O)cc3)sc2c1'); + chembl_25=# select count(*) from rdk.fps where mfp2%morganbv_fp('Cc1ccc2nc(-c3ccc(NC(C4N(C(c5cccs5)=O)CCC4)=O)cc3)sc2c1'); count ------- 67 @@ -391,8 +363,8 @@ Basic similarity searching: Usually we'd like to find a sorted list of neighbors along with the accompanying SMILES. This SQL function makes that pattern easy: - chembl_23=# create or replace function get_mfp2_neighbors(smiles text) - returns table(molregno integer, m mol, similarity double precision) as + chembl_25=# create or replace function get_mfp2_neighbors(smiles text) + returns table(molregno bigint, m mol, similarity double precision) as $$ select molregno,m,tanimoto_sml(morganbv_fp(mol_from_smiles($1::cstring)),mfp2) as similarity from rdk.fps join rdk.mols using (molregno) @@ -401,35 +373,35 @@ Usually we'd like to find a sorted list of neighbors along with the accompanyi $$ language sql stable ; CREATE FUNCTION Time: 0.856 ms - chembl_23=# select * from get_mfp2_neighbors('Cc1ccc2nc(-c3ccc(NC(C4N(C(c5cccs5)=O)CCC4)=O)cc3)sc2c1') limit 10; - molregno | m | similarity - ----------+------------------------------------------------------------+------------------- - 471319 | Cc1ccc2nc(-c3ccc(NC(=O)C4CCN(S(=O)(=O)c5cccs5)C4)cc3)sc2c1 | 0.638888888888889 - 1032469 | O=C(Nc1nc2ccc(Cl)cc2s1)[C@@H]1CCCN1C(=O)c1cccs1 | 0.623188405797101 - 751668 | COc1ccc2nc(NC(=O)[C@@H]3CCCN3C(=O)c3cccs3)sc2c1 | 0.619718309859155 - 471318 | 
Cc1ccc2nc(-c3ccc(NC(=O)C4CN(S(=O)(=O)c5cccs5)C4)cc3)sc2c1 | 0.611111111111111 - 740754 | Cc1ccc(NC(=O)C2CCCN2C(=O)c2cccs2)cc1C | 0.606060606060606 - 732905 | O=C(Nc1ccc(S(=O)(=O)N2CCCC2)cc1)C1CCCN1C(=O)c1cccs1 | 0.602941176470588 - 1087495 | Cc1ccc(NC(=O)C2CCCN2C(=O)c2cccs2)c(C)c1 | 0.597014925373134 - 471462 | CCS(=O)(=O)N1CCC(C(=O)Nc2ccc(-c3nc4ccc(C)cc4s3)cc2)CC1 | 0.585714285714286 - 810850 | Cc1cc(C)n(-c2ccc(NC(=O)C3CCCCN3C(=O)c3cccs3)cc2)n1 | 0.583333333333333 - 1224407 | O=C(Nc1cccc(S(=O)(=O)N2CCCC2)c1)C1CCCN1C(=O)c1cccs1 | 0.579710144927536 + chembl_25=# select * from get_mfp2_neighbors('Cc1ccc2nc(-c3ccc(NC(C4N(C(c5cccs5)=O)CCC4)=O)cc3)sc2c1') limit 10; + molregno | m | similarity + ----------+------------------------------------------------------------------+------------------- + 751668 | COc1ccc2nc(NC(=O)[C@@H]3CCCN3C(=O)c3cccs3)sc2c1 | 0.619718309859155 + 740754 | Cc1ccc(NC(=O)C2CCCN2C(=O)c2cccs2)cc1C | 0.606060606060606 + 732905 | O=C(Nc1ccc(S(=O)(=O)N2CCCC2)cc1)C1CCCN1C(=O)c1cccs1 | 0.602941176470588 + 810850 | Cc1cc(C)n(-c2ccc(NC(=O)C3CCCCN3C(=O)c3cccs3)cc2)n1 | 0.583333333333333 + 1224407 | O=C(Nc1cccc(S(=O)(=O)N2CCCC2)c1)C1CCCN1C(=O)c1cccs1 | 0.579710144927536 + 779258 | CC1CCN(S(=O)(=O)c2ccc(NC(=O)[C@@H]3CCCN3C(=O)c3cccs3)cc2)CC1 | 0.569444444444444 + 472441 | Cc1ccc2nc(-c3ccc(NC(=O)C4CCN(S(=O)(=O)C(C)C)CC4)cc3)sc2c1 | 0.569444444444444 + 745651 | Cc1ccc(NC(=O)[C@@H]2CCCN2C(=O)c2cccs2)cc1S(=O)(=O)N1CCCCC1 | 0.567567567567568 + 472510 | Cc1ccc2nc(-c3ccc(NC(=O)C4CCN(S(=O)(=O)c5cccc(Cl)c5)CC4)cc3)sc2c1 | 0.565789473684211 + 1233426 | Cc1cccc2sc(NC(=O)[C@@H]3CCCN3C(=O)c3cccs3)nc12 | 0.563380281690141 (10 rows) Time: 28.909 ms - chembl_23=# select * from get_mfp2_neighbors('Cc1ccc2nc(N(C)CC(=O)O)sc2c1') limit 10; - molregno | m | similarity - ----------+-------------------------------------------------------+------------------- - 1044892 | Cc1ccc2nc(N(CCN(C)C)C(=O)c3cc(Cl)sc3Cl)sc2c1 | 0.518518518518518 - 1040496 | 
Cc1ccc2nc(N(CCCN(C)C)C(=O)CCc3ccccc3)sc2c1 | 0.517857142857143 - 1049393 | Cc1ccc2nc(N(CCCN(C)C)C(=O)CS(=O)(=O)c3ccccc3)sc2c1 | 0.517857142857143 - 441378 | Cc1ccc2nc(NC(=O)CCC(=O)O)sc2c1 | 0.510204081632653 - 1047691 | Cc1ccc(S(=O)(=O)CC(=O)N(CCCN(C)C)c2nc3ccc(C)cc3s2)cc1 | 0.509090909090909 - 911501 | Cc1ccc2nc(N(CCN(C)C)C(=O)c3cc(Cl)sc3Cl)sc2c1.Cl | 0.509090909090909 - 1042958 | Cc1ccc2nc(N(CCN(C)C)C(=O)c3ccc4ccccc4c3)sc2c1 | 0.509090909090909 - 775269 | Cc1ccc2nc(N(CCCN(C)C)C(=O)CCc3ccccc3)sc2c1.Cl | 0.508771929824561 - 1045663 | Cc1ccc2nc(N(CCCN(C)C)C(=O)COc3ccc(Cl)cc3)sc2c1 | 0.5 - 1015485 | Cc1ccc2nc(N(Cc3cccnc3)C(=O)Cc3ccccc3)sc2c1 | 0.5 + chembl_25=# select * from get_mfp2_neighbors('Cc1ccc2nc(N(C)CC(=O)O)sc2c1') limit 10; + molregno | m | similarity + ----------+----------------------------------------------------------+------------------- + 2138088 | CN(CC(=O)O)c1nc2ccc([N+](=O)[O-])cc2s1 | 0.673913043478261 + 1040255 | CC(=O)N(CCCN(C)C)c1nc2ccc(C)cc2s1 | 0.571428571428571 + 773946 | CC(=O)N(CCCN(C)C)c1nc2ccc(C)cc2s1.Cl | 0.56 + 1044892 | Cc1ccc2nc(N(CCN(C)C)C(=O)c3cc(Cl)sc3Cl)sc2c1 | 0.518518518518518 + 441378 | Cc1ccc2nc(NC(=O)CCC(=O)O)sc2c1 | 0.510204081632653 + 1047691 | Cc1ccc(S(=O)(=O)CC(=O)N(CCCN(C)C)c2nc3ccc(C)cc3s2)cc1 | 0.509090909090909 + 1042958 | Cc1ccc2nc(N(CCN(C)C)C(=O)c3ccc4ccccc4c3)sc2c1 | 0.509090909090909 + 1015485 | Cc1ccc2nc(N(Cc3cccnc3)C(=O)Cc3ccccc3)sc2c1 | 0.5 + 994843 | Cc1ccc(S(=O)(=O)CC(=O)N(CCCN(C)C)c2nc3ccc(C)cc3s2)cc1.Cl | 0.5 + 841938 | Cc1ccc2nc(N(CCN(C)C)C(=O)c3ccc4ccccc4c3)sc2c1.Cl | 0.5 (10 rows) Time: 41.623 ms @@ -438,40 +410,40 @@ Usually we'd like to find a sorted listed of neighbors along with the accompanyi By default, the minimum similarity returned with a similarity search is 0.5. 
This can be adjusted with the rdkit.tanimoto\_threshold (and rdkit.dice\_threshold) configuration variables: - chembl_23=# select count(*) from get_mfp2_neighbors('Cc1ccc2nc(N(C)CC(=O)O)sc2c1'); + chembl_25=# select count(*) from get_mfp2_neighbors('Cc1ccc2nc(N(C)CC(=O)O)sc2c1'); count ------- - 20 + 21 (1 row) Time: 181.438 ms - chembl_23=# set rdkit.tanimoto_threshold=0.7; + chembl_25=# set rdkit.tanimoto_threshold=0.7; SET Time: 0.047 ms - chembl_23=# select count(*) from get_mfp2_neighbors('Cc1ccc2nc(N(C)CC(=O)O)sc2c1'); + chembl_25=# select count(*) from get_mfp2_neighbors('Cc1ccc2nc(N(C)CC(=O)O)sc2c1'); count ------- 0 (1 row) Time: 161.228 ms - chembl_23=# set rdkit.tanimoto_threshold=0.6; + chembl_25=# set rdkit.tanimoto_threshold=0.6; SET Time: 0.045 ms - chembl_23=# select count(*) from get_mfp2_neighbors('Cc1ccc2nc(N(C)CC(=O)O)sc2c1'); + chembl_25=# select count(*) from get_mfp2_neighbors('Cc1ccc2nc(N(C)CC(=O)O)sc2c1'); count ------- - 1 + 2 (1 row) Time: 184.275 ms - chembl_23=# set rdkit.tanimoto_threshold=0.5; + chembl_25=# set rdkit.tanimoto_threshold=0.5; SET Time: 0.055 ms - chembl_23=# select count(*) from get_mfp2_neighbors('Cc1ccc2nc(N(C)CC(=O)O)sc2c1'); + chembl_25=# select count(*) from get_mfp2_neighbors('Cc1ccc2nc(N(C)CC(=O)O)sc2c1'); count ------- - 20 + 21 (1 row) Time: 181.100 ms @@ -480,14 +452,14 @@ By default, the minimum similarity returned with a similarity search is 0.5. 
Thi The most straightforward use of the MCS code is to find the maximum common substructure of a group of molecules: - chembl_23=# select fmcs(m::text) from rdk.mols join compound_records using (molregno) where doc_id=4; + chembl_25=# select fmcs(m::text) from rdk.mols join compound_records using (molregno) where doc_id=4; fmcs ------------------------------------------------------------------------ [#6](-[#6]-[#7]-[#6]-[#6](-,:[#6])-,:[#6])-,:[#6]-,:[#6]-,:[#6]-,:[#6] (1 row) Time: 31.041 ms - chembl_23=# select fmcs(m::text) from rdk.mols join compound_records using (molregno) where doc_id=5; + chembl_25=# select fmcs(m::text) from rdk.mols join compound_records using (molregno) where doc_id=5; fmcs ------------------------------------------------------------------------------------------------------------------------------------------ [#6]-[#6](=[#8])-[#7]-[#6](-[#6](=[#8])-[#7]1-[#6]-[#6]-[#6]-[#6]-1-[#6](=[#8])-[#7]-[#6](-[#6](=[#8])-[#8])-[#6]-[#6])-[#6](-[#6])-[#6] @@ -497,7 +469,7 @@ The most straightforward use of the MCS code is to find the maximum common subst The same thing can be done with a SMILES column: - chembl_23=# select fmcs(canonical_smiles) from compound_structures join compound_records using (molregno) where doc_id=4; + chembl_25=# select fmcs(canonical_smiles) from compound_structures join compound_records using (molregno) where doc_id=4; fmcs ------------------------------------------------------------------------ [#6](-[#7]-[#6]-[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6])-[#6](-,:[#6])-,:[#6] @@ -507,9 +479,9 @@ The same thing can be done with a SMILES column: It's also possible to adjust some of the parameters to the FMCS algorithm, though this is somewhat more painful as of this writing (the 2017\_03 release cycle). 
Here are a couple of examples: - chembl_23=# select fmcs_smiles(str,'{"Threshold":0.8}') from - chembl_23-# (select string_agg(m::text,' ') as str from rdk.mols - chembl_23(# join compound_records using (molregno) where doc_id=4) as str ; + chembl_25=# select fmcs_smiles(str,'{"Threshold":0.8}') from + chembl_25-# (select string_agg(m::text,' ') as str from rdk.mols + chembl_25(# join compound_records using (molregno) where doc_id=4) as str ; fmcs_smiles ------------------------------------------------------------------------------------------------------------------------------------------------------------------ @@ -517,10 +489,10 @@ It's also possible to adjust some of the parameters to the FMCS algorithm, thoug (1 row) Time: 9673.949 ms - chembl_23=# - chembl_23=# select fmcs_smiles(str,'{"AtomCompare":"Any"}') from - chembl_23-# (select string_agg(m::text,' ') as str from rdk.mols - chembl_23(# join compound_records using (molregno) where doc_id=4) as str ; + chembl_25=# + chembl_25=# select fmcs_smiles(str,'{"AtomCompare":"Any"}') from + chembl_25-# (select string_agg(m::text,' ') as str from rdk.mols + chembl_25(# join compound_records using (molregno) where doc_id=4) as str ; fmcs_smiles ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ [#6]-,:[#6,#7]-[#8,#6]-[#6,#7](-[#6,#8]-[#7,#6]-,:[#6,#7]-,:[#6,#7]-,:[#7,#6]-,:[#6])-[#6,#7]-[#6]-[#6](-[#8,#6]-[#6])-[#6,#7]-[#7,#6]-[#6]-,:[#6,#8]-,:[#7,#6]-,:[#6] @@ -530,9 +502,9 @@ It's also possible to adjust some of the parameters to the FMCS algorithm, thoug *Note* The combination of `"AtomCompare":"Any"` and a value of `"Threshold"` that is less than 1.0 does a quite generic search and can results in very long search times. 
Using `"Timeout"` with this combination is recommended: - chembl_23=# select fmcs_smiles(str,'{"AtomCompare":"Any","CompleteRingsOnly":true,"Threshold":0.8,"Timeout":60}') from - chembl_23-# (select string_agg(m::text,' ') as str from rdk.mols - chembl_23(# join compound_records using (molregno) where doc_id=3) as str ; + chembl_25=# select fmcs_smiles(str,'{"AtomCompare":"Any","CompleteRingsOnly":true,"Threshold":0.8,"Timeout":60}') from + chembl_25-# (select string_agg(m::text,' ') as str from rdk.mols + chembl_25(# join compound_records using (molregno) where doc_id=3) as str ; WARNING: findMCS timed out, result is not maximal fmcs_smiles @@ -724,7 +696,7 @@ The recommended adapter for connecting to postgresql is psycopg2 (<https://pypi. >>> import psycopg2 - >>> conn = psycopg2.connect(database='chembl_16') + >>> conn = psycopg2.connect(database='chembl_25') >>> curs = conn.cursor() >>> curs.execute('select * from rdk.mols where m@>%s',('c1cccc2c1nncc2',)) >>> curs.fetchone() @@ -735,12 +707,12 @@ That returns a SMILES for each molecule. If you plan to do more work with the mo >>> curs.execute('select molregno,mol_send(m) from rdk.mols where m@>%s',('c1cccc2c1nncc2',)) >>> row = curs.fetchone() >>> row - (9830, ) + (9830, ) These pickles can then be converted into molecules: >>> from rdkit import Chem - >>> m = Chem.Mol(str(row[1])) + >>> m = Chem.Mol(row[1].tobytes()) >>> Chem.MolToSmiles(m,True) 'CC(C)Sc1ccc(CC2CCN(C3CCN(C(=O)c4cnnc5ccccc54)CC3)CC2)cc1'