Updated cartridge documentation (#2635)

* Updated cartridge documentation

Made examples compatible with latest chembl (25) and most recent conda versions of rdkit (2019.03.4.0, python 3.6.9) + rdkit-postgresql (2019.03.4.0)

* updated more query results

* updated more query results
This commit is contained in:
Marvin Steijaert
2019-10-01 16:05:31 +02:00
committed by Greg Landrum
parent 5dfd67a22a
commit 65a5f6030c

View File

@@ -65,29 +65,29 @@ Start by downloading and installing the postgresql dump from the ChEMBL website
Connect to the database, install the cartridge, and create the schema that we'll use:
chembl_23=# create extension if not exists rdkit;
chembl_23=# create schema rdk;
chembl_25=# create extension if not exists rdkit;
chembl_25=# create schema rdk;
Create the molecules and build the substructure search index:
chembl_23=# select * into rdk.mols from (select molregno,mol_from_ctab(molfile::cstring) m from compound_structures) tmp where m is not null;
SELECT 1727081
chembl_23=# create index molidx on rdk.mols using gist(m);
chembl_25=# select * into rdk.mols from (select molregno,mol_from_ctab(molfile::cstring) m from compound_structures) tmp where m is not null;
SELECT 1870451
chembl_25=# create index molidx on rdk.mols using gist(m);
CREATE INDEX
chembl_23=# alter table rdk.mols add primary key (molregno);
chembl_25=# alter table rdk.mols add primary key (molregno);
ALTER TABLE
Create some fingerprints and build the similarity search index:
chembl_23=# select molregno,torsionbv_fp(m) as torsionbv,morganbv_fp(m) as mfp2,featmorganbv_fp(m) as ffp2 into rdk.fps from rdk.mols;
SELECT 1727081
chembl_23=# create index fps_ttbv_idx on rdk.fps using gist(torsionbv);
chembl_25=# select molregno,torsionbv_fp(m) as torsionbv,morganbv_fp(m) as mfp2,featmorganbv_fp(m) as ffp2 into rdk.fps from rdk.mols;
SELECT 1870451
chembl_25=# create index fps_ttbv_idx on rdk.fps using gist(torsionbv);
CREATE INDEX
chembl_23=# create index fps_mfp2_idx on rdk.fps using gist(mfp2);
chembl_25=# create index fps_mfp2_idx on rdk.fps using gist(mfp2);
CREATE INDEX
chembl_23=# create index fps_ffp2_idx on rdk.fps using gist(ffp2);
chembl_25=# create index fps_ffp2_idx on rdk.fps using gist(ffp2);
CREATE INDEX
chembl_23=# alter table rdk.fps add primary key (molregno);
chembl_25=# alter table rdk.fps add primary key (molregno);
ALTER TABLE
Here is a group of the commands used here (and below) in one block so that you can just paste it in at the psql prompt:
@@ -103,7 +103,7 @@ Here is a group of the commands used here (and below) in one block so that you c
create index fps_ffp2_idx on rdk.fps using gist(ffp2);
alter table rdk.fps add primary key (molregno);
create or replace function get_mfp2_neighbors(smiles text)
returns table(molregno integer, m mol, similarity double precision) as
returns table(molregno bigint, m mol, similarity double precision) as
$$
select molregno,m,tanimoto_sml(morganbv_fp(mol_from_smiles($1::cstring)),mfp2) as similarity
from rdk.fps join rdk.mols using (molregno)
@@ -115,52 +115,52 @@ Here is a group of the commands used here (and below) in one block so that you c
Example query molecules taken from the [eMolecules home page](http://www.emolecules.com/):
chembl_23=# select count(*) from rdk.mols where m@>'c1cccc2c1nncc2' ;
chembl_25=# select count(*) from rdk.mols where m@>'c1cccc2c1nncc2' ;
count
-------
447
461
(1 row)
Time: 107.602 ms
chembl_23=# select count(*) from rdk.mols where m@>'c1ccnc2c1nccn2' ;
chembl_25=# select count(*) from rdk.mols where m@>'c1ccnc2c1nccn2' ;
count
-------
1013
1124
(1 row)
Time: 216.222 ms
chembl_23=# select count(*) from rdk.mols where m@>'c1cncc2n1ccn2' ;
chembl_25=# select count(*) from rdk.mols where m@>'c1cncc2n1ccn2' ;
count
-------
1775
2233
(1 row)
Time: 88.266 ms
chembl_23=# select count(*) from rdk.mols where m@>'Nc1ncnc(N)n1' ;
chembl_25=# select count(*) from rdk.mols where m@>'Nc1ncnc(N)n1' ;
count
-------
5842
7095
(1 row)
Time: 327.855 ms
chembl_23=# select count(*) from rdk.mols where m@>'c1scnn1' ;
chembl_25=# select count(*) from rdk.mols where m@>'c1scnn1' ;
count
-------
15962
16526
(1 row)
Time: 568.675 ms
chembl_23=# select count(*) from rdk.mols where m@>'c1cccc2c1ncs2' ;
chembl_25=# select count(*) from rdk.mols where m@>'c1cccc2c1ncs2' ;
count
-------
18986
20745
(1 row)
Time: 998.104 ms
chembl_23=# select count(*) from rdk.mols where m@>'c1cccc2c1CNCCN2' ;
chembl_25=# select count(*) from rdk.mols where m@>'c1cccc2c1CNCCN2' ;
count
-------
1613
1788
(1 row)
Time: 1922.273 ms
@@ -171,45 +171,42 @@ Given we're searching through 1.7 million compounds these search times aren't in
One easy way to speed things up, particularly for queries that return a large number of results, is to only retrieve a limited number of results:
chembl_23=# select * from rdk.mols where m@>'c1cccc2c1CNCCN2' limit 100;
molregno | m
----------+-----------------------------------------------------------------------------------------------------------------------------------------------------------
--------------------------------
908048 | O=C1CN(C(=O)c2ccc(Br)o2)C(c2ccc(F)cc2)c2cc(F)ccc2N1
931972 | Cl.c1ccc(CC2CNc3ccccc3CN2)cc1
904450 | CCOC(=O)[C@H]1[C@H]2COc3ccc(Cl)cc3[C@@H]2N2C(=O)c3ccc(Cl)cc3NC(=O)[C@@]12C
226391 | C/C=C1/CC2C(OC)Nc3cc(OC)c(OC)cc3C(=O)N2C1
930820 | CN1CC(=O)N(CC(=O)Nc2ccc(N(C)C)cc2)c2ccccc2C1=O
18576 | CO[C@H]1Nc2c(ccc(C)c2O)C(=O)N2C=C(/C=C/C(N)=O)C[C@@H]12
249934 | O=C(c1cccc2ccccc12)N1CCN(Cc2cncn2Cc2ccccc2)c2ccccc2C1
...
91020 | CC(C)C[C@H]1C(=O)N2c3ccccc3[C@@](O)(C[C@@H]3NC(=O)c4ccccc4N4C(=O)c5ccccc5NC34)[C@H]2N1C(=O)C(CCCNC(=O)OCc1ccccc1)NC(=O)OC(C)(C)C
91225 | CC(C)C[C@H]1C(=O)N2c3ccccc3[C@@](O)(C[C@@H]3NC(=O)c4ccccc4N4C(=O)c5ccccc5NC34)[C@H]2N1C(=O)CCC(=O)[O-].[Na+]
348798 | O=C(O)CN1C(=O)C(c2ccc(Cl)cc2)N(C(C(=O)O)c2ccc(Cl)cc2)C(=O)c2cc(I)ccc21
348972 | C[C@H](c1ccc(Cl)cc1)N1C(=O)c2cc(I)ccc2N(CCCCC(=O)O)C(=O)[C@@H]1c1ccc(C(F)(F)F)cc1
...skipping 23 lines
chembl_25=# select * from rdk.mols where m@>'c1cccc2c1CNCCN2' limit 100;
molregno | m
----------+--------------------------------------------------------------------------------------------------------------
1671940 | Cc1cccc(C)c1N1C(=O)c2ccccc2NC(=O)C1C(=O)NCc1ccco1
1318078 | COCN1C(=O)[C@@H]2C[C@@H](O)CN2C(=O)c2ccccc21
1318783 | O/N=C1/Nc2ccccc2C(=S)N2CSCC12
1318127 | CC(=O)O[C@H]1C[C@H]2C(=S)Nc3ccccc3C(=S)N2C1
1308578 | O=C1Nc2cc([N+](=O)[O-])ccc2C(=O)N2CCC[C@@H]12
1417168 | O=C(NCC(F)(F)F)C1C(=O)Nc2ccccc2C(=O)N1Cc1ccccc1
...
793329 | Cc1ccc2c(c1)C(c1ccccc1)N(C(=O)c1ccc(OC(C)C)cc1)CC(=O)N2
921215 | O=C1CN(C(=O)c2cc([N+](=O)[O-])ccc2Cl)C(c2ccc(F)cc2)c2cc(F)ccc2N1
790949 | CCOC(=O)[C@H]1[C@H]2COc3ccc(Cl)cc3[C@@H]2N2C(=O)c3cc(C)ccc3NC(=O)[C@@]12C
760998 | CC(=O)N1CC(=O)Nc2ccc(Cl)cc2C1c1ccc(F)cc1
(100 rows)
Time: 97.357 ms
#### SMARTS-based queries
Oxadiazole or thiadiazole:
chembl_23=# select * from rdk.mols where m@>'c1[o,s]ncn1'::qmol limit 500;
molregno | m
----------+--------------------------------------------------------------------------------------------------------------
1370170 | Fc1cccc(-c2nc(NCC3COc4ccccc4O3)no2)c1F
1370417 | COc1cc(CN2CCC(Cc3nc(-c4ccc5c(c4)CCO5)no3)C2)ccc1F
1370526 | Cl.Cn1cc(-c2noc(/C=C3/CCN4CCCC[C@@H]4C3)n2)c2ccccc21
1379267 | CCC(c1ccccc1)c1noc(CCN(CC)CC)n1
1404150 | OC[C@H]1O[C@H](c2nc(-c3nc(-c4cccs4)no3)cs2)C[C@@H]1O
1217463 | CC(C)(C)c1ccc(-c2noc(CCC(=O)N3CCCCC3)n2)cc1
chembl_25=# select * from rdk.mols where m@>'c1[o,s]ncn1'::qmol limit 500;
molregno | m
----------+---------------------------------------------------------------------------------------------------
1882516 | COc1cccc(CN(C)Cc2nc(C(C)C)no2)c1
2194441 | Cc1nc([C@](C)(O)C#Cc2ccc3c(c2)-c2nc(C(N)=O)sc2[C@@H](F)CO3)no1
1881742 | CCOc1ccc(C(F)(F)F)cc1NC(=O)NCc1noc(C)n1
1949861 | FC(F)(F)c1ccc(-c2nc(-c3ccc4nc[nH]c4c3)no2)cc1
1949860 | FC(F)(F)c1cccc(-c2nc(-c3ccc4nc[nH]c4c3)no2)c1
2172627 | O=c1[nH]cc(-c2cc(Cl)ccc2Oc2cc(F)c(S(=O)(=O)Nc3ncns3)cc2F)n2cncc12
...
1517753 | CC(C)c1noc(N2CCC(CO[C@H]3CC[C@H](c4ccc(S(C)(=O)=O)cc4F)CC3)CC2)n1
1263024 | COc1cc(Nc2nc3c(s2)CCCC3c2ccccc2)ccc1-c1nc(C)no1
1264016 | O=C(O)CCc1nc2cc(-c3noc(-c4cc(C(F)(F)F)cc(C(F)(F)F)c4)n3)ccc2[nH]1
1847733 | Cc1cc(-c2noc([C@H]3CCCCN3C(=O)COc3ccccc3)n2)no1
1848026 | O=C1CCCN1c1cccc(-c2noc([C@H]3CCCCN3C(=O)COc3ccccc3)n2)c1
1848027 | O=C1CN(c2cccc(-c3noc([C@H]4CCCCN4C(=O)COc4ccccc4)n3)c2)C(=O)N1
1848036 | CN(C)C(=O)CCC(=O)Nc1cc(F)cc(-c2noc([C@H]3CCCCN3C(=O)COc3ccccc3)n2)c1
1852688 | CC(Sc1nc(N)cc(N)n1)c1nc(C(C)(C)C)no1
(500 rows)
Time: 761.847 ms
@@ -220,64 +217,39 @@ This is slower than the pure SMILES query, this is generally true of SMARTS-base
Note that by default stereochemistry is not taken into account when doing substructure queries:
chembl_23=# select * from rdk.mols where m@>'NC(=O)[C@@H]1CCCN1C=O' limit 10;
molregno |
m
----------+-----------------------------------------------------------------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------------------------------------------------------------------------
---------------------
87611 | CNCC(=O)N[C@@H](CCCN=C(N)N)C(=O)N1C[C@H]2C[C@H]1C(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@H](C(=O)N[C@@H](Cc1c[nH]cn1)C(=O)N1CCC[C@H]1C(=O)N[C@@H](Cc1ccccc1)C(=O
)O)CCSS2
88372 | CNCCCC[C@H](NC(=O)[C@H](Cc1ccccc1)NC(=O)[C@@H](CCCCNC)NC(=O)[C@H](Cc1ccc(O)cc1)NC(=O)[C@H](CO)NC(=O)[C@@H](Cc1ccccc1)NC(=O)[C@@H](Cc1ccccc1)NC(=O)[C@@H](C
c1ccc2ccccc2c1)NC(C)=O)C(=O)N1CCC[C@@H]1C(=O)N[C@H](C)C(=O)O
88322 | CC(=O)N[C@H](Cc1ccc2ccccc2c1)C(=O)N[C@H](Cc1ccccc1)C(=O)N[C@H](Cc1ccccc1)C(=O)N[C@@H](CO)C(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@H](CCCCNC(C)C)C(=O)N[C@@H](Cc1
ccccc1)C(=O)N[C@@H](CCCCNC(C)C)C(=O)N1CCC[C@@H]1C(=O)N[C@H](C)C(=O)O
88168 | CC(=O)N[C@H](Cc1ccc2ccccc2c1)C(=O)N[C@H](Cc1ccccc1)C(=O)N[C@H](Cc1ccccc1)C(=O)N[C@@H](CO)C(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@H](CCCN=C(N)N)C(=O)N[C@@H](Cc1
ccccc1)C(=O)N[C@@H](CCCCNC1CCCC1)C(=O)N1CCC[C@@H]1C(=O)N[C@H](C)C(=O)O
88150 | CC(=O)N[C@H](Cc1ccc2ccccc2c1)C(=O)N[C@H](Cc1ccccc1)C(=O)N[C@H](Cc1ccccc1)C(=O)N[C@@H](CO)C(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@H](CCCN=C(N)N)C(=O)N[C@@H](Cc1
ccccc1)C(=O)N[C@@H](CCCCNCc1ccc(C)cc1)C(=O)N1CCC[C@@H]1C(=O)N[C@H](C)C(=O)O
88373 | CC(=O)N[C@H](Cc1ccc2ccccc2c1)C(=O)N[C@H](Cc1ccccc1)C(=O)N[C@H](Cc1ccccc1)C(=O)N[C@@H](CO)C(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@H](CCCCNC1CCCCC1)C(=O)N[C@@H](
Cc1ccccc1)C(=O)N[C@@H](CCCCNC1CCCCC1)C(=O)N1CCC[C@@H]1C(=O)N[C@H](C)C(=O)O
93377 | CC(=O)N[C@@H](Cc1ccc([N+](=O)[O-])cc1)C(=O)N1CCC[C@H]1C(=O)N[C@@H](CCC/N=C(/N)NS(=O)(=O)c1c(C)c(C)c2c(c1C)CCC(C)(C)O2)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](
CCC/N=C(/N)NS(=O)(=O)c1c(C)c(C)c2c(c1C)CCC(C)(C)O2)C(=O)N[C@H](C(=O)NCC(=O)N[C@@H](COC(C)(C)C)C(=O)N[C@@H](CCCCNC(=O)c1ccccc1N)C(=O)NCC(=O)O)[C@@H](C)OC(C)(C)C
94493 | CC(C)C[C@@H]1NC(=O)[C@H]([C@@H](C)O)NC(=O)[C@H](Cc2c[nH]c3ccccc23)NC(=O)[C@H](C(C)C)NC(=O)[C@H](NC(=O)[C@H](CCCCN)NC(=O)[C@@H]2CCCN2C(=O)[C@H](CCC(N)=O)NC
(=O)CNC(=O)CN)CSSC[C@@H](C(=O)N[C@@H](Cc2ccc(O)cc2)C(=O)N[C@@H](CO)C(=O)N[C@H](C(=O)NCC(=O)NCC(N)=O)[C@@H](C)O)NC(=O)[C@H](Cc2c[nH]cn2)NC(=O)[C@H](Cc2ccccc2)NC(=O)CNC
(=O)[C@@H]2CCCN2C1=O
...skipping 1 line
89559 | CC1(C)SSC(C)(C)[C@@H](C(=O)N[C@@H](Cc2c[nH]cn2)C(=O)N2CCC[C@H]2C(=O)N[C@@H](Cc2ccccc2)C(=O)O)NC(=O)[C@H](Cc2ccc(O)cc2)NC(=O)[C@H]1NC(=O)[C@H](CCCN=C(N)N)N
C(=O)[C@@H](N)CC(=O)O
chembl_25=# select * from rdk.mols where m@>'NC(=O)[C@@H]1CCCN1C=O' limit 10;
molregno | m
----------+---------------------------------------------------------------------------------------------------
2213985 | CC[C@H](C)[C@@H]1NC(=O)[C@@H]2CCCN2C(=O)[C@@H]2CCCN2C(=O)[C@H]([C@@H](C)CC)NC(=O)[C@H](CO)NC(=O)[C@H](C)NC(=O)[C@H]([C@H](C)O)NC(=O)[C@@H]2CSSC[C@H](NC1=O)C(=O)N[C@@H](Cc1cnc[nH]1)C(=O)N[C@H](Cc1ccccc1)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](Cc1c[nH]c3ccccc13)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N2
1956682 | NC(=O)[C@@H]1CCCN1C(=O)[C@H](Cc1nc(I)[nH]c1I)NC(=O)c1cnccn1
2212188 | CN1C(=O)[C@H](CCCNC(=N)N)NC(=O)[C@@H](Cc2ccc(O)cc2)NC(=O)[C@@H]2CCCN2C(=O)[C@H](Cc2ccc3ccccc3c2)NC(=O)[C@@H]1CC(=O)O
2053463 | NCCCC[C@H](NC(=O)[C@H](Cc1ccc(OP(=O)(O)O)cc1)NC(=O)Cc1ccccc1)C(=O)N1CCC[C@H]1C(=O)N[C@@H](Cc1ccccc1)C(N)=O
2060743 | CCCCCCCCCCCCCCCCNC(=O)CN(CC(=O)NC(C)(C)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N1CCC[C@H]1C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CC(N)=O)C(N)=O)C(=O)c1cccnc1
2060744 | CCCCCCCCCCCCCCCCN(CCCCCCCCCCCCCCCC)CCCCCC(=O)NC(C)(C)C(=O)NC(Cc1ccccc1)C(=O)NC(CC(C)C)C(=O)NC(Cc1ccccc1)C(=O)NC(CCCNC(=N)N)C(=O)N1CCCC1C(=O)NC(CCCNC(=N)N)C(=O)NC(CC(N)=O)C(N)=O
2077784 | CC[C@H](C)[C@@H]1NC(=O)[C@H](Cc2ccccc2)NC(=O)[C@H](CC(C)C)NC(=O)[C@@H]2CCCN2C(=O)[C@H](Cc2ccccc2)NC(=O)[C@H](C(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CCSC)NC1=O
2077779 | CC[C@H](C)[C@@H]1NC(=O)[C@H](Cc2ccccc2)NC(=O)[C@H](CC(C)C)NC(=O)[C@@H]2CCCN2C(=O)[C@H](Cc2ccccc2)NC(=O)[C@H](C(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC[S+](C)[O-])NC1=O
2077782 | CC[C@H](C)[C@@H]1NC(=O)[C@H](Cc2c[nH]c3ccccc23)NC(=O)[C@H](Cc2ccccc2)NC(=O)[C@H](Cc2ccccc2)NC(=O)[C@@H]2CCCN2C(=O)[C@H](CCSC)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC[S+](C)[O-])NC1=O
2077780 | CC(C)C[C@@H]1NC(=O)[C@H](CC[S+](C)[O-])NC(=O)[C@H](C(C)C)NC(=O)[C@H](Cc2c[nH]c3ccccc23)NC(=O)[C@H](Cc2ccccc2)NC(=O)[C@H](Cc2ccccc2)NC(=O)[C@@H]2CCCN2C(=O)[C@H](CC[S+](C)[O-])NC1=O
(10 rows)
This can be changed using the rdkit.do\_chiral\_sss configuration variable:
chembl_23=# set rdkit.do_chiral_sss=true;
chembl_25=# set rdkit.do_chiral_sss=true;
SET
Time: 0.241 ms
chembl_23=# select * from rdk.mols where m@>'NC(=O)[C@@H]1CCCN1C=O' limit 10;
molregno |
m
----------+--------------------------------------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------
---------------
87611 | CNCC(=O)N[C@@H](CCCN=C(N)N)C(=O)N1C[C@H]2C[C@H]1C(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@H](C(=O)N[C@@H](Cc1c[nH]cn1)C(=O)N1CCC[C@H]1C(=O)N[C@@H](Cc1ccccc1)C(=O)O)
CCSS2
93377 | CC(=O)N[C@@H](Cc1ccc([N+](=O)[O-])cc1)C(=O)N1CCC[C@H]1C(=O)N[C@@H](CCC/N=C(/N)NS(=O)(=O)c1c(C)c(C)c2c(c1C)CCC(C)(C)O2)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC
/N=C(/N)NS(=O)(=O)c1c(C)c(C)c2c(c1C)CCC(C)(C)O2)C(=O)N[C@H](C(=O)NCC(=O)N[C@@H](COC(C)(C)C)C(=O)N[C@@H](CCCCNC(=O)c1ccccc1N)C(=O)NCC(=O)O)[C@@H](C)OC(C)(C)C
94493 | CC(C)C[C@@H]1NC(=O)[C@H]([C@@H](C)O)NC(=O)[C@H](Cc2c[nH]c3ccccc23)NC(=O)[C@H](C(C)C)NC(=O)[C@H](NC(=O)[C@H](CCCCN)NC(=O)[C@@H]2CCCN2C(=O)[C@H](CCC(N)=O)NC(=O
)CNC(=O)CN)CSSC[C@@H](C(=O)N[C@@H](Cc2ccc(O)cc2)C(=O)N[C@@H](CO)C(=O)N[C@H](C(=O)NCC(=O)NCC(N)=O)[C@@H](C)O)NC(=O)[C@H](Cc2c[nH]cn2)NC(=O)[C@H](Cc2ccccc2)NC(=O)CNC(=O)[C
@@H]2CCCN2C1=O
89558 | NC(N)=NCCC[C@H](NC(=O)[C@@H](N)CC(=O)O)C(=O)N[C@H]1CCSSC[C@@H](C(=O)N[C@@H](Cc2c[nH]cn2)C(=O)N2CCC[C@H]2C(=O)N[C@@H](Cc2ccccc2)C(=O)O)NC(=O)[C@H](Cc2ccc(O)cc
2)NC1=O
89559 | CC1(C)SSC(C)(C)[C@@H](C(=O)N[C@@H](Cc2c[nH]cn2)C(=O)N2CCC[C@H]2C(=O)N[C@@H](Cc2ccccc2)C(=O)O)NC(=O)[C@H](Cc2ccc(O)cc2)NC(=O)[C@H]1NC(=O)[C@H](CCCN=C(N)N)NC(=
O)[C@@H](N)CC(=O)O
126618 | NC(=O)[C@@H]1CCCN1C(=O)[C@@H]1CCCN1C(=O)[C@@H](O)[C@H](N)Cc1ccccc1
152339 | O=C(O)CN[C@H](CC1CCCCC1)C(=O)N1CCC[C@H]1C(=O)NCCCc1c[nH]cn1
152504 | N[C@H](CC1CCCCC1)C(=O)N1[C@H](C(=O)NC/C=C/c2c[nH]cn2)C[C@@H]2CCCC[C@@H]21
152383 | N[C@H](CC1CCCCC1)C(=O)N1CCC[C@H]1C(=O)NCCCCc1c[nH]cn1
151837 | N[C@H](CC1CCCCC1)C(=O)N1CCC[C@H]1C(=O)NC/C=C/c1c[nH]cn1
chembl_25=# select * from rdk.mols where m@>'NC(=O)[C@@H]1CCCN1C=O' limit 10;
molregno | m
----------+---------------------------------------------------------------------------------------------------
2213985 | CC[C@H](C)[C@@H]1NC(=O)[C@@H]2CCCN2C(=O)[C@@H]2CCCN2C(=O)[C@H]([C@@H](C)CC)NC(=O)[C@H](CO)NC(=O)[C@H](C)NC(=O)[C@H]([C@H](C)O)NC(=O)[C@@H]2CSSC[C@H](NC1=O)C(=O)N[C@@H](Cc1cnc[nH]1)C(=O)N[C@H](Cc1ccccc1)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](Cc1c[nH]c3ccccc13)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N2
1956682 | NC(=O)[C@@H]1CCCN1C(=O)[C@H](Cc1nc(I)[nH]c1I)NC(=O)c1cnccn1
2212188 | CN1C(=O)[C@H](CCCNC(=N)N)NC(=O)[C@@H](Cc2ccc(O)cc2)NC(=O)[C@@H]2CCCN2C(=O)[C@H](Cc2ccc3ccccc3c2)NC(=O)[C@@H]1CC(=O)O
2053463 | NCCCC[C@H](NC(=O)[C@H](Cc1ccc(OP(=O)(O)O)cc1)NC(=O)Cc1ccccc1)C(=O)N1CCC[C@H]1C(=O)N[C@@H](Cc1ccccc1)C(N)=O
2060743 | CCCCCCCCCCCCCCCCNC(=O)CN(CC(=O)NC(C)(C)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N1CCC[C@H]1C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CC(N)=O)C(N)=O)C(=O)c1cccnc1
2077784 | CC[C@H](C)[C@@H]1NC(=O)[C@H](Cc2ccccc2)NC(=O)[C@H](CC(C)C)NC(=O)[C@@H]2CCCN2C(=O)[C@H](Cc2ccccc2)NC(=O)[C@H](C(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CCSC)NC1=O
2077779 | CC[C@H](C)[C@@H]1NC(=O)[C@H](Cc2ccccc2)NC(=O)[C@H](CC(C)C)NC(=O)[C@@H]2CCCN2C(=O)[C@H](Cc2ccccc2)NC(=O)[C@H](C(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC[S+](C)[O-])NC1=O
2077782 | CC[C@H](C)[C@@H]1NC(=O)[C@H](Cc2c[nH]c3ccccc23)NC(=O)[C@H](Cc2ccccc2)NC(=O)[C@H](Cc2ccccc2)NC(=O)[C@@H]2CCCN2C(=O)[C@H](CCSC)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC[S+](C)[O-])NC1=O
2077780 | CC(C)C[C@@H]1NC(=O)[C@H](CC[S+](C)[O-])NC(=O)[C@H](C(C)C)NC(=O)[C@H](Cc2c[nH]c3ccccc23)NC(=O)[C@H](Cc2ccccc2)NC(=O)[C@H](Cc2ccccc2)NC(=O)[C@@H]2CCCN2C(=O)[C@H](CC[S+](C)[O-])NC1=O
2211488 | CC[C@H](C)[C@H](N)C(=O)N[C@H](C(=O)N[C@@H](CC(C)C)C(=O)N[C@H](C(=O)N1CCC[C@H]1C(=O)N1CCC[C@H]1C(=O)N[C@H](CCC(=O)N[C@@H](CCC(=O)N[C@@H](CC(C)C)C(=O)O)Cc1ccccc1)Cc1ccccc1)C(C)C)[C@@H](C)CC
(10 rows)
Time: 6.181 ms
@@ -289,19 +261,19 @@ having to construct complex SMARTS queries. The cartridge function `mol_adjust_q
can be used to do just this. Here is an example of the default behavior, using a
query for 2,6 di-substituted pyridines:
chembl_23=# select molregno,m from rdk.mols where m@>mol_adjust_query_properties('*c1cccc(NC(=O)*)n1') limit 10;
molregno | m
----------+-------------------------------------------------------------------------------------------
1993749 | Cn1c(Nc2c(Cl)ccc(CNC(=O)C(C)(C)C)c2Cl)nc2cc(C(=O)Nc3cccc(C(F)(F)F)n3)c(N3CCC(F)(F)C3)cc21
1988455 | Cc1cccc(C(=O)Nc2cccc(Oc3cccnc3)n2)c1
1870095 | COC(=O)CN(C(=O)C(C)c1c(F)cccc1F)c1cccc(C)n1
1870023 | CCC(C)C(=O)N(CC(=O)OC)c1cccc(C)n1
1873944 | Cc1ccc(C(=O)N(C)CC(=O)Nc2cccc(C)n2)cn1
1873968 | Cc1cccc(NC(=O)CN(C)C(=O)c2ccc(-n3cccc3)nc2)n1
1882693 | Cc1cccc(NC(=O)CCNCc2c(C)nn(C)c2N(C)C)n1
1882711 | COc1c(CNCCC(=O)Nc2cccc(C)n2)c(C)nn1C
1868705 | CCOc1cccc(NC(=O)c2cnc(C)cn2)n1
1875177 | Cc1cccc(NC(=O)[C@@H]2CCCN2Cc2nc(C)c(C)o2)n1
chembl_25=# select molregno,m from rdk.mols where m@>mol_adjust_query_properties('*c1cccc(NC(=O)*)n1') limit 10;
molregno | m
----------+---------------------------------------------------------------------------------------------------
1609520 | Cc1cccc(NC(=O)c2cc(Br)ccc2C(=O)O)n1
1141456 | CCN(CC)CCCn1cc(NC(=O)Nc2cccc(-c3ccccc3)n2)c2ccccc21
1431198 | Cc1cccc(NC(=O)c2nc(C)sc2Nc2cccnc2)n1
734975 | Cc1cccc(NC(=O)CN(C)S(=O)(=O)c2ccc(Cl)cc2)n1
760426 | Cc1cccc(NC(=O)CCCn2cc([N+](=O)[O-])cn2)n1
782786 | Cc1cccc(NC(=O)CN2C(=O)NC(C)(c3ccc4ccccc4c3)C2=O)n1
1478990 | Cc1cccc(NC(=O)Cn2c(=O)sc3cc(C(=O)c4ccccc4)ccc32)n1
1478787 | Cc1cccc(NC(=O)Cn2c(=O)sc3cc(C(=O)c4ccccc4F)ccc32)n1
1955608 | C[C@H](N)C(=O)Nc1cccc(N)n1
773911 | Cc1cccc(NC(=O)c2c(-c3ccccc3)noc2C)n1
(10 rows)
Time: 11.895 ms
@@ -315,20 +287,20 @@ By default `mol_adjust_query_properties()` makes the following changes to the mo
We can control the behavior by providing an additional JSON argument. Here's an example
where we disable the additional degree queries:
chembl_23=# select molregno,m from rdk.mols where m@>mol_adjust_query_properties('*c1cccc(NC(=O)*)n1',
chembl_23(# '{"adjustDegree":false}') limit 10;
molregno | m
----------+-------------------------------------------------------------------------------------------
1993749 | Cn1c(Nc2c(Cl)ccc(CNC(=O)C(C)(C)C)c2Cl)nc2cc(C(=O)Nc3cccc(C(F)(F)F)n3)c(N3CCC(F)(F)C3)cc21
1957849 | COc1ccc2ncc(F)c(C[C@H](O)C3CCC(NCc4nc5c(cc4F)OCC(=O)N5)CO3)c2n1
1959611 | O=C1COc2ccc(CNC3CCN(CCn4c(=O)ccc5ncc(OCc6cccnn6)cc54)CC3)nc2N1
1988455 | Cc1cccc(C(=O)Nc2cccc(Oc3cccnc3)n2)c1
1870095 | COC(=O)CN(C(=O)C(C)c1c(F)cccc1F)c1cccc(C)n1
1870023 | CCC(C)C(=O)N(CC(=O)OC)c1cccc(C)n1
1873944 | Cc1ccc(C(=O)N(C)CC(=O)Nc2cccc(C)n2)cn1
1873968 | Cc1cccc(NC(=O)CN(C)C(=O)c2ccc(-n3cccc3)nc2)n1
1882693 | Cc1cccc(NC(=O)CCNCc2c(C)nn(C)c2N(C)C)n1
1882711 | COc1c(CNCCC(=O)Nc2cccc(C)n2)c(C)nn1C
chembl_25=# select molregno,m from rdk.mols where m@>mol_adjust_query_properties('*c1cccc(NC(=O)*)n1',
chembl_25(# '{"adjustDegree":false}') limit 10;
molregno | m
----------+---------------------------------------------------------------------------------------------------
2146308 | CCn1ncc2cc3nc(c21)NCCOC[C@H](c1ccccc1)NC(=O)N3
2137309 | CCn1ncc2cc3nc(c21)CCCO[C@@H](O)[C@H](c1ccccc1)NC(=O)N3
2102593 | CCn1ncc2cc3nc(c21)CCCO[C@@H]([C@@H](C)O)[C@@H](c1ccccc1)NC(=O)N3
2171613 | CCn1ncc2cc3nc(c21)CCCO[C@@H]([C@H](C)O)[C@@H](c1ccccc1)NC(=O)N3
2111904 | CCn1ncc2cc3nc(c21)C[C@H](O)COC[C@H](c1cccc(Cl)c1)NC(=O)N3
2173410 | CCn1ncc2cc3nc(c21)CCCOC[C@H](c1ccccc1)NC(=O)N3
2189450 | Cn1ncc2cc3nc(c21)CCCOC[C@H](c1ccccc1)NC(=O)N3
2195752 | CCn1ncc2cc3nc(c21)C[C@H](O)COC[C@H](c1ccccc1)NC(=O)N3
1609520 | Cc1cccc(NC(=O)c2cc(Br)ccc2C(=O)O)n1
1141456 | CCN(CC)CCCn1cc(NC(=O)Nc2cccc(-c3ccccc3)n2)c2ccccc21
(10 rows)
Time: 10.780 ms
@@ -336,20 +308,20 @@ where we disable the additional degree queries:
or where we don't add the additional degree queries to ring atoms or dummies (they are only
added to chain atoms):
chembl_23=# select molregno,m from rdk.mols where m@>mol_adjust_query_properties('*c1cccc(NC(=O)*)n1',
chembl_23(# '{"adjustDegree":true,"adjustDegreeFlags":"IGNORERINGS|IGNOREDUMMIES"}') limit 10;
molregno | m
----------+-------------------------------------------------------------------------------------------
1993749 | Cn1c(Nc2c(Cl)ccc(CNC(=O)C(C)(C)C)c2Cl)nc2cc(C(=O)Nc3cccc(C(F)(F)F)n3)c(N3CCC(F)(F)C3)cc21
1957849 | COc1ccc2ncc(F)c(C[C@H](O)C3CCC(NCc4nc5c(cc4F)OCC(=O)N5)CO3)c2n1
1959611 | O=C1COc2ccc(CNC3CCN(CCn4c(=O)ccc5ncc(OCc6cccnn6)cc54)CC3)nc2N1
1988455 | Cc1cccc(C(=O)Nc2cccc(Oc3cccnc3)n2)c1
1873944 | Cc1ccc(C(=O)N(C)CC(=O)Nc2cccc(C)n2)cn1
1873968 | Cc1cccc(NC(=O)CN(C)C(=O)c2ccc(-n3cccc3)nc2)n1
1882693 | Cc1cccc(NC(=O)CCNCc2c(C)nn(C)c2N(C)C)n1
1882711 | COc1c(CNCCC(=O)Nc2cccc(C)n2)c(C)nn1C
1884388 | Cc1noc(COCC(=O)Nc2ccc(Br)c(C)n2)n1
1868705 | CCOc1cccc(NC(=O)c2cnc(C)cn2)n1
chembl_25=# select molregno,m from rdk.mols where m@>mol_adjust_query_properties('*c1cccc(NC(=O)*)n1',
chembl_25(# '{"adjustDegree":true,"adjustDegreeFlags":"IGNORERINGS|IGNOREDUMMIES"}') limit 10;
molregno | m
----------+---------------------------------------------------------------------------------------------------
2146308 | CCn1ncc2cc3nc(c21)NCCOC[C@H](c1ccccc1)NC(=O)N3
2137309 | CCn1ncc2cc3nc(c21)CCCO[C@@H](O)[C@H](c1ccccc1)NC(=O)N3
2102593 | CCn1ncc2cc3nc(c21)CCCO[C@@H]([C@@H](C)O)[C@@H](c1ccccc1)NC(=O)N3
2171613 | CCn1ncc2cc3nc(c21)CCCO[C@@H]([C@H](C)O)[C@@H](c1ccccc1)NC(=O)N3
2111904 | CCn1ncc2cc3nc(c21)C[C@H](O)COC[C@H](c1cccc(Cl)c1)NC(=O)N3
2173410 | CCn1ncc2cc3nc(c21)CCCOC[C@H](c1ccccc1)NC(=O)N3
2189450 | Cn1ncc2cc3nc(c21)CCCOC[C@H](c1ccccc1)NC(=O)N3
2195752 | CCn1ncc2cc3nc(c21)C[C@H](O)COC[C@H](c1ccccc1)NC(=O)N3
1609520 | Cc1cccc(NC(=O)c2cc(Br)ccc2C(=O)O)n1
1141456 | CCN(CC)CCCn1cc(NC(=O)Nc2cccc(-c3ccccc3)n2)c2ccccc21
(10 rows)
Time: 12.827 ms
@@ -381,7 +353,7 @@ are constructed by combining operations from the list below with the `|` charact
Basic similarity searching:
chembl_23=# select count(*) from rdk.fps where mfp2%morganbv_fp('Cc1ccc2nc(-c3ccc(NC(C4N(C(c5cccs5)=O)CCC4)=O)cc3)sc2c1');
chembl_25=# select count(*) from rdk.fps where mfp2%morganbv_fp('Cc1ccc2nc(-c3ccc(NC(C4N(C(c5cccs5)=O)CCC4)=O)cc3)sc2c1');
count
-------
67
@@ -391,8 +363,8 @@ Basic similarity searching:
Usually we'd like to find a sorted list of neighbors along with the accompanying SMILES. This SQL function makes that pattern easy:
chembl_23=# create or replace function get_mfp2_neighbors(smiles text)
returns table(molregno integer, m mol, similarity double precision) as
chembl_25=# create or replace function get_mfp2_neighbors(smiles text)
returns table(molregno bigint, m mol, similarity double precision) as
$$
select molregno,m,tanimoto_sml(morganbv_fp(mol_from_smiles($1::cstring)),mfp2) as similarity
from rdk.fps join rdk.mols using (molregno)
@@ -401,35 +373,35 @@ Usually we'd like to find a sorted listed of neighbors along with the accompanyi
$$ language sql stable ;
CREATE FUNCTION
Time: 0.856 ms
chembl_23=# select * from get_mfp2_neighbors('Cc1ccc2nc(-c3ccc(NC(C4N(C(c5cccs5)=O)CCC4)=O)cc3)sc2c1') limit 10;
molregno | m | similarity
----------+------------------------------------------------------------+-------------------
471319 | Cc1ccc2nc(-c3ccc(NC(=O)C4CCN(S(=O)(=O)c5cccs5)C4)cc3)sc2c1 | 0.638888888888889
1032469 | O=C(Nc1nc2ccc(Cl)cc2s1)[C@@H]1CCCN1C(=O)c1cccs1 | 0.623188405797101
751668 | COc1ccc2nc(NC(=O)[C@@H]3CCCN3C(=O)c3cccs3)sc2c1 | 0.619718309859155
471318 | Cc1ccc2nc(-c3ccc(NC(=O)C4CN(S(=O)(=O)c5cccs5)C4)cc3)sc2c1 | 0.611111111111111
740754 | Cc1ccc(NC(=O)C2CCCN2C(=O)c2cccs2)cc1C | 0.606060606060606
732905 | O=C(Nc1ccc(S(=O)(=O)N2CCCC2)cc1)C1CCCN1C(=O)c1cccs1 | 0.602941176470588
1087495 | Cc1ccc(NC(=O)C2CCCN2C(=O)c2cccs2)c(C)c1 | 0.597014925373134
471462 | CCS(=O)(=O)N1CCC(C(=O)Nc2ccc(-c3nc4ccc(C)cc4s3)cc2)CC1 | 0.585714285714286
810850 | Cc1cc(C)n(-c2ccc(NC(=O)C3CCCCN3C(=O)c3cccs3)cc2)n1 | 0.583333333333333
1224407 | O=C(Nc1cccc(S(=O)(=O)N2CCCC2)c1)C1CCCN1C(=O)c1cccs1 | 0.579710144927536
chembl_25=# select * from get_mfp2_neighbors('Cc1ccc2nc(-c3ccc(NC(C4N(C(c5cccs5)=O)CCC4)=O)cc3)sc2c1') limit 10;
molregno | m | similarity
----------+------------------------------------------------------------------+-------------------
751668 | COc1ccc2nc(NC(=O)[C@@H]3CCCN3C(=O)c3cccs3)sc2c1 | 0.619718309859155
740754 | Cc1ccc(NC(=O)C2CCCN2C(=O)c2cccs2)cc1C | 0.606060606060606
732905 | O=C(Nc1ccc(S(=O)(=O)N2CCCC2)cc1)C1CCCN1C(=O)c1cccs1 | 0.602941176470588
810850 | Cc1cc(C)n(-c2ccc(NC(=O)C3CCCCN3C(=O)c3cccs3)cc2)n1 | 0.583333333333333
1224407 | O=C(Nc1cccc(S(=O)(=O)N2CCCC2)c1)C1CCCN1C(=O)c1cccs1 | 0.579710144927536
779258 | CC1CCN(S(=O)(=O)c2ccc(NC(=O)[C@@H]3CCCN3C(=O)c3cccs3)cc2)CC1 | 0.569444444444444
472441 | Cc1ccc2nc(-c3ccc(NC(=O)C4CCN(S(=O)(=O)C(C)C)CC4)cc3)sc2c1 | 0.569444444444444
745651 | Cc1ccc(NC(=O)[C@@H]2CCCN2C(=O)c2cccs2)cc1S(=O)(=O)N1CCCCC1 | 0.567567567567568
472510 | Cc1ccc2nc(-c3ccc(NC(=O)C4CCN(S(=O)(=O)c5cccc(Cl)c5)CC4)cc3)sc2c1 | 0.565789473684211
1233426 | Cc1cccc2sc(NC(=O)[C@@H]3CCCN3C(=O)c3cccs3)nc12 | 0.563380281690141
(10 rows)
Time: 28.909 ms
chembl_23=# select * from get_mfp2_neighbors('Cc1ccc2nc(N(C)CC(=O)O)sc2c1') limit 10;
molregno | m | similarity
----------+-------------------------------------------------------+-------------------
1044892 | Cc1ccc2nc(N(CCN(C)C)C(=O)c3cc(Cl)sc3Cl)sc2c1 | 0.518518518518518
1040496 | Cc1ccc2nc(N(CCCN(C)C)C(=O)CCc3ccccc3)sc2c1 | 0.517857142857143
1049393 | Cc1ccc2nc(N(CCCN(C)C)C(=O)CS(=O)(=O)c3ccccc3)sc2c1 | 0.517857142857143
441378 | Cc1ccc2nc(NC(=O)CCC(=O)O)sc2c1 | 0.510204081632653
1047691 | Cc1ccc(S(=O)(=O)CC(=O)N(CCCN(C)C)c2nc3ccc(C)cc3s2)cc1 | 0.509090909090909
911501 | Cc1ccc2nc(N(CCN(C)C)C(=O)c3cc(Cl)sc3Cl)sc2c1.Cl | 0.509090909090909
1042958 | Cc1ccc2nc(N(CCN(C)C)C(=O)c3ccc4ccccc4c3)sc2c1 | 0.509090909090909
775269 | Cc1ccc2nc(N(CCCN(C)C)C(=O)CCc3ccccc3)sc2c1.Cl | 0.508771929824561
1045663 | Cc1ccc2nc(N(CCCN(C)C)C(=O)COc3ccc(Cl)cc3)sc2c1 | 0.5
1015485 | Cc1ccc2nc(N(Cc3cccnc3)C(=O)Cc3ccccc3)sc2c1 | 0.5
chembl_25=# select * from get_mfp2_neighbors('Cc1ccc2nc(N(C)CC(=O)O)sc2c1') limit 10;
molregno | m | similarity
----------+----------------------------------------------------------+-------------------
2138088 | CN(CC(=O)O)c1nc2ccc([N+](=O)[O-])cc2s1 | 0.673913043478261
1040255 | CC(=O)N(CCCN(C)C)c1nc2ccc(C)cc2s1 | 0.571428571428571
773946 | CC(=O)N(CCCN(C)C)c1nc2ccc(C)cc2s1.Cl | 0.56
1044892 | Cc1ccc2nc(N(CCN(C)C)C(=O)c3cc(Cl)sc3Cl)sc2c1 | 0.518518518518518
441378 | Cc1ccc2nc(NC(=O)CCC(=O)O)sc2c1 | 0.510204081632653
1047691 | Cc1ccc(S(=O)(=O)CC(=O)N(CCCN(C)C)c2nc3ccc(C)cc3s2)cc1 | 0.509090909090909
1042958 | Cc1ccc2nc(N(CCN(C)C)C(=O)c3ccc4ccccc4c3)sc2c1 | 0.509090909090909
1015485 | Cc1ccc2nc(N(Cc3cccnc3)C(=O)Cc3ccccc3)sc2c1 | 0.5
994843 | Cc1ccc(S(=O)(=O)CC(=O)N(CCCN(C)C)c2nc3ccc(C)cc3s2)cc1.Cl | 0.5
841938 | Cc1ccc2nc(N(CCN(C)C)C(=O)c3ccc4ccccc4c3)sc2c1.Cl | 0.5
(10 rows)
Time: 41.623 ms
@@ -438,40 +410,40 @@ Usually we'd like to find a sorted listed of neighbors along with the accompanyi
By default, the minimum similarity returned with a similarity search is 0.5. This can be adjusted with the rdkit.tanimoto\_threshold (and rdkit.dice\_threshold) configuration variables:
chembl_23=# select count(*) from get_mfp2_neighbors('Cc1ccc2nc(N(C)CC(=O)O)sc2c1');
chembl_25=# select count(*) from get_mfp2_neighbors('Cc1ccc2nc(N(C)CC(=O)O)sc2c1');
count
-------
20
21
(1 row)
Time: 181.438 ms
chembl_23=# set rdkit.tanimoto_threshold=0.7;
chembl_25=# set rdkit.tanimoto_threshold=0.7;
SET
Time: 0.047 ms
chembl_23=# select count(*) from get_mfp2_neighbors('Cc1ccc2nc(N(C)CC(=O)O)sc2c1');
chembl_25=# select count(*) from get_mfp2_neighbors('Cc1ccc2nc(N(C)CC(=O)O)sc2c1');
count
-------
0
(1 row)
Time: 161.228 ms
chembl_23=# set rdkit.tanimoto_threshold=0.6;
chembl_25=# set rdkit.tanimoto_threshold=0.6;
SET
Time: 0.045 ms
chembl_23=# select count(*) from get_mfp2_neighbors('Cc1ccc2nc(N(C)CC(=O)O)sc2c1');
chembl_25=# select count(*) from get_mfp2_neighbors('Cc1ccc2nc(N(C)CC(=O)O)sc2c1');
count
-------
1
2
(1 row)
Time: 184.275 ms
chembl_23=# set rdkit.tanimoto_threshold=0.5;
chembl_25=# set rdkit.tanimoto_threshold=0.5;
SET
Time: 0.055 ms
chembl_23=# select count(*) from get_mfp2_neighbors('Cc1ccc2nc(N(C)CC(=O)O)sc2c1');
chembl_25=# select count(*) from get_mfp2_neighbors('Cc1ccc2nc(N(C)CC(=O)O)sc2c1');
count
-------
20
21
(1 row)
Time: 181.100 ms
@@ -480,14 +452,14 @@ By default, the minimum similarity returned with a similarity search is 0.5. Thi
The most straightforward use of the MCS code is to find the maximum common substructure of a group of molecules:
chembl_23=# select fmcs(m::text) from rdk.mols join compound_records using (molregno) where doc_id=4;
chembl_25=# select fmcs(m::text) from rdk.mols join compound_records using (molregno) where doc_id=4;
fmcs
------------------------------------------------------------------------
[#6](-[#6]-[#7]-[#6]-[#6](-,:[#6])-,:[#6])-,:[#6]-,:[#6]-,:[#6]-,:[#6]
(1 row)
Time: 31.041 ms
chembl_23=# select fmcs(m::text) from rdk.mols join compound_records using (molregno) where doc_id=5;
chembl_25=# select fmcs(m::text) from rdk.mols join compound_records using (molregno) where doc_id=5;
fmcs
------------------------------------------------------------------------------------------------------------------------------------------
[#6]-[#6](=[#8])-[#7]-[#6](-[#6](=[#8])-[#7]1-[#6]-[#6]-[#6]-[#6]-1-[#6](=[#8])-[#7]-[#6](-[#6](=[#8])-[#8])-[#6]-[#6])-[#6](-[#6])-[#6]
@@ -497,7 +469,7 @@ The most straightforward use of the MCS code is to find the maximum common subst
The same thing can be done with a SMILES column:
chembl_23=# select fmcs(canonical_smiles) from compound_structures join compound_records using (molregno) where doc_id=4;
chembl_25=# select fmcs(canonical_smiles) from compound_structures join compound_records using (molregno) where doc_id=4;
fmcs
------------------------------------------------------------------------
[#6](-[#7]-[#6]-[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6])-[#6](-,:[#6])-,:[#6]
@@ -507,9 +479,9 @@ The same thing can be done with a SMILES column:
It's also possible to adjust some of the parameters to the FMCS algorithm, though this is somewhat more painful as of this writing (the 2017\_03 release cycle). Here are a couple of examples:
chembl_23=# select fmcs_smiles(str,'{"Threshold":0.8}') from
chembl_23-# (select string_agg(m::text,' ') as str from rdk.mols
chembl_23(# join compound_records using (molregno) where doc_id=4) as str ;
chembl_25=# select fmcs_smiles(str,'{"Threshold":0.8}') from
chembl_25-# (select string_agg(m::text,' ') as str from rdk.mols
chembl_25(# join compound_records using (molregno) where doc_id=4) as str ;
fmcs_smiles
------------------------------------------------------------------------------------------------------------------------------------------------------------------
@@ -517,10 +489,10 @@ It's also possible to adjust some of the parameters to the FMCS algorithm, thoug
(1 row)
Time: 9673.949 ms
chembl_23=#
chembl_23=# select fmcs_smiles(str,'{"AtomCompare":"Any"}') from
chembl_23-# (select string_agg(m::text,' ') as str from rdk.mols
chembl_23(# join compound_records using (molregno) where doc_id=4) as str ;
chembl_25=#
chembl_25=# select fmcs_smiles(str,'{"AtomCompare":"Any"}') from
chembl_25-# (select string_agg(m::text,' ') as str from rdk.mols
chembl_25(# join compound_records using (molregno) where doc_id=4) as str ;
fmcs_smiles
------------------------------------------------------------------------------------------------------------------------------------------------------------------------
[#6]-,:[#6,#7]-[#8,#6]-[#6,#7](-[#6,#8]-[#7,#6]-,:[#6,#7]-,:[#6,#7]-,:[#7,#6]-,:[#6])-[#6,#7]-[#6]-[#6](-[#8,#6]-[#6])-[#6,#7]-[#7,#6]-[#6]-,:[#6,#8]-,:[#7,#6]-,:[#6]
@@ -530,9 +502,9 @@ It's also possible to adjust some of the parameters to the FMCS algorithm, thoug
*Note* The combination of `"AtomCompare":"Any"` and a value of `"Threshold"` that is less than 1.0 does a quite generic search and can result in very long search times. Using `"Timeout"` with this combination is recommended:
chembl_23=# select fmcs_smiles(str,'{"AtomCompare":"Any","CompleteRingsOnly":true,"Threshold":0.8,"Timeout":60}') from
chembl_23-# (select string_agg(m::text,' ') as str from rdk.mols
chembl_23(# join compound_records using (molregno) where doc_id=3) as str ;
chembl_25=# select fmcs_smiles(str,'{"AtomCompare":"Any","CompleteRingsOnly":true,"Threshold":0.8,"Timeout":60}') from
chembl_25-# (select string_agg(m::text,' ') as str from rdk.mols
chembl_25(# join compound_records using (molregno) where doc_id=3) as str ;
WARNING: findMCS timed out, result is not maximal
fmcs_smiles
@@ -724,7 +696,7 @@ The recommended adapter for connecting to postgresql is psycopg2 (<https://pypi.
Here's an example of connecting to our local copy of ChEMBL and doing a basic substructure search:
>>> import psycopg2
>>> conn = psycopg2.connect(database='chembl_16')
>>> conn = psycopg2.connect(database='chembl_25')
>>> curs = conn.cursor()
>>> curs.execute('select * from rdk.mols where m@>%s',('c1cccc2c1nncc2',))
>>> curs.fetchone()
@@ -735,12 +707,12 @@ That returns a SMILES for each molecule. If you plan to do more work with the mo
>>> curs.execute('select molregno,mol_send(m) from rdk.mols where m@>%s',('c1cccc2c1nncc2',))
>>> row = curs.fetchone()
>>> row
(9830, <read-only buffer for 0x...>)
(9830, <memory at 0x...>)
These pickles can then be converted into molecules:
>>> from rdkit import Chem
>>> m = Chem.Mol(str(row[1]))
>>> m = Chem.Mol(row[1].tobytes())
>>> Chem.MolToSmiles(m,True)
'CC(C)Sc1ccc(CC2CCN(C3CCN(C(=O)c4cnnc5ccccc54)CC3)CC2)cc1'