Files
rdkit/Code/SimDivPickers/Wrap/testPickers.py
Greg Landrum 6cfc8f36a7 [WIP] Add Leader picker implementation (#2724)
* initial commit of Roger's contrib

* add the new file to the headers list

* clang-format

* stub of the python wrapper

* code added for testing

* code added for testing

* add LeaderPicker.seq.h

* default pickSize value

* first crude python wrapper

* Add C++ tests for the LeaderPicker
combine the thread and non-thread versions

* that was not really a test

* support providing a functor from Python

* no longer need the .seq header

* temporarily disable the threaded version to allow CI runs to pass

* some refactoring and cleanup
2019-10-24 13:14:15 +02:00

319 lines
12 KiB
Python
Executable File

from rdkit import RDConfig
import unittest, os
from rdkit.SimDivFilters import rdSimDivPickers
from rdkit.DataManip.Metric import rdMetricMatrixCalc as rdmmc
from rdkit import DataStructs
import numpy
import random
class TestCase(unittest.TestCase):
    """Tests for the Python wrappers of the pickers in ``rdSimDivPickers``.

    The tests exercise ``MaxMinPicker``, ``HierarchicalClusterPicker`` and
    ``LeaderPicker`` through their distance-matrix, lazy (callback) and
    bit-vector entry points.
    """

    # Ids expected from MaxMinPicker on test_data/chembl_cyps.head.fps when
    # asking for 20 picks with seed=42.
    _CYP_MAXMIN_PICKS = [
        374, 720, 690, 339, 875, 842, 404, 725, 120, 385,
        115, 868, 630, 881, 516, 497, 412, 718, 869, 407,
    ]

    @staticmethod
    def _readCypFingerprints():
        """Return the fingerprints read from ``test_data/chembl_cyps.head.fps``.

        Each line of the file is stripped and parsed with
        ``DataStructs.CreateFromFPSText``.
        """
        fname = os.path.join(RDConfig.RDBaseDir, 'Code', 'SimDivPickers', 'Wrap', 'test_data',
                             'chembl_cyps.head.fps')
        with open(fname) as infil:
            return [DataStructs.CreateFromFPSText(line.strip()) for line in infil]

    @staticmethod
    def _randomBitVect(nbits, nBitsToSet):
        """Return an ``ExplicitBitVect`` of ``nbits`` bits with random bits set.

        ``nBitsToSet`` random positions are drawn; positions may repeat, so the
        number of bits actually set can be smaller than ``nBitsToSet``.
        """
        bv = DataStructs.ExplicitBitVect(nbits)
        for _ in range(nBitsToSet):
            bv.SetBit(int(nbits * random.random()))
        return bv

    def _assertPicksSeparated(self, fps, ids, thresh):
        """Assert that all picked fingerprints are pairwise >= ``thresh`` apart.

        The distance used is ``1 - TanimotoSimilarity``.
        """
        picked = list(ids)
        for pos, later in enumerate(picked):
            for earlier in picked[:pos]:
                dist = 1 - DataStructs.TanimotoSimilarity(fps[later], fps[earlier])
                self.assertGreaterEqual(dist, thresh)

    def setUp(self):
        """Build ``self.n`` random 2D points and their Euclidean distance matrix."""
        self.n = 1000
        self.m = 80
        self.d = 2
        # Filled row by row so the random numbers are consumed in the same
        # order as an explicit double loop over (i, j).
        self.dataPts = numpy.array(
            [[random.random() for _ in range(self.d)] for _ in range(self.n)], 'd')
        self.dMat = rdmmc.GetEuclideanDistMat(self.dataPts)

    def test0MaxMin(self):
        """MaxMin picks from a distance matrix and from a lazy callback agree."""
        pkr = rdSimDivPickers.MaxMinPicker()
        seeds = (886, 112)
        maxmin = pkr.Pick(self.dMat, self.n, self.m, seeds)
        # the provided first picks must come back first, in order
        self.assertEqual(maxmin[0], 886)
        self.assertEqual(maxmin[1], 112)

        def func(i, j):
            # look the distance up in the flat matrix built in setUp(), which
            # is indexed as row * (row - 1) // 2 + col with row > col
            if i == j:
                return 0.0
            if i < j:
                j, i = i, j
            return self.dMat[i * (i - 1) // 2 + j]

        lmaxmin = pkr.LazyPick(func, self.n, self.m, seeds)
        self.assertEqual(list(lmaxmin), list(maxmin))

        lmaxmin = pkr.LazyPick(func, self.n, self.m, seeds, useCache=False)
        self.assertEqual(list(lmaxmin), list(maxmin))

        # first picks outside [0, n) are rejected
        with self.assertRaises(ValueError):
            pkr.Pick(self.dMat, self.n, self.m, (1012, ))
        with self.assertRaises(ValueError):
            pkr.Pick(self.dMat, self.n, self.m, (-1, ))

        # without first picks we only check that something is returned
        maxmin = pkr.Pick(self.dMat, self.n, self.m)
        self.assertTrue(maxmin)
        lmaxmin = pkr.LazyPick(func, self.n, self.m)
        self.assertTrue(lmaxmin)

    def test1HierarchPick(self):
        """Ward clustering of the labelled points in points.csv yields pure clusters."""
        fname = os.path.join(RDConfig.RDBaseDir, 'Code', 'SimDivPickers', 'Wrap', 'test_data',
                             'points.csv')
        with open(fname) as infil:
            lines = infil.readlines()

        nPts = len(lines)
        self.dataPts = numpy.zeros((nPts, 2), 'd')
        labels = []
        for row, line in enumerate(lines):
            # columns 1 and 2 hold the coordinates, column 3 the integer label
            tlst = line.strip().split(',')
            self.dataPts[row, 0] = float(tlst[1])
            self.dataPts[row, 1] = float(tlst[2])
            labels.append(int(tlst[3]))
        self.dMat = rdmmc.GetEuclideanDistMat(self.dataPts)

        pkr = rdSimDivPickers.HierarchicalClusterPicker(rdSimDivPickers.ClusterMethod.WARD)
        clusters = pkr.Cluster(self.dMat, nPts, 2)
        # every member of a cluster must carry the label of its first member
        for cl in clusters:
            clbl = labels[cl[0]]
            for idx in cl:
                self.assertEqual(clbl, labels[idx])

        hierarch = pkr.Pick(self.dMat, nPts, 2)
        self.assertEqual(tuple(hierarch), (1, 30))

    def testIssue208(self):
        """Picking twice from the same float distance matrix gives the same ids."""
        sz = 10
        N = 3
        # one random distance per unordered pair of points
        m = numpy.array([random.random() for _ in range(sz * (sz - 1) // 2)])
        picker = rdSimDivPickers.HierarchicalClusterPicker(rdSimDivPickers.ClusterMethod.WARD)
        p1 = sorted(picker.Pick(m, sz, N))
        p2 = sorted(picker.Pick(m, sz, N))
        self.assertEqual(p1, p2)

    def testInts(self):
        """ make sure we can handle ints too """
        sz = 10
        N = 3
        # one random integer distance per unordered pair of points
        m = numpy.array([int(100 * random.random()) for _ in range(sz * (sz - 1) // 2)])
        picker = rdSimDivPickers.HierarchicalClusterPicker(rdSimDivPickers.ClusterMethod.WARD)
        p1 = sorted(picker.Pick(m, sz, N))
        p2 = sorted(picker.Pick(m, sz, N))
        self.assertEqual(p1, p2)

    def testNonUniqueCrash(self):
        """Pickers must cope with a pool in which every fingerprint occurs twice."""
        sz = 300
        nbits = 40
        nBitsToSet = int(nbits * .3)
        N = 8
        vs = []
        for _ in range(sz):
            bv = self._randomBitVect(nbits, nBitsToSet)
            # add each vector twice so the pool contains exact duplicates
            vs.append(bv)
            vs.append(bv)

        def taniFunc(i, j, bvs=vs):
            return 1 - DataStructs.FingerprintSimilarity(bvs[i], bvs[j])

        picker = rdSimDivPickers.MaxMinPicker()
        mm1 = picker.LazyPick(taniFunc, len(vs), N)
        self.assertEqual(len(mm1), N)

        # drop the picker explicitly before building the next one
        picker = None
        picker = rdSimDivPickers.MaxMinPicker()
        mm2 = picker.LazyBitVectorPick(vs, len(vs), N)
        self.assertEqual(len(mm2), N)

        picker = rdSimDivPickers.MaxMinPicker()
        mm3 = picker.LazyBitVectorPick(vs, len(vs), N)
        self.assertEqual(len(mm3), N)

        # we get the occasional dupe randomly,
        # make sure we don't get three dupes in a row
        self.assertTrue((tuple(mm2) != tuple(mm1)) or (tuple(mm3) != tuple(mm1)))

        picker = None
        nvs = len(vs)
        # flat list of distances for every pair (i, j) with i < j
        ds = [taniFunc(i, j) for i in range(nvs) for j in range(i + 1, nvs)]
        m = numpy.array(ds)
        picker = rdSimDivPickers.HierarchicalClusterPicker(rdSimDivPickers.ClusterMethod.WARD)
        # the result is not inspected: the call only has to complete
        picker.Pick(m, nvs, N)

    def testBitVectorMaxMin(self):
        """Lazy and bit-vector MaxMin picks agree for a fixed seed, cached or not."""
        sz = 100
        nbits = 200
        nBitsToSet = int(nbits * .1)
        N = 10
        vs = [self._randomBitVect(nbits, nBitsToSet) for _ in range(sz)]

        def func(i, j, bvs=vs):
            return DataStructs.TanimotoSimilarity(bvs[i], bvs[j], returnDistance=True)

        picker = rdSimDivPickers.MaxMinPicker()
        mm1 = picker.LazyPick(func, len(vs), N, seed=42)
        self.assertEqual(len(mm1), N)

        mm2 = picker.LazyPick(func, len(vs), N, useCache=False, seed=42)
        self.assertEqual(len(mm2), N)
        self.assertEqual(list(mm1), list(mm2))

        mm2 = picker.LazyBitVectorPick(vs, len(vs), N, seed=42)
        self.assertEqual(len(mm2), N)
        self.assertEqual(list(mm1), list(mm2))

        mm2 = picker.LazyBitVectorPick(vs, len(vs), N, useCache=False, seed=42)
        self.assertEqual(len(mm2), N)
        self.assertEqual(list(mm1), list(mm2))

    def testBitVectorMaxMin2(self):
        """Bit-vector MaxMin on a fixed set of 20-bit fingerprints gives known picks."""
        fps = [
            "11110010101000000000", "00000000000010010000", "11001010000000000001",
            "00100110101000001000", "01010110000100011001", "11000110101001000011",
            "00000000001100001111", "00011110110000001101", "00000011011110100010",
            "11000010110001000000", "00000100010000010000", "10000001000010110010",
            "00010010000000010100", "00011100100110101000", "10001001100110100000",
            "10000110100110010000", "00101110000101000000", "11011101100011100000",
            "10000110000100101000", "00101000100000010001", "01000001000010000000",
            "00101101010100000110", "10001000100110110001", "00011000010100000001",
            "00101000001000100011", "00010000100010011001", "01100001000100010001",
            "10000101000001101101", "00001000011001011000", "11110000100100100000",
            "10100110000000011010", "00110100010110010010", "00000000000001010010",
            "00100000000010100001", "11110011000010001000", "10110001010100001000",
            "00001100100110011011", "00010010100100001110", "10100101100010100010",
            "01100100010100000001", "10101110011100000000", "01011000000001000001",
            "00000011100110100010", "01100001010001001001", "00001000000001001100",
            "10011001110000000100", "10110000001001100100", "00011000000001001011",
            "11001011010001100010", "10010000000001001011", "00010000100111100000",
            "00001000001110001000", "11010000010001100110", "01101001100000111000",
            "01001000001110111000", "10000000000100010010", "11001000010010000000",
            "01010010000100110001", "00010001010100100001", "01110010000000010000",
            "10001010000011000001", "00000110000000100100", "00010000010001000000",
            "11101100011010000011", "00000010100001010001", "00010000110010000101",
            "00010001001000111001", "01000010001100100110", "00110110000000100001",
            "00100010010110110010", "01000000110011001111", "00011000001000110010",
            "01111010101000110100", "00001010000010110110", "00110011000011011010",
            "00111010111010000110", "00010011101010000011", "00000001011000010000",
            "00011011101110110000", "00010001101000000001", "00010000001010011010",
            "00000010100100100010", "00000010001011000100", "11010000000001011100",
            "00001000110101000001", "00000010000000110010", "10000000010011000001",
            "11110110100100010000", "10001111000110001001", "00100110000110000100",
            "00000100100000100100", "00110000101100010100", "00001010100000100000",
            "01011000000011000111", "00010000100001010001", "10000010100000010000",
            "00001000000000110010", "00001000101011010001", "00011110000100100000",
            "11001001010001010100",
        ]
        N = 5
        fps = [DataStructs.CreateFromBitString(x) for x in fps]
        picker = rdSimDivPickers.MaxMinPicker()
        mm1 = picker.LazyBitVectorPick(fps, len(fps), N, seed=42)
        self.assertEqual(len(mm1), N)
        self.assertEqual(list(mm1), [37, 1, 43, 38, 16])

        mm2 = picker.LazyBitVectorPick(fps, len(fps), N, useCache=False, seed=42)
        self.assertEqual(len(mm2), N)
        self.assertEqual(list(mm1), list(mm2))

    def testBitVectorMaxMin3(self):
        """Bit-vector MaxMin on the CYP fingerprints, with and without first picks."""
        fps = self._readCypFingerprints()
        mmp = rdSimDivPickers.MaxMinPicker()
        ids = list(mmp.LazyBitVectorPick(fps, len(fps), 20, seed=42))
        self.assertEqual(ids, self._CYP_MAXMIN_PICKS)

        # seeding with the first five expected picks must reproduce the full list
        ids = list(
            mmp.LazyBitVectorPick(fps, len(fps), 20, firstPicks=self._CYP_MAXMIN_PICKS[:5],
                                  seed=42))
        self.assertEqual(ids, self._CYP_MAXMIN_PICKS)

    def testBitVectorMaxMin4(self):
        """Threshold variant of the bit-vector MaxMin pick on the CYP fingerprints."""
        fps = self._readCypFingerprints()
        mmp = rdSimDivPickers.MaxMinPicker()

        # a threshold of -1.0 yields the same 20 picks as the plain pick
        ids, threshold = mmp.LazyBitVectorPickWithThreshold(fps, len(fps), 20, -1.0, seed=42)
        self.assertEqual(list(ids), self._CYP_MAXMIN_PICKS)
        self.assertAlmostEqual(threshold, 0.8977, 4)

        # with a threshold of 0.91 only the first 13 of those picks are returned
        ids, threshold = mmp.LazyBitVectorPickWithThreshold(fps, len(fps), 20, 0.91, seed=42)
        self.assertEqual(list(ids), self._CYP_MAXMIN_PICKS[:13])
        self.assertGreaterEqual(threshold, 0.91)

    def testBitVectorLeader1(self):
        """Leader picks on the CYP fingerprints respect the distance threshold."""
        fps = self._readCypFingerprints()
        mmp = rdSimDivPickers.LeaderPicker()

        thresh = 0.8
        ids = mmp.LazyBitVectorPick(fps, len(fps), thresh)
        self.assertEqual(len(ids), 146)
        self._assertPicksSeparated(fps, ids, thresh)

        thresh = 0.9
        ids = mmp.LazyBitVectorPick(fps, len(fps), thresh)
        self.assertEqual(len(ids), 14)
        self._assertPicksSeparated(fps, ids, thresh)

        # pickSize caps the number of picks returned
        ids = mmp.LazyBitVectorPick(fps, len(fps), thresh, pickSize=10)
        self.assertEqual(len(ids), 10)
        self._assertPicksSeparated(fps, ids, thresh)

    def testLazyLeader(self):
        """Leader pick driven by a Python distance callback."""
        pkr = rdSimDivPickers.LeaderPicker()

        def func(i, j):
            # distance between two items is the absolute difference of their indices
            if i == j:
                return 0.0
            if i < j:
                j, i = i, j
            return i - j

        lres = pkr.LazyPick(func, 100, 20)
        self.assertEqual(list(lres), [0, 21, 42, 63, 84])
if __name__ == '__main__':
    # Run every test in this module when the file is executed as a script.
    unittest.main()