Files
rdkit/Code/SimDivPickers/Wrap/testPickers.py
Ric 880a8e5725 Reformat Python code for 2023.03 release (#6294)
* run yapf

* run isort

---------

Co-authored-by: Greg Landrum <greg.landrum@gmail.com>
2023-04-28 06:53:56 +02:00

327 lines
12 KiB
Python
Executable File

import os
import random
import unittest
import numpy
from rdkit import DataStructs, RDConfig
from rdkit.DataManip.Metric import rdMetricMatrixCalc as rdmmc
from rdkit.SimDivFilters import rdSimDivPickers
class TestCase(unittest.TestCase):
    """Tests for the pickers exposed by ``rdSimDivPickers``.

    Exercises ``MaxMinPicker``, ``HierarchicalClusterPicker`` and
    ``LeaderPicker`` through their distance-matrix, lazy-callback and
    bit-vector entry points.
    """

    # Picks expected from MaxMinPicker on ``chembl_cyps.head.fps`` with seed=42.
    _CYP_PICKS = [
        374, 720, 690, 339, 875, 842, 404, 725, 120, 385,
        115, 868, 630, 881, 516, 497, 412, 718, 869, 407,
    ]

    @staticmethod
    def _testDataPath(name):
        """Return the path of ``name`` inside this module's ``test_data`` directory."""
        return os.path.join(RDConfig.RDBaseDir, 'Code', 'SimDivPickers', 'Wrap', 'test_data', name)

    def _loadCypFps(self):
        """Read ``chembl_cyps.head.fps`` and return one fingerprint per line."""
        with open(self._testDataPath('chembl_cyps.head.fps')) as infil:
            return [DataStructs.CreateFromFPSText(line.strip()) for line in infil]

    def _assertPairwiseDistanceAtLeast(self, fps, ids, thresh):
        """Assert every pair of picked fingerprints is at least ``thresh`` apart.

        The distance used is ``1 - TanimotoSimilarity``.
        """
        for i in range(len(ids)):
            for j in range(i):
                dist = 1 - DataStructs.TanimotoSimilarity(fps[ids[i]], fps[ids[j]])
                self.assertGreaterEqual(dist, thresh)

    def setUp(self):
        """Create ``n`` random ``d``-dimensional points and their distance matrix."""
        self.n = 1000
        self.m = 80
        self.d = 2
        self.dataPts = numpy.zeros((self.n, self.d), 'd')
        for i in range(self.n):
            for j in range(self.d):
                self.dataPts[i, j] = random.random()
        self.dMat = rdmmc.GetEuclideanDistMat(self.dataPts)

    def test0MaxMin(self):
        """MaxMin picking: ``Pick`` and ``LazyPick`` agree and bad seeds are rejected."""
        pkr = rdSimDivPickers.MaxMinPicker()
        firstPicks = (886, 112)
        maxmin = pkr.Pick(self.dMat, self.n, self.m, firstPicks)
        self.assertEqual(maxmin[0], 886)
        self.assertEqual(maxmin[1], 112)

        def func(i, j):
            # Look the distance up in the flat matrix built in setUp, using the
            # larger index as the row: offset = row * (row - 1) // 2 + col.
            if i == j:
                return 0.0
            if i < j:
                j, i = i, j
            return self.dMat[i * (i - 1) // 2 + j]

        lmaxmin = pkr.LazyPick(func, self.n, self.m, firstPicks)
        self.assertEqual(list(lmaxmin), list(maxmin))

        lmaxmin = pkr.LazyPick(func, self.n, self.m, firstPicks, useCache=False)
        self.assertEqual(list(lmaxmin), list(maxmin))

        # First-pick indices outside 0..n-1 must raise.
        with self.assertRaises(ValueError):
            pkr.Pick(self.dMat, self.n, self.m, (1012, ))
        with self.assertRaises(ValueError):
            pkr.Pick(self.dMat, self.n, self.m, (-1, ))

        # Without first picks we only check that something is returned.
        maxmin = pkr.Pick(self.dMat, self.n, self.m)
        self.assertTrue(maxmin)
        lmaxmin = pkr.LazyPick(func, self.n, self.m)
        self.assertTrue(lmaxmin)

    def test1HierarchPick(self):
        """Ward clustering of ``points.csv`` reproduces the labels stored in the file."""
        with open(self._testDataPath('points.csv')) as infil:
            lines = infil.readlines()

        nPts = len(lines)
        self.dataPts = numpy.zeros((nPts, 2), 'd')
        labels = []
        for row, line in enumerate(lines):
            # Columns 1 and 2 hold the coordinates, column 3 the expected label.
            tlst = line.strip().split(',')
            self.dataPts[row, 0] = float(tlst[1])
            self.dataPts[row, 1] = float(tlst[2])
            labels.append(int(tlst[3]))
        self.dMat = rdmmc.GetEuclideanDistMat(self.dataPts)

        pkr = rdSimDivPickers.HierarchicalClusterPicker(rdSimDivPickers.ClusterMethod.WARD)
        clusters = pkr.Cluster(self.dMat, nPts, 2)
        # check that all members of each cluster share the same label
        for cl in clusters:
            clbl = labels[cl[0]]
            for idx in cl:
                self.assertEqual(clbl, labels[idx])

        hierarch = pkr.Pick(self.dMat, nPts, 2)
        self.assertEqual(tuple(hierarch), (1, 30))

    def testIssue208(self):
        """Picking twice from the same matrix with one picker gives the same set."""
        sz = 10
        N = 3
        m = numpy.array([random.random() for i in range(sz) for j in range(i + 1, sz)])
        picker = rdSimDivPickers.HierarchicalClusterPicker(rdSimDivPickers.ClusterMethod.WARD)
        p1 = sorted(picker.Pick(m, sz, N))
        p2 = sorted(picker.Pick(m, sz, N))
        self.assertEqual(p1, p2)

    def testInts(self):
        """ make sure we can handle ints too """
        sz = 10
        N = 3
        m = numpy.array(
            [int(100 * random.random()) for i in range(sz) for j in range(i + 1, sz)])
        picker = rdSimDivPickers.HierarchicalClusterPicker(rdSimDivPickers.ClusterMethod.WARD)
        p1 = sorted(picker.Pick(m, sz, N))
        p2 = sorted(picker.Pick(m, sz, N))
        self.assertEqual(p1, p2)

    def testNonUniqueCrash(self):
        """Pickers cope with an input in which every fingerprint appears twice."""
        sz = 300
        nbits = 40
        nBitsToSet = int(nbits * .3)
        N = 8
        vs = []
        for i in range(sz):
            bv = DataStructs.ExplicitBitVect(nbits)
            for j in range(nBitsToSet):
                val = int(nbits * random.random())
                bv.SetBit(val)
            # Each vector is appended twice so the pool contains duplicates.
            vs.append(bv)
            vs.append(bv)

        def taniFunc(i, j, bvs=vs):
            d = 1 - DataStructs.FingerprintSimilarity(bvs[i], bvs[j])
            return d

        picker = rdSimDivPickers.MaxMinPicker()
        mm1 = picker.LazyPick(taniFunc, len(vs), N)
        self.assertEqual(len(mm1), N)
        picker = None

        picker = rdSimDivPickers.MaxMinPicker()
        mm2 = picker.LazyBitVectorPick(vs, len(vs), N)
        self.assertEqual(len(mm2), N)

        picker = rdSimDivPickers.MaxMinPicker()
        mm3 = picker.LazyBitVectorPick(vs, len(vs), N)
        self.assertEqual(len(mm3), N)

        # we get the occasional dupe randomly,
        # make sure we don't get three dupes in a row
        # (both comparisons must be inside assertTrue; previously the second one
        # sat outside the call and was never checked)
        self.assertTrue(tuple(mm2) != tuple(mm1) or tuple(mm3) != tuple(mm1))
        picker = None

        nvs = len(vs)
        ds = [taniFunc(i, j) for i in range(nvs) for j in range(i + 1, nvs)]
        m = numpy.array(ds)
        picker = rdSimDivPickers.HierarchicalClusterPicker(rdSimDivPickers.ClusterMethod.WARD)
        # NOTE(review): the result is not inspected; presumably the point is only
        # that this call completes on duplicated input - confirm.
        list(picker.Pick(m, nvs, N))

    def testBitVectorMaxMin(self):
        """``LazyPick`` and ``LazyBitVectorPick`` agree for a fixed seed, cached or not."""
        sz = 100
        nbits = 200
        nBitsToSet = int(nbits * .1)
        N = 10
        vs = []
        for i in range(sz):
            bv = DataStructs.ExplicitBitVect(nbits)
            for j in range(nBitsToSet):
                val = int(nbits * random.random())
                bv.SetBit(val)
            vs.append(bv)

        def func(i, j, bvs=vs):
            d = DataStructs.TanimotoSimilarity(bvs[i], bvs[j], returnDistance=True)
            return d

        picker = rdSimDivPickers.MaxMinPicker()
        mm1 = picker.LazyPick(func, len(vs), N, seed=42)
        self.assertEqual(len(mm1), N)

        mm2 = picker.LazyPick(func, len(vs), N, useCache=False, seed=42)
        self.assertEqual(len(mm2), N)
        self.assertEqual(list(mm1), list(mm2))

        mm2 = picker.LazyBitVectorPick(vs, len(vs), N, seed=42)
        self.assertEqual(len(mm2), N)
        self.assertEqual(list(mm1), list(mm2))

        mm2 = picker.LazyBitVectorPick(vs, len(vs), N, useCache=False, seed=42)
        self.assertEqual(len(mm2), N)
        self.assertEqual(list(mm1), list(mm2))

    def testBitVectorMaxMin2(self):
        """``LazyBitVectorPick`` returns the recorded picks for a fixed fingerprint set."""
        fps = [
            "11110010101000000000", "00000000000010010000", "11001010000000000001",
            "00100110101000001000", "01010110000100011001", "11000110101001000011",
            "00000000001100001111", "00011110110000001101", "00000011011110100010",
            "11000010110001000000", "00000100010000010000", "10000001000010110010",
            "00010010000000010100", "00011100100110101000", "10001001100110100000",
            "10000110100110010000", "00101110000101000000", "11011101100011100000",
            "10000110000100101000", "00101000100000010001", "01000001000010000000",
            "00101101010100000110", "10001000100110110001", "00011000010100000001",
            "00101000001000100011", "00010000100010011001", "01100001000100010001",
            "10000101000001101101", "00001000011001011000", "11110000100100100000",
            "10100110000000011010", "00110100010110010010", "00000000000001010010",
            "00100000000010100001", "11110011000010001000", "10110001010100001000",
            "00001100100110011011", "00010010100100001110", "10100101100010100010",
            "01100100010100000001", "10101110011100000000", "01011000000001000001",
            "00000011100110100010", "01100001010001001001", "00001000000001001100",
            "10011001110000000100", "10110000001001100100", "00011000000001001011",
            "11001011010001100010", "10010000000001001011", "00010000100111100000",
            "00001000001110001000", "11010000010001100110", "01101001100000111000",
            "01001000001110111000", "10000000000100010010", "11001000010010000000",
            "01010010000100110001", "00010001010100100001", "01110010000000010000",
            "10001010000011000001", "00000110000000100100", "00010000010001000000",
            "11101100011010000011", "00000010100001010001", "00010000110010000101",
            "00010001001000111001", "01000010001100100110", "00110110000000100001",
            "00100010010110110010", "01000000110011001111", "00011000001000110010",
            "01111010101000110100", "00001010000010110110", "00110011000011011010",
            "00111010111010000110", "00010011101010000011", "00000001011000010000",
            "00011011101110110000", "00010001101000000001", "00010000001010011010",
            "00000010100100100010", "00000010001011000100", "11010000000001011100",
            "00001000110101000001", "00000010000000110010", "10000000010011000001",
            "11110110100100010000", "10001111000110001001", "00100110000110000100",
            "00000100100000100100", "00110000101100010100", "00001010100000100000",
            "01011000000011000111", "00010000100001010001", "10000010100000010000",
            "00001000000000110010", "00001000101011010001", "00011110000100100000",
            "11001001010001010100",
        ]
        N = 5
        fps = [DataStructs.CreateFromBitString(x) for x in fps]
        picker = rdSimDivPickers.MaxMinPicker()
        mm1 = picker.LazyBitVectorPick(fps, len(fps), N, seed=42)
        self.assertEqual(len(mm1), N)
        self.assertEqual(list(mm1), [37, 1, 43, 38, 16])

        mm2 = picker.LazyBitVectorPick(fps, len(fps), N, useCache=False, seed=42)
        self.assertEqual(len(mm2), N)
        self.assertEqual(list(mm1), list(mm2))

    def testBitVectorMaxMin3(self):
        """Supplying the first five expected picks as ``firstPicks`` gives the same result."""
        fps = self._loadCypFps()
        mmp = rdSimDivPickers.MaxMinPicker()
        ids = list(mmp.LazyBitVectorPick(fps, len(fps), 20, seed=42))
        self.assertEqual(ids, self._CYP_PICKS)

        ids = list(
            mmp.LazyBitVectorPick(fps, len(fps), 20, firstPicks=[374, 720, 690, 339, 875],
                                  seed=42))
        self.assertEqual(ids, self._CYP_PICKS)

    def testBitVectorMaxMin4(self):
        """Threshold tests for ``LazyBitVectorPickWithThreshold``."""
        fps = self._loadCypFps()
        mmp = rdSimDivPickers.MaxMinPicker()

        # With a threshold of -1.0 all 20 requested picks come back.
        ids, threshold = mmp.LazyBitVectorPickWithThreshold(fps, len(fps), 20, -1.0, seed=42)
        self.assertEqual(list(ids), self._CYP_PICKS)
        self.assertAlmostEqual(threshold, 0.8977, 4)

        # With a threshold of 0.91 only the first 13 of those picks come back.
        ids, threshold = mmp.LazyBitVectorPickWithThreshold(fps, len(fps), 20, 0.91, seed=42)
        self.assertEqual(list(ids),
                         [374, 720, 690, 339, 875, 842, 404, 725, 120, 385, 115, 868, 630])
        self.assertGreaterEqual(threshold, 0.91)

    def testBitVectorLeader1(self):
        """Threshold tests for ``LeaderPicker.LazyBitVectorPick``."""
        fps = self._loadCypFps()
        mmp = rdSimDivPickers.LeaderPicker()

        thresh = 0.8
        ids = mmp.LazyBitVectorPick(fps, len(fps), thresh)
        self.assertEqual(len(ids), 146)
        self._assertPairwiseDistanceAtLeast(fps, ids, thresh)

        thresh = 0.9
        ids = mmp.LazyBitVectorPick(fps, len(fps), thresh)
        self.assertEqual(len(ids), 14)
        self._assertPairwiseDistanceAtLeast(fps, ids, thresh)

        # pickSize caps the number of picks returned.
        ids = mmp.LazyBitVectorPick(fps, len(fps), thresh, pickSize=10)
        self.assertEqual(len(ids), 10)
        self._assertPairwiseDistanceAtLeast(fps, ids, thresh)

    def testLazyLeader(self):
        """``LeaderPicker.LazyPick`` with a callback returning the index difference."""
        pkr = rdSimDivPickers.LeaderPicker()

        def func(i, j):
            if i == j:
                return 0.0
            if i < j:
                j, i = i, j
            return i - j

        lres = pkr.LazyPick(func, 100, 20)
        self.assertEqual(list(lres), [0, 21, 42, 63, 84])
# Run the test suite when this file is executed as a script.
if __name__ == '__main__':
    unittest.main()