# Mirror of https://github.com/rdkit/rdkit.git
# Synced 2026-06-03 21:44:30 +08:00
# Python, executable file, 327 lines, 12 KiB
import itertools
import os
import random
import unittest

import numpy

from rdkit import DataStructs, RDConfig
from rdkit.DataManip.Metric import rdMetricMatrixCalc as rdmmc
from rdkit.SimDivFilters import rdSimDivPickers


class TestCase(unittest.TestCase):
  """Tests for the Python wrappers of the pickers in rdSimDivPickers.

  Covers MaxMinPicker, HierarchicalClusterPicker and LeaderPicker, using both
  precomputed distance matrices and lazily evaluated distance callbacks.
  """

  def setUp(self):
    """Create self.n random points in self.d dimensions plus their distance matrix."""
    self.n = 1000
    self.m = 80
    self.d = 2
    # random.random() is drawn in row-major order, one call per coordinate.
    self.dataPts = numpy.array(
      [[random.random() for _ in range(self.d)] for _ in range(self.n)], 'd')
    self.dMat = rdmmc.GetEuclideanDistMat(self.dataPts)

  def _loadChemblFps(self):
    """Return the fingerprints parsed from test_data/chembl_cyps.head.fps, one per line."""
    fname = os.path.join(RDConfig.RDBaseDir, 'Code', 'SimDivPickers', 'Wrap', 'test_data',
                         'chembl_cyps.head.fps')
    with open(fname) as infil:
      return [DataStructs.CreateFromFPSText(line.strip()) for line in infil]

  def _assertMinDistance(self, fps, ids, thresh):
    """Assert that every pair of picked fingerprints is at least `thresh` apart.

    The distance used is 1 - TanimotoSimilarity, evaluated once per unordered pair.
    """
    for a, b in itertools.combinations(ids, 2):
      self.assertGreaterEqual(1 - DataStructs.TanimotoSimilarity(fps[a], fps[b]), thresh)

  def test0MaxMin(self):
    """Pick and LazyPick must agree, and invalid first picks must raise ValueError."""
    pkr = rdSimDivPickers.MaxMinPicker()
    firstPicks = (886, 112)
    maxmin = pkr.Pick(self.dMat, self.n, self.m, firstPicks)
    # the requested first picks come back at the head of the result
    self.assertEqual(maxmin[0], 886)
    self.assertEqual(maxmin[1], 112)

    def func(i, j):
      # Look the distance up in self.dMat: entry (i, j) with i > j is read from
      # flat position i * (i - 1) // 2 + j; the diagonal is zero.
      if i == j:
        return 0.0
      if i < j:
        j, i = i, j
      return self.dMat[i * (i - 1) // 2 + j]

    lmaxmin = pkr.LazyPick(func, self.n, self.m, firstPicks)
    self.assertEqual(list(lmaxmin), list(maxmin))

    lmaxmin = pkr.LazyPick(func, self.n, self.m, firstPicks, useCache=False)
    self.assertEqual(list(lmaxmin), list(maxmin))

    # first picks outside [0, self.n) are rejected
    for badPick in (1012, -1):
      with self.assertRaises(ValueError):
        pkr.Pick(self.dMat, self.n, self.m, (badPick, ))

    # without explicit first picks we only check that something is returned
    maxmin = pkr.Pick(self.dMat, self.n, self.m)
    self.assertTrue(maxmin)
    lmaxmin = pkr.LazyPick(func, self.n, self.m)
    self.assertTrue(lmaxmin)

  def test1HierarchPick(self):
    """Ward clustering of the labelled points in points.csv must reproduce the labels."""
    fname = os.path.join(RDConfig.RDBaseDir, 'Code', 'SimDivPickers', 'Wrap', 'test_data',
                         'points.csv')
    with open(fname) as infil:
      lines = infil.readlines()
    nPts = len(lines)
    self.dataPts = numpy.zeros((nPts, 2), 'd')
    labels = []
    # columns 1 and 2 hold the coordinates, column 3 the integer label
    for row, line in enumerate(lines):
      tlst = line.strip().split(',')
      self.dataPts[row, 0] = float(tlst[1])
      self.dataPts[row, 1] = float(tlst[2])
      labels.append(int(tlst[3]))
    self.dMat = rdmmc.GetEuclideanDistMat(self.dataPts)
    pkr = rdSimDivPickers.HierarchicalClusterPicker(rdSimDivPickers.ClusterMethod.WARD)
    clusters = pkr.Cluster(self.dMat, nPts, 2)
    # check that each of the clusters have the same label
    for cl in clusters:
      clbl = labels[cl[0]]
      for idx in cl:
        # a unittest assertion (not a bare assert) so the check survives python -O
        self.assertEqual(clbl, labels[idx])
    hierarch = pkr.Pick(self.dMat, nPts, 2)
    self.assertEqual(tuple(hierarch), (1, 30))

  def testIssue208(self):
    """Two identical Pick calls on a random float matrix must return the same picks."""
    sz = 10
    N = 3
    # one random value per unordered pair (i, j) with i < j
    m = numpy.array([random.random() for i in range(sz) for j in range(i + 1, sz)])
    picker = rdSimDivPickers.HierarchicalClusterPicker(rdSimDivPickers.ClusterMethod.WARD)
    p1 = sorted(picker.Pick(m, sz, N))
    p2 = sorted(picker.Pick(m, sz, N))
    self.assertEqual(p1, p2)

  def testInts(self):
    """ make sure we can handle ints too """
    sz = 10
    N = 3
    # same layout as testIssue208, but with integer-valued entries
    m = numpy.array([int(100 * random.random()) for i in range(sz) for j in range(i + 1, sz)])
    picker = rdSimDivPickers.HierarchicalClusterPicker(rdSimDivPickers.ClusterMethod.WARD)
    p1 = sorted(picker.Pick(m, sz, N))
    p2 = sorted(picker.Pick(m, sz, N))
    self.assertEqual(p1, p2)

  def testNonUniqueCrash(self):
    """Picking from a pool in which every bit vector appears twice must not crash."""
    sz = 300
    nbits = 40
    nBitsToSet = int(nbits * .3)
    N = 8
    vs = []
    for i in range(sz):
      bv = DataStructs.ExplicitBitVect(nbits)
      for j in range(nBitsToSet):
        val = int(nbits * random.random())
        bv.SetBit(val)
      # deliberately add the same vector twice so the pool contains duplicates
      vs.append(bv)
      vs.append(bv)

    def taniFunc(i, j, bvs=vs):
      d = 1 - DataStructs.FingerprintSimilarity(bvs[i], bvs[j])
      return d

    picker = rdSimDivPickers.MaxMinPicker()
    mm1 = picker.LazyPick(taniFunc, len(vs), N)
    self.assertEqual(len(mm1), N)
    picker = None

    picker = rdSimDivPickers.MaxMinPicker()
    mm2 = picker.LazyBitVectorPick(vs, len(vs), N)
    self.assertEqual(len(mm2), N)

    picker = rdSimDivPickers.MaxMinPicker()
    mm3 = picker.LazyBitVectorPick(vs, len(vs), N)
    self.assertEqual(len(mm3), N)

    # we get the occasional dupe randomly,
    # make sure we don't get three dupes in a row
    # (the whole disjunction has to sit inside assertTrue; previously only the
    # first comparison was asserted and the second was silently discarded)
    self.assertTrue((tuple(mm2) != tuple(mm1)) or (tuple(mm3) != tuple(mm1)))
    picker = None

    nvs = len(vs)
    ds = [taniFunc(i, j) for i in range(nvs) for j in range(i + 1, nvs)]
    m = numpy.array(ds)
    picker = rdSimDivPickers.HierarchicalClusterPicker(rdSimDivPickers.ClusterMethod.WARD)
    # only checking that this runs; the picks themselves are not inspected
    list(picker.Pick(m, nvs, N))

  def testBitVectorMaxMin(self):
    """With a fixed seed, LazyPick and LazyBitVectorPick must give identical picks."""
    sz = 100
    nbits = 200
    nBitsToSet = int(nbits * .1)
    N = 10
    vs = []
    for i in range(sz):
      bv = DataStructs.ExplicitBitVect(nbits)
      for j in range(nBitsToSet):
        val = int(nbits * random.random())
        bv.SetBit(val)
      vs.append(bv)

    def func(i, j, bvs=vs):
      d = DataStructs.TanimotoSimilarity(bvs[i], bvs[j], returnDistance=True)
      return d

    picker = rdSimDivPickers.MaxMinPicker()
    mm1 = picker.LazyPick(func, len(vs), N, seed=42)
    self.assertEqual(len(mm1), N)

    mm2 = picker.LazyPick(func, len(vs), N, useCache=False, seed=42)
    self.assertEqual(len(mm2), N)
    self.assertEqual(list(mm1), list(mm2))

    mm2 = picker.LazyBitVectorPick(vs, len(vs), N, seed=42)
    self.assertEqual(len(mm2), N)
    self.assertEqual(list(mm1), list(mm2))

    mm2 = picker.LazyBitVectorPick(vs, len(vs), N, useCache=False, seed=42)
    self.assertEqual(len(mm2), N)
    self.assertEqual(list(mm1), list(mm2))

  def testBitVectorMaxMin2(self):
    """Regression test: fixed fingerprints and seed must give a fixed set of picks."""
    bitStrings = [
      "11110010101000000000", "00000000000010010000", "11001010000000000001",
      "00100110101000001000", "01010110000100011001", "11000110101001000011",
      "00000000001100001111", "00011110110000001101", "00000011011110100010",
      "11000010110001000000", "00000100010000010000", "10000001000010110010",
      "00010010000000010100", "00011100100110101000", "10001001100110100000",
      "10000110100110010000", "00101110000101000000", "11011101100011100000",
      "10000110000100101000", "00101000100000010001", "01000001000010000000",
      "00101101010100000110", "10001000100110110001", "00011000010100000001",
      "00101000001000100011", "00010000100010011001", "01100001000100010001",
      "10000101000001101101", "00001000011001011000", "11110000100100100000",
      "10100110000000011010", "00110100010110010010", "00000000000001010010",
      "00100000000010100001", "11110011000010001000", "10110001010100001000",
      "00001100100110011011", "00010010100100001110", "10100101100010100010",
      "01100100010100000001", "10101110011100000000", "01011000000001000001",
      "00000011100110100010", "01100001010001001001", "00001000000001001100",
      "10011001110000000100", "10110000001001100100", "00011000000001001011",
      "11001011010001100010", "10010000000001001011", "00010000100111100000",
      "00001000001110001000", "11010000010001100110", "01101001100000111000",
      "01001000001110111000", "10000000000100010010", "11001000010010000000",
      "01010010000100110001", "00010001010100100001", "01110010000000010000",
      "10001010000011000001", "00000110000000100100", "00010000010001000000",
      "11101100011010000011", "00000010100001010001", "00010000110010000101",
      "00010001001000111001", "01000010001100100110", "00110110000000100001",
      "00100010010110110010", "01000000110011001111", "00011000001000110010",
      "01111010101000110100", "00001010000010110110", "00110011000011011010",
      "00111010111010000110", "00010011101010000011", "00000001011000010000",
      "00011011101110110000", "00010001101000000001", "00010000001010011010",
      "00000010100100100010", "00000010001011000100", "11010000000001011100",
      "00001000110101000001", "00000010000000110010", "10000000010011000001",
      "11110110100100010000", "10001111000110001001", "00100110000110000100",
      "00000100100000100100", "00110000101100010100", "00001010100000100000",
      "01011000000011000111", "00010000100001010001", "10000010100000010000",
      "00001000000000110010", "00001000101011010001", "00011110000100100000", "11001001010001010100"
    ]
    N = 5
    fps = [DataStructs.CreateFromBitString(x) for x in bitStrings]
    picker = rdSimDivPickers.MaxMinPicker()
    mm1 = picker.LazyBitVectorPick(fps, len(fps), N, seed=42)
    self.assertEqual(len(mm1), N)
    self.assertEqual(list(mm1), [37, 1, 43, 38, 16])

    # disabling the cache must not change the result
    mm2 = picker.LazyBitVectorPick(fps, len(fps), N, useCache=False, seed=42)
    self.assertEqual(len(mm2), N)
    self.assertEqual(list(mm1), list(mm2))

  def testBitVectorMaxMin3(self):
    """Seeded picks on the ChEMBL fingerprints, with and without explicit firstPicks."""
    fps = self._loadChemblFps()
    expected = [
      374, 720, 690, 339, 875, 842, 404, 725, 120, 385, 115, 868, 630, 881, 516, 497, 412, 718, 869,
      407
    ]
    mmp = rdSimDivPickers.MaxMinPicker()
    ids = list(mmp.LazyBitVectorPick(fps, len(fps), 20, seed=42))
    self.assertEqual(ids, expected)

    # supplying the first five results as firstPicks must lead to the same full list
    ids = list(
      mmp.LazyBitVectorPick(fps, len(fps), 20, firstPicks=[374, 720, 690, 339, 875], seed=42))
    self.assertEqual(ids, expected)

  def testBitVectorMaxMin4(self):
    """Threshold tests for LazyBitVectorPickWithThreshold."""
    fps = self._loadChemblFps()
    mmp = rdSimDivPickers.MaxMinPicker()
    # a threshold of -1.0 yields the full 20 picks (same list as the plain seeded pick)
    ids, threshold = mmp.LazyBitVectorPickWithThreshold(fps, len(fps), 20, -1.0, seed=42)
    self.assertEqual(list(ids), [
      374, 720, 690, 339, 875, 842, 404, 725, 120, 385, 115, 868, 630, 881, 516, 497, 412, 718, 869,
      407
    ])

    self.assertAlmostEqual(threshold, 0.8977, 4)

    # with a threshold of 0.91 only the first 13 of those picks are returned
    ids, threshold = mmp.LazyBitVectorPickWithThreshold(fps, len(fps), 20, 0.91, seed=42)
    self.assertEqual(list(ids), [374, 720, 690, 339, 875, 842, 404, 725, 120, 385, 115, 868, 630])
    self.assertGreaterEqual(threshold, 0.91)

  def testBitVectorLeader1(self):
    """Threshold tests for LeaderPicker.LazyBitVectorPick."""
    fps = self._loadChemblFps()
    picker = rdSimDivPickers.LeaderPicker()

    thresh = 0.8
    ids = picker.LazyBitVectorPick(fps, len(fps), thresh)
    self.assertEqual(len(ids), 146)
    self._assertMinDistance(fps, ids, thresh)

    thresh = 0.9
    ids = picker.LazyBitVectorPick(fps, len(fps), thresh)
    self.assertEqual(len(ids), 14)
    self._assertMinDistance(fps, ids, thresh)

    # pickSize caps the number of results
    ids = picker.LazyBitVectorPick(fps, len(fps), thresh, pickSize=10)
    self.assertEqual(len(ids), 10)
    self._assertMinDistance(fps, ids, thresh)

  def testLazyLeader(self):
    """LeaderPicker.LazyPick with a distance callback equal to the index difference."""
    pkr = rdSimDivPickers.LeaderPicker()

    def func(i, j):
      # distance between items is the absolute difference of their indices
      if i == j:
        return 0.0
      if i < j:
        j, i = i, j
      return i - j

    # NOTE(review): the third argument (20) is presumably the distance threshold,
    # by analogy with LazyBitVectorPick(fps, len(fps), thresh) - confirm.
    lres = pkr.LazyPick(func, 100, 20)
    self.assertEqual(list(lres), [0, 21, 42, 63, 84])


if __name__ == '__main__':
  # Run every test in this module when the file is executed as a script.
  unittest.main()