Files
D-SCRIPT/dscript/tests/test_alphabets.py
2025-07-21 14:21:48 -04:00

297 lines
9.0 KiB
Python

import numpy as np
from dscript.alphabets import DNA, SDM12, Alphabet, SecStr8, Uniprot21
class TestAlphabet:
    """Exercise construction, indexing, encoding and k-mer helpers of ``Alphabet``."""

    def test_alphabet_init_default(self):
        """A four-character alphabet built with defaults has size 4 and no mask."""
        dna = Alphabet(b"ACGT")

        assert len(dna) == 4
        assert dna.size == 4
        assert not dna.mask
        assert dna.chars.dtype == np.uint8

    def test_alphabet_init_with_encoding(self):
        """Passing an explicit encoding array still yields a size-4 alphabet."""
        reversed_codes = np.array([3, 2, 1, 0])
        dna = Alphabet(b"ACGT", encoding=reversed_codes)

        assert len(dna) == 4
        assert dna.size == 4

    def test_alphabet_init_with_mask(self):
        """With ``mask=True`` the reported length is one less than the character count."""
        masked = Alphabet(b"ACGT", mask=True)

        # Four characters were supplied; masking is expected to shrink len() to 3.
        assert len(masked) == 3
        assert masked.mask

    def test_alphabet_getitem(self):
        """Indexing position ``i`` returns the i-th character as a ``str``."""
        dna = Alphabet(b"ACGT")

        for position, letter in enumerate("ACGT"):
            assert dna[position] == letter

    def test_alphabet_encode_basic(self):
        """Characters encode to their position in the alphabet."""
        dna = Alphabet(b"ACGT")

        # A single character encodes to a one-element result.
        single = dna.encode(b"A")
        assert single[0] == 0

        # A full sequence encodes position by position.
        np.testing.assert_array_equal(dna.encode(b"ACGT"), [0, 1, 2, 3])

    def test_alphabet_encode_missing_char(self):
        """A character outside the alphabet encodes to the ``missing`` value."""
        # "T" is deliberately absent from this alphabet.
        partial = Alphabet(b"ACG", missing=99)

        codes = partial.encode(b"ACGT")

        np.testing.assert_array_equal(codes, [0, 1, 2, 99])

    def test_alphabet_decode(self):
        """Decoding an encoded sequence returns the original byte string."""
        dna = Alphabet(b"ACGT")

        round_tripped = dna.decode(dna.encode(b"ACGT"))

        assert round_tripped == b"ACGT"

    def test_alphabet_unpack(self):
        """``unpack(h, k)`` yields the k base-``size`` digits of the integer ``h``."""
        dna = Alphabet(b"ACGT")

        # Zero unpacks to all-zero digits.
        np.testing.assert_array_equal(
            dna.unpack(0, 3),
            np.array([0, 0, 0], dtype=np.uint8),
        )

        # 5 == 1 * 4 + 1, i.e. digits [1, 1] in base 4.
        np.testing.assert_array_equal(
            dna.unpack(5, 2),
            np.array([1, 1], dtype=np.uint8),
        )

    def test_alphabet_get_kmer(self):
        """``get_kmer(h, k)`` returns the unpacked digits decoded to bytes."""
        dna = Alphabet(b"ACGT")

        # 0 -> digits [0, 0] -> b"AA"; 5 -> digits [1, 1] -> b"CC".
        for packed, expected_kmer in ((0, b"AA"), (5, b"CC")):
            assert dna.get_kmer(packed, 2) == expected_kmer
class TestDNA:
    """Checks for the module-level ``DNA`` alphabet instance."""

    def test_dna_alphabet(self):
        """``DNA`` has four letters in A, C, G, T order and round-trips a sequence."""
        assert len(DNA) == 4
        for position, letter in enumerate("ACGT"):
            assert DNA[position] == letter

        # Encoding maps each letter to its position.
        codes = DNA.encode(b"ACGT")
        np.testing.assert_array_equal(codes, [0, 1, 2, 3])

        # Decoding the codes restores the input bytes.
        assert DNA.decode(codes) == b"ACGT"
class TestUniprot21:
    """Checks for the ``Uniprot21`` amino-acid alphabet."""

    def test_uniprot21_init_default(self):
        """Default construction reports 21 symbols and no mask."""
        amino = Uniprot21()

        assert len(amino) == 21
        assert not amino.mask

    def test_uniprot21_init_with_mask(self):
        """With ``mask=True`` the reported length drops from 21 to 20."""
        amino = Uniprot21(mask=True)

        assert len(amino) == 20
        assert amino.mask

    def test_uniprot21_synonym_encoding(self):
        """Non-standard letters share the code of their designated stand-in."""
        amino = Uniprot21()

        def code_of(letter):
            # Encode a one-byte string and take its single code.
            return amino.encode(letter)[0]

        # 'O' is expected to encode the same as 'K'.
        assert code_of(b"O") == code_of(b"K")

        # 'U' is expected to encode the same as 'C'.
        assert code_of(b"U") == code_of(b"C")

        # 'B' and 'Z' are both expected to encode the same as 'X'.
        b_code, z_code, x_code = (code_of(c) for c in (b"B", b"Z", b"X"))
        assert b_code == x_code
        assert z_code == x_code

    def test_uniprot21_standard_amino_acids(self):
        """The 21 listed letters encode to 0..20 in the order given."""
        amino = Uniprot21()

        codes = amino.encode(b"ARNDCQEGHILKMFPSTWYVX")

        np.testing.assert_array_equal(codes, np.arange(21))
class TestSDM12:
    """Checks for the ``SDM12`` reduced amino-acid alphabet."""

    def test_sdm12_init_default(self):
        """Default construction reports 13 symbols and no mask."""
        reduced = SDM12()

        assert len(reduced) == 13
        assert not reduced.mask

    def test_sdm12_init_with_mask(self):
        """With ``mask=True`` the reported length drops from 13 to 12."""
        reduced = SDM12(mask=True)

        assert len(reduced) == 12
        assert reduced.mask

    def test_sdm12_grouping(self):
        """Amino acids belonging to the same group encode to the same value."""
        reduced = SDM12()

        def code_of(letter):
            # Encode a one-byte string and take its single code.
            return reduced.encode(letter)[0]

        # K, E, R and O are expected to share one code.
        k, e, r, o = (code_of(c) for c in (b"K", b"E", b"R", b"O"))
        assert k == e == r == o

        # T, S and Q are expected to share one code.
        t, s, q = (code_of(c) for c in (b"T", b"S", b"Q"))
        assert t == s == q

        # Y and F are expected to share one code.
        y, f = (code_of(c) for c in (b"Y", b"F"))
        assert y == f

        # L, I, V and M are expected to share one code.
        l_code, i_code, v_code, m_code = (
            code_of(c) for c in (b"L", b"I", b"V", b"M")
        )
        assert l_code == i_code == v_code == m_code
class TestSecStr8:
    """Checks for the module-level ``SecStr8`` alphabet instance."""

    def test_secstr8_alphabet(self):
        """``SecStr8`` holds the eight symbols ``HBEGITS `` in order and round-trips them."""
        symbols = b"HBEGITS "

        assert len(SecStr8) == 8

        # Iterating bytes yields ints, so convert each to str before comparing.
        for position, byte_value in enumerate(symbols):
            assert SecStr8[position] == chr(byte_value)

        # Each symbol encodes to its position in the alphabet.
        codes = SecStr8.encode(symbols)
        np.testing.assert_array_equal(codes, np.arange(8))

        # Decoding the codes restores the input bytes.
        assert SecStr8.decode(codes) == symbols
class TestAlphabetEdgeCases:
    """Boundary-condition checks for ``Alphabet``: empty inputs and zero lengths."""

    def test_empty_alphabet(self):
        """An alphabet built from an empty byte string has length 0."""
        empty = Alphabet(b"")

        assert len(empty) == 0

    def test_large_missing_value(self):
        """A ``missing`` value of 200 is returned for an unknown character."""
        two_letter = Alphabet(b"AC", missing=200)

        # "T" is not part of the alphabet.
        codes = two_letter.encode(b"T")

        assert codes[0] == 200

    def test_encode_empty_string(self):
        """Encoding an empty byte string gives an empty result."""
        dna = Alphabet(b"ACGT")

        assert len(dna.encode(b"")) == 0

    def test_decode_empty_array(self):
        """Decoding an empty uint8 array gives an empty byte string."""
        dna = Alphabet(b"ACGT")
        no_codes = np.array([], dtype=np.uint8)

        assert dna.decode(no_codes) == b""

    def test_unpack_zero_length(self):
        """Unpacking with ``k=0`` gives an empty result regardless of the integer."""
        dna = Alphabet(b"ACGT")

        digits = dna.unpack(5, 0)

        assert len(digits) == 0

    def test_get_kmer_zero_length(self):
        """Requesting a k-mer with ``k=0`` gives an empty byte string."""
        dna = Alphabet(b"ACGT")

        assert dna.get_kmer(5, 0) == b""