mirror of
https://github.com/KosinskiLab/AlphaPulldown.git
synced 2026-06-04 14:14:24 +08:00
* Harden MMseqs species ID resolution fallback * Reorganize tests for CPU coverage CI * New * Fix function coverage checker def-line false positives * Expand unit coverage for helper and backend manager utilities * New. * New. * Expand unit coverage for template and post-processing helpers * Expand unit coverage for objects.py edge cases * Publish HTML coverage reports via GitHub Pages * Add CPU unit coverage for AlphaFold3 backend helpers * Reorganize tests and expand backend coverage * Reset shared test flags between cases * Expand AF3 prepare_input unit coverage * Cover AF3 and truemultimer feature creation * Test AF3 multimer MSA translation paths * Cover AF3 duplicate-residue multimer fallback * Cover AF2 resume and postprocess edge paths * Cover AF3 template mmCIF preparation * Test small script entry points * Expand workflow and ModelCIF test coverage * Add backend extras and install guide * Clarify AF3 backend installation path * Stabilize cluster GPU test runners * Document AF3 CMake SQLite hints * Simplify backend installation guide * Align AF3 install with working cluster env * Backfill typing dataclass_transform for AF2 * Pin TensorFlow for cluster installs * Fallback AF2 relax when CUDA OpenMM is unavailable * Raise AF3 default minimum bucket size * Simplify backend cluster installation guide * Fix AF3 wrapper JSON output isolation * Fix AF3 JSON wrapper outputs and MMseqs ID parsing * Fix CI entrypoint stub and Python 3.8 typing * Document release readiness test gates
392 lines
9.7 KiB
Python
392 lines
9.7 KiB
Python
from urllib import error
|
|
|
|
import numpy as np
|
|
import pytest
|
|
|
|
from alphafold.data import msa_pairing
|
|
from alphafold.data import parsers
|
|
from alphafold.data import pipeline
|
|
from alphapulldown.objects import MonomericObject
|
|
from alphapulldown.utils import mmseqs_species_identifiers
|
|
|
|
|
|
@pytest.fixture(autouse=True)
def clear_species_id_cache():
    """Empty the module's species-ID cache before and after the test body."""

    def _reset():
        # Look the cache up on the module each time rather than holding a
        # reference, mirroring a plain attribute access at every call.
        mmseqs_species_identifiers._SPECIES_ID_CACHE.clear()

    _reset()
    yield
    _reset()
|
|
|
|
|
|
def _feature_dict_from_a3m(
    sequence: str,
    a3m: str,
    *,
    species_resolver,
) -> dict[str, np.ndarray]:
    """Build a per-chain feature dict from a sequence and its a3m text.

    Sequence features and MSA features are merged into one dict, the MMseqs
    identifier enrichment is applied to that dict, and every feature named in
    ``msa_pairing.MSA_FEATURES`` (plus the two identifier features) is also
    stored under a ``<name>_all_seq`` key.

    Args:
        sequence: Query sequence; its length is passed as the residue count.
        a3m: MSA text handed to ``parsers.parse_a3m`` and to the enrichment.
        species_resolver: Forwarded unchanged to
            ``enrich_mmseq_feature_dict_with_identifiers``.

    Returns:
        The merged feature dict including the ``*_all_seq`` copies.
    """
    features = {}
    features.update(
        pipeline.make_sequence_features(sequence, 'none', len(sequence))
    )
    # MSA features are merged second, so they win on any shared key.
    features.update(pipeline.make_msa_features([parsers.parse_a3m(a3m)]))

    # The return value is not used; the helper is relied on to fill
    # ``features`` in place.
    mmseqs_species_identifiers.enrich_mmseq_feature_dict_with_identifiers(
        features,
        a3m,
        species_resolver=species_resolver,
    )

    pairing_keys = msa_pairing.MSA_FEATURES + (
        'msa_species_identifiers',
        'msa_uniprot_accession_identifiers',
    )
    # Collect the copies first so the dict is not resized while iterating.
    all_seq_copies = {}
    for name, array in features.items():
        if name in pairing_keys:
            all_seq_copies[f'{name}_all_seq'] = array
    features.update(all_seq_copies)
    return features
|
|
|
|
|
|
def test_make_msa_features_resolves_mmseqs_species_identifiers(monkeypatch):
    """Identifier features reflect the patched module-level resolver.

    ``resolve_species_ids_by_accession`` is replaced by a canned mapping and
    ``build_mmseq_identifier_features`` is called without an explicit
    resolver. The first row (``>101``) is expected to carry empty identifiers.
    """

    def canned_resolver(accessions, **_):
        # Returns a fresh dict on every call, regardless of what was asked.
        return {
            'A0A636IKY3': '108619',
            'UPI001118B830': '562',
        }

    monkeypatch.setattr(
        mmseqs_species_identifiers,
        'resolve_species_ids_by_accession',
        canned_resolver,
    )

    a3m_rows = (
        '>101',
        'ACDE',
        '>UniRef100_A0A636IKY3\t136\t0.883',
        'ACDF',
        '>UniRef100_UPI001118B830\t855\t0.990',
        'AC-E',
        '',  # final empty item makes the joined text end with a newline
    )
    a3m = '\n'.join(a3m_rows)

    features = mmseqs_species_identifiers.build_mmseq_identifier_features(a3m)

    species_ids = features['msa_species_identifiers'].tolist()
    assert species_ids == [b'', b'108619', b'562']

    accession_ids = features['msa_uniprot_accession_identifiers'].tolist()
    assert accession_ids == [b'', b'A0A636IKY3', b'UPI001118B830']
|
|
|
|
|
|
def test_pair_sequences_works_with_mmseqs_accession_species_resolution(
    monkeypatch,
):
    """Two chains with resolver-derived species IDs pair row by row.

    Both chains are built through ``_feature_dict_from_a3m`` using the patched
    ``resolve_species_ids_by_accession``. The entry stored under ``2`` in the
    ``pair_sequences`` result is expected to hold the query pair first and
    then rows (1, 1) and (2, 2) in either order.
    """
    species_by_accession = {
        'A0A636IKY3': '562',
        'A0A743YDY2': '573',
        'UPI001118B830': '562',
        'UPI00101273C6': '573',
    }
    monkeypatch.setattr(
        mmseqs_species_identifiers,
        'resolve_species_ids_by_accession',
        # Hand out a copy so each call receives its own dict.
        lambda accessions, **_: dict(species_by_accession),
    )

    def make_chain(sequence, a3m_rows):
        # The resolver is read off the module here, i.e. after patching.
        return _feature_dict_from_a3m(
            sequence,
            '\n'.join(a3m_rows),
            species_resolver=mmseqs_species_identifiers.resolve_species_ids_by_accession,
        )

    chain_a = make_chain(
        'ACDE',
        [
            '>101',
            'ACDE',
            '>UniRef100_A0A636IKY3\t136\t0.883',
            'ACDF',
            '>UniRef100_A0A743YDY2\t134\t0.932',
            'AC-E',
            '',
        ],
    )
    chain_b = make_chain(
        'WXYZ',
        [
            '>101',
            'WXYZ',
            '>UniRef100_UPI001118B830\t855\t0.990',
            'WXYW',
            '>UniRef100_UPI00101273C6\t833\t0.919',
            'WX-Z',
            '',
        ],
    )

    pairing_result = msa_pairing.pair_sequences([chain_a, chain_b])
    paired_rows = pairing_result[2]

    assert paired_rows.shape == (3, 2)
    assert tuple(paired_rows[0]) == (0, 0)
    # Order of the non-query rows is not asserted, only their membership.
    assert set(map(tuple, paired_rows[1:])) == {(1, 1), (2, 2)}
|
|
|
|
|
|
def test_make_mmseq_features_researches_templates_for_precomputed_msa(
    monkeypatch,
    tmp_path,
):
    """Precomputed MSA is reused while templates come from a new search.

    A ``dummy.a3m`` file is written to ``tmp_path`` and four collaborators in
    ``alphapulldown.objects`` are replaced by recording fakes. After calling
    ``make_mmseq_features`` with ``use_precomputed_msa=True`` and
    ``use_templates=True`` the test checks what each fake received: the MSA
    originates from ``unserialize_msa`` and the template feature from
    ``get_msa_and_templates``.
    """
    import alphapulldown.objects as objects_mod

    (tmp_path / 'dummy.a3m').write_text(
        '\n'.join([
            '# header line that should be ignored later',
            '>101',
            'ACDE',
            '',
        ]),
        encoding='utf-8',
    )

    # Maps the name of each patched collaborator to what it was called with.
    calls = {}

    def fake_unserialize_msa(a3m_lines, sequence):
        calls['unserialize_msa'] = {
            'a3m_lines': a3m_lines,
            'sequence': sequence,
        }
        unpaired = ['PRECOMPUTED_UNPAIRED']
        paired = ['PRECOMPUTED_PAIRED']
        unique = ['PRECOMPUTED_UNIQUE']
        cardinality = ['PRECOMPUTED_CARDINALITY']
        template = ['PRECOMPUTED_TEMPLATE']
        return (unpaired, paired, unique, cardinality, template)

    def fake_get_msa_and_templates(**kwargs):
        calls['get_msa_and_templates'] = kwargs
        unpaired = ['IGNORED_UNPAIRED']
        paired = ['IGNORED_PAIRED']
        unique = ['IGNORED_UNIQUE']
        cardinality = ['IGNORED_CARDINALITY']
        template = ['TEMPLATE_FROM_RESEARCH']
        return (unpaired, paired, unique, cardinality, template)

    def fake_build_monomer_feature(sequence, msa, template_feature):
        calls['build_monomer_feature'] = {
            'sequence': sequence,
            'msa': msa,
            'template_feature': template_feature,
        }
        # Both values start as None; the final asserts check what
        # ``make_mmseq_features`` leaves in their place.
        return {
            'template_confidence_scores': None,
            'template_release_date': None,
        }

    def fake_enrich(feature_dict, a3m, **_kwargs):
        calls['enrich_mmseq_feature_dict_with_identifiers'] = a3m
        feature_dict['msa_species_identifiers'] = np.asarray([b''])
        feature_dict['msa_uniprot_accession_identifiers'] = np.asarray([b''])

    for attr_name, replacement in (
        ('unserialize_msa', fake_unserialize_msa),
        ('get_msa_and_templates', fake_get_msa_and_templates),
        ('build_monomer_feature', fake_build_monomer_feature),
        ('enrich_mmseq_feature_dict_with_identifiers', fake_enrich),
    ):
        monkeypatch.setattr(objects_mod, attr_name, replacement)

    monomer = MonomericObject('dummy', 'ACDE')
    monomer.make_mmseq_features(
        DEFAULT_API_SERVER='https://fake.server',
        output_dir=str(tmp_path),
        use_precomputed_msa=True,
        use_templates=True,
    )

    unserialize_args = calls['unserialize_msa']
    assert unserialize_args['sequence'] == 'ACDE'
    # The leading '#' line of the file must not reach ``unserialize_msa``.
    assert unserialize_args['a3m_lines'] == ['>101\nACDE']

    expected_search_kwargs = {
        'jobname': 'dummy',
        'query_sequences': 'ACDE',
        'a3m_lines': False,
        'result_dir': tmp_path,
        'msa_mode': 'single_sequence',
        'use_templates': True,
        'custom_template_path': None,
        'pair_mode': 'none',
        'host_url': 'https://fake.server',
        'user_agent': 'alphapulldown',
    }
    assert calls['get_msa_and_templates'] == expected_search_kwargs

    expected_build_args = {
        'sequence': 'ACDE',
        'msa': 'PRECOMPUTED_UNPAIRED',
        'template_feature': 'TEMPLATE_FROM_RESEARCH',
    }
    assert calls['build_monomer_feature'] == expected_build_args

    assert (
        calls['enrich_mmseq_feature_dict_with_identifiers']
        == 'PRECOMPUTED_UNPAIRED'
    )
    assert isinstance(
        monomer.feature_dict['template_confidence_scores'],
        np.ndarray,
    )
    assert monomer.feature_dict['template_release_date'] == ['none']
|
|
|
|
|
|
def test_resolve_species_ids_by_accession_retries_after_transport_failure(
    monkeypatch,
):
    """A failed lookup yields '' and the next call queries again.

    The patched ``_query_uniprot_batch`` raises ``URLError`` on its first
    invocation and answers normally afterwards. The second resolve call is
    expected to hit the query function again and return the taxon ID.
    """
    accession = 'A0A636IKY3'
    calls = []

    def fake_query(accessions, *, urlopen):
        calls.append(tuple(accessions))
        if len(calls) > 1:
            return {
                'results': [
                    {
                        'primaryAccession': 'A0A636IKY3',
                        'organism': {'taxonId': 562},
                    }
                ]
            }
        # Only the very first invocation simulates the outage.
        raise error.URLError('temporary outage')

    monkeypatch.setattr(
        mmseqs_species_identifiers,
        '_query_uniprot_batch',
        fake_query,
    )

    resolve = mmseqs_species_identifiers.resolve_species_ids_by_accession
    first = resolve([accession])
    second = resolve([accession])

    assert first == {'A0A636IKY3': ''}
    assert second == {'A0A636IKY3': '562'}
    assert calls == [('A0A636IKY3',), ('A0A636IKY3',)]
|
|
|
|
|
|
def test_resolve_species_ids_by_accession_skips_single_accession_fallback_after_transport_failure(
    monkeypatch,
):
    """A transport error on the batch query triggers no further queries.

    The patched ``_query_uniprot_batch`` always raises ``URLError``. Both
    accessions are expected to resolve to '' after exactly one query call
    covering the whole batch.
    """
    recorded_batches = []

    def fake_query(accessions, *, urlopen):
        recorded_batches.append(tuple(accessions))
        raise error.URLError('offline')

    monkeypatch.setattr(
        mmseqs_species_identifiers,
        '_query_uniprot_batch',
        fake_query,
    )

    requested = ['A0A636IKY3', 'A0A743YDY2']
    resolved = mmseqs_species_identifiers.resolve_species_ids_by_accession(
        requested
    )

    expected = {
        'A0A636IKY3': '',
        'A0A743YDY2': '',
    }
    assert resolved == expected
    assert recorded_batches == [('A0A636IKY3', 'A0A743YDY2')]
|
|
|
|
|
|
def test_build_mmseq_identifier_features_skips_non_uniprot_identifiers(
    monkeypatch,
):
    """Only the ``A0A636IKY3`` header is passed on to the resolver.

    The a3m mixes the query row, two ``MGYP...`` headers (bare and with a
    ``UniRef100_`` prefix) and one ``UniRef100_A0A636IKY3`` header. The
    injected resolver must be called once with just that last accession, and
    all other rows must end up with empty identifiers.
    """
    resolver_batches = []

    def fake_resolver(accessions):
        resolver_batches.append(tuple(accessions))
        return {'A0A636IKY3': '108619'}

    a3m_rows = (
        '>101',
        'ACDE',
        '>MGYP000264027769',
        'ACDF',
        '>UniRef100_MGYP000264027769',
        'ACDG',
        '>UniRef100_A0A636IKY3',
        'ACDH',
        '',
    )
    a3m = '\n'.join(a3m_rows)

    features = mmseqs_species_identifiers.build_mmseq_identifier_features(
        a3m,
        species_resolver=fake_resolver,
    )

    assert resolver_batches == [('A0A636IKY3',)]

    species_ids = features['msa_species_identifiers'].tolist()
    assert species_ids == [b'', b'', b'', b'108619']

    accession_ids = features['msa_uniprot_accession_identifiers'].tolist()
    assert accession_ids == [b'', b'', b'', b'A0A636IKY3']
|
|
|
|
|
|
def test_resolve_species_ids_by_accession_skips_unsupported_accessions(
    monkeypatch,
):
    """Each accession goes to the matching backend or to none at all.

    ``_query_uniprot_batch`` and ``_query_uniparc_batch`` are replaced by
    recording fakes. The test expects ``A0A636IKY3`` to be sent only to the
    UniProt fake, ``UPI001118B830`` only to the UniParc fake, and
    ``MGYP000264027769`` to neither, resolving to ''.
    """
    uniprot_batches = []
    uniparc_batches = []

    def fake_uniprot_query(accessions, *, urlopen):
        uniprot_batches.append(tuple(accessions))
        entry = {
            'primaryAccession': 'A0A636IKY3',
            'organism': {'taxonId': 562},
        }
        return {'results': [entry]}

    def fake_uniparc_query(accessions, *, urlopen):
        uniparc_batches.append(tuple(accessions))
        entry = {
            'uniParcId': 'UPI001118B830',
            'organisms': [{'taxonId': 83333}],
        }
        return {'results': [entry]}

    for attr_name, replacement in (
        ('_query_uniprot_batch', fake_uniprot_query),
        ('_query_uniparc_batch', fake_uniparc_query),
    ):
        monkeypatch.setattr(mmseqs_species_identifiers, attr_name, replacement)

    requested = ['A0A636IKY3', 'MGYP000264027769', 'UPI001118B830']
    resolved = mmseqs_species_identifiers.resolve_species_ids_by_accession(
        requested
    )

    expected = {
        'A0A636IKY3': '562',
        'MGYP000264027769': '',
        'UPI001118B830': '83333',
    }
    assert resolved == expected
    assert uniprot_batches == [('A0A636IKY3',)]
    assert uniparc_batches == [('UPI001118B830',)]
|