mirror of
https://github.com/aqlaboratory/openfold.git
synced 2026-06-04 12:44:26 +08:00
113 lines
4.7 KiB
Python
113 lines
4.7 KiB
Python
import json
|
|
import os
|
|
import random
|
|
import matplotlib.pyplot as plt
|
|
|
|
|
|
def select_proteins_from_PDB(seqres_path='/data/sk844/pdb_seqres.txt',
                             output_dir='output_dir/sample_proteins/',
                             num_samples=1):
    """Randomly sample records from a seqres file and write each as a FASTA file.

    The input is read as consecutive two-line records: a header line at an
    even index followed by its sequence line. Sampling is done with
    replacement, so the same record may be drawn more than once (its output
    file is then simply rewritten).

    Args:
        seqres_path: Path of the two-lines-per-record text file to sample from.
        output_dir: Directory that receives one ``<name>.fasta`` per sampled
            record; it is created if it does not exist.
        num_samples: Number of random draws to perform.

    Raises:
        ValueError: If the input file holds fewer than two lines, i.e. not a
            single complete record.
    """
    with open(seqres_path, 'r') as f:
        lines = f.readlines()
    max_idx = len(lines)
    print(max_idx)

    if max_idx < 2:
        raise ValueError(
            'Expected at least one header/sequence pair in ' + repr(seqres_path)
        )

    fastas = []
    for _ in range(num_samples):
        # Stop at max_idx - 1 so the chosen header always has a following
        # sequence line, even when the file has a dangling odd last line.
        label_idx = random.randrange(0, max_idx - 1, 2)
        print(label_idx)
        fasta = lines[label_idx] + lines[label_idx + 1]
        print(fasta)
        fastas.append(fasta)
    print(len(fastas))

    os.makedirs(output_dir, exist_ok=True)
    for fasta in fastas:
        # Characters 1..6 of the header (everything after the first character)
        # are used as the file name.
        # NOTE(review): presumably "<4-char id>_<chain>"; longer chain ids
        # would be truncated here - confirm against the input format.
        with open(os.path.join(output_dir, fasta[1:7] + '.fasta'), 'w') as f:
            f.write(fasta)

    # Debug output: show how the beginning of the input file looks.
    print('File start')
    for line in lines[:3]:
        print(repr(line))
|
|
|
|
def measure_time():
    """Summarise and plot recorded alignment / template timings.

    Reads ``msas/timings_msa.json`` for every protein directory under
    ``../alphafold/output_dir/sample_proteins`` (multi-core run) and
    ``../alphafold/output_dir/sample_proteins_1core`` (single-core run),
    attaches each protein's sequence length taken from
    ``output_dir/sample_proteins/<name>.fasta``, then prints averages and
    shows scatter plots via matplotlib.

    The ``'msas'`` and ``'templates'`` values are divided by 60 and labelled
    as minutes, so they are presumably stored in seconds - confirm against
    the code that writes ``timings_msa.json``.
    """
    fasta_dir = os.path.join('output_dir', 'sample_proteins')

    # ---- Multi-core run -------------------------------------------------
    timings_all = _load_msa_timings("../alphafold/output_dir/sample_proteins")
    for protein_name, protein_dict in timings_all:
        protein_dict['length'] = _read_fasta_length(
            os.path.join(fasta_dir, protein_name + '.fasta'))
    print(timings_all)

    # Keep each timing paired with the length of the same protein, so a
    # protein with only one of the two timings cannot misalign the axes.
    msa_points = [(d['length'], d['msas'] / 60)
                  for _, d in timings_all if d['msas']]
    template_points = [(d['length'], d['templates'] / 60)
                       for _, d in timings_all if d['templates']]
    timings_msas = [minutes for _, minutes in msa_points]
    timings_templates_featurization = [minutes for _, minutes in template_points]

    # Plot alignment times
    plt.scatter([length for length, _ in msa_points], timings_msas)
    plt.title("Alignment times")
    plt.xlabel("Protein size")
    plt.ylabel("Execution times (minutes)")
    plt.show()

    # Plot template featurization times
    plt.scatter([length for length, _ in template_points],
                timings_templates_featurization)
    plt.title("Template Featurization times")
    plt.xlabel("Protein size")
    plt.ylabel("Execution times (minutes)")
    plt.show()

    print("Average alignment times: ", _mean(timings_msas))
    print("Template featurization times: ", _mean(timings_templates_featurization))

    # Lengths of every sampled protein, whether or not it has timings.
    protein_lengths = [
        _read_fasta_length(os.path.join(fasta_dir, fasta_name))
        for fasta_name in sorted(os.listdir(fasta_dir))
    ]
    print('Protein lenghts')
    print(protein_lengths)

    # ---- Single-core run ------------------------------------------------
    timings_granular = {}
    for protein_name, timings in _load_msa_timings(
            "../alphafold/output_dir/sample_proteins_1core"):
        # Rename so these values do not overwrite the multi-core ones when
        # both dicts are merged below.
        timings['msas_1core'] = timings.pop('msas')
        timings['templates_1core'] = timings.pop('templates')
        timings_granular[protein_name] = timings

    parallel_to_1core_ratio_msa = []
    parallel_to_1core_ratio_template_featurization = []
    protein_lengths_1_core = []
    for protein_name, protein_dict in timings_all:
        if protein_name not in timings_granular:
            continue
        protein_dict.update(timings_granular[protein_name])
        if not protein_dict['msas'] or not protein_dict['templates']:
            # A missing/zero multi-core timing makes the ratio undefined.
            continue
        msa_ratio = protein_dict['msas_1core'] / protein_dict['msas']
        template_ratio = protein_dict['templates_1core'] / protein_dict['templates']
        parallel_to_1core_ratio_msa.append(msa_ratio)
        parallel_to_1core_ratio_template_featurization.append(template_ratio)
        protein_lengths_1_core.append(protein_dict['length'])
        print(protein_name, msa_ratio, '\t', protein_dict)

    print('Average 1-core multiplier for MSAs: ',
          _mean(parallel_to_1core_ratio_msa))
    print('Average 1-core multiplier for template featurization: ',
          _mean(parallel_to_1core_ratio_template_featurization))

    # Alignment slow-down ratio with respect to protein length
    plt.scatter(protein_lengths_1_core, parallel_to_1core_ratio_msa)
    plt.show()

    # Now, let's deal with template search and featurization times
    hhsearch_times_1_core = [
        protein_dict['hhsearch'] for protein_dict in timings_granular.values()
    ]
    print('Average hhsearch times on 1-core (seconds): ',
          _mean(hhsearch_times_1_core))


def _load_msa_timings(root_dir):
    """Return ``[(name, timings), ...]`` for each sub-directory of *root_dir*, sorted by name."""
    loaded = []
    for entry in sorted(os.listdir(root_dir)):
        print(entry)
        # Test the entry inside root_dir, not relative to the working directory.
        if os.path.isfile(os.path.join(root_dir, entry)):
            continue
        with open(os.path.join(root_dir, entry, 'msas', 'timings_msa.json'), 'r') as f:
            loaded.append((entry, json.load(f)))
    return loaded


def _read_fasta_length(fasta_path):
    """Return the stripped length of the second line (the sequence) of *fasta_path*."""
    with open(fasta_path, 'r') as f:
        fasta_lines = f.readlines()
    return len(fasta_lines[1].strip())


def _mean(values):
    """Return the arithmetic mean of *values*, or NaN when *values* is empty."""
    return sum(values) / len(values) if values else float('nan')
|
|
|
|
|
|
# Script entry point: run the timing analysis only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    measure_time()
|
|
|