mirror of
https://github.com/microsoft/foldingdiff.git
synced 2026-06-04 13:30:33 +08:00
Add the ability to compute secondary structures only for sequences up to a maximum length
This commit is contained in:
@@ -24,6 +24,20 @@ from biotite.structure.io.pdb import PDBFile
|
||||
SSE_BACKEND = Literal["dssp", "psea"]
|
||||
|
||||
|
||||
def get_pdb_length(fname: str) -> int:
    """
    Get the length of the chain described in the PDB file.

    The length is the number of backbone atoms in the first model divided
    by three (presumably one N, CA and C atom per residue -- confirm against
    the biotite ``filter_backbone`` documentation).

    Args:
        fname: Path to the PDB file to read.

    Returns:
        The backbone atom count floor-divided by three, or ``-1`` when the
        file contains more than one model.
    """
    # Silence the warning emitted when element symbols are guessed from
    # atom names while parsing the file.
    warnings.filterwarnings("ignore", ".*elements were guessed from atom_.*")

    pdb = PDBFile.read(fname)
    # Multi-model files are not measured; callers receive the -1 sentinel.
    if pdb.get_model_count() > 1:
        return -1

    atoms = pdb.get_structure()[0]
    backbone_mask = struc.filter_backbone(atoms)
    return len(atoms[backbone_mask]) // 3
|
||||
|
||||
|
||||
def count_structures_in_pdb(
|
||||
fname: str, backend: SSE_BACKEND = "psea"
|
||||
) -> Tuple[int, int]:
|
||||
@@ -61,7 +75,9 @@ def count_structures_in_pdb(
|
||||
num_alpha = ss_counts["H"] if "H" in ss_counts else 0
|
||||
num_beta = ss_counts["B"] if "B" in ss_counts else 0
|
||||
else:
|
||||
raise ValueError(f"Unrecognized backend: {backend}")
|
||||
raise ValueError(
|
||||
f"Unrecognized backend for calculating secondary structures: {backend}"
|
||||
)
|
||||
logging.debug(f"From {fname}:\t{num_alpha} {num_beta}")
|
||||
return num_alpha, num_beta
|
||||
|
||||
@@ -69,10 +85,19 @@ def count_structures_in_pdb(
|
||||
def make_ss_cooccurrence_plot(
|
||||
pdb_files: Collection[str],
|
||||
outpdf: str,
|
||||
max_seq_len: int = 0,
|
||||
backend: SSE_BACKEND = "psea",
|
||||
threads: int = 4,
|
||||
):
|
||||
""" """
|
||||
"""
|
||||
Create a secondary structure co-occurrence plot
|
||||
"""
|
||||
if max_seq_len > 0:
|
||||
orig_len = len(pdb_files)
|
||||
pdb_files = [p for p in pdb_files if get_pdb_length(p) <= max_seq_len]
|
||||
logging.info(
|
||||
f"Filtering out sequences with more than {max_seq_len} residues: {orig_len} --> {len(pdb_files)}"
|
||||
)
|
||||
logging.info(f"Calculating {len(pdb_files)} structures using {backend}")
|
||||
pfunc = functools.partial(count_structures_in_pdb, backend=backend)
|
||||
pool = mp.Pool(threads)
|
||||
|
||||
@@ -269,8 +269,7 @@ def main() -> None:
|
||||
model,
|
||||
train_dset,
|
||||
n=10,
|
||||
# sweep_lengths=(50, train_dset.dset.pad),
|
||||
sweep_lengths=(50, 52), # Dummy values
|
||||
sweep_lengths=(50, test_dset.dset.pad),
|
||||
batch_size=args.batchsize,
|
||||
)
|
||||
else:
|
||||
@@ -331,7 +330,9 @@ def main() -> None:
|
||||
|
||||
# Plot single plots
|
||||
plot_distribution_overlap(
|
||||
{"Test": orig_values, "Sampled": samp_values}, ft_name, fname=plotdir / f"dist_{ft_name}.pdf"
|
||||
{"Test": orig_values, "Sampled": samp_values},
|
||||
ft_name,
|
||||
fname=plotdir / f"dist_{ft_name}.pdf",
|
||||
)
|
||||
plot_distribution_overlap(
|
||||
{"Test": orig_values, "Sampled": samp_values},
|
||||
@@ -372,12 +373,13 @@ def main() -> None:
|
||||
# Generate plots of secondary structure co-occurrence
|
||||
make_ss_cooccurrence_plot(
|
||||
pdb_files,
|
||||
str(outdir / "sampled_pdb" / "ss_cooccurrence_sampled.pdf"),
|
||||
str(outdir / "plots" / "ss_cooccurrence_sampled.pdf"),
|
||||
threads=multiprocessing.cpu_count(),
|
||||
)
|
||||
make_ss_cooccurrence_plot(
|
||||
test_dset.filenames,
|
||||
str(outdir / "sampled_pdb" / "ss_cooccurrence_train.pdf"),
|
||||
str(outdir / "plots" / "ss_cooccurrence_test.pdf"),
|
||||
max_seq_len=test_dset.dset.pad,
|
||||
threads=multiprocessing.cpu_count(),
|
||||
)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user