Add the ability to compute secondary structures only for sequences up to a maximum length

This commit is contained in:
Kevin Wu
2022-09-15 14:55:58 -07:00
parent ed3a4eff87
commit 9cb7bca32f
2 changed files with 34 additions and 7 deletions

View File

@@ -24,6 +24,20 @@ from biotite.structure.io.pdb import PDBFile
SSE_BACKEND = Literal["dssp", "psea"]
def get_pdb_length(fname: str) -> int:
    """
    Return the residue count of the structure stored in the given PDB file

    The file is parsed with PDBFile. If it holds more than one model, no
    length is computed and -1 is returned. Otherwise the backbone atoms of
    the first model are selected and their count is integer-divided by 3
    (presumably three backbone atoms, N/CA/C, per residue -- confirm against
    the biotite filter_backbone documentation).
    """
    # Ignore warnings whose message matches the "elements were guessed" pattern
    warnings.filterwarnings("ignore", ".*elements were guessed from atom_.*")
    pdb = PDBFile.read(fname)
    if pdb.get_model_count() > 1:
        # Sentinel for multi-model files; callers must handle -1 themselves
        return -1
    first_model = pdb.get_structure()[0]
    backbone_mask = struc.filter_backbone(first_model)
    num_backbone_atoms = len(first_model[backbone_mask])
    return num_backbone_atoms // 3
def count_structures_in_pdb(
fname: str, backend: SSE_BACKEND = "psea"
) -> Tuple[int, int]:
@@ -61,7 +75,9 @@ def count_structures_in_pdb(
num_alpha = ss_counts["H"] if "H" in ss_counts else 0
num_beta = ss_counts["B"] if "B" in ss_counts else 0
else:
raise ValueError(f"Unrecognized backend: {backend}")
raise ValueError(
f"Unrecognized backend for calculating secondary structures: {backend}"
)
logging.debug(f"From {fname}:\t{num_alpha} {num_beta}")
return num_alpha, num_beta
@@ -69,10 +85,19 @@ def count_structures_in_pdb(
def make_ss_cooccurrence_plot(
pdb_files: Collection[str],
outpdf: str,
max_seq_len: int = 0,
backend: SSE_BACKEND = "psea",
threads: int = 4,
):
""" """
"""
Create a secondary structure co-occurrence plot
"""
if max_seq_len > 0:
orig_len = len(pdb_files)
pdb_files = [p for p in pdb_files if get_pdb_length(p) <= max_seq_len]
logging.info(
f"Filtering out sequences with more than {max_seq_len} residues: {orig_len} --> {len(pdb_files)}"
)
logging.info(f"Calculating {len(pdb_files)} structures using {backend}")
pfunc = functools.partial(count_structures_in_pdb, backend=backend)
pool = mp.Pool(threads)

View File

@@ -269,8 +269,7 @@ def main() -> None:
model,
train_dset,
n=10,
# sweep_lengths=(50, train_dset.dset.pad),
sweep_lengths=(50, 52), # Dummy values
sweep_lengths=(50, test_dset.dset.pad),
batch_size=args.batchsize,
)
else:
@@ -331,7 +330,9 @@ def main() -> None:
# Plot single plots
plot_distribution_overlap(
{"Test": orig_values, "Sampled": samp_values}, ft_name, fname=plotdir / f"dist_{ft_name}.pdf"
{"Test": orig_values, "Sampled": samp_values},
ft_name,
fname=plotdir / f"dist_{ft_name}.pdf",
)
plot_distribution_overlap(
{"Test": orig_values, "Sampled": samp_values},
@@ -372,12 +373,13 @@ def main() -> None:
# Generate plots of secondary structure co-occurrence
make_ss_cooccurrence_plot(
pdb_files,
str(outdir / "sampled_pdb" / "ss_cooccurrence_sampled.pdf"),
str(outdir / "plots" / "ss_cooccurrence_sampled.pdf"),
threads=multiprocessing.cpu_count(),
)
make_ss_cooccurrence_plot(
test_dset.filenames,
str(outdir / "sampled_pdb" / "ss_cooccurrence_train.pdf"),
str(outdir / "plots" / "ss_cooccurrence_test.pdf"),
max_seq_len=test_dset.dset.pad,
threads=multiprocessing.cpu_count(),
)