Remove dependencies on biopython

2026-06-04 13:30:33 +08:00 · 2022-09-09 10:46:37 -07:00
parent 485fcb83a3
commit 7dccdb215f
3 changed files with 4 additions and 93 deletions
--- a/bin/sample.py
+++ b/bin/sample.py
@@ -246,7 +246,9 @@ def main():
        pd.DataFrame(s, columns=train_dset.feature_names["angles"])
        for s in final_sampled
    ]
-    pdb_files = write_preds_pdb_folder(sampled_dfs, all_ft_train_dset, outdir / "sampled_pdb")
+    pdb_files = write_preds_pdb_folder(
+        sampled_dfs, all_ft_train_dset, outdir / "sampled_pdb"
+    )

    logging.info(f"Done writing main outputs! Calculating tm scores...")
    all_tm_scores = {}
@@ -257,6 +259,7 @@ def main():
    with open(outdir / "tm_scores.json", "w") as sink:
        json.dump(all_tm_scores, sink, indent=4)

+
 if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    main()
--- a/protdiff/angles_and_coords.py
+++ b/protdiff/angles_and_coords.py
@@ -1,8 +1,5 @@
 """
 Code to convert from angles between residues to XYZ coordinates. 
-
-Based on: 
-https://github.com/biopython/biopython/blob/master/Bio/PDB/ic_rebuild.py
 """
 import os
 import logging
@@ -13,16 +10,10 @@ import warnings
 import numpy as np
 import pandas as pd

-from Bio import PDB
-from Bio.PDB import PICIO, ic_rebuild
 from sequence_models import pdb_utils

 import biotite.structure as struc
 from biotite.structure.io.pdb import PDBFile
-from biotite.structure.io.pdbx import PDBxFile
-
-import torch
-from torch.utils.data import Dataset

 import nerf

@@ -33,47 +24,6 @@ MINIMAL_ANGLES = ["phi", "psi", "omega"]
 MINIMAL_DISTS = []


-def pdb_to_pic(pdb_file: str, pic_file: str):
-    """
-    Convert the PDB file to a PIC file
-    """
-    parser = PDB.PDBParser(QUIET=True)
-    s = parser.get_structure("pdb", pdb_file)
-    chains = [c for c in s.get_chains()]
-    if len(chains) > 1:
-        raise NotImplementedError
-    chain = chains.pop()  # type Bio.PDB.Chain.Chain
-    # print(chain.__dict__.keys())
-
-    # Convert to relative angles
-    # Calculate dihedrals, angles, bond lengths (internal coordinates) for Atom data
-    # Generates atomArray through init_edra
-    chain.atom_to_internal_coordinates()
-
-    for res in chain.internal_coord.ordered_aa_ic_list:
-        # Look at only analines because that's what we generate
-        if res.residue.get_resname() != "ALA":
-            continue
-        # print("REF", res, type(res))
-        # print(res.dihedra.keys())
-
-    with open(pic_file, "w") as sink:
-        PICIO.write_PIC(chain, sink)
-
-
-def pic_to_pdb(pic_file: str, pdb_file: str):
-    """
-    Read int he PIC file and convert to a PDB file
-    """
-    with open(pic_file) as source:
-        f = PICIO.read_PIC(source)
-    f.internal_to_atom_coordinates()
-
-    io = PDB.PDBIO()
-    io.set_structure(f)
-    io.save(pdb_file)
-
-
 def coords_to_trrosetta_angles(
    coords: Union[np.ndarray, Dict[str, List[List[float]]]],
 ) -> Optional[np.ndarray]:
@@ -258,47 +208,6 @@ def canonical_distances_and_dihedrals(
    return pd.DataFrame({k: calc_angles[k].squeeze() for k in distances + angles})


-def sample_coords(
-    fname: str,
-    subset_residues: Optional[Collection[str]] = None,
-    query_atoms: List[str] = ["N", "CA", "C", "O", "CB"],
-) -> List[pd.DataFrame]:
-    """
-    Sample the atomic coordinates of Alanine atoms. Return a list of dataframes each containing these
-    coordinates.
-
-    We use this to help figure out where to initialize atoms when creating a new chain
-    """
-    atomic_coords = []
-
-    parser = PDB.PDBParser(QUIET=True)
-    s = parser.get_structure("", fname)
-    for chain in s.get_chains():
-        residues = [
-            r for r in chain.get_residues() if r.get_resname() not in ("HOH", "NA")
-        ]
-
-        for res in residues:
-            if subset_residues is not None and res.get_resname() not in subset_residues:
-                continue
-            coords = {}
-            for atom in res.get_atoms():
-                coords[atom.get_name()] = atom.get_coord()
-            all_atoms_present = True
-
-            for atom in query_atoms:
-                if atom not in coords:
-                    logging.debug(f"{atom} not found in {res.get_resname()}")
-                    all_atoms_present = False
-                    break
-
-            if all_atoms_present:
-                atomic_coords.append(
-                    pd.DataFrame([coords[k] for k in query_atoms], index=query_atoms)
-                )
-    return atomic_coords
-
-
 def create_new_chain_nerf(
    out_fname: str,
    dists_and_angles: pd.DataFrame,
--- a/protdiff/datasets.py
+++ b/protdiff/datasets.py
@@ -296,7 +296,6 @@ class CathCanonicalAnglesDataset(Dataset):
                f"Computing full dataset of {len(fnames)} with {multiprocessing.cpu_count()} threads"
            )
            # Generate dihedral angles
-            # https://biopython.org/docs/1.76/api/Bio.PDB.PDBParser.html
            pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
            struct_arrays = pool.map(pfunc, fnames, chunksize=250)
            pool.close()