Files
openfold/scripts/data_dir_to_fasta.py
2021-11-10 01:53:46 -05:00

70 lines
2.0 KiB
Python

import argparse
import logging
import os
from openfold.data import mmcif_parsing
from openfold.np import protein, residue_constants
def main(args):
    """Write the sequences of every structure file in a directory to one FASTA file.

    Files in ``args.data_dir`` are handled according to their extension:

    * ``.cif`` files are parsed with ``mmcif_parsing.parse``; one record is
      written per entry of the parsed object's ``chain_to_seqres`` mapping,
      with the header ``>{BASENAME}_{chain}``.
    * ``.core`` files are parsed with ``protein.from_proteinnet_string``; a
      single record with the header ``>{BASENAME}`` is written, its sequence
      built by looking up each ``aatype`` entry in
      ``residue_constants.restypes_with_x``.

    ``BASENAME`` is the upper-cased file name without its extension. Files
    with any other extension are ignored.

    Args:
        args: Namespace providing ``data_dir`` (input directory),
            ``output_path`` (FASTA file to write) and ``raise_errors``
            (re-raise mmCIF parsing errors instead of skipping the file).

    Raises:
        Exception: The first error recorded by the mmCIF parser for a file
            that failed to parse, when ``args.raise_errors`` is truthy.
        ValueError: If a file failed to parse, ``args.raise_errors`` is
            truthy, and the parser recorded no error to re-raise.
    """
    fasta = []

    # os.listdir returns entries in arbitrary order; sorting keeps the order
    # of records in the output reproducible across runs and filesystems.
    for fname in sorted(os.listdir(args.data_dir)):
        basename, ext = os.path.splitext(fname)
        basename = basename.upper()
        fpath = os.path.join(args.data_dir, fname)

        if ext == ".cif":
            with open(fpath, 'r') as fp:
                mmcif_str = fp.read()

            parsed = mmcif_parsing.parse(
                file_id=basename, mmcif_string=mmcif_str
            )
            if parsed.mmcif_object is None:
                logging.warning(f'Failed to parse {fname}...')
                if args.raise_errors:
                    error = next(iter(parsed.errors.values()), None)
                    if error is None:
                        # Avoid an unrelated IndexError when nothing was
                        # recorded; report the actual failure instead.
                        raise ValueError(f"Failed to parse {fname}")
                    raise error
                continue

            for chain, seq in parsed.mmcif_object.chain_to_seqres.items():
                chain_id = '_'.join([basename, chain])
                fasta.append(f">{chain_id}")
                fasta.append(seq)
        elif ext == ".core":
            with open(fpath, 'r') as fp:
                core_str = fp.read()

            core_protein = protein.from_proteinnet_string(core_str)
            seq = ''.join(
                residue_constants.restypes_with_x[residue_index]
                for residue_index in core_protein.aatype
            )
            fasta.append(f">{basename}")
            fasta.append(seq)

    with open(args.output_path, "w") as fp:
        if fasta:
            # Terminate the last record with a newline so the file can be
            # concatenated with other FASTA files without fusing lines.
            fp.write('\n'.join(fasta) + '\n')
if __name__ == "__main__":
    def _str_to_bool(value):
        """Convert a command-line string such as "true" or "0" to a bool.

        Using ``type=bool`` would call ``bool()`` on the raw string, so every
        non-empty value -- including "False" -- would be parsed as True.
        """
        normalized = value.strip().lower()
        if normalized in ("true", "t", "yes", "y", "1"):
            return True
        if normalized in ("false", "f", "no", "n", "0", ""):
            return False
        raise argparse.ArgumentTypeError(
            f"expected a boolean value, got {value!r}"
        )

    # Command-line interface: two positional paths plus an optional switch
    # controlling whether mmCIF parsing failures abort the run.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "data_dir", type=str,
        help="Path to a directory containing mmCIF or .core files"
    )
    parser.add_argument(
        "output_path", type=str,
        help="Path to output FASTA file"
    )
    parser.add_argument(
        "--raise_errors", type=_str_to_bool, default=False,
        help="Whether to crash on parsing errors"
    )

    args = parser.parse_args()

    main(args)