From ba5e34ed039030fafe7dc50fcbf1a71b936d7392 Mon Sep 17 00:00:00 2001
From: Kevin Wu
Date: Wed, 14 Sep 2022 22:36:05 -0700
Subject: [PATCH] Only fold sequences that are valid

---
Reviewer notes (free-text area of the patch, not part of the commit message):
 * Line breaks and indentation were reconstructed from a copy of this patch
   whose whitespace had been collapsed onto a single line. Indentation of the
   context lines in the second and third hunks is inferred from Python block
   structure; TODO confirm against bin/omegafold_across_gpus.py, or apply
   with `git apply --ignore-whitespace`.
 * The new helper add_seq_if_valid() calls logging.warning(); no
   `import logging` is visible in these hunks. Presumably the target file
   already imports it; verify before merging.
 * With check_valid=True (the new default), entries that raise AlphabetError
   are logged and left out of the returned dict instead of aborting the read.

 bin/omegafold_across_gpus.py | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/bin/omegafold_across_gpus.py b/bin/omegafold_across_gpus.py
index 6aca6ad..257b00c 100644
--- a/bin/omegafold_across_gpus.py
+++ b/bin/omegafold_across_gpus.py
@@ -13,10 +13,21 @@ from typing import *
 
 import torch
 import numpy as np
+from biotite.sequence import ProteinSequence, AlphabetError
 
-
-def read_fasta(fname: str) -> Dict[str, str]:
+def read_fasta(fname: str, check_valid: bool = True) -> Dict[str, str]:
     """Read the sequences in the fasta to a dict"""
+    def add_seq_if_valid(d: Dict[str, str], k:str, v:str) -> None:
+        """Add v to d[k] if v is a valid sequence"""
+        if not check_valid:
+            d[k] = v
+            return
+        try:
+            _ = ProteinSequence(v)
+            d[k] = v
+        except AlphabetError:
+            logging.warning(f"Illegal character in entry {k}: {v} | skipping")
+
     retval = {}
     curr_k, curr_v = "", ""
     with open(fname) as source:
@@ -25,7 +36,7 @@ def read_fasta(fname: str) -> Dict[str, str]:
                 if curr_k:  # Record and reset
                     assert curr_v
                     assert curr_k not in retval, f"Duplicated fasta entry: {curr_k}"
-                    retval[curr_k] = curr_v
+                    add_seq_if_valid(retval, curr_k, curr_v)
                 curr_k = line.strip().strip(">")
                 curr_v = ""
             else:
@@ -33,7 +44,7 @@ def read_fasta(fname: str) -> Dict[str, str]:
     # Write the last sequence
     assert curr_k
     assert curr_v
-    retval[curr_k] = curr_v
+    add_seq_if_valid(retval, curr_k, curr_v)
     return retval
 
 