mirror of
https://github.com/aqlaboratory/openfold.git
synced 2026-06-04 12:44:26 +08:00
Scripts from Lukas to be used in improved setup process
This commit is contained in:
@@ -0,0 +1,82 @@
|
||||
from argparse import ArgumentParser
|
||||
from pathlib import Path
|
||||
import json
|
||||
|
||||
|
||||
def main(args):
|
||||
# get the super index
|
||||
with open(args.alignment_db_super_index_path, "r") as fp:
|
||||
super_index = json.load(fp)
|
||||
|
||||
# get all chains and sequences
|
||||
chains_to_seqs = {}
|
||||
with open(args.all_chains_fasta, "r") as fp:
|
||||
lines = fp.readlines()
|
||||
|
||||
# iterate through chain-sequence pairs
|
||||
for chain_idx in range(0, len(lines), 2):
|
||||
chain = lines[chain_idx][1:].strip()
|
||||
seq = lines[chain_idx + 1].strip()
|
||||
chains_to_seqs[chain] = seq
|
||||
|
||||
chains_w_alignments = set(super_index.keys())
|
||||
chains_wo_alignments = set(chains_to_seqs.keys()) - chains_w_alignments
|
||||
|
||||
seq_to_chain_w_alignment = {
|
||||
chains_to_seqs[chain]: chain for chain in chains_w_alignments
|
||||
}
|
||||
|
||||
print("Unique sequences with alignments:", len(seq_to_chain_w_alignment))
|
||||
|
||||
# map chain without alignment to alignment entry of another chain with the
|
||||
# same sequence
|
||||
remaining_unaligned_chains = []
|
||||
for chain in chains_wo_alignments:
|
||||
seq = chains_to_seqs[chain]
|
||||
|
||||
try:
|
||||
corresponding_alignment = super_index[seq_to_chain_w_alignment[seq]]
|
||||
# no corresponding chain with alignment found
|
||||
except KeyError:
|
||||
remaining_unaligned_chains.append(chain)
|
||||
continue
|
||||
|
||||
super_index[chain] = corresponding_alignment
|
||||
|
||||
with open(args.output_path, "w") as fp:
|
||||
json.dump(super_index, fp)
|
||||
|
||||
print(
|
||||
f"No corresponding alignment found for the following {len(remaining_unaligned_chains)} chains:",
|
||||
remaining_unaligned_chains,
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = ArgumentParser(
|
||||
description="""
|
||||
If the alignment-db index was created on unique-chain alignments only,
|
||||
this will add the missing chain entries to the super-index file based on
|
||||
a .fasta file that contains sequences for all chains.
|
||||
|
||||
Note that this only modifies the index and not the database itself, as
|
||||
the duplicate sequences will just point to the same alignments.
|
||||
"""
|
||||
)
|
||||
parser.add_argument(
|
||||
"alignment_db_super_index_path",
|
||||
type=Path,
|
||||
help="Path to alignment-db super index file.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"output_path", type=Path, help="Write the output super index to this path."
|
||||
)
|
||||
parser.add_argument(
|
||||
"all_chains_fasta",
|
||||
type=Path,
|
||||
help="Path to the fasta file containing sequences for all chains.",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
main(args)
|
||||
105
scripts/fasta_to_clusterfile.py
Normal file
105
scripts/fasta_to_clusterfile.py
Normal file
@@ -0,0 +1,105 @@
|
||||
"""
|
||||
This script takes a .fasta file as input and then clusters it on a given
|
||||
sequence identity threshold using mmseqs2. The mmseqs2 flags are identical to
|
||||
what PDB officially uses to provide their official sequence clusters
|
||||
(https://github.com/soedinglab/MMseqs2/issues/452).
|
||||
"""
|
||||
import shutil
|
||||
import subprocess
|
||||
from argparse import ArgumentParser
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def reformat_cluster_file(cluster_file: Path, output_file: Path):
|
||||
"""
|
||||
This function takes a mmseqs2 output cluster file and reformats it to a text
|
||||
file where each line contains a space-separated list of {PDB_ID}_{CHAIN_ID}
|
||||
belonging to the same cluster.
|
||||
"""
|
||||
cluster_to_chains = defaultdict(list)
|
||||
|
||||
# extract all chains belonging to each cluster
|
||||
with open(cluster_file, "r") as f:
|
||||
for line in f:
|
||||
line = line.strip()
|
||||
cluster_name, chain_id = line.split()
|
||||
cluster_to_chains[cluster_name].append(chain_id)
|
||||
|
||||
# write all chains belonging to the same cluster on the same line
|
||||
with open(output_file, "w") as f:
|
||||
for chains in cluster_to_chains.values():
|
||||
f.write(f"{' '.join(chains)}\n")
|
||||
|
||||
|
||||
def main(args):
|
||||
input_file = args.input_fasta.absolute()
|
||||
output_file = args.output_file.absolute()
|
||||
output_dir = args.output_file.parent
|
||||
|
||||
# prefix that all output files get
|
||||
mmseqs_prefix = "_mmseqs_out"
|
||||
|
||||
# temporary directory that mmseqs2 uses
|
||||
tmp_name = f"{mmseqs_prefix}_temp"
|
||||
tmp_dir = output_dir / tmp_name
|
||||
|
||||
mmseqs_command = [
|
||||
args.mmseqs_binary_path,
|
||||
"easy-cluster",
|
||||
input_file,
|
||||
mmseqs_prefix,
|
||||
tmp_name,
|
||||
"--min-seq-id",
|
||||
str(args.seq_id),
|
||||
"-c",
|
||||
"0.9",
|
||||
"-s",
|
||||
"8",
|
||||
"--max-seqs",
|
||||
"1000",
|
||||
"--cluster-mode",
|
||||
"1",
|
||||
]
|
||||
|
||||
# run mmseqs with PDB settings
|
||||
print("Running mmseqs2...")
|
||||
subprocess.run(mmseqs_command, check=True, cwd=output_dir)
|
||||
|
||||
cluster_file = output_dir / "_mmseqs_out_cluster.tsv"
|
||||
|
||||
print("Reformatting output file...")
|
||||
reformat_cluster_file(cluster_file, output_file)
|
||||
|
||||
print("Cleaning up mmseqs2 output...")
|
||||
mmseqs_outputs = [
|
||||
output_dir / f"{mmseqs_prefix}_{suffix}"
|
||||
for suffix in ["cluster.tsv", "rep_seq.fasta", "all_seqs.fasta"]
|
||||
]
|
||||
for file in mmseqs_outputs:
|
||||
file.unlink()
|
||||
shutil.rmtree(tmp_dir)
|
||||
|
||||
print("Done!")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = ArgumentParser(
|
||||
description="Creates a sequence cluster file from a .fasta file using mmseqs2 with PDB settings."
|
||||
)
|
||||
parser.add_argument(
|
||||
"input_fasta",
|
||||
type=Path,
|
||||
help="Input .fasta file. Sequence names should be in format >{PDB_ID}_{CHAIN_ID}",
|
||||
)
|
||||
parser.add_argument(
|
||||
"output_file",
|
||||
type=Path,
|
||||
help="Output file. Each line will contain a space-separated list of {PDB_ID}_{CHAIN_ID} belonging to the same cluster.",
|
||||
)
|
||||
parser.add_argument("mmseqs_binary_path", type=str, help="Path to mmseqs binary")
|
||||
parser.add_argument("--seq-id", type=float, default=0.4, help="Sequence identity threshold for clustering.")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
main(args)
|
||||
Reference in New Issue
Block a user