Scripts from Lukas to be used in improved setup process

2026-06-04 12:44:26 +08:00 · 2023-09-08 13:28:08 -04:00
parent bb3f51e5a2
commit 50949b9a58
2 changed files with 187 additions and 0 deletions
--- a/scripts/alignment_db_scripts/add_non_unique_to_alignment_db.py
+++ b/scripts/alignment_db_scripts/add_non_unique_to_alignment_db.py
@@ -0,0 +1,82 @@
+from argparse import ArgumentParser
+from pathlib import Path
+import json
+
+
+def main(args):
+    # get the super index
+    with open(args.alignment_db_super_index_path, "r") as fp:
+        super_index = json.load(fp)
+
+    # get all chains and sequences
+    chains_to_seqs = {}
+    with open(args.all_chains_fasta, "r") as fp:
+        lines = fp.readlines()
+
+        # iterate through chain-sequence pairs
+        for chain_idx in range(0, len(lines), 2):
+            chain = lines[chain_idx][1:].strip()
+            seq = lines[chain_idx + 1].strip()
+            chains_to_seqs[chain] = seq
+
+    chains_w_alignments = set(super_index.keys())
+    chains_wo_alignments = set(chains_to_seqs.keys()) - chains_w_alignments
+
+    seq_to_chain_w_alignment = {
+        chains_to_seqs[chain]: chain for chain in chains_w_alignments
+    }
+
+    print("Unique sequences with alignments:", len(seq_to_chain_w_alignment))
+
+    # map chain without alignment to alignment entry of another chain with the
+    # same sequence
+    remaining_unaligned_chains = []
+    for chain in chains_wo_alignments:
+        seq = chains_to_seqs[chain]
+
+        try:
+            corresponding_alignment = super_index[seq_to_chain_w_alignment[seq]]
+        # no corresponding chain with alignment found
+        except KeyError:
+            remaining_unaligned_chains.append(chain)
+            continue
+
+        super_index[chain] = corresponding_alignment
+
+    with open(args.output_path, "w") as fp:
+        json.dump(super_index, fp)
+
+    print(
+        f"No corresponding alignment found for the following {len(remaining_unaligned_chains)} chains:",
+        remaining_unaligned_chains,
+    )
+
+
+if __name__ == "__main__":
+    parser = ArgumentParser(
+        description="""
+        If the alignment-db index was created on unique-chain alignments only,
+        this will add the missing chain entries to the super-index file based on
+        a .fasta file that contains sequences for all chains.
+        
+        Note that this only modifies the index and not the database itself, as
+        the duplicate sequences will just point to the same alignments. 
+        """
+    )
+    parser.add_argument(
+        "alignment_db_super_index_path",
+        type=Path,
+        help="Path to alignment-db super index file.",
+    )
+    parser.add_argument(
+        "output_path", type=Path, help="Write the output super index to this path."
+    )
+    parser.add_argument(
+        "all_chains_fasta",
+        type=Path,
+        help="Path to the fasta file containing sequences for all chains.",
+    )
+
+    args = parser.parse_args()
+
+    main(args)
--- a/scripts/fasta_to_clusterfile.py
+++ b/scripts/fasta_to_clusterfile.py
@@ -0,0 +1,105 @@
+"""
+This script takes a .fasta file as input and then clusters it on a given
+sequence identity threshold using mmseqs2. The mmseqs2 flags are identical to
+what PDB officially uses to provide their official sequence clusters
+(https://github.com/soedinglab/MMseqs2/issues/452).
+"""
+import shutil
+import subprocess
+from argparse import ArgumentParser
+from collections import defaultdict
+from pathlib import Path
+
+
+def reformat_cluster_file(cluster_file: Path, output_file: Path):
+    """
+    This function takes a mmseqs2 output cluster file and reformats it to a text
+    file where each line contains a space-separated list of {PDB_ID}_{CHAIN_ID}
+    belonging to the same cluster.
+    """
+    cluster_to_chains = defaultdict(list)
+
+    # extract all chains belonging to each cluster
+    with open(cluster_file, "r") as f:
+        for line in f:
+            line = line.strip()
+            cluster_name, chain_id = line.split()
+            cluster_to_chains[cluster_name].append(chain_id)
+
+    # write all chains belonging to the same cluster on the same line
+    with open(output_file, "w") as f:
+        for chains in cluster_to_chains.values():
+            f.write(f"{' '.join(chains)}\n")
+
+
+def main(args):
+    input_file = args.input_fasta.absolute()
+    output_file = args.output_file.absolute()
+    output_dir = args.output_file.parent
+
+    # prefix that all output files get
+    mmseqs_prefix = "_mmseqs_out"
+
+    # temporary directory that mmseqs2 uses
+    tmp_name = f"{mmseqs_prefix}_temp"
+    tmp_dir = output_dir / tmp_name
+
+    mmseqs_command = [
+        args.mmseqs_binary_path,
+        "easy-cluster",
+        input_file,
+        mmseqs_prefix,
+        tmp_name,
+        "--min-seq-id",
+        str(args.seq_id),
+        "-c",
+        "0.9",
+        "-s",
+        "8",
+        "--max-seqs",
+        "1000",
+        "--cluster-mode",
+        "1",
+    ]
+
+    # run mmseqs with PDB settings
+    print("Running mmseqs2...")
+    subprocess.run(mmseqs_command, check=True, cwd=output_dir)
+
+    cluster_file = output_dir / "_mmseqs_out_cluster.tsv"
+
+    print("Reformatting output file...")
+    reformat_cluster_file(cluster_file, output_file)
+
+    print("Cleaning up mmseqs2 output...")
+    mmseqs_outputs = [
+        output_dir / f"{mmseqs_prefix}_{suffix}"
+        for suffix in ["cluster.tsv", "rep_seq.fasta", "all_seqs.fasta"]
+    ]
+    for file in mmseqs_outputs:
+        file.unlink()
+    shutil.rmtree(tmp_dir)
+
+    print("Done!")
+
+
+if __name__ == "__main__":
+    parser = ArgumentParser(
+        description="Creates a sequence cluster file from a .fasta file using mmseqs2 with PDB settings."
+    )
+    parser.add_argument(
+        "input_fasta",
+        type=Path,
+        help="Input .fasta file. Sequence names should be in format >{PDB_ID}_{CHAIN_ID}",
+    )
+    parser.add_argument(
+        "output_file",
+        type=Path,
+        help="Output file. Each line will contain a space-separated list of {PDB_ID}_{CHAIN_ID} belonging to the same cluster.",
+    )
+    parser.add_argument("mmseqs_binary_path", type=str, help="Path to mmseqs binary")
+    parser.add_argument("--seq-id", type=float, default=0.4, help="Sequence identity threshold for clustering.")
+
+    args = parser.parse_args()
+
+    main(args)