From 0b5c9492ffe453e78ee1f2d1c9989d74e351d944 Mon Sep 17 00:00:00 2001 From: Lukas Jarosch Date: Sun, 5 May 2024 23:49:22 -0700 Subject: [PATCH] Give script more descriptive name --- ...uplicates.py => expand_alignment_duplicates.py} | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) rename scripts/{expand_roda_duplicates.py => expand_alignment_duplicates.py} (80%) diff --git a/scripts/expand_roda_duplicates.py b/scripts/expand_alignment_duplicates.py similarity index 80% rename from scripts/expand_roda_duplicates.py rename to scripts/expand_alignment_duplicates.py index 587b791..d32bf82 100644 --- a/scripts/expand_roda_duplicates.py +++ b/scripts/expand_alignment_duplicates.py @@ -1,9 +1,10 @@ """ -The RODA database is non-redundant, meaning that it only stores one explicit -representative alignment directory for all PDB chains in a 100% sequence -identity cluster. In order to add explicit alignments for all PDB chains, this -script will add the missing chain directories and symlink them to their -representative alignment directories. +The OpenProteinSet alignment database is non-redundant, meaning that it only +stores one explicit representative alignment directory for all PDB chains in a +100% sequence identity cluster. In order to add explicit alignments for all PDB +chains, this script will add the missing chain directories and symlink them to +their representative alignment directories. This is required in order to train +OpenFold on the full PDB, not just one representative chain per cluster. """ from argparse import ArgumentParser @@ -52,6 +53,9 @@ def main(alignment_dir: Path, duplicate_chains_file: Path): with open(duplicate_chains_file, "r") as fp: duplicate_chains = [list(line.strip().split()) for line in fp] + # convert to absolute path for symlink creation + alignment_dir = alignment_dir.resolve() + create_duplicate_dirs(duplicate_chains, alignment_dir)