add implementation of transform/reduce-to-chains command

2026-06-04 12:44:24 +08:00 · 2021-12-12 11:07:11 +01:00
parent 253faf593f
commit 637cfe7998
10 changed files with 250 additions and 15 deletions
--- a/distro/test_data/1fbl.cif.gz
+++ b/distro/test_data/1fbl.cif.gz
--- a/distro/test_data/2W83.cif.gz
+++ b/distro/test_data/2W83.cif.gz
--- a/distro/test_data/2W83.pdb.gz
+++ b/distro/test_data/2W83.pdb.gz
--- a/misc/test-scripts/testsets.sh
+++ b/misc/test-scripts/testsets.sh
@@ -262,6 +262,34 @@ analyze() {

 }

+transform() {
+  # Runs the 'transform reduce-to-chains' command on 2W83 and 1fbl inputs (pdb/cif, plain and .gz) with various -chains / -out_format / -out_file combinations.
+  title TRANSFORM COMMANDS
+
+  test ./prank.sh transform reduce-to-chains  -f distro/test_data/2W83.cif     -chains A                                                  # output: <out_dir>/2W83_A.cif
+  test ./prank.sh transform reduce-to-chains  -f distro/test_data/2W83.pdb     -chains A                                                  # output: <out_dir>/2W83_A.pdb
+  test ./prank.sh transform reduce-to-chains  -f distro/test_data/2W83.cif.gz  -chains A,B                                                # output: <out_dir>/2W83_A,B.cif.gz
+  test ./prank.sh transform reduce-to-chains  -f distro/test_data/2W83.cif.gz  -chains A,B  -out_file distro/test_output/2W83_A,B.cif.gz  # output: distro/test_output/2W83_A,B.cif.gz
+  test ./prank.sh transform reduce-to-chains  -f distro/test_data/2W83.cif     -chains keep                                               # output: <out_dir>/2W83.cif
+  test ./prank.sh transform reduce-to-chains  -f distro/test_data/2W83.cif     -chains keep -out_format pdb.gz                            # output: <out_dir>/2W83.pdb.gz
+  test ./prank.sh transform reduce-to-chains  -f distro/test_data/2W83.cif     -chains all                                                # output: <out_dir>/2W83_all.cif
+  test ./prank.sh transform reduce-to-chains  -f distro/test_data/2W83.cif     -chains A    -out_format keep                              # output: <out_dir>/2W83_A.cif
+  test ./prank.sh transform reduce-to-chains  -f distro/test_data/2W83.cif.gz  -chains A    -out_format pdb.gz                            # output: <out_dir>/2W83_A.pdb.gz
+  test ./prank.sh transform reduce-to-chains  -f distro/test_data/2W83.pdb.gz  -chains A,B  -out_format cif                               # output: <out_dir>/2W83_A,B.cif
+
+  test ./prank.sh transform reduce-to-chains  -f distro/test_data/1fbl.cif     -chains A
+  test ./prank.sh transform reduce-to-chains  -f distro/test_data/1fbl.pdb     -chains A
+  test ./prank.sh transform reduce-to-chains  -f distro/test_data/1fbl.cif.gz  -chains A,B
+  test ./prank.sh transform reduce-to-chains  -f distro/test_data/1fbl.cif.gz  -chains A,B  -out_file distro/test_output/1fbl_A,B.cif.gz
+  test ./prank.sh transform reduce-to-chains  -f distro/test_data/1fbl.cif     -chains keep
+  test ./prank.sh transform reduce-to-chains  -f distro/test_data/1fbl.cif     -chains keep -out_format pdb.gz
+  test ./prank.sh transform reduce-to-chains  -f distro/test_data/1fbl.cif     -chains all
+  test ./prank.sh transform reduce-to-chains  -f distro/test_data/1fbl.cif     -chains A    -out_format keep
+  test ./prank.sh transform reduce-to-chains  -f distro/test_data/1fbl.cif.gz  -chains A    -out_format pdb.gz
+  test ./prank.sh transform reduce-to-chains  -f distro/test_data/1fbl.pdb.gz  -chains A,B  -out_format cif
+  
+}
+
 classifiers() {

    title TRAIN/EVAL USING DIFFERENT CLASSIFIERS
@@ -283,6 +311,8 @@ feature_importances() {
    test ./prank.sh traineval -t chen11-fpocket.ds -e joined.ds -c config/train-default  -feature_importances 1 -classifier FasterForest2    -label FF2 -loop 1 -cache_datasets 0  -out_subdir TEST/IMPORTANCES
 }

+
+
 ###################################################################################################################


--- a/misc/tutorials/hidden-commands.md
+++ b/misc/tutorials/hidden-commands.md
@@ -60,7 +60,7 @@ Analyze a dataset with an explicitly specified residue labeling.
 ## Reduce structure to chains

 ~~~sh
-./prank.sh analyze reduce-to-chains -f <structure_file> -chains <chain_names> -out_format <format_file_extension>
+./prank.sh transform reduce-to-chains -f <structure_file> -chains <chain_names> -out_format <format_file_extension> -out_file <file_name>
 ~~~
 * `-f <>` required, structure file in one of the formats `pdb|pdb.gz|cif|cif.gz`
 * `-chains` required, comma-separated list of chain names, wildcards: `keep`, `all`
@@ -69,21 +69,24 @@ Analyze a dataset with an explicitly specified residue labeling.
  * `*` is not the same as keeping structure as is, but runs the reduction procedure with all the chains, useful for debugging
 * `-out_format` optional, default value is `keep` -- use the same format as the input 
  * possible values: `keep|pdb|pdb.gz|cif|cif.gz`
-
+* `-out_file` optional, output structure file name, path relative to the shell working directory
+  * if specified, the reduced structure is saved under the specified name and no other output is produced
+  * if not specified, default name is generated (see examples) and file is saved in the output directory specified with parameters `-o`, `-output_base_dir`, `-out_subdir`
     
 Examples:
 ~~~sh
-./prank.sh analyze reduce-to-chains  -f 2W83.cif     -chains A                         # output file: 2W83_A.cif
-./prank.sh analyze reduce-to-chains  -f 2W83.cif.gz  -chains A,B                       # output file: 2W83_A,B.cif.gz 
-./prank.sh analyze reduce-to-chains  -f 2W83.cif     -chains keep                      # output file: 2W83.cif
-./prank.sh analyze reduce-to-chains  -f 2W83.cif     -chains keep  -out_format pdb.gz  # output file: 2W83.pdb.gz
-./prank.sh analyze reduce-to-chains  -f 2W83.cif     -chains all                       # output file: 2W83_all.cif
-./prank.sh analyze reduce-to-chains  -f 2W83.cif     -chains A     -out_format keep    # output file: 2W83_A.cif
-./prank.sh analyze reduce-to-chains  -f 2W83.cif.gz  -chains A     -out_format pdb.gz  # output file: 2W83_A.pdb.gz
-./prank.sh analyze reduce-to-chains  -f 2W83.pdb.gz  -chains A,B   -out_format cif     # output file: 2W83_A,B.cif
+./prank.sh transform reduce-to-chains -f distro/test_data/2W83.cif    -chains A                                                 # output: <out_dir>/2W83_A.cif
+./prank.sh transform reduce-to-chains -f distro/test_data/2W83.pdb    -chains A                                                 # output: <out_dir>/2W83_A.pdb
+./prank.sh transform reduce-to-chains -f distro/test_data/2W83.cif.gz -chains A,B                                               # output: <out_dir>/2W83_A,B.cif.gz
+./prank.sh transform reduce-to-chains -f distro/test_data/2W83.cif.gz -chains A,B  -out_file distro/test_output/2W83_A,B.cif.gz # output: distro/test_output/2W83_A,B.cif.gz
+./prank.sh transform reduce-to-chains -f distro/test_data/2W83.cif    -chains keep                                              # output: <out_dir>/2W83.cif
+./prank.sh transform reduce-to-chains -f distro/test_data/2W83.cif    -chains keep -out_format pdb.gz                           # output: <out_dir>/2W83.pdb.gz
+./prank.sh transform reduce-to-chains -f distro/test_data/2W83.cif    -chains all                                               # output: <out_dir>/2W83_all.cif
+./prank.sh transform reduce-to-chains -f distro/test_data/2W83.cif    -chains A    -out_format keep                             # output: <out_dir>/2W83_A.cif
+./prank.sh transform reduce-to-chains -f distro/test_data/2W83.cif.gz -chains A    -out_format pdb.gz                           # output: <out_dir>/2W83_A.pdb.gz
+./prank.sh transform reduce-to-chains -f distro/test_data/2W83.pdb.gz -chains A,B  -out_format cif                              # output: <out_dir>/2W83_A,B.cif
 ~~~

-
 ## Print
            

--- a/src/main/groovy/cz/siret/prank/domain/Protein.groovy
+++ b/src/main/groovy/cz/siret/prank/domain/Protein.groovy
@@ -369,10 +369,10 @@ class Protein implements Parametrized {
    String saveToPdbFile(String fileName, boolean compressed = false) {
        if (compressed) {
            fileName += ".gz"
-            Futils.writeGzip fileName, structure.toPDB()
-        } else {
-            Futils.writeFile fileName, structure.toPDB()
        }
+
+        PdbUtils.saveToFile(structure, "pdb", fileName, compressed)
+
        return fileName
    }

--- a/src/main/groovy/cz/siret/prank/program/Main.groovy
+++ b/src/main/groovy/cz/siret/prank/program/Main.groovy
@@ -9,6 +9,7 @@ import cz.siret.prank.program.params.Params
 import cz.siret.prank.program.routines.Routine
 import cz.siret.prank.program.routines.analyze.AnalyzeRoutine
 import cz.siret.prank.program.routines.analyze.PrintRoutine
+import cz.siret.prank.program.routines.analyze.TransformRoutine
 import cz.siret.prank.program.routines.predict.PredictResiduesRoutine
 import cz.siret.prank.program.routines.predict.PredictRoutine
 import cz.siret.prank.program.routines.predict.RescoreRoutine
@@ -339,6 +340,10 @@ class Main implements Parametrized, Writable {
        new AnalyzeRoutine(args, this).execute()
    }

+    /** Handles the 'transform' command: delegates to TransformRoutine, which picks the sub-command from args. */
+    private runTransform() {
+        new TransformRoutine(args, this).execute()
+    }
+
    private runPrint() {
        new PrintRoutine(args, this).execute()
    }
@@ -393,6 +398,8 @@ class Main implements Parametrized, Writable {
                break
            case 'analyze':       runAnalyze()
                break
+            case 'transform':     runTransform()
+                break
            case 'print':         runPrint()
                break
            case 'run':           runExperiment(args.unnamedArgs[0])
--- a/src/main/groovy/cz/siret/prank/program/params/Params.groovy
+++ b/src/main/groovy/cz/siret/prank/program/params/Params.groovy
@@ -909,7 +909,7 @@ class Params {

    /**
     * Timestamp that will be added as a prefix to each message printed to stdout ("" = no timestamp)
-     * Example: "yyyy.MM.dd HHmm:"
+     * Example: "yyyy.MM.dd HH:mm:"
     */
    @RuntimeParam
    String stdout_timestamp = ""
@@ -1185,6 +1185,20 @@ class Params {
    @ModelParam
    boolean feat_csv_ignore_missing = false

+
+    /**
+     * Parameters of the transform reduce-to-chains command (this one and the two that follow).
+     * chains:     comma-separated list of chain names, or a wildcard: "keep" (do not reduce) or "all"
+     * out_format: keep|pdb|pdb.gz|cif|cif.gz, where "keep" means the same format as the input file
+     * out_file:   explicit output file path; if null, a name is generated and the file goes to the output directory
+     */
+    @RuntimeParam
+    String chains = "keep"
+
+    @RuntimeParam
+    String out_format = "keep"
+
+    @RuntimeParam
+    String out_file = null
+
+
 //===========================================================================================================//

    /**
--- a/src/main/groovy/cz/siret/prank/program/routines/analyze/TransformRoutine.groovy
+++ b/src/main/groovy/cz/siret/prank/program/routines/analyze/TransformRoutine.groovy
@@ -0,0 +1,158 @@
+package cz.siret.prank.program.routines.analyze
+
+import com.google.common.base.Splitter
+import com.google.common.collect.ImmutableMap
+import cz.siret.prank.domain.*
+import cz.siret.prank.domain.labeling.*
+import cz.siret.prank.domain.loaders.LoaderParams
+import cz.siret.prank.export.FastaExporter
+import cz.siret.prank.geom.Atoms
+import cz.siret.prank.geom.Struct
+import cz.siret.prank.program.Main
+import cz.siret.prank.program.PrankException
+import cz.siret.prank.program.rendering.PymolRenderer
+import cz.siret.prank.program.rendering.RenderingModel
+import cz.siret.prank.program.routines.Routine
+import cz.siret.prank.utils.BinCounter
+import cz.siret.prank.utils.CmdLineArgs
+import cz.siret.prank.utils.Futils
+import cz.siret.prank.utils.PdbUtils
+import cz.siret.prank.utils.Sutils
+import groovy.transform.CompileStatic
+import groovy.util.logging.Slf4j
+import org.biojava.nbio.structure.ResidueNumber
+import org.biojava.nbio.structure.Structure
+
+import static cz.siret.prank.geom.SecondaryStructureUtils.assignSecondaryStructure
+import static cz.siret.prank.utils.Cutils.newSynchronizedList
+import static cz.siret.prank.utils.Formatter.format
+import static cz.siret.prank.utils.Futils.mkdirs
+import static cz.siret.prank.utils.Futils.writeFile
+
+/**
+ * Tools for transforming structure files (currently only reduce-to-chains).
+ * Routine with sub-commands.
+ */
+@Slf4j
+@CompileStatic
+class TransformRoutine extends Routine {
+
+    String subCommand
+    String label
+    Dataset dataset
+    CmdLineArgs args
+
+    /**
+     * Takes the sub-command name from the first unnamed argument, checks it against commandRegister,
+     * then loads the dataset (or single file) through main and derives the label and outdir from it.
+     *
+     * @throws PrankException if the sub-command is not present in commandRegister
+     */
+    TransformRoutine(CmdLineArgs args, Main main) {
+        super(null)
+
+        this.args = args
+
+        // pop the sub-command first so that the remaining unnamed args are what loadDatasetOrFile() sees
+        subCommand = args.popFirstUnnamedArg() // next if present should be dataset
+        if (!commandRegister.containsKey(subCommand)) {
+            write "Invalid transform sub-command '$subCommand'! Available commands: " + commandRegister.keySet()
+            throw new PrankException("Invalid command.")
+        }
+
+        dataset = main.loadDatasetOrFile()
+
+        // label is built from the sub-command and dataset label, and is passed to findOutdir to obtain outdir
+        label = "transform_" + subCommand + "_" + dataset.label
+        outdir = main.findOutdir(label)
+    }
+
+    /**
+     * Announces the selected sub-command and invokes the closure registered for it.
+     */
+    void execute() {
+        write "executing transform $subCommand command"
+
+        Closure command = commandRegister.get(subCommand)
+        command.call()
+    }
+
+ //===========================================================================================================//
+ // Sub-Commands
+ //===========================================================================================================//
+
+    /**
+     * Maps sub-command name (first unnamed command line argument) to the closure that implements it.
+     * The constructor validates the sub-command against the keys, execute() calls the matching closure.
+     */
+    final Map<String, Closure> commandRegister = ImmutableMap.copyOf([
+        "reduce-to-chains" : { cmdReduceToChains() }
+    ])
+
+//===========================================================================================================//
+
+    /**
+     * Builds one printable label per chain of the structure, in the order of structure.chains.
+     * Label format: "<author_id>(<mmcif_id>)"
+     */
+    private List<String> chainLabels(Structure structure) {
+        List<String> labels = []
+        for (chain in structure.chains) {
+            labels.add(Struct.getAuthorId(chain) + "(" + Struct.getMmcifId(chain) + ")")
+        }
+        return labels
+    }
+
+    /**
+     * Implements the "reduce-to-chains" sub-command.
+     *
+     * Loads the structure given by -f, reduces it to the chains selected by the chains param
+     * ("keep" = no reduction, "all" = reduction with all author chain ids, otherwise a list split on ",")
+     * and saves it in the format selected by the out_format param ("keep" = derived from the input file name).
+     *
+     * Output location: the out_file param if set, otherwise outdir + "/" + generated name, where the generated
+     * name is the input base name, an optional "_<chains>" suffix and the output extension.
+     * Params are written to outdir only when the file name is generated.
+     *
+     * @throws PrankException if -f is missing or out_format has an unsupported value
+     */
+    private void cmdReduceToChains() {
+        String file = args.get("f")
+        String outFormatParam = params.out_format
+        String outFileParam = params.out_file
+        String chainsParam = params.chains
+
+        // Fail with a readable message instead of passing a null path to the file utilities below
+        // (e.g. when a dataset was given as an unnamed argument instead of -f).
+        if (file == null) {
+            throw new PrankException("Structure file must be specified with the -f parameter.")
+        }
+
+        def validVals = ["keep", "pdb", "pdb.gz", "cif", "cif.gz"]
+        if (!(outFormatParam in validVals)) {
+            throw new PrankException("Invalid value of out_format param: '$outFormatParam'. Valid values: $validVals")
+        }
+
+        write "processing file [${Futils.absPath(file)}]"
+
+        Structure structure = PdbUtils.loadFromFile(file)
+        String baseFileName = Futils.baseName(file)
+        String outFileBaseName // without extension
+
+        write "chains: " + chainLabels(structure)
+        write "atoms: " + Atoms.allFromStructure(structure).count
+
+        if (chainsParam == "keep") {
+            write "keeping the structure as is / not reducing to chains"
+            outFileBaseName = baseFileName
+        } else {
+            List<String> newChains
+            if (chainsParam == "all") {
+                write "selecting all the chains"
+                // unique sorted author ids of all chains; only needed for this wildcard
+                newChains = structure.chains.collect { Struct.getAuthorId(it) }.toUnique().toSorted()
+                outFileBaseName = baseFileName + "_all"
+            } else {
+                newChains = Sutils.split(chainsParam, ",")
+                outFileBaseName = baseFileName + "_" + newChains.join(",")
+            }
+
+            write "reducing to chains: " + newChains
+
+            structure = PdbUtils.reduceStructureToChains(structure, newChains)
+            write "chains (after reduction): " + chainLabels(structure)
+            write "atoms (after reduction): " + Atoms.allFromStructure(structure).count
+        }
+
+        boolean compress
+        String outFormat
+        String outExt
+        if (outFormatParam == "keep") {
+            compress = Futils.isCompressed(file)
+            String inputExt = Futils.realExtension(file)
+            outExt = inputExt + ((compress) ? ".gz" : "")
+            // input with the "ent" extension is written in PDB format; the generated name keeps the extension
+            outFormat = (inputExt == "ent") ? "pdb" : inputExt
+        } else {
+            compress = Futils.isCompressed(outFormatParam)
+            outFormat = Sutils.removeSuffix(outFormatParam, ".gz")
+            outExt = outFormat + ((compress) ? ".gz" : "")
+        }
+
+        String outFilePath
+        if (outFileParam != null) {
+            outFilePath = outFileParam
+            // An explicit out_file may point into a directory that does not exist yet; create it so the
+            // write below cannot fail on a missing parent. No-op when the directory is already there.
+            File parentDir = new File(outFilePath).absoluteFile.parentFile
+            if (parentDir != null) {
+                parentDir.mkdirs()
+            }
+        } else {
+            mkdirs(outdir)
+            writeParams(outdir)
+            String outFileName = outFileBaseName + "." + outExt
+            outFilePath = outdir + "/" + outFileName
+        }
+
+        write "Output file: " + Futils.absPath(outFilePath)
+
+        PdbUtils.saveToFile(structure, outFormat, outFilePath, compress)
+    }
+
+}
--- a/src/main/groovy/cz/siret/prank/utils/PdbUtils.groovy
+++ b/src/main/groovy/cz/siret/prank/utils/PdbUtils.groovy
@@ -110,6 +110,29 @@ class PdbUtils {
        return struc
    }

+    /**
+     * Serializes the structure and writes it to a file.
+     *
+     * @param structure structure to save
+     * @param format "cif" selects mmCIF output, any other value produces PDB output
+     * @param fileName path of the file to write; used as is, no extension is appended
+     * @param compressed if true the content is written with Futils.writeGzip, otherwise with Futils.writeFile
+     * @return file name used (the fileName argument)
+     */
+    static String saveToFile(Structure structure, String format, String fileName, boolean compressed = false) {
+        String serialized = (format == "cif") ? structure.toMMCIF() : structure.toPDB()
+
+        if (!compressed) {
+            Futils.writeFile fileName, serialized
+            return fileName
+        }
+
+        Futils.writeGzip fileName, serialized
+        return fileName
+    }
+
+//===========================================================================================================//

    static String correctResidueCode(String residueCode) {
        //MSE is only found as a molecular replacement for MET