add implementation of transform/reduce-to-chains command

This commit is contained in:
rdk
2021-12-12 11:07:11 +01:00
parent 253faf593f
commit 637cfe7998
10 changed files with 250 additions and 15 deletions

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@@ -262,6 +262,34 @@ analyze() {
}
transform() {

    title TRANSFORM COMMANDS

    # Runs the same set of reduce-to-chains scenarios for every test structure.
    # Unless -out_file is given, the result is written to <out_dir> under a generated name
    # (shown in the trailing comments, <id> stands for the structure id).
    local id
    for id in 2W83 1fbl; do
        test ./prank.sh transform reduce-to-chains -f "distro/test_data/${id}.cif" -chains A                          # output: <out_dir>/<id>_A.cif
        test ./prank.sh transform reduce-to-chains -f "distro/test_data/${id}.pdb" -chains A                          # output: <out_dir>/<id>_A.pdb
        test ./prank.sh transform reduce-to-chains -f "distro/test_data/${id}.cif.gz" -chains A,B                     # output: <out_dir>/<id>_A,B.cif.gz
        test ./prank.sh transform reduce-to-chains -f "distro/test_data/${id}.cif.gz" -chains A,B -out_file "distro/test_output/${id}_A,B.cif.gz"   # output: distro/test_output/<id>_A,B.cif.gz
        test ./prank.sh transform reduce-to-chains -f "distro/test_data/${id}.cif" -chains keep                       # output: <out_dir>/<id>.cif
        test ./prank.sh transform reduce-to-chains -f "distro/test_data/${id}.cif" -chains keep -out_format pdb.gz    # output: <out_dir>/<id>.pdb.gz
        test ./prank.sh transform reduce-to-chains -f "distro/test_data/${id}.cif" -chains all                        # output: <out_dir>/<id>_all.cif
        test ./prank.sh transform reduce-to-chains -f "distro/test_data/${id}.cif" -chains A -out_format keep         # output: <out_dir>/<id>_A.cif
        test ./prank.sh transform reduce-to-chains -f "distro/test_data/${id}.cif.gz" -chains A -out_format pdb.gz    # output: <out_dir>/<id>_A.pdb.gz
        test ./prank.sh transform reduce-to-chains -f "distro/test_data/${id}.pdb.gz" -chains A,B -out_format cif     # output: <out_dir>/<id>_A,B.cif
    done
}
classifiers() {
title TRAIN/EVAL USING DIFFERENT CLASSIFIERS
@@ -283,6 +311,8 @@ feature_importances() {
test ./prank.sh traineval -t chen11-fpocket.ds -e joined.ds -c config/train-default -feature_importances 1 -classifier FasterForest2 -label FF2 -loop 1 -cache_datasets 0 -out_subdir TEST/IMPORTANCES
}
###################################################################################################################

View File

@@ -60,7 +60,7 @@ Analyze a dataset with an explicitly specified residue labeling.
## Reduce structure to chains
~~~sh
./prank.sh analyze reduce-to-chains -f <structure_file> -chains <chain_names> -out_format <format_file_extension>
./prank.sh transform reduce-to-chains -f <structure_file> -chains <chain_names> -out_format <format_file_extension> -out_file <file_name>
~~~
* `-f <>` required, structure file in one of the formats `pdb|pdb.gz|cif|cif.gz`
* `-chains` required, comma-separated list of chain names, wildcards: `keep`, `all`
@@ -69,21 +69,24 @@ Analyze a dataset with an explicitly specified residue labeling.
* `*` is not the same as keeping structure as is, but runs the reduction procedure with all the chains, useful for debugging
* `-out_format` optional, default value is `keep` -- use the same format as the input
* possible values: `keep|pdb|pdb.gz|cif|cif.gz`
* `-out_file` optional, output structure file name, path relative to the shell working directory
* if specified, the reduced structure is saved under the specified name and no other output is produced
* if not specified, default name is generated (see examples) and file is saved in the output directory specified with parameters `-o`, `-output_base_dir`, `-out_subdir`
Examples:
~~~sh
./prank.sh analyze reduce-to-chains -f 2W83.cif -chains A # output file: 2W83_A.cif
./prank.sh analyze reduce-to-chains -f 2W83.cif.gz -chains A,B # output file: 2W83_A,B.cif.gz
./prank.sh analyze reduce-to-chains -f 2W83.cif -chains keep # output file: 2W83.cif
./prank.sh analyze reduce-to-chains -f 2W83.cif -chains keep -out_format pdb.gz # output file: 2W83.pdb.gz
./prank.sh analyze reduce-to-chains -f 2W83.cif -chains all # output file: 2W83_all.cif
./prank.sh analyze reduce-to-chains -f 2W83.cif -chains A -out_format keep # output file: 2W83_A.cif
./prank.sh analyze reduce-to-chains -f 2W83.cif.gz -chains A -out_format pdb.gz # output file: 2W83_A.pdb.gz
./prank.sh analyze reduce-to-chains -f 2W83.pdb.gz -chains A,B -out_format cif # output file: 2W83_A,B.cif
./prank.sh transform reduce-to-chains -f distro/test_data/2W83.cif -chains A # output: <out_dir>/2W83_A.cif
./prank.sh transform reduce-to-chains -f distro/test_data/2W83.pdb -chains A # output: <out_dir>/2W83_A.pdb
./prank.sh transform reduce-to-chains -f distro/test_data/2W83.cif.gz -chains A,B # output: <out_dir>/2W83_A,B.cif.gz
./prank.sh transform reduce-to-chains -f distro/test_data/2W83.cif.gz -chains A,B -out_file distro/test_output/2W83_A,B.cif.gz # output: distro/test_output/2W83_A,B.cif.gz
./prank.sh transform reduce-to-chains -f distro/test_data/2W83.cif -chains keep # output: <out_dir>/2W83.cif
./prank.sh transform reduce-to-chains -f distro/test_data/2W83.cif -chains keep -out_format pdb.gz # output: <out_dir>/2W83.pdb.gz
./prank.sh transform reduce-to-chains -f distro/test_data/2W83.cif -chains all # output: <out_dir>/2W83_all.cif
./prank.sh transform reduce-to-chains -f distro/test_data/2W83.cif -chains A -out_format keep # output: <out_dir>/2W83_A.cif
./prank.sh transform reduce-to-chains -f distro/test_data/2W83.cif.gz -chains A -out_format pdb.gz # output: <out_dir>/2W83_A.pdb.gz
./prank.sh transform reduce-to-chains -f distro/test_data/2W83.pdb.gz -chains A,B -out_format cif # output: <out_dir>/2W83_A,B.cif
~~~
## Print

View File

@@ -369,10 +369,10 @@ class Protein implements Parametrized {
/**
 * Saves the structure of this protein to a file in the PDB format.
 *
 * The actual writing is delegated to PdbUtils.saveToFile so the file is written exactly once.
 *
 * @param fileName target file name; ".gz" is appended when compressed is true
 * @param compressed if true, the content is written gzip-compressed
 * @return file name that was actually used (including the appended ".gz" if any)
 */
String saveToPdbFile(String fileName, boolean compressed = false) {
    if (compressed) {
        fileName += ".gz"
    }
    PdbUtils.saveToFile(structure, "pdb", fileName, compressed)
    return fileName
}

View File

@@ -9,6 +9,7 @@ import cz.siret.prank.program.params.Params
import cz.siret.prank.program.routines.Routine
import cz.siret.prank.program.routines.analyze.AnalyzeRoutine
import cz.siret.prank.program.routines.analyze.PrintRoutine
import cz.siret.prank.program.routines.analyze.TransformRoutine
import cz.siret.prank.program.routines.predict.PredictResiduesRoutine
import cz.siret.prank.program.routines.predict.PredictRoutine
import cz.siret.prank.program.routines.predict.RescoreRoutine
@@ -339,6 +340,10 @@ class Main implements Parametrized, Writable {
new AnalyzeRoutine(args, this).execute()
}
/**
 * Handles the 'transform' command: builds a TransformRoutine from the command line arguments and executes it.
 */
private runTransform() {
    TransformRoutine transformRoutine = new TransformRoutine(args, this)
    transformRoutine.execute()
}
/**
 * Handles the 'print' command: builds a PrintRoutine from the command line arguments and executes it.
 */
private runPrint() {
    PrintRoutine printRoutine = new PrintRoutine(args, this)
    printRoutine.execute()
}
@@ -393,6 +398,8 @@ class Main implements Parametrized, Writable {
break
case 'analyze': runAnalyze()
break
case 'transform': runTransform()
break
case 'print': runPrint()
break
case 'run': runExperiment(args.unnamedArgs[0])

View File

@@ -909,7 +909,7 @@ class Params {
/**
* Timestamp that will be added as a prefix to each message printed to stdout ("" = no timestamp)
* Example: "yyyy.MM.dd HHmm:"
* Example: "yyyy.MM.dd HH:mm:"
*/
@RuntimeParam
String stdout_timestamp = ""
@@ -1185,6 +1185,20 @@ class Params {
@ModelParam
boolean feat_csv_ignore_missing = false
/**
 * Chains selected by the transform reduce-to-chains command.
 * Comma separated list of chain names or one of the special values:
 * "keep" (structure is kept as is, no reduction is run) and "all" (reduction is run with all the chains).
 */
@RuntimeParam
String chains = "keep"
/**
 * Output format of the transform reduce-to-chains command.
 * Possible values: keep|pdb|pdb.gz|cif|cif.gz, where "keep" means that format and compression
 * are derived from the input file.
 */
@RuntimeParam
String out_format = "keep"
/**
 * Output file path for the transform reduce-to-chains command.
 * If null, a default file name is generated and the file is saved in the output directory.
 */
@RuntimeParam
String out_file = null
//===========================================================================================================//
/**

View File

@@ -0,0 +1,158 @@
package cz.siret.prank.program.routines.analyze
import com.google.common.base.Splitter
import com.google.common.collect.ImmutableMap
import cz.siret.prank.domain.*
import cz.siret.prank.domain.labeling.*
import cz.siret.prank.domain.loaders.LoaderParams
import cz.siret.prank.export.FastaExporter
import cz.siret.prank.geom.Atoms
import cz.siret.prank.geom.Struct
import cz.siret.prank.program.Main
import cz.siret.prank.program.PrankException
import cz.siret.prank.program.rendering.PymolRenderer
import cz.siret.prank.program.rendering.RenderingModel
import cz.siret.prank.program.routines.Routine
import cz.siret.prank.utils.BinCounter
import cz.siret.prank.utils.CmdLineArgs
import cz.siret.prank.utils.Futils
import cz.siret.prank.utils.PdbUtils
import cz.siret.prank.utils.Sutils
import groovy.transform.CompileStatic
import groovy.util.logging.Slf4j
import org.biojava.nbio.structure.ResidueNumber
import org.biojava.nbio.structure.Structure
import static cz.siret.prank.geom.SecondaryStructureUtils.assignSecondaryStructure
import static cz.siret.prank.utils.Cutils.newSynchronizedList
import static cz.siret.prank.utils.Formatter.format
import static cz.siret.prank.utils.Futils.mkdirs
import static cz.siret.prank.utils.Futils.writeFile
/**
 * Various tools for transforming structure files.
 * Routine with sub-commands: the sub-command name is taken from the first unnamed command line argument
 * and looked up in {@link #commandRegister}.
 */
@Slf4j
@CompileStatic
class TransformRoutine extends Routine {

    String subCommand
    String label
    Dataset dataset
    CmdLineArgs args

    /**
     * @param args command line arguments; the first unnamed argument is consumed as the sub-command name
     * @param main used to load the dataset (or single file) and to determine the output directory
     * @throws PrankException if the sub-command is not registered
     */
    TransformRoutine(CmdLineArgs args, Main main) {
        super(null)
        this.args = args

        subCommand = args.popFirstUnnamedArg()  // next if present should be dataset
        if (!commandRegister.containsKey(subCommand)) {
            write "Invalid transform sub-command '$subCommand'! Available commands: " + commandRegister.keySet()
            throw new PrankException("Invalid command.")
        }

        dataset = main.loadDatasetOrFile()

        label = "transform_" + subCommand + "_" + dataset.label
        outdir = main.findOutdir(label)
    }

    /**
     * Runs the sub-command selected in the constructor.
     */
    void execute() {
        write "executing transform $subCommand command"

        commandRegister.get(subCommand).call()
    }

//===========================================================================================================//
// Sub-Commands
//===========================================================================================================//

    /** Maps sub-command names to their implementations. */
    final Map<String, Closure> commandRegister = ImmutableMap.copyOf([
        "reduce-to-chains" : { cmdReduceToChains() }
    ])

//===========================================================================================================//

    /**
     * chain label = "<author_id>(<mmcif_id>)"
     */
    private List<String> chainLabels(Structure structure) {
        return structure.chains.collect { Struct.getAuthorId(it) + "(" + Struct.getMmcifId(it) + ")" }
    }

    /**
     * Loads the structure given by the -f argument, optionally reduces it to the chains selected
     * by the 'chains' param and saves it in the format selected by the 'out_format' param.
     *
     * The result is written to the path given by the 'out_file' param if it is set, otherwise
     * to a generated file name inside the output directory.
     *
     * @throws PrankException if -f is missing or out_format has an invalid value
     */
    private void cmdReduceToChains() {
        String file = args.get("f")
        String outFormatParam = params.out_format
        String outFileParam = params.out_file
        String chainsParam = params.chains

        // -f is required by this command; fail with a clear message instead of a null path error later
        if (!file) {
            throw new PrankException("Structure file must be specified with the -f parameter.")
        }

        def validVals = ["keep", "pdb", "pdb.gz", "cif", "cif.gz"]
        if (!(outFormatParam in validVals)) {
            throw new PrankException("Invalid value of out_format param: '$outFormatParam'. Valid values: $validVals")
        }

        write "processing file [${Futils.absPath(file)}]"

        Structure structure = PdbUtils.loadFromFile(file)

        String baseFileName = Futils.baseName(file)
        String outFileBaseName // without extension

        List<String> schains = structure.chains.collect { Struct.getAuthorId(it) }.toUnique().toSorted()

        write "chains: " + chainLabels(structure)
        write "atoms: " + Atoms.allFromStructure(structure).count

        if (chainsParam == "keep") {
            write "keeping the structure as is / not reducing to chains"
            outFileBaseName = baseFileName
        } else {
            List<String> newChains
            if (chainsParam == "all") {
                // unlike "keep", the reduction procedure is still run, with all the chains selected
                write "selecting all the chains"
                newChains = schains
                outFileBaseName = baseFileName + "_all"
            } else {
                newChains = Sutils.split(chainsParam, ",")
                outFileBaseName = baseFileName + "_" + newChains.join(",")
            }

            write "reducing to chains: " + newChains

            structure = PdbUtils.reduceStructureToChains(structure, newChains)

            write "chains (after reduction): " + chainLabels(structure)
            write "atoms (after reduction): " + Atoms.allFromStructure(structure).count
        }

        boolean compress
        String outFormat
        String outExt
        if (outFormatParam == "keep") {
            // derive format and compression from the input file
            compress = Futils.isCompressed(file)
            outFormat = Futils.realExtension(file)
            // extension is computed before normalizing the format, so an "ent" input keeps its extension
            outExt = Futils.realExtension(file) + ((compress) ? ".gz" : "")
            if (outFormat == "ent") {
                outFormat = "pdb"
            }
        } else {
            compress = Futils.isCompressed(outFormatParam)
            outFormat = Sutils.removeSuffix(outFormatParam, ".gz")
            outExt = outFormat + ((compress) ? ".gz" : "")
        }

        String outFilePath
        if (outFileParam != null) {
            // explicit output file: nothing else is written
            outFilePath = outFileParam
        } else {
            mkdirs(outdir)
            writeParams(outdir)

            String outFileName = outFileBaseName + "." + outExt
            outFilePath = outdir + "/" + outFileName
        }

        write "Output file: " + Futils.absPath(outFilePath)

        PdbUtils.saveToFile(structure, outFormat, outFilePath, compress)
    }

}

View File

@@ -110,6 +110,29 @@ class PdbUtils {
return struc
}
/**
 * Serializes a structure to text and writes it to a file.
 *
 * @param structure structure to be saved
 * @param format "cif" for mmCIF output; any other value produces PDB output
 * @param fileName name of the target file, used as given (no extension is appended)
 * @param compressed - compress to gz
 * @return file name used
 */
static String saveToFile(Structure structure, String format, String fileName, boolean compressed = false) {
    String serialized = (format == "cif") ? structure.toMMCIF() : structure.toPDB()

    if (!compressed) {
        Futils.writeFile fileName, serialized
        return fileName
    }

    Futils.writeGzip fileName, serialized
    return fileName
}
//===========================================================================================================//
static String correctResidueCode(String residueCode) {
//MSE is only found as a molecular replacement for MET