From 1b7809a66781884701f0342c8c5f1cb48e941149 Mon Sep 17 00:00:00 2001 From: rdk Date: Sat, 9 Nov 2019 17:30:15 +0100 Subject: [PATCH] improved parameter comments, param annotations, cleanup --- .../cz/siret/prank/domain/Dataset.groovy | 4 + .../labeling/ModelBasedResidueLabeler.groovy | 2 +- .../labeling/ResidueBasedPointLabeler.groovy | 2 +- .../domain/labeling/ResidueLabeling.groovy | 4 +- .../domain/labeling/ResidueLabelings.groovy | 2 +- .../features/PrankFeatureExtractor.groovy | 12 - .../prank/features/api/FeatureRegistry.groovy | 4 + .../api/ResidueToAtomicFeatWrapper.groovy | 2 +- .../api/ResidueToSasFeatWrapper.groovy | 2 +- .../api/SasFeatureCalculationContext.groovy | 2 +- .../prank/program/params/ModelParam.groovy | 13 + .../siret/prank/program/params/Params.groovy | 389 +++++++++++++++--- .../prank/program/params/RuntimeParam.groovy | 11 + .../siret/prank/utils/CutoffAtomsCallLog.java | 2 +- .../cz/siret/prank/utils/ProcessRunner.groovy | 2 +- 15 files changed, 372 insertions(+), 81 deletions(-) create mode 100644 src/main/groovy/cz/siret/prank/program/params/ModelParam.groovy create mode 100644 src/main/groovy/cz/siret/prank/program/params/RuntimeParam.groovy diff --git a/src/main/groovy/cz/siret/prank/domain/Dataset.groovy b/src/main/groovy/cz/siret/prank/domain/Dataset.groovy index 8a114098..cae0c7a9 100644 --- a/src/main/groovy/cz/siret/prank/domain/Dataset.groovy +++ b/src/main/groovy/cz/siret/prank/domain/Dataset.groovy @@ -215,6 +215,10 @@ class Dataset implements Parametrized { return getLoader(attributes.get(PARAM_PREDICTION_METHOD), item) } + /** + * Get configured instance of prediction loader. 
+ * @param method LBS prediction method name + */ private PredictionLoader getLoader(String method, Item item) { PredictionLoader res switch (method) { diff --git a/src/main/groovy/cz/siret/prank/domain/labeling/ModelBasedResidueLabeler.groovy b/src/main/groovy/cz/siret/prank/domain/labeling/ModelBasedResidueLabeler.groovy index 1909cfd3..b38d4cb8 100644 --- a/src/main/groovy/cz/siret/prank/domain/labeling/ModelBasedResidueLabeler.groovy +++ b/src/main/groovy/cz/siret/prank/domain/labeling/ModelBasedResidueLabeler.groovy @@ -18,7 +18,7 @@ import static cz.siret.prank.utils.Formatter.format import static cz.siret.prank.utils.Formatter.formatNumbers /** - * (not intended o be reused with mode proteins) + * (not intended to be reused with mode proteins) */ @Slf4j @CompileStatic diff --git a/src/main/groovy/cz/siret/prank/domain/labeling/ResidueBasedPointLabeler.groovy b/src/main/groovy/cz/siret/prank/domain/labeling/ResidueBasedPointLabeler.groovy index 10f73b64..a43e0751 100644 --- a/src/main/groovy/cz/siret/prank/domain/labeling/ResidueBasedPointLabeler.groovy +++ b/src/main/groovy/cz/siret/prank/domain/labeling/ResidueBasedPointLabeler.groovy @@ -8,7 +8,7 @@ import groovy.util.logging.Slf4j import org.biojava.nbio.structure.Atom /** - * + * Labels points according to nearest residue. */ @Slf4j @CompileStatic diff --git a/src/main/groovy/cz/siret/prank/domain/labeling/ResidueLabeling.groovy b/src/main/groovy/cz/siret/prank/domain/labeling/ResidueLabeling.groovy index 03ca972c..d3a9d7ef 100644 --- a/src/main/groovy/cz/siret/prank/domain/labeling/ResidueLabeling.groovy +++ b/src/main/groovy/cz/siret/prank/domain/labeling/ResidueLabeling.groovy @@ -7,7 +7,7 @@ import groovy.transform.CompileStatic import javax.annotation.Nullable /** - * Holds partiticular assignment of labels to a set of residues. + * Holds particular assignment of labels to a set of residues. 
*/ @CompileStatic class ResidueLabeling { @@ -15,7 +15,7 @@ class ResidueLabeling { private List> labeledResidues private Map> labeledMap - ResidueLabeling(List labeledResidues) { + ResidueLabeling(List> labeledResidues) { this.labeledResidues = labeledResidues this.labeledMap = Maps.uniqueIndex(labeledResidues, { it.residue.key }) } diff --git a/src/main/groovy/cz/siret/prank/domain/labeling/ResidueLabelings.groovy b/src/main/groovy/cz/siret/prank/domain/labeling/ResidueLabelings.groovy index 950b8d08..9eb61792 100644 --- a/src/main/groovy/cz/siret/prank/domain/labeling/ResidueLabelings.groovy +++ b/src/main/groovy/cz/siret/prank/domain/labeling/ResidueLabelings.groovy @@ -23,7 +23,7 @@ import static cz.siret.prank.utils.Futils.mkdirs import static cz.siret.prank.utils.Futils.writeFile /** - * Logic for calculating residue lebalings during P2Rank prediction + * Logic for calculating residue labelings during P2Rank prediction */ @Slf4j class ResidueLabelings implements Parametrized { diff --git a/src/main/groovy/cz/siret/prank/features/PrankFeatureExtractor.groovy b/src/main/groovy/cz/siret/prank/features/PrankFeatureExtractor.groovy index e9ad3651..bb1154c9 100644 --- a/src/main/groovy/cz/siret/prank/features/PrankFeatureExtractor.groovy +++ b/src/main/groovy/cz/siret/prank/features/PrankFeatureExtractor.groovy @@ -37,15 +37,9 @@ class PrankFeatureExtractor extends FeatureExtractor impleme List atomTableFeatures List residueTableFeatures - // tied to a protein private PointSampler pocketPointSampler - /** - * if set to true extractorFactory will use zero vectors for unknown residues - * otherwise throws exception (so the whole pocket can be ignored) - */ - private boolean MASK_UNKNOWN_RESIDUES = params.mask_unknown_residues private double NEIGH_CUTOFF_DIST = params.neighbourhood_radius private boolean DO_SMOOTH_REPRESENTATION = params.smooth_representation private double SMOOTHING_CUTOFF_DIST = params.smoothing_radius @@ -112,7 +106,6 @@ class 
PrankFeatureExtractor extends FeatureExtractor impleme return res } - @Override void prepareProteinPrototypeForPockets() { pocketPointSampler = PointSampler.create(protein, trainingExtractor) @@ -143,7 +136,6 @@ class PrankFeatureExtractor extends FeatureExtractor impleme this.protein = protein this.pocket = pocket - this.MASK_UNKNOWN_RESIDUES = proteinPrototype.MASK_UNKNOWN_RESIDUES this.headerAdditionalFeatures = proteinPrototype.headerAdditionalFeatures this.pocketPointSampler = proteinPrototype.pocketPointSampler this.extraFeaturesHeader = proteinPrototype.extraFeaturesHeader @@ -156,10 +148,6 @@ class PrankFeatureExtractor extends FeatureExtractor impleme this.surfaceLayerAtoms = proteinPrototype.surfaceLayerAtoms this.properties = proteinPrototype.properties this.smoothRepresentations = proteinPrototype.smoothRepresentations - - - - } @Override diff --git a/src/main/groovy/cz/siret/prank/features/api/FeatureRegistry.groovy b/src/main/groovy/cz/siret/prank/features/api/FeatureRegistry.groovy index d35ffb42..fd5db8ba 100644 --- a/src/main/groovy/cz/siret/prank/features/api/FeatureRegistry.groovy +++ b/src/main/groovy/cz/siret/prank/features/api/FeatureRegistry.groovy @@ -1,5 +1,9 @@ package cz.siret.prank.features.api + +import cz.siret.prank.features.api.FeatureCalculator +import cz.siret.prank.features.api.ResidueToAtomicFeatWrapper +import cz.siret.prank.features.api.ResidueToSasFeatWrapper import cz.siret.prank.features.implementation.AAIndexFeature import cz.siret.prank.features.implementation.Asa2Feature import cz.siret.prank.features.implementation.AsaFeature diff --git a/src/main/groovy/cz/siret/prank/features/api/ResidueToAtomicFeatWrapper.groovy b/src/main/groovy/cz/siret/prank/features/api/ResidueToAtomicFeatWrapper.groovy index 487cfeec..f398069a 100644 --- a/src/main/groovy/cz/siret/prank/features/api/ResidueToAtomicFeatWrapper.groovy +++ b/src/main/groovy/cz/siret/prank/features/api/ResidueToAtomicFeatWrapper.groovy @@ -5,7 +5,7 @@ import 
cz.siret.prank.domain.Residue import org.biojava.nbio.structure.Atom /** - * + * Maps a residue features to atom feature */ class ResidueToAtomicFeatWrapper extends AtomFeatureCalculator { diff --git a/src/main/groovy/cz/siret/prank/features/api/ResidueToSasFeatWrapper.groovy b/src/main/groovy/cz/siret/prank/features/api/ResidueToSasFeatWrapper.groovy index b328139e..b969dfe0 100644 --- a/src/main/groovy/cz/siret/prank/features/api/ResidueToSasFeatWrapper.groovy +++ b/src/main/groovy/cz/siret/prank/features/api/ResidueToSasFeatWrapper.groovy @@ -10,7 +10,7 @@ import groovy.transform.CompileStatic import org.biojava.nbio.structure.Atom /** - * Mapping Closest Residue to SAS + * Mapping Closest Residue to SAS point */ @CompileStatic class ResidueToSasFeatWrapper extends SasFeatureCalculator { diff --git a/src/main/groovy/cz/siret/prank/features/api/SasFeatureCalculationContext.groovy b/src/main/groovy/cz/siret/prank/features/api/SasFeatureCalculationContext.groovy index 70a2aea5..19acec91 100644 --- a/src/main/groovy/cz/siret/prank/features/api/SasFeatureCalculationContext.groovy +++ b/src/main/groovy/cz/siret/prank/features/api/SasFeatureCalculationContext.groovy @@ -6,7 +6,7 @@ import cz.siret.prank.geom.Atoms import groovy.transform.CompileStatic /** - * Context for calculation of SAS feature. + * Context for calculation of a SAS feature. */ @CompileStatic class SasFeatureCalculationContext { diff --git a/src/main/groovy/cz/siret/prank/program/params/ModelParam.groovy b/src/main/groovy/cz/siret/prank/program/params/ModelParam.groovy new file mode 100644 index 00000000..fd570f0a --- /dev/null +++ b/src/main/groovy/cz/siret/prank/program/params/ModelParam.groovy @@ -0,0 +1,13 @@ +package cz.siret.prank.program.params + +/** + * Marks parameters of the prediction model (including feature extraction params) i.e. algorithm params. 
+ * The notion of "model" here is seen wholesomely as a pocket prediction model = the whole algorithm, which includes feature extraction, classification and aggregation to binding sites. + * + * These are the parameters that must be the same in training and prediction phase. + * + * Currently annotation serves only documentation purposes. + */ +@interface ModelParam { + +} \ No newline at end of file diff --git a/src/main/groovy/cz/siret/prank/program/params/Params.groovy b/src/main/groovy/cz/siret/prank/program/params/Params.groovy index 8626c25b..139e0006 100644 --- a/src/main/groovy/cz/siret/prank/program/params/Params.groovy +++ b/src/main/groovy/cz/siret/prank/program/params/Params.groovy @@ -16,7 +16,6 @@ class Params { public static final Params INSTANCE = new Params() - public static Params getInst() { return INSTANCE } @@ -25,128 +24,195 @@ class Params { * define this if you want dataset program parameters to be evaluated relative to this directory * (set absolute path or path relative to install dir, null defaults to working dir) */ + @RuntimeParam String dataset_base_dir = null /** * all output of the program will be stored in subdirectores of this directory * (set absolute path or path relative to install dir, null defaults to working dir) */ + @RuntimeParam String output_base_dir = null /** * serialized model * (set path relative to install_dir/models/) */ + @RuntimeParam String model = "default.model" /** * Random seed */ + @RuntimeParam int seed = 42 + /** + * Parallel execution (processing datasets in parallel) + */ + @RuntimeParam boolean parallel = true /** * Number of computing threads */ + @RuntimeParam int threads = Runtime.getRuntime().availableProcessors() + 1 /** * Number for threads for generating R plots */ + @RuntimeParam int r_threads = 2 /** - * Number of folds to work on simultaneously + * Number of cross-validation folds to work on simultaneously. 
+ * (Multiplies required memory) */ + @RuntimeParam int crossval_threads = 1 // Math.min(5, Runtime.getRuntime().availableProcessors()) /** * defines witch atoms around the ligand are considered to be part of the pocket * (ligands with longer distance are considered irrelevant floating ligands) */ + @ModelParam // training double ligand_protein_contact_distance = 4 - //== FAETURES + //==[ Features ]=========================================================================================================// + /** + * List of general calculated features + */ + @ModelParam List extra_features = ["protrusion","bfactor"] + /** + * List of features that come directly from atom type table + * see atomic-properties.csv + */ + @ModelParam List atom_table_features = ["apRawValids","apRawInvalids","atomicHydrophobicity"] // "ap5sasaValids","ap5sasaInvalids" + /** + * List of features that come directly from residue table + */ + @ModelParam + List residue_table_features = [] // ['aa5fact1','aa5fact2','aa5fact3','aa5fact4','aa5fact5'] + + /** + * Exponent applied to all atom table features + */ + @ModelParam double atom_table_feat_pow = 2 /** * dummy param to preserve behaviour of older versions + * if true sign of value is reapplied after transformation by atom_table_feat_pow */ + @ModelParam boolean atom_table_feat_keep_sgn = false - List residue_table_features = [] // ['aa5fact1','aa5fact2','aa5fact3','aa5fact4','aa5fact5'] - + /** + * radius for calculation protrusion feature + */ + @ModelParam double protrusion_radius = 10 //===========================================================================================================// - /** * Number of bins for protr_hist feature, must be >=2 */ + @ModelParam int protr_hist_bins = 5 + /** + * Param of protr_hist feature + */ + @ModelParam boolean protr_hist_cumulative = false + /** + * Param of protr_hist feature + */ + @ModelParam boolean protr_hist_relative = false 
-//===========================================================================================================// - /** * Number of bins for Atom Pair distance histogram (pair_hist) feature, must be >=2 */ + @ModelParam int pair_hist_bins = 5 /** * Radius capturing atoms considered in pair_hist feature */ + @ModelParam double pair_hist_radius = 6 /** * smooth vs. sharp binning */ + @ModelParam boolean pair_hist_smooth = false + /** + * apply normalization to histogram + */ + @ModelParam boolean pair_hist_normalize = false /** - * if false only protein exposed atmos are considered + * if false only protein exposed atoms are considered */ + @ModelParam boolean pair_hist_deep = true /** * size of random subsample of atom pairs, 0 = all */ + @ModelParam int pair_hist_subsample_limit = 0 //===========================================================================================================// /** - * conservation parameteres + * Load sequence conservation data */ - boolean load_conservation = false // always load conservation (for stats) + @RuntimeParam + boolean load_conservation = false + + /** + * Pocket scoring algorithm + */ + @ModelParam String score_pockets_by = "p2rank" // possible values: "p2rank", "conservation", "combi" /** - * Conservation exponent for rescoring pockets + * Conservation exponent for re-scoring pockets */ + @ModelParam int conservation_exponent = 1 + /** + * Radius for calculating conservation cloud related features + */ + @ModelParam double conserv_cloud_radius = 10 + /** + * Radius for calculating secondary structure cloud related features + */ + @ModelParam double ss_cloud_radius = 10 /** * Conservation file with this pattern is loaded: * baseName + chainId + "." + origin + ".hom.gz" */ + @RuntimeParam String conservation_origin = "hssp" /** @@ -154,39 +220,46 @@ class Params { * Path relative to dataset directory. 
* if null: look in the same directory as protein file */ + @RuntimeParam String conservation_dir = null /** * Log scores for binding and nonbinding scores to file */ + @RuntimeParam String log_scores_to_file = "" /** * limits how many pocket SAS points are used for scoring (after sorting), 0=unlimited * affects scoring pockets and also residues */ + @ModelParam int score_point_limit = 0 -//===========================================================================================================// - - - //== CLASSIFIERS =================== +//==[ Classifiers ]=========================================================================================================// /** * see ClassifierOption */ + @ModelParam String classifier = "FastRandomForest" /** * see ClassifierOption */ + @ModelParam String inner_classifier = "FastRandomForest" + /** + * see ClassifierOption + */ + @ModelParam int meta_classifier_iterations = 5 /** * works only with classifier "CostSensitive_RF" */ + @ModelParam // training double false_positive_cost = 2 //=== Random Forests ================= @@ -194,118 +267,179 @@ class Params { /** * RandomForest trees */ + @ModelParam // training int rf_trees = 100 /** * RandomForest depth limit, 0=unlimited */ + @ModelParam // training int rf_depth = 0 /** * RandomForest feature subset size for one tree, 0=default(sqrt) */ + @ModelParam // training int rf_features = 0 /** * number of threads used in RandomForest training (0=use value of threads param) */ + @RuntimeParam // training int rf_threads = 0 /** * size of a bag: 1..100% of the dataset */ + @ModelParam // training int rf_bagsize = 100 /** * cutoff for joining ligand atom groups into one ligand */ - double ligand_clustering_distance = 1.7 // ~ covalent bond length + @ModelParam // training + double ligand_clustering_distance = 1.7 // ~= covalent bond length /** * cutoff around ligand that defines positives */ + @ModelParam double positive_point_ligand_distance = 2.5 /** * distance around ligand 
atoms that define ligand induced volume * (for evaluation by some criteria, DSO, ligand coverage...) */ + @ModelParam double ligand_induced_volume_cutoff = 2.5 /** * points between [positive_point_ligand_distance,neutral_point_margin] will be left out form training */ + @ModelParam // training double neutral_points_margin = 5.5 - boolean mask_unknown_residues = true - /** - * chem. properties representation neighbourhood radius in A + * Neighbourhood radius (A) used for calculating most of the features. */ + @ModelParam double neighbourhood_radius = 8 /** - * HETATM groups that are considered cofactor and ignored + * HETATM groups that are ignored (not marked as relevant ligands, e.g because they are cofactors or part of a substrate) */ + @ModelParam // training Set ignore_het_groups = ["HOH","DOD","WAT","NAG","MAN","UNK","GLC","ABA","MPD","GOL","SO4","PO4"] as Set /** - * positive point defining ligand types accepted values: "relevant", "ignored", "small", "distant" + * Which ligand types define positive SAS points. + * accepted values: "relevant", "ignored", "small", "distant" */ + @ModelParam // training List positive_def_ligtypes = ["relevant"] /** - * min. heavy atom count for ligand, other ligands ignored + * Minimal heavy atom count for relevant ligands, other ligands are considered too small and ignored */ + @ModelParam // training int min_ligand_atoms = 5 + /** + * Point sampler for extracting instances for training. + * P2Rank and PRANK use SurfacePointSampler that produces SAS points. + * Others like GridPointSampler are experimental. 
+ */ + @ModelParam String point_sampler = "SurfacePointSampler" /** * multiplier for random point sampling */ + @ModelParam // training int sampling_multiplier = 3 /** * solvent radius for SAS surface */ + @ModelParam double solvent_radius = 1.6 /** - * SAS tessellation (~density) used in pradiction step + * SAS tessellation (~density) used in prediction step */ + @ModelParam int tessellation = 2 /** * SAS tessellation (~density) used in training step */ + @ModelParam // training int train_tessellation = 2 - // for grid and random sampling + /** + * for grid and random sampling + */ + @ModelParam double point_min_distfrom_protein = 2.5 + + /** + * for grid and random sampling + */ + @ModelParam double point_max_distfrom_pocket = 4.5 - /* for GridPointSampler */ + /** + * grid cell size for GridPointSampler + */ + @ModelParam double grid_cell_edge = 2 /** * Restrict training set size, 0=unlimited */ + @RuntimeParam // training int max_train_instances = 0 + /** + * Param of SAS score weighting function (see WeightFun) + */ + @ModelParam double weight_power = 2 + + /** + * Param of SAS score weighting function (see WeightFun) + */ + @ModelParam double weight_sigma = 2.2 + + /** + * Param of SAS score weighting function (see WeightFun) + */ + @ModelParam double weight_dist_param = 4.5 + /** + * Choice of SAS score weighting function (see WeightFun) + */ + @ModelParam String weight_function = "INV" + /** + * If false only single layer of proteins solvent exposed atoms is used for calculating features that are projected from protein atoms to SAS points + */ + @ModelParam boolean deep_surrounding = false /** calculate feature vectors from smooth atom feature representation * (instead of directly from atom properties) */ + @ModelParam boolean smooth_representation = false + /** + * related to smooth_representation + */ + @ModelParam double smoothing_radius = 4.5 /** @@ -313,6 +447,7 @@ class Params { * if true, atom feature vectors are averaged * else they are only summed 
up */ + @ModelParam boolean average_feat_vectors = false /** @@ -320,311 +455,386 @@ class Params { * only applicable when average_feat_vectors=true * <0,1> goes from 'no average, just sum' -> 'full average' */ + @ModelParam double avg_pow = 1 /** * regarding feature projection from atoms to SAS points: calculate weighted average - * (shoud be true by default, kept false for backward compatibility reasons) + * (should be true by default, kept false for backward compatibility reasons) */ + @ModelParam boolean avg_weighted = false /** - * exponent of point ligandabitity score (before adding it to pocket score) + * exponent of point ligandability score (before adding it to pocket score) */ + @ModelParam double point_score_pow = 2 /** - * Binary classifiers produces historgam of scores for class0 and class1 + * Binary classifiers produces histogram of scores for class0 and class1 * if true only score for class1 is considered * makes a difference only if histogram produced by classifier doesn't sum up to 1 */ + @ModelParam boolean use_only_positive_score = true + /** + * If true trained models will not be saved to disk (good for parameter optimization) + */ + @RuntimeParam boolean delete_models = false /** * delete files containing training/evaluation feature vectors */ + @RuntimeParam boolean delete_vectors = true + /** + * check all loaded/calculated vectors for invalid (NaN) values + */ + @RuntimeParam boolean check_vectors = false /** - * collect vectors also from eval dataset (only makes sense if delete_vectors=false) + * collect vectors also from eval dataset (only makes sense in combination with delete_vectors=false) */ + @RuntimeParam boolean collect_eval_vectors = false /** * collect vectors only at the beginning of seed loop routine - * if dataset is subsampled (using train_protein_limit param) then dataset is subsampled only once + * if dataset is sub-sampled (using train_protein_limit param) then dataset is sub-sampled only once * set to false when doing 
learning curve! * train_protein_limit>0 should be always paired with collect_only_once=false */ + @RuntimeParam boolean collect_only_once = true /** * number of random seed iterations */ + @RuntimeParam int loop = 1 /** * keep datasets (structures and SAS points) in memory between crossval/seedloop iterations */ + @RuntimeParam boolean cache_datasets = false /** * calculate feature importances * available only for some classifiers */ + @RuntimeParam boolean feature_importances = false /** * produce pymol visualisations */ + @RuntimeParam boolean visualizations = true /** * visualize all surface points (not just inner pocket points) */ + @RuntimeParam boolean vis_all_surface = false /** * copy all protein pdb files to visualization folder (making visualizations portable) */ + @RuntimeParam boolean vis_copy_proteins = true /** * generate new protein pdb files from structures in memory instead of reusing input files * (useful when structures were manipulated in memory, e.g. when reducing to specified chains) */ + @RuntimeParam boolean vis_generate_proteins = true /** * zip PyMol visualizations to save space */ + @RuntimeParam boolean zip_visualizations = false /** * use strictly inner pocket points or more wider pocket neighbourhood */ + @RuntimeParam boolean strict_inner_points = false /** - * crossvalidation folds + * cross-validation folds */ + @RuntimeParam int folds = 5 /** * collect evaluations for top [n+0, n+1,...] pockets (n is true pocket count) */ + @RuntimeParam List eval_tolerances = [0,1,2,4,10,99] /** - * make own prank pocket predictions (P2RANK) + * Calculate pocket predictions. 
+ * This is a main switch between re-scoring of predictions by other methods (PRANK) and pocket prediction (P2Rank) */ + @RuntimeParam boolean predictions = true /** - * residue prediction mode (opposed to pocket prediction) + * Residue prediction mode (as opposed to full pocket prediction mode) */ + @RuntimeParam boolean predict_residues = false /** - * (in predict mode) produce residue labeling file + * produce residue labeling file (in predict mode) + * + * Even in full pocket prediction mode (predict_residues=false) we can label and score residues using transformers. */ + @RuntimeParam boolean label_residues = true /** - * residue score threshold fir calculating predicted binary label + * residue score threshold for calculating predicted binary label */ + @ModelParam double residue_score_threshold = 1d /** * in calculation of residue score from neighboring SAS points: * <0,1> goes from 'no average, just sum' -> 'full average' */ + @ModelParam double residue_score_sum_to_avg = 0d /** * added to the cutoff distance around residue in score aggregation from SAS points */ + @ModelParam double residue_score_extra_dist = 0d /** * minimum ligandability score for SAS point to be considered ligandable */ + @ModelParam double pred_point_threshold = 0.4 /** * minimum cluster size (of ligandable points) for initial clustering */ + @ModelParam int pred_min_cluster_size = 3 /** * clustering distance for ligandable clusters for second phase clustering */ + @ModelParam double pred_clustering_dist = 5 /** * SAS points around ligandable points (an their score) will be included in the pocket */ + @ModelParam double extended_pocket_cutoff = 3.5 /** - * cuttoff distance of protein surface atoms considered as part of the pocket + * cutoff distance of protein surface atoms considered as part of the pocket */ + @ModelParam double pred_protein_surface_cutoff = 3.5 /** * Prefix output directory with date and time */ + @RuntimeParam boolean out_prefix_date = false /** - * + * Place all 
output files in this sub-directory of the output directory */ + @RuntimeParam String out_subdir = null /** - * balance SAS point score weight by density + * Balance SAS point score weight by density (points in denser areas will have lower weight) */ + @ModelParam boolean balance_density = false + /** + * Radius for balancing of SAS point score weight + */ + @ModelParam double balance_density_radius = 2 /** * output detailed tables for all proteins, ligands and pockets */ + @RuntimeParam boolean log_cases = false /** * cutoff for protein exposed atoms calculation (distance from SAS surface is solv.radius. + surf_cutoff) */ + @ModelParam double surface_additional_cutoff = 1.8 /** * collect negatives just from decoy pockets found by other method * (alternatively take negative points from all of the protein's surface) */ + @ModelParam // training boolean sample_negatives_from_decoys = false /** - * cutoff atound ligand atoms to select negatives, 0=all + * cutoff around ligand atoms to select negatives, 0=all * valid if training from whole surface (sample_negatives_from_decoys=false) */ + @ModelParam // training double train_lig_cutoff = 0 /** * n, use only top-n pockets to select training instances, 0=all */ + @ModelParam // training int train_pockets = 0 /** * clear primary caches (protein structures) between runs (when iterating params or seed) */ + @RuntimeParam // training boolean clear_prim_caches = false /** * clear secondary caches (protein surfaces etc.) between runs (when iterating params or seed) */ + @RuntimeParam // training boolean clear_sec_caches = false - - /** * acceptable distance between ligand center and closest protein atom for relevant ligands */ + @ModelParam // training double ligc_prot_dist = 5.5 /** - * pocket rescoring algorithm PRANK="ModelBasedRescorer" + * Select pocket re-scoring algorithm when running in re-scoring mode (predictions=false). 
+ * + * Published PRANK (2015) = "ModelBasedRescorer" */ + @ModelParam String rescorer = "ModelBasedRescorer" + /** + * Parameter of the PLBIndexRescorer algorithm. + */ + @ModelParam boolean plb_rescorer_atomic = false /** - * stop processing the datsaset on the first unrecoverable error with a dataset item + * stop processing the dataset on the first unrecoverable error with a dataset item */ + @RuntimeParam boolean fail_fast = false /** * target class ratio of positives/negatives we train on. * relates to subsampling and supersampling */ + @RuntimeParam // training double target_class_ratio = 0.1 /** * in training use subsampling to deal with class imbalance */ + @RuntimeParam // training boolean subsample = false /** * in training use supersampling to deal with class imbalance */ + @RuntimeParam // training boolean supersample = false /** * sort negatives desc by protrusion before subsampling */ + @RuntimeParam // training boolean subsampl_high_protrusion_negatives = false /** * don't produce prediction files for individual proteins (useful for long repetitive experiments) */ + @RuntimeParam boolean output_only_stats = false /** * compress results of individual ploop runs */ + @RuntimeParam boolean ploop_zip_runs = true /** * delete results of individual ploop/hopt runs */ + @RuntimeParam boolean ploop_delete_runs = true - /** * logging level (TRACE/DEBUG/INFO/WARN/ERROR) */ + @RuntimeParam String log_level = "INFO" /** * print log messages to console */ + @RuntimeParam boolean log_to_console = true /** * print log messages to file (run.log in outdir) */ + @RuntimeParam boolean log_to_file = true /** * compress and delete log file at the end (if log_to_file) */ + @RuntimeParam boolean zip_log_file = false /** * limit the number of proteins that used for training. 
random subset of proteins from the dataset is used each run in seedloop * 0 = no limit */ + @RuntimeParam // training int train_protein_limit = 0 /** * add weights to instances to achieve target_weight_ratio (if classifier algorithm supports it) * */ + @ModelParam // training boolean balance_class_weights = false /** * target ratio of weighted sums of positive/negative instances when balancing class weights (balance_class_weights=true) */ + @ModelParam // training double target_class_weight_ratio = 0.1 /** * produce classifier stats also for train dataset */ + @RuntimeParam // training boolean classifier_train_stats = false /** @@ -632,68 +842,99 @@ class Params { * Allows calculation of AUC and AUPRC classifier statistics but consumes a lot of memory. * (>1GB for holo4k dataset with tesselation=2) */ + @RuntimeParam boolean stats_collect_predictions = false - /** produce ROC and PR curve graphs (not fully implemented yet) */ + /** + * produce ROC and PR curve graphs (not fully implemented yet) + */ + @RuntimeParam boolean stats_curves = false /** - * Contact residues distance cutoff + * Contact residues distance cutoff (see ContactResiduesPositionFeature) */ + @ModelParam double feat_crang_contact_dist = 3 /** * probe radius for calculating accessible surface area for asa feature */ + @ModelParam double feat_asa_probe_radius = 1.4 /** * probe radius for calculating accessible surface area for asa feature */ + @ModelParam double feat_asa_probe_radius2 = 1.4 /** * radius of the neighbourhood considered in asa feature */ + @ModelParam double feat_asa_neigh_radius = 6 + /** + * radius for calculating of the pmass feature + */ + @ModelParam double feat_pmass_radius = 11 + /** + * parameter of the pmass feature + */ + @ModelParam int feat_pmass_natoms = 70 - + + /** + * parameter of the pmass feature + */ + @ModelParam int feat_pmass_nsasp = 40 /** * selected sub-features in aa index feature */ + @ModelParam List feat_aa_properties = null /** * Hyperparameter optimizer 
implementation (so far only "spearmint") */ + @RuntimeParam // training String hopt_optimizer = "spearmint" /** * Spearmint home directory (containing main.py) */ + @RuntimeParam // training String hopt_spearmint_dir = "" /** * Metric to minimize in hyperparameter optimization * (minus sign allowed) */ + @RuntimeParam // training String hopt_objective = "-DCA_4_0" /** * max number of iterations in hyperparameter optimization */ + @RuntimeParam // training int hopt_max_iterations = 1000 /** * randomize seed before every training in experiments */ + @RuntimeParam // training boolean randomize_seed = false + /** + * Most important training/evaluation statistics that will be placed in selected_stats.csv table for easier access. + * (all stats will be collected anyway) + */ + @RuntimeParam // training List selected_stats = ['DCA_4_0', 'DCA_4_2', 'DCA_4_4', @@ -713,70 +954,92 @@ class Params { 'AVG_POCKET_SAS_POINTS_TRUE_POCKETS', 'TIME_MINUTES'] - /** * Path to json file that contains parameters of transformation of raw score to "z-score calculated from distribution of true pockets" (pocket.auxInfo.zScoreTP). * Use path relative to distro/models/score. */ + @RuntimeParam String zscoretp_transformer = "default_zscoretp.json" /** * Path to json file that contains parameters of transformation of raw score to "probability that pocket with given score is true pocket" (pocket.auxInfo.probaTP). * Use path relative to distro/models/score. */ + @RuntimeParam String probatp_transformer = "default_probatp.json" - /** * Path to json file that contains parameters of transformation of raw score to "z-score calculated from distribution of all residue scores". * Use path relative to distro/models/score. */ + @RuntimeParam String zscoretp_res_transformer = "residue/p2rank_default_zscore.json" /** * Path to json file that contains parameters of transformation of raw score to "probability that residue with given score is true residue". * Use path relative to distro/models/score. 
*/ + @RuntimeParam String probatp_res_transformer = "residue/p2rank_default_proba.json" + /** + * List of pocket score transformers that should be trained (i.e. fitted / inferred) during predict-eval. + * Transformers are tied to the output distribution of the model (and its parametrization) so new transformers should be trained for every released model. + */ + @RuntimeParam List train_score_transformers = [] // ["ZscoreTpTransformer","ProbabilityScoreTransformer"] /** - * Train resaidue score transformers on a dataset during predict-eval + * Train residue score transformers on a dataset during predict-eval. + * Transformers are tied to the output distribution of the model (and its parametrization) so new transformers should be trained for every released model. */ + @RuntimeParam boolean train_score_transformers_for_residues = false /** - * Reduce loaded protein structures to chains declared in dataset file (in optional chains column) + * Reduce loaded protein structures to chains declared in dataset file (in optional chains column). + * If false all protein chains will be loaded. */ + @RuntimeParam boolean load_only_specified_chains = false /** - * In hyperparameter optimization (ploop and hopt commands) train model only once in the beginning - * (makes sense if optimized hyperparameters do't influence training and feature extraction) + * In hyper-parameter optimization (ploop and hopt commands) train model only once in the beginning + * (makes sense if optimized hyper-parameters don't influence training and feature extraction) */ + @RuntimeParam boolean hopt_train_only_once = false /** + * Parameter of propensity feature. * directory in program resources to take peptide propensities from * (resources/tables/peptides/$var/...) 
* Available: SprintT1070, SprintA870 - * TODO: move to dist dir + * + * TODO: rename param and make it general, not only specific to peptide binding + * TODO: move to dist dir on release */ + @ModelParam String pept_propensities_set = "SprintT1070" + + /** + * When identifying which protein chains are peptides consider provided binary residue labeling (that comes with the dataset). + */ + @ModelParam // training boolean identify_peptides_by_labeling = false /** * Atoms size threshold for using KD-tree in cutoutSphere routine */ + @RuntimeParam int use_kdtree_cutout_sphere_thrashold = 150 //===========================================================================================================// /** - * Should be (slightly above) the distence of solvent exposed atoms to SAS points - * @return + * Derived parameter. + * Should be (slightly above) the distance of solvent exposed atoms to SAS points. */ double getSasCutoffDist() { solvent_radius + surface_additional_cutoff @@ -784,20 +1047,28 @@ class Params { //===========================================================================================================// + /** + * This method is here so the program version is included in toString() for Params object. + */ String getVersion() { Main.getVersion() } + /** + * location of P2Rank installation directory (i.e. 
directory where the binary and configs and models are / unpacked distro directory) + */ String installDir // TODO refactor //===========================================================================================================// + /** + * Apply parameter values from the command line + */ public updateFromCommandLine(CmdLineArgs args) { applyCmdLineArgs(args) // processing of special params - if (!parallel) { threads = 1 rf_threads = 1 @@ -808,7 +1079,7 @@ class Params { } @CompileDynamic - void applyCmdLineArgs(CmdLineArgs args) { + private void applyCmdLineArgs(CmdLineArgs args) { boolean filterRanged = args.hasListParams diff --git a/src/main/groovy/cz/siret/prank/program/params/RuntimeParam.groovy b/src/main/groovy/cz/siret/prank/program/params/RuntimeParam.groovy new file mode 100644 index 00000000..42455f7d --- /dev/null +++ b/src/main/groovy/cz/siret/prank/program/params/RuntimeParam.groovy @@ -0,0 +1,11 @@ +package cz.siret.prank.program.params + +/** + * Marks parameters of the program execution (training or prediction phase) + * i.e. parameters not directly related to the prediction algorithm itself. + * + * Currently the annotation serves only documentation purposes. 
+ */ +@interface RuntimeParam { + +} \ No newline at end of file diff --git a/src/main/groovy/cz/siret/prank/utils/CutoffAtomsCallLog.java b/src/main/groovy/cz/siret/prank/utils/CutoffAtomsCallLog.java index de90bd29..44607989 100644 --- a/src/main/groovy/cz/siret/prank/utils/CutoffAtomsCallLog.java +++ b/src/main/groovy/cz/siret/prank/utils/CutoffAtomsCallLog.java @@ -7,7 +7,7 @@ import java.util.List; import java.util.stream.Collectors; /** - * + * Utility for debugging and profiling cutoffAtoms methods */ public class CutoffAtomsCallLog { diff --git a/src/main/groovy/cz/siret/prank/utils/ProcessRunner.groovy b/src/main/groovy/cz/siret/prank/utils/ProcessRunner.groovy index ca93de21..a200acb6 100644 --- a/src/main/groovy/cz/siret/prank/utils/ProcessRunner.groovy +++ b/src/main/groovy/cz/siret/prank/utils/ProcessRunner.groovy @@ -6,7 +6,7 @@ import groovy.transform.CompileStatic import groovy.util.logging.Slf4j /** - * + * Encapsulates system (command line) process */ @Slf4j @CompileStatic