From 1b7809a66781884701f0342c8c5f1cb48e941149 Mon Sep 17 00:00:00 2001
From: rdk <rdk@users.noreply.github.com>
Date: Sat, 9 Nov 2019 17:30:15 +0100
Subject: [PATCH] improved parameter comments, param annotations, cleanup

---
 .../cz/siret/prank/domain/Dataset.groovy      |   4 +
 .../labeling/ModelBasedResidueLabeler.groovy  |   2 +-
 .../labeling/ResidueBasedPointLabeler.groovy  |   2 +-
 .../domain/labeling/ResidueLabeling.groovy    |   4 +-
 .../domain/labeling/ResidueLabelings.groovy   |   2 +-
 .../features/PrankFeatureExtractor.groovy     |  12 -
 .../prank/features/api/FeatureRegistry.groovy |   4 +
 .../api/ResidueToAtomicFeatWrapper.groovy     |   2 +-
 .../api/ResidueToSasFeatWrapper.groovy        |   2 +-
 .../api/SasFeatureCalculationContext.groovy   |   2 +-
 .../prank/program/params/ModelParam.groovy    |  13 +
 .../siret/prank/program/params/Params.groovy  | 389 +++++++++++++++---
 .../prank/program/params/RuntimeParam.groovy  |  11 +
 .../siret/prank/utils/CutoffAtomsCallLog.java |   2 +-
 .../cz/siret/prank/utils/ProcessRunner.groovy |   2 +-
 15 files changed, 372 insertions(+), 81 deletions(-)
 create mode 100644 src/main/groovy/cz/siret/prank/program/params/ModelParam.groovy
 create mode 100644 src/main/groovy/cz/siret/prank/program/params/RuntimeParam.groovy
diff --git a/src/main/groovy/cz/siret/prank/domain/Dataset.groovy b/src/main/groovy/cz/siret/prank/domain/Dataset.groovy
index 8a114098..cae0c7a9 100644
--- a/src/main/groovy/cz/siret/prank/domain/Dataset.groovy
+++ b/src/main/groovy/cz/siret/prank/domain/Dataset.groovy
@@ -215,6 +215,10 @@ class Dataset implements Parametrized {
         return getLoader(attributes.get(PARAM_PREDICTION_METHOD), item)
     }
 
+    /**
+     * Get configured instance of prediction loader.
+     * @param method LBS prediction method name
+     */
     private PredictionLoader getLoader(String method, Item item) {
         PredictionLoader res
         switch (method) {
diff --git a/src/main/groovy/cz/siret/prank/domain/labeling/ModelBasedResidueLabeler.groovy b/src/main/groovy/cz/siret/prank/domain/labeling/ModelBasedResidueLabeler.groovy
index 1909cfd3..b38d4cb8 100644
--- a/src/main/groovy/cz/siret/prank/domain/labeling/ModelBasedResidueLabeler.groovy
+++ b/src/main/groovy/cz/siret/prank/domain/labeling/ModelBasedResidueLabeler.groovy
@@ -18,7 +18,7 @@ import static cz.siret.prank.utils.Formatter.format
 import static cz.siret.prank.utils.Formatter.formatNumbers
 
 /**
- * (not intended o be reused with mode proteins)
+ * (not intended to be reused with more proteins)
  */
 @Slf4j
 @CompileStatic
diff --git a/src/main/groovy/cz/siret/prank/domain/labeling/ResidueBasedPointLabeler.groovy b/src/main/groovy/cz/siret/prank/domain/labeling/ResidueBasedPointLabeler.groovy
index 10f73b64..a43e0751 100644
--- a/src/main/groovy/cz/siret/prank/domain/labeling/ResidueBasedPointLabeler.groovy
+++ b/src/main/groovy/cz/siret/prank/domain/labeling/ResidueBasedPointLabeler.groovy
@@ -8,7 +8,7 @@ import groovy.util.logging.Slf4j
 import org.biojava.nbio.structure.Atom
 
 /**
- *
+ * Labels points according to nearest residue.
  */
 @Slf4j
 @CompileStatic
diff --git a/src/main/groovy/cz/siret/prank/domain/labeling/ResidueLabeling.groovy b/src/main/groovy/cz/siret/prank/domain/labeling/ResidueLabeling.groovy
index 03ca972c..d3a9d7ef 100644
--- a/src/main/groovy/cz/siret/prank/domain/labeling/ResidueLabeling.groovy
+++ b/src/main/groovy/cz/siret/prank/domain/labeling/ResidueLabeling.groovy
@@ -7,7 +7,7 @@ import groovy.transform.CompileStatic
 import javax.annotation.Nullable
 
 /**
- * Holds partiticular assignment of labels to a set of residues.
+ * Holds particular assignment of labels to a set of residues.
  */
 @CompileStatic
 class ResidueLabeling<L>  {
@@ -15,7 +15,7 @@ class ResidueLabeling<L>  {
     private List<LabeledResidue<L>> labeledResidues
     private Map<Residue.Key, LabeledResidue<L>> labeledMap
 
-    ResidueLabeling(List<LabeledResidue> labeledResidues) {
+    ResidueLabeling(List<LabeledResidue<L>> labeledResidues) {
         this.labeledResidues = labeledResidues
         this.labeledMap = Maps.uniqueIndex(labeledResidues, { it.residue.key })
     }
diff --git a/src/main/groovy/cz/siret/prank/domain/labeling/ResidueLabelings.groovy b/src/main/groovy/cz/siret/prank/domain/labeling/ResidueLabelings.groovy
index 950b8d08..9eb61792 100644
--- a/src/main/groovy/cz/siret/prank/domain/labeling/ResidueLabelings.groovy
+++ b/src/main/groovy/cz/siret/prank/domain/labeling/ResidueLabelings.groovy
@@ -23,7 +23,7 @@ import static cz.siret.prank.utils.Futils.mkdirs
 import static cz.siret.prank.utils.Futils.writeFile
 
 /**
- * Logic for calculating residue lebalings during P2Rank prediction
+ * Logic for calculating residue labelings during P2Rank prediction
  */
 @Slf4j
 class ResidueLabelings implements Parametrized {
diff --git a/src/main/groovy/cz/siret/prank/features/PrankFeatureExtractor.groovy b/src/main/groovy/cz/siret/prank/features/PrankFeatureExtractor.groovy
index e9ad3651..bb1154c9 100644
--- a/src/main/groovy/cz/siret/prank/features/PrankFeatureExtractor.groovy
+++ b/src/main/groovy/cz/siret/prank/features/PrankFeatureExtractor.groovy
@@ -37,15 +37,9 @@ class PrankFeatureExtractor extends FeatureExtractor<PrankFeatureVector> impleme
     List<String> atomTableFeatures
     List<String> residueTableFeatures
 
-
     // tied to a protein
     private PointSampler pocketPointSampler
 
-    /**
-     * if set to true extractorFactory will use zero vectors for unknown residues
-     * otherwise throws exception (so the whole pocket can be ignored)
-     */
-    private boolean MASK_UNKNOWN_RESIDUES = params.mask_unknown_residues
     private double NEIGH_CUTOFF_DIST = params.neighbourhood_radius
     private boolean DO_SMOOTH_REPRESENTATION = params.smooth_representation
     private double SMOOTHING_CUTOFF_DIST = params.smoothing_radius
@@ -112,7 +106,6 @@ class PrankFeatureExtractor extends FeatureExtractor<PrankFeatureVector> impleme
         return res
     }
 
-
     @Override
     void prepareProteinPrototypeForPockets() {
         pocketPointSampler = PointSampler.create(protein, trainingExtractor)
@@ -143,7 +136,6 @@ class PrankFeatureExtractor extends FeatureExtractor<PrankFeatureVector> impleme
         this.protein = protein
         this.pocket = pocket
 
-        this.MASK_UNKNOWN_RESIDUES = proteinPrototype.MASK_UNKNOWN_RESIDUES
         this.headerAdditionalFeatures = proteinPrototype.headerAdditionalFeatures
         this.pocketPointSampler    = proteinPrototype.pocketPointSampler
         this.extraFeaturesHeader   = proteinPrototype.extraFeaturesHeader
@@ -156,10 +148,6 @@ class PrankFeatureExtractor extends FeatureExtractor<PrankFeatureVector> impleme
         this.surfaceLayerAtoms = proteinPrototype.surfaceLayerAtoms
         this.properties = proteinPrototype.properties
         this.smoothRepresentations = proteinPrototype.smoothRepresentations
-
-
-
-
     }
 
     @Override
diff --git a/src/main/groovy/cz/siret/prank/features/api/FeatureRegistry.groovy b/src/main/groovy/cz/siret/prank/features/api/FeatureRegistry.groovy
index d35ffb42..fd5db8ba 100644
--- a/src/main/groovy/cz/siret/prank/features/api/FeatureRegistry.groovy
+++ b/src/main/groovy/cz/siret/prank/features/api/FeatureRegistry.groovy
@@ -1,5 +1,9 @@
 package cz.siret.prank.features.api
 
+
+import cz.siret.prank.features.api.FeatureCalculator
+import cz.siret.prank.features.api.ResidueToAtomicFeatWrapper
+import cz.siret.prank.features.api.ResidueToSasFeatWrapper
 import cz.siret.prank.features.implementation.AAIndexFeature
 import cz.siret.prank.features.implementation.Asa2Feature
 import cz.siret.prank.features.implementation.AsaFeature
diff --git a/src/main/groovy/cz/siret/prank/features/api/ResidueToAtomicFeatWrapper.groovy b/src/main/groovy/cz/siret/prank/features/api/ResidueToAtomicFeatWrapper.groovy
index 487cfeec..f398069a 100644
--- a/src/main/groovy/cz/siret/prank/features/api/ResidueToAtomicFeatWrapper.groovy
+++ b/src/main/groovy/cz/siret/prank/features/api/ResidueToAtomicFeatWrapper.groovy
@@ -5,7 +5,7 @@ import cz.siret.prank.domain.Residue
 import org.biojava.nbio.structure.Atom
 
 /**
- *
+ * Maps residue features to atom features
  */
 class ResidueToAtomicFeatWrapper extends AtomFeatureCalculator {
 
diff --git a/src/main/groovy/cz/siret/prank/features/api/ResidueToSasFeatWrapper.groovy b/src/main/groovy/cz/siret/prank/features/api/ResidueToSasFeatWrapper.groovy
index b328139e..b969dfe0 100644
--- a/src/main/groovy/cz/siret/prank/features/api/ResidueToSasFeatWrapper.groovy
+++ b/src/main/groovy/cz/siret/prank/features/api/ResidueToSasFeatWrapper.groovy
@@ -10,7 +10,7 @@ import groovy.transform.CompileStatic
 import org.biojava.nbio.structure.Atom
 
 /**
- * Mapping Closest Residue to SAS
+ * Mapping Closest Residue to SAS point
  */
 @CompileStatic
 class ResidueToSasFeatWrapper extends SasFeatureCalculator {
diff --git a/src/main/groovy/cz/siret/prank/features/api/SasFeatureCalculationContext.groovy b/src/main/groovy/cz/siret/prank/features/api/SasFeatureCalculationContext.groovy
index 70a2aea5..19acec91 100644
--- a/src/main/groovy/cz/siret/prank/features/api/SasFeatureCalculationContext.groovy
+++ b/src/main/groovy/cz/siret/prank/features/api/SasFeatureCalculationContext.groovy
@@ -6,7 +6,7 @@ import cz.siret.prank.geom.Atoms
 import groovy.transform.CompileStatic
 
 /**
- * Context for calculation of SAS feature.
+ * Context for calculation of a SAS feature.
  */
 @CompileStatic
 class SasFeatureCalculationContext {
diff --git a/src/main/groovy/cz/siret/prank/program/params/ModelParam.groovy b/src/main/groovy/cz/siret/prank/program/params/ModelParam.groovy
new file mode 100644
index 00000000..fd570f0a
--- /dev/null
+++ b/src/main/groovy/cz/siret/prank/program/params/ModelParam.groovy
@@ -0,0 +1,13 @@
+package cz.siret.prank.program.params
+
+/**
+ * Marks parameters of the prediction model (including feature extraction params) i.e. algorithm params.
+ * The notion of "model" here is understood holistically: a pocket prediction model = the whole algorithm, which includes feature extraction, classification and aggregation to binding sites.
+ *
+ * These are the parameters that must be the same in training and prediction phase.
+ *
+ * Currently the annotation serves only documentation purposes.
+ */
+@interface ModelParam {
+
+}
\ No newline at end of file
diff --git a/src/main/groovy/cz/siret/prank/program/params/Params.groovy b/src/main/groovy/cz/siret/prank/program/params/Params.groovy
index 8626c25b..139e0006 100644
--- a/src/main/groovy/cz/siret/prank/program/params/Params.groovy
+++ b/src/main/groovy/cz/siret/prank/program/params/Params.groovy
@@ -16,7 +16,6 @@ class Params {
 
     public static final Params INSTANCE = new Params()
 
-
     public static Params getInst() {
         return INSTANCE
     }
@@ -25,128 +24,195 @@ class Params {
      * define this if you want dataset program parameters to be evaluated relative to this directory
      * (set absolute path or path relative to install dir, null defaults to working dir)
      */
+    @RuntimeParam
     String dataset_base_dir = null
 
     /**
      * all output of the program will be stored in subdirectores of this directory
      * (set absolute path or path relative to install dir, null defaults to working dir)
      */
+    @RuntimeParam
     String output_base_dir = null
 
     /**
      * serialized model
      * (set path relative to install_dir/models/)
      */
+    @RuntimeParam
     String model = "default.model"
 
     /**
      * Random seed
      */
+    @RuntimeParam
     int seed = 42
 
+    /**
+     * Parallel execution (processing datasets in parallel)
+     */
+    @RuntimeParam
     boolean parallel = true
 
     /**
      * Number of computing threads
      */
+    @RuntimeParam
     int threads = Runtime.getRuntime().availableProcessors() + 1
 
     /**
      *  Number for threads for generating R plots
      */
+    @RuntimeParam
     int r_threads = 2
 
     /**
-     * Number of folds to work on simultaneously
+     * Number of cross-validation folds to work on simultaneously.
+     * (Multiplies required memory)
      */
+    @RuntimeParam
     int crossval_threads = 1 // Math.min(5, Runtime.getRuntime().availableProcessors())
 
     /**
      * defines witch atoms around the ligand are considered to be part of the pocket
      * (ligands with longer distance are considered irrelevant floating ligands)
      */
+    @ModelParam // training
     double ligand_protein_contact_distance = 4
 
-    //== FAETURES
+    //==[ Features ]=========================================================================================================//
 
+    /**
+     * List of general calculated features
+     */
+    @ModelParam
     List<String> extra_features = ["protrusion","bfactor"]
 
+    /**
+     * List of features that come directly from atom type table
+     * see atomic-properties.csv
+     */
+    @ModelParam
     List<String> atom_table_features = ["apRawValids","apRawInvalids","atomicHydrophobicity"] // "ap5sasaValids","ap5sasaInvalids"
 
+    /**
+     * List of features that come directly from residue table
+     */
+    @ModelParam
+    List<String> residue_table_features = [] // ['aa5fact1','aa5fact2','aa5fact3','aa5fact4','aa5fact5']
+
+    /**
+     * Exponent applied to all atom table features
+     */
+    @ModelParam
     double atom_table_feat_pow = 2
 
     /**
      * dummy param to preserve behaviour of older versions
+     * if true, the sign of the value is reapplied after transformation by atom_table_feat_pow
      */
+    @ModelParam
     boolean atom_table_feat_keep_sgn = false
 
-    List<String> residue_table_features = [] // ['aa5fact1','aa5fact2','aa5fact3','aa5fact4','aa5fact5']
-
+    /**
+     * radius for calculating the protrusion feature
+     */
+    @ModelParam
     double protrusion_radius = 10
 
 //===========================================================================================================//
 
-
     /**
      * Number of bins for protr_hist feature, must be >=2
      */
+    @ModelParam
     int protr_hist_bins = 5
 
+    /**
+     * Param of protr_hist feature
+     */
+    @ModelParam
     boolean protr_hist_cumulative = false
 
+    /**
+     * Param of protr_hist feature
+     */
+    @ModelParam
     boolean protr_hist_relative = false
 
-//===========================================================================================================//
-
     /**
      * Number of bins for Atom Pair distance histogram (pair_hist) feature, must be >=2
      */
+    @ModelParam
     int pair_hist_bins = 5
 
     /**
      * Radius capturing atoms considered in pair_hist feature
      */
+    @ModelParam
     double pair_hist_radius = 6
 
     /**
      * smooth vs. sharp binning
      */
+    @ModelParam
     boolean pair_hist_smooth = false
 
+    /**
+     * apply normalization to histogram
+     */
+    @ModelParam
     boolean pair_hist_normalize = false
 
     /**
-     * if false only protein exposed atmos are considered
+     * if false only protein exposed atoms are considered
      */
+    @ModelParam
     boolean pair_hist_deep = true
 
     /**
      * size of random subsample of atom pairs, 0 = all
      */
+    @ModelParam
     int pair_hist_subsample_limit = 0
 
 //===========================================================================================================//
 
     /**
-     * conservation parameteres
+     * Load sequence conservation data
      */
-    boolean load_conservation = false // always load conservation (for stats)
+    @RuntimeParam
+    boolean load_conservation = false
 
+
+    /**
+     * Pocket scoring algorithm
+     */
+    @ModelParam
     String score_pockets_by = "p2rank" // possible values: "p2rank", "conservation", "combi"
 
     /**
-     * Conservation exponent for rescoring pockets
+     * Conservation exponent for re-scoring pockets
      */
+    @ModelParam
     int conservation_exponent = 1
 
+    /**
+     * Radius for calculating conservation cloud related features
+     */
+    @ModelParam
     double conserv_cloud_radius = 10
 
+    /**
+     * Radius for calculating secondary structure cloud related features
+     */
+    @ModelParam
     double ss_cloud_radius = 10
 
     /**
      * Conservation file with this pattern is loaded:
      * baseName + chainId + "." + origin + ".hom.gz"
      */
+    @RuntimeParam
     String conservation_origin = "hssp"
 
     /**
@@ -154,39 +220,46 @@ class Params {
      * Path relative to dataset directory.
      * if null: look in the same directory as protein file
      */
+    @RuntimeParam
     String conservation_dir = null
 
     /**
      * Log scores for binding and nonbinding scores to file
      */
+    @RuntimeParam
     String log_scores_to_file = ""
 
     /**
      * limits how many pocket SAS points are used for scoring (after sorting), 0=unlimited
      * affects scoring pockets and also residues
      */
+    @ModelParam
     int score_point_limit = 0
 
-//===========================================================================================================//
-
-
-    //== CLASSIFIERS ===================
+//==[ Classifiers ]=========================================================================================================//
 
     /**
      * see ClassifierOption
      */
+    @ModelParam
     String classifier = "FastRandomForest"
 
     /**
      * see ClassifierOption
      */
+    @ModelParam
     String inner_classifier = "FastRandomForest"
 
+    /**
+     * see ClassifierOption
+     */
+    @ModelParam
     int meta_classifier_iterations = 5
 
     /**
      * works only with classifier "CostSensitive_RF"
      */
+    @ModelParam // training
     double false_positive_cost = 2
 
     //=== Random Forests =================
@@ -194,118 +267,179 @@ class Params {
     /**
      * RandomForest trees
      */
+    @ModelParam // training
     int rf_trees = 100
 
     /**
      * RandomForest depth limit, 0=unlimited
      */
+    @ModelParam // training
     int rf_depth = 0
 
     /**
      * RandomForest feature subset size for one tree, 0=default(sqrt)
      */
+    @ModelParam // training
     int rf_features = 0
 
     /**
      * number of threads used in RandomForest training (0=use value of threads param)
      */
+    @RuntimeParam // training
     int rf_threads = 0
 
     /**
      * size of a bag: 1..100% of the dataset
      */
+    @ModelParam // training
     int rf_bagsize = 100
 
     /**
      * cutoff for joining ligand atom groups into one ligand
      */
-    double ligand_clustering_distance = 1.7 // ~ covalent bond length
+    @ModelParam // training
+    double ligand_clustering_distance = 1.7 // ~= covalent bond length
 
     /**
      * cutoff around ligand that defines positives
      */
+    @ModelParam
     double positive_point_ligand_distance = 2.5
 
     /**
      * distance around ligand atoms that define ligand induced volume
      * (for evaluation by some criteria, DSO, ligand coverage...)
      */
+    @ModelParam
     double ligand_induced_volume_cutoff = 2.5
 
     /**
      * points between [positive_point_ligand_distance,neutral_point_margin] will be left out form training
      */
+    @ModelParam // training
     double neutral_points_margin = 5.5
 
-    boolean mask_unknown_residues = true
-
     /**
-     * chem. properties representation neighbourhood radius in A
+     * Neighbourhood radius (A) used for calculating most of the features.
      */
+    @ModelParam
     double neighbourhood_radius = 8
 
     /**
-     * HETATM groups that are considered cofactor and ignored
+     * HETATM groups that are ignored (not marked as relevant ligands, e.g. because they are cofactors or part of a substrate)
      */
+    @ModelParam // training
     Set<String> ignore_het_groups = ["HOH","DOD","WAT","NAG","MAN","UNK","GLC","ABA","MPD","GOL","SO4","PO4"] as Set
 
     /**
-     * positive point defining ligand types accepted values: "relevant", "ignored", "small", "distant"
+     * Which ligand types define positive SAS points.
+     * accepted values: "relevant", "ignored", "small", "distant"
      */
+    @ModelParam // training
     List<String> positive_def_ligtypes = ["relevant"]
 
     /**
-     * min. heavy atom count for ligand, other ligands ignored
+     * Minimal heavy atom count for relevant ligands, other ligands are considered too small and ignored
      */
+    @ModelParam // training
     int min_ligand_atoms = 5
 
+    /**
+     * Point sampler for extracting instances for training.
+     * P2Rank and PRANK use SurfacePointSampler that produces SAS points.
+     * Others like GridPointSampler are experimental.
+     */
+    @ModelParam
     String point_sampler = "SurfacePointSampler"
 
     /**
      * multiplier for random point sampling
      */
+    @ModelParam // training
     int sampling_multiplier = 3
 
     /**
      * solvent radius for SAS surface
      */
+    @ModelParam
     double solvent_radius = 1.6
 
     /**
-     * SAS tessellation (~density) used in pradiction step
+     * SAS tessellation (~density) used in prediction step
      */
+    @ModelParam
     int tessellation = 2
 
     /**
      * SAS tessellation (~density) used in training step
      */
+    @ModelParam // training
     int train_tessellation = 2
 
-    // for grid and random sampling
+    /**
+     * for grid and random sampling
+     */
+    @ModelParam
     double point_min_distfrom_protein = 2.5
+
+    /**
+     * for grid and random sampling
+     */
+    @ModelParam
     double point_max_distfrom_pocket = 4.5
 
-    /* for GridPointSampler */
+    /**
+     * grid cell size for GridPointSampler
+     */
+    @ModelParam
     double grid_cell_edge = 2
 
     /**
      * Restrict training set size, 0=unlimited
      */
+    @RuntimeParam // training
     int max_train_instances = 0
 
+    /**
+     * Param of SAS score weighting function (see WeightFun)
+     */
+    @ModelParam
     double weight_power = 2
+
+    /**
+     * Param of SAS score weighting function (see WeightFun)
+     */
+    @ModelParam
     double weight_sigma = 2.2
+
+    /**
+     * Param of SAS score weighting function (see WeightFun)
+     */
+    @ModelParam
     double weight_dist_param = 4.5
 
+    /**
+     * Choice of SAS score weighting function (see WeightFun)
+     */
+    @ModelParam
     String weight_function = "INV"
 
+    /**
+     * If false, only a single layer of the protein's solvent-exposed atoms is used for calculating features that are projected from protein atoms to SAS points
+     */
+    @ModelParam
     boolean deep_surrounding = false
 
     /** calculate feature vectors from smooth atom feature representation
      * (instead of directly from atom properties)
      */
+    @ModelParam
     boolean smooth_representation = false
 
+    /**
+     * related to smooth_representation
+     */
+    @ModelParam
     double smoothing_radius = 4.5
 
     /**
@@ -313,6 +447,7 @@ class Params {
      * if true, atom feature vectors are averaged
      * else they are only summed up
      */
+    @ModelParam
     boolean average_feat_vectors = false
 
     /**
@@ -320,311 +455,386 @@ class Params {
      * only applicable when average_feat_vectors=true
      * <0,1> goes from 'no average, just sum' -> 'full average'
      */
+    @ModelParam
     double avg_pow = 1
 
     /**
      * regarding feature projection from atoms to SAS points: calculate weighted average
-     * (shoud be true by default, kept false for backward compatibility reasons)
+     * (should be true by default, kept false for backward compatibility reasons)
      */
+    @ModelParam
     boolean avg_weighted = false
 
     /**
-     * exponent of point ligandabitity score (before adding it to pocket score)
+     * exponent of point ligandability score (before adding it to pocket score)
      */
+    @ModelParam
     double point_score_pow = 2
 
     /**
-     * Binary classifiers produces historgam of scores for class0 and class1
+     * Binary classifiers produce a histogram of scores for class0 and class1
      * if true only score for class1 is considered
      * makes a difference only if histogram produced by classifier doesn't sum up to 1
      */
+    @ModelParam
     boolean use_only_positive_score = true
 
+    /**
+     * If true trained models will not be saved to disk (good for parameter optimization)
+     */
+    @RuntimeParam
     boolean delete_models = false
 
     /**
      * delete files containing training/evaluation feature vectors
      */
+    @RuntimeParam
     boolean delete_vectors = true
 
+    /**
+     * check all loaded/calculated vectors for invalid (NaN) values
+     */
+    @RuntimeParam
     boolean check_vectors = false
 
     /**
-     * collect vectors also from eval dataset (only makes sense if delete_vectors=false)
+     * collect vectors also from eval dataset (only makes sense in combination with delete_vectors=false)
      */
+    @RuntimeParam
     boolean collect_eval_vectors = false
 
     /**
      * collect vectors only at the beginning of seed loop routine
-     * if dataset is subsampled (using train_protein_limit param) then dataset is subsampled only once
+     * if dataset is sub-sampled (using train_protein_limit param) then dataset is sub-sampled only once
      * set to false when doing learning curve!
      * train_protein_limit>0 should be always paired with collect_only_once=false
      */
+    @RuntimeParam
     boolean collect_only_once = true
 
     /**
      * number of random seed iterations
      */
+    @RuntimeParam
     int loop = 1
 
     /**
      * keep datasets (structures and SAS points) in memory between crossval/seedloop iterations
      */
+    @RuntimeParam
     boolean cache_datasets = false
 
     /**
      * calculate feature importances
      * available only for some classifiers
      */
+    @RuntimeParam
     boolean feature_importances = false
 
     /**
      * produce pymol visualisations
      */
+    @RuntimeParam
     boolean visualizations = true
 
     /**
      * visualize all surface points (not just inner pocket points)
      */
+    @RuntimeParam
     boolean vis_all_surface = false
 
     /**
      * copy all protein pdb files to visualization folder (making visualizations portable)
      */
+    @RuntimeParam
     boolean vis_copy_proteins = true
 
     /**
      * generate new protein pdb files from structures in memory instead of reusing input files
      * (useful when structures were manipulated in memory, e.g. when reducing to specified chains)
      */
+    @RuntimeParam
     boolean vis_generate_proteins = true
 
     /**
      * zip PyMol visualizations to save space
      */
+    @RuntimeParam
     boolean zip_visualizations = false
 
     /**
      * use strictly inner pocket points or more wider pocket neighbourhood
      */
+    @RuntimeParam
     boolean strict_inner_points = false
 
     /**
-     * crossvalidation folds
+     * cross-validation folds
      */
+    @RuntimeParam
     int folds = 5
 
     /**
      * collect evaluations for top [n+0, n+1,...] pockets (n is true pocket count)
      */
+    @RuntimeParam
     List<Integer> eval_tolerances = [0,1,2,4,10,99]
 
     /**
-     * make own prank pocket predictions (P2RANK)
+     * Calculate pocket predictions.
+     * This is the main switch between re-scoring of predictions by other methods (PRANK) and pocket prediction (P2Rank)
      */
+    @RuntimeParam
     boolean predictions = true
 
     /**
-     * residue prediction mode (opposed to pocket prediction)
+     * Residue prediction mode (as opposed to full pocket prediction mode)
      */
+    @RuntimeParam
     boolean predict_residues = false
 
     /**
-     * (in predict mode) produce residue labeling file
+     * produce residue labeling file (in predict mode)
+     *
+     * Even in full pocket prediction mode (predict_residues=false) we can label and score residues using transformers.
      */
+    @RuntimeParam
     boolean label_residues = true
 
     /**
-     * residue score threshold fir calculating predicted binary label
+     * residue score threshold for calculating predicted binary label
      */
+    @ModelParam
     double residue_score_threshold = 1d
 
     /**
      * in calculation of residue score from neighboring SAS points:
      * <0,1> goes from 'no average, just sum' -> 'full average'
      */
+    @ModelParam
     double residue_score_sum_to_avg = 0d
 
     /**
      * added to the cutoff distance around residue in score aggregation from SAS points
      */
+    @ModelParam
     double residue_score_extra_dist = 0d
 
     /**
      * minimum ligandability score for SAS point to be considered ligandable
      */
+    @ModelParam
     double pred_point_threshold = 0.4
 
     /**
      * minimum cluster size (of ligandable points) for initial clustering
      */
+    @ModelParam
     int pred_min_cluster_size = 3
 
     /**
      * clustering distance for ligandable clusters for second phase clustering
      */
+    @ModelParam
     double pred_clustering_dist = 5
 
     /**
      * SAS points around ligandable points (an their score) will be included in the pocket
      */
+    @ModelParam
     double extended_pocket_cutoff = 3.5
 
     /**
-     * cuttoff distance of protein surface atoms considered as part of the pocket
+     * cutoff distance of protein surface atoms considered as part of the pocket
      */
+    @ModelParam
     double pred_protein_surface_cutoff = 3.5
 
     /**
      * Prefix output directory with date and time
      */
+    @RuntimeParam
     boolean out_prefix_date = false
 
     /**
-     *
+     * Place all output files in this sub-directory of the output directory
      */
+    @RuntimeParam
     String out_subdir = null
 
     /**
-     * balance SAS point score weight by density
+     * Balance SAS point score weight by density (points in denser areas will have lower weight)
      */
+    @ModelParam
     boolean balance_density = false
 
+    /**
+     * Radius for balancing of SAS point score weight
+     */
+    @ModelParam
     double balance_density_radius = 2
 
     /**
      * output detailed tables for all proteins, ligands and pockets
      */
+    @RuntimeParam
     boolean log_cases = false
 
     /**
      * cutoff for protein exposed atoms calculation (distance from SAS surface is solv.radius. + surf_cutoff)
      */
+    @ModelParam
     double surface_additional_cutoff = 1.8
 
     /**
      * collect negatives just from decoy pockets found by other method
      * (alternatively take negative points from all of the protein's surface)
      */
+    @ModelParam // training
     boolean sample_negatives_from_decoys = false
 
     /**
-     * cutoff atound ligand atoms to select negatives, 0=all
+     * cutoff around ligand atoms to select negatives, 0=all
      * valid if training from whole surface (sample_negatives_from_decoys=false)
      */
+    @ModelParam // training
     double train_lig_cutoff = 0
 
     /**
      * n, use only top-n pockets to select training instances, 0=all
      */
+    @ModelParam // training
     int train_pockets = 0
 
     /**
      * clear primary caches (protein structures) between runs (when iterating params or seed)
      */
+    @RuntimeParam // training
     boolean clear_prim_caches = false
 
     /**
      * clear secondary caches (protein surfaces etc.) between runs (when iterating params or seed)
      */
+    @RuntimeParam // training
     boolean clear_sec_caches = false
 
-
-
     /**
      * acceptable distance between ligand center and closest protein atom for relevant ligands
      */
+    @ModelParam // training
     double ligc_prot_dist = 5.5
 
     /**
-     * pocket rescoring algorithm PRANK="ModelBasedRescorer"
+     * Select pocket re-scoring algorithm when running in re-scoring mode (predictions=false).
+     *
+     * Published PRANK (2015) = "ModelBasedRescorer"
      */
+    @ModelParam
     String rescorer = "ModelBasedRescorer"
 
+    /**
+     * Parameter of the PLBIndexRescorer algorithm.
+     */
+    @ModelParam
     boolean plb_rescorer_atomic = false
 
     /**
-     * stop processing the datsaset on the first unrecoverable error with a dataset item
+     * stop processing the dataset on the first unrecoverable error with a dataset item
      */
+    @RuntimeParam
     boolean fail_fast = false
 
     /**
      * target class ratio of positives/negatives we train on.
      * relates to subsampling and supersampling
      */
+    @RuntimeParam // training
     double target_class_ratio = 0.1
 
     /**
      * in training use subsampling to deal with class imbalance
      */
+    @RuntimeParam // training
     boolean subsample = false
 
     /**
      * in training use supersampling to deal with class imbalance
      */
+    @RuntimeParam // training
     boolean supersample = false
 
     /**
      * sort negatives desc by protrusion before subsampling
      */
+    @RuntimeParam // training
     boolean subsampl_high_protrusion_negatives = false
 
     /**
      * don't produce prediction files for individual proteins (useful for long repetitive experiments)
      */
+    @RuntimeParam 
     boolean output_only_stats = false
 
     /**
      * compress results of individual ploop runs
      */
+    @RuntimeParam
     boolean ploop_zip_runs = true
 
     /**
      * delete results of individual ploop/hopt runs
      */
+    @RuntimeParam
     boolean ploop_delete_runs = true
 
-
     /**
      * logging level (TRACE/DEBUG/INFO/WARN/ERROR)
      */
+    @RuntimeParam
     String log_level = "INFO"
 
     /**
      * print log messages to console
      */
+    @RuntimeParam
     boolean log_to_console = true
 
     /**
      * print log messages to file (run.log in outdir)
      */
+    @RuntimeParam
     boolean log_to_file = true
 
     /**
      * compress and delete log file at the end (if log_to_file)
      */
+    @RuntimeParam
     boolean zip_log_file = false
 
     /**
      * limit the number of proteins that used for training. random subset of proteins from the dataset is used each run in seedloop
      * 0 = no limit
      */
+    @RuntimeParam // training
     int train_protein_limit = 0
 
     /**
      * add weights to instances to achieve target_weight_ratio (if classifier algorithm supports it)
      *
      */
+    @ModelParam // training
     boolean balance_class_weights = false
 
     /**
      * target ratio of weighted sums of positive/negative instances when balancing class weights (balance_class_weights=true)
      */
+    @ModelParam // training
     double target_class_weight_ratio = 0.1
 
     /**
      * produce classifier stats also for train dataset
      */
+    @RuntimeParam // training
     boolean classifier_train_stats = false
 
     /**
@@ -632,68 +842,99 @@ class Params {
      * Allows calculation of AUC and AUPRC classifier statistics but consumes a lot of memory.
      * (>1GB for holo4k dataset with tesselation=2)
      */
+    @RuntimeParam
     boolean stats_collect_predictions = false
 
-    /** produce ROC and PR curve graphs (not fully implemented yet) */
+    /**
+     * produce ROC and PR curve graphs (not fully implemented yet)
+     */
+    @RuntimeParam
     boolean stats_curves = false
 
     /**
-     * Contact residues distance cutoff
+     * Contact residues distance cutoff (see ContactResiduesPositionFeature)
      */
+    @ModelParam
     double feat_crang_contact_dist = 3
 
     /**
      * probe radius for calculating accessible surface area for asa feature
      */
+    @ModelParam
     double feat_asa_probe_radius = 1.4
 
     /**
      * probe radius for calculating accessible surface area for asa feature
      */
+    @ModelParam
     double feat_asa_probe_radius2 = 1.4
 
     /**
      * radius of the neighbourhood considered in asa feature
      */
+    @ModelParam
     double feat_asa_neigh_radius = 6
 
+    /**
+     * radius for calculation of the pmass feature
+     */
+    @ModelParam
     double feat_pmass_radius = 11
 
+    /**
+     * parameter of the pmass feature
+     */
+    @ModelParam
     int feat_pmass_natoms = 70
-    
+
+    /**
+     * parameter of the pmass feature
+     */
+    @ModelParam
     int feat_pmass_nsasp = 40
 
     /**
      * selected sub-features in aa index feature
      */
+    @ModelParam
     List<String> feat_aa_properties = null
 
     /**
      * Hyperparameter optimizer implementation (so far only "spearmint")
      */
+    @RuntimeParam // training
     String hopt_optimizer = "spearmint"
 
     /**
      * Spearmint home directory (containing main.py)
      */
+    @RuntimeParam // training
     String hopt_spearmint_dir = ""
 
     /**
      * Metric to minimize in hyperparameter optimization
      * (minus sign allowed)
      */
+    @RuntimeParam // training
     String hopt_objective = "-DCA_4_0"
 
     /**
      * max number of iterations in hyperparameter optimization
      */
+    @RuntimeParam // training
     int hopt_max_iterations = 1000
 
     /**
      * randomize seed before every training in experiments
      */
+    @RuntimeParam // training
     boolean randomize_seed = false
 
+    /**
+     * Most important training/evaluation statistics that will be placed in selected_stats.csv table for easier access.
+     * (all stats will be collected anyway)
+     */
+    @RuntimeParam // training
     List<String> selected_stats = ['DCA_4_0',
                                    'DCA_4_2',
                                    'DCA_4_4',
@@ -713,70 +954,92 @@ class Params {
                                    'AVG_POCKET_SAS_POINTS_TRUE_POCKETS',
                                    'TIME_MINUTES']
 
-
     /**
      * Path to json file that contains parameters of transformation of raw score to "z-score calculated from distribution of true pockets" (pocket.auxInfo.zScoreTP).
      * Use path relative to distro/models/score.
      */
+    @RuntimeParam
     String zscoretp_transformer = "default_zscoretp.json"
 
     /**
      * Path to json file that contains parameters of transformation of raw score to "probability that pocket with given score is true pocket" (pocket.auxInfo.probaTP).
      * Use path relative to distro/models/score.
      */
+    @RuntimeParam
     String probatp_transformer = "default_probatp.json"
 
-
     /**
      * Path to json file that contains parameters of transformation of raw score to "z-score calculated from distribution of all residue scores".
      * Use path relative to distro/models/score.
      */
+    @RuntimeParam
     String zscoretp_res_transformer = "residue/p2rank_default_zscore.json"
 
     /**
      * Path to json file that contains parameters of transformation of raw score to "probability that residue with given score is true residue".
      * Use path relative to distro/models/score.
      */
+    @RuntimeParam
     String probatp_res_transformer = "residue/p2rank_default_proba.json"
 
+    /**
+     * List of pocket score transformers that should be trained (i.e. fitted / inferred) during predict-eval.
+     * Transformers are tied to the output distribution of the model (and its parametrization) so new transformers should be trained for every released model.
+     */
+    @RuntimeParam
     List<String> train_score_transformers = [] // ["ZscoreTpTransformer","ProbabilityScoreTransformer"]
 
     /**
-     * Train resaidue score transformers on a dataset during predict-eval
+     * Train residue score transformers on a dataset during predict-eval.
+     * Transformers are tied to the output distribution of the model (and its parametrization) so new transformers should be trained for every released model.
      */
+    @RuntimeParam
     boolean train_score_transformers_for_residues = false
 
     /**
-     * Reduce loaded protein structures to chains declared in dataset file (in optional chains column)
+     * Reduce loaded protein structures to chains declared in dataset file (in optional chains column).
+     * If false all protein chains will be loaded.
      */
+    @RuntimeParam
     boolean load_only_specified_chains = false
 
     /**
-     * In hyperparameter optimization (ploop and hopt commands) train model only once in the beginning
-     * (makes sense if optimized hyperparameters do't influence training and feature extraction)
+     * In hyper-parameter optimization (ploop and hopt commands) train model only once in the beginning
+     * (makes sense if optimized hyper-parameters don't influence training and feature extraction)
      */
+    @RuntimeParam
     boolean hopt_train_only_once = false
 
     /**
+     * Parameter of propensity feature.
      * directory in program resources to take peptide propensities from
      * (resources/tables/peptides/$var/...)
      * Available: SprintT1070, SprintA870
-     * TODO: move to dist dir
+     *
+     * TODO: rename param and make it general, not only specific to peptide binding
+     * TODO: move to dist dir on release
      */
+    @ModelParam
     String pept_propensities_set = "SprintT1070"
 
+
+    /**
+     * When identifying which protein chains are peptides consider provided binary residue labeling (that comes with the dataset).
+     */
+    @ModelParam // training
     boolean identify_peptides_by_labeling = false
 
     /**
      * Atoms size threshold for using KD-tree in cutoutSphere routine
      */
+    @RuntimeParam
     int use_kdtree_cutout_sphere_thrashold = 150
 
 //===========================================================================================================//
 
     /**
-     * Should be (slightly above) the distence of solvent exposed atoms to SAS points
-     * @return
+     * Derived parameter.
+     * Should be (slightly above) the distance of solvent exposed atoms to SAS points.
      */
     double getSasCutoffDist() {
         solvent_radius + surface_additional_cutoff
@@ -784,20 +1047,28 @@ class Params {
 
 //===========================================================================================================//
 
+    /**
+     * This method is here so the program version is included in toString() for Params object.
+     */
     String getVersion() {
         Main.getVersion()
     }
 
+    /**
+     * location of the P2Rank installation directory (i.e. the unpacked distro directory that contains the binary, configs and models)
+     */
     String installDir // TODO refactor
 
 //===========================================================================================================//
 
+    /**
+     * Apply parameter values from the command line
+     */
     public updateFromCommandLine(CmdLineArgs args) {
 
         applyCmdLineArgs(args)
 
         // processing of special params
-
         if (!parallel) {
             threads = 1
             rf_threads = 1
@@ -808,7 +1079,7 @@ class Params {
     }
 
     @CompileDynamic
-    void applyCmdLineArgs(CmdLineArgs args) {
+    private void applyCmdLineArgs(CmdLineArgs args) {
 
         boolean filterRanged = args.hasListParams
 
diff --git a/src/main/groovy/cz/siret/prank/program/params/RuntimeParam.groovy b/src/main/groovy/cz/siret/prank/program/params/RuntimeParam.groovy
new file mode 100644
index 00000000..42455f7d
--- /dev/null
+++ b/src/main/groovy/cz/siret/prank/program/params/RuntimeParam.groovy
@@ -0,0 +1,11 @@
+package cz.siret.prank.program.params
+
+/**
+ * Marks parameters of the program execution (training or prediction phase)
+ * i.e. parameters not directly related to the prediction algorithm itself.
+ *
+ * Currently the annotation serves only documentation purposes.
+ */
+@interface RuntimeParam {
+
+}
\ No newline at end of file
diff --git a/src/main/groovy/cz/siret/prank/utils/CutoffAtomsCallLog.java b/src/main/groovy/cz/siret/prank/utils/CutoffAtomsCallLog.java
index de90bd29..44607989 100644
--- a/src/main/groovy/cz/siret/prank/utils/CutoffAtomsCallLog.java
+++ b/src/main/groovy/cz/siret/prank/utils/CutoffAtomsCallLog.java
@@ -7,7 +7,7 @@ import java.util.List;
 import java.util.stream.Collectors;
 
 /**
- *
+ * Utility for debugging and profiling cutoffAtoms methods
  */
 public class CutoffAtomsCallLog {
 
diff --git a/src/main/groovy/cz/siret/prank/utils/ProcessRunner.groovy b/src/main/groovy/cz/siret/prank/utils/ProcessRunner.groovy
index ca93de21..a200acb6 100644
--- a/src/main/groovy/cz/siret/prank/utils/ProcessRunner.groovy
+++ b/src/main/groovy/cz/siret/prank/utils/ProcessRunner.groovy
@@ -6,7 +6,7 @@ import groovy.transform.CompileStatic
 import groovy.util.logging.Slf4j
 
 /**
- *
+ * Encapsulates a system (command line) process
  */
 @Slf4j
 @CompileStatic