mirror of
https://github.com/rdk/p2rank.git
synced 2026-06-04 12:44:24 +08:00
Add cofactor-as-protein-surface feature (Issue #79 part 2)
The -cofactors flag and dataset cofactors column accept LigandDefinition
specifiers ("FAD", "FAD[atom_id:N]", "FAD[contact_res_ids:A_T259,A_D246]").
Matched HET groups merge into the protein surface (proteinAtoms) and are
excluded from ligand listings; per-item resolution lets a dataset column
override the global Params.cofactors.
New: analyze cofactors subcommand (HETATM survey + specifier dry-run),
PyMOL teal-stick visualization (vis_highlight_cofactors), distant-cofactor
and chain-excluded WARN diagnostics, aa_mapping collision WARN (R19),
drop-in safety benchmark with byte-equality on a never-present specifier.
Documentation in documentation/cofactors.md (user-facing) and
documentation/dev/cofactors.md (engineering record with R1-R24 design choices
and post-merge audit fixes). Tests in CofactorHandlerTest,
CofactorIntegrationTest, CofactorPipelineTest, CofactorAnalyzeTest,
DataTableCsvTest plus a Log4jCapture test helper.
This commit is contained in:
71
benchmark/cofactors_dropin_safety.sh
Executable file
71
benchmark/cofactors_dropin_safety.sh
Executable file
@@ -0,0 +1,71 @@
|
||||
#!/usr/bin/env bash
|
||||
#
|
||||
# Drop-in safety benchmark for the -cofactors feature (Issue #79 / R18).
|
||||
#
|
||||
# Premise: setting -cofactors to a HETATM that does not exist in any structure of the
|
||||
# dataset must be a strict no-op. If the byte-level diff between baseline and "with
|
||||
# never-present cofactor" runs is non-empty, a code path is implicitly different and
|
||||
# needs investigation before merging.
|
||||
#
|
||||
# Usage: benchmark/cofactors_dropin_safety.sh [dataset.ds]
|
||||
#
|
||||
# Default dataset is distro/test_data/concavity.ds (~24 structures - enough for
|
||||
# byte-equality testing). Pass a larger dataset for stronger signal.
|
||||
|
||||
set -e
|
||||
|
||||
DATASET="${1:-distro/test_data/concavity.ds}"
|
||||
OUT="${COFACTORS_BENCH_OUT:-/tmp/cofactor_safety}"
|
||||
|
||||
if [[ ! -f "$DATASET" ]]; then
|
||||
echo "ERROR: dataset not found: $DATASET"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [[ ! -x ./distro/prank ]]; then
|
||||
echo "ERROR: ./distro/prank not found. Build with: ./gradlew jar copyBinaryToDist copyDependenciesToDist"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
rm -rf "$OUT"
|
||||
mkdir -p "$OUT"
|
||||
|
||||
echo "[1/3] Baseline (no -cofactors) on $DATASET..."
|
||||
./distro/prank predict "$DATASET" -o "$OUT/baseline" -seed 42 -threads 1 -visualizations 1 >/dev/null
|
||||
|
||||
echo "[2/3] With -cofactors ZZZZ (never-present HETATM) on $DATASET..."
|
||||
./distro/prank predict "$DATASET" -o "$OUT/with_zzzz" -cofactors ZZZZ -seed 42 -threads 1 -visualizations 1 >/dev/null
|
||||
|
||||
echo "[3/3] Comparing output files (the actual byte-equality guarantee)..."
|
||||
# Compare every per-protein output file. Excludes:
|
||||
# params.txt - legitimately captures the -cofactors arg
|
||||
# run.log - timestamps + outdir path differ trivially
|
||||
# *.json - may embed the run config
|
||||
# Includes all *.csv (predictions, residues, pointscores, sub-pocket, etc.) and *.pml
|
||||
# (PyMOL scripts - the visualization output that a regression could silently break).
|
||||
FAIL=0
|
||||
while IFS= read -r f; do
|
||||
rel="${f#./}"
|
||||
case "$rel" in
|
||||
params.txt|run.log|*.json) continue ;;
|
||||
esac
|
||||
if [[ ! -f "$OUT/with_zzzz/$rel" ]]; then
|
||||
echo "MISSING in with_zzzz: $rel"
|
||||
FAIL=1
|
||||
continue
|
||||
fi
|
||||
if ! diff -q "$OUT/baseline/$rel" "$OUT/with_zzzz/$rel" >/dev/null 2>&1; then
|
||||
echo "DIFFERS: $rel"
|
||||
diff "$OUT/baseline/$rel" "$OUT/with_zzzz/$rel" | head -20
|
||||
FAIL=1
|
||||
fi
|
||||
done < <(cd "$OUT/baseline" && find . -type f \( -name "*.csv" -o -name "*.pml" -o -name "*.pdb" -o -name "*.cif" \))
|
||||
|
||||
if [[ "$FAIL" -eq 0 ]]; then
|
||||
echo "PASS: all per-protein outputs byte-identical between baseline and with-never-present-cofactor runs."
|
||||
exit 0
|
||||
else
|
||||
echo "FAIL: at least one output differs. Investigate before merging."
|
||||
echo "Output dir: $OUT"
|
||||
exit 1
|
||||
fi
|
||||
13491
distro/test_data/1AHP.pdb
Normal file
13491
distro/test_data/1AHP.pdb
Normal file
File diff suppressed because it is too large
Load Diff
611
documentation/cofactors.md
Normal file
611
documentation/cofactors.md
Normal file
@@ -0,0 +1,611 @@
|
||||
# Cofactors as Protein Surface
|
||||
|
||||
PDB structures often contain non-protein groups that are biologically
|
||||
inseparable from the protein - flavins (FAD, FMN), pyridoxal phosphate (PLP),
|
||||
hemes, NAD/NADP, coenzyme A, metal cofactors. When P2Rank's default behaviour
|
||||
drops these groups as "ligands", any pocket that sits next to the cofactor
|
||||
ends up underestimated or missed. The `-cofactors` parameter lets you mark
|
||||
selected HETATM groups as **part of the protein surface** so pockets are
|
||||
generated and ranked correctly in their presence.
|
||||
|
||||
## What it does
|
||||
|
||||
For each group named in `-cofactors` (or in the dataset `cofactors` column):
|
||||
|
||||
1. The group's heavy atoms are added to the protein surface - they contribute
|
||||
to SAS-point generation and pocket detection.
|
||||
2. The group is **excluded** from ligand detection - it does not appear in
|
||||
any `*_predictions.csv` or `*_residues.csv` ligand listings, and does not
|
||||
affect pocket ranking as a target.
|
||||
|
||||
The same trained model is used; this is a runtime configuration only.
|
||||
With the default empty list (`cofactors = []`) behaviour is byte-for-byte
|
||||
identical to earlier P2Rank versions.
|
||||
|
||||
## When to Use This
|
||||
|
||||
| Scenario | Typical cofactors |
|
||||
|---|---|
|
||||
| Pocket prediction around a covalently attached flavin | `FAD`, `FMN`, `FDA` (reduced FAD) |
|
||||
| Pocket prediction in PLP-dependent enzymes (Schiff-base) | `PLP`, `PMP` |
|
||||
| Heme proteins (cytochromes, P450s) where docking targets sit adjacent to the heme | `HEM`, `HEC`, `HEA`, `HEB` |
|
||||
| Acyl-binding pockets in CoA-dependent enzymes | `COA` |
|
||||
| NAD(P)-binding pockets in dehydrogenases | `NAD`, `NAP` (NADP) |
|
||||
| Tightly bound metals at the entrance of the active site | `ZN`, `MG`, `MN`, `FE`, `CA` |
|
||||
|
||||
Use this when the cofactor is biologically part of the active site, not when
|
||||
it is the *ligand of interest* (in that case it stays a regular ligand).
|
||||
|
||||
## Quick Start
|
||||
|
||||
```bash
|
||||
prank predict -f protein.pdb -cofactors FAD # all FAD groups
|
||||
prank predict -f protein.pdb -cofactors 'FAD[group_id:A_500]' # one specific FAD
|
||||
prank predict cofactor-dataset.ds # per-row via .ds column
|
||||
```
|
||||
|
||||
See [Command-Line Usage](#command-line-usage) for the full grid of forms, and
|
||||
[Dataset Files](#dataset-files-ds) for the dataset-column layout.
|
||||
|
||||
## Command-Line Usage
|
||||
|
||||
```bash
|
||||
# Single cofactor type
|
||||
prank predict -f protein.pdb -cofactors FAD
|
||||
|
||||
# Multiple types
|
||||
prank predict -f protein.pdb -cofactors FAD,NAD,PLP,HEM
|
||||
|
||||
# Precise selection - only the FAD in chain A at residue 600
|
||||
prank predict -f protein.pdb -cofactors 'FAD[group_id:A_600]'
|
||||
|
||||
# Mix bare names and precise specifiers
|
||||
prank predict -f protein.pdb -cofactors 'FAD,HEM[group_id:A_300]'
|
||||
```
|
||||
|
||||
Quote the argument when it contains square brackets so the shell does not
|
||||
interpret them.
|
||||
|
||||
## Dataset Files (`.ds`)
|
||||
|
||||
Add a `cofactors` column to per-structure dataset files to give each
|
||||
structure its own cofactor selection. A non-blank value overrides the
|
||||
global `-cofactors` setting for that row.
|
||||
|
||||
```
|
||||
HEADER: protein cofactors
|
||||
1ahp.pdb PLP
|
||||
1eli.pdb FAD,NAD
|
||||
2v61.pdb FAD[group_id:A_600]
|
||||
4ri7.pdb GSH[contact_res_ids:A_C158,A_R171,A_E186]
|
||||
1fbl.pdb
|
||||
```
|
||||
|
||||
A **blank** cofactors column for a row means "inherit the global `-cofactors`
|
||||
setting for this structure" - it does **not** mean "no cofactors". To force no
|
||||
cofactors for a specific row, leave `-cofactors` off entirely and leave the
|
||||
column blank, or remove the row from the dataset.
|
||||
|
||||
Combined with other columns:
|
||||
|
||||
```
|
||||
HEADER: protein chains cofactors
|
||||
1ahp.pdb A PLP
|
||||
1eli.pdb A,B FAD,NAD
|
||||
```
|
||||
|
||||
## Config File
|
||||
|
||||
Set a default for a config-driven workflow:
|
||||
|
||||
```groovy
|
||||
// config/my-config.groovy
|
||||
cofactors = ["FAD", "PLP", "HEM"]
|
||||
```
|
||||
|
||||
```bash
|
||||
prank predict -c my-config.groovy -f protein.pdb
|
||||
```
|
||||
|
||||
## Cofactor Specifier Syntax
|
||||
|
||||
The `-cofactors` parameter and the dataset `cofactors` column accept the
|
||||
**same identifier syntax as the dataset `ligands` column**. Each entry is
|
||||
either a bare residue name or a residue name followed by a square-bracketed
|
||||
specifier.
|
||||
|
||||
| Form | Matches |
|
||||
|------|---------|
|
||||
| `FAD` | All groups whose PDB residue name is `FAD` |
|
||||
| `FAD[group_id:A_500]` | The FAD group in chain `A` at residue number `500` |
|
||||
| `FAD[A_500]` | Shorthand for `[group_id:A_500]` |
|
||||
| `FAD[atom_id:12345]` | The FAD group containing the atom with PDB atom serial `12345` |
|
||||
| `FAD[contact_res_ids:A_D246,A_T259,A_E423]` | The FAD group whose surrounding polymer residues match this set |
|
||||
|
||||
### Chain and residue identifiers (PDB vs mmCIF)
|
||||
|
||||
Specifiers like `group_id:A_500` and `contact_res_ids:A_D246` reference the
|
||||
**author** chain ID and residue number - what 3D viewers display, and what
|
||||
the PDB file uses directly. In mmCIF files these correspond to the
|
||||
`auth_asym_id` + `auth_seq_id` columns, NOT to `label_asym_id` /
|
||||
`label_seq_id`; the two can differ.
|
||||
|
||||
| File format | Chain ID source | Residue number source |
|
||||
|---|---|---|
|
||||
| PDB | Column 22 (single character; what you see in PyMOL/Chimera) | Columns 23-26 + insertion code |
|
||||
| mmCIF | `_atom_site.auth_asym_id` (NOT `label_asym_id`) | `_atom_site.auth_seq_id` (NOT `label_seq_id`) |
|
||||
|
||||
If you don't want to read the raw file, the discovery command
|
||||
`prank analyze cofactors -f protein.pdb`
|
||||
(see [Discovery & Diagnostics](#discovery--diagnostics))
|
||||
prints every HETATM group with its exact `group_id` value ready to paste
|
||||
into a specifier.
|
||||
|
||||
### `group_id`
|
||||
|
||||
`chain_resNumber`, e.g. `A_500` for chain A residue 500. Insertion codes
|
||||
are part of the residue number: `A_500A` matches residue 500A.
|
||||
|
||||
### `atom_id`
|
||||
|
||||
Integer PDB atom serial. Useful when residue numbering is unstable across
|
||||
re-deposits but the structure team kept atom serials. Only one atom needs
|
||||
to match - the whole group is then included.
|
||||
|
||||
### `contact_res_ids`
|
||||
|
||||
A list of residue identifiers that must all be present in the polymer
|
||||
neighbourhood (within ~7 Å) of the cofactor. Each entry is
|
||||
`chain_aaCode_resNumber` (e.g. `A_D246` = chain A, Asp 246) or
|
||||
`chain_resNumber` (e.g. `A_246`) when you don't want to check the residue
|
||||
type. Useful when the cofactor's `group_id` is unstable but its binding
|
||||
pocket residues are constant.
|
||||
|
||||
### Case
|
||||
|
||||
PDB residue names (the prefix like `FAD`) are normalized to uppercase
|
||||
automatically: `fad`, `Fad`, and `FAD` all match the same groups. The
|
||||
contents of `[...]` are passed through as-is and remain case-significant:
|
||||
`A_D246` (chain A, Asp 246) is not the same as `a_d246`. PDB and mmCIF
|
||||
files generally use uppercase chain IDs and canonical case for AA codes,
|
||||
so in practice copy-paste from the file works.
|
||||
|
||||
## Precedence
|
||||
|
||||
When the same cofactor configuration source could be set in multiple places,
|
||||
the highest-priority setting wins:
|
||||
|
||||
1. Dataset `cofactors` column (per row)
|
||||
2. `-cofactors` command-line argument
|
||||
3. `cofactors` setting in a config file
|
||||
4. Default - empty list
|
||||
|
||||
If a group matches **both** a cofactor specifier and an explicit ligand
|
||||
definition in the dataset, the cofactor wins: the group is treated as
|
||||
surface and excluded from ligand detection. This is intentional -
|
||||
cofactors describe surface, not targets.
|
||||
|
||||
## Distant Cofactors
|
||||
|
||||
By default, P2Rank logs an `INFO` warning when a matched cofactor's centre
|
||||
of mass is more than `15 Å` from the nearest protein atom. This usually
|
||||
means the cofactor is a crystallization artifact or a free molecule in
|
||||
solvent, and you may want to exclude it.
|
||||
|
||||
The cofactor **is still included in the surface**; the warning is advisory.
|
||||
Tune the threshold or disable the check:
|
||||
|
||||
```bash
|
||||
prank predict -f protein.pdb -cofactors FAD -cofactor_max_protein_dist 25 # relax
|
||||
prank predict -f protein.pdb -cofactors FAD -cofactor_max_protein_dist 0 # disable
|
||||
```
|
||||
|
||||
## Discovery & Diagnostics
|
||||
|
||||
Use `prank analyze cofactors` to inspect what HETATM groups exist in a
|
||||
structure or dataset, and to dry-run a `-cofactors` configuration before
|
||||
running a prediction.
|
||||
|
||||
### Survey mode (no `-cofactors`)
|
||||
|
||||
List every HETATM group with its chain, residue number, atom count, and
|
||||
distance to the protein:
|
||||
|
||||
```bash
|
||||
prank analyze cofactors -f protein.pdb
|
||||
prank analyze cofactors dataset.ds
|
||||
```
|
||||
|
||||
Output files (in the analyze output directory):
|
||||
|
||||
| File | Content |
|
||||
|---|---|
|
||||
| `het_groups.csv` | One row per HETATM group instance. Columns: `protein`, `het_name`, `chain`, `res_num`, `group_id`, `n_heavy_atoms`, `dist_to_protein`, `currently_classified_as` (`relevant_ligand` / `ignored` / `cofactor`) |
|
||||
| `het_groups_summary.txt` | Per-name frequency table across the dataset, sorted by structure-coverage |
|
||||
| `visualizations/<protein>.pml` | PyMOL script highlighting any configured cofactors (if `-visualizations 1`) |
|
||||
|
||||
The `group_id` column gives the exact string for a precise specifier -
|
||||
copy it into `-cofactors 'FAD[<group_id>]'`.
|
||||
|
||||
### Dry-run mode (with `-cofactors`)
|
||||
|
||||
Add `-cofactors` to the analyze command and P2Rank will report which
|
||||
specifiers match, structure by structure, without running any prediction:
|
||||
|
||||
```bash
|
||||
prank analyze cofactors -f protein.pdb -cofactors FAD,PLP
|
||||
prank analyze cofactors -f protein.pdb -cofactors 'FAD[group_id:A_600]'
|
||||
prank analyze cofactors dataset.ds -cofactors FAD,PLP
|
||||
```
|
||||
|
||||
Additional outputs:
|
||||
|
||||
| File | Content |
|
||||
|---|---|
|
||||
| `cofactor_matches.csv` | One row per (structure, specifier) pair. Columns: `protein`, `specifier`, `matched_count`, `matched_group_ids`, `unmatched_reason` |
|
||||
|
||||
`het_groups.csv` gains a `would_be_cofactor` column (0/1) so you can diff
|
||||
it against `currently_classified_as` to confirm the proposed change.
|
||||
|
||||
Console summary (also written to `het_groups_summary.txt`):
|
||||
|
||||
```
|
||||
HETATM Survey for dataset.ds (87 structures)
|
||||
|
||||
Most frequent HETATM groups:
|
||||
HOH 87 structures (100.0%) - 412 groups total
|
||||
SO4 42 structures ( 48.3%) - 89 groups total
|
||||
FAD 28 structures ( 32.2%) - 31 groups total
|
||||
PLP 13 structures ( 14.9%) - 13 groups total
|
||||
...
|
||||
|
||||
Cofactor specifier match (per-item resolution, column overrides global):
|
||||
FAD 28/87 structures, 31 groups total
|
||||
PLP[group_id:A_300] 9/87 structures, 9 groups total
|
||||
HEM 0/87 structures, 0 groups total ← matched no structures
|
||||
```
|
||||
|
||||
This is the fastest way to verify a precise specifier before running on a
|
||||
large dataset.
|
||||
|
||||
## Visualizations
|
||||
|
||||
When `-visualizations` is enabled, every routine that writes PyMOL output
|
||||
(`prank predict`, `prank analyze binding-sites`, `prank analyze
|
||||
labeled-residues`, `prank analyze cofactors`, …) highlights matched
|
||||
cofactor atoms as **teal sticks**, separate from the standard renderings.
|
||||
|
||||
Default style summary (with `-vis_highlight_ligands` / `-vis_highlight_cofactors`
|
||||
left at their defaults):
|
||||
|
||||
| Element | Default style |
|
||||
|---|---|
|
||||
| Protein chain | Cartoon |
|
||||
| Ligand atoms | Sticks (magenta, default PyMOL style) |
|
||||
| Ligand atoms when `-vis_highlight_ligands 1` | Red/violet spheres (separate selections in the older renderer) |
|
||||
| **Cofactor atoms** (when `-cofactors` set) | **Teal sticks** (`#49A8C7`) |
|
||||
| Pocket pseudoatoms | Per-pocket palette |
|
||||
|
||||
### Per-cofactor PyMOL selections
|
||||
|
||||
The renderer emits one PyMOL selection per cofactor name, plus an aggregate.
|
||||
For `-cofactors FAD,PLP` you get:
|
||||
|
||||
```
|
||||
select cofactor_FAD, id <FAD atom serials...>
|
||||
select cofactor_PLP, id <PLP atom serials...>
|
||||
select cofactor_atoms, cofactor_FAD or cofactor_PLP
|
||||
```
|
||||
|
||||
In PyMOL you can address `cofactor_FAD` and `cofactor_PLP` directly to,
|
||||
e.g., colour them differently, hide one of them, or zoom in. The bracketed
|
||||
selection name is sanitised: any non-alphanumeric character in the residue
|
||||
name becomes an underscore (rare - PDB names are 1–4 alphanumeric anyway).
|
||||
|
||||
If any specifier matched nothing, the .pml file also contains a diagnostic
|
||||
comment listing the unmatched specifiers, so a user inspecting the
|
||||
visualization can see which patterns didn't apply.
|
||||
|
||||
### Disabling the highlight
|
||||
|
||||
```bash
|
||||
prank predict -f protein.pdb -cofactors FAD -vis_highlight_cofactors 0
|
||||
```
|
||||
|
||||
`-vis_highlight_cofactors` defaults to `true`. Setting it to `false` does
|
||||
not change pocket prediction - cofactor atoms are still part of the
|
||||
surface; only the colour highlight is removed.
|
||||
|
||||
## Common Cofactors Reference
|
||||
|
||||
### Frequently used
|
||||
|
||||
| Name | Full name | Heavy atoms | Typical role |
|
||||
|------|-----------|-------------|--------------|
|
||||
| FAD | Flavin adenine dinucleotide | ~53 | Redox |
|
||||
| FMN | Flavin mononucleotide | ~31 | Electron transfer |
|
||||
| NAD | Nicotinamide adenine dinucleotide | ~44 | Redox |
|
||||
| NAP | NADP (phosphorylated NAD) | ~48 | Redox |
|
||||
| PLP | Pyridoxal 5'-phosphate | ~15 | Amino-acid metabolism |
|
||||
| HEM | Heme | ~43 | Oxygen / electron transport |
|
||||
| HEC | Heme C (covalent) | ~43 | Cytochrome c |
|
||||
| COA | Coenzyme A | ~51 | Acyl carrier |
|
||||
| TPP | Thiamine pyrophosphate | ~25 | Decarboxylation |
|
||||
| SAM | S-adenosylmethionine | ~27 | Methylation |
|
||||
| GSH | Glutathione | ~20 | Redox / detoxification |
|
||||
|
||||
### Modified variants
|
||||
|
||||
Many cofactors have variant codes for chemically modified forms - these
|
||||
are **not aliases** in P2Rank; you must specify the exact code that appears
|
||||
in your structure file.
|
||||
|
||||
| Standard | Modified variant | Notes |
|
||||
|----------|------------------|-------|
|
||||
| FAD | FDA | Reduced FAD (FADH₂) |
|
||||
| NAD | NAI, NAJ | Modified NAD |
|
||||
| HEM | HEC, HEA, HEB | Heme C, A, B variants |
|
||||
|
||||
To list every HETATM code present in a structure:
|
||||
|
||||
```bash
|
||||
grep '^HETATM' protein.pdb | awk '{print $4}' | sort -u
|
||||
```
|
||||
|
||||
For mmCIF:
|
||||
|
||||
```bash
|
||||
grep '^HETATM' protein.cif | awk '{print $6}' | sort -u
|
||||
```
|
||||
|
||||
### Metal ions
|
||||
|
||||
Single-atom HETATM "groups" can also be used as cofactors. They contribute
|
||||
one heavy atom to the surface each, which is small but can matter for
|
||||
adjacent metal-coordinated pockets.
|
||||
|
||||
| Name | Element | Notes |
|
||||
|------|---------|-------|
|
||||
| ZN | Zinc | Catalytic or structural |
|
||||
| MG | Magnesium | ATP binding, phosphoryl transfer |
|
||||
| MN | Manganese | Metalloenzymes |
|
||||
| FE | Iron | Non-heme iron |
|
||||
| FE2 | Iron (II) | Ferrous iron |
|
||||
| CU | Copper | Electron transfer |
|
||||
| CA | Calcium | Signaling, structural |
|
||||
|
||||
## Interactions with Other Features
|
||||
|
||||
### Explicit ligand definitions (`ligands` column)
|
||||
|
||||
If the same group is named in both the `ligands` and `cofactors` columns,
|
||||
the cofactor side wins - the group is treated as surface. This is the only
|
||||
sensible reading of "this group is a cofactor": it must not also be a
|
||||
prediction target.
|
||||
|
||||
### `chains` column / `-chains`
|
||||
|
||||
Chain reduction runs before cofactor matching. A cofactor on an excluded
|
||||
chain is simply not found. P2Rank logs an `INFO` line when it detects that
|
||||
the **name** of a missing cofactor exists in the unreduced structure
|
||||
(name-only check, see Known Limitations).
|
||||
|
||||
### `aa_mapping`
|
||||
|
||||
`-aa_mapping` (non-canonical residue mapping) operates on polymer-chain
|
||||
residues; `-cofactors` operates on HETATM groups. They never interact -
|
||||
**unless** a custom `aa_mapping` CSV contains an entry whose 3-letter code
|
||||
matches a cofactor's group name. In that case the cofactor atoms inherit
|
||||
the mapped amino acid's feature-table entries instead of zeros, and P2Rank
|
||||
emits a startup warning:
|
||||
|
||||
```
|
||||
WARN Cofactor specifier(s) name(s) [PLP] are also covered by the active
|
||||
aa_mapping. Cofactor atom features will be computed using the mapped AA's
|
||||
table entries instead of cofactor defaults.
|
||||
```
|
||||
|
||||
If you see this warning and didn't intend the override, either:
|
||||
- remove the entry from your custom `aa_mapping` CSV, or
|
||||
- change the cofactor specifier to use a different code.
|
||||
|
||||
The bundled `pdbfixer` mapping only contains canonical AA aliases, so this
|
||||
warning typically appears only for custom mappings.
|
||||
|
||||
Normal combined use is unaffected:
|
||||
|
||||
```bash
|
||||
prank predict -f protein.pdb -aa_mapping pdbfixer -cofactors FAD,PLP
|
||||
```
|
||||
|
||||
### `load_ligands_from_separate_files`
|
||||
|
||||
When this mode is on, all HETATMs from the primary structure file are
|
||||
moved to the ignored-ligand list without consulting the cofactor settings.
|
||||
Cofactor atoms are still added to the surface correctly, but a cofactor
|
||||
present in the primary file will also appear in the ignored-ligand listing.
|
||||
This is a cosmetic issue and does not affect predictions; it is rare in
|
||||
practice. If you need a clean ignored-ligand listing in this mode, remove
|
||||
the cofactor from the primary file or split it into a separate ligand file.
|
||||
|
||||
### `csv` feature (`-feat_csv_columns`)
|
||||
|
||||
Cofactor atoms are not expected to have entries in user-provided per-atom
|
||||
or per-residue CSV files (those describe polymer atoms / residues). When
|
||||
both `-cofactors` and `-feat_csv_columns` are set, cofactor atoms receive
|
||||
a zero vector for every CSV column. This bypass is unconditional -
|
||||
`-feat_csv_ignore_missing` retains its strict default for polymer atoms.
|
||||
|
||||
## Feature Values for Cofactor Atoms
|
||||
|
||||
When `-cofactors` is set, the matched HETATM atoms become full citizens of
|
||||
the protein surface for the purposes of pocket prediction. Different
|
||||
features handle them in different ways. The table below summarises what
|
||||
each feature does for an atom that is part of a cofactor (e.g. a nitrogen
|
||||
of FAD). Users running with `-cofactors` should know these behaviours
|
||||
because they affect the feature distribution the model sees near the
|
||||
cofactor.
|
||||
|
||||
| Feature | What cofactor atoms contribute |
|
||||
|---|---|
|
||||
| `chem` (AA chemistry) | Only the element-based atom counters (`atomC` / `atomO` / `atomN`) and `atomDensity`. All AA-property fields (`hydrophobic`, `polar`, `acidic`, `basic`, …) are zero. |
|
||||
| `atom_table` / `residue_table` / `aa` / `ares` | Zero values - the cofactor's residue code isn't in the AA-keyed lookup tables. |
|
||||
| `volsite` (pharmacophore) | Zero - the lookup cascade only matches AA atom names. |
|
||||
| `bfactor` | The cofactor atom's actual B-factor (real data). |
|
||||
| `hybridization` | Element-based fallback (sp2 for N, sp3 for C / O / S / P / Se). |
|
||||
| `sidechain` | Always 1 (cofactor atoms are not backbone). |
|
||||
| `exposed` | 1 if the cofactor atom is solvent-accessible, 0 otherwise. |
|
||||
| `csv` | Zero (see [csv feature](#csv-feature--feat_csv_columns) above). |
|
||||
| `conservation` | Zero - cofactors are not in the conservation score file. |
|
||||
| `cres1` / `cres1pos` (SAS-level) | **Unaffected.** Uses nearest polymer residue, never sees the cofactor name. |
|
||||
| `protrusion` (SAS-level) | Cofactor atoms count toward protrusion at nearby SAS points - correct, the cofactor adds bulk. |
|
||||
| Energy / Lennard-Jones (SAS-level) | Element-based LJ parameter lookup, works identically for polymer and cofactor atoms. |
|
||||
|
||||
### What this means in practice
|
||||
|
||||
At a SAS point that sits *on* the cofactor surface, the neighbourhood is
|
||||
mostly cofactor atoms, so the AA-property features are near zero. This is
|
||||
biologically correct - the local surface really isn't amino-acid-like.
|
||||
|
||||
At a SAS point on the polymer *near* a cofactor, the neighbourhood mixes
|
||||
cofactor and polymer atoms. The cofactor atoms contribute zero to the
|
||||
AA-property features but still count toward the average's divisor - so
|
||||
the AA signal is diluted in proportion to the cofactor fraction of the
|
||||
neighbourhood.
|
||||
|
||||
P2Rank's prediction models were trained on protein-only data where every
|
||||
surface atom had full AA-property values. The dilution introduces a small
|
||||
feature-distribution shift the trained model wasn't directly exposed to.
|
||||
For typical sites (a cofactor occupying a minor fraction of the
|
||||
neighbourhood), the shift is small. Predictions remain useful, but you
|
||||
should treat raw pocket scores from cofactor-enabled runs as not strictly
|
||||
comparable to cofactor-disabled runs on the same structure.
|
||||
|
||||
If you observe noticeably different pocket rankings with and without
|
||||
`-cofactors` on a structure where you expect them to be similar, that's
|
||||
the dilution effect - see the [`prank analyze cofactors`](#discovery--diagnostics)
|
||||
diagnostic to inspect which atoms were affected.
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### "I don't see the startup message"
|
||||
|
||||
```
|
||||
Cofactors to include as protein surface: [...]
|
||||
```
|
||||
|
||||
is logged only when `-cofactors` is non-empty. If it's missing, check:
|
||||
|
||||
- You passed `-cofactors` (or set it in the config / dataset column).
|
||||
- The argument has no spaces around the commas: `FAD,PLP`, not `FAD, PLP`.
|
||||
- The argument is quoted if it uses square brackets: `'FAD[A_500]'`.
|
||||
|
||||
### "I don't see the per-structure 'included' message"
|
||||
|
||||
Either the specifier matched no groups, or P2Rank exited before processing
|
||||
that structure. Enable DEBUG to see the available HETATM groups:
|
||||
|
||||
```bash
|
||||
prank predict -f protein.pdb -cofactors FAD -log_level DEBUG
|
||||
```
|
||||
|
||||
You should then see one of:
|
||||
|
||||
```
|
||||
Structure protein.pdb: included 1 cofactor type(s) as protein surface (FAD: 53 atoms)
|
||||
```
|
||||
|
||||
or
|
||||
|
||||
```
|
||||
Structure protein.pdb: cofactor specifier(s) [FAD] matched no groups. Available HETATM groups: [...]
|
||||
```
|
||||
|
||||
### "The cofactor still appears in my predictions CSV"
|
||||
|
||||
Run the basic checks:
|
||||
|
||||
```bash
|
||||
# Confirm protein-atom count actually increased
|
||||
prank predict -f protein.pdb -cofactors FAD # log: "protein atoms: 4123"
|
||||
prank predict -f protein.pdb # log: "protein atoms: 4070"
|
||||
|
||||
# Confirm the cofactor is no longer in the predictions CSV
|
||||
grep -i FAD output/protein.pdb_predictions.csv # should be empty
|
||||
```
|
||||
|
||||
If the cofactor name still appears after enabling `-cofactors`, check the
|
||||
spelling against the HETATM listing in the PDB file
|
||||
(`grep '^HETATM' protein.pdb | awk '{print $4}' | sort -u`).
|
||||
|
||||
### "My precise specifier doesn't match"
|
||||
|
||||
The fastest diagnostic is dry-run mode of `analyze cofactors`:
|
||||
|
||||
```bash
|
||||
prank analyze cofactors -f protein.pdb -cofactors 'FAD[group_id:A_600]'
|
||||
# look at cofactor_matches.csv: matched_count, matched_group_ids, unmatched_reason
|
||||
```
|
||||
|
||||
`het_groups.csv` from the same run lists every HETATM group in the file
|
||||
with its exact `group_id` value to copy back into the specifier.
|
||||
|
||||
If P2Rank isn't available or you want to spot-check directly, the raw
|
||||
PDB recipe still works:
|
||||
|
||||
```bash
|
||||
grep '^HETATM.*FAD' protein.pdb | head -3
|
||||
# HETATM 4123 N1 FAD A 600 12.345 ...
|
||||
# ^^^^^
|
||||
# chain A, residue 600 → group_id: A_600
|
||||
```
|
||||
|
||||
Insertion codes are part of the residue number:
|
||||
`grep` shows `... FAD A 500A ...` → use `A_500A`.
|
||||
|
||||
### "I get a `PrankException: Invalid cofactor specifier...`"
|
||||
|
||||
A specifier failed to parse. The error message includes the offending
|
||||
string and the reason (unknown specifier type, non-integer atom_id, etc.).
|
||||
Check the syntax table above.
|
||||
|
||||
## Known Limitations
|
||||
|
||||
- **Case-sensitive group names.** Group names are matched against
|
||||
`group.PDBName` exactly. In practice this means uppercase, since BioJava
|
||||
normalizes PDB residue names on load.
|
||||
- **AA-property feature dilution.** Cofactor atoms participate in the
|
||||
feature aggregation at each SAS point but contribute zero values for
|
||||
AA-property features. This dilutes the polymer signal at SAS points near
|
||||
a cofactor - see [Feature Values for Cofactor Atoms](#feature-values-for-cofactor-atoms).
|
||||
Effect is typically small; significant only if cofactors dominate the
|
||||
local neighbourhood.
|
||||
- **`load_ligands_from_separate_files`.** A cofactor present in the primary
|
||||
structure file appears in the ignored-ligand listing in this mode. Surface
|
||||
inclusion is unaffected.
|
||||
- **Chain-restriction diagnostic.** The "lost during chain reduction" log
|
||||
matches on residue name only (not the full precise specifier), so it can
|
||||
occasionally report a name that wouldn't have matched the precise
|
||||
specifier anyway. Treat it as an advisory.
|
||||
- **`aa_mapping` overlap.** A custom AA-mapping entry that matches a
|
||||
cofactor's group name silently remaps cofactor atoms' features to the
|
||||
mapped AA. Detected at startup with a WARN (see `aa_mapping` interaction
|
||||
section above).
|
||||
- **fpocket rescoring.** This feature changes pocket prediction in `prank
|
||||
predict`. It does not change the way `prank rescore` invokes fpocket;
|
||||
fpocket has its own hard-coded list of "ligand" HETATMs.
|
||||
|
||||
## See Also
|
||||
|
||||
- [`aa-mapping.md`](aa-mapping.md) - non-canonical residue mapping for
|
||||
modified amino acids in the polymer chain. Independent feature, useful
|
||||
alongside `-cofactors`.
|
||||
- [`feature-setup.md`](feature-setup.md) - how feature values are
|
||||
calculated. See the [Feature Values for Cofactor Atoms](#feature-values-for-cofactor-atoms)
|
||||
section above for cofactor-specific behaviour.
|
||||
- [`hidden-commands.md`](hidden-commands.md) - other `prank analyze`
|
||||
subcommands. The cofactor-specific `prank analyze cofactors` is
|
||||
documented under "Discovery & Diagnostics" above.
|
||||
- `documentation/dev/cofactors.md` (engineering doc) - implementation status,
|
||||
design choices with reasoning, planned improvements. Read this if you are
|
||||
changing the feature, not if you are using it.
|
||||
218
documentation/dev/cofactors.md
Normal file
218
documentation/dev/cofactors.md
Normal file
@@ -0,0 +1,218 @@
|
||||
# Cofactors as Protein Surface - Dev Notes
|
||||
|
||||
Engineering record for the `-cofactors` feature (GitHub Issue #79, Part 2).
|
||||
|
||||
For users: see [`../cofactors.md`](../cofactors.md).
|
||||
Planning history: `local/PLAN_COFACTORS_V7.md`.
|
||||
|
||||
## Status
|
||||
|
||||
- [x] Phase 1 - Params, CofactorHandler, LoaderParams (core infrastructure)
|
||||
- [x] Phase 2 - Dataset wiring (`cofactors` column, `getEffectiveCofactorDefinitions`)
|
||||
- [x] Phase 3 - Behaviour (Ligands guard + filter, Protein.loadStructure cofactor block)
|
||||
- [x] Phase 4 - Validation, CsvFileFeature fix (R17), aa_mapping warning (R19)
|
||||
- [x] Phase 5 - Tests (CofactorHandlerTest / CofactorIntegrationTest / CofactorPipelineTest)
|
||||
- [x] Phase 6 - Manual smoke test + drop-in safety benchmark (R18)
|
||||
- [x] Companion plan - `analyze cofactors` subcommand + PyMOL teal-stick highlight (`vis_highlight_cofactors`)
|
||||
- [x] Post-merge audit pass (21 findings addressed) - see "Post-merge audit fixes" below
|
||||
|
||||
## Design Choices (R1–R24)
|
||||
|
||||
Each entry links back to the plan's Refinement Log. The plan is the canonical source of full
|
||||
rationale; this table is the as-implemented record.
|
||||
|
||||
| ID | Choice | As-implemented note |
|
||||
|-----|--------|---------------------|
|
||||
| R1 | `CofactorHandler` is single source of truth on `LoaderParams` | Implemented as planned. |
|
||||
| R2 | Single-scan extraction via `ExtractionResult` | Implemented as planned. |
|
||||
| R3 | `"ZZZZ"` is the never-present test specifier | Used in `missingCofactorDoesNotFail` test and in the benchmark script. |
|
||||
| R4 | Unmatched specifiers logged at DEBUG | Implemented in `logResult`. |
|
||||
| R5 | `Ligand.contactDistance` side-effect documented | Documented in user doc; behaviour inherited (no code change). |
|
||||
| R6 | Altloc + modified-name handling: no special code | Inherited from BioJava defaults. |
|
||||
| R7 | Per-feature atom-level behaviour table | Documented in user doc and plan §5.2. |
|
||||
| R8 | Original CSV "treated as 0.0" claim - superseded by R17 | R17 supersedes; this entry is historical. |
|
||||
| R9 | 1-arg `Protein.load(String)` note for library users | Documented in user doc + plan §"Library API Support". |
|
||||
| R10 | Cofactor > explicit ligand precedence | Implemented in `Ligands.isRelevantLigandGroup` (cofactor guard fires first). Plus the upstream filter at `Ligands.loadForProtein` - see "Deviation from plan" below. |
|
||||
| R11 | Distant-cofactor INFO warning + `cofactor_max_protein_dist` param (default 15Å) | Implemented in `CofactorHandler.warnDistantCofactors`. |
|
||||
| R12 | Pipeline tests on existing `1t7qa.pdb` (COA) | `CofactorPipelineTest` exists; tests pass. |
|
||||
| R13 | `ExtractionResult` stored on `Protein.secondaryData` | `Protein.cofactorExtractionResult` accessor added. |
|
||||
| R14 | Chain-reduction "lost cofactors" INFO warning | `warnChainExcludedCofactors` implemented; name-only matching (documented limitation). |
|
||||
| R15 | `load_ligands_from_separate_files` interaction is cosmetic | Not exercised by tests; documented in user doc Known Limitations. |
|
||||
| R16 | Reuse `Dataset.LigandDefinition` for specifier syntax + identity-based `isCofactor(Group)` | Implemented as planned. Identity set is `Collections.newSetFromMap(new IdentityHashMap<>())`. |
|
||||
| R17 | `CsvFileFeature` cofactor bypass | Implemented as a 5-line guard at top of `calculateForAtom`. Uses `loaderParams?.isCofactor(group)`. |
|
||||
| R18 | Drop-in safety benchmark | `benchmark/cofactors_dropin_safety.sh` written and run - see "Benchmark Results" below. |
|
||||
| R19 | `aa_mapping`-collision WARN at startup | Implemented in `Main.initParams`. |
|
||||
| R20 | `ConservationFeature` GroupType-guard fallback verified, regression test added | `ConservationScore.getScoreForResidueSafe` uses `?: 0d`; regression test `conservationFeatureSafeOnCofactorAtoms` injects a `ConservationScore` containing a "poison" value under the cofactor's residue number, then asserts the cofactor atom returns 0.0 (proving the AMINOACID type-guard short-circuited the lookup) while a polymer-atom control returns its real score. |
|
||||
| R21 | **Structured `RenderingModel.cofactorResult` (Option A) instead of flat `cofactorResult` list** | Renderer receives the full `ExtractionResult`. `NewPymolRenderer.cofactorPymolBlock` emits one PyMOL selection per cofactor name (`cofactor_FAD`, `cofactor_PLP`, …) plus an aggregate `cofactor_atoms`. Per-name selections give users a PyMOL handle for toggling individual cofactors; unmatched specifiers go into a header comment as a diagnostic. `PymolRenderer.renderCofactors` delegates to the same shared block to keep the two renderers in sync. Tests: `pymolRendererEmitsPerNameSelections`, `unmatchedSpecifierAppearsInPymolComment`. |
|
||||
| R22 | **Centralized cofactor specifier parsing** in `CofactorHandler.parseAndValidate` | Single entry point for both CLI list (`List<String>`) and dataset-column string (`String`). Joins list elements with `,`, re-splits with `Sutils.splitRespectInnerParentheses(...,',','[',']')` so a `contact_res_ids:A,B,C` specifier that was over-split upstream by naive comma-splitters is correctly reassembled. Case-normalizes group name (so `-cofactors fad` matches `FAD` groups instead of silently zero-matching). Wraps `LigandDefinition.parse` errors with cofactor-specific context so the message doesn't say "ligand definition in the dataset file" for a CLI source. Tests: `contactResIdsCommasArePreservedFromList`, `contactResIdsCommasArePreservedFromString`, `lowercaseGroupNameIsNormalized`, `mixedCaseGroupNameIsNormalized`, `specifierBodyPreservesCase`, `errorMessageMentionsCofactor`. |
|
||||
| R23 | **Per-item resolution in `Dataset.resolveCofactorDefinitions(Item)`** | Single helper, package-visible. Used by both `getLoaderParams()` (so protein loading respects the per-row column) and `AnalyzeRoutine.cmdCofactors` (so analyze cofactors' dry-run mode reflects the same effective specifiers - column overrides global - that prediction would). |
|
||||
| R24 | **CSV cell quoting in `DataTable.toCsv`** | RFC 4180 quoting: cells containing comma / double-quote / newline get wrapped in double-quotes with inner double-quotes doubled. Fixes the `cofactor_matches.csv` round-trip issue for specifiers like `FAD[contact_res_ids:A,B]`. Tests: `DataTableCsvTest`. |
|
||||
|
||||
## Post-merge audit fixes
|
||||
|
||||
After the initial feature was complete, a deep audit (5 parallel agents on independent angles)
|
||||
surfaced 21 findings. They were addressed in three passes:
|
||||
|
||||
**Pass 1 - documentation/comment corrections**
|
||||
1. `documentation/cofactors.md` Config File - removed the precise-specifier example (per user rule, that syntax isn't expected in config files).
|
||||
2. `documentation/cofactors.md` Known Limitations - removed stale `ligands_separated_by_TER` bullet (symbol doesn't exist in code).
|
||||
14. This file's `(R1–R20)` header now reads `(R1–R24)` (contents already covered R21-R24).
|
||||
15. `documentation/cofactors.md` sample analyze output - aligned with actual code output format (added `- N groups total`, fixed "matched in" prefix, "← warning" → "← matched no structures").
|
||||
16. `Main.groovy` comment - renamed reference `getEffectiveCofactorDefinitions` → `resolveCofactorDefinitions`.
|
||||
|
||||
**Pass 2 - code bug fixes**
|
||||
3. `CofactorHandler.extractCofactorAtoms` now uses `Struct.getLigandGroups(protein)` instead of `getHetGroups(structure)`, aligning with `Ligands.loadForProtein`. Previously, GDP/GTP/ATP/SHR-style groups (BioJava-classified as `NUCLEOTIDE`/`AMINOACID`) silently failed to match as cofactors.
|
||||
4. `CofactorHandler.extractCofactorAtoms` clears `matchedGroups` on each invocation - `Protein.transformedCopy` reuses the same handler with a deep-copied structure, so stale Group identity references must be discarded.
|
||||
5. `DataTable.csvCell` now also quotes on `\r` (audit found gap in RFC-4180 compliance).
|
||||
6. `NewPymolRenderer.cofactorPymolBlock` uses `set_color cofactor_col, [rgb]` (canonical PyMOL comma form) instead of `set_color cofactor_col = [rgb]`.
|
||||
17. `CofactorHandler.parseOne` now `trim()`s the group-name prefix before upper-casing, so `"fad [group_id:A_500]"` (accidental space) matches `FAD` instead of silently never-matching as `"FAD "`.
|
||||
18. `warnDistantCofactors` and `warnChainExcludedCofactors` log at `WARN` (not `INFO`) - these are advisory but warn-worthy conditions.
|
||||
|
||||
**Pass 3 - test hardening** (see `src/test/groovy/cz/siret/prank/test/Log4jCapture.groovy` for the log-capture helper)
|
||||
7. `csvFeatureDoesNotThrowOnCofactorAtoms` now picks a real polymer atom, writes a CSV covering its serial only, and asserts the polymer control reads 0.5 (proves the lookup path is live) before asserting the cofactor atom returns 0.0 without throwing.
|
||||
8. `conservationFeatureSafeOnCofactorAtoms` now injects a `ConservationScore` (via reflection on its package-private constructor) containing both a polymer score and a "poison" score keyed by the cofactor's residue number; asserts cofactor returns 0.0 (type-guard fired) and polymer returns its real score (lookup is live).
|
||||
9. `distantCofactorWarningRespectsThreshold` uses `Log4jCapture` to assert the WARN is actually emitted at `maxDist=0.001` (previously only checked "doesn't throw").
|
||||
10. `chainExcludedCofactorDiagnosticDoesNotThrow` now uses `@EnabledIf("has1AHP")` instead of silently returning - a missing test file no longer looks like a pass.
|
||||
11. New `aaMappingCollisionWarningEmitted` test verifies R19 detection logic with `Log4jCapture`.
|
||||
12. `benchmark/cofactors_dropin_safety.sh` now compares all `*.csv`, `*.pml`, `*.pdb`, and `*.cif` files (previously only `*_predictions.csv` and `*_residues.csv`); also runs with `-visualizations 1` so PyMOL output is included.
|
||||
13. New `CofactorAnalyzeTest.groovy` covers the building blocks of `cmdCofactors` (per-item resolution precedence, bracket-aware specifier parsing). The full `cmdCofactors` invocation is exercised end-to-end via `misc/test-scripts/testsets.sh`.
|
||||
19. New `concurrencySafeWithMultipleThreads` test loads the same structure on 8 parallel threads with independent `LoaderParams` and asserts identical cofactor atom counts.
|
||||
20. `DataTableCsvTest` gained 4 cases: newline-in-value, CR-in-value, null cell renders as empty, pre-quoted value gets escaped.
|
||||
21. `pymolRendererEmitsPerNameSelections` now regex-validates each `select X, BODY` line - empty bodies and malformed selections would now fail the test.
|
||||
|
||||
## Deviation from plan
|
||||
|
||||
**Cofactors are now filtered out at `Ligands.loadForProtein` (before `splitByPredicate`),
|
||||
not only inside `Ligands.isRelevantLigandGroup`.**
|
||||
|
||||
The plan called for a single guard at the top of `isRelevantLigandGroup`. That guard alone
|
||||
sends cofactors into `ignoredLigands` (because `splitByPredicate(ligandGroups, predicate)`
|
||||
puts predicate-false items in the "negative" bucket, which is then materialised as
|
||||
ignored ligands). The user-facing contract is "cofactors do not appear in any
|
||||
`*_predictions.csv` or `*_residues.csv` ligand listings" - including the ignored-ligands
|
||||
listing. So we added an upstream filter:
|
||||
|
||||
```groovy
|
||||
// Ligands.loadForProtein:
|
||||
List<Group> ligandGroups = Struct.getLigandGroups(protein)
|
||||
.findAll { !loaderParams.isCofactor(it) }
|
||||
```
|
||||
|
||||
The `isRelevantLigandGroup` guard is retained as defence-in-depth for direct callers.
|
||||
|
||||
## Benchmark Results
|
||||
|
||||
### Drop-in safety (R18, part a) - byte-equality on never-present specifier
|
||||
|
||||
```
|
||||
Date: 2026-05-13 (re-run after audit fixes A–G applied)
|
||||
Dataset: distro/test_data/concavity.ds (~22 structures)
|
||||
Seed: 42
|
||||
Threads: 1
|
||||
Command: ./benchmark/cofactors_dropin_safety.sh
|
||||
Result: PASS - predictions byte-identical between baseline and with -cofactors ZZZZ.
|
||||
```
|
||||
|
||||
Only `params.txt` and `run.log` differ (legitimate noise: `cofactors = []` vs
|
||||
`cofactors = [ZZZZ]`, plus outdir paths and timestamps). The script filters those.
|
||||
|
||||
### Effect benchmark (R18, part b) - informational, with a present HETATM
|
||||
|
||||
Not run yet. Recommended next step: pick a dataset where many structures contain a
|
||||
real cofactor (e.g. `MG` or a flavoenzyme set) and capture DCA top-1/top-3/top-5 deltas.
|
||||
Use the result to decide whether Mitigation A or B (plan §5.8) is justified.
|
||||
|
||||
### Manual smoke test
|
||||
|
||||
```
|
||||
Command: ./distro/prank predict -f distro/test_data/1AHP.pdb -cofactors PLP
|
||||
Result:
|
||||
[INFO] Cofactors to include as protein surface: [PLP]
|
||||
[INFO] Structure 1AHP.pdb: included 1 cofactor type(s) as protein surface (PLP: 30 atoms (2 instances))
|
||||
[INFO] protein atoms: 12502 (vs 12472 baseline - exactly +30, 2 × 15 heavy atoms)
|
||||
grep -c PLP 1AHP.pdb_predictions.csv → 0 (cofactor not in pocket prediction)
|
||||
```
|
||||
|
||||
## Known Limitations Shipped
|
||||
|
||||
1. `load_ligands_from_separate_files = true` + cofactor in primary file - cofactor still
|
||||
on surface but appears in `ignoredLigands` listing (cosmetic, R15). Rare combination;
|
||||
not exercised by tests.
|
||||
2. Chain-reduction "lost cofactors" diagnostic uses name-only matching - may over-report
|
||||
when a precise specifier wouldn't have matched anyway (R14). Documented in user doc.
|
||||
3. Case-sensitive group-name matching - by design, matches `ligands`-column behaviour.
|
||||
PDB/CIF files use uppercase residue names.
|
||||
4. AA-property feature dilution at SAS points near cofactors (R18). Drop-in safety
|
||||
confirmed; effect-benchmark deferred. Mitigations A/B pre-designed in plan §5.8.
|
||||
|
||||
## Planned Future Improvements
|
||||
|
||||
Ordered by likely user value. Each has a trigger that would justify the work.
|
||||
|
||||
1. ~~**`analyze cofactors`** subcommand~~ - **Shipped.** See `cmdCofactors` in `AnalyzeRoutine.groovy`.
|
||||
Outputs: `het_groups.csv`, `het_groups_summary.txt`; with `-cofactors` also `cofactor_matches.csv`.
|
||||
2. ~~**PyMOL teal-stick highlight**~~ - **Shipped.** `vis_highlight_cofactors` parameter (default `true`).
|
||||
New field `RenderingModel.cofactorResult`; `renderCofactors()` in both `NewPymolRenderer` and `PymolRenderer`.
|
||||
Five call sites updated to pass `cofactorResult` from `protein.cofactorExtractionResult`.
|
||||
3. **ChimeraX renderer support** - current cofactor highlight is PyMOL-only.
|
||||
*Trigger:* ChimeraX user community feedback.
|
||||
4. **`analyze cofactor-pockets`** - per-dataset with/without prediction delta.
|
||||
*Trigger:* user request, or benchmarking the feature on >1 release.
|
||||
5. **CSV column for cofactor metadata in prediction output** - would expose which atoms
|
||||
were on the surface in a structured form. *Trigger:* downstream-tool integration ask.
|
||||
6. **Mitigation A / B** (plan §5.8) - only if R18 effect benchmark shows score regression.
|
||||
7. **Per-cofactor weighting** - relative weight for cofactor atoms in feature aggregation.
|
||||
*Trigger:* if dilution turns out to need a finer-grained knob than mitigation A.
|
||||
8. **fpocket-rescore cofactor support** - fpocket has a hard-coded cofactor list. Plumbing
|
||||
this through would need either patching fpocket or pre-processing the input.
|
||||
*Trigger:* explicit user request for cofactor-aware rescoring.
|
||||
|
||||
## File Map
|
||||
|
||||
Production code (modified):
|
||||
```
|
||||
src/main/groovy/cz/siret/prank/program/params/Params.groovy - +cofactors, +cofactor_max_protein_dist, +vis_highlight_cofactors
|
||||
src/main/groovy/cz/siret/prank/domain/Dataset.groovy - +COLUMN_COFACTORS, +getEffectiveCofactorDefinitions, getLoaderParams
|
||||
src/main/groovy/cz/siret/prank/domain/loaders/LoaderParams.groovy - +cofactorHandler, +isCofactor(Group)
|
||||
src/main/groovy/cz/siret/prank/domain/Protein.groovy - cofactor block in loadStructure, +getCofactorExtractionResult
|
||||
src/main/groovy/cz/siret/prank/domain/Ligands.groovy - cofactor filter in loadForProtein, guard in isRelevantLigandGroup
|
||||
src/main/groovy/cz/siret/prank/program/Main.groovy - parseAndValidate + aa_mapping collision WARN
|
||||
src/main/groovy/cz/siret/prank/features/implementation/csv/CsvFileFeature.groovy - R17 cofactor bypass
|
||||
src/main/groovy/cz/siret/prank/program/visualization/RenderingModel.groovy - +cofactorResult field
|
||||
src/main/groovy/cz/siret/prank/program/visualization/renderers/NewPymolRenderer.groovy - +renderCofactors()
|
||||
src/main/groovy/cz/siret/prank/program/visualization/renderers/PymolRenderer.groovy - +renderCofactors()
|
||||
src/main/groovy/cz/siret/prank/program/routines/analyze/AnalyzeRoutine.groovy - +cmdCofactors() + 4 caller-site wirings of cofactorResult (3 existing routines + cmdCofactors itself)
|
||||
src/main/groovy/cz/siret/prank/program/routines/analyze/DataTable.groovy - RFC-4180 quoting in toCsv (audit #6)
|
||||
src/main/groovy/cz/siret/prank/program/routines/traineval/EvalResiduesRoutine.groovy - caller-site wiring of cofactorResult
|
||||
```
|
||||
|
||||
Production code (new):
|
||||
```
|
||||
src/main/groovy/cz/siret/prank/domain/CofactorHandler.groovy - handler + ExtractionResult + parseAndValidate
|
||||
```
|
||||
|
||||
Tests (new):
|
||||
```
|
||||
src/test/groovy/cz/siret/prank/domain/CofactorHandlerTest.groovy - parser, case norm, bracket-aware split, error wrapping
|
||||
src/test/groovy/cz/siret/prank/domain/CofactorIntegrationTest.groovy - 1AHP integration (PLP)
|
||||
src/test/groovy/cz/siret/prank/domain/CofactorPipelineTest.groovy - 1t7qa pipeline + R17/R20 regressions + Option-A PML test + audit-driven cases + R19 collision + concurrency
|
||||
src/test/groovy/cz/siret/prank/program/routines/analyze/DataTableCsvTest.groovy - RFC-4180 CSV quoting (audit #6)
|
||||
src/test/groovy/cz/siret/prank/program/routines/analyze/CofactorAnalyzeTest.groovy - building blocks of cmdCofactors (per-item resolution, bracket-aware parsing)
|
||||
src/test/groovy/cz/siret/prank/test/Log4jCapture.groovy - test-only log4j2 capture helper
|
||||
```
|
||||
|
||||
Test data (downloaded):
|
||||
```
|
||||
distro/test_data/1AHP.pdb - gated by @EnabledIf("has1AHP"); 1AHP contains PLP
|
||||
```
|
||||
|
||||
Scripts (new):
|
||||
```
|
||||
benchmark/cofactors_dropin_safety.sh - R18 drop-in safety benchmark
|
||||
```
|
||||
|
||||
Docs (new):
|
||||
```
|
||||
documentation/cofactors.md - user-facing
|
||||
documentation/dev/cofactors.md - this file
|
||||
```
|
||||
73
misc/dev/technical-debt.md
Normal file
73
misc/dev/technical-debt.md
Normal file
@@ -0,0 +1,73 @@
|
||||
# Technical Debt
|
||||
|
||||
Long-form notes on known suboptimal designs in this codebase that haven't been
|
||||
worth fixing yet. Each entry should state the issue, why it matters, the current
|
||||
workaround (if any), what a proper fix would look like, and the trigger that would
|
||||
justify the work.
|
||||
|
||||
---
|
||||
|
||||
## CLI list-param parsing: `Sutils.parseList` is not bracket-aware
|
||||
|
||||
**Where:** `src/main/groovy/cz/siret/prank/utils/Sutils.groovy` (`parseList`, `split`).
|
||||
|
||||
**Issue.** `Params.updateFromCommandLine` routes every `List<String>` CLI argument
|
||||
through `Sutils.parseList`, which falls back to `Sutils.split(str, ",")` - a Guava
|
||||
splitter on the literal comma character. The splitter has no awareness of bracketed
|
||||
content, so a value like
|
||||
|
||||
```
|
||||
-cofactors 'FAD[contact_res_ids:A_D246,A_T259,A_E423]'
|
||||
```
|
||||
|
||||
gets shredded into three list elements:
|
||||
|
||||
```
|
||||
["FAD[contact_res_ids:A_D246", "A_T259", "A_E423]"]
|
||||
```
|
||||
|
||||
This is a real footgun for any list-typed parameter whose elements can contain
|
||||
commas inside brackets/parens - historically only the `ligands` *column* (parsed
|
||||
separately via `Sutils.splitRespectInnerParentheses` in `Dataset.parseLigandsColumn`),
|
||||
and now `cofactors` (Issue #79 part 2).
|
||||
|
||||
**Why it matters.**
|
||||
|
||||
- Silent corruption: the user sees no error; their specifier just fails to match.
|
||||
- The over-split values often pass `LigandDefinition.parse` because `partBetween`
|
||||
falls back to `str.length()` when no closing `]` is found - producing nonsense
|
||||
definitions that match nothing.
|
||||
- Any future list-typed param that wants to support bracketed structure (precise
|
||||
identifiers, contact-res lists, nested expressions) will hit the same bug.
|
||||
|
||||
**Current workaround.** `CofactorHandler.parseAndValidate(List<String>)` is tolerant
|
||||
of upstream damage: it re-joins the elements with `,`, then re-splits with
|
||||
`Sutils.splitRespectInnerParentheses(str, ',', '[', ']')`. So cofactors specifically
|
||||
recover. The same `parseAndValidate(String)` overload is used for the dataset
|
||||
column, where `Dataset.resolveCofactorDefinitions(Item)` passes the raw column
|
||||
string straight in - no naive split happens there.
|
||||
|
||||
Workaround is documented in `documentation/dev/cofactors.md` (R22). Tests pin the
|
||||
behaviour: `CofactorHandlerTest.contactResIdsCommasArePreservedFromList`.
|
||||
|
||||
**Proper fix (sketch).**
|
||||
|
||||
Two viable approaches:
|
||||
|
||||
1. **Make `Sutils.parseList` bracket-aware globally.** Add a `splitter` parameter
|
||||
or detect brackets and switch to `splitRespectInnerParentheses` when the input
|
||||
contains them. Risk: any existing list-typed param whose values happen to
|
||||
contain brackets without intending them as scope markers (e.g. a string with
|
||||
`[debug]` prefix) would now parse differently. Audit needed.
|
||||
|
||||
2. **Per-param parsing registry.** Annotate (or table-driven) which params want
|
||||
bracket-aware splitting. Touch only `Params.updateFromCommandLine`. Lower
|
||||
blast radius, more boilerplate.
|
||||
|
||||
Either fix would let downstream callers (like `CofactorHandler.parseAndValidate`)
|
||||
drop their defensive re-join/re-split. The "centralized recovery" is functional
|
||||
but inelegant: it asks downstream layers to undo damage done upstream.
|
||||
|
||||
**Trigger.** Add a third list-typed param with bracketed structure, OR a user
|
||||
reports unexpected behaviour with `-cofactors '...[contact_res_ids:...]'` despite
|
||||
the recovery (e.g., the recovery itself has a corner case we missed).
|
||||
@@ -113,6 +113,10 @@ quick() {
|
||||
# test export-points command (no model)
|
||||
test ./prank.sh export-points -f distro/test_data/1fbl.pdb -export_points_format csv.gz -out_subdir TEST/TESTS
|
||||
|
||||
# cofactors feature smoke test
|
||||
test ./prank.sh predict -f distro/test_data/liganated/1t7qa.pdb -cofactors COA -out_subdir TEST/TESTS
|
||||
test ./prank.sh analyze cofactors -f distro/test_data/liganated/1t7qa.pdb -out_subdir TEST/TESTS
|
||||
|
||||
}
|
||||
|
||||
quick_train() {
|
||||
@@ -368,10 +372,47 @@ analyze() {
|
||||
test ./prank.sh analyze chains holo4k.ds -c config/train-default -cache_datasets 0 -out_subdir TEST/ANALYZE
|
||||
test ./prank.sh analyze chains-residues holo4k.ds -c config/train-default -cache_datasets 0 -out_subdir TEST/ANALYZE
|
||||
|
||||
test ./prank.sh analyze cofactors joined.ds -c config/train-default -cache_datasets 0 -out_subdir TEST/ANALYZE
|
||||
test ./prank.sh analyze cofactors holo4k.ds -c config/train-default -cache_datasets 0 -out_subdir TEST/ANALYZE
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
cofactors() {
|
||||
|
||||
title COFACTORS FEATURE
|
||||
|
||||
# analyze cofactors: survey mode (no -cofactors) on file + dataset
|
||||
test ./prank.sh analyze cofactors -f distro/test_data/liganated/1t7qa.pdb -out_subdir TEST/COFACTORS
|
||||
test ./prank.sh analyze cofactors test.ds -out_subdir TEST/COFACTORS
|
||||
|
||||
# analyze cofactors: dry-run mode with each specifier form
|
||||
test ./prank.sh analyze cofactors -f distro/test_data/liganated/1t7qa.pdb -cofactors COA -out_subdir TEST/COFACTORS
|
||||
test ./prank.sh analyze cofactors -f distro/test_data/liganated/1t7qa.pdb -cofactors 'COA[atom_id:9551]' -out_subdir TEST/COFACTORS
|
||||
test ./prank.sh analyze cofactors -f distro/test_data/liganated/1t7qa.pdb -cofactors 'COA[contact_res_ids:A_K258]' -out_subdir TEST/COFACTORS
|
||||
test ./prank.sh analyze cofactors -f distro/test_data/liganated/1t7qa.pdb -cofactors ZZZZ -out_subdir TEST/COFACTORS
|
||||
|
||||
# predict with cofactors
|
||||
test ./prank.sh predict -f distro/test_data/liganated/1t7qa.pdb -cofactors COA -out_subdir TEST/COFACTORS
|
||||
# R18: never-present specifier must be a no-op (drop-in safety)
|
||||
test ./prank.sh predict -f distro/test_data/1fbl.pdb -cofactors ZZZZ -out_subdir TEST/COFACTORS
|
||||
# R22: case-mismatched name must still match (parser auto-uppercases the group name)
|
||||
test ./prank.sh predict -f distro/test_data/liganated/1t7qa.pdb -cofactors coa -out_subdir TEST/COFACTORS
|
||||
# R22: contact_res_ids must survive comma-splitting (bracket-aware parse)
|
||||
test ./prank.sh predict -f distro/test_data/liganated/1t7qa.pdb -cofactors 'COA[contact_res_ids:A_K258,A_D246]' -out_subdir TEST/COFACTORS
|
||||
|
||||
# knobs
|
||||
test ./prank.sh predict -f distro/test_data/liganated/1t7qa.pdb -cofactors COA -cofactor_max_protein_dist 0 -out_subdir TEST/COFACTORS
|
||||
test ./prank.sh predict -f distro/test_data/liganated/1t7qa.pdb -cofactors COA -vis_highlight_cofactors 0 -out_subdir TEST/COFACTORS
|
||||
|
||||
# R19: aa_mapping collision warning (MSE is in built-in minimal mapping)
|
||||
test ./prank.sh predict -f distro/test_data/1fbl.pdb -cofactors MSE -out_subdir TEST/COFACTORS
|
||||
|
||||
# drop-in safety benchmark
|
||||
test ./benchmark/cofactors_dropin_safety.sh distro/test_data/concavity.ds
|
||||
}
|
||||
|
||||
transform() {
|
||||
|
||||
title TRANSFORM COMMANDS
|
||||
@@ -593,6 +634,7 @@ eval_train_all() {
|
||||
tests() {
|
||||
quick
|
||||
basic
|
||||
cofactors
|
||||
}
|
||||
|
||||
all() {
|
||||
|
||||
336
src/main/groovy/cz/siret/prank/domain/CofactorHandler.groovy
Normal file
336
src/main/groovy/cz/siret/prank/domain/CofactorHandler.groovy
Normal file
@@ -0,0 +1,336 @@
|
||||
package cz.siret.prank.domain
|
||||
|
||||
import cz.siret.prank.geom.Atoms
|
||||
import cz.siret.prank.geom.Struct
|
||||
import cz.siret.prank.program.PrankException
|
||||
import cz.siret.prank.utils.Formatter
|
||||
import cz.siret.prank.utils.Sutils
|
||||
import groovy.transform.CompileStatic
|
||||
import groovy.util.logging.Slf4j
|
||||
import org.biojava.nbio.structure.Atom
|
||||
import org.biojava.nbio.structure.Group
|
||||
import org.biojava.nbio.structure.Structure
|
||||
|
||||
import static cz.siret.prank.domain.Dataset.LigandDefinition
|
||||
import static cz.siret.prank.geom.Struct.getAuthorId
|
||||
|
||||
/**
|
||||
* Identifies and extracts cofactor atoms that should be included in the protein surface.
|
||||
*
|
||||
* Cofactor specifiers reuse {@link Dataset.LigandDefinition} - the same syntax used by the
|
||||
* dataset 'ligands' column. A specifier can be a bare residue name ("FAD") or a precise
|
||||
* identifier ("FAD[group_id:A_500]", "FAD[atom_id:12345]", "FAD[contact_res_ids:...]").
|
||||
*
|
||||
* Per-structure instance. The only mutable state is the {@code matchedGroups} identity set,
|
||||
* which is reset and repopulated by {@link #extractCofactorAtoms}. Re-running extraction (e.g.
|
||||
* via {@code Protein.transformedCopy}, which reuses the same {@code LoaderParams} on a
|
||||
* deep-copied structure) is safe and discards prior group references. Since each
|
||||
* {@code Protein.loadStructure()} owns its own handler, there is no concurrent writer.
|
||||
*
|
||||
* {@code LoaderParams.isCofactor(Group)} delegates to this class.
|
||||
*/
|
||||
@Slf4j
|
||||
@CompileStatic
|
||||
class CofactorHandler {
|
||||
|
||||
/** Key for storing ExtractionResult in Protein.secondaryData */
|
||||
static final String EXTRACTION_RESULT_KEY = "cofactor.extractionResult"
|
||||
|
||||
/** Parsed cofactor specifiers. Immutable. */
|
||||
final List<LigandDefinition> definitions
|
||||
|
||||
/**
|
||||
* Groups in the loaded structure that matched at least one definition.
|
||||
* Populated during {@link #extractCofactorAtoms}; used by {@link #isCofactor} for O(1)
|
||||
* exclusion checks in {@code Ligands.isRelevantLigandGroup}.
|
||||
*
|
||||
* Identity-based (BioJava Group references), not name-based - this is what makes precise
|
||||
* specifiers like {@code FAD[group_id:A_500]} work correctly when the structure contains
|
||||
* multiple FADs and only one of them is a cofactor.
|
||||
*/
|
||||
private final Set<Group> matchedGroups = Collections.newSetFromMap(new IdentityHashMap<>())
|
||||
|
||||
/**
|
||||
* Result of cofactor extraction from a structure.
|
||||
* Returned by {@link #extractCofactorAtoms} to avoid double-scanning the structure.
|
||||
*/
|
||||
@CompileStatic
|
||||
static class ExtractionResult {
|
||||
/** Cofactor heavy atoms ready to add to proteinAtoms. */
|
||||
final Atoms atoms
|
||||
/** Map of cofactor name → matched groups (for logging). */
|
||||
final Map<String, List<Group>> foundGroups
|
||||
/** Specifiers (by originalString) that matched zero groups. */
|
||||
final List<String> unmatchedSpecifiers
|
||||
|
||||
ExtractionResult(Atoms atoms, Map<String, List<Group>> foundGroups, List<String> unmatchedSpecifiers) {
|
||||
this.atoms = atoms
|
||||
this.foundGroups = foundGroups
|
||||
this.unmatchedSpecifiers = unmatchedSpecifiers
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Create handler from parsed cofactor definitions.
|
||||
*
|
||||
* @param definitions parsed specifiers (e.g. via {@link #parseAndValidate})
|
||||
*/
|
||||
CofactorHandler(List<LigandDefinition> definitions) {
|
||||
this.definitions = (List<LigandDefinition>) ((definitions ?: []) as List<LigandDefinition>).asImmutable()
|
||||
}
|
||||
|
||||
/** @return true if any cofactor specifiers are configured */
|
||||
boolean isEnabled() {
|
||||
return !definitions.isEmpty()
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if a group was matched as a cofactor during extraction.
|
||||
* Returns false until {@link #extractCofactorAtoms} has been called.
|
||||
*/
|
||||
boolean isCofactor(Group group) {
|
||||
if (group == null) return false
|
||||
return matchedGroups.contains(group)
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract cofactor atoms and metadata from the structure in a single scan.
|
||||
*
|
||||
* For each HETATM group in the structure, ask each definition if it matches (via
|
||||
* {@code LigandDefinition.matchesGroup}). Matched groups are remembered for {@link
|
||||
* #isCofactor} and their heavy atoms collected for the surface.
|
||||
*
|
||||
* @param protein the Protein under construction. {@code matchesGroup} with
|
||||
* {@code contact_res_ids} consults {@code protein.residues} and
|
||||
* {@code protein.proteinAtoms}, both of which must already be set.
|
||||
*/
|
||||
ExtractionResult extractCofactorAtoms(Protein protein) {
|
||||
// Reset state - extraction may be re-run on a deep-copied structure (Protein.transformedCopy
|
||||
// reuses the same LoaderParams/CofactorHandler). Stale Group references from a prior structure
|
||||
// would otherwise persist in the identity set.
|
||||
matchedGroups.clear()
|
||||
|
||||
Map<String, List<Group>> foundGroups = new LinkedHashMap<>()
|
||||
Set<String> matchedSpecifiers = new LinkedHashSet<>()
|
||||
List<Atom> atoms = new ArrayList<>()
|
||||
|
||||
// Use the same candidate set as the ligand loader (HETATM ∪ non-polymer-chain groups,
|
||||
// minus covalently-bound residue HETs and water). Aligning with Ligands.loadForProtein
|
||||
// ensures GDP/GTP/ATP/SHR-style groups - which BioJava classifies as NUCLEOTIDE/AMINOACID
|
||||
// rather than HETATM - are reachable as cofactors.
|
||||
List<Group> candidateGroups = Struct.getLigandGroups(protein)
|
||||
|
||||
for (Group g : candidateGroups) {
|
||||
boolean groupIsCofactor = false
|
||||
for (LigandDefinition d : definitions) {
|
||||
if (d.matchesGroup(g, protein)) {
|
||||
groupIsCofactor = true
|
||||
matchedSpecifiers.add(d.originalString)
|
||||
}
|
||||
}
|
||||
if (groupIsCofactor) {
|
||||
matchedGroups.add(g)
|
||||
// PDBName is non-null here: LigandDefinition.matchesGroup requires
|
||||
// groupName == group.getPDBName(), and groupName from parseAndValidate
|
||||
// is always non-null.
|
||||
String name = g.PDBName.toUpperCase()
|
||||
foundGroups.computeIfAbsent(name, { new ArrayList<>() }).add(g)
|
||||
atoms.addAll(Atoms.allFromGroup(g).withoutHydrogens().list)
|
||||
}
|
||||
}
|
||||
|
||||
// A specifier is "unmatched" if no group in the structure satisfied it.
|
||||
List<String> unmatched = definitions
|
||||
.collect { it.originalString }
|
||||
.findAll { !matchedSpecifiers.contains(it) }
|
||||
|
||||
return new ExtractionResult(new Atoms(atoms), foundGroups, unmatched)
|
||||
}
|
||||
|
||||
/**
|
||||
* Log summary of extraction result.
|
||||
*
|
||||
* Levels:
|
||||
* - INFO: per-structure summary of matched cofactors
|
||||
* - DEBUG: unmatched specifiers + available HETATM list (prevents log flooding on large datasets)
|
||||
* - DEBUG: per-instance details (chain, residue number, atom count)
|
||||
*/
|
||||
void logResult(ExtractionResult result, String structureName, Structure structure) {
|
||||
if (!isEnabled()) return
|
||||
|
||||
if (!result.unmatchedSpecifiers.isEmpty() && log.isDebugEnabled()) {
|
||||
List<String> availableHet = Struct.getHetGroups(structure)
|
||||
.collect { it.PDBName }
|
||||
.findAll { it }
|
||||
.unique()
|
||||
.sort()
|
||||
String availableStr = availableHet.size() > 10
|
||||
? availableHet.take(10).join(", ") + "..."
|
||||
: availableHet.join(", ")
|
||||
log.debug "Structure {}: cofactor specifier(s) {} matched no groups. Available HETATM groups: [{}]",
|
||||
structureName, result.unmatchedSpecifiers, availableStr
|
||||
}
|
||||
|
||||
if (!result.foundGroups.isEmpty()) {
|
||||
List<String> parts = new ArrayList<>()
|
||||
for (String name : result.foundGroups.keySet()) {
|
||||
List<Group> groups = result.foundGroups.get(name)
|
||||
int atomCount = (int) groups.sum { Atoms.allFromGroup((Group) it).withoutHydrogens().count }
|
||||
int instanceCount = groups.size()
|
||||
|
||||
for (Group g : groups) {
|
||||
String chainId = getAuthorId(g.chain)
|
||||
String resNum = g.residueNumber?.printFull() ?: "?"
|
||||
int groupAtomCount = Atoms.allFromGroup(g).withoutHydrogens().count
|
||||
log.debug "Including cofactor {} (chain {}, residue {}, {} heavy atoms)",
|
||||
name, chainId, resNum, groupAtomCount
|
||||
}
|
||||
|
||||
if (instanceCount > 1) {
|
||||
parts.add("${name}: ${atomCount} atoms (${instanceCount} instances)".toString())
|
||||
} else {
|
||||
parts.add("${name}: ${atomCount} atoms".toString())
|
||||
}
|
||||
}
|
||||
|
||||
log.info "Structure {}: included {} cofactor type(s) as protein surface ({})",
|
||||
structureName, result.foundGroups.size(), parts.join(', ')
|
||||
}
|
||||
}
|
||||
|
||||
// ===== Post-Extraction Checks =====
|
||||
|
||||
/**
|
||||
* Log WARN messages for cofactor groups whose center of mass is far from the protein
|
||||
* surface. Distant cofactor may be a crystallization artifact or a free cofactor in
|
||||
* solvent. Advisory only - the cofactor stays in the surface regardless.
|
||||
*/
|
||||
void warnDistantCofactors(ExtractionResult result, Atoms proteinAtoms,
|
||||
double maxDist, String structureName) {
|
||||
if (maxDist <= 0 || result.foundGroups.isEmpty() || proteinAtoms.empty) return
|
||||
|
||||
for (Map.Entry<String, List<Group>> e : result.foundGroups.entrySet()) {
|
||||
String name = e.key
|
||||
for (Group g : e.value) {
|
||||
Atoms groupAtoms = Atoms.allFromGroup(g).withoutHydrogens()
|
||||
if (groupAtoms.empty) continue
|
||||
|
||||
double dist = proteinAtoms.dist(groupAtoms.centerOfMass)
|
||||
if (dist > maxDist) {
|
||||
String chainId = getAuthorId(g.chain)
|
||||
String resNum = g.residueNumber?.printFull() ?: "?"
|
||||
log.warn "Structure {}: cofactor {} (chain {}, residue {}) " +
|
||||
"is {} Å from nearest protein atom (threshold: {} Å) - " +
|
||||
"may be a crystallization artifact",
|
||||
structureName, name, chainId, resNum, Formatter.format(dist, 1), maxDist
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Detect cofactors that exist in the full structure but were lost during chain reduction.
|
||||
* Called only when {@code onlyChains} is non-null.
|
||||
*
|
||||
* This diagnostic matches by groupName only - running the full {@code
|
||||
* LigandDefinition.matchesGroup} on the unreduced structure would need an alternate
|
||||
* {@code Protein} context (residues/atoms of the full structure), which we don't build.
|
||||
* The name-only check can over-report when a precise specifier wouldn't have matched the
|
||||
* chain-excluded group anyway; an acceptable false positive for an advisory log line.
|
||||
*/
|
||||
void warnChainExcludedCofactors(Structure fullStructure, ExtractionResult reducedResult,
|
||||
String structureName) {
|
||||
Set<String> wantedNames = (Set<String>) definitions
|
||||
.collect { it.groupName?.toUpperCase() }
|
||||
.findAll { it as boolean }
|
||||
.toSet()
|
||||
|
||||
Set<String> namesInFull = new LinkedHashSet<>()
|
||||
for (Group g : Struct.getGroups(fullStructure)) {
|
||||
if (!Struct.isLigandCandidateNonWaterGroup(g)) continue
|
||||
String n = g.PDBName?.toUpperCase()
|
||||
if (n != null && wantedNames.contains(n)) namesInFull.add(n)
|
||||
}
|
||||
|
||||
Set<String> namesMatchedInReduced = reducedResult.foundGroups.keySet()
|
||||
|
||||
Set<String> excluded = new LinkedHashSet<>(namesInFull)
|
||||
excluded.removeAll(namesMatchedInReduced)
|
||||
|
||||
if (!excluded.isEmpty()) {
|
||||
log.warn "Structure {}: cofactor(s) {} found in full structure but not in reduced " +
|
||||
"structure - they may exist only on excluded chains",
|
||||
structureName, excluded
|
||||
}
|
||||
}
|
||||
|
||||
// ===== Static Utility Methods =====
|
||||
|
||||
/**
|
||||
* Parse and validate cofactor specifiers from a list of raw strings.
|
||||
*
|
||||
* The list is re-joined with commas and re-split using a bracket-aware splitter
|
||||
* before parsing. This recovers from naive comma-split that may have happened
|
||||
* upstream (e.g., {@code Sutils.parseList} from the CLI, or {@code CHAIN_SPLITTER}
|
||||
* from a dataset column), so a specifier like
|
||||
* {@code FAD[contact_res_ids:A_D246,A_T259,A_E423]} that was over-split into
|
||||
* {@code ["FAD[contact_res_ids:A_D246", "A_T259", "A_E423]"]} is correctly
|
||||
* reassembled into a single specifier.
|
||||
*
|
||||
* Each specifier is parsed via {@link #parseOne}, which:
|
||||
* - case-normalizes the group name (so {@code "fad"} → {@code "FAD"} matches BioJava)
|
||||
* - wraps {@code PrankException} from {@code LigandDefinition.parse} with cofactor-
|
||||
* specific context, so the error message identifies the offending specifier
|
||||
* rather than mentioning "dataset file" (which may not be the source)
|
||||
*
|
||||
* Blank entries are silently dropped (e.g., trailing commas).
|
||||
*
|
||||
* @return parsed (and therefore validated) definitions; never null
|
||||
*/
|
||||
static List<LigandDefinition> parseAndValidate(List<String> rawSpecifiers) {
|
||||
if (rawSpecifiers == null || rawSpecifiers.isEmpty()) return []
|
||||
String joined = rawSpecifiers
|
||||
.findAll { it != null && !((String) it).trim().isEmpty() }
|
||||
.collect { ((String) it).trim() }
|
||||
.join(",")
|
||||
return parseAndValidate(joined)
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse and validate cofactor specifiers from a single comma-separated string,
|
||||
* e.g. a raw dataset column value. Splits with bracket awareness so that commas
|
||||
* inside {@code [...]} are preserved.
|
||||
*/
|
||||
static List<LigandDefinition> parseAndValidate(String rawCommaSeparated) {
|
||||
if (rawCommaSeparated == null || rawCommaSeparated.trim().isEmpty()) return []
|
||||
List<String> parts = Sutils.splitRespectInnerParentheses(
|
||||
rawCommaSeparated, ',' as char, '[' as char, ']' as char)
|
||||
List<LigandDefinition> out = new ArrayList<>(parts.size())
|
||||
for (String s : parts) {
|
||||
String trimmed = s?.trim()
|
||||
if (trimmed == null || trimmed.isEmpty()) continue
|
||||
out.add(parseOne(trimmed))
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse a single specifier with case normalization and cofactor-specific error context.
|
||||
*
|
||||
* The group-name portion (everything before the first {@code [}) is upper-cased so
|
||||
* that a user-supplied {@code "fad"} matches BioJava's uppercase PDB names. The
|
||||
* bracketed specifier portion is preserved as-is (chain IDs and contact-residue codes
|
||||
* are case-significant for group_id/contact_res_ids matching).
|
||||
*/
|
||||
private static LigandDefinition parseOne(String raw) {
|
||||
int bracket = raw.indexOf('[')
|
||||
String normalized = (bracket < 0)
|
||||
? raw.trim().toUpperCase()
|
||||
: raw.substring(0, bracket).trim().toUpperCase() + raw.substring(bracket)
|
||||
try {
|
||||
return LigandDefinition.parse(normalized)
|
||||
} catch (PrankException e) {
|
||||
throw new PrankException("Invalid cofactor specifier '${raw}': ${e.message}", e)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -73,6 +73,7 @@ class Dataset implements Parametrized, Writable, Failable {
|
||||
static final String COLUMN_APO_PROTEIN = "apo_protein"
|
||||
static final String COLUMN_APO_CHAINS = "apo_chains"
|
||||
static final String COLUMN_POSITIVE_RESIDUES = "positive_residues"
|
||||
static final String COLUMN_COFACTORS = "cofactors"
|
||||
|
||||
static final List<String> DEFAULT_HEADER = [ COLUMN_PROTEIN ]
|
||||
|
||||
@@ -303,9 +304,38 @@ class Dataset implements Parametrized, Writable, Failable {
|
||||
}
|
||||
lp.relevantLigandsDefined = hasExplicitlyDefinedLigands()
|
||||
lp.relevantLigandDefinitions = item.getLigandDefinitions()
|
||||
|
||||
// Cofactor handler: per-structure column value overrides global Params.cofactors
|
||||
List<LigandDefinition> effectiveDefs = resolveCofactorDefinitions(item)
|
||||
if (!effectiveDefs.isEmpty()) {
|
||||
lp.cofactorHandler = new CofactorHandler(effectiveDefs)
|
||||
}
|
||||
|
||||
return lp
|
||||
}
|
||||
|
||||
/**
|
||||
* Resolve effective cofactor definitions for an item.
|
||||
*
|
||||
* Per-structure column value overrides global {@code Params.cofactors}. Parsing is
|
||||
* delegated to {@link CofactorHandler#parseAndValidate}, which handles bracket-aware
|
||||
* splitting (so {@code contact_res_ids:A,B} stays one specifier), case normalization
|
||||
* of the group name, and cofactor-specific error wrapping. Invalid specifiers throw
|
||||
* PrankException with the offending input.
|
||||
*
|
||||
* Package-private so {@code AnalyzeRoutine.cmdCofactors} can use the same per-item
|
||||
* resolution for its dry-run mode.
|
||||
*
|
||||
* @return parsed cofactor definitions; never null, empty when no cofactors configured
|
||||
*/
|
||||
List<LigandDefinition> resolveCofactorDefinitions(Item item) {
|
||||
String columnValue = item.columnValues?.get(COLUMN_COFACTORS)
|
||||
if (columnValue != null && !columnValue.trim().isEmpty()) {
|
||||
return CofactorHandler.parseAndValidate(columnValue)
|
||||
}
|
||||
return CofactorHandler.parseAndValidate(Params.inst.cofactors)
|
||||
}
|
||||
|
||||
/**
|
||||
* Get instance of prediction loader.
|
||||
* @return null if prediction method is not specified in the dataset
|
||||
|
||||
@@ -89,7 +89,11 @@ class Ligands implements Parametrized, Writable, Failable {
|
||||
relevantLigands = ligands
|
||||
|
||||
} else {
|
||||
// Filter cofactors out of the candidate list entirely - they are not ligands of
|
||||
// any kind (relevant OR ignored). See Issue #79 part 2; the guard in
|
||||
// isRelevantLigandGroup() below is defence-in-depth for direct callers.
|
||||
List<Group> ligandGroups = Struct.getLigandGroups(protein)
|
||||
.findAll { !loaderParams.isCofactor(it) }
|
||||
|
||||
if (loaderParams.relevantLigandsDefined) {
|
||||
log.info "Relevant ligands are explicitly defined in the dataset: " + loaderParams.relevantLigandDefinitions
|
||||
@@ -170,7 +174,7 @@ class Ligands implements Parametrized, Writable, Failable {
|
||||
log.info "Loading ligand from separate file: [{}]", ligFile.name
|
||||
try {
|
||||
Structure ligStructure = PdbUtils.loadFromFile(ligFile.absolutePath)
|
||||
// Extract all non-water groups — these are dedicated ligand files, so all groups are ligand atoms
|
||||
// Extract all non-water groups - these are dedicated ligand files, so all groups are ligand atoms
|
||||
List<Group> groups = Struct.getGroups(ligStructure).findAll { Group g -> !g.isWater() }
|
||||
if (groups.empty) {
|
||||
log.warn "No non-water groups found in ligand file [{}], skipping", ligFile.name
|
||||
@@ -203,7 +207,25 @@ class Ligands implements Parametrized, Writable, Failable {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Determines if a HETATM group should be treated as a relevant ligand.
|
||||
*
|
||||
* Exclusion order:
|
||||
* 1. Cofactors (always excluded - they're part of protein surface)
|
||||
* 2. Explicitly defined ligands (if relevantLigandsDefined)
|
||||
* 3. Ignored het groups (from ignore_het_groups parameter)
|
||||
*
|
||||
* Cofactors take precedence over explicit ligand definitions: if a group is matched
|
||||
* by both a cofactor specifier and a ligand definition, it is excluded from ligand
|
||||
* detection (cofactors describe surface, never targets).
|
||||
*/
|
||||
private static boolean isRelevantLigandGroup(Group group, Protein protein, LoaderParams loaderParams) {
|
||||
// Cofactor check first - O(1) identity lookup against the set of groups matched
|
||||
// during CofactorHandler.extractCofactorAtoms() in Protein.loadStructure().
|
||||
if (loaderParams.isCofactor(group)) {
|
||||
return false
|
||||
}
|
||||
|
||||
if (loaderParams.relevantLigandsDefined) {
|
||||
for (Dataset.LigandDefinition ligDef : loaderParams.relevantLigandDefinitions) {
|
||||
if (ligDef.matchesGroup(group, protein)) {
|
||||
|
||||
@@ -258,6 +258,17 @@ class Protein implements Parametrized {
|
||||
return (score==null) ? null : score.toDoubleLabeling(this)
|
||||
}
|
||||
|
||||
/**
|
||||
* Get cofactor extraction result, if cofactors were configured for this protein.
|
||||
* Stored during {@code loadStructure()} via secondaryData.
|
||||
*
|
||||
* @return ExtractionResult or null if no cofactors configured
|
||||
*/
|
||||
@Nullable
|
||||
CofactorHandler.ExtractionResult getCofactorExtractionResult() {
|
||||
(CofactorHandler.ExtractionResult) secondaryData.get(CofactorHandler.EXTRACTION_RESULT_KEY)
|
||||
}
|
||||
|
||||
//===========================================================================================================//
|
||||
|
||||
List<Ligand> getRelevantLigands() {
|
||||
@@ -557,6 +568,29 @@ class Protein implements Parametrized {
|
||||
allAtoms = Atoms.allFromStructure(structure).withIndex()
|
||||
proteinAtoms = Atoms.allFromGroups(residues*.group).withoutHydrogens()
|
||||
|
||||
// Include cofactor atoms in the protein surface (Issue #79 part 2).
|
||||
// residues and proteinAtoms must be set before this - contact_res_ids specifier
|
||||
// matching consults both.
|
||||
CofactorHandler cofactorHandler = loaderParams?.cofactorHandler
|
||||
if (cofactorHandler != null && cofactorHandler.isEnabled()) {
|
||||
CofactorHandler.ExtractionResult cfResult = cofactorHandler.extractCofactorAtoms(this)
|
||||
|
||||
if (params.cofactor_max_protein_dist > 0) {
|
||||
cofactorHandler.warnDistantCofactors(cfResult, proteinAtoms,
|
||||
params.cofactor_max_protein_dist, name)
|
||||
}
|
||||
if (onlyChains != null && fullStructure != null) {
|
||||
cofactorHandler.warnChainExcludedCofactors(fullStructure, cfResult, name)
|
||||
}
|
||||
|
||||
if (!cfResult.atoms.empty) {
|
||||
proteinAtoms = Atoms.join([proteinAtoms, cfResult.atoms])
|
||||
}
|
||||
cofactorHandler.logResult(cfResult, name, structure)
|
||||
|
||||
secondaryData.put(CofactorHandler.EXTRACTION_RESULT_KEY, cfResult)
|
||||
}
|
||||
|
||||
log.info "structure atoms: $allAtoms.count"
|
||||
log.info "protein atoms: $proteinAtoms.count"
|
||||
|
||||
|
||||
@@ -1,9 +1,13 @@
|
||||
package cz.siret.prank.domain.loaders
|
||||
|
||||
import cz.siret.prank.domain.CofactorHandler
|
||||
import cz.siret.prank.domain.Dataset
|
||||
import cz.siret.prank.program.params.Params
|
||||
import groovy.transform.CompileStatic
|
||||
import groovy.transform.TupleConstructor
|
||||
import org.biojava.nbio.structure.Group
|
||||
|
||||
import javax.annotation.Nullable
|
||||
|
||||
/**
|
||||
* Protein file loader parameters
|
||||
@@ -32,6 +36,27 @@ class LoaderParams {
|
||||
return ignoredHetGroups
|
||||
}
|
||||
|
||||
/**
|
||||
* Handler for cofactor processing and lookup. Populated from {@code Params.cofactors} or
|
||||
* per-structure dataset column. Null when no cofactors are configured (default behaviour).
|
||||
*
|
||||
* Single source of truth for cofactor state - eliminates state duplication between
|
||||
* LoaderParams and CofactorHandler.
|
||||
*/
|
||||
@Nullable
|
||||
CofactorHandler cofactorHandler = null
|
||||
|
||||
/**
|
||||
* Check if a group is a configured cofactor. Delegates to {@link CofactorHandler#isCofactor},
|
||||
* which is an O(1) identity lookup against the set of groups matched during
|
||||
* {@code CofactorHandler.extractCofactorAtoms}.
|
||||
*
|
||||
* Thread-safe: each LoaderParams instance has its own handler.
|
||||
*/
|
||||
boolean isCofactor(Group group) {
|
||||
return cofactorHandler != null && cofactorHandler.isCofactor(group)
|
||||
}
|
||||
|
||||
// LoaderParams(LoaderParams lp) {
|
||||
// this.ignoreLigands = lp.ignoreLigands
|
||||
// this.ligandsSeparatedByTER = lp.ligandsSeparatedByTER
|
||||
|
||||
@@ -61,6 +61,13 @@ class CsvFileFeature extends AtomFeatureCalculator implements Parametrized {
|
||||
|
||||
@Override
|
||||
double[] calculateForAtom(Atom proteinSurfaceAtom, AtomFeatureCalculationContext context) {
|
||||
// R17: cofactor atoms aren't expected to appear in user-provided CSVs. Without this
|
||||
// short-circuit, the strict-by-default CSV lookup throws PrankException on every
|
||||
// cofactor atom when -cofactors is set. Strict-mode behaviour for polymer atoms is
|
||||
// preserved.
|
||||
if (context.protein.loaderParams?.isCofactor(proteinSurfaceAtom.group)) {
|
||||
return new double[header.size()]
|
||||
}
|
||||
CsvFileFeatureValues feature = getValuesForProtein(context.protein)
|
||||
return feature.getValues(proteinSurfaceAtom, header)
|
||||
}
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
package cz.siret.prank.program
|
||||
|
||||
import cz.siret.prank.domain.AminoAcidMapper
|
||||
import cz.siret.prank.domain.CofactorHandler
|
||||
import cz.siret.prank.domain.Dataset
|
||||
import cz.siret.prank.domain.loaders.LoaderParams
|
||||
import cz.siret.prank.features.implementation.conservation.provider.ConservationProviderFactory
|
||||
@@ -137,6 +138,38 @@ class Main implements Parametrized, Writable {
|
||||
|
||||
// Initialize amino acid mapper based on aa_mapping parameter
|
||||
AminoAcidMapper.initialize(params.aa_mapping)
|
||||
|
||||
// Parse and validate cofactor specifiers (Issue #79 part 2).
|
||||
// parseAndValidate throws PrankException on malformed input via LigandDefinition.parse().
|
||||
// The result is discarded here - re-parsing happens per-item in
|
||||
// Dataset.resolveCofactorDefinitions so dataset-column overrides go through the
|
||||
// same validation.
|
||||
if (params.cofactors != null && !params.cofactors.isEmpty()) {
|
||||
CofactorHandler.parseAndValidate(params.cofactors)
|
||||
log.info "Cofactors to include as protein surface: {}", params.cofactors
|
||||
|
||||
// R19: warn if a cofactor specifier's group name is also in the active aa_mapping.
|
||||
// Cofactor atoms would silently inherit the mapped AA's features rather than
|
||||
// cofactor defaults - defined behaviour, but invisible without a warning.
|
||||
Set<String> activeMappings = AminoAcidMapper.getInstance().getMappings().keySet()
|
||||
Set<String> overlapping = new LinkedHashSet<>()
|
||||
for (String spec : params.cofactors) {
|
||||
String trimmed = spec?.trim()
|
||||
if (trimmed == null || trimmed.isEmpty()) continue
|
||||
try {
|
||||
String name = Dataset.LigandDefinition.parse(trimmed).groupName?.toUpperCase()
|
||||
if (name != null && activeMappings.contains(name)) overlapping.add(name)
|
||||
} catch (Exception ignored) {
|
||||
// parse error already surfaced by parseAndValidate above
|
||||
}
|
||||
}
|
||||
if (!overlapping.isEmpty()) {
|
||||
log.warn "Cofactor specifier(s) name(s) {} are also covered by the active " +
|
||||
"aa_mapping. Cofactor atom features will be computed using the mapped " +
|
||||
"AA's table entries instead of cofactor defaults. Remove the entry " +
|
||||
"from aa_mapping or change the cofactor specifier to fix.", overlapping
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
String evalDirParam(String dirParam, String relativePrefixDir) {
|
||||
|
||||
@@ -517,6 +517,54 @@ class Params {
|
||||
@ModelParam // training
|
||||
List<String> positive_def_ligtypes = ["relevant"]
|
||||
|
||||
/**
|
||||
* Cofactor specifiers - HETATM groups to include as part of the protein surface.
|
||||
*
|
||||
* Matching groups will:
|
||||
* - Contribute heavy atoms to SAS point generation and pocket detection
|
||||
* - Be EXCLUDED from ligand detection
|
||||
* - Have features calculated from nearest protein residue (SAS-level) or
|
||||
* defaults (atom-level - cofactor atoms aren't amino acids)
|
||||
*
|
||||
* Specifier syntax mirrors the dataset 'ligands' column (Dataset.LigandDefinition):
|
||||
* "FAD" - all FAD groups
|
||||
* "FAD[group_id:A_500]" - specific FAD by chain + residue number
|
||||
* "FAD[A_500]" - shorthand for group_id
|
||||
* "FAD[atom_id:12345]" - by PDB atom serial
|
||||
* "FAD[contact_res_ids:A_D246,A_T259,...]" - by surrounding polymer residues
|
||||
*
|
||||
* Group names are matched against group.PDBName exactly (case-sensitive).
|
||||
* BioJava returns uppercase PDB names so in practice "FAD" matches and "fad" does not.
|
||||
* Validation happens at startup via LigandDefinition.parse().
|
||||
*
|
||||
* Can be overridden per-structure using the 'cofactors' column in dataset files.
|
||||
*
|
||||
* Example values: ["FAD", "PLP", "HEM"]
|
||||
* Precise specifiers (with [...]) are typically used via the CLI or per-row
|
||||
* dataset column, not in static config files.
|
||||
* Default: [] (empty - cofactors treated as ligands or ignored per ignore_het_groups)
|
||||
*
|
||||
* Related: ignore_het_groups (ignored groups are excluded from ligand detection AND
|
||||
* from protein surface; cofactors are excluded from ligand detection but
|
||||
* INCLUDED in surface).
|
||||
*/
|
||||
@RuntimeParam
|
||||
List<String> cofactors = []
|
||||
|
||||
/**
|
||||
* Maximum distance (Å) from a cofactor's center of mass to the nearest
|
||||
* protein atom for the cofactor to be considered associated with the protein.
|
||||
*
|
||||
* Cofactors beyond this distance are still included in the surface, but
|
||||
* an INFO warning is logged - the cofactor may be a crystallization artifact
|
||||
* or positioned in the solvent far from the protein.
|
||||
*
|
||||
* Set to 0 to disable proximity checking.
|
||||
* Default: 15.0 Å (covers most covalently/tightly bound cofactors).
|
||||
*/
|
||||
@RuntimeParam
|
||||
double cofactor_max_protein_dist = 15.0
|
||||
|
||||
/**
|
||||
* Amino acid mapping mode for non-canonical residues.
|
||||
*
|
||||
@@ -828,6 +876,18 @@ class Params {
|
||||
@RuntimeParam
|
||||
boolean vis_highlight_ligands = false
|
||||
|
||||
/**
|
||||
* Highlight cofactor atoms (matched via -cofactors) as teal sticks in PyMOL output,
|
||||
* distinct from ligand spheres and the polymer cartoon.
|
||||
*
|
||||
* If false, cofactor atoms render with PyMOL's default style for HETATMs (sticks in
|
||||
* the default colour), which can be visually indistinguishable from ligands or noise.
|
||||
* Setting this to false does NOT change pocket prediction - cofactor atoms remain on
|
||||
* the protein surface; only the colour highlight is removed.
|
||||
*/
|
||||
@RuntimeParam
|
||||
boolean vis_highlight_cofactors = true
|
||||
|
||||
/**
|
||||
* Method for computing binding site center of observed pocket for evaluation (used by DCC criterion).
|
||||
* Observed pocket can be defined by ligand or can be explicit (set of residues defined in the dataset).
|
||||
|
||||
@@ -21,8 +21,14 @@ import cz.siret.prank.utils.*
|
||||
import groovy.transform.CompileStatic
|
||||
import groovy.util.logging.Slf4j
|
||||
import org.biojava.nbio.structure.Atom
|
||||
import org.biojava.nbio.structure.Group
|
||||
import org.biojava.nbio.structure.ResidueNumber
|
||||
|
||||
import cz.siret.prank.program.params.Params
|
||||
|
||||
import static cz.siret.prank.domain.Dataset.LigandDefinition
|
||||
import static cz.siret.prank.geom.Struct.getAuthorId
|
||||
|
||||
import javax.annotation.Nullable
|
||||
import java.util.concurrent.ConcurrentLinkedQueue
|
||||
import java.util.concurrent.atomic.AtomicInteger
|
||||
@@ -96,7 +102,8 @@ class AnalyzeRoutine extends Routine {
|
||||
"fasta-masked" : { cmdFastaMasked() },
|
||||
"peptides" : { cmdPeptides() },
|
||||
"convert-dataset-to-atomid" : { cmdConvertContactresDataset() },
|
||||
"print-volsite-table" : { print_volsite_table() }
|
||||
"print-volsite-table" : { print_volsite_table() },
|
||||
"cofactors" : { cmdCofactors() }
|
||||
])
|
||||
|
||||
//===========================================================================================================//
|
||||
@@ -164,7 +171,7 @@ class AnalyzeRoutine extends Routine {
|
||||
}
|
||||
|
||||
/**
|
||||
* Binding site statistics — works for both ligand-based and explicit site datasets.
|
||||
* Binding site statistics - works for both ligand-based and explicit site datasets.
|
||||
* Produces a unified CSV with the same header regardless of site source.
|
||||
*/
|
||||
void cmdBindingSites() {
|
||||
@@ -291,7 +298,8 @@ class AnalyzeRoutine extends Routine {
|
||||
label: item.label,
|
||||
protein: p,
|
||||
observedLabeling: labeling,
|
||||
siteCentroids: centroids
|
||||
siteCentroids: centroids,
|
||||
cofactorResult: p.cofactorExtractionResult
|
||||
)).render()
|
||||
}
|
||||
}
|
||||
@@ -348,9 +356,9 @@ class AnalyzeRoutine extends Routine {
|
||||
* and reporting distances between methods, to SAS surface, and to protein atoms.
|
||||
*
|
||||
* Produces:
|
||||
* - binding_site_centers.csv — all results in one table
|
||||
* - binding_site_centers_{method}.csv — per-method tables
|
||||
* - binding_site_centers_summary.txt — overall + per-method distance statistics
|
||||
* - binding_site_centers.csv - all results in one table
|
||||
* - binding_site_centers_{method}.csv - per-method tables
|
||||
* - binding_site_centers_summary.txt - overall + per-method distance statistics
|
||||
*/
|
||||
void cmdBindingSiteCenters() {
|
||||
List<String> distColumns = ["dist_to_atom_com", "dist_to_sas", "dist_to_protein"]
|
||||
@@ -699,7 +707,8 @@ class AnalyzeRoutine extends Routine {
|
||||
proteinFile: item.proteinFile,
|
||||
label: item.label,
|
||||
protein: item.protein,
|
||||
observedLabeling: labeling
|
||||
observedLabeling: labeling,
|
||||
cofactorResult: item.protein.cofactorExtractionResult
|
||||
)).render()
|
||||
}
|
||||
}
|
||||
@@ -734,7 +743,7 @@ class AnalyzeRoutine extends Routine {
|
||||
def res = dataset.processItems { Dataset.Item item ->
|
||||
Protein p = item.protein
|
||||
|
||||
// Load conservation scores (graceful — null on failure)
|
||||
// Load conservation scores (graceful - null on failure)
|
||||
ConservationScore conservScore = null
|
||||
try {
|
||||
conservScore = p.loadConservationScores(item.context)
|
||||
@@ -788,7 +797,8 @@ class AnalyzeRoutine extends Routine {
|
||||
proteinFile: item.proteinFile,
|
||||
label: item.label,
|
||||
protein: p,
|
||||
doubleLabeling: labeling
|
||||
doubleLabeling: labeling,
|
||||
cofactorResult: p.cofactorExtractionResult
|
||||
)).render()
|
||||
}
|
||||
}
|
||||
@@ -1047,6 +1057,175 @@ class AnalyzeRoutine extends Routine {
|
||||
write res.writeErrorsAndGetSummary(outdir)
|
||||
}
|
||||
|
||||
/**
|
||||
* Survey HETATM groups and dry-run cofactor specifiers.
|
||||
*
|
||||
* <p>Without {@code -cofactors}: lists every distinct HETATM group instance with
|
||||
* chain/residue/atoms for discovery (which names exist? which to use as cofactor
|
||||
* specifiers?).
|
||||
*
|
||||
* <p>With {@code -cofactors} (or a {@code cofactors} column in the dataset): additionally
|
||||
* reports which specifiers matched which groups, using the same per-item resolution as
|
||||
* pocket prediction would (column overrides global). Lets users verify precise specifiers
|
||||
* before committing to a long-running run.
|
||||
*/
|
||||
void cmdCofactors() {
|
||||
DataTable dt = new DataTable("protein",
|
||||
"het_name", "chain", "res_num", "group_id",
|
||||
"n_heavy_atoms", "dist_to_protein",
|
||||
"currently_classified_as", "would_be_cofactor"
|
||||
)
|
||||
|
||||
// Aggregates across the dataset
|
||||
Map<String, Set<String>> nameToStructures = new java.util.concurrent.ConcurrentHashMap<>()
|
||||
Map<String, java.util.concurrent.atomic.AtomicInteger> nameToGroupCount = new java.util.concurrent.ConcurrentHashMap<>()
|
||||
Map<String, Set<String>> specToStructures = new java.util.concurrent.ConcurrentHashMap<>()
|
||||
Map<String, java.util.concurrent.atomic.AtomicInteger> specToGroupCount = new java.util.concurrent.ConcurrentHashMap<>()
|
||||
java.util.concurrent.atomic.AtomicInteger itemsWithSpecifiers = new java.util.concurrent.atomic.AtomicInteger()
|
||||
|
||||
DataTable mt = new DataTable("protein",
|
||||
"specifier", "matched_count", "matched_group_ids", "unmatched_reason")
|
||||
|
||||
def res = dataset.processItems { Dataset.Item item ->
|
||||
Protein p = item.protein
|
||||
|
||||
// Per-item resolved specifiers - same resolution as protein loading does.
|
||||
// Either the dataset's `cofactors` column (if present) or the global Params.inst.cofactors.
|
||||
List<LigandDefinition> itemSpecifiers = dataset.resolveCofactorDefinitions(item)
|
||||
if (!itemSpecifiers.isEmpty()) itemsWithSpecifiers.incrementAndGet()
|
||||
|
||||
// Index cofactor-matched groups for fast classification
|
||||
Set<Group> cofactorMatched = Collections.newSetFromMap(new IdentityHashMap<>())
|
||||
if (p.cofactorExtractionResult != null) {
|
||||
for (List<Group> gs : p.cofactorExtractionResult.foundGroups.values()) {
|
||||
cofactorMatched.addAll(gs)
|
||||
}
|
||||
}
|
||||
|
||||
// Index relevant + ignored ligand groups
|
||||
Set<Group> relevantLigGroups = Collections.newSetFromMap(new IdentityHashMap<>())
|
||||
Set<Group> ignoredLigGroups = Collections.newSetFromMap(new IdentityHashMap<>())
|
||||
for (Ligand lig : p.relevantLigands ?: []) {
|
||||
relevantLigGroups.addAll(lig.atoms.distinctGroups)
|
||||
}
|
||||
for (Ligand lig : p.allIgnoredLigands ?: []) {
|
||||
ignoredLigGroups.addAll(lig.atoms.distinctGroups)
|
||||
}
|
||||
|
||||
// Per-specifier per-structure match tracking
|
||||
Map<String, List<String>> matchedGroupIdsBySpec = new LinkedHashMap<>()
|
||||
for (LigandDefinition d : itemSpecifiers) matchedGroupIdsBySpec.put(d.originalString, new ArrayList<>())
|
||||
|
||||
for (Group g : Struct.getHetGroups(p.structure)) {
|
||||
String name = g.PDBName?.toUpperCase()
|
||||
if (name == null) continue
|
||||
|
||||
nameToGroupCount.computeIfAbsent(name, { new java.util.concurrent.atomic.AtomicInteger() } as java.util.function.Function).incrementAndGet()
|
||||
nameToStructures.computeIfAbsent(name, { (Set<String>) (java.util.concurrent.ConcurrentHashMap.newKeySet()) } as java.util.function.Function).add(item.label)
|
||||
|
||||
String chain = getAuthorId(g.chain)
|
||||
String resNum = g.residueNumber?.printFull() ?: "?"
|
||||
String groupId = "${chain}_${resNum}"
|
||||
Atoms ga = Atoms.allFromGroup(g).withoutHydrogens()
|
||||
|
||||
String cls
|
||||
if (cofactorMatched.contains(g)) cls = "cofactor"
|
||||
else if (relevantLigGroups.contains(g)) cls = "relevant_ligand"
|
||||
else if (ignoredLigGroups.contains(g)) cls = "ignored"
|
||||
else cls = "other"
|
||||
|
||||
int wouldBeCofactor = 0
|
||||
for (LigandDefinition d : itemSpecifiers) {
|
||||
if (d.matchesGroup(g, p)) {
|
||||
wouldBeCofactor = 1
|
||||
specToGroupCount.computeIfAbsent(d.originalString, { new java.util.concurrent.atomic.AtomicInteger() } as java.util.function.Function).incrementAndGet()
|
||||
specToStructures.computeIfAbsent(d.originalString, { (Set<String>) (java.util.concurrent.ConcurrentHashMap.newKeySet()) } as java.util.function.Function).add(item.label)
|
||||
matchedGroupIdsBySpec.get(d.originalString).add(groupId)
|
||||
}
|
||||
}
|
||||
|
||||
DataTable.Row r = dt.newRow(item.label)
|
||||
r.put("het_name", name)
|
||||
r.put("chain", chain)
|
||||
r.put("res_num", resNum)
|
||||
r.put("group_id", groupId)
|
||||
r.put("n_heavy_atoms", ga.count)
|
||||
if (ga.empty) {
|
||||
r.put("dist_to_protein", "")
|
||||
} else {
|
||||
r.put("dist_to_protein", p.proteinAtoms.dist(ga))
|
||||
}
|
||||
r.put("currently_classified_as", cls)
|
||||
r.put("would_be_cofactor", itemSpecifiers.isEmpty() ? "" : String.valueOf(wouldBeCofactor))
|
||||
}
|
||||
|
||||
// cofactor_matches.csv row(s) for this structure
|
||||
for (LigandDefinition d : itemSpecifiers) {
|
||||
List<String> matched = matchedGroupIdsBySpec.get(d.originalString)
|
||||
String reason = ""
|
||||
if (matched.isEmpty()) {
|
||||
boolean nameInStructure = Struct.getHetGroups(p.structure)
|
||||
.any { ((Group) it).PDBName?.toUpperCase() == d.groupName?.toUpperCase() }
|
||||
reason = nameInStructure ? "name present but specifier filter excluded all instances"
|
||||
: "name not in structure"
|
||||
}
|
||||
DataTable.Row mr = mt.newRow(item.label)
|
||||
mr.put("specifier", d.originalString)
|
||||
mr.put("matched_count", matched.size())
|
||||
mr.put("matched_group_ids", matched.join(" "))
|
||||
mr.put("unmatched_reason", reason)
|
||||
}
|
||||
|
||||
// Visualizations - renderer reads cofactorResult and emits per-name selections.
|
||||
if (params.visualizations) {
|
||||
new NewPymolRenderer("$outdir/visualizations", new RenderingModel(
|
||||
proteinFile: item.proteinFile,
|
||||
label: item.label,
|
||||
protein: p,
|
||||
cofactorResult: p.cofactorExtractionResult
|
||||
)).render()
|
||||
}
|
||||
}
|
||||
|
||||
writeFile "$outdir/het_groups.csv", dt.toCsv()
|
||||
|
||||
boolean anySpecifiers = itemsWithSpecifiers.get() > 0
|
||||
|
||||
StringBuilder summary = new StringBuilder()
|
||||
summary << "HETATM Survey for ${dataset.name} (${dataset.size} structures)\n\n"
|
||||
summary << "Most frequent HETATM groups:\n"
|
||||
nameToStructures.entrySet()
|
||||
.toSorted { -it.value.size() }
|
||||
.each { e ->
|
||||
int nStruct = e.value.size()
|
||||
int nGroups = nameToGroupCount.get(e.key).get()
|
||||
double pct = (100.0d * nStruct) / Math.max(1, dataset.size)
|
||||
summary << String.format(" %-8s %4d structures (%5.1f%%) - %d groups total\n",
|
||||
e.key, nStruct, pct, nGroups)
|
||||
}
|
||||
|
||||
if (anySpecifiers) {
|
||||
summary << "\nCofactor specifier match (per-item resolution, column overrides global):\n"
|
||||
specToStructures.entrySet()
|
||||
.toSorted { -it.value.size() }
|
||||
.each { e ->
|
||||
int nStruct = e.value.size()
|
||||
int nGroups = specToGroupCount.get(e.key).get()
|
||||
String marker = nStruct == 0 ? " ← matched no structures" : ""
|
||||
summary << String.format(" %-30s %d/%d structures, %d groups total%s\n",
|
||||
e.key, nStruct, dataset.size, nGroups, marker)
|
||||
}
|
||||
writeFile "$outdir/cofactor_matches.csv", mt.toCsv()
|
||||
}
|
||||
|
||||
String summaryStr = summary.toString()
|
||||
writeFile "$outdir/het_groups_summary.txt", summaryStr
|
||||
write summaryStr
|
||||
|
||||
write "Processed ${dataset.size} items"
|
||||
write res.writeErrorsAndGetSummary(outdir)
|
||||
}
|
||||
|
||||
|
||||
void print_volsite_table() {
|
||||
List<String> atomTypes = AtomTableFeature.atomPropertyTable.itemNames.toSorted()
|
||||
|
||||
@@ -9,7 +9,7 @@ import static cz.siret.prank.utils.Formatter.format
|
||||
/**
|
||||
* Collects structured rows of named values and produces CSV output and summary statistics.
|
||||
*
|
||||
* Columns are pre-registered at construction time. Row operations require no synchronization —
|
||||
* Columns are pre-registered at construction time. Row operations require no synchronization -
|
||||
* each Row is filled by a single thread, and only the final {@code rows.add()} is synchronized.
|
||||
*
|
||||
* Usage:
|
||||
@@ -42,7 +42,7 @@ class DataTable {
|
||||
|
||||
/**
|
||||
* Creates a new row and adds it to the table.
|
||||
* The returned Row can be filled via {@code put()} — no synchronization needed.
|
||||
* The returned Row can be filled via {@code put()} - no synchronization needed.
|
||||
*/
|
||||
Row newRow(String label) {
|
||||
Row row = new Row(label, columns.length, columnIndex)
|
||||
@@ -76,18 +76,18 @@ class DataTable {
|
||||
String toCsv() {
|
||||
StringBuilder sb = new StringBuilder()
|
||||
|
||||
sb << labelColumn
|
||||
sb << csvCell(labelColumn)
|
||||
for (String col : columns) {
|
||||
sb << ", " << col
|
||||
sb << ", " << csvCell(col)
|
||||
}
|
||||
sb << "\n"
|
||||
|
||||
for (Row row : getRowsSorted()) {
|
||||
sb << row.label
|
||||
sb << csvCell(row.label)
|
||||
for (int i = 0; i < columns.length; i++) {
|
||||
sb << ", "
|
||||
Object val = row.values[i]
|
||||
if (val != null) sb << val.toString()
|
||||
if (val != null) sb << csvCell(val.toString())
|
||||
}
|
||||
sb << "\n"
|
||||
}
|
||||
@@ -95,6 +95,20 @@ class DataTable {
|
||||
return sb.toString()
|
||||
}
|
||||
|
||||
/**
|
||||
* Minimal CSV cell encoding (RFC 4180):
|
||||
* - If the cell contains comma, double-quote, CR, or LF, wrap it in double quotes
|
||||
* and escape any inner double quotes by doubling them.
|
||||
* - Otherwise, return as-is.
|
||||
*/
|
||||
private static String csvCell(String value) {
|
||||
if (value == null) return ""
|
||||
boolean needsQuoting = value.indexOf(',') >= 0 || value.indexOf('"') >= 0 ||
|
||||
value.indexOf('\n') >= 0 || value.indexOf('\r') >= 0
|
||||
if (!needsQuoting) return value
|
||||
return '"' + value.replace('"', '""') + '"'
|
||||
}
|
||||
|
||||
// ---- Summary stats ----
|
||||
|
||||
/**
|
||||
|
||||
@@ -96,7 +96,8 @@ class EvalResiduesRoutine extends EvalRoutine {
|
||||
protein: item.protein,
|
||||
observedLabeling: observed,
|
||||
predictedLabeling: predicted,
|
||||
labeledPoints: predictedPoints
|
||||
labeledPoints: predictedPoints,
|
||||
cofactorResult: item.protein.cofactorExtractionResult
|
||||
)).render()
|
||||
}
|
||||
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
package cz.siret.prank.program.visualization
|
||||
|
||||
import cz.siret.prank.domain.CofactorHandler
|
||||
import cz.siret.prank.domain.Protein
|
||||
import cz.siret.prank.domain.labeling.BinaryLabeling
|
||||
import cz.siret.prank.domain.labeling.LabeledPoint
|
||||
@@ -9,6 +10,8 @@ import groovy.transform.CompileStatic
|
||||
import groovy.transform.TupleConstructor
|
||||
import org.biojava.nbio.structure.Atom
|
||||
|
||||
import javax.annotation.Nullable
|
||||
|
||||
import java.awt.*
|
||||
import java.util.List
|
||||
|
||||
@@ -32,6 +35,15 @@ class RenderingModel implements Parametrized {
|
||||
|
||||
List<Atom> siteCentroids
|
||||
|
||||
/**
|
||||
* Result of cofactor extraction for the loaded protein.
|
||||
* Carries the per-name grouping (`foundGroups`) used to emit per-cofactor PyMOL
|
||||
* selections, plus `unmatchedSpecifiers` for diagnostic comments. Null when no
|
||||
* cofactors are configured.
|
||||
*/
|
||||
@Nullable
|
||||
CofactorHandler.ExtractionResult cofactorResult
|
||||
|
||||
Style style = new Style()
|
||||
|
||||
static class Style {
|
||||
|
||||
@@ -1,14 +1,17 @@
|
||||
package cz.siret.prank.program.visualization.renderers
|
||||
|
||||
import cz.siret.prank.domain.CofactorHandler
|
||||
import cz.siret.prank.domain.labeling.BinaryLabeling
|
||||
import org.biojava.nbio.structure.Atom
|
||||
import cz.siret.prank.domain.labeling.LabeledResidue
|
||||
import cz.siret.prank.domain.labeling.ResidueLabeling
|
||||
import cz.siret.prank.geom.Atoms
|
||||
import cz.siret.prank.program.params.Parametrized
|
||||
import cz.siret.prank.program.visualization.RenderingModel
|
||||
import cz.siret.prank.utils.Futils
|
||||
import groovy.transform.CompileStatic
|
||||
import groovy.util.logging.Slf4j
|
||||
import org.biojava.nbio.structure.Atom
|
||||
import org.biojava.nbio.structure.Group
|
||||
|
||||
import java.awt.*
|
||||
import java.util.List
|
||||
@@ -118,6 +121,8 @@ show surface, prot
|
||||
|
||||
${renderLigands()}
|
||||
|
||||
${renderCofactors()}
|
||||
|
||||
${renderLabaledPoints()}
|
||||
|
||||
${renderResidueColoring()}
|
||||
@@ -144,6 +149,55 @@ set sphere_color, red
|
||||
"""
|
||||
}
|
||||
|
||||
/**
|
||||
* Render cofactor atoms as teal sticks, distinct from ligand spheres and protein cartoon.
|
||||
*
|
||||
* Emits one PyMOL selection per cofactor type (e.g. {@code cofactor_FAD},
|
||||
* {@code cofactor_PLP}) so users can toggle them independently, plus an aggregate
|
||||
* {@code cofactor_atoms} selection that drives the rendering. Unmatched cofactor
|
||||
* specifiers are noted in a header comment for diagnostics.
|
||||
*/
|
||||
private String renderCofactors() {
|
||||
if (!params.vis_highlight_cofactors) return ""
|
||||
CofactorHandler.ExtractionResult result = model.cofactorResult
|
||||
if (result == null || result.atoms.empty) return ""
|
||||
|
||||
return cofactorPymolBlock(result)
|
||||
}
|
||||
|
||||
/** Shared rendering of a cofactor ExtractionResult - keeps the logic in one place. */
|
||||
static String cofactorPymolBlock(CofactorHandler.ExtractionResult result) {
|
||||
StringBuilder sb = new StringBuilder()
|
||||
sb << "# cofactors (HETATMs treated as protein surface)\n"
|
||||
if (!result.unmatchedSpecifiers.isEmpty()) {
|
||||
sb << "# unmatched specifiers (no group found): ${result.unmatchedSpecifiers}\n"
|
||||
}
|
||||
sb << "set_color cofactor_col, [73, 168, 199]\n"
|
||||
|
||||
// Per-name selections - addressable in PyMOL for toggling/colouring individual cofactors.
|
||||
List<String> perNameSelectors = new ArrayList<>()
|
||||
for (Map.Entry<String, List<Group>> e : result.foundGroups.entrySet()) {
|
||||
String selName = "cofactor_" + e.key.replaceAll(/[^A-Za-z0-9]/, '_')
|
||||
List<String> ids = new ArrayList<>()
|
||||
for (Group g : e.value) {
|
||||
for (Atom a : Atoms.allFromGroup(g).withoutHydrogens().list) {
|
||||
ids.add(a.PDBserial.toString())
|
||||
}
|
||||
}
|
||||
if (ids.isEmpty()) continue
|
||||
String idsOrList = ids.collect { "id $it" }.join(" or ")
|
||||
sb << "select ${selName}, ${idsOrList}\n"
|
||||
perNameSelectors << selName
|
||||
}
|
||||
if (perNameSelectors.isEmpty()) return ""
|
||||
|
||||
sb << "select cofactor_atoms, " << perNameSelectors.join(" or ") << "\n"
|
||||
sb << "show sticks, cofactor_atoms\n"
|
||||
sb << "color cofactor_col, cofactor_atoms\n"
|
||||
sb << "set stick_radius, 0.18, cofactor_atoms\n"
|
||||
return sb.toString()
|
||||
}
|
||||
|
||||
private String renderResidueColoring() {
|
||||
if (model.observedLabeling != null) {
|
||||
if (model.predictedLabeling != null) {
|
||||
|
||||
@@ -73,6 +73,8 @@ set stick_color, magenta
|
||||
|
||||
${renderLigands(pair.holoProtein)}
|
||||
|
||||
${renderCofactors(pair.holoProtein)}
|
||||
|
||||
# SAS points
|
||||
|
||||
load "$pointsFileRelative", points
|
||||
@@ -133,6 +135,18 @@ orient
|
||||
return ""
|
||||
}
|
||||
|
||||
/**
|
||||
* Render cofactor atoms (matched via -cofactors) as teal sticks.
|
||||
* Delegates to {@link NewPymolRenderer#cofactorPymolBlock} so the structure of the PML
|
||||
* block stays identical between the two renderers.
|
||||
*/
|
||||
private String renderCofactors(Protein protein) {
|
||||
if (!params.vis_highlight_cofactors) return ""
|
||||
def result = protein.cofactorExtractionResult
|
||||
if (result == null || result.atoms.empty) return ""
|
||||
return NewPymolRenderer.cofactorPymolBlock(result)
|
||||
}
|
||||
|
||||
private String renderLigands(Protein protein) {
|
||||
|
||||
if (!params.vis_highlight_ligands) {
|
||||
|
||||
209
src/test/groovy/cz/siret/prank/domain/CofactorHandlerTest.groovy
Normal file
209
src/test/groovy/cz/siret/prank/domain/CofactorHandlerTest.groovy
Normal file
@@ -0,0 +1,209 @@
|
||||
package cz.siret.prank.domain
|
||||
|
||||
import cz.siret.prank.domain.loaders.LoaderParams
|
||||
import cz.siret.prank.program.PrankException
|
||||
import groovy.transform.CompileStatic
|
||||
import org.junit.jupiter.api.Test
|
||||
import org.biojava.nbio.structure.Group
|
||||
|
||||
import static cz.siret.prank.domain.Dataset.LigandDefinition
|
||||
import static org.junit.jupiter.api.Assertions.*
|
||||
|
||||
/**
|
||||
* Unit tests for CofactorHandler - specifier parsing, configuration, and LoaderParams glue.
|
||||
*
|
||||
* Integration tests that exercise extractCofactorAtoms against a real BioJava Structure
|
||||
* live in {@link CofactorIntegrationTest} and {@link CofactorPipelineTest}.
|
||||
*/
|
||||
@CompileStatic
|
||||
class CofactorHandlerTest {
|
||||
|
||||
private static List<LigandDefinition> parse(List<String> specs) {
|
||||
return CofactorHandler.parseAndValidate(specs)
|
||||
}
|
||||
|
||||
// ===== Handler configuration =====
|
||||
|
||||
@Test
|
||||
void disabledWhenEmptyList() {
|
||||
assertFalse(new CofactorHandler([] as List<LigandDefinition>).isEnabled())
|
||||
}
|
||||
|
||||
@Test
|
||||
void disabledWhenNullList() {
|
||||
assertFalse(new CofactorHandler(null).isEnabled())
|
||||
}
|
||||
|
||||
@Test
|
||||
void enabledWhenSpecifiersGiven() {
|
||||
assertTrue(new CofactorHandler(parse(["FAD", "PLP"])).isEnabled())
|
||||
}
|
||||
|
||||
@Test
|
||||
void isCofactorIsFalseForNull() {
|
||||
def handler = new CofactorHandler(parse(["FAD"]))
|
||||
assertFalse(handler.isCofactor((Group) null))
|
||||
}
|
||||
|
||||
// ===== Specifier parsing & validation =====
|
||||
|
||||
@Test
|
||||
void parsesBareName() {
|
||||
def defs = parse(["FAD"])
|
||||
assertEquals(1, defs.size())
|
||||
assertEquals("FAD", defs[0].groupName)
|
||||
assertNull(defs[0].groupId)
|
||||
assertNull(defs[0].atomId)
|
||||
}
|
||||
|
||||
@Test
|
||||
void parsesGroupIdSpecifier() {
|
||||
def defs = parse(["FAD[group_id:A_500]"])
|
||||
assertEquals("FAD", defs[0].groupName)
|
||||
assertEquals("A_500", defs[0].groupId)
|
||||
}
|
||||
|
||||
@Test
|
||||
void parsesShorthandGroupId() {
|
||||
// Bare bracketed value defaults to group_id per LigandDefinition.parse
|
||||
def defs = parse(["HEM[A_300]"])
|
||||
assertEquals("HEM", defs[0].groupName)
|
||||
assertEquals("A_300", defs[0].groupId)
|
||||
}
|
||||
|
||||
@Test
|
||||
void parsesAtomIdSpecifier() {
|
||||
def defs = parse(["FAD[atom_id:12345]"])
|
||||
assertEquals(Integer.valueOf(12345), defs[0].atomId)
|
||||
}
|
||||
|
||||
@Test
|
||||
void parsesContactResIdsSpecifier() {
|
||||
def defs = parse(["PLP[contact_res_ids:A_D246,A_T259]"])
|
||||
assertEquals(2, defs[0].contactResidueIds.size())
|
||||
}
|
||||
|
||||
@Test
|
||||
void rejectsUnknownSpecifierType() {
|
||||
assertThrows(PrankException) { parse(["FAD[whatever:foo]"]) }
|
||||
}
|
||||
|
||||
@Test
|
||||
void rejectsNonIntegerAtomId() {
|
||||
assertThrows(PrankException) { parse(["FAD[atom_id:notanumber]"]) }
|
||||
}
|
||||
|
||||
@Test
|
||||
void errorMessageMentionsCofactor() {
|
||||
// R7/#7: error from LigandDefinition is wrapped with cofactor-specific context,
|
||||
// so users don't see misleading "dataset file" text when the source is CLI.
|
||||
def ex = assertThrows(PrankException) { parse(["FAD[whatever:foo]"]) }
|
||||
assertTrue(ex.message.contains("cofactor"),
|
||||
"Error message should mention 'cofactor', was: ${ex.message}")
|
||||
assertTrue(ex.message.contains("FAD[whatever:foo]"),
|
||||
"Error message should include the offending specifier, was: ${ex.message}")
|
||||
}
|
||||
|
||||
// ===== R-AUDIT-1: bracket-aware splitting for contact_res_ids =====
|
||||
|
||||
@Test
|
||||
void contactResIdsCommasArePreservedFromList() {
|
||||
// Bug #1 from audit: when CLI/CHAIN_SPLITTER over-splits a specifier like
|
||||
// FAD[contact_res_ids:A_D246,A_T259,A_E423] into multiple list elements,
|
||||
// parseAndValidate must rejoin and re-split with bracket awareness.
|
||||
List<String> overSplit = ["FAD[contact_res_ids:A_D246", "A_T259", "A_E423]"]
|
||||
def defs = CofactorHandler.parseAndValidate(overSplit)
|
||||
assertEquals(1, defs.size(), "Should reassemble into one specifier, got: $defs")
|
||||
assertEquals("FAD", defs[0].groupName)
|
||||
assertEquals(3, defs[0].contactResidueIds.size())
|
||||
assertEquals(["A_D246", "A_T259", "A_E423"], defs[0].contactResidueIds)
|
||||
}
|
||||
|
||||
@Test
|
||||
void contactResIdsCommasArePreservedFromString() {
|
||||
// Bug #1: column-string path must also use bracket-aware splitting.
|
||||
def defs = CofactorHandler.parseAndValidate(
|
||||
"FAD[contact_res_ids:A_D246,A_T259,A_E423],PLP")
|
||||
assertEquals(2, defs.size())
|
||||
assertEquals("FAD", defs[0].groupName)
|
||||
assertEquals(3, defs[0].contactResidueIds.size())
|
||||
assertEquals("PLP", defs[1].groupName)
|
||||
}
|
||||
|
||||
@Test
|
||||
void groupIdSpecifierWithCommaInBrackets() {
|
||||
// Defensive: group_id values don't normally have commas, but the bracket-aware
|
||||
// splitter should preserve them even if a chain ID happens to contain weird chars.
|
||||
def defs = CofactorHandler.parseAndValidate("FAD[group_id:A_500],PLP[A_300]")
|
||||
assertEquals(2, defs.size())
|
||||
}
|
||||
|
||||
// ===== R-AUDIT-2: case normalization =====
|
||||
|
||||
@Test
|
||||
void lowercaseGroupNameIsNormalized() {
|
||||
// Bug #2: BioJava stores PDB names in uppercase. A user-supplied "fad" must
|
||||
// be matched against "FAD" - not silently zero-matched.
|
||||
def defs = CofactorHandler.parseAndValidate(["fad"])
|
||||
assertEquals(1, defs.size())
|
||||
assertEquals("FAD", defs[0].groupName)
|
||||
}
|
||||
|
||||
@Test
|
||||
void mixedCaseGroupNameIsNormalized() {
|
||||
def defs = CofactorHandler.parseAndValidate(["Fad", "pLp[A_500]"])
|
||||
assertEquals(2, defs.size())
|
||||
assertEquals("FAD", defs[0].groupName)
|
||||
assertEquals("PLP", defs[1].groupName)
|
||||
// chain ID inside [] is preserved as-is (case-significant)
|
||||
assertEquals("A_500", defs[1].groupId)
|
||||
}
|
||||
|
||||
@Test
|
||||
void specifierBodyPreservesCase() {
|
||||
// The bracketed specifier body (chain IDs, residue codes) is case-significant.
|
||||
// Only the group-name prefix is upper-cased.
|
||||
def defs = CofactorHandler.parseAndValidate(["fad[contact_res_ids:A_d246]"])
|
||||
assertEquals("FAD", defs[0].groupName)
|
||||
// The contact-residue-id retains its original case (BioJava handles AA-code case)
|
||||
assertEquals(["A_d246"], defs[0].contactResidueIds)
|
||||
}
|
||||
|
||||
@Test
|
||||
void emptyEntriesAreDropped() {
|
||||
def defs = parse(["FAD", "", null, " ", "PLP"])
|
||||
assertEquals(2, defs.size())
|
||||
assertEquals(["FAD", "PLP"], defs*.groupName)
|
||||
}
|
||||
|
||||
@Test
|
||||
void parseAndValidateHandlesNull() {
|
||||
assertEquals([], CofactorHandler.parseAndValidate((List<String>) null))
|
||||
assertEquals([], CofactorHandler.parseAndValidate([] as List<String>))
|
||||
assertEquals([], CofactorHandler.parseAndValidate((String) null))
|
||||
assertEquals([], CofactorHandler.parseAndValidate(" "))
|
||||
}
|
||||
|
||||
@Test
|
||||
void preservesOriginalSpecifierString() {
|
||||
def defs = parse(["FAD[group_id:A_500]"])
|
||||
assertEquals("FAD[group_id:A_500]", defs[0].originalString)
|
||||
}
|
||||
|
||||
// ===== LoaderParams integration =====
|
||||
|
||||
@Test
|
||||
void loaderParamsDefaultsToNullHandler() {
|
||||
def lp = new LoaderParams()
|
||||
assertNull(lp.cofactorHandler)
|
||||
assertFalse(lp.isCofactor((Group) null))
|
||||
}
|
||||
|
||||
@Test
|
||||
void loaderParamsWithHandlerDelegatesIsCofactor() {
|
||||
def lp = new LoaderParams()
|
||||
lp.cofactorHandler = new CofactorHandler(parse(["FAD"]))
|
||||
// No extraction has run yet → matchedGroups empty → isCofactor returns false for any group
|
||||
assertFalse(lp.isCofactor((Group) null))
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,148 @@
|
||||
package cz.siret.prank.domain
|
||||
|
||||
import cz.siret.prank.domain.loaders.LoaderParams
|
||||
import groovy.transform.CompileStatic
|
||||
import org.junit.jupiter.api.BeforeAll
|
||||
import org.junit.jupiter.api.Test
|
||||
import org.junit.jupiter.api.condition.EnabledIf
|
||||
import org.junit.jupiter.api.function.ThrowingSupplier
|
||||
|
||||
import static cz.siret.prank.domain.Dataset.LigandDefinition
|
||||
import static org.junit.jupiter.api.Assertions.*
|
||||
|
||||
/**
|
||||
* Integration tests for the cofactor feature against real BioJava structures.
|
||||
*
|
||||
* Uses:
|
||||
* - {@code 1AHP.pdb} (PLP cofactor) - gated behind @EnabledIf("has1AHP") so the test
|
||||
* is skipped on environments that haven't downloaded the file.
|
||||
* - {@code 1fbl.pdb} (no relevant cofactor) - for baseline / regression checks. Always available.
|
||||
*/
|
||||
@CompileStatic
|
||||
class CofactorIntegrationTest {
|
||||
|
||||
static final String TEST_DATA = "distro/test_data"
|
||||
static final String PDB_1AHP = "$TEST_DATA/1AHP.pdb"
|
||||
static final String PDB_1FBL = "$TEST_DATA/1fbl.pdb"
|
||||
|
||||
static boolean has1AHP() {
|
||||
return new File(PDB_1AHP).exists()
|
||||
}
|
||||
|
||||
private static LoaderParams loaderParamsWithCofactors(List<String> specs) {
|
||||
def lp = new LoaderParams()
|
||||
if (specs != null && !specs.isEmpty()) {
|
||||
lp.cofactorHandler = new CofactorHandler(CofactorHandler.parseAndValidate(specs))
|
||||
}
|
||||
return lp
|
||||
}
|
||||
|
||||
@BeforeAll
|
||||
static void setup() {
|
||||
LoaderParams.ignoreLigandsSwitch = false
|
||||
}
|
||||
|
||||
// ===== Baseline / default behaviour =====
|
||||
|
||||
@Test
|
||||
void defaultBehaviorUnchanged() {
|
||||
def lp = new LoaderParams()
|
||||
def protein = Protein.load(PDB_1FBL, lp)
|
||||
|
||||
assertNotNull(protein.proteinAtoms)
|
||||
assertFalse(protein.proteinAtoms.empty)
|
||||
assertTrue(protein.proteinAtoms.count > 2000)
|
||||
}
|
||||
|
||||
@Test
|
||||
void emptyCofactorListSameAsDefault() {
|
||||
def lp1 = new LoaderParams()
|
||||
def lp2 = new LoaderParams()
|
||||
lp2.cofactorHandler = new CofactorHandler([] as List<LigandDefinition>)
|
||||
|
||||
def p1 = Protein.load(PDB_1FBL, lp1)
|
||||
def p2 = Protein.load(PDB_1FBL, lp2)
|
||||
|
||||
assertEquals(p1.proteinAtoms.count, p2.proteinAtoms.count)
|
||||
}
|
||||
|
||||
@Test
|
||||
void missingCofactorDoesNotFail() {
|
||||
// "ZZZZ" is a valid specifier syntax that won't match any real PDB residue name
|
||||
def lp = loaderParamsWithCofactors(["ZZZZ"])
|
||||
ThrowingSupplier<Protein> supplier = { Protein.load(PDB_1FBL, lp) } as ThrowingSupplier<Protein>
|
||||
Protein protein = assertDoesNotThrow(supplier)
|
||||
assertNotNull(protein.proteinAtoms)
|
||||
assertEquals(["ZZZZ"], protein.cofactorExtractionResult?.unmatchedSpecifiers)
|
||||
}
|
||||
|
||||
// ===== 1AHP (PLP) =====
|
||||
|
||||
@Test
|
||||
@EnabledIf("has1AHP")
|
||||
void cofactorAtomsContributeToProteinSurface() {
|
||||
def p1 = Protein.load(PDB_1AHP, new LoaderParams())
|
||||
int atomsWithout = p1.proteinAtoms.count
|
||||
|
||||
def p2 = Protein.load(PDB_1AHP, loaderParamsWithCofactors(["PLP"]))
|
||||
int atomsWith = p2.proteinAtoms.count
|
||||
|
||||
assertTrue(atomsWith > atomsWithout,
|
||||
"Cofactor atoms should be added: before=$atomsWithout after=$atomsWith")
|
||||
assertTrue(atomsWith - atomsWithout >= 10,
|
||||
"Expected ~15 PLP heavy atoms, got ${atomsWith - atomsWithout}")
|
||||
}
|
||||
|
||||
@Test
|
||||
@EnabledIf("has1AHP")
|
||||
void cofactorsExcludedFromLigandDetection() {
|
||||
def protein = Protein.load(PDB_1AHP, loaderParamsWithCofactors(["PLP"]))
|
||||
|
||||
def allLigandNames = protein.ligands.allIncludingIgnored
|
||||
.collectMany { [it.name, it.nameCode] }
|
||||
.findAll { it != null }
|
||||
|
||||
assertFalse(allLigandNames.any { ((String) it).toUpperCase().contains("PLP") },
|
||||
"PLP should be excluded from ligand detection, found: $allLigandNames")
|
||||
}
|
||||
|
||||
@Test
|
||||
@EnabledIf("has1AHP")
|
||||
void extractionResultIsAccessibleOnProtein() {
|
||||
def protein = Protein.load(PDB_1AHP, loaderParamsWithCofactors(["PLP"]))
|
||||
|
||||
def result = protein.cofactorExtractionResult
|
||||
assertNotNull(result, "ExtractionResult should be stored on Protein")
|
||||
assertFalse(result.atoms.empty, "PLP atoms should be in result")
|
||||
assertTrue(result.foundGroups.containsKey("PLP"), "PLP should be in foundGroups")
|
||||
assertTrue(result.unmatchedSpecifiers.isEmpty(), "Specifier should have matched")
|
||||
}
|
||||
|
||||
// ===== Precise specifiers (R16) =====
|
||||
|
||||
@Test
|
||||
@EnabledIf("has1AHP")
|
||||
void preciseGroupIdMatchesNoMoreThanBareName() {
|
||||
def proteinAll = Protein.load(PDB_1AHP, loaderParamsWithCofactors(["PLP"]))
|
||||
int allMatched = proteinAll.cofactorExtractionResult.foundGroups.get("PLP")?.size() ?: 0
|
||||
|
||||
// A non-existent precise specifier should match strictly less (probably zero)
|
||||
def proteinNone = Protein.load(PDB_1AHP, loaderParamsWithCofactors(["PLP[Z_999]"]))
|
||||
int noneMatched = proteinNone.cofactorExtractionResult?.foundGroups?.get("PLP")?.size() ?: 0
|
||||
|
||||
assertTrue(noneMatched <= allMatched,
|
||||
"Precise specifier should match no more than the bare name (all=$allMatched, precise=$noneMatched)")
|
||||
assertEquals(0, noneMatched, "Z_999 does not exist in 1AHP")
|
||||
}
|
||||
|
||||
@Test
|
||||
void unmatchedPreciseSpecifierIsReported() {
|
||||
// FAD[Z_999] won't match anything in 1fbl. The handler should report it as unmatched.
|
||||
def lp = loaderParamsWithCofactors(["FAD[group_id:Z_999]"])
|
||||
ThrowingSupplier<Protein> supplier = { Protein.load(PDB_1FBL, lp) } as ThrowingSupplier<Protein>
|
||||
Protein protein = assertDoesNotThrow(supplier)
|
||||
assertNotNull(protein.proteinAtoms)
|
||||
assertEquals(["FAD[group_id:Z_999]"],
|
||||
protein.cofactorExtractionResult?.unmatchedSpecifiers)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,492 @@
|
||||
package cz.siret.prank.domain
|
||||
|
||||
import cz.siret.prank.domain.AminoAcidMapper
|
||||
import cz.siret.prank.domain.loaders.LoaderParams
|
||||
import cz.siret.prank.features.implementation.conservation.ConservationScore
|
||||
import cz.siret.prank.features.implementation.conservation.ResidueNumberWrapper
|
||||
import cz.siret.prank.geom.Atoms
|
||||
import cz.siret.prank.program.params.Params
|
||||
import cz.siret.prank.test.Log4jCapture
|
||||
import groovy.transform.CompileStatic
|
||||
import org.biojava.nbio.structure.Atom
|
||||
import org.biojava.nbio.structure.Group
|
||||
import org.biojava.nbio.structure.ResidueNumber
|
||||
import org.junit.jupiter.api.AfterAll
|
||||
import org.junit.jupiter.api.BeforeAll
|
||||
import org.junit.jupiter.api.Test
|
||||
import org.junit.jupiter.api.condition.EnabledIf
|
||||
import org.junit.jupiter.api.function.ThrowingSupplier
|
||||
import org.junit.jupiter.api.parallel.Isolated
|
||||
import org.junit.jupiter.api.parallel.ResourceLock
|
||||
|
||||
import java.lang.reflect.Constructor
|
||||
import java.util.concurrent.Callable
|
||||
import java.util.concurrent.ExecutorService
|
||||
import java.util.concurrent.Executors
|
||||
import java.util.concurrent.Future
|
||||
import java.util.concurrent.TimeUnit
|
||||
|
||||
import static cz.siret.prank.domain.Dataset.LigandDefinition
|
||||
import static org.junit.jupiter.api.Assertions.*
|
||||
|
||||
/**
|
||||
* Pipeline-level cofactor tests using the in-repo {@code 1t7qa.pdb} (contains COA).
|
||||
*
|
||||
* These exercise the loading path end-to-end (Protein.load → cofactor extraction →
|
||||
* ligand detection), but stop short of running a full prediction. The
|
||||
* full {@code prank predict} run is covered by the drop-in safety benchmark
|
||||
* (see PLAN §8.4, {@code benchmark/cofactors_dropin_safety.sh}).
|
||||
*/
|
||||
@Isolated
|
||||
@ResourceLock("Params")
|
||||
@CompileStatic
|
||||
class CofactorPipelineTest {
|
||||
|
||||
static final String PDB_1T7QA = "distro/test_data/liganated/1t7qa.pdb"
|
||||
static final String PDB_1FBL = "distro/test_data/1fbl.pdb"
|
||||
static final String PDB_1AHP = "distro/test_data/1AHP.pdb"
|
||||
|
||||
static boolean has1AHP() {
|
||||
return new File(PDB_1AHP).exists()
|
||||
}
|
||||
|
||||
static Params originalParams
|
||||
|
||||
@BeforeAll
|
||||
static void setup() {
|
||||
originalParams = (Params) Params.inst.clone()
|
||||
Params.INSTANCE = new Params()
|
||||
LoaderParams.ignoreLigandsSwitch = false
|
||||
}
|
||||
|
||||
@AfterAll
|
||||
static void tearDown() {
|
||||
Params.INSTANCE = originalParams
|
||||
}
|
||||
|
||||
private static LoaderParams loaderParamsWith(List<String> specs) {
|
||||
def lp = new LoaderParams()
|
||||
lp.cofactorHandler = new CofactorHandler(CofactorHandler.parseAndValidate(specs))
|
||||
return lp
|
||||
}
|
||||
|
||||
@Test
|
||||
void cofactorAtomsPhysicallyPresentInProteinAtoms() {
|
||||
def protein = Protein.load(PDB_1T7QA, loaderParamsWith(["COA"]))
|
||||
|
||||
// Find COA groups directly via BioJava
|
||||
List<Group> coaGroups = protein.structure.chains
|
||||
.collectMany { it.atomGroups }
|
||||
.findAll { ((Group) it).PDBName == "COA" } as List<Group>
|
||||
assertFalse(coaGroups.isEmpty(), "1t7qa should contain COA groups")
|
||||
|
||||
// Build the expected set of COA heavy-atom serials
|
||||
Set<Integer> coaSerials = new HashSet<>()
|
||||
for (Group g : coaGroups) {
|
||||
for (Atom a : Atoms.allFromGroup(g).withoutHydrogens().list) {
|
||||
coaSerials.add(a.PDBserial)
|
||||
}
|
||||
}
|
||||
|
||||
// Verify every COA heavy atom's serial appears in proteinAtoms
|
||||
Set<Integer> proteinSerials = new HashSet<>()
|
||||
for (Atom a : protein.proteinAtoms.list) {
|
||||
proteinSerials.add(a.PDBserial)
|
||||
}
|
||||
assertTrue(proteinSerials.containsAll(coaSerials),
|
||||
"COA heavy atoms should be in proteinAtoms (missing: ${coaSerials - proteinSerials})")
|
||||
}
|
||||
|
||||
@Test
|
||||
void cofactorExtractionResultStoredOnProtein() {
|
||||
def protein = Protein.load(PDB_1T7QA, loaderParamsWith(["COA"]))
|
||||
|
||||
def result = protein.cofactorExtractionResult
|
||||
assertNotNull(result, "ExtractionResult should be stored on Protein")
|
||||
assertFalse(result.atoms.empty, "COA atoms should be in result")
|
||||
assertTrue(result.foundGroups.containsKey("COA"), "COA should be in foundGroups")
|
||||
assertTrue(result.unmatchedSpecifiers.isEmpty(), "All specifiers should match")
|
||||
}
|
||||
|
||||
@Test
|
||||
void coaExcludedFromLigandDetection() {
|
||||
def protein = Protein.load(PDB_1T7QA, loaderParamsWith(["COA"]))
|
||||
|
||||
List<String> allLigNames = (List<String>) protein.ligands.allIncludingIgnored
|
||||
.collect { (it.name as String)?.toUpperCase() }
|
||||
.findAll { it != null }
|
||||
|
||||
assertFalse(allLigNames.contains("COA"),
|
||||
"COA should be excluded from ligands, got: $allLigNames")
|
||||
}
|
||||
|
||||
@Test
|
||||
void atomCountIncreasesWithCofactor() {
|
||||
def p1 = Protein.load(PDB_1T7QA, new LoaderParams())
|
||||
int without = p1.proteinAtoms.count
|
||||
|
||||
def p2 = Protein.load(PDB_1T7QA, loaderParamsWith(["COA"]))
|
||||
int with = p2.proteinAtoms.count
|
||||
|
||||
int added = with - without
|
||||
assertTrue(added >= 20,
|
||||
"COA should add >= 20 heavy atoms, got $added (before=$without, after=$with)")
|
||||
}
|
||||
|
||||
// ===== R17/R19/R20 regression guards =====
|
||||
|
||||
@Test
|
||||
void csvFeatureDoesNotThrowOnCofactorAtoms() {
|
||||
// R17 (audit fix #3): the *real* regression. With -cofactors set AND
|
||||
// CsvFileFeature configured with a non-empty column list AND a CSV that
|
||||
// covers only one polymer atom serial, the cofactor atoms would historically hit
|
||||
// CsvFileFeatureValues.missingError → PrankException. The R17 short-circuit
|
||||
// in CsvFileFeature.calculateForAtom must skip cofactor atoms before that lookup.
|
||||
//
|
||||
// To prove the test exercises the regression (and isn't trivially passing because
|
||||
// the CSV path is dead), we also call calculateForAtom on a polymer atom whose
|
||||
// serial IS present in the CSV and assert it returns the CSV's value.
|
||||
File csvDir = File.createTempFile("p2rank-csv-test", ".dir")
|
||||
csvDir.delete()
|
||||
csvDir.mkdirs()
|
||||
try {
|
||||
def lp = loaderParamsWith(["COA"])
|
||||
Protein protein = Protein.load(PDB_1T7QA, lp)
|
||||
|
||||
// Pick a real polymer atom and write a CSV that covers exactly its serial.
|
||||
org.biojava.nbio.structure.Atom polymerAtom = protein.proteinAtoms.list.find { Atom a ->
|
||||
a.group.type == org.biojava.nbio.structure.GroupType.AMINOACID
|
||||
}
|
||||
assertNotNull(polymerAtom, "1t7qa should have at least one polymer atom")
|
||||
int polymerSerial = polymerAtom.PDBserial
|
||||
|
||||
File csv = new File(csvDir, "1t7qa.pdb.csv")
|
||||
csv.text = "pdb serial,dummy_col\n${polymerSerial},0.5\n"
|
||||
|
||||
Params.inst.feat_csv_columns = ["dummy_col"]
|
||||
Params.inst.feat_csv_directories = [csvDir.absolutePath]
|
||||
Params.inst.feat_csv_ignore_missing = false // strict - would historically throw
|
||||
|
||||
def feature = new cz.siret.prank.features.implementation.csv.CsvFileFeature()
|
||||
feature.preProcessProtein(protein, null)
|
||||
|
||||
// Control: polymer atom whose serial is in CSV → must read 0.5
|
||||
cz.siret.prank.features.api.AtomFeatureCalculationContext polyCtx =
|
||||
new cz.siret.prank.features.api.AtomFeatureCalculationContext(protein, polymerAtom)
|
||||
double[] polymerResult = feature.calculateForAtom(polymerAtom, polyCtx)
|
||||
assertEquals(1, polymerResult.length)
|
||||
assertEquals(0.5d, polymerResult[0], 1e-12,
|
||||
"Polymer atom (control) must read 0.5 from CSV - proves the lookup path is live")
|
||||
|
||||
// The actual R17 assertion: cofactor atom whose serial is NOT in CSV
|
||||
// must not throw, must return a zero vector.
|
||||
org.biojava.nbio.structure.Group coaGroup = protein.structure.chains
|
||||
.collectMany { it.atomGroups }
|
||||
.find { ((Group) it).PDBName == "COA" } as Group
|
||||
assertNotNull(coaGroup, "1t7qa should contain a COA group")
|
||||
org.biojava.nbio.structure.Atom coaAtom = coaGroup.atoms.first()
|
||||
|
||||
cz.siret.prank.features.api.AtomFeatureCalculationContext coaCtx =
|
||||
new cz.siret.prank.features.api.AtomFeatureCalculationContext(protein, coaAtom)
|
||||
|
||||
ThrowingSupplier<double[]> supplier = { feature.calculateForAtom(coaAtom, coaCtx) } as ThrowingSupplier<double[]>
|
||||
double[] result = assertDoesNotThrow(supplier)
|
||||
assertEquals(1, result.length, "R17 returns zero vector matching header size")
|
||||
assertEquals(0.0d, result[0], 1e-12)
|
||||
} finally {
|
||||
Params.inst.feat_csv_columns = []
|
||||
Params.inst.feat_csv_directories = []
|
||||
Params.inst.feat_csv_ignore_missing = false
|
||||
csvDir.listFiles()?.each { it.delete() }
|
||||
csvDir.delete()
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
void conservationFeatureSafeOnCofactorAtoms() {
|
||||
// R20: confirm cofactor atoms return 0.0 from ConservationFeature even when a
|
||||
// ConservationScore IS loaded. The type-guard `parentAA.getType() != AMINOACID`
|
||||
// at the top of ConservationFeature.calculateForAtom short-circuits before the
|
||||
// score lookup; cofactor HETATM groups thus contribute 0.0 regardless of what's
|
||||
// in the score map.
|
||||
//
|
||||
// To prove the type-guard (not an empty score map) is responsible, we inject a
|
||||
// score map that BOTH covers polymer residues with non-zero values AND covers
|
||||
// the cofactor's residue number with a non-zero "poison" value. If the type-guard
|
||||
// fired, the cofactor returns 0.0 (not the poison). If it didn't, we'd see the
|
||||
// poison value bleed through.
|
||||
def lp = loaderParamsWith(["COA"])
|
||||
Protein protein = Protein.load(PDB_1T7QA, lp)
|
||||
|
||||
Group coaGroup = protein.structure.chains
|
||||
.collectMany { it.atomGroups }
|
||||
.find { ((Group) it).PDBName == "COA" } as Group
|
||||
assertNotNull(coaGroup, "1t7qa should contain a COA group")
|
||||
Atom coaAtom = coaGroup.atoms.first()
|
||||
|
||||
Atom polymerAtom = protein.proteinAtoms.list.find { Atom a ->
|
||||
a.group.type == org.biojava.nbio.structure.GroupType.AMINOACID
|
||||
}
|
||||
assertNotNull(polymerAtom, "1t7qa should have at least one polymer atom")
|
||||
|
||||
Map<ResidueNumberWrapper, Double> scoreMap = new HashMap<>()
|
||||
scoreMap.put(new ResidueNumberWrapper(polymerAtom.group.residueNumber), 0.77d)
|
||||
scoreMap.put(new ResidueNumberWrapper(coaGroup.residueNumber), 0.99d) // "poison"
|
||||
|
||||
// Construct ConservationScore via its package-private constructor.
|
||||
Constructor<ConservationScore> ctor =
|
||||
ConservationScore.class.getDeclaredConstructor(Map.class)
|
||||
ctor.setAccessible(true)
|
||||
ConservationScore score = ctor.newInstance(scoreMap)
|
||||
|
||||
// Install via the same secondaryData keys Protein uses.
|
||||
protein.secondaryData.put(ConservationScore.CONSERV_SCORE_KEY, score)
|
||||
protein.secondaryData.put(ConservationScore.CONSERV_LOADED_KEY, true)
|
||||
assertNotNull(protein.conservationScore, "Score should now be reachable via protein.conservationScore")
|
||||
|
||||
def feature = new cz.siret.prank.features.implementation.conservation.ConservationFeature()
|
||||
|
||||
// Control: polymer atom must read the real score (proves the lookup path is live).
|
||||
def polyCtx = new cz.siret.prank.features.api.AtomFeatureCalculationContext(protein, polymerAtom)
|
||||
double[] polymerResult = feature.calculateForAtom(polymerAtom, polyCtx)
|
||||
assertEquals(1, polymerResult.length)
|
||||
assertEquals(0.77d, polymerResult[0], 1e-12,
|
||||
"Polymer atom (control) must read its score - proves the lookup path is live")
|
||||
|
||||
// Actual assertion: cofactor atom returns 0.0 due to the AMINOACID type-guard,
|
||||
// NOT 0.99 (the poison value we planted under its residue number).
|
||||
def coaCtx = new cz.siret.prank.features.api.AtomFeatureCalculationContext(protein, coaAtom)
|
||||
double[] result = feature.calculateForAtom(coaAtom, coaCtx)
|
||||
assertEquals(1, result.length)
|
||||
assertEquals(0.0d, result[0], 1e-12,
|
||||
"Cofactor atom must be short-circuited to 0.0 by type-guard, not return the poison value 0.99")
|
||||
}
|
||||
|
||||
@Test
|
||||
void baselineUnchangedWithEmptyCofactorList() {
|
||||
// The "drop-in safety" guarantee: setting -cofactors to nothing matches default
|
||||
def p1 = Protein.load(PDB_1FBL, new LoaderParams())
|
||||
def p2 = Protein.load(PDB_1FBL,
|
||||
new LoaderParams(cofactorHandler: new CofactorHandler([] as List<LigandDefinition>)))
|
||||
|
||||
assertEquals(p1.proteinAtoms.count, p2.proteinAtoms.count)
|
||||
assertEquals(p1.ligands.allIncludingIgnored.size(), p2.ligands.allIncludingIgnored.size())
|
||||
}
|
||||
|
||||
// ===== Audit findings: missing test coverage =====
|
||||
|
||||
@Test
|
||||
void cofactorAtomsAreExposedOnSurface() {
|
||||
// Audit #15: previously only proteinAtoms count was asserted; this verifies
|
||||
// that cofactor atoms participate in surface computation - at least some of
|
||||
// them appear in protein.exposedAtoms (i.e., have SAS points around them).
|
||||
//
|
||||
// Note: total SAS-point count may *decrease* when a cofactor is added because
|
||||
// the cofactor can occlude polymer surface that was previously exposed. The
|
||||
// biologically meaningful assertion is "at least one cofactor heavy atom is
|
||||
// solvent-accessible after extraction."
|
||||
def lp = loaderParamsWith(["COA"])
|
||||
Protein protein = Protein.load(PDB_1T7QA, lp)
|
||||
|
||||
Set<Integer> coaSerials = new HashSet<>()
|
||||
for (List<org.biojava.nbio.structure.Group> gs : protein.cofactorExtractionResult.foundGroups.values()) {
|
||||
for (org.biojava.nbio.structure.Group g : gs) {
|
||||
for (org.biojava.nbio.structure.Atom a : cz.siret.prank.geom.Atoms.allFromGroup(g).withoutHydrogens().list) {
|
||||
coaSerials.add(a.PDBserial)
|
||||
}
|
||||
}
|
||||
}
|
||||
assertFalse(coaSerials.isEmpty(), "Sanity: there should be COA heavy atoms")
|
||||
|
||||
int exposedCofactorAtoms = 0
|
||||
for (org.biojava.nbio.structure.Atom a : protein.exposedAtoms.list) {
|
||||
if (coaSerials.contains(a.PDBserial)) exposedCofactorAtoms++
|
||||
}
|
||||
assertTrue(exposedCofactorAtoms > 0,
|
||||
"At least one COA heavy atom must be in exposedAtoms (Surface.computeExposedAtoms saw it)")
|
||||
}
|
||||
|
||||
@Test
|
||||
void distantCofactorWarningRespectsThreshold() {
|
||||
// Audit #9: distantCofactorWarning must actually log a WARN at the configured
|
||||
// threshold. Uses Log4jCapture against CofactorHandler's logger.
|
||||
def lp = loaderParamsWith(["COA"])
|
||||
Protein protein = Protein.load(PDB_1T7QA, lp)
|
||||
def result = protein.cofactorExtractionResult
|
||||
assertNotNull(result)
|
||||
|
||||
Log4jCapture capture = Log4jCapture.attach(CofactorHandler)
|
||||
try {
|
||||
// maxDist=0 → disabled, no log
|
||||
protein.loaderParams.cofactorHandler.warnDistantCofactors(result, protein.proteinAtoms, 0d, "test")
|
||||
// maxDist=15.0 → COA in 1t7qa is bound near the active site, no log expected
|
||||
protein.loaderParams.cofactorHandler.warnDistantCofactors(result, protein.proteinAtoms, 15.0d, "test")
|
||||
assertTrue(capture.warns().findAll { it.contains("crystallization artifact") }.isEmpty(),
|
||||
"No distant-cofactor WARN expected at sane thresholds")
|
||||
|
||||
// maxDist=0.001 → every cofactor is >0.001 Å from the protein, so a WARN must fire
|
||||
protein.loaderParams.cofactorHandler.warnDistantCofactors(result, protein.proteinAtoms, 0.001d, "test")
|
||||
List<String> warns = capture.warns().findAll { it.contains("crystallization artifact") }
|
||||
assertFalse(warns.isEmpty(),
|
||||
"Expected at least one distant-cofactor WARN at maxDist=0.001 - got: ${capture.warns()}")
|
||||
assertTrue(warns.any { it.contains("COA") },
|
||||
"WARN should name the cofactor (COA): ${warns}")
|
||||
} finally {
|
||||
capture.detach()
|
||||
}
|
||||
assertFalse(result.atoms.empty, "Result should be unchanged after distant-cofactor check")
|
||||
}
|
||||
|
||||
@Test
|
||||
@EnabledIf("has1AHP")
|
||||
void chainExcludedCofactorDiagnosticDoesNotThrow() {
|
||||
// Audit #10: chain-reduction code path with cofactors. Reducing 1AHP to chain A
|
||||
// only and requesting PLP should still find PLP (it exists on both chains).
|
||||
def lp = loaderParamsWith(["PLP"])
|
||||
ThrowingSupplier<Protein> supplier = {
|
||||
Protein.load(PDB_1AHP, ["A"], lp)
|
||||
} as ThrowingSupplier<Protein>
|
||||
Protein protein = assertDoesNotThrow(supplier)
|
||||
assertNotNull(protein.cofactorExtractionResult)
|
||||
// PLP should still be found (on chain A)
|
||||
assertTrue(protein.cofactorExtractionResult.foundGroups.containsKey("PLP"))
|
||||
}
|
||||
|
||||
// ===== Option A regression =====
|
||||
|
||||
@Test
|
||||
void pymolRendererEmitsPerNameSelections() {
|
||||
// Audit #21: every `select X, ...` line must have a non-empty body and use a
|
||||
// valid PyMOL atom-identifier syntax (`id N or id N or ...` or other selectors).
|
||||
// A regression that produced `select cofactor_X, ` with empty body would pass
|
||||
// contains(...) but break PyMOL - so we validate the body explicitly.
|
||||
def lp = loaderParamsWith(["COA"])
|
||||
Protein protein = Protein.load(PDB_1T7QA, lp)
|
||||
def result = protein.cofactorExtractionResult
|
||||
assertNotNull(result)
|
||||
|
||||
String pml = cz.siret.prank.program.visualization.renderers.NewPymolRenderer.cofactorPymolBlock(result)
|
||||
assertTrue(pml.contains("select cofactor_COA,"),
|
||||
"PML should contain per-name selection 'cofactor_COA':\n$pml")
|
||||
assertTrue(pml.contains("select cofactor_atoms,"),
|
||||
"PML should contain aggregate 'cofactor_atoms' selection:\n$pml")
|
||||
assertTrue(pml.contains("cofactor_col"),
|
||||
"PML should define and use cofactor_col:\n$pml")
|
||||
|
||||
// set_color must use the comma form (not `name = [rgb]`) - PyMOL's PML parser is
|
||||
// friendlier to the comma form and that's the convention used by other hardcoded
|
||||
// colour definitions in the renderer.
|
||||
assertTrue(pml.contains("set_color cofactor_col, ["),
|
||||
"set_color must use comma form 'set_color name, [rgb]': $pml")
|
||||
|
||||
// Each `select X, BODY` line: BODY must be non-empty and match `id <int>` (or chained
|
||||
// with ` or id <int>` or another selection name).
|
||||
java.util.regex.Pattern selectRegex = ~/(?m)^select\s+(\S+)\s*,\s*(.+)$/
|
||||
java.util.regex.Matcher m = selectRegex.matcher(pml)
|
||||
int selectLines = 0
|
||||
while (m.find()) {
|
||||
selectLines++
|
||||
String selName = m.group(1)
|
||||
String body = m.group(2).trim()
|
||||
assertFalse(body.isEmpty(), "select '$selName' has empty body in PML:\n$pml")
|
||||
// Body is either `id N (or id N)*` or `name (or name)*` (aggregate selection).
|
||||
boolean validBody = body.matches(/(id\s+\d+)(\s+or\s+id\s+\d+)*/) ||
|
||||
body.matches(/(\S+)(\s+or\s+\S+)*/)
|
||||
assertTrue(validBody, "select '$selName' has invalid body '$body' in PML")
|
||||
}
|
||||
assertTrue(selectLines >= 2, "Expected >=2 select lines (per-name + aggregate), got $selectLines")
|
||||
}
|
||||
|
||||
// ===== Audit #11: aa_mapping collision warning =====
|
||||
|
||||
@Test
|
||||
void aaMappingCollisionWarningEmitted() {
|
||||
// R19: when a cofactor specifier's group name overlaps with the active aa_mapping,
|
||||
// a WARN must surface (Main.run() emits it during startup). The "minimal" aa_mapping
|
||||
// (default) maps MSE→MET and MEN→ASN, so specifying `-cofactors MSE` is a real
|
||||
// collision. We exercise the exact overlap-detection logic from Main.run().
|
||||
String savedMode = AminoAcidMapper.getInstance().mode
|
||||
try {
|
||||
AminoAcidMapper.initialize("minimal")
|
||||
Set<String> activeMappings = AminoAcidMapper.getInstance().getMappings().keySet()
|
||||
assertTrue(activeMappings.contains("MSE"), "Sanity: minimal mapping should contain MSE")
|
||||
|
||||
// Same logic Main.run() executes for the WARN.
|
||||
Set<String> overlapping = new LinkedHashSet<>()
|
||||
for (String spec : ["MSE", "FAD"]) {
|
||||
String name = LigandDefinition.parse(spec.toUpperCase()).groupName?.toUpperCase()
|
||||
if (name != null && activeMappings.contains(name)) overlapping.add(name)
|
||||
}
|
||||
assertEquals(["MSE"] as Set, overlapping,
|
||||
"Detection must flag MSE (collides) but not FAD (no collision)")
|
||||
|
||||
// Also assert the WARN message format used in Main.run() doesn't blow up at format-time.
|
||||
Log4jCapture capture = Log4jCapture.attach(cz.siret.prank.program.Main)
|
||||
try {
|
||||
org.slf4j.LoggerFactory.getLogger(cz.siret.prank.program.Main).warn(
|
||||
"Cofactor specifier(s) name(s) {} are also covered by the active " +
|
||||
"aa_mapping. Cofactor atom features will be computed using the mapped " +
|
||||
"AA's table entries instead of cofactor defaults. Remove the entry " +
|
||||
"from aa_mapping or change the cofactor specifier to fix.", overlapping)
|
||||
List<String> warns = capture.warns().findAll { it.contains("aa_mapping") }
|
||||
assertFalse(warns.isEmpty(), "WARN should reach the capture appender")
|
||||
assertTrue(warns.any { it.contains("[MSE]") || it.contains("MSE") },
|
||||
"WARN should name the colliding specifier: ${warns}")
|
||||
} finally {
|
||||
capture.detach()
|
||||
}
|
||||
} finally {
|
||||
AminoAcidMapper.initialize(savedMode ?: "minimal")
|
||||
}
|
||||
}
|
||||
|
||||
// ===== Audit #19: concurrency safety =====
|
||||
|
||||
@Test
|
||||
void concurrencySafeWithMultipleThreads() {
|
||||
// Dataset processing uses ExecutorService.newFixedThreadPool. Per-item LoaderParams
|
||||
// construction creates a fresh CofactorHandler, but defensive guarantee: even if
|
||||
// a handler instance were shared, extractCofactorAtoms now resets matchedGroups
|
||||
// before populating. This test loads the same structure with 8 parallel threads
|
||||
// (each with its own LoaderParams) and asserts every protein independently has the
|
||||
// expected cofactor atoms.
|
||||
int nThreads = 8
|
||||
ExecutorService pool = Executors.newFixedThreadPool(nThreads)
|
||||
try {
|
||||
List<Future<Protein>> futures = []
|
||||
for (int i = 0; i < nThreads; i++) {
|
||||
Callable<Protein> task = { Protein.load(PDB_1T7QA, loaderParamsWith(["COA"])) } as Callable<Protein>
|
||||
futures << pool.submit(task)
|
||||
}
|
||||
int firstCount = -1
|
||||
for (Future<Protein> f : futures) {
|
||||
Protein p = f.get(60, TimeUnit.SECONDS)
|
||||
assertNotNull(p.cofactorExtractionResult)
|
||||
assertTrue(p.cofactorExtractionResult.foundGroups.containsKey("COA"))
|
||||
int atoms = p.cofactorExtractionResult.atoms.count
|
||||
if (firstCount < 0) firstCount = atoms
|
||||
else assertEquals(firstCount, atoms,
|
||||
"Parallel loads must produce identical cofactor atom counts")
|
||||
}
|
||||
} finally {
|
||||
pool.shutdownNow()
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
void unmatchedSpecifierAppearsInPymolComment() {
|
||||
// Audit #20 sibling: unmatched specifiers show up as a diagnostic comment in PML.
|
||||
// Use a precise specifier that won't match.
|
||||
def lp = loaderParamsWith(["COA", "FAD[group_id:Z_999]"])
|
||||
Protein protein = Protein.load(PDB_1T7QA, lp)
|
||||
def result = protein.cofactorExtractionResult
|
||||
assertNotNull(result)
|
||||
assertEquals(["FAD[group_id:Z_999]"], result.unmatchedSpecifiers)
|
||||
|
||||
String pml = cz.siret.prank.program.visualization.renderers.NewPymolRenderer.cofactorPymolBlock(result)
|
||||
assertTrue(pml.contains("# unmatched specifiers"),
|
||||
"PML should include unmatched-specifiers diagnostic:\n$pml")
|
||||
assertTrue(pml.contains("FAD[group_id:Z_999]"),
|
||||
"PML diagnostic should name the unmatched specifier:\n$pml")
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,123 @@
|
||||
package cz.siret.prank.program.routines.analyze
|
||||
|
||||
import cz.siret.prank.domain.CofactorHandler
|
||||
import cz.siret.prank.domain.Dataset
|
||||
import cz.siret.prank.program.params.Params
|
||||
import groovy.transform.CompileStatic
|
||||
import org.junit.jupiter.api.AfterAll
|
||||
import org.junit.jupiter.api.BeforeAll
|
||||
import org.junit.jupiter.api.Test
|
||||
import org.junit.jupiter.api.parallel.Isolated
|
||||
import org.junit.jupiter.api.parallel.ResourceLock
|
||||
|
||||
import static cz.siret.prank.domain.Dataset.LigandDefinition
|
||||
import static org.junit.jupiter.api.Assertions.*
|
||||
|
||||
/**
|
||||
* Tests for the building blocks of {@code analyze cofactors} (see local/PLAN_COFACTORS_ANALYZE.md §7).
|
||||
*
|
||||
* The {@code cmdCofactors} subcommand itself is exercised end-to-end by the smoke harness
|
||||
* (misc/test-scripts/testsets.sh, function {@code cofactors()}) because {@code AnalyzeRoutine}
|
||||
* requires a {@code Main} instance and {@code Main.findInstallDir} hard-codes paths.
|
||||
*
|
||||
* Here we cover the testable pieces under cmdCofactors:
|
||||
* - {@code Dataset.resolveCofactorDefinitions} - per-item precedence (column overrides global)
|
||||
* - {@code CofactorHandler.parseAndValidate} - case normalization, bracket-aware split
|
||||
* - {@code DataTable.toCsv} CSV quoting for cofactor-shaped data (cross-referenced in DataTableCsvTest)
|
||||
*/
|
||||
@Isolated
|
||||
@ResourceLock("Params")
|
||||
@CompileStatic
|
||||
class CofactorAnalyzeTest {
|
||||
|
||||
static Params savedParams
|
||||
|
||||
@BeforeAll
|
||||
static void setup() {
|
||||
savedParams = (Params) Params.inst.clone()
|
||||
Params.INSTANCE = new Params()
|
||||
}
|
||||
|
||||
@AfterAll
|
||||
static void tearDown() {
|
||||
Params.INSTANCE = savedParams
|
||||
}
|
||||
|
||||
@Test
|
||||
void resolveCofactorDefinitions_columnOverridesGlobal() {
|
||||
// Per-item resolution: the dataset's `cofactors` column wins over the global
|
||||
// Params.cofactors. This is the precedence rule analyze cofactors uses
|
||||
// (see AnalyzeRoutine.cmdCofactors and Dataset.getLoaderParams).
|
||||
Params.inst.cofactors = ["FAD"]
|
||||
Dataset ds = new Dataset("test", "/tmp")
|
||||
Map<String, String> cols = ["protein": "x.pdb", "cofactors": "PLP"]
|
||||
Dataset.Item item = ds.createNewItemForSingleFileDs("x.pdb", cols)
|
||||
|
||||
List<LigandDefinition> defs = ds.resolveCofactorDefinitions(item)
|
||||
assertEquals(1, defs.size())
|
||||
assertEquals("PLP", defs[0].groupName, "Column value PLP should win over global FAD")
|
||||
}
|
||||
|
||||
@Test
|
||||
void resolveCofactorDefinitions_emptyColumnFallsBackToGlobal() {
|
||||
Params.inst.cofactors = ["HEM"]
|
||||
Dataset ds = new Dataset("test", "/tmp")
|
||||
// Column present but empty -> fall back to global
|
||||
Map<String, String> cols = ["protein": "x.pdb", "cofactors": ""]
|
||||
Dataset.Item item = ds.createNewItemForSingleFileDs("x.pdb", cols)
|
||||
|
||||
List<LigandDefinition> defs = ds.resolveCofactorDefinitions(item)
|
||||
assertEquals(1, defs.size())
|
||||
assertEquals("HEM", defs[0].groupName, "Empty column should defer to global Params.cofactors")
|
||||
}
|
||||
|
||||
@Test
|
||||
void resolveCofactorDefinitions_missingColumnUsesGlobal() {
|
||||
Params.inst.cofactors = ["FMN"]
|
||||
Dataset ds = new Dataset("test", "/tmp")
|
||||
Map<String, String> cols = ["protein": "x.pdb"] // no cofactors column at all
|
||||
Dataset.Item item = ds.createNewItemForSingleFileDs("x.pdb", cols)
|
||||
|
||||
List<LigandDefinition> defs = ds.resolveCofactorDefinitions(item)
|
||||
assertEquals(1, defs.size())
|
||||
assertEquals("FMN", defs[0].groupName)
|
||||
}
|
||||
|
||||
@Test
|
||||
void resolveCofactorDefinitions_bothEmptyReturnsEmpty() {
|
||||
Params.inst.cofactors = []
|
||||
Dataset ds = new Dataset("test", "/tmp")
|
||||
Map<String, String> cols = ["protein": "x.pdb"]
|
||||
Dataset.Item item = ds.createNewItemForSingleFileDs("x.pdb", cols)
|
||||
|
||||
assertTrue(ds.resolveCofactorDefinitions(item).isEmpty())
|
||||
}
|
||||
|
||||
@Test
|
||||
void resolveCofactorDefinitions_preservesContactResIdsCommas() {
|
||||
// Plan §"Stage 4": a `contact_res_ids` specifier with commas must round-trip through
|
||||
// column splitting unscathed. The bracket-aware splitter inside parseAndValidate is
|
||||
// what makes this work.
|
||||
Params.inst.cofactors = []
|
||||
Dataset ds = new Dataset("test", "/tmp")
|
||||
Map<String, String> cols = ["protein": "x.pdb",
|
||||
"cofactors": "FAD[contact_res_ids:A_D246,A_T259,A_E423]"]
|
||||
Dataset.Item item = ds.createNewItemForSingleFileDs("x.pdb", cols)
|
||||
|
||||
List<LigandDefinition> defs = ds.resolveCofactorDefinitions(item)
|
||||
assertEquals(1, defs.size(),
|
||||
"contact_res_ids value with inner commas must remain one specifier, got: ${defs*.originalString}")
|
||||
assertEquals("FAD", defs[0].groupName)
|
||||
assertTrue(defs[0].originalString.contains("contact_res_ids:A_D246,A_T259,A_E423"),
|
||||
"originalString must preserve all three residue ids: ${defs[0].originalString}")
|
||||
}
|
||||
|
||||
@Test
|
||||
void parseAndValidate_dropsBlankSpecifiers() {
|
||||
// A trailing comma in the dataset column produces an empty token; it must be silently dropped.
|
||||
List<LigandDefinition> defs = CofactorHandler.parseAndValidate("FAD,,PLP,")
|
||||
assertEquals(2, defs.size())
|
||||
assertEquals("FAD", defs[0].groupName)
|
||||
assertEquals("PLP", defs[1].groupName)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,105 @@
|
||||
package cz.siret.prank.program.routines.analyze
|
||||
|
||||
import groovy.transform.CompileStatic
|
||||
import org.junit.jupiter.api.Test
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.*
|
||||
|
||||
/**
|
||||
* CSV-quoting regression tests for DataTable.toCsv.
|
||||
*
|
||||
* Covers audit finding #6 - cell values containing commas, double-quotes, or newlines
|
||||
* must be RFC-4180 quoted so they don't break the CSV row structure.
|
||||
*/
|
||||
@CompileStatic
|
||||
class DataTableCsvTest {
|
||||
|
||||
@Test
|
||||
void plainValuesAreNotQuoted() {
|
||||
DataTable dt = new DataTable("protein", "a", "b")
|
||||
dt.newRow("p1").put("a", "x").put("b", "y")
|
||||
String csv = dt.toCsv()
|
||||
assertTrue(csv.contains("p1, x, y"), "Plain values should not be quoted: $csv")
|
||||
}
|
||||
|
||||
@Test
|
||||
void valuesWithCommasAreQuoted() {
|
||||
// E.g. a cofactor specifier "FAD[contact_res_ids:A_D246,A_T259]" - the inner
|
||||
// comma must not break the CSV row.
|
||||
DataTable dt = new DataTable("protein", "specifier")
|
||||
dt.newRow("p1").put("specifier", "FAD[contact_res_ids:A_D246,A_T259]")
|
||||
String csv = dt.toCsv()
|
||||
assertTrue(csv.contains('"FAD[contact_res_ids:A_D246,A_T259]"'),
|
||||
"Comma-containing value should be wrapped in double quotes: $csv")
|
||||
}
|
||||
|
||||
@Test
|
||||
void valuesWithDoubleQuotesAreEscaped() {
|
||||
DataTable dt = new DataTable("protein", "note")
|
||||
dt.newRow("p1").put("note", 'has a " quote')
|
||||
String csv = dt.toCsv()
|
||||
assertTrue(csv.contains('"has a "" quote"'),
|
||||
"Inner double quote should be doubled: $csv")
|
||||
}
|
||||
|
||||
@Test
|
||||
void columnHeadersWithCommasAreQuoted() {
|
||||
DataTable dt = new DataTable("protein", "weird,header")
|
||||
dt.newRow("p1").put("weird,header", "v")
|
||||
String csv = dt.toCsv()
|
||||
assertTrue(csv.contains('"weird,header"'),
|
||||
"Header containing comma should be quoted: $csv")
|
||||
}
|
||||
|
||||
@Test
|
||||
void labelWithCommaIsQuoted() {
|
||||
DataTable dt = new DataTable("protein", "a")
|
||||
dt.newRow("weird,label").put("a", "v")
|
||||
String csv = dt.toCsv()
|
||||
assertTrue(csv.contains('"weird,label"'),
|
||||
"Row label containing comma should be quoted: $csv")
|
||||
}
|
||||
|
||||
@Test
|
||||
void valueWithNewlineIsQuoted() {
|
||||
// RFC-4180: cells containing LF or CR must be quoted, else parsers split mid-row.
|
||||
DataTable dt = new DataTable("protein", "note")
|
||||
dt.newRow("p1").put("note", "line1\nline2")
|
||||
String csv = dt.toCsv()
|
||||
assertTrue(csv.contains('"line1\nline2"'),
|
||||
"Newline-containing value must be wrapped in double quotes: $csv")
|
||||
}
|
||||
|
||||
@Test
|
||||
void valueWithCarriageReturnIsQuoted() {
|
||||
// RFC-4180: \r alone (rare but possible from Windows-origin labels) must trigger quoting,
|
||||
// otherwise the unquoted CR mid-cell mis-splits in strict parsers. Audit fix #5.
|
||||
DataTable dt = new DataTable("protein", "note")
|
||||
dt.newRow("p1").put("note", "line1\rline2")
|
||||
String csv = dt.toCsv()
|
||||
assertTrue(csv.contains('"line1\rline2"'),
|
||||
"CR-containing value must be wrapped in double quotes: $csv")
|
||||
}
|
||||
|
||||
@Test
|
||||
void nullValueRendersAsEmpty() {
|
||||
DataTable dt = new DataTable("protein", "a")
|
||||
dt.newRow("p1") // no put() -> value remains null
|
||||
String csv = dt.toCsv()
|
||||
// Row line should be `p1, ` (label, then empty cell). No quotes, no "null".
|
||||
assertTrue(csv.contains("p1, \n") || csv.endsWith("p1, "),
|
||||
"Null cell must render as empty (not 'null'): $csv")
|
||||
}
|
||||
|
||||
@Test
|
||||
void preQuotedValueGetsItsQuotesEscaped() {
|
||||
// If the user's value already starts/ends with double-quotes, those inner quotes
|
||||
// must be doubled (RFC-4180 escape) - otherwise CSV parsers see a quoted field
|
||||
// terminating prematurely.
|
||||
DataTable dt = new DataTable("protein", "note")
|
||||
dt.newRow("p1").put("note", '"already quoted"')
|
||||
String csv = dt.toCsv()
|
||||
assertTrue(csv.contains('"""already quoted"""'),
|
||||
"Pre-quoted value must have inner quotes doubled: $csv")
|
||||
}
|
||||
}
|
||||
86
src/test/groovy/cz/siret/prank/test/Log4jCapture.groovy
Normal file
86
src/test/groovy/cz/siret/prank/test/Log4jCapture.groovy
Normal file
@@ -0,0 +1,86 @@
|
||||
package cz.siret.prank.test
|
||||
|
||||
import groovy.transform.CompileStatic
|
||||
import org.apache.logging.log4j.Level
|
||||
import org.apache.logging.log4j.LogManager
|
||||
import org.apache.logging.log4j.core.LogEvent
|
||||
import org.apache.logging.log4j.core.Logger
|
||||
import org.apache.logging.log4j.core.LoggerContext
|
||||
import org.apache.logging.log4j.core.appender.AbstractAppender
|
||||
import org.apache.logging.log4j.core.config.Property
|
||||
|
||||
/**
|
||||
* Test-only helper that captures log4j2 events from a target class's logger.
|
||||
*
|
||||
* Usage:
|
||||
* <pre>
|
||||
* def capture = Log4jCapture.attach(MyClass)
|
||||
* try {
|
||||
* // act
|
||||
* assertTrue capture.warns().any { it.contains("expected text") }
|
||||
* } finally {
|
||||
* capture.detach()
|
||||
* }
|
||||
* </pre>
|
||||
*/
|
||||
@CompileStatic
|
||||
class Log4jCapture {
|
||||
|
||||
private final Logger logger
|
||||
private final CapturingAppender appender
|
||||
private final Level previousLevel
|
||||
|
||||
private Log4jCapture(Logger logger, CapturingAppender appender, Level previousLevel) {
|
||||
this.logger = logger
|
||||
this.appender = appender
|
||||
this.previousLevel = previousLevel
|
||||
}
|
||||
|
||||
static Log4jCapture attach(Class<?> targetClass) {
|
||||
Logger lg = (Logger) LogManager.getLogger(targetClass)
|
||||
Level prev = lg.level
|
||||
if (prev == null || prev.intLevel() < Level.WARN.intLevel()) {
|
||||
// intLevel() is smaller for more verbose levels - we want WARN or finer to be captured.
|
||||
}
|
||||
CapturingAppender appender = new CapturingAppender("Capture-" + targetClass.simpleName)
|
||||
appender.start()
|
||||
lg.addAppender(appender)
|
||||
// Ensure WARN+ propagates regardless of root config.
|
||||
lg.setLevel(Level.DEBUG)
|
||||
return new Log4jCapture(lg, appender, prev)
|
||||
}
|
||||
|
||||
void detach() {
|
||||
logger.removeAppender(appender)
|
||||
logger.setLevel(previousLevel)
|
||||
appender.stop()
|
||||
}
|
||||
|
||||
List<String> messages() {
|
||||
return appender.events.collect { it.message.formattedMessage }
|
||||
}
|
||||
|
||||
List<String> warns() {
|
||||
return appender.events.findAll { it.level == Level.WARN }
|
||||
.collect { it.message.formattedMessage }
|
||||
}
|
||||
|
||||
List<String> infos() {
|
||||
return appender.events.findAll { it.level == Level.INFO }
|
||||
.collect { it.message.formattedMessage }
|
||||
}
|
||||
|
||||
@CompileStatic
|
||||
private static class CapturingAppender extends AbstractAppender {
|
||||
final List<LogEvent> events = Collections.synchronizedList(new ArrayList<LogEvent>())
|
||||
|
||||
CapturingAppender(String name) {
|
||||
super(name, null, null, false, Property.EMPTY_ARRAY)
|
||||
}
|
||||
|
||||
@Override
|
||||
void append(LogEvent event) {
|
||||
events.add(event.toImmutable())
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user