From 026be7eae53adef22388d049a7a6b0660f8c125b Mon Sep 17 00:00:00 2001 From: rdk Date: Tue, 3 Mar 2026 21:58:55 +0100 Subject: [PATCH] Improve analyze binding-sites: visualizations, site radius, eager loading - Add PyMol visualizations using dataset.binaryResidueLabeler - Add site_radius column (max distance from centroid to any site atom) - Add excludeFromSummary param to DataTable.formatSummaryTable to skip center coordinates from numeric summary stats - Load ExplicitSitesIndex eagerly during dataset loading (fail-fast) - Skip CSV rows with empty residue/coordinate fields in AhojUbsSiteParser - Write items without binding sites to separate file in outdir --- .../cz/siret/prank/domain/Dataset.groovy | 9 +++- .../domain/loaders/AhojUbsSiteParser.groovy | 9 ++++ .../domain/loaders/DatasetItemLoader.groovy | 1 - .../routines/analyze/AnalyzeRoutine.groovy | 51 +++++++++++++++---- .../program/routines/analyze/DataTable.groovy | 7 +-- 5 files changed, 62 insertions(+), 15 deletions(-) diff --git a/src/main/groovy/cz/siret/prank/domain/Dataset.groovy b/src/main/groovy/cz/siret/prank/domain/Dataset.groovy index 7cb522e5..a7d815d2 100644 --- a/src/main/groovy/cz/siret/prank/domain/Dataset.groovy +++ b/src/main/groovy/cz/siret/prank/domain/Dataset.groovy @@ -450,12 +450,15 @@ class Dataset implements Parametrized, Writable, Failable { } ExplicitSitesIndex getExplicitSitesIndex() { - if (explicitSitesIndex == null && hasExplicitSites()) { + return explicitSitesIndex + } + + private void loadExplicitSitesIndex() { + if (hasExplicitSites()) { String format = attributes.get(PARAM_EXPLICIT_SITES_FORMAT) String file = dir + "/" + attributes.get(PARAM_EXPLICIT_SITES_FILE) explicitSitesIndex = ExplicitSitesIndex.loadFromFile(format, file) } - return explicitSitesIndex } /** @@ -578,6 +581,8 @@ class Dataset implements Parametrized, Writable, Failable { throw new PrankException("dataset contains invalid files") } + dataset.loadExplicitSitesIndex() + return dataset } diff --git 
a/src/main/groovy/cz/siret/prank/domain/loaders/AhojUbsSiteParser.groovy b/src/main/groovy/cz/siret/prank/domain/loaders/AhojUbsSiteParser.groovy index 6c8a8982..d64fec73 100644 --- a/src/main/groovy/cz/siret/prank/domain/loaders/AhojUbsSiteParser.groovy +++ b/src/main/groovy/cz/siret/prank/domain/loaders/AhojUbsSiteParser.groovy @@ -21,11 +21,17 @@ class AhojUbsSiteParser { Map> byFilename = new LinkedHashMap<>() int totalSites = 0 + int skippedEmpty = 0 for (String line : lines.tail()) { if (StringUtils.isBlank(line)) continue String[] cols = line.split(",", -1) + if (cols.length < 6 || StringUtils.isBlank(cols[5])) { /* rows too short to contain the residue column (index 5) are skipped like rows where it is blank, instead of failing on cols[5] */ + skippedEmpty++ + continue + } + String siteId = cols[1] String filename = cols[4] List residueIds = Sutils.splitOnWhitespace(cols[5]) @@ -39,6 +45,9 @@ class AhojUbsSiteParser { totalSites++ } + if (skippedEmpty > 0) { + log.warn "Skipped {} rows with empty residue/coordinate fields in [{}]", skippedEmpty, filePath + } log.info "Loaded explicit sites index: {} sites for {} proteins from [{}]", totalSites, byFilename.size(), filePath diff --git a/src/main/groovy/cz/siret/prank/domain/loaders/DatasetItemLoader.groovy b/src/main/groovy/cz/siret/prank/domain/loaders/DatasetItemLoader.groovy index 936e80ef..e837068a 100644 --- a/src/main/groovy/cz/siret/prank/domain/loaders/DatasetItemLoader.groovy +++ b/src/main/groovy/cz/siret/prank/domain/loaders/DatasetItemLoader.groovy @@ -65,7 +65,6 @@ class DatasetItemLoader implements Parametrized, Writable { ExplicitSitesIndex index = item.originDataset.explicitSitesIndex res.holoProtein.sites = index.resolveForProtein(res.holoProtein, item.proteinFile) log.info "Loaded {} explicit sites for [{}]", res.holoProtein.sites.size(), item.label - write " sites: ${res.holoProtein.sites.size()}" } ProcessedItemContext itemContext = item.context diff --git a/src/main/groovy/cz/siret/prank/program/routines/analyze/AnalyzeRoutine.groovy b/src/main/groovy/cz/siret/prank/program/routines/analyze/AnalyzeRoutine.groovy index a6ec3821..81493ebc 
100644 --- a/src/main/groovy/cz/siret/prank/program/routines/analyze/AnalyzeRoutine.groovy +++ b/src/main/groovy/cz/siret/prank/program/routines/analyze/AnalyzeRoutine.groovy @@ -167,7 +167,7 @@ class AnalyzeRoutine extends Routine { void cmdBindingSites() { DataTable dt = new DataTable("protein", "site_label", "site_type", - "n_atoms", "n_residues", "residue_ids", + "n_atoms", "n_residues", "site_radius", "residue_ids", "center_x", "center_y", "center_z", "lig_name", "lig_code", "lig_chain", "contact_dist", "center_to_prot_dist" @@ -183,8 +183,8 @@ class AnalyzeRoutine extends Routine { // Explicit-site-specific counters AtomicInteger totalSkippedSites = new AtomicInteger() AtomicInteger totalUnresolvedResidues = new AtomicInteger() - AtomicInteger proteinsWithSites = new AtomicInteger() - AtomicInteger proteinsWithoutSites = new AtomicInteger() + + Queue itemsWithoutSites = new ConcurrentLinkedQueue<>() def res = dataset.processItems { Dataset.Item item -> Protein p = item.protein @@ -195,9 +195,7 @@ class AnalyzeRoutine extends Routine { List sites = p.sites ?: [] if (sites.isEmpty()) { - proteinsWithoutSites.incrementAndGet() - } else { - proteinsWithSites.incrementAndGet() + itemsWithoutSites.add(item.row) } // Track unresolved: compare defs vs resolved sites @@ -226,12 +224,16 @@ class AnalyzeRoutine extends Routine { .put("site_type", "explicit") .put("n_atoms", site.atoms.count) .put("n_residues", site.residues.size()) + .put("site_radius", siteRadius(c, site.atoms)) .put("residue_ids", formatResidueIds(site.residues)) .put("center_x", c.x) .put("center_y", c.y) .put("center_z", c.z) } } else { + if (p.relevantLigands.isEmpty()) { + itemsWithoutSites.add(item.row) + } double cutoff = params.ligand_protein_contact_distance for (Ligand lig : p.relevantLigands) { Atom c = lig.centroid @@ -243,6 +245,7 @@ class AnalyzeRoutine extends Routine { .put("site_type", "ligand") .put("n_atoms", lig.size) .put("n_residues", contactResidues.size()) + .put("site_radius", 
siteRadius(c, lig.atoms)) .put("residue_ids", formatResidueIds(contactResidues)) .put("center_x", c.x) .put("center_y", c.y) @@ -258,31 +261,52 @@ class AnalyzeRoutine extends Routine { totalSmall.addAndGet(p.ligands.smallLigandCount) totalDistant.addAndGet(p.ligands.distantLigandCount) } + + if (params.visualizations) { + BinaryLabeling labeling = item.binaryLabeling + if (labeling != null) { + new NewPymolRenderer("$outdir/visualizations", new RenderingModel( + proteinFile: item.proteinFile, + label: item.label, + protein: p, + observedLabeling: labeling + )).render() + } + } } writeFile "$outdir/binding_sites.csv", dt.toCsv() Map extraInfo = new LinkedHashMap<>() + int noSiteCount = itemsWithoutSites.size() if (hasExplicitSites) { extraInfo.put("Site source:", "explicit") extraInfo.put("Sites format:", dataset.attributes.get(Dataset.PARAM_EXPLICIT_SITES_FORMAT)) extraInfo.put("Sites file:", dataset.attributes.get(Dataset.PARAM_EXPLICIT_SITES_FILE)) - extraInfo.put("Proteins with sites:", proteinsWithSites.get()) - extraInfo.put("Proteins without sites:", proteinsWithoutSites.get()) + extraInfo.put("Proteins with sites:", dataset.size - noSiteCount - res.errorCount) + extraInfo.put("Proteins without sites:", noSiteCount) extraInfo.put("Sites skipped (no residues):", totalSkippedSites.get()) extraInfo.put("Unresolved residues:", totalUnresolvedResidues.get()) } else { extraInfo.put("Site source:", "ligands") + extraInfo.put("Proteins without ligands:", noSiteCount) extraInfo.put("Ignored ligands:", totalIgnored.get()) extraInfo.put("Small ligands:", totalSmall.get()) extraInfo.put("Distant ligands:", totalDistant.get()) } extraInfo.put("Errors:", res.errorCount) - String summary = dt.formatSummaryTable("Binding Sites Summary", extraInfo) + Set noSummary = ["center_x", "center_y", "center_z"] as Set + String summary = dt.formatSummaryTable("Binding Sites Summary", extraInfo, noSummary) write summary writeFile "$outdir/binding_sites_summary.txt", summary + if 
(!itemsWithoutSites.isEmpty()) { + String noSitesFile = "$outdir/items_without_sites.txt" + writeFile noSitesFile, itemsWithoutSites.toSorted().join("\n") + "\n" + write "NOTE: $noSiteCount of ${dataset.size} items have no binding sites. List written to [$noSitesFile]" + } + write "Processed ${dataset.size} items" write res.errorSummary @@ -295,6 +319,15 @@ class AnalyzeRoutine extends Routine { }.join(" ") } + private static double siteRadius(Atom centroid, Atoms atoms) { + double maxDist = 0 + for (Atom a : atoms) { + double d = Struct.dist(centroid, a) + if (d > maxDist) maxDist = d + } + return maxDist + } + void cmdPeptides() { LoaderParams.ignoreLigandsSwitch = true diff --git a/src/main/groovy/cz/siret/prank/program/routines/analyze/DataTable.groovy b/src/main/groovy/cz/siret/prank/program/routines/analyze/DataTable.groovy index 39f68f1b..21d8bdef 100644 --- a/src/main/groovy/cz/siret/prank/program/routines/analyze/DataTable.groovy +++ b/src/main/groovy/cz/siret/prank/program/routines/analyze/DataTable.groovy @@ -115,9 +115,10 @@ class DataTable { /** * Returns indices of columns that have at least one numeric value. */ - private List getNumericColumnIndices() { + private List getNumericColumnIndices(Set exclude = Collections.emptySet()) { List result = new ArrayList<>() for (int i = 0; i < columns.length; i++) { + if (exclude.contains(columns[i])) continue int ci = i if (getRows().any { Row row -> row.values[ci] instanceof Number }) { result.add(i) @@ -142,8 +143,8 @@ class DataTable { return result } - String formatSummaryTable(String title = "Dataset Summary", Map extraInfo = [:]) { - List numColIndices = getNumericColumnIndices() + String formatSummaryTable(String title = "Dataset Summary", Map extraInfo = [:], Set excludeFromSummary = Collections.emptySet()) { + List numColIndices = getNumericColumnIndices(excludeFromSummary) int n = size() StringBuilder table = new StringBuilder()