Improve analyze binding-sites: visualizations, site radius, eager loading

- Add PyMol visualizations using dataset.binaryResidueLabeler
- Add site_radius column (max distance from centroid to any site atom)
- Add excludeFromSummary param to DataTable.formatSummaryTable to skip
  center coordinates from numeric summary stats
- Load ExplicitSitesIndex eagerly during dataset loading (fail-fast)
- Skip CSV rows with empty residue/coordinate fields in AhojUbsSiteParser
- Write items without binding sites to separate file in outdir
This commit is contained in:
rdk
2026-03-03 21:58:55 +01:00
parent 9e9a500836
commit 026be7eae5
5 changed files with 62 additions and 15 deletions

View File

@@ -450,12 +450,15 @@ class Dataset implements Parametrized, Writable, Failable {
}
ExplicitSitesIndex getExplicitSitesIndex() {
if (explicitSitesIndex == null && hasExplicitSites()) {
return explicitSitesIndex
}
private void loadExplicitSitesIndex() {
if (hasExplicitSites()) {
String format = attributes.get(PARAM_EXPLICIT_SITES_FORMAT)
String file = dir + "/" + attributes.get(PARAM_EXPLICIT_SITES_FILE)
explicitSitesIndex = ExplicitSitesIndex.loadFromFile(format, file)
}
return explicitSitesIndex
}
/**
@@ -578,6 +581,8 @@ class Dataset implements Parametrized, Writable, Failable {
throw new PrankException("dataset contains invalid files")
}
dataset.loadExplicitSitesIndex()
return dataset
}

View File

@@ -21,11 +21,17 @@ class AhojUbsSiteParser {
Map<String, List<ExplicitSitesIndex.SiteDef>> byFilename = new LinkedHashMap<>()
int totalSites = 0
int skippedEmpty = 0
for (String line : lines.tail()) {
if (StringUtils.isBlank(line)) continue
String[] cols = line.split(",", -1)
if (StringUtils.isBlank(cols[5])) {
skippedEmpty++
continue
}
String siteId = cols[1]
String filename = cols[4]
List<String> residueIds = Sutils.splitOnWhitespace(cols[5])
@@ -39,6 +45,9 @@ class AhojUbsSiteParser {
totalSites++
}
if (skippedEmpty > 0) {
log.warn "Skipped {} rows with empty residue/coordinate fields in [{}]", skippedEmpty, filePath
}
log.info "Loaded explicit sites index: {} sites for {} proteins from [{}]",
totalSites, byFilename.size(), filePath

View File

@@ -65,7 +65,6 @@ class DatasetItemLoader implements Parametrized, Writable {
ExplicitSitesIndex index = item.originDataset.explicitSitesIndex
res.holoProtein.sites = index.resolveForProtein(res.holoProtein, item.proteinFile)
log.info "Loaded {} explicit sites for [{}]", res.holoProtein.sites.size(), item.label
write " sites: ${res.holoProtein.sites.size()}"
}
ProcessedItemContext itemContext = item.context

View File

@@ -167,7 +167,7 @@ class AnalyzeRoutine extends Routine {
void cmdBindingSites() {
DataTable dt = new DataTable("protein",
"site_label", "site_type",
"n_atoms", "n_residues", "residue_ids",
"n_atoms", "n_residues", "site_radius", "residue_ids",
"center_x", "center_y", "center_z",
"lig_name", "lig_code", "lig_chain",
"contact_dist", "center_to_prot_dist"
@@ -183,8 +183,8 @@ class AnalyzeRoutine extends Routine {
// Explicit-site-specific counters
AtomicInteger totalSkippedSites = new AtomicInteger()
AtomicInteger totalUnresolvedResidues = new AtomicInteger()
AtomicInteger proteinsWithSites = new AtomicInteger()
AtomicInteger proteinsWithoutSites = new AtomicInteger()
Queue<String> itemsWithoutSites = new ConcurrentLinkedQueue<>()
def res = dataset.processItems { Dataset.Item item ->
Protein p = item.protein
@@ -195,9 +195,7 @@ class AnalyzeRoutine extends Routine {
List<ResidueSite> sites = p.sites ?: []
if (sites.isEmpty()) {
proteinsWithoutSites.incrementAndGet()
} else {
proteinsWithSites.incrementAndGet()
itemsWithoutSites.add(item.row)
}
// Track unresolved: compare defs vs resolved sites
@@ -226,12 +224,16 @@ class AnalyzeRoutine extends Routine {
.put("site_type", "explicit")
.put("n_atoms", site.atoms.count)
.put("n_residues", site.residues.size())
.put("site_radius", siteRadius(c, site.atoms))
.put("residue_ids", formatResidueIds(site.residues))
.put("center_x", c.x)
.put("center_y", c.y)
.put("center_z", c.z)
}
} else {
if (p.relevantLigands.isEmpty()) {
itemsWithoutSites.add(item.row)
}
double cutoff = params.ligand_protein_contact_distance
for (Ligand lig : p.relevantLigands) {
Atom c = lig.centroid
@@ -243,6 +245,7 @@ class AnalyzeRoutine extends Routine {
.put("site_type", "ligand")
.put("n_atoms", lig.size)
.put("n_residues", contactResidues.size())
.put("site_radius", siteRadius(c, lig.atoms))
.put("residue_ids", formatResidueIds(contactResidues))
.put("center_x", c.x)
.put("center_y", c.y)
@@ -258,31 +261,52 @@ class AnalyzeRoutine extends Routine {
totalSmall.addAndGet(p.ligands.smallLigandCount)
totalDistant.addAndGet(p.ligands.distantLigandCount)
}
if (params.visualizations) {
BinaryLabeling labeling = item.binaryLabeling
if (labeling != null) {
new NewPymolRenderer("$outdir/visualizations", new RenderingModel(
proteinFile: item.proteinFile,
label: item.label,
protein: p,
observedLabeling: labeling
)).render()
}
}
}
writeFile "$outdir/binding_sites.csv", dt.toCsv()
Map<String, Object> extraInfo = new LinkedHashMap<>()
int noSiteCount = itemsWithoutSites.size()
if (hasExplicitSites) {
extraInfo.put("Site source:", "explicit")
extraInfo.put("Sites format:", dataset.attributes.get(Dataset.PARAM_EXPLICIT_SITES_FORMAT))
extraInfo.put("Sites file:", dataset.attributes.get(Dataset.PARAM_EXPLICIT_SITES_FILE))
extraInfo.put("Proteins with sites:", proteinsWithSites.get())
extraInfo.put("Proteins without sites:", proteinsWithoutSites.get())
extraInfo.put("Proteins with sites:", dataset.size - noSiteCount - res.errorCount)
extraInfo.put("Proteins without sites:", noSiteCount)
extraInfo.put("Sites skipped (no residues):", totalSkippedSites.get())
extraInfo.put("Unresolved residues:", totalUnresolvedResidues.get())
} else {
extraInfo.put("Site source:", "ligands")
extraInfo.put("Proteins without ligands:", noSiteCount)
extraInfo.put("Ignored ligands:", totalIgnored.get())
extraInfo.put("Small ligands:", totalSmall.get())
extraInfo.put("Distant ligands:", totalDistant.get())
}
extraInfo.put("Errors:", res.errorCount)
String summary = dt.formatSummaryTable("Binding Sites Summary", extraInfo)
Set<String> noSummary = ["center_x", "center_y", "center_z"] as Set
String summary = dt.formatSummaryTable("Binding Sites Summary", extraInfo, noSummary)
write summary
writeFile "$outdir/binding_sites_summary.txt", summary
if (!itemsWithoutSites.isEmpty()) {
String noSitesFile = "$outdir/items_without_sites.txt"
writeFile noSitesFile, itemsWithoutSites.toSorted().join("\n") + "\n"
write "NOTE: $noSiteCount of ${dataset.size} items have no binding sites. List written to [$noSitesFile]"
}
write "Processed ${dataset.size} items"
write res.errorSummary
@@ -295,6 +319,15 @@ class AnalyzeRoutine extends Routine {
}.join(" ")
}
private static double siteRadius(Atom centroid, Atoms atoms) {
double maxDist = 0
for (Atom a : atoms) {
double d = Struct.dist(centroid, a)
if (d > maxDist) maxDist = d
}
return maxDist
}
void cmdPeptides() {
LoaderParams.ignoreLigandsSwitch = true

View File

@@ -115,9 +115,10 @@ class DataTable {
/**
* Returns indices of columns that have at least one numeric value.
*/
private List<Integer> getNumericColumnIndices() {
private List<Integer> getNumericColumnIndices(Set<String> exclude = Collections.emptySet()) {
List<Integer> result = new ArrayList<>()
for (int i = 0; i < columns.length; i++) {
if (exclude.contains(columns[i])) continue
int ci = i
if (getRows().any { Row row -> row.values[ci] instanceof Number }) {
result.add(i)
@@ -142,8 +143,8 @@ class DataTable {
return result
}
String formatSummaryTable(String title = "Dataset Summary", Map<String, Object> extraInfo = [:]) {
List<Integer> numColIndices = getNumericColumnIndices()
String formatSummaryTable(String title = "Dataset Summary", Map<String, Object> extraInfo = [:], Set<String> excludeFromSummary = Collections.emptySet()) {
List<Integer> numColIndices = getNumericColumnIndices(excludeFromSummary)
int n = size()
StringBuilder table = new StringBuilder()