Add explicit sites loading and analyze binding-sites command

Implement ExplicitSitesIndex for loading binding site definitions from
external CSV files (pluggable format system, first format: ahoj_ubs).
Sites are resolved during item loading via DatasetItemLoader.

Add 'analyze binding-sites' sub-command producing unified CSV and summary
stats for both ligand-based and explicit site datasets, with unresolved
residue/site tracking for explicit datasets.

Remove unused SiteLoader stub.
This commit is contained in:
rdk
2026-03-03 15:00:31 +01:00
parent 8f5da9fdcd
commit 997727e878
6 changed files with 310 additions and 22 deletions

View File

@@ -6,6 +6,7 @@ import cz.siret.prank.domain.labeling.BinaryLabeling
import cz.siret.prank.domain.labeling.LigandBasedResidueLabeler
import cz.siret.prank.domain.labeling.ResidueLabeler
import cz.siret.prank.domain.loaders.DatasetItemLoader
import cz.siret.prank.domain.loaders.ExplicitSitesIndex
import cz.siret.prank.domain.loaders.ExtendedResidueId
import cz.siret.prank.domain.loaders.LoaderParams
import cz.siret.prank.domain.loaders.pockets.*
@@ -56,6 +57,8 @@ class Dataset implements Parametrized, Writable, Failable {
static final String PARAM_LIGANDS_SEPARATED_BY_TER = "LIGANDS_SEPARATED_BY_TER"
static final String PARAM_RESIDUE_LABELING_FORMAT = "RESIDUE_LABELING_FORMAT"
static final String PARAM_RESIDUE_LABELING_FILE = "RESIDUE_LABELING_FILE"
static final String PARAM_EXPLICIT_SITES_FORMAT = "EXPLICIT_SITES_FORMAT"
static final String PARAM_EXPLICIT_SITES_FILE = "EXPLICIT_SITES_FILE"
/*
* dataset column names
@@ -84,6 +87,7 @@ class Dataset implements Parametrized, Writable, Failable {
boolean forTraining = false
private ResidueLabeler residueLabeler
private ExplicitSitesIndex explicitSitesIndex
//===========================================================================================================//
@@ -441,6 +445,19 @@ class Dataset implements Parametrized, Writable, Failable {
return attributes.containsKey(PARAM_RESIDUE_LABELING_FORMAT) || header.contains(COLUMN_POSITIVE_RESIDUES)
}
boolean hasExplicitSites() {
return attributes.containsKey(PARAM_EXPLICIT_SITES_FORMAT)
}
ExplicitSitesIndex getExplicitSitesIndex() {
if (explicitSitesIndex == null && hasExplicitSites()) {
String format = attributes.get(PARAM_EXPLICIT_SITES_FORMAT)
String file = dir + "/" + attributes.get(PARAM_EXPLICIT_SITES_FILE)
explicitSitesIndex = ExplicitSitesIndex.loadFromFile(format, file)
}
return explicitSitesIndex
}
/**
* @return true if predicted residue labeling is defined a as a part of the dataset
*/

View File

@@ -0,0 +1,48 @@
package cz.siret.prank.domain.loaders
import cz.siret.prank.utils.Futils
import cz.siret.prank.utils.Sutils
import groovy.transform.CompileStatic
import groovy.util.logging.Slf4j
import org.apache.commons.lang3.StringUtils
/**
* Parses the 'ahoj_ubs' CSV format for explicit binding site definitions.
*
* Expected CSV columns:
* site_uniprots,site_uid,site_recipe,threshold,afdb_filename,chain_resi,center_x,center_y,center_z
*/
@Slf4j
@CompileStatic
class AhojUbsSiteParser {
static ExplicitSitesIndex parse(String filePath) {
List<String> lines = Futils.readLines(filePath)
Map<String, List<ExplicitSitesIndex.SiteDef>> byFilename = new LinkedHashMap<>()
int totalSites = 0
for (String line : lines.tail()) {
if (StringUtils.isBlank(line)) continue
String[] cols = line.split(",", -1)
String siteId = cols[1]
String filename = cols[4]
List<String> residueIds = Sutils.splitOnWhitespace(cols[5])
double cx = Double.parseDouble(cols[6])
double cy = Double.parseDouble(cols[7])
double cz = Double.parseDouble(cols[8])
ExplicitSitesIndex.SiteDef sd = new ExplicitSitesIndex.SiteDef(
siteId, filename, residueIds, cx, cy, cz)
byFilename.computeIfAbsent(filename, { new ArrayList<>() }).add(sd)
totalSites++
}
log.info "Loaded explicit sites index: {} sites for {} proteins from [{}]",
totalSites, byFilename.size(), filePath
return new ExplicitSitesIndex(byFilename)
}
}

View File

@@ -61,6 +61,13 @@ class DatasetItemLoader implements Parametrized, Writable {
res.prediction = new Prediction(res.protein, [])
}
if (item.originDataset.hasExplicitSites()) {
ExplicitSitesIndex index = item.originDataset.explicitSitesIndex
res.holoProtein.sites = index.resolveForProtein(res.holoProtein, item.proteinFile)
log.info "Loaded {} explicit sites for [{}]", res.holoProtein.sites.size(), item.label
write " sites: ${res.holoProtein.sites.size()}"
}
ProcessedItemContext itemContext = item.context
if (params.identify_peptides_by_labeling) {

View File

@@ -0,0 +1,100 @@
package cz.siret.prank.domain.loaders
import cz.siret.prank.domain.Protein
import cz.siret.prank.domain.Residue
import cz.siret.prank.domain.ResidueSite
import cz.siret.prank.geom.Point
import cz.siret.prank.program.PrankException
import cz.siret.prank.utils.Futils
import groovy.transform.CompileStatic
import groovy.util.logging.Slf4j
import org.biojava.nbio.structure.Atom
import org.biojava.nbio.structure.ResidueNumber
/**
* Index of explicit site definitions loaded from an external CSV file.
* Keyed by filename for O(1) lookup per protein.
* Pluggable via format string dispatched in {@link #loadFromFile}.
*/
@Slf4j
@CompileStatic
class ExplicitSitesIndex {
@CompileStatic
static class SiteDef {
String siteId
String filename
List<String> residueIds
double centerX
double centerY
double centerZ
SiteDef(String siteId, String filename, List<String> residueIds,
double centerX, double centerY, double centerZ) {
this.siteId = siteId
this.filename = filename
this.residueIds = residueIds
this.centerX = centerX
this.centerY = centerY
this.centerZ = centerZ
}
}
private final Map<String, List<SiteDef>> byFilename
ExplicitSitesIndex(Map<String, List<SiteDef>> byFilename) {
this.byFilename = byFilename
}
static ExplicitSitesIndex loadFromFile(String format, String filePath) {
switch (format) {
case "ahoj_ubs":
return AhojUbsSiteParser.parse(filePath)
default:
throw new PrankException("Unknown explicit sites format: " + format)
}
}
List<SiteDef> getDefsForProtein(String proteinFile) {
String filename = Futils.shortName(proteinFile)
List<SiteDef> defs = byFilename.get(filename)
return defs != null ? defs : Collections.<SiteDef> emptyList()
}
List<ResidueSite> resolveForProtein(Protein protein, String proteinFile) {
List<SiteDef> defs = getDefsForProtein(proteinFile)
if (defs.isEmpty()) {
log.warn "No explicit sites found for [{}] in sites file", Futils.shortName(proteinFile)
return Collections.<ResidueSite> emptyList()
}
List<ResidueSite> sites = new ArrayList<>()
for (SiteDef sd : defs) {
List<Residue> residues = resolveResidues(sd, protein)
if (residues.isEmpty()) {
log.warn "Site [{}] has no resolved residues, skipping", sd.siteId
continue
}
Atom centroid = Point.of(sd.centerX, sd.centerY, sd.centerZ)
sites.add(new ResidueSite(sd.siteId, centroid, residues, protein))
}
return sites
}
private List<Residue> resolveResidues(SiteDef sd, Protein protein) {
List<Residue> resolved = new ArrayList<>()
for (String resId : sd.residueIds) {
ExtendedResidueId eid = ExtendedResidueId.parse(resId)
ResidueNumber rn = eid.toResidueNumber()
Residue r = protein.residues.getResidue(Residue.Key.of(rn))
if (r != null) {
resolved.add(r)
} else {
log.warn "Cannot resolve residue [{}] for site [{}] in protein [{}]",
resId, sd.siteId, protein.name
}
}
return resolved
}
}

View File

@@ -1,22 +0,0 @@
package cz.siret.prank.domain.loaders
import cz.siret.prank.domain.Protein
import cz.siret.prank.domain.ResidueSite
import groovy.transform.CompileStatic
import groovy.util.logging.Slf4j
/**
* Loads binding sites from external definition (e.g. CSV file).
* Placeholder — actual loading logic will be implemented later.
* Note: currently unused — Protein.sites is never populated by any code path yet.
*/
@Slf4j
@CompileStatic
class SiteLoader {
List<ResidueSite> loadForProtein(Protein protein, String siteDefinitionFile) {
log.info "Site loading not yet implemented for protein [{}], file [{}]", protein.name, siteDefinitionFile
return Collections.<ResidueSite>emptyList()
}
}

View File

@@ -3,6 +3,7 @@ package cz.siret.prank.program.routines.analyze
import cz.siret.prank.domain.*
import cz.siret.prank.domain.labeling.*
import cz.siret.prank.domain.loaders.ExplicitSitesIndex
import cz.siret.prank.domain.loaders.LoaderParams
import cz.siret.prank.export.FastaExporter
import cz.siret.prank.features.implementation.table.AtomTableFeature
@@ -22,6 +23,7 @@ import org.biojava.nbio.structure.ResidueNumber
import javax.annotation.Nullable
import java.util.concurrent.ConcurrentLinkedQueue
import java.util.concurrent.atomic.AtomicInteger
import static cz.siret.prank.geom.SecondaryStructureUtils.assignSecondaryStructure
import static cz.siret.prank.utils.Cutils.newSynchronizedList
@@ -75,6 +77,7 @@ class AnalyzeRoutine extends Routine {
final Map<String, Closure> commandRegister = unmodifiableMap([
"residues" : { cmdResidues() },
"binding-residues" : { cmdBindingResidues() },
"binding-sites" : { cmdBindingSites() },
"labeled-residues" : { cmdLabeledResidues() },
"aa-propensities" : { cmdAaPropensities() },
"atomtype-propensities" : { cmdAtomTypePropensities() },
@@ -157,6 +160,141 @@ class AnalyzeRoutine extends Routine {
}
/**
* Binding site statistics — works for both ligand-based and explicit site datasets.
* Produces a unified CSV with the same header regardless of site source.
*/
void cmdBindingSites() {
DataTable dt = new DataTable("protein",
"site_label", "site_type",
"n_atoms", "n_residues", "residue_ids",
"center_x", "center_y", "center_z",
"lig_name", "lig_code", "lig_chain",
"contact_dist", "center_to_prot_dist"
)
boolean hasExplicitSites = dataset.hasExplicitSites()
// Ligand-specific counters
AtomicInteger totalIgnored = new AtomicInteger()
AtomicInteger totalSmall = new AtomicInteger()
AtomicInteger totalDistant = new AtomicInteger()
// Explicit-site-specific counters
AtomicInteger totalSkippedSites = new AtomicInteger()
AtomicInteger totalUnresolvedResidues = new AtomicInteger()
AtomicInteger proteinsWithSites = new AtomicInteger()
AtomicInteger proteinsWithoutSites = new AtomicInteger()
def res = dataset.processItems { Dataset.Item item ->
Protein p = item.protein
if (hasExplicitSites) {
ExplicitSitesIndex index = dataset.explicitSitesIndex
List<ExplicitSitesIndex.SiteDef> defs = index.getDefsForProtein(item.proteinFile)
List<ResidueSite> sites = p.sites ?: []
if (sites.isEmpty()) {
proteinsWithoutSites.incrementAndGet()
} else {
proteinsWithSites.incrementAndGet()
}
// Track unresolved: compare defs vs resolved sites
Map<String, ResidueSite> resolvedByName = new HashMap<>()
for (ResidueSite site : sites) {
resolvedByName.put(site.name, site)
}
for (ExplicitSitesIndex.SiteDef sd : defs) {
ResidueSite resolved = resolvedByName.get(sd.siteId)
if (resolved == null) {
totalSkippedSites.incrementAndGet()
totalUnresolvedResidues.addAndGet(sd.residueIds.size())
} else {
int unresolved = sd.residueIds.size() - resolved.residues.size()
if (unresolved > 0) {
totalUnresolvedResidues.addAndGet(unresolved)
}
}
}
for (ResidueSite site : sites) {
Atom c = site.centroid
dt.newRow(item.label)
.put("site_label", site.label)
.put("site_type", "explicit")
.put("n_atoms", site.atoms.count)
.put("n_residues", site.residues.size())
.put("residue_ids", formatResidueIds(site.residues))
.put("center_x", c.x)
.put("center_y", c.y)
.put("center_z", c.z)
}
} else {
double cutoff = params.ligand_protein_contact_distance
for (Ligand lig : p.relevantLigands) {
Atom c = lig.centroid
Atoms contactAtoms = p.proteinAtoms.cutoutShell(lig.atoms, cutoff)
List<Residue> contactResidues = p.residues.getDistinctForAtoms(contactAtoms)
dt.newRow(item.label)
.put("site_label", lig.label)
.put("site_type", "ligand")
.put("n_atoms", lig.size)
.put("n_residues", contactResidues.size())
.put("residue_ids", formatResidueIds(contactResidues))
.put("center_x", c.x)
.put("center_y", c.y)
.put("center_z", c.z)
.put("lig_name", lig.name)
.put("lig_code", lig.code as String)
.put("lig_chain", lig.chain)
.put("contact_dist", lig.contactDistance)
.put("center_to_prot_dist", lig.centerToProteinDist)
}
totalIgnored.addAndGet(p.ligands.ignoredLigandCount)
totalSmall.addAndGet(p.ligands.smallLigandCount)
totalDistant.addAndGet(p.ligands.distantLigandCount)
}
}
writeFile "$outdir/binding_sites.csv", dt.toCsv()
Map<String, Object> extraInfo = new LinkedHashMap<>()
if (hasExplicitSites) {
extraInfo.put("Site source:", "explicit")
extraInfo.put("Sites format:", dataset.attributes.get(Dataset.PARAM_EXPLICIT_SITES_FORMAT))
extraInfo.put("Sites file:", dataset.attributes.get(Dataset.PARAM_EXPLICIT_SITES_FILE))
extraInfo.put("Proteins with sites:", proteinsWithSites.get())
extraInfo.put("Proteins without sites:", proteinsWithoutSites.get())
extraInfo.put("Sites skipped (no residues):", totalSkippedSites.get())
extraInfo.put("Unresolved residues:", totalUnresolvedResidues.get())
} else {
extraInfo.put("Site source:", "ligands")
extraInfo.put("Ignored ligands:", totalIgnored.get())
extraInfo.put("Small ligands:", totalSmall.get())
extraInfo.put("Distant ligands:", totalDistant.get())
}
extraInfo.put("Errors:", res.errorCount)
String summary = dt.formatSummaryTable("Binding Sites Summary", extraInfo)
write summary
writeFile "$outdir/binding_sites_summary.txt", summary
write "Processed ${dataset.size} items"
write res.errorSummary
res.writeErrorCsvs(outdir)
}
private static String formatResidueIds(List<Residue> residues) {
residues.collect { Residue r ->
r.chain.authorId + "_" + r.residueNumber.seqNum + (r.residueNumber.insCode ?: "")
}.join(" ")
}
void cmdPeptides() {
LoaderParams.ignoreLigandsSwitch = true