mirror of
https://github.com/rdk/p2rank.git
synced 2026-06-04 12:44:24 +08:00
Add explicit sites loading and analyze binding-sites command
Implement ExplicitSitesIndex for loading binding site definitions from external CSV files (pluggable format system, first format: ahoj_ubs). Sites are resolved during item loading via DatasetItemLoader. Add 'analyze binding-sites' sub-command producing unified CSV and summary stats for both ligand-based and explicit site datasets, with unresolved residue/site tracking for explicit datasets. Remove unused SiteLoader stub.
This commit is contained in:
@@ -6,6 +6,7 @@ import cz.siret.prank.domain.labeling.BinaryLabeling
|
||||
import cz.siret.prank.domain.labeling.LigandBasedResidueLabeler
|
||||
import cz.siret.prank.domain.labeling.ResidueLabeler
|
||||
import cz.siret.prank.domain.loaders.DatasetItemLoader
|
||||
import cz.siret.prank.domain.loaders.ExplicitSitesIndex
|
||||
import cz.siret.prank.domain.loaders.ExtendedResidueId
|
||||
import cz.siret.prank.domain.loaders.LoaderParams
|
||||
import cz.siret.prank.domain.loaders.pockets.*
|
||||
@@ -56,6 +57,8 @@ class Dataset implements Parametrized, Writable, Failable {
|
||||
static final String PARAM_LIGANDS_SEPARATED_BY_TER = "LIGANDS_SEPARATED_BY_TER"
|
||||
static final String PARAM_RESIDUE_LABELING_FORMAT = "RESIDUE_LABELING_FORMAT"
|
||||
static final String PARAM_RESIDUE_LABELING_FILE = "RESIDUE_LABELING_FILE"
|
||||
static final String PARAM_EXPLICIT_SITES_FORMAT = "EXPLICIT_SITES_FORMAT"
|
||||
static final String PARAM_EXPLICIT_SITES_FILE = "EXPLICIT_SITES_FILE"
|
||||
|
||||
/*
|
||||
* dataset column names
|
||||
@@ -84,6 +87,7 @@ class Dataset implements Parametrized, Writable, Failable {
|
||||
boolean forTraining = false
|
||||
|
||||
private ResidueLabeler residueLabeler
|
||||
private ExplicitSitesIndex explicitSitesIndex
|
||||
|
||||
//===========================================================================================================//
|
||||
|
||||
@@ -441,6 +445,19 @@ class Dataset implements Parametrized, Writable, Failable {
|
||||
return attributes.containsKey(PARAM_RESIDUE_LABELING_FORMAT) || header.contains(COLUMN_POSITIVE_RESIDUES)
|
||||
}
|
||||
|
||||
boolean hasExplicitSites() {
|
||||
return attributes.containsKey(PARAM_EXPLICIT_SITES_FORMAT)
|
||||
}
|
||||
|
||||
ExplicitSitesIndex getExplicitSitesIndex() {
|
||||
if (explicitSitesIndex == null && hasExplicitSites()) {
|
||||
String format = attributes.get(PARAM_EXPLICIT_SITES_FORMAT)
|
||||
String file = dir + "/" + attributes.get(PARAM_EXPLICIT_SITES_FILE)
|
||||
explicitSitesIndex = ExplicitSitesIndex.loadFromFile(format, file)
|
||||
}
|
||||
return explicitSitesIndex
|
||||
}
|
||||
|
||||
/**
|
||||
* @return true if predicted residue labeling is defined a as a part of the dataset
|
||||
*/
|
||||
|
||||
@@ -0,0 +1,48 @@
|
||||
package cz.siret.prank.domain.loaders
|
||||
|
||||
import cz.siret.prank.utils.Futils
|
||||
import cz.siret.prank.utils.Sutils
|
||||
import groovy.transform.CompileStatic
|
||||
import groovy.util.logging.Slf4j
|
||||
import org.apache.commons.lang3.StringUtils
|
||||
|
||||
/**
|
||||
* Parses the 'ahoj_ubs' CSV format for explicit binding site definitions.
|
||||
*
|
||||
* Expected CSV columns:
|
||||
* site_uniprots,site_uid,site_recipe,threshold,afdb_filename,chain_resi,center_x,center_y,center_z
|
||||
*/
|
||||
@Slf4j
|
||||
@CompileStatic
|
||||
class AhojUbsSiteParser {
|
||||
|
||||
static ExplicitSitesIndex parse(String filePath) {
|
||||
List<String> lines = Futils.readLines(filePath)
|
||||
Map<String, List<ExplicitSitesIndex.SiteDef>> byFilename = new LinkedHashMap<>()
|
||||
|
||||
int totalSites = 0
|
||||
for (String line : lines.tail()) {
|
||||
if (StringUtils.isBlank(line)) continue
|
||||
|
||||
String[] cols = line.split(",", -1)
|
||||
|
||||
String siteId = cols[1]
|
||||
String filename = cols[4]
|
||||
List<String> residueIds = Sutils.splitOnWhitespace(cols[5])
|
||||
double cx = Double.parseDouble(cols[6])
|
||||
double cy = Double.parseDouble(cols[7])
|
||||
double cz = Double.parseDouble(cols[8])
|
||||
|
||||
ExplicitSitesIndex.SiteDef sd = new ExplicitSitesIndex.SiteDef(
|
||||
siteId, filename, residueIds, cx, cy, cz)
|
||||
byFilename.computeIfAbsent(filename, { new ArrayList<>() }).add(sd)
|
||||
totalSites++
|
||||
}
|
||||
|
||||
log.info "Loaded explicit sites index: {} sites for {} proteins from [{}]",
|
||||
totalSites, byFilename.size(), filePath
|
||||
|
||||
return new ExplicitSitesIndex(byFilename)
|
||||
}
|
||||
|
||||
}
|
||||
@@ -61,6 +61,13 @@ class DatasetItemLoader implements Parametrized, Writable {
|
||||
res.prediction = new Prediction(res.protein, [])
|
||||
}
|
||||
|
||||
if (item.originDataset.hasExplicitSites()) {
|
||||
ExplicitSitesIndex index = item.originDataset.explicitSitesIndex
|
||||
res.holoProtein.sites = index.resolveForProtein(res.holoProtein, item.proteinFile)
|
||||
log.info "Loaded {} explicit sites for [{}]", res.holoProtein.sites.size(), item.label
|
||||
write " sites: ${res.holoProtein.sites.size()}"
|
||||
}
|
||||
|
||||
ProcessedItemContext itemContext = item.context
|
||||
|
||||
if (params.identify_peptides_by_labeling) {
|
||||
|
||||
@@ -0,0 +1,100 @@
|
||||
package cz.siret.prank.domain.loaders
|
||||
|
||||
import cz.siret.prank.domain.Protein
|
||||
import cz.siret.prank.domain.Residue
|
||||
import cz.siret.prank.domain.ResidueSite
|
||||
import cz.siret.prank.geom.Point
|
||||
import cz.siret.prank.program.PrankException
|
||||
import cz.siret.prank.utils.Futils
|
||||
import groovy.transform.CompileStatic
|
||||
import groovy.util.logging.Slf4j
|
||||
import org.biojava.nbio.structure.Atom
|
||||
import org.biojava.nbio.structure.ResidueNumber
|
||||
|
||||
/**
|
||||
* Index of explicit site definitions loaded from an external CSV file.
|
||||
* Keyed by filename for O(1) lookup per protein.
|
||||
* Pluggable via format string dispatched in {@link #loadFromFile}.
|
||||
*/
|
||||
@Slf4j
|
||||
@CompileStatic
|
||||
class ExplicitSitesIndex {
|
||||
|
||||
@CompileStatic
|
||||
static class SiteDef {
|
||||
String siteId
|
||||
String filename
|
||||
List<String> residueIds
|
||||
double centerX
|
||||
double centerY
|
||||
double centerZ
|
||||
|
||||
SiteDef(String siteId, String filename, List<String> residueIds,
|
||||
double centerX, double centerY, double centerZ) {
|
||||
this.siteId = siteId
|
||||
this.filename = filename
|
||||
this.residueIds = residueIds
|
||||
this.centerX = centerX
|
||||
this.centerY = centerY
|
||||
this.centerZ = centerZ
|
||||
}
|
||||
}
|
||||
|
||||
private final Map<String, List<SiteDef>> byFilename
|
||||
|
||||
ExplicitSitesIndex(Map<String, List<SiteDef>> byFilename) {
|
||||
this.byFilename = byFilename
|
||||
}
|
||||
|
||||
static ExplicitSitesIndex loadFromFile(String format, String filePath) {
|
||||
switch (format) {
|
||||
case "ahoj_ubs":
|
||||
return AhojUbsSiteParser.parse(filePath)
|
||||
default:
|
||||
throw new PrankException("Unknown explicit sites format: " + format)
|
||||
}
|
||||
}
|
||||
|
||||
List<SiteDef> getDefsForProtein(String proteinFile) {
|
||||
String filename = Futils.shortName(proteinFile)
|
||||
List<SiteDef> defs = byFilename.get(filename)
|
||||
return defs != null ? defs : Collections.<SiteDef> emptyList()
|
||||
}
|
||||
|
||||
List<ResidueSite> resolveForProtein(Protein protein, String proteinFile) {
|
||||
List<SiteDef> defs = getDefsForProtein(proteinFile)
|
||||
if (defs.isEmpty()) {
|
||||
log.warn "No explicit sites found for [{}] in sites file", Futils.shortName(proteinFile)
|
||||
return Collections.<ResidueSite> emptyList()
|
||||
}
|
||||
|
||||
List<ResidueSite> sites = new ArrayList<>()
|
||||
for (SiteDef sd : defs) {
|
||||
List<Residue> residues = resolveResidues(sd, protein)
|
||||
if (residues.isEmpty()) {
|
||||
log.warn "Site [{}] has no resolved residues, skipping", sd.siteId
|
||||
continue
|
||||
}
|
||||
Atom centroid = Point.of(sd.centerX, sd.centerY, sd.centerZ)
|
||||
sites.add(new ResidueSite(sd.siteId, centroid, residues, protein))
|
||||
}
|
||||
return sites
|
||||
}
|
||||
|
||||
private List<Residue> resolveResidues(SiteDef sd, Protein protein) {
|
||||
List<Residue> resolved = new ArrayList<>()
|
||||
for (String resId : sd.residueIds) {
|
||||
ExtendedResidueId eid = ExtendedResidueId.parse(resId)
|
||||
ResidueNumber rn = eid.toResidueNumber()
|
||||
Residue r = protein.residues.getResidue(Residue.Key.of(rn))
|
||||
if (r != null) {
|
||||
resolved.add(r)
|
||||
} else {
|
||||
log.warn "Cannot resolve residue [{}] for site [{}] in protein [{}]",
|
||||
resId, sd.siteId, protein.name
|
||||
}
|
||||
}
|
||||
return resolved
|
||||
}
|
||||
|
||||
}
|
||||
@@ -1,22 +0,0 @@
|
||||
package cz.siret.prank.domain.loaders
|
||||
|
||||
import cz.siret.prank.domain.Protein
|
||||
import cz.siret.prank.domain.ResidueSite
|
||||
import groovy.transform.CompileStatic
|
||||
import groovy.util.logging.Slf4j
|
||||
|
||||
/**
|
||||
* Loads binding sites from external definition (e.g. CSV file).
|
||||
* Placeholder — actual loading logic will be implemented later.
|
||||
* Note: currently unused — Protein.sites is never populated by any code path yet.
|
||||
*/
|
||||
@Slf4j
|
||||
@CompileStatic
|
||||
class SiteLoader {
|
||||
|
||||
List<ResidueSite> loadForProtein(Protein protein, String siteDefinitionFile) {
|
||||
log.info "Site loading not yet implemented for protein [{}], file [{}]", protein.name, siteDefinitionFile
|
||||
return Collections.<ResidueSite>emptyList()
|
||||
}
|
||||
|
||||
}
|
||||
@@ -3,6 +3,7 @@ package cz.siret.prank.program.routines.analyze
|
||||
|
||||
import cz.siret.prank.domain.*
|
||||
import cz.siret.prank.domain.labeling.*
|
||||
import cz.siret.prank.domain.loaders.ExplicitSitesIndex
|
||||
import cz.siret.prank.domain.loaders.LoaderParams
|
||||
import cz.siret.prank.export.FastaExporter
|
||||
import cz.siret.prank.features.implementation.table.AtomTableFeature
|
||||
@@ -22,6 +23,7 @@ import org.biojava.nbio.structure.ResidueNumber
|
||||
|
||||
import javax.annotation.Nullable
|
||||
import java.util.concurrent.ConcurrentLinkedQueue
|
||||
import java.util.concurrent.atomic.AtomicInteger
|
||||
|
||||
import static cz.siret.prank.geom.SecondaryStructureUtils.assignSecondaryStructure
|
||||
import static cz.siret.prank.utils.Cutils.newSynchronizedList
|
||||
@@ -75,6 +77,7 @@ class AnalyzeRoutine extends Routine {
|
||||
final Map<String, Closure> commandRegister = unmodifiableMap([
|
||||
"residues" : { cmdResidues() },
|
||||
"binding-residues" : { cmdBindingResidues() },
|
||||
"binding-sites" : { cmdBindingSites() },
|
||||
"labeled-residues" : { cmdLabeledResidues() },
|
||||
"aa-propensities" : { cmdAaPropensities() },
|
||||
"atomtype-propensities" : { cmdAtomTypePropensities() },
|
||||
@@ -157,6 +160,141 @@ class AnalyzeRoutine extends Routine {
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Binding site statistics — works for both ligand-based and explicit site datasets.
|
||||
* Produces a unified CSV with the same header regardless of site source.
|
||||
*/
|
||||
void cmdBindingSites() {
|
||||
DataTable dt = new DataTable("protein",
|
||||
"site_label", "site_type",
|
||||
"n_atoms", "n_residues", "residue_ids",
|
||||
"center_x", "center_y", "center_z",
|
||||
"lig_name", "lig_code", "lig_chain",
|
||||
"contact_dist", "center_to_prot_dist"
|
||||
)
|
||||
|
||||
boolean hasExplicitSites = dataset.hasExplicitSites()
|
||||
|
||||
// Ligand-specific counters
|
||||
AtomicInteger totalIgnored = new AtomicInteger()
|
||||
AtomicInteger totalSmall = new AtomicInteger()
|
||||
AtomicInteger totalDistant = new AtomicInteger()
|
||||
|
||||
// Explicit-site-specific counters
|
||||
AtomicInteger totalSkippedSites = new AtomicInteger()
|
||||
AtomicInteger totalUnresolvedResidues = new AtomicInteger()
|
||||
AtomicInteger proteinsWithSites = new AtomicInteger()
|
||||
AtomicInteger proteinsWithoutSites = new AtomicInteger()
|
||||
|
||||
def res = dataset.processItems { Dataset.Item item ->
|
||||
Protein p = item.protein
|
||||
|
||||
if (hasExplicitSites) {
|
||||
ExplicitSitesIndex index = dataset.explicitSitesIndex
|
||||
List<ExplicitSitesIndex.SiteDef> defs = index.getDefsForProtein(item.proteinFile)
|
||||
List<ResidueSite> sites = p.sites ?: []
|
||||
|
||||
if (sites.isEmpty()) {
|
||||
proteinsWithoutSites.incrementAndGet()
|
||||
} else {
|
||||
proteinsWithSites.incrementAndGet()
|
||||
}
|
||||
|
||||
// Track unresolved: compare defs vs resolved sites
|
||||
Map<String, ResidueSite> resolvedByName = new HashMap<>()
|
||||
for (ResidueSite site : sites) {
|
||||
resolvedByName.put(site.name, site)
|
||||
}
|
||||
for (ExplicitSitesIndex.SiteDef sd : defs) {
|
||||
ResidueSite resolved = resolvedByName.get(sd.siteId)
|
||||
if (resolved == null) {
|
||||
totalSkippedSites.incrementAndGet()
|
||||
totalUnresolvedResidues.addAndGet(sd.residueIds.size())
|
||||
} else {
|
||||
int unresolved = sd.residueIds.size() - resolved.residues.size()
|
||||
if (unresolved > 0) {
|
||||
totalUnresolvedResidues.addAndGet(unresolved)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (ResidueSite site : sites) {
|
||||
Atom c = site.centroid
|
||||
|
||||
dt.newRow(item.label)
|
||||
.put("site_label", site.label)
|
||||
.put("site_type", "explicit")
|
||||
.put("n_atoms", site.atoms.count)
|
||||
.put("n_residues", site.residues.size())
|
||||
.put("residue_ids", formatResidueIds(site.residues))
|
||||
.put("center_x", c.x)
|
||||
.put("center_y", c.y)
|
||||
.put("center_z", c.z)
|
||||
}
|
||||
} else {
|
||||
double cutoff = params.ligand_protein_contact_distance
|
||||
for (Ligand lig : p.relevantLigands) {
|
||||
Atom c = lig.centroid
|
||||
Atoms contactAtoms = p.proteinAtoms.cutoutShell(lig.atoms, cutoff)
|
||||
List<Residue> contactResidues = p.residues.getDistinctForAtoms(contactAtoms)
|
||||
|
||||
dt.newRow(item.label)
|
||||
.put("site_label", lig.label)
|
||||
.put("site_type", "ligand")
|
||||
.put("n_atoms", lig.size)
|
||||
.put("n_residues", contactResidues.size())
|
||||
.put("residue_ids", formatResidueIds(contactResidues))
|
||||
.put("center_x", c.x)
|
||||
.put("center_y", c.y)
|
||||
.put("center_z", c.z)
|
||||
.put("lig_name", lig.name)
|
||||
.put("lig_code", lig.code as String)
|
||||
.put("lig_chain", lig.chain)
|
||||
.put("contact_dist", lig.contactDistance)
|
||||
.put("center_to_prot_dist", lig.centerToProteinDist)
|
||||
}
|
||||
|
||||
totalIgnored.addAndGet(p.ligands.ignoredLigandCount)
|
||||
totalSmall.addAndGet(p.ligands.smallLigandCount)
|
||||
totalDistant.addAndGet(p.ligands.distantLigandCount)
|
||||
}
|
||||
}
|
||||
|
||||
writeFile "$outdir/binding_sites.csv", dt.toCsv()
|
||||
|
||||
Map<String, Object> extraInfo = new LinkedHashMap<>()
|
||||
if (hasExplicitSites) {
|
||||
extraInfo.put("Site source:", "explicit")
|
||||
extraInfo.put("Sites format:", dataset.attributes.get(Dataset.PARAM_EXPLICIT_SITES_FORMAT))
|
||||
extraInfo.put("Sites file:", dataset.attributes.get(Dataset.PARAM_EXPLICIT_SITES_FILE))
|
||||
extraInfo.put("Proteins with sites:", proteinsWithSites.get())
|
||||
extraInfo.put("Proteins without sites:", proteinsWithoutSites.get())
|
||||
extraInfo.put("Sites skipped (no residues):", totalSkippedSites.get())
|
||||
extraInfo.put("Unresolved residues:", totalUnresolvedResidues.get())
|
||||
} else {
|
||||
extraInfo.put("Site source:", "ligands")
|
||||
extraInfo.put("Ignored ligands:", totalIgnored.get())
|
||||
extraInfo.put("Small ligands:", totalSmall.get())
|
||||
extraInfo.put("Distant ligands:", totalDistant.get())
|
||||
}
|
||||
extraInfo.put("Errors:", res.errorCount)
|
||||
|
||||
String summary = dt.formatSummaryTable("Binding Sites Summary", extraInfo)
|
||||
write summary
|
||||
writeFile "$outdir/binding_sites_summary.txt", summary
|
||||
|
||||
write "Processed ${dataset.size} items"
|
||||
write res.errorSummary
|
||||
|
||||
res.writeErrorCsvs(outdir)
|
||||
}
|
||||
|
||||
private static String formatResidueIds(List<Residue> residues) {
|
||||
residues.collect { Residue r ->
|
||||
r.chain.authorId + "_" + r.residueNumber.seqNum + (r.residueNumber.insCode ?: "")
|
||||
}.join(" ")
|
||||
}
|
||||
|
||||
void cmdPeptides() {
|
||||
LoaderParams.ignoreLigandsSwitch = true
|
||||
|
||||
|
||||
Reference in New Issue
Block a user