mirror of
https://github.com/rdk/p2rank.git
synced 2026-06-04 12:44:24 +08:00
Improve analyze binding-sites: visualizations, site radius, eager loading
- Add PyMol visualizations using dataset.binaryResidueLabeler - Add site_radius column (max distance from centroid to any site atom) - Add excludeFromSummary param to DataTable.formatSummaryTable to skip center coordinates from numeric summary stats - Load ExplicitSitesIndex eagerly during dataset loading (fail-fast) - Skip CSV rows with empty residue/coordinate fields in AhojUbsSiteParser - Write items without binding sites to separate file in outdir
This commit is contained in:
@@ -450,12 +450,15 @@ class Dataset implements Parametrized, Writable, Failable {
|
||||
}
|
||||
|
||||
ExplicitSitesIndex getExplicitSitesIndex() {
|
||||
if (explicitSitesIndex == null && hasExplicitSites()) {
|
||||
return explicitSitesIndex
|
||||
}
|
||||
|
||||
private void loadExplicitSitesIndex() {
|
||||
if (hasExplicitSites()) {
|
||||
String format = attributes.get(PARAM_EXPLICIT_SITES_FORMAT)
|
||||
String file = dir + "/" + attributes.get(PARAM_EXPLICIT_SITES_FILE)
|
||||
explicitSitesIndex = ExplicitSitesIndex.loadFromFile(format, file)
|
||||
}
|
||||
return explicitSitesIndex
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -578,6 +581,8 @@ class Dataset implements Parametrized, Writable, Failable {
|
||||
throw new PrankException("dataset contains invalid files")
|
||||
}
|
||||
|
||||
dataset.loadExplicitSitesIndex()
|
||||
|
||||
return dataset
|
||||
}
|
||||
|
||||
|
||||
@@ -21,11 +21,17 @@ class AhojUbsSiteParser {
|
||||
Map<String, List<ExplicitSitesIndex.SiteDef>> byFilename = new LinkedHashMap<>()
|
||||
|
||||
int totalSites = 0
|
||||
int skippedEmpty = 0
|
||||
for (String line : lines.tail()) {
|
||||
if (StringUtils.isBlank(line)) continue
|
||||
|
||||
String[] cols = line.split(",", -1)
|
||||
|
||||
if (StringUtils.isBlank(cols[5])) {
|
||||
skippedEmpty++
|
||||
continue
|
||||
}
|
||||
|
||||
String siteId = cols[1]
|
||||
String filename = cols[4]
|
||||
List<String> residueIds = Sutils.splitOnWhitespace(cols[5])
|
||||
@@ -39,6 +45,9 @@ class AhojUbsSiteParser {
|
||||
totalSites++
|
||||
}
|
||||
|
||||
if (skippedEmpty > 0) {
|
||||
log.warn "Skipped {} rows with empty residue/coordinate fields in [{}]", skippedEmpty, filePath
|
||||
}
|
||||
log.info "Loaded explicit sites index: {} sites for {} proteins from [{}]",
|
||||
totalSites, byFilename.size(), filePath
|
||||
|
||||
|
||||
@@ -65,7 +65,6 @@ class DatasetItemLoader implements Parametrized, Writable {
|
||||
ExplicitSitesIndex index = item.originDataset.explicitSitesIndex
|
||||
res.holoProtein.sites = index.resolveForProtein(res.holoProtein, item.proteinFile)
|
||||
log.info "Loaded {} explicit sites for [{}]", res.holoProtein.sites.size(), item.label
|
||||
write " sites: ${res.holoProtein.sites.size()}"
|
||||
}
|
||||
|
||||
ProcessedItemContext itemContext = item.context
|
||||
|
||||
@@ -167,7 +167,7 @@ class AnalyzeRoutine extends Routine {
|
||||
void cmdBindingSites() {
|
||||
DataTable dt = new DataTable("protein",
|
||||
"site_label", "site_type",
|
||||
"n_atoms", "n_residues", "residue_ids",
|
||||
"n_atoms", "n_residues", "site_radius", "residue_ids",
|
||||
"center_x", "center_y", "center_z",
|
||||
"lig_name", "lig_code", "lig_chain",
|
||||
"contact_dist", "center_to_prot_dist"
|
||||
@@ -183,8 +183,8 @@ class AnalyzeRoutine extends Routine {
|
||||
// Explicit-site-specific counters
|
||||
AtomicInteger totalSkippedSites = new AtomicInteger()
|
||||
AtomicInteger totalUnresolvedResidues = new AtomicInteger()
|
||||
AtomicInteger proteinsWithSites = new AtomicInteger()
|
||||
AtomicInteger proteinsWithoutSites = new AtomicInteger()
|
||||
|
||||
Queue<String> itemsWithoutSites = new ConcurrentLinkedQueue<>()
|
||||
|
||||
def res = dataset.processItems { Dataset.Item item ->
|
||||
Protein p = item.protein
|
||||
@@ -195,9 +195,7 @@ class AnalyzeRoutine extends Routine {
|
||||
List<ResidueSite> sites = p.sites ?: []
|
||||
|
||||
if (sites.isEmpty()) {
|
||||
proteinsWithoutSites.incrementAndGet()
|
||||
} else {
|
||||
proteinsWithSites.incrementAndGet()
|
||||
itemsWithoutSites.add(item.row)
|
||||
}
|
||||
|
||||
// Track unresolved: compare defs vs resolved sites
|
||||
@@ -226,12 +224,16 @@ class AnalyzeRoutine extends Routine {
|
||||
.put("site_type", "explicit")
|
||||
.put("n_atoms", site.atoms.count)
|
||||
.put("n_residues", site.residues.size())
|
||||
.put("site_radius", siteRadius(c, site.atoms))
|
||||
.put("residue_ids", formatResidueIds(site.residues))
|
||||
.put("center_x", c.x)
|
||||
.put("center_y", c.y)
|
||||
.put("center_z", c.z)
|
||||
}
|
||||
} else {
|
||||
if (p.relevantLigands.isEmpty()) {
|
||||
itemsWithoutSites.add(item.row)
|
||||
}
|
||||
double cutoff = params.ligand_protein_contact_distance
|
||||
for (Ligand lig : p.relevantLigands) {
|
||||
Atom c = lig.centroid
|
||||
@@ -243,6 +245,7 @@ class AnalyzeRoutine extends Routine {
|
||||
.put("site_type", "ligand")
|
||||
.put("n_atoms", lig.size)
|
||||
.put("n_residues", contactResidues.size())
|
||||
.put("site_radius", siteRadius(c, lig.atoms))
|
||||
.put("residue_ids", formatResidueIds(contactResidues))
|
||||
.put("center_x", c.x)
|
||||
.put("center_y", c.y)
|
||||
@@ -258,31 +261,52 @@ class AnalyzeRoutine extends Routine {
|
||||
totalSmall.addAndGet(p.ligands.smallLigandCount)
|
||||
totalDistant.addAndGet(p.ligands.distantLigandCount)
|
||||
}
|
||||
|
||||
if (params.visualizations) {
|
||||
BinaryLabeling labeling = item.binaryLabeling
|
||||
if (labeling != null) {
|
||||
new NewPymolRenderer("$outdir/visualizations", new RenderingModel(
|
||||
proteinFile: item.proteinFile,
|
||||
label: item.label,
|
||||
protein: p,
|
||||
observedLabeling: labeling
|
||||
)).render()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
writeFile "$outdir/binding_sites.csv", dt.toCsv()
|
||||
|
||||
Map<String, Object> extraInfo = new LinkedHashMap<>()
|
||||
int noSiteCount = itemsWithoutSites.size()
|
||||
if (hasExplicitSites) {
|
||||
extraInfo.put("Site source:", "explicit")
|
||||
extraInfo.put("Sites format:", dataset.attributes.get(Dataset.PARAM_EXPLICIT_SITES_FORMAT))
|
||||
extraInfo.put("Sites file:", dataset.attributes.get(Dataset.PARAM_EXPLICIT_SITES_FILE))
|
||||
extraInfo.put("Proteins with sites:", proteinsWithSites.get())
|
||||
extraInfo.put("Proteins without sites:", proteinsWithoutSites.get())
|
||||
extraInfo.put("Proteins with sites:", dataset.size - noSiteCount - res.errorCount)
|
||||
extraInfo.put("Proteins without sites:", noSiteCount)
|
||||
extraInfo.put("Sites skipped (no residues):", totalSkippedSites.get())
|
||||
extraInfo.put("Unresolved residues:", totalUnresolvedResidues.get())
|
||||
} else {
|
||||
extraInfo.put("Site source:", "ligands")
|
||||
extraInfo.put("Proteins without ligands:", noSiteCount)
|
||||
extraInfo.put("Ignored ligands:", totalIgnored.get())
|
||||
extraInfo.put("Small ligands:", totalSmall.get())
|
||||
extraInfo.put("Distant ligands:", totalDistant.get())
|
||||
}
|
||||
extraInfo.put("Errors:", res.errorCount)
|
||||
|
||||
String summary = dt.formatSummaryTable("Binding Sites Summary", extraInfo)
|
||||
Set<String> noSummary = ["center_x", "center_y", "center_z"] as Set
|
||||
String summary = dt.formatSummaryTable("Binding Sites Summary", extraInfo, noSummary)
|
||||
write summary
|
||||
writeFile "$outdir/binding_sites_summary.txt", summary
|
||||
|
||||
if (!itemsWithoutSites.isEmpty()) {
|
||||
String noSitesFile = "$outdir/items_without_sites.txt"
|
||||
writeFile noSitesFile, itemsWithoutSites.toSorted().join("\n") + "\n"
|
||||
write "NOTE: $noSiteCount of ${dataset.size} items have no binding sites. List written to [$noSitesFile]"
|
||||
}
|
||||
|
||||
write "Processed ${dataset.size} items"
|
||||
write res.errorSummary
|
||||
|
||||
@@ -295,6 +319,15 @@ class AnalyzeRoutine extends Routine {
|
||||
}.join(" ")
|
||||
}
|
||||
|
||||
private static double siteRadius(Atom centroid, Atoms atoms) {
|
||||
double maxDist = 0
|
||||
for (Atom a : atoms) {
|
||||
double d = Struct.dist(centroid, a)
|
||||
if (d > maxDist) maxDist = d
|
||||
}
|
||||
return maxDist
|
||||
}
|
||||
|
||||
void cmdPeptides() {
|
||||
LoaderParams.ignoreLigandsSwitch = true
|
||||
|
||||
|
||||
@@ -115,9 +115,10 @@ class DataTable {
|
||||
/**
|
||||
* Returns indices of columns that have at least one numeric value.
|
||||
*/
|
||||
private List<Integer> getNumericColumnIndices() {
|
||||
private List<Integer> getNumericColumnIndices(Set<String> exclude = Collections.emptySet()) {
|
||||
List<Integer> result = new ArrayList<>()
|
||||
for (int i = 0; i < columns.length; i++) {
|
||||
if (exclude.contains(columns[i])) continue
|
||||
int ci = i
|
||||
if (getRows().any { Row row -> row.values[ci] instanceof Number }) {
|
||||
result.add(i)
|
||||
@@ -142,8 +143,8 @@ class DataTable {
|
||||
return result
|
||||
}
|
||||
|
||||
String formatSummaryTable(String title = "Dataset Summary", Map<String, Object> extraInfo = [:]) {
|
||||
List<Integer> numColIndices = getNumericColumnIndices()
|
||||
String formatSummaryTable(String title = "Dataset Summary", Map<String, Object> extraInfo = [:], Set<String> excludeFromSummary = Collections.emptySet()) {
|
||||
List<Integer> numColIndices = getNumericColumnIndices(excludeFromSummary)
|
||||
int n = size()
|
||||
|
||||
StringBuilder table = new StringBuilder()
|
||||
|
||||
Reference in New Issue
Block a user