Files
pymol-open-source/layer0/ccealignmodule.cpp
2024-05-20 09:07:33 -04:00

651 lines
18 KiB
C++

// ccealign -- structural alignment plugin modele for PyMol
/////////////////////////////////////////////////////////////////////////////
//
// Copyright (C) Jason Vertrees.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in
// the documentation and/or other materials provided with the
// distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
// OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
//////////////////////////////////////////////////////////////////////////////
#ifndef _PYMOL_NOPY
#include "os_python.h"
#include "os_std.h"
#include "ce_types.h"
#include "tnt/tnt.h"
#include "tnt/jama_lu.h"
#include "tnt/jama_svd.h"
#define TA1 TNT::Array1D
#define TA2 TNT::Array2D
// tranpose of a matrix
static
TA2<double> transpose(const TA2<double>& v);
/////////////////////////////////////////////////////////////////////////////
// CE Specific
/////////////////////////////////////////////////////////////////////////////
double** calcDM(pcePoint coords, int len)
{
int i = 0;
double** dm = (double**) malloc(sizeof(double*)*len);
for (i = 0; i < len; i++)
dm[i] = (double*) malloc( sizeof(double)*len);
int row=0, col=0;
for (row = 0; row < len; row++) {
for (col = 0; col < len; col++) {
dm[row][col] = sqrt(pow(coords[row].x - coords[col].x,2) +
pow(coords[row].y - coords[col].y,2) +
pow(coords[row].z - coords[col].z,2) );
}
}
return dm;
}
double** calcS(double** d1, double** d2, int lenA, int lenB, int wSize)
{
int i;
double winSize = (double) wSize;
// initialize the 2D similarity matrix
double** S = (double**) malloc(sizeof(double*)*lenA);
for (i = 0; i < lenA; i++)
S[i] = (double*) malloc( sizeof(double)*lenB);
double sumSize = (winSize-1.0)*(winSize-2.0) / 2.0;
//
// This is where the magic of CE comes out. In the similarity matrix,
// for each i and j, the value of ceSIM[i][j] is how well the residues
// i - i+winSize in protein A, match to residues j - j+winSize in protein
// B. A value of 0 means absolute match; a value >> 1 means bad match.
//
int iA, iB, row, col;
for (iA = 0; iA < lenA; iA++) {
for (iB = 0; iB < lenB; iB++) {
S[iA][iB] = -1.0;
if (iA > lenA - wSize || iB > lenB - wSize)
continue;
double score = 0.0;
//
// We always skip the calculation of the distance from THIS
// residue, to the next residue. This is a time-saving heur-
// istic decision. Almost all alpha carbon bonds of neighboring
// residues is 3.8 Angstroms. Due to entropy, S = -k ln pi * pi,
// this tell us nothing, so it doesn't help so ignore it.
//
for (row = 0; row < wSize - 2; row++) {
for (col = row + 2; col < wSize; col++) {
score += fabs( d1[iA+row][iA+col] - d2[iB+row][iB+col] );
}
}
S[iA][iB] = score / sumSize;
}
}
return S;
}
pcePoint getCoords(PyObject* L, int length)
{
// make space for the current coords
pcePoint coords = (pcePoint) malloc(sizeof(cePoint)*length);
if (!coords)
return nullptr;
// loop through the arguments, pulling out the
// XYZ coordinates.
int i;
for ( i = 0; i < length; i++ ) {
PyObject* curCoord = PyList_GetItem(L,i);
Py_INCREF(curCoord);
PyObject* curVal = PyList_GetItem(curCoord,0);
Py_INCREF(curVal);
coords[i].x = PyFloat_AsDouble(curVal);
Py_DECREF(curVal);
curVal = PyList_GetItem(curCoord,1);
Py_INCREF(curVal);
coords[i].y = PyFloat_AsDouble(curVal);
Py_DECREF(curVal);
curVal = PyList_GetItem(curCoord,2);
Py_INCREF(curVal);
coords[i].z = PyFloat_AsDouble(curVal);
Py_DECREF(curVal);
Py_DECREF(curCoord);
}
return coords;
}
pathCache findPath( double** S, double** dA, double** dB, int lenA, int lenB, float D0, float D1, int winSize, int gapMax, int * bufferSize )
{
// CE-specific cutoffs
const int MAX_KEPT = 20;
// the best Path's score
double bestPathScore = 1e6;
int bestPathLength = 0;
// length of longest possible alignment
int smaller = ( lenA < lenB ) ? lenA : lenB;
int winSum = (winSize-1)*(winSize-2)/2;
path bestPath = (path) malloc(sizeof(afp)*smaller);
// index variable for below
int i, j;
for ( i = 0; i < smaller; i++ ) {
bestPath[i].first = -1;
bestPath[i].second = -1;
}
//======================================================================
// for storing the best 20 paths
int bufferIndex = 0; //, bufferSize = 0;
int lenBuffer[MAX_KEPT];
double scoreBuffer[MAX_KEPT];
pathCache pathBuffer = (pathCache) malloc(sizeof(path*)*MAX_KEPT);
for ( i = 0; i < MAX_KEPT; i++ ) {
// initialize the paths
scoreBuffer[i] = 1e6;
lenBuffer[i] = 0;
pathBuffer[i] = 0;
}
// winCache
// this array stores a list of residues seen. We use it to calculate the
// total score of a path from 1..M and then add it to M+1..N.
int* winCache = (int*) malloc(sizeof(int)*smaller);
for ( i = 0; i < smaller; i++ )
winCache[i] = (i+1)*i*winSize/2 + (i+1)*winSum;
// allScoreBuffer
// this 2D array keeps track of all partial gapped scores
double** allScoreBuffer = (double**) malloc(sizeof(double*)*smaller);
for ( i = 0; i < smaller; i++ ) {
allScoreBuffer[i] = (double*) malloc( (gapMax*2+1) * sizeof(double));
// initialize the ASB
for ( j = 0; j < gapMax*2+1; j++ )
allScoreBuffer[i][j] = 1e6;
}
int* tIndex = (int*) malloc(sizeof(int)*smaller);
int gapBestIndex = -1;
//======================================================================
// Start the search through the CE matrix.
//
int iA, iB;
for ( iA = 0; iA < lenA; iA++ ) {
if ( iA > lenA - winSize*(bestPathLength-1) )
break;
for ( iB = 0; iB < lenB; iB++ ) {
if ( S[iA][iB] >= D0 )
continue;
if ( S[iA][iB] == -1.0 )
continue;
if ( iB > lenB - winSize*(bestPathLength-1) )
break;
//
// Restart curPath here.
//
path curPath = (path) malloc( sizeof(afp)*smaller );
int i;
for ( i = 0; i < smaller; i++ ) {
curPath[i].first = -1;
curPath[i].second = -1;
}
curPath[0].first = iA;
curPath[0].second = iB;
int curPathLength = 1;
tIndex[curPathLength-1] = 0;
double curTotalScore = 0.0;
//
// Check all possible paths starting from iA, iB
//
int done = 0;
while ( ! done ) {
double gapBestScore = 1e6;
gapBestIndex = -1;
int g;
//
// Check all possible gaps [1..gapMax] from here
//
for ( g = 0; g < (gapMax*2)+1; g++ ) {
int jA = curPath[curPathLength-1].first + winSize;
int jB = curPath[curPathLength-1].second + winSize;
if ( (g+1) % 2 == 0 ) {
jA += (g+1)/2;
}
else { // ( g odd )
jB += (g+1)/2;
}
//
// Following are three heuristics to ensure high quality
// long paths and make sure we don't run over the end of
// the S, matrix.
// 1st: If jA and jB are at the end of the matrix
if ( jA > lenA-winSize || jB > lenB-winSize ){
// FIXME, was: jA > lenA-winSize-1 || jB > lenB-winSize-1
continue;
}
// 2nd: If this gapped octapeptide is bad, ignore it.
if ( S[jA][jB] > D0 )
continue;
// 3rd: if too close to end, ignore it.
if ( S[jA][jB] == -1.0 )
continue;
double curScore = 0.0;
int s;
for ( s = 0; s < curPathLength; s++ ) {
curScore += fabs( dA[curPath[s].first][jA] - dB[curPath[s].second][jB] );
curScore += fabs( dA[curPath[s].first + (winSize-1)][jA+(winSize-1)] -
dB[curPath[s].second + (winSize-1)][jB+(winSize-1)] );
int k;
for ( k = 1; k < winSize-1; k++ )
curScore += fabs( dA[curPath[s].first + k][ jA + (winSize-1) - k ] -
dB[curPath[s].second + k][ jB + (winSize-1) - k ] );
}
curScore /= (double) winSize * (double) curPathLength;
if ( curScore >= D1 ) {
continue;
}
// store GAPPED best
if ( curScore < gapBestScore ) {
curPath[curPathLength].first = jA;
curPath[curPathLength].second = jB;
gapBestScore = curScore;
gapBestIndex = g;
allScoreBuffer[curPathLength-1][g] = curScore;
}
} /// ROF -- END GAP SEARCHING
//
// DONE GAPPING:
//
// calculate curTotalScore
curTotalScore = 0.0;
int jGap, gA, gB;
double score1=0.0, score2=0.0;
if ( gapBestIndex != -1 ) {
jGap = (gapBestIndex + 1 ) / 2;
if ((gapBestIndex + 1 ) % 2 == 0) {
gA = curPath[ curPathLength-1 ].first + winSize + jGap;
gB = curPath[ curPathLength-1 ].second + winSize;
}
else {
gA = curPath[ curPathLength-1 ].first + winSize;
gB = curPath[ curPathLength-1 ].second + winSize + jGap;
}
// perfect
score1 = (allScoreBuffer[curPathLength-1][gapBestIndex] * winSize * curPathLength
+ S[gA][gB]*winSum)/(winSize*curPathLength+winSum);
// perfect
score2 = ((curPathLength > 1 ? (allScoreBuffer[curPathLength-2][tIndex[curPathLength-1]])
: S[iA][iB])
* winCache[curPathLength-1]
+ score1 * (winCache[curPathLength] - winCache[curPathLength-1]))
/ winCache[curPathLength];
curTotalScore = score2;
// heuristic -- path is getting sloppy, stop looking
if ( curTotalScore > D1 ) {
done = 1;
gapBestIndex=-1;
break;
}
else {
allScoreBuffer[curPathLength-1][gapBestIndex] = curTotalScore;
tIndex[curPathLength] = gapBestIndex;
curPathLength++;
}
}
else {
// if here, then there was no good gapped path
// so quit and restart from iA, iB+1
done = 1;
curPathLength--;
break;
}
//
// test this gapped path against the best seen
// starting from iA, iB
//
// if our currently best gapped path from iA and iB is LONGER
// than the current best; or, it's equal length and the score's
// better, keep the new path.
if ( curPathLength > bestPathLength ||
(curPathLength == bestPathLength && curTotalScore < bestPathScore )) {
bestPathLength = curPathLength;
bestPathScore = curTotalScore;
// deep copy curPath
path tempPath = (path) malloc( sizeof(afp)*smaller );
int i;
for ( i = 0; i < smaller; i++ ) {
tempPath[i].first = curPath[i].first;
tempPath[i].second = curPath[i].second;
}
free(bestPath);
bestPath = tempPath;
}
} /// END WHILE
//
// At this point, we've found the best path starting at iA, iB.
//
if ( bestPathLength > lenBuffer[bufferIndex] ||
( bestPathLength == lenBuffer[bufferIndex] &&
bestPathScore < scoreBuffer[bufferIndex] )) {
// we're going to add an entry to the ring-buffer.
// Adjust maxSize values and curIndex accordingly.
bufferIndex = ( bufferIndex == MAX_KEPT-1 ) ? 0 : bufferIndex+1;
*bufferSize = ( *bufferSize < MAX_KEPT ) ? (*bufferSize)+1 : MAX_KEPT;
path pathCopy = (path) malloc( sizeof(afp)*smaller );
int i;
for ( i = 0; i < smaller; i++ ) {
pathCopy[i].first = bestPath[i].first;
pathCopy[i].second = bestPath[i].second;
}
if ( bufferIndex == 0 && (*bufferSize) == MAX_KEPT ) {
if ( pathBuffer[MAX_KEPT-1] )
free(pathBuffer[MAX_KEPT-1]);
pathBuffer[MAX_KEPT-1] = pathCopy;
scoreBuffer[MAX_KEPT-1] = bestPathScore;
lenBuffer[MAX_KEPT-1] = bestPathLength;
}
else {
if ( pathBuffer[bufferIndex-1] )
free(pathBuffer[bufferIndex-1]);
pathBuffer[bufferIndex-1] = pathCopy;
scoreBuffer[bufferIndex-1] = bestPathScore;
lenBuffer[bufferIndex-1] = bestPathLength;
}
}
free(curPath); curPath=0;
} // ROF -- end for iB
} // ROF -- end for iA
// free memory
for ( i = 0; i < smaller; i++ )
free(allScoreBuffer[i]);
free(allScoreBuffer);
free(tIndex);
free(winCache);
free(bestPath);
return pathBuffer;
}
PyObject* findBest( pcePoint coordsA, pcePoint coordsB, pathCache paths, int bufferSize, int smaller, int winSize )
{
// keep the best values
double bestRMSD = 1e6;
TA2<double> bestU;
TA1<double> bestCOM1, bestCOM2;
int bestLen = 0;
int bestO = -1;
// loop through the buffer
for ( int o = 0; o < bufferSize; o++ ) {
// grab the current path
TA2<double> c1(smaller, 3, 0.0);
TA2<double> c2(smaller, 3, 0.0);
int curLen = 0;
int j = 0; int it = 0;
while ( j < smaller ) {
// rebuild the coordinate lists for this path
if ( paths[o][j].first != -1 )
{
for ( int k = 0; k < winSize; k++ )
{
double t1[] = { coordsA[ paths[o][j].first +k ].x,
coordsA[ paths[o][j].first +k ].y,
coordsA[ paths[o][j].first +k ].z };
double t2[] = { coordsB[ paths[o][j].second+k ].x,
coordsB[ paths[o][j].second+k ].y,
coordsB[ paths[o][j].second+k ].z };
for ( int d = 0; d < c1.dim2(); d++ ) {
c1[it][d] = t1[d];
c2[it][d] = t2[d];
}
it++;
}
j++;
}
else {
curLen = it;
break;
}
}
//
// For convenience, let there be M points of N dimensions
//
int m = curLen;
int n = c2.dim2();
//==========================================================================
//
// Superpose the two proteins
//
//==========================================================================
// centers of mass for c1 and c2
TA1<double> c1COM(n,0.0);
TA1<double> c2COM(n,0.0);
// Calc CsOM
for (int i = 0; i < m; i++ )
{
for (int j = 0; j < n; j++ )
{
c1COM[j] += (double) c1[i][j] / (double) m;
c2COM[j] += (double) c2[i][j] / (double) m;
}
}
// Move the two vectors to the origin
for (int i = 0; i < m; i++ )
{
for (int j = 0; j < n; j++ )
{
c1[i][j] -= c1COM[j];
c2[i][j] -= c2COM[j];
}
}
//==========================================================================
//
// Calculate U and RMSD. This is broken down to the super-silly-easy
// math of: U = Wt * V, where Wt and V are NxN matrices from the SVD of
// R, the correlation matrix between the two origin-based vector sets.
//
//==========================================================================
// Calculate the initial residual, E0
// E0 = sum( Yn*Yn + Xn*Xn ) -- sum of squares
double E0 = 0.0;
for (int i = 0; i < m; i++ )
{
for (int j = 0; j < n; j++ )
{
E0 += (c1[i][j]*c1[i][j])+(c2[i][j]*c2[i][j]);
}
}
//
// SVD is the SVD of the correlation matrix Xt*Y
// R = c2' * c1 = W * S * Vt
JAMA::SVD<double> svd = JAMA::SVD<double>( TNT::matmult(transpose(c2), c1 ) );
// left singular vectors
TA2<double> W = TA2<double>(n,n);
// right singular vectors
TA2<double> Vt = TA2<double>(n,n);
// singular values
TA1<double> sigmas = TA1<double>(n);
svd.getU(W);
svd.getV(Vt);
Vt = transpose(Vt);
svd.getSingularValues(sigmas);
//
// Check any reflections before rotation of the points;
// if det(W)*det(V) == -1 then we just reflect
// the principal axis corresponding to the smallest eigenvalue by -1
//
JAMA::LU<double> LU_Vt(Vt);
JAMA::LU<double> LU_W(W);
if ( LU_W.det() * LU_Vt.det() < 0.0 )
{
//std::cout << "_________REFLECTION_________" << std::endl;
// revese the smallest axes and last sigma
for ( int i = 0; i < n; i++ )
W[n-1][i] = -W[n-1][i];
sigmas[n-1] = -sigmas[n-1];
}
// calculate the rotation matrix, U.
// U = W * Vt
TA2<double> U = TA2<double>(TNT::matmult(W, Vt));
//
// Now calculate the RMSD
//
double sig = 0.0;
for ( int i = 0; i < (int) n; i++ )
sig += sigmas[i];
double curRMSD = sqrt(fabs((E0 - 2*sig) / (double) m ));
//
// Save the best
//
if ( curRMSD < bestRMSD || ( curRMSD == bestRMSD && c1.dim1() > bestLen )) {
bestU = U.copy();
bestRMSD = curRMSD;
bestCOM1 = c1COM.copy();
bestCOM2 = c2COM.copy();
bestLen = curLen;
bestO = o;
}
}
if ( bestRMSD == 1e6 ) {
std::cout << "ERROR: Best RMSD found was 1e6. Broken.\n";
return nullptr;
}
PyObject* pyU = Py_BuildValue( "[f,f,f,f, f,f,f,f, f,f,f,f, f,f,f,f]",
bestU[0][0], bestU[1][0], bestU[2][0], bestCOM1[0],
bestU[0][1], bestU[1][1], bestU[2][1], bestCOM1[1],
bestU[0][2], bestU[1][2], bestU[2][2], bestCOM1[2],
-bestCOM2[0], -bestCOM2[1], -bestCOM2[2], 1.);
PyObject* pyPathA = PyList_New(0);
PyObject* pyPathB = PyList_New(0);
for (int j = 0; j < smaller && paths[bestO][j].first != -1; j++) {
PyObject* v = Py_BuildValue("i", paths[bestO][j].first);
PyList_Append(pyPathA, v);
Py_DECREF(v);
v = Py_BuildValue("i", paths[bestO][j].second);
PyList_Append(pyPathB, v);
Py_DECREF(v);
}
return Py_BuildValue("[ifNNN]", bestLen, bestRMSD, pyU, pyPathA, pyPathB);
}
TA2<double> transpose(const TA2<double>& v)
{
int m = (int) v.dim1();
int n = (int) v.dim2();
TA2<double> rVal(n,m);
for ( int i = 0; i < m; i++ )
for ( int j = 0; j < n; j++ )
rVal[j][i] = v[i][j];
return rVal;
}
#endif