pymol-open-source/layer2/USalign.cpp

/**
 * TM-align algorithm implementation for PyMOL.
 *
 * Ported from USalign (Zhang lab) to modern C++17 with glm math types.
 * Original algorithm: Zhang & Skolnick, Nucl. Acids Res. 33, 2302 (2005)
 *
 * Strips all file I/O — coordinates come from PyMOL selections.
 * Replaces double** with std::vector<glm::dvec3>, NewArray/DeleteArray with
 * RAII containers, scattered output params with result structs.
 */

/*
==============================================================================
   US-align: universal structure alignment of monomeric and complex proteins
   and nucleic acids

   This program was written by Chengxin Zhang at Yang Zhang lab,
   Department of Computational Medicine and Bioinformatics,
   University of Michigan, 100 Washtenaw Ave, Ann Arbor, MI 48109-2218.
   Please report issues to zhanglab@zhanggroup.org

   References:
   * C Zhang, M Shine, A Pyle, Y Zhang (2022) Nat Methods. 19(9), 1109-1115.
   * C Zhang, L Freddolino, Y Zhang (2025) Nature Protocols,
https://doi.org/10.1038/s41596-025-01189-x

   DISCLAIMER:
     Permission to use, copy, modify, and distribute this program for
     any purpose, with or without fee, is hereby granted, provided that
     the notices on the head, the reference information, and this
     copyright notice appear in all copies or substantial portions of
     the Software. It is provided "as is" without express or implied
     warranty.
===============================================================================
*/

#include "USalign.h"

#include <algorithm>
#include <cmath>
#include <numeric>
#include <vector>

#include <glm/glm.hpp>

namespace pymol::usalign
{

// ========================================================================
// DPWorkspace
// ========================================================================

void DPWorkspace::resize(int xlen, int ylen)
{
  rows = xlen + 1;
  cols = ylen + 1;
  score_flat.assign(static_cast<size_t>(rows) * cols, 0.0);
  val_flat.assign(static_cast<size_t>(rows) * cols, 0.0);
  path_flat.assign(static_cast<size_t>(rows) * cols, false);

  int minlen = std::min(xlen, ylen);
  xtm.resize(minlen);
  ytm.resize(minlen);
  xt.resize(xlen);
  r1.resize(minlen);
  r2.resize(minlen);
}

// ========================================================================
// Parameter initialization (from param_set.h)
// ========================================================================

struct TMParameters {
  double d0 = 0.0;
  double d0_search = 0.0;
  double Lnorm = 0.0;
  double score_d8 = 0.0;
  double D0_MIN = 0.0;
  double dcu0 = 0.0;
};

static TMParameters compute_search_params(int xlen, int ylen)
{
  TMParameters p;
  p.D0_MIN = 0.5;
  p.dcu0 = 4.25;

  p.Lnorm = std::min(xlen, ylen);
  if (p.Lnorm <= 19)
    p.d0 = 0.168;
  else
    p.d0 = 1.24 * std::pow(p.Lnorm - 15.0, 1.0 / 3.0) - 1.8;
  // During the search phase, d0 is inflated by 0.8 compared to final scoring.
  // This wider distance cutoff helps the iterative refinement explore more seeds.
  p.D0_MIN = p.d0 + 0.8;
  p.d0 = p.D0_MIN;

  p.d0_search = p.d0;
  if (p.d0_search > 8.0)
    p.d0_search = 8.0;
  if (p.d0_search < 4.5)
    p.d0_search = 4.5;

  p.score_d8 = 1.5 * std::pow(p.Lnorm, 0.3) + 3.5;
  return p;
}

static void compute_final_params(
    double len, double& D0_MIN, double& Lnorm, double& d0, double& d0_search)
{
  D0_MIN = 0.5;
  Lnorm = len;
  if (Lnorm <= 21)
    d0 = 0.5;
  else
    d0 = 1.24 * std::pow(Lnorm - 15.0, 1.0 / 3.0) - 1.8;
  if (d0 < D0_MIN)
    d0 = D0_MIN;
  d0_search = d0;
  if (d0_search > 8.0)
    d0_search = 8.0;
  if (d0_search < 4.5)
    d0_search = 4.5;
}

// ========================================================================
// Geometry utilities
// ========================================================================

static inline double dist2(const glm::dvec3& a, const glm::dvec3& b)
{
  auto d = a - b;
  return glm::dot(d, d);
}

static inline glm::dvec3 apply_transform(
    const glm::dvec3& v, const Superposition& sup)
{
  return sup.rotation * v + sup.translation;
}

static void do_rotation(const std::vector<glm::dvec3>& src,
    std::vector<glm::dvec3>& dst, int n, const Superposition& sup)
{
  for (int i = 0; i < n; i++)
    dst[i] = apply_transform(src[i], sup);
}

// ========================================================================
// Kabsch superposition (from Kabsch.h)
// ========================================================================

static bool kabsch(const std::vector<glm::dvec3>& x,
    const std::vector<glm::dvec3>& y, int n, int mode, double& rms,
    Superposition& sup)
{
  double e0, rms1, d, h, g;
  double cth, sth, sqrth, p, det, sigma;
  double xc[3], yc[3];
  double a[3][3], b[3][3], r[3][3], e[3], rr[6], ss[6];
  const double sqrt3 = 1.73205080756888;
  const double tol = 0.01;
  const int ip[] = {0, 1, 3, 1, 2, 4, 3, 4, 5};
  const int ip2312[] = {1, 2, 0, 1};

  int a_failed = 0, b_failed = 0;
  const double epsilon = 0.00000001;

  rms = 0;
  rms1 = 0;
  e0 = 0;

  double t[3];
  double u[3][3];

  double s1[3] = {0, 0, 0};
  double s2[3] = {0, 0, 0};
  double sx[3] = {0, 0, 0};
  double sy[3] = {0, 0, 0};
  double sz[3] = {0, 0, 0};

  for (int i = 0; i < 3; i++) {
    xc[i] = 0.0;
    yc[i] = 0.0;
    t[i] = 0.0;
    for (int j = 0; j < 3; j++) {
      u[i][j] = (i == j) ? 1.0 : 0.0;
      r[i][j] = 0.0;
      a[i][j] = (i == j) ? 1.0 : 0.0;
    }
  }

  if (n < 1)
    return false;

  // compute centers
  for (int i = 0; i < n; i++) {
    double c1[3] = {x[i].x, x[i].y, x[i].z};
    double c2[3] = {y[i].x, y[i].y, y[i].z};
    for (int j = 0; j < 3; j++) {
      s1[j] += c1[j];
      s2[j] += c2[j];
    }
    for (int j = 0; j < 3; j++) {
      sx[j] += c1[0] * c2[j];
      sy[j] += c1[1] * c2[j];
      sz[j] += c1[2] * c2[j];
    }
  }
  for (int i = 0; i < 3; i++) {
    xc[i] = s1[i] / n;
    yc[i] = s2[i] / n;
  }

  if (mode == 2 || mode == 0) {
    for (int mm = 0; mm < n; mm++) {
      double xx[3] = {x[mm].x, x[mm].y, x[mm].z};
      double yy[3] = {y[mm].x, y[mm].y, y[mm].z};
      for (int nn = 0; nn < 3; nn++)
        e0 += (xx[nn] - xc[nn]) * (xx[nn] - xc[nn]) +
              (yy[nn] - yc[nn]) * (yy[nn] - yc[nn]);
    }
  }

  for (int j = 0; j < 3; j++) {
    r[j][0] = sx[j] - s1[0] * s2[j] / n;
    r[j][1] = sy[j] - s1[1] * s2[j] / n;
    r[j][2] = sz[j] - s1[2] * s2[j] / n;
  }

  // compute determinant
  det = r[0][0] * (r[1][1] * r[2][2] - r[1][2] * r[2][1]) -
        r[0][1] * (r[1][0] * r[2][2] - r[1][2] * r[2][0]) +
        r[0][2] * (r[1][0] * r[2][1] - r[1][1] * r[2][0]);
  sigma = det;

  // compute trans(r)*r
  int m = 0;
  for (int j = 0; j < 3; j++)
    for (int i = 0; i <= j; i++)
      rr[m++] = r[0][i] * r[0][j] + r[1][i] * r[1][j] + r[2][i] * r[2][j];

  double spur = (rr[0] + rr[2] + rr[5]) / 3.0;
  double cof =
      (((((rr[2] * rr[5] - rr[4] * rr[4]) + rr[0] * rr[5]) - rr[3] * rr[3]) +
           rr[0] * rr[2]) -
          rr[1] * rr[1]) /
      3.0;
  det = det * det;

  for (int i = 0; i < 3; i++)
    e[i] = spur;

  if (spur > 0) {
    d = spur * spur;
    h = d - cof;
    g = (spur * cof - det) / 2.0 - spur * h;

    if (h > 0) {
      sqrth = std::sqrt(h);
      d = h * h * h - g * g;
      if (d < 0.0)
        d = 0.0;
      d = std::atan2(std::sqrt(d), -g) / 3.0;
      cth = sqrth * std::cos(d);
      sth = sqrth * sqrt3 * std::sin(d);
      e[0] = (spur + cth) + cth;
      e[1] = (spur - cth) + sth;
      e[2] = (spur - cth) - sth;

      if (mode != 0) {
        for (int l = 0; l < 3; l += 2) {
          d = e[l];
          ss[0] = (d - rr[2]) * (d - rr[5]) - rr[4] * rr[4];
          ss[1] = (d - rr[5]) * rr[1] + rr[3] * rr[4];
          ss[2] = (d - rr[0]) * (d - rr[5]) - rr[3] * rr[3];
          ss[3] = (d - rr[2]) * rr[3] + rr[1] * rr[4];
          ss[4] = (d - rr[0]) * rr[4] + rr[1] * rr[3];
          ss[5] = (d - rr[0]) * (d - rr[2]) - rr[1] * rr[1];

          for (int i = 0; i < 6; i++)
            if (std::fabs(ss[i]) <= epsilon)
              ss[i] = 0.0;

          int j;
          if (std::fabs(ss[0]) >= std::fabs(ss[2])) {
            j = 0;
            if (std::fabs(ss[0]) < std::fabs(ss[5]))
              j = 2;
          } else if (std::fabs(ss[2]) >= std::fabs(ss[5]))
            j = 1;
          else
            j = 2;

          d = 0.0;
          j = 3 * j;
          for (int i = 0; i < 3; i++) {
            int k = ip[i + j];
            a[i][l] = ss[k];
            d += ss[k] * ss[k];
          }

          if (d > epsilon)
            d = 1.0 / std::sqrt(d);
          else
            d = 0.0;
          for (int i = 0; i < 3; i++)
            a[i][l] *= d;
        }

        d = a[0][0] * a[0][2] + a[1][0] * a[1][2] + a[2][0] * a[2][2];
        int m1, m0;
        if ((e[0] - e[1]) > (e[1] - e[2])) {
          m1 = 2;
          m0 = 0;
        } else {
          m1 = 0;
          m0 = 2;
        }
        p = 0;
        for (int i = 0; i < 3; i++) {
          a[i][m1] -= d * a[i][m0];
          p += a[i][m1] * a[i][m1];
        }
        if (p <= tol) {
          p = 1.0;
          int j = 0;
          for (int i = 0; i < 3; i++) {
            if (p < std::fabs(a[i][m0]))
              continue;
            p = std::fabs(a[i][m0]);
            j = i;
          }
          int k = ip2312[j];
          int l = ip2312[j + 1];
          p = std::sqrt(a[k][m0] * a[k][m0] + a[l][m0] * a[l][m0]);
          if (p > tol) {
            a[j][m1] = 0.0;
            a[k][m1] = -a[l][m0] / p;
            a[l][m1] = a[k][m0] / p;
          } else
            a_failed = 1;
        } else {
          p = 1.0 / std::sqrt(p);
          for (int i = 0; i < 3; i++)
            a[i][m1] *= p;
        }
        if (a_failed != 1) {
          a[0][1] = a[1][2] * a[2][0] - a[1][0] * a[2][2];
          a[1][1] = a[2][2] * a[0][0] - a[2][0] * a[0][2];
          a[2][1] = a[0][2] * a[1][0] - a[0][0] * a[1][2];
        }
      }
    }

    if (mode != 0 && a_failed != 1) {
      for (int l = 0; l < 2; l++) {
        d = 0.0;
        for (int i = 0; i < 3; i++) {
          b[i][l] = r[i][0] * a[0][l] + r[i][1] * a[1][l] + r[i][2] * a[2][l];
          d += b[i][l] * b[i][l];
        }
        if (d > epsilon)
          d = 1.0 / std::sqrt(d);
        else
          d = 0.0;
        for (int i = 0; i < 3; i++)
          b[i][l] *= d;
      }
      d = b[0][0] * b[0][1] + b[1][0] * b[1][1] + b[2][0] * b[2][1];
      p = 0.0;
      for (int i = 0; i < 3; i++) {
        b[i][1] -= d * b[i][0];
        p += b[i][1] * b[i][1];
      }
      if (p <= tol) {
        p = 1.0;
        int j = 0;
        for (int i = 0; i < 3; i++) {
          if (p < std::fabs(b[i][0]))
            continue;
          p = std::fabs(b[i][0]);
          j = i;
        }
        int k = ip2312[j];
        int l = ip2312[j + 1];
        p = std::sqrt(b[k][0] * b[k][0] + b[l][0] * b[l][0]);
        if (p > tol) {
          b[j][1] = 0.0;
          b[k][1] = -b[l][0] / p;
          b[l][1] = b[k][0] / p;
        } else
          b_failed = 1;
      } else {
        p = 1.0 / std::sqrt(p);
        for (int i = 0; i < 3; i++)
          b[i][1] *= p;
      }
      if (b_failed != 1) {
        b[0][2] = b[1][0] * b[2][1] - b[1][1] * b[2][0];
        b[1][2] = b[2][0] * b[0][1] - b[2][1] * b[0][0];
        b[2][2] = b[0][0] * b[1][1] - b[0][1] * b[1][0];
        for (int i = 0; i < 3; i++)
          for (int j = 0; j < 3; j++)
            u[i][j] = b[i][0] * a[j][0] + b[i][1] * a[j][1] + b[i][2] * a[j][2];
      }
      for (int i = 0; i < 3; i++)
        t[i] = ((yc[i] - u[i][0] * xc[0]) - u[i][1] * xc[1]) - u[i][2] * xc[2];
    }
  } else {
    for (int i = 0; i < 3; i++)
      t[i] = ((yc[i] - u[i][0] * xc[0]) - u[i][1] * xc[1]) - u[i][2] * xc[2];
  }

  // compute rms
  for (int i = 0; i < 3; i++) {
    if (e[i] < 0)
      e[i] = 0;
    e[i] = std::sqrt(e[i]);
  }
  d = e[2];
  if (sigma < 0.0)
    d = -d;
  d = (d + e[1]) + e[0];

  if (mode == 2 || mode == 0) {
    rms1 = (e0 - d) - d;
    if (rms1 < 0.0)
      rms1 = 0.0;
  }

  rms = rms1;

  // convert to glm types
  sup.translation = glm::dvec3(t[0], t[1], t[2]);
  sup.rotation = glm::dmat3(u[0][0], u[1][0], u[2][0], u[0][1], u[1][1],
      u[2][1], u[0][2], u[1][2], u[2][2]);

  return true;
}

// ========================================================================
// Secondary structure assignment (from se.h:make_sec)
// ========================================================================

static char sec_str_classify(double dis13, double dis14, double dis15,
    double dis24, double dis25, double dis35)
{
  double delta = 2.1;
  if (std::fabs(dis15 - 6.37) < delta && std::fabs(dis14 - 5.18) < delta &&
      std::fabs(dis25 - 5.18) < delta && std::fabs(dis13 - 5.45) < delta &&
      std::fabs(dis24 - 5.45) < delta && std::fabs(dis35 - 5.45) < delta)
    return 'H';

  delta = 1.42;
  if (std::fabs(dis15 - 13.0) < delta && std::fabs(dis14 - 10.4) < delta &&
      std::fabs(dis25 - 10.4) < delta && std::fabs(dis13 - 6.1) < delta &&
      std::fabs(dis24 - 6.1) < delta && std::fabs(dis35 - 6.1) < delta)
    return 'E';

  if (dis15 < 8)
    return 'T';
  return 'C';
}

static void assign_secondary_structure(
    const std::vector<glm::dvec3>& ca, std::vector<char>& sec)
{
  int len = static_cast<int>(ca.size());
  sec.assign(len, 'C');

  for (int i = 0; i < len; i++) {
    int j1 = i - 2, j2 = i - 1, j3 = i, j4 = i + 1, j5 = i + 2;
    if (j1 >= 0 && j5 < len) {
      double d13 = std::sqrt(dist2(ca[j1], ca[j3]));
      double d14 = std::sqrt(dist2(ca[j1], ca[j4]));
      double d15 = std::sqrt(dist2(ca[j1], ca[j5]));
      double d24 = std::sqrt(dist2(ca[j2], ca[j4]));
      double d25 = std::sqrt(dist2(ca[j2], ca[j5]));
      double d35 = std::sqrt(dist2(ca[j3], ca[j5]));
      sec[i] = sec_str_classify(d13, d14, d15, d24, d25, d35);
    }
  }
}

// ========================================================================
// Scoring functions (from TMalign.h:score_fun8)
// ========================================================================

// Lnorm > 0: normalize by Lnorm (used during main search)
// Lnorm <= 0: normalize by n_ali (used during standard/final scoring)
static int score_fun8(const std::vector<glm::dvec3>& xa,
    const std::vector<glm::dvec3>& ya, int n_ali, double d,
    std::vector<int>& i_ali, double& score1, int score_sum_method,
    double score_d8, double d0, double Lnorm = 0.0)
{
  double score_sum = 0;
  double d_tmp = d * d;
  double d02 = d0 * d0;
  double score_d8_cut = score_d8 * score_d8;

  int n_cut, inc = 0;
  while (true) {
    n_cut = 0;
    score_sum = 0;
    for (int i = 0; i < n_ali; i++) {
      double di = dist2(xa[i], ya[i]);
      if (di < d_tmp) {
        i_ali[n_cut] = i;
        n_cut++;
      }
      if (score_sum_method == 8) {
        if (di <= score_d8_cut)
          score_sum += 1.0 / (1.0 + di / d02);
      } else
        score_sum += 1.0 / (1.0 + di / d02);
    }
    if (n_cut < 3 && n_ali > 3) {
      inc++;
      double dinc = d + inc * 0.5;
      d_tmp = dinc * dinc;
    } else
      break;
  }
  score1 = score_sum / (Lnorm > 0.0 ? Lnorm : n_ali);
  return n_cut;
}

// ========================================================================
// TMscore8_search — iterative fragment refinement (from TMalign.h)
// ========================================================================

// Lnorm > 0: normalize by Lnorm (used during main search)
// Lnorm <= 0: normalize by n_ali (used during standard/final scoring)
static double TMscore8_search(DPWorkspace& ws, int Lali, Superposition& sup_out,
    int simplify_step, int score_sum_method, double local_d0_search,
    double score_d8, double d0, double Lnorm = 0.0)
{
  double score_max = -1, score, rmsd;
  const int n_it = 20;
  const int n_init_max = 6;
  int L_ini[6];
  int L_ini_min = 4;
  if (Lali < L_ini_min)
    L_ini_min = Lali;

  int n_init = 0;
  int i;
  for (i = 0; i < n_init_max - 1; i++) {
    n_init++;
    L_ini[i] = static_cast<int>(Lali / std::pow(2.0, static_cast<double>(i)));
    if (L_ini[i] <= L_ini_min) {
      L_ini[i] = L_ini_min;
      break;
    }
  }
  if (i == n_init_max - 1) {
    n_init++;
    L_ini[i] = L_ini_min;
  }

  std::vector<int> i_ali(Lali);
  std::vector<int> k_ali(Lali);
  Superposition sup_tmp;

  for (int i_init = 0; i_init < n_init; i_init++) {
    int L_frag = L_ini[i_init];
    int iL_max = Lali - L_frag;

    i = 0;
    while (true) {
      for (int k = 0; k < L_frag; k++) {
        int kk = k + i;
        ws.r1[k] = ws.xtm[kk];
        ws.r2[k] = ws.ytm[kk];
        k_ali[k] = kk;
      }
      kabsch(ws.r1, ws.r2, L_frag, 1, rmsd, sup_tmp);
      do_rotation(ws.xtm, ws.xt, Lali, sup_tmp);

      double d_cut = local_d0_search - 1;
      int n_cut = score_fun8(ws.xt, ws.ytm, Lali, d_cut, i_ali, score,
          score_sum_method, score_d8, d0, Lnorm);
      if (score > score_max) {
        score_max = score;
        sup_out = sup_tmp;
      }

      // iterative refinement
      d_cut = local_d0_search + 1;
      for (int it = 0; it < n_it; it++) {
        int ka = n_cut;
        for (int k = 0; k < n_cut; k++) {
          int m = i_ali[k];
          ws.r1[k] = ws.xtm[m];
          ws.r2[k] = ws.ytm[m];
          k_ali[k] = m;
        }

        kabsch(ws.r1, ws.r2, n_cut, 1, rmsd, sup_tmp);
        do_rotation(ws.xtm, ws.xt, Lali, sup_tmp);
        n_cut = score_fun8(ws.xt, ws.ytm, Lali, d_cut, i_ali, score,
            score_sum_method, score_d8, d0, Lnorm);
        if (score > score_max) {
          score_max = score;
          sup_out = sup_tmp;
        }

        if (n_cut == ka) {
          int k;
          for (k = 0; k < n_cut; k++)
            if (i_ali[k] != k_ali[k])
              break;
          if (k == n_cut)
            break;
        }
      }

      if (i < iL_max) {
        i += simplify_step;
        if (i > iL_max)
          i = iL_max;
      } else
        break;
    }
  }
  return score_max;
}

// ========================================================================
// Detailed search (from TMalign.h:detailed_search)
// ========================================================================

// Lnorm > 0: normalize by Lnorm (used during main search)
// Lnorm <= 0: normalize by n_ali (used during standard/final scoring)
static double detailed_search(DPWorkspace& ws, const std::vector<glm::dvec3>& x,
    const std::vector<glm::dvec3>& y, int xlen, int ylen,
    const std::vector<int>& invmap, Superposition& sup, int simplify_step,
    int score_sum_method, double local_d0_search, double score_d8, double d0,
    double Lnorm = 0.0)
{
  int k = 0;
  for (int i = 0; i < ylen; i++) {
    int j = invmap[i];
    if (j >= 0) {
      ws.xtm[k] = x[j];
      ws.ytm[k] = y[i];
      k++;
    }
  }
  return TMscore8_search(
      ws, k, sup, simplify_step, score_sum_method, local_d0_search,
      score_d8, d0, Lnorm);
}

// ========================================================================
// Get score fast — quick 3-pass evaluation (from TMalign.h:get_score_fast)
// ========================================================================

static double get_score_fast(DPWorkspace& ws, const std::vector<glm::dvec3>& x,
    const std::vector<glm::dvec3>& y, int xlen, int ylen,
    const std::vector<int>& invmap, double d0, double d0_search,
    Superposition& sup)
{
  double rms;
  int k = 0;
  for (int j = 0; j < ylen; j++) {
    int i = invmap[j];
    if (i >= 0) {
      ws.r1[k] = x[i];
      ws.r2[k] = y[j];
      ws.xtm[k] = x[i];
      ws.ytm[k] = y[j];
      k++;
    }
  }
  kabsch(ws.r1, ws.r2, k, 1, rms, sup);

  double d02 = d0 * d0;
  double d00 = d0_search;
  double d002 = d00 * d00;
  int n_ali = k;

  std::vector<double> dis(n_ali);
  double tmscore = 0;
  for (int kk = 0; kk < n_ali; kk++) {
    auto xrot = apply_transform(ws.xtm[kk], sup);
    double di = dist2(xrot, ws.ytm[kk]);
    dis[kk] = di;
    tmscore += 1.0 / (1.0 + di / d02);
  }

  // second iteration
  double d002t = d002;
  {
    std::vector<double> dis_sorted(dis.begin(), dis.end());
    std::sort(dis_sorted.begin(), dis_sorted.end());
    if (n_ali > 2 && d002t < dis_sorted[2])
      d002t = dis_sorted[2];
  }
  while (true) {
    int j = 0;
    for (int kk = 0; kk < n_ali; kk++) {
      if (dis[kk] <= d002t) {
        ws.r1[j] = ws.xtm[kk];
        ws.r2[j] = ws.ytm[kk];
        j++;
      }
    }
    if (j < 3 && n_ali > 3)
      d002t += 0.5;
    else
      break;
  }

  double tmscore1, tmscore2;
  int j_count;
  {
    // count how many passed
    j_count = 0;
    for (int kk = 0; kk < n_ali; kk++)
      if (dis[kk] <= d002t)
        j_count++;
  }

  if (n_ali != j_count) {
    Superposition sup2;
    kabsch(ws.r1, ws.r2, j_count, 1, rms, sup2);
    tmscore1 = 0;
    for (int kk = 0; kk < n_ali; kk++) {
      auto xrot = apply_transform(ws.xtm[kk], sup2);
      double di = dist2(xrot, ws.ytm[kk]);
      dis[kk] = di;
      tmscore1 += 1.0 / (1.0 + di / d02);
    }

    // third iteration
    d002t = d002 + 1;
    {
      std::vector<double> dis_sorted(dis.begin(), dis.end());
      std::sort(dis_sorted.begin(), dis_sorted.end());
      if (n_ali > 2 && d002t < dis_sorted[2])
        d002t = dis_sorted[2];
    }
    while (true) {
      int j = 0;
      for (int kk = 0; kk < n_ali; kk++) {
        if (dis[kk] <= d002t) {
          ws.r1[j] = ws.xtm[kk];
          ws.r2[j] = ws.ytm[kk];
          j++;
        }
      }
      if (j < 3 && n_ali > 3)
        d002t += 0.5;
      else {
        j_count = j;
        break;
      }
    }

    Superposition sup3;
    kabsch(ws.r1, ws.r2, j_count, 1, rms, sup3);
    tmscore2 = 0;
    for (int kk = 0; kk < n_ali; kk++) {
      auto xrot = apply_transform(ws.xtm[kk], sup3);
      double di = dist2(xrot, ws.ytm[kk]);
      tmscore2 += 1.0 / (1.0 + di / d02);
    }
  } else {
    tmscore1 = tmscore;
    tmscore2 = tmscore;
  }

  if (tmscore1 >= tmscore)
    tmscore = tmscore1;
  if (tmscore2 >= tmscore)
    tmscore = tmscore2;
  return tmscore;
}

// ========================================================================
// Needleman-Wunsch DP variants (from NW.h)
// ========================================================================

// NW with precomputed score matrix
static void nw_align_score(
    DPWorkspace& ws, int len1, int len2, double gap_open, std::vector<int>& j2i)
{
  for (int i = 0; i <= len1; i++) {
    ws.val(i, 0) = 0;
    ws.set_path(i, 0, false);
  }
  for (int j = 0; j <= len2; j++) {
    ws.val(0, j) = 0;
    ws.set_path(0, j, false);
  }
  for (int j = 0; j < len2; j++)
    j2i[j] = -1;

  for (int i = 1; i <= len1; i++) {
    for (int j = 1; j <= len2; j++) {
      double d = ws.val(i - 1, j - 1) + ws.score(i, j);
      double h = ws.val(i - 1, j);
      if (ws.path(i - 1, j))
        h += gap_open;
      double v = ws.val(i, j - 1);
      if (ws.path(i, j - 1))
        v += gap_open;

      if (d >= h && d >= v) {
        ws.set_path(i, j, true);
        ws.val(i, j) = d;
      } else {
        ws.set_path(i, j, false);
        ws.val(i, j) = (v >= h) ? v : h;
      }
    }
  }

  // traceback
  int i = len1, j = len2;
  while (i > 0 && j > 0) {
    if (ws.path(i, j)) {
      j2i[j - 1] = i - 1;
      i--;
      j--;
    } else {
      double h = ws.val(i - 1, j);
      if (ws.path(i - 1, j))
        h += gap_open;
      double v = ws.val(i, j - 1);
      if (ws.path(i, j - 1))
        v += gap_open;
      if (v >= h)
        j--;
      else
        i--;
    }
  }
}

// NW with geometric scoring using rotation
static void nw_align_transform(DPWorkspace& ws,
    const std::vector<glm::dvec3>& x, const std::vector<glm::dvec3>& y,
    int len1, int len2, const Superposition& sup, double d02, double gap_open,
    std::vector<int>& j2i)
{
  for (int i = 0; i <= len1; i++) {
    ws.val(i, 0) = 0;
    ws.set_path(i, 0, false);
  }
  for (int j = 0; j <= len2; j++) {
    ws.val(0, j) = 0;
    ws.set_path(0, j, false);
  }
  for (int j = 0; j < len2; j++)
    j2i[j] = -1;

  for (int i = 1; i <= len1; i++) {
    auto xx = apply_transform(x[i - 1], sup);
    for (int j = 1; j <= len2; j++) {
      double dij = dist2(xx, y[j - 1]);
      double d = ws.val(i - 1, j - 1) + 1.0 / (1.0 + dij / d02);
      double h = ws.val(i - 1, j);
      if (ws.path(i - 1, j))
        h += gap_open;
      double v = ws.val(i, j - 1);
      if (ws.path(i, j - 1))
        v += gap_open;

      if (d >= h && d >= v) {
        ws.set_path(i, j, true);
        ws.val(i, j) = d;
      } else {
        ws.set_path(i, j, false);
        ws.val(i, j) = (v >= h) ? v : h;
      }
    }
  }

  int i = len1, j = len2;
  while (i > 0 && j > 0) {
    if (ws.path(i, j)) {
      j2i[j - 1] = i - 1;
      i--;
      j--;
    } else {
      double h = ws.val(i - 1, j);
      if (ws.path(i - 1, j))
        h += gap_open;
      double v = ws.val(i, j - 1);
      if (ws.path(i, j - 1))
        v += gap_open;
      if (v >= h)
        j--;
      else
        i--;
    }
  }
}

// NW with secondary structure matching
static void nw_align_ss(DPWorkspace& ws, const std::vector<char>& secx,
    const std::vector<char>& secy, int len1, int len2, double gap_open,
    std::vector<int>& j2i)
{
  for (int i = 0; i <= len1; i++) {
    ws.val(i, 0) = 0;
    ws.set_path(i, 0, false);
  }
  for (int j = 0; j <= len2; j++) {
    ws.val(0, j) = 0;
    ws.set_path(0, j, false);
  }
  for (int j = 0; j < len2; j++)
    j2i[j] = -1;

  for (int i = 1; i <= len1; i++) {
    for (int j = 1; j <= len2; j++) {
      double d = ws.val(i - 1, j - 1) + 1.0 * (secx[i - 1] == secy[j - 1]);
      double h = ws.val(i - 1, j);
      if (ws.path(i - 1, j))
        h += gap_open;
      double v = ws.val(i, j - 1);
      if (ws.path(i, j - 1))
        v += gap_open;

      if (d >= h && d >= v) {
        ws.set_path(i, j, true);
        ws.val(i, j) = d;
      } else {
        ws.set_path(i, j, false);
        ws.val(i, j) = (v >= h) ? v : h;
      }
    }
  }

  int i = len1, j = len2;
  while (i > 0 && j > 0) {
    if (ws.path(i, j)) {
      j2i[j - 1] = i - 1;
      i--;
      j--;
    } else {
      double h = ws.val(i - 1, j);
      if (ws.path(i - 1, j))
        h += gap_open;
      double v = ws.val(i, j - 1);
      if (ws.path(i, j - 1))
        v += gap_open;
      if (v >= h)
        j--;
      else
        i--;
    }
  }
}

// ========================================================================
// Initial alignment seeds
// ========================================================================

// Seed 1: gapless threading
static double get_initial(DPWorkspace& ws, const std::vector<glm::dvec3>& x,
    const std::vector<glm::dvec3>& y, int xlen, int ylen, std::vector<int>& y2x,
    double d0, double d0_search, bool fast_opt, Superposition& sup)
{
  int min_len = std::min(xlen, ylen);
  int min_ali = min_len / 2;
  if (min_ali <= 5)
    min_ali = 5;
  int n1 = -ylen + min_ali;
  int n2 = xlen - min_ali;

  int k_best = n1;
  double tmscore_max = -1;

  for (int k = n1; k <= n2; k += (fast_opt ? 5 : 1)) {
    for (int j = 0; j < ylen; j++) {
      int i = j + k;
      y2x[j] = (i >= 0 && i < xlen) ? i : -1;
    }

    double tmscore =
        get_score_fast(ws, x, y, xlen, ylen, y2x, d0, d0_search, sup);
    if (tmscore >= tmscore_max) {
      tmscore_max = tmscore;
      k_best = k;
    }
  }

  for (int j = 0; j < ylen; j++) {
    int i = j + k_best;
    y2x[j] = (i >= 0 && i < xlen) ? i : -1;
  }
  return tmscore_max;
}

// Seed 2: secondary structure alignment
static void get_initial_ss(DPWorkspace& ws, const std::vector<char>& secx,
    const std::vector<char>& secy, int xlen, int ylen, std::vector<int>& y2x)
{
  nw_align_ss(ws, secx, secy, xlen, ylen, -1.0, y2x);
}

// Seed 3: local fragment superposition
static bool get_initial5(DPWorkspace& ws, const std::vector<glm::dvec3>& x,
    const std::vector<glm::dvec3>& y, int xlen, int ylen, std::vector<int>& y2x,
    double d0, double d0_search, bool fast_opt, double D0_MIN)
{
  double rmsd;
  Superposition sup;

  double d01 = d0 + 1.5;
  if (d01 < D0_MIN)
    d01 = D0_MIN;
  double d02 = d01 * d01;

  double GLmax = 0;
  int aL = std::min(xlen, ylen);
  std::vector<int> invmap(ylen, -1);

  int n_jump1;
  if (xlen > 250)
    n_jump1 = 45;
  else if (xlen > 200)
    n_jump1 = 35;
  else if (xlen > 150)
    n_jump1 = 25;
  else
    n_jump1 = 15;
  if (n_jump1 > xlen / 3)
    n_jump1 = xlen / 3;

  int n_jump2;
  if (ylen > 250)
    n_jump2 = 45;
  else if (ylen > 200)
    n_jump2 = 35;
  else if (ylen > 150)
    n_jump2 = 25;
  else
    n_jump2 = 15;
  if (n_jump2 > ylen / 3)
    n_jump2 = ylen / 3;

  int n_frag[2] = {20, 100};
  if (n_frag[0] > aL / 3)
    n_frag[0] = aL / 3;
  if (n_frag[1] > aL / 2)
    n_frag[1] = aL / 2;

  if (fast_opt) {
    n_jump1 *= 5;
    n_jump2 *= 5;
  }

  bool flag = false;
  for (int i_frag = 0; i_frag < 2; i_frag++) {
    int m1 = xlen - n_frag[i_frag] + 1;
    int m2 = ylen - n_frag[i_frag] + 1;

    for (int i = 0; i < m1; i += n_jump1) {
      for (int j = 0; j < m2; j += n_jump2) {
        for (int k = 0; k < n_frag[i_frag]; k++) {
          ws.r1[k] = x[k + i];
          ws.r2[k] = y[k + j];
        }

        kabsch(ws.r1, ws.r2, n_frag[i_frag], 1, rmsd, sup);
        nw_align_transform(ws, x, y, xlen, ylen, sup, d02, 0.0, invmap);
        double GL =
            get_score_fast(ws, x, y, xlen, ylen, invmap, d0, d0_search, sup);
        if (GL > GLmax) {
          GLmax = GL;
          for (int ii = 0; ii < ylen; ii++)
            y2x[ii] = invmap[ii];
          flag = true;
        }
      }
    }
  }
  return flag;
}

// Score matrix for ssplus seed
static void score_matrix_rmsd_sec(DPWorkspace& ws,
    const std::vector<char>& secx, const std::vector<char>& secy,
    const std::vector<glm::dvec3>& x, const std::vector<glm::dvec3>& y,
    int xlen, int ylen, const std::vector<int>& y2x, double D0_MIN, double d0)
{
  Superposition sup;
  double rmsd;
  double d01 = d0 + 1.5;
  if (d01 < D0_MIN)
    d01 = D0_MIN;
  double d02 = d01 * d01;

  int k = 0;
  for (int j = 0; j < ylen; j++) {
    int i = y2x[j];
    if (i >= 0) {
      ws.r1[k] = x[i];
      ws.r2[k] = y[j];
      k++;
    }
  }
  kabsch(ws.r1, ws.r2, k, 1, rmsd, sup);

  for (int ii = 0; ii < xlen; ii++) {
    auto xx = apply_transform(x[ii], sup);
    for (int jj = 0; jj < ylen; jj++) {
      double dij = dist2(xx, y[jj]);
      if (secx[ii] == secy[jj])
        ws.score(ii + 1, jj + 1) = 1.0 / (1.0 + dij / d02) + 0.5;
      else
        ws.score(ii + 1, jj + 1) = 1.0 / (1.0 + dij / d02);
    }
  }
}

// Seed 4: secondary structure plus structural distance
static void get_initial_ssplus(DPWorkspace& ws, const std::vector<char>& secx,
    const std::vector<char>& secy, const std::vector<glm::dvec3>& x,
    const std::vector<glm::dvec3>& y, int xlen, int ylen,
    const std::vector<int>& y2x0, std::vector<int>& y2x, double D0_MIN,
    double d0)
{
  score_matrix_rmsd_sec(ws, secx, secy, x, y, xlen, ylen, y2x0, D0_MIN, d0);
  nw_align_score(ws, xlen, ylen, -1.0, y2x);
}

// Find max continuous fragment
static void find_max_frag(const std::vector<glm::dvec3>& x, int len,
    int& start_max, int& end_max, double dcu0, bool fast_opt)
{
  int fra_min = fast_opt ? 8 : 4;
  int r_min = static_cast<int>(len / 3.0);
  if (r_min > fra_min)
    r_min = fra_min;

  int Lfr_max = 0;
  double dcu_cut = dcu0 * dcu0;
  int inc = 0;

  while (Lfr_max < r_min) {
    Lfr_max = 0;
    int j = 1;
    int start = 0;
    for (int i = 1; i < len; i++) {
      if (dist2(x[i - 1], x[i]) < dcu_cut) {
        j++;
        if (i == len - 1) {
          if (j > Lfr_max) {
            Lfr_max = j;
            start_max = start;
            end_max = i;
          }
          j = 1;
        }
      } else {
        if (j > Lfr_max) {
          Lfr_max = j;
          start_max = start;
          end_max = i - 1;
        }
        j = 1;
        start = i;
      }
    }
    if (Lfr_max < r_min) {
      inc++;
      double dinc = std::pow(1.1, static_cast<double>(inc)) * dcu0;
      dcu_cut = dinc * dinc;
    }
  }
}

// Seed 5: fragment gapless threading
static double get_initial_fgt(DPWorkspace& ws, const std::vector<glm::dvec3>& x,
    const std::vector<glm::dvec3>& y, int xlen, int ylen, std::vector<int>& y2x,
    double d0, double d0_search, double dcu0, bool fast_opt, Superposition& sup)
{
  int fra_min = fast_opt ? 8 : 4;
  int fra_min1 = fra_min - 1;

  int xstart = 0, ystart = 0, xend = 0, yend = 0;
  find_max_frag(x, xlen, xstart, xend, dcu0, fast_opt);
  find_max_frag(y, ylen, ystart, yend, dcu0, fast_opt);

  int Lx = xend - xstart + 1;
  int Ly = yend - ystart + 1;
  int L_fr = std::min(Lx, Ly);
  std::vector<int> ifr(L_fr);
  std::vector<int> y2x_(ylen, -1);

  if (Lx < Ly || (Lx == Ly && xlen <= ylen)) {
    for (int i = 0; i < L_fr; i++)
      ifr[i] = xstart + i;
  } else {
    for (int i = 0; i < L_fr; i++)
      ifr[i] = ystart + i;
  }

  int L0 = std::min(xlen, ylen);
  if (L_fr == L0) {
    int n1 = static_cast<int>(L0 * 0.1);
    int n2 = static_cast<int>(L0 * 0.89);
    int j = 0;
    for (int i = n1; i <= n2; i++)
      ifr[j++] = ifr[i];
    L_fr = j;
  }

  double tmscore_max = -1;

  if (Lx < Ly || (Lx == Ly && xlen <= ylen)) {
    int L1 = L_fr;
    int min_len = std::min(L1, ylen);
    int min_ali = static_cast<int>(min_len / 2.5);
    if (min_ali <= fra_min1)
      min_ali = fra_min1;
    int n1 = -ylen + min_ali;
    int n2 = L1 - min_ali;

    for (int k = n1; k <= n2; k += (fast_opt ? 3 : 1)) {
      for (int j = 0; j < ylen; j++) {
        int i = j + k;
        y2x_[j] = (i >= 0 && i < L1) ? ifr[i] : -1;
      }

      double tmscore =
          get_score_fast(ws, x, y, xlen, ylen, y2x_, d0, d0_search, sup);
      if (tmscore >= tmscore_max) {
        tmscore_max = tmscore;
        for (int j = 0; j < ylen; j++)
          y2x[j] = y2x_[j];
      }
    }
  } else {
    int L2 = L_fr;
    int min_len = std::min(xlen, L2);
    int min_ali = static_cast<int>(min_len / 2.5);
    if (min_ali <= fra_min1)
      min_ali = fra_min1;
    int n1 = -L2 + min_ali;
    int n2 = xlen - min_ali;

    for (int k = n1; k <= n2; k++) {
      for (int j = 0; j < ylen; j++)
        y2x_[j] = -1;

      for (int j = 0; j < L2; j++) {
        int i = j + k;
        if (i >= 0 && i < xlen)
          y2x_[ifr[j]] = i;
      }

      double tmscore =
          get_score_fast(ws, x, y, xlen, ylen, y2x_, d0, d0_search, sup);
      if (tmscore >= tmscore_max) {
        tmscore_max = tmscore;
        for (int j = 0; j < ylen; j++)
          y2x[j] = y2x_[j];
      }
    }
  }

  return tmscore_max;
}

// ========================================================================
// DP_iter — iterative DP refinement (from TMalign.h:DP_iter)
// ========================================================================

static double DP_iter(DPWorkspace& ws, const std::vector<glm::dvec3>& x,
    const std::vector<glm::dvec3>& y, int xlen, int ylen, Superposition& sup,
    std::vector<int>& invmap0, int g1, int g2, int iteration_max,
    double local_d0_search, double D0_MIN, double Lnorm, double d0,
    double score_d8)
{
  double gap_open[2] = {-0.6, 0};
  double rmsd;
  std::vector<int> invmap(ylen, -1);

  double tmscore_max = -1, tmscore_old = 0;
  int score_sum_method = 8, simplify_step = 40;
  double d02 = d0 * d0;

  for (int g = g1; g < g2; g++) {
    for (int iteration = 0; iteration < iteration_max; iteration++) {
      nw_align_transform(ws, x, y, xlen, ylen, sup, d02, gap_open[g], invmap);

      int k = 0;
      for (int j = 0; j < ylen; j++) {
        int i = invmap[j];
        if (i >= 0) {
          ws.xtm[k] = x[i];
          ws.ytm[k] = y[j];
          k++;
        }
      }

      double tmscore = TMscore8_search(ws, k, sup, simplify_step,
          score_sum_method, local_d0_search, score_d8, d0, Lnorm);

      if (tmscore > tmscore_max) {
        tmscore_max = tmscore;
        for (int i = 0; i < ylen; i++)
          invmap0[i] = invmap[i];
      }

      if (iteration > 0) {
        if (std::fabs(tmscore_old - tmscore) < 0.000001)
          break;
      }
      tmscore_old = tmscore;
    }
  }
  return tmscore_max;
}

// ========================================================================
// Main orchestrator: tm_align
// ========================================================================

TMAlignResult TMalign(const std::vector<glm::dvec3>& target_ca,
    const std::vector<glm::dvec3>& mobile_ca, const std::string& target_seq,
    const std::string& mobile_seq, bool fast)
{
  // In USalign convention: x = mobile (structure to move), y = target
  // The invmap maps target indices to mobile indices: invmap[j_target] =
  // i_mobile
  const auto& x = mobile_ca;
  const auto& y = target_ca;
  const auto& seqx = mobile_seq;
  const auto& seqy = target_seq;
  int xlen = static_cast<int>(x.size());
  int ylen = static_cast<int>(y.size());

  TMAlignResult result;

  if (xlen < 3 || ylen < 3)
    return result;

  // Secondary structure
  std::vector<char> secx, secy;
  assign_secondary_structure(x, secx);
  assign_secondary_structure(y, secy);

  // Allocate workspace
  DPWorkspace ws;
  ws.resize(xlen, ylen);

  // Parameters
  auto params = compute_search_params(xlen, ylen);
  int simplify_step = 40;
  int score_sum_method = 8;

  std::vector<int> invmap0(ylen, -1);
  std::vector<int> invmap(ylen, -1);
  double TMmax = -1;
  Superposition sup, sup_best;

  double ddcc = (params.Lnorm <= 40) ? 0.1 : 0.4;
  double local_d0_search = params.d0_search;

  // ============================================
  // Seed 1: gapless threading
  // ============================================
  get_initial(
      ws, x, y, xlen, ylen, invmap0, params.d0, params.d0_search, fast, sup);
  double TM = detailed_search(ws, x, y, xlen, ylen, invmap0, sup, simplify_step,
      score_sum_method, local_d0_search, params.score_d8, params.d0,
      params.Lnorm);
  if (TM > TMmax)
    TMmax = TM;

  TM = DP_iter(ws, x, y, xlen, ylen, sup, invmap, 0, 2, fast ? 2 : 30,
      local_d0_search, params.D0_MIN, params.Lnorm, params.d0, params.score_d8);
  if (TM > TMmax) {
    TMmax = TM;
    for (int i = 0; i < ylen; i++)
      invmap0[i] = invmap[i];
  }

  // ============================================
  // Seed 2: secondary structure alignment
  // ============================================
  get_initial_ss(ws, secx, secy, xlen, ylen, invmap);
  TM = detailed_search(ws, x, y, xlen, ylen, invmap, sup, simplify_step,
      score_sum_method, local_d0_search, params.score_d8, params.d0,
      params.Lnorm);
  if (TM > TMmax) {
    TMmax = TM;
    for (int i = 0; i < ylen; i++)
      invmap0[i] = invmap[i];
  }
  if (TM > TMmax * 0.2) {
    TM = DP_iter(ws, x, y, xlen, ylen, sup, invmap, 0, 2, fast ? 2 : 30,
        local_d0_search, params.D0_MIN, params.Lnorm, params.d0,
        params.score_d8);
    if (TM > TMmax) {
      TMmax = TM;
      for (int i = 0; i < ylen; i++)
        invmap0[i] = invmap[i];
    }
  }

  // ============================================
  // Seed 3: local fragment superposition
  // ============================================
  if (get_initial5(ws, x, y, xlen, ylen, invmap, params.d0, params.d0_search,
          fast, params.D0_MIN)) {
    TM = detailed_search(ws, x, y, xlen, ylen, invmap, sup, simplify_step,
        score_sum_method, local_d0_search, params.score_d8, params.d0,
        params.Lnorm);
    if (TM > TMmax) {
      TMmax = TM;
      for (int i = 0; i < ylen; i++)
        invmap0[i] = invmap[i];
    }
    if (TM > TMmax * ddcc) {
      TM = DP_iter(ws, x, y, xlen, ylen, sup, invmap, 0, 2, 2, local_d0_search,
          params.D0_MIN, params.Lnorm, params.d0, params.score_d8);
      if (TM > TMmax) {
        TMmax = TM;
        for (int i = 0; i < ylen; i++)
          invmap0[i] = invmap[i];
      }
    }
  }

  // ============================================
  // Seed 4: SS + structural distance
  // ============================================
  get_initial_ssplus(ws, secx, secy, x, y, xlen, ylen, invmap0, invmap,
      params.D0_MIN, params.d0);
  TM = detailed_search(ws, x, y, xlen, ylen, invmap, sup, simplify_step,
      score_sum_method, local_d0_search, params.score_d8, params.d0,
      params.Lnorm);
  if (TM > TMmax) {
    TMmax = TM;
    for (int i = 0; i < ylen; i++)
      invmap0[i] = invmap[i];
  }
  if (TM > TMmax * ddcc) {
    TM = DP_iter(ws, x, y, xlen, ylen, sup, invmap, 0, 2, fast ? 2 : 30,
        local_d0_search, params.D0_MIN, params.Lnorm, params.d0,
        params.score_d8);
    if (TM > TMmax) {
      TMmax = TM;
      for (int i = 0; i < ylen; i++)
        invmap0[i] = invmap[i];
    }
  }

  // ============================================
  // Seed 5: fragment gapless threading
  // ============================================
  get_initial_fgt(ws, x, y, xlen, ylen, invmap, params.d0, params.d0_search,
      params.dcu0, fast, sup);
  TM = detailed_search(ws, x, y, xlen, ylen, invmap, sup, simplify_step,
      score_sum_method, local_d0_search, params.score_d8, params.d0,
      params.Lnorm);
  if (TM > TMmax) {
    TMmax = TM;
    for (int i = 0; i < ylen; i++)
      invmap0[i] = invmap[i];
  }
  if (TM > TMmax * ddcc) {
    TM = DP_iter(ws, x, y, xlen, ylen, sup, invmap, 1, 2, 2, local_d0_search,
        params.D0_MIN, params.Lnorm, params.d0, params.score_d8);
    if (TM > TMmax) {
      TMmax = TM;
      for (int i = 0; i < ylen; i++)
        invmap0[i] = invmap[i];
    }
  }

  // ============================================
  // Verify alignment exists
  // ============================================
  {
    bool flag = false;
    for (int i = 0; i < ylen; i++) {
      if (invmap0[i] >= 0) {
        flag = true;
        break;
      }
    }
    if (!flag)
      return result;
  }

  // ============================================
  // Final detailed search with simplify_step=1
  // ============================================
  simplify_step = fast ? 40 : 1;
  score_sum_method = 8;
  detailed_search(ws, x, y, xlen, ylen, invmap0, sup, simplify_step,
      score_sum_method, local_d0_search, params.score_d8, params.d0);

  // Apply rotation to mobile and select pairs within score_d8
  {
    std::vector<glm::dvec3> xt(xlen);
    do_rotation(x, xt, xlen, sup);

    std::vector<int> m1, m2; // aligned indices
    int n_ali = 0;
    for (int j = 0; j < ylen; j++) {
      int i = invmap0[j];
      if (i >= 0) {
        n_ali++;
        double d = std::sqrt(dist2(xt[i], y[j]));
        if (d <= params.score_d8) {
          m1.push_back(i);
          m2.push_back(j);

          ws.xtm[static_cast<int>(m1.size()) - 1] = x[i];
          ws.ytm[static_cast<int>(m1.size()) - 1] = y[j];
          ws.r1[static_cast<int>(m1.size()) - 1] = xt[i];
          ws.r2[static_cast<int>(m1.size()) - 1] = y[j];
        }
      }
    }
    int n_ali8 = static_cast<int>(m1.size());

    if (n_ali8 < 1)
      return result;

    // RMSD of aligned residues
    double rmsd0;
    Superposition sup_rmsd;
    kabsch(ws.r1, ws.r2, n_ali8, 0, rmsd0, sup_rmsd);
    rmsd0 = std::sqrt(rmsd0 / n_ali8);

    // ============================================
    // Final TM-scores normalized by each length
    // ============================================
    simplify_step = 1;
    score_sum_method = 0;
    Superposition sup_final;

    // Normalized by target length (ylen)
    double D0_MIN_f, Lnorm_f, d0_target, d0_search_f;
    compute_final_params(
        static_cast<double>(ylen), D0_MIN_f, Lnorm_f, d0_target, d0_search_f);
    double TM1 = TMscore8_search(ws, n_ali8, sup_final, simplify_step,
        score_sum_method, d0_search_f, params.score_d8, d0_target, Lnorm_f);

    // Normalized by mobile length (xlen)
    double d0_mobile;
    compute_final_params(
        static_cast<double>(xlen), D0_MIN_f, Lnorm_f, d0_mobile, d0_search_f);
    Superposition sup_mobile;
    double TM2 = TMscore8_search(ws, n_ali8, sup_mobile, simplify_step,
        score_sum_method, d0_search_f, params.score_d8, d0_mobile, Lnorm_f);

    // Use the rotation from target-normalized scoring for the final transform
    // (matches original TMalign behavior: t0, u0 come from TM1 computation)

    // ============================================
    // Build alignment strings
    // ============================================
    // d0 for alignment display (same as target normalization)
    double d0_out = d0_target;

    // Re-rotate with best transform for alignment string generation
    do_rotation(x, xt, xlen, sup_final);

    int ali_len = xlen + ylen;
    std::string seqxA(ali_len, '-');
    std::string seqM(ali_len, ' ');
    std::string seqyA(ali_len, '-');

    int kk = 0, i_old = 0, j_old = 0;
    double Liden = 0;
    for (int k = 0; k < n_ali8; k++) {
      for (int i = i_old; i < m1[k]; i++) {
        seqxA[kk] = (i < static_cast<int>(seqx.size())) ? seqx[i] : 'X';
        seqyA[kk] = '-';
        seqM[kk] = ' ';
        kk++;
      }
      for (int j = j_old; j < m2[k]; j++) {
        seqxA[kk] = '-';
        seqyA[kk] = (j < static_cast<int>(seqy.size())) ? seqy[j] : 'X';
        seqM[kk] = ' ';
        kk++;
      }

      char cx = (m1[k] < static_cast<int>(seqx.size())) ? seqx[m1[k]] : 'X';
      char cy = (m2[k] < static_cast<int>(seqy.size())) ? seqy[m2[k]] : 'X';
      seqxA[kk] = cx;
      seqyA[kk] = cy;
      Liden += (cx == cy) ? 1 : 0;
      double dd = std::sqrt(dist2(xt[m1[k]], y[m2[k]]));
      seqM[kk] = (dd < d0_out) ? ':' : '.';
      kk++;
      i_old = m1[k] + 1;
      j_old = m2[k] + 1;
    }
    // tail
    for (int i = i_old; i < xlen; i++) {
      seqxA[kk] = (i < static_cast<int>(seqx.size())) ? seqx[i] : 'X';
      seqyA[kk] = '-';
      seqM[kk] = ' ';
      kk++;
    }
    for (int j = j_old; j < ylen; j++) {
      seqxA[kk] = '-';
      seqyA[kk] = (j < static_cast<int>(seqy.size())) ? seqy[j] : 'X';
      seqM[kk] = ' ';
      kk++;
    }

    // Fill result
    result.tm_score_target = TM1;
    result.tm_score_mobile = TM2;
    result.d0_target = d0_target;
    result.d0_mobile = d0_mobile;
    result.rmsd = rmsd0;
    result.aligned_length = n_ali8;
    result.seq_identity = (n_ali8 > 0) ? Liden / n_ali8 : 0.0;
    result.transform = sup_final;
    result.mobile_indices.assign(m1.begin(), m1.end());
    result.target_indices.assign(m2.begin(), m2.end());
    result.seq_mobile = seqxA.substr(0, kk);
    result.seq_target = seqyA.substr(0, kk);
    result.seq_match = seqM.substr(0, kk);
  }

  return result;
}

} // namespace pymol::usalign