Files
foldingdiff/bin/mds_structures.py
2023-02-09 18:15:39 -08:00

168 lines
5.1 KiB
Python

"""
Run MDS on structures to create an embedding visualization
Coloring options:
* training TM similarity
* scTM
* helix/beta strand annotations
* length
"""
import os
import json
import logging
from glob import glob
import argparse
import pandas as pd
from sklearn.manifold import MDS
from matplotlib import pyplot as plt
from hclust_structures import get_pairwise_tmscores, int_getter
from annot_secondary_structures import count_structures_in_pdb
# :)
SEED = int(
float.fromhex("2254616977616e2069732061206672656520636f756e74727922") % 10000
)
def len_pdb_structure(fname: str) -> int:
"""Return the integer length of the PDB structure"""
with open(fname) as source:
atom_lines = [l.strip() for l in source if l.startswith("ATOM")]
last_line_tokens = atom_lines[-1].split()
last_line_l = int(last_line_tokens[5])
assert int(len(atom_lines) / 3) == last_line_l
return last_line_l
def build_parser():
parser = argparse.ArgumentParser(
usage=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument("dirname", type=str, help="Directory containing PDB files")
parser.add_argument("--sctm", type=str, default="", help="scTM scores JSON file")
parser.add_argument(
"--trainingtm", type=str, default="", help="Training TM score JSON"
)
parser.add_argument(
"-o",
"--output",
type=str,
default="tmscore_mds",
help="PDF file prefix to write output to",
)
return parser
def main():
"""Run script"""
parser = build_parser()
args = parser.parse_args()
# Get files
fnames = sorted(
glob(os.path.join(args.dirname, "*.pdb")),
key=lambda x: int_getter(os.path.basename(x)),
)
logging.info(f"Computing TMscore on {len(fnames)} structures")
# in the dissimilarity matrix, larger values should indicate more distant points
# therefore we want to do 1 - TMscore (since larger values are closer in TMscore space)
pdist_df = 1.0 - get_pairwise_tmscores(fnames, sctm_scores_json=args.sctm)
mds = MDS(
n_components=2,
dissimilarity="precomputed",
metric=False, # TMscores do not respect triangle inequality
n_jobs=-1,
random_state=SEED,
)
embedding = pd.DataFrame(
mds.fit_transform(pdist_df.values),
index=pdist_df.index,
columns=["MDS1", "MDS2"],
)
format_strings = {
"Number helices": "{x:.1f}",
}
# For a variety of coloring keys, compute/read the scores and color scatter
# plot by the scores.
for k, v in {
"Max training TM": args.trainingtm,
"scTM": args.sctm,
"length": len_pdb_structure,
"Number helices": lambda x: count_structures_in_pdb(x, "psea")[0],
"Number sheets": lambda x: count_structures_in_pdb(x, "psea")[1],
"null": None,
}.items():
if v is None or v:
logging.info(f"Coloring by {k} scores")
figsize = (6.4, 4.8)
annot_points = False
if v is None:
scores = None
figsize = (12.8, 9.6)
annot_points = True
# If we are doing the null, the plot very big and label each
# point with the text id
elif callable(v):
fname_to_key = lambda f: os.path.basename(f).split(".")[0]
scores = {
fname_to_key(f): v(f)
for f in fnames
if fname_to_key(f) in embedding.index
}
scores = embedding.index.map(scores)
elif os.path.isfile(v):
with open(v) as source:
scores = embedding.index.map(json.load(source))
else:
raise ValueError(f"Invalid value for {k}: {v}")
fig, ax = plt.subplots(figsize=figsize, dpi=300)
points = ax.scatter(
embedding.iloc[:, 0],
embedding.iloc[:, 1],
s=8,
c=scores,
cmap="RdYlBu",
alpha=0.9,
)
if annot_points:
for i in range(len(embedding)):
ax.annotate(
embedding.index[i],
(embedding.iloc[i, 0], embedding.iloc[i, 1]),
fontsize=6,
)
ax.set(
xlabel="MDS 1",
ylabel="MDS 2",
)
if not k == "null":
ax.set(
xticks=[],
yticks=[],
title=k,
)
if scores is not None:
cbar = plt.colorbar(
points,
ax=ax,
fraction=0.08,
pad=0.04,
location="right",
# format=format_strings.get(k, None),
)
cbar.ax.set_ylabel(k, fontsize=12)
fig.savefig(f"{args.output}_mds_{k}.pdf", bbox_inches="tight")
if __name__ == "__main__":
logging.basicConfig(level=logging.INFO)
main()