From 310f240c0045310f399b7a4f388e3cd5da3e497d Mon Sep 17 00:00:00 2001
From: "Zhouran Qiao zhuoran.qiao@entos.ai"
 <zhuoran@sonic-ssh.entos-foundry.ai>
Date: Tue, 14 Feb 2023 16:58:06 -0800
Subject: [PATCH 01/25] Resolving torch cuda availablity for cpu-only
 installation

---
 setup.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 368e1d8..57e6f26 100644
--- a/setup.py
+++ b/setup.py
@@ -16,6 +16,7 @@ import os
 from setuptools import setup, Extension, find_packages
 import subprocess
 
+import torch
 from torch.utils.cpp_extension import BuildExtension, CppExtension, CUDAExtension, CUDA_HOME
 
 from scripts.utils import get_nvidia_cc
@@ -37,7 +38,7 @@ extra_cuda_flags = [
 ]
 
 def get_cuda_bare_metal_version(cuda_dir):
-    if cuda_dir==None:
+    if cuda_dir==None or torch.version.cuda==None:
         print("CUDA is not found, cpu version is installed")
         return None, -1, 0
     else:

From bb11fb189ad3bcaf28ed9cc168b6cf09e5b84973 Mon Sep 17 00:00:00 2001
From: Zhuoran Qiao <zrqiao0@gmail.com>
Date: Thu, 16 Feb 2023 12:37:33 -0800
Subject: [PATCH 02/25] Update setup.py

---
 setup.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/setup.py b/setup.py
index 57e6f26..2545e52 100644
--- a/setup.py
+++ b/setup.py
@@ -1,3 +1,4 @@
+# Originally from Openfold https://github.com/aqlaboratory/openfold, modified.
 # Copyright 2021 AlQuraishi Laboratory
 # Copyright 2021 DeepMind Technologies Limited
 #

From 5ddaa186b9088f1ab699ea5807c40736cc0c7547 Mon Sep 17 00:00:00 2001
From: Jose Duarte <jose.m.duarte@gmail.com>
Date: Mon, 20 Feb 2023 09:52:49 -0800
Subject: [PATCH 03/25] WIP: towards modelcif writing

---
 openfold/np/protein.py | 63 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 63 insertions(+)

diff --git a/openfold/np/protein.py b/openfold/np/protein.py
index af8739c..b982be0 100644
--- a/openfold/np/protein.py
+++ b/openfold/np/protein.py
@@ -23,6 +23,12 @@ import string
 from openfold.np import residue_constants
 from Bio.PDB import PDBParser
 import numpy as np
+import modelcif
+import modelcif.model
+import modelcif.dumper
+import modelcif.reference
+import modelcif.protocol
+import modelcif.alignment
 
 
 FeatureDict = Mapping[str, np.ndarray]
@@ -385,6 +391,63 @@ def to_pdb(prot: Protein) -> str:
     return "\n".join(pdb_lines)
 
 
+def to_modelcif(prot: Protein) -> str:
+    """
+    Converts a `Protein` instance to a ModelCIF string.
+
+    Args:
+      prot: The protein to convert to PDB.
+
+    Returns:
+      ModelCIF string.
+    """
+
+    atom_mask = prot.atom_mask
+    aatype = prot.aatype
+    atom_positions = prot.atom_positions
+    residue_index = prot.residue_index.astype(np.int32)
+    b_factors = prot.b_factors
+    chain_index = prot.chain_index
+
+    system = modelcif.System(title='Ligand example')
+
+    # Describe the amino acid (polymer) sequences as Entity objects, for both
+    # template and model:
+    model_e = modelcif.Entity('AYVINDSCIA', description='Model subunit')
+
+    # Define the model assembly, as two AsymUnits. NonPolymerFromTemplate is a
+    # subclass of AsymUnit that additionally notes the Template from which it
+    # was derived. In this case we state that the ligand was simply copied from
+    # the template into the target (explicit=False):
+    asymA = modelcif.AsymUnit(model_e, details='Model subunit A', id='A')
+    modeled_assembly = modelcif.Assembly((asymA,), name='Modeled assembly')
+
+    class MyModel(modelcif.model.HomologyModel):
+        asym_unit_map = {'A': asymA}
+
+        def get_atoms(self):
+            for asym, seq_id, type_symbol, atom_id, x, y, z in atoms:
+                yield modelcif.model.Atom(
+                    asym_unit=self.asym_unit_map[asym], type_symbol=type_symbol,
+                    seq_id=seq_id, atom_id=atom_id, x=x, y=y, z=z,
+                    het=seq_id is None)
+
+    # Add the model and modeling protocol to the file and write them out:
+    model = MyModel(assembly=modeled_assembly, name='Best scoring model')
+
+    model_group = modelcif.model.ModelGroup([model], name='All models')
+    system.model_groups.append(model_group)
+
+    protocol = modelcif.protocol.Protocol()
+    # protocol.steps.append(modelcif.protocol.ModelingStep(
+    #     input_data=aln, output_data=model))
+    system.protocols.append(protocol)
+
+    with open('/Users/jose/output3.cif', 'w') as fh:
+        modelcif.dumper.write(fh, [system])
+
+
+
 def ideal_atom_mask(prot: Protein) -> np.ndarray:
     """Computes an ideal atom mask.
 

From 7bc13cfc8ecd232d412c6c70856a444e9b3f4434 Mon Sep 17 00:00:00 2001
From: Jose Duarte <jose.m.duarte@gmail.com>
Date: Mon, 20 Feb 2023 09:53:13 -0800
Subject: [PATCH 04/25] WIP: towards modelcif writing, conda

---
 environment.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/environment.yml b/environment.yml
index 0dfb9db..b8a2082 100644
--- a/environment.yml
+++ b/environment.yml
@@ -27,4 +27,5 @@ dependencies:
       - typing-extensions==3.10.0.2
       - pytorch_lightning==1.5.10
       - wandb==0.12.21
+      - modelcif==0.7
       - git+https://github.com/NVIDIA/dllogger.git

From ba8de687af386d0bca9da890cdc0a227b424e692 Mon Sep 17 00:00:00 2001
From: Jose Duarte <jose.m.duarte@gmail.com>
Date: Mon, 20 Feb 2023 11:26:15 -0800
Subject: [PATCH 05/25] Working implementation for 1 chain case

---
 openfold/np/protein.py | 97 +++++++++++++++++++++++++++++++++---------
 1 file changed, 76 insertions(+), 21 deletions(-)

diff --git a/openfold/np/protein.py b/openfold/np/protein.py
index b982be0..f15e135 100644
--- a/openfold/np/protein.py
+++ b/openfold/np/protein.py
@@ -62,7 +62,7 @@ class Protein:
     # Chain indices for multi-chain predictions
     chain_index: Optional[np.ndarray] = None
 
-    # Optional remark about the protein. Included as a comment in output PDB 
+    # Optional remark about the protein. Included as a comment in output PDB
     # files
     remark: Optional[str] = None
 
@@ -177,7 +177,7 @@ def from_proteinnet_string(proteinnet_str: str) -> Protein:
         tag.strip() for tag in re.split(tag_re, proteinnet_str) if len(tag) > 0
     ]
     groups = zip(tags[0::2], [l.split('\n') for l in tags[1::2]])
-   
+
     atoms = ['N', 'CA', 'C']
     aatype = None
     atom_positions = None
@@ -252,7 +252,7 @@ def add_pdb_headers(prot: Protein, pdb_str: str) -> str:
     """
     out_pdb_lines = []
     lines = pdb_str.split('\n')
-    
+
     remark = prot.remark
     if(remark is not None):
         out_pdb_lines.append(f"REMARK {remark}")
@@ -347,7 +347,7 @@ def to_pdb(prot: Protein) -> str:
                 0
             ]  # Protein supports only C, N, O, S, this works.
             charge = ""
-    
+
             chain_tag = "A"
             if(chain_index is not None):
                 chain_tag = chain_tags[chain_index[i]]
@@ -402,6 +402,10 @@ def to_modelcif(prot: Protein) -> str:
       ModelCIF string.
     """
 
+    restypes = residue_constants.restypes + ["X"]
+    res_1to3 = lambda r: residue_constants.restype_1to3.get(restypes[r], "UNK")
+    atom_types = residue_constants.atom_types
+
     atom_mask = prot.atom_mask
     aatype = prot.aatype
     atom_positions = prot.atom_positions
@@ -409,16 +413,14 @@ def to_modelcif(prot: Protein) -> str:
     b_factors = prot.b_factors
     chain_index = prot.chain_index
 
-    system = modelcif.System(title='Ligand example')
+    system = modelcif.System(title='OpenFold prediction')
 
-    # Describe the amino acid (polymer) sequences as Entity objects, for both
-    # template and model:
-    model_e = modelcif.Entity('AYVINDSCIA', description='Model subunit')
+    # sequence into entity
+    n = aatype.shape[0]
+    seq = [restypes[aatype[i]] for i in range(n)]
+    model_e = modelcif.Entity("".join(seq), description='Model subunit')
 
-    # Define the model assembly, as two AsymUnits. NonPolymerFromTemplate is a
-    # subclass of AsymUnit that additionally notes the Template from which it
-    # was derived. In this case we state that the ligand was simply copied from
-    # the template into the target (explicit=False):
+    # Define the model assembly
     asymA = modelcif.AsymUnit(model_e, details='Model subunit A', id='A')
     modeled_assembly = modelcif.Assembly((asymA,), name='Modeled assembly')
 
@@ -426,11 +428,55 @@ def to_modelcif(prot: Protein) -> str:
         asym_unit_map = {'A': asymA}
 
         def get_atoms(self):
-            for asym, seq_id, type_symbol, atom_id, x, y, z in atoms:
-                yield modelcif.model.Atom(
-                    asym_unit=self.asym_unit_map[asym], type_symbol=type_symbol,
-                    seq_id=seq_id, atom_id=atom_id, x=x, y=y, z=z,
-                    het=seq_id is None)
+
+            prev_chain_index = 0
+            chain_tags = string.ascii_uppercase
+            # Add all atom sites.
+            for i in range(n):
+                res_name_3 = res_1to3(aatype[i])
+                for atom_name, pos, mask, b_factor in zip(
+                        atom_types, atom_positions[i], atom_mask[i], b_factors[i]
+                ):
+                    if mask < 0.5:
+                        continue
+
+                    name = atom_name if len(atom_name) == 4 else f" {atom_name}"
+                    element = atom_name[0]  # Protein supports only C, N, O, S, this works.
+
+                    chain_tag = "A"
+                    if chain_index is not None:
+                        chain_tag = chain_tags[chain_index[i]]
+
+                    # TODO check that residue indices are ok
+                    # TODO how to set residue types? do they come from the entity sequence set above?
+                    yield modelcif.model.Atom(
+                        asym_unit=asymA, type_symbol=element,
+                        seq_id=residue_index[i], atom_id=name,
+                        x=pos[0], y=pos[1], z=pos[2],
+                        het=False, biso=b_factor, occupancy=1.00)
+
+                # TODO multiple chains
+                # should_terminate = (i == n - 1)
+                # if(chain_index is not None):
+                #     if(i != n - 1 and chain_index[i + 1] != prev_chain_index):
+                #         should_terminate = True
+                #         prev_chain_index = chain_index[i + 1]
+                #
+                # if(should_terminate):
+                #     # Close the chain.
+                #     chain_end = "TER"
+                #     chain_termination_line = (
+                #         f"{chain_end:<6}{atom_index:>5}      "
+                #         f"{res_1to3(aatype[i]):>3} "
+                #         f"{chain_tag:>1}{residue_index[i]:>4}"
+                #     )
+                #     pdb_lines.append(chain_termination_line)
+                #     atom_index += 1
+                #
+                #     if(i != n - 1):
+                #         # "prev" is a misnomer here. This happens at the beginning of
+                #         # each new chain.
+                #         pdb_lines.extend(get_pdb_headers(prot, prev_chain_index))
 
     # Add the model and modeling protocol to the file and write them out:
     model = MyModel(assembly=modeled_assembly, name='Best scoring model')
@@ -442,10 +488,10 @@ def to_modelcif(prot: Protein) -> str:
     # protocol.steps.append(modelcif.protocol.ModelingStep(
     #     input_data=aln, output_data=model))
     system.protocols.append(protocol)
-
-    with open('/Users/jose/output3.cif', 'w') as fh:
-        modelcif.dumper.write(fh, [system])
-
+    fh = io.StringIO()
+    modelcif.dumper.write(fh, [system])
+    modelcifstr = fh.getvalue()
+    return modelcifstr
 
 
 def ideal_atom_mask(prot: Protein) -> np.ndarray:
@@ -499,3 +545,12 @@ def from_prediction(
         parents=parents,
         parents_chain_index=parents_chain_index,
     )
+
+
+if __name__ == "__main__":
+    with open('/home/jose/Downloads/171l.pdb', 'r') as file:
+        pdbstr = file.read()
+
+    prot = from_pdb_string(pdbstr)
+    cifstr = to_modelcif(prot)
+    print(cifstr)
\ No newline at end of file

From 4f662f832c4dd437faa470bd493721c28ec92cb7 Mon Sep 17 00:00:00 2001
From: Jose Duarte <jose.m.duarte@gmail.com>
Date: Mon, 20 Feb 2023 13:54:19 -0800
Subject: [PATCH 06/25] Now should work for multi-chain. But it is untested.
 Also fixed important bug: atom names were padded

---
 openfold/np/protein.py | 103 +++++++++++++++++++++--------------------
 1 file changed, 52 insertions(+), 51 deletions(-)

diff --git a/openfold/np/protein.py b/openfold/np/protein.py
index f15e135..28a669c 100644
--- a/openfold/np/protein.py
+++ b/openfold/np/protein.py
@@ -403,7 +403,6 @@ def to_modelcif(prot: Protein) -> str:
     """
 
     restypes = residue_constants.restypes + ["X"]
-    res_1to3 = lambda r: residue_constants.restype_1to3.get(restypes[r], "UNK")
     atom_types = residue_constants.atom_types
 
     atom_mask = prot.atom_mask
@@ -413,71 +412,67 @@ def to_modelcif(prot: Protein) -> str:
     b_factors = prot.b_factors
     chain_index = prot.chain_index
 
+    n = aatype.shape[0]
+    if chain_index is None:
+        chain_index = [0 for i in range(n)]
+
     system = modelcif.System(title='OpenFold prediction')
 
-    # sequence into entity
-    n = aatype.shape[0]
-    seq = [restypes[aatype[i]] for i in range(n)]
-    model_e = modelcif.Entity("".join(seq), description='Model subunit')
+    # Finding chains and creating entities
+    seqs = {}
+    seq = []
+    last_chain_idx = None
+    for i in range(n):
+        if last_chain_idx is not None and last_chain_idx != chain_index[i]:
+            seqs[last_chain_idx] = seq
+            seq = []
+        seq.append(restypes[aatype[i]])
+        last_chain_idx = chain_index[i]
+    # finally add the last chain
+    if last_chain_idx not in seqs:
+        seqs[last_chain_idx] = seq
 
-    # Define the model assembly
-    asymA = modelcif.AsymUnit(model_e, details='Model subunit A', id='A')
-    modeled_assembly = modelcif.Assembly((asymA,), name='Modeled assembly')
+    # now reduce sequences to unique ones (note this won't work if different asyms have different unmodelled regions)
+    unique_seqs = {}
+    for chain_idx in seqs.keys():
+        seq = "".join(seqs[chain_idx])
+        if seq in unique_seqs:
+            unique_seqs[seq].append(chain_idx)
+        else:
+            unique_seqs[seq] = [chain_idx]
+
+    # adding 1 entity per unique sequence
+    entities_map = {}
+    for key, value in unique_seqs.items():
+        model_e = modelcif.Entity("".join(key), description='Model subunit')
+        for chain_idx in value:
+            entities_map[chain_idx] = model_e
+
+    chain_tags = string.ascii_uppercase
+    asym_unit_map = {}
+    for chain_idx in set(chain_index):
+        # Define the model assembly
+        chain_id = chain_tags[chain_idx]
+        asym = modelcif.AsymUnit(entities_map[chain_idx], details='Model subunit %s' % chain_id, id=chain_id)
+        asym_unit_map[chain_idx] = asym
+    modeled_assembly = modelcif.Assembly(asym_unit_map.values(), name='Modeled assembly')
 
     class MyModel(modelcif.model.HomologyModel):
-        asym_unit_map = {'A': asymA}
-
         def get_atoms(self):
-
-            prev_chain_index = 0
-            chain_tags = string.ascii_uppercase
             # Add all atom sites.
             for i in range(n):
-                res_name_3 = res_1to3(aatype[i])
                 for atom_name, pos, mask, b_factor in zip(
                         atom_types, atom_positions[i], atom_mask[i], b_factors[i]
                 ):
                     if mask < 0.5:
                         continue
-
-                    name = atom_name if len(atom_name) == 4 else f" {atom_name}"
                     element = atom_name[0]  # Protein supports only C, N, O, S, this works.
-
-                    chain_tag = "A"
-                    if chain_index is not None:
-                        chain_tag = chain_tags[chain_index[i]]
-
-                    # TODO check that residue indices are ok
-                    # TODO how to set residue types? do they come from the entity sequence set above?
                     yield modelcif.model.Atom(
-                        asym_unit=asymA, type_symbol=element,
-                        seq_id=residue_index[i], atom_id=name,
+                        asym_unit=asym_unit_map[chain_index[i]], type_symbol=element,
+                        seq_id=residue_index[i], atom_id=atom_name,
                         x=pos[0], y=pos[1], z=pos[2],
                         het=False, biso=b_factor, occupancy=1.00)
 
-                # TODO multiple chains
-                # should_terminate = (i == n - 1)
-                # if(chain_index is not None):
-                #     if(i != n - 1 and chain_index[i + 1] != prev_chain_index):
-                #         should_terminate = True
-                #         prev_chain_index = chain_index[i + 1]
-                #
-                # if(should_terminate):
-                #     # Close the chain.
-                #     chain_end = "TER"
-                #     chain_termination_line = (
-                #         f"{chain_end:<6}{atom_index:>5}      "
-                #         f"{res_1to3(aatype[i]):>3} "
-                #         f"{chain_tag:>1}{residue_index[i]:>4}"
-                #     )
-                #     pdb_lines.append(chain_termination_line)
-                #     atom_index += 1
-                #
-                #     if(i != n - 1):
-                #         # "prev" is a misnomer here. This happens at the beginning of
-                #         # each new chain.
-                #         pdb_lines.extend(get_pdb_headers(prot, prev_chain_index))
-
     # Add the model and modeling protocol to the file and write them out:
     model = MyModel(assembly=modeled_assembly, name='Best scoring model')
 
@@ -548,9 +543,15 @@ def from_prediction(
 
 
 if __name__ == "__main__":
-    with open('/home/jose/Downloads/171l.pdb', 'r') as file:
+    pdb_file = '/home/jose/Downloads/171l.pdb'
+    # pdb_file = '/home/jose/Downloads/2trx.pdb'
+    cif_file = '/home/jose/test.cif'
+
+    with open(pdb_file, 'r') as file:
         pdbstr = file.read()
 
-    prot = from_pdb_string(pdbstr)
+    prot = from_pdb_string(pdbstr, "A")
     cifstr = to_modelcif(prot)
-    print(cifstr)
\ No newline at end of file
+    print(cifstr)
+    with open(cif_file, 'w') as fw:
+        fw.write(cifstr)

From 9db8dc369b0c247012f60591e7e088ab44268d3c Mon Sep 17 00:00:00 2001
From: Jose Duarte <jose.m.duarte@gmail.com>
Date: Mon, 20 Feb 2023 14:08:05 -0800
Subject: [PATCH 07/25] Adding output_cif as CLI argument

---
 run_pretrained_openfold.py | 50 +++++++++++++++++++++++---------------
 1 file changed, 30 insertions(+), 20 deletions(-)

diff --git a/run_pretrained_openfold.py b/run_pretrained_openfold.py
index 7bd99be..66bd2d5 100644
--- a/run_pretrained_openfold.py
+++ b/run_pretrained_openfold.py
@@ -1,6 +1,6 @@
 # Copyright 2021 AlQuraishi Laboratory
 # Copyright 2021 DeepMind Technologies Limited
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -35,7 +35,7 @@ torch_versions = torch.__version__.split(".")
 torch_major_version = int(torch_versions[0])
 torch_minor_version = int(torch_versions[1])
 if(
-    torch_major_version > 1 or 
+    torch_major_version > 1 or
     (torch_major_version == 1 and torch_minor_version >= 12)
 ):
     # Gives a large speedup on Ampere-class GPUs
@@ -70,7 +70,7 @@ def precompute_alignments(tags, seqs, alignment_dir, args):
         local_alignment_dir = os.path.join(alignment_dir, tag)
         if(args.use_precomputed_alignments is None and not os.path.isdir(local_alignment_dir)):
             logger.info(f"Generating alignments for {tag}...")
-                
+
             os.makedirs(local_alignment_dir)
 
             alignment_runner = data_pipeline.AlignmentRunner(
@@ -141,13 +141,13 @@ def main(args):
     os.makedirs(args.output_dir, exist_ok=True)
 
     config = model_config(args.config_preset, long_sequence_inference=args.long_sequence_inference)
-    
+
     if(args.trace_model):
         if(not config.data.predict.fixed_size):
             raise ValueError(
                 "Tracing requires that fixed_size mode be enabled in the config"
             )
-    
+
     template_featurizer = templates.TemplateHitFeaturizer(
         mmcif_dir=args.template_mmcif_dir,
         max_template_date=args.max_template_date,
@@ -165,10 +165,10 @@ def main(args):
     random_seed = args.data_random_seed
     if random_seed is None:
         random_seed = random.randrange(2**32)
-    
+
     np.random.seed(random_seed)
     torch.manual_seed(random_seed + 1)
-    
+
     feature_processor = feature_pipeline.FeaturePipeline(config.data)
     if not os.path.exists(output_dir_base):
         os.makedirs(output_dir_base)
@@ -183,7 +183,7 @@ def main(args):
         # Gather input sequences
         with open(os.path.join(args.fasta_dir, fasta_file), "r") as fp:
             data = fp.read()
-    
+
         tags, seqs = parse_fasta(data)
         # assert len(tags) == len(set(tags)), "All FASTA tags must be unique"
         tag = '-'.join(tags)
@@ -206,10 +206,10 @@ def main(args):
             output_name = f'{tag}_{args.config_preset}'
             if args.output_postfix is not None:
                 output_name = f'{output_name}_{args.output_postfix}'
-    
+
             # Does nothing if the alignments have already been computed
             precompute_alignments(tags, seqs, alignment_dir, args)
-        
+
             feature_dict = feature_dicts.get(tag, None)
             if(feature_dict is None):
                 feature_dict = generate_feature_dict(
@@ -234,7 +234,7 @@ def main(args):
             )
 
             processed_feature_dict = {
-                k:torch.as_tensor(v, device=args.model_device) 
+                k:torch.as_tensor(v, device=args.model_device)
                 for k,v in processed_feature_dict.items()
             }
 
@@ -255,30 +255,36 @@ def main(args):
 
             # Toss out the recycling dimensions --- we don't need them anymore
             processed_feature_dict = tensor_tree_map(
-                lambda x: np.array(x[..., -1].cpu()), 
+                lambda x: np.array(x[..., -1].cpu()),
                 processed_feature_dict
             )
             out = tensor_tree_map(lambda x: np.array(x.cpu()), out)
 
             unrelaxed_protein = prep_output(
-                out, 
-                processed_feature_dict, 
-                feature_dict, 
-                feature_processor, 
+                out,
+                processed_feature_dict,
+                feature_dict,
+                feature_processor,
                 args.config_preset,
                 args.multimer_ri_gap,
                 args.subtract_plddt
             )
 
+            unrelaxed_file_suffix = "_unrelaxed.pdb"
+            if args.cif_output:
+                unrelaxed_file_suffix = "_unrelaxed.cif"
             unrelaxed_output_path = os.path.join(
-                output_directory, f'{output_name}_unrelaxed.pdb'
+                output_directory, f'{output_name}{unrelaxed_file_suffix}'
             )
 
             with open(unrelaxed_output_path, 'w') as fp:
-                fp.write(protein.to_pdb(unrelaxed_protein))
+                if args.cif_output:
+                    fp.write(protein.to_modelcif(unrelaxed_protein))
+                else:
+                    fp.write(protein.to_pdb(unrelaxed_protein))
 
             logger.info(f"Output written to {unrelaxed_output_path}...")
-            
+
             if not args.skip_relaxation:
                 # Relax the prediction.
                 logger.info(f"Running relaxation on {unrelaxed_output_path}...")
@@ -373,12 +379,16 @@ if __name__ == "__main__":
         "--long_sequence_inference", action="store_true", default=False,
         help="""enable options to reduce memory usage at the cost of speed, helps longer sequences fit into GPU memory, see the README for details"""
     )
+    parser.add_argument(
+        "--cif_output", action="store_true", default=False,
+        help="Output predicted models in ModelCIF format instead of PDB format (default)"
+    )
     add_data_args(parser)
     args = parser.parse_args()
 
     if(args.jax_param_path is None and args.openfold_checkpoint_path is None):
         args.jax_param_path = os.path.join(
-            "openfold", "resources", "params", 
+            "openfold", "resources", "params",
             "params_" + args.config_preset + ".npz"
         )
 

From 951d47cf345dc52ff4f300d89df1785559252846 Mon Sep 17 00:00:00 2001
From: Jose Duarte <jose.m.duarte@gmail.com>
Date: Mon, 20 Feb 2023 14:52:14 -0800
Subject: [PATCH 08/25] Ab-initio is more appropriate

---
 openfold/np/protein.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/openfold/np/protein.py b/openfold/np/protein.py
index 28a669c..ff245ec 100644
--- a/openfold/np/protein.py
+++ b/openfold/np/protein.py
@@ -457,7 +457,7 @@ def to_modelcif(prot: Protein) -> str:
         asym_unit_map[chain_idx] = asym
     modeled_assembly = modelcif.Assembly(asym_unit_map.values(), name='Modeled assembly')
 
-    class MyModel(modelcif.model.HomologyModel):
+    class MyModel(modelcif.model.AbInitioModel):
         def get_atoms(self):
             # Add all atom sites.
             for i in range(n):

From f43640b9804c9e7f38a448080756bc6a41fcb516 Mon Sep 17 00:00:00 2001
From: Jose Duarte <jose.m.duarte@gmail.com>
Date: Mon, 20 Feb 2023 15:29:46 -0800
Subject: [PATCH 09/25] Adding ma_qa_metric_local

---
 openfold/np/protein.py | 28 +++++++++++++++++++++++++---
 1 file changed, 25 insertions(+), 3 deletions(-)

diff --git a/openfold/np/protein.py b/openfold/np/protein.py
index ff245ec..cc1c8f8 100644
--- a/openfold/np/protein.py
+++ b/openfold/np/protein.py
@@ -29,6 +29,7 @@ import modelcif.dumper
 import modelcif.reference
 import modelcif.protocol
 import modelcif.alignment
+import modelcif.qa_metric
 
 
 FeatureDict = Mapping[str, np.ndarray]
@@ -457,7 +458,12 @@ def to_modelcif(prot: Protein) -> str:
         asym_unit_map[chain_idx] = asym
     modeled_assembly = modelcif.Assembly(asym_unit_map.values(), name='Modeled assembly')
 
-    class MyModel(modelcif.model.AbInitioModel):
+    class _LocalPLDDT(modelcif.qa_metric.Local, modelcif.qa_metric.PLDDT):
+        name = "pLDDT"
+        software = None
+        description = "Predicted lddt"
+
+    class _MyModel(modelcif.model.AbInitioModel):
         def get_atoms(self):
             # Add all atom sites.
             for i in range(n):
@@ -473,8 +479,24 @@ def to_modelcif(prot: Protein) -> str:
                         x=pos[0], y=pos[1], z=pos[2],
                         het=False, biso=b_factor, occupancy=1.00)
 
+        def add_scores(self):
+            # TODO global scores
+            # self.qa_metrics.extend((_GlobalPLDDT(np.mean(scores_json["plddt"]))))
+            done_residues = {}
+            # local scores
+            for i in range(n):
+                for mask, b_factor in zip(atom_mask[i], b_factors[i]):
+                    if mask < 0.5:
+                        continue
+                    # add 1 per residue, not 1 per atom
+                    if residue_index[i] not in done_residues:
+                        self.qa_metrics.append(
+                            _LocalPLDDT(asym_unit_map[chain_index[i]].residue(residue_index[i]), b_factor))
+                    done_residues[residue_index[i]] = True
+
     # Add the model and modeling protocol to the file and write them out:
-    model = MyModel(assembly=modeled_assembly, name='Best scoring model')
+    model = _MyModel(assembly=modeled_assembly, name='Best scoring model')
+    model.add_scores()
 
     model_group = modelcif.model.ModelGroup([model], name='All models')
     system.model_groups.append(model_group)
@@ -545,7 +567,7 @@ def from_prediction(
 if __name__ == "__main__":
     pdb_file = '/home/jose/Downloads/171l.pdb'
     # pdb_file = '/home/jose/Downloads/2trx.pdb'
-    cif_file = '/home/jose/test.cif'
+    cif_file = '/home/jose/test1.cif'
 
     with open(pdb_file, 'r') as file:
         pdbstr = file.read()

From 00725db14e0728e65f403951b4ca049e10bd6505 Mon Sep 17 00:00:00 2001
From: Jose Duarte <jose.m.duarte@gmail.com>
Date: Mon, 20 Feb 2023 15:41:48 -0800
Subject: [PATCH 10/25] Removing protocol, unclear what is it for

---
 openfold/np/protein.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/openfold/np/protein.py b/openfold/np/protein.py
index cc1c8f8..aa28c34 100644
--- a/openfold/np/protein.py
+++ b/openfold/np/protein.py
@@ -501,10 +501,6 @@ def to_modelcif(prot: Protein) -> str:
     model_group = modelcif.model.ModelGroup([model], name='All models')
     system.model_groups.append(model_group)
 
-    protocol = modelcif.protocol.Protocol()
-    # protocol.steps.append(modelcif.protocol.ModelingStep(
-    #     input_data=aln, output_data=model))
-    system.protocols.append(protocol)
     fh = io.StringIO()
     modelcif.dumper.write(fh, [system])
     modelcifstr = fh.getvalue()
@@ -567,7 +563,7 @@ def from_prediction(
 if __name__ == "__main__":
     pdb_file = '/home/jose/Downloads/171l.pdb'
     # pdb_file = '/home/jose/Downloads/2trx.pdb'
-    cif_file = '/home/jose/test1.cif'
+    cif_file = '/home/jose/test2.cif'
 
     with open(pdb_file, 'r') as file:
         pdbstr = file.read()

From 20bfe6bcb867273ac0fa0bae1a9667d5bf68df4d Mon Sep 17 00:00:00 2001
From: Jose Duarte <jose.m.duarte@gmail.com>
Date: Mon, 20 Feb 2023 15:43:09 -0800
Subject: [PATCH 11/25] Redundant variable

---
 openfold/np/protein.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/openfold/np/protein.py b/openfold/np/protein.py
index aa28c34..d8e7328 100644
--- a/openfold/np/protein.py
+++ b/openfold/np/protein.py
@@ -503,8 +503,7 @@ def to_modelcif(prot: Protein) -> str:
 
     fh = io.StringIO()
     modelcif.dumper.write(fh, [system])
-    modelcifstr = fh.getvalue()
-    return modelcifstr
+    return fh.getvalue()
 
 
 def ideal_atom_mask(prot: Protein) -> np.ndarray:

From 6bb6d7ba70d4ce6dbcb0290383999ddd46f9f658 Mon Sep 17 00:00:00 2001
From: Jose Duarte <jose.m.duarte@gmail.com>
Date: Tue, 21 Feb 2023 22:40:26 -0800
Subject: [PATCH 12/25] Add global score and fixed some issues with multi-chain
 local scores. Now multichain tested and seems to work.

---
 openfold/np/protein.py | 34 ++++++++++++++++++++++++----------
 1 file changed, 24 insertions(+), 10 deletions(-)

diff --git a/openfold/np/protein.py b/openfold/np/protein.py
index d8e7328..35ff465 100644
--- a/openfold/np/protein.py
+++ b/openfold/np/protein.py
@@ -463,6 +463,11 @@ def to_modelcif(prot: Protein) -> str:
         software = None
         description = "Predicted lddt"
 
+    class _GlobalPLDDT(modelcif.qa_metric.Global, modelcif.qa_metric.PLDDT):
+        name = "pLDDT"
+        software = None
+        description = "Global pLDDT, mean of per-residue pLDDTs"
+
     class _MyModel(modelcif.model.AbInitioModel):
         def get_atoms(self):
             # Add all atom sites.
@@ -480,19 +485,27 @@ def to_modelcif(prot: Protein) -> str:
                         het=False, biso=b_factor, occupancy=1.00)
 
         def add_scores(self):
-            # TODO global scores
-            # self.qa_metrics.extend((_GlobalPLDDT(np.mean(scores_json["plddt"]))))
-            done_residues = {}
             # local scores
+            plldt_per_residue = {}
             for i in range(n):
                 for mask, b_factor in zip(atom_mask[i], b_factors[i]):
                     if mask < 0.5:
                         continue
                     # add 1 per residue, not 1 per atom
-                    if residue_index[i] not in done_residues:
-                        self.qa_metrics.append(
-                            _LocalPLDDT(asym_unit_map[chain_index[i]].residue(residue_index[i]), b_factor))
-                    done_residues[residue_index[i]] = True
+                    if chain_index[i] not in plldt_per_residue:
+                        # first time a chain index is seen: add the key and start the residue dict
+                        plldt_per_residue[chain_index[i]] = {residue_index[i]: b_factor}
+                    if residue_index[i] not in plldt_per_residue[chain_index[i]]:
+                        plldt_per_residue[chain_index[i]][residue_index[i]] = b_factor
+            plddts = []
+            for chain_idx in plldt_per_residue:
+                for residue_idx in plldt_per_residue[chain_idx]:
+                    plddt = plldt_per_residue[chain_idx][residue_idx]
+                    plddts.append(plddt)
+                    self.qa_metrics.append(
+                        _LocalPLDDT(asym_unit_map[chain_idx].residue(residue_idx), plddt))
+            # global score
+            self.qa_metrics.append((_GlobalPLDDT(np.mean(plddts))))
 
     # Add the model and modeling protocol to the file and write them out:
     model = _MyModel(assembly=modeled_assembly, name='Best scoring model')
@@ -560,14 +573,15 @@ def from_prediction(
 
 
 if __name__ == "__main__":
-    pdb_file = '/home/jose/Downloads/171l.pdb'
+    # pdb_file = '/Users/jose/Downloads/171l.pdb'
     # pdb_file = '/home/jose/Downloads/2trx.pdb'
-    cif_file = '/home/jose/test2.cif'
+    pdb_file = '/Users/jose/Downloads/1bwd.pdb'
+    cif_file = '/Users/jose/test2.cif'
 
     with open(pdb_file, 'r') as file:
         pdbstr = file.read()
 
-    prot = from_pdb_string(pdbstr, "A")
+    prot = from_pdb_string(pdbstr)
     cifstr = to_modelcif(prot)
     print(cifstr)
     with open(cif_file, 'w') as fw:

From 8805313ccc8a169dabc50caf599a94994bb69951 Mon Sep 17 00:00:00 2001
From: Jose Duarte <jose.m.duarte@gmail.com>
Date: Sun, 26 Feb 2023 10:26:30 -0800
Subject: [PATCH 13/25] Various minor fixes

---
 openfold/np/protein.py | 29 +++++++++++++++--------------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/openfold/np/protein.py b/openfold/np/protein.py
index 35ff465..4a333db 100644
--- a/openfold/np/protein.py
+++ b/openfold/np/protein.py
@@ -394,7 +394,9 @@ def to_pdb(prot: Protein) -> str:
 
 def to_modelcif(prot: Protein) -> str:
     """
-    Converts a `Protein` instance to a ModelCIF string.
+    Converts a `Protein` instance to a ModelCIF string. Chains with identical modelled coordinates
+    will be treated as the same polymer entity. But note that if chains differ in modelled regions,
+    no attempt is made at identifying them as a single polymer entity.
 
     Args:
       prot: The protein to convert to PDB.
@@ -430,13 +432,12 @@ def to_modelcif(prot: Protein) -> str:
         seq.append(restypes[aatype[i]])
         last_chain_idx = chain_index[i]
     # finally add the last chain
-    if last_chain_idx not in seqs:
-        seqs[last_chain_idx] = seq
+    seqs[last_chain_idx] = seq
 
     # now reduce sequences to unique ones (note this won't work if different asyms have different unmodelled regions)
     unique_seqs = {}
-    for chain_idx in seqs.keys():
-        seq = "".join(seqs[chain_idx])
+    for chain_idx, seq_list in seqs.items():
+        seq = "".join(seq_list)
         if seq in unique_seqs:
             unique_seqs[seq].append(chain_idx)
         else:
@@ -445,7 +446,7 @@ def to_modelcif(prot: Protein) -> str:
     # adding 1 entity per unique sequence
     entities_map = {}
     for key, value in unique_seqs.items():
-        model_e = modelcif.Entity("".join(key), description='Model subunit')
+        model_e = modelcif.Entity(key, description='Model subunit')
         for chain_idx in value:
             entities_map[chain_idx] = model_e
 
@@ -486,21 +487,21 @@ def to_modelcif(prot: Protein) -> str:
 
         def add_scores(self):
             # local scores
-            plldt_per_residue = {}
+            plddt_per_residue = {}
             for i in range(n):
                 for mask, b_factor in zip(atom_mask[i], b_factors[i]):
                     if mask < 0.5:
                         continue
                     # add 1 per residue, not 1 per atom
-                    if chain_index[i] not in plldt_per_residue:
+                    if chain_index[i] not in plddt_per_residue:
                         # first time a chain index is seen: add the key and start the residue dict
-                        plldt_per_residue[chain_index[i]] = {residue_index[i]: b_factor}
-                    if residue_index[i] not in plldt_per_residue[chain_index[i]]:
-                        plldt_per_residue[chain_index[i]][residue_index[i]] = b_factor
+                        plddt_per_residue[chain_index[i]] = {residue_index[i]: b_factor}
+                    if residue_index[i] not in plddt_per_residue[chain_index[i]]:
+                        plddt_per_residue[chain_index[i]][residue_index[i]] = b_factor
             plddts = []
-            for chain_idx in plldt_per_residue:
-                for residue_idx in plldt_per_residue[chain_idx]:
-                    plddt = plldt_per_residue[chain_idx][residue_idx]
+            for chain_idx in plddt_per_residue:
+                for residue_idx in plddt_per_residue[chain_idx]:
+                    plddt = plddt_per_residue[chain_idx][residue_idx]
                     plddts.append(plddt)
                     self.qa_metrics.append(
                         _LocalPLDDT(asym_unit_map[chain_idx].residue(residue_idx), plddt))

From 9c38e9587ea9ae1003fc5259c000f879b16e1da2 Mon Sep 17 00:00:00 2001
From: Vaclav Hanzl <64649594+vaclavhanzl@users.noreply.github.com>
Date: Sun, 5 Mar 2023 17:23:45 +0100
Subject: [PATCH 14/25] Fix notebook: Colab now has python 3.8, fix imports,
 mitigate UTF-8 glitch

* Python 3.8 is now what we get in Colab, use it in our Conda as well
* Fix imports, make some unconditional - chain of depencencies makes them
  needed even for relax_prediction==False
* Mitigate spurious error "A UTF-8 locale is required. Got ANSI_X3.4-1968"
  which makes all shell commands defunct - use shutil.make_archive()
---
 notebooks/OpenFold.ipynb | 29 +++++++++++++++--------------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/notebooks/OpenFold.ipynb b/notebooks/OpenFold.ipynb
index b816e91..23abefa 100755
--- a/notebooks/OpenFold.ipynb
+++ b/notebooks/OpenFold.ipynb
@@ -121,10 +121,11 @@
         "    %env PATH=/opt/conda/bin:{PATH}\n",
         "\n",
         "    # Install the required versions of all dependencies.\n",
+        "    %shell conda install -y -q conda==4.13.0\n",
         "    %shell conda install -y -q -c conda-forge -c bioconda \\\n",
         "      kalign2=2.04 \\\n",
         "      hhsuite=3.3.0 \\\n",
-        "      python=3.7 \\\n",
+        "      python=3.8 \\\n",
         "      2>&1 1>/dev/null\n",
         "    %shell pip install -q \\\n",
         "      ml-collections==0.1.0 \\\n",
@@ -180,15 +181,14 @@
         "    %shell cp -f /content/stereo_chemical_props.txt /content/openfold/openfold/resources\n",
         "    %shell /usr/bin/python3 -m pip install -q ./openfold\n",
         "\n",
+        "    %shell conda install -y -q -c conda-forge openmm=7.5.1\n",
+        "    # Apply OpenMM patch.\n",
+        "    %shell pushd /opt/conda/lib/python3.8/site-packages/ && \\\n",
+        "        patch -p0 < /content/openfold/lib/openmm.patch && \\\n",
+        "        popd\n",
+        "\n",
         "    if(relax_prediction):\n",
-        "      %shell conda install -y -q -c conda-forge \\\n",
-        "        openmm=7.5.1 \\\n",
-        "        pdbfixer=1.7\n",
-        "      \n",
-        "      # Apply OpenMM patch.\n",
-        "      %shell pushd /opt/conda/lib/python3.7/site-packages/ && \\\n",
-        "          patch -p0 < /content/openfold/lib/openmm.patch && \\\n",
-        "          popd\n",
+        "      %shell conda install -y -q -c conda-forge pdbfixer=1.7\n",
         "\n",
         "    if(weight_set == 'AlphaFold'):\n",
         "      %shell mkdir --parents \"{ALPHAFOLD_PARAMS_DIR}\"\n",
@@ -222,8 +222,8 @@
         "import unittest.mock\n",
         "import sys\n",
         "\n",
-        "sys.path.insert(0, '/usr/local/lib/python3.7/site-packages/')\n",
-        "sys.path.append('/opt/conda/lib/python3.7/site-packages')\n",
+        "sys.path.insert(0, '/usr/local/lib/python3.8/site-packages/')\n",
+        "sys.path.append('/opt/conda/lib/python3.8/site-packages')\n",
         "\n",
         "# Allows us to skip installing these packages\n",
         "unnecessary_modules = [\n",
@@ -247,6 +247,7 @@
         "import numpy as np\n",
         "import py3Dmol\n",
         "import torch\n",
+        "import shutil\n",
         "\n",
         "# A filthy hack to avoid slow Linear layer initialization\n",
         "import openfold.model.primitives\n",
@@ -269,7 +270,7 @@
         "from openfold.np import protein\n",
         "if(relax_prediction):\n",
         "  from openfold.np.relax import relax\n",
-        "  from openfold.np.relax import utils\n",
+        "from openfold.np.relax.utils import overwrite_b_factors\n",
         "from openfold.utils.import_weights import import_jax_weights_\n",
         "from openfold.utils.tensor_utils import tensor_tree_map\n",
         "\n",
@@ -590,7 +591,7 @@
         "      banded_b_factors.append(idx)\n",
         "      break\n",
         "banded_b_factors = np.array(banded_b_factors)[:, None] * final_atom_mask\n",
-        "to_visualize_pdb = utils.overwrite_b_factors(best_pdb, banded_b_factors)\n",
+        "to_visualize_pdb = overwrite_b_factors(best_pdb, banded_b_factors)\n",
         "\n",
         "# --- Visualise the prediction & confidence ---\n",
         "show_sidechains = True\n",
@@ -688,7 +689,7 @@
         "\n",
         "\n",
         "# --- Download the predictions ---\n",
-        "!zip -q -r {output_dir}.zip {output_dir}\n",
+        "shutil.make_archive(base_name='prediction', format='zip', root_dir=output_dir)\n",
         "files.download(f'{output_dir}.zip')"
       ],
       "execution_count": null,

From c84fd44340d2a92fd6a1358048f27b276dba73ab Mon Sep 17 00:00:00 2001
From: Vaclav Hanzl <64649594+vaclavhanzl@users.noreply.github.com>
Date: Mon, 6 Mar 2023 11:10:57 +0100
Subject: [PATCH 15/25] Fix notebook: Better mitigation of the ANSI_X3.4-1968
 problem in Colab

Colab is easily broken by openmm and other software messing with locale.
This mitigation patch lets the %shell and ! work again.
It is much stronger solution than just avoiding shell when making zip.
---
 notebooks/OpenFold.ipynb | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/notebooks/OpenFold.ipynb b/notebooks/OpenFold.ipynb
index 23abefa..1b5f630 100755
--- a/notebooks/OpenFold.ipynb
+++ b/notebooks/OpenFold.ipynb
@@ -249,6 +249,13 @@
         "import torch\n",
         "import shutil\n",
         "\n",
+        "# Prevent shell magic being broken by openmm, prevent this cryptic error:\n",
+        "# \"NotImplementedError: A UTF-8 locale is required. Got ANSI_X3.4-1968\"\n",
+        "import locale\n",
+        "def getpreferredencoding(do_setlocale = True):\n",
+        "    return \"UTF-8\"\n",
+        "locale.getpreferredencoding = getpreferredencoding\n",
+        "\n",
         "# A filthy hack to avoid slow Linear layer initialization\n",
         "import openfold.model.primitives\n",
         "\n",

From 007ab6f35b1cdf5a8cb83fe4a9a01eefb9e1c8b5 Mon Sep 17 00:00:00 2001
From: Vaclav Hanzl <64649594+vaclavhanzl@users.noreply.github.com>
Date: Mon, 6 Mar 2023 13:53:26 +0100
Subject: [PATCH 16/25] Fix notebook: Also save the protein to zip when
 relaxation is off

Previously protein got computed and shown in both cases but only
saved to the downloaded zip when relaxation was done.
---
 notebooks/OpenFold.ipynb | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/notebooks/OpenFold.ipynb b/notebooks/OpenFold.ipynb
index 1b5f630..96b7df2 100755
--- a/notebooks/OpenFold.ipynb
+++ b/notebooks/OpenFold.ipynb
@@ -579,14 +579,13 @@
         "    relaxed_pdb, _, _ = amber_relaxer.process(\n",
         "        prot=unrelaxed_proteins[best_model_name]\n",
         "    )\n",
-        "\n",
-        "    # Write out the prediction\n",
-        "    pred_output_path = os.path.join(output_dir, 'selected_prediction.pdb')\n",
-        "    with open(pred_output_path, 'w') as f:\n",
-        "      f.write(relaxed_pdb)\n",
-        "\n",
         "    best_pdb = relaxed_pdb\n",
         "\n",
+        "  # Write out the prediction\n",
+        "  pred_output_path = os.path.join(output_dir, 'selected_prediction.pdb')\n",
+        "  with open(pred_output_path, 'w') as f:\n",
+        "    f.write(best_pdb)\n",
+        "\n",
         "  pbar.update(n=1)  # Finished AMBER relax.\n",
         "\n",
         "# Construct multiclass b-factors to indicate confidence bands\n",

From 1fafb86898585e90c830351983d63ad46ef370b3 Mon Sep 17 00:00:00 2001
From: Vaclav Hanzl <64649594+vaclavhanzl@users.noreply.github.com>
Date: Mon, 6 Mar 2023 17:54:11 +0100
Subject: [PATCH 17/25] Improve notebook: Make install/import independent of
 relax_prediction

This way user can repeat just the first and last cell to get both versions.
---
 notebooks/OpenFold.ipynb | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/notebooks/OpenFold.ipynb b/notebooks/OpenFold.ipynb
index 96b7df2..8f00178 100755
--- a/notebooks/OpenFold.ipynb
+++ b/notebooks/OpenFold.ipynb
@@ -186,9 +186,7 @@
         "    %shell pushd /opt/conda/lib/python3.8/site-packages/ && \\\n",
         "        patch -p0 < /content/openfold/lib/openmm.patch && \\\n",
         "        popd\n",
-        "\n",
-        "    if(relax_prediction):\n",
-        "      %shell conda install -y -q -c conda-forge pdbfixer=1.7\n",
+        "    %shell conda install -y -q -c conda-forge pdbfixer=1.7\n",
         "\n",
         "    if(weight_set == 'AlphaFold'):\n",
         "      %shell mkdir --parents \"{ALPHAFOLD_PARAMS_DIR}\"\n",
@@ -275,8 +273,7 @@
         "from openfold.data.tools import jackhmmer\n",
         "from openfold.model import model\n",
         "from openfold.np import protein\n",
-        "if(relax_prediction):\n",
-        "  from openfold.np.relax import relax\n",
+        "from openfold.np.relax import relax\n",
         "from openfold.np.relax.utils import overwrite_b_factors\n",
         "from openfold.utils.import_weights import import_jax_weights_\n",
         "from openfold.utils.tensor_utils import tensor_tree_map\n",

From 8a76cc153e408fea72c8607535de8636609303cc Mon Sep 17 00:00:00 2001
From: Jose Duarte <jose.m.duarte@gmail.com>
Date: Mon, 6 Mar 2023 15:48:23 -0800
Subject: [PATCH 18/25] WIP towards writing relaxed structure as modelcif

---
 openfold/np/relax/amber_minimize.py |  3 ---
 openfold/np/relax/relax.py          |  4 +++-
 openfold/utils/script_utils.py      | 12 ++++++++----
 run_pretrained_openfold.py          |  2 +-
 thread_sequence.py                  |  2 +-
 5 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/openfold/np/relax/amber_minimize.py b/openfold/np/relax/amber_minimize.py
index dfc0984..6448e15 100644
--- a/openfold/np/relax/amber_minimize.py
+++ b/openfold/np/relax/amber_minimize.py
@@ -524,9 +524,6 @@ def run_pipeline(
     _check_residues_are_well_defined(prot)
     pdb_string = clean_protein(prot, checks=checks)
 
-    # We keep the input around to restore metadata deleted by the relaxer
-    input_prot = prot
-
     exclude_residues = exclude_residues or []
     exclude_residues = set(exclude_residues)
     violations = np.inf
diff --git a/openfold/np/relax/relax.py b/openfold/np/relax/relax.py
index 155e379..a6fd804 100644
--- a/openfold/np/relax/relax.py
+++ b/openfold/np/relax/relax.py
@@ -57,7 +57,7 @@ class AmberRelaxation(object):
         self._use_gpu = use_gpu
 
     def process(
-        self, *, prot: protein.Protein
+        self, *, prot: protein.Protein, cif_output: bool
     ) -> Tuple[str, Dict[str, Any], np.ndarray]:
         """Runs Amber relax on a prediction, adds hydrogens, returns PDB string."""
         out = amber_minimize.run_pipeline(
@@ -78,6 +78,8 @@ class AmberRelaxation(object):
             "attempts": out["min_attempts"],
             "rmsd": rmsd,
         }
+        # TODO write all this as ModelCIF if param is true. Should be simply proteint.to_modelcif(prot)
+        #  and then add the other pieces, except that clean_protein() does quite some additional stuff...
         pdb_str = amber_minimize.clean_protein(prot)
         min_pdb = utils.overwrite_pdb_coordinates(pdb_str, min_pos)
         min_pdb = utils.overwrite_b_factors(min_pdb, prot.b_factors)
diff --git a/openfold/utils/script_utils.py b/openfold/utils/script_utils.py
index 38de08b..eaa054e 100644
--- a/openfold/utils/script_utils.py
+++ b/openfold/utils/script_utils.py
@@ -228,7 +228,7 @@ def prep_output(out, batch, feature_dict, feature_processor, config_preset, mult
     return unrelaxed_protein
 
 
-def relax_protein(config, model_device, unrelaxed_protein, output_directory, output_name):
+def relax_protein(config, model_device, unrelaxed_protein, output_directory, output_name, cif_output):
     amber_relaxer = relax.AmberRelaxation(
         use_gpu=(model_device != "cpu"),
         **config.relax,
@@ -239,7 +239,8 @@ def relax_protein(config, model_device, unrelaxed_protein, output_directory, out
     if "cuda" in model_device:
         device_no = model_device.split(":")[-1]
         os.environ["CUDA_VISIBLE_DEVICES"] = device_no
-    relaxed_pdb_str, _, _ = amber_relaxer.process(prot=unrelaxed_protein)
+    # the struct_str will contain either a PDB-format or a ModelCIF format string
+    struct_str, _, _ = amber_relaxer.process(prot=unrelaxed_protein, cif_output=cif_output)
     os.environ["CUDA_VISIBLE_DEVICES"] = visible_devices
     relaxation_time = time.perf_counter() - t
 
@@ -247,10 +248,13 @@ def relax_protein(config, model_device, unrelaxed_protein, output_directory, out
     update_timings({"relaxation": relaxation_time}, os.path.join(output_directory, "timings.json"))
 
     # Save the relaxed PDB.
+    suffix = "_relaxed.pdb"
+    if cif_output:
+        suffix = "_relaxed.cif"
     relaxed_output_path = os.path.join(
-        output_directory, f'{output_name}_relaxed.pdb'
+        output_directory, f'{output_name}{suffix}'
     )
     with open(relaxed_output_path, 'w') as fp:
-        fp.write(relaxed_pdb_str)
+        fp.write(struct_str)
 
     logger.info(f"Relaxed output written to {relaxed_output_path}...")
\ No newline at end of file
diff --git a/run_pretrained_openfold.py b/run_pretrained_openfold.py
index 66bd2d5..2393ab3 100644
--- a/run_pretrained_openfold.py
+++ b/run_pretrained_openfold.py
@@ -288,7 +288,7 @@ def main(args):
             if not args.skip_relaxation:
                 # Relax the prediction.
                 logger.info(f"Running relaxation on {unrelaxed_output_path}...")
-                relax_protein(config, args.model_device, unrelaxed_protein, output_directory, output_name)
+                relax_protein(config, args.model_device, unrelaxed_protein, output_directory, output_name, args.cif_output)
 
             if args.save_outputs:
                 output_dict_path = os.path.join(
diff --git a/thread_sequence.py b/thread_sequence.py
index e67fcdd..48b409f 100644
--- a/thread_sequence.py
+++ b/thread_sequence.py
@@ -106,7 +106,7 @@ def main(args):
         logger.info(f"Output written to {unrelaxed_output_path}...")
 
         logger.info(f"Running relaxation on {unrelaxed_output_path}...")
-        relax_protein(config, args.model_device, unrelaxed_protein, output_directory, output_name)
+        relax_protein(config, args.model_device, unrelaxed_protein, output_directory, output_name, False)
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()

From 8847fb6945e80f3e97f9479724e28ffeecca42a7 Mon Sep 17 00:00:00 2001
From: josemduarte <jose.m.duarte@gmail.com>
Date: Mon, 6 Mar 2023 16:32:28 -0800
Subject: [PATCH 19/25] Now modelcif for relaxed should work

---
 openfold/np/relax/relax.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/openfold/np/relax/relax.py b/openfold/np/relax/relax.py
index a6fd804..2711c76 100644
--- a/openfold/np/relax/relax.py
+++ b/openfold/np/relax/relax.py
@@ -78,8 +78,6 @@ class AmberRelaxation(object):
             "attempts": out["min_attempts"],
             "rmsd": rmsd,
         }
-        # TODO write all this as ModelCIF if param is true. Should be simply proteint.to_modelcif(prot)
-        #  and then add the other pieces, except that clean_protein() does quite some additional stuff...
         pdb_str = amber_minimize.clean_protein(prot)
         min_pdb = utils.overwrite_pdb_coordinates(pdb_str, min_pos)
         min_pdb = utils.overwrite_b_factors(min_pdb, prot.b_factors)
@@ -91,5 +89,11 @@ class AmberRelaxation(object):
         ]
 
         min_pdb = protein.add_pdb_headers(prot, min_pdb)
+        output_str = min_pdb
+        if cif_output:
+            # TODO the model cif will be missing some metadata like headers (PARENTs and
+            #      REMARK with some details of the run, like num of recycles)
+            final_prot = protein.from_pdb_string(min_pdb)
+            output_str = protein.to_modelcif(final_prot)
 
-        return min_pdb, debug_data, violations
+        return output_str, debug_data, violations

From 299f8948761e3f6688cad19004c359d9942e3419 Mon Sep 17 00:00:00 2001
From: josemduarte <jose.m.duarte@gmail.com>
Date: Mon, 6 Mar 2023 16:39:53 -0800
Subject: [PATCH 20/25] Cleanup

---
 openfold/np/protein.py | 16 ----------------
 1 file changed, 16 deletions(-)

diff --git a/openfold/np/protein.py b/openfold/np/protein.py
index 4a333db..df8ed32 100644
--- a/openfold/np/protein.py
+++ b/openfold/np/protein.py
@@ -571,19 +571,3 @@ def from_prediction(
         parents=parents,
         parents_chain_index=parents_chain_index,
     )
-
-
-if __name__ == "__main__":
-    # pdb_file = '/Users/jose/Downloads/171l.pdb'
-    # pdb_file = '/home/jose/Downloads/2trx.pdb'
-    pdb_file = '/Users/jose/Downloads/1bwd.pdb'
-    cif_file = '/Users/jose/test2.cif'
-
-    with open(pdb_file, 'r') as file:
-        pdbstr = file.read()
-
-    prot = from_pdb_string(pdbstr)
-    cifstr = to_modelcif(prot)
-    print(cifstr)
-    with open(cif_file, 'w') as fw:
-        fw.write(cifstr)

From 03f3a7f59cf0b62dea47df321f62255d2184143e Mon Sep 17 00:00:00 2001
From: josemduarte <jose.m.duarte@gmail.com>
Date: Mon, 6 Mar 2023 16:41:14 -0800
Subject: [PATCH 21/25] Correcting docs

---
 openfold/np/protein.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/openfold/np/protein.py b/openfold/np/protein.py
index df8ed32..72bb87a 100644
--- a/openfold/np/protein.py
+++ b/openfold/np/protein.py
@@ -82,8 +82,7 @@ def from_pdb_string(pdb_str: str, chain_id: Optional[str] = None) -> Protein:
 
     Args:
       pdb_str: The contents of the pdb file
-      chain_id: If None, then the pdb file must contain a single chain (which
-        will be parsed). If chain_id is specified (e.g. A), then only that chain
+      chain_id: If None, then the whole pdb file is parsed. If chain_id is specified (e.g. A), then only that chain
         is parsed.
 
     Returns:

From bdbfef1de1daeced31d394224a8ed69a7c0929c0 Mon Sep 17 00:00:00 2001
From: Jonathan King <jonathanking@berkeley.edu>
Date: Tue, 14 Mar 2023 16:09:22 -0400
Subject: [PATCH 22/25] Fix check for max_seqlen.

Previously, long sequences were not excluded from the script.
This commit changes the comparison to exclude sequences with length
greater than args.max_seqlen.
---
 scripts/download_cameo.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/scripts/download_cameo.py b/scripts/download_cameo.py
index 815a34b..11e5cec 100644
--- a/scripts/download_cameo.py
+++ b/scripts/download_cameo.py
@@ -57,9 +57,8 @@ def main(args):
 
         seq = mmcif_object.chain_to_seqres[chain_id]
 
-        if(args.max_seqlen > 0):
-            if(len(seq) > len(seq)):
-                continue
+        if(args.max_seqlen > 0 and len(seq) > args.max_seqlen):
+            continue
 
         fasta_file = '\n'.join([
             f">{pdb_id}_{chain_id}",

From 6625e8dfffbade9d5f6c94ca33db0462554ccb2f Mon Sep 17 00:00:00 2001
From: Nikita Smetanin <nikitasmetanin@fb.com>
Date: Thu, 16 Mar 2023 21:45:09 +0000
Subject: [PATCH 23/25] Improve TriangularMultiplicativeUpdate stability in
 fp16 mode

---
 openfold/model/triangular_multiplicative_update.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/openfold/model/triangular_multiplicative_update.py b/openfold/model/triangular_multiplicative_update.py
index 3885e4c..5e6f568 100644
--- a/openfold/model/triangular_multiplicative_update.py
+++ b/openfold/model/triangular_multiplicative_update.py
@@ -392,8 +392,13 @@ class TriangleMultiplicativeUpdate(nn.Module):
         b = mask
         b = b * self.sigmoid(self.linear_b_g(z))
         b = b * self.linear_b_p(z)
-        
-        if(is_fp16_enabled()): 
+
+        # Prevents overflow of torch.matmul in combine projections in
+        # reduced-precision modes
+        a = a / a.std()
+	b = b / b.std()
+
+        if(is_fp16_enabled()):
             with torch.cuda.amp.autocast(enabled=False):
                 x = self._combine_projections(a.float(), b.float())
         else:

From ee5d2c35b0cce4826257adb937c4058b31e2ae88 Mon Sep 17 00:00:00 2001
From: Gustaf Ahdritz <gahdritz@gmail.com>
Date: Mon, 10 Apr 2023 01:32:33 -0400
Subject: [PATCH 24/25] Remove comment

---
 setup.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/setup.py b/setup.py
index 2545e52..57e6f26 100644
--- a/setup.py
+++ b/setup.py
@@ -1,4 +1,3 @@
-# Originally from Openfold https://github.com/aqlaboratory/openfold, modified.
 # Copyright 2021 AlQuraishi Laboratory
 # Copyright 2021 DeepMind Technologies Limited
 #

From c21e5e7ae15613b3b7097948d02d45d4236bedb8 Mon Sep 17 00:00:00 2001
From: Gustaf Ahdritz <gahdritz@gmail.com>
Date: Mon, 10 Apr 2023 01:38:28 -0400
Subject: [PATCH 25/25] Fix spacing issue

---
 openfold/model/triangular_multiplicative_update.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/openfold/model/triangular_multiplicative_update.py b/openfold/model/triangular_multiplicative_update.py
index 5e6f568..ff10cea 100644
--- a/openfold/model/triangular_multiplicative_update.py
+++ b/openfold/model/triangular_multiplicative_update.py
@@ -396,7 +396,7 @@ class TriangleMultiplicativeUpdate(nn.Module):
         # Prevents overflow of torch.matmul in combine projections in
         # reduced-precision modes
         a = a / a.std()
-	b = b / b.std()
+        b = b / b.std()
 
         if(is_fp16_enabled()):
             with torch.cuda.amp.autocast(enabled=False):