From 310f240c0045310f399b7a4f388e3cd5da3e497d Mon Sep 17 00:00:00 2001 From: "Zhouran Qiao zhuoran.qiao@entos.ai" Date: Tue, 14 Feb 2023 16:58:06 -0800 Subject: [PATCH 01/25] Resolving torch cuda availablity for cpu-only installation --- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 368e1d8..57e6f26 100644 --- a/setup.py +++ b/setup.py @@ -16,6 +16,7 @@ import os from setuptools import setup, Extension, find_packages import subprocess +import torch from torch.utils.cpp_extension import BuildExtension, CppExtension, CUDAExtension, CUDA_HOME from scripts.utils import get_nvidia_cc @@ -37,7 +38,7 @@ extra_cuda_flags = [ ] def get_cuda_bare_metal_version(cuda_dir): - if cuda_dir==None: + if cuda_dir==None or torch.version.cuda==None: print("CUDA is not found, cpu version is installed") return None, -1, 0 else: From bb11fb189ad3bcaf28ed9cc168b6cf09e5b84973 Mon Sep 17 00:00:00 2001 From: Zhuoran Qiao Date: Thu, 16 Feb 2023 12:37:33 -0800 Subject: [PATCH 02/25] Update setup.py --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 57e6f26..2545e52 100644 --- a/setup.py +++ b/setup.py @@ -1,3 +1,4 @@ +# Originally from Openfold https://github.com/aqlaboratory/openfold, modified. # Copyright 2021 AlQuraishi Laboratory # Copyright 2021 DeepMind Technologies Limited # From 5ddaa186b9088f1ab699ea5807c40736cc0c7547 Mon Sep 17 00:00:00 2001 From: Jose Duarte Date: Mon, 20 Feb 2023 09:52:49 -0800 Subject: [PATCH 03/25] WIP: towards modelcif writing --- openfold/np/protein.py | 63 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/openfold/np/protein.py b/openfold/np/protein.py index af8739c..b982be0 100644 --- a/openfold/np/protein.py +++ b/openfold/np/protein.py @@ -23,6 +23,12 @@ import string from openfold.np import residue_constants from Bio.PDB import PDBParser import numpy as np +import modelcif +import modelcif.model +import modelcif.dumper +import modelcif.reference +import modelcif.protocol +import modelcif.alignment FeatureDict = Mapping[str, np.ndarray] @@ -385,6 +391,63 @@ def to_pdb(prot: Protein) -> str: return "\n".join(pdb_lines) +def to_modelcif(prot: Protein) -> str: + """ + Converts a `Protein` instance to a ModelCIF string. + + Args: + prot: The protein to convert to PDB. + + Returns: + ModelCIF string. + """ + + atom_mask = prot.atom_mask + aatype = prot.aatype + atom_positions = prot.atom_positions + residue_index = prot.residue_index.astype(np.int32) + b_factors = prot.b_factors + chain_index = prot.chain_index + + system = modelcif.System(title='Ligand example') + + # Describe the amino acid (polymer) sequences as Entity objects, for both + # template and model: + model_e = modelcif.Entity('AYVINDSCIA', description='Model subunit') + + # Define the model assembly, as two AsymUnits. NonPolymerFromTemplate is a + # subclass of AsymUnit that additionally notes the Template from which it + # was derived. In this case we state that the ligand was simply copied from + # the template into the target (explicit=False): + asymA = modelcif.AsymUnit(model_e, details='Model subunit A', id='A') + modeled_assembly = modelcif.Assembly((asymA,), name='Modeled assembly') + + class MyModel(modelcif.model.HomologyModel): + asym_unit_map = {'A': asymA} + + def get_atoms(self): + for asym, seq_id, type_symbol, atom_id, x, y, z in atoms: + yield modelcif.model.Atom( + asym_unit=self.asym_unit_map[asym], type_symbol=type_symbol, + seq_id=seq_id, atom_id=atom_id, x=x, y=y, z=z, + het=seq_id is None) + + # Add the model and modeling protocol to the file and write them out: + model = MyModel(assembly=modeled_assembly, name='Best scoring model') + + model_group = modelcif.model.ModelGroup([model], name='All models') + system.model_groups.append(model_group) + + protocol = modelcif.protocol.Protocol() + # protocol.steps.append(modelcif.protocol.ModelingStep( + # input_data=aln, output_data=model)) + system.protocols.append(protocol) + + with open('/Users/jose/output3.cif', 'w') as fh: + modelcif.dumper.write(fh, [system]) + + + def ideal_atom_mask(prot: Protein) -> np.ndarray: """Computes an ideal atom mask. From 7bc13cfc8ecd232d412c6c70856a444e9b3f4434 Mon Sep 17 00:00:00 2001 From: Jose Duarte Date: Mon, 20 Feb 2023 09:53:13 -0800 Subject: [PATCH 04/25] WIP: towards modelcif writing, conda --- environment.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/environment.yml b/environment.yml index 0dfb9db..b8a2082 100644 --- a/environment.yml +++ b/environment.yml @@ -27,4 +27,5 @@ dependencies: - typing-extensions==3.10.0.2 - pytorch_lightning==1.5.10 - wandb==0.12.21 + - modelcif==0.7 - git+https://github.com/NVIDIA/dllogger.git From ba8de687af386d0bca9da890cdc0a227b424e692 Mon Sep 17 00:00:00 2001 From: Jose Duarte Date: Mon, 20 Feb 2023 11:26:15 -0800 Subject: [PATCH 05/25] Working implementation for 1 chain case --- openfold/np/protein.py | 97 +++++++++++++++++++++++++++++++++--------- 1 file changed, 76 insertions(+), 21 deletions(-) diff --git a/openfold/np/protein.py b/openfold/np/protein.py index b982be0..f15e135 100644 --- a/openfold/np/protein.py +++ b/openfold/np/protein.py @@ -62,7 +62,7 @@ class Protein: # Chain indices for multi-chain predictions chain_index: Optional[np.ndarray] = None - # Optional remark about the protein. Included as a comment in output PDB + # Optional remark about the protein. Included as a comment in output PDB # files remark: Optional[str] = None @@ -177,7 +177,7 @@ def from_proteinnet_string(proteinnet_str: str) -> Protein: tag.strip() for tag in re.split(tag_re, proteinnet_str) if len(tag) > 0 ] groups = zip(tags[0::2], [l.split('\n') for l in tags[1::2]]) - + atoms = ['N', 'CA', 'C'] aatype = None atom_positions = None @@ -252,7 +252,7 @@ def add_pdb_headers(prot: Protein, pdb_str: str) -> str: """ out_pdb_lines = [] lines = pdb_str.split('\n') - + remark = prot.remark if(remark is not None): out_pdb_lines.append(f"REMARK {remark}") @@ -347,7 +347,7 @@ def to_pdb(prot: Protein) -> str: 0 ] # Protein supports only C, N, O, S, this works. charge = "" - + chain_tag = "A" if(chain_index is not None): chain_tag = chain_tags[chain_index[i]] @@ -402,6 +402,10 @@ def to_modelcif(prot: Protein) -> str: ModelCIF string. """ + restypes = residue_constants.restypes + ["X"] + res_1to3 = lambda r: residue_constants.restype_1to3.get(restypes[r], "UNK") + atom_types = residue_constants.atom_types + atom_mask = prot.atom_mask aatype = prot.aatype atom_positions = prot.atom_positions @@ -409,16 +413,14 @@ def to_modelcif(prot: Protein) -> str: b_factors = prot.b_factors chain_index = prot.chain_index - system = modelcif.System(title='Ligand example') + system = modelcif.System(title='OpenFold prediction') - # Describe the amino acid (polymer) sequences as Entity objects, for both - # template and model: - model_e = modelcif.Entity('AYVINDSCIA', description='Model subunit') + # sequence into entity + n = aatype.shape[0] + seq = [restypes[aatype[i]] for i in range(n)] + model_e = modelcif.Entity("".join(seq), description='Model subunit') - # Define the model assembly, as two AsymUnits. NonPolymerFromTemplate is a - # subclass of AsymUnit that additionally notes the Template from which it - # was derived. In this case we state that the ligand was simply copied from - # the template into the target (explicit=False): + # Define the model assembly asymA = modelcif.AsymUnit(model_e, details='Model subunit A', id='A') modeled_assembly = modelcif.Assembly((asymA,), name='Modeled assembly') @@ -426,11 +428,55 @@ def to_modelcif(prot: Protein) -> str: asym_unit_map = {'A': asymA} def get_atoms(self): - for asym, seq_id, type_symbol, atom_id, x, y, z in atoms: - yield modelcif.model.Atom( - asym_unit=self.asym_unit_map[asym], type_symbol=type_symbol, - seq_id=seq_id, atom_id=atom_id, x=x, y=y, z=z, - het=seq_id is None) + + prev_chain_index = 0 + chain_tags = string.ascii_uppercase + # Add all atom sites. + for i in range(n): + res_name_3 = res_1to3(aatype[i]) + for atom_name, pos, mask, b_factor in zip( + atom_types, atom_positions[i], atom_mask[i], b_factors[i] + ): + if mask < 0.5: + continue + + name = atom_name if len(atom_name) == 4 else f" {atom_name}" + element = atom_name[0] # Protein supports only C, N, O, S, this works. + + chain_tag = "A" + if chain_index is not None: + chain_tag = chain_tags[chain_index[i]] + + # TODO check that residue indices are ok + # TODO how to set residue types? do they come from the entity sequence set above? + yield modelcif.model.Atom( + asym_unit=asymA, type_symbol=element, + seq_id=residue_index[i], atom_id=name, + x=pos[0], y=pos[1], z=pos[2], + het=False, biso=b_factor, occupancy=1.00) + + # TODO multiple chains + # should_terminate = (i == n - 1) + # if(chain_index is not None): + # if(i != n - 1 and chain_index[i + 1] != prev_chain_index): + # should_terminate = True + # prev_chain_index = chain_index[i + 1] + # + # if(should_terminate): + # # Close the chain. + # chain_end = "TER" + # chain_termination_line = ( + # f"{chain_end:<6}{atom_index:>5} " + # f"{res_1to3(aatype[i]):>3} " + # f"{chain_tag:>1}{residue_index[i]:>4}" + # ) + # pdb_lines.append(chain_termination_line) + # atom_index += 1 + # + # if(i != n - 1): + # # "prev" is a misnomer here. This happens at the beginning of + # # each new chain. + # pdb_lines.extend(get_pdb_headers(prot, prev_chain_index)) # Add the model and modeling protocol to the file and write them out: model = MyModel(assembly=modeled_assembly, name='Best scoring model') @@ -442,10 +488,10 @@ def to_modelcif(prot: Protein) -> str: # protocol.steps.append(modelcif.protocol.ModelingStep( # input_data=aln, output_data=model)) system.protocols.append(protocol) - - with open('/Users/jose/output3.cif', 'w') as fh: - modelcif.dumper.write(fh, [system]) - + fh = io.StringIO() + modelcif.dumper.write(fh, [system]) + modelcifstr = fh.getvalue() + return modelcifstr def ideal_atom_mask(prot: Protein) -> np.ndarray: @@ -499,3 +545,12 @@ def from_prediction( parents=parents, parents_chain_index=parents_chain_index, ) + + +if __name__ == "__main__": + with open('/home/jose/Downloads/171l.pdb', 'r') as file: + pdbstr = file.read() + + prot = from_pdb_string(pdbstr) + cifstr = to_modelcif(prot) + print(cifstr) \ No newline at end of file From 4f662f832c4dd437faa470bd493721c28ec92cb7 Mon Sep 17 00:00:00 2001 From: Jose Duarte Date: Mon, 20 Feb 2023 13:54:19 -0800 Subject: [PATCH 06/25] Now should work for multi-chain. But it is untested. Also fixed important bug: atom names were padded --- openfold/np/protein.py | 103 +++++++++++++++++++++-------------------- 1 file changed, 52 insertions(+), 51 deletions(-) diff --git a/openfold/np/protein.py b/openfold/np/protein.py index f15e135..28a669c 100644 --- a/openfold/np/protein.py +++ b/openfold/np/protein.py @@ -403,7 +403,6 @@ def to_modelcif(prot: Protein) -> str: """ restypes = residue_constants.restypes + ["X"] - res_1to3 = lambda r: residue_constants.restype_1to3.get(restypes[r], "UNK") atom_types = residue_constants.atom_types atom_mask = prot.atom_mask @@ -413,71 +412,67 @@ def to_modelcif(prot: Protein) -> str: b_factors = prot.b_factors chain_index = prot.chain_index + n = aatype.shape[0] + if chain_index is None: + chain_index = [0 for i in range(n)] + system = modelcif.System(title='OpenFold prediction') - # sequence into entity - n = aatype.shape[0] - seq = [restypes[aatype[i]] for i in range(n)] - model_e = modelcif.Entity("".join(seq), description='Model subunit') + # Finding chains and creating entities + seqs = {} + seq = [] + last_chain_idx = None + for i in range(n): + if last_chain_idx is not None and last_chain_idx != chain_index[i]: + seqs[last_chain_idx] = seq + seq = [] + seq.append(restypes[aatype[i]]) + last_chain_idx = chain_index[i] + # finally add the last chain + if last_chain_idx not in seqs: + seqs[last_chain_idx] = seq - # Define the model assembly - asymA = modelcif.AsymUnit(model_e, details='Model subunit A', id='A') - modeled_assembly = modelcif.Assembly((asymA,), name='Modeled assembly') + # now reduce sequences to unique ones (note this won't work if different asyms have different unmodelled regions) + unique_seqs = {} + for chain_idx in seqs.keys(): + seq = "".join(seqs[chain_idx]) + if seq in unique_seqs: + unique_seqs[seq].append(chain_idx) + else: + unique_seqs[seq] = [chain_idx] + + # adding 1 entity per unique sequence + entities_map = {} + for key, value in unique_seqs.items(): + model_e = modelcif.Entity("".join(key), description='Model subunit') + for chain_idx in value: + entities_map[chain_idx] = model_e + + chain_tags = string.ascii_uppercase + asym_unit_map = {} + for chain_idx in set(chain_index): + # Define the model assembly + chain_id = chain_tags[chain_idx] + asym = modelcif.AsymUnit(entities_map[chain_idx], details='Model subunit %s' % chain_id, id=chain_id) + asym_unit_map[chain_idx] = asym + modeled_assembly = modelcif.Assembly(asym_unit_map.values(), name='Modeled assembly') class MyModel(modelcif.model.HomologyModel): - asym_unit_map = {'A': asymA} - def get_atoms(self): - - prev_chain_index = 0 - chain_tags = string.ascii_uppercase # Add all atom sites. for i in range(n): - res_name_3 = res_1to3(aatype[i]) for atom_name, pos, mask, b_factor in zip( atom_types, atom_positions[i], atom_mask[i], b_factors[i] ): if mask < 0.5: continue - - name = atom_name if len(atom_name) == 4 else f" {atom_name}" element = atom_name[0] # Protein supports only C, N, O, S, this works. - - chain_tag = "A" - if chain_index is not None: - chain_tag = chain_tags[chain_index[i]] - - # TODO check that residue indices are ok - # TODO how to set residue types? do they come from the entity sequence set above? yield modelcif.model.Atom( - asym_unit=asymA, type_symbol=element, - seq_id=residue_index[i], atom_id=name, + asym_unit=asym_unit_map[chain_index[i]], type_symbol=element, + seq_id=residue_index[i], atom_id=atom_name, x=pos[0], y=pos[1], z=pos[2], het=False, biso=b_factor, occupancy=1.00) - # TODO multiple chains - # should_terminate = (i == n - 1) - # if(chain_index is not None): - # if(i != n - 1 and chain_index[i + 1] != prev_chain_index): - # should_terminate = True - # prev_chain_index = chain_index[i + 1] - # - # if(should_terminate): - # # Close the chain. - # chain_end = "TER" - # chain_termination_line = ( - # f"{chain_end:<6}{atom_index:>5} " - # f"{res_1to3(aatype[i]):>3} " - # f"{chain_tag:>1}{residue_index[i]:>4}" - # ) - # pdb_lines.append(chain_termination_line) - # atom_index += 1 - # - # if(i != n - 1): - # # "prev" is a misnomer here. This happens at the beginning of - # # each new chain. - # pdb_lines.extend(get_pdb_headers(prot, prev_chain_index)) - # Add the model and modeling protocol to the file and write them out: model = MyModel(assembly=modeled_assembly, name='Best scoring model') @@ -548,9 +543,15 @@ def from_prediction( if __name__ == "__main__": - with open('/home/jose/Downloads/171l.pdb', 'r') as file: + pdb_file = '/home/jose/Downloads/171l.pdb' + # pdb_file = '/home/jose/Downloads/2trx.pdb' + cif_file = '/home/jose/test.cif' + + with open(pdb_file, 'r') as file: pdbstr = file.read() - prot = from_pdb_string(pdbstr) + prot = from_pdb_string(pdbstr, "A") cifstr = to_modelcif(prot) - print(cifstr) \ No newline at end of file + print(cifstr) + with open(cif_file, 'w') as fw: + fw.write(cifstr) From 9db8dc369b0c247012f60591e7e088ab44268d3c Mon Sep 17 00:00:00 2001 From: Jose Duarte Date: Mon, 20 Feb 2023 14:08:05 -0800 Subject: [PATCH 07/25] Adding output_cif as CLI argument --- run_pretrained_openfold.py | 50 +++++++++++++++++++++++--------------- 1 file changed, 30 insertions(+), 20 deletions(-) diff --git a/run_pretrained_openfold.py b/run_pretrained_openfold.py index 7bd99be..66bd2d5 100644 --- a/run_pretrained_openfold.py +++ b/run_pretrained_openfold.py @@ -1,6 +1,6 @@ # Copyright 2021 AlQuraishi Laboratory # Copyright 2021 DeepMind Technologies Limited -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -35,7 +35,7 @@ torch_versions = torch.__version__.split(".") torch_major_version = int(torch_versions[0]) torch_minor_version = int(torch_versions[1]) if( - torch_major_version > 1 or + torch_major_version > 1 or (torch_major_version == 1 and torch_minor_version >= 12) ): # Gives a large speedup on Ampere-class GPUs @@ -70,7 +70,7 @@ def precompute_alignments(tags, seqs, alignment_dir, args): local_alignment_dir = os.path.join(alignment_dir, tag) if(args.use_precomputed_alignments is None and not os.path.isdir(local_alignment_dir)): logger.info(f"Generating alignments for {tag}...") - + os.makedirs(local_alignment_dir) alignment_runner = data_pipeline.AlignmentRunner( @@ -141,13 +141,13 @@ def main(args): os.makedirs(args.output_dir, exist_ok=True) config = model_config(args.config_preset, long_sequence_inference=args.long_sequence_inference) - + if(args.trace_model): if(not config.data.predict.fixed_size): raise ValueError( "Tracing requires that fixed_size mode be enabled in the config" ) - + template_featurizer = templates.TemplateHitFeaturizer( mmcif_dir=args.template_mmcif_dir, max_template_date=args.max_template_date, @@ -165,10 +165,10 @@ def main(args): random_seed = args.data_random_seed if random_seed is None: random_seed = random.randrange(2**32) - + np.random.seed(random_seed) torch.manual_seed(random_seed + 1) - + feature_processor = feature_pipeline.FeaturePipeline(config.data) if not os.path.exists(output_dir_base): os.makedirs(output_dir_base) @@ -183,7 +183,7 @@ def main(args): # Gather input sequences with open(os.path.join(args.fasta_dir, fasta_file), "r") as fp: data = fp.read() - + tags, seqs = parse_fasta(data) # assert len(tags) == len(set(tags)), "All FASTA tags must be unique" tag = '-'.join(tags) @@ -206,10 +206,10 @@ def main(args): output_name = f'{tag}_{args.config_preset}' if args.output_postfix is not None: output_name = f'{output_name}_{args.output_postfix}' - + # Does nothing if the alignments have already been computed precompute_alignments(tags, seqs, alignment_dir, args) - + feature_dict = feature_dicts.get(tag, None) if(feature_dict is None): feature_dict = generate_feature_dict( @@ -234,7 +234,7 @@ def main(args): ) processed_feature_dict = { - k:torch.as_tensor(v, device=args.model_device) + k:torch.as_tensor(v, device=args.model_device) for k,v in processed_feature_dict.items() } @@ -255,30 +255,36 @@ def main(args): # Toss out the recycling dimensions --- we don't need them anymore processed_feature_dict = tensor_tree_map( - lambda x: np.array(x[..., -1].cpu()), + lambda x: np.array(x[..., -1].cpu()), processed_feature_dict ) out = tensor_tree_map(lambda x: np.array(x.cpu()), out) unrelaxed_protein = prep_output( - out, - processed_feature_dict, - feature_dict, - feature_processor, + out, + processed_feature_dict, + feature_dict, + feature_processor, args.config_preset, args.multimer_ri_gap, args.subtract_plddt ) + unrelaxed_file_suffix = "_unrelaxed.pdb" + if args.cif_output: + unrelaxed_file_suffix = "_unrelaxed.cif" unrelaxed_output_path = os.path.join( - output_directory, f'{output_name}_unrelaxed.pdb' + output_directory, f'{output_name}{unrelaxed_file_suffix}' ) with open(unrelaxed_output_path, 'w') as fp: - fp.write(protein.to_pdb(unrelaxed_protein)) + if args.cif_output: + fp.write(protein.to_modelcif(unrelaxed_protein)) + else: + fp.write(protein.to_pdb(unrelaxed_protein)) logger.info(f"Output written to {unrelaxed_output_path}...") - + if not args.skip_relaxation: # Relax the prediction. logger.info(f"Running relaxation on {unrelaxed_output_path}...") @@ -373,12 +379,16 @@ if __name__ == "__main__": "--long_sequence_inference", action="store_true", default=False, help="""enable options to reduce memory usage at the cost of speed, helps longer sequences fit into GPU memory, see the README for details""" ) + parser.add_argument( + "--cif_output", action="store_true", default=False, + help="Output predicted models in ModelCIF format instead of PDB format (default)" + ) add_data_args(parser) args = parser.parse_args() if(args.jax_param_path is None and args.openfold_checkpoint_path is None): args.jax_param_path = os.path.join( - "openfold", "resources", "params", + "openfold", "resources", "params", "params_" + args.config_preset + ".npz" ) From 951d47cf345dc52ff4f300d89df1785559252846 Mon Sep 17 00:00:00 2001 From: Jose Duarte Date: Mon, 20 Feb 2023 14:52:14 -0800 Subject: [PATCH 08/25] Ab-initio is more appropriate --- openfold/np/protein.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openfold/np/protein.py b/openfold/np/protein.py index 28a669c..ff245ec 100644 --- a/openfold/np/protein.py +++ b/openfold/np/protein.py @@ -457,7 +457,7 @@ def to_modelcif(prot: Protein) -> str: asym_unit_map[chain_idx] = asym modeled_assembly = modelcif.Assembly(asym_unit_map.values(), name='Modeled assembly') - class MyModel(modelcif.model.HomologyModel): + class MyModel(modelcif.model.AbInitioModel): def get_atoms(self): # Add all atom sites. for i in range(n): From f43640b9804c9e7f38a448080756bc6a41fcb516 Mon Sep 17 00:00:00 2001 From: Jose Duarte Date: Mon, 20 Feb 2023 15:29:46 -0800 Subject: [PATCH 09/25] Adding ma_qa_metric_local --- openfold/np/protein.py | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/openfold/np/protein.py b/openfold/np/protein.py index ff245ec..cc1c8f8 100644 --- a/openfold/np/protein.py +++ b/openfold/np/protein.py @@ -29,6 +29,7 @@ import modelcif.dumper import modelcif.reference import modelcif.protocol import modelcif.alignment +import modelcif.qa_metric FeatureDict = Mapping[str, np.ndarray] @@ -457,7 +458,12 @@ def to_modelcif(prot: Protein) -> str: asym_unit_map[chain_idx] = asym modeled_assembly = modelcif.Assembly(asym_unit_map.values(), name='Modeled assembly') - class MyModel(modelcif.model.AbInitioModel): + class _LocalPLDDT(modelcif.qa_metric.Local, modelcif.qa_metric.PLDDT): + name = "pLDDT" + software = None + description = "Predicted lddt" + + class _MyModel(modelcif.model.AbInitioModel): def get_atoms(self): # Add all atom sites. for i in range(n): @@ -473,8 +479,24 @@ def to_modelcif(prot: Protein) -> str: x=pos[0], y=pos[1], z=pos[2], het=False, biso=b_factor, occupancy=1.00) + def add_scores(self): + # TODO global scores + # self.qa_metrics.extend((_GlobalPLDDT(np.mean(scores_json["plddt"])))) + done_residues = {} + # local scores + for i in range(n): + for mask, b_factor in zip(atom_mask[i], b_factors[i]): + if mask < 0.5: + continue + # add 1 per residue, not 1 per atom + if residue_index[i] not in done_residues: + self.qa_metrics.append( + _LocalPLDDT(asym_unit_map[chain_index[i]].residue(residue_index[i]), b_factor)) + done_residues[residue_index[i]] = True + # Add the model and modeling protocol to the file and write them out: - model = MyModel(assembly=modeled_assembly, name='Best scoring model') + model = _MyModel(assembly=modeled_assembly, name='Best scoring model') + model.add_scores() model_group = modelcif.model.ModelGroup([model], name='All models') system.model_groups.append(model_group) @@ -545,7 +567,7 @@ def from_prediction( if __name__ == "__main__": pdb_file = '/home/jose/Downloads/171l.pdb' # pdb_file = '/home/jose/Downloads/2trx.pdb' - cif_file = '/home/jose/test.cif' + cif_file = '/home/jose/test1.cif' with open(pdb_file, 'r') as file: pdbstr = file.read() From 00725db14e0728e65f403951b4ca049e10bd6505 Mon Sep 17 00:00:00 2001 From: Jose Duarte Date: Mon, 20 Feb 2023 15:41:48 -0800 Subject: [PATCH 10/25] Removing protocol, unclear what is it for --- openfold/np/protein.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/openfold/np/protein.py b/openfold/np/protein.py index cc1c8f8..aa28c34 100644 --- a/openfold/np/protein.py +++ b/openfold/np/protein.py @@ -501,10 +501,6 @@ def to_modelcif(prot: Protein) -> str: model_group = modelcif.model.ModelGroup([model], name='All models') system.model_groups.append(model_group) - protocol = modelcif.protocol.Protocol() - # protocol.steps.append(modelcif.protocol.ModelingStep( - # input_data=aln, output_data=model)) - system.protocols.append(protocol) fh = io.StringIO() modelcif.dumper.write(fh, [system]) modelcifstr = fh.getvalue() @@ -567,7 +563,7 @@ def from_prediction( if __name__ == "__main__": pdb_file = '/home/jose/Downloads/171l.pdb' # pdb_file = '/home/jose/Downloads/2trx.pdb' - cif_file = '/home/jose/test1.cif' + cif_file = '/home/jose/test2.cif' with open(pdb_file, 'r') as file: pdbstr = file.read() From 20bfe6bcb867273ac0fa0bae1a9667d5bf68df4d Mon Sep 17 00:00:00 2001 From: Jose Duarte Date: Mon, 20 Feb 2023 15:43:09 -0800 Subject: [PATCH 11/25] Redundant variable --- openfold/np/protein.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/openfold/np/protein.py b/openfold/np/protein.py index aa28c34..d8e7328 100644 --- a/openfold/np/protein.py +++ b/openfold/np/protein.py @@ -503,8 +503,7 @@ def to_modelcif(prot: Protein) -> str: fh = io.StringIO() modelcif.dumper.write(fh, [system]) - modelcifstr = fh.getvalue() - return modelcifstr + return fh.getvalue() def ideal_atom_mask(prot: Protein) -> np.ndarray: From 6bb6d7ba70d4ce6dbcb0290383999ddd46f9f658 Mon Sep 17 00:00:00 2001 From: Jose Duarte Date: Tue, 21 Feb 2023 22:40:26 -0800 Subject: [PATCH 12/25] Add global score and fixed some issues with multi-chain local scores. Now multichain tested and seems to work. --- openfold/np/protein.py | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/openfold/np/protein.py b/openfold/np/protein.py index d8e7328..35ff465 100644 --- a/openfold/np/protein.py +++ b/openfold/np/protein.py @@ -463,6 +463,11 @@ def to_modelcif(prot: Protein) -> str: software = None description = "Predicted lddt" + class _GlobalPLDDT(modelcif.qa_metric.Global, modelcif.qa_metric.PLDDT): + name = "pLDDT" + software = None + description = "Global pLDDT, mean of per-residue pLDDTs" + class _MyModel(modelcif.model.AbInitioModel): def get_atoms(self): # Add all atom sites. @@ -480,19 +485,27 @@ def to_modelcif(prot: Protein) -> str: het=False, biso=b_factor, occupancy=1.00) def add_scores(self): - # TODO global scores - # self.qa_metrics.extend((_GlobalPLDDT(np.mean(scores_json["plddt"])))) - done_residues = {} # local scores + plldt_per_residue = {} for i in range(n): for mask, b_factor in zip(atom_mask[i], b_factors[i]): if mask < 0.5: continue # add 1 per residue, not 1 per atom - if residue_index[i] not in done_residues: - self.qa_metrics.append( - _LocalPLDDT(asym_unit_map[chain_index[i]].residue(residue_index[i]), b_factor)) - done_residues[residue_index[i]] = True + if chain_index[i] not in plldt_per_residue: + # first time a chain index is seen: add the key and start the residue dict + plldt_per_residue[chain_index[i]] = {residue_index[i]: b_factor} + if residue_index[i] not in plldt_per_residue[chain_index[i]]: + plldt_per_residue[chain_index[i]][residue_index[i]] = b_factor + plddts = [] + for chain_idx in plldt_per_residue: + for residue_idx in plldt_per_residue[chain_idx]: + plddt = plldt_per_residue[chain_idx][residue_idx] + plddts.append(plddt) + self.qa_metrics.append( + _LocalPLDDT(asym_unit_map[chain_idx].residue(residue_idx), plddt)) + # global score + self.qa_metrics.append((_GlobalPLDDT(np.mean(plddts)))) # Add the model and modeling protocol to the file and write them out: model = _MyModel(assembly=modeled_assembly, name='Best scoring model') @@ -560,14 +573,15 @@ def from_prediction( if __name__ == "__main__": - pdb_file = '/home/jose/Downloads/171l.pdb' + # pdb_file = '/Users/jose/Downloads/171l.pdb' # pdb_file = '/home/jose/Downloads/2trx.pdb' - cif_file = '/home/jose/test2.cif' + pdb_file = '/Users/jose/Downloads/1bwd.pdb' + cif_file = '/Users/jose/test2.cif' with open(pdb_file, 'r') as file: pdbstr = file.read() - prot = from_pdb_string(pdbstr, "A") + prot = from_pdb_string(pdbstr) cifstr = to_modelcif(prot) print(cifstr) with open(cif_file, 'w') as fw: From 8805313ccc8a169dabc50caf599a94994bb69951 Mon Sep 17 00:00:00 2001 From: Jose Duarte Date: Sun, 26 Feb 2023 10:26:30 -0800 Subject: [PATCH 13/25] Various minor fixes --- openfold/np/protein.py | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/openfold/np/protein.py b/openfold/np/protein.py index 35ff465..4a333db 100644 --- a/openfold/np/protein.py +++ b/openfold/np/protein.py @@ -394,7 +394,9 @@ def to_pdb(prot: Protein) -> str: def to_modelcif(prot: Protein) -> str: """ - Converts a `Protein` instance to a ModelCIF string. + Converts a `Protein` instance to a ModelCIF string. Chains with identical modelled coordinates + will be treated as the same polymer entity. But note that if chains differ in modelled regions, + no attempt is made at identifying them as a single polymer entity. Args: prot: The protein to convert to PDB. @@ -430,13 +432,12 @@ def to_modelcif(prot: Protein) -> str: seq.append(restypes[aatype[i]]) last_chain_idx = chain_index[i] # finally add the last chain - if last_chain_idx not in seqs: - seqs[last_chain_idx] = seq + seqs[last_chain_idx] = seq # now reduce sequences to unique ones (note this won't work if different asyms have different unmodelled regions) unique_seqs = {} - for chain_idx in seqs.keys(): - seq = "".join(seqs[chain_idx]) + for chain_idx, seq_list in seqs.items(): + seq = "".join(seq_list) if seq in unique_seqs: unique_seqs[seq].append(chain_idx) else: @@ -445,7 +446,7 @@ def to_modelcif(prot: Protein) -> str: # adding 1 entity per unique sequence entities_map = {} for key, value in unique_seqs.items(): - model_e = modelcif.Entity("".join(key), description='Model subunit') + model_e = modelcif.Entity(key, description='Model subunit') for chain_idx in value: entities_map[chain_idx] = model_e @@ -486,21 +487,21 @@ def to_modelcif(prot: Protein) -> str: def add_scores(self): # local scores - plldt_per_residue = {} + plddt_per_residue = {} for i in range(n): for mask, b_factor in zip(atom_mask[i], b_factors[i]): if mask < 0.5: continue # add 1 per residue, not 1 per atom - if chain_index[i] not in plldt_per_residue: + if chain_index[i] not in plddt_per_residue: # first time a chain index is seen: add the key and start the residue dict - plldt_per_residue[chain_index[i]] = {residue_index[i]: b_factor} - if residue_index[i] not in plldt_per_residue[chain_index[i]]: - plldt_per_residue[chain_index[i]][residue_index[i]] = b_factor + plddt_per_residue[chain_index[i]] = {residue_index[i]: b_factor} + if residue_index[i] not in plddt_per_residue[chain_index[i]]: + plddt_per_residue[chain_index[i]][residue_index[i]] = b_factor plddts = [] - for chain_idx in plldt_per_residue: - for residue_idx in plldt_per_residue[chain_idx]: - plddt = plldt_per_residue[chain_idx][residue_idx] + for chain_idx in plddt_per_residue: + for residue_idx in plddt_per_residue[chain_idx]: + plddt = plddt_per_residue[chain_idx][residue_idx] plddts.append(plddt) self.qa_metrics.append( _LocalPLDDT(asym_unit_map[chain_idx].residue(residue_idx), plddt)) From 9c38e9587ea9ae1003fc5259c000f879b16e1da2 Mon Sep 17 00:00:00 2001 From: Vaclav Hanzl <64649594+vaclavhanzl@users.noreply.github.com> Date: Sun, 5 Mar 2023 17:23:45 +0100 Subject: [PATCH 14/25] Fix notebook: Colab now has python 3.8, fix imports, mitigate UTF-8 glitch * Python 3.8 is now what we get in Colab, use it in our Conda as well * Fix imports, make some unconditional - chain of depencencies makes them needed even for relax_prediction==False * Mitigate spurious error "A UTF-8 locale is required. Got ANSI_X3.4-1968" which makes all shell commands defunct - use shutil.make_archive() --- notebooks/OpenFold.ipynb | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/notebooks/OpenFold.ipynb b/notebooks/OpenFold.ipynb index b816e91..23abefa 100755 --- a/notebooks/OpenFold.ipynb +++ b/notebooks/OpenFold.ipynb @@ -121,10 +121,11 @@ " %env PATH=/opt/conda/bin:{PATH}\n", "\n", " # Install the required versions of all dependencies.\n", + " %shell conda install -y -q conda==4.13.0\n", " %shell conda install -y -q -c conda-forge -c bioconda \\\n", " kalign2=2.04 \\\n", " hhsuite=3.3.0 \\\n", - " python=3.7 \\\n", + " python=3.8 \\\n", " 2>&1 1>/dev/null\n", " %shell pip install -q \\\n", " ml-collections==0.1.0 \\\n", @@ -180,15 +181,14 @@ " %shell cp -f /content/stereo_chemical_props.txt /content/openfold/openfold/resources\n", " %shell /usr/bin/python3 -m pip install -q ./openfold\n", "\n", + " %shell conda install -y -q -c conda-forge openmm=7.5.1\n", + " # Apply OpenMM patch.\n", + " %shell pushd /opt/conda/lib/python3.8/site-packages/ && \\\n", + " patch -p0 < /content/openfold/lib/openmm.patch && \\\n", + " popd\n", + "\n", " if(relax_prediction):\n", - " %shell conda install -y -q -c conda-forge \\\n", - " openmm=7.5.1 \\\n", - " pdbfixer=1.7\n", - " \n", - " # Apply OpenMM patch.\n", - " %shell pushd /opt/conda/lib/python3.7/site-packages/ && \\\n", - " patch -p0 < /content/openfold/lib/openmm.patch && \\\n", - " popd\n", + " %shell conda install -y -q -c conda-forge pdbfixer=1.7\n", "\n", " if(weight_set == 'AlphaFold'):\n", " %shell mkdir --parents \"{ALPHAFOLD_PARAMS_DIR}\"\n", @@ -222,8 +222,8 @@ "import unittest.mock\n", "import sys\n", "\n", - "sys.path.insert(0, '/usr/local/lib/python3.7/site-packages/')\n", - "sys.path.append('/opt/conda/lib/python3.7/site-packages')\n", + "sys.path.insert(0, '/usr/local/lib/python3.8/site-packages/')\n", + "sys.path.append('/opt/conda/lib/python3.8/site-packages')\n", "\n", "# Allows us to skip installing these packages\n", "unnecessary_modules = [\n", @@ -247,6 +247,7 @@ "import numpy as np\n", "import py3Dmol\n", "import torch\n", + "import shutil\n", "\n", "# A filthy hack to avoid slow Linear layer initialization\n", "import openfold.model.primitives\n", @@ -269,7 +270,7 @@ "from openfold.np import protein\n", "if(relax_prediction):\n", " from openfold.np.relax import relax\n", - " from openfold.np.relax import utils\n", + "from openfold.np.relax.utils import overwrite_b_factors\n", "from openfold.utils.import_weights import import_jax_weights_\n", "from openfold.utils.tensor_utils import tensor_tree_map\n", "\n", @@ -590,7 +591,7 @@ " banded_b_factors.append(idx)\n", " break\n", "banded_b_factors = np.array(banded_b_factors)[:, None] * final_atom_mask\n", - "to_visualize_pdb = utils.overwrite_b_factors(best_pdb, banded_b_factors)\n", + "to_visualize_pdb = overwrite_b_factors(best_pdb, banded_b_factors)\n", "\n", "# --- Visualise the prediction & confidence ---\n", "show_sidechains = True\n", @@ -688,7 +689,7 @@ "\n", "\n", "# --- Download the predictions ---\n", - "!zip -q -r {output_dir}.zip {output_dir}\n", + "shutil.make_archive(base_name='prediction', format='zip', root_dir=output_dir)\n", "files.download(f'{output_dir}.zip')" ], "execution_count": null, From c84fd44340d2a92fd6a1358048f27b276dba73ab Mon Sep 17 00:00:00 2001 From: Vaclav Hanzl <64649594+vaclavhanzl@users.noreply.github.com> Date: Mon, 6 Mar 2023 11:10:57 +0100 Subject: [PATCH 15/25] Fix notebook: Better mitigation of the ANSI_X3.4-1968 problem in Colab Colab is easily broken by openmm and other software messing with locale. This mitigation patch lets the %shell and ! work again. It is much stronger solution than just avoiding shell when making zip. --- notebooks/OpenFold.ipynb | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/notebooks/OpenFold.ipynb b/notebooks/OpenFold.ipynb index 23abefa..1b5f630 100755 --- a/notebooks/OpenFold.ipynb +++ b/notebooks/OpenFold.ipynb @@ -249,6 +249,13 @@ "import torch\n", "import shutil\n", "\n", + "# Prevent shell magic being broken by openmm, prevent this cryptic error:\n", + "# \"NotImplementedError: A UTF-8 locale is required. Got ANSI_X3.4-1968\"\n", + "import locale\n", + "def getpreferredencoding(do_setlocale = True):\n", + " return \"UTF-8\"\n", + "locale.getpreferredencoding = getpreferredencoding\n", + "\n", "# A filthy hack to avoid slow Linear layer initialization\n", "import openfold.model.primitives\n", "\n", From 007ab6f35b1cdf5a8cb83fe4a9a01eefb9e1c8b5 Mon Sep 17 00:00:00 2001 From: Vaclav Hanzl <64649594+vaclavhanzl@users.noreply.github.com> Date: Mon, 6 Mar 2023 13:53:26 +0100 Subject: [PATCH 16/25] Fix notebook: Also save the protein to zip when relaxation is off Previously protein got computed and shown in both cases but only saved to the downloaded zip when relaxation was done. --- notebooks/OpenFold.ipynb | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/notebooks/OpenFold.ipynb b/notebooks/OpenFold.ipynb index 1b5f630..96b7df2 100755 --- a/notebooks/OpenFold.ipynb +++ b/notebooks/OpenFold.ipynb @@ -579,14 +579,13 @@ " relaxed_pdb, _, _ = amber_relaxer.process(\n", " prot=unrelaxed_proteins[best_model_name]\n", " )\n", - "\n", - " # Write out the prediction\n", - " pred_output_path = os.path.join(output_dir, 'selected_prediction.pdb')\n", - " with open(pred_output_path, 'w') as f:\n", - " f.write(relaxed_pdb)\n", - "\n", " best_pdb = relaxed_pdb\n", "\n", + " # Write out the prediction\n", + " pred_output_path = os.path.join(output_dir, 'selected_prediction.pdb')\n", + " with open(pred_output_path, 'w') as f:\n", + " f.write(best_pdb)\n", + "\n", " pbar.update(n=1) # Finished AMBER relax.\n", "\n", "# Construct multiclass b-factors to indicate confidence bands\n", From 1fafb86898585e90c830351983d63ad46ef370b3 Mon Sep 17 00:00:00 2001 From: Vaclav Hanzl <64649594+vaclavhanzl@users.noreply.github.com> Date: Mon, 6 Mar 2023 17:54:11 +0100 Subject: [PATCH 17/25] Improve notebook: Make install/import independent of relax_prediction This way user can repeat just the first and last cell to get both versions. --- notebooks/OpenFold.ipynb | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/notebooks/OpenFold.ipynb b/notebooks/OpenFold.ipynb index 96b7df2..8f00178 100755 --- a/notebooks/OpenFold.ipynb +++ b/notebooks/OpenFold.ipynb @@ -186,9 +186,7 @@ " %shell pushd /opt/conda/lib/python3.8/site-packages/ && \\\n", " patch -p0 < /content/openfold/lib/openmm.patch && \\\n", " popd\n", - "\n", - " if(relax_prediction):\n", - " %shell conda install -y -q -c conda-forge pdbfixer=1.7\n", + " %shell conda install -y -q -c conda-forge pdbfixer=1.7\n", "\n", " if(weight_set == 'AlphaFold'):\n", " %shell mkdir --parents \"{ALPHAFOLD_PARAMS_DIR}\"\n", @@ -275,8 +273,7 @@ "from openfold.data.tools import jackhmmer\n", "from openfold.model import model\n", "from openfold.np import protein\n", - "if(relax_prediction):\n", - " from openfold.np.relax import relax\n", + "from openfold.np.relax import relax\n", "from openfold.np.relax.utils import overwrite_b_factors\n", "from openfold.utils.import_weights import import_jax_weights_\n", "from openfold.utils.tensor_utils import tensor_tree_map\n", From 8a76cc153e408fea72c8607535de8636609303cc Mon Sep 17 00:00:00 2001 From: Jose Duarte Date: Mon, 6 Mar 2023 15:48:23 -0800 Subject: [PATCH 18/25] WIP towards writing relaxed structure as modelcif --- openfold/np/relax/amber_minimize.py | 3 --- openfold/np/relax/relax.py | 4 +++- openfold/utils/script_utils.py | 12 ++++++++---- run_pretrained_openfold.py | 2 +- thread_sequence.py | 2 +- 5 files changed, 13 insertions(+), 10 deletions(-) diff --git a/openfold/np/relax/amber_minimize.py b/openfold/np/relax/amber_minimize.py index dfc0984..6448e15 100644 --- a/openfold/np/relax/amber_minimize.py +++ b/openfold/np/relax/amber_minimize.py @@ -524,9 +524,6 @@ def run_pipeline( _check_residues_are_well_defined(prot) pdb_string = clean_protein(prot, checks=checks) - # We keep the input around to restore metadata deleted by the relaxer - input_prot = prot - exclude_residues = exclude_residues or [] exclude_residues = set(exclude_residues) violations = np.inf diff --git a/openfold/np/relax/relax.py b/openfold/np/relax/relax.py index 155e379..a6fd804 100644 --- a/openfold/np/relax/relax.py +++ b/openfold/np/relax/relax.py @@ -57,7 +57,7 @@ class AmberRelaxation(object): self._use_gpu = use_gpu def process( - self, *, prot: protein.Protein + self, *, prot: protein.Protein, cif_output: bool ) -> Tuple[str, Dict[str, Any], np.ndarray]: """Runs Amber relax on a prediction, adds hydrogens, returns PDB string.""" out = amber_minimize.run_pipeline( @@ -78,6 +78,8 @@ class AmberRelaxation(object): "attempts": out["min_attempts"], "rmsd": rmsd, } + # TODO write all this as ModelCIF if param is true. Should be simply proteint.to_modelcif(prot) + # and then add the other pieces, except that clean_protein() does quite some additional stuff... pdb_str = amber_minimize.clean_protein(prot) min_pdb = utils.overwrite_pdb_coordinates(pdb_str, min_pos) min_pdb = utils.overwrite_b_factors(min_pdb, prot.b_factors) diff --git a/openfold/utils/script_utils.py b/openfold/utils/script_utils.py index 38de08b..eaa054e 100644 --- a/openfold/utils/script_utils.py +++ b/openfold/utils/script_utils.py @@ -228,7 +228,7 @@ def prep_output(out, batch, feature_dict, feature_processor, config_preset, mult return unrelaxed_protein -def relax_protein(config, model_device, unrelaxed_protein, output_directory, output_name): +def relax_protein(config, model_device, unrelaxed_protein, output_directory, output_name, cif_output): amber_relaxer = relax.AmberRelaxation( use_gpu=(model_device != "cpu"), **config.relax, @@ -239,7 +239,8 @@ def relax_protein(config, model_device, unrelaxed_protein, output_directory, out if "cuda" in model_device: device_no = model_device.split(":")[-1] os.environ["CUDA_VISIBLE_DEVICES"] = device_no - relaxed_pdb_str, _, _ = amber_relaxer.process(prot=unrelaxed_protein) + # the struct_str will contain either a PDB-format or a ModelCIF format string + struct_str, _, _ = amber_relaxer.process(prot=unrelaxed_protein, cif_output=cif_output) os.environ["CUDA_VISIBLE_DEVICES"] = visible_devices relaxation_time = time.perf_counter() - t @@ -247,10 +248,13 @@ def relax_protein(config, model_device, unrelaxed_protein, output_directory, out update_timings({"relaxation": relaxation_time}, os.path.join(output_directory, "timings.json")) # Save the relaxed PDB. + suffix = "_relaxed.pdb" + if cif_output: + suffix = "_relaxed.cif" relaxed_output_path = os.path.join( - output_directory, f'{output_name}_relaxed.pdb' + output_directory, f'{output_name}{suffix}' ) with open(relaxed_output_path, 'w') as fp: - fp.write(relaxed_pdb_str) + fp.write(struct_str) logger.info(f"Relaxed output written to {relaxed_output_path}...") \ No newline at end of file diff --git a/run_pretrained_openfold.py b/run_pretrained_openfold.py index 66bd2d5..2393ab3 100644 --- a/run_pretrained_openfold.py +++ b/run_pretrained_openfold.py @@ -288,7 +288,7 @@ def main(args): if not args.skip_relaxation: # Relax the prediction. logger.info(f"Running relaxation on {unrelaxed_output_path}...") - relax_protein(config, args.model_device, unrelaxed_protein, output_directory, output_name) + relax_protein(config, args.model_device, unrelaxed_protein, output_directory, output_name, args.cif_output) if args.save_outputs: output_dict_path = os.path.join( diff --git a/thread_sequence.py b/thread_sequence.py index e67fcdd..48b409f 100644 --- a/thread_sequence.py +++ b/thread_sequence.py @@ -106,7 +106,7 @@ def main(args): logger.info(f"Output written to {unrelaxed_output_path}...") logger.info(f"Running relaxation on {unrelaxed_output_path}...") - relax_protein(config, args.model_device, unrelaxed_protein, output_directory, output_name) + relax_protein(config, args.model_device, unrelaxed_protein, output_directory, output_name, False) if __name__ == "__main__": parser = argparse.ArgumentParser() From 8847fb6945e80f3e97f9479724e28ffeecca42a7 Mon Sep 17 00:00:00 2001 From: josemduarte Date: Mon, 6 Mar 2023 16:32:28 -0800 Subject: [PATCH 19/25] Now modelcif for relaxed should work --- openfold/np/relax/relax.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/openfold/np/relax/relax.py b/openfold/np/relax/relax.py index a6fd804..2711c76 100644 --- a/openfold/np/relax/relax.py +++ b/openfold/np/relax/relax.py @@ -78,8 +78,6 @@ class AmberRelaxation(object): "attempts": out["min_attempts"], "rmsd": rmsd, } - # TODO write all this as ModelCIF if param is true. Should be simply proteint.to_modelcif(prot) - # and then add the other pieces, except that clean_protein() does quite some additional stuff... pdb_str = amber_minimize.clean_protein(prot) min_pdb = utils.overwrite_pdb_coordinates(pdb_str, min_pos) min_pdb = utils.overwrite_b_factors(min_pdb, prot.b_factors) @@ -91,5 +89,11 @@ class AmberRelaxation(object): ] min_pdb = protein.add_pdb_headers(prot, min_pdb) + output_str = min_pdb + if cif_output: + # TODO the model cif will be missing some metadata like headers (PARENTs and + # REMARK with some details of the run, like num of recycles) + final_prot = protein.from_pdb_string(min_pdb) + output_str = protein.to_modelcif(final_prot) - return min_pdb, debug_data, violations + return output_str, debug_data, violations From 299f8948761e3f6688cad19004c359d9942e3419 Mon Sep 17 00:00:00 2001 From: josemduarte Date: Mon, 6 Mar 2023 16:39:53 -0800 Subject: [PATCH 20/25] Cleanup --- openfold/np/protein.py | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/openfold/np/protein.py b/openfold/np/protein.py index 4a333db..df8ed32 100644 --- a/openfold/np/protein.py +++ b/openfold/np/protein.py @@ -571,19 +571,3 @@ def from_prediction( parents=parents, parents_chain_index=parents_chain_index, ) - - -if __name__ == "__main__": - # pdb_file = '/Users/jose/Downloads/171l.pdb' - # pdb_file = '/home/jose/Downloads/2trx.pdb' - pdb_file = '/Users/jose/Downloads/1bwd.pdb' - cif_file = '/Users/jose/test2.cif' - - with open(pdb_file, 'r') as file: - pdbstr = file.read() - - prot = from_pdb_string(pdbstr) - cifstr = to_modelcif(prot) - print(cifstr) - with open(cif_file, 'w') as fw: - fw.write(cifstr) From 03f3a7f59cf0b62dea47df321f62255d2184143e Mon Sep 17 00:00:00 2001 From: josemduarte Date: Mon, 6 Mar 2023 16:41:14 -0800 Subject: [PATCH 21/25] Correcting docs --- openfold/np/protein.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/openfold/np/protein.py b/openfold/np/protein.py index df8ed32..72bb87a 100644 --- a/openfold/np/protein.py +++ b/openfold/np/protein.py @@ -82,8 +82,7 @@ def from_pdb_string(pdb_str: str, chain_id: Optional[str] = None) -> Protein: Args: pdb_str: The contents of the pdb file - chain_id: If None, then the pdb file must contain a single chain (which - will be parsed). If chain_id is specified (e.g. A), then only that chain + chain_id: If None, then the whole pdb file is parsed. If chain_id is specified (e.g. A), then only that chain is parsed. Returns: From bdbfef1de1daeced31d394224a8ed69a7c0929c0 Mon Sep 17 00:00:00 2001 From: Jonathan King Date: Tue, 14 Mar 2023 16:09:22 -0400 Subject: [PATCH 22/25] Fix check for max_seqlen. Previously, long sequences were not excluded from the script. This commit changes the comparison to exclude sequences with length greater than args.max_seqlen. --- scripts/download_cameo.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/scripts/download_cameo.py b/scripts/download_cameo.py index 815a34b..11e5cec 100644 --- a/scripts/download_cameo.py +++ b/scripts/download_cameo.py @@ -57,9 +57,8 @@ def main(args): seq = mmcif_object.chain_to_seqres[chain_id] - if(args.max_seqlen > 0): - if(len(seq) > len(seq)): - continue + if(args.max_seqlen > 0 and len(seq) > args.max_seqlen): + continue fasta_file = '\n'.join([ f">{pdb_id}_{chain_id}", From 6625e8dfffbade9d5f6c94ca33db0462554ccb2f Mon Sep 17 00:00:00 2001 From: Nikita Smetanin Date: Thu, 16 Mar 2023 21:45:09 +0000 Subject: [PATCH 23/25] Improve TriangularMultiplicativeUpdate stability in fp16 mode --- openfold/model/triangular_multiplicative_update.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/openfold/model/triangular_multiplicative_update.py b/openfold/model/triangular_multiplicative_update.py index 3885e4c..5e6f568 100644 --- a/openfold/model/triangular_multiplicative_update.py +++ b/openfold/model/triangular_multiplicative_update.py @@ -392,8 +392,13 @@ class TriangleMultiplicativeUpdate(nn.Module): b = mask b = b * self.sigmoid(self.linear_b_g(z)) b = b * self.linear_b_p(z) - - if(is_fp16_enabled()): + + # Prevents overflow of torch.matmul in combine projections in + # reduced-precision modes + a = a / a.std() + b = b / b.std() + + if(is_fp16_enabled()): with torch.cuda.amp.autocast(enabled=False): x = self._combine_projections(a.float(), b.float()) else: From ee5d2c35b0cce4826257adb937c4058b31e2ae88 Mon Sep 17 00:00:00 2001 From: Gustaf Ahdritz Date: Mon, 10 Apr 2023 01:32:33 -0400 Subject: [PATCH 24/25] Remove comment --- setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/setup.py b/setup.py index 2545e52..57e6f26 100644 --- a/setup.py +++ b/setup.py @@ -1,4 +1,3 @@ -# Originally from Openfold https://github.com/aqlaboratory/openfold, modified. # Copyright 2021 AlQuraishi Laboratory # Copyright 2021 DeepMind Technologies Limited # From c21e5e7ae15613b3b7097948d02d45d4236bedb8 Mon Sep 17 00:00:00 2001 From: Gustaf Ahdritz Date: Mon, 10 Apr 2023 01:38:28 -0400 Subject: [PATCH 25/25] Fix spacing issue --- openfold/model/triangular_multiplicative_update.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openfold/model/triangular_multiplicative_update.py b/openfold/model/triangular_multiplicative_update.py index 5e6f568..ff10cea 100644 --- a/openfold/model/triangular_multiplicative_update.py +++ b/openfold/model/triangular_multiplicative_update.py @@ -396,7 +396,7 @@ class TriangleMultiplicativeUpdate(nn.Module): # Prevents overflow of torch.matmul in combine projections in # reduced-precision modes a = a / a.std() - b = b / b.std() + b = b / b.std() if(is_fp16_enabled()): with torch.cuda.amp.autocast(enabled=False):