Files
libcifpp/include/cif++/model.hpp
Maarten L. Hekkelman 4b05eec45a First steps
2026-04-01 10:58:33 +02:00

1240 lines
42 KiB
C++

/*-
* SPDX-License-Identifier: BSD-2-Clause
*
* Copyright (c) 2020 NKI/AVL, Netherlands Cancer Institute
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include "cif++/atom_type.hpp"
#include "cif++/datablock.hpp"
#include "cif++/point.hpp"
#include "cif++/row.hpp"
#include <memory>
#if __cpp_lib_format
# include <format>
# include <utility>
#endif
/** @file model.hpp
*
* This file contains code to work with models of molecules.
*
* The classes available encapsulate the real world concepts of
* atoms, residues, monomers, polymers and everything is then
* bound together in a structure.
*
* This code is not finished yet, ideally it would be a high
* level interface to manipulate macro molecular structures
* and an attempt has been made to start work on this. But
* there's still a lot that needs to be implemented.
*
* However, the code that is here is still useful in
* manipulating the underlying mmCIF data model.
*
*/
namespace cif
{
class file;
}
namespace cif::mm
{
class atom;
class residue;
class monomer;
class polymer;
class structure;
// --------------------------------------------------------------------
/**
* @brief The class atom encapsulates the data in _atom_site and
* _atom_site_anisotrop
*
* The class atom is a kind of flyweight class. It can be copied
* with low overhead. All data is stored in the underlying mmCIF
* categories but some very often used items are cached in the
* impl.
*
* It is also possible to have symmetry copies of atoms. They
* share the same data in the cif::category but their location
* differs by using a symmetry operator.
*/
class atom
{
private:
/** @cond */
struct atom_impl : public std::enable_shared_from_this<atom_impl>
{
atom_impl(datablock &db, std::string_view id)
: m_db(db)
, m_cat(db["atom_site"])
, m_id(id)
{
auto r = row();
if (r)
std::tie(m_location.x, m_location.y, m_location.z) = r.get<float, float, float>("Cartn_x", "Cartn_y", "Cartn_z");
}
// constructor for a symmetry copy of an atom
atom_impl(const atom_impl &impl, const point &loc, const std::string &sym_op)
: atom_impl(impl)
{
m_location = loc;
m_symop = sym_op;
}
atom_impl(const atom_impl &i) = default;
[[nodiscard]] int compare(const atom_impl &b) const;
// bool getAnisoU(float anisou[6]) const;
[[nodiscard]] int get_charge() const;
void moveTo(const point &p);
// const compound *compound() const;
[[nodiscard]] const item_value &get_property(std::string_view name) const;
void set_property(const std::string_view name, item_value value);
row_handle row()
{
return m_cat[{ { .name = "id", .value = m_id } }];
}
[[nodiscard]] const_row_handle row() const
{
return m_cat[{ { .name = "id", .value = m_id } }];
}
row_handle row_aniso()
{
row_handle result{};
auto cat = m_db.get("atom_site_anisotrop");
if (cat)
result = cat->operator[]({ { .name = "id", .value = m_id } });
return result;
}
[[nodiscard]] const_row_handle row_aniso() const
{
row_handle result{};
auto cat = m_db.get("atom_site_anisotrop");
if (cat)
result = cat->operator[]({ { .name = "id", .value = m_id } });
return result;
}
datablock &m_db;
category &m_cat;
std::string m_id;
point m_location;
std::string m_symop = "1_555";
};
/** @endcond */
public:
/**
* @brief Construct a new, empty atom object
*/
atom() = default;
/**
* @brief Construct a new atom object using @a impl as impl
*
* @param impl The implementation objectt
*/
atom(std::shared_ptr<atom_impl> impl)
: m_impl(std::move(impl))
{
}
/**
* @brief Copy construct a new atom object
*/
atom(const atom &rhs) // NOLINT(modernize-use-equals-default)
: m_impl(rhs.m_impl)
{
}
/**
* @brief Move construct a new atom object
*/
atom(atom &&rhs)
{
std::swap(m_impl, rhs.m_impl);
}
/// \brief Copy assignement operator
atom &operator=(atom rhs)
{
std::swap(m_impl, rhs.m_impl);
return *this;
}
/**
* @brief Construct a new atom object based on a cif::row
*
* @param db The datablock where the _atom_site category resides
* @param row The row containing the data for this atom
*/
atom(datablock &db, const_row_handle row)
: atom(std::make_shared<atom_impl>(db, row["id"].get<std::string>()))
{
}
/**
* @brief A special constructor to create symmetry copies
*
* @param rhs The original atom to copy
* @param symmmetry_location The symmetry location
* @param symmetry_operation The symmetry operator used
*/
atom(const atom &rhs, const point &symmmetry_location, const std::string &symmetry_operation)
: atom(std::make_shared<atom_impl>(*rhs.m_impl, symmmetry_location, symmetry_operation))
{
}
/// \brief To quickly test if the atom has data
explicit operator bool() const { return m_impl.operator bool(); }
/// \brief Return the item named @a name in the _atom_site category for this atom
[[nodiscard]] const item_value &get_property_value(std::string_view name) const
{
if (not m_impl)
throw std::logic_error("Error trying to fetch a property from an uninitialized atom");
return m_impl->get_property(name);
}
/// \brief Return the item named @a name in the _atom_site category for this atom as string
[[nodiscard]] auto get_property(std::string_view name) const
{
return get_property_value(name).get<std::string>();
}
/// \brief Return the item named @a name in the _atom_site category for this atom as float
[[nodiscard]] auto get_property_float(std::string_view name) const
{
return get_property_value(name).get<float>();
}
/// \brief Return the item named @a name in the _atom_site category for this atom as string
[[nodiscard]] auto get_property_int(std::string_view name) const
{
return get_property_value(name).get<int>();
}
/// \brief Set value for the item named @a name in the _atom_site category to @a value
void set_property(const std::string_view name, item_value value)
{
if (not m_impl)
throw std::logic_error("Error trying to modify an uninitialized atom");
m_impl->set_property(name, std::move(value));
}
/// \brief Set value for the item named @a name in the _atom_site category to @a value
template <typename T>
void set_property(const std::string_view name, const T &value)
requires(std::is_arithmetic_v<T>)
{
set_property(name, std::to_string(value));
}
/** Return the ID of the _atom_site record.
*
* @note Although I've never seen anything other than integers,
* the standard says this should be a string and so we use that.
*/
[[nodiscard]] const std::string &id() const { return impl().m_id; }
/// \brief Return the type of the atom
[[nodiscard]] cif::atom_type get_type() const { return atom_type_traits(get_property("type_symbol")).type(); }
/// \brief Return the cached location of this atom
[[nodiscard]] point get_location() const { return impl().m_location; }
/// \brief Set the location of this atom, will set both the cached data as well as the data in the underlying _atom_site category
void set_location(point p)
{
if (not m_impl)
throw std::logic_error("Error trying to modify an uninitialized atom");
m_impl->moveTo(p);
}
/// \brief Translate the position of this atom by \a t
void translate(point t)
{
set_location(get_location() + t);
}
/// \brief Rotate the position of this atom by \a q
void rotate(quaternion q)
{
set_location(q * get_location());
}
/// \brief rotate the coordinates of this atom by \a q around point \a p
void rotate(quaternion q, point p)
{
auto loc = get_location();
loc -= p;
loc = q * loc;
loc += p;
set_location(loc);
}
/// \brief Translate and rotate the position of this atom by \a t and \a q
void translate_and_rotate(point t, quaternion q)
{
auto loc = get_location();
loc += t;
loc = q * loc;
set_location(loc);
}
/// \brief Translate, rotate and translate again the coordinates this atom by \a t1 , \a q and \a t2
void translate_rotate_and_translate(point t1, quaternion q, point t2)
{
auto loc = get_location();
loc += t1;
loc = q * loc;
loc += t2;
set_location(loc);
}
/// for direct access to underlying data, be careful!
[[nodiscard]] const_row_handle get_row() const { return impl().row(); }
/// for direct access to underlying data, be careful!
[[nodiscard]] const_row_handle get_row_aniso() const { return impl().row_aniso(); }
/// Return if the atom is actually a symmetry copy or the original one
[[nodiscard]] bool is_symmetry_copy() const { return impl().m_symop != "1_555"; }
/// Return the symmetry operator used
[[nodiscard]] std::string symmetry() const { return impl().m_symop; }
/// Return true if this atom is part of a water molecule
[[nodiscard]] bool is_water() const
{
auto comp_id = get_label_comp_id();
return comp_id == "HOH" or comp_id == "H2O" or comp_id == "WAT";
}
/// Return the charge
[[nodiscard]] int get_charge() const { return impl().get_charge(); }
/// Return the occupancy
[[nodiscard]] float get_occupancy() const { return get_property_float("occupancy"); }
// specifications
[[nodiscard]] std::string get_label_asym_id() const { return get_property("label_asym_id"); } ///< Return the label_asym_id property
[[nodiscard]] int get_label_seq_id() const { return get_property_int("label_seq_id"); } ///< Return the label_seq_id property
[[nodiscard]] std::string get_label_atom_id() const { return get_property("label_atom_id"); } ///< Return the label_atom_id property
[[nodiscard]] std::string get_label_alt_id() const { return get_property("label_alt_id"); } ///< Return the label_alt_id property
[[nodiscard]] std::string get_label_comp_id() const { return get_property("label_comp_id"); } ///< Return the label_comp_id property
[[nodiscard]] std::string get_label_entity_id() const { return get_property("label_entity_id"); } ///< Return the label_entity_id property
[[nodiscard]] std::string get_auth_asym_id() const { return get_property("auth_asym_id"); } ///< Return the auth_asym_id property
[[nodiscard]] std::string get_auth_seq_id() const { return get_property("auth_seq_id"); } ///< Return the auth_seq_id property
[[nodiscard]] std::string get_auth_atom_id() const { return get_property("auth_atom_id"); } ///< Return the auth_atom_id property
[[nodiscard]] std::string get_auth_alt_id() const { return get_property("pdbx_auth_alt_id"); } ///< Return the auth_alt_id property
[[nodiscard]] std::string get_auth_comp_id() const { return get_property("auth_comp_id"); } ///< Return the auth_comp_id property
[[nodiscard]] std::string get_pdb_ins_code() const { return get_property("pdbx_PDB_ins_code"); } ///< Return the pdb_ins_code property
/// Return true if this atom is an alternate
[[nodiscard]] bool is_alternate() const
{
if (auto alt_id = get_property_value("label_alt_id"); alt_id.empty())
return false;
return true;
}
/// Convenience method to return a string that might be ID in PDB space
[[nodiscard]] std::string pdb_id() const
{
return get_label_comp_id() + '_' + get_auth_asym_id() + '_' + get_auth_seq_id() + get_pdb_ins_code();
}
/// Compare two atoms
bool operator==(const atom &rhs) const
{
if (m_impl == rhs.m_impl)
return true;
if (not(m_impl and rhs.m_impl))
return false;
return &m_impl->m_db == &rhs.m_impl->m_db and m_impl->m_id == rhs.m_impl->m_id;
}
/// Compare two atoms
bool operator!=(const atom &rhs) const
{
return not operator==(rhs);
}
/// Is this atom a backbone atom
[[nodiscard]] bool is_back_bone() const
{
auto atomID = get_label_atom_id();
return atomID == "N" or atomID == "O" or atomID == "C" or atomID == "CA";
}
/// swap
void swap(atom &b)
{
std::swap(m_impl, b.m_impl);
}
/// Compare this atom with @a b
[[nodiscard]] int compare(const atom &b) const { return impl().compare(*b.m_impl); }
/// Should this atom sort before @a rhs
bool operator<(const atom &rhs) const
{
return compare(rhs) < 0;
}
/// Write the atom to std::ostream @a os
friend std::ostream &operator<<(std::ostream &os, const atom &atom);
private:
friend class structure;
[[nodiscard]] const atom_impl &impl() const
{
if (not m_impl)
throw std::runtime_error("Uninitialized atom, not found?");
return *m_impl;
}
std::shared_ptr<atom_impl> m_impl;
};
/** swap */
inline void swap(atom &a, atom &b)
{
a.swap(b);
}
/** Calculate the distance between atoms @a and @a b in ångström */
inline float distance(const atom &a, const atom &b)
{
return distance(a.get_location(), b.get_location());
}
// --------------------------------------------------------------------
/**
* @brief The entity types that can be found in a mmCIF file
*
*/
enum class EntityType
{
Polymer, ///< entity is a polymer
NonPolymer, ///< entity is not a polymer
Macrolide, ///< entity is a macrolide
Water, ///< water in the solvent model
Branched ///< entity is branched
};
// --------------------------------------------------------------------
/**
* @brief The class residue is a collection of atoms forming a molecule
*
* This class is used to store ligand e.g. Derived classes are monomer
* and sugar.
*/
class residue
{
public:
friend class structure;
/**
* @brief Construct a new residue object based on key items
*/
residue(structure &structure, std::string compoundID, std::string asymID, int seqID,
std::string authAsymID, std::string authSeqID, std::string pdbInsCode)
: m_structure(&structure)
, m_compound_id(std::move(compoundID))
, m_asym_id(std::move(asymID))
, m_seq_id(seqID)
, m_pdb_strand_id(std::move(authAsymID))
, m_pdb_seq_num(std::move(authSeqID))
, m_pdb_ins_code(std::move(pdbInsCode))
{
}
/** Construct a new residue in structure with the atoms in @a atoms */
residue(structure &structure, const std::vector<atom> &atoms);
/** @cond */
residue(const residue &rhs) = default;
residue(residue &&rhs)
{
swap(*this, rhs);
}
residue &operator=(residue rhs)
{
swap(*this, rhs);
return *this;
}
friend void swap(residue &a, residue &b) noexcept
{
if (&a != &b)
{
std::swap(a.m_structure, b.m_structure);
std::swap(a.m_asym_id, b.m_asym_id);
std::swap(a.m_seq_id, b.m_seq_id);
std::swap(a.m_pdb_ins_code, b.m_pdb_ins_code);
std::swap(a.m_atoms, b.m_atoms);
}
}
virtual ~residue() = default;
/** @endcond */
/** Return the entity_id of this residue */
[[nodiscard]] std::string get_entity_id() const;
/** Return the entity type of this residue */
[[nodiscard]] EntityType entity_type() const;
[[nodiscard]] const std::string &get_asym_id() const { return m_asym_id; } ///< Return the asym_id
[[nodiscard]] int get_seq_id() const { return m_seq_id; } ///< Return the seq_id
[[nodiscard]] const std::string get_pdb_strand_id() const { return m_pdb_strand_id; } ///< Return the pdb_strand_id
[[nodiscard]] const std::string get_pdb_seq_num() const { return m_pdb_seq_num; } ///< Return the pdb_seq_num
[[nodiscard]] std::string get_pdb_ins_code() const { return m_pdb_ins_code; } ///< Return the pdb_ins_code
[[nodiscard]] const std::string &get_compound_id() const { return m_compound_id; } ///< Return the compound_id
void set_compound_id(const std::string &id) { m_compound_id = id; } ///< Set the compound_id to @a id
/** Return the structure this residue belongs to */
[[nodiscard]] structure *get_structure() const { return m_structure; }
/** Return a list of the atoms for this residue */
std::vector<atom> &atoms()
{
return m_atoms;
}
/** Return a const list of the atoms for this residue */
[[nodiscard]] const std::vector<atom> &atoms() const
{
return m_atoms;
}
/** Add atom @a atom to the atoms in this residue */
void add_atom(atom &atom);
/// \brief Unique atoms returns only the atoms without alternates and the first of each alternate atom id.
[[nodiscard]] std::vector<atom> unique_atoms() const;
/// \brief Return the atom with atom_id @a atomID
[[nodiscard]] atom get_atom_by_atom_id(const std::string &atomID) const;
/// \brief Return the atom with atom_id @a atomID and alternate_id @a altID
[[nodiscard]] atom get_atom_by_atom_id(const std::string &atomID, const std::string &altID) const;
/// \brief Return the list of atoms having ID \a atomID
///
/// This includes all alternate atoms with this ID
/// whereas get_atom_by_atom_id only returns the first unique atom
[[nodiscard]] std::vector<atom> get_atoms_by_id(const std::string &atomID) const;
/// \brief Is this residue a single entity?
[[nodiscard]] bool is_entity() const;
/// \brief Is this residue a water molecule?
[[nodiscard]] bool is_water() const { return m_compound_id == "HOH"; }
/// \brief Return true if this residue has alternate atoms
[[nodiscard]] bool has_alternate_atoms() const;
/// \brief Return true if this residue has alternate atoms for the atom \a atomID
[[nodiscard]] bool has_alternate_atoms_for(const std::string &atomID) const;
/// \brief Return the list of unique alt ID's present in this residue
[[nodiscard]] std::set<std::string> get_alternate_ids() const;
/// \brief Return the list of unique atom ID's
[[nodiscard]] std::set<std::string> get_atom_ids() const;
/// \brief Return a tuple containing the center location and the radius for the atoms of this residue
[[nodiscard]] std::tuple<point, float> center_and_radius() const;
/// \brief Write the residue @a res to the std::ostream @a os
friend std::ostream &operator<<(std::ostream &os, const residue &res);
/// \brief Return true if this residue is equal to @a rhs
bool operator==(const residue &rhs) const
{
return this == &rhs or (m_structure == rhs.m_structure and
m_seq_id == rhs.m_seq_id and
m_asym_id == rhs.m_asym_id and
m_compound_id == rhs.m_compound_id and
m_pdb_seq_num == rhs.m_pdb_seq_num);
}
/// @brief Create a new atom and add it to the list
/// @return newly created atom
virtual atom create_new_atom(atom_type inType, const std::string &inAtomID, point inLocation);
protected:
/** @cond */
residue() = default;
structure *m_structure = nullptr;
std::string m_compound_id, m_asym_id;
int m_seq_id = 0;
std::string m_pdb_strand_id, m_pdb_seq_num, m_pdb_ins_code;
std::vector<atom> m_atoms;
/** @endcond */
};
// --------------------------------------------------------------------
/**
* @brief a monomer models a single residue in a protein chain
*
*/
class monomer : public residue
{
public:
/// \brief constructor with actual values
monomer(const polymer &polymer, std::size_t index, int seqID, const std::string &authSeqID,
const std::string &pdbInsCode, const std::string &compoundID);
/// \brief Copy constructor
monomer(const monomer &rhs) = default;
/// \brief Move constructor
monomer(monomer &&rhs)
{
swap(*this, rhs);
}
/// Assignment for both move and copy (modern move semantics)
monomer &operator=(monomer rhs)
{
swap(*this, rhs);
return *this;
}
/// swap two monomers
friend void swap(monomer &a, monomer &b) noexcept
{
assert(a.m_polymer == b.m_polymer);
std::swap(a.m_index, b.m_index);
swap(static_cast<residue &>(a), static_cast<residue &>(b));
}
[[nodiscard]] bool is_first_in_chain() const; ///< Return if this residue is the first residue in the chain
[[nodiscard]] bool is_last_in_chain() const; ///< Return if this residue is the last residue in the chain
[[nodiscard]] const monomer &prev() const; ///< Return previous monomer in polymer
[[nodiscard]] const monomer &next() const; ///< Return next monomer in polymer
// convenience
[[nodiscard]] bool has_alpha() const; ///< Return if a alpha value can be calculated (depends on location in chain)
[[nodiscard]] bool has_kappa() const; ///< Return if a kappa value can be calculated (depends on location in chain)
// Assuming this is really an amino acid...
[[nodiscard]] float phi() const; ///< Return the phi value for this residue
[[nodiscard]] float psi() const; ///< Return the psi value for this residue
[[nodiscard]] float alpha() const; ///< Return the alpha value for this residue
[[nodiscard]] float kappa() const; ///< Return the kappa value for this residue
[[nodiscard]] float tco() const; ///< Return the tco value for this residue
[[nodiscard]] float omega() const; ///< Return the omega value for this residue
// torsion angles
[[nodiscard]] std::size_t nr_of_chis() const; ///< Return how many torsion angles can be calculated
[[nodiscard]] float chi(std::size_t i) const; ///< Return torsion angle @a i
[[nodiscard]] bool is_cis() const; ///< Return true if this residue is in a cis conformation
/// \brief Returns true if the four atoms C, CA, N and O are present
[[nodiscard]] bool is_complete() const;
/// \brief Returns true if any of the backbone atoms has an alternate
[[nodiscard]] bool has_alternate_backbone_atoms() const;
[[nodiscard]] atom CAlpha() const { return get_atom_by_atom_id("CA"); } ///< Return the CAlpha atom
[[nodiscard]] atom C() const { return get_atom_by_atom_id("C"); } ///< Return the C atom
[[nodiscard]] atom N() const { return get_atom_by_atom_id("N"); } ///< Return the N atom
[[nodiscard]] atom O() const { return get_atom_by_atom_id("O"); } ///< Return the O atom
[[nodiscard]] atom H() const { return get_atom_by_atom_id("H"); } ///< Return the H atom
/// \brief Return true if this monomer is bonded to monomer @a rhs
[[nodiscard]] bool is_bonded_to(const monomer &rhs) const
{
return this != &rhs and are_bonded(*this, rhs);
}
/**
* @brief Return true if the distance between the CA atoms of the
* two monomers @a a and @a b are within the expected range with
* an error margin of @a errorMargin.
*
* The expected distance is 3.0 ångström for a cis conformation
* and 3.8 ångström for trans.
*/
static bool are_bonded(const monomer &a, const monomer &b, float errorMargin = 0.5f);
/// \brief Return true if the bond between @a a and @a b is cis
static bool is_cis(const monomer &a, const monomer &b);
/// \brief Return the omega angle between @a a and @a b
static float omega(const monomer &a, const monomer &b);
/// \brief Return the chiral volume, only for LEU and VAL
[[nodiscard]] float chiral_volume() const;
/// \brief Compare this monomer with \a rhs
bool operator==(const monomer &rhs) const
{
return m_polymer == rhs.m_polymer and m_index == rhs.m_index;
}
atom create_new_atom(atom_type inType, const std::string &inAtomID, point inLocation) override;
private:
const polymer *m_polymer;
std::size_t m_index{};
};
// --------------------------------------------------------------------
/**
* @brief A polymer is simply a list of monomers
*
*/
class polymer : public std::vector<monomer>
{
public:
/// \brief Constructor
polymer(structure &s, std::string entityID, std::string asymID, std::string auth_asym_id);
polymer(const polymer &) = delete;
polymer &operator=(const polymer &) = delete;
[[nodiscard]] structure *get_structure() const { return m_structure; } ///< Return the structure
[[nodiscard]] std::string get_asym_id() const { return m_asym_id; } ///< Return the asym_id
[[nodiscard]] std::string get_pdb_strand_id() const { return m_pdb_strand_id; } ///< Return the PDB chain ID, actually
[[nodiscard]] std::string get_entity_id() const { return m_entity_id; } ///< Return the entity_id
private:
structure *m_structure;
std::string m_entity_id;
std::string m_asym_id;
std::string m_pdb_strand_id;
};
// --------------------------------------------------------------------
// sugar and branch, to describe glycosylation sites
class branch;
/**
* @brief A sugar is a residue that is part of a glycosylation site
*
*/
class sugar : public residue
{
public:
/// \brief constructor
sugar(branch &branch, const std::string &compoundID,
const std::string &asymID, int authSeqID);
/** @cond */
sugar(const sugar &rhs) = default;
sugar(sugar &&rhs)
{
swap(*this, rhs);
}
sugar &operator=(sugar rhs)
{
swap(*this, rhs);
return *this;
}
friend void swap(sugar &a, sugar &b) noexcept
{
assert(a.m_branch == b.m_branch);
std::swap(a.m_link, b.m_link);
swap(static_cast<residue &>(a), static_cast<residue &>(b));
}
/** @endcond */
/**
* @brief Return the sugar number in the glycosylation tree
*
* To store the sugar number, the auth_seq_id item has been overloaded
* in the specification. But since a sugar number should be, ehm, a number
* and auth_seq_id is specified to contain a string, we do a check here
* to see if it really is a number.
*
* @return The sugar number
*/
[[nodiscard]] int num() const
{
int result;
auto r = std::from_chars(m_pdb_seq_num.data(), m_pdb_seq_num.data() + m_pdb_seq_num.length(), result);
if (r.ec != std::errc{})
throw std::runtime_error("The auth_seq_id should be a number for a sugar");
return result;
}
/// \brief Return the name of this sugar
[[nodiscard]] std::string name() const;
/// \brief Return the atom the C1 is linked to
[[nodiscard]] atom get_link() const { return m_link; }
/// \brief Set the link atom C1 is linked to to @a link
void set_link(atom link) { m_link = link; }
/// \brief Return the sugar number of the sugar linked to C1
[[nodiscard]] std::size_t get_link_nr() const
{
std::size_t result = 0;
if (m_link)
result = m_link.get_property_int("auth_seq_id");
return result;
}
/// \brief Construct an atom based on the info in @a atom_info and add it to this sugar
atom add_atom(row_initializer atom_info);
private:
branch *m_branch;
atom m_link;
};
/**
* @brief A branch is a list of sugars
*
* A list is how it is stored, but a branch is like a branch in a tree,
* with potentially lots of sub branches. Each sugar is linked to a sugar
* up in the branch with its (almost always) C1 atom.
*
*/
class branch : public std::vector<sugar>
{
public:
/// \brief constructor
branch(structure &structure, std::string asym_id, std::string entity_id);
branch(const branch &) = delete;
branch &operator=(const branch &) = delete;
/** @cond */
branch(branch &&) = default;
branch &operator=(branch &&) = default;
/** @endcond */
/// \brief Update the link atoms in all sugars in this branch
void link_atoms();
/// \brief Return the name of the branch
[[nodiscard]] std::string name() const;
/// \brief Return the weight of the branch based on the formulae of the sugars
[[nodiscard]] float weight() const;
[[nodiscard]] std::string get_asym_id() const { return m_asym_id; } ///< Return the asym_id
[[nodiscard]] std::string get_entity_id() const { return m_entity_id; } ///< Return the entity_id
[[nodiscard]] structure &get_structure() { return *m_structure; } ///< Return the structure
[[nodiscard]] structure &get_structure() const { return *m_structure; } ///< Return the structure
/// \brief Return a reference to the sugar with number @a num
[[nodiscard]] sugar &get_sugar_by_num(int nr);
/// \brief Return a const reference to the sugar with number @a num
[[nodiscard]] const sugar &get_sugar_by_num(int nr) const
{
return const_cast<branch *>(this)->get_sugar_by_num(nr);
}
/// \brief Construct a new sugar with compound ID @a compound_id in this branch
/// and return a reference to the newly created sugar. Use this to create a first
/// sugar in a branch.
sugar &construct_sugar(const std::string &compound_id);
/// \brief Construct a new sugar with compound ID @a compound_id in this branch
/// and return a reference to the newly created sugar. The newly created sugar
/// will be connected to an already created sugar in the branch using the
/// information in @a atom_id, @a linked_sugar_nr and @a linked_atom_id
sugar &construct_sugar(const std::string &compound_id, const std::string &atom_id,
int linked_sugar_nr, const std::string &linked_atom_id);
private:
friend sugar;
[[nodiscard]] std::string name(const sugar &s) const;
structure *m_structure;
std::string m_asym_id, m_entity_id;
};
/** @brief Enumeration for controlling atom selection based on occupancy. */
enum class occupancy_policy
{
/** @brief Include all atoms regardless of their occupancy factor. */
ALL = 0,
/** @brief Select only alternate atoms with the maximum occupancy factor.
* If multiple atoms have the same maximum occupancy, choose the one with the minimum B-factor.
* If multiple atoms share both the maximum occupancy and the minimum B-factor, select the first encountered atom.
*/
MAX = 1,
/** @brief Select only alternate atoms with the minimum occupancy factor.
* Similar to MAX, if multiple atoms have the same minimum occupancy, choose the one with the minimum B-factor.
* If multiple atoms share both the minimum occupancy and the minimum B-factor, select the first encountered atom.
*/
MIN = 2,
/** @brief Exclude all atoms with an occupancy factor greater than zero. */
UNOCCUPIED = 3
};
/// When creating a structure, you can specify what to include using this options class.
struct structure_open_options
{
bool skip_hydrogen = false; ///< Do not include hydrogen atoms in the structure object
bool skip_hetatom = false; ///< Do not include HET atoms in the structure object
bool skip_water = false; ///< Do not include water atoms in the structure object
occupancy_policy occupancy_mode = occupancy_policy::ALL; ///< By default, the occupancy policy is set to occupancy_policy::ALL
std::vector<std::string> asyms; ///< The asyms to load, if empty load all
std::optional<float> min_b_factor; ///< Only load atoms with at least this b_factor
std::optional<float> max_b_factor; ///< Only load atoms with at most this b_factor
};
// --------------------------------------------------------------------
/**
* @brief A structure is the combination of polymers, ligand and sugar branches found
* in the mmCIF file. This will always contain one model, the first model is taken
* if not otherwise specified.
*
*/
class structure
{
public:
/// \brief Read the structure from cif::file @a p
structure(file &p, std::size_t modelNr = 1, structure_open_options options = {});
/// \brief Load the structure from already parsed mmCIF data in @a db
structure(datablock &db, std::size_t modelNr = 1, structure_open_options options = {});
/** @cond */
structure(structure &&s) = default;
/** @endcond */
// structures cannot be copied.
structure(const structure &) = delete;
structure &operator=(const structure &) = delete;
~structure() = default;
/// \brief Return the model number
[[nodiscard]] std::size_t get_model_nr() const { return m_model_nr; }
/// \brief Return a list of all the atoms in this structure
[[nodiscard]] const std::vector<atom> &atoms() const { return m_atoms; }
[[nodiscard]] EntityType get_entity_type_for_entity_id(const std::string entityID) const; ///< Return the entity type for the entity with id @a entity_id
[[nodiscard]] EntityType get_entity_type_for_asym_id(const std::string asymID) const; ///< Return the entity type for the asym with id @a asym_id
[[nodiscard]] const std::list<polymer> &polymers() const { return m_polymers; } ///< Return the list of polymers
[[nodiscard]] std::list<polymer> &polymers() { return m_polymers; } ///< Return the list of polymers
[[nodiscard]] polymer &get_polymer_by_asym_id(const std::string &asymID); ///< Return the polymer having asym ID @a asymID
[[nodiscard]] const polymer &get_polymer_by_asym_id(const std::string &asymID) const ///< Return the polymer having asym ID @a asymID
{
return const_cast<structure *>(this)->get_polymer_by_asym_id(asymID);
}
[[nodiscard]] const std::list<branch> &branches() const { return m_branches; } ///< Return the list of all branches
[[nodiscard]] std::list<branch> &branches() { return m_branches; } ///< Return the list of all branches
[[nodiscard]] branch &get_branch_by_asym_id(const std::string &asymID); ///< Return the branch having asym ID @a asymID
[[nodiscard]] const branch &get_branch_by_asym_id(const std::string &asymID) const; ///< Return the branch having asym ID @a asymID
[[nodiscard]] const std::vector<residue> &non_polymers() const { return m_non_polymers; } ///< Return the list of non-polymers, actually the list of ligands
[[nodiscard]] bool has_atom_id(const std::string &id) const; ///< Return true if an atom with ID @a id exists in this structure
[[nodiscard]] atom get_atom_by_id(const std::string &id) const; ///< Return the atom with ID @a id
/// \brief Return the atom identified by the label_ values specified
[[nodiscard]] atom get_atom_by_label(const std::string &atomID, const std::string &asymID,
const std::string &compID, int seqID, const std::string &altID = "");
/// \brief Return the atom closest to point \a p
[[nodiscard]] atom get_atom_by_position(point p) const;
/// \brief Return the atom closest to point \a p with atom type \a type in a residue of type \a res_type
[[nodiscard]] atom get_atom_by_position_and_type(point p, std::string_view type, std::string_view res_type) const;
/// \brief Create a non-poly residue based on atoms already present in this structure.
residue &create_residue(const std::vector<atom> &atoms);
/// \brief Get a non-poly residue for an asym with id \a asymID
[[nodiscard]] residue &get_residue(const std::string &asymID)
{
return get_residue(asymID, 0, "");
}
/// \brief Get a non-poly residue for an asym with id \a asymID
[[nodiscard]] const residue &get_residue(const std::string &asymID) const
{
return get_residue(asymID, 0, "");
}
/// \brief Get a residue for an asym with id \a asymID seq id \a seqID and authSeqID \a authSeqID
[[nodiscard]] residue &get_residue(const std::string &asymID, int seqID, const std::string &authSeqID);
/// \brief Get a the single residue for an asym with id \a asymID seq id \a seqID and authSeqID \a authSeqID
[[nodiscard]] const residue &get_residue(const std::string &asymID, int seqID, const std::string &authSeqID) const
{
return const_cast<structure *>(this)->get_residue(asymID, seqID, authSeqID);
}
/// \brief Get a residue for an asym with id \a asymID, compound id \a compID, seq id \a seqID and authSeqID \a authSeqID
[[nodiscard]] residue &get_residue(const std::string &asymID, const std::string &compID, int seqID, const std::string &authSeqID);
/// \brief Get a residue for an asym with id \a asymID, compound id \a compID, seq id \a seqID and authSeqID \a authSeqID
[[nodiscard]] const residue &get_residue(const std::string &asymID, const std::string &compID, int seqID, const std::string &authSeqID) const
{
return const_cast<structure *>(this)->get_residue(asymID, compID, seqID, authSeqID);
}
/// \brief Get a the residue for atom \a atom
[[nodiscard]] residue &get_residue(const atom &atom)
{
return get_residue(atom.get_label_asym_id(), atom.get_label_comp_id(), atom.get_label_seq_id(), atom.get_auth_seq_id());
}
/// \brief Get a the residue for atom \a atom
[[nodiscard]] const residue &get_residue(const atom &atom) const
{
return get_residue(atom.get_label_asym_id(), atom.get_label_comp_id(), atom.get_label_seq_id(), atom.get_auth_seq_id());
}
// Actions. Originally a lot more actions were expected here
/// \brief Remove atom @a a
void remove_atom(atom &a)
{
remove_atom(a, true);
}
void swap_atoms(atom a1, atom a2); ///< swap the labels for these atoms
void move_atom(atom a, point p); ///< move atom to a new location
/**
* @brief Change residue @a res to a new compound ID optionally
* remapping atoms.
*
* A new chem_comp entry as well as an entity is created if needed and
* if the list of @a remappedAtoms is not empty it is used to remap.
*
* The array in @a remappedAtoms contains tuples of strings, both
* strings contain an atom_id. The first is the one in the current
* residue and the second is the atom_id that should be used instead.
* If the second string is empty, the atom is removed from the residue.
*
* @param res
* @param newcompound
* @param remappedAtoms
*/
void change_residue(residue &res, const std::string &newcompound,
const std::vector<std::tuple<std::string, std::string>> &remappedAtoms);
/// \brief Remove a residue, can be monomer or nonpoly
///
/// \param asym_id The asym ID
/// \param seq_id The sequence ID
/// \param auth_seq_id The auth sequence ID
void remove_residue(const std::string &asym_id, int seq_id, const std::string &auth_seq_id);
/// \brief Create a new non-polymer entity, returns new ID
/// \param mon_id The mon_id for the new nonpoly, must be an existing and known compound from CCD
/// \return The ID of the created entity
std::string create_non_poly_entity(const std::string &mon_id);
/// \brief Create a new NonPolymer struct_asym with atoms constructed from \a atoms, returns asym_id.
/// This method assumes you are copying data from one cif file to another.
///
/// \param entity_id The entity ID of the new nonpoly
/// \param atoms The array of atom_site rows containing the data.
/// \return The newly create asym ID
std::string create_non_poly(const std::string &entity_id, const std::vector<atom> &atoms);
/// \brief Create a new NonPolymer struct_asym with atoms constructed from info in \a atom_info, returns asym_id.
/// This method creates new atom records filled with info from the info.
///
/// \param entity_id The entity ID of the new nonpoly
/// \param atoms The array of sets of item data containing the data for the atoms.
/// \return The newly create asym ID
std::string create_non_poly(const std::string &entity_id, std::vector<row_initializer> atoms);
/// \brief Create a new NonPolymer struct_asym for a compound of type \a compound_id, returns asym_id.
/// This method creates new atom records filled with info from the CCD compound info.
///
/// \param compound_id The compound ID of the new nonpoly
/// \param skip_hydrogen Do not create hydrogen atoms when true
/// \return The newly create asym ID
std::string create_non_poly(const std::string &compound_id, bool skip_hydrogen);
/// \brief Create a new water with atom constructed from info in \a atom_info
/// This method creates a new atom record filled with info from the info.
///
/// \param atom The set of item data containing the data for the atoms.
void create_water(row_initializer atom);
/// \brief Create a link, a struct_conn record for two atoms.
///
/// \param a1 Atom 1
/// \param a2 Atom 2
/// \param link_type The struct_conn_type ID for the link
/// \param role The pdbx_role field value
/// \return The ID of the struct_conn record created
std::string create_link(atom a1, atom a2, const std::string &link_type, const std::string &role);
/// \brief Create a new and empty (sugar) branch
branch &create_branch();
// /// \brief Create a new (sugar) branch with one first NAG containing atoms constructed from \a atoms
// branch &create_branch(std::vector<row_initializer> atoms);
// /// \brief Extend an existing (sugar) branch identified by \a asymID with one sugar containing atoms constructed from \a atom_info
// ///
// /// \param asym_id The asym id of the branch to extend
// /// \param atom_info Array containing the info for the atoms to construct for the new sugar
// /// \param link_sugar The sugar to link to, note: this is the sugar number (1 based)
// /// \param link_atom The atom id of the atom linked in the sugar
// branch &extend_branch(const std::string &asym_id, std::vector<row_initializer> atom_info,
// int link_sugar, const std::string &link_atom);
/// \brief Remove \a branch
void remove_branch(branch &branch);
/// \brief Remove residue \a res
///
/// \param res The residue to remove
void remove_residue(residue &res);
/// \brief Translate the coordinates of all atoms in the structure by \a t
void translate(point t);
/// \brief Rotate the coordinates of all atoms in the structure by \a q
void rotate(quaternion t);
/// \brief Translate and rotate the coordinates of all atoms in the structure by \a t and \a q
void translate_and_rotate(point t, quaternion q);
/// \brief Translate, rotate and translate again the coordinates of all atoms in the structure by \a t1 , \a q and \a t2
void translate_rotate_and_translate(point t1, quaternion q, point t2);
/// \brief Remove all categories that have no rows left
void cleanup_empty_categories();
/// \brief Direct access to underlying data
[[nodiscard]] category &get_category(std::string_view name) const
{
return m_db[name];
}
/// \brief Direct access to underlying data
[[nodiscard]] datablock &get_datablock() const
{
return m_db;
}
/// \brief Check if all atoms are part of either a polymer, a branch or one of the non-polymer residues
void validate_atoms() const;
/// \brief emplace a newly created atom using @a args
atom &emplace_atom(datablock &db, const_row_handle rh)
{
return emplace_atom(atom{ db, rh });
}
/// \brief emplace the moved atom @a atom
atom &emplace_atom(atom atom);
/// \brief Reorder atom_site atoms based on 'natural' ordering
void reorder_atoms();
private:
friend polymer;
friend residue;
void load_atoms_for_model(structure_open_options options);
std::string insert_compound(const std::string &compoundID, bool is_entity);
std::string create_entity_for_branch(branch &branch);
void load_data();
void remove_atom(atom &a, bool removeFromResidue);
void remove_sugar(sugar &sugar);
datablock &m_db;
std::size_t m_model_nr;
std::vector<atom> m_atoms;
std::vector<std::size_t> m_atom_index;
std::list<polymer> m_polymers;
std::list<branch> m_branches;
std::vector<residue> m_non_polymers;
};
} // namespace cif::mm