Files
libcifpp/src/validate.cpp
Maarten L. Hekkelman 96f0ae694c Avoid some string copies
2026-02-21 14:25:13 +01:00

648 lines
17 KiB
C++

/*-
* SPDX-License-Identifier: BSD-2-Clause
*
* Copyright (c) 2020 NKI/AVL, Netherlands Cancer Institute
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "cif++/validate.hpp"
#include "cif++/cif++.hpp"
#include <cassert>
#include <charconv>
#include <cstddef>
#include <filesystem>
#include <format>
#include <iomanip>
#include <iostream>
#include <list>
#include <memory>
#include <mutex>
#include <optional>
#include <ranges>
#include <set>
#include <stdexcept>
#include <string>
#include <string_view>
#include <system_error>
#include <tuple>
#include <utility>
#include <vector>
// The validator depends on regular expressions. Unfortunately,
// the implementation of std::regex in g++ is buggy and crashes
// on reading the pdbx dictionary. We used to use boost regex
// instead but using pcre2 is even easier and faster.
#define PCRE2_CODE_UNIT_WIDTH 8
#include <pcre2.h>
namespace cif
{
validation_exception::validation_exception(std::error_code ec)
: runtime_error(ec.message())
{
}
validation_exception::validation_exception(std::error_code ec, std::string_view category)
: runtime_error((ec.message() + "; category: ").append(category))
{
}
validation_exception::validation_exception(std::error_code ec, std::string_view category, std::string_view item)
: runtime_error((ec.message() + "; category: ").append(category).append("; item: ").append(item))
{
}
// --------------------------------------------------------------------
struct regex_impl
{
regex_impl(std::string_view rx);
~regex_impl();
regex_impl(const regex_impl &) = delete;
regex_impl &operator=(const regex_impl &) = delete;
bool match(std::string_view v) const;
private:
pcre2_code_8 *m_rx = nullptr;
pcre2_match_data *m_data = nullptr;
mutable std::mutex m_mutex;
};
regex_impl::regex_impl(std::string_view rx)
{
int err_code;
size_t err_offset;
m_rx = pcre2_compile(reinterpret_cast<PCRE2_SPTR>(rx.data()), rx.length(), 0, &err_code, &err_offset, nullptr);
if (m_rx == nullptr)
{
char buffer[256];
int n = pcre2_get_error_message(err_code, reinterpret_cast<unsigned char *>(&buffer[0]), sizeof(buffer));
throw std::runtime_error(std::string("PCRE2 compilation failed: ") + std::string{ buffer, buffer + n });
}
m_data = pcre2_match_data_create_from_pattern(m_rx, nullptr);
}
regex_impl::~regex_impl()
{
std::unique_lock lock(m_mutex);
if (m_data)
pcre2_match_data_free(m_data);
if (m_rx)
pcre2_code_free(m_rx);
}
bool regex_impl::match(std::string_view v) const
{
std::unique_lock lock(m_mutex);
bool result = false;
if (int rc = pcre2_match(m_rx, reinterpret_cast<PCRE2_SPTR>(v.data()), v.length(), 0, 0, m_data, nullptr); rc >= 0)
result = true;
else if (rc != PCRE2_ERROR_NOMATCH)
std::cerr << "Error matching with pcre\n";
return result;
}
// --------------------------------------------------------------------
DDL_PrimitiveType map_to_primitive_type(std::string_view s, std::error_code &ec) noexcept
{
ec = {};
DDL_PrimitiveType result = DDL_PrimitiveType::Char;
if (iequals(s, "char"))
result = DDL_PrimitiveType::Char;
else if (iequals(s, "uchar"))
result = DDL_PrimitiveType::UChar;
else if (iequals(s, "numb"))
result = DDL_PrimitiveType::Numb;
else
ec = make_error_code(validation_error::not_a_known_primitive_type);
return result;
}
DDL_PrimitiveType map_to_primitive_type(std::string_view s)
{
std::error_code ec;
auto result = map_to_primitive_type(s, ec);
if (ec)
throw std::system_error(ec, std::string{ s });
return result;
}
// --------------------------------------------------------------------
type_validator::type_validator(std::string_view name, DDL_PrimitiveType type, std::string_view rx)
: m_name(name)
, m_primitive_type(type)
, m_rx(new regex_impl(rx.empty() ? ".+" : rx)) /// Empty regular expressions are not allowed, in libcpp's std::regex (POSIX?)
{
}
int type_validator::compare(const item_value &a, const item_value &b) const
{
switch (m_primitive_type)
{
using enum DDL_PrimitiveType;
case Numb:
{
if (a.is_number() and b.is_number())
return a.compare(b);
auto da = a.get<double>();
auto db = b.get<double>();
return da < db
? -1
: da > db
? 1
: 0;
}
case UChar:
if (a.is_string() and b.is_string())
return a.compare(b, true);
return icompare(a.str(), b.str());
case Char:
if (a.is_string() and b.is_string())
return a.compare(b, false);
return a.str().compare(b.str());
default:
throw std::runtime_error("invalid primitive type");
}
}
// --------------------------------------------------------------------
void item_validator::validate_value(const item_value &value) const
{
std::error_code ec;
if (not validate_value(value, ec))
throw std::system_error(ec, std::format("'{}' is not a valid value for {}", value.str(), m_item_name));
}
bool item_validator::validate_value(const item_value &value, std::error_code &ec) const noexcept
{
ec.clear();
if (not value.empty())
{
if (m_type != nullptr)
{
if (m_type->m_primitive_type == DDL_PrimitiveType::Numb)
{
if (not value.is_number())
ec = make_error_code(validation_error::value_is_not_a_number);
}
else
{
if (value.is_number())
ec = make_error_code(validation_error::value_is_not_a_char_string);
else
{
try
{
if (not m_type->m_rx->match(value.sv()))
ec = make_error_code(validation_error::value_does_not_match_rx);
}
catch (...)
{
ec = make_error_code(validation_error::value_does_not_match_rx);
}
if (ec == std::errc{} and not m_enums.empty() and m_enums.count(value.str()) == 0)
ec = make_error_code(validation_error::value_is_not_in_enumeration_list);
}
}
}
}
return ec == std::errc{};
}
// --------------------------------------------------------------------
void category_validator::add_item_validator(item_validator &&v)
{
if (v.m_mandatory)
m_mandatory_items.insert(v.m_item_name);
v.m_category = m_name;
auto i = std::ranges::find(m_item_validators, v);
if (i != m_item_validators.end())
{
if (VERBOSE >= 4)
std::cout << "Could not add validator for item " << v.m_item_name << " to category " << m_name << '\n';
}
else
m_item_validators.emplace_back(std::move(v));
}
const item_validator *category_validator::get_validator_for_item(std::string_view item_name) const
{
const item_validator *result = nullptr;
auto i = std::ranges::find(m_item_validators, item_validator{ std::string(item_name) });
if (i != m_item_validators.end())
result = &*i;
else if (VERBOSE > 4)
std::cout << "No validator for item " << item_name << '\n';
return result;
}
const item_validator *category_validator::get_validator_for_aliased_item(std::string_view item_name) const
{
const item_validator *result = nullptr;
for (auto &iv : m_item_validators)
{
for (auto &ai : iv.m_aliases)
{
const auto &[cat, name] = split_item_name(ai.m_name);
if (iequals(name, item_name) and iequals(cat, m_name))
{
result = &iv;
break;
}
}
if (result)
break;
}
return result;
}
// --------------------------------------------------------------------
void swap(validator &a, validator &b) noexcept
{
std::swap(a.m_audit_conform, b.m_audit_conform);
std::swap(a.m_strict, b.m_strict);
std::swap(a.m_type_validators, b.m_type_validators);
std::swap(a.m_category_validators, b.m_category_validators);
std::swap(a.m_link_validators, b.m_link_validators);
}
void validator::parse(std::istream &is)
{
parse_dictionary(*this, is);
}
void validator::add_type_validator(type_validator &&v)
{
m_type_validators.emplace(v);
}
const type_validator *validator::get_validator_for_type(std::string_view typeCode) const
{
const type_validator *result = nullptr;
auto i = m_type_validators.find(type_validator{ std::string(typeCode), DDL_PrimitiveType::Char, {} });
if (i != m_type_validators.end())
result = &*i;
return result;
}
void validator::add_category_validator(category_validator &&v)
{
m_category_validators.emplace(v);
}
const category_validator *validator::get_validator_for_category(std::string_view category) const
{
const category_validator *result = nullptr;
auto i = m_category_validators.find(category_validator{ std::string(category) });
if (i != m_category_validators.end())
result = &*i;
return result;
}
item_validator *validator::get_validator_for_item(std::string_view item_name) const
{
item_validator *result = nullptr;
std::string cat, item;
std::tie(cat, item) = split_item_name(item_name);
auto *cv = get_validator_for_category(cat);
if (cv != nullptr)
result = const_cast<item_validator *>(cv->get_validator_for_item(item));
if (result == nullptr and VERBOSE > 4)
std::cout << "No validator for item " << item_name << '\n';
return result;
}
void validator::add_link_validator(link_validator &&v)
{
assert(v.m_parent_keys.size() == v.m_child_keys.size());
if (v.m_parent_keys.size() != v.m_child_keys.size())
throw std::runtime_error("unequal number of keys for parent and child in link");
auto pcv = get_validator_for_category(v.m_parent_category);
auto ccv = get_validator_for_category(v.m_child_category);
if (pcv == nullptr)
throw std::runtime_error("unknown parent category " + v.m_parent_category);
if (ccv == nullptr)
throw std::runtime_error("unknown child category " + v.m_child_category);
for (std::size_t i = 0; i < v.m_parent_keys.size(); ++i)
{
auto piv = pcv->get_validator_for_item(v.m_parent_keys[i]);
if (piv == nullptr)
throw std::runtime_error("unknown parent item _" + v.m_parent_category + '.' + v.m_parent_keys[i]);
auto civ = ccv->get_validator_for_item(v.m_child_keys[i]);
if (civ == nullptr)
throw std::runtime_error("unknown child item _" + v.m_child_category + '.' + v.m_child_keys[i]);
if (civ->m_type == nullptr and piv->m_type != nullptr)
const_cast<item_validator *>(civ)->m_type = piv->m_type;
}
m_link_validators.emplace_back(std::move(v));
}
std::vector<const link_validator *> validator::get_links_for_parent(std::string_view category) const
{
std::vector<const link_validator *> result;
for (auto &l : m_link_validators)
{
if (iequals(l.m_parent_category, category))
result.push_back(&l);
}
return result;
}
std::vector<const link_validator *> validator::get_links_for_child(std::string_view category) const
{
std::vector<const link_validator *> result;
for (auto &l : m_link_validators)
{
if (iequals(l.m_child_category, category))
result.push_back(&l);
}
return result;
}
void validator::report_error(std::error_code ec, bool fatal) const
{
if (m_strict or fatal)
throw validation_exception(ec);
else if (VERBOSE > 0)
std::cerr << ec.message() << '\n';
}
void validator::report_error(std::error_code ec, std::string_view category,
std::string_view item, bool fatal) const
{
if (m_strict or fatal)
{
if (item.empty())
throw validation_exception(ec, category);
else
throw validation_exception(ec, category, item);
}
if (VERBOSE > 0)
std::cerr << ec.message()
<< "; category: " << std::quoted(category)
<< " item: " << std::quoted(item)
<< '\n';
}
void validator::fill_audit_conform(category &audit_conform) const
{
audit_conform.clear();
audit_conform.emplace(m_audit_conform.begin(), m_audit_conform.end());
}
bool validator::matches_audit_conform(const category &audit_conform) const
{
if (audit_conform.empty())
return false;
auto ai = m_audit_conform.begin();
auto bi = audit_conform.begin();
while (ai != m_audit_conform.end() and bi != audit_conform.end())
{
const auto &[name_a, version_a] = ai->get<std::string, std::optional<std::string>>("dict_name", "dict_version");
const auto &[name_b, version_b] = bi->get<std::string, std::optional<std::string>>("dict_name", "dict_version");
++ai;
++bi;
if (name_a != name_b)
return false;
if (not version_b.has_value() or not version_a.has_value())
continue;
if (validator_factory::check_version(name_a, *version_b, *version_a) == false)
return false;
}
return ai == m_audit_conform.end() and bi == audit_conform.end();
}
void validator::append_audit_conform(const std::string &name, const std::optional<std::string> &version)
{
m_audit_conform.emplace({ //
{ "dict_name", name },
{ "dict_version", version } });
}
// --------------------------------------------------------------------
validator_factory &validator_factory::instance()
{
static validator_factory s_instance;
return s_instance;
}
const validator *validator_factory::get(std::string_view dictionary_name)
{
category audit_conform("audit_conform");
for (auto part : cif::split(dictionary_name, ";,", true))
audit_conform.emplace({ { "dict_name", part } });
return get(audit_conform);
}
const validator *validator_factory::get(const category &audit_conform)
{
const validator *result = nullptr;
std::scoped_lock lock(m_mutex);
// Check existing first
for (auto &v : m_validators)
{
if (v.matches_audit_conform(audit_conform))
result = &v;
}
// If the audit conform contains only one record, this is easy
if (result == nullptr and audit_conform.size() == 1)
{
const auto &[name, version] =
audit_conform.front().get<std::string, std::optional<std::string>>("dict_name", "dict_version");
if (not name.empty())
result = &m_validators.emplace_back(construct_validator(name, version));
}
if (result == nullptr)
{
// A new, merged dictionary
std::optional<validator> v;
for (const auto &[name, version] : audit_conform.rows<std::string, std::optional<std::string>>("dict_name", "dict_version"))
{
if (name.empty())
continue;
if (not v) // first dict
v = construct_validator(name, version);
else // additional/extending dict
{
auto data = load_resource(name);
if (not data)
throw std::runtime_error("Could not load dictionary " + std::string{ name });
v->parse(*data);
}
}
if (v)
result = &m_validators.emplace_back(std::move(*v));
}
return result;
}
const validator &validator_factory::operator[](const category &audit_conform)
{
auto v = get(audit_conform);
if (v == nullptr)
throw std::runtime_error("Could not load dictionary for audit_conform");
return *v;
}
const validator &validator_factory::operator[](std::string_view dictionary_name)
{
auto v = get(dictionary_name);
if (v == nullptr)
throw std::runtime_error("Could not load dictionary for " + std::string{ dictionary_name });
return *v;
}
validator validator_factory::construct_validator(std::string_view name, std::optional<std::string> version)
{
auto data = load_resource(name);
if (not data and name == "mmcif_pdbx_v50")
data = load_resource("mmcif_pdbx.dic");
if (not data)
throw std::runtime_error("Could not load dictionary " + std::string{ name });
validator v;
v.parse(*data);
if (version.has_value() and VERBOSE >= 0 and
not v.matches_audit_conform(category{ "audit_conform", //
{ { "dict_name", name }, { "dict_version", version } } }))
{
std::clog << "Loaded dictionary does not match name=" << name << " and version=" << version.value_or("''") << "\n";
}
return v;
}
bool validator_factory::check_version(std::string_view name, std::string_view expected, std::string_view found)
{
bool result = true;
auto el = cif::split(expected, ".");
auto fl = cif::split(found, ".");
auto eli = el.begin();
auto fli = fl.begin();
while (eli != el.end() and fli != fl.end())
{
int e_int, f_int;
if (auto [ptr, ec] = std::from_chars(eli->data(), eli->data() + eli->length(), e_int); ec != std::errc{})
{
std::clog << "Could not parse requested version string for dictionary " << std::quoted(expected) << "\n";
result = false;
break;
}
if (auto [ptr, ec] = std::from_chars(fli->data(), fli->data() + fli->length(), f_int); ec != std::errc{})
{
std::clog << "Could not parse version string in dictionary " << name << " " << std::quoted(found) << "\n";
result = false;
break;
}
if (f_int > e_int) // newer version, assume this is ok
break;
if (f_int < e_int)
{
std::clog << "The version in dictionary " << name << " is lower than requested, this may cause validation errors\n";
result = false;
break;
}
++eli;
++fli;
}
return result;
}
} // namespace cif