/* * CIF tokenizer * * All keys are canonicalized to lowercase * * (c) 2014 Schrodinger, Inc. */ #include #include #include #include #include #include #include #include #include #include #include "CifFile.h" #include "File.h" #include "MemoryDebug.h" #include "strcasecmp.h" #if !defined(_PYMOL_NO_MSGPACKC) #include #endif using namespace pymol::cif; namespace pymol { namespace _cif_detail { template <> const char* raw_to_typed(const char* s) { return s; } template <> std::string raw_to_typed(const char* s) { return s; } template <> char raw_to_typed(const char* s) { return s[0]; } template <> int raw_to_typed(const char* s) { return atoi(s); } /** * Convert to floating point number, ignores uncertainty notation * 1.23(45)e2 -> 1.23e2 */ template <> double raw_to_typed(const char* s) { const char *close, *open = strchr(s, '('); if (open && (close = strchr(open, ')'))) { return atof(std::string(s, open - s).append(close + 1).c_str()); } return atof(s); } template <> float raw_to_typed(const char* s) { return static_cast(raw_to_typed(s)); } } // namespace _cif_detail // basic IO and string handling // Return true if "c" is whitespace or null static bool iswhitespace0(char c) { return strchr(" \t\r\n", c) ? true : false; } // Return true if "c" is whitespace static bool iswhitespace(char c) { return (c && iswhitespace0(c)); } // Return true if "c" is line feed or carriage return static bool islinefeed(char c) { return (c == '\r' || c == '\n'); } // Return true if "c" is line feed or carriage return or null static bool islinefeed0(char c) { return (!c || islinefeed(c)); } // Return true if "c" is double or single quote static bool isquote(char c) { return (c == '"' || c == '\''); } // FreeBSD name conflict #ifdef isspecial #undef isspecial #endif // Return true if token is a STAR keyword static bool isspecial(const char *token) { return (token[0] == '_' || strncasecmp("data_", token, 5) == 0 || strncasecmp("save_", token, 5) == 0 || strcasecmp("loop_", token) == 0 || strcasecmp("stop_", token) == 0 || strcasecmp("global_", token) == 0); } // convert all chars to lowercase static void tolowerinplace(char *p) { for (; *p; p++) { if (*p <= 'Z' && *p >= 'A') *p -= 'Z' - 'z'; } } // CIF stuff static const cif_array EMPTY_ARRAY(nullptr); /* * Class to store CIF loops. Only for parsing, do not use in any higher level * reading functions. */ class cif_loop { public: int ncols; int nrows; const char **values; // methods const char * get_value_raw(int row, int col) const; }; // get table value, return nullptr if indices out of bounds const char * cif_loop::get_value_raw(int row, int col) const { if (row >= nrows) return nullptr; return values[row * ncols + col]; } // get the number of elements in this array unsigned cif_array::size() const { if (auto arr = std::get_if(&m_array)) { return (arr->col == cif_detail::cif_str_array::NOT_IN_LOOP) ? 1 : arr->pointer.loop->nrows; } else if (auto arr = std::get_if(&m_array)) { return arr->m_arr.size(); } return 0; } /// Get array value, return nullptr if `pos >= size()` or value in ['.', '?'] const char* cif_detail::cif_str_array::get_value_raw(unsigned pos) const { if (col == NOT_IN_LOOP) return (pos > 0) ? nullptr : pointer.value; return pointer.loop->get_value_raw(pos, col); } // true if all values in ['.', '?'] bool cif_array::is_missing_all() const { for (unsigned i = 0, n = size(); i != n; ++i) { if (!is_missing(i)) return false; } return true; } /** * Get a pointer to array or nullptr if not found * * Can lookup different aliases, the first one found is returned. * Also supports an alias shortcut for the trivial case where mmCIF uses * a colon and CIF uses an underscore: `get_arr("_foo?bar")` is identical to * `get_arr("_foo.bar", "_foo_bar")` * * @param key data name, must be lower case */ const cif_array * cif_data::get_arr(const char * key) const { if (auto data = std::get_if(&m_data)) { const auto& dict = data->m_dict; const char* p = strchr(key, '?'); std::remove_reference_t::const_iterator it; #ifndef NDEBUG for (const char* q = key; *q; ++q) { assert("key must be lower case" && !('Z' >= *q && *q >= 'A')); } #endif // support alias shortcut: '?' matches '.' and '_' if (p != nullptr) { std::string tmp(key); // replace '?' by '.' or '_' tmp[p - key] = '.'; if ((it = dict.find(tmp.c_str())) != dict.end()) return &it->second; tmp[p - key] = '_'; if ((it = dict.find(tmp.c_str())) != dict.end()) return &it->second; } else { if ((it = dict.find(key)) != dict.end()) return &it->second; } } else if (auto data = std::get_if(&m_data)) { const auto& dict = data->m_dict; std::string_view keyView(key); auto split_key = [](const char c) { return c == '.' /*|| c == '_'*/ || c == '?'; }; auto splitTokenIt = std::find_if(keyView.begin(), keyView.end(), split_key); if (splitTokenIt == keyView.end()) { return nullptr; } auto dist = std::distance(keyView.begin(), splitTokenIt); auto categoryView = keyView.substr(0, dist); auto categoryStr = std::string(categoryView); auto categoryIt = dict.find(categoryStr.c_str()); if (categoryIt == dict.end()) { return nullptr; } auto& category = categoryIt->second; auto columnView = keyView.substr(dist + 1); auto columnStr = std::string(columnView); auto columnIt = category.find(columnStr.c_str()); if (columnIt == category.end()) { return nullptr; } return &columnIt->second; } return nullptr; } const char* cif_data::code() const { if (auto data = std::get_if(&m_data)) { return data->m_code ? data->m_code : ""; } if (auto data = std::get_if(&m_data)) { return data->m_code.c_str(); } return ""; } const cif_array* cif_data::empty_array() { return &EMPTY_ARRAY; } const cif_detail::cif_str_data* cif_data::get_saveframe(const char* code) const { if (auto data = std::get_if(&m_data)) { const auto& saveframes = data->m_saveframes; auto it = saveframes.find(code); if (it != saveframes.end()) return &it->second; } return nullptr; } bool cif_file::parse_file(const char* filename) { char* contents = FileGetContents(filename, nullptr); if (!contents) { error(std::string("failed to read file ").append(filename).c_str()); return false; } return parse(std::move(contents)); } bool cif_file::parse_string(const char* contents) { return parse(std::move(mstrdup(contents))); } void cif_file::error(const char* msg) { std::cout << "ERROR " << msg << std::endl; } // constructor cif_file::cif_file(const char* filename, const char* contents_) { if (contents_) { parse_string(contents_); } else if (filename) { parse_file(filename); } } // constructor cif_file::cif_file() = default; cif_file::cif_file(cif_file&&) = default; // move assignment cif_file& cif_file::operator=(cif_file&&) = default; // destructor cif_file::~cif_file() = default; bool cif_file::parse(char*&& p) { m_datablocks.clear(); m_tokens.clear(); m_contents.reset(p); if (!p) { error("parse(nullptr)"); return false; } auto& tokens = m_tokens; char quote; char prev = '\0'; std::vector keypossible; // tokenize while (true) { while (iswhitespace(*p)) prev = *(p++); if (!*p) break; if (*p == '#') { while (!(islinefeed0(*++p))); prev = *p; } else if (isquote(*p)) { // will nullptr the closing quote quote = *p; keypossible.push_back(false); tokens.push_back(p + 1); while (*++p && !(*p == quote && iswhitespace0(p[1]))); if (*p) *(p++) = 0; prev = *p; } else if (*p == ';' && islinefeed(prev)) { // multi-line tokens start with ";" and end with "\n;" // multi-line tokens cannot be keys, only values. keypossible.push_back(false); tokens.push_back(p + 1); // advance until `\n;` while (*++p && !(islinefeed(*p) && p[1] == ';')); // step to next line and null the line feed if (*p) { *p = 0; // \r\n on Windows) if (p - 1 > tokens.back() && *(p - 1) == '\r') { *(p - 1) = 0; } p += 2; } prev = ';'; } else { // will null the whitespace char * q = p++; while (!iswhitespace0(*p)) ++p; prev = *p; if (p - q == 1 && (*q == '?' || *q == '.')) { // store values '.' (inapplicable) and '?' (unknown) as null-pointers q = nullptr; keypossible.push_back(false); } else { if (*p) *(p++) = 0; keypossible.push_back(true); } tokens.push_back(q); } } cif_detail::cif_str_data* current_frame = nullptr; std::vector frame_stack; std::unique_ptr global_block; decltype(m_datablocks) datablocksnew; // parse into dictionary for (unsigned int i = 0, n = tokens.size(); i < n; i++) { if (!keypossible[i]) { error("expected key (1)"); return false; } else if (tokens[i][0] == '_') { if (!current_frame) { error("missing data_ (unexpected data name)"); return false; } if (i + 1 == n) { error("truncated"); return false; } tolowerinplace(tokens[i]); current_frame->m_dict[tokens[i]].m_array = cif_detail::cif_str_array{}; auto& cif_arr = std::get( current_frame->m_dict[tokens[i]].m_array); cif_arr.set_value(tokens[i + 1]); i++; } else if (strcasecmp("loop_", tokens[i]) == 0) { if (!current_frame) { error("missing data_ (unexpected loop)"); return false; } int ncols = 0; int nrows = 0; cif_loop *loop = nullptr; // loop data loop = new cif_loop; current_frame->m_loops.emplace_back(loop); // columns while (++i < n && keypossible[i] && tokens[i][0] == '_') { tolowerinplace(tokens[i]); current_frame->m_dict[tokens[i]].m_array = cif_detail::cif_str_array{}; auto& cif_arr = std::get( current_frame->m_dict[tokens[i]].m_array); cif_arr.set_loop(loop, ncols); ncols++; } if (loop) { // loop data loop->values = (const char **) &tokens[i]; loop->ncols = ncols; } // rows while (i < n && !(keypossible[i] && isspecial(tokens[i]))) { i += ncols; if (i > n) { error("truncated loop"); return false; } nrows++; } // loop data if (loop) { loop->nrows = nrows; } i--; } else if (strncasecmp("data_", tokens[i], 5) == 0) { auto& new_data = datablocksnew[tokens[i] + 5]; new_data.m_data = cif_detail::cif_str_data(); current_frame = &std::get(new_data.m_data); current_frame->m_code = tokens[i] + 5; frame_stack = {current_frame}; } else if (strncasecmp("global_", tokens[i], 5) == 0) { // STAR feature, not supported in CIF auto new_data = new cif_data; new_data->m_data = cif_detail::cif_str_data{}; current_frame = &std::get(new_data->m_data); global_block.reset(new_data); frame_stack = {current_frame}; } else if (strncasecmp("save_", tokens[i], 5) == 0) { if (tokens[i][5]) { // begin if (!current_frame) { error("top-level save_"); return false; } const char * key(tokens[i] + 5); current_frame = ¤t_frame->m_saveframes[key]; frame_stack.push_back(current_frame); } else { // end if (frame_stack.size() < 2) { error("unexpected save_"); return false; } frame_stack.pop_back(); current_frame = frame_stack.back(); } } else { error("expected key (2)"); return false; } } m_datablocks = std::move(datablocksnew); return true; } #if !defined(_PYMOL_NO_MSGPACKC) template void decodeAndPushBack(const std::vector& bytes, std::size_t& i, std::size_t size, std::vector& result) { T value; std::memcpy(&value, &bytes[i], size); result.push_back(value); } static std::vector byte_array_decode(const std::vector& bytes, DataTypes dataType) { std::vector result; std::unordered_map dataTypeSize = { {DataTypes::Int8, sizeof(std::int8_t)}, {DataTypes::Int16, sizeof(std::int16_t)}, {DataTypes::Int32, sizeof(std::int32_t)}, {DataTypes::UInt8, sizeof(std::uint8_t)}, {DataTypes::UInt16, sizeof(std::uint16_t)}, {DataTypes::UInt32, sizeof(std::uint32_t)}, {DataTypes::Float32, sizeof(float)}, {DataTypes::Float64, sizeof(double)}, }; auto size = dataTypeSize[dataType]; for (std::size_t i = 0; i < bytes.size(); i += size) { CifArrayElement valueVar; switch (dataType) { case DataTypes::Int8: decodeAndPushBack(bytes, i, size, result); break; case DataTypes::Int16: decodeAndPushBack(bytes, i, size, result); break; case DataTypes::Int32: decodeAndPushBack(bytes, i, size, result); break; case DataTypes::UInt8: decodeAndPushBack(bytes, i, size, result); break; case DataTypes::UInt16: decodeAndPushBack(bytes, i, size, result); break; case DataTypes::UInt32: decodeAndPushBack(bytes, i, size, result); break; case DataTypes::Float32: decodeAndPushBack(bytes, i, size, result); break; case DataTypes::Float64: decodeAndPushBack(bytes, i, size, result); break; } } return result; } static std::vector integer_packing_decode( const std::vector& packedInts, int byteCount, int srcSize, bool isUnsigned) { std::vector result(srcSize); std::int32_t upperLimit; if (isUnsigned) { upperLimit = byteCount == 1 ? std::numeric_limits::max() : std::numeric_limits::max(); } else { upperLimit = byteCount == 1 ? std::numeric_limits::max() : std::numeric_limits::max(); } std::int32_t lowerLimit = -upperLimit - 1; auto as_int = [isUnsigned, byteCount](auto&& elem) -> std::int32_t { if (isUnsigned) { return byteCount == 1 ? static_cast(std::get(elem)) : static_cast(std::get(elem)); } else { return byteCount == 1 ? static_cast(std::get(elem)) : static_cast(std::get(elem)); } }; auto at_limit = [isUnsigned, upperLimit, lowerLimit](std::int32_t t) -> bool { return isUnsigned ? (t == upperLimit) : (t == upperLimit || t == lowerLimit); }; for (int i = 0, j = 0; i < packedInts.size(); ++i, ++j) { std::int32_t value = 0; std::int32_t t = as_int(packedInts[i]); while (at_limit(t)) { value += t; t = as_int(packedInts[++i]); } value += t; result[j] = value; } return result; } static std::vector delta_decode( std::vector& data, std::int32_t origin, DataTypes srcType) { std::vector result = data; result[0] = origin; auto add_int32_t = [](auto&& a, auto&& b) -> std::int32_t { return std::get(a) + std::get(b); }; std::inclusive_scan(result.begin(), result.end(), result.begin(), add_int32_t); return result; } static std::vector run_length_decode( std::vector& data, DataTypes srcType, int srcSize) { std::vector result; for (std::size_t i = 0; i < data.size(); i += 2) { auto item = std::get(data[i]); auto count = std::get(data[i + 1]); for (std::int32_t j = 0; j < count; j++) { result.push_back(item); } } return result; } static std::vector fixed_array_decode( std::vector& data, int factor, DataTypes srcType) { std::vector result = data; auto div_int32_t = [factor, srcType](auto&& a) -> auto { return srcType == DataTypes::Float32 ? std::get(a) / static_cast(factor) : std::get(a) / static_cast(factor); }; std::transform(data.begin(), data.end(), result.begin(), div_int32_t); return result; } static std::vector interval_quant_decode( std::vector& data, double min, double max, int numSteps, DataTypes srcType) { std::vector result = data; auto delta = (max - min) / (numSteps - 1); std::transform(data.begin(), data.end(), result.begin(), [min, delta](auto&& a) -> double { return min + std::get(a) * delta; }); return result; } static std::vector parse_bcif_decode( const std::vector& rawData, std::vector>& dataEncoding); static std::vector string_array_decode( const std::vector& data, std::vector>& indicesEncoding, const std::string& stringData, const std::vector& offsets, std::vector>& offsetEncoding) { auto decodedOffsets = parse_bcif_decode(offsets, offsetEncoding); auto indices = parse_bcif_decode(data, indicesEncoding); std::vector result; result.reserve(indices.size()); std::vector strings = {""}; strings.reserve(decodedOffsets.size()); for (int i = 1; i < decodedOffsets.size(); i++) { auto start = std::get(decodedOffsets[i - 1]); auto end = std::get(decodedOffsets[i]); auto str = stringData.substr(start, end - start); strings.push_back(str); } for (int i = 0; i < indices.size(); i++) { auto index = std::get(indices[i]); result.push_back(strings[index + 1]); } return result; } static void parse_bcif_decode_kind(const std::string& kind, const std::vector& rawData, std::vector& result, std::map& dataEncoding) { if (kind == "ByteArray") { auto type = dataEncoding["type"].as(); result = byte_array_decode(rawData, static_cast(type)); } else if (kind == "FixedPoint") { auto factor = dataEncoding["factor"].as(); auto srcType = dataEncoding["srcType"].as(); result = fixed_array_decode(result, factor, static_cast(srcType)); } else if (kind == "IntervalQuantization") { auto min = dataEncoding["min"].as(); auto max = dataEncoding["max"].as(); auto numSteps = dataEncoding["numSteps"].as(); auto srcType = dataEncoding["srcType"].as(); result = interval_quant_decode(result, min, max, numSteps, static_cast(srcType)); } else if (kind == "RunLength") { auto srcType = dataEncoding["srcType"].as(); auto srcSize = dataEncoding["srcSize"].as(); result = run_length_decode(result, static_cast(srcType), srcSize); } else if (kind == "Delta") { auto origin = dataEncoding["origin"].as(); auto srcType = dataEncoding["srcType"].as(); result = delta_decode(result, origin, static_cast(srcType)); } else if (kind == "IntegerPacking") { auto byteCount = dataEncoding["byteCount"].as(); auto srcSize = dataEncoding["srcSize"].as(); auto isUnsigned = dataEncoding["isUnsigned"].as(); result = integer_packing_decode(result, byteCount, srcSize, isUnsigned); } else if (kind == "StringArray") { auto indicesEncoding = dataEncoding["dataEncoding"].as>>(); auto stringData = dataEncoding["stringData"].as(); auto offsets = dataEncoding["offsets"].as>(); auto offsetEncoding = dataEncoding["offsetEncoding"].as>>(); result = string_array_decode(rawData, indicesEncoding, stringData, offsets, offsetEncoding); } } static std::vector parse_bcif_decode(const std::vector& rawData, std::vector>& dataEncoding) { std::vector result; for (auto it = std::rbegin(dataEncoding); it != std::rend(dataEncoding); ++it) { auto& dataEncode = *it; parse_bcif_decode_kind( dataEncode["kind"].as(), rawData, result, dataEncode); } return result; } bool cif_file::parse_bcif(const char* bytes, std::size_t size) { m_datablocks.clear(); m_tokens.clear(); auto oh = msgpack::unpack(bytes, size); auto msgobj = oh.get(); auto dict = msgobj.as>(); auto dataBlocksRaw = dict["dataBlocks"].as>(); for (const auto& block : dataBlocksRaw) { auto blockMap = block.as>(); auto header = blockMap["header"].as(); auto categoriesRaw = blockMap["categories"].as>(); auto& categoriesData = m_datablocks[header].m_data.emplace(); categoriesData.m_code = header; // Needed for multiplexing for (const auto& category : categoriesRaw) { auto categoryMap = category.as>(); auto categoryName = categoryMap["name"].as(); std::transform(categoryName.begin(), categoryName.end(), categoryName.begin(), ::tolower); auto columnsRaw = categoryMap["columns"].as>(); auto& columns = categoriesData.m_dict[categoryName]; for (const auto& column : columnsRaw) { auto columnMap = column.as>(); auto columnName = columnMap["name"].as(); std::transform(columnName.begin(), columnName.end(), columnName.begin(), ::tolower); auto dataRaw = columnMap["data"].as>(); auto dataData = dataRaw["data"].as>(); auto dataEncoding = dataRaw["encoding"].as>>(); columns[columnName] = parse_bcif_decode(dataData, dataEncoding); } } } return true; } #else bool cif_file::parse_bcif(const char* bytes, std::size_t size) { return false; } #endif // !defined(_PYMOL_NO_MSGPACKC) } // namespace pymol // vi:sw=2:ts=2