mirror of
https://github.com/abseil/abseil-cpp.git
synced 2026-06-04 12:07:05 +08:00
This is supposed to be a zero-diff change. PiperOrigin-RevId: 756859112 Change-Id: Ia81a84bc5d1e6f2a1299ca0ff5dbcec48583ab76
149 lines
5.7 KiB
C++
149 lines
5.7 KiB
C++
// Copyright 2017 The Abseil Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
|
|
|
|
// UTF8 utilities, implemented to reduce dependencies.
|
|
|
|
#include "absl/strings/internal/utf8.h"
|
|
|
|
#include <cstddef>
|
|
#include <cstdint>
|
|
#include <limits>
|
|
|
|
#include "absl/base/config.h"
|
|
|
|
namespace absl {
|
|
ABSL_NAMESPACE_BEGIN
|
|
namespace strings_internal {
|
|
|
|
// Writes the UTF-8 encoding of the code point `utf8_char` into `buffer` and
// returns the number of bytes written (between 1 and 4).
//
// The value is not validated: anything above 0xFFFF is emitted with the
// 4-byte layout, so `buffer` must always have room for 4 bytes.
size_t EncodeUTF8Char(char* buffer, char32_t utf8_char) {
  // ASCII code points are stored verbatim in a single byte.
  if (utf8_char <= 0x7F) {
    buffer[0] = static_cast<char>(utf8_char);
    return 1;
  }

  // Pick the sequence length and the marker bits of the leading byte.
  size_t length;
  char32_t lead_marker;
  if (utf8_char <= 0x7FF) {
    length = 2;
    lead_marker = 0xC0;  // 110xxxxx
  } else if (utf8_char <= 0xFFFF) {
    length = 3;
    lead_marker = 0xE0;  // 1110xxxx
  } else {
    length = 4;
    lead_marker = 0xF0;  // 11110xxx
  }

  // Fill the continuation bytes (10xxxxxx) from the last one backwards,
  // consuming six payload bits per byte.
  char32_t remaining = utf8_char;
  for (size_t index = length - 1; index > 0; --index) {
    buffer[index] = static_cast<char>(0x80 | (remaining & 0x3F));
    remaining >>= 6;
  }

  // Whatever is left after the shifts belongs in the leading byte.
  buffer[0] = static_cast<char>(lead_marker | remaining);
  return length;
}
|
|
|
|
size_t WideToUtf8(wchar_t wc, char* buf, ShiftState& s) {
|
|
// Reinterpret the output buffer `buf` as `unsigned char*` for subsequent
|
|
// bitwise operations. This ensures well-defined behavior for bit
|
|
// manipulations (avoiding issues with signed `char`) and is safe under C++
|
|
// aliasing rules, as `unsigned char` can alias any type.
|
|
auto* ubuf = reinterpret_cast<unsigned char*>(buf);
|
|
const uint32_t v = static_cast<uint32_t>(wc);
|
|
constexpr size_t kError = static_cast<size_t>(-1);
|
|
|
|
if (v <= 0x007F) {
|
|
// 1-byte sequence (U+0000 to U+007F).
|
|
// 0xxxxxxx.
|
|
ubuf[0] = (0b0111'1111 & v);
|
|
s = {}; // Reset surrogate state.
|
|
return 1;
|
|
} else if (0x0080 <= v && v <= 0x07FF) {
|
|
// 2-byte sequence (U+0080 to U+07FF).
|
|
// 110xxxxx 10xxxxxx.
|
|
ubuf[0] = 0b1100'0000 | (0b0001'1111 & (v >> 6));
|
|
ubuf[1] = 0b1000'0000 | (0b0011'1111 & v);
|
|
s = {}; // Reset surrogate state.
|
|
return 2;
|
|
} else if ((0x0800 <= v && v <= 0xD7FF) || (0xE000 <= v && v <= 0xFFFF)) {
|
|
// 3-byte sequence (U+0800 to U+D7FF or U+E000 to U+FFFF).
|
|
// Excludes surrogate code points U+D800-U+DFFF.
|
|
// 1110xxxx 10xxxxxx 10xxxxxx.
|
|
ubuf[0] = 0b1110'0000 | (0b0000'1111 & (v >> 12));
|
|
ubuf[1] = 0b1000'0000 | (0b0011'1111 & (v >> 6));
|
|
ubuf[2] = 0b1000'0000 | (0b0011'1111 & v);
|
|
s = {}; // Reset surrogate state.
|
|
return 3;
|
|
} else if (0xD800 <= v && v <= 0xDBFF) {
|
|
// High Surrogate (U+D800 to U+DBFF).
|
|
// This part forms the first two bytes of an eventual 4-byte UTF-8 sequence.
|
|
const unsigned char high_bits_val = (0b0000'1111 & (v >> 6)) + 1;
|
|
|
|
// First byte of the 4-byte UTF-8 sequence (11110xxx).
|
|
ubuf[0] = 0b1111'0000 | (0b0000'0111 & (high_bits_val >> 2));
|
|
// Second byte of the 4-byte UTF-8 sequence (10xxxxxx).
|
|
ubuf[1] = 0b1000'0000 | //
|
|
(0b0011'0000 & (high_bits_val << 4)) | //
|
|
(0b0000'1111 & (v >> 2));
|
|
// Set state for high surrogate after writing to buffer.
|
|
s = {true, static_cast<unsigned char>(0b0000'0011 & v)};
|
|
return 2; // Wrote 2 bytes, expecting 2 more from a low surrogate.
|
|
} else if (0xDC00 <= v && v <= 0xDFFF) {
|
|
// Low Surrogate (U+DC00 to U+DFFF).
|
|
// This part forms the last two bytes of a 4-byte UTF-8 sequence,
|
|
// using state from a preceding high surrogate.
|
|
if (!s.saw_high_surrogate) {
|
|
// Error: Isolated low surrogate without a preceding high surrogate.
|
|
// s remains in its current (problematic) state.
|
|
// Caller should handle error.
|
|
return kError;
|
|
}
|
|
|
|
// Third byte of the 4-byte UTF-8 sequence (10xxxxxx).
|
|
ubuf[0] = 0b1000'0000 | //
|
|
(0b0011'0000 & (s.bits << 4)) | //
|
|
(0b0000'1111 & (v >> 6));
|
|
// Fourth byte of the 4-byte UTF-8 sequence (10xxxxxx).
|
|
ubuf[1] = 0b1000'0000 | (0b0011'1111 & v);
|
|
|
|
s = {}; // Reset surrogate state, pair complete.
|
|
return 2; // Wrote 2 more bytes, completing the 4-byte sequence.
|
|
} else if constexpr (0xFFFF < std::numeric_limits<wchar_t>::max()) {
|
|
// Conditionally compile the 4-byte direct conversion branch.
|
|
// This block is compiled only if wchar_t can represent values > 0xFFFF.
|
|
// It's placed after surrogate checks to ensure surrogates are handled by
|
|
// their specific logic. This inner 'if' is the runtime check for the 4-byte
|
|
// range. At this point, v is known not to be in the 1, 2, or 3-byte BMP
|
|
// ranges, nor is it a surrogate code point.
|
|
if (0x10000 <= v && v <= 0x10FFFF) {
|
|
// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
|
|
ubuf[0] = 0b1111'0000 | (0b0000'0111 & (v >> 18));
|
|
ubuf[1] = 0b1000'0000 | (0b0011'1111 & (v >> 12));
|
|
ubuf[2] = 0b1000'0000 | (0b0011'1111 & (v >> 6));
|
|
ubuf[3] = 0b1000'0000 | (0b0011'1111 & v);
|
|
s = {}; // Reset surrogate state.
|
|
return 4;
|
|
}
|
|
}
|
|
|
|
// Invalid wchar_t value (e.g., out of Unicode range, or unhandled after all
|
|
// checks).
|
|
s = {}; // Reset surrogate state.
|
|
return kError;
|
|
}
|
|
|
|
} // namespace strings_internal
|
|
ABSL_NAMESPACE_END
|
|
} // namespace absl
|