mirror of
https://github.com/abseil/abseil-cpp.git
synced 2026-06-04 12:07:05 +08:00
This is supposed to be a zero-diff change. PiperOrigin-RevId: 756859112 Change-Id: Ia81a84bc5d1e6f2a1299ca0ff5dbcec48583ab76
260 lines
9.3 KiB
C++
260 lines
9.3 KiB
C++
// Copyright 2017 The Abseil Authors.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// https://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
#include "absl/strings/internal/utf8.h"
|
|
|
|
#include <cstddef>
|
|
#include <cstdint>
|
|
#include <cstring>
|
|
#include <string>
|
|
#include <type_traits>
|
|
#include <utility>
|
|
#include <vector>
|
|
|
|
#include "gmock/gmock.h"
|
|
#include "gtest/gtest.h"
|
|
#include "absl/base/port.h"
|
|
#include "absl/strings/string_view.h"
|
|
|
|
namespace {
|
|
|
|
using ::absl::strings_internal::kMaxEncodedUTF8Size;
|
|
using ::absl::strings_internal::ShiftState;
|
|
using ::absl::strings_internal::WideToUtf8;
|
|
using ::testing::StartsWith;
|
|
using ::testing::TestParamInfo;
|
|
using ::testing::TestWithParam;
|
|
using ::testing::ValuesIn;
|
|
|
|
#if !defined(__cpp_char8_t)
|
|
#if defined(__clang__)
|
|
#pragma clang diagnostic push
|
|
#pragma clang diagnostic ignored "-Wc++2a-compat"
|
|
#endif
|
|
// Exercises EncodeUTF8Char() on a handful of code points and compares the
// bytes it produces, and the length it reports, against the compiler's own
// `u8` encoding of the same code point.
TEST(EncodeUTF8Char, BasicFunction) {
  namespace si = absl::strings_internal;
  constexpr int kBufSize = 7;

  const std::pair<char32_t, std::string> kCases[] = {
      {0x0030, u8"\u0030"},
      {0x00A3, u8"\u00A3"},
      {0x00010000, u8"\U00010000"},
      {0x0000FFFF, u8"\U0000FFFF"},
      {0x0010FFFD, u8"\U0010FFFD"},
  };

  for (const auto& [code_point, expected] : kCases) {
    // Encode into two buffers pre-filled with different bytes. A trailing
    // position counts as untouched only if BOTH buffers still hold their fill
    // value there, so an encoded byte that happens to equal one fill value is
    // still detected through the other buffer.
    char zero_filled[kBufSize] = {};
    char ones_filled[kBufSize] = {'\xFF', '\xFF', '\xFF', '\xFF',
                                  '\xFF', '\xFF', '\xFF'};
    char* const zero_end =
        zero_filled + si::EncodeUTF8Char(zero_filled, code_point);
    char* const ones_end =
        ones_filled + si::EncodeUTF8Char(ones_filled, code_point);

    // Walk backwards over the untouched tail to find how many leading bytes
    // were actually modified.
    int apparent_length = kBufSize;
    while (apparent_length > 0 &&
           zero_filled[apparent_length - 1] == '\x00' &&
           ones_filled[apparent_length - 1] == '\xFF') {
      --apparent_length;
    }

    EXPECT_EQ(apparent_length, zero_end - zero_filled);
    EXPECT_EQ(apparent_length, ones_end - ones_filled);
    EXPECT_EQ(apparent_length, expected.length());
    EXPECT_EQ(std::string(zero_filled, apparent_length), expected);
    EXPECT_EQ(std::string(ones_filled, apparent_length), expected);
  }

  // Inputs outside the valid code point range must still report a length no
  // larger than kMaxEncodedUTF8Size.
  char buf[32] = "Don't Tread On Me";
  EXPECT_LE(si::EncodeUTF8Char(buf, 0x00110000), si::kMaxEncodedUTF8Size);
  char buf2[32] = "Negative is invalid but sane";
  EXPECT_LE(si::EncodeUTF8Char(buf2, -1), si::kMaxEncodedUTF8Size);
}
|
|
#if defined(__clang__)
|
|
#pragma clang diagnostic pop
|
|
#endif
|
|
#endif // !defined(__cpp_char8_t)
|
|
|
|
struct WideToUtf8TestCase {
|
|
std::string description;
|
|
wchar_t input;
|
|
std::string expected_utf8_str;
|
|
size_t expected_bytes_written;
|
|
ShiftState initial_state = {false, 0};
|
|
ShiftState expected_state = {false, 0};
|
|
};
|
|
|
|
std::vector<WideToUtf8TestCase> GetWideToUtf8TestCases() {
|
|
constexpr size_t kError = static_cast<size_t>(-1);
|
|
std::vector<WideToUtf8TestCase> cases = {
|
|
{"ASCII_A", L'A', "A", 1},
|
|
{"NullChar", L'\0', std::string("\0", 1), 1},
|
|
{"ASCII_Max_7F", L'\x7F', "\x7F", 1},
|
|
|
|
{"TwoByte_Min_80", L'\u0080', "\xC2\x80", 2},
|
|
{"PoundSign_A3", L'\u00A3', "\xC2\xA3", 2},
|
|
{"TwoByte_Max_7FF", L'\u07FF', "\xDF\xBF", 2},
|
|
|
|
{"ThreeByte_Min_800", L'\u0800', "\xE0\xA0\x80", 3},
|
|
{"EuroSign_20AC", L'\u20AC', "\xE2\x82\xAC", 3},
|
|
{"BMP_MaxBeforeSurrogates_D7FF", L'\uD7FF', "\xED\x9F\xBF", 3},
|
|
{"BMP_FFFF", L'\uFFFF', "\xEF\xBF\xBF", 3},
|
|
|
|
{"IsolatedHighSurr_D800", L'\xD800', "\xF0\x90", 2, {}, {true, 0}},
|
|
{"IsolatedHighSurr_DBFF", L'\xDBFF', "\xF4\x8F", 2, {}, {true, 3}},
|
|
|
|
{"HighSurr_D800_after_HighD800",
|
|
L'\xD800',
|
|
"\xF0\x90",
|
|
2,
|
|
{true, 0},
|
|
{true, 0}},
|
|
{"HighSurr_DBFF_after_HighDBFF",
|
|
L'\xDBFF',
|
|
"\xF4\x8F",
|
|
2,
|
|
{true, 3},
|
|
{true, 3}},
|
|
|
|
{"LowSurr_DC00_after_HighD800", L'\xDC00', "\x80\x80", 2, {true, 0}, {}},
|
|
{"LowSurr_DFFD_after_HighDBFF", L'\xDFFD', "\xBF\xBD", 2, {true, 3}, {}},
|
|
{"LowSurr_DC00_with_InitialState_saw_high_bits_1",
|
|
L'\xDC00',
|
|
"\x90\x80",
|
|
2,
|
|
{true, 1},
|
|
{}},
|
|
|
|
// Final state = initial on error.
|
|
{"Error_IsolatedLowSurr_DC00_NoPriorHigh", L'\xDC00', "", kError, {}, {}},
|
|
{"Error_IsolatedLowSurr_DFFF_NoPriorHigh", L'\xDFFF', "", kError, {}, {}},
|
|
|
|
#if (defined(WCHAR_MAX) && WCHAR_MAX > 0xFFFF)
|
|
{"DirectSupplementaryChars_U10000", static_cast<wchar_t>(0x10000),
|
|
"\xF0\x90\x80\x80", 4},
|
|
{"DirectSupplementaryChars_U10FFFD", static_cast<wchar_t>(0x10FFFD),
|
|
"\xF4\x8F\xBF\xBD", 4},
|
|
#endif
|
|
};
|
|
|
|
wchar_t minus_one = static_cast<wchar_t>(-1);
|
|
if constexpr (sizeof(wchar_t) == 2) {
|
|
cases.push_back({"WChar_MinusOne_as_FFFF", minus_one, "\xEF\xBF\xBF", 3});
|
|
} else {
|
|
cases.push_back(
|
|
{"Error_WChar_MinusOne_as_FFFFFFFF", minus_one, "", kError, {}, {}});
|
|
}
|
|
|
|
if constexpr (sizeof(wchar_t) >= 4) {
|
|
#ifdef WCHAR_MAX
|
|
if (static_cast<uintmax_t>(WCHAR_MAX) >= 0x110000UL) {
|
|
cases.push_back({"Error_OutOfRange_110000",
|
|
static_cast<wchar_t>(0x110000UL),
|
|
"",
|
|
kError,
|
|
{},
|
|
{}});
|
|
}
|
|
#else
|
|
cases.push_back({"Error_OutOfRange_110000_fallback",
|
|
static_cast<wchar_t>(0x110000UL),
|
|
"",
|
|
kError,
|
|
{},
|
|
{}});
|
|
#endif
|
|
}
|
|
return cases;
|
|
}
|
|
|
|
// Value-parameterized fixture; each instance receives one WideToUtf8TestCase
// through GetParam(). It adds no state or setup of its own.
class WideToUtf8ParamTest : public TestWithParam<WideToUtf8TestCase> {};
|
|
|
|
// Converts the test case's single wide character with WideToUtf8() and
// verifies four things: the returned byte count, the bytes at the start of
// the output buffer, that the rest of the buffer was left untouched, and the
// shift state after the call.
TEST_P(WideToUtf8ParamTest, SingleCharConversion) {
  const auto& test_case = GetParam();
  ShiftState state = test_case.initial_state;
  // The buffer is pre-filled with a marker byte so that any write past the
  // expected output is visible.
  constexpr char kFillChar = '\xAB';
  std::string buffer(32, kFillChar);

  size_t bytes_written = WideToUtf8(test_case.input, buffer.data(), state);

  EXPECT_EQ(bytes_written, test_case.expected_bytes_written);
  EXPECT_THAT(buffer, StartsWith(test_case.expected_utf8_str));

  // The remaining bytes should be unchanged. Compare the whole tail, not just
  // the first byte after the expected output, so a stray write anywhere past
  // the encoded character is caught.
  const size_t expected_length = test_case.expected_utf8_str.length();
  ASSERT_LT(expected_length, buffer.size());
  EXPECT_EQ(buffer.substr(expected_length),
            std::string(buffer.size() - expected_length, kFillChar));

  EXPECT_EQ(state.saw_high_surrogate,
            test_case.expected_state.saw_high_surrogate);
  EXPECT_EQ(state.bits, test_case.expected_state.bits);
}
|
|
|
|
// Instantiates WideToUtf8ParamTest once per entry returned by
// GetWideToUtf8TestCases(), naming each instance after the case's
// `description`. The name generator takes its argument by const reference so
// the test case (and the strings it owns) is not copied for every instance.
INSTANTIATE_TEST_SUITE_P(
    WideCharToUtf8Conversion, WideToUtf8ParamTest,
    ValuesIn(GetWideToUtf8TestCases()),
    [](const TestParamInfo<WideToUtf8TestCase>& info) {
      return info.param.description;
    });
|
|
|
|
// Comprehensive test string for validating wchar_t to UTF-8 conversion.
|
|
// This string is designed to cover a variety of Unicode character types and
|
|
// sequences:
|
|
// 1. Basic ASCII characters (within names, numbers, and spacing).
|
|
// 2. Common 2-byte UTF-8 sequences:
|
|
// - Accented Latin characters (e.g., 'á' in "Holá").
|
|
// - Hebrew text with combining vowel points (e.g., "שָׁלוֹם").
|
|
// 3. Common 3-byte UTF-8 sequences:
|
|
// - Currency symbols (e.g., '€').
|
|
// - CJK characters (e.g., "你好", "中").
|
|
// - Components of complex emojis like the Zero Width Joiner (ZWJ) and
|
|
// Heart symbol.
|
|
// 4. Various 4-byte UTF-8 sequences (representing Supplementary Plane
|
|
// characters):
|
|
// - An emoji with a skin tone modifier ("👍🏻").
|
|
// - A flag emoji composed of regional indicators ("🇺🇸").
|
|
// - A complex ZWJ emoji sequence ("👩❤️💋👨") combining
|
|
// SP characters (👩, 💋, 👨) with BMP characters (ZWJ and ❤️).
|
|
// - These are critical for testing the correct handling of surrogate pairs
|
|
// when wchar_t is 2 bytes (e.g., on Windows).
|
|
// The goal is to ensure accurate conversion across a diverse set of
|
|
// characters.
|
|
//
|
|
// clang-format off
|
|
#define WIDE_STRING_LITERAL L"Holá €1 你好 שָׁלוֹם 👍🏻🇺🇸👩❤️💋👨 中"
|
|
#define UTF8_STRING_LITERAL u8"Holá €1 你好 שָׁלוֹם 👍🏻🇺🇸👩❤️💋👨 中"
|
|
// clang-format on
|
|
|
|
// Returns the UTF-8 encoded form of the test string as a view over `char`
// data. The bytes are copied into static storage on the first call and the
// same view is returned on every call.
absl::string_view GetUtf8TestString() {
  // A `u8""` literal guarantees UTF-8 bytes (without it MSVC would fall back
  // to a legacy code page such as CP1252 and warn), but its element type is
  // `char` before C++20 and `char8_t` from C++20 on. Deduce whichever type the
  // compiler uses, initialize an array of it from the literal, and `memcpy()`
  // the bytes into a plain `char` array that a string_view can refer to.
  static const absl::string_view utf8_text = []() -> absl::string_view {
    using Utf8Char = std::remove_reference_t<decltype(*u8"a")>;
    constexpr Utf8Char kSource[] = UTF8_STRING_LITERAL;
    static char bytes[sizeof(kSource)];
    std::memcpy(bytes, kSource, sizeof(kSource));
    // Converting from the NUL-terminated array: the view ends at the first
    // NUL byte.
    return bytes;
  }();

  return utf8_text;
}
|
|
|
|
// Feeds every wide character of WIDE_STRING_LITERAL (including its
// terminating NUL) through WideToUtf8(), appending the output to one buffer
// and carrying the shift state across calls. The result must begin with the
// UTF-8 form of the same text.
TEST(WideToUtf8, FullString) {
  // Sized so that each wide character may produce kMaxEncodedUTF8Size bytes.
  std::string buffer(kMaxEncodedUTF8Size * sizeof(WIDE_STRING_LITERAL), '\0');
  char* buffer_ptr = buffer.data();

  // The same sentinel the parameterized cases in this file expect
  // WideToUtf8() to return for invalid input.
  constexpr size_t kError = static_cast<size_t>(-1);

  ShiftState state;
  for (const wchar_t wc : WIDE_STRING_LITERAL) {
    const size_t bytes_written = WideToUtf8(wc, buffer_ptr, state);
    // Stop before advancing: adding the error sentinel to the pointer would
    // step it backwards out of the written region (undefined behaviour).
    ASSERT_NE(bytes_written, kError)
        << "WideToUtf8 rejected wide character 0x" << std::hex
        << static_cast<unsigned long>(wc);
    buffer_ptr += bytes_written;
  }

  EXPECT_THAT(buffer, StartsWith(GetUtf8TestString()));
}
|
|
|
|
#undef WIDE_STRING_LITERAL
|
|
#undef UTF8_STRING_LITERAL
|
|
|
|
} // namespace
|