mirror of
https://github.com/abseil/abseil-cpp.git
synced 2026-06-04 20:14:23 +08:00
259 lines
10 KiB
C++
259 lines
10 KiB
C++
// Copyright 2024 The Abseil Authors
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// https://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
#include "absl/debugging/internal/decode_rust_punycode.h"
|
|
|
|
#include <cstddef>
|
|
#include <cstdint>
|
|
#include <cstring>
|
|
|
|
#include "absl/base/config.h"
|
|
#include "absl/base/nullability.h"
|
|
#include "absl/debugging/internal/bounded_utf8_length_sequence.h"
|
|
#include "absl/debugging/internal/utf8_for_code_point.h"
|
|
|
|
namespace absl {
|
|
ABSL_NAMESPACE_BEGIN
|
|
namespace debugging_internal {
|
|
|
|
namespace {
|
|
|
|
// Decoding Punycode requires repeated random-access insertion into a stream of
|
|
// variable-length UTF-8 code-point encodings. We need this to be tolerably
|
|
// fast (no N^2 slowdown for unfortunate inputs), and we can't allocate any data
|
|
// structures on the heap (async-signal-safety).
|
|
//
|
|
// It is pragmatic to impose a moderately low limit on the identifier length and
|
|
// bail out if we ever hit it. Then BoundedUtf8LengthSequence efficiently
|
|
// determines where to insert the next code point, and memmove efficiently makes
|
|
// room for it.
|
|
//
|
|
// The chosen limit is a round number several times larger than identifiers
|
|
// expected in practice, yet still small enough that a memmove of this many
|
|
// UTF-8 characters is not much more expensive than the division and modulus
|
|
// operations that Punycode decoding requires.
|
|
constexpr uint32_t kMaxChars = 256;
|
|
|
|
// Constants from RFC 3492 section 5.
|
|
constexpr uint32_t kBase = 36, kTMin = 1, kTMax = 26, kSkew = 38, kDamp = 700;
|
|
|
|
constexpr uint32_t kMaxCodePoint = 0x10ffff;
|
|
|
|
// Overflow threshold in DecodeRustPunycode's inner loop; see comments there.
|
|
constexpr uint32_t kMaxI = 1 << 30;
|
|
|
|
// If punycode_begin .. punycode_end begins with a prefix matching the regular
|
|
// expression [0-9a-zA-Z_]+_, removes that prefix, copies all but the final
|
|
// underscore into out_begin .. out_end, sets num_ascii_chars to the number of
|
|
// bytes copied, and returns true. (A prefix of this sort represents the
|
|
// nonempty subsequence of ASCII characters in the corresponding plaintext.)
|
|
//
|
|
// If punycode_begin .. punycode_end does not contain an underscore, sets
|
|
// num_ascii_chars to zero and returns true. (The encoding of a plaintext
|
|
// without any ASCII characters does not carry such a prefix.)
|
|
//
|
|
// Returns false and zeroes num_ascii_chars on failure (either parse error or
|
|
// not enough space in the output buffer).
|
|
bool ConsumeOptionalAsciiPrefix(const char*& punycode_begin,
|
|
const char* const punycode_end,
|
|
char* const out_begin,
|
|
char* const out_end,
|
|
uint32_t& num_ascii_chars) {
|
|
num_ascii_chars = 0;
|
|
|
|
// Remember the last underscore if any. Also use the same string scan to
|
|
// reject any ASCII bytes that do not belong in an identifier, including NUL,
|
|
// as well as non-ASCII bytes, which should have been delta-encoded instead.
|
|
int last_underscore = -1;
|
|
for (int i = 0; i < punycode_end - punycode_begin; ++i) {
|
|
const char c = punycode_begin[i];
|
|
if (c == '_') {
|
|
last_underscore = i;
|
|
continue;
|
|
}
|
|
// We write out the meaning of absl::ascii_isalnum rather than call that
|
|
// function because its documentation does not promise it will remain
|
|
// async-signal-safe under future development.
|
|
if ('a' <= c && c <= 'z') continue;
|
|
if ('A' <= c && c <= 'Z') continue;
|
|
if ('0' <= c && c <= '9') continue;
|
|
return false;
|
|
}
|
|
|
|
// If there was no underscore, that means there were no ASCII characters in
|
|
// the plaintext, so there is no prefix to consume. Our work is done.
|
|
if (last_underscore < 0) return true;
|
|
|
|
// Otherwise there will be an underscore delimiter somewhere. It can't be
|
|
// initial because then there would be no ASCII characters to its left, and no
|
|
// delimiter would have been added in that case.
|
|
if (last_underscore == 0) return false;
|
|
|
|
// Any other position is reasonable. Make sure there's room in the buffer.
|
|
if (last_underscore + 1 > out_end - out_begin) return false;
|
|
|
|
// Consume and write out the ASCII characters.
|
|
num_ascii_chars = static_cast<uint32_t>(last_underscore);
|
|
std::memcpy(out_begin, punycode_begin, num_ascii_chars);
|
|
out_begin[num_ascii_chars] = '\0';
|
|
punycode_begin += num_ascii_chars + 1;
|
|
return true;
|
|
}
|
|
|
|
// Returns the value of `c` as a base-36 digit according to RFC 3492 section 5,
|
|
// or -1 if `c` is not such a digit.
|
|
int DigitValue(char c) {
|
|
if ('0' <= c && c <= '9') return c - '0' + 26;
|
|
if ('a' <= c && c <= 'z') return c - 'a';
|
|
if ('A' <= c && c <= 'Z') return c - 'A';
|
|
return -1;
|
|
}
|
|
|
|
// Consumes the next delta encoding from punycode_begin .. punycode_end,
|
|
// updating i accordingly. Returns true on success. Returns false on parse
|
|
// failure or arithmetic overflow.
|
|
bool ScanNextDelta(const char*& punycode_begin, const char* const punycode_end,
|
|
uint32_t bias, uint32_t& i) {
|
|
uint64_t w = 1; // 64 bits to prevent overflow in w *= kBase - t
|
|
|
|
// "for k = base to infinity in steps of base do begin ... end" in RFC 3492
|
|
// section 6.2. Each loop iteration scans one digit of the delta.
|
|
for (uint32_t k = kBase; punycode_begin != punycode_end; k += kBase) {
|
|
const int digit_value = DigitValue(*punycode_begin++);
|
|
if (digit_value < 0) return false;
|
|
|
|
// Compute this in 64-bit arithmetic so we can check for overflow afterward.
|
|
const uint64_t new_i = i + static_cast<uint64_t>(digit_value) * w;
|
|
|
|
// Valid deltas are bounded by (#chars already emitted) * kMaxCodePoint, but
|
|
// invalid input could encode an arbitrarily large delta. Nip that in the
|
|
// bud here.
|
|
static_assert(
|
|
kMaxI >= kMaxChars * kMaxCodePoint,
|
|
"kMaxI is too small to prevent spurious failures on good input");
|
|
if (new_i > kMaxI) return false;
|
|
|
|
static_assert(
|
|
kMaxI < (uint64_t{1} << 32),
|
|
"Make kMaxI smaller or i 64 bits wide to prevent silent wraparound");
|
|
i = static_cast<uint32_t>(new_i);
|
|
|
|
// Compute the threshold that determines whether this is the last digit and
|
|
// (if not) what the next digit's place value will be. This logic from RFC
|
|
// 3492 section 6.2 is explained in section 3.3.
|
|
uint32_t t;
|
|
if (k <= bias + kTMin) {
|
|
t = kTMin;
|
|
} else if (k >= bias + kTMax) {
|
|
t = kTMax;
|
|
} else {
|
|
t = k - bias;
|
|
}
|
|
if (static_cast<uint32_t>(digit_value) < t) return true;
|
|
|
|
// If this gets too large, the range check on new_i in the next iteration
|
|
// will catch it. We know this multiplication will not overwrap because w
|
|
// is 64 bits wide.
|
|
w *= kBase - t;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
} // namespace
|
|
|
|
absl::Nullable<char*> DecodeRustPunycode(DecodeRustPunycodeOptions options) {
|
|
const char* punycode_begin = options.punycode_begin;
|
|
const char* const punycode_end = options.punycode_end;
|
|
char* const out_begin = options.out_begin;
|
|
char* const out_end = options.out_end;
|
|
|
|
// Write a NUL terminator first. Later memcpy calls will keep bumping it
|
|
// along to its new right place.
|
|
const size_t out_size = static_cast<size_t>(out_end - out_begin);
|
|
if (out_size == 0) return nullptr;
|
|
*out_begin = '\0';
|
|
|
|
// RFC 3492 section 6.2 begins here. We retain the names of integer variables
|
|
// appearing in that text.
|
|
uint32_t n = 128, i = 0, bias = 72, num_chars = 0;
|
|
|
|
// If there are any ASCII characters, consume them and their trailing
|
|
// underscore delimiter.
|
|
if (!ConsumeOptionalAsciiPrefix(punycode_begin, punycode_end,
|
|
out_begin, out_end, num_chars)) {
|
|
return nullptr;
|
|
}
|
|
uint32_t total_utf8_bytes = num_chars;
|
|
|
|
BoundedUtf8LengthSequence<kMaxChars> utf8_lengths;
|
|
|
|
// "while the input is not exhausted do begin ... end"
|
|
while (punycode_begin != punycode_end) {
|
|
if (num_chars >= kMaxChars) return nullptr;
|
|
|
|
const uint32_t old_i = i;
|
|
|
|
if (!ScanNextDelta(punycode_begin, punycode_end, bias, i)) return nullptr;
|
|
|
|
// Update bias as in RFC 3492 section 6.1. (We have inlined adapt.)
|
|
uint32_t delta = i - old_i;
|
|
delta /= (old_i == 0 ? kDamp : 2);
|
|
delta += delta/(num_chars + 1);
|
|
bias = 0;
|
|
while (delta > ((kBase - kTMin) * kTMax)/2) {
|
|
delta /= kBase - kTMin;
|
|
bias += kBase;
|
|
}
|
|
bias += ((kBase - kTMin + 1) * delta)/(delta + kSkew);
|
|
|
|
// Back in section 6.2, compute the new code point and insertion index.
|
|
static_assert(
|
|
kMaxI + kMaxCodePoint < (uint64_t{1} << 32),
|
|
"Make kMaxI smaller or n 64 bits wide to prevent silent wraparound");
|
|
n += i/(num_chars + 1);
|
|
i %= num_chars + 1;
|
|
|
|
// To actually insert, we need to convert the code point n to UTF-8 and the
|
|
// character index i to an index into the byte stream emitted so far. First
|
|
// prepare the UTF-8 encoding for n, rejecting surrogates, overlarge values,
|
|
// and anything that won't fit into the remaining output storage.
|
|
Utf8ForCodePoint utf8_for_code_point(n);
|
|
if (!utf8_for_code_point.ok()) return nullptr;
|
|
if (total_utf8_bytes + utf8_for_code_point.length + 1 > out_size) {
|
|
return nullptr;
|
|
}
|
|
|
|
// Now insert the new character into both our length map and the output.
|
|
uint32_t n_index =
|
|
utf8_lengths.InsertAndReturnSumOfPredecessors(
|
|
i, utf8_for_code_point.length);
|
|
std::memmove(
|
|
out_begin + n_index + utf8_for_code_point.length, out_begin + n_index,
|
|
total_utf8_bytes + 1 - n_index);
|
|
std::memcpy(out_begin + n_index, utf8_for_code_point.bytes,
|
|
utf8_for_code_point.length);
|
|
total_utf8_bytes += utf8_for_code_point.length;
|
|
++num_chars;
|
|
|
|
// Finally, advance to the next state before continuing.
|
|
++i;
|
|
}
|
|
|
|
return out_begin + total_utf8_bytes;
|
|
}
|
|
|
|
} // namespace debugging_internal
|
|
ABSL_NAMESPACE_END
|
|
} // namespace absl
|