From 16bf8b38ea536ca4cc1b7ccc2d90f6bc3fbfd804 Mon Sep 17 00:00:00 2001
From: apio
Date: Wed, 21 Dec 2022 20:08:43 +0100
Subject: [PATCH] UTF-8 decoder: Error out on overlong encodings

---
 luna/src/Utf8.cpp | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/luna/src/Utf8.cpp b/luna/src/Utf8.cpp
index 0e543381..7ebc0c51 100644
--- a/luna/src/Utf8.cpp
+++ b/luna/src/Utf8.cpp
@@ -21,6 +21,14 @@ static Result wide_char_length_as_utf8(wchar_t c)
     return err(EILSEQ);
 }
 
+static inline usize wide_char_length_as_utf8_unchecked(wchar_t c)
+{
+    if (c <= 0x7f) return 1;
+    if (c <= 0x7ff) return 2;
+    if (c <= 0xffff) return 3;
+    return 4;
+}
+
 static Result encode_wide_char_as_utf8(wchar_t c, char* result, usize& len)
 {
     len = TRY(wide_char_length_as_utf8(c));
@@ -57,7 +65,7 @@ static Result encode_wide_char_as_utf8(wchar_t c, char* result, usize& len
     unreachable();
 }
 
-static Result encode_utf8_as_wide_char(const char* beg, usize& len)
+static Result encode_utf8_as_wide_char_impl(const char* beg, usize& len)
 {
     usize utf8_len = TRY(utf8_char_length(*beg));
     if (utf8_len > len) return err(EILSEQ); // Unterminated sequence
@@ -102,6 +110,19 @@ static Result encode_utf8_as_wide_char(const char* beg, usize& len)
     unreachable();
 }
 
+static Result encode_utf8_as_wide_char(const char* beg, usize& len)
+{
+    wchar_t result = TRY(encode_utf8_as_wide_char_impl(beg, len));
+    // NOTE: We already know this is a valid code-point, since we constructed it ourselves and already checked the
+    // range.
+    if (len != wide_char_length_as_utf8_unchecked(result))
+    {
+        // OVERLONG ENCODING!! This is nasty, error out.
+        return err(EILSEQ);
+    }
+    return result;
+}
+
 Utf8StringDecoder::Utf8StringDecoder(const char* str) : m_str(str), m_byte_length(strlen(str))
 {
 }