From 6389099808f35242fba49035f0fdc467a883c176 Mon Sep 17 00:00:00 2001 From: apio Date: Sun, 18 Dec 2022 14:34:50 +0100 Subject: [PATCH] UTF-8 part 2: Encoding wide-character strings into UTF-8 We now have Utf8StringEncoder and Utf8Encoder (no state this time) --- luna/include/luna/Utf8.h | 29 +++++++++++- luna/src/Utf8.cpp | 95 ++++++++++++++++++++++++++++++++++++++-- 2 files changed, 120 insertions(+), 4 deletions(-) diff --git a/luna/include/luna/Utf8.h b/luna/include/luna/Utf8.h index 2361c918..25509b73 100644 --- a/luna/include/luna/Utf8.h +++ b/luna/include/luna/Utf8.h @@ -15,7 +15,7 @@ class Utf8StringDecoder Result code_points() const; - // The caller must ensure that 'buf' is at least wide_length() + a NULL wide. + // The caller must ensure that 'buf' is at least code_points() + a NULL wide. Result decode(wchar_t* buf) const; private: @@ -23,6 +23,26 @@ class Utf8StringDecoder usize m_byte_length; }; +class Utf8StringEncoder +{ + public: + Utf8StringEncoder(const wchar_t* str); + + usize code_points() const + { + return m_code_points; + } + + Result byte_length() const; + + // The caller must ensure that 'buf' is at least byte_length() + a NULL byte. + Result encode(char* buf) const; + + private: + const wchar_t* m_str; + usize m_code_points; +}; + class Utf8StateDecoder { public: @@ -35,4 +55,11 @@ class Utf8StateDecoder char m_state[4]; usize m_state_len = 0; usize m_state_index = 0; +}; + +class Utf8Encoder +{ + public: + // Does not null-terminate. Returns the number of bytes written. 
+ Result encode(wchar_t c, char buf[4]); }; \ No newline at end of file diff --git a/luna/src/Utf8.cpp b/luna/src/Utf8.cpp index f1c9c666..4445c355 100644 --- a/luna/src/Utf8.cpp +++ b/luna/src/Utf8.cpp @@ -10,7 +10,52 @@ static Result utf8_char_length(char c) return err(EILSEQ); } -static Result encode_utf8_char_into_wide_char(const char* beg, usize& len) +static Result wide_char_length_as_utf8(wchar_t c) +{ + if (c <= 0x7f) return 1; + if (c <= 0x7ff) return 2; + if (c <= 0xffff) return 3; + if (c <= 0x10ffff) return 4; + return err(EILSEQ); +} + +static Result encode_wide_char_as_utf8(wchar_t c, char* result, usize& len) +{ + len = TRY(wide_char_length_as_utf8(c)); + + u8* buf = (u8*)result; + + if (len == 1) + { + buf[0] = c & 0x7f; + return {}; + } + if (len == 2) + { + buf[0] = 0b11000000 | ((c & 0x7c0) >> 6); + buf[1] = 0b10000000 | (c & 0x3f); + return {}; + } + if (len == 3) + { + buf[0] = 0b11100000 | ((c & 0xf000) >> 12); + buf[1] = 0b10000000 | ((c & 0xfc0) >> 6); + buf[2] = 0b10000000 | (c & 0x3f); + return {}; + } + if (len == 4) + { + buf[0] = 0b11110000 | ((c & 0x1c0000) >> 18); + buf[1] = 0b10000000 | ((c & 0x3f000) >> 12); + buf[2] = 0b10000000 | ((c & 0xfc0) >> 6); + buf[3] = 0b10000000 | (c & 0x3f); + return {}; + } + + unreachable(); +} + +static Result encode_utf8_as_wide_char(const char* beg, usize& len) { usize utf8_len = TRY(utf8_char_length(*beg)); if (utf8_len > len) return err(EILSEQ); // Unterminated sequence @@ -82,7 +127,7 @@ Result Utf8StringDecoder::decode(wchar_t* buf) const while ((usize)(it - m_str) < m_byte_length) { usize len = m_byte_length - (usize)(it - m_str); // Remaining space - *buf = TRY(encode_utf8_char_into_wide_char(it, len)); + *buf = TRY(encode_utf8_as_wide_char(it, len)); it += len; buf++; } @@ -92,6 +137,41 @@ Result Utf8StringDecoder::decode(wchar_t* buf) const return {}; } +Utf8StringEncoder::Utf8StringEncoder(const wchar_t* str) : m_str(str), m_code_points(wcslen(str)) +{ +} + +Result 
Utf8StringEncoder::byte_length() const +{ + const wchar_t* it = m_str; + usize len = 0; + + while (*it) + { + len += TRY(wide_char_length_as_utf8(*it)); + it++; + } + + return len; +} + +Result Utf8StringEncoder::encode(char* buf) const +{ + const wchar_t* it = m_str; + + while (*it) + { + usize len = 0; + TRY(encode_wide_char_as_utf8(*it, buf, len)); + buf += len; + it++; + } + + *buf = 0; + + return {}; +} + Utf8StateDecoder::Utf8StateDecoder() : m_state_len(0), m_state_index(0) { } @@ -117,7 +197,7 @@ Result> Utf8StateDecoder::feed(char c) if (m_state_index == m_state_len - 1) { usize len = m_state_len; - wchar_t wc = TRY(encode_utf8_char_into_wide_char(m_state, len)); + wchar_t wc = TRY(encode_utf8_as_wide_char(m_state, len)); m_state_len = 0; return Option{wc}; } @@ -128,4 +208,13 @@ Result> Utf8StateDecoder::feed(char c) void Utf8StateDecoder::reset() { m_state_index = m_state_len = 0; +} + +Result Utf8Encoder::encode(wchar_t c, char buf[4]) +{ + usize len = 0; + + TRY(encode_wide_char_as_utf8(c, buf, len)); + + return len; } \ No newline at end of file