From 75ba14a3adacdeaf240e034656f2e7bd74d78a30 Mon Sep 17 00:00:00 2001
From: apio
Date: Sun, 18 Dec 2022 13:04:40 +0100
Subject: [PATCH] Add UTF-8 decoder support!!

---
 luna/CMakeLists.txt      |   1 +
 luna/include/luna/Utf8.h |  55 ++++++++++++++
 luna/src/Utf8.cpp        | 150 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 206 insertions(+)
 create mode 100644 luna/include/luna/Utf8.h
 create mode 100644 luna/src/Utf8.cpp

diff --git a/luna/CMakeLists.txt b/luna/CMakeLists.txt
index 53a8a28b..77f51628 100644
--- a/luna/CMakeLists.txt
+++ b/luna/CMakeLists.txt
@@ -10,6 +10,7 @@ set(FREESTANDING_SOURCES
     src/Stack.cpp
     src/Alloc.cpp
     src/OwnedStringView.cpp
+    src/Utf8.cpp
 )
 
 set(SOURCES
diff --git a/luna/include/luna/Utf8.h b/luna/include/luna/Utf8.h
new file mode 100644
index 00000000..2361c918
--- /dev/null
+++ b/luna/include/luna/Utf8.h
@@ -0,0 +1,55 @@
+#pragma once
+// NOTE(review): the include targets were missing from the copy under review. The names below are inferred
+// from the identifiers this header uses (Option, Result, usize); confirm them against the tree.
+#include <luna/Option.h>
+#include <luna/Result.h>
+#include <luna/Types.h>
+
+// Decodes a complete NUL-terminated UTF-8 string into wide characters.
+// Only the pointer is stored, so 'str' must stay alive for as long as the decoder is used.
+class Utf8StringDecoder
+{
+  public:
+    // Measures 'str' with strlen(), so it must be NUL-terminated.
+    Utf8StringDecoder(const char* str);
+
+    // Length of the string in bytes, as measured by strlen() at construction.
+    usize byte_length() const
+    {
+        return m_byte_length;
+    }
+
+    // Counts the sequences in the string by looking at lead bytes only. Fails with EILSEQ on an invalid lead
+    // byte or on a sequence cut short by the end of the string. Continuation bytes are not inspected, so
+    // decode() may still reject a string that was counted successfully.
+    Result<usize> code_points() const;
+
+    // Decodes the whole string into 'buf' and terminates it with a wide NUL. Fails with EILSEQ on malformed
+    // input; 'buf' then holds the characters decoded so far and is not terminated.
+    // The caller must ensure that 'buf' has room for at least code_points() + 1 wide characters.
+    Result<void> decode(wchar_t* buf) const;
+
+  private:
+    const char* m_str;
+    usize m_byte_length;
+};
+
+// Decodes UTF-8 incrementally, one byte per feed() call.
+class Utf8StateDecoder
+{
+  public:
+    Utf8StateDecoder();
+
+    // Consumes one byte. Returns the decoded character when 'c' completes a sequence and an empty Option while
+    // more bytes are needed. Fails with EILSEQ on an invalid lead byte or a malformed sequence; the decoder
+    // is idle again afterwards, so the next byte is treated as the start of a new sequence.
+    Result<Option<wchar_t>> feed(char c);
+
+    // Discards any partially received sequence.
+    void reset();
+
+  private:
+    char m_state[4];         // Bytes of the sequence being assembled
+    usize m_state_len = 0;   // Total length of that sequence, 0 while idle
+    usize m_state_index = 0; // Position of the most recently stored byte
+};
\ No newline at end of file
diff --git a/luna/src/Utf8.cpp b/luna/src/Utf8.cpp
new file mode 100644
index 00000000..f1c9c666
--- /dev/null
+++ b/luna/src/Utf8.cpp
@@ -0,0 +1,150 @@
+// NOTE(review): the include targets were missing from the copy under review. The names below are inferred
+// from what this file uses (strlen, the Utf8 decoder classes); confirm them against the tree.
+#include <luna/String.h>
+#include <luna/Utf8.h>
+
+// Returns the total length in bytes (1 to 4) of the sequence introduced by lead byte 'c'.
+// Fails with EILSEQ if 'c' is a continuation byte or starts with five or more one bits.
+static Result<usize> utf8_char_length(char c)
+{
+    if ((c & 0b11111000) == 0b11110000) return 4;
+    if ((c & 0b11110000) == 0b11100000) return 3;
+    if ((c & 0b11100000) == 0b11000000) return 2;
+    if ((c & 0b10000000) == 0b00000000) return 1;
+    return err(EILSEQ);
+}
+
+// Decodes the single UTF-8 sequence starting at 'beg'. On entry 'len' is the number of readable bytes at
+// 'beg'; once the sequence is known to fit, 'len' is overwritten with the number of bytes it occupies.
+// Fails with EILSEQ on an invalid lead byte, a truncated sequence, a bad continuation byte, an overlong
+// encoding, a surrogate (U+D800 to U+DFFF) or a value above U+10FFFF.
+static Result<wchar_t> encode_utf8_char_into_wide_char(const char* beg, usize& len)
+{
+    usize utf8_len = TRY(utf8_char_length(*beg));
+    if (utf8_len > len) return err(EILSEQ); // Unterminated sequence
+    len = utf8_len;                         // Enough space for the sequence, let's return the resulting length
+
+    if (len == 1) { return beg[0] & 0x7f; }
+    if (len == 2)
+    {
+        if ((beg[1] & 0b11000000) != 0b10000000) return err(EILSEQ);
+        wchar_t c = beg[0] & 0x1f;
+        c <<= 6;
+        c |= beg[1] & 0x3f;
+        if (c < 0x80) return err(EILSEQ); // Overlong: would fit in one byte
+        return c;
+    }
+    if (len == 3)
+    {
+        if ((beg[1] & 0b11000000) != 0b10000000) return err(EILSEQ);
+        if ((beg[2] & 0b11000000) != 0b10000000) return err(EILSEQ);
+        wchar_t c = beg[0] & 0x0f;
+        c <<= 6;
+        c |= beg[1] & 0x3f;
+        c <<= 6;
+        c |= beg[2] & 0x3f;
+        if (c < 0x800) return err(EILSEQ);                  // Overlong: would fit in two bytes
+        if (c >= 0xd800 && c <= 0xdfff) return err(EILSEQ); // Surrogates are not encodable in UTF-8
+        return c;
+    }
+    if (len == 4)
+    {
+        if ((beg[1] & 0b11000000) != 0b10000000) return err(EILSEQ);
+        if ((beg[2] & 0b11000000) != 0b10000000) return err(EILSEQ);
+        if ((beg[3] & 0b11000000) != 0b10000000) return err(EILSEQ);
+        wchar_t c = beg[0] & 0x07;
+        c <<= 6;
+        c |= beg[1] & 0x3f;
+        c <<= 6;
+        c |= beg[2] & 0x3f;
+        c <<= 6;
+        c |= beg[3] & 0x3f;
+        if (c < 0x10000) return err(EILSEQ);  // Overlong: would fit in three bytes
+        if (c > 0x10ffff) return err(EILSEQ); // Beyond the last Unicode code point
+        return c;
+    }
+
+    unreachable();
+}
+
+// Remembers 'str' (no copy is made) together with its length in bytes.
+Utf8StringDecoder::Utf8StringDecoder(const char* str) : m_str(str), m_byte_length(strlen(str))
+{
+}
+
+// Walks the string lead byte by lead byte and counts the sequences; continuation bytes are skipped unchecked.
+Result<usize> Utf8StringDecoder::code_points() const
+{
+    const char* it = m_str;
+    usize len = 0;
+
+    while ((usize)(it - m_str) < m_byte_length)
+    {
+        usize utf8_len = TRY(utf8_char_length(*it));
+        if ((usize)(it - m_str) + utf8_len > m_byte_length) return err(EILSEQ);
+        it += utf8_len;
+        len++;
+    }
+
+    return len;
+}
+
+// Decodes every sequence into 'buf' in order, then writes the wide NUL terminator.
+Result<void> Utf8StringDecoder::decode(wchar_t* buf) const
+{
+    const char* it = m_str;
+
+    while ((usize)(it - m_str) < m_byte_length)
+    {
+        usize len = m_byte_length - (usize)(it - m_str); // Remaining space
+        *buf = TRY(encode_utf8_char_into_wide_char(it, len));
+        it += len;
+        buf++;
+    }
+
+    *buf = 0;
+
+    return {};
+}
+
+Utf8StateDecoder::Utf8StateDecoder() : m_state_len(0), m_state_index(0)
+{
+}
+
+// Buffers 'c' and decodes the buffered sequence once its last byte has arrived.
+Result<Option<wchar_t>> Utf8StateDecoder::feed(char c)
+{
+    if (m_state_len == 0)
+    {
+        m_state_len = TRY(utf8_char_length(c));
+        if (m_state_len == 1)
+        {
+            m_state_len = 0;
+            return Option<wchar_t>{c & 0x7f};
+        }
+        m_state_index = 0;
+        m_state[m_state_index] = c;
+        return {{}};
+    }
+
+    m_state_index++;
+    m_state[m_state_index] = c;
+
+    if (m_state_index == m_state_len - 1)
+    {
+        usize len = m_state_len;
+        // Return to the idle state before decoding: if the sequence is rejected while the state is still
+        // armed, every later feed() would advance m_state_index further and write past the end of m_state.
+        reset();
+        wchar_t wc = TRY(encode_utf8_char_into_wide_char(m_state, len));
+        return Option<wchar_t>{wc};
+    }
+
+    return {{}};
+}
+
+// Forgets any partially received sequence.
+void Utf8StateDecoder::reset()
+{
+    m_state_index = m_state_len = 0;
+}
\ No newline at end of file