Add UTF-8 decoder support!!

2022-12-18 13:04:40 +01:00 · 2022-12-18 13:04:40 +01:00 · 75ba14a3ad
commit 75ba14a3ad
parent 23d405bbda
3 changed files with 170 additions and 0 deletions
--- a/luna/CMakeLists.txt
+++ b/luna/CMakeLists.txt
@ -10,6 +10,7 @@ set(FREESTANDING_SOURCES
    src/Stack.cpp
    src/Alloc.cpp
    src/OwnedStringView.cpp
    src/Utf8.cpp
 )
 set(SOURCES
--- a/luna/include/luna/Utf8.h
+++ b/luna/include/luna/Utf8.h
@ -0,0 +1,38 @@
 #pragma once
 #include <luna/Result.h>
 #include <luna/Types.h>
 #include <stddef.h>
 // Decodes a complete, NUL-terminated UTF-8 string into wide characters.
 // The decoder only stores the pointer it is given; it does not copy the string.
 class Utf8StringDecoder
 {
  public:
    // 'str' must be NUL-terminated: its byte length is measured with strlen() at construction.
    Utf8StringDecoder(const char* str);
    // Number of bytes in the string, not counting the terminating NUL.
    usize byte_length() const
    {
        return m_byte_length;
    }
    // Counts the UTF-8 sequences in the string by looking at each leading byte.
    // Fails with EILSEQ on an invalid leading byte or a sequence that runs past the end of the string.
    Result<usize> code_points() const;
    // Decodes the whole string into 'buf' and appends a terminating NUL wide character.
    // The caller must ensure that 'buf' can hold at least code_points() wide characters plus that terminator.
    // Fails with EILSEQ on malformed input; 'buf' may already have been partially written in that case.
    Result<void> decode(wchar_t* buf) const;
  private:
    const char* m_str;    // The string being decoded (not owned).
    usize m_byte_length;  // strlen(m_str), cached at construction.
 };
 // Incremental UTF-8 decoder for input that arrives one byte at a time.
 class Utf8StateDecoder
 {
  public:
    Utf8StateDecoder();
    // Feeds one byte to the decoder. Returns a wide character once a sequence is complete,
    // an empty Option while more bytes are still needed, or an error for invalid input.
    Result<Option<wchar_t>> feed(char c);
    // Discards any partially received sequence.
    void reset();
  private:
    char m_state[4];         // Bytes of the sequence currently being collected.
    usize m_state_len = 0;   // Total length of the current sequence; 0 means no sequence is in progress.
    usize m_state_index = 0; // Index in m_state of the most recently stored byte.
 };
--- a/luna/src/Utf8.cpp
+++ b/luna/src/Utf8.cpp
@ -0,0 +1,131 @@
 #include <luna/CString.h>
 #include <luna/Utf8.h>
 // Returns the total length in bytes (1 to 4) of the UTF-8 sequence introduced by the leading byte 'c'.
 // Fails with EILSEQ if 'c' cannot start a sequence: continuation bytes (0x80-0xbf) and 0xf8-0xff.
 static Result<usize> utf8_char_length(char c)
 {
    // Work on the unsigned byte value so the ranges below read naturally even when 'char' is signed.
    const int lead = c & 0xff;
    if (lead < 0x80) return 1;                 // 0xxxxxxx
    if (lead >= 0xc0 && lead < 0xe0) return 2; // 110xxxxx
    if (lead >= 0xe0 && lead < 0xf0) return 3; // 1110xxxx
    if (lead >= 0xf0 && lead < 0xf8) return 4; // 11110xxx
    return err(EILSEQ);
 }
 // Decodes the single UTF-8 sequence that starts at 'beg' into a wide character.
 // (Despite the name, this converts UTF-8 *into* a wide character; nothing is encoded.)
 //
 // 'len' is an in/out parameter: on entry it holds the number of bytes available at 'beg',
 // and once the sequence is known to fit it is overwritten with the number of bytes the sequence occupies.
 //
 // Fails with EILSEQ if:
 //  - the leading byte is invalid or the sequence does not fit in the available bytes,
 //  - a trailing byte is not a continuation byte (10xxxxxx),
 //  - the sequence is overlong (uses more bytes than the code point needs),
 //  - the result is a UTF-16 surrogate (U+D800-U+DFFF) or lies above U+10FFFF.
 static Result<wchar_t> encode_utf8_char_into_wide_char(const char* beg, usize& len)
 {
    usize utf8_len = TRY(utf8_char_length(*beg));
    if (utf8_len > len) return err(EILSEQ); // Unterminated sequence
    len = utf8_len;                         // Enough space for the sequence, let's return the resulting length
    if (len == 1) { return beg[0] & 0x7f; }
    if (len >= 2 && len <= 4)
    {
        // Payload bits carried by the leading byte, and the smallest code point that genuinely
        // requires a sequence of this length. Anything below that minimum is an overlong encoding.
        const int lead_mask = (len == 2) ? 0x1f : ((len == 3) ? 0x0f : 0x07);
        const wchar_t minimum = (len == 2) ? 0x80 : ((len == 3) ? 0x800 : 0x10000);
        wchar_t c = beg[0] & lead_mask;
        for (usize i = 1; i < len; i++)
        {
            if ((beg[i] & 0b11000000) != 0b10000000) return err(EILSEQ);
            c <<= 6;
            c |= beg[i] & 0x3f;
        }
        // Overlong forms must be rejected: otherwise e.g. 0xc0 0x80 would decode to NUL and
        // smuggle a terminator (or any other filtered character) past byte-level checks.
        if (c < minimum) return err(EILSEQ);
        // Surrogates are reserved for UTF-16 and are not valid in UTF-8.
        if (c >= 0xd800 && c <= 0xdfff) return err(EILSEQ);
        if (c > 0x10ffff) return err(EILSEQ);
        return c;
    }
    unreachable();
 }
 // Remembers 'str' (without copying it) and measures its byte length once, up front.
 // 'str' must be NUL-terminated, since the length comes from strlen().
 Utf8StringDecoder::Utf8StringDecoder(const char* str)
    : m_str(str),
      m_byte_length(strlen(str))
 {
 }
 // Counts the UTF-8 sequences in the string.
 // Only leading bytes are examined here: each one says how many bytes to skip.
 // Continuation bytes are not validated by this function.
 // Fails with EILSEQ on an invalid leading byte or if the last sequence is cut short.
 Result<usize> Utf8StringDecoder::code_points() const
 {
    usize count = 0;
    usize offset = 0;
    while (offset < m_byte_length)
    {
        const usize seq_len = TRY(utf8_char_length(m_str[offset]));
        // The sequence must end at or before the end of the string.
        if (offset + seq_len > m_byte_length) return err(EILSEQ);
        offset += seq_len;
        ++count;
    }
    return count;
 }
 // Decodes the whole string into 'buf', one wide character per UTF-8 sequence,
 // then writes a terminating NUL wide character.
 // On error (EILSEQ), decoding stops; characters decoded so far remain in 'buf' without a terminator.
 Result<void> Utf8StringDecoder::decode(wchar_t* buf) const
 {
    wchar_t* out = buf;
    usize consumed = 0;
    while (consumed < m_byte_length)
    {
        // In: bytes still available. Out: bytes used by the sequence that was just decoded.
        usize seq_len = m_byte_length - consumed;
        const wchar_t wc = TRY(encode_utf8_char_into_wide_char(m_str + consumed, seq_len));
        *out = wc;
        ++out;
        consumed += seq_len;
    }
    *out = 0;
    return {};
 }
 // Starts with no sequence in progress.
 Utf8StateDecoder::Utf8StateDecoder()
    : m_state_len(0),
      m_state_index(0)
 {
 }
 // Feeds a single byte to the decoder.
 //
 // Returns:
 //  - a wide character when 'c' completes a sequence (or is a single-byte character),
 //  - an empty Option when 'c' was stored and more bytes are needed,
 //  - an error if 'c' is not a valid leading byte, or if the completed sequence fails to decode.
 //
 // After an error the decoder is always back in its idle state, so the next byte fed is treated
 // as the start of a new sequence.
 Result<Option<wchar_t>> Utf8StateDecoder::feed(char c)
 {
    if (m_state_len == 0)
    {
        // Idle: 'c' must be a leading byte. If TRY bails out here, m_state_len is left at 0.
        m_state_len = TRY(utf8_char_length(c));
        if (m_state_len == 1)
        {
            m_state_len = 0;
            return Option<wchar_t>{c & 0x7f};
        }
        m_state_index = 0;
        m_state[m_state_index] = c;
        return {{}};
    }
    m_state_index++;
    m_state[m_state_index] = c;
    if (m_state_index == m_state_len - 1)
    {
        usize len = m_state_len;
        // Go back to idle *before* decoding. If the decode fails, TRY returns early; leaving the old
        // state in place would make the next call advance m_state_index past the end of m_state.
        m_state_len = 0;
        m_state_index = 0;
        wchar_t wc = TRY(encode_utf8_char_into_wide_char(m_state, len));
        return Option<wchar_t>{wc};
    }
    return {{}};
 }
 // Drops any partially collected sequence; the next byte fed starts a new one.
 void Utf8StateDecoder::reset()
 {
    m_state_len = 0;
    m_state_index = 0;
 }