From 75ba14a3adacdeaf240e034656f2e7bd74d78a30 Mon Sep 17 00:00:00 2001
From: apio <blobs.trading@gmail.com>
Date: Sun, 18 Dec 2022 13:04:40 +0100
Subject: [PATCH] Add UTF-8 decoder support!!

---
 luna/CMakeLists.txt      |   1 +
 luna/include/luna/Utf8.h |  38 ++++++++++++
 luna/src/Utf8.cpp        | 131 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 170 insertions(+)
 create mode 100644 luna/include/luna/Utf8.h
 create mode 100644 luna/src/Utf8.cpp
diff --git a/luna/CMakeLists.txt b/luna/CMakeLists.txt
index 53a8a28b..77f51628 100644
--- a/luna/CMakeLists.txt
+++ b/luna/CMakeLists.txt
@@ -10,6 +10,7 @@ set(FREESTANDING_SOURCES
     src/Stack.cpp
     src/Alloc.cpp
     src/OwnedStringView.cpp
+    src/Utf8.cpp
 )
 
 set(SOURCES
diff --git a/luna/include/luna/Utf8.h b/luna/include/luna/Utf8.h
new file mode 100644
index 00000000..2361c918
--- /dev/null
+++ b/luna/include/luna/Utf8.h
@@ -0,0 +1,38 @@
+#pragma once
+#include <luna/Result.h>
+#include <luna/Types.h>
+#include <stddef.h>
+
+class Utf8StringDecoder // Decodes a whole NUL-terminated UTF-8 string into wide characters.
+{
+  public:
+    Utf8StringDecoder(const char* str); // Stores the pointer (the string is not copied) and caches strlen(str).
+
+    usize byte_length() const // Length of the string in bytes, as measured by strlen() at construction.
+    {
+        return m_byte_length;
+    }
+
+    Result<usize> code_points() const; // Number of sequences; EILSEQ on a bad lead byte or a truncated tail.
+
+    // The caller must ensure that 'buf' has room for code_points() wide characters plus a terminating null one.
+    Result<void> decode(wchar_t* buf) const;
+
+  private:
+    const char* m_str;   // Borrowed pointer to the string being decoded.
+    usize m_byte_length; // strlen(m_str), computed once in the constructor.
+};
+
+class Utf8StateDecoder // Incremental UTF-8 decoder that is fed one byte at a time.
+{
+  public:
+    Utf8StateDecoder();
+
+    Result<Option<wchar_t>> feed(char c); // Yields a character once 'c' completes a sequence, else an empty Option.
+    void reset();                         // Discards any partially received sequence.
+
+  private:
+    char m_state[4];         // Bytes of the sequence currently being received.
+    usize m_state_len = 0;   // Total length of that sequence; 0 when no sequence is in progress.
+    usize m_state_index = 0; // Index in m_state of the most recently stored byte.
+};
\ No newline at end of file
diff --git a/luna/src/Utf8.cpp b/luna/src/Utf8.cpp
new file mode 100644
index 00000000..f1c9c666
--- /dev/null
+++ b/luna/src/Utf8.cpp
@@ -0,0 +1,131 @@
+#include <luna/CString.h>
+#include <luna/Utf8.h>
+
+static Result<usize> utf8_char_length(char c) // Length in bytes of the sequence whose lead byte is 'c'.
+{
+    if ((c & 0x80) == 0x00) return 1; // 0xxxxxxx: single byte
+    if ((c & 0xe0) == 0xc0) return 2; // 110xxxxx: lead byte of a 2-byte sequence
+    if ((c & 0xf0) == 0xe0) return 3; // 1110xxxx: lead byte of a 3-byte sequence
+    if ((c & 0xf8) == 0xf0) return 4; // 11110xxx: lead byte of a 4-byte sequence
+    return err(EILSEQ);               // 10xxxxxx (continuation byte) or 11111xxx: not a lead byte
+}
+
+// Decodes the single UTF-8 sequence starting at 'beg' into one code point.
+//
+// On entry 'len' is the number of bytes readable at 'beg'. It is overwritten with the length of the sequence as
+// soon as that length is known to fit, so it has also been updated when one of the later checks fails.
+//
+// Fails with EILSEQ if:
+//  - the lead byte is invalid, or the sequence needs more bytes than are available,
+//  - a trailing byte is not a continuation byte (10xxxxxx),
+//  - the encoding is overlong, i.e. the value would fit in a shorter sequence (such as C0 80 for U+0000),
+//  - the value is a UTF-16 surrogate (U+D800..U+DFFF) or lies beyond U+10FFFF.
+static Result<wchar_t> encode_utf8_char_into_wide_char(const char* beg, usize& len)
+{
+    usize utf8_len = TRY(utf8_char_length(*beg));
+    if (utf8_len > len) return err(EILSEQ); // Unterminated sequence
+    len = utf8_len;                         // Enough space for the sequence, let's return the resulting length
+
+    if (len == 1) { return beg[0] & 0x7f; }
+
+    // Validate every trailing byte up front, before any of them is folded into the result.
+    for (usize i = 1; i < len; i++)
+    {
+        if ((beg[i] & 0b11000000) != 0b10000000) return err(EILSEQ);
+    }
+
+    // The lead byte carries 5/4/3 payload bits for 2/3/4-byte sequences; each continuation byte adds 6 more.
+    wchar_t c = beg[0] & (0xff >> (len + 1));
+    for (usize i = 1; i < len; i++)
+    {
+        c <<= 6;
+        c |= beg[i] & 0x3f;
+    }
+
+    // Smallest code point that genuinely needs 'len' bytes; anything below it is an overlong encoding.
+    const wchar_t minimum = (len == 2) ? 0x80 : (len == 3) ? 0x800 : 0x10000;
+    if (c < minimum) return err(EILSEQ);
+
+    // Surrogates are reserved for UTF-16 and must not appear in UTF-8.
+    if (c >= 0xd800 && c <= 0xdfff) return err(EILSEQ);
+
+    // Beyond the last Unicode code point (only possible with a 4-byte sequence).
+    if (c > 0x10ffff) return err(EILSEQ);
+
+    return c;
+}
+
+Utf8StringDecoder::Utf8StringDecoder(const char* str) : m_str(str), m_byte_length(strlen(str))
+{
+}
+
+Result<usize> Utf8StringDecoder::code_points() const
+{
+    // Counts sequences by hopping from lead byte to lead byte; continuation bytes are skipped, not inspected.
+    usize offset = 0;
+    usize count = 0;
+    while (offset < m_byte_length)
+    {
+        const usize seq_len = TRY(utf8_char_length(m_str[offset]));
+        if (seq_len > m_byte_length - offset) return err(EILSEQ); // Sequence would run past the end
+        offset += seq_len;
+        count++;
+    }
+
+    return count;
+}
+
+Result<void> Utf8StringDecoder::decode(wchar_t* buf) const
+{
+    // Stores one wide character per sequence, then a 0. On a decoding error, returns it without terminating.
+    const char* const end = m_str + m_byte_length;
+    for (const char* cursor = m_str; cursor < end;)
+    {
+        usize consumed = (usize)(end - cursor); // In: bytes remaining. Out: length of the decoded sequence.
+        *buf = TRY(encode_utf8_char_into_wide_char(cursor, consumed));
+        cursor += consumed;
+        buf++;
+    }
+
+    *buf = 0;
+
+    return {};
+}
+
+Utf8StateDecoder::Utf8StateDecoder() : m_state_len(0), m_state_index(0)
+{
+}
+
+// Feeds one byte of a UTF-8 stream to the decoder.
+// Returns the decoded character when 'c' completes a sequence, an empty Option while more bytes are needed, and
+// EILSEQ when 'c' is not a valid lead byte or the completed sequence is invalid. After an error the decoder is
+// idle again, so the next byte is treated as the start of a new sequence without an explicit reset().
+Result<Option<wchar_t>> Utf8StateDecoder::feed(char c)
+{
+    if (m_state_len == 0)
+    {
+        const usize expected = TRY(utf8_char_length(c)); // On error m_state_len is untouched: still idle
+        if (expected == 1) return Option<wchar_t>{c & 0x7f};
+
+        // Multi-byte sequence: remember its length and lead byte, then wait for the rest.
+        m_state_len = expected;
+        m_state_index = 0;
+        m_state[0] = c;
+        return {{}};
+    }
+
+    m_state[++m_state_index] = c;
+    if (m_state_index != m_state_len - 1) return {{}};
+
+    // Last byte of the sequence. Go idle *before* decoding: TRY returns early on an invalid sequence, and a
+    // stale m_state_len would make every later feed() keep advancing m_state_index past the end of m_state.
+    usize len = m_state_len;
+    reset();
+    wchar_t wc = TRY(encode_utf8_char_into_wide_char(m_state, len));
+    return Option<wchar_t>{wc};
+}
+
+void Utf8StateDecoder::reset() // Drops any partially received sequence.
+{
+    m_state_index = m_state_len = 0; // m_state_len == 0 makes the next feed() start from a fresh lead byte
+}
\ No newline at end of file