UTF-8 part 2: Encoding wide-character strings into UTF-8

We now have Utf8StringEncoder and Utf8Encoder (no state this time)
Add wcslen()
2022-12-18 14:34:50 +01:00 · 2022-12-18 14:33:13 +01:00
4 changed files with 130 additions and 4 deletions
--- a/luna/include/luna/CString.h
+++ b/luna/include/luna/CString.h
@ -9,5 +9,7 @@ extern "C"
    void* memmove(void* dest, const void* src, usize n);
    usize strlen(const char* str);
    usize wcslen(const wchar_t* str);
    char* strdup(const char* str);
 }
--- a/luna/include/luna/Utf8.h
+++ b/luna/include/luna/Utf8.h
@ -15,7 +15,7 @@ class Utf8StringDecoder
    Result<usize> code_points() const;
-    // The caller must ensure that 'buf' is at least wide_length() + a NULL wide.
+    // The caller must ensure that 'buf' is at least code_points() + a NULL wide.
    Result<void> decode(wchar_t* buf) const;
  private:
@ -23,6 +23,26 @@ class Utf8StringDecoder
    usize m_byte_length;
 };
 // Encodes a null-terminated wide-character string into UTF-8.
 // Only the pointer to the string is stored (no copy is made), so the string
 // must stay valid for as long as the encoder is used.
 class Utf8StringEncoder
 {
  public:
    // Stores 'str' and records its length in wide characters (via wcslen()).
    Utf8StringEncoder(const wchar_t* str);
    // Number of wide characters in the string, not counting the terminator.
    usize code_points() const
    {
        return m_code_points;
    }
    // Number of bytes the UTF-8 encoding of the string occupies, not counting
    // the terminating NUL byte. Fails with EILSEQ if a character cannot be encoded.
    Result<usize> byte_length() const;
    // The caller must ensure that 'buf' can hold at least byte_length() bytes + a terminating NUL byte.
    // Fails with EILSEQ if a character cannot be encoded.
    Result<void> encode(char* buf) const;
  private:
    const wchar_t* m_str;  // The wide string being encoded (not owned).
    usize m_code_points;   // Length of m_str in wide characters.
 };
 class Utf8StateDecoder
 {
  public:
@ -36,3 +56,10 @@ class Utf8StateDecoder
    usize m_state_len = 0;
    usize m_state_index = 0;
 };
 // Encodes single wide characters into UTF-8. Unlike Utf8StateDecoder, this
 // class declares no data members: every character is encoded independently.
 class Utf8Encoder
 {
  public:
    // Encodes 'c' into 'buf', which must have room for up to 4 bytes.
    // Does not null-terminate. Returns the number of bytes written.
    // Fails with EILSEQ if 'c' cannot be encoded as UTF-8.
    Result<usize> encode(wchar_t c, char buf[4]);
 };
--- a/luna/src/CString.cpp
+++ b/luna/src/CString.cpp
@ -46,6 +46,14 @@ extern "C"
        return (usize)(i - str);
    }
    // Returns the number of wide characters in 'str' that precede the
    // terminating null wide character.
    usize wcslen(const wchar_t* str)
    {
        usize length = 0;
        while (str[length] != L'\0') { length++; }
        return length;
    }
    char* strdup(const char* str)
    {
        const usize len = strlen(str);
--- a/luna/src/Utf8.cpp
+++ b/luna/src/Utf8.cpp
@ -10,7 +10,52 @@ static Result<usize> utf8_char_length(char c)
    return err(EILSEQ);
 }
-static Result<wchar_t> encode_utf8_char_into_wide_char(const char* beg, usize& len)
+static Result<usize> wide_char_length_as_utf8(wchar_t c)
 {
    if (c <= 0x7f) return 1;
    if (c <= 0x7ff) return 2;
    if (c <= 0xffff) return 3;
    if (c <= 0x10ffff) return 4;
    return err(EILSEQ);
 }
 // Writes the UTF-8 encoding of 'c' into 'result' and stores the number of
 // bytes written (1-4) in 'len'. No terminator is written.
 // If the length cannot be determined, the error from
 // wide_char_length_as_utf8() is propagated by TRY.
 static Result<void> encode_wide_char_as_utf8(wchar_t c, char* result, usize& len)
 {
    len = TRY(wide_char_length_as_utf8(c));

    // Work on unsigned bytes so the bit patterns below are stored as-is.
    u8* const out = (u8*)result;

    switch (len)
    {
    case 1: // 0xxxxxxx
        out[0] = c & 0x7f;
        return {};
    case 2: // 110xxxxx 10xxxxxx
        out[0] = 0xc0 | ((c >> 6) & 0x1f);
        out[1] = 0x80 | (c & 0x3f);
        return {};
    case 3: // 1110xxxx 10xxxxxx 10xxxxxx
        out[0] = 0xe0 | ((c >> 12) & 0x0f);
        out[1] = 0x80 | ((c >> 6) & 0x3f);
        out[2] = 0x80 | (c & 0x3f);
        return {};
    case 4: // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        out[0] = 0xf0 | ((c >> 18) & 0x07);
        out[1] = 0x80 | ((c >> 12) & 0x3f);
        out[2] = 0x80 | ((c >> 6) & 0x3f);
        out[3] = 0x80 | (c & 0x3f);
        return {};
    }

    // wide_char_length_as_utf8() only ever yields 1 through 4.
    unreachable();
 }
 static Result<wchar_t> encode_utf8_as_wide_char(const char* beg, usize& len)
 {
    usize utf8_len = TRY(utf8_char_length(*beg));
    if (utf8_len > len) return err(EILSEQ); // Unterminated sequence
@ -82,7 +127,7 @@ Result<void> Utf8StringDecoder::decode(wchar_t* buf) const
    while ((usize)(it - m_str) < m_byte_length)
    {
        usize len = m_byte_length - (usize)(it - m_str); // Remaining space
-        *buf = TRY(encode_utf8_char_into_wide_char(it, len));
+        *buf = TRY(encode_utf8_as_wide_char(it, len));
        it += len;
        buf++;
    }
@ -92,6 +137,41 @@ Result<void> Utf8StringDecoder::decode(wchar_t* buf) const
    return {};
 }
 Utf8StringEncoder::Utf8StringEncoder(const wchar_t* str) : m_str(str), m_code_points(wcslen(str))
 {
 }
 // Adds up the UTF-8 length of every wide character up to the null terminator.
 // The terminator itself is not counted. An error from
 // wide_char_length_as_utf8() is propagated by TRY.
 Result<usize> Utf8StringEncoder::byte_length() const
 {
    usize total = 0;
    for (const wchar_t* cursor = m_str; *cursor != L'\0'; ++cursor)
    {
        total += TRY(wide_char_length_as_utf8(*cursor));
    }
    return total;
 }
 // Encodes the whole wide string into 'buf' as UTF-8 and appends a NUL byte.
 // An error from encode_wide_char_as_utf8() is propagated by TRY; in that case
 // the terminating NUL byte is not written.
 Result<void> Utf8StringEncoder::encode(char* buf) const
 {
    char* out = buf;
    for (const wchar_t* cursor = m_str; *cursor != L'\0'; ++cursor)
    {
        usize written = 0;
        TRY(encode_wide_char_as_utf8(*cursor, out, written));
        out += written; // Advance past the bytes of this character.
    }

    *out = 0;
    return {};
 }
 Utf8StateDecoder::Utf8StateDecoder() : m_state_len(0), m_state_index(0)
 {
 }
@ -117,7 +197,7 @@ Result<Option<wchar_t>> Utf8StateDecoder::feed(char c)
    if (m_state_index == m_state_len - 1)
    {
        usize len = m_state_len;
-        wchar_t wc = TRY(encode_utf8_char_into_wide_char(m_state, len));
+        wchar_t wc = TRY(encode_utf8_as_wide_char(m_state, len));
        m_state_len = 0;
        return Option<wchar_t>{wc};
    }
@ -129,3 +209,12 @@ void Utf8StateDecoder::reset()
 {
    m_state_index = m_state_len = 0;
 }
 // Encodes the single wide character 'c' into 'buf' (no terminator is added)
 // and returns how many bytes were written. An error from
 // encode_wide_char_as_utf8() is propagated by TRY.
 Result<usize> Utf8Encoder::encode(wchar_t c, char buf[4])
 {
    usize written = 0;
    TRY(encode_wide_char_as_utf8(c, buf, written));
    return written;
 }
Author	SHA1	Message	Date
apio	6389099808	UTF-8 part 2: Encoding wide-character strings into UTF-8 All checks were successful continuous-integration/drone/push Build is passing Details We now have Utf8StringEncoder and Utf8Encoder (no state this time)	2022-12-18 14:34:50 +01:00
apio	9c1c6bb320	Add wcslen() I think this can go in CString.h, no need to create a separate CWChar.h or something	2022-12-18 14:33:13 +01:00