luna: Make Utf8String{De,En}coders return the number of bytes written

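This lets mbstowcs() use the value returned by decode() directly instead of making a separate call to code_points(), which parsed the string a second time. Note that the decoder's count is in wide characters, while the encoder's count is in bytes.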
2023-01-14 11:55:19 +01:00 · 2023-01-14 11:55:19 +01:00 · 00ee8314b3
commit 00ee8314b3
parent da805eec83
3 changed files with 16 additions and 19 deletions
--- a/libc/src/stdlib.cpp
+++ b/libc/src/stdlib.cpp
@ -104,22 +104,15 @@ extern "C"
        __builtin_unreachable();
    }

-    // FIXME: This is walking a UTF-8 string twice. Once to decode, and another to count code points.
    size_t mbstowcs(wchar_t* buf, const char* src, size_t max)
    {
        if (max == 0) return 0;

        Utf8StringDecoder decoder(src);

-        auto rc = decoder.decode(buf, max);
+        if (!buf) { return decoder.code_points().value_or((size_t)-1); }

-        if (rc.has_error()) return (size_t)-1;
-
-        size_t code_points = decoder.code_points().value_or(0);
-
-        if (code_points >= max) return max - 1;
-
-        return code_points;
+        return decoder.decode(buf, max).value_or((size_t)-1);
    }

    void* malloc(size_t size)
--- a/luna/include/luna/Utf8.h
+++ b/luna/include/luna/Utf8.h
@ -16,9 +16,9 @@ class Utf8StringDecoder
    Result<usize> code_points() const;

    // The caller must ensure that 'buf' is at least code_points() + a NULL wide.
-    Result<void> decode(wchar_t* buf) const;
+    Result<usize> decode(wchar_t* buf) const;

-    Result<void> decode(wchar_t* buf, usize max) const;
+    Result<usize> decode(wchar_t* buf, usize max) const;

  private:
    const char* m_str;
@ -38,9 +38,9 @@ class Utf8StringEncoder
    Result<usize> byte_length() const;

    // The caller must ensure that 'buf' is at least byte_length() + a NULL wide.
-    Result<void> encode(char* buf) const;
+    Result<usize> encode(char* buf) const;

-    Result<void> encode(char* buf, usize max) const;
+    Result<usize> encode(char* buf, usize max) const;

  private:
    const wchar_t* m_str;
--- a/luna/src/Utf8.cpp
+++ b/luna/src/Utf8.cpp
@ -2,6 +2,8 @@
 #include <luna/CString.h>
 #include <luna/Utf8.h>

+// FIXME: Not enough space for a sequence is not an error. (mbstowcs(3) and wcstombs(3), case 2 when buf is not NULL)
+
 static_assert(WCHAR_MAX > 0x10ffff);

 static Result<usize> utf8_char_length(char c)
@ -146,9 +148,10 @@ Result<usize> Utf8StringDecoder::code_points() const
    return len;
 }

-Result<void> Utf8StringDecoder::decode(wchar_t* buf, usize max) const
+Result<usize> Utf8StringDecoder::decode(wchar_t* buf, usize max) const
 {
    const char* it = m_str;
+    wchar_t* const buf_start = buf;

    while ((usize)(it - m_str) < m_byte_length && max--)
    {
@ -160,10 +163,10 @@ Result<void> Utf8StringDecoder::decode(wchar_t* buf, usize max) const

    *buf = 0;

-    return {};
+    return (usize)(buf - buf_start);
 }

-Result<void> Utf8StringDecoder::decode(wchar_t* buf) const
+Result<usize> Utf8StringDecoder::decode(wchar_t* buf) const
 {
    return decode(buf, (usize)-1);
 }
@ -186,9 +189,10 @@ Result<usize> Utf8StringEncoder::byte_length() const
    return len;
 }

-Result<void> Utf8StringEncoder::encode(char* buf, usize max) const
+Result<usize> Utf8StringEncoder::encode(char* buf, usize max) const
 {
    const wchar_t* it = m_str;
+    char* const buf_start = buf;

    while (*it && max > 1)
    {
@ -201,10 +205,10 @@ Result<void> Utf8StringEncoder::encode(char* buf, usize max) const

    *buf = 0;

-    return {};
+    return (usize)(buf - buf_start);
 }

-Result<void> Utf8StringEncoder::encode(char* buf) const
+Result<usize> Utf8StringEncoder::encode(char* buf) const
 {
    return encode(buf, (usize)-1);
 }