libc: Implement wcstombs()

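luna: Make Utf8String{De,En}coders return the number of elements written (wide characters for the decoder, bytes for the encoder)
This means mbstowcs() no longer needs a separate call to code_points() after decoding, which would otherwise parse the string a second time.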
2023-01-14 11:59:08 +01:00 · 2023-01-14 11:55:19 +01:00
4 changed files with 27 additions and 16 deletions
--- a/libc/include/stdlib.h
+++ b/libc/include/stdlib.h
@ -110,6 +110,9 @@ extern "C"
    /* Convert a multibyte character string to a wide character string. */
    size_t mbstowcs(wchar_t* buf, const char* src, size_t max);

+    /* Convert a wide character string to a multibyte character string. */
+    size_t wcstombs(char* buf, const wchar_t* src, size_t max);
+
 #ifdef __cplusplus
 }
 #endif
--- a/libc/src/stdlib.cpp
+++ b/libc/src/stdlib.cpp
@ -104,22 +104,26 @@ extern "C"
        __builtin_unreachable();
    }

-    // FIXME: This is walking a UTF-8 string twice. Once to decode, and another to count code points.
    size_t mbstowcs(wchar_t* buf, const char* src, size_t max)
    {
        if (max == 0) return 0;

        Utf8StringDecoder decoder(src);

-        auto rc = decoder.decode(buf, max);
+        if (!buf) { return decoder.code_points().value_or((size_t)-1); }

-        if (rc.has_error()) return (size_t)-1;
+        return decoder.decode(buf, max).value_or((size_t)-1);
+    }

-        size_t code_points = decoder.code_points().value_or(0);
+    size_t wcstombs(char* buf, const wchar_t* src, size_t max)
+    {
+        if (max == 0) return 0;

-        if (code_points >= max) return max - 1;
+        Utf8StringEncoder encoder(src);

-        return code_points;
+        if (!buf) { return encoder.byte_length().value_or((size_t)-1); }
+
+        return encoder.encode(buf, max).value_or((size_t)-1);
    }

    void* malloc(size_t size)
--- a/luna/include/luna/Utf8.h
+++ b/luna/include/luna/Utf8.h
@ -16,9 +16,9 @@ class Utf8StringDecoder
    Result<usize> code_points() const;

    // The caller must ensure that 'buf' has room for at least code_points() wide characters plus a terminating null wide character.
-    Result<void> decode(wchar_t* buf) const;
+    Result<usize> decode(wchar_t* buf) const;

-    Result<void> decode(wchar_t* buf, usize max) const;
+    Result<usize> decode(wchar_t* buf, usize max) const;

  private:
    const char* m_str;
@ -38,9 +38,9 @@ class Utf8StringEncoder
    Result<usize> byte_length() const;

    // The caller must ensure that 'buf' has room for at least byte_length() bytes plus a terminating null byte.
-    Result<void> encode(char* buf) const;
+    Result<usize> encode(char* buf) const;

-    Result<void> encode(char* buf, usize max) const;
+    Result<usize> encode(char* buf, usize max) const;

  private:
    const wchar_t* m_str;
--- a/luna/src/Utf8.cpp
+++ b/luna/src/Utf8.cpp
@ -2,6 +2,8 @@
 #include <luna/CString.h>
 #include <luna/Utf8.h>

+// FIXME: Running out of space in 'buf' for a sequence should not be treated as an error. (See mbstowcs(3) and wcstombs(3), case 2 when buf is not NULL.)
+
 static_assert(WCHAR_MAX > 0x10ffff);

 static Result<usize> utf8_char_length(char c)
@ -146,9 +148,10 @@ Result<usize> Utf8StringDecoder::code_points() const
    return len;
 }

-Result<void> Utf8StringDecoder::decode(wchar_t* buf, usize max) const
+Result<usize> Utf8StringDecoder::decode(wchar_t* buf, usize max) const
 {
    const char* it = m_str;
+    wchar_t* const buf_start = buf;

    while ((usize)(it - m_str) < m_byte_length && max--)
    {
@ -160,10 +163,10 @@ Result<void> Utf8StringDecoder::decode(wchar_t* buf, usize max) const

    *buf = 0;

-    return {};
+    return (usize)(buf - buf_start);
 }

-Result<void> Utf8StringDecoder::decode(wchar_t* buf) const
+Result<usize> Utf8StringDecoder::decode(wchar_t* buf) const
 {
    return decode(buf, (usize)-1);
 }
@ -186,9 +189,10 @@ Result<usize> Utf8StringEncoder::byte_length() const
    return len;
 }

-Result<void> Utf8StringEncoder::encode(char* buf, usize max) const
+Result<usize> Utf8StringEncoder::encode(char* buf, usize max) const
 {
    const wchar_t* it = m_str;
+    char* const buf_start = buf;

    while (*it && max > 1)
    {
@ -201,10 +205,10 @@ Result<void> Utf8StringEncoder::encode(char* buf, usize max) const

    *buf = 0;

-    return {};
+    return (usize)(buf - buf_start);
 }

-Result<void> Utf8StringEncoder::encode(char* buf) const
+Result<usize> Utf8StringEncoder::encode(char* buf) const
 {
    return encode(buf, (usize)-1);
 }
Author	SHA1	Message	Date
apio	e3ef29e80d	libc: Implement wcstombs() All checks were successful continuous-integration/drone/push Build is passing Details	2023-01-14 11:59:08 +01:00
apio	00ee8314b3	luna: Make Utf8String{De,En}coders return the number of bytes written This means we can avoid a call to code_points() in mbstowcs(), which would parse a string twice.	2023-01-14 11:55:19 +01:00