From 00ee8314b39b0f8fd0c6a9693f43691d2f505056 Mon Sep 17 00:00:00 2001 From: apio Date: Sat, 14 Jan 2023 11:55:19 +0100 Subject: [PATCH] luna: Make Utf8String{De,En}coders return the number of bytes written This means we can avoid a call to code_points() in mbstowcs(), which would parse a string twice. --- libc/src/stdlib.cpp | 11 ++--------- luna/include/luna/Utf8.h | 8 ++++---- luna/src/Utf8.cpp | 16 ++++++++++------ 3 files changed, 16 insertions(+), 19 deletions(-) diff --git a/libc/src/stdlib.cpp b/libc/src/stdlib.cpp index 1eed9f9d..7d107125 100644 --- a/libc/src/stdlib.cpp +++ b/libc/src/stdlib.cpp @@ -104,22 +104,15 @@ extern "C" __builtin_unreachable(); } - // FIXME: This is walking a UTF-8 string twice. Once to decode, and another to count code points. size_t mbstowcs(wchar_t* buf, const char* src, size_t max) { if (max == 0) return 0; Utf8StringDecoder decoder(src); - auto rc = decoder.decode(buf, max); + if (!buf) { return decoder.code_points().value_or((size_t)-1); } - if (rc.has_error()) return (size_t)-1; - - size_t code_points = decoder.code_points().value_or(0); - - if (code_points >= max) return max - 1; - - return code_points; + return decoder.decode(buf, max).value_or((size_t)-1); } void* malloc(size_t size) diff --git a/luna/include/luna/Utf8.h b/luna/include/luna/Utf8.h index 162ee2ea..05faf98a 100644 --- a/luna/include/luna/Utf8.h +++ b/luna/include/luna/Utf8.h @@ -16,9 +16,9 @@ class Utf8StringDecoder Result code_points() const; // The caller must ensure that 'buf' is at least code_points() + a NULL wide. - Result decode(wchar_t* buf) const; + Result decode(wchar_t* buf) const; - Result decode(wchar_t* buf, usize max) const; + Result decode(wchar_t* buf, usize max) const; private: const char* m_str; @@ -38,9 +38,9 @@ class Utf8StringEncoder Result byte_length() const; // The caller must ensure that 'buf' is at least byte_length() + a NULL wide. - Result encode(char* buf) const; + Result encode(char* buf) const; - Result encode(char* buf, usize max) const; + Result encode(char* buf, usize max) const; private: const wchar_t* m_str; diff --git a/luna/src/Utf8.cpp b/luna/src/Utf8.cpp index 1311c68a..269efdbd 100644 --- a/luna/src/Utf8.cpp +++ b/luna/src/Utf8.cpp @@ -2,6 +2,8 @@ #include #include +// FIXME: Not enough space for a sequence is not an error. (mbstowcs(3) and wcstombs(3), case 2 when buf is not NULL) + static_assert(WCHAR_MAX > 0x10ffff); static Result utf8_char_length(char c) @@ -146,9 +148,10 @@ Result Utf8StringDecoder::code_points() const return len; } -Result Utf8StringDecoder::decode(wchar_t* buf, usize max) const +Result Utf8StringDecoder::decode(wchar_t* buf, usize max) const { const char* it = m_str; + wchar_t* const buf_start = buf; while ((usize)(it - m_str) < m_byte_length && max--) { @@ -160,10 +163,10 @@ Result Utf8StringDecoder::decode(wchar_t* buf, usize max) const *buf = 0; - return {}; + return (usize)(buf - buf_start); } -Result Utf8StringDecoder::decode(wchar_t* buf) const +Result Utf8StringDecoder::decode(wchar_t* buf) const { return decode(buf, (usize)-1); } @@ -186,9 +189,10 @@ Result Utf8StringEncoder::byte_length() const return len; } -Result Utf8StringEncoder::encode(char* buf, usize max) const +Result Utf8StringEncoder::encode(char* buf, usize max) const { const wchar_t* it = m_str; + char* const buf_start = buf; while (*it && max > 1) { @@ -201,10 +205,10 @@ Result Utf8StringEncoder::encode(char* buf, usize max) const *buf = 0; - return {}; + return (usize)(buf - buf_start); } -Result Utf8StringEncoder::encode(char* buf) const +Result Utf8StringEncoder::encode(char* buf) const { return encode(buf, (usize)-1); }