From b851dcf9b90fb4dc49d2ced76857bfce9e67d9a4 Mon Sep 17 00:00:00 2001 From: apio Date: Fri, 6 Jan 2023 21:01:37 +0100 Subject: [PATCH] libc: Implement mbstowcs() using Utf8StringDecoder --- libc/include/stdlib.h | 5 +++++ libc/src/stdlib.cpp | 18 ++++++++++++++++++ luna/include/luna/Utf8.h | 4 ++++ luna/src/Utf8.cpp | 25 +++++++++++++++++++------ 4 files changed, 46 insertions(+), 6 deletions(-) diff --git a/libc/include/stdlib.h b/libc/include/stdlib.h index 38cb7095..4e4b3e56 100644 --- a/libc/include/stdlib.h +++ b/libc/include/stdlib.h @@ -27,6 +27,8 @@ typedef struct long long rem; } lldiv_t; +#define MB_CUR_MAX 4 + #ifdef __cplusplus extern "C" { @@ -98,6 +100,9 @@ extern "C" void qsort(void*, size_t, size_t, int (*)(const void*, const void*)); void* bsearch(const void*, const void*, size_t, size_t, int (*)(const void*, const void*)); + /* Convert a multibyte character string to a wide character string. */ + size_t mbstowcs(wchar_t* buf, const char* src, size_t max); + #ifdef __cplusplus } #endif diff --git a/libc/src/stdlib.cpp b/libc/src/stdlib.cpp index 12f338a4..b84dda9c 100644 --- a/libc/src/stdlib.cpp +++ b/libc/src/stdlib.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -99,4 +100,21 @@ extern "C" syscall(SYS_exit); __builtin_unreachable(); } + + size_t mbstowcs(wchar_t* buf, const char* src, size_t max) + { + if (max == 0) return 0; + + Utf8StringDecoder decoder(src); + + auto rc = decoder.decode(buf, max); + + if (rc.has_error()) return (size_t)-1; + + size_t code_points = decoder.code_points().value_or(0); + + if (code_points >= max) return max - 1; + + return code_points; + } } diff --git a/luna/include/luna/Utf8.h b/luna/include/luna/Utf8.h index 280d29ed..2f5b1b45 100644 --- a/luna/include/luna/Utf8.h +++ b/luna/include/luna/Utf8.h @@ -18,6 +18,8 @@ class Utf8StringDecoder // The caller must ensure that 'buf' is at least code_points() + a NULL wide. Result decode(wchar_t* buf) const; + Result decode(wchar_t* buf, size_t max) const; + private: const char* m_str; usize m_byte_length; @@ -38,6 +40,8 @@ class Utf8StringEncoder // The caller must ensure that 'buf' is at least byte_length() + a NULL wide. Result encode(char* buf) const; + Result encode(char* buf, size_t max) const; + private: const wchar_t* m_str; usize m_code_points; diff --git a/luna/src/Utf8.cpp b/luna/src/Utf8.cpp index 3a84df36..5ebb880d 100644 --- a/luna/src/Utf8.cpp +++ b/luna/src/Utf8.cpp @@ -32,7 +32,9 @@ static inline usize wide_char_length_as_utf8_unchecked(wchar_t c) static Result encode_wide_char_as_utf8(wchar_t c, char* result, usize& len) { - len = TRY(wide_char_length_as_utf8(c)); + usize utf8_len = TRY(wide_char_length_as_utf8(c)); + + if (utf8_len > len) { return err(EILSEQ); } u8* buf = (u8*)result; @@ -144,11 +146,11 @@ Result Utf8StringDecoder::code_points() const return len; } -Result Utf8StringDecoder::decode(wchar_t* buf) const +Result Utf8StringDecoder::decode(wchar_t* buf, size_t max) const { const char* it = m_str; - while ((usize)(it - m_str) < m_byte_length) + while ((usize)(it - m_str) < m_byte_length && max--) { usize len = m_byte_length - (usize)(it - m_str); // Remaining space *buf = TRY(encode_utf8_as_wide_char(it, len)); @@ -161,6 +163,11 @@ Result Utf8StringDecoder::decode(wchar_t* buf) const return {}; } +Result Utf8StringDecoder::decode(wchar_t* buf) const +{ + return decode(buf, (size_t)-1); +} + Utf8StringEncoder::Utf8StringEncoder(const wchar_t* str) : m_str(str), m_code_points(wcslen(str)) { } @@ -179,15 +186,16 @@ Result Utf8StringEncoder::byte_length() const return len; } -Result Utf8StringEncoder::encode(char* buf) const +Result Utf8StringEncoder::encode(char* buf, size_t max) const { const wchar_t* it = m_str; - while (*it) + while (*it && max > 1) { - usize len = 0; + usize len = max - 1; TRY(encode_wide_char_as_utf8(*it, buf, len)); buf += len; + max -= len; it++; } @@ -196,6 +204,11 @@ Result Utf8StringEncoder::encode(char* buf) const return {}; } +Result Utf8StringEncoder::encode(char* buf) const +{ + return encode(buf, (size_t)-1); +} + Utf8StateDecoder::Utf8StateDecoder() : m_state_len(0), m_state_index(0) { }