#include #include #include static_assert(WCHAR_MAX > 0x10ffff); static Result utf8_char_length(char c) { if ((c & 0b11111000) == 0b11110000) return 4; if ((c & 0b11110000) == 0b11100000) return 3; if ((c & 0b11100000) == 0b11000000) return 2; if ((c & 0b10000000) == 0b00000000) return 1; return err(EILSEQ); } static Result wide_char_length_as_utf8(wchar_t c) { if (c <= 0x7f) return 1; if (c <= 0x7ff) return 2; if (c <= 0xffff) return 3; if (c <= 0x10ffff) return 4; return err(EILSEQ); } static inline usize wide_char_length_as_utf8_unchecked(wchar_t c) { if (c <= 0x7f) return 1; if (c <= 0x7ff) return 2; if (c <= 0xffff) return 3; return 4; } static Result encode_wide_char_as_utf8(wchar_t c, char* result, usize& len) { const usize utf8_len = TRY(wide_char_length_as_utf8(c)); if (utf8_len > len) { return err(EILSEQ); } u8* buf = (u8*)result; if (len == 1) { buf[0] = c & 0x7f; return {}; } if (len == 2) { buf[0] = 0b11000000 | ((c & 0x7c0) >> 6); buf[1] = 0b10000000 | (c & 0x3f); return {}; } if (len == 3) { buf[0] = 0b11100000 | ((c & 0xf000) >> 12); buf[1] = 0b10000000 | ((c & 0xfc0) >> 6); buf[2] = 0b10000000 | (c & 0x3f); return {}; } if (len == 4) { buf[0] = 0b11110000 | ((c & 0x1c0000) >> 18); buf[1] = 0b10000000 | ((c & 0x3f000) >> 12); buf[2] = 0b10000000 | ((c & 0xfc0) >> 6); buf[3] = 0b10000000 | (c & 0x3f); return {}; } unreachable(); } static Result encode_utf8_as_wide_char_impl(const char* beg, usize& len) { const usize utf8_len = TRY(utf8_char_length(*beg)); if (utf8_len > len) return err(EILSEQ); // Unterminated sequence len = utf8_len; // Enough space for the sequence, let's return the resulting length if (len == 1) { return beg[0] & 0x7f; } if (len == 2) { if ((beg[1] & 0b11000000) != 0b10000000) return err(EILSEQ); wchar_t c = beg[0] & 0x1f; c <<= 6; c |= beg[1] & 0x3f; return c; } if (len == 3) { if ((beg[1] & 0b11000000) != 0b10000000) return err(EILSEQ); if ((beg[2] & 0b11000000) != 0b10000000) return err(EILSEQ); wchar_t c = beg[0] & 0x0f; c <<= 6; c |= beg[1] & 0x3f; c <<= 6; c |= beg[2] & 0x3f; return c; } if (len == 4) { if ((beg[1] & 0b11000000) != 0b10000000) return err(EILSEQ); if ((beg[2] & 0b11000000) != 0b10000000) return err(EILSEQ); if ((beg[3] & 0b11000000) != 0b10000000) return err(EILSEQ); wchar_t c = beg[0] & 0x07; c <<= 6; c |= beg[1] & 0x3f; c <<= 6; c |= beg[2] & 0x3f; c <<= 6; c |= beg[3] & 0x3f; if (c > 0x10ffff) return err(EILSEQ); return c; } unreachable(); } static Result encode_utf8_as_wide_char(const char* beg, usize& len) { const wchar_t result = TRY(encode_utf8_as_wide_char_impl(beg, len)); // NOTE: We already know this is a valid code-point, since we constructed it ourselves and already checked the // range. if (len != wide_char_length_as_utf8_unchecked(result)) { // OVERLONG ENCODING!! This is nasty, error out. return err(EILSEQ); } return result; } Utf8StringDecoder::Utf8StringDecoder(const char* str) : m_str(str), m_byte_length(strlen(str)) { } Result Utf8StringDecoder::code_points() const { const char* it = m_str; usize len = 0; while ((usize)(it - m_str) < m_byte_length) { const usize utf8_len = TRY(utf8_char_length(*it)); if ((usize)(it - m_str) + utf8_len > m_byte_length) return err(EILSEQ); it += utf8_len; len++; } return len; } Result Utf8StringDecoder::decode(wchar_t* buf, size_t max) const { const char* it = m_str; while ((usize)(it - m_str) < m_byte_length && max--) { usize len = m_byte_length - (usize)(it - m_str); // Remaining space *buf = TRY(encode_utf8_as_wide_char(it, len)); it += len; buf++; } *buf = 0; return {}; } Result Utf8StringDecoder::decode(wchar_t* buf) const { return decode(buf, (size_t)-1); } Utf8StringEncoder::Utf8StringEncoder(const wchar_t* str) : m_str(str), m_code_points(wcslen(str)) { } Result Utf8StringEncoder::byte_length() const { const wchar_t* it = m_str; usize len = 0; while (*it) { len += TRY(wide_char_length_as_utf8(*it)); it++; } return len; } Result Utf8StringEncoder::encode(char* buf, size_t max) const { const wchar_t* it = m_str; while (*it && max > 1) { usize len = max - 1; TRY(encode_wide_char_as_utf8(*it, buf, len)); buf += len; max -= len; it++; } *buf = 0; return {}; } Result Utf8StringEncoder::encode(char* buf) const { return encode(buf, (size_t)-1); } Utf8StateDecoder::Utf8StateDecoder() : m_state_len(0), m_state_index(0) { } Result> Utf8StateDecoder::feed(char c) { if (m_state_len == 0) { m_state_len = TRY(utf8_char_length(c)); if (m_state_len == 1) { m_state_len = 0; return Option { c & 0x7f }; } m_state_index = 0; m_state[m_state_index] = c; return { {} }; } m_state_index++; m_state[m_state_index] = c; if (m_state_index == m_state_len - 1) { usize len = m_state_len; const wchar_t wc = TRY(encode_utf8_as_wide_char(m_state, len)); m_state_len = 0; return Option { wc }; } return { {} }; } void Utf8StateDecoder::reset() { m_state_index = m_state_len = 0; } Result Utf8Encoder::encode(wchar_t c, char buf[4]) { usize len = 0; TRY(encode_wide_char_as_utf8(c, buf, len)); return len; }