// 274 lines, 6.4 KiB, C++
#include <limits.h>
|
|
#include <luna/CString.h>
|
|
#include <luna/Utf8.h>
|
|
|
|
// The code below stores whole Unicode code points (up to 0x10ffff) in a wchar_t, so refuse to build if
// wchar_t cannot represent values in that range.
static_assert(WCHAR_MAX > 0x10ffff);
|
|
|
|
// Work out how many bytes a UTF-8 sequence occupies by looking at the prefix bits of its first byte.
// Returns 1, 2, 3 or 4. Fails with EILSEQ if 'c' cannot start a sequence, which is the case for bytes of
// the form 10xxxxxx and 11111xxx.
static Result<usize> utf8_char_length(char c)
{
    // Look at the byte as an unsigned value so the shifts below only ever see its 8 bits.
    const u8 lead = (u8)c;

    if ((lead >> 7) == 0b0) return 1;     // 0xxxxxxx
    if ((lead >> 5) == 0b110) return 2;   // 110xxxxx
    if ((lead >> 4) == 0b1110) return 3;  // 1110xxxx
    if ((lead >> 3) == 0b11110) return 4; // 11110xxx

    return err(EILSEQ);
}
|
|
|
|
// Compute how many bytes are needed to store the code point 'c' as UTF-8.
// Returns 1, 2, 3 or 4. Fails with EILSEQ if 'c' lies outside the range [0, 0x10ffff].
// NOTE: Values in the surrogate range (0xd800-0xdfff) are not treated specially here.
static Result<usize> wide_char_length_as_utf8(wchar_t c)
{
    // wchar_t may be a signed type. Compare as an unsigned value so that a negative 'c' lands far above
    // 0x10ffff and is rejected, instead of satisfying "c <= 0x7f" and being reported as a 1-byte character.
    const unsigned long long code_point = (unsigned long long)c;

    if (code_point <= 0x7f) return 1;
    if (code_point <= 0x7ff) return 2;
    if (code_point <= 0xffff) return 3;
    if (code_point <= 0x10ffff) return 4;

    return err(EILSEQ);
}
|
|
|
|
// Same thresholds as wide_char_length_as_utf8(), but without any range validation: everything above 0xffff
// is reported as needing 4 bytes. Only meant for values that are already known to be valid code points.
static inline usize wide_char_length_as_utf8_unchecked(wchar_t c)
{
    return c <= 0x7f ? 1 : c <= 0x7ff ? 2 : c <= 0xffff ? 3 : 4;
}
|
|
|
|
// Encode the code point 'c' as UTF-8 into 'result'.
//
// 'len' is an in/out parameter: on entry it holds the number of bytes available in 'result', and after a
// successful encode it holds the number of bytes that were written.
//
// Returns true if the character was written, or false if 'result' is too small (in which case neither
// 'result' nor 'len' is modified). Fails with whatever wide_char_length_as_utf8() fails with.
static Result<bool> encode_wide_char_as_utf8(wchar_t c, char* result, usize& len)
{
    const usize needed = TRY(wide_char_length_as_utf8(c));

    if (len < needed) return false; // Not enough space

    len = needed;

    u8* const out = (u8*)result;

    switch (needed)
    {
    case 1:
        // 0xxxxxxx
        out[0] = c & 0x7f;
        break;
    case 2:
        // 110xxxxx 10xxxxxx
        out[0] = 0b11000000 | ((c >> 6) & 0x1f);
        out[1] = 0b10000000 | (c & 0x3f);
        break;
    case 3:
        // 1110xxxx 10xxxxxx 10xxxxxx
        out[0] = 0b11100000 | ((c >> 12) & 0x0f);
        out[1] = 0b10000000 | ((c >> 6) & 0x3f);
        out[2] = 0b10000000 | (c & 0x3f);
        break;
    case 4:
        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        out[0] = 0b11110000 | ((c >> 18) & 0x07);
        out[1] = 0b10000000 | ((c >> 12) & 0x3f);
        out[2] = 0b10000000 | ((c >> 6) & 0x3f);
        out[3] = 0b10000000 | (c & 0x3f);
        break;
    default:
        // wide_char_length_as_utf8() only ever yields 1 to 4.
        unreachable();
    }

    return true;
}
|
|
|
|
// Decode the UTF-8 sequence starting at 'beg' into a single code point.
//
// 'len' is an in/out parameter: on entry it holds the number of bytes readable from 'beg', and once the
// sequence is known to fit it is overwritten with the length of the sequence.
//
// Fails with EILSEQ if the first byte cannot start a sequence, if the sequence is longer than the available
// input, if a continuation byte is not of the form 10xxxxxx, or if the decoded value exceeds 0x10ffff.
// This function does NOT detect overlong encodings.
static Result<wchar_t> encode_utf8_as_wide_char_impl(const char* beg, usize& len)
{
    const usize sequence_len = TRY(utf8_char_length(*beg));
    if (len < sequence_len) return err(EILSEQ); // Unterminated sequence
    len = sequence_len; // Enough space for the sequence, let's return the resulting length

    // Start with the payload bits stored in the lead byte; how many there are depends on the length.
    wchar_t code_point = 0;
    switch (sequence_len)
    {
    case 1: code_point = beg[0] & 0x7f; break;
    case 2: code_point = beg[0] & 0x1f; break;
    case 3: code_point = beg[0] & 0x0f; break;
    case 4: code_point = beg[0] & 0x07; break;
    default: unreachable(); // utf8_char_length() only ever yields 1 to 4.
    }

    // Every following byte must look like 10xxxxxx and contributes 6 more bits.
    for (usize i = 1; i < sequence_len; i++)
    {
        if ((beg[i] & 0b11000000) != 0b10000000) return err(EILSEQ);
        code_point <<= 6;
        code_point |= beg[i] & 0x3f;
    }

    // Only a 4-byte sequence carries enough bits (21) to go past the last code point; shorter sequences
    // top out at 0xffff, so this check can never trigger for them.
    if (code_point > 0x10ffff) return err(EILSEQ);

    return code_point;
}
|
|
|
|
// Decode one UTF-8 sequence like encode_utf8_as_wide_char_impl(), and additionally reject overlong
// encodings, i.e. sequences that use more bytes than the decoded code point needs.
// 'len' has the same in/out meaning as in encode_utf8_as_wide_char_impl().
static Result<wchar_t> encode_utf8_as_wide_char(const char* beg, usize& len)
{
    const wchar_t code_point = TRY(encode_utf8_as_wide_char_impl(beg, len));

    // NOTE: The unchecked variant is fine here: the value was built by the decoder above, which has already
    // verified that it does not exceed 0x10ffff.
    const usize shortest_len = wide_char_length_as_utf8_unchecked(code_point);

    if (len == shortest_len) return code_point;

    // The sequence did not use the length this code point encodes to: overlong encoding, error out.
    return err(EILSEQ);
}
|
|
|
|
// Set up a decoder for 'str'. The length of the string in bytes is measured once, here, using strlen(),
// so 'str' must be NUL-terminated.
Utf8StringDecoder::Utf8StringDecoder(const char* str)
    : m_str(str),
      m_byte_length(strlen(str))
{
}
|
|
|
|
// Count the code points in the string by decoding it from start to end and discarding the results.
// Fails with the decoder's error (EILSEQ) as soon as a malformed sequence is found.
Result<usize> Utf8StringDecoder::code_points() const
{
    const char* const start = m_str;
    usize offset = 0;
    usize count = 0;

    for (; offset < m_byte_length; count++)
    {
        // In: bytes left in the string. Out: bytes used by the sequence that was just decoded.
        usize consumed = m_byte_length - offset;
        TRY(encode_utf8_as_wide_char(start + offset, consumed));
        offset += consumed;
    }

    return count;
}
|
|
|
|
// Decode at most 'max' code points from the string into 'buf', then append a 0 terminator.
// Returns the number of code points stored, not counting the terminator.
// If a malformed sequence is found, the error is returned immediately and no terminator is written.
//
// NOTE(review): 'max' does not include the terminator here, so 'buf' needs room for max + 1 elements.
// Utf8StringEncoder::encode() counts its terminator as part of 'max' instead - confirm the difference is
// intended.
Result<usize> Utf8StringDecoder::decode(wchar_t* buf, usize max) const
{
    const char* const start = m_str;
    usize offset = 0;
    usize written = 0;

    while (offset < m_byte_length && written < max)
    {
        // In: bytes left in the string. Out: bytes used by the sequence that was just decoded.
        usize consumed = m_byte_length - offset;
        buf[written] = TRY(encode_utf8_as_wide_char(start + offset, consumed));
        offset += consumed;
        written++;
    }

    buf[written] = 0;

    return written;
}
|
|
|
|
// Decode the whole string into 'buf' without an explicit limit on the number of code points.
// The caller is responsible for 'buf' being large enough.
Result<usize> Utf8StringDecoder::decode(wchar_t* buf) const
{
    // Assuming usize is unsigned, (usize)-1 is its largest value, which effectively means "no limit".
    const usize no_limit = (usize)-1;
    return decode(buf, no_limit);
}
|
|
|
|
// Set up an encoder for 'str'. The number of wide characters is measured once, here, using wcslen(),
// so 'str' must be 0-terminated.
Utf8StringEncoder::Utf8StringEncoder(const wchar_t* str)
    : m_str(str),
      m_code_points(wcslen(str))
{
}
|
|
|
|
// Compute how many bytes the string takes up once encoded as UTF-8, not counting a terminator.
// Fails with EILSEQ if the string contains a value that wide_char_length_as_utf8() rejects.
Result<usize> Utf8StringEncoder::byte_length() const
{
    usize total = 0;

    // Walk the string up to its 0 terminator, adding up the encoded size of each character.
    for (const wchar_t* cursor = m_str; *cursor; cursor++) { total += TRY(wide_char_length_as_utf8(*cursor)); }

    return total;
}
|
|
|
|
// Encode the string as UTF-8 into 'buf', which has room for 'max' bytes INCLUDING the NUL terminator.
//
// Characters are written for as long as the next one fits entirely while still leaving one byte for the
// terminator; a character that does not fit is never written partially. The output is then NUL-terminated.
// If 'max' is 0, nothing at all is written.
//
// Returns the number of bytes written, not counting the terminator.
// If a character cannot be encoded, the error is returned immediately and no terminator is written.
Result<usize> Utf8StringEncoder::encode(char* buf, usize max) const
{
    const wchar_t* it = m_str;
    char* const buf_start = buf;

    while (*it && max > 1)
    {
        // In: space left, keeping one byte in reserve for the terminator. Out: bytes actually written.
        usize len = max - 1;
        const bool ok = TRY(encode_wide_char_as_utf8(*it, buf, len));
        if (!ok) break; // The next character does not fit anymore.

        buf += len;
        max -= len; // len <= max - 1, so max never drops below 1 in here.
        it++;
    }

    // A zero-sized buffer has no room for anything, not even the terminator.
    if (max > 0) *buf = 0;

    return (usize)(buf - buf_start);
}
|
|
|
|
// Encode the whole string into 'buf' without an explicit limit on the number of bytes.
// The caller is responsible for 'buf' being large enough.
Result<usize> Utf8StringEncoder::encode(char* buf) const
{
    // Assuming usize is unsigned, (usize)-1 is its largest value, which effectively means "no limit".
    const usize no_limit = (usize)-1;
    return encode(buf, no_limit);
}
|
|
|
|
// Create a decoder that is not in the middle of any sequence and has no decoded character pending.
Utf8StateDecoder::Utf8StateDecoder() : m_state_len(0), m_state_index(0)
{
    // feed() and extract() both check this flag before anything else gets a chance to assign it, and the
    // initializer list above does not cover it, so make sure it starts out cleared. Done in the body rather
    // than the initializer list so this does not depend on the member declaration order.
    m_has_character_ready = false;
}
|
|
|
|
// Feed one more byte of UTF-8 input to the decoder.
//
// Returns true once a complete character has been decoded; it must then be retrieved using extract()
// before feed() is called again (this is enforced with check()). Returns false if more bytes are needed.
//
// Fails with EILSEQ if 'c' cannot start a sequence, or if the completed sequence is malformed. In both
// cases the decoder is left outside of any sequence, so the next byte fed is treated as the start of a new
// one.
Result<bool> Utf8StateDecoder::feed(char c)
{
    check(!m_has_character_ready);

    if (m_state_len == 0)
    {
        // Not inside a sequence: 'c' is a lead byte and tells us how many bytes to expect in total.
        // If it is invalid, TRY returns before the assignment happens and m_state_len stays 0.
        m_state_len = TRY(utf8_char_length(c));
        if (m_state_len == 1)
        {
            // Single-byte character, no need to buffer anything.
            m_state_len = 0;
            m_decoded_character = c & 0x7f;
            m_has_character_ready = true;
            return true;
        }
        m_state_index = 0;
        m_state[m_state_index] = c;
        return false;
    }

    // Inside a sequence: store the byte after the ones collected so far.
    m_state_index++;
    m_state[m_state_index] = c;

    if (m_state_index == m_state_len - 1)
    {
        usize len = m_state_len;

        // Leave the sequence BEFORE decoding. If this were only done after a successful decode, a malformed
        // sequence would leave m_state_index at m_state_len - 1; every subsequent feed() would then
        // increment it further, never match the condition above again, and write past the end of m_state.
        m_state_len = 0;

        m_decoded_character = TRY(encode_utf8_as_wide_char(m_state, len));
        m_has_character_ready = true;
        return true;
    }

    return false;
}
|
|
|
|
// Hand out the character that the last successful feed() decoded, and mark it as consumed so that feed()
// can be called again. Calling this while no character is ready trips a check().
Result<wchar_t> Utf8StateDecoder::extract()
{
    check(m_has_character_ready);

    const wchar_t character = m_decoded_character;
    m_has_character_ready = false;

    return character;
}
|
|
|
|
void Utf8StateDecoder::reset()
|
|
{
|
|
m_state_index = m_state_len = 0;
|
|
}
|
|
|
|
// Encode the single code point 'c' as UTF-8 into 'buf'. No terminator is written.
// Returns the number of bytes written (1 to 4), or an error if 'c' cannot be encoded.
Result<usize> Utf8Encoder::encode(wchar_t c, char buf[4])
{
    // 'len' is in/out for encode_wide_char_as_utf8(): the capacity of 'buf' going in, the number of bytes
    // written coming out. It has to start at the size of 'buf'; starting at 0 makes the helper report
    // "not enough space" for every character, so nothing is ever written and 0 is returned.
    usize len = 4;

    // With 4 bytes available the "not enough space" result cannot occur, so only errors need handling.
    TRY(encode_wide_char_as_utf8(c, buf, len));

    return len;
}
|