// sapphire/src/Lexer.cpp
// 400 lines, 15 KiB, C++

#include "Lexer.h"
#include "Error.h"
#include "utils.h"
// Character classes used by the lexer to decide how a character is handled.
// WHITESPACE:  characters skipped between tokens.
// LETTERS:     characters that may start an identifier (ASCII letters and '_').
// IDENTIFIERS: characters that may continue an identifier (LETTERS plus digits).
// DIGITS:      characters that may start or continue a numeric literal.
// Both alphabets must contain all 26 uppercase letters; 'X' used to be missing,
// which made identifiers containing 'X' fail to lex.
#define WHITESPACE "\t \n"
#define LETTERS "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_"
#define IDENTIFIERS "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_0123456789"
#define DIGITS "0123456789"
// Spellings of the built-in type names. create_identifier() looks identifiers
// up in this table and emits a TT_Type token on a match.
const std::array<std::string, TYPE_COUNT> Lexer::types = {
    "void", "bool", "str",
    "i8",   "i16",  "i32", "i64",
    "u8",   "u16",  "u32", "u64",
    "f32",  "f64",  "f128"};
// Creates a lexer for the file called `fname`.
// Both the current and the previous location start at line 1, column 0 of that
// file, and `index` starts at -1 so that the first advance() lands on index 0.
Lexer::Lexer(const std::string& fname)
    : loc(1, 0, fname),
      index(-1),
      prev_loc(1, 0, fname)
{
}
// Nothing to release by hand: the destructor does no work of its own.
Lexer::~Lexer() = default;
int Lexer::advance()
{
prev_loc = loc;
++index;
loc.advance();
if (index >= current_lexed_text.size()) return 0;
current_char = current_lexed_text[index];
loc.pos_from_char(current_char);
if (current_char == '\n')
{
previous_line_text = current_line_text;
current_line_text = this->recalculate_current_line(current_lexed_text);
}
return 1;
}
// Steps the cursor one character back, restoring the location saved by the
// last advance().
// Returns 0 when that puts the cursor before the first character (index -1),
// otherwise 1 with `current_char` set to the character now under the cursor.
int Lexer::rewind()
{
    loc = prev_loc;
    index -= 1;

    if (index != -1)
    {
        // If the character being stepped back over was a newline, the cached
        // line text has to go back to the line that preceded it.
        const bool stepping_over_newline = (current_char == '\n');
        if (stepping_over_newline)
        {
            current_line_text = previous_line_text;
        }
        current_char = current_lexed_text[index];
        return 1;
    }
    return 0;
}
std::string Lexer::recalculate_current_line(const std::string& text)
{
int idx = index;
std::string final_str;
++idx;
while (idx != text.size() && text[idx] != '\n')
{
final_str += text[idx];
++idx;
}
return final_str;
}
// Factory: returns a heap-allocated Lexer for `fname`, owned by a unique_ptr.
std::unique_ptr<Lexer> Lexer::make_lexer(const std::string& fname)
{
    // Plain `new` instead of std::make_unique because the constructor is
    // private and therefore not callable from the standard helper.
    std::unique_ptr<Lexer> lexer(new Lexer(fname));
    return lexer;
}
// Sets the `parent` of `lexer`'s current location to `loc`.
void Lexer::assign_parent_location(std::unique_ptr<Lexer>& lexer, const std::shared_ptr<Location>& loc)
{
    auto& lexer_location = lexer->loc;
    lexer_location.parent = loc;
}
// Returns true when `character` occurs anywhere in `string`.
bool Lexer::is_in_string(const std::string& string, const char& character)
{
    for (const char candidate : string)
    {
        if (candidate == character)
        {
            return true;
        }
    }
    return false;
}
// Splits `text` into tokens and returns them, always terminated by a TT_EOF
// token.
//
// Whitespace and `//` line comments are skipped. Identifiers, numbers and
// single-quoted strings are delegated to create_identifier(), create_number()
// and create_string(); every other recognised character becomes one
// payload-less token. An unrecognised character is reported through
// Error::throw_error.
TokenStream Lexer::lex(const std::string& text)
{
    using TokenKind = decltype(TT_EOF);

    TokenStream tokens;

    // Appends a token of `kind` located at the current position, tagged with
    // the text of the current line.
    const auto emit = [this, &tokens](TokenKind kind) {
        tokens.push_back(Token::make_with_line({kind, loc}, current_line_text));
    };

    current_lexed_text = text;
    current_line_text = recalculate_current_line(current_lexed_text);

    bool in_comment = false;
    while (advance())
    {
        // A line comment ends at the newline that follows it.
        if (current_char == '\n')
        {
            in_comment = false;
        }
        if (in_comment || is_in_string(WHITESPACE, current_char))
        {
            continue;
        }

        if (is_in_string(LETTERS, current_char))
        {
            tokens.push_back(create_identifier());
            continue;
        }
        if (is_in_string(DIGITS, current_char))
        {
            tokens.push_back(create_number());
            continue;
        }
        if (current_char == '\'')
        {
            tokens.push_back(create_string());
            continue;
        }

        switch (current_char)
        {
        case '/': {
            // "//" opens a comment; a lone '/' is the division operator.
            const bool opens_comment =
                index + 1 != current_lexed_text.size() && current_lexed_text[index + 1] == '/';
            if (opens_comment)
            {
                in_comment = true;
            }
            else
            {
                emit(TT_Div);
            }
            break;
        }
        case '+': emit(TT_Plus); break;
        case '-': emit(TT_Minus); break;
        case '*': emit(TT_Mul); break;
        case '@': emit(TT_At); break;
        case '=': emit(TT_Equal); break;
        case '>': emit(TT_GreaterThan); break;
        case '<': emit(TT_LessThan); break;
        case '(': emit(TT_LParen); break;
        case ')': emit(TT_RParen); break;
        case '{': emit(TT_LBracket); break;
        case '}': emit(TT_RBracket); break;
        case ';': emit(TT_Semicolon); break;
        case '.': emit(TT_Period); break;
        case ',': emit(TT_Comma); break;
        case '!': emit(TT_Exclamation); break;
        // NOTE(review): '[' and ']' produce TT_Exclamation, the same token as
        // '!'. This looks like a copy-paste slip; confirm which token types
        // the square brackets are supposed to map to.
        case '[': emit(TT_Exclamation); break;
        case ']': emit(TT_Exclamation); break;
        case '\377':
            // Byte 0xFF is treated as an explicit end-of-input marker.
            tokens.push_back(Token(TT_EOF, loc));
            return tokens;
        default:
            Error::throw_error(loc, current_line_text, "unknown character");
        }
    }

    tokens.push_back(Token(TT_EOF, loc));
    return tokens;
}
// Lexes the word that starts at `current_char` and returns its token.
//
// The word is extended with every following character from IDENTIFIERS and
// with single '/' separators. Once it ends, it is classified in this order:
//   1. it contains a '/'            -> TT_Path (payload: the text)
//   2. it is listed in Lexer::types -> TT_Type (payload: the text)
//   3. it is a keyword              -> that keyword's token (no payload)
//   4. otherwise                    -> TT_Identifier (payload: the text)
// The token is located where the word started.
//
// When the word is ended by a non-word character the cursor is rewound so that
// character is seen again by the caller. Two consecutive slashes ("//") are
// not part of the word: both are handed back so that lex() sees them as the
// start of a comment. Previously only one slash was handed back (so the
// comment was lexed as a division followed by code) and the word was always
// reported as TT_Path, even when it contained no other '/'.
Token Lexer::create_identifier()
{
    using TokenKind = decltype(TT_Identifier);
    struct Keyword
    {
        const char* text;
        TokenKind kind;
    };
    static const Keyword keywords[] = {
        {"import", TT_Import},     {"syscall0", TT_Syscall0}, {"syscall1", TT_Syscall1},
        {"syscall2", TT_Syscall2}, {"syscall3", TT_Syscall3}, {"syscall4", TT_Syscall4},
        {"syscall5", TT_Syscall5}, {"compmacro", TT_CompilerMacro},
        {"let", TT_Let},           {"in", TT_In},
    };

    const int start_line = loc.line;
    const int start_column = loc.column;

    std::string identifier;
    identifier.push_back(current_char);

    // Location state as it was when the most recent '/' was read; needed to
    // undo that '/' if it turns out to be the first half of "//".
    Location saved_loc = this->loc;
    Location saved_prev_loc = this->prev_loc;
    bool last_was_slash = false;

    while (this->advance())
    {
        if (is_in_string(IDENTIFIERS, current_char))
        {
            identifier.push_back(current_char);
            last_was_slash = false;
            continue;
        }

        if (current_char != '/')
        {
            // First character after the word: give it back to the caller.
            this->rewind();
            break;
        }

        if (last_was_slash)
        {
            // "//": drop the slash already appended, restore the location
            // state from when it was read, then step back over both slashes so
            // the next advance() lands on the first one.
            identifier.pop_back();
            this->loc = saved_loc;
            this->prev_loc = saved_prev_loc;
            this->rewind();
            this->rewind();
            break;
        }

        saved_loc = this->loc;
        saved_prev_loc = this->prev_loc;
        identifier.push_back(current_char);
        last_was_slash = true;
    }

    // '/' is not in IDENTIFIERS, so it can only be present as a path separator.
    if (identifier.find('/') != std::string::npos)
    {
        return Token::make_with_line({TT_Path, identifier, {start_line, start_column, loc.fname}},
                                     current_line_text);
    }

    if (std::find(types.begin(), types.end(), identifier) != types.end())
    {
        return Token::make_with_line({TT_Type, identifier, {start_line, start_column, loc.fname}},
                                     current_line_text);
    }

    for (const Keyword& keyword : keywords)
    {
        if (identifier == keyword.text)
        {
            return Token::make_with_line({keyword.kind, {start_line, start_column, loc.fname}},
                                         current_line_text);
        }
    }

    return Token::make_with_line({TT_Identifier, identifier, {start_line, start_column, loc.fname}},
                                 current_line_text);
}
// Lexes the numeric literal that starts at `current_char`.
//
// Digits are consumed greedily and at most one '.' is accepted. A literal that
// contains a '.' becomes a TT_Float (parsed with std::stof), anything else a
// TT_Number (parsed with atoi). A second '.' produces a warning and ends the
// literal in front of it. The character that ended the literal is handed back
// to the caller via rewind(). The token is located where the literal started.
Token Lexer::create_number()
{
    const int start_line = loc.line;
    const int start_column = loc.column;

    std::string literal;
    literal.push_back(current_char);
    bool has_dot = false;

    while (this->advance())
    {
        if (is_in_string(DIGITS, current_char))
        {
            literal.push_back(current_char);
            continue;
        }

        const bool is_dot = (current_char == '.');
        if (is_dot && !has_dot)
        {
            literal.push_back(current_char);
            has_dot = true;
            continue;
        }

        if (is_dot)
        {
            // Second '.': warn, then finish the float that was read so far.
            Error::throw_warning(loc, current_line_text, "floats can only have one dot");
        }
        this->rewind();
        break;
    }

    if (has_dot)
    {
        float tk_value = std::stof(literal);
        return Token::make_with_line({TT_Float, tk_value, {start_line, start_column, loc.fname}},
                                     current_line_text);
    }

    int tk_value = atoi(literal.c_str());
    return Token::make_with_line({TT_Number, tk_value, {start_line, start_column, loc.fname}},
                                 current_line_text);
}
// Lexes a single-quoted string literal; `current_char` is the opening quote.
//
// Returns a TT_String token, located at the opening quote, whose payload is
// the text between the quotes with the escapes \n, \' and \\ decoded. A raw
// newline, the end of input, an unknown escape or a trailing backslash is
// reported through Error::throw_error.
Token Lexer::create_string()
{
    const int start_line = loc.line;
    const int start_column = loc.column;
    std::string contents;

    while (this->advance())
    {
        if (current_char == '\n')
        {
            this->rewind();
            Error::throw_error(loc, current_line_text, "expected end of string but got newline");
        }

        if (current_char == '\'')
        {
            return Token::make_with_line({TT_String, contents, {start_line, start_column, loc.fname}},
                                         current_line_text);
        }

        if (current_char != '\\')
        {
            contents.push_back(current_char);
            continue;
        }

        // Escape sequence: decode it from the character after the backslash.
        if (index + 1 == current_lexed_text.size())
        {
            Error::throw_error(loc, current_line_text, "unfinished escape sequence");
        }

        const char escaped = current_lexed_text[index + 1];
        if (escaped == 'n')
        {
            contents.push_back('\n');
        }
        else if (escaped == '\'')
        {
            contents.push_back('\'');
        }
        else if (escaped == '\\')
        {
            contents.push_back('\\');
        }
        else
        {
            Error::throw_error(loc, current_line_text, "unknown escape sequence");
        }

        // Skip the escaped character by hand; advance() is not used for it.
        ++index;
        ++loc.column;
    }

    this->rewind();
    Error::throw_error(loc, current_line_text, "expected end of string but got EOF");
    return Token(TT_Null, loc); // unreachable since Error::throw_error calls exit()
}