400 lines
15 KiB
C++
400 lines
15 KiB
C++
#include "Lexer.h"

#include <stdexcept>
#include <string>

#include "Error.h"
#include "utils.h"
|
|
|
|
// Character classes used by the lexer. Each macro expands to a string literal
// that is searched with Lexer::is_in_string().

// Characters skipped between tokens.
#define WHITESPACE "\t \n"
// Characters that may start an identifier (ASCII letters and underscore).
#define LETTERS "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_"
// Characters that may continue an identifier (LETTERS plus decimal digits).
#define IDENTIFIERS "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_0123456789"
// Characters that may appear in a numeric literal.
#define DIGITS "0123456789"
|
|
|
|
// Spellings of the built-in type names. create_identifier() looks a lexed word
// up in this table and emits TT_Type when it is found.
const std::array<std::string, TYPE_COUNT> Lexer::types = {
    "void", "bool", "str",
    "i8",   "i16",  "i32", "i64",
    "u8",   "u16",  "u32", "u64",
    "f32",  "f64",  "f128"};
|
|
|
|
// Creates a lexer whose locations refer to the file named `fname`.
// `index` starts at -1 so that the first advance() lands on character 0.
Lexer::Lexer(const std::string& fname)
    : loc(1, 0, fname),
      index(-1),
      prev_loc(1, 0, fname)
{
}
|
|
|
|
// Nothing to release by hand: every member cleans up after itself.
Lexer::~Lexer() = default;
|
|
|
|
int Lexer::advance()
|
|
{
|
|
prev_loc = loc;
|
|
++index;
|
|
loc.advance();
|
|
if (index >= current_lexed_text.size()) return 0;
|
|
current_char = current_lexed_text[index];
|
|
loc.pos_from_char(current_char);
|
|
if (current_char == '\n')
|
|
{
|
|
previous_line_text = current_line_text;
|
|
current_line_text = this->recalculate_current_line(current_lexed_text);
|
|
}
|
|
return 1;
|
|
}
|
|
|
|
int Lexer::rewind()
|
|
{
|
|
loc = prev_loc;
|
|
--index;
|
|
if (index == -1) return 0;
|
|
if (current_char == '\n')
|
|
{
|
|
current_line_text = previous_line_text;
|
|
}
|
|
current_char = current_lexed_text[index];
|
|
return 1;
|
|
}
|
|
|
|
std::string Lexer::recalculate_current_line(const std::string& text)
|
|
{
|
|
int idx = index;
|
|
std::string final_str;
|
|
++idx;
|
|
while (idx != text.size() && text[idx] != '\n')
|
|
{
|
|
final_str += text[idx];
|
|
++idx;
|
|
}
|
|
return final_str;
|
|
}
|
|
|
|
std::unique_ptr<Lexer> Lexer::make_lexer(const std::string& fname)
|
|
{
|
|
return std::unique_ptr<Lexer>(new Lexer(fname)); // not using make_shared because the constructor is private
|
|
}
|
|
|
|
void Lexer::assign_parent_location(std::unique_ptr<Lexer>& lexer, const std::shared_ptr<Location>& loc)
|
|
{
|
|
lexer->loc.parent = loc;
|
|
}
|
|
|
|
// Reports whether `character` occurs anywhere in `string`.
bool Lexer::is_in_string(const std::string& string, const char& character)
{
    const std::string::size_type position = string.find(character);
    const bool found = (position != std::string::npos);
    return found;
}
|
|
|
|
// Turns `text` into a stream of tokens.
//
// Whitespace and `//` line comments are skipped. Words, numbers and
// single-quoted strings are delegated to create_identifier(), create_number()
// and create_string(); every other recognised character becomes a
// single-character token. The stream always ends with a TT_EOF token, either
// when a '\377' character is met or when the text is exhausted. Any other
// character is reported through Error::throw_error.
TokenStream Lexer::lex(const std::string& text)
{
    TokenStream tokens;
    bool in_comment = false;

    current_lexed_text = text;
    current_line_text = this->recalculate_current_line(current_lexed_text);

    // Appends a payload-free token of the given kind, stamped with the current
    // location and the text of the current line.
    const auto emit = [&](decltype(TT_EOF) kind) {
        tokens.push_back(Token::make_with_line({kind, loc}, current_line_text));
    };

    while (this->advance())
    {
        // A line comment ends at the newline that follows it.
        if (this->current_char == '\n') in_comment = false;

        if (in_comment || is_in_string(WHITESPACE, current_char)) continue;

        if (is_in_string(LETTERS, current_char))
        {
            tokens.push_back(create_identifier());
            continue;
        }

        if (is_in_string(DIGITS, current_char))
        {
            tokens.push_back(create_number());
            continue;
        }

        if (current_char == '\'')
        {
            tokens.push_back(create_string());
            continue;
        }

        switch (current_char)
        {
            case '/':
            {
                // Two consecutive slashes open a comment; a lone slash is division.
                const bool starts_comment =
                    index + 1 != current_lexed_text.size() && current_lexed_text[index + 1] == '/';
                if (starts_comment)
                    in_comment = true;
                else
                    emit(TT_Div);
                break;
            }
            case '+': emit(TT_Plus); break;
            case '-': emit(TT_Minus); break;
            case '*': emit(TT_Mul); break;
            case '@': emit(TT_At); break;
            case '=': emit(TT_Equal); break;
            case '>': emit(TT_GreaterThan); break;
            case '<': emit(TT_LessThan); break;
            case '(': emit(TT_LParen); break;
            case ')': emit(TT_RParen); break;
            case '{': emit(TT_LBracket); break;
            case '}': emit(TT_RBracket); break;
            case ';': emit(TT_Semicolon); break;
            case '.': emit(TT_Period); break;
            case ',': emit(TT_Comma); break;
            case '!': emit(TT_Exclamation); break;
            // NOTE(review): '[' and ']' produce TT_Exclamation, exactly like '!'.
            // This looks like a copy-paste slip; confirm which token types the
            // square brackets are supposed to map to.
            case '[': emit(TT_Exclamation); break;
            case ']': emit(TT_Exclamation); break;
            case '\377':
                // Explicit end-of-input marker inside the text.
                tokens.push_back(Token(TT_EOF, loc));
                return tokens;
            default:
                Error::throw_error(loc, current_line_text, "unknown character");
                break;
        }
    }

    tokens.push_back(Token(TT_EOF, loc));
    return tokens;
}
|
|
|
|
// Lexes a word starting at the current character.
//
// The word consists of IDENTIFIERS characters and may contain '/' separators,
// in which case it is a path. The resulting token is, in order of precedence:
//   - TT_Path        if the word contains a '/',
//   - TT_Type        if the word is listed in Lexer::types,
//   - a keyword token (import, syscall0..5, compmacro, let, in),
//   - TT_Identifier  otherwise.
// The token carries the location of the word's first character. On return the
// cursor rests on the last character that belongs to the token, so the
// caller's next advance() reads the character that follows it.
//
// Two consecutive slashes are not part of the word: they start a line comment.
// In that case both slashes are handed back to the caller. Previously only the
// location was moved back by two characters while `index` moved back by one,
// so the comment was lexed as a division sign followed by more tokens, and the
// word was reported as TT_Path even when it contained no other '/'.
Token Lexer::create_identifier()
{
    std::vector<char> characters;
    int prev_line = loc.line;
    int prev_column = loc.column;
    bool is_path = false;
    bool last_was_path = false;
    // prev_loc as it was when the most recent '/' was read, i.e. the location
    // of the character in front of that slash.
    Location saved_prev_loc = this->prev_loc;

    // Builds the token for the characters collected so far.
    const auto finish = [&]() -> Token {
        std::string identifier(characters.begin(), characters.end());

        if (is_path)
            return Token::make_with_line({TT_Path, identifier, {prev_line, prev_column, loc.fname}},
                                         current_line_text);

        if (std::find(types.begin(), types.end(), identifier) != types.end())
            return Token::make_with_line({TT_Type, identifier, {prev_line, prev_column, loc.fname}},
                                         current_line_text);

        if (identifier == "import")
            return Token::make_with_line({TT_Import, {prev_line, prev_column, loc.fname}}, current_line_text);
        if (identifier == "syscall0")
            return Token::make_with_line({TT_Syscall0, {prev_line, prev_column, loc.fname}}, current_line_text);
        if (identifier == "syscall1")
            return Token::make_with_line({TT_Syscall1, {prev_line, prev_column, loc.fname}}, current_line_text);
        if (identifier == "syscall2")
            return Token::make_with_line({TT_Syscall2, {prev_line, prev_column, loc.fname}}, current_line_text);
        if (identifier == "syscall3")
            return Token::make_with_line({TT_Syscall3, {prev_line, prev_column, loc.fname}}, current_line_text);
        if (identifier == "syscall4")
            return Token::make_with_line({TT_Syscall4, {prev_line, prev_column, loc.fname}}, current_line_text);
        if (identifier == "syscall5")
            return Token::make_with_line({TT_Syscall5, {prev_line, prev_column, loc.fname}}, current_line_text);
        if (identifier == "compmacro")
            return Token::make_with_line({TT_CompilerMacro, {prev_line, prev_column, loc.fname}},
                                         current_line_text);
        if (identifier == "let")
            return Token::make_with_line({TT_Let, {prev_line, prev_column, loc.fname}}, current_line_text);
        if (identifier == "in")
            return Token::make_with_line({TT_In, {prev_line, prev_column, loc.fname}}, current_line_text);

        return Token::make_with_line({TT_Identifier, identifier, {prev_line, prev_column, loc.fname}},
                                     current_line_text);
    };

    characters.push_back(current_char);

    while (this->advance())
    {
        if (is_in_string(IDENTIFIERS, current_char))
        {
            characters.push_back(current_char);
            last_was_path = false;
            continue;
        }

        if (current_char != '/')
        {
            // First character that does not belong to the word: give it back.
            this->rewind();
            return finish();
        }

        if (last_was_path)
        {
            // "//" right after the word: drop the slash collected on the
            // previous iteration and decide again whether what is left is
            // still a path.
            characters.pop_back();
            is_path = std::find(characters.begin(), characters.end(), '/') != characters.end();

            // Hand both slashes back. rewind() undoes one step of `index` and
            // restores `loc` from `prev_loc`, so step `index` back once by hand
            // and point `prev_loc` at the character in front of the first slash.
            --index;
            this->prev_loc = saved_prev_loc;
            this->rewind();
            return finish();
        }

        saved_prev_loc = this->prev_loc;

        characters.push_back(current_char);
        is_path = true;
        last_was_path = true;
    }

    // The text ended while the word was still being read.
    return finish();
}
|
|
|
|
// Lexes a numeric literal starting at the current character.
//
// Consumes decimal digits and at most one '.'. A literal containing a dot
// becomes a TT_Float token, otherwise a TT_Number token; both carry the
// location of the literal's first character. A second dot ends the literal,
// triggers a warning, and is left for the caller to lex. On return the cursor
// rests on the last character that belongs to the literal.
//
// Literals that do not fit the token's value type are reported through
// Error::throw_error. The previous implementation used atoi(), whose behaviour
// is undefined for out-of-range input, and let std::stof's std::out_of_range
// escape uncaught.
Token Lexer::create_number()
{
    std::string digits(1, current_char);
    int prev_line = loc.line;
    int prev_column = loc.column;
    bool seen_dot = false;

    while (this->advance())
    {
        if (is_in_string(DIGITS, current_char))
        {
            digits += current_char;
            continue;
        }

        if (current_char == '.' && !seen_dot)
        {
            digits += current_char;
            seen_dot = true;
            continue;
        }

        if (current_char == '.')
            Error::throw_warning(loc, current_line_text, "floats can only have one dot");

        // The character is not part of the literal: give it back and stop.
        this->rewind();
        break;
    }

    float float_value = 0.0f;
    int int_value = 0;
    try
    {
        if (seen_dot)
            float_value = std::stof(digits);
        else
            int_value = std::stoi(digits);
    }
    catch (const std::out_of_range&)
    {
        Error::throw_error(loc, current_line_text, "numeric literal out of range");
        return Token(TT_Null, loc); // only reached if Error::throw_error returns
    }

    if (seen_dot)
        return Token::make_with_line({TT_Float, float_value, {prev_line, prev_column, loc.fname}},
                                     current_line_text);

    return Token::make_with_line({TT_Number, int_value, {prev_line, prev_column, loc.fname}}, current_line_text);
}
|
|
|
|
// Lexes a single-quoted string literal; the cursor is on the opening quote.
//
// Characters are collected up to the closing quote and returned as a TT_String
// token located at the opening quote. The escape sequences \n, \' and \\ are
// translated; any other escape, a raw newline inside the literal, or reaching
// the end of the text before the closing quote is reported through
// Error::throw_error.
Token Lexer::create_string()
{
    std::string contents;
    int start_line = loc.line;
    int start_column = loc.column;

    while (this->advance())
    {
        if (current_char == '\n')
        {
            // Step back so the error points at the line holding the string.
            this->rewind();
            Error::throw_error(loc, current_line_text, "expected end of string but got newline");
        }

        if (current_char == '\'')
            return Token::make_with_line({TT_String, contents, {start_line, start_column, loc.fname}},
                                         current_line_text);

        if (current_char != '\\')
        {
            contents += current_char;
            continue;
        }

        // Escape sequence: the character after the backslash selects the result.
        const bool backslash_is_last = (index + 1 == current_lexed_text.size());
        if (backslash_is_last) Error::throw_error(loc, current_line_text, "unfinished escape sequence");

        const char escaped = current_lexed_text[index + 1];
        if (escaped == 'n')
            contents += '\n';
        else if (escaped == '\'' || escaped == '\\')
            contents += escaped;
        else
            Error::throw_error(loc, current_line_text, "unknown escape sequence");

        // Skip the escaped character by hand; advance() is deliberately not
        // used here, so prev_loc is left as it is.
        ++index;
        ++loc.column;
    }

    this->rewind();
    Error::throw_error(loc, current_line_text, "expected end of string but got EOF");

    return Token(TT_Null, loc); // unreachable since Error::throw_error calls exit()
}
|