#include "Lexer.h" #include "Error.h" #include "utils.h" #define WHITESPACE "\t \n" #define LETTERS "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWYZ_" #define IDENTIFIERS "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWYZ_0123456789" #define DIGITS "0123456789" const std::array Lexer::types = {"void", "bool", "str", "i8", "i16", "i32", "i64", "u8", "u16", "u32", "u64", "f32", "f64", "f128"}; Lexer::Lexer(const std::string& fname) : loc(1, 0, fname), index(-1), prev_loc(1, 0, fname) { } Lexer::~Lexer() { } int Lexer::advance() { prev_loc = loc; ++index; loc.advance(); if (index >= current_lexed_text.size()) return 0; current_char = current_lexed_text[index]; loc.pos_from_char(current_char); if (current_char == '\n') { previous_line_text = current_line_text; current_line_text = this->recalculate_current_line(current_lexed_text); } return 1; } int Lexer::rewind() { loc = prev_loc; --index; if (index == -1) return 0; if (current_char == '\n') { current_line_text = previous_line_text; } current_char = current_lexed_text[index]; return 1; } std::string Lexer::recalculate_current_line(const std::string& text) { int idx = index; std::string final_str; ++idx; while (idx != text.size() && text[idx] != '\n') { final_str += text[idx]; ++idx; } return final_str; } std::unique_ptr Lexer::make_lexer(const std::string& fname) { return std::unique_ptr(new Lexer(fname)); // not using make_shared because the constructor is private } void Lexer::assign_parent_location(std::unique_ptr& lexer, const std::shared_ptr& loc) { lexer->loc.parent = loc; } bool Lexer::is_in_string(const std::string& string, const char& character) { return string.find(character) != std::string::npos; } TokenStream Lexer::lex(const std::string& text) { TokenStream result; bool comment = false; current_lexed_text = text; current_line_text = this->recalculate_current_line(current_lexed_text); while (this->advance()) { if (this->current_char == '\n') comment = false; if (comment) continue; if (is_in_string(WHITESPACE, current_char)) continue; else if (is_in_string(LETTERS, current_char)) { result.push_back(create_identifier()); } else if (is_in_string(DIGITS, current_char)) { result.push_back(create_number()); } else if (current_char == '\'') { result.push_back(create_string()); } else switch (current_char) { case '/': if (index + 1 != current_lexed_text.size()) { if (current_lexed_text[index + 1] == '/') { comment = true; break; } } result.push_back(Token::make_with_line({TT_Div, loc}, current_line_text)); break; case '+': result.push_back(Token::make_with_line({TT_Plus, loc}, current_line_text)); break; case '-': result.push_back(Token::make_with_line({TT_Minus, loc}, current_line_text)); break; case '*': result.push_back(Token::make_with_line({TT_Mul, loc}, current_line_text)); break; case '@': result.push_back(Token::make_with_line({TT_At, loc}, current_line_text)); break; case '=': result.push_back(Token::make_with_line({TT_Equal, loc}, current_line_text)); break; case '>': result.push_back(Token::make_with_line({TT_GreaterThan, loc}, current_line_text)); break; case '<': result.push_back(Token::make_with_line({TT_LessThan, loc}, current_line_text)); break; case '(': result.push_back(Token::make_with_line({TT_LParen, loc}, current_line_text)); break; case ')': result.push_back(Token::make_with_line({TT_RParen, loc}, current_line_text)); break; case '{': result.push_back(Token::make_with_line({TT_LBracket, loc}, current_line_text)); break; case '}': result.push_back(Token::make_with_line({TT_RBracket, loc}, current_line_text)); break; case ';': result.push_back(Token::make_with_line({TT_Semicolon, loc}, current_line_text)); break; case '.': result.push_back(Token::make_with_line({TT_Period, loc}, current_line_text)); break; case ',': result.push_back(Token::make_with_line({TT_Comma, loc}, current_line_text)); break; case '!': result.push_back(Token::make_with_line({TT_Exclamation, loc}, current_line_text)); break; case '[': result.push_back(Token::make_with_line({TT_Exclamation, loc}, current_line_text)); break; case ']': result.push_back(Token::make_with_line({TT_Exclamation, loc}, current_line_text)); break; case '\377': result.push_back(Token(TT_EOF, loc)); return result; default: Error::throw_error(loc, current_line_text, "unknown character"); } } result.push_back(Token(TT_EOF, loc)); return result; } Token Lexer::create_identifier() { std::vector characters; int prev_line = loc.line; int prev_column = loc.column; bool is_path = false; bool last_was_path = false; Location saved_loc = this->loc; Location saved_prev_loc = this->prev_loc; characters.push_back(current_char); while (this->advance()) { if (is_in_string(IDENTIFIERS, current_char)) { characters.push_back(current_char); last_was_path = false; } else if (current_char == '/') { if (last_was_path) { characters.pop_back(); this->loc = saved_loc; this->prev_loc = saved_prev_loc; this->rewind(); std::string identifier(characters.begin(), characters.end()); return Token::make_with_line({TT_Path, identifier, {prev_line, prev_column, loc.fname}}, current_line_text); } saved_loc = this->loc; saved_prev_loc = this->prev_loc; characters.push_back(current_char); is_path = true; last_was_path = true; } else { this->rewind(); std::string identifier(characters.begin(), characters.end()); if (is_path) return Token::make_with_line({TT_Path, identifier, {prev_line, prev_column, loc.fname}}, current_line_text); auto location = std::find(types.begin(), types.end(), identifier); if (location != types.end()) { return Token::make_with_line({TT_Type, identifier, {prev_line, prev_column, loc.fname}}, current_line_text); } if (identifier == "import") return Token::make_with_line({TT_Import, {prev_line, prev_column, loc.fname}}, current_line_text); if (identifier == "syscall0") return Token::make_with_line({TT_Syscall0, {prev_line, prev_column, loc.fname}}, current_line_text); if (identifier == "syscall1") return Token::make_with_line({TT_Syscall1, {prev_line, prev_column, loc.fname}}, current_line_text); if (identifier == "syscall2") return Token::make_with_line({TT_Syscall2, {prev_line, prev_column, loc.fname}}, current_line_text); if (identifier == "syscall3") return Token::make_with_line({TT_Syscall3, {prev_line, prev_column, loc.fname}}, current_line_text); if (identifier == "syscall4") return Token::make_with_line({TT_Syscall4, {prev_line, prev_column, loc.fname}}, current_line_text); if (identifier == "syscall5") return Token::make_with_line({TT_Syscall5, {prev_line, prev_column, loc.fname}}, current_line_text); if (identifier == "compmacro") return Token::make_with_line({TT_CompilerMacro, {prev_line, prev_column, loc.fname}}, current_line_text); if (identifier == "let") return Token::make_with_line({TT_Let, {prev_line, prev_column, loc.fname}}, current_line_text); if (identifier == "in") return Token::make_with_line({TT_In, {prev_line, prev_column, loc.fname}}, current_line_text); return Token::make_with_line({TT_Identifier, identifier, {prev_line, prev_column, loc.fname}}, current_line_text); } } std::string identifier(characters.begin(), characters.end()); if (is_path) return Token::make_with_line({TT_Path, identifier, {prev_line, prev_column, loc.fname}}, current_line_text); auto location = std::find(types.begin(), types.end(), identifier); if (location != types.end()) { return Token::make_with_line({TT_Type, identifier, {prev_line, prev_column, loc.fname}}, current_line_text); } if (identifier == "import") return Token::make_with_line({TT_Import, {prev_line, prev_column, loc.fname}}, current_line_text); if (identifier == "syscall0") return Token::make_with_line({TT_Syscall0, {prev_line, prev_column, loc.fname}}, current_line_text); if (identifier == "syscall1") return Token::make_with_line({TT_Syscall1, {prev_line, prev_column, loc.fname}}, current_line_text); if (identifier == "syscall2") return Token::make_with_line({TT_Syscall2, {prev_line, prev_column, loc.fname}}, current_line_text); if (identifier == "syscall3") return Token::make_with_line({TT_Syscall3, {prev_line, prev_column, loc.fname}}, current_line_text); if (identifier == "syscall4") return Token::make_with_line({TT_Syscall4, {prev_line, prev_column, loc.fname}}, current_line_text); if (identifier == "syscall5") return Token::make_with_line({TT_Syscall5, {prev_line, prev_column, loc.fname}}, current_line_text); if (identifier == "compmacro") return Token::make_with_line({TT_CompilerMacro, {prev_line, prev_column, loc.fname}}, current_line_text); if (identifier == "let") return Token::make_with_line({TT_Let, {prev_line, prev_column, loc.fname}}, current_line_text); if (identifier == "in") return Token::make_with_line({TT_In, {prev_line, prev_column, loc.fname}}, current_line_text); return Token::make_with_line({TT_Identifier, identifier, {prev_line, prev_column, loc.fname}}, current_line_text); } Token Lexer::create_number() { std::vector characters; int prev_line = loc.line; int prev_column = loc.column; int dot_count = 0; characters.push_back(current_char); while (this->advance()) { if (is_in_string(DIGITS, current_char)) { characters.push_back(current_char); } else if (current_char == '.') { if (dot_count == 0) { characters.push_back(current_char); ++dot_count; } else { Error::throw_warning(loc, current_line_text, "floats can only have one dot"); this->rewind(); float tk_value = std::stof(std::string(characters.begin(), characters.end()).c_str()); return Token::make_with_line({TT_Float, tk_value, {prev_line, prev_column, loc.fname}}, current_line_text); } } else { this->rewind(); if (dot_count != 0) { float tk_value = std::stof(std::string(characters.begin(), characters.end()).c_str()); return Token::make_with_line({TT_Float, tk_value, {prev_line, prev_column, loc.fname}}, current_line_text); } int tk_value = atoi(std::string(characters.begin(), characters.end()).c_str()); return Token::make_with_line({TT_Number, tk_value, {prev_line, prev_column, loc.fname}}, current_line_text); } } if (dot_count != 0) { float tk_value = std::stof(std::string(characters.begin(), characters.end()).c_str()); return Token::make_with_line({TT_Float, tk_value, {prev_line, prev_column, loc.fname}}, current_line_text); } int tk_value = atoi(std::string(characters.begin(), characters.end()).c_str()); return Token::make_with_line({TT_Number, tk_value, {prev_line, prev_column, loc.fname}}, current_line_text); } Token Lexer::create_string() { std::vector characters; int prev_line = loc.line; int prev_column = loc.column; while (this->advance()) { if (current_char == '\n') { this->rewind(); Error::throw_error(loc, current_line_text, "expected end of string but got newline"); } if (current_char == '\'') { std::string identifier(characters.begin(), characters.end()); return Token::make_with_line({TT_String, identifier, {prev_line, prev_column, loc.fname}}, current_line_text); } if (current_char == '\\') { if (index + 1 == current_lexed_text.size()) { Error::throw_error(loc, current_line_text, "unfinished escape sequence"); } switch (current_lexed_text[index + 1]) { case 'n': characters.push_back('\n'); break; case '\'': characters.push_back('\''); break; case '\\': characters.push_back('\\'); break; default: Error::throw_error(loc, current_line_text, "unknown escape sequence"); } ++index; ++loc.column; continue; } characters.push_back(current_char); } this->rewind(); Error::throw_error(loc, current_line_text, "expected end of string but got EOF"); return Token(TT_Null, loc); // unreachable since Error::throw_error calls exit() }