#include "Lexer.h"
#include "Error.h"

// NOTE(review): the name of one standard include was lost in the reviewed
// text; <algorithm> is restored because this file calls std::find.
#include <algorithm>
#include <array>
#include <cstddef>
#include <cstdlib>
#include <memory>
#include <string>
#include <vector>

namespace {

// Character classes used by the scanner. They are std::string constants so
// that is_in_string() does not build a temporary string for every character.
const std::string kWhitespace = "\t \n";
// First character of an identifier. (The previous table was missing 'X'.)
const std::string kLetters =
    "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_";
// Any subsequent character of an identifier. (Also was missing 'X'.)
const std::string kIdentifierChars =
    "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_0123456789";
const std::string kDigits = "0123456789";

} // namespace

// Names that are lexed as TT_Type instead of TT_Identifier.
// NOTE(review): the template arguments were missing in the reviewed text;
// <std::string, 14> is assumed - confirm against the declaration in Lexer.h.
const std::array<std::string, 14> Lexer::types = {
    "void", "bool", "str",
    "i8", "i16", "i32", "i64",
    "u8", "u16", "u32", "u64",
    "f32", "f64", "f128"};

/// Creates a lexer for the file `fname`. Both the current and the previous
/// location start at line 1, column 0, and the index starts at -1, i.e.
/// before the first character.
Lexer::Lexer(const std::string& fname)
    : loc(1, 0, fname), index(-1), prev_loc(1, 0, fname)
{
}

Lexer::~Lexer()
{
}

/// Moves to the next character of the text being lexed.
///
/// Updates `index`, `loc`, `prev_loc` and `current_char`; when the new
/// character is a newline the cached line texts are refreshed as well.
///
/// @return 1 if a character was read, 0 if the end of the text was reached.
int Lexer::advance()
{
    // Once the end has been reached, stay there. Previously every further
    // call kept incrementing `index`, so the next call read past the end of
    // the text (this happened whenever an identifier or a number was the very
    // last thing in the input).
    if (index != -1 &&
        static_cast<std::size_t>(index) >= current_lexed_text.size())
        return 0;

    prev_loc = loc;
    ++index;
    loc.advance();

    if (static_cast<std::size_t>(index) == current_lexed_text.size())
        return 0;

    current_char = current_lexed_text[index];
    loc.pos_from_char(current_char);

    if (current_char == '\n')
    {
        // A new line starts after this character: keep the old line so that
        // rewind() can restore it.
        previous_line_text = current_line_text;
        current_line_text = this->recalculate_current_line(current_lexed_text);
    }
    return 1;
}

/// Steps back by one character, undoing the most recent advance().
///
/// Only one step of history is kept (`prev_loc`, `previous_line_text`).
///
/// @return 1 on success, 0 if the index moved back before the first character.
int Lexer::rewind()
{
    loc = prev_loc;
    --index;
    if (index == -1)
        return 0;

    // Stepping back over a newline returns to the previous line.
    if (current_char == '\n')
        current_line_text = previous_line_text;

    current_char = current_lexed_text[index];
    return 1;
}

/// Returns the text between the character after `index` and the next newline
/// (or the end of `text`), without the newline itself.
std::string Lexer::recalculate_current_line(const std::string& text)
{
    const std::size_t start = static_cast<std::size_t>(index + 1);
    if (start >= text.size())
        return std::string();

    const std::size_t end = text.find('\n', start);
    if (end == std::string::npos)
        return text.substr(start);
    return text.substr(start, end - start);
}

/// Factory for lexers.
std::shared_ptr<Lexer> Lexer::make_lexer(const std::string& fname)
{
    // not using make_shared because the constructor is private
    return std::shared_ptr<Lexer>(new Lexer(fname));
}

/// Stores `loc` as the parent of the lexer's current location.
// NOTE(review): the template arguments were missing in the reviewed text;
// shared_ptr<Lexer> / shared_ptr<Location> are assumed - confirm against
// Lexer.h.
void Lexer::assign_parent_location(std::shared_ptr<Lexer>& lexer,
                                   const std::shared_ptr<Location>& loc)
{
    lexer->loc.parent = loc;
}

/// @return true if `character` occurs anywhere in `string`.
bool Lexer::is_in_string(const std::string& string, const char& character)
{
    return string.find(character) != std::string::npos;
}

/// Splits `text` into tokens.
///
/// Whitespace is skipped and `//` starts a comment that lasts until the end
/// of the line. An unknown character is reported through Error::throw_error.
/// The returned stream always ends with a TT_EOF token.
TokenStream Lexer::lex(const std::string& text)
{
    TokenStream result;
    bool comment = false;

    current_lexed_text = text;
    current_line_text = this->recalculate_current_line(current_lexed_text);

    // Appends a token that consists of a single character.
    const auto push_simple = [&](decltype(TT_Div) type) {
        result.push_back(Token::make_with_line({type, loc}, current_line_text));
    };

    while (this->advance())
    {
        if (this->current_char == '\n')
            comment = false;
        if (comment)
            continue;
        if (is_in_string(kWhitespace, current_char))
            continue;

        if (is_in_string(kLetters, current_char))
        {
            result.push_back(create_identifier());
            continue;
        }
        if (is_in_string(kDigits, current_char))
        {
            result.push_back(create_number());
            continue;
        }
        if (current_char == '\'')
        {
            result.push_back(create_string());
            continue;
        }

        switch (current_char)
        {
        case '/':
        {
            const std::size_t next = static_cast<std::size_t>(index) + 1;
            if (next != current_lexed_text.size() &&
                current_lexed_text[next] == '/')
            {
                comment = true;
                break;
            }
            push_simple(TT_Div);
            break;
        }
        case '+': push_simple(TT_Plus); break;
        case '-': push_simple(TT_Minus); break;
        case '*': push_simple(TT_Mul); break;
        case '@': push_simple(TT_At); break;
        case '=': push_simple(TT_Equal); break;
        case '>': push_simple(TT_GreaterThan); break;
        case '<': push_simple(TT_LessThan); break;
        case '(': push_simple(TT_LParen); break;
        case ')': push_simple(TT_RParen); break;
        // NOTE(review): '{' maps to TT_RBracket and '}' to TT_LBracket, which
        // looks swapped. Left unchanged because the parser may rely on it -
        // confirm and fix both sides together.
        case '{': push_simple(TT_RBracket); break;
        case '}': push_simple(TT_LBracket); break;
        case ';': push_simple(TT_Semicolon); break;
        case '.': push_simple(TT_Period); break;
        case ',': push_simple(TT_Comma); break;
        case '!': push_simple(TT_Exclamation); break;
        // NOTE(review): '[' and ']' produce TT_Exclamation, which looks like a
        // copy-paste slip. Left unchanged because the intended token types are
        // not visible from this file - confirm.
        case '[': push_simple(TT_Exclamation); break;
        case ']': push_simple(TT_Exclamation); break;
        default:
            Error::throw_error(loc, current_line_text, "unknown character");
            break;
        }
    }

    result.push_back(Token(TT_EOF, loc));
    return result;
}

/// Lexes an identifier, keyword, type name or path starting at the current
/// character.
///
/// A word containing single '/' separators becomes a TT_Path. Otherwise the
/// word is matched against `types` (TT_Type), then against the keywords
/// (import, syscall0..syscall5, compmacro), and finally falls back to
/// TT_Identifier. The token carries the location of its first character.
Token Lexer::create_identifier()
{
    std::vector<char> characters;
    int prev_line = loc.line;
    int prev_column = loc.column;
    bool is_path = false;
    bool last_was_path = false;
    Location saved_loc = this->loc;
    Location saved_prev_loc = this->prev_loc;

    // Builds the token for the characters collected so far.
    const auto finish = [&](bool as_path) -> Token {
        std::string identifier(characters.begin(), characters.end());

        if (as_path)
            return Token::make_with_line(
                {TT_Path, identifier, {prev_line, prev_column, loc.fname}},
                current_line_text);

        if (std::find(types.begin(), types.end(), identifier) != types.end())
            return Token::make_with_line(
                {TT_Type, identifier, {prev_line, prev_column, loc.fname}},
                current_line_text);

        static const struct
        {
            const char* word;
            decltype(TT_Import) type;
        } keywords[] = {
            {"import", TT_Import},
            {"syscall0", TT_Syscall0},
            {"syscall1", TT_Syscall1},
            {"syscall2", TT_Syscall2},
            {"syscall3", TT_Syscall3},
            {"syscall4", TT_Syscall4},
            {"syscall5", TT_Syscall5},
            {"compmacro", TT_CompilerMacro},
        };
        for (const auto& keyword : keywords)
        {
            if (identifier == keyword.word)
                return Token::make_with_line(
                    {keyword.type, {prev_line, prev_column, loc.fname}},
                    current_line_text);
        }

        return Token::make_with_line(
            {TT_Identifier, identifier, {prev_line, prev_column, loc.fname}},
            current_line_text);
    };

    characters.push_back(current_char);
    while (this->advance())
    {
        if (is_in_string(kIdentifierChars, current_char))
        {
            characters.push_back(current_char);
            last_was_path = false;
            continue;
        }

        if (current_char == '/')
        {
            if (last_was_path)
            {
                // Two slashes in a row start a comment, so neither of them
                // belongs to this token. Drop the first one and step back to
                // the character in front of it, so that lex() sees "//" next.
                characters.pop_back();
                this->loc = saved_loc;
                this->prev_loc = saved_prev_loc;
                this->rewind();
                // rewind() restored `loc` to the character before the first
                // slash but moved `index` back by one only; move it once more
                // so that index and location agree again. (Without this the
                // comment was lexed as a division.) The index cannot become
                // negative: at least one identifier character precedes the
                // first slash.
                --index;
                current_char = current_lexed_text[index];

                // The removed slash must not turn a plain word into a path.
                const bool still_path =
                    std::find(characters.begin(), characters.end(), '/') !=
                    characters.end();
                return finish(still_path);
            }

            // Remember the state at the first slash in case a second one
            // follows immediately.
            saved_loc = this->loc;
            saved_prev_loc = this->prev_loc;
            characters.push_back(current_char);
            is_path = true;
            last_was_path = true;
            continue;
        }

        // Any other character ends the word; give it back to lex().
        this->rewind();
        return finish(is_path);
    }

    // End of input reached while reading the word.
    return finish(is_path);
}

/// Lexes a number starting at the current character.
///
/// Digits with at most one '.' are consumed. A literal containing a dot
/// becomes a TT_Float, otherwise a TT_Number. A second dot triggers a warning
/// and ends the literal in front of that dot.
Token Lexer::create_number()
{
    std::vector<char> characters;
    int prev_line = loc.line;
    int prev_column = loc.column;
    int dot_count = 0;

    // Converts the characters collected so far into a token.
    const auto finish = [&]() -> Token {
        const std::string literal(characters.begin(), characters.end());
        if (dot_count != 0)
        {
            float tk_value = std::stof(literal);
            return Token::make_with_line(
                {TT_Float, tk_value, {prev_line, prev_column, loc.fname}},
                current_line_text);
        }
        int tk_value = std::atoi(literal.c_str());
        return Token::make_with_line(
            {TT_Number, tk_value, {prev_line, prev_column, loc.fname}},
            current_line_text);
    };

    characters.push_back(current_char);
    while (this->advance())
    {
        if (is_in_string(kDigits, current_char))
        {
            characters.push_back(current_char);
            continue;
        }

        if (current_char == '.')
        {
            if (dot_count == 0)
            {
                characters.push_back(current_char);
                ++dot_count;
                continue;
            }
            Error::throw_warning(loc, current_line_text,
                                 "floats can only have one dot");
            this->rewind();
            return finish();
        }

        // Any other character ends the literal; give it back to lex().
        this->rewind();
        return finish();
    }

    // End of input reached while reading the literal.
    return finish();
}

/// Lexes a single-quoted string literal; the current character is the opening
/// quote.
///
/// Supported escape sequences are \n, \' and \\. A newline or the end of the
/// input before the closing quote, as well as an unknown or unfinished escape
/// sequence, is reported through Error::throw_error.
Token Lexer::create_string()
{
    std::vector<char> characters;
    int prev_line = loc.line;
    int prev_column = loc.column;

    while (this->advance())
    {
        if (current_char == '\n')
        {
            this->rewind();
            Error::throw_error(loc, current_line_text,
                               "expected end of string but got newline");
        }

        if (current_char == '\'')
        {
            std::string identifier(characters.begin(), characters.end());
            return Token::make_with_line(
                {TT_String, identifier, {prev_line, prev_column, loc.fname}},
                current_line_text);
        }

        if (current_char == '\\')
        {
            if (static_cast<std::size_t>(index) + 1 == current_lexed_text.size())
            {
                Error::throw_error(loc, current_line_text,
                                   "unfinished escape sequence");
            }
            switch (current_lexed_text[index + 1])
            {
            case 'n':
                characters.push_back('\n');
                break;
            case '\'':
                characters.push_back('\'');
                break;
            case '\\':
                characters.push_back('\\');
                break;
            default:
                Error::throw_error(loc, current_line_text,
                                   "unknown escape sequence");
            }
            // Skip the escaped character by hand; advance() is not used here
            // so that an escaped quote is not seen as the closing quote.
            ++index;
            ++loc.column;
            continue;
        }

        characters.push_back(current_char);
    }

    this->rewind();
    Error::throw_error(loc, current_line_text,
                       "expected end of string but got EOF");
    return Token(TT_Null, loc); // unreachable since Error::throw_error calls exit()
}