// 360 lines, 13 KiB, C++
#include "Lexer.h"
#include "Error.h"

#include <algorithm>
#include <stdexcept>
#include <string>
|
|
|
|
// Character classes used by the lexer; each macro expands to a string literal
// that is searched with Lexer::is_in_string.

// Characters skipped between tokens.
#define WHITESPACE "\t \n"

// Characters that may start an identifier. The uppercase range previously
// lacked 'X', which made identifiers containing it lex as "unknown character".
#define LETTERS "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_"

// Characters that form numeric literals.
#define DIGITS "0123456789"

// Characters that may continue an identifier. Built from LETTERS and DIGITS
// (adjacent string literals are concatenated) so the sets cannot drift apart.
#define IDENTIFIERS LETTERS DIGITS
|
|
|
|
// Names of the built-in types. create_identifier() emits TT_Type for an
// identifier that matches one of these entries.
const std::array<std::string,TYPE_COUNT> Lexer::types = {
    "void", "bool", "str",
    "i8", "i16", "i32", "i64",
    "u8", "u16", "u32", "u64",
    "f32", "f64", "f128"
};
|
|
|
|
// Creates a lexer for the file called `fname`. Both the current and the
// previous location start at (1, 0) in that file, and the read index starts
// at -1 so that the first advance() lands on character 0.
Lexer::Lexer(const std::string& fname)
    : loc(1, 0, fname),
      index(-1),
      prev_loc(1, 0, fname)
{
}
|
|
|
|
// Nothing to release explicitly; the members clean up after themselves.
Lexer::~Lexer() = default;
|
|
|
|
int Lexer::advance()
|
|
{
|
|
prev_loc = loc;
|
|
++index;
|
|
loc.advance();
|
|
if(index == current_lexed_text.size()) return 0;
|
|
current_char = current_lexed_text[index];
|
|
loc.pos_from_char(current_char);
|
|
if(current_char == '\n')
|
|
{
|
|
previous_line_text = current_line_text;
|
|
current_line_text = this->recalculate_current_line(current_lexed_text);
|
|
}
|
|
return 1;
|
|
}
|
|
|
|
int Lexer::rewind()
|
|
{
|
|
loc = prev_loc;
|
|
--index;
|
|
if(index == -1) return 0;
|
|
if(current_char == '\n')
|
|
{
|
|
current_line_text = previous_line_text;
|
|
}
|
|
current_char = current_lexed_text[index];
|
|
return 1;
|
|
}
|
|
|
|
std::string Lexer::recalculate_current_line(const std::string& text)
|
|
{
|
|
int idx = index;
|
|
std::string final_str;
|
|
++idx;
|
|
while(idx != text.size() && text[idx] != '\n')
|
|
{
|
|
final_str += text[idx];
|
|
++idx;
|
|
}
|
|
return final_str;
|
|
}
|
|
|
|
std::shared_ptr<Lexer> Lexer::make_lexer(const std::string& fname)
|
|
{
|
|
return std::shared_ptr<Lexer>(new Lexer(fname)); // not using make_shared because the constructor is private
|
|
}
|
|
|
|
void Lexer::assign_parent_location(std::shared_ptr<Lexer>& lexer, const std::shared_ptr<Location>& loc)
|
|
{
|
|
lexer->loc.parent = loc;
|
|
}
|
|
|
|
// Returns true when `character` occurs anywhere in `string`.
bool Lexer::is_in_string(const std::string& string, const char& character)
{
    const auto hit = std::find(string.begin(), string.end(), character);
    return hit != string.end();
}
|
|
|
|
// Converts `text` into a stream of tokens.
//
// Walks the text one character at a time. Whitespace and everything between
// "//" and the end of the line are skipped; identifiers, numbers and
// single-quoted strings are delegated to create_identifier(), create_number()
// and create_string(); every other recognised character becomes a
// one-character token. An unrecognised character is reported through
// Error::throw_error. The returned stream always ends with a TT_EOF token.
TokenStream Lexer::lex(const std::string& text)
{
    TokenStream tokens;
    bool in_comment = false;

    current_lexed_text = text;
    current_line_text = this->recalculate_current_line(current_lexed_text);

    while(this->advance())
    {
        // A line comment ends at the newline that follows it.
        if(this->current_char == '\n') in_comment = false;
        if(in_comment) continue;

        if(is_in_string(WHITESPACE,current_char)) continue;

        if(is_in_string(LETTERS,current_char))
        {
            tokens.push_back(create_identifier());
            continue;
        }

        if(is_in_string(DIGITS,current_char))
        {
            tokens.push_back(create_number());
            continue;
        }

        if(current_char == '\'')
        {
            tokens.push_back(create_string());
            continue;
        }

        switch(current_char)
        {
            case '/':
            {
                // "//" opens a line comment; a lone '/' is the division operator.
                const bool next_is_slash =
                    index + 1 != current_lexed_text.size() && current_lexed_text[index+1] == '/';
                if(next_is_slash)
                {
                    in_comment = true;
                }
                else
                {
                    tokens.push_back(Token::make_with_line({TT_Div,loc},current_line_text));
                }
                break;
            }

            // Operators.
            case '+': tokens.push_back(Token::make_with_line({TT_Plus,loc},current_line_text)); break;
            case '-': tokens.push_back(Token::make_with_line({TT_Minus,loc},current_line_text)); break;
            case '*': tokens.push_back(Token::make_with_line({TT_Mul,loc},current_line_text)); break;
            case '@': tokens.push_back(Token::make_with_line({TT_At,loc},current_line_text)); break;
            case '=': tokens.push_back(Token::make_with_line({TT_Equal,loc},current_line_text)); break;
            case '>': tokens.push_back(Token::make_with_line({TT_GreaterThan,loc},current_line_text)); break;
            case '<': tokens.push_back(Token::make_with_line({TT_LessThan,loc},current_line_text)); break;
            case '!': tokens.push_back(Token::make_with_line({TT_Exclamation,loc},current_line_text)); break;

            // Grouping.
            case '(': tokens.push_back(Token::make_with_line({TT_LParen,loc},current_line_text)); break;
            case ')': tokens.push_back(Token::make_with_line({TT_RParen,loc},current_line_text)); break;

            // NOTE(review): '{' yields TT_RBracket and '}' yields TT_LBracket,
            // the reverse of the paren naming. Kept as-is because the parser
            // may rely on it; confirm before swapping.
            case '{': tokens.push_back(Token::make_with_line({TT_RBracket,loc},current_line_text)); break;
            case '}': tokens.push_back(Token::make_with_line({TT_LBracket,loc},current_line_text)); break;

            // Punctuation.
            case ';': tokens.push_back(Token::make_with_line({TT_Semicolon,loc},current_line_text)); break;
            case '.': tokens.push_back(Token::make_with_line({TT_Period,loc},current_line_text)); break;
            case ',': tokens.push_back(Token::make_with_line({TT_Comma,loc},current_line_text)); break;

            // NOTE(review): '[' and ']' both produce TT_Exclamation, which looks
            // like a placeholder; dedicated token types are probably intended.
            case '[': tokens.push_back(Token::make_with_line({TT_Exclamation,loc},current_line_text)); break;
            case ']': tokens.push_back(Token::make_with_line({TT_Exclamation,loc},current_line_text)); break;

            default:
                Error::throw_error(loc,current_line_text,"unknown character");
        }
    }

    tokens.push_back(Token(TT_EOF,loc));

    return tokens;
}
|
|
|
|
// Lexes the word that starts at current_char and returns its token.
//
// Consumes identifier characters and '/' separators. The result is:
//   - TT_Path        when the word contains at least one '/',
//   - TT_Type        when it matches an entry of Lexer::types,
//   - a keyword token for "import", "syscall0".."syscall5" and "compmacro",
//   - TT_Identifier  otherwise.
// The token is positioned at the location of the word's first character.
//
// Two consecutive slashes end the path: the first slash is dropped from the
// path and the lexer is moved back to the character before it, so that lex()
// sees the whole "//" again and can treat it as a comment.
Token Lexer::create_identifier()
{
    int prev_line = loc.line;
    int prev_column = loc.column;
    bool is_path = false;
    bool last_was_slash = false;

    // Location state captured at the most recent '/', used to back out of "//".
    Location slash_loc = this->loc;
    Location slash_prev_loc = this->prev_loc;

    std::string identifier(1, current_char);

    // Turns the text collected so far into the matching token. Shared by every
    // exit path so the keyword table exists only once.
    const auto finish = [&]() -> Token
    {
        if(is_path) return Token::make_with_line({TT_Path,identifier,{prev_line,prev_column,loc.fname}},current_line_text);

        if(std::find(types.begin(),types.end(),identifier) != types.end())
        {
            return Token::make_with_line({TT_Type,identifier,{prev_line,prev_column,loc.fname}},current_line_text);
        }

        if(identifier == "import") return Token::make_with_line({TT_Import,{prev_line,prev_column,loc.fname}},current_line_text);
        if(identifier == "syscall0") return Token::make_with_line({TT_Syscall0,{prev_line,prev_column,loc.fname}},current_line_text);
        if(identifier == "syscall1") return Token::make_with_line({TT_Syscall1,{prev_line,prev_column,loc.fname}},current_line_text);
        if(identifier == "syscall2") return Token::make_with_line({TT_Syscall2,{prev_line,prev_column,loc.fname}},current_line_text);
        if(identifier == "syscall3") return Token::make_with_line({TT_Syscall3,{prev_line,prev_column,loc.fname}},current_line_text);
        if(identifier == "syscall4") return Token::make_with_line({TT_Syscall4,{prev_line,prev_column,loc.fname}},current_line_text);
        if(identifier == "syscall5") return Token::make_with_line({TT_Syscall5,{prev_line,prev_column,loc.fname}},current_line_text);
        if(identifier == "compmacro") return Token::make_with_line({TT_CompilerMacro,{prev_line,prev_column,loc.fname}},current_line_text);

        return Token::make_with_line({TT_Identifier,identifier,{prev_line,prev_column,loc.fname}},current_line_text);
    };

    while(this->advance())
    {
        if(is_in_string(IDENTIFIERS,current_char))
        {
            identifier.push_back(current_char);
            last_was_slash = false;
            continue;
        }

        if(current_char != '/')
        {
            // First character that does not belong to the word: give it back.
            this->rewind();
            return finish();
        }

        if(last_was_slash)
        {
            // "//" right after the path. Drop the first slash from the path and
            // return to the state at that slash ...
            identifier.pop_back();
            this->loc = slash_loc;
            this->prev_loc = slash_prev_loc;
            // ... including the index, which the restore above does not cover.
            // Without this step the index stayed one ahead of loc and lex()
            // only saw the second '/', emitting TT_Div instead of a comment.
            --index;
            // Then step back once more so the next advance() lands on the first '/'.
            this->rewind();
            return finish();
        }

        slash_loc = this->loc;
        slash_prev_loc = this->prev_loc;

        identifier.push_back(current_char);
        is_path = true;
        last_was_slash = true;
    }

    // End of input reached while still inside the word.
    return finish();
}
|
|
|
|
// Lexes the numeric literal that starts at current_char.
//
// Consumes digits and at most one '.'. A literal containing a dot becomes a
// TT_Float token, otherwise a TT_Number token; either is positioned at the
// literal's first character. A second dot triggers a warning and ends the
// literal before that dot. The first character that is not part of the
// literal is handed back with rewind().
//
// A literal whose value does not fit the target type is reported through
// Error::throw_error rather than being converted with atoi (undefined on
// overflow) or escaping as an uncaught std::out_of_range from std::stof.
Token Lexer::create_number()
{
    int prev_line = loc.line;
    int prev_column = loc.column;
    bool has_dot = false;

    std::string literal(1, current_char);

    // Reports an out-of-range literal at the position where it starts.
    const auto out_of_range = [&]() -> Token
    {
        Location start(prev_line, prev_column, loc.fname);
        Error::throw_error(start,current_line_text,"number literal is out of range");
        return Token(TT_Null,loc); // only reached if throw_error returns
    };

    // Converts the collected text into the matching token.
    const auto finish = [&]() -> Token
    {
        if(has_dot)
        {
            float tk_value = 0.0f;
            try
            {
                tk_value = std::stof(literal);
            }
            catch(const std::out_of_range&)
            {
                return out_of_range();
            }
            return Token::make_with_line({TT_Float,tk_value,{prev_line,prev_column,loc.fname}},current_line_text);
        }

        int tk_value = 0;
        try
        {
            tk_value = std::stoi(literal);
        }
        catch(const std::out_of_range&)
        {
            return out_of_range();
        }
        return Token::make_with_line({TT_Number,tk_value,{prev_line,prev_column,loc.fname}},current_line_text);
    };

    while(this->advance())
    {
        if(is_in_string(DIGITS,current_char))
        {
            literal.push_back(current_char);
            continue;
        }

        if(current_char == '.')
        {
            if(!has_dot)
            {
                literal.push_back(current_char);
                has_dot = true;
                continue;
            }

            // Second dot: warn, leave it for the caller and stop here.
            Error::throw_warning(loc,current_line_text,"floats can only have one dot");
            this->rewind();
            return finish();
        }

        // Any other character ends the literal.
        this->rewind();
        return finish();
    }

    // End of input reached while still inside the literal.
    return finish();
}
|
|
|
|
// Lexes a single-quoted string literal; current_char is the opening quote.
//
// Collects characters up to the closing quote and returns a TT_String token
// positioned at the opening quote. The escapes \n, \' and \\ are decoded;
// any other escape, an escape cut off by the end of input, a newline inside
// the literal and a missing closing quote are reported via Error::throw_error.
Token Lexer::create_string()
{
    int prev_line = loc.line;
    int prev_column = loc.column;
    std::string identifier;

    while(this->advance())
    {
        if(current_char == '\n')
        {
            // Step back so the error points at the last character of the line.
            this->rewind();
            Error::throw_error(loc,current_line_text,"expected end of string but got newline");
        }

        if(current_char == '\'')
        {
            return Token::make_with_line({TT_String,identifier,{prev_line,prev_column,loc.fname}},current_line_text);
        }

        if(current_char == '\\')
        {
            const bool backslash_is_last = (index + 1 == current_lexed_text.size());
            if(backslash_is_last)
            {
                Error::throw_error(loc,current_line_text,"unfinished escape sequence");
            }

            const char escaped = current_lexed_text[index+1];
            if(escaped == 'n')
            {
                identifier.push_back('\n');
            }
            else if(escaped == '\'')
            {
                identifier.push_back('\'');
            }
            else if(escaped == '\\')
            {
                identifier.push_back('\\');
            }
            else
            {
                Error::throw_error(loc,current_line_text,"unknown escape sequence");
            }

            // Skip the escaped character by hand, without going through advance().
            ++index;
            ++loc.column;
            continue;
        }

        identifier.push_back(current_char);
    }

    this->rewind();
    Error::throw_error(loc,current_line_text,"expected end of string but got EOF");

    return Token(TT_Null,loc); // unreachable since Error::throw_error calls exit()
}
|