sapphire/src/Lexer.cpp
2022-06-07 18:12:43 +02:00

360 lines
13 KiB
C++

#include "Lexer.h"
#include "Error.h"
#include <algorithm>
// Character classes used by the lexer to classify the character under the cursor.
// Each macro expands to a string literal that is searched with Lexer::is_in_string().

// Characters that separate tokens and are otherwise ignored: tab, space, newline.
#define WHITESPACE "\t \n"
// Characters that may start an identifier: ASCII letters and underscore.
// (The uppercase 'X' was previously missing from this alphabet.)
#define LETTERS "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_"
// Characters that may continue an identifier: LETTERS plus the decimal digits.
#define IDENTIFIERS "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_0123456789"
// Decimal digits; a digit starts a numeric literal.
#define DIGITS "0123456789"
// Names of the built-in types. create_identifier() searches this table and emits a
// TT_Type token when an identifier matches one of the entries.
const std::array<std::string, TYPE_COUNT> Lexer::types = {
    "void", "bool", "str",
    "i8",   "i16",  "i32", "i64",
    "u8",   "u16",  "u32", "u64",
    "f32",  "f64",  "f128",
};
// Creates a lexer for the file named `fname`.
// Both the current and the previous location are constructed from (1, 0, fname), and the
// read index starts at -1 so that the first call to advance() lands on index 0.
Lexer::Lexer(const std::string& fname)
    : loc(1, 0, fname),
      index(-1),
      prev_loc(1, 0, fname)
{
}
// Nothing to release by hand: the destructor only runs the members' own destructors.
Lexer::~Lexer() = default;
// Moves the lexer forward by one character.
//
// The current location is saved in prev_loc, then the read index and loc are advanced.
// Returns 0 when the index is at or past the end of current_lexed_text (current_char is
// left untouched in that case); otherwise loads the new character into current_char and
// returns 1. When the new character is a newline, the text of the line that follows it is
// cached in current_line_text and the previous value is kept in previous_line_text.
int Lexer::advance()
{
    prev_loc = loc;
    ++index;
    loc.advance();
    // ">=" rather than "==": create_identifier() and create_number() may consume the last
    // character and return with index == size, after which lex() calls advance() once more.
    // With "==" that extra call slipped past the check and indexed beyond the buffer.
    if(static_cast<std::string::size_type>(index) >= current_lexed_text.size()) return 0;
    current_char = current_lexed_text[index];
    loc.pos_from_char(current_char);
    if(current_char == '\n')
    {
        previous_line_text = current_line_text;
        current_line_text = this->recalculate_current_line(current_lexed_text);
    }
    return 1;
}
// Undoes the most recent advance(): restores loc from prev_loc and steps the read index
// back by one. Returns 0 if that moves the index before the first character (-1), else 1.
// If the character being left is a newline, current_line_text is restored from
// previous_line_text before current_char is reloaded from the new index.
int Lexer::rewind()
{
    loc = prev_loc;
    index -= 1;
    if(index == -1)
    {
        return 0;
    }

    const bool leaving_newline = (current_char == '\n');
    if(leaving_newline)
    {
        current_line_text = previous_line_text;
    }

    current_char = current_lexed_text[index];
    return 1;
}
std::string Lexer::recalculate_current_line(const std::string& text)
{
int idx = index;
std::string final_str;
++idx;
while(idx != text.size() && text[idx] != '\n')
{
final_str += text[idx];
++idx;
}
return final_str;
}
// Factory: returns a shared_ptr owning a new Lexer for the file `fname`.
std::shared_ptr<Lexer> Lexer::make_lexer(const std::string& fname)
{
    // std::make_shared is not used because the constructor is private.
    std::shared_ptr<Lexer> created(new Lexer(fname));
    return created;
}
void Lexer::assign_parent_location(std::shared_ptr<Lexer>& lexer, const std::shared_ptr<Location>& loc)
{
lexer->loc.parent = loc;
}
// Returns true when `character` occurs anywhere in `string`.
bool Lexer::is_in_string(const std::string& string, const char& character)
{
    const std::string::const_iterator hit = std::find(string.begin(), string.end(), character);
    return hit != string.end();
}
// Tokenizes `text` and returns the resulting token stream, terminated by a TT_EOF token.
//
// Whitespace (see WHITESPACE) is skipped and "//" starts a comment that lasts until the
// next newline. Identifiers/keywords/paths, numbers and single-quoted strings are handed to
// create_identifier(), create_number() and create_string(); every other recognised
// character becomes a single-character token. Any other character is reported through
// Error::throw_error with the message "unknown character".
TokenStream Lexer::lex(const std::string& text)
{
    TokenStream result;
    bool comment = false;
    current_lexed_text = text;
    current_line_text = this->recalculate_current_line(current_lexed_text);
    while(this->advance())
    {
        // A newline always terminates a "//" comment.
        if(this->current_char == '\n') comment = false;
        if(comment) continue;
        if(is_in_string(WHITESPACE,current_char)) continue;
        else if(is_in_string(LETTERS,current_char))
        {
            result.push_back(create_identifier());
        }
        else if(is_in_string(DIGITS,current_char))
        {
            result.push_back(create_number());
        }
        else if(current_char == '\'')
        {
            result.push_back(create_string());
        }
        else switch(current_char)
        {
            case '/':
                // Peek one character ahead: "//" opens a comment, a lone '/' is division.
                if(index + 1 != current_lexed_text.size())
                {
                    if(current_lexed_text[index+1] == '/')
                    {
                        comment = true;
                        break;
                    }
                }
                result.push_back(Token::make_with_line({TT_Div,loc},current_line_text));
                break;
            case '+':
                result.push_back(Token::make_with_line({TT_Plus,loc},current_line_text));
                break;
            case '-':
                result.push_back(Token::make_with_line({TT_Minus,loc},current_line_text));
                break;
            case '*':
                result.push_back(Token::make_with_line({TT_Mul,loc},current_line_text));
                break;
            case '@':
                result.push_back(Token::make_with_line({TT_At,loc},current_line_text));
                break;
            case '=':
                result.push_back(Token::make_with_line({TT_Equal,loc},current_line_text));
                break;
            case '>':
                result.push_back(Token::make_with_line({TT_GreaterThan,loc},current_line_text));
                break;
            case '<':
                result.push_back(Token::make_with_line({TT_LessThan,loc},current_line_text));
                break;
            case '(':
                result.push_back(Token::make_with_line({TT_LParen,loc},current_line_text));
                break;
            case ')':
                result.push_back(Token::make_with_line({TT_RParen,loc},current_line_text));
                break;
            // The opening brace is the *left* bracket and the closing brace the *right* one
            // (these two token types used to be swapped).
            case '{':
                result.push_back(Token::make_with_line({TT_LBracket,loc},current_line_text));
                break;
            case '}':
                result.push_back(Token::make_with_line({TT_RBracket,loc},current_line_text));
                break;
            case ';':
                result.push_back(Token::make_with_line({TT_Semicolon,loc},current_line_text));
                break;
            case '.':
                result.push_back(Token::make_with_line({TT_Period,loc},current_line_text));
                break;
            case ',':
                result.push_back(Token::make_with_line({TT_Comma,loc},current_line_text));
                break;
            case '!':
                result.push_back(Token::make_with_line({TT_Exclamation,loc},current_line_text));
                break;
            // NOTE(review): '[' and ']' produce TT_Exclamation, which looks like a copy-paste
            // placeholder. No square-bracket token type is visible from this file, so the
            // behaviour is kept as-is; replace with the proper token types once they exist.
            case '[':
                result.push_back(Token::make_with_line({TT_Exclamation,loc},current_line_text));
                break;
            case ']':
                result.push_back(Token::make_with_line({TT_Exclamation,loc},current_line_text));
                break;
            default:
                Error::throw_error(loc,current_line_text,"unknown character");
        }
    }
    result.push_back(Token(TT_EOF,loc));
    return result;
}
// Lexes an identifier-like token starting at current_char and returns it.
//
// Characters from IDENTIFIERS are accumulated; a single '/' between them turns the token
// into a path (TT_Path). Two consecutive slashes are not part of the token: they open a
// comment, so both are handed back to lex(). A non-path token is classified as a built-in
// type (TT_Type), one of the keywords (import, syscall0..syscall5, compmacro) or a plain
// TT_Identifier. The returned token carries the line/column where the token started.
Token Lexer::create_identifier()
{
    std::vector<char> characters;
    int prev_line = loc.line;
    int prev_column = loc.column;
    bool is_path = false;
    bool last_was_path = false;
    Location saved_loc = this->loc;
    Location saved_prev_loc = this->prev_loc;

    // Builds the token for the characters gathered so far. Shared by every exit path so the
    // keyword table exists only once.
    auto make_token = [&]() -> Token
    {
        std::string identifier(characters.begin(), characters.end());
        if(is_path) return Token::make_with_line({TT_Path,identifier,{prev_line,prev_column,loc.fname}},current_line_text);
        auto location = std::find(types.begin(),types.end(),identifier);
        if(location != types.end())
        {
            return Token::make_with_line({TT_Type,identifier,{prev_line,prev_column,loc.fname}},current_line_text);
        }
        if (identifier == "import") return Token::make_with_line({TT_Import,{prev_line,prev_column,loc.fname}},current_line_text);
        if (identifier == "syscall0") return Token::make_with_line({TT_Syscall0,{prev_line,prev_column,loc.fname}},current_line_text);
        if (identifier == "syscall1") return Token::make_with_line({TT_Syscall1,{prev_line,prev_column,loc.fname}},current_line_text);
        if (identifier == "syscall2") return Token::make_with_line({TT_Syscall2,{prev_line,prev_column,loc.fname}},current_line_text);
        if (identifier == "syscall3") return Token::make_with_line({TT_Syscall3,{prev_line,prev_column,loc.fname}},current_line_text);
        if (identifier == "syscall4") return Token::make_with_line({TT_Syscall4,{prev_line,prev_column,loc.fname}},current_line_text);
        if (identifier == "syscall5") return Token::make_with_line({TT_Syscall5,{prev_line,prev_column,loc.fname}},current_line_text);
        if (identifier == "compmacro") return Token::make_with_line({TT_CompilerMacro,{prev_line,prev_column,loc.fname}},current_line_text);
        return Token::make_with_line({TT_Identifier,identifier,{prev_line,prev_column,loc.fname}},current_line_text);
    };

    characters.push_back(current_char);
    while(this->advance())
    {
        if(is_in_string(IDENTIFIERS,current_char))
        {
            characters.push_back(current_char);
            last_was_path = false;
        }
        else if(current_char == '/')
        {
            if(last_was_path)
            {
                // "//" directly after the token starts a comment. Drop the slash that was
                // already collected and step back over BOTH slashes, so that lex() resumes
                // on the first one and recognises the comment. (Previously only one step
                // was undone, leaving the index one ahead of loc and the comment unseen.)
                characters.pop_back();
                this->rewind();
                this->loc = saved_loc;
                this->prev_loc = saved_prev_loc;
                this->rewind();
                // Only a path if a separator is still left after removing the trailing one.
                is_path = std::find(characters.begin(), characters.end(), '/') != characters.end();
                return make_token();
            }
            // Remember where this slash is in case the next character is a second slash.
            saved_loc = this->loc;
            saved_prev_loc = this->prev_loc;
            characters.push_back(current_char);
            is_path = true;
            last_was_path = true;
        }
        else
        {
            // The character belongs to the next token: give it back to lex().
            this->rewind();
            return make_token();
        }
    }
    // End of input reached while still inside the token.
    return make_token();
}
// Lexes a numeric literal starting at current_char.
//
// Digits are accumulated, with at most one '.' accepted. A literal containing a dot is
// converted with std::stof and returned as TT_Float; otherwise it is converted with atoi
// and returned as TT_Number. A second dot triggers a warning, is handed back to the caller
// and ends the literal. Any other character also ends the literal and is handed back.
Token Lexer::create_number()
{
    std::string literal(1, current_char);
    int start_line = loc.line;
    int start_column = loc.column;
    bool has_dot = false;

    // Converts the collected text into the final token.
    auto finish = [&]() -> Token
    {
        if(has_dot)
        {
            float tk_value = std::stof(literal);
            return Token::make_with_line({TT_Float,tk_value,{start_line,start_column,loc.fname}},current_line_text);
        }
        int tk_value = atoi(literal.c_str());
        return Token::make_with_line({TT_Number,tk_value,{start_line,start_column,loc.fname}},current_line_text);
    };

    while(this->advance())
    {
        if(is_in_string(DIGITS,current_char))
        {
            literal.push_back(current_char);
            continue;
        }
        if(current_char != '.')
        {
            // Not part of the number: give the character back and stop.
            this->rewind();
            return finish();
        }
        if(has_dot)
        {
            Error::throw_warning(loc,current_line_text,"floats can only have one dot");
            this->rewind();
            return finish();
        }
        literal.push_back(current_char);
        has_dot = true;
    }
    // Input ended in the middle of the literal.
    return finish();
}
// Lexes a single-quoted string literal; current_char is the opening quote on entry.
//
// Returns a TT_String token holding the decoded contents once the closing quote is found.
// Supported escapes are \n, \' and \\. A raw newline, an unknown or unfinished escape
// sequence, or reaching the end of input before the closing quote is reported through
// Error::throw_error.
Token Lexer::create_string()
{
    std::string contents;
    int start_line = loc.line;
    int start_column = loc.column;
    while(this->advance())
    {
        if(current_char == '\n')
        {
            this->rewind();
            Error::throw_error(loc,current_line_text,"expected end of string but got newline");
        }
        if(current_char == '\'')
        {
            return Token::make_with_line({TT_String,contents,{start_line,start_column,loc.fname}},current_line_text);
        }
        if(current_char != '\\')
        {
            // Ordinary character: copy it verbatim.
            contents.push_back(current_char);
            continue;
        }

        // Escape sequence: look at the character following the backslash.
        if(index + 1 == current_lexed_text.size())
        {
            Error::throw_error(loc,current_line_text,"unfinished escape sequence");
        }
        const char escaped = current_lexed_text[index+1];
        if(escaped == 'n')
        {
            contents.push_back('\n');
        }
        else if(escaped == '\'' || escaped == '\\')
        {
            contents.push_back(escaped);
        }
        else
        {
            Error::throw_error(loc,current_line_text,"unknown escape sequence");
        }
        // Skip the escaped character by hand, without going through advance().
        ++index;
        ++loc.column;
    }
    this->rewind();
    Error::throw_error(loc,current_line_text,"expected end of string but got EOF");
    return Token(TT_Null,loc); // unreachable since Error::throw_error calls exit()
}