// sapphire/sapphire.cpp
//
// 858 lines
// 27 KiB
// C++

#include <stdio.h>
#include <stdlib.h>

#include <algorithm>
#include <array>
#include <cstddef>
#include <fstream>
#include <iostream>
#include <regex>
#include <sstream>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>
// Upper bound compared against g_importCount in evaluate_imports; exceeding it
// raises the "import tree too deep" error.
#define IMPORT_MAX_DEEP 100
// Assembly text of a `strlen` routine: scans the bytes at rdi for a zero byte
// with `repne scasb` and derives the length in rax from the remaining count in rcx.
#define STRLEN "strlen: ; -- length of null-terminated string in rdi --\n xor rax, rax\n mov rcx, -1\n cld\n repne scasb\n mov rax, rcx\n add rax, 2\n neg rax\n ret\n"
// Assembly text of a `print` routine: calls strlen, then issues syscall 1 with
// rdi = 1, rsi = the string and rdx = its length.
#define PRINT "print: ; -- print null-terminated string in rdi --\n mov rsi, rdi\n call strlen\n mov rdx, rax\n mov rdi, 1\n mov rax, 1\n syscall\n ret\n"
// Selects the assembler dialect for the ASM_* fragments below. main() also
// tests __NASM to decide which external tools to invoke.
#define __FASM
#ifdef __FASM
// FASM dialect.
#define ASM_HEADER "format ELF64 executable 3\n"
#define ASM_TEXT_SECTION "segment readable executable\n"
#define ASM_DATA_SECTION "segment readable writable\n"
#define ASM_BSS_SECTION ""
#define ASM_ENTRY "entry start\n"
#define ASM_ENTRY_DECL "start:\n"
#elif defined(__NASM)
// NASM dialect.
#define ASM_HEADER "BITS 64\n"
#define ASM_TEXT_SECTION "section .text\n"
#define ASM_DATA_SECTION "section .data\n"
#define ASM_BSS_SECTION "section .bss\n"
#define ASM_ENTRY "global _start\n"
#define ASM_ENTRY_DECL "_start:\n"
#else
// No dialect selected: the ASM_* macros stay undefined.
#endif
// Single-character classes used by the lexer; each is matched against a
// one-character string.
// Characters that may start an identifier.
std::regex letter("[a-zA-Z_]");
// Characters that may continue an identifier.
std::regex identifier("[a-zA-Z0-9_]");
// Decimal digits.
std::regex number("[0-9]");
// Characters skipped between tokens (tab, space, newline).
std::regex whitespace("[\t \n]");
// Incremented once per recursive evaluate_imports pass and compared against
// IMPORT_MAX_DEEP.
int g_importCount = 0;
// Text of the source line currently being lexed; every Token copies it into
// line_ctx on construction.
std::string g_current_line;
// Names of files already pulled in, stored without the extension; main()
// seeds it with the main file's base name.
std::vector<std::string> imported_files;
// Kinds of token produced by the lexer. The enumerator order must stay in
// sync with token_strings, which is indexed by these values.
enum TokenType
{
// Tokens that carry a payload.
TT_Identifier,
TT_Number,
TT_Float,
TT_Keyword,
TT_String,
// Single-character operators and punctuation.
TT_Plus,
TT_Minus,
TT_Mul,
TT_Div,
TT_At,
TT_Equal,
TT_LessThan,
TT_GreaterThan,
TT_LParen,
TT_RParen,
// Produced for '{' and '}' respectively.
TT_LBracket,
TT_RBracket,
TT_Semicolon,
TT_LoadedString,
// Marks the end of a token stream.
TT_EOF,
// Placeholder written over consumed import statements; skipped by parse_tokens.
TT_Null
};
// Printable names for TokenType values, indexed by the enumerator's numeric
// value (see tokentype_as_string). Keep the order identical to the enum.
static std::string token_strings[] = {
"TT_IDENTIFIER",
"TT_NUMBER",
"TT_FLOAT",
"TT_KEYWORD",
"TT_STRING",
"TT_PLUS",
"TT_MINUS",
"TT_MUL",
"TT_DIV",
"TT_AT",
"TT_EQUAL",
"TT_LESSTHAN",
"TT_GREATERTHAN",
"TT_LPAREN",
"TT_RPAREN",
"TT_LBRACKET",
"TT_RBRACKET",
"TT_SEMICOLON",
"TT_LOADEDSTRING",
"TT_EOF",
"TT_NULL"
};
struct Token
{
TokenType tk_type;
int int_value;
std::string string_value;
float float_value;
int line;
int column;
std::string fname;
std::string line_ctx;
Token(const TokenType& type, const int& lineno, const int& colno, const std::string& name)
: tk_type(type), line(lineno), column(colno), fname(name)
{
line_ctx = g_current_line;
}
Token(const TokenType& type, const int& val, const int& lineno, const int& colno, const std::string& name)
: tk_type(type), int_value(val), line(lineno), column(colno), fname(name)
{
line_ctx = g_current_line;
}
Token(const TokenType& type, const std::string& val, const int& lineno, const int& colno, const std::string& name)
: tk_type(type), string_value(val), line(lineno), column(colno), fname(name)
{
line_ctx = g_current_line;
}
Token(const TokenType& type, const float& val, const int& lineno, const int& colno, const std::string& name)
: tk_type(type), float_value(val), line(lineno), column(colno), fname(name)
{
line_ctx = g_current_line;
}
std::string to_string() const
{
char linestr[32];
sprintf(linestr,"%d",line);
char colstr[32];
sprintf(colstr,"%d",column);
if(tk_type == TT_Number)
{
char num[32];
sprintf(num,"%d",int_value);
return "INT:" + std::string(num) + " (" + fname + ":" + std::string(linestr) + ":" + std::string(colstr) + ")";
} else if (tk_type == TT_Float)
{
char num[64];
sprintf(num,"%f",float_value);
return "FLOAT:" + std::string(num) + " (" + fname + ":" + std::string(linestr) + ":" + std::string(colstr) + ")";
}
else if (tk_type == TT_Identifier){
return "ID:" + string_value + " (" + fname + ":" + std::string(linestr) + ":" + std::string(colstr) + ")";
} else if (tk_type == TT_Keyword){
return "KEYWORD:" + string_value + " (" + fname + ":" + std::string(linestr) + ":" + std::string(colstr) + ")";
} else if (tk_type == TT_String)
{
return "STRING:" + std::string("\'") + string_value + std::string("\'") + " (" + fname + ":" + std::string(linestr) + ":" + std::string(colstr) + ")";
}
std::string details = std::string(" (") + fname + ":" + std::string(linestr) + ":" + std::string(colstr) + ")";
switch(tk_type)
{
case TT_EOF:
return "EOF" + details;
case TT_Plus:
return "PLUS" + details;
case TT_Minus:
return "MINUS" + details;
case TT_Mul:
return "MUL" + details;
case TT_Div:
return "DIV" + details;
case TT_At:
return "AT" + details;
case TT_Equal:
return "EQUAL" + details;
case TT_LessThan:
return "LESSTHAN" + details;
case TT_GreaterThan:
return "GREATERTHAN" + details;
case TT_LParen:
return "LPAREN" + details;
case TT_RParen:
return "RPAREN" + details;
case TT_LBracket:
return "LBRACKET" + details;
case TT_RBracket:
return "RBRACKET" + details;
case TT_Semicolon:
return "SEMICOLON" + details;
case TT_LoadedString:
return "LDSTRING" + details;
}
return "";
}
};
// Looks up the printable name of a token type in token_strings.
std::string tokentype_as_string(const TokenType& type)
{
    const std::string& label = token_strings[type];
    return label;
}
// Base record for the sentence types below; type_name holds a tag naming the
// kind of sentence.
struct Sentence
{
std::string type_name;
};
struct Function : public Sentence
{
std::string type_name = "function";
std::vector<Token> fun_tokens;
Function(std::vector<Token> tokens) : fun_tokens(tokens){}
};
struct DeclVar : public Sentence
{
std::string type_name = "decl";
std::vector<Token> vtokens;
DeclVar(std::vector<Token> tokens) : vtokens(tokens){}
};
// A registered variable: its size and the identifier it is known by.
struct Variable
{
    int size;
    std::string identifier;
    // The name is taken by value and moved into place to avoid a second copy.
    Variable(int _size, std::string _identifier)
        : size(_size), identifier(std::move(_identifier)) {}
};
// A string literal to be emitted into the assembly output: its raw bytes and
// the label it is emitted under (see make_asm_string).
struct StringLiteral
{
    std::string data;
    std::string identifier;
    // Both strings are taken by value and moved into place to avoid a second copy.
    StringLiteral(std::string _data, std::string _identifier)
        : data(std::move(_data)), identifier(std::move(_identifier)) {}
};
// Reserved words recognised by the lexer. evaluate_imports refers to the
// import keyword by position (keywords[2]), so the order matters.
std::array<std::string, 3> keywords{"out", "var","import"};
// Registries consulted by main(); string literals listed here are emitted
// into the data section of the generated assembly.
std::vector<Variable> registered_vars;
std::vector<StringLiteral> registered_strings;
// Forward declarations for functions defined after main().
std::string make_asm_string(StringLiteral str);
std::vector<Token> lex_tokens(const std::string&, const std::string&);
std::vector<Token> evaluate_imports(const std::string& text, const std::vector<Token>& tokens);
std::string read_file(const std::string&);
void compiler_error(const std::string& text, const int& line, const int& column, const std::string& fname, const std::string& details);
void compiler_warning(const std::string& text, const int& line, const int& column, const std::string& fname, const std::string& details);
std::vector<Sentence> parse_tokens(const std::vector<Token>& tokens);
// Entry point: lexes the source file named by argv[1] (default "test.sp"),
// resolves its imports, prints the resulting tokens, writes
// "<basename>.asm" and runs the external assembler on it.
//
// Returns 0 on success and 1 when the input cannot be opened, the assembly
// file cannot be written, or one of the external commands fails.
int main(int argc, char** argv)
{
    const std::string fname = (argc < 2) ? std::string("test.sp") : std::string(argv[1]);

    // read_file cannot distinguish a missing file from an empty one, so check
    // up front instead of silently compiling an empty program.
    if(!std::ifstream(fname).good())
    {
        std::cerr << fname << ": error: cannot open input file" << std::endl;
        return 1;
    }

    // Everything before the last '.', or the whole name when there is no '.'.
    const std::string outfile_basename = fname.substr(0,fname.find_last_of('.'));

    std::string command = read_file(fname);
    std::vector<Token> main_tokens = lex_tokens(command,fname);
    imported_files.push_back(outfile_basename);
    main_tokens = evaluate_imports(command,main_tokens);
    parse_tokens(main_tokens);

    std::string assembly;
    assembly += ASM_HEADER;
    assembly += "; Assembly generated by the Sapphire compiler.\n";
    assembly += ASM_TEXT_SECTION;
    assembly += ASM_ENTRY;
    assembly += STRLEN;
    assembly += PRINT;
    assembly += ASM_ENTRY_DECL;
    assembly += "; -- exit with code 0 --\n";
    assembly += " mov rax, 60\n";
    assembly += " xor rdi, rdi\n";
    assembly += " syscall\n";
    if(!registered_strings.empty())
    {
        assembly += "\n";
        assembly += ASM_DATA_SECTION;
        for(const auto& asm_string : registered_strings)
        {
            assembly += make_asm_string(asm_string);
        }
    }

    const std::string asm_path = outfile_basename + ".asm";
    std::ofstream outfile(asm_path);
    outfile << assembly;
    outfile.close();
    // A failed open or a failed write both leave the stream in a failed state.
    if(outfile.fail())
    {
        std::cerr << asm_path << ": error: cannot write assembly output" << std::endl;
        return 1;
    }

    // NOTE(review): the file name is interpolated into shell commands unquoted;
    // names containing spaces or shell metacharacters will misbehave.
#ifdef __NASM
    const std::string build_steps[] = {
        "nasm -f elf64 " + outfile_basename + ".asm -o" + outfile_basename + ".o",
        "ld " + outfile_basename + ".o -o" + outfile_basename
    };
#else
    const std::string build_steps[] = {
        "fasm " + outfile_basename + ".asm",
        std::string("chmod +x ") + outfile_basename
    };
#endif
    for(const std::string& step : build_steps)
    {
        // Stop at the first failing tool rather than reporting success.
        if(system(step.c_str()) != 0)
        {
            std::cerr << fname << ": error: command failed: " << step << std::endl;
            return 1;
        }
    }

    std::cout << fname + " > " + outfile_basename + "\n";
    return 0;
}
// Sub-lexers called by lex_tokens. Arguments are, in order: the source text,
// the current index, line and column (all updated in place), and the file name.
Token construct_identifier(const std::string&, int& , int&, int&, const std::string&);
Token construct_number(const std::string&, int&, int&, int&, const std::string&);
Token construct_string(const std::string&, int&, int&, int&, const std::string&);
// Reads the whole file `fname` and returns its contents.
// Returns an empty string when the file cannot be opened.
//
// The stream buffer is copied wholesale, so every byte is preserved. A
// per-character loop comparing a `char` against -1 would drop genuine 0xFF
// bytes, or never see end-of-file where `char` is unsigned.
std::string read_file(const std::string& fname)
{
    std::ifstream main_file(fname);
    if(!main_file.is_open()) return "";
    std::ostringstream contents;
    contents << main_file.rdbuf();
    return contents.str();
}
// Returns the text from position `index + 1` up to (not including) the next
// '\n', or up to the end of `text` when there is none.
//
// `index` is normally the position of the newline that precedes the wanted
// line; pass -1 to get the first line. Positions outside the text yield an
// empty string instead of reading out of bounds.
std::string recalculate_current_line(const std::string& text, int index)
{
    typedef std::string::size_type size_type;
    if(index < -1) return std::string();
    const size_type start = (index < 0) ? 0 : static_cast<size_type>(index) + 1;
    if(start >= text.size()) return std::string();
    const size_type stop = text.find('\n', start);
    return text.substr(start, stop == std::string::npos ? std::string::npos : stop - start);
}
// Returns the line that lies immediately before position `index`.
//
// The line starts after the last '\n' found strictly before `index` (or at
// the beginning of `text` when there is none) and runs to the next '\n' or
// the end of the text. The forward scan is the same one
// recalculate_current_line performs; it is written out here so that the
// backward search can be bounds-checked in one place.
std::string rewind_current_line(const std::string& text, int index)
{
    typedef std::string::size_type size_type;
    size_type start = 0;
    if(index > 0)
    {
        // rfind inspects positions index-1, index-2, ... 0 and never steps below zero.
        const size_type newline_pos = text.rfind('\n', static_cast<size_type>(index) - 1);
        if(newline_pos != std::string::npos) start = newline_pos + 1;
    }
    const size_type stop = text.find('\n', start);
    return text.substr(start, stop == std::string::npos ? std::string::npos : stop - start);
}
// Splits `text` into tokens, tagging each with `fname` and its line/column.
//
// The returned stream always ends with exactly one TT_EOF token, including
// when the text ends inside a `//` comment. Unknown characters are reported
// through compiler_error, which terminates the program.
std::vector<Token> lex_tokens(const std::string& text, const std::string& fname)
{
    int line = 1;
    int column = 0;
    int index = -1;
    std::vector<Token> result;
    bool comment = false;
    const int text_size = (int)text.size();
    g_current_line = recalculate_current_line(text,-1);
    while(index < text_size)
    {
        ++index;
        ++column;
        // End of input is handled before the comment skip below, so a comment
        // running to end-of-file cannot swallow the EOF token.
        if(index == text_size)
        {
            result.push_back(Token(TT_EOF,line,column,fname));
            break;
        }
        if(text[index] == '\n')
        {
            ++line;
            column = 0;
            g_current_line = recalculate_current_line(text,index);
            comment = false; // line comments end at the newline
        }
        if(comment) continue;
        // The character classes are regexes, so match against a one-character C string.
        char cstyle_char[2]{text[index],'\0'};
        const char* character = (const char*)cstyle_char;
        if(std::regex_match(character,whitespace))
        {
            continue;
        }
        else if(std::regex_match(character,letter))
        {
            result.push_back(construct_identifier(text,index,line,column,fname));
        }
        else if(std::regex_match(character,number))
        {
            result.push_back(construct_number(text,index,line,column,fname));
        } else if (text[index] == '\'')
        {
            result.push_back(construct_string(text,index,line,column,fname));
        } else switch(text[index])
        {
        case '+':
            result.push_back(Token(TT_Plus,line,column,fname));
            break;
        case '-':
            result.push_back(Token(TT_Minus,line,column,fname));
            break;
        case '*':
            result.push_back(Token(TT_Mul,line,column,fname));
            break;
        case '/':
            // "//" starts a comment that runs to the end of the line.
            if(index + 1 < text_size && text[index+1] == '/')
            {
                comment = true;
                break;
            }
            result.push_back(Token(TT_Div,line,column,fname));
            break;
        case '@':
            result.push_back(Token(TT_At,line,column,fname));
            break;
        case '=':
            result.push_back(Token(TT_Equal,line,column,fname));
            break;
        case '<':
            result.push_back(Token(TT_LessThan,line,column,fname));
            break;
        case '>':
            result.push_back(Token(TT_GreaterThan,line,column,fname));
            break;
        case '(':
            result.push_back(Token(TT_LParen,line,column,fname));
            break;
        case ')':
            result.push_back(Token(TT_RParen,line,column,fname));
            break;
        case '{':
            result.push_back(Token(TT_LBracket,line,column,fname));
            break;
        case '}':
            result.push_back(Token(TT_RBracket,line,column,fname));
            break;
        case ';':
            result.push_back(Token(TT_Semicolon,line,column,fname));
            break;
        default:
            compiler_error(g_current_line,line,column,fname,"unknown character");
            break;
        }
    }
    return result;
}
// Forward declaration; get_line is defined further down in this file.
std::string get_line(const std::string&, int);
// Lexes an identifier or keyword starting at text[index].
//
// On return `index` and `column` refer to the last character that belongs to
// the token, so the caller's next increment lands on the character after it.
// The following character is only peeked at, never consumed; `line` and
// g_current_line are therefore left untouched, and a newline right after the
// identifier is counted once, by the caller.
//
// Returns a TT_Keyword token when the text is listed in `keywords`, otherwise
// a TT_Identifier token, positioned at the token's first character.
Token construct_identifier(const std::string& text, int& index, int& line, int& column, const std::string& fname)
{
    const int start_line = line;
    const int start_column = column;
    const int text_size = static_cast<int>(text.size());

    // One-character probe against the file-level `identifier` regex.
    const auto is_identifier_char = [](char c)
    {
        const char probe[2]{c, '\0'};
        return std::regex_match(probe, ::identifier);
    };

    std::string name;
    if(index < 0 || index >= text_size || !is_identifier_char(text[index]))
    {
        // Nothing to consume: step back one position and yield an empty name.
        --index;
        --column;
    }
    else
    {
        name += text[index];
        while(index + 1 < text_size && is_identifier_char(text[index + 1]))
        {
            ++index;
            ++column;
            name += text[index];
        }
    }

    const bool is_keyword = std::find(keywords.begin(), keywords.end(), name) != keywords.end();
    return Token(is_keyword ? TT_Keyword : TT_Identifier, name, start_line, start_column, fname);
}
// Lexes an integer or floating-point literal starting at text[index].
//
// The literal consists of decimal digits with at most one '.'; a second '.'
// ends the literal and is left for the caller. On return `index` and `column`
// refer to the last character that belongs to the token. The following
// character is only peeked at, never consumed; `line` and g_current_line are
// therefore left untouched, and a newline right after the number is counted
// once, by the caller.
//
// Returns a TT_Float token when a '.' was seen, otherwise a TT_Number token,
// positioned at the literal's first character. A literal that cannot be
// converted (out of range, or a lone '.') is reported through compiler_error.
Token construct_number(const std::string& text, int& index, int& line, int& column, const std::string& fname)
{
    const int start_line = line;
    const int start_column = column;
    const int text_size = static_cast<int>(text.size());
    bool seen_dot = false;

    // One-character probe against the file-level `number` regex.
    const auto is_digit = [](char c)
    {
        const char probe[2]{c, '\0'};
        return std::regex_match(probe, number);
    };
    // True when `c` extends the literal; records the first '.' as a side effect.
    const auto accepts = [&](char c)
    {
        if(is_digit(c)) return true;
        if(c == '.' && !seen_dot)
        {
            seen_dot = true;
            return true;
        }
        return false;
    };

    if(index < 0 || index >= text_size || !accepts(text[index]))
    {
        // Nothing to consume: step back one position and yield the integer 0.
        --index;
        --column;
        const int zero = 0;
        return Token(TT_Number, zero, start_line, start_column, fname);
    }

    std::string literal(1, text[index]);
    while(index + 1 < text_size && accepts(text[index + 1]))
    {
        ++index;
        ++column;
        literal += text[index];
    }

    if(seen_dot)
    {
        float float_result = 0.0f;
        try
        {
            float_result = std::stof(literal);
        }
        catch(const std::logic_error&)
        {
            // std::stof throws invalid_argument / out_of_range, both logic_errors.
            compiler_error(g_current_line,start_line,start_column,fname,"invalid or out-of-range float literal");
        }
        return Token(TT_Float, float_result, start_line, start_column, fname);
    }

    int int_result = 0;
    try
    {
        int_result = std::stoi(literal);
    }
    catch(const std::logic_error&)
    {
        compiler_error(g_current_line,start_line,start_column,fname,"integer literal out of range");
    }
    return Token(TT_Number, int_result, start_line, start_column, fname);
}
// Lexes a single-quoted string literal whose opening quote is at text[index].
//
// Recognised escapes are \n, \' and \\. On success `index` and `column` refer
// to the closing quote and a TT_String token holding the unescaped contents
// is returned, positioned at the opening quote. A newline, an unknown or
// unfinished escape, or the end of the text before the closing quote is
// reported through compiler_error, which terminates the program.
Token construct_string(const std::string& text, int& index, int& line, int& column, const std::string& fname)
{
    const int start_line = line;
    const int start_column = column;
    const int text_size = static_cast<int>(text.size());
    std::string contents;
    while(index + 1 < text_size)
    {
        ++index;
        ++column;
        const char current = text[index];
        if(current == '\n')
        {
            compiler_error(g_current_line,line,column,fname,"expected \', but got a newline");
        }
        if(current == '\'')
        {
            return Token(TT_String,contents,start_line,start_column,fname);
        }
        if(current == '\\')
        {
            if(index + 1 == text_size)
            {
                compiler_error(g_current_line,line,column,fname,"unfinished escape sequence");
            }
            switch(text[index+1])
            {
            case 'n':
                contents += '\n';
                break;
            case '\'':
                contents += '\'';
                break;
            case '\\':
                contents += '\\';
                break;
            default:
                compiler_error(g_current_line,line,column,fname,"unknown escape sequence");
            }
            // Skip the character that completed the escape.
            ++index;
            ++column;
            continue;
        }
        contents += current;
    }
    // Ran off the end of the text: point one column past the last character
    // and show the current line rather than the whole file as context.
    compiler_error(g_current_line,line,column + 1,fname,"expected \', but got EOF");
    exit(127); // not reached: compiler_error exits; kept so every path ends the function
}
// Forward declaration; get_spaces is defined further down in this file.
std::string get_spaces(int);
// Prints a colourised "<file>:<line>:<column>: error: <details>" diagnostic to
// std::cerr, followed by the offending line `text` and a caret under
// `column`, then terminates the program with exit code 1.
void compiler_error(const std::string& text, const int& line, const int& column, const std::string& fname, const std::string& details)
{
    const std::string line_label = std::to_string(line);
    const std::string column_label = std::to_string(column);

    // Header: bold location, red "error:" tag, then the message.
    std::cerr << "\033[1;1m" << fname << ":" << line_label << ":" << column_label << ": "
              << "\033[31;49m" << "error: " << "\033[0;0m" << details << std::endl;

    // Source line, prefixed with its line number and a four-space gutter.
    std::cerr << line_label << get_spaces(4) << text << std::endl;

    // Caret line: pad by the gutter width, then up to the reported column.
    std::cerr << get_spaces(4 + line_label.size()) << get_spaces(column - 1)
              << "\033[31;49m" << "^" << "\033[0;0m" << std::endl;

    exit(1);
}
// Prints a colourised "<file>:<line>:<column>: warning: <details>" diagnostic
// to std::cout, followed by the offending line `text` and a caret under
// `column`. Unlike compiler_error, execution continues afterwards.
void compiler_warning(const std::string& text, const int& line, const int& column, const std::string& fname, const std::string& details)
{
    const std::string line_label = std::to_string(line);
    const std::string column_label = std::to_string(column);

    // Header: bold location, yellow "warning:" tag, then the message.
    std::cout << "\033[1;1m" << fname << ":" << line_label << ":" << column_label << ": "
              << "\033[33;49m" << "warning: " << "\033[0;0m" << details << std::endl;

    // Source line, prefixed with its line number and a four-space gutter.
    std::cout << line_label << get_spaces(4) << text << std::endl;

    // Caret line: pad by the gutter width, then up to the reported column.
    std::cout << get_spaces(4 + line_label.size()) << get_spaces(column - 1)
              << "\033[33;49m" << "^" << "\033[0;0m" << std::endl;
}
// Returns line number `line_no` (zero-based) of `str`, without its newline.
// A negative `line_no` yields an empty string.
std::string get_line(const std::string& str, int line_no)
{
    std::istringstream reader(str);
    std::string current;
    // Read line_no + 1 lines; the last one read is the answer.
    for(int remaining = line_no; remaining >= 0; --remaining)
    {
        std::getline(reader, current);
    }
    return current;
}
// Returns a string of `spacenum` spaces; zero or negative counts give an
// empty string.
std::string get_spaces(int spacenum)
{
    if(spacenum <= 0)
    {
        return std::string();
    }
    return std::string(static_cast<std::string::size_type>(spacenum), ' ');
}
// Renders a string literal as an assembly data definition:
//
//   <identifier>: ; -- string literal --
//    db <byte>, <byte>, ..., 0
//
// Each byte is written as an unsigned decimal value (0-255) and the list is
// terminated by a single 0. An empty literal produces just the terminator.
std::string make_asm_string(StringLiteral str)
{
    std::string result = str.identifier;
    result += ": ; -- string literal -- \n";
    result += " db ";
    for(const char ch : str.data)
    {
        // Go through unsigned char so bytes >= 0x80 are not printed as
        // negative numbers where plain char is signed.
        result += std::to_string(static_cast<unsigned int>(static_cast<unsigned char>(ch)));
        result += ", ";
    }
    result += "0";
    result += "\n";
    return result;
}
std::vector<Token> evaluate_imports(const std::string& text, const std::vector<Token>& tokens)
{
int i = 0;
std::vector<Token> ret_tk = tokens;
std::vector<Token> new_tokens;
while(tokens[i].tk_type != TT_EOF)
{
if (g_importCount > IMPORT_MAX_DEEP) {
compiler_error(tokens[i].line_ctx,tokens[i].line,tokens[i].column,tokens[i].fname,"import tree too deep");
}
if(tokens[i].tk_type == TT_Keyword && tokens[i].string_value == keywords[2])
{
if(tokens[i+1].tk_type == TT_EOF) {compiler_error(text,tokens[i].line,tokens[i].column,tokens[i].fname,"did not expect EOF after 'import' keyword");}
if(tokens[i+1].tk_type == TT_Identifier)
{
if(std::find(imported_files.begin(), imported_files.end(), tokens[i+1].string_value) != imported_files.end()) {
compiler_error(tokens[i+2].line_ctx,tokens[i+2].line,tokens[i+2].column,tokens[i+2].fname,"file already imported");
}
if(tokens[i+2].tk_type != TT_Semicolon) {compiler_error(text,tokens[i+2].line,tokens[i+2].column,tokens[i+2].fname,"expected a semicolon after import statement");}
std::ifstream ifile(tokens[i+1].string_value + ".sp");
if (!ifile.good()) {
compiler_error(tokens[i+1].line_ctx,tokens[i+1].line,tokens[i+1].column,tokens[i+1].fname,"file '" + tokens[i+1].string_value + ".sp' not found");
}
ifile.close();
std::string imported_file_contents = read_file(tokens[i+1].string_value + ".sp");
std::vector<Token> imported_tokens = lex_tokens(imported_file_contents,tokens[i+1].string_value + ".sp");
imported_tokens.pop_back(); // remove EOF at end of token stream
new_tokens.insert(new_tokens.end(),imported_tokens.begin(),imported_tokens.end());
ret_tk[i] = Token(TT_Null,ret_tk[i].line,ret_tk[i].column,ret_tk[i].fname); // remove import data
ret_tk[i+1] = Token(TT_Null,ret_tk[i+1].line,ret_tk[i+1].column,ret_tk[i+1].fname); // remove import data
ret_tk[i+2] = Token(TT_Null,ret_tk[i+2].line,ret_tk[i+2].column,ret_tk[i+2].fname); // remove import data
imported_files.push_back(tokens[i+1].string_value);
} else {
compiler_error(tokens[i+1].line_ctx,tokens[i+1].line,tokens[i+1].column,tokens[i+1].fname,"import statement must use an identifier");
}
}
++i;
}
if(new_tokens.size() != 0)
{
new_tokens.insert(new_tokens.end(),ret_tk.begin(),ret_tk.end());
++g_importCount;
return evaluate_imports(text,new_tokens);
}
return ret_tk;
}
std::vector<Sentence> parse_tokens(const std::vector<Token>& tokens)
{
int i = 0;
while(i < tokens.size())
{
if(tokens[i].tk_type == TT_Null)
{
++i;
continue;
}
std::cout << tokens[i].to_string() << std::endl;
++i;
}
return std::vector<Sentence>();
}