// NOTE(review): the header names after every #include and all template
// argument lists in this file were missing when it was reviewed. The list
// below and the element types used throughout are reconstructed from the
// names the code actually uses -- confirm against the original source.
#include <algorithm>
#include <array>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <regex>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>

// Upper bound on the number of import passes evaluate_imports() may perform.
#define IMPORT_MAX_DEEP 100

// Runtime helper emitted into every generated program: string length.
#define STRLEN \
    "strlen: ; -- length of null-terminated string in rdi --\n" \
    " xor rax, rax\n" \
    " mov rcx, -1\n" \
    " cld\n" \
    " repne scasb\n" \
    " mov rax, rcx\n" \
    " add rax, 2\n" \
    " neg rax\n" \
    " ret\n"

// Runtime helper emitted into every generated program: write string to fd 1.
#define PRINT \
    "print: ; -- print null-terminated string in rdi --\n" \
    " mov rsi, rdi\n" \
    " call strlen\n" \
    " mov rdx, rax\n" \
    " mov rdi, 1\n" \
    " mov rax, 1\n" \
    " syscall\n" \
    " ret\n"

// Assembler back end selection. FASM is the default; building with -D__NASM
// now selects NASM for BOTH the section headers below and the commands run
// by main() (previously __FASM was defined unconditionally, so -D__NASM
// produced FASM syntax that was then fed to nasm).
// NOTE(review): names starting with a double underscore are reserved for the
// implementation; kept only so existing build flags continue to work.
#if !defined(__FASM) && !defined(__NASM)
#define __FASM
#endif

#ifdef __FASM
#define ASM_HEADER "format ELF64 executable 3\n"
#define ASM_TEXT_SECTION "segment readable executable\n"
#define ASM_DATA_SECTION "segment readable writable\n"
#define ASM_BSS_SECTION ""
#define ASM_ENTRY "entry start\n"
#define ASM_ENTRY_DECL "start:\n"
#elif defined(__NASM)
#define ASM_HEADER "BITS 64\n"
#define ASM_TEXT_SECTION "section .text\n"
#define ASM_DATA_SECTION "section .data\n"
#define ASM_BSS_SECTION "section .bss\n"
#define ASM_ENTRY "global _start\n"
#define ASM_ENTRY_DECL "_start:\n"
#else
#endif

// Single-character classes used by the lexer.
std::regex letter("[a-zA-Z_]");
std::regex identifier("[a-zA-Z0-9_]");
std::regex number("[0-9]");
std::regex whitespace("[\t \n]");

// Number of import passes performed so far by evaluate_imports().
int g_importCount = 0;
// Text of the source line being lexed; every Token copies it on construction.
std::string g_current_line;
// Names of files already pulled in, stored without their extension.
std::vector<std::string> imported_files;

enum TokenType
{
    TT_Identifier,
    TT_Number,
    TT_Float,
    TT_Keyword,
    TT_String,
    TT_Plus,
    TT_Minus,
    TT_Mul,
    TT_Div,
    TT_At,
    TT_Equal,
    TT_LessThan,
    TT_GreaterThan,
    TT_LParen,
    TT_RParen,
    TT_LBracket,
    TT_RBracket,
    TT_Semicolon,
    TT_LoadedString,
    TT_EOF,
    TT_Null
};

// Printable names, indexed by TokenType; must stay in enum order.
static std::string token_strings[] = {
    "TT_IDENTIFIER",
    "TT_NUMBER",
    "TT_FLOAT",
    "TT_KEYWORD",
    "TT_STRING",
    "TT_PLUS",
    "TT_MINUS",
    "TT_MUL",
    "TT_DIV",
    "TT_AT",
    "TT_EQUAL",
    "TT_LESSTHAN",
    "TT_GREATERTHAN",
    "TT_LPAREN",
    "TT_RPAREN",
    "TT_LBRACKET",
    "TT_RBRACKET",
    "TT_SEMICOLON",
    "TT_LOADEDSTRING",
    "TT_EOF",
    "TT_NULL"
};

// One lexical token together with its source position.
// Every constructor snapshots g_current_line into line_ctx so diagnostics can
// show the offending line later.
struct Token
{
    TokenType tk_type;
    int int_value = 0;          // payload of TT_Number (was uninitialized for other kinds)
    std::string string_value;   // payload of TT_Identifier / TT_Keyword / TT_String
    float float_value = 0.0f;   // payload of TT_Float (was uninitialized for other kinds)
    int line;
    int column;
    std::string fname;
    std::string line_ctx;

    // Token without a payload (operators, punctuation, EOF, Null).
    Token(const TokenType& type, const int& lineno, const int& colno, const std::string& name)
        : tk_type(type), line(lineno), column(colno), fname(name)
    {
        line_ctx = g_current_line;
    }

    // Token carrying an integer payload.
    Token(const TokenType& type, const int& val, const int& lineno, const int& colno, const std::string& name)
        : tk_type(type), int_value(val), line(lineno), column(colno), fname(name)
    {
        line_ctx = g_current_line;
    }

    // Token carrying a string payload.
    Token(const TokenType& type, const std::string& val, const int& lineno, const int& colno, const std::string& name)
        : tk_type(type), string_value(val), line(lineno), column(colno), fname(name)
    {
        line_ctx = g_current_line;
    }

    // Token carrying a float payload.
    Token(const TokenType& type, const float& val, const int& lineno, const int& colno, const std::string& name)
        : tk_type(type), float_value(val), line(lineno), column(colno), fname(name)
    {
        line_ctx = g_current_line;
    }

    // Renders the token as "<KIND>[:payload] (<file>:<line>:<column>)".
    // Returns an empty string for TT_Null.
    std::string to_string() const
    {
        const std::string details =
            " (" + fname + ":" + std::to_string(line) + ":" + std::to_string(column) + ")";

        switch (tk_type)
        {
            case TT_Number:
                return "INT:" + std::to_string(int_value) + details;
            case TT_Float:
            {
                char num[64];
                std::snprintf(num, sizeof num, "%f", float_value);
                return "FLOAT:" + std::string(num) + details;
            }
            case TT_Identifier:   return "ID:" + string_value + details;
            case TT_Keyword:      return "KEYWORD:" + string_value + details;
            case TT_String:       return "STRING:" + std::string("\'") + string_value + std::string("\'") + details;
            case TT_EOF:          return "EOF" + details;
            case TT_Plus:         return "PLUS" + details;
            case TT_Minus:        return "MINUS" + details;
            case TT_Mul:          return "MUL" + details;
            case TT_Div:          return "DIV" + details;
            case TT_At:           return "AT" + details;
            case TT_Equal:        return "EQUAL" + details;
            case TT_LessThan:     return "LESSTHAN" + details;
            case TT_GreaterThan:  return "GREATERTHAN" + details;
            case TT_LParen:       return "LPAREN" + details;
            case TT_RParen:       return "RPAREN" + details;
            case TT_LBracket:     return "LBRACKET" + details;
            case TT_RBracket:     return "RBRACKET" + details;
            case TT_Semicolon:    return "SEMICOLON" + details;
            case TT_LoadedString: return "LDSTRING" + details;
            case TT_Null:         return "";
        }
        return "";
    }
};

// Returns the printable enum name for `type` (see token_strings).
std::string tokentype_as_string(const TokenType& type)
{
    return token_strings[type];
}

// Base of all parsed statements; type_name identifies the concrete kind.
struct Sentence
{
    std::string type_name;

    Sentence() = default;
    Sentence(const std::string& name) : type_name(name) {}
};

// A function statement and the tokens it was built from.
// The name is stored in the inherited Sentence::type_name; the previous
// redeclaration of type_name here shadowed the base member and left it empty.
struct Function : public Sentence
{
    std::vector<Token> fun_tokens;

    Function(std::vector<Token> tokens) : Sentence("function"), fun_tokens(tokens) {}
};

// A variable declaration statement and the tokens it was built from.
struct DeclVar : public Sentence
{
    std::vector<Token> vtokens;

    DeclVar(std::vector<Token> tokens) : Sentence("decl"), vtokens(tokens) {}
};

// A registered variable: its size and its name.
struct Variable
{
    int size;
    std::string identifier;

    Variable(int _size, std::string _identifier) : size(_size), identifier(_identifier) {}
};

// A string constant to be emitted into the data section under `identifier`.
struct StringLiteral
{
    std::string data;
    std::string identifier;

    StringLiteral(std::string _data, std::string _identifier) : data(_data), identifier(_identifier) {}
};

// Reserved words; index 2 ("import") is referenced by position elsewhere.
std::array<std::string, 3> keywords{"out", "var", "import"};
std::vector<Variable> registered_vars;
std::vector<StringLiteral> registered_strings;

std::string make_asm_string(StringLiteral str);
std::vector<Token> lex_tokens(const std::string&, const std::string&);
std::vector<Token> evaluate_imports(const std::string& text, const std::vector<Token>& tokens);
std::string read_file(const std::string&);
void compiler_error(const std::string& text, const int& line, const int& column, const std::string& fname, const std::string& details);
void compiler_warning(const std::string& text, const int& line, const int& column, const std::string& fname, const std::string& details);
// NOTE(review): the element type of the return value was lost; Sentence is
// assumed because the result is a list of parsed statements -- confirm.
std::vector<Sentence> parse_tokens(const std::vector<Token>& tokens);

// Wraps `arg` in single quotes so a POSIX shell treats it as one literal word.
static std::string shell_quote(const std::string& arg)
{
    std::string quoted = "'";
    for (const char c : arg)
    {
        if (c == '\'')
            quoted += "'\\''";   // close quote, escaped quote, reopen quote
        else
            quoted += c;
    }
    quoted += "'";
    return quoted;
}

// Removes the extension from `path`; a dot inside a directory name is ignored.
static std::string strip_extension(const std::string& path)
{
    const std::size_t dot = path.find_last_of('.');
    const std::size_t slash = path.find_last_of('/');
    if (dot == std::string::npos || (slash != std::string::npos && dot < slash))
        return path;
    return path.substr(0, dot);
}

// Runs `command` through the shell; true when it reports exit status 0.
static bool run_command(const std::string& command)
{
    return std::system(command.c_str()) == 0;
}

// Compiler driver.
// Reads argv[1] (default "test.sp"), lexes it, resolves imports, parses,
// writes "<basename>.asm" and invokes the assembler on it.
// Returns 0 on success and 1 when the input cannot be read, the output cannot
// be written, or the assembler step fails.
int main(int argc, char** argv)
{
    const std::string fname = (argc < 2) ? "test.sp" : argv[1];

    // read_file() returns "" for an unreadable file, which is indistinguishable
    // from an empty one, so check readability explicitly.
    {
        std::ifstream probe(fname);
        if (!probe.is_open())
        {
            std::cerr << "error: cannot open input file '" << fname << "'\n";
            return 1;
        }
    }

    const std::string outfile_basename = strip_extension(fname);
    const std::string source_text = read_file(fname);

    std::vector<Token> main_tokens = lex_tokens(source_text, fname);
    imported_files.push_back(outfile_basename);
    main_tokens = evaluate_imports(source_text, main_tokens);
    parse_tokens(main_tokens);

    std::string assembly;
    assembly += ASM_HEADER;
    assembly += "; Assembly generated by the Sapphire compiler.\n";
    assembly += ASM_TEXT_SECTION;
    assembly += ASM_ENTRY;
    assembly += STRLEN;
    assembly += PRINT;
    assembly += ASM_ENTRY_DECL;
    assembly += "; -- exit with code 0 --\n";
    assembly += " mov rax, 60\n";
    assembly += " xor rdi, rdi\n";
    assembly += " syscall\n";

    if (!registered_strings.empty())
    {
        assembly += "\n";
        assembly += ASM_DATA_SECTION;
        for (const auto& asm_string : registered_strings)
        {
            assembly += make_asm_string(asm_string);
        }
    }

    const std::string asm_path = outfile_basename + ".asm";
    {
        std::ofstream outfile(asm_path);
        outfile << assembly;
        outfile.close();
        if (!outfile)
        {
            std::cerr << "error: cannot write '" << asm_path << "'\n";
            return 1;
        }
    }

    // File names are quoted so spaces or shell metacharacters in the path
    // cannot split or alter the command.
#ifdef __FASM
    const bool built = run_command("fasm " + shell_quote(asm_path))
                    && run_command("chmod +x " + shell_quote(outfile_basename));
#else
    const std::string object_path = outfile_basename + ".o";
    const bool built = run_command("nasm -f elf64 " + shell_quote(asm_path) + " -o " + shell_quote(object_path))
                    && run_command("ld " + shell_quote(object_path) + " -o " + shell_quote(outfile_basename));
#endif
    if (!built)
    {
        std::cerr << "error: assembling '" << asm_path << "' failed\n";
        return 1;
    }

    std::cout << fname + " > " + outfile_basename + "\n";
    return 0;
}

Token construct_identifier(const std::string&, int&, int&, int&, const std::string&);
Token construct_number(const std::string&, int&, int&, int&, const std::string&);
Token construct_string(const std::string&, int&, int&, int&, const std::string&);

// Returns the full contents of `fname`, or "" when the file cannot be opened.
// The stream buffer is copied wholesale: the previous loop stored get() in a
// char and compared it with -1, which dropped 0xFF bytes where char is signed
// and could never detect end-of-file where char is unsigned.
std::string read_file(const std::string& fname)
{
    std::ifstream main_file(fname);
    if (!main_file.is_open())
        return "";

    std::ostringstream contents;
    contents << main_file.rdbuf();
    return contents.str();
}
std::string recalculate_current_line(const std::string& text, int index) { std::string final_str; ++index; while(index != text.size() && text[index] != '\n') { final_str += text[index]; ++index; } return final_str; } std::string rewind_current_line(const std::string& text, int index) { --index; while(text[index] != '\n') { --index; if(index == 0) { return recalculate_current_line(text,-1); } } return recalculate_current_line(text,index); } std::vector lex_tokens(const std::string& text, const std::string& fname) { int line = 1; int column = 0; int index = -1; std::vector result; bool comment = false; g_current_line = recalculate_current_line(text,-1); while(index < (int)text.size()) { ++index; ++column; if(text[index] == '\n') { ++line; column = 0; g_current_line = recalculate_current_line(text,index); comment = false; } if(comment) continue; char cstyle_char[2]{text[index],'\0'}; const char* character = (const char*)cstyle_char; if(std::regex_match(character,whitespace)) { continue; } else if(std::regex_match(character,letter)) { result.push_back(construct_identifier(text,index,line,column,fname)); } else if(std::regex_match(character,number)) { result.push_back(construct_number(text,index,line,column,fname)); } else if (text[index] == '\'') { result.push_back(construct_string(text,index,line,column,fname)); } else if(index == text.size()) { result.push_back(Token(TT_EOF,line,column,fname)); } else switch(text[index]) { case '+': result.push_back(Token(TT_Plus,line,column,fname)); break; case '-': result.push_back(Token(TT_Minus,line,column,fname)); break; case '*': result.push_back(Token(TT_Mul,line,column,fname)); break; case '/': if(index != text.size()) { if(text[index+1] == '/') { comment = true; break; } } result.push_back(Token(TT_Div,line,column,fname)); break; case '@': result.push_back(Token(TT_At,line,column,fname)); break; case '=': result.push_back(Token(TT_Equal,line,column,fname)); break; case '<': 
result.push_back(Token(TT_LessThan,line,column,fname)); break; case '>': result.push_back(Token(TT_GreaterThan,line,column,fname)); break; case '(': result.push_back(Token(TT_LParen,line,column,fname)); break; case ')': result.push_back(Token(TT_RParen,line,column,fname)); break; case '{': result.push_back(Token(TT_LBracket,line,column,fname)); break; case '}': result.push_back(Token(TT_RBracket,line,column,fname)); break; case ';': result.push_back(Token(TT_Semicolon,line,column,fname)); break; default: compiler_error(g_current_line,line,column,fname,"unknown character"); break; } } return result; } std::string get_line(const std::string&, int); Token construct_identifier(const std::string& text, int& index, int& line, int& column, const std::string& fname) { std::vector id_symbols; int prev_column = column * 1; int prev_line = line * 1; char cstyle_char[2]{text[index],'\0'}; const char* character = (const char*)cstyle_char; if(std::regex_match(character,identifier)) { id_symbols.push_back(text[index]); } else { --index; --column; if(text[index] == '\n') { --line; column = get_line(text,line-1).size() + 1; g_current_line = rewind_current_line(text,index); } std::string identifier(id_symbols.begin(), id_symbols.end()); std::string* location = std::find(keywords.begin(),keywords.end(),identifier); if(location != keywords.end()) { return Token(TT_Keyword,identifier,prev_line,prev_column,fname); } return Token(TT_Identifier,identifier,prev_line,prev_column,fname); } while(index < text.size() || index == -1) { ++index; ++column; if(text[index] == '\n') { ++line; g_current_line = recalculate_current_line(text,index); column = 0; } char cstyle_char[2]{text[index],'\0'}; const char* character = (const char*)cstyle_char; if(std::regex_match(character,identifier)) { id_symbols.push_back(text[index]); } else { --index; --column; if(text[index] == '\n') { --line; column = get_line(text,line-1).size() + 1; g_current_line = rewind_current_line(text,index); } std::string 
identifier(id_symbols.begin(), id_symbols.end()); std::string* location = std::find(keywords.begin(),keywords.end(),identifier); if(location != keywords.end()) { return Token(TT_Keyword,identifier,prev_line,prev_column,fname); } return Token(TT_Identifier,identifier,prev_line,prev_column,fname); } } std::string identifier(id_symbols.begin(), id_symbols.end()); std::string* location = std::find(keywords.begin(),keywords.end(),identifier); if(location != keywords.end()) { return Token(TT_Keyword,identifier,prev_line,prev_column,fname); } return Token(TT_Identifier,identifier,prev_line,prev_column,fname); } Token construct_number(const std::string& text, int& index, int& line, int& column, const std::string& fname) { std::vector num_symbols; int dot_count = 0; int prev_column = column * 1; int prev_line = line * 1; char cstyle_char[2]{text[index],'\0'}; const char* character = (const char*)cstyle_char; if(std::regex_match(character,number)) { num_symbols.push_back(text[index]); } else if (text[index] == '.') { if (dot_count == 0) { num_symbols.push_back(text[index]); ++dot_count; } else { --index; --column; if(text[index] == '\n') { --line; g_current_line = rewind_current_line(text,index); column = get_line(text,line-1).size() + 1; } float tk_value = std::stof(std::string(num_symbols.begin(), num_symbols.end()).c_str()); return Token(TT_Float,tk_value,prev_line,prev_column,fname); } }else { --index; --column; if(text[index] == '\n') { --line; column = get_line(text,line-1).size() + 1; g_current_line = rewind_current_line(text,index); } if(dot_count != 0) { float tk_value = std::stof(std::string(num_symbols.begin(), num_symbols.end()).c_str()); return Token(TT_Float,tk_value,prev_line,prev_column,fname); } int tk_value = atoi(std::string(num_symbols.begin(), num_symbols.end()).c_str()); return Token(TT_Number,tk_value,prev_line,prev_column,fname); } while(index < text.size() || index == -1) { ++index; ++column; if(text[index] == '\n') { ++line; column = 0; 
g_current_line = recalculate_current_line(text,index); } char cstyle_char[2]{text[index],'\0'}; const char* character = (const char*)cstyle_char; if(std::regex_match(character,number)) { num_symbols.push_back(text[index]); } else if (text[index] == '.') { if (dot_count == 0) { num_symbols.push_back(text[index]); ++dot_count; } else { --index; --column; if(text[index] == '\n') { --line; g_current_line = rewind_current_line(text,index); column = get_line(text,line-1).size() + 1; } float tk_value = std::stof(std::string(num_symbols.begin(), num_symbols.end()).c_str()); return Token(TT_Float,tk_value,prev_line,prev_column,fname); } }else { --index; --column; if(text[index] == '\n') { --line; column = get_line(text,line-1).size() + 1; g_current_line = rewind_current_line(text,index); } if(dot_count != 0) { float tk_value = std::stof(std::string(num_symbols.begin(), num_symbols.end()).c_str()); return Token(TT_Float,tk_value,prev_line,prev_column,fname); } int tk_value = atoi(std::string(num_symbols.begin(), num_symbols.end()).c_str()); return Token(TT_Number,tk_value,prev_line,prev_column,fname); } } --index; --column; if(text[index] == '\n') { --line; column = get_line(text,line-1).size() + 1; g_current_line = rewind_current_line(text,index); } if(dot_count != 0) { float tk_value = std::stof(std::string(num_symbols.begin(), num_symbols.end()).c_str()); return Token(TT_Float,tk_value,prev_line,prev_column,fname); } int tk_value = atoi(std::string(num_symbols.begin(), num_symbols.end()).c_str()); return Token(TT_Number,tk_value,prev_line,prev_column,fname); } Token construct_string(const std::string& text, int& index, int& line, int& column, const std::string& fname) { std::vector string_chars; int prev_column = column * 1; int prev_line = line * 1; while(index < text.size()) { ++index; ++column; if(text[index] == '\n') { compiler_error(g_current_line,line,column,fname,"expected \', but got a newline"); } if(text[index] == '\'') { return 
Token(TT_String,std::string(string_chars.begin(),string_chars.end()),prev_line,prev_column,fname); } if(text[index] == '\\') { if(index + 1 == text.size()) { compiler_error(g_current_line,line,column,fname,"unfinished escape sequence"); } switch(text[index+1]) { case 'n': string_chars.push_back('\n'); break; case '\'': string_chars.push_back('\''); break; case '\\': string_chars.push_back('\\'); break; default: compiler_error(g_current_line,line,column,fname,"unknown escape sequence"); } ++index; ++column; continue; } string_chars.push_back(text[index]); } compiler_error(text,line,column,fname,"expected \', but got EOF"); exit(127); } std::string get_spaces(int); void compiler_error(const std::string& text, const int& line, const int& column, const std::string& fname, const std::string& details) { char linestr[32]; sprintf(linestr,"%d",line); char colstr[32]; sprintf(colstr,"%d",column); std::cerr << "\033[1;1m"; std::cerr << fname; std::cerr << ":"; std::cerr << linestr; std::cerr << ":"; std::cerr << colstr; std::cerr << ": "; std::cerr << "\033[31;49m"; std::cerr << "error: "; std::cerr << "\033[0;0m"; std::cerr << details; std::cerr << std::endl; std::cerr << linestr; std::cerr << get_spaces(4); std::cerr << text; std::cerr << std::endl; std::cerr << get_spaces(4 + std::string(linestr).size()); std::cerr << get_spaces(column - 1); std::cerr << "\033[31;49m"; std::cerr << "^"; std::cerr << "\033[0;0m"; std::cerr << std::endl; exit(1); } void compiler_warning(const std::string& text, const int& line, const int& column, const std::string& fname, const std::string& details) { char linestr[32]; sprintf(linestr,"%d",line); char colstr[32]; sprintf(colstr,"%d",column); std::cout << "\033[1;1m"; std::cout << fname; std::cout << ":"; std::cout << linestr; std::cout << ":"; std::cout << colstr; std::cout << ": "; std::cout << "\033[33;49m"; std::cout << "warning: "; std::cout << "\033[0;0m"; std::cout << details; std::cout << std::endl; std::cout << linestr; std::cout << 
get_spaces(4); std::cout << text; std::cout << std::endl; std::cout << get_spaces(4 + std::string(linestr).size()); std::cout << get_spaces(column - 1); std::cout << "\033[33;49m"; std::cout << "^"; std::cout << "\033[0;0m"; std::cout << std::endl; } std::string get_line(const std::string& str, int line_no) { std::string line; std::istringstream stream(str); while (line_no-- >= 0) std::getline(stream, line); return line; } std::string get_spaces(int spacenum) { std::string output = ""; for(int i = 0; i < spacenum; i++) { output += " "; } return output; } std::string make_asm_string(StringLiteral str) { std::string result; result += str.identifier; result += ": ; -- string literal -- \n"; result += " db "; char code_point[4]; sprintf(code_point,"%d",str.data[0]); result += std::string(code_point); for(int i = 1; i < str.data.size(); i++) { char code_point[4]; sprintf(code_point,"%d",str.data[i]); result += (", " + std::string(code_point)); } result += ", 0"; result += "\n"; return result; } std::vector evaluate_imports(const std::string& text, const std::vector& tokens) { int i = 0; std::vector ret_tk = tokens; std::vector new_tokens; while(tokens[i].tk_type != TT_EOF) { if (g_importCount > IMPORT_MAX_DEEP) { compiler_error(tokens[i].line_ctx,tokens[i].line,tokens[i].column,tokens[i].fname,"import tree too deep"); } if(tokens[i].tk_type == TT_Keyword && tokens[i].string_value == keywords[2]) { if(tokens[i+1].tk_type == TT_EOF) {compiler_error(text,tokens[i].line,tokens[i].column,tokens[i].fname,"did not expect EOF after 'import' keyword");} if(tokens[i+1].tk_type == TT_Identifier) { if(std::find(imported_files.begin(), imported_files.end(), tokens[i+1].string_value) != imported_files.end()) { compiler_error(tokens[i+2].line_ctx,tokens[i+2].line,tokens[i+2].column,tokens[i+2].fname,"file already imported"); } if(tokens[i+2].tk_type != TT_Semicolon) {compiler_error(text,tokens[i+2].line,tokens[i+2].column,tokens[i+2].fname,"expected a semicolon after import 
statement");} std::ifstream ifile(tokens[i+1].string_value + ".sp"); if (!ifile.good()) { compiler_error(tokens[i+1].line_ctx,tokens[i+1].line,tokens[i+1].column,tokens[i+1].fname,"file '" + tokens[i+1].string_value + ".sp' not found"); } ifile.close(); std::string imported_file_contents = read_file(tokens[i+1].string_value + ".sp"); std::vector imported_tokens = lex_tokens(imported_file_contents,tokens[i+1].string_value + ".sp"); imported_tokens.pop_back(); // remove EOF at end of token stream new_tokens.insert(new_tokens.end(),imported_tokens.begin(),imported_tokens.end()); ret_tk[i] = Token(TT_Null,ret_tk[i].line,ret_tk[i].column,ret_tk[i].fname); // remove import data ret_tk[i+1] = Token(TT_Null,ret_tk[i+1].line,ret_tk[i+1].column,ret_tk[i+1].fname); // remove import data ret_tk[i+2] = Token(TT_Null,ret_tk[i+2].line,ret_tk[i+2].column,ret_tk[i+2].fname); // remove import data imported_files.push_back(tokens[i+1].string_value); } else { compiler_error(tokens[i+1].line_ctx,tokens[i+1].line,tokens[i+1].column,tokens[i+1].fname,"import statement must use an identifier"); } } ++i; } if(new_tokens.size() != 0) { new_tokens.insert(new_tokens.end(),ret_tk.begin(),ret_tk.end()); ++g_importCount; return evaluate_imports(text,new_tokens); } return ret_tk; } std::vector parse_tokens(const std::vector& tokens) { int i = 0; while(i < tokens.size()) { if(tokens[i].tk_type == TT_Null) { ++i; continue; } std::cout << tokens[i].to_string() << std::endl; ++i; } return std::vector(); }