/* Original implementation of the Sapphire compiler, in one file. WARNING: THIS IS LEGACY. This was the original thing I wrote to make a simple compiler, but it was clear it wasn't going to cut it. The code was difficult to read with everything scattered across the file, and I had to throw random forward declarations everywhere. It was also limited, since my intention was to make it output assembly code manually. As you can see, the definitions are for x86-64 Linux assembly. Now I'm using LLVM, which will make it a lot easier to do stuff. Eventually, I found that splitting the code into multiple files, using classes and such, would be much better and easier. So I began a rewrite of this compiler in the src/ directory, which I found was way easier to write and more productive. The src/ compiler has come a long way since then, and this code is just kept around for me to be able to see it in the future and facepalm at it. I'm aware of Git, and I'm using it, but I didn't think keeping this would do any harm. It's very clearly separated from the rest of the code. 
*/ #include #include #include #include #include #include #include #include #include #define IMPORT_MAX_DEEP 100 #define STRLEN "strlen: ; -- length of null-terminated string in rdi --\n xor rax, rax\n mov rcx, -1\n cld\n repne scasb\n mov rax, rcx\n add rax, 2\n neg rax\n ret\n" #define PRINT "print: ; -- print null-terminated string in rdi --\n mov rsi, rdi\n call strlen\n mov rdx, rax\n mov rdi, 1\n mov rax, 1\n syscall\n ret\n" #define __FASM #ifdef __FASM #define ASM_HEADER "format ELF64 executable 3\n" #define ASM_TEXT_SECTION "segment readable executable\n" #define ASM_DATA_SECTION "segment readable writable\n" #define ASM_BSS_SECTION "" #define ASM_ENTRY "entry start\n" #define ASM_ENTRY_DECL "start:\n" #elif defined(__NASM) #define ASM_HEADER "BITS 64\n" #define ASM_TEXT_SECTION "section .text\n" #define ASM_DATA_SECTION "section .data\n" #define ASM_BSS_SECTION "section .bss\n" #define ASM_ENTRY "global _start\n" #define ASM_ENTRY_DECL "_start:\n" #else #endif std::regex letter("[a-zA-Z_]"); std::regex identifier("[a-zA-Z0-9_]"); std::regex number("[0-9]"); std::regex whitespace("[\t \n]"); int g_importCount = 0; std::string g_current_line; std::vector imported_files; enum TokenType { TT_Identifier, TT_Number, TT_Float, TT_Keyword, TT_String, TT_Plus, TT_Minus, TT_Mul, TT_Div, TT_At, TT_Equal, TT_LessThan, TT_GreaterThan, TT_LParen, TT_RParen, TT_LBracket, TT_RBracket, TT_Semicolon, TT_LoadedString, TT_EOF, TT_Null }; static std::string token_strings[] = { "TT_IDENTIFIER", "TT_NUMBER", "TT_FLOAT", "TT_KEYWORD", "TT_STRING", "TT_PLUS", "TT_MINUS", "TT_MUL", "TT_DIV", "TT_AT", "TT_EQUAL", "TT_LESSTHAN", "TT_GREATERTHAN", "TT_LPAREN", "TT_RPAREN", "TT_LBRACKET", "TT_RBRACKET", "TT_SEMICOLON", "TT_LOADEDSTRING", "TT_EOF", "TT_NULL" }; struct Token { TokenType tk_type; int int_value; std::string string_value; float float_value; int line; int column; std::string fname; std::string line_ctx; Token(const TokenType& type, const int& lineno, const int& colno, const 
std::string& name) : tk_type(type), line(lineno), column(colno), fname(name) { line_ctx = g_current_line; } Token(const TokenType& type, const int& val, const int& lineno, const int& colno, const std::string& name) : tk_type(type), int_value(val), line(lineno), column(colno), fname(name) { line_ctx = g_current_line; } Token(const TokenType& type, const std::string& val, const int& lineno, const int& colno, const std::string& name) : tk_type(type), string_value(val), line(lineno), column(colno), fname(name) { line_ctx = g_current_line; } Token(const TokenType& type, const float& val, const int& lineno, const int& colno, const std::string& name) : tk_type(type), float_value(val), line(lineno), column(colno), fname(name) { line_ctx = g_current_line; } std::string to_string() const { char linestr[32]; sprintf(linestr,"%d",line); char colstr[32]; sprintf(colstr,"%d",column); if(tk_type == TT_Number) { char num[32]; sprintf(num,"%d",int_value); return "INT:" + std::string(num) + " (" + fname + ":" + std::string(linestr) + ":" + std::string(colstr) + ")"; } else if (tk_type == TT_Float) { char num[64]; sprintf(num,"%f",float_value); return "FLOAT:" + std::string(num) + " (" + fname + ":" + std::string(linestr) + ":" + std::string(colstr) + ")"; } else if (tk_type == TT_Identifier){ return "ID:" + string_value + " (" + fname + ":" + std::string(linestr) + ":" + std::string(colstr) + ")"; } else if (tk_type == TT_Keyword){ return "KEYWORD:" + string_value + " (" + fname + ":" + std::string(linestr) + ":" + std::string(colstr) + ")"; } else if (tk_type == TT_String) { return "STRING:" + std::string("\'") + string_value + std::string("\'") + " (" + fname + ":" + std::string(linestr) + ":" + std::string(colstr) + ")"; } std::string details = std::string(" (") + fname + ":" + std::string(linestr) + ":" + std::string(colstr) + ")"; switch(tk_type) { case TT_EOF: return "EOF" + details; case TT_Plus: return "PLUS" + details; case TT_Minus: return "MINUS" + details; case TT_Mul: 
return "MUL" + details; case TT_Div: return "DIV" + details; case TT_At: return "AT" + details; case TT_Equal: return "EQUAL" + details; case TT_LessThan: return "LESSTHAN" + details; case TT_GreaterThan: return "GREATERTHAN" + details; case TT_LParen: return "LPAREN" + details; case TT_RParen: return "RPAREN" + details; case TT_LBracket: return "LBRACKET" + details; case TT_RBracket: return "RBRACKET" + details; case TT_Semicolon: return "SEMICOLON" + details; case TT_LoadedString: return "LDSTRING" + details; } return ""; } }; std::string tokentype_as_string(const TokenType& type) { return token_strings[type]; } struct Sentence { std::string type_name; }; struct Function : public Sentence { std::string type_name = "function"; std::vector fun_tokens; Function(std::vector tokens) : fun_tokens(tokens){} }; struct DeclVar : public Sentence { std::string type_name = "decl"; std::vector vtokens; DeclVar(std::vector tokens) : vtokens(tokens){} }; struct Variable { int size; std::string identifier; Variable(int _size, std::string _identifier) : size(_size), identifier(_identifier){} }; struct StringLiteral { std::string data; std::string identifier; StringLiteral(std::string _data, std::string _identifier) : data(_data), identifier(_identifier){} }; std::array keywords{"out", "var","import"}; std::vector registered_vars; std::vector registered_strings; std::string make_asm_string(StringLiteral str); std::vector lex_tokens(const std::string&, const std::string&); std::vector evaluate_imports(const std::string& text, const std::vector& tokens); std::string read_file(const std::string&); void compiler_error(const std::string& text, const int& line, const int& column, const std::string& fname, const std::string& details); void compiler_warning(const std::string& text, const int& line, const int& column, const std::string& fname, const std::string& details); std::vector parse_tokens(const std::vector& tokens); int main(int argc, char** argv) { std::string fname; if(argc < 2) 
{ fname = "test.sp"; } else fname = (const char*)argv[1]; std::string command = read_file(fname); std::vector main_tokens = lex_tokens(command,fname); imported_files.push_back(fname.substr(0,fname.find_last_of('.'))); main_tokens = evaluate_imports(command,main_tokens); parse_tokens(main_tokens); std::string assembly; assembly += ASM_HEADER; assembly += "; Assembly generated by the Sapphire compiler.\n"; assembly += ASM_TEXT_SECTION; assembly += ASM_ENTRY; assembly += STRLEN; assembly += PRINT; assembly += ASM_ENTRY_DECL; assembly += "; -- exit with code 0 --\n"; assembly += " mov rax, 60\n"; assembly += " xor rdi, rdi\n"; assembly += " syscall\n"; if(registered_strings.size() != 0) { assembly += "\n"; assembly += ASM_DATA_SECTION; for(auto asm_string : registered_strings) { assembly += make_asm_string(asm_string); } } std::string outfile_basename = fname.substr(0,fname.find_last_of('.')); std::ofstream outfile(outfile_basename + ".asm"); outfile << assembly; outfile.close(); #ifdef __NASM system(std::string("nasm -f elf64 " + outfile_basename + ".asm -o" + outfile_basename + ".o").c_str()); system(std::string("ld " + outfile_basename + ".o -o" + outfile_basename).c_str()); #else system(std::string("fasm " + outfile_basename + ".asm").c_str()); system((std::string("chmod +x ") + outfile_basename).c_str()); #endif std::cout << fname + " > " + std::string(outfile_basename) + "\n"; } Token construct_identifier(const std::string&, int& , int&, int&, const std::string&); Token construct_number(const std::string&, int&, int&, int&, const std::string&); Token construct_string(const std::string&, int&, int&, int&, const std::string&); std::string read_file(const std::string& fname) { std::ifstream main_file; main_file.open(fname); if(!main_file.is_open()) return ""; std::vector file_chars; char fchar; while ( main_file ) { fchar = main_file.get(); if(fchar != -1 ) file_chars.push_back(fchar); } main_file.close(); return std::string(file_chars.begin(),file_chars.end()); } 
std::string recalculate_current_line(const std::string& text, int index) { std::string final_str; ++index; while(index != text.size() && text[index] != '\n') { final_str += text[index]; ++index; } return final_str; } std::string rewind_current_line(const std::string& text, int index) { --index; while(text[index] != '\n') { --index; if(index == 0) { return recalculate_current_line(text,-1); } } return recalculate_current_line(text,index); } std::vector lex_tokens(const std::string& text, const std::string& fname) { int line = 1; int column = 0; int index = -1; std::vector result; bool comment = false; g_current_line = recalculate_current_line(text,-1); while(index < (int)text.size()) { ++index; ++column; if(text[index] == '\n') { ++line; column = 0; g_current_line = recalculate_current_line(text,index); comment = false; } if(comment) continue; char cstyle_char[2]{text[index],'\0'}; const char* character = (const char*)cstyle_char; if(std::regex_match(character,whitespace)) { continue; } else if(std::regex_match(character,letter)) { result.push_back(construct_identifier(text,index,line,column,fname)); } else if(std::regex_match(character,number)) { result.push_back(construct_number(text,index,line,column,fname)); } else if (text[index] == '\'') { result.push_back(construct_string(text,index,line,column,fname)); } else if(index == text.size()) { result.push_back(Token(TT_EOF,line,column,fname)); } else switch(text[index]) { case '+': result.push_back(Token(TT_Plus,line,column,fname)); break; case '-': result.push_back(Token(TT_Minus,line,column,fname)); break; case '*': result.push_back(Token(TT_Mul,line,column,fname)); break; case '/': if(index != text.size()) { if(text[index+1] == '/') { comment = true; break; } } result.push_back(Token(TT_Div,line,column,fname)); break; case '@': result.push_back(Token(TT_At,line,column,fname)); break; case '=': result.push_back(Token(TT_Equal,line,column,fname)); break; case '<': 
result.push_back(Token(TT_LessThan,line,column,fname)); break; case '>': result.push_back(Token(TT_GreaterThan,line,column,fname)); break; case '(': result.push_back(Token(TT_LParen,line,column,fname)); break; case ')': result.push_back(Token(TT_RParen,line,column,fname)); break; case '{': result.push_back(Token(TT_LBracket,line,column,fname)); break; case '}': result.push_back(Token(TT_RBracket,line,column,fname)); break; case ';': result.push_back(Token(TT_Semicolon,line,column,fname)); break; default: compiler_error(g_current_line,line,column,fname,"unknown character"); break; } } return result; } std::string get_line(const std::string&, int); Token construct_identifier(const std::string& text, int& index, int& line, int& column, const std::string& fname) { std::vector id_symbols; int prev_column = column * 1; int prev_line = line * 1; char cstyle_char[2]{text[index],'\0'}; const char* character = (const char*)cstyle_char; if(std::regex_match(character,identifier)) { id_symbols.push_back(text[index]); } else { --index; --column; if(text[index] == '\n') { --line; column = get_line(text,line-1).size() + 1; g_current_line = rewind_current_line(text,index); } std::string identifier(id_symbols.begin(), id_symbols.end()); std::string* location = std::find(keywords.begin(),keywords.end(),identifier); if(location != keywords.end()) { return Token(TT_Keyword,identifier,prev_line,prev_column,fname); } return Token(TT_Identifier,identifier,prev_line,prev_column,fname); } while(index < text.size() || index == -1) { ++index; ++column; if(text[index] == '\n') { ++line; g_current_line = recalculate_current_line(text,index); column = 0; } char cstyle_char[2]{text[index],'\0'}; const char* character = (const char*)cstyle_char; if(std::regex_match(character,identifier)) { id_symbols.push_back(text[index]); } else { --index; --column; if(text[index] == '\n') { --line; column = get_line(text,line-1).size() + 1; g_current_line = rewind_current_line(text,index); } std::string 
identifier(id_symbols.begin(), id_symbols.end()); std::string* location = std::find(keywords.begin(),keywords.end(),identifier); if(location != keywords.end()) { return Token(TT_Keyword,identifier,prev_line,prev_column,fname); } return Token(TT_Identifier,identifier,prev_line,prev_column,fname); } } std::string identifier(id_symbols.begin(), id_symbols.end()); std::string* location = std::find(keywords.begin(),keywords.end(),identifier); if(location != keywords.end()) { return Token(TT_Keyword,identifier,prev_line,prev_column,fname); } return Token(TT_Identifier,identifier,prev_line,prev_column,fname); } Token construct_number(const std::string& text, int& index, int& line, int& column, const std::string& fname) { std::vector num_symbols; int dot_count = 0; int prev_column = column * 1; int prev_line = line * 1; char cstyle_char[2]{text[index],'\0'}; const char* character = (const char*)cstyle_char; if(std::regex_match(character,number)) { num_symbols.push_back(text[index]); } else if (text[index] == '.') { if (dot_count == 0) { num_symbols.push_back(text[index]); ++dot_count; } else { --index; --column; if(text[index] == '\n') { --line; g_current_line = rewind_current_line(text,index); column = get_line(text,line-1).size() + 1; } float tk_value = std::stof(std::string(num_symbols.begin(), num_symbols.end()).c_str()); return Token(TT_Float,tk_value,prev_line,prev_column,fname); } }else { --index; --column; if(text[index] == '\n') { --line; column = get_line(text,line-1).size() + 1; g_current_line = rewind_current_line(text,index); } if(dot_count != 0) { float tk_value = std::stof(std::string(num_symbols.begin(), num_symbols.end()).c_str()); return Token(TT_Float,tk_value,prev_line,prev_column,fname); } int tk_value = atoi(std::string(num_symbols.begin(), num_symbols.end()).c_str()); return Token(TT_Number,tk_value,prev_line,prev_column,fname); } while(index < text.size() || index == -1) { ++index; ++column; if(text[index] == '\n') { ++line; column = 0; 
g_current_line = recalculate_current_line(text,index); } char cstyle_char[2]{text[index],'\0'}; const char* character = (const char*)cstyle_char; if(std::regex_match(character,number)) { num_symbols.push_back(text[index]); } else if (text[index] == '.') { if (dot_count == 0) { num_symbols.push_back(text[index]); ++dot_count; } else { --index; --column; if(text[index] == '\n') { --line; g_current_line = rewind_current_line(text,index); column = get_line(text,line-1).size() + 1; } float tk_value = std::stof(std::string(num_symbols.begin(), num_symbols.end()).c_str()); return Token(TT_Float,tk_value,prev_line,prev_column,fname); } }else { --index; --column; if(text[index] == '\n') { --line; column = get_line(text,line-1).size() + 1; g_current_line = rewind_current_line(text,index); } if(dot_count != 0) { float tk_value = std::stof(std::string(num_symbols.begin(), num_symbols.end()).c_str()); return Token(TT_Float,tk_value,prev_line,prev_column,fname); } int tk_value = atoi(std::string(num_symbols.begin(), num_symbols.end()).c_str()); return Token(TT_Number,tk_value,prev_line,prev_column,fname); } } --index; --column; if(text[index] == '\n') { --line; column = get_line(text,line-1).size() + 1; g_current_line = rewind_current_line(text,index); } if(dot_count != 0) { float tk_value = std::stof(std::string(num_symbols.begin(), num_symbols.end()).c_str()); return Token(TT_Float,tk_value,prev_line,prev_column,fname); } int tk_value = atoi(std::string(num_symbols.begin(), num_symbols.end()).c_str()); return Token(TT_Number,tk_value,prev_line,prev_column,fname); } Token construct_string(const std::string& text, int& index, int& line, int& column, const std::string& fname) { std::vector string_chars; int prev_column = column * 1; int prev_line = line * 1; while(index < text.size()) { ++index; ++column; if(text[index] == '\n') { compiler_error(g_current_line,line,column,fname,"expected \', but got a newline"); } if(text[index] == '\'') { return 
Token(TT_String,std::string(string_chars.begin(),string_chars.end()),prev_line,prev_column,fname); } if(text[index] == '\\') { if(index + 1 == text.size()) { compiler_error(g_current_line,line,column,fname,"unfinished escape sequence"); } switch(text[index+1]) { case 'n': string_chars.push_back('\n'); break; case '\'': string_chars.push_back('\''); break; case '\\': string_chars.push_back('\\'); break; default: compiler_error(g_current_line,line,column,fname,"unknown escape sequence"); } ++index; ++column; continue; } string_chars.push_back(text[index]); } compiler_error(text,line,column,fname,"expected \', but got EOF"); exit(127); } std::string get_spaces(int); void compiler_error(const std::string& text, const int& line, const int& column, const std::string& fname, const std::string& details) { char linestr[32]; sprintf(linestr,"%d",line); char colstr[32]; sprintf(colstr,"%d",column); std::cerr << "\033[1;1m"; std::cerr << fname; std::cerr << ":"; std::cerr << linestr; std::cerr << ":"; std::cerr << colstr; std::cerr << ": "; std::cerr << "\033[31;49m"; std::cerr << "error: "; std::cerr << "\033[0;0m"; std::cerr << details; std::cerr << std::endl; std::cerr << linestr; std::cerr << get_spaces(4); std::cerr << text; std::cerr << std::endl; std::cerr << get_spaces(4 + std::string(linestr).size()); std::cerr << get_spaces(column - 1); std::cerr << "\033[31;49m"; std::cerr << "^"; std::cerr << "\033[0;0m"; std::cerr << std::endl; exit(1); } void compiler_warning(const std::string& text, const int& line, const int& column, const std::string& fname, const std::string& details) { char linestr[32]; sprintf(linestr,"%d",line); char colstr[32]; sprintf(colstr,"%d",column); std::cout << "\033[1;1m"; std::cout << fname; std::cout << ":"; std::cout << linestr; std::cout << ":"; std::cout << colstr; std::cout << ": "; std::cout << "\033[33;49m"; std::cout << "warning: "; std::cout << "\033[0;0m"; std::cout << details; std::cout << std::endl; std::cout << linestr; std::cout << 
get_spaces(4); std::cout << text; std::cout << std::endl; std::cout << get_spaces(4 + std::string(linestr).size()); std::cout << get_spaces(column - 1); std::cout << "\033[33;49m"; std::cout << "^"; std::cout << "\033[0;0m"; std::cout << std::endl; } std::string get_line(const std::string& str, int line_no) { std::string line; std::istringstream stream(str); while (line_no-- >= 0) std::getline(stream, line); return line; } std::string get_spaces(int spacenum) { std::string output = ""; for(int i = 0; i < spacenum; i++) { output += " "; } return output; } std::string make_asm_string(StringLiteral str) { std::string result; result += str.identifier; result += ": ; -- string literal -- \n"; result += " db "; char code_point[4]; sprintf(code_point,"%d",str.data[0]); result += std::string(code_point); for(int i = 1; i < str.data.size(); i++) { char code_point[4]; sprintf(code_point,"%d",str.data[i]); result += (", " + std::string(code_point)); } result += ", 0"; result += "\n"; return result; } std::vector evaluate_imports(const std::string& text, const std::vector& tokens) { int i = 0; std::vector ret_tk = tokens; std::vector new_tokens; while(tokens[i].tk_type != TT_EOF) { if (g_importCount > IMPORT_MAX_DEEP) { compiler_error(tokens[i].line_ctx,tokens[i].line,tokens[i].column,tokens[i].fname,"import tree too deep"); } if(tokens[i].tk_type == TT_Keyword && tokens[i].string_value == keywords[2]) { if(tokens[i+1].tk_type == TT_EOF) {compiler_error(text,tokens[i].line,tokens[i].column,tokens[i].fname,"did not expect EOF after 'import' keyword");} if(tokens[i+1].tk_type == TT_Identifier) { if(std::find(imported_files.begin(), imported_files.end(), tokens[i+1].string_value) != imported_files.end()) { compiler_error(tokens[i+2].line_ctx,tokens[i+2].line,tokens[i+2].column,tokens[i+2].fname,"file already imported"); } if(tokens[i+2].tk_type != TT_Semicolon) {compiler_error(text,tokens[i+2].line,tokens[i+2].column,tokens[i+2].fname,"expected a semicolon after import 
statement");} std::ifstream ifile(tokens[i+1].string_value + ".sp"); if (!ifile.good()) { compiler_error(tokens[i+1].line_ctx,tokens[i+1].line,tokens[i+1].column,tokens[i+1].fname,"file '" + tokens[i+1].string_value + ".sp' not found"); } ifile.close(); std::string imported_file_contents = read_file(tokens[i+1].string_value + ".sp"); std::vector imported_tokens = lex_tokens(imported_file_contents,tokens[i+1].string_value + ".sp"); imported_tokens.pop_back(); // remove EOF at end of token stream new_tokens.insert(new_tokens.end(),imported_tokens.begin(),imported_tokens.end()); ret_tk[i] = Token(TT_Null,ret_tk[i].line,ret_tk[i].column,ret_tk[i].fname); // remove import data ret_tk[i+1] = Token(TT_Null,ret_tk[i+1].line,ret_tk[i+1].column,ret_tk[i+1].fname); // remove import data ret_tk[i+2] = Token(TT_Null,ret_tk[i+2].line,ret_tk[i+2].column,ret_tk[i+2].fname); // remove import data imported_files.push_back(tokens[i+1].string_value); } else { compiler_error(tokens[i+1].line_ctx,tokens[i+1].line,tokens[i+1].column,tokens[i+1].fname,"import statement must use an identifier"); } } ++i; } if(new_tokens.size() != 0) { new_tokens.insert(new_tokens.end(),ret_tk.begin(),ret_tk.end()); ++g_importCount; return evaluate_imports(text,new_tokens); } return ret_tk; } std::vector parse_tokens(const std::vector& tokens) { int i = 0; while(i < tokens.size()) { if(tokens[i].tk_type == TT_Null) { ++i; continue; } std::cout << tokens[i].to_string() << std::endl; ++i; } return std::vector(); }