shitshow

A shitty programming language
git clone git://git.bain.cz/shitshow.git

commit da79a2cc7217c2e804dd286e603080c35b1b7076
Author: bain3 <31798786+bain3@users.noreply.github.com>
Date:   Fri, 30 Apr 2021 10:02:36 +0200

Initial Commit

Diffstat:
A CMakeLists.txt      |   9 +++++++++
A lexer/handlers.cpp  |  58 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A lexer/lexer.cpp     |  56 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A lexer/lexer.h       |  54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
A main.cpp            |  20 ++++++++++++++++++++
A parser/parser.cpp   | 119 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A parser/parser.h     |  70 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
7 files changed, 386 insertions(+), 0 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -0,0 +1,8 @@
+cmake_minimum_required(VERSION 3.19)
+project(shitshow)
+
+set(CMAKE_CXX_STANDARD 20) # designated initializers in main.cpp need C++20
+
+file(GLOB lexer_src lexer/*.cpp lexer/*.h)
+file(GLOB parser_src parser/*.cpp parser/*.h)
+add_executable(shitshow ${lexer_src} ${parser_src} main.cpp)
\ No newline at end of file
diff --git a/lexer/handlers.cpp b/lexer/handlers.cpp
@@ -0,0 +1,57 @@
+#include "lexer.h"
+#include <regex>
+
+int lexer::handlers::single_char(const lexer::GrammarRule &rule, lexer::Token &token, const std::string &input) {
+    if (input[0] == rule.definition[0]) {
+        token.type = rule.type;
+        token.value = input[0];
+        return 1;
+    }
+    return 0;
+}
+
+int lexer::handlers::multi_char(const GrammarRule &rule, Token &token, const std::string &input) {
+    if (input.length() >= rule.definition.length()) {
+        std::string sub = input.substr(0, rule.definition.length()); // compare the leading characters, not the tail
+        if (sub == rule.definition) {
+            token.type = rule.type;
+            token.value = sub;
+            return rule.definition.length(); // NOLINT(cppcoreguidelines-narrowing-conversions)
+        }
+    }
+    return 0;
+}
+
+int lexer::handlers::regex(const GrammarRule &rule, Token &token, const std::string &input) {
+    std::regex rgx(rule.definition);
+    int consumed = 0;
+    std::string val;
+    val += input[0];
+    while (consumed < (int) input.length() && std::regex_match(val, rgx)) { // never read past the end of input
+        consumed++;
+        val += input[consumed];
+    }
+    if (consumed > 0) {
+        token.type = rule.type;
+        val.pop_back();
+        token.value = val;
+    }
+    return consumed;
+}
+
+int lexer::handlers::string_handler(const GrammarRule &rule, Token &token, const std::string &input) {
+    int length = input.length(); // NOLINT(cppcoreguidelines-narrowing-conversions)
+    int consumed = 0;
+    std::string val;
+    if (input[0] == '"') {
+        for (consumed = 1; consumed < length && input[consumed] != '"'; consumed++) {
+            val += input[consumed];
+        }
+    }
+    if (consumed && consumed < length) { // the loop stopped on a closing quote
+        consumed++;
+        token.type = rule.type;
+        token.value = val;
+    } else consumed = 0; // unterminated string: match nothing
+    return consumed;
+}
\ No newline at end of file
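
A note on the handler contract, since the diff leaves it implicit: a handler returns the number of characters it consumed from the front of input (0 meaning "no match") and fills in token only on success. The regex handler grows its candidate one character at a time and re-runs a full match each step, so a rule's pattern has to match every prefix of the final lexeme. A minimal sketch of calling it directly, assuming it is compiled together with lexer/handlers.cpp (the harness itself is not part of this commit):

    #include <iostream>
    #include "lexer/lexer.h"

    int main() {
        // NAME rule from main.cpp; every prefix ("f", "fo", ...) matches the pattern,
        // so the handler keeps extending until the trailing space breaks the match.
        lexer::GrammarRule rule{lexer::NAME, R"([A-Za-z_](?:[\w]+)?)", lexer::REGEX};
        lexer::Token tok{};
        int n = lexer::handlers::regex(rule, tok, "foo_bar = 1;");
        std::cout << n << " chars consumed, value: " << tok.value << std::endl;
        // prints: 7 chars consumed, value: foo_bar
    }
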
diff --git a/lexer/lexer.cpp b/lexer/lexer.cpp
@@ -0,0 +1,56 @@
+
+#include "lexer.h"
+
+#include <utility>
+#include <string>
+#include <iostream>
+
+lexer::Lexer::Lexer(std::vector<GrammarRule> grammar_rules) {
+    this->rules = std::move(grammar_rules);
+}
+
+std::vector<lexer::Token> lexer::Lexer::tokenize_line(const std::string& line) { // NOLINT(readability-convert-member-functions-to-static)
+    return std::vector<lexer::Token>(); // unimplemented stub
+}
+
+std::vector<lexer::Token> lexer::Lexer::tokenize(const std::string &part) {
+    std::string cpy = part;
+    std::vector<Token> output;
+    if (cpy.empty()) return output;
+    int total_consumed = 0;
+    while (!cpy.empty()) {
+        int consumed = 0;
+        if (cpy[0] == ' ') {total_consumed++; cpy.erase(0, 1); continue;} // skip whitespace
+        Token token{};
+        for (const GrammarRule& rule : rules) {
+            switch (rule.handler) {
+                case SINGLE_CHAR:
+                    consumed = handlers::single_char(rule, token, cpy);
+                    break;
+                case MULTI_CHAR:
+                    consumed = handlers::multi_char(rule, token, cpy);
+                    break;
+                case REGEX:
+                    consumed = handlers::regex(rule, token, cpy);
+                    break;
+                case STRING:
+                    consumed = handlers::string_handler(rule, token, cpy);
+                    break;
+            }
+            if (consumed) break;
+        }
+        if (consumed) {
+            output.push_back(token);
+        } else {
+            std::cerr << "Error on pos " << total_consumed << ". Unrecognized token." << std::endl;
+            std::cerr << part << std::endl;
+            for (int i = 0; i < total_consumed; i++) std::cerr << " ";
+            std::cerr << "^" << std::endl;
+            std::exit(1);
+        }
+        cpy.erase(0, consumed);
+        total_consumed += consumed;
+    }
+    return output;
+}
+
diff --git a/lexer/lexer.h b/lexer/lexer.h
@@ -0,0 +1,54 @@
+#ifndef SHITSHOW_LEXER_H
+#define SHITSHOW_LEXER_H
+
+
+#include <string>
+#include <vector>
+
+namespace lexer {
+    enum TokenType {
+        NAME,
+        SEMICOLON,
+        TYPE_INT,
+        ASSIGNMENT,
+        PRINT,
+        NUMBER_LITERAL,
+        STRING_LITERAL,
+        LEFT_PARENT,
+        RIGHT_PARENT,
+        LEFT_BRACKET,
+        RIGHT_BRACKET
+    };
+    enum HandlerType {
+        SINGLE_CHAR,
+        MULTI_CHAR,
+        REGEX,
+        STRING
+    };
+    struct GrammarRule {
+        TokenType type;
+        std::string definition;
+        HandlerType handler;
+    };
+    struct Token {
+        TokenType type;
+        std::string value;
+    };
+    class Lexer {
+        std::vector<GrammarRule> rules;
+    public:
+        explicit Lexer(std::vector<GrammarRule> grammar_rules);
+        std::vector<Token> tokenize_line(const std::string& line);
+        std::vector<Token> tokenize(const std::string &part);
+    };
+
+    namespace handlers {
+        int single_char   (const GrammarRule &rule, Token &token, const std::string &input);
+        int multi_char    (const GrammarRule &rule, Token &token, const std::string &input);
+        int regex         (const GrammarRule &rule, Token &token, const std::string &input);
+        int string_handler(const GrammarRule &rule, Token &token, const std::string &input);
+    }
+}
+
+
+#endif //SHITSHOW_LEXER_H
diff --git a/main.cpp b/main.cpp
@@ -0,0 +1,20 @@
+#include <iostream>
+#include "lexer/lexer.h"
+
+int main() {
+    lexer::Lexer lxr({
+        {.type=lexer::TokenType::SEMICOLON, .definition=";", .handler=lexer::HandlerType::SINGLE_CHAR},
+        {.type=lexer::TokenType::ASSIGNMENT, .definition="=", .handler=lexer::HandlerType::SINGLE_CHAR},
+        {.type=lexer::TokenType::LEFT_PARENT, .definition="(", .handler=lexer::HandlerType::SINGLE_CHAR},
+        {.type=lexer::TokenType::RIGHT_PARENT, .definition=")", .handler=lexer::HandlerType::SINGLE_CHAR},
+        {.type=lexer::TokenType::LEFT_BRACKET, .definition="{", .handler=lexer::HandlerType::SINGLE_CHAR},
+        {.type=lexer::TokenType::RIGHT_BRACKET, .definition="}", .handler=lexer::HandlerType::SINGLE_CHAR},
+        {.type=lexer::TokenType::NAME, .definition=R"([A-Za-z_](?:[\w]+)?)", .handler=lexer::HandlerType::REGEX},
+        {.type=lexer::TokenType::NUMBER_LITERAL, .definition=R"([0-9]+(?:\.[0-9]+)?)", .handler=lexer::HandlerType::REGEX},
+        {.type=lexer::TokenType::STRING_LITERAL, .handler=lexer::HandlerType::STRING}
+    });
+    std::vector<lexer::Token> out = lxr.tokenize("int i = 0; print i;");
+    for (const lexer::Token& token : out) {
+        std::cout << token.type << ": " << token.value << std::endl;
+    }
+}
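
One gap worth flagging in the rule table: nothing registers lexer::TYPE_INT or lexer::PRINT, so "int" and "print" in the test string tokenize as NAME, while parser.cpp below switches on TYPE_INT and PRINT. A hypothetical fix (not part of this commit) is a pair of MULTI_CHAR keyword rules placed before the NAME rule, since tokenize takes the first handler that consumes anything. Note that multi_char does a plain prefix compare with no word-boundary check, so "integer" would still lex as TYPE_INT "int" followed by NAME "eger":

    #include "lexer/lexer.h"

    // Hypothetical helper, not in this commit: keyword rules must precede the
    // NAME rule because Lexer::tokenize stops at the first rule that matches.
    lexer::Lexer make_lexer() {
        return lexer::Lexer({
            {.type=lexer::TokenType::TYPE_INT, .definition="int", .handler=lexer::HandlerType::MULTI_CHAR},
            {.type=lexer::TokenType::PRINT, .definition="print", .handler=lexer::HandlerType::MULTI_CHAR},
            {.type=lexer::TokenType::SEMICOLON, .definition=";", .handler=lexer::HandlerType::SINGLE_CHAR},
            {.type=lexer::TokenType::ASSIGNMENT, .definition="=", .handler=lexer::HandlerType::SINGLE_CHAR},
            {.type=lexer::TokenType::NAME, .definition=R"([A-Za-z_](?:[\w]+)?)", .handler=lexer::HandlerType::REGEX},
            {.type=lexer::TokenType::NUMBER_LITERAL, .definition=R"([0-9]+(?:\.[0-9]+)?)", .handler=lexer::HandlerType::REGEX}
        });
    }
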
diff --git a/parser/parser.cpp b/parser/parser.cpp
@@ -0,0 +1,119 @@
+
+#include "parser.h"
+#include <iostream>
+
+void error(const std::string& msg) {
+    std::cerr << "Error while parsing:" << std::endl << msg << std::endl;
+    std::exit(1);
+}
+
+void error(const std::string &msg, const std::string &linectx) {
+    std::cerr << "Error while parsing:" << std::endl << msg << std::endl;
+    std::cerr << "    " << linectx << std::endl;
+    std::exit(1);
+}
+
+void error(const std::string &msg, const std::string &linectx, const int &column) {
+    std::cerr << "Error while parsing:" << std::endl << msg << std::endl;
+    std::cerr << "    " << linectx << std::endl;
+    for (int i = 0; i < column+4; i++) std::cerr << " ";
+    std::cerr << "^" << std::endl;
+    std::exit(1);
+}
+
+std::string reconstruct_code(const std::vector<lexer::Token>& tokens) {
+    std::string output;
+    for (const auto &token : tokens) output += token.value;
+    return output;
+}
+
+parser::elements::Block parser::parse_block(const std::vector<lexer::Token> &token_stream, int start_at) {
+    int consumed = start_at;
+    int brackets = 0;
+    elements::Block block;
+    std::vector<lexer::Token> statement_tokens;
+    while (consumed < token_stream.size()) {
+        lexer::Token token = token_stream[consumed++]; // advance, or the loop never terminates
+        switch (token.type) {
+            case lexer::SEMICOLON:
+                block.children.push_back(parse_statement(statement_tokens));
+                statement_tokens.clear();
+                break;
+            case lexer::LEFT_BRACKET:
+                brackets++;
+                statement_tokens.push_back(token);
+                break;
+            case lexer::RIGHT_BRACKET:
+                if (!brackets) {
+                    block.children.push_back(parse_statement(statement_tokens));
+                    statement_tokens.clear();
+                }
+                break;
+            default:
+                statement_tokens.push_back(token);
+        }
+    }
+    return block;
+}
+
+parser::elements::Statement parser::parse_statement(const std::vector<lexer::Token> &token_stream) {
+    int consumed = 0;
+    elements::Statement statement;
+    if (token_stream.empty()) {
+        return statement;
+    }
+    while (true) {
+        lexer::Token token = token_stream[consumed];
+        switch (token.type) {
+            case lexer::NAME: {
+                if (token_stream.size() < consumed+2) {
+                    return statement;
+                }
+                lexer::Token token2 = token_stream[consumed+1];
+                switch (token2.type) {
+                    case lexer::ASSIGNMENT: {
+                        if (token_stream.size() < consumed+3) {
+                            error("Nothing to assign.", reconstruct_code(token_stream));
+                        }
+                        elements::Expression exp = parse_expression(token_stream, consumed+2);
+                        elements::Assignment assignment {
+                            .name = token.value,
+                            .value = exp,
+                        };
+                        statement.children.push_back(assignment);
+                        break;
+                    }
+                    default:
+                        error("Token: "+token2.value+" unexpected at this point.", reconstruct_code(token_stream));
+                }
+                break;
+            }
+            case lexer::TYPE_INT: {
+                if (token_stream.size() < consumed+2) {
+                    error("What am I declaring? Missing name to declare.", reconstruct_code(token_stream));
+                } else if (token_stream[consumed+1].type != lexer::TokenType::NAME) {
+                    error("Can only declare names.", reconstruct_code(token_stream));
+                }
+                elements::Declaration declaration {
+                    .name = token_stream[consumed+1].value,
+                    .data_type = INT
+                };
+                statement.children.push_back(declaration);
+                if (token_stream.size() > consumed+2 && token_stream[consumed+2].type == lexer::ASSIGNMENT)
+                    { consumed++; continue; } // step to the name and re-run the loop to pick up the assignment
+            }
+            case lexer::PRINT:
+            case lexer::LEFT_BRACKET:
+                break;
+            default:
+                error("Token: " + token.value + " unexpected at this point", reconstruct_code(token_stream));
+        }
+        break; // break out of the main loop
+    }
+    return statement;
+}
+
+parser::elements::Expression parser::parse_expression(const std::vector<lexer::Token> &token_stream, int start_at) {
+    return parser::elements::Expression(); // stub: expressions are not parsed yet
+}
+
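
parse_expression is still a stub, so an Assignment currently carries an empty Expression and the declaration-plus-assignment path is the only one exercised end to end. A hypothetical driver wiring the two halves together (this commit's main.cpp never calls the parser; the rule table here is trimmed to just what the input needs):

    #include <iostream>
    #include "lexer/lexer.h"
    #include "parser/parser.h"

    int main() {
        lexer::Lexer lxr({
            {.type=lexer::TokenType::TYPE_INT, .definition="int", .handler=lexer::HandlerType::MULTI_CHAR},
            {.type=lexer::TokenType::SEMICOLON, .definition=";", .handler=lexer::HandlerType::SINGLE_CHAR},
            {.type=lexer::TokenType::ASSIGNMENT, .definition="=", .handler=lexer::HandlerType::SINGLE_CHAR},
            {.type=lexer::TokenType::NAME, .definition=R"([A-Za-z_](?:[\w]+)?)", .handler=lexer::HandlerType::REGEX},
            {.type=lexer::TokenType::NUMBER_LITERAL, .definition=R"([0-9]+(?:\.[0-9]+)?)", .handler=lexer::HandlerType::REGEX}
        });
        std::vector<lexer::Token> tokens = lxr.tokenize("int i = 0;");
        // The semicolon triggers parse_statement, yielding one statement that
        // holds a Declaration followed by an Assignment.
        parser::elements::Block program = parser::parse_block(tokens, 0);
        std::cout << "parsed statements: " << program.children.size() << std::endl; // prints 1
    }
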
diff --git a/parser/parser.h b/parser/parser.h
@@ -0,0 +1,70 @@
+
+#ifndef SHITSHOW_PARSER_H
+#define SHITSHOW_PARSER_H
+#include <string>
+#include <vector>
+#include "../lexer/lexer.h"
+
+namespace parser {
+    enum ParserElementType {
+        BLOCK,
+        EXPRESSION,
+        STATEMENT,
+        DECLARATION,
+        ASSIGNMENT,
+        PRINT,
+        CONST_DEFINE,
+        NAME,
+        CALL
+    };
+    enum DataType {
+        INT,
+        STRING
+    };
+    namespace elements {
+        struct ParserElement { ParserElementType type; };
+        struct Expression : ParserElement {
+            ParserElementType type = EXPRESSION;
+            std::vector<ParserElement> children;
+        };
+        struct Statement : Expression {
+            ParserElementType type = STATEMENT;
+        };
+        struct Block : ParserElement {
+            ParserElementType type = BLOCK;
+            std::vector<Statement> children;
+        };
+        struct Declaration : ParserElement {
+            ParserElementType type = DECLARATION;
+            std::string name;
+            DataType data_type;
+        };
+        struct Assignment : ParserElement {
+            ParserElementType type = ASSIGNMENT;
+            std::string name;
+            Expression value;
+        };
+        struct ConstDefine : ParserElement {
+            ParserElementType type = CONST_DEFINE;
+            DataType data_type;
+            std::string value;
+        };
+        struct Call : ParserElement {
+            ParserElementType type = CALL;
+            std::string name;
+        };
+        struct Name : ParserElement {
+            ParserElementType type = NAME;
+            std::string name;
+        };
+        struct Print : ParserElement {
+            ParserElementType type = PRINT;
+            ParserElement value;
+        };
+    }
+    elements::Block parse_block(const std::vector<lexer::Token> &token_stream, int start_at);
+    elements::Statement parse_statement(const std::vector<lexer::Token> &token_stream);
+    elements::Expression parse_expression(const std::vector<lexer::Token> &token_stream, int start_at);
+}
+
+#endif //SHITSHOW_PARSER_H
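
A structural caveat on the element types above: Expression::children is a std::vector<ParserElement> holding base objects by value, so pushing an Assignment or Declaration into it slices off the derived fields, and each derived struct's type member shadows ParserElement::type rather than setting it. A sketch of one conventional alternative, owning pointers plus a constructor-set tag (names are illustrative, not this commit's layout):

    #include <memory>
    #include <string>
    #include <vector>

    namespace sketch {
        enum NodeType { STATEMENT, DECLARATION };
        struct Node {
            NodeType type; // set once by the derived constructor, so no shadowing
            explicit Node(NodeType t) : type(t) {}
            virtual ~Node() = default;
        };
        struct Declaration : Node {
            std::string name;
            explicit Declaration(std::string n) : Node(DECLARATION), name(std::move(n)) {}
        };
        struct Statement : Node {
            std::vector<std::unique_ptr<Node>> children; // derived nodes survive intact
            Statement() : Node(STATEMENT) {}
        };
    }

With this shape, parse_statement would push std::make_unique<sketch::Declaration>("i") instead of copying by value, and a consumer could dispatch on node->type and downcast to recover the derived fields.
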