commit da79a2cc7217c2e804dd286e603080c35b1b7076
Author: bain3 <31798786+bain3@users.noreply.github.com>
Date: Fri, 30 Apr 2021 10:02:36 +0200
Initial Commit
Diffstat:
7 files changed, 386 insertions(+), 0 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -0,0 +1,8 @@
cmake_minimum_required(VERSION 3.19)
project(shitshow)

# NOTE(review): main.cpp uses designated initializers, which are C++20;
# gcc/clang accept them under C++17 only as an extension. Consider 20.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# CONFIGURE_DEPENDS re-runs the glob at build time so newly added sources
# are picked up without manually re-running cmake.
file(GLOB lexer_src CONFIGURE_DEPENDS lexer/*.cpp lexer/*.h)
file(GLOB parser_src CONFIGURE_DEPENDS parser/*.cpp parser/*.h)
add_executable(shitshow ${lexer_src} ${parser_src} main.cpp)
+\ No newline at end of file
diff --git a/lexer/handlers.cpp b/lexer/handlers.cpp
@@ -0,0 +1,57 @@
+#include "lexer.h"
+#include <regex>
+
+int lexer::handlers::single_char(const lexer::GrammarRule &rule, lexer::Token &token, const std::string &input) {
+ if (input[0] == rule.definition[0]) {
+ token.type = rule.type;
+ token.value = input[0];
+ return 1;
+ }
+ return 0;
+}
+
+int lexer::handlers::multi_char(const GrammarRule &rule, Token &token, const std::string &input) {
+ if (input.length() >= rule.definition.length()) {
+ std::string sub = input.substr(rule.definition.length());
+ if (sub == rule.definition) {
+ token.type = rule.type;
+ token.value = sub;
+ return rule.definition.length(); // NOLINT(cppcoreguidelines-narrowing-conversions)
+ }
+ }
+ return 0;
+}
+
+int lexer::handlers::regex(const GrammarRule &rule, Token &token, const std::string &input) {
+ std::regex rgx(rule.definition);
+ int consumed = 0;
+ std::string val;
+ val += input[0];
+ while (std::regex_match(val, rgx)) {
+ consumed++;
+ val += input[consumed];
+ }
+ if (consumed > 0) {
+ token.type = rule.type;
+ val.pop_back();
+ token.value = val;
+ }
+ return consumed;
+}
+
+int lexer::handlers::string_handler(const GrammarRule &rule, Token &token, const std::string &input) {
+ int length = input.length(); // NOLINT(cppcoreguidelines-narrowing-conversions)
+ int consumed = 0;
+ std::string val;
+ if (input[0] == '"') {
+ for (consumed = 1; consumed < length && input[consumed] != '"'; consumed++) {
+ val += input[consumed];
+ }
+ }
+ if (consumed) {
+ consumed++;
+ token.type = rule.type;
+ token.value = val;
+ }
+ return consumed;
+}
+\ No newline at end of file
diff --git a/lexer/lexer.cpp b/lexer/lexer.cpp
@@ -0,0 +1,56 @@
+
+#include "lexer.h"
+
+#include <utility>
+#include <string>
+#include <iostream>
+
+lexer::Lexer::Lexer(std::vector<GrammarRule> grammar_rules) {
+ this->rules = std::move(grammar_rules);
+}
+
+std::vector<lexer::Token> lexer::Lexer::tokenize_line(const std::string& line) { // NOLINT(readability-convert-member-functions-to-static)
+ return std::vector<lexer::Token>();
+}
+
+std::vector<lexer::Token> lexer::Lexer::tokenize(const std::string &part) {
+ std::string cpy = part;
+ std::vector<Token> output;
+ if (cpy.empty()) return output;
+ int total_consumed = 0;
+ while (!cpy.empty()) {
+ int consumed = 0;
+ if (cpy[0] == ' ') {total_consumed++; cpy.erase(0, 1); continue;} // skip whitespace
+ Token token{};
+ for (const GrammarRule& rule : rules) {
+ switch (rule.handler) {
+ case SINGLE_CHAR:
+ consumed = handlers::single_char(rule, token, cpy);
+ break;
+ case MULTI_CHAR:
+ consumed = handlers::multi_char(rule, token, cpy);
+ break;
+ case REGEX:
+ consumed = handlers::regex(rule, token, cpy);
+ break;
+ case STRING:
+ consumed = handlers::string_handler(rule, token, cpy);
+ break;
+ }
+ if (consumed) break;
+ }
+ if (consumed) {
+ output.push_back(token);
+ } else {
+ std::cerr << "Error on pos " << total_consumed << ". Unrecognized token." << std::endl;
+ std::cerr << part << std::endl;
+ for (int i = 0; i < total_consumed; i++) std::cerr << " ";
+ std::cerr << "^" << std::endl;
+ std::exit(1);
+ }
+ cpy.erase(0, consumed);
+ total_consumed += consumed;
+ }
+ return output;
+}
+
diff --git a/lexer/lexer.h b/lexer/lexer.h
@@ -0,0 +1,54 @@
#ifndef SHITSHOW_LEXER_H
#define SHITSHOW_LEXER_H


#include <string>
#include <vector>

namespace lexer {
    // Kinds of tokens the lexer can emit.
    enum TokenType {
        NAME,
        SEMICOLON,
        TYPE_INT,
        ASSIGNMENT,
        PRINT,
        NUMBER_LITERAL,
        STRING_LITERAL,
        LEFT_PARENT,
        RIGHT_PARENT,
        LEFT_BRACKET,
        RIGHT_BRACKET
    };
    // Matching strategy for a GrammarRule; dispatched in Lexer::tokenize and
    // implemented in the lexer::handlers namespace (handlers.cpp).
    enum HandlerType {
        SINGLE_CHAR,
        MULTI_CHAR,
        REGEX,
        STRING
    };
    // One lexing rule. How `definition` is interpreted depends on `handler`:
    // SINGLE_CHAR uses definition[0], MULTI_CHAR the whole literal string,
    // REGEX an ECMAScript regex; STRING rules match a "..." literal and
    // leave `definition` unused.
    struct GrammarRule {
        TokenType type;
        std::string definition;
        HandlerType handler;
    };
    // A lexed token: its kind plus the matched text.
    struct Token {
        TokenType type;
        std::string value;
    };
    class Lexer {
        std::vector<GrammarRule> rules;  // tried in order; first match wins
    public:
        explicit Lexer(std::vector<GrammarRule> grammar_rules);
        // Stub in lexer.cpp: currently always returns an empty vector.
        std::vector<Token> tokenize_line(const std::string& line);
        // Tokenizes `part`, skipping spaces; on unrecognized input prints an
        // error with a caret and exits the process.
        std::vector<Token> tokenize(const std::string &part);
    };

    // Rule handlers: each tries to match one token at the start of `input`
    // and returns the number of characters consumed (0 on no match), filling
    // `token` on success.
    namespace handlers {
        int single_char (const GrammarRule &rule, Token &token, const std::string &input);
        int multi_char (const GrammarRule &rule, Token &token, const std::string &input);
        int regex (const GrammarRule &rule, Token &token, const std::string &input);
        int string_handler(const GrammarRule &rule, Token &token, const std::string &input);
    }
}


#endif //SHITSHOW_LEXER_H
diff --git a/main.cpp b/main.cpp
@@ -0,0 +1,20 @@
+#include <iostream>
+#include "lexer/lexer.h"
+
+int main() {
+ lexer::Lexer lxr({
+ {.type=lexer::TokenType::SEMICOLON, .definition=";", .handler=lexer::HandlerType::SINGLE_CHAR},
+ {.type=lexer::TokenType::ASSIGNMENT, .definition="=", .handler=lexer::HandlerType::SINGLE_CHAR},
+ {.type=lexer::TokenType::LEFT_PARENT, .definition="(", .handler=lexer::HandlerType::SINGLE_CHAR},
+ {.type=lexer::TokenType::RIGHT_PARENT, .definition=")", .handler=lexer::HandlerType::SINGLE_CHAR},
+ {.type=lexer::TokenType::LEFT_BRACKET, .definition="{", .handler=lexer::HandlerType::SINGLE_CHAR},
+ {.type=lexer::TokenType::RIGHT_BRACKET, .definition="}", .handler=lexer::HandlerType::SINGLE_CHAR},
+ {.type=lexer::TokenType::NAME, .definition=R"([A-Za-z_](?:[\w]+)?)", .handler=lexer::HandlerType::REGEX},
+ {.type=lexer::TokenType::NUMBER_LITERAL, .definition=R"([0-9](?:.[0-9])?)", .handler=lexer::HandlerType::REGEX},
+ {.type=lexer::TokenType::STRING_LITERAL, .handler=lexer::HandlerType::STRING}
+ });
+ std::vector<lexer::Token> out = lxr.tokenize("int i = 0; print i;", 0);
+ for (const lexer::Token& token : out) {
+ std::cout << token.type << ": " << token.value << std::endl;
+ }
+}
diff --git a/parser/parser.cpp b/parser/parser.cpp
@@ -0,0 +1,119 @@
+
+#include "parser.h"
+#include <iostream>
+
// Reports a parse error and terminates the process.
void error(const std::string& msg) {
    std::cerr << "Error while parsing:" << std::endl
              << msg << std::endl;
    std::exit(1);
}
+
// Reports a parse error together with the offending line, then terminates.
void error(const std::string &msg, const std::string &linectx) {
    std::cerr << "Error while parsing:" << std::endl
              << msg << std::endl;
    std::cerr << " " << linectx << std::endl;
    std::exit(1);
}
+
// Reports a parse error, the offending line, and a caret under `column`,
// then terminates.
// NOTE(review): the padding uses column+4, which presumes a four-character
// prefix before `linectx` — confirm the prefix width matches.
void error(const std::string &msg, const std::string &linectx, const int &column) {
    std::cerr << "Error while parsing:" << std::endl << msg << std::endl;
    std::cerr << " " << linectx << std::endl;
    for (int i = 0; i < column+4; i++) std::cerr << " ";
    // BUG FIX: the original printed the padding but never the caret itself.
    std::cerr << "^" << std::endl;
    std::exit(1);
}
+
+std::string reconstruct_code(const std::vector<lexer::Token>& tokens) {
+ std::string output;
+ for (const auto &token : tokens) output += token.value;
+ return output;
+}
+
+parser::elements::Block parser::parse_block(const std::vector<lexer::Token> &token_stream, int start_at) {
+ int consumed = start_at;
+ int brackets = 0;
+ elements::Block block;
+ std::vector<lexer::Token> statement_tokens;
+ while (consumed < token_stream.size()) {
+ lexer::Token token = token_stream[consumed];
+ switch (token.type) {
+ case lexer::SEMICOLON:
+ block.children.push_back(parse_statement(statement_tokens));
+ statement_tokens.clear();
+ break;
+ case lexer::LEFT_BRACKET:
+ brackets++;
+ statement_tokens.push_back(token);
+ break;
+ case lexer::RIGHT_BRACKET:
+ if (!brackets) {
+ block.children.push_back(parse_statement(statement_tokens));
+ statement_tokens.clear();
+ }
+ break;
+ default:
+ statement_tokens.push_back(token);
+ }
+ }
+ return block;
+}
+
+parser::elements::Statement parser::parse_statement(const std::vector<lexer::Token> &token_stream) {
+ int consumed = 0;
+ elements::Statement statement;
+ if (token_stream.empty()) {
+ return statement;
+ }
+ while (true) {
+ lexer::Token token = token_stream[consumed];
+ switch (token.type) {
+ case lexer::NAME: {
+ if (token_stream.size() < consumed+2) {
+ return statement;
+ }
+ lexer::Token token2 = token_stream[consumed+1];
+ switch (token2.type) {
+ case lexer::ASSIGNMENT: {
+ if (token_stream.size() < consumed+3) {
+ error("Nothing to assign.", reconstruct_code(token_stream));
+ }
+ elements::Expression exp = parse_expression(token_stream, consumed+2);
+ elements::Assignment assignment {
+ .name = token.value,
+ .value = exp,
+ };
+ statement.children.push_back(assignment);
+ break;
+ }
+ default:
+ error("Token: "+token2.value+" unexpected at this point.", reconstruct_code(token_stream));
+ }
+ break;
+ }
+ case lexer::TYPE_INT: {
+ if (token_stream.size() < consumed+3) {
+ error("What am I declaring? Missing name to declare.", reconstruct_code(token_stream));
+ } else if (token_stream[consumed+1].type != lexer::TokenType::NAME) {
+ error("Can only declare names.", reconstruct_code(token_stream));
+ }
+ elements::Declaration declaration {
+ .name = token_stream[consumed+1].value,
+ .data_type = INT
+ };
+ statement.children.push_back(declaration);
+ if (token_stream.size() > consumed+2 && token_stream[consumed+2].type == lexer::ASSIGNMENT)
+ continue; // continue to run the main loop to get the assignment
+ }
+ case lexer::PRINT:
+ case lexer::LEFT_BRACKET:
+ break;
+ default:
+ error("Token: " + token.value + " unexpected at this point", reconstruct_code(token_stream));
+ }
+ break; // break out of the main loop
+ }
+ return statement;
+}
+
+parser::elements::Expression parser::parse_expression(const std::vector<lexer::Token> &token_stream, int start_at) {
+ return parser::elements::Expression();
+}
+
diff --git a/parser/parser.h b/parser/parser.h
@@ -0,0 +1,70 @@
+
#ifndef SHITSHOW_PARSER_H
#define SHITSHOW_PARSER_H
#include <string>
#include <vector>
#include "../lexer/lexer.h"

namespace parser {
    // Kind tags for parse-tree nodes.
    enum ParserElementType {
        BLOCK,
        EXPRESSION,
        STATEMENT,
        DECLARATION,
        ASSIGNMENT,
        PRINT,
        CONST_DEFINE,
        NAME,
        CALL
    };
    // Data types the toy language knows about.
    enum DataType {
        INT,
        STRING
    };
    namespace elements {
        // Base parse-tree node.
        // NOTE(review): every derived struct below re-declares `type`, which
        // SHADOWS this member rather than overriding it — the base `type` is
        // left uninitialized on derived nodes, so reading it through a
        // ParserElement is unreliable.
        struct ParserElement { ParserElementType type; };
        // NOTE(review): `children` holds ParserElement BY VALUE, so pushing
        // an Assignment/Declaration into it slices off all derived data;
        // consider pointers or std::variant — confirm before relying on the
        // children's contents.
        struct Expression : ParserElement {
            ParserElementType type = EXPRESSION;
            std::vector<ParserElement> children;
        };
        // A single statement; its children are the parsed sub-elements.
        struct Statement : Expression {
            ParserElementType type = STATEMENT;
        };
        // A sequence of statements delimited by semicolons (see parse_block).
        struct Block : ParserElement {
            ParserElementType type = BLOCK;
            std::vector<Statement> children;
        };
        // "int <name>" — introduces `name` with the given data type.
        struct Declaration : ParserElement {
            ParserElementType type = DECLARATION;
            std::string name;
            DataType data_type;
        };
        // "<name> = <value>".
        struct Assignment : ParserElement {
            ParserElementType type = ASSIGNMENT;
            std::string name;
            Expression value;
        };
        // A literal constant with its data type and raw text value.
        struct ConstDefine : ParserElement {
            ParserElementType type = CONST_DEFINE;
            DataType data_type;
            std::string value;
        };
        // A call of the named routine.
        struct Call : ParserElement {
            ParserElementType type = CALL;
            std::string name;
        };
        // A bare name reference.
        struct Name : ParserElement {
            ParserElementType type = NAME;
            std::string name;
        };
        // "print <value>".
        // NOTE(review): `value` is a ParserElement by value — same slicing
        // concern as Expression::children.
        struct Print : ParserElement {
            ParserElementType type = PRINT;
            ParserElement value;
        };
    }
    // Parses token_stream[start_at..] into a Block of statements.
    elements::Block parse_block(const std::vector<lexer::Token> &token_stream, int start_at);
    // Parses one semicolon-delimited statement.
    elements::Statement parse_statement(const std::vector<lexer::Token> &token_stream);
    // Parses an expression starting at start_at (stub in parser.cpp).
    elements::Expression parse_expression(const std::vector<lexer::Token> &token_stream, int start_at);
}

#endif //SHITSHOW_PARSER_H