Skip to content

Commit

Permalink
Merge pull request #74 from certik/parser
Browse files Browse the repository at this point in the history
Initial parser implementation
  • Loading branch information
Shaikh-Ubaid authored Jan 26, 2024
2 parents 0c22405 + 443cdf2 commit 50aa7ce
Show file tree
Hide file tree
Showing 15 changed files with 3,784 additions and 9 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/CI.yml
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ jobs:
if: contains(matrix.os, 'ubuntu') || contains(matrix.os, 'macos')
run: |
export CPATH=$CONDA_PREFIX/include:$CPATH
lc --show-ast tests/test.cpp
lc --show-clang-ast tests/test.cpp
lc examples/expr2.c --show-asr
./run_tests.py
Expand Down
4 changes: 4 additions & 0 deletions build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,13 @@

set -ex

python src/libasr/asdl_cpp.py src/lc/LC.asdl src/lc/ast.h
python src/libasr/asdl_cpp.py src/libasr/ASR.asdl src/libasr/asr.h
python src/libasr/wasm_instructions_visitor.py

(cd src/lc/parser && re2c -W -b tokenizer.re -o tokenizer.cpp)
(cd src/lc/parser && bison -Wall -d -r all parser.yy)

cmake \
-DCMAKE_BUILD_TYPE=Debug \
-DWITH_LLVM=yes \
Expand Down
2 changes: 2 additions & 0 deletions environment_unix.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,5 @@ dependencies:
- python=3.12.0
- toml=0.10.2
- xtensor=0.24.7
- bison=3.4
- re2c=3.1
16 changes: 8 additions & 8 deletions src/bin/lc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -95,18 +95,18 @@ class ClangCheckActionFactory {
public:

std::string infile, ast_dump_file, ast_dump_filter;
bool ast_list, ast_print, show_ast;
bool ast_list, ast_print, show_clang_ast;

ClangCheckActionFactory(std::string infile_, std::string ast_dump_file,
std::string ast_dump_filter, bool ast_list,
bool ast_print, bool show_ast): infile(infile_),
bool ast_print, bool show_clang_ast): infile(infile_),
ast_dump_file(ast_dump_file), ast_dump_filter(ast_dump_filter),
ast_list(ast_list), ast_print(ast_print), show_ast(show_ast) {}
ast_list(ast_list), ast_print(ast_print), show_clang_ast(show_clang_ast) {}

std::unique_ptr<clang::ASTConsumer> newASTConsumer() {
if (ast_list) {
return clang::CreateASTDeclNodeLister();
} else if ( show_ast ) {
} else if ( show_clang_ast ) {
llvm::raw_fd_ostream* llvm_fd_ostream = nullptr;
if ( ast_dump_file.size() > 0 ) {
std::error_code errorCode;
Expand Down Expand Up @@ -689,7 +689,7 @@ int mainApp(int argc, const char **argv) {
bool ast_list = false;
bool ast_print = false;
std::string ast_dump_filter = "";
bool show_ast = false;
bool show_clang_ast = false;
bool show_asr = false;
bool arg_no_indent = false;
bool arg_no_color = false;
Expand All @@ -713,7 +713,7 @@ int mainApp(int argc, const char **argv) {
app.add_flag("--ast-print", ast_print, "Build ASTs and then pretty-print them");
app.add_flag("--ast-dump-filter", ast_dump_filter, "Use with -ast-dump or -ast-print to dump/print"
" only AST declaration nodes having a certain substring in a qualified name.");
app.add_flag("--show-ast", show_ast, "Show AST for the given file and exit");
app.add_flag("--show-clang-ast", show_clang_ast, "Show Clang AST for the given file and exit");
app.add_flag("--show-asr", show_asr, "Show ASR for the given file and exit");
app.add_flag("--no-indent", arg_no_indent, "Turn off Indented print ASR/AST");
app.add_flag("--no-color", arg_no_color, "Turn off colored AST/ASR");
Expand Down Expand Up @@ -762,9 +762,9 @@ int mainApp(int argc, const char **argv) {
std::string infile = sourcePaths[0];

// Handle Clang related options in the following
if (show_ast || ast_list || ast_print) {
if (show_clang_ast || ast_list || ast_print) {
ClangCheckActionFactory CheckFactory(infile, ast_dump_file,
ast_dump_filter, ast_list, ast_print, show_ast);
ast_dump_filter, ast_list, ast_print, show_clang_ast);
int status = Tool.run(newFrontendActionFactory(&CheckFactory).get());
return status;
}
Expand Down
4 changes: 4 additions & 0 deletions src/lc/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
set(SRC
parser/tokenizer.cpp
parser/parser.cpp
parser/parser.tab.cc

utils.cpp
)

Expand Down
154 changes: 154 additions & 0 deletions src/lc/LC.asdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
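-- ASDL's 4 builtin types are:
-- identifier, int, string, constant
-- NOTE(review): this grammar additionally uses `bool` and `float` (see the
-- ConstantBool / ConstantFloat / ConstantComplex nodes below); presumably the
-- asdl_cpp.py generator provides them -- confirm.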

module LC
{
mod = Module(stmt* body, type_ignore* type_ignores)
| Interactive(stmt* body)
| Expression(expr body)
| FunctionType(expr* argtypes, expr returns)

stmt = FunctionDef(identifier name, arguments args,
stmt* body, expr* decorator_list, expr? returns,
string? type_comment)
| AsyncFunctionDef(identifier name, arguments args,
stmt* body, expr* decorator_list, expr? returns,
string? type_comment)

| ClassDef(identifier name,
expr* bases,
keyword* keywords,
stmt* body,
expr* decorator_list)
| Return(expr? value)

| Delete(expr* targets)
| Assign(expr* targets, expr value, string? type_comment)
| AugAssign(expr target, operator op, expr value)
-- 'simple' indicates that we annotate simple name without parens
| AnnAssign(expr target, expr annotation, expr? value, int simple)

-- use 'orelse' because else is a keyword in target languages
| For(expr target, expr iter, stmt* body, stmt* orelse, string? type_comment)
| AsyncFor(expr target, expr iter, stmt* body, stmt* orelse, string? type_comment)
| While(expr test, stmt* body, stmt* orelse)
| If(expr test, stmt* body, stmt* orelse)
| With(withitem* items, stmt* body, string? type_comment)
| AsyncWith(withitem* items, stmt* body, string? type_comment)

| Match(expr subject, match_case* cases)

| Raise(expr? exc, expr? cause)
| Try(stmt* body, excepthandler* handlers, stmt* orelse, stmt* finalbody)
| Assert(expr test, expr? msg)

| Import(alias* names)
| ImportFrom(identifier? module, alias* names, int level)

| Global(identifier* names)
| Nonlocal(identifier* names)
| Expr(expr value)
| Pass | Break | Continue

-- col_offset is the byte offset in the utf8 string the parser uses
attributes (int lineno, int col_offset, int? end_lineno, int? end_col_offset)

-- BoolOp() can use left & right?
expr = BoolOp(boolop op, expr* values)
| NamedExpr(expr target, expr value)
| BinOp(expr left, operator op, expr right)
| UnaryOp(unaryop op, expr operand)
| Lambda(arguments args, expr body)
| IfExp(expr test, expr body, expr orelse)
| Dict(expr* keys, expr* values)
| Set(expr* elts)
| ListComp(expr elt, comprehension* generators)
| SetComp(expr elt, comprehension* generators)
| DictComp(expr key, expr value, comprehension* generators)
| GeneratorExp(expr elt, comprehension* generators)
-- the grammar constrains where yield expressions can occur
| Await(expr value)
| Yield(expr? value)
| YieldFrom(expr value)
-- need sequences for compare to distinguish between
-- x < 4 < 3 and (x < 4) < 3
| Compare(expr left, cmpop ops, expr* comparators)
| Call(expr func, expr* args, keyword* keywords)
| FormattedValue(expr value, int conversion, expr? format_spec)
| JoinedStr(expr* values)
-- | Constant(constant value, string? kind)
-- Our specific nodes that are used instead of Constant:
| ConstantStr(string value, string? kind)
| ConstantInt(int value, string? kind)
| ConstantBool(bool value, string? kind)
| ConstantFloat(float value, string? kind)
| ConstantComplex(float re, float im, string? kind)
| ConstantEllipsis(string? kind)
| ConstantNone(string? kind)
| ConstantBytes(string value, string? kind)

-- the following expression can appear in assignment context
| Attribute(expr value, identifier attr, expr_context ctx)
| Subscript(expr value, expr slice, expr_context ctx)
| Starred(expr value, expr_context ctx)
| Name(identifier id, expr_context ctx)
| List(expr* elts, expr_context ctx)
| Tuple(expr* elts, expr_context ctx)

-- can appear only in Subscript
| Slice(expr? lower, expr? upper, expr? step)

-- col_offset is the byte offset in the utf8 string the parser uses
attributes (int lineno, int col_offset, int? end_lineno, int? end_col_offset)

expr_context = Load | Store | Del

boolop = And | Or

operator = Add | Sub | Mult | MatMult | Div | Mod | Pow | LShift
| RShift | BitOr | BitXor | BitAnd | FloorDiv

unaryop = Invert | Not | UAdd | USub

cmpop = Eq | NotEq | Lt | LtE | Gt | GtE | Is | IsNot | In | NotIn

comprehension = (expr target, expr iter, expr* ifs, int is_async)

excepthandler = ExceptHandler(expr? type, identifier? name, stmt* body)
attributes (int lineno, int col_offset, int? end_lineno, int? end_col_offset)

arguments = (arg* posonlyargs, arg* args, arg* vararg, arg* kwonlyargs,
expr* kw_defaults, arg* kwarg, expr* defaults)

arg = (identifier arg, expr? annotation, string? type_comment)
attributes (int lineno, int col_offset, int? end_lineno, int? end_col_offset)

-- keyword arguments supplied to call (NULL identifier for **kwargs)
keyword = (identifier? arg, expr value)
attributes (int lineno, int col_offset, int? end_lineno, int? end_col_offset)

-- import name with optional 'as' alias.
alias = (identifier name, identifier? asname)
attributes (int lineno, int col_offset, int? end_lineno, int? end_col_offset)

withitem = (expr context_expr, expr? optional_vars)

match_case = (pattern pattern, expr? guard, stmt* body)

pattern = MatchValue(expr value)
-- | MatchSingleton(constant value)
| MatchSingleton(expr value)
| MatchSequence(pattern* patterns)
| MatchMapping(expr* keys, pattern* patterns, identifier? rest)
| MatchClass(expr cls, pattern* patterns, identifier* kwd_attrs, pattern* kwd_patterns)

| MatchStar(identifier? name)
-- The optional "rest" MatchMapping parameter handles capturing extra mapping keys

| MatchAs(pattern? pattern, identifier? name)
| MatchOr(pattern* patterns)

attributes (int lineno, int col_offset, int end_lineno, int end_col_offset)

type_ignore = TypeIgnore(int lineno, string tag)
}
136 changes: 136 additions & 0 deletions src/lc/parser/parser.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
#include <iostream>
#include <string>
#include <sstream>
#include <fstream>
#include <chrono>

#include <lc/parser/parser.h>
#include <lc/parser/parser.tab.hh>
#include <libasr/diagnostics.h>
#include <libasr/string_utils.h>
#include <libasr/utils.h>
#include <lc/parser/parser_exception.h>

namespace LCompilers::LC {

// Parse the source text `s` and wrap the resulting statements in a Module node.
//
// A TokenizerError or ParserError thrown by Parser::parse is not propagated:
// its diagnostic (`e.d`) is appended to `diagnostics` and an Error result is
// returned instead.
Result<LC::AST::Module_t*> parse(Allocator &al, const std::string &s,
        uint32_t prev_loc, diag::Diagnostics &diagnostics)
{
    Parser p(al, diagnostics);
    try {
        p.parse(s, prev_loc);
    } catch (const parser_local::TokenizerError &e) {
        diagnostics.diagnostics.push_back(e.d);
        return Error();
    } catch (const parser_local::ParserError &e) {
        diagnostics.diagnostics.push_back(e.d);
        return Error();
    }

    // The module location runs from the start of the first statement to the
    // end of the last one; with no statements both ends stay at 0.
    const auto n_stmts = p.result.size();
    Location l;
    l.first = 0;
    l.last = 0;
    if (n_stmts != 0) {
        l.first = p.result[0]->base.loc.first;
        l.last = p.result[n_stmts-1]->base.loc.last;
    }

    return (LC::AST::Module_t*)LC::AST::make_Module_t(al, l,
        p.result.p, n_stmts, p.type_ignore.p, p.type_ignore.size());
}

// Copy `input` into the parser-owned buffer `inp`, hand it to the tokenizer
// and run the generated parser (yyparse) over it.
//
// Throws parser_local::ParserError when yyparse returns a non-zero status.
void Parser::parse(const std::string &input, uint32_t prev_loc)
{
    inp = input;
    // The buffer given to the tokenizer always ends with a newline; an empty
    // input therefore becomes a single "\n".
    if (inp.size() == 0 || inp[inp.size()-1] != '\n') {
        inp.append("\n");
    }
    m_tokenizer.set_string(inp, prev_loc);

    if (yyparse(*this) != 0) {
        throw parser_local::ParserError("Parsing unsuccessful (internal compiler error)");
    }
}

// Turn an error string reported by the generated parser into a ParserError
// carrying a user-facing message and the location `loc`.
//
// Only "syntax error" produces a token-specific message; "syntax is ambiguous"
// and any other text are reported as internal compiler errors. This function
// always throws parser_local::ParserError.
void Parser::handle_yyerror(const Location &loc, const std::string &msg)
{
    std::string message;
    if (msg == "syntax error") {
        // Move the tokenizer cursor back to `tok` and lex again to find out
        // which token triggered the error (presumably `tok` marks the start
        // of the most recently lexed token -- confirm in the tokenizer).
        YYSTYPE yylval_;
        YYLTYPE yyloc_;
        this->m_tokenizer.cur = this->m_tokenizer.tok;
        const int token = this->m_tokenizer.lex(this->m_a, yylval_, yyloc_, diag);
        switch (token) {
            case yytokentype::END_OF_FILE:
                message = "End of file is unexpected here";
                break;
            case yytokentype::TK_NEWLINE:
                message = "Newline is unexpected here";
                break;
            default: {
                const std::string token_str = this->m_tokenizer.token();
                const std::string token_type = token2text(token);
                message = "Token '" + token_str + "'";
                // Mention the token type only when it differs from the
                // token's own text.
                if (token_str != token_type) {
                    message += " (of type '" + token_type + "')";
                }
                message += " is unexpected here";
                break;
            }
        }
    } else if (msg == "syntax is ambiguous") {
        message = "Internal Compiler Error: syntax is ambiguous in the parser";
    } else {
        message = "Internal Compiler Error: parser returned unknown error";
    }
    throw parser_local::ParserError(message, loc);
}

// Returns true when `name` can be opened for reading as an input file stream,
// false otherwise.
bool file_exists(const std::string &name) {
    return std::ifstream(name).is_open();
}

// Build a file name of the form <prefix><6 hex digits><counter> that does not
// currently exist according to file_exists().
//
// The hex digits come from rand(), which is re-seeded on every call from the
// steady-clock microsecond count; the counter starts at 1 and is incremented
// until the candidate name is free.
std::string unique_filename(const std::string &prefix) {
    const auto since_epoch = std::chrono::steady_clock::now().time_since_epoch();
    uint64_t micros = std::chrono::duration_cast<std::chrono::microseconds>(since_epoch).count();
    micros %= 1000000000;
    srand((unsigned) micros);

    const std::string hex_digits = "0123456789ABCDEF";
    std::string random_hash;
    int remaining = 6;
    while (remaining-- > 0) {
        random_hash.push_back(hex_digits[rand() % 16]);
    }

    const std::string stem = prefix + random_hash;
    for (int counter = 1; ; counter++) {
        std::string candidate = stem + std::to_string(counter);
        if (!file_exists(candidate)) {
            return candidate;
        }
    }
}

// Read `infile` and parse its contents with parse(), returning the module as
// a generic ast_t pointer.
//
// The runtime library directory argument is unused, and `new_parser` is
// overwritten with true before use. On a parse failure an Error result is
// returned; the failure is expected to be recorded in `diagnostics`.
Result<LC::AST::ast_t*> parse_python_file(Allocator &al,
        const std::string &/*runtime_library_dir*/,
        const std::string &infile,
        diag::Diagnostics &diagnostics,
        uint32_t prev_loc,
        [[maybe_unused]] bool new_parser) {
    // We will be using the new parser from now on
    new_parser = true;
    LCOMPILERS_ASSERT(new_parser)

    std::string source = read_file(infile);
    Result<LC::AST::Module_t*> parsed = parse(al, source, prev_loc, diagnostics);
    if (!parsed.ok) {
        LCOMPILERS_ASSERT(diagnostics.has_error())
        return Error();
    }
    return (LC::AST::ast_t*)parsed.result;
}


} // namespace LCompilers::LC
Loading

0 comments on commit 50aa7ce

Please sign in to comment.