Return to Home page

From Erlang to LLVM

Abstract

The project goal was to develop a compiler for a subset of Erlang (miniErlang) that generates LLVM IR in human-readable format, built using JFlex and CUP.

Results achieved

Implementation of a mini virtual machine (Erlang is based on the BEAM VM):
- BIFs (built-in functions):
  - is_atom/1
  - is_boolean/1
  - is_float/1
  - is_integer/1
  - is_function/1
  - is_list/1
  - is_number/1
  - abs/1
  - float/1
  - hd/1
  - tl/1
  - length/1
  - round/1
  - trunc/1
- Other functions:
  - io:format/1, io:format/2
  - lists:nth/1
  - lists:append/2
Language
- Terms: Number, List, Atom, Boolean
- Expressions:
- Arithmetic operations
  - Sum (e.g. A + B),
  - Subtraction (e.g. A - B)
  - Sign inversion (e.g. - B)
  - Multiplication (e.g. A * B)
  - Division (e.g. A / B)
  - Integer division (e.g. A div B)
  - Modulo (e.g. A rem B)
- Comparison operations
  - Exactly equal (e.g. A =:= B)
  - Equal (e.g. A == B)
  - Exactly not equal (e.g. A =/= B)
  - Not equal (e.g. A /= B)
  - Less than (e.g. A < B)
  - Less than or equal (e.g. A =< B)
  - Greater than (e.g. A > B)
  - Greater than or equal (e.g. A >= B)
- Boolean operations
  - And (e.g. A and B)
  - Or (e.g. A or B)
  - Xor (e.g. A xor B)
  - Not (e.g. not A)
- List append (e.g. A ++ B)
- Pattern Matching:
  - Single variable
  - List based (Only available using the match operator, not in functions)
Function structures (one parameter):
- Function clauses
- Function clauses with guards

miniErlang VM

The miniErlang VM is written in C++. It implements the basic data item construct used in every operation, the BIFs (built-in functions), and the functions needed to support Erlang language features, such as the fact that it is dynamically typed. The miniErlang VM is stored in the project folder in two formats: C++ for compiler development and LLVM IR for compiler use; when compiling Erlang code, the latter is combined with the output of the compiler. The code that implements it is very verbose (about 1k LOC) due to the many error-handling conditions; for this reason, only the most interesting aspects are reported here.

Literal Struct

It is the basic data item object: it handles typing and every operation that can be performed on a basic data item.

The following is a small part of the implementation, which should give an idea of how it works:

/* Runtime type tag stored in every Literal; it tells how Literal::ptr must be read. */
enum LiteralType
{
    Integer = 0,   /* payload is an int */
    Float = 1,     /* payload is a double */
    List = 2,      /* payload is a (head, tail) pair, or nullptr for the empty list */
    Atom = 3,      /* payload is a size_t atom identifier */
    Undefined = 4, /* no payload: the value is not bound yet */
    Boolean = 5    /* payload is a bool */
};
 
typedef struct Literal
{
    LiteralType type = Undefined;
    void *ptr = nullptr;
 
    Literal(int value) : type(Integer), ptr(new int(value))
    {
    }
    Literal(double value) : type(Float), ptr(new double(value)) {}
    Literal(size_t value) : type(Atom), ptr(new size_t(value)) {}
    Literal(bool value) : type(Boolean), ptr(new bool(value)) {}
    Literal(list<Literal> value) : type(List)
    {
        if (value.size() == 0)
        {
            ptr = nullptr;
        }
        else
        {
            if (value.begin()->type == Undefined)
            {
                error("undefined expression in list initialization: not allowed.");
            }
 
            ptr = new pair<Literal, Literal>(*value.begin(), list<Literal>(next(value.begin()), value.end()));
        }
    }
    Literal(Literal head, Literal tail) : type(List)
    {
        if (head.type == Undefined || tail.type == Undefined)
        {
            error("undefined expression in list initialization is not allowed.");
        }
        ptr = new pair<Literal, Literal>(head, tail);
    }
    Literal() : type(Undefined) {}
 
    /* Copy constructor */
    Literal(const Literal &a)
    {
        this->type = a.type;
        switch (this->type)
        {
        case Integer:
            this->ptr = new int(*(int *)a.ptr);
            break;
        case Float:
            this->ptr = new double(*(double *)a.ptr);
            break;
        case List:
            this->ptr = a.ptr != nullptr ? new pair<Literal, Literal>(*(pair<Literal, Literal> *)a.ptr) : nullptr;
            break;
        case Atom:
            this->ptr = new size_t(*(size_t *)a.ptr);
            break;
        case Boolean:
            this->ptr = new bool(*(bool *)a.ptr);
            break;
        case Undefined:
            error("bad copy.");
        }
    }
 
    void match(const Literal &match_var)
    {
        if (this->type == Undefined)
        {
            this->type = match_var.type;
            switch (this->type)
            {
            case Integer:
                this->ptr = new int(*(int *)match_var.ptr);
                break;
            case Float:
                this->ptr = new double(*(double *)match_var.ptr);
                break;
            case List:
                this->ptr = match_var.ptr != nullptr ? new pair<Literal, Literal>(*(pair<Literal, Literal> *)match_var.ptr) : nullptr;
 
                break;
            case Atom:
                this->ptr = new size_t(*(size_t *)match_var.ptr);
                break;
            case Boolean:
                this->ptr = new bool(*(bool *)match_var.ptr);
                break;
            case Undefined:
                error("bad matching");
            }
        }
 
        if (*this != match_var)
        {
            error("bad matching");
        }
    }
 
    bool try_match(const Literal &match_var)
    {
        try
        {
            match(match_var);
        }
        catch (invalid_argument e)
        {
            return false;
        }
        return true;
    }
    int getInt() const
    {
        if (type != Integer)
        {
            error("Type error.");
        }
 
        int result = *(int *)this->ptr;
        return result;
    }
 
    double getFloat() const
    {
        if (type != Float)
            error("Type error.");
        double result = *(double *)this->ptr;
        return result;
    }
 
    list<Literal> getList() const
    {
        if (type != List)
            error("bad matching: not a list.");
        list<Literal> result;
        if (ptr != nullptr)
        {
            pair<Literal, Literal> element = *(pair<Literal, Literal> *)ptr;
            result.push_back(element.first);
            void *iterator = element.second.ptr;
            while (iterator != nullptr && element.second.type == List)
            {
                element = *(pair<Literal, Literal> *)iterator;
                result.push_back(element.first);
                iterator = element.second.ptr;
            }
        }
        return result;
    }
 
    size_t getAtom() const
    {
        if (type != Atom)
            error("Type error.");
        size_t result = *(size_t *)this->ptr;
        return result;
    }
 
    bool getBoolean() const
    {
        if (type != Boolean)
            error("Type error.");
        bool result = *(bool *)this->ptr;
        return result;
    }
}

Erlang standard functions

float/1 implementation

float/1 is a built-in function that, given a Number as a parameter, returns its equivalent in floating-point representation.

/*
 * float/1: converts a number to its floating-point representation.
 * Non-numbers are reported through error(). An Integer is converted to a
 * Float Literal; any other value that passed the number check is returned
 * as a copy.
 */
Literal BIF_float(const Literal &l)
{
    const bool is_number = l.isNumber();
    if (!is_number)
    {
        error("float can only be applied to numbers.");
    }

    if (l.type != Integer)
    {
        return l;
    }

    const double as_double = (double)l.getInt();
    return as_double;
}

io:format/2 implementation

It is equivalent in some ways to C's printf, and supports the ~s and ~w control sequences in a limited way (https://erlang.org/doc/man/io.html#format-2).

io:format/2 example:

io:format("Number one: ~w, \"hello\" string: ~s, \"hello\" list representation: ~w ~n", [1, "hello", "hello"]).
% outputs "Number one: 1, "hello" string: hello, "hello" list representation: [104,101,108,108,111] \n"
 
Implementation:
<code C++[enable_line_numbers="true"]>
/*
 * io:format/2: prints `format` after substituting its control sequences.
 *
 *   ~n  is replaced by a newline,
 *   ~w  prints the next data element with getString('w') (lists are wrapped
 *       in square brackets),
 *   ~s  prints the next data element with getString('s').
 *
 * Both arguments must be lists, and the number of ~w/~s sequences must equal
 * the number of elements in `data`; violations are reported through error().
 * Returns the Boolean Literal true.
 */
Literal ioformat(const Literal &format, const Literal &data)
{
    if (!(format.type == List && data.type == List))
    {
        error("bad argument\n\tin function io:format: needs 2 list parameters (format and data).");
    }

    const list<Literal> arguments = data.getList();

    /* A sequence only counts when it is not preceded by another '~'. */
    const regex newline_sequence("(([^~]|^)(~n))");
    const regex data_sequence("(([^~]|^)~(w|s))");

    /* Text still to be scanned; it shrinks to the suffix after every match. */
    string remaining = format.getString('s');
    remaining = regex_replace(remaining, newline_sequence, "$2\n");

    string output;
    smatch found;
    auto argument = arguments.begin();
    size_t consumed = 0;

    while (regex_search(remaining, found, data_sequence))
    {
        if (consumed++ >= arguments.size())
        {
            error("bad argument\n\tin function io:format: data control sequences are more than elements in data list.");
        }

        /* Keep the text before the sequence plus the character captured in front of '~'. */
        output += found.prefix().str();
        output += found.str(2);

        const bool write_sequence = (found.str(3) == "w");
        if (!write_sequence)
        {
            output += argument->getString('s');
        }
        else if (argument->type == List)
        {
            output += "[" + argument->getString('w') + "]";
        }
        else
        {
            output += argument->getString('w');
        }

        ++argument;
        remaining = found.suffix().str();
    }

    if (consumed < arguments.size())
    {
        error("bad argument\n\tin function io:format: data control sequences are less than elements in data list.");
    }

    cout << output << remaining;

    return Literal(true);
}

++ operator and lists:append/2 implementation

++ and lists:append/2 concatenate two lists.

Example usage of the ++ operator: ListA ++ ListB.

Usage of lists:append/2: lists:append(ListA, ListB).

Implementation:

/*
 * lists:append/2 (also used for the ++ operator): returns a new list holding
 * the elements of `a` followed by the elements of `b`.
 * Both arguments must be proper lists; anything else is reported through error().
 */
Literal listsappend(const Literal &a, const Literal &b)
{
    const bool both_lists = (a.type == List) && (b.type == List);
    if (!both_lists)
    {
        error("bad argument\n\tin function lists:append: needs 2 list parameters (a and b).");
    }

    const bool both_proper = a.isProperList() && b.isProperList();
    if (!both_proper)
    {
        error("bad argument\n\tin function lists:append: improper lists are not supported.");
    }

    list<Literal> joined = a.getList();
    const list<Literal> appended = b.getList();
    for (const Literal &item : appended)
    {
        joined.push_back(item);
    }

    return Literal(joined);
}

Compiler

The compiler is composed of a scanner (generated by JFlex) and a parser (generated by CUP), every time the scanner recognizes a symbol it passes it to the parser. The Symbol class holds information about each recognized symbol.

Scanner

The scanner is produced using JFlex (a lexical analyzer generator written in Java). JFlex is designed to work together with the LALR parser generator CUP.

/* Macros: line terminators and horizontal whitespace. */
nl = \r|\n|\r\n
ws = [ \t]
 
%%
 
/* Symbols */
/* Punctuation and operators; each one is returned as a value-less symbol. */
"("     {	return symbol(sym.ROUND_OPEN);	}
")"     {	return symbol(sym.ROUND_CLOSE);	}
"["     {	return symbol(sym.SQUARE_OPEN);	}
"]"     {	return symbol(sym.SQUARE_CLOSE);	}
"{"     {	return symbol(sym.BRACE_OPEN);	}
"}"     {	return symbol(sym.BRACE_CLOSE);	}
"/"     {	return symbol(sym.SLASH);	}
"."     {	return symbol(sym.DOT);	}
","     {	return symbol(sym.COMMA);	}
":"     {	return symbol(sym.COLON);	}
";"     {	return symbol(sym.SEMICOLON);	}
"="     {	return symbol(sym.MATCH);	}
"|"     {	return symbol(sym.VERTICAL_BAR);	}
"!"     {	return symbol(sym.NOT);	}
"-"     {	return symbol(sym.HYPHEN);	}
"+"     {	return symbol(sym.PLUS);	}
"++"    {	return symbol(sym.PLUS_PLUS);	}
"*"     {	return symbol(sym.STAR);	}
"->"    {	return symbol(sym.RIGHT_ARROW);	}
"<-"    {	return symbol(sym.LEFT_ARROW);	}
"=="    {	return symbol(sym.EQ);	}
"=:="   {	return symbol(sym.EXACT_EQ);	}
"/="    {	return symbol(sym.NOT_EQ);	}
"=/="   {	return symbol(sym.EXACT_NOT_EQ);	}
"<"     {	return symbol(sym.LESS);	}
"=<"    {	return symbol(sym.LESS_EQ);	}
">"     {	return symbol(sym.GREATER);	}
">="    {	return symbol(sym.GREATER_EQ);	}
 
 
/* Keywords */
/* Listed before the atom rule so that these words are not scanned as atoms. */
"and"       {	return symbol(sym.K_AND);	}
"not"       {	return symbol(sym.K_NOT);	}
"or"        {	return symbol(sym.K_OR);	}
"xor"       {	return symbol(sym.K_XOR);	}
"div"       {	return symbol(sym.K_DIV);	}
"rem"       {	return symbol(sym.K_REM);	}
"when"      {	return symbol(sym.K_WHEN);	}
 
/* Boolean terms */
/* Both literals produce a BOOLEAN symbol whose value is the matched text. */
"false"	{ return new Symbol(sym.BOOLEAN, yyline, yycolumn, new String(yytext())); }
"true"	{ return new Symbol(sym.BOOLEAN, yyline, yycolumn, new String(yytext())); }
 
/* String term */
/* Double-quoted text with backslash escapes; the symbol value drops the surrounding quotes. */
[\"]([^\"\\]|\\.)*[\"]	{ return new Symbol(sym.STRING, yyline, yycolumn, new String(yytext().substring(1, yytext().length() - 1))); }
 
/* Variable names */
/* Start with an uppercase letter or underscore. */
[A-Z_][0-9a-zA-Z_@]*	{ return new Symbol(sym.VARIABLE, yyline, yycolumn, new String(yytext())); }
 
/* Atom terms */
/* Either a lowercase-initial identifier or single-quoted text (quotes dropped from the value). */
[a-z][0-9a-zA-Z_@]*		{ return new Symbol(sym.ATOM, yyline, yycolumn, new String(yytext())); }
[\']([^\'\\]|\\.)*[\']	{ return new Symbol(sym.ATOM, yyline, yycolumn, new String(yytext().substring(1, yytext().length() - 1))); }
 
/* Float terms */
/* The integer part is optional (".5" is accepted); at least one fractional digit is required. */
[0-9]*\.[0-9]+	{ return new Symbol(sym.FLOAT, yyline, yycolumn, new Float(yytext())); }
 
/* Integer terms */
/* Decimal integers without leading zeros. */
[1-9][0-9]*|0	{ return new Symbol(sym.INT, yyline, yycolumn, new Integer(yytext())); }
 
/* Comments */
/* From '%' to the end of the line; nothing is returned to the parser. */
"%" [^\r\n]* {nl}?	{	;	}
 
/* Whitespace and line terminators are skipped. */
{ws}|{nl}       {	;	}
 
/* Fallback: any other character is reported on stdout and no symbol is returned for it. */
.	{	System.out.println("SCANNER ERROR: "+yytext());	}

Parser

The parser is written in CUP; the code is generated by Java classes, one for each language item.

Grammar start

Erlang does not support global variables, nor expressions that are not part of a function, thus the program must start with a function definition.

start with program; // the grammar's start symbol
 
program ::= function_seq:funs {:
  funs.generateCode(manager, null);
  manager.checkUndefinedFunctionsCalls();
  // The generated code is printed on stdout only when sem() holds and no
  // semantic error was counted; diagnostics always go to stderr.
  if (sem() && parser.semErrors == 0) {
  	System.out.println(parser.outputBuffer);
  } else {
  	System.err.println("\nOUTPUT COULD NOT BE PRODUCED DUE TO ERRORS\n");
  }
  System.err.println(parser.errorBuffer);
 
  // Summary of the counters collected while parsing.
  System.err.println("######################");
  System.err.println("Syntax Warnings : " + parser.synWarnings);
  System.err.println("Semantic Errors  : " + parser.semErrors);
  System.err.println("Semantic Warnings: " + parser.semWarnings);
:};
 
// One or more function definitions, collected left-recursively into a
// FunctionSequence (previous sequence + newest function).
function_seq ::=
		function_seq:funSeq function:fun	{:	RESULT = new FunctionSequence(funSeq, fun);		:}
	|	function:fun 				{:	RESULT = new FunctionSequence(fun);			:}
;

The Manager class is a wrapper to access the parser; it is used by every class corresponding to a non terminal. In the above code, the call to manager.checkUndefinedFunctionsCalls() uses the Manager to check whether there were calls to functions that have not been defined, either in the file being compiled or in the miniErlang VM.

Every non terminal used in the parser has an equivalent class defined with these common methods:

generateCode(): called by its parent to generate the corresponding LLVM IR code;
destructDependencies(): deallocates every object instantiated by the child non terminals;
destruct(): calls destructDependencies() and deallocates every object instantiated by the non terminal;

and common attributes:

subgraphSize: the size of the code produced by generateCode();
label: the LLVM IR label used to store the object resulting from the operations performed in generateCode().

FunctionSequence Class:

public class FunctionSequence extends Node {
	FunctionSequence seqHead;
	Function tail;
 
	public FunctionSequence(Function tail) {
		this.tail = tail;
		seqHead = null;
	}
	public FunctionSequence(FunctionSequence head, Function tail) {
		seqHead = head;
		this.tail = tail;
	}
 
	public void generateCode(Manager manager, Node parent) {
		super.generateCode(manager, parent);
		if(seqHead != null) {
			seqHead.generateCode(manager, this);
		}
		tail.generateCode(manager, this);
	}
 
	public long destruct(Manager manager, Node caller) {
		return 0;
 
	}
	public long destructDependencies(Manager manager, Node caller) {
		return 0;
	}
 
}

Functions

Due to the complexity derived from implementation details of the miniErlang VM, function clauses can have at most 1 argument.

// A function is a first clause followed by the remaining clauses.
function ::=
		function_clause:head function_clause_seq:tail	{:	RESULT = new Function(head, tail);				:}
	|	function_clause:head error			{:	syntaxError("Invalid function definition.");	:}
;
 
// Remaining clauses are separated by ';' and the whole function is closed by
// '.', which yields a null tail.
function_clause_seq ::=
		SEMICOLON function_clause:head function_clause_seq:tail {:	RESULT = new FunctionClauseSequence(head, tail);	:}
	|	DOT							{:	RESULT = null;						:}	
 
;
 
// name(argument) [when guard] -> body
// The 'error' alternatives report which part of the clause failed to parse
// (name, argument, guard or body).
function_clause ::=
		ATOM:name argument:arg	RIGHT_ARROW expression_seq:expressionSeq		{:	RESULT = new FunctionClause(name, arg, expressionSeq);		:}
	|	ATOM:name argument:arg K_WHEN guard:g RIGHT_ARROW expression_seq:expressionSeq	{:	RESULT = new FunctionClause(name, arg, g, expressionSeq);	:}
	|	error argument RIGHT_ARROW expression_seq					{:	syntaxError("Invalid function name.");				:}
	|	error argument K_WHEN guard RIGHT_ARROW expression_seq				{:	syntaxError("Invalid function name.");				:}	
	|	ATOM error RIGHT_ARROW expression_seq						{:	syntaxError("Invalid function clause argument.");		:}
	|	ATOM error K_WHEN guard RIGHT_ARROW expression_seq				{:	syntaxError("Invalid function clause argument.");		:}	
	|	ATOM argument RIGHT_ARROW error							{:	syntaxError("Invalid function clause body.");			:}
	|	ATOM argument K_WHEN guard RIGHT_ARROW error					{:	syntaxError("Invalid function clause body.");			:}
	|	ATOM argument K_WHEN error RIGHT_ARROW expression_seq				{:	syntaxError("Invalid function clause guard.");			:}	
;
 
// At most one parenthesized expression; "()" yields a null argument.
argument ::=
		ROUND_OPEN expression:arg ROUND_CLOSE	{:	RESULT = arg;	:}
	|	ROUND_OPEN ROUND_CLOSE			{:	RESULT = null;	:}
;
 
// A guard is a comma-separated list of expressions, collected into an AltList
// and wrapped in a Guard node.
guard ::= 
		expression:head guard_tail:tail	{:	RESULT = new Guard(new AltList(head, tail));	:}
;
guard_tail ::=
		COMMA expression:head guard_tail:tail	{:	RESULT = new AltList(head, tail);	:}
	|						{:	RESULT = null;				:}	
;

The Function class handles generating the function signature and the code that throws an error if no match was found between the function parameter and the function clauses. Function.generateCode():

/**
 * Emits the LLVM IR definition of this function: the "define" line, the
 * prologue allocations, the code of every clause, and (for functions that take
 * an argument) the fallback that calls the bad-matching error when no clause
 * matched.
 *
 * Labels are taken from the manager in a fixed order, so the order of the
 * genLabel() calls below must not be changed.
 *
 * @param manager wrapper used to emit code and allocate labels
 * @param parent  node that requested the generation
 */
public void generateCode(Manager manager, Node parent) {
    super.generateCode(manager, parent);
 
    manager.checkTailMatch(head, tail);
    manager.setFunctionName(manager.getFunctionName(head.name, head.argument));
    manager.openClause();
 
    // The result is written through a pointer passed as the first (sret) parameter.
    long returnLabel = manager.genLabel();
    manager.setReturnLabel(returnLabel);
 
    if (head.argument != null) {
      // One-argument function: a second parameter carries the run-time argument.
      long parameterLabel = manager.genLabel();
      manager.setParameterLabel(parameterLabel);
      manager.dumpFormatln(
              "define void %s(%%%s* noalias sret align 8 %%%d, %%%s* %%%d) #0 personality i8*"
                  + " bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {",
              manager.getFunctionName(),
              Const.LITERAL_STRUCT,
              returnLabel,
              Const.LITERAL_STRUCT,
              parameterLabel);
    } else {
      // Zero-argument function: only the return slot is passed.
      manager.dumpFormatln(
              "define void %s(%%%s* noalias sret align 8 %%%d) #0 personality i8* bitcast (i32"
                  + " (...)* @__gxx_personality_v0 to i8*) {",
              manager.getFunctionName(), Const.LITERAL_STRUCT, returnLabel);
    }
    // NOTE(review): this label is consumed but never used here - presumably it
    // accounts for the number LLVM gives to the implicit entry block; confirm.
    manager.genLabel();
    long returnPointerLabel = manager.genLabel();
    manager.dumpFormatln("\t%%%d = alloca i8*, align 8", returnPointerLabel);
    long bitcastReturnPointerLabel = manager.genLabel();
 
    // Store the return slot address as an i8* in a local stack slot.
    manager.dumpFormatln(
            "\t%%%d = bitcast %%%s* %%%d to i8*",
            bitcastReturnPointerLabel, Const.LITERAL_STRUCT, returnLabel);
    manager.dumpFormatln(
            "\tstore i8* %%%d, i8** %%%d, align 8", bitcastReturnPointerLabel, returnPointerLabel);
 
    // Stack slots registered with the manager as the resume pointer / resume integer.
    long resumePointerLabel = manager.genLabel();
    manager.setResumePointer(resumePointerLabel);
    long resumeIntegerLabel = manager.genLabel();
    manager.setResumeInteger(resumeIntegerLabel);
    manager.dumpFormatln("\t%%%d = alloca i8*, align 8", resumePointerLabel);
    manager.dumpFormatln("\t%%%d = alloca i32, align 4", resumeIntegerLabel);
 
    // First clause, then the remaining ones (if any).
    head.generateCode(manager, this);
    if (tail != null) {
      manager.dumpCodeLabel();
      tail.generateCode(manager, this);
    }
 
    // Reached only when no clause matched the run-time argument.
    if (head.argument != null) {
      manager.dumpCodeLabel();
      manager.dumpFormatln("\tcall void %s()", Const.BAD_MATCHING_ERROR);
      manager.dumpln("\tret void");
    }
    manager.dumpln("}\n");
    manager.popFunctionSymbols();
  }

FunctionClauseSequence is very similar in nature to FunctionSequence as it just generates the list of FunctionClauses.

FunctionClause handles function clauses with: no parameter, 1 parameter and with guards. If the parameter is a constant term, the generator will match the run-time parameter with the term declared in the function clause. If on the other hand the parameter is a variable, its name will be bound to the LLVM IR label previously set by the Function class.

The subgraphSize attribute of the child non terminals is used to compute the label the next clause will be at, that will need to be evaluated if the current function clause doesn't match the run-time parameter.

FunctionClause.generateCode():

/**
 * Emits the code of one function clause: the optional argument check, the
 * optional guard check, then the clause body and its return.
 *
 * When a check fails, control branches to the label of the next clause, which
 * is computed in advance from the subgraphSize of the clause body.
 *
 * @param manager wrapper used to emit code and allocate labels
 * @param parent  node that requested the generation
 */
public void generateCode(Manager manager, Node parent) {
    this.parent = parent;
 
    if (argument != null) {
      if (argument instanceof Variable) {
         // Variable pattern: hand the parameter label to the variable; no run-time check is emitted.
         Variable variableArgument = (Variable) argument;
        variableArgument.generateCode(manager, this, manager.getParameterLabel());
      } else {
        // Check that runtime argument is equal to function-def argument.
        argument.generateCode(manager, this);
 
        manager.dumpCodeLabel();
        long matchingLabel = manager.genLabel();
        manager.dumpFormatln(
            "\t%%%d = invoke zeroext i1 %s(%%%s* %%%d, %%%s* nonnull align 8"
                + " dereferenceable(16) %%%d)",
            matchingLabel,
            Const.LITERAL_CLAUSE_MATCH,
            Const.LITERAL_STRUCT,
            manager.getParameterLabel(),
            Const.LITERAL_STRUCT,
            argument.label);
 
        // The unwind block follows the invoke; normal flow resumes after the
        // cleanup and resume blocks.
        long unwindLabel = matchingLabel + 1;
        long branchLabel = unwindLabel + CLEANUP_LABEL_SIZE + RESUME_LABEL_SIZE;
        manager.dumpFormatln("\t\tto label %%%d unwind label %%%d", branchLabel, unwindLabel);
 
        manager.cleanupError();
        argument.destruct(manager, this);
        manager.resumeError();
        // NOTE(review): rbl is never read; genLabel() is presumably called only
        // to advance the label counter past branchLabel - confirm.
        long rbl = manager.genLabel();
        manager.dumpln(branchLabel + ":");
        argument.destruct(manager, this);
        // On a match continue with the body, otherwise jump over it to the next clause.
        long clauseExpressions = branchLabel + 1;
        long nextClause = branchLabel + 2 + expressions.subgraphSize;
        manager.dumpFormatln(
            "\tbr i1 %%%d, label %%%d, label %%%d", matchingLabel, clauseExpressions, nextClause);
        manager.dumpCodeLabel();
      }
    }
    if (guard != null) {
      guard.generateCode(manager, this);
      manager.dumpCodeLabel();
 
      // Same scheme as above, driven by the result of the guard.
      long clauseExpressions = manager.getCurrentLabel();
      long nextClause = manager.getCurrentLabel() + 1 + expressions.subgraphSize;
      manager.dumpFormatln(
          "\tbr i1 %%%d, label %%%d, label %%%d", guard.label, clauseExpressions, nextClause);
      manager.dumpCodeLabel();
    }
    expressions.generateCode(manager, this);
    expressions.generateReturn(manager);
    manager.closeClause();
  }

Guards

A guard is a series of guard expressions separated by commas. Guards can be used to perform simple tests and comparisons on the variables in a pattern; they can be used in function definitions to expand the power of pattern matching.

The optional guard of the function clause is implemented by evaluating every condition, storing each into a list, and passing the list to the eval_guard() function, implemented in the miniErlang VM.

Guard.generateCode():

  /**
   * Emits the code that evaluates the guard: the guard expressions are
   * generated first, then the result is obtained by invoking the guard
   * evaluation function on them and stored under {@code label}.
   *
   * @param manager wrapper used to emit code and allocate labels
   * @param parent  node that requested the generation
   */
  public void generateCode(Manager manager, Node parent) {
    super.generateCode(manager, parent);
 
    this.checkGuardSemantic(manager);
    guard_expression.generateCode(manager, this);
    manager.dumpCodeLabel();
    label = manager.genLabel();
    manager.dumpFormatln("%%%d = invoke zeroext i1 %s(%%%s* %%%d)",
    		label, Const.EVAL_GUARD, Const.LITERAL_STRUCT, guard_expression.label);
    // Normal flow continues after the cleanup and resume blocks; the unwind
    // block is the current label.
    long afterUnwind = manager.getCurrentLabel() + CLEANUP_LABEL_SIZE + RESUME_LABEL_SIZE;
    manager.dumpFormatln("\t\tto label %%%d unwind label %%%d", afterUnwind, manager.getCurrentLabel());
    manager.cleanupError();
    destructDependencies(manager, this);
    manager.resumeError();
  }

Terms

Terms are the basic data object usable in Erlang, their implementation is trivial: it requires calling the Literal struct constructor relative to the type of object that is needed.

// Literal values: atoms, numbers, booleans, strings (built as lists) and lists.
term ::=
		ATOM:value		{:	RESULT = new Atom(value);				:}		   
	|	FLOAT:value		{:	RESULT = new minierlang.exp.terms.Number(value);	:}
	|	INT:value		{:	RESULT = new minierlang.exp.terms.Number(value);	:}
	|	BOOLEAN:value		{:	RESULT = new Bool(value);				:}
	|	STRING:value		{:	RESULT = new List(value);				:}
	|	non_empty_list:value	{:	RESULT = value;						:}
	|	empty_list:value	{:	RESULT = value;						:}
;
 
// "[]"
empty_list ::=
		SQUARE_OPEN SQUARE_CLOSE	{:	RESULT = new List(null, null);	:}
;
 
// Either the comma-separated form or the form containing a '|' tail.
non_empty_list ::= 
		normal_list:list	{:	RESULT = list;	:}
	|	alt_list:list		{:	RESULT = list;	:}
;	
// [e1, e2, ..., en]: the closing bracket is consumed by normal_list_tail and
// yields a null tail.
normal_list ::=
		SQUARE_OPEN expression:head normal_list_tail:tail	{:	RESULT = new AltList(head, tail);	:}	
;		
normal_list_tail ::=
		COMMA expression:head normal_list_tail:tail	{:	RESULT = new AltList(head, tail);	:}
	|	SQUARE_CLOSE					{:	RESULT = null;						:}
;
 
// [e1, ..., en | Tail]: the expression after '|' becomes the tail of the last cell.
alt_list ::= 
		SQUARE_OPEN expression:head alt_list_tail:tail SQUARE_CLOSE		{:	RESULT = new AltList(head, tail);	:}
	|	SQUARE_OPEN expression:head VERTICAL_BAR expression:tail SQUARE_CLOSE	{:	RESULT = new AltList(head, tail);	:}
;
alt_list_tail ::= 
		COMMA expression:head alt_list_tail:tail	{:	RESULT = new AltList(head, tail);			:}
	|	VERTICAL_BAR expression:actual_tail		{:	RESULT = actual_tail;	:}
;

Atoms

Atoms are used to represent different non-numerical constant values; they start with lowercase letters, followed by a sequence of alphanumeric characters or _, or @.

Atoms in the miniErlang VM are represented as unsigned long integers, since two atoms can only be either equal or not equal (i.e. one can't be greater than the other).

Atom class:

public class Atom extends Term {
  private String stringForm;
  private long atomId;
  public Atom(String atom) {
    stringForm = atom;
    this.subgraphSize = 1 + Node.CLEANUP_LABEL_SIZE + Node.RESUME_LABEL_SIZE;
  }
 
  public void generateCode(Manager manager, Node parent) {
    super.generateCode(manager, parent);
    manager.dumpln("\t; start " + this.getClass().getName());
 
    atomId = manager.getAtom(stringForm);
    label = allocate(manager);    
    manager.dumpFormatln(
            "\tinvoke void %s(%%%s* %%%d, i64 %d)", Const.LITERAL_CONSTRUCT_ATOM, Const.LITERAL_STRUCT, label, atomId);
 
 
    long unwindLabel = label + 1;
    long branchLabel = unwindLabel + Node.CLEANUP_LABEL_SIZE + Node.RESUME_LABEL_SIZE;
    manager.dumpFormatln("\t\tto label %%%d unwind label %%%d", branchLabel, unwindLabel);
 
    manager.cleanupError();
    destructDependencies(manager, this);
    manager.resumeError();
  }
}

Boolean, Integer and Float terms are generated similarly to Atoms, but with a different constructor.

Lists

Lists are created by enclosing the list elements in square brackets; they can contain values of any type.

Lists can be represented in two ways in Erlang:

comma separated (e.g. [1, 2, 3, 4]);
head tail format (e.g. [1 | [2 | [3 | [4 | []]]]]);

The two representations are equivalent in terms of accessory data structures.

Two classes generate code for lists:

List: handles conversion from string to List Literal.
AltList: handles recursive list instantiation.

Every AltList must be terminated by an empty List object to keep the two representations equivalent.

AltList class:

/**
 * One list cell in head/tail form. The parser chains AltList nodes to
 * represent both [a, b, c] and [a | Tail]; a missing tail is replaced by an
 * empty List so that every chain is terminated.
 */
public class AltList extends Term {
  public Expression head;
  public Expression tail;
 
  /**
   * @param head expression producing the first element
   * @param tail expression producing the rest of the list, or null for the empty list
   */
  public AltList(Expression head, Expression tail) {
    this.head = head;
    this.tail = (tail == null ? new List("") : tail);
    // Label budget: the labels of head and tail, the cleanup and resume
    // blocks, plus 3 for this node's own code.
    this.subgraphSize =
        3
            + CLEANUP_LABEL_SIZE
            + RESUME_LABEL_SIZE
            + this.head.subgraphSize
            + this.tail.subgraphSize;
    ;
  }
 
  /** Generates head, then tail, then invokes the list-element constructor on both. */
  public void generateCode(Manager manager, Node parent) {
    super.generateCode(manager, parent);
    manager.dumpln("\t; start " + this.getClass().getName());
 
    head.generateCode(manager, this);
    manager.dumpCodeLabel();
    tail.generateCode(manager, this);
    manager.dumpCodeLabel();
 
    label = allocate(manager);
    manager.dumpFormatln(
        "\tinvoke void %s(%%%s* %%%d, %%%s* %%%d, %%%s* %%%d)",
        Const.LITERAL_CONSTRUCT_LIST_ELEMENT,
        Const.LITERAL_STRUCT,
        label,
        Const.LITERAL_STRUCT,
        head.label,
        Const.LITERAL_STRUCT,
        tail.label);
    // Normal flow continues after the cleanup and resume blocks; the unwind
    // block is the current label.
    long afterUnwind = manager.getCurrentLabel() + CLEANUP_LABEL_SIZE + RESUME_LABEL_SIZE;
    manager.dumpFormatln(
        "\t\tto label %%%d unwind label %%%d", afterUnwind, manager.getCurrentLabel());
    manager.cleanupError();
    destructDependencies(manager, this);
    manager.resumeError();
  }
 
  /**
   * Emits the deallocation of the objects this node depends on and returns
   * the highest label handled so far.
   *
   * Nothing is released when the request comes from head; presumably because
   * head is generated first, so tail does not exist yet at that point
   * (NOTE(review): confirm).
   */
  public long destructDependencies(Manager manager, Node caller) {
    long maxParentDep = super.destructDependencies(manager, caller);
    if (head != caller) {
      if (head.label > maxParentDep) {
        maxParentDep = Math.max(head.destruct(manager, this), maxParentDep);
      }
      if (tail != caller) {
        maxParentDep = Math.max(tail.destructDependencies(manager, this), maxParentDep);
      }
    }
 
    return maxParentDep;
  }
}

Expressions

In Erlang every instruction (except function definitions) is an expression, and every expression holds an immutable value. The Expression class is inherited by the great majority of all classes in the project.

Grammar rules regarding expressions:

// Every construct that yields a value. Most alternatives simply forward the
// node built by the specific rule.
expression ::=	
		pattern_matching:expr			{:	RESULT = expr;				:}
	|	term:expr				{:	RESULT = expr;				:}
	|	VARIABLE:var				{:	RESULT = new Variable(var);		:}
	|	term_comparison:expr			{:	RESULT = expr;				:}
	|	arithmetic_expression:expr		{:	RESULT = expr;				:} 
	|	boolean_expression:expr			{:	RESULT = expr;				:} 
	|	function_call:expr			{:	RESULT = expr;				:}
	|	ROUND_OPEN expression:expr ROUND_CLOSE	{:	RESULT = expr;				:}
	|	lists_append:expr			{:	RESULT = expr;				:}
;
 
// Comparison operators; each one builds its own node from the two operands.
term_comparison ::=
		expression:lhs EQ expression:rhs		{:	RESULT = new Equals(lhs, rhs);			:}
	|	expression:lhs EXACT_EQ expression:rhs		{:	RESULT = new ExactEquals(lhs, rhs);		:}
	|	expression:lhs NOT_EQ expression:rhs		{:	RESULT = new NotEquals(lhs, rhs);		:}
	|	expression:lhs EXACT_NOT_EQ expression:rhs	{:	RESULT = new ExactNotEquals(lhs, rhs);	:}
	|	expression:lhs LESS expression:rhs		{:	RESULT = new Less(lhs, rhs);			:}
	|	expression:lhs LESS_EQ expression:rhs		{:	RESULT = new LessEquals(lhs, rhs);		:}
	|	expression:lhs GREATER expression:rhs		{:	RESULT = new Greater(lhs, rhs);			:}
	|	expression:lhs GREATER_EQ expression:rhs	{:	RESULT = new GreaterEquals(lhs, rhs);	:}
;
 
// Unary plus returns its operand unchanged; unary minus wraps it in Negative.
arithmetic_expression ::=
		PLUS expression:val				{:	RESULT = val;						:}
	|	HYPHEN expression:val				{:	RESULT = new Negative(val);			:}
	|	expression:lhs PLUS expression:rhs		{:	RESULT = new Add(lhs, rhs);			:}
	|	expression:lhs HYPHEN expression:rhs 		{:	RESULT = new Sub(lhs, rhs);			:}
	|	expression:lhs STAR expression:rhs		{:	RESULT = new Mul(lhs, rhs);			:}
	|	expression:lhs SLASH expression:rhs		{:	RESULT = new Div(lhs, rhs);			:}
	|	expression:lhs K_DIV expression:rhs		{:	RESULT = new IntegerDiv(lhs, rhs);	:}
	|	expression:lhs K_REM expression:rhs		{:	RESULT = new Rem(lhs, rhs);			:}
;
 
// NOTE(review): the unary rule uses NOT, which the scanner returns for "!",
// while the keyword "not" is scanned as K_NOT (the token listed in the
// precedence declarations). Confirm which token is intended here.
boolean_expression ::=
		NOT expression:rhs			{:	RESULT = new Not(rhs);		:}
	|	expression:lhs K_AND expression:rhs	{:	RESULT = new And(lhs, rhs);	:}
	|	expression:lhs K_OR expression:rhs	{:	RESULT = new Or(lhs, rhs);	:}
	|	expression:lhs K_XOR expression:rhs	{:	RESULT = new Xor(lhs, rhs);	:}
;

Variables

Variables are a special kind of expression, in that they can hold no value (i.e. they can be unbound), but once a value is assigned to them, it cannot be changed; variables in Erlang are bound to values through pattern matching.

Example:

1> X.  ** 1: variable 'X' is unbound **
2> X = 2.
2
3> X + 1.
3
4> X = 3. ** exception error: no match of right hand side value 3

Binary Expressions

Most expressions are binary (they operate on 2 expressions). There are three types of binary expressions: right associative, left associative, and non associative. The precedence of the operators is defined as follows:

// Operator precedence table. Declarations are listed from the loosest-binding
// operators to the tightest-binding ones; terminals on the same line share the
// same precedence level and associativity.
// lowest priority
// Match operator (right associative).
precedence right MATCH;
// Comparison operators: non-associative, so they cannot be chained.
precedence nonassoc EQ, NOT_EQ, LESS_EQ, LESS, GREATER_EQ, GREATER, EXACT_EQ, EXACT_NOT_EQ;
// List append (right associative).
precedence right PLUS_PLUS;
// Additive level: or/xor share the level of + and -.
precedence left PLUS, HYPHEN, K_OR, K_XOR;
// Multiplicative level: and shares the level of /, *, div and rem.
precedence left SLASH, STAR, K_DIV, K_REM, K_AND;
// NOTE(review): the boolean_expression rule uses a terminal named NOT, not K_NOT — confirm.
precedence nonassoc K_NOT;
precedence nonassoc SHARP;
precedence nonassoc COLON;
// highest priority

Two general classes have been implemented to handle binary expressions (i.e. operations between two expressions):

LeftAssocBinaryExpression:

public abstract class LeftAssocBinaryExpression extends Expression {
  protected Expression lhs, rhs;
 
  public LeftAssocBinaryExpression(Expression lhs, Expression rhs) {
	  this.lhs = lhs;
	    this.rhs = rhs;
	  subgraphSize = 3 + rhs.subgraphSize + lhs.subgraphSize + CLEANUP_LABEL_SIZE + RESUME_LABEL_SIZE;
 
  }
 
  public void genericGenerateCode(String function, Manager manager, Node parent) {
	  super.generateCode(manager, parent);
 
	    rhs.generateCode(manager, this);
		  manager.dumpCodeLabel();
	    lhs.generateCode(manager, this);
 
 
    manager.dumpCodeLabel();
    label = allocate(manager);
    manager.dumpFormatln(
            "\tinvoke void %s(%%%s* sret align 8 %%%d, %%%s* %%%d, %%%s* nonnull align 8"
                + " dereferenceable(16) %%%d)",
            function,
            Const.LITERAL_STRUCT,
            label,
            Const.LITERAL_STRUCT,
            lhs.label,
            Const.LITERAL_STRUCT,
            rhs.label);
 
    long unwindLabel = label + 1;
    long branchLabel = unwindLabel + Node.CLEANUP_LABEL_SIZE + Node.RESUME_LABEL_SIZE;
    manager.dumpFormatln("\t\tto label %%%d unwind label %%%d", branchLabel, unwindLabel);
 
    manager.cleanupError();
    destructDependencies(manager, this);
    manager.resumeError();
  }
 
 
  public long destructDependencies(Manager manager, Node caller) {
    long maxParentDep = super.destructDependencies(manager, caller);
    if (rhs != caller) {
      if (rhs.label >= maxParentDep) {
        manager.dumpln("\t; l rhs (" + rhs.label + ") maxdep (" + maxParentDep + ")");
        maxParentDep = Math.max(rhs.destruct(manager, this), maxParentDep);
      }
      if (lhs != caller && lhs.label >= maxParentDep) {
        manager.dumpln("\t; l lhs (" + lhs.label + ") maxdep (" + maxParentDep + ")");
        maxParentDep = Math.max(lhs.destruct(manager, this), maxParentDep);
      }
    }
 
    return maxParentDep;
  }
}

This class is inherited by every left associative operation. The class for right associative operations is very similar, but inverts the order of the code generation of its dependencies.

Example of LeftAssociative class (Add):

public class Add extends LeftAssocBinaryExpression {
  public Add(Expression lhs, Expression rhs) {
    super(lhs, rhs);
  }
 
  public void generateCode(Manager manager, Node parent) {
    manager.dumpln("\t; start " + this.getClass().getName() + " (" + subgraphSize + ")");
    super.genericGenerateCode(Const.LITERAL_ADD, manager, parent);
  }
}

Function Calls

Function calls are very simple expressions; their name can include the module they come from.

// Function calls, local or module-qualified. A module-qualified name is passed
// to FunctionCall as "module.function"; a call without arguments passes null
// as its parameter list.
// NOTE(review): there is no rule for a module-qualified call with an empty
// argument list (ATOM COLON ATOM ROUND_OPEN ROUND_CLOSE) — confirm this is intended.
function_call ::=
		ATOM:function_name ROUND_OPEN ROUND_CLOSE						{:	RESULT = new FunctionCall(function_name, null);							:}
	|	ATOM:function_name ROUND_OPEN expression_seq:parameters ROUND_CLOSE			{:	RESULT = new FunctionCall(function_name, parameters);					:}
	|	ATOM:module COLON ATOM:function_name ROUND_OPEN expression_seq:parameters ROUND_CLOSE	{:	RESULT = new FunctionCall(module + "." + function_name, parameters);	:}
		// Error recovery: reports the problem; RESULT is not assigned in this action.
	|	ATOM:function_name ROUND_OPEN error							{:	syntaxError("Invalid function call.");						:}
;

The FunctionCall class translates the function name given as input into its LLVM IR equivalent and prints the corresponding parameters. FunctionCall.generateCode():

public void generateCode(Manager manager, Node parent) {
    super.generateCode(manager, parent);
 
    manager.recordFunctionCall(name, parameters);
 
 
    if (parameters != null) {
      parameters.generateCode(manager, this);
      manager.dumpCodeLabel();
      label = allocate(manager);
      manager.dump(
          String.format(
              "\tinvoke void %s(%%%s* sret align 8 %%%d",
              manager.getFunctionName(name, parameters), Const.LITERAL_STRUCT, label));
 
      ExpressionSequence dfs_node = parameters;
      while (dfs_node != null) {
        manager.dump(String.format(", %%%s* %%%d", Const.LITERAL_STRUCT, dfs_node.head.label));
        dfs_node = dfs_node.tail;
      }
      manager.dumpln(")");
 
    } else {
      label = allocate(manager);
      manager.dumpFormatln(
          "\tinvoke void %s(%%%s* sret align 8 %%%d)",
          manager.getFunctionName(name, parameters), Const.LITERAL_STRUCT, label);
    }
 
    long unwindLabel = manager.getCurrentLabel(),
        branchLabel = unwindLabel + Node.CLEANUP_LABEL_SIZE + Node.RESUME_LABEL_SIZE;
 
    manager.dumpFormatln("\t\tto label %%%d unwind label %%%d", branchLabel, unwindLabel);
 
    manager.cleanupError();
    destructDependencies(manager, this);
    manager.resumeError();
  }

The ++ operator is supported through the lists_append non terminal, that simply translates it into a call to the function 'lists:append':

// The ++ operator is rewritten as a call to "lists.append" with the two
// operands as a two-element argument sequence (a first, then b).
lists_append ::=
		expression:a PLUS_PLUS expression:b {:	RESULT = new FunctionCall("lists.append", new ExpressionSequence(a, new ExpressionSequence(b, null)));	:}
;

Pattern Matching

This is the most complex feature of Erlang: In a pattern matching expression, a left-hand side pattern is matched against a right-hand side term. If the matching succeeds, any unbound variables in the pattern become bound. If the matching fails, a run-time error occurs.

In this project, pattern matching was implemented for the match operator (=) only; it is not supported in function clauses, since those are inherently more complex to handle.

The following action code distinguishes between expressions whose left-hand side pattern is a list and those whose pattern is not, since a list can contain more than one variable to bind and therefore requires recursive handling.

// Match operator. A left-hand side that is an AltList is handled by
// ListMatching; every other left-hand side becomes a plain Match node.
// The two error rules report a malformed right or left operand respectively;
// RESULT is not assigned in those actions.
pattern_matching ::= 
		expression:lhs MATCH expression:rhs	{:	RESULT = lhs instanceof AltList ? new ListMatching((AltList)lhs, rhs) : new Match(lhs, rhs);	:}
	|	expression MATCH error			{:	syntaxError("Invalid match operation.");	:}
	|	error MATCH expression			{:	syntaxError("Invalid match operation.");	:}
;

The Match class is similar to other binary expressions such as the Add illustrated previously.

ListMatching is the most complex class of the entire compiler, because it has to handle nested pattern matching (e.g. [First, Second, [HeadOfThird | TailOfThird]] = [1, 2, [3, 4, 5]]).

ListMatching Class:

public class ListMatching extends Expression {
  static long temporaryLabel = 0;
  AltList lhs;
  Expression rhs;
  Expression headMatch, tailMatch;
 
  public ListMatching(AltList lhs, Expression rhs) {
    // User code can't declare variables starting with a lowercase letter, so there can't be any
    // collision.
    long currentTemporaryLabel = temporaryLabel++;
    this.rhs = new Match(new Variable("tmp" + currentTemporaryLabel), rhs);
    headMatch =
        lhs.head instanceof AltList
            ? new ListMatching(
                (AltList) lhs.head,
                new FunctionCall(
                    "hd",
                    new ExpressionSequence(new Variable("tmp" + currentTemporaryLabel), null)))
            : new Match(
                lhs.head,
                new FunctionCall(
                    "hd",
                    new ExpressionSequence(new Variable("tmp" + currentTemporaryLabel), null)));
    if (lhs.tail != null && lhs.tail instanceof AltList) {
      tailMatch =
          new ListMatching(
              (AltList) lhs.tail,
              new FunctionCall(
                  "tl", new ExpressionSequence(new Variable("tmp" + currentTemporaryLabel), null)));
 
    } else {
      tailMatch =
          new Match(
              lhs.tail != null ? lhs.tail : new List(""),
              new FunctionCall(
                  "tl", new ExpressionSequence(new Variable("tmp" + currentTemporaryLabel), null)));
    }
 
    this.subgraphSize = 2 + this.rhs.subgraphSize + headMatch.subgraphSize + tailMatch.subgraphSize;
  }
 
  public void generateCode(Manager manager, Node parent) {
    manager.dumpln("\t; start " + this.getClass().getName() + "(" + subgraphSize + ")");
    super.generateCode(manager, parent);
 
    rhs.generateCode(manager, this);
    label = rhs.label;
    manager.dumpCodeLabel();
 
    headMatch.generateCode(manager, this);
    manager.dumpCodeLabel();
 
    tailMatch.generateCode(manager, this);
  }
 
  public long destruct(Manager manager, Node caller) {
    return destructDependencies(manager, caller);
  }
 
  public long destructDependencies(Manager manager, Node caller) {
 
    long maxParentDep = 0;
    if (caller != parent) {
      maxParentDep = parent.destruct(manager, this);
    }
 
    if (rhs != caller) {
      if (rhs.label >= maxParentDep) {
        maxParentDep = Math.max(rhs.destruct(manager, this), maxParentDep);
      }
      if (headMatch != caller) {
        if (headMatch.label > maxParentDep) {
          maxParentDep = Math.max(headMatch.destruct(manager, this), maxParentDep);
        }
        if (tailMatch != caller && tailMatch.label > maxParentDep) {
          maxParentDep = Math.max(tailMatch.destruct(manager, this), maxParentDep);
        } else {
        }
      }
    }
    return maxParentDep;
  }
}

Example code

Fibonacci:

% fib(N): N-th Fibonacci number, defined with one clause per base case and a
% recursive clause for every other argument.
% NOTE: a negative argument never reaches a base case.
fib(0) -> 0;    
fib(1) -> 1;
% Res is bound but not used afterwards; the value of the match is the result.
fib(N) -> Res = fib(N-1) + fib(N-2).

Pattern Matching:

% Demonstrates the three supported uses of the match operator: binding a single
% variable, splitting a list into head and tail, and destructuring a nested list.
patternMatching() ->
    % Single variable pattern.
    Five = 5,
    io:format("Five: ~w~n", [Five]),
    % outputs: "Five: 5\n"
    % Head/tail list pattern.
    [H | T] = [1, 2, 3, 4],
    io:format("H: ~w T: ~w~n", [H, T]),
    % outputs: "H: 1 T: [2,3,4]\n"
    % Nested list pattern: the third element is itself split into head and tail.
    ComplexList = [1, 2, [3, 4, 5]],
    [First, Second, [HeadOfThird | TailOfThird]] = ComplexList,
    io:format("Destructured ComplexList -> First: ~w Second: ~w HeadOfThird: ~w TailOfThird: ~w~n", [First, Second, HeadOfThird, TailOfThird]).
    % outputs: "Destructured ComplexList -> First: 1 Second: 2 HeadOfThird: 3 TailOfThird: [4,5]\n"

Usage guide

The compiler targets the code written in ./test.erl. To compile and run that code, run make.

Compiler repository

Code is accessible at: https://github.com/enricocarraro/mini-erlang-compiler.

If you find any errors, or if you want to participate in the editing of this wiki, please contact: admin [at] skenz.it

You can reuse, distribute or modify the content of this page, but you must cite in any document (or webpage) this url: https://www.skenz.it/compilers/erlang_to_llvm?do=revisions

Skenz - How To Wiki

Table of Contents