Lex/Yacc: "line 1: syntax error, unexpected $end, expecting FUNCTION" lex compile error

1.5k views Asked by At

I'm trying to build my compile in lex and yacc for a new language name CSimple. (This is the manual for the language: http://www.cs.ucsb.edu/~chris/teaching/cs160/projects/language.html )

I need to print in pre-order the parse syntax tree given inputs like:

x=x+4;

and also-

procedure foo(i,j :integer) return integer { return 0;}

and so on...

For this input it's keep prints just the lexamas and always the compiler error says:

50, x20, =50, x30, +56, 439, ; //lexemas for the first input 
4, procedure7650, foo35, (50, i42, ,50, j61,  :7, integer36, )7615, return767, integer7640, {7615, return7656, 039, ;41, } //lexemas for the second input 
line 1: syntax error, unexpected $end, expecting FUNCTION //The error

This is my lex file:

%{
#include <stdlib.h>
#include <stdio.h>

#define YYDEBUG 0
void yyerror(const char *);
extern char* yytext;
char* func (char* token, int index);
%}

%%
boolean {func("BOOLEAN", 1);}
true {func("TRUE", 2);}
false {func("FALSE", 3);}
procedure {func("FUNCTION", 4);}
float {func("FLOAT", 5);}
char {func("CHAR", 6);}
integer {func("INT", 7);}
string {func("STRING", 8);}
intptr {func("INTPTR", 9);}
charptr {func("CHARPTR", 10);}
if {func("COND", 11);}
else {func("BLOCK", 12);}
while {func("WHILE_COND", 13);}
var {func("VARIABLE", 14);}
return {func("RETURN", 15);}
null {func("NL", 16);}

\&\& {func("AND", 17);}
\/ {func("DIVISION_OP", 18);}
\/\%.*\%\/ {func("COMMENT", 19);} 
\= {func("ASSIGN", 20);}
\=\= {func("EQUAL", 21);}
\> {func("BIGGER_THEN", 22);}
\>\= {func("BIGGER_OR_EQUAL", 23);}
\< {func("SMALLER_THEN", 24);}
\<\= {func("SMALLER_OR_EQUAL", 25);}
\- {func("MINUS", 26);}
\! {func("LOGICAL_NOT", 27);}
\!\= {func("NOT_EQUAL", 28);}
\|\| {func("OR", 29);}
\+ {func("PLUS", 30);}
\* {func("MUL", 31);}
\& {func("ADDRESS_OF", 32);}
\^ {func("DEREFERANCE", 33);}
\^\^ {func("SYNTAX_ERROR", 34);}
\( {func("L_BRACKET", 35);}
\) {func("R_BRACKET", 36);}
\[ {func("L_STRING_INDEX", 37);}
\] {func("R_STRING_INDEX", 38);}
\; {func("EOS", 39);}
\{ {func("OB", 40);}
\} {func("CB", 41);}
\, {func("COMMA", 42);}
\: {func("VAR_DEC", 43);}
\_ {func("UNDERSCORE", 44);}
\|[\-]*[0-9]+\| {func("ABSULUTE_VALUE_OF_INT", 45);}
\|[a-zA-Z0-9]+\| {func("DECLARED_LENGTH_OF_STRING", 46);}
\&[0-9]* {func("LINKER_ERROR", 47);}
\&[a-zA-Z]+[\+|\-|\*|\/][a-zA-Z]+ {func("LINKER_ERROR", 48);}
\&[^STRING_TYPE\[0-9]+\]] {func("LINKER_ERROR", 49);}

[a-zA-Z]+[_]*[a-zA-Z0-9]* {func("IDENTIFIER", 50);}
[\"][a-zA-Z0-9]+[\"] {func("STRING_TYPE", 51);}
[\'].[\'] {func("CHAR_TYPE", 52);}
[\']..+[\'] {func("SYNTAX_ERROR", 53);}
[0-9]+[\.][0-9]+ {func("FLOAT_CONST", 54);}
[\-][0-9]+[\.][0-9]+ {func("FLOAT_CONST", 55);}
0|[1-9]+[0-9]* {func("INTEGER_CONST", 56);}
[\-][1-9]+[0-9]* {func("INTEGER_CONST", 57);}
0[x|X][0-9]+[a-fA-F0-9]*[a-fA-F0-9]* {func("HEX_NUMBER", 58);}
[0][^xX][1-7]+[0-7]* {func("OCTAL_NUMBER", 59);}
[0|1]+[b] {func("BINARY_NUMBER", 60);}
[^IDENTIFIER][\:] {func("SYNTAX_ERROR", 61);}
[IDENTIFIER\,]*[IDENTIFIER\:] {func("PARAMETER_LIST", 62);}

\([.*]\)\[[.*]\] {func("SYNTAX_ERROR", 63);}
[^[[IDENTIFIER|string\[integer\]|[\^][a-zA-Z]]+]][\=] {func("TYPE_MISMATCH_ERROR", 64);}
[a-zA-Z]+[=][a-zA-Z]+[=] {func("SYNTAX_ERROR", 65);}
\"\m\a\i\n\(\)\" {func("CASE_SENSETIVE_ERROR", 66);}
\([^[\)]] {func("SYNTAX_ERROR", 67);} 
\{[^[\}]] {func("SYNTAX_ERROR", 68);}
if|while[^\(] {func("SYNTAX_ERROR", 69);}
else[^\{] {func("SYNTAX_ERROR", 70);}

procedure[^[[IDENTIFIER][\(][PARAMETER_LIST]*[\)]return[boolean|char|integer|intptr|charptr][\{]]] {func("FUNC_DECL_ERROR", 71);}
[PARAMETER_LIST][boolean|char|integer|intptr|charptr][\;] {func("DECL_LIST", 72);}
[IDENTIFIER\:][boolean|char|integer|intptr|charptr|string[INTEGER_CONST]][\;] {func("DECL", 73);}
var[^[DECL_LIST|DECL]] {func("DECL_ERROR", 74);}
return[^[[true|false|CHAR_TYPE|INTEGER_CONST][\;]]] {func("RETURN_ERROR", 75);}
[ ]+ {printf("76");}
--[^ \n\;\:\[\]\{\}\(\)\,]+ {func("SYNTAX_ERROR", 77);}
%%
char* func (char* token, int index)
{
 printf("%d, %s", index, yytext);
 return token;
}

int yywrap(void) {
    return 1;
}

This is the yacc file:

%{
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int yylex(void);
void yyerror(const char *);

typedef struct node{
        char *token;
    int line_number;
        struct node *left;
        struct node *right;
} node;

#define YYSTYPE struct node *

node *mknode(node *left, node *right, char *token);
void printtree(node *tree);
int yacc_line_number = 1;

%}

%error-verbose

%start program

%token BOOLEAN TRUE FALSE FUNCTION FLOAT CHAR INT STRING INTPTR CHARPTR DECL
%token L_BRACKET COND WHILE_COND BLOCK VARIABLE RETURN OB CB EOS FUNC_DECL_ERROR
%token AND DIVISION_OP COMMENT ASSIGN EQUAL BIGGER_THEN BIGGER_OR_EQUAL DECL_ERROR
%token SMALLER_THEN SMALLER_OR_EQUAL MINUS LOGICAL_NOT NOT_EQUAL OR PLUS DECL_LIST
%token MUL ADDRESS_OF SEMANTIC_ERROR DEREFERANCE SYNTAX_ERROR PARAMETER_LIST
%token R_BRACKET L_STRING_INDEX R_STRING_INDEX COMMA VAR_DEC NL TYPE_MISMATCH_ERROR
%token UNDERSCORE ABSULUTE_VALUE_OF_INT DECLARED_LENGTH_OF_STRING IDENTIFIER
%token STRING_TYPE CHAR_TYPE FLOAT_CONST INTEGER_CONST HEX_NUMBER OCTAL_NUMBER
%token BINARY_NUMBER CASE_SENSETIVE_ERROR LINKER_ERROR IFX RETURN_ERROR
%left EOS
//%right ASSIGN   

%nonassoc IFX
%nonassoc BLOCK

%%
program:method_declarations {printtree($1);};
method_declarations:method_declaration {$$=$1;}
                   |method_declarations method_declaration {$$ = mknode($1,$2,"");};

method_declaration:FUNCTION IDENTIFIER L_BRACKET R_BRACKET RETURN type OB statement_block CB {$1->left=$2; $1->right=$8; $$=$1;}
                   |FUNCTION IDENTIFIER L_BRACKET PARAMETER_LIST type R_BRACKET RETURN type OB statement_block CB {$1->left=$2; $1->right=$10; $$=$1;};
type: BOOLEAN {$$=$1;} | CHAR {$$=$1;} | CHARPTR {$$=$1;} | INTPTR {$$=$1;} | INT {$$=$1;};

statement_block: /* none */ {$$ = 0;} | statement_block statement {$$ = mknode($1,$2,"");};
statement: simple_statement EOS {$$=$1;} | compound_statement {$$=$1;} | OB statement_block CB {$$=$2;};

simple_statement: declarative_statement {$$=$1;}| assignment_statement {$$=$1;};
declarative_statement: VARIABLE IDENTIFIER dec_statement {$1->left = $2; $1->right=$3; $$=$1;};
dec_statement: VAR_DEC type EOS {$$=$2;};
assignment_statement: IDENTIFIER {$$=$1;} | IDENTIFIER ASSIGN expression { $2->left=$1; $2->right=$3; $$=$2;};

expression: or_expression {$$=$1;};
or_expression: and_expression {$$=$1;} | or_expression OR and_expression {$2->left=$1; $2->right=$3; $$=$2;};
and_expression: relop_expression {$$=$1;} | and_expression AND relop_expression {$2->left=$1; $2->right=$3; $$=$2;};
relop_expression: ltgt_expression {$$=$1;} | relop_expression NOT_EQUAL ltgt_expression {$2->left=$1;$2->right=$3; $$=$2;} | relop_expression EQUAL ltgt_expression {$2->left=$1;$2->right=$3; $$=$2;};
ltgt_expression: addop_expression {$$=$1;} | ltgt_expression BIGGER_THEN addop_expression {$2->left=$1; $2->right=$3; $$=$2;} | ltgt_expression SMALLER_THEN addop_expression {$2->left=$1; $2->right=$3; $$=$2;} | ltgt_expression BIGGER_OR_EQUAL addop_expression {$2->left=$1; $2->right=$3; $$=$2;} | ltgt_expression SMALLER_OR_EQUAL addop_expression {$2->left=$1; $2->right=$3; $$=$2;};
addop_expression: mulop_expression {$$=$1;} | addop_expression PLUS mulop_expression {$2->left=$1; $2->right=$3; $$=$2;} | addop_expression MINUS mulop_expression {$2->left=$1; $2->right=$3; $$=$2;};

mulop_expression: term {$$=$1;} | mulop_expression MUL term {$2->left=$1; $2->right=$3; $$=$2;} | mulop_expression DIVISION_OP term {$2->left=$1; $2->right=$3; $$=$2;};

term: LOGICAL_NOT value {$1->left=$2; $$=$1;} | PLUS value {$1->left=$2; $$=$1;} | MINUS value {$1->left=$2; $$=$1;} | value {$$=$1;};
value: IDENTIFIER {$$=$1;} | STRING_TYPE {$$=$1;} | CHAR_TYPE {$$=$1;} | FLOAT_CONST {$$=$1;} | HEX_NUMBER {$$=$1;} | INTEGER_CONST {$$=$1;} | OCTAL_NUMBER {$$=$1;} | BINARY_NUMBER {$$=$1;} | TRUE {$$=$1;} | FALSE {$$=$1;} | L_BRACKET expression R_BRACKET {$$=$2;};

compound_statement: if_statement {$$=$1;} | l_statement {$$=$1;};
if_statement: COND L_BRACKET expression R_BRACKET statement %prec IFX { $1->left=$3; $1->right=$5; $$=$1;}
            | COND L_BRACKET expression R_BRACKET statement BLOCK statement {$1->left=$3; $1->right = mknode($5,$6,""); $6->left=$7; $$=$1;};
l_statement: while_statement {$$=$1;};
while_statement: WHILE_COND L_BRACKET expression R_BRACKET statement {$1->left=$3; $1->right=$5; $$=$1;};

%%
#include "lex.yy.c"

int main (void) {yyparse(); return 0;}

node *mknode(node *left, node *right, char *token)
{
 node *newnode = (node *)malloc(sizeof(node));
 char *newstr = (char *)malloc(sizeof(token)+1);
 strcpy(newstr,token);
 newnode->left = left;    
 newnode->right = right;
 newnode->token = newstr;
 return(newnode);
}

void printtree(node *tree)
{
 int i; 
 static int line1 = 0;
 if(!tree){
  return;
 }
 if (tree->line_number > yacc_line_number){
   printf("\nLine(%d)",tree->line_number);
   yacc_line_number = tree->line_number;
 }
 if (tree->left || tree->right){
     if (tree->line_number == 1 && !line1){
        printf("\nLine(%d)",tree->line_number);
        line1 = 1;
     }
     if (tree->line_number>0) { 
       printf("\n");
            for(i = 0; i < tree->line_number; i++){
               printf("_");
                }
            printf("(");
      }
        }

        printf(" %s ",tree->token);

        if (tree->left){
          printtree(tree->left);
        }

        if (tree->right){
          printtree(tree->right);
        }

        if (tree->left || tree->right){
          printf(")");
        }
}

extern int yylineno;

void yyerror(const char *s) {
    fprintf(stderr, "line %d: %s\n", yylineno, s);
}

Help is needed :) thanks.

1

There are 1 answers

3
Chris Dodd On BEST ANSWER

Your lexer recognizes tokens and prints them, but never returns them to the parser, so it reads the entire input, printing the tokens, and then returns the $end (EOF, 0) token to the parser. The parser sees that token and gives a syntax error, as it requires at least one method_declaration in the input.

What you want is to have your lexer return the tokens as it recognizes them, rather than just continuing to read more tokens. The parser will call it repeatedly, expecting the next token each time. Your parser is also set up to expect node * values set by the lexer in yylval. So you need lex rules like:

boolean { yylval = mknode(0, 0, "BOOLEAN"); return BOOLEAN; }
true    { yylval = mknode(0, 0, "TRUE");    return TRUE; }
false   { yylval = mknode(0, 0, "FALSE");   return FALSE; }
    :

and so forth.