Loading Matlab2SMT/src/main/java/ca/mcmaster/cas/matlab2smt/App.java +1 −1 Original line number Diff line number Diff line Loading @@ -12,7 +12,7 @@ public class App { public static void main( String[] args ) { String[] args2 = {"ca.mcmaster.cas.matlab2smt.VariableParser", "varlist", "-tree"}; String[] args2 = {"ca.mcmaster.cas.matlab2smt.Matlab", "statement", "-tree"}; try { TestRig.main(args2); } catch (Exception ex) { Loading Matlab2SMT/src/main/java/ca/mcmaster/cas/matlab2smt/Matlab.g4 0 → 100644 +671 −0 Original line number Diff line number Diff line grammar Matlab; /* Things to fix: 0) lots of warnings. This is because of anonymous function handles; see details in appropriate XXX comment. if you comment out one line, all of the warnings go away. I believe that even with the warnings, the grammar is doing the right thing. 1) "clear" statements can accept wildcards as part of IDs, but we can't (due to the way lexing happens) 2) control flow statements require a newline or comma to resolve an ambiguity (but matlab doesn't) 3) vector construction requires comma delimiters because of whitespace madness (but matlab doesn't) 4) matrix transpose doesn't work (again because of lexing, and distinguishing strings from transposes). 5) block comments don't work (and I can't figure out why) For each case, see the appropriate XXX comment below. */ /*options { output=AST; } tokens { // imaginary nodes for our AST PROGRAM; FUNCTION; FUNCTION_RETURN; PARAMETER_LIST; FUNCTION_PARAMETER_LIST; STATEMENT_LIST; EXPRESSION; EXPR_STMT; NULL_STMT; ASSIGN; APPLY; FIELDACCESS; DYNFIELDACCESS; CELLACCESS; MATRIX; VECTOR; CELL; CLEAR; LHS; RHS; ID_NODE; PARENS; }*/ @parser::members{ // This scans backwards in the token stream looking for a hidden newline. // The newline must occur after the last visible token and before the current token. 
// C IMPLEMENTATION /* int check_for_hidden_newline( pmatlabParser ctx ) { pANTLR3_TOKEN_STREAM ts = ctx->pParser->tstream; int tok_ind = INDEX(); // this is the index of LT(1) pANTLR3_COMMON_TOKEN cur_tok = LT(1); ANTLR3_UINT32 cur_tok_chan = cur_tok->getChannel( cur_tok ); // this is the current channel. int rval = 0; while ( tok_ind > 0 ) { tok_ind--; cur_tok = ts->get( ts, tok_ind ); if ( cur_tok->getChannel( cur_tok ) == cur_tok_chan ) { // uh-oh. we found a non-hidden token further back in the stream, but no newline in between. fail. break; } if ( cur_tok->getType( cur_tok ) == NL ) { // found it! rval = 1; break; } } return rval; } */ // JAVA IMPLEMENTATION // Useful for debugging. NOTE: UNCOMMENT THE RELEVANT LINE IN hidden_nl! boolean check_for_hidden_newline( TokenStream input ) { int tok_ind = input.index(); // this is the index of LT(1) Token cur_tok = input.LT(1); int cur_tok_chan = cur_tok.getChannel(); // this is the current channel. boolean rval = false; while ( tok_ind > 0 ) { tok_ind--; cur_tok = input.get( tok_ind ); if ( cur_tok.getChannel() == cur_tok_chan ) { // uh-oh. we found a non-hidden token further back in the stream, but no newline in between. fail. break; } if ( cur_tok.getType() == NL ) { // found it! rval = true; break; } } return rval; } } // end of parser::members // // ================================================================== // // PARSER RULES // // we mostly want to ignore whitespace, but every now and then // it's significant -- as a statement delimiter, as part of // matrix construction, etc. This checks the hiddent channel for a newline. hidden_nl // Java : ( { check_for_hidden_newline( _input ) }? ) // C //: ( { check_for_hidden_newline( ctx ) }? 
) ; nloc : ( hidden_nl | COMMA ); // mnemonic: newline or comma nlos : ( hidden_nl | SEMI ); // mnemonic: newline or semicolon // // scripts and m-files // mfile : function_definition+; scriptfile: statement_list; program : func_or_statement_list; // // functions and statements // function_definition : FUNCTION function_return? ID parameter_list? nloc func_or_statement_list END ; function_return : ID EQ | LSBRACE (ID COMMA?)+? RSBRACE EQ ; // the contents of a function (or a .m file) are statements and function definitions func_or_statement: ( function_definition | statement ); func_or_statement_list: func_or_statement*; // there are times when you can have a list of statements, but not function // definitions -- for example, inside of an "if" block. statement_list : statement* ; parameter_list : LPAREN ( ID COMMA? )* RPAREN ; // Note: there is a functional difference between terminating a statement with a // newline vs. a semicolon, so we have to remember the appropriate token in the AST. statement : lhs EQ rhs nlosoc #ASSIGNMENT | expression nlosoc #EXPRESSION | if_statement #IF_STATEMENT | for_statement #FOR_STATEMENT | while_statement #WHILE_STATEMENT | switch_statement #SWITCH_STATEMENT | try_statement #TRY_STATEMENT | return_statement #RETURN_STATEMENT | break_statement #BREAK_STATEMENT | continue_statement #CONTINUE_STATEMENT | clear_statement # CLEAR_STATEMENT | global_statement #GLOBAL_STATEMENT | persistent_statement #PERSISTENT_STATEMENT | SEMI # NULL_STMT // a null statement ; nlosoc : ( hidden_nl | SEMI | COMMA )^; lhs: id_plus_indexers; rhs: expression; /* XXX CONTROL FLOW AMBIGUITIES: The statement while a (5) end; is parsed as while (a(5)) [empty] end; and not while a (5) end; but I can't seem to get a non-ambiguous grammar to do that. so, I enforce a newline or a comma after the expression to delimit it from the body of the statements. 
everywhere you see a "nloc" below, you shouldn't really need one; this grammar therefore parses only a subset of "true" matlab. (Of course, *I* think that reasonable code wouldn't have such ambiguities, don't you? :) ) Similarly: The statement while 1 -5, end; is parsed as while (1) -5, end; while the statement while 1 - 5, end; is parsed as while (1-5) [empty] end; NOTE : THIS FEELS A LOT LIKE THE VECTOR AMBIGUITIES BELOW. Perhaps solving one solves them both, since both are essentially 'space-delimited expressions' ambiguities. */ if_statement : IF expression nloc statement_list elseif_statement* else_statement? END ; elseif_statement : ELSEIF expression nloc statement_list ; else_statement : ELSE statement_list ; for_statement : FOR ID EQ expression nloc statement_list END ; while_statement : WHILE expression nloc statement_list END ; switch_statement : SWITCH expression nloc case_statement* otherwise_statement? END ; case_statement : CASE expression nloc statement_list ; otherwise_statement : OTHERWISE nloc statement_list ; try_statement : TRY statement_list catch_statement? END ; catch_statement : CATCH ID? nlosoc statement_list ; return_statement : RETURNS^ nlosoc ; break_statement : BREAK^ nlosoc ; continue_statement : CONTINUE^ nlosoc ; global_statement : GLOBAL (ID COMMA?)+? nlosoc ; persistent_statement : PERSISTENT (ID COMMA?)+? nlosoc ; /* XXX CLEAR STATEMENT WILDCARDS How can we fix wildcards in clear statements? For example, "clear foo* bob" ought to be parsed as clear (foo*) (bob) but this is hard since the '*' is lexed as its own character, and we've discarded the fact that there is whitespace between '*' and 'bob', but not between 'foo' and '*'. This means we can't tell that it's supposed to be part of the foo identifier. Options for solving: 1) parser-context sensitive lexing? (I don't think this is right) 2) poke around in the hidden channel to find out where the whitespace is. this is probably the best solution, but seems like a pain. 
*/ clear_statement : CLEAR (ID COMMA?)*? nlosoc ; // // =============================== // // a precedence hierarchy for parsing expressions // these are groups of operators that have equivalent precedences g1 : ( NEQ | DOUBLE_EQ | GRTE | GRT | LSTE | LST ); g2 : ( PLUS | MINUS ); g3 : ( LEFTDIV | RIGHTDIV | TIMES | EL_LEFTDIV | EL_RIGHTDIV | EL_TIMES ); g4 : ( EXP | EL_EXP ); /* XXX MATRIX TRANSPOSE PROBLEM. The single quote operator is problematic because of things like this: aa' + foo('some string here')+bb' Right now, the operator is placed in the correct place in the grammar, and the grammar checks out just fine, but you get lexing errors if you try to use it. */ postfix_operator : ( CCT | EL_CCT ); prefix_operator : ( PLUS | MINUS | NEG ); // the hierarchy is defined from LOWEST to HIGHEST priority. expression : e0; e0 : e1; e1 : e2 (LOG_OR^ e2)*; e2 : e3 (LOG_AND^ e3)*; e3 : e4 (BIN_OR^ e4)*; e4 : e5 (BIN_AND^ e5)*; e5 : e6 (g1^ e6)*; e6 : e7 (COLON^ e7)*; e7 : e8 (g2^ e8)*; e8 : e9 (g3^ e9)*; e9 : prefix_operator^ e9 | e10; e10 : e11 (g4^ e11)*; // note: in matlab, exponentiation is left-associative e11 : unary_expression postfix_operator^?; unary_expression : base_expression #BASE_EXPRESIION | LPAREN expression RPAREN # PARENS_EXPRESSION ; base_expression : id_plus_indexers | INT | FLOAT | STRING | anon_func_handle | cell | matrix ; /* XXX ANONYMOUS EXPRESSION AMBIGUITIES This generates a ton of warnings, but it does the right thing. At least, I think it does. We want anonymous expressions to be "greedy". The statement a+@()x+y parses as a+( @()x+y ) and not as a+ ( @()x ) +y but I can't seem to make the right behavior explicit. The way that ANTLR is disabling the alternatives seems to result in the right behavior, though. If you comment out the second alternative here, the grammar should check out perfectly clean. */ anon_func_handle : AT ID | AT parameter_list (expression)? 
; // this captures things like foo.(bar){3,4}.baz id_plus_indexers : ( i1=ID ) ( DOT ( i2=ID ) | LPAREN expression RPAREN | LPAREN fpl1=function_parameter_list? RPAREN | LBRACE fpl2=function_parameter_list RBRACE )* ; // also permits the use of the colon as an "expression" function_parameter_list : function_parameter ( COMMA function_parameter )* ; function_parameter : expression | COLON ; matrix : LSBRACE vector? ( nlos vector )* RSBRACE; cell : LBRACE vector? ( nlos vector )* RBRACE; /* XXX I think the rule is the following 1) if a +/- does not have any space to the right, it's interpreted as a unary op 2) if there's no whitespace to the left of the operator, it's grouped to the left. 3) if there's whitespace to the right and left, it's treated as a binary op, and expressions are required on the right and left. Vectors are pretty crazy. Because you have a list of expressions that can be separated by nothing but whitespace, all sorts of parsing ambiguities start happening. Whitespace becomes syntactically meaningful in strange ways that are not totally context free. For example, [ a + b ] is parsed as [ (a+b) ] [ a+b ] is parsed as [ (a+b) ] [ a +b ] is parsed as [ (a) (+b) ] A longer example: [ 1+ 2+ 3 ] parses to 6 [ 1 + 2+ 3 ] 6 [ 1 +2+ 3 ] 1 5 [ 1+ 2 + 3 ] 6 [ 1 + 2 + 3 ] 6 [ 1 +2 + 3 ] 1 5 [ 1+ 2 +3 ] 3 3 [ 1 + 2 +3 ] 3 3 [ 1 +2 +3 ] 1 2 3 So, the weird thing is that we can't just look for the absence of a space between '+' and the next character to determine if it's a binary or unary operator -- we also have to know if there's a space to the left of it. This is mostly a problem for prefix operators which can be ambiguous in this context, but it also shows up in things like cell arrays of anonymous function expressions. size( { @()a+b } ) is [ 1 1 ] size( { @()a +b } ) is [ 1 2 ] size( { @()a + b } ) is [ 1 1 ] NOTE : this feels like the control flow ambiguities above, since both are essentially "whitespace-delimited expression" problems. 
*/ // XXX The COMMA should really have a ? after it!!! // vector : ( expression COMMA? )+; vector : expression ( COMMA expression )*; // // ================================================================== // // LEXER RULES // // // language keywords // BREAK : 'break'; CASE : 'case'; CATCH : 'catch'; CONTINUE: 'continue'; ELSE : 'else'; ELSEIF : 'elseif'; END : 'end'; FOR : 'for'; FUNCTION: 'function'; GLOBAL : 'global'; IF : 'if'; OTHERWISE: 'otherwise'; PERSISTENT: 'persistent'; RETURNS : 'return'; // not "RETURN" to avoid #define conflicts with readline.h SWITCH : 'switch'; TRY : 'try'; VARARGIN: 'varargin'; WHILE : 'while'; CLEAR : 'clear'; ENDS : END SEMI? ; // // operators and assignments // DOUBLE_EQ : '=='; LOG_OR : '||'; LOG_AND : '&&'; LSTE : '<='; GRTE : '>='; NEQ : '~='; EL_TIMES : '.*'; EL_LEFTDIV : './'; EL_RIGHTDIV : '.\\'; EL_EXP : '.^'; EL_CCT : '.\''; EQ : '='; BIN_OR : '|'; BIN_AND : '&'; LST : '<'; GRT : '>'; COLON : ':'; PLUS : '+'; MINUS : '-'; NEG : '~'; TIMES : '*'; LEFTDIV : '/'; RIGHTDIV: '\\'; EXP : '^'; CCT : '\''; // // Other useful language snippets // SEMI : ';'; LPAREN : '('; RPAREN : ')'; LBRACE : '{'; RBRACE : '}'; LSBRACE : '['; RSBRACE : ']'; AT : '@'; DOT : '.'; COMMA : ','; // // comments // NL : '\r'? '\n' -> channel(HIDDEN); // newline // XXX I can't seem to get block comments to work. The problem is that // no matter what I do, the linecomment ends up "overriding" the block // comment, and I get a lex error. I've tried syntactic predicates, // but they didn't help... // If I comment out the LINECOMMENT rule, the BLOCKCOMMENT works fine. // So, since I can only seem to have one or the other, I'm commenting // out BLOCKCOMMENT for now. //BLOCKCOMMENT // : '%{' (options{greedy=false;} : .)* '%}' { $channel = HIDDEN; } // ; LINECOMMENT : '%' .*? NL -> channel(HIDDEN) ; // I think this is how to use syntactic predicates, but it doesn't seem to work. 
//COMMENT
//    : ( '%{' ) => '%{' (options{greedy=false;}: .)* '%}' { $channel = HIDDEN; }
//    | ( '%' (options{greedy=false;}: .)* NL ) { $channel = HIDDEN; }
//    ;

// Line continuation.  Everything from the ellipsis to the end of the line
// (trailing blanks or commentary included) is hidden together with the line
// break, so the break is NOT reported as an NL token and the statement
// continues on the next line.
THREEDOTS
    : '...' ~[\r\n]* NL -> channel(HIDDEN)
    ;

//
// identifiers, strings, numbers, whitespace
//

ID
    : ('a'..'z'|'A'..'Z') ('a'..'z'|'A'..'Z'|'0'..'9'|'_')*
    ;

INT
    : '0'..'9'+
    ;

// NOTE(review): the first alternative accepts a bare trailing dot, so "3.*x"
// lexes as FLOAT("3.") TIMES rather than INT EL_TIMES (longest match wins).
// Left unchanged here; confirm the intended behaviour.
FLOAT
    : ('0'..'9')+ '.' ('0'..'9')* EXPONENT?
    | '.' ('0'..'9')+ EXPONENT?
    | ('0'..'9')+ EXPONENT
    ;

// MATLAB character literal.  A quote inside the literal is written by doubling
// it ('it''s'); the backslash is an ordinary character, so 'C:\dir\' ends at
// its last quote.  A literal never spans a line break.
STRING
    : '\'' ( '\'\'' | ~('\'' | '\r' | '\n') )* '\''
    ;

WS
    : ( ' ' | '\t' ) -> skip
    ;

fragment
EXPONENT
    : ('e'|'E') ('+'|'-')? ('0'..'9')+
    ;

fragment
HEX_DIGIT
    : ('0'..'9'|'a'..'f'|'A'..'F')
    ;

// The escape fragments below are no longer referenced by STRING; they are
// kept so the C-style escape definitions stay available to other rules.
fragment
ESC_SEQ
    : '\\' ('b'|'t'|'n'|'f'|'r'|'\"'|'\''|'\\')
    | UNICODE_ESC
    | OCTAL_ESC
    ;

fragment
OCTAL_ESC
    : '\\' ('0'..'3') ('0'..'7') ('0'..'7')
    | '\\' ('0'..'7') ('0'..'7')
    | '\\' ('0'..'7')
    ;

fragment
UNICODE_ESC
    : '\\' 'u' HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT
    ;
Matlab2SMT/src/main/java/ca/mcmaster/cas/matlab2smt/App.java +1 −1 Original line number Diff line number Diff line Loading @@ -12,7 +12,7 @@ public class App { public static void main( String[] args ) { String[] args2 = {"ca.mcmaster.cas.matlab2smt.VariableParser", "varlist", "-tree"}; String[] args2 = {"ca.mcmaster.cas.matlab2smt.Matlab", "statement", "-tree"}; try { TestRig.main(args2); } catch (Exception ex) { Loading
// Matlab2SMT/src/main/java/ca/mcmaster/cas/matlab2smt/Matlab.g4
grammar Matlab;

/*
 Things to fix:

 0) lots of warnings. This is because of anonymous function handles; see details in
    appropriate XXX comment. if you comment out one line, all of the warnings go away.
    I believe that even with the warnings, the grammar is doing the right thing.
 1) "clear" statements can accept wildcards as part of IDs, but we can't (due to the
    way lexing happens)
 2) control flow statements require a newline or comma to resolve an ambiguity
    (but matlab doesn't)
 3) vector construction requires comma delimiters because of whitespace madness
    (but matlab doesn't)
 4) matrix transpose doesn't work (again because of lexing, and distinguishing
    strings from transposes).
 5) block comments don't work (and I can't figure out why)

 For each case, see the appropriate XXX comment below.
*/

/*options {
    output=AST;
}

tokens {
    // imaginary nodes for our AST
    PROGRAM;
    FUNCTION;
    FUNCTION_RETURN;
    PARAMETER_LIST;
    FUNCTION_PARAMETER_LIST;
    STATEMENT_LIST;
    EXPRESSION;
    EXPR_STMT;
    NULL_STMT;
    ASSIGN;
    APPLY;
    FIELDACCESS;
    DYNFIELDACCESS;
    CELLACCESS;
    MATRIX;
    VECTOR;
    CELL;
    CLEAR;
    LHS;
    RHS;
    ID_NODE;
    PARENS;
}*/

@parser::members{

    // This scans backwards in the token stream looking for a hidden newline.
    // The newline must occur after the last visible token and before the current token.

    // C IMPLEMENTATION
    /*
    int check_for_hidden_newline( pmatlabParser ctx ) {
        pANTLR3_TOKEN_STREAM ts = ctx->pParser->tstream;
        int tok_ind = INDEX(); // this is the index of LT(1)
        pANTLR3_COMMON_TOKEN cur_tok = LT(1);
        ANTLR3_UINT32 cur_tok_chan = cur_tok->getChannel( cur_tok ); // this is the current channel.
        int rval = 0;
        while ( tok_ind > 0 ) {
            tok_ind--;
            cur_tok = ts->get( ts, tok_ind );
            if ( cur_tok->getChannel( cur_tok ) == cur_tok_chan ) {
                // uh-oh. we found a non-hidden token further back in the stream, but no newline in between. fail.
                break;
            }
            if ( cur_tok->getType( cur_tok ) == NL ) {
                // found it!
                rval = 1;
                break;
            }
        }
        return rval;
    }
    */

    // JAVA IMPLEMENTATION
    // Useful for debugging. NOTE: UNCOMMENT THE RELEVANT LINE IN hidden_nl!

    /**
     * Reports whether a line break lies between the previous visible token and
     * the token about to be matched, LT(1).
     *
     * The tokens in front of LT(1) are walked from nearest to farthest.  The walk
     * ends with true at the first NL or LINECOMMENT token, and with false as soon
     * as a token on the same channel as LT(1) is reached or the start of the
     * stream is hit.
     *
     * @param input the token stream being parsed (the parser passes _input)
     * @return true if a line break, or the end of the input, ends the current line
     */
    boolean check_for_hidden_newline( TokenStream input ) {
        Token upcoming = input.LT(1);

        // The end of the input closes the final line even when the file has no
        // trailing line break; otherwise the last statement could never be terminated.
        if ( upcoming.getType() == Token.EOF ) {
            return true;
        }

        int visibleChannel = upcoming.getChannel();
        for ( int i = input.index() - 1; i >= 0; i-- ) { // index() is the position of LT(1)
            Token earlier = input.get( i );
            if ( earlier.getChannel() == visibleChannel ) {
                // reached the previous visible token with no line break in between
                return false;
            }
            int type = earlier.getType();
            // A line comment always runs to the end of its line, so it marks a line
            // break as well; the LINECOMMENT rule may consume the NL itself, in which
            // case no separate NL token exists to be found.
            if ( type == NL || type == LINECOMMENT ) {
                return true;
            }
        }
        return false;
    }

} // end of parser::members

//
// ==================================================================
//
// PARSER RULES
//

// we mostly want to ignore whitespace, but every now and then
// it's significant -- as a statement delimiter, as part of
// matrix construction, etc. This checks the hidden channel for a newline.
hidden_nl
    // Java
    : ( { check_for_hidden_newline( _input ) }? )
    // C
    //: ( { check_for_hidden_newline( ctx ) }? )
    ;

nloc : ( hidden_nl | COMMA ); // mnemonic: newline or comma
nlos : ( hidden_nl | SEMI );  // mnemonic: newline or semicolon

//
// scripts and m-files
//

mfile      : function_definition+;
scriptfile : statement_list;
program    : func_or_statement_list;

//
// functions and statements
//

function_definition
    : FUNCTION function_return? ID parameter_list? nloc
      func_or_statement_list
      END
    ;

function_return
    : ID EQ
    | LSBRACE (ID COMMA?)+? RSBRACE EQ
    ;

// the contents of a function (or a .m file) are statements and function definitions
func_or_statement      : ( function_definition | statement );
func_or_statement_list : func_or_statement*;

// there are times when you can have a list of statements, but not function
// definitions -- for example, inside of an "if" block.
statement_list
    : statement*
    ;

parameter_list
    : LPAREN ( ID COMMA? )* RPAREN
    ;

// Note: there is a functional difference between terminating a statement with a
// newline vs. a semicolon, so we have to remember the appropriate token in the AST.
statement
    : lhs EQ rhs nlosoc     # ASSIGNMENT
    | expression nlosoc     # EXPRESSION
    | if_statement          # IF_STATEMENT
    | for_statement         # FOR_STATEMENT
    | while_statement       # WHILE_STATEMENT
    | switch_statement      # SWITCH_STATEMENT
    | try_statement         # TRY_STATEMENT
    | return_statement      # RETURN_STATEMENT
    | break_statement       # BREAK_STATEMENT
    | continue_statement    # CONTINUE_STATEMENT
    | clear_statement       # CLEAR_STATEMENT
    | global_statement      # GLOBAL_STATEMENT
    | persistent_statement  # PERSISTENT_STATEMENT
    | SEMI                  # NULL_STMT // a null statement
    ;

// Statement terminator: a hidden newline, a semicolon, or a comma.
// (The ANTLR 3 '^' AST-root suffixes that used to decorate this rule and the
// rules below are gone: ANTLR 4 has no AST operators.)
nlosoc : ( hidden_nl | SEMI | COMMA );

lhs: id_plus_indexers;
rhs: expression;

/*
 XXX CONTROL FLOW AMBIGUITIES:

 The statement
     while a (5) end;
 is parsed as
     while (a(5)) [empty] end;
 and not
     while a (5) end;
 but I can't seem to get a non-ambiguous grammar to do that. so, I enforce a
 newline or a comma after the expression to delimit it from the body of the
 statements. everywhere you see a "nloc" below, you shouldn't really need one;
 this grammar therefore parses only a subset of "true" matlab. (Of course, *I*
 think that reasonable code wouldn't have such ambiguities, don't you? :) )

 Similarly:
 The statement
     while 1 -5, end;
 is parsed as
     while (1) -5, end;
 while the statement
     while 1 - 5, end;
 is parsed as
     while (1-5) [empty] end;

 NOTE : THIS FEELS A LOT LIKE THE VECTOR AMBIGUITIES BELOW.
 Perhaps solving one solves them both, since both are essentially
 'space-delimited expressions' ambiguities.
*/

if_statement
    : IF expression nloc statement_list elseif_statement* else_statement? END
    ;

elseif_statement
    : ELSEIF expression nloc statement_list
    ;

else_statement
    : ELSE statement_list
    ;

for_statement
    : FOR ID EQ expression nloc statement_list END
    ;

while_statement
    : WHILE expression nloc statement_list END
    ;

switch_statement
    : SWITCH expression nloc case_statement* otherwise_statement? END
    ;

case_statement
    : CASE expression nloc statement_list
    ;

otherwise_statement
    : OTHERWISE nloc statement_list
    ;

try_statement
    : TRY statement_list catch_statement? END
    ;

catch_statement
    : CATCH ID? nlosoc statement_list
    ;

return_statement
    : RETURNS nlosoc
    ;

break_statement
    : BREAK nlosoc
    ;

continue_statement
    : CONTINUE nlosoc
    ;

global_statement
    : GLOBAL (ID COMMA?)+? nlosoc
    ;

persistent_statement
    : PERSISTENT (ID COMMA?)+? nlosoc
    ;

/*
 XXX CLEAR STATEMENT WILDCARDS

 How can we fix wildcards in clear statements? For example,
     "clear foo* bob"
 ought to be parsed as
     clear (foo*) (bob)
 but this is hard since the '*' is lexed as its own character, and we've
 discarded the fact that there is whitespace between '*' and 'bob', but not
 between 'foo' and '*'. This means we can't tell that it's supposed to be part
 of the foo identifier.

 Options for solving:
 1) parser-context sensitive lexing? (I don't think this is right)
 2) poke around in the hidden channel to find out where the whitespace is.
    this is probably the best solution, but seems like a pain.
*/
clear_statement
    : CLEAR (ID COMMA?)*? nlosoc
    ;

//
// ===============================
//
// a precedence hierarchy for parsing expressions

// these are groups of operators that have equivalent precedences
g1 : ( NEQ | DOUBLE_EQ | GRTE | GRT | LSTE | LST );
g2 : ( PLUS | MINUS );
g3 : ( LEFTDIV | RIGHTDIV | TIMES | EL_LEFTDIV | EL_RIGHTDIV | EL_TIMES );
g4 : ( EXP | EL_EXP );

/*
 XXX MATRIX TRANSPOSE PROBLEM.

 The single quote operator is problematic because of things like this:
     aa' + foo('some string here')+bb'
 Right now, the operator is placed in the correct place in the grammar, and the
 grammar checks out just fine, but you get lexing errors if you try to use it.
*/
postfix_operator : ( CCT | EL_CCT );
prefix_operator  : ( PLUS | MINUS | NEG );

// the hierarchy is defined from LOWEST to HIGHEST priority.
expression : e0;

e0  : e1;
e1  : e2 (LOG_OR e2)*;
e2  : e3 (LOG_AND e3)*;
e3  : e4 (BIN_OR e4)*;
e4  : e5 (BIN_AND e5)*;
e5  : e6 (g1 e6)*;
e6  : e7 (COLON e7)*;
e7  : e8 (g2 e8)*;
e8  : e9 (g3 e9)*;
e9  : prefix_operator e9 | e10;
e10 : e11 (g4 e11)*; // note: in matlab, exponentiation is left-associative
e11 : unary_expression postfix_operator?;

// NOTE(review): the label BASE_EXPRESIION is misspelled; it is left as is
// because the label determines the name of the generated context class.
unary_expression
    : base_expression            # BASE_EXPRESIION
    | LPAREN expression RPAREN   # PARENS_EXPRESSION
    ;

base_expression
    : id_plus_indexers
    | INT
    | FLOAT
    | STRING
    | anon_func_handle
    | cell
    | matrix
    ;

/*
 XXX ANONYMOUS EXPRESSION AMBIGUITIES

 This generates a ton of warnings, but it does the right thing. At least, I
 think it does.

 We want anonymous expressions to be "greedy". The statement
     a+@()x+y
 parses as
     a+( @()x+y )
 and not as
     a+ ( @()x ) +y
 but I can't seem to make the right behavior explicit. The way that ANTLR is
 disabling the alternatives seems to result in the right behavior, though.

 If you comment out the second alternative here, the grammar should check out
 perfectly clean.
*/
anon_func_handle
    : AT ID
    | AT parameter_list (expression)?
    ;

// this captures things like foo.(bar){3,4}.baz
id_plus_indexers
    : ( i1=ID )
      ( DOT ( i2=ID )
      | LPAREN expression RPAREN
      | LPAREN fpl1=function_parameter_list? RPAREN
      | LBRACE fpl2=function_parameter_list RBRACE
      )*
    ;

// also permits the use of the colon as an "expression"
function_parameter_list
    : function_parameter ( COMMA function_parameter )*
    ;

function_parameter
    : expression
    | COLON
    ;

matrix : LSBRACE vector? ( nlos vector )* RSBRACE;
cell   : LBRACE vector? ( nlos vector )* RBRACE;

/*
 XXX I think the rule is the following
 1) if a +/- does not have any space to the right, it's interpreted as a unary op
 2) if there's no whitespace to the left of the operator, it's grouped to the left.
 3) if there's whitespace to the right and left, it's treated as a binary op,
    and expressions are required on the right and left.

 Vectors are pretty crazy. Because you have a list of expressions that can be
 separated by nothing but whitespace, all sorts of parsing ambiguities start
 happening. Whitespace becomes syntactically meaningful in strange ways that
 are not totally context free. For example,

     [ a + b ]   is parsed as   [ (a+b) ]
     [ a+b ]     is parsed as   [ (a+b) ]
     [ a +b ]    is parsed as   [ (a) (+b) ]

 A longer example:

     [ 1+ 2+ 3 ]     parses to   6
     [ 1 + 2+ 3 ]                6
     [ 1 +2+ 3 ]                 1 5
     [ 1+ 2 + 3 ]                6
     [ 1 + 2 + 3 ]               6
     [ 1 +2 + 3 ]                1 5
     [ 1+ 2 +3 ]                 3 3
     [ 1 + 2 +3 ]                3 3
     [ 1 +2 +3 ]                 1 2 3

 So, the weird thing is that we can't just look for the absence of a space
 between '+' and the next character to determine if it's a binary or unary
 operator -- we also have to know if there's a space to the left of it.

 This is mostly a problem for prefix operators which can be ambiguous in this
 context, but it also shows up in things like cell arrays of anonymous function
 expressions.

     size( { @()a+b } )     is [ 1 1 ]
     size( { @()a +b } )    is [ 1 2 ]
     size( { @()a + b } )   is [ 1 1 ]

 NOTE : this feels like the control flow ambiguities above, since both are
 essentially "whitespace-delimited expression" problems.
*/

// XXX The COMMA should really have a ? after it!!!
// vector : ( expression COMMA? )+;
vector : expression ( COMMA expression )*;

//
// ==================================================================
//
// LEXER RULES
//

//
// language keywords
//

BREAK      : 'break';
CASE       : 'case';
CATCH      : 'catch';
CONTINUE   : 'continue';
ELSE       : 'else';
ELSEIF     : 'elseif';
END        : 'end';
FOR        : 'for';
FUNCTION   : 'function';
GLOBAL     : 'global';
IF         : 'if';
OTHERWISE  : 'otherwise';
PERSISTENT : 'persistent';
RETURNS    : 'return'; // not "RETURN" to avoid #define conflicts with readline.h
SWITCH     : 'switch';
TRY        : 'try';
// NOTE(review): no parser rule references VARARGIN, so 'varargin' cannot
// currently appear where an ID is expected -- confirm whether that is intended.
VARARGIN   : 'varargin';
WHILE      : 'while';
CLEAR      : 'clear';

// There is deliberately no combined "end;" token.  The former rule
//     ENDS : END SEMI? ;
// out-matched END on the input "end;" (the lexer prefers the longest match),
// and no parser rule accepts ENDS, so any block closed with "end;" failed to
// parse.  With END and SEMI lexed separately, a ';' after END is available to
// the null-statement alternative of "statement".

//
// operators and assignments
//
// NOTE(review): MATLAB calls '/' right division and '\' left division, which is
// the reverse of the token names below; the names are kept because they are
// part of the generated API.

DOUBLE_EQ   : '==';
LOG_OR      : '||';
LOG_AND     : '&&';
LSTE        : '<=';
GRTE        : '>=';
NEQ         : '~=';

EL_TIMES    : '.*';
EL_LEFTDIV  : './';
EL_RIGHTDIV : '.\\';
EL_EXP      : '.^';
EL_CCT      : '.\'';

EQ          : '=';

BIN_OR      : '|';
BIN_AND     : '&';

LST         : '<';
GRT         : '>';

COLON       : ':';

PLUS        : '+';
MINUS       : '-';
NEG         : '~';
TIMES       : '*';

LEFTDIV     : '/';
RIGHTDIV    : '\\';

EXP         : '^';

CCT         : '\'';

//
// Other useful language snippets
//

SEMI    : ';';
LPAREN  : '(';
RPAREN  : ')';
LBRACE  : '{';
RBRACE  : '}';
LSBRACE : '[';
RSBRACE : ']';
AT      : '@';
DOT     : '.';
COMMA   : ',';

//
// comments
//

NL : '\r'? '\n' -> channel(HIDDEN); // newline

// XXX I can't seem to get block comments to work. The problem is that
// no matter what I do, the linecomment ends up "overriding" the block
// comment, and I get a lex error. I've tried syntactic predicates,
// but they didn't help...
// If I comment out the LINECOMMENT rule, the BLOCKCOMMENT works fine.
// So, since I can only seem to have one or the other, I'm commenting
// out BLOCKCOMMENT for now.

//BLOCKCOMMENT
//    : '%{' (options{greedy=false;} : .)* '%}' { $channel = HIDDEN; }
//    ;

// A line comment stops just before the line break.  The break is then emitted
// as its own NL token on the hidden channel, where hidden_nl looks for it, and
// a comment on the last line of a file without a trailing newline still lexes.
LINECOMMENT
    : '%' ~[\r\n]* -> channel(HIDDEN)
    ;

// I think this is how to use syntactic predicates, but it doesn't seem to work.
//COMMENT
//    : ( '%{' ) => '%{' (options{greedy=false;}: .)* '%}' { $channel = HIDDEN; }
//    | ( '%' (options{greedy=false;}: .)* NL ) { $channel = HIDDEN; }
//    ;

// Line continuation.  Everything from the ellipsis to the end of the line
// (trailing blanks or commentary included) is hidden together with the line
// break, so the break is NOT reported as an NL token and the statement
// continues on the next line.
THREEDOTS
    : '...' ~[\r\n]* NL -> channel(HIDDEN)
    ;

//
// identifiers, strings, numbers, whitespace
//

ID
    : ('a'..'z'|'A'..'Z') ('a'..'z'|'A'..'Z'|'0'..'9'|'_')*
    ;

INT
    : '0'..'9'+
    ;

// NOTE(review): the first alternative accepts a bare trailing dot, so "3.*x"
// lexes as FLOAT("3.") TIMES rather than INT EL_TIMES (longest match wins).
// Left unchanged here; confirm the intended behaviour.
FLOAT
    : ('0'..'9')+ '.' ('0'..'9')* EXPONENT?
    | '.' ('0'..'9')+ EXPONENT?
    | ('0'..'9')+ EXPONENT
    ;

// MATLAB character literal.  A quote inside the literal is written by doubling
// it ('it''s'); the backslash is an ordinary character, so 'C:\dir\' ends at
// its last quote.  A literal never spans a line break.
STRING
    : '\'' ( '\'\'' | ~('\'' | '\r' | '\n') )* '\''
    ;

WS
    : ( ' ' | '\t' ) -> skip
    ;

fragment
EXPONENT
    : ('e'|'E') ('+'|'-')? ('0'..'9')+
    ;

fragment
HEX_DIGIT
    : ('0'..'9'|'a'..'f'|'A'..'F')
    ;

// The escape fragments below are no longer referenced by STRING; they are
// kept so the C-style escape definitions stay available to other rules.
fragment
ESC_SEQ
    : '\\' ('b'|'t'|'n'|'f'|'r'|'\"'|'\''|'\\')
    | UNICODE_ESC
    | OCTAL_ESC
    ;

fragment
OCTAL_ESC
    : '\\' ('0'..'3') ('0'..'7') ('0'..'7')
    | '\\' ('0'..'7') ('0'..'7')
    | '\\' ('0'..'7')
    ;

fragment
UNICODE_ESC
    : '\\' 'u' HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT
    ;