xquery-engine / src / main / resources / XGrammar.g4
XGrammar.g4
Raw
grammar XGrammar;

// Entry rule: one or more xq expressions, then end of input.
prog
    : xq+ EOF
    ;

// Absolute path: a document reference, a separator, then a relative path.
ap
    : doc SLASH rp      # ap_Slash      // rp is relative to the root node
    | doc DSLASH rp     # ap_Dslash     // rp is relative to any node
    ;

// Relative path.
// The order of the left-recursive alternatives is significant: ANTLR derives
// operator precedence from it (earlier alternative = binds tighter), so do not
// reorder them casually.
rp
    : TAGNAME               # rp_Tag        // element name
    | WILDCARD              # rp_Wild       // '*'
    | PERIOD                # rp_Per        // '.'
    | DPERIOD               # rp_Dper       // '..'
    | TEXT_FUNC             # rp_Text       // 'text()'
    | AT_SYM TAGNAME        # rp_Attr       // '@' followed by a name
    | '(' rp ')'            # rp_Par
    | rp SLASH rp           # rp_Slash
    | rp DSLASH rp          # rp_DSlash
    | rp '[' f ']'          # rp_Filter     // path restricted by filter f
    | rp COMMA rp           # rp_Concat
    ;

// Path filter (the expression inside rp '[' ... ']').
// Alternative order defines precedence for the left-recursive alternatives:
// AND is listed before OR so that AND binds tighter than OR, matching both
// the `cond` rule in this grammar and standard XPath precedence
// ("a or b and c" groups as "a or (b and c)").
// NOTE(review): NOT is listed last, so it appears to take the whole following
// and/or chain as its operand; parenthesize when a narrower scope is intended.
f
    : rp                    # f_Rp
    | rp EQ rp              # f_Eq          // comparison by object value
    | rp IS rp              # f_Is          // comparison by object id
    | rp EQ STRINGCONST     # f_Const       // rp = StringConstant, ensure that EQ is not 'eq'
    | '(' f ')'             # f_Par
    | f AND f               # f_And
    | f OR f                # f_Or
    | NOT f                 # f_Not
    ;

// Document reference: exactly one FILENAME token.
doc
    : FILENAME
    ;

//collection: ;

// XQuery expression.
// Alternative order is significant for the left-recursive alternatives
// (earlier = higher precedence); keep it as is.
// The label on the parenthesized alternative is spelled "xp_Par"; generated
// context/visitor names are derived from the label, so it must stay unchanged.
xq
    : ap                                                    # xq_Ap
    | '(' xq ')'                                            # xp_Par
    | xq SLASH rp                                           # xq_Slash
    | xq DSLASH rp                                          # xq_Dslash
    | xq COMMA xq                                           # xq_Concat
    // element constructor whose content is wrapped in braces
    | '<' TAGNAME '>' '{' xq '}' '<' SLASH TAGNAME '>'      # xq_Tag
    // element constructor whose content is written without braces
    | '<' TAGNAME '>' xq '<' SLASH TAGNAME '>'              # xq_Tag2
    // for / optional let / optional where / return
    | forClause letClause? whereClause? returnClause        # xq_FLWR
    | letClause xq                                          # xq_Let
    | FOR VAR IN join returnClause                          # xq_Join
    | VAR                                                   # xq_Var
    | STRINGCONST                                           # xq_Const
    ;

// Binding lists for `for` and `let`.
// Bindings are separated by exactly one COMMA. A trailing comma, a doubled
// comma, or a missing comma between two bindings is now a syntax error
// (the earlier `(binding (COMMA)*)+` form accepted all three).
// The set of tokens and sub-rules per rule is unchanged, so the generated
// context accessors (VAR(), IN()/ASSIGN(), xq(), COMMA()) keep their list shape.
forClause
    : FOR VAR IN xq (COMMA VAR IN xq)*
    ;

letClause
    : LET VAR ASSIGN xq (COMMA VAR ASSIGN xq)*
    ;

// WHERE keyword followed by a condition.
whereClause
    : WHERE cond
    ;

// RETURN keyword followed by the result expression.
returnClause
    : RETURN xq
    ;

//join rules
// Restricted return expression for join queries: variables, concatenation,
// element constructors (with or without braces) and absolute paths.
// No active rule in this file references it; its only mention is in the
// commented-out variant of xqJoin.
returnClauseJoin
    : VAR                                                           # join_return
    | returnClauseJoin COMMA returnClauseJoin                       # join_concat
    | '<' TAGNAME '>' '{' returnClauseJoin '}' '<' SLASH TAGNAME '>' # join_tag_constr
    | '<' TAGNAME '>' returnClauseJoin '<' SLASH TAGNAME '>'        # join_tag_no_constr
    | ap                                                            # join_ap
    ;

// Restricted condition for join queries: equality between variables and/or
// string constants, combined with AND.
// The two mixed variable/constant alternatives deliberately share one label,
// so they produce the same context type.
condClauseJoin
    : VAR EQ VAR                            # join_cond_var_eq
    | VAR EQ STRINGCONST                    # join_cond_var_const_eq
    | STRINGCONST EQ VAR                    # join_cond_var_const_eq
    | STRINGCONST EQ STRINGCONST            # join_cond_const_eq
    | condClauseJoin AND condClauseJoin     # join_cond_and
    ;

// Bracketed, comma-separated list of one or more names, e.g. [a, b, c].
// Names are separated by exactly one COMMA; trailing, doubled or missing
// commas are rejected (the earlier `(TAGNAME (COMMA)*)+` form accepted them).
// TAGNAME() and COMMA() remain list accessors in the generated context.
attrList
    : '[' TAGNAME (COMMA TAGNAME)* ']'
    ;

//xqJoin: forClause (WHERE condClauseJoin)* RETURN returnClauseJoin;
// One side of a join: a for clause, an optional where clause, and a return clause.
xqJoin
    : forClause whereClause? returnClause
    ;

// join( left , right , attrList {, attrList} )
// Each side is either a nested join or an xqJoin. The attribute lists that
// follow are separated by exactly one COMMA; a trailing, doubled or missing
// comma between lists is rejected (the earlier `(attrList (COMMA)*)+` form
// accepted them). One or more lists are still allowed, as before.
join
    : JOIN '(' (join | xqJoin) COMMA (join | xqJoin) COMMA attrList (COMMA attrList)* ')'
    ;


// Condition used by whereClause and by the `some ... satisfies` quantifier.
// Alternative order defines precedence for the left-recursive alternatives:
// AND is listed before OR, so AND binds tighter.
// In cond_Some the variable bindings are separated by exactly one COMMA;
// trailing, doubled or missing commas are rejected (the earlier
// `(VAR IN xq (COMMA)*)+` form accepted them). Accessor shapes are unchanged.
cond
    : cond AND cond                                         # cond_And
    | cond OR cond                                          # cond_Or
    | xq EQ xq                                              # cond_Eq
    | xq IS xq                                              # cond_Is
    | EMPTY '(' xq ')'                                      # cond_Empty
    | SOME VAR IN xq (COMMA VAR IN xq)* SATISFIES cond      # cond_Some
    | '(' cond ')'                                          # cond_Par
    | NOT cond                                              # cond_Not
    ;



// Path tokens.
// '.' and '..' are also matched by the TAGNAME pattern further down; these
// rules must stay above TAGNAME so that equal-length matches resolve to
// PERIOD / DPERIOD.
TEXT_FUNC   : 'text()' ;
AT_SYM      : '@' ;
SLASH       : '/' ;
DSLASH      : '//' ;
COMMA       : ',' ;
DQUOTE      : '"' ;
WILDCARD    : '*' ;
PERIOD      : '.' ;
DPERIOD     : '..' ;
DOLLAR      : '$' ;

// Filter tokens. EQ compares by value, IS compares by object id.
EQ  : 'eq' | '=' ;
IS  : '==' | 'is' ;
AND : 'and' ;
OR  : 'or' ;
NOT : 'not' ;


// XQuery keyword and variable tokens.
ASSIGN      : ':=' ;
WHERE       : 'where' ;
RETURN      : 'return' ;
LET         : 'let' ;
IN          : 'in' ;
FOR         : 'for' ;
SOME        : 'some' ;
SATISFIES   : 'satisfies' ;
EMPTY       : 'empty' ;
// A variable is '$' immediately followed by a TAGNAME-shaped name.
VAR         : DOLLAR TAGNAME ;

// Join keyword token.
JOIN : 'join' ;



// TAGNAME must be declared after the keyword tokens above; otherwise words
// such as 'and' or 'not' would be lexed as TAGNAME instead of their keyword.

// One or more letters, digits, periods, underscores or hyphens.
TAGNAME
    : [a-zA-Z0-9._-]+
    ;

// Document / collection reference, lexed as a single token:
//   doc("name")  |  document("name")  |  collection("name")
// For doc/document the name may freely mix TAGNAME characters with the listed
// special characters (space ! @ # $ % ^ & ( )), e.g. doc("my file.xml").
// Previously the '+' sat inside the second alternative, so a name had to be
// either a pure TAGNAME or made up solely of special characters; every name
// accepted before is still accepted.
// collection(...) still takes a plain TAGNAME only.
FILENAME
    : 'doc("'        (TAGNAME | [ !@#$%^&()])+ '")'
    | 'document("'   (TAGNAME | [ !@#$%^&()])+ '")'
    | 'collection("' TAGNAME '")'
    ;
// Backslash escape: a backslash followed by a single quote, a double quote
// or another backslash.
ESCAPE
    : '\\' ['"\\]
    ;
// String literal in double or single quotes. Inside the literal, the
// delimiting quote and the backslash may only appear as an ESCAPE sequence.
STRINGCONST
    : '"'  (ESCAPE | ~["\\])* '"'
    | '\'' (ESCAPE | ~['\\])* '\''
    ;
//TEXT: [a-zA-Z0-9_-]+; //must have at least one character
// Whitespace (space, tab, newline, carriage return) is discarded by the lexer.
WS
    : [ \t\n\r]+ -> skip
    ;