"use strict" var hasOwnProperty = Object.prototype.hasOwnProperty var toString = Object.prototype.toString var hasSticky = typeof new RegExp().sticky === "boolean" /***************************************************************************/ function isRegExp(o) { return o && toString.call(o) === "[object RegExp]" } function isObject(o) { return o && typeof o === "object" && !isRegExp(o) && !Array.isArray(o) } function reEscape(s) { return s.replace(/[-\/\\^$*+?.()|[\]{}]/g, "\\$&") } function reGroups(s) { var re = new RegExp("|" + s) return re.exec("").length - 1 } function reCapture(s) { return "(" + s + ")" } function reUnion(regexps) { if (!regexps.length) return "(?!)" var source = regexps .map(function (s) { return "(?:" + s + ")" }) .join("|") return "(?:" + source + ")" } function regexpOrLiteral(obj) { if (typeof obj === "string") { return "(?:" + reEscape(obj) + ")" } else if (isRegExp(obj)) { // TODO: consider /u support if (obj.ignoreCase) throw new Error("RegExp /i flag not allowed") if (obj.global) throw new Error("RegExp /g flag is implied") if (obj.sticky) throw new Error("RegExp /y flag is implied") if (obj.multiline) throw new Error("RegExp /m flag is implied") return obj.source } else { throw new Error("Not a pattern: " + obj) } } function pad(s, length) { if (s.length > length) { return s } return Array(length - s.length + 1).join(" ") + s } function lastNLines(string, numLines) { var position = string.length var lineBreaks = 0 while (true) { var idx = string.lastIndexOf("\n", position - 1) if (idx === -1) { break } else { lineBreaks++ } position = idx if (lineBreaks === numLines) { break } if (position === 0) { break } } var startPosition = lineBreaks < numLines ? 0 : position + 1 return string.substring(startPosition).split("\n") } function objectToRules(object) { var keys = Object.getOwnPropertyNames(object) var result = [] for (var i = 0; i < keys.length; i++) { var key = keys[i] var thing = object[key] var rules = [].concat(thing) if (key === "include") { for (var j = 0; j < rules.length; j++) { result.push({ include: rules[j] }) } continue } var match = [] rules.forEach(function (rule) { if (isObject(rule)) { if (match.length) result.push(ruleOptions(key, match)) result.push(ruleOptions(key, rule)) match = [] } else { match.push(rule) } }) if (match.length) result.push(ruleOptions(key, match)) } return result } function arrayToRules(array) { var result = [] for (var i = 0; i < array.length; i++) { var obj = array[i] if (obj.include) { var include = [].concat(obj.include) for (var j = 0; j < include.length; j++) { result.push({ include: include[j] }) } continue } if (!obj.type) { throw new Error("Rule has no type: " + JSON.stringify(obj)) } result.push(ruleOptions(obj.type, obj)) } return result } function ruleOptions(type, obj) { if (!isObject(obj)) { obj = { match: obj } } if (obj.include) { throw new Error("Matching rules cannot also include states") } // nb. 
  // nb. error and fallback imply lineBreaks
  var options = {
    defaultType: type,
    lineBreaks: !!obj.error || !!obj.fallback,
    pop: false,
    next: null,
    push: null,
    error: false,
    fallback: false,
    value: null,
    type: null,
    shouldThrow: false,
  }

  // Avoid Object.assign(), so we support IE9+
  for (var key in obj) {
    if (hasOwnProperty.call(obj, key)) {
      options[key] = obj[key]
    }
  }

  // type transform cannot be a string
  if (typeof options.type === "string" && type !== options.type) {
    throw new Error(
      "Type transform cannot be a string (type '" + options.type + "' for token '" + type + "')"
    )
  }

  // convert to array
  var match = options.match
  options.match = Array.isArray(match) ? match : match ? [match] : []
  options.match.sort(function (a, b) {
    return isRegExp(a) && isRegExp(b)
      ? 0
      : isRegExp(b)
      ? -1
      : isRegExp(a)
      ? +1
      : b.length - a.length
  })
  return options
}

function toRules(spec) {
  return Array.isArray(spec) ? arrayToRules(spec) : objectToRules(spec)
}

var defaultErrorRule = ruleOptions("error", { lineBreaks: true, shouldThrow: true })

function compileRules(rules, hasStates) {
  var errorRule = null
  var fast = Object.create(null)
  var fastAllowed = true
  var unicodeFlag = null
  var groups = []
  var parts = []

  // If there is a fallback rule, then disable fast matching
  for (var i = 0; i < rules.length; i++) {
    if (rules[i].fallback) {
      fastAllowed = false
    }
  }

  for (var i = 0; i < rules.length; i++) {
    var options = rules[i]

    if (options.include) {
      // all valid inclusions are removed by states() preprocessor
      throw new Error("Inheritance is not allowed in stateless lexers")
    }

    if (options.error || options.fallback) {
      // errorRule can only be set once
      if (errorRule) {
        if (!options.fallback === !errorRule.fallback) {
          throw new Error(
            "Multiple " +
              (options.fallback ? "fallback" : "error") +
              " rules not allowed (for token '" +
              options.defaultType +
              "')"
          )
        } else {
          throw new Error(
            "fallback and error are mutually exclusive (for token '" + options.defaultType + "')"
          )
        }
      }
      errorRule = options
    }

    var match = options.match.slice()
    if (fastAllowed) {
      while (match.length && typeof match[0] === "string" && match[0].length === 1) {
        var word = match.shift()
        fast[word.charCodeAt(0)] = options
      }
    }

    // Warn about inappropriate state-switching options
    if (options.pop || options.push || options.next) {
      if (!hasStates) {
        throw new Error(
          "State-switching options are not allowed in stateless lexers (for token '" +
            options.defaultType +
            "')"
        )
      }
      if (options.fallback) {
        throw new Error(
          "State-switching options are not allowed on fallback tokens (for token '" +
            options.defaultType +
            "')"
        )
      }
    }

    // Only rules with a .match are included in the RegExp
    if (match.length === 0) {
      continue
    }
    fastAllowed = false

    groups.push(options)

    // Check unicode flag is used everywhere or nowhere
    for (var j = 0; j < match.length; j++) {
      var obj = match[j]
      if (!isRegExp(obj)) {
        continue
      }
      if (unicodeFlag === null) {
        unicodeFlag = obj.unicode
      } else if (unicodeFlag !== obj.unicode && options.fallback === false) {
        throw new Error("If one rule is /u then all must be")
      }
    }

    // convert to RegExp
    var pat = reUnion(match.map(regexpOrLiteral))

    // validate
    var regexp = new RegExp(pat)
    if (regexp.test("")) {
      throw new Error("RegExp matches empty string: " + regexp)
    }
    var groupCount = reGroups(pat)
    if (groupCount > 0) {
      throw new Error("RegExp has capture groups: " + regexp + "\nUse (?: … ) instead")
    }

    // try and detect rules matching newlines
    if (!options.lineBreaks && regexp.test("\n")) {
      throw new Error("Rule should declare lineBreaks: " + regexp)
    }
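
    // Each rule contributes exactly one capture group to the combined RegExp,
    // so _getGroup can map the first defined group of a match back to its rule.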
    // store regex
    parts.push(reCapture(pat))
  }

  // If there's no fallback rule, use the sticky flag so we only look for
  // matches at the current index.
  //
  // If we don't support the sticky flag, then fake it using an irrefutable
  // match (i.e. an empty pattern).
  var fallbackRule = errorRule && errorRule.fallback
  var flags = hasSticky && !fallbackRule ? "ym" : "gm"
  var suffix = hasSticky || fallbackRule ? "" : "|"

  if (unicodeFlag === true) flags += "u"
  var combined = new RegExp(reUnion(parts) + suffix, flags)
  return { regexp: combined, groups: groups, fast: fast, error: errorRule || defaultErrorRule }
}

function compile(rules) {
  var result = compileRules(toRules(rules))
  return new Lexer({ start: result }, "start")
}

function checkStateGroup(g, name, map) {
  var state = g && (g.push || g.next)
  if (state && !map[state]) {
    throw new Error(
      "Missing state '" + state + "' (in token '" + g.defaultType + "' of state '" + name + "')"
    )
  }
  if (g && g.pop && +g.pop !== 1) {
    throw new Error("pop must be 1 (in token '" + g.defaultType + "' of state '" + name + "')")
  }
}

function compileStates(states, start) {
  var all = states.$all ? toRules(states.$all) : []
  delete states.$all

  var keys = Object.getOwnPropertyNames(states)
  if (!start) start = keys[0]

  var ruleMap = Object.create(null)
  for (var i = 0; i < keys.length; i++) {
    var key = keys[i]
    ruleMap[key] = toRules(states[key]).concat(all)
  }

  for (var i = 0; i < keys.length; i++) {
    var key = keys[i]
    var rules = ruleMap[key]
    var included = Object.create(null)
    for (var j = 0; j < rules.length; j++) {
      var rule = rules[j]
      if (!rule.include) continue
      var splice = [j, 1]
      if (rule.include !== key && !included[rule.include]) {
        included[rule.include] = true
        var newRules = ruleMap[rule.include]
        if (!newRules) {
          throw new Error(
            "Cannot include nonexistent state '" + rule.include + "' (in state '" + key + "')"
          )
        }
        for (var k = 0; k < newRules.length; k++) {
          var newRule = newRules[k]
          if (rules.indexOf(newRule) !== -1) continue
          splice.push(newRule)
        }
      }
      rules.splice.apply(rules, splice)
      j--
    }
  }

  var map = Object.create(null)
  for (var i = 0; i < keys.length; i++) {
    var key = keys[i]
    map[key] = compileRules(ruleMap[key], true)
  }

  for (var i = 0; i < keys.length; i++) {
    var name = keys[i]
    var state = map[name]
    var groups = state.groups
    for (var j = 0; j < groups.length; j++) {
      checkStateGroup(groups[j], name, map)
    }
    var fastKeys = Object.getOwnPropertyNames(state.fast)
    for (var j = 0; j < fastKeys.length; j++) {
      checkStateGroup(state.fast[fastKeys[j]], name, map)
    }
  }

  return new Lexer(map, start)
}

function keywordTransform(map) {
  // Use a JavaScript Map to map keywords to their corresponding token type
  // unless Map is unsupported, then fall back to using an Object:
  var isMap = typeof Map !== "undefined"
  var reverseMap = isMap ? new Map() : Object.create(null)

  var types = Object.getOwnPropertyNames(map)
  for (var i = 0; i < types.length; i++) {
    var tokenType = types[i]
    var item = map[tokenType]
    var keywordList = Array.isArray(item) ? item : [item]
    keywordList.forEach(function (keyword) {
      if (typeof keyword !== "string") {
        throw new Error("keyword must be string (in keyword '" + tokenType + "')")
      }
      if (isMap) {
        reverseMap.set(keyword, tokenType)
      } else {
        reverseMap[keyword] = tokenType
      }
    })
  }
  return function (k) {
    return isMap ? reverseMap.get(k) : reverseMap[k]
  }
}
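
// Illustrative sketch (not part of the library): keywordTransform is meant to
// be used as a `type` transform on an identifier-like rule, so keywords are
// carved out of identifier matches after the RegExp has run. The token names
// below ("kw-while", "kw-if", "identifier", "space") are invented for the
// example.
//
//   var lexer = compile({
//     identifier: {
//       match: /[a-zA-Z]+/,
//       type: keywordTransform({ "kw-while": "while", "kw-if": "if" }),
//     },
//     space: { match: /\s+/, lineBreaks: true },
//   })
//   lexer.reset("while x")
//   lexer.next() // -> { type: "kw-while", value: "while", ... }
//   lexer.next() // -> { type: "space", ... }
//   lexer.next() // -> { type: "identifier", value: "x", ... }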
/***************************************************************************/

var Lexer = function (states, state) {
  this.startState = state
  this.states = states
  this.buffer = ""
  this.stack = []
  this.reset()
}

Lexer.prototype.reset = function (data, info) {
  this.buffer = data || ""
  this.index = 0
  this.line = info ? info.line : 1
  this.col = info ? info.col : 1
  this.queuedToken = info ? info.queuedToken : null
  this.queuedText = info ? info.queuedText : ""
  this.queuedThrow = info ? info.queuedThrow : null
  this.setState(info ? info.state : this.startState)
  this.stack = info && info.stack ? info.stack.slice() : []
  return this
}

Lexer.prototype.save = function () {
  return {
    line: this.line,
    col: this.col,
    state: this.state,
    stack: this.stack.slice(),
    queuedToken: this.queuedToken,
    queuedText: this.queuedText,
    queuedThrow: this.queuedThrow,
  }
}

Lexer.prototype.setState = function (state) {
  if (!state || this.state === state) return
  this.state = state
  var info = this.states[state]
  this.groups = info.groups
  this.error = info.error
  this.re = info.regexp
  this.fast = info.fast
}

Lexer.prototype.popState = function () {
  this.setState(this.stack.pop())
}

Lexer.prototype.pushState = function (state) {
  this.stack.push(this.state)
  this.setState(state)
}

var eat = hasSticky
  ? function (re, buffer) {
      // assume re is /y
      return re.exec(buffer)
    }
  : function (re, buffer) {
      // assume re is /g
      var match = re.exec(buffer)
      // will always match, since we used the |(?:) trick
      if (match[0].length === 0) {
        return null
      }
      return match
    }

Lexer.prototype._getGroup = function (match) {
  var groupCount = this.groups.length
  for (var i = 0; i < groupCount; i++) {
    if (match[i + 1] !== undefined) {
      return this.groups[i]
    }
  }
  throw new Error("Cannot find token type for matched text")
}

function tokenToString() {
  return this.value
}

Lexer.prototype.next = function () {
  var index = this.index

  // If a fallback token matched, we don't need to re-run the RegExp
  if (this.queuedGroup) {
    var token = this._token(this.queuedGroup, this.queuedText, index)
    this.queuedGroup = null
    this.queuedText = ""
    return token
  }

  var buffer = this.buffer
  if (index === buffer.length) {
    return // EOF
  }

  // Fast matching for single characters
  var group = this.fast[buffer.charCodeAt(index)]
  if (group) {
    return this._token(group, buffer.charAt(index), index)
  }

  // Execute RegExp
  var re = this.re
  re.lastIndex = index
  var match = eat(re, buffer)

  // Error tokens match the remaining buffer
  var error = this.error
  if (match == null) {
    return this._token(error, buffer.slice(index, buffer.length), index)
  }

  var group = this._getGroup(match)
  var text = match[0]

  if (error.fallback && match.index !== index) {
    this.queuedGroup = group
    this.queuedText = text

    // Fallback tokens contain the unmatched portion of the buffer
    return this._token(error, buffer.slice(index, match.index), index)
  }

  return this._token(group, text, index)
}

Lexer.prototype._token = function (group, text, offset) {
  // count line breaks
  var lineBreaks = 0
  if (group.lineBreaks) {
    var matchNL = /\n/g
    var nl = 1
    if (text === "\n") {
      lineBreaks = 1
    } else {
      while (matchNL.exec(text)) {
        lineBreaks++
        nl = matchNL.lastIndex
      }
    }
  }

  var token = {
    type: (typeof group.type === "function" && group.type(text)) || group.defaultType,
    value: typeof group.value === "function" ? group.value(text) : text,
    text: text,
    toString: tokenToString,
    offset: offset,
    lineBreaks: lineBreaks,
    line: this.line,
    col: this.col,
  }
  // nb. adding more props to token object will make V8 sad!
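
  // Advance past the matched text. If it contained line breaks, `nl` is the
  // index just past the last "\n", so the new column is measured from there;
  // otherwise the column simply moves forward by the match length.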
  var size = text.length
  this.index += size
  this.line += lineBreaks
  if (lineBreaks !== 0) {
    this.col = size - nl + 1
  } else {
    this.col += size
  }

  // throw, if no rule with {error: true}
  if (group.shouldThrow) {
    var err = new Error(this.formatError(token, "invalid syntax"))
    throw err
  }

  if (group.pop) this.popState()
  else if (group.push) this.pushState(group.push)
  else if (group.next) this.setState(group.next)

  return token
}

if (typeof Symbol !== "undefined" && Symbol.iterator) {
  var LexerIterator = function (lexer) {
    this.lexer = lexer
  }

  LexerIterator.prototype.next = function () {
    var token = this.lexer.next()
    return { value: token, done: !token }
  }

  LexerIterator.prototype[Symbol.iterator] = function () {
    return this
  }

  Lexer.prototype[Symbol.iterator] = function () {
    return new LexerIterator(this)
  }
}

Lexer.prototype.formatError = function (token, message) {
  if (token == null) {
    // An undefined token indicates EOF
    var text = this.buffer.slice(this.index)
    var token = {
      text: text,
      offset: this.index,
      lineBreaks: text.indexOf("\n") === -1 ? 0 : 1,
      line: this.line,
      col: this.col,
    }
  }

  var numLinesAround = 2
  var firstDisplayedLine = Math.max(token.line - numLinesAround, 1)
  var lastDisplayedLine = token.line + numLinesAround
  var lastLineDigits = String(lastDisplayedLine).length
  var displayedLines = lastNLines(
    this.buffer,
    this.line - token.line + numLinesAround + 1
  ).slice(0, 5)
  var errorLines = []
  errorLines.push(message + " at line " + token.line + " col " + token.col + ":")
  errorLines.push("")
  for (var i = 0; i < displayedLines.length; i++) {
    var line = displayedLines[i]
    var lineNo = firstDisplayedLine + i
    // two-space separator keeps the caret below aligned with token.col
    errorLines.push(pad(String(lineNo), lastLineDigits) + "  " + line)
    if (lineNo === token.line) {
      errorLines.push(pad("", lastLineDigits + token.col + 1) + "^")
    }
  }
  return errorLines.join("\n")
}

Lexer.prototype.clone = function () {
  return new Lexer(this.states, this.state)
}

Lexer.prototype.has = function (tokenType) {
  return true
}

export { compile, compileStates as states, keywordTransform as keyword }

export const error = Object.freeze({ error: true })
export const fallback = Object.freeze({ fallback: true })

export default {
  compile,
  states: compileStates,
  keyword: keywordTransform,
  error,
  fallback,
}
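
// Usage sketch (illustrative only, not part of the module; the import path
// "./moo.js" is an assumption about how this file is saved):
//
//   import moo from "./moo.js"
//
//   const lexer = moo.compile({
//     ws: /[ \t]+/,
//     number: /[0-9]+/,
//     word: /[a-z]+/,
//     newline: { match: /\n/, lineBreaks: true },
//   })
//
//   lexer.reset("while 42\n")
//   for (const token of lexer) {
//     console.log(token.type, JSON.stringify(token.value))
//   }
//   // -> word "while", ws " ", number "42", newline "\n"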