// JSDoc source listing: language.js

"use strict";
const LexElement = require("./lex-element.js");
const LexRole = require("./lex-role.js");
const SyntaxRule = require("./syntax-rule.js");
const Term = require("./term.js");
const Token = require("./token.js");
const debug = require("debug")("Language");

/**
 * A language syntax definition.
 *
 * Holds a table of named syntax rules and offers tokenizing, recursive
 * descent parsing and evaluation of the parsed terms. The name of the first
 * rule given to the constructor is used as the root of the syntax.
 * @class
 * @typedef {Array<string|LexElement>} Rule
 * @typedef {function(Term):any} Evaluator
 */
class Language {
    /**
     * Define a language syntax with BNF like description.
     * @constructor
     * @param {SyntaxRule[]} rules array of syntax definition.
     *  The first element is the root rule where `parse` starts.
     * @throws {TypeError} when `rules` is missing or has no first element.
     */
    constructor(rules) {
        // Fail with a readable message instead of the opaque
        // "cannot read properties of undefined" raised by `rules[0].name`.
        if(rules == null || rules[0] == null) {
            throw new TypeError(
                "Language requires a non-empty array of syntax rules");
        }
        /** @type {string} */
        this.root = rules[0].name;
        /** @type {Record<string, SyntaxRule>} */
        this.rules = {};
        rules.forEach(rule => {
            // A later rule with the same name replaces the earlier one.
            this.rules[rule.name] = rule;
        });
    }
    /**
     * Create a syntax rule as an element for parameter of the constructor.
     * @static
     * @public
     * @param {string} name syntax name
     * @param {Rule[]} rules syntax rules
     * @param {Evaluator} evaluator evaluator function
     * @returns {SyntaxRule} syntax object
     */
    static syntax(name, rules, evaluator) {
        return new SyntaxRule(name, rules, evaluator);
    }
    /**
     * Tokenize the source code by lexical analysis.
     * @public
     * @param {string} source source code
     * @returns {Array<Token>} an array of lexical tokens.
     *  It contains tokens type of whitespaces, string literal, number literal, or punctuators.
     *  Types of token:
     *  1. `Language.whitespace`
     *  1. `Language.strlit`
     *  1. `Language.numlit`
     *  1. `Language.punct`
     */
    tokenize(source) {
        return Language.LexAnalyzer.parse(source);
    }
    /**
     * Analyze the codes by recursive descent parsing based on the BNF rule.
     *
     * Parsing starts at the root rule. When the root rule consumed fewer
     * tokens than given, an error is set on the returned term together with
     * the first token that was not consumed.
     * @public
     * @param {string|Array<Token>} source source code or token list
     * @returns {Term} result
     */
    parse(source) {
        const tokens = Array.isArray(source) ? source : this.tokenize(source);
        const result = this._parse(this.root, tokens, 0);
        if(result.nTok < tokens.length) {
            result.term.setError(
                new Error(`syntax error`),
                tokens[result.nTok]);
        }
        return result.term;
    }
    /**
     * Parse one named rule starting at the given token index.
     *
     * The alternatives of the rule are tried in order; the first one whose
     * elements all match wins. When every alternative fails, the returned
     * term carries an error and `nTok` is 0.
     * @private
     * @param {string} name A term name
     * @param {Token[]} tokenList An array of token
     * @param {number} iTokStart index of token to start parsing
     * @returns {{term:Term, nTok:number}} A result; `nTok` is the number of
     *  tokens consumed (whitespace tokens included).
     * @throws {Error} when no syntax rule is registered for `name`.
     */
    _parse(name, tokenList, iTokStart) {
        debug(`parse ${name}`);
        const syntax = this.rules[name];
        if(!syntax) {
            throw new Error(`FATAL: no syntax rule for ${JSON.stringify(name)}`);
        }
        const term = this.createTerm(name);
        const rules = syntax.rules;
        let nTok = 0;
        for(let iRule = 0; iRule < rules.length; iRule++) {
            const rule = rules[iRule];
            debug(`iRule: ${iRule} of ${rules.length} ${JSON.stringify(rule)}`);

            // Every alternative restarts from a clean term at iTokStart.
            term.clear();
            nTok = 0;
            for(let iElement = 0; iElement < rule.length; iElement++) {
                const iToken = iTokStart + nTok;
                if(iToken >= tokenList.length) {
                    // Tokens ran out before the rule was completed.
                    term.setError(new Error(`syntax error`));
                    nTok = 0;
                    break;
                }
                const token = tokenList[iToken];

                // Treat whitespaces
                if(token.isWhiteSpace()) {
                    // Whitespaces are stored to the term, but not to be
                    // analyzed, because whitespaces had no meanings.
                    term.addTerm(token);

                    // Next token will be analyzed by same rule element;
                    nTok++;
                    iElement--;
                    continue;
                }

                const element = rule[iElement];
                if(typeof element === "string") {
                    // A string element names another rule: descend into it.
                    const ruleName = element;
                    const subResult = this._parse(ruleName, tokenList, iToken);
                    if(subResult.term.error) {
                        term.setError(
                            new Error(`syntax error`),
                            subResult.term.errorToken);
                        nTok = 0;
                        break;
                    }
                    term.addTerm(subResult.term);
                    nTok += subResult.nTok;
                } else {
                    // Otherwise the element is matched against one token.
                    const lexElement = element; // as LexElement
                    if(!lexElement.isMatch(token)) {
                        term.setError(new Error(`syntax error`), token);
                        nTok = 0;
                        break;
                    }
                    term.addTerm(token);
                    nTok++;
                }
            }
            if(term.error == null) {
                // Parsing ends with no error
                break;
            }
            if(iRule + 1 >= rules.length) {
                // The last alternative failed too.
                term.setError(new Error(`syntax error`));
                break;
            }
        }
        return {term, nTok};
    }
    /**
     * Evaluate an analyzed code.
     *
     * The evaluator of the rule named by the term is called when there is
     * one. A rule without evaluator is evaluated through its only element
     * when that element is a `Term`.
     * @public
     * @param {Term} term A term returned from `Language#parse'
     * @returns {any} A value returned from the evaluator for the term.
     * @throws {Error} when no rule is registered for the term name, or when
     *  the term cannot be evaluated by either way described above.
     */
    evaluate(term) {
        const _eval = (term) => {
            const name = term.name;
            const syntax = this.rules[name];
            if(!syntax) {
                // Same failure mode and message as `_parse`.
                throw new Error(
                    `FATAL: no syntax rule for ${JSON.stringify(name)}`);
            }
            let value;
            if(syntax.evaluator) {
                value = syntax.evaluator(term);
            } else if(term.elements.length === 1 &&
                term.elements[0] instanceof Term)
            {
                value = _eval(term.elements[0]);
            } else {
                throw new Error(
                    `FATAL: no evaluator for the rule named ${name}`);
            }
            debug(`${name}.value: ${JSON.stringify(value)}`);
            return value;
        };
        return _eval(term);
    }
    /**
     * Create an empty term for the rule.
     *
     * The callbacks `contents` and `str` handed to `Term.create` are bound to
     * the created term and delegate to `getContents` and `getString`.
     * @private
     * @param {string} name A rule name
     * @returns {Term} A term representing the rule
     */
    createTerm(name) {
        const term = Term.create(name, {
            contents: () => this.getContents(term),
            str: () => this.getString(term),
        });
        return term;
    }
    /**
     * Get the values of the elements in the term except for the whitespaces.
     * @private
     * @param {Term} term term as a result that the parse method returns
     * @returns {Array} One value for each non-whitespace element: the
     *  evaluated value for a sub-term, or `getTerm()` of a token.
     */
    getContents(term) {
        const contents = term.elements.filter(
            // remove the whitespaces
            e => (e instanceof Term || !e.isWhiteSpace())
        ).map(e => {
            if(e instanceof Term) {
                const term = e; // as Term
                return this.evaluate(term);
            } else {
                const token = e; // as Token
                return token.getTerm();
            }
        });
        return contents;
    }
    /**
     * Get a string by joining the values of all the elements in the term.
     * Unlike `getContents`, the whitespaces are included.
     * @private
     * @param {Term} term term as a result that the parse method returns
     * @returns {string} The values of the elements joined with no separator.
     *  A sub-term contributes its evaluated value, a token its `getTerm()`.
     */
    getString(term) {
        const contents = term.elements.map(e => {
            if(e instanceof Term) {
                const term = e; // as Term
                return this.evaluate(term);
            } else {
                const token = e; // as Token
                return token.getTerm();
            }
        });
        return contents.join("");
    }
}
/**
 * A lexical analyzer that splits a source text into tokens, one character
 * at a time.
 *
 * Each token gets one of the types `LexAnalyzer.WS`, `LexAnalyzer.STRLIT`,
 * `LexAnalyzer.NUMLIT` or `LexAnalyzer.PUNCT`, and the line number and the
 * column position (both starting from 1) of its first character.
 * @private
 * @class
 */
const LexAnalyzer = class {
    /**
     * @constructor
     */
    constructor() {
        this._mode = "";
        this._tokenList = [];
        this._token = null;
        this._source = "";
        this._lineNum = 1;
        this._columnPos = 1;
        this._i = 0;
        this._c = null;
    }
    /**
     * Parse lexical tokens with a new analyzer instance.
     * @param {string} source The source text to be parsed.
     * @returns {Token[]} The result of parsing.
     */
    static parse(source) {
        const parser = new LexAnalyzer();
        return parser.parse(source);
    }

    /**
     * Test if the character is a kind of white-spaces.
     * @param {string} c a character.
     * @returns {boolean} the test result.
     */
    static isWhite(c) {
        return /^\s/.test(c);
    }

    /**
     * Test if the character is a kind of alphabet.
     * The underscore is treated as an alphabet.
     * @param {string} c a character.
     * @returns {boolean} the test result.
     */
    static isAlpha(c) {
        return /^[_a-z]/i.test(c);
    }

    /**
     * Test if the character is a kind of digits.
     * @param {string} c a character.
     * @returns {boolean} the test result.
     */
    static isDigit(c) {
        return /^[0-9]/.test(c);
    }

    /**
     * Parse lexical tokens.
     * All the states of this analyzer are reset before parsing.
     * @param {string} source The source text to be parsed.
     * @returns {Array<Token>} The result of parsing.
     */
    parse(source) {
        this._mode = "";
        this._tokenList = [];
        this._token = null;
        this._lineNum = 1;
        this._columnPos = 1;
        this._i = 0;
        this._c = null;
        this._source = source;
        while(this._i < this._source.length) {
            this._c = this._source.charAt(this._i);
            switch(this._mode) {
                case "":
                    this.parseDefault();
                    break;
                case LexAnalyzer.WS:
                    this.parseWhiteSpace();
                    break;
                case LexAnalyzer.STRLIT:
                    this.parseIdentifier();
                    break;
                case LexAnalyzer.NUMLIT:
                    this.parseNumberLiteral();
                    break;
            }
            // A handler that called ungetChar() makes the same character
            // be read again in the next iteration.
            ++this._i;
            ++this._columnPos;
        }
        if(this._token != null) {
            // The source ended in the middle of a token.
            this.finishToken();
        }
        return this._tokenList;
    }

    /**
     * Parse in initial state.
     * Starts a new token at the current position and selects the mode by
     * its first character. A punctuator is finished immediately as a
     * single character token.
     * @returns {undefined}
     */
    parseDefault() {
        this._token = new Token();
        this._token.setPos(this._lineNum, this._columnPos);
        if(LexAnalyzer.isWhite(this._c)) {
            this._token.pushChar(this._c);
            this._mode = LexAnalyzer.WS;
            // A line feed may be the FIRST character of a whitespace token;
            // it must advance the line counter here as well.
            this.countNewLine();
        } else if(LexAnalyzer.isAlpha(this._c)) {
            this._token.pushChar(this._c);
            this._mode = LexAnalyzer.STRLIT;
        } else if(LexAnalyzer.isDigit(this._c)) {
            this._token.pushChar(this._c);
            this._mode = LexAnalyzer.NUMLIT;
        } else /* if(LexAnalyzer.isPunct(this._c)) */ {
            this._token.pushChar(this._c);
            this.finishToken(LexAnalyzer.PUNCT);
        }
    }

    /**
     * Parse white-spaces.
     * @returns {undefined}
     */
    parseWhiteSpace() {
        if(LexAnalyzer.isWhite(this._c)) {
            this._token.pushChar(this._c);
            this.countNewLine();
        } else {
            this.finishToken();
            this.ungetChar();
        }
    }

    /**
     * Parse an identifier.
     * Only the characters accepted by `isAlpha` continue the token.
     * @returns {undefined}
     */
    parseIdentifier() {
        if(LexAnalyzer.isAlpha(this._c)) {
            this._token.pushChar(this._c);
        } else {
            this.finishToken();
            this.ungetChar();
        }
    }

    /**
     * Parse a number literal.
     * @returns {undefined}
     */
    parseNumberLiteral() {
        if(LexAnalyzer.isDigit(this._c)) {
            this._token.pushChar(this._c);
        } else {
            this.finishToken();
            this.ungetChar();
        }
    }

    /**
     * Update the position counters when the current character is a line
     * feed. The column is set to 0 because the main loop increments it
     * right after, so that the next character gets the column 1.
     * @returns {undefined}
     */
    countNewLine() {
        if(this._c === "\n") {
            this._columnPos = 0;
            this._lineNum++;
        }
    }

    /**
     * Finish the token parsing.
     * The token is appended to the result and the mode returns to initial.
     * @param {string} mode A mode name to be set to the token finally.
     * If this parameter is omitted, the tokenizer's current mode is used.
     * @returns {undefined}
     */
    finishToken(mode) {
        this._token.setType(mode || this._mode);
        this._token.fixTerm();
        this._tokenList.push(this._token);
        this._token = null;
        this._mode = "";
    }

    /**
     * Push back the parsing char.
     * @returns {undefined}
     */
    ungetChar() {
        --this._i;
        --this._columnPos;
    }
};
// Token types, also used as the mode names of the analyzer.
LexAnalyzer.WS = "WS";
LexAnalyzer.STRLIT = "STRLIT";
LexAnalyzer.NUMLIT = "NUMLIT";
LexAnalyzer.PUNCT = "PUNCT";

/**
 * Create a rule element with the role `LexRole.lit` for the value.
 * @param {string} value the value given to the element
 * @returns {LexElement} a rule element
 */
Language.literal = function(value) {
    return new LexElement(LexRole.lit, value, false);
};

/**
 * Create a rule element with the role `LexRole.lex` for the value.
 * @param {string} value the value given to the element
 * @returns {LexElement} a rule element
 */
Language.lex = function(value) {
    return new LexElement(LexRole.lex, value, false);
};

// Predefined rule elements, one for each token type of the lexical analyzer.
[
    ["whitespace", LexAnalyzer.WS],
    ["strlit", LexAnalyzer.STRLIT],
    ["numlit", LexAnalyzer.NUMLIT],
    ["punct", LexAnalyzer.PUNCT],
].forEach(([property, tokenType]) => {
    Language[property] = Language.lex(tokenType);
});

// The analyzer is reachable through the class; `Language#tokenize` uses it.
Language.LexAnalyzer = LexAnalyzer;
module.exports = Language;