The London Perl and Raku Workshop takes place on 26th Oct 2024. If your company depends on Perl, please consider sponsoring and/or attending.
Class('Document.Parser', function() {

var proto = this.prototype;

proto.className = 'Document.Parser';

proto.init = function() {}

proto.parse = function(input, receiver) {
    this.input = (input.search(/\n$/) == -1) ? input+"\n" : input;
    if (receiver) this.receiver = receiver;
    this.receiver.init();
    this.grammar = this.create_grammar();
    this.parse_blocks('top');
    return this.receiver.content();
}

proto.create_grammar = function() {
    throw "Please define create_grammar in a derived class of Document.Parser.";
};

//------------------------------------------------------------------------------
// Parse input into a series of blocks. With each iteration the parser must
// match a block at position 0 of the text, and remove that block from the
// input reparse it further. This continues until there is no input left.
//------------------------------------------------------------------------------
proto.parse_blocks = function(container_type) {
    var types = this.grammar[container_type].blocks; // Document.contains[container_type];
    if (!types) return;
    while (this.input.length) {
        var length = this.input.length;
        for (var i = 0; i < types.length; i++) {
            var type = types[i];
            var matched = this.find_match('matched_block', type);
            if (matched) {
                this.input = this.input.substr(matched.end);
                this.handle_match(type, matched);
                break;
            }
        }
        if (this.input.length >= length)
            throw this.classname + ': Reduction error for:\n' + this.input +
            '\n' + JSON.stringify(this);
    }
    return;
}

proto.handle_match = function(type, match) {
    var grammar = this.grammar[type];
    var parse = grammar.blocks ? 'parse_blocks' : 'parse_phrases';
    // console.log("Subparsing " + parse + '(' + type + '): ');
    // console.log(match);
    this.subparse(parse, match, type, grammar.filter);
}

proto.find_match = function(matched_func, type) {
    var re = this.grammar[type].match;
    if (!re) throw 'no regexp for type: ' + type;
    var capture = this.input.match(re);
    if (capture) {
        // console.log("Found match " + type + " - " + matched_func);
        var match = this[matched_func].call(this, capture, this.grammar[type].lookbehind);
        match.type = this.grammar[type].type || type;
        // console.log(match);
        return match;
    }
    return;
};

//------------------------------------------------------------------------------
// This code parses a chunk into interleaved pieces of plain text and
// phrases. It repeatedly tries to match every possible phrase and
// then takes the match closest to the start. Everything before a
// match is written as text. Matched phrases are subparsed according
// to their rules. This continues until the input is all eaten.
//------------------------------------------------------------------------------
proto.parse_phrases = function(container_type) {
    var types = this.grammar[container_type].phrases;
    if (!types) { this.receiver.text_node(this.input || ''); return }
    // console.log("INPUT: " + this.input);
    while (this.input.length) {
        var match = null;
        for (var i = 0; i < types.length; i++) {
            var type = types[i];
            var matched = this.find_match('matched_phrase', type);
            if (! matched) continue;

            if (!match || (matched.begin < match.begin)) {
                match = matched;
                if (match.begin == 0) break;
            }
        }
        if (!match) {
            // console.log("NO MATCH: " + this.input);
            this.receiver.text_node(this.input || '');
            break;
        }
        if (match.begin != 0) {
            // console.log("MATCH OFFSET:" + this.input + " (" + match.type + ")" + match.begin);
            this.receiver.text_node(this.input.substr(0, match.begin) || '');
            }
        this.input = this.input.substr(match.end);
        this.handle_match(match.type, match);
    }
    return;
}

proto.subparse = function(func, match, type, filter) {
    /* The call could cause side effects to the match object. */
    match.type = this.grammar[type].type;
    if (match.type == null) match.type = type;

    var filtered_text = filter ? filter(match) : null;

    if (match.type) this.receiver.begin_node(match);

    var parser = eval('new ' + this.className + '()');

    parser.input = (filtered_text == null) ? match.text : filtered_text;
    parser.grammar = this.grammar;
    parser.receiver = this.receiver.instantiate();
    // console.log("SEEDED: (" + type + ")" + parser.input);
    parser[func].call(parser, type);
    this.receiver.insert(parser.receiver);

    if (match.type) this.receiver.end_node(match);
}

//------------------------------------------------------------------------------
// Helper functions
//
// These are the odds and ends called by the code above.
//------------------------------------------------------------------------------

/* Blocks has no lookbehinds, so:
 * All match begins at 0. The first capture is text; the next ones are various parts.
 */
proto.matched_block = function(capture) {
    return {
        begin: capture.index,
        text: capture[1],
        end: capture[0].length,
        1: capture[2],
        2: capture[3],
        3: capture[4]
    };
}

/* The first capture in a Phrases is the lookbehind. So:
 */
proto.matched_phrase = function(capture, lookbehind) {
    if (lookbehind) {
        var text = capture[2];
        var begin = this.input.indexOf(capture[1]);
        return {
            text: text,
            begin: begin,
            end: (begin + capture[1].length),
            1: RegExp.$2,
            2: RegExp.$3,
            3: RegExp.$4
        };
    }

    return {
        begin: capture.index,
        text: capture[1],
        end: capture.index + capture[0].length,
        1: capture[2],
        2: capture[3],
        3: capture[4]
    };
}

});