The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
##
# name:      WikiText::Kwiki::Parser
# abstract:  Kwiki WikiText Parser Module
# author:    Ingy döt Net <ingy@cpan.org>
# license:   perl
# copyright: 2008, 2010, 2011

package WikiText::Kwiki::Parser;
use strict;
use warnings;
use base 'WikiText::Parser';

# use XXX;

# Reusable regexp generators used by the grammar
my $ALPHANUM = '\p{Letter}\p{Number}\pM';

# These are all stolen from URI.pm
my $reserved   = q{;/?:@&=+$,[]#};
my $mark       = q{-_.!~*'()};
my $unreserved = "A-Za-z0-9\Q$mark\E";
my $uric       = quotemeta($reserved) . $unreserved . "%";

sub create_grammar {
    my $all_blocks = [
        qw(
            pre
            hr
            hx
            ul
            ol
            table
            p
            empty
            else
        )
    ];
    my $all_phrases = [
        qw(
            tt
            b
            i
            nolink
            camel
            hyper
            force
        )
    ];

    return {
        _all_blocks => $all_blocks,
        _all_phrases => $all_phrases,

        top => {
            blocks => $all_blocks,
        },

        pre => {
            match => qr/^((?: +\S.*\n)(?:(?: +\S.*\n| *\n)*(?: +\S.*\n))?)/,
            filter => sub {
                my $node = shift;
                while (not /^\S/m) {
                    s/^ //gm;
                }
                $node->{text} = $_;
            },
        },

        hr => {
            match => qr/^--+(?:\s*\n)?/,
        },

        hx => {
            match => qr/^(=+) *(.*?)(\s+=+)?\s*?\n+/,
            phrases => $all_phrases,
            filter => sub {
                my $node = shift;
                $node->{type} = 'h' . length($node->{1});
                $_ = $node->{text} = $node->{2};
            },
        },

        ul => {
            match => re_list('\*'),
            blocks => [qw(ul ol subl li)],
            filter => sub { s/^[\*\0] *//mg },
        },

        ol => {
            match => re_list('\0'),
            blocks => [qw(ul ol subl li)],
            filter => sub { s/^[\*\0] *//mg },
        },

        subl => {
            type => 'li',

            match => qr/^(          # Block must start at beginning
                                    # Capture everything in $1
                (.*)\n              # Capture the whole first line
                [\*\0]+\ .*\n      # Line starting with one or more bullet
                (?:[\*\0]+\ .*\n)*  # Lines starting with '*' or '0'
            )(?:\s*\n)?/x,          # Eat trailing lines
            blocks => [qw(ul ol li2)],
        },

        li => {
            match => qr/(.*\n)/,    # Capture the whole line
            phrases => $all_phrases,
        },

        li2 => {
            type => '',
            match => qr/(.*\n)/,    # Capture the whole line
            phrases => $all_phrases,
        },

        table => {
            match => qr/^(
                (
                    (?m:^\|.*\|\ \n(?=\|))
                    |
                    (?m:^\|.*\|\ \ +\n)
                    |
                    (?ms:^\|.*?\|\n)
                )+
            )(?:\s*\n)?/x,
            blocks => ['tr'],
        },

        tr => {
            match => qr/^((?m:^\|.*?\|(?:\n| \n(?=\|)|  +\n)))/s,
            blocks => ['td'],
            filter => sub { s/\s+\z// },
        },

        td => {
            match => qr/\|?\s*(.*?)\s*\|\n?/s,
            phrases => $all_phrases,
        },

        p => {
            match => qr/^(
                (?:
                    (?!
                        [\ \#\=\|] |
                        [\*\0]\ |
                        ----\n
                    )
                    .*\S.*\n
                )+
            )
            (\ *\n)*
            /x,
            phrases => $all_phrases,
            filter => sub { s/ +$//gm },
        },

        empty => {
            match => qr/^(?:#.*|\ *)\n/,
            filter => sub {
                my $node = shift;
                $node->{type} = '';
            },
        },

        else => {
            match => qr/^(.*)\n/,
            phrases => [],
            filter => sub {
                my $node = shift;
                $node->{type} = 'p';
            },
        },

        tt => {
            match => re_huggy(q{\[\=}, q{\]}),
        },

        b => {
            match => re_huggy(q{\*}),
            phrases => $all_phrases,
        },

        i => {
            match => re_huggy(q{\/}),
            phrases => $all_phrases,
        },

        nolink => {
            type => 'text',
            match => qr/!(\w+)/,
        },

        camel => {
            type => 'link',
            match => qr/([A-Z][$ALPHANUM]*[a-z][A-Z][$ALPHANUM]*)/,
            filter => sub {
                my $node = shift;
                $node->{attributes}{target} = $node->{1};
            },
        },

        hyper => {
            type => 'link',
            match => qr{
                (
                    (?:http|https|ftp|irc|file):
                    (?://)?
                    [$uric]+
                    [A-Za-z0-9/#]
                )
            }x,
            filter => sub {
                my $node = shift;
                $node->{attributes}{target} = $node->{1};
            },
        },

        force => {
            type => 'link',
            match => qr/\[([^\]]+)\]/,
            filter => sub {
                my $node = shift;
                $node->{attributes}{target} = $node->{1};
            },
        },
    };
}

sub re_huggy {
    my $brace1 = shift;
    my $brace2 = shift || $brace1;

    qr/
        (?:^|(?<=[^{$ALPHANUM}$brace1]))$brace1(?=\S)(?!$brace2)
        (.*?)
        $brace2(?=[^{$ALPHANUM}$brace2]|\z)
    /x;
}

sub re_list {
    my $bullet = shift;
    return qr/^(            # Block must start at beginning
                            # Capture everything in $1
        ^$bullet+\ .*\n     # Line starting with one or more bullet
        (?:[\*\0]+\ .*\n)*  # Lines starting with '*' or '0'
    )(?:\s*\n)?/x,          # Eat trailing lines
}

1;