The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.

package WAP::wbxml;

use strict;
use warnings;

our $VERSION = '1.13';

=head1 NAME

WAP::wbxml - Binarization of XML file

=head1 SYNOPSIS

  use XML::DOM;
  use WAP::wbxml;

  $parser = new XML::DOM::Parser;
  $doc_xml = $parser->parsefile($infile);

  $rules = WAP::wbxml::WbRules::Load();
  $wbxml = new WAP::wbxml($rules, $publicid);
  $output = $wbxml->compile($doc_xml, $encoding);

=head1 DESCRIPTION

This module implements binarisation of XML file according the specification :

WAP - Wireless Application Protocol /
Binary XML Content Format Specification /
Version 1.3 WBXML (15th May 2000 Approved)

The XML input file must refere to a DTD with a public identifier.

The file WAP/wap.wbrules.xml configures this tool for all known DTD.

This module needs I18N::Charset and XML::DOM modules.

WAP Specifications, including Binary XML Content Format (WBXML)
 are available on E<lt>http://www.wapforum.org/E<gt>.

=over 4

=cut

use integer;
use bytes;

use MIME::Base64;
use WAP::wbxml::WbRules;
use XML::DOM;

# Global tokens
use constant SWITCH_PAGE    => 0x00;
use constant _END           => 0x01;
use constant ENTITY         => 0x02;
use constant STR_I          => 0x03;
use constant LITERAL        => 0x04;
use constant EXT_I_0        => 0x40;
use constant EXT_I_1        => 0x41;
use constant EXT_I_2        => 0x42;
use constant PI             => 0x43;
use constant LITERAL_C      => 0x44;
use constant EXT_T_0        => 0x80;
use constant EXT_T_1        => 0x81;
use constant EXT_T_2        => 0x82;
use constant STR_T          => 0x83;
use constant LITERAL_A      => 0x84;
use constant EXT_0          => 0xC0;
use constant EXT_1          => 0xC1;
use constant EXT_2          => 0xC2;
use constant OPAQUE         => 0xC3;
use constant LITERAL_AC     => 0xC4;
# Global token masks
use constant NULL           => 0x00;
use constant HAS_CHILD      => 0x40;
use constant HAS_ATTR       => 0x80;

=item new

 $wbxml = new WAP::wbxml($rules, $publicid);

Create a instance of WBinarizer for a specified kind of DTD.

If PublicId is undefined, the first found rules are used.

If the DTD is not known in the rules, default rules are used.

=cut

sub new {
    my $proto = shift;
    my $class = ref($proto) || $proto;
    my $self = {};
    bless $self, $class;
    my ($rules, $publicid) = @_;
    $self->{publicid} = $publicid;
    $self->{rules} = $rules;
    if ($publicid) {
        $self->{rulesApp} = $rules->{App}->{$publicid};
        unless ($self->{rulesApp}) {
            $self->{rulesApp} = $rules->{DefaultApp};
            warn "Using default rules.\n";
        }
    }
    else {
        my ($val) = values %{$rules->{App}};
        $self->{rulesApp} = $val;
    }
    $self->{skipDefault} = $self->{rulesApp}->{skipDefault};
    $self->{variableSubs} = $self->{rulesApp}->{variableSubs};
    $self->{tagCodepage} = 0;
    $self->{attrCodepage} = 0;
    return $self;
}

sub compileDatetime {
    # WAP / WML
    my $self = shift;
    my ($content) = @_;
    my $str;
    if ($content =~ /(\d+)-(\d+)-(\d+)T(\d+)\.(\d+)\.(\d+)Z/) {
        my $year  = chr (16 * ($1 / 1000) + (($1 / 100) % 10))
                  . chr (16 * (($1 / 10) % 10) + ($1 % 10));
        my $month = chr (16 * ($2 / 10) + ($2 % 10));
        my $day   = chr (16 * ($3 / 10) + ($3 % 10));
        my $hour  = chr (16 * ($4 / 10) + ($4 % 10));
        my $min   = chr (16 * ($5 / 10) + ($5 % 10));
        my $sec   = chr (16 * ($6 / 10) + ($6 % 10));
        $str  = $year . $month . $day;
        $str .= $hour if (ord $hour or ord $min or ord $sec);
        $str .= $min  if (ord $min or ord $sec);
        $str .= $sec  if (ord $sec);
    }
    else {
        warn "Validate 'Datetime' error : $content.\n";
        $str = "\x19\x70\x01\x01";
    }
    $self->putb('body', OPAQUE);
    $self->putmb('body', length $str);
    $self->putstr('body', $str);
}

sub compileBinaryWV {
    # WV
    my $self = shift;
    my ($value) = @_;
    $value =~ s/\s+//g;
    my $data = decode_base64($value);
    if (length $data) {
        $self->putb('body', OPAQUE);
        $self->putmb('body', length $data);
        $self->putstr('body', $data);
    }
}

sub compileIntegerWV {
    # WV
    my $self = shift;
    my ($value) = @_;
    $value =~ s/\s+/ /g;
    unless ($value =~ /^\s*$/) {
        if ($value < 0 and $value > 4294967295) {
            warn "'Integer' error : $value.\n";
            $self->compilePreserveStringI($value);
        }
        else {
            $self->putb('body', OPAQUE);
            if    ($value < 256) {
                $self->putmb('body', 1);
                $self->putb('body', $value);
            }
            elsif ($value < 65536) {
                $self->putmb('body', 2);
                $self->putstr('body', pack("n", $value));
            }
            else {
                $self->putmb('body', 4);
                $self->putstr('body', pack("N", $value));
            }
        }
    }
}

sub compileDatetimeWV {
    # WV
    my $self = shift;
    my ($content) = @_;
    my $str;
    if ($content =~ /^\s*(\d\d\d\d)(\d\d)(\d\d)T(\d\d)(\d\d)(\d\d)(Z)?\s*$/) {
        my $year  = $1;
        my $month = $2;
        my $day   = $3;
        my $hour  = $4;
        my $min   = $5;
        my $sec   = $6;
        my $tz    = $7 || "\0";
        $self->putb('body', OPAQUE);
        $self->putmb('body', 6);
        $self->putb('body', $year >> 6);
        $self->putb('body', (($year & 0x03F) << 2) | ($month >> 2));
        $self->putb('body', (($month & 0x3) << 6) | ($day << 1) | ($hour >> 4));
        $self->putb('body', (($hour & 0xF) << 4) | ($min >> 2));
        $self->putb('body', (($min & 0x3) << 6) | $sec);
        $self->putb('body', ord $tz);
    }
    else {
        warn "'Datetime' error : $content.\n";
        $self->compilePreserveStringI($content);
    }
}

sub compilePreserveStringT {
    my $self = shift;
    my ($str) = @_;
    if (exists $self->{h_str}->{$str}) {
        $self->putmb('body', $self->{h_str}->{$str});
    }
    else {
        my $pos = length $self->{strtbl};
        $self->{h_str}->{$str} = $pos;
#       print $pos," ",$str,"\n";
        $self->putmb('body', $pos);
        $self->putstr('strtbl', $str);
        $self->putb('strtbl', NULL);
    }
}

sub compilePreserveStringI {
    my $self = shift;
    my ($str) = @_;
    my $idx = $self->{rulesApp}->getExtValue($str, 'Ext0Values');
    if (defined $idx) {
        $self->putb('body', EXT_T_0);
        $self->putmb('body', $idx);
        return;
    }
    $idx = $self->{rulesApp}->getExtValue($str, 'Ext1Values');
    if (defined $idx) {
        $self->putb('body', EXT_T_1);
        $self->putmb('body', $idx);
        return;
    }
    $idx = $self->{rulesApp}->getExtValue($str, 'Ext2Values');
    if (defined $idx) {
        $self->putb('body', EXT_T_2);
        $self->putmb('body', $idx);
        return;
    }
    $self->putb('body', STR_I);
    $self->putstr('body', $str);
    $self->putb('body', NULL);
}

sub compileStringI {
    my $self = shift;
    my ($str) = @_;
    $str =~ s/\s+/ /g;
    $self->compilePreserveStringI($str) unless ($str =~ /^\s*$/);
}

sub compileStringIwithVariables {
    # WAP / WML
    my $self = shift;
    my ($str) = @_;
    my $text = '';
    while ($str) {
        for ($str) {
            s/^([^\$]+)//
                    and $text .= $1,
                        last;

            s/^\$\$//
                    and $text .= '$',
                        last;

            s/^\$([A-Z_a-z][0-9A-Z_a-z]*)//
                    and $self->compileStringI($text),
                        $text = q{},
                        $self->putb('body', EXT_T_2),
                        $self->compilePreserveStringT($1),
                        last;

            s/^\$\(\s*([A-Z_a-z][0-9A-Z_a-z]*)\s*\)//
                    and $self->compileStringI($text),
                        $text = q{},
                        $self->putb('body', EXT_T_2),
                        $self->compilePreserveStringT($1),
                        last;

            s/^\$\(\s*([A-Z_a-z][0-9A-Z_a-z]*)\s*:\s*escape\s*\)//
                    and $self->compileStringI($text),
                        $text = q{},
                        $self->putb('body', EXT_T_0),
                        $self->compilePreserveStringT($1),
                        last;

            s/^\$\(\s*([A-Z_a-z][0-9A-Z_a-z]*)\s*:\s*unesc\s*\)//
                    and $self->compileStringI($text),
                        $text = q{},
                        $self->putb('body', EXT_T_1),
                        $self->compilePreserveStringT($1),
                        last;

            s/^\$\(\s*([A-Z_a-z][0-9A-Z_a-z]*)\s*:\s*noesc\s*\)//
                    and $self->compileStringI($text),
                        $text = q{},
                        $self->putb('body', EXT_T_2),
                        $self->compilePreserveStringT($1),
                        last;

            s/^\$\(\s*([A-Z_a-z][0-9A-Z_a-z]*)\s*:\s*([Ee]([Ss][Cc][Aa][Pp][Ee])?)\s*\)//
                    and $self->compileStringI($text),
                        $text = q{},
                        $self->putb('body', EXT_T_0),
                        $self->compilePreserveStringT($1),
                        warn "deprecated-var : $1:$2\n",
                        last;

            s/^\$\(\s*([A-Z_a-z][0-9A-Z_a-z]*)\s*:\s*([Uu][Nn]([Ee][Ss][Cc])?)\s*\)//
                    and $self->compileStringI($text),
                        $text = q{},
                        $self->putb('body', EXT_T_1),
                        $self->compilePreserveStringT($1),
                        warn "deprecated-var : $1:$2\n",
                        last;

            s/^\$\(\s*([A-Z_a-z][0-9A-Z_a-z]*)\s*:\s*([Nn][Oo]([Ee][Ss][Cc])?)\s*\)//
                    and $self->compileStringI($text),
                        $text = q{},
                        $self->putb('body', EXT_T_2),
                        $self->compilePreserveStringT($1),
                        warn "deprecated-var : $1:$2\n",
                        last;

            warn "Pb with: $str \n";
            return;
        }
    }
    $self->compileStringI($text);
}

sub compileEntity {
    my $self = shift;
    my ($entity) = @_;
    if (exists $self->{rulesApp}->{CharacterEntity}{$entity}) {
        my $code = $self->{rulesApp}->{CharacterEntity}{$entity};
        $self->putb('body', ENTITY);
        $self->putmb('body', $code);
    }
    else {
        warn "entity reference : $entity";
    }
}

sub compileAttributeExtToken {
    my $self = shift;
    my ($ext_token) = @_;
    my $codepage = $ext_token / 256;
    my $token = $ext_token % 256;
    if ($codepage != $self->{attrCodepage}) {
        $self->putb('body', SWITCH_PAGE);
        $self->putb('body', $codepage);
        $self->{attrCodepage} = $codepage;
    }
    $self->putb('body', $token);
}

sub compileTagExtToken {
    my $self = shift;
    my ($ext_token) = @_;
    my $codepage = $ext_token / 256;
    my $token = $ext_token % 256;
    if ($codepage != $self->{tagCodepage}) {
        $self->putb('body', SWITCH_PAGE);
        $self->putb('body', $codepage);
        $self->{tagCodepage} = $codepage;
    }
    $self->putb('body', $token);
}

sub compileAttributeValues {
    my $self = shift;
    my ($value) = @_;
    my $attr;
    my $start;
    my $end;
    while (1) {
        ($attr, $start, $end) = $self->{rulesApp}->getAttrValue($value, $self->{attrCodepage});
        last unless ($attr);
        $self->compilePreserveStringI($start) if (defined $start);
        $self->compileAttributeExtToken($attr->{ext_token});
        $value = $end;
    }
    $self->compilePreserveStringI($start) if (defined $start);
}

sub compileProcessingInstruction {
    my $self = shift;
    my ($target, $data) = @_;
    $self->putb('body', PI);
    my ($attr_start, $dummy) = $self->{rulesApp}->getAttrStart($target, q{}, $self->{attrCodepage});
    if ($attr_start) {
        # well-known attribute name
        $self->compileAttributeExtToken($attr_start->{ext_token});
    }
    else {
        # unknown attribute name
        $self->putb('body', LITERAL);
        $self->compilePreserveStringT($target);
    }
    if (defined $data) {
        $self->compileAttributeValues($data);
    }
    $self->putb('body', _END);
}

sub prepareAttribute {
    my $self = shift;
    my ($tagname, $attr) = @_;
    my $attr_name = $attr->getName();
    my $attr_value = $attr->getValue();
    my ($attr_start, $remain) = $self->{rulesApp}->getAttrStart($attr_name, $attr_value, $self->{attrCodepage});
    if ($attr_start) {
        # well-known attribute name
        my $default_list = $attr_start->{default} || q{};
        my $fixed_list = $attr_start->{fixed} || q{};
        if (! $remain) {
            return 0 if (index($fixed_list, $tagname) >= 0);
            return 0 if ($self->{skipDefault} and index($default_list, $tagname) >= 0);
        }
    }
    return 1;
}

sub compileAttribute {
    my $self = shift;
    my ($tagname, $attr) = @_;
    my $attr_name = $attr->getName();
    my $attr_value = $attr->getValue();
    my ($attr_start, $remain) = $self->{rulesApp}->getAttrStart($attr_name, $attr_value, $self->{attrCodepage});
    if ($attr_start) {
        # well-known attribute name
        my $default_list = $attr_start->{default} || q{};
        my $fixed_list = $attr_start->{fixed} || q{};
        my $validate = $attr_start->{validate} || q{};
        my $encoding = $attr_start->{encoding} || q{};
        unless ($remain) {
            return if (index($fixed_list, $tagname) >= 0);
            return if ($self->{skipDefault} and index($default_list, $tagname) >= 0);
        }
        $self->compileAttributeExtToken($attr_start->{ext_token});

        if ($encoding eq 'iso-8601') {
            $self->compileDatetime($attr_value);
        }
        else {
            if ($remain ne q{}) {
                if ($validate eq 'length') {
                    warn "Validate 'length' error : $remain.\n"
                            unless ($remain =~ /^[0-9]+%?$/);
                    $self->compilePreserveStringI($remain);
                }
                else {
                    if ($self->{variableSubs} and $validate eq 'vdata') {
                        if (index($remain, "\$") >= 0) {
                            $self->compileStringIwithVariables($remain);
                        }
                        else {
                            $self->compileAttributeValues($remain);
                        }
                    }
                    else {
                        $self->compileAttributeValues($remain);
                    }
                }
            }
        }
    }
    else {
        # unknown attribute name
        $self->putb('body', LITERAL);
        $self->compilePreserveStringT($attr_name);
        $self->putb('body', STR_T);
        $self->compilePreserveStringT($attr_value);
    }
}

sub compileElement {
    my $self = shift;
    my ($elt, $xml_lang, $xml_space) = @_;
    my $cpl_token = NULL;
    my $tagname = $elt->getNodeName();
    my $attrs = $elt->getAttributes();
    if ($attrs->getLength()) {
        my $attr;
        $attr = $elt->getAttribute('xml:lang');
        $xml_lang = $attr if ($attr);
        $attr = $elt->getAttribute('xml:space');
        $xml_space = $attr if ($attr);
        my $nb = 0;
        for (my $i = 0; $i < $attrs->getLength(); $i ++) {
            my $attr = $attrs->item($i);
            if ($attr->getNodeType() == ATTRIBUTE_NODE) {
                $nb += $self->prepareAttribute($tagname, $attr);
            }
        }
        $cpl_token |= HAS_ATTR if ($nb);
    }
    if ($elt->hasChildNodes()) {
        $cpl_token |= HAS_CHILD;
    }
    my $tag_token = $self->{rulesApp}->getTag($tagname, $self->{tagCodepage});
    if ($tag_token) {
        # well-known tag name
        $self->compileTagExtToken($cpl_token | $tag_token->{ext_token});
    }
    else {
        # unknown tag name
        $self->putb('body', $cpl_token | LITERAL);
        $self->compilePreserveStringT($tagname);
    }
    if ($cpl_token & HAS_ATTR) {
        for (my $i = 0; $i < $attrs->getLength(); $i ++) {
            my $attr = $attrs->item($i);
            if ($attr->getNodeType() == ATTRIBUTE_NODE) {
                $self->compileAttribute($tagname, $attr);
            }
        }
        $self->putb('body', _END);
    }
    if ($cpl_token & HAS_CHILD) {
        $self->compileContent($elt->getFirstChild(), $tag_token, $xml_lang, $xml_space);
        $self->putb('body', _END);
    }
}

sub compileContent {
    my $self = shift;
    my ($tag, $parent, $xml_lang, $xml_space) = @_;
    for (my $node = $tag;
            $node;
            $node = $node->getNextSibling() ) {
        my $type = $node->getNodeType();
        if    ($type == ELEMENT_NODE) {
            $self->compileElement($node, $xml_lang, $xml_space);
        }
        elsif ($type == TEXT_NODE) {
            my $value = $node->getNodeValue();
            if ($self->{variableSubs}) {
                $self->compileStringIwithVariables($value);
            }
            else {
                if ($xml_space eq 'preserve') {
                    $self->compilePreserveStringI($value) unless ($value =~ /^\s*$/);
                }
                else {
                    my $encoding = ($parent and exists $parent->{encoding}) ? $parent->{encoding} : "";
                    if    ($encoding eq 'base64') {
                        $self->compileBinaryWV($value);
                    }
                    elsif ($encoding eq 'datetime') {
                        $self->compileDatetimeWV($value);
                    }
                    elsif ($encoding eq 'integer') {
                        $self->compileIntegerWV($value);
                    }
                    else {
                        $self->compileStringI($value);
                    }
                }
            }
        }
        elsif ($type == CDATA_SECTION_NODE) {
            my $value = $node->getNodeValue();
            $self->compilePreserveStringI($value);
        }
        elsif ($type == COMMENT_NODE) {
            # do nothing
        }
        elsif ($type == ENTITY_REFERENCE_NODE) {
            $self->compileEntity($node->getNodeName());
        }
        elsif ($type == PROCESSING_INSTRUCTION_NODE) {
            my $target = $node->getTarget();
            my $data = $node->getData();
            $self->compileProcessingInstruction($target, $data);
        }
        else {
            die "unexcepted ElementType in compileContent : $type\n";
        }
    }
}

sub compileBody {
    my $self = shift;
    my ($doc) = @_;
    my $xml_lang = q{};
    my $xml_space = $self->{rulesApp}->{xmlSpace};
    for (my $node = $doc->getFirstChild();
            $node;
            $node = $node->getNextSibling() ) {
        my $type = $node->getNodeType();
        if      ($type == ELEMENT_NODE) {
            $self->compileElement($node, $xml_lang, $xml_space);
        }
        elsif ($type == PROCESSING_INSTRUCTION_NODE) {
            my $target = $node->getTarget();
            my $data = $node->getData();
            $self->compileProcessingInstruction($target, $data);
        }
    }
}

sub compileCharSet {
    my $self = shift;
    my ($encoding) = @_;
    if (defined $encoding) {
        eval "use I18N::Charset";
        die $@ if ($@);
        my $mib = charset_name_to_mib($encoding);
        if (defined $mib) {
            $self->putmb('header', $mib);
        }
        else {
            warn "unknown encoding.\n";
            $self->putmb('header', 0);  # unknown encoding
        }
    }
    else {
        $self->putmb('header', 106);        # UTF-8 : default XML encoding
    }
}

sub compilePublicId {
    my $self = shift;
    if (! $self->{publicid}) {
        $self->putmb('header', 1);
    }
    elsif (exists $self->{rules}->{PublicIdentifiers}->{$self->{publicid}}) {
        my $publicid = $self->{rules}->{PublicIdentifiers}->{$self->{publicid}};
        $self->putmb('header', $publicid);
    }
    else {
        $self->putb('header', NULL);
        my $pos = length $self->{strtbl};   # 0
        $self->{h_str}->{$self->{publicid}} = $pos;
        $self->putmb('header', $pos);
        $self->putstr('strtbl', $self->{publicid});
        $self->putb('strtbl', NULL);
    }
}

sub compileVersion {
    my $self = shift;
    $self->putb('header', $self->{rules}->{version});
}

=item compile

 $output = $wbxml->compile($doc_xml, $encoding);

Compiles a XML document.

=cut

sub compile {
    my $self = shift;
    my ($doc, $encoding) = @_;
    $self->{header} = q{};
    $self->{body} = q{};
    $self->{strtbl} = q{};
    $self->{h_str} = {};
    $self->{tagCodepage} = 0;
    $self->{attrCodepage} = 0;
    $self->compileVersion();
    $self->compilePublicId();
    $self->compileCharSet($encoding);
    $self->compileBody($doc);
    $self->putmb('header', length $self->{strtbl});
    my $out = $self->{header} . $self->{strtbl} . $self->{body};
    return $out;
}

=item outfile

 $filename = $wbxml->outfile($infile);

Builds output filename with the good extension.

=cut

sub outfile {
    my $self = shift;
    my ($infile) = @_;
    my $filename = $infile;
    if ($filename =~ /\.[^\.]+$/) {
        $filename =~ s/\.[^\.]+$/\./;
    }
    else {
        $filename .= '.';
    }
    $filename .= $self->{rulesApp}->{tokenisedExt};
    return $filename;
}

sub putb {
    my $self = shift;
    my ($str, $val) = @_;
    $self->{$str} = $self->{$str} . chr $val;
}

sub putmb {
    my $self = shift;
    my ($str, $val) = @_;
    my $tmp = chr ($val & 0x7f);
    for ($val >>= 7; $val != 0; $val >>= 7) {
        $tmp = chr (0x80 | ($val & 0x7f)) . $tmp;
    }
    $self->{$str} = $self->{$str} . $tmp;
}

sub putstr {
    my $self = shift;
    my ($str, $val) = @_;
    $self->{$str} = $self->{$str} . $val;
}

=back

=head1 SEE ALSO

wbxmlc, WAP::SAXDriver::wbxml

=head1 COPYRIGHT

(c) 2001-2011 Francois PERRAD, France.

This program (WAP::wbxml.pm and the internal DTD of wbrules.xml) is distributed
under the terms of the Artistic Licence.

The WAP Specification are copyrighted by the Wireless Application Protocol Forum Ltd.
See E<lt>http://www.wapforum.org/what/copyright.htmE<gt>.

=head1 AUTHOR

Francois PERRAD, francois.perrad@gadz.org

=cut

1;