The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/local/bin/perl
#
# $Id: ucmlint,v 0.2 2002/04/22 02:45:50 dankogai Exp $
#

use strict;
our  $VERSION = do { my @r = (q$Revision: 0.2 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };

use Getopt::Std;
our %Opt;
getopts("Dehfv", \%Opt);

if ($Opt{e}){
   eval{ require Encode; };
   $@ and die "can't load Encode : $@";
}

$Opt{h} and help();
@ARGV or help();

sub help{
    print <<"";
$0 -[Dehfv] [ucm files ...]
  -D debug mode on
  -e test with Encode module also (requires perl 5.7.3 or higher)
  -h shows this message
  -f forces roundtrip check even for |[123]
  -v verbose mode

}

$| = 1;
my (%Hdr, %U2E, %E2U);
my $in_charmap = 0;
my $nerror = 0;
my $nwarning = 0;

sub nit($;$){
    my ($msg, $level) = @_;
    my $lstr;
    if ($level == 2){
	$lstr = 'notice';
    }elsif ($level == 1){
	$lstr = 'warning'; $nwarning++;
    }else{
	$lstr = 'error'; $nerror++;
    }
    print "$ARGV:$lstr in line $.: $msg\n";
}

for $ARGV (@ARGV){
    open UCM, $ARGV or die "$ARGV:$!";
    %Hdr = %U2E = %E2U = ();
    $in_charmap = $nerror = $nwarning = 0;
    $. = 0;
    while(<UCM>){
	chomp;
	s/\s*#.*$//o; /^$/ and next;
	if ($_ eq "CHARMAP"){ 
	    $in_charmap = 1;
	    for my $must (qw/code_set_name mb_cur_min mb_cur_max/){
		exists $Hdr{$must} or nit "<$must> nonexistent";
	    }
	    $Hdr{mb_cur_min} > $Hdr{mb_cur_max}
	    and nit sprintf("mb_cur_min(%d) > mb_cur_max(%d)",
			    $Hdr{mb_cur_min},$Hdr{mb_cur_max});
	    $in_charmap = 1;
	    next;
	}
	unless ($in_charmap){
	    my($hkey, $hvalue) = /^<(\S+)>\s+[\"\']?([^\"\']+)/o or next;
	    $Opt{D} and warn "$hkey => $hvalue";
	    if ($hkey eq "code_set_name"){ # name check
		exists $Hdr{code_set_name} 
		and nit "Duplicate <code_set_name>: $hkey";
	    }
	    if ($hkey eq "code_set_alias"){ # alias check
		$hvalue eq $Hdr{code_set_name}
		and nit qq(alias "$hvalue" is already in <code_set_name>);
	    }
	    $Hdr{$hkey} = $hvalue;
	}else{
	    my $name = $Hdr{code_set_name};
	    my($unistr, $encstr, $fb) = /^(\S+)\s+(\S+)\s(\S+)/o or next;
	    $Opt{v} and nit $_, 2;
	    my $uni = uniparse($unistr);
	    my $enc = encparse($encstr);
	    $fb =~ /^\|([0123])$/ or nit "malformed fallback: $fb";
	    $fb = $1; 
	    $Opt{f} and $fb = 0;
	    unless ($fb == 1){ # check uni -> enc
		if (exists $U2E{$uni}){
		    nit "dupe encode map: U$uni => $U2E{$uni} and $enc", 1;
		}else{
		    $U2E{$uni} = $enc;
		    if ($Opt{e} and $fb != 3) {
			my $e = hex2enc($enc);
			my $u = hex2uni($uni);
			my $eu = Encode::encode($name, $u);
			$e eq $eu
			    or nit qq(encode('$name', $uni) != $enc);
		    }
		}
	    }
	    unless ($fb == 3){  # check enc -> uni
		if (exists $E2U{$enc}){
		    nit "dupe decode map: $enc => U$E2U{$enc} and U$uni", 1;
		}else{
		    $E2U{$enc} = $uni;
		    if ($Opt{e} and $fb != 1) {
			my $e = hex2enc($enc);
			my $u = hex2uni($uni);
			$Opt{D} and warn "$uni, $enc";
			my $de = Encode::decode($name, $e);
			$de eq $u
			    or nit qq(decode('$name', $enc) != $uni);
		    }
		}
	    }
	    # warn "$uni, $enc, $fb";
	}
    }
    $in_charmap or nit "Where is CHARMAP?";
    checkRT();
    printf ("$ARGV: %s error%s found\n", 
	    ($nerror == 0 ? 'no' : $nerror),
	    ($nerror > 1 ? 's' : ''));
}

exit;

sub hex2enc{
    pack("C*", map {hex($_)} split(",", shift));
}
sub hex2uni{
    join("", map { chr(hex($_)) } split(",", shift));
}

sub checkRT{
    for my $uni (keys %E2U){
	my $enc = $U2E{$uni} or next; # okay
	$E2U{$U2E{$uni}} eq $uni or
	    nit "RT failure: U$uni => $enc =>U$E2U{$U2E{$uni}}";
    }
    for my $enc (keys %E2U){
	my $uni =  $E2U{$enc} or next; # okay
	$U2E{$E2U{$enc}} eq $enc or
	    nit "RT failure: $enc => U$uni => $U2E{$E2U{$enc}}";
    }
}


sub uniparse{
    my $str = shift;
    my @u;
    push @u, $1 while($str =~ /\G<U(.*?)>/ig);
    for my $u (@u){
	$u =~ /^([0-9A-Za-z]+)$/o
	    or nit "malformed Unicode character: $u";
    }
    return join(',', @u);
}

sub encparse{
    my $str = shift;
    my @e;
    for my $e (split /\\x/io, $str){
	$e or next; # first \x
	$e =~ /^([0-9A-Za-z]{1,2})$/io
	    or nit "Hex $e in $str is bogus";
	push @e, $1;
    }
    return join(',', @e);
}



__END__

A UCM file looks like this.

  #
  # Comments
  #
  <code_set_name> "US-ascii" # Required
  <code_set_alias> "ascii"   # Optional
  <mb_cur_min> 1             # Required; usually 1
  <mb_cur_max> 1             # Max. # of bytes/char
  <subchar> \x3F             # Substitution char
  #
  CHARMAP
  <U0000> \x00 |0 # <control>
  <U0001> \x01 |0 # <control>
  <U0002> \x02 |0 # <control>
  ....
  <U007C> \x7C |0 # VERTICAL LINE
  <U007D> \x7D |0 # RIGHT CURLY BRACKET
  <U007E> \x7E |0 # TILDE
  <U007F> \x7F |0 # <control>
  END CHARMAP