#!/usr/bin/perl -w

use Getopt::Long qw(:config no_ignore_case);
use File::Basename qw(basename);
use Pod::Usage;
use strict;

##------------------------------------------------------------------------------
## Constants & Globals
our $prog = basename($0);

our $outbase = undef;
our $labfile = undef;
our $sclfile = undef;

our $want_specials = 0;
our $sigma = '<sigma>';
our $epsilon = '<epsilon>';
our $category = '<category>';

##------------------------------------------------------------------------------
## Command-line
our ($help);
GetOptions(##-- General
	   'help|h' => \$help,
	   #'verbose|v=i' => \$verbose,
	   #'quiet|q' => sub { $verbose=0; },

	   ##-- I/O
	   'special-symbols|specials|L!' => \$want_specials,
	   'output|out|o=s' => \$outbase,

	   'lab-output|labout|lab|lo=s' => \$labfile,
	   'scl-output|sclout|scl|so=s' => \$sclfile,

	   'epsilon|eps|e=s' => \$epsilon,
	   'sigma|E' => \$sigma,
	   'category|cat|c=s' => \$category,
	  );

pod2usage({-exitval=>0,-verbose=>0,}) if ($help);
pod2usage({-message=>"No input symbol file given!",-exitval=>0,-verbose=>0,}) if (@ARGV < 1);

##------------------------------------------------------------------------------
## escaping

## $str = unescape($str_escaped)
sub unescape {
  my $s = shift;
  $s =~ s/\\n/\n/g;
  $s =~ s/\\r/\r/g;
  $s =~ s/\\t/\t/g;
#  $s =~ s/\\v/\v/g;
  $s =~ s/\\x([0-9a-f]{1,2})/chr($1)/gxi;
  $s =~ s/\\(.)/$1/g;
  return $s;
}

## @sl_uniq = sluniq(@sorted_list)
##  + sorts list
sub sluniq {
  my ($prev);
  return map {defined($prev) && $_ eq $prev ? qw() : ($prev=$_)} @_;
}

## @l_uniq = luniq(@list)
##  + sorts list
sub luniq {
  return sluniq(sort @_);
}


## DATA
##  %class2terms : ($class => \@terms, ...)
##  %cat2feat    : ($category => \@features, ...)
##  %sym2id      : ($term => $id, ...)
##  @id2sym      : ([$id] => $term, ...)

my (%class2terms,%cat2feat);
my %sym2id = ($epsilon=>0);
my @id2sym = ($epsilon);

## $id_or_empty = ensure_symbol($sym)
sub ensure_symbol {
  my $sym = shift;
  return $sym2id{$sym} if (exists $sym2id{$sym});
  return qw() if (exists $class2terms{$sym});
  return qw() if (exists $cat2feat{$sym});
  ##
  ##-- new symbol: create as terminal
  push(@id2sym,$sym);
  return $sym2id{$sym} = $#id2sym;
}

## @ids = ensure_symbols(@syms)
sub ensure_symbols {
  return map {ensure_symbol($_)} @_;
}

## @sorted = idsort(@terminals)
sub idsort {
  return sort {$sym2id{$a}<=>$sym2id{$b}} @_;
}

## @terms_nodups = terminals(@syms);
sub terminals {
  my @queue = (@_);
  my %terms = qw();
  my %visited = qw();
  my ($sym,$key);
  while (defined($sym=shift(@queue))) {
    next if (exists $visited{$sym});
    $visited{$sym}  = 1;
    if (exists $class2terms{$sym}) {
      push(@queue, @{$class2terms{$sym}});
    }
    elsif (defined $sym2id{$sym}) {
      $terms{$sym} = ++$key;
    }
    else {
      ensure_symbol($sym);
      $terms{$sym} = ++$key;
    }
  }
  return sort {$terms{$a}<=>$terms{$b}} keys %terms;
}


##------------------------------------------------------------------------------
## MAIN

##-- get filenames
our $symfile = shift;
die "$prog: could not read file symbols-file '$symfile' or '$symfile.sym'" if (!-r "$symfile" && !-r"$symfile.sym");
$symfile = "$symfile.sym" if (!-r $symfile);

($outbase = $symfile) =~ s/\.sym$// if (!$outbase);
$labfile = "$outbase.lab" if (!$labfile);
$sclfile = "$outbase.scl" if (!$sclfile);

##-- load symspec
open(my $symfh, "<", $symfile)
  or die("$prog: open failed for '$symfile': $!");

my ($class,@vals);
while (<$symfh>) {
  chomp;
  next if (/^\s*$/);
  ($class,@vals) = map {unescape($_)} split(/\s+/,$_);

  if ($class eq 'Category:') {
    ##-- category: parse features
    my $cat = shift @vals;
    $cat2feat{$cat} = [@vals];
    ensure_symbols(map {"_$_"} ($cat,@vals));
  }
  else {
    ##-- symbol class: parse it
    push(@{$class2terms{$class}}, terminals(@vals));
  }
}
close $symfh;

##-- maybe add lextools special symbols
if ($want_specials) {
  foreach my $spec (
		    {class=>'<boundary>', vals=>[qw($$ ++ ww aa ii), ',,', qw(.. !! ??)]},
		    {class=>undef,        vals=>[qw(<xml> </xml>)]},
		    {class=>'<accent>',   vals=>[qw(acc:+ acc:- acc:c)]},
		    {class=>'numval',     vals=>[
						 (map {"10^$_"} (0..20)),
						 (map {"20^$_"} (0..2)),
						 (map {"$_*"} (0..9)),
						]},
		    {class=>'multiplier', vals=>[(map {"$_*"} (0..9))],},
		    {class=>undef,          vals=>[qw(<bos> <eos>)]},
		   )
    {
      ensure_symbols(@{$spec->{vals}});
      push(@{$class2terms{$spec->{class}}}, idsort(terminals(@{$spec->{vals}}))) if (defined($spec->{class}));
    }
}


##-- set 'sigma' class
$class2terms{$sigma} = [@id2sym[1..$#id2sym]];

##-- dump (debug)
#use Data::Dumper;
#print STDERR Data::Dumper->Dump([\%class2terms,\%cat2feat,\%sym2id],[qw(class2terms cat2feat sym2id)]);

##-- dump (labels)
open(my $labfh, ">$labfile")
  or die("$prog: open failed for labels-file '$labfile': $!");
foreach (0..$#id2sym) {
  print $labfh $id2sym[$_],"\t",$_,"\n";
}
close $labfh;

##-- dump (superclasses)
open(my $sclfh, ">$sclfile")
  or die("$prog: open failed for labels-file '$labfile': $!");
foreach my $cls (sort keys %class2terms) {
  print $sclfh
    map {"$cls\t$_\n"}
    #sort {$a<=>$b}			##-- do NOT label-sort things here; it can mess up 1-1 correspondences for e.g. lexrulecomp "=>" operator
    @sym2id{@{$class2terms{$cls}}};
}
foreach my $cat (sort keys %cat2feat) {
  print $sclfh
    ("$category\t", $sym2id{"_$cat"}, "\n",
     (map {"_$cat\t".$sym2id{"_$_"}."\n"} @{$cat2feat{$cat}}),
    );
}
close $sclfh;


__END__

=pod

=head1 NAME

gfsm-makelabe.perl - split lextools symbol specification into *.lab and *.scl

=head1 SYNOPSIS

 gfsm-makelab.perl [OPTIONS] SYMFILE[.sym]

 Options:
  -h , -help                  # this help message
  -L , -special-symbols       # include lextools-style special symbols?
  -o , -output OUTBASE        # specify output basename (default=SYMFILE)
  -lo, -lab-output LABFILE    # specify lab-file output (default=OUTBASE.lab)
  -so, -scl-output SCLFILE    # specify scl-file output (default=OUTBASE.scl)

=cut

##------------------------------------------------------------------------------
## Options and Arguments
##------------------------------------------------------------------------------
=pod

=head1 OPTIONS AND ARGUMENTS

Not yet written.

=cut

##------------------------------------------------------------------------------
## Description
##------------------------------------------------------------------------------
=pod

=head1 DESCRIPTION

Split lextools-style symbol specifications (*.sym) into terminal labels (*.lab) and superclass labels (*.scl).

=cut

##------------------------------------------------------------------------------
## See Also
##------------------------------------------------------------------------------
=pod

=head1 SEE ALSO

perl(1),
...

=cut

##------------------------------------------------------------------------------
## Footer
##------------------------------------------------------------------------------
=pod

=head1 AUTHOR

Bryan Jurish E<lt>moocow@cpan.orgE<gt>

=cut