The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/perl -w -s
use XML::DT;
use Data::Dumper;
use File::Temp;
use Term::ReadLine;
#use locale;

use strict;

our ($latin1,$html,$show_att,$expand_att_id);
our ($lines,$t,$shell);

my (@files)=@ARGV;
@ARGV=();



mkxmltypes (@files);


sub mkxmltypes {
  my %type=();
  my @files = @_;
  my %root = ();
  my %att=();
  my %dom=();

  my %ele=();
  my %elel=();
  my %atl=();
  my %handler=(
#    '-outputenc' => 'ISO-8859-1',
    '-default'   => sub{ 
          $c =~ s/,$//;
          push(@{$type{$q}}, (eval("[$c]") || "?$c"));
          $elel{$q}++;
          if(ctxt(1)){ $ele{ctxt(1)}{$q} ++;}
          else       { $root{$q}++}
          for(keys(%v)){
              $atl{$_}++;
              $att{$q}{$_}{tipo($v{$_})||"_str"} ++  ;
              $dom{$q}{$_}{$v{$_}} ++ } 
          "'$q',";
        },
    '-pcdata'    => sub{ if ($c =~ /[^ \t\n]/){ $ele{ctxt(1)}{"#PCDATA"}=1;
"'#PCDATA'," } else {""}},
  );

  if  ($html) { $handler{'-html'} = 1;} 
  if($latin1) { $handler{'-inputenc'}='ISO-8859-1';}

  for my $fname (@files){
    if($lines){
      my $tmpfile = File::Temp->new( UNLINK => 0 )->filename;
      system("head -$lines $fname | xmllint --recover - > $tmpfile");
      $fname = $tmpfile;
    }
    dt($fname,%handler); 
    unlink($fname) if $lines;
  }

  print "# ", join(" ",keys %root)," ...",  scalar(localtime(time)) ,"\n";

  my %resumofinal=();
  for (keys %type){
     my @tipo=();
     for my $lista (@{$type{$_}}){ push (@tipo, processa($lista)) }
     $resumofinal{$_}=resumele(processa2([@tipo])).resumeatts($att{$_});
  }

  if($shell){ shell($t,\%root,\%ele,\%att,\%dom,\%resumofinal,\%atl); }
  else{       pprint(\%resumofinal,ordem(\%ele,(($t) ||(keys %root) ))); }
}

sub shell{
    my ($t,$root,$ele,$att,$dom,$resumofinal,$atl) = @_;
    my $last=(keys %$root)[0];
    my $elepat = q{[\w:]+};
    my $max = 10;
    my $term = new Term::ReadLine 'sample';
    my $tas = $term->Attribs;
    $tas->{completion_entry_function}= $tas->{list_completion_function};
    $tas->{completion_word} = [ keys(%$ele), keys(%$atl) ];

    pprint($resumofinal,ordem($ele,(($t) ||(keys %$root) )));
    while ( defined ($_ = $term->readline("\npfs> ")) ) {
      chomp(); 
      $term->addhistory($_) if /\S/;
      s/^\s*(.*?)\s*$/$1/;
      if(/($elepat)\[\@?($elepat)\]/){
         print resumeatt($att->{$1}{$2},$dom->{$1}{$2},$max);
         $last = $1}
      elsif(/\!max\s*=?\s*(\d+)/){$max=$1;}
      elsif(/\.($elepat)/){
         print resumeatt($att->{$last}{$1},$dom->{$last}{$1},$max);}
      elsif(!$_ or defined $ele->{$_}) {
         $last=$_; 
         pprint($resumofinal,ordem($ele,(($_) ||(keys %$root) ))); }
      else{
         for my $e (keys %$att){
           for my $a (keys %{$att->{$e}}){
            print "$e($a):",
                  resumeatt($att->{$e}{$a},$dom->{$e}{$a},$max) if($a eq $_)
           }
         }
      }
    } 
}

sub ordem{
  my ($rel,@st)=@_;
  my @r=();
  my %visited = ('#PCDATA' => 1);

  while(@st){
    my $next = shift(@st);
    next if $visited{$next};
    push(@r,$next);
    $visited{$next} = 1;
    push(@st, (grep {! $visited{$_}} (keys %{$rel->{$next}})));
  }
  \@r;
}

sub pprint{
  my $r     = shift;
  my $order = shift;
  for (@$order){ print "$_ \t=>  $r->{$_}\n";}
}

sub resumeatts{
 my $a=shift;
 my $r="";
 for (keys(%{$a})) {
   if($expand_att_id){ $r .= "\n\t\t * $_:(".join(",",keys %{$a->{$_}}) . ")" }
   else              { $r .= " * $_" }
 }
 $r
}

sub resumeatt{
 my $a=shift;
 my $d=shift;
 my $max = shift(@_) || 10;
 my $r= join("|",keys %{$a}) ;
 my @domact = (grep {defined $_} ((keys %{$d}))[0..$max]);
 $domact[$max] = '...' if $domact[$max];
 $r . " = {". join(",",@domact) . "}\n";
}

sub processa{
 my $a=shift;
 if( @$a == 0 ) {            +{ _isa =>"empty"} }
 elsif( @$a == 1 && $a->[0] eq '#PCDATA') { 
                             +{ _isa =>"text"     ,$a->[0] =>[1,1]} }
 elsif( @$a == 1 )       {   +{ _isa =>"singleton",$a->[0] =>[1,1]} }
 else{ my %f = (); 
       for (@$a){$f{$_}[0]++,$f{$_}[1]++} 
       my $dif = scalar keys %f;
       if($dif == 1) {       +{ _isa =>"seq", %f} ; }
       elsif($dif == @$a) {  +{ _isa =>"tup", %f}; }
       elsif($f{'#PCDATA'}){ +{ _isa =>"mixed", %f }; }
       else {                +{ _isa =>"mtup", %f } }
 }
}

sub processa2{
 my $a=shift;
 if   ( @$a == 0 ) { die("no sons????") }
 elsif( @$a == 1 ) { $a->[0] }
 else{ 
   my %f     = (); 
   my %maybe = ();
   for (@$a){$f{sons2str($_)}++;
             $maybe{$_->{_isa}}++ } 
   my $dif = scalar keys %f;
   if   ($dif == 1)    { $a->[0]; }
   elsif($maybe{mixed} || $maybe{text}){ +{%{join_sons($a)}, _isa=> "mixed"} }
   else { my %s= %{join_sons($a)};
          if(keys %s == 1) { +{%s, _isa => "seq"}}
          else             { +{%s, _isa => "mtup"} }
   }
 }
}

sub resumele{
 my $a=shift;
##  print Dumper($a);
 my $i = $a->{_isa};
 delete $a->{_isa};
 if    ($i eq "text")      {"text"}
 elsif ($i eq "empty")     {"empty"}
 elsif ($i eq "singleton") {join(", ", keys %{$a}) }
 elsif ($i eq "mixed")     {delete $a->{'#PCDATA'};
              if(keys %{$a}){ "mixed(".join(", ", keys %{$a}).")"}
              else {"text"}
      }
 elsif ($i eq "tup")       {"tup(".join(", ", keys %{$a}).")"}
 elsif ($i eq "seq")       {"seq(".join(", ", keys %{$a}).")"}
 else { my $r= "mtup(";
        for(sort keys %$a){ 
          $r .= "$_, "  if ( $a->{$_}[0] == 1 && $a->{$_}[1] == 1 );
          $r .= "$_?, " if ( $a->{$_}[0] == 0 && $a->{$_}[1] == 1 );
          $r .= "$_*, " if ( $a->{$_}[0] == 0 && $a->{$_}[1] > 1 );
          $r .= "$_+, " if ( $a->{$_}[0] >  0 && $a->{$_}[1] > 1 );
          $r .= Dumper($a) if($r =~ /\($/ );
        }
        $r =~ s/, $//;
        $r.=")";
      }
}

sub join_sons{
 my $a = shift;
 my %final = ( map { ($_ => [$a->[0]{$_}[0], $a->[0]{$_}[1]])}   
         grep {$_ ne "_isa"} keys %{$a->[0]});
 my %todas=();
 for (@$a){
   my @novas=keys %{$_};
   @todas{@novas}= @novas;
   for my $k (keys %todas){ 
          next if $k =~ /_isa/;
          $final{$k}[0]=0  unless  $final{$k}[0];
          unless (exists $_->{$k}){ $final{$k}[0]=0;
                                    next} 
          $final{$k}[1]=$_->{$k}[1]  if $_->{$k}[1] > ($final{$k}[1] || 0);
          $final{$k}[0]=$_->{$k}[0]  if $_->{$k}[0] < $final{$k}[0];
   }
 }
 \%final
}

sub sons2str{
 my $a = shift;
 join(' ',($a->{_isa},map { $_ . ($a->{$_}[0]==1 ? "" : "+") } 
     grep {$_ ne "_isa"} sort keys %$a));
}

sub tipo{
 my $a=shift;
 for ($a){
     if(/^\s*\d+\s*$/)                      {return "_int" }
  elsif(m{^\s*(https?|ftp|file)://\w[~&=?\w:/.-]+\s*$}i){return "_url" }
  elsif(/^\s*\d+\.\d+\s*$/)                 {return "_real" }
  elsif(/^\w+$/)                            {return "_id" }
  elsif(m{^\s*[\w.-]+\@\w[\w_:/.-]+\s*$})   {return "_email" }
  else                                      {return undef } 
 }
}

__END__

=head1 NAME

mkxmltype - Make XML analysis using XML::DT

=head1 SYNOPSIS

  mkxmltype <xmlfile>

=head1 DESCRIPTION

This command tries to infer DTD and Camlila-like types for a specific XML file;

=head1 Options

 -latin1             input file encoding is forced to be latin1
 -html               uses html (libxml2) parser
 -show_att           Show attribute values
 -expand_att_id
 -lines=20000        just reads the first 20000 lines of the XML file
 -t
 -shell              Enter interactive shell mode

=head1 SEE ALSO

XML::DT(1), mkdtskel(1), mkdtdskel and perl(1)

=cut