#!/usr/bin/perl -w -s
use XML::DT;
use Data::Dumper;
use File::Temp;
use Term::ReadLine;
#use locale;
use strict;
our ($latin1,$html,$show_att,$expand_att_id);
our ($lines,$t,$shell);
my (@files)=@ARGV;
@ARGV=();
mkxmltypes (@files);
sub mkxmltypes {
my %type=();
my @files = @_;
my %root = ();
my %att=();
my %dom=();
my %ele=();
my %elel=();
my %atl=();
my %handler=(
# '-outputenc' => 'ISO-8859-1',
'-default' => sub{
$c =~ s/,$//;
push(@{$type{$q}}, (eval("[$c]") || "?$c"));
$elel{$q}++;
if(ctxt(1)){ $ele{ctxt(1)}{$q} ++;}
else { $root{$q}++}
for(keys(%v)){
$atl{$_}++;
$att{$q}{$_}{tipo($v{$_})||"_str"} ++ ;
$dom{$q}{$_}{$v{$_}} ++ }
"'$q',";
},
'-pcdata' => sub{ if ($c =~ /[^ \t\n]/){ $ele{ctxt(1)}{"#PCDATA"}=1;
"'#PCDATA'," } else {""}},
);
if ($html) { $handler{'-html'} = 1;}
if($latin1) { $handler{'-inputenc'}='ISO-8859-1';}
for my $fname (@files){
if($lines){
my $tmpfile = File::Temp->new( UNLINK => 0 )->filename;
system("head -$lines $fname | xmllint --recover - > $tmpfile");
$fname = $tmpfile;
}
dt($fname,%handler);
unlink($fname) if $lines;
}
print "# ", join(" ",keys %root)," ...", scalar(localtime(time)) ,"\n";
my %resumofinal=();
for (keys %type){
my @tipo=();
for my $lista (@{$type{$_}}){ push (@tipo, processa($lista)) }
$resumofinal{$_}=resumele(processa2([@tipo])).resumeatts($att{$_});
}
if($shell){ shell($t,\%root,\%ele,\%att,\%dom,\%resumofinal,\%atl); }
else{ pprint(\%resumofinal,ordem(\%ele,(($t) ||(keys %root) ))); }
}
sub shell{
my ($t,$root,$ele,$att,$dom,$resumofinal,$atl) = @_;
my $last=(keys %$root)[0];
my $elepat = q{[\w:]+};
my $max = 10;
my $term = new Term::ReadLine 'sample';
my $tas = $term->Attribs;
$tas->{completion_entry_function}= $tas->{list_completion_function};
$tas->{completion_word} = [ keys(%$ele), keys(%$atl) ];
pprint($resumofinal,ordem($ele,(($t) ||(keys %$root) )));
while ( defined ($_ = $term->readline("\npfs> ")) ) {
chomp();
$term->addhistory($_) if /\S/;
s/^\s*(.*?)\s*$/$1/;
if(/($elepat)\[\@?($elepat)\]/){
print resumeatt($att->{$1}{$2},$dom->{$1}{$2},$max);
$last = $1}
elsif(/\!max\s*=?\s*(\d+)/){$max=$1;}
elsif(/\.($elepat)/){
print resumeatt($att->{$last}{$1},$dom->{$last}{$1},$max);}
elsif(!$_ or defined $ele->{$_}) {
$last=$_;
pprint($resumofinal,ordem($ele,(($_) ||(keys %$root) ))); }
else{
for my $e (keys %$att){
for my $a (keys %{$att->{$e}}){
print "$e($a):",
resumeatt($att->{$e}{$a},$dom->{$e}{$a},$max) if($a eq $_)
}
}
}
}
}
sub ordem{
my ($rel,@st)=@_;
my @r=();
my %visited = ('#PCDATA' => 1);
while(@st){
my $next = shift(@st);
next if $visited{$next};
push(@r,$next);
$visited{$next} = 1;
push(@st, (grep {! $visited{$_}} (keys %{$rel->{$next}})));
}
\@r;
}
sub pprint{
my $r = shift;
my $order = shift;
for (@$order){ print "$_ \t=> $r->{$_}\n";}
}
sub resumeatts{
my $a=shift;
my $r="";
for (keys(%{$a})) {
if($expand_att_id){ $r .= "\n\t\t * $_:(".join(",",keys %{$a->{$_}}) . ")" }
else { $r .= " * $_" }
}
$r
}
sub resumeatt{
my $a=shift;
my $d=shift;
my $max = shift(@_) || 10;
my $r= join("|",keys %{$a}) ;
my @domact = (grep {defined $_} ((keys %{$d}))[0..$max]);
$domact[$max] = '...' if $domact[$max];
$r . " = {". join(",",@domact) . "}\n";
}
sub processa{
my $a=shift;
if( @$a == 0 ) { +{ _isa =>"empty"} }
elsif( @$a == 1 && $a->[0] eq '#PCDATA') {
+{ _isa =>"text" ,$a->[0] =>[1,1]} }
elsif( @$a == 1 ) { +{ _isa =>"singleton",$a->[0] =>[1,1]} }
else{ my %f = ();
for (@$a){$f{$_}[0]++,$f{$_}[1]++}
my $dif = scalar keys %f;
if($dif == 1) { +{ _isa =>"seq", %f} ; }
elsif($dif == @$a) { +{ _isa =>"tup", %f}; }
elsif($f{'#PCDATA'}){ +{ _isa =>"mixed", %f }; }
else { +{ _isa =>"mtup", %f } }
}
}
sub processa2{
my $a=shift;
if ( @$a == 0 ) { die("no sons????") }
elsif( @$a == 1 ) { $a->[0] }
else{
my %f = ();
my %maybe = ();
for (@$a){$f{sons2str($_)}++;
$maybe{$_->{_isa}}++ }
my $dif = scalar keys %f;
if ($dif == 1) { $a->[0]; }
elsif($maybe{mixed} || $maybe{text}){ +{%{join_sons($a)}, _isa=> "mixed"} }
else { my %s= %{join_sons($a)};
if(keys %s == 1) { +{%s, _isa => "seq"}}
else { +{%s, _isa => "mtup"} }
}
}
}
sub resumele{
my $a=shift;
## print Dumper($a);
my $i = $a->{_isa};
delete $a->{_isa};
if ($i eq "text") {"text"}
elsif ($i eq "empty") {"empty"}
elsif ($i eq "singleton") {join(", ", keys %{$a}) }
elsif ($i eq "mixed") {delete $a->{'#PCDATA'};
if(keys %{$a}){ "mixed(".join(", ", keys %{$a}).")"}
else {"text"}
}
elsif ($i eq "tup") {"tup(".join(", ", keys %{$a}).")"}
elsif ($i eq "seq") {"seq(".join(", ", keys %{$a}).")"}
else { my $r= "mtup(";
for(sort keys %$a){
$r .= "$_, " if ( $a->{$_}[0] == 1 && $a->{$_}[1] == 1 );
$r .= "$_?, " if ( $a->{$_}[0] == 0 && $a->{$_}[1] == 1 );
$r .= "$_*, " if ( $a->{$_}[0] == 0 && $a->{$_}[1] > 1 );
$r .= "$_+, " if ( $a->{$_}[0] > 0 && $a->{$_}[1] > 1 );
$r .= Dumper($a) if($r =~ /\($/ );
}
$r =~ s/, $//;
$r.=")";
}
}
sub join_sons{
my $a = shift;
my %final = ( map { ($_ => [$a->[0]{$_}[0], $a->[0]{$_}[1]])}
grep {$_ ne "_isa"} keys %{$a->[0]});
my %todas=();
for (@$a){
my @novas=keys %{$_};
@todas{@novas}= @novas;
for my $k (keys %todas){
next if $k =~ /_isa/;
$final{$k}[0]=0 unless $final{$k}[0];
unless (exists $_->{$k}){ $final{$k}[0]=0;
next}
$final{$k}[1]=$_->{$k}[1] if $_->{$k}[1] > ($final{$k}[1] || 0);
$final{$k}[0]=$_->{$k}[0] if $_->{$k}[0] < $final{$k}[0];
}
}
\%final
}
sub sons2str{
my $a = shift;
join(' ',($a->{_isa},map { $_ . ($a->{$_}[0]==1 ? "" : "+") }
grep {$_ ne "_isa"} sort keys %$a));
}
sub tipo{
my $a=shift;
for ($a){
if(/^\s*\d+\s*$/) {return "_int" }
elsif(m{^\s*(https?|ftp|file)://\w[~&=?\w:/.-]+\s*$}i){return "_url" }
elsif(/^\s*\d+\.\d+\s*$/) {return "_real" }
elsif(/^\w+$/) {return "_id" }
elsif(m{^\s*[\w.-]+\@\w[\w_:/.-]+\s*$}) {return "_email" }
else {return undef }
}
}
__END__
=head1 NAME
mkxmltype - Make XML analysis using XML::DT
=head1 SYNOPSIS
mkxmltype <xmlfile>
=head1 DESCRIPTION
This command tries to infer DTD and Camlila-like types for a specific XML file;
=head1 Options
-latin1 input file encoding is forced to be latin1
-html uses html (libxml2) parser
-show_att Show attribute values
-expand_att_id
-lines=20000 just reads the first 20000 lines of the XML file
-t
-shell Enter interactive shell mode
=head1 SEE ALSO
XML::DT(1), mkdtskel(1), mkdtdskel and perl(1)
=cut