########################################
# 060.pdups - partial duplicate removal
########################################
use t::lib;
use t::utilBabel;
use Carp;
use Getopt::Long;
use Graph::Directed;
use List::MoreUtils qw(uniq any);
use List::Util qw(min max);
use Math::BaseCalc;
use Text::Abbrev;
use Class::AutoDB;
use Data::Babel;
use Benchmark qw(:hireswallclock);
use Test::More;
use strict;
my @OPTIONS=qw(bundle=s graph_type=s arity=i link_type=s num_maptables=i num_groups=i
keep_pdups
pdups_group_cutoffs=s pdups_prefixmatcher_cutoffs=s pdups_prefixmatcher_classes=s);
# defaults make a binary tree of depth 3 with reasonable sized db
my %DEFAULTS=(bundle=>'install',
graph_type=>'tree',arity=>2,link_type=>'star',num_maptables=>7,num_groups=>1);
my $autodb=new Class::AutoDB(database=>'test',create=>1);
isa_ok($autodb,'Class::AutoDB','sanity test - $autodb');
my $dbh=$autodb->dbh;
my $OPTIONS=get_options();
my $babel=make_babel();
load_database($OPTIONS->num_groups);
my @idtype_names=map {$_->name} @{$babel->idtypes};
my %idtypes=group {my($group)=/^(.+?)_/; $group} @idtype_names;
my @leafs=sort @{$idtypes{leaf}};
my @links=sort @{$idtypes{link}};
my $input=$leafs[$#leafs]; # 'extreme' leaf - bottom right in tree
my($group_cutoffs,$matcher_cutoffs,$matcher_classes)=
@$OPTIONS{qw(pdups_group_cutoffs pdups_prefixmatcher_cutoffs pdups_prefixmatcher_classes)};
for my $group_cutoff (@$group_cutoffs) {
for my $matcher_cutoff (@$matcher_cutoffs) {
for my $matcher_class (@$matcher_classes) {
doit_all($group_cutoff,$matcher_cutoff,$matcher_class);
doit_all($group_cutoff,$matcher_cutoff,$matcher_class,'keep_pdups') if $OPTIONS->keep_pdups;
}}}
done_testing();
sub doit_all {
my($group_cutoff,$matcher_cutoff,$matcher_class,$keep_pdups)=@_;
$babel->pdups_group_cutoff($group_cutoff) if defined $group_cutoff;
$babel->pdups_prefixmatcher_cutoff($matcher_cutoff) if defined $matcher_cutoff;
$babel->pdups_prefixmatcher_class($matcher_class) if defined $matcher_class;
my $label_all=label($group_cutoff,$matcher_cutoff,$matcher_class,$keep_pdups);
# case 1: short paths
my $label="case 1. short paths $label_all";
my $ok=1;
my @outputs;
for my $leaf (@leafs) {
push(@outputs,$leaf);
$ok&&=doit($input,\@outputs,$keep_pdups,$label) or last;
}
report_pass($ok,$label);
# case 2: long paths
my $label="case 2. long paths $label_all";
my $ok=1;
my @outputs=@leafs;
for my $link (@links) {
push(@outputs,$link);
$ok&&=doit($input,\@outputs,$keep_pdups,$label) or last;
}
report_pass($ok,$label);
# case 3: mixed paths
my $label="case 3. mixed paths $label_all";
my $ok=1;
for my $link (@links) {
my @outputs=@leafs;
push(@outputs,$link);
$ok&&=doit($input,\@outputs,$keep_pdups,$label) or last;
}
report_pass($ok,$label);
}
# args are idtype names
sub doit {
my($input_name,$output_names,$keep_pdups,$label)=@_;
my @args=(input_idtype=>$input_name,output_idtypes=>$output_names);
push(@args,keep_pdups=>$keep_pdups) if defined $keep_pdups;
# my $t0=new Benchmark;
my $actual=$babel->translate(@args);
# diag 'translate: ',timestr(timediff(new Benchmark, $t0));
# my $t0=new Benchmark;
my $correct=select_ur(babel=>$babel,@args);
# diag 'select_ur: ',timestr(timediff(new Benchmark, $t0));
cmp_table_quietly($actual,$correct,"$label input=$input_name, outputs=@$output_names");
}
sub label {
my($group_cutoff,$matcher_cutoff,$matcher_class,$keep_pdups)=@_;
my $label;
if (any {defined $_} ($group_cutoff,$matcher_cutoff,$matcher_class)) {
$group_cutoff=!defined $group_cutoff? 'default':
($group_cutoff==0? 'always': ($group_cutoff>=1e6? 'never': $group_cutoff));
$matcher_cutoff=!defined $matcher_cutoff? 'default':
($matcher_cutoff==0? 'always': ($matcher_cutoff>=1e6? 'never': $matcher_cutoff));
$matcher_class=!defined $matcher_class? 'default': $matcher_class;
$label=
"group_cutoff=$group_cutoff, matcher_cutoff=$matcher_cutoff, matcher_class=$matcher_class";
}
if (defined $keep_pdups) {
$label.=', ' if $label;
$label.="keep_pdups=".($keep_pdups? 1: 0);
}
$label;
}
sub get_options {
# initialize to defaults then overwrite with ones explicitly set
my %OPTIONS=%DEFAULTS;
GetOptions(\%OPTIONS,@OPTIONS);
# expand abbreviations
my %bundle=abbrev qw(install full);
my %graph_type=abbrev qw(star chain tree);
my %link_type=abbrev qw(starlike chainlike);
for my $option (qw(bundle graph_type link_type)) {
next unless defined $OPTIONS{$option};
my %abbrev=eval "\%$option";
$OPTIONS{$option}=$abbrev{$OPTIONS{$option}} or confess "illegal value for option $option";
}
my $full=$OPTIONS{bundle} eq 'full';
# deal with list options
for my $option
(qw(pdups_group_cutoffs pdups_prefixmatcher_cutoffs pdups_prefixmatcher_classes)) {
next unless defined $OPTIONS{$option};
my $string=$OPTIONS{$option};
my @list=split(/\W+/,$string);
$OPTIONS{$option}=\@list;
}
$OPTIONS{keep_pdups}=1 if $full && !defined $OPTIONS{keep_pdups};
# for install, cutoffs default to default
# for full, cutoffs default to default, always, never
my $cutoffs=$OPTIONS{pdups_group_cutoffs};
$cutoffs=$full? [0,1e6]: [] unless defined $cutoffs;
$OPTIONS{pdups_group_cutoffs}=[undef,@$cutoffs];
my $cutoffs=$OPTIONS{pdups_prefixmatcher_cutoffs};
$cutoffs=$full? [0,1e6]: [] unless defined $cutoffs;
$OPTIONS{pdups_prefixmatcher_cutoffs}=[undef,@$cutoffs];
my $classes=$OPTIONS{pdups_prefixmatcher_classes};
$classes=$full? [undef,qw(Trie BinarySearchTree BinarySearchList PrefixHash)]: [undef]
unless defined $classes;
$OPTIONS{pdups_prefixmatcher_classes}=$classes;
new Hash::AutoHash %OPTIONS;
}
sub make_babel {
my($num_maptables,$arity,$link_type)=@$OPTIONS{qw(num_maptables arity link_type)};
# make graph to guide schema construction. each node will generate a maptable
my $graph=new Graph::Directed;
my $root=0; # root is node 0
$graph->add_vertex($root);
my $more=$num_maptables-1; # number of nodes remaining
my @roots=$root; # queue of nodes to root subtrees
while ($more) {
my $root=shift @roots;
for (1..min($arity,$more)) {
my $kid=$num_maptables-$more--;
$graph->add_edge($root,$kid);
push(@roots,$kid);
}}
my @nodes=$graph->vertices;
# make component objects and Babel
# 'link' IdTypes connect MapTables.
# 'leaf' IdTypes are private to each MapTable
my $sql_type='VARCHAR(255)';
my(@idtypes,@masters,@maptables);
# make leaf IdTypes
for my $i (@nodes) {
my $idtype_name='leaf_'.sprintf('%03d',$i);
push(@idtypes,new Data::Babel::IdType(name=>$idtype_name,sql_type=>$sql_type));
}
# make link IdTypes
for my $i (@nodes) {
my @kids=$graph->successors($i);
next unless @kids;
my $idtype_name='link_'.sprintf('%03d',$i);
if ($link_type eq 'starlike') {
# 1 link per level connecting parent to all kids
push(@idtypes,new Data::Babel::IdType(name=>$idtype_name,sql_type=>$sql_type));
} else {
# each link connects parent to one child
for my $k (@kids) {
push(@idtypes,new Data::Babel::IdType(name=>$idtype_name.'_'.sprintf('%03d',$k),
sql_type=>$sql_type));
}}}
# make MapTables
for my $i (@nodes) {
my $maptable_num=sprintf('%03d',$i);
my $maptable_name="maptable_$maptable_num";
my ($parent)=$graph->predecessors($i);
my @kids=$graph->successors($i);
my @idtypes="leaf_$maptable_num";
if ($link_type eq 'starlike') {
# 1 link per level connecting parent to all kids
push(@idtypes,'link_'.sprintf('%03d',$parent)) if defined $parent;
push(@idtypes,"link_$maptable_num") if @kids;
} else {
# each link connects parent to one child
push(@idtypes,join('_','link',sprintf('%03d',$parent),$maptable_num)) if defined $parent;
push(@idtypes,map {join('_','link',$maptable_num,sprintf('%03d',$_))} @kids);
}
@idtypes=sort @idtypes;
push(@maptables,new Data::Babel::MapTable(name=>$maptable_name,idtypes=>\@idtypes));
}
# make Babel
my $babel=new Data::Babel
(name=>'test',autodb=>$autodb,idtypes=>\@idtypes,masters=>\@masters,maptables=>\@maptables);
isa_ok($babel,'Data::Babel','sanity test - $babel');
my @errstrs=$babel->check_schema;
ok(!@errstrs,'sanity test - check_schema');
diag(join("\n",@errstrs)) if @errstrs;
$babel;
}
sub load_database {
my($num_groups)=@_;
map {load_maptable($_,$num_groups)} @{$babel->maptables};
$babel->load_implicit_masters;
load_ur($babel,'ur');
my $ok=check_database_sanity($babel,'sanity test - database',$OPTIONS->num_maptables);
report_pass($ok,'sanity test - database looks okay');
}
# arg is maptable number
sub load_maptable {
my($maptable,$num_groups)=@_;
$num_groups=1 unless defined $num_groups;
my $maptable_name=$maptable->name;
my($num)=$maptable_name=~/_(\d+)$/;
my $leaf_type="leaf_$num";
my @link_types=grep /^link/,map {$_->name} @{$maptable->idtypes};
my @data;
for (my $i=1; $i<=$num_groups; $i++) {
my $num=sprintf('%04i',$i);
my $leaf_vals=[undef,"$leaf_type/aaa_$num"];
my @link_vals=map {[undef,"$_/aaa_$num","$_/multi","$_/nomatch_$maptable_name"]} @link_types;
my @rows=cross_product($leaf_vals,@link_vals);
@rows=uniq_rows(\@rows);
@rows=grep {any {defined $_} @$_} @rows;
push(@data,@rows);
}
t::utilBabel::load_maptable($babel,$maptable,@data);
}