The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package t::utilBabel;
use t::util;
use Carp;
use Test::More;
use Test::Deep qw(cmp_details deep_diag subbagof);
use List::Util qw(first min);
use List::MoreUtils qw(uniq any);
use Hash::AutoHash::Args;
# use Hash::AutoHash::MultiValued;
use Exporter();
use strict;
our @ISA=qw(Exporter);

our @EXPORT=
  (@t::util::EXPORT,
   qw(check_object_basics sort_objects power_subsets cross_product vary_case uniq_rows sql_in
      prep_tabledata load_maptable load_master load_ur select_ur select_ur_sanity cleanup_db
      check_database_sanity check_maptables_sanity check_masters_sanity 
      check_handcrafted_idtypes check_handcrafted_masters check_handcrafted_maptables
      check_handcrafted_name2idtype check_handcrafted_name2master check_handcrafted_name2maptable
      check_handcrafted_id2object check_handcrafted_id2name check_implicit_masters
      load_handcrafted_maptables load_handcrafted_masters
      cmp_table cmp_table_quietly cmp_table_nocase
      cmp_op cmp_op_quietly cmp_op_quickly
      pnames pgraph
    ));
# NG 13-07-16: not used now but keep for future
push(@EXPORT,qw(grep_rows sort_rows check_table 
		cmp_objects cmp_objects_quietly order_tables));
# NG 13-07-16: used internally
#              filter_ur count_ur cleanup_ur
# NG 13-07-16: obsolete. 
#              sort_name_lists

sub check_object_basics {
  my($object,$class,$name,$label)=@_;
  report_fail($object,"$label connected object defined") or return 0;
  $object->name;		# touch object in case still Oid
  # NG 13-09-24: changed to isa_ok_quietly
  # report_fail(UNIVERSAL::isa($object,$class),"$label: class") or return 0;
  isa_ok_quietly($object,$class,$label) or return 0;
  report_fail($object->name eq $name,"$label: name") or return 0;
  return 1;
}
sub check_objects_basics {
  my($objects,$class,$names,$label)=@_;
  my @objects=sort_objects($objects,$label);
  for my $i (0..$#$objects) {
    my $object=$objects->[$i];
    check_object_basics($objects->[$i],$class,$names->[$i],"$label object $i") or return 0;
  }
  return 1;
}
# sort by name.
sub sort_objects {
  my($objects,$label)=@_;
  # hmm.. this doesn't work for Oids. not important anyway, so just bag it
  # TODO: revisit when AutoDB provides public method for fetching Oids.
#   # make sure all objects have names
#   for my $i (0..$#$objects) {
#     my $object=$objects->[$i];
#     report_fail(UNIVERSAL::can($object,'name'),"$label object $i: has name method") 
#       or return ();
#   }
  my @sorted_objects=sort {$a->name cmp $b->name} @$objects;
  wantarray? @sorted_objects: \@sorted_objects;
}
# NG 13-07-07: generate subsets of power set. much faster than Set::Scalar
#             for large sets if number of subsets not too big
# 1st argument is either ARRAY ref or integer
#   if integer, it's universe size; universe is 0..$size-1
#   if ARRAY, it's universe
# min, max are sizes of subsets emitted
# if only one provided, it's max
# if none provided, it's regular power_set. equivalent to min=0, max=size
sub power_subsets {
  my($size,$universe);
  if (ref($_[0])) {
    $universe=shift;
    $size=scalar @$universe;
  } else {
    $size=shift;
  }
  my($min,$max)=@_==1? (0,@_): (@_>1? @_: (0,$size));
  my @subsets=(defined $max && $max==0)? []: _power_subsets(0,$size,$min,$max);
  @subsets=map {[@$universe[@{$_}]]} @subsets if defined $universe;
  wantarray? @subsets: \@subsets;
}
sub _power_subsets {
  my($i,$size,$min,$max)=@_;
  return ([],[$i]) if $i==$size-1;
  my @recurse=_power_subsets($i+1,$size,$min,$max);
  # cases: size refers to size of downstream subset
  # 1) size==max. implies size>=min. pass along, don't add current
  # 2) size<max && size+i>=min.      pass along, and add current
  # 3) size<max && size+1+i>=min.    add current
  # 4) else.  do nothing
  my @out=map {
    scalar(@$_)==$max? $_ :
      (scalar(@$_)+$i>=$min? ($_,[$i,@$_]): 
       (scalar(@$_)+1+$i>=$min? [$i,@$_]: ()))} @recurse;

  @out;
}
# NG 13-07-20: cartesion product of a list of vectors (ARRAYs)
# arguments are either ARRAY refs or integers
#   if integer, it's universe size; universe is 0..$size-1
#   if ARRAY, it's universe
sub cross_product {
  my @out;
  if (any {ref $_} @_) {
    my @sets=@_;
    my @sizes=map {ref $_? scalar(@$_): $_} @sets;
    my @idx_rows=_cross_product(@sizes);
    for my $idx_row (@idx_rows)  {
      my @row=map
	{my $idx=$idx_row->[$_]; ref $sets[$_]? $sets[$_]->[$idx]: $idx;} 0..@sets-1;
      push(@out,\@row);
    }
  } else {
    @out=_cross_product(@_);
  }
  wantarray? @out: \@out;
}
sub _cross_product {
  my @sizes=@_;
  return unless @sizes;
  my $size=shift @sizes;
  return map {[$_]} 0..$size-1 unless @sizes;
  my @recurse=_cross_product(@sizes);
  map {my $i=$_; map {[$i,@$_]} @recurse} 0..$size-1;
}

# NG 13-06-15: added vary_case to test case insensitve comparisons
# vary case for list of ids
sub vary_case {
  my $wantarray=@_==0||@_>1||!ref($_[0]);
  my @in=flatten(@_);
  my @out;
  my $case=0;
  while(my $id=shift @in) {
    if ($case==0) {push(@out,lc($id))}
    elsif ($case==1) {push(@out,uc($id))}
    elsif ($case==2) {push(@out,ucfirst($id))}
    else {push(@out,ucsecond($id))}
    $case=(++$case%4);
  }
  $wantarray? @out: \@out;
}
sub ucsecond {substr($_[0],0,1).ucfirst(substr($_[0],1))}

# scrunch whitespace
sub scrunch {
  my($x)=@_;
  $x=~s/\s+/ /g;
  $x=~s/^\s+|\s+$//g;
  $x;
}
sub scrunched_eq {scrunch($_[0]) eq scrunch($_[1]);}

########################################
# these functions deal w/ relational tables

# prepare table data
# data can be 
#   string: one line per row; each row is whitespace-separated values
#   list or ARRAY of strings: each string is row
#   list or ARRAY of ARRAYs: each sub-ARRAY is row
# CAUTION: 2nd & 3rd cases ambiguous: list of 1 ARRAY could fit either case!
sub prep_tabledata {
  # NG 12-08-24: fixed to handle list or ARRAY of ARRAYs as documented
  # my @rows=(@_==1 && !ref $_[0])? split(/\n+/,$_[0]): flatten(@_);
  my @rows=(@_==1 && !ref $_[0])? split(/\n+/,$_[0]): (@_==1)? flatten(@_): @_;
  # clean whitespace and split rows 
  @rows=map {ref($_)? $_: do {s/^\s+|\s+$//g; s/\s+/ /g; [split(' ',$_)]}} @rows;
  # convert NULLS into undefs
  for my $row (@rows) {
    map {$_=undef if 'NULL' eq uc($_)} @$row;
  }
  \@rows;
}
sub load_maptable {
  my($babel,$maptable)=splice(@_,0,2);
  my $data=prep_tabledata(@_);
  ref $maptable or $maptable=$babel->name2maptable($maptable);

  # code adapted from ConnectDots::LoadMapTable Step
  my $tablename=$maptable->tablename;
  my @idtypes=@{$maptable->idtypes};
  my @column_names=map {$_->name} @idtypes;
  my @column_sql_types=map {$_->sql_type} @idtypes;
  my @column_defs=map {$column_names[$_].' '.$column_sql_types[$_]} (0..$#idtypes);
  my @indexes=@column_names;

  # code adapted from MainData::LoadData Step
  my $dbh=$babel->autodb->dbh;
  $dbh->do(qq(DROP TABLE IF EXISTS $tablename));
  my $columns=join(', ',@column_defs);
  $dbh->do(qq(CREATE TABLE $tablename ($columns)));

  # new code: insert data into table
  my @values=map {'('.join(', ',map {$dbh->quote($_)} @$_).')'} @$data;
  my $values=join(",\n",@values);
  $dbh->do(qq(INSERT INTO $tablename VALUES\n$values));

  # code adapted from MainData::LoadData Step
  # put parens around single columns
  my @alters=map {"($_)"} @indexes; # put parens around single columns
  my $alters=join(', ',map {"ADD INDEX $_"} @alters);
  $dbh->do(qq(ALTER TABLE $tablename $alters));
}
sub load_master {
  my($babel,$master)=splice(@_,0,2);
  ref $master or $master=$babel->name2master($master);
  if ($master->implicit) {
  TODO: {
      fail("futile to load data for implicit master. use load_implicit_master instead");
      return;
    }}
  my $data=prep_tabledata(@_);

  # code adapted from ConnectDots::LoadMaster, ConnectDots::LoadImpMaster, MainData::LoadData
  my $tablename=$master->tablename;
  my $idtype=$master->idtype;
  my $column_name=$idtype->name;
  my $column_sql_type=$idtype->sql_type;
  my $column_def="$column_name $column_sql_type";
  # NG 12-11-18: add _X_ column for history
  $column_def.=", _X_$column_name $column_sql_type" if $master->history;
  my $column_list=!$master->history? $column_name: " _X_$column_name, $column_name";

  # NG 12-09-30: no longer get here if master implicit
  # my $query=$master->query;

  my $dbh=$babel->autodb->dbh;
  # NG 12-08-24: moved DROPs out conditionals since master could be table in one babel
  #              and view in another
  $dbh->do(qq(DROP VIEW IF EXISTS $tablename));
  $dbh->do(qq(DROP TABLE IF EXISTS $tablename));
  # NG 12-09-30: no longer get here if master implicit
  # if ($master->view) {
  #   $dbh->do(qq(CREATE VIEW $tablename AS\n$query));
  #   return;
  # }
  my $sql=qq(CREATE TABLE $tablename ($column_def));
  # NG 12-09-30: no longer get here if master implicit
  # $sql.=" AS\n$query" if $master->implicit; # if implicit, load data via query
  $dbh->do($sql);
  # NG 12-09-30: no longer get here if master implicit
  # if (!$master->implicit) {
  # new code: insert data into table
  my @values=map {'('.join(', ',map {$dbh->quote($_)} @$_).')'} @$data;
  my $values=join(",\n",@values);
  $dbh->do(qq(INSERT INTO $tablename ($column_list) VALUES\n$values));
  # }
  # code adapted from MainData::LoadData Step
  $dbh->do(qq(ALTER TABLE $tablename ADD INDEX ($column_name)));
  # NG 12-11-18: add _X_ column for history
  $dbh->do(qq(ALTER TABLE $tablename ADD INDEX ("_X_$column_name"))) if $master->history;

}
# create universal relation (UR)
# algorithm: natual full outer join of all maptables and explicit masters
#            any pre-order traversal of schema graph will work (I think!)
# sadly, since MyQSL still lacks full outer joins, have to emulate with left/right
# joins plus union. do it step-by-step: I couldn't figure out how to do it in
# one SQL statement...
# >>> assume that lexical order of maptables gives a valid pre-order <<<
# NG 13-06-12: assumption above is certainly worng in some tests and probably wrong 
#              in many others.  scary this wasn't cught sooner.
#              as a first step towards a solution, add @tables arg to let caller
#              pass in a valid join order
sub load_ur {
  my($babel,$urname,@tables)=@_;
  $urname or $urname='ur';
  # if @tables not given, ASSUME lexical order of maptables is valid pre-order
  # this will change soon
  if (@tables) {
    @tables=map {$babel->name2maptable($_)} @tables;
  } else {
    @tables=sort {$a->tablename cmp $b->tablename} @{$babel->maptables};
  }
  # add in explicit Masters. order doesn't matter so long as they're last
  push(@tables,grep {$_->explicit} @{$babel->masters});
  # %column2type maps column_names to sql types
  my %column2type;
  my @idtypes=@{$babel->idtypes};
  @column2type{map {$_->name} @idtypes}=map {$_->sql_type} @idtypes;
  my @x_idtypes=grep {$_->history} @idtypes;
  @column2type{map {'_X_'.$_->name} @x_idtypes}=map {$_->sql_type} @x_idtypes;

  my $left=shift @tables;
  while (my $right=shift @tables) {
    my $result_name=@tables? undef: $urname; # final answer is 'ur'
    $left=full_join($babel,$left,$right,$result_name,\%column2type);
  }
  $left;
}
# NG 11-01-21: added 'translate all'
# NG 12-08-22: added 'filters'
# NG 12-09-04: rewrote to do filtering in Perl - seems more robust test strategy
# NG 12-09-21: added support for input_ids=>scalar, filters=>ARRAY,  
#              all semantics of filter=>undef
# NG 12-11-18: added support for histories
# NG 12-11-20: fixed input column for histories: 0th column is '_X_' if input has history
# NG 12-11-23: added validate
# NG 13-06-10: changed to do case insensitive comparisons, eg, searching for 'htt' as gene_symbol
# NG 13-10-14: added query
# select data from ur (will actually work for any table)
sub select_ur {
  my $args=new Hash::AutoHash::Args(@_);
  my($babel,$urname,$input_idtype,$input_ids,$output_idtypes,$filters,$query,$validate,$nocase)=
    @$args{qw(babel urname input_idtype input_ids output_idtypes filters query validate nocase)};
  confess "input_idtype must be set. call select_ur_sanity instead" unless $input_idtype;
  # confess "Only one of inputs_ids or input_ids_all may be set" if $input_ids && $input_ids_all;
  $urname or $urname=$args->tablename || 'ur';
  my $input_idtype=ref $input_idtype? $input_idtype->name: $input_idtype;
  if (defined $input_ids) {
    $input_ids=[$input_ids] unless ref $input_ids;
    confess "bad input id: ref or stringified ref"
      if grep {ref($_) || $_=~/ARRAY|HASH/} @$input_ids;
    # NG 12-11-14: drop duplicate input ids so validate won't get extra invalid ids
    $input_ids=[uniq(@$input_ids)];
  }
  my @output_idtypes=map {ref $_? $_->name: $_} @$output_idtypes;
  $filters=filters_array($filters) if ref $filters eq 'ARRAY';
  my @filter_idtypes=keys %$filters;
  
  my $dbh=$babel->autodb->dbh;
  # NG 10-08-25: removed 'uniq' since duplicate columns are supposed to be kept
  # my @columns=uniq grep {length($_)} ($input_idtype,@output_idtypes);
  # NG 12-09-04: include filter_idtypes so we can do filtering in Perl
  # NG 12-09-04: test for length obsolete, since input_idtype required
  # my @columns=grep {length($_)} ($input_idtype,@filter_idtypes,@output_idtypes);
  # NG 12-11-20: 0th column is '_X_' if input has history
  my @columns=((!_has_history($babel,$input_idtype)? $input_idtype: "_X_$input_idtype"),
	       @filter_idtypes,@output_idtypes);
  # NG 12-11-18: tack on filter history columns
  push(@columns,map {"_X_$_"} grep {_has_history($babel,$_)} @filter_idtypes);
  my $columns=join(', ',@columns);
  my $sql=qq(SELECT DISTINCT $columns FROM $urname WHERE $columns[0] IS NOT NULL);
  my $table=$dbh->selectall_arrayref($sql);
  # hang onto valid input ids if doing validate
  # NG 13-06-10: added 'lc' for case insensitive comparisons
  my %valid=map {lc $_->[0]=>1} @$table if $validate;
  # NG 13-10-14: added query. have to do query after validate.
  $sql.=" AND ($query)" if $query;
  my $table=$dbh->selectall_arrayref($sql);

  # now do filtering. columns are input, filters, then outputs, finally history columns
  my %name2idx=map {$columns[$_]=>$_} 0..$#columns;
  $table=filter_ur($table,0,$input_ids);
  for(my $j=0; $j<@filter_idtypes && @$table; $j++) {
    my $filter_ids=$filters->{$filter_idtypes[$j]};
    $table=filter_ur($table,$name2idx{"_X_$columns[$j+1]"}||$j+1,$filter_ids);
  }
  # remove filter_idtype columns
  map {splice(@$_,1,@filter_idtypes)} @$table;
  # NG 12-11-18: remove history columns
  map {splice(@$_,1+@output_idtypes)} @$table;
  # remove duplicate rows. dups can arise when filter columns spliced out
  $table=uniq_rows($table);

  # NG 10-11-10: remove rows whose output columns are all NULL, because translate now skips these
  # NG 12-09-04: rewrote loop to one-liner below
  # NG 12-11-23: don't remove NULL rows when validate set
  unless ($validate) {
    @$table=grep {my @row=@$_; grep {defined $_} @row[1..$#row]} @$table if @output_idtypes;
  } else {
    # %id2valid maps input ids to validity
    # %have_id tells which input ids are in result
    # @missing_ids are input ids not in result - some are valid, some not
    $input_ids=[keys %valid] unless $input_ids; # input_ids_all
    # NG 13-06-10: added 'lc' for case insensitive comparisons
    my %id2valid=map {my $id=lc $_; $id=>$valid{$id}||0} @$input_ids;
    my %have_id=map {lc $_->[0]=>1} @$table;
    my @missing_ids=grep {!$have_id{lc $_}} @$input_ids;
    # existing rows are valid - splice in 'valid' column
    map {splice(@$_,1,0,1)} @$table;
    # add rows for missings ids - some valid, some not
    # NG 13-06-15: added '||0' for case insensitive comparisons
    push(@$table,map {[$_,$id2valid{$_}||0,(undef)x@$output_idtypes]} @missing_ids);
  }
  # NG 13-07-15: remove partial duplicates
  $table=remove_pdups($table) unless $args->keep_pdups;
  $table;
}

sub remove_pdups {
  my $table=shift;

  my %groups=group {$_->[0]} @$table;
  my $pseudo_dups=0;
  $table=[];
  for my $group (values %groups) {
    push(@$table,@$group),next unless scalar(@$group)>1;
    my @rows=sort {undefs($a)<=>undefs($b)} @$group;
    my %rows=map {$_=>$rows[$_]} (0..$#rows);
    my %pseudo_dups;
    for(my $i=0; $i<@rows-1; $i++) {
      next unless $rows{$i};
      for(my $j=$i+1; $j<@rows; $j++) {
	next unless $rows{$j};
	delete $rows{$j}, $pseudo_dups++ if pseudo_dup($rows[$i],$rows[$j]);
      }
    }
    push(@$table,values %rows);
  }
  # TODO: put this under some sort of flag...
  # diag('+++ select_ur pseudo_dups='.$pseudo_dups) if $pseudo_dups;
  # diag('+++ select_ur pseudo_dups='.$pseudo_dups);

  $table;
}
# used to sort by number of undefs
sub undefs {scalar grep {!defined $_} @{$_[0]};}
# row $j is pseudo-dup of $i if they agree wherever both defined, else $j is undef
sub pseudo_dup {
  my($rowi,$rowj)=@_;
  for(my $k=0; $k<=@$rowi; $k++) {
    return 0 if defined $rowj->[$k] && $rowi->[$k] ne $rowj->[$k];
  }
  1;
}

# NG 13-06-10: changed to do case insensitive comparisons
sub filter_ur {
  my($table,$col,$ids)=@_;
  if (defined $ids) {
    $ids=[$ids] unless ref $ids;
    confess "bad filter id for column $col: ref or stringified ref"
      if grep {ref($_) || $_=~/ARRAY|HASH/} @$ids;
    if (@$ids) {
      my(@table1,@table2);
      my @defined_ids=grep {defined $_} @$ids;
      # NG 12-10-29: changed pattern to match entire field
      my $pattern=join('|',map {"\^$_\$"} @defined_ids);
      # NG 13-06-01: added 'i' for case insensitive matching
      $pattern=qr/$pattern/i;
      @table1=grep {$_->[$col]=~/$pattern/} @$table if @defined_ids;
      @table2=grep {!defined $_->[$col]} @$table if @defined_ids!=@$ids;
      @$table=(@table1,@table2);
    } else {			# empty list of ids - result empty
      @$table=();
    }
  } else {			# filter=>undef
    @$table=grep {defined $_->[$col]} @$table;
  }
  $table;
}
# NG 13-06-10: changed to do case insensitive comparisons
# remove duplicate rows from table
sub uniq_rows {
  my($rows)=@_;
  # NG 13-06-10: added 'lc' for case insensitive comparisons
  my @row_strings=map {lc join($;,@$_)} @$rows;
  my %seen;
  my @uniq_rows;
  for(my $i=0; $i<@$rows; $i++) {
    my $row_string=$row_strings[$i];
    push(@uniq_rows,$rows->[$i]) unless $seen{$row_string}++;
  }
  # $uniq_rows;
  wantarray? @uniq_rows: \@uniq_rows;
}
# NG 13-06-21: moved from 050/translate.pm and renamed from grep_table
sub grep_rows {
  my($table,$col,$ids)=@_;
  my $pattern=join('|',map {"\^$_\$"} @$ids);
  $pattern=qr/$pattern/;
  [grep {$_->[$col]=~/$pattern/} @$table];
}
# NG 13-06-19: sort arrays of names, typically output subsets produced by power_set
# NG 13-07-16: renamed 
sub sort_rows {
  sort {@$a<=>@$b || first {$_} map {$a->[$_] cmp $b->[$_]} 0..$#$a} @_;
}
# make SQL IN clauses for test queries. 
# arguments are 1) single value, 2) ARRAY of values, or 3) column=>values pairs
sub sql_in {
  my $sql;
  if (@_==1) {
    my @values=flatten(shift);
    $sql='IN ('.join(',',map {"'$_'"} @values).')';
  } else {
    my %args=@_;
    my @sql;
    while (my($column,$values)=each %args) {
      my @values=flatten($values);
      push(@sql,"$column IN (".join(',',map {"'$_'"} @values).')');
    }
    $sql=join(' AND ',@sql);
  }
  $sql;
}

# process filters ARRAY - a bit hacky 'cuz filter=>undef not same as filter=>[undef]
sub filters_array {
  my @filters=@{$_[0]};
  my(%filters,%filter_undef);
  # code adapted from Hash::AutoHash::MultiValued
  while (@filters>1) { 
    my($key,$value)=splice @filters,0,2; # shift 1st two elements
    if (defined $value || $filter_undef{$key}) { 
      # store value if defined or key has multiple occurrences of undef
      my $list=$filters{$key}||($filters{$key}=[]);
      if (defined $value) {
	push(@$list,$value) unless ref $value;
	push(@$list,@$value) if ref $value;
      }
    } else {
      $filter_undef{$key}++;
    }}
  # add the undefs to %filters
  for my $key (keys %filter_undef) {
    my $list=$filters{$key};
    if (defined $list) {
      push(@$list,undef);
    } else {
      $filters{$key}=undef;
    }
   }
  \%filters;
}

# NG 12-09-04: separated ur sanity tests from real tests
sub select_ur_sanity {
  my $args=new Hash::AutoHash::Args(@_);
  my($babel,$urname,$output_idtypes)=@$args{qw(babel urname output_idtypes)};
  my @output_idtypes=map {ref $_? $_->name: $_} @$output_idtypes;

  my $dbh=$babel->autodb->dbh;
  my $columns=join(', ',@output_idtypes);
  my $sql=qq(SELECT DISTINCT $columns FROM $urname);
  my $table=$dbh->selectall_arrayref($sql);

  # remove NULL rows (probably aren't any)
  @$table=grep {my @row=@$_; grep {defined $_} @row} @$table;
  $table;
}
# NG 12-09-23: added count_ur. simple wrapper around select_ur
sub count_ur {
  my $table=select_ur(@_);
  scalar @$table;
}
# NG 12-11-18: check that table exists and is non-empty
sub check_table {
  my($babel,$table,$label)=@_;
  my $dbh=$babel->autodb->dbh;
  my $ok=1;
  my $sql=qq(SHOW TABLES LIKE '$table');
  my $tables=$dbh->selectcol_arrayref($sql);
  $ok&&=report_fail(!$dbh->err,"$label database query failed: ".$dbh->errstr) or return 0;
  $ok&&=report_fail(scalar @$tables,"$label table $table does not exist") or return 0;
  $ok&&=cmp_quietly($tables,[$table],"$label SHOW TABLES got incorrect result") or return 0;
  my $sql=qq(SELECT COUNT(*) FROM $table);
  my($count)=$dbh->selectrow_array($sql);
  $ok&&=report_fail(!$dbh->err,"$label database query failed: ".$dbh->errstr) or return 0;
  report_fail($count,"$label table $table is empty");
}
# NG 12-11-18: check database for sanity
sub check_database_sanity {
  my($babel,$label,$num_maptables)=@_;
  my $ok=1;
  $ok&&=check_maptables_sanity($babel,"$label check maptables",$num_maptables);
  $ok&&=check_masters_sanity($babel,"$label check masters");
  $ok;
}

# NG 12-11-18: check maptables for sanity
sub check_maptables_sanity {
  my($babel,$label,$num_maptables)=@_;
  my $dbh=$babel->autodb->dbh;
  my $ok=1;
  my @maptables=@{$babel->maptables};
  $ok&&=
    is_quietly($num_maptables,scalar @maptables,"$label BAD NEWS: number of maptables wrong!!")
      or return 0;
  for my $table (map {$_->name} @maptables) {
    $ok&&=check_table($babel,$table,"$label MapTable $table");
  }
  $ok;
}
# NG 12-11-18: check master tables for sanity
sub check_masters_sanity {
  my($babel,$label)=@_;
  my $dbh=$babel->autodb->dbh;
  my $ok=1;
  my @maptables=@{$babel->maptables};
  for my $maptable (@maptables) {
    my $maptable_name=$maptable->name;
    my @idtypes=@{$maptable->idtypes};
    for my $idtype (@idtypes) {
      my $idtype_name=$idtype->name;
      my $master=$idtype->master;
      my $master_name=$master->name;
      $ok&&=is_quietly
	($master_name,"${idtype_name}_master", "$label BAD NEWS: master name wrong!!") 
	  or return 0;
      my $sql=qq(SELECT $idtype_name FROM $maptable_name WHERE $idtype_name NOT IN 
                  (SELECT $idtype_name FROM $master_name));
      my $missing=$dbh->selectcol_arrayref($sql);
      $ok&&=report_fail(!$dbh->err,"$label database query failed: ".$dbh->errstr) or return 0;
      $ok&&=report_fail(@$missing==0,"$label some ids in $maptable_name missing from $master_name; here are a few: ".join(', ',@$missing[0..2])) or return 0;
    }
  }
  $ok;
}

# cmp ARRAYs of Babel component objects (anything with an 'id' method will work)
# like cmp_bag but 
# 1) reports errors the way we want them
# 2) sorts the args to avoid Test::Deep's 'bag' which is ridiculously slow...
sub cmp_objects {
  my($actual,$correct,$label,$file,$line)=@_;
  my $ok=cmp_objects_quietly($actual,$correct,$label,$file,$line);
  report_pass($ok,$label);
}
sub cmp_objects_quietly {
  my($actual,$correct,$label,$file,$line)=@_;
  my @actual_sorted=sort {$a->id cmp $b->id} @$actual;
  my @correct_sorted=sort  {$a->id cmp $b->id} @$correct;
  cmp_quietly(\@actual_sorted,\@correct_sorted,$label,$file,$line);
}
# like cmp_bag but 
# 1) reports errors the way we want them
# 2) sorts the args to avoid Test::Deep's 'bag' which is ridiculously slow...
# NG 10-11-08: extend to test limit. CAUTION: limit should be small or TOO SLOW!
sub cmp_table {
  my($actual,$correct,$label,$file,$line,$limit)=@_;
  my $ok=cmp_table_quietly($actual,$correct,$label,$file,$line,$limit);
  report_pass($ok,$label);
}
# NG 13-06-10: added cmp_table_nocase for case insensitive comparisons, eg,
#   searching for 'htt' as gene_symbol
sub cmp_table_nocase {
  my($actual,$correct,$label,$file,$line,$limit)=@_;
  my @actual=map {_lc_row($_)} @$actual;
  my @correct=map {_lc_row($_)} @$correct;
  my $ok=cmp_table_quietly(\@actual,\@correct,$label,$file,$line,$limit);
  report_pass($ok,$label);
}
sub _lc_row {
  my @row=@{$_[0]};
  @row=map {lc($_)} @row;
  \@row;
}

sub cmp_table_quietly {
  my($actual,$correct,$label,$file,$line,$limit)=@_;
  unless (defined $limit) {
    my @actual_sorted=sort cmp_rows @$actual;
    my @correct_sorted=sort cmp_rows @$correct;
    # my $ok=cmp_quietly($actual,bag(@$correct),$label,$file,$line);
    return cmp_quietly(\@actual_sorted,\@correct_sorted,$label,$file,$line);
  } else {
    my $correct_count=min(scalar(@$correct),$limit);
    report_fail(@$actual==$correct_count,
		"$label: expected $correct_count row(s), got ".scalar @$actual,$file,$line)
      or return 0;
    return cmp_quietly($actual,subbagof(@$correct),$label,$file,$line);
  }
  1;
}
# cmp_op & cmp_op_quietly used for merged translate/count tests
# $actual can be table or count
# $correct always table
# $op is 'translate' or 'count'
sub cmp_op {
  my($actual,$correct,$op,$label,$file,$line,$limit)=@_;
  if ($op eq 'translate') {
    cmp_table($actual,$correct,$label,$file,$line,$limit);
  } elsif ($op eq 'count') {
    $correct=@$correct;
    $correct=min($correct,$limit) if defined $limit;
    my($ok,$details)=cmp_details($actual,$correct);
    report($ok,$label,$file,$line,$details);
  } else {
    confess "Unknow op $op: should be 'translate' or 'count'";
  }
}

sub cmp_op_quietly {
  my($actual,$correct,$op,$label,$file,$line,$limit)=@_;
  if ($op eq 'translate') {
    cmp_table_quietly($actual,$correct,$label,$file,$line,$limit);
  } elsif ($op eq 'count') {
    $correct=@$correct;
    $correct=min($correct,$limit) if defined $limit;
    cmp_quietly($actual,$correct,$label,$file,$line);
  } else {
    confess "Unknow op $op: should be 'translate' or 'count'";
  }
}
# used by big IN tests, because cmp_op way too slow. assumes $correct bigger than $actual
# quiet, even though name doesn't say so
sub cmp_op_quickly {
  my($actual,$correct,$op,$label,$file,$line,$limit)=@_;
  my $correct_count=defined $limit? min(@$correct,$limit): @$correct;
  if ($op eq 'count') {
    return cmp_quietly($actual,$correct_count,$label,$file,$line);
  } elsif ($op eq 'translate') {
    my $actual_count=@$actual;
    my $ok=cmp_quietly($actual_count,$correct_count,$label,$file,$line) or return 0;
    my %correct=map {join($;,@$_)=>1} @$correct;
    my @actual=map {join($;,@$_)} @$actual;
    my @bad=grep {!$correct{$_}} @actual;
    return 1 unless @bad;
    ($file,$line)=called_from($file,$line);
    fail($label);
    diag("from $file line $line") if defined $file;
    diag('actual has ',scalar(@bad),' row(s) that are not in correct',"\n",
	 'sorry I cannot provide details...');
    return 0;
  } else {
    confess "Unknown op $op: should be 'translate' or 'count'" ;
  }
}
# sort subroutine: $a, $b are ARRAYs of strings. should be same lengths. cmp element by element
sub cmp_rows {
  my $ret;
  for (0..$#$a) {
    return $ret if $ret=$a->[$_] cmp $b->[$_];
  }
  # equal up to here. if $b has more, then $a is smaller
  $#$a <=> $#$b;
}
# NG 13-06-12: construct pre-order traversal of schema 'table-graph';
sub order_tables {
  my($babel)=@_;
  my $graph=new Graph::Undirected;
  my @tables=@{$babel->maptables};
  for my $table (@tables) {
    my @idtypes=@{$table->idtypes};
    my @neighbors=grep {$_!=$table} map {@{$_->maptables}} @idtypes;
    my $me=$table->name;
    map {$graph->add_edge($me,$_->name)} @neighbors;
  }
  my $tree=$graph->minimum_spanning_tree;
  # now do a pre-order traversal, eg, depth-first
  my @nodes=$tree->vertices;
  my %avbl=map {$_=>1} @nodes;
  my @future=$nodes[0];
  my @traversal;
  while (@future) {
    my $node=shift @future;
    next unless $avbl{$node};	# don't process if already visited
    push(@traversal,$node);
    $avbl{$node}=0;		# mark as visited
    push(@future,grep {$avbl{$_}} $tree->neighbors($node));
  }
  wantarray? @traversal: \@traversal;
}
# emulate natural full outer join. return result table
# $result is optional name of result table. if not set, unique name generated
# TODO: add option to delete intermediate tables as we go.
sub full_join {
  my($babel,$left,$right,$resultname,$column2type)=@_;
  my $leftname=$left->tablename;
  my $rightname=$right->tablename;
 # left is usually t::FullOuterJoinTable but can be MapTable or Master
  my @column_names=
    $left->isa('t::FullOuterJoinTable')? @{$left->column_names}: map {$_->name} @{$left->idtypes};
  # right is always MapTable or Master
  push(@column_names,map {$_->name} @{$right->idtypes});
  # NG 12-11:18: added histories
  push(@column_names,'_X_'.$left->idtype->name)
    if $left->isa('Data::Babel::Master') && $left->history;
  push(@column_names,'_X_'.$right->idtype->name)
    if $right->isa('Data::Babel::Master') && $right->history;

  @column_names=uniq(@column_names);
  my @column_defs=map {$_.' '.$column2type->{$_}} @column_names;
  my $column_names=join(', ',@column_names);
  my $column_defs=join(', ',@column_defs);
  
  my $result=new t::FullOuterJoinTable(name=>$resultname,column_names=>\@column_names);
  $resultname=$result->tablename;
  # code adapted from MainData::LoadData Step
  my $dbh=$babel->autodb->dbh;
  $dbh->do(qq(DROP TABLE IF EXISTS $resultname));
  my $column_list=join(', ',@column_defs);
  my $query=qq
    (SELECT $column_names FROM $leftname NATURAL LEFT OUTER JOIN $rightname
     UNION
     SELECT $column_names FROM $leftname NATURAL RIGHT OUTER JOIN $rightname);
  $dbh->do(qq(CREATE TABLE $resultname ($column_list) AS\n$query));
  $result;
}
# drop all tables and views associated with Babel tests
#   arg is generally AutoDB
#   do at start, rather than end, to leave bread crumbs for post-run debugging
sub cleanup_db {
  my($autodb,$keep_ur)=@_;
  my $dbh=$autodb->dbh;
  my @tables=(@{$dbh->selectcol_arrayref(qq(SHOW TABLES LIKE '%maptable%'))},
	      @{$dbh->selectcol_arrayref(qq(SHOW TABLES LIKE '%master%'))});
  map {$dbh->do(qq(DROP TABLE IF EXISTS $_))} @tables;
  map {$dbh->do(qq(DROP VIEW IF EXISTS $_))} @tables;
  cleanup_ur($dbh) unless $keep_ur;
}
# arg is dbh, autodb, or babel. clean up intermediate tables created en route to ur
sub cleanup_ur {t::FullOuterJoinTable->cleanup(@_) }

########################################
# these functions test our 'standard' hand-crafted Babel & components

sub check_handcrafted_idtypes {
  my($actual,$mature,$label)=@_;
  $label or $label='idtypes'.($mature? ' (mature)': '');
  my $num=4;
  my $class='Data::Babel::IdType';
  report_fail(@$actual==$num,"$label: number of elements") or return 0;
  my @actual=sort_objects($actual,$label) or return 0;
  for my $i (0..$#actual) {
    my $actual=$actual[$i];
    my $suffix='00'.($i+1);
    report_fail(UNIVERSAL::isa($actual,$class),"$label object $i: class") or return 0;
    report_fail($actual->name eq "type_$suffix","$label object $i: name") or return 0;
    report_fail($actual->id eq "idtype:type_$suffix","$label object $i: id") or return 0;
    report_fail($actual->display_name eq "display_name_$suffix",
		"$label object $i: display_name") or return 0;
    report_fail($actual->referent eq "referent_$suffix","$label object $i: referent") or return 0;
    report_fail($actual->defdb eq "defdb_$suffix","$label object $i: defdb") or return 0;
    report_fail($actual->meta eq "meta_$suffix","$label object $i: meta") or return 0;
    report_fail($actual->format eq "format_$suffix","$label object $i: format") or return 0;
    report_fail($actual->sql_type eq "VARCHAR(255)","$label object $i: sql_type") or return 0;
    report_fail(as_bool($actual->internal)==0,"$label object $i: internal") or return 0;
    report_fail(as_bool($actual->external)==1,"$label object $i: external") or return 0;
    if ($mature) {
      my $babel=$actual->babel;
      check_object_basics($babel,'Data::Babel','test',"$label object $i babel");
      check_object_basics($actual->master,'Data::Babel::Master',
			  "type_${suffix}_master","$label object $i master");
      # NG 13-09-24: added tests for maptables
      my $maptables=$actual->maptables;
      map {isa_ok_quietly($_,'Data::Babel::MapTable',"$label object $i maptable")} @{$maptables}
	or return 0;
      my @correct_maptables=
	($suffix eq '001'? map {$babel->name2maptable("maptable_$_")} qw(001):
	 ($suffix eq '002'? map {$babel->name2maptable("maptable_$_")} qw(001 002): 
	  ($suffix eq '003'? map {$babel->name2maptable("maptable_$_")} qw(002 003): 
	   ($suffix eq '004'? map {$babel->name2maptable("maptable_$_")} qw(003):
	    confess "Unexpected suffix $suffix"))));
      cmp_bag_quietly($maptables,\@correct_maptables,"$label object $i maptables") or return 0;
    }
  }
  pass($label);
}

# masters 2&3 are implicit, hence some of their content is special
# NG 10-11-10: implicit Masters now have clauses to exclude NULLs in their queries
sub check_handcrafted_masters {
  my($actual,$mature,$label)=@_;
  $label or $label='masters'.($mature? ' (mature)': '');
  my $num=$mature? 4: 2;
  my $class='Data::Babel::Master';
  report_fail(@$actual==$num,"$label: number of elements") or return 0;
  my @actual=sort_objects($actual,$label) or return 0;
  for my $i (0..$#actual) {
    my $actual=$actual[$i];
    my $suffix='00'.($i+1);
    my $name="type_${suffix}_master";
    my $id="master:$name";
    # NG 13-09-02: DEPRECATED workflow related attributes
    my($query,$view,$implicit);
    # masters 2&3 are implicit, hence some of their content is special
    if ($i>=2) {
      $implicit=1; $query=1;
      if ($i==3) {
	$view=1;      
      }}
    isa_ok_quietly($actual,$class,"$label object $i: class") or return 0;
    report_fail($actual->name eq $name,"$label object $i: name") or return 0;
    report_fail($actual->id eq $id,"$label object $i: id") or return 0;
    report_fail(as_bool($actual->implicit)==$implicit,"$label object $i: implicit") or return 0;
    if ($mature) {
      report_fail(as_bool($actual->query)==$query,"$label object $i: query") or return 0;
      report_fail(as_bool($actual->view)==$view,"$label object $i: view") or return 0;
      check_object_basics($actual->babel,'Data::Babel','test',"$label object $i babel");
      check_object_basics($actual->idtype,'Data::Babel::IdType',
			  "type_$suffix","$label object $i idtype");
    }
  }
  pass($label);
}

sub check_handcrafted_maptables {
  my($actual,$mature,$label)=@_;
  $label or $label='maptables'.($mature? ' (mature)': '');
  my $num=3;
  my $class='Data::Babel::MapTable';
  report_fail(@$actual==$num,"$label: number of elements") or return 0;
  my @actual=sort_objects($actual,$label) or return 0;
  # NG 13-09-02: DEPRECATED workflow related attributes
  for my $i (0..$#actual) {
    my $actual=$actual[$i];
    my $suffix='00'.($i+1);
    my $suffix1='00'.($i+2);
    my $name="maptable_$suffix";
    my $id="maptable:$name";
    report_fail(UNIVERSAL::isa($actual,$class),"$label object $i: class") or return 0;
    report_fail($actual->name eq $name,"$label object $i: name") or return 0;
    report_fail($actual->id eq $id,"$label object $i: id") or return 0;
    # NG 13-06-12: compare as sets 'cuz perl 5.18 no longer preserves order
     if ($mature) {
      check_object_basics($actual->babel,'Data::Babel','test',"$label object $i babel");
      check_objects_basics($actual->idtypes,'Data::Babel::IdType',
			  ["type_$suffix","type_$suffix1"],"$label object $i idtypes");
    }
  }
  pass($label);
}

sub check_handcrafted_name2idtype {
  my($babel)=@_;
  my $label='name2idtype';
  my %name2idtype=map {$_->name=>$_} @{$babel->idtypes};
  for my $name (qw(type_001 type_002 type_003 type_004)) {
    my $actual=$babel->name2idtype($name);
    report_fail($actual==$name2idtype{$name},"$label: object $name") or return 0;
  }
  pass($label);
}
sub check_handcrafted_name2master {
  my($babel)=@_;
  my $label='name2master';
  my %name2master=map {$_->name=>$_} @{$babel->masters};
  for my $name (qw(type_001 type_002 type_003 type_004)) {
    my $actual=$babel->name2master($name);
    report_fail($actual==$name2master{$name},"$label: object $name") or return 0;
  }
  pass($label);
}
sub check_handcrafted_name2maptable {
  my($babel)=@_;
  my $label='name2maptable';
  my %name2maptable=map {$_->name=>$_} @{$babel->maptables};
  for my $name (qw(type_001 type_002 type_003 type_004)) {
    my $actual=$babel->name2maptable($name);
    report_fail($actual==$name2maptable{$name},"$label: object $name") or return 0;
  }
  pass($label);
}
sub check_handcrafted_id2object {
  my($babel)=@_;
  my $label='id2object';
  my @objects=(@{$babel->idtypes},@{$babel->masters},@{$babel->maptables});
  my %id2object=map {$_->id=>$_} @objects;
  my @ids=
    (qw(idtype:type_001 idtype:type_002 idtype:type_003 idtype:type_004),
     qw(master:type_001_master master:type_002_master master:type_003_master master:type_004_master),
     qw(maptable:maptable_001 maptable:maptable_002 maptable:maptable_003));
  for my $id (@ids) {
    my $actual=$babel->id2object($id);
    report_fail($actual==$id2object{$id},"$label: object $id") or return 0;
  }
  pass($label);
}
sub check_handcrafted_id2name {
  my($babel)=@_;
  my $label='id2name';
  my @ids=
    (qw(idtype:type_001 idtype:type_002 idtype:type_003 idtype:type_004),
     qw(master:type_001_master master:type_002_master master:type_003_master master:type_004_master),
     qw(maptable:maptable_001 maptable:maptable_002 maptable:maptable_003));
  my @names=
    (qw(type_001 type_002 type_003 type_004),
     qw(type_001_master type_002_master type_003_master type_004_master),
     qw(maptable_001 maptable_002 maptable_003));
  my %id2name=map {$ids[$_]=>$names[$_]} (0..$#ids);
  for my $id (@ids) {
    my $actual=$babel->id2name($id);
    report_fail($actual eq $id2name{$id},"$label: object $id") or return 0;
  }
  pass($label);
}

sub load_handcrafted_maptables {
  my($babel,$data)=@_;
  for my $name (qw(maptable_001 maptable_002 maptable_003)) {
    load_maptable($babel,$name,$data->$name->data);
  }
}
sub load_handcrafted_masters {
  my($babel,$data)=@_;
  # explicit masters
  for my $name (qw(type_001_master type_002_master)) {
    load_master($babel,$name,$data->$name->data);
  }
  # # NG 12-09-27. loop below no subsumed in load_implicit_masters
  # # implicit masters have no data
  # for my $name (qw(type_003_master type_004_master)) {
  #   load_master($babel,$name);
  # }
}
# NG 12-09-27: added load_implicit_masters and test below
# must be called after maptables loaded
sub check_implicit_masters {
  my($babel,$data,$label,$file,$line)=@_;
  my $dbh=$babel->dbh;
  my $ok=1;
  for my $master (grep {$_->implicit} @{$babel->masters}) {
    my $name=$master->name;
    my $correct=prep_tabledata($data->$name->data);
    my $actual=$dbh->selectall_arrayref(qq(SELECT * FROM $name));
    $ok&&=cmp_table_quietly($actual,$correct,"$label: $name",$file,$line);
  }
  report_pass($ok,$label);
}

########################################
# utility functions for history idtypes
# arg is IdType object or name
sub _has_history {
  my($babel,$idtype)=@_;
  ref $idtype or $idtype=$babel->name2idtype($idtype);
  $idtype->history;
}
# sub _history_name {
#   my($babel,$idtype)=@_;
#   ref $idtype and $idtype=$idtype->name;
#   "_X_$idtype";
# }

# debugging functions

# print names 
sub pnames {
  my $sep=shift;
  print join($sep,map {$_->name} @_);
}
# print graph, typically as sif
sub pgraph {
  my($graph,$file,$format)=@_;
  length($format)? $format=lc($format): ($format='sif');
  confess "Invalid format $format: must be sif or txt" unless $format=~/sif|txt/;
  if ($file) {
    open(OUT,'>',$file) || confess "Cannot create output file $file: $!";
  } else {
    *OUT=*STDOUT;
  }
  if ($format eq 'sif') {
    print OUT join("\n",map {my($v0,$v1)=@$_;"$v0 - $v1"} $graph->edges),"\n";
  } else {
    print OUT '  ',join("\n  ",map {_edge_str($graph,$_)} _sort_edges($graph->edges)),"\n";
  }
  close OUT if $file;
}
# # NG 13-10-05 print table dumps to track down FAILs seen by 
# #              David Cantrell (reports 34101829, 34102877)
# sub diag_rows {
#   my($rows)=@_;
#   my @diag='----------';
#   for my $row (@$rows) {
#     # replace undef by NULL
#     push(@diag,join("\t",map {defined $_? $_: 'NULL'} @$row));
#   }
#   push(@diag,'----------');
#   my $diag=join("\n",@diag);
#   diag($diag);
#   return 1;
# }
# # NG 13-09-15: print table dumps to track down FAILs seen by 
# #              David Cantrell (reports 34101829, 34102877)
# sub diag_table {
#   my($table,@cols)=@_;
#   my $cols=@cols? join(',',@cols): '*';
#   my $sth=$dbh->prepare(qq(SELECT $cols FROM $table)) or goto FAIL;
#   $sth->execute() or goto FAIL;
#   my @cols=@{$sth->{NAME}};
#   my $rows=$sth->fetchall_arrayref() or goto FAIL;
#   my @diag=("table $table:",join("\t",@cols));
#   for my $row (@$rows) {
#     # replace undef by NULL
#     push(@diag,join("\t",map {defined $_? $_: 'NULL'} @$row));
#   }
#   push(@diag,'----------');
#   my $diag=join("\n",@diag);
#   diag($diag);
#   return 1;
#  FAIL:
#   fail("dump table $table");
#   diag("While trying to dump table $table for diagnostic purposes, we got the following DBI error message\n".DBI->errstr);
#   return 0;
# }
1;

package t::FullOuterJoinTable;
# simple class to represent intermediate tables used to emulate full outer joins
use strict;
use Carp;
use Class::AutoClass;
use vars qw(@AUTO_ATTRIBUTES @OTHER_ATTRIBUTES @CLASS_ATTRIBUTES %SYNONYMS %DEFAULTS);
use base qw(Class::AutoClass);

@AUTO_ATTRIBUTES=qw(name column_names);
@OTHER_ATTRIBUTES=qw(seqnum);
@CLASS_ATTRIBUTES=qw();
%SYNONYMS=(tablename=>'name',columns=>'column_names');
%DEFAULTS=(column_names=>[]);
Class::AutoClass::declare;

our $seqnum=0;
sub seqnum {shift; @_? $seqnum=$_[0]: $seqnum}

sub _init_self {
  my($self,$class,$args)=@_;
  return unless $class eq __PACKAGE__; # to prevent subclasses from re-running this
  my $name=$self->name || $self->name('fulljoin_'.sprintf('%03d',++$seqnum));
}
sub cleanup {
  my($class,$obj)=@_;
  my $dbh;
  if (ref $obj) {$dbh=$obj->isa('DBI::db')? $obj: $obj->dbh;}
  else {$dbh=Data::Babel->autodb->dbh;}

  # drop all tables that look like our intermediates
  my @tables=@{$dbh->selectcol_arrayref(qq(SHOW TABLES LIKE 'fulljoin_%'))};
  # being a bit paranoid, make sure each table ends with 3 digits
  @tables=grep /\d\d\d$/,@tables;
  map {$dbh->do(qq(DROP TABLE IF EXISTS $_))} @tables;

  # drop ur
  $dbh->do(qq(DROP TABLE IF EXISTS ur));
}
1;