The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/env perl

use warnings;
use strict;

use Data::Dumper;
use Carp;

use 5.010;

use Bio::Gonzales::Seq::IO qw/faiterate/;
use Digest::SHA1 qw/sha1_hex/;
use Getopt::Long::Descriptive;

my ( $opt, $usage ) = describe_options(
  '%c %o <file1.fa> <file2.fa>',
  ['diff fasta files based on id, description and sequence identity.'],
  ['Deactivate options with --no<option>, e.g. %c --noseq'],
  [],
  [ 'seq|s!',  'check if sequences are identical. (On by default)',    { default => 1 } ],
  [ 'id|i!',   'check if ids are identical. (On by default)',          { default => 1 } ],
  [ 'desc|d!', 'check if descriptions are identical. (Off by default)', { default => 0 } ],
  [ 'rm_gaps', 'remove gaps',                                          { default => 0 } ],
  [],
  [ 'verbose|v', "print extra stuff" ],
  [ 'help',      "print usage message and exit" ],
  [],
  ["Report bugs to jw\@bargsten.org"],
);

print( $usage->text ), exit if $opt->help;

my ( $afile, $bfile ) = @ARGV;

print( $usage->text ), exit
  unless ( $afile && $bfile && -f $afile && -f $bfile );

say "--- $afile";
my $aiter = faiterate $afile;

say "+++ $bfile";
my $biter = faiterate $bfile;

my %compare;

hash_seqs( $aiter, \%compare, 'a' );
hash_seqs( $biter, \%compare, 'b' );

compare( \%compare, 'a', 'b' );

sub compare {
  my ( $diff, $taga, $tagb ) = @_;

  for my $v ( values %$diff ) {

    if ( exists( $v->{$taga} ) && exists( $v->{$tagb} ) ) {
      #is there a difference between them? the only chance could be the number of duplicates
      #since the key encodes already the rest/uniqueness
      unless ( @{ $v->{$taga}{duplicates} } == @{ $v->{$tagb}{duplicates} } ) {
        say "- " . join " ", @{ $v->{$taga} }{ 'id', 'desc' },
          ">>>> num duplicates: " . scalar @{ $v->{$taga}{duplicates} };
        say "+ " . join " ", @{ $v->{$tagb} }{ 'id', 'desc' },
          ">>>> num duplicates: " . scalar @{ $v->{$tagb}{duplicates} };
      } elsif ( $opt->verbose ) {
        say "= "
          . join( " <> ", @{ $v->{$taga} }{ 'sha', 'id', 'desc' } ) . " // "
          . join( " <> ", @{ $v->{$tagb} }{ 'sha', 'id', 'desc' } );

      }
    } elsif ( exists( $v->{$taga} ) ) {
      #only a
      say "- " . join " <> ", @{ $v->{$taga} }{ 'sha', 'id', 'desc' };
    } elsif ( exists( $v->{$tagb} ) ) {
      #only b
      say "+ " . join " <> ", @{ $v->{$tagb} }{ 'sha', 'id', 'desc' };
    }
  }

  return;
}

sub hash_seqs {
  my ( $siter, $diff, $tag ) = @_;

  while ( my $s = $siter->() ) {
    my @key;

    my $sha = "n/a";
    if ( $opt->seq ) {
      my $seq = $s->seq;
      $seq =~ y/-.//d
        if ( $opt->rm_gaps );
      $sha = sha1_hex($seq);
      push @key, $sha;
    }

    push @key, $s->id
      if ( $opt->id );

    push @key, $s->desc
      if ( $opt->desc );

    my $key = join "_", @key;

    $diff->{$key} //= {};

    if ( exists( $diff->{$key}{$tag} ) ) {
      # we have duplicates within one file
      push @{ $diff->{$key}{$tag}{duplicates} }, { id => $s->id, desc => $s->desc, sha => $sha };
    } else {
      $diff->{$key}{$tag} = { id => $s->id, desc => $s->desc, sha => $sha, duplicates => [] };
    }
  }
  return;
}

__END__

=head1 NAME

fadiff - diff fasta files

=head1 SYNOPSIS

    ./fadiff [OPTIONS] <FILE_1.FA> <FILE_2.FA>

=head1 DESCRIPTION


=head1 OPTIONS

=over 4

=item B<< --seq (--noseq) >>

Check (or not) if sequences are identical.
Enabled by default.

=item B<< --id (--noid) >>

Check (or not) if ids are identical.
Enabled by default.

=item B<< --desc (--nodesc) >>

Check (or not) if descriptions are identical.
Enabled by default.

=back

=head1 SEE ALSO

L<Bio::Gonzales::Seq::IO>

=head1 AUTHOR

jw bargsten, C<< <joachim.bargsten at wur.nl> >>

=cut