The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package Bio::Gonzales::Domain::Group;

use Mouse;

use warnings;
use strict;
use Carp;
use Data::Dumper;
use Bio::Gonzales::Feat::IO::GFF3;

use 5.010;
use List::MoreUtils qw/uniq first_value any/;

our $VERSION = '0.073'; # VERSION

#my $q            = Bio::Gonzales::Search::IO::HMMER3->new( file => $out )->parse;
has search_result               => ( is => 'rw', required   => 1 );
has required_domains            => ( is => 'rw', required   => 1 );
has forbidden_domains           => ( is => 'rw', default    => sub { [] } );
has can_have_additional_domains => ( is => 'rw', default    => 1 );
has group_idcs                  => ( is => 'rw', lazy_build => 1 );
has only_best_hit => ( is => 'rw' );

sub add_required {

}

sub add_forbidden {

}

sub BUILD {
  my ($self) = @_;
  my $r = $self->domain_list( $self->search_result );

  $self->required_domains( _match_domain_names( $r, $self->required_domains ) )
    if ( @{ $self->required_domains } > 0 );
  $self->forbidden_domains( _match_domain_names( $r, $self->forbidden_domains ) )
    if ( @{ $self->forbidden_domains } > 0 );

  return $self;
}

sub domain_list {
  my $r = pop;

  return [ keys %$r ];
}

sub _match_domain_names {
  my ( $reference, $query ) = @_;

  my @gresult;
  for my $q (@$query) {
    my @result;
    for my $e (@$q) {
      push @result, grep {/$e/} @$reference;
    }
    push @gresult, \@result if (@result);
  }
  return \@gresult;
}

sub _build_group_idcs {
  my ($self) = @_;

  my $domain_groups = $self->required_domains;
  my %group;
  my $i = 0;
  for my $g (@$domain_groups) {
    map { $group{$_} = $i } @$g;
    $i++;
  }
  return \%group;
}

sub filter_hits {
  my ($self) = @_;

  my $q          = $self->search_result;
  my $id_lookup  = { map { $_ => 1 } @{ $self->filter_ids } };
  my $group_idcs = $self->group_idcs;

  my @hit_col;
  while ( my ( $domain_id, $result ) = each %$q ) {
    next unless ( exists( $group_idcs->{$domain_id} ) );
    while ( my ( $seq_id, $hits ) = each %$result ) {
      next if ( $id_lookup && !exists( $id_lookup->{$seq_id} ) );
      for my $hit (@$hits) {
        $hit->{domain_id} = $domain_id;
        $hit->{seq_id}    = $seq_id;
        $hit->{group_id}  = $group_idcs->{$domain_id};
        push @hit_col, $hit;
      }
    }
  }
  return $self->_pick_best_domain_hits( \@hit_col )
    if ( $self->only_best_hit );

  return \@hit_col;
}

sub _pick_best_domain_hits {
  my ( $self, $hit_list ) = @_;

  my %group_collection;
  for my $h (@$hit_list) {
    my $id = $h->{seq_id} . "__//__" . $h->{group_id};
    $group_collection{$id} //= [];

    push @{ $group_collection{$id} }, $h;
  }

  my @best_ones;
  for my $h ( values %group_collection ) {
    my $max;

    #find the domain with the maximal score
    map { $max = $_ if ( !$max || $_->{score} > $max->{score} ) } @$h;
    push @best_ones, $max;
  }

  return \@best_ones;
}

sub to_gff {
  my ( $self, $dest ) = @_;
  my $hits = $self->filter_hits;

  my $gffout = Bio::Gonzales::Feat::IO::GFF3->new( file_or_fh => $dest, mode => '>' );

  my $gidcs = $self->group_idcs;
  while ( my ( $d, $idx ) = each %$gidcs ) {
    $gffout->write_comment(" Group $idx contains domain: $d");
  }

  my $i = 0;
  for my $max (@$hits) {

    $gffout->write_feat(
      Bio::Gonzales::Feat->new(
        seq_id     => $max->{seq_id},
        source     => 'hmmer',
        type       => 'region',
        start      => $max->{env_from},
        end        => $max->{env_to},
        strand     => 0,
        score      => $max->{score},
        attributes => {
          ID       => [ "hit_" . $i++ ],
          domainID => [ $max->{domain_id} ],
          groupID  => [ $max->{group_id} ]
        }
      )
    );
  }

  $gffout->close;
}

sub filter_ids {
  my ($self) = @_;

  my $q       = $self->search_result;
  my $rgroups = $self->required_domains;
  return unless ($rgroups);

  my %id_occurrence;

  # select the ids for every group by...
  for my $domain_synonyms (@$rgroups) {
    my @ids;
    # iterating through the domains ...
    for my $domain_synonym (@$domain_synonyms) {
      # and storing all ids in an group-wide id list
      push @ids, grep { @{ $q->{$domain_synonym}{$_} } > 0 } keys %{ $q->{$domain_synonym} };
    }

    # we need to eliminate duplicate ids (multiple synonyms per group)
    # and increment the occurrence of the found ids
    map { $id_occurrence{$_}++ } uniq @ids;

  }

  # if an id is in every group it occurred #num of rgroups times and is not forbidden, return it
  return [ grep { $id_occurrence{$_} == @$rgroups && !$self->_is_forbidden($_) } keys %id_occurrence ];
}

sub _is_forbidden {
  my ( $self, $id ) = @_;

  my $q       = $self->search_result;
  my $fgroups = $self->forbidden_domains;
  for my $fg (@$fgroups) {
    for my $fd (@$fg) {
      if ( exists( $q->{$fd}{$id} ) ) {
        return 1;
      }
    }
  }
  return;

}
__PACKAGE__->meta->make_immutable();