The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
=head1 NAME

Bio::Polloc::Rule::tandemrepeat - A rule of type tandemrepeat

=head1 DESCRIPTION

This rule is similar to Bio::Polloc::Rule::repeat, but employes TRF (Benson 1999, NAR 27(2):573-580)
for the repeats calculation.

=head1 AUTHOR - Luis M. Rodriguez-R

Email lmrodriguezr at gmail dot com

=cut

package Bio::Polloc::Rule::tandemrepeat;
use base qw(Bio::Polloc::RuleI);
use strict;
use Bio::Polloc::Polloc::IO;
use Bio::Polloc::LocusI;
use Bio::SeqIO;
# For TRF:
use File::Spec;
use File::Basename;
use Cwd;
use Config;
our $VERSION = 1.0503; # [a-version] from Bio::Polloc::Polloc::Version


=head1 APPENDIX

Methods provided by the package

=head2 new

Generic initialization method.

=cut

sub new {
   my($caller,@args) = @_;
   my $self = $caller->SUPER::new(@args);
   $self->_initialize(@args);
   return $self;
}

=head2 execute

This is where magic happens.  Translates the parameters of the object into a call to
B<TRF>, and scans the sequence for repeats.

=head3 Arguments

The sequence (C<-seq>) as a L<Bio::Seq> or a L<Bio::SeqIO> object.

=head3 Returns

An array reference populated with L<Bio::Polloc::Locus::tandemrepeat> objects.

=cut

sub execute {
   my($self,@args) = @_;
   my($seq) = $self->_rearrange([qw(SEQ)], @args);
   
   $self->throw("You must provide a sequence to evaluate the rule", $seq) unless $seq;
   $self->throw("You must provide an object as sequence", $seq)
   		unless UNIVERSAL::can($seq,'isa');
   
   # For Bio::SeqIO objects
   if($seq->isa('Bio::SeqIO')){
      my @feats = ();
      while(my $s = $seq->next_seq){
         push(@feats, @{$self->execute(-seq=>$s)})
      }
      return wantarray ? @feats : \@feats;
   }
   
   $self->throw("Illegal class of sequence '".ref($seq)."'", $seq) unless $seq->isa('Bio::Seq');

   # Include safe_value parameters
   # MINSIZE MAXSIZE MINPERIOD MAXPERIOD EXP MATCH MISMATCH INDELS MINSCORE MAXSCORE PM PI
   my %c_v = (
      "minsize" => 0,	"maxsize" => 1e20,
      "minperiod" => 0,	"maxperiod" => 500,
      "exp" => 0,
      "match" => 2,	"mismatch"=> 3,	"indels" => 5,
      "minscore" => 50,	"maxscore" => 0,
      "minsim" => 0,	"maxsim" => 100,
      "pm" => 80,	"pi" => 20
   );
   for my $k ( keys %c_v){
      my $p = $self->_search_value($k);
      $c_v{$k} = $p if defined $p; # override defaults
   }

   # Create the IO master
   my $io = Bio::Polloc::Polloc::IO->new();

   # Search for TRF
   $self->source('trf'); # For GFF
   my $trf = $self->_executable(defined $self->ruleset ? $self->ruleset->value("path") : undef)
   	or $self->throw("Could not find the trf binary");
   
   # Next line is because of the horrible practice of creating files at CWD out
   # of thin air (with no possibility to define another way!)
   $trf = File::Spec->rel2abs($trf) unless File::Spec->file_name_is_absolute($trf);
   
   # Write the sequence
   my($seq_fh, $seq_file) = $io->tempfile;
   close $seq_fh;
   my $seqO = Bio::SeqIO->new(-file=>">$seq_file", -format=>'Fasta');
   $seqO->write_seq($seq);
   
   # Run it
   my @run = ($trf);
   push @run, $seq_file;
   push @run, $c_v{'match'}, $c_v{'mismatch'}, $c_v{'indels'};
   push @run, $c_v{'pm'}, $c_v{'pi'}, $c_v{'minscore'}, $c_v{'maxperiod'};
   push @run, "-h"; #<- Simplifies output, and produces one file instead of tons!
   push @run, "2>&1";
   push @run, "|";
   my $cwd = cwd();
   # Finger crossed
   my $tmpdir = Bio::Polloc::Polloc::IO->tempdir();
   chdir $tmpdir or $self->throw("I can not move myself to the temporal directory: $!", $tmpdir);
   $self->debug("Hello from ".cwd());
   $self->debug("Running: ".join(" ",@run)." [CWD: ".cwd()."]");
   my $run = Bio::Polloc::Polloc::IO->new(-file=>join(" ",@run));
   my $runout = '';
   while($run->_readline){$runout.=$_ if defined $_} # Do nothing, this is truly unuseful output, just save it in case of errors (for debugging)
   $run->close();
   # Finger crossed (yes, again)
   chdir $cwd or $self->throw("I can not move myself to the original directory: $!", $cwd);
   $self->debug("Hello from ".cwd());
   
   # Try to locate the output (belive it or not)...
   my $outfile = Bio::Polloc::Polloc::IO->catfile($tmpdir, basename($seq_file) . "." .
   		$c_v{'match'} . "." . $c_v{'mismatch'} . "." . $c_v{'indels'} . "." .
		$c_v{'pm'} . "." . $c_v{'pi'} . "." . $c_v{'minscore'} . "." .
		$c_v{'maxperiod'} . ".dat");
   $self->throw("Impossible to locate the output file of TRF: '$outfile', (".join(" ", @run).") showing full output", $runout) unless -e $outfile;
   
   # And finally parse it
   my $ontable = 0;
   my @feats = ();
   $run = Bio::Polloc::Polloc::IO->new(-file=>$outfile);
   while(my $line = $run->_readline){
      if($line =~ m/^Parameters:\s/){
         $ontable = 1;
      }elsif($ontable){
	 chomp $line;
	 next if $line =~ /^\s*$/;
	 #from to period-size copy-number consensus-size percent-matches percent-indels score A T C G entropy consensus sequence
	 #269 316 18 2.6 19 56 3 51 8 45 35 10 1.68 GTCGCGGCCACGTGCACCC GTCGCGTCCACGTGCGCCCGAGCCGGC...
	 my @v = split /\s+/, $line;
	 $#v==14 or $self->throw("Unexpected line $.",$line,"Bio::Polloc::Polloc::ParsingException");
	 # MINSIZE MAXSIZE MINPERIOD MAXPERIOD EXP MATCH MISMATCH INDELS MINSCORE MAXSCORE
	 $self->debug("Checking additional parameters");
	 next if length($v[14]) > $c_v{'maxsize'} or length($v[14]) < $c_v{'minsize'};
	 next if $v[2] > $c_v{'maxperiod'} or $v[2] < $c_v{'minperiod'};
	 next if $v[3] < $c_v{'exp'};
	 next if $v[7] < $c_v{'minscore'};
	 next if $c_v{'maxscore'} and $v[7] > $c_v{'maxscore'};
	 next if $v[5] < $c_v{'minsim'} or $v[5] > $c_v{'maxsim'};
	 $self->debug("Cool!");
	 
	 my $id = $self->_next_child_id;
	 push @feats, Bio::Polloc::LocusI->new(
	 		-type=>'repeat', # Be careful, usually $self->type
			-rule=>$self, -seq=>$seq,
			-from=>$v[0]+0, -to=>$v[1]+0, -strand=>"+",
			-name=>$self->name,
			-id=>(defined $id ? $id : ""),
			-period=>$v[2]+0, -exponent=>$v[3]+0,
			-consensus=>$v[13],
			-error=>100-$v[5],
			-score=>$v[7],
			-repeats=>$v[14]);
      }
   }
   $run->close();
   unlink $outfile;
   rmdir $tmpdir;
   return wantarray ? @feats : \@feats;
}

=head2 stringify_value

Produces a string with the value of the rule.

=cut

sub stringify_value {
   my ($self,@args) = @_;
   my $out = "";
   for my $k (keys %{$self->value}){
      $out.= "$k=>".(defined $self->value->{$k} ? $self->value->{$k} : "")." ";
   }
   return $out;
}

=head2 value

=head3 Arguments

Value (I<str> or I<hashref> or I<arrayref>).  The supported keys are:

=over

=item -minsize I<int>

Minimum size of the repeat

=item -maxsize I<int>

Maximum size of the repeat

=item -minperiod I<float>

Minimum period of the repeat

=item -maxperiod I<float>

Maximum period of the repeat

=item -exp I<float>

Minimum exponent (number of repeats)

=item -match I<int>

Matching weight

=item -mismatch I<int>

Mismatching penalty

=item -indels I<int>

Indel penalty

=item -minscore I<float>

Minimum score

=item -maxscore I<float>

Maximum score

=item -minsim I<float>

Minimum similarity percent

=item -maxsim I<float>

Maximum similarity percent

=item -pm I<float>

Match probability

=item -pi I<float>

Indel probability

=back

=head3 Return

Value (I<hashref> or C<undef>).

=head1 INTERNAL METHODS

Methods intended to be used only within the scope of Bio::Polloc::*

=head2 _parameters

=cut

sub _parameters { return [qw(MINSIZE MAXSIZE MINPERIOD MAXPERIOD EXP MATCH MISMATCH INDELS MINSCORE MAXSCORE MINSIM MAXSIM PM PI)] }

=head2 _executable

Attempts to find the TRF executable.

=cut

sub _executable {
   my($self, $path) = @_;
   my $exe;
   my $bin;
   my $name = 'trf';
   my $io = "Bio::Polloc::Polloc::IO";
   $self->debug("Searching the $name binary for $^O");
   # Note the darwin support.  This is because darwin is unable to execute
   # the linux binary (despite its origin)
   $bin=$^O =~ /(macos|darwin)/i ? "$name.macosx.bin" :
   	$^O =~ /mswin/i ? "$name.exe" :
	$Config{'archname64'} ? "$name.linux64.bin" :
	"$name.linux32.bin";
   my @pre = ('');
   unshift @pre, $path if $path;
   for my $p (@pre) {
      # Try first WITH version, to avoid v3 series
      for my $v ((404, 403, 402, 401, 400, '')){
         for my $e (('', '.bin', '.exe')){
	    for my $n (($bin, $name)){
	       $exe = $io->exists_exe($p . $n . $v . $e);
	       return $exe if $exe;
	    }
	 }
      }
   }
   # Other awful naming systems can be used by this package, but here we stop!
}

=head2 _initialize

=cut

sub _initialize {
   my($self,@args) = @_;
   $self->type('tandemrepeat');
}

1;