=head1 NAME
Bio::Polloc::Rule::tandemrepeat - A rule of type tandemrepeat
=head1 DESCRIPTION
This rule is similar to Bio::Polloc::Rule::repeat, but employes TRF (Benson 1999, NAR 27(2):573-580)
for the repeats calculation.
=head1 AUTHOR - Luis M. Rodriguez-R
Email lmrodriguezr at gmail dot com
=cut
package Bio::Polloc::Rule::tandemrepeat;
use base qw(Bio::Polloc::RuleI);
use strict;
use Bio::Polloc::Polloc::IO;
use Bio::Polloc::LocusI;
use Bio::SeqIO;
# For TRF:
use File::Spec;
use File::Basename;
use Cwd;
use Config;
our $VERSION = 1.0503; # [a-version] from Bio::Polloc::Polloc::Version
=head1 APPENDIX
Methods provided by the package
=head2 new
Generic initialization method.
=cut
sub new {
my($caller,@args) = @_;
my $self = $caller->SUPER::new(@args);
$self->_initialize(@args);
return $self;
}
=head2 execute
This is where magic happens. Translates the parameters of the object into a call to
B<TRF>, and scans the sequence for repeats.
=head3 Arguments
The sequence (C<-seq>) as a L<Bio::Seq> or a L<Bio::SeqIO> object.
=head3 Returns
An array reference populated with L<Bio::Polloc::Locus::tandemrepeat> objects.
=cut
sub execute {
my($self,@args) = @_;
my($seq) = $self->_rearrange([qw(SEQ)], @args);
$self->throw("You must provide a sequence to evaluate the rule", $seq) unless $seq;
$self->throw("You must provide an object as sequence", $seq)
unless UNIVERSAL::can($seq,'isa');
# For Bio::SeqIO objects
if($seq->isa('Bio::SeqIO')){
my @feats = ();
while(my $s = $seq->next_seq){
push(@feats, @{$self->execute(-seq=>$s)})
}
return wantarray ? @feats : \@feats;
}
$self->throw("Illegal class of sequence '".ref($seq)."'", $seq) unless $seq->isa('Bio::Seq');
# Include safe_value parameters
# MINSIZE MAXSIZE MINPERIOD MAXPERIOD EXP MATCH MISMATCH INDELS MINSCORE MAXSCORE PM PI
my %c_v = (
"minsize" => 0, "maxsize" => 1e20,
"minperiod" => 0, "maxperiod" => 500,
"exp" => 0,
"match" => 2, "mismatch"=> 3, "indels" => 5,
"minscore" => 50, "maxscore" => 0,
"minsim" => 0, "maxsim" => 100,
"pm" => 80, "pi" => 20
);
for my $k ( keys %c_v){
my $p = $self->_search_value($k);
$c_v{$k} = $p if defined $p; # override defaults
}
# Create the IO master
my $io = Bio::Polloc::Polloc::IO->new();
# Search for TRF
$self->source('trf'); # For GFF
my $trf = $self->_executable(defined $self->ruleset ? $self->ruleset->value("path") : undef)
or $self->throw("Could not find the trf binary");
# Next line is because of the horrible practice of creating files at CWD out
# of thin air (with no possibility to define another way!)
$trf = File::Spec->rel2abs($trf) unless File::Spec->file_name_is_absolute($trf);
# Write the sequence
my($seq_fh, $seq_file) = $io->tempfile;
close $seq_fh;
my $seqO = Bio::SeqIO->new(-file=>">$seq_file", -format=>'Fasta');
$seqO->write_seq($seq);
# Run it
my @run = ($trf);
push @run, $seq_file;
push @run, $c_v{'match'}, $c_v{'mismatch'}, $c_v{'indels'};
push @run, $c_v{'pm'}, $c_v{'pi'}, $c_v{'minscore'}, $c_v{'maxperiod'};
push @run, "-h"; #<- Simplifies output, and produces one file instead of tons!
push @run, "2>&1";
push @run, "|";
my $cwd = cwd();
# Finger crossed
my $tmpdir = Bio::Polloc::Polloc::IO->tempdir();
chdir $tmpdir or $self->throw("I can not move myself to the temporal directory: $!", $tmpdir);
$self->debug("Hello from ".cwd());
$self->debug("Running: ".join(" ",@run)." [CWD: ".cwd()."]");
my $run = Bio::Polloc::Polloc::IO->new(-file=>join(" ",@run));
my $runout = '';
while($run->_readline){$runout.=$_ if defined $_} # Do nothing, this is truly unuseful output, just save it in case of errors (for debugging)
$run->close();
# Finger crossed (yes, again)
chdir $cwd or $self->throw("I can not move myself to the original directory: $!", $cwd);
$self->debug("Hello from ".cwd());
# Try to locate the output (belive it or not)...
my $outfile = Bio::Polloc::Polloc::IO->catfile($tmpdir, basename($seq_file) . "." .
$c_v{'match'} . "." . $c_v{'mismatch'} . "." . $c_v{'indels'} . "." .
$c_v{'pm'} . "." . $c_v{'pi'} . "." . $c_v{'minscore'} . "." .
$c_v{'maxperiod'} . ".dat");
$self->throw("Impossible to locate the output file of TRF: '$outfile', (".join(" ", @run).") showing full output", $runout) unless -e $outfile;
# And finally parse it
my $ontable = 0;
my @feats = ();
$run = Bio::Polloc::Polloc::IO->new(-file=>$outfile);
while(my $line = $run->_readline){
if($line =~ m/^Parameters:\s/){
$ontable = 1;
}elsif($ontable){
chomp $line;
next if $line =~ /^\s*$/;
#from to period-size copy-number consensus-size percent-matches percent-indels score A T C G entropy consensus sequence
#269 316 18 2.6 19 56 3 51 8 45 35 10 1.68 GTCGCGGCCACGTGCACCC GTCGCGTCCACGTGCGCCCGAGCCGGC...
my @v = split /\s+/, $line;
$#v==14 or $self->throw("Unexpected line $.",$line,"Bio::Polloc::Polloc::ParsingException");
# MINSIZE MAXSIZE MINPERIOD MAXPERIOD EXP MATCH MISMATCH INDELS MINSCORE MAXSCORE
$self->debug("Checking additional parameters");
next if length($v[14]) > $c_v{'maxsize'} or length($v[14]) < $c_v{'minsize'};
next if $v[2] > $c_v{'maxperiod'} or $v[2] < $c_v{'minperiod'};
next if $v[3] < $c_v{'exp'};
next if $v[7] < $c_v{'minscore'};
next if $c_v{'maxscore'} and $v[7] > $c_v{'maxscore'};
next if $v[5] < $c_v{'minsim'} or $v[5] > $c_v{'maxsim'};
$self->debug("Cool!");
my $id = $self->_next_child_id;
push @feats, Bio::Polloc::LocusI->new(
-type=>'repeat', # Be careful, usually $self->type
-rule=>$self, -seq=>$seq,
-from=>$v[0]+0, -to=>$v[1]+0, -strand=>"+",
-name=>$self->name,
-id=>(defined $id ? $id : ""),
-period=>$v[2]+0, -exponent=>$v[3]+0,
-consensus=>$v[13],
-error=>100-$v[5],
-score=>$v[7],
-repeats=>$v[14]);
}
}
$run->close();
unlink $outfile;
rmdir $tmpdir;
return wantarray ? @feats : \@feats;
}
=head2 stringify_value
Produces a string with the value of the rule.
=cut
sub stringify_value {
my ($self,@args) = @_;
my $out = "";
for my $k (keys %{$self->value}){
$out.= "$k=>".(defined $self->value->{$k} ? $self->value->{$k} : "")." ";
}
return $out;
}
=head2 value
=head3 Arguments
Value (I<str> or I<hashref> or I<arrayref>). The supported keys are:
=over
=item -minsize I<int>
Minimum size of the repeat
=item -maxsize I<int>
Maximum size of the repeat
=item -minperiod I<float>
Minimum period of the repeat
=item -maxperiod I<float>
Maximum period of the repeat
=item -exp I<float>
Minimum exponent (number of repeats)
=item -match I<int>
Matching weight
=item -mismatch I<int>
Mismatching penalty
=item -indels I<int>
Indel penalty
=item -minscore I<float>
Minimum score
=item -maxscore I<float>
Maximum score
=item -minsim I<float>
Minimum similarity percent
=item -maxsim I<float>
Maximum similarity percent
=item -pm I<float>
Match probability
=item -pi I<float>
Indel probability
=back
=head3 Return
Value (I<hashref> or C<undef>).
=head1 INTERNAL METHODS
Methods intended to be used only within the scope of Bio::Polloc::*
=head2 _parameters
=cut
sub _parameters { return [qw(MINSIZE MAXSIZE MINPERIOD MAXPERIOD EXP MATCH MISMATCH INDELS MINSCORE MAXSCORE MINSIM MAXSIM PM PI)] }
=head2 _executable
Attempts to find the TRF executable.
=cut
sub _executable {
my($self, $path) = @_;
my $exe;
my $bin;
my $name = 'trf';
my $io = "Bio::Polloc::Polloc::IO";
$self->debug("Searching the $name binary for $^O");
# Note the darwin support. This is because darwin is unable to execute
# the linux binary (despite its origin)
$bin=$^O =~ /(macos|darwin)/i ? "$name.macosx.bin" :
$^O =~ /mswin/i ? "$name.exe" :
$Config{'archname64'} ? "$name.linux64.bin" :
"$name.linux32.bin";
my @pre = ('');
unshift @pre, $path if $path;
for my $p (@pre) {
# Try first WITH version, to avoid v3 series
for my $v ((404, 403, 402, 401, 400, '')){
for my $e (('', '.bin', '.exe')){
for my $n (($bin, $name)){
$exe = $io->exists_exe($p . $n . $v . $e);
return $exe if $exe;
}
}
}
}
# Other awful naming systems can be used by this package, but here we stop!
}
=head2 _initialize
=cut
sub _initialize {
my($self,@args) = @_;
$self->type('tandemrepeat');
}
1;