The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package Bio::SSRTool;

use 5.006;
use strict;
use warnings;
use vars qw( @ISA @EXPORT @EXPORT_OK );
use Carp qw( croak );
use IO::Scalar;

@ISA = qw( Exporter );
@EXPORT = qw( find_ssr );
@EXPORT_OK = qw( find_ssr );

our $VERSION = '0.04';

=head1 NAME

Bio::SSRTool - The great new Bio::SSRTool!

=head1 SYNOPSIS

Examines FASTA-formatted sequence data for simple sequence repeats (SSRs).

    use Bio::SSRTool;

=head1 EXPORT

=head1 SUBROUTINES/METHODS

=cut

# ------------------------------------------------------------
sub find_ssr {

=head2 find_ssr

    my $ssr_tool = Bio::SSRTool->new();
    my @ssrs = $ssr_tool->find_ssr( $seq, { min_repeats => 10 } );

Or: 

    use Bio::SSRTool 'find_ssr';

    my @ssrs = find_ssr( $fh, { motif_length => 'trimer' } );

The "find_ssr" routine expects a sequence string in FASTA format 
(or a filehandle to read a FASTA file) and an optional hash-ref of 
arguments including:

=over 4

=item min_repeats

A positive integer

=item motif_length

A positive integer between 1 and 10.  Default is 4.

=back

=cut

    my $seq         = shift or return;
    my $args        = shift || {};
    my $motif_len   = $args->{'motif_length'} || 4;
    my $min_repeats = $args->{'min_repeats'}  || 5;

    unless ( 
        $motif_len =~ /^\d{1,2}$/ && $motif_len > 0 && $motif_len < 11
    ) {
        croak "Invalid motif length '$motif_len'";
    }

    #
    # Make sure it acts like a filehandle
    #
    if ( ref $seq ne 'GLOB' ) {
        if ( $seq !~ /\n$/ && -e $seq ) {
            my $tmp = $seq;
            open my $fh, '<', $tmp or die "Can't read '$tmp': $!\n";
            $seq = $fh;
        }
        else {
            my $tmp = $seq;
            $seq = IO::Scalar->new( \$tmp );
        }
    }

    my @ssrs;
    $/ = '>';
    while ( my $rec = <$seq> ) {
        chomp $rec;
        next unless $rec;
        my ( $titleline, $sequence ) = split /\n/, $rec, 2;
        next unless ( $sequence && $titleline );

        # the ID is the first whitespace-delimited item on titleline
        my ( $id ) = $titleline =~ /^(\S+)/;  
        $id          ||= 'INPUT';
        $sequence      =~ s/\s+//g; # concatenate multi-line sequence
        my $seqlength  = length $sequence;
        my $ssr_number = 1; # track multiple ssrs within a single sequence
        my %locations;      # track location of SSRs as detected

        # test each spec against sequence
        for my $len ( 1 .. $motif_len ) {
            my $re = qr/(([gatc]{$len})\2{$min_repeats,})/;

            while ( $sequence =~ m/$re/ig ) {
                my $ssr   = $1;
                my $motif = lc $2;

                # reject "aaaaaaaaa", "ggggggggggg", etc.
                next if _homopolymer( $motif, $len );
                my $ssrlength  = length( $ssr );        # SSR length
                my $repeats    = $ssrlength / $len;     # of rep units
                my $end        = pos( $sequence );      # where SSR ends
                pos($sequence) = $end - $len;           # see docs
                my $start      = $end - $ssrlength + 1; # SSR starts
                my $id_and_num = $id . "-" . $ssr_number++;

                # count SSR only once
                unless ( $locations{ $start }++ ) {
                    push @ssrs, {
                        sequence    => $id_and_num, 
                        motif       => $motif, 
                        num_repeats => $repeats,
                        start       => $start,      
                        end         => $end,   
                        seq_length  => $seqlength
                    };
                }
            }
        }
    }

    return @ssrs;
}

# ------------------------------------------------------------
sub _homopolymer {
    # returns 'true' if motif is repeat of single nucleotide
    my ( $motif, $motiflength ) = @_;
    my ( $reps ) = $motiflength - 1;
    return $motif =~ /([gatc])\1{$reps}/;
}

# ------------------------------------------------------------
=head1 AUTHOR

Ken Youens-Clark, C<< <kclark at cpan.org> >>

=head1 BUGS

Please report any bugs or feature requests to C<bug-bio-ssrtool at
rt.cpan.org>, or through the web interface at
L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=Bio-SSRTool>.  I will be
notified, and then you'll automatically be notified of progress on your bug
as I make changes.

=head1 SUPPORT

You can find documentation for this module with the perldoc command.

    perldoc Bio::SSRTool

=over 4

=item * RT: CPAN's request tracker (report bugs here)

L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=Bio-SSRTool>

=item * AnnoCPAN: Annotated CPAN documentation

L<http://annocpan.org/dist/Bio-SSRTool>

=item * CPAN Ratings

L<http://cpanratings.perl.org/d/Bio-SSRTool>

=item * Search CPAN

L<http://search.cpan.org/dist/Bio-SSRTool/>

=back

=head1 ACKNOWLEDGEMENTS

This was originally written in 1999 by Sam Cartinhour.  Thanks to Jim
Thomason for code review.

=head1 LICENSE AND COPYRIGHT

Copyright 2012 Ken Youens-Clark.

This program is released under the following license: GPL

=cut

1;