The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
use 5.008;                   # Require at least Perl version 5.10
use strict;                  # Must declare all variables before using them
use warnings;                # Emit helpful warnings
use Test::More;              # Testing module
use Test::LongString;        # Compare strings byte by byte
use Data::Section -setup;    # Have various DATA sections, allows for mock files
use lib 'lib';               # add 'lib' to @INC
use Bio::App::SELEX::RNAmotifAnalysis;
use File::Slurp qw( slurp );

use autodie;    # Automatically throw fatal exceptions for common unrecoverable
                #   errors (e.g. trying to open a non-existent file)

{   # Test 1
    my $fh            = fh_from('input');
    my $expected_aref = [
        [ 'abcdefghijklmnop',   3 ],
        [ 'erwoprhasdfasfd',    2 ],
        [ 'ponmlkjihgfdcba',    2 ],
        [ 'ponmlkjihgfedcba',   1 ],
        [ 'abcdeghijklmnop',    1 ],
        [ 'abcddefghijklmnopp', 1 ],
        [ 'ponmlkjihgfedcbb',   1 ],
    ];
    test_get_sequences_from( $fh, $expected_aref );
}

sub test_get_sequences_from {
    my $fh            = shift;
    my $expected_aref = shift;
    my $result_aref   = Bio::App::SELEX::RNAmotifAnalysis::get_sequences_from($fh, 'simple');
    is_deeply( $result_aref, $expected_aref, 'sequences correctly extracted' );
}

{   #Test 2
    my $expected_cluster    = 3;
    my $sample_cluster_href = sample_cluster();
    my $seq                 = [ 'ponmlkjihgfedcbd', 'X' ];
    my $result_cluster =
      Bio::App::SELEX::RNAmotifAnalysis::matching_cluster( 5, $sample_cluster_href, $seq,
        5 );
    is( $result_cluster, $expected_cluster,
        'found correct cluster for ' . $seq->[1] );
}

{   #Test 3
    my $expected_cluster_href = sample_cluster();
    my $input_fh              = fh_from('input');
    my $result_cluster_href   = Bio::App::SELEX::RNAmotifAnalysis::cluster(
        max_distance => 5,
        fh           => $input_fh,
        max_clusters => 5,
        file_type    => 'simple',
    );

    is_deeply( $result_cluster_href, $expected_cluster_href,
        'clusters correctly determined' );
}

{   #Test 4
    my $input_fh            = fh_from('input');
    my $result_cluster_href = Bio::App::SELEX::RNAmotifAnalysis::cluster(
        max_distance => 5,
        fh           => $input_fh,
        max_clusters => 2,
        file_type    => 'simple',
    );
    my $expected_cluster_href = sample_cluster1();

    is_deeply( $result_cluster_href, $expected_cluster_href,
        'limiting number of clusters works!' );
}

{   #Test 5
    my $input_fh            = fh_from('odd');
    my $seed_fh             = fh_from('seed');
    my $result_cluster_href = Bio::App::SELEX::RNAmotifAnalysis::cluster(
        max_distance => 5,
        fh           => $input_fh,
        seed_fh      => $seed_fh,
        max_clusters => 5,
        file_type    => 'simple',
    );
    my $expected_cluster_href = expected_odd_cluster();
    is_deeply( $result_cluster_href, $expected_cluster_href,
        'explicit seed clusters works!' );
}

{   # Test 6: Changing seed can change outcome (compare to Test 5 results)
    my $input_fh            = fh_from('seed');
    my $seed_fh             = fh_from('odd');
    my $result_cluster_href = Bio::App::SELEX::RNAmotifAnalysis::cluster(
        max_distance => 5,
        fh           => $input_fh,
        seed_fh      => $seed_fh,
        max_clusters => 5,
        file_type    => 'simple',
    );
    my $expected_cluster_href = expected_odd_as_seed_cluster();
    is_deeply( $result_cluster_href, $expected_cluster_href,
        'Different seed clusters works!' );
}

sub sample_cluster1 {
    return {
        1 => [
            [ 'abcdefghijklmnop',   3 ],
            [ 'abcdeghijklmnop',    1 ],
            [ 'abcddefghijklmnopp', 1 ],
        ],
        2 => [ [ 'erwoprhasdfasfd', 2 ], ],

    };
}

sub sample_cluster {
    return {
        %{ sample_cluster1() },
        3 => [
            [ 'ponmlkjihgfdcba',  2 ],
            [ 'ponmlkjihgfedcba', 1 ],
            [ 'ponmlkjihgfedcbb', 1 ],
        ],
    };
}

sub expected_odd_cluster {
    return {
        1 => [
            [ 'ABCDEFGHIJKLMNO', 1 ],
            [ 'BBCDEFGHIJKLMBB', 1 ],
            [ 'ABCDBBBBIJKLMNO', 1 ],
        ]
    };
}

sub expected_odd_as_seed_cluster {
    return {
        1 => [ [ 'BBCDEFGHIJKLMBB', 1 ], [ 'ABCDEFGHIJKLMNO', 1 ], ],
        2 => [ [ 'ABCDBBBBIJKLMNO', 1 ], ]
    };
}

{    # Test
    my $all_cluster_string;
    open( my $fh_all, '>', \$all_cluster_string );

    my $cluster1_string;
    open( my $fh1, '>', \$cluster1_string );

    my $cluster2_string;
    open( my $fh2, '>', \$cluster2_string );

    my $cluster3_string;
    open( my $fh3, '>', \$cluster3_string );

    my $cluster_href = sample_cluster();
    my $fh_href      = {
        1 => $fh1,
        2 => $fh2,
        3 => $fh3,
    };

    Bio::App::SELEX::RNAmotifAnalysis::write_out_clusters(
        cluster_href => $cluster_href,
        fh_all       => $fh_all,
        fh_href      => $fh_href,
        max_top_seqs => 1000
    );
    my $expected_cluster1 = string_from('cluster1');
    my $expected_cluster2 = string_from('cluster2');
    my $expected_cluster3 = string_from('cluster3');
    is_string( $cluster1_string, $expected_cluster1, 'cluster1 good' );
    is_string( $cluster2_string, $expected_cluster2, 'cluster2 good' );
    is_string( $cluster3_string, $expected_cluster3, 'cluster3 good' );
}

{    # Test 7: Maximum top sequences
    my $all_cluster_string;
    open( my $fh_all, '>', \$all_cluster_string );

    my %string_for = (
        1 => '',
        2 => '',
        3 => '',
    );
    open( my $fh1, '>', \$string_for{1} );

    open( my $fh2, '>', \$string_for{2} );

    open( my $fh3, '>', \$string_for{3} );

    my $cluster_href = sample_cluster();
    my $fh_href      = {
        1 => $fh1,
        2 => $fh2,
        3 => $fh3,
    };

    my $input_fh = fh_from('input');

    Bio::App::SELEX::RNAmotifAnalysis::write_out_clusters(
        cluster_href => $cluster_href,
        fh_all       => $fh_all,
        fh_href      => $fh_href,
        max_top_seqs => 2
    );

    for my $cluster ( 1 .. 2 ) {
        my $expected_fasta_top = string_from("fasta_top_cluster_$cluster");
        my $expected_fasta_overage =
          string_from("fasta_overage_cluster_$cluster");
        my $overage_file_name = "cluster_${cluster}_overage.fasta";
        my $result_overage    = slurp $overage_file_name;

        is( $string_for{$cluster}, $expected_fasta_top, 'top seqs works' );
        is( $result_overage, $expected_fasta_overage,
            'overage fasta file correct' );
        delete_temp_file($overage_file_name);
    }
}

{    # Test
    my $all_cluster_string;
    open( my $fh_all, '>', \$all_cluster_string );

    my $cluster1_string;
    open( my $fh1, '>', \$cluster1_string );

    my $cluster2_string;
    open( my $fh2, '>', \$cluster2_string );

    my $cluster3_string;
    open( my $fh3, '>', \$cluster3_string );

    my $cluster_href = sample_cluster();
    my $fh_href      = {
        1 => $fh1,
        2 => $fh2,
        3 => $fh3,
    };

    Bio::App::SELEX::RNAmotifAnalysis::write_out_clusters(
        cluster_href => $cluster_href,
        fh_all       => $fh_all,
        fh_href      => $fh_href,
        max_top_seqs => 1000
    );
    my $expected_cluster1 = string_from('cluster1');
    my $expected_cluster2 = string_from('cluster2');
    my $expected_cluster3 = string_from('cluster3');
    is_string( $cluster1_string, $expected_cluster1, 'cluster1 good' );
    is_string( $cluster2_string, $expected_cluster2, 'cluster2 good' );
    is_string( $cluster3_string, $expected_cluster3, 'cluster3 good' );
}

done_testing();

sub delete_temp_file {
    my $filename = shift;
    my $result   = unlink $filename;
    ok( $result, "successfully deleted temporary file '$filename'" );
}

sub fh_to_empty_string {
    my $string = '';
    open( my $fh, '>', \$string );
    return $fh;
}

sub sref_from {
    my $section = shift;

    #Scalar reference from the section
    return __PACKAGE__->section_data($section);
}

sub string_from {
    my $section = shift;

    #Get the scalar reference
    my $sref = sref_from($section);

    #Return the actual scalar (probably a string), not the reference to it
    return ${$sref};
}

sub fh_from {
    my $section = shift;
    my $sref    = sref_from($section);

    #Create filehandle to the referenced scalar
    open( my $fh, '<', $sref );
    return $fh;
}

#------------------------------------------------------------------------
# IMPORTANT!
#
# Each line from each section automatically ends with a newline character
#------------------------------------------------------------------------

__DATA__
__[ input ]__
abcdefghijklmnop
abcdefghijklmnop
abcdefghijklmnop
abcdeghijklmnop
abcddefghijklmnopp
ponmlkjihgfedcba
ponmlkjihgfedcbb
ponmlkjihgfdcba
ponmlkjihgfdcba
erwoprhasdfasfd
erwoprhasdfasfd
__[ cluster1 ]__
>1.1.3
abcdefghijklmnop
>1.2.1
abcdeghijklmnop
>1.3.1
abcddefghijklmnopp
__[ cluster2 ]__
>2.1.2
ponmlkjihgfdcba
>2.2.1
ponmlkjihgfedcba
>2.3.1
ponmlkjihgfedcbb
__[ cluster3 ]__
>3.1.2
erwoprhasdfasfd
>3.1.2b
erwoprhasdfasfd
__[ seed ]__
ABCDEFGHIJKLMNO
__[ odd ]__
BBCDEFGHIJKLMBB
ABCDBBBBIJKLMNO
__[ odd_cluster ]__
>1.1.1
ABCDBBBBIJKLMNO
>1.2.1
ABCDEFGHIJKLMNO
>1.3.1
BBCDEFGHIJKLMBB
__[ seeded_cluster1 ]__
>1.1.1
abcdeghijklmnop
>1.2.3
abcdefghijklmnop
>1.3.1
abcddefghijklmnopp
__[ fasta_top_cluster_1 ]__
>1.1.3
abcdefghijklmnop
>1.2.1
abcdeghijklmnop
__[ fasta_overage_cluster_1 ]__
>1.3.1
abcddefghijklmnopp
__[ fasta_top_cluster_2 ]__
>2.1.2
ponmlkjihgfdcba
>2.2.1
ponmlkjihgfedcba
__[ fasta_overage_cluster_2 ]__
>2.3.1
ponmlkjihgfedcbb