The London Perl and Raku Workshop takes place on 26th Oct 2024. If your company depends on Perl, please consider sponsoring and/or attending.
#-*-perl-*-
#$Id: BEDTools.t kortsch $

use strict;
use warnings;
no warnings qw(once);
our $home;

my $v = 0; # private verbosity - this module only

BEGIN {
    $home = '.';    # set to '.' for Build use,
                    # '..' for debugging from .t file
    unshift @INC, $home;
    use Bio::Root::Test;
    test_begin(-tests => 423,
               -requires_modules => [qw(IPC::Run Bio::Tools::Run::BEDTools)]);
}

use Bio::Tools::Run::WrapperBase;
use Bio::SeqIO;
use Bio::Tools::GuessSeqFormat;

# test command functionality

ok my $bedtoolsfac = Bio::Tools::Run::BEDTools->new, "make a default factory";
is $bedtoolsfac->command, 'bam_to_bed', "default to command 'bam_to_bed'";

my @commands = qw(
    annotate         fasta_from_bed       overlap
    bam_to_bed       genome_coverage      pair_to_pair
    bed_to_bam       graph_union          pair_to_bed
    bed_to_IGV       group_by             shuffle
    b12_to_b6        intersect            slop
    closest          links                sort
    complement       mask_fasta_from_bed  subtract
    coverage         merge                window
);


my %p = (
    'annotate'             => 0,
    'bam_to_bed'           => 2,
    'bed_to_bam'           => 1,
    'bed_to_IGV'           => 5,
    'b12_to_b6'            => 0,
    'closest'              => 1,
    'complement'           => 0,
    'coverage'             => 0,
    'fasta_from_bed'       => 0,
    'genome_coverage'      => 2,
    'graph_union'          => 2,
    'group_by'             => 3,
    'intersect'            => 1,
    'links'                => 3,
    'mask_fasta_from_bed'  => 0,
    'merge'                => 1,
    'overlap'              => 1,
    'pair_to_bed'          => 2,
    'pair_to_pair'         => 3,
    'shuffle'              => 2,
    'slop'                 => 3,
    'sort'                 => 0,
    'subtract'             => 1,
    'window'               => 3
     );

my %s = (
    'annotate'             => 4,
    'bam_to_bed'           => 6,
    'bed_to_bam'           => 2,
    'bed_to_IGV'           => 2,
    'b12_to_b6'            => 0,
    'closest'              => 2,
    'complement'           => 0,
    'coverage'             => 4,
    'fasta_from_bed'       => 3,
    'genome_coverage'      => 4,
    'graph_union'          => 2,
    'group_by'             => 0,
    'intersect'            => 11,
    'links'                => 0,
    'mask_fasta_from_bed'  => 1,
    'merge'                => 3,
    'overlap'              => 0,
    'pair_to_bed'          => 4,
    'pair_to_pair'         => 3,
    'shuffle'              => 1,
    'slop'                 => 1,
    'sort'                 => 6,
    'subtract'             => 1,
    'window'               => 5
    );

my $bam_file = test_input_file('Ft.bam');
my $bed_file = test_input_file('Ft.bed');
my $bed12_file = test_input_file('Ft.bed12');
my $fas_file = test_input_file('Ft.frag.fas');
my $bedpe1_file = test_input_file('e_coli_1.bedpe');
my $bedpe2_file = test_input_file('e_coli_2.bedpe');
my $bed3_file = test_input_file('e_coli.bed3');
my $bg1_file = test_input_file('1.bg');
my $bg2_file = test_input_file('2.bg');
my $bg3_file = test_input_file('3.bg');

my %format_lookup = (
    'annotate'             => 'bed',
    'bam_to_bed'           => 'bed',
    'bed_to_bam'           => 'bam',
    'bed_to_IGV'           => 'igv',
    'b12_to_b6'            => 'bed',
    'closest'              => 'bedpe',
    'complement'           => 'bed',
    'coverage'             => 'bed',
    'fasta_from_bed'       => 'fasta',
    'genome_coverage'      => 'tab',
    'graph_union'          => 'bg',
    'group_by'             => 'bed',
    'intersect'            => 'bed|bam',
    'links'                => 'html',
    'mask_fasta_from_bed'  => 'fasta',
    'merge'                => 'bed',
    'overlap'              => 'bed',
    'pair_to_bed'          => 'bedpe|bam',
    'pair_to_pair'         => 'bedpe',
    'slop'                 => 'bed',
    'shuffle'              => 'bed',
    'sort'                 => 'bed',
    'subtract'             => 'bed',
    'window'               => 'bedpe'
    );

my %result_lookup = (
    'annotate'             => 1385,  # OK
    'bam_to_bed'           => 1385,  # OK
    'fasta_from_bed'       => 1385,  # OK
    'mask_fasta_from_bed'  => 1,     # OK
    'shuffle'              => 828,   # OK
    'window'               => 74998, # OK
    'closest'              => 845,   # OK
    'genome_coverage'      => 38,    # OK
    'merge'                => 242,   # OK
    'slop'                 => 828,   # OK
    'complement'           => 291,   # OK - change in data provided by BEDTools or behaviour of complementBed? was 243
    'intersect'            => 72534, # OK
    'pair_to_bed'          => 2,     # OK
    'sort'                 => 828,   # OK
    'coverage'             => 57261, # OK
    'links'                => 11603, # OK
    'pair_to_pair'         => 497,   # OK
    'subtract'             => 57959, # OK
    'group_by'             => 1,     # OK
    'b12_to_b6'            => 1385,  # OK
    'overlap'              => 500    # OK
    );

SKIP : {
    test_skip( -requires_executable => $bedtoolsfac,
               -tests => 421 );

    # Sorry to all those out there who don't have a find command
    # - perhaps someone can add an alternative
    my $rmsk_bed;
    my $gene_bed;
    my $mm8_genome;
    my $mm9_genome;
    my $hg18_genome;
    my $hg19_genome;
    if ($^O =~ m/mswin/i) {
        use Cwd 'getcwd';
        my $dir = getcwd;

        # Assume files are in drive C:\, change location if this is not true
        chdir 'C:\\'; # Search in all C:
        ($rmsk_bed)    = `cmd.exe /C dir /B /S rmsk.hg18.chr21.bed 2>NUL`;
        ($gene_bed)    = `cmd.exe /C dir /B /S knownGene.hg18.chr21.bed 2>NUL`;
        ($mm8_genome)  = `cmd.exe /C dir /B /S mouse.mm8.genome 2>NUL`;
        ($mm9_genome)  = `cmd.exe /C dir /B /S mouse.mm9.genome 2>NUL`;
        ($hg18_genome) = `cmd.exe /C dir /B /S human.hg18.genome 2>NUL`;
        ($hg19_genome) = `cmd.exe /C dir /B /S human.hg19.genome 2>NUL`;

        chdir $dir; # Go back to $dir;
    }
    else {
        ($rmsk_bed)    = `find /usr -name 'rmsk.hg18.chr21.bed' 2>/dev/null`;
        ($gene_bed)    = `find /usr -name 'knownGene.hg18.chr21.bed' 2>/dev/null`;
        ($mm8_genome)  = `find /usr -name 'mouse.mm8.genome' 2>/dev/null`;
        ($mm9_genome)  = `find /usr -name 'mouse.mm9.genome' 2>/dev/null`;
        ($hg18_genome) = `find /usr -name 'human.hg18.genome' 2>/dev/null`;
        ($hg19_genome) = `find /usr -name 'human.hg19.genome' 2>/dev/null`;
    }
    chomp $rmsk_bed    if $rmsk_bed;
    chomp $gene_bed    if $gene_bed;
    chomp $mm8_genome  if $mm8_genome;
    chomp $mm9_genome  if $mm9_genome;
    chomp $hg18_genome if $hg18_genome;
    chomp $hg19_genome if $hg19_genome;

    COMMAND : for (@commands) {

        $v && diag("Testing command: '$_'");
        ok( my $bedtoolsfac = Bio::Tools::Run::BEDTools->new(-command => $_),
            "make a factory using command '$_'" );
        $v && diag(" return command");
        is( my $command = $bedtoolsfac->command, $_, "factory command for '$_' is correct" );
        $v && diag(" return switches and params");
        is( scalar $bedtoolsfac->available_parameters, $p{$_}+1+$s{$_}, "all available options for '$_'" );
        is( scalar $bedtoolsfac->available_parameters('params'), $p{$_}+1, "available parameters for '$_'" );
        is( scalar $bedtoolsfac->available_parameters('switches'), $s{$_}, "available switches for '$_'" );
        $v && diag(" return executable version");
        ok( $bedtoolsfac->version, "get version for '$_'" );

        for ($command) {
            $v && diag(" run command as default");
            m/^annotate$/ && do {
                ok( my $result = $bedtoolsfac->run( -bgv => $bed_file,
                                                    -ann => [$bed3_file] ),
                    "can run command '$command'" );
                last;
            };
            m/^bam_to_bed$/ && do {
                ok( my $result = $bedtoolsfac->run( -bam => $bam_file ),
                    "can run command '$command'" );
                last;
            };
            m/^(?:fasta_from_bed|mask_fasta_from_bed)$/ && do {
                ok( my $result = $bedtoolsfac->run( -seq => $fas_file,
                                                    -bgv => $bed_file ),
                    "can run command '$command'" );
                last;
            };
            m/^(?:bed_to_IGV|merge|sort|links)$/ && do {
                ok( my $result = $bedtoolsfac->run( -bgv => $gene_bed ),
                    "can run command '$command'" );
                last;
            };
            m/^(?:bed_to_bam|shuffle|slop|complement)$/ && do {
                is( $bedtoolsfac->add_bidirectional(100), 100,
                    "can set parameter -add_bidirectional => 100 " ) if $command eq 'slop';
                ok( my $result = $bedtoolsfac->run( -bgv    => $gene_bed,
                                                    -genome => $hg18_genome ),
                    "can run command '$command'" );
                last;
            };
            m/^genome_coverage$/ && do {
                ok( my $result = $bedtoolsfac->run( -bed    => $gene_bed,
                                                    -genome => $hg18_genome ),
                    "can run command '$command'" );
                last;
            };
            m/^(?:window|closest|coverage|subtract|intersect)$/ && do {
                ok( my $result = $bedtoolsfac->run( -bgv1 => $gene_bed,
                                                    -bgv2 => $rmsk_bed ),
                    "can run command '$command'" );
                last;
            };
            m/^pair_to_pair$/ && do {
                is( $bedtoolsfac->type('neither'), 'neither',
                    "can set parameter -type => 'neither'" );
                ok( my $result = $bedtoolsfac->run( -bedpe1 => $bedpe1_file,
                                                    -bedpe2 => $bedpe2_file ),
                    "can run command '$command'" );
                last;
            };
            m/^pair_to_bed$/ && do {
                ok( my $result = $bedtoolsfac->run( -bedpe => $bedpe1_file,
                                                    -bgv => $bed3_file ),
                    "can run command '$command'" );
                last;
            };
            m/^overlap$/ && do {
                is( $bedtoolsfac->columns('2,3,5,6'), '2,3,5,6',
                    "can set parameter -columns => '2,3,5,6' " );
                ok( my $result = $bedtoolsfac->run( -bed => $bedpe1_file, ),
                    "can run command '$command'" );
                last;
            };
            m/^b12_to_b6$/ && do {
                ok( my $result = $bedtoolsfac->run( -bed => $bed12_file, ),
                    "can run command '$command'" );
                last;
            };
            m/^group_by$/ && do {
                is( $bedtoolsfac->group(1), 1,
                    "can set parameter -group => 1 " );
                is( $bedtoolsfac->columns('2,2,3,3'), '2,2,3,3',
                    "can set parameter -columns => '2,2,3,3' " );
                is( $bedtoolsfac->operations('min,max,min,max'), 'min,max,min,max',
                    "can set parameter -operations => 'min,max,min,max' " );
                ok( my $result = $bedtoolsfac->run( -bed => $bed3_file ),
                    "can run command '$command'" );
                last;
            };
            m/^graph_union$/ && do {
                ok( my $result = $bedtoolsfac->run( -bg => [$bg1_file, $bg2_file, $bg2_file] ),
                    "can run command '$command'" );
                last;
            };
            do {
                # we should never get here - internal test
                fail( "all commands tested - missed '$_'");
            };
        }

        $v && diag(" check file has been written");
        ok( eval { (-e $bedtoolsfac->result && -r _) }, "result files exists for command '$command'");
        $v && diag(" check can get internal result format description and confirm it");
        ok( my $format = $bedtoolsfac->result( -want => 'format' ),
            "can return output format for command '$command'" );
        like( $format, qr/(?:$format_lookup{$command})/,
            "result claims to be in correct format for command '$command'" );
        $v && diag(" check can get internal result file name value");
        ok( my $file = $bedtoolsfac->result(-want=>'raw'),
            "can return output file for command '$command'" );
        if ($command eq 'links') {
            $v && diag(" check result file is html and correct size");
            ok eval { (-e $file)&&(-r _) }, "make readable output";
            open (FILE, $file);
            my $lines =(my $first_line)= <FILE>;
            close FILE;
            like( $first_line, qr/\<html\>/, " - html tag line");
            is( $lines, $result_lookup{$command}, " - number of lines");
        } elsif ($command eq 'genome_coverage') {
            $v && diag(" check result file is correct size");
            ok eval { (-e $file)&&(-r _) }, "make readable output";
            open (FILE, $file);
            my $lines =()= <FILE>;
            close FILE;
            is( $lines, $result_lookup{$command}, " - number of lines");
        } else {
            $v && diag(" check can get internal result format matches result file");
            my $guesser = Bio::Tools::GuessSeqFormat->new( -file => $file );
            for ($format_lookup{$command}) {
                m/^(?:bed|bedpe|tab)$/ && do {
                    is( $guesser->guess, 'tab', "file format of '$file' consistent with claim for '$command'" );
                    last;
                };
                m/^fasta$/ && do {
                    is( $guesser->guess, 'fasta', "file format consistent with claim for '$command'" );
                }
            }
        }
        $v && diag(" check can get and set wanted result type");
        is( $bedtoolsfac->want('Bio::Root::IO'), 'Bio::Root::IO',
            "can set want to IO object for command '$command'" );
        $v && diag(" check can get a Bio::Root::IO object");
        ok( my $objres = $bedtoolsfac->result, "can get the basic object result for command '$command'" );
        $v && diag(" - check can it is actually a Bio::Root::IO object");
        isa_ok( $objres, 'Bio::Root::IO', "returned object is correct for command '$command'" );
        for ($format_lookup{$command}) {
            $v && diag(" check can can get format-specific result object if supported");
            m/(?:bed|bedpe)/ && do {
                $v && diag(" - Bio::SeqFeature::Collection");
                ok( my $objres = $bedtoolsfac->result( -want => 'Bio::SeqFeature::Collection' ),
                    "can get the specific object result for command '$command'" );
                isa_ok( $objres, 'Bio::SeqFeature::Collection',
                    "returned object is correct for command '$command'" );
                $v && diag(" - correct number of features");
                is( scalar $objres->get_all_features, $result_lookup{$command},
                    "correct number of features for command '$command'" );
                last;
            };
            m/^fasta$/ && do {
                $v && diag(" - Bio::SeqIO");
                ok( my $objres = $bedtoolsfac->result( -want => 'Bio::SeqIO' ),
                    "can get the specific object result for command '$command'" );
                isa_ok( $objres, 'Bio::SeqIO',
                    "returned object is correct for command '$command'" );
                my $seq_count = 0;
                while( my $seq = $objres->next_seq ) {
                    $seq_count++;
                }
                $v && diag(" - correct number of sequences");
                is( $seq_count, $result_lookup{$command}, "correct number of sequences for command '$command'" );
            }
        }
    }
}

1;