The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!perl

=head1 NAME

bp_search2gff

=head1 SYNOPSIS

Usage:

  bp_search2gff [-o outputfile] [-f reportformat] [-i inputfilename]  OR file1 file2 ..

=head1 DESCRIPTION

This script will turn a SearchIO report (BLAST, FASTP, SSEARCH, 
AXT, WABA) into GFF.

The options are:

   -i infilename      - (optional) inputfilename, will read
                        either ARGV files or from STDIN
   -o filename        - the output filename [default STDOUT]
   -f format          - search result format (blast, fasta,waba,axt)
                        (ssearch is fasta format). default is blast.
   -t/--type seqtype  - if you want to see query or hit information
                        in the GFF report
   -s/--source        - specify the source (will be algorithm name
                        otherwise like BLASTN)
   --method           - the method tag (primary_tag) of the features
                        (default is similarity)
   --scorefunc        - a string or a file that when parsed evaluates
                        to a closure which will be passed a feature
                        object and that returns the score to be printed
   --locfunc          - a string or a file that when parsed evaluates
                        to a closure which will be passed two
                        features, query and hit, and returns the
                        location (Bio::LocationI compliant) for the
                        GFF3 feature created for each HSP; the closure
                        may use the clone_loc() and create_loc()
                        functions for convenience, see their PODs
   --onehsp           - only print the first HSP feature for each hit
   -p/--parent        - the parent to which HSP features should refer
                        if not the name of the hit or query (depending
                        on --type)
   --target/--notarget - whether to always add the Target tag or not
   -h                 - this help menu
   --version          - GFF version to use (put a 3 here to use gff 3)
   --component        - generate GFF component fields (chromosome)
   -m/--match         - generate a 'match' line which is a container
                        of all the similarity HSPs
   --addid            - add ID tag in the absence of --match
   -c/--cutoff        - specify an evalue cutoff

Additionally specify the filenames you want to process on the
command-line.  If no files are specified then STDIN input is assumed.
You specify this by doing: bp_search2gff E<lt> file1 file2 file3

=head1 AUTHOR

Jason Stajich, jason-at-bioperl-dot-org

=head1 Contributors

Hilmar Lapp, hlapp-at-gmx-dot-net

=cut

use strict;
use warnings;
use Bio::Tools::GFF;
use Getopt::Long;
use Bio::SearchIO;
use Bio::Location::Simple;    # pre-declare to simplify $locfunc implementations
use Bio::Location::Atomic;    # pre-declare to simplify $locfunc implementations
use Storable qw(dclone);      # for cloning location objects
use Bio::Factory::FTLocationFactory;

my (
    $output,       # output file (if not stdout)
    $input,        # name of the input file
    $format,       # format of the input file, defauly is blast
    $type,         # 'query' or 'hit'
    $cutoff,       # cut-off value for e-value filter
    $sourcetag,    # explicit source tag (will be taken from program
                   # otherwise
    $methodtag,    # primary tag (a.k.a. method), default 'similarity'
    $gffver,       # GFF version (dialect) to write
    $scorefunc,    # closure returning the score for a passed feature
    $locfunc,      # closure returning a location object for a passed
                   # query and hit feature
    $addid,        # flag: whether to always add the ID for $match == 0
    $parent,       # the name of the parent to use; if set and $match == 0
                   # will always add the target
    $comp,         # flag: whether to print a component feature
    $addtarget,    # flag: whether to always add the Target tag, default
                   # is true
    $match,        # flag: whether to print match lines as containers
    $onehsp,       # flag: whether to consider only the first HSP for a hit
    $quiet,        # flag: run quietly
    $help          # flag: show help screen
);                 

# set defaults:
$format    = 'blast';
$type      = 'query';
$gffver    = 2;
$methodtag = "similarity";
$addtarget = 1;

GetOptions(
    'i|input:s'   => \$input,
    'component'   => \$comp,
    'm|match'     => \$match,
    'o|output:s'  => \$output,
    'f|format:s'  => \$format,
    's|source:s'  => \$sourcetag,
    'method=s'    => \$methodtag,
    'addid'       => \$addid,
    'scorefunc=s' => \$scorefunc,
    'locfunc=s'   => \$locfunc,
    'p|parent=s'  => \$parent,
    'target!'     => \$addtarget,
    'onehsp'      => \$onehsp,
    't|type:s'    => \$type,
    'c|cutoff:s'  => \$cutoff,
    'v|version:i' => \$gffver,
    'q|quiet'     => \$quiet,
    'h|help'      => sub {
        exec( 'perldoc', $0 );
        exit(0);
    },
);
$type = lc($type);
if ( $type =~ /target/ ) { $type = 'hit' }
elsif ( $type ne 'query' && $type ne 'hit' ) {
    die("seqtype must be either 'query' or 'hit'");
}

# custom or default function returning the score
$scorefunc =
  defined($scorefunc) ? parse_code($scorefunc) : sub { shift->score };

# custom or default function returning the location
$locfunc = defined($locfunc) ? parse_code($locfunc) : sub { shift->location };

# if --match is given then $addid needs to be disabled
$addid = undef if $addid && $match;

# if no input is provided STDIN will be used
my $parser = new Bio::SearchIO(
    -format  => $format,
    -verbose => $quiet ? -1 : 0,
    -file    => $input
);

my $out;
if ( defined $output ) {
    $out = new Bio::Tools::GFF(
        -gff_version => $gffver,
        -file        => ">$output"
    );
}
else {
    $out = new Bio::Tools::GFF( -gff_version => $gffver );    # STDOUT
}
my ( %seen_hit, %seen );
my $other = $type eq 'query' ? 'hit' : 'query';

while ( my $result = $parser->next_result ) {
    my $qname = $result->query_name;
    if (   $comp
        && $type eq 'query'
        && $result->query_length )
    {
        $out->write_feature(
            Bio::SeqFeature::Generic->new(
                -start       => 1,
                -end         => $result->query_length,
                -seq_id      => $qname,
                -source_tag  => 'chromosome',
                -primary_tag => 'Component',
                -tag         => {
                    'Sequence' => $qname
                }
            )
        );
    }
    while ( my $hit = $result->next_hit ) {
        next if ( defined $cutoff && $hit->significance > $cutoff );
        my $acc = $qname;
        if ( $seen{ $qname . "-" . $hit->name }++ ) {
            $acc = $qname . "-" . $seen{ $qname . '-' . $hit->name };
        }

        if (   $comp
            && $type eq 'hit'
            && $hit->length
            && !$seen_hit{ $hit->name }++ )
        {
            $out->write_feature(
                Bio::SeqFeature::Generic->new(
                    -start       => 1,
                    -end         => $hit->length,
                    -seq_id      => $hit->name,
                    -source_tag  => 'chromosome',
                    -primary_tag => 'Component',
                    -tag         => {
                        'Sequence' => $hit->name
                    }
                )
            );
        }
        my ( %min, %max, $seqid, $name, $st );
        while ( my $hsp = $hit->next_hsp ) {
            my $feature = new Bio::SeqFeature::Generic;
            my ( $proxyfor, $otherf );
            if ( $type eq 'query' ) {
                ( $proxyfor, $otherf ) = ( $hsp->query, $hsp->hit );
                $name ||= $hit->name;
            }
            else {
                ( $otherf, $proxyfor ) = ( $hsp->query, $hsp->hit );
                $name ||= $acc;
            }
            $proxyfor->score( $hit->bits ) unless ( $proxyfor->score );
            if ( ( $gffver == 3 ) && ( $match || $parent ) ) {
                $feature->add_tag_value( 'Parent', $parent || $name );
            }

            $min{$type} = $proxyfor->start
              unless defined $min{$type} && $min{$type} < $proxyfor->start;
            $max{$type} = $proxyfor->end
              unless defined $max{$type} && $max{$type} > $proxyfor->end;
            $min{$other} = $otherf->start
              unless defined $min{$other} && $min{$other} < $otherf->start;
            $max{$other} = $otherf->end
              unless defined $max{$other} && $max{$other} > $otherf->end;
            if ( $addtarget || $match ) {
                $feature->add_tag_value( 'Target', 'Sequence:' . $name );
                $feature->add_tag_value( 'Target', $otherf->start );
                $feature->add_tag_value( 'Target', $otherf->end );
            }
            if ($addid) {
                $feature->add_tag_value( 'ID', $name );
            }

            $feature->location( &$locfunc( $proxyfor, $otherf ) );

            #  strand for feature is always going to be product of
            #  query & hit strands so that target can always be just
            #  '+'
            $feature->strand( $proxyfor->strand * $otherf->strand );
            if ($sourcetag) {
                $feature->source_tag($sourcetag);
            }
            else {
                $feature->source_tag( $proxyfor->source_tag );
            }
            $feature->score( &$scorefunc($proxyfor) );
            $feature->frame( $proxyfor->frame );
            $feature->seq_id( $proxyfor->seq_id );
            $feature->primary_tag($methodtag);

            # add annotation if encoded in the query description
            my $desc = $result->query_description;
            while ( $desc =~ /\/([^=]+)=(\S+)/g ) {
                $feature->add_tag_value( $1, $2 );
            }
            $seqid ||= $proxyfor->seq_id;
            $out->write_feature($feature);
            $st ||= $sourcetag || $proxyfor->source_tag;
            last if $onehsp;
        }
        if ($match) {

            my $matchf = Bio::SeqFeature::Generic->new(
                -start       => $min{$type},
                -end         => $max{$type},
                -strand      => $hit->strand($type) * $hit->strand($other),
                -primary_tag => 'match',
                -source_tag  => $st,
                -score       => $hit->bits,
                -seq_id      => $seqid
            );
            if ( $gffver == 3 ) {
                $matchf->add_tag_value( 'ID', $name );
            }
            $matchf->add_tag_value( 'Target', "Sequence:$name" );
            $out->write_feature($matchf);
        }
    }
}

sub parse_code {
    my $src = shift;
    my $code;

    # file or subroutine?
    if ( -r $src ) {
        if ( !( ( $code = do $src ) && ( ref($code) eq "CODE" ) ) ) {
            die "error in parsing code block $src: $@" if $@;
            die "unable to read file $src: $!"         if $!;
            die "failed to run $src, or it failed to return a closure";
        }
    }
    else {
        $code = eval $src;
        die "error in parsing code block \"$src\": $@" if $@;
        die "\"$src\" fails to return a closure"
          unless ref($code) eq "CODE";
    }
    return $code;
}

=head2 clone_loc

Title   : clone_loc
Usage   : my $l = clone_loc($feature->location);
Function: Helper function to simplify the task of cloning locations
           for --locfunc closures.

          Presently simply implemented using Storable::dclone().
Example :
Returns : A L<Bio::LocationI> object of the same type and with the
          same properties as the argument, but physically different.
          All structured properties will be cloned as well.
Args    : A L<Bio::LocationI> compliant object

=cut

sub clone_loc {
    return dclone(shift);
}

=head2 create_loc

Title   : create_loc
Usage   : my $l = create_loc("10..12");
Function: Helper function to simplify the task of creating locations
          for --locfunc closures. Creates a location from a feature-
          table formatted string.

Example :
Returns : A L<Bio::LocationI> object representing the location given
          as formatted string.
Args    : A GenBank feature-table formatted string.

=cut

sub create_loc {
    return Bio::Factory::FTLocationFactory->from_string(shift);
}