The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/perl -w

=head1 NAME 

ace2gff.pl - convert a phrap produced, ace file into a gff formatted file

=head1 SYNOPSIS

  ./ace2gff.pl options ace_file

    -g|--gff=gff_version   gff version output (default:3)
    -h|--help              Print usage
    -t|--type=Source_type  specify the source type (default:phrap)


=head1 OPTIONS

  -g|--gff=gff_version   gff version output (default:3)
  -h|--help              Print usage
  -t|--type=Source_type  specify the source type (default:phrap)

=head1 DESCRIPTION

convert a phrap produced, ace file into a gff formatted file

This program has only been tested for ace files generated at 
Washington University in St. Louis.



=cut

# ----------------------------------------------------
use strict;
use Pod::Usage;
use Getopt::Long;
use Bio::Assembly::IO;
use Bio::Tools::GFF;

my $map_name;
my $map_start;
my $map_stop;
my $map_strand;
my $feature_name;
my $feature_start;
my $feature_stop;
my $feature_actual_start;
my $feature_strand;
my $gff_version=3;
my $source_type="phrap";
my $feature;
my $tags;

my ( $help, $test, $skip );
GetOptions( 
    'h|help'   => \$help,
    't|type=s'   => \$source_type,
    'g|gff=i'   => \$gff_version,
);
pod2usage if ($help or !@ARGV) ;


die "Version $gff_version of GFF is not supported\n"
    if ($gff_version<2 or $gff_version>3);

my $file_in = $ARGV[$#ARGV];

my $in  = Bio::Assembly::IO->new(-file => $file_in , '-format' => 'ace'); 
my $gffio = Bio::Tools::GFF->new( -gff_version => $gff_version);


while(my $assembly=$in->next_assembly()){
    my @contig_ids=$assembly->get_contig_ids;
    my @singlet_ids=$assembly->get_singlet_ids;
    last unless(@contig_ids or @singlet_ids);
    foreach my $contig_id (@contig_ids){
	###Set data for reference contig
	my $contig         = $assembly->get_contig_by_id($contig_id);
	$map_name          = "Contig".$contig_id;
	$map_strand        = $contig->{'_strand'};
	$map_start         = 1;
        $map_stop          = $contig->get_consensus_length();
	($map_start,$map_stop)=($map_stop,$map_start) if ($map_start > $map_stop);
	###Create a SeqFeature with the info for the reference line
	if ($gff_version==3){
	    $tags = {
		ID      => $map_name,
		Name    => $map_name,
	    } ;
	}
	elsif($gff_version==2){
	    $tags = {
		Contig => $map_name,
	    }; 
	}
	$feature = new Bio::SeqFeature::Generic
	    ( -start => $map_start, -end => $map_stop,
	      -strand => $map_strand, -primary => 'contig',
	      -source_tag   => $source_type,
	      -seq_id => $map_name,
	      -display_name => $map_name,
	      -tag    => $tags, 
	      );
    
	###Convert to gff and print
	print $gffio->write_feature($feature);

	###For each read in the contig, 
	###convert info into a SeqFeature,
	###and output as GFF
	my @seqs           = $contig->each_seq;
	die "ERROR: no reads in contig $contig_id\n" unless (@seqs);
	foreach my $seq (@seqs){ 
	    $feature_name  = $seq->id();
            $feature_start = $contig->get_seq_coord($seq)->start();
            $feature_stop  = $contig->get_seq_coord($seq)->end();
	    ($feature_start,$feature_stop)=($feature_stop,$feature_start) if ($feature_start > $feature_stop);
	    $feature_actual_start= $feature_start;
	    $feature_start=1 if ($feature_start<=0);
	    $feature_strand=($seq->strand()<0) ? -1: 1;

	    	
	    if ($gff_version==3){
		$tags = {
		    Parent  => $map_name,
		    ID      => $feature_name,
		    Name    => $feature_name,
		    };
		
	    }
	    elsif($gff_version==2){
		$tags = {
		    read  => $feature_name,
		}; 
	    }
	    if ($feature_actual_start!=$feature_start){
		$tags->{'actual_start'} = $feature_actual_start;
	    }
	    $feature = new Bio::SeqFeature::Generic
		( -start => $feature_start, -end => $feature_stop,
		  -strand => $feature_strand, -primary => 'read',
		  -source_tag   => $source_type,
		  -seq_id => $map_name,
		  -display_name => $feature_name,
		  -tag    => $tags,
		      );
	
	   
	    print $gffio->write_feature($feature);	    
	}
    }

    foreach my $singlet_id (@singlet_ids){
	###UNTESTED until I get an ace file with a singlet
	###Set data for singlet
	my $singlet         = $assembly->get_singlet_by_id($singlet_id);
	$map_name          = $singlet->id();
	$map_strand        = 1;
	$map_start         = 1;
        $map_stop          = $singlet->length();
	($map_start,$map_stop)=($map_stop,$map_start) if ($map_start > $map_stop);
	###Create a SeqFeature with the info for the reference line
	if ($gff_version==3){
	    $feature = new Bio::SeqFeature::Generic
		( -start => $map_start, -end => $map_stop,
		  -strand => $map_strand, -primary => 'read',
		  -source_tag   => $source_type,
		  -seq_id => $map_name,
		  -display_name => $map_name,
		  -tag    => {
		      ID      => $map_name,
		      Name    => $map_name,
		  } 
		  );
	}
	elsif($gff_version==2){
	    $feature = new Bio::SeqFeature::Generic
		( -start => $map_start, -end => $map_stop,
		  -strand => $map_strand, -primary => 'read',
		  -source_tag   => $source_type,
		  -seq_id => $map_name,
		  -display_name => $map_name,
		  -tag    => {
		      Contig => $map_name,
		  } 
		  );
	}
	###Convert to gff and print
	print $gffio->write_feature($feature);	
    }
}