The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/env perl

use warnings;
use strict;

use Carp;
use Getopt::Long;
use Pod::Usage;

use Bio::SimpleAlign;
use Bio::LocatableSeq;
use Bio::AlignIO;

my %a;
GetOptions( \%a, 'help|h|?', ) or pod2usage( -verbose => 2 );
pod2usage( -verbose => 2 ) if ( $a{help} );

my ( $aln_file, $clustal_file ) = @ARGV;
pod2usage(
  -msg       => "ERROR: MUMMER alignment file not supplied or non-existent\n",
  -verbose   => 1,
  -noperldoc => 1
) unless ( $aln_file && -f $aln_file );

#set clustalw output
my %aln_opt = ( -format => 'clustalw' );
if ( $clustal_file && $clustal_file ne '-' ) {
  # real file
  $aln_opt{'-file'} = ">$clustal_file";
} else {
  # no file/stdout
  $aln_opt{'-fh'} = \*STDOUT;
}
my $aln_out = Bio::AlignIO->new(%aln_opt);

open my $aln_fh, '<', $aln_file or confess "Can't open filehandle: $!";
my $info = next_aln($aln_fh);
die "Can't find alignment header" unless ( $info =~ /^-- Alignments between (\S+) and (\S+)$/m );

while ( my $ao = next_aln($aln_fh) ) {
  $aln_out->write_aln( parse_alignment( $1, $2, $ao ) );
}

$aln_fh->close;

sub next_aln {
  my ($aln_fh) = @_;

  local $/ = "\n-- BEGIN alignment ";
  return <$aln_fh>;

}

sub parse_alignment {
  my ( $r_id, $q_id, $data ) = @_;

  my @raw_alignment = grep {/^\[|\d/} split /\n/, $data;

  #get rid of 1st and last lines (coord infomation)
  pop @raw_alignment;
  die
    unless ( ( shift @raw_alignment )
    =~ /^\[\s+([-+]1)\s+(\d+)\s+-\s+(\d+)\s+\|\s+([+-]1)\s+(\d+)\s+-\s+(\d+)\s+\]$/ );
  my %coords = (
    r_strand => ( $1 > 0 ? '+' : '-' ),
    r_start  => $2,
    r_end    => $3,
    q_strand => ( $4 > 0 ? '+' : '-' ),
    q_start  => $5,
    q_end    => $6,
  );

  my @alignments;
  for ( my $i = 0; $i < @raw_alignment; $i++ ) {
    my ( $start, $seq ) = split /\s+/, $raw_alignment[$i];
    $seq =~ s/\r\n/\n/;
    chomp $seq;
    $alignments[ $i % 2 ] .= $seq;
  }

  my $aln = Bio::SimpleAlign->new( -source => 'jwb', );

  #species-refseq(strand)/start-end
  #rice-3(+)/16598648-16600199
  #c_remanei-Crem_Contig172(-)/123228-124941

  my $r_seq = Bio::LocatableSeq->new(
    -seq   => $alignments[0],
    -id    => substr( $r_id, 0, 1 ) . "-$r_id($coords{r_strand})",
    -start => $coords{r_start},
  );
  $aln->add_seq($r_seq);

  my $q_seq = Bio::LocatableSeq->new(
    -seq   => $alignments[1],
    -id    => substr( $q_id, 0, 1 ) . "-$q_id($coords{q_strand})",
    -start => $coords{q_start},
  );
  $aln->add_seq($q_seq);

  $aln->map_chars( '\.', '-' );
  return $aln;
}

__END__

=head1 NAME

show-align2clustalw.pl - convert mummer alignment files to clustalw format

=head1 SYNOPSIS
    
    # store results in OUTPUT.aln
    perl show-align2clustalw.pl [OPTIONS] <MUMMER_ALIGNMENT.aln> <CLUSTALW_OUTPUT.aln>

    # print results to standard output
    perl show-align2clustalw.pl [OPTIONS] <MUMMER_ALIGNMENT.aln>
    # or
    perl show-align2clustalw.pl [OPTIONS] <MUMMER_ALIGNMENT.aln> -

=head1 DESCRIPTION

This script converts MUMMER's show-aligns output to clustalw formatted output.

=head1 OPTIONS

Alternative option names are separated by "|".

=over 4

=item B<< --help|h|? >>

Complete help.

=back

=cut
 
=head1 SEE ALSO

L<Bio::FeatureIO>, L<Bio::SeqFeature::Generic>

=head1 AUTHOR

jw bargsten, C<< <joachim.bargsten at wur.nl> >>

=cut