The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/env perl
#original clustal2stockholm.pl from DART
#  (https://github.com/ihh/dart/blob/master/perl/clustal2stockholm.pl)
#   Modified by Christopher Bottoms to also incorporate structure data.
use strict;
use warnings;
use autodie;

use lib 'lib';
use Bio::App::SELEX::Stockholm;

my $usage = "\nUsage: $0 <ClustalW file> [gc_file]
           Converts a ClustalW file (from file or STDIN) to Stockholm format.
           Only converts sequence information, unless a secondary structure
           file is given as the second argument.\n";

main(@ARGV) unless caller();

sub main {
    my @cmd_line_args = @_;
    my @argv;
    my $gc_string;
    if (@ARGV) {
        my $arg     = shift @cmd_line_args;
        my $gc_file = shift @cmd_line_args;
        if ( defined $gc_file ) {
            open( my $gc_fh, '<', $gc_file );
            $gc_string = _get_last_gc_from_file($gc_fh);
            close($gc_fh);
        }

        # Exit with usage info if $arg is a flag. Otherwise save it for later.
        if ( $arg =~ /^-/ ) {
            if ( ( $arg eq "-h" ) || ( $arg eq "--help" ) ) {
                print $usage;
                exit;
            }
            else { die $usage; }
        }
        else {
            push @argv, $arg;
        }
    }
    else {

        # Make stdin the default input file.
        push @argv, "-";
    }

    my $file = shift @argv or die $usage;

    open( my $aln_fh, '<', $file );
    my $stk;

    if ( defined $gc_string ) {
        $stk =
          Bio::App::SELEX::Stockholm->new( gc => { SS_cons => $gc_string } );
    }
    else {
        $stk = Bio::App::SELEX::Stockholm->new();
    }

    my $gapChars = '-';
    while (<$aln_fh>) {
        my @a = split;
        next unless @a == 2;

        my ( $seq, $data ) = @a;
        next
          if ( $data =~ /[^a-zA-Z$gapChars]/ )
          ;    # skip primary sequence conservation lines, etc.

        if ( defined $stk->seqdata->{$seq} ) {
            $stk->seqdata->{$seq} .= $data;
        }
        else {
            push @{ $stk->seqname }, $seq;
            $stk->seqdata->{$seq} = $data;
        }

    }
    die unless $stk->is_flush();

    print $stk->to_string();
}

sub _extract_gc_string_from {
    my $line = shift;

    #Remove newline
    chomp $line;

    #Remove carriage return;
    $line =~ s/\r//;

    #Remove everything from the first space to the end of the line
    $line =~ s/\s+.*\z//;

    return $line;
}

sub _get_last_gc_from_file {
    my $fh = shift;

    #Read all lines of file
    my @lines = <$fh>;

    # Extract the gc string from the last line of the file
    my $gc_string = _extract_gc_string_from($lines[-1]);

    return $gc_string;
}

1;