The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/perl
# PROGRAM  : clustalw.pl
# PURPOSE  : Demonstrate possible uses of Bio::Tools::Run::Alignment::Clustalw.pm
# AUTHOR   : Peter Schattner schattner@alum.mit.edu
# CREATED  : Oct 06 2000
#
# INSTALLATION
#
# You will need to have installed clustalw and to ensure that Clustalw.pm can find it.
# This can be done in different ways (bash syntax):
#	   export PATH=$PATH:/home/peter/clustalw1.8
#  or
#     define an environmental variable CLUSTALDIR:
#	   export CLUSTALDIR=/home/peter/clustalw1.8   
#  or
#     include a definition of an environmental variable CLUSTALDIR in every
#     script that will use Clustal.pm.
#	   BEGIN {$ENV{CLUSTALDIR} = '/home/peter/clustalw1.8/'; }
#
#  We are going to demonstrate 3 possible applications of Clustalw.pm:
#	1. Test effect of varying clustalw alignment parameter(s) on resulting alignment
#	2. Test effect of changing the order that sequences are added to the alignment
#		on the resulting alignment
#	3. Test effect of incorporating an "anchor point" in the alignment process
#
#  Before we can do any tests, we need to set up the environment, create the factory
#  and read in the unaligned sequences.
#

#BEGIN {
#	$ENV{CLUSTALDIR} = '/home/peter/clustalw1.8/';
#}

use Getopt::Long;
use Bio::Tools::Run::Alignment::Clustalw;
use Bio::SimpleAlign;
use Bio::AlignIO;
use Bio::SeqIO;
use strict;

# set some default values
my $infile = 't/data/cysprot1a.fa';
my @params = ('quiet' => 1 );
my $do_only = '123';   # string listing examples to be executed. Default is to
			              # execute all tests (ie 1,2 and 3)
my $param = 'ktuple';  # parameter to be varied in example 1
my $startvalue = 1;    # initial value for parameter $param
my $stopvalue = 3;     # final value for parameter $param
my $regex = 'W[AT]F';  # regular expression for 'anchoring' alignment in example 3
my $extension = 30; 	  # distance regexp anchor should be extended in each direction
			              # for local alignment in example 3
my $helpflag = 0;      # Flag to show usage info.

# get user options
my @argv = @ARGV;  # copy ARGV before GetOptions() massacres it.

&GetOptions("h!" => \$helpflag, "help!" => \$helpflag,
				"in=s" => \$infile,
				"param=s" => \$param,
				"do=s" =>  \$do_only,
				"start=i" =>  \$startvalue,
				"stop=i" =>  \$stopvalue,
				"ext=i" =>  \$extension,
				"regex=s" =>  \$regex,) ;

if ($helpflag) { &clustalw_usage(); exit 0;}

# create factory & set user-specified global clustalw parameters
foreach my $argv (@argv) {
	unless ($argv =~ /^(.*)=>(.*)$/) { next;}
	push (@params, $1 => $2);
}
my  $factory = Bio::Tools::Run::Alignment::Clustalw->new(@params);
	

# put unaligned sequences in a Bio::Seq array
my $str = Bio::SeqIO->new(-file=> $infile, '-format' => 'Fasta');
my ($paramvalue, $aln, $subaln, @consensus, $seq_num, $string, $strout, $id);
my @seq_array =();
while ( my $seq = $str->next_seq() ) { push (@seq_array, $seq) ;}

# Do each example that has digit present in variable $do_only
$_= $do_only;
/1/ && &vary_params();
/2/ && &vary_align_order();
/3/ && &anchored_align();

## End of "main"

#################################################
#   vary_params(): Example demonstrating varying of clustalw parameter
#

sub vary_params {

	print "Beginning parameter-varying example... \n";

	# Now we'll create several alignments, 1 for each value of the selected
	# parameter. We also compute a simple consensus string for each alignment.
	# (In the default case, we vary the "ktuple" parameter,  creating 3
	# alignments using ktuple values from 1 to 3.)

	my $index =0;
	for ($paramvalue = $startvalue; $paramvalue < ($stopvalue + 1); $paramvalue++) {
		$factory->$param($paramvalue);  # set parameter	value
		print "Performing alignment with $param = $paramvalue \n";
		$aln = $factory->align(\@seq_array);
		$string = $aln->consensus_string(); # Get consensus of alignment
		# convert '?' to 'X' at non-consensus positions
		$string =~ s/\?/X/g;
		$consensus[$index] = Bio::Seq->new(-id=>"$param=$paramvalue",-seq=>$string);
		$index++;
	}
	# Compare consensus strings for alignments with different $param values by
	# making an alignment of the different consensus strings
	# $factory->ktuple(1);  # set ktuple parameter	
	print "Performing alignment of $param consensus sequences \n";
	$aln = $factory->align(\@consensus);
	$strout = Bio::AlignIO->newFh('-format' => 'msf');
	print $strout $aln;

	return 1;
}


#################################################
#   vary_align_order():
#
# For our second example, we'll test the effect of changing the order
# that sequences are added to the alignment

sub vary_align_order {

	print "\nBeginning alignment-order-changing example... \n";

	@consensus = ();  # clear array
	for ($seq_num = 0; $seq_num < scalar(@seq_array); $seq_num++) {
		my $obj_out = shift @seq_array;  # remove one Seq object from array and save
		$id = $obj_out->display_id;
		# align remaining sequences
		print "Performing alignment with sequence $id left out \n";
		$subaln = $factory->align(\@seq_array);
		# add left-out sequence to subalignment
		$aln = $factory->profile_align($subaln,$obj_out);
		$string = $aln->consensus_string(); # Get consensus of alignment
		# convert '?' to 'X' for non-consensus positions
		$string =~ s/\?/X/g;
		$consensus[$seq_num] = Bio::Seq->new(-id=>"$id left out",-seq=>$string);
		push @seq_array, $obj_out;  # return Seq object for next (sub) alignment
	}

	# Compare consensus strings for alignments created in different orders
	# $factory->ktuple(1);  # set ktuple parameter	
	print "\nPerforming alignment of consensus sequences for different reorderings \n";
	print "Each consensus is labeled by the sequence which was omitted in the initial alignment\n";
	$aln = $factory->align(\@consensus);
	$strout = Bio::AlignIO->newFh('-format' => 'msf');
	print $strout $aln;

	return 1;
}

#################################################
#   anchored_align()
#
# For our last example, we'll test a way to perform a local alignment by
# "anchoring" the alignment to a regular expression.  This is similar
# to the approach taken in the recent dbclustal program.
# In principle, we could write a script to search for a good regular expression
# to use. Instead, here we'll simply choose one manually after looking at the
# previous alignments.

sub anchored_align {

	my @local_array = ();
	my @seqs_not_matched = ();

	print "\n Beginning anchored-alignment example... \n";

	for ($seq_num = 0; $seq_num < scalar(@seq_array); $seq_num++) {
		my $seqobj = $seq_array[$seq_num];
		my $seq =  $seqobj->seq();
		my $id =  $seqobj->id();
		# if $regex is not found in the sequence, save sequence id name and set
		# array value =0 for later
		unless ($seq =~/$regex/) {
			$local_array[$seq_num] = 0;
			push (@seqs_not_matched, $id) ;
			next;
		}
		# find positions of start and of subsequence to be aligned
		my $match_start_pos = length($`);
		my $match_stop_pos = length($`) + length($&);
		my	$start =  ($match_start_pos - $extension) > 1 ? 
		  ($match_start_pos - $extension) +1 : 1;
		my	$stop =  ($match_stop_pos + $extension) < length($seq) ?
		  ($match_stop_pos + $extension) : length($seq);
		my $string = $seqobj->subseq($start, $stop);

		$local_array[$seq_num] = Bio::Seq->new(-id=>$id, -seq=>$string);
	}
	@local_array = grep $_ , @local_array; # remove array entries with no match

	# Perform alignment on the local segments of the sequences which match "anchor"
	$aln = $factory->align(\@local_array);
	my $consensus  = $aln->consensus_string(); # Get consensus of local alignment

	if (scalar(@seqs_not_matched) ) {
		print " Sequences not matching $regex : @seqs_not_matched \n"
	} else {
		print " All sequences match $regex : @seqs_not_matched \n"
	}
	print "Consensus sequence of local alignment: $consensus \n";

	return 1;
}

#----------------
sub clustalw_usage {
#----------------

#-----------------------
# Prints usage information for general parameters.

    print STDERR <<"QQ_PARAMS_QQ";

 Command-line accessible script variables and commands:
 -------------------------------
 -h 		       :  Display this usage info and exit.
 -in <str>	    :  File containing input sequences in fasta format (default = $infile) .
 -do <str>	    :  String listing examples to be executed. Default is to execute
		             all tests (ie default = '123')
 -param <str>   :  Parameter to be varied in example 1. Any clustalw parameter
		             which takes inteer values can be varied (default = 'ktuple')
 -start <int>   :  Initial value for varying parameter in example 1 (default = 1)
 -stop <int>    :  Final value for varying parameter (default = 3)
 -regex   <str> :  Regular expression for 'anchoring' alignment in example 3
                   (default = $regex)
 -ext <int>     :  Distance regexp anchor should be extended in each direction
		             for local alignment in example 3   (default = 30)

In addition, any valid Clustalw parameter can be set using the syntax 
"parameter=>value" as in "ktuple=>3"

So a typical command lines might be:
 > clustalw.pl -param=pairgap -start=2 -stop=3 -do=1 "ktuple=>3"
or
 > clustalw.pl -ext=10 -regex='W[AST]F' -do=23 -in='t/cysprot1a.fa'

QQ_PARAMS_QQ

}