The London Perl and Raku Workshop takes place on 26th Oct 2024. If your company depends on Perl, please consider sponsoring and/or attending.
#!/usr/bin/perl -w
# This script will load the gbrowse_syn alignment database directly from a
# multiple sequence alignment file.
BEGIN {
  # Check for DBI before running the script
  # Doing this here will allow the "compile" tests to pass for GBrowse
  # even if DBI is not installed.
  eval {
      require DBI;
      DBI->import;
  };

  if ($@) {
      die "The DBI perl module is required to run this script\n";
  }
}


use strict;
use Bio::AlignIO;
use List::Util 'sum';
use Getopt::Long;
use Bio::DB::GFF::Util::Binning 'bin';

use Data::Dumper;

use constant MINBIN   => 1000;
use constant FORMAT   => 'clustalw';
use constant VERBOSE  => 0;
use constant MAPRES   => 100;

use vars qw/$format $create $user $pass $dsn $aln_idx $verbose $nomap $mapres $hit_idx $pidx %map/;

$| = 1;
GetOptions(
	   'format=s'    => \$format,
           'user=s'      => \$user,
	   'pass=s'      => \$pass,
	   'dsn=s'       => \$dsn,
           'map=i'       => \$mapres,
	   'verbose'     => \$verbose,
	   'nomap'       => \$nomap,
	   'create'      => \$create
	   );

my $usage = "Usage: load_alignments_msa.pl -u username -p password -d database [-f format, -m map_resolution, -v, -n, -c] file1, file2 ... filen\n\n";

$dsn     || die $usage;
$user    || die $usage;
$format  ||= FORMAT;
$verbose ||= VERBOSE;
$mapres  ||= MAPRES;

my ($dbh,$sth_hit,$sth_map) = prepare_database($dsn,$user,$pass);

while (my $infile = shift) {
  print "Processing alignment file $infile...\n" if $verbose;
  my $alignIO = Bio::AlignIO->new( -file   => $infile, 
				   -format => $format);

  while (my $aln = $alignIO->next_aln) {
    my $len = $aln->length;
    $pidx = 0;
    print STDERR "Processing Multiple Sequence Alignment " . ++$aln_idx . " (length $len)\t\t\t\r" if $verbose; 
    next if $aln->num_sequences < 2;
    my %seq;
    %map = ();
    my $map = {};
    for my $seq ($aln->each_seq) {
      my $seqid = $seq->id;
      my ($species,$ref,$strand) = check_name_format($seqid,$seq);
      next if $seq->seq =~ /^-+$/;
      $strand ||= $seq->start < $seq->end ? '+' : '-';
      # We have to tell the sequence object what its strand is
      $seq->strand($strand eq '-' ? -1 : 1) unless $seq->strand;
      $seq{$species} = [$ref, $seq->display_name, $seq->start, $seq->end, $strand, $seq->seq, $seq]; 
    }
    
    # make all pairwise hits and grid coordinates
    my @species = keys %seq;
    
    for my $p (map_pairwise(@species)) {
      my ($s1,$s2) = @$p;
      my $array1 = $seq{$s1};
      my $array2 = $seq{$s2};
      
      my $seq1 = $$array1[6];
      my $seq2 = $$array2[6];

      unless ($nomap) {
	$array1->[7] = make_map($seq1,$seq2,$map);
	$array2->[7] = make_map($seq2,$seq1,$map);
      }

      make_hit($s1 => $array1, $s2 => $array2);
    }
  }
}  

# Make coordinate maps at the specified resolution
sub make_map {
  my ($s1,$s2,$map) = @_;
  $s1 && $s2 || return {};
  unless (UNIVERSAL::can($s1,'isa')) {
    warn "WTF? $s1 $s2\n" and next;
  }
  
  column_to_residue_number($s1,$s2);
  my $coord = nearest($mapres,$s1->start);
  $coord += $mapres if $coord < $s1->start;
  my @map;
  
  my $reverse = $s1->strand ne $s2->strand;
 
  # have to get the column number from residue position, then
  # the matching residue num from the column number 
  while ($coord < $s1->end) {
    my $col     = column_from_residue_number($s1,$coord);
    my $coord2  = residue_from_column_number($s2,$col) if $col;
    push @map, ($coord,$coord2) if $coord2;
    $coord += $mapres;
  }
  return {@map};
}

sub column_to_residue_number {
  for my $seq (@_) {
    my $str = $seq->seq;
    my $id  = $seq->id;
    next if $map{$id};
    my $rev = $seq->strand < 0;
    my $res = $rev ? $seq->end - 1 : $seq->start + 1;
    my @cols = split '', $str;
    
    my $pos;
    my $col;
    for my $chr (@cols) {
      unless ($chr eq '-') {
	$rev ? $res-- : $res++;
      }

      $col++;
      $map{$id}{col}{$col} = $res;
      $map{$id}{res}{$res} ||= $col;
    }
  }
}

sub column_from_residue_number {
  my ($seq, $res) = @_;
  my $id = $seq->id;
  return $map{$id}{res}{$res};  
}

sub residue_from_column_number {
  my ($seq, $col) = @_;
  my $id = $seq->id;
  print"WTF? $seq $id $col\n" unless $id &&$col;
  return $map{$id}{col}{$col};
}

sub make_hit {
  my ($s1,$aln1,$s2,$aln2,$fh) = @_;
  my $rightnum = $nomap ? 7 : 8;
  die "wrong number of keys @$aln1" unless @$aln1 == $rightnum;
  die "wrong number of keys @$aln2" unless @$aln2 == $rightnum;
  my $map1 = $aln1->[7] || {};
  my $map2 = $aln2->[7] || {};

  # not using these yet
  my ($cigar1,$cigar2) = qw/. ./;

  load_alignment($s1,@{$aln1}[0,2..4],$cigar1,$s2,@{$aln2}[0,2..4],$cigar2,$map1,$map2);
}

sub map_pairwise {
  my @out;
  for my $i (0..$#_) {
    for my $j ($i+1..$#_) {
      push @out, [$_[$i], $_[$j]];
    }
  }
  return @out;
}

# stolen from Math::Round
sub nearest {
  my $targ = abs(shift);
  my $half = 0.50000000000008;
  my @res  = map {
    if ($_ >= 0) { $targ * int(($_ + $half * $targ) / $targ); }
    else { $targ * POSIX::ceil(($_ - $half * $targ) / $targ); }
  } @_;

  return (wantarray) ? @res : $res[0];
}

sub check_name_format {
  my $name = shift;
  my $seq  = shift;

  my $nogood = <<"  END";

  Problem with sequence name $name
  The Sequence name needs to contain some meta-data to identify
      the species, reference sequence and coordinates.


  Supported Sequence Name formats:

  # Downloaded via Ensembl Compara API
  species/seqid/start-end
      where species   = name of species, genome, strain, etc (string with no '-' characters)
            sequence  = name of reference sequence (string with no '/' characters)
            start     = start coordinate of the alignment relative to the reference sequence (integer)
            end       = end coordinate of the alignment relative to the reference sequence   (integer)
            in this format, the strand is + unless end < start

  # Legacy gbrowse_syn format
  species-seqid(strand)/start..end
      where (strand)  = orientation of the alignment (relative to the reference sequence; + or -)

  Examples:
    homo_sapiens/1/100000-200000
    c_elegans-I(+)/1..2300

  END
  ;

  die $nogood unless $name =~ /^([^-]+)-([^\(]+)\(([+-])\)$/  # Why did I do this?
              ||     $name =~ m!^([^/]+)/([^/]+)!;            # from Bio::LocatableSeq
  die $nogood unless $seq->start && $seq->end;
  return ($1,$2,$3);
}


sub prepare_database {
  my ($dns,$user,$pass) = @_;
  $dsn = "dbi:mysql:$dsn" unless $dsn =~ /^dbi/;

  my $dbh = DBI->connect($dsn, $user, $pass) or die DBI->errstr;

  if ($create) {
    $dbh->do('drop table if exists alignments') or die DBI->errstr;
    $dbh->do('drop table if exists map') or die DBI->errstr;
  
    $dbh->do(<<"    END;") or die DBI->errstr;
    create table alignments (
			 hit_id    int not null auto_increment,
			 hit_name  varchar(100) not null,
			 src1      varchar(100) not null,
			 ref1      varchar(100) not null,
			 start1    int not null,
			 end1      int not null,
			 strand1   enum('+','-') not null,
			 seq1      mediumtext,
			 bin       double(20,6) not null,
			 src2      varchar(100) not null,
			 ref2      varchar(100) not null,
			 start2    int not null,
			 end2      int not null,
			 strand2   enum('+','-') not null,
			 seq2      mediumtext,
			 primary key(hit_id),
			 index(src1,ref1,bin,start1,end1)
			 )
    END;

    $dbh->do(<<"    END;") or die DBI->errstr;
    create table map (
	  	  map_id int not null auto_increment,
		  hit_name  varchar(100) not null,
                  src1 varchar(100),
		  pos1 int not null,
		  pos2 int not null,
		  primary key(map_id),
		  index(hit_name)
		  )
    END;

    $dbh->do('alter table alignments disable keys');
    $dbh->do('alter table map');
  }
  
  my $hit_insert = $dbh->prepare(<<END) or die $dbh->errstr;
insert into alignments (hit_name,src1,ref1,start1,end1,strand1,seq1,bin,src2,ref2,start2,end2,strand2,seq2) 
values(?,?,?,?,?,?,?,?,?,?,?,?,?,?)
END
;

  my $map_insert = $dbh->prepare('insert into map (hit_name,src1,pos1,pos2) values(?,?,?,?)');

  return $dbh, $hit_insert, $map_insert;
}

sub load_alignment {
  my ($src1,$ref1,$start1,$end1,$strand1,$seq1,$src2,$ref2,$start2,$end2,$strand2,$seq2,$map1,$map2) = @_;

  # not using the cigar strings right now
  ($seq1,$seq2) = ('','');  

  $map1 ||= {};
  $map2 ||= {};

  my %map1 = %$map1;
  my %map2 = %$map2;

  # standardize hit names
  my $hit1 = 'H' . sprintf('%010s', ++$hit_idx);
  my $bin1 = scalar bin($start1,$end1,MINBIN);
  my $bin2 = scalar bin($start2,$end2,MINBIN);
  my $hit2 = "${hit1}r";
  
  # force ref strand to always be positive and invert target strand as required
  invert(\$strand1,\$strand2) if $strand1 eq '-'; 

  $sth_hit->execute($hit1,$src1,$ref1,$start1,$end1,$strand1,$seq1,$bin1,
		    $src2,$ref2,$start2,$end2,$strand2,$seq2) or warn $sth_hit->errstr;

  unless ($nomap) {
    for my $pos (sort {$a<=>$b} keys %map1) {
      next unless $pos && $map1{$pos};
      $sth_map->execute($hit1,$src1,$pos,$map1{$pos}) or die $sth_map->errstr; 
    }
  }

  # reciprocal hit is also saved to facilitate switching amongst reference sequences
  invert(\$strand1,\$strand2) if $strand2 eq '-';
  
  $sth_hit->execute($hit2,$src2,$ref2,$start2,$end2,$strand2,$seq2,$bin2,
		    $src1,$ref1,$start1,$end1,$strand1,$seq1) or warn $sth_hit->errstr;


  # saving pair-wise coordinate maps -- these are needed for gridlines
  unless ($nomap) {
    for my $pos (sort {$a<=>$b} keys %map2) {
      next unless $pos && $map2{$pos};
      $sth_map->execute($hit1,$src2,$pos,$map2{$pos}) or die $sth_map->errstr;
    }
  }

  print STDERR "Processed pair-wise alignment ".++$pidx."\r" if $verbose;

}

sub done {
  $dbh->do('alter table alignments enable keys');
  print "\nDone!\n\n";
}

sub invert {
  my $strand1 = shift;
  my $strand2 = shift;
  $$strand1 = $$strand1 eq '+' ? '-' : '+';
  $$strand2 = $$strand2 eq '+' ? '-' : '+'; 
}