=head1 NAME

InSilicoSpectro::InSilico::RetentionTimer::Hodges Prediction of peptide retention time method by sum of amino acid coefficients.

=head1 SYNOPSIS

  use InSilicoSpectro::InSilico::RetentionTimer::Hodges;
  use InSilicoSpectro::InSilico::ExpCalibrator;

  # create the retention time predictor and select the current coefficients
  my $rt = InSilicoSpectro::InSilico::RetentionTimer::Hodges->new(current=>'Guo86');

  # make the predictor to learn
  $rt->learn( data=>{expseqs=>['ELGFQG','HPGDFGADAQAAMSK','LSSPATLNSR','RFIK'],
              exptimes=>[1314,1194,1152,1500],
              expmodif=>['::Oxidation_M::::','::::::::::::::::',':::::']});

  # learn also the correction for peptide length
  $rt->learn_lc( expseqs=>['LFTGHPETLEK','HPGDFGADAQAAMSK','LSSPATLNSR',],
                 exptimes=>[1224,1152,1500], );

  # predict retention time for a given peptide
  $rt->predict( peptide=>'ACFGDMKWVTFISLLRPLLFSSAYSRGVFRRDTHKSEIAHRFKDLGE',
                modification=>' ::::::::::::::::::::::::::::::::::Oxidation_M:::::::::::::');

  # filter current data
  $rt->filter( filter=>10 );

  # save current coefficients
  $rt->write_xml( confile=>$file,current=>'Exp00' );

  # retrieve previously saved coefficients
  $rt->read_xml( current=>'Exp01' );

  # assigns a calibrator to the predictor
  $ec=InSilicoSpectro::InSilico::ExpCalibrator->new( fitting=>'spline' );

  # fits the calibrator from experimental values
  $rt->calibrate( data=>{calseqs=>['ELGFQG','HPGDFGADAQAAMSK','LSSPATLNSR','RFIK'],
                  caltimes=>[1314,1194,1152,1500],
                  calmodifs=>['::Oxidation_M::::','::::::::::::::::',':::::']},
                 calibrator=>$ec );

  # save current calibrator
  $rt->write_cal( calfile=>$file );

  # retrieve previously saved calibrator
  $rt->read_cal ( calfile=>$file );

=head1 DESCRIPTION

Prediction of reversed-phase HPLC retention time for peptides using the sum of retention coefficients for every amino acid. The coefficients can be chosen from a list of selected precomputed values from the literature or can be learned by means of a multilinear regression fitted from experimental data.

A correction factor for polypeptide chain length is also available.

=head1 METHODS

=head3 my $rt=InSilicoSpectro::InSilico::RetentionTimer::Hodges->new( %h )

$h contains a hash.

=head3 $rt->getAuthorList()

REturns an array of authors name with available parameters values in the config file

=head3 $rt->learn( data=>{expseqs=>\@seqs,exptimes=>\@times,expmodif=>\@modifs} );

Learn the coefficients from experimental data.

=head3 $rt->learn_lc( data=>{expseqs=>\@seqs,exptimes=>\@times,expmodif=>\@modifs}  );

Learn correction factor for polypeptide chain length.

=head3 $rt->predict( peptide=>$str );

Predict retention time for the peptide.

=head3 $rt->predictor( peptide=>$str );

Same as predict() but without the calibrator's experimental fitting.

=head3 $rt->calibrate( calseqs=>\@str, caltimes=>\@val,fitting=>$str );

Train the predictor with experimental data and the chosen fitting method.

=over 4

=item fitting=>'linear'|'spline'

Method used for fitting.

=back

=head3 $rc->filter( filter=>$pc,error=>$str )

Filter experimental data in $rc->{data} by a cutting threshold of relative prediction error of $pc (in %).

=over 4

=item error=>'relative'|'absolute'

Type of error for filtering.

=back

=head3 $rt->writexml( confile=>$file );

Write coefficients.

=head3 $rt->readxml( current=>$str );

Retrieve saved coefficients and t0 labelled as in "current".

=head3 $rt->delete_coef( current=>$str );

Delete permanently from the current file the list of coefficients identified by $str.

=head3 @list=$rt->list_coef( $str );

Return a list of currently available sets of coefficients

=head3 @list=$rt->list_coef( );

List available coefficients in the current file.

=head3 %coef=$rt->get_coef( );

Return a hash with the current coefficients.

=head3 $value=$rt->set_t0();

Set the value of delay.

=head3 $value=$rt->get_t0();

Get the current value of delay.

=head3 $rt->calibrate( data=>{calseqs=>\@seqs,caltimes=>\@times,calmodifs=>\@modifs},calibrator=>$ec );

Calibrate the predictor with experimental data and saves it in $rt->{calibrator}.

=over 4

=item calibrator=>$ec

Reference to a InSilicoSpectro::InSilico::ExpCalibrator class instance.

=back

=head3 $rt->write_cal( calfile=>$file );

Save current calibrator.

=head3 $rt->read_cal ( calfile=>$file );

Retrieve a previously saved calibrator.

=head3 $rt->set( $name );

Set an instance parameter.

=head3 $rt->get( $name );

Get an instance parameter.

=head1 EXAMPLES

see InSilicoSpectro/t/InSilico/testHodges.pl script

=head1 SEE ALSO

InSilicoSpectro::InSilico::RetentionTimer

InSilicoSpectro::InSilico::ExpCalibrator

Guo D, Mant CT, Taneja AK, Parker JMR, Hodges RS. "Prediction of peptide retention times in reversed-phase high-performance liquid chromatography I. Determination of retention coefficients of amino acid residues of model synthetic peptides," J Chromatogr. 1986; 359:499-518.

Mant CT, Zhou NE, Hodges RS. "Correlation of protein retention times in reversed-phase chromatography with polypeptide chain length and hydrophobicity," J Chromatogr. 1989; 476:363-75.

=head1 COPYRIGHT

Copyright (C) 2004-2005  Geneva Bioinformatics www.genebio.com

This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

=head1 AUTHORS

Pablo Carbonell, Alexandre Masselot, www.genebio.com

=cut

use strict;

package InSilicoSpectro::InSilico::RetentionTimer::Hodges;
require Exporter;
use Carp;

use InSilicoSpectro::InSilico::RetentionTimer;
use InSilicoSpectro::InSilico::ExpCalibrator;
use File::Basename;

our (@ISA, @EXPORT, @EXPORT_OK);
@ISA = qw(Exporter InSilicoSpectro::InSilico::RetentionTimer);
@EXPORT = qw(&getAuthorList);
@EXPORT_OK = ();

our $defConfigFile=dirname(__FILE__).'/'.'hodges_coef.xml';

sub new{
  my ($pkg,%h)=@_;# pkg: name of the module; h: hash with the rest of parameters
  my $rt=$pkg->SUPER::new;# Create a reference

# Setting of properties
  $rt->set('confile', getDefaultConfigFile());# init file


  $rt->set('current','Guo86');# Select precomputed values
  $rt->set('data',{%{$rt->get('data')},expmodif=>[]});
  $rt->set('overwrite',0);# Overwrite old values when learning?
  $rt->set('modif',0); # Use modifications as dummy variables?
  $rt->set('comments','');

  $rt->set('length_correction',0);# If set, calculate length correction

  foreach (keys %h){ $rt->set($_, $h{$_}) }

  $rt->read_xml();# Read the retention times coefficients into a Hash
  $rt->set('confile','-'); # default file

  if (scalar @{$rt->{data}{expseqs}}) {
    $rt->learn();
  }

  return $rt;
}

#------------------------------- accessors

sub getDefaultConfigFile{
  return $defConfigFile;
}

sub getAuthorList{
  my $this=shift;
  my @tmp;

  my $twig=XML::Twig->new(twig_handlers =>{values => 
					   sub {push @tmp, $_->atts->{author}}
					  }
 );
  $twig->parsefile($this->{confile}||getDefaultConfigFile); # build it
  $twig->purge; # purge it
  return @tmp;
}


# -------------------------------   predictor

sub predict {
  my ($this,%h)=@_;
  foreach (keys %h){ $this->set($_, $h{$_}) }

  return $this->SUPER::predict($this->predictor);

}

sub predictor {

  my ($this,%h)=@_;
  foreach (keys %h){ $this->set($_, $h{$_}) }

  my %counter;
  my %coeff;
  if (exists($this->{coeff}{$this->{current}})) {
    %coeff=%{$this->{coeff}{$this->{current}}};
  } else {
    croak "Set of coefficients $this->{current} not defined";
    }

  foreach (split '',$this->{peptide}) {
    if (exists($coeff{Rc}{$_})) {
      $counter{$_}++;
    } else {
      carp "Coefficient for symbol $_ not defined. Discarding";
      }
  }
  my $R=$coeff{t0};
  foreach (keys %counter) {
    $R+=$counter{$_}*$coeff{Rc}{$_};
  }

 if ($this->{modif}) {
   # Count modifications for peptide
   foreach (split ':',$this->{modification}) {
     if (exists $coeff{Rc}{"~$_"}) { $R+=$coeff{Rc}{"~$_"} if /(.+)/ } }
 }

  if ($this->{length_correction}) {
    if (length($this->{peptide})>10) {
      $R-=$coeff{lc}{slope}*$R*log(length($this->{peptide}))
	+$coeff{lc}{intercept};
    }
  }

  return($R);
}


# -------------------------------   learn

sub learn {

  my ($this,%h)=@_;
  foreach (keys %h){ $this->set($_, $h{$_}) }

  if ((!exists $this->{coeff}{$this->{current}}) or ($this->{overwrite})) {

    my (@theta,$reg);
    my %coeff;
    my @expdata=@{$this->{data}{expseqs}};# Sequences
    my @prdata=@{$this->{data}{exptimes}};# Retention times
    my @expmodif=@{$this->{data}{expmodif}};
    my @aas=split '','ACDEFGHIKLMNPQRSTVWY';
    my %listmodif;

    unshift (@aas,'t0');

    if ($this->{modif}) { 
      foreach (@expmodif) { foreach (split ':') { $listmodif{"~$_"}++ if /(.+)/ } }
      foreach (sort keys %listmodif) { push(@aas,$_) }
	}

    if ((scalar @aas)>(scalar @expdata)) {
      carp "Too few data points for learning";
    } else {
      $reg = Statistics::Regression->new(scalar @aas, "multiple regression",\@aas );

    # Add data points
      for (my $k=0;$k<(scalar @expdata);$k++) {
	my @vector=count_aa($expdata[$k]);
	if ($this->{modif}) {
	  # Count modifications for peptide
	  my %modif;
	  foreach (sort keys %listmodif) {$modif{$_}=0};
	  foreach (split ':',$expmodif[$k]) { $modif{"~$_"}++ if /(.+)/ };
	  foreach (sort keys %listmodif) { push(@vector,$modif{$_}) };
	}
	unshift(@vector,1); #Intercept
	$reg->include($prdata[$k],\@vector);# Add point
      }

      @theta = $reg->theta;
      $coeff{t0}=shift(@theta);

      shift(@aas);
      $coeff{Rc}= {};
      for (my $k=0;$k<(scalar @aas);$k++) {# Vector of data
	$coeff{Rc}{$aas[$k]}=$theta[$k];
      }
      $coeff{comments}=$this->{comments};

      $this->{coeff}{$this->{current}}=\%coeff;
      return 1;
    }
  }
  return 0;
}

sub learn_lc { # learn length correction

  my ($this,%h)=@_;
  foreach (keys %h){ $this->set($_, $h{$_}) }

  my $reg;

  my $pcoeff=$this->{coeff}{$this->{current}};
  my @expdata=@{$this->{data}{expseqs}};# Sequences
  my @prdata=@{$this->{data}{exptimes}};# Retention times
  my @expmodif=@{$this->{data}{expmodif}};

  $reg = Statistics::Regression->new(2, "linear regression",['b','m']);

  # Add data points
  $this->{length_correction}=0;
  for (my $k=0;$k<(scalar @expdata);$k++) {
    if (length($expdata[$k])>10) {
      $reg->include( $this->predictor(peptide=>$expdata[$k],modification=>$expmodif[$k]
				     )-$prdata[$k], 
		[1,$this->predictor(peptide=>$expdata[$k],modification=>$expmodif[$k]
				   )*log(length($expdata[$k]))] ); 
    }
  }
  if ($reg->theta) {
    $this->{length_correction}=1;
    (${$pcoeff}{lc}{intercept},${$pcoeff}{lc}{slope})=$reg->theta;
  }
}

# -------------------------------   calibrate

sub calibrate {

  my ($this,%h)=@_;
  foreach (keys %h) { $this->set($_, $h{$_}) }

  my (@prdata);
  my $k;

  for ($k=0;$k<scalar(@{$this->{data}{calseqs}});$k++) {
     push(@prdata,$this->predictor(peptide=>${$this->{data}{calseqs}}[$k],
				   modification=>${$this->{data}{calmodifs}}[$k]));# Assign an index to each prediction
  }

  $this->set('data',{prdata=>\@prdata});
  $this->SUPER::calibrate;

}

# -------------------------------   Filter data

sub filter {

  my ($this,%h)=@_;
  foreach (keys %h){ $this->set($_, $h{$_}) }

  my @error;

  for(my $m=0;$m<=$#{$this->{data}{expseqs}};$m++) {
     push(@error,(abs($this->predict(peptide=>${$this->{data}{expseqs}}[$m],
				     modification=>${$this->{data}{expmodif}}[$m])
		      -${$this->{data}{exptimes}}[$m])));
 }
 return $this->SUPER::filter(\@error);
}

# -----------------------------  restore / delete Rc

# delete coefficients
sub delete_coef {

  my ($this,$name)=@_;

  if ($name ne $this->{current}) { delete $this->{coeff}{$name} }
}

# list available set of coefficients
sub list_coef {

  my ($this)=@_;

  return (keys %{$this->{coeff}});
}

sub get_coef {

  my ($this)=@_;

  return %{$this->{coeff}{$this->{current}}{Rc}};
}

# -----------------------------  set / get t0

sub set_t0 {

  my ($this,$value)=@_;

  $this->{coeff}{$this->{current}}{t0}=$value;
}

sub get_t0 {
  my ($this)=@_;
  return $this->{coeff}{$this->{current}}{t0};
}

# -------------------------------   write / read xml file with Rc data

sub write_xml {

  my ($this,%h)=@_;
  foreach (keys %h) { $this->set($_, $h{$_}) }

  my $str;

  $str="<coefficients>"."\n";
  foreach my $author (keys %{$this->{coeff}}) {
     $str.="\t".'<values author="'.$author.'">'."\n";
     foreach my $item (sort keys %{$this->{coeff}{$author}}) {
       if (ref $this->{coeff}{$author}{$item}) {
	 foreach (sort keys %{$this->{coeff}{$author}{$item}}) {
	   if ( $this->{coeff}{$author}{$item}{$_}=~/^([+-]?)(?=\d|\.\d)\d*(\.\d*)?([Ee]([+-]?\d+))?$/ ) {
	   $str.="\t\t"."<$item data=".'"'.$_.'">'.sprintf("%.3e",$this->{coeff}{$author}{$item}{$_})."</$item>"."\n";
	 } else {
	   $str.="\t\t"."<$item data=".'"'.$_.'">'.$this->{coeff}{$author}{$item}{$_}."</$item>"."\n";
	   }
	 }
       } else {
	 if ( $this->{coeff}{$author}{$item}=~/^([+-]?)(?=\d|\.\d)\d*(\.\d*)?([Ee]([+-]?\d+))?$/ )  {
	 $str.="\t\t"."<$item>".sprintf("%.3e",$this->{coeff}{$author}{$item})."</$item>"."\n";
       } else {
	 $str.="\t\t"."<$item>".$this->{coeff}{$author}{$item}."</$item>"."\n";
       }
       }
     }
     $str.="\t"."</values>"."\n";
   }
   $str.="</coefficients>";

  if (open(XMLFILE,'>'.$this->{confile})) {
    print XMLFILE $str;
    close XMLFILE;
  } else {
    carp "Bad configuration file";
  }
}


sub read_xml {

  my  ($this,%h)=@_;

  foreach (keys %h){ $this->set($_, $h{$_}) }

  my $twig=XML::Twig->new(twig_handlers =>{values => sub {
			  foreach my $baby ($_->children) {
			    if (%{$baby->atts}) {
			      foreach my $item (values %{$baby->atts}){
				$this->{coeff}{${$_->atts}{author}}{$baby->gi}{$item}=$baby->text;		
			      }
			    } else {
			      $this->{coeff}{${$_->atts}{author}}{$baby->gi}=$baby->text;
			    } } },} );
  $twig->parsefile($this->{confile}); # build it
  $twig->purge; # purge it

}


# -------------------------------   misc
return 1;