package BioX::SeqUtils::RandomSequence;
use Class::Std;
use Class::Std::Utils;
use Bio::Tools::CodonTable;
use warnings;
use strict;
use Carp;
use version; our $VERSION = qv('0.9.5');
{
my %type_of :ATTR( :get<type> :set<type> :default<'2'> :init_arg<y> );
my %length_of :ATTR( :get<length> :set<length> :default<'2'> :init_arg<l> );
my %table_of :ATTR( :get<table> :set<table> :default<'1'> :init_arg<s> );
my %a_freq_of :ATTR( :get<a_freq> :set<a_freq> :default<'1'> :init_arg<a> );
my %c_freq_of :ATTR( :get<c_freq> :set<c_freq> :default<'1'> :init_arg<c> );
my %g_freq_of :ATTR( :get<g_freq> :set<g_freq> :default<'1'> :init_arg<g> );
my %t_freq_of :ATTR( :get<t_freq> :set<t_freq> :default<'1'> :init_arg<t> );
my %tmpl_of :ATTR( :get<tmpl> :set<tmpl> :default<''> );
sub START {
my ($self, $ident, $arg_ref) = @_;
$self->_check_type();
$self->_reset_tmpl();
#print "START LENGTH: " . $self->get_length() . "\n";
if ((! defined $arg_ref->{y} ) && defined $arg_ref->{l} ) { $self->set_type('dna'); } # Type not defined but length is = DNA
#print "START TYPE: " . $self->get_type() . "\n";
return;
}
sub rand_seq {
#print " rand_seq() \n";
my ( $self, $arg_ref ) = @_;
my $type = $self->get_type();
#print " TYPE: $type \n";
if ( $type =~ m/^2/ ) { return $self->rand_dna( $arg_ref ); }
elsif ( $type =~ m/^d/ ) { return $self->rand_dna( $arg_ref ); }
elsif ( $type =~ m/^r/ ) { return $self->rand_rna( $arg_ref ); }
elsif ( $type =~ m/^p/ ) { return $self->rand_pro( $arg_ref ); }
elsif ( $type =~ m/^s/ ) { return $self->rand_pro_set( $arg_ref ); }
return;
}
sub rand_dna {
#print " rand_dna() \n";
my ( $self, $arg_ref ) = @_;
# Set parameters redefined by this method
$self->_args_to_attributes( $arg_ref );
# Create random nucleotide sequence of specified length
my $tmpl_length = $self->get_length();
my $nucleotides = $self->randomize_tmpl();
while ( length($nucleotides) < $tmpl_length ) { $nucleotides .= $self->randomize_tmpl(); }
#print " rand_dna()->trim() $tmpl_length \n";
$nucleotides =~ s/^([ACGT]{$tmpl_length}).*$/$1/;
return $nucleotides;
}
sub rand_rna {
#print " rand_rna() \n";
my ( $self, $arg_ref ) = @_;
my $nucleotides = $self->rand_dna( $arg_ref );
$nucleotides =~ s/T/U/g;
return $nucleotides;
}
sub rand_pro {
#print " rand_pro() \n";
my ( $self, $arg_ref ) = @_;
# Set type to protein
$arg_ref->{y} = 'p';
# Set parameters redefined by this method
$self->_args_to_attributes( $arg_ref );
my $seq = $self->rand_dna();
#print " Sequence: $seq \n";
my $codon_table = Bio::Tools::CodonTable->new( -id => $self->get_table() );
if ( $codon_table->name() eq '' ) { print " Error: Codon Table " . $self->get_table() . " not defined.\n"; exit; }
my $protein = $codon_table->translate( $seq );
#print " Protein: $protein \n";
return $protein;
}
sub rand_pro_set {
#print " rand_pro_set() \n";
my ( $self, $arg_ref ) = @_;
#print " LENGTH: " . $arg_ref->{l} . "\n";
# Set parameters redefined by this method
$self->_args_to_attributes( $arg_ref );
#print " LENGTH: " . $self->get_length() . "\n";
my $seq = $self->rand_dna({ l => $self->get_length() + 1 });
my $seq1 = $seq; $seq1 =~ s/.$//; # Remove the last base
my $seq2 = $seq; $seq2 =~ s/^.//; # Remove the first base
#print "seq1: $seq1 \n";
#print "seq2: $seq2 \n";
$self->set_length( $self->get_length() - 1 );
my $codon_table = Bio::Tools::CodonTable->new( -id => $self->get_table() );
if ( $codon_table->name() eq '' ) { print " Error: Codon Table " . $self->get_table() . " not defined.\n"; exit; }
my $protein1 = $codon_table->translate( $seq1 );
my $protein2 = $codon_table->translate( $seq2 );
#print "pro1: $protein1 \n";
#print "pro2: $protein2 \n";
return wantarray() ? ( $protein1, $protein2 ) : [ $protein1, $protein2 ];
}
sub randomize_tmpl {
my ( $self ) = @_;
my $tmpl = $self->get_tmpl();
my @tmpl = @$tmpl;
for ( my $i = @tmpl; $i >= 0; --$i ) {
my $j = int rand ($i + 1);
next if $i == $j;
@tmpl[$i,$j] = @tmpl[$j,$i];
}
no warnings 'all';
return join("", @tmpl);
}
sub _args_to_attributes {
#print "args_to_attributes\n";
my ( $self, $arg_ref ) = @_;
# Type (y)
if ( defined $arg_ref->{y} ) { $self->set_type( $arg_ref->{y} ); }
$self->_check_type();
my $type = $self->get_type();
#print " TYPE: $type\n";
# Length (l)
if ( ($type =~ m/^p/ or $type =~ m/^s/) and defined $arg_ref->{l} ) { $self->set_length( $arg_ref->{l} * 3 ); }
elsif ( $type =~ m/^2/ ) { $self->set_length( 2 ); }
elsif ( defined $arg_ref->{1} ) { $self->set_length( $arg_ref->{l} ); }
# Frequencies (a,c,g,t)
my $freq_changed = 0;
if ( defined $arg_ref->{a} ) { $self->set_a_freq( $arg_ref->{a} ); $freq_changed++; }
if ( defined $arg_ref->{c} ) { $self->set_c_freq( $arg_ref->{c} ); $freq_changed++; }
if ( defined $arg_ref->{g} ) { $self->set_g_freq( $arg_ref->{g} ); $freq_changed++; }
if ( defined $arg_ref->{t} ) { $self->set_t_freq( $arg_ref->{t} ); $freq_changed++; }
# All frequencies must be set together or they are all reset to 1
if ( $freq_changed && $freq_changed < 4 ) { $self->set_a_freq( 1 ); $self->set_c_freq( 1 ); $self->set_g_freq( 1 ); $self->set_t_freq( 1 ); }
$self->_reset_tmpl() if $freq_changed;
return;
}
sub _check_type {
my ( $self, $arg_ref ) = @_;
my $type = $self->get_type();
if ( $type =~ m/^[^2drps]/ ) { print STDERR " Error: Type (y) must be 2, d, r, p, or s.\n"; exit; }
return;
}
sub _reset_tmpl {
my ( $self, $arg_ref ) = @_;
my @tmpl = split( //, 'A' x $self->get_a_freq() .
'C' x $self->get_c_freq() .
'G' x $self->get_g_freq() .
'T' x $self->get_t_freq() );
$self->set_tmpl( \@tmpl );
return;
}
}
1; # Magic true value required at end of module
__END__
=head1 NAME
BioX::SeqUtils::RandomSequence - Creates random DNA, RNA and protein sequences with given nucleotide frequencies.
=head1 VERSION
This document describes BioX::SeqUtils::RandomSequence version 0.9.5
=head1 SYNOPSIS
The randomizer object accepts parameters for sequence length (l), codon table (s), sequence
type (y), and frequencies for each of the nucleotide bases in DNA (a, c, g, t). The defaults
are shown below:
use BioX::SeqUtils::RandomSequence;
my $randomizer = BioX::SeqUtils::RandomSequence->new({ l => 2,
s => 1,
y => "dna",
a => 1,
c => 1,
g => 1,
t => 1 });
print $randomizer->rand_seq(), "\n";
=head1 DESCRIPTION
Create random DNA, RNA and protein sequences.
=head3 NUCLEOTIDE FREQUENCIES
All four frequencies are set to "1" by default ( so that the probablity of each A, C, G, T is
0.25 ). The frequencies should always be positive integers, and you should consider what you
choose. The algorithm works by creating a template with length equal to the sum L of A_freq,
C_freq, G_freq, and T_freq with exactly the numbers of each assigned to those frequencies. The
template is resorted for each L length part of the required sequence (and trimmed to required
length). For example, using the default frequencies, a sequence 100 bases long will have
exactly 25 A, 25 C, 25 G, and 25 T. If you want sequences from a wider distribution, use
four digit (or greater) values for the frequencies. For a sequence length of a few dozen bases,
this example would be broad enough to create repeat
islands: ($A_freq, $C_freq, $G_freq, $T_freq) = (2245, 2755, 2755, 2245).
=head3 NUCLEOTIDE FREQUENCIES UNDERLIE PROTEINS
Protein sequences are translated from random DNA sequence of the necessary length
using the assigned nucleotide frequencies. This module does not allow you to directly
influence the amino acid frequencies. If you need this sort of functionality, please contact
the author.
=head1 METHODS
=over
=item * rand_seq()
After creating a randomizer object, each sequence type can be accessed using the "y" (tYpe)
parameter with rand_seq(). The default type is "2" (for dinucleotide, a length two dna
sequence). The other types are "d" (dna), "r" (rna), "p" (protein), and "s" (protein set).
You can use the same randomizer object to create all types of sequences, by passing the
changing parameters with each call.
my $dinucleotide = $randomizer->rand_seq(); # Default settings
my $nuc_short = $randomizer->rand_seq({ y => 'd', l => 21 }); # Create DNA length 21
my $nuc_long = $randomizer->rand_seq({ l => 2200 }); # Still DNA, now length 2200
my $nuc_richer = $randomizer->rand_seq({ a => 225,
c => 275,
g => 275,
t => 225 }); # Still length 2200, GC richer
my $protein_now = $randomizer->rand_seq({ y => 'p' }); # Still richer GC
my $protein_def = $randomizer->rand_seq({ a => 1 }); # Missing bases resets all freq to 1
my $protein_new = $randomizer->rand_seq({ y => 'p',
s => 3 }); # Use codon table 'Yeast Mitochondrial'
The type parameter only works with rand_seq().
=item * rand_dna()
This method may be used directly to create DNA sequences.
my $dinucleotide = $randomizer->rand_dna();
my $dna = $randomizer->rand_dna({ l => 2200 });
$dna = $randomizer->rand_seq({ l => 200,
a => 225,
c => 275,
g => 275,
t => 225 }); # Larger variance
=item * rand_rna()
This method may be used directly to create RNA sequences.
my $rna = $randomizer->rand_rna({ l => 21 });
$rna = $randomizer->rand_rna({ l => 1000,
a => 225,
c => 275,
g => 275,
t => 225 });
=item * rand_pro()
This method may be used directly to create protein sequences.
A protein of the given length L is created by translating a random DNA sequence of
length L * 3 with the given nucleotide frequencies.
my $protein = $randomizer->rand_pro();
=item * rand_pro_set()
This method may be used directly to create a protein sequence set.
A protein set is correlatable at the DNA level by creating a random
DNA sequence with the given nucleotide frequencies of length L * 3 + 1, removing
the first base for sequence 1 and removing the last base for sequence 2, then
translating them into proteins.
This method uses wantarray(), and will either return a list or list
reference (scalar) depending on the context:
my ($pro1, $pro2) = $randomizer->rand_pro_set();
my $protein_set = $randomizer->rand_pro_set();
=back
=head1 SCRIPTS
The package includes scripts for random dna, rna, dinucleotide, and protein
sequences. The length and frequency parameters should always be integers.
To create a dinucleotide sequence:
./random-dna.pp # Defaults: length 2, all frequencies 1
./random-dna.pp -a250 -c250 -g250 -t250 # Create broader distribution
To create a dna sequence:
./random-dna.pp -l21 # Defaults: all frequencies 1 ( p = .25 )
./random-dna.pp -l2200 -a23 -c27 -g27 -t23 # Enrich GC content with length 2200
To create a rna sequence:
./random-rna.pp -l100
./random-rna.pp -l2200 -a23 -c27 -g27 -t23
To create a protein sequence:
./random-protein.pp # Defaults: length 2, all frequencies .25
./random-protein.pp -l2200 -a23 -c27 -g27 -t23 # Enrich underlying GC content, aa length 2200
To create a protein set (with common DNA shifted by one base):
./random-protein-set.pp # Defaults: length 2, all frequencies .25
./random-protein-set.pp -l2200 -a23 -c27 -g27 -t23 # Enrich underlying GC content
Additionally, a "master script" uses a tYpe parameter for any:
./random-sequence.pp # Type 2 dinucleotide
./random-sequence.pp -yd -l100 # Type d dna
./random-sequence.pp -yr -l100 # Type r rna
./random-sequence.pp -yp -l100 # Type p protein
./random-sequence.pp -ys -l100 # Type s protein set
This module uses Bio::Tools::CodonTable for translations, and the parameter s can be used to
change from the default (1) "Standard":
./random-protein.pp -l2200 -s2 # Non-standard codon table
=head1 CONFIGURATION AND ENVIRONMENT
None.
=head1 DEPENDENCIES
Class::Std;
Class::Std::Utils;
Bio::Tools::CodonTable;
=head1 INCOMPATIBILITIES
None reported.
=head1 BUGS AND LIMITATIONS
No bugs have been reported.
Please report any bugs or feature requests to
C<bug-biox-sequtils-randomsequence@rt.cpan.org>, or through the web interface at
L<http://rt.cpan.org>.
=head1 AUTHOR
Roger A Hall C<< <rogerhall@cpan.org> >>
=head1 LICENSE AND COPYRIGHT
Copyleft (c) 2009, Roger A Hall C<< <rogerhall@cpan.org> >>. All rights reserved.
This module is free software; you can redistribute it and/or
modify it under the same terms as Perl itself. See L<perlartistic>.
=head1 DISCLAIMER OF WARRANTY
BECAUSE THIS SOFTWARE IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
FOR THE SOFTWARE, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
PROVIDE THE SOFTWARE "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER
EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE
ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE SOFTWARE IS WITH
YOU. SHOULD THE SOFTWARE PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL
NECESSARY SERVICING, REPAIR, OR CORRECTION.
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
REDISTRIBUTE THE SOFTWARE AS PERMITTED BY THE ABOVE LICENCE, BE
LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL,
OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE
THE SOFTWARE (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
FAILURE OF THE SOFTWARE TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
SUCH DAMAGES.
=cut
Option a +int frequency of nucleotide A
Option c +int frequency of nucleotide C
Option g +int frequency of nucleotide G
Option l +int length
Option t +int frequency of nucleotide T
Option s +int codon table
Option y 2,d,r,p,s type (dinucleotide, dna, rna, protein, set)