The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/perl
use strict;
use vars qw($USAGE);

# random sequence generator #
# -c=1 option will cause prot sequences to be built 
# using vertebrate aa frequencies, 
# with option -a putting a 1st methionine residues on. Frequencies are
# calculated from the NCBI human RefSeq protein sequences 
# -c and -a only affect protein sequences
# -a only works in conjunction with -c
# -n number of random sequences, default = 1

use Bio::PrimarySeq;
use Bio::SeqIO;
use Getopt::Long;
my ($length,$type,$filename,$comp,$met);

$USAGE = 'usage: generate_random_seq.pl --length=1000 --type=dna --filename=/tmp/test.seq --number=50';

my %alphabets = ( 'dna' => [qw(C A G T)],
                  'rna' => [qw(C A G U)],
                  'prot'=> [qw( A C D E F G H I K L M N P Q R S T V W Y)],
              );
# make random num from 1-10000. numbers in this array reflect the frequency,
# e.g., a random number from 1.744 = A, 745-991 = C etc;
my @aa_frequencies = qw(744 991 1398 2017 2378 3104 3349 3726 4239 5273 5443 
                        5749 6410 6848 7455 8263 8760 9340 9488 9713 10000);
my $number = 1;

&GetOptions
  (
   'l|length:s'          => \$length,
   't|type|m|alphabet:s' => \$type,
   'f|file|filename:s'   => \$filename,
   'c|composition:s'     => \$comp,
   'a|methionine:s'      => \$met,
   'n|number:s'          => \$number
  );

assert ( $type && defined ($alphabets{lc $type}),
         $USAGE);
assert ( $length && $length =~ /^\d+$/, $USAGE );

foreach my $num (1..$number) {
   my $sequence = "";
   my $alphabet = $alphabets{lc $type};
   my $sspace = scalar @$alphabet;
   if (!$comp || $type ne 'prot') {
      foreach ( 1..$length ) {
	 $sequence .= $alphabet->[ int rand($sspace) ];
      }
   }elsif ($type eq 'prot') {
      $sequence = build_seq($length, \@aa_frequencies);
   }
   my $seq =  Bio::PrimarySeq->new(-seq        => $sequence, 
				   -display_id => 'randomseq'.$num);
   my %args = (-format => 'fasta');
   if( $filename ) { $args{-file} = ">>$filename" }
   my $seqio = Bio::SeqIO->new(%args);
   $seqio->write_seq($seq);
}

sub assert { die $_[1] unless( $_[0] ); }

sub build_seq {
   #takes seqlen and ref to frequency data as parameters
   my ($len, $pf)  = @_;
   my $str = ($met)?'M':'';
   my $i = ($met)?1:0;
   for ($i..$len-1) {
      my $aa = int(rand (10000)) ;
      my $j = 0;
      while ($pf->[$j] < $aa && $j <19) {
	 $j++;
      }
      $str .= $alphabets{'prot'}[$j];
   }
   print "str is $str\n";
   return $str;
}