The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package Mock::Populate;
BEGIN {
  $Mock::Populate::AUTHORITY = 'cpan:GENE';
}

# ABSTRACT: Mock data creation

our $VERSION = '0.09';

use strict;
use warnings;

use constant NDATA => 10;
use constant PREC  => 2;
use constant DOF   => 2;
use constant SIZE  => 8;

use Data::SimplePassword;
use Date::Range;
use Date::Simple qw(date today);
use Image::Dot;
use List::Util qw(shuffle);
use Mock::Person;
use Statistics::Distributions;
use Text::Password::Pronounceable;
use Text::Unidecode;
use Time::Local;


sub date_ranger {
    my %args = @_;
    # Set defaults.
    $args{start} ||= '2001-01-01';
    $args{end}   ||= today();
    $args{N}     ||= NDATA;

    # Convert the dates into a range.
    my $date1 = date($args{start});
    my $date2 = date($args{end});
    my $range = Date::Range->new($date1, $date2);

    # Declare the number of days in the range.
    my $offset = 0;

    # Bucket for our result list.
    my @results;

    for(1 .. $args{N}) {
        # Get a random number of days in the range.
        $offset = int(rand $range->length);

        # Save the stringified start date plus the offest.
        my $date = $date1 + $offset;
        push @results, "$date";
    }

    return \@results;
}


sub date_modifier {
    # Get the number of days in the future and the date list.
    my ($offset, @dates) = @_;

    # Bucket for our result list.
    my @results;

    for my $date (@dates) {
        # Cast the current date string as an object.
        my $current = date($date);

        # Get a random number of days in the future.
        my $m = int(rand $offset) + 1;

        # Save the stringified date plus the offest.
        $date = $current + $m;
        push @results, "$date";
    }

    return \@results;
}


sub time_ranger {
    my %args = @_;
    # Set defaults.
    $args{stamp} ||= 1;
    $args{start} ||= '00:00:00';
    $args{end}   ||= '';
    $args{N}     ||= NDATA;

    # Split the :-separated times.
    my @start = split ':', $args{start};
    my @end   = $args{end} ? split(':', $args{end}) : _now();
    #warn "S->E: @start -> @end\n";

    # Compute the number of seconds between start and end.
    my $start_time = timegm(@start[2, 1, 0], (localtime(time))[3, 4, 5]);
    my $end_time   = timegm(@end[2, 1, 0], (localtime(time))[3, 4, 5]);
    my $range = $end_time - $start_time;
    #warn "R: $end_time (@end) - $start_time (@start) = $range\n";

    # Declare the number of seconds.
    my $offset = 0;

    # Bucket for our result list.
    my @results;

    # Generate a time, N times.
    for(1 .. $args{N}) {
        # Get a random number of seconds in the range.
        $offset = int(rand $range);

        # Print the start time plus the offest seconds.
        if ($args{stamp}) {
            # In HH:MM::SS format.
            my $time = scalar localtime($start_time + $offset);
            push @results, (split / /, $time)[3];
        }
        else {
            # As a number of seconds from the "epoc."
            push @results, $start_time + $offset;
        }
    }

    return \@results;
}

sub _now { # Return hour, minute, second.
    return (localtime(time))[2, 1, 0];
}


sub number_ranger {
    my %args = @_;
    # Set defaults.
    $args{start}  ||= 0;
    $args{end}    ||= NDATA;
    $args{prec}   ||= PREC;
    $args{random} ||= 1;
    $args{N}      ||= NDATA;

    # Bucket for our result list.
    my @results;

    # Do we want random numbers?
    if ($args{random}) {
        # Roll!
        for(1 .. $args{N}) {
            # Get our random candidate.
            my $x = rand($args{end});
            # Make sure it is above the start value.
            while ($x < $args{start}) {
                $x = rand($args{end});
            }
            push @results, $x;
        }
    }
    else {
        # Use a simple sequence of integers.
        @results = ($args{start} .. $args{end});
    }

    return \@results;
}


sub name_ranger {
    my %args = @_;
    # Set defaults.
    $args{gender}  ||= 0;
    $args{names}   ||= 2;
    $args{country} ||= 'us';
    $args{N}       ||= NDATA;

    # Bucket for our result list.
    my @results;

    # Roll!
    for my $i (1 .. $args{N}) {
        # Get our random person.
        my $p = '';
        # If gender is 'both' alternate male-female.
        # Or if gender is not 'male' then ...female!
        if (($args{gender} eq 'b' && $i % 2) || $args{gender} eq 'f') {
            $p = Mock::Person::name(sex => 'female', country => $args{country});
        }
        else {
            $p = Mock::Person::name(sex => 'male', country => $args{country});
        }
        # Only use the requested number of names.
        my @names = split / /, $p;
        my $name = '';
        if ($args{names} == 1) {
            push @results, $names[-1];
        }
        elsif ($args{names} == 2) {
            push @results, "@names[0,-1]";
        }
        else {
            push @results, $p;
        }
    }

    return \@results;
}


sub email_modifier {
    my @people = @_;

    # Bucket for our results.
    my @results = ();

    # Generate email addresses if requested.
    my @tld = qw( com net org edu );

    for my $p (@people) {
        # Break up the name.
        my @name = split / /, $p;

        # Turn any unicode characters into something ascii.
        $_ = unidecode($_) for @name;

        # Add an email address for the person.
        my $email = lc($name[0]);
        $email .= '.'. lc($name[-1]) if @name > 1;
        $email .= '@example.' . $tld[rand @tld];
        push @results, $email;
    }

    return \@results;
}


sub distributor {
    my %args = @_;
    # Set defaults.
    $args{type} ||= 'u';
    $args{prec} ||= PREC;
    $args{dof}  ||= DOF;
    $args{N}    ||= NDATA;

    # Separate numerator/denominator for F degs-of-freedm.
    my $e = 1;
    ($args{dof}, $e) = split(/\//, $args{dof}) if $args{type} eq 'f';

    # Bucket for our result list.
    my @results;

    # Roll!
    for(1 .. $args{N}) {
        # Select distribution.
        if ($args{type} eq 'c') {
            # Chi-squared
            push @results, Statistics::Distributions::chisqrdistr($args{dof}, rand);
        }
        elsif ($args{type} eq 's') {
            # Student's T
            push @results, Statistics::Distributions::tdistr($args{dof}, rand);
        }
        elsif ($args{type} eq 'f') {
            # F distribution
            push @results, Statistics::Distributions::fdistr($args{dof}, $e, rand);
        }
        else {
            # Normal
            push @results, Statistics::Distributions::udistr(rand);
        }
    }

    return \@results;
}


sub shuffler {
    # Get the desired number of data-points.
    my $n = defined $_[0] ? shift : 9;
    # Get the items to shuffle.
    my @items = @_ ? @_ : ('a' .. 'j');
    return [ shuffle(@items) ];
}


sub string_ranger {
    my %args = @_;
    # Set defaults.
    $args{length} ||= SIZE;
    $args{type}   ||= 'default';
    $args{N}      ||= NDATA;

    # Declare a pw instance.
    my $sp = Data::SimplePassword->new;

    # Declare the types (lifted directly from rndpassword).
    my $chars = {
        default => [ 0..9, 'a'..'z', 'A'..'Z' ],
        ascii   => [ map { sprintf "%c", $_ } 33 .. 126 ],
        base64  => [ 0..9, 'a'..'z', 'A'..'Z', qw(+ /) ],
        path    => [ 0..9, 'a'..'z', 'A'..'Z', qw(. /) ],
        simple  => [ 0..9, 'a'..'z' ],
        alpha   => [ 'a'..'z' ],
        digit   => [ 0..9 ],
        binary  => [ qw(0 1) ],
        morse   => [ qw(. -) ],
        hex     => [ 0..9, 'a'..'f' ],
        pron    => [],
    };
    # Set the chars based on the given type.
    $sp->chars( @{ $chars->{$args{type}} } );

    # Declare a bucket for our results.
    my @results = ();

    # Roll!
    for(1 .. $args{N}) {
        if ($args{type} eq 'pron') {
            push @results, Text::Password::Pronounceable->generate(
                $args{length}, $args{length});
        }
        else {
            push @results, $sp->make_password($args{length});
        }
    }

    return \@results;
}


sub image_ranger {
    my %args = @_;
    # Set defaults.
    $args{size} ||= SIZE;
    $args{N}    ||= NDATA;

    # Declare a bucket for our results.
    my @results = ();

    # Start with a 1x1 pixel image.
    my $img = dot_PNG_RGB(0, 0, 0);

    # XXX This is naive and sad:
    # Pull-apart the image data.
    (my $head = $img) =~ s/^(.*?IDAT).*$/$1/ms;
    (my $tail = $img) =~ s/^.*?(IEND.*)$/$1/ms;
    $img =~ s/^.*?IDAT(.*?)IEND.*$/$1/ms;

    for (1 .. $args{N}) {
        # Increase the byte size (not dimension).
        my $i = $head . ($img x int(rand $args{size})) . $tail;
        #warn "L: ",length($i), "\n";

        # Save the result.
        push @results, $i;
    }

    return \@results;
}


sub collate {
    # Accept any number of columns.
    my @columns = @_;

    # Make a copy of the columns to peel off.
    my @lists = @columns;

    # Declare the bucket for our arrayrefs.
    my @results = ();

    # Add each list item to rows of collated.
    for my $list (@columns) {
        for my $i (0 .. @$list - 1) {
            push @{ $results[$i] }, $list->[$i];
        }
    }

    return \@results;
}

1;

__END__

=pod

=encoding UTF-8

=head1 NAME

Mock::Populate - Mock data creation

=head1 VERSION

version 0.09

=head1 SYNOPSIS

  use Mock::Populate;
  # * Call each function below with Mock::Populate::foo(...
  $ids    = number_ranger(start => 1, end => 1001, prec => 0, random => 0, N => $n);
  $money  = number_ranger(start => 1000, end => 5000, prec => 2, random => 1, N => $n);
  $create = date_ranger(start => '1900-01-01', end => '2020-12-31', N => $n);
  $modify = date_modifier($offset, @$create);
  $times  = time_ranger(stamp => 1, start => '01:02:03' end =>'23:59:59', N => $n);
  $people = name_ranger(gender => 'b', names => 2, country => 'us', N => $n);
  $email  = email_ranger(@$people);
  $shuff  = shuffler($n, qw(foo bar baz goo ber buz));
  $stats  = distributor(type => 'u', prec => 4, dof => 2, N => $n);
  $string = string_ranger(length => 32, type => 'base64', N => $n);
  $imgs   = image_ranger(size => 10, N => $n);  # *size is density, not pixel dimension
  $coll   = collate($ids, $people, $email, $create, $times, $modify, $times);

=head1 DESCRIPTION

This is a set of functions for mock data creation.

No functions are exported, so use the entire C<Mock::Populate::*> namespace when
calling each.

Each function produces a list of elements that can be used as database columns.
The handy C<collate()> function takes these columns and returns a list of
(arrayref) rows.  This can then be processed into CSV, JSON, etc.  It can also
be directly inserted into your favorite database, with your favorite perl ORM.

=head1 NAME

Mock::Populate - Mock data creation

=head1 FUNCTIONS

=head2 date_ranger()

  $results = date_ranger(start => $start, end => $end, N => $n);

Return a list of N random dates within a range.  The start and end dates and
desired number of data-points arguments are all optional.  The defaults are:

  start: 2000-01-01
  end: today (computed if not given)
  N: 10

The dates must be given as B<YYYY-MM-DD> strings.

=head2 date_modifier()

  $modify = date_modifier($offset, @$dates);

Returns a new list of random future dates, based on the offset, and respective
to each given date.

=head2 time_ranger()

  $results = time_ranger(
    stamp => $stamp, start => $start, end => $end,
    N => $n);

Return a list of N random times within a range.  The start and end times and
desired number of data-points arguments are all optional.  The defaults are:

  stamp: 1 (boolean)
  start: 00-00-00
  end: now (computed if not given)
  N: 10

The times must be given as B<HH-MM-SS> strings.

=head2 number_ranger()

  $results = number_ranger(
    start => $start, end => $end,
    prec => $prec, random => $random,
    N => $n)

Return a list of N random numbers within a range.  The start, end, precision,
whether we want random or sequential numbers and desired number of data-points
arguments are all optional.  The defaults are:

  start: 0
  end: 9
  precision: 2
  random: 1
  N: 10

=head2 name_ranger()

  $results = name_ranger(
    gender => $gender, names => $names, country => $country,
    N => $n)

Return a list of N random person names.  The gender, number of names and
desired number of data-points arguments are all optional.  The defaults are:

  gender: b (options: both, female, male)
  names: 2 (first, last)
  country: us
  N: 10

=head2 email_modifier()

  $results = email_modifier(@people)
  # first.last@example.{com,net,org,edu}

Return a list of N email addresses based on a list of given names.

=head2 distributor()

  $results = distributor(type => $type, prec => $prec, dof => $dof, N => $n)

Return a list of N distribution values.  The type, precision, degrees-of-freedom
and desired number of data-points arguments are optional.  The defaults are:

  type: u (normal)
  precision: 2
  degrees-of-freedom: 2
  N: 10

=head3 Types

This function uses single letter identifiers:

  u: Normal distribution (default)
  c: Chi-squared distribution
  s: Student's T distribution
  f: F distribution

=head3 Degrees of freedom

Given the type, this function accepts the following:

  c: A single integer
  s: A single integer
  f: A fraction string of the form 'N/D' (default 2/1)

=head2 shuffler()

  $results = shuffler($n, @items)

Return a shuffled list of B<$n> items.  The items and number of data-points
arguments are optional.  The defaults are:

  n: 10
  items: a b c d e f g h i j

=head2 string_ranger()

  $results = string_ranger(type => $type, length => $length, N => $n)

Return a list of N strings.  The strings and number of data-points
arguments are optional.  The defaults are:

  type: default
  length: 8
  N: 10

* This function is nearly identical to the L<Data::SimplePassword>
C<rndpassword> program, but allows you to generate a finite number of results.

=head3 Types

  Types     Output sample     Character set
  ___________________________________________________
  default   0xaVbi3O2Lz8E69s  0..9 a..z A..Z
  ascii     n:.T<Gr!,e*[k=eu  visible ascii
  base64    PC2gb5/8+fBDuw+d  0..9 a..z A..Z / +
  path      PC2gb5/8.fBDuw.d  0..9 a..z A..Z / .
  simple    xek4imbjcmctsxd3  0..9 a..z
  hex       89504e470d0a1a0a  0..9 a..f
  alpha     femvifzscyvvlwvn  a..z
  pron      werbucedicaremoz  a..z but pronounceable!
  digit     7563919623282657  0..9
  binary    1001011110000101
  morse     -.--...-.--.-..-

=head2 image_ranger()

  $results = image_ranger(size => $size, N => $n)

Return a list of N 1x1 pixel images of varying byte sizes (not image dimension).
The byte size and number of data-points are both optional.

The defaults are:

  N: 10
  size: 8

=head2 collate()

  $rows = collate(@columns)

Return a list of lists representing a 2D table of rows, given the lists
provided, with each member added to a row, respectively.

=head1 SEE ALSO

L<Data::SimplePassword>

L<Date::Range>

L<Date::Simple>

L<Image::Dot>

L<List::Util>

L<Mock::Person>

L<Statistics::Distributions>

L<Text::Password::Pronounceable>

L<Text::Unidecode>

L<Time::Local>

L<Data::Random> does nearly the exact same thing. Whoops!

=head1 TODO

Implement dirty-data randomizing.

  unexpected formats: iso-8859-1, utf-16, windows codepage,
  BOM (byte order marker),
  broken unicode,
  garbled binary,
  \r and \n variations,
  commas or $ in currencies ("format fuckups"),
  bad JSON,
  broken XML,
  bad ' and " in CSV,
  statistical outliers,
  time-series drops and spikes,
  duplicate data,
  missing data,
  truncated data,

=head1 AUTHOR

Gene Boggs <gene@cpan.org>

=head1 COPYRIGHT AND LICENSE

This software is copyright (c) 2014 by Gene Boggs.

This is free software; you can redistribute it and/or modify it under
the same terms as the Perl 5 programming language system itself.

=cut