The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package SQL::Type::Guess;
use strict;
use vars qw($VERSION);
$VERSION= '0.03';

=head1 NAME

SQL::Type::Guess - guess an appropriate column type for a set of data

=head1 SYNOPSIS

    my @data=(
      { fool => 1, when => '20140401', greeting => 'Hello', value => '1.05'  },
      { fool => 0, when => '20140402', greeting => 'World', value => '99.05' },
      { fool => 0, when => '20140402', greeting => 'World', value => '9.005' },
    );

    my $g= SQL::Type::Guess->new();
    $g->guess( @data );

    print $g->as_sql( table => 'test' );
    # create table test (
    #    "fool" decimal(1,0),
    #    "greeting" varchar(5),
    #    "value" decimal(5,3),
    #    "when" date
    # )

=cut

=head1 METHODS

=head2 C<< SQL:::Type::Guess->new( %OPTIONS ) >>

  my $g= SQL::Type::Guess->new();

Creates a new C<SQL::Type::Guess> instance. The following options are
supported:

=over 4

=item B<column_type>

Hashref of already known column types.

=item B<column_map>

Hashref mapping the combinations SQL type names
to the resulting type name.

=back

=cut

sub new {
    my( $class, %options )= @_;
    
    $options{ column_type } ||= {};
    $options{ column_map } ||= {
        ";date" => 'date',
        ";decimal" => 'decimal(%2$d,%3$d)',
        ";varchar" => 'varchar(%1$d)',
        "date;" => 'date',
        "decimal;" => 'decimal(%2$d,%3$d)',
        "varchar;" => 'varchar(%1$d)',
        "varchar;date" => 'varchar(%1$d)',
        "varchar;decimal" => 'varchar(%1$d)',
        "varchar;varchar" => 'varchar(%1$d)',
        "date;decimal" => 'decimal(%2$d,%3$d)',
        "date;varchar" => 'varchar(%1$d)',
        "date;date" => 'date',
        "decimal;date" => 'decimal(%2$d,%3$d)',
        "decimal;varchar" => 'varchar(%1$d)',
        "decimal;decimal" => 'decimal(%2$d,%3$d)',
        ";" => '',
    };
    
    bless \%options => $class;
}

=head2 C<< $g->column_type >>

    $g->guess({ foo => 1, bar => 'Hello' },{ foo => 1000, bar => 'World' });
    print $g->column_type->{ 'foo' } # decimal(4,0)

Returns a hashref containing the SQL types to store all
values in the columns seen so far.

=cut

sub column_type { $_[0]->{column_type} };

=head2 C<< $g->column_map >>

Returns the hashref used for the type transitions. The current
transitions used for generalizing data are:

  date -> decimal -> varchar

This is not entirely safe, as C<2014-01-01> can't be safely
loaded into an C<decimal> column, but assuming your data is representative
of the data to be stored that shouldn't be much of an issue.

=cut

sub column_map  { $_[0]->{column_map} };

=head2 C<< $g->guess_data_type $OLD_TYPE, @VALUES >>

    my $type= $g->guess_data_type( $type, 1,2,3,undef,'Hello','World', );

Returns the data type that encompasses the already established data type in C<$type>
and the new values as passed in via C<@values>.

If there is no preexisting data type, C<$type> can be C<undef> or the empty string.

=cut

sub guess_data_type {
    my( $self, $type, @values )= @_;

    my $column_map= $self->column_map;
    for my $value (@values) {
        my $old_type= $type;

        my $this_value_type= '';
        my $pre= 0;
        my $post= 0;
        my $length= length $value || 0;
        # Sorry, the list of recognizers is currently just hardcoded
        if( ! defined $value or $value =~ /^$/) {
            # ... nothing to guess here
        } elsif( $value =~ /^((?:19|20)\d\d)-?(0\d|1[012])-?([012]\d|3[01])$/) {
            $this_value_type= 'date';
            $pre= 8;
        } elsif( $value =~ /^\s*[+-]?(\d+)\s*$/) {
            $this_value_type= 'decimal';
            $pre= length( $1 );
            $post= 0;
        } elsif( $value =~ /^\s*[+-]?(\d+)\.(\d+)\s*$/) {
            $this_value_type= 'decimal';
            $pre= length( $1 );
            $post= length( $2 );
        } else {
            $this_value_type= 'varchar';
        };
        
        if( $type ) {
            if( $type =~ s/\s*\((\d+)\)// ) {
                $length= $1 > $length ? $1 : $length;
            } elsif( $type =~ s/\s*\((\d+),(\d+)\)// ) {
                my( $new_prec, $new_post )= ($1,$2);
                my $new_pre= $new_prec - $new_post;
                $pre= $new_pre > $pre ? $new_pre : $pre;
                $post= $2 > $post ? $2 : $post;
            };
        } else {
            $type= '';
        };
        
        if( $type ne $this_value_type ) {
            if( not exists $column_map->{ "$type;$this_value_type" }) {
                die "Unknown transition '$type' => '$this_value_type'";
            };
        };
        $type= sprintf $column_map->{ "$type;$this_value_type" }, $length, $pre+$post, $post;
    };
    $type
};

=head2 C<< $g->guess( @RECORDS ) >>

    my @data= (
        { rownum => 1, name => 'John Smith', street => 'Nowhere Road', birthday => '1996-01-01' },
        { rownum => 2, name => 'John Doe', street => 'Anywhere Plaza', birthday => '1904-01-01' },
        { rownum => 3, name => 'John Bull', street => 'Everywhere Street', birthday => '2001-09-01' },
    );
    $g->guess( @data );

Modifies the data types for the keys in the given hash.

=cut

sub guess {
    my( $self, @records )= @_;
    my $column_type= $self->column_type;
    for my $row (@records) {
        for my $col (keys %$row) {
            my( $new_type )= $self->guess_data_type($column_type->{$col}, $row->{ $col });
            if( $new_type ne ($column_type->{ $col } || '')) {
                #print sprintf "%s: %s => %s ('%s')\n",
                #    $col, ($column_type{ $col } || 'unknown'), ($new_type || 'unknown'), $info->{$col};
                $column_type->{ $col }= $new_type;
            };
        }
    }
}

=head2 C<< $g->as_sql %OPTIONS >>

    print $g->as_sql();

Returns an SQL string that describes the data seen so far.

Options:

=over 4

=item B<user>

Supply a username for the table

=item B<columns>

This allows you to specify the columns and their order. The default
is alphabetical order of the columns.

=back

=cut

sub as_sql {
    my( $self, %options )= @_;
    my $table= $options{ table };
    my $user= defined $options{ user }
              ? "$options{ user }."
              : ''
              ;
    my $column_type= $self->column_type;
    $options{ columns }||= [ sort keys %{ $column_type } ];
    my $columns= join ",\n", map { qq{    "$_" $column_type->{ $_ }} } @{ $options{ columns }};
        my($sql)= <<SQL;
create table $user$table (
$columns
)
SQL
    return $sql;
}

1;

=head1 BUG TRACKER

Please report bugs in this module via the RT CPAN bug queue at
L<https://rt.cpan.org/Public/Dist/Display.html?Name=SQL-Type-Guess>
or via mail to L<sql-type-guess-Bugs@rt.cpan.org>.

=head1 AUTHOR

Max Maischein C<corion@cpan.org>

=head1 COPYRIGHT (c)

Copyright 2014 by Max Maischein C<corion@cpan.org>.

=head1 LICENSE

This module is released under the same terms as Perl itself.

=cut