The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
# Import.pm - Easy to Use DBI import interface

# Copyright (C) 2004-2012 Stefan Hornburg (Racke) <racke@linuxia.de>

# Authors: Stefan Hornburg (Racke) <racke@linuxia.de>
# Maintainer: Stefan Hornburg (Racke) <racke@linuxia.de>
# Version: 0.19

# This file is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation; either version 2, or (at your option) any
# later version.

# This file is distributed in the hope that it will be
# useful, but WITHOUT ANY WARRANTY; without even the implied warranty
# of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this file; see the file COPYING.  If not, write to the Free
# Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.

package DBIx::Easy::Import;

use strict;
use vars qw($VERSION @ISA @EXPORT @EXPORT_OK);

require Exporter;

@ISA = qw(Exporter);
# Items to export into callers namespace by default. Note: do not export
# names by default without a very good reason. Use EXPORT_OK instead.
# Do not simply export all your public functions/methods/constants.
@EXPORT = qw(
);

# Public variables
$VERSION = '0.19';

use DBI;
use DBIx::Easy;

sub new {
	my $proto = shift;
	my $class = ref($proto) || $proto;
	my $self = {driver => shift, database => shift};

	bless ($self, $class);
}

sub update {
	my ($self, %params) = @_;

	$self->_do_import(%params);
}

sub initialize {
	my ($self, $file, $format) = @_;
	my ($sep_char);
	
	$format = uc($format);

	if ($format =~ /^CSV/) {
		$format = 'CSV';
		if ($') {
			$sep_char = $';
			$sep_char =~ s/^\s+//;
			$sep_char =~ s/\s+$//;
		}
		eval {
			require Text::CSV_XS;
		};
		if ($@) {
			die "$0: couldn't load module Text::CSV_XS\n";
		}
		$self->{func} = \&get_columns_csv;
		$self->{parser} = new Text::CSV_XS ({'binary' => 1, 'sep_char' => $sep_char});
	} elsif ($format eq 'XLS') {
		eval {
			require Spreadsheet::ParseExcel;
		};
		if ($@) {
			die "$0: couldn't load module Spreadsheet::ParseExcel\n";
		}
		$self->{parser} = new Spreadsheet::ParseExcel;
	} elsif ($format eq 'TAB') {
		$self->{func} = \&get_columns_tab;
	} else {
		die qq{$0: unknown format "$format"}, "\n";
	}

	if ($file) {
		# read input from file
		require IO::File;
		$self->{fd_input} = new IO::File;
		$self->{fd_input}->open($file)
			|| die "$0: couldn't open $file: $!\n";
	} else {
		# read input from standard input
		require IO::Handle;
		$self->{fd_input} = new IO::Handle;
		$self->{fd_input}->fdopen(fileno(STDIN),'r');
	}

}

sub _do_import {
	my ($self, %params) = @_;
	my ($format, $sep_char, %colmap, %hcolmap);

	$self->{colflag} = 1;

	$self->initialize($params{file}, $params{'format'});
	
	my @columns;

	if ($params{'columns'}) {
		$self->{colflag} = ! ($params{'columns'} =~ s/\s*[\!^]//);
		
		# setup positive/negative list for columns
		for (split(/\s*,\s*/, $params{'columns'})) {
			$self->{usecol}->{$_} = $self->{colflag};
		}
	}

	if (ref($params{'map'}) eq 'HASH') {
		%colmap = %{$params{'map'}};
	} elsif ($params{'map'}) {
		# parse column name mapping
		my ($head, $name);
		foreach (split (/;/, $params{'map'})) {
			($head, $name) = split /=/;
			$colmap{$head} = $name;
		}
	}

	if (1) {
		my %hcolmap;
		my @columns;

		if ($self->{func}->($self, \@columns) <= 0)  {
			die "$0: couldn't find headline\n";
		}

		if ($params{'map_filter'} eq 'lc') {
			@columns = map {lc($_)} @columns;
		}

		# remove whitespace from column names and mark them
		map {s/^\s+//; s/\s+$//; $hcolmap{$_} = 1;} @columns;

		if ($params{'map'}) {
			my @newcolumns;
        
			# filter column names
			foreach (@columns) {
				if (exists $colmap{$_}) {
					push (@newcolumns, $colmap{$_});
					$hcolmap{$colmap{$_}} = 1;
				} else {
					push (@newcolumns, $_);
				}
			}
			@columns = @newcolumns;
		}

		# add any other columns explicitly selected
		for (sort (keys %{$self->{usecol}})) {
			next if $hcolmap{$_};
			next unless exists $self->{usecol}->{$_};
			next unless $self->{usecol}->{$_};
			push (@columns, $_);
		}

		$self->{fieldmap}->{$params{table}} = \@columns;
	}
	
	# database access
	my $dbif = $self->{dbif} = new DBIx::Easy ($self->{driver} || $params{driver},
							   $self->{database} || $params{database});
	# determine column names
    my @names = $self->column_names ($params{table});
    my $fieldnames = \@names;
	my @values;

	while ($self->{func}->($self, \@columns))  {
		my (@data);
		
		@values = @columns;
		
		# sanity checks on input data
		my $typeref = $dbif -> typemap ($params{table});
		my $sizeref = $dbif -> sizemap ($params{table});

		for (my $i = 0; $i <= $#$fieldnames; $i++) {
			# check for column exclusion
			if (keys %{$self->{usecol}}) {
				# note: we do not check the actual value !!
				if ($self->{colflag} && ! exists $self->{usecol}->{$$fieldnames[$i]}) {
					next;
				}
				if (! $self->{colflag} && exists $self->{usecol}->{$$fieldnames[$i]}) {
					next;
				}
			}
			
			# expand newlines and tabulators
			if (defined $values[$i]) {
				$values[$i] =~ s/\\n/\n/g;
				$values[$i] =~ s/\\t/\t/g;
			}
        
			# check if input exceeds column capacity
			unless (exists $$typeref{$$fieldnames[$i]}) {
				warn ("$0: No type information for column $$fieldnames[$i] found\n");
				next;
			}
			unless (exists $$sizeref{$$fieldnames[$i]}) {
				warn ("$0: No size information for column $$fieldnames[$i] found\n");
				next;
			}
			if ($$typeref{$$fieldnames[$i]} == DBI::SQL_CHAR) {
				if (defined $values[$i]) {
					if (length($values[$i]) > $$sizeref{$$fieldnames[$i]}) {
						warn ("$0: Data for field $$fieldnames[$i] truncated: $values[$i]\n");
						$values[$i] = substr($values[$i], 0,
											 $$sizeref{$$fieldnames[$i]});
					}
				} else {
					# avoid insertion of NULL values
					$values[$i] = '';
				}	
			} elsif ($$typeref{$$fieldnames[$i]} == DBI::SQL_VARCHAR) {
				if (defined $values[$i]) {
					if (length($values[$i]) > $$sizeref{$$fieldnames[$i]}) {
						warn ("$0: Data for field $$fieldnames[$i] truncated: $values[$i]\n");
						$values[$i] = substr($values[$i], 0,
											 $$sizeref{$$fieldnames[$i]});
					}
				} else {
					# avoid insertion of NULL values
					$values[$i] = '';
				}
			}
#        push (@data, $$fieldnames[$i], $values[$i]);
		}
		
		# check if record exists
		my %keymap = $self->key_names ($params{table}, $params{'keys'} || 1, 1);
		my @keys = (keys(%keymap));
		my @terms = map {$_ . ' = ' . $dbif->quote($values[$keymap{$_}])}
			(@keys);
		my $sth = $dbif -> process ('SELECT ' . join(', ', @keys)
								 . " FROM $params{table} WHERE "
								 . join (' AND ', @terms));
		while ($sth -> fetch) {}
		
		if ($sth -> rows () > 1) {
			$" = ', ';
			die ("$0: duplicate key(s) @keys in table $params{table}\n");
		}

		my $update = $sth -> rows ();
		$sth -> finish ();
    
		# generate SQL statement
		for (my $i = 0; $i <= $#$fieldnames; $i++) {
			# check for column exclusion
			if (keys %{$self->{usecol}}) {
				# note: we do not check the actual value !!
				if ($self->{colflag} && ! exists $self->{usecol}->{$$fieldnames[$i]}) {
					next;
				}
				if (! $self->{colflag} && exists $self->{usecol}->{$$fieldnames[$i]}) {
					next;
				}
			}
			# expand newlines
			if (defined $values[$i]) {
				$values[$i] =~ s/\\n/\n/g;
			}
			push (@data, $$fieldnames[$i], $values[$i]);
		}

		if ($update) {
			$dbif -> update ($params{table}, join (' AND ', @terms), @data);
		} else {
			if ($params{'update_only'}) {
				$" = ', ';
				die ("$0: key(s) @keys not found\n");
			}
			$dbif -> insert ($params{table}, @data);
		}
	}
}

# -------------------------------------------------
# FUNCTION: column_names DBIF TABLE [START]
#
# Returns array with column names from table TABLE
# using database connection DBIF.
# Optional parameter START specifies column where
# the array should start with.
# -------------------------------------------------

sub column_names {
    my ($self, $table, $start) = @_;    
    my ($names, $sth);

    $start = 0 unless $start;
    
    if (exists $self->{fieldmap}->{$table}) {
        $names = $self->{fieldmap}->{$table};
    } else {
        $sth = $self->{dbif}-> process ("SELECT * FROM $table WHERE 0 = 1");
        $names = $self->{fieldmap}->{$table} = $sth -> {NAME};
        $sth -> finish ();
    }

    @$names[$start .. $#$names];
}


# --------------------------------------------------
# FUNCTION: key_names DBIF TABLE KEYSPEC [HASH]
#
# Returns array with key names for table TABLE.
# Database connection DBIF may be used to
# retrieve necessary information.
# KEYSPEC contains desired keys, either a numeric
# value or a comma-separated list of keys.
# If HASH is set, a mapping between key name
# and position is returned.
# --------------------------------------------------

sub key_names () {
    my ($self, $table, $keyspec, $hash) = @_;
    
    my ($numkeysleft, $i);
    my @columns = $self->column_names ($table);
    my (@keys, %kmap);
    
    $keyspec =~ s/^\s+//; $keyspec =~ s/\s+$//;

    if ($keyspec =~ /^\d+$/) {
        #
        # passed keys are numeric, figure out the names
        #

        $numkeysleft = $keyspec;

        for ($i = 0; $i < $numkeysleft && $i < @columns; $i++) {
            if (keys %{$self->{usecol}}) {
                # note: we do not check the actual value !!
                if ($self->{colflag} && ! exists $self->{usecol}->{$columns[$i]}) {
                    $numkeysleft++;
                    next;
                }
                if (! $self->{colflag} && exists $self->{usecol}->{$columns[$i]}) {
                    $numkeysleft++;
                    next;
                }
            }
            if ($hash) {
                $kmap{$columns[$i]} = $i;
            } else {
                push (@keys, $columns[$i]);
            }
        }
	} else {
        #
        # key names are passed explicitly
        #

        my %colmap;
        
        for ($i = 0; $i < @columns; $i++) {
            $colmap{$columns[$i]} = $i;
        }
        
        for (split (/\s*,\s*/, $keyspec)) {
            # sanity check
            unless (exists $colmap{$_}) {
                die "$0: key \"$_\" appears not in column list\n";
            }
            
            if ($hash) {
                $kmap{$_} = $colmap{$_};
            } else {
                push (@keys, $_);
            }
        }
    }

    return $hash ? %kmap : @keys;
}

# FUNCTION: get_columns_csv IREF FD COLREF

sub get_columns_csv {
	my ($self, $colref) = @_;
	my $line;
	my $msg;
	my $fd = $self->{fd_input};
	
	while (defined ($line = <$fd>)) {
		if ($self->{parser}->parse($line)) {
			# csv line completed, delete buffer
			@$colref = $self->{parser}->fields();
			$self->{buffer} = '';
			return @$colref;
		} else {
			if (($line =~ tr/"/"/) % 2) {
			# odd number of quotes, try again with next line
				$self->{buffer} = $line;
			} else {
				$msg = "$0: $.: line not in CSV format: " . $self->{parser}->error_input() . "\n";
				die ($msg);
			}
		}
	}
}

# ----------------------------------------
# FUNCTION: get_columns_tab IREF FD COLREF
#
# Get columns from a tab separated file.
# ----------------------------------------

sub get_columns_tab {
	my ($self, $colref) = @_;
	my $line;
	my $fd = $self->{fd_input};
	
	while (defined ($line = <$fd>)) {
		# skip empty/blank/comment lines
		next if $line =~ /^\#/; next if $line =~ /^\s*$/;
		# remove newlines and carriage returns
		chomp ($line);
		$line =~ s/\r$//;

		@$colref = split (/\t/, $line);
		return @$colref;
	}
}

# ----------------------------------------
# FUNCTION: get_columns_xls IREF FD COLREF
#
# Get columns from a XLS spreadsheet.
# ----------------------------------------

sub get_columns_xls {
	my ($iref, $fd, $colref) = @_;

	unless ($iref->{workbook}) {
		# parse the spreadsheet once
		$iref->{workbook} = $iref->{object}->Parse($fd);
		unless ($iref->{workbook}) {
			die "$0: couldn't parse spreadsheet\n";
		}
		$iref->{worksheet} = $iref->{workbook}->{Worksheet}[0];
		$iref->{row} = 0;
	}

	if ($iref->{row} <= $iref->{worksheet}->{MaxRow}) {
		@$colref = map {$_->Value()}
			@{$iref->{worksheet}->{Cells}[$iref->{row}++]};
		return @$colref;
	}
}

sub get_columns {
	my ($self, $colref) = @_;

	return $self->{func}->($self, $colref);
}

1;