#!/usr/local/bin/perl
# $Id: excel2txt 19 2010-07-21 15:55:47Z kyclark $
use strict;
use warnings;
use Cwd;
use English qw( -no_match_vars );
use File::Basename qw( basename );
use File::Spec::Functions;
use File::Path qw( mkpath );
use Getopt::Long;
use Pod::Usage;
use Readonly;
use Spreadsheet::ParseExcel;
our $VERSION = version->new('0.05');
Readonly my $DASH => q{-};
Readonly my $EMPTY_STR => q{};
Readonly my $HTML => q{html};
Readonly my $XML => q{xml};
Readonly my $YAML => q{yaml};
Readonly my $TXT => q{txt};
Readonly my $NL => qq{\n};
Readonly my $XML_HEADER => q[<?xml version="1.0" encoding="iso-8859-1"?>];
Readonly my %VALID_FORMAT => map { $_, 1 } ( $HTML, $XML, $TXT, $YAML );
my $be_quiet = 0;
my $delimiter = qq{\t};
my $normalize_headers = 0;
my $out_dir = cwd();
my $output_format = $TXT;
my ( $help, $man_page, $show_version );
GetOptions(
'd|ofs:s' => \$delimiter,
'f|output-format:s' => \$output_format,
'n|normalize-headers' => \$normalize_headers,
'o|out-dir:s' => \$out_dir,
'q|quiet' => \$be_quiet,
'help' => \$help,
'man' => \$man_page,
'version' => \$show_version,
) or pod2usage(2);
if ( $help || $man_page ) {
pod2usage({
-exitval => 0,
-verbose => $man_page ? 2 : 1
});
};
if ( $show_version ) {
my $prog = basename( $PROGRAM_NAME );
print "$prog v$VERSION\n";
exit 0;
}
my @files = @ARGV or pod2usage('No input files');
my $debug = sub { print join( $NL, @_, $EMPTY_STR ) if !$be_quiet };
if ( !-d $out_dir ) {
mkpath( $out_dir );
}
$output_format = lc $output_format;
if ( !$VALID_FORMAT{ $output_format } ) {
pod2usage( sprintf(
"'%s' is not a valid output format, please choose from %s",
$output_format, join( ', ', map { qq["$_"] } sort keys %VALID_FORMAT ),
) );
}
if ( $output_format eq $XML ) {
require XML::Simple;
}
elsif ( $output_format eq $YAML ) {
require YAML;
}
my ( $num_files_processed, $num_out_files ) = ( 0, 0 );
INPUT_FILE:
for my $file ( @files ) {
my $err = $EMPTY_STR;
if ( !-e $file ) {
$err = q[doesn't exist];
}
elsif ( !-s _ ) {
$err = 'is zero-length';
}
elsif ( !-r _ ) {
$err = 'is unreadable';
}
if ( $err ) {
warn "'$file' $err: skipping.\n";
next INPUT_FILE;
}
$debug->('Processing '. basename($file));
my $workbook = Spreadsheet::ParseExcel::Workbook->Parse( $file );
my $output_base = _normalize( basename( $file ) );
$output_base =~ s/\.xls$//;
if ( ref $workbook->{'Worksheet'} ne 'ARRAY' ) {
warn "'$file' has no worksheets (not an Excel spreadsheet?)\n";
next INPUT_FILE;
}
my $num_worksheets = scalar @{ $workbook->{'Worksheet'} };
WORKSHEET:
for my $ws ( @{ $workbook->{'Worksheet'} } ) {
my $min_row = $ws->{'MinRow'};
my $min_col = $ws->{'MinCol'};
my $max_row = $ws->{'MaxRow'} or next WORKSHEET;
my $max_col = $ws->{'MaxCol'} or next WORKSHEET;
my $ws_name = _normalize( $ws->{'Name'} );
my $out_name;
if ( $num_worksheets > 1 ) {
$out_name = join $DASH, $output_base, $ws_name;
}
else {
$out_name = $output_base;
}
my $suffix = '.' . $output_format;
my $out_file = catfile( $out_dir, $out_name . $suffix );
if ( -e $out_file && -s _ ) {
( my $base = $out_file ) =~ /${suffix}$/;
my $i;
while ( -e $out_file && -e _ ) {
$out_file = join( $EMPTY_STR,
join( $DASH , $base, ++$i ), $suffix
);
}
}
$debug->("Writing '$out_file'");
open my $out_fh, '>', $out_file
or die "Can't write to '$out_file': $!\n";
my $num_rows = 0;
my @field_names;
my @data;
{
no warnings;
ROW:
for my $row_num ( $min_row .. $max_row ) {
my @row;
for my $col_num ( $min_col .. $max_col ) {
my $cell = $ws->{'Cells'}[ $row_num ][ $col_num ];
my $val = defined $cell ? $cell->Value : $EMPTY_STR;
$val =~ s/\n//gxms;
push @row, $val;
}
if ( @row ) {
$num_rows++;
if ( $num_rows == 1 ) {
if ( $normalize_headers || $output_format eq $XML ) {
@row = map { _normalize($_) } @row;
}
@field_names = @row;
if ( $output_format eq $HTML ) {
print {$out_fh} "<table>\n";
}
elsif ( $output_format ne $TXT ) {
next ROW;
}
}
if ( $output_format eq $TXT ) {
print {$out_fh} join( $delimiter, @row ), $NL;
}
elsif ( $output_format eq $HTML ) {
my $cell_type = $num_rows == 1 ? 'th' : 'td';
print {$out_fh} join $NL,
'<tr>',
( map { qq[ <$cell_type>$_</$cell_type>] } @row ),
'</tr>',
$EMPTY_STR
;
}
else {
push @data, {
map { $field_names[$_], $row[$_] } 0 .. $#row
};
}
}
}
}
if ( $output_format eq $HTML ) {
print {$out_fh} "</table>\n";
}
elsif ( $output_format eq $XML ) {
XML::Simple::XMLout(
{ $ws_name => \@data },
OutputFile => $out_fh,
);
}
elsif ( $output_format eq $YAML ) {
print {$out_fh} YAML::Dump(@data);
}
close $out_fh;
if ( $num_rows <= 1 ) {
warn "No data in worksheet '$ws' in file '$file'\n";
unlink $out_file;
next WORKSHEET;
}
$num_out_files++;
}
$num_files_processed++;
}
$debug->(sprintf(
"Done, processed %s Excel file%s, created %s data file%s.",
$num_files_processed,
$num_files_processed == 1 ? $EMPTY_STR : 's',
$num_out_files,
$num_out_files == 1 ? $EMPTY_STR : 's',
));
exit 0;
# ----------------------------------------------------
sub _normalize {
my $in = shift;
if ( defined $in && $in ne $EMPTY_STR ) {
$in = lc $in;
$in =~ s/\s+/_/g;
$in =~ s/[^a-zA-Z0-9._-]//g;
}
return $in;
}
__END__
# ----------------------------------------------------
=head1 NAME
excel2txt - convert Excel data to delimited text files
=head1 SYNOPSIS
excel2txt [options] File1.xls [File2.xls ...]
Options:
-d|--ofs Output field delimiter (default is Tab)
-f|--output-format "txt," "html," "xml" or "yaml" (defaults to "txt")
-n|--normalize-headers Normalize column headers (see below)
-o|--out-dir Where to place output file (defaults to CWD)
-q|--quiet Do not print any status messages
--help Show brief help and exit
--man Show full documentation
--version Show version and exit
=head1 DESCRIPTION
For each worksheet within an Excel spreadsheet, creates a text file.
By default, the output files will be plaint text files using a Tab for
the delimiter. Use the "-d" switch to specify a different delimiter
such as a comma. You may also choose to create an HTML >table<,
an XML file, or a YAML dump using the "-f" option.
The output file names will be normalized such that they will consist
of only lowercase letters with spaces replaced by underscores and
non-alphabetic characters deleted. The "-n" option will also apply this
transformation the column headers. If there is only one worksheet in
an spreadsheet, then the output file will simply be the spreadsheet's
name; if there is more than one worksheet, then a separate output
file will be created using the spreadsheet's name plus the worksheet's
name. In any event where the default output file exists and is of a
non-zero size, then a "-1" (or "-2," etc.) will be added until a file
name is found that is not in use.
By default, progress messages are printed. If you do not wish to see
these, use the "-q" flag.
=head1 SEE ALSO
Spreadsheet::ParseExcel, http://code.google.com/p/perl-excel2txt/.
=head1 AUTHOR
Ken Youens-Clark E<lt>kclark@cpan.orgE<gt>.
=head1 COPYRIGHT
Copyright (c) 2005-10 Ken Youens-Clark
This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.
=cut