The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/local/bin/perl

# $Id: excel2txt 19 2010-07-21 15:55:47Z kyclark $

use strict;
use warnings;
use Cwd;
use English qw( -no_match_vars );
use File::Basename qw( basename );
use File::Spec::Functions;
use File::Path qw( mkpath );
use Getopt::Long;
use Pod::Usage;
use Readonly;
use Spreadsheet::ParseExcel;

our $VERSION = version->new('0.05');

Readonly my $DASH         => q{-};
Readonly my $EMPTY_STR    => q{};
Readonly my $HTML         => q{html};
Readonly my $XML          => q{xml};
Readonly my $YAML         => q{yaml};
Readonly my $TXT          => q{txt};
Readonly my $NL           => qq{\n};
Readonly my $XML_HEADER   => q[<?xml version="1.0" encoding="iso-8859-1"?>];
Readonly my %VALID_FORMAT => map { $_, 1 } ( $HTML, $XML, $TXT, $YAML );

my $be_quiet              = 0;
my $delimiter             = qq{\t};
my $normalize_headers     = 0;
my $out_dir               = cwd();
my $output_format         = $TXT;
my ( $help, $man_page, $show_version );

GetOptions(
    'd|ofs:s'             => \$delimiter,
    'f|output-format:s'   => \$output_format,
    'n|normalize-headers' => \$normalize_headers,
    'o|out-dir:s'         => \$out_dir,
    'q|quiet'             => \$be_quiet,
    'help'                => \$help,
    'man'                 => \$man_page,
    'version'             => \$show_version,
) or pod2usage(2);

if ( $help || $man_page ) {
    pod2usage({
        -exitval => 0,
        -verbose => $man_page ? 2 : 1
    });
}; 

if ( $show_version ) {
    my $prog = basename( $PROGRAM_NAME );
    print "$prog v$VERSION\n";
    exit 0;
}

my @files = @ARGV or pod2usage('No input files');
my $debug = sub { print join( $NL, @_, $EMPTY_STR ) if !$be_quiet };

if ( !-d $out_dir ) {
    mkpath( $out_dir );
}

$output_format = lc $output_format;
if ( !$VALID_FORMAT{ $output_format } ) {
    pod2usage( sprintf(
        "'%s' is not a valid output format, please choose from %s",
        $output_format, join( ', ', map { qq["$_"] } sort keys %VALID_FORMAT ),
    ) );
}

if ( $output_format eq $XML ) {
    require XML::Simple;
}
elsif ( $output_format eq $YAML ) {
    require YAML;
}

my ( $num_files_processed, $num_out_files ) = ( 0, 0 );

INPUT_FILE:
for my $file ( @files ) {
    my $err = $EMPTY_STR;
    if ( !-e $file ) {
        $err = q[doesn't exist];
    }
    elsif ( !-s _ ) {
        $err = 'is zero-length';
    }
    elsif ( !-r _ ) {
        $err = 'is unreadable';
    }

    if ( $err ) {
        warn "'$file' $err: skipping.\n";
        next INPUT_FILE;
    }

    $debug->('Processing '. basename($file));
    my $workbook    = Spreadsheet::ParseExcel::Workbook->Parse( $file );
    my $output_base = _normalize( basename( $file ) );
    $output_base    =~ s/\.xls$//;

    if ( ref $workbook->{'Worksheet'} ne 'ARRAY' ) {
        warn "'$file' has no worksheets (not an Excel spreadsheet?)\n";
        next INPUT_FILE;
    }

    my $num_worksheets = scalar @{ $workbook->{'Worksheet'} };

    WORKSHEET:
    for my $ws ( @{ $workbook->{'Worksheet'} } ) {
        my $min_row = $ws->{'MinRow'};
        my $min_col = $ws->{'MinCol'};
        my $max_row = $ws->{'MaxRow'} or next WORKSHEET;
        my $max_col = $ws->{'MaxCol'} or next WORKSHEET;
        my $ws_name = _normalize( $ws->{'Name'} );

        my $out_name;
        if ( $num_worksheets > 1 ) {
            $out_name = join $DASH, $output_base, $ws_name;
        }
        else {
            $out_name = $output_base;
        }

        my $suffix   = '.' . $output_format;
        my $out_file = catfile( $out_dir, $out_name . $suffix );

        if ( -e $out_file && -s _ ) {
            ( my $base = $out_file ) =~ /${suffix}$/;

            my $i;
            while ( -e $out_file && -e _ ) {
                $out_file = join( $EMPTY_STR, 
                    join( $DASH , $base, ++$i ), $suffix 
                );
            }
        }

        $debug->("Writing '$out_file'");

        open my $out_fh, '>', $out_file
            or die "Can't write to '$out_file': $!\n";

        my $num_rows = 0;

        my @field_names;
        my @data;
        { 
            no warnings;
            ROW:
            for my $row_num ( $min_row .. $max_row ) {
                my @row;
                for my $col_num ( $min_col .. $max_col ) {
                    my $cell = $ws->{'Cells'}[ $row_num ][ $col_num ];
                    my $val  = defined $cell ? $cell->Value : $EMPTY_STR;
                    $val     =~ s/\n//gxms;
                    push @row, $val;
                }

                if ( @row ) {
                    $num_rows++;

                    if ( $num_rows == 1 ) {
                        if ( $normalize_headers || $output_format eq $XML ) {
                            @row = map { _normalize($_) } @row;
                        }

                        @field_names = @row;

                        if ( $output_format eq $HTML ) {
                            print {$out_fh} "<table>\n";
                        }
                        elsif ( $output_format ne $TXT ) {
                            next ROW;
                        }
                    }

                    if ( $output_format eq $TXT ) {
                        print {$out_fh} join( $delimiter, @row ), $NL;
                    } 
                    elsif ( $output_format eq $HTML ) {
                        my $cell_type = $num_rows == 1 ? 'th' : 'td';

                        print {$out_fh} join $NL,
                            '<tr>',
                            ( map { qq[  <$cell_type>$_</$cell_type>] } @row ),
                            '</tr>',
                            $EMPTY_STR
                        ;
                    }
                    else {
                        push @data, {
                            map { $field_names[$_], $row[$_] } 0 .. $#row
                        };
                    }
                }
            }
        }

        if ( $output_format eq $HTML ) {
            print {$out_fh} "</table>\n";
        }
        elsif ( $output_format eq $XML ) {
            XML::Simple::XMLout( 
                { $ws_name => \@data },
                OutputFile => $out_fh,
            );
        }
        elsif ( $output_format eq $YAML ) {
            print {$out_fh} YAML::Dump(@data);
        }

        close $out_fh;

        if ( $num_rows <= 1 ) {
            warn "No data in worksheet '$ws' in file '$file'\n";
            unlink $out_file;
            next WORKSHEET;
        }

        $num_out_files++;
    }

    $num_files_processed++;
}

$debug->(sprintf(
    "Done, processed %s Excel file%s, created %s data file%s.",
    $num_files_processed, 
    $num_files_processed == 1 ? $EMPTY_STR : 's', 
    $num_out_files,
    $num_out_files       == 1 ? $EMPTY_STR : 's', 
));

exit 0;

# ----------------------------------------------------
sub _normalize {
    my $in = shift;
    if ( defined $in && $in ne $EMPTY_STR ) {
        $in = lc $in;
        $in =~ s/\s+/_/g;
        $in =~ s/[^a-zA-Z0-9._-]//g;
    }
    return $in;
}

__END__

# ----------------------------------------------------

=head1 NAME

excel2txt - convert Excel data to delimited text files

=head1 SYNOPSIS

  excel2txt [options] File1.xls [File2.xls ...]

Options:

  -d|--ofs                Output field delimiter (default is Tab)
  -f|--output-format      "txt," "html," "xml" or "yaml" (defaults to "txt")
  -n|--normalize-headers  Normalize column headers (see below)
  -o|--out-dir            Where to place output file (defaults to CWD)
  -q|--quiet              Do not print any status messages

  --help                  Show brief help and exit
  --man                   Show full documentation
  --version               Show version and exit

=head1 DESCRIPTION

For each worksheet within an Excel spreadsheet, creates a text file.
By default, the output files will be plaint text files using a Tab for 
the delimiter.  Use the "-d" switch to specify a different delimiter
such as a comma.  You may also choose to create an HTML &gt;table&lt;,
an XML file, or a YAML dump using the "-f" option.

The output file names will be normalized such that they will consist
of only lowercase letters with spaces replaced by underscores and
non-alphabetic characters deleted.  The "-n" option will also apply this
transformation the column headers.  If there is only one worksheet in
an spreadsheet, then the output file will simply be the spreadsheet's
name;  if there is more than one worksheet, then a separate output
file will be created using the spreadsheet's name plus the worksheet's
name.  In any event where the default output file exists and is of a
non-zero size, then a "-1" (or "-2," etc.) will be added until a file
name is found that is not in use.

By default, progress messages are printed.  If you do not wish to see
these, use the "-q" flag.

=head1 SEE ALSO

Spreadsheet::ParseExcel, http://code.google.com/p/perl-excel2txt/.

=head1 AUTHOR

Ken Youens-Clark E<lt>kclark@cpan.orgE<gt>.

=head1 COPYRIGHT

Copyright (c) 2005-10 Ken Youens-Clark

This library is free software;  you can redistribute it and/or modify 
it under the same terms as Perl itself.

=cut