I was inspired by the recent contribution from
eric256 (
CSV to Excel Converter), and was also struck by the coincidence that I had just finished this script to convert from Excel to something else...
This is "general-purpose" in the sense that it treats all Excel files the same way, but it's "customized" in the sense that I was compelled to preserve certain "extra features" of Excel formatting: background color of cells, foreground color and basic style of font (bold/italic/underline), and even substring features where font color and/or style change at points within a given cell. (And it has to handle unicode, of course.)
Naturally, I had to invent my own xml structure for all this. (And I wanted the output format to be "grep-friendly" -- sometimes it's nice to have a simple alternative to parsing xml... ;) I wrote a POD man page for it, but I'm leaving that out for brevity. I think the output will speak for itself -- it's pretty simple.
#!/usr/bin/perl
use strict;
use Encode;
use Cwd qw/abs_path/;
use File::Basename;
use Spreadsheet::ParseExcel;
my $Usage = "$0 file.xls > file.txt\n";
( @ARGV == 1 and -f $ARGV[0] ) or die $Usage;
my $filepath = shift;
my ( $name, $path, $suff ) = fileparse( $filepath, qw/.xls/ );
if ( $path !~ m{^/} ) {
$path ||= ".";
$path = abs_path( $path );
}
my $xl = Spreadsheet::ParseExcel->new;
my $wb = $xl->Parse( $filepath ) or die "$filepath: $!\n";
my %abrckt = ( '<' => '<', '>' => '>' );
binmode STDOUT, ":utf8";
print "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
print "<xlsfile name=\"$name\" path=\"$path\">\n";
for my $sheet ( @{$wb->{Worksheet}} ) {
printf( "<sheet name=\"%s\" >\n", $sheet->{Name} );
$sheet->{MaxRow} ||= $sheet->{MinRow};
for my $row ( $sheet->{MinRow} .. $sheet->{MaxRow} ) {
$sheet->{MaxCol} ||= $sheet->{MinCol};
my $col_ltr = 'A';
for my $col ( $sheet->{MinCol} .. $sheet->{MaxCol} ) {
my $cell = $sheet->{Cells}[$row][$col];
if ( ! $cell ) {
$col_ltr++;
next;
}
my $val = $cell->{Val};
if ( !defined( $val ) or $val eq '' ) {
$col_ltr++;
next;
}
$val = decode( "UTF-16BE", $val ) if ( $cell->{Code} eq 'u
+cs2' );
if ( $val =~ /^\s*$/ ) {
$col_ltr++;
next;
}
$val =~ s/\&/\&/g;
$val =~ s/([<>])/$abrckt{$1}/g;
my $attrstr = get_cell_attribs( $cell->{Format} );
if ( not $cell->{Rich} ) {
$attrstr .= get_font_attribs( $cell->{Format}{Font} );
printf( "<d r=\"%s\" c=\"%s\"%s>%s</d>\n",
$row+1, $col_ltr++, $attrstr, $val );
}
else {
my $bgnchr = 0;
my $curr_attr = get_font_attribs( $cell->{Format}{Font
+} );
my @chunks = ( );
for my $rich ( @{$cell->{Rich}} ) {
my ( $rpos, $rfont ) = @$rich;
my $chnkval = substr( $val, $bgnchr, $rpos-$bgnchr
+ );
if ( $chnkval !~ /^\s*$/ ) {
push @chunks, { val => $chnkval,
fnt => $curr_attr };
}
$curr_attr = get_font_attribs( $rfont );
$bgnchr = $rpos;
last if ( $rpos >= length( $val ) or
substr( $val, $rpos ) =~ /^\s*$/ );
}
push @chunks, { val => substr( $val, $bgnchr ),
fnt => $curr_attr };
if ( @chunks > 1 ) {
my $j = 1; # check for and merge adjacent blocks
+with same attributes
while ( $j < @chunks ) {
my $i = $j-1;
if ( $chunks[$i]{fnt} ne $chunks[$j]{fnt} ) {
$j++;
} else {
$chunks[$j]{val} = $chunks[$i]{val} . $chu
+nks[$j]{val};
@chunks = splice( @chunks, $i, $j );
}
}
}
if ( @chunks == 1 ) {
$attrstr .= $curr_attr;
printf( "<d r=\"%s\" c=\"%s\"%s>%s</d>\n",
$row+1, $col_ltr++, $attrstr, $val );
}
else {
printf( "<d r=\"%s\" c=\"%s\"%s><fullval>%s</fullv
+al>\n",
$row+1, $col_ltr++, $attrstr, $val );
for my $chnk ( @chunks ) {
printf( " <sd%s>%s</sd>\n", $chnk->{fnt}, $chn
+k->{val} );
}
print "</d>\n";
}
}
}
}
print "</sheet>\n";
}
print "</xlsfile>";
sub get_font_attribs
{
my ( $font ) = @_;
my %fontfmt = ();
$fontfmt{style} = 'b' if ( $font->{Attr} & 1 );
$fontfmt{style} .= 'i' if ( $font->{Attr} & 2 );
$fontfmt{style} .= 'u' if ( $font->{Attr} & 4 );
my $i = Spreadsheet::ParseExcel->ColorIdxToRGB( $font->{Color} );
$fontfmt{fgclr} = $i unless ( $i =~ /^0+$/ );
attr_hash2str( \%fontfmt );
}
sub get_cell_attribs
{
my ( $form ) = @_;
my %cellfmt = ();
if ( $form->{Fill}[0] != 0 and $form->{Fill}[1] != 64 ) {
$cellfmt{bgclr} = Spreadsheet::ParseExcel->ColorIdxToRGB( $for
+m->{Fill}[1] );
}
attr_hash2str( \%cellfmt );
}
sub attr_hash2str
{
my $hash = shift;
my $attrstr = '';
if ( keys %$hash ) {
$attrstr = " ". join( " ",
map { sprintf( "%s=\"%s\"", $_, $$hash{$
+_} ) }
sort keys %$hash );
}
return $attrstr;
}