The London Perl and Raku Workshop takes place on 26th Oct 2024. If your company depends on Perl, please consider sponsoring and/or attending.
# --*-Perl-*--
# $Id: MSWord.pm 19 2004-12-12 12:15:43Z tandler $
#

=head1 NAME

PBib::Document::MSWord - Handle Word Documents

=head1 SYNOPSIS

  use PBib::Document::MSWord;

=head1 DESCRIPTION

=head2 EXPORT

=cut

package PBib::Document::MSWord;
use 5.006;
use strict;
use warnings;
#  use English;

# for debug:
use Data::Dumper;

BEGIN {
    use vars qw($Revision $VERSION);
	my $major = 1; q$Revision: 19 $ =~ /: (\d+)/; my ($minor) = ($1); $VERSION = "$major." . ($minor<10 ? '0' : '') . $minor;
}

# superclass
use base qw(PBib::Document::PBib);
#  our @ISA = qw(PBib::Document::PBib);

# used modules
use Win32::OLE;

# used own modules
use PBib::Document::RTF;

# module variables
#use vars qw(mmmm);

END {
	my $Count = Win32::OLE->EnumAllObjects(sub {
			my $Object = shift;
			my $Class = Win32::OLE->QueryObjectType($Object);
			printf STDERR "# Object=%s Class=%s\n", $Object, $Class;
		});
	print STDERR "Document::MSWord: $Count OLE objects left ...\n";
}

sub DESTROY ($) {
	my $self = shift;
	print STDERR "Document::MSWord: Destroy document ", $self->filename(), "\n" if $self->{verbose};
	#  $self->close();
}


#
#
# text access methods
#
#

sub paragraphs {
  my $self = shift;
  return $self->{'paragraphs'} if defined($self->{'paragraphs'});
  my $wordPars = $self->wordParagraphs();
  my @pars = map { convertWordText($_) } @$wordPars;
  $self->{'paragraphs'} = \@pars;
  return \@pars;
}


#
#
# converting
#
#

# do anything you want to before being converted
# the given object is used for conversion.
sub prepareConvert {
	my ($self, $conv) = @_;
	
	# we're doing the conversion based on RTF,
	# so first, convert the DOC to RTF
	my $file = $self->saveAsRTF();
	return undef unless $file;
	return $self if $file eq $self->filename();
	
	my $outDoc = $conv->outDoc();
	if( ref($outDoc) ne 'PBib::Document::RTF' ) {
		# the output for conversion has to be the same format
		# --> create new (temp) outfile
		my $name = $outDoc->filename();
		#  $name .= '.rtf' unless $name =~ s/\.\w+$/.rtf/;
		$conv->{'outDoc'} = new PBib::Document::RTF(
			'filename' => $name,
			'mode' => 'w',
			'finalizeConvert' => $outDoc, ## OBSOLETE??
			);
		#  print Dumper $conv->{'outDoc'};
	}
	my $inDoc = new PBib::Document::RTF(
		'filename' => $file,
		'mode' => 'r',
		);
	#  print Dumper $inDoc;
	$inDoc->close(); # close word document
	#  print Dumper $conv->{'foundInfo'};
	$conv->{'foundInfo'} = undef;
	return $inDoc;
}

# do anything you want to after being converted
# the given object is used for further processing.
sub finalizeConvert {
	my ($self, $conv) = @_;
	#  print Dumper $self;
	return $self;
}

#
#
# converting to internal format
#
#

sub convertWordText ($) {
   # Here some characters could be converted like:
   my $text = shift;

# remove 0-bytes
$text =~ s/\x00//g;

# replace CR-LF
# Paragraph end is "\x0d", line-break is "\x0b"
$text =~ s/\x0a//g;		# strip LF
$text =~ s/\x0d/\n\n/g;	# convert CR to \n
$text =~ s/\x0b/\n/g;

# replace special chars

# german quotes: double open "\x1e\x20", close "\x1c\x20",
#		single open "\x1a\x20", close "\x18\x20"
# english quotes: double open "\x1c\x20", close "\x1d\x20",
#		single open "\x18\x20", close "\x19\x20"
$text =~ s/\x18\x20/'/g;
$text =~ s/\x19\x20/'/g;
$text =~ s/\x1a\x20/'/g;
$text =~ s/\x1c\x20/"/g;
$text =~ s/\x1d\x20/"/g;
$text =~ s/\x1e\x20/"/g;

### missing "\x1b\x20" ??

# hyphens:
# normal hyphen "\x2d" '-'
# nonbreaking "\x1e(\x00)?" -- bei 2-byte-text folgt ein 0-byte
# optional hyphen "\x1f"
# en-dash "\x13\x20" oder "\x96"
# em-dash "\x14\x20" oder "\x97"
### missing "\x95" ??
$text =~ s/\x96/--/g; #$text =~ s/\x13\x20/--/g;
$text =~ s/\x97/---/g; #$text =~ s/\x14\x20/---/g;
$text =~ s/\x1f/{{-}}/g;
# ellipsis "\x85"
$text =~ s/\x85/.../g;
$text =~ tr/\x1e\x84\x91\x92\x93\x94/-"`'""/;

# non-breaking space: "\xa0"
# en-space: "\x02\x20"
# em-space: "\x03\x20"
$text =~ s/\xa0/{{ }}/g;
$text =~ s/\x02\x20/  /g;
$text =~ s/\x03\x20/   /g;


# replace word fields
$text =~ s/\x13\s*REF\s(\S+)\s[^\x14\x15]*\x14Error! Reference source not found\.\x15/[$1]/g;
$text =~ s/\x13\s*REF\s(\S+)\s[^\x14\x15]*\x14([^\x15]*)\x15/ $2 . quoteRef($1) /eg;
$text =~ s/\x13([^\x14\x15]*)\x14([^\x15]*)\x15/$2\{\{$1\}\}/g;
#$text =~ tr/\x13\x14\x15/{|}/;

# pictures?
$text =~ s/\x01/{{picture}}/g;

# escape all other control chars
$text =~ s/([\x01-\x09\x0b\x0c\x0e-\x1f\x80-\x9f])/'{{'.ord($1).'}}'/eg;

#   $text =~ s/[\x08\x09]/\t/g;
#   $text =~ s/(\x07\x07)/$1\x0d/g;
#   $text =~ s/\x07/ /g;
#   $text =~ s/[\xa0]/ /g;
#   $text =~ s/[\x0b\x0c\x0e]/\x0d/g;
#   $text =~ tr/\x1e\x84\x91\x92\x93\x94/-"`'""/;

   # Away with Words control characters
#   $text =~ s/[\x00-\x06\x0f-\x1f\x80-\x9f]//g;
  return $text;
}

sub quoteRef ($) {
  my $ref = shift;
  return ( $ref =~ /^(Sec)|(Req)|(Fig)/ ) ? "{{$ref}}" : "[$ref]"
}


sub quoteFieldId { my ($self, $id) = @_;
#
# return a valid field ID
#
# strip all non-bookmark chars, and add a prefix "r"
#
  $id =~ s/[^A-Z0-9]//gi;
  return $id;
}


#
#
#
#
#

sub replaceAll {
	my ($self, $find, $text, $repl) = @_;
	
	wordReplaceAll($find, $text, $repl);
	
	# now check field for [# ... #] patterns
	# ... it's important that the text between [+...+] and [-...-] is NOT matched greedy! --> .*?
	while( $repl =~ s/(\[\+.+?\+\].*?\[\-.+?\-\])// ) {
		$self->xtags()->{$1} = 1;
print "<<<$1>>>\n";
	}
}


our %xchars = (
	'p' => '^p',
	'br' => '^|',
	'pbr' => '^m',
	'cbr' => '^n',
	'tab' => '^t',
	'em-' => '^+',
	'en-' => '^=',
	'nbr ' => '^s',
	'nbr-' => '^~',
	'opt-' => '^-',
	);
#	'optbr'	- word has no opt. line break
#	'em '		- ???
#	'en '		- ???
sub xchar {
	my ($self, $xchar) = @_;
	return $xchars{$xchar} || '';
}

sub finishReplace ($$) {
	my ($self, $sel) = @_;
	my $find = $sel->Find();
	foreach my $xtag (keys %{$self->xtags()}) {
		#    print "$xtag\n";
		$self->xtagToClipboard($sel, $xtag);
		wordReplaceAll($find, $xtag, "^c");
	}
	print "xchars [#...#]\n";
	foreach my $xchar (keys %xchars) {
		wordReplaceAll($find, "[#$xchar#]", $xchars{$xchar});
	}
	#  wordReplaceAll($find, "[#p#]", "^p"); # new paragraph
	#  wordReplaceAll($find, "[#br#]", "^|"); # line break
	#  wordReplaceAll($find, "[#pbr#]", "^m"); # page break
	#  wordReplaceAll($find, "[#cbr#]", "^n"); # column break
	#  wordReplaceAll($find, "[#tab#]", "^t");
	#  wordReplaceAll($find, "[#endash#]", "^=");
	#  wordReplaceAll($find, "[#emdash#]", "^+");
	#  wordReplaceAll($find, "[#nbr #]", "^s");
	#  wordReplaceAll($find, "[#nbr-#]", "^~");
	#  wordReplaceAll($find, "[#opt-#]", "^-");
}

sub xtagToClipboard {
	my ($self, $sel, $xtag) = @_;
	
	$xtag =~ /^\[\+(.+?)\+\]/;
	my $tag = $1;
	$xtag =~ /^\[\+$tag\+\](.*)\[\-$tag\-\]$/;
	my $text = $1;
#	print "$tag ...\n";
	$tag =~ /^([a-zA-Z]+)(?::(.*))?$/;
	my $type = $1;
	my $arg = $2;
	my $f = $type . "ToClipboard";
print "$type($arg): <", substr($text,0,30), ">\n";
	$self->startClip($sel);
    $self->$f($sel, $text, $arg);
	$self->stopClip($sel);
}
sub startClip {
	my ($self, $sel) = @_;
	$sel->HomeKey({ 'Unit' => wdStory() });
	$sel->TypeParagraph();
	$sel->MoveLeft({ 'Unit' => wdCharacter(), 'Count' => 1 });
}
sub stopClip {
	my ($self, $sel) = @_;
	$sel->HomeKey({ 'Unit' => wdStory(), 'Extend' => wdExtend() });
	$sel->Cut();
	$sel->delete({ 'Unit' => wdCharacter(), 'Count' => 1 });
}



#
#
# text formating methods
#
#


# text styles

sub iToClipboard {
	my ($self, $sel, $text) = @_;
	$sel->Font()->{'Italic'} = wdToggle();
	$sel->TypeText({ 'Text' => $text });
	$sel->Font()->{'Italic'} = wdToggle();
}
sub bToClipboard {
	my ($self, $sel, $text) = @_;
	$sel->Font()->{'Bold'} = wdToggle();
	$sel->TypeText({ 'Text' => $text });
	$sel->Font()->{'Bold'} = wdToggle();
}
sub uToClipboard {
	my ($self, $sel, $text, $arg) = @_;
	$sel->Font()->{'Underline'} = wdUnderlineSingle();
	$sel->TypeText({ 'Text' => $text });
	$sel->Font()->{'Underline'} = wdUnderlineNone();
}


# fonts

sub ttToClipboard {
	my ($self, $sel, $text) = @_;
	$sel->Font()->{'Underline'} = wdUnderlineSingle();
	$sel->TypeText({ 'Text' => $text });
	$sel->Font()->{'Underline'} = wdUnderlineNone();
}

# fields

sub fieldToClipboard {
	my ($self, $sel, $text, $arg) = @_;
#	$sel->TypeText({ 'Text' => $xchar });
    $sel->Fields()->Add({ 'Range' => $sel->Range(), 'Type' => wdFieldEmpty(),
    	Text => $arg,
    	'PreserveFormatting' => 1 });
	$sel->EndKey({ 'Unit' => wdLine() });
}


sub bkmkToClipboard {
	my ($self, $sel, $text, $arg) = @_;
	$sel->TypeText({ 'Text' => $text });
	$sel->HomeKey({ 'Unit' => wdLine(), 'Extend' => wdExtend() });
    my $bk = $sel->Application()->ActiveDocument()->Bookmarks();
    $bk->Add({ 'Range' => $sel->Range(), 'Name' => $arg });
#    $bk->DefaultSorting = wdSortByName
#    $bk->ShowHidden = False
#exit(42);
#
#### the past of the bookmark doesn't work ... well ...
#
	$sel->EndKey({ 'Unit' => wdLine() });
}

sub bkmkrefToClipboard {
	my ($self, $sel, $text, $arg) = @_;
	$sel->TypeText({ 'Text' => $text });
	$sel->HomeKey({ 'Unit' => wdLine(), 'Extend' => wdExtend() });
    $sel->Application()->ActiveDocument()->Hyperlinks()->Add({ 'Anchor' => $sel->Range(), 
    	'Address' => "",
        'SubAddress' => $arg });
	$sel->EndKey({ 'Unit' => wdLine() });
}

sub hrefToClipboard {
	my ($self, $sel, $text, $arg) = @_;
	$sel->TypeText({ 'Text' => $text });
	$sel->HomeKey({ 'Unit' => wdLine(), 'Extend' => wdExtend() });
    $sel->Application()->ActiveDocument()->Hyperlinks()->Add({ 'Anchor' => $sel->Range(), 
    	'Address' => $arg,
        'SubAddress' => '' });
	$sel->EndKey({ 'Unit' => wdLine() });
}


#
#
# interactive editing methods
#
#

sub openInEditor { my ($self) = @_;
  my $filename = $self->filename();
  if( ! defined($filename) ) {
    print STDERR "can't open document with no filename specified.\n";
	return;
  }
  openWordDocument($filename);
}

sub jumpToBookmark {
  my ($self, $bookmark) = @_;
# this feature require some interaction with an appropriate editor
# application for this kind of document
# open the document in an editor, and jump to the given bookmark
  my $filename = $self->filename();
  if( not defined($filename) ) {
    print STDERR "can't open document with no filename specified.\n";
	return;
  }
  openWordDocument($filename, $bookmark);
}

sub searchInEditor { my ($self, $text) = @_;
  $self->openInEditor();
  searchWordDocument({'Text' => $text});
}

sub saveAsRTF {
	my ($self, $name) = @_;
	if( ! defined $name ) {
		$name = $self->filename();
		$name .= '-tmp-pbib$$.rtf' unless $name =~ s/\.\w+$/-tmp-pbib$$.rtf/;
	}
	my $doc = $self->doc();
	return undef unless defined $doc;
	
	# first save the original format to avoid lost changes
	print STDERR "save ", $self->filename(), " (doc)\n" unless $self->{quiet};
	$doc->Save();
	
	print STDERR "save as $name (rtf)\n" unless $self->{quiet};
	my $result = $doc->SaveAs({
		'FileName' => $name,
		'FileFormat' => wdFormatRTF(),
		'AddToRecentFiles' => 0,
		'EmbedTrueTypeFonts' => 0,
		});
	#  print STDERR " --> <", $result ? $result : "<undef>", ">\n";
	return $name;
}

#
#
# word access methods
#
#

sub doc {
  my $self = shift;
  my $wd = $self->{'wd'};
  if( ! defined($wd) ) {
    my $filename = $self->filename();
    print "try to open $filename using OLE ...\n" if $self->{verbose};
    $wd = Win32::OLE->GetObject($filename);
    if( ! defined($wd) ) {
      print "can't open $filename, error: ", Win32::OLE->LastError(), "\n";
      return undef;
    }
    #printProps($wd);
    print "got word handle: ", type($wd), "\n" if $self->{verbose};
    $self->{'wd'} = $wd;
  }
  return $wd;
}

sub close {
	my $self = shift;
	my $wd = $self->{'wd'};
	if( $wd ) {
		print STDERR "close ", $self->filename(), "\n" if $self->{verbose};
		$wd->Close();
		$self->{wd} = undef;
	}
}


sub wordParagraphs {
#
# return all paragraphs of this document in word's internal coding
#
  my ($self) = @_;
  my @pars;

  my $wd = $self->doc();
  if( not defined($wd) ) {
    return ();
  }

  my $c = $wd->Content();
  # printProps($c);
  my $t = $c->Text();
  print STDERR length($t), " bytes of text\n";

  @pars = split(/\r/, $t);
  print STDERR scalar(@pars), " paragraphs.\n";
  return \@pars;
#
#### old version: much slower!
#
#  my $par = $wd->Paragraphs()->First();
#  #printProps($par);
#  #print "first par: <<", $par->Range()->Text(), ">>\n";
#  while( defined($par) ) {
#    print '.';
#    push @pars, $par->Range()->Text();
#    $par = $par->Next();
#  }
#  print " done: ", scalar(@pars), " paragraphs.\n";
#  return @pars;
}



sub parStyle ($$) {
  my $self = shift; my ($wdPar) = @_;
  return $wdPar->Style()->NameLocal();
}

sub parBookmarks ($$) {
  my $self = shift; my ($wdPar) = @_;
  return $wdPar->Range()->Bookmarks();
}


sub figureName ($$) {
# If this par's style is 'Figure' or 'Caption',
# look for the first bookmark in its Caption
# and return its name
  my $self = shift; my ($wdPar) = @_;
  my $style = $self->parStyle($wdPar);
  if( $style eq 'Figure' or $style eq 'figure' ) {
    $wdPar = $wdPar->Next();
    $style = $self->parStyle($wdPar);
  }
  if( $style ne 'Caption' ) { return undef; }
  my $bks = $self->parBookmarks($wdPar);
  if( $bks->Count() < 1 ) { return undef; }
  return $bks->Item(1)->Name();
}



#
#
# class methods
#
#


sub wordReplaceAll {
	my ($find, $text, $replacement) = @_;
	my $idx = 0;
	while( length($replacement) >= 250 ) {
		my $mark = "[#$idx#]"; $idx ++;
		my $temp = substr($replacement, 0, 240);
		$replacement = substr($replacement, 240);
		wordBasicReplaceAll($find, $text, $temp . $mark);
		$text = $mark;
	}
	wordBasicReplaceAll($find, $text, $replacement);
}

sub wordBasicReplaceAll {
	my ($find, $text, $replacement) = @_;
#	print "replace <$text> with <$replacement>, length = ", length($replacement), "\n";
    $find->ClearFormatting();
    $find->Replacement()->ClearFormatting();
    $find->{'Text'} = $text;
    $find->Replacement->{'Text'} = $replacement;
    $find->{'Forward'} = 1;
    $find->{'Wrap'} = PBib::Document::MSWord::wdFindContinue();
    $find->{'format'} = 0;
    $find->{'MatchCase'} = 1;
    $find->{'MatchWholeWord'} = 0;
    $find->{'MatchWildcards'} = 0;
    $find->{'MatchSoundsLike'} = 0;
    $find->{'MatchAllWordForms'} = 0;
    $find->Execute({ 'Replace' => PBib::Document::MSWord::wdReplaceAll() });
}



#
# word constants
#

# WdUnits
sub wdCharacter { 1 }
sub wdLine { 5 }
sub wdParagraph { 4 }
sub wdStory { 6 } #a story is a text flow, e.g. the main flow, or the headings, footnotes, etc.

# WdReplace
sub wdReplaceAll { 2 }
sub wdReplaceNone { 0 }
sub wdReplaceOne { 1 }

# WdMovementType
sub wdMove { 0 }
sub wdExtend { 1 }

# WdFindWrap
sub wdFindContinue { 1 }

# WdFieldType
sub wdFieldEmpty { -1 }

# WdConstants
sub wdToggle { 9999998 }

# WdUnderline
sub wdUnderlineNone { 0 }
sub wdUnderlineSingle { 1 }

# WdSaveFormat
sub wdFormatDocument { 0 }
sub wdFormatText { 2 }
sub wdFormatRTF { 6 }
sub wdFormatUnicodeText { 7 }


#
#
#

sub app {
	my ($class) = @_;
	$class = 'Word.Application' unless $class;
	my $app;
	eval('$app = Win32::OLE->GetActiveObject($class)');
	goterror("No '$class' installed", 1) if $@;
	unless( $app ) {
		#  $app = Win32::OLE->new($class, sub {$_[0]->Quit();})
		$app = Win32::OLE->new($class)
			or goterror("can't get OLE handle for '$class'", 1);
	}
	return $app;
}

sub openWordDocument {
	my ($filename, $bookmark) = @_;
	print "open file in Word: $filename", ($bookmark ? "#$bookmark":""), "\n";
	my $app = app();
	my $adoc = $app->Documents()->Open({FileName => $filename});
	print "Open --> $adoc, ", type($adoc), "\n";
	unless( $adoc ) {
		print "open failed? no active document in word!\n";
		return undef;
	}
	
	# active word and the document
	$app->Activate();
	$adoc->Activate();
	
	# jump to a bookmark?
	if( $bookmark ) {
		my $result = $adoc->FollowHyperlink({
			'Address' => $filename,
			( $bookmark ? ('SubAddress' => $bookmark) : ()),
			'NewWindow'=> 1,
			'AddHistory'=>1});
	#    or goterror("open failed");
		print "FollowHyperlink --> ", ($result ? $result : 'undef');
	}
	return $adoc;
}

sub searchWordDocument {
  my ($findArgs) = @_;
  my $app = app();
  print STDERR "search for:\n";
  my $sel = $app->Selection();
  $sel->MoveRight({'Count' => 1});
  my $find = $sel->Find();
  $find->ClearFormatting();
  $find->Replacement->{'Text'} = '';
  $find->{'Forward'} = 1;
  $find->{'Wrap'} = 1;
  my ($k, $v);
  while (($k, $v) = (each %$findArgs)) {
    print STDERR "  $k = $v\n";
    $find->{$k} = $v;
  }
  $find->Execute();
}



sub goterror {
  my ($msg, $fatal) = @_;
  my $err = Win32::OLE::LastError();
  $msg = "$msg\n$err\n";
  die $msg if $fatal;
  print STDERR $msg;
}


#
#
# debugging class methods
#
#

sub type($) {
  my $Object = shift;
  return Win32::OLE->QueryObjectType($Object)
}

sub props($) {
  my $o = shift;
  return keys(%{$o})
}

sub printProps($) {
  my $o = shift;
  if( not defined($o) ) {
    print "printProps(undef) -- maybe there was an error?\n";
	my $err = Win32::OLE::LastError();
	print "(last error = $err)\n";
  }
  my $p;
  print $o, " [", type($o), "]: ";
  foreach $p (props($o)) {
    print "$p ";
  }
  print "\n";
}


1;

#
# $Log: MSWord.pm,v $
# Revision 1.12  2004/03/29 13:07:18  tandler
# added destrocture to close word handle
#
# Revision 1.11  2003/06/12 22:10:44  tandler
# new sub prepareConvert() that opens outDoc() in editor
# improved saveAsRTF()
# new close()
# improved app()
# much improved openWordDocument()
#
# Revision 1.10  2002/10/12 15:54:33  peter
# fixed
#
# Revision 1.9  2002/10/11 10:15:11  peter
# refactored: uses new superclass Document::PBib
#
# Revision 1.8  2002/09/23 11:07:04  peter
# save as RTF
#
# Revision 1.7  2002/08/22 10:41:53  peter
# - direct search/replace support for word ...
#
# Revision 1.5  2002/06/29 18:30:00  Diss
# result handling of jump-to-hyperlink changed
#
# Revision 1.4  2002/06/24 10:42:37  Diss
# minor changes
#
# Revision 1.3  2002/06/06 10:24:00  Diss
# searchInEditor support - jump to CiteKeys in editor
# (litUI uses PBib::Doc classes)
#
# Revision 1.2  2002/06/06 09:02:34  Diss
# merged with features of ReadDoc.pm (which should be obsolete by now)
#
# Revision 1.1  2002/05/27 10:25:29  Diss
# started editing support
#
# Revision 1.2  2002/03/27 10:00:51  Diss
# new module structure, not yet included in LitRefs/LitUI (R2)
#
# Revision 1.1  2002/03/18 11:15:50  Diss
# major additions: replace [] refs, generate bibliography using [{}], ...
#