# --*-Perl-*--
# $Id: MSWord.pm 19 2004-12-12 12:15:43Z tandler $
#
=head1 NAME
PBib::Document::MSWord - Handle Word Documents
=head1 SYNOPSIS
use PBib::Document::MSWord;
=head1 DESCRIPTION
=head2 EXPORT
=cut
package PBib::Document::MSWord;
use 5.006;
use strict;
use warnings;
# use English;
# for debug:
use Data::Dumper;
BEGIN {
use vars qw($Revision $VERSION);
my $major = 1; q$Revision: 19 $ =~ /: (\d+)/; my ($minor) = ($1); $VERSION = "$major." . ($minor<10 ? '0' : '') . $minor;
}
# superclass
use base qw(PBib::Document::PBib);
# our @ISA = qw(PBib::Document::PBib);
# used modules
use Win32::OLE;
# used own modules
use PBib::Document::RTF;
# module variables
#use vars qw(mmmm);
END {
my $Count = Win32::OLE->EnumAllObjects(sub {
my $Object = shift;
my $Class = Win32::OLE->QueryObjectType($Object);
printf STDERR "# Object=%s Class=%s\n", $Object, $Class;
});
print STDERR "Document::MSWord: $Count OLE objects left ...\n";
}
sub DESTROY ($) {
my $self = shift;
print STDERR "Document::MSWord: Destroy document ", $self->filename(), "\n" if $self->{verbose};
# $self->close();
}
#
#
# text access methods
#
#
sub paragraphs {
my $self = shift;
return $self->{'paragraphs'} if defined($self->{'paragraphs'});
my $wordPars = $self->wordParagraphs();
my @pars = map { convertWordText($_) } @$wordPars;
$self->{'paragraphs'} = \@pars;
return \@pars;
}
#
#
# converting
#
#
# do anything you want to before being converted
# the given object is used for conversion.
sub prepareConvert {
my ($self, $conv) = @_;
# we're doing the conversion based on RTF,
# so first, convert the DOC to RTF
my $file = $self->saveAsRTF();
return undef unless $file;
return $self if $file eq $self->filename();
my $outDoc = $conv->outDoc();
if( ref($outDoc) ne 'PBib::Document::RTF' ) {
# the output for conversion has to be the same format
# --> create new (temp) outfile
my $name = $outDoc->filename();
# $name .= '.rtf' unless $name =~ s/\.\w+$/.rtf/;
$conv->{'outDoc'} = new PBib::Document::RTF(
'filename' => $name,
'mode' => 'w',
'finalizeConvert' => $outDoc, ## OBSOLETE??
);
# print Dumper $conv->{'outDoc'};
}
my $inDoc = new PBib::Document::RTF(
'filename' => $file,
'mode' => 'r',
);
# print Dumper $inDoc;
$inDoc->close(); # close word document
# print Dumper $conv->{'foundInfo'};
$conv->{'foundInfo'} = undef;
return $inDoc;
}
# do anything you want to after being converted
# the given object is used for further processing.
sub finalizeConvert {
my ($self, $conv) = @_;
# print Dumper $self;
return $self;
}
#
#
# converting to internal format
#
#
sub convertWordText ($) {
# Here some characters could be converted like:
my $text = shift;
# remove 0-bytes
$text =~ s/\x00//g;
# replace CR-LF
# Paragraph end is "\x0d", line-break is "\x0b"
$text =~ s/\x0a//g; # strip LF
$text =~ s/\x0d/\n\n/g; # convert CR to \n
$text =~ s/\x0b/\n/g;
# replace special chars
# german quotes: double open "\x1e\x20", close "\x1c\x20",
# single open "\x1a\x20", close "\x18\x20"
# english quotes: double open "\x1c\x20", close "\x1d\x20",
# single open "\x18\x20", close "\x19\x20"
$text =~ s/\x18\x20/'/g;
$text =~ s/\x19\x20/'/g;
$text =~ s/\x1a\x20/'/g;
$text =~ s/\x1c\x20/"/g;
$text =~ s/\x1d\x20/"/g;
$text =~ s/\x1e\x20/"/g;
### missing "\x1b\x20" ??
# hyphens:
# normal hyphen "\x2d" '-'
# nonbreaking "\x1e(\x00)?" -- bei 2-byte-text folgt ein 0-byte
# optional hyphen "\x1f"
# en-dash "\x13\x20" oder "\x96"
# em-dash "\x14\x20" oder "\x97"
### missing "\x95" ??
$text =~ s/\x96/--/g; #$text =~ s/\x13\x20/--/g;
$text =~ s/\x97/---/g; #$text =~ s/\x14\x20/---/g;
$text =~ s/\x1f/{{-}}/g;
# ellipsis "\x85"
$text =~ s/\x85/.../g;
$text =~ tr/\x1e\x84\x91\x92\x93\x94/-"`'""/;
# non-breaking space: "\xa0"
# en-space: "\x02\x20"
# em-space: "\x03\x20"
$text =~ s/\xa0/{{ }}/g;
$text =~ s/\x02\x20/ /g;
$text =~ s/\x03\x20/ /g;
# replace word fields
$text =~ s/\x13\s*REF\s(\S+)\s[^\x14\x15]*\x14Error! Reference source not found\.\x15/[$1]/g;
$text =~ s/\x13\s*REF\s(\S+)\s[^\x14\x15]*\x14([^\x15]*)\x15/ $2 . quoteRef($1) /eg;
$text =~ s/\x13([^\x14\x15]*)\x14([^\x15]*)\x15/$2\{\{$1\}\}/g;
#$text =~ tr/\x13\x14\x15/{|}/;
# pictures?
$text =~ s/\x01/{{picture}}/g;
# escape all other control chars
$text =~ s/([\x01-\x09\x0b\x0c\x0e-\x1f\x80-\x9f])/'{{'.ord($1).'}}'/eg;
# $text =~ s/[\x08\x09]/\t/g;
# $text =~ s/(\x07\x07)/$1\x0d/g;
# $text =~ s/\x07/ /g;
# $text =~ s/[\xa0]/ /g;
# $text =~ s/[\x0b\x0c\x0e]/\x0d/g;
# $text =~ tr/\x1e\x84\x91\x92\x93\x94/-"`'""/;
# Away with Words control characters
# $text =~ s/[\x00-\x06\x0f-\x1f\x80-\x9f]//g;
return $text;
}
sub quoteRef ($) {
my $ref = shift;
return ( $ref =~ /^(Sec)|(Req)|(Fig)/ ) ? "{{$ref}}" : "[$ref]"
}
sub quoteFieldId { my ($self, $id) = @_;
#
# return a valid field ID
#
# strip all non-bookmark chars, and add a prefix "r"
#
$id =~ s/[^A-Z0-9]//gi;
return $id;
}
#
#
#
#
#
sub replaceAll {
my ($self, $find, $text, $repl) = @_;
wordReplaceAll($find, $text, $repl);
# now check field for [# ... #] patterns
# ... it's important that the text between [+...+] and [-...-] is NOT matched greedy! --> .*?
while( $repl =~ s/(\[\+.+?\+\].*?\[\-.+?\-\])// ) {
$self->xtags()->{$1} = 1;
print "<<<$1>>>\n";
}
}
our %xchars = (
'p' => '^p',
'br' => '^|',
'pbr' => '^m',
'cbr' => '^n',
'tab' => '^t',
'em-' => '^+',
'en-' => '^=',
'nbr ' => '^s',
'nbr-' => '^~',
'opt-' => '^-',
);
# 'optbr' - word has no opt. line break
# 'em ' - ???
# 'en ' - ???
sub xchar {
my ($self, $xchar) = @_;
return $xchars{$xchar} || '';
}
sub finishReplace ($$) {
my ($self, $sel) = @_;
my $find = $sel->Find();
foreach my $xtag (keys %{$self->xtags()}) {
# print "$xtag\n";
$self->xtagToClipboard($sel, $xtag);
wordReplaceAll($find, $xtag, "^c");
}
print "xchars [#...#]\n";
foreach my $xchar (keys %xchars) {
wordReplaceAll($find, "[#$xchar#]", $xchars{$xchar});
}
# wordReplaceAll($find, "[#p#]", "^p"); # new paragraph
# wordReplaceAll($find, "[#br#]", "^|"); # line break
# wordReplaceAll($find, "[#pbr#]", "^m"); # page break
# wordReplaceAll($find, "[#cbr#]", "^n"); # column break
# wordReplaceAll($find, "[#tab#]", "^t");
# wordReplaceAll($find, "[#endash#]", "^=");
# wordReplaceAll($find, "[#emdash#]", "^+");
# wordReplaceAll($find, "[#nbr #]", "^s");
# wordReplaceAll($find, "[#nbr-#]", "^~");
# wordReplaceAll($find, "[#opt-#]", "^-");
}
sub xtagToClipboard {
my ($self, $sel, $xtag) = @_;
$xtag =~ /^\[\+(.+?)\+\]/;
my $tag = $1;
$xtag =~ /^\[\+$tag\+\](.*)\[\-$tag\-\]$/;
my $text = $1;
# print "$tag ...\n";
$tag =~ /^([a-zA-Z]+)(?::(.*))?$/;
my $type = $1;
my $arg = $2;
my $f = $type . "ToClipboard";
print "$type($arg): <", substr($text,0,30), ">\n";
$self->startClip($sel);
$self->$f($sel, $text, $arg);
$self->stopClip($sel);
}
sub startClip {
my ($self, $sel) = @_;
$sel->HomeKey({ 'Unit' => wdStory() });
$sel->TypeParagraph();
$sel->MoveLeft({ 'Unit' => wdCharacter(), 'Count' => 1 });
}
sub stopClip {
my ($self, $sel) = @_;
$sel->HomeKey({ 'Unit' => wdStory(), 'Extend' => wdExtend() });
$sel->Cut();
$sel->delete({ 'Unit' => wdCharacter(), 'Count' => 1 });
}
#
#
# text formating methods
#
#
# text styles
sub iToClipboard {
my ($self, $sel, $text) = @_;
$sel->Font()->{'Italic'} = wdToggle();
$sel->TypeText({ 'Text' => $text });
$sel->Font()->{'Italic'} = wdToggle();
}
sub bToClipboard {
my ($self, $sel, $text) = @_;
$sel->Font()->{'Bold'} = wdToggle();
$sel->TypeText({ 'Text' => $text });
$sel->Font()->{'Bold'} = wdToggle();
}
sub uToClipboard {
my ($self, $sel, $text, $arg) = @_;
$sel->Font()->{'Underline'} = wdUnderlineSingle();
$sel->TypeText({ 'Text' => $text });
$sel->Font()->{'Underline'} = wdUnderlineNone();
}
# fonts
sub ttToClipboard {
my ($self, $sel, $text) = @_;
$sel->Font()->{'Underline'} = wdUnderlineSingle();
$sel->TypeText({ 'Text' => $text });
$sel->Font()->{'Underline'} = wdUnderlineNone();
}
# fields
sub fieldToClipboard {
my ($self, $sel, $text, $arg) = @_;
# $sel->TypeText({ 'Text' => $xchar });
$sel->Fields()->Add({ 'Range' => $sel->Range(), 'Type' => wdFieldEmpty(),
Text => $arg,
'PreserveFormatting' => 1 });
$sel->EndKey({ 'Unit' => wdLine() });
}
sub bkmkToClipboard {
my ($self, $sel, $text, $arg) = @_;
$sel->TypeText({ 'Text' => $text });
$sel->HomeKey({ 'Unit' => wdLine(), 'Extend' => wdExtend() });
my $bk = $sel->Application()->ActiveDocument()->Bookmarks();
$bk->Add({ 'Range' => $sel->Range(), 'Name' => $arg });
# $bk->DefaultSorting = wdSortByName
# $bk->ShowHidden = False
#exit(42);
#
#### the past of the bookmark doesn't work ... well ...
#
$sel->EndKey({ 'Unit' => wdLine() });
}
sub bkmkrefToClipboard {
my ($self, $sel, $text, $arg) = @_;
$sel->TypeText({ 'Text' => $text });
$sel->HomeKey({ 'Unit' => wdLine(), 'Extend' => wdExtend() });
$sel->Application()->ActiveDocument()->Hyperlinks()->Add({ 'Anchor' => $sel->Range(),
'Address' => "",
'SubAddress' => $arg });
$sel->EndKey({ 'Unit' => wdLine() });
}
sub hrefToClipboard {
my ($self, $sel, $text, $arg) = @_;
$sel->TypeText({ 'Text' => $text });
$sel->HomeKey({ 'Unit' => wdLine(), 'Extend' => wdExtend() });
$sel->Application()->ActiveDocument()->Hyperlinks()->Add({ 'Anchor' => $sel->Range(),
'Address' => $arg,
'SubAddress' => '' });
$sel->EndKey({ 'Unit' => wdLine() });
}
#
#
# interactive editing methods
#
#
sub openInEditor { my ($self) = @_;
my $filename = $self->filename();
if( ! defined($filename) ) {
print STDERR "can't open document with no filename specified.\n";
return;
}
openWordDocument($filename);
}
sub jumpToBookmark {
my ($self, $bookmark) = @_;
# this feature require some interaction with an appropriate editor
# application for this kind of document
# open the document in an editor, and jump to the given bookmark
my $filename = $self->filename();
if( not defined($filename) ) {
print STDERR "can't open document with no filename specified.\n";
return;
}
openWordDocument($filename, $bookmark);
}
sub searchInEditor { my ($self, $text) = @_;
$self->openInEditor();
searchWordDocument({'Text' => $text});
}
sub saveAsRTF {
my ($self, $name) = @_;
if( ! defined $name ) {
$name = $self->filename();
$name .= '-tmp-pbib$$.rtf' unless $name =~ s/\.\w+$/-tmp-pbib$$.rtf/;
}
my $doc = $self->doc();
return undef unless defined $doc;
# first save the original format to avoid lost changes
print STDERR "save ", $self->filename(), " (doc)\n" unless $self->{quiet};
$doc->Save();
print STDERR "save as $name (rtf)\n" unless $self->{quiet};
my $result = $doc->SaveAs({
'FileName' => $name,
'FileFormat' => wdFormatRTF(),
'AddToRecentFiles' => 0,
'EmbedTrueTypeFonts' => 0,
});
# print STDERR " --> <", $result ? $result : "<undef>", ">\n";
return $name;
}
#
#
# word access methods
#
#
sub doc {
my $self = shift;
my $wd = $self->{'wd'};
if( ! defined($wd) ) {
my $filename = $self->filename();
print "try to open $filename using OLE ...\n" if $self->{verbose};
$wd = Win32::OLE->GetObject($filename);
if( ! defined($wd) ) {
print "can't open $filename, error: ", Win32::OLE->LastError(), "\n";
return undef;
}
#printProps($wd);
print "got word handle: ", type($wd), "\n" if $self->{verbose};
$self->{'wd'} = $wd;
}
return $wd;
}
sub close {
my $self = shift;
my $wd = $self->{'wd'};
if( $wd ) {
print STDERR "close ", $self->filename(), "\n" if $self->{verbose};
$wd->Close();
$self->{wd} = undef;
}
}
sub wordParagraphs {
#
# return all paragraphs of this document in word's internal coding
#
my ($self) = @_;
my @pars;
my $wd = $self->doc();
if( not defined($wd) ) {
return ();
}
my $c = $wd->Content();
# printProps($c);
my $t = $c->Text();
print STDERR length($t), " bytes of text\n";
@pars = split(/\r/, $t);
print STDERR scalar(@pars), " paragraphs.\n";
return \@pars;
#
#### old version: much slower!
#
# my $par = $wd->Paragraphs()->First();
# #printProps($par);
# #print "first par: <<", $par->Range()->Text(), ">>\n";
# while( defined($par) ) {
# print '.';
# push @pars, $par->Range()->Text();
# $par = $par->Next();
# }
# print " done: ", scalar(@pars), " paragraphs.\n";
# return @pars;
}
sub parStyle ($$) {
my $self = shift; my ($wdPar) = @_;
return $wdPar->Style()->NameLocal();
}
sub parBookmarks ($$) {
my $self = shift; my ($wdPar) = @_;
return $wdPar->Range()->Bookmarks();
}
sub figureName ($$) {
# If this par's style is 'Figure' or 'Caption',
# look for the first bookmark in its Caption
# and return its name
my $self = shift; my ($wdPar) = @_;
my $style = $self->parStyle($wdPar);
if( $style eq 'Figure' or $style eq 'figure' ) {
$wdPar = $wdPar->Next();
$style = $self->parStyle($wdPar);
}
if( $style ne 'Caption' ) { return undef; }
my $bks = $self->parBookmarks($wdPar);
if( $bks->Count() < 1 ) { return undef; }
return $bks->Item(1)->Name();
}
#
#
# class methods
#
#
sub wordReplaceAll {
my ($find, $text, $replacement) = @_;
my $idx = 0;
while( length($replacement) >= 250 ) {
my $mark = "[#$idx#]"; $idx ++;
my $temp = substr($replacement, 0, 240);
$replacement = substr($replacement, 240);
wordBasicReplaceAll($find, $text, $temp . $mark);
$text = $mark;
}
wordBasicReplaceAll($find, $text, $replacement);
}
sub wordBasicReplaceAll {
my ($find, $text, $replacement) = @_;
# print "replace <$text> with <$replacement>, length = ", length($replacement), "\n";
$find->ClearFormatting();
$find->Replacement()->ClearFormatting();
$find->{'Text'} = $text;
$find->Replacement->{'Text'} = $replacement;
$find->{'Forward'} = 1;
$find->{'Wrap'} = PBib::Document::MSWord::wdFindContinue();
$find->{'format'} = 0;
$find->{'MatchCase'} = 1;
$find->{'MatchWholeWord'} = 0;
$find->{'MatchWildcards'} = 0;
$find->{'MatchSoundsLike'} = 0;
$find->{'MatchAllWordForms'} = 0;
$find->Execute({ 'Replace' => PBib::Document::MSWord::wdReplaceAll() });
}
#
# word constants
#
# WdUnits
sub wdCharacter { 1 }
sub wdLine { 5 }
sub wdParagraph { 4 }
sub wdStory { 6 } #a story is a text flow, e.g. the main flow, or the headings, footnotes, etc.
# WdReplace
sub wdReplaceAll { 2 }
sub wdReplaceNone { 0 }
sub wdReplaceOne { 1 }
# WdMovementType
sub wdMove { 0 }
sub wdExtend { 1 }
# WdFindWrap
sub wdFindContinue { 1 }
# WdFieldType
sub wdFieldEmpty { -1 }
# WdConstants
sub wdToggle { 9999998 }
# WdUnderline
sub wdUnderlineNone { 0 }
sub wdUnderlineSingle { 1 }
# WdSaveFormat
sub wdFormatDocument { 0 }
sub wdFormatText { 2 }
sub wdFormatRTF { 6 }
sub wdFormatUnicodeText { 7 }
#
#
#
sub app {
my ($class) = @_;
$class = 'Word.Application' unless $class;
my $app;
eval('$app = Win32::OLE->GetActiveObject($class)');
goterror("No '$class' installed", 1) if $@;
unless( $app ) {
# $app = Win32::OLE->new($class, sub {$_[0]->Quit();})
$app = Win32::OLE->new($class)
or goterror("can't get OLE handle for '$class'", 1);
}
return $app;
}
sub openWordDocument {
my ($filename, $bookmark) = @_;
print "open file in Word: $filename", ($bookmark ? "#$bookmark":""), "\n";
my $app = app();
my $adoc = $app->Documents()->Open({FileName => $filename});
print "Open --> $adoc, ", type($adoc), "\n";
unless( $adoc ) {
print "open failed? no active document in word!\n";
return undef;
}
# active word and the document
$app->Activate();
$adoc->Activate();
# jump to a bookmark?
if( $bookmark ) {
my $result = $adoc->FollowHyperlink({
'Address' => $filename,
( $bookmark ? ('SubAddress' => $bookmark) : ()),
'NewWindow'=> 1,
'AddHistory'=>1});
# or goterror("open failed");
print "FollowHyperlink --> ", ($result ? $result : 'undef');
}
return $adoc;
}
sub searchWordDocument {
my ($findArgs) = @_;
my $app = app();
print STDERR "search for:\n";
my $sel = $app->Selection();
$sel->MoveRight({'Count' => 1});
my $find = $sel->Find();
$find->ClearFormatting();
$find->Replacement->{'Text'} = '';
$find->{'Forward'} = 1;
$find->{'Wrap'} = 1;
my ($k, $v);
while (($k, $v) = (each %$findArgs)) {
print STDERR " $k = $v\n";
$find->{$k} = $v;
}
$find->Execute();
}
sub goterror {
my ($msg, $fatal) = @_;
my $err = Win32::OLE::LastError();
$msg = "$msg\n$err\n";
die $msg if $fatal;
print STDERR $msg;
}
#
#
# debugging class methods
#
#
sub type($) {
my $Object = shift;
return Win32::OLE->QueryObjectType($Object)
}
sub props($) {
my $o = shift;
return keys(%{$o})
}
sub printProps($) {
my $o = shift;
if( not defined($o) ) {
print "printProps(undef) -- maybe there was an error?\n";
my $err = Win32::OLE::LastError();
print "(last error = $err)\n";
}
my $p;
print $o, " [", type($o), "]: ";
foreach $p (props($o)) {
print "$p ";
}
print "\n";
}
1;
#
# $Log: MSWord.pm,v $
# Revision 1.12 2004/03/29 13:07:18 tandler
# added destrocture to close word handle
#
# Revision 1.11 2003/06/12 22:10:44 tandler
# new sub prepareConvert() that opens outDoc() in editor
# improved saveAsRTF()
# new close()
# improved app()
# much improved openWordDocument()
#
# Revision 1.10 2002/10/12 15:54:33 peter
# fixed
#
# Revision 1.9 2002/10/11 10:15:11 peter
# refactored: uses new superclass Document::PBib
#
# Revision 1.8 2002/09/23 11:07:04 peter
# save as RTF
#
# Revision 1.7 2002/08/22 10:41:53 peter
# - direct search/replace support for word ...
#
# Revision 1.5 2002/06/29 18:30:00 Diss
# result handling of jump-to-hyperlink changed
#
# Revision 1.4 2002/06/24 10:42:37 Diss
# minor changes
#
# Revision 1.3 2002/06/06 10:24:00 Diss
# searchInEditor support - jump to CiteKeys in editor
# (litUI uses PBib::Doc classes)
#
# Revision 1.2 2002/06/06 09:02:34 Diss
# merged with features of ReadDoc.pm (which should be obsolete by now)
#
# Revision 1.1 2002/05/27 10:25:29 Diss
# started editing support
#
# Revision 1.2 2002/03/27 10:00:51 Diss
# new module structure, not yet included in LitRefs/LitUI (R2)
#
# Revision 1.1 2002/03/18 11:15:50 Diss
# major additions: replace [] refs, generate bibliography using [{}], ...
#