lib/MIME/AltWords0.pm - metacpan.org

package MIME::AltWords;
# Dat: `package MIME::AltWords0' would be incorrect. We don't need the zero.
# copy of package MIME::Words (version 5.420);
# at Wed Sep 27 07:52:29 CEST 2006

=head1 NAME

MIME::AltWords0 - copy of MIME::Words (not for direct use)


=head1 SYNOPSIS

L<MIME::AltWords0> is an auxilary package used by L<MIME::AltWords>.

If you want to encode or decode MIME words (such as
C<=?ISO-8859-2?Q?_=E1ll_e=E1r?=>) found in e-mail message headers (mostly
Subject, From and To), the recommended Perl module to use is
L<MIME::AltWords>. Please don't use L<MIME::AltWords0> (which is
useless by itself anyway) or L<MIME::Words> (version 5.420 has several
serious bugs both with encoding and decoding).

Most users shouldn't read on, but they should read L<MIME::AltWords>
instead.

=head1 DESCRIPTION

The following functions have been moved to L<MIME::AltWords>:
decode_mimewords() (some code stays here as decode_mimewords_wantarray()),

    use MIME::AltWords0 qw(:all);   
     
    ### Split string into array of decoded [DATA,CHARSET] pairs:
    @decoded = decode_mimewords_wantarray(
          'To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= <keld@dkuug.dk>',
          );
     
Fellow Americans, you probably won't know what the hell this module
is for.  Europeans, Russians, et al, you probably do.  C<:-)>. 

For example, here's a valid MIME header you might get:

      From: =?US-ASCII?Q?Keith_Moore?= <moore@cs.utk.edu>
      To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= <keld@dkuug.dk>
      CC: =?ISO-8859-1?Q?Andr=E9_?= Pirard <PIRARD@vm1.ulg.ac.be>
      Subject: =?ISO-8859-1?B?SWYgeW91IGNhbiByZWFkIHRoaXMgeW8=?=
       =?ISO-8859-2?B?dSB1bmRlcnN0YW5kIHRoZSBleGFtcGxlLg==?=
       =?US-ASCII?Q?.._cool!?=

The fields basically decode to (sorry, I can only approximate the
Latin characters with 7 bit sequences /o and 'e):

      From: Keith Moore <moore@cs.utk.edu>
      To: Keld J/orn Simonsen <keld@dkuug.dk>
      CC: Andr'e  Pirard <PIRARD@vm1.ulg.ac.be>
      Subject: If you can read this you understand the example... cool!


=head1 PUBLIC INTERFACE

=over 4

=cut

require 5.001;

### Pragmas:
use strict;
use vars qw($VERSION @EXPORT_OK %EXPORT_TAGS @ISA);

### Exporting:
use Exporter;
%EXPORT_TAGS = (all => [qw(decode_mimewords encode_mimewords encode_mimeword
			   )]);
Exporter::export_ok_tags('all');

### Inheritance:
@ISA = qw(Exporter);

### Other modules:
use MIME::Base64;
use MIME::QuotedPrint;



#------------------------------
#
# Globals...
#
#------------------------------

### The package version, both in 1.23 style *and* usable by MakeMaker:
$VERSION = "5.420";

use vars qw($NONPRINT); #### pts ####

### Nonprintables (controls + x7F + 8bit):
$NONPRINT = "\\x00-\\x1F\\x7F-\\xFF"; 


#------------------------------

# _decode_Q STRING
#     Private: used by _decode_header() to decode "Q" encoding, which is
#     almost, but not exactly, quoted-printable.  :-P
sub _decode_Q {
    my $str = shift;
    $str =~ s/_/\x20/g;                                # RFC-1522, Q rule 2
    $str =~ s/=([\da-fA-F]{2})/pack("C", hex($1))/ge;  # RFC-1522, Q rule 1
    $str;
}

# _encode_Q STRING
#     Private: used by _encode_header() to decode "Q" encoding, which is
#     almost, but not exactly, quoted-printable.  :-P
sub _encode_Q {
    my $str = shift;
    $str =~ s{([_\?\=$NONPRINT])}{sprintf("=%02X", ord($1))}eog;
    $str;
}

# _decode_B STRING
#     Private: used by _decode_header() to decode "B" encoding.
sub _decode_B {
    my $str = shift;
    decode_base64($str);
}

# _encode_B STRING
#     Private: used by _decode_header() to decode "B" encoding.
sub _encode_B {
    my $str = shift;
    encode_base64($str, '');
}



#------------------------------

=item decode_mimewords_wantarray ENCODED, [OPTS...]

I<Function.>
Go through the string looking for RFC-1522-style "Q"
(quoted-printable, sort of) or "B" (base64) encoding, and decode them.

B<In an array context,> splits the ENCODED string into a list of decoded 
C<[DATA, CHARSET]> pairs, and returns that list.  Unencoded 
data are returned in a 1-element array C<[DATA]>, giving an effective 
CHARSET of C<undef>.

    $enc = '=?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= <keld@dkuug.dk>';
    foreach (decode_mimewords($enc)) {
        print "", ($_[1] || 'US-ASCII'), ": ", $_[0], "\n";
    }

B<In a scalar context,> the result is undocumented, please see
L<MIME::AltWords/decode_mimewords> instead.

In the event of a syntax error, $@ will be set to a description 
of the error, but parsing will continue as best as possible (so as to
get I<something> back when decoding headers).
$@ will be false if no error was detected.

Any arguments past the ENCODED string are taken to define a hash of options:

=over 4

=item Field

Name of the mail field this string came from.  I<Currently ignored.>

=back

=cut

sub decode_mimewords_wantarray {
    my $encstr = shift;
    my %params = @_;
    my @tokens;
    $@ = '';           ### error-return

    ### Collapse boundaries between adjacent encoded words:
    $encstr =~ s{(\?\=)\s*(\=\?)}{$1$2}gs;
    pos($encstr) = 0;
    ### print STDOUT "ENC = [", $encstr, "]\n";

    ### Decode:
    my ($charset, $encoding, $enc, $dec);
    while (1) {
	last if (pos($encstr) >= length($encstr));
	my $pos = pos($encstr);               ### save it

	### Case 1: are we looking at "=?..?..?="?
	if ($encstr =~    m{\G             # from where we left off..
			    =\?([^?]*)     # "=?" + charset +
			     \?([bq])      #  "?" + encoding +
			     \?([^?]+)     #  "?" + data maybe with spcs +
			     \?=           #  "?="
			    }xgi) {
	    ($charset, $encoding, $enc) = ($1, lc($2), $3);
	    $dec = (($encoding eq 'q') ? _decode_Q($enc) : _decode_B($enc));
	    push @tokens, [$dec, $charset];
	    next;
	}

	### Case 2: are we looking at a bad "=?..." prefix? 
	### We need this to detect problems for case 3, which stops at "=?":
	pos($encstr) = $pos;               # reset the pointer.
	if ($encstr =~ m{\G=\?}xg) {
	    $@ .= qq|unterminated "=?..?..?=" in "$encstr" (pos $pos)\n|;
	    push @tokens, ['=?'];
	    next;
	}

	### Case 3: are we looking at ordinary text?
	pos($encstr) = $pos;               # reset the pointer.
	if ($encstr =~ m{\G                # from where we left off...
			 ([\x00-\xFF]*?    #   shortest possible string,
			  \n*)             #   followed by 0 or more NLs,
		         (?=(\Z|=\?))      # terminated by "=?" or EOS
			}xg) {
	    length($1) or die "MIME::AltWords0: internal logic err: empty token\n";
	    push @tokens, [$1];
	    next;
	}
	
	if ($encstr=~m{\G([\x00-\xFF]*)[^\x00-\xFF]+}g) { #### pts ####
	    $@.=qq|wide character in encoded string\n|;
	    push @tokens, [$1] if 0!=length($1);
	    next;
	}

	### Case 4: bug!
	die "MIME::AltWords0: unexpected case:\n($encstr) pos $pos\n\t".
	    "Please alert developer.\n";
    }
    return (wantarray ? @tokens : join('',map {$_->[0]} @tokens));
}

#------------------------------

# vvv buggy implementation of encode_mimeword() commented out,
#     see MIME::AltWords

=begin comment

=item encode_mimeword RAW, [ENCODING], [CHARSET]

I<Function.>
Encode a single RAW "word" that has unsafe characters.
The "word" will be encoded in its entirety.

    ### Encode "<<Franc,ois>>":
    $encoded = encode_mimeword("\xABFran\xE7ois\xBB");

You may specify the ENCODING (C<"Q"> or C<"B">), which defaults to C<"Q">.
You may specify the CHARSET, which defaults to C<iso-8859-1>.

=cut

=begin commentcode

sub encode_mimeword {
    my $word = shift;
    my $encoding = uc(shift || 'Q');
    my $charset  = uc(shift || 'ISO-8859-1');
    my $encfunc  = (($encoding eq 'Q') ? \&_encode_Q : \&_encode_B);
    "=?$charset?$encoding?" . &$encfunc($word) . "?=";
}

=cut

#------------------------------

# vvv buggy implementation of encode_mimeword() commented out,
#     see MIME::AltWords

=begin comment

=item encode_mimewords RAW, [OPTS]

I<Function.>
Given a RAW string, try to find and encode all "unsafe" sequences 
of characters:

    ### Encode a string with some unsafe "words":
    $encoded = encode_mimewords("Me and \xABFran\xE7ois\xBB");

Returns the encoded string.
Any arguments past the RAW string are taken to define a hash of options:

=over 4

=item Charset

Encode all unsafe stuff with this charset.  Default is 'ISO-8859-1',
a.k.a. "Latin-1".

=item Encoding

The encoding to use, C<"q"> or C<"b">.  The default is C<"q">.

=item Field

Name of the mail field this string will be used in.  I<Currently ignored.>

=back

B<Warning:> this is a quick-and-dirty solution, intended for character
sets which overlap ASCII.  B<It does not comply with the RFC-1522
rules regarding the use of encoded words in message headers>.
You may want to roll your own variant,
using C<encoded_mimeword()>, for your application.
I<Thanks to Jan Kasprzak for reminding me about this problem.>

=cut

=begin commentcode

sub encode_mimewords {
    my ($rawstr, %params) = @_;
    my $charset  = $params{Charset} || 'ISO-8859-1';
    my $encoding = lc($params{Encoding} || 'q');

    ### Encode any "words" with unsafe characters.
    ###    We limit such words to 18 characters, to guarantee that the 
    ###    worst-case encoding give us no more than 54 + ~10 < 75 characters
    my $word;
    $rawstr =~ s{([a-zA-Z0-9\x7F-\xFF]{1,18})}{     ### get next "word"
	$word = $1;
	(($word !~ /[$NONPRINT]/o) 
	 ? $word                                          ### no unsafe chars
	 : encode_mimeword($word, $encoding, $charset));  ### has unsafe chars
    }xeg;
    $rawstr;
}

1;
__END__

=cut

=back

=head1 NOTES

Exports its principle functions by default, in keeping with 
MIME::Base64 and MIME::QuotedPrint.


=head1 AUTHOR

L<MIME::AltWords0> was copied from L<MIME::Words> (version 5.420) by
Péter Szabó (F<pts@fazekas.hu>) at
Wed Sep 27 07:57:51 CEST 2006.

Here is the original information for L<MIME::Words>.

Eryq (F<eryq@zeegee.com>), ZeeGee Software Inc (F<http://www.zeegee.com>).
David F. Skoll (dfs@roaringpenguin.com) http://www.roaringpenguin.com

All rights reserved.  This program is free software; you can redistribute 
it and/or modify it under the same terms as Perl itself.

Thanks also to...

      Kent Boortz        For providing the idea, and the baseline 
                         RFC-1522-decoding code!
      KJJ at PrimeNet    For requesting that this be split into
                         its own module.
      Stephane Barizien  For reporting a nasty bug.


=head1 VERSION

$Revision: 1.14 $ $Date: 2006/03/17 21:03:23 $

=cut


#------------------------------
# Execute simple test if run as a script.
#------------------------------
{ 
  package main; no strict;
  eval join('',<main::DATA>) || die "$@ $main::DATA" unless caller();
}
1;           # end the module
__END__

# Dat: these tests have been integrated to MIME::AltWords

### Pick up other MIME stuff, just in case...
BEGIN { unshift @INC, ".", "./etc", "./lib" };
import MIME::AltWords0;

my @encs = (
	    '=?US-ASCII?Q?Keith_Moore?= <moore@cs.utk.edu>',
	    '=?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= <keld@dkuug.dk>',
	    '=?ISO-8859-1?Q?Andr=E9_?= Pirard <PIRARD@vm1.ulg.ac.be>',
	    ('=?ISO-8859-1?B?SWYgeW91IGNhbiByZWFkIHRoaXMgeW8=?='.
	     '=?ISO-8859-2?B?dSB1bmRlcnN0YW5kIHRoZSBleGFtcGxlLg==?='.
	     '=?US-ASCII?Q?.._cool!?='));
foreach $enc (@encs) {
    $x = decode_mimewords($enc);
    print "DEC: ", $x, "\n";
}

### Encode a single unsafe word:
$encoded = encode_mimeword("\xABFran\xE7ois\xBB");
print "ENC1: ", $encoded, "\n";
    
### Encode a string, trying to find the unsafe words inside it: 
$encoded = encode_mimewords("Me and \xABFran\xE7ois\xBB at the beach");
print "ENC2: ", $encoded, "\n";

### Encode "<<Franc,ois>>":
my $unsafe = <<EOF;
Me and \xABFran\xE7ois\xBB, down at the beach
with Dave <dave\@ether.net>
EOF
$encoded = encode_mimewords($unsafe);
print "ENC3: ", $encoded, "\n";
print "DEC3: ", scalar(decode_mimewords($encoded)), "\n";

### So we know everything went well...
exit 0;

#------------------------------
	Global
`s`	Focus search bar
`?`	Bring up this help dialog
	GitHub
`g` `p`	Go to pull requests
`g` `i`	go to github issues (only if github is preferred repository)
	POD
`g` `a`	Go to author
`g` `c`	Go to changes
`g` `i`	Go to issues
`g` `d`	Go to dist
`g` `r`	Go to repository/SCM
`g` `s`	Go to source
`g` `b`	Go to file browse
	Search terms
module: (e.g. module:Plugin)
distribution: (e.g. distribution:Dancer auth)
author: (e.g. author:SONGMU Redis)
version: (e.g. version:1.00)