The London Perl and Raku Workshop takes place on 26th Oct 2024. If your company depends on Perl, please consider sponsoring and/or attending.
package Convert::YText;

use strict;
use warnings;
use Carp;

use vars qw/$VERSION @ISA @EXPORT_OK/;
@ISA = 'Exporter';
@EXPORT_OK = qw( encode_ytext decode_ytext validate_ytext);

$VERSION="0.2";

=head1 NAME

Convert::YText - Quotes strings suitably for rfc2822 local part

=head1 VERSION

Version 0.2

=head1 SYNOPSIS

use Convert::YText qw(encode_ytext decode_ytext);

$encoded=encode_ytext($string);
$decoded=decode_ytext($encoded);

($decoded eq $string) || die "this should never happen!";

=head1 DESCRIPTION

Convert::YText converts strings to and from "YText", a format inspired
by xtext defined in RFC1894, the MIME base64 and quoted-printable
types (RFC 1394).  The main goal is encode a UTF8 string into something safe
for use as the local part in an internet email address  (RFC2822).

By default spaces are replaced with "+", "/" with "~", the characters
"A-Za-z0-9_.-" encode as themselves, and everything else is written
"=USTR=" where USTR is the base64 (using "A-Za-z0-9_." as digits)
encoding of the unicode character code.  The encoding is configurable
(see below).

=head1 PROCEDURAL INTERFACE

The module can can export C<encode_ytext> which converts arbitrary
unicode string into a "safe" form, and C<decode_ytext> which recovers
the original text.  C<validate_ytext> is a heuristic which returns 0
for bad input.


=cut


sub encode_ytext{
  my $str=shift;
  my $object = Convert::YText->new();
  return $object->encode($str);
}

sub decode_ytext{
  my $str=shift;
  my $object = Convert::YText->new();
  return $object->decode($str);
}

sub validate_ytext{
  my $str=shift;
  my $object = Convert::YText->new();
  return $object->valid($str);
}

=head1 OBJECT ORIENTED INTERFACE.

For more control, you will need to use the OO interface.

=head2 new

Create a new encoding object.

=head3 Arguments

Arguments are by name (i.e. a hash).

=over

=item DIGIT_STRING ("A-Za-z0-9_.") Must be 64 characters long

=item ESCAPE_CHAR ('=') Must not be in digit string.

=item SPACE_CHAR ('+') Non digit to replace space. Can be the empty string.

=item SLASH_CHAR ( '~') Non digit to replace slash. Can be the empty string.

=item EXTRA_CHARS ('._\-') Other characters to leave unencoded.

=back

=cut

sub new {
  my $class = shift;

  my %params=@_;

  my $self = { ESCAPE_CHAR=>'=',
	       SPACE_CHAR=>'+',
	       SLASH_CHAR=>'~',
	       EXTRA_CHARS=>'-',
	       DIGIT_STRING=>
	       "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_."
	     };

  while (my ($key,$val) = each %params){
    $self->{$key} = $val;
  };

  croak("DIGIT_STRING must have 64 characters got: ".$self->{DIGIT_STRING}) if (length($self->{DIGIT_STRING})!=64);

  # computed values. Setting directly is probably a bad idea.

  $self->{DIGITS}=[split "",$self->{DIGIT_STRING}];
  $self->{NO_ESCAPE}= $self->{DIGIT_STRING}.$self->{EXTRA_CHARS}.( length($self->{SPACE_CHAR}) ? ' ' : '' ) 
    . (length($self->{SLASH_CHAR}) ? '/' : '');

  $self->{ESCRX}=qr{\Q$self->{ESCAPE_CHAR}\E([\Q$self->{DIGIT_STRING}\E]+)\Q$self->{ESCAPE_CHAR}\E};

  $self->{MUST64}=qr{[^\Q$self->{NO_ESCAPE}\E]};

  $self->{VALIDRX}=qr{[\Q$self->{ESCAPE_CHAR}$self->{NO_ESCAPE}\E]+};

  bless ($self, $class);
  return $self;
}


sub encode_num{
  my $self=shift;
  my $num=shift;
  my $str="";

  while ($num>0){
    my $remainder=$num % 64;
    $num=$num >> 6;
    $str = $self->{DIGITS}->[$remainder].$str;
  }
  return $str;
}

sub decode_str{
  my $self=shift;
  my $str=shift;
  my @chars=split "",$str;
  my $num=0;

  while (scalar(@chars)>0){
    my $remainder=index $self->{DIGIT_STRING},$chars[0];
	
    croak("not a digit: ".$chars[0]. " in \"$str\"") if ($remainder <0);

    $num=$num << 6;
    $num+=$remainder;
    shift @chars;
  }
    
  return chr($num);
}

=head2 encode

=head3 Arguments

a string to encode.

=head3 Returns

encoded string

=cut

sub encode{
  my $self=shift;
  my $str=shift;
  
  $str=~ s/($self->{MUST64})/"$self->{ESCAPE_CHAR}".encode_num($self,ord($1))."$self->{ESCAPE_CHAR}"/ge;
  $str=~ s|/|$self->{SLASH_CHAR}|g if (length($self->{SLASH_CHAR}));
  $str=~ s/ /$self->{SPACE_CHAR}/g;
    
    return $str;
};

=head2 decode

=head3 Arguments

a string to decode.

=head3 Returns

encoded string

=cut

sub decode{
  my $self=shift;
  my $str = shift;
   
  $str=~ s/\Q$self->{SPACE_CHAR}\E/ /g if (length($self->{SPACE_CHAR}));
  $str=~ s|\Q$self->{SLASH_CHAR}\E|/|g if (length($self->{SLASH_CHAR}));
  $str=~ s/$self->{ESCRX}/ decode_str($self,$1)/eg;
  return $str;
}

=head2 valid

Simple necessary but not sufficient test for validity.

=cut 

sub valid{
  my $self=shift;
  my $str = shift;
   
  return $str =~ m/$self->{VALIDRX}/;
}

=head1 DISCUSSION

According to RFC 2822, the following non-alphanumerics are OK for the
local part of an address: "!#$%&'*+-/=?^_`{|}~". On the other hand, it
seems common in practice to block addresses having "%!/|`#&?" in the
local part.  The idea is to restrict ourselves to basic ASCII
alphanumerics, plus a small set of printable ASCII, namely "=_+-~.".


The characters '+' and '-' are pretty widely used to attach suffixes
(although usually only one works on a given mail host). It seems ok to
use '+-', since the first marks the beginning of a suffix, and then is
a regular character. The character '.' also seems mostly permissable.


=head1 AUTHOR

David Bremner, E<lt>ddb@cpan.org<gt>

=head1 COPYRIGHT

Copyright (C) 2011 David Bremner.  All Rights Reserved.

This module is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.

=head1 SEE ALSO

L<MIME::Base64>, L<MIME::Decoder::Base64>, L<MIME::Decoder::QuotedPrint>.

=cut

1;