The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
# mt-aws-glacier - Amazon Glacier sync client
# Copyright (C) 2012-2014  Victor Efimov
# http://mt-aws.com (also http://vs-dev.com) vs@vs-dev.com
# License: GPLv3
#
# This file is part of "mt-aws-glacier"
#
#    mt-aws-glacier is free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation, either version 3 of the License, or
#    (at your option) any later version.
#
#    mt-aws-glacier is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program.  If not, see <http://www.gnu.org/licenses/>.

package App::MtAws::MetaData;

our $VERSION = '1.117';

use strict;
use warnings;
use utf8;
use Encode;

use MIME::Base64;
use JSON::XS;
use POSIX;
use Time::Local;
use App::MtAws::DateTime;

use constant MAX_SIZE => 1024;
use constant META_JOB_TYPE_FULL => 'full';

use Exporter 'import';

our @EXPORT = qw/meta_decode meta_job_decode meta_encode meta_job_encode META_JOB_TYPE_FULL/;
=head1 MT-AWS-GLACIER metadata format ('x-amz-archive-description' field).

Function definitions:
=====================
base64url() input - byte sequence, output - byte sequence
	Is Base64 URL algorithm: http://en.wikipedia.org/wiki/Base64#URL_applications
	basically it's base64 but with '=' padding removed, characters '+', '/' replaced with '-', '_' resp. and no new lines.

json_utf8() - input - Hash, output - byte sequence
	JSON string in UTF-8 representation. Can contain not-escaped UTF-8 characters. Will not contain linefeed. Hash objects are unordered.

latin1_to_utf8() - input - byte sequence, output - byte sequence
	Treats input data as Latin1 (ISO 8859-1) encoded sequence and converts it to UTF-8 sequence

isoO8601() - input - time, output - character string
	ISOO8601 time in the following format YYYYMMDDTHHMMSSZ. Only UTC timezone. No leap seconds supported.
	Supported year range is from 1000 to 9999
	When encoding isoO8601() mt-aws-glacier will not store leap seconds. When decoding from isoO8601 leap seconds will be dropped.

{'filename': FILENAME, 'mtime': iso8601(MTIME)}
	Hash with two keys: 'filename' and 'mtime'. Corresponds to JSON 'Object'.

Input data:
=====================

FILENAME (character string)
	Is a relative filename (no leading slash). Filename is taken from file system and treated as a character sequence
	with known encoding.
MTIME (time)
	is file last modification time with 1 second resolution. Can be below Y1970.
	Internal representation is epoch time, so it can be any valid epoch time (including negative values and zero).Supported
	range - from year 1000 to 9999 (inclusive)

Version 'mt2'
=====================

x-amz-archive-description = 'mt2' <space> base64url(json_utf8({'filename': FILENAME, 'mtime': iso8601(MTIME)}))

Version 'mt1'
=====================

x-amz-archive-description = 'mt1' <space> base64url(latin1_to_utf8(json_utf8({'filename': FILENAME, 'mtime': iso8601(MTIME)})))

This format actually contains a bug - data is double encoded. However it does not affect data integrity. UTF-8 double encoded data can be
perfectly decoded (see http://www.j3e.de/linux/convmv/man/) - that's why the bug was unnoticed during one month.
This format was in use starting from version 0.80beta (2012-12-27) till 0.84beta (2013-01-28).

NOTES:
=====================

1) This specification assumes that in our programming language we have two different types of Strings: Byte string (byte sequence) and Character strings.
Byte string is sequence of octets. Character string is an internal representation of sequence of characters. Character strings cannot have encodings
by definition - it's internal, encoding is known to language implementation.

Some programming languages (like Ruby) have different model, when every string is a sequence of bytes with a known encoding (or no encoding at all).

2) According to this spec. Same (FILENAME,MTIME) values can produce different x-amz-archive-description, as JSON hash is unordered.

3) This specification explains how to _encode_ data (because it's a specification). However it's easy to
understant how to decode it back.

4) Path separator in filename is '/'

=cut

my $meta_coder = ($JSON::XS::VERSION ge '1.4') ?
	JSON::XS->new->utf8->max_depth(1)->max_size(MAX_SIZE) : # some additional abuse-protection
	JSON::XS->new->utf8; # it's still protected by length checking below

sub meta_decode
{
	my ($str) = @_;
	return unless defined $str; # protect from undef $str

	my ($marker, $b64) = _split_meta($str);
	return unless defined $marker;
	if ($marker eq 'mt1') {
		return _decode_filename_and_mtime(_decode_json(_decode_utf8(_decode_b64($b64))));
	} elsif ($marker eq 'mt2') {
		return _decode_filename_and_mtime(_decode_json(_decode_b64($b64)));
	} else {
		return;
	}
}

sub meta_job_decode
{
	my ($str) = @_;
	return unless defined $str; # protect from undef $str

	my ($marker, $b64) = _split_meta($str);
	return unless defined $marker;
	if ($marker eq 'mtijob1') {
		_decode_jobs(_decode_json(_decode_b64($b64)));
	} else {
		return;
	}
}

sub _split_meta
{
	my ($str) = @_;
	my ($marker, $b64) = split(' ', $str); # split will return empty list if string is empty or contains spaces only
	return if !defined $b64 || length($b64) > MAX_SIZE;
	return ($marker, $b64);
}

sub _decode_b64
{
	my ($str) = @_;
	return eval {
		$str =~ tr{-_}{+/};
		my $padding_n = length($str) % 4;
		$str .= ('=' x (4 - $padding_n) ) if $padding_n;
		MIME::Base64::decode_base64($str);
	}; # undef if eval failed
}

sub _decode_utf8
{
	my ($str) = @_;
	return unless defined $str;
	return eval {
		decode("UTF-8", $str, Encode::DIE_ON_ERR|Encode::LEAVE_SRC)
	};  # undef if eval failed
}

sub _decode_json
{
	my ($str) = @_;
	return unless defined $str;
	eval { $meta_coder->decode($str) }
}

sub _decode_filename_and_mtime
{
	my ($h) = @_;
	return unless defined $h;
	return unless defined($h->{filename}) && defined($h->{mtime});
	 # TODO: is that good to return undef everytime something missing? Maybe return error in case signature etc
	 # correct but time is broken - it's more robust.
	defined(my $mtime = iso8601_to_epoch($h->{mtime})) or return;
	return ($h->{filename}, $mtime);
}

sub _decode_jobs
{
	my ($h) = @_;
	return unless defined $h;
	return unless defined($h->{type});
	return ($h->{type});
}

sub meta_encode
{
	my ($relfilename, $mtime) = @_;
	return unless defined($mtime) && defined($relfilename);
	defined(my $res = _encode_b64(_encode_json(_encode_filename_and_mtime($relfilename, $mtime)))) or return;
	$res = "mt2 ".$res;
	return if length($res) > MAX_SIZE;
	return $res;
}

sub meta_job_encode
{
	my ($type) = @_;
	my $res = "mtijob1 "._encode_b64(_encode_json({ type => $type }));
	return if length($res) > MAX_SIZE;
	return $res;
}

sub _encode_b64
{
	my ($str) = @_;
	return unless defined $str;
	my $res = MIME::Base64::encode_base64($str,'');
	$res =~ s/=+\z//;
	$res =~ tr{+/}{-_};
	return $res;
}

sub _encode_utf8
{
	my ($str) = @_;
	return encode("UTF-8",$str,Encode::DIE_ON_ERR|Encode::LEAVE_SRC);
}

sub _encode_filename_and_mtime
{
	my ($relfilename, $mtime) = @_;
	defined(my $iso = epoch_to_iso8601($mtime)) or return;
	return {
		mtime => $iso,
		filename => $relfilename
	};
}

sub _encode_json
{
	my ($h) = @_;
	return unless defined $h;
	return $meta_coder->encode($h);
}

1;

__END__