The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package DataStore::CAS::FS::DirCodec::Unix;
use 5.008;
use strict;
use warnings;
use Try::Tiny;
use Carp;
use JSON;
use Scalar::Util 'looks_like_number';
require DataStore::CAS::FS::InvalidUTF8;
require DataStore::CAS::FS::Dir;

use parent 'DataStore::CAS::FS::DirCodec';

our $VERSION= '0.0100';

__PACKAGE__->register_format(unix => __PACKAGE__);

# ABSTRACT: Efficiently encode only the attributes of a UNIX stat()


our %_TypeToCode= (
	file => ord('f'), dir => ord('d'), symlink => ord('l'),
	chardev => ord('c'), blockdev => ord('b'),
	pipe => ord('p'), socket => ord('s')
);
our %_CodeToType= map { $_TypeToCode{$_} => $_ } keys %_TypeToCode;
our @_FieldOrder= qw(
	type name ref size modify_ts unix_uid unix_gid unix_mode unix_ctime
	unix_atime unix_nlink unix_dev unix_inode unix_blocksize unix_blockcount
);

sub encode {
	my ($class, $entry_list, $metadata)= @_;
	$metadata= defined($metadata)? { %$metadata } : {};
	defined $metadata->{_}
		and croak '$metadata{_} is reserved for the directory encoder';
	my (%umap, %gmap);
	my @entries= map {
		my $e= ref $_ eq 'HASH'? $_ : $_->as_hash;
		my $code= $_TypeToCode{$e->{type}}
			or croak "Unknown directory entry type: ".$e->{type};
		my $name= $e->{name};
		my $ref= $e->{ref};

		!defined $name? croak "Missing required attribute 'name'"
			: !ref $name? utf8::encode($name)
			: ref($name)->can('is_non_unicode')? ($name= "$name")
			: croak "'name' must be a scalar or a InvalidUTF8 instance";

		!defined $ref? ($ref= '')
			: !ref $ref? utf8::encode($ref)
			: ref($ref)->can('is_non_unicode')? ($ref= "$ref")
			: croak "'ref' must be a scalar or a InvalidUTF8 instance";

		$umap{$e->{unix_uid}}= $e->{unix_user}
			if defined $e->{unix_uid} && defined $e->{unix_user};
		$gmap{$e->{unix_gid}}= $e->{unix_group}
			if defined $e->{unix_gid} && defined $e->{unix_group};

		my $int_attr_str= join(":",
			map { !defined $_? '' : looks_like_number($_)? $_ : croak "Invalid unix attribute number: $_" }
				@{$e}{@_FieldOrder[3..$#_FieldOrder]}
		);
		# As an optimization, all undef trailing fields can be chopped off.
		$int_attr_str =~ s/:+$//;
		
		croak "'name' too long: '$name'" if length($name) > 255;
		croak "'ref' too long: '$ref'" if length($ref) > 255;
		croak "Unix fields too long: '$int_attr_str'" if length($int_attr_str) > 255;
		pack('CCCC', length($name), length($ref), length($int_attr_str), $code).$name."\0".$ref."\0".$int_attr_str;
	} @$entry_list;

	# Save the mapping of UID to User and GID to Group
	$metadata->{_}{umap}= \%umap;
	$metadata->{_}{gmap}= \%gmap;
	
	my $meta_json= JSON->new->utf8->canonical->convert_blessed->encode($metadata);
	my $ret= "CAS_Dir 04 unix\n"
		.pack('N', length($meta_json)).$meta_json
		.join('', sort { substr($a,4) cmp substr($b,4) } @entries);
	croak "Accidental unicode concatenation"
		if utf8::is_utf8($ret);
	$ret;
}


sub decode {
	my ($class, $params)= @_;
	$params->{format}= $class->_read_format($params)
		unless defined $params->{format};
	my $handle= $params->{handle};
	if (!$handle) {
		if (defined $params->{data}) {
			open($handle, '<', \$params->{data})
				or croak "can't open handle to scalar";
		} else {
			$handle= $params->{file}->open;
		}
	}

	my $header_len= $class->_calc_header_length($params->{format});
	seek($handle, $header_len, 0) or croak "seek: $!";

	my (@entries, $buf, $pos);

	# first, pull out the metadata, which includes the UID map and GID map.
	$class->_readall($handle, $buf, 4);
	my ($dirmeta_len)= unpack('N', $buf);
	$class->_readall($handle, my $json, $dirmeta_len);
	my $enc= JSON->new()->utf8->canonical->convert_blessed;
	DataStore::CAS::FS::InvalidUTF8->add_json_filter($enc);
	my $meta= $enc->decode($json);

	# Quick sanity checks
	ref $meta->{_}{umap} and ref $meta->{_}{gmap}
		or croak "Incorrect directory metadata";
	my $dirmeta= delete $meta->{_};

	while (!eof $handle) {
		$class->_readall($handle, $buf, 4);
		my ($name_len, $ref_len, $meta_len, $code)= unpack('CCCC', $buf);
		$class->_readall($handle, $buf, $name_len+$ref_len+$meta_len+2);
		my @fields= (
			$dirmeta,
			$code,
			DataStore::CAS::FS::InvalidUTF8->decode_utf8(substr($buf, 0, $name_len)),
			$ref_len? DataStore::CAS::FS::InvalidUTF8->decode_utf8(substr($buf, $name_len+1, $ref_len)) : undef,
			map { length($_)? $_ : undef } split(":", substr($buf, $name_len+$ref_len+2, $meta_len)),
		);
		push @entries, bless(\@fields, __PACKAGE__.'::Entry');
	}
	close $handle;
	return DataStore::CAS::FS::Dir->new(
		file => $params->{file},
		format => $params->{format},
		metadata => $meta,
		entries => \@entries,
	);
}

package DataStore::CAS::FS::DirCodec::Unix::Entry;
use strict;
use warnings;
use parent 'DataStore::CAS::FS::DirEnt';

sub _dirmeta        { $_[0][0] }
sub type            { $_CodeToType{$_[0][1]} }
sub name            { $_[0][2] }
sub ref             { $_[0][3] }
sub size            { $_[0][4] }
sub modify_ts       { $_[0][5] }
sub unix_uid        { $_[0][6] }
sub unix_gid        { $_[0][7] }
sub unix_mode       { $_[0][8] }
sub unix_ctime      { $_[0][9] }
sub unix_atime      { $_[0][10] }
sub unix_nlink      { $_[0][11] }
sub unix_dev        { $_[0][12] }
sub unix_inode      { $_[0][13] }
sub unix_blocksize  { $_[0][14] }
sub unix_blockcount { $_[0][15] }

*unix_mtime= *modify_ts;
sub unix_user       { my $self= shift; $self->_dirmeta->{umap}{ $self->unix_uid } }
sub unix_group      { my $self= shift; $self->_dirmeta->{gmap}{ $self->unix_gid } }

sub as_hash {
	my $self= shift;
	return {
		type => $self->type,
		map { $_FieldOrder[$_-1] => $self->[$_] } grep { defined $self->[$_] } 2 .. $#$self
	};
}

1;

__END__

=pod

=head1 NAME

DataStore::CAS::FS::DirCodec::Unix - Efficiently encode only the attributes of a UNIX stat()

=head1 VERSION

version 0.0101

=head1 DESCRIPTION

This directory encoder/decoder encodes only the fields of a L<DirEnt|DataStore::CAS::FS::DirEnt>
corresponding to a unix stat_t structure.  (or more precisely, the fields
perl returns from the stat function)  Any other fields in the L<DirEnt|DataStore::CAS::FS::DirEnt> are
ignored.

It does this much more efficiently than would be done in L<JSON>, but still uses
text, to avoid complications of endian-ness and word size. (and because 32-bit
perl can't numerically process 64-bit integers)  The encoding is further
optimized by ordering the fields by likelyhood of being used, and truncating
records at the last used field.

It also imposes some restrictions: 'name' and 'ref' must each be less than 256
bytes when encoded as UTF-8.  There is also a limitation on the unix stat
values, but they will all fit even with max-length 64 bit integers, so this
shouldn't ever be a problem.

=head1 METHODS

=head2 encode

  $serialized= $class->encode( \@entries, \%metadata )

See L<DirCodec-E<gt>encode|DataStore::CAS::FS::DirCodec/encode> for details.

=head2 decode

  my $dir= $class->decode( \%params )

See L<DirCodec-E<gt>load|DataStore::CAS::FS::DirCodec/load> for details on C<%params>.

=head1 AUTHOR

Michael Conrad <mconrad@intellitree.com>

=head1 COPYRIGHT AND LICENSE

This software is copyright (c) 2013 by Michael Conrad, and IntelliTree Solutions llc.

This is free software; you can redistribute it and/or modify it under
the same terms as the Perl 5 programming language system itself.

=cut