The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package SAS::TRX;

use 5.006;
use strict;
use warnings;

our $VERSION = '0.06';

use IO::File;

#
#	Constructor
#
sub new
{
	my $class	= shift;
	my $self	= {
		FH => undef,
		TRX => {},
	};
	
	bless ($self,$class);
        return $self;
}

#
#	Given a file named
#
sub load
{
	my $self	= shift;
	my $src		= shift;

	$self->{FH} = new IO::File $src;

	$self->read_trx();
}

my $LIBRARY_HEADER = 'HEADER RECORD*******LIBRARY HEADER RECORD!!!!!!!000000000000000000000000000000  ';
my $NAMESTR_HEADER = 'HEADER RECORD\*\*\*\*\*\*\*NAMESTR HEADER RECORD!!!!!!!000000(\d{4})00000000000000000000';
my $MEMBER_HEADER  = 'HEADER RECORD\*\*\*\*\*\*\*MEMBER  HEADER RECORD!!!!!!!000000000000000001600000000(\d{3})';
my $DSCRPTR_HEADER = 'HEADER RECORD*******DSCRPTR HEADER RECORD!!!!!!!000000000000000000000000000000  ';
my $OBS_HEADER	   = 'HEADER RECORD*******OBS     HEADER RECORD!!!!!!!000000000000000000000000000000  ';

#
#	Get LIBRARY header. Abort if not found
#
sub library_hdr
{
	my $self	= shift;

	my ($tmp, %lhdata);

	read $self->{FH}, $tmp, 80;
	die "LIBRARY header not found:$tmp:" unless $tmp eq $LIBRARY_HEADER;

	read $self->{FH}, $tmp, 80;
	@lhdata{qw(SYMBOL1 SYMBOL2 LIB VER OS CREATE)} = unpack '(A8)4 A32 A16', $tmp;

	read $self->{FH}, $tmp, 80;
	@lhdata{qw(DATETIME16)} = unpack 'A16', $tmp;

	@{$self}{qw(LIB VER)} = @lhdata{qw(LIB VER)};
}

#
#	Get member descriptor data
#
sub descriptor_hdr
{
	my $self	= shift;

	my ($tmp, %mhdata);

	read $self->{FH}, $tmp, 80;
	die "DSCRPTR header not found" unless ($tmp eq $DSCRPTR_HEADER);

	read $self->{FH}, $tmp, 80;
	@mhdata{qw(SYMBOL DSNAME SASDATA OS BLANKS CREATE)} = unpack '(A8)5 A24 A16', $tmp;

	read $self->{FH}, $tmp, 80;
	@mhdata{qw(DATETIME16 BLANKS DSLABEL DSTYPE)} = unpack 'A16 A16 A40 A8', $tmp;

	return @mhdata{qw{DSNAME DSLABEL DSTYPE}};
}

#
#	Get NAMESTR header. Return number of NAMESTR records
#
sub namestr_hdr
{
	my $self = shift;

	my ($tmp, $nnames);
	read $self->{FH}, $tmp, 80;

	die 'NAMESTR header not found' unless ($tmp =~ m/$NAMESTR_HEADER/o);
	return $1;
}

#
#	Get NAMESTR record
#
sub namestr_rec
{
	my $self	= shift;
	my $reclen	= shift;

	my ($tmp, %nsdata);

	read $self->{FH}, $tmp, $reclen;
	@nsdata{qw(NTYPE NHFUN NLNG NVAR0 NNAME NLABEL NFORM NFL NFD NFJ NFILL NIFORM NIFL NIFD NPOS REST)} = unpack 'n4 A8 A40 A8 n3 A2 A8 n2 N A52', $tmp;
	return \%nsdata;
}

#
#	Get OBS header
#
sub obs_hdr
{
	my $self	= shift;

	my $tmp;
	read $self->{FH}, $tmp, 80;

	die "OBS header not found:$tmp:" unless ($tmp eq $OBS_HEADER);
}


#
#	Read library member
#
sub get_member
{
	my $self	= shift;

	my ($vars, $i, $tmp);

	my $nstr_len	= $self->{NSTR_LEN};

	my ($dsname, $dslabel, $dstype) = $self->descriptor_hdr();
	$self->{TRX}{$dsname}{DSLABEL} = $dstype;
	$self->{TRX}{$dsname}{DSTYPE}  = $dstype;

	# Dataset structure description
	$vars = $self->namestr_hdr();
	for ($i=0; $i < $vars; $i++) {
		push @{ $self->{TRX}{$dsname}{VAR} }, $self->namestr_rec($nstr_len);
	}
	# Align to the next punch card
	if ($vars * $nstr_len % 80) {
		seek($self->{FH}, 80 - $vars * $nstr_len % 80, 1);
	}



	my ($databuf, $rowlen, $var, $format, @types);
	# Compute row length	
	$tmp = $#{$self->{TRX}{$dsname}{VAR}};
	$rowlen = $self->{TRX}{$dsname}{VAR}[$tmp]{NPOS}+
		$self->{TRX}{$dsname}{VAR}[$tmp]{NLNG};

	$self->{TRX}{$dsname}{CNAMES} = [];
	$self->{TRX}{$dsname}{CTYPES} = [];
	# Compute conversion formats.
	foreach $var (@{ $self->{TRX}{$dsname}{VAR} }) {
		$format .= 'a' . $var->{NLNG};

		# Remember just a list of variable names
		push @{$self->{TRX}{$dsname}{CNAMES}}, $var->{NNAME};
		# And types
		push @{$self->{TRX}{$dsname}{CTYPES}}, $var->{NTYPE};
	}

	# Upload to destination. May create header for compressed INSERT
	$self->data_header($dsname) if ($self->can('data_header'));

	# Observation data
	$self->obs_hdr();
	$databuf='';
	do {
		while (length($databuf) >= $rowlen) {
			$self->row2array($dsname, $databuf, $format);
			$databuf = substr($databuf, $rowlen);
			last unless $databuf =~ /[^ ]/go;
		}
		read $self->{FH}, $tmp, 80;
		$databuf .= $tmp;
	} until (eof ($self->{FH}) || $tmp =~ m/$MEMBER_HEADER/o);
	$self->{NSTR_LEN} = $1;	# In case the library is joined from various platforms data

	# Upload to destination. May create header for compressed INSERT
	$self->data_footer($dsname) if $self->can('data_footer');
}

#
#	Convert TRX observation (data row)
#	into array of values
#
sub row2array
{
	my ($self, $dsname, $row, $format) = @_;


	my @data = unpack($format, $row);
	
	for (my $i=0; $i<= $#data; $i++) {
		if ($self->{TRX}{$dsname}{CTYPES}[$i] == 1) {
			$data[$i] = ibm_float($data[$i]);
		} else {
			# Trim whitespaces
			$data[$i] =~ s/\s+$//;
			$data[$i] =~ s/^\s+//;
		}
	}

	# Unload to target
	$self->data_row($dsname, \@data) if $self->can('data_row');
}

#
#	Decrypt TRX numeric representation
#
#	I agree that "significand" is "that which is to be signified".
#	Let the meaningful part be "mantissa". As it was before.
#	
sub ibm_float
{
	my $value = shift;

	my ($firstbyte,$bin) = unpack "CB*", $value;

	if ($bin == 0) {
		return	undef	if ($firstbyte);	# Undefined values
		return	0;
	}

	my $exp=($firstbyte & 0x7F) - 0x40;
	my $mantissa = 0;

	while (length($bin)) {
		$mantissa += 1 if (chop $bin);
		$mantissa /= 2;
	}

	$mantissa = -$mantissa	if ($firstbyte & 0x80);
	return	$mantissa*(16**$exp);
}

#
#	Read library
#
sub read_trx
{
	my $self	= shift;

	my ($tmp, $nstr_len);

	$self->library_hdr();

	# Skip possible junk until member header
	do {
		read $self->{FH}, $tmp, 80;
	} until (eof($self->{FH}) || $tmp =~ m/$MEMBER_HEADER/o);
	$self->{NSTR_LEN} = $1;

	# Get library members
	until (eof($self->{FH})) {
		$self->get_member();
	}
	# We have got it all. Dump the results, if anybody cares
	$self->data_description() if $self->can('data_description');
}


1;

__END__

=head1 NAME

SAS::TRX - [Abstract] class, provides SAS transport (XPORT) format decoding.
Calls [overloaded] methods for subsequent output formatting.

=head1 SYNOPSIS

  use base SAS::TRX;

  Provides SAS TRX-related functionality for a child.

=item B<load>

  my $trx = new SAS::TRX;
  $trx->load('filename'); # load and parse 'filename'.

  During the parse will call the following [child] methods, if available:

    data_header		- after all NAMESTR blocks for a TRX library member
			  have been parsed, a list of columns is available
    data_row		- for each observation
    data_footer		- after OBS block has been parsed
    data_description	- at the end of TRX parsing

  

=head1 DESCRIPTION

SAS transport format (XPORT) access

The following deviations from standard are allowed:
	1. Numbers can be any length >1 bytes, exponent is always 7 bit
	2. "Missing values" may have any non-zero exponent

Removes leading and trailing whitespaces while character values transformation.

=head2 EXPORT

Nothing is exported.


=head1 SEE ALSO

SAS::TRX::MySQL for example of usage

TS-140 (http://support.sas.com/techsup/technote/ts140.html)
for format description


=head1 AUTHOR

Alexander Kuznetsov, <acca (at) cpan.org>

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2009 by Alexander Kuznetsov

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.10.0 or,
at your option, any later version of Perl 5 you may have available.


=cut