The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
/**********************************************************************
 * A general purpose limited-entropy Rice compressor library
 * 
 * The Rice algorithm is described by Rice, R.F., Yeh, P.-S., and 
 * Miller, W. H. 1993, in Proc. of the 9th AIAA Computing in Aerospace
 * Conference, AIAA-93-45411-CP.  Rice algorithms in general are simplified
 * Golomb codes that are useful for coding data with certain statistical
 * properties (generally, that differences between samples are typically 
 * smaller than the coded dynamic range).  This code compresses blocks
 * of samples (typically 16 or 32 samples at a time) that are stored 
 * in normal 2's complement signed integer form, with a settable number 
 * of 8-bit bytes per sample.
 *
 * Strict Rice coding gives rise
 * (in principle) to extremely large symbols in the worst high-entropy 
 * case, so this library includes a block-level switch 
 * 
 * Assumptions: int is 32 bits ("long int"); short is 16 bits, byte is 8 bits.
 *
 * HISTORICAL NOTE:
 * 
 * This compression library is modified from the CFITSIO library,
 * which is distributed by the U.S. government under the above
 * Free-compatible license.  The code was originally written by
 * Richard White at the STScI and contributed to CFITSIO in July 1999.
 * The code has been further modified (Craig DeForest) to work in a
 * more general-purpose way than just within CFITSIO.
 * 
 *
 * LICENSING & COPYRIGHT: 
 *
 * Portions of this code are copyright (c) U.S. Government; the
 * modifications are copyright (c) Craig DeForest.  The entire library
 * (including modifications) is licensed under the following terms 
 * (inherited from CFITSIO v. 3.24):
 *
 * Permission to freely use, copy, modify, and distribute this software
 * and its documentation without fee is hereby granted, provided that this
 * copyright notice and disclaimer of warranty appears in all copies.
 * 
 * DISCLAIMER:
 *
 * THE SOFTWARE IS PROVIDED 'AS IS' WITHOUT ANY WARRANTY OF ANY KIND,
 * EITHER EXPRESSED, IMPLIED, OR STATUTORY, INCLUDING, BUT NOT LIMITED
 * TO, ANY WARRANTY THAT THE SOFTWARE WILL CONFORM TO SPECIFICATIONS,
 * ANY IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
 * PURPOSE, AND FREEDOM FROM INFRINGEMENT, AND ANY WARRANTY THAT THE
 * DOCUMENTATION WILL CONFORM TO THE SOFTWARE, OR ANY WARRANTY THAT
 * THE SOFTWARE WILL BE ERROR FREE.  IN NO EVENT SHALL NASA BE LIABLE
 * FOR ANY DAMAGES, INCLUDING, BUT NOT LIMITED TO, DIRECT, INDIRECT,
 * SPECIAL OR CONSEQUENTIAL DAMAGES, ARISING OUT OF, RESULTING FROM,
 * OR IN ANY WAY CONNECTED WITH THIS SOFTWARE, WHETHER OR NOT BASED
 * UPON WARRANTY, CONTRACT, TORT , OR OTHERWISE, WHETHER OR NOT INJURY
 * WAS SUSTAINED BY PERSONS OR PROPERTY OR OTHERWISE, AND WHETHER OR
 * NOT LOSS WAS SUSTAINED FROM, OR AROSE OUT OF THE RESULTS OF, OR USE
 * OF, THE SOFTWARE OR SERVICES PROVIDED HEREUNDER.
 *
 */


#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef unsigned char Buffer_t;
typedef struct {
	int bitbuffer;		/* bit buffer					*/
	int bits_to_go;		/* bits to go in buffer			*/
	Buffer_t *start;	/* start of buffer				*/
	Buffer_t *current;	/* current position in buffer	*/
	Buffer_t *end;		/* end of buffer				*/
} Buffer;

#define putcbuf(c,mf) 	((*(mf->current)++ = c), 0)

static void start_outputing_bits(Buffer *buffer);
static int done_outputing_bits(Buffer *buffer);
static int output_nbits(Buffer *buffer, int bits, int n);

/**********************************************************************
 * rcomp 
 * 
 * Usage: 
 *   bytes = rcomp( a, sampsiz, nx, buf, buflen, nblock )
 * 
 *   a is a pointer to the input buffer, which contains signed integer
 *   data to be encoded, either as bytes, shorts, or longs.
 * 
 *   sampsiz tells the sample size in bytes (1, 2, or 4)
 * 
 *   nx is the number of input samples to encode.
 * 
 *   buf is a pointer to the output buffer, which must be predeclared.
 * 
 *   clen is the size of the output buffer, in bytes.
 *
 *   nblock is the coding block size to use, in samples (typ. 16 or 32)
 *
 *   
 * The data are encoded (and hopefully compressed) into the output buffer, 
 * and the length of the encoded data is returned.  In case of failure 
 * (e.g. buffer too small) -1 is returned.
 * 
 * The CFITSIO code has this broken out into multiple routines for
 * different data types, but I (CED) have recombined them: the
 * overhead of using a couple of switch() statements to combine them
 * is believed (by me) to be negligible on modern architectures: the
 * process is close to memory-bound, and branch prediction on high end
 * microprocessors makes the type switches take 0 cycles anyway on
 * most iterations.
 * 
 */

int rcomp(void *a_v,		/* input array			*/
	  int bsize,            /* sample size (in bytes)       */
	  int nx,		/* number of input pixels	*/
	  unsigned char *c,	/* output buffer		*/
	  int clen,		/* max length of output		*/
	  int nblock)		/* coding block size		*/
{
Buffer bufmem, *buffer = &bufmem;
int *a = (int *)a_v;
int i, j, thisblock;
int lastpix, nextpix, pdiff;
int v, fs, fsmask, top, fsmax, fsbits, bbits;
int lbitbuffer, lbits_to_go;
unsigned int psum;
double pixelsum, dpsum;
unsigned int *diff;

 // Blocksize is picked so that boundaries lie on 64-bit word edges for all data types
 if(nblock & 0x7 ) { 
   fprintf(stderr,"rcomp: nblock must be divisible by 4 (is %d)\n",nblock);
   fflush(stderr);
   return(-1);
 }

 /* Magic numbers from fits_rcomp in CFITSIO; these have to match the ones in 
 *  rdecomp, below 
 */
 switch(bsize) {
 case 1: // byte
   fsbits = 3;
   fsmax = 6;
   break;
 case 2: // int
   fsbits = 4;
   fsmax = 14;
   break;
 case 4: // long
   fsbits = 5;
   fsmax = 25;
   break;
 default:
   fprintf(stderr,"rcomp: bsize must be 1, 2, or 4 bytes");
   fflush(stderr);
   return(-1);
 }
 
 bbits = 1<<fsbits;
 
 /*
  * Set up buffer pointers
  */
 buffer->start = c;
 buffer->current = c;
 buffer->end = c+clen;
 buffer->bits_to_go = 8;
 
 /*
  * array for differences mapped to non-negative values
  * Treat as an array of longs so it works in all cases
  *
  */
 diff = (unsigned int *) malloc(nblock*sizeof(unsigned int));
 if (diff == (unsigned int *) NULL) {
   fprintf(stderr,"rcomp: insufficient memory (allocating %d ints for internal buffer)",nblock);
   fflush(stderr);
   return(-1);
 }
 /*
  * Code in blocks of nblock pixels
  */
 start_outputing_bits(buffer);

 /* write out first sample to the first bsize bytes of the buffer */
 {
   int a0;
   int z;
   a0 = a[0];
   z = output_nbits(buffer, a0, bsize * 8);
   if (z) {
     // no error message - buffer overruns are silent
     free(diff);
     return(-1);
 }
 }

 /* the first difference will always be zero */
 switch(bsize) {
 case 1: lastpix = *((char *)a); break;
 case 2: lastpix = *((short *)a); break;
 case 4: lastpix = *((int *)a); break;
 default: break; // never happens (would be caught by first switch)
 }

 thisblock = nblock;

 for (i=0; i<nx; i += nblock) {
   /* last block may be shorter */
   if (nx-i < nblock) thisblock = nx-i;
   /*
    * Compute differences of adjacent pixels and map them to unsigned values.
    * Note that this may overflow the integer variables -- that's
    * OK, because we can recover when decompressing.  If we were
    * compressing shorts or bytes, would want to do this arithmetic
    * with short/byte working variables (though diff will still be
    * passed as an int.)
    *
    * compute sum of mapped pixel values at same time
    * use double precision for sum to allow 32-bit integer inputs
    *
    * This is the last time we refer directly to the input data - they
    * are converted from byte/short/long format to long diffs, so 
    * no more type switches are needed.
    * 
    */
	pixelsum = 0.0;
	for (j=0; j<thisblock; j++) {
	  switch(bsize) {
	  case 1: nextpix = ((char *)a)[i+j]; break;
	  case 2: nextpix = ((short *)a)[i+j]; break;
	  case 4: nextpix = ((int *)a)[i+j]; break;
	  default: break; // never happens
	  }

	  pdiff = nextpix - lastpix;
	  diff[j] = (unsigned int) ((pdiff<0) ? ~(pdiff<<1) : (pdiff<<1));
	  pixelsum += diff[j];
	  lastpix = nextpix;

	}

	/*
	 * compute number of bits to split from sum
	 */
	dpsum = (pixelsum - (thisblock/2) - 1)/thisblock;
	if (dpsum < 0) dpsum = 0.0;
	psum = ((unsigned int) dpsum ) >> 1;
	for (fs = 0; psum>0; fs++) psum >>= 1;

	/*
	 * write the codes
	 * fsbits ID bits used to indicate split level
	 */
	if (fs >= fsmax) {
	  /* Special high entropy case when FS >= fsmax
	   * Just write pixel difference values directly, no Rice coding at all.
	   */
	  if (output_nbits(buffer, fsmax+1, fsbits) ) {
	    // no error message - buffer overrun is silent.
	    free(diff);
	    return(-1);
	  }
	  for (j=0; j<thisblock; j++) {
	    if (output_nbits(buffer, diff[j], bbits) ) {
	      free(diff);
	      return(-1);
	    }
	  }
	} else if (fs == 0 && pixelsum == 0) {
	  /*
	   * special low entropy case when FS = 0 and pixelsum=0 (all
	   * pixels in block are zero.)
	   * Output a 0 and return
	   */
	  if (output_nbits(buffer, 0, fsbits) ) {
	    free(diff);
	    return(-1);
	  }
	} else {
	  /* normal case: not either very high or very low entropy */
	  if (output_nbits(buffer, fs+1, fsbits) ) {
	    free(diff);
	    return(-1);
	  }
	  fsmask = (1<<fs) - 1;
	  /*
	   * local copies of bit buffer to improve optimization
	   */
	  lbitbuffer = buffer->bitbuffer;
	  lbits_to_go = buffer->bits_to_go;
	  for (j=0; j<thisblock; j++) {
	    v = diff[j];
	    top = v >> fs;
	    /*
	     * top is coded by top zeros + 1
	     */
	    if (lbits_to_go >= top+1) {
	      lbitbuffer <<= top+1;
	      lbitbuffer |= 1;
	      lbits_to_go -= top+1;
	    } else {
	      lbitbuffer <<= lbits_to_go;
	      putcbuf(lbitbuffer & 0xff,buffer);
	      
	      for (top -= lbits_to_go; top>=8; top -= 8) {
		putcbuf(0, buffer);
	      }
	      lbitbuffer = 1;
	      lbits_to_go = 7-top;
	    }
	    /*
	     * bottom FS bits are written without coding
	     * code is output_nbits, moved into this routine to reduce overheads
	     * This code potentially breaks if FS>24, so I am limiting
	     * FS to 24 by choice of FSMAX above.
	     */
	    if (fs > 0) {
	      lbitbuffer <<= fs;
	      lbitbuffer |= v & fsmask;
	      lbits_to_go -= fs;
	      while (lbits_to_go <= 0) {
		putcbuf((lbitbuffer>>(-lbits_to_go)) & 0xff,buffer);
		lbits_to_go += 8;
	      }
	    }
	  }
	  
	  /* check if overflowed output buffer */
	  if (buffer->current > buffer->end) {
	    free(diff);
	    return(-1);
	  }
	  buffer->bitbuffer = lbitbuffer;
	  buffer->bits_to_go = lbits_to_go;
	}
 }
 done_outputing_bits(buffer);
 free(diff);
 /*
  * return number of bytes used
  */
 return(buffer->current - buffer->start);
}

/*---------------------------------------------------------------------------*/
/* bit_output.c
 *
 * Bit output routines
 * Procedures return zero on success, EOF on end-of-buffer
 *
 * Programmer: R. White     Date: 20 July 1998
 */

/* Initialize for bit output */

static void start_outputing_bits(Buffer *buffer)
{
    /*
     * Buffer is empty to start with
     */
    buffer->bitbuffer = 0;
    buffer->bits_to_go = 8;
}

/*---------------------------------------------------------------------------*/
/* Output N bits (N must be <= 32) */

static int output_nbits(Buffer *buffer, int bits, int n)
{
/* local copies */
int lbitbuffer;
int lbits_to_go;
    /* AND mask for the right-most n bits */
    static unsigned int mask[33] = 
         {0,
	  0x1,       0x3,       0x7,       0xf,       0x1f,       0x3f,       0x7f,       0xff,
	  0x1ff,     0x3ff,     0x7ff,     0xfff,     0x1fff,     0x3fff,     0x7fff,     0xffff,
	  0x1ffff,   0x3ffff,   0x7ffff,   0xfffff,   0x1fffff,   0x3fffff,   0x7fffff,   0xffffff,
	  0x1ffffff, 0x3ffffff, 0x7ffffff, 0xfffffff, 0x1fffffff, 0x3fffffff, 0x7fffffff, 0xffffffff};

    /*
     * insert bits at end of bitbuffer
     */

    lbitbuffer = buffer->bitbuffer;
    lbits_to_go = buffer->bits_to_go;
    if (lbits_to_go+n > 32) {
	/*
	 * special case for large n: put out the top lbits_to_go bits first
	 * note that 0 < lbits_to_go <= 8
	 */
	lbitbuffer <<= lbits_to_go;
/*	lbitbuffer |= (bits>>(n-lbits_to_go)) & ((1<<lbits_to_go)-1); */
	lbitbuffer |= (bits>>(n-lbits_to_go)) & *(mask+lbits_to_go);

	if(buffer->current >= buffer->end - 1)
	  return 1;

	putcbuf(lbitbuffer & 0xff,buffer);
	n -= lbits_to_go;
	lbits_to_go = 8;
    }
    lbitbuffer <<= n;
/*    lbitbuffer |= ( bits & ((1<<n)-1) ); */
    lbitbuffer |= ( bits & *(mask+n) );
    lbits_to_go -= n;
    while (lbits_to_go <= 0) {
	/*
	 * bitbuffer full, put out top 8 bits
	 */
      if(buffer->current >= buffer->end)
	return 1;
      
      putcbuf((lbitbuffer>>(-lbits_to_go)) & 0xff,buffer);
      lbits_to_go += 8;
    }
    buffer->bitbuffer = lbitbuffer;
    buffer->bits_to_go = lbits_to_go;

      if(buffer->bits_to_go < 8 && buffer->current >= buffer->end -2)
	return 1;

    return(0);
}
/*---------------------------------------------------------------------------*/
/* Flush out the last bits */

static int done_outputing_bits(Buffer *buffer)
{
    if(buffer->bits_to_go < 8) {
	putcbuf(buffer->bitbuffer<<buffer->bits_to_go,buffer);
    }
    return(0);
}


/**********************************************************************
 * rdecomp
 * 
 * Usage:
 *   errflag = rdecomp(a, clen, outbuf, sampsiz, nx, nblock)
 * 
 *   a is a pointer to the input buffer, which contains rice-compressed
 *   data (e.g. from rcomp, above).
 * 
 *   clen is the length of the input buffer, in bytes.
 * 
 *   outbuf is a pointer to the output buffer, which should be 
 *   a pre-allocated array of chars, shorts, or longs according to 
 *   sampsiz.
 * 
 *   sampsiz tells the sample size in bytes (1, 2, or 4)
 * 
 *   nx tells the number of samples in the output buffer (which are
 *   all expected to be present in the compressed stream).
 *
 *   nblock is the block size, in samples, for compression.
 * 
 *
 *   The data are decoded into the output buffer.  On normal completion
 *   0 is returned.
 */

int rdecomp (unsigned char *c,		/* input buffer			    */
	     int clen,			/* length of input (bytes)	    */
	     void *array,	        /* output array		 	    */
	     int bsize,                 /* bsize - bytes per pix of output  */
	     int nx,			/* number of output pixels          */
	     int nblock)		/* coding block size (in pixels)    */
{
  int i, k, imax;
  int nbits, nzero, fs;
  unsigned char *cend, bytevalue;
  unsigned int b, diff, lastpix;
  int fsmax, fsbits, bbits;
  static int *nonzero_count = (int *)NULL;
  
  /*
   * From bsize derive:
   * FSBITS = # bits required to store FS
   * FSMAX = maximum value for FS
   * BBITS = bits/pixel for direct coding
   *
   * (These magic numbers have to match the ones in rcomp above.)
   */
  
  
  switch (bsize) {
  case 1:
    fsbits = 3;
    fsmax = 6;
    break;
  case 2:
    fsbits = 4;
    fsmax = 14;
    break;
  case 4:
    fsbits = 5;
    fsmax = 25;
    break;
  default:
    fprintf(stderr,"rdecomp: bsize must be 1, 2, or 4 bytes");
    fflush(stderr);
    return 1;
  }
  
  bbits = 1<<fsbits;

  if (nonzero_count == (int *) NULL) {
    /*
     * nonzero_count is lookup table giving number of bits
     * in 8-bit values not including leading zeros; gets allocated 
     * and calculated the first time through
     */
    
    /*  NOTE!!!  This memory never gets freed (permanent table)  */
    nonzero_count = (int *) malloc(256*sizeof(int));
    if (nonzero_count == (int *) NULL) {
      fprintf(stderr,"rdecomp: insufficient memory!\n");
      fflush(stderr);
      return 1;
    }
    nzero = 8;
    k = 128;
    for (i=255; i>=0; ) {
      for ( ; i>=k; i--) nonzero_count[i] = nzero;
      k = k/2;
      nzero--;
    }
  }

    /*
     * Decode in blocks of nblock pixels
     */

    /* first bytes of input buffer contain the value of the first */
    /* integer value, without any encoding */


    cend = c + clen;
    
    lastpix = 0;
    switch(bsize) {
    case 4:
      bytevalue = c[0];
      lastpix = lastpix | (bytevalue<<24);
      bytevalue = c[1];
      lastpix = lastpix | (bytevalue<<16);
      bytevalue = c[2];
      lastpix = lastpix | (bytevalue<<8);
      bytevalue = c[3];
      lastpix = lastpix | bytevalue;
      c+=4;
      break;
    case 2:
      bytevalue = c[0];
      lastpix = lastpix | (bytevalue<<8);
      bytevalue = c[1];
      lastpix = lastpix | bytevalue;
      c+=2;
      break;
    case 1:
      lastpix = c[0];
      c++;
      break;
    default: // never happens
      break; 
    }

    b = *c++;		    /* bit buffer			*/
    nbits = 8;		    /* number of bits remaining in b	*/
    for (i = 0; i<nx; ) {
	/* get the FS value from first fsbits */
	nbits -= fsbits;
	while (nbits < 0) {
	    b = (b<<8) | (*c++);
	    nbits += 8;
	}
	fs = (b >> nbits) - 1;

	b &= (1<<nbits)-1;
	/* loop over the next block */
	imax = i + nblock;
	if (imax > nx) imax = nx;
	if (fs<0) {
	    /* low-entropy case, all zero differences */
	  for ( ; i<imax; i++) {
	    switch(bsize) {
	    case 1: ((char *)array)[i] = lastpix; break;
	    case 2: ((short *)array)[i] = lastpix; break;
	    case 4: ((int *)array)[i] = lastpix; break;
	    default: break;
	    }
	  }
	} else if (fs==fsmax) {
	    /* high-entropy case, directly coded pixel values */
	    for ( ; i<imax; i++) {
		k = bbits - nbits;
		diff = b<<k;
		for (k -= 8; k >= 0; k -= 8) {
		    b = *c++;
		    diff |= b<<k;
		}
		if (nbits>0) {
		    b = *c++;
		    diff |= b>>(-k);
		    b &= (1<<nbits)-1;
		} else {
		    b = 0;
		}
		/*
		 * undo mapping and differencing
		 * Note that some of these operations will overflow the
		 * unsigned int arithmetic -- that's OK, it all works
		 * out to give the right answers in the output file.
		 */
		if ((diff & 1) == 0) {
		    diff = diff>>1;
		} else {
		    diff = ~(diff>>1);
		}

		switch(bsize) {
		case 1: 
		  ((char *)array)[i] = diff + lastpix; 
		  lastpix = ((char *)array)[i];
		  break;
		case 2: 
		  ((short *)array)[i] = diff + lastpix;
		  lastpix = ((short *)array)[i];
		  break;
		case 4:
		  ((int *)array)[i] = diff + lastpix;
		  lastpix = ((int *)array)[i];
		  break;
		default: // never happens
		  break;
		}
	    }
	} else {
	    /* normal case, Rice coding */
	    for ( ; i<imax; i++) {
		/* count number of leading zeros */
		while (b == 0) {
		    nbits += 8;
		    b = *c++;
		}
		nzero = nbits - nonzero_count[b];
		nbits -= nzero+1;
		/* flip the leading one-bit */
		b ^= 1<<nbits;
		/* get the FS trailing bits */
		nbits -= fs;
		while (nbits < 0) {
		    b = (b<<8) | (*c++);
		    nbits += 8;
		}
		diff = (nzero<<fs) | (b>>nbits);
		b &= (1<<nbits)-1;

		/* undo mapping and differencing */
		if ((diff & 1) == 0) {
		    diff = diff>>1;
		} else {
		    diff = ~(diff>>1);
		}

		switch(bsize) {
		case 1: 
		  ((char *)array)[i] = diff + lastpix; 
		  lastpix = ((char *)array)[i];
		  break;
		case 2: 
		  ((short *)array)[i] = diff + lastpix;
		  lastpix = ((short *)array)[i];
		  break;
		case 4:
		  ((int *)array)[i] = diff + lastpix;
		  lastpix = ((int *)array)[i];
		  break;
		default: // never happens
		  break;
		}
	    }
	}
	if (c > cend) {
	  fprintf(stderr,"rdecomp: decompression error: hit end of compressed byte stream\n");
	  fflush(stderr);
	  return 1;
	}
    }
    return 0;
}