/*
* Digest::FP56x1xor.xs - C implementation of a 56bit pseudo additive fingerprint hash.
*
* (C) 2007-2008 jw@suse.de, Novell Inc.
* This module is free software. IT may be used
* and/or modified under the same terms as perl itself.
*
* See also
* perldoc perlxs
* perldoc perlguts
* perldoc perlxstut
* perldoc perlapi
*
* Currently gen_l() croaks, if no 64bit integers are available
* and gen() is suggested, which returns a hex string.
* If the compiler does not respond to 'long long' with 64bit,
* we die().
*
*/
#include "EXTERN.h"
#include "perl.h"
#include "XSUB.h"
#include "ppport.h"
#define VERSION "0.15" // keep in sync with Changes, lib/Digest/FP56x1xor.pm
#if !defined(_STDINT_H) && !defined(__WORDSIZE)
// stdint.h and bits/types.h also have these:
typedef unsigned long long uint64_t;
typedef unsigned int uint32_t;
#endif
static uint64_t rand56[256] = {
#include "random56x256.h"
};
// rotate56 rotates the lower 56 bits in h by m bits.
// The high 8 bit of h are cleared and remain zero.
static uint64_t rotate56(uint64_t h, int m)
{
uint64_t x = (h << m) & 0x00ffffffffffffffLL;
h &= 0x00ffffffffffffffLL;
x |= (h >> (56-m));
return x;
}
#if 0
// hmm, is this useful? how ould perl work with that
// string? I need to push it into
// postgresql and sqlite databases and have it compared there.
// the sql statements for this are always 7bit ascii, right?
//
static unsigned char *uint64tostr8(uint64_t h, unsigned char *buf)
{
unsigned char buf8[9];
int i;
if (!buf) buf = buf8;
for (i = 0; i < 8; i++)
{
buf[i] = (unsigned char)((h>>(8*(7-i))) & 0xff);
}
return buf;
}
#endif
// fp is a 64bit integer.
// highest 2 bit are always zero, next 6 bit are the modulo count.
// remaining lower 56 bit are fingerprint hash value.
static uint64_t gen_fp(const char *text, unsigned int len)
{
uint64_t h = 0;
unsigned int m = 0; // byte counter modulo 56
unsigned int c;
if (sizeof(uint64_t) != 8)
croak("sizeof(uint64_t) is %d, need 8\n", (int)sizeof(uint64_t));
while (len-- > 0)
{
c = (unsigned char)*text++;
uint64_t x = rotate56(rand56[c], m);
h ^= x;
if (++m >= 56) m = 0;
}
h |= ((uint64_t)m) << 56;
return h;
}
static uint64_t cat_fp(uint64_t h1, uint64_t h2)
{
unsigned int m1 = (h1 >> 56);
unsigned int m2 = (h2 >> 56);
uint64_t r = rotate56(h2, m1) ^ (h1 & 0x00ffffffffffffffLL);
m2 += m1;
if (m2 >= 56) m2 -= 56;
r |= ((uint64_t)m2)<<56;
return r;
}
static int xc2i(unsigned char c)
{
// ascii has numbers before upper before lower case.
if (c <= '9') return c - '0';
if (c <= 'F') return c - 'A' + 10;
return c - 'a' + 10;
}
static uint64_t x2i64(unsigned char *t, int n)
{
uint64_t h = 0;
while (n-- > 0)
h = (h<<4) + xc2i(*t++);
return h;
}
static char *ll2x16(uint64_t fp)
{
static char buf[18];
sprintf(buf, "%08x%08x", (unsigned int)(fp>>32), (unsigned int)(fp & 0xffffffff));
return buf;
}
#if 0
// same as ll2x16, but with 0x prefix
// may be used for human consumption. Databases don't like that.
static char *ll2x18(uint64_t fp)
{
static char buf[20];
sprintf(buf, "0x%08x%08x", (unsigned int)(fp>>32), (unsigned int)(fp & 0xffffffff));
return buf;
}
#endif
static uint64_t cat_fp_x(const char *x1, const char *x2)
{
uint64_t h1 = x2i64((unsigned char *)x1, strlen(x1));
uint64_t h2 = x2i64((unsigned char *)x2, strlen(x2));
return cat_fp(h1, h2);
}
// private version of \w, just like perl in ascii C locale.
// but also with quote chars, so that <div id="foo" name=bar>
// is preserverd during cooked()
static int is_word_ch_q(int c)
{
if ((c <= 'z' && c >= 'a') ||
(c <= 'Z' && c >= 'A') ||
(c <= '9' && c >= '0') ||
c == '_' || c == '"' || c == '\'')
return 1;
return 0;
}
// called with len including the trailing \n or \f.
// returns the number of char written to outp
// it never writes more than len chars to out.
static int cook_text(char *outp, const char *text, int len)
{
char *p = outp;
char *s = p;
unsigned char lc = ' '; // last char we sent to output
int line_start = 1; // we are at start of a line.
int whitespace = 0; // we saw whitespace chars since lc.
while (len-- > 0)
{
unsigned char c = *text++;
if (!c) continue; // completly ignore \0 chars
if (line_start && (c == '+' || c == '-' || c == '<' || c == '>'))
{
line_start = 0;
continue; // ignore typ. diff chars at line start.
}
line_start = 0;
if (c <= ' ') // (Whitespace: all character codes <= 32)
{
if (c == '\n' || c == '\r' || c == '\f' || c == '\v')
line_start = 1;
whitespace = 1;
continue;
}
// output a space *only* to prevent run together words.
if (whitespace && is_word_ch_q(lc) && is_word_ch_q(c)) *p++ = ' ';
whitespace = 0;
// this ~-magic does not really fold utf8 to latin1.
// maybe it helps to map other encodings.
if (c >= 128)
{
if (lc < 128) *p++ = '~';
lc = c;
}
else
{
*p++ = lc = c;
}
}
return p - s;
}
MODULE = Digest::FP56x1xor PACKAGE = Digest::FP56x1xor
PROTOTYPES: ENABLE
long gen_l(text)
const char *text
CODE:
if (sizeof(uint64_t) != 8)
croak("sizeof(uint64_t) is %d, need 8. Please try the gen() method\n", (int)sizeof(uint64_t));
RETVAL = gen_fp(text, strlen(text));
OUTPUT:
RETVAL
long cat_l(h1, h2)
long h1
long h2
CODE:
RETVAL = cat_fp((uint64_t)h1, (uint64_t)h2);
OUTPUT:
RETVAL
SV *gen(sv_text, ...)
SV *sv_text
INIT:
STRLEN text_len;
IV start_off;
IV len;
const char *text;
text = (const char *)SvPV(sv_text, text_len);
if ( items > 1)
{
start_off = SvIV(ST(1));
if (start_off > text_len)
croak("specified offset=%d is beyond text_len=%d",
(int)start_off, (int)text_len);
text_len -= start_off;
}
else
start_off = 0;
if ( items > 2)
{
len = SvIV(ST(2));
if (len > text_len)
croak("specified length=%d exceeds text_len=%d", (int)len, (int)text_len);
text_len = len;
}
CODE:
uint64_t fp = gen_fp(text+start_off, text_len);
RETVAL = newSVpvn(ll2x16(fp), 16);
OUTPUT:
RETVAL
SV *cat(x1, x2)
const char *x1
const char *x2
CODE:
uint64_t fp = cat_fp_x(x1, x2);
RETVAL = newSVpvn(ll2x16(fp), 16);
OUTPUT:
RETVAL
SV *ll2x(fp)
long fp;
CODE:
RETVAL = newSVpvn(ll2x16(fp), 16);
OUTPUT:
RETVAL
long x2l(x)
const char *x
CODE:
RETVAL = x2i64((unsigned char *)x, strlen(x));
OUTPUT:
RETVAL
SV*
cooked(sv_text, ...)
SV *sv_text
INIT:
STRLEN text_len; // like strlen(text), but including '\0' bytes. STRLEN is unsigned!
IV start_off;
IV len;
const char *text;
char *cooked_text;
int cooked_len;
SV *text_out;
text = (const char *)SvPV(sv_text, text_len);
if ( items > 1)
{
start_off = SvIV(ST(1));
if (start_off > text_len)
croak("specified offset=%d is beyond text_len=%d",
(int)start_off, (int)text_len);
text_len -= start_off;
}
else
start_off = 0;
if ( items > 2)
{
len = SvIV(ST(2));
if (len > text_len)
croak("specified length=%d exceeds text_len=%d", (int)len, (int)text_len);
text_len = len;
}
CODE:
cooked_text = (char *)malloc(text_len); // never longer than raw text;
cooked_len = cook_text(cooked_text, text+start_off, text_len);
// transfer output from C to perl
text_out = newSVpvn(cooked_text, cooked_len);
// forget C
free((char *)cooked_text);
// do not free text, it points into perl proper.
// (Would cause an abort in PerlIOBuf_close)
RETVAL = text_out;
OUTPUT:
RETVAL