#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <ctype.h>
#include <assert.h>
//#define MAXN 1000000000UL /* 10^ 9 50847534*/
#define MAXN 10000000000UL /* 10^10 455052511 */
//#define MAXN 100000000UL /* 10^ 8 5761455*/
#define WTYPE unsigned long
#define BITS_PER_WORD (8 * sizeof(WTYPE))
#define MAXBIT (BITS_PER_WORD-1)
#define NWORDS(bits) ( ((bits)+BITS_PER_WORD-1) / BITS_PER_WORD )
#define NBYTES(bits) ( ((bits)+8-1) / 8 )
#define W_CONST(c) ((WTYPE)c##UL)
#define W_ZERO W_CONST(0)
#define W_ONE W_CONST(1)
#define W_FFFF W_CONST(~0)
#define SET_ARRAY_BIT(ar,n) \
ar[(n)/BITS_PER_WORD] |= (W_ONE << ((n)%BITS_PER_WORD))
#define XOR_ARRAY_BIT(ar,n) \
ar[(n)/BITS_PER_WORD] ^= (W_ONE << ((n)%BITS_PER_WORD))
#define CLR_ARRAY_BIT(ar,n) \
ar[(n)/BITS_PER_WORD] &= ~(W_ONE << ((n)%BITS_PER_WORD))
#define IS_SET_ARRAY_BIT(ar,n) \
(ar[(n)/BITS_PER_WORD] & (W_ONE << ((n)%BITS_PER_WORD)) )
/* primes 2,3,5,7,11,13,17,19,23,29,31 as bits */
#define SMALL_PRIMES_MASK W_CONST(2693408940)
/* if (MOD235_MASK >> (n%30)) & 1, n is a multiple of 2, 3, or 5. */
#define MOD235_MASK W_CONST(1601558397)
static int _is_prime7(WTYPE x)
{
WTYPE q, i;
/* Check for small primes */
q = x/ 7; if (q< 7) return 1; if (x==(q* 7)) return 0;
q = x/11; if (q<11) return 1; if (x==(q*11)) return 0;
q = x/13; if (q<13) return 1; if (x==(q*13)) return 0;
q = x/17; if (q<17) return 1; if (x==(q*17)) return 0;
q = x/19; if (q<19) return 1; if (x==(q*19)) return 0;
q = x/23; if (q<23) return 1; if (x==(q*23)) return 0;
q = x/29; if (q<29) return 1; if (x==(q*29)) return 0;
/* wheel factorization, mod-30 loop */
i = 31;
while (1) {
q = x/i; if (q<i) return 1; if (x==(q*i)) return 0; i += 6;
q = x/i; if (q<i) return 1; if (x==(q*i)) return 0; i += 4;
q = x/i; if (q<i) return 1; if (x==(q*i)) return 0; i += 2;
q = x/i; if (q<i) return 1; if (x==(q*i)) return 0; i += 4;
q = x/i; if (q<i) return 1; if (x==(q*i)) return 0; i += 2;
q = x/i; if (q<i) return 1; if (x==(q*i)) return 0; i += 4;
q = x/i; if (q<i) return 1; if (x==(q*i)) return 0; i += 6;
q = x/i; if (q<i) return 1; if (x==(q*i)) return 0; i += 2;
}
return 1;
}
static const long prime_next_small[] =
{2,2,3,5,5,7,7,11,11,11,11,13,13,17,17,17,17,19,19,23,23,23,23,
29,29,29,29,29,29,31,31,37,37,37,37,37,37,41,41,41,41,43,43,47,
47,47,47,53,53,53,53,53,53,59,59,59,59,59,59,61,61,67,67,67,67,67,67,71};
#define NPRIME_NEXT_SMALL (sizeof(prime_next_small)/sizeof(prime_next_small[0]))
static WTYPE next_prime(WTYPE x)
{
static const WTYPE L = 30;
WTYPE k0, n;
static const WTYPE indices[] = {1, 7, 11, 13, 17, 19, 23, 29};
static const WTYPE M = 8;
int index;
if (x < NPRIME_NEXT_SMALL)
return prime_next_small[x];
x++;
k0 = x/L;
index = 0; while ((x-k0*L) > indices[index]) index++;
n = L*k0 + indices[index];
while (!_is_prime7(n)) {
if (++index == M) { k0++; index = 0; }
n = L*k0 + indices[index];
}
return n;
}
/*
* Various sieves.
* Timings for counting the first 10^10 (10B) primes, in seconds.
* Pi(10^10) = 455,052,511
*
* Note: These numbers are old -- Math::Prime::Util has much faster
* segment sieving now (faster than primegen), and the LMO prime
* count is ridiculously fast compared to sieving (it takes only
* a few milliseconds).
*
* 1.9 primesieve 3.6 (even faster with multiple threads)
* 5.6 Tomás Oliveira e Silva's segmented sieve v2 (Sep 2010)
* 6.6 primegen (optimized Sieve of Atkin)
* 11.2 Tomás Oliveira e Silva's segmented sieve v1 (May 2003)
*
* 15.9 sieve_erat30 (my wheel 30 Erat)
* 17.2 sieve_erat30tm (Terje wheel 30)
* 31.9 sieve_eratek (Sorensen inspired)
* 35.5 sieve_erat (Simple Erat)
* 35.5 sieve_erat23 (simple erat mod)
* 33.4 sieve_atkin (Praxis)
* 72.8 sieve_atkin_2 (Fixup of naive)
* 91.6 sieve_atkin_naive (Wikipedia-like)
*
* Retested after ensuring machine was idle. As expected, the segmented
* sievers improve some, and the sievers that fill giant memory spaces
* improve a lot when other memory traffic is removed.
*/
/*
* Straightforward Sieve of Eratosthenes.
*
* Uses 1 bit per odd number.
*
* Time for Pi(10^10) = 54.6s
*/
static WTYPE* sieve_erat(WTYPE end)
{
WTYPE* mem;
size_t n, s;
size_t last = (end+1)/2;
mem = (WTYPE*) calloc( NWORDS(last), sizeof(WTYPE) );
assert(mem != 0);
// Tight:
// for (n = 3; (n*n) <= end; n = next_prime(n))
// for (s = n*n; s <= end; s += 2*n)
// SET_ARRAY_BIT(mem,s/2);
n = 3;
while ( (n*n) <= end) {
for (s = n*n; s <= end; s += 2*n)
SET_ARRAY_BIT(mem,s/2);
// Could do: n = next_prime(n)
do { n += 2; } while (IS_SET_ARRAY_BIT(mem,n/2));
}
SET_ARRAY_BIT(mem, 1/2); /* 1 is composite */
return mem;
}
/*
* Naive Wheel factoring based on algorithm Ek from Sorenson 1991
*
* Uses 1 bit per odd number.
*
* Note we're including initialization code that marks all 3,5 multiples.
* If the caller didn't look at these, this could be skipped.
*
* Time for Pi(10^10) = 46.4s
*/
static WTYPE* sieve_eratek(WTYPE end)
{
WTYPE* mem;
size_t p, f, x;
size_t last = (end+1)/2;
static const WTYPE wheel[] = {1, 7, 11, 13, 17, 19, 23, 29};
static const WTYPE W[] = {0,6,0,0,0,0,0,4,0,0,0,2,0,4,0,0,0,2,0,4,0,0,0,6,0,0,0,0,0,2,0};
mem = (WTYPE*) calloc( NWORDS(last), sizeof(WTYPE) );
assert(mem != 0);
/* Mark all multiples of 3 and 5 as composite. */
//for (p = 3*3; p <= end; p += 2*3) SET_ARRAY_BIT(mem,p/2);
//for (p = 5*5; p <= end; p += 2*5) SET_ARRAY_BIT(mem,p/2);
p = 9;
while (p <= end) {
SET_ARRAY_BIT(mem,p/2); p += 6; if (p > end) break; // mark 9, p = 15
SET_ARRAY_BIT(mem,p/2); p += 6; if (p > end) break; // mark 15, p = 21
SET_ARRAY_BIT(mem,p/2); p += 4; if (p > end) break; // mark 21, p = 25
SET_ARRAY_BIT(mem,p/2); p += 2; if (p > end) break; // mark 25, p = 27
SET_ARRAY_BIT(mem,p/2); p += 6; if (p > end) break; // mark 27, p = 33
SET_ARRAY_BIT(mem,p/2); p += 2; if (p > end) break; // mark 33, p = 35
SET_ARRAY_BIT(mem,p/2); p += 4; // mark 35, p = 39
}
p = 7;
while ((p*p) <= end) {
{
size_t fidx = p%30;
f = p;
/* Here's the problem -- for each prime, we're walking the array from
* start to finish 8 times. The operation count is the same as the
* faster wheel-30 sieves, but this is just horrible for the cache. */
while (f < p+30) {
for (x = p*f; x <= end; x += p*30)
SET_ARRAY_BIT(mem,x/2);
size_t move = W[fidx];
f += move; fidx += move;
if (fidx > 30) fidx -= 30;
}
}
//p = next_prime(p);
do { p += 2; } while (IS_SET_ARRAY_BIT(mem,p/2));
}
SET_ARRAY_BIT(mem, 1/2); /* 1 is composite */
CLR_ARRAY_BIT(mem, 3/2); /* 3 is prime */
CLR_ARRAY_BIT(mem, 5/2); /* 5 is prime */
return mem;
}
/*
* Straightforward Sieve of Eratosthenes, but skipping 3.
*
* Uses 1 bit per odd number.
*
* Time for Pi(10^10) = 48.5s
*/
static WTYPE* sieve_base23(WTYPE end)
{
WTYPE* mem;
size_t n, s;
size_t last = (end+1)/2;
mem = (WTYPE*) calloc( NWORDS(last), sizeof(WTYPE) );
assert(mem != 0);
SET_ARRAY_BIT(mem, 1/2); /* 1 is composite */
/* Mark all multiples of 3. Could skip if callers know this. */
for (n = 3*3; n <= end; n += 2*3) SET_ARRAY_BIT(mem,n/2);
n = 5;
while ( (n*n) <= end ) {
if (!IS_SET_ARRAY_BIT(mem,n/2)) {
for (s = n*n; s <= end; s += 2*n) {
SET_ARRAY_BIT(mem,s/2);
}
}
n += 2;
if ( ((n*n) <= end) && (!IS_SET_ARRAY_BIT(mem,n/2)) ) {
for (s = n*n; s <= end; s += 2*n) {
SET_ARRAY_BIT(mem,s/2);
}
}
n += 4;
}
return mem;
}
static unsigned char mask_tab[30] = {
0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 4, 0, 8, 0,
0, 0, 16, 0, 32, 0, 0, 0, 64, 0, 0, 0, 0, 0, 128 };
#define IS_SIEVE30_SET(n) \
(mem[n/30] & mask_tab[n%30])
/* Proper wheel 30 sieve based on code from Terje Mathisen (1998) */
static unsigned char* sieve_erat30tm(WTYPE end)
{
unsigned char* mem;
size_t max_buf, buffer_words;
WTYPE prime;
max_buf = (end + 29) / 30;
buffer_words = (end + (30*sizeof(WTYPE)) - 1) / (30*sizeof(WTYPE));
mem = (unsigned char*) calloc( buffer_words, sizeof(WTYPE) );
assert(mem != 0);
for (prime = 7; (prime*prime) <= end; prime = next_prime(prime)) {
WTYPE step = prime * 2;
WTYPE curr = prime * prime;
WTYPE dcurr = curr/30;
WTYPE mcurr = curr - dcurr*30;
WTYPE i;
WTYPE dstep[30];
WTYPE nextm[30];
for (i = 1; i < 30; i += 2) {
WTYPE d, m;
WTYPE s = i;
do {
s += step;
d = s/30;
m = s - d*30;
} while (mask_tab[m] == 0);
dstep[i] = d;
nextm[i] = m;
}
do {
//if ((mem[dcurr] & mask_tab[mcurr]) == 0)
mem[dcurr] |= mask_tab[mcurr];
dcurr += dstep[mcurr];
mcurr = nextm[mcurr];
} while (dcurr < max_buf);
}
mem[0] |= mask_tab[1]; /* 1 is composite */
return mem;
}
/* My wheel 30 sieve */
static unsigned char* sieve_erat30(WTYPE end)
{
unsigned char* mem;
size_t max_buf, buffer_words;
WTYPE prime;
max_buf = (end + 29) / 30;
buffer_words = (end + (30*sizeof(WTYPE)) - 1) / (30*sizeof(WTYPE));
mem = (unsigned char*) calloc( buffer_words, sizeof(WTYPE) );
assert(mem != 0);
/* Shortcut to mark 7. Purely an optimization. */
/* However, we could tweak a little by doing:
* - use malloc instead of calloc
* - use a loop of memcpy(len*2) to speed up vs. the 7-byte loop below
* - memset 0 between max_buf and buffer_words
* If you take this one step further you can initialize a few more primes.
* You may then discover Tomás Oliveira e Silva already did that in his
* segmented siever (v2) where he constructs a pattern of 2/3/5/7/11/13
* marks, and then initializes buckets from that (though he does a byte
* copy instead of using memcpy).
*/
#if 1
if ( (7*7) <= end ) {
WTYPE d = 1;
while ( (d+6) < max_buf) {
mem[d+0] = 0x20; mem[d+1] = 0x10; mem[d+2] = 0x81; mem[d+3] = 0x08;
mem[d+4] = 0x04; mem[d+5] = 0x40; mem[d+6] = 0x02; d += 7;
}
if ( d < max_buf ) mem[d++] = 0x20;
if ( d < max_buf ) mem[d++] = 0x10;
if ( d < max_buf ) mem[d++] = 0x81;
if ( d < max_buf ) mem[d++] = 0x08;
if ( d < max_buf ) mem[d++] = 0x04;
if ( d < max_buf ) mem[d++] = 0x40;
//assert(d >= max_buf);
}
#endif
for (prime = 11; (prime*prime) <= end; prime = next_prime(prime)) {
WTYPE d = (prime*prime)/30;
WTYPE m = (prime*prime) - d*30;
WTYPE dinc = (2*prime)/30;
WTYPE minc = (2*prime) - dinc*30;
WTYPE wdinc[8];
unsigned char wmask[8];
int i;
/* Find the positions of the next composites we will mark */
for (i = 1; i <= 8; i++) {
WTYPE dlast = d;
do {
d += dinc;
m += minc;
if (m >= 30) { d++; m -= 30; }
} while ( mask_tab[m] == 0 );
wdinc[i-1] = d - dlast;
wmask[i%8] = mask_tab[m];
}
d -= prime;
//assert(d == ((prime*prime)/30));
//assert(d < max_buf);
//assert(prime = (wdinc[0]+wdinc[1]+wdinc[2]+wdinc[3]+wdinc[4]+wdinc[5]+wdinc[6]+wdinc[7]));
/* Mark them */
i = 0;
do {
mem[d] |= wmask[i];
d += wdinc[i];
i = (i+1) & 7;
} while (d < max_buf);
}
mem[0] |= mask_tab[1]; /* 1 is composite */
return mem;
}
/*
* Naive Sieve of Atkin.
*
* Uses 1 bit per odd number.
*
* This is really slow. Just keeping it here as a reference.
*
* Time for Pi(10^10) = 123.5s
*/
static WTYPE* sieve_atkin_naive(WTYPE end)
{
WTYPE* mem;
size_t x, y, n, sqlimit;
size_t last = (end+1+1)/2;
long loopend, y_limit, dn;
end++;
mem = (WTYPE*) malloc( NWORDS(last) * sizeof(WTYPE) );
assert(mem != 0);
/* mark everything as a composite */
memset(mem, 0xFF, NBYTES(last));
sqlimit = sqrt(end);
for (x = 1; x <= sqlimit; x++) {
for (y = 1; y <= sqlimit; y++) {
n = 4*x*x + y*y;
if ( (n <= end) && (n % 12 == 1 || n % 12 == 5) )
XOR_ARRAY_BIT(mem,n/2);
n = 3*x*x + y*y;
if ( (n <= end) && (n % 12 == 7) )
XOR_ARRAY_BIT(mem,n/2);
n = 3*x*x - y*y;
if ( (n <= end) && (x > y) && (n % 12 == 11) )
XOR_ARRAY_BIT(mem,n/2);
}
}
/* Mark all squares of primes as composite */
for (n = 5; n <= sqlimit; n += 2)
if (!IS_SET_ARRAY_BIT(mem,n/2))
for (y = n*n; y <= end; y += 2*n*n)
SET_ARRAY_BIT(mem,y/2);
CLR_ARRAY_BIT(mem, 3/2); /* 3 is prime */
return mem;
}
/*
* Better Sieve of Atkin.
*
* Uses 1 bit per odd number.
*
* Just some simple optimizations to make it a little better. Still not good.
*
* Time for Pi(10^10) = 97.2s
*/
static WTYPE* sieve_atkin_2(WTYPE end)
{
WTYPE* mem;
size_t x, y, n, sqlimit;
size_t last = (end+1+1)/2;
long loopend, y_limit, dn;
end++;
mem = (WTYPE*) malloc( NWORDS(last) * sizeof(WTYPE) );
assert(mem != 0);
/* mark everything as a composite */
memset(mem, 0xFF, NBYTES(last));
sqlimit = sqrtf(end);
for (x = 1; x <= sqlimit; x++) {
{
size_t xx4 = 4*x*x;
y = 1;
for (n = xx4+1; n <= end; n = xx4+y*y) {
size_t nmod12 = n%12;
if ( (nmod12 == 1) || (nmod12 == 5) )
XOR_ARRAY_BIT(mem,n/2);
y++;
}
}
{
size_t xx3 = 3*x*x;
y = 1;
for (n = xx3+1; n <= end; n = xx3+y*y) {
size_t nmod12 = n%12;
if (nmod12 == 7)
XOR_ARRAY_BIT(mem,n/2);
y++;
}
y = x-1;
while ( y*y >= xx3 )
y--;
for (n = xx3-y*y; y >= 1 && n <= end; n = xx3-y*y) {
size_t nmod12 = n%12;
if (nmod12 == 11)
XOR_ARRAY_BIT(mem,n/2);
y--;
}
}
}
/* Mark all squares of primes as composite */
for (n = 5; n <= sqlimit; n += 2)
if (!IS_SET_ARRAY_BIT(mem,n/2))
for (y = n*n; y <= end; y += 2*n*n)
SET_ARRAY_BIT(mem,y/2);
CLR_ARRAY_BIT(mem, 3/2); /* 3 is prime */
return mem;
}
/*
* Better Sieve of Atkin.
*
* Uses 1 bit per odd number.
*
* From Mike on Programming Praxis. Pretty fast, but not really an improvement
* over a good SoE. Note that the limits aren't handled quite right, so I have
* to add an "if (n <= end)" in front of each XOR.
*
* Time for Pi(10^10) = 53.9s
*/
static WTYPE* sieve_atkin(WTYPE end)
{
WTYPE* mem;
size_t n, s, k;
size_t last = (end+1)/2; /* Extra space allocated */
long loopend, y_limit, dn;
end++;
mem = (WTYPE*) malloc( NWORDS(last) * sizeof(WTYPE) );
assert(mem != 0);
/* mark everything as a composite */
memset(mem, 0xFF, NBYTES(last));
{
long xx3 = 3;
long dxx;
loopend = 12 * (long) sqrtf((end-1)/3.0);
for (dxx = 0; dxx < loopend; dxx += 24) {
xx3 += dxx;
y_limit = (long) (12.0*sqrtf( end - xx3 )) - 36;
n = xx3 + 16;
for (dn = -12; dn < (y_limit+1); dn += 72) {
n += dn;
if (n <= end) XOR_ARRAY_BIT(mem,n/2);
}
n = xx3 + 4;
for (dn = 12; dn < (y_limit+1); dn += 72) {
n += dn;
if (n <= end) XOR_ARRAY_BIT(mem,n/2);
}
}
}
{
long xx4 = 0;
long dxx4;
loopend = 8 * (long) sqrtf((end-1)/4.0) + 4;
for (dxx4 = 4; dxx4 < loopend; dxx4 += 8) {
xx4 += dxx4;
n = xx4 + 1;
if (xx4%3) {
y_limit = 4 * (long)sqrtf( end - xx4 ) - 3;
for (dn = 0; dn < y_limit; dn += 8) {
n += dn;
if (n <= end) XOR_ARRAY_BIT(mem,n/2);
}
} else {
y_limit = 12 * (long)sqrtf( end - xx4 ) - 36;
n = xx4 + 25;
for (dn = -24; dn < (y_limit+1); dn += 72) {
n += dn;
if (n <= end) XOR_ARRAY_BIT(mem,n/2);
}
n = xx4 + 1;
for (dn = 24; dn < (y_limit+1); dn += 72) {
n += dn;
if (n <= end) XOR_ARRAY_BIT(mem,n/2);
}
}
}
}
{
long xx = 1;
long x;
loopend = (long) sqrtf((float)end/2.0) + 1;
for (x = 3; x < loopend; x += 2) {
xx += 4*x - 4;
n = 3*xx;
if (n > end) {
long min_y = (( (long) (sqrtf(n - end)) >>2)<<2);
long yy = min_y * min_y;
n -= yy;
s = 4*min_y + 4;
} else {
s = 4;
}
for (dn = s; dn < 4*x; dn += 8) {
n -= dn;
if ((n <= end) && ((n%12) == 11))
XOR_ARRAY_BIT(mem,n/2);
}
}
xx = 0;
loopend = (long) sqrtf((float)end/2.0) + 1;
for (x = 2; x < loopend; x += 2) {
xx += 4*x - 4;
n = 3*xx;
if (n > end) {
long min_y = (( (long) (sqrtf(n - end)) >>2)<<2)-1;
long yy = min_y * min_y;
n -= yy;
s = 4*min_y + 4;
} else {
n--;
s = 0;
}
for (dn = s; dn < 4*x; dn += 8) {
n -= dn;
if ((n <= end) && ((n%12) == 11))
XOR_ARRAY_BIT(mem,n/2);
}
}
}
/* Mark all squares of primes as composite */
loopend = (long) sqrtf(end) + 1;
for (n = 5; n < loopend; n += 2)
if (!IS_SET_ARRAY_BIT(mem,n/2))
for (k = n*n; k <= end; k += 2*n*n)
SET_ARRAY_BIT(mem,k/2);
CLR_ARRAY_BIT(mem, 3/2); /* 3 is prime */
CLR_ARRAY_BIT(mem, 5/2); /* 5 is prime */
return mem;
}
int main(void)
{
WTYPE s;
WTYPE high = (MAXN-1)/2;
WTYPE full_words;
int count = 1;
#if 0
WTYPE* sieve = sieve_erat(MAXN);
//WTYPE* sieve = sieve_eratek(MAXN);
//WTYPE* sieve = sieve_base23(MAXN);
//WTYPE* sieve = sieve_atkin_naive(MAXN);
//WTYPE* sieve = sieve_atkin_2(MAXN);
//WTYPE* sieve = sieve_atkin(MAXN);
full_words = NWORDS(high) - 1;
s = 0;
/* Count 0 bits using Wegner/Lehmer/Kernighan method. */
for (; s < full_words; s++) {
WTYPE word = ~sieve[s];
while (word) {
word &= word-1;
count++;
}
}
/* Count primes in the last (partial) word */
for (s = full_words*BITS_PER_WORD; s <= high; s++)
if ( ! IS_SET_ARRAY_BIT(sieve, s) )
count++;
#else
//unsigned char* sieve = sieve_erat30tm(MAXN);
unsigned char* sieve = sieve_erat30(MAXN);
count = 3; /* 2, 3, 5 */
full_words = ((MAXN-1)/30) / sizeof(WTYPE);
WTYPE* wsieve = (WTYPE*) sieve;
for (s = 0; s < full_words; s++) {
WTYPE word = ~wsieve[s];
while (word) {
word &= word-1;
count++;
}
}
/* Count primes in the last (partial) word */
{
static const WTYPE wheel[] = {1, 7, 11, 13, 17, 19, 23, 29};
WTYPE m = 0;
WTYPE d = full_words * sizeof(WTYPE);
while ( (d*30+wheel[m]) <= MAXN ) {
if ((sieve[d] & mask_tab[wheel[m]]) == 0)
count++;
m++; if (m == 8) { m = 0; d++; }
}
}
#endif
free(sieve);
printf("Pi(%lu) = %d\n", MAXN, count);
return 0;
}