The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
// Copyright 2010 Google Inc.
// Modified by Yuji Kaneda
// 
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// 
//      http://www.apache.org/licenses/LICENSE-2.0
// 
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// ------------------------------------------------------------------------

#include <string>
#include <math.h>
#include <assert.h>
#include <algorithm>

#include "openssl/md5.h"

// #include "public/szltype.h"
// #include "public/szlvalue.h"
// #include "public/szlencoder.h"
// #include "public/szldecoder.h"
// #include "public/szltabentry.h"

namespace SZaru {

// Return the dimensions of a sketch such that nTabs * tabSize ~= totalSize.
// Our algorithm assumes tabSize is a pow of 2, nTabs is odd.

template <typename Value>
void SzlSketch<Value>::Dims(int totalSize, int* nTabs, int* tabSize) {
  int ts = totalSize / 31;
  int bits;
  for (bits = 2; bits < 32 && ts > (1 << bits); bits++)
    ;
  int tabs;
  for (tabs = kMaxTabs; tabs > kMinTabs; tabs -= 2)
    if (((tabs - 2) << bits) < totalSize)
      break;
  *nTabs = tabs;
  *tabSize = 1 << bits;
}

template <typename Value>
SzlSketch<Value>::SzlSketch(int nTabs, int tabSize)
  : weights_(new Value[nTabs * tabSize]),
    nTabs_(nTabs),
    tabSize_(tabSize) {
  // SzlValues's are clear by default, so we don't need to clear weights_
  
  // Check for valid nTabs, pow(2) tabSize;
  // CHECK(nTabs >= kMinTabs && nTabs <= kMaxTabs && (nTabs & 1) == 1);
  // CHECK(tabSize > 0 && (tabSize & (tabSize - 1)) == 0);

  int bits;
  for (bits = 0; bits < 32 && tabSize > (1 << bits); bits++)
    ;
  tabBits_ = bits;
}

template <typename Value>
SzlSketch<Value>::~SzlSketch() {
  // int n = nTabs_ * tabSize_;
  // for (int i = 0; i <  n; i++)
  // weight_ops_.Clear(&weights_[i]);
  delete[] weights_;

  // for (int i = 0; i < nTabs_; i++)
  //    weight_ops_.Clear(&tmp_[i]);
}

// int SzlSketch<Value>::Memory() {
//   int mem = sizeof(SzlSketch);

//   int n = nTabs_ * tabSize_;
//   for (int i = 0; i < n; i++)
//     mem += weight_ops_.Memory(weights_[i]);

//   for (int i = 0; i < nTabs_; i++)
//     mem += weight_ops_.Memory(tmp_[i]);

//   return mem;
// }

// Encode the weights in the SzlSketch.
// void SzlSketch<Value>::Encode(SzlEncoder* enc) {
//   int n = nTabs_ * tabSize_;
//   for (int i = 0; i < n; i++)
//     weight_ops_.Encode(weights_[i], enc);
// }

// // Decode the weights in the SzlSketch.
// bool SzlSketch<Value>::Decode(SzlDecoder* dec) {
//   int n = nTabs_ * tabSize_;
//   for (int i = 0; i < n; i++)
//     if (!weight_ops_.Decode(dec, &weights_[i]))
//       return false;
//   return true;
// }

template <typename Value>
void SzlSketch<Value>::AddSketch(const SzlSketch& sketch) {
  assert(sketch.tabSize() == tabSize());
  int n = nTabs_ * tabSize_;
  for (int i = 0; i < n; ++i)
    // weight_ops_.Add(sketch.weights_[i], &weights_[i]);
    weights_[i] += sketch.weights_[i];
}

// Compute the indices into the sketch weights,
//   We need nTabs different hashes of the string,
//   which we get by computing md5 on slightly different strings.
template <typename Value>
void SzlSketch<Value>::ComputeIndex(const string& s, Index* index) {
  // original set of hash bits comes from a good hash of the key.
  uint8 digest[MD5_DIGEST_LENGTH];
  MD5Digest(s.data(), s.size(), &digest);

  int digi = 0;
  uint32 bits = 0;              // shift register with our hash bits
  int nbits = 0;
  int origin = 0;
  for (int i = 0; i < nTabs_; ++i) {
    // get enough hash bits from our good hash function
    while (nbits < tabBits_ + 1) {
      if (digi == MD5_DIGEST_LENGTH) {
        // rehash the hash to get more hash bits
        MD5Digest(digest, MD5_DIGEST_LENGTH, &digest);
        digi = 0;
      }
      bits |= digest[digi++] << nbits;
      nbits += 8;
    }

    // compute the index into this row of the sketch
    // and the sign we should use while summing.
    int ind = bits & ((1 << tabBits_) - 1);
    index->index[i].elem = origin + ind;
    origin += tabSize_;
    bits >>= tabBits_;
    index->index[i].sign = bits & 1;
    bits >>= 1;
    nbits -= tabBits_ + 1;
  }
}

// Adjust the sketch for a given index.
//   Just add or subtract the weight for each entry.
template <typename Value>
void SzlSketch<Value>::AddSub(Index* index,
			      Value value,
			      int isAdd) {
  for (int i = 0; i < nTabs_; ++i) {
    assert(index->index[i].elem >= static_cast<size_t>(i) * tabSize_
           && index->index[i].elem < (static_cast<size_t>(i) + 1) * tabSize_
           && index->index[i].sign <= 1);
    Value *w = &weights_[index->index[i].elem];
    if (index->index[i].sign == static_cast<uint32_t>(isAdd))
      // weight_ops_.Sub(value, w);
      *w -= value;
    else
      // weight_ops_.Add(value, w);
      *w += value;
  }
}

/*
 * Comparison function for finding the median
 */
// struct SzlSketchEstLess {
//   SzlSketchEstLess(const SzlOps& ops, int pos): ops_(ops), pos_(pos) {}
//   bool operator()(const double* w1, const double* w2) const {
//     return ops_.LessAtPos(*w1, pos_, *w2);
//   }
//   const SzlOps& ops_;
//   int pos_;
// };

// Estimate the weight for an index.
//   We use the median as the estimate, since it's supposed to be superior
//   to using the mean.
template <typename Value>
void SzlSketch<Value>::Estimate(Index* index, Value* est) {
  // first make the array of all weights with signs corrected
  Value values[kMaxTabs];
  for (int i = 0; i < nTabs_; ++i) {
    Value w = weights_[index->index[i].elem];
    if (index->index[i].sign) {
      // values[i] = &tmp_[i];
      // weight_ops_.Negate(*w, &tmp_[i]);
      values[i] = -w;
    } else {
      values[i] = w;
    }
  }

  unsigned nTabs2 = nTabs_ >> 1;
  Value* mid = &values[nTabs2];
  Value* last = &values[nTabs_];
  // for (int i = 0; i < weight_ops_.nflats(); i++) {
//     SzlSketchEstLess estless(weight_ops_, i);
//     nth_element(values, mid, last, estless);
//     weight_ops_.AssignAtPos(**mid, i, est);
//   }
  
  nth_element(values, mid, last);
  est = mid;
}

// Compute the estimated standard deviation of values in the sketch.
template <typename Value>
void SzlSketch<Value>::StdDeviation(double* deviations) {
  // int nvals = weight_ops_.nflats();
  int nvals = 1;
  for (int j = 0; j < nvals; j++)
    deviations[j] = 0.;

  if (tabSize_ == 0)
    return;

  // compute the error in each column of the sketch.
  double* columns = new double[tabSize_ * nvals];
  Value col;
  double* ave = new double[nvals];
  for (int i = 0; i < nvals; i++)
    ave[i] = 0.;
  double* column = columns;
  double* colv = new double[nvals];
  for (int i = 0; i < tabSize_; i++) {
    Index index;
    int origin = 0;
    for (int row = 0; row < nTabs_; row++) {
      index.index[row].elem = i + origin;
      index.index[row].sign = 0;
      origin += tabSize_;
    }
    Estimate(&index, &col);
    //weight_ops_.ToFloat(col, colv);
    colv[0] = col;

    for (int j = 0; j < nvals; j++) {
      double d = colv[j];
      column[j] = d;
      ave[j] += d;
    }
    column += nvals;
  }

  // Compute the standard deviation of columns.
  for (int j = 0; j < nvals; j++)
    ave[j] /= tabSize_;
  column = columns;
  for (int i = 0; i < tabSize_; i++) {
    for (int j = 0; j < nvals; j++) {
      double d = column[j] - ave[j];
      deviations[j] += d * d;
    }
    column += nvals;
  }

  // weight_ops_.Clear(&col);
  delete[] columns;
  delete[] ave;
  delete[] colv;

  for (int j = 0; j < nvals; j++) {
    if (deviations[j] > .00000001)
      deviations[j] = sqrt(deviations[j] / tabSize_);
    else
      deviations[j] = 0.;
  }
}

}