The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
/*
  Darts -- Double-ARray Trie System

  $Id: darts.h,v 1.1 2009/04/06 10:25:41 dankogai Exp $;

  Copyright(C) 2001-2007 Taku Kudo <taku@chasen.org>
*/
#ifndef DARTS_H_
#define DARTS_H_

#define DARTS_VERSION "0.32"
#include <vector>
#include <cstring>
#include <cstdio>

#ifdef HAVE_ZLIB_H
namespace zlib {
#include <zlib.h>
}
#define SH(p)((unsigned short)(unsigned char)((p)[0]) | ((unsigned short)(unsigned char)((p)[1]) << 8))
#define LG(p)((unsigned long)(SH(p)) |((unsigned long)(SH((p)+2)) << 16))
#endif

namespace Darts {

  template <class T> inline T _max(T x, T y) { return(x > y) ? x : y; }
  template <class T> inline T* _resize(T* ptr, size_t n, size_t l, T v) {
    T *tmp = new T[l];
    for (size_t i = 0; i < n; ++i) tmp[i] = ptr[i];
    for (size_t i = n; i < l; ++i) tmp[i] = v;
    delete [] ptr;
    return tmp;
  }

  template <class T>
  class Length {
  public: size_t operator()(const T *key) const
    { size_t i; for (i = 0; key[i] != static_cast<T>(0); ++i) {} return i; }
  };

  template <> class Length<char> {
  public: size_t operator()(const char *key) const
    { return std::strlen(key); }
  };

  template  <class node_type_,  class node_u_type_,
             class array_type_, class array_u_type_,
             class length_func_ = Length<node_type_> >
  class DoubleArrayImpl {
  public:
    typedef array_type_  value_type;
    typedef node_type_   key_type;
    typedef array_type_  result_type;  // for compatibility

    struct result_pair_type {
      value_type value;
      size_t     length;
    };

    explicit DoubleArrayImpl(): array_(0), used_(0),
                                size_(0), alloc_size_(0),
                                no_delete_(0), error_(0) {}
    virtual ~DoubleArrayImpl() { clear(); }

    void set_result(value_type *x, value_type r, size_t) const {
      *x = r;
    }

    void set_result(result_pair_type *x, value_type r, size_t l) const {
      x->value = r;
      x->length = l;
    }

    void set_array(void *ptr, size_t size = 0) {
      clear();
      array_ = reinterpret_cast<unit_t *>(ptr);
      no_delete_ = true;
      size_ = size;
    }

    const void *array() const {
      return const_cast<const void *>(reinterpret_cast<void *>(array_));
    }

    void clear() {
      if (!no_delete_)
        delete [] array_;
      delete [] used_;
      array_ = 0;
      used_ = 0;
      alloc_size_ = 0;
      size_ = 0;
      no_delete_ = false;
    }

    size_t unit_size()  const { return sizeof(unit_t); }
    size_t size()       const { return size_; }
    size_t total_size() const { return size_ * sizeof(unit_t); }

    size_t nonzero_size() const {
      size_t result = 0;
      for (size_t i = 0; i < size_; ++i)
        if (array_[i].check) ++result;
      return result;
    }

    int build(size_t     key_size,
              const key_type   **key,
              const size_t     *length = 0,
              const value_type *value = 0,
              int (*progress_func)(size_t, size_t) = 0) {
      if (!key_size || !key) return 0;

      progress_func_ = progress_func;
      key_           = key;
      length_        = length;
      key_size_      = key_size;
      value_         = value;
      progress_      = 0;

      resize(8192);

      array_[0].base = 1;
      next_check_pos_ = 0;

      node_t root_node;
      root_node.left  = 0;
      root_node.right = key_size;
      root_node.depth = 0;

      std::vector<node_t> siblings;
      fetch(root_node, siblings);
      insert(siblings);

      size_ += (1 << 8 * sizeof(key_type)) + 1;
      if (size_ >= alloc_size_) resize(size_);

      delete [] used_;
      used_ = 0;

      return error_;
    }

    int open(const char *file,
             const char *mode = "rb",
             size_t offset = 0,
             size_t size = 0) {
      std::FILE *fp = std::fopen(file, mode);
      if (!fp) return -1;
      if (std::fseek(fp, offset, SEEK_SET) != 0) return -1;

      if (!size) {
        if (std::fseek(fp, 0L,     SEEK_END) != 0) return -1;
        size = std::ftell(fp);
        if (std::fseek(fp, offset, SEEK_SET) != 0) return -1;
      }

      clear();

      size_ = size;
      size_ /= sizeof(unit_t);
      array_ = new unit_t[size_];
      if (size_ != std::fread(reinterpret_cast<unit_t *>(array_),
                              sizeof(unit_t), size_, fp)) return -1;
      std::fclose(fp);

      return 0;
    }

    int save(const char *file,
             const char *mode = "wb",
             size_t offset = 0) {
      if (!size_) return -1;
      std::FILE *fp = std::fopen(file, mode);
      if (!fp) return -1;
      if (size_ != std::fwrite(reinterpret_cast<unit_t *>(array_),
                               sizeof(unit_t), size_, fp))
        return -1;
      std::fclose(fp);
      return 0;
    }

#ifdef HAVE_ZLIB_H
    int gzopen(const char *file,
               const char *mode = "rb",
               size_t offset = 0,
               size_t size = 0) {
      std::FILE *fp  = std::fopen(file, mode);
      if (!fp) return -1;
      clear();

      size_ = size;
      if (!size_) {
        if (-1L != static_cast<long>(std::fseek(fp, -8, SEEK_END))) {
          char buf[8];
          if (std::fread(static_cast<char*>(buf),
                         1, 8, fp) != sizeof(buf)) {
            std::fclose(fp);
            return -1;
          }
          size_ = LG(buf+4);
          size_ /= sizeof(unit_t);
        }
      }
      std::fclose(fp);

      if (!size_) return -1;

      zlib::gzFile gzfp = zlib::gzopen(file, mode);
      if (!gzfp) return -1;
      array_ = new unit_t[size_];
      if (zlib::gzseek(gzfp, offset, SEEK_SET) != 0) return -1;
      zlib::gzread(gzfp, reinterpret_cast<unit_t *>(array_),
                   sizeof(unit_t) * size_);
      zlib::gzclose(gzfp);
      return 0;
    }

    int gzsave(const char *file, const char *mode = "wb",
               size_t offset = 0) {
      zlib::gzFile gzfp = zlib::gzopen(file, mode);
      if (!gzfp) return -1;
      zlib::gzwrite(gzfp, reinterpret_cast<unit_t *>(array_),
                    sizeof(unit_t) * size_);
      zlib::gzclose(gzfp);
      return 0;
    }
#endif

    template <class T>
    inline void exactMatchSearch(const key_type *key,
                                 T & result,
                                 size_t len = 0,
                                 size_t node_pos = 0) const {
      result = exactMatchSearch<T> (key, len, node_pos);
      return;
    }

    template <class T>
    inline T exactMatchSearch(const key_type *key,
                              size_t len = 0,
                              size_t node_pos = 0) const {
      if (!len) len = length_func_()(key);

      T result;
      set_result(&result, -1, 0);

      register array_type_  b = array_[node_pos].base;
      register array_u_type_ p;

      for (register size_t i = 0; i < len; ++i) {
        p = b +(node_u_type_)(key[i]) + 1;
        if (static_cast<array_u_type_>(b) == array_[p].check)
          b = array_[p].base;
        else
          return result;
      }

      p = b;
      array_type_ n = array_[p].base;
      if (static_cast<array_u_type_>(b) == array_[p].check && n < 0)
        set_result(&result, -n-1, len);

      return result;
    }

    template <class T>
    size_t commonPrefixSearch(const key_type *key,
                              T* result,
                              size_t result_len,
                              size_t len = 0,
                              size_t node_pos = 0) const {
      if (!len) len = length_func_()(key);

      register array_type_  b   = array_[node_pos].base;
      register size_t     num = 0;
      register array_type_  n;
      register array_u_type_ p;

      for (register size_t i = 0; i < len; ++i) {
        p = b;  // + 0;
        n = array_[p].base;
        if ((array_u_type_) b == array_[p].check && n < 0) {
          // result[num] = -n-1;
          if (num < result_len) set_result(&result[num], -n-1, i);
          ++num;
        }

        p = b +(node_u_type_)(key[i]) + 1;
        if ((array_u_type_) b == array_[p].check)
          b = array_[p].base;
        else
          return num;
      }

      p = b;
      n = array_[p].base;

      if ((array_u_type_)b == array_[p].check && n < 0) {
        if (num < result_len) set_result(&result[num], -n-1, len);
        ++num;
      }

      return num;
    }

    value_type traverse(const key_type *key,
                        size_t &node_pos,
                        size_t &key_pos,
                        size_t len = 0) const {
      if (!len) len = length_func_()(key);

      register array_type_  b = array_[node_pos].base;
      register array_u_type_ p;

      for (; key_pos < len; ++key_pos) {
        p = b + (node_u_type_)(key[key_pos]) + 1;
        if (static_cast<array_u_type_>(b) == array_[p].check) {
          node_pos = p;
          b = array_[p].base;
        } else {
          return -2;  // no node
        }
      }

      p = b;
      array_type_ n = array_[p].base;
      if (static_cast<array_u_type_>(b) == array_[p].check && n < 0)
        return -n-1;

      return -1;  // found, but no value
    }

  private:
    struct node_t {
      array_u_type_ code;
      size_t     depth;
      size_t     left;
      size_t     right;
    };

    struct unit_t {
      array_type_   base;
      array_u_type_ check;
    };

    unit_t        *array_;
    unsigned char *used_;
    size_t        size_;
    size_t        alloc_size_;
    size_t        key_size_;
    const node_type_    **key_;
    const size_t        *length_;
    const array_type_   *value_;
    size_t        progress_;
    size_t        next_check_pos_;
    bool          no_delete_;
    int           error_;
    int (*progress_func_)(size_t, size_t);

    size_t resize(const size_t new_size) {
      unit_t tmp;
      tmp.base = 0;
      tmp.check = 0;
      array_ = _resize(array_, alloc_size_, new_size, tmp);
      used_  = _resize(used_, alloc_size_, new_size,
                       static_cast<unsigned char>(0));
      alloc_size_ = new_size;
      return new_size;
    }

    size_t fetch(const node_t &parent, std::vector <node_t> &siblings) {
      if (error_ < 0) return 0;

      array_u_type_ prev = 0;

      for (size_t i = parent.left; i < parent.right; ++i) {
        if ((length_ ? length_[i] : length_func_()(key_[i])) < parent.depth)
          continue;

        const node_u_type_ *tmp = reinterpret_cast<const node_u_type_ *>(key_[i]);

        array_u_type_ cur = 0;
        if ((length_ ? length_[i] : length_func_()(key_[i])) != parent.depth)
          cur = (array_u_type_)tmp[parent.depth] + 1;

        if (prev > cur) {
          error_ = -3;
          return 0;
        }

        if (cur != prev || siblings.empty()) {
          node_t tmp_node;
          tmp_node.depth = parent.depth + 1;
          tmp_node.code  = cur;
          tmp_node.left  = i;
          if (!siblings.empty()) siblings[siblings.size()-1].right = i;

          siblings.push_back(tmp_node);
        }

        prev = cur;
      }

      if (!siblings.empty())
        siblings[siblings.size()-1].right = parent.right;

      return siblings.size();
    }

    size_t insert(const std::vector <node_t> &siblings) {
      if (error_ < 0) return 0;

      size_t begin = 0;
      size_t pos   = _max((size_t)siblings[0].code + 1, next_check_pos_) - 1;
      size_t nonzero_num = 0;
      int    first = 0;

      if (alloc_size_ <= pos) resize(pos + 1);

      while (true) {
      next:
        ++pos;

        if (alloc_size_ <= pos) resize(pos + 1);

        if (array_[pos].check) {
          ++nonzero_num;
          continue;
        } else if (!first) {
          next_check_pos_ = pos;
          first = 1;
        }

        begin = pos - siblings[0].code;
        if (alloc_size_ <= (begin + siblings[siblings.size()-1].code))
          resize(static_cast<size_t>(alloc_size_ *
                                     _max(1.05, 1.0 * key_size_ / progress_)));

        if (used_[begin]) continue;

        for (size_t i = 1; i < siblings.size(); ++i)
          if (array_[begin + siblings[i].code].check != 0) goto next;

        break;
      }

      // -- Simple heuristics --
      // if the percentage of non-empty contents in check between the index
      // 'next_check_pos' and 'check' is greater than some constant
      // value(e.g. 0.9),
      // new 'next_check_pos' index is written by 'check'.
      if (1.0 * nonzero_num/(pos - next_check_pos_ + 1) >= 0.95)
        next_check_pos_ = pos;

      used_[begin] = 1;
      size_ = _max(size_,
                   begin +
                   static_cast<size_t>(siblings[siblings.size() - 1].code + 1));

      for (size_t i = 0; i < siblings.size(); ++i)
        array_[begin + siblings[i].code].check = begin;

      for (size_t i = 0; i < siblings.size(); ++i) {
        std::vector <node_t> new_siblings;

        if (!fetch(siblings[i], new_siblings)) {
          array_[begin + siblings[i].code].base =
            value_ ?
            static_cast<array_type_>(-value_[siblings[i].left]-1) :
            static_cast<array_type_>(-siblings[i].left-1);

          if (value_ && (array_type_)(-value_[siblings[i].left]-1) >= 0) {
            error_ = -2;
            return 0;
          }

          ++progress_;
          if (progress_func_)(*progress_func_)(progress_, key_size_);

        } else {
          size_t h = insert(new_siblings);
          array_[begin + siblings[i].code].base = h;
        }
      }

      return begin;
    }

  };

#if 4 == 2
  typedef Darts::DoubleArrayImpl<char, unsigned char, short,
                                 unsigned short> DoubleArray;
#define DARTS_ARRAY_SIZE_IS_DEFINED 1
#endif

#if 4 == 4 && !defined(DARTS_ARRAY_SIZE_IS_DEFINED)
  typedef Darts::DoubleArrayImpl<char, unsigned char, int,
                                 unsigned int> DoubleArray;
#define DARTS_ARRAY_SIZE_IS_DEFINED 1
#endif

#if 4 == 4 && !defined(DARTS_ARRAY_SIZE_IS_DEFINED)
  typedef Darts::DoubleArrayImpl<char, unsigned char, long,
                                 unsigned long> DoubleArray;
#define DARTS_ARRAY_SIZE_IS_DEFINED 1
#endif

#if 4 == 8 && !defined(DARTS_ARRAY_SIZE_IS_DEFINED)
  typedef Darts::DoubleArrayImpl<char, unsigned char, long long,
                                 unsigned long long> DoubleArray;
#endif
}
#endif