The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#include <string.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <string>
#include <vector>

#include <sys/time.h>
#include <unistd.h>

#include <sparsehash/dense_hash_map>

#include "fastq-lib.h"
#include "utils.h"
#include "tidx.h"

void usage(FILE *f);

using namespace std;
using namespace google;

double xtime();

bool annot_comp (const annot &a, const annot &b) { return (a.beg < b.beg); }

template <typename L, typename R> void append(L& lhs, R const& rhs) { lhs.insert(lhs.end(), rhs.begin(), rhs.end()); }
template <typename L, typename R> void prepend(L& lhs, R const& rhs) { lhs.insert(lhs.begin(), rhs.begin(), rhs.end()); }

struct string_annot_serializer {
  bool operator()(FILE* fp, const std::pair<const string&, const vector<annot>& >& value) const {

    {
    assert(value.first.length() <= UCHAR_MAX);
    const unsigned char size = value.first.length();
    if (fwrite(&size, sizeof(size), 1, fp) != 1)
      return false;
    if (fwrite(value.first.data(), size, 1, fp) != 1)
      return false;
    }

    {
    const vector<annot>&van=value.second;
    const unsigned long size = van.size();
    if (fwrite(&size, sizeof(size), 1, fp) != 1)
      return false;
    int i;
    for (i=0;i<size;++i) {
        if (fwrite(&van[i].beg, sizeof(van[i].beg), 1, fp) != 1)
          return false;
        if (fwrite(&van[i].end, sizeof(van[i].end), 1, fp) != 1)
          return false;
        assert(van[i].pos.size() <= USHRT_MAX);
        const unsigned short size = van[i].pos.size();
        if (fwrite(&size, sizeof(size), 1, fp) != 1)
          return false;
        int j;
        for (j=0;j<size;++j) {
            if (fwrite(&van[i].pos[j], sizeof(van[i].pos[j]), 1, fp) != 1)
              return false;
        }
    }
    }

    return true;
  }

  bool operator()(FILE* fp, std::pair<const string, vector<annot> >* value) const {

    {

    string buf;
    unsigned char size;    // all strings are <= 255 chars long
    if (fread(&size, sizeof(size), 1, fp) != 1)
      return false;

    if(size>buf.size()) buf.resize(size*2);

    if (fread((void *)buf.data(), size, 1, fp) != 1) {
      return false;
    }
    // necessarry to "new" the value which must be const, except during "unsearialization"
    // api shouldn't foist this on the user ... should be behind the scenes
    string * ncs = const_cast<string *>(&value->first);
    new(ncs) string(buf.data(), (size_t)size);
    
    }

    {
    
    vector<annot> &van=value->second;
    unsigned long size;
    if (fread(&size, sizeof(size), 1, fp) != 1)
      return false;
    int i;
    van.resize(size);
    for (i=0;i<size;++i) {
        if (fread(&van[i].beg, sizeof(van[i].beg), 1, fp) != 1)
          return false;
        if (fread(&van[i].end, sizeof(van[i].beg), 1, fp) != 1)
          return false;
        unsigned short size;
        if (fread(&size, sizeof(size), 1, fp) != 1)
          return false;
        int j;
        van[i].pos.resize(size);
        for(j=0;j<size;++j) {
            if (fread(&van[i].pos[j], sizeof(van[i].pos[j]), 1, fp) != 1)
              return false;
        }
    }

    }

    return true;
  }
};


void chomp_line(struct line &l) {
    if (l.s[l.n-1] == '\n') l.s[--l.n]='\0';       // chomp
    if (l.s[l.n-1] == '\r') l.s[--l.n]='\0';       // chomp
}

vector <long int> empty_vector;
const vector<long int> &tidx::lookup(const char *chr, int pos) {
    dense_hash_map<string,vector<annot> >::iterator it=map.find(chr);
    if (it == map.end()) return empty_vector;
    vector<annot> &va = it->second;
    if (debug) fprintf(stderr,"lookup: %s:%d -> %d\n", chr, pos, (int) va.size());
    int b=0, t=va.size(), c=0;
    while (t>b) {
        c=(t+b)/2;
//        printf("here1: c:%d, t:%d, b:%d, pos:%d, beg:%d, end:%d, res:%d\n", c, t, b, pos, va[c].beg, va[c].end, va[c].pos[0]);
        if (pos == va[c].beg)
            break;
        else if (pos < va[c].beg)
            t=c-1;
        else if (pos > va[c].beg) {
            if (pos <= va[c].end) {
                return va[c].pos;
            }
            b=c+1;
        }
    }
    
    if (t == b)
        c = t;
//    printf("here2: c:%d, t:%d, b:%d, pos:%d, beg:%d, end:%d, res:%d\n", c, t, b, pos, va[c].beg, va[c].end, va[c].pos[0]);
    if (pos >= va[c].beg && pos <= va[c].end) {
        return va[c].pos;
    }
    return empty_vector;
}

vector<long int> tidx::lookup_r(const char *chr, int beg, int end) {
    dense_hash_map<string,vector<annot> >::iterator it=map.find(chr);
    if (it == map.end()) return empty_vector;
    vector<annot> &va = it->second;
    if (debug) fprintf(stderr,"lookup_r: %s:%d.%d -> %d\n", chr, beg, end, (int) va.size());
    int b=0, t=va.size(), c=0;
    while (t>b) {
        c=(t+b)/2;
        if (beg == va[c].beg)
            break;
        else if (beg < va[c].beg)
            t=c-1;
        else if (beg > va[c].beg) {
            if (beg <= va[c].end) 
                break;
            b=c+1;
        }
    }
    if (t == b)
        c = t;
    vector<long int> res;
    while (c<va.size() && end >= va[c].beg && beg <= va[c].end) {
        append(res,va[c].pos);
        ++c;
    }
    return res;
}

string tidx::lookup(const char *chr, int pos, const char *msep) { 
//    printf("here2\n");
    const vector<long int> &v = lookup(chr, pos);
    string res;
    if (!fh) {
        fh=fopen(path.c_str(),"rb");
        if (!fh) 
            fail("%s:%s\n",path.c_str(),strerror(errno));
    }
    string line;
    int i;
	struct line l; meminit(l);
    for (i=0;i<v.size();++i) {
        fseek(fh,v[i],0);
    	read_line(fh, l);
        chomp_line(l);
        res += msep;
        res += string(l.s, l.n);
    }
    return res;
}

string tidx::lookup_r(const char *chr, int beg, int end, const char *msep) {
//    printf("here2\n");
    const vector<long int> &v = lookup_r(chr, beg, end);
    string res;
    if (!fh) {
        fh=fopen(path.c_str(),"rb");
        if (!fh)
            fail("%s:%s\n",path.c_str(),strerror(errno));
    }
    string line;
    int i;
    struct line l; meminit(l);
    for (i=0;i<v.size();++i) {
        fseek(fh,v[i],0);
        read_line(fh, l);
        chomp_line(l);
        res += msep;
        res += string(l.s, l.n);
    }
    return res;
}

string api_ret = "";
const char *tidx::lookup_c(const char *chr, int pos, const char *msep) {
    api_ret = lookup(chr, pos, msep);
    return api_ret.c_str();
}

const char *tidx::lookup_cr(const char *chr, int beg, int end, const char *msep) {
    api_ret = lookup_r(chr, beg, end, msep);
    return api_ret.c_str();
}

bool tidx::read(const char *in) {
    string uin = string_format("gunzip -c %s.tidx", in);

    if (debug) fprintf(stderr, "read %s\n", in);
    FILE *fun=popen(uin.c_str(),"r");
    if (!fun) {
        return false;
    }
    map.unserialize(string_annot_serializer(), fun);
    path=in;
    return true;
}

void tidx::init() {
    debug=false;
    fh=NULL;
    map.set_empty_key("-");
}

void tidx::dump(FILE *fh) {
    fprintf(fh,"#file\t%s\n",path.c_str());
    dense_hash_map<string,vector<annot> >::iterator it = map.begin();;
    while (it != map.end()) {
        vector<annot> &van = it->second;
        int i;
        for (i=0;i<van.size();++i) {
            fprintf(fh, "%s\t%d\t%d\t%ld\t%ld\n", it->first.c_str(), van[i].beg, van[i].end, van[i].pos.size(), van[i].pos[0]);
        }
        ++it;
    }    
}

// fun part
void tidx::build(const char *in, const char *sep, int nchr, int nbeg, int nend, int skip_i, char skip_c, bool sub_e) {
	FILE *fin=fopen(in,"r");

    if (!fin)
        fail("%s:%s\n",in,strerror(errno));

    if (nend == -1)
        nend = nbeg;

    string out = string_format("gzip -c > %s.tidx", in);
    FILE *fout=popen(out.c_str(),"w");
    if (!fout)
        fail("%s:%s\n", out.c_str(),strerror(errno));

    double xst = xtime();

	struct line l; meminit(l);
    int nlast = max(max(nbeg,nend),nchr);
    int nl = 0;

    string p_chr = "%";
    path=in;
    vector<annot> *pvan;
    long tpos = ftell(fin);
    // read in the annotation file
    if (debug) fprintf(stderr, "reading %s (%d, %d, %d)\n", in, nchr, nbeg, nend);
	while (read_line(fin, l)>0) {
        ++nl;
        if (skip_i > 0 || *l.s==skip_c) {
            --skip_i;
        } else {
            chomp_line(l);
            vector<char *> v = split(l.s, sep);
            if (nlast >= v.size()) {
                fail("error, file %s, line %d: missing info\n", in, nl);
            } 
            annot a;
            a.beg=atoi(v[nbeg]);
            a.end=atoi(v[nend]);
            if (sub_e) --a.end;
            if (a.beg > a.end) {
                fail("error, file %s, line %d: beg > end : %d > %d\n", in, nl, a.beg, a.end);
            }
            a.pos.push_back(tpos);
            if (strcmp(v[nchr], p_chr.c_str())) {       // speed up
                pvan = &map[v[nchr]];
                p_chr=v[nchr];
            }
            pvan->push_back(a);
        }
        tpos = ftell(fin);
	}
    dense_hash_map<string,vector<annot> >::iterator it;

    // for each chromosome
    it = map.begin();
    while (it != map.end()) {
        vector<annot> &van = it->second;
        // sort the annotation file by beginning of region
        sort(van.begin(), van.end(), annot_comp);
        int i;
        if (debug) fprintf(stderr, "frag %s : %ld ->", it->first.c_str(), van.size());
        for (i=0;i<van.size()-1;++i) {
            if (van[i].beg >= van[i+1].beg && van[i].end == van[i+1].end) {
                // exact match
                if (debug) fprintf(stderr, " [dup %d]", van[i].beg);
                // merge annotations
                prepend(van[i+1].pos,van[i].pos);
                // skip next... (empty pos won't be serialized)
                assert(van[i].beg == van[i+1].beg);
                van[i].pos.clear();
            } else if (van[i].end >= van[i+1].beg) {
                if (debug) fprintf(stderr, " [ovr %d-%d:%ld ]", van[i].beg, van[i].end, van[i].pos[0]);
                // overlap next
                int new_st;
                int new_en;

                // forced to initialize here so we can use a reference (for efficiency)
                vector<long> new_ro = van[i].pos;
                
                if (van[i].end < van[i+1].end) {
                    // contained within next, so new frag starting after i stop
                    new_st = van[i].end + 1;
                    new_en = van[i+1].end;
                    new_ro = van[i+1].pos;                  // that only contains the other
                    van[i+1].end=van[i].end;                // shorten next to my end
                    append(van[i+1].pos,van[i].pos);        // and now the other contains me
                } else {
                    // passes next, so next contains all of me
                    new_st = van[i+1].end+1;                // new frag is after the end of next
                    new_en = van[i].end;
                    append(van[i+1].pos,van[i].pos);
                }

                van[i].end=van[i+1].beg-1;                  // shorten my end to less than the next's start
                
                if (debug) fprintf(stderr, " [i:%d:%d:%ld]", van[i].beg, van[i].end, van[i].pos[0]);
                if (debug) fprintf(stderr, " [i+1:%d:%d:%ld]", van[i+1].beg, van[i+1].end, van[i+1].pos[0]);

                if (new_en >= new_st) {                     // is this a real one?
                    if (debug) fprintf(stderr, " [n:%d:%d:%ld]", new_st, new_en, new_ro[0]);

                    int j = i+2;                            // figure out where it goes (shouldn't be far)
                    while (j < van.size() & new_st > van[j].beg) {
                        ++j;
                    }

                    annot a;
                    a.beg=new_st;
                    a.end=new_en;
                    a.pos=new_ro;
                                                            // (slow... use linked list, turn to array later for storage/bin search?)
                    van.insert(van.begin()+j, a);           // insert into the annot array
                }
            }
        }
        long j = 0;
        for (i=0;i<van.size();++i) {
            // overlap next
            if (van[i].pos.size() != 0 && van[i].beg <= van[i].end)
                if (i != j) 
                    van[j++]=van[i];
                else
                    ++j;
        }
        if (j != van.size()) {
            if (debug) fprintf(stderr, "(rm %ld) ", van.size()-j);
            van.resize(j);
        }
        if (debug) fprintf(stderr, " %ld\n", van.size());
        ++it;
    }

    double xen = xtime();
    double speed = xen-xst;

    if (debug) fprintf(stderr, "compiled in %g secs\n", speed);

    map.serialize(string_annot_serializer(), fout);
    pclose(fout);

    //
    xst = xtime();
    tidx tmap(in);
    xen = xtime();
    speed = xen-xst;
    if (debug) fprintf(stderr, "read in %g secs\n", speed);
    path=in;
}

double xtime() {
    struct timeval tm;
    gettimeofday(&tm, NULL);
    return (double) tm.tv_sec + ((double)tm.tv_usec)/1000000.0;
}

// for the api
void tidx_build(const char *file, const char *sep, int chr, int beg, int end, int skip_i, char skip_c, bool sub_e) {
    tidx n;
    n.build(file, sep, chr, beg, end, skip_i, skip_c, sub_e); 
}