The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package Algorithm::Cluster::Record;
use strict;

use Algorithm::Cluster;

sub new {
    my $class = shift;
    my $self = {};
    $self->{data} = undef;
    $self->{mask} = undef;
    $self->{geneid} = undef;
    $self->{genename} = undef;
    $self->{gweight} = undef;
    $self->{gorder} = undef;
    $self->{expid} = undef;
    $self->{eweight} = undef;
    $self->{eorder} = undef;
    $self->{uniqid} = undef;
    bless($self, $class);
    return $self;
}

sub read {
    my $self = shift;
    my $handle = shift;
    my $line = <$handle>;
    chomp($line);
    my @words = split(/\t/, $line);
    my $n = scalar @words;
    $self->{uniqid} = $words[0];
    $self->{expid} = [];
    my %cols = (0 => 'GENEID');
    my $i;
    for ($i = 1; $i < $n; $i++) {
        my $word = $words[$i];
        if ($word eq 'NAME') {
            $cols{$i} = $word;
            $self->{genename} = ();
        }
        elsif ($word eq 'GWEIGHT') {
            $cols{$i} = $word;
            $self->{gweight} = ();
        }
        elsif ($word eq 'GORDER') {
            $cols{$i} = $word;
            $self->{gorder} = ();
        }
        else {
            push(@{$self->{expid}}, $word);
        }
    }
    $self->{geneid} = [];
    $self->{data} = [];
    $self->{mask} = [];
    my $needmask = 0;
    while ($line = <$handle>) {
	my $count = ($line =~ tr/\t//);
        @words = split(/\t/, $line);
	chomp @words;
        scalar @words == $n or die "Line with " . scalar @words . " columns found (expected $n): $!";
        my $start = 0;
        for my $key (keys %cols) {
            if ($key > $start) {
                $start = $key;
            }
        }
        if ($words[0] eq 'EWEIGHT') {
            @{$self->{eweight}} = @words[$start+1..$n-1];
        }
        elsif ($words[0] eq 'EORDER') {
            @{$self->{eorder}} = @words[$start+1..$n-1];
        }
        else {
            my @rowdata = ();
            my @rowmask = ();
            for ($i = 0; $i < $n; $i++) {
                my $word = $words[$i];
                if (defined $cols{$i}) {
                    if ($cols{$i} eq 'GENEID') {
                        push(@{$self->{geneid}}, $word);
                    }
                    elsif ($cols{$i} eq 'NAME') {
                        push(@{$self->{genename}}, $word);
                    }
                    elsif ($cols{$i} eq 'GWEIGHT') {
                        push(@{$self->{gweight}}, $word);
                    }
                    elsif ($cols{$i} eq 'GORDER') {
                        push(@{$self->{gorder}}, $word);
                    }
                }
                else {
                    if ($word) {
                        push(@rowdata, $word);
                        push(@rowmask, 1);
                    }
                    else {
                        push(@rowdata, 0.0);
                        push(@rowmask, 0);
                        $needmask = 1;
                    }
                }
            }
            push(@{$self->{data}}, [@rowdata]);
            push(@{$self->{mask}}, [@rowmask]);
        }
    }
    if (not $needmask) {
        $self->{mask} = undef;
    }
}


sub treecluster {
    my ($self, %args) = @_;
    my %default = (
        transpose  =>     0,
        dist       =>   'e',
        method     =>   'm',
    );
    my %param = (%default, %args);
    $param{data} = $self->{data};
    if (defined $self->{mask}) {
        $param{mask} = $self->{mask};
    }
    if ($param{transpose}==0) {
        $param{weight} = $self->{eweight};
    }
    else {
        $param{weight} = $self->{gweight};
    }
    return Algorithm::Cluster::treecluster(%param);
}

sub kcluster {
    my ($self, %args) = @_;
    my %default = (
        nclusters  =>     2,
        transpose  =>     0,
        npass      =>     1,
        method     =>   'a',
        dist       =>   'e',
        initidalid => undef,
    );
    my %param = (%default, %args);
    $param{data} = $self->{data};
    if (defined $self->{mask}) {
        $param{mask} = $self->{mask};
    }
    if ($param{transpose}==0) {
        $param{weight} = $self->{eweight};
    }
    else {
        $param{weight} = $self->{gweight};
    }
    return Algorithm::Cluster::kcluster(%param);
}

sub somcluster {
    my ($self, %args) = @_;
    my %default = (
        transpose  =>     0,
        nxgrid     =>     2,
        nygrid     =>     1,
        inittau    =>  0.02,
        niter      =>     1,
        dist       =>   'e',
    );
    my %param = (%default, %args);
    $param{data} = $self->{data};
    if (defined $self->{mask}) {
        $param{mask} = $self->{mask};
    }
    if ($param{transpose}==0) {
        $param{weight} = $self->{eweight};
    }
    else {
        $param{weight} = $self->{gweight};
    }
    return Algorithm::Cluster::somcluster(%param);
}

sub clustercentroids {
    my ($self, %args) = @_;
    my %default = (
        clusterid  => undef,
        method     =>   'a',
        transpose  =>     0,
    );
    my %param = (%default, %args);
    $param{data} = $self->{data};
    if (defined $self->{mask}) {
        $param{mask} = $self->{mask};
    }
    my @data = @{$self->{data}};
    return Algorithm::Cluster::clustercentroids(%param);
}

sub clusterdistance {
    my ($self, %args) = @_;
    my %default = (
        cluster1   =>   [0],
        cluster2   =>   [0],
        method     =>   'a',
        dist       =>   'e',
        transpose  =>     0,
    );
    my %param = (%default, %args);
    $param{data} = $self->{data};
    if (defined $self->{mask}) {
        $param{mask} = $self->{mask};
    }
    if ($param{transpose}==0) {
        $param{weight} = $self->{eweight};
    }
    else {
        $param{weight} = $self->{gweight};
    }
    return Algorithm::Cluster::clusterdistance(%param);
}

sub distancematrix {
    my ($self, %args) = @_;
    my %default = (
        dist       =>   'e',
        transpose  =>     0,
    );
    my %param = (%default, %args);
    $param{data} = $self->{data};
    if (defined $self->{mask}) {
        $param{mask} = $self->{mask};
    }
    if ($param{transpose}==0) {
        $param{weight} = $self->{eweight};
    }
    else {
        $param{weight} = $self->{gweight};
    }
    return Algorithm::Cluster::distancematrix(%param);
}

sub save {
    my ($self, %args) = @_;
    my %default = (
        geneclusters => undef,
        expclusters  => undef,
    );
    my %param = (%default, %args);
    $param{data} = $self->{data};
    my $ngenes = scalar @{$self->{geneid}};
    my $nexps = scalar @{$self->{expid}};
    my $jobname = $param{jobname};
    defined ($jobname) or die 'jobname undefined';
    my $geneclusters;
    my $expclusters;
    my $gene_cluster_type;
    my $exp_cluster_type;
    if (defined $param{geneclusters}) {
        $geneclusters = $param{geneclusters};
        if (ref($geneclusters) eq "ARRAY") {
            if (scalar @{$geneclusters} != $ngenes) {
                die "k-means solution found, but its size does not agree with the number of genes";
            }
            $gene_cluster_type = 'k'; # k-means clustering result
        }
        elsif (ref($geneclusters) eq "Algorithm::Cluster::Tree") {
            $gene_cluster_type = 'h'; # hierarchical clustering result
            my $n = $geneclusters->length;
            if ($n != $ngenes - 1) {
                die "Size of the hierarchical clustering tree ($n) should be equal to the number of genes ($ngenes) minus one";
            }
        }
        else {
            die "Cannot understand gene clustering result! $!";
        }
    }
    if (defined $param{expclusters}) {
        $expclusters = $param{expclusters};
        if (ref($expclusters) eq "ARRAY") {
            if (scalar @$expclusters != $nexps) {
                die "k-means solution found, but its size does not agree with the number of experiments";
            }
            $exp_cluster_type = 'k'; # k-means clustering result
        }
        elsif (ref($expclusters) eq "Algorithm::Cluster::Tree") {
            $exp_cluster_type = 'h'; # hierarchical clustering result
            my $n = $expclusters->length;
            if ($n != $nexps - 1) {
                die "Size of the hierarchical clustering tree ($n) should be equal to the number of experiments ($nexps) minus one";
            }
        }
        else {
            die "Cannot understand experiment clustering result! $!";
        }
    }
    my @gorder;
    if (defined $self->{gorder}) {
        @gorder = $self->{gorder};
    }
    else {
        @gorder = (0..$ngenes-1);
    }
    my @eorder;
    if (defined $self->{eorder}) {
        @eorder = $self->{eorder};
    }
    else {
        @eorder = (0..$nexps-1);
    }
    if (defined $gene_cluster_type and defined $exp_cluster_type) {
        if ($gene_cluster_type ne $exp_cluster_type) {
            die 'found one k-means and one hierarchical clustering solution in geneclusters and expclusters';
        }
    }
    my $gid = 0;
    my $aid = 0;
    my $filename = $jobname;
    my $postfix = '';
    my @geneindex;
    my @expindex;
    if ($gene_cluster_type eq 'h') {
        # Hierarchical clustering result
        @geneindex = _savetree(jobname   => $jobname,
                               tree      => $geneclusters,
                               order     => \@gorder,
                               transpose =>  0);
        $gid = 1;
    }
    elsif ($gene_cluster_type eq 'k') {
        # k-means clustering result
        $filename = $jobname . '_K';
        my $k = -1;
        foreach (@$geneclusters) {
            if ($_ > $k) {
                $k = $_;
            }
        }
        $k++;
        my $kggfilename = $jobname . "_K_G$k.kgg";
        @geneindex = $self->_savekmeans(filename => $kggfilename,
                                        clusterids => \@$geneclusters,
                                        order => \@gorder,
                                        transpose => 0);
        $postfix = "_G$k";
    }
    else {
        @geneindex = sort { $gorder[$a] <=> $gorder[$b] } (0..$ngenes-1);
    }
    if ($exp_cluster_type eq 'h') {
        # Hierarchical clustering result
        @expindex = _savetree(jobname   => $jobname,
                              tree      => $expclusters,
                              order     => \@eorder,
                              transpose =>  1);
        $aid = 1;
    }
    elsif ($exp_cluster_type eq 'k') {
        # k-means clustering result
        $filename = $jobname . '_K';
        my $k = -1;
        foreach (@$expclusters) {
            if ($_ > $k) {
                $k = $_;
            }
        }
        $k++;
        my $kagfilename = $jobname . "_K_A$k.kag";
        @expindex = $self->_savekmeans(filename => $kagfilename,
                                       clusterids => \@$expclusters,
                                       order => \@eorder,
                                       transpose => 1);
        $postfix = $postfix . "_A$k";
    }
    else {
        @expindex = sort { $eorder[$a] <=> $eorder[$b] } (0..$nexps-1);
    }
    $filename = $filename . $postfix;
    $self->_savedata(jobname    => $filename,
                     gid        => $gid,
                     aid        => $aid,
                     geneindex  => \@geneindex,
                     expindex   => \@expindex);
}

sub _savetree {
    my %param = @_;
    my $jobname = $param{jobname};
    my $tree = $param{tree};
    my @order = @{$param{order}};
    my $transpose = $param{transpose};
    my ($extension, $keyword);
    if ($transpose==0) {
        $extension = 'gtr';
        $keyword = 'GENE';
    }
    else {
        $extension = 'atr';
        $keyword = 'ARRY';
    }
    my $nnodes = $tree->length;
    open OUTPUT, ">$jobname.$extension" or die 'Error: Unable to open output file';
    my @nodeID = ('') x $nnodes;
    my @nodedist;
    my $i;
    for ($i = 0; $i < $nnodes; $i++) {
        my $node = $tree->get($i);
        push (@nodedist, $node->distance);
    }
    for (my $nodeindex = 0; $nodeindex < $nnodes; $nodeindex++) {
        my $min1 = $tree->get($nodeindex)->left;
        my $min2 = $tree->get($nodeindex)->right;
        $nodeID[$nodeindex] = "NODE" . ($nodeindex+1) . "X";
        print OUTPUT $nodeID[$nodeindex];
        print OUTPUT "\t";
        if ($min1 < 0) {
            my $index1 = -$min1-1;
            print OUTPUT $nodeID[$index1];
            print OUTPUT "\t";
            if ($nodedist[$index1] > $nodedist[$nodeindex]) {
            	$nodedist[$nodeindex] = $nodedist[$index1];
            }
        }
        else {
            print OUTPUT $keyword . $min1 . "X\t";
        }
        if ($min2 < 0) {
            my $index2 = -$min2-1;
            print OUTPUT $nodeID[$index2];
            print OUTPUT "\t";
            if ($nodedist[$index2] > $nodedist[$nodeindex]) {
            	$nodedist[$nodeindex] = $nodedist[$index2];
            }
        }
        else {
            print OUTPUT $keyword . $min2 . "X\t";
        }
        print OUTPUT 1.0-$nodedist[$nodeindex];
        print OUTPUT "\n";
    }
    close(OUTPUT);
    # Now set up order based on the tree structure
    return $tree->sort(\@order);
}

sub _savekmeans {
    my ($self, %param) = @_;
    my $filename = $param{filename};
    my @clusterids = @{$param{clusterids}};
    my @order = @{$param{order}};
    my $transpose = $param{transpose};
    my $label;
    my @names;
    if ($transpose == 0) {
        $label = $self->{uniqid};
        @names = @{$self->{geneid}};
    }
    else {
        $label = 'ARRAY';
        @names = @{$self->{expid}};
    }
    open OUTPUT, ">$filename" or die 'Error: Unable to open output file';
    print OUTPUT "$label\tGROUP\n";
    my $n = scalar @names;
    my @result = sort { $order[$a] <=> $order[$b] } (0..$n-1);
    my @sortedindex;
    my $cluster = 0;
    while (scalar @sortedindex < $n) {
        foreach (@result) {
            my $j = $_;
            my $cid = $clusterids[$j];
            if ($clusterids[$j]==$cluster) {
                print OUTPUT $names[$j] . "\t$cluster\n";
                push (@sortedindex, $j);
            }
        }
        $cluster++;
    }
    close(OUTPUT);
    return @sortedindex;
}

sub _savedata {
    my ($self, %param) = @_;
    my $jobname = $param{jobname};
    my $gid = $param{gid};
    my $aid = $param{aid};
    my @geneindex = @{$param{geneindex}};
    my @expindex = @{$param{expindex}};
    my @genename;
    if (defined $self->{genename}) {
        @genename = @{$self->{genename}};
    }
    else {
        @genename = @{$self->{geneid}};
    }
    my $ngenes = scalar @{$self->{geneid}};
    my $nexps = scalar @{$self->{expid}};
    open OUTPUT, ">$jobname.cdt" or die 'Error: Unable to open output file';
    my @mask;
    if (defined $self->{mask}) {
        @mask = @{$self->{mask}};
    }
    else {
        @mask = ([(1) x $nexps]) x $ngenes;
        # Each row contains identical shallow copies of the same vector;
        # modifying one row would affect the other rows.
    }
    my @gweight;
    if (defined $self->{gweight}) {
        @gweight = @{$self->{gweight}};
    }
    else {
        @gweight = (1) x $ngenes;
    }
    my @eweight;
    if (defined $self->{eweight}) {
        @eweight = @{$self->{eweight}};
    }
    else {
        @eweight = (1) x $nexps;
    }
    if ($gid) {
    	print OUTPUT "GID\t";
    }
    print OUTPUT $self->{uniqid};
    print OUTPUT "\tNAME\tGWEIGHT";
    # Now add headers for data columns
    foreach (@expindex) {
        print OUTPUT "\t" . $self->{expid}[$_];
    }
    print OUTPUT "\n";
    if ($aid) {
        print OUTPUT "AID";
        if ($gid) {
            print OUTPUT "\t";
        }
        print OUTPUT "\t\t";
        foreach (@expindex) {
            print OUTPUT "\tARRY" . $_ . 'X';
        }
        print OUTPUT "\n";
    }
    print OUTPUT "EWEIGHT";
    if ($gid) {
        print OUTPUT "\t";
    }
    print OUTPUT "\t\t";
    foreach (@expindex) {
        print OUTPUT "\t" . $eweight[$_];
    }
    print OUTPUT "\n";
    foreach (@geneindex) {
        my $i = $_;
        if ($gid) {
            print OUTPUT "GENE" . $i . "X\t";
        }
        print OUTPUT $self->{geneid}[$i] . "\t" . $genename[$i] . "\t" . $gweight[$i];
        foreach (@expindex) {
            my $j = $_;
            print OUTPUT "\t";
            if ($mask[$i][$j]) {
                print OUTPUT $self->{data}[$i][$j];
            }
        }
        print OUTPUT "\n";
    }
    close(OUTPUT);
}

1;