The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
/*
 * 25 Sep 2001
 * Some operations on pair_set's (alignments)
 * $Id: pair_set.c,v 1.5 2008/03/08 16:49:02 torda Exp $
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "matrix.h"
#include "e_malloc.h"
#include "mprintf.h"
#include "pair_set_i.h"
#include "pair_set.h"
#include "read_seq_i.h"
#include "scratch.h"
#include "seq.h"

/* ---------------- pair_set_string    --------------------------
 * This could move to another file. It needs to know about
 * the innards of the sequence structure. Really, this file
 * should only contain stuff for pair sets, regardless of their
 * type.
 */
char *
pair_set_string (struct pair_set *pair_set, struct seq *s1, struct seq *s2)
{
    int **indices = pair_set -> indices;
    size_t i;
    size_t j;
    const char GAP = '-';

    seq_thomas2std (s1);
    seq_thomas2std (s2);

    scr_reset();
    for (j = 0; j < pair_set->m; j++) {
        for (i = 0; i < pair_set->n; i++) {
            char c1 = (indices[i][j] == GAP_INDEX ? GAP : 'X');
            scr_printf ("%c", c1);
        }
        scr_printf ("\n");
    }

    return (scr_printf ("%c", '\n'));
}

/* ---------------- multal_string    --------------------------
 * This could move to another file. It needs to know about
 * the innards of the sequence structure. Really, this file
 * should only contain stuff for pair sets, regardless of their
 * type.
 */
char *
multal_string (struct pair_set *pairset)
{
    size_t i;
    size_t j;
    int **indices = pairset->indices;

    scr_reset();
    for (j = 0; j < pairset->m; j++) {
        scr_printf("%-4d", (int)(indices[0][j]));
        for (i = 0; i < pairset->n; i++)
            scr_printf ("%4d ", indices[i][j]);
        scr_printf (" %d\n", indices[(int)(pairset->n-1)][j]);
    }

    return (scr_printf ("%c", '\n'));
}

/* ---------------- pair_set_score    --------------------------
 */
int
pair_set_score (struct pair_set *s, float *score, float *scr_smpl)
{
    if (s == NULL)
        return EXIT_FAILURE;
    *score    = s->score;
    *scr_smpl = s->smpl_score;
    return EXIT_SUCCESS;
}

/* ---------------- pair_set_extend   --------------------------
 * Take a short alignment and extend it. Well documented in
 * in file wurst.pod.
 */
int
pair_set_extend (struct pair_set *s, const size_t n0,
                 const size_t n1, const int long_or_short)
{
    int **p, **np;
    const char *this_sub = "pair_set_extend";
    size_t left, right, long_left, long_right, newsize, idx;
    int long_left_on_a  = -999,   /* The initialisation is only to provoke */
        long_right_on_a = -999;   /* a crash if code below is not correct */

    long_left = long_right = 0; /* Not necessary, but stops compiler warning */

    if ((long_or_short != EXT_SHORT) && (long_or_short != EXT_LONG)) {
        err_printf (this_sub, "Must be fed either $EXT_LONG or $EXT_SHORT\n");
        return EXIT_FAILURE;
    }
    if (s->m > 2) {
        err_printf (this_sub, "Only written for alignments of two strings.");
        err_printf (this_sub, "Given %u\n", (unsigned) s->m);
        return EXIT_FAILURE;
    }
    if (s->n == 0) {                      /* Asked to do a long extension, */
        if (long_or_short == EXT_LONG) {  /* but there are no aligned pairs */
            size_t i, pos;
            newsize = n0 + n1;
            np = i_matrix(newsize, 2);
            for (pos = 0, i = 0; i < n0; i++, pos++) {
                np[pos][0] = i;
                np[pos][1] = GAP_INDEX;
            }
            for ( i= 0; i < n1; i++, pos++) {
                np[pos][0] = GAP_INDEX;
                np[pos][1] = i;
            }
        } else {
            newsize = 0;
            np = NULL;
        }
        goto go_back;
    }

    p = s->indices;

    if (p[0][0] < 0 || p[0][1] < 0 || p[s->n - 1][0] < 0 || p[s->n - 1][1] < 0) {
        err_printf (this_sub, "This pair set has already been extended\n");
        return EXIT_FAILURE;
    }

    {                                        /* for short or long extension */
        size_t ia, ib;                    /* we need left and right values. */
        left = (p[0][0] < p[0][1]) ? p[0][0] : p[0][1];
        ia = n0 - p[s->n - 1][0];
        ib = n1 - p[s->n - 1][1];
        right = ((ia < ib) ? ia : ib) - 1;
    }
    newsize = s->n + left + right;
    if (long_or_short == EXT_LONG) {
        size_t a_dangle, b_dangle;
        if (p[0][0] > p[0][1]) {
            long_left = p[0][0];
            long_left_on_a = 1;
        } else {
            long_left = p[0][1];
            long_left_on_a = 0;
        }
        long_left -= left;
        a_dangle = n0 - p[s->n - 1][0];
        b_dangle = n1 - p[s->n - 1][1];
        if (a_dangle > b_dangle) {
            long_right = n0 - p[s->n - 1][0];
            long_right_on_a = 1;
        } else {
            long_right = n1 - p[s->n - 1][1];
            long_right_on_a = 0;
        }
        long_right = long_right - right - 1;
        newsize += long_left + long_right;
    }

    np = i_matrix(newsize, 2);

    idx = 0;           /* idx is maintained between the next series of loops */
    if ((long_or_short == EXT_LONG) && (long_left > 0)) { /* long left ext'n */
        size_t i;
        if (long_left_on_a) {
            for (i = 0; i < long_left; i++, idx++) {
                np[idx][0] = i;
                np[idx][1] = GAP_INDEX;
            }
        } else {
            for (i = 0; i < long_left; i++, idx++) {
                np[idx][0] = GAP_INDEX;
                np[idx][1] = i;
            }
        }
    }

    {                                                /* short left extension */
        int ia = p[0][0] - left;
        int ib = p[0][1] - left;
        for (  ; ia < p[0][0]; ia++, ib++, idx++) {
            np[idx][0] = ia;
            np[idx][1] = ib;
        }
    }

    {                                /* Copy over the original aligned pairs */
        size_t j;
        for ( j = 0; j < s->n; j++, idx++) {
            np[idx][0] = s->indices[j][0];
            np[idx][1] = s->indices[j][1];
        }
    }

    if (right > 0) {                                /* short right extension */
        size_t i;
        int ia = np[s->n - 1][0] + 1;
        int ib = np[s->n - 1][1] + 1;
        for ( i = 0; i < right ; i++, idx++, ia++, ib++) {
            np[idx][0] = ia;
            np[idx][1] = ib;
        }
    }

    if (long_or_short == EXT_LONG && long_right > 0) {/* long right extension*/
        if (long_right_on_a) {
            size_t i = np[idx-1][0] + 1;
            for ( ; i < n0; idx++, i++) {
                np[idx][0] = i;
                np[idx][1] = GAP_INDEX;
            }
        } else {
            size_t i = np[idx-1][1] + 1;
            for ( ; i < n1; idx++, i++) {
                np[idx][0] = GAP_INDEX;
                np[idx][1] = i;
            }
        }
    }

    kill_i_matrix(s->indices);
    s->indices = np;

 go_back:
    s->indices = np;
    s->n = newsize;
    return EXIT_SUCCESS;
}

/* ---------------- pair_set_coverage --------------------------
 * Given a pair_set structure and a sequence, return a string
 * which tells us which sites in the sequence and structure are filled.
 * We allocate the strings with E_MALLOC and return a pointer to
 * each char * via pcover1 and pcover2. The caller knows that is
 * has to free() the memory later.
 * Maybe we have done a sequence to structure alignment, maybe something
 * else. Whatever the case, we might be asked about coverage from
 * an alignment.
 */
int
pair_set_coverage (struct pair_set *p_s, size_t s1, size_t s2,
                   char **pcover1, char **pcover2)
{
    char *s_a, *s_b;
    int **p;
    size_t i;
    size_t size_s_a = sizeof (s_a[0]) * (s1 + 1);
    size_t size_s_b = sizeof (s_b[0]) * (s2 + 1);
    const char *this_sub = "pair_set_coverage";
    const char *too_small =
        "The sizes, %d and %d are too small for the pair_set.\n";

    s_a = E_MALLOC (size_s_a);
    memset (s_a, '0', size_s_a); /* Default (not covered) is '0' */
    s_a[s1] = '\0';              /* Will end up as string in interp, so */
                                 /* needs NULL terminator */
    s_b = E_MALLOC (size_s_b);
    memset (s_b, '0', size_s_b);
    s_b[s2] = '\0';

    p = p_s->indices;

    for ( i=0 ; i < p_s->n; i++ ) {
        if (p[i][0] == GAP_INDEX)
            continue;
        if (p[i][1] == GAP_INDEX)
            continue;
        if (p[i][0] > (int) s1)
            goto error;
        if (p[i][1] > (int) s2)
            goto error;
        s_a[p[i][0]] = '1';
        s_b[p[i][1]] = '1';
    }
    *pcover1 = s_a;
    *pcover2 = s_b;
    return EXIT_SUCCESS;
 error:
    s_a = E_REALLOC (s_a, 1);
    s_b = E_REALLOC (s_a, 1);
    s_a[0] = s_b[0] = '\0';
    err_printf (this_sub, too_small, s1, s2);
    err_printf (this_sub, "Seriously broken\n");
    return EXIT_FAILURE;
}

/* ---------------- pair_set_gap      --------------------------
 * Calculate gaps in a sequence, for use by model rescoring
 * code. We return two costs:
 *  gap opening
 *  gap widening.
 * To do this, we need two scale values.
 * We return each of the costs separately. If we were not
 * interested in optimising parameters, this routine should
 * combine the penalties and return one number.
 */
int
pair_set_gap (struct pair_set *p_s, float *open_cost, float *widen_cost,
              const float open_scale, const float widen_scale)
{
    int **p;
    unsigned open, widen;
    size_t idx;
    enum {ALIGNED, INGAP} state;
    const char *this_sub = "pair_set_gap";
    open = 0;
    widen = 0;

    if (p_s == NULL) {
        err_printf (this_sub, "pair_set broken\n");
        return EXIT_FAILURE;
    }
    p = p_s->indices;

    state = ALIGNED;
    for ( idx = 0; idx < p_s->n; idx++) {
        switch (state) {
        case ALIGNED:
            if (p[idx][1] == GAP_INDEX) {
                state = INGAP;
                open++;
            }
            break;
        case INGAP:
            if (p[idx][1] == GAP_INDEX)
                widen++;
            else
                state = ALIGNED;
        }
    }

    *open_cost  = open * open_scale;
    *widen_cost = widen * widen_scale;

    return EXIT_SUCCESS;
}

/* ---------------- pair_set_destroy  --------------------------
 */
void
pair_set_destroy (struct pair_set *p_s)
{
    if (p_s == NULL)
        return;
    if ( p_s->indices ){
        kill_i_matrix( p_s->indices );
    }
    free (p_s);
}

/* ---------------- pair_set_get_alignment_indices   --------------------------
 * To give an idea of the size of elements in a score matrix.
 * Do not look at first or last rows or columns.
 */
void
pair_set_get_alignment_indices (struct pair_set *p_s, int sequencenumber, int *start, int *stop)
{
    static const char *this_sub = "pair_set_get_alignment";
    if ( (sequencenumber <= (int) p_s->m) && (p_s->n > 0 ) ){
        *start = p_s->indices[  0  ][ sequencenumber ];
        *stop  = p_s->indices[ (int) (p_s->n - 1) ][ sequencenumber ];
    } else {
        *start = *stop = 0;
        err_printf (this_sub, "sequencenumber too high \n");
    }
}