src/wurstsrc/score_mat.c

/*
 * 1 March 2002
 * For manipulations of score matrices.
 * $Id: score_mat.c,v 1.1 2007/09/28 16:57:13 mmundry Exp $
 */

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "e_malloc.h"
#include "fio.h"
#include "matrix.h"
#include "mprintf.h"
#include "read_seq_i.h"
#include "score_mat.h"
#include "score_mat_i.h"
#include "scratch.h"
#include "seq.h"
#include "str.h"
#include "pair_set.h"
#include "pair_set_i.h"





/* ---------------- score_mat_new   ---------------------------
 * Given the size of two things (sequences, structures),
 * allocate memory for a score matrix.
 * WATCH OUT !
 * There is a rule.. Score matrices have two extra columns and
 * two extra rows. This means makes it easier to implement
 * some scoring schemes with fancy end penalties. The extra
 * rows and columns are at the start and end of each scored
 * thing.
 */
struct score_mat *
score_mat_new (const size_t n_rows, const size_t n_cols)
{
    float **mat;
    struct score_mat *ret_mat;
    mat = f_matrix (n_rows + 2, n_cols + 2);
    memset (mat[0], (int)0.0, sizeof (mat[0][0]) * (n_rows+2) * (n_cols+2));
    ret_mat = E_MALLOC (sizeof (*ret_mat));
    ret_mat->mat = mat;
    ret_mat->n_rows = n_rows + 2;
    ret_mat->n_cols = n_cols + 2;
    return (ret_mat);
}

/* ---------------- score_mat_destroy -------------------------
 */
void
score_mat_destroy (struct score_mat *smat)
{
    const char *this_sub = "score_mat_destroy";
    if (smat == NULL) {
        err_printf (this_sub, "called to delete null score matrix\n");
        return ;
    }
    if ( smat->mat)
        kill_f_matrix (smat->mat);
    else
        err_printf (this_sub, "Called to delete score mat with no matrix\n");
    smat->mat = NULL; /* only to provoke an error if we try again */
    free (smat);
}

/* ---------------- score_mat_shift ---------------------------
 * Add a constant value to a score matrix
 * This one returns a new score matrix.
 * We shift all elements, except first and last row and column.
 */
struct score_mat *
score_mat_shift (struct score_mat *mat1, const float shift)
{
    struct score_mat *res;
    size_t i, j;
    const size_t n_rows = mat1->n_rows;
    const size_t n_cols = mat1->n_cols;
    const size_t last_row = n_rows - 1;
    const size_t last_col = n_cols - 1;
    res = score_mat_new (n_rows - 2, n_cols - 2);
    for (i = 1; i < last_row; i++)
        for (j = 1; j < last_col ; j++)
            res->mat[i][j] = mat1->mat[i][j] + shift;
    return res;
}

/* ---------------- score_mat_scale ---------------------------
 * Scale a score matrix. Return a new matrix.
 * We could do the operation in-place, but then some code at the
 * perl level can lead to destroying the matrix, and then referring
 * to the free()'d memory.
 */
struct score_mat *
score_mat_scale (struct score_mat *mat, const float scale)
{
    struct score_mat *res;
    float *dlast, *d, *m;
    res = score_mat_new (mat->n_rows - 2, mat->n_cols - 2);
    d = mat->mat[0];
    m = res->mat[0];
    dlast = d + mat->n_rows * mat->n_cols;
    for ( ; d < dlast ; d++, m++)
        *m = *d * scale;
    return res;
}

/* ---------------- score_mat_add  ----------------------------
 * Do like new_mat = mat1 + (k * mat2 + b)
 * Do *not* do the operation in-place. There is a good reason
 * for this.
 */
struct score_mat *
score_mat_add ( struct score_mat *mat1, struct score_mat *mat2,
                const float scale, const float shift)
{
    struct score_mat *res;
    const char *this_sub = "score_mat_add";
    const char *broken_sizes = "%s size mismatch, %u != %u\n";
    float *dlast, *d, *m1, *m2;

    if (mat1->n_rows != mat2->n_rows) {
        err_printf (this_sub, broken_sizes, "Row", mat1->n_rows, mat2->n_rows);
        return NULL;
    }
    if (mat1->n_cols != mat2->n_cols) {
        err_printf (this_sub, broken_sizes, "Col", mat1->n_cols, mat2->n_cols);
        return NULL;
    }
    res = score_mat_new (mat1->n_rows - 2, mat1->n_cols - 2);
    d = res->mat[0];
    m1 = mat1->mat[0];
    m2 = mat2->mat[0];
    dlast = d + mat1->n_rows * mat1 ->n_cols;
    for ( ; d < dlast; d++, m1++, m2++)
        *d = *m1 + (shift  + scale * *m2);
    return res;
}

/* ---------------- score_mat_info   --------------------------
 * To give an idea of the size of elements in a score matrix.
 * Do not look at first or last rows or columns.
 */
void
score_mat_info (const struct score_mat *score_mat,
                float *min, float *max, float *av,
                float *std_dev)
{
    float **mat = score_mat->mat;
    double sum = 0;
    double sumsq = 0;
    unsigned n = 0;
    size_t i, j;
    const size_t last_row = score_mat->n_rows - 1;
    const size_t last_col = score_mat->n_cols - 1;
    *min = *max = mat[1][1];
    for (i = 1; i < last_row; i++) {
        for (j = 1; j < last_col; j++) {
            float f = mat[i][j];
            if (f < *min)
                *min = f;
            if (f > *max)
                *max = f;
            sum += f;
            sumsq += (f * f);
            n++;
        }
    }

    *av   = (float) (sum / n);
    *std_dev = (float) (sqrt (n * sumsq - (sum * sum)) / n);
}

/* ---------------- score_mat_string --------------------------
 * This is not a score routine, it is not an alignment routine.
 * Maybe it should go in its own file.
 * Here are the tricks to remember.
 * Our M+N score matrix is actually (M+2)(M+2) so as to allow
 * for empty places and end-gaps. You don't have to use them if
 * you don't like them.
 * The loops below are far more elegant if we temporarily store
 * the strings with end bits at start and end.
 */
char *
score_mat_string (struct score_mat *smat, struct seq *s1, struct seq *s2)
{
    char *ret = NULL;
    char *str1, *str2;
    size_t n_rows = smat->n_rows;
    size_t n_cols = smat->n_cols;
    size_t i, j;
    const char *c_fmt = "%4c ";
    const char *f_fmt = "%4.2f ";
    const char *hat   = "^";
    seq_thomas2std (s1);
    seq_thomas2std (s2);
    str1 = save_str (hat);
    str1 = save_str_append (str1, s1->seq);
    str1 = save_str_append (str1, hat);
    str2 = save_str (hat);
    str2 = save_str_append (str2, s2->seq);
    str2 = save_str_append (str2, hat);

    scr_reset();
    scr_printf (c_fmt, ' ');
    for ( i = 0; i < n_cols; i++)
        scr_printf (c_fmt, str2[i]);
    scr_printf ("\n");
    for ( i = 0; i < n_rows; i++) {
        scr_printf (c_fmt, str1[i]);
        for ( j = 0 ; j < n_cols; j++)
            scr_printf (f_fmt, smat->mat[i][j]);
        ret = scr_printf ("\n");
    }
    free (str1);
    free (str2);
    return (ret);
}

/* ---------------- score_mat_write  --------------------------
 * Dump a score matrix to a file.
 */
int
score_mat_write (const struct score_mat *smat, const char *fname)
{
    FILE *fp;
    float **mat;
    unsigned int n_rows, n_cols;
    size_t to_write;
    const char *this_sub = "score_mat_write";
    extern const char *null_point, *prog_bug;

    if (sizeof(n_rows) != 4) {
        err_printf (this_sub, prog_bug, __FILE__, __LINE__);
        return EXIT_FAILURE;
    }
    if ( ! smat ) {
        err_printf (this_sub, null_point);
        return EXIT_FAILURE;
    }

    if ( smat->n_rows == 0 || smat->n_cols == 0) {
        err_printf (this_sub, "n_rows or columns is zero for %s\n", fname);
        return EXIT_FAILURE;
    }
    if ((fp = mfopen ( fname, "w", this_sub)) == NULL)
        return EXIT_FAILURE;

    n_rows = (unsigned int) smat->n_rows;
    n_cols = (unsigned int) smat->n_cols;

    if (fwrite (&n_rows, sizeof (n_rows), 1, fp) != 1)
        goto error;
    if (fwrite (&n_cols, sizeof (n_cols), 1, fp) != 1)
        goto error;

    mat = smat->mat;
    to_write = n_rows * n_cols;
    if (fwrite (mat[0], sizeof (mat[0][0]), to_write, fp) != to_write)
        goto error;
    fclose (fp);
    return EXIT_SUCCESS;
    error:
        mperror (this_sub);
        err_printf (this_sub, "Failed writing to %s", fname);
        fclose (fp);
        return EXIT_FAILURE;
}

/* ---------------- score_mat_read   --------------------------
 * Read a score matrix from a file. Return a newly allocated
 * score_mat pointer.
 */
struct score_mat *
score_mat_read ( const char *fname)
{
    float **mat;
    struct score_mat *ret_mat = NULL;
    FILE *fp;
    unsigned to_read;
    unsigned int n_rows, n_cols, ret;
    int err = 0;
    static int first = 1;
    const char *this_sub = "score_mat_read";
    const char *read_fail = "Read fail on %s in %s\n";
    extern const char *prog_bug;
    if (sizeof(n_rows) != 4) {
        err_printf (this_sub, prog_bug, __FILE__, __LINE__);
        return NULL;
    }
    if ((fp = mfopen (fname, "r", this_sub)) == NULL)
        return NULL;

    if ((err = file_no_cache(fp)) != 0) {
        if (first) {
            const char *no_cache = "cannot disable read cache: %s: %s";
            first = 0;
            err_printf (this_sub, no_cache, fname, strerror (err));
        }
    }

    if (fread (&n_rows, sizeof (n_rows), 1, fp) != 1) {
        err_printf (this_sub, read_fail, "n_rows", fname);
        goto end;
    }
    if (fread (&n_cols, sizeof (n_cols), 1, fp) != 1) {
        err_printf (this_sub, read_fail, "n_cols", fname);
        goto end;
    }

    ret_mat = score_mat_new (n_rows - 2, n_cols - 2);
    mat = ret_mat->mat;
    to_read = (n_rows ) * (n_cols);
    if ((ret = fread (mat[0], sizeof (mat[0][0]), to_read, fp)) != to_read) {
        err_printf (this_sub, "Failed reading %s. Wanted %lu items. Got %lu\n",
                    fname, (long unsigned) to_read, (long unsigned) ret);
        score_mat_destroy (ret_mat);
        goto end;
    }

    end:
    fclose (fp);
    return ret_mat;
}

/* ------------------------ score_mat_diag_wipe --------------------------
 * At duplicated scorings, we must get rid off the big selfalignments, in
 * case of very similar or even indentical aa sequences in order to discover
 * circluar permutated proteins (similar domains but different conectivity).
 */
void score_mat_diag_wipe( struct pair_set *p_s, struct score_mat *smat )
{
    int **p, **plast;
    float min, max, av, std_dev;

    score_mat_info ( smat, &min, &max, &av, &std_dev);

    p= p_s->indices;
    plast = p + p_s->n;
    for ( ; p < plast ; p++){
        int nc, nr;
        nc = (*p)[0];
        nr = (*p)[1];
        if (nc == GAP_INDEX)
            continue;
        if (nr == GAP_INDEX)
            continue;

	smat->mat[nc + 1][nr + 1] = min;
    }
}

/* ---------------- score_pvec_double_matrix -----------------
 * Duplication of the scoring matrix containing pvecs.
 * The result is a quadrified matrix with the extra columns and rows '$'.
 *
 *  $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
 *  $--------------|--------------$
 *  $| smat        | smat_2       |$
 *  _|_______________|_______________|_
 *  $|             |             |$
 *  $| smat_3       | smat_4      |$
 *  $--------------|--------------$
 *  $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
 * */

struct score_mat *
score_mat_double_matrix (const struct score_mat *smat){
    struct score_mat *new_smat;
    size_t new_n_cols, new_n_rows;
    size_t i, k;

    float **src_mat, **dst_mat; /* source and destination data */

    new_n_rows = (smat->n_rows -2) * 2;
    new_n_cols = (smat->n_cols -2) * 2;

    /* init new scoring matrix */
    new_smat = score_mat_new( new_n_rows, new_n_cols );
    src_mat = smat->mat;
    dst_mat = new_smat->mat;


    /* copy and duplication of the entries into the big new one
     * considering the extra cols and rows */

    /* first and second, filling the whole rows
     *  $$$$   $= extra rows and cols untouched
     *  $**$
     *  $--$
     *  $$$$
     *
     * */

    for (i = 1; i < smat->n_rows-1 ; i++){
        for(k = 0; k < 2; k++){
           memcpy( dst_mat[i]+(1+ k*(smat->n_cols-2)), src_mat[i]+1, (smat->n_cols-2) * sizeof(src_mat[0][0]) );
        }
    }

    /* third and fourth concurrently
     *  $$$$ $= extra rows and cols untouched
     *  $**$
     *  $**$
     *  $$$$
     */

     for ( i = 1 ; i < smat->n_rows-2; i++){
     	memcpy( dst_mat[i + smat->n_rows - 2]+1, dst_mat[i]+1,  (new_smat->n_cols-2) * 2 * sizeof(dst_mat[0][0]));
     }

    return new_smat;
}

/* ------------------------ score_mat_write_gnuplot ----------------------
 * For Plotting an alignment traceback, e.g. with gnuplot, you need the
 * smat->mat scores. This function writes them into an ASCII file with
 * filename as param with output in each line:
 * 	i j value
 */
int
score_mat_write_gnuplot( const struct score_mat *smat, const char *fname, const char *protA, const char *protB ){
    const char *this_sub = "score_mat_write_gnuplot";
    float **mat = smat->mat;
    size_t i,j;

    FILE *fp;
    if ((fp = mfopen ( fname, "w", this_sub)) == NULL)
        return EXIT_FAILURE;

    if ( mfprintf ( fp , "%s%s%s%s%s\n","# Data from ", protA, ".pdb and ", protB ,".pdb") < 0)
        goto error;

    if ( mfprintf ( fp, "# %u entries per side \n", (unsigned int) smat->n_cols ) < 0)
        goto error;

    if ( mfprintf ( fp, "# Total entries %u\n", (unsigned int)(smat->n_rows * smat->n_cols) ) < 0)
        goto error;

    for (i = 0; i < smat->n_rows; i++ ){
        for (j = 0; j < smat->n_cols; j++ ){
     	    if ( mfprintf ( fp, "%d %d %f \n", i, j , mat[i][j] ) < 0)
                goto error;
     	}
     	if ( mfprintf ( fp, "\n") < 0)
            goto error;
    }
    if ( mfprintf ( fp, "\n" )  < 0)
        goto error;

    fclose (fp);
    return EXIT_SUCCESS;

    error:
        mperror (this_sub);
        err_printf (this_sub, "Failed writing to %s", fname);
        fclose (fp);
        return EXIT_FAILURE;
}
	Global
`s`	Focus search bar
`?`	Bring up this help dialog
	GitHub
`g` `p`	Go to pull requests
`g` `i`	go to github issues (only if github is preferred repository)
	POD
`g` `a`	Go to author
`g` `c`	Go to changes
`g` `i`	Go to issues
`g` `d`	Go to dist
`g` `r`	Go to repository/SCM
`g` `s`	Go to source
`g` `b`	Go to file browse
	Search terms
module: (e.g. module:Plugin)
distribution: (e.g. distribution:Dancer auth)
author: (e.g. author:SONGMU Redis)
version: (e.g. version:1.00)