/*
* 21 mar 2005
* Read the influence file from an autoclass run for sequence
* classification and put the numbers into a table.
* Write code for getting the probability of a sequence fragment.
* Implementation..
* This assumes our input is the text results from an autoclass
* classification and it assumes a corresponding layout of the
* data. This means we look for characteristic strings and
* patterns. We do this with a mixture of the posix regex
* functions and calls to strstr(). In principle, we do not need
* both, but string matching is so much simpler, it is used to
* quickly hop over large parts of the input file.
*
* $Id: read_ac.c,v 1.3 2008/04/12 18:09:20 torda Exp $
*/
#define _XOPEN_SOURCE 600
#include <errno.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <regex.h>
#include <stdlib.h>
#include <string.h>
#include "amino_a.h"
#include "e_malloc.h"
#include "fio.h"
#include "matrix.h"
#include "mprintf.h"
#include "prob_vec.h"
#include "prob_vec_i.h"
#include "read_ac.h"
#include "read_ac_i.h"
#include "read_seq_i.h"
#include "seq.h"
#include "seqprof.h"
#include "str.h"
/* ---------------- Structures -------------------------------- */
/* A classification is an array of classes, + the number of
* classes and the number of attributes per class.
* An attribute is an array of probabilities (aa_prob) for each
* amino acid.
*/
/* After much experimenting, it looks as if we do not need to
* store all the numbers that we can read from the classification
* file. We used to read into a "struct aa_prob". Now, we might
* as well remove all references to this and just read up the log
* of probabilities.
*/
#ifdef want_old_aa_prob
struct aa_prob {
float log_pp;
float prob_jkl;
float prob_kl;
};
#endif /* want_old_aa_prob */
/* ---------------- find_line ---------------------------------
* We are looking for a line containing a string.
* We will return that line. If the line is not found, return a
* NULL pointer.
* We use a buffer, provided by the caller, to store the line.
* s is the string we are looking for.
*/
static char *
find_line(char *buf, const int size, FILE *fp, const char *s)
{
do {
if (fgets (buf, size, fp) == NULL)
return NULL;
} while (strstr (buf, s) == NULL);
return buf;
}
/* ---------------- m_regcomp ---------------------------------
* This is just a wrapper around the regular expression
* compilation routine. To do the call properly requires all this
* extra baggage for printing out errors, so let us do it here.
* We assume the simplest case of extended regular expressions
* and no special handling of start or end of string.
*/
static int
m_regcomp ( regex_t *r_compiled, const char *regex)
{
int r;
enum { BSIZE = 256 } ;
char ebuf [BSIZE];
const int cflags = REG_EXTENDED;
const char *this_sub = "m_regcomp";
if ((r = regcomp (r_compiled, regex, cflags))) {
regerror (r, r_compiled, ebuf, BSIZE);
err_printf (this_sub, "%s\n", ebuf);
return EXIT_FAILURE;
}
return EXIT_SUCCESS;
}
/* ---------------- find_regex --------------------------------
* Read down the file until we find a line with the wanted
* regular expression.
* Return NULL if we do not find it.
* Read up to max lines. If we want to keep reading, in order to
* search for a file, set max to 0.
* Return a pointer (into the buffer) which points to our
* expression. For example...
* We are looking for "how" in the string "hello how are we".
* We will return the address of the buffer + 6.
*/
static char *
find_regex (char *buf, const int size, FILE *fp, const char *regex, int max)
{
char *ret = NULL;
regex_t reg;
int r;
size_t nmatch = 1;
regmatch_t pmatch[1];
if (m_regcomp (®, regex))
goto exit;
do { /* Read lines until either max lines are */
if (fgets (buf, size, fp) == NULL) /* read or we find the string */
goto exit;
r = regexec(®,buf, nmatch, pmatch, 0);
} while((r == REG_NOMATCH) && (--max != 0) );
if (r != REG_NOMATCH)
ret = buf + pmatch->rm_so;
exit:
regfree (®);
return ret;
}
/* ---------------- get_n_class -------------------------------
* Read the classification output and return the number of
* classes. Returning zero means we found an error.
*/
static size_t
get_n_class (FILE *fp, char *buf, const int bufsiz)
{
char *s;
long int l;
const char *this_sub = "get_n_class";
const char *class_str = "[0-9]* POPULATED CLASSE";
if ((s = find_regex ( buf, bufsiz, fp, class_str, 0)) == NULL) {
err_printf (this_sub, "Failed to find number of classes\n");
return 0;
}
l = strtol (s, NULL, 0);
return (size_t) l;
}
/* ---------------- get_n_att ---------------------------------
* Keep reading the file and find the number of attributes in the
* classification.
* There is a call to find_regex() below. This used to print an
* error when it did not find anything. Unfortunately, it the
* unified code, this may not be an error. We may be reading a
* file without sequence information.
*/
static size_t
get_n_att (FILE *fp, char *buf, const int bufsiz)
{
size_t n_att = 1;
const char *this_sub = "get_n_att";
const char *parse_fail = "Failed at %s:%d\n";
const char *look_for = "Looking for regex \"%s\"\n";
const char *heading = "num description ";
const char *magic_line = "[0-9]+ aa *[0-9] +[0-9].[0-9]+";
if ( ! find_line (buf, bufsiz, fp, heading)) {
err_printf (this_sub, parse_fail, __FILE__, __LINE__);
err_printf (this_sub, look_for, heading);
return 0;
}
if ( ! find_regex (buf, bufsiz, fp, magic_line, 20)) {
# ifdef want_parse_error
err_printf (this_sub, parse_fail, __FILE__, __LINE__);
err_printf (this_sub, look_for, magic_line);
# endif /* want_parse_error */
return 0;
}
while ( find_regex (buf, bufsiz, fp, magic_line, 1))
n_att++;
return n_att;
}
/* ---------------- get_three_num -----------------------------
* Given a string from the classification file, return the last
* three floating point numbers on the line.
* Put the answers into the float pointers.
* If something breaks, return EXIT_FAILURE;
*/
static int
get_three_num (char *buf, regex_t *three_num,
float *f1, float *f2, float *f3)
{
const char *this_sub = "get_three_num";
float **ff[3];
regmatch_t pmatch[1];
size_t nmatch = 1;
int eflags = 0;
int i;
ff[0] = &f1; ff[1] = &f2; ff[2] = &f3;
errno = 0;
for (i = 0; i < 3; i++) {
double d;
char *tmp;
if (regexec (three_num, buf, nmatch, pmatch, eflags) != 0)
return EXIT_FAILURE;
tmp = buf + pmatch->rm_so;
d = strtod (tmp, NULL);
if (errno && (d == 0.0)) {
err_printf (this_sub, "invalid double number %s\n", buf);
return EXIT_FAILURE;
}
buf += pmatch->rm_eo;
**(ff[i]) = (float)d;
}
return EXIT_SUCCESS;
}
/* ---------------- get_class_wt ------------------------------
* Given a string like
* "CLASS 0 - weight 3 normalized weight 0.500 rela"
* return the normalised class weight, 0.500, in this case.
*/
static float
get_class_wt (const char *buf)
{
float f;
char *s;
const char *s_n = "normalized weight ";
const char *this_sub = "get_class_wt";
if ((s = strstr (buf, s_n)) == NULL) {
err_printf (this_sub, "Failed to find %s in %s\n", s_n, buf);
return -1.0;
}
s += strlen (s_n);
f = (float) strtod (s, NULL);
return f;
}
/* ---------------- read_class --------------------------------
* We have the overview of the classification, now read up the
* description of each class. buf is just a line buffer for us to
* use. aa_clssfcn is the main classification.
* n_class is the number of the class we are reading.
* n_val is the number of values a descriptor can have. For this
* function, it is always 20, the number of amino acids.
*/
static int
read_class (FILE *fp, char *buf, const int bufsiz,
struct aa_clssfcn *aa_clssfcn, const size_t n_class,
const size_t num_aa)
{
float wt;
int want_new_att = 1;
int att_num = -1;
int ret = EXIT_SUCCESS; /* On error, set this and go to escape */
unsigned aa_seen = 0;
unsigned att_seen = 0;
regex_t new_att, next_aa, three_num, get_att_num;
const char *this_sub = "read_class";
const char *heading1 = " numb t mtt description I-jk";
/* 00 02 D SM aa 2 ............... 0.116 h .................. -1.79e+00 3.84e-03 2.30e-02 */
const char *s_class_wt = "normalized weight ";
const char *s_new_att = "[0-9]+ [0-9]+ .+M +.+[.]+.+[.]+ -*[0-9]";
/* n .................. -1.02e+00 1.53e-02 4.26e-02 */
/* const char *s_next_aa = "[a-zA-Z] [.]{13}"; */
const char *s_next_aa = "[a-zA-Z] [ .]{13}";
const char *s_three_num = "-*[0-9][.][0-9]+e[+-][0-9]+";
/* const char *s_get_att_num = "[0-9]{2,} [DR] +S.+[a-zA-Z0-9] [.]{13}"; */
const char *s_get_att_num = "[0-9]{2,} [DR] +S.+aa";
const char *broke_ijk = "Broke looking for I-jk value in \"%s\"\n";
if ( ! find_line (buf, bufsiz, fp, s_class_wt))
return EXIT_FAILURE;
if ((wt = get_class_wt (buf)) < 0) {
err_printf (this_sub, "Failed finding class weight on %s\n", buf);
return EXIT_FAILURE;
}
aa_clssfcn->class_wt[n_class] = wt;
if ( ! find_line (buf, bufsiz, fp, heading1))
return EXIT_FAILURE;
if (m_regcomp (&new_att, s_new_att) == EXIT_FAILURE)
return EXIT_FAILURE;
if (m_regcomp (&next_aa, s_next_aa) == EXIT_FAILURE)
return EXIT_FAILURE;
if (m_regcomp (&three_num, s_three_num) == EXIT_FAILURE)
return EXIT_FAILURE;
if (m_regcomp (&get_att_num, s_get_att_num) == EXIT_FAILURE)
return EXIT_FAILURE;
while ( fgets (buf, bufsiz, fp) && (att_seen < aa_clssfcn->n_att)) {
char *p = buf;
regmatch_t pmatch[1];
const size_t nmatch = 1;
int r;
const int eflags = 0;
char aa;
unsigned char t_aa;
float f1, f2, f3;
if (want_new_att) {
aa_seen = 0;
r = regexec (&get_att_num, p, nmatch, pmatch, eflags);
if (r != 0) /* This is a new attribute, so */
continue; /* first we get the attribute */
p += pmatch->rm_so; /* number into att_num */
att_num = (int) strtol (p, NULL, 0);
if (r !=0) {
err_printf (this_sub, broke_ijk, buf);
ret = EXIT_FAILURE;
goto escape;
}
want_new_att = 0;
}
r = regexec (&next_aa, p, nmatch, pmatch, eflags);
if (r != 0) /* Now we are looking for the substring */
break; /* which contains the amino acid letter */
p += pmatch->rm_so;
aa = *p++;
t_aa = std2thomas_char (aa);
if (get_three_num (p, &three_num, &f1, &f2, &f3) == EXIT_FAILURE) {
ret = EXIT_FAILURE;
goto escape;
}
aa_clssfcn->log_pp [n_class] [att_num] [t_aa] = f1;
if (++aa_seen >= num_aa) {
want_new_att = 1;
att_seen++;
}
}
escape:
regfree (&get_att_num);
regfree (&next_aa);
regfree (&new_att);
regfree (&three_num);
return ret;
}
/* ---------------- aa_clssfcn_destroy ------------------------
* This clean up function will be called by the perl interface.
*/
void
aa_clssfcn_destroy( struct aa_clssfcn* aa_clssfcn)
{
free (aa_clssfcn->class_wt);
kill_3d_array ((void *)aa_clssfcn->log_pp);
free (aa_clssfcn);
}
/* ---------------- new_aa_clssfcn-----------------------------
* Do all the allocating to set up a new classification.
*/
static struct aa_clssfcn *
new_aa_clssfcn ( const size_t n_class, const size_t n_att)
{
struct aa_clssfcn *aa_clssfcn = E_MALLOC (sizeof (*aa_clssfcn));
size_t s, t;
aa_clssfcn->n_class = n_class;
aa_clssfcn->n_att = n_att;
t = sizeof (aa_clssfcn->log_pp[0][0][0]);
aa_clssfcn->log_pp = d3_array (n_class, n_att, MIN_AA, t);
s = sizeof (aa_clssfcn->class_wt[0]);
aa_clssfcn->class_wt = E_MALLOC (n_class * s);
# ifdef fill_with_a_value_for_debugging
{
unsigned i, j, k;
unsigned n1 = n_class;
unsigned n2 = n_att;
unsigned n3 = MIN_AA;
struct aa_prob boo;
for (i = 0; i < n1; i++) {
for (j = 0; j < n2; j++) {
for (k = 0; k < n3; k++) {
boo.log_pp = i; boo.prob_jkl = j; boo.prob_kl = k;
aa_clssfcn->aa_prob [i][j][k] = boo;
}
}
}
}
# endif /* fill_with_a_value_for_debugging */
return aa_clssfcn;
}
/* ---------------- ac_read ----------------------------------
* Given a pointer to a filename, read in the data.
* This is an interface function, visible to the outside world.
*/
struct aa_clssfcn *
ac_read (const char *fname)
{
FILE *fp;
size_t n_class, n_att;
struct aa_clssfcn *aa_clssfcn = NULL;
size_t i;
const char *this_sub = "ac_read";
# ifndef BUFSIZ
enum {BUFSIZ = 1024};
# endif
char buf [BUFSIZ];
if ((fp = mfopen (fname, "r", this_sub)) == NULL)
return (NULL);
if ((n_class = get_n_class (fp, buf, BUFSIZ)) == 0)
goto end;
if ((n_att = get_n_att (fp, buf, BUFSIZ)) == 0)
goto end;
aa_clssfcn = new_aa_clssfcn ( n_class, n_att);
for (i = 0; i < n_class; i++) {
if (read_class (fp, buf, BUFSIZ, aa_clssfcn, i, MIN_AA)==EXIT_FAILURE){
aa_clssfcn_destroy (aa_clssfcn);
goto end;
}
}
end:
fclose (fp);
return (aa_clssfcn);
}
/* ---------------- ac_size ----------------------------------
* Return the fragment length size associated with a
* classification. This is *not* the number of classes. It will
* typically be a number between 4 and 9.
*/
size_t
ac_size (const struct aa_clssfcn *aa_clssfcn)
{
return aa_clssfcn->n_att;
}
/* ---------------- ac_nclass ---------------------------------
* Return the number of classes in a classification.
*/
size_t
ac_nclass ( const struct aa_clssfcn *aa_clssfcn)
{
return aa_clssfcn->n_class;
}
/* ---------------- ac_dump ----------------------------------
* Print some information about the classification.
*/
void
ac_dump ( const struct aa_clssfcn *aa_clssfcn)
{
mprintf ("The classification has %ld classes and fragment length of %ld\n",
(long int) aa_clssfcn->n_class, (long int) aa_clssfcn->n_att);
}
/* ---------------- computeMembershipAAProf --------------------
*/
int
computeMembershipAAProf (float **mship, const struct seqprof *sp,
const struct aa_clssfcn *aa_clssfcn)
{
size_t i, j, k, r;
const size_t n_pvec = sp->nres - aa_clssfcn->n_att + 1;
const char *this_sub = "computeMembershipAAProf";
if (sp->nres < aa_clssfcn->n_att) {
err_printf (this_sub, "Seq length %d to small for classification %d\n",
(int) sp->nres, (int) aa_clssfcn->n_att);
return EXIT_FAILURE;
}
memset (mship[0], 0, aa_clssfcn->n_class * n_pvec * sizeof(mship[0][0]));
for (i = 0; i < n_pvec; i++){ /* seq loop */
for (j = 0; j < aa_clssfcn->n_class; j++){
for (k = 0; k < aa_clssfcn->n_att; k++){ /* Class loop */
for (r = 0; r < blst_afbet_size; r++) /* amino acid loop */
mship[i][j] += aa_clssfcn->log_pp[j][k][r] *
sp->freq_mat[i+k][r];
}
mship[i][j] = exp(mship[i][j]);
}
}
return EXIT_SUCCESS;
}
/* ---------------- computeMembershipAA -----------------------
* mship[s][0] = exp (log_pp[0][0][s]+log_pp[0][1][s+1]+...+
* log_pp[0][n_att][s+n_att])
* Calculate the probability vectors for a sequence.
* The answers go straight into mship[][] of dimension n_pvec,
* and the number of classes.
* It does *not* multiply by the proper class weight. This should
* be done by the caller.
* Return EXIT_SUCCESS / FAILURE.
*/
int
computeMembershipAA (float **mship, struct seq *seq,
const struct aa_clssfcn *aa_clssfcn)
{
size_t i, j, k;
const char z_aa = 22;
const char x_aa = 20;
const size_t n_pvec = seq->length - aa_clssfcn->n_att + 1;
const char *this_sub = "computeMembershipAA";
if (seq->length < aa_clssfcn->n_att) {
err_printf (this_sub, "Seq length %d to small for classification %d\n",
(int) seq->length, (int) aa_clssfcn->n_att);
return EXIT_FAILURE;
}
seq_std2thomas (seq);
memset (mship[0], 0, n_pvec * aa_clssfcn->n_class * sizeof(mship[0][0]));
for (i = 0; i < n_pvec; i++){
for (j = 0; j < aa_clssfcn->n_class; j++){
for (k = 0; k < aa_clssfcn->n_att; k++){
unsigned char aa_i = (unsigned char)seq->seq[i+k];
if (aa_i == z_aa || aa_i == x_aa)
continue;
mship[i][j] += aa_clssfcn->log_pp[j][k][aa_i];
}
mship[i][j] = exp(mship[i][j]);
}
}
return EXIT_SUCCESS;
}
/* ---------------- seq_2_prob_vec ----------------------------
* Take a sequence object and return a probability vector.
* The probability vector routines are defined elsewhere since
* they will be used by both sequence and structure based
* functions.
* TODO: this function should be shut down, it is deprecated
* TODO: use the function in read_ac_strct_i.h instead
*/
struct prob_vec *
seq_2_prob_vec (struct seq *seq, const struct aa_clssfcn *aa_clssfcn)
{
size_t i, j;
const char *this_sub = "seq_2_prob_vec";
struct prob_vec *pvec;
const size_t prot_len = seq->length;
const size_t n_pvec = seq->length - aa_clssfcn->n_att + 1;
pvec = new_pvec (aa_clssfcn->n_att, prot_len, n_pvec, aa_clssfcn->n_class);
if (!pvec) {
err_printf (this_sub, "Failed to make probability vector\n");
return NULL;
}
if (computeMembershipAA (pvec->mship, seq, aa_clssfcn) == EXIT_FAILURE ) {
prob_vec_destroy (pvec);
return NULL;
}
for (i = 0; i < n_pvec; i++) {
double sum = 0.0;
for (j = 0; j < aa_clssfcn->n_class; j++) {
pvec->mship[i][j] *= aa_clssfcn->class_wt[j];
sum += pvec->mship[i][j];
}
for (j = 0; j < aa_clssfcn->n_class; j++)
pvec->mship[i][j] /= sum;
}
pvec->norm_type = PVEC_TRUE_PROB; /* Signal that probabilities are */
/* normalised so they sum to 1.0 */
return pvec;
}
#ifdef WANT_MAIN
#ifdef GCC
void usage () __attribute__ ((noreturn));
#endif
#include "str.h"
/* ---------------- spielen ---------------------------------
*/
static void
spielen (const struct clssfcn *aa_clssfcn)
{
const char *s[] = { "cgac", "ngsn", "npea", NULL};
char **ss;
size_t i;
size_t foo;
ss = (char **)s;
foo = aa_clssfcn->n_class;
do {
float *per_class = E_MALLOC (foo * sizeof (*per_class));
str_2_class (aa_clssfcn, per_class, *ss);
for ( i = 0; i < aa_clssfcn->n_class; i++)
mprintf ("%s class %4d %.3f\n", *ss, i, per_class[i]);
free (per_class);
} while (*(++ss) != NULL);
}
/* ---------------- usage ---------------------------------
*/
static void
usage (const char *name)
{
err_printf (name, ": input_file\n");
exit (EXIT_FAILURE);
}
/* ---------------- main ---------------------------------
*/
int
main (int argc, char *argv[])
{
struct aa_clssfcn *aa_clssfcn;
if (argc !=2)
usage(argv[0]);
if ( (aa_clssfcn = ac_read (argv[1])) == NULL)
return (EXIT_FAILURE);
spielen (aa_clssfcn);
aa_clssfcn_destroy (aa_clssfcn);
return (EXIT_SUCCESS);
}
#endif /* WANT_MAIN */