The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
/*  File: readseq.c
 *  Author: Richard Durbin (rd@sanger.ac.uk)
 *  Copyright (C) R Durbin, 1994
 *-------------------------------------------------------------------
 * Description: generic code to read Pearson format files (fasta)
 		>header line
		conv[x] is the internal code for char 'x'
		conv[x] == -1 or -2 means ignore. 
                conv[x] < -2 means error.
		will work on fil == stdin
 * Exported functions: readSequence, writeSequence, seqConvert
 * HISTORY:
 * Last edited: Dec 11 14:00 2000 (rd)
 * * Dec 29 23:35 1993 (rd): now works off FILE*, returns id and desc
 * Created: Tue Jan 19 21:14:35 1993 (rd)
 * CVS info: $Id: readseq.c,v 1.1 2003-04-15 20:30:39 lstein Exp $
 *-------------------------------------------------------------------
 */

#include "stdio.h"
#include "stdlib.h"
#include "string.h"
#include "ctype.h"

#pragma inline(add)
static char *messalloc (int n)
{
  char *result ;

  if (!(result = (char*) malloc (n)))
    { fprintf (stderr, "MALLOC failure reqesting %d bytes - aborting\n", n) ;
      exit (-1) ;
    }
  return result ;
}

#define messfree(x) free(x)

static void add (char c, char* *buf, int *buflen, int n)
{
  if (n >= *buflen)
    { 
      int blen = *buflen;
      if (blen < 0)
	{ blen = -blen ;
	  *buf = (char*) messalloc (blen) ;
	}
      else
	{ blen *= 2 ;
	  if ((*buf = realloc(*buf,blen)) == NULL)
             { fprintf (stderr, "REALLOC failure reqesting %d bytes - aborting\n", blen) ;
               exit (-1) ;
             }
	}
      *buflen = blen; 
    }
  (*buf)[n] = c;
}

int readSequence (FILE *fil, int *conv, 
                  char **seq, char **id, char **desc, int *length)
{
  char c ;
  int n ;
  static FILE *oldFil = 0 ;
  static int line ;
  int buflen ;

  if (fil != oldFil)
    { line = 1 ;
      oldFil = fil ;
    }
  
/* get id, descriptor */
  c = getc (fil) ;
  if (c == '>')			/* header line */
    { c = getc(fil) ;

      n = 0 ;			/* id */
      buflen = -32;
      while (!feof (fil) && c != ' ' && c != '\n' && c != '\t')
	{ if (id) add (c, id, &buflen, n++) ;
	  c = getc (fil) ;
	}
      if (id) add (0, id, &buflen, n) ;

				/* white space */
      while (!feof (fil) && (c == ' ' || c == '\t'))
	c = getc (fil) ;

      n = 0 ;			/* desc */
      buflen = -32 ;
      while (!feof (fil) && c != '\n')
	{ if (desc) add (c, desc, &buflen, n++) ;
	  c = getc (fil) ;
	}
      if (desc) add (0, desc, &buflen, n) ;

      ++line ;
    }
  else
    { ungetc (c, fil) ;		/* no header line */
      if (id) 
	*id = "" ;
      if (desc)
	*desc = "" ;
    }

  /* ensure whitespace ignored */

  conv[' '] = conv['\t'] = -1 ;
  conv['\n'] = -3 ;

  n = 0 ;			/* sequence */
  buflen = -1024 ;


  while (!feof (fil))
    { 
      c = getc (fil) ;
      if (c == '>')
	{ ungetc (c, fil) ;
	  break ;
	}

      if (c == EOF || c == EOF + 256) /* satisfies all compilers */
	break ;

    
      switch (conv[c]) {
        case -2:
          { if (id) 
              fprintf (stderr, "Bad char 0x%x = '%c' at line %d, base %d, sequence %s\n",
          	     c, c, line, n, *id) ;
            else
              fprintf (stderr, "Bad char 0x%x = '%c' at line %d, base %d\n",
          	     c, c, line, n) ;
            return 0 ;
          }
          break;
        case -3:
          ++line;
        case -1:
          break;
        default:
	  add (conv[c], seq, &buflen, n++) ;
      }
    }
  add (0, seq, &buflen, n) ;

  if (length)
    *length = n ;

  return n ;
}

/*****************************************************/

int seqConvert (char *seq, int *length, int *conv)
{
  int i, n = 0 ;
  int c ;

  for (i = 0 ; seq[i] ; ++i)
    { c = seq[i] ;
      if (length && i >= *length)
	break ;
      if (conv[c] < -2)
	{ fprintf (stderr, "Bad char 0x%x = '%c' at base %d in seqConvert\n", c, c, n) ;
	  return 0 ;
	}
      if (conv[c] >= 0)
	seq[n++] = conv[c] ;
    }
  if (n < i)
    seq[n] = 0 ;

  if (length)
    *length = n ;
  return n ;
}

/*****************************************************/

int writeSequence (FILE *fil, int *conv, 
		   char *seq, char *id, char *desc, int len)
{
  int i ;

  if (!id || !*id)
    { fprintf (stderr, "ERROR: writeSequence requires an id\n") ;
      return 0 ;
    }
  if (!fil)
    { fprintf (stderr, "ERROR: writeSequence requires a file\n") ;
      return 0 ;
    }
    
  fprintf (fil, ">%s", id) ;
  if (desc)
    fprintf (fil, " %s", desc) ;

  for (i = 0 ; i < len ; ++i)
    { if (!(i%60))
	fputc ('\n', fil) ;
      if (conv[seq[i]] > 0)
	fputc (conv[seq[i]], fil) ;
      else
	{ fprintf (stderr, "ERROR in writeSequence: %s[%d] = %d does not convert\n",
		   id, i, seq[i]) ;
	  return 0 ;
	}
    }
  fputc ('\n', fil) ;
  return len ;
}

/*********** read a matrix, using conv ************/

int readMatrix (char *name, int *conv, int** *mat)
{
  char matdirname[256] ;
  char fullname[512] ;
  FILE *fil ;
  char line[1024] = "#", *p;
  int i, j, nsymb, smax = 0 ;
  int symb[128] ;
  extern char* strtok (char*, const char*) ;

  if (getenv ("BLASTMAT")) 
    strcpy (matdirname, getenv ("BLASTMAT")) ;
  else
    strcpy (matdirname, "/nfs/disk100/pubseq/blastdb/") ;
  strcpy (fullname, matdirname) ;
  strcat (fullname, name) ;

  if (!(fil = fopen (name, "r")) && !(fil = fopen (fullname, "r")))
    { fprintf (stderr, "ERROR in readMatrix: could not open %s or %s\n",
	       name, matdirname) ;
      return 0 ;
    }
    
  while (!feof(fil) && *line == '#') /* comments */
    fgets (line, 1023, fil) ;

				/* character set */
  p = line ; while (*p == ' ' || *p == '\t' || *p == '\n') ++p ;
  for (i = 0 ; *p && i < 128 ; ++i)
    { symb[i] = conv[*p] ;
      if (symb[i] < -2)
	{ fprintf (stderr, "ERROR in readMatrix: illegal symbol %c\n", *p) ;
	  fclose (fil) ;
	  return 0 ;
	}
      if (symb[i] > smax)
	smax = symb[i] ;
      ++p ; while (*p == ' ' || *p == '\t' || *p == '\n') ++p ;
    }
  nsymb = i ;

  ++smax ;
  *mat = (int**) messalloc (smax * sizeof (int*)) ;
  for (i = 0 ; i < smax ; ++i)
    (*mat)[i] = (int*) messalloc (smax * sizeof (int)) ;

  for (i = 0 ; fgets(line, 1023, fil) && i < nsymb ; ++i)
    { p = line ; while (*p == ' ' || *p == '\t' || *p == '\n') ++p ;
      if (p && conv[*p] == symb[i])
	{ ++p ; while (*p == ' ' || *p == '\t' || *p == '\n') ++p ; }
      for (j = 0 ; *p && j < 128 ; ++j)
	{ if (symb[i] >= 0 && symb[j] >= 0) 
	    (*mat)[symb[i]][symb[j]] = atoi(p) ;
	  if (*p == '-') ++p ;
	  while (p && *p >= '0' && *p <= '9') ++p ;
	  while (*p == ' ' || *p == '\t' || *p == '\n') ++p ;
	}
      if (j != nsymb)
	{ fprintf (stderr, "ERROR in readMatrix: bad line: %s\n", line) ;
	  fclose (fil) ;
	  return 0 ;
	} 
    }

  fclose (fil) ;
  return 1 ;
}

/*********** standard conversion tables **************/

int dna2textConv[] = {
  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2, 
  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2, 
  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2, 
  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2, 
  -2, 'A',  -2, 'C',  -2,  -2,  -2, 'G',  -2,  -2,  -2,  -2,  -2,  -2, 'N',  -2,
  -2,  -2,  -2,  -2, 'T',  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,
  -2, 'A',  -2, 'C',  -2,  -2,  -2, 'G',  -2,  -2,  -2,  -2,  -2,  -2, 'N',  -2,
  -2,  -2,  -2,  -2, 'T',  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,
} ;

int dna2textAmbig2NConv[] = {
  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2, 
  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2, 
  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2, 
  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2, 
  -2, 'A', 'N', 'C', 'N',  -2,  -2, 'G', 'N',  -2,  -2, 'N',  -2, 'N', 'N',  -2,
  -2,  -2, 'N', 'N', 'T',  -2, 'N', 'N',  -2, 'N',  -2,  -2,  -2,  -2,  -2,  -2,
  -2, 'A', 'N', 'C', 'N',  -2,  -2, 'G', 'N',  -2,  -2, 'N',  -2, 'N', 'N',  -2,
  -2,  -2, 'N', 'N', 'T',  -2, 'N', 'N',  -2, 'N',  -2,  -2,  -2,  -2,  -2,  -2,
} ;

int dna2indexConv[] = {
  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2, 
  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2, 
  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2, 
  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2, 
  -2,   0,  -2,   1,  -2,  -2,  -2,   2,  -2,  -2,  -2,  -2,  -2,  -2,   4,  -2,
  -2,  -2,  -2,  -2,   3,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,
  -2,   0,  -2,   1,  -2,  -2,  -2,   2,  -2,  -2,  -2,  -2,  -2,  -2,   4,  -2,
  -2,  -2,  -2,  -2,   3,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,
} ;

int dna2binaryConv[] = {
  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2, 
  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2, 
  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2, 
  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2, 
  -2,   1,  -2,   8,  -2,  -2,  -2,   4,  -2,  -2,  -2,  -2,  -2,  -2,  15,  -2,
  -2,  -2,  -2,  -2,   2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,
  -2,   1,  -2,   8,  -2,  -2,  -2,   4,  -2,  -2,  -2,  -2,  -2,  -2,  15,  -2,
  -2,  -2,  -2,  -2,   2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,
} ;

int aa2textConv[] = {
  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2, 
  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2, 
  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2, 
  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2, 
  -2, 'A', 'X', 'C', 'D', 'E', 'F', 'G', 'H', 'I',  -2, 'K', 'L', 'M', 'N',  -2,
 'P', 'Q', 'R', 'S', 'T',  -2, 'V', 'W', 'X', 'Y', 'X',  -2,  -2,  -2,  -2,  -2,
  -2, 'A', 'X', 'C', 'D', 'E', 'F', 'G', 'H', 'I',  -2, 'K', 'L', 'M', 'N',  -2,
 'P', 'Q', 'R', 'S', 'T',  -2, 'V', 'W', 'X', 'Y', 'X',  -2,  -2,  -2,  -2,  -2,
} ;

int aa2indexConv[] = {
  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2, 
  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2, 
  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2, 
  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2, 
  -2,   0,  20,   1,   2,   3,   4,   5,   6,   7,  -2,   8,   9,  10,  11,  -2,
  12,  13,  14,  15,  16,  -2,  17,  18,  20,  19,  20,  -2,  -2,  -2,  -2,  -2,
  -2,   0,  20,   1,   2,   3,   4,   5,   6,   7,  -2,   8,   9,  10,  11,  -2,
  12,  13,  14,  15,  16,  -2,  17,  18,  20,  19,  20,  -2,  -2,  -2,  -2,  -2,
} ;

/**************** end of file ***************/