The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
/// Author: Cyriac Kandoth
/// Date: 01.19.2011
/// Description: Counts bases with sufficient read-depth in regions of interest within two BAMs
/// Notes:
/// - If ROIs of the same gene overlap, they will not be merged. Use BEDtools' mergeBed if needed
/// - The totals written at the end count each base only once, even if it is in multiple ROIs

#define _GNU_SOURCE
#include <stdio.h>
#include "sam.h"
#include "faidx.h"
#include "khash.h"

KHASH_MAP_INIT_STR( s, int )

// Initializes the header hash in a given bam_header_t. Defined in samtools/bam_aux.c
void bam_init_header_hash( bam_header_t *header );

// Create some values that represent the different refseq bp-classes
enum bp_class_t { AT, CG, CpG, IUB, UNKNOWN };

typedef struct
{
  int beg, end; // The start and stop of a region of interest
  int min_mapq; // Minimum mapping quality of the reads to pileup
  int min_depth_bam1, min_depth_bam2; // Minimum read depth required per bam
  bool *bam1_cvg; // Tags bases in a region of bam1 with the minimum required read-depth
  int covd_bases; // Counts bases in a region that has the minimum read depth in both bams
  int base_cnt[4]; // Counts covered bases in an ROI of 4 bp-classes AT, CG, CpG, IUB
  int tot_covd_bases; // Counts bases in all ROIs that have the minimum read depth in both bams
  int tot_base_cnt[4]; // Counts covered bases in all ROIs of 4 bp-classes AT, CG, CpG, IUB
  char *ref_seq; // Contains the reference sequence for the entire chromosome a region lies in
  char *bp_class; // This prevents counting the same base twice when ROIs overlap in a chromosome
  int ref_id, ref_len; // A chromosome's ID in the the BAM header hash, and it's length
  samfile_t *sam1, *sam2; // The two bam files that need to be piled-up
} pileup_data_t;

// Callback for bam_fetch() pushed only alignments that pass the minimum mapping quality
static int fetch_func( const bam1_t *b, void *data )
{
  bam_plbuf_t *buf = (bam_plbuf_t*)data;
  bam_plbuf_push( b, buf ); // Invokes the callback function specified in bam_plbuf_init()
  return 0;
}

// Callback for bam_plbuf_init() when running a pileup on bam1
static int pileup_func_1( uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data )
{
  pileup_data_t *tmp = (pileup_data_t*)data;
  if( (int)pos >= tmp->beg && (int)pos < tmp->end )
  {
    // Count the number of reads that pass the mapping quality threshold across this base
    int i, mapq_n = 0;
    for( i = 0; i < n; ++i )
    {
      const bam_pileup1_t *base = pl + i;
      if( !base->is_del && base->b->core.qual >= tmp->min_mapq )
        mapq_n++;
    }
    tmp->bam1_cvg[(int)pos - tmp->beg] = ( mapq_n >= tmp->min_depth_bam1 );
  }
  return 0;
}

// Callback for bam_plbuf_init() when running a pileup on bam2
static int pileup_func_2( uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data )
{
  pileup_data_t *tmp = (pileup_data_t*)data;
  if( (int)pos >= tmp->beg && (int)pos < tmp->end && tmp->bam1_cvg[(int)pos - tmp->beg] )
  {
    // Count the number of reads that pass the mapping quality threshold across this base
    int i, mapq_n = 0;
    for( i = 0; i < n; ++i )
    {
      const bam_pileup1_t *base = pl + i;
      if( !base->is_del && base->b->core.qual >= tmp->min_mapq )
        mapq_n++;
    }
    if( mapq_n >= tmp->min_depth_bam2 )
    {
      int class = (int)(tmp->bp_class[pos]);
      if( class == UNKNOWN )
      {
        char base = tmp->ref_seq[pos];
        char prev_base = tmp->ref_seq[pos-1];
        char next_base = tmp->ref_seq[pos+1];
        if( base == 'A' || base == 'T' )
          class = AT;
        else if(( base == 'C' && next_base == 'G' ) || ( base == 'G' && prev_base == 'C' ))
          class = CpG;
        else if( base == 'C' || base == 'G' )
          class = CG;
        else
          class = IUB;
        ++tmp->covd_bases;
        ++tmp->base_cnt[class];
        ++tmp->tot_covd_bases;
        ++tmp->tot_base_cnt[class];
        tmp->bp_class[pos] = (char)class; // Tag this as seen and save its class for an overlapping ROI
      }
      else
      {
        ++tmp->covd_bases;
        ++tmp->base_cnt[class];
      }
    }
  }
  return 0;
}

int main( int argc, char *argv[] )
{
  // This data is shared across functions
  pileup_data_t data;
  data.ref_id = -1;
  data.ref_seq = NULL;
  data.bp_class = NULL;

  // Set default min_mapq and min_depths
  data.min_mapq = 20;
  data.min_depth_bam1 = 6;
  data.min_depth_bam2 = 8;
  if( argc != 6 && argc != 9 )
  {
    fprintf( stderr, "\nUsage: %s <bam1> <bam2> <roi_file> <ref_seq_fasta> <output_file> [min_depth_bam1 min_depth_bam2 min_mapq]", argv[0] );
    fprintf( stderr, "\nDefaults: min_depth_bam1 = %i, min_depth_bam2 = %i, min_mapq = %i", data.min_depth_bam1, data.min_depth_bam2, data.min_mapq );
    fprintf( stderr, "\nNOTE: ROI file *must* be sorted by chromosome/contig names\n\n" );
    return 1;
  }
  // Set user-defined min_depths if specified
  if( argc == 9 )
  {
    data.min_depth_bam1 = atoi( argv[6] );
    data.min_depth_bam2 = atoi( argv[7] );
    data.min_mapq = atoi( argv[8] );
  }

  // Open both BAM files and load their index files
  data.sam1 = samopen( argv[1], "rb", 0 );
  if( !data.sam1 )
    fprintf( stderr, "Failed to open BAM file %s\n", argv[1] );
  bam_index_t *idx1 = bam_index_load( argv[1] );
  if( !idx1 )
    fprintf( stderr, "BAM index file is not available for %s\n", argv[1] );
  data.sam2 = samopen( argv[2], "rb", 0 );
  if( !data.sam2 )
    fprintf( stderr, "Failed to open BAM file %s\n", argv[2] );
  bam_index_t *idx2 = bam_index_load( argv[2] );
  if( !idx2 )
    fprintf( stderr, "BAM index file is not available for %s\n", argv[2] );

  // Open the file with the annotated regions of interest
  FILE *roiFp = fopen( argv[3], "r" );
  if( !roiFp )
    fprintf( stderr, "Failed to open ROI file %s\n", argv[3] );

  // Load an index to the reference sequence fasta file
  faidx_t *ref_fai = fai_load( argv[4] );
  if( !ref_fai )
    fprintf( stderr, "Failed to open reference fasta file %s\n", argv[4] );

  // Open the output file to write to
  FILE* outFp = fopen( argv[5], "w" );
  if( !outFp )
    fprintf( stderr, "Failed to open output file %s\n", argv[5] );

  // Show the user any and all errors they need to fix above before quitting the program
  if( !data.sam1 || !idx1 || !data.sam2 || !idx2 || !roiFp || !ref_fai || !outFp )
    return 1;

  // Write a header with column titles for the output file
  fprintf( outFp, "#NOTE: Last line in file shows non-overlapping totals across all ROIs\n" );
  fprintf( outFp, "#Gene\tROI\tLength\tCovered\tATs_Covered\tCGs_Covered\tCpGs_Covered\n" );

  // Initialize a header hash to check for valid ref_names in bam1
  bam_init_header_hash( data.sam1->header );
  khash_t(s) *hdr_hash = (khash_t(s)*)data.sam1->header->hash;

  // Initialize the counters for the total number of non-overlapping bases in all ROIs
  data.tot_covd_bases = data.tot_base_cnt[AT] = data.tot_base_cnt[CG] = data.tot_base_cnt[CpG] = 0;

  size_t length;
  char *line = NULL;
  while( getline( &line, &length, roiFp ) != -1 )
  {
    char ref_name[20], gene_name[50];
    if( sscanf( line, "%s %i %i %s", ref_name, &data.beg, &data.end, gene_name ) == 4 )
    {
      int ref_id;
      // If this region is valid in bam1, we'll assume it's also valid in bam2
      khiter_t iter = kh_get( s, hdr_hash, ref_name );
      if( iter == kh_end( hdr_hash ) || data.beg > data.end )
      {
        fprintf( stderr, "Skipping invalid ROI: %s", line );
      }
      else
      {
        --data.beg; // Make the start locus a 0-based coordinate
        ref_id = kh_value( hdr_hash, iter );
        int bases = data.end - data.beg;
        data.covd_bases = data.base_cnt[AT] = data.base_cnt[CG] = data.base_cnt[CpG] = 0;
        data.bam1_cvg = (bool*)calloc( bases, sizeof( bool )); // calloc also sets them to zero

        // Load this whole chromosome's refseq unless already loaded for the previous ROI
        if( data.ref_seq == NULL || ref_id != data.ref_id )
        {
          if( data.ref_seq )
            free( data.ref_seq );
          if( data.bp_class )
            free( data.bp_class );
          data.ref_seq = fai_fetch( ref_fai, data.sam1->header->target_name[ref_id], &data.ref_len );
          data.bp_class = (char*)malloc( data.ref_len * sizeof( char ));
          memset( data.bp_class, UNKNOWN, data.ref_len );
          data.ref_id = ref_id;
        }

        // If the ROI is at a chromosome tip, edit it so we can look for CpGs without segfaulting
        if( data.beg == 0 )
          ++data.beg;
        if( data.end == data.ref_len )
          --data.end;

        // Pileup bam1 and tag all the bases which have sufficient read depth
        bam_plbuf_t *buf1 = bam_plbuf_init( pileup_func_1, &data ); // Initialize pileup
        bam_fetch( data.sam1->x.bam, idx1, ref_id, data.beg, data.end, buf1, fetch_func );
        bam_plbuf_push( 0, buf1 );
        bam_plbuf_destroy( buf1 );

        // Pileup bam2 and count bases with sufficient read depth, and tagged earlier in bam1
        bam_plbuf_t *buf2 = bam_plbuf_init( pileup_func_2, &data ); // Initialize pileup
        bam_fetch( data.sam2->x.bam, idx2, ref_id, data.beg, data.end, buf2, fetch_func );
        bam_plbuf_push( 0, buf2 );
        bam_plbuf_destroy( buf2 );

        fprintf( outFp, "%s\t%s:%i-%i\t%i\t%i\t%i\t%i\t%i\n", gene_name, ref_name, data.beg+1, data.end,
                 bases, data.covd_bases, data.base_cnt[AT], data.base_cnt[CG], data.base_cnt[CpG] );
        free( data.bam1_cvg );
      }
    }
    else
    {
      fprintf( stderr, "Badly formatted ROI: %s", line );
      fprintf( stderr, "\nROI file should be a tab-delimited list of [chrom, start, stop, annotation]" );
      fprintf( stderr, "\nwhere start and stop are both 1-based chromosomal loci" );
      fprintf( stderr, "\nFor example:\n20\t44429404\t44429608\tELMO2\nMT\t5903\t7445\tMT-CO1\n" );
      fprintf( stderr, "\nNOTE: ROI file *must* be sorted by chromosome/contig names\n\n" );
      return 1;
    }
  }

  // The final line in the file contains the non-overlapping base counts across all ROIs
  fprintf( outFp, "#NonOverlappingTotals\t\t\t%i\t%i\t%i\t%i\n", data.tot_covd_bases,
           data.tot_base_cnt[AT], data.tot_base_cnt[CG], data.tot_base_cnt[CpG] );

  // Cleanup
  if( line )
    free( line );
  if( data.ref_seq )
    free( data.ref_seq );
  if( data.bp_class )
    free( data.bp_class );
  bam_index_destroy( idx1 );
  bam_index_destroy( idx2 );
  samclose( data.sam1 );
  samclose( data.sam2 );
  fai_destroy( ref_fai );
  fclose( roiFp );
  fclose( outFp );
  return 0;
}