The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/perl

#!/usr/bin/perl -w

=head1 NAME

ncbi_2_gff.pl - Massage NCBI chromosome annotation into GFF-format suitable for Bio::DB::GFF

=head1 VERSION (CVS-info)

 $RCSfile: process_ncbi_human.pl,v $
 $Revision: 1.1 $
 $Author: lstein $
 $Date: 2008-10-16 17:01:27 $


=head2 SYNOPSIS

   perl process_ncbi_human.pl [options] /path/to/gzipped/datafile(s)

=head2 DESCRIPTION

This script massages the chromosome annotation files located at

  ftp://ftp.ncbi.nih.gov/genomes/H_sapiens/maps/mapview/chromosome_order/

into the GFF-format recognized by Bio::DB::GFF. If the resulting GFF-files are loaded into a Bio::DB:GFF database using the utilities described below, the annotation can be viewed in the Generic Genome Browser (http://www.gmod.org/ggb/) and interfaced with using the Bio::DB:GFF libraries.
  (NB these NCBI-datafiles are dumps from their own mapviewer database backend, according to their READMEs)

To produce the GFF-files, download all the chr*sequence.gz files from the FTP-directory above. While in that same directory, run the following example command (see also help clause by running script with no arguments):

process_ncbi_human.pl --locuslink [path to LL.out_hs.gz] chr*sequence.gz

This will unzip all the files on the fly and open an output file with
the name chrom[$chrom]_ncbiannotation.gff for each, read the LocusLink
records into an in-memory hash and then read through the NCBI feature
lines, lookup 'locus' features in the LocusLink hash for details on
'locus' features and print to the proper GFF files.  LL.out_hs.gz is
accessible here at the time of writing:

  ftp://ftp.ncbi.nih.gov/refseq/LocusLink/LL.out_hs.gz

Note that several of the NCBI features are skipped from the
reformatting, either because their nature is not fully known at this
time (TAG,GS_TRAN) or their sheer volume stands in the way of them
being accessibly in Bio::DB::GFF at this time (EST similarities). You
can easily change this by modifying the $SKIP variable to your liking
to add or remove features, but if you add then you will have to add
handling for those new features.

To bulk-import the GFF-files into a Bio::DB::GFF database, use the
bulk_load_gff.pl utility provided with Bio::DB::GFF

=head2 AUTHOR

Gudmundur Arni Thorisson E<lt>mummi@cshl.orgE<gt>

Copyright (c) 2002 Cold Spring Harbor Laboratory

       This code is free software; you can redistribute it
       and/or modify it under the same terms as Perl itself.

=cut

use strict;
use Getopt::Long;
use IO::File;
use Bio::DB::GFF::Util::Binning 'bin';
use File::Basename;

my $self = basename($0);
my ($doTSCSNP,$doLocuslink,$debug);
my $opt = &GetOptions ('locuslink=s'  => \$doLocuslink,
		       'tscsnp=s'     => \$doTSCSNP,
		       'debug=s'      => \$debug,
		       );
die <<USAGE if(!defined($opt) || @ARGV == 0);
Usage: $self [options] <GFF filename or wildcard pattern>
  Massage NCBI chromosome annotation datafiles into GFF-format suitable for importing into  Bio::DB::GFF database. Note that the program handles both unzipped datafiles and gzipped, bzipped or compressed ones, so do not bother with unzipping big downloads before running.
  See 'perldoc $self' for more info
Options:
   --locuslink Path to zipped LocusLink file, currently located at
               ftp://ftp.ncbi.nih.gov/refseq/LocusLink/LL.out_hs.gz
               used to lookup gene description and official symbols
   --tscsnp    DSN string to TSC MySQL database to use for auxiliary
               SNP feature attributes (CSHL internal use)
   --debug     Enable debugging output for the DBI database driver.
               (CSHL internal use)

  Options can be abbreviated.  For example, you can use -l for
--locuslink.
Author: Gudmundur Arni Thorisson <mummi\@cshl.org>
Copyright (c) 2002 Cold Spring Harbor Laboratory
       This library is free software; you can redistribute it
       and/or modify it under the same terms as Perl itself.

USAGE
;

#Prepare decompression streams for input files, if necessary
my %FH;
print "\nPreparing input and output streams:\n";
foreach (@ARGV) {
    my $chrom=basename($_);                       # NG 02-10-24
    ($chrom) = $chrom=~/0?([0-9,XYxy]{1,2})/;     # NG 02-10-24
    unless($chrom)
    {
	print "can't get chrom name from filename '$_', SKIPPING";
	next;
    }
    $FH{'Chr'.$chrom} = IO::File->new("chrom$chrom\_ncbiannotation.gff",">") or die $_,": $!";
    $_ = "gunzip -c $_ |" if /\.gz$/;
    $_ = "uncompress -c $_ |" if /\.Z$/;
    $_ = "bunzip2 -c $_ |" if /\.bz2$/;
}

#If TSC SNP processing is to be performed, connect to db and prepare query
my $dbh;
my $tsc_sth;
if($doTSCSNP)
{
    #Is this an argstring or file with the string in it?
    my $dbistring = -f $doTSCSNP ? `cat $doTSCSNP` : $doTSCSNP;
    $dbh = &dbConnect($dbistring);
    $dbh->trace($debug) if $debug;
    print "\nConnecting to TSC database using '$dbistring'\n";
    my $query = qq/
SELECT ta.snp_id,
       ta.variation,
       tr.locus_id,
       tr.gene_symbol,
       tr.fxn_class,
       tf.institute_code as lab,
       tf.pop_type,
       tf.outcome,
       tf.pooled_data,
       tf.num_people_typed,
       tf.allele_a_freq as A,
       tf.allele_c_freq as C,
       tf.allele_g_freq as G,
       tf.allele_t_freq as T
FROM tbl_dbsnp_2_tsc dbsnp
LEFT JOIN tbl_refsnp_gene_fxn tr on tr.refsnp_id=dbsnp.rs_id
LEFT JOIN tbl_allele_freq tf on tf.dbsnp_id=dbsnp.ss_id
LEFT JOIN TBL_SNP_ALL ta on ta.snp_id = dbsnp.tsc_id
WHERE dbsnp.rs_id = ? limit 1/;
    $tsc_sth = $dbh->prepare($query);
}

#If Locuslink-processing is to be performed, Read 
#previously cached data structure from disk
my $llData;


if($doLocuslink)
{
    $doLocuslink = "gunzip -c $doLocuslink |" if $doLocuslink =~ /\.gz$/;
    $doLocuslink = "uncompress -c $doLocuslink |" if $doLocuslink =~ /\.Z$/;
    $doLocuslink = "bunzip -c $doLocuslink |" if $doLocuslink =~ /\.bz$/;
    open LL,$doLocuslink || die $!;
    my $l = 0;
    while(<LL>)
    {
	$l++;
	print "\r--$l LocusLink records loaded" if $l % 100 ==0;
	my ($id,$osym,$isym,$mim,$chrom,$loc,$desc,$taxid,$db) = split /\t/;
	my $name = $osym || $isym;
	#print "  Loading in Locuslink id='$id',osym='$osym',isym='$isym',name='$name',desc='$desc'\n";
	$llData->{$id}->{name} = $name;
	$llData->{$id}->{isym} = $isym;
	$llData->{$id}->{mim}  = $mim;
	$llData->{$id}->{chrom}= $chrom;
	$llData->{$id}->{loc}  = $loc;
	$llData->{$id}->{desc} = $desc;
	$llData->{$id}->{taxid}= $taxid;
	$llData->{$id}->{db}   = $db;
    }
    close LL;
}


my %sources     = (snp        => 'dbSNP',
		   sts        => 'UniSTS',
		   locus      => 'LocusLink',
		   transcript => 'RefSeq',
		   transcript_mouse => 'RefSeq',
		   transcript_human => 'RefSeq',
		   component  => 'Genbank',
		   contig     => 'RefSeq',
		   tag        => 'SAGE',
		   gs_tran    => 'GenomeScan',
		   );
my %classes = (component    => 'Sequence',
	       sts          => 'STS',
	       snp          => 'SNP',
	       locus        => 'Locus',
	       transcript   => 'Transcript',
	       transcript_human   => 'Transcript',
	       transcript_mouse   => 'Transcript',
	       contig       => 'Contig',
	       clone        => 'Clone',
	       tag          => 'SAGE_tag',
	       gs_tran      => 'Transcript',
	       );

my %subcomponents = (transcript => 'exon',
		     transcript_human => 'exon',
		     transcript_mouse => 'exon',
		     gs_tran => 'exon',
		     locus      => 'exon',		     
		     component  => 'subcomponent',
		     );


#And now process all incoming data streams
my $i = 0;
my %maxCoords;
my $SKIP = q/^EST|component|clone/;
my %groups   = (); #aggregate parent features
my %density  = ();
my $binSize  = 100000;
my $max = 0;
print "\nStarting main loop:\n";
while(<>)
{
    chomp;
    next if /^\#/;
    my ($type,$objId,$name,$chrom,$start,$stop,$strand) = split "\t";
    my ($class,$source);
    my $score = '.';
    $strand = '.' unless $strand =~ m/^(\+|\-)$/;
    $type = lc $type;
    if($type eq 'gs_tran')
    {
	$type   = 'transcript';
	$source = 'GenomeScan';
    }
    elsif($type eq 'est_human' && $name =~ /^NM_/)
    {
	$type   = 'transcript'; 
	$source = 'RefSeq-human';
    }
    elsif($type eq 'est_mouse' && $name =~ /^NM_/)
    {
	$type   = 'transcript';
	$source ='RefSeq-mouse';
    }
    next if $type =~ /$SKIP/i;
    $i++;
    #my ($chrom,$ctg) = split /\|/,$chromctg;
    next if $chrom =~ /NT/; #ambigously placed NT-contig at start of chrom
    $chrom = "Chr$chrom";
    $max = $stop if $stop > $max;
    $class ||= $classes{$type};
    unless($class)
    {
	print "need class for type '$type': '$_' (OR add type to \$SKIP pattern\n";
	next;
    }
    my $method = $type;
    $source ||= $sources{$type} || die "ERROR: need source for type '$type'";
    $objId = $name if $type =~ /transcript|snp|contig|gs_tran|tag/;
    my $attributes = qq/$class $objId/;
    $attributes .= qq/; Name $name/ unless $objId eq $name;
    my $bin = &bin($start,$stop,$binSize);
    $bin =~ s/^[10]+\.[0]+//;
    $bin ||= 0;
    #print "\$bin='$bin' ($start=>$stop)\n";
    $density{$chrom}->{$method.'_dens:'.$source}->{$bin}++;
    
    #Deduce start/stop for certain parent features to be printed
    #to output file AFTER we've processed everything. This is 
    #necessary because NCBI only gives start/stop values for the child
    #features, like exons in a gene, but not the whole parent feature
    if($type =~ /transcript|locus/)
    {
	$groups{$type}->{$objId}->{$chrom}->{name} = $name;
	$groups{$type}->{$objId}->{$chrom}->{start} ||= 9999999999999;
	$groups{$type}->{$objId}->{$chrom}->{stop} ||= 0;
	$groups{$type}->{$objId}->{$chrom}->{start} = $start 
	    if  $start < $groups{$type}->{$objId}->{$chrom}->{start};
	$groups{$type}->{$objId}->{$chrom}->{stop} = $stop 
	    if $stop > $groups{$type}->{$objId}->{$chrom}->{stop}; 
	$groups{$type}->{$objId}->{$chrom}->{source} = $source;
	$groups{$type}->{$objId}->{$chrom}->{strand} = $strand;
	$groups{$type}->{$objId}->{$chrom}->{method} = $method;
	$groups{$type}->{$objId}->{$chrom}->{class} = $class;
	$method  = $subcomponents{$type};
	next if  $type eq 'locus';
    }
    #This is for internal CSHL usage
    elsif($type =~ /snp/ && $doTSCSNP)
    {
	#print "  -got refSNP ID: $name, let's do TSC lookup\n";
	if(my $tscAttributes = &queryTSCdb($dbh,$name))
	{
	    #$FH{$chrom}->print(qq/$chrom\tTSC\tsnp\t$start\t$stop\t.\t$strand\t.\t$tscAttributes\n/);
	    $attributes .= $tscAttributes;
	    #print "\$attributes='$attributes'\n";
	}
    }

    #Trying to work around the contig pile-up at the start of a chromosome 
    if($method eq 'contig' && $stop == 0)
    {
	print STDERR "SKIPPING, contig '$name' as stop = $stop and start = $start.\n";
    }

    #And finally print to the proper output stream
    $FH{$chrom}->print(qq/$chrom\t$source\t$method\t$start\t$stop\t.\t$strand\t$score\t$attributes\n/);

    #Collect max coordinates, to deduce chromosome sizes
    $maxCoords{$chrom} ||= 0;
    $maxCoords{$chrom} = $stop if $stop > $maxCoords{$chrom};
    
    #Progress indicator
    if ( $i % 1000 == 0) 
    {
	my ($chrom) = $ARGV=~ /0?([0-9,XYxy]{1,2})/;
	print STDERR "$i total features parsed. Now doing chromosome $chrom";
	print STDERR -t STDOUT && !$ENV{EMACS} ? "\r" : "\n";
    }
}#MAIN LOOP ENDS


#Print out group features like transcripts and genes that 
#were collected before and print to the proper output streams
print "\nPrinting out collected aggregate features\n";
foreach my $type(keys %groups)
{
    foreach my $objId (keys %{$groups{$type}})
    {
	#print "\$name='$name'\n";
	foreach my $chrom(keys %{$groups{$type}->{$objId}})
	{
	    my $name    = $groups{$type}->{$objId}->{$chrom}->{name};
	    my $start   = $groups{$type}->{$objId}->{$chrom}->{start};
	    my $stop    = $groups{$type}->{$objId}->{$chrom}->{stop};
	    my $strand  = $groups{$type}->{$objId}->{$chrom}->{strand};
	    my $method  = $groups{$type}->{$objId}->{$chrom}->{method};
	    my $class   = $groups{$type}->{$objId}->{$chrom}->{class};
	    my $source  = $groups{$type}->{$objId}->{$chrom}->{source};
	    if($type eq 'locus' && $doLocuslink)
	    {
		my $llInfo = '';
		my $ll    = $llData->{$objId};
		my $id    = $ll->{id};
		my $note  = $ll->{desc} ? qq/Note "$name:$ll->{desc}"/ : ' ';
		$note =~ s/;/\\;/g;
		$FH{$chrom}->print( qq/$chrom\t$source\t$method\t$start\t$stop\t.\t$strand\t.\tLocus $objId/);
		$FH{$chrom}->print(qq/; Name $name/) unless $objId eq $name;
		$FH{$chrom}->print(qq/; $note\n/);
	    }
	    else
	    {
		$FH{$chrom}->print(qq/$chrom\t$source\t$method\t$start\t$stop\t.\t$strand\t.\t$class $name; Name $name\n/);
	    }
	}
    }
}

#   	$density{$method.'_dens:'.$source}->{$bin}++;
#Print out the collected binned density stats
print "Printing out density stats\n";

foreach my $chrom(sort keys %density)
{
    foreach my $meth(sort keys %{$density{$chrom}})
    {
	my $bc = 0;
	foreach my $bin(sort {$a<=>$b}keys %{$density{$chrom}->{$meth}})
	{
	    $bc++;
	    my $count = $density{$chrom}->{$meth}->{$bin};
	    my $binstart = $bin*$binSize;
	    my $binstop  = $binstart+$binSize;
	    print "  \$bin=$bin,\$binstart=$binstart,\$count=$count\n";
	    $FH{$chrom}->print(qq/$chrom\tNCBI\t$meth\t$binstart\t$binstop\t$count\t.\t.\t\n/);
	}
	print " $bc bins for method $meth\n";
    }
}

#Print a line for the reference sequences themselves
while(my ($chrom,$max) = each %maxCoords)
{
    $FH{$chrom}->print(qq/$chrom\tassembly\tchromosome\t1\t$max\t.\t+\t.\tSequence \"$chrom\"\n/);
}

print "\nDONE. $i features parsed\n\n";

#------------------------------------------------
# Subroutines
#------------------------------------------------

#For internal CSHL use. Queries our inhouse MySQL database with 
#SNP Consortium data for various auxiliary data on some SNPs
sub queryTSCdb
{
    my $dbh   = shift;
    my $rs_id = shift;
    my $attributes;
    $rs_id  =~ s/rs//;

    #Baeat vid herna, na i classification string fra dbSNP
    my ($note,$tsc_id,$var,$lab,$dbsnp_id,$class,$gene_symbol,$locus_id,$freq);
    $tsc_sth->execute($rs_id) || die $@;
    my $tscInfo = $tsc_sth->fetchrow_hashref() || return undef;
    do{
	#while(my($k,$v) = each %$tscInfo){print "  '$k'=>'$v'\n";}
	#dbSNP stuff, may apply to more than just TSC snps
	$locus_id = $tscInfo->{locus_id};
	$class = $tscInfo->{fxn_class};
	$gene_symbol = $tscInfo->{gene_symbol};
 	$attributes .= qq/; SNPClass $class/ if $class;

	#TSC specific stuff
	$tsc_id = $tscInfo->{snp_id} || return $attributes;
	$tsc_id = sprintf("TSC%7.7d", $tsc_id);
	$var = lc $tscInfo->{variation};
	$lab = $tscInfo->{institute_code};
	$dbsnp_id = $tscInfo->{dbsnp_id};
	$freq = 1 if $tscInfo->{pop_type} && $tscInfo->{outcome} eq 'S';
	$attributes .= qq/; Alias $tsc_id/;
	#$attributes .= qq/; Variation $var/ if $var;
	$attributes .= qq/; AllFreq 1/ if $freq;
    }while($tscInfo = $tsc_sth->fetchrow_hashref());
    $note = qq/; Note "$tsc_id($var)"/;
    $rs_id = $dbh=$tsc_id=$var=$lab=$dbsnp_id=$class=$gene_symbol=$locus_id= $tscInfo=$freq= undef;
    return  $attributes.$note;
}

sub dbConnect
{
    my $dsn = shift;
    my $dbh;
    eval "require DBI; 1" or die "DBI module not installed. Cannot continue";
    eval{$dbh = DBI->connect($dsn,
			     {
				 RaiseError => 1,
				 FetchHashKeyName => 'NAME_lc',
			     }
                             )
         };
    if($@ || !$dbh)
    {
        print STDERR "ERROR, cannot connect to DB! $@\n";
        die $DBI::errstr;
    }
    return $dbh;
}