#
# BioStudio object
#
=head1 NAME
Bio::BioStudio
=head1 VERSION
Version 2.10
=head1 DESCRIPTION
=head1 AUTHOR
Sarah Richardson <smrichardson@lbl.gov>
=cut
package Bio::BioStudio;
use base qw(Bio::Root::Root);
use Bio::GeneDesign;
use Bio::BioStudio::ConfigData;
use Bio::BioStudio::Chromosome;
use Bio::BioStudio::Marker;
use Bio::BioStudio::Megachunk;
use Bio::BioStudio::Chunk;
use Bio::BioStudio::SeqFeature::Amplicon;
use Bio::BioStudio::SeqFeature::CDSVariant;
use Bio::BioStudio::SeqFeature::Chunk;
use Bio::BioStudio::SeqFeature::Codon;
use Bio::BioStudio::SeqFeature::Custom;
use Bio::BioStudio::SeqFeature::Megachunk;
use Bio::BioStudio::SeqFeature::ProposedDeletion;
use Bio::BioStudio::SeqFeature::RestrictionSite;
use Bio::BioStudio::SeqFeature::Tag;
use Bio::BioStudio::Repository qw(:BS);
use Bio::BioStudio::BLAST qw(:BS);
use Bio::BioStudio::DB qw(:BS);
use Digest::MD5;
use Scalar::Util qw(looks_like_number);
use File::Path qw(make_path);
use File::Basename;
use YAML::Tiny;
use Carp;
use DBI;
use strict;
use warnings;
our $VERSION = 2.10;
=head1 CONSTRUCTORS
=head2 new
Title : new
Function:
Returns : a new Bio::BioStudio object
Args :
=cut
sub new
{
my ($class, @args) = @_;
my $self = $class->SUPER::new(@args);
my ($repo) = $self->_rearrange([qw(repo)], @args);
bless $self, $class;
#Filepaths
$self->{bioperl_path} = Bio::BioStudio::ConfigData->config('bioperl_path');
$self->{script_path} = Bio::BioStudio::ConfigData->config('script_path');
$self->{tmp_path} = Bio::BioStudio::ConfigData->config('tmp_path');
$self->{conf} = Bio::BioStudio::ConfigData->config('conf_path');
$self->throw("$repo does not exist") if ($repo && ! -e $repo);
$repo = $repo || $self->{conf} . 'genome_repository/';
$self->{repo} = $repo;
#GBrowse configuration
my $gbrowse = Bio::BioStudio::ConfigData->config('gbrowse_support');
if ($gbrowse)
{
$self->{gbrowse} = 1;
}
else
{
$self->{gbrowse} = 0;
}
#Sun Grid Engine configuration
my $sge = Bio::BioStudio::ConfigData->config('SGE_support');
if ($sge)
{
$self->{SGE_support} = 1;
}
else
{
$self->{SGE_support} = 0;
}
#BLAST configuration
my $bl = Bio::BioStudio::ConfigData->config('blast_support');
if ($bl)
{
$self->{blast_support} = 1;
$self->{blast_registry} = {};
}
else
{
$self->{blast_support} = 0;
}
#Cairo configuration
my $cairo = Bio::BioStudio::ConfigData->config('cairo_support');
if ($cairo)
{
$self->{cairo} = 1;
$self->{cairo_conf_path} = $self->conf . 'cairo/';
$self->{cairo_colors_path} = $self->{cairo_conf_path} . 'Cairo_colors.yaml';
}
else
{
$self->{cairo} = 0;
}
$self->{db_engine} = Bio::BioStudio::ConfigData->config('db_engine');
#Custom features
$self->{path_to_features} = $self->{conf} . 'features/';
$self->{custom_features} = $self->_fetch_custom_features();
#Custom markers
$self->{path_to_markers} = $self->{conf} . 'markers/';
$self->{custom_markers} = $self->_fetch_custom_markers();
return $self;
}
=head1 FUNCTIONS
=head1 ACCESSORS
=cut
=head2 path_to_markers
=cut
sub path_to_markers
{
my ($self) = @_;
return $self->{path_to_markers};
}
=head2 custom_markers
=cut
sub custom_markers
{
my ($self) = @_;
return $self->{custom_markers};
}
=head2 path_to_repo
=cut
sub path_to_repo
{
my ($self) = @_;
return $self->{repo};
}
=head2 path_to_features
=cut
sub path_to_features
{
my ($self) = @_;
return $self->{path_to_features};
}
=head2 custom_features
=cut
sub custom_features
{
my ($self) = @_;
return $self->{custom_features};
}
=head2 tmp_path
=cut
sub tmp_path
{
my ($self) = @_;
return $self->{tmp_path};
}
=head2 script_path
=cut
sub script_path
{
my ($self) = @_;
return $self->{script_path};
}
=head2 gbrowse
=cut
sub gbrowse
{
my ($self) = @_;
return $self->{gbrowse};
}
=head2 SGE
=cut
sub SGE
{
my ($self) = @_;
return $self->{SGE_support};
}
=head2 BLAST
=cut
sub BLAST
{
my ($self) = @_;
return $self->{blast_support};
}
=head2 cairo
=cut
sub cairo
{
my ($self) = @_;
return $self->{cairo};
}
=head2 bioperl_path
=cut
sub bioperl_path
{
my ($self) = @_;
return $self->{bioperl_path};
}
=head2 conf
=cut
sub conf
{
my ($self) = @_;
return $self->{conf};
}
=head1 FUNCTIONS
=head2 set_chromosome
=cut
sub set_chromosome
{
my ($self, @args) = @_;
my ($chrname, $gbrowse) = $self->_rearrange([qw(chromosome gbrowse)], @args);
$self->throw('No chromosome name provided') unless ($chrname);
$gbrowse = $gbrowse || 0;
my $chr = Bio::BioStudio::Chromosome->new(
-name => $chrname,
-repo => $self->{repo},
-db_engine => $self->{db_engine},
-gbrowse => $gbrowse,
);
return $chr;
}
=head2 gather_latest_genome
=cut
sub gather_latest_genome
{
my ($self, @args) = @_;
my ($species) = $self->_rearrange([qw(species)], @args);
$self->throw('No species provided') unless ($species);
my @objset = ();
my $latest_names = _gather_latest($species, $self->{repo});
foreach my $chrname (@{$latest_names})
{
my $chr = Bio::BioStudio::Chromosome->new(-name => $chrname);
push @objset, $chr->seqobj;
}
return \@objset;
}
=head2 get_species_BLAST_database
=cut
sub get_species_BLAST_database
{
my ($self, @args) = @_;
if (! $self->{blast_support})
{
warn "BS_WARNING: No BLAST support in this installation of BioStudio.\n" .
"No BLAST factory will be made.\n";
return undef;
}
my ($species) = $self->_rearrange([qw(species)], @args);
$self->throw('No species provided') unless ($species);
my $factory = _get_species_BLAST_database($species, $self->{repo});
if (! exists $self->{blast_registry}->{$species})
{
$self->{blast_registry}->{$species} = $factory;
}
return $factory;
}
=head2 BLASTn_short_sequence
=cut
sub BLASTn_short_sequence
{
my ($self, @args) = @_;
if (! $self->{blast_support})
{
warn "BS_WARNING: No BLAST support in this installation of BioStudio.\n" .
"No BLAST can be run.\n";
return undef;
}
my ($species, $sequence, $count)
= $self->_rearrange([qw(species sequence count)], @args);
$self->throw('No species provided to BLAST') unless ($species);
$self->throw('No sequence provided to BLAST') unless ($sequence);
$count = $count || undef;
if (! ref $sequence)
{
my $sid = Digest::MD5::md5_hex(Digest::MD5::md5_hex(time().{}.rand().$$));
my $obj = Bio::Seq->new(-seq => $sequence, -id => $sid);
$sequence = $obj;
}
elsif (! $sequence->isa("Bio::Seq"))
{
$self->throw('Sequence to BLAST is not a Bio::Seq object');
}
my $seqlen = length $sequence->seq;
if ($seqlen > 40)
{
$self->throw('Sequence length > 40 inappropriate for this function');
}
my $factory = undef;
if (! exists $self->{blast_registry}->{$species})
{
$factory = _get_species_BLAST_database($species, $self->{repo});
$self->{blast_registry}->{$species} = $factory;
}
else
{
$factory = $self->{blast_registry}->{$species};
}
my @hits = _blastn_short($sequence, $factory);
return defined $count ? scalar @hits : @hits;
}
=head2 species_list
=cut
sub species_list
{
my ($self) = @_;
return _list_species($self->{repo});
}
=head2 source_list
=cut
sub source_list
{
my ($self) = @_;
return _list_repository($self->{repo});
}
=head2 gv_increment_warning
=cut
sub gv_increment_warning
{
my ($self, $chromosome) = @_;
my $species = $chromosome->species();
my $seqid = $chromosome->seq_id();
my $cver = $chromosome->chromosome_version();
my $ngver = $chromosome->genome_version() + 1;
my $gscale = $species . q{_} . $seqid . q{_} . $ngver . q{_} . $cver;
my $dblist = $self->source_list();
return $gscale if (exists $dblist->{$gscale});
return 0;
}
=head2 cv_increment_warning
=cut
sub cv_increment_warning
{
my ($self, $chromosome) = @_;
my $species = $chromosome->species();
my $seqid = $chromosome->seq_id();
my $gver = $chromosome->genome_version();
my $cver = $chromosome->chromosome_version();
my $ncver = $chromosome->GD->pad($cver + 1, 2);
my $cscale = $species . q{_} . $seqid . q{_} . $gver . q{_} . $ncver;
my $dblist = $self->source_list();
return $cscale if (exists $dblist->{$cscale});
return 0;
}
=head2 prepare_repository
=cut
sub prepare_repository
{
my ($self, @args) = @_;
my ($species, $chrname) = $self->_rearrange([qw(species chrname)], @args);
my $path = _prepare_repository($self->{repo}, $species, $chrname);
return $path;
}
=head1 ACCESSORS
=head2 genome_repository
A path to a directory that can be used as a genome repository. Defaults to
the config dir as set at install slash genome_repository.
=cut
sub genome_repository
{
my ($self, $value) = @_;
if ($value)
{
$self->throw("$value does not exist") unless (-e $value);
$value .= q{/} unless substr($value, -1, 1) eq q{/};
$self->{repo} = $value;
}
return $self->{repo};
}
=head2 db_engine
A path to a directory that can be used as a genome repository. Defaults to
the config dir as set at install slash genome_repository.
=cut
sub db_engine
{
my ($self, $value) = @_;
if ($value)
{
$self->{db_engine} = $value;
}
return $self->{db_engine};
}
=head2 _fetch_custom_features()
Returns a hashref of custom features in the BioStudio configuration directory.
Each key is a feature name, each value is a Bio::SeqFeature object.
=cut
sub _fetch_custom_features
{
my ($self) = @_;
my %features;
my $path = $self->{path_to_features};
if (-e $path)
{
opendir(my $FDIR, $path);
my @features = grep {$_ =~ m{\.fasta\z}msix} readdir($FDIR);
closedir $FDIR;
foreach my $feature (@features)
{
my $path = $self->path_to_features . $feature;
my ($iterator, $name) = _import_fasta($path);
my ($type, $sequence) = (undef, undef);
while ( my $obj = $iterator->next_seq() )
{
$type = $obj->id;
$sequence = $obj->seq;
last;
}
my $prototype = Bio::BioStudio::SeqFeature::Custom->new(
-prototype => $name,
-primary_tag => $type,
-default_sequence => $sequence,
);
$features{$name} = $prototype;
}
}
return \%features;
}
=head2 _fetch_custom_markers
=cut
sub _fetch_custom_markers
{
my ($self) = @_;
my %markers;
my $path = $self->{path_to_markers};
if (-e $path)
{
opendir(my $FDIR, $path);
my @markers = grep {$_ =~ m{\.gff\z}msix} readdir($FDIR);
closedir $FDIR;
my $n = 1;
foreach my $marker (@markers)
{
my $name = 'marker' . $n;
$n++;
$name = $1 if ($marker =~ m{([\w\d]+)\.gff\z}msix);
my $path = $self->{path_to_markers} . $marker;
my $db = Bio::DB::SeqFeature::Store->new(
-adaptor => 'memory',
-gff => $path,
-index_subfeatures => 'true'
);
$markers{$name} = Bio::BioStudio::Marker->new(
-name => $name,
-db => $db
);
}
}
return \%markers;
}
=head2 _import_fasta
=cut
sub _import_fasta
{
my ($path) = @_;
my $iterator = Bio::SeqIO->new(-file => $path);
my ($filename, $dirs, $suffix) = fileparse($path, qr/\.[^.]*/x);
return ($iterator, $filename);
}
1;
=head2 DESTROY
=cut
sub DESTROY
{
my ($self) = @_;
my @factories = values %{$self->{blast_registry}};
foreach my $factory (@factories)
{
$factory->cleanup();
}
$self->SUPER::DESTROY if $self->can("SUPER::DESTROY");
return;
}
__END__
=head1 COPYRIGHT AND LICENSE
Copyright (c) 2013, BioStudio developers
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice, this
list of conditions and the following disclaimer in the documentation and/or
other materials provided with the distribution.
* The names of Johns Hopkins, the Joint Genome Institute, the Lawrence Berkeley
National Laboratory, the Department of Energy, and the BioStudio developers may
not be used to endorse or promote products derived from this software without
specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE DEVELOPERS BE LIABLE FOR ANY DIRECT, INDIRECT,
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
=cut