Skip to content

Commit 1eed85e

Browse files
committed
added GRCh and Ensembl release specification for trans step
1 parent 5120e3a commit 1eed85e

File tree

5 files changed

+142
-33
lines changed

5 files changed

+142
-33
lines changed
8.38 MB
Binary file not shown.

bin/hotspot3d

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
use strict;
99
use warnings;
1010

11-
our $VERSION = 'V1.1.6';
11+
our $VERSION = 'V1.2.0';
1212

1313
use Carp;
1414
use FileHandle;

dist.ini

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
name = HotSpot3D
22
author = Beifang Niu, John Wallis, Adam D Scott, Sohini Sengupta , & Amila Weerasinghe from McDonnell Genome Institute of Washington University at St. Louis
3-
version = 1.1.6
3+
version = 1.2.0
44
license = Perl_5
55
copyright_holder = McDonnell Genome Institute at Washington University
66
copyright_year = 2013

lib/TGI/Mutpro/Preprocess/AllPreprocess.pm

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,8 @@ sub new {
4545
$this->{'command'} = "";
4646
$this->{'_OUTPUT_DIR'} = getcwd;
4747
$this->{'_BLAT'} = "blat";
48+
$this->{'GRCh'} = undef;
49+
$this->{'release'} = undef;
4850
$this->{'max_3d_dis'} = 100;
4951
$this->{'p_value_cutoff'} = 1;
5052
$this->{'min_seq_dis'} = 0;
@@ -63,6 +65,8 @@ sub process {
6365
$options = GetOptions (
6466
'output-dir=s' => \$this->{'_OUTPUT_DIR'},
6567
'blat=s' => \$this->{'_BLAT'},
68+
'grch=i' => \$this->{'GRCh'},
69+
'release=i' => \$this->{'release'},
6670
'3d-distance-cutoff=i' => \$this->{'max_3d_dis'},
6771
'p-value-cutoff=i' => \$this->{'p_value_cutoff'},
6872
'linear-cutoff=i' => \$this->{'min_seq_dis'},
@@ -96,6 +100,18 @@ sub blat {
96100
return $this->{'_BLAT'};
97101
}
98102

103+
sub grch {
104+
my $this = shift;
105+
if ( @_ ) { $this->{'GRCh'} = shift; }
106+
return $this->{'GRCh'};
107+
}
108+
109+
sub release {
110+
my $this = shift;
111+
if ( @_ ) { $this->{'release'} = shift; }
112+
return $this->{'release'};
113+
}
114+
99115
sub pvaluecutoff {
100116
my $this = shift;
101117
if ( @_ ) { $this->{'p_value_cutoff'} = shift; }
@@ -145,6 +161,12 @@ sub trans {
145161
my $this = shift;
146162
my $cmd = "hotspot3d trans --output-dir ".$this->outputDir();
147163
$cmd .= " --blat ".$this->blat();
164+
if ( $this->grch() ) {
165+
$cmd .= " --grch ".$this->grch();
166+
}
167+
if ( $this->release() ) {
168+
$cmd .= " --release ".$this->release();
169+
}
148170
print STDOUT "running: ".$cmd."\n";
149171
return system( $cmd );
150172
}
@@ -218,6 +240,9 @@ Usage: hotspot3d prep [options]
218240
OPTIONAL
219241
--start What step to start on ( calroi , statis , anno , trans , cosmic , prior ), default is calroi
220242
--blat Installation of blat to use for trans (defaults to your system default)
243+
--grch Genome build (37 or 38), defaults to 38 or according to --release input
244+
--release Ensembl release verion (55-87), defaults to 87 or to the latest release according to --grch input
245+
Note that releases 55-75 correspond to GRCh37 & 76-87 correspond to GRCh38
221246
--p-value-cutoff p_value cutoff(<=) for prior, default is 0.05
222247
--3d-distance-cutoff 3D distance cutoff (<= Angstroms) for prior, default is 20
223248
--linear-cutoff Linear distance cutoff (> peptides) for prior, default is 0

lib/TGI/Mutpro/Preprocess/Trans.pm

Lines changed: 115 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
package TGI::Mutpro::Preprocess::Trans;
22
#
33
#----------------------------------
4-
# $Authors: Beifang Niu
4+
# $Authors: Beifang Niu & Adam D Scott
55
# $Date: 2014-01-14 14:34:50 -0500 (Tue Jan 14 14:34:50 CST 2014) $
6-
# $Revision: $
6+
# $Revision: 2 $
77
# $URL: $
88
# $Doc: $ transcripts processing and added them in table
99
#----------------------------------
@@ -21,42 +21,38 @@ use File::Temp qw/ tempfile /;
2121

2222
use TGI::Mutpro::Preprocess::Uniprot;
2323

24+
my $LATESTGRCH = 38;
25+
my $LATEST38RELEASE = 87; #TODO will need to be updated as Ensembl has new releases
26+
my $EARLIEST38RELEASE = 76;
27+
my $LATEST37RELEASE = 75;
28+
my $EARLIEST37RELEASE = 55;
29+
2430
sub new {
2531
my $class = shift;
2632
my $this = {};
2733
$this->{_OUTPUT_DIR} = getcwd;
2834
$this->{_STAT} = undef;
2935
$this->{_BLAT} = "blat";
36+
$this->{GRCh} = undef;
37+
$this->{release} = undef;
3038
bless $this, $class;
3139
$this->process();
3240
return $this;
3341
}
3442

3543
sub process {
3644
my $this = shift;
37-
my ( $help, $options );
38-
unless( @ARGV ) { die $this->help_text(); };
39-
$options = GetOptions (
40-
'output-dir=s' => \$this->{_OUTPUT_DIR},
41-
'blat=s' => \$this->{_BLAT},
42-
'help' => \$help,
43-
);
44-
if ( $help ) { print STDERR help_text(); exit 0; };
45-
unless( $options ) { die $this->help_text(); };
46-
unless( $this->{_OUTPUT_DIR} ) { warn 'You must provide a output directory ! ', "\n"; die $this->help_text(); };
47-
unless( -e $this->{_OUTPUT_DIR} ) { warn 'HotSpot3D Trans Error: output directory does not exist ! ', "\n"; die $this->help_text(); };
48-
unless( system( "which $this->{_BLAT}" ) == 0 ) { warn 'HotSpot3D Trans Error: blat not found - $this->{_BLAT} ! ' , "\n"; die $this->help_text(); };
49-
#### processing ####
45+
$this->setOptions();
5046
# add transcript annotation for uniprot
5147
my ( $peptidesDir, $UniprotIdFile, $peptidesFile, $outputFile, );
5248
$peptidesDir = "$this->{_OUTPUT_DIR}\/humanPeptides";
5349
$UniprotIdFile = "$this->{_OUTPUT_DIR}\/hugo.uniprot.pdb.csv";
54-
$peptidesFile = "$peptidesDir\/Homo_sapiens.GRCh37.74.pep.all.fa";
50+
$peptidesFile = "$peptidesDir\/Homo_sapiens.GRCh".$this->{GRCh}.".".$this->{release}.".pep.all.fa";
5551
$outputFile = "$this->{_OUTPUT_DIR}\/hugo.uniprot.pdb.transcript.csv";
5652
unless( -e $peptidesDir ) { mkdir( $peptidesDir ) || die "can not make peptides directory !\n"; };
5753
## get peptide file
58-
my $url = 'ftp://ftp.ensembl.org/pub/release-74/fasta/homo_sapiens/pep/Homo_sapiens.GRCh37.74.pep.all.fa.gz';
59-
my $downloadFile = "$peptidesDir\/Homo_sapiens.GRCh37.74.pep.all.fa.gz";
54+
my $url = $this->makeEnsemblFastaURL();
55+
my $downloadFile = $peptidesFile.".gz";
6056
if ( not -e $downloadFile ) {
6157
getstore( $url, $downloadFile );
6258
}
@@ -83,32 +79,32 @@ sub process {
8379
#print ">uniprotseq\n";
8480
#print $uniprotSequence."\n";
8581

86-
my $transProtein = $uniprotRef->transProteinHash();
87-
next unless( defined $transProtein );
82+
my $enst2ensp = $uniprotRef->transProteinHash(); #non-versioned ENST ids
83+
next unless( defined $enst2ensp );
8884
my $transcriptContent = "";
89-
foreach my $transcript ( keys %{$transProtein} ) {
90-
#print ">$transcript\n";
91-
my $proteinid = $$transProtein{$transcript};
92-
my $proteinsequence = $$peptideSeqsRef{$proteinid};
93-
#print "$proteinsequence\n";
94-
next unless(defined $proteinsequence);
95-
if ( $uniprotSequence eq $proteinsequence ) {
96-
$transcriptContent .= $transcript."[1|1-".length($proteinsequence)."|".length($proteinsequence)."],";
85+
foreach my $enst ( keys %{$enst2ensp} ) { #non-versioned ENST id
86+
#print ">$enst\n";
87+
my $ensp = $enst2ensp->{$enst};
88+
my $proteinSequence = $this->getVersionedSequence( $peptideSeqsRef , $ensp ); #
89+
#print "$proteinSequence\n";
90+
next unless(defined $proteinSequence);
91+
if ( $uniprotSequence eq $proteinSequence ) {
92+
$transcriptContent .= $enst."[1|1-".length($proteinSequence)."|".length($proteinSequence)."],";
9793
} else {
9894
my ( undef, $tmp_uniprot_seq_file ) = tempfile();
9995
my $tmp_uniprot_fh = IO::File->new( $tmp_uniprot_seq_file, ">" ) or die "Temporary file could not be created. $!";
10096
$tmp_uniprot_fh->print( ">$uniprotId\n$uniprotSequence\n" );
10197
my ( undef, $tmp_transcript_protein_seq_file ) = tempfile();
10298
my $tmp_transcript_fh = IO::File->new( $tmp_transcript_protein_seq_file, ">" ) or die "Temporary file could not be created. $!";
103-
$tmp_transcript_fh->print( ">$transcript\n$proteinsequence\n" );
99+
$tmp_transcript_fh->print( ">$enst\n$proteinSequence\n" );
104100
$tmp_uniprot_fh->close; $tmp_transcript_fh->close;
105101
my ( undef, $tmp_blat_output_file ) = tempfile();
106102
my $blat = "blat";
107103
system( "$this->{_BLAT} $tmp_uniprot_seq_file $tmp_transcript_protein_seq_file -t=prot -q=prot -out=blast $tmp_blat_output_file" );
108104
# parse blat output
109105
my $tmp_parse_cont = "";
110106
map{ $tmp_parse_cont .= $_.":"; } @{$this->parse_blat_output( $tmp_blat_output_file, $uniprotId, 0.90 )};
111-
if ( $tmp_parse_cont ne "" ) { chop( $tmp_parse_cont ); $transcriptContent .= $transcript."[".$tmp_parse_cont."],"; };
107+
if ( $tmp_parse_cont ne "" ) { chop( $tmp_parse_cont ); $transcriptContent .= $enst."[".$tmp_parse_cont."],"; };
112108
# clean files
113109
unlink $tmp_uniprot_seq_file; unlink $tmp_transcript_protein_seq_file; unlink $tmp_blat_output_file;
114110

@@ -126,6 +122,86 @@ sub process {
126122
$fhout->close();
127123
}
128124

125+
sub setOptions {
126+
my ( $this ) = @_;
127+
my ( $help, $options );
128+
unless( @ARGV ) { die $this->help_text(); };
129+
$options = GetOptions (
130+
'output-dir=s' => \$this->{_OUTPUT_DIR},
131+
'blat=s' => \$this->{_BLAT},
132+
'grch=f' => \$this->{GRCh},
133+
'release=f' => \$this->{release},
134+
'help' => \$help,
135+
);
136+
if ( $help ) { print STDERR help_text(); exit 0; };
137+
unless( $options ) { die $this->help_text(); };
138+
unless( $this->{_OUTPUT_DIR} ) { warn 'You must provide a output directory ! ', "\n"; die $this->help_text(); };
139+
unless( -e $this->{_OUTPUT_DIR} ) { warn 'HotSpot3D Trans Error: output directory does not exist ! ', "\n"; die $this->help_text(); };
140+
if ( $this->checkBLAT() ) { warn 'HotSpot3D Trans Error: blat not found - $this->{_BLAT} ! ' , "\n"; die $this->help_text(); }
141+
$this->setEnsemblVersions();
142+
return;
143+
}
144+
145+
sub checkBLAT {
146+
my ( $this ) = @_;
147+
return system( "which $this->{_BLAT}" );
148+
}
149+
150+
sub setEnsemblVersions {
151+
my ( $this ) = @_;
152+
if ( not defined $this->{GRCh} and not defined $this->{release} ) {
153+
warn "Ensembl GRCh & release versions not specified, defaulting to latest: GRCh".$LATESTGRCH.".".$LATEST38RELEASE."\n";
154+
$this->{GRCh} = $LATESTGRCH;
155+
$this->{release} = $LATEST38RELEASE;
156+
} elsif ( not defined $this->{GRCh} ) {
157+
if ( $this->{release} >= $EARLIEST38RELEASE and $this->{release} <= $LATEST38RELEASE ) {
158+
$this->{GRCh} = 38;
159+
} elsif ( $this->{release} <= $LATEST37RELEASE and $this->{release} >= $EARLIEST37RELEASE ) {
160+
$this->{GRCh} = 37;
161+
} elsif ( $this->{release} <= 54 ) {
162+
warn "Ensembl releases <= 54 not supported\n";
163+
die $this->help_text();
164+
}
165+
warn "Ensembl GRCh version not specified. User specified release ".$this->{release}.", so using GRCh".$this->{GRCh}."\n";
166+
} elsif ( not defined $this->{release} ) {
167+
if ( $this->{GRCh} == 38 ) {
168+
warn "Ensembl release versions not specified, defaulting to latest: GRCh".$this->{GRCh}.".".$LATEST38RELEASE."\n";
169+
$this->{release} = $LATEST38RELEASE;
170+
} elsif ( $this->{GRCh} == 37 ) {
171+
warn "Ensembl release versions not specified, defaulting to latest: GRCh".$this->{GRCh}.".".$LATEST37RELEASE."\n";
172+
$this->{release} = 75;
173+
} else {
174+
warn "Unrecognized GRCh version (user set ".$this->{GRCh}.", not 37 or 38)\n";
175+
die $this->help_text();
176+
}
177+
} else {
178+
}
179+
}
180+
#### processing ####
181+
sub getVersionedSequence {
182+
my ( $this , $peptideSeqs , $ensp ) = @_;
183+
foreach my $id ( keys %{$peptideSeqs} ) {
184+
if ( $id =~ /$ensp/ ) {
185+
return $peptideSeqs->{$id};
186+
}
187+
}
188+
return undef;
189+
}
190+
191+
sub makeEnsemblFastaURL {
192+
my ( $this ) = @_;
193+
my $url = "ftp://ftp.ensembl.org/pub/release-";
194+
$url .= $this->{release};
195+
$url .= "/fasta/homo_sapiens/pep/Homo_sapiens.GRCh";
196+
$url .= $this->{GRCh};
197+
if ( $this->{GRCh} == 37 ) {
198+
$url .= ".";
199+
$url .= $this->{release};
200+
}
201+
$url .= ".pep.all.fa.gz";
202+
return $url;
203+
}
204+
129205
# loading peptides
130206
sub loadPeptides {
131207
my ( $this, $pepfile, ) = @_;
@@ -137,8 +213,13 @@ sub loadPeptides {
137213
$content = $name = "";
138214
foreach my $a (@entireFile) {
139215
if ($a =~ /^>/) {
216+
print $a."\n";
140217
$pep_hash{$name} = $content if ($name ne "");
141-
($name) = $a =~ /^>(\w+) /;
218+
if ( $this->{GRCh} == 38 ) {
219+
($name) = $a =~ /^>(\w+\.\d+) /;
220+
} else {
221+
($name) = $a =~ /^>(\w+) /;
222+
}
142223
$content = "";
143224
} else { chomp($a); $content .= $a; }
144225
}
@@ -228,6 +309,9 @@ Usage: hotspot3d trans [options]
228309
229310
OPTIONAL
230311
--blat Installation of blat to use (defaults to your system default)
312+
--grch Genome build (37 or 38), defaults to 38 or according to --release input
313+
--release Ensembl release verion (55-87), defaults to 87 or to the latest release according to --grch input
314+
Note that releases 55-75 correspond to GRCh37 & 76-87 correspond to GRCh38
231315
232316
--help this message
233317

0 commit comments

Comments
 (0)