11package TGI::Mutpro::Preprocess::Trans ;
22#
33# ----------------------------------
4- # $Authors: Beifang Niu
4+ # $Authors: Beifang Niu & Adam D Scott
55# $Date: 2014-01-14 14:34:50 -0500 (Tue Jan 14 14:34:50 CST 2014) $
6- # $Revision: $
6+ # $Revision: 2 $
77# $URL: $
88# $Doc: $ transcripts processing and added them in table
99# ----------------------------------
@@ -21,42 +21,38 @@ use File::Temp qw/ tempfile /;
2121
2222use TGI::Mutpro::Preprocess::Uniprot;
2323
24+ my $LATESTGRCH = 38;
25+ my $LATEST38RELEASE = 87; # TODO will need to be updated as Ensembl has new releases
26+ my $EARLIEST38RELEASE = 76;
27+ my $LATEST37RELEASE = 75;
28+ my $EARLIEST37RELEASE = 55;
29+
2430sub new {
2531 my $class = shift ;
2632 my $this = {};
2733 $this -> {_OUTPUT_DIR } = getcwd;
2834 $this -> {_STAT } = undef ;
2935 $this -> {_BLAT } = " blat" ;
36+ $this -> {GRCh } = undef ;
37+ $this -> {release } = undef ;
3038 bless $this , $class ;
3139 $this -> process();
3240 return $this ;
3341}
3442
3543sub process {
3644 my $this = shift ;
37- my ( $help , $options );
38- unless ( @ARGV ) { die $this -> help_text(); };
39- $options = GetOptions (
40- ' output-dir=s' => \$this -> {_OUTPUT_DIR },
41- ' blat=s' => \$this -> {_BLAT },
42- ' help' => \$help ,
43- );
44- if ( $help ) { print STDERR help_text(); exit 0; };
45- unless ( $options ) { die $this -> help_text(); };
46- unless ( $this -> {_OUTPUT_DIR } ) { warn ' You must provide a output directory ! ' , " \n " ; die $this -> help_text(); };
47- unless ( -e $this -> {_OUTPUT_DIR } ) { warn ' HotSpot3D Trans Error: output directory does not exist ! ' , " \n " ; die $this -> help_text(); };
48- unless ( system ( " which $this ->{_BLAT}" ) == 0 ) { warn ' HotSpot3D Trans Error: blat not found - $this->{_BLAT} ! ' , " \n " ; die $this -> help_text(); };
49- # ### processing ####
45+ $this -> setOptions();
5046 # add transcript annotation for uniprot
5147 my ( $peptidesDir , $UniprotIdFile , $peptidesFile , $outputFile , );
5248 $peptidesDir = " $this ->{_OUTPUT_DIR}\/ humanPeptides" ;
5349 $UniprotIdFile = " $this ->{_OUTPUT_DIR}\/ hugo.uniprot.pdb.csv" ;
54- $peptidesFile = " $peptidesDir \/ Homo_sapiens.GRCh37.74 .pep.all.fa" ;
50+ $peptidesFile = " $peptidesDir \/ Homo_sapiens.GRCh " . $this -> { GRCh }. " . " . $this -> { release }. " .pep.all.fa" ;
5551 $outputFile = " $this ->{_OUTPUT_DIR}\/ hugo.uniprot.pdb.transcript.csv" ;
5652 unless ( -e $peptidesDir ) { mkdir ( $peptidesDir ) || die " can not make peptides directory !\n " ; };
5753 # # get peptide file
58- my $url = ' ftp://ftp.ensembl.org/pub/release-74/fasta/homo_sapiens/pep/Homo_sapiens.GRCh37.74.pep.all.fa.gz ' ;
59- my $downloadFile = " $peptidesDir \/ Homo_sapiens.GRCh37.74.pep.all.fa .gz" ;
54+ my $url = $this -> makeEnsemblFastaURL() ;
55+ my $downloadFile = $peptidesFile . " .gz" ;
6056 if ( not -e $downloadFile ) {
6157 getstore( $url , $downloadFile );
6258 }
@@ -83,32 +79,32 @@ sub process {
8379 # print ">uniprotseq\n";
8480 # print $uniprotSequence."\n";
8581
86- my $transProtein = $uniprotRef -> transProteinHash();
87- next unless ( defined $transProtein );
82+ my $enst2ensp = $uniprotRef -> transProteinHash(); # non-versioned ENST ids
83+ next unless ( defined $enst2ensp );
8884 my $transcriptContent = " " ;
89- foreach my $transcript ( keys %{$transProtein } ) {
90- # print ">$transcript \n";
91- my $proteinid = $$transProtein { $transcript };
92- my $proteinsequence = $$peptideSeqsRef { $proteinid };
93- # print "$proteinsequence \n";
94- next unless (defined $proteinsequence );
95- if ( $uniprotSequence eq $proteinsequence ) {
96- $transcriptContent .= $transcript ." [1|1-" .length ($proteinsequence )." |" .length ($proteinsequence )." ]," ;
85+ foreach my $enst ( keys %{$enst2ensp } ) { # non-versioned ENST id
86+ # print ">$enst \n";
87+ my $ensp = $enst2ensp -> { $enst };
88+ my $proteinSequence = $this -> getVersionedSequence( $peptideSeqsRef , $ensp ); #
89+ # print "$proteinSequence \n";
90+ next unless (defined $proteinSequence );
91+ if ( $uniprotSequence eq $proteinSequence ) {
92+ $transcriptContent .= $enst ." [1|1-" .length ($proteinSequence )." |" .length ($proteinSequence )." ]," ;
9793 } else {
9894 my ( undef , $tmp_uniprot_seq_file ) = tempfile();
9995 my $tmp_uniprot_fh = IO::File-> new( $tmp_uniprot_seq_file , " >" ) or die " Temporary file could not be created. $! " ;
10096 $tmp_uniprot_fh -> print ( " >$uniprotId \n $uniprotSequence \n " );
10197 my ( undef , $tmp_transcript_protein_seq_file ) = tempfile();
10298 my $tmp_transcript_fh = IO::File-> new( $tmp_transcript_protein_seq_file , " >" ) or die " Temporary file could not be created. $! " ;
103- $tmp_transcript_fh -> print ( " >$transcript \n $proteinsequence \n " );
99+ $tmp_transcript_fh -> print ( " >$enst \n $proteinSequence \n " );
104100 $tmp_uniprot_fh -> close ; $tmp_transcript_fh -> close ;
105101 my ( undef , $tmp_blat_output_file ) = tempfile();
106102 my $blat = " blat" ;
107103 system ( " $this ->{_BLAT} $tmp_uniprot_seq_file $tmp_transcript_protein_seq_file -t=prot -q=prot -out=blast $tmp_blat_output_file " );
108104 # parse blat output
109105 my $tmp_parse_cont = " " ;
110106 map { $tmp_parse_cont .= $_ ." :" ; } @{$this -> parse_blat_output( $tmp_blat_output_file , $uniprotId , 0.90 )};
111- if ( $tmp_parse_cont ne " " ) { chop ( $tmp_parse_cont ); $transcriptContent .= $transcript ." [" .$tmp_parse_cont ." ]," ; };
107+ if ( $tmp_parse_cont ne " " ) { chop ( $tmp_parse_cont ); $transcriptContent .= $enst ." [" .$tmp_parse_cont ." ]," ; };
112108 # clean files
113109 unlink $tmp_uniprot_seq_file ; unlink $tmp_transcript_protein_seq_file ; unlink $tmp_blat_output_file ;
114110
@@ -126,6 +122,86 @@ sub process {
126122 $fhout -> close ();
127123}
128124
125+ sub setOptions {
126+ my ( $this ) = @_ ;
127+ my ( $help , $options );
128+ unless ( @ARGV ) { die $this -> help_text(); };
129+ $options = GetOptions (
130+ ' output-dir=s' => \$this -> {_OUTPUT_DIR },
131+ ' blat=s' => \$this -> {_BLAT },
132+ ' grch=f' => \$this -> {GRCh },
133+ ' release=f' => \$this -> {release },
134+ ' help' => \$help ,
135+ );
136+ if ( $help ) { print STDERR help_text(); exit 0; };
137+ unless ( $options ) { die $this -> help_text(); };
138+ unless ( $this -> {_OUTPUT_DIR } ) { warn ' You must provide a output directory ! ' , " \n " ; die $this -> help_text(); };
139+ unless ( -e $this -> {_OUTPUT_DIR } ) { warn ' HotSpot3D Trans Error: output directory does not exist ! ' , " \n " ; die $this -> help_text(); };
140+ if ( $this -> checkBLAT() ) { warn ' HotSpot3D Trans Error: blat not found - $this->{_BLAT} ! ' , " \n " ; die $this -> help_text(); }
141+ $this -> setEnsemblVersions();
142+ return ;
143+ }
144+
145+ sub checkBLAT {
146+ my ( $this ) = @_ ;
147+ return system ( " which $this ->{_BLAT}" );
148+ }
149+
150+ sub setEnsemblVersions {
151+ my ( $this ) = @_ ;
152+ if ( not defined $this -> {GRCh } and not defined $this -> {release } ) {
153+ warn " Ensembl GRCh & release versions not specified, defaulting to latest: GRCh" .$LATESTGRCH ." ." .$LATEST38RELEASE ." \n " ;
154+ $this -> {GRCh } = $LATESTGRCH ;
155+ $this -> {release } = $LATEST38RELEASE ;
156+ } elsif ( not defined $this -> {GRCh } ) {
157+ if ( $this -> {release } >= $EARLIEST38RELEASE and $this -> {release } <= $LATEST38RELEASE ) {
158+ $this -> {GRCh } = 38;
159+ } elsif ( $this -> {release } <= $LATEST37RELEASE and $this -> {release } >= $EARLIEST37RELEASE ) {
160+ $this -> {GRCh } = 37;
161+ } elsif ( $this -> {release } <= 54 ) {
162+ warn " Ensembl releases <= 54 not supported\n " ;
163+ die $this -> help_text();
164+ }
165+ warn " Ensembl GRCh version not specified. User specified release " .$this -> {release }." , so using GRCh" .$this -> {GRCh }." \n " ;
166+ } elsif ( not defined $this -> {release } ) {
167+ if ( $this -> {GRCh } == 38 ) {
168+ warn " Ensembl release versions not specified, defaulting to latest: GRCh" .$this -> {GRCh }." ." .$LATEST38RELEASE ." \n " ;
169+ $this -> {release } = $LATEST38RELEASE ;
170+ } elsif ( $this -> {GRCh } == 37 ) {
171+ warn " Ensembl release versions not specified, defaulting to latest: GRCh" .$this -> {GRCh }." ." .$LATEST37RELEASE ." \n " ;
172+ $this -> {release } = 75;
173+ } else {
174+ warn " Unrecognized GRCh version (user set " .$this -> {GRCh }." , not 37 or 38)\n " ;
175+ die $this -> help_text();
176+ }
177+ } else {
178+ }
179+ }
180+ # ### processing ####
181+ sub getVersionedSequence {
182+ my ( $this , $peptideSeqs , $ensp ) = @_ ;
183+ foreach my $id ( keys %{$peptideSeqs } ) {
184+ if ( $id =~ / $ensp / ) {
185+ return $peptideSeqs -> {$id };
186+ }
187+ }
188+ return undef ;
189+ }
190+
191+ sub makeEnsemblFastaURL {
192+ my ( $this ) = @_ ;
193+ my $url = " ftp://ftp.ensembl.org/pub/release-" ;
194+ $url .= $this -> {release };
195+ $url .= " /fasta/homo_sapiens/pep/Homo_sapiens.GRCh" ;
196+ $url .= $this -> {GRCh };
197+ if ( $this -> {GRCh } == 37 ) {
198+ $url .= " ." ;
199+ $url .= $this -> {release };
200+ }
201+ $url .= " .pep.all.fa.gz" ;
202+ return $url ;
203+ }
204+
129205# loading peptides
130206sub loadPeptides {
131207 my ( $this , $pepfile , ) = @_ ;
@@ -137,8 +213,13 @@ sub loadPeptides {
137213 $content = $name = " " ;
138214 foreach my $a (@entireFile ) {
139215 if ($a =~ / ^>/ ) {
216+ print $a ." \n " ;
140217 $pep_hash {$name } = $content if ($name ne " " );
141- ($name ) = $a =~ / ^>(\w +) / ;
218+ if ( $this -> {GRCh } == 38 ) {
219+ ($name ) = $a =~ / ^>(\w +\.\d +) / ;
220+ } else {
221+ ($name ) = $a =~ / ^>(\w +) / ;
222+ }
142223 $content = " " ;
143224 } else { chomp ($a ); $content .= $a ; }
144225 }
@@ -228,6 +309,9 @@ Usage: hotspot3d trans [options]
228309
229310 OPTIONAL
230311--blat Installation of blat to use (defaults to your system default)
312+ --grch Genome build (37 or 38), defaults to 38 or according to --release input
313+ --release Ensembl release verion (55-87), defaults to 87 or to the latest release according to --grch input
314+ Note that releases 55-75 correspond to GRCh37 & 76-87 correspond to GRCh38
231315
232316--help this message
233317
0 commit comments