@@ -84,6 +84,10 @@ sub new {
8484 $this -> {' subunit_dependence' } = undef ;
8585# TODO add meric based on protein or gene id, if protein, need hugo.uniprot.pdb.transcript.csv file
8686 $this -> {' meric_type' } = undef ;
87+
88+ $this -> {' gene_list_file' } = undef ;
89+ $this -> {' structure_list_file' } = undef ;
90+ $this -> {' listOption' } = undef ;
8791
8892 $this -> {' processed' } = undef ;
8993 $this -> {' distance_matrix' } = undef ;
@@ -175,6 +179,12 @@ sub setOptions {
175179 ' parallel=s' => \$this -> {' parallel' },
176180 ' max-processes=i' => \$this -> {' max_processes' },
177181 ' meric-type=s' => \$this -> {' meric_type' },
182+ ' gene-list-file=s' => \$this -> {' gene_list_file' },
183+ ' structure-list-file=s' => \$this -> {' structure_list_file' },
184+ ' epsilon=f' => \$this -> {' Epsilon' },
185+ ' minPts=f' => \$this -> {' MinPts' },
186+ ' number-of-runs=f' => \$this -> {' number_of_runs' },
187+ ' probability-cut-off=f' => \$this -> {' probability_cut_off' },
178188
179189 ' help' => \$help ,
180190 );
@@ -276,6 +286,25 @@ sub setOptions {
276286 if ( not exists $tempMericHash -> {$this -> {' meric_type' }} ) {
277287 die " Error: meric-type should be one of the following: intra, monomer, homomer, inter, heteromer, multimer, unspecified\n " ;
278288 }
289+
290+ # #### START gene-list and structure-list options
291+ if ( defined $this -> {' gene_list_file' } ) {
292+ if ( not -e $this -> {' gene_list_file' } ) {
293+ warn " The input gene list file (" .$this -> {' gene_list_file' }." ) does not exist! " , " \n " ;
294+ die $this -> help_text();
295+ }
296+ }
297+
298+ if ( defined $this -> {' structure_list_file' } ) {
299+ if ( not -e $this -> {' structure_list_file' } ) {
300+ warn " The input structure list file (" .$this -> {' structure_list_file' }." ) does not exist! " , " \n " ;
301+ die $this -> help_text();
302+ }
303+ }
304+
305+ $this -> initializeLists();
306+ # #### END
307+
279308 print STDOUT " =====Parameters=====\n " ;
280309 print STDOUT " p-value-cutoff = " .$this -> {' p_value_cutoff' }." \n " ;
281310 print STDOUT " 3d-distance-cutoff = " .$this -> {' 3d_distance_cutoff' }." \n " ;
@@ -294,6 +323,46 @@ sub setOptions {
294323 return ;
295324}
296325
326+ sub initializeLists {
327+ my $this = shift ;
328+ # print "at initializeLists\n";
329+
330+ if ( defined $this -> {' gene_list_file' } ) {
331+ my $fh = new FileHandle;
332+ unless ( $fh -> open ( $this -> {' gene_list_file' } , " r" ) ) { die " Could not open gene list file $! \n " };
333+ while ( my $gene = <$fh > ) {
334+ chomp ( $gene );
335+ $this -> {' providedGenes' }-> {$gene } = 0; # hash with provided gene names as keys, will be used later in filterByProvidedLists
336+ }
337+ }
338+ if ( defined $this -> {' structure_list_file' } ) {
339+ my $fh = new FileHandle;
340+ unless ( $fh -> open ( $this -> {' structure_list_file' } , " r" ) ) { die " Could not open structures list file $! \n " };
341+ while ( my $structure = <$fh > ) {
342+ chomp ( $structure );
343+ $this -> {' providedStructures' }-> {$structure } = 0; # hash with provided structure names as keys, will be used later in filterByProvidedLists
344+ }
345+ }
346+ # #
347+ if ( (defined $this -> {' gene_list_file' }) && (not defined $this -> {' structure_list_file' }) ) {
348+ $this -> {' listOption' } = " GeneListOnly" ;
349+ # print "gene list only\n";
350+ }
351+ if ( (not defined $this -> {' gene_list_file' }) && (defined $this -> {' structure_list_file' }) ) {
352+ $this -> {' listOption' } = " StructureListOnly" ;
353+ warn " HotSpot3D::Cluster::setOptions warning: only the mutations present in the given structures are being considered\n " ;
354+ # print "structure list only\n";
355+ }
356+ if ( (defined $this -> {' gene_list_file' }) && (defined $this -> {' structure_list_file' }) ) {
357+ $this -> {' listOption' } = " BothLists" ;
358+ # print "both lists\n";
359+ }
360+ if ( (not defined $this -> {' gene_list_file' }) && (not defined $this -> {' structure_list_file' }) ) {
361+ $this -> {' listOption' } = " None" ;
362+ # print "no lists\n";
363+ }
364+ }
365+
297366sub launchClustering {
298367 my $this = shift ; # removed my ( $this, $help ) = @_;
299368 if ( $this -> {' clustering' } eq $NETWORK ) {
@@ -449,29 +518,35 @@ sub readPairwise { # shared
449518 $gene2 , $chromosome2 , $start2 , $stop2 , $aa_2 , $chain2 , $loc_2 , $domain2 , $cosmic2 ,
450519 $linearDistance , $infos ) = split /\t/ , $_ ;
451520
452- if ( $this -> checkMeric( $gene1 , $gene2 , $chain1 , $chain2 ) ) {
453- # print $_."\n";
454- $chain1 =~ s /\[ (\w )\] / $1 / g ;
455- $chain2 =~ s /\[ (\w )\] / $1 / g ;
456- my $proteinMutation = TGI::ProteinVariant-> new();
457- my $mutation1 = TGI::Variant-> new();
458- $mutation1 -> gene( $gene1 );
459- $mutation1 -> chromosome( $chromosome1 );
460- $mutation1 -> start( $start1 );
461- $mutation1 -> stop( $stop1 );
462- $proteinMutation -> aminoAcidChange( $aa_1 );
463- $mutation1 -> addProteinVariant( $proteinMutation );
464-
465- my $mutation2 = TGI::Variant-> new();
466- $mutation2 -> gene( $gene2 );
467- $mutation2 -> chromosome( $chromosome2 );
468- $mutation2 -> start( $start2 );
469- $mutation2 -> stop( $stop2 );
470- $proteinMutation -> aminoAcidChange( $aa_2 );
471- $mutation2 -> addProteinVariant( $proteinMutation );
472- # print "pair structures ".join( " " , ( $infos , $chain1 , $chain2 ) )."\n";
473- $this -> setDistance( $distance_matrix , $mutation1 , $mutation2 ,
474- $chain1 , $chain2 , $infos , $pdbCount );
521+ if ( $this -> checkByProvidedLists( $gene1 , $gene2 , $infos ) ) { # check to filter out mutation pairs according to a given gene list or structures list or both
522+ if ( defined $this -> {' structure_list_file' } ) { # if true, get only the pdbIDs which are in this list
523+ $infos = $this -> filterInfosByGivenStructures( $infos );
524+ # print "new infos= $infos\n";
525+ }
526+ if ( $this -> checkMeric( $gene1 , $gene2 , $chain1 , $chain2 ) ) {
527+ # print $_."\n";
528+ $chain1 =~ s /\[ (\w )\] / $1 / g ;
529+ $chain2 =~ s /\[ (\w )\] / $1 / g ;
530+ my $proteinMutation = TGI::ProteinVariant-> new();
531+ my $mutation1 = TGI::Variant-> new();
532+ $mutation1 -> gene( $gene1 );
533+ $mutation1 -> chromosome( $chromosome1 );
534+ $mutation1 -> start( $start1 );
535+ $mutation1 -> stop( $stop1 );
536+ $proteinMutation -> aminoAcidChange( $aa_1 );
537+ $mutation1 -> addProteinVariant( $proteinMutation );
538+
539+ my $mutation2 = TGI::Variant-> new();
540+ $mutation2 -> gene( $gene2 );
541+ $mutation2 -> chromosome( $chromosome2 );
542+ $mutation2 -> start( $start2 );
543+ $mutation2 -> stop( $stop2 );
544+ $proteinMutation -> aminoAcidChange( $aa_2 );
545+ $mutation2 -> addProteinVariant( $proteinMutation );
546+ # print "pair structures ".join( " " , ( $infos , $chain1 , $chain2 ) )."\n";
547+ $this -> setDistance( $distance_matrix , $mutation1 , $mutation2 ,
548+ $chain1 , $chain2 , $infos , $pdbCount );
549+ }
475550 }
476551 } $fh -> getlines;
477552 $fh -> close ();
@@ -486,6 +561,53 @@ sub readPairwise { # shared
486561 return ;
487562}
488563
564+ sub checkByProvidedLists {
565+ my ( $this , $gene1 , $gene2 , $infos ) = @_ ;
566+ # print join( " " , ( $gene1 , $gene2 , $infos , "" ) );
567+ if ( $this -> {' listOption' } eq " None" ) { # no lists given, proceed as usual
568+ # print "no lists: process as usual\n";
569+ return 1;
570+
571+ } elsif ( $this -> {' listOption' } eq " GeneListOnly" ) {
572+ if ( (exists $this -> {' providedGenes' }-> {$gene1 }) && (exists $this -> {' providedGenes' }-> {$gene2 }) ) {
573+ # print "gene check okay\n";
574+ return 1;
575+ } else { return 0; }
576+
577+ } elsif ( $this -> {' listOption' } eq " StructureListOnly" ) {
578+ my $newInfos = $this -> filterInfosByGivenStructures( $infos );
579+ if ( defined $newInfos ) {
580+ # print "structure check okay\n";
581+ return 1;
582+ }else { return 0; }
583+
584+ } elsif ( $this -> {' listOption' } eq " BothLists" ) {
585+ my $newInfos = filterInfosByGivenStructures( $this , $infos );
586+ if ( (defined $newInfos ) && (exists $this -> {' providedGenes' }-> {$gene1 }) && (exists $this -> {' providedGenes' }-> {$gene2 }) ) {
587+ # print "both checks okay\n";
588+ return 1;
589+ } else { return 0; }
590+ }
591+ }
592+
593+ sub filterInfosByGivenStructures { # takes the infos column from pairwise file and filter out the structures that are not present in the structures list
594+ my ( $this , $infos ) = @_ ;
595+ my $newInfos = undef ;
596+ my @infos = split /\|/ , $infos ;
597+ foreach my $info ( @infos ) {
598+ chomp( $info );
599+ next if ( $info eq "" );
600+ my ( $distance , $pdbID , $pvalue ) = split / / , $info ;
601+ next if ( not exists $this -> {' providedStructures' }-> {$pdbID } );
602+ if ( defined $newInfos ) {
603+ $newInfos = join ( " \| " , $newInfos , $info );
604+ } else {
605+ $newInfos = $info ;
606+ }
607+ }
608+ return $newInfos ;
609+ }
610+
489611sub checkMeric {
490612 my ( $this , $gene1 , $gene2 , $chain1 , $chain2 ) = @_ ;
491613# print join( " " , ( $gene1 , $chain1 , $gene2 , $chain2 , "" ) );
0 commit comments