#!/usr/bin/perl -w
#
#   This software was developed by Marcelo Reis, Joao C. Setubal,
#   David Haake and James Matsunaga
#
#   It should not be redistributed or used for any commercial purpose
#   without written permission from Joao C. Setubal setubal@vbi.vt.edu
#
#   release date September 2005
#
# This software is experimental in nature and is
# supplied "AS IS", without obligation by the authors to provide
# accompanying services or support.  The entire risk as to the quality
# and performance of the Software is with you. The authors
# EXPRESSLY DISCLAIM ANY AND ALL WARRANTIES REGARDING THE SOFTWARE,
# WHETHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO WARRANTIES
# PERTAINING TO MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
#
# ====================================================================
# find_orf_score.pl
# ====================================================================
# reads a matrix and a fasta files, and finds the score for the 
# the peptide signal
# 
# ====================================================================
# Author: Reis, Marcelo S.    Oct/09/2002 
# ====================================================================

# force variable declaration
#
use strict;
use vars qw($id_size);

# immediate output flush
#
$| = 1; 

###                              
#### initializing the files ####
                              
# test number of parameters
#
if (@ARGV != 3){
    print STDERR "\nCalculates the lipoprotein score, writing it into a file.\n\n";
    print STDERR "Usage: $0";
    print STDERR " <weight-matrix file>";
    print STDERR " <multifasta file>";
    print STDERR " <output file>\n\n";
    exit 1;
}

my $matrix_file = $ARGV[0];
my $fasta_file = $ARGV[1];
my $output_file  = $ARGV[2];

my $problems = 0;

if (! -e "$fasta_file") {
  print STDERR "ERROR : File $fasta_file does not exist\n";
  $problems++;
}


# if we have a problem, exit:
#
if ($problems) {
  exit 1;
}

if (!open(FASTA, "$fasta_file")) {
  die "Problems when opening $fasta_file: $!\n";
}


if (!open(MATRIX, "$matrix_file")) {
  die "Problems when opening $matrix_file: $!\n";
}

if (!open(OUT, ">$output_file")) {
  die "Problems when opening $output_file: $!\n";
}

###
#### loading fastas and list in the memory ####

print "\nloading files into memory... ";


my $cont = 0;

my @buf_fasta;
my @cleavage;
my @gene_id;
my @gene_size;
my @product;

$/ = "\n";

while (<FASTA>){

  if (($_ =~ /gi\|(\d+)\|\S+\s+(.*)/)||       # NCBI fasta's header
      ($_ =~ /(\d+)\.\d+\s+\|\s+(.*)/)||      # Cop fasta's header
      ($_ =~ /(\S+\_\S+)\s+(.*)/)||           # Swiss Prot fasta's header
      ($_ =~ /(.*)/)){   		                # other header
      
    $gene_id[$cont] = $1;

    $gene_id[$cont] =~ s/\>//g;

    $product[$cont] = $2;
    
    $/ = ">";
    
    $_ = <FASTA>;
    
    $buf_fasta[$cont] = $_;
    
    $buf_fasta[$cont] =~ s/\>//g;
    $buf_fasta[$cont] =~ s/\n//g;
    
    $gene_size[$cont] = length $buf_fasta[$cont];
    
    $/ = "\n";

    $cleavage[$cont] = -1;
    
    $cont++;
    
  }
    
} # end while(<FASTA>)

my %matrix_1;
my %matrix_2;
my %matrix_3;
my %matrix_4;
my %matrix_5;

while (<MATRIX>){

    if ($_ =~ /(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)/){

	$matrix_1{$1} = $6;
	$matrix_2{$1} = $5;
	$matrix_3{$1} = $4;
	$matrix_4{$1} = $3;
	$matrix_5{$1} = $2;

    }
    
} # end while(<MATRIX>)

$matrix_1{'X'} = -10;
$matrix_2{'X'} = -10;
$matrix_3{'X'} = -10;
$matrix_4{'X'} = -10;
$matrix_5{'X'} = -10;


print "[done]";


# Kyte-Doolittle Hydrophobicity Matrix:

$matrix_5{'R'}=  -4.5;  
$matrix_5{'S'}=  -0.8;   
$matrix_5{'K'}=  -3.9;  
$matrix_5{'T'}=  -0.7;  
$matrix_5{'N'}=  -3.5;  
$matrix_5{'G'}=  -0.4;  
$matrix_5{'D'}=  -3.5;  
$matrix_5{'A'}=  1.8;  
$matrix_5{'Q'}=  -3.5;  
$matrix_5{'M'}=  1.9;  
$matrix_5{'E'}=  -3.5;  
$matrix_5{'C'}=  2.5;  
$matrix_5{'H'}=  -3.2;  
$matrix_5{'F'}=  2.8;  
$matrix_5{'P'}=  -1.6;  
$matrix_5{'L'}=  3.8;  
$matrix_5{'Y'}=  -1.3;  
$matrix_5{'V'}=  4.2;
$matrix_5{'W'}=  -0.9;  
$matrix_5{'I'}=  4.5;  


###
#### calculating the score with the weight-matrix ####

print "\ncalculating scores... ";

my $i = 0;
my $aa_code = "FSYC*WLPHQIMTNKRVADEG";
my $left = 0;
my $right = 0;
my $cont_bad = 0;
my $cont_good = 0;
my $cont_err = 0;
my $sair = 0;
my $actual_cleavage;
my @score;

my $best_score_hydro;
my $type;
my $type_temp;
my $cys;
my $score_lipo;
my $best_score_lipo;
my $score_n;


my $sec_score_hydro;
my $sec_type;
my $sec_cys;
my $sec_score_lipo;
my $sec_score_n;

my $hydro_cont = 0;
my $sec_hydro_cont = 0;


for ($i = 0; $i < $cont; $i++){

    ### finds the FIRST Cys between 
    ## 14th and 35th aa that satisfies the -3/-4 rules  ###
    # also verifies if there are at least 50aa after the chosen Cys #
    
    $best_score_hydro = -100;
    $actual_cleavage = -100;
    
    $best_score_lipo = -100;
    
    $sec_score_hydro = -100;
    $sec_score_lipo = -100;
    $sec_type = 0;
    $sec_cys = -1;
    $sec_hydro_cont = 0;
    $sec_score_n = 0;

    $score_n = 0;

    $type = 0;
    
    my $max = 35;
    if ($gene_size[$i] < $max){
	   $max = $gene_size[$i];
    }

    for($cys = 13; $cys <= $max; $cys++){
	
		if ((substr($buf_fasta[$i], $cys, 1) eq 'C')){ 
		    
		    my $hydro_temp = 5;
		    my $forbidden = 0;
		    my $score_hydro = 0;
	    
		    ## finds longest H-region and it's score ##
	    
		    while(($hydro_temp < $cys)&&($forbidden == 0)){
		
				if ((substr($buf_fasta[$i],($cys - $hydro_temp),1) eq 'R') ||
				    (substr($buf_fasta[$i],($cys - $hydro_temp),1) eq 'D') ||
				    (substr($buf_fasta[$i],($cys - $hydro_temp),1) eq 'H') ||
				    (substr($buf_fasta[$i],($cys - $hydro_temp),1) eq 'E') ||
				    (substr($buf_fasta[$i],($cys - $hydro_temp),1) eq 'K')) {
		    
			    $forbidden = 1;
		    
				}
			
				else{
	      
				    $score_hydro = $score_hydro + $matrix_5{substr($buf_fasta[$i],($cys - $hydro_temp),1)};
		    
				    $hydro_temp++;
		    
				}
	
         } # end while(($hydro_temp...
	
         my $tam_n = $cys - $hydro_temp + 1;

		   $score_n = 0;

		   while($tam_n >= 1){

           if ((substr($buf_fasta[$i],$tam_n,1) eq 'E') ||
               (substr($buf_fasta[$i],$tam_n,1) eq 'D')) {

             $score_n--;

           }

           if ((substr($buf_fasta[$i],$tam_n,1) eq 'R') ||
	            (substr($buf_fasta[$i],$tam_n,1) eq 'H') ||
               (substr($buf_fasta[$i],$tam_n,1) eq 'K')) {
              $score_n++;
           }

      	  $tam_n--;

	     } # end while($tam_n...
	    
	    $score_lipo = $matrix_1{substr($buf_fasta[$i],($cys - 1),1)} +
		 $matrix_2{substr($buf_fasta[$i],($cys - 2),1)} +
		 $matrix_3{substr($buf_fasta[$i],($cys - 3),1)} +
		 $matrix_4{substr($buf_fasta[$i],($cys - 4),1)};
	    	    
	    # checks for L,I,V,F in -3, -4 region  / for H-region greater than 6 aas (probable):
	    
	    if ( ((substr($buf_fasta[$i],($cys - 3),1) eq 'L')     ||
		 	 	(substr($buf_fasta[$i],($cys - 3),1) eq 'I')     ||
		 	 	(substr($buf_fasta[$i],($cys - 3),1) eq 'V')     ||
			  	(substr($buf_fasta[$i],($cys - 3),1) eq 'F')     ||
		  		(substr($buf_fasta[$i],($cys - 3),1) eq 'M')     ||
		  		(substr($buf_fasta[$i],($cys - 4),1) eq 'M')     ||
            (substr($buf_fasta[$i],($cys - 4),1) eq 'L')     ||
		  		(substr($buf_fasta[$i],($cys - 4),1) eq 'I')     ||
		  		(substr($buf_fasta[$i],($cys - 4),1) eq 'V')     ||
		  		(substr($buf_fasta[$i],($cys - 4),1) eq 'F'))    
		 
		 		&&
		 
		 		(! ((substr($buf_fasta[$i],($cys - 11),1) eq 'K')  ||
		     (substr($buf_fasta[$i],($cys - 11),1) eq 'R')  ||
		     (substr($buf_fasta[$i],($cys - 11),1) eq 'D')  ||
		     (substr($buf_fasta[$i],($cys - 11),1) eq 'E')  ||
		     (substr($buf_fasta[$i],($cys - 11),1) eq 'H')) 
		  		)
		 
		 		&&
		 
		 		((substr($buf_fasta[$i],($cys - 1),1) eq 'C') ||
		  		(substr($buf_fasta[$i],($cys - 1),1) eq 'N') ||
		  		(substr($buf_fasta[$i],($cys - 1),1) eq 'S') ||
		  		(substr($buf_fasta[$i],($cys - 1),1) eq 'A') ||
		  		(substr($buf_fasta[$i],($cys - 1),1) eq 'G') 
		  		)
       		 
            &&

           ($score_n > 0)

		 		&&

		 		($hydro_temp > 11)

		 		) {	        
		
			$type_temp = 1;
		
	    } # end if
	    
		# checks for -3/-4/-5 rules (possible)
	    
	    elsif (((substr($buf_fasta[$i],($cys - 3),1) eq 'L')  ||
		    (substr($buf_fasta[$i],($cys - 3),1) eq 'I')  ||
		    (substr($buf_fasta[$i],($cys - 3),1) eq 'V')  ||
		    (substr($buf_fasta[$i],($cys - 3),1) eq 'F')  ||
		    (substr($buf_fasta[$i],($cys - 3),1) eq 'Y')  ||
		    (substr($buf_fasta[$i],($cys - 4),1) eq 'L')  ||
		    (substr($buf_fasta[$i],($cys - 4),1) eq 'I')  ||
		    (substr($buf_fasta[$i],($cys - 4),1) eq 'V')  ||
		    (substr($buf_fasta[$i],($cys - 4),1) eq 'F')  ||
		    (substr($buf_fasta[$i],($cys - 4),1) eq 'Y')  ||
		    (substr($buf_fasta[$i],($cys - 5),1) eq 'Y')  ||
		    (substr($buf_fasta[$i],($cys - 5),1) eq 'L')  ||
		    (substr($buf_fasta[$i],($cys - 5),1) eq 'I')  ||
          (substr($buf_fasta[$i],($cys - 3),1) eq 'M')  ||
          (substr($buf_fasta[$i],($cys - 4),1) eq 'M')  ||
          (substr($buf_fasta[$i],($cys - 5),1) eq 'M')  ||
		    (substr($buf_fasta[$i],($cys - 5),1) eq 'V')  ||
		    (substr($buf_fasta[$i],($cys - 5),1) eq 'F')) 
		   
		   &&

                   ($score_n > 0)

		   &&

		   ($hydro_temp >= 11)

		   ){	        
		
		$type_temp = 2;
		
	    }
	    
	    else{

		$type_temp = 0;

	    }

	    # keeps a second lipobox
	    
	    if (($score_lipo > $best_score_lipo) || (($type == 0)&&($type_temp != 0)) ){
		
		# if there's a lipo already, keeps the current one as secondary
		
		if ($actual_cleavage != -100){
		    
		    $sec_type = $type;
		    $sec_score_hydro = $best_score_hydro;
		    $sec_cys = $actual_cleavage;
		    $sec_score_lipo = $best_score_lipo;
		    $sec_hydro_cont = $hydro_cont;
		    $sec_score_n = $score_n;		    

		}
		
		$best_score_lipo = $score_lipo;
		$actual_cleavage = $cys;
		$type = $type_temp;
		
		$best_score_hydro = $score_hydro;
		
		$hydro_cont = $hydro_temp;
	    
	    }
	    
	}


    }


    if ($best_score_lipo < 0){

  		$type = 0;
 
    }
 
 	 my $type_string = "NOT LIPOPROTEIN";
	 if($type == 1){
		$type_string = "PROBABLE LIPOPROTEIN";	 
	 }
	 elsif($type == 2){
		$type_string = "POSSIBLE LIPOPROTEIN";	 
	 }	 
    
    printf OUT ">Results for ORF %s: %s\nBEST HIT:\n\tcleavage site = %d",
		         $gene_id[$i], $type_string, $actual_cleavage;
	 printf OUT "\n\tscore H-Region = %f\n\tscore Lipobox = %f\n\tsize of H-Region = %d",
	            $best_score_hydro, $best_score_lipo, ($hydro_cont - 5);
	 printf OUT "\n\tscore N-Region = %d\nSECONDARY HIT:\n\tcleavage site = %d\n\t",
					$score_n, $sec_cys;  
	 printf OUT "score H-Region = %f\n\tscore Lipobox = %f\n\tsize of H-Region = %d",
			   	$sec_score_hydro, $sec_score_lipo, ($sec_hydro_cont - 5);
	 printf OUT "\n\tscore N-Region = %d\n\n",
               $sec_score_n;
    
}

###
#### writing output and closing files ####

print "[done]\nwriting output file... ";

close(FASTA);
close(MATRIX);
close(OUT);

print "[done]";
print "\n\nScores file creation was successful ($output_file)\n\n";

exit 0;
