#!/usr/bin/perl -w
#
#   This software was developed by Marcelo Reis, Joao C. Setubal,
#   David Haake and James Matsunaga
#
#   It should not be redistributed or used for any commercial purpose
#   without written permission from Joao C. Setubal setubal@vbi.vt.edu
#
#   release date September 2005
#
# This software is experimental in nature and is
# supplied "AS IS", without obligation by the authors to provide
# accompanying services or support.  The entire risk as to the quality
# and performance of the Software is with you. The authors
# EXPRESSLY DISCLAIM ANY AND ALL WARRANTIES REGARDING THE SOFTWARE,
# WHETHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO WARRANTIES
# PERTAINING TO MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
#
#
# ====================================================================
# build_weight_matrix.pl
# ====================================================================
# reads a list and a fasta files, calculates the weight-matrix and
# writes it in a file.
# 
# ====================================================================
# Author: Reis, Marcelo S.    Oct/09/2002 
# ====================================================================
# ====================================================================

# force variable declaration
#
use strict;
use vars qw($id_size);

# immediate output flush
#
$| = 1; 


###                              
#### initializing the files ####
                              
# test number of parameters
#
if (@ARGV != 3){
    print STDERR "\nbuild_weight_matrix - calculates a lipobox's aminoacids weight-matrix, writing it in a file.\n\n";
    print STDERR "Usage: $0 ";
    print STDERR "<training set file> ";
    print STDERR "<multifasta file> ";
    print STDERR "<weight-matrix file>\n\n";
    exit 1;
}

my $fasta_file  = $ARGV[0];
my $orfs_file   = $ARGV[1];
my $matrix_file = $ARGV[2];
my $problems = 0;

if (! -e "$fasta_file") {
  print STDERR "ERROR : File $fasta_file does not exist\n";
  $problems++;
}

if (! -e "$orfs_file") {
  print STDERR "ERROR : File $orfs_file does not exist\n";
  $problems++;
}

# if we have a problem, exit:
#
if ($problems) {
  exit 1;
}

if (!open(FASTA, "$fasta_file")) {
  die "Problems when opening $fasta_file: $!\n";
}

if (!open(ORFS, "$orfs_file")) {
  die "Problems when opening $orfs_file: $!\n";
}

if (!open(MATRIX, ">$matrix_file")) {
  die "Problems when opening $matrix_file: $!\n";
}


###
#### loading fastas and list into the memory ####

print "\nloading files into memory... ";


$/ = undef;   
my $buf_orfs = <ORFS>;
$buf_orfs =~ s/>.*\n//g;
$buf_orfs =~  s/\n//g; 

my $cont = 0;
my @buf_fasta;
my @cleavage;
my @buf_id;

$/ = "\n";

while (<FASTA>){
    $_ =~ s/>//g;
    $_ =~ /(^\S+)\s+.*/;
    $buf_id[$cont] = $1;
    $/ = ">";
    $_ = <FASTA>;
    $_ =~ s/\n//g;
    $_ =~ s/>//g;
    $buf_fasta[$cont] = uc $_;
    $cleavage[$cont] = -1;
    $cont++;	
    $/ = "\n";
}

print "[done]";


###
#### calculating the weight-matrix ####

print "\ncalculating weight-matrix...\n";

# with orfs file we can find the "background frequence" 

my %back_freq;

my $i = 0;

my %fore_freq_1;
my %fore_freq_2;
my %fore_freq_3;
my %fore_freq_4;
my %fore_freq_5;

my %actual_freq_1;
my %actual_freq_2;
my %actual_freq_3;
my %actual_freq_4;
my %actual_freq_5;

my $aa_code = "FSYC*WLPHQIMTNKRVADEG";

my $left = 0;
my $right = 0;
my $sair = 0;

my $max;
my $actual_cleavage;
my $tam_hydrophobic;


for ($i=0; $i<21; $i++){
    
    $fore_freq_1{substr($aa_code,$i,1)} = 0;
    $fore_freq_2{substr($aa_code,$i,1)} = 0;
    $fore_freq_3{substr($aa_code,$i,1)} = 0;
    $fore_freq_4{substr($aa_code,$i,1)} = 0;
    $fore_freq_5{substr($aa_code,$i,1)} = 0;


    ## Q and T is considered "possible", so it has neutral weight (0) ##

    if ((substr($aa_code,$i,1) eq 'Q')||(substr($aa_code,$i,1) eq 'T')){
      $actual_freq_1{substr($aa_code,$i,1)} = 0;
    }
    else{
      $actual_freq_1{substr($aa_code,$i,1)} = -99;
    }

    $actual_freq_2{substr($aa_code,$i,1)} = -2;
    $actual_freq_3{substr($aa_code,$i,1)} = -2;
    $actual_freq_4{substr($aa_code,$i,1)} = -2;
    $actual_freq_5{substr($aa_code,$i,1)} = -2;

    $back_freq{substr($aa_code,$i,1)} = 0;
}


# with orfs file we can find the "background frequence" 

for ($i = 0; $i < length($buf_orfs); $i++){
    
    $back_freq{substr($buf_orfs,$i,1)}++;
    
}

# now we find the aa frequences from the lipo file

for ($i = 0; $i < $cont; $i++){
    
    $actual_cleavage = -100;
    
    $max = length($buf_fasta[$i]);
    
    for(my $cys = $max; $cys >= 14; $cys--){
	
	if (substr($buf_fasta[$i], $cys, 1) eq 'C'){
	    
	    # now checks for the rules
	    
	    my $ok = 1;
	    
	    for (my $hydro_cont = 5; $hydro_cont <= 10; $hydro_cont++){
		
		if ((substr($buf_fasta[$i],($cys - $hydro_cont),1) eq 'R') ||
		    (substr($buf_fasta[$i],($cys - $hydro_cont),1) eq 'D') ||
		    (substr($buf_fasta[$i],($cys - $hydro_cont),1) eq 'H') ||
		    (substr($buf_fasta[$i],($cys - $hydro_cont),1) eq 'E') ||
		    (substr($buf_fasta[$i],($cys - $hydro_cont),1) eq 'K')) {
		    
		    $ok = 0;
		    
		}
		
	    }
	    
	    
	    if ($ok){
		
		if (
		    ((substr($buf_fasta[$i],($cys - 3),1) eq 'L')  ||
		     (substr($buf_fasta[$i],($cys - 3),1) eq 'I')  ||
		     (substr($buf_fasta[$i],($cys - 3),1) eq 'V')  ||
		     (substr($buf_fasta[$i],($cys - 3),1) eq 'F')  ||
		     (substr($buf_fasta[$i],($cys - 4),1) eq 'L')  ||
		     (substr($buf_fasta[$i],($cys - 4),1) eq 'I')  ||
		     (substr($buf_fasta[$i],($cys - 4),1) eq 'V')  ||
		     (substr($buf_fasta[$i],($cys - 4),1) eq 'F'))  &&
		    ((substr($buf_fasta[$i],($cys - 1),1) eq 'A') ||
		     (substr($buf_fasta[$i],($cys - 1),1) eq 'S') ||
		     (substr($buf_fasta[$i],($cys - 1),1) eq 'G') ||
		     (substr($buf_fasta[$i],($cys - 1),1) eq 'N') ||
		     (substr($buf_fasta[$i],($cys - 1),1) eq 'C'))
		    ){ 
		    
		    $actual_cleavage = $cys;
		    
		}
		
	    }
	    
	}
	
    }
    
    print "\nORF $i:\tcleavage: $actual_cleavage; ";
    print "\tlipobox: " . substr($buf_fasta[$i],($actual_cleavage - 4), 5);
        
    # walk through the C-region of signal peptide, and take the aa freq values
    
    $fore_freq_1{substr($buf_fasta[$i],($actual_cleavage - 1),1)}++;
    $fore_freq_2{substr($buf_fasta[$i],($actual_cleavage - 2),1)}++;
    $fore_freq_3{substr($buf_fasta[$i],($actual_cleavage - 3),1)}++;
    $fore_freq_4{substr($buf_fasta[$i],($actual_cleavage - 4),1)}++;


}

# avoids division by zero
if($cont>1){
	$cont--;
}

# calculating the actual frequence :
#
# lg (fore_freq / back_freq)
#
# lg X = log X / log 2
#

for ($i=0; $i<21; $i++){
    
  $back_freq{substr($aa_code,$i,1)} = $back_freq{substr($aa_code,$i,1)} / length($buf_orfs);
  
  if ($fore_freq_1{substr($aa_code,$i,1)} > 0){ 
      $fore_freq_1{substr($aa_code,$i,1)} = $fore_freq_1{substr($aa_code,$i,1)} / $cont;
      $actual_freq_1{substr($aa_code,$i,1)} = log($fore_freq_1{substr($aa_code,$i,1)} / $back_freq{substr($aa_code,$i,1)}) / log(2);
      
  }
  
  if ($fore_freq_2{substr($aa_code,$i,1)} > 0){ 
      $fore_freq_2{substr($aa_code,$i,1)} = $fore_freq_2{substr($aa_code,$i,1)} / $cont;
      $actual_freq_2{substr($aa_code,$i,1)} = log($fore_freq_2{substr($aa_code,$i,1)} / $back_freq{substr($aa_code,$i,1)}) / log(2);
      
  }
  
  if ($fore_freq_3{substr($aa_code,$i,1)} > 0){ 
      $fore_freq_3{substr($aa_code,$i,1)} = $fore_freq_3{substr($aa_code,$i,1)} / $cont;
    $actual_freq_3{substr($aa_code,$i,1)} = log($fore_freq_3{substr($aa_code,$i,1)} / $back_freq{substr($aa_code,$i,1)}) / log(2);
    
  }
  
  if ($fore_freq_4{substr($aa_code,$i,1)} > 0){ 
    $fore_freq_4{substr($aa_code,$i,1)} = $fore_freq_4{substr($aa_code,$i,1)} / $cont;
    $actual_freq_4{substr($aa_code,$i,1)} = log($fore_freq_4{substr($aa_code,$i,1)} / $back_freq{substr($aa_code,$i,1)}) / log(2);
    
  }
  
  
}


print "\n\n[done]";

###
#### writing output and closing files ####

print "\n\nwriting output file... ";

print MATRIX "\nLipoprotein Weight Matrix\n\n";
print MATRIX "aa\t-5\t\-4\t\-3\t\-2\t\-1\n";

for ($i=0; $i<21; $i++){

  print MATRIX substr($aa_code,$i,1) . "\t";
  
  if ((substr($aa_code,$i,1) eq 'K')||
      (substr($aa_code,$i,1) eq 'D')||
      (substr($aa_code,$i,1) eq 'E')||
      (substr($aa_code,$i,1) eq '*')||
      (substr($aa_code,$i,1) eq 'H')||
      (substr($aa_code,$i,1) eq 'R')){
    
    print MATRIX "-99.000\t-99.000\t-99.000\t-99.000\t-99.000\n";
     
  }
  
  else{

    
  if ((substr($aa_code,$i,1) eq 'A')||
	 (substr($aa_code,$i,1) eq 'V')||
	 (substr($aa_code,$i,1) eq 'L')||
	 (substr($aa_code,$i,1) eq 'I')||
	 (substr($aa_code,$i,1) eq 'F')){
    
    print MATRIX "10.000\t";                                    	
    
  }
  
  else{
      
    print MATRIX "1.000\t";
      
  }
  
  printf MATRIX "%4.3f\t", $actual_freq_4{substr($aa_code,$i,1)};
  printf MATRIX "%4.3f\t", $actual_freq_3{substr($aa_code,$i,1)};
  printf MATRIX "%4.3f\t", $actual_freq_2{substr($aa_code,$i,1)};
  printf MATRIX "%4.3f\n", $actual_freq_1{substr($aa_code,$i,1)};
  
 }

}
    
close(FASTA);
close (ORFS);
close(MATRIX);

print "[done]";
print "\n\nWeight-matrix file creation was successful ($matrix_file)\n\n";

exit 0;
