#! /usr/bin/perl -w


# Author : huy.dinh@univie.ac.at
### NextGenMap profiling
# Version : 0.0.1

use strict;
use POSIX qw(ceil floor);

&bs_scorer_post_processing();

sub bs_scorer_post_processing {
	my $result_file = ""; my $ref_genome_file = ""; my $identity_cutoff = 85;
	print "Useage: perl biss_downstream.pl -r reference_file -m	mapping_outfile -c identity_cutoff \n";
	# building a hash of important parameters
	my $flag = 0;
	foreach my $arg (@ARGV) {
		if ($arg =~ /^-[m]/) {$flag = 1;next;}
		elsif ($arg =~ /^-[r]/) {$flag = 2;next;}
		elsif ($arg =~ /^-[c]/) {$flag = 3;next;}
		## getting values for paramters
		if ($flag==1) {$result_file = $arg; next;}
		elsif ($flag==2) {$ref_genome_file = $arg;next;}
		elsif ($flag==3) {$identity_cutoff = convert($arg);next;}
	}
	
	bs_scorer_profiling($ref_genome_file, 
						$result_file, $result_file."err", 
						$result_file."W.profile",
						$result_file."C.profile",
						$identity_cutoff);
}

sub bs_scorer_profiling {
	my ($ref_genome_file, $result_file, $error_file, $profile_W, $profile_C, $identity_cutoff) = @_;
	
	print("Reading ", $ref_genome_file, "... ");
	open(IN, $ref_genome_file ) or die "$!\n";
	my $first_line = 1;
	my $sequence = undef;
	while (<IN>) {
		chomp;
		if ($first_line == 1) {$first_line = 0; next;}
		$sequence = $sequence.$_;
	}
	close(IN);
	$sequence = uc $sequence;
	print(length($sequence), " nucleotides.\n");	
	
	my @profile_A = undef; my @profile_C = undef; my @profile_G = undef; my @profile_T = undef; my @profile_other = undef;	
	my @profile_A_crick = undef; my @profile_C_crick = undef; my @profile_G_crick = undef; my @profile_T_crick = undef; my @profile_other_crick = undef;	
	
	my $ref_length = length($sequence);
	
	for (my $i=0; $i<$ref_length; $i++) {
		$profile_A[$i] = $profile_C[$i] = $profile_G[$i] = $profile_T[$i] = $profile_other[$i] = 0;
		$profile_A_crick[$i] = $profile_C_crick[$i] = $profile_G_crick[$i] = $profile_T_crick[$i] = $profile_other_crick[$i] = 0;
	}	

	open(OUT1, ">$profile_W") or die "$profile_W: $!\n";
	open(OUT2, ">$profile_C") or die "$profile_C: $!\n";

	my $get = 0; my $result_begin = 0; my $is_unique = 0; my $result_genome = 0;
	my $aln_ref = ""; my $aln_read = "";
	
	my $count_short_len = 0;
	my $count_total = 0;
	
	open(IN, $result_file ) or die "$!\n";
	while (<IN>) {
		chomp;
		my @line=split /\s+/,$_;
		
		if ($get%4==0) {
			$result_begin = convert($line[11]);		
		}
		
		if ($get%4==1) {
			if ($line[1] eq "1") {$is_unique=1;} else {$is_unique=0};
			if (substr($line[4], 0, 1) eq "+") {$result_genome = 0;} #Watson
			else {$result_genome = 1;} #Crick
		}
		
		if ($get%4==2) {$aln_ref = $line[0];}
		
		if ($get%4==3 && $is_unique==1) {
			$count_total++;
			$aln_read = $line[0]; my $ind = 0; 
			my $similarity = 0; #hd($aln_ref, $aln_read);
			#print $aln_ref, "\n";print $aln_read, "\n";
			for (my $i=0; $i<length($aln_ref); $i++) {
				if (substr($aln_ref, $i, 1) eq substr($aln_read, $i, 1)) {$similarity++;}
				elsif (substr($aln_ref, $i, 1) eq "C" && substr($aln_read, $i, 1) eq "T" && $result_genome == 0) {$similarity++;}
				elsif (substr($aln_ref, $i, 1) eq "G" && substr($aln_read, $i, 1) eq "A" && $result_genome == 1) {$similarity++;}
				elsif (substr($aln_ref, $i, 1) eq "-" || substr($aln_read, $i, 1) eq "-") {$similarity++;}
			}
			#print $result_genome, "\n", $similarity, "\n", length($aln_ref), "\n"; exit;
			if ($similarity/length($aln_ref)*100 < $identity_cutoff) {
				$count_short_len++;
				$get++;
				next;
			}
			
			#removing the insertion
			if (index($aln_ref, "-") >=0) {
				my $temp2 = "";
				for (my $i=0; $i<length($aln_read); $i++) {
					if (substr($aln_ref, $i, 1) ne "-") {$temp2 = $temp2.substr($aln_ref, $i, 1);}
				}
				$aln_read = $temp2;
			}
			
			# getting alignment profile
			if ($result_genome == 0) {
				for (my $i=0; $i<length($aln_read); $i++) {
					if (substr($aln_read, $i, 1) eq "A") {$profile_A[$result_begin+$i]++;}
					elsif (substr($aln_read, $i, 1) eq "C") {$profile_C[$result_begin+$i]++;}
					elsif (substr($aln_read, $i, 1) eq "G") {$profile_G[$result_begin+$i]++;}
					elsif (substr($aln_read, $i, 1) eq "T") {$profile_T[$result_begin+$i]++;}
					else {$profile_other[$result_begin-1+$i]++;}
				}
			}
			else {
				my $revcomp = $aln_read;
				$revcomp =~ tr/ACGTacgt/TGCAtgca/; 
				$aln_read = $revcomp;
				for (my $i=0; $i<length($aln_read); $i++) {
					if (substr($aln_read, $i, 1) eq "A") {$profile_A_crick[$result_begin+$i]++;}
					elsif (substr($aln_read, $i, 1) eq "C") {$profile_C_crick[$result_begin+$i]++;}
					elsif (substr($aln_read, $i, 1) eq "G") {$profile_G_crick[$result_begin+$i]++;}
					elsif (substr($aln_read, $i, 1) eq "T") {$profile_T_crick[$result_begin+$i]++;}
					else {$profile_other_crick[$result_begin+$i]++;}
				}			
			}
		} 
		$get++;
	}
	
	# printing the alignment profile
	for (my $i=0; $i<$ref_length; $i++) {
		if (substr($sequence, $i, 1) eq "C") {
			print OUT1 $i, "\t";
			print OUT1 $profile_C[$i], "\t", $profile_T[$i], "\t", $profile_A[$i]+$profile_C[$i]+$profile_G[$i]+$profile_T[$i]+$profile_other[$i], "\n";
		}
		if (substr($sequence, $i, 1) eq "G") {
			print OUT2 $i, "\t";
			print OUT2 $profile_C_crick[$i], "\t", $profile_T_crick[$i], "\t", $profile_A_crick[$i]+$profile_C_crick[$i]+$profile_G_crick[$i]+$profile_T_crick[$i]+$profile_other_crick[$i], "\n";
		}
	}
	
	close(IN); close(OUT1); close(OUT2);
}

sub convert {
	my $strNumber = $_[0];
  $strNumber = sprintf("%04d", $strNumber);
  my $number = $strNumber + 1 - 1;
  return $number;
}

sub ACGT {
	if ($_[0] eq "A") {return 0;}
	elsif ($_[0] eq "C") {return 1;}
	elsif ($_[0] eq "G") {return 2;}
	elsif ($_[0] eq "T") {return 3;}
	else {return 4;}
}

sub hd {
    return ($_[0] ^ $_[1]) =~ tr/\001-\255//;
}

