/*
 * Kernels.cu
 *
 *  Created on: Jan 10, 2011
 *      Author: fritz
 */


// Scoring parameters and buffer geometry read by the kernels in this file.
// NOTE(review): presumably uploaded from the host (e.g. cudaMemcpyToSymbol)
// before any kernel launch — confirm against the host code.

// Added to the cell taken from column + 1 of the previous DP line (the move the
// score kernels record as CIGAR_I).
__device__ __constant__ short gapread;
// Added to the running left-neighbour score (the move the score kernels record
// as CIGAR_D).
__device__ __constant__ short gapref;
// Zero constant used for initialisation and as the score floor in SW_Cuda.
__device__ __constant__ short const null = 0;
// Length of one output row per thread in the backtracking kernels; each thread
// owns 2 * alignment_length_gpu elements of the alignments buffer.
__device__ __constant__ short alignment_length_gpu;
// Stride, in chars, between consecutive reads in the reads buffer.
__device__ __constant__ short read_len;
// Number of cells in one per-thread DP line (the inner loops visit corr_len - 1 of them).
__device__ __constant__ short corr_len;
// Number of reference chars per thread in the (interleaved) reference buffer.
__device__ __constant__ short ref_len;
// Added to the diagonal cell when read and reference chars differ and the read char is not 'N'.
__device__ __constant__ short mismatch;
// Added to the diagonal cell when read and reference chars are equal.
__device__ __constant__ short match;


// Pair of coordinates (reference index, read index) of a best-scoring cell.
// NOTE(review): the kernels visible in this file only mention this type in
// commented-out code; confirm whether it is still used elsewhere.
typedef struct {
	short best_ref_index;
	short best_read_index;
} Index;


/*
 * interleaveSeq: copies one sequence of ref_len chars per thread from a
 * contiguous layout into a layout strided by blockDim.x.
 *
 * src  - thread t reads ref_len consecutive chars starting at t * ref_len.
 * dest - char i of thread t is written at
 *        (t / blockDim.x) * blockDim.x * ref_len + (t % blockDim.x) + i * blockDim.x.
 *
 * No bounds check is performed: the launch must cover exactly the number of
 * sequences present in src (assumption to be confirmed against the caller).
 */
__global__ void interleaveSeq(char * src, char * dest) {

	const unsigned int tid = (blockIdx.x*blockDim.x + threadIdx.x) + (blockIdx.y*blockDim.y + threadIdx.y) *blockDim.x;

	// Start of this thread's contiguous input sequence.
	const char * in = src + (tid * ref_len);

	// First output slot: base of the group of blockDim.x sequences this thread
	// belongs to, plus the thread's position inside that group.
	char * out = dest + ((tid / blockDim.x) * blockDim.x * ref_len) + tid % blockDim.x;

	for (int pos = 0; pos < ref_len; ++pos) {
		out[pos * blockDim.x] = in[pos];
	}
}



/*
 * SW_Cuda: score-only banded local alignment, one read per thread.
 *
 * scaffold_gpu - reference chars, strided by blockDim.x per thread.
 * reads_gpu    - reads stored contiguously, read_len chars apart, each
 *                terminated by line_end.
 * results_gpu  - results_gpu[index] receives the best cell score as float.
 *
 * Dynamic shared memory: each thread uses corr_len shorts, strided by
 * blockDim.x and starting at offset threadIdx.x.
 * Scores are floored at 0 (null) and the maximum over all visited cells is
 * reported. No bounds check on the thread index is performed.
 */
__global__ void SW_Cuda(char * scaffold_gpu,char *reads_gpu,float *results_gpu){

	const unsigned int index = (blockIdx.x*blockDim.x + threadIdx.x) + (blockIdx.y*blockDim.y + threadIdx.y) *blockDim.x;

	extern __shared__ bool smemf[];
	// This thread's DP line: element k lives at column[k * blockDim.x].
	short * const column = ((short*)&smemf[0]) + threadIdx.x;

	const char * ref = scaffold_gpu + ((index / blockDim.x) * blockDim.x * ref_len) + index % blockDim.x;
	const char * query = reads_gpu + (index *read_len);

	// Clear all corr_len cells of the DP line.
	for (int cell = null; cell < corr_len*blockDim.x; cell += blockDim.x) {
		column[cell] = null;
	}

	int best = null;
	char base;
	while ((base = *query) != line_end) {

		// Score of the cell to the left; restarts at 0 for every read char.
		int running = null;
		for (int z = 0; z < corr_len-1; ++z) {
			const unsigned int here = z*blockDim.x;

			// Diagonal predecessor plus match/mismatch; an 'N' in the read
			// that does not equal the reference char adds nothing.
			short diag = column[here];
			if (base == ref[here]) {
				diag += match;
			} else if (base != 'N') {
				diag += mismatch;
			}

			running = max(running+gapref, null);
			running = max(running, diag);
			running = max(running, column[here+blockDim.x]+gapread);
			best = max(best, running);
			column[here] = running;
		}
		// The reference window moves one char per read char.
		ref += blockDim.x;
		++query;
	}
	results_gpu[index] = (float)best;
}


/*
 * NW_Cuda: score-only banded alignment without a zero floor, one read per
 * thread.
 *
 * scaffold_gpu - reference chars, strided by blockDim.x per thread.
 * reads_gpu    - reads stored contiguously, read_len chars apart, each
 *                terminated by line_end.
 * results_gpu  - results_gpu[index] receives the largest value found in the
 *                first corr_len - 1 cells of the final DP line.
 *
 * Dynamic shared memory: each thread uses corr_len shorts, strided by
 * blockDim.x and starting at offset threadIdx.x.
 * No bounds check on the thread index is performed.
 */
__global__ void NW_Cuda(char * scaffold_gpu,char *reads_gpu,float *results_gpu){
	const unsigned int index = (blockIdx.x*blockDim.x + threadIdx.x) + (blockIdx.y*blockDim.y + threadIdx.y) *blockDim.x;

	extern __shared__ bool smemf[];
	// This thread's DP line: element k lives at column[k * blockDim.x].
	short * const column = ((short*)&smemf[0]) + threadIdx.x;

	const char * ref = scaffold_gpu + ((index / blockDim.x) * blockDim.x * ref_len) + index % blockDim.x;
	const char * query = reads_gpu + (index *read_len);

	// Clear the DP line, then make the last cell so low that adding gapread
	// to it lands on short_min.
	for (short cell = null; cell < corr_len; cell++) {
		column[cell*blockDim.x] = null;
	}
	column[(corr_len-1) * blockDim.x] = short_min-gapread;

	char base;
	while ((base = *query) != line_end) {

		// Left neighbour of the first cell: adding gapref yields short_min.
		short running = short_min-gapref;
		for (int z = 0; z < corr_len-1; ++z) {
			const unsigned int here = z*blockDim.x;

			// Diagonal predecessor plus match/mismatch; an 'N' in the read
			// that does not equal the reference char adds nothing.
			short diag = column[here];
			if (base == ref[here]) {
				diag += match;
			} else if (base != 'N') {
				diag += mismatch;
			}

			running = max(running+gapref, diag);
			running = max(running, column[here+blockDim.x]+gapread);
			column[here] = running;
		}
		// The reference window moves one char per read char.
		ref += blockDim.x;
		++query;
	}

	// Best score is taken from the final DP line only.
	short best = short_min;
	for (int z = 0; z < corr_len-1; z++) {
		const short candidate = column[z*blockDim.x];
		if (best < candidate) {
			best = candidate;
		}
	}
	results_gpu[index] = (short)best;

}




/*
 * CUDASW_Score: banded local-alignment scoring (cell scores are floored at 0)
 * that also records a backtracking pointer for every visited cell.
 * One read per thread.
 *
 * scaff2  - reference chars, strided by blockDim.x per thread.
 * read2   - reads stored contiguously, read_len chars apart, each terminated
 *           by line_end.
 * result2 - result_number shorts per thread. Written here:
 *             result[0] = read index of the best cell,
 *             result[1] = column (ref_index) of the best cell,
 *             result[2] = read_index - result[0] - 1 after the last row,
 *             result[3] = best score.
 * matrix2 - pointer matrix; each thread owns (corr_len + 1) * (read_len + 1)
 *           chars strided by blockDim.x. Row 0 is filled with CIGAR_STOP.
 *
 * Dynamic shared memory: corr_len shorts per thread at offset
 * threadIdx.x * corr_len.
 * No bounds check on the thread index is performed.
 */
__global__ void CUDASW_Score( char const * scaff2,  char const * read2, short * result2, char * matrix2) {
	unsigned int global_index = (blockIdx.x*blockDim.x + threadIdx.x) + (blockIdx.y*blockDim.y + threadIdx.y) *blockDim.x;

	// Per-thread views into the global buffers.
	char * matrix = matrix2 + ((global_index / blockDim.x) * blockDim.x * ((corr_len + 1) * (read_len + 1))) + global_index % blockDim.x;
	const char * read = read2 + (global_index * read_len);
	const char * scaff = scaff2 + ((global_index /  blockDim.x) *  blockDim.x * ref_len) + global_index %  blockDim.x;
	short *result = result2 + result_number * global_index;

	// Previous DP line of this thread, kept in shared memory.
	extern __shared__ bool smemf[];
	short * l_matrix_lines = (short*)&smemf[0];
	short * local_matrix_line=l_matrix_lines + threadIdx.x * corr_len;

	//Init matrix lines
	for (short i = 0; i < corr_len; ++i) {
		local_matrix_line[i] = 0;
		matrix[i * blockDim.x] = CIGAR_STOP;
	}
	matrix[corr_len * blockDim.x] = CIGAR_STOP;

	// -1 guarantees that the first evaluated cell (score >= 0) is recorded.
	short curr_max = -1;
	short read_index = 0;

	for (char read_char_cache; (read_char_cache = *read) != line_end; ++read) {
		// Advance to the pointer-matrix row of this read char.
		matrix += (corr_len + 1) * blockDim.x;

		short left_cell = 0;
		// Column 0 of every row stops the backtracking.
		matrix[0] = CIGAR_STOP;
		for (short ref_index = 0; ref_index < corr_len - 1; ++ref_index) {

			//init values
			left_cell += gapref;
			// scaff advances by one reference char per read char (see end of
			// the outer loop), so the same column of the previous line is
			// the diagonal predecessor.
			short diag_cell = local_matrix_line[ref_index];

			int pointer = CIGAR_X;
			if (read_char_cache == scaff[ref_index *  blockDim.x]) {
				diag_cell += match;
				pointer = CIGAR_EQ;
			} else if (read_char_cache != 'N' && read_char_cache != line_end) {
				diag_cell += mismatch;
			}



			// Column + 1 of the previous line. For the last visited column this
			// reads local_matrix_line[corr_len - 1], which the loop never
			// updates and therefore still holds its initial 0.
			short up_cell = local_matrix_line[ref_index + 1] + gapread;

			//find max
			short max_cell = 0;
			max_cell = max(left_cell, max_cell);
			max_cell = max(diag_cell, max_cell);
			max_cell = max(up_cell, max_cell);


			// Pick the pointer. local_matrix_line[ref_index] still holds the
			// previous row's value here, so (value + mismatch) is the score a
			// mismatching diagonal step would give; gap moves are only chosen
			// when the score differs from that.
			if (max_cell == up_cell && max_cell != (local_matrix_line[ref_index] + mismatch)) {
				//pointer = 2;
				pointer = CIGAR_I;
			} else if (max_cell == left_cell && max_cell != (local_matrix_line[ref_index] + mismatch)) {
				//pointer = 1;
				pointer = CIGAR_D;
			} else if (max_cell > 0 && (max_cell == diag_cell || max_cell == (local_matrix_line[ref_index] + mismatch) || max_cell == (local_matrix_line[ref_index] + match))) {
				//pointer = 4;
				// keep CIGAR_X / CIGAR_EQ chosen above
			} else {
				pointer = CIGAR_STOP;
			}

			// Pointer columns are shifted by one because column 0 is the stop column.
			matrix[(ref_index + 1)*blockDim.x] = pointer;

			// Strictly greater: the first cell reaching a score wins ties.
			if (max_cell > curr_max) {
				curr_max = max_cell;
				result[1] = ref_index;
				result[0] = read_index;
				result[3]=curr_max;
			}
			left_cell = max_cell;
			local_matrix_line[ref_index] = max_cell;
		}
		matrix[corr_len * blockDim.x] = CIGAR_STOP;
		scaff +=  blockDim.x;
		read_index += 1;
	}
	// NOTE(review): for an empty read (read_index == 0) result[0] has not been
	// written by this kernel when it is read on the next line — confirm the
	// buffer is initialised by the caller.
	result[2] = read_index - result[0] - 1;
	if (read_index == 0) {
		result[0] = result[1] =2;
	}
}




/*
 * CUDANW_Score: banded alignment scoring without a zero floor that also
 * records a backtracking pointer for every visited cell. One read per thread.
 *
 * scaff  - reference chars, strided by blockDim.x per thread.
 * read   - reads stored contiguously, read_len chars apart, each terminated
 *          by line_end.
 * result - result_number shorts per thread. Written here:
 *            result[param_best_read_index] = index of the last read char,
 *            result[param_best_ref_index]  = column of the best cell in the
 *                                            final DP line,
 *            result[qend]                  = 0.
 *          For an empty read both indices are reset to 0.
 * matrix - pointer matrix; each thread owns (corr_len + 1) * (read_len + 1)
 *          chars strided by blockDim.x. Row 0 is filled with CIGAR_STOP, the
 *          first and last column of every other row with CIGAR_M.
 *
 * Dynamic shared memory: corr_len shorts per thread at offset
 * threadIdx.x * corr_len.
 * No bounds check on the thread index is performed.
 */
__global__ void CUDANW_Score( char const * scaff,  char const * read, short * result, char * matrix) {
	unsigned int global_index = (blockIdx.x*blockDim.x + threadIdx.x) + (blockIdx.y*blockDim.y + threadIdx.y) *blockDim.x;

	// Per-thread views into the global buffers.
	matrix = matrix + ((global_index / blockDim.x) * blockDim.x * ((corr_len + 1) * (read_len + 1))) + global_index % blockDim.x;
	read = read + (global_index * read_len);
	scaff = scaff + ((global_index /  blockDim.x) *  blockDim.x * ref_len) + global_index %  blockDim.x;
	result = result + result_number * global_index;

	// Previous DP line of this thread, kept in shared memory.
	extern __shared__ bool smemf[];
	short * l_matrix_lines = (short*)&smemf[0];
	short * local_matrix_line=l_matrix_lines + threadIdx.x * corr_len;

	//Init matrix lines
	for (short i = 0; i < corr_len; ++i) {
		local_matrix_line[i] = 0;
		matrix[i * blockDim.x] = CIGAR_STOP;
	}
	matrix[corr_len * blockDim.x] = CIGAR_STOP;
	// The last cell is never updated by the loop below; it stays at short_min.
	local_matrix_line[(corr_len - 1)] = short_min;

	short read_index = 0;

	for (char read_char_cache; (read_char_cache = *read) != line_end; ++read) {
		// Advance to the pointer-matrix row of this read char.
		matrix += (corr_len + 1) * blockDim.x;

		// NOTE(review): NW_Cuda starts from short_min - gapref so that adding
		// gapref lands on short_min; here gapref is added to short_min itself.
		// Whether that can wrap depends on the value of short_min — confirm.
		short left_cell = short_min;
		matrix[0] = CIGAR_M;
		for (short ref_index = 0; ref_index < corr_len - 1; ++ref_index) {

			//init values
			left_cell += gapref;
			// scaff advances by one reference char per read char, so the same
			// column of the previous line is the diagonal predecessor.
			short diag_cell = local_matrix_line[ref_index];
			int pointer = CIGAR_X;
			if (read_char_cache == scaff[ref_index * blockDim.x]) {
				diag_cell += match;
				pointer = CIGAR_EQ;
			} else if (read_char_cache != 'N' && read_char_cache != line_end) {
				diag_cell += mismatch;
			}

			short up_cell = local_matrix_line[ref_index + 1] + gapread;

			//find max (no zero floor, unlike the SW variant)
			short max_cell = 0;
			max_cell = max(diag_cell, left_cell);
			max_cell = max(up_cell, max_cell);

			// Pick the pointer. local_matrix_line[ref_index] still holds the
			// previous row's value here; gap moves are only chosen when the
			// score differs from (previous value + mismatch).
			if (max_cell == up_cell && max_cell != (local_matrix_line[ref_index] + mismatch)) {
				pointer = CIGAR_I;
			} else if (max_cell == left_cell && max_cell != (local_matrix_line[ref_index] + mismatch)) {
				pointer = CIGAR_D;
			} else if ((max_cell == diag_cell || max_cell == (local_matrix_line[ref_index] + mismatch) || max_cell == (local_matrix_line[ref_index] + match))) {
				// keep CIGAR_X / CIGAR_EQ chosen above
			} else {
				// Sentinel for "no predecessor matched"; max_cell always equals
				// one of the three candidates, so this branch looks unreachable.
				// The value is truncated when stored into the char matrix.
				pointer = -666;
			}

			// Pointer columns are shifted by one because column 0 is reserved.
			matrix[(ref_index + 1) * blockDim.x] = pointer;

			left_cell = max_cell;
			local_matrix_line[ref_index] = max_cell;
		}
		matrix[corr_len * blockDim.x] = CIGAR_M;
		scaff += blockDim.x;
		read_index += 1;
	}

	// The end cell is searched in the final DP line only.
	// NOTE(review): scanning starts at -1, so a best score <= -1 leaves the
	// result indices unwritten, whereas NW_Cuda scans from short_min —
	// confirm this is intended.
	short curr_max = -1;
	for (short i = 0; i < corr_len; ++i) {
		if (local_matrix_line[i] > curr_max) {
			curr_max = local_matrix_line[i];
			result[param_best_read_index] = read_index - 1;
			result[param_best_ref_index] = i;
		}
	}

	result[qend] = 0;
	if (read_index == 0) {
		// Empty read: reset BOTH indices. The previous code assigned
		// param_best_read_index to itself and never reset the ref index.
		result[param_best_read_index] = result[param_best_ref_index] = 0;
	}
}






/*
 * CUDASW_Backtracking: walks the pointer matrix backwards from the best cell
 * and writes the aligned reference / read strings. One alignment per thread.
 *
 * scaff      - reference chars, strided by blockDim.x per thread.
 * read       - reads stored contiguously, read_len chars apart.
 * result     - result_number shorts per thread. Read: result[0] (best read
 *              index) and result[1] (best column). Written on exit, only when
 *              result[0] > 0 on entry:
 *                result[0] = reference index of the first aligned char,
 *                result[1] = read index of the first aligned char,
 *                result[3] = index just before the first written alignment char.
 * matrix     - pointer matrix laid out as (corr_len + 1) * (read_len + 1)
 *              chars per thread, strided by blockDim.x.
 * alignments - 2 * alignment_length_gpu chars per thread: the first row holds
 *              reference chars (or '-'), the second row read chars (or '-').
 *              Both rows are filled from the back and '\0'-terminated at
 *              position alignment_length_gpu - 1.
 *
 * Dynamic shared memory: two arrays of blockDim.x shorts (write position and
 * current read index per thread).
 * No bounds check on the thread index or on the write position is performed.
 */
__global__ void CUDASW_Backtracking( char const * scaff, char const * read,  short * result,  char * matrix,char * alignments) {
	unsigned int nIndex = (blockIdx.x*blockDim.x + threadIdx.x) + (blockIdx.y*blockDim.y + threadIdx.y) *blockDim.x;

	extern __shared__ bool smemf[];
	short * alignment_index = (short*)&smemf[0];
	// Second array starts right after blockDim.x shorts (smemf is byte-addressed).
	short * best_read_index= (short*)&smemf[blockDim.x*sizeof(short)];

	// Per-thread views into the global buffers.
	matrix = matrix + ((nIndex / blockDim.x) * blockDim.x * ((corr_len + 1) * (read_len + 1))) + nIndex % blockDim.x;
	read = read + (nIndex *read_len);
	scaff = scaff + ((nIndex / blockDim.x) * blockDim.x * ref_len) + nIndex % blockDim.x;
	result = result + result_number * nIndex;

	best_read_index[threadIdx.x] = result[0];
	short best_ref_index = result[1];

	if (best_read_index[threadIdx.x] > 0) {

		alignments = alignments + (nIndex * alignment_length_gpu * 2);
		// Jump to the pointer-matrix row of the best cell (row 0 is the init row).
		matrix += (((corr_len + 1) * (best_read_index[threadIdx.x] + 1)) * blockDim.x);

		// Index into the strided reference: band column plus row offset.
		short abs_ref_index = best_ref_index + best_read_index[threadIdx.x];
		alignment_index[threadIdx.x] = alignment_length_gpu - 2;

		alignments[alignment_length_gpu - 1] = '\0';
		alignments[alignment_length_gpu - 1 + alignment_length_gpu] = '\0';

		int pointer = CIGAR_STOP;
		// The read-index guard must test this thread's value; the previous code
		// compared the shared-memory pointer itself, which is always true.
		while (best_read_index[threadIdx.x] >= 0 && (pointer = matrix[(best_ref_index + 1) * blockDim.x]) != CIGAR_STOP) {
			if (pointer == CIGAR_X || pointer == CIGAR_EQ) {
				// Diagonal step: consume one reference and one read char.
				alignments[alignment_index[threadIdx.x]] = scaff[abs_ref_index-- * blockDim.x];
				alignments[alignment_index[threadIdx.x] + alignment_length_gpu] = read[best_read_index[threadIdx.x]];
				matrix -= ((corr_len + 1) * blockDim.x);
				best_read_index[threadIdx.x] -= 1;
			} else if (pointer == CIGAR_I) {
				// Consume a read char only; moving up one row shifts the band,
				// so the column index grows by one.
				alignments[alignment_index[threadIdx.x]] = '-';
				alignments[alignment_index[threadIdx.x] + alignment_length_gpu] = read[best_read_index[threadIdx.x]];
				matrix -= ((corr_len + 1) * blockDim.x);
				best_read_index[threadIdx.x] -= 1;
				best_ref_index += 1;
			} else {
				// Any other pointer: consume a reference char only.
				alignments[alignment_index[threadIdx.x]] = scaff[abs_ref_index-- * blockDim.x];
				alignments[alignment_index[threadIdx.x] + alignment_length_gpu] = '-';
				best_ref_index -= 1;
			}
			alignment_index[threadIdx.x] -= 1;
		}
		result[0] = abs_ref_index + 1;
		result[1] = best_read_index[threadIdx.x] + 1;
		result[3] = alignment_index[threadIdx.x];
	}

}




/*
 * CUDASW_Backtracking_CIGAR: walks the pointer matrix backwards from the best
 * cell and emits the alignment as run-length encoded operations.
 * One alignment per thread.
 *
 * scaff, read - offset per thread like in the other kernels but not
 *               dereferenced here.
 * result      - result_number shorts per thread. Read: param_best_read_index,
 *               param_best_ref_index, qend. Written (only when the best read
 *               index is > 0): ref_position, qstart, alignment_offset.
 * matrix      - pointer matrix laid out as (corr_len + 1) * (read_len + 1)
 *               chars per thread, strided by blockDim.x.
 * alignments  - 2 * alignment_length_gpu shorts per thread, filled backwards
 *               starting at index alignment_length_gpu - 2. Each entry is
 *               (run length << 4 | operation).
 *
 * Dynamic shared memory: two arrays of blockDim.x shorts (current column and
 * current read index per thread).
 * No bounds check on the thread index or on the write position is performed.
 */
__global__ void CUDASW_Backtracking_CIGAR(char const * scaff, char const * read, short * result, char * matrix, short * alignments) {
	const unsigned int global_index = (blockIdx.x*blockDim.x + threadIdx.x) + (blockIdx.y*blockDim.y + threadIdx.y) *blockDim.x;
	const unsigned int lane = threadIdx.x;
	// Distance between two consecutive rows of the pointer matrix.
	const unsigned int row_stride = (corr_len + 1) * blockDim.x;

	// Per-thread views into the global buffers.
	matrix = matrix + ((global_index / blockDim.x) * blockDim.x * ((corr_len + 1) * (read_len + 1))) + global_index % blockDim.x;
	read = read + (global_index * read_len);
	scaff = scaff + ((global_index /  blockDim.x) *  blockDim.x * ref_len) + global_index %  blockDim.x;
	result = result + result_number * global_index;

	extern __shared__ bool smemf[];
	short * ref_slot = (short*)&smemf[0];
	// Second array starts right after blockDim.x shorts (smemf is byte-addressed).
	short * read_slot = (short*)&smemf[blockDim.x*sizeof(short)];

	read_slot[lane] = result[param_best_read_index];
	ref_slot[lane] = result[param_best_ref_index];

	// Nothing to backtrack unless the best read index is positive.
	if (read_slot[lane] <= 0) {
		return;
	}

	alignments = alignments + (global_index * alignment_length_gpu * 2);
	// Jump to the pointer-matrix row of the best cell (row 0 is the init row).
	matrix += (((corr_len + 1) * (read_slot[lane] + 1)) * blockDim.x);

	short ref_pos = ref_slot[lane] + read_slot[lane];
	short out = alignment_length_gpu - 2;

	// Current run: starts as a CIGAR_S run whose length was left in
	// result[qend] by the scoring kernel.
	int run_op = CIGAR_S;
	int run_len = result[qend];

	for (;;) {
		const int op = matrix[(ref_slot[lane] + 1) * blockDim.x];
		if (op == CIGAR_STOP) {
			break;
		}

		// Extend the current run or flush it and start a new one.
		if (op != run_op) {
			alignments[out--] = (run_len << 4 | run_op);
			run_op = op;
			run_len = 1;
		} else {
			run_len += 1;
		}

		// Step to the predecessor cell.
		if (op == CIGAR_X || op == CIGAR_EQ) {
			// Diagonal: one row up, one reference char back.
			matrix -= row_stride;
			read_slot[lane] --;
			ref_pos -= 1;
		} else if (op == CIGAR_I) {
			// One row up only; the band shifts, so the column grows by one.
			matrix -= row_stride;
			read_slot[lane]--;
			ref_slot[lane] ++;
		} else {
			// Same row, one column (and reference char) back.
			ref_slot[lane] --;
			ref_pos -= 1;
		}
	}

	// Flush the last run, then record the unaligned read prefix as CIGAR_S.
	alignments[out--] = (run_len << 4 | run_op);
	alignments[out] = ((read_slot[lane] + 1) << 4 | CIGAR_S);

	result[ref_position] = ref_pos + 1;
	result[qstart] = read_slot[lane] + 1;
	//qend was set by "forward" kernel
	result[alignment_offset] = out;
}
