/*
 * Alignment.cpp
 *
 *  Created on: Jul 28, 2011
 *      Author: fritz
 */


#include "Aligment.cuh"
#include <stdio.h>


/*
 * Selects the CUDA device and reads the length configuration.
 *
 * gpu_id is a packed value: the low byte is the CUDA device index, and the
 * CIGAR mode is switched on when the next byte equals 1.
 *
 * The process is terminated (exit status 1) when the requested device index
 * is not available or the device properties cannot be queried.
 */
Alignment::Alignment(int gpu_id){
	cigar=bool(((gpu_id >> 8) & 0xFF)==1);
	gpu_id=gpu_id& 0xFF;

	// The destructor delete[]s these buffers unconditionally; start them at
	// NULL so destroying an object whose buffers were never allocated is safe.
	pos_cpu = NULL;
	results_cigar_cpu = NULL;
	results_Alignment_cpu = NULL;

	int devcout = cudaDevCount();
	if (gpu_id >= devcout) {
		Log.Error("Your Pc is equipped with %i cards. You selected card number: %i", devcout, gpu_id);
		Log.Error("The GPU ID that you have chosen does not exists. Exiting.");
		// Fatal configuration error: report failure to the calling process.
		exit(1);
	}

	cudaSetDevice(gpu_id);

	deviceProp = new cudaDeviceProp;
	// Clear the whole structure; sizeof(deviceProp) would only be the size
	// of the pointer.
	memset(deviceProp, 0, sizeof(*deviceProp));
	if (cudaSuccess != cudaGetDeviceProperties(deviceProp, gpu_id)) {
		Log.Error("Something is wrong with the graphics card. Exiting.");
		exit(1);
	}

	read_length = Config.GetInt("qry_max_len"); //Read Size
	corridor_length = Config.GetInt("corridor")+ 1;//Corridor Size
	ref_length = read_length + corridor_length; //Reference Size

	alignment_length = corridor_length + read_length+1;

	cudaStreamCreate(&stream[0]);
	cudaStreamCreate(&stream[1]);
}

/*
 * Frees the device buffers and the host buffers released via cudaFreeHost,
 * then calls cudaThreadExit(). Which result buffers are freed depends on
 * the `type` and `cigar` members.
 *
 * checkCUDAError() throws if one of the preceding CUDA calls left an error.
 *
 * NOTE(review): cudaThreadExit() is deprecated in the CUDA runtime in favour
 * of cudaDeviceReset() - consider switching once the minimum toolkit
 * version is confirmed.
 */
void Alignment::thread_exit(){

	// Input buffers, released in every mode.
	cudaFree(reads_gpu);
	cudaFree(scaffold_gpu);
	cudaFree(scaff_dest_gpu);
	cudaFreeHost(scaff_cpu);
	cudaFreeHost(reads_cpu);

	checkCUDAError("after delete2");

	if (!type) {
		cudaFree(matrix_gpu);

		// Only one of the two result buffers is released, chosen by the CIGAR flag.
		if (cigar) {
			cudaFree(results_cigar_gpu);
		} else {
			cudaFree(results_Alignment_gpu);
		}
		cudaFree(pos_gpu);

		checkCUDAError("after delete1");
	} else {
		cudaFree(results_gpu);
	}

	cudaThreadExit();
}

/*
 * Releases the host-side buffers and the device property structure owned by
 * this object.
 */
Alignment::~Alignment(){

	delete [] pos_cpu;
	delete [] results_cigar_cpu;
	delete [] results_Alignment_cpu;

	// deviceProp is allocated with plain `new` in the constructor and was
	// previously never released.
	delete deviceProp;
	deviceProp = NULL;
}

/*
 * Checks (and clears) the last CUDA runtime error.
 *
 * msg is a caller-supplied context string that is logged together with the
 * CUDA error description.
 *
 * Throws msg itself (as `const char *`) when an error is pending. The
 * CUDA error text is only logged, not thrown.
 */
void Alignment::checkCUDAError(const char *msg) {
	cudaError_t const err = cudaGetLastError();
	if (cudaSuccess == err) {
		return;
	}

	Log.Error("Cuda error: %s: %s.\n", msg, cudaGetErrorString(err));
	// The former `throw msg, cudaGetErrorString(err);` parsed as
	// `(throw msg), ...`, so only msg was ever thrown. Spell that out and
	// keep the exception type unchanged for existing handlers.
	throw msg;
}

/*
 * Writes one CIGAR element as "<length><op>" (NUL-terminated) into `cigar`.
 *
 * Returns the number of characters written, not counting the terminator.
 * The caller must provide a buffer large enough for the formatted element.
 */
int printCigarElement(char const op, short const length, char * cigar) {
	return sprintf(cigar, "%d%c", length, op);
}

/*
 * Turns one packed CIGAR array into two text strings: the CIGAR string is
 * written to result.pRef and the MD-style string to result.pQry (both
 * NUL-terminated).
 *
 * Every gpuCigar entry holds the operation in its low 4 bits and the run
 * length in the remaining upper bits. The entry at gpuCigarOffset and the
 * entry at index (corridor_length + read_length) are emitted as 'S'
 * elements when their length is positive and also set result.QStart /
 * result.QEnd; the entries in between are the alignment operations.
 *
 * refSeq supplies the reference bases that are copied into the MD string
 * for mismatches and deletions.
 *
 * An unknown operation code is logged, the raw array is dumped to stdout
 * and the process exits with status 1.
 */
void Alignment::computeCigarMD(Align & result, int const gpuCigarOffset, short const * const gpuCigar, char const * const refSeq) {
	int const slotCount = corridor_length + read_length + 1;

	// Write cursors into the two output buffers.
	char * cigarOut = result.pRef;
	char * mdOut = result.pQry;

	int const leadingClip = gpuCigar[gpuCigarOffset] >> 4;
	if (leadingClip > 0) {
		cigarOut += printCigarElement('S', leadingClip, cigarOut);
		result.QStart = leadingClip;
	}

	int pendingM = 0; // X and EQ runs are merged into a single 'M' element
	int equalRun = 0; // matching bases since the last MD entry
	int refPos = 0;   // read position in refSeq

	for (int slot = gpuCigarOffset + 1; slot < (slotCount - 1); ++slot) {
		short const packed = gpuCigar[slot];
		int const op = packed & 15;
		int const length = packed >> 4;

		switch (op) {
			case CIGAR_X:
				pendingM += length;

				//Produces: 	[0-9]+(([A-Z]+|\^[A-Z]+)[0-9]+)*
				//instead of: 	[0-9]+(([A-Z]|\^[A-Z]+)[0-9]+)*
				mdOut += sprintf(mdOut, "%d", equalRun);
				for (int left = length; left > 0; --left) {
					mdOut += sprintf(mdOut, "%c", refSeq[refPos++]);
				}
				equalRun = 0;
				break;

			case CIGAR_EQ:
				pendingM += length;
				equalRun += length;
				refPos += length;
				break;

			case CIGAR_D:
				// Flush the accumulated match run before the deletion.
				if (pendingM > 0) {
					cigarOut += printCigarElement('M', pendingM, cigarOut);
					pendingM = 0;
				}
				cigarOut += printCigarElement('D', length, cigarOut);

				mdOut += sprintf(mdOut, "%d", equalRun);
				equalRun = 0;
				*mdOut++ = '^';
				for (int left = length; left > 0; --left) {
					*mdOut++ = refSeq[refPos++];
				}
				break;

			case CIGAR_I:
				// Flush the accumulated match run before the insertion.
				if (pendingM > 0) {
					cigarOut += printCigarElement('M', pendingM, cigarOut);
					pendingM = 0;
				}
				cigarOut += printCigarElement('I', length, cigarOut);
				break;

			default:
				Log.Error("Invalid cigar string: %d", op);
				std::cout << "Offset: " << gpuCigarOffset << std::endl;
				for (int x = 0; x < slotCount * 2; ++x) {
					std::cout << gpuCigar[x] << " ";
				}
				std::cout << std::endl;
				exit(1);
		}
	}

	// Close the MD string with the trailing count of matching bases.
	mdOut += sprintf(mdOut, "%d", equalRun);
	if (pendingM > 0) {
		cigarOut += printCigarElement('M', pendingM, cigarOut);
	}

	int const trailingClip = gpuCigar[slotCount - 1] >> 4;
	if (trailingClip > 0) {
		cigarOut += printCigarElement('S', trailingClip, cigarOut);
		result.QEnd = trailingClip;
	}

	*cigarOut = '\0';
	*mdOut = '\0';
}



/*
 * Copies `number` floats from the device buffer results_gpu into the host
 * array `results` using a blocking cudaMemcpy. The return code of the copy
 * is not inspected.
 */
void Alignment::CopyfromDevice(float * results,int number) {
	size_t const byteCount = number * sizeof(float);
	cudaMemcpy(results, results_gpu, byteCount, cudaMemcpyDeviceToHost);
}

/*
 * Copies the alignment results and the position array from the device to
 * the host. The CIGAR buffer is copied when mem_result_cigar is non-zero;
 * otherwise the character alignment buffer is copied. Return codes of the
 * copies are not inspected.
 */
void Alignment::CopyfromDevice_Alignemt() {
	if (mem_result_cigar == 0) {
		cudaMemcpy(results_Alignment_cpu, results_Alignment_gpu, mem_result * sizeof(char), cudaMemcpyDeviceToHost);
	} else {
		cudaMemcpy(results_cigar_cpu, results_cigar_gpu, mem_result_cigar * sizeof(short), cudaMemcpyDeviceToHost);
	}

	// The position array is transferred in both modes.
	cudaMemcpy(pos_cpu, pos_gpu, mem_pos * sizeof(short), cudaMemcpyDeviceToHost);
}



/*
 * Returns the device count reported by cudaGetDeviceCount().
 * checkCUDAError() throws if the query left a CUDA error pending.
 */
int Alignment::cudaDevCount() {
	int deviceTotal = 0;
	cudaGetDeviceCount(&deviceTotal);
	checkCUDAError("No device");
	return deviceTotal;
}



//cpu function:
/*
 * Packs `alignment_number` sequences of `size` bytes each from `sorce` into
 * the contiguous buffer `dest`, one after another. When fewer sequences than
 * batch_size are supplied, the remaining slots of the batch are zero-filled.
 * `dest` must therefore hold batch_size * size bytes.
 */
void Alignment::Prepare(char const * const * const sorce,char * dest, int size, int alignment_number) {

	char * slot = dest;
	for (int idx = 0; idx < alignment_number; ++idx, slot += size) {
		memcpy(slot, sorce[idx], sizeof(char) * size);
	}

	// Not enough sequence pairs for a full batch: clear the unused tail.
	if (batch_size > alignment_number) {
		memset(dest + alignment_number * size, 0, (batch_size - alignment_number) *size * sizeof(char));
	}
}

/*
 * Maps a device id (calc_batchsize passes major * 10 + minor of the device
 * properties) to the threads-per-multiprocessor figure used for sizing.
 *
 * NOTE(review): every id other than 12, 13, 20 and 21 - including anything
 * newer than 2.1 - yields 768; confirm this is intended for current devices.
 */
int Alignment::getThreadPerMulti(int id) {

	switch (id) {
	case 12:
	case 13:
		return 1024;
	case 20:
	case 21:
		return 1536;
	default: // covers ids 10 and 11 as well
		return 768;
	}

}
/*
 * Determines how many alignments fit into one batch on the selected device.
 *
 * mem_per_thread is the number of bytes of device memory required per thread.
 *
 * Sets `threads` to 256 and derives `blocks` from the multiprocessor count
 * and getThreadPerMulti(). A trial allocation of the total memory is then
 * attempted; on failure `blocks` is lowered by one multiprocessor count per
 * step until the allocation succeeds or `blocks` reaches 0.
 *
 * Returns blocks * threads. Terminates the process (exit status 1) when not
 * even the smallest configuration can be allocated.
 */
unsigned int Alignment::calc_batchsize(long mem_per_thread) {

	threads = 256;
	int id = deviceProp->major * 10 + deviceProp->minor;
	blocks = deviceProp->multiProcessorCount * 4 * (getThreadPerMulti(id) / threads);
	long needed_mem = threads * blocks * mem_per_thread;

	// Start from NULL so the cudaFree() below never sees an indeterminate
	// pointer when every allocation attempt fails.
	float * try_gpu = NULL;
	cudaMalloc((void**) &try_gpu, needed_mem);
	// cudaGetLastError() also clears the error so later checks are not
	// tripped by a failed trial allocation.
	cudaError_t err = cudaGetLastError();// check if enough mem

	while (cudaSuccess != err && blocks!=0) { //correct the block size if the mem is not enough
		blocks -= deviceProp->multiProcessorCount;
		needed_mem = threads * blocks * mem_per_thread;
		try_gpu = NULL;
		cudaMalloc((void**) &try_gpu, needed_mem);
		err = cudaGetLastError();
	}

	cudaFree(try_gpu);
	if (blocks == 0) {
		Log.Error("Fatal CUDA Error (Not enough free GPU memory).");
		// Fatal error: report failure to the calling process.
		exit(1);
	}
	return blocks * threads;
}




