///*
// * BS_mapping.cu
// *
// *  Created on: Jul 26, 2011
// *      Author: fritz
// */



#include "BS_mapping.h"

#include "BS_Scores.cu"



/**
 * Allocates the host and device buffers used while in scoring mode.
 *
 * Host: pinned, write-combined staging buffers for reference sequences
 * (scaff_cpu) and reads (reads_cpu).
 * Device: raw reference buffer (scaffold_gpu), the buffer written by the
 * interleave kernel (scaff_dest_gpu), the read buffer, one float per result
 * slot and the per-pair ext bytes.
 *
 * All sizes are taken from the mem_* members, so those must be set before
 * this is called. Errors are reported through checkCUDAError.
 */
void BSCuda::Allocate_Scoring() {
	cudaHostAlloc(&scaff_cpu,(size_t)mem_scaff*sizeof(char),cudaHostAllocWriteCombined);
	cudaHostAlloc(&reads_cpu,(size_t)mem_reads*sizeof(char),cudaHostAllocWriteCombined);
	// Same checkpoint as Allocate_Alginment: report a pinned-memory failure
	// here instead of attributing it to the device allocations below.
	checkCUDAError("after allocate host");

	cudaMalloc((void**) &scaff_dest_gpu, mem_scaff * sizeof(char));
	cudaMalloc((void**) &scaffold_gpu, mem_scaff * sizeof(char));
	cudaMalloc((void**) &reads_gpu, mem_reads * sizeof(char));

	cudaMalloc((void**) &results_gpu, mem_result * sizeof(float));

	cudaMalloc((void**) &ext_gpu, mem_ext * sizeof(char));
	// Nothing is copied in this function, so the label names the allocation step.
	checkCUDAError("after allocate scoring");
}

/**
 * Allocates the host and device buffers used while in alignment
 * (backtracking) mode. Buffer sizes come from the mem_* members, which must
 * already describe the current batch layout.
 */
void BSCuda::Allocate_Alginment() {
	// Byte counts shared by several allocations below.
	const size_t scaffBytes = (size_t)mem_scaff*sizeof(char);
	const size_t readBytes = (size_t)mem_reads*sizeof(char);

	// Pinned, write-combined staging buffers on the host.
	cudaHostAlloc(&scaff_cpu, scaffBytes, cudaHostAllocWriteCombined);
	cudaHostAlloc(&reads_cpu, readBytes, cudaHostAllocWriteCombined);
	checkCUDAError("after allocate host");

	// Pageable host buffers that receive the results copied back from the device.
	pos_cpu = new short[mem_pos];
	results_cigar_cpu = new short[mem_result_cigar];
	results_Alignment_cpu = new char[mem_result];

	// Device: reference buffers (interleave kernel output and raw input).
	cudaMalloc((void**) &scaff_dest_gpu, scaffBytes);
	cudaMalloc((void**) &scaffold_gpu, scaffBytes);

	// Device: reads, backtracking matrix, positions, both result flavours, ext bytes.
	cudaMalloc((void**) &reads_gpu, readBytes);
	cudaMalloc((void**) &matrix_gpu, mem_matrix * sizeof(char));
	cudaMalloc((void**) &pos_gpu, mem_pos * sizeof(short));
	cudaMalloc((void**) &results_Alignment_gpu, mem_result * sizeof(char));
	cudaMalloc((void**) &results_cigar_gpu, mem_result_cigar * sizeof(short));
	cudaMalloc((void**) &ext_gpu, mem_ext * sizeof(char));

	checkCUDAError("after allocate5");
}

/**
 * Releases the buffers of the mode that is currently active (as indicated by
 * `type`): the buffers common to both modes first, then either the scoring
 * result buffer or the alignment-specific device and host buffers.
 */
void BSCuda::FreeMem() {
	// Buffers that exist in both scoring and alignment mode.
	cudaFree(reads_gpu);
	cudaFree(scaffold_gpu);
	cudaFree(scaff_dest_gpu);
	cudaFreeHost(scaff_cpu);
	cudaFreeHost(reads_cpu);
	cudaFree(ext_gpu);
	checkCUDAError("after delete2");

	if (!type) {
		// Alignment mode.
		cudaFree(matrix_gpu);

		// Only the result buffer matching the output format is released here.
		if (cigar) {
			cudaFree(results_cigar_gpu);
		} else {
			cudaFree(results_Alignment_gpu);
		}
		cudaFree(pos_gpu);

		checkCUDAError("after delete1");

		// Host-side result buffers created with new[].
		delete[] results_cigar_cpu;
		delete[] results_Alignment_cpu;
		delete[] pos_cpu;
	} else {
		// Scoring mode.
		cudaFree(results_gpu);
	}

	checkCUDAError("after delete free mem");
}


/**
 * Final teardown of everything this object owns on the GPU and the host:
 * translation texture and its array, the buffers of the currently active
 * mode, both streams, and finally the CUDA context of the calling thread.
 */
void BSCuda::threadExit() {
	// Translation texture and its backing array (created in the constructor).
	cudaUnbindTexture(trans);
	cudaFreeArray(cu_array);
	checkCUDAError("after delete array");

	// Buffers shared by scoring and alignment mode.
	cudaFree(reads_gpu);
	cudaFree(scaffold_gpu);
	cudaFree(scaff_dest_gpu);
	cudaFree(ext_gpu);

	cudaFreeHost(scaff_cpu);
	cudaFreeHost(reads_cpu);
	checkCUDAError("after delete host");

	if (type) {
		cudaFree(results_gpu);
	} else {
		cudaFree(matrix_gpu);
		// SetForBacktracking gives the CIGAR buffer a non-zero size exactly when
		// CIGAR output is active, so that is the buffer to release in this case.
		// (The branches used to be swapped, which leaked the buffer in use.)
		if(mem_result_cigar!=0){
			cudaFree(results_cigar_gpu);
		}else{
			cudaFree(results_Alignment_gpu);
		}
		cudaFree(pos_gpu);

		// Host-side result buffers from Allocate_Alginment; FreeMem releases
		// them on a mode switch, so they must be released here as well.
		// Pointers are cleared so a repeated call cannot double-delete.
		delete[] results_cigar_cpu;
		results_cigar_cpu = 0;
		delete[] results_Alignment_cpu;
		results_Alignment_cpu = 0;
		delete[] pos_cpu;
		pos_cpu = 0;
	}
	cudaStreamDestroy(stream[0]);
	cudaStreamDestroy(stream[1]);
	checkCUDAError("after delete end");
	cudaThreadExit();
}



/**
 * Sets up the bisulfite aligner: uploads both 6x6 score matrices, the
 * ASCII-to-code translation texture, the gap penalties and the length
 * constants to the device, derives the batch sizes for scoring and for
 * alignment, allocates the scoring-mode buffers and creates the two streams
 * used by the Calc* methods.
 *
 * @param gpu_id forwarded unchanged to the Alignment base class.
 */
BSCuda::BSCuda(int gpu_id):Alignment(gpu_id){
	cout<<"BS align"<<endl;
	// Scoring parameters are read as floats and truncated to short.
	short mat= (short)Config.GetFloat("score_match");
	short mis= (short)Config.GetFloat("score_mismatch");
	short gap_rea = (short)Config.GetFloat("score_gap_read");
	short gap_ref = (short)Config.GetFloat("score_gap_ref");
	// Dedicated config keys are disabled; the generic match/mismatch values are used.
	short mat_TT=mat;//(short)Config.GetFloat("score_match_TT");
	short mat_TC=mis;//(short)Config.GetFloat("score_mismatch_TC");

	// 6x6 score matrix uploaded to `scoresTC`. Indices presumably follow the
	// codes of asci_trans below (0=A, 1=C, 2=G, 3=T, 4=other, 5=N) — TODO confirm
	// which axis is read and which is reference.
	short scores_cpuTC[6*6] = {
			mat,mis,mis,mis,0,mis,
			mis,mat,mis,mis,0,mis,
			mis,mis,mat,mis,0,mis,
			mis,mat_TC,mis,mat_TT,0,mis,
			0,0,0,0,0,0,
			0  ,  0,  0,  0,  0,mat
	};
	cudaMemcpyToSymbol(scoresTC,scores_cpuTC,6 * 6 *sizeof(short));



	// Same two variables reused for the second matrix (again generic values).
	mat_TC=mis;//(short)Config.GetFloat("score_mismatch_AG");
	mat_TT=mat;//(short)Config.GetFloat("score_match_AA");

	// 6x6 score matrix uploaded to `scoresAG`.
	// NOTE(review): row 3 still carries the mat_TC/mat_TT entries of the first
	// matrix; with the current values this makes no numeric difference.
	short scores_cpuAG[6*6] = {
			mat_TT,mis,mis,mis,0,mis,
			mis,mat,mis,mis,0,mis,
			mat_TC,mis,mat,mis,0,mis,
			mis,mat_TC,mis,mat_TT,0,mis,
			0,0,0,0,0,0,
			0  ,  0,  0,  0,  0,mat
	};

	cudaMemcpyToSymbol(scoresAG,scores_cpuAG,6 * 6 *sizeof(short));



	// 256x1 char array holding the translation table, bound to texture `trans`.
	cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<char>();

	cudaMallocArray( &cu_array, &channelDesc, 256, 1 );
	// Byte value -> code: 'A'/'a' = 0, 'C'/'c' = 1, 'G'/'g' = 2, 'T'/'t' = 3,
	// 'N'/'n' = 5, every other byte = 4.
	const char asci_trans[256] = {
			4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
			4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
			4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
			4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
			4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 5, 4,
			4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
			4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 5, 4,
			4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
			4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
			4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
			4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
			4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
			4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
			4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
			4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
			4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
		};

	cudaMemcpyToArray( cu_array, 0, 0, asci_trans, 256*sizeof(char), cudaMemcpyHostToDevice);
	cudaBindTextureToArray( trans, cu_array, channelDesc);

	mem_ext=1;
	mem_pos = result_number;

	// Per-pair memory estimate for alignment mode: backtracking matrix, read,
	// two reference copies, position record, two alignment rows and one ext byte.
	long mem_per_thread = (sizeof(char) * (corridor_length + 1) * (read_length + 1)) +(read_length * sizeof(char)) +( 2*ref_length * sizeof(char)) + (result_number * sizeof(short))+ (alignment_length  * 2 * sizeof(short))+sizeof(char);

	align_batch_size = calc_batchsize(mem_per_thread);


	// `blocks` is read right after calc_batchsize; presumably that call sets it — TODO confirm.
	block_size_align=blocks;

	// Per-pair memory estimate for scoring mode: read, one float result,
	// two reference copies and one ext byte.
	mem_result=1;
	mem_per_thread = read_length * sizeof(char) + mem_result* sizeof(float) + 2*ref_length * sizeof(char)+sizeof(char);
	score_batch_size=calc_batchsize(mem_per_thread);
	block_size_score=blocks;
	// Start in scoring mode; SetForScoreing then allocates without freeing first.
	type = true;
	step_count = Config.GetInt("step_count");
	if (step_count == 0) {
		step_count = 1;
	}
	SetForScoreing();

	// Gap penalties and length constants for the kernels.
	cudaMemcpyToSymbol(gapread,&gap_rea,sizeof(short));
	cudaMemcpyToSymbol(gapref,&gap_ref,sizeof(short));


	// NOTE(review): each copy transfers sizeof(short) bytes; this assumes the
	// source members are themselves short — confirm against their declarations.
	cudaMemcpyToSymbol(corr_len,&corridor_length,sizeof(short));
	cudaMemcpyToSymbol(read_len,&read_length,sizeof(short));
	cudaMemcpyToSymbol(ref_len,&ref_length,sizeof(short));

	cudaMemcpyToSymbol(alignment_length_gpu,&alignment_length,sizeof(short));

	checkCUDAError("init");
	// stream[0] carries reference upload + interleaving, stream[1] carries
	// reads, ext data and the scoring/backtracking kernels (see the Calc* methods).
	cudaStreamCreate(&stream[0]);
	cudaStreamCreate(&stream[1]);

}

/** Destructor: delegates the complete teardown to threadExit(). */
BSCuda::~BSCuda() {
	this->threadExit();
}

//SETTER:

/**
 * Resets the scoring-mode device buffers: ext bytes and results are zeroed,
 * every sequence buffer is filled with the byte value 5.
 */
void BSCuda::Set_Meme_Scoring() {
	const int fillByte = '\5';   // fill value for all sequence buffers
	const size_t scaffBytes = mem_scaff * sizeof(char);

	cudaMemset(ext_gpu, 0, mem_ext * sizeof(char));
	cudaMemset(scaff_dest_gpu, fillByte, scaffBytes);
	cudaMemset(results_gpu, 0, mem_result * sizeof(float));
	cudaMemset(scaffold_gpu, fillByte, scaffBytes);
	cudaMemset(reads_gpu, fillByte, mem_reads * sizeof(char));

	checkCUDAError("set mem");
}
/**
 * Resets the alignment-mode buffers to their initial contents: sequence
 * buffers get the byte value 5, result buffers are cleared, the host-side
 * alignment text is blanked and the backtracking matrix is filled with
 * CIGAR_STOP.
 */
void BSCuda::Set_Meme_Alginment() {
	const int fillByte = '\5';   // fill value for all sequence buffers
	const size_t scaffBytes = mem_scaff * sizeof(char);
	const size_t alignBytes = mem_result * sizeof(char);

	cudaMemset(ext_gpu, 0, mem_ext * sizeof(char));

	// Host copy of the textual alignment starts out as spaces.
	memset(results_Alignment_cpu, ' ', alignBytes);

	cudaMemset(scaff_dest_gpu, fillByte, scaffBytes);
	cudaMemset(scaffold_gpu, fillByte, scaffBytes);
	cudaMemset(reads_gpu, fillByte, mem_reads * sizeof(char));

	cudaMemset(results_cigar_gpu, 0, mem_result_cigar * sizeof(short));
	cudaMemset(results_Alignment_gpu, '\0', alignBytes);
	// cudaMemset works per byte, so every byte of each short becomes 9.
	cudaMemset(pos_gpu, 9, mem_pos * sizeof(short));
	cudaMemset(matrix_gpu, CIGAR_STOP, mem_matrix * sizeof(char));
}


/**
 * Switches the object into scoring mode: releases alignment-mode buffers if
 * that mode was active, recomputes the mem_* sizes for the scoring batch
 * size, then allocates and initialises the scoring buffers.
 */
void BSCuda::SetForScoreing() {
	// Coming from alignment mode: drop its buffers before re-allocating.
	const bool comingFromAlignment = !type;
	if (comingFromAlignment) {
		FreeMem();
	}
	type = true;

	// Launch geometry and batch size precomputed in the constructor.
	batch_size = score_batch_size;
	blocks = block_size_score;

	// Element counts for the buffers of one batch.
	mem_reads = read_length*batch_size;
	mem_scaff = ref_length*batch_size;
	mem_result = batch_size;
	mem_ext = batch_size;
	mem_matrix = 0;   // no backtracking matrix while scoring

	// Dynamic shared memory per block: one short per corridor cell and thread.
	shared_mem = threads*(corridor_length) * sizeof(short);

	Allocate_Scoring();
	Set_Meme_Scoring();
	checkCUDAError("init");
}

/**
 * Switches the object into alignment (backtracking) mode: releases
 * scoring-mode buffers if that mode was active, recomputes the mem_* sizes
 * for the alignment batch size, then allocates and initialises the
 * alignment buffers.
 */
void BSCuda::SetForBacktracking() {
	// Coming from scoring mode: drop its buffers before re-allocating.
	const bool comingFromScoring = type;
	if (comingFromScoring) {
		FreeMem();
	}
	type = false;

	// Per-pair memory estimate: matrix, read, two reference copies,
	// position record and two alignment rows.
	long mem_per_thread = (sizeof(char) * (corridor_length + 1) * (read_length + 1))
			+ (read_length * sizeof(char))
			+ ( 2*ref_length * sizeof(char))
			+ (result_number * sizeof(short))
			+ (alignment_length  * 2 * sizeof(short));
	// The returned value is replaced right below; the call is kept because
	// calc_batchsize may update other members.
	batch_size = calc_batchsize(mem_per_thread);

	// Values precomputed in the constructor take precedence.
	batch_size = align_batch_size;
	blocks = block_size_align;

	// Element counts for the buffers of one batch.
	mem_reads = read_length*batch_size;
	mem_scaff = ref_length*batch_size;
	mem_ext = batch_size;
	mem_pos = result_number * batch_size;

	// Exactly one of the two result buffers gets a non-zero size.
	if (!cigar) {
		mem_result = alignment_length * 2* batch_size;
		mem_result_cigar = 0;
	} else {
		mem_result_cigar = alignment_length  * 2* batch_size;
		mem_result = 0;
	}

	mem_matrix = (corridor_length + 1) * (read_length+ 1)*batch_size;

	// Dynamic shared memory per block: one short per corridor cell and thread.
	shared_mem = threads*(corridor_length) * sizeof(short);

	Allocate_Alginment();
	Set_Meme_Alginment();
	checkCUDAError("init");
}

/** Returns the scoring batch size multiplied by the configured step count. */
int BSCuda::GetScoreBatchSize() {
	const int pairsPerCall = score_batch_size*step_count;
	return pairsPerCall;
}
/** Returns the alignment batch size multiplied by the configured step count. */
int BSCuda::GetAlignBatchSize() {
	const int pairsPerCall = align_batch_size*step_count;
	return pairsPerCall;
}

/**
 * Scores batchSize read/reference pairs with the SW_BS_Cuda kernel, working
 * through the input in chunks of at most batch_size pairs.
 *
 * Two streams are used: stream[0] uploads and interleaves the reference
 * chunk, stream[1] uploads reads and ext bytes and runs the scoring kernel.
 * The reference data of the next chunk is staged while the current chunk is
 * being scored.
 *
 * @param batchSize  number of pairs in refSeqList / qrySeqList
 * @param refSeqList reference sequences, one per pair
 * @param qrySeqList read sequences, one per pair
 * @param results    receives one score per pair (filled via CopyfromDevice)
 * @param extData    one byte per pair, uploaded to ext_gpu for the kernel
 * @return batchSize
 */
int BSCuda::CalcScores_SW(int const batchSize, char const * const * const refSeqList, char const * const * const qrySeqList, float * const results,char * extData) {

	dim3 dimBlock(threads, 1);
	dim3 dimGrid(blocks, 1);

	// `batch` counts the pairs that still have to be processed.
	int batch =batchSize;
	// Stage and interleave the references of the first chunk on stream[0].
	Prepare(refSeqList,scaff_cpu,ref_length, min(batch,batch_size));

	cudaMemcpyAsync(scaffold_gpu, scaff_cpu, mem_scaff * sizeof(char), cudaMemcpyHostToDevice,stream[0]);
	interleaveSeq_BS<<<dimGrid,dimBlock,0,stream[0]>>>(scaffold_gpu,scaff_dest_gpu);

	for (int i = 0; i<batchSize; i += batch_size) {
		// Reads and ext bytes of the current chunk go through stream[1].
		Prepare(&qrySeqList[i],reads_cpu,read_length, min(batch,batch_size));
		cudaMemcpyAsync(reads_gpu, reads_cpu, mem_reads * sizeof(char), cudaMemcpyHostToDevice,stream[1]);

		cudaMemcpyAsync(ext_gpu,(char*) &extData[i], min(batch_size,batch) * sizeof(char), cudaMemcpyHostToDevice,stream[1]);

		// The interleaved references must be complete before scoring starts.
		cudaStreamSynchronize(stream[0]);

		SW_BS_Cuda<<<dimGrid,dimBlock,shared_mem,stream[1]>>>(scaff_dest_gpu,reads_gpu,results_gpu,ext_gpu);

		batch-=batch_size;
		if (batch>0){
			// Upload the raw references of the next chunk while the kernel runs;
			// only scaffold_gpu is written, scaff_dest_gpu stays untouched.
			Prepare(&refSeqList[i+batch_size],scaff_cpu,ref_length, min(batch,batch_size));
			cudaMemcpyAsync(scaffold_gpu, scaff_cpu, mem_scaff * sizeof(char), cudaMemcpyHostToDevice,stream[0]);
		}

		// Wait for the scores; batch was already decremented, hence batch+batch_size.
		cudaStreamSynchronize(stream[1]);
		CopyfromDevice(&results[i], min(batch+batch_size,batch_size));
		if (batch>0){
			// The kernel is done with scaff_dest_gpu, so it may be overwritten now.
			interleaveSeq_BS<<<dimGrid,dimBlock,0,stream[0]>>>(scaffold_gpu,scaff_dest_gpu);
		}
	}
	checkCUDAError("after calc SW score");

	return batchSize;

}

/**
 * Scores batchSize read/reference pairs with the NW_BS_Cuda kernel, working
 * through the input in chunks of at most batch_size pairs.
 *
 * Two streams are used: stream[0] uploads and interleaves the reference
 * chunk, stream[1] uploads reads and ext bytes and runs the scoring kernel.
 * The reference data of the next chunk is staged while the current chunk is
 * being scored.
 *
 * @param batchSize  number of pairs in refSeqList / qrySeqList
 * @param refSeqList reference sequences, one per pair
 * @param qrySeqList read sequences, one per pair
 * @param results    receives one score per pair (filled via CopyfromDevice)
 * @param extData    one byte per pair, uploaded to ext_gpu for the kernel
 * @return batchSize
 */
int BSCuda::CalcScores_NW(int const batchSize, char const * const * const refSeqList, char const * const * const qrySeqList, float * const results,char * extData) {

	dim3 dimBlock(threads, 1);
	dim3 dimGrid(blocks, 1);

	// `batch` counts the pairs that still have to be processed.
	int batch =batchSize;
	// Stage and interleave the references of the first chunk on stream[0].
	Prepare(refSeqList,scaff_cpu,ref_length, min(batch,batch_size));

	cudaMemcpyAsync(scaffold_gpu, scaff_cpu, mem_scaff * sizeof(char), cudaMemcpyHostToDevice,stream[0]);
	interleaveSeq_BS<<<dimGrid,dimBlock,0,stream[0]>>>(scaffold_gpu,scaff_dest_gpu);

	for (int i = 0; i<batchSize; i += batch_size) {
		// Reads and ext bytes of the current chunk go through stream[1].
		Prepare(&qrySeqList[i],reads_cpu,read_length, min(batch,batch_size));
		cudaMemcpyAsync(reads_gpu, reads_cpu, mem_reads * sizeof(char), cudaMemcpyHostToDevice,stream[1]);

		cudaMemcpyAsync(ext_gpu, (char*) &extData[i], min(batch_size,batch) * sizeof(char), cudaMemcpyHostToDevice,stream[1]);
		// The interleaved references must be complete before scoring starts.
		cudaStreamSynchronize(stream[0]);

		NW_BS_Cuda<<<dimGrid,dimBlock,shared_mem,stream[1]>>>(scaff_dest_gpu,reads_gpu,results_gpu,ext_gpu);

		batch-=batch_size;
		if (batch>0){
			// Upload the raw references of the next chunk while the kernel runs;
			// only scaffold_gpu is written, scaff_dest_gpu stays untouched.
			Prepare(&refSeqList[i+batch_size],scaff_cpu,ref_length, min(batch,batch_size));
			cudaMemcpyAsync(scaffold_gpu, scaff_cpu, mem_scaff * sizeof(char), cudaMemcpyHostToDevice,stream[0]);

		}
		// Wait for the scores; batch was already decremented, hence batch+batch_size.
		cudaStreamSynchronize(stream[1]);
		CopyfromDevice(&results[i], min(batch+batch_size,batch_size));
		if(batch>0){
			// The kernel is done with scaff_dest_gpu, so it may be overwritten now.
			interleaveSeq_BS<<<dimGrid,dimBlock,0,stream[0]>>>(scaffold_gpu,scaff_dest_gpu);
		}
	}
	checkCUDAError("after calc NW score");
	return batchSize;

}

/**
 * Computes textual alignments for batchSize pairs: CUDASW_BS_Score fills the
 * backtracking matrix, CUDA_BS_SW_Backtracking produces the two alignment
 * rows, and the host then copies them into results[].pRef / pQry and derives
 * the identity.
 *
 * Input is processed in chunks of at most batch_size pairs. stream[0]
 * uploads and interleaves references, stream[1] carries reads, ext bytes and
 * both kernels; the references of the next chunk are staged while the
 * current chunk is processed.
 *
 * @param batchSize  number of pairs
 * @param refSeqList reference sequences, one per pair
 * @param qrySeqList read sequences, one per pair
 * @param results    one Align per pair; pRef and pQry must point to buffers
 *                   large enough for one alignment row
 * @param extData    one byte per pair, uploaded to ext_gpu for the kernel
 * @return batchSize
 */
int BSCuda::Calc_Alignment_SW(int const batchSize, char const * const * const refSeqList, char const * const * const qrySeqList, Align * results,char * extData) {

	dim3 dimBlock(threads, 1);
	dim3 dimGrid(blocks, 1);
	// `batch` counts the pairs that still have to be processed.
	int batch =batchSize;

	// Stage and interleave the references of the first chunk on stream[0].
	Prepare(refSeqList,scaff_cpu,ref_length, min(batch,batch_size));
	cudaMemcpyAsync(scaffold_gpu, scaff_cpu, mem_scaff * sizeof(char), cudaMemcpyHostToDevice,stream[0]);
	interleaveSeq_BS<<<dimGrid,dimBlock,0,stream[0]>>>(scaffold_gpu,scaff_dest_gpu);

	for (int i = 0; i<batchSize; i += batch_size) {
		Prepare(&qrySeqList[i],reads_cpu,read_length, min(batch,batch_size));
		cudaMemcpyAsync(reads_gpu, reads_cpu, mem_reads * sizeof(char), cudaMemcpyHostToDevice,stream[1]);
		cudaMemcpyAsync(ext_gpu, (char*)&extData[i], min(batch_size,batch) * sizeof(char), cudaMemcpyHostToDevice,stream[1]);
		// The interleaved references must be complete before scoring starts.
		cudaStreamSynchronize(stream[0]);
		CUDASW_BS_Score<<<dimGrid,dimBlock,shared_mem,stream[1]>>>(scaff_dest_gpu,reads_gpu, pos_gpu,matrix_gpu,ext_gpu);

		if (batch-batch_size>0){
			// Upload the raw references of the next chunk; only scaffold_gpu is written.
			Prepare(&refSeqList[i+batch_size],scaff_cpu,ref_length, min(batch-batch_size,batch_size));
			cudaMemcpyAsync(scaffold_gpu, scaff_cpu, mem_scaff * sizeof(char), cudaMemcpyHostToDevice,stream[0]);
		}
		cudaStreamSynchronize(stream[1]);
		CUDA_BS_SW_Backtracking<<<dimGrid,dimBlock,2*threads*sizeof(short),stream[1]>>>(scaff_dest_gpu, reads_gpu,pos_gpu,matrix_gpu,results_Alignment_gpu);

		cudaStreamSynchronize(stream[1]);
		CopyfromDevice_Alignemt();
		if(batch-batch_size>0){
			// Both kernels are done with scaff_dest_gpu, so it may be overwritten now.
			interleaveSeq_BS<<<dimGrid,dimBlock,0,stream[0]>>>(scaffold_gpu,scaff_dest_gpu);
		}


		// Host post-processing of the chunk that was just copied back.
		for (int j = 0; j < min(batch,batch_size); ++j) {
			float total = 0.0f;
			float match = 0.0f;
			char * read = results[i+j].pQry;
			char * ref = results[i+j].pRef;
			// Each pair owns 2 * alignment_length chars: reference row, then read row.
			char * tempAlign = results_Alignment_cpu + j * alignment_length * 2;

			int index = 0;
			// Copy from the column after the index stored in pos slot 3 up to the row end.
			for (int t = pos_cpu[result_number * j + 3] + 1; t < alignment_length; ++t) {
				ref[index] = tempAlign[t];
				read[index] = tempAlign[t + alignment_length];

				if (read[index] != ' ' && read[index] != '-') {
					total++;
				}
				if (read[index] != ' ' && read[index] == ref[index]) {
					match++;
				}
				index += 1;
			}
			results[i+j].PositionOffset = pos_cpu[result_number * j];
			results[i+j].QStart = pos_cpu[result_number * j + 1];
			results[i+j].QEnd = pos_cpu[result_number * j + 2];
			// An alignment without any read base would give 0/0 = NaN; report 0 instead.
			results[i+j].Identity = (total > 0.0f) ? (match / total) : 0.0f;

		}
		batch-=batch_size;
	}

	checkCUDAError("ende align");

	return batchSize;
}


/**
 * Computes CIGAR-style alignments for batchSize pairs: the score kernel
 * fills the backtracking matrix, CUDA_BS_SW_Backtracking_CIGAR produces the
 * raw CIGAR data, and computeCigarMD converts it per pair on the host.
 *
 * Input is processed in chunks of at most batch_size pairs. stream[0]
 * uploads and interleaves references, stream[1] carries reads, ext bytes and
 * both kernels.
 *
 * NOTE(review): this SW variant launches CUDANW_BS_Score, whereas
 * Calc_Alignment_SW launches CUDASW_BS_Score. The body is otherwise
 * identical to Calc_Alignment_NW_cigar — confirm whether the NW score
 * kernel is intended here.
 *
 * @param batchSize  number of pairs
 * @param refSeqList reference sequences, one per pair
 * @param qrySeqList read sequences, one per pair
 * @param results    one Align per pair, filled by computeCigarMD
 * @param extData    one byte per pair, uploaded to ext_gpu for the kernel
 * @return batchSize
 */
int BSCuda::Calc_Alignment_SW_cigar(int const batchSize, char const * const * const refSeqList, char const * const * const qrySeqList, Align * results,char * extData) {

	dim3 dimBlock(threads, 1);
	dim3 dimGrid(blocks, 1);
	// `batch` counts the pairs that still have to be processed.
	int batch =batchSize;


	// Stage and interleave the references of the first chunk on stream[0].
	Prepare(refSeqList,scaff_cpu,ref_length, min(batch,batch_size));
	cudaMemcpyAsync(scaffold_gpu, scaff_cpu, mem_scaff * sizeof(char), cudaMemcpyHostToDevice,stream[0]);
	interleaveSeq_BS<<<dimGrid,dimBlock,0,stream[0]>>>(scaffold_gpu,scaff_dest_gpu);

	for (int i = 0; i<batchSize; i += batch_size) {
		Prepare(&qrySeqList[i],reads_cpu,read_length, min(batch,batch_size));
		cudaMemcpyAsync(reads_gpu, reads_cpu, mem_reads * sizeof(char), cudaMemcpyHostToDevice,stream[1]);
		cudaMemcpyAsync(ext_gpu,(char*) &extData[i], min(batch_size,batch) * sizeof(char), cudaMemcpyHostToDevice,stream[1]);

		// The interleaved references must be complete before scoring starts.
		cudaStreamSynchronize(stream[0]);
		CUDANW_BS_Score<<<dimGrid,dimBlock,shared_mem,stream[1]>>>(scaff_dest_gpu,reads_gpu, pos_gpu,matrix_gpu,ext_gpu);

		if (batch-batch_size>0){
			// Upload the raw references of the next chunk; only scaffold_gpu is written.
			Prepare(&refSeqList[i+batch_size],scaff_cpu,ref_length, min(batch-batch_size,batch_size));
			cudaMemcpyAsync(scaffold_gpu, scaff_cpu, mem_scaff * sizeof(char), cudaMemcpyHostToDevice,stream[0]);
		}

		// Same stream as the score kernel, so it runs after it without an explicit sync.
		CUDA_BS_SW_Backtracking_CIGAR<<<dimGrid,dimBlock,2*threads*sizeof(short),stream[1]>>>(scaff_dest_gpu, reads_gpu,pos_gpu,matrix_gpu,results_cigar_gpu);

		cudaStreamSynchronize(stream[1]);
		cudaMemcpy(results_cigar_cpu, results_cigar_gpu, mem_result_cigar * sizeof(short), cudaMemcpyDeviceToHost);
		cudaMemcpy(pos_cpu, pos_gpu, mem_pos * sizeof(short), cudaMemcpyDeviceToHost);

		if(batch-batch_size>0){
			// Both kernels are done with scaff_dest_gpu, so it may be overwritten now.
			interleaveSeq_BS<<<dimGrid,dimBlock,0,stream[0]>>>(scaffold_gpu,scaff_dest_gpu);
		}

		// Host post-processing of the chunk that was just copied back.
		for (int j = 0; j < min(batch_size,batch); ++j) {
			// Each pair owns 2 * alignment_length shorts of raw CIGAR data.
			short * gpuCigar = results_cigar_cpu + j * alignment_length * 2;
			// The reference pointer is advanced by the position stored in pos slot 0.
			computeCigarMD(results[i+j], pos_cpu[result_number * j + 3], gpuCigar, refSeqList[i+j] + pos_cpu[result_number * j]);
			results[i+j].PositionOffset = pos_cpu[result_number * j];
		}
		batch-=batch_size;
	}

	checkCUDAError("after calc SW cigar");

	return batchSize;
}

/**
 * Computes textual alignments for batchSize pairs: CUDANW_BS_Score fills the
 * backtracking matrix, CUDA_BS_SW_Backtracking produces the two alignment
 * rows, and the host then copies them into results[].pRef / pQry and derives
 * the identity.
 *
 * Input is processed in chunks of at most batch_size pairs. stream[0]
 * uploads and interleaves references, stream[1] carries reads, ext bytes and
 * both kernels; the references of the next chunk are staged while the
 * current chunk is processed.
 *
 * @param batchSize  number of pairs
 * @param refSeqList reference sequences, one per pair
 * @param qrySeqList read sequences, one per pair
 * @param results    one Align per pair; pRef and pQry must point to buffers
 *                   large enough for one alignment row
 * @param extData    one byte per pair, uploaded to ext_gpu for the kernel
 * @return batchSize
 */
int BSCuda::Calc_Alignment_NW(int const batchSize, char const * const * const refSeqList, char const * const * const qrySeqList, Align * results,char * extData) {

	cout<<"CUDA NW"<<endl;
	dim3 dimBlock(threads, 1);
	dim3 dimGrid(blocks, 1);
	// `batch` counts the pairs that still have to be processed.
	int batch =batchSize;

	// Stage and interleave the references of the first chunk on stream[0].
	Prepare(refSeqList,scaff_cpu,ref_length, min(batch,batch_size));
	cudaMemcpyAsync(scaffold_gpu, scaff_cpu, mem_scaff * sizeof(char), cudaMemcpyHostToDevice,stream[0]);
	interleaveSeq_BS<<<dimGrid,dimBlock,0,stream[0]>>>(scaffold_gpu,scaff_dest_gpu);
	for (int i = 0; i<batchSize; i += batch_size) {
		Prepare(&qrySeqList[i],reads_cpu,read_length, min(batch,batch_size));
		cudaMemcpyAsync(reads_gpu, reads_cpu, mem_reads * sizeof(char), cudaMemcpyHostToDevice,stream[1]);
		cudaMemcpyAsync(ext_gpu, (char *)&extData[i], min(batch_size,batch) * sizeof(char), cudaMemcpyHostToDevice,stream[1]);

		// The interleaved references must be complete before scoring starts.
		cudaStreamSynchronize(stream[0]);
		CUDANW_BS_Score<<<dimGrid,dimBlock,shared_mem,stream[1]>>>(scaff_dest_gpu,reads_gpu, pos_gpu,matrix_gpu,ext_gpu);

		if (batch-batch_size>0){
			// Upload the raw references of the next chunk; only scaffold_gpu is written.
			Prepare(&refSeqList[i+batch_size],scaff_cpu,ref_length, min(batch-batch_size,batch_size));
			cudaMemcpyAsync(scaffold_gpu, scaff_cpu, mem_scaff * sizeof(char), cudaMemcpyHostToDevice,stream[0]);
		}
		cudaStreamSynchronize(stream[1]);
		CUDA_BS_SW_Backtracking<<<dimGrid,dimBlock,2*threads*sizeof(short),stream[1]>>>(scaff_dest_gpu, reads_gpu,pos_gpu,matrix_gpu,results_Alignment_gpu);

		cudaStreamSynchronize(stream[1]);
		CopyfromDevice_Alignemt();
		if(batch-batch_size>0){
			// Both kernels are done with scaff_dest_gpu, so it may be overwritten now.
			interleaveSeq_BS<<<dimGrid,dimBlock,0,stream[0]>>>(scaffold_gpu,scaff_dest_gpu);
		}


		// Host post-processing of the chunk that was just copied back.
		for (int j = 0; j < min(batch,batch_size); ++j) {
			float total = 0.0f;
			float match = 0.0f;
			char * read = results[i+j].pQry;
			char * ref = results[i+j].pRef;
			// Each pair owns 2 * alignment_length chars: reference row, then read row.
			char * tempAlign = results_Alignment_cpu + j * alignment_length * 2;

			int index = 0;
			// Copy from the column after the index stored in pos slot 3 up to the row end.
			for (int t = pos_cpu[result_number * j + 3] + 1; t < alignment_length; ++t) {
				ref[index] = tempAlign[t];
				read[index] = tempAlign[t + alignment_length];

				if (read[index] != ' ' && read[index] != '-') {
					total++;
				}
				if (read[index] != ' ' && read[index] == ref[index]) {
					match++;
				}
				index += 1;
			}
			results[i+j].PositionOffset = pos_cpu[result_number * j];
			results[i+j].QStart = pos_cpu[result_number * j + 1];
			results[i+j].QEnd = pos_cpu[result_number * j + 2];
			// An alignment without any read base would give 0/0 = NaN; report 0 instead.
			results[i+j].Identity = (total > 0.0f) ? (match / total) : 0.0f;

		}
		batch-=batch_size;
	}

	checkCUDAError("ende align");

	return batchSize;
}

/**
 * Computes CIGAR-style alignments for batchSize pairs: CUDANW_BS_Score fills
 * the backtracking matrix, CUDA_BS_SW_Backtracking_CIGAR produces the raw
 * CIGAR data, and computeCigarMD converts it per pair on the host.
 *
 * Input is processed in chunks of at most batch_size pairs. stream[0]
 * uploads and interleaves references, stream[1] carries reads, ext bytes and
 * both kernels.
 *
 * @param batchSize  number of pairs
 * @param refSeqList reference sequences, one per pair
 * @param qrySeqList read sequences, one per pair
 * @param results    one Align per pair, filled by computeCigarMD
 * @param extData    one byte per pair, uploaded to ext_gpu for the kernel
 * @return batchSize
 */
int BSCuda::Calc_Alignment_NW_cigar(int const batchSize, char const * const * const refSeqList, char const * const * const qrySeqList, Align * results,char * extData) {

	dim3 dimBlock(threads, 1);
	dim3 dimGrid(blocks, 1);
	// `batch` counts the pairs that still have to be processed.
	int batch =batchSize;

	// Stage and interleave the references of the first chunk on stream[0].
	Prepare(refSeqList,scaff_cpu,ref_length, min(batch,batch_size));
	cudaMemcpyAsync(scaffold_gpu, scaff_cpu, mem_scaff * sizeof(char), cudaMemcpyHostToDevice,stream[0]);
	interleaveSeq_BS<<<dimGrid,dimBlock,0,stream[0]>>>(scaffold_gpu,scaff_dest_gpu);

	for (int i = 0; i<batchSize; i += batch_size) {
		Prepare(&qrySeqList[i],reads_cpu,read_length, min(batch,batch_size));
		cudaMemcpyAsync(reads_gpu, reads_cpu, mem_reads * sizeof(char), cudaMemcpyHostToDevice,stream[1]);
		cudaMemcpyAsync(ext_gpu, &extData[i], min(batch_size,batch) * sizeof(char), cudaMemcpyHostToDevice,stream[1]);

		// The interleaved references must be complete before scoring starts.
		cudaStreamSynchronize(stream[0]);
		CUDANW_BS_Score<<<dimGrid,dimBlock,shared_mem,stream[1]>>>(scaff_dest_gpu,reads_gpu, pos_gpu,matrix_gpu,ext_gpu);

		if (batch-batch_size>0){
			// Upload the raw references of the next chunk; only scaffold_gpu is written.
			Prepare(&refSeqList[i+batch_size],scaff_cpu,ref_length, min(batch-batch_size,batch_size));
			cudaMemcpyAsync(scaffold_gpu, scaff_cpu, mem_scaff * sizeof(char), cudaMemcpyHostToDevice,stream[0]);
		}

		// Same stream as the score kernel, so it runs after it without an explicit sync.
		CUDA_BS_SW_Backtracking_CIGAR<<<dimGrid,dimBlock,2*threads*sizeof(short),stream[1]>>>(scaff_dest_gpu, reads_gpu,pos_gpu,matrix_gpu,results_cigar_gpu);

		cudaStreamSynchronize(stream[1]);
		cudaMemcpy(results_cigar_cpu, results_cigar_gpu, mem_result_cigar * sizeof(short), cudaMemcpyDeviceToHost);
		cudaMemcpy(pos_cpu, pos_gpu, mem_pos * sizeof(short), cudaMemcpyDeviceToHost);

		if(batch-batch_size>0){
			// Both kernels are done with scaff_dest_gpu, so it may be overwritten now.
			interleaveSeq_BS<<<dimGrid,dimBlock,0,stream[0]>>>(scaffold_gpu,scaff_dest_gpu);
		}

		// Host post-processing of the chunk that was just copied back.
		for (int j = 0; j < min(batch_size,batch); ++j) {
			// Each pair owns 2 * alignment_length shorts of raw CIGAR data.
			short * gpuCigar = results_cigar_cpu + j * alignment_length * 2;
			// The reference pointer is advanced by the position stored in pos slot 0.
			computeCigarMD(results[i+j], pos_cpu[result_number * j + 3], gpuCigar, refSeqList[i+j] + pos_cpu[result_number * j]);
			results[i+j].PositionOffset = pos_cpu[result_number * j];
		}
		batch-=batch_size;
	}

	checkCUDAError("after calc NW cigar");

	return batchSize;
}

/**
 * Entry point for alignment: switches to alignment mode if necessary and
 * dispatches to the SW (mode 0) or NW (mode 1) implementation, in its CIGAR
 * or textual flavour depending on `cigar`.
 *
 * @param mode       only the low byte is evaluated: 0 = SW, 1 = NW
 * @param batchSize  number of pairs; values <= 0 log a warning and return 0
 * @param refSeqList reference sequences, one per pair
 * @param qrySeqList read sequences, one per pair
 * @param results    one Align per pair
 * @param extData    per-pair ext bytes; passed on by every path except
 *                   textual SW, which uses an all-zero buffer instead
 * @return number of pairs processed, or 0 for an unknown mode
 */
int BSCuda::BatchAlign(int const mode, int const batchSize, char const * const * const refSeqList, char const * const * const qrySeqList, Align * const results,void * extData) {

	if (batchSize <= 0) {
		Log.Warning("Align for batchSize <= 0");
		return 0;
	}

	if (type) {
		SetForBacktracking();
	}

	// Zero-filled ext bytes, one per pair. Typed as char* so it can be
	// released with delete[] once the dispatch is done.
	char * extData2 = new char[batchSize];
	memset(extData2,0,batchSize*sizeof(char));

	int processed = 0;
	switch (mode & 0xFF) {
		case 0:
			if(cigar){
				processed = Calc_Alignment_SW_cigar(batchSize, refSeqList, qrySeqList, results,(char *) extData);
			}else{
				processed = Calc_Alignment_SW(batchSize, refSeqList, qrySeqList, results, extData2);
			}
			break;
		case 1:
			if(cigar){
				processed = Calc_Alignment_NW_cigar(batchSize, refSeqList, qrySeqList, results,(char *) extData);
			}else{
				processed = Calc_Alignment_NW(batchSize, refSeqList, qrySeqList, results,(char *) extData);
			}
			break;
		default:
			processed = 0;
			break;
	}

	// The Calc_* methods synchronise before returning, so the buffer is no
	// longer referenced by any pending copy. It used to be leaked on every call.
	delete[] extData2;
	return processed;

}

/**
 * Entry point for scoring: switches to scoring mode if necessary and
 * dispatches to the SW (mode 0) or NW (mode 1) scoring implementation.
 *
 * @param mode       only the low byte is evaluated: 0 = SW, 1 = NW
 * @param batchSize  number of pairs; values <= 0 log a warning and return 0
 * @param refSeqList reference sequences, one per pair
 * @param qrySeqList read sequences, one per pair
 * @param results    receives one score per pair
 * @param extData    not used; both paths are given an all-zero buffer
 * @return number of pairs processed, or 0 for an unsupported mode
 */
int BSCuda::BatchScore(int const mode, int const batchSize, char const * const * const refSeqList, char const * const * const qrySeqList, float * const results,void  * extData) {
	if (batchSize <= 0) {
		Log.Warning("Score for batchSize <= 0");
		return 0;
	}

	if (!type) {
		SetForScoreing();
	}

	// Zero-filled ext bytes, one per pair. Typed as char* so it can be
	// released with delete[] once the dispatch is done.
	char * extData2 = new char[batchSize];
	memset(extData2,0,batchSize*sizeof(char));

	int processed = 0;
	switch (mode & 0xFF) {
		case 0:
			processed = CalcScores_SW(batchSize, refSeqList, qrySeqList, results, extData2);
			break;
		case 1:
			processed = CalcScores_NW(batchSize, refSeqList, qrySeqList, results, extData2);
			break;
		default:
			Log.Error("Unsupported alignment mode %i", mode & 0xFF);
			processed = 0;
			break;
	}

	// The CalcScores_* methods synchronise before returning, so the buffer is
	// no longer referenced by any pending copy. It used to be leaked on every call.
	delete[] extData2;
	return processed;

}

