#include "NGMSSE.h"

#include <memory.h>

//ILog * _log = 0;
//IConfig * _config = 0;

// Platform shims.
//  - dllexport: export decoration on Windows, expands to nothing elsewhere.
//  - malloc16(ptr, size, align): stores a newly allocated, `align`-byte aligned
//    buffer of `size` bytes in `ptr`. Note the macro's argument order is
//    (ptr, size, align), whereas posix_memalign takes (ptr, align, size).
//    The buffer must be released with the matching deallocator:
//    _aligned_free() on Windows, free() elsewhere.
#ifdef _WIN32
#define dllexport  __declspec(dllexport)
// Assigns the (short*)-cast result of _aligned_malloc to ptr.
#define malloc16(ptr,size,align)  ptr = (short*) _aligned_malloc(size, align)
#else
#define dllexport
#include <stdlib.h>
// Expands to a bare posix_memalign call; the status code it returns is the
// value of the expression and is dropped unless the caller checks it.
#define malloc16(ptr,size,align)  posix_memalign(((void * *)&ptr), align, size)
#endif

//extern "C" dllexport int Cookie()
//{
//	return 0x10201021;
//}
//
//extern "C" dllexport void SetLog(ILog * log)
//{
//	_log = log;
//}
//
//extern "C" dllexport void SetConfig(IConfig * config)
//{
//	_config = config;
//}
//
//extern "C" dllexport IAlignment * CreateAlignment(int const gpu_id)
//{
//	return new SSEAligner();
//}
//extern "C" dllexport void DeleteAlignment(SSEAligner* instance) {
//	delete instance;
//}


int SSEAligner::BatchScore(int const mode, int const batchSize, char const * const * const refSeqList, char const * const * const qrySeqList, float * const results,void * extData)
{
	//short * scores = (short*) _aligned_malloc(sizeof(short) * (m_Corridor + 2) * 2 * 8, 16);
	short * scores = 0;

	int b1 = batchSize % 8;
	int b2 = batchSize / 8;

	for (int i = 0; i < b1; ++i) {
		results[i] = Score(refSeqList[i], qrySeqList[i]);
	}

#pragma omp parallel  private(scores) num_threads(Config.GetInt("ocl_threads"))
	{
		malloc16(scores, sizeof(short) * (m_Corridor + 2) * 2 * 8, 16);
#pragma omp for
		for (int i = 0; i < b2; ++i) {
			memset(scores, 0, sizeof(short) * (m_Corridor + 2) * 2 * 8);
			ScoreSSE(refSeqList + b1 + i * 8, qrySeqList + b1 + i * 8, scores);

			for (int j = 0; j < 8; ++j)
				results[b1 + i * 8 + j] = scores[j];
		}
	}

	return b1 + b2;
}

void SSEAligner::ScoreSSE(char const * const * const ref, char const * const * const qry, short * scores)
{
	__m128i score_max = m_xZero;

	align16 short qb[8];
	align16 short rb[8];
	__m128i q;
	__m128i r;

	int ll = 0;
	int cl = 1;
	for (int j = 1; j <= m_QueryLength; ++j)
	{
		for (int s = 0; s < 8; ++s)
		{
			qb[s] = *(qry[s] + j - 1);
		}
		q = _mm_load_si128((__m128i *) qb);

		for (int i = 1; i <= m_Corridor; ++i)
		{
			__m128i n = _mm_load_si128((__m128i *) (scores + 8 * (ll * (m_Corridor + 2) + i + 1)));
			__m128i nw = _mm_load_si128((__m128i *) (scores + 8 * (ll * (m_Corridor + 2) + i)));
			__m128i w = _mm_load_si128((__m128i *) (scores + 8 * (cl * (m_Corridor + 2) + i - 1)));

			for (int s = 0; s < 8; ++s)
			{
				rb[s] = *(ref[s] + j + i - 2);
			}
			r = _mm_load_si128((__m128i *) rb);

			n = _mm_add_epi16(n, m_xGapRef);
			w = _mm_add_epi16(w, m_xGapRead);

			__m128i cmp = _mm_cmpeq_epi16(q, r);
			nw = _mm_add_epi16(nw, _mm_and_si128(cmp, m_xMatch));
			nw = _mm_add_epi16(nw, _mm_andnot_si128(cmp, m_xMismatch));

			__m128i cur = _mm_max_epi16(n, _mm_max_epi16(w, _mm_max_epi16(nw, m_xZero)));

			score_max = _mm_max_epi16(cur, score_max);

			_mm_store_si128((__m128i *) (scores + 8 * (cl * (m_Corridor + 2) + i)), cur);
		}
		ll = cl;
		(++cl) &= 1;
	}

	_mm_store_si128((__m128i *) scores, score_max);
}

/** Returns the larger of two ints (b when they are equal). */
static inline int max(int a, int b)
{
	if (a > b)
		return a;
	return b;
}

/**
 * Scalar counterpart of ScoreSSE: scores a single reference/query pair with a
 * two-row rolling buffer restricted to a corridor of m_Corridor cells per
 * query position.
 *
 * @param ref  Reference sequence.
 * @param qry  Query sequence.
 * @return The maximum cell value seen, converted from int to short.
 */
short SSEAligner::Score(char const * const ref, char const * const qry)
{
	// Each row has one padding cell on either side of the corridor.
	int const rowStride = m_Corridor + 2;

	// Two zero-initialised rows, stored back to back.
	int * const cells = new int[rowStride * 2]();

	int * prevRow = cells;
	int * curRow = cells + rowStride;

	int best = 0;
	for (int qPos = 0; qPos < m_QueryLength; ++qPos)
	{
		char const queryChar = qry[qPos];

		for (int col = 1; col <= m_Corridor; ++col)
		{
			// The corridor shifts by one reference position per query position.
			char const refChar = ref[qPos + col - 1];

			int up = prevRow[col + 1];
			up += m_GapRef;

			int left = curRow[col - 1];
			left += m_GapRead;

			int diag = prevRow[col];
			if (refChar == queryChar)
				diag += m_Match;
			else
				diag += m_Mismatch;

			// Cell value is the best of the three moves, floored at zero.
			int const cell = max(max(max(diag, 0), left), up);
			if (best < cell)
				best = cell;

			curRow[col] = cell;
		}

		// Roll the two rows: the row just written becomes the previous row.
		int * const finishedRow = curRow;
		curRow = prevRow;
		prevRow = finishedRow;
	}

	delete[] cells;
	return static_cast<short>(best);
}
