#ifndef __NGMSSE_H__
#define __NGMSSE_H__

#include "IAlignment.h"
#include "ILog.h"
#include "IConfig.h"

#include <emmintrin.h>


// align16: compiler-specific specifier requesting 16-byte alignment for the
// declaration it prefixes. The __declspec form is selected when _WIN32
// evaluates to non-zero; otherwise the GCC-style __attribute__ form is used.
#if _WIN32
#define align16 __declspec(align(16))
#else
#define align16 __attribute__((aligned(16)))
#endif

/**
 * Alignment scorer built on SSE2 integer intrinsics (<emmintrin.h>).
 *
 * The scoring parameters are read once from the global Config object in the
 * constructor and cached both as scalars and as __m128i values with the
 * parameter replicated into all eight 16-bit lanes.
 *
 * NOTE(review): the class stores __m128i members and therefore requires
 * 16-byte alignment. Plain `new` only guarantees over-alignment from C++17 on
 * (or where the platform allocator happens to be 16-byte aligned) -- confirm
 * how instances of this class are created.
 */
class SSEAligner : public IAlignment
{
public:
	/**
	 * Reads "qry_max_len", "corridor" and the four "score_*" settings from
	 * Config and prepares the broadcast registers.
	 *
	 * Score values are narrowed from the result of Config.GetInt() to short;
	 * values outside the range of short are truncated by the conversion.
	 */
	SSEAligner()
	{
		m_QueryLength = Config.GetInt("qry_max_len");
		m_Corridor = Config.GetInt("corridor");

		// Keep the scalar and the broadcast copy of every score side by side.
		m_Match = static_cast<short>(Config.GetInt("score_match"));
		m_xMatch = BCShortToReg(m_Match);

		m_Mismatch = static_cast<short>(Config.GetInt("score_mismatch"));
		m_xMismatch = BCShortToReg(m_Mismatch);

		m_GapRef = static_cast<short>(Config.GetInt("score_gap_ref"));
		m_xGapRef = BCShortToReg(m_GapRef);

		m_GapRead = static_cast<short>(Config.GetInt("score_gap_read"));
		m_xGapRead = BCShortToReg(m_GapRead);

		m_xZero = _mm_setzero_si128();
	}

	/**
	 * @return 8 * cIterationsPerCall.
	 *
	 * NOTE(review): the factor 8 presumably matches the eight 16-bit lanes of
	 * an __m128i -- confirm against the BatchScore implementation.
	 */
	int GetScoreBatchSize()
	{
		return 8 * cIterationsPerCall;
	}

	/**
	 * @return Always -1. The meaning of a negative batch size is defined by
	 *         the IAlignment contract, which is not visible in this header.
	 */
	int GetAlignBatchSize()
	{
		return -1;
	}

	/**
	 * Scores a batch of reference/query pairs; defined out of line.
	 *
	 * NOTE(review): parameter semantics follow IAlignment -- presumably
	 * refSeqList/qrySeqList hold batchSize sequences each and results receives
	 * one score per pair; verify against the implementation.
	 */
	int BatchScore(
		int const mode, 
		int const batchSize,
		char const * const * const refSeqList,
		char const * const * const qrySeqList,
		float * const results,void * extData);

	/**
	 * Stub: ignores all arguments and returns 0.
	 */
	virtual int BatchAlign(
		int const mode,
		int const batchSize,
		char const * const * const refSeqList,
		char const * const * const qrySeqList,
		Align * const results,void * extData)
	{
		return 0;
	}
private:
	/**
	 * Broadcasts a 16-bit value into all eight lanes of an SSE register.
	 *
	 * Uses the dedicated SSE2 intrinsic rather than filling an aligned stack
	 * buffer and loading it, which yields the same register contents without
	 * the memory round-trip or the pointer cast.
	 *
	 * @param x Value to replicate.
	 * @return __m128i whose eight 16-bit lanes all equal x.
	 */
	inline __m128i BCShortToReg(short x)
	{
		return _mm_set1_epi16(x);
	}

	// Scoring kernels; both are defined out of line.
	void ScoreSSE(char const * const * const ref, char const * const * const qry, short * scores);
	short Score(char const * const ref, char const * const qry);

	// Multiplier used by GetScoreBatchSize().
	static const int cIterationsPerCall = 1000;

	int m_QueryLength;	// Config "qry_max_len"
	int m_Corridor;		// Config "corridor"

	// Scalar scoring parameters (Config "score_*").
	short m_Match;
	short m_Mismatch;
	short m_GapRef;
	short m_GapRead;

	// The same parameters replicated into every 16-bit lane.
	__m128i m_xMatch;
	__m128i m_xMismatch;
	__m128i m_xGapRead;
	__m128i m_xGapRef;

	// All-zero register.
	__m128i m_xZero;
};

#endif
