/***************************************************************************
 *   Copyright (C) 2010 by Minh Anh Thi Nguyen, Tanja Gesell and Arndt von Haeseler   *
 *   minh.anh.nguyen@univie.ac.at, tanja.gesell@univie.ac.at, arndt.von.haeseler@univie.ac.at   *
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of the GNU General Public License as published by  *
 *   the Free Software Foundation; either version 2 of the License, or     *
 *   (at your option) any later version.                                   *
 *                                                                         *
 *   This program is distributed in the hope that it will be useful,       *
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
 *   GNU General Public License for more details.                          *
 *                                                                         *
 *   You should have received a copy of the GNU General Public License     *
 *   along with this program; if not, write to the                         *
 *   Free Software Foundation, Inc.,                                       *
 *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
 ***************************************************************************/
#ifndef UTILITY_H
#define UTILITY_H

#include <vector>
#include <set>
#include <string>
#include <string.h>
#include <iostream>
#include <fstream>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <exception>
#include <ctime>
#include <cassert>
#include <algorithm>
#include <map>
#include <sstream>

using namespace std;

/**
General utilities

	@author Minh Anh Thi Nguyen, Tanja Gesell and Arndt von Haeseler 
<minh.anh.nguyen@univie.ac.at, tanja.gesell@univie.ac.at, arndt.von.haeseler@univie.ac.at>
*/

/**==================== CONSTANT VARIABLES =======================*/
/**
	Number of nucleotide states (A, C, G, T/U).
*/
const int StateNum = 4;
/**
	The nucleotide characters themselves, in the order A, C, G, T.
*/
const char Nucleotides[4] = {'A','C','G','T'};
/**
	Uncertain/gap characters: '.', '-' and '?'.
*/
const char UGAP[]       = ".-?";
/**
	Characters allowed in a sequence: A, C, G, T/U (upper or lower case)
	plus the uncertain/gap characters '.', '-' and '?'.
	C-string variant.
*/
const char CharSet[] = "ACGTUacgtu.-?";
/**
	The same set of allowed sequence characters as CharSet, held as a string object.
*/
const string ValidChars = "ACGTUacgtu.-?";
/**
	Block size used when printing an alignment in interleaved (standard) PHYLIP format.
*/
const int BlockSize = 10;
/**
	Number of blocks in a row.
*/
const int NumBlock = 5;
/**
	Mutation lookup table.
	MutatedInt[nuc][mut] is the index of the nucleotide obtained by applying
	mutation type mut to nucleotide nuc, using the coding A=0, C=1, G=2, T=3.

	Char\Mut        |       0       1       2       3
	        --------|------------------------------
	        A       |       A       G       C       T
	        C       |       C       T       A       G
	        G       |       G       A       T       C
	        T       |       T       C       G       A

	Mutation type 0 leaves the nucleotide unchanged (first column).
*/
// Disabled character-based variant of the same table:
// const char MutatedChar[4][4] = {'0', '2', '1', '3',
// 								'1', '3', '0', '2',
// 								'2', '0', '3', '1',
// 								'3', '1', '2', '0'};
const int MutatedInt[4][4] = {
	{0, 2, 1, 3},	// from A
	{1, 3, 0, 2},	// from C
	{2, 0, 3, 1},	// from G
	{3, 1, 2, 0}	// from T
};
/**
	Number of mutation types: 4.
	Type 0: no change.
	Types 1, 2 and 3: see the MutatedInt table.
	NOTE(review): an enum that is also named MutType is declared further down
	in this header. A variable name hides an enumeration name declared in the
	same scope, so a plain "MutType" refers to this constant and the enum has
	to be written as "enum MutType".
*/
const int MutType = 4;
/**
	Maximum number of spaces between a sequence's name and the sequence itself.
	This is used when reading the interleaved PHYLIP format.
*/
const int MAX_NUM_SPACE = 256;
/**
	Maximum size of a file name.
*/
const int MAX_FILE_NAME = 200;
/**
	Maximum number of arguments (on the command line).
*/
const int MAX_NUM_ARG = 50;

/**
	Bit flags used when writing or drawing a tree. Each flag occupies its own
	bit so that several of them can be combined with bitwise OR.
		WT_BR_LEN   - output branch length
		WT_BR_CLADE - put branch length into internal node name
		WT_TAXON_ID - output taxon ID
		WT_INT_NODE - for draw tree, draw the internal node
		WT_BR_SCALE - for draw tree, draw the branch proportional to its length
		WT_BR_ID    - presumably output branch ID (was not described in the
		              original comment; confirm against the tree writer)
*/
const int WT_BR_LEN    = 1 << 0; // 000001
const int WT_BR_CLADE  = 1 << 1; // 000010
const int WT_TAXON_ID  = 1 << 2; // 000100
const int WT_INT_NODE  = 1 << 3; // 001000
const int WT_BR_SCALE  = 1 << 4; // 010000
const int WT_BR_ID     = 1 << 5; // 100000

/**
	What to print on the branches when printing a tree:
		BR_LEN  - output the branch length
		BR_ID   - output the branch id
		BR_NONE - output neither the branch length nor the branch id
*/
enum BranchType {BR_LEN, BR_ID, BR_NONE};
// Selection types, kept as plain integer constants instead of the enum below
// (they are used as the default value of the seType parameter of mutCumRate
// and mutCumRate2):
//   PRO     - proportional to the rates
//   INV_PRO - inversely proportional to the rates
//   NONE    - totally random, uniformly distributed
// enum SelectType {PRO, INV_PRO, NONE};
const int PRO = 1;
const int INV_PRO = -1;
const int NONE = 0;
/**
	Names for the root of the tree.
	NOTE(review): judging by the identifiers, ROOT_NAME labels an artificial
	("fake") root and INNER_ROOT_NAME a root placed at an inner node - confirm
	against the tree code.
*/
const char ROOT_NAME[] = "fake_root";
const char INNER_ROOT_NAME[] = "inner_root";

/**
	Verbosity level when printing help (see printHelp, which defaults to ALL).
*/
enum Verbose {ALL, PARTLY, HIDDEN};

/**
	Indices into a vector containing mutations.
	NOTE(review): the enumerator names suggest identity, transition, the two
	transversion classes and a total - confirm against the code that fills
	such vectors.
	NOTE(review): this enum shares its name with the integer constant MutType
	declared earlier in this header; the constant hides the enum name, so this
	type has to be referred to as "enum MutType".
*/
enum MutType {IDEN, TS, TV1, TV2, SUM};


/**============ DEFINE SOME DATA TYPES ====================================*/
/** One-dimensional containers */
typedef std::vector< std::string > StringVec;
typedef std::vector< int >         IntVec;
typedef std::vector< double >      DoubleVec;
typedef std::vector< long >        LongVec;
typedef std::vector< bool >        BoolVec;
typedef std::vector< long double > LongDoubleVec;
/** ordered set of integers (default ascending order) */
typedef std::set< int > IntSet;
/** matrix(T) expands to a vector of vectors of T */
#define matrix(T) std::vector< std::vector< T > >
/**
	matrix of double
*/
typedef std::vector< DoubleVec > DoubleMatrix;
/**
	matrix of integers
*/
typedef std::vector< IntVec > IntMatrix;
/**
	matrix of long integers
*/
typedef std::vector< LongVec > LongMatrix;
/**
	matrix of booleans
*/
typedef std::vector< BoolVec > BoolMatrix;

/**
	Map from a floating point (double) key to an integer value,
	keys in ascending order.
*/
typedef std::map< double, int > DoubleIntMap;
/**
	Same as DoubleIntMap, but several entries may share the same key.
*/
typedef std::multimap< double, int > DoubleIntMM;
/**
	Map from an integer key to a double value, keys in ascending order.
*/
typedef std::map< int, double > IntDoubleMap;

/**
	Map from an integer key to an integer value, keys in ascending order.
*/
typedef std::map< int, int > IntIntMap;
/** ===================== COMMON ERROR MESSAGES ====================*/
// Messages ending in ": " are prefixes; the two-argument outError overloads
// take such a prefix together with a second message (presumably the offending
// name or file - confirm at the call sites).
// Tree / alignment content errors
const char ERR_DUPLICATED_LEAVE[]              	= "Duplicated taxon's name in the tree: ";
const char ERR_DUPLICATED_SEQUENCE[]    		= "Duplicated sequence name: ";
const char ERR_DUPLICATED_INNER_NODE[]			= "Duplicated internal node names: ";
const char ERR_CONFLICT_ROOT[]                  = "Tree is already rooted, -o <file> is not allowed.";
const char ERR_NO_LEAF_NAME[]                   = "Find no taxon in the tree with name: ";
const char ERR_NO_SEQUENCE_NAME[]               = "Find no sequence with name: ";
const char ERR_NEG_BRANCH[]                     = "Negative branch length not allowed.";
const char ERR_TOO_FEW_TAXA[]                   = "Number of taxa must be greater than 2.";
const char ERR_NOT_BIFURCATING[]                = "Tree is not bifurcating";
const char ERR_ROOT_AT_INNER[]					= "Attempting to root the tree at inner node.";

// Memory allocation error
const char ERR_NO_MEMORY[]         = "Not enough memory!";

// File input/output errors
const char ERR_READ_INPUT[]        = "File not found or incorrect input, pls check it again: ";
const char ERR_UNEXPECTED_EOF[]    = "Unexpected end of file: ";
const char ERR_READ_ANY[]          = "Unidentified error while reading file, pls check it carefully again: ";
const char ERR_WRITE_OUTPUT[]      = "Cannot write to file: ";
const char ERR_READ_MISSING_DATA[] = "Missing data in file: ";
/** ====================== INTERACTIVE QUESTIONS ==================*/
// Prompt text for the interactive mode.
const char INTERACTIVE_QUESTION[]    = "Please enter your options OR -h for help OR q to quit OR y to start: ";

/**======================== COMMON FUNCTIONS ========================*/
/**
	Test whether a character counts as a control character, i.e. whether its
	code is not greater than 32 (the ASCII space).
	NOTE(review): the comparison is done on plain char; where char is signed,
	bytes >= 0x80 are negative and are therefore reported as control
	characters as well.
	@param ch the character to test
	@return TRUE if ch <= 32
*/
inline bool controlchar(char ch) {
	const int lastControlCode = 32;	// ASCII space
	return !(ch > lastControlCode);
}
/**
	Test whether a character is one of the structural (non-alphabetic) symbols
	of the Newick format: ':' ';' ',' ')' '('.
	@param ch the character to test
	@return TRUE if ch is one of the five symbols above
*/
inline bool is_newick_token(char ch) {
	switch (ch) {
		case ':':
		case ';':
		case ',':
		case ')':
		case '(':
			return true;
		default:
			return false;
	}
}
/**
	Print an error message, then exit the program.
	@param error the error message
*/
void outError(const char *error);
/**
	Print an error message, then exit the program.
	@param error the error message
*/
void outError(string error);
/**
	Print a two-part error message, then exit the program.
	@param error first part of the message (e.g. one of the ERR_* constants)
	@param msg second part of the message
*/
void outError(const char *error, const char *msg);

/**
	Print a two-part error message, then exit the program.
	@param error first part of the message (e.g. one of the ERR_* constants)
	@param msg second part of the message
*/
void outError(const char *error, string msg);
/**
	Convert a string to an integer, with error checking.
	@param str original string
	@return the integer value
	@throw string on failure, as declared by the exception specification
	       (presumably carrying an error message - confirm in the definition)
	NOTE(review): dynamic exception specifications such as "throw (string)"
	were deprecated in C++11 and removed in C++17, so this header only
	compiles under earlier language standards. The specification has to match
	the one on the definition, so both must be changed together.
*/
int convert_int(const char *str) throw (string);
/**
	Convert a string to a double, with error checking.
	@param str original string
	@return the double value
	@throw string on failure, as declared by the exception specification
	       (presumably carrying an error message - confirm in the definition)
	NOTE(review): see convert_int regarding the dynamic exception specification.
*/
double convert_double(const char *str) throw (string);

/**
 * Convert an integer to its string representation.
 * @param number the integer to convert
 * @return the resulting string
 */
string convertIntToString(int number);
/**
	(disabled) remove all spaces in a string
	@param str (IN/OUT)
*/
//void removeSpaces(string &str);

/**
	Search for a string in a vector of strings.
	Both arguments are passed by value, i.e. copied on every call.
	@param stringToFind the string to search (target)
	@param strVec a vector of strings
	@return the index of stringToFind in strVec if found, otherwise -1
*/
int searchString(string stringToFind, StringVec strVec);

/**
	Read strings from a file, each string on one line.
	@param inFile name of the input file
	@param retVec (OUT) the returned vector of strings
*/
void readStringVec(const char* inFile, StringVec &retVec);
/**
	Read strings from a stream, each string on one line.
	@param in input stream
	@param retVec (OUT) the returned vector of strings
*/
void readStringVec(ifstream &in, StringVec &retVec);
/**
	Read a matrix of integers from a file.
	@param file name of the file
	@param col number of columns (intervals)
	@param retMat (OUT) the returned matrix
*/
void readIntMatrix(const char* file, const int col, IntMatrix &retMat);

/** SOME RANDOM GENERATOR FUNCTIONS AS PROVIDED BY C++. SEARCH FOR BETTER ONES??      */
/** --------------------------------------------------------------------------------- */
/**
	Draw a random (double) number from rand(), scaled into [0,1):
	rand() is divided by RAND_MAX+1, so 1 itself is never returned.
	@return a random number in [0,1)
*/
inline double random01()
{
	const double range = RAND_MAX + 1.0;
	return static_cast<double>(rand()) / range;
}
/**
	Generate a sequence of random (double) numbers, each drawn with random01().
	@param size number of random numbers to be generated
	       (nothing is generated when size is not positive)
	@return a vector holding the generated numbers, in generation order
*/
inline DoubleVec random01Vec( const int size)
{
	DoubleVec samples;
	for ( int left = size; left > 0; --left )
		samples.push_back(random01());
	return samples;
}

/**
	Generate a random integer in the closed interval [begin,end], every value
	being (up to the resolution of rand()) equally likely.

	The offset from begin is computed first, as a non-negative number, and only
	then shifted by begin. Converting "offset + begin" to int directly would
	truncate toward zero, which for a negative begin maps the whole range
	(-1,1) to 0 and almost never yields begin itself.
	@param begin lower bound (inclusive); may be negative
	@param end upper bound (inclusive); must not be smaller than begin
	@return the generated number
*/
inline int randomInt (const int begin, const int end)
{
	// begin == end is a valid one-element range
	assert(begin <= end);
	// uniform in [0,1): rand() never exceeds RAND_MAX
	const double unit = rand() / (RAND_MAX + 1.0);
	// number of candidate values, computed in double so that it cannot overflow int
	const double span = static_cast<double>(end) - static_cast<double>(begin) + 1.0;
	// whole number in [0, end - begin]
	const double offset = floor(span * unit);
	return static_cast<int>(begin + offset);
}

/**
	Generate a sequence of random integers, each drawn with randomInt(begin,end).
	@param begin lower bound handed to randomInt
	@param end upper bound handed to randomInt
	@param size number of integers to be generated
	       (nothing is generated when size is not positive)
	@return a vector containing the generated numbers, in generation order
*/
inline IntVec randomIntVec (const int begin, const int end, const int size)
{
	IntVec drawn;
	for ( int left = size; left > 0; --left )
		drawn.push_back(randomInt(begin,end));
	return drawn;
}
/**================== A LIST CONTAINS PARAMETERS FOR THE PROGRAM ======== */
// Plain aggregate holding the user's settings, the data derived from them and
// the output options. The struct declares no constructor and no member
// initializers, so whoever creates a Params must initialize the members
// (in particular the char* file names) before they are read.
struct Params{
/**=================================================*/
/**            INPUT 	                            */
/**=================================================*/
	/**
		alignment file name. REQUIRED argument!
	*/
	char* alignFile;
	/**
		input tree file name. REQUIRED argument!
	*/
	char* treeFile;
	/**
		number of extra mutations.
	*/
	int nExtra;
	/**
		whether to use branch length as the expected number of extra substitutions per site.
	*/
	bool usebranch;
	/**
		tree used to run puzzle, if it is different from the above tree
	*/
	char *pzTree;
	/**
		number of repetitions
	*/
	int nRepeat;
	/**
		file containing the outgroup. OPTIONAL.
	*/
	char* outgroupFile;
	/**
		file containing a list of positions in the alignment. OPTIONAL.
		Each line contains one number.
	*/
	char* positionFile;
	/**
		file containing a list of mutation types. OPTIONAL.
		Each line contains one number in {1,2,3}
	*/
	char* mutationFile;
	/**
		file containing branches. OPTIONAL.
	*/
	char* branchFile;
	/**
		file containing branch-mutation
	*/
	char* bmFile;
	/**
		file containing branch-alignmentSite (positions)
	*/
	char* bpFile;
	/**
		file containing branch-mutation-position
	*/
	char* bmpFile;
	/**
		file containing mutation-position
	*/
	char* mpFile;
	/**
		(disabled member) file containing mutation rates
	*/
// 	char* mrateFile;
	/**
		file containing the parameters to run puzzle
	 */
	char* pzFile;
	/**
		file containing site rates
	*/
	char* prateFile;
	
	/**
		number indicating the method used to choose branches on the tree.
		1: a branch is picked with a probability proportional to its relative length.
		0: a branch is picked randomly according to a uniform distribution.
	*/
 	int br;
	/**
		number indicating the method used to choose mutation types.
		1: a mutation type is picked with a probability proportional to its relative rate. (-bprob is given)
		0: a mutation is picked randomly according to a uniform distribution.
	*/
	int mr;
	/**
		number indicating the method used to choose positions in the alignment.
		1: a position is picked with a probability proportional to its relative rate.
		0: a position is picked randomly according to a uniform distribution.
	*/
	int pr;
	/**
		whether mutation rates are provided
	*/
	bool inputRate;
	/**
		marks whether outgroupFile has already been provided
	*/
	bool outgroupGiven;
	/**
		(disabled duplicate; the active nRepeat member is declared above)
	*/
//	int nRepeat;
	/**
		seed for the random number generator
	*/
	int seed;
/**===============================================*/
/**			DERIVED INPUT						*/
/**===============================================*/
	/**
		vector of outgroup taxon names. Derived directly.
	*/
	StringVec outgroup;
	/**
		vector of positions in the alignment. Derived directly.
	*/
	IntVec positions;
	/**
		vector of mutation types. Derived directly.
	*/
	IntVec mutations;
	/**
		vector of mutation rates.
	*/
	DoubleVec mutRates;
	/**
		vector of branch IDs. Needs a tree underneath.
	*/
	IntVec branches;
	/**
		vector of site rates
	*/
	DoubleVec pRates;
	/**
		site positions to be selected
	*/
	IntVec tbsPositions;
/**================================================*/
/**			FOR OUTPUTTING							*/
/**===============================================*/
	/**
		prefix for the output files. Default is the input alignment file name.
	*/
	char* uprefix;
	/**
		output alignments in interleaved format
	*/
	bool sd;
	/**
		do not prompt the interactive interface
	*/
	bool ni;
	/**
		user setting is ready
	*/
	bool ready;
	/**
		whether the user wants to print detailed information
	*/
	bool add;
};

/** ============== SPECIFIC FUNCTIONS USED FOR THIS PROJECT ============*/
/**
	Print the copyright notice.
*/
void printCopyright();
/**
	Print help (usage).
	@param vb verbosity level of the help text; defaults to ALL
*/
void printHelp(Verbose vb = ALL);

/**
	Derive the user's input: get the data from the input files.
	@param params (IN/OUT)
*/
void deriveInput(Params &params);
/**
	Get a line of user input.
	@param message message dialog
	@param argc (OUT) number of strings in the input line
	@param argv (OUT) the vector of strings contained in the input line
*/
void getInputLine(const char *message, int &argc, StringVec &argv);

/**
	Copy the strings contained in an array of character pointers to a vector of strings.
	@param argc number of strings
	@param argv the array of pointers
	@return the vector of strings
*/
StringVec copyToStringVec (const int argc, char* argv[]);
/**
	Mutate a nucleotide.
	@param nuc the original nucleotide
	@param mutType the mutation type
	@return the mutated character
*/
char mutated(const char nuc, const int mutType);

/**
	Print the user's specification.
	The Params object is passed by value, i.e. copied on every call.
	@param params contains the user's specification to run this program
*/
void printUsersSpec(const Params params);

/**
	Search for the interval containing a (double) number.
	The map is passed by value, i.e. copied on every call.
	@param amap a map of (double, int), the first element is the key.
	@param key searching key.
	@return the second element of the ith item in the map such that
			the interval [(i-1).first, i.first) contains the searching key
			OR -1 if no such interval is found.
*/
int searchInMap(const DoubleIntMap amap, const double key);

/**
	Binary search for the interval [a,b) (in a sorted double array) containing a given number.
	The vector is passed by value, i.e. copied on every call.
	@param vec the vector of sorted double numbers
	@param key the query number
	@param begin (IN/OUT) starting index
	@param end (IN/OUT) ending index
	@return the index of the end of the found interval, or -1 if there is no interval containing the key.
	NOTE(review): the original comment described the found interval as
	"(i-1, i]" here but as half-open "[a,b)" in the summary line; which end is
	inclusive has to be confirmed against the definition.
*/
int binarySearch(const DoubleVec vec, const double key, int &begin, int &end);

/**
	Randomly select branches (IDs) on the tree or types of mutations.
	When branches or mutation types are to be selected according to their relative weight
	(branch lengths or rates), we need a map of DOUBLE and INT, constructed as follows:
		DOUBLE: the cumulative sum of their weights
		INT: the id
	So the key (DOUBLE) of the last item in the map should be 1.
	Therefore, if a generated number N (between 0 and 1) falls between the ith and (i+1)th keys,
	then ith->second will be selected.
	This allows us to select branches (mutation types) with probability (inversely) proportional
	to their relative weights.
	@param doubleInt a map of DOUBLE and INT (passed by value)
	@param num total number of indices that should be generated
	@param ret (IN/OUT) returned vector of the generated indices
	@return -1 if not successful, 0 otherwise
*/
int selectIndex(const DoubleIntMap doubleInt, const int num, IntVec &ret);

/**
	Select alignment positions or branches in the tree according to a given probability vector.
	The probability vector has already been processed to produce the cutoff vector (cutoffVec)
	and the alias vector (aliasVec). This is the alias method for sampling from a discrete
	distribution.
	@param cutoffVec the cutoff vector (of double numbers)
	@param aliasVec the alias vector (of integer numbers)
	@param num the number of positions to select
	@param ret (IN/OUT) returned vector of the generated indices
	@return -1 if not successful, 0 otherwise.
*/
int selectIndexAlias(const DoubleVec cutoffVec, const IntVec aliasVec, const int num, IntVec &ret);

/**
	Generate the cutoff vector (of double numbers) and the alias vector (of integer
	numbers/indices) for the alias method for sampling from a discrete distribution.
	@param probs the vector containing the given probabilities
	@param cutoffVec (OUT) the returned cutoff vector
	@param aliasVec (OUT) the returned alias vector
	@return -1 if not successful, 0 otherwise
*/
int generateCutoffAlias (const DoubleVec probs, DoubleVec &cutoffVec, IntVec &aliasVec);
/**
	Randomly select positions in the alignment or types of mutations (when no constraint is given).
	The selected numbers should be in the [lower,upper] interval.
	@param lower the lower bound
	@param upper the upper bound
	@param num total number of values that should be generated
	@param ret (IN/OUT) returned vector of the generated numbers
	@param overlap whether the selected numbers may be identical; defaults to true
	@return -1 if not successful, 0 otherwise
*/
int selectNumber(const int lower, const int upper, const int num, IntVec &ret, bool overlap = true);
/*!
	Prepare the cumulative vector of mutation rates in order to select types of mutations
	according to their rates (proportionally, inversely proportionally, or totally random -
	uniformly distributed).
	@param rates rates of the three Kimura 3ST parameters.
	@param ret (OUT) return value, a map between the cumulative sum and the mutation id
	@param seType type of selection (defaults to PRO):
		PRO proportional to their rates
		INV_PRO inversely proportional to their rates
		NONE totally random, uniformly distributed
	@return 0 on success.
	NOTE(review): the original comment read "0 if succeed, 0 otherwise", which
	looks like a typo; the other selection functions in this header document
	-1 on failure - confirm against the definition.
*/
int mutCumRate (const IntDoubleMap rates, DoubleIntMap & ret, int seType = PRO);

/*!
	Prepare the cumulative vector of mutation/site rates in order to select types of mutations
	according to their rates (proportionally, inversely proportionally, or totally random -
	uniformly distributed).
	@param rates the rates, given as a multimap from a double key to an integer
	       (NOTE(review): the original comment said "rates of the three Kimura
	       3st parameters", apparently copied from mutCumRate - confirm)
	@param ret (OUT) return value, a map between the cumulative sum and the mutation id
	@param seType type of selection (defaults to PRO):
		PRO proportional to their rates
		INV_PRO inversely proportional to their rates
		NONE totally random, uniformly distributed
	@return 0 on success.
	NOTE(review): the original comment read "0 if succeed, 0 otherwise", which
	looks like a typo; the other selection functions in this header document
	-1 on failure - confirm against the definition.
*/
int mutCumRate2 (const DoubleIntMM rates, DoubleIntMap & ret, int seType = PRO);

/**
	Count the expected number of events happening in a given time, given the rate of the event.
	@param rate rate (frequency) of the event
	@param time the total time
	@return the expected number of events (as an integer)
*/
int countEvents (const double rate, const double time);

/**
	Select a number of integers from a given source (vector of integer numbers).
	@param num the number of integers that should be selected
	@param inputVec the source (passed by value)
	@param ret (OUT) the returned vector of selected integers
	@return -1 if not successful, 0 otherwise
*/
int selectFromIntVec(const int num, const IntVec inputVec, IntVec &ret);
/**======================== FROM IQ-TREE package ======================================
	Declarations taken over from IQ-TREE in order to incorporate class Alignment
	(NOTE(review): the original remark ended mid-sentence with
	"without changing anything except deleting")
=======================================================================================*/
/**
        Output a warning message to screen
        @param warn warning message
*/
void outWarning(const char *warn);
/**
        Output a warning message to screen
        @param warn warning message
*/
void outWarning(string warn);

/**
        Input file type as reported by detectInputFile:
        Newick, Nexus, or anything else.
*/
enum InputType {IN_NEWICK, IN_NEXUS, IN_OTHER};
/**
        Detect the format of an input file
        @param input_file file name
        @return
                IN_NEWICK if file in newick format,
                IN_NEXUS if in nexus format,
                IN_OTHER if file format unknown.
*/
InputType detectInputFile(char *input_file);

/**
	Read a vector of doubles (site likelihoods) from a .sitelh output file of puzzle or iqpnni.
	@param file file containing the site likelihoods
	@param logllVec (OUT) double vector containing the sites' log likelihoods
*/
void readSitelh (const char* file, DoubleVec &logllVec);

/**
	Read a vector of doubles (site likelihoods) from an input stream in the .sitelh format
	written by iqpnni and puzzle.
	@param in input stream
	@param logllVec (OUT) double vector containing the sites' log likelihoods
*/
void readSitelh (ifstream &in, DoubleVec &logllVec);

/**
	Read a file containing site indices and double values for the site rates.
	@param file file containing the index of a site and its rate
	@param index (OUT) integer vector containing the indices of the sites
	@param rates (OUT) double vector containing the rates of the indicated sites
*/
void readSiteRates (const char* file, IntVec &index, DoubleVec &rates);

/**
	Read an input stream containing site indices and double values for the site rates.
	@param in the input stream
	@param index (OUT) integer vector containing the indices of the sites
	@param rates (OUT) double vector containing the rates of the indicated sites
*/
void readSiteRates (ifstream &in, IntVec &index, DoubleVec &rates);
 
/**
	NOTE(review): undocumented in the original. Judging by the names only, this
	presumably runs puzzle on the given alignment and tree with the given
	parameter file and output prefix, and returns the site log likelihoods in
	logll - confirm against the definition.
	@param align presumably the alignment file name
	@param tree presumably the tree file name
	@param pzParam presumably the file with the parameters to run puzzle
	@param prefix presumably the prefix of the output files
	@param logll (OUT) presumably the resulting log likelihoods
*/
void puzzleLogll (const char* align, const char* tree, const char* pzParam, const char* prefix, DoubleVec &logll);

/**
	NOTE(review): undocumented in the original. Judging by the name only, this
	presumably computes a chi-square statistic from the observed and expected
	values - confirm against the definition.
	Both vectors are passed by value, i.e. copied on every call.
	@param observed the observed values
	@param expected the expected values
	@return presumably the chi-square statistic
*/
double computeChisquare (const DoubleVec observed, const DoubleVec expected);

#endif
