// Linear DL distance score algorithm
// Input: Mseq, Nseq, size m, n

#include <stdlib.h>
#include <omp.h>

// min: smaller of two values. Every argument and the whole expansion are
// parenthesized so operator expressions (e.g. x|y, a+b) can be passed safely.
// Arguments may be evaluated more than once: do not pass side effects.
#define min(a,b) ((a)<(b)?(a):(b))
// least: smallest of four values (same multiple-evaluation caveat as min).
#define least(a,b,c,d) (min( min((a),(b)), min((c),(d)) ))

// Thread count defined elsewhere; calculateDLScore_parallel() hands it to omp_set_num_threads().
extern int num_thread;
// Alphabet size defined elsewhere; calculateDLScore() sizes its per-symbol tables
// to alphabetaSize+1 entries and indexes them with the values found in Mseq/Nseq.
extern int alphabetaSize;
  
// Computes the DL distance score (edit distance with insert, delete,
// substitute and swap/transposition terms) between Mseq and Nseq using
// one score row per alphabet symbol instead of the full m x n matrix.
//
// Mseq, Nseq: symbol sequences of length m and n. Symbols index tables of
//             alphabetaSize+1 entries, so they must lie in 0..alphabetaSize.
//             NOTE(review): slot 0 doubles as the "current row" slot, so
//             symbols are presumably 1..alphabetaSize - confirm with callers.
// Returns:    the score DDD[0][n] after the last row, or -1 when m, n or
//             alphabetaSize is negative or an allocation fails.
//
// Rows are processed as a pipeline: row i may only advance into a block of
// singalW columns once row i-1 has released it through singal[].
int calculateDLScore( int *Mseq, int *Nseq, int m, int n)
{
	const int singalW = 1000; // columns per signal block (original note: n/num_thread+1)
	int **DDDSP = NULL, **DDD = NULL; // DDDSP: owning row buffers; DDD[s] = DDDSP[s]+1 so that index -1 is valid
	int *id_r = NULL;   // id_r: mapping alpha to latest row id
	int *singal = NULL; // singal[b]: number of rows that have released the block starting at column b
	int *fill;
	int maxdist, slots;
	int a, c;
	int allocFailed = 0;
	int result = -1;

	if( m < 0 || n < 0 || alphabetaSize < 0 )
		return result;

	slots = alphabetaSize+1;
	maxdist = m+n+1;

	// Declare space for singal, DDD, id_r
	singal = (int *) calloc( (size_t)n+1, sizeof(int) );
	id_r = (int *) calloc( (size_t)slots, sizeof(int) );
	DDDSP = (int **) calloc( (size_t)slots, sizeof(int*) );
	DDD = (int **) calloc( (size_t)slots, sizeof(int*) );
	if( singal == NULL || id_r == NULL || DDDSP == NULL || DDD == NULL )
		goto cleanup;

	for( a = 0; a < slots; a++ )
	{
		DDDSP[a] = (int*) calloc( (size_t)n+2, sizeof(int) );
		if( DDDSP[a] == NULL )
			goto cleanup;
		DDD[a] = DDDSP[a]+1;
		fill = DDDSP[a];
		// every cell (including the guard cell at DDD[a][-1]) starts at maxdist
		#pragma omp parallel for schedule(static) private(c)
		for( c = 0; c < n+2; c++ )
		{
			fill[c] = maxdist;
		}
	}

	// Initialize the first row.
	fill = DDD[0];

	#pragma omp parallel for schedule(static) private(c)
	for( c = 0; c <= n; c++ )
	{
		fill[c] = c;
	}

	#pragma omp parallel
	{
		// Per-thread snapshots of id_r and DDD, refilled at the start of every
		// row; allocated once per thread instead of once per row.
		int *p_r = (int*) calloc( (size_t)slots, sizeof(int) );
		int **PPP = (int **) calloc( (size_t)slots, sizeof(int*) );
		int *r, *r1, *p; // private pointers: current row, previous row, recycled buffer
		int i, j, k, l, id_c; // id_c: latest column of this row where Nseq matched ai
		int temp, swap, up, diag, left; // left: insert, up: delete, diag: match
		int ai, bj, cmp;
		int currSingal, RSingleID, SSingleID;

		if( p_r == NULL || PPP == NULL )
		{
			#pragma omp atomic write
			allocFailed = 1;
		}
		// After the barrier every thread sees the same allocFailed value, so
		// all threads enter the work-sharing loop or none does.
		#pragma omp barrier

		if( !allocFailed )
		{
			#pragma omp for schedule(static,1) ordered
			for( i = 1; i <= m; i++ )
			{
				RSingleID = SSingleID = 0;

				// wait until row i-1 has released the first block
				while (1){
					#pragma omp flush
					#pragma omp atomic read
					currSingal = singal[RSingleID];
					if (currSingal == i-1) break; // row i starts process
				}
				// make the data written by row i-1 before its signal visible here
				#pragma omp flush
				RSingleID += singalW;

				ai = Mseq[i-1];

				p = DDD[ai]; // give the space
				r1 = DDD[ai] = DDD[0]; // the previous row is now filed under symbol ai
				r = DDD[0] = p; // new current row, written into the recycled buffer

				// make a copy: later rows modify id_r and DDD while this row still runs
				for( j = 0; j < slots; j++ )
				{
					p_r[j] = id_r[j];
					PPP[j] = DDD[j];
				}

				id_r[ai] = i; // map alphabeta to row id
				id_c = 0;     // map alphabeta to column id

				diag = i-1; // diag
				r[0] = temp = i; // left

				// process each column
				for( j = 1 ; j <= n; j++ )
				{
					if( j == RSingleID )
					{
						// entering a new block: wait until row i-1 has released it
						while (1){
							#pragma omp flush
							#pragma omp atomic read
							currSingal = singal[RSingleID];
							if (currSingal == i-1) break;
						}
						#pragma omp flush
						RSingleID += singalW;
					}

					bj = Nseq[j-1];
					k = p_r[bj]; l = id_c; // id_r[bj], id_r may changed, so we use the local p_r

					cmp = 0;
					if( ai == bj )
					{
						cmp = 1;
						id_c = j;
					}
					up = r1[j] +1; // up
					left = temp +1;
					diag += 1-cmp;
					swap = PPP[bj][l-1]+ (i- k -1) + 1 + (j - l - 1);
					r[j] = temp = least( up, left, diag, swap) ;

					diag = up-1; // value of the previous row at column j, i.e. next column's diagonal

					// release every block that lies entirely at or before column id_c-2
					while( SSingleID + singalW-1 <= id_c-2 )
					{
						#pragma omp flush
						#pragma omp atomic update
						singal[SSingleID]++;
						#pragma omp flush
						SSingleID += singalW;
					}
				}

				// Row finished: release all remaining blocks. Done after the
				// column loop so that it also happens when n == 0.
				while( SSingleID <= n )
				{
					#pragma omp flush
					#pragma omp atomic update
					singal[SSingleID]++;
					#pragma omp flush
					SSingleID += singalW;
				}
			}
		}

		free( p_r );
		free( PPP );
	}

	if( !allocFailed )
		result = DDD[0][n];

cleanup:
	if( DDDSP != NULL )
	{
		// DDD entries get swapped during the run; DDDSP keeps the original base pointers
		for( a = 0; a < slots; a++ )
			free( DDDSP[a] );
	}
	free( DDDSP );
	free( DDD );
	free( id_r );
	free( singal );
	return result;
}

// Sets the OpenMP thread count from num_thread and returns
// calculateDLScore( Mseq, Nseq, m, n ).
// omp_set_num_threads() requires a positive argument (anything else is
// implementation defined), so a non-positive num_thread is clamped to 1.
int calculateDLScore_parallel (int *Mseq, int *Nseq, int m, int n)
{
	omp_set_num_threads( num_thread > 0 ? num_thread : 1 );
	return calculateDLScore( Mseq, Nseq, m, n);
}
