// stripDL: strip-wise DL distance score algorithm
// Input: sequences Mseq and Nseq, with lengths m and n

#include <stdlib.h>

#define singalW 1000

// Smaller of two values. Every argument and the whole expansion are
// parenthesized so that operator precedence inside an argument cannot change
// the result. NOTE: arguments are still evaluated more than once, so do not
// pass expressions with side effects.
#define min(a,b) ((a) < (b) ? (a) : (b))
// Smallest of four values, built from pairwise min().
#define least(a,b,c,d) (min( min((a),(b)), min((c),(d)) ))

extern int num_thread;
extern int alphabetaSize;
extern int*  alpha;

// calculateDLScore_ST_ByStrip: score one column strip of the DL distance
// matrix (single-threaded variant).
//
//   Mseq    : row sequence, m symbols.
//   Nseq    : column symbols of this strip, n symbols.
//   n1      : number of columns that precede this strip (column shift).
//   VVV     : per-symbol boundary arrays indexed by row. VVV[0][i] holds the
//             last column of the previous strip on entry and receives the last
//             column of this strip on exit. VVV[c][k-1] holds the values the
//             swap (transposition) term reads when the matching column lies
//             in an earlier strip. Index -1 is touched while a symbol has not
//             been seen yet (row id 0), so each VVV[c] needs one usable
//             element in front of index 0.
//   n_id_c  : latest column id per symbol; updated in place to cover this strip.
//   maxdist : filler value for cells that have not been computed.
//
// Symbols are used directly as indices into arrays of alphabetaSize + 1
// entries (presumably values 0..alphabetaSize; verify against the caller).
// All working buffers are released before returning.
void calculateDLScore_ST_ByStrip( int *Mseq, int *Nseq, int m, int n, int n1, int **VVV, int* n_id_c, int maxdist)
{
	int **DDDSP; // owning base pointer of every row buffer (never reordered, used for free)
	int **DDD, *id_r, *p_id_c, id_c; // DDD: latest row per symbol, id_r: symbol -> latest row id, id_c: latest column id of the current row symbol
	int i, j, k, l;
	int temp, swap, up, diag, left; // left: insert, up: delete, diag: match/substitute, swap: transposition
	int ai, bj, cmp;
	int *r, *r1, *p; // r: current row, r1: previous row
	int savedleft;

	// Keep the column ids of the previous strips; n_id_c is overwritten below.
	p_id_c = (int*)calloc((alphabetaSize + 1), sizeof(int));
	for (i = 0; i < alphabetaSize + 1; i++)
	{
		p_id_c[i] = n_id_c[i];
	}
	// Record the last column of every symbol that occurs in this strip.
	for (j = 1; j <= n; j++)
	{
		bj = Nseq[j - 1];
		n_id_c[bj] = j + n1;
	}

	// One row buffer of n + 2 cells per symbol; DDD[i] points one cell past
	// the base so that index -1 is valid.
	DDDSP = (int **)calloc((alphabetaSize + 1), sizeof(int*));
	DDD = (int **)calloc((alphabetaSize + 1), sizeof(int*));
	for (i = 0; i < alphabetaSize + 1; i++)
	{
		DDDSP[i] = (int*)calloc((n + 2), sizeof(int));
		DDD[i] = DDDSP[i] + 1;

		r = DDDSP[i];
		for (j = 0; j < n + 2; j++)
		{
			r[j] = maxdist;
		}
	}

	id_r = (int*)calloc((alphabetaSize + 1), sizeof(int));

	// Row 0 of the strip: distance from the empty prefix.
	r = DDD[0];
	for (j = 0; j <= n; j++)
	{
		r[j] = j + n1;
	}
	VVV[0][0] = n + n1;

	for (i = 1; i <= m; i++)
	{
		ai = Mseq[i - 1];

		// Rotate buffers: the previous row is stored under DDD[ai], and the
		// buffer that was stored there (the previous row of symbol ai) becomes
		// the current row, still holding its old values.
		p = DDD[ai];
		DDD[ai] = r1 = r;
		r = p;
		id_c = p_id_c[ai];

		savedleft = r[0];
		diag = r1[0];
		r[0] = temp = VVV[0][i]; // boundary value handed over by the previous strip

		for (j = 1; j <= n; j++)
		{
			bj = Nseq[j - 1];
			k = id_r[bj]; l = id_c;

			cmp = 0;
			if (ai == bj)
			{
				cmp = 1;
				id_c = j + n1;
			}
			up = r1[j] + 1;
			left = temp + 1;
			diag += 1 - cmp;
			// The transposition source is read from VVV when the matching column
			// lies in an earlier strip, otherwise from the stored row of bj.
			swap = (l < n1 + 1 ? VVV[ai][k - 1] : DDD[bj][l - n1 - 1]) + (i - k - 1) + 1 + (j + n1 - l - 1);
			temp = least(up, left, diag, swap);

			// Before the old cell is overwritten, export it to VVV if this is
			// the last column of bj within the strip.
			k = id_r[ai];
			if (n_id_c[bj] == j + n1)
			{
				VVV[bj][k - 1] = savedleft;
			}
			savedleft = r[j];
			r[j] = temp;
			diag = up - 1;
		}

		VVV[0][i] = temp; // last column of this row, consumed by the next strip
		id_r[ai] = i;
	}

	// Export, for every symbol pair, the stored row value at the last column
	// of bj in this strip.
	for (i = 0; i < alphabetaSize; i++)
	{
		ai = alpha[i];
		k = id_r[ai];
		for (j = 0; j < alphabetaSize; j++)
		{
			bj = alpha[j];
			l = n_id_c[bj];
			if (l > n1)
			{
				VVV[bj][k - 1] = DDD[ai][l - n1 - 1];
			}
		}
	}

	// Release the working memory. The entries of DDD were permuted above, so
	// the rows are freed through the untouched base pointers in DDDSP.
	for (i = 0; i < alphabetaSize + 1; i++)
	{
		free(DDDSP[i]);
	}
	free(DDDSP);
	free(DDD);
	free(id_r);
	free(p_id_c);
}

// calculateDLScore_ST: single-threaded forward score. Walks the columns of
// Nseq in strips of stripW columns and lets calculateDLScore_ST_ByStrip fill
// VVV strip by strip. Returns VVV[0][m] as left by the last strip.
int calculateDLScore_ST (int *Mseq, int *Nseq, int m, int n, int stripW, int** VVV, int *id_c)
{
	const int maxdist = m + n + 1;
	int *firstCol = VVV[0];
	int row;
	int left, right; // 1-based first and last column of the current strip

	// Column 0 of the matrix: distance from the empty prefix of Nseq.
	for (row = 0; row <= m; row++)
	{
		firstCol[row] = row;
	}

	left = 1;
	while (left <= n)
	{
		right = left + stripW - 1;
		if (right > n)
		{
			right = n;
		}
		calculateDLScore_ST_ByStrip(Mseq, Nseq + left - 1, m, right - left + 1, left - 1, VVV, id_c, maxdist);
		left += stripW;
	}

	return VVV[0][m];
}



// calculateDLScoreReverse_ST_ByStrip: score one column strip of the DL
// distance matrix of the reversed sequences (single-threaded variant).
//
// Same recurrence as the forward strip routine, except that Mseq is read from
// its end (Mseq[m-i]) and the strip's n symbols are read from their end
// (Nseq[n-j]).
//
//   n1      : number of (reversed) columns that precede this strip.
//   VVV     : per-symbol boundary arrays indexed by row. VVV[0][i] is the
//             hand-over column between strips. Index -1 of each VVV[c] is
//             touched while a symbol has row id 0, so it must be usable.
//   n_id_c  : latest column id per symbol; updated in place.
//   maxdist : filler value for cells that have not been computed.
//
// Returns VVV[0][m], the last cell of this strip. The function was declared
// int but previously fell off its end without returning a value.
// All working buffers are released before returning.
int calculateDLScoreReverse_ST_ByStrip( int *Mseq, int *Nseq, int m, int n, int n1, int **VVV, int* n_id_c, int maxdist)
{
	int **DDDSP; // owning base pointer of every row buffer (never reordered, used for free)
	int **DDD, *id_r, *p_id_c, id_c; // DDD: latest row per symbol, id_r: symbol -> latest row id, id_c: latest column id of the current row symbol
	int i, j, k, l;
	int temp, swap, up, diag, left; // left: insert, up: delete, diag: match/substitute, swap: transposition
	int ai, bj, cmp;
	int *r, *r1, *p; // r: current row, r1: previous row
	int savedleft;

	// Keep the column ids of the previous strips; n_id_c is overwritten below.
	p_id_c = (int*) calloc ( (alphabetaSize+1), sizeof(int) );
	for (i = 0; i < alphabetaSize + 1; i++)
	{
		p_id_c[i] = n_id_c[i];
	}
	// Record the last (reversed) column of every symbol in this strip.
	for (j = 1; j <= n; j++)
	{
		bj = Nseq[n - j];
		n_id_c[bj] = j + n1;
	}

	// One row buffer of n + 2 cells per symbol; DDD[i] points one cell past
	// the base so that index -1 is valid.
	DDDSP = (int **)calloc((alphabetaSize + 1), sizeof(int*));
	DDD = (int **)calloc((alphabetaSize + 1), sizeof(int*));
	for (i = 0; i < alphabetaSize + 1; i++)
	{
		DDDSP[i] = (int*)calloc((n + 2), sizeof(int));
		DDD[i] = DDDSP[i] + 1;

		r = DDDSP[i];
		for (j = 0; j < n + 2; j++)
		{
			r[j] = maxdist;
		}
	}
	id_r = (int*) calloc ( (alphabetaSize+1), sizeof(int) );

	// Row 0 of the strip.
	r = DDD[0];
	for (j = 0; j <= n; j++)
	{
		r[j] = j + n1;
	}
	VVV[0][0] = n + n1;

	for (i = 1; i <= m; i++)
	{
		ai = Mseq[m - i];

		// Rotate buffers: previous row is stored under DDD[ai]; the buffer that
		// was stored there becomes the current row, still holding old values.
		p = DDD[ai];
		DDD[ai] = r1 = r;
		r = p;
		id_c = p_id_c[ai];

		savedleft = r[0];
		diag = r1[0];
		r[0] = temp = VVV[0][i]; // boundary value handed over by the previous strip

		for (j = 1; j <= n; j++)
		{
			bj = Nseq[n - j];
			k = id_r[bj]; l = id_c;
			cmp = 0;
			if (ai == bj)
			{
				cmp = 1;
				id_c = j + n1;
			}

			up = r1[j] + 1;
			left = temp + 1;
			diag += 1 - cmp;
			// Transposition source: VVV when the matching column lies in an
			// earlier strip, otherwise the stored row of bj.
			swap = (l < n1 + 1 ? VVV[ai][k - 1] : DDD[bj][l - n1 - 1]) + (i - k - 1) + 1 + (j + n1 - l - 1);
			temp = least(up, left, diag, swap);

			// Export the old cell to VVV before it is overwritten, if this is
			// the last column of bj within the strip.
			k = id_r[ai];
			if (n_id_c[bj] == j + n1)
			{
				VVV[bj][k - 1] = savedleft;
			}
			savedleft = r[j];
			r[j] = temp;
			diag = up - 1;
		}

		VVV[0][i] = temp; // last column of this row, consumed by the next strip
		id_r[ai] = i;
	}

	for (i = 0; i < alphabetaSize; i++)
	{
		ai = alpha[i];
		k = id_r[ai];
		for (j = 0; j < alphabetaSize; j++)
		{
			bj = alpha[j];
			l = n_id_c[bj];
			if (l > n1)
			{
				VVV[bj][k - 1] = DDD[ai][l - n1 - 1];
			}
		}
	}

	// Release the working memory. The entries of DDD were permuted above, so
	// the rows are freed through the untouched base pointers in DDDSP.
	for (i = 0; i < alphabetaSize + 1; i++)
	{
		free(DDDSP[i]);
	}
	free(DDDSP);
	free(DDD);
	free(id_r);
	free(p_id_c);

	return VVV[0][m];
}
 
// calculateDLScoreReverse_ST: single-threaded reverse score. Walks Nseq from
// its end in strips of stripW columns and lets
// calculateDLScoreReverse_ST_ByStrip fill VVV strip by strip.
// Returns VVV[0][m] as left by the last strip.
int calculateDLScoreReverse_ST (int *Mseq, int *Nseq, int m, int n, int stripW, int** VVV, int *n_id_c)
{
	const int maxdist = m + n + 1;
	int *firstCol = VVV[0];
	int row;
	int left, right; // 1-based first and last reversed column of the current strip

	// Column 0 of the matrix: distance from the empty prefix.
	for (row = 0; row <= m; row++)
	{
		firstCol[row] = row;
	}

	left = 1;
	while (left <= n)
	{
		right = left + stripW - 1;
		if (right > n)
		{
			right = n;
		}
		// Reversed columns left..right correspond to the slice starting at
		// Nseq + (n - right) in the original orientation.
		calculateDLScoreReverse_ST_ByStrip(Mseq, Nseq + (n - right), m, right - left + 1, left - 1, VVV, n_id_c, maxdist);
		left += stripW;
	}

	return VVV[0][m];
}

// calculateDLTrace_ST: single-threaded divide-and-conquer trace.
//
// Splits the column sequence at n/2, scores the left half forward and the
// right half in reverse, picks the split point with the smallest combined
// score (either one cell, or two cells joined by a swap), records it in path
// and recurses on the two remaining sub-problems.
//
//   Mseq, m   : row sequence and its length.
//   Nseq, n   : column sequence and its length. When m < n the roles are
//               swapped and transpose is flipped.
//   stripW    : strip width forwarded to the score routines.
//   path      : output; the entry at index (row + column) of a chosen cell is
//               written. NOTE(review): from the index arithmetic the stored
//               value looks like the cell's coordinate along the caller's
//               original first sequence in both orientations - confirm.
//   pi, pj    : offsets of this sub-problem within the full problem.
//   transpose : non-zero when Mseq/Nseq are swapped relative to the caller.
//
// Fixes over the previous version: the m < n branch no longer uses
// "return <void expression>", and all score tables and id arrays are freed,
// including the per-symbol id_r array.
void calculateDLTrace_ST(int *Mseq, int *Nseq, int m, int n, int stripW, int* path, int pi, int pj, int transpose)
{
	int i,j, k, l, l2, lbj;
	int **VVV1, **VVV2, *id_c1, *id_c2, *id_r;
	int **VVVSP1, **VVVSP2, *r;
	int minScore, temp, maxdist;
	int ai, bj;
	int si1, sj1, si2, sj2;

	if (m < n)
	{
		// Keep the longer sequence on the row axis.
		calculateDLTrace_ST( Nseq, Mseq, n, m, stripW, path, pj, pi, 1 - transpose);
		return;
	}
	if (n == 0)
	{
		// Nothing to record.
		return;
	}
	if (n == 1)
	{
		// Single column: look for the first row whose symbol matches it.
		bj = (int) Nseq[1-1];
		for (i = 1; i <= m; i++)
		{
			temp = (bj == Mseq[i-1]) ? 1 : 0;
			if (temp)
			{
				break;
			}
		}
		if (i > m)
		{
			i = m; // no match found: use the last row
		}

		if (transpose)
		{
			if (i > 1) { path[pj+pi+i-1] = pj; }
			if (i < m) { path[pj + 1 + pi + i] = pj+1; }
		}
		else
		{
			if (i > 1) { path[pi + i - 1+pj] = pi+i-1; }
			if (i < m) { path[pi + i+pj+1] = pi+i; }
		}
		return;
	}

	sj1 = sj2 = n/2;
	maxdist = m+n+1;

	// Score tables: one array of m + 2 cells per symbol, exposed with an
	// offset of one so that index -1 is valid.
	VVVSP1 = (int **)calloc((alphabetaSize + 1), sizeof(int*));
	VVV1 = (int **)calloc((alphabetaSize + 1), sizeof(int*));
	for (i = 0; i < alphabetaSize + 1; i++)
	{
		VVVSP1[i] = (int*)calloc((m + 2), sizeof(int));
		VVV1[i] = VVVSP1[i] + 1;

		r = VVVSP1[i];
		for (j = 0; j < m + 2; j++)
		{
			r[j] = maxdist;
		}
	}

	VVVSP2 = (int **)calloc((alphabetaSize + 1), sizeof(int*));
	VVV2 = (int **)calloc((alphabetaSize + 1), sizeof(int*));
	for (i = 0; i < alphabetaSize + 1; i++)
	{
		VVVSP2[i] = (int*)calloc((m + 2), sizeof(int));
		VVV2[i] = VVVSP2[i] + 1;

		r = VVVSP2[i];
		for (j = 0; j < m + 2; j++)
		{
			r[j] = maxdist;
		}
	}

	id_c1 = (int*) calloc( alphabetaSize+1, sizeof(int));
	id_c2 = (int*) calloc( alphabetaSize+1, sizeof(int));
	calculateDLScore_ST( Mseq, Nseq, m, sj1, stripW, VVV1, id_c1);
	calculateDLScoreReverse_ST( Mseq, Nseq+sj1, m, n-sj1, stripW, VVV2, id_c2);

	// Best split that goes through a single cell of the middle column.
	si1 = si2 = 0;
	minScore = VVV1[0][0] + VVV2[0][m];
	for (i = 1; i <= m; i++)
	{
		temp = VVV1[0][i] + VVV2[0][m-i];
		if (temp < minScore)
		{
			minScore = temp;
			si1 = si2 = i;
		}
	}
	for (j = 0; j < alphabetaSize; j++)
	{
		bj = alpha[j];
		VVV1[bj][0] = id_c1[bj] - 1;
	}

	// Splits that cross the middle column with a swap.
	for (j = 0; j < alphabetaSize; j++)
	{
		bj = alpha[j];
		lbj = n + 1 - id_c2[bj];
		if (lbj > n) // symbol does not occur in the right half
			continue;

		id_r = (int*)calloc(alphabetaSize + 1, sizeof(int));
		i = 1;
		ai = Mseq[0];
		id_r[ai] = i;

		for (i = 2; i <= m; i++)
		{
			ai = Mseq[i-1];
			k = id_r[bj];
			l = id_c1[ai];
			l2 = n + 1 - id_c2[ai];

			if (ai != bj && k && l && l2 >= lbj)
			{
				temp = VVV1[ai][k-1] + (i-k-1) + 1 + (lbj-l-1) + VVV2[bj][m-i];
				if (temp < minScore)
				{
					minScore = temp;
					si1 = k-1;
					si2 = i;
					sj1 = l-1;
					sj2 = lbj;
				}
			}
			id_r[ai] = i;
		}
		free(id_r); // allocated once per symbol; release it before the next one
	}

	// The tables are no longer needed; free them before recursing so that
	// the recursion does not keep one set of tables alive per level.
	for (i = 0; i < alphabetaSize + 1; i++)
	{
		free(VVVSP1[i]);
		free(VVVSP2[i]);
	}
	free(VVVSP1);
	free(VVV1);
	free(VVVSP2);
	free(VVV2);
	free(id_c1);
	free(id_c2);

	// Record the split point(s).
	if (transpose)
	{
		path[pj + sj1 + pi + si1] = pj+sj1;
		if (si1 != si2)
		{
			path[pj + sj2 + pi + si2] = pj+sj2;
		}
	}
	else
	{
		path[pi + si1 + pj + sj1] = pi+si1;
		if (si1 != si2)
		{
			path[pi + si2 + pj + sj2] = pi+si2;
		}
	}

	calculateDLTrace_ST( Mseq, Nseq, si1, sj1, stripW, path, pi, pj, transpose);
	calculateDLTrace_ST( Mseq+si2, Nseq+sj2, m-si2, n-sj2, stripW, path, pi+si2, pj+sj2, transpose);
}


/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////


// calculateDLScore_ByStrip: score one column strip of the DL distance matrix
// (OpenMP pipelined variant; several strips run concurrently).
//
//   Mseq    : row sequence, m symbols.
//   Nseq    : column symbols of this strip, n symbols.
//   n1      : number of columns that precede this strip (column shift).
//   VVV     : per-symbol boundary arrays indexed by row, shared by all strips.
//             VVV[0][i] is the hand-over column between neighbouring strips.
//             Index -1 of each VVV[c] is touched while a symbol has row id 0,
//             so it must be usable.
//   c_id_c  : shared latest-column id per symbol; copied, then updated in place.
//   maxdist : filler value for cells that have not been computed.
//   stripID : 0-based index of this strip.
//   singal  : m + 1 progress counters, one used per block of singalW rows.
//             A strip starts the block beginning at row b only once
//             singal[b] == stripID, and increments singal[b] once it no longer
//             writes VVV entries belonging to that block.
//
// Fixes over the previous version:
//   * "#pragma atomic write" had no "omp" and was ignored, and "write" does not
//     describe an increment; the counter is now bumped under
//     "#pragma omp atomic update".
//   * "(k& lr > k)" parsed as "k & (lr > k)", so even row ids were never taken
//     into account; it is now "k && lr > k" (smallest non-zero row id).
//   * flush(singal) only flushes the pointer variable, not the data; plain
//     flushes are used so the VVV contents travel with the counter.
//   * all working buffers are freed.
void calculateDLScore_ByStrip( int *Mseq, int *Nseq, int m, int n, int n1, int **VVV, int* c_id_c, int maxdist, int stripID, int* singal)
{
	int **DDDSP; // owning base pointer of every row buffer (never reordered, used for free)
	int **DDD, *id_r, *p_id_c, *n_id_c, id_c; // p_id_c: column ids before this strip, n_id_c: column ids including this strip
	int i, j, k, l;
	int temp, swap, up, diag, left; // left: insert, up: delete, diag: match/substitute, swap: transposition
	int ai, bj, cmp;
	int *r, *r1, *p; // r: current row, r1: previous row
	int savedleft;
	int currSingal, RSingleID, SSingleID; // RSingleID: next block to wait for, SSingleID: next block to release
	int lr; // earliest row this strip may still write VVV entries for

	// Wait until the previous strip has released the first block of rows.
	RSingleID = SSingleID = 0;
	while (1)
	{
		#pragma omp flush
		#pragma omp atomic read
		currSingal = singal[RSingleID];
		if (currSingal == stripID) break;
	}
	#pragma omp flush
	RSingleID += singalW;

	// Private snapshot of the column ids left by the previous strips.
	p_id_c = (int*) calloc ( (alphabetaSize+1), sizeof(int) );
	for (i = 0; i < alphabetaSize + 1; i++)
	{
		p_id_c[i] = c_id_c[i];
	}
	// Publish the last column of every symbol of this strip ...
	for (j = 1; j <= n; j++)
	{
		bj = Nseq[j-1];
		c_id_c[bj] = j + n1;
	}
	// ... and keep a private copy, because later strips overwrite c_id_c.
	n_id_c = (int*) calloc ( (alphabetaSize+1), sizeof(int) );
	for (i = 0; i < alphabetaSize + 1; i++)
	{
		n_id_c[i] = c_id_c[i];
	}

	// One row buffer of n + 2 cells per symbol; DDD[i] points one cell past
	// the base so that index -1 is valid.
	DDDSP = (int **)calloc((alphabetaSize + 1), sizeof(int*));
	DDD = (int **)calloc((alphabetaSize + 1), sizeof(int*));
	for (i = 0; i < alphabetaSize + 1; i++)
	{
		DDDSP[i] = (int*)calloc((n + 2), sizeof(int));
		DDD[i] = DDDSP[i] + 1;

		r = DDDSP[i];
		for (j = 0; j < n + 2; j++)
		{
			r[j] = maxdist;
		}
	}

	id_r = (int*) calloc ( (alphabetaSize+1), sizeof(int) );

	// Row 0 of the strip.
	r = DDD[0];
	for (j = 0; j <= n; j++)
	{
		r[j] = j + n1;
	}
	VVV[0][0] = n + n1;

	for (i = 1; i <= m; i++)
	{
		if (i == RSingleID)
		{
			// Entering a new block of rows: wait for the previous strip.
			while (1)
			{
				#pragma omp flush
				#pragma omp atomic read
				currSingal = singal[RSingleID];
				if (currSingal == stripID) break;
			}
			#pragma omp flush
			RSingleID += singalW;
		}

		ai = Mseq[i-1];

		// Rotate buffers: previous row is stored under DDD[ai]; the buffer that
		// was stored there becomes the current row, still holding old values.
		p = DDD[ai];
		DDD[ai] = r1 = r;
		r = p;
		id_c = p_id_c[ai];

		savedleft = r[0];
		diag = r1[0];
		r[0] = temp = VVV[0][i]; // boundary value handed over by the previous strip

		for (j = 1; j <= n; j++)
		{
			bj = Nseq[j-1];
			k = id_r[bj]; l = id_c;
			cmp = 0;
			if (ai == bj)
			{
				cmp = 1;
				id_c = j + n1;
			}
			up = r1[j] + 1;
			left = temp + 1;
			diag += 1 - cmp;
			// Transposition source: VVV when the matching column lies in an
			// earlier strip, otherwise the stored row of bj.
			swap = (l < n1 + 1 ? VVV[ai][k-1] : DDD[bj][l-n1-1]) + (i - k - 1) + 1 + (j + n1 - l - 1);
			temp = least(up, left, diag, swap);

			// Export the old cell to VVV before it is overwritten, if this is
			// the last column of bj within the strip.
			k = id_r[ai];
			if (n_id_c[bj] == j + n1)
			{
				VVV[bj][k-1] = savedleft;
			}
			savedleft = r[j];
			r[j] = temp;
			diag = up - 1;
		}

		// At block boundaries, find the smallest non-zero "latest row" over all
		// symbols; VVV entries of rows before it are no longer rewritten.
		lr = 0;
		if (i % singalW == 0)
		{
			lr = i;
			for (j = 0; j < alphabetaSize; j++)
			{
				bj = alpha[j];
				k = id_r[bj];
				if (k && lr > k)
				{
					lr = k;
				}
			}
		}

		VVV[0][i] = temp; // last column of this row, consumed by the next strip
		id_r[ai] = i;

		// Release every block that lies completely before row lr - 1.
		while (SSingleID + singalW - 1 <= lr - 2)
		{
			#pragma omp flush
			#pragma omp atomic update
			singal[SSingleID]++;
			#pragma omp flush
			SSingleID += singalW;
		}
	}

	for (i = 0; i < alphabetaSize; i++)
	{
		ai = alpha[i];
		k = id_r[ai];
		for (j = 0; j < alphabetaSize; j++)
		{
			bj = alpha[j];
			l = n_id_c[bj];
			if (l > n1)
			{
				VVV[bj][k-1] = DDD[ai][l-n1-1];
			}
		}
	}

	// The strip is complete: release all remaining blocks.
	while (SSingleID <= m)
	{
		#pragma omp flush
		#pragma omp atomic update
		singal[SSingleID]++;
		#pragma omp flush
		SSingleID += singalW;
	}

	// Release the working memory. The entries of DDD were permuted above, so
	// the rows are freed through the untouched base pointers in DDDSP.
	for (i = 0; i < alphabetaSize + 1; i++)
	{
		free(DDDSP[i]);
	}
	free(DDDSP);
	free(DDD);
	free(id_r);
	free(p_id_c);
	free(n_id_c);
}

// calculateDLScore: forward score, strips pipelined across OpenMP threads.
//
// Splits the n columns of Nseq into strips of stripW columns and runs
// calculateDLScore_ByStrip for each strip in a parallel loop with
// schedule(static,1); the strips synchronise among themselves through the
// singal progress counters. Returns VVV[0][m] after all strips finished.
//
// stripW must be positive (it is used as a divisor).
// The counter array is now freed before returning (it used to leak).
int calculateDLScore (int *Mseq, int *Nseq, int m, int n, int stripW, int** VVV, int *id_c)
{
	int i;
	int n1, n2; // 1-based first and last column of a strip
	int *r, maxdist;
	int maxStripID, stripID;
	int *singal; // one progress counter per row, zero-initialised

	maxdist = m+n+1;
	singal = (int *) calloc( (m+1), sizeof(int));

	// Column 0 of the matrix: distance from the empty prefix of Nseq.
	r = VVV[0];
	#pragma omp parallel for schedule(static) private(i)
	for (i = 0; i < m+1; i++)
	{
		r[i] = i;
	}

	// Number of strips, rounded up.
	maxStripID = n / stripW;
	if (maxStripID * stripW < n)
		maxStripID++;

	#pragma omp parallel for schedule(static,1) ordered private(stripID, n1, n2)
	for (stripID = 0; stripID < maxStripID; stripID++)
	{
		n1 = stripID * stripW + 1;
		n2 = stripID * stripW + 1 + stripW - 1;
		if (n2 > n)
			n2 = n;
		calculateDLScore_ByStrip(Mseq, Nseq+n1-1, m, n2-n1+1, n1-1, VVV, id_c, maxdist, stripID, singal);
	}

	free(singal);
	return VVV[0][m];
}



// calculateDLScoreReverse_ByStrip: score one column strip of the DL distance
// matrix of the reversed sequences (OpenMP pipelined variant).
//
// Same recurrence and synchronisation as the forward pipelined strip routine,
// except that Mseq is read from its end (Mseq[m-i]) and the strip's n symbols
// are read from their end (Nseq[n-j]).
//
//   n1      : number of (reversed) columns that precede this strip.
//   VVV     : per-symbol boundary arrays indexed by row, shared by all strips.
//             Index -1 of each VVV[c] is touched while a symbol has row id 0.
//   c_id_c  : shared latest-column id per symbol; copied, then updated in place.
//   maxdist : filler value for cells that have not been computed.
//   stripID : 0-based index of this strip.
//   singal  : m + 1 progress counters, one used per block of singalW rows.
//             A strip starts the block beginning at row b only once
//             singal[b] == stripID, and increments singal[b] once it no longer
//             writes VVV entries belonging to that block.
//
// Returns this strip's value of VVV[0][m], read before the last block is
// released to the next strip. The function was declared int but previously
// had no return statement.
//
// Fixes over the previous version:
//   * "#pragma atomic write" had no "omp" and was ignored, and "write" does not
//     describe an increment; now "#pragma omp atomic update".
//   * "(k& lr > k)" parsed as "k & (lr > k)"; now "k && lr > k".
//   * flush(singal) only flushes the pointer variable; plain flushes are used.
//   * all working buffers are freed.
int calculateDLScoreReverse_ByStrip( int *Mseq, int *Nseq, int m, int n, int n1, int **VVV,  int* c_id_c, int maxdist, int stripID, int* singal)
{
	int **DDDSP; // owning base pointer of every row buffer (never reordered, used for free)
	int **DDD, *id_r, *p_id_c, *n_id_c, id_c; // p_id_c: column ids before this strip, n_id_c: column ids including this strip
	int i, j, k, l;
	int temp, swap, up, diag, left; // left: insert, up: delete, diag: match/substitute, swap: transposition
	int ai, bj, cmp;
	int *r, *r1, *p; // r: current row, r1: previous row
	int savedleft;
	int lr; // earliest row this strip may still write VVV entries for
	int currSingal, RSingleID, SSingleID; // RSingleID: next block to wait for, SSingleID: next block to release
	int score;

	// Wait until the previous strip has released the first block of rows.
	RSingleID = SSingleID = 0;
	while (1)
	{
		#pragma omp flush
		#pragma omp atomic read
		currSingal = singal[RSingleID];
		if (currSingal == stripID) break;
	}
	#pragma omp flush
	RSingleID += singalW;

	// Private snapshot of the column ids left by the previous strips.
	p_id_c = (int*) calloc ( (alphabetaSize+1), sizeof(int) );
	for (i = 0; i < alphabetaSize + 1; i++)
	{
		p_id_c[i] = c_id_c[i];
	}
	// Publish the last (reversed) column of every symbol of this strip ...
	for (j = 1; j <= n; j++)
	{
		bj = Nseq[n-j];
		c_id_c[bj] = j + n1;
	}
	// ... and keep a private copy, because later strips overwrite c_id_c.
	n_id_c = (int*) calloc ( (alphabetaSize+1), sizeof(int) );
	for (i = 0; i < alphabetaSize + 1; i++)
	{
		n_id_c[i] = c_id_c[i];
	}

	// One row buffer of n + 2 cells per symbol; DDD[i] points one cell past
	// the base so that index -1 is valid.
	DDDSP = (int **)calloc((alphabetaSize + 1), sizeof(int*));
	DDD = (int **)calloc((alphabetaSize + 1), sizeof(int*));
	for (i = 0; i < alphabetaSize + 1; i++)
	{
		DDDSP[i] = (int*)calloc((n + 2), sizeof(int));
		DDD[i] = DDDSP[i] + 1;

		r = DDDSP[i];
		for (j = 0; j < n + 2; j++)
		{
			r[j] = maxdist;
		}
	}
	id_r = (int*) calloc ( (alphabetaSize+1), sizeof(int) );

	// Row 0 of the strip.
	r = DDD[0];
	for (j = 0; j <= n; j++)
	{
		r[j] = j + n1;
	}
	VVV[0][0] = n + n1;

	for (i = 1; i <= m; i++)
	{
		if (i == RSingleID)
		{
			// Entering a new block of rows: wait for the previous strip.
			while (1)
			{
				#pragma omp flush
				#pragma omp atomic read
				currSingal = singal[RSingleID];
				if (currSingal == stripID) break;
			}
			#pragma omp flush
			RSingleID += singalW;
		}

		ai = Mseq[m-i];

		// Rotate buffers: previous row is stored under DDD[ai]; the buffer that
		// was stored there becomes the current row, still holding old values.
		p = DDD[ai];
		DDD[ai] = r1 = r;
		r = p;
		id_c = p_id_c[ai];

		savedleft = r[0];
		diag = r1[0];
		r[0] = temp = VVV[0][i]; // boundary value handed over by the previous strip

		for (j = 1; j <= n; j++)
		{
			bj = Nseq[n-j];
			k = id_r[bj]; l = id_c;
			cmp = 0;
			if (ai == bj)
			{
				cmp = 1;
				id_c = j + n1;
			}

			up = r1[j] + 1;
			left = temp + 1;
			diag += 1 - cmp;
			// Transposition source: VVV when the matching column lies in an
			// earlier strip, otherwise the stored row of bj.
			swap = (l < n1 + 1 ? VVV[ai][k-1] : DDD[bj][l-n1-1]) + (i - k - 1) + 1 + (j + n1 - l - 1);
			temp = least(up, left, diag, swap);

			// Export the old cell to VVV before it is overwritten, if this is
			// the last column of bj within the strip.
			k = id_r[ai];
			if (n_id_c[bj] == j + n1)
			{
				VVV[bj][k-1] = savedleft;
			}
			savedleft = r[j];
			r[j] = temp;
			diag = up - 1;
		}

		// At block boundaries, find the smallest non-zero "latest row" over all
		// symbols; VVV entries of rows before it are no longer rewritten.
		lr = 0;
		if (i % singalW == 0)
		{
			lr = i;
			for (j = 0; j < alphabetaSize; j++)
			{
				bj = alpha[j];
				k = id_r[bj];
				if (k && lr > k)
				{
					lr = k;
				}
			}
		}

		VVV[0][i] = temp; // last column of this row, consumed by the next strip
		id_r[ai] = i;

		// Release every block that lies completely before row lr - 1.
		while (SSingleID + singalW - 1 <= lr - 2)
		{
			#pragma omp flush
			#pragma omp atomic update
			singal[SSingleID]++;
			#pragma omp flush
			SSingleID += singalW;
		}
	}

	for (i = 0; i < alphabetaSize; i++)
	{
		ai = alpha[i];
		k = id_r[ai];
		for (j = 0; j < alphabetaSize; j++)
		{
			bj = alpha[j];
			l = n_id_c[bj];
			if (l > n1)
			{
				VVV[bj][k-1] = DDD[ai][l-n1-1];
			}
		}
	}

	// Read the result while the block containing row m is still held by this
	// strip; once released, the next strip may overwrite VVV[0][m].
	score = VVV[0][m];

	// The strip is complete: release all remaining blocks.
	while (SSingleID <= m)
	{
		#pragma omp flush
		#pragma omp atomic update
		singal[SSingleID]++;
		#pragma omp flush
		SSingleID += singalW;
	}

	// Release the working memory. The entries of DDD were permuted above, so
	// the rows are freed through the untouched base pointers in DDDSP.
	for (i = 0; i < alphabetaSize + 1; i++)
	{
		free(DDDSP[i]);
	}
	free(DDDSP);
	free(DDD);
	free(id_r);
	free(p_id_c);
	free(n_id_c);

	return score;
}
 
// calculateDLScoreReverse: reverse score, strips pipelined across OpenMP
// threads.
//
// Splits the n columns of Nseq, taken from the end, into strips of stripW
// columns and runs calculateDLScoreReverse_ByStrip for each strip in a
// parallel loop with schedule(static,1); the strips synchronise among
// themselves through the singal progress counters.
// Returns VVV[0][m] after all strips finished.
//
// stripW must be positive (it is used as a divisor).
// The counter array is now freed before returning (it used to leak).
int calculateDLScoreReverse (int *Mseq, int *Nseq, int m, int n, int stripW, int** VVV, int *n_id_c)
{
	int i;
	int n1, n2; // 1-based first and last reversed column of a strip
	int *r, maxdist;
	int maxStripID, stripID;
	int *singal; // one progress counter per row, zero-initialised

	maxdist = m+n+1;
	singal = (int *) calloc( (m+1), sizeof(int));

	// Column 0 of the matrix: distance from the empty prefix.
	r = VVV[0];
	#pragma omp parallel for schedule(static) private(i)
	for (i = 0; i < m+1; i++)
	{
		r[i] = i;
	}

	// Number of strips, rounded up.
	maxStripID = n / stripW;
	if (maxStripID * stripW < n)
		maxStripID++;

	#pragma omp parallel for schedule(static,1) ordered private(stripID, n1, n2)
	for (stripID = 0; stripID < maxStripID; stripID++)
	{
		n1 = stripID * stripW + 1;
		n2 = stripID * stripW + 1 + stripW - 1;
		if (n2 > n)
			n2 = n;
		// Reversed columns n1..n2 correspond to the slice starting at
		// Nseq + (n - n2) in the original orientation.
		calculateDLScoreReverse_ByStrip(Mseq, Nseq+(n-n2), m, n2-n1+1, n1-1, VVV, n_id_c, maxdist, stripID, singal);
	}

	free(singal);
	return VVV[0][m];
}

// calculateDLTrace: divide-and-conquer trace using the pipelined score
// routines.
//
// Splits the column sequence at n/2, scores the left half forward and the
// right half in reverse, picks the split point with the smallest combined
// score (either one cell, or two cells joined by a swap), records it in path
// and recurses on the two remaining sub-problems.
//
//   Mseq, m   : row sequence and its length.
//   Nseq, n   : column sequence and its length. When m < n the roles are
//               swapped and transpose is flipped.
//   stripW    : strip width forwarded to the score routines.
//   path      : output; the entry at index (row + column) of a chosen cell is
//               written. NOTE(review): from the index arithmetic the stored
//               value looks like the cell's coordinate along the caller's
//               original first sequence in both orientations - confirm.
//   pi, pj    : offsets of this sub-problem within the full problem.
//   transpose : non-zero when Mseq/Nseq are swapped relative to the caller.
//   level     : recursion depth. Up to level 5 the two halves are handled one
//               after the other by this function; deeper levels run the two
//               halves as OpenMP sections using calculateDLTrace_ST.
//
// Returns the best combined score found at this level, or 0 for the trivial
// cases n == 0 and n == 1.
//
// All score tables and id arrays are now freed (they used to leak on every
// call, and id_r once per alphabet symbol).
int calculateDLTrace(int *Mseq, int *Nseq, int m, int n, int stripW, int* path, int pi, int pj, int transpose, int level)
{
	int i,j, k, l, l2, lbj;
	int **VVV1, **VVV2, *id_c1, *id_c2, *id_r;
	int **VVVSP1, **VVVSP2, *r;
	int minScore, temp, maxdist;
	int ai, bj;
	int si1, sj1, si2, sj2;

	if (m < n)
	{
		// Keep the longer sequence on the row axis.
		return calculateDLTrace( Nseq, Mseq, n, m, stripW, path, pj, pi, 1 - transpose, level);
	}
	if (n == 0)
	{
		// Nothing to record.
		return 0;
	}
	if (n == 1)
	{
		// Single column: look for the first row whose symbol matches it.
		bj = (int) Nseq[1-1];
		for (i = 1; i <= m; i++)
		{
			temp = (bj == Mseq[i-1]) ? 1 : 0;
			if (temp)
			{
				break;
			}
		}
		if (i > m)
		{
			i = m; // no match found: use the last row
		}

		if (transpose)
		{
			if (i > 1) { path[pj+pi+i-1] = pj; }
			if (i < m) { path[pj + 1 + pi + i] = pj+1; }
		}
		else
		{
			if (i > 1) { path[pi + i - 1+pj] = pi+i-1; }
			if (i < m) { path[pi + i+pj+1] = pi+i; }
		}
		return 0;
	}

	sj1 = sj2 = n/2;
	maxdist = m+n+1;

	// Score tables: one array of m + 2 cells per symbol, exposed with an
	// offset of one so that index -1 is valid.
	VVVSP1 = (int **)calloc((alphabetaSize + 1), sizeof(int*));
	VVV1 = (int **)calloc((alphabetaSize + 1), sizeof(int*));
	for (i = 0; i < alphabetaSize + 1; i++)
	{
		VVVSP1[i] = (int*)calloc((m + 2), sizeof(int));
		VVV1[i] = VVVSP1[i] + 1;

		r = VVVSP1[i];
		for (j = 0; j < m + 2; j++)
		{
			r[j] = maxdist;
		}
	}

	VVVSP2 = (int **)calloc((alphabetaSize + 1), sizeof(int*));
	VVV2 = (int **)calloc((alphabetaSize + 1), sizeof(int*));
	for (i = 0; i < alphabetaSize + 1; i++)
	{
		VVVSP2[i] = (int*)calloc((m + 2), sizeof(int));
		VVV2[i] = VVVSP2[i] + 1;

		r = VVVSP2[i];
		for (j = 0; j < m + 2; j++)
		{
			r[j] = maxdist;
		}
	}

	id_c1 = (int*) calloc( alphabetaSize+1, sizeof(int));
	id_c2 = (int*) calloc( alphabetaSize+1, sizeof(int));
	calculateDLScore( Mseq, Nseq, m, sj1, stripW, VVV1, id_c1);
	calculateDLScoreReverse( Mseq, Nseq+sj1, m, n-sj1, stripW, VVV2, id_c2);

	// Best split that goes through a single cell of the middle column.
	si1 = si2 = 0;
	minScore = VVV1[0][0] + VVV2[0][m];
	for (i = 1; i <= m; i++)
	{
		temp = VVV1[0][i] + VVV2[0][m-i];
		if (temp < minScore)
		{
			minScore = temp;
			si1 = si2 = i;
		}
	}
	for (j = 0; j < alphabetaSize; j++)
	{
		bj = alpha[j];
		VVV1[bj][0] = id_c1[bj] - 1;
	}

	// Splits that cross the middle column with a swap.
	for (j = 0; j < alphabetaSize; j++)
	{
		bj = alpha[j];
		lbj = n + 1 - id_c2[bj];
		if (lbj > n) // symbol does not occur in the right half
			continue;

		id_r = (int*)calloc(alphabetaSize + 1, sizeof(int));
		i = 1;
		ai = Mseq[0];
		id_r[ai] = i;

		for (i = 2; i <= m; i++)
		{
			ai = Mseq[i-1];
			k = id_r[bj];
			l = id_c1[ai];
			l2 = n + 1 - id_c2[ai];

			if (ai != bj && k && l && l2 >= lbj)
			{
				temp = VVV1[ai][k-1] + (i-k-1) + 1 + (lbj-l-1) + VVV2[bj][m-i];
				if (temp < minScore)
				{
					minScore = temp;
					si1 = k-1;
					si2 = i;
					sj1 = l-1;
					sj2 = lbj;
				}
			}
			id_r[ai] = i;
		}
		free(id_r); // allocated once per symbol; release it before the next one
	}

	// The tables are no longer needed; free them before recursing so that
	// the recursion does not keep one set of tables alive per level.
	for (i = 0; i < alphabetaSize + 1; i++)
	{
		free(VVVSP1[i]);
		free(VVVSP2[i]);
	}
	free(VVVSP1);
	free(VVV1);
	free(VVVSP2);
	free(VVV2);
	free(id_c1);
	free(id_c2);

	// Record the split point(s).
	if (transpose)
	{
		path[pj + sj1 + pi + si1] = pj+sj1;
		if (si1 != si2)
		{
			path[pj + sj2 + pi + si2] = pj+sj2;
		}
	}
	else
	{
		path[pi + si1 + pj + sj1] = pi+si1;
		if (si1 != si2)
		{
			path[pi + si2 + pj + sj2] = pi+si2;
		}
	}

	if (level <= 5)
	{
		// Shallow levels: one half after the other, each using the pipelined
		// score routines internally.
		calculateDLTrace( Mseq, Nseq, si1, sj1, stripW, path, pi, pj, transpose, level+1);
		calculateDLTrace( Mseq+si2, Nseq+sj2, m-si2, n-sj2, stripW, path, pi+si2, pj+sj2, transpose, level+1);
	}
	else
	{
		// Deeper levels: the two halves run as parallel sections, each with
		// the single-threaded trace.
		#pragma omp parallel sections
		{
			#pragma omp section
			{
				calculateDLTrace_ST( Mseq, Nseq, si1, sj1, stripW, path, pi, pj, transpose);
			}
			#pragma omp section
			{
				calculateDLTrace_ST( Mseq+si2, Nseq+sj2, m-si2, n-sj2, stripW, path, pi+si2, pj+sj2, transpose);
			}
		}
	}

	return minScore;
}

// calculateDLTrace_parallel: entry point of the parallel trace. Enables nested
// OpenMP parallelism, sets the thread count to num_thread and starts
// calculateDLTrace at recursion level 1. Returns whatever that call returns.
int calculateDLTrace_parallel(int *Mseq, int *Nseq, int m, int n, int stripW, int* path, int pi, int pj, int transpose)
{
	int traceScore;

	omp_set_nested(1);
	omp_set_num_threads( num_thread );

	traceScore = calculateDLTrace(Mseq, Nseq, m, n, stripW, path, pi, pj, transpose, 1);
	return traceScore;
}