C======================================================================
C
C    File: /home/ama-iris/g/fortranm/Berna-Mesh/NX/mesh_lib.F
C
C    Last Modified By: Rajit Manohar (rajit@vlsi.cs.caltech.edu)
C    Modification Date: Thu Jul 21 04:36:30 PDT 1994
C    Modification: Converted to P4.
C
C    Original Author: Berna Massingill (berna@vlsi.cs.caltech.edu)
C    Date: Thu Jul 21 03:11:27 PDT 1994
C    Info: MESH TEMPLATE --- 2-D Version.
C
C=======================================================================

C=======================================================================
C=======================================================================
C
C	main programs
C
C=======================================================================
C=======================================================================

C=======================================================================
C
C	main program -- define and connect ports, start processes
C
C=======================================================================

	program MAIN

	include 'mesh_uparms.h'
	include 'mesh_parms.h'
	include 'mesh_common.h'
	integer p4myid, p4ntotids

	logical error

	call p4init
        if (p4myid() .eq. 0) call p4crpg

C=======safety checks

        if (p4ntotids() .ne. NPROCS) then
           print *, 'SORRY: PROGRAM ONLY WORKS FOR ', NPROCS,
     -          ' PROCESSES.'
           stop
        endif

C	check for invalid combinations of dimensions and
C	process-grid dimensions

	error = .FALSE.

	if (NX - (XPROCS-1)*NXlsize .le. 0) then
		print*, 'invalid combination of NX, XPROCS'
		error = .TRUE.
	endif
	if (NY - (YPROCS-1)*NYlsize .le. 0) then
		print*, 'invalid combination of NY, YPROCS'
		error = .TRUE.
	endif
	if (error) STOP

	call pxpypz (p4myid()+1, iptype, ipx, ipy)
        if (iptype .eq. GRIDTYPE) then
           call mesh_grid_proc(ipx,ipy)
        else
           call mesh_host_proc
        endif
	call p4cleanup
	end

C=======================================================================
C
C	main program for host process -- copy arguments into
C		common block
C
C=======================================================================

        subroutine mesh_host_proc

	include 'mesh_uparms.h'
	include 'mesh_parms.h'
	include 'mesh_common.h'

	iproctype = HOSTTYPE
	iprocx = 0
	iprocy = 0

	NXlocal = 0
	NYlocal = 0

	call hostmain

	return
	end

C=======================================================================
C
C	main program for grid process -- copy arguments into
C		common block
C
C=======================================================================

        subroutine mesh_grid_proc(a_iprocx, a_iprocy)

	include 'mesh_uparms.h'
	include 'mesh_parms.h'
	include 'mesh_common.h'

	integer a_iprocx, a_iprocy

	iproctype = GRIDTYPE
	iprocx = a_iprocx
	iprocy = a_iprocy

	NXlocal = NXlsize
	IF (iprocx .EQ. XPROCS) NXlocal = NX-(XPROCS-1)*NXlsize
	NYlocal = NYlsize
	IF (iprocy .EQ. YPROCS) NYlocal = NY-(YPROCS-1)*NYlsize

	call gridmain

	return
	end

C=======================================================================
C=======================================================================
C
C	subroutines to redistribute data
C
C=======================================================================
C=======================================================================

C=======================================================================
C
C	subroutines to redistribute array from host to grid
C
C=======================================================================

C-----------------------------------------------------------------------
C
C	subroutine mesh_HtoG_host(host_array)
C	subroutine mesh_HtoG_grid(grid_array)
C
C	precondition:
C		D is a distributed array of size NX by NY.
C		in host process, host_array = D.
C		host process has called mesh_HtoG_host(host_array) and
C			all grid processes have called
C			 mesh_HtoG_grid(grid_array).
C	postcondition:
C		in each grid process, grid_array = local section of D.
C		(NOTE that the ghost boundaries of grid_array are not
C			guaranteed to be correct -- REVISE THIS?)
C
C-----------------------------------------------------------------------

	subroutine mesh_HtoG_host(host_array)

	include 'mesh_uparms.h'
	include 'mesh_parms.h'
	include 'mesh_common.h'

	real host_array(1:NX, 1:NY)

	real buff(IXLO:IXHI, IYLO:IYHI)

	isize = (IXHI-IXLO+1)*(IYHI-IYLO+1)*REAL_SIZE
	do ipx = 1, XPROCS
	do ipy = 1, YPROCS

C		copy data into buffer
		do iiloc = 1, NXlsize
		call iglobal(ipx, iiloc, iiglob)
		if (iiglob .le. NX) then
			do jjloc = 1, NYlsize
			call jglobal(ipy, jjloc, jjglob)
			if (jjglob .le. NY) then
				buff(iiloc, jjloc) =
     -					host_array(iiglob, jjglob)
			endif
			enddo
		endif
		enddo
C	send data to grid process
        idest = iprocnum(GRIDTYPE, ipx, ipy)-1
	call p4send (MSG_HTOGFLT_TAG, idest, buff, isize,irc)
	enddo
	enddo

	return
	end

C-----------------------------------------------------------------------

	subroutine mesh_HtoG_grid(grid_array)

	include 'mesh_uparms.h'
	include 'mesh_parms.h'
	include 'mesh_common.h'

	real grid_array(IXLO:IXHI, IYLO:IYHI)

	isize = (IXHI-IXLO+1)*(IYHI-IYLO+1)*REAL_SIZE
        ilen = isize
        ifrom = -1
        call p4recv (MSG_HTOGFLT_TAG,ifrom,grid_array,isize,ilen,irc)

	return
	end

C=======================================================================
C
C	subroutines to redistribute array from grid to host
C
C=======================================================================

C-----------------------------------------------------------------------
C
C	subroutine mesh_GtoH_host(host_array)
C	subroutine mesh_GtoH_grid(grid_array)
C
C	precondition:
C		D is a distributed array of size NX by NY.
C		in each grid process, grid_array = local section of D.
C		host process has called mesh_GtoH_host(host_array) and
C			all grid processes have called
C			mesh_GtoH_grid(grid_array).
C	postcondition:
C		in host process, host_array = D.
C
C-----------------------------------------------------------------------

	subroutine mesh_GtoH_host(host_array)

	include 'mesh_uparms.h'
	include 'mesh_parms.h'
	include 'mesh_common.h'
	real host_array(1:NX, 1:NY)

	real buff(IXLO:IXHI, IYLO:IYHI)

	do ipx = 1, XPROCS
	do ipy = 1, YPROCS

C		receive data from grid process
                inode = iprocnum (GRIDTYPE, ipx, ipy)-1
               	isize = (IXHI-IXLO+1)*(IYHI-IYLO+1)*REAL_SIZE
                ilen = isize
                call p4recv (MSG_GTOHFLT_TAG,inode,buff,isize,ilen,irc)
C		copy data from buffer
		do iiloc = 1, NXlsize
		call iglobal(ipx, iiloc, iiglob)
		if (iiglob .le. NX) then
			do jjloc = 1, NYlsize
			call jglobal(ipy, jjloc, jjglob)
			if (jjglob .le. NY) then
     				host_array(iiglob, jjglob) =
     -					buff(iiloc, jjloc)
			endif
			enddo
		endif
		enddo

	enddo
	enddo

	return
	end

C-----------------------------------------------------------------------

	subroutine mesh_GtoH_grid(grid_array)

	include 'mesh_uparms.h'
	include 'mesh_parms.h'
	include 'mesh_common.h'

	real grid_array(IXLO:IXHI, IYLO:IYHI)

	isize = (IXHI-IXLO+1)*(IYHI-IYLO+1)*REAL_SIZE
        idest = iprocnum (HOSTTYPE, 0, 0)-1
        call p4send (MSG_GTOHFLT_TAG, 0, grid_array, isize, irc)
	return
	end

C=======================================================================
C=======================================================================
C
C	subroutine to update ghost boundaries
C
C=======================================================================
C=======================================================================

C-----------------------------------------------------------------------
C
	subroutine mesh_update_bdry(grid_array)
C
C	precondition:
C		D is a distributed array of size NX by NY.
C		in each grid process, grid_array = local section of D.
C		all grid processes have called
C			mesh_update_bdry(grid_array).
C	postcondition:
C		in each grid process, ghost boundaries of grid_array
C			are "correct", including corners but excluding
C			external boundaries.
C		(SAY THIS MORE CLEARLY?)
C
C-----------------------------------------------------------------------

	include 'mesh_uparms.h'
	include 'mesh_parms.h'
	include 'mesh_common.h'

	real grid_array(IXLO:IXHI, IYLO:IYHI)

C--	REVISE THIS??
C	here we take the most simple and straightforward approach --
C	separate buffers for each communication axis and a lot of
C	copying -- no doubt there are more efficient approaches.

	real xbuffer(NGHOST, IYLO:IYHI)
	real ybuffer(IXLO:IXHI, NGHOST)

	integer XBUFFSIZE, YBUFFSIZE
	parameter (XBUFFSIZE = (NGHOST) * (IYHI-IYLO+1) * REAL_SIZE)
	parameter (YBUFFSIZE = (IXHI-IXLO+1) * (NGHOST) * REAL_SIZE)

	if (iproctype .eq. GRIDTYPE) then

C	update x-low boundaries
	if (iprocx .lt. XPROCS) then
		do mm = 1, NGHOST
		do jj = 1, NYlocal
			xbuffer(mm,jj) =
     -				grid_array(NXlsize+1-mm,jj)
		enddo
		enddo
                idest = iprocnum (GRIDTYPE, iprocx+1, iprocy)-1
                call p4send (MSG_XCH_XHI_TAG,idest,xbuffer, XBUFFSIZE,
     -               irc)
	endif
	if (iprocx .gt. 1) then
                ilen = XBUFFSIZE
                ifrom = -1
                call p4recv (MSG_XCH_XHI_TAG,ifrom,xbuffer,XBUFFSIZE,
     -          ilen,irc)
		do mm = 1, NGHOST
		do jj = 1, NYlocal
			grid_array(1-mm,jj) = xbuffer(mm,jj)
		enddo
		enddo
	endif

C	update x-high boundaries
	if (iprocx .gt. 1) then
		do mm = 1, NGHOST
		do jj = 1, NYlocal
			xbuffer(mm,jj) = grid_array(mm,jj)
		enddo
		enddo
                idest = iprocnum (GRIDTYPE, iprocx-1, iprocy)-1
                call p4send (MSG_XCH_XLOW_TAG,idest,xbuffer,XBUFFSIZE,
     -               irc)
	endif
	if (iprocx .lt. XPROCS) then
                ilen = XBUFFSIZE
                ifrom = -1
                call p4recv (MSG_XCH_XLOW_TAG,ifrom,xbuffer,XBUFFSIZE,
     -          ilen, irc)
		do mm = 1, NGHOST
		do jj = 1, NYlocal
			grid_array(NXlsize+mm,jj) = xbuffer(mm,jj)
		enddo
		enddo
	endif

C	update y-low boundaries, including x bdry data just updated
	if (iprocy .lt. YPROCS) then
		do mm = 1, NGHOST
		do ii = 1-NGHOST, NXlocal+NGHOST
			ybuffer(ii,mm) = grid_array(ii,NYlsize+1-mm)
		enddo
		enddo
                idest = iprocnum (GRIDTYPE, iprocx, iprocy+1)-1
                call p4send (MSG_XCH_YHI_TAG,idest,ybuffer, YBUFFSIZE,
     -               0)
	endif
	if (iprocy .gt. 1) then
                ilen = YBUFFSIZE
                ifrom = -1
                call p4recv (MSG_XCH_YHI_TAG,ifrom,ybuffer, YBUFFSIZE,
     -          ilen,irc)
		do mm = 1, NGHOST
		do ii = 1-NGHOST, NXlocal+NGHOST
			grid_array(ii,1-mm) = ybuffer(ii,mm)
		enddo
		enddo
	endif

C	update y-high boundaries, including x bdry data just updated
	if (iprocy .gt. 1) then
		do mm = 1, NGHOST
		do ii = 1-NGHOST, NXlocal+NGHOST
			ybuffer(ii,mm) = grid_array(ii,mm)
		enddo
		enddo
                idest = iprocnum (GRIDTYPE, iprocx, iprocy-1)-1
                call p4send (MSG_XCH_YLOW_TAG,idest,ybuffer,YBUFFSIZE,
     -               irc)
	endif
	if (iprocy .lt. YPROCS) then
                ilen = YBUFFSIZE
                ifrom = -1
                call p4recv (MSG_XCH_YLOW_TAG,ifrom,ybuffer, YBUFFSIZE,
     -          ilen,irc)
		do mm = 1, NGHOST
		do ii = 1-NGHOST, NXlocal+NGHOST
			grid_array(ii,NYlsize+mm) = ybuffer(ii,mm)
		enddo
		enddo
	endif

C	end of 'if gridtype'
	endif

	return
	end


C=======================================================================
C=======================================================================
C
C     MODIFIED BOUNDARY EXCHANGE: DOES EXCHANGE OF GLOBAL BOUNDARIES.
C
C=======================================================================
C=======================================================================
C#######################################################################
C
C    Modification By: Rajit Manohar (rajit@vlsi.cs.caltech.edu)
C    On: Wed Jul 20 23:55:26 PDT 1994
C
C
C       Different from usual mesh update boundary, since it exchanges
C       even the global boundaries. Needed for periodic boundary
C       conditions.
C#######################################################################
C-----------------------------------------------------------------------
C
	subroutine mesh_update_bdry_all(grid_array)
C
C	precondition:
C		D is a distributed array of size NX by NY.
C		in each grid process, grid_array = local section of D.
C		all grid processes have called
C			mesh_update_bdry(grid_array).
C	postcondition:
C		in each grid process, ghost boundaries of grid_array
C			are "correct", including corners and including
C			external boundaries.
C		(SAY THIS MORE CLEARLY?)
C
C-----------------------------------------------------------------------

	include 'mesh_uparms.h'
	include 'mesh_parms.h'
	include 'mesh_common.h'

	real grid_array(IXLO:IXHI, IYLO:IYHI)

C--	REVISE THIS??
C	here we take the most simple and straightforward approach --
C	separate buffers for each communication axis and a lot of
C	copying -- no doubt there are more efficient approaches.

	real xbuffer(NGHOST, IYLO:IYHI)
	real ybuffer(IXLO:IXHI, NGHOST)

	integer XBUFFSIZE, YBUFFSIZE
	parameter (XBUFFSIZE = (NGHOST) * (IYHI-IYLO+1) * REAL_SIZE)
	parameter (YBUFFSIZE = (IXHI-IXLO+1) * (NGHOST) * REAL_SIZE)

	if (iproctype .eq. GRIDTYPE) then

C	update x-low boundaries
	   do mm = 1, NGHOST
	      do jj = 1, NYlocal
		 xbuffer(mm,jj) =
     -                grid_array(NXlsize+1-mm,jj)
              enddo
           enddo
           idest = iprocnum (GRIDTYPE, mod(iprocx,XPROCS)+1, iprocy)-1
           call p4send (MSG_XCH_XHI_TAG,idest,xbuffer, XBUFFSIZE,irc)
           ilen = XBUFFSIZE
           ifrom = -1
           call p4recv (MSG_XCH_XHI_TAG,ifrom, xbuffer, XBUFFSIZE,
     -          ilen,irc)
           do mm = 1, NGHOST
              do jj = 1, NYlocal
                 grid_array(1-mm,jj) = xbuffer(mm,jj)
              enddo
           enddo

C	update x-high boundaries
           do mm = 1, NGHOST
              do jj = 1, NYlocal
                 xbuffer(mm,jj) = grid_array(mm,jj)
              enddo
           enddo
           idest = iprocnum (GRIDTYPE,
     -          mod(iprocx+XPROCS-2, XPROCS)+1, iprocy)-1
           call p4send (MSG_XCH_XLOW_TAG,idest,xbuffer, XBUFFSIZE,irc)
           ilen = XBUFFSIZE
           ifrom = -1
           call p4recv(MSG_XCH_XLOW_TAG,ifrom,xbuffer,XBUFFSIZE,
     -          ilen,irc)
           do mm = 1, NGHOST
              do jj = 1, NYlocal
                 grid_array(NXlsize+mm,jj) = xbuffer(mm,jj)
              enddo
           enddo

C	update y-low boundaries, including x bdry data just updated
           do mm = 1, NGHOST
              do ii = 1-NGHOST, NXlocal+NGHOST
                 ybuffer(ii,mm) = grid_array(ii,NYlsize+1-mm)
              enddo
           enddo
           idest = iprocnum (GRIDTYPE, iprocx,mod(iprocy,YPROCS)+1)-1
           call p4send (MSG_XCH_YHI_TAG, idest, ybuffer, YBUFFSIZE,irc)
           ilen = YBUFFSIZE
           ifrom = -1
           call p4recv(MSG_XCH_YHI_TAG,ifrom,ybuffer,YBUFFSIZE,
     -          ilen,irc)
           do mm = 1, NGHOST
              do ii = 1-NGHOST, NXlocal+NGHOST
                 grid_array(ii,1-mm) = ybuffer(ii,mm)
              enddo
           enddo

C	update y-high boundaries, including x bdry data just updated
           do mm = 1, NGHOST
              do ii = 1-NGHOST, NXlocal+NGHOST
                 ybuffer(ii,mm) = grid_array(ii,mm)
              enddo
           enddo
           idest = iprocnum (GRIDTYPE, iprocx,
     -          mod(iprocy+YPROCS-2,YPROCS)+1)-1
           call p4send (MSG_XCH_YLOW_TAG, idest, ybuffer, YBUFFSIZE,irc)
           ilen = YBUFFSIZE
           ifrom = -1
           call p4recv (MSG_XCH_YLOW_TAG,ifrom,ybuffer,YBUFFSIZE,
     -          ilen,irc)
           do mm = 1, NGHOST
              do ii = 1-NGHOST, NXlocal+NGHOST
                 grid_array(ii,NYlsize+mm) = ybuffer(ii,mm)
              enddo
           enddo

C	end of 'if gridtype'
	endif
	return
	end

C=======================================================================
C=======================================================================
C
C	global-reduction subroutines
C
C=======================================================================
C=======================================================================

C=======================================================================
C
C	subroutine to get integer maximum
C
C=======================================================================

C-----------------------------------------------------------------------
C
	subroutine mesh_merge_int_max(isize, int_in, int_out)
C
C	precondition:
C		int_in and int_out are integer arrays of size isize,
C			with int_in[P] denoting the value of int_in
C			in grid process P.
C		all processes (host and grid) have called
C			mesh_merge_int_max(isize, int_in, int_out).
C	postcondition:
C		in all processes (host and grid), int_out is the
C			(elementwise) maximum over int_in[P], where
C			P ranges over all grid (not host) processes.
C		in host process (but not in grid processes),
C			the values in int_in are undetermined.
C		(THAT IS -- in grid processes, int_in is an input-only
C			argument whose value is not changed, but in
C			the host process, int_in is used as work space.)
C
C-----------------------------------------------------------------------

	include 'mesh_uparms.h'
	include 'mesh_parms.h'
	include 'mesh_common.h'
	integer isize
	integer int_in(isize), int_out(isize)

	if (iproctype .eq. GRIDTYPE) then
                call p4send (MSG_GTOHINT_TAG, 0,int_in,
     -			isize*INTEGER_SIZE,irc)
                ilen = isize*INTEGER_SIZE
                ifrom = 0
                call p4recv (MSG_HTOGINT_TAG,ifrom,int_out,
     -			isize*INTEGER_SIZE,ilen,irc)
	else
C		compute maxima into int_out in host process
		ihost = iprocnum(HOSTTYPE, 0, 0, 0)
		do mm = 1, isize
			int_out(mm) = 0
		enddo

C		loop over processes in the same order as the sequential
C		version
		do iproc = 1, NPROCS
		if (iproc .ne. ihost) then
                        ilen = isize*INTEGER_SIZE
                        ifrom = iproc-1
                        call p4recv(MSG_GTOHINT_TAG,ifrom, int_in,
     -                       isize*INTEGER_SIZE,ilen, irc)
			do mm = 1, isize
				int_out(mm) = max(int_out(mm),int_in(mm))
			enddo
		endif
		enddo

C		send back to grid processes
		do ii = 1, XPROCS
		do jj = 1, YPROCS
                        idest = iprocnum(GRIDTYPE, ii, jj)-1
                        call p4send (MSG_HTOGINT_TAG,idest,int_out,
     -                       isize*INTEGER_SIZE,irc)
		enddo
		enddo
	endif

	return
	end

C=======================================================================
C
C	subroutine to get element with greatest absolute value (real)
C
C=======================================================================

C-----------------------------------------------------------------------
C
	subroutine mesh_merge_real_maxabs(isize, real_in, real_out)
C
C	precondition:
C		real_in and real_out are real arrays of size isize,
C			with real_in[P] denoting the value of real_in
C			in grid process P.
C		all processes (host and grid) have called
C			mesh_merge_real_maxabs(isize, real_in, real_out).
C	postcondition:
C		in all processes (host and grid), real_out is the
C			(elementwise) maximum absolute value over
C			real_in[P], where P ranges over all grid
C			(not host) processes.
C		in host process (but not in grid processes),
C			the values in real_in are undetermined.
C		(THAT IS -- in grid processes, real_in is an input-only
C			argument whose value is not changed, but in
C			the host process, real_in is used as work space.)
C
C-----------------------------------------------------------------------

	include 'mesh_uparms.h'
	include 'mesh_parms.h'
	include 'mesh_common.h'
	integer isize
	real real_in(isize), real_out(isize)

	if (iproctype .eq. GRIDTYPE) then
                call p4send (MSG_GTOHFLT_TAG,0,real_in,
     -			isize*REAL_SIZE,irc)
                ilen = isize*REAL_SIZE
                ifrom = 0
                call p4recv (MSG_HTOGFLT_TAG,ifrom,real_out,
     -			isize*REAL_SIZE,ilen,irc)
	else
C		compute maxima into real_out in host process
		ihost = iprocnum(HOSTTYPE, 0, 0, 0)
		do mm = 1, isize
			real_out(mm) = 0.0
		enddo
	
C		loop over processes in the same order as the sequential
C		version
		do iproc = 1, NPROCS
		if (iproc .ne. ihost) then
                        ilen = isize*REAL_SIZE
                        ifrom = iproc-1
                        call p4recv (MSG_GTOHFLT_TAG,ifrom, real_in,
     -                       isize*REAL_SIZE, ilen, irc)
			do mm = 1, isize
			if (abs(real_in(mm)) .gt. abs(real_out(mm)))
     -					real_out(mm) = real_in(mm)
			enddo
		endif
		enddo

C		send back to grid processes
		do ii = 1, XPROCS
		do jj = 1, YPROCS
                        idest = iprocnum (GRIDTYPE, ii, jj)-1
                        call p4send (MSG_HTOGFLT_TAG,idest,real_out,
     -                       isize*REAL_SIZE,irc)
		enddo
		enddo
	endif

	return
	end

C=======================================================================
C
C	subroutine to get element with greatest value (real)
C
C=======================================================================

C-----------------------------------------------------------------------
C
	subroutine mesh_merge_real_max(isize, real_in, real_out)
C
C	precondition:
C		real_in and real_out are real arrays of size isize,
C			with real_in[P] denoting the value of real_in
C			in grid process P.
C		all processes (host and grid) have called
C			mesh_merge_real_max(isize, real_in, real_out).
C	postcondition:
C		in all processes (host and grid), real_out is the
C			(elementwise) maximum over real_in[P],
C			where P ranges over all grid (not host)
C			processes.
C		in host process (but not in grid processes),
C			the values in real_in are undetermined.
C		(THAT IS -- in grid processes, real_in is an input-only
C			argument whose value is not changed, but in
C			the host process, real_in is used as work space.)
C
C-----------------------------------------------------------------------

	include 'mesh_uparms.h'
	include 'mesh_parms.h'
	include 'mesh_common.h'
	integer isize
	real real_in(isize), real_out(isize)

	if (iproctype .eq. GRIDTYPE) then
                call p4send (MSG_GTOHFLT_TAG,0,real_in,
     -			 isize*REAL_SIZE,irc)
                ilen = isize*REAL_SIZE
                ifrom = 0
                call p4recv (MSG_HTOGFLT_TAG,ifrom, real_out,
     -			isize*REAL_SIZE, ilen, irc)
	else
C		compute maxima into real_out in host process
		ihost = iprocnum(HOSTTYPE, 0, 0, 0)
		do mm = 1, isize
			real_out(mm) = 0.0
		enddo
	
C		loop over processes in the same order as the sequential
C		version
		do iproc = 1, NPROCS
		if (iproc .ne. ihost) then
                        ilen = isize*REAL_SIZE
                        ifrom = iproc-1
                        call p4recv (MSG_GTOHFLT_TAG,ifrom, real_in,
     -                       isize*REAL_SIZE,ilen, irc)
			do mm = 1, isize
				if (real_in(mm) .gt. real_out(mm))
     -					real_out(mm) = real_in(mm)
			enddo
		endif
		enddo

C		send back to grid processes
		do ii = 1, XPROCS
		do jj = 1, YPROCS
                        idest = iprocnum (GRIDTYPE, ii, jj)-1
                        call p4send (MSG_HTOGFLT_TAG,idest,real_out,
     -                       isize*REAL_SIZE,irc)
		enddo
		enddo
	endif

	return
	end

C=======================================================================
C
C	subroutine to get real sum
C
C=======================================================================

C-----------------------------------------------------------------------
C
	subroutine mesh_merge_real_sum(isize, real_in, real_out)
C
C	precondition:
C		real_in and real_out are real arrays of size isize,
C			with real_in[P] denoting the value of real_in
C			in grid process P.
C		all processes (host and grid) have called
C			mesh_merge_real_sum(isize, real_in, real_out).
C	postcondition:
C		in all processes (host and grid), real_out is the
C			(elementwise) sum[P], where P ranges over all
C			grid (not host) processes.
C		in host process (but not in grid processes),
C			the values in real_in are undetermined.
C		(THAT IS -- in grid processes, real_in is an input-only
C			argument whose value is not changed, but in
C			the host process, real_in is used as work space.)
C
C-----------------------------------------------------------------------

	include 'mesh_uparms.h'
	include 'mesh_parms.h'
	include 'mesh_common.h'
	integer isize
	real real_in(isize), real_out(isize)

	if (iproctype .eq. GRIDTYPE) then
                call p4send (MSG_GTOHFLT_TAG,0,real_in,
     -			isize*REAL_SIZE,irc)
                ilen = isize*REAL_SIZE
                ifrom = 0
                call p4recv (MSG_HTOGFLT_TAG,ifrom, real_out,
     -			isize*REAL_SIZE, ilen, irc)
	else
C		compute sums into real_out in host process
		ihost = iprocnum(HOSTTYPE, 0, 0, 0)
		do mm = 1, isize
			real_out(mm) = 0.0
		enddo
	
C		loop over processes in the same order as the sequential
C		version
		do iproc = 1, NPROCS
		if (iproc .ne. ihost) then
                        ilen = isize*REAL_SIZE
                        ifrom = iproc-1
                        call p4recv (MSG_GTOHFLT_TAG,ifrom, real_in,
     -                 	        isize*REAL_SIZE, ilen, irc)
			do mm = 1, isize
			real_out(mm) = real_out(mm) + real_in(mm)
			enddo
		endif
		enddo

C		send back to grid processes
		do ii = 1, XPROCS
		do jj = 1, YPROCS
                        idest = iprocnum (GRIDTYPE, ii, jj)-1
                        call p4send (MSG_HTOGFLT_TAG,idest,real_out,
     -                       isize*REAL_SIZE,irc)
		enddo
		enddo
	endif

	return
	end

C=======================================================================
C
C	subroutine to synchronize all processes
C
C=======================================================================

C-----------------------------------------------------------------------
C
	subroutine mesh_synch
C
C	precondition:
C		all processes (host and grid) have called mesh_synch.
C	postcondition:
C		all processes (host and grid) have called mesh_synch.
C		(THAT IS -- this is a barrier synchronization.)
C
C-----------------------------------------------------------------------

	include 'mesh_uparms.h'
	include 'mesh_parms.h'
	include 'mesh_common.h'

	integer isynch(1)

	isynch(1) = 0
	call mesh_merge_int_max(1, isynch, idummy)
	return
	end

C=======================================================================
C=======================================================================
C
C	subroutines to broadcast data
C
C=======================================================================
C=======================================================================

C=======================================================================
C
C	subroutine to broadcast integers
C
C=======================================================================

C-----------------------------------------------------------------------
C
	subroutine mesh_bcast_int(isize, int_array)
C
C	precondition:
C		int_array is an integer array of size isize, with
C			int_array[P] denoting the value of int_array
C			in grid process P and int_array[0] denoting
C			the value of int_array in the host process.
C		all processes (host and grid) have called
C			mesh_bcast_int(isize, int_array).
C	postcondition:
C		in all processes (host and grid), int_array has the
C			same value as int_array[0].
C
C-----------------------------------------------------------------------

	include 'mesh_uparms.h'
	include 'mesh_parms.h'
	include 'mesh_common.h'
	integer isize
	integer int_array(isize)

	if (iproctype .eq. HOSTTYPE) then
		do ii = 1, XPROCS
		do jj = 1, YPROCS
                        idest = iprocnum(GRIDTYPE, ii, jj)-1
                        call p4send (MSG_HTOGINT_TAG,idest,int_array,
     -                       isize*INTEGER_SIZE,irc)
		enddo
		enddo
	else
                ilen = isize*INTEGER_SIZE
                ifrom = 0
                call p4recv (MSG_HTOGINT_TAG,ifrom,int_array,
     -			isize*INTEGER_SIZE,ilen,irc)
	endif

	return
	end

C-----------------------------------------------------------------------
C
	subroutine mesh_bcast_real(isize, real_array)
C
C	precondition:
C		real_array is an real array of size isize, with
C			real_array[P] denoting the value of real_array
C			in grid process P and real_array[0] denoting
C			the value of real_array in the host process.
C		all processes (host and grid) have called
C			mesh_bcast_real(isize, real_array).
C	postcondition:
C		in all processes (host and grid), real_array has the
C			same value as real_array[0].
C
C-----------------------------------------------------------------------

	include 'mesh_uparms.h'
	include 'mesh_parms.h'
	include 'mesh_common.h'
	integer isize
	real real_array(isize)

	if (iproctype .eq. HOSTTYPE) then
		do ii = 1, XPROCS
		do jj = 1, YPROCS
                        idest = iprocnum (GRIDTYPE, ii, jj)-1
                        call p4send (MSG_HTOGFLT_TAG,idest,real_array,
     -                       isize*REAL_SIZE,irc)
		enddo
		enddo
	else
                ilen = isize*REAL_SIZE
                ifrom = 0
                call p4recv (MSG_HTOGFLT_TAG,ifrom,real_array,
     -			isize*REAL_SIZE,ilen,irc)
	endif

	return
	end

C=======================================================================
C=======================================================================
C
C	utility functions and subroutines
C
C=======================================================================
C=======================================================================

C=======================================================================
C
C	routines to map from process type and process grid coordinates
C		to process number and vice versa
C
C=======================================================================

C-----------------------------------------------------------------------
C
	integer function iprocnum(iptype, ipx, ipy)
C
C	precondition:
C		iptype is process type (HOSTTYPE or GRIDTYPE).
C		ipx, ipy are coordinates of process in process
C			grid (if iptype is GRIDTYPE).
C	postcondition:
C		return value is a single integer denoting process
C			-- unique for each process.
C
C-----------------------------------------------------------------------

	include 'mesh_uparms.h'
	include 'mesh_parms.h'

	if (iptype .eq. HOSTTYPE) then
		iprocnum = 1
	else
		iprocnum = (ipy-1)*XPROCS + ipx + 1
	endif
	return
	end

C-----------------------------------------------------------------------

C#######################################################################
C
C     WARNING!!! Assumed that this subroutine does the following with
C         iproc:
C           iproc = 1   =>  hosttype.
C           2...NPROCS  =>  gridtype.
C
C#######################################################################

C-----------------------------------------------------------------------
C
	subroutine pxpypz(iproc, iptype, ipx, ipy)
C
C	precondition:
C		iproc is a single integer denoting process
C			-- unique for each process.
C	postcondition:
C		iptype is process type (HOSTTYPE or GRIDTYPE)
C		ipx, ipy are coordinates of process in process.
C			grid (if iptype is GRIDTYPE)
C		(i.e., this subroutine is the inverse of function
C			iprocnum above.)
C
C-----------------------------------------------------------------------

	include 'mesh_uparms.h'
	include 'mesh_parms.h'

	if (iproc .eq. 1) then
		iptype = HOSTTYPE
		ipx = 0
		ipy = 0
	else
		iptype = GRIDTYPE
C		compute ipx - 1
		itemp = iproc - 2
		ipxm1 = mod(itemp, XPROCS)
		ipx = ipxm1 + 1
C		compute ipy - 1
		itemp = (iproc - 1 - ipx) / XPROCS
		ipym1 = mod(itemp, YPROCS)
		ipy = ipym1 + 1
	endif
	return
	end

C=======================================================================
C
C	subroutines to map from global to local indices
C
C=======================================================================

C-----------------------------------------------------------------------
C
	subroutine ilocal(ii, ipx, iiloc)
C
C	precondition:
C		ii is a global X index.
C	postcondition:
C		ipx, iiloc are the corresponding X index into the
C			process grid and local X index.
C
C-----------------------------------------------------------------------

	include 'mesh_uparms.h'
	include 'mesh_parms.h'
	include 'mesh_common.h'

	ipx = (ii-1) / NXlsize + 1
	iiloc = mod((ii-1), NXlsize) + 1
	return
	end

C-----------------------------------------------------------------------

C-----------------------------------------------------------------------
C
	subroutine jlocal(jj, ipy, jjloc)
C
C	precondition:
C		jj is a global Y index.
C	postcondition:
C		ipy, jjloc are the corresponding Y index into the
C			process grid and local Y index.
C
C-----------------------------------------------------------------------

	include 'mesh_uparms.h'
	include 'mesh_parms.h'
	include 'mesh_common.h'

	ipy = (jj-1) / NYlsize + 1
	jjloc = mod((jj-1), NYlsize) + 1
	return
	end

C=======================================================================
C
C	subroutines to map from local to global indices
C
C=======================================================================

C-----------------------------------------------------------------------
C
	subroutine iglobal(ipx, iiloc, ii)
C
C	precondition:
C		ipx, iiloc are an X index into the process grid and
C			a local X index.
C	postcondition:
C		ii is the corresponding global X index.
C
C-----------------------------------------------------------------------

	include 'mesh_uparms.h'
	include 'mesh_parms.h'
	include 'mesh_common.h'

	ii = (ipx-1)*NXlsize + iiloc
	return
	end

C-----------------------------------------------------------------------
C
	subroutine jglobal(ipy, jjloc, jj)
C
C	precondition:
C		ipy, jjloc are an Y index into the process grid and
C			a local Y index.
C	postcondition:
C		jj is the corresponding global Y index.
C
C-----------------------------------------------------------------------

	include 'mesh_uparms.h'
	include 'mesh_parms.h'
	include 'mesh_common.h'

	jj = (ipy-1)*NYlsize + jjloc
	return
	end

C=======================================================================
C
C	subroutines to determine intersection of a range of global
C	indices with the local section
C
C=======================================================================

C-----------------------------------------------------------------------
C
	subroutine xintersect(iglob1, iglob2, iloc1, iloc2, lempty)
C
C	precondition:
C		iglob1, iglob2 specify a range of global X indices.
C	postcondition:
C		lempty is .true. if the intersection of this range
C			with the local section is empty, .false.
C			otherwise.
C		iloc1, iloc2 specify the (local indices of the)
C			intersection of iglob1:iglob2
C			with the local section (if lempty is .false.).
C
C-----------------------------------------------------------------------

	include 'mesh_uparms.h'
	include 'mesh_parms.h'
	include 'mesh_common.h'

	logical lempty

C	transform global indices into local indices
	call ilocal(iglob1,ipx1,iloc1)
	call ilocal(iglob2,ipx2,iloc2)
C	is the intersection empty?
	if (ipx1 .gt. iprocx .or. ipx2 .lt. iprocx) then
		lempty = .true.
	else
		lempty = .false.
		IF (ipx1 .LT. iprocx) iloc1=1
		IF (ipx2 .GT. iprocx) iloc2=NXlocal
	endif
	return
	end

C-----------------------------------------------------------------------
C
	subroutine yintersect(jglob1, jglob2, jloc1, jloc2, lempty)
C
C	precondition:
C		jglob1, jglob2 specify a range of global X indices.
C	postcondition:
C		lempty is .true. if the intersection of this range
C			with the local section is empty, .false.
C			otherwise.
C		jloc1, jloc2 specify the (local indices of the)
C			intersection of jglob1:jglob2
C			with the local section (if lempty is .false.).
C
C-----------------------------------------------------------------------

	include 'mesh_uparms.h'
	include 'mesh_parms.h'
	include 'mesh_common.h'

	logical lempty

C	transform global indices into local indices
	call jlocal(jglob1,ipy1,jloc1)
	call jlocal(jglob2,ipy2,jloc2)
C	is the intersection empty?
	if (ipy1 .gt. iprocy .or. ipy2 .lt. iprocy) then
		lempty = .true.
	else
		lempty = .false.
		IF (ipy1 .LT. iprocy) jloc1=1
		IF (ipy2 .GT. iprocy) jloc2=NYlocal
	endif
	return
	end

C=======================================================================
C
C	subroutine to generate filename
C
C=======================================================================

C-----------------------------------------------------------------------
C
	subroutine fname(iptype, ipx, ipy, inname, outname)
C
C	precondition:
C		iptype is process type (HOSTTYPE or GRIDTYPE).
C		ipx, ipy are coordinates of process in process
C			grid (if iptype is GRIDTYPE) -- assumed to
C			be no more than 2 digits each.
C		inname is a character string -- base for file name.
C	postcondition:
C		outname is inname followed by:
C			'host' if iptype is HOSTTYPE,
C			process grid coordinates (ipx, ipy)
C				if iptype is GRIDTYPE.
C
C-----------------------------------------------------------------------

	include 'mesh_uparms.h'
	include 'mesh_parms.h'

	character*(*) inname
	character*(*) outname
	character*8 cbuff

	if (iptype .eq. HOSTTYPE) then
		outname = inname // 'host'
	else
		if (XPROCS .le. 9 .and. YPROCS .le. 9) then
			write (cbuff, '(3i1.1)') ipx,ipy
		else
			write (cbuff, '(3i2.2)') ipx,ipy
		endif
		outname = inname // cbuff
	endif

	return
	end

C=======================================================================
C=======================================================================
