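c=================================================================
c  steady-state heat conduction on a square plate: the 2-D
c  Poisson equation (with the source term s set to zero here,
c  i.e. the Laplace equation) is discretized with second-order
c  central differences and relaxed by Jacobi iteration until the
c  L2 norm of the residual falls below a tolerance or the
c  requested number of iterations is reached.
c
c  the grid is decomposed into horizontal strips, one per MPI
c  process; ghost rows at the strip interfaces are exchanged
c  with MPI_Sendrecv at every iteration.
c
c  build/run sketch (file and command names are illustrative and
c  assume a standard MPI Fortran wrapper such as mpif77/mpifort):
c      mpifort -o jacobi jacobi.f
c      mpirun -np 4 ./jacobi
c=================================================================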
        implicit none
        include 'mpif.h'
        real, dimension (:,:), allocatable :: t,s,r
        real (kind=8) :: timef,t2,t1
        real :: h,xl,yl,tol,h2,aa,sum,global_sum,xx
        integer :: nx,ny,i,j,ij,iter,niter,n_left,npt,nxy
        integer :: ierror,nproc,my_rank,root
        integer, dimension(MPI_STATUS_SIZE) :: istatus
        integer, dimension(:), allocatable :: local_n
        real, dimension(:), allocatable :: local_a
c234567
       call mpi_init(ierror)
       call mpi_comm_size(MPI_COMM_WORLD, nproc, ierror)
       call mpi_comm_rank(MPI_COMM_WORLD, my_rank, ierror)
c
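c  plate dimensions and the convergence tolerance on the
c  L2 norm of the residual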
      xl = 10.
      yl = 10.
      tol = 1.0e-3
c
c  the workload arrays are scatter arguments on every rank,
c  so allocate them on all processes
c
      allocate(local_n(nproc))
      allocate(local_a(nproc))
c
c  the remaining initial set up is done by the my_rank = 0
c  (i.e. master) process only
c
      if(my_rank .eq. 0) then
      print *,'there are ',nproc,' processes'
      print *,'enter number of iterations and h:'
      read(5,*) niter,h
c
c  determine the workload for each processor
c
      nx = xl/h + 1
      ny = nx
c
      print *,'nx=ny= ',nx
c
c
      do i=1,nproc
        local_n(i) = ny/nproc 
      enddo
c
c  if the workload cannot be divided evenly among the processes,
c  give each of the first n_left processes one extra row
c
      if(mod(ny,nproc) .ne. 0) then
        n_left = mod(ny,nproc)
        do i=1,n_left
          local_n(i) = local_n(i) + 1
        enddo
      endif
c
c  starting y-location of each strip (needed whether or not the
c  rows divide evenly)
c
      local_a(1) = 0.0
      do i=2,nproc
        local_a(i) = local_a(i-1) + h*local_n(i-1)
      enddo
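c
c  for example, with h = 0.25 (so ny = 41) on 4 processes,
c  local_n becomes 11, 10, 10, 10 and local_a becomes
c  0.00, 2.75, 5.25, 7.75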

c
      print *,'process #,  local workload,  start location'
      do i=1,nproc
       print *,i-1, local_n(i), local_a(i)
      enddo

      endif

c
c  now all processes work together: broadcast the run parameters
c  and scatter each strip's row count (npt) and start location (aa)
c
      call MPI_Bcast(h,1,MPI_REAL,0,MPI_COMM_WORLD,ierror)
      call MPI_Bcast(niter,1,MPI_INTEGER,0,MPI_COMM_WORLD,ierror)
      call MPI_Scatter(local_n,1,MPI_INTEGER,
     &                 npt,1,MPI_INTEGER,0,MPI_COMM_WORLD,ierror)
      call MPI_Scatter(local_a,1,MPI_REAL,
     &                 aa,1,MPI_REAL,0,MPI_COMM_WORLD,ierror)
c
c  add one ghost row to each end strip and two ghost rows to each
c  interior strip so that neighbouring strips overlap by one row
c
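c  local row layout: row 1 is the south wall on rank 0 and a
c  south ghost row elsewhere; row ny is the north wall on
c  rank nproc-1 and a north ghost row elsewhere
c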
      xl = 10.0
      nx = xl/h + 1

      if( my_rank.eq.0 .or. my_rank.eq.nproc-1) then
          ny = npt + 1
      else
          ny = npt + 2
      endif
c
c
      allocate (t(nx,ny))
      allocate (s(nx,ny))
      allocate (r(nx,ny))
c
c      print *,'I am process ',my_rank,' nx,ny= ',nx,ny
c      print *,' aa= ',aa
c      print *,' h = ',h
c      print *,' niter= ',niter
c
      h2 = h*h
      nxy = nx*ny
c
c
c   initial guess for t, zero source term s, zero residual r
c
      do j=1,ny
       do i=1,nx
       t(i,j) = 20. 
       s(i,j) = 0.0
       r(i,j) = 0.0
       enddo
      enddo
c
c  fix the boundary conditions
c
c  the domain is decomposed into horizontal strips
c

c
c    along the west and east walls
c

      do j=1,ny
      t(1,j)  = 20.0
      t(nx,j) = 20.0
      enddo

c 
c   along the south and north walls
c

c234567
c
c  south boundary is owned by the first processor
c
      if( my_rank .eq. 0) then
      do i=1,nx
       t(i,1) = 20. 
      enddo
      endif
c
c  north boundary is owned by the last processor
c 
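c  (held at 100.0 for 3 <= x <= 7 and at 20.0 elsewhere)
c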
      if( my_rank .eq. nproc-1) then
      do i=1,nx
c
      xx = (i-1)*h
      if ( xx.ge.3.0 .and. xx.le.7.0 ) then
       t(i,ny) = 100.0
      else
       t(i,ny) = 20.0
      endif

      enddo
      endif


      call MPI_Barrier(MPI_COMM_WORLD,ierror)

      if( my_rank .eq. 0) then
       print *,'finish updating boundary conditions'
c       t1 = timef()
       t1 = MPI_Wtime()
      endif
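
c
c  main iteration loop: exchange ghost rows with neighbouring
c  strips, form the residual, apply the Jacobi update, and test
c  the global residual norm against the tolerance
c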


      do iter=1,niter

c
c send and receive interfacial values from nearest neighbours
c
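c  tag 10 marks a row sent towards the lower-ranked (south)
c  neighbour, tag 20 a row sent towards the higher-ranked
c  (north) neighbour
c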
      if( my_rank.ne.0   .and.  my_rank.ne.nproc-1 ) then
c
c along the south side
c
        call MPI_Sendrecv(t(1,2),nx,MPI_REAL,my_rank-1,10,
     &                    t(1,1),nx,MPI_REAL,my_rank-1,20,
     &                    MPI_COMM_WORLD,istatus,ierror)
c
c along the north side
c
        call MPI_Sendrecv(t(1,ny-1),nx,MPI_REAL,my_rank+1,20,
     &                    t(1,ny),  nx,MPI_REAL,my_rank+1,10,
     &                    MPI_COMM_WORLD,istatus,ierror)

      endif

      if( my_rank .eq. 0 ) then
c
c along the north side
c
        root = 1
        call MPI_Sendrecv(t(1,ny-1),nx,MPI_REAL,root,20,
     &                    t(1,ny),  nx,MPI_REAL,root,10,
     &                    MPI_COMM_WORLD,istatus,ierror)

      endif


      if( my_rank .eq. nproc-1 ) then
c
c along the south side
c
        root = nproc-2
        call MPI_Sendrecv(t(1,2),nx,MPI_REAL,root,10,
     &                    t(1,1),nx,MPI_REAL,root,20,
     &                    MPI_COMM_WORLD,istatus,ierror)

      endif

     
      sum = 0.0
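c
c  5-point finite-difference residual on the interior rows of
c  the strip; sum accumulates the local sum of squared residuals
c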

      do j=2,ny-1
       do i=2,nx-1
        r(i,j) = s(i,j)*h2 - (
     1                         t(i+1,j)+t(i,j+1)
     2                        -4.*t(i,j)
     3                        +t(i-1,j)+t(i,j-1)  )
        sum = sum + r(i,j)*r(i,j)
       enddo
      enddo 
c
c  update temperatures
c
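c  (Jacobi relaxation t <- t - r/4: each interior point becomes
c   the average of its four neighbours minus h2*s/4; boundary
c   and ghost rows have r = 0 and are left unchanged)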
      do j=1,ny
       do i=1,nx
        t(i,j) = t(i,j) - 0.25*r(i,j)
       enddo
      enddo

c
c  obtain the global sum of squared residuals over all strips;
c  its square root is the L2 norm used in the convergence test
c
      call MPI_Allreduce(sum,global_sum,1,MPI_REAL,
     &                   MPI_SUM,MPI_COMM_WORLD,ierror)
c
c
      global_sum = sqrt(global_sum)
c
c
c      if( my_rank .eq. 0) then
c      if (mod(iter,500) .eq. 0) then
c      print *,iter,global_sum
c      endif
c
c      write(2,*) iter, global_sum
c
c      endif
c
      if(global_sum .le. tol ) go to 1000
c
      enddo
c
1000     continue
c
c
        if( my_rank .eq. 0) then
c        t2 = timef()
        t2 = MPI_Wtime()
        print *,'number of iterations = ',iter
        print *,'final L2 norm of residual: ',global_sum
        print *,'wall clock time in seconds= ',(t2-t1)
        endif
c
        call MPI_FINALIZE(ierror)
c
      stop
      end