diff --git a/src/core_atmosphere/dynamics/mpas_atm_boundaries.F b/src/core_atmosphere/dynamics/mpas_atm_boundaries.F index fca1734138..ed2e9a492e 100644 --- a/src/core_atmosphere/dynamics/mpas_atm_boundaries.F +++ b/src/core_atmosphere/dynamics/mpas_atm_boundaries.F @@ -287,7 +287,7 @@ end subroutine mpas_atm_update_bdy_tend !> tend_scalars(1,:,:) = mpas_atm_get_bdy_tend(clock, domain % blocklist, nVertLevels, nCells, 'qv', 0.0_RKIND) ! !----------------------------------------------------------------------- - function mpas_atm_get_bdy_tend(clock, block, vertDim, horizDim, field, delta_t) result(return_tend) + subroutine mpas_atm_get_bdy_tend(clock, block, vertDim, horizDim, field, delta_t, return_tend) implicit none @@ -296,31 +296,49 @@ function mpas_atm_get_bdy_tend(clock, block, vertDim, horizDim, field, delta_t) integer, intent(in) :: vertDim, horizDim character(len=*), intent(in) :: field real (kind=RKIND), intent(in) :: delta_t - - real (kind=RKIND), dimension(vertDim,horizDim+1) :: return_tend + real (kind=RKIND), dimension(vertDim,horizDim+1), intent(out) :: return_tend type (mpas_pool_type), pointer :: lbc - integer, pointer :: idx + integer, pointer :: idx_ptr real (kind=RKIND), dimension(:,:), pointer :: tend real (kind=RKIND), dimension(:,:,:), pointer :: tend_scalars - integer :: ierr + integer :: idx, i, j call mpas_pool_get_subpool(block % structs, 'lbc', lbc) nullify(tend) call mpas_pool_get_array(lbc, 'lbc_'//trim(field), tend, 1) - + if (associated(tend)) then - return_tend(:,:) = tend(:,:) else call mpas_pool_get_array(lbc, 'lbc_scalars', tend_scalars, 1) - call mpas_pool_get_dimension(lbc, 'index_'//trim(field), idx) - return_tend(:,:) = tend_scalars(idx,:,:) + ! Ensure the integer pointed to by idx_ptr is copied to the gpu device + call mpas_pool_get_dimension(lbc, 'index_'//trim(field), idx_ptr) + idx = idx_ptr + end if + + !$acc parallel default(present) + if (associated(tend)) then + !$acc loop gang vector collapse(2) + do j=1,horizDim+1 + do i=1,vertDim + return_tend(i,j) = tend(i,j) + end do + end do + else + !$acc loop gang vector collapse(2) + do j=1,horizDim+1 + do i=1,vertDim + return_tend(i,j) = tend_scalars(idx,i,j) + end do + end do end if + !$acc end parallel + - end function mpas_atm_get_bdy_tend + end subroutine mpas_atm_get_bdy_tend !*********************************************************************** @@ -356,10 +374,11 @@ end function mpas_atm_get_bdy_tend !> scalars(1,:,:) = mpas_atm_get_bdy_state(clock, domain % blocklist, nVertLevels, nCells, 'qv', 0.0_RKIND) ! !----------------------------------------------------------------------- - function mpas_atm_get_bdy_state_2d(clock, block, vertDim, horizDim, field, delta_t) result(return_state) + subroutine mpas_atm_get_bdy_state_2d(clock, block, vertDim, horizDim, field, delta_t, return_state) use mpas_pool_routines, only : mpas_pool_get_error_level, mpas_pool_set_error_level use mpas_derived_types, only : MPAS_POOL_SILENT + use mpas_log, only : mpas_log_write implicit none @@ -368,8 +387,7 @@ function mpas_atm_get_bdy_state_2d(clock, block, vertDim, horizDim, field, delta integer, intent(in) :: vertDim, horizDim character(len=*), intent(in) :: field real (kind=RKIND), intent(in) :: delta_t - - real (kind=RKIND), dimension(vertDim,horizDim+1) :: return_state + real (kind=RKIND), dimension(vertDim,horizDim+1), intent(out) :: return_state type (mpas_pool_type), pointer :: lbc integer, pointer :: idx_ptr @@ -420,10 +438,6 @@ function mpas_atm_get_bdy_state_2d(clock, block, vertDim, horizDim, field, delta ! query the field as a scalar constituent ! if (associated(tend) .and. associated(state)) then - MPAS_ACC_TIMER_START('mpas_atm_get_bdy_state_2d [ACC_data_xfer]') - !$acc enter data create(return_state) & - !$acc copyin(tend, state) - MPAS_ACC_TIMER_STOP('mpas_atm_get_bdy_state_2d [ACC_data_xfer]') !$acc parallel default(present) !$acc loop gang vector collapse(2) @@ -434,10 +448,6 @@ function mpas_atm_get_bdy_state_2d(clock, block, vertDim, horizDim, field, delta end do !$acc end parallel - MPAS_ACC_TIMER_START('mpas_atm_get_bdy_state_2d [ACC_data_xfer]') - !$acc exit data copyout(return_state) & - !$acc delete(tend, state) - MPAS_ACC_TIMER_STOP('mpas_atm_get_bdy_state_2d [ACC_data_xfer]') else call mpas_pool_get_array(lbc, 'lbc_scalars', tend_scalars, 1) call mpas_pool_get_array(lbc, 'lbc_scalars', state_scalars, 2) @@ -445,11 +455,6 @@ function mpas_atm_get_bdy_state_2d(clock, block, vertDim, horizDim, field, delta idx=idx_ptr ! Avoid non-array pointer for OpenACC - MPAS_ACC_TIMER_START('mpas_atm_get_bdy_state_2d [ACC_data_xfer]') - !$acc enter data create(return_state) & - !$acc copyin(tend_scalars, state_scalars) - MPAS_ACC_TIMER_STOP('mpas_atm_get_bdy_state_2d [ACC_data_xfer]') - !$acc parallel default(present) !$acc loop gang vector collapse(2) do i=1, horizDim+1 @@ -459,13 +464,9 @@ function mpas_atm_get_bdy_state_2d(clock, block, vertDim, horizDim, field, delta end do !$acc end parallel - MPAS_ACC_TIMER_START('mpas_atm_get_bdy_state_2d [ACC_data_xfer]') - !$acc exit data copyout(return_state) & - !$acc delete(tend_scalars, state_scalars) - MPAS_ACC_TIMER_STOP('mpas_atm_get_bdy_state_2d [ACC_data_xfer]') end if - end function mpas_atm_get_bdy_state_2d + end subroutine mpas_atm_get_bdy_state_2d !*********************************************************************** @@ -498,7 +499,7 @@ end function mpas_atm_get_bdy_state_2d !> num_scalars, nVertLevels, nCells, 'scalars', 0.0_RKIND) ! !----------------------------------------------------------------------- - function mpas_atm_get_bdy_state_3d(clock, block, innerDim, vertDim, horizDim, field, delta_t) result(return_state) + subroutine mpas_atm_get_bdy_state_3d(clock, block, innerDim, vertDim, horizDim, field, delta_t, return_state) use mpas_pool_routines, only : mpas_pool_get_error_level, mpas_pool_set_error_level use mpas_derived_types, only : MPAS_POOL_SILENT @@ -510,8 +511,7 @@ function mpas_atm_get_bdy_state_3d(clock, block, innerDim, vertDim, horizDim, fi integer, intent(in) :: innerDim, vertDim, horizDim character(len=*), intent(in) :: field real (kind=RKIND), intent(in) :: delta_t - - real (kind=RKIND), dimension(innerDim,vertDim,horizDim+1) :: return_state + real (kind=RKIND), dimension(innerDim,vertDim,horizDim+1), intent(out) :: return_state type (mpas_pool_type), pointer :: lbc real (kind=RKIND), dimension(:,:,:), pointer :: tend @@ -543,11 +543,6 @@ function mpas_atm_get_bdy_state_3d(clock, block, innerDim, vertDim, horizDim, fi call mpas_pool_get_array(lbc, 'lbc_'//trim(field), tend, 1) call mpas_pool_get_array(lbc, 'lbc_'//trim(field), state, 2) - MPAS_ACC_TIMER_START('mpas_atm_get_bdy_state_3d [ACC_data_xfer]') - !$acc enter data create(return_state) & - !$acc copyin(tend, state) - MPAS_ACC_TIMER_STOP('mpas_atm_get_bdy_state_3d [ACC_data_xfer]') - !$acc parallel default(present) !$acc loop gang vector collapse(3) do i=1, horizDim+1 @@ -559,12 +554,7 @@ function mpas_atm_get_bdy_state_3d(clock, block, innerDim, vertDim, horizDim, fi end do !$acc end parallel - MPAS_ACC_TIMER_START('mpas_atm_get_bdy_state_3d [ACC_data_xfer]') - !$acc exit data copyout(return_state) & - !$acc delete(tend, state) - MPAS_ACC_TIMER_STOP('mpas_atm_get_bdy_state_3d [ACC_data_xfer]') - - end function mpas_atm_get_bdy_state_3d + end subroutine mpas_atm_get_bdy_state_3d !*********************************************************************** diff --git a/src/core_atmosphere/dynamics/mpas_atm_iau.F b/src/core_atmosphere/dynamics/mpas_atm_iau.F index 654fd3ae82..c0b9fab65e 100644 --- a/src/core_atmosphere/dynamics/mpas_atm_iau.F +++ b/src/core_atmosphere/dynamics/mpas_atm_iau.F @@ -137,6 +137,7 @@ subroutine atm_add_tend_anal_incr (configs, structs, itimestep, dt, tend_ru, ten call mpas_pool_get_array(state, 'scalars', scalars, 1) call mpas_pool_get_array(state, 'rho_zz', rho_zz, 2) call mpas_pool_get_array(diag , 'rho_edge', rho_edge) + !$acc update self(theta_m, scalars, rho_zz, rho_edge) call mpas_pool_get_dimension(state, 'moist_start', moist_start) call mpas_pool_get_dimension(state, 'moist_end', moist_end) @@ -149,6 +150,7 @@ subroutine atm_add_tend_anal_incr (configs, structs, itimestep, dt, tend_ru, ten ! call mpas_pool_get_array(tend, 'rho_zz', tend_rho) ! call mpas_pool_get_array(tend, 'theta_m', tend_theta) call mpas_pool_get_array(tend, 'scalars_tend', tend_scalars) + !$acc update self(tend_scalars) call mpas_pool_get_array(tend_iau, 'theta', theta_amb) call mpas_pool_get_array(tend_iau, 'rho', rho_amb) diff --git a/src/core_atmosphere/dynamics/mpas_atm_time_integration.F b/src/core_atmosphere/dynamics/mpas_atm_time_integration.F index f5bbcdd0ae..1b6c79a9c6 100644 --- a/src/core_atmosphere/dynamics/mpas_atm_time_integration.F +++ b/src/core_atmosphere/dynamics/mpas_atm_time_integration.F @@ -65,6 +65,9 @@ end subroutine halo_exchange_routine ! real (kind=RKIND), allocatable, dimension(:,:) :: delsq_circulation ! no longer used -> removed real (kind=RKIND), allocatable, dimension(:,:) :: delsq_vorticity real (kind=RKIND), allocatable, dimension(:,:) :: dpdz + !$acc declare create(qtot) + !$acc declare create(delsq_theta, delsq_w, delsq_divergence) + !$acc declare create(delsq_u, delsq_vorticity, dpdz) ! Used in atm_advance_scalars real (kind=RKIND), dimension(:,:,:), allocatable :: horiz_flux_array @@ -78,19 +81,32 @@ end subroutine halo_exchange_routine real (kind=RKIND), dimension(:,:), allocatable :: wdtn_arr real (kind=RKIND), dimension(:,:), allocatable :: rho_zz_int - real (kind=RKIND), dimension(:,:,:), allocatable :: scalars_driving ! regional_MPAS addition real (kind=RKIND), dimension(:,:), allocatable :: ru_driving_tend ! regional_MPAS addition real (kind=RKIND), dimension(:,:), allocatable :: rt_driving_tend ! regional_MPAS addition real (kind=RKIND), dimension(:,:), allocatable :: rho_driving_tend ! regional_MPAS addition + !$acc declare create(ru_driving_tend) + !$acc declare create(rt_driving_tend) + !$acc declare create(rho_driving_tend) + + real (kind=RKIND), dimension(:,:,:), allocatable :: scalars_driving ! regional_MPAS addition real (kind=RKIND), dimension(:,:), allocatable :: ru_driving_values ! regional_MPAS addition real (kind=RKIND), dimension(:,:), allocatable :: rt_driving_values ! regional_MPAS addition - real (kind=RKIND), dimension(:,:), allocatable :: rho_driving_values ! regional_MPAS addition + real (kind=RKIND), dimension(:,:), allocatable :: rho_driving_values ! regional_MPAS addition + + !$acc declare create(scalars_driving) + !$acc declare create(ru_driving_values) + !$acc declare create(rt_driving_values) + !$acc declare create(rho_driving_values) + integer, dimension(:), pointer :: bdyMaskEdge ! regional_MPAS addition logical, pointer :: config_apply_lbcs ! Used in compute_solve_diagnostics real (kind=RKIND), allocatable, dimension(:,:) :: ke_vertex real (kind=RKIND), allocatable, dimension(:,:) :: ke_edge + !$acc declare create(ke_edge) + + type (MPAS_Clock_type), pointer, private :: clock type (block_type), pointer, private :: block @@ -198,7 +214,7 @@ subroutine mpas_atm_dynamics_init(domain) #ifdef MPAS_OPENACC type (mpas_pool_type), pointer :: mesh - + real (kind=RKIND), dimension(:), pointer :: dvEdge integer, dimension(:,:), pointer :: cellsOnCell integer, dimension(:,:), pointer :: cellsOnEdge @@ -243,8 +259,25 @@ subroutine mpas_atm_dynamics_init(domain) real (kind=RKIND), dimension(:,:), pointer :: zxu real (kind=RKIND), dimension(:,:), pointer :: dss real (kind=RKIND), dimension(:), pointer :: specZoneMaskCell -#endif + real (kind=RKIND), dimension(:,:), pointer :: defc_a + real (kind=RKIND), dimension(:,:), pointer :: defc_b + real (kind=RKIND), dimension(:), pointer :: latEdge + real (kind=RKIND), dimension(:), pointer :: angleEdge + real (kind=RKIND), dimension(:), pointer :: meshScalingDel2 + real (kind=RKIND), dimension(:), pointer :: meshScalingDel4 + real (kind=RKIND), dimension(:), pointer :: meshScalingRegionalCell + real (kind=RKIND), dimension(:), pointer :: meshScalingRegionalEdge + real (kind=RKIND), dimension(:), pointer :: latCell + real (kind=RKIND), dimension(:), pointer :: lonCell + real (kind=RKIND), dimension(:,:,:), pointer :: coeffs_reconstruct + + real (kind=RKIND), dimension(:,:), pointer :: rthdynten + + real (kind=RKIND), dimension(:), pointer :: u_init, v_init, qv_init + real (kind=RKIND), dimension(:,:), pointer :: t_init + +#endif #ifdef MPAS_CAM_DYCORE nullify(tend_physics) @@ -264,6 +297,7 @@ subroutine mpas_atm_dynamics_init(domain) nullify(mesh) call mpas_pool_get_subpool(domain % blocklist % structs, 'mesh', mesh) + MPAS_ACC_TIMER_START('mpas_dynamics_init [ACC_data_xfer]') call mpas_pool_get_array(mesh, 'dvEdge', dvEdge) !$acc enter data copyin(dvEdge) @@ -378,26 +412,954 @@ subroutine mpas_atm_dynamics_init(domain) call mpas_pool_get_array(mesh, 'zb', zb) !$acc enter data copyin(zb) - call mpas_pool_get_array(mesh, 'zb3', zb3) - !$acc enter data copyin(zb3) + call mpas_pool_get_array(mesh, 'zb3', zb3) + !$acc enter data copyin(zb3) + + call mpas_pool_get_array(mesh, 'nearestRelaxationCell', nearestRelaxationCell) + !$acc enter data copyin(nearestRelaxationCell) + + call mpas_pool_get_array(mesh, 'zgrid', zgrid) + !$acc enter data copyin(zgrid) + + call mpas_pool_get_array(mesh, 'zxu', zxu) + !$acc enter data copyin(zxu) + + call mpas_pool_get_array(mesh, 'dss', dss) + !$acc enter data copyin(dss) + + call mpas_pool_get_array(mesh, 'specZoneMaskCell', specZoneMaskCell) + !$acc enter data copyin(specZoneMaskCell) + + call mpas_pool_get_array(mesh, 'defc_a', defc_a) + !$acc enter data copyin(defc_a) + + call mpas_pool_get_array(mesh, 'defc_b', defc_b) + !$acc enter data copyin(defc_b) + + call mpas_pool_get_array(mesh, 'latEdge', latEdge) + !$acc enter data copyin(latEdge) + + call mpas_pool_get_array(mesh, 'angleEdge', angleEdge) + !$acc enter data copyin(angleEdge) + + call mpas_pool_get_array(mesh, 'meshScalingDel2', meshScalingDel2) + !$acc enter data copyin(meshScalingDel2) + + call mpas_pool_get_array(mesh, 'meshScalingDel4', meshScalingDel4) + !$acc enter data copyin(meshScalingDel4) + call mpas_pool_get_array(mesh, 'meshScalingRegionalCell', meshScalingRegionalCell) + !$acc enter data copyin(meshScalingRegionalCell) + + call mpas_pool_get_array(mesh, 'meshScalingRegionalEdge', meshScalingRegionalEdge) + !$acc enter data copyin(meshScalingRegionalEdge) + + call mpas_pool_get_array(mesh, 'latCell', latCell) + !$acc enter data copyin(latCell) + + call mpas_pool_get_array(mesh, 'lonCell', lonCell) + !$acc enter data copyin(lonCell) + + call mpas_pool_get_array(mesh, 'coeffs_reconstruct', coeffs_reconstruct) + !$acc enter data copyin(coeffs_reconstruct) + + call mpas_pool_get_array(mesh, 'u_init', u_init) + !$acc enter data copyin(u_init) + call mpas_pool_get_array(mesh, 'v_init', v_init) + !$acc enter data copyin(v_init) + call mpas_pool_get_array(mesh, 't_init', t_init) + !$acc enter data copyin(t_init) + call mpas_pool_get_array(mesh, 'qv_init', qv_init) + !$acc enter data copyin(qv_init) + + MPAS_ACC_TIMER_STOP('mpas_dynamics_init [ACC_data_xfer]') +#endif + + end subroutine mpas_atm_dynamics_init + + + subroutine mpas_atm_pre_computesolvediag_h2d(block) + + implicit none + + type (block_type), intent(inout) :: block + + +#ifdef MPAS_OPENACC + type (mpas_pool_type), pointer :: mesh + type (mpas_pool_type), pointer :: diag + type (mpas_pool_type), pointer :: state + type (mpas_pool_type), pointer :: tend_physics + real (kind=RKIND), dimension(:,:), pointer :: rthdynten + + real (kind=RKIND), dimension(:,:), pointer :: h_edge, v, vorticity, ke, pv_edge, & + pv_vertex, pv_cell, gradPVn, gradPVt, divergence + real (kind=RKIND), dimension(:,:), pointer :: u, h + + real (kind=RKIND), dimension(:,:), pointer :: zz + real (kind=RKIND), dimension(:,:,:), pointer :: zb_cell + real (kind=RKIND), dimension(:,:,:), pointer :: zb3_cell + real (kind=RKIND), dimension(:), pointer :: fzm + real (kind=RKIND), dimension(:), pointer :: fzp + real (kind=RKIND), dimension(:,:,:), pointer :: zb + real (kind=RKIND), dimension(:,:,:), pointer :: zb3 + + + real (kind=RKIND), dimension(:), pointer :: dvEdge + integer, dimension(:,:), pointer :: cellsOnCell + integer, dimension(:,:), pointer :: cellsOnEdge + integer, dimension(:,:), pointer :: advCellsForEdge + integer, dimension(:,:), pointer :: edgesOnCell + integer, dimension(:), pointer :: nAdvCellsForEdge + integer, dimension(:), pointer :: nEdgesOnCell + real (kind=RKIND), dimension(:,:), pointer :: adv_coefs + real (kind=RKIND), dimension(:,:), pointer :: adv_coefs_3rd + real (kind=RKIND), dimension(:,:), pointer :: edgesOnCell_sign + real (kind=RKIND), dimension(:), pointer :: invAreaCell + integer, dimension(:), pointer :: bdyMaskCell + integer, dimension(:), pointer :: bdyMaskEdge + real (kind=RKIND), dimension(:), pointer :: specZoneMaskEdge + real (kind=RKIND), dimension(:), pointer :: invDvEdge + real (kind=RKIND), dimension(:), pointer :: dcEdge + real (kind=RKIND), dimension(:), pointer :: invDcEdge + integer, dimension(:,:), pointer :: edgesOnEdge + integer, dimension(:,:), pointer :: edgesOnVertex + real (kind=RKIND), dimension(:,:), pointer :: edgesOnVertex_sign + integer, dimension(:), pointer :: nEdgesOnEdge + real (kind=RKIND), dimension(:,:), pointer :: weightsOnEdge + integer, dimension(:,:), pointer :: cellsOnVertex + integer, dimension(:,:), pointer :: verticesOnCell + integer, dimension(:,:), pointer :: verticesOnEdge + real (kind=RKIND), dimension(:), pointer :: invAreaTriangle + integer, dimension(:,:), pointer :: kiteForCell + real (kind=RKIND), dimension(:,:), pointer :: kiteAreasOnVertex + real (kind=RKIND), dimension(:), pointer :: fEdge + real (kind=RKIND), dimension(:), pointer :: fVertex + + nullify(mesh) + call mpas_pool_get_subpool(block % structs, 'mesh', mesh) + nullify(state) + call mpas_pool_get_subpool(block % structs, 'state', state) + nullify(diag) + call mpas_pool_get_subpool(block % structs, 'diag', diag) + + MPAS_ACC_TIMER_START('first_compute_solve_diagnostics [ACC_data_xfer]') + call mpas_pool_get_array(state, 'rho_zz', h, 1) + !$acc enter data create(h) + call mpas_pool_get_array(state, 'u', u, 1) + !$acc enter data copyin(u) + + call mpas_pool_get_array(diag, 'v', v) + !$acc enter data copyin(v) + call mpas_pool_get_array(diag, 'rho_edge', h_edge) + !$acc enter data copyin(h_edge) + call mpas_pool_get_array(diag, 'vorticity', vorticity) + !$acc enter data copyin(vorticity) + call mpas_pool_get_array(diag, 'divergence', divergence) + !$acc enter data copyin(divergence) + call mpas_pool_get_array(diag, 'ke', ke) + !$acc enter data copyin(ke) + call mpas_pool_get_array(diag, 'pv_edge', pv_edge) + !$acc enter data copyin(pv_edge) + call mpas_pool_get_array(diag, 'pv_vertex', pv_vertex) + !$acc enter data copyin(pv_vertex) + call mpas_pool_get_array(diag, 'pv_cell', pv_cell) + !$acc enter data copyin(pv_cell) + call mpas_pool_get_array(diag, 'gradPVn', gradPVn) + !$acc enter data copyin(gradPVn) + call mpas_pool_get_array(diag, 'gradPVt', gradPVt) + !$acc enter data copyin(gradPVt) + + ! Required by atm_init_coupled_diagnostics + call mpas_pool_get_array(mesh, 'zz', zz) + !$acc enter data copyin(zz) + + call mpas_pool_get_array(mesh, 'zb_cell', zb_cell) + !$acc enter data copyin(zb_cell) + + call mpas_pool_get_array(mesh, 'zb3_cell', zb3_cell) + !$acc enter data copyin(zb3_cell) + + call mpas_pool_get_array(mesh, 'fzm', fzm) + !$acc enter data copyin(fzm) + + call mpas_pool_get_array(mesh, 'fzp', fzp) + !$acc enter data copyin(fzp) + + call mpas_pool_get_array(mesh, 'zb', zb) + !$acc enter data copyin(zb) + + call mpas_pool_get_array(mesh, 'zb3', zb3) + !$acc enter data copyin(zb3) + + ! Required by atm_compute_solve_diagnostics + call mpas_pool_get_array(mesh, 'dvEdge', dvEdge) + !$acc enter data copyin(dvEdge) + + call mpas_pool_get_array(mesh, 'cellsOnEdge', cellsOnEdge) + !$acc enter data copyin(cellsOnEdge) + + call mpas_pool_get_array(mesh, 'edgesOnCell', edgesOnCell) + !$acc enter data copyin(edgesOnCell) + + call mpas_pool_get_array(mesh, 'nEdgesOnCell', nEdgesOnCell) + !$acc enter data copyin(nEdgesOnCell) + + call mpas_pool_get_array(mesh, 'edgesOnCell_sign', edgesOnCell_sign) + !$acc enter data copyin(edgesOnCell_sign) + + call mpas_pool_get_array(mesh, 'invAreaCell', invAreaCell) + !$acc enter data copyin(invAreaCell) + + call mpas_pool_get_array(mesh, 'invDvEdge', invDvEdge) + !$acc enter data copyin(invDvEdge) + + call mpas_pool_get_array(mesh, 'dcEdge', dcEdge) + !$acc enter data copyin(dcEdge) + + call mpas_pool_get_array(mesh, 'invDcEdge', invDcEdge) + !$acc enter data copyin(invDcEdge) + + call mpas_pool_get_array(mesh, 'edgesOnEdge', edgesOnEdge) + !$acc enter data copyin(edgesOnEdge) + + call mpas_pool_get_array(mesh, 'edgesOnVertex', edgesOnVertex) + !$acc enter data copyin(edgesOnVertex) + + call mpas_pool_get_array(mesh, 'edgesOnVertex_sign', edgesOnVertex_sign) + !$acc enter data copyin(edgesOnVertex_sign) + + call mpas_pool_get_array(mesh, 'nEdgesOnEdge', nEdgesOnEdge) + !$acc enter data copyin(nEdgesOnEdge) + + call mpas_pool_get_array(mesh, 'weightsOnEdge', weightsOnEdge) + !$acc enter data copyin(weightsOnEdge) + + call mpas_pool_get_array(mesh, 'verticesOnCell', verticesOnCell) + !$acc enter data copyin(verticesOnCell) + + call mpas_pool_get_array(mesh, 'verticesOnEdge', verticesOnEdge) + !$acc enter data copyin(verticesOnEdge) + + call mpas_pool_get_array(mesh, 'invAreaTriangle', invAreaTriangle) + !$acc enter data copyin(invAreaTriangle) + + call mpas_pool_get_array(mesh, 'kiteForCell', kiteForCell) + !$acc enter data copyin(kiteForCell) + + call mpas_pool_get_array(mesh, 'kiteAreasOnVertex', kiteAreasOnVertex) + !$acc enter data copyin(kiteAreasOnVertex) + + call mpas_pool_get_array(mesh, 'fVertex', fVertex) + !$acc enter data copyin(fVertex) + + MPAS_ACC_TIMER_STOP('first_compute_solve_diagnostics [ACC_data_xfer]') +#endif + + end subroutine mpas_atm_pre_computesolvediag_h2d + + + subroutine mpas_atm_post_computesolvediag_d2h(block) + + implicit none + + type (block_type), intent(inout) :: block + + +#ifdef MPAS_OPENACC + type (mpas_pool_type), pointer :: mesh + type (mpas_pool_type), pointer :: diag + type (mpas_pool_type), pointer :: state + type (mpas_pool_type), pointer :: tend_physics + real (kind=RKIND), dimension(:,:), pointer :: rthdynten + + real (kind=RKIND), dimension(:,:), pointer :: h_edge, v, vorticity, ke, pv_edge, & + pv_vertex, pv_cell, gradPVn, gradPVt, divergence + real (kind=RKIND), dimension(:,:), pointer :: u, h + + real (kind=RKIND), dimension(:,:), pointer :: zz + real (kind=RKIND), dimension(:,:,:), pointer :: zb_cell + real (kind=RKIND), dimension(:,:,:), pointer :: zb3_cell + real (kind=RKIND), dimension(:), pointer :: fzm + real (kind=RKIND), dimension(:), pointer :: fzp + real (kind=RKIND), dimension(:,:,:), pointer :: zb + real (kind=RKIND), dimension(:,:,:), pointer :: zb3 + + + real (kind=RKIND), dimension(:), pointer :: dvEdge + integer, dimension(:,:), pointer :: cellsOnCell + integer, dimension(:,:), pointer :: cellsOnEdge + integer, dimension(:,:), pointer :: advCellsForEdge + integer, dimension(:,:), pointer :: edgesOnCell + integer, dimension(:), pointer :: nAdvCellsForEdge + integer, dimension(:), pointer :: nEdgesOnCell + real (kind=RKIND), dimension(:,:), pointer :: adv_coefs + real (kind=RKIND), dimension(:,:), pointer :: adv_coefs_3rd + real (kind=RKIND), dimension(:,:), pointer :: edgesOnCell_sign + real (kind=RKIND), dimension(:), pointer :: invAreaCell + integer, dimension(:), pointer :: bdyMaskCell + integer, dimension(:), pointer :: bdyMaskEdge + real (kind=RKIND), dimension(:), pointer :: specZoneMaskEdge + real (kind=RKIND), dimension(:), pointer :: invDvEdge + real (kind=RKIND), dimension(:), pointer :: dcEdge + real (kind=RKIND), dimension(:), pointer :: invDcEdge + integer, dimension(:,:), pointer :: edgesOnEdge + integer, dimension(:,:), pointer :: edgesOnVertex + real (kind=RKIND), dimension(:,:), pointer :: edgesOnVertex_sign + integer, dimension(:), pointer :: nEdgesOnEdge + real (kind=RKIND), dimension(:,:), pointer :: weightsOnEdge + integer, dimension(:,:), pointer :: cellsOnVertex + integer, dimension(:,:), pointer :: verticesOnCell + integer, dimension(:,:), pointer :: verticesOnEdge + real (kind=RKIND), dimension(:), pointer :: invAreaTriangle + integer, dimension(:,:), pointer :: kiteForCell + real (kind=RKIND), dimension(:,:), pointer :: kiteAreasOnVertex + real (kind=RKIND), dimension(:), pointer :: fEdge + real (kind=RKIND), dimension(:), pointer :: fVertex + + nullify(mesh) + call mpas_pool_get_subpool(block % structs, 'mesh', mesh) + nullify(state) + call mpas_pool_get_subpool(block % structs, 'state', state) + nullify(diag) + call mpas_pool_get_subpool(block % structs, 'diag', diag) + + MPAS_ACC_TIMER_START('first_compute_solve_diagnostics [ACC_data_xfer]') + + call mpas_pool_get_array(state, 'rho_zz', h, 1) + !$acc exit data copyout(h) + call mpas_pool_get_array(state, 'u', u, 1) + !$acc exit data copyout(u) + + call mpas_pool_get_array(diag, 'v', v) + !$acc exit data copyout(v) + call mpas_pool_get_array(diag, 'rho_edge', h_edge) + !$acc exit data copyout(h_edge) + call mpas_pool_get_array(diag, 'vorticity', vorticity) + !$acc exit data copyout(vorticity) + call mpas_pool_get_array(diag, 'divergence', divergence) + !$acc exit data copyout(divergence) + call mpas_pool_get_array(diag, 'ke', ke) + !$acc exit data copyout(ke) + call mpas_pool_get_array(diag, 'pv_edge', pv_edge) + !$acc exit data copyout(pv_edge) + call mpas_pool_get_array(diag, 'pv_vertex', pv_vertex) + !$acc exit data copyout(pv_vertex) + call mpas_pool_get_array(diag, 'pv_cell', pv_cell) + !$acc exit data copyout(pv_cell) + call mpas_pool_get_array(diag, 'gradPVn', gradPVn) + !$acc exit data copyout(gradPVn) + call mpas_pool_get_array(diag, 'gradPVt', gradPVt) + !$acc exit data copyout(gradPVt) + + ! Required by atm_init_coupled_diagnostics + call mpas_pool_get_array(mesh, 'zz', zz) + !$acc exit data delete(zz) + + call mpas_pool_get_array(mesh, 'zb_cell', zb_cell) + !$acc exit data delete(zb_cell) + + call mpas_pool_get_array(mesh, 'zb3_cell', zb3_cell) + !$acc exit data delete(zb3_cell) + + call mpas_pool_get_array(mesh, 'fzm', fzm) + !$acc exit data delete(fzm) + + call mpas_pool_get_array(mesh, 'fzp', fzp) + !$acc exit data delete(fzp) + + call mpas_pool_get_array(mesh, 'zb', zb) + !$acc exit data delete(zb) + + call mpas_pool_get_array(mesh, 'zb3', zb3) + !$acc exit data delete(zb3) + + + call mpas_pool_get_array(mesh, 'dvEdge', dvEdge) + !$acc exit data delete(dvEdge) + + call mpas_pool_get_array(mesh, 'cellsOnEdge', cellsOnEdge) + !$acc exit data delete(cellsOnEdge) + + call mpas_pool_get_array(mesh, 'edgesOnCell', edgesOnCell) + !$acc exit data delete(edgesOnCell) + + call mpas_pool_get_array(mesh, 'nEdgesOnCell', nEdgesOnCell) + !$acc exit data delete(nEdgesOnCell) + + call mpas_pool_get_array(mesh, 'edgesOnCell_sign', edgesOnCell_sign) + !$acc exit data delete(edgesOnCell_sign) + + call mpas_pool_get_array(mesh, 'invAreaCell', invAreaCell) + !$acc exit data delete(invAreaCell) + + call mpas_pool_get_array(mesh, 'invDvEdge', invDvEdge) + !$acc exit data delete(invDvEdge) + + call mpas_pool_get_array(mesh, 'dcEdge', dcEdge) + !$acc exit data delete(dcEdge) + + call mpas_pool_get_array(mesh, 'invDcEdge', invDcEdge) + !$acc exit data delete(invDcEdge) + + call mpas_pool_get_array(mesh, 'edgesOnEdge', edgesOnEdge) + !$acc exit data delete(edgesOnEdge) + + call mpas_pool_get_array(mesh, 'edgesOnVertex', edgesOnVertex) + !$acc exit data delete(edgesOnVertex) + + call mpas_pool_get_array(mesh, 'edgesOnVertex_sign', edgesOnVertex_sign) + !$acc exit data delete(edgesOnVertex_sign) + + call mpas_pool_get_array(mesh, 'nEdgesOnEdge', nEdgesOnEdge) + !$acc exit data delete(nEdgesOnEdge) + + call mpas_pool_get_array(mesh, 'weightsOnEdge', weightsOnEdge) + !$acc exit data delete(weightsOnEdge) + + call mpas_pool_get_array(mesh, 'verticesOnCell', verticesOnCell) + !$acc exit data delete(verticesOnCell) + + call mpas_pool_get_array(mesh, 'verticesOnEdge', verticesOnEdge) + !$acc exit data delete(verticesOnEdge) + + call mpas_pool_get_array(mesh, 'invAreaTriangle', invAreaTriangle) + !$acc exit data delete(invAreaTriangle) + + call mpas_pool_get_array(mesh, 'kiteForCell', kiteForCell) + !$acc exit data delete(kiteForCell) + + call mpas_pool_get_array(mesh, 'kiteAreasOnVertex', kiteAreasOnVertex) + !$acc exit data delete(kiteAreasOnVertex) + + call mpas_pool_get_array(mesh, 'fVertex', fVertex) + !$acc exit data delete(fVertex) + + MPAS_ACC_TIMER_STOP('first_compute_solve_diagnostics [ACC_data_xfer]') +#endif + + end subroutine mpas_atm_post_computesolvediag_d2h + + subroutine mpas_atm_pre_dynamics_h2d(domain) + + implicit none + + type (domain_type), intent(inout) :: domain + + +#ifdef MPAS_OPENACC + type (mpas_pool_type), pointer :: state + type (mpas_pool_type), pointer :: diag + type (mpas_pool_type), pointer :: tend + type (mpas_pool_type), pointer :: tend_physics + type (mpas_pool_type), pointer :: lbc + + + real (kind=RKIND), dimension(:,:), pointer :: ru, ru_p + real (kind=RKIND), dimension(:,:), pointer :: ru_save + real (kind=RKIND), dimension(:,:), pointer :: rw, rw_p + real (kind=RKIND), dimension(:,:), pointer :: rw_save + real (kind=RKIND), dimension(:,:), pointer :: rtheta_p + real (kind=RKIND), dimension(:,:), pointer :: exner, exner_base + real (kind=RKIND), dimension(:,:), pointer :: rtheta_base, rho_base + real (kind=RKIND), dimension(:,:), pointer :: rtheta_p_save + real (kind=RKIND), dimension(:,:), pointer :: rho_p, rho_pp, rho, theta, theta_base + real (kind=RKIND), dimension(:,:), pointer :: rho_p_save + real (kind=RKIND), dimension(:,:), pointer :: rho_zz_old_split + real (kind=RKIND), dimension(:,:), pointer :: cqw, rtheta_pp_old, rtheta_pp + real (kind=RKIND), dimension(:,:), pointer :: cqu, pressure_base, pressure_p, pressure, v + real (kind=RKIND), dimension(:,:), pointer :: kdiff, pv_edge, pv_vertex, pv_cell, rho_edge, h_divergence, ke + real (kind=RKIND), dimension(:,:), pointer :: cofwr, cofwz, coftz, cofwt, a_tri, alpha_tri, gamma_tri + real (kind=RKIND), dimension(:), pointer :: cofrz + real (kind=RKIND), dimension(:,:), pointer :: gradPVn, gradPVt + + + real (kind=RKIND), dimension(:,:), pointer :: u_1, u_2 + real (kind=RKIND), dimension(:,:), pointer :: w_1, w_2 + real (kind=RKIND), dimension(:,:), pointer :: theta_m_1, theta_m_2 + real (kind=RKIND), dimension(:,:), pointer :: rho_zz_1, rho_zz_2 + real (kind=RKIND), dimension(:,:,:), pointer :: scalars_1, scalars_2 + real (kind=RKIND), dimension(:,:), pointer :: ruAvg, wwAvg, ruAvg_split, wwAvg_split + + real (kind=RKIND), dimension(:,:), pointer :: tend_ru, tend_rt, tend_rho, tend_rw, rt_diabatic_tend + real (kind=RKIND), dimension(:,:), pointer :: tend_u_euler, tend_w_euler, tend_theta_euler + real(kind=RKIND), dimension(:,:), pointer :: tend_w_pgf, tend_w_buoy + real(kind=RKIND), dimension(:,:,:), pointer :: scalar_tend_save + + real (kind=RKIND), dimension(:,:), pointer :: rthdynten, divergence, vorticity + + real (kind=RKIND), dimension(:,:), pointer :: lbc_u, lbc_w, lbc_ru, lbc_rho_edge, lbc_rho, lbc_rtheta_m, lbc_rho_zz, lbc_theta + real (kind=RKIND), dimension(:,:), pointer :: lbc_tend_u, lbc_tend_w, lbc_tend_ru, lbc_tend_rho_edge, lbc_tend_rho + real (kind=RKIND), dimension(:,:), pointer :: lbc_tend_rtheta_m, lbc_tend_rho_zz, lbc_tend_theta + + real (kind=RKIND), dimension(:,:,:), pointer :: lbc_scalars, lbc_tend_scalars + + nullify(state) + nullify(diag) + nullify(tend) + nullify(tend_physics) + nullify(lbc) + call mpas_pool_get_subpool(domain % blocklist % structs, 'state', state) + call mpas_pool_get_subpool(domain % blocklist % structs, 'diag', diag) + call mpas_pool_get_subpool(domain % blocklist % structs, 'tend', tend) + call mpas_pool_get_subpool(domain % blocklist % structs, 'tend_physics', tend_physics) + call mpas_pool_get_subpool(domain % blocklist % structs, 'lbc', lbc) + + MPAS_ACC_TIMER_START('atm_srk3 [ACC_data_xfer]') + call mpas_pool_get_array(diag, 'ru', ru) + !$acc enter data copyin(ru) ! use values from atm_init_coupled_diagnostics + call mpas_pool_get_array(diag, 'ru_p', ru_p) + !$acc enter data copyin(ru_p) + call mpas_pool_get_array(diag, 'ru_save', ru_save) + !$acc enter data copyin(ru_save) + call mpas_pool_get_array(diag, 'rw', rw) + !$acc enter data copyin(rw) ! use values from atm_init_coupled_diagnostics + call mpas_pool_get_array(diag, 'rw_p', rw_p) + !$acc enter data copyin(rw_p) + call mpas_pool_get_array(diag, 'rw_save', rw_save) + !$acc enter data copyin(rw_save) + call mpas_pool_get_array(diag, 'rtheta_p', rtheta_p) + !$acc enter data copyin(rtheta_p) ! use values from atm_init_coupled_diagnostics + call mpas_pool_get_array(diag, 'rtheta_p_save', rtheta_p_save) + !$acc enter data copyin(rtheta_p_save) + call mpas_pool_get_array(diag, 'exner', exner) + !$acc enter data copyin(exner) ! use values from atm_init_coupled_diagnostics + call mpas_pool_get_array(diag, 'exner_base', exner_base) + !$acc enter data copyin(exner_base) ! use values from atm_init_coupled_diagnostics + call mpas_pool_get_array(diag, 'rtheta_base', rtheta_base) + !$acc enter data copyin(rtheta_base) ! use values from atm_init_coupled_diagnostics + call mpas_pool_get_array(diag, 'rho_base', rho_base) + !$acc enter data copyin(rho_base) + call mpas_pool_get_array(diag, 'rho', rho) + !$acc enter data copyin(rho) + call mpas_pool_get_array(diag, 'theta', theta) + !$acc enter data copyin(theta) + call mpas_pool_get_array(diag, 'theta_base', theta_base) + !$acc enter data copyin(theta_base) + call mpas_pool_get_array(diag, 'rho_p', rho_p) + !$acc enter data copyin(rho_p) ! use values from atm_init_coupled_diagnostics + call mpas_pool_get_array(diag, 'rho_p_save', rho_p_save) + !$acc enter data copyin(rho_p_save) + call mpas_pool_get_array(diag, 'rho_pp', rho_pp) + !$acc enter data copyin(rho_pp) + call mpas_pool_get_array(diag, 'rho_zz_old_split', rho_zz_old_split) + !$acc enter data copyin(rho_zz_old_split) + call mpas_pool_get_array(diag, 'cqw', cqw) + !$acc enter data copyin(cqw) + call mpas_pool_get_array(diag, 'cqu', cqu) + !$acc enter data copyin(cqu) + call mpas_pool_get_array(diag, 'pressure_p', pressure_p) + !$acc enter data copyin(pressure_p) ! use values from atm_init_coupled_diagnostics + call mpas_pool_get_array(diag, 'pressure_base', pressure_base) + !$acc enter data copyin(pressure_base) ! use values from atm_init_coupled_diagnostics + call mpas_pool_get_array(diag, 'pressure', pressure) + !$acc enter data copyin(pressure) + call mpas_pool_get_array(diag, 'v', v) + !$acc enter data copyin(v) ! use values from atm_compute_solve_diagnostics + call mpas_pool_get_array(diag, 'rtheta_pp', rtheta_pp) + !$acc enter data copyin(rtheta_pp) + call mpas_pool_get_array(diag, 'rtheta_pp_old', rtheta_pp_old) + !$acc enter data copyin(rtheta_pp_old) + call mpas_pool_get_array(diag, 'kdiff', kdiff) + !$acc enter data copyin(kdiff) + call mpas_pool_get_array(diag, 'pv_edge', pv_edge) + !$acc enter data copyin(pv_edge) ! use values from atm_compute_solve_diagnostics + call mpas_pool_get_array(diag, 'pv_vertex', pv_vertex) + !$acc enter data copyin(pv_vertex) + call mpas_pool_get_array(diag, 'pv_cell', pv_cell) + !$acc enter data copyin(pv_cell) + call mpas_pool_get_array(diag, 'rho_edge', rho_edge) + !$acc enter data copyin(rho_edge) ! use values from atm_compute_solve_diagnostics + call mpas_pool_get_array(diag, 'h_divergence', h_divergence) + !$acc enter data copyin(h_divergence) + call mpas_pool_get_array(diag, 'ke', ke) + !$acc enter data copyin(ke) ! use values from atm_compute_solve_diagnostics + call mpas_pool_get_array(diag, 'gradPVn', gradPVn) + !$acc enter data copyin(gradPVn) + call mpas_pool_get_array(diag, 'gradPVt', gradPVt) + !$acc enter data copyin(gradPVt) + + call mpas_pool_get_array(diag, 'alpha_tri', alpha_tri) + !$acc enter data copyin(alpha_tri) + call mpas_pool_get_array(diag, 'gamma_tri', gamma_tri) + !$acc enter data copyin(gamma_tri) + call mpas_pool_get_array(diag, 'a_tri', a_tri) + !$acc enter data copyin(a_tri) + call mpas_pool_get_array(diag, 'cofwr', cofwr) + !$acc enter data copyin(cofwr) + call mpas_pool_get_array(diag, 'cofwz', cofwz) + !$acc enter data copyin(cofwz) + call mpas_pool_get_array(diag, 'coftz', coftz) + !$acc enter data copyin(coftz) + call mpas_pool_get_array(diag, 'cofwt', cofwt) + !$acc enter data copyin(cofwt) + call mpas_pool_get_array(diag, 'cofrz', cofrz) + !$acc enter data copyin(cofrz) + call mpas_pool_get_array(diag, 'vorticity', vorticity) + !$acc enter data copyin(vorticity) + call mpas_pool_get_array(diag, 'divergence', divergence) + !$acc enter data copyin(divergence) + call mpas_pool_get_array(diag, 'ruAvg', ruAvg) + !$acc enter data copyin(ruAvg) + call mpas_pool_get_array(diag, 'ruAvg_split', ruAvg_split) + !$acc enter data copyin(ruAvg_split) + call mpas_pool_get_array(diag, 'wwAvg', wwAvg) + !$acc enter data copyin(wwAvg) + call mpas_pool_get_array(diag, 'wwAvg_split', wwAvg_split) + !$acc enter data copyin(wwAvg_split) + + call mpas_pool_get_array(state, 'u', u_1, 1) + !$acc enter data copyin(u_1) + call mpas_pool_get_array(state, 'u', u_2, 2) + !$acc enter data copyin(u_2) + call mpas_pool_get_array(state, 'w', w_1, 1) + !$acc enter data copyin(w_1) + call mpas_pool_get_array(state, 'w', w_2, 2) + !$acc enter data copyin(w_2) + call mpas_pool_get_array(state, 'theta_m', theta_m_1, 1) + !$acc enter data copyin(theta_m_1) ! use values from atm_init_coupled_diagnostics + call mpas_pool_get_array(state, 'theta_m', theta_m_2, 2) + !$acc enter data copyin(theta_m_2) + call mpas_pool_get_array(state, 'rho_zz', rho_zz_1, 1) + !$acc enter data copyin(rho_zz_1) + call mpas_pool_get_array(state, 'rho_zz', rho_zz_2, 2) + !$acc enter data copyin(rho_zz_2) + call mpas_pool_get_array(state, 'scalars', scalars_1, 1) + !$acc enter data copyin(scalars_1) + call mpas_pool_get_array(state, 'scalars', scalars_2, 2) + !$acc enter data copyin(scalars_2) + + + call mpas_pool_get_array(tend, 'u', tend_ru) + !$acc enter data copyin(tend_ru) + call mpas_pool_get_array(tend, 'rho_zz', tend_rho) + !$acc enter data copyin(tend_rho) + call mpas_pool_get_array(tend, 'theta_m', tend_rt) + !$acc enter data copyin(tend_rt) + call mpas_pool_get_array(tend, 'w', tend_rw) + !$acc enter data copyin(tend_rw) + call mpas_pool_get_array(tend, 'rt_diabatic_tend', rt_diabatic_tend) + !$acc enter data copyin(rt_diabatic_tend) + call mpas_pool_get_array(tend, 'u_euler', tend_u_euler) + !$acc enter data copyin(tend_u_euler) + call mpas_pool_get_array(tend, 'theta_euler', tend_theta_euler) + !$acc enter data copyin(tend_theta_euler) + call mpas_pool_get_array(tend, 'w_euler', tend_w_euler) + !$acc enter data copyin(tend_w_euler) + call mpas_pool_get_array(tend, 'w_pgf', tend_w_pgf) + !$acc enter data copyin(tend_w_pgf) + call mpas_pool_get_array(tend, 'w_buoy', tend_w_buoy) + !$acc enter data copyin(tend_w_buoy) + call mpas_pool_get_array(tend, 'scalars_tend', scalar_tend_save) + !$acc enter data copyin(scalar_tend_save) + + + call mpas_pool_get_array(lbc, 'lbc_u', lbc_u, 2) + !$acc enter data copyin(lbc_u) + call mpas_pool_get_array(lbc, 'lbc_w', lbc_w, 2) + !$acc enter data copyin(lbc_w) + call mpas_pool_get_array(lbc, 'lbc_ru', lbc_ru, 2) + !$acc enter data copyin(lbc_ru) + call mpas_pool_get_array(lbc, 'lbc_rho_edge', lbc_rho_edge, 2) + !$acc enter data copyin(lbc_rho_edge) + call mpas_pool_get_array(lbc, 'lbc_theta', lbc_theta, 2) + !$acc enter data copyin(lbc_theta) + call mpas_pool_get_array(lbc, 'lbc_rtheta_m', lbc_rtheta_m, 2) + !$acc enter data copyin(lbc_rtheta_m) + call mpas_pool_get_array(lbc, 'lbc_rho_zz', lbc_rho_zz, 2) + !$acc enter data copyin(lbc_rho_zz) + call mpas_pool_get_array(lbc, 'lbc_rho', lbc_rho, 2) + !$acc enter data copyin(lbc_rho) + call mpas_pool_get_array(lbc, 'lbc_scalars', lbc_scalars, 2) + !$acc enter data copyin(lbc_scalars) + + + call mpas_pool_get_array(lbc, 'lbc_u', lbc_tend_u, 1) + !$acc enter data copyin(lbc_tend_u) + call mpas_pool_get_array(lbc, 'lbc_ru', lbc_tend_ru, 1) + !$acc enter data copyin(lbc_tend_ru) + call mpas_pool_get_array(lbc, 'lbc_rho_edge', lbc_tend_rho_edge, 1) + !$acc enter data copyin(lbc_tend_rho_edge) + call mpas_pool_get_array(lbc, 'lbc_w', lbc_tend_w, 1) + !$acc enter data copyin(lbc_tend_w) + call mpas_pool_get_array(lbc, 'lbc_theta', lbc_tend_theta, 1) + !$acc enter data copyin(lbc_tend_theta) + call mpas_pool_get_array(lbc, 'lbc_rtheta_m', lbc_tend_rtheta_m, 1) + !$acc enter data copyin(lbc_tend_rtheta_m) + call mpas_pool_get_array(lbc, 'lbc_rho_zz', lbc_tend_rho_zz, 1) + !$acc enter data copyin(lbc_tend_rho_zz) + call mpas_pool_get_array(lbc, 'lbc_rho', lbc_tend_rho, 1) + !$acc enter data copyin(lbc_tend_rho) + call mpas_pool_get_array(lbc, 'lbc_scalars', lbc_tend_scalars, 1) + !$acc enter data copyin(lbc_tend_scalars) + + call mpas_pool_get_array(tend_physics, 'rthdynten', rthdynten) + !$acc enter data copyin(rthdynten) + + MPAS_ACC_TIMER_STOP('atm_srk3 [ACC_data_xfer]') +#endif + + end subroutine mpas_atm_pre_dynamics_h2d + + + subroutine mpas_atm_post_dynamics_d2h(domain) + + implicit none + + type (domain_type), intent(inout) :: domain + + +#ifdef MPAS_OPENACC + type (mpas_pool_type), pointer :: state + type (mpas_pool_type), pointer :: diag + type (mpas_pool_type), pointer :: tend + type (mpas_pool_type), pointer :: tend_physics + type (mpas_pool_type), pointer :: lbc + + + real (kind=RKIND), dimension(:,:), pointer :: ru, ru_p + real (kind=RKIND), dimension(:,:), pointer :: ru_save + real (kind=RKIND), dimension(:,:), pointer :: rw, rw_p + real (kind=RKIND), dimension(:,:), pointer :: rw_save + real (kind=RKIND), dimension(:,:), pointer :: rtheta_p + real (kind=RKIND), dimension(:,:), pointer :: exner, exner_base + real (kind=RKIND), dimension(:,:), pointer :: rtheta_base, rho_base + real (kind=RKIND), dimension(:,:), pointer :: rtheta_p_save + real (kind=RKIND), dimension(:,:), pointer :: rho_p, rho_pp, rho, theta, theta_base + real (kind=RKIND), dimension(:,:), pointer :: rho_p_save + real (kind=RKIND), dimension(:,:), pointer :: rho_zz_old_split + real (kind=RKIND), dimension(:,:), pointer :: cqw, rtheta_pp_old, rtheta_pp + real (kind=RKIND), dimension(:,:), pointer :: cqu, pressure_base, pressure_p, pressure, v + real (kind=RKIND), dimension(:,:), pointer :: kdiff, pv_edge, pv_vertex, pv_cell, rho_edge, h_divergence, ke + real (kind=RKIND), dimension(:,:), pointer :: cofwr, cofwz, coftz, cofwt, a_tri, alpha_tri, gamma_tri + real (kind=RKIND), dimension(:), pointer :: cofrz + real (kind=RKIND), dimension(:,:), pointer :: gradPVn, gradPVt + + + real (kind=RKIND), dimension(:,:), pointer :: u_1, u_2 + real (kind=RKIND), dimension(:,:), pointer :: w_1, w_2 + real (kind=RKIND), dimension(:,:), pointer :: theta_m_1, theta_m_2 + real (kind=RKIND), dimension(:,:), pointer :: rho_zz_1, rho_zz_2 + real (kind=RKIND), dimension(:,:,:), pointer :: scalars_1, scalars_2 + real (kind=RKIND), dimension(:,:), pointer :: ruAvg, wwAvg, ruAvg_split, wwAvg_split + + real (kind=RKIND), dimension(:,:), pointer :: tend_ru, tend_rt, tend_rho, tend_rw, rt_diabatic_tend + real (kind=RKIND), dimension(:,:), pointer :: tend_u_euler, tend_w_euler, tend_theta_euler + real(kind=RKIND), dimension(:,:), pointer :: tend_w_pgf, tend_w_buoy + real(kind=RKIND), dimension(:,:,:), pointer :: scalar_tend_save + + real (kind=RKIND), dimension(:,:), pointer :: rthdynten, divergence, vorticity + + real (kind=RKIND), dimension(:,:), pointer :: lbc_u, lbc_w, lbc_ru, lbc_rho_edge, lbc_rho, lbc_rtheta_m, lbc_rho_zz, lbc_theta + real (kind=RKIND), dimension(:,:), pointer :: lbc_tend_u, lbc_tend_w, lbc_tend_ru, lbc_tend_rho_edge, lbc_tend_rho + real (kind=RKIND), dimension(:,:), pointer :: lbc_tend_rtheta_m, lbc_tend_rho_zz, lbc_tend_theta + + real (kind=RKIND), dimension(:,:,:), pointer :: lbc_scalars, lbc_tend_scalars + + nullify(state) + nullify(diag) + nullify(tend) + nullify(tend_physics) + nullify(lbc) + call mpas_pool_get_subpool(domain % blocklist % structs, 'state', state) + call mpas_pool_get_subpool(domain % blocklist % structs, 'diag', diag) + call mpas_pool_get_subpool(domain % blocklist % structs, 'tend', tend) + call mpas_pool_get_subpool(domain % blocklist % structs, 'tend_physics', tend_physics) + call mpas_pool_get_subpool(domain % blocklist % structs, 'lbc', lbc) + + MPAS_ACC_TIMER_START('atm_srk3 [ACC_data_xfer]') + call mpas_pool_get_array(diag, 'ru', ru) + !$acc exit data copyout(ru) ! use values from atm_init_coupled_diagnostics + call mpas_pool_get_array(diag, 'ru_p', ru_p) + !$acc exit data copyout(ru_p) + call mpas_pool_get_array(diag, 'ru_save', ru_save) + !$acc exit data delete(ru_save) + call mpas_pool_get_array(diag, 'rw', rw) + !$acc exit data copyout(rw) ! use values from atm_init_coupled_diagnostics + call mpas_pool_get_array(diag, 'rw_p', rw_p) + !$acc exit data copyout(rw_p) + call mpas_pool_get_array(diag, 'rw_save', rw_save) + !$acc exit data delete(rw_save) + call mpas_pool_get_array(diag, 'rtheta_p', rtheta_p) + !$acc exit data copyout(rtheta_p) ! use values from atm_init_coupled_diagnostics + call mpas_pool_get_array(diag, 'rtheta_p_save', rtheta_p_save) + !$acc exit data delete(rtheta_p_save) + call mpas_pool_get_array(diag, 'exner', exner) + !$acc exit data copyout(exner) ! use values from atm_init_coupled_diagnostics + call mpas_pool_get_array(diag, 'exner_base', exner_base) + !$acc exit data copyout(exner_base) ! use values from atm_init_coupled_diagnostics + call mpas_pool_get_array(diag, 'rtheta_base', rtheta_base) + !$acc exit data copyout(rtheta_base) ! use values from atm_init_coupled_diagnostics + call mpas_pool_get_array(diag, 'rho_base', rho_base) + !$acc exit data copyout(rho_base) + call mpas_pool_get_array(diag, 'rho', rho) + !$acc exit data copyout(rho) + call mpas_pool_get_array(diag, 'theta', theta) + !$acc exit data copyout(theta) + call mpas_pool_get_array(diag, 'theta_base', theta_base) + !$acc exit data copyout(theta_base) + call mpas_pool_get_array(diag, 'rho_p', rho_p) + !$acc exit data copyout(rho_p) ! use values from atm_init_coupled_diagnostics + call mpas_pool_get_array(diag, 'rho_p_save', rho_p_save) + !$acc exit data delete(rho_p_save) + call mpas_pool_get_array(diag, 'rho_pp', rho_pp) + !$acc exit data copyout(rho_pp) + call mpas_pool_get_array(diag, 'rho_zz_old_split', rho_zz_old_split) + !$acc exit data delete(rho_zz_old_split) + call mpas_pool_get_array(diag, 'cqw', cqw) + !$acc exit data delete(cqw) + call mpas_pool_get_array(diag, 'cqu', cqu) + !$acc exit data copyout(cqu) + call mpas_pool_get_array(diag, 'pressure_p', pressure_p) + !$acc exit data copyout(pressure_p) ! use values from atm_init_coupled_diagnostics + call mpas_pool_get_array(diag, 'pressure_base', pressure_base) + !$acc exit data copyout(pressure_base) ! use values from atm_init_coupled_diagnostics + call mpas_pool_get_array(diag, 'pressure', pressure) + !$acc exit data copyout(pressure) + call mpas_pool_get_array(diag, 'v', v) + !$acc exit data copyout(v) ! use values from atm_compute_solve_diagnostics + call mpas_pool_get_array(diag, 'rtheta_pp', rtheta_pp) + !$acc exit data copyout(rtheta_pp) + call mpas_pool_get_array(diag, 'rtheta_pp_old', rtheta_pp_old) + !$acc exit data copyout(rtheta_pp_old) + call mpas_pool_get_array(diag, 'kdiff', kdiff) + !$acc exit data copyout(kdiff) + call mpas_pool_get_array(diag, 'pv_edge', pv_edge) + !$acc exit data copyout(pv_edge) ! use values from atm_compute_solve_diagnostics + call mpas_pool_get_array(diag, 'pv_vertex', pv_vertex) + !$acc exit data copyout(pv_vertex) + call mpas_pool_get_array(diag, 'pv_cell', pv_cell) + !$acc exit data delete(pv_cell) + call mpas_pool_get_array(diag, 'rho_edge', rho_edge) + !$acc exit data copyout(rho_edge) ! use values from atm_compute_solve_diagnostics + call mpas_pool_get_array(diag, 'h_divergence', h_divergence) + !$acc exit data copyout(h_divergence) + call mpas_pool_get_array(diag, 'ke', ke) + !$acc exit data copyout(ke) ! use values from atm_compute_solve_diagnostics + call mpas_pool_get_array(diag, 'gradPVn', gradPVn) + !$acc exit data delete(gradPVn) + call mpas_pool_get_array(diag, 'gradPVt', gradPVt) + !$acc exit data delete(gradPVt) + + call mpas_pool_get_array(diag, 'alpha_tri', alpha_tri) + !$acc exit data delete(alpha_tri) + call mpas_pool_get_array(diag, 'gamma_tri', gamma_tri) + !$acc exit data delete(gamma_tri) + call mpas_pool_get_array(diag, 'a_tri', a_tri) + !$acc exit data delete(a_tri) + call mpas_pool_get_array(diag, 'cofwr', cofwr) + !$acc exit data delete(cofwr) + call mpas_pool_get_array(diag, 'cofwz', cofwz) + !$acc exit data delete(cofwz) + call mpas_pool_get_array(diag, 'coftz', coftz) + !$acc exit data delete(coftz) + call mpas_pool_get_array(diag, 'cofwt', cofwt) + !$acc exit data delete(cofwt) + call mpas_pool_get_array(diag, 'cofrz', cofrz) + !$acc exit data delete(cofrz) + call mpas_pool_get_array(diag, 'vorticity', vorticity) + !$acc exit data copyout(vorticity) + call mpas_pool_get_array(diag, 'divergence', divergence) + !$acc exit data copyout(divergence) + call mpas_pool_get_array(diag, 'ruAvg', ruAvg) + !$acc exit data copyout(ruAvg) + call mpas_pool_get_array(diag, 'ruAvg_split', ruAvg_split) + !$acc exit data copyout(ruAvg_split) + call mpas_pool_get_array(diag, 'wwAvg', wwAvg) + !$acc exit data copyout(wwAvg) + call mpas_pool_get_array(diag, 'wwAvg_split', wwAvg_split) + !$acc exit data copyout(wwAvg_split) - call mpas_pool_get_array(mesh, 'nearestRelaxationCell', nearestRelaxationCell) - !$acc enter data copyin(nearestRelaxationCell) + call mpas_pool_get_array(state, 'u', u_1, 1) + !$acc exit data copyout(u_1) + call mpas_pool_get_array(state, 'u', u_2, 2) + !$acc exit data delete(u_2) + call mpas_pool_get_array(state, 'w', w_1, 1) + !$acc exit data copyout(w_1) + call mpas_pool_get_array(state, 'w', w_2, 2) + !$acc exit data delete(w_2) + call mpas_pool_get_array(state, 'theta_m', theta_m_1, 1) + !$acc exit data copyout(theta_m_1) ! use values from atm_init_coupled_diagnostics + call mpas_pool_get_array(state, 'theta_m', theta_m_2, 2) + !$acc exit data copyout(theta_m_2) ! Delete gives incorrect results + call mpas_pool_get_array(state, 'rho_zz', rho_zz_1, 1) + !$acc exit data copyout(rho_zz_1) + call mpas_pool_get_array(state, 'rho_zz', rho_zz_2, 2) + !$acc exit data delete(rho_zz_2) + call mpas_pool_get_array(state, 'scalars', scalars_1, 1) + !$acc exit data copyout(scalars_1) + call mpas_pool_get_array(state, 'scalars', scalars_2, 2) + !$acc exit data copyout(scalars_2) ! Delete gives incorrect results - call mpas_pool_get_array(mesh, 'zgrid', zgrid) - !$acc enter data copyin(zgrid) - call mpas_pool_get_array(mesh, 'zxu', zxu) - !$acc enter data copyin(zxu) + call mpas_pool_get_array(tend, 'u', tend_ru) + !$acc exit data copyout(tend_ru) + call mpas_pool_get_array(tend, 'rho_zz', tend_rho) + !$acc exit data copyout(tend_rho) + call mpas_pool_get_array(tend, 'theta_m', tend_rt) + !$acc exit data copyout(tend_rt) + call mpas_pool_get_array(tend, 'w', tend_rw) + !$acc exit data copyout(tend_rw) + call mpas_pool_get_array(tend, 'rt_diabatic_tend', rt_diabatic_tend) + !$acc exit data copyout(rt_diabatic_tend) + call mpas_pool_get_array(tend, 'u_euler', tend_u_euler) + !$acc exit data copyout(tend_u_euler) + call mpas_pool_get_array(tend, 'theta_euler', tend_theta_euler) + !$acc exit data copyout(tend_theta_euler) + call mpas_pool_get_array(tend, 'w_euler', tend_w_euler) + !$acc exit data copyout(tend_w_euler) + call mpas_pool_get_array(tend, 'w_pgf', tend_w_pgf) + !$acc exit data copyout(tend_w_pgf) + call mpas_pool_get_array(tend, 'w_buoy', tend_w_buoy) + !$acc exit data copyout(tend_w_buoy) + call mpas_pool_get_array(tend, 'scalars_tend', scalar_tend_save) + !$acc exit data copyout(scalar_tend_save) + + + call mpas_pool_get_array(lbc, 'lbc_u', lbc_u, 2) + !$acc exit data delete(lbc_u) + call mpas_pool_get_array(lbc, 'lbc_w', lbc_w, 2) + !$acc exit data delete(lbc_w) + call mpas_pool_get_array(lbc, 'lbc_ru', lbc_ru, 2) + !$acc exit data delete(lbc_ru) + call mpas_pool_get_array(lbc, 'lbc_rho_edge', lbc_rho_edge, 2) + !$acc exit data delete(lbc_rho_edge) + call mpas_pool_get_array(lbc, 'lbc_theta', lbc_theta, 2) + !$acc exit data delete(lbc_theta) + call mpas_pool_get_array(lbc, 'lbc_rtheta_m', lbc_rtheta_m, 2) + !$acc exit data delete(lbc_rtheta_m) + call mpas_pool_get_array(lbc, 'lbc_rho_zz', lbc_rho_zz, 2) + !$acc exit data delete(lbc_rho_zz) + call mpas_pool_get_array(lbc, 'lbc_rho', lbc_rho, 2) + !$acc exit data delete(lbc_rho) + call mpas_pool_get_array(lbc, 'lbc_scalars', lbc_scalars, 2) + !$acc exit data delete(lbc_scalars) - call mpas_pool_get_array(mesh, 'dss', dss) - !$acc enter data copyin(dss) + + call mpas_pool_get_array(lbc, 'lbc_u', lbc_tend_u, 1) + !$acc exit data delete(lbc_tend_u) + call mpas_pool_get_array(lbc, 'lbc_ru', lbc_tend_ru, 1) + !$acc exit data delete(lbc_tend_ru) + call mpas_pool_get_array(lbc, 'lbc_rho_edge', lbc_tend_rho_edge, 1) + !$acc exit data delete(lbc_tend_rho_edge) + call mpas_pool_get_array(lbc, 'lbc_w', lbc_tend_w, 1) + !$acc exit data delete(lbc_tend_w) + call mpas_pool_get_array(lbc, 'lbc_theta', lbc_tend_theta, 1) + !$acc exit data delete(lbc_tend_theta) + call mpas_pool_get_array(lbc, 'lbc_rtheta_m', lbc_tend_rtheta_m, 1) + !$acc exit data delete(lbc_tend_rtheta_m) + call mpas_pool_get_array(lbc, 'lbc_rho_zz', lbc_tend_rho_zz, 1) + !$acc exit data delete(lbc_tend_rho_zz) + call mpas_pool_get_array(lbc, 'lbc_rho', lbc_tend_rho, 1) + !$acc exit data delete(lbc_tend_rho) + call mpas_pool_get_array(lbc, 'lbc_scalars', lbc_tend_scalars, 1) + !$acc exit data delete(lbc_tend_scalars) - call mpas_pool_get_array(mesh, 'specZoneMaskCell', specZoneMaskCell) - !$acc enter data copyin(specZoneMaskCell) + call mpas_pool_get_array(tend_physics, 'rthdynten', rthdynten) + !$acc exit data copyout(rthdynten) + MPAS_ACC_TIMER_STOP('atm_srk3 [ACC_data_xfer]') #endif - end subroutine mpas_atm_dynamics_init + end subroutine mpas_atm_post_dynamics_d2h !---------------------------------------------------------------------------- @@ -474,6 +1436,17 @@ subroutine mpas_atm_dynamics_finalize(domain) real (kind=RKIND), dimension(:,:), pointer :: zxu real (kind=RKIND), dimension(:,:), pointer :: dss real (kind=RKIND), dimension(:), pointer :: specZoneMaskCell + real (kind=RKIND), dimension(:,:), pointer :: defc_a + real (kind=RKIND), dimension(:,:), pointer :: defc_b + real (kind=RKIND), dimension(:), pointer :: latEdge + real (kind=RKIND), dimension(:), pointer :: angleEdge + real (kind=RKIND), dimension(:), pointer :: meshScalingDel2 + real (kind=RKIND), dimension(:), pointer :: meshScalingDel4 + real (kind=RKIND), dimension(:), pointer :: meshScalingRegionalCell + real (kind=RKIND), dimension(:), pointer :: meshScalingRegionalEdge + real (kind=RKIND), dimension(:), pointer :: latCell + real (kind=RKIND), dimension(:), pointer :: lonCell + real (kind=RKIND), dimension(:,:,:), pointer :: coeffs_reconstruct #endif @@ -626,6 +1599,39 @@ subroutine mpas_atm_dynamics_finalize(domain) call mpas_pool_get_array(mesh, 'specZoneMaskCell', specZoneMaskCell) !$acc exit data delete(specZoneMaskCell) + + call mpas_pool_get_array(mesh, 'defc_a', defc_a) + !$acc exit data delete(defc_a) + + call mpas_pool_get_array(mesh, 'defc_b', defc_b) + !$acc exit data delete(defc_b) + + call mpas_pool_get_array(mesh, 'latEdge', latEdge) + !$acc exit data delete(latEdge) + + call mpas_pool_get_array(mesh, 'angleEdge', angleEdge) + !$acc exit data delete(angleEdge) + + call mpas_pool_get_array(mesh, 'meshScalingDel2', meshScalingDel2) + !$acc exit data delete(meshScalingDel2) + + call mpas_pool_get_array(mesh, 'meshScalingDel4', meshScalingDel4) + !$acc exit data delete(meshScalingDel4) + + call mpas_pool_get_array(mesh, 'meshScalingRegionalCell', meshScalingRegionalCell) + !$acc exit data delete(meshScalingRegionalCell) + + call mpas_pool_get_array(mesh, 'meshScalingRegionalEdge', meshScalingRegionalEdge) + !$acc exit data delete(meshScalingRegionalEdge) + + call mpas_pool_get_array(mesh, 'latCell', latCell) + !$acc exit data delete(latCell) + + call mpas_pool_get_array(mesh, 'lonCell', lonCell) + !$acc exit data delete(lonCell) + + call mpas_pool_get_array(mesh, 'coeffs_reconstruct', coeffs_reconstruct) + !$acc exit data delete(coeffs_reconstruct) #endif end subroutine mpas_atm_dynamics_finalize @@ -666,12 +1672,14 @@ subroutine atm_timestep(domain, dt, nowTime, itimestep, exchange_halo_group) call mpas_pool_get_config(block % configs, 'config_time_integration', config_time_integration) call mpas_pool_get_config(block % configs, 'config_apply_lbcs', config_apply_lbcs) + call mpas_atm_pre_dynamics_h2d(domain) if (trim(config_time_integration) == 'SRK3') then call atm_srk3(domain, dt, itimestep, exchange_halo_group) else call mpas_log_write('Unknown time integration option '//trim(config_time_integration), messageType=MPAS_LOG_ERR) call mpas_log_write('Currently, only ''SRK3'' is supported.', messageType=MPAS_LOG_CRIT) end if + call mpas_atm_post_dynamics_d2h(domain) call mpas_set_timeInterval(dtInterval, dt=dt) currTime = nowTime + dtInterval @@ -763,6 +1771,8 @@ subroutine atm_srk3(domain, dt, itimestep, exchange_halo_group) real (kind=RKIND), dimension(:,:,:), pointer :: scalars, scalars_1, scalars_2 real (kind=RKIND), dimension(:,:), pointer :: rqvdynten, rthdynten, theta_m + real (kind=RKIND), dimension(:,:), pointer :: pressure_p, rtheta_p, exner, tend_u + real (kind=RKIND), dimension(:,:), pointer :: rho_pp, rtheta_pp, ru_p, rw_p, pv_edge, rho_edge real (kind=RKIND) :: theta_local, fac_m #ifndef MPAS_CAM_DYCORE @@ -846,7 +1856,9 @@ subroutine atm_srk3(domain, dt, itimestep, exchange_halo_group) ! allocate storage for physics tendency save ! allocate(qtot(nVertLevels,nCells+1)) + !$acc kernels qtot(:,nCells+1) = 0.0_RKIND + !$acc end kernels #ifndef MPAS_CAM_DYCORE call mpas_pool_get_field(tend_physics, 'tend_rtheta_physics', tend_rtheta_physicsField) @@ -916,8 +1928,14 @@ subroutine atm_srk3(domain, dt, itimestep, exchange_halo_group) ! ! Communicate halos for theta_m, scalars, pressure_p, and rtheta_p ! + call mpas_pool_get_array(state, 'theta_m', theta_m, 1) + call mpas_pool_get_array(state, 'scalars', scalars_1, 1) + call mpas_pool_get_array(diag, 'pressure_p', pressure_p) + call mpas_pool_get_array(diag, 'rtheta_p', rtheta_p) + !$acc update self(theta_m,scalars_1,pressure_p,rtheta_p) call exchange_halo_group(domain, 'dynamics:theta_m,scalars,pressure_p,rtheta_p') - + !$acc update device(theta_m,scalars_1,pressure_p,rtheta_p) + call mpas_timer_start('atm_rk_integration_setup') !$OMP PARALLEL DO @@ -978,6 +1996,8 @@ subroutine atm_srk3(domain, dt, itimestep, exchange_halo_group) tend_ru_physics, tend_rtheta_physics, tend_rho_physics) end if + !$acc enter data copyin(tend_rtheta_physics,tend_rho_physics,tend_ru_physics) + DYNAMICS_SUBSTEPS : do dynamics_substep = 1, dynamics_split @@ -996,9 +2016,11 @@ subroutine atm_srk3(domain, dt, itimestep, exchange_halo_group) end do !$OMP END PARALLEL DO call mpas_timer_stop('atm_compute_vert_imp_coefs') - + + call mpas_pool_get_array(diag, 'exner', exner) + !$acc update self(exner) call exchange_halo_group(domain, 'dynamics:exner') - + !$acc update device(exner) !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ! BEGIN Runge-Kutta loop @@ -1025,19 +2047,31 @@ subroutine atm_srk3(domain, dt, itimestep, exchange_halo_group) call mpas_timer_start('atm_compute_dyn_tend') allocate(delsq_theta(nVertLevels,nCells+1)) + !$acc kernels delsq_theta(:,nCells+1) = 0.0_RKIND + !$acc end kernels allocate(delsq_w(nVertLevels,nCells+1)) + !$acc kernels delsq_w(:,nCells+1) = 0.0_RKIND + !$acc end kernels !! allocate(qtot(nVertLevels,nCells+1)) ! initializing this earlier in solution sequence allocate(delsq_divergence(nVertLevels,nCells+1)) + !$acc kernels delsq_divergence(:,nCells+1) = 0.0_RKIND + !$acc end kernels allocate(delsq_u(nVertLevels,nEdges+1)) + !$acc kernels delsq_u(:,nEdges+1) = 0.0_RKIND + !$acc end kernels !! allocate(delsq_circulation(nVertLevels,nVertices+1)) ! no longer used -> removed allocate(delsq_vorticity(nVertLevels,nVertices+1)) + !$acc kernels delsq_vorticity(:,nVertices+1) = 0.0_RKIND + !$acc end kernels allocate(dpdz(nVertLevels,nCells+1)) + !$acc kernels dpdz(:,nCells+1) = 0.0_RKIND + !$acc end kernels !$OMP PARALLEL DO do thread=1,nThreads @@ -1070,7 +2104,10 @@ subroutine atm_srk3(domain, dt, itimestep, exchange_halo_group) !*********************************** ! tend_u + call mpas_pool_get_array(tend, 'u', tend_u) + !$acc update self(tend_u) call exchange_halo_group(domain, 'dynamics:tend_u') + !$acc update device(tend_u) call mpas_timer_start('small_step_prep') @@ -1086,13 +2123,13 @@ subroutine atm_srk3(domain, dt, itimestep, exchange_halo_group) !------------------------------------------------------------------------------------------------------------------------ if (config_apply_lbcs) then ! adjust boundary tendencies for regional_MPAS dry dynamics in the specified zone - + allocate(ru_driving_tend(nVertLevels,nEdges+1)) allocate(rt_driving_tend(nVertLevels,nCells+1)) allocate(rho_driving_tend(nVertLevels,nCells+1)) - ru_driving_tend(1:nVertLevels,1:nEdges+1) = mpas_atm_get_bdy_tend( clock, block, nVertLevels, nEdges, 'ru', 0.0_RKIND ) - rt_driving_tend(1:nVertLevels,1:nCells+1) = mpas_atm_get_bdy_tend( clock, block, nVertLevels, nCells, 'rtheta_m', 0.0_RKIND ) - rho_driving_tend(1:nVertLevels,1:nCells+1) = mpas_atm_get_bdy_tend( clock, block, nVertLevels, nCells, 'rho_zz', 0.0_RKIND ) + call mpas_atm_get_bdy_tend( clock, block, nVertLevels, nEdges, 'ru', 0.0_RKIND, ru_driving_tend) + call mpas_atm_get_bdy_tend( clock, block, nVertLevels, nCells, 'rtheta_m', 0.0_RKIND, rt_driving_tend) + call mpas_atm_get_bdy_tend( clock, block, nVertLevels, nCells, 'rho_zz', 0.0_RKIND, rho_driving_tend) !$OMP PARALLEL DO do thread=1,nThreads call atm_bdy_adjust_dynamics_speczone_tend( tend, mesh, block % configs, nVertLevels, & @@ -1115,10 +2152,11 @@ subroutine atm_srk3(domain, dt, itimestep, exchange_halo_group) allocate(rho_driving_values(nVertLevels,nCells+1)) time_dyn_step = dt_dynamics*real(dynamics_substep-1) + rk_timestep(rk_step) - ru_driving_values(1:nVertLevels,1:nEdges+1) = mpas_atm_get_bdy_state( clock, block, nVertLevels, nEdges, 'ru', time_dyn_step ) - rt_driving_values(1:nVertLevels,1:nCells+1) = mpas_atm_get_bdy_state( clock, block, nVertLevels, nCells, 'rtheta_m', time_dyn_step ) - rho_driving_values(1:nVertLevels,1:nCells+1) = mpas_atm_get_bdy_state( clock, block, nVertLevels, nCells, 'rho_zz', time_dyn_step ) + call mpas_atm_get_bdy_state(clock, block, nVertLevels, nEdges, 'ru', time_dyn_step, ru_driving_values) + call mpas_atm_get_bdy_state(clock, block, nVertLevels, nCells, 'rtheta_m', time_dyn_step, rt_driving_values) + call mpas_atm_get_bdy_state(clock, block, nVertLevels, nCells, 'rho_zz', time_dyn_step, rho_driving_values) + call mpas_timer_start('atm_bdy_adjust_dynamics_relaxzone_tend') !$OMP PARALLEL DO do thread=1,nThreads call atm_bdy_adjust_dynamics_relaxzone_tend( block % configs, tend, state, diag, mesh, nVertLevels, dt, & @@ -1129,6 +2167,7 @@ subroutine atm_srk3(domain, dt, itimestep, exchange_halo_group) edgeSolveThreadStart(thread), edgeSolveThreadEnd(thread) ) end do !$OMP END PARALLEL DO + call mpas_timer_stop('atm_bdy_adjust_dynamics_relaxzone_tend') deallocate(ru_driving_values) deallocate(rt_driving_values) @@ -1143,8 +2182,11 @@ subroutine atm_srk3(domain, dt, itimestep, exchange_halo_group) !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! do small_step = 1, number_sub_steps(rk_step) - + + call mpas_pool_get_array(diag, 'rho_pp', rho_pp) + !$acc update self(rho_pp) call exchange_halo_group(domain, 'dynamics:rho_pp') + !$acc update device(rho_pp) call mpas_timer_start('atm_advance_acoustic_step') @@ -1167,7 +2209,10 @@ subroutine atm_srk3(domain, dt, itimestep, exchange_halo_group) ! rtheta_pp ! This is the only communications needed during the acoustic steps because we solve for u on all edges of owned cells + call mpas_pool_get_array(diag, 'rtheta_pp', rtheta_pp) + !$acc update self(rtheta_pp) call exchange_halo_group(domain, 'dynamics:rtheta_pp') + !$acc update device(rtheta_pp) ! complete update of horizontal momentum by including 3d divergence damping at the end of the acoustic step @@ -1187,7 +2232,13 @@ subroutine atm_srk3(domain, dt, itimestep, exchange_halo_group) ! ! Communicate halos for rw_p[1,2], ru_p[1,2], rho_pp[1,2], rtheta_pp[2] ! + call mpas_pool_get_array(diag, 'ru_p', ru_p) + call mpas_pool_get_array(diag, 'rw_p', rw_p) + call mpas_pool_get_array(diag, 'rho_pp', rho_pp) + call mpas_pool_get_array(diag, 'rtheta_pp', rtheta_pp) + !$acc update self(rw_p,ru_p,rho_pp,rtheta_pp) call exchange_halo_group(domain, 'dynamics:rw_p,ru_p,rho_pp,rtheta_pp') + !$acc update device(rw_p,ru_p,rho_pp,rtheta_pp) call mpas_timer_start('atm_recover_large_step_variables') @@ -1219,39 +2270,50 @@ subroutine atm_srk3(domain, dt, itimestep, exchange_halo_group) time_dyn_step = dt_dynamics*real(dynamics_substep-1) + rk_timestep(rk_step) - ru_driving_values(1:nVertLevels,1:nEdges+1) = mpas_atm_get_bdy_state( clock, block, nVertLevels, nEdges, 'u', time_dyn_step ) + call mpas_atm_get_bdy_state(clock, block, nVertLevels, nEdges, 'u', time_dyn_step, ru_driving_values) ! do this inline at present - it is simple enough + !$acc parallel + !$acc loop gang do iEdge = 1, nEdgesSolve if(bdyMaskEdge(iEdge) > nRelaxZone) then + !$acc loop vector do k = 1, nVertLevels u(k,iEdge) = ru_driving_values(k,iEdge) end do end if end do + !$acc end parallel - ru_driving_values(1:nVertLevels,1:nEdges+1) = mpas_atm_get_bdy_state( clock, block, nVertLevels, nEdges, 'ru', time_dyn_step ) + call mpas_atm_get_bdy_state(clock, block, nVertLevels, nEdges, 'ru', time_dyn_step, ru_driving_values) call mpas_pool_get_array(diag, 'ru', u) ! do this inline at present - it is simple enough + !$acc parallel + !$acc loop gang do iEdge = 1, nEdges if(bdyMaskEdge(iEdge) > nRelaxZone) then + !$acc loop vector do k = 1, nVertLevels u(k,iEdge) = ru_driving_values(k,iEdge) end do end if end do - + !$acc end parallel + deallocate(ru_driving_values) end if ! regional_MPAS addition !------------------------------------------------------------------- + call mpas_pool_get_array(state, 'u', u, 2) + !$acc update self(u) ! u if (config_apply_lbcs) then call exchange_halo_group(domain, 'dynamics:u_123') else call exchange_halo_group(domain, 'dynamics:u_3') end if + !$acc update device(u) ! scalar advection: RK3 scheme of Skamarock and Gassmann (2011). ! PD or monotonicity constraints applied only on the final Runge-Kutta substep. @@ -1263,15 +2325,18 @@ subroutine atm_srk3(domain, dt, itimestep, exchange_halo_group) if (config_apply_lbcs) then ! adjust boundary tendencies for regional_MPAS scalar transport + call mpas_pool_get_array(state, 'scalars', scalars_2, 2) + !$acc update self(scalars_2) call exchange_halo_group(domain, 'dynamics:scalars') + !$acc update device(scalars_2) allocate(scalars_driving(num_scalars,nVertLevels,nCells+1)) ! ! get the scalar values driving the regional boundary conditions ! - scalars_driving(:,:,:) = mpas_atm_get_bdy_state(clock, block, num_scalars, nVertLevels, nCells, & - 'scalars', rk_timestep(rk_step)) + call mpas_atm_get_bdy_state(clock, block, num_scalars, nVertLevels, nCells, & + 'scalars', rk_timestep(rk_step), scalars_driving) !$OMP PARALLEL DO do thread=1,nThreads @@ -1293,7 +2358,9 @@ subroutine atm_srk3(domain, dt, itimestep, exchange_halo_group) allocate(ke_vertex(nVertLevels,nVertices+1)) ke_vertex(:,nVertices+1) = 0.0_RKIND allocate(ke_edge(nVertLevels,nEdges+1)) + !$acc kernels ke_edge(:,nEdges+1) = 0.0_RKIND + !$acc end kernels !$OMP PARALLEL DO do thread=1,nThreads @@ -1309,17 +2376,25 @@ subroutine atm_srk3(domain, dt, itimestep, exchange_halo_group) call mpas_timer_stop('atm_compute_solve_diagnostics') + call mpas_pool_get_array(state, 'w', w, 2) + call mpas_pool_get_array(diag, 'pv_edge', pv_edge) + call mpas_pool_get_array(diag, 'rho_edge', rho_edge) + !$acc update self(w,pv_edge,rho_edge) if (config_scalar_advection .and. (.not. config_split_dynamics_transport) ) then ! ! Communicate halos for w[1,2], pv_edge[1,2], rho_edge[1,2], scalars[1,2] ! + call mpas_pool_get_array(state, 'scalars', scalars_2, 2) + !$acc update self(scalars_2) call exchange_halo_group(domain, 'dynamics:w,pv_edge,rho_edge,scalars') + !$acc update device(scalars_2) else ! ! Communicate halos for w[1,2], pv_edge[1,2], rho_edge[1,2] ! call exchange_halo_group(domain, 'dynamics:w,pv_edge,rho_edge') end if + !$acc update device(w,pv_edge,rho_edge) ! set the zero-gradient condition on w for regional_MPAS @@ -1333,7 +2408,10 @@ subroutine atm_srk3(domain, dt, itimestep, exchange_halo_group) !$OMP END PARALLEL DO ! w halo values needs resetting after regional boundary update + call mpas_pool_get_array(state, 'w', w, 2) + !$acc update self(w) call exchange_halo_group(domain, 'dynamics:w') + !$acc update device(w) end if ! end of regional_MPAS addition @@ -1344,7 +2422,12 @@ subroutine atm_srk3(domain, dt, itimestep, exchange_halo_group) ! ! Communicate halos for theta_m[1,2], pressure_p[1,2], and rtheta_p[1,2] ! + call mpas_pool_get_array(state, 'theta_m', theta_m, 2) + call mpas_pool_get_array(diag, 'pressure_p', pressure_p) + call mpas_pool_get_array(diag, 'rtheta_p', rtheta_p) + !$acc update self(theta_m,pressure_p,rtheta_p) call exchange_halo_group(domain, 'dynamics:theta_m,pressure_p,rtheta_p') + !$acc update device(theta_m,pressure_p,rtheta_p) ! ! Note: A halo exchange for 'exner' here as well as after the call @@ -1381,6 +2464,7 @@ subroutine atm_srk3(domain, dt, itimestep, exchange_halo_group) deallocate(qtot) ! we are finished with these now + !$acc exit data delete(tend_rtheta_physics,tend_rho_physics,tend_ru_physics) #ifndef MPAS_CAM_DYCORE call mpas_deallocate_scratch_field(tend_rtheta_physicsField) call mpas_deallocate_scratch_field(tend_rho_physicsField) @@ -1401,7 +2485,7 @@ subroutine atm_srk3(domain, dt, itimestep, exchange_halo_group) if(config_time_integration_order == 2) rk_timestep(1) = dt/2. RK3_SPLIT_TRANSPORT : do rk_step = 1, 3 ! Runge-Kutta loop - + call advance_scalars('scalars', domain, rk_step, rk_timestep, config_monotonic, config_positive_definite, & config_time_integration_order, config_split_dynamics_transport, exchange_halo_group) @@ -1409,15 +2493,18 @@ subroutine atm_srk3(domain, dt, itimestep, exchange_halo_group) if (config_apply_lbcs) then ! adjust boundary tendencies for regional_MPAS scalar transport ! need to fill halo for horizontal filter + call mpas_pool_get_array(state, 'scalars', scalars_2, 2) + !$acc update self(scalars_2) call exchange_halo_group(domain, 'dynamics:scalars') + !$acc update device(scalars_2) allocate(scalars_driving(num_scalars,nVertLevels,nCells+1)) ! ! get the scalar values driving the regional boundary conditions ! - scalars_driving(:,:,:) = mpas_atm_get_bdy_state(clock, block, num_scalars, nVertLevels, nCells, & - 'scalars', rk_timestep(rk_step)) + call mpas_atm_get_bdy_state(clock, block, num_scalars, nVertLevels, nCells, & + 'scalars', rk_timestep(rk_step), scalars_driving) !$OMP PARALLEL DO do thread=1,nThreads @@ -1435,7 +2522,10 @@ subroutine atm_srk3(domain, dt, itimestep, exchange_halo_group) !------------------------------------------------------------------------------------------------------------------------ if (rk_step < 3) then + call mpas_pool_get_array(state, 'scalars', scalars_2, 2) + !$acc update self(scalars_2) call exchange_halo_group(domain, 'dynamics:scalars') + !$acc update device(scalars_2) end if end do RK3_SPLIT_TRANSPORT @@ -1468,15 +2558,19 @@ subroutine atm_srk3(domain, dt, itimestep, exchange_halo_group) #ifdef DO_PHYSICS call mpas_pool_get_array(state, 'scalars', scalars_1, 1) + !$acc update self(scalars_1) call mpas_pool_get_array(state, 'scalars', scalars_2, 2) + !$acc update self(scalars_2) if(config_convection_scheme == 'cu_grell_freitas' .or. & config_convection_scheme == 'cu_ntiedtke') then call mpas_pool_get_array(tend_physics, 'rqvdynten', rqvdynten) call mpas_pool_get_array(state, 'theta_m', theta_m, 2) + !$acc update self(theta_m) call mpas_pool_get_array(tend_physics, 'rthdynten', rthdynten) + !$acc update self(rthdynten) !NOTE: The calculation of the tendency due to horizontal and vertical advection for the water vapor mixing ratio !requires that the subroutine atm_advance_scalars_mono was called on the third Runge Kutta step, so that a halo @@ -1501,6 +2595,7 @@ subroutine atm_srk3(domain, dt, itimestep, exchange_halo_group) where ( scalars_2(:,:,:) < 0.0) & scalars_2(:,:,:) = 0.0 + !$acc update device(scalars_2, rthdynten) !call microphysics schemes: if (trim(config_microp_scheme) /= 'off') then call mpas_timer_start('microphysics') @@ -1528,8 +2623,8 @@ subroutine atm_srk3(domain, dt, itimestep, exchange_halo_group) allocate(rho_driving_values(nVertLevels,nCells+1)) time_dyn_step = dt ! end of full timestep values - rt_driving_values(1:nVertLevels,1:nCells+1) = mpas_atm_get_bdy_state( clock, block, nVertLevels, nCells, 'rtheta_m', time_dyn_step ) - rho_driving_values(1:nVertLevels,1:nCells+1) = mpas_atm_get_bdy_state( clock, block, nVertLevels, nCells, 'rho_zz', time_dyn_step ) + call mpas_atm_get_bdy_state(clock, block, nVertLevels, nCells, 'rtheta_m', time_dyn_step, rt_driving_values) + call mpas_atm_get_bdy_state(clock, block, nVertLevels, nCells, 'rho_zz', time_dyn_step, rho_driving_values) !$OMP PARALLEL DO do thread=1,nThreads @@ -1548,14 +2643,17 @@ subroutine atm_srk3(domain, dt, itimestep, exchange_halo_group) if (config_apply_lbcs) then ! adjust boundary values for regional_MPAS scalar transport + call mpas_pool_get_array(state, 'scalars', scalars_2, 2) + !$acc update self(scalars_2) call exchange_halo_group(domain, 'dynamics:scalars') + !$acc update device(scalars_2) allocate(scalars_driving(num_scalars,nVertLevels,nCells+1)) ! ! get the scalar values driving the regional boundary conditions ! - scalars_driving(:,:,:) = mpas_atm_get_bdy_state(clock, block, num_scalars, nVertLevels, nCells, 'scalars', dt) + call mpas_atm_get_bdy_state(clock, block, num_scalars, nVertLevels, nCells, 'scalars', dt, scalars_driving) !$OMP PARALLEL DO do thread=1,nThreads @@ -1794,12 +2892,6 @@ subroutine atm_rk_integration_setup( state, diag, nVertLevels, num_scalars, & call mpas_pool_get_array(state, 'scalars', scalars_1, 1) call mpas_pool_get_array(state, 'scalars', scalars_2, 2) - MPAS_ACC_TIMER_START('atm_rk_integration_setup [ACC_data_xfer]') - !$acc enter data create(ru_save, u_2, rw_save, rtheta_p_save, rho_p_save, & - !$acc w_2, theta_m_2, rho_zz_2, rho_zz_old_split, scalars_2) & - !$acc copyin(ru, rw, rtheta_p, rho_p, u_1, w_1, theta_m_1, & - !$acc rho_zz_1, scalars_1) - MPAS_ACC_TIMER_STOP('atm_rk_integration_setup [ACC_data_xfer]') !$acc kernels theta_m_2(:,cellEnd+1) = 0.0_RKIND @@ -1847,13 +2939,7 @@ subroutine atm_rk_integration_setup( state, diag, nVertLevels, num_scalars, & end do !$acc end parallel - MPAS_ACC_TIMER_START('atm_rk_integration_setup [ACC_data_xfer]') - !$acc exit data copyout(ru_save, rw_save, rtheta_p_save, rho_p_save, u_2, & - !$acc w_2, theta_m_2, rho_zz_2, rho_zz_old_split, scalars_2) & - !$acc delete(ru, rw, rtheta_p, rho_p, u_1, w_1, theta_m_1, & - !$acc rho_zz_1, scalars_1) - MPAS_ACC_TIMER_STOP('atm_rk_integration_setup [ACC_data_xfer]') - + end subroutine atm_rk_integration_setup @@ -1903,10 +2989,6 @@ subroutine atm_compute_moist_coefficients( dims, state, diag, mesh, & moist_start = moist_start_ptr moist_end = moist_end_ptr - MPAS_ACC_TIMER_START('atm_compute_moist_coefficients [ACC_data_xfer]') - !$acc enter data create(qtot, cqw, cqu) & - !$acc copyin(scalars) - MPAS_ACC_TIMER_STOP('atm_compute_moist_coefficients [ACC_data_xfer]') !$acc parallel default(present) !$acc loop gang worker @@ -1956,10 +3038,6 @@ subroutine atm_compute_moist_coefficients( dims, state, diag, mesh, & end do !$acc end parallel - MPAS_ACC_TIMER_START('atm_compute_moist_coefficients [ACC_data_xfer]') - !$acc exit data copyout(cqw, cqu, qtot) & - !$acc delete(scalars) - MPAS_ACC_TIMER_STOP('atm_compute_moist_coefficients [ACC_data_xfer]') end subroutine atm_compute_moist_coefficients @@ -2092,9 +3170,7 @@ subroutine atm_compute_vert_imp_coefs_work(nCells, moist_start, moist_end, dts, real (kind=RKIND), dimension( nVertLevels ) :: b_tri, c_tri MPAS_ACC_TIMER_START('atm_compute_vert_imp_coefs_work [ACC_data_xfer]') - !$acc enter data copyin(cqw, p, t, qtot, rb, rtb, rt, pb) - !$acc enter data create(cofrz, cofwr, cofwz, coftz, cofwt, a_tri, b_tri, & - !$acc c_tri, alpha_tri, gamma_tri) + !$acc enter data create(b_tri, c_tri) MPAS_ACC_TIMER_STOP('atm_compute_vert_imp_coefs_work [ACC_data_xfer]') ! set coefficients @@ -2176,9 +3252,7 @@ subroutine atm_compute_vert_imp_coefs_work(nCells, moist_start, moist_end, dts, !$acc end parallel MPAS_ACC_TIMER_START('atm_compute_vert_imp_coefs_work [ACC_data_xfer]') - !$acc exit data copyout(cofrz, cofwr, cofwz, coftz, cofwt, a_tri, b_tri, & - !$acc c_tri, alpha_tri, gamma_tri) - !$acc exit data delete(cqw, p, t, qtot, rb, rtb, rt, pb) + !$acc exit data copyout( b_tri, c_tri) MPAS_ACC_TIMER_STOP('atm_compute_vert_imp_coefs_work [ACC_data_xfer]') end subroutine atm_compute_vert_imp_coefs_work @@ -2283,9 +3357,6 @@ subroutine atm_set_smlstep_pert_variables_work(nCells, nEdges, & integer :: iCell, iEdge, i, k real (kind=RKIND) :: flux - MPAS_ACC_TIMER_START('atm_set_smlstep_pert_variables [ACC_data_xfer]') - !$acc enter data copyin(u_tend, w_tend) - MPAS_ACC_TIMER_STOP('atm_set_smlstep_pert_variables [ACC_data_xfer]') ! we solve for omega instead of w (see Klemp et al MWR 2007), ! so here we change the w_p tendency to an omega_p tendency @@ -2318,10 +3389,6 @@ subroutine atm_set_smlstep_pert_variables_work(nCells, nEdges, & end do !$acc end parallel - MPAS_ACC_TIMER_START('atm_set_smlstep_pert_variables [ACC_data_xfer]') - !$acc exit data delete(u_tend) - !$acc exit data copyout(w_tend) - MPAS_ACC_TIMER_STOP('atm_set_smlstep_pert_variables [ACC_data_xfer]') end subroutine atm_set_smlstep_pert_variables_work @@ -2554,17 +3621,6 @@ subroutine atm_advance_acoustic_step_work(nCells, nEdges, nCellsSolve, cellStart resm = (1.0 - epssm) / (1.0 + epssm) rdts = 1./dts - MPAS_ACC_TIMER_START('atm_advance_acoustic_step [ACC_data_xfer]') - !$acc enter data copyin(exner,cqu,cofwt,coftz,cofrz,cofwr,cofwz, & - !$acc a_tri,alpha_tri,gamma_tri,rho_zz,theta_m,w, & - !$acc tend_ru,tend_rho,tend_rt,tend_rw,rw,rw_save) - !$acc enter data create(rtheta_pp_old) - if(small_step == 1) then - !$acc enter data create(ru_p,ruAvg,rho_pp,rtheta_pp,wwAvg,rw_p) - else - !$acc enter data copyin(ru_p,ruAvg,rho_pp,rtheta_pp,wwAvg,rw_p) - end if - MPAS_ACC_TIMER_STOP('atm_advance_acoustic_step [ACC_data_xfer]') if(small_step /= 1) then ! not needed on first small step @@ -2791,13 +3847,6 @@ subroutine atm_advance_acoustic_step_work(nCells, nEdges, nCellsSolve, cellStart end do ! end of loop over cells !$acc end parallel - MPAS_ACC_TIMER_START('atm_advance_acoustic_step [ACC_data_xfer]') - !$acc exit data delete(exner,cqu,cofwt,coftz,cofrz,cofwr,cofwz, & - !$acc a_tri,alpha_tri,gamma_tri,rho_zz,theta_m,w, & - !$acc tend_ru,tend_rho,tend_rt,tend_rw,rw,rw_save) - !$acc exit data copyout(rtheta_pp_old,ru_p,ruAvg,rho_pp, & - !$acc rtheta_pp,wwAvg,rw_p) - MPAS_ACC_TIMER_STOP('atm_advance_acoustic_step [ACC_data_xfer]') end subroutine atm_advance_acoustic_step_work @@ -2849,9 +3898,6 @@ subroutine atm_divergence_damping_3d( state, diag, mesh, configs, dts, edgeStart nCellsSolve = nCellsSolve_ptr nVertLevels = nVertLevels_ptr - MPAS_ACC_TIMER_START('atm_divergence_damping_3d [ACC_data_xfer]') - !$acc enter data copyin(ru_p, rtheta_pp, rtheta_pp_old, theta_m) - MPAS_ACC_TIMER_STOP('atm_divergence_damping_3d [ACC_data_xfer]') !$acc parallel default(present) !$acc loop gang worker @@ -2884,10 +3930,6 @@ subroutine atm_divergence_damping_3d( state, diag, mesh, configs, dts, edgeStart end do ! end loop over edges !$acc end parallel - MPAS_ACC_TIMER_START('atm_divergence_damping_3d [ACC_data_xfer]') - !$acc exit data copyout(ru_p) & - !$acc delete(rtheta_pp, rtheta_pp_old, theta_m) - MPAS_ACC_TIMER_STOP('atm_divergence_damping_3d [ACC_data_xfer]') end subroutine atm_divergence_damping_3d @@ -3078,17 +4120,6 @@ subroutine atm_recover_large_step_variables_work(nCells, nEdges, nCellsSolve, nE integer :: i, iCell, iEdge, k, cell1, cell2 real (kind=RKIND) :: invNs, rcv, p0, flux - MPAS_ACC_TIMER_START('atm_recover_large_step_variables [ACC_data_xfer]') - !$acc enter data copyin(rho_p_save,rho_pp,rho_base,rw_save,rw_p, & - !$acc rtheta_p_save,rtheta_pp,rtheta_base, & - !$acc ru_save,ru_p,wwAvg,ruAvg) & - !$acc create(rho_zz,rho_p,rw,w,rtheta_p,theta_m, & - !$acc ru,u) - if (rk_step == 3) then - !$acc enter data copyin(rt_diabatic_tend,exner_base) & - !$acc create(exner,pressure_p) - end if - MPAS_ACC_TIMER_STOP('atm_recover_large_step_variables [ACC_data_xfer]') rcv = rgas/(cp-rgas) p0 = 1.0e+05 ! this should come from somewhere else... @@ -3234,17 +4265,6 @@ subroutine atm_recover_large_step_variables_work(nCells, nEdges, nCellsSolve, nE end do !$acc end parallel - MPAS_ACC_TIMER_START('atm_recover_large_step_variables [ACC_data_xfer]') - !$acc exit data delete(rho_p_save,rho_pp,rho_base,rw_save,rw_p, & - !$acc rtheta_p_save,rtheta_pp,rtheta_base, & - !$acc ru_save,ru_p) & - !$acc copyout(rho_zz,rho_p,rw,w,rtheta_p,theta_m, & - !$acc ru,u,wwAvg,ruAvg) - if (rk_step == 3) then - !$acc exit data delete(rt_diabatic_tend,exner_base) & - !$acc copyout(exner,pressure_p) - end if - MPAS_ACC_TIMER_STOP('atm_recover_large_step_variables [ACC_data_xfer]') end subroutine atm_recover_large_step_variables_work @@ -3481,10 +4501,9 @@ subroutine atm_advance_scalars_work(nCells, num_scalars, dt, & MPAS_ACC_TIMER_START('atm_advance_scalars [ACC_data_xfer]') !$acc enter data create(horiz_flux_arr) - !$acc enter data copyin(uhAvg, scalar_new) MPAS_ACC_TIMER_STOP('atm_advance_scalars [ACC_data_xfer]') - !$acc parallel async + !$acc parallel !$acc loop gang worker private(scalar_weight2, ica) do iEdge=edgeStart,edgeEnd @@ -3578,12 +4597,6 @@ subroutine atm_advance_scalars_work(nCells, num_scalars, dt, & ! MPAS_ACC_TIMER_START('atm_advance_scalars [ACC_data_xfer]') -#ifndef DO_PHYSICS - !$acc enter data create(scalar_tend_save) -#else - !$acc enter data copyin(scalar_tend_save) -#endif - !$acc enter data copyin(scalar_old, fnm, fnp, rdnw, wwAvg, rho_zz_old, rho_zz_new) !$acc enter data create(scalar_tend_column, wdtn) MPAS_ACC_TIMER_STOP('atm_advance_scalars [ACC_data_xfer]') @@ -3666,9 +4679,8 @@ subroutine atm_advance_scalars_work(nCells, num_scalars, dt, & !$acc end parallel MPAS_ACC_TIMER_START('atm_advance_scalars [ACC_data_xfer]') - !$acc exit data copyout(scalar_new) - !$acc exit data delete(scalar_tend_column, wdtn, uhAvg, wwAvg, scalar_old, fnm, fnp, & - !$acc rdnw, rho_zz_old, rho_zz_new, horiz_flux_arr, scalar_tend_save) + !$acc exit data delete(scalar_tend_column, wdtn, & + !$acc horiz_flux_arr) MPAS_ACC_TIMER_STOP('atm_advance_scalars [ACC_data_xfer]') end subroutine atm_advance_scalars_work @@ -3927,19 +4939,9 @@ subroutine atm_advance_scalars_mono_work(field_name, block, state, nCells, nEdge MPAS_ACC_TIMER_START('atm_advance_scalars_mono [ACC_data_xfer]') - !$acc data present(nEdgesOnCell, edgesOnCell, edgesOnCell_sign, & - !$acc invAreaCell, cellsOnCell, cellsOnEdge, nAdvCellsForEdge, & - !$acc advCellsForEdge, adv_coefs, adv_coefs_3rd, dvEdge, bdyMaskCell) - -#ifdef DO_PHYSICS - !$acc enter data copyin(scalar_tend) -#else - !$acc enter data create(scalar_tend) -#endif if (local_advance_density) then !$acc enter data copyin(rho_zz_int) end if - !$acc enter data copyin(scalars_old, rho_zz_old, rdnw, uhAvg, wwAvg) MPAS_ACC_TIMER_STOP('atm_advance_scalars_mono [ACC_data_xfer]') !$acc parallel @@ -3964,8 +4966,6 @@ subroutine atm_advance_scalars_mono_work(field_name, block, state, nCells, nEdge !$acc end parallel MPAS_ACC_TIMER_START('atm_advance_scalars_mono [ACC_data_xfer]') - !$acc exit data copyout(scalar_tend) - !$acc update self(scalars_old) MPAS_ACC_TIMER_STOP('atm_advance_scalars_mono [ACC_data_xfer]') @@ -4028,12 +5028,10 @@ subroutine atm_advance_scalars_mono_work(field_name, block, state, nCells, nEdge end if + ! AG: Note that scalar_old, scalar_new in this subroutine are not the same + ! variables as 'scalars' obtained from the state pool. MPAS_ACC_TIMER_START('atm_advance_scalars_mono [ACC_data_xfer]') - if (.not. local_advance_density) then - !$acc enter data copyin(rho_zz_new) - end if - !$acc enter data copyin(scalars_new, fnm, fnp) - !$acc enter data create(scalar_old, scalar_new, scale_arr, s_min, s_max, & + !$acc enter data create(scale_arr, s_min, s_max, & !$acc flux_arr, flux_tmp, flux_upwind_tmp, wdtn) MPAS_ACC_TIMER_STOP('atm_advance_scalars_mono [ACC_data_xfer]') @@ -4496,7 +5494,6 @@ subroutine atm_advance_scalars_mono_work(field_name, block, state, nCells, nEdge !$acc end parallel #ifdef DEBUG_TRANSPORT - !$acc update self(scalar_new) !$acc update self(s_max) !$acc update self(s_min) @@ -4541,15 +5538,9 @@ subroutine atm_advance_scalars_mono_work(field_name, block, state, nCells, nEdge MPAS_ACC_TIMER_START('atm_advance_scalars_mono [ACC_data_xfer]') if (local_advance_density) then !$acc exit data copyout(rho_zz_int) - else - !$acc exit data delete(rho_zz_new) end if - !$acc exit data copyout(scalars_new) - !$acc exit data delete(scalars_old, scalar_old, scalar_new, scale_arr, s_min, s_max, & - !$acc rho_zz_old, flux_arr, flux_tmp, flux_upwind_tmp, wdtn, wwAvg, & - !$acc uhAvg, fnm, fnp, rdnw) - - !$acc end data + !$acc exit data delete(scale_arr, s_min, s_max, & + !$acc flux_arr, flux_tmp, flux_upwind_tmp, wdtn) MPAS_ACC_TIMER_STOP('atm_advance_scalars_mono [ACC_data_xfer]') end subroutine atm_advance_scalars_mono_work @@ -4981,25 +5972,49 @@ subroutine atm_compute_dyn_tend_work(nCells, nEdges, nVertices, nVertLevels_dumm flux4(q_im2, q_im1, q_i, q_ip1, ua) + & coef3*abs(ua)*((q_ip1 - q_im2)-3.*(q_i-q_im1))/12.0 + call mpas_log_write('-- RK step $i --', intArgs=[rk_step]) + + MPAS_ACC_TIMER_START('atm_compute_dyn_tend_work [ACC_data_xfer]') + !$acc enter data create(rayleigh_damp_coef) + MPAS_ACC_TIMER_STOP('atm_compute_dyn_tend_work [ACC_data_xfer]') + prandtl_inv = 1.0_RKIND / prandtl invDt = 1.0_RKIND / dt inv_r_earth = 1.0_RKIND / r_earth - v_mom_eddy_visc2 = config_v_mom_eddy_visc2 - v_theta_eddy_visc2 = config_v_theta_eddy_visc2 + v_mom_eddy_visc2 = config_v_mom_eddy_visc2 + v_theta_eddy_visc2 = config_v_theta_eddy_visc2 if (rk_step == 1) then -! tend_u_euler(1:nVertLevels,edgeStart:edgeEnd) = 0.0 + !$acc parallel + !$acc loop gang worker + do iEdge = edgeStart, edgeEnd + !$acc loop vector + do k = 1, nVertLevels + tend_u_euler(k,iEdge) = 0.0_RKIND + end do + end do + !$acc end parallel ! Smagorinsky eddy viscosity, based on horizontal deformation (in this case on model coordinate surfaces). ! The integration coefficients were precomputed and stored in defc_a and defc_b if(config_horiz_mixing == "2d_smagorinsky") then + + !$acc parallel default(present) + !$acc loop gang worker private(d_diag, d_off_diag) do iCell = cellStart,cellEnd - d_diag(1:nVertLevels) = 0.0 - d_off_diag(1:nVertLevels) = 0.0 + + !$acc loop vector + do k = 1, nVertLevels + d_diag(k) = 0.0_RKIND + d_off_diag(k) = 0.0_RKIND + end do + + !$acc loop seq do iEdge=1,nEdgesOnCell(iCell) + !$acc loop vector do k=1,nVertLevels d_diag(k) = d_diag(k) + defc_a(iEdge,iCell)*u(k,EdgesOnCell(iEdge,iCell)) & - defc_b(iEdge,iCell)*v(k,EdgesOnCell(iEdge,iCell)) @@ -5008,19 +6023,32 @@ subroutine atm_compute_dyn_tend_work(nCells, nEdges, nVertices, nVertLevels_dumm end do end do !DIR$ IVDEP + !$acc loop vector do k=1, nVertLevels ! here is the Smagorinsky formulation, ! followed by imposition of an upper bound on the eddy viscosity kdiff(k,iCell) = min((c_s * config_len_disp)**2 * sqrt(d_diag(k)**2 + d_off_diag(k)**2),(0.01*config_len_disp**2) * invDt) end do end do + !$acc end parallel h_mom_eddy_visc4 = config_visc4_2dsmag * config_len_disp**3 h_theta_eddy_visc4 = h_mom_eddy_visc4 else if(config_horiz_mixing == "2d_fixed") then - kdiff(1:nVertLevels,cellStart:cellEnd) = config_h_theta_eddy_visc2 + !!! MGD UNTESTED !!! + + !$acc parallel default(present) + !$acc loop gang worker + do iCell = cellStart, cellEnd + !$acc loop vector + do k = 1, nVertLevels + kdiff(k,iCell) = config_h_theta_eddy_visc2 + end do + end do + !$acc end parallel + h_mom_eddy_visc4 = config_h_mom_eddy_visc4 h_theta_eddy_visc4 = config_h_theta_eddy_visc4 @@ -5028,17 +6056,23 @@ subroutine atm_compute_dyn_tend_work(nCells, nEdges, nVertices, nVertLevels_dumm if (config_mpas_cam_coef > 0.0) then + !!! MGD UNTESTED !!! + + !$acc parallel default(present) + !$acc loop gang worker do iCell = cellStart,cellEnd ! ! 2nd-order filter for top absorbing layer similar to that in CAM-SE : WCS 10 May 2017, modified 7 April 2023 ! From MPAS-CAM V4.0 code, with addition to config-specified coefficient (V4.0_coef = 0.2; SE_coef = 1.0) ! + !$acc loop vector do k = nVertLevels-config_number_cam_damping_levels + 1, nVertLevels visc2cam = 4.0*2.0833*config_len_disp*config_mpas_cam_coef visc2cam = visc2cam*(1.0-real(nVertLevels-k)/real(config_number_cam_damping_levels)) kdiff(k ,iCell) = max(kdiff(nVertLevels ,iCell),visc2cam) end do end do + !$acc end parallel end if @@ -5049,26 +6083,40 @@ subroutine atm_compute_dyn_tend_work(nCells, nEdges, nVertices, nVertLevels_dumm ! accumulate horizontal mass-flux + !$acc parallel default(present) + !$acc loop gang worker do iCell=cellStart,cellEnd - h_divergence(1:nVertLevels,iCell) = 0.0 + + !$acc loop vector + do k=1,nVertLevels + h_divergence(k,iCell) = 0.0_RKIND + end do + + !$acc loop seq do i=1,nEdgesOnCell(iCell) iEdge = edgesOnCell(i,iCell) edge_sign = edgesOnCell_sign(i,iCell) * dvEdge(iEdge) !DIR$ IVDEP + !$acc loop vector do k=1,nVertLevels h_divergence(k,iCell) = h_divergence(k,iCell) + edge_sign * ru(k,iEdge) end do end do end do + !$acc end parallel ! compute horiontal mass-flux divergence, add vertical mass flux divergence to complete tend_rho + !$acc parallel default(present) + !$acc loop gang worker do iCell = cellStart,cellEnd r = invAreaCell(iCell) + !$acc loop vector do k = 1,nVertLevels h_divergence(k,iCell) = h_divergence(k,iCell) * r end do - end do + end do + !$acc end parallel ! ! dp / dz and tend_rho @@ -5078,14 +6126,20 @@ subroutine atm_compute_dyn_tend_work(nCells, nEdges, nVertices, nVertLevels_dumm if(rk_step == 1) then rgas_cprcv = rgas*cp/cv + + !$acc parallel default(present) + !$acc loop gang worker do iCell = cellStart,cellEnd !DIR$ IVDEP + !$acc loop vector do k = 1,nVertLevels tend_rho(k,iCell) = -h_divergence(k,iCell)-rdzw(k)*(rw(k+1,iCell)-rw(k,iCell)) + tend_rho_physics(k,iCell) dpdz(k,iCell) = -gravity*(rb(k,iCell)*(qtot(k,iCell)) + rr_save(k,iCell)*(1.+qtot(k,iCell))) end do end do + !$acc end parallel + end if !$OMP BARRIER @@ -5094,6 +6148,8 @@ subroutine atm_compute_dyn_tend_work(nCells, nEdges, nVertices, nVertLevels_dumm ! Compute u (normal) velocity tendency for each edge (cell face) ! + !$acc parallel default(present) + !$acc loop gang worker private(wduz, q) do iEdge=edgeSolveStart,edgeSolveEnd cell1 = cellsOnEdge(1,iEdge) @@ -5103,6 +6159,7 @@ subroutine atm_compute_dyn_tend_work(nCells, nEdges, nVertices, nVertLevels_dumm if(rk_step == 1) then !DIR$ IVDEP + !$acc loop vector do k=1,nVertLevels tend_u_euler(k,iEdge) = - cqu(k,iEdge)*( (pp(k,cell2)-pp(k,cell1))*invDcEdge(iEdge)/(.5*(zz(k,cell2)+zz(k,cell1))) & -0.5*zxu(k,iEdge)*(dpdz(k,cell1)+dpdz(k,cell2)) ) @@ -5116,6 +6173,7 @@ subroutine atm_compute_dyn_tend_work(nCells, nEdges, nVertices, nVertLevels_dumm k = 2 wduz(k) = 0.5*( rw(k,cell1)+rw(k,cell2))*(fzm(k)*u(k,iEdge)+fzp(k)*u(k-1,iEdge)) + !$acc loop vector do k=3,nVertLevels-1 wduz(k) = flux3( u(k-2,iEdge),u(k-1,iEdge),u(k,iEdge),u(k+1,iEdge),0.5*(rw(k,cell1)+rw(k,cell2)), 1.0_RKIND ) end do @@ -5125,15 +6183,23 @@ subroutine atm_compute_dyn_tend_work(nCells, nEdges, nVertices, nVertLevels_dumm wduz(nVertLevels+1) = 0. !DIR$ IVDEP + !$acc loop vector do k=1,nVertLevels tend_u(k,iEdge) = - rdzw(k)*(wduz(k+1)-wduz(k)) ! first use of tend_u end do ! Next, nonlinear Coriolis term (q) following Ringler et al JCP 2009 - q(:) = 0.0 + !$acc loop vector + do k=1,nVertLevels + q(k) = 0.0_RKIND + end do + + !$acc loop seq do j = 1,nEdgesOnEdge(iEdge) eoe = edgesOnEdge(j,iEdge) + + !$acc loop vector do k=1,nVertLevels workpv = 0.5 * (pv_edge(k,iEdge) + pv_edge(k,eoe)) ! the original definition of pv_edge had a factor of 1/density. We have removed that factor @@ -5143,6 +6209,7 @@ subroutine atm_compute_dyn_tend_work(nCells, nEdges, nVertices, nVertLevels_dumm end do !DIR$ IVDEP + !$acc loop vector do k=1,nVertLevels ! horizontal ke gradient and vorticity terms in the vector invariant formulation @@ -5151,6 +6218,9 @@ subroutine atm_compute_dyn_tend_work(nCells, nEdges, nVertices, nVertLevels_dumm * invDcEdge(iEdge)) & - u(k,iEdge)*0.5*(h_divergence(k,cell1)+h_divergence(k,cell2)) #ifdef CURVATURE + + !!! MGD UNTESTED !!! + ! curvature terms for the sphere tend_u(k,iEdge) = tend_u(k,iEdge) & - 2.*omega*cos(angleEdge(iEdge))*cos(latEdge(iEdge)) & @@ -5161,7 +6231,7 @@ subroutine atm_compute_dyn_tend_work(nCells, nEdges, nVertices, nVertLevels_dumm end do end do - + !$acc end parallel ! ! horizontal mixing for u @@ -5176,8 +6246,18 @@ subroutine atm_compute_dyn_tend_work(nCells, nEdges, nVertices, nVertLevels_dumm ! del^4 horizontal filter. We compute this as del^2 ( del^2 (u) ). ! First, storage to hold the result from the first del^2 computation. - delsq_u(1:nVertLevels,edgeStart:edgeEnd) = 0.0 + !$acc parallel default(present) + !$acc loop gang worker + do iEdge = edgeStart, edgeEnd + !$acc loop vector + do k = 1, nVertLevels + delsq_u(k,iEdge) = 0.0_RKIND + end do + end do + !$acc end parallel + !$acc parallel default(present) + !$acc loop gang worker do iEdge=edgeStart,edgeEnd cell1 = cellsOnEdge(1,iEdge) cell2 = cellsOnEdge(2,iEdge) @@ -5187,6 +6267,7 @@ subroutine atm_compute_dyn_tend_work(nCells, nEdges, nVertices, nVertLevels_dumm r_dv = min(invDvEdge(iEdge), 4*invDcEdge(iEdge)) !DIR$ IVDEP + !$acc loop vector do k=1,nVertLevels ! Compute diffusion, computed as \nabla divergence - k \times \nabla vorticity @@ -5205,38 +6286,65 @@ subroutine atm_compute_dyn_tend_work(nCells, nEdges, nVertices, nVertLevels_dumm end do end do + !$acc end parallel + if (h_mom_eddy_visc4 > 0.0) then ! 4th order mixing is active !$OMP BARRIER + !!! MGD UNTESTED !!! + + !$acc parallel default(present) + + !$acc loop gang worker do iVertex=vertexStart,vertexEnd - delsq_vorticity(1:nVertLevels,iVertex) = 0.0 + + !$acc loop vector + do k=1,nVertLevels + delsq_vorticity(k,iVertex) = 0.0_RKIND + end do + + !$acc loop seq do i=1,vertexDegree iEdge = edgesOnVertex(i,iVertex) edge_sign = invAreaTriangle(iVertex) * dcEdge(iEdge) * edgesOnVertex_sign(i,iVertex) + + !$acc loop vector do k=1,nVertLevels delsq_vorticity(k,iVertex) = delsq_vorticity(k,iVertex) + edge_sign * delsq_u(k,iEdge) end do end do end do + !$acc loop gang worker do iCell=cellStart,cellEnd - delsq_divergence(1:nVertLevels,iCell) = 0.0 + + !$acc loop vector + do k=1,nVertLevels + delsq_divergence(k,iCell) = 0.0_RKIND + end do + r = invAreaCell(iCell) + + !$acc loop seq do i=1,nEdgesOnCell(iCell) iEdge = edgesOnCell(i,iCell) edge_sign = r * dvEdge(iEdge) * edgesOnCell_sign(i,iCell) + + !$acc loop vector do k=1,nVertLevels delsq_divergence(k,iCell) = delsq_divergence(k,iCell) + edge_sign * delsq_u(k,iEdge) end do end do end do - - + !$acc end parallel !$OMP BARRIER + !$acc parallel default(present) + + !$acc loop gang worker do iEdge=edgeSolveStart,edgeSolveEnd cell1 = cellsOnEdge(1,iEdge) cell2 = cellsOnEdge(2,iEdge) @@ -5248,6 +6356,7 @@ subroutine atm_compute_dyn_tend_work(nCells, nEdges, nVertices, nVertLevels_dumm r_dv = u_mix_scale * min(invDvEdge(iEdge), 4*invDcEdge(iEdge)) !DIR$ IVDEP + !$acc loop vector do k=1,nVertLevels ! Compute diffusion, computed as \nabla divergence - k \times \nabla vorticity @@ -5263,6 +6372,8 @@ subroutine atm_compute_dyn_tend_work(nCells, nEdges, nVertices, nVertLevels_dumm end do end do + + !$acc end parallel end if ! 4th order mixing is active @@ -5271,13 +6382,18 @@ subroutine atm_compute_dyn_tend_work(nCells, nEdges, nVertices, nVertLevels_dumm ! if ( v_mom_eddy_visc2 > 0.0 ) then + !!! MGD UNTESTED !!! + if (config_mix_full) then ! mix full state + !$acc parallel default(present) + !$acc loop gang worker do iEdge=edgeSolveStart,edgeSolveEnd cell1 = cellsOnEdge(1,iEdge) cell2 = cellsOnEdge(2,iEdge) + !$acc loop vector do k=2,nVertLevels-1 z1 = 0.5*(zgrid(k-1,cell1)+zgrid(k-1,cell2)) @@ -5294,19 +6410,27 @@ subroutine atm_compute_dyn_tend_work(nCells, nEdges, nVertices, nVertLevels_dumm -(u(k ,iEdge)-u(k-1,iEdge))/(z0-zm) )/(0.5*(zp-zm)) end do end do + !$acc end parallel + else ! idealized cases where we mix on the perturbation from the initial 1-D state + !!! MGD UNTESTED !!! + + !$acc parallel default(present) + !$acc loop gang worker private(u_mix) do iEdge=edgeSolveStart,edgeSolveEnd cell1 = cellsOnEdge(1,iEdge) cell2 = cellsOnEdge(2,iEdge) + !$acc loop vector do k=1,nVertLevels u_mix(k) = u(k,iEdge) - u_init(k) * cos( angleEdge(iEdge) ) & - v_init(k) * sin( angleEdge(iEdge) ) end do + !$acc loop vector do k=2,nVertLevels-1 z1 = 0.5*(zgrid(k-1,cell1)+zgrid(k-1,cell2)) @@ -5323,6 +6447,7 @@ subroutine atm_compute_dyn_tend_work(nCells, nEdges, nVertices, nVertLevels_dumm -(u_mix(k )-u_mix(k-1))/(z0-zm) )/(0.5*(zp-zm)) end do end do + !$acc end parallel end if ! mix perturbation state @@ -5336,52 +6461,82 @@ subroutine atm_compute_dyn_tend_work(nCells, nEdges, nVertices, nVertLevels_dumm ! Rayleigh damping on u if (config_rayleigh_damp_u) then + rayleigh_coef_inverse = 1.0 / ( real(config_number_rayleigh_damp_u_levels) & * (config_rayleigh_damp_u_timescale_days*seconds_per_day) ) + do k=nVertLevels-config_number_rayleigh_damp_u_levels+1,nVertLevels rayleigh_damp_coef(k) = real(k - (nVertLevels-config_number_rayleigh_damp_u_levels))*rayleigh_coef_inverse end do + !!! MGD UNTESTED !!! + + !$acc parallel default(present) + + !$acc loop gang worker do iEdge=edgeSolveStart,edgeSolveEnd !DIR$ IVDEP + !$acc loop vector do k=nVertlevels-config_number_rayleigh_damp_u_levels+1,nVertLevels tend_u(k,iEdge) = tend_u(k,iEdge) - rho_edge(k,iEdge)*u(k,iEdge)*rayleigh_damp_coef(k) end do end do + + !$acc end parallel + end if + !$acc parallel default(present) + !$acc loop gang worker do iEdge=edgeSolveStart,edgeSolveEnd !DIR$ IVDEP + !$acc loop vector do k=1,nVertLevels ! tend_u(k,iEdge) = tend_u(k,iEdge) + tend_u_euler(k,iEdge) tend_u(k,iEdge) = tend_u(k,iEdge) + tend_u_euler(k,iEdge) + tend_ru_physics(k,iEdge) end do end do + !$acc end parallel !----------- rhs for w - ! ! horizontal advection for w ! - do iCell=cellSolveStart,cellSolveEnd ! Technically updating fewer cells than before... - tend_w(1:nVertLevels+1,iCell) = 0.0 + !$acc parallel default(present) + !$acc loop gang worker private(ru_edge_w, flux_arr) + do iCell=cellSolveStart,cellSolveEnd ! Technically updating fewer cells than before... + + !$acc loop vector + do k=1,nVertLevels+1 + tend_w(k,iCell) = 0.0_RKIND + end do + + !$acc loop seq do i=1,nEdgesOnCell(iCell) + iEdge = edgesOnCell(i,iCell) edge_sign = edgesOnCell_sign(i,iCell) * dvEdge(iEdge) * 0.5 + !$acc loop vector do k=2,nVertLevels ru_edge_w(k) = fzm(k)*ru(k,iEdge) + fzp(k)*ru(k-1,iEdge) end do - flux_arr(1:nVertLevels) = 0.0 + !$acc loop vector + do k=1,nVertLevels + flux_arr(k) = 0.0_RKIND + end do ! flux_arr stores the value of w at the cell edge used in the horizontal transport + !$acc loop seq do j=1,nAdvCellsForEdge(iEdge) iAdvCell = advCellsForEdge(j,iEdge) + + !$acc loop vector do k=2,nVertLevels scalar_weight = adv_coefs(j,iEdge) + sign(1.0_RKIND,ru_edge_w(k)) * adv_coefs_3rd(j,iEdge) flux_arr(k) = flux_arr(k) + scalar_weight * w(k,iAdvCell) @@ -5389,16 +6544,24 @@ subroutine atm_compute_dyn_tend_work(nCells, nEdges, nVertices, nVertLevels_dumm end do !DIR$ IVDEP + !$acc loop vector do k=2,nVertLevels tend_w(k,iCell) = tend_w(k,iCell) - edgesOnCell_sign(i,iCell) * ru_edge_w(k)*flux_arr(k) end do end do end do + !$acc end parallel #ifdef CURVATURE + + !!! MGD UNTESTED !!! + + !$acc parallel default(present) + !$acc loop gang worker do iCell = cellSolveStart, cellSolveEnd !DIR$ IVDEP + !$acc loop vector do k=2,nVertLevels tend_w(k,iCell) = tend_w(k,iCell) + (rho_zz(k,iCell)*fzm(k)+rho_zz(k-1,iCell)*fzp(k))* & ( (fzm(k)*ur_cell(k,iCell)+fzp(k)*ur_cell(k-1,iCell))**2. & @@ -5409,8 +6572,9 @@ subroutine atm_compute_dyn_tend_work(nCells, nEdges, nVertices, nVertLevels_dumm end do end do -#endif + !$acc end parallel +#endif ! ! horizontal mixing for w - we could combine this with advection directly (i.e. as a turbulent flux), @@ -5426,12 +6590,23 @@ subroutine atm_compute_dyn_tend_work(nCells, nEdges, nVertices, nVertLevels_dumm ! First, storage to hold the result from the first del^2 computation. ! we copied code from the theta mixing, hence the theta* names. + !$acc parallel default(present) + !$acc loop gang worker + do iCell=cellStart,cellEnd + + !$acc loop vector + do k=1,nVertLevels + delsq_w(k,iCell) = 0.0_RKIND + end do - delsq_w(1:nVertLevels,cellStart:cellEnd) = 0.0 + !$acc loop vector + do k=1,nVertLevels+1 + tend_w_euler(k,iCell) = 0.0_RKIND + end do - do iCell=cellStart,cellEnd - tend_w_euler(1:nVertLevels+1,iCell) = 0.0 r_areaCell = invAreaCell(iCell) + + !$acc loop seq do i=1,nEdgesOnCell(iCell) iEdge = edgesOnCell(i,iCell) @@ -5441,6 +6616,7 @@ subroutine atm_compute_dyn_tend_work(nCells, nEdges, nVertices, nVertLevels_dumm cell2 = cellsOnEdge(2,iEdge) !DIR$ IVDEP + !$acc loop vector do k=2,nVertLevels w_turb_flux = edge_sign*(rho_edge(k,iEdge)+rho_edge(k-1,iEdge))*(w(k,cell2) - w(k,cell1)) @@ -5451,13 +6627,21 @@ subroutine atm_compute_dyn_tend_work(nCells, nEdges, nVertices, nVertLevels_dumm end do end do end do + !$acc end parallel !$OMP BARRIER if (h_mom_eddy_visc4 > 0.0) then ! 4th order mixing is active + !!! MGD UNTESTED !!! + + !$acc parallel default(present) + !$acc loop gang worker do iCell=cellSolveStart,cellSolveEnd ! Technically updating fewer cells than before... + r_areaCell = h_mom_eddy_visc4 * invAreaCell(iCell) + + !$acc loop seq do i=1,nEdgesOnCell(iCell) iEdge = edgesOnCell(i,iCell) cell1 = cellsOnEdge(1,iEdge) @@ -5465,12 +6649,14 @@ subroutine atm_compute_dyn_tend_work(nCells, nEdges, nVertices, nVertLevels_dumm edge_sign = meshScalingDel4(iEdge)*r_areaCell*dvEdge(iEdge)*edgesOnCell_sign(i,iCell) * invDcEdge(iEdge) + !$acc loop vector do k=2,nVertLevels tend_w_euler(k,iCell) = tend_w_euler(k,iCell) - edge_sign * (delsq_w(k,cell2) - delsq_w(k,cell1)) end do end do end do + !$acc end parallel end if ! 4th order mixing is active @@ -5487,15 +6673,20 @@ subroutine atm_compute_dyn_tend_work(nCells, nEdges, nVertices, nVertLevels_dumm ! vertical advection, pressure gradient and buoyancy for w ! + !$acc parallel default(present) + !$acc loop gang worker private(wdwz) do iCell=cellSolveStart,cellSolveEnd wdwz(1) = 0.0 k = 2 wdwz(k) = 0.25*(rw(k,icell)+rw(k-1,iCell))*(w(k,iCell)+w(k-1,iCell)) + + !$acc loop vector do k=3,nVertLevels-1 wdwz(k) = flux3( w(k-2,iCell),w(k-1,iCell),w(k,iCell),w(k+1,iCell),0.5*(rw(k,iCell)+rw(k-1,iCell)), 1.0_RKIND ) end do + k = nVertLevels wdwz(k) = 0.25*(rw(k,icell)+rw(k-1,iCell))*(w(k,iCell)+w(k-1,iCell)) @@ -5504,12 +6695,14 @@ subroutine atm_compute_dyn_tend_work(nCells, nEdges, nVertices, nVertLevels_dumm ! Note: next we are also dividing through by the cell area after the horizontal flux divergence !DIR$ IVDEP + !$acc loop vector do k=2,nVertLevels tend_w(k,iCell) = tend_w(k,iCell) * invAreaCell(iCell) -rdzu(k)*(wdwz(k+1)-wdwz(k)) end do if(rk_step == 1) then !DIR$ IVDEP + !$acc loop vector do k=2,nVertLevels tend_w_euler(k,iCell) = tend_w_euler(k,iCell) - cqw(k,iCell)*( & rdzu(k)*(pp(k,iCell)-pp(k-1,iCell)) & @@ -5518,19 +6711,26 @@ subroutine atm_compute_dyn_tend_work(nCells, nEdges, nVertices, nVertLevels_dumm end if end do + !$acc end parallel if (rk_step == 1) then if ( v_mom_eddy_visc2 > 0.0 ) then + !!! MGD UNTESTED !!! + + !$acc parallel default(present) + !$acc loop gang worker do iCell=cellSolveStart,cellSolveEnd !DIR$ IVDEP + !$acc loop vector do k=2,nVertLevels tend_w_euler(k,iCell) = tend_w_euler(k,iCell) + v_mom_eddy_visc2*0.5*(rho_zz(k,iCell)+rho_zz(k-1,iCell))*( & (w(k+1,iCell)-w(k ,iCell))*rdzw(k) & -(w(k ,iCell)-w(k-1,iCell))*rdzw(k-1) )*rdzu(k) end do end do + !$acc end parallel end if @@ -5538,12 +6738,16 @@ subroutine atm_compute_dyn_tend_work(nCells, nEdges, nVertices, nVertLevels_dumm ! add in mixing terms for w + !$acc parallel default(present) + !$acc loop gang worker do iCell = cellSolveStart,cellSolveEnd !DIR$ IVDEP + !$acc loop vector do k=2,nVertLevels tend_w(k,iCell) = tend_w(k,iCell) + tend_w_euler(k,iCell) end do end do + !$acc end parallel !----------- rhs for theta @@ -5551,15 +6755,29 @@ subroutine atm_compute_dyn_tend_work(nCells, nEdges, nVertices, nVertLevels_dumm ! horizontal advection for theta ! + !$acc parallel default(present) + !$acc loop gang worker private(flux_arr) do iCell=cellSolveStart,cellSolveEnd ! Technically updating fewer cells than before... - tend_theta(1:nVertLevels,iCell) = 0.0 + + !$acc loop vector + do k=1,nVertLevels + tend_theta(k,iCell) = 0.0_RKIND + end do + + !$acc loop seq do i=1,nEdgesOnCell(iCell) iEdge = edgesOnCell(i,iCell) - flux_arr(1:nVertLevels) = 0.0 + !$acc loop vector + do k=1,nVertLevels + flux_arr(k) = 0.0_RKIND + end do + !$acc loop seq do j=1,nAdvCellsForEdge(iEdge) iAdvCell = advCellsForEdge(j,iEdge) + + !$acc loop vector do k=1,nVertLevels scalar_weight = adv_coefs(j,iEdge) + sign(1.0_RKIND,ru(k,iEdge))*adv_coefs_3rd(j,iEdge) flux_arr(k) = flux_arr(k) + scalar_weight* theta_m(k,iAdvCell) @@ -5567,28 +6785,38 @@ subroutine atm_compute_dyn_tend_work(nCells, nEdges, nVertices, nVertLevels_dumm end do !DIR$ IVDEP + !$acc loop vector do k=1,nVertLevels tend_theta(k,iCell) = tend_theta(k,iCell) - edgesOnCell_sign(i,iCell) * ru(k,iEdge) * flux_arr(k) end do end do end do + !$acc end parallel ! addition to pick up perturbation flux for rtheta_pp equation if(rk_step > 1) then + + !$acc parallel default(present) + !$acc loop gang worker do iCell=cellSolveStart,cellSolveEnd + + !$acc loop seq do i=1,nEdgesOnCell(iCell) iEdge = edgesOnCell(i,iCell) cell1 = cellsOnEdge(1,iEdge) cell2 = cellsOnEdge(2,iEdge) !DIR$ IVDEP + !$acc loop vector do k=1,nVertLevels flux = edgesOnCell_sign(i,iCell)*dvEdge(iEdge)*(ru_save(k,iEdge)-ru(k,iEdge))*0.5*(theta_m_save(k,cell2)+theta_m_save(k,cell1)) tend_theta(k,iCell) = tend_theta(k,iCell)-flux ! division by areaCell picked up down below end do end do end do + !$acc end parallel + end if ! @@ -5598,11 +6826,23 @@ subroutine atm_compute_dyn_tend_work(nCells, nEdges, nVertices, nVertLevels_dumm if (rk_step == 1) then - delsq_theta(1:nVertLevels,cellStart:cellEnd) = 0.0 - + !$acc parallel default(present) + !$acc loop gang worker do iCell=cellStart,cellEnd - tend_theta_euler(1:nVertLevels,iCell) = 0.0 + + !$acc loop vector + do k=1,nVertLevels + delsq_theta(k,iCell) = 0.0_RKIND + end do + + !$acc loop vector + do k=1,nVertLevels + tend_theta_euler(k,iCell) = 0.0_RKIND + end do + r_areaCell = invAreaCell(iCell) + + !$acc loop seq do i=1,nEdgesOnCell(iCell) iEdge = edgesOnCell(i,iCell) edge_sign = r_areaCell*edgesOnCell_sign(i,iCell) * dvEdge(iEdge) * invDcEdge(iEdge) @@ -5610,6 +6850,7 @@ subroutine atm_compute_dyn_tend_work(nCells, nEdges, nVertices, nVertLevels_dumm cell1 = cellsOnEdge(1,iEdge) cell2 = cellsOnEdge(2,iEdge) !DIR$ IVDEP + !$acc loop vector do k=1,nVertLevels ! we are computing the Smagorinsky filter at more points than needed here so as to pick up the delsq_theta for 4th order filter below @@ -5622,13 +6863,21 @@ subroutine atm_compute_dyn_tend_work(nCells, nEdges, nVertices, nVertLevels_dumm end do end do end do + !$acc end parallel !$OMP BARRIER if (h_theta_eddy_visc4 > 0.0) then ! 4th order mixing is active + !!! MGD UNTESTED !!! + + !$acc parallel default(present) + !$acc loop gang worker do iCell=cellSolveStart,cellSolveEnd ! Technically updating fewer cells than before... + r_areaCell = h_theta_eddy_visc4 * prandtl_inv * invAreaCell(iCell) + + !$acc loop seq do i=1,nEdgesOnCell(iCell) iEdge = edgesOnCell(i,iCell) @@ -5637,11 +6886,13 @@ subroutine atm_compute_dyn_tend_work(nCells, nEdges, nVertices, nVertLevels_dumm cell1 = cellsOnEdge(1,iEdge) cell2 = cellsOnEdge(2,iEdge) + !$acc loop vector do k=1,nVertLevels tend_theta_euler(k,iCell) = tend_theta_euler(k,iCell) - edge_sign*(delsq_theta(k,cell2) - delsq_theta(k,cell1)) end do end do end do + !$acc end parallel end if ! 4th order mixing is active @@ -5651,6 +6902,10 @@ subroutine atm_compute_dyn_tend_work(nCells, nEdges, nVertices, nVertLevels_dumm ! vertical advection plus diabatic term ! Note: we are also dividing through by the cell area after the horizontal flux divergence ! + + + !$acc parallel default(present) + !$acc loop gang worker private(wdtz) do iCell = cellSolveStart,cellSolveEnd wdtz(1) = 0.0 @@ -5658,22 +6913,27 @@ subroutine atm_compute_dyn_tend_work(nCells, nEdges, nVertices, nVertLevels_dumm k = 2 wdtz(k) = rw(k,icell)*(fzm(k)*theta_m(k,iCell)+fzp(k)*theta_m(k-1,iCell)) wdtz(k) = wdtz(k)+(rw_save(k,icell)-rw(k,icell))*(fzm(k)*theta_m_save(k,iCell)+fzp(k)*theta_m_save(k-1,iCell)) + + !$acc loop vector do k=3,nVertLevels-1 wdtz(k) = flux3( theta_m(k-2,iCell),theta_m(k-1,iCell),theta_m(k,iCell),theta_m(k+1,iCell), rw(k,iCell), coef_3rd_order ) wdtz(k) = wdtz(k) + (rw_save(k,icell)-rw(k,iCell))*(fzm(k)*theta_m_save(k,iCell)+fzp(k)*theta_m_save(k-1,iCell)) ! rtheta_pp redefinition end do + k = nVertLevels wdtz(k) = rw_save(k,icell)*(fzm(k)*theta_m(k,iCell)+fzp(k)*theta_m(k-1,iCell)) ! rtheta_pp redefinition wdtz(nVertLevels+1) = 0.0 !DIR$ IVDEP + !$acc loop vector do k=1,nVertLevels tend_theta(k,iCell) = tend_theta(k,iCell)*invAreaCell(iCell) -rdzw(k)*(wdtz(k+1)-wdtz(k)) rthdynten(k,iCell) = (tend_theta(k,iCell)-tend_rho(k,iCell)*theta_m(k,iCell))/rho_zz(k,iCell) tend_theta(k,iCell) = tend_theta(k,iCell) + rho_zz(k,iCell)*rt_diabatic_tend(k,iCell) end do end do + !$acc end parallel ! ! vertical mixing for theta - 2nd order @@ -5685,7 +6945,13 @@ subroutine atm_compute_dyn_tend_work(nCells, nEdges, nVertices, nVertLevels_dumm if (config_mix_full) then + !!! MGD UNTESTED !!! + + !$acc parallel default(present) + !$acc loop gang worker do iCell = cellSolveStart,cellSolveEnd + + !$acc loop vector do k=2,nVertLevels-1 z1 = zgrid(k-1,iCell) z2 = zgrid(k ,iCell) @@ -5701,9 +6967,12 @@ subroutine atm_compute_dyn_tend_work(nCells, nEdges, nVertices, nVertLevels_dumm -(theta_m(k ,iCell)-theta_m(k-1,iCell))/(z0-zm) )/(0.5*(zp-zm)) end do end do + !$acc end parallel - else ! idealized cases where we mix on the perturbation from the initial 1-D state + else ! idealized cases where we mix on the perturbation from the initial 1-D state + !$acc parallel default(present) + !$acc loop gang worker do iCell = cellSolveStart,cellSolveEnd do k=2,nVertLevels-1 z1 = zgrid(k-1,iCell) @@ -5720,6 +6989,7 @@ subroutine atm_compute_dyn_tend_work(nCells, nEdges, nVertices, nVertLevels_dumm -((theta_m(k ,iCell)-t_init(k,iCell))-(theta_m(k-1,iCell)-t_init(k-1,iCell)))/(z0-zm) )/(0.5*(zp-zm)) end do end do + !$acc end parallel end if @@ -5727,13 +6997,21 @@ subroutine atm_compute_dyn_tend_work(nCells, nEdges, nVertices, nVertLevels_dumm end if ! compute vertical theta mixing on first rk_step + !$acc parallel default(present) + !$acc loop gang worker do iCell = cellSolveStart,cellSolveEnd !DIR$ IVDEP + !$acc loop vector do k=1,nVertLevels ! tend_theta(k,iCell) = tend_theta(k,iCell) + tend_theta_euler(k,iCell) tend_theta(k,iCell) = tend_theta(k,iCell) + tend_theta_euler(k,iCell) + tend_rtheta_physics(k,iCell) end do end do + !$acc end parallel + + MPAS_ACC_TIMER_START('atm_compute_dyn_tend_work [ACC_data_xfer]') + !$acc exit data delete(rayleigh_damp_coef) + MPAS_ACC_TIMER_STOP('atm_compute_dyn_tend_work [ACC_data_xfer]') end subroutine atm_compute_dyn_tend_work @@ -5901,26 +7179,10 @@ subroutine atm_compute_solve_diagnostics_work(nCells, nEdges, nVertices, & logical :: reconstruct_v - MPAS_ACC_TIMER_START('atm_compute_solve_diagnostics [ACC_data_xfer]') - !$acc enter data copyin(cellsOnEdge,dcEdge,dvEdge, & - !$acc edgesOnVertex,edgesOnVertex_sign,invAreaTriangle, & - !$acc nEdgesOnCell,edgesOnCell, & - !$acc edgesOnCell_sign,invAreaCell, & - !$acc invAreaTriangle,edgesOnVertex, & - !$acc verticesOnCell,kiteForCell,kiteAreasOnVertex, & - !$acc nEdgesOnEdge,edgesOnEdge,weightsOnEdge, & - !$acc fVertex, & - !$acc verticesOnEdge, & - !$acc invDvEdge,invDcEdge) - !$acc enter data copyin(u,h) - MPAS_ACC_TIMER_STOP('atm_compute_solve_diagnostics [ACC_data_xfer]') ! ! Compute height on cell edges at velocity locations ! - MPAS_ACC_TIMER_START('atm_compute_solve_diagnostics [ACC_data_xfer]') - !$acc enter data create(h_edge,ke_edge,vorticity,divergence) - MPAS_ACC_TIMER_STOP('atm_compute_solve_diagnostics [ACC_data_xfer]') !$acc parallel default(present) !$acc loop gang do iEdge=edgeStart,edgeEnd @@ -6005,9 +7267,6 @@ subroutine atm_compute_solve_diagnostics_work(nCells, nEdges, nVertices, & ! ! Replace 2.0 with 2 in exponentiation to avoid outside chance that ! compiler will actually allow "float raised to float" operation - MPAS_ACC_TIMER_START('atm_compute_solve_diagnostics [ACC_data_xfer]') - !$acc enter data create(ke) - MPAS_ACC_TIMER_STOP('atm_compute_solve_diagnostics [ACC_data_xfer]') !$acc parallel default(present) !$acc loop gang do iCell=cellStart,cellEnd @@ -6108,14 +7367,6 @@ subroutine atm_compute_solve_diagnostics_work(nCells, nEdges, nVertices, & if(rk_step /= 3) reconstruct_v = .false. end if - MPAS_ACC_TIMER_START('atm_compute_solve_diagnostics [ACC_data_xfer]') - if (reconstruct_v) then - !$acc enter data create(v) - else - !$acc enter data copyin(v) - end if - MPAS_ACC_TIMER_STOP('atm_compute_solve_diagnostics [ACC_data_xfer]') - if (reconstruct_v) then !$acc parallel default(present) !$acc loop gang @@ -6142,10 +7393,7 @@ subroutine atm_compute_solve_diagnostics_work(nCells, nEdges, nVertices, & ! ( this computes pv_vertex at all vertices bounding real cells ) ! ! Avoid dividing h_vertex by areaTriangle and move areaTriangle into - ! numerator for the pv_vertex calculation - MPAS_ACC_TIMER_START('atm_compute_solve_diagnostics [ACC_data_xfer]') - !$acc enter data create(pv_vertex) - MPAS_ACC_TIMER_STOP('atm_compute_solve_diagnostics [ACC_data_xfer]') + ! numerator for the pv_vertex calculation !$acc parallel default(present) !$acc loop collapse(2) do iVertex = vertexStart,vertexEnd @@ -6169,9 +7417,6 @@ subroutine atm_compute_solve_diagnostics_work(nCells, nEdges, nVertices, & ! Compute pv at the edges ! ( this computes pv_edge at all edges bounding real cells ) ! - MPAS_ACC_TIMER_START('atm_compute_solve_diagnostics [ACC_data_xfer]') - !$acc enter data create(pv_edge) - MPAS_ACC_TIMER_STOP('atm_compute_solve_diagnostics [ACC_data_xfer]') !$acc parallel default(present) !$acc loop collapse(2) do iEdge = edgeStart,edgeEnd @@ -6188,10 +7433,7 @@ subroutine atm_compute_solve_diagnostics_work(nCells, nEdges, nVertices, & ! Compute pv at cell centers ! ( this computes pv_cell for all real cells ) ! only needed for APVM upwinding - ! - MPAS_ACC_TIMER_START('atm_compute_solve_diagnostics [ACC_data_xfer]') - !$acc enter data create(pv_cell) - MPAS_ACC_TIMER_STOP('atm_compute_solve_diagnostics [ACC_data_xfer]') + ! !$acc parallel default(present) !$acc loop gang do iCell=cellStart,cellEnd @@ -6230,9 +7472,6 @@ subroutine atm_compute_solve_diagnostics_work(nCells, nEdges, nVertices, & ! Merged loops for calculating gradPVt, gradPVn and pv_edge ! Also precomputed inverses of dvEdge and dcEdge to avoid repeated divisions ! - MPAS_ACC_TIMER_START('atm_compute_solve_diagnostics [ACC_data_xfer]') - !$acc enter data create(gradPVt,gradPVn) - MPAS_ACC_TIMER_STOP('atm_compute_solve_diagnostics [ACC_data_xfer]') r = config_apvm_upwinding * dt !$acc parallel default(present) !$acc loop gang @@ -6249,30 +7488,10 @@ subroutine atm_compute_solve_diagnostics_work(nCells, nEdges, nVertices, & end do !$acc end parallel - MPAS_ACC_TIMER_START('atm_compute_solve_diagnostics [ACC_data_xfer]') - !$acc exit data delete(pv_cell,gradPVt,gradPVn) - MPAS_ACC_TIMER_STOP('atm_compute_solve_diagnostics [ACC_data_xfer]') end if ! apvm upwinding - MPAS_ACC_TIMER_START('atm_compute_solve_diagnostics [ACC_data_xfer]') - !$acc exit data delete(cellsOnEdge,dcEdge,dvEdge, & - !$acc edgesOnVertex,edgesOnVertex_sign,invAreaTriangle, & - !$acc nEdgesOnCell,edgesOnCell, & - !$acc edgesOnCell_sign,invAreaCell, & - !$acc invAreaTriangle,edgesOnVertex, & - !$acc verticesOnCell,kiteForCell,kiteAreasOnVertex, & - !$acc nEdgesOnEdge,edgesOnEdge,weightsOnEdge, & - !$acc verticesOnEdge, & - !$acc fVertex,invDvEdge,invDcEdge) - !$acc exit data delete(u,h) - !$acc exit data copyout(h_edge,ke_edge,vorticity,divergence, & - !$acc ke, & - !$acc v, & - !$acc pv_vertex, & - !$acc pv_edge) - MPAS_ACC_TIMER_STOP('atm_compute_solve_diagnostics [ACC_data_xfer]') end subroutine atm_compute_solve_diagnostics_work @@ -6362,17 +7581,13 @@ subroutine atm_init_coupled_diagnostics(state, time_lev, diag, mesh, configs, & call mpas_pool_get_array(mesh, 'zb3_cell', zb3_cell) MPAS_ACC_TIMER_START('atm_init_coupled_diagnostics [ACC_data_xfer]') - ! copyin invariant fields - !$acc enter data copyin(cellsOnEdge,nEdgesOnCell,edgesOnCell, & - !$acc edgesOnCell_sign,zz,fzm,fzp,zb,zb3, & - !$acc zb_cell,zb3_cell) - + ! copyin the data that is only on the right-hand side - !$acc enter data copyin(scalars(index_qv,:,:),u,w,rho,theta, & + !$acc enter data copyin(scalars(index_qv,:,:),w,rho,theta, & !$acc rho_base,theta_base) ! copyin the data that will be modified in this routine - !$acc enter data create(theta_m,rho_zz,ru,rw,rho_p,rtheta_base, & + !$acc enter data create(theta_m,ru,rw,rho_p,rtheta_base, & !$acc rtheta_p,exner,exner_base,pressure_p, & !$acc pressure_base) MPAS_ACC_TIMER_STOP('atm_init_coupled_diagnostics [ACC_data_xfer]') @@ -6496,17 +7711,12 @@ subroutine atm_init_coupled_diagnostics(state, time_lev, diag, mesh, configs, & !$acc end parallel MPAS_ACC_TIMER_START('atm_init_coupled_diagnostics [ACC_data_xfer]') - ! delete invariant fields - !$acc exit data delete(cellsOnEdge,nEdgesOnCell,edgesOnCell, & - !$acc edgesOnCell_sign,zz,fzm,fzp,zb,zb3, & - !$acc zb_cell,zb3_cell) - ! delete the data that is only on the right-hand side - !$acc exit data delete(scalars(index_qv,:,:),u,w,rho,theta, & + !$acc exit data delete(scalars(index_qv,:,:),w,rho,theta, & !$acc rho_base,theta_base) ! copyout the data that will be modified in this routine - !$acc exit data copyout(theta_m,rho_zz,ru,rw,rho_p,rtheta_base, & + !$acc exit data copyout(theta_m,ru,rw,rho_p,rtheta_base, & !$acc rtheta_p,exner,exner_base,pressure_p, & !$acc pressure_base) MPAS_ACC_TIMER_STOP('atm_init_coupled_diagnostics [ACC_data_xfer]') @@ -6573,15 +7783,7 @@ subroutine atm_rk_dynamics_substep_finish( state, diag, nVertLevels, dynamics_su call mpas_pool_get_array(state, 'rho_zz', rho_zz_1, 1) call mpas_pool_get_array(state, 'rho_zz', rho_zz_2, 2) - MPAS_ACC_TIMER_START('atm_rk_dynamics_substep_finish [ACC_data_xfer]') - !$acc enter data create(ru_save, u_1, rtheta_p_save, theta_m_1, rho_p_save, rw_save, & - !$acc w_1, rho_zz_1) & - !$acc copyin(ru, u_2, rtheta_p, rho_p, theta_m_2, rho_zz_2, rw, & - !$acc w_2, ruAvg, wwAvg, ruAvg_split, wwAvg_split, rho_zz_old_split) - MPAS_ACC_TIMER_STOP('atm_rk_dynamics_substep_finish [ACC_data_xfer]') - ! Interim fix for the atm_compute_dyn_tend_work subroutine accessing uninitialized values - ! in garbage cells of theta_m !$acc kernels theta_m_1(:,cellEnd+1) = 0.0_RKIND !$acc end kernels @@ -6684,13 +7886,6 @@ subroutine atm_rk_dynamics_substep_finish( state, diag, nVertLevels, dynamics_su !$acc end parallel end if - MPAS_ACC_TIMER_START('atm_rk_dynamics_substep_finish [ACC_data_xfer]') - !$acc exit data copyout(ru_save, u_1, rtheta_p_save, rho_p_save, rw_save, & - !$acc w_1, theta_m_1, rho_zz_1, ruAvg, wwAvg, ruAvg_split, & - !$acc wwAvg_split) & - !$acc delete(ru, u_2, rtheta_p, rho_p, theta_m_2, rho_zz_2, rw, & - !$acc w_2, rho_zz_old_split) - MPAS_ACC_TIMER_STOP('atm_rk_dynamics_substep_finish [ACC_data_xfer]') end subroutine atm_rk_dynamics_substep_finish @@ -6745,9 +7940,6 @@ subroutine atm_zero_gradient_w_bdy_work( w, bdyMaskCell, nearestRelaxationCell, integer :: iCell, k - MPAS_ACC_TIMER_START('atm_zero_gradient_w_bdy_work [ACC_data_xfer]') - !$acc enter data copyin(w) - MPAS_ACC_TIMER_STOP('atm_zero_gradient_w_bdy_work [ACC_data_xfer]') !$acc parallel default(present) !$acc loop gang worker @@ -6762,10 +7954,7 @@ subroutine atm_zero_gradient_w_bdy_work( w, bdyMaskCell, nearestRelaxationCell, end do !$acc end parallel - MPAS_ACC_TIMER_START('atm_zero_gradient_w_bdy_work [ACC_data_xfer]') - !$acc exit data copyout(w) - MPAS_ACC_TIMER_STOP('atm_zero_gradient_w_bdy_work [ACC_data_xfer]') - + end subroutine atm_zero_gradient_w_bdy_work @@ -6805,13 +7994,6 @@ subroutine atm_bdy_adjust_dynamics_speczone_tend( tend, mesh, config, nVertLevel call mpas_pool_get_array(mesh, 'bdyMaskEdge', bdyMaskEdge) call mpas_pool_get_array(tend, 'rt_diabatic_tend', rt_diabatic_tend) - MPAS_ACC_TIMER_START('atm_bdy_adjust_dynamics_speczone_tend [ACC_data_xfer]') - !$acc enter data copyin(tend_ru,tend_rho,tend_rt,tend_rw, & - !$acc rt_diabatic_tend) & - !$acc copyin(rho_driving_tend,rt_driving_tend, & - !$acc ru_driving_tend) - MPAS_ACC_TIMER_STOP('atm_bdy_adjust_dynamics_speczone_tend [ACC_data_xfer]') - !$acc parallel default(present) !$acc loop gang worker do iCell = cellSolveStart, cellSolveEnd @@ -6838,13 +8020,6 @@ subroutine atm_bdy_adjust_dynamics_speczone_tend( tend, mesh, config, nVertLevel end if end do !$acc end parallel - - MPAS_ACC_TIMER_START('atm_bdy_adjust_dynamics_speczone_tend [ACC_data_xfer]') - !$acc exit data copyout(tend_ru,tend_rho,tend_rt, & - !$acc tend_rw,rt_diabatic_tend) & - !$acc delete(rho_driving_tend,rt_driving_tend, & - !$acc ru_driving_tend) - MPAS_ACC_TIMER_STOP('atm_bdy_adjust_dynamics_speczone_tend [ACC_data_xfer]') end subroutine atm_bdy_adjust_dynamics_speczone_tend @@ -6881,11 +8056,13 @@ subroutine atm_bdy_adjust_dynamics_relaxzone_tend( config, tend, state, diag, me real (kind=RKIND), dimension(:,:), pointer :: edgesOnCell_sign, edgesOnVertex_sign integer, dimension(:), pointer :: bdyMaskCell, bdyMaskEdge, nEdgesOnCell integer, dimension(:,:), pointer :: cellsOnEdge, verticesOnEdge, edgesOnCell, edgesOnVertex - integer, pointer :: vertexDegree + integer, pointer :: vertexDegree_ptr + integer :: vertexDegree real (kind=RKIND) :: edge_sign, laplacian_filter_coef, rayleigh_damping_coef, r_dc, r_dv, invArea - real (kind=RKIND), pointer :: divdamp_coef + real (kind=RKIND), pointer :: divdamp_coef_ptr + real (kind=RKIND) :: divdamp_coef real (kind=RKIND), dimension(nVertLevels) :: divergence1, divergence2, vorticity1, vorticity2 integer :: iCell, iEdge, i, k, cell1, cell2, iEdge_vort, iEdge_div integer :: vertex1, vertex2, iVertex @@ -6906,7 +8083,7 @@ subroutine atm_bdy_adjust_dynamics_relaxzone_tend( config, tend, state, diag, me call mpas_pool_get_array(state, 'theta_m', theta_m, 2) call mpas_pool_get_array(state, 'rho_zz', rho_zz, 2) - call mpas_pool_get_dimension(mesh, 'vertexDegree', vertexDegree) + call mpas_pool_get_dimension(mesh, 'vertexDegree', vertexDegree_ptr) call mpas_pool_get_array(mesh, 'dcEdge', dcEdge) call mpas_pool_get_array(mesh, 'dvEdge', dvEdge) call mpas_pool_get_array(mesh, 'invDcEdge', invDcEdge) @@ -6921,37 +8098,55 @@ subroutine atm_bdy_adjust_dynamics_relaxzone_tend( config, tend, state, diag, me call mpas_pool_get_array(mesh, 'nEdgesOnCell',nEdgesOnCell) call mpas_pool_get_array(mesh, 'verticesOnEdge', verticesOnEdge) - call mpas_pool_get_config(config, 'config_relax_zone_divdamp_coef', divdamp_coef) + call mpas_pool_get_config(config, 'config_relax_zone_divdamp_coef', divdamp_coef_ptr) + + ! De-referencing scalar integer pointers so that acc parallel regions correctly + ! copy these scalar integers onto the device + divdamp_coef = divdamp_coef_ptr + vertexDegree = vertexDegree_ptr + + MPAS_ACC_TIMER_START('atm_bdy_adjust_dynamics_relaxzone_tend [ACC_data_xfer]') + !$acc enter data create(divergence1, divergence2, vorticity1, vorticity2) + MPAS_ACC_TIMER_STOP('atm_bdy_adjust_dynamics_relaxzone_tend [ACC_data_xfer]') ! First, Rayleigh damping terms for ru, rtheta_m and rho_zz - + !$acc parallel default(present) + !$acc loop gang worker do iCell = cellSolveStart, cellSolveEnd if( (bdyMaskCell(iCell) > 1) .and. (bdyMaskCell(iCell) <= nRelaxZone) ) then rayleigh_damping_coef = (real(bdyMaskCell(iCell)) - 1.)/real(nRelaxZone)/(50.*dt*meshScalingRegionalCell(iCell)) + !$acc loop vector do k=1, nVertLevels tend_rho(k,iCell) = tend_rho(k,iCell) - rayleigh_damping_coef * (rho_zz(k,iCell) - rho_driving_values(k,iCell)) tend_rt(k,iCell) = tend_rt(k,iCell) - rayleigh_damping_coef * (rho_zz(k,iCell)*theta_m(k,iCell) - rt_driving_values(k,iCell)) end do end if end do + !$acc end parallel + !$acc parallel default(present) + !$acc loop gang worker do iEdge = edgeStart, edgeEnd if( (bdyMaskEdge(iEdge) > 1) .and. (bdyMaskEdge(iEdge) <= nRelaxZone) ) then rayleigh_damping_coef = (real(bdyMaskEdge(iEdge)) - 1.)/real(nRelaxZone)/(50.*dt*meshScalingRegionalEdge(iEdge)) + !$acc loop vector do k=1, nVertLevels tend_ru(k,iEdge) = tend_ru(k,iEdge) - rayleigh_damping_coef * (ru(k,iEdge) - ru_driving_values(k,iEdge)) end do end if end do - - ! Second, the horizontal filter for rtheta_m and rho_zz + !$acc end parallel + ! Second, the horizontal filter for rtheta_m and rho_zz + !$acc parallel default(present) + !$acc loop gang worker do iCell = cellSolveStart, cellSolveEnd ! threaded over cells if ( (bdyMaskCell(iCell) > 1) .and. (bdyMaskCell(iCell) <= nRelaxZone) ) then ! relaxation zone laplacian_filter_coef = (real(bdyMaskCell(iCell)) - 1.)/real(nRelaxZone)/(10.*dt*meshScalingRegionalCell(iCell)) ! + !$acc loop seq do i=1,nEdgesOnCell(iCell) iEdge = edgesOnCell(i,iCell) ! edge_sign = r_areaCell*edgesOnCell_sign(i,iCell) * dvEdge(iEdge) * invDcEdge(iEdge) * laplacian_filter_coef @@ -6960,6 +8155,7 @@ subroutine atm_bdy_adjust_dynamics_relaxzone_tend( config, tend, state, diag, me cell1 = cellsOnEdge(1,iEdge) cell2 = cellsOnEdge(2,iEdge) !DIR$ IVDEP + !$acc loop vector do k=1,nVertLevels tend_rt(k,iCell) = tend_rt(k,iCell) + edge_sign*( (rho_zz(k,cell2)*theta_m(k,cell2)-rt_driving_values(k,cell2)) & - (rho_zz(k,cell1)*theta_m(k,cell1)-rt_driving_values(k,cell1)) ) @@ -6971,9 +8167,11 @@ subroutine atm_bdy_adjust_dynamics_relaxzone_tend( config, tend, state, diag, me end if end do + !$acc end parallel ! Third (and last), the horizontal filter for ru - + !$acc parallel default(present) + !$acc loop gang worker private(divergence1, divergence2, vorticity1, vorticity2) do iEdge = edgeStart, edgeEnd if ( (bdyMaskEdge(iEdge) > 1) .and. (bdyMaskEdge(iEdge) <= nRelaxZone) ) then ! relaxation zone @@ -6990,10 +8188,19 @@ subroutine atm_bdy_adjust_dynamics_relaxzone_tend( config, tend, state, diag, me iCell = cell1 invArea = invAreaCell(iCell) - divergence1(1:nVertLevels) = 0. + !$acc loop vector + do k=1,nVertLevels + divergence1(k) = 0. + divergence2(k) = 0. + vorticity1(k) = 0. + vorticity2(k) = 0. + end do + + !$acc loop seq do i=1,nEdgesOnCell(iCell) iEdge_div = edgesOnCell(i,iCell) edge_sign = invArea * dvEdge(iEdge_div) * edgesOnCell_sign(i,iCell) + !$acc loop vector do k=1,nVertLevels divergence1(k) = divergence1(k) + edge_sign * (ru(k,iEdge_div) - ru_driving_values(k,iEdge_div)) end do @@ -7001,30 +8208,33 @@ subroutine atm_bdy_adjust_dynamics_relaxzone_tend( config, tend, state, diag, me iCell = cell2 invArea = invAreaCell(iCell) - divergence2(1:nVertLevels) = 0. + !$acc loop seq do i=1,nEdgesOnCell(iCell) iEdge_div = edgesOnCell(i,iCell) edge_sign = invArea * dvEdge(iEdge_div) * edgesOnCell_sign(i,iCell) + !$acc loop vector do k=1,nVertLevels divergence2(k) = divergence2(k) + edge_sign * (ru(k,iEdge_div) - ru_driving_values(k,iEdge_div)) end do end do iVertex = vertex1 - vorticity1(1:nVertLevels) = 0. + !$acc loop seq do i=1,vertexDegree iEdge_vort = edgesOnVertex(i,iVertex) edge_sign = invAreaTriangle(iVertex) * dcEdge(iEdge_vort) * edgesOnVertex_sign(i,iVertex) + !$acc loop vector do k=1,nVertLevels vorticity1(k) = vorticity1(k) + edge_sign * (ru(k,iEdge_vort) - ru_driving_values(k,iEdge_vort)) end do end do iVertex = vertex2 - vorticity2(1:nVertLevels) = 0. + !$acc loop seq do i=1,vertexDegree iEdge_vort = edgesOnVertex(i,iVertex) edge_sign = invAreaTriangle(iVertex) * dcEdge(iEdge_vort) * edgesOnVertex_sign(i,iVertex) + !$acc loop vector do k=1,nVertLevels vorticity2(k) = vorticity2(k) + edge_sign * (ru(k,iEdge_vort) - ru_driving_values(k,iEdge_vort)) end do @@ -7032,6 +8242,7 @@ subroutine atm_bdy_adjust_dynamics_relaxzone_tend( config, tend, state, diag, me ! Compute diffusion, computed as \nabla divergence - k \times \nabla vorticity ! + !$acc loop vector do k=1,nVertLevels tend_ru(k,iEdge) = tend_ru(k,iEdge) + laplacian_filter_coef & * (divdamp_coef * (divergence2(k) - divergence1(k)) * r_dc & @@ -7041,6 +8252,11 @@ subroutine atm_bdy_adjust_dynamics_relaxzone_tend( config, tend, state, diag, me end if ! end test for relaxation-zone edge end do ! end of loop over edges + !$acc end parallel + + MPAS_ACC_TIMER_START('atm_bdy_adjust_dynamics_relaxzone_tend [ACC_data_xfer]') + !$acc exit data delete(divergence1, divergence2, vorticity1, vorticity2) + MPAS_ACC_TIMER_STOP('atm_bdy_adjust_dynamics_relaxzone_tend [ACC_data_xfer]') end subroutine atm_bdy_adjust_dynamics_relaxzone_tend @@ -7074,11 +8290,6 @@ subroutine atm_bdy_reset_speczone_values( state, diag, mesh, nVertLevels, & call mpas_pool_get_array(state, 'theta_m', theta_m, 2) call mpas_pool_get_array(diag, 'rtheta_p', rtheta_p) call mpas_pool_get_array(diag, 'rtheta_base', rtheta_base) - - MPAS_ACC_TIMER_START('atm_bdy_reset_speczone_values [ACC_data_xfer]') - !$acc enter data copyin(rt_driving_values, rho_driving_values, rtheta_base, & - !$acc theta_m, rtheta_p) - MPAS_ACC_TIMER_STOP('atm_bdy_reset_speczone_values [ACC_data_xfer]') !$acc parallel default(present) !$acc loop gang worker @@ -7093,11 +8304,6 @@ subroutine atm_bdy_reset_speczone_values( state, diag, mesh, nVertLevels, & end do !$acc end parallel - MPAS_ACC_TIMER_START('atm_bdy_reset_speczone_values [ACC_data_xfer]') - !$acc exit data copyout(theta_m, rtheta_p) & - !$acc delete(rt_driving_values, rho_driving_values, rtheta_base) - MPAS_ACC_TIMER_STOP('atm_bdy_reset_speczone_values [ACC_data_xfer]') - end subroutine atm_bdy_reset_speczone_values !------------------------------------------------------------------------- @@ -7186,17 +8392,28 @@ subroutine atm_bdy_adjust_scalars_work( scalars_new, scalars_driving, dt, dt_rk, integer :: iCell, iEdge, iScalar, i, k, cell1, cell2 !--- + MPAS_ACC_TIMER_START('atm_bdy_adjust_scalars [ACC_data_xfer]') + !$acc enter data create(scalars_tmp) + MPAS_ACC_TIMER_STOP('atm_bdy_adjust_scalars [ACC_data_xfer]') + !$acc parallel default(present) + !$acc loop gang worker do iCell = cellSolveStart, cellSolveEnd ! threaded over cells if ( (bdyMaskCell(iCell) > 1) .and. (bdyMaskCell(iCell) <= nRelaxZone) ) then ! relaxation zone laplacian_filter_coef = dt_rk*(real(bdyMaskCell(iCell)) - 1.)/real(nRelaxZone)/(10.*dt*meshScalingRegionalCell(iCell)) rayleigh_damping_coef = laplacian_filter_coef/5.0 - scalars_tmp(1:num_scalars,1:nVertLevels,iCell) = scalars_new(1:num_scalars,1:nVertLevels,iCell) + !$acc loop collapse(2) + do k=1,nVertLevels + do iScalar=1,num_scalars + scalars_tmp(iScalar,k,iCell) = scalars_new(iScalar,k,iCell) + end do + end do ! first, we compute the 2nd-order laplacian filter ! + !$acc loop seq do i=1,nEdgesOnCell(iCell) iEdge = edgesOnCell(i,iCell) ! edge_sign = r_areaCell*edgesOnCell_sign(i,iCell) * dvEdge(iEdge) * invDcEdge(iEdge) * laplacian_filter_coef @@ -7205,6 +8422,7 @@ subroutine atm_bdy_adjust_scalars_work( scalars_new, scalars_driving, dt, dt_rk, cell1 = cellsOnEdge(1,iEdge) cell2 = cellsOnEdge(2,iEdge) !DIR$ IVDEP + !$acc loop vector collapse(2) do k=1,nVertLevels do iScalar = 1, num_scalars filter_flux = edge_sign*( (scalars_new(iScalar,k,cell2)-scalars_driving(iScalar,k,cell2)) & @@ -7217,6 +8435,7 @@ subroutine atm_bdy_adjust_scalars_work( scalars_new, scalars_driving, dt, dt_rk, ! second, we compute the Rayleigh damping component ! !DIR$ IVDEP + !$acc loop collapse(2) do k=1,nVertLevels do iScalar = 1, num_scalars scalars_tmp(iScalar,k,iCell) =scalars_tmp(iScalar,k,iCell) - rayleigh_damping_coef * (scalars_new(iScalar,k,iCell)-scalars_driving(iScalar,k,iCell)) @@ -7224,10 +8443,11 @@ subroutine atm_bdy_adjust_scalars_work( scalars_new, scalars_driving, dt, dt_rk, end do else if ( bdyMaskCell(iCell) > nRelaxZone) then ! specified zone - + ! update the specified-zone values ! !DIR$ IVDEP + !$acc loop collapse(2) do k=1,nVertLevels do iScalar = 1, num_scalars scalars_tmp(iScalar,k,iCell) = scalars_driving(iScalar,k,iCell) @@ -7237,12 +8457,16 @@ subroutine atm_bdy_adjust_scalars_work( scalars_new, scalars_driving, dt, dt_rk, end if end do ! updates now in temp storage - + !$acc end parallel + !$OMP BARRIER + !$acc parallel default(present) + !$acc loop gang worker do iCell = cellSolveStart, cellSolveEnd ! threaded over cells if (bdyMaskCell(iCell) > 1) then ! update values !DIR$ IVDEP + !$acc loop collapse(2) do k=1,nVertLevels do iScalar = 1, num_scalars scalars_new(iScalar,k,iCell) = scalars_tmp(iScalar,k,iCell) @@ -7250,6 +8474,11 @@ subroutine atm_bdy_adjust_scalars_work( scalars_new, scalars_driving, dt, dt_rk, end do end if end do + !$acc end parallel + + MPAS_ACC_TIMER_START('atm_bdy_adjust_scalars [ACC_data_xfer]') + !$acc exit data delete(scalars_tmp) + MPAS_ACC_TIMER_STOP('atm_bdy_adjust_scalars [ACC_data_xfer]') end subroutine atm_bdy_adjust_scalars_work @@ -7319,10 +8548,6 @@ subroutine atm_bdy_set_scalars_work( scalars_driving, scalars_new, & !--- - MPAS_ACC_TIMER_START('atm_bdy_set_scalars_work [ACC_data_xfer]') - !$acc enter data copyin(scalars_new, scalars_driving) - MPAS_ACC_TIMER_STOP('atm_bdy_set_scalars_work [ACC_data_xfer]') - !$acc parallel default(present) !$acc loop gang worker do iCell = cellSolveStart, cellSolveEnd ! threaded over cells @@ -7343,11 +8568,6 @@ subroutine atm_bdy_set_scalars_work( scalars_driving, scalars_new, & end do ! updates now in temp storage !$acc end parallel - - MPAS_ACC_TIMER_START('atm_bdy_set_scalars_work [ACC_data_xfer]') - !$acc exit data copyout(scalars_new) - !$acc exit data delete(scalars_driving) - MPAS_ACC_TIMER_STOP('atm_bdy_set_scalars_work [ACC_data_xfer]') end subroutine atm_bdy_set_scalars_work @@ -7368,7 +8588,8 @@ subroutine summarize_timestep(domain) logical, pointer :: config_print_global_minmax_sca integer :: iCell, k, iEdge, iScalar - integer, pointer :: num_scalars, nCellsSolve, nEdgesSolve, nVertLevels + integer, pointer :: num_scalars_ptr, nCellsSolve_ptr, nEdgesSolve_ptr, nVertLevels_ptr + integer :: num_scalars, nCellsSolve, nEdgesSolve, nVertLevels type (mpas_pool_type), pointer :: state type (mpas_pool_type), pointer :: diag @@ -7388,11 +8609,19 @@ subroutine summarize_timestep(domain) real (kind=RKIND) :: lonMax, lonMax_global real (kind=RKIND), dimension(5) :: localVals, globalVals - real (kind=RKIND) :: spd + real (kind=RKIND), dimension(:,:), allocatable :: spd + !$acc declare create(spd) real (kind=RKIND), dimension(:,:), pointer :: w real (kind=RKIND), dimension(:,:), pointer :: u, v, uReconstructZonal, uReconstructMeridional, uReconstructX, uReconstructY, uReconstructZ real (kind=RKIND), dimension(:,:,:), pointer :: scalars, scalars_1, scalars_2 + integer :: lin_indexMax ! Position of min/max in some array as a single integer. + ! If multiple locations match the extrema, always pick the lowest lin_indexMax. + ! array(k,idx) -> lin_indexMax = k+(idx-1)*size(array,1) + ! array(5,4) with size(array,1) = 10 -> lin_indexMax = 35 + + logical found_NaN + call mpas_pool_get_config(block % configs, 'config_print_global_minmax_vel', config_print_global_minmax_vel) call mpas_pool_get_config(block % configs, 'config_print_detailed_minmax_vel', config_print_detailed_minmax_vel) call mpas_pool_get_config(block % configs, 'config_print_global_minmax_sca', config_print_global_minmax_sca) @@ -7412,26 +8641,45 @@ subroutine summarize_timestep(domain) call mpas_pool_get_array(mesh, 'lonCell', lonCell) call mpas_pool_get_array(mesh, 'latEdge', latEdge) call mpas_pool_get_array(mesh, 'lonEdge', lonEdge) - call mpas_pool_get_dimension(state, 'nCellsSolve', nCellsSolve) - call mpas_pool_get_dimension(state, 'nEdgesSolve', nEdgesSolve) - call mpas_pool_get_dimension(state, 'nVertLevels', nVertLevels) + call mpas_pool_get_dimension(state, 'nCellsSolve', nCellsSolve_ptr) + call mpas_pool_get_dimension(state, 'nEdgesSolve', nEdgesSolve_ptr) + call mpas_pool_get_dimension(state, 'nVertLevels', nVertLevels_ptr) + nCellsSolve = nCellsSolve_ptr + nEdgesSolve = nEdgesSolve_ptr + nVertLevels = nVertLevels_ptr + + allocate(spd(nVertLevels,nEdgesSolve)) + scalar_min = 1.0e20 + lin_indexMax = huge(1) indexMax = -1 kMax = -1 latMax = 0.0 lonMax = 0.0 + !$acc parallel default(present) + !$acc loop collapse(2) gang vector reduction(min:scalar_min) + do iCell = 1, nCellsSolve + do k = 1, nVertLevels + scalar_min = min(scalar_min, w(k,iCell)) + end do + end do + !$acc end parallel + !$acc parallel default(present) + !$acc loop collapse(2) gang vector reduction(min:lin_indexMax) do iCell = 1, nCellsSolve do k = 1, nVertLevels - if (w(k,iCell) < scalar_min) then - scalar_min = w(k,iCell) - indexMax = iCell - kMax = k - latMax = latCell(iCell) - lonMax = lonCell(iCell) + if (w(k,iCell) == scalar_min) then + ! In case 2 locations tie, only save the minimum value + lin_indexMax = min(lin_indexMax, k + size(w,1)*(iCell-1)) end if end do end do + !$acc end parallel + kMax = modulo(lin_indexMax, size(w,1)) + indexMax = ((lin_indexMax - kMax) / size(w,1)) + 1 + latMax = latCell(indexMax) + lonMax = lonCell(indexMax) localVals(1) = scalar_min localVals(2) = real(indexMax,kind=RKIND) localVals(3) = real(kMax,kind=RKIND) @@ -7453,21 +8701,33 @@ subroutine summarize_timestep(domain) realArgs=(/global_scalar_min, latMax_global, lonMax_global/)) scalar_max = -1.0e20 + lin_indexMax = huge(1) indexMax = -1 kMax = -1 latMax = 0.0 lonMax = 0.0 + !$acc parallel default(present) + !$acc loop collapse(2) gang vector reduction(max:scalar_max) + do iCell = 1, nCellsSolve + do k = 1, nVertLevels + scalar_max = max(scalar_max, w(k,iCell)) + end do + end do + !$acc end parallel + !$acc parallel default(present) + !$acc loop collapse(2) gang vector reduction(min:lin_indexMax) do iCell = 1, nCellsSolve do k = 1, nVertLevels - if (w(k,iCell) > scalar_max) then - scalar_max = w(k,iCell) - indexMax = iCell - kMax = k - latMax = latCell(iCell) - lonMax = lonCell(iCell) + if (w(k,iCell) == scalar_max) then + lin_indexMax = min(lin_indexMax, k + size(w,1)*(iCell-1)) end if end do end do + !$acc end parallel + kMax = modulo(lin_indexMax, size(w,1)) + indexMax = ((lin_indexMax - kMax) / size(w,1)) + 1 + latMax = latCell(indexMax) + lonMax = lonCell(indexMax) localVals(1) = scalar_max localVals(2) = real(indexMax,kind=RKIND) localVals(3) = real(kMax,kind=RKIND) @@ -7489,21 +8749,33 @@ subroutine summarize_timestep(domain) realArgs=(/global_scalar_max, latMax_global, lonMax_global/)) scalar_min = 1.0e20 + lin_indexMax = huge(1) indexMax = -1 kMax = -1 latMax = 0.0 lonMax = 0.0 + !$acc parallel default(present) + !$acc loop collapse(2) gang vector reduction(min:scalar_min) + do iEdge = 1, nEdgesSolve + do k = 1, nVertLevels + scalar_min = min(scalar_min, u(k,iEdge)) + end do + end do + !$acc end parallel + !$acc parallel default(present) + !$acc loop collapse(2) gang vector reduction(min:lin_indexMax) do iEdge = 1, nEdgesSolve do k = 1, nVertLevels - if (u(k,iEdge) < scalar_min) then - scalar_min = u(k,iEdge) - indexMax = iEdge - kMax = k - latMax = latEdge(iEdge) - lonMax = lonEdge(iEdge) + if (u(k,iEdge) == scalar_min) then + lin_indexMax = min(lin_indexMax, k + size(u,1)*(iEdge-1)) end if end do end do + !$acc end parallel + kMax = modulo(lin_indexMax, size(u,1)) + indexMax = ((lin_indexMax - kMax) / size(u,1)) + 1 + latMax = latEdge(indexMax) + lonMax = lonEdge(indexMax) localVals(1) = scalar_min localVals(2) = real(indexMax,kind=RKIND) localVals(3) = real(kMax,kind=RKIND) @@ -7525,21 +8797,33 @@ subroutine summarize_timestep(domain) realArgs=(/global_scalar_min, latMax_global, lonMax_global/)) scalar_max = -1.0e20 + lin_indexMax = huge(1) indexMax = -1 kMax = -1 latMax = 0.0 lonMax = 0.0 + !$acc parallel default(present) + !$acc loop collapse(2) gang vector reduction(max:scalar_max) do iEdge = 1, nEdgesSolve do k = 1, nVertLevels - if (u(k,iEdge) > scalar_max) then - scalar_max = u(k,iEdge) - indexMax = iEdge - kMax = k - latMax = latEdge(iEdge) - lonMax = lonEdge(iEdge) + scalar_max = max(scalar_max, u(k,iEdge)) + end do + end do + !$acc end parallel + !$acc parallel default(present) + !$acc loop collapse(2) gang vector reduction(min:lin_indexMax) + do iEdge = 1, nEdgesSolve + do k = 1, nVertLevels + if (u(k,iEdge) == scalar_max) then + lin_indexMax = min(lin_indexMax, k + size(u,1)*(iEdge-1)) end if end do end do + !$acc end parallel + kMax = modulo(lin_indexMax, size(u,1)) + indexMax = ((lin_indexMax - kMax) / size(u,1)) + 1 + latMax = latEdge(indexMax) + lonMax = lonEdge(indexMax) localVals(1) = scalar_max localVals(2) = real(indexMax,kind=RKIND) localVals(3) = real(kMax,kind=RKIND) @@ -7561,22 +8845,34 @@ subroutine summarize_timestep(domain) realArgs=(/global_scalar_max, latMax_global, lonMax_global/)) scalar_max = -1.0e20 + lin_indexMax = huge(1) indexMax = -1 kMax = -1 latMax = 0.0 lonMax = 0.0 + !$acc parallel default(present) + !$acc loop collapse(2) gang vector reduction(max:scalar_max) + do iEdge = 1, nEdgesSolve + do k = 1, nVertLevels + spd(k,iEdge) = sqrt(u(k,iEdge)*u(k,iEdge) + v(k,iEdge)*v(k,iEdge)) + scalar_max = max(scalar_max, spd(k,iEdge)) + end do + end do + !$acc end parallel + !$acc parallel default(present) + !$acc loop collapse(2) gang vector reduction(min:lin_indexMax) do iEdge = 1, nEdgesSolve do k = 1, nVertLevels - spd = sqrt(u(k,iEdge)*u(k,iEdge) + v(k,iEdge)*v(k,iEdge)) - if (spd > scalar_max) then - scalar_max = spd - indexMax = iEdge - kMax = k - latMax = latEdge(iEdge) - lonMax = lonEdge(iEdge) + if (spd(k,iEdge) == scalar_max) then + lin_indexMax = min(lin_indexMax, k + size(u,1)*(iEdge-1)) end if end do end do + !$acc end parallel + kMax = modulo(lin_indexMax, size(u,1)) + indexMax = ((lin_indexMax - kMax) / size(u,1)) + 1 + latMax = latEdge(indexMax) + lonMax = lonEdge(indexMax) localVals(1) = scalar_max localVals(2) = real(indexMax,kind=RKIND) localVals(3) = real(kMax,kind=RKIND) @@ -7600,21 +8896,39 @@ subroutine summarize_timestep(domain) ! ! Check for NaNs ! + found_NaN = .false. + + !$acc parallel default(present) + !$acc loop collapse(2) gang vector reduction(.or.:found_NaN) do iCell = 1, nCellsSolve do k = 1, nVertLevels if (ieee_is_nan(w(k,iCell))) then - call mpas_log_write('NaN detected in ''w'' field.', messageType=MPAS_LOG_CRIT) + found_NaN = .true. end if end do end do + !$acc end parallel + if (found_NaN) then + call mpas_log_write('NaN detected in ''w'' field.', messageType=MPAS_LOG_CRIT) + end if + + !$acc parallel default(present) + !$acc loop collapse(2) gang vector reduction(.or.:found_NaN) do iEdge = 1, nEdgesSolve do k = 1, nVertLevels if (ieee_is_nan(u(k,iEdge))) then - call mpas_log_write('NaN detected in ''u'' field.', messageType=MPAS_LOG_CRIT) + found_NaN = .true. end if end do end do + !$acc end parallel + if (found_NaN) then + call mpas_log_write('NaN detected in ''u'' field.', messageType=MPAS_LOG_CRIT) + end if + + + deallocate(spd) else if (config_print_global_minmax_vel) then call mpas_log_write('') @@ -7622,33 +8936,43 @@ subroutine summarize_timestep(domain) call mpas_pool_get_subpool(block % structs, 'state', state) call mpas_pool_get_array(state, 'w', w, 2) call mpas_pool_get_array(state, 'u', u, 2) - call mpas_pool_get_dimension(state, 'nCellsSolve', nCellsSolve) - call mpas_pool_get_dimension(state, 'nEdgesSolve', nEdgesSolve) - call mpas_pool_get_dimension(state, 'nVertLevels', nVertLevels) + call mpas_pool_get_dimension(state, 'nCellsSolve', nCellsSolve_ptr) + call mpas_pool_get_dimension(state, 'nEdgesSolve', nEdgesSolve_ptr) + call mpas_pool_get_dimension(state, 'nVertLevels', nVertLevels_ptr) + nCellsSolve = nCellsSolve_ptr + nEdgesSolve = nEdgesSolve_ptr + nVertLevels = nVertLevels_ptr scalar_min = 0.0 scalar_max = 0.0 + !$acc parallel default(present) + !$acc loop gang vector collapse(2) reduction(min:scalar_min) reduction(max:scalar_max) do iCell = 1, nCellsSolve do k = 1, nVertLevels scalar_min = min(scalar_min, w(k,iCell)) scalar_max = max(scalar_max, w(k,iCell)) end do end do + !$acc end parallel call mpas_dmpar_min_real(domain % dminfo, scalar_min, global_scalar_min) call mpas_dmpar_max_real(domain % dminfo, scalar_max, global_scalar_max) call mpas_log_write('global min, max w $r $r', realArgs=(/global_scalar_min, global_scalar_max/)) scalar_min = 0.0 scalar_max = 0.0 + !$acc parallel default(present) + !$acc loop gang vector collapse(2) reduction(min:scalar_min) reduction(max:scalar_max) do iEdge = 1, nEdgesSolve do k = 1, nVertLevels scalar_min = min(scalar_min, u(k,iEdge)) scalar_max = max(scalar_max, u(k,iEdge)) end do end do + !$acc end parallel call mpas_dmpar_min_real(domain % dminfo, scalar_min, global_scalar_min) call mpas_dmpar_max_real(domain % dminfo, scalar_max, global_scalar_max) call mpas_log_write('global min, max u $r $r', realArgs=(/global_scalar_min, global_scalar_max/)) + end if if (config_print_global_minmax_sca) then @@ -7658,23 +8982,30 @@ subroutine summarize_timestep(domain) call mpas_pool_get_subpool(block % structs, 'state', state) call mpas_pool_get_array(state, 'scalars', scalars, 2) - call mpas_pool_get_dimension(state, 'nCellsSolve', nCellsSolve) - call mpas_pool_get_dimension(state, 'nVertLevels', nVertLevels) - call mpas_pool_get_dimension(state, 'num_scalars', num_scalars) + call mpas_pool_get_dimension(state, 'nCellsSolve', nCellsSolve_ptr) + call mpas_pool_get_dimension(state, 'nVertLevels', nVertLevels_ptr) + call mpas_pool_get_dimension(state, 'num_scalars', num_scalars_ptr) + nCellsSolve = nCellsSolve_ptr + nVertLevels = nVertLevels_ptr + num_scalars = num_scalars_ptr do iScalar = 1, num_scalars scalar_min = 0.0 scalar_max = 0.0 + !$acc parallel default(present) + !$acc loop gang vector collapse(2) reduction(min:scalar_min) reduction(max:scalar_max) do iCell = 1, nCellsSolve do k = 1, nVertLevels scalar_min = min(scalar_min, scalars(iScalar,k,iCell)) scalar_max = max(scalar_max, scalars(iScalar,k,iCell)) end do end do + !$acc end parallel call mpas_dmpar_min_real(domain % dminfo, scalar_min, global_scalar_min) call mpas_dmpar_max_real(domain % dminfo, scalar_max, global_scalar_max) call mpas_log_write(' global min, max scalar $i $r $r', intArgs=(/iScalar/), realArgs=(/global_scalar_min, global_scalar_max/)) end do + end if end subroutine summarize_timestep diff --git a/src/core_atmosphere/mpas_atm_core.F b/src/core_atmosphere/mpas_atm_core.F index 997d7ca8ba..de5b2d0d00 100644 --- a/src/core_atmosphere/mpas_atm_core.F +++ b/src/core_atmosphere/mpas_atm_core.F @@ -43,7 +43,7 @@ function atm_core_init(domain, startTimeStamp) result(ierr) use mpas_atm_dimensions, only : mpas_atm_set_dims use mpas_atm_diagnostics_manager, only : mpas_atm_diag_setup use mpas_atm_threading, only : mpas_atm_threading_init - use atm_time_integration, only : mpas_atm_dynamics_init + use atm_time_integration, only : mpas_atm_dynamics_init, mpas_atm_pre_dynamics_h2d, mpas_atm_post_dynamics_d2h use mpas_timer, only : mpas_timer_start, mpas_timer_stop use mpas_attlist, only : mpas_modify_att use mpas_string_utils, only : mpas_string_replace @@ -76,7 +76,7 @@ function atm_core_init(domain, startTimeStamp) result(ierr) integer, pointer :: nVertLevels, maxEdges, maxEdges2, num_scalars character (len=ShortStrKIND) :: init_stream_name real (kind=R8KIND) :: input_start_time, input_stop_time - + ierr = 0 @@ -284,7 +284,7 @@ function atm_core_init(domain, startTimeStamp) result(ierr) block => block % next end do - + call exchange_halo_group(domain, 'initialization:pv_edge,ru,rw') call mpas_atm_diag_setup(domain % streamManager, domain % blocklist % configs, & @@ -476,7 +476,9 @@ subroutine atm_mpas_init_block(dminfo, stream_manager, block, mesh, dt) allocate(ke_vertex(nVertLevels,nVertices+1)) ! ke_vertex is a module variable defined in mpas_atm_time_integration.F ke_vertex(:,nVertices+1) = 0.0_RKIND allocate(ke_edge(nVertLevels,nEdges+1)) ! ke_edge is a module variable defined in mpas_atm_time_integration.F + !$acc kernels ke_edge(:,nEdges+1) = 0.0_RKIND + !$acc end kernels call mpas_pool_get_dimension(block % dimensions, 'nThreads', nThreads) @@ -495,6 +497,7 @@ subroutine atm_mpas_init_block(dminfo, stream_manager, block, mesh, dt) call mpas_pool_get_dimension(block % dimensions, 'edgeSolveThreadStart', edgeSolveThreadStart) call mpas_pool_get_dimension(block % dimensions, 'edgeSolveThreadEnd', edgeSolveThreadEnd) + call mpas_atm_pre_computesolvediag_h2d(block) !$OMP PARALLEL DO do thread=1,nThreads if (.not. config_do_restart .or. (config_do_restart .and. config_do_DAcycling)) then @@ -506,13 +509,13 @@ subroutine atm_mpas_init_block(dminfo, stream_manager, block, mesh, dt) vertexSolveThreadStart(thread), vertexSolveThreadEnd(thread), & edgeSolveThreadStart(thread), edgeSolveThreadEnd(thread)) end if - call atm_compute_solve_diagnostics(dt, state, 1, diag, mesh, block % configs, & cellThreadStart(thread), cellThreadEnd(thread), & vertexThreadStart(thread), vertexThreadEnd(thread), & edgeThreadStart(thread), edgeThreadEnd(thread)) end do !$OMP END PARALLEL DO + call mpas_atm_post_computesolvediag_d2h(block) deallocate(ke_vertex) deallocate(ke_edge) @@ -595,6 +598,7 @@ function atm_core_run(domain) result(ierr) use mpas_derived_types, only : MPAS_STREAM_LATEST_BEFORE, MPAS_STREAM_INPUT, MPAS_STREAM_INPUT_OUTPUT use mpas_timer, only : mpas_timer_start, mpas_timer_stop use mpas_atm_boundaries, only : mpas_atm_update_bdy_tend + use atm_time_integration, only : mpas_atm_pre_dynamics_h2d, mpas_atm_post_dynamics_d2h use mpas_atm_diagnostics_manager, only : mpas_atm_diag_update, mpas_atm_diag_compute, mpas_atm_diag_reset implicit none diff --git a/src/core_atmosphere/physics/mpas_atmphys_interface.F b/src/core_atmosphere/physics/mpas_atmphys_interface.F index 40125972ab..3aba022e47 100644 --- a/src/core_atmosphere/physics/mpas_atmphys_interface.F +++ b/src/core_atmosphere/physics/mpas_atmphys_interface.F @@ -595,11 +595,13 @@ subroutine microphysics_from_MPAS(configs,mesh,state,time_lev,diag,diag_physics, call mpas_pool_get_array(state,'rho_zz' ,rho_zz ,time_lev) call mpas_pool_get_array(state,'theta_m',theta_m,time_lev) call mpas_pool_get_array(state,'w' ,w ,time_lev) + !$acc update host(exner, pressure_b, pressure_p, rho_zz, theta_m, w) call mpas_pool_get_dimension(state,'index_qv',index_qv) call mpas_pool_get_dimension(state,'index_qc',index_qc) call mpas_pool_get_dimension(state,'index_qr',index_qr) call mpas_pool_get_array(state,'scalars',scalars,time_lev) + !$acc update host(scalars) qv => scalars(index_qv,:,:) qc => scalars(index_qc,:,:) qr => scalars(index_qr,:,:) @@ -831,7 +833,6 @@ subroutine microphysics_to_MPAS(configs,mesh,state,time_lev,diag,diag_physics,te call mpas_pool_get_array(diag,'rtheta_p' ,rtheta_p ) call mpas_pool_get_array(diag,'surface_pressure',surface_pressure) call mpas_pool_get_array(diag,'dtheta_dt_mp' ,dtheta_dt_mp ) - call mpas_pool_get_array(tend,'tend_sfc_pressure',tend_sfc_pressure) call mpas_pool_get_array(state,'rho_zz' ,rho_zz ,time_lev) @@ -1038,6 +1039,9 @@ subroutine microphysics_to_MPAS(configs,mesh,state,time_lev,diag,diag_physics,te case default end select mp_tend_select + !$acc update device(exner, exner_b, pressure_b, pressure_p, rtheta_b) + !$acc update device(rtheta_p, rho_zz, theta_m, scalars) + !$acc update device(rt_diabatic_tend) end subroutine microphysics_to_MPAS diff --git a/src/core_atmosphere/physics/mpas_atmphys_todynamics.F b/src/core_atmosphere/physics/mpas_atmphys_todynamics.F index 284b072851..e51331e99e 100644 --- a/src/core_atmosphere/physics/mpas_atmphys_todynamics.F +++ b/src/core_atmosphere/physics/mpas_atmphys_todynamics.F @@ -133,6 +133,7 @@ subroutine physics_get_tend(block,mesh,state,diag,tend,tend_physics,configs,rk_s call mpas_pool_get_array(diag ,'rho_edge',mass_edge) call mpas_pool_get_array(diag ,'tend_u_phys',tend_u_phys) + !$acc update self(theta_m, scalars, mass, mass_edge) call mpas_pool_get_dimension(state,'index_qv',index_qv) call mpas_pool_get_dimension(state,'index_qc',index_qc) call mpas_pool_get_dimension(state,'index_qr',index_qr) @@ -170,6 +171,7 @@ subroutine physics_get_tend(block,mesh,state,diag,tend,tend_physics,configs,rk_s call mpas_pool_get_array(tend_physics,'rthratensw',rthratensw) call mpas_pool_get_array(tend,'scalars_tend',tend_scalars) +!$acc update self(tend_scalars) ! Probably not needed !initialize the tendency for the potential temperature and all scalars due to PBL, convection, @@ -219,6 +221,7 @@ subroutine physics_get_tend(block,mesh,state,diag,tend,tend_physics,configs,rk_s tend_th,tend_rtheta_physics,tend_scalars,tend_ru_physics,tend_u_phys, & exchange_halo_group) +!$acc update device(tend_scalars) !clean up any pointers that were allocated with zero size before the call to physics_get_tend_work: if(size(rucuten) == 0 ) deallocate(rucuten ) if(size(rvcuten) == 0 ) deallocate(rvcuten ) diff --git a/src/operators/mpas_vector_reconstruction.F b/src/operators/mpas_vector_reconstruction.F index 789ba50c1e..6bc3a3d804 100644 --- a/src/operators/mpas_vector_reconstruction.F +++ b/src/operators/mpas_vector_reconstruction.F @@ -24,6 +24,16 @@ module mpas_vector_reconstruction use mpas_rbf_interpolation use mpas_vector_operations + ! For use in OpenACC ported code to track in-function transfers + use mpas_timer, only : mpas_timer_start, mpas_timer_stop +#ifdef MPAS_OPENACC +#define MPAS_ACC_TIMER_START(X) call mpas_timer_start(X) +#define MPAS_ACC_TIMER_STOP(X) call mpas_timer_stop(X) +#else +#define MPAS_ACC_TIMER_START(X) +#define MPAS_ACC_TIMER_STOP(X) +#endif + implicit none public :: mpas_init_reconstruct, mpas_reconstruct @@ -207,10 +217,11 @@ subroutine mpas_reconstruct_2d(meshPool, u, uReconstructX, uReconstructY, uRecon ! temporary arrays needed in the compute procedure logical :: includeHalosLocal - integer, pointer :: nCells + integer, pointer :: nCells_ptr, nVertLevels_ptr + integer :: nCells, nVertLevels integer, dimension(:,:), pointer :: edgesOnCell integer, dimension(:), pointer :: nEdgesOnCell - integer :: iCell,iEdge, i + integer :: iCell,iEdge, i, k real(kind=RKIND), dimension(:), pointer :: latCell, lonCell real (kind=RKIND), dimension(:,:,:), pointer :: coeffs_reconstruct @@ -233,64 +244,108 @@ subroutine mpas_reconstruct_2d(meshPool, u, uReconstructX, uReconstructY, uRecon call mpas_pool_get_array(meshPool, 'edgesOnCell', edgesOnCell) if ( includeHalosLocal ) then - call mpas_pool_get_dimension(meshPool, 'nCells', nCells) + call mpas_pool_get_dimension(meshPool, 'nCells', nCells_ptr) else - call mpas_pool_get_dimension(meshPool, 'nCellsSolve', nCells) + call mpas_pool_get_dimension(meshPool, 'nCellsSolve', nCells_ptr) end if + call mpas_pool_get_dimension(meshPool, 'nVertLevels', nVertLevels_ptr) + ! Dereference pointers so that OpenACC copies them correctly + nCells = nCells_ptr + nVertLevels = nVertLevels_ptr call mpas_pool_get_array(meshPool, 'latCell', latCell) call mpas_pool_get_array(meshPool, 'lonCell', lonCell) call mpas_pool_get_config(meshPool, 'on_a_sphere', on_a_sphere) + MPAS_ACC_TIMER_START('mpas_reconstruct_2d [ACC_data_xfer]') + ! Only use sections needed, nCells may be all cells or only non-halo cells + !$acc enter data copyin(coeffs_reconstruct(:,:,1:nCells),nEdgesOnCell(1:nCells), & + !$acc edgesOnCell(:,1:nCells),latCell(1:nCells),lonCell(1:nCells)) + !$acc enter data copyin(u(:,:)) + !$acc enter data create(uReconstructX(:,1:nCells),uReconstructY(:,1:nCells), & + !$acc uReconstructZ(:,1:nCells),uReconstructZonal(:,1:nCells), & + !$acc uReconstructMeridional(:,1:nCells)) + MPAS_ACC_TIMER_STOP('mpas_reconstruct_2d [ACC_data_xfer]') + ! loop over cell centers !$omp do schedule(runtime) + !$acc parallel default(present) + !$acc loop gang do iCell = 1, nCells ! initialize the reconstructed vectors - uReconstructX(:,iCell) = 0.0 - uReconstructY(:,iCell) = 0.0 - uReconstructZ(:,iCell) = 0.0 + !$acc loop vector + do k=1,nVertLevels + uReconstructX(k,iCell) = 0.0 + uReconstructY(k,iCell) = 0.0 + uReconstructZ(k,iCell) = 0.0 + end do ! a more efficient reconstruction where rbf_values*matrix_reconstruct ! has been precomputed in coeffs_reconstruct + !$acc loop seq do i=1,nEdgesOnCell(iCell) iEdge = edgesOnCell(i,iCell) - uReconstructX(:,iCell) = uReconstructX(:,iCell) & - + coeffs_reconstruct(1,i,iCell) * u(:,iEdge) - uReconstructY(:,iCell) = uReconstructY(:,iCell) & - + coeffs_reconstruct(2,i,iCell) * u(:,iEdge) - uReconstructZ(:,iCell) = uReconstructZ(:,iCell) & - + coeffs_reconstruct(3,i,iCell) * u(:,iEdge) + !$acc loop vector + do k=1,nVertLevels + uReconstructX(k,iCell) = uReconstructX(k,iCell) & + + coeffs_reconstruct(1,i,iCell) * u(k,iEdge) + uReconstructY(k,iCell) = uReconstructY(k,iCell) & + + coeffs_reconstruct(2,i,iCell) * u(k,iEdge) + uReconstructZ(k,iCell) = uReconstructZ(k,iCell) & + + coeffs_reconstruct(3,i,iCell) * u(k,iEdge) + end do enddo enddo ! iCell + !$acc end parallel !$omp end do call mpas_threading_barrier() if (on_a_sphere) then !$omp do schedule(runtime) + !$acc parallel default(present) + !$acc loop gang do iCell = 1, nCells clat = cos(latCell(iCell)) slat = sin(latCell(iCell)) clon = cos(lonCell(iCell)) slon = sin(lonCell(iCell)) - uReconstructZonal(:,iCell) = -uReconstructX(:,iCell)*slon + & - uReconstructY(:,iCell)*clon - uReconstructMeridional(:,iCell) = -(uReconstructX(:,iCell)*clon & - + uReconstructY(:,iCell)*slon)*slat & - + uReconstructZ(:,iCell)*clat + !$acc loop vector + do k=1,nVertLevels + uReconstructZonal(k,iCell) = -uReconstructX(k,iCell)*slon + & + uReconstructY(k,iCell)*clon + uReconstructMeridional(k,iCell) = -(uReconstructX(k,iCell)*clon & + + uReconstructY(k,iCell)*slon)*slat & + + uReconstructZ(k,iCell)*clat + end do end do + !$acc end parallel !$omp end do else !$omp do schedule(runtime) + !$acc parallel default(present) + !$acc loop gang vector collapse(2) do iCell = 1, nCells - uReconstructZonal (:,iCell) = uReconstructX(:,iCell) - uReconstructMeridional(:,iCell) = uReconstructY(:,iCell) + do k=1,nVertLevels + uReconstructZonal (k,iCell) = uReconstructX(k,iCell) + uReconstructMeridional(k,iCell) = uReconstructY(k,iCell) + end do end do + !$acc end parallel !$omp end do end if + MPAS_ACC_TIMER_START('mpas_reconstruct_2d [ACC_data_xfer]') + !$acc exit data delete(coeffs_reconstruct(:,:,1:nCells),nEdgesOnCell(1:nCells), & + !$acc edgesOnCell(:,1:nCells),latCell(1:nCells),lonCell(1:nCells)) + !$acc exit data delete(u(:,:)) + !$acc exit data copyout(uReconstructX(:,1:nCells),uReconstructY(:,1:nCells), & + !$acc uReconstructZ(:,1:nCells), uReconstructZonal(:,1:nCells), & + !$acc uReconstructMeridional(:,1:nCells)) + MPAS_ACC_TIMER_STOP('mpas_reconstruct_2d [ACC_data_xfer]') + end subroutine mpas_reconstruct_2d!}}}