From 3344c52bc27fd3926a222369dfb111abc89556a7 Mon Sep 17 00:00:00 2001 From: Aaron Scheinberg Date: Sun, 4 Jun 2023 11:10:39 -0400 Subject: [PATCH] Add XGC migration algorithm --- core/src/Cabana_CommunicationPlan.hpp | 49 +++++++ core/src/Cabana_Distributor.hpp | 181 ++++++++++++++++++++++++++ core/src/Cabana_TransposeInPlace.hpp | 94 +++++++++++++ 3 files changed, 324 insertions(+) create mode 100644 core/src/Cabana_TransposeInPlace.hpp diff --git a/core/src/Cabana_CommunicationPlan.hpp b/core/src/Cabana_CommunicationPlan.hpp index 380020d90..745169eea 100644 --- a/core/src/Cabana_CommunicationPlan.hpp +++ b/core/src/Cabana_CommunicationPlan.hpp @@ -669,6 +669,55 @@ class CommunicationPlan return counts_and_ids.second; } + template + BinningData + createFromExportsOnly_XGC( const ViewType& element_export_ranks ) + { + // Store the number of export elements. + _num_export_element = element_export_ranks.size(); + + // Get the size of this communicator. + int comm_size = -1; + MPI_Comm_size( comm(), &comm_size ); + + // Get the MPI rank we are currently on. + int my_rank = -1; + MPI_Comm_rank( comm(), &my_rank ); + + // Pick an mpi tag for communication. This object has it's own + // communication space so any mpi tag will do. + const int mpi_tag = 1221; + + // Bin the elements based on input keys + const int num_bin = comm_size + 2; // Extra initial bin for particles remaining on rank + // Extra final bin for particles being removed + auto bin_data = Cabana::binByKey( keys, num_bin ); + + // Copy the bin counts to the host. + auto bin_counts_host = Kokkos::create_mirror_view_and_copy( + Kokkos::HostSpace(), bin_data.binSize ); + + // Determine exports from bin counts. + _num_export.assign( comm_size ); + for(int i=0; i +void distributeData_XGC( + const Distributor_t& distributor, AoSoA_t& aosoa, + typename std::enable_if<( is_distributor::value && + is_aosoa::value ), + int>::type* = 0 ){ + Kokkos::Profiling::pushRegion( "Cabana::migrate" ); + + // Get the size of this communicator. + int n_ranks = -1; + MPI_Comm_size( distributor.comm(), &n_ranks ); + + // Get the MPI rank we are currently on. + int my_rank = -1; + MPI_Comm_rank( distributor.comm(), &my_rank ); + + + // Create custom data type. Using MPI_BYTE may risk int overflow and MPI doesnt support long long int. + MPI_Datatype XGC_MPI_PTL; + MPI_Type_contiguous(sizeof( Cabana::Tuple), MPI_BYTE, &XGC_MPI_PTL); + MPI_Type_commit(&XGC_MPI_PTL); + + // Transpose (in place) all vectors particles where at least one particle in the vector is leaving the rank + int transpose_offset = n_staying/AoSoA_t::vector_length; // Mark first vector to transpose + int nvecs_to_transpose = divide_and_round_up(n_staying + n_leaving,AoSoA_t::vector_length) - transpose_offset; // How many vectors to transpose + transpose_particles_from_AoSoA_to_AoS(aosoa, transpose_offset, nvecs_to_transpose); + +#ifdef USE_GPU_AWARE_MPI + using MPIDeviceType = AoSoA_t::device_type; +#else + using MPIDeviceType = HostType; +#endif + typename data_type = AoSoA_t::data_type; + + // Allocate and fill send buffer + Kokkos::View sendbuf(Kokkos::ViewAllocateWithoutInitializing("sendbuf"),n_leaving); + { + Kokkos::View> + optl((data_type*)aosoa.data() + n_staying, sendbuf.size()); + Kokkos::deep_copy(sendbuf, optl); + } + + // Declare unmanaged recvbuffer within the AoSoA, starting after the staying particles + Kokkos::View> recvbuf((data_type*)aosoa.data() + n_staying, n_arriving); + + // Pass MPI pointer to the buffer + data_type* sendbuf_ptr = sendbuf.data(); + +#ifdef USE_GPU_AWARE_MPI + // Pass MPI device pointers + data_type* recvbuf_ptr = recvbuf.data(); +#else + // Need to use a host mirror of recv buffer for MPI + typename Kokkos::View::HostMirror recvbuf_s = Kokkos::create_mirror_view(recvbuf); + + // Pass MPI host pointers + data_type* recvbuf_ptr = recvbuf_s.data(); +#endif + + // Choose communication ordering + std::vector ordered_pids(n_ranks); + bool use_pairwise_ordering=true; + if(use_pairwise_ordering){ + // Match up unique pairs to try to keep communication flowing + int p=1; while ( p < n_ranks ) p*=2; // get next power of 2 above n_ranks + int step = 0; + for(int i=0; i dispExport(n_ranks); + std::vector dispImport(n_ranks); + dispExport[0] = 0; + dispImport[0] = 0; + for(int i = 1; i rrequests(n_ranks); + for (int istep=0; istep0){ + MPI_Irecv(recvbuf_ptr+dispImport(i), distributor.numImport(i), XGC_MPI_PTL, i, i, distributor.comm(), &(rrequests[i])); + } + } + for (int istep=0; istep0){ + MPI_Send(sendbuf_ptr+dispExport(i), distributor.numExport(i), XGC_MPI_PTL, i, my_rank, distributor.comm()); + } + } + for (int istep=0; istep0){ + MPI_Wait(&(rrequests[i]),MPI_STATUS_IGNORE); + } + } + + +#ifndef USE_GPU_AWARE_MPI + // Copy back to device + Kokkos::deep_copy(recvbuf, recvbuf_s); +#endif + + // Transpose back to AoSoA + nvecs_to_transpose = divide_and_round_up(n_staying + n_arriving,AoSoA_t::vector_length) - transpose_offset; // How many vectors to transpose + transpose_particles_from_AoS_to_AoSoA(local_particles, transpose_offset, nvecs_to_transpose); + + Kokkos::Profiling::popRegion(); +} + //---------------------------------------------------------------------------// //! \endcond } // end namespace Impl @@ -398,6 +523,62 @@ void migrate( const Distributor_t& distributor, AoSoA_t& aosoa, aosoa.resize( distributor.totalNumImport() ); } +/*! + \brief Synchronously migrate data between two different decompositions using + the distributor forward communication plan. Single AoSoA version that will + resize in-place. Note that resizing does not necessarily allocate more + memory. The AoSoA memory will only increase if not enough has already been + reserved/allocated for the needed number of elements. + + Migrate moves all data to a new distribution that is uniquely owned - each + element will only have a single destination rank. + + \tparam Distributor_t Distributor type - must be a distributor. + + \tparam AoSoA_t AoSoA type - must be an AoSoA. + + \param distributor The distributor to use for the migration. + + \param aosoa The AoSoA containing the data to be migrated. Upon input, must + have the same number of elements as the inputs used to construct the + destributor. At output, it will be the same size as th enumber of import + elements on this rank provided by the distributor. Before using this + function, consider reserving enough memory in the data structure so + reallocating is not necessary. +*/ +template +void migrate_XGC( const MPI_Comm& comm, const ViewType& keys, AoSoA_t& aosoa, + typename std::enable_if<( is_distributor::value && + is_aosoa::value ), + int>::type* = 0 ) +{ + Distributor_t distributor(comm); + + // Bin data based on keys and set up distributor + auto bin_data = distributor.createFromExportsOnly_XGC( keys ); + + // Determine if the source of destination decomposition has more data on + // this rank. + bool dst_is_bigger = + ( distributor.totalNumImport() > distributor.exportSize() ); + + // If the destination decomposition is bigger than the source + // decomposition resize now so we have enough space to do the operation. + if ( dst_is_bigger ) + aosoa.resize( distributor.totalNumImport() ); + + /* Permute the data */ + Cabana::permute( bin_data, aosoa ); + + // Move the data. + Impl::distributeData_XGC( distributor, aosoa ); + + // If the destination decomposition is smaller than the source + // decomposition resize after we have moved the data. + if ( !dst_is_bigger ) + aosoa.resize( distributor.totalNumImport() ); +} + //---------------------------------------------------------------------------// /*! \brief Synchronously migrate data between two different decompositions using diff --git a/core/src/Cabana_TransposeInPlace.hpp b/core/src/Cabana_TransposeInPlace.hpp new file mode 100644 index 000000000..7e72c6660 --- /dev/null +++ b/core/src/Cabana_TransposeInPlace.hpp @@ -0,0 +1,94 @@ +#ifndef CABANA_TRANSPOSE_IN_PLACE_HPP +#define CABANA_TRANSPOSE_IN_PLACE_HPP + +#include +#include + +namespace Cabana +{ + +template +void transpose_particles_from_AoS_to_AoSoA(AoSoA_t& aosoa, int ioffset, int n_vec){ + const std::size_t PTL_N_DBL = sizeof( Cabana::Tuple)/8; + + // Pointer to local particles + VecParticlesSimple* vptl = (VecParticlesSimple*)aosoa.data() + ioffset; + +#ifdef USE_GPU + int team_size = AoSoA_t::vector_length; +#else + int team_size = 1; +#endif + int league_size = n_vec; + + typedef Kokkos::View> PtlVec; + size_t shmem_size = PtlVec::shmem_size(); // Could halve this, but may worsen memory access + Kokkos::parallel_for("transpose_particles_from_AoS_to_AoSoA", + Kokkos::TeamPolicy (league_size, team_size).set_scratch_size(KOKKOS_TEAM_SCRATCH_OPT,Kokkos::PerTeam(shmem_size)), + KOKKOS_LAMBDA (Kokkos::TeamPolicy::member_type team_member){ + // Locally shared: global index in population + PtlVec shmem_ptl_vec(team_member.team_scratch(KOKKOS_TEAM_SCRATCH_OPT)); + + int ivec = team_member.league_rank(); // vector assigned to this team + int ith = team_member.team_rank(); // This thread's rank in the team + + // Copy vector to shared memory + for(int idbl = ith; idbl +void transpose_particles_from_AoSoA_to_AoS(AoSoA_t& aosoa, int ioffset, int n_vec){ + const std::size_t PTL_N_DBL = sizeof( Cabana::Tuple)/8; + + // Pointer to local particles + VecParticlesSimple* vptl = (VecParticlesSimple*)aosoa.data() + ioffset; + +#ifdef USE_GPU + int team_size = AoSoA_t::vector_length; +#else + int team_size = 1; +#endif + int league_size = n_vec; + + typedef Kokkos::View> PtlVec; + size_t shmem_size = PtlVec::shmem_size(); // Could halve this, but may worsen memory access + Kokkos::parallel_for("transpose_particles_from_AoS_to_AoSoA", + Kokkos::TeamPolicy (league_size, team_size).set_scratch_size(KOKKOS_TEAM_SCRATCH_OPT,Kokkos::PerTeam(shmem_size)), + KOKKOS_LAMBDA (Kokkos::TeamPolicy::member_type team_member){ + // Locally shared: global index in population + PtlVec shmem_ptl_vec(team_member.team_scratch(KOKKOS_TEAM_SCRATCH_OPT)); + + int ivec = team_member.league_rank(); // vector assigned to this team + int ith = team_member.team_rank(); // This thread's rank in the team + + // Transpose into shared memory + for(int iptl = ith; iptl