diff --git a/src/accel/CMakeLists.txt b/src/accel/CMakeLists.txt index 5afa717580..b228cd7aa0 100644 --- a/src/accel/CMakeLists.txt +++ b/src/accel/CMakeLists.txt @@ -51,6 +51,7 @@ list(APPEND SOURCES ) celeritas_polysource(ExceptionConverter) +celeritas_polysource(LocalOpticalGenOffload) if(Geant4_VERSION VERSION_GREATER_EQUAL 11.0) list(APPEND SOURCES diff --git a/src/accel/LocalOpticalGenOffload.cc b/src/accel/LocalOpticalGenOffload.cc index 15f093a5f4..a971b2dc82 100644 --- a/src/accel/LocalOpticalGenOffload.cc +++ b/src/accel/LocalOpticalGenOffload.cc @@ -14,10 +14,11 @@ #include "corecel/sys/ScopedProfiling.hh" #include "geocel/GeantUtils.hh" #include "celeritas/global/CoreParams.hh" -#include "celeritas/optical/CoreParams.hh" -#include "celeritas/optical/CoreState.hh" +#include "celeritas/optical/TrackExecutor.hh" #include "celeritas/optical/Transporter.hh" +#include "celeritas/optical/action/ActionLauncher.hh" #include "celeritas/optical/gen/GeneratorAction.hh" +#include "celeritas/optical/gen/detail/UpdatePendingExecutor.hh" #include "celeritas/phys/GeneratorRegistry.hh" #include "SetupOptions.hh" @@ -105,17 +106,14 @@ void LocalOpticalGenOffload::InitializeEvent(int id) CELER_EXPECT(id >= 0); event_id_ = id_cast(id); - if constexpr (CELERITAS_RESEED == CELERITAS_RESEED_TRACKSLOT) + + if (!(G4Threading::IsMultithreadedApplication() + && G4MTRunManager::SeedOncePerCommunication())) { - if (!(G4Threading::IsMultithreadedApplication() - && G4MTRunManager::SeedOncePerCommunication())) - { - // Since Geant4 schedules events dynamically, reseed the Celeritas - // RNGs using the Geant4 event ID for reproducibility. This - // guarantees that an event can be reproduced given the event ID. - state_->reseed(transport_->params()->rng(), - id_cast(id)); - } + // Since Geant4 schedules events dynamically, reseed the Celeritas RNGs + // using the Geant4 event ID for reproducibility. This guarantees that + // an event can be reproduced given the event ID. + state_->reseed(transport_->params()->rng(), id_cast(id)); } } @@ -186,9 +184,19 @@ void LocalOpticalGenOffload::Flush() // Copy the buffered distributions to device generate_->insert(*state_, make_span(buffer_)); - auto counters = state_->sync_get_counters(); - counters.num_pending += num_photons_; - state_->sync_put_counters(counters); + // Update the number of primaries waiting to be generated based on the + // number of photons. + if (celeritas::device()) + { + auto* s = dynamic_cast*>(&*state_); + this->update_primaries(*s); + } + else + { + auto* s = dynamic_cast*>(&*state_); + this->update_primaries(*s); + } + num_photons_ = 0; buffer_.clear(); @@ -243,5 +251,32 @@ void LocalOpticalGenOffload::Finalize() CELER_ENSURE(!*this); } +//---------------------------------------------------------------------------// +/*! + * Call the UpdatePending functor to update the number of primaries to be + * generated to include the buffered optical photons; use only one host thread. + */ +void LocalOpticalGenOffload::update_primaries( + optical::CoreState& state) const +{ + auto const& optical_params = *transport_->params(); + auto execute_thread = make_single_track_executor( + optical_params.ptr(), + state.ptr(), + optical::detail::UpdatePendingExecutor{num_photons_}); + launch_action(1, execute_thread); +} + +//---------------------------------------------------------------------------// +// DEVICE-DISABLED IMPLEMENTATION +//---------------------------------------------------------------------------// +#if !CELER_USE_DEVICE +inline void LocalOpticalGenOffload::update_primaries( + optical::CoreState&) const +{ + CELER_NOT_CONFIGURED("CUDA OR HIP"); +} +#endif + //---------------------------------------------------------------------------// } // namespace celeritas diff --git a/src/accel/LocalOpticalGenOffload.cu b/src/accel/LocalOpticalGenOffload.cu new file mode 100644 index 0000000000..55c14f497e --- /dev/null +++ b/src/accel/LocalOpticalGenOffload.cu @@ -0,0 +1,36 @@ +//------------------------------ -*- cuda -*- -------------------------------// +// Copyright Celeritas contributors: see top-level COPYRIGHT file for details +// SPDX-License-Identifier: (Apache-2.0 OR MIT) +//---------------------------------------------------------------------------// +//! \file accel/LocalOpticalGenOffload.cu +//---------------------------------------------------------------------------// +#include "LocalOpticalGenOffload.hh" + +#include "celeritas/global/CoreParams.hh" +#include "celeritas/optical/TrackExecutor.hh" +#include "celeritas/optical/Transporter.hh" +#include "celeritas/optical/action/ActionLauncher.device.hh" +#include "celeritas/optical/gen/detail/UpdatePendingExecutor.hh" + +namespace celeritas +{ +//---------------------------------------------------------------------------// +/*! + * Call the UpdatePending functor to update number of primaries to be generated + * to include the buffered optical photons; use only one device thread. + */ +void LocalOpticalGenOffload::update_primaries( + optical::CoreState& state) const +{ + auto const& optical_params = *transport_->params(); + auto execute_thread = make_single_track_executor( + optical_params.ptr(), + state.ptr(), + optical::detail::UpdatePendingExecutor{num_photons_}); + static KernelLauncher const launch_kernel( + "update-pending"); + launch_kernel(1, state.stream_id(), execute_thread); +} + +//---------------------------------------------------------------------------// +} // namespace celeritas diff --git a/src/accel/LocalOpticalGenOffload.hh b/src/accel/LocalOpticalGenOffload.hh index 343c27860b..73a1205949 100644 --- a/src/accel/LocalOpticalGenOffload.hh +++ b/src/accel/LocalOpticalGenOffload.hh @@ -11,6 +11,8 @@ #include "corecel/Types.hh" #include "celeritas/Types.hh" #include "celeritas/inp/Control.hh" +#include "celeritas/optical/CoreParams.hh" +#include "celeritas/optical/CoreState.hh" #include "celeritas/optical/gen/GeneratorData.hh" #include "LocalOffloadInterface.hh" @@ -80,6 +82,11 @@ class LocalOpticalGenOffload final : public LocalOffloadInterface explicit operator bool() const { return this->Initialized(); } private: + // Update the number of primaries waiting to be generated on host/device + // Called by Flush() + void update_primaries(optical::CoreState&) const; + void update_primaries(optical::CoreState&) const; + // Transport pending optical tracks std::shared_ptr transport_; diff --git a/src/celeritas/CMakeLists.txt b/src/celeritas/CMakeLists.txt index 99b3cdc4fc..dcaed475c4 100644 --- a/src/celeritas/CMakeLists.txt +++ b/src/celeritas/CMakeLists.txt @@ -397,6 +397,7 @@ celeritas_polysource(em/model/RelativisticBremModel) celeritas_polysource(em/model/SeltzerBergerModel) celeritas_polysource(em/model/CoulombScatteringModel) celeritas_polysource(geo/detail/BoundaryAction) +celeritas_polysource(global/Stepper) celeritas_polysource(global/detail/KillActive) celeritas_polysource(global/detail/TrackSlotUtils) celeritas_polysource(mucf/model/DTMixMucfModel) @@ -404,17 +405,20 @@ celeritas_polysource(neutron/model/ChipsNeutronElasticModel) celeritas_polysource(neutron/model/NeutronCaptureModel) celeritas_polysource(neutron/model/NeutronInelasticModel) celeritas_polysource(optical/model/AbsorptionModel) +celeritas_polysource(optical/Runner) celeritas_polysource(optical/model/MieModel) celeritas_polysource(optical/model/RayleighModel) celeritas_polysource(optical/model/WavelengthShiftModel) celeritas_polysource(optical/action/AlongStepAction) celeritas_polysource(optical/action/DetectorAction) celeritas_polysource(optical/action/DiscreteSelectAction) +celeritas_polysource(optical/action/LocateVacanciesAction) celeritas_polysource(optical/action/PreStepAction) celeritas_polysource(optical/action/StepDiagnostic) celeritas_polysource(optical/action/TrackingCutAction) celeritas_polysource(optical/action/detail/TrackInitAlgorithms) celeritas_polysource(optical/gen/GeneratorAction) +celeritas_polysource(optical/gen/GeneratorBase) celeritas_polysource(optical/gen/OffloadAction) celeritas_polysource(optical/gen/OffloadGatherAction) celeritas_polysource(optical/gen/PrimaryGeneratorAction) diff --git a/src/celeritas/global/CoreTrackView.hh b/src/celeritas/global/CoreTrackView.hh index 3e88faede8..a420359cdf 100644 --- a/src/celeritas/global/CoreTrackView.hh +++ b/src/celeritas/global/CoreTrackView.hh @@ -120,8 +120,8 @@ class CoreTrackView inline CELER_FUNCTION CoreStateCounters const& counters() const; private: - StateRef const& states_; ParamsRef const& params_; + StateRef const& states_; ThreadId const thread_id_; TrackSlotId track_slot_id_; }; @@ -136,7 +136,7 @@ CELER_FUNCTION CoreTrackView::CoreTrackView(ParamsRef const& params, StateRef const& states, ThreadId thread) - : states_(states), params_(params), thread_id_(thread) + : params_(params), states_(states), thread_id_(thread) { CELER_EXPECT(states_.track_slots.empty() || thread_id_ < states_.track_slots.size()); @@ -157,7 +157,7 @@ CELER_FUNCTION CoreTrackView::CoreTrackView(ParamsRef const& params, StateRef const& states, TrackSlotId track_slot) - : states_(states), params_(params), track_slot_id_(track_slot) + : params_(params), states_(states), track_slot_id_(track_slot) { CELER_EXPECT(track_slot_id_ < states_.size()); } diff --git a/src/celeritas/global/Stepper.cc b/src/celeritas/global/Stepper.cc index e5b07e848a..f49fd6f565 100644 --- a/src/celeritas/global/Stepper.cc +++ b/src/celeritas/global/Stepper.cc @@ -14,13 +14,16 @@ #include "corecel/sys/ScopedProfiling.hh" #include "orange/OrangeData.hh" #include "celeritas/Types.hh" +#include "celeritas/global/TrackExecutor.hh" #include "celeritas/random/RngReseed.hh" #include "celeritas/track/ExtendFromPrimariesAction.hh" #include "celeritas/track/TrackInitParams.hh" +#include "ActionLauncher.hh" #include "CoreParams.hh" #include "detail/KillActive.hh" +#include "detail/SetGeneratedExecutor.hh" namespace celeritas { @@ -127,13 +130,10 @@ template auto Stepper::operator()() -> result_type { ScopedProfiling profile_this{"step"}; - auto counters = state_->sync_get_counters(); - counters.num_generated = 0; - counters.num_cut = 0; - counters.num_errored = 0; - state_->sync_put_counters(counters); + // Initialize the num_generated counter to zero + this->set_generated(); actions_->step(*params_, *state_); - counters = state_->sync_get_counters(); + auto counters = state_->sync_get_counters(); // Get the number of track initializers and active tracks result_type result; @@ -169,9 +169,6 @@ auto Stepper::operator()(SpanConstPrimary primaries) -> result_type << "event number " << max_id->event_id.unchecked_get() << " exceeds max_events=" << params_->init()->max_events()); - auto counters = state_->sync_get_counters(); - counters.num_pending = primaries.size(); - state_->sync_put_counters(counters); primaries_action_->insert(*params_, *state_, primaries); return (*this)(); @@ -213,6 +210,31 @@ void Stepper::reseed(UniqueEventId event_id) params_->init()->reset_track_ids(state_->stream_id(), &state_->ref().init); } +//---------------------------------------------------------------------------// +/*! + * Set the num_pending counter to the number of generated primaries. + */ +template<> +void Stepper::set_generated() +{ + auto execute_thread + = make_single_track_executor(params_->ptr(), + state_->ptr(), + detail::SetGeneratedExecutor{}); + launch_core(1, "set-generated", *params_, *state_, execute_thread); +} + +//---------------------------------------------------------------------------// +// DEVICE-DISABLED IMPLEMENTATION +//---------------------------------------------------------------------------// +#if !CELER_USE_DEVICE +template<> +void Stepper::set_generated() +{ + CELER_NOT_CONFIGURED("CUDA OR HIP"); +} +#endif + //---------------------------------------------------------------------------// // EXPLICIT INSTANTIATION //---------------------------------------------------------------------------// diff --git a/src/celeritas/global/Stepper.cu b/src/celeritas/global/Stepper.cu new file mode 100644 index 0000000000..75fbe3d50c --- /dev/null +++ b/src/celeritas/global/Stepper.cu @@ -0,0 +1,38 @@ +//------------------------------ -*- cuda -*- -------------------------------// +// Copyright Celeritas contributors: see top-level COPYRIGHT file for details +// SPDX-License-Identifier: (Apache-2.0 OR MIT) +//---------------------------------------------------------------------------// +//! \file celeritas/global/Stepper.cu +//---------------------------------------------------------------------------// +#include "Stepper.hh" + +#include "corecel/Assert.hh" +#include "corecel/Types.hh" +#include "corecel/sys/KernelLauncher.device.hh" +#include "celeritas/global/TrackExecutor.hh" + +#include "CoreParams.hh" +#include "CoreState.hh" + +#include "detail/SetGeneratedExecutor.hh" + +namespace celeritas +{ +//---------------------------------------------------------------------------// +/*! + * Set the num_pending counter to the number of generated primaries. + */ +template<> +void Stepper::set_generated() +{ + auto execute_thread + = make_single_track_executor(params_->ptr(), + state_->ptr(), + detail::SetGeneratedExecutor{}); + static KernelLauncher const launch_kernel( + "set-generated"); + launch_kernel(1, state_->stream_id(), execute_thread); +} + +//---------------------------------------------------------------------------// +} // namespace celeritas diff --git a/src/celeritas/global/Stepper.hh b/src/celeritas/global/Stepper.hh index c7a9a00bf3..fe6fbf10c0 100644 --- a/src/celeritas/global/Stepper.hh +++ b/src/celeritas/global/Stepper.hh @@ -148,6 +148,8 @@ class Stepper final : public StepperInterface //!@{ //! \name Type aliases using StateRef = CoreStateData; + using CoreStateHost = CoreState; + using CoreStateDevice = CoreState; //!@} public: @@ -184,6 +186,9 @@ class Stepper final : public StepperInterface //! Reset the core state counters and data so it can be reused void reset_state() { state_->reset(); } + //! Reset the num_generated state counter to zero + void set_generated(); + //! Get a shared pointer to the state (TEMPORARY, DO NOT USE) SPState sp_state() final { return state_; } @@ -199,11 +204,9 @@ class Stepper final : public StepperInterface }; //---------------------------------------------------------------------------// -// EXPLICIT INSTANTIATION +// EXPLICIT INSTANTIATION removed but retained in Stepper.cc so that the +// set_generated() member function can be specialized based on MemSpace //---------------------------------------------------------------------------// -extern template class Stepper; -extern template class Stepper; - //---------------------------------------------------------------------------// } // namespace celeritas diff --git a/src/celeritas/global/TrackExecutor.hh b/src/celeritas/global/TrackExecutor.hh index 8ee00589a7..2935062b23 100644 --- a/src/celeritas/global/TrackExecutor.hh +++ b/src/celeritas/global/TrackExecutor.hh @@ -28,7 +28,7 @@ namespace celeritas * the tracks are sorted. Otherwise, thread and track slot have the same * numerical value. * - * This is primarily used by \c ActionLauncher . + * This is used primarily by \c ActionLauncher . * * \code void foo_kernel(CoreParamsPtr const params, @@ -157,7 +157,7 @@ CELER_FUNCTION ConditionalTrackExecutor(CoreParamsPtr, // FREE FUNCTIONS //---------------------------------------------------------------------------// /*! - * Return a track executor that only applies to active, non-errored tracks. + * Return a track executor that applies to only active, non-errored tracks. */ template inline CELER_FUNCTION decltype(auto) @@ -169,6 +169,21 @@ make_active_track_executor(CoreParamsPtr params, params, state, AppliesValid{}, celeritas::forward(apply_track)}; } +//---------------------------------------------------------------------------// +/*! + * Return a track executor that applies to only one track. This is used + * primarily when updating state counters, as these need only one thread. + */ +template +inline CELER_FUNCTION decltype(auto) +make_single_track_executor(CoreParamsPtr params, + CoreStatePtr const& state, + T&& apply_track) +{ + return ConditionalTrackExecutor{ + params, state, IsThreadZero{}, celeritas::forward(apply_track)}; +} + //---------------------------------------------------------------------------// /*! * Return a track executor that only applies if the action ID matches. diff --git a/src/celeritas/global/detail/SetGeneratedExecutor.hh b/src/celeritas/global/detail/SetGeneratedExecutor.hh new file mode 100644 index 0000000000..0f1fb2668e --- /dev/null +++ b/src/celeritas/global/detail/SetGeneratedExecutor.hh @@ -0,0 +1,54 @@ +//------------------------------- -*- C++ -*- -------------------------------// +// Copyright Celeritas contributors: see top-level COPYRIGHT file for details +// SPDX-License-Identifier: (Apache-2.0 OR MIT) +//---------------------------------------------------------------------------// +//! \file celeritas/global/detail/SetGeneratedExecutor.hh +//---------------------------------------------------------------------------// +#pragma once + +#include "corecel/Macros.hh" +#include "corecel/sys/ThreadId.hh" +#include "celeritas/Types.hh" +#include "celeritas/global/CoreTrackView.hh" + +#include "../CoreState.hh" + +namespace celeritas +{ +//---------------------------------------------------------------------------// +class CoreParams; +template +class CoreState; + +namespace detail +{ +//---------------------------------------------------------------------------// +// LAUNCHER +//---------------------------------------------------------------------------// +/*! + * Initialize the num_generated counter to zero. + */ +struct SetGeneratedExecutor +{ + //// FUNCTIONS //// + + // Initialize the num_generated counter to zero + CELER_FORCEINLINE_FUNCTION void operator()(CoreTrackView& track); +}; + +//---------------------------------------------------------------------------// +// INLINE DEFINITIONS +//---------------------------------------------------------------------------// +/*! + * Initialize the num_generated counter to zero. + */ +CELER_FORCEINLINE_FUNCTION void +SetGeneratedExecutor::operator()(CoreTrackView& track) +{ + CELER_EXPECT(track.thread_id() == ThreadId{0}); // single thread kernel + track.counters().num_generated = 0; +} + +//---------------------------------------------------------------------------// +} // namespace detail +} // namespace celeritas diff --git a/src/celeritas/optical/CoreState.hh b/src/celeritas/optical/CoreState.hh index 66435df662..cc29ece801 100644 --- a/src/celeritas/optical/CoreState.hh +++ b/src/celeritas/optical/CoreState.hh @@ -9,6 +9,7 @@ #include "corecel/cont/Span.hh" #include "corecel/data/AuxInterface.hh" #include "corecel/data/AuxStateVec.hh" +#include "corecel/data/DeviceVector.hh" #include "corecel/data/ObserverPtr.hh" #include "corecel/data/StateDataStore.hh" #include "corecel/random/params/RngParamsFwd.hh" diff --git a/src/celeritas/optical/CoreTrackView.hh b/src/celeritas/optical/CoreTrackView.hh index 431e67aa15..497de23d4f 100644 --- a/src/celeritas/optical/CoreTrackView.hh +++ b/src/celeritas/optical/CoreTrackView.hh @@ -8,6 +8,7 @@ #include "corecel/math/Atomics.hh" #include "corecel/random/engine/RngEngine.hh" +#include "corecel/sys/ThreadId.hh" #include "geocel/AllVolumesView.hh" #include "geocel/DetectorView.hh" #include "geocel/VolumeSurfaceView.hh" @@ -43,6 +44,11 @@ class CoreTrackView //!@} public: + // Construct with comprehensive param/state data and thread + inline CELER_FUNCTION CoreTrackView(ParamsRef const& params, + StateRef const& states, + ThreadId thread); + // Construct directly from a track slot ID inline CELER_FUNCTION CoreTrackView(ParamsRef const& params, StateRef const& states, @@ -90,6 +96,9 @@ class CoreTrackView // Return an RNG engine inline CELER_FUNCTION RngEngine rng() const; + // Get the index of the current thread in the current kernel + inline CELER_FUNCTION ThreadId thread_id() const; + // Get the track's index among the states inline CELER_FUNCTION TrackSlotId track_slot_id() const; @@ -106,12 +115,31 @@ class CoreTrackView private: ParamsRef const& params_; StateRef const& states_; + ThreadId const thread_id_; TrackSlotId const track_slot_id_; }; //---------------------------------------------------------------------------// // INLINE DEFINITIONS //---------------------------------------------------------------------------// +/*! + * Construct with comprehensive param/state data and thread. + * + * For optical tracks, the value of the track slot is the same as the thread + * ID. + */ +CELER_FUNCTION +CoreTrackView::CoreTrackView(ParamsRef const& params, + StateRef const& states, + ThreadId thread) + : params_(params) + , states_(states) + , thread_id_(thread) + , track_slot_id_(TrackSlotId{thread.get()}) +{ + CELER_EXPECT(track_slot_id_ < states_.size()); +} + /*! * Construct with comprehensive param/state data and track slot. * @@ -298,6 +326,24 @@ CELER_FUNCTION SimTrackView CoreTrackView::sim() const return SimTrackView{params_.sim, states_.sim, this->track_slot_id()}; } +//---------------------------------------------------------------------------// +/*! + * Get the index of the current thread in the current kernel. + * + * \warning If the kernel calling this function is not applied to \em all + * tracks, then comparing against a particular thread ID (e.g. zero for a + * once-per-kernel initialization) may result in an error. + * + * \pre The thread ID is only set if the class is initialized with the thread + * ID (e.g. from \c TrackExecutor ), which is not the case in track + * initialization (where the "core track" is constructed from a vacancy). + */ +CELER_FORCEINLINE_FUNCTION ThreadId CoreTrackView::thread_id() const +{ + CELER_EXPECT(thread_id_); + return thread_id_; +} + //---------------------------------------------------------------------------// /*! * Get the track's index among the states. diff --git a/src/celeritas/optical/Runner.cc b/src/celeritas/optical/Runner.cc index ae76c187b4..8b1b280f4f 100644 --- a/src/celeritas/optical/Runner.cc +++ b/src/celeritas/optical/Runner.cc @@ -12,12 +12,15 @@ #include "corecel/io/OutputRegistry.hh" #include "corecel/sys/ScopedProfiling.hh" #include "celeritas/inp/StandaloneInputIO.json.hh" +#include "celeritas/optical/TrackExecutor.hh" +#include "celeritas/optical/action/ActionLauncher.hh" #include "celeritas/phys/GeneratorRegistry.hh" #include "celeritas/setup/Problem.hh" #include "CoreParams.hh" #include "CoreState.hh" #include "Transporter.hh" +#include "gen/detail/UpdatePendingExecutor.hh" namespace celeritas { @@ -126,12 +129,21 @@ auto Runner::operator()(SpanConstGenDist data) -> Result * for some run modes, e.g. offloading distributions through accel where we * already know the number of pending tracks. */ - auto counters = state_->sync_get_counters(); + size_type total_pending(0); for (auto const& d : data) { - counters.num_pending += d.num_photons; + total_pending += d.num_photons; + } + if (celeritas::device()) + { + auto* s = dynamic_cast*>(&*state_); + this->update_pending(*s, total_pending); + } + else + { + auto* s = dynamic_cast*>(&*state_); + this->update_pending(*s, total_pending); } - state_->sync_put_counters(counters); return this->run(); } @@ -160,6 +172,29 @@ auto Runner::run() const -> Result return result; } +//---------------------------------------------------------------------------// +/*! + * Launch a (host) kernel to update the number of pending optical photons. + */ +void Runner::update_pending(CoreState& state, + size_type num_pending) const +{ + // Update the number of pending optical photons + auto execute_thread = make_single_track_executor( + this->params()->ptr(), + state.ptr(), + detail::UpdatePendingExecutor{num_pending}); + launch_action(1, execute_thread); +} + +//---------------------------------------------------------------------------// +#if !CELER_USE_DEVICE +void Runner::update_pending(CoreState&, size_type) const +{ + CELER_NOT_CONFIGURED("CUDA OR HIP"); +} +#endif + //---------------------------------------------------------------------------// } // namespace optical } // namespace celeritas diff --git a/src/celeritas/optical/Runner.cu b/src/celeritas/optical/Runner.cu new file mode 100644 index 0000000000..4b9168143e --- /dev/null +++ b/src/celeritas/optical/Runner.cu @@ -0,0 +1,40 @@ +//------------------------------ -*- cuda -*- -------------------------------// +// Copyright Celeritas contributors: see top-level COPYRIGHT file for details +// SPDX-License-Identifier: (Apache-2.0 OR MIT) +//---------------------------------------------------------------------------// +//! \file celeritas/optical/Runner.cu +//---------------------------------------------------------------------------// +#include "Runner.hh" + +#include "corecel/Assert.hh" + +#include "CoreParams.hh" +#include "CoreState.hh" +#include "TrackExecutor.hh" +#include "action/ActionLauncher.device.hh" +#include "gen/detail/UpdatePendingExecutor.hh" + +namespace celeritas +{ +namespace optical +{ +//---------------------------------------------------------------------------// +/*! + * Launch a (device) kernel to update the number of pending optical photons. + */ +void Runner::update_pending(CoreState& state, + size_type num_pending) const +{ + // Update the number of pending optical photons + auto execute_thread = make_single_track_executor( + this->params()->ptr(), + state.ptr(), + detail::UpdatePendingExecutor{num_pending}); + static KernelLauncher const launch_kernel( + "update-pending"); + launch_kernel(1, state.stream_id(), execute_thread); +} + +//---------------------------------------------------------------------------// +} // namespace optical +} // namespace celeritas diff --git a/src/celeritas/optical/Runner.hh b/src/celeritas/optical/Runner.hh index 1cf18fbd78..89b9436aa7 100644 --- a/src/celeritas/optical/Runner.hh +++ b/src/celeritas/optical/Runner.hh @@ -83,6 +83,10 @@ class Runner //// HELPER FUNCTIONS //// Result run() const; + + // Update the num_pending state counter + void update_pending(CoreState&, size_type) const; + void update_pending(CoreState&, size_type) const; }; //---------------------------------------------------------------------------// diff --git a/src/celeritas/optical/TrackExecutor.hh b/src/celeritas/optical/TrackExecutor.hh new file mode 100644 index 0000000000..3adf5322fc --- /dev/null +++ b/src/celeritas/optical/TrackExecutor.hh @@ -0,0 +1,231 @@ +//------------------------------- -*- C++ -*- -------------------------------// +// Copyright Celeritas contributors: see top-level COPYRIGHT file for details +// SPDX-License-Identifier: (Apache-2.0 OR MIT) +//---------------------------------------------------------------------------// +//! \file celeritas/optical/TrackExecutor.hh +//---------------------------------------------------------------------------// +#pragma once + +#include "corecel/Assert.hh" +#include "corecel/Types.hh" +#include "corecel/math/Algorithms.hh" +#include "corecel/sys/ThreadId.hh" +#include "celeritas/track/TrackFunctors.hh" + +#include "CoreTrackData.hh" +#include "CoreTrackDataFwd.hh" +#include "CoreTrackView.hh" + +namespace celeritas +{ +namespace optical +{ +//---------------------------------------------------------------------------// +/*! + * Call an optical \c CoreTrackView executor for a given ThreadId. + * + * This class can be used to call a functor that applies to \c CoreTrackView + * using a \c ThreadId, so that the tracks can be easily looped over as a + * group on CPU or GPU. It applies a remapping from \em thread to \em slot if + * the tracks are sorted. Otherwise, thread and track slot have the same + * numerical value. + * + * This is used primarily by \c ActionLauncher . + * + * \code +void foo_kernel(CoreParamsPtr const params, + CoreStatePtr const state) +{ + TrackExecutor execute{params, state, MyTrackApplier{}}; + + for (auto tid : range(ThreadID{123})) + { + step(tid); + } +} +\endcode + * + * \todo Rename to ThreadExecutor. The template parameter, which must operate + * on a core track view, is a track executor. + */ +template +class TrackExecutor +{ + public: + //!@{ + //! \name Type aliases + using ParamsPtr = CoreParamsPtr; + using StatePtr = CoreStatePtr; + using Applier = T; + //!@} + + public: + //! Construct with core data and executor + CELER_FUNCTION + TrackExecutor(ParamsPtr params, StatePtr state, T&& execute_track) + : params_{params} + , state_{state} + , execute_track_{celeritas::forward(execute_track)} + { + } + + //! Call the underlying function, using indirection array if needed + CELER_FUNCTION void operator()(ThreadId thread) + { + CELER_EXPECT(thread < state_->size()); + CoreTrackView track(*params_, *state_, thread); + return execute_track_(track); + } + + private: + ParamsPtr const params_; + StatePtr const state_; + T execute_track_; +}; + +//---------------------------------------------------------------------------// +/*! + * Launch the track only when a certain condition applies to the sim state. + * + * The condition \c C must have the signature \code + * (SimTrackView const&) -> bool + \endcode + * + * see \c make_active_track_executor for an example where this is used to apply + * only to active (or killed) tracks. + */ +template +class ConditionalTrackExecutor +{ + public: + //!@{ + //! \name Type aliases + using ParamsPtr = CoreParamsPtr; + using StatePtr = CoreStatePtr; + using Applier = T; + //!@} + + public: + //! Construct with condition and operator + CELER_FUNCTION + ConditionalTrackExecutor(ParamsPtr params, + StatePtr state, + C&& applies, + T&& execute_track) + : params_{params} + , state_{state} + , applies_{celeritas::forward(applies)} + , execute_track_{celeritas::forward(execute_track)} + { + } + + //! Launch the given thread if the track meets the condition + CELER_FUNCTION void operator()(ThreadId thread) + { + CELER_EXPECT(thread < state_->size()); + CoreTrackView track(*params_, *state_, thread); + if (!applies_(track)) + { + return; + } + + // NOTE: "return value type" error means the executor function is + // incorrectly returning a value + return execute_track_(track); + } + + private: + ParamsPtr const params_; + StatePtr const state_; + C applies_; + T execute_track_; +}; + +//---------------------------------------------------------------------------// +// DEDUCTION GUIDES +//---------------------------------------------------------------------------// +template +CELER_FUNCTION TrackExecutor(CoreParamsPtr, + CoreStatePtr, + T&&) -> TrackExecutor; + +template +CELER_FUNCTION ConditionalTrackExecutor(CoreParamsPtr, + CoreStatePtr, + C&&, + T&&) -> ConditionalTrackExecutor; + +//---------------------------------------------------------------------------// +// FREE FUNCTIONS +//---------------------------------------------------------------------------// +/*! + * Return a track executor that applies to only active, non-errored tracks. + */ +template +inline CELER_FUNCTION decltype(auto) +make_active_track_executor(CoreParamsPtr params, + CoreStatePtr const& state, + T&& apply_track) +{ + return ConditionalTrackExecutor{ + params, state, AppliesValid{}, celeritas::forward(apply_track)}; +} + +//---------------------------------------------------------------------------// +/*! + * Return a track executor that applies to only one track. This is used + * primarily when updating state counters, as these need only one thread. + */ +template +inline CELER_FUNCTION decltype(auto) +make_single_track_executor(CoreParamsPtr params, + CoreStatePtr const& state, + T&& apply_track) +{ + return ConditionalTrackExecutor{ + params, state, IsThreadZero{}, celeritas::forward(apply_track)}; +} + +//---------------------------------------------------------------------------// +/*! + * Return a track executor that only applies if the action ID matches. + * + * \note This should generally only be used for post-step actions and other + * cases where the IDs *explicitly* are set. Many explicit actions apply to all + * threads, active or not. + */ +template +inline CELER_FUNCTION decltype(auto) +make_action_track_executor(CoreParamsPtr params, + CoreStatePtr state, + ActionId action, + T&& apply_track) +{ + CELER_EXPECT(action); + return ConditionalTrackExecutor{params, + state, + IsStepActionEqual{action}, + celeritas::forward(apply_track)}; +} + +//---------------------------------------------------------------------------// +/*! + * Return a track executor that only applies for the given along-step action. + */ +template +inline CELER_FUNCTION decltype(auto) +make_along_step_track_executor(CoreParamsPtr params, + CoreStatePtr state, + ActionId action, + T&& apply_track) +{ + CELER_EXPECT(action); + return ConditionalTrackExecutor{params, + state, + IsAlongStepActionEqual{action}, + celeritas::forward(apply_track)}; +} + +//---------------------------------------------------------------------------// +} // namespace optical +} // namespace celeritas diff --git a/src/celeritas/optical/TrackInitData.hh b/src/celeritas/optical/TrackInitData.hh index e64b620797..da09f1fa50 100644 --- a/src/celeritas/optical/TrackInitData.hh +++ b/src/celeritas/optical/TrackInitData.hh @@ -29,7 +29,7 @@ namespace optical * - \c initializers stores the data for track initializers and secondaries * waiting to be turned into new tracks and can be any size up to \c * capacity. - * - \c vacancies stores the \c TrackSlotid of the tracks that have been + * - \c vacancies stores the \c TrackSlotId of the tracks that have been * killed; the size will be <= the number of track states. * - \c counters stores the number of tracks with a given status and is updated * during each step of the simulation of an event. diff --git a/src/celeritas/optical/action/LocateVacanciesAction.cc b/src/celeritas/optical/action/LocateVacanciesAction.cc index 20fa9020e8..118a6b7a89 100644 --- a/src/celeritas/optical/action/LocateVacanciesAction.cc +++ b/src/celeritas/optical/action/LocateVacanciesAction.cc @@ -10,8 +10,12 @@ #include "corecel/Macros.hh" #include "celeritas/optical/CoreParams.hh" #include "celeritas/optical/CoreState.hh" +#include "celeritas/optical/TrackExecutor.hh" + +#include "ActionLauncher.hh" #include "detail/TrackInitAlgorithms.hh" +#include "detail/UpdateAliveExecutor.hh" namespace celeritas { @@ -30,37 +34,64 @@ LocateVacanciesAction::LocateVacanciesAction(ActionId aid) /*! * Execute the action with host data. */ -void LocateVacanciesAction::step(CoreParams const&, CoreStateHost& state) const +void LocateVacanciesAction::step(CoreParams const& params, + CoreStateHost& state) const { - return this->step_impl(state); + this->step_impl(state); + return this->update_alive(params, state, state.size()); } //---------------------------------------------------------------------------// /*! * Execute the action with device data. */ -void LocateVacanciesAction::step(CoreParams const&, CoreStateDevice& state) const +void LocateVacanciesAction::step(CoreParams const& params, + CoreStateDevice& state) const { - return this->step_impl(state); + this->step_impl(state); + return this->update_alive(params, state, state.size()); } //---------------------------------------------------------------------------// /*! - * Initialize optical track states. + * Compact the IDs of the inactive slots to find the vacancies and update the + * number of alive slots accordingly. */ template void LocateVacanciesAction::step_impl(CoreState& state) const { - auto counters = state.sync_get_counters(); - // Compact the IDs of the inactive tracks, getting the sorted indices of // the empty slots - counters.num_vacancies = detail::copy_if_vacant( - state.ref().sim.status, state.ref().init.vacancies, state.stream_id()); + detail::copy_if_vacant( + state.ref().sim.status, state.ref().init, state.stream_id()); +} + +//---------------------------------------------------------------------------// +/*! + * Update the number of alive slots as the empty slots have been compacted. + */ +void LocateVacanciesAction::update_alive(CoreParams const& params, + CoreStateHost& state, + size_type state_size) const +{ + auto execute_thread + = make_single_track_executor(params.ptr(), + state.ptr(), + detail::UpdateAliveExecutor{state_size}); + launch_action(1, execute_thread); +} - counters.num_alive = state.size() - counters.num_vacancies; - state.sync_put_counters(counters); +//---------------------------------------------------------------------------// +// INLINE DEFINITIONS +//---------------------------------------------------------------------------// +#if !CELER_USE_DEVICE +inline void LocateVacanciesAction::update_alive(CoreParams const&, + CoreStateDevice&, + size_type) const +{ + CELER_NOT_CONFIGURED("CUDA or HIP"); } +#endif //---------------------------------------------------------------------------// } // namespace optical diff --git a/src/celeritas/optical/action/LocateVacanciesAction.cu b/src/celeritas/optical/action/LocateVacanciesAction.cu new file mode 100644 index 0000000000..79f63432b4 --- /dev/null +++ b/src/celeritas/optical/action/LocateVacanciesAction.cu @@ -0,0 +1,44 @@ +//------------------------------ -*- cuda -*- -------------------------------// +// Copyright Celeritas contributors: see top-level COPYRIGHT file for details +// SPDX-License-Identifier: (Apache-2.0 OR MIT) +//---------------------------------------------------------------------------// +//! \file celeritas/optical/action/LocateVacanciesAction.cu +//---------------------------------------------------------------------------// +#include "LocateVacanciesAction.hh" + +#include "corecel/Assert.hh" +#include "corecel/Macros.hh" +#include "corecel/Types.hh" +#include "corecel/sys/KernelLauncher.device.hh" + +#include "ActionLauncher.device.hh" +#include "../CoreParams.hh" +#include "../CoreState.hh" +#include "../TrackExecutor.hh" + +#include "detail/UpdateAliveExecutor.hh" + +namespace celeritas +{ +namespace optical +{ +//---------------------------------------------------------------------------// +/*! + * Update the number of active slots as the empty slots have been compacted. + */ +void LocateVacanciesAction::update_alive(CoreParams const& params, + CoreStateDevice& state, + size_type state_size) const +{ + auto execute_thread + = make_single_track_executor(params.ptr(), + state.ptr(), + detail::UpdateAliveExecutor{state_size}); + static KernelLauncher const launch_kernel( + "update-alive"); + launch_kernel(1, state.stream_id(), execute_thread); +} + +//---------------------------------------------------------------------------// +} // namespace optical +} // namespace celeritas diff --git a/src/celeritas/optical/action/LocateVacanciesAction.hh b/src/celeritas/optical/action/LocateVacanciesAction.hh index 3e66f8354e..e1bc108e4a 100644 --- a/src/celeritas/optical/action/LocateVacanciesAction.hh +++ b/src/celeritas/optical/action/LocateVacanciesAction.hh @@ -37,6 +37,8 @@ class LocateVacanciesAction final : public OpticalStepActionInterface, private: template void step_impl(CoreState&) const; + void update_alive(CoreParams const&, CoreStateHost&, size_type) const; + void update_alive(CoreParams const&, CoreStateDevice&, size_type) const; }; //---------------------------------------------------------------------------// diff --git a/src/celeritas/optical/action/detail/TrackInitAlgorithms.cc b/src/celeritas/optical/action/detail/TrackInitAlgorithms.cc index f61db0ef50..2cfc501f36 100644 --- a/src/celeritas/optical/action/detail/TrackInitAlgorithms.cc +++ b/src/celeritas/optical/action/detail/TrackInitAlgorithms.cc @@ -18,14 +18,14 @@ namespace detail * * \return Number of vacant track slots */ -size_type copy_if_vacant(TrackStatusRef const& status, - TrackSlotRef const& vacancies, - StreamId) +void copy_if_vacant(TrackStatusRef const& status, + TrackInitRef const& init, + StreamId) { - CELER_EXPECT(status.size() == vacancies.size()); + CELER_EXPECT(status.size() == init.vacancies.size()); auto* data = status.data().get(); - auto* result = vacancies.data().get(); + auto* result = init.vacancies.data().get(); size_type tid = 0; auto* const stop = data + status.size(); @@ -37,7 +37,10 @@ size_type copy_if_vacant(TrackStatusRef const& status, } ++tid; } - return result - vacancies.data().get(); + + auto counters = init.counters.data().get(); + counters->num_vacancies = result - init.vacancies.data().get(); + return; } //---------------------------------------------------------------------------// diff --git a/src/celeritas/optical/action/detail/TrackInitAlgorithms.cu b/src/celeritas/optical/action/detail/TrackInitAlgorithms.cu index 8e7ab434e5..f0d62ea4af 100644 --- a/src/celeritas/optical/action/detail/TrackInitAlgorithms.cu +++ b/src/celeritas/optical/action/detail/TrackInitAlgorithms.cu @@ -64,44 +64,49 @@ struct TransformType * * \return Number of vacant track slots */ -size_type copy_if_vacant(TrackStatusRef const& status, - TrackSlotRef const& vacancies, - StreamId stream_id) +void copy_if_vacant(TrackStatusRef const& status, + TrackInitRef const& init, + StreamId stream_id) { - CELER_EXPECT(status.size() == vacancies.size()); + CELER_EXPECT(status.size() == init.vacancies.size()); ScopedProfiling profile_this{"copy-if-vacant"}; -#ifdef CELER_USE_THRUST auto start = thrust::make_transform_iterator( thrust::make_counting_iterator(0), TransformType{}); - auto result = device_pointer_cast(vacancies.data()); + auto result = device_pointer_cast(init.vacancies.data()); + auto counters = device_pointer_cast(init.counters.data()); +#ifdef CELER_USE_THRUST auto end = thrust::copy_if(thrust_execute_on(stream_id), start, - start + vacancies.size(), + start + init.vacancies.size(), device_pointer_cast(status.data()), result, IsVacant{}); CELER_DEVICE_API_CALL(PeekAtLastError()); - return end - result; + // New size of the vacancy vector + auto host_counters + = ItemCopier{stream_id}(counters.get()); + host_counters.num_vacancies = end - result; + Copier copy{{counters.get(), 1}, + stream_id}; + copy(MemSpace::host, {&host_counters, 1}); + stream.sync(); + return; #else auto& stream = device().stream(stream_id); - DeviceVector num_vacancies{1, stream_id}; - auto start = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), TransformType{}); # if CELER_CUB_HAS_FLAGGEDIF // Calling with nullptr causes the function to return the amount of working - // space needed instead of invoking the kernel. + // space needed instead of invoking the kernel size_t temp_storage_bytes = 0; auto flags = device_pointer_cast(status.data()); - auto results = device_pointer_cast(vacancies.data()); cub::DeviceSelect::FlaggedIf(nullptr, temp_storage_bytes, start, flags, - results, - num_vacancies.data(), - vacancies.size(), + result, + &(counters->num_vacancies), + init.vacancies.size(), IsVacant{}, stream.get()); // Allocate temporary storage @@ -110,9 +115,9 @@ size_type copy_if_vacant(TrackStatusRef const& status, temp_storage_bytes, start, flags, - results, - num_vacancies.data(), - vacancies.size(), + result, + &(counters->num_vacancies), + init.vacancies.size(), IsVacant{}, stream.get()); # else @@ -133,16 +138,15 @@ size_type copy_if_vacant(TrackStatusRef const& status, IsVacant{}); # endif // Calling with nullptr causes the function to return the amount of working - // space needed instead of invoking the kernel. + // space needed instead of invoking the kernel size_t temp_storage_bytes = 0; - auto results = device_pointer_cast(vacancies.data()); auto cub_error_code = cub::DeviceSelect::Flagged(nullptr, temp_storage_bytes, start, flags.data(), - results, - num_vacancies.data(), - vacancies.size(), + result, + &(counters->num_vacancies), + init.vacancies.size(), stream.get()); CELER_DISCARD(cub_error_code); // Allocate temporary storage @@ -151,18 +155,14 @@ size_type copy_if_vacant(TrackStatusRef const& status, temp_storage_bytes, start, flags.data(), - results, - num_vacancies.data(), - vacancies.size(), + result, + &(counters->num_vacancies), + init.vacancies.size(), stream.get()); CELER_DISCARD(cub_error_code); # endif CELER_DEVICE_API_CALL(PeekAtLastError()); - - auto result = ItemCopier{stream_id}(num_vacancies.data()); - - stream.sync(); - return result; + return; #endif } diff --git a/src/celeritas/optical/action/detail/TrackInitAlgorithms.hh b/src/celeritas/optical/action/detail/TrackInitAlgorithms.hh index 86f9cb276c..c5dfb205c1 100644 --- a/src/celeritas/optical/action/detail/TrackInitAlgorithms.hh +++ b/src/celeritas/optical/action/detail/TrackInitAlgorithms.hh @@ -11,6 +11,7 @@ #include "corecel/Types.hh" #include "corecel/data/Collection.hh" #include "celeritas/Types.hh" +#include "celeritas/optical/TrackInitData.hh" namespace celeritas { @@ -20,7 +21,7 @@ namespace detail { //---------------------------------------------------------------------------// template -using TrackSlotRef = StateCollection; +using TrackInitRef = TrackInitStateData; template using TrackStatusRef = StateCollection; @@ -36,20 +37,20 @@ struct IsVacant //---------------------------------------------------------------------------// // Compact the \c TrackSlotIds of the inactive tracks -size_type copy_if_vacant(TrackStatusRef const&, - TrackSlotRef const&, - StreamId); -size_type copy_if_vacant(TrackStatusRef const&, - TrackSlotRef const&, - StreamId); +void copy_if_vacant(TrackStatusRef const&, + TrackInitRef const&, + StreamId); +void copy_if_vacant(TrackStatusRef const&, + TrackInitRef const&, + StreamId); //---------------------------------------------------------------------------// // INLINE DEFINITIONS //---------------------------------------------------------------------------// #if !CELER_USE_DEVICE -inline size_type copy_if_vacant(TrackStatusRef const&, - TrackSlotRef const&, - StreamId) +inline void copy_if_vacant(TrackStatusRef const&, + TrackInitRef const&, + StreamId) { CELER_NOT_CONFIGURED("CUDA or HIP"); } diff --git a/src/celeritas/optical/action/detail/UpdateAliveExecutor.hh b/src/celeritas/optical/action/detail/UpdateAliveExecutor.hh new file mode 100644 index 0000000000..52fec0cdb1 --- /dev/null +++ b/src/celeritas/optical/action/detail/UpdateAliveExecutor.hh @@ -0,0 +1,57 @@ +//------------------------------- -*- C++ -*- -------------------------------// +// Copyright Celeritas contributors: see top-level COPYRIGHT file for details +// SPDX-License-Identifier: (Apache-2.0 OR MIT) +//---------------------------------------------------------------------------// +//! \file optical/action/detail/UpdateAliveExecutor.hh +//---------------------------------------------------------------------------// +#pragma once + +#include "corecel/Macros.hh" +#include "corecel/Types.hh" +#include "corecel/sys/ThreadId.hh" +#include "celeritas/optical/CoreTrackView.hh" + +namespace celeritas +{ +namespace optical +{ +namespace detail +{ +//---------------------------------------------------------------------------// +// LAUNCHER +//---------------------------------------------------------------------------// +/*! + * Update the num_alive counter based on the number of photons that are still + * alive after compacting vacancies. + */ +struct UpdateAliveExecutor +{ + //// DATA //// + + size_type state_size; + + //// FUNCTIONS //// + + // Update number of photons that are still alive + CELER_FORCEINLINE_FUNCTION void operator()(CoreTrackView& track); +}; + +//---------------------------------------------------------------------------// +// INLINE DEFINITIONS +//---------------------------------------------------------------------------// +/*! + * Update number of photons that are still alive after compacting vacancies. + */ +CELER_FORCEINLINE_FUNCTION void +UpdateAliveExecutor::operator()(CoreTrackView& track) +{ + CELER_EXPECT(track.thread_id() == ThreadId{0}); // single thread kernel + + track.counters().num_alive = state_size - track.counters().num_vacancies; + CELER_ASSERT(state_size >= track.counters().num_vacancies); +} + +//---------------------------------------------------------------------------// +} // namespace detail +} // namespace optical +} // namespace celeritas diff --git a/src/celeritas/optical/gen/DirectGeneratorAction.cc b/src/celeritas/optical/gen/DirectGeneratorAction.cc index fff4e30574..047b0a3713 100644 --- a/src/celeritas/optical/gen/DirectGeneratorAction.cc +++ b/src/celeritas/optical/gen/DirectGeneratorAction.cc @@ -25,6 +25,8 @@ namespace celeritas { namespace optical { +CoreParams* DirectGeneratorAction::params_ = nullptr; // Set in + // make_and_insert() namespace { //---------------------------------------------------------------------------// @@ -49,7 +51,7 @@ auto make_state(StreamId stream, size_type size) * Construct and add to core params. */ std::shared_ptr -DirectGeneratorAction::make_and_insert(CoreParams const& params) +DirectGeneratorAction::make_and_insert(CoreParams& params) { ActionRegistry& actions = *params.action_reg(); AuxParamsRegistry& aux = *params.aux_reg(); @@ -59,6 +61,7 @@ DirectGeneratorAction::make_and_insert(CoreParams const& params) actions.insert(result); aux.insert(result); gen.insert(result); + params_ = ¶ms; return result; } @@ -144,9 +147,7 @@ void DirectGeneratorAction::insert_impl(CoreState& state, // Update counters and copy distributions to aux state storage aux_state.counters.buffer_size = data.size(); aux_state.counters.num_pending = data.size(); - auto counters = state.sync_get_counters(); - counters.num_pending += data.size(); - state.sync_put_counters(counters); + this->update_pending(*params_, state, data.size()); Copier copy_to_aux{aux_state.initializers(), state.stream_id()}; @@ -186,9 +187,10 @@ void DirectGeneratorAction::step_impl(CoreParams const& params, = get>(*state.aux(), this->aux_id()); auto& counters = aux_state.counters; - if (state.sync_get_counters().num_vacancies > 0 && counters.num_pending > 0) + if (counters.num_pending > 0) { - // Generate the optical photons from the distribution data + // Generate the optical photons from the distribution data. To avoid + // synchronization, we defer the check for vacancies. this->generate(params, state); } @@ -215,20 +217,18 @@ void DirectGeneratorAction::generate(CoreParams const& params, auto& aux_state = get>( *state.aux(), this->aux_id()); - size_type num_gen = min(state.sync_get_counters().num_vacancies, - aux_state.counters.num_pending); // Generate optical photons in vacant track slots detail::DirectGeneratorExecutor execute{ params.ptr(), state.ptr(), aux_state.store.ref()}; - launch_action(num_gen, execute); + launch_action(aux_state.counters.num_pending, execute); } //---------------------------------------------------------------------------// #if !CELER_USE_DEVICE void DirectGeneratorAction::generate(CoreParams const&, CoreStateDevice&) const { - CELER_NOT_IMPLEMENTED("device"); + CELER_NOT_CONFIGURED("CUDA OR HIP"); } #endif diff --git a/src/celeritas/optical/gen/DirectGeneratorAction.cu b/src/celeritas/optical/gen/DirectGeneratorAction.cu index 5f46af8a0f..5c91cb3846 100644 --- a/src/celeritas/optical/gen/DirectGeneratorAction.cu +++ b/src/celeritas/optical/gen/DirectGeneratorAction.cu @@ -29,14 +29,14 @@ void DirectGeneratorAction::generate(CoreParams const& params, auto& aux_state = get>( *state.aux(), this->aux_id()); - size_type num_gen = min(state.sync_get_counters().num_vacancies, - aux_state.counters.num_pending); + // Generate optical photons in vacant track slots detail::DirectGeneratorExecutor execute{ params.ptr(), state.ptr(), aux_state.store.ref()}; static ActionLauncher const launch(*this); - launch(num_gen, state.stream_id(), execute); + launch(aux_state.counters.num_pending, state.stream_id(), execute); } + //---------------------------------------------------------------------------// } // namespace optical } // namespace celeritas diff --git a/src/celeritas/optical/gen/DirectGeneratorAction.hh b/src/celeritas/optical/gen/DirectGeneratorAction.hh index 683ce84e1e..9db7d44e22 100644 --- a/src/celeritas/optical/gen/DirectGeneratorAction.hh +++ b/src/celeritas/optical/gen/DirectGeneratorAction.hh @@ -42,8 +42,7 @@ class DirectGeneratorAction final : public GeneratorBase public: // Construct and add to core params - static std::shared_ptr - make_and_insert(CoreParams const&); + static std::shared_ptr make_and_insert(CoreParams&); // Construct with action ID and data IDs DirectGeneratorAction(ActionId, AuxId, GeneratorId); @@ -61,6 +60,12 @@ class DirectGeneratorAction final : public GeneratorBase void step(CoreParams const&, CoreStateDevice&) const final; private: + //// DATA //// + + // Core params isn't passed to insert(), so save a pointer so + // update_pending() can be called later + static CoreParams* params_; + //// HELPER FUNCTIONS //// template diff --git a/src/celeritas/optical/gen/GeneratorAction.cc b/src/celeritas/optical/gen/GeneratorAction.cc index a7c244af50..0c3cc94aee 100644 --- a/src/celeritas/optical/gen/GeneratorAction.cc +++ b/src/celeritas/optical/gen/GeneratorAction.cc @@ -202,9 +202,10 @@ void GeneratorAction::step_impl(CoreParams const& params, state.stream_id()); } - if (state.sync_get_counters().num_vacancies > 0 && counters.num_pending > 0) + if (counters.num_pending > 0) { - // Generate the optical photons from the distribution data + // Generate the optical photons from the distribution data. To avoid + // synchronization, we defer the check for vacancies. this->generate(params, state); } @@ -232,8 +233,7 @@ void GeneratorAction::generate(CoreParams const& params, auto& aux_state = get>(*state.aux(), this->aux_id()); - size_type num_gen = min(state.sync_get_counters().num_vacancies, - aux_state.counters.num_pending); + size_type num_gen = aux_state.counters.num_pending; { // Generate optical photons in vacant track slots detail::GeneratorExecutor execute{params.ptr(), @@ -247,7 +247,8 @@ void GeneratorAction::generate(CoreParams const& params, { // Update the cumulative sum of the number of photons per distribution // according to how many were generated - detail::UpdateSumExecutor execute{aux_state.store.ref(), num_gen}; + detail::UpdateSumExecutor execute{ + state.ptr(), aux_state.store.ref(), num_gen}; launch_kernel(aux_state.counters.buffer_size, execute); } } diff --git a/src/celeritas/optical/gen/GeneratorAction.cu b/src/celeritas/optical/gen/GeneratorAction.cu index 697cc0746e..7d46cb03f4 100644 --- a/src/celeritas/optical/gen/GeneratorAction.cu +++ b/src/celeritas/optical/gen/GeneratorAction.cu @@ -38,8 +38,7 @@ void GeneratorAction::generate(CoreParams const& params, auto& aux_state = get>(*state.aux(), this->aux_id()); - size_type num_gen = min(state.sync_get_counters().num_vacancies, - aux_state.counters.num_pending); + size_type num_gen = aux_state.counters.num_pending; { // Generate optical photons in vacant track slots detail::GeneratorExecutor execute{params.ptr(), @@ -54,7 +53,8 @@ void GeneratorAction::generate(CoreParams const& params, { // Update the cumulative sum of the number of photons per distribution // according to how many were generated - detail::UpdateSumExecutor execute{aux_state.store.ref(), num_gen}; + detail::UpdateSumExecutor execute{ + state.ptr(), aux_state.store.ref(), num_gen}; static KernelLauncher const launch_kernel( "update-sum"); launch_kernel( diff --git a/src/celeritas/optical/gen/GeneratorBase.cc b/src/celeritas/optical/gen/GeneratorBase.cc index 8df3ed9fe7..3d6c900faf 100644 --- a/src/celeritas/optical/gen/GeneratorBase.cc +++ b/src/celeritas/optical/gen/GeneratorBase.cc @@ -8,6 +8,12 @@ #include "corecel/Assert.hh" #include "corecel/data/AuxStateVec.hh" +#include "celeritas/optical/CoreParams.hh" +#include "celeritas/optical/CoreState.hh" +#include "celeritas/optical/TrackExecutor.hh" +#include "celeritas/optical/action/ActionLauncher.hh" + +#include "detail/UpdatePendingExecutor.hh" namespace celeritas { @@ -47,6 +53,32 @@ GeneratorStateBase const& GeneratorBase::counters(AuxStateVec const& aux) const return dynamic_cast(aux.at(aux_id_)); } +//---------------------------------------------------------------------------// +/*! + * Launch a (host) kernel to update the number of pending optical photons. + */ +void GeneratorBase::update_pending(CoreParams const& params, + CoreStateHost& state, + size_type num_pending) const +{ + // Update the number of pending optical photons + auto execute_thread = make_single_track_executor( + params.ptr(), + state.ptr(), + detail::UpdatePendingExecutor{num_pending}); + launch_action(1, execute_thread); +} + +//---------------------------------------------------------------------------// +#if !CELER_USE_DEVICE +void GeneratorBase::update_pending(CoreParams const&, + CoreStateDevice&, + size_type) const +{ + CELER_NOT_CONFIGURED("CUDA OR HIP"); +} +#endif + //---------------------------------------------------------------------------// } // namespace optical } // namespace celeritas diff --git a/src/celeritas/optical/gen/GeneratorBase.cu b/src/celeritas/optical/gen/GeneratorBase.cu new file mode 100644 index 0000000000..f67a47cd18 --- /dev/null +++ b/src/celeritas/optical/gen/GeneratorBase.cu @@ -0,0 +1,40 @@ +//------------------------------ -*- cuda -*- -------------------------------// +// Copyright Celeritas contributors: see top-level COPYRIGHT file for details +// SPDX-License-Identifier: (Apache-2.0 OR MIT) +//---------------------------------------------------------------------------// +//! \file celeritas/optical/gen/GeneratorBase.cu +//---------------------------------------------------------------------------// +#include "GeneratorBase.hh" + +#include "corecel/Assert.hh" +#include "celeritas/optical/CoreState.hh" +#include "celeritas/optical/TrackExecutor.hh" +#include "celeritas/optical/action/ActionLauncher.device.hh" + +#include "detail/UpdatePendingExecutor.hh" + +namespace celeritas +{ +namespace optical +{ +//---------------------------------------------------------------------------// +/*! + * Launch a (device) kernel to update the number of pending optical photons. + */ +void GeneratorBase::update_pending(CoreParams const& params, + CoreStateDevice& state, + size_type num_pending) const +{ + // Update the number of pending optical photons + auto execute_thread = make_single_track_executor( + params.ptr(), + state.ptr(), + detail::UpdatePendingExecutor{num_pending}); + static KernelLauncher const launch_kernel( + "update-pending"); + launch_kernel(1, state.stream_id(), execute_thread); +} + +//---------------------------------------------------------------------------// +} // namespace optical +} // namespace celeritas diff --git a/src/celeritas/optical/gen/GeneratorBase.hh b/src/celeritas/optical/gen/GeneratorBase.hh index 09c26d7330..e9e271331f 100644 --- a/src/celeritas/optical/gen/GeneratorBase.hh +++ b/src/celeritas/optical/gen/GeneratorBase.hh @@ -16,6 +16,8 @@ namespace celeritas { +class CoreParams; + namespace optical { //---------------------------------------------------------------------------// @@ -74,6 +76,10 @@ class GeneratorBase : virtual public optical::OpticalStepActionInterface, template inline void update_counters(optical::CoreState&) const; + // Update the num_pending state counter + void update_pending(CoreParams const&, CoreStateHost&, size_type) const; + void update_pending(CoreParams const&, CoreStateDevice&, size_type) const; + private: StaticActionData sad_; AuxId aux_id_; diff --git a/src/celeritas/optical/gen/GeneratorData.hh b/src/celeritas/optical/gen/GeneratorData.hh index 35862a33f5..3ca13842e5 100644 --- a/src/celeritas/optical/gen/GeneratorData.hh +++ b/src/celeritas/optical/gen/GeneratorData.hh @@ -162,7 +162,7 @@ struct GeneratorState : public GeneratorStateBase //---------------------------------------------------------------------------// /*! - * Resize optical buffere. + * Resize optical buffers. */ template void resize(GeneratorStateData* state, diff --git a/src/celeritas/optical/gen/PrimaryGeneratorAction.cc b/src/celeritas/optical/gen/PrimaryGeneratorAction.cc index c2b5850e9b..3887abee2d 100644 --- a/src/celeritas/optical/gen/PrimaryGeneratorAction.cc +++ b/src/celeritas/optical/gen/PrimaryGeneratorAction.cc @@ -29,12 +29,13 @@ namespace celeritas { namespace optical { +CoreParams* PrimaryGeneratorAction::core_params_ = nullptr; //---------------------------------------------------------------------------// /*! * Construct and add to core params. */ std::shared_ptr -PrimaryGeneratorAction::make_and_insert(CoreParams const& params, Input&& input) +PrimaryGeneratorAction::make_and_insert(CoreParams& params, Input&& input) { CELER_EXPECT(input); ActionRegistry& actions = *params.action_reg(); @@ -46,6 +47,7 @@ PrimaryGeneratorAction::make_and_insert(CoreParams const& params, Input&& input) actions.insert(result); aux.insert(result); gen.insert(result); + core_params_ = ¶ms; return result; } @@ -72,7 +74,6 @@ PrimaryGeneratorAction::PrimaryGeneratorAction(ActionId id, data_.shape = std::visit(insert, inp.shape); params_ = ParamsDataStore{std::move(host_params)}; - CELER_ENSURE(data_); CELER_ENSURE(params_); } @@ -135,9 +136,7 @@ void PrimaryGeneratorAction::insert_impl(optical::CoreState& state) const auto& aux_state = this->counters(*state.aux()); aux_state.counters.num_pending = data_.num_photons; - auto counters = state.sync_get_counters(); - counters.num_pending += data_.num_photons; - state.sync_put_counters(counters); + this->update_pending(*core_params_, state, data_.num_photons); } //---------------------------------------------------------------------------// @@ -152,9 +151,10 @@ void PrimaryGeneratorAction::step_impl(CoreParams const& params, auto const& counters = this->counters(*state.aux()).counters; - if (state.sync_get_counters().num_vacancies > 0 && counters.num_pending > 0) + if (counters.num_pending > 0) { - // Generate the optical photons from the distribution data + // Generate the optical photons from the distribution data. To avoid + // synchronization, we defer the check for vacancies. this->generate(params, state); } @@ -172,13 +172,11 @@ void PrimaryGeneratorAction::generate(CoreParams const& params, CELER_EXPECT(state.aux()); auto const& aux_state = this->counters(*state.aux()); - size_type num_gen = min(state.sync_get_counters().num_vacancies, - aux_state.counters.num_pending); // Generate optical photons in vacant track slots detail::PrimaryGeneratorExecutor execute{ params.ptr(), state.ptr(), data_, params_.host_ref()}; - launch_action(num_gen, execute); + launch_action(aux_state.counters.num_pending, execute); } //---------------------------------------------------------------------------// diff --git a/src/celeritas/optical/gen/PrimaryGeneratorAction.cu b/src/celeritas/optical/gen/PrimaryGeneratorAction.cu index a65ea1c77f..aec34f85f7 100644 --- a/src/celeritas/optical/gen/PrimaryGeneratorAction.cu +++ b/src/celeritas/optical/gen/PrimaryGeneratorAction.cu @@ -31,8 +31,6 @@ void PrimaryGeneratorAction::generate(CoreParams const& params, CELER_EXPECT(state.aux()); auto const& aux_state = this->counters(*state.aux()); - size_type num_gen = min(state.sync_get_counters().num_vacancies, - aux_state.counters.num_pending); // Generate optical photons in vacant track slots detail::PrimaryGeneratorExecutor execute{params.ptr(), @@ -40,7 +38,7 @@ void PrimaryGeneratorAction::generate(CoreParams const& params, data_, params_.device_ref()}; static ActionLauncher const launch(*this); - launch(num_gen, state.stream_id(), execute); + launch(aux_state.counters.num_pending, state.stream_id(), execute); } //---------------------------------------------------------------------------// diff --git a/src/celeritas/optical/gen/PrimaryGeneratorAction.hh b/src/celeritas/optical/gen/PrimaryGeneratorAction.hh index 2b3cf537cf..60f8bea69a 100644 --- a/src/celeritas/optical/gen/PrimaryGeneratorAction.hh +++ b/src/celeritas/optical/gen/PrimaryGeneratorAction.hh @@ -47,7 +47,7 @@ class PrimaryGeneratorAction final : public GeneratorBase public: // Construct and add to core params static std::shared_ptr - make_and_insert(CoreParams const&, Input&&); + make_and_insert(CoreParams&, Input&&); // Construct with IDs and distributions PrimaryGeneratorAction(ActionId, AuxId, GeneratorId, Input); @@ -76,6 +76,9 @@ class PrimaryGeneratorAction final : public GeneratorBase PrimaryDistributionData data_; ParamsDataStore params_; + // Core params isn't passed to insert(), so save a pointer so + // update_pending() can be called later + static CoreParams* core_params_; //// HELPER FUNCTIONS //// diff --git a/src/celeritas/optical/gen/WlsGeneratorAction.cc b/src/celeritas/optical/gen/WlsGeneratorAction.cc index 2ccb18e05c..709f878803 100644 --- a/src/celeritas/optical/gen/WlsGeneratorAction.cc +++ b/src/celeritas/optical/gen/WlsGeneratorAction.cc @@ -148,13 +148,13 @@ void WlsGeneratorAction::step_impl(CoreParams const& params, } // Update the core state counters with the number of new pending tracks - auto core_counters = state.sync_get_counters(); - core_counters.num_pending += counters.num_pending - num_pending_prev; - state.sync_put_counters(core_counters); + this->update_pending( + params, state, counters.num_pending - num_pending_prev); - if (counters.num_pending > 0 && core_counters.num_vacancies > 0) + if (counters.num_pending > 0) { - // Generate the optical photons from the distribution data + // Generate the optical photons from the distribution data. To avoid + // synchronization, we defer the check for vacancies. this->generate(params, state); // Compact the buffer again to remove stale distributions and free up @@ -194,8 +194,6 @@ void WlsGeneratorAction::generate(CoreParams const& params, auto& aux_state = get>(*state.aux(), this->aux_id()); - size_type num_gen = min(state.sync_get_counters().num_vacancies, - aux_state.counters.num_pending); // Generate optical photons in vacant track slots detail::WlsGeneratorExecutor execute{ @@ -205,7 +203,7 @@ void WlsGeneratorAction::generate(CoreParams const& params, wls2_ ? wls2_->host_ref() : NativeCRef{}, aux_state.store.ref(), aux_state.counters.buffer_size}; - launch_action(num_gen, execute); + launch_action(aux_state.counters.num_pending, execute); } //---------------------------------------------------------------------------// diff --git a/src/celeritas/optical/gen/WlsGeneratorAction.cu b/src/celeritas/optical/gen/WlsGeneratorAction.cu index 61d5c9222b..9dea3bd71b 100644 --- a/src/celeritas/optical/gen/WlsGeneratorAction.cu +++ b/src/celeritas/optical/gen/WlsGeneratorAction.cu @@ -34,8 +34,6 @@ void WlsGeneratorAction::generate(CoreParams const& params, auto& aux_state = get>(*state.aux(), this->aux_id()); - size_type num_gen = min(state.sync_get_counters().num_vacancies, - aux_state.counters.num_pending); // Generate optical photons in vacant track slots detail::WlsGeneratorExecutor execute{ @@ -46,7 +44,7 @@ void WlsGeneratorAction::generate(CoreParams const& params, aux_state.store.ref(), aux_state.counters.buffer_size}; static ActionLauncher const launch(*this); - launch(num_gen, state.stream_id(), execute); + launch(aux_state.counters.num_pending, state.stream_id(), execute); } //---------------------------------------------------------------------------// diff --git a/src/celeritas/optical/gen/detail/DirectGeneratorExecutor.hh b/src/celeritas/optical/gen/detail/DirectGeneratorExecutor.hh index 633788b737..5bcd9768fc 100644 --- a/src/celeritas/optical/gen/detail/DirectGeneratorExecutor.hh +++ b/src/celeritas/optical/gen/detail/DirectGeneratorExecutor.hh @@ -31,11 +31,7 @@ struct DirectGeneratorExecutor NativeRef const data; // Initialize optical photons - inline CELER_FUNCTION void operator()(TrackSlotId tid) const; - CELER_FORCEINLINE_FUNCTION void operator()(ThreadId tid) const - { - return (*this)(TrackSlotId{tid.unchecked_get()}); - } + inline CELER_FUNCTION void operator()(ThreadId tid) const; }; //---------------------------------------------------------------------------// @@ -44,22 +40,31 @@ struct DirectGeneratorExecutor /*! * Initialize optical photons. */ -CELER_FUNCTION void DirectGeneratorExecutor::operator()(TrackSlotId tid) const +CELER_FUNCTION void DirectGeneratorExecutor::operator()(ThreadId tid) const { CELER_EXPECT(params); CELER_EXPECT(state); auto* counters = state->init.counters.data().get(); + + // Original code set the number of threads to the minimum between of number + // of vacancies and the number of pending in the auxiliary state. To avoid + // accessing the state counters to compute this min, we skip the extra + // threads if counters->num_vacancies < aux_state.counters.num_pending + if (!(tid < counters->num_vacancies)) + { + return; + } + // Create view to new track to be initialized CoreTrackView vacancy(*params, *state, [&] { - TrackSlotId idx{ - index_before(counters->num_vacancies, ThreadId(tid.get()))}; + TrackSlotId idx{index_before(counters->num_vacancies, tid)}; return state->init.vacancies[idx]; }()); // Get initializer from the back TrackInitializer const& init = data.initializers[ItemId( - index_before(counters->num_pending, ThreadId(tid.get())))]; + index_before(counters->num_pending, tid))]; // Initialize track vacancy = init; diff --git a/src/celeritas/optical/gen/detail/GeneratorExecutor.hh b/src/celeritas/optical/gen/detail/GeneratorExecutor.hh index 314a561b10..3ff5f45903 100644 --- a/src/celeritas/optical/gen/detail/GeneratorExecutor.hh +++ b/src/celeritas/optical/gen/detail/GeneratorExecutor.hh @@ -44,11 +44,7 @@ struct GeneratorExecutor //// FUNCTIONS //// // Generate optical photons - inline CELER_FUNCTION void operator()(TrackSlotId tid) const; - CELER_FORCEINLINE_FUNCTION void operator()(ThreadId tid) const - { - return (*this)(TrackSlotId{tid.unchecked_get()}); - } + inline CELER_FUNCTION void operator()(ThreadId tid) const; }; //---------------------------------------------------------------------------// @@ -57,7 +53,7 @@ struct GeneratorExecutor /*! * Generate photons from optical distribution data. */ -CELER_FUNCTION void GeneratorExecutor::operator()(TrackSlotId tid) const +CELER_FUNCTION void GeneratorExecutor::operator()(ThreadId tid) const { using namespace celeritas::literals; CELER_EXPECT(state); @@ -67,8 +63,16 @@ CELER_FUNCTION void GeneratorExecutor::operator()(TrackSlotId tid) const auto* counters = state->init.counters.data().get(); - // Find the index of the first distribution that has a nonzero number of - // primaries left to generate + // Original code set the number of threads to the minimum between of number + // of vacancies and the number of pending in the auxiliary state. To avoid + // accessing the state counters to compute this min, we skip the extra + // threads if state.counters.num_vacancies < aux_state.counters.num_pending + if (!(tid < counters->num_vacancies)) + { + return; + } + // Find the index of the first distribution that has a nonzero number + // of primaries left to generate auto all_offsets = offload.offsets[ItemRange( ItemId(0), ItemId(buffer_size))]; auto buffer_start @@ -92,16 +96,15 @@ CELER_FUNCTION void GeneratorExecutor::operator()(TrackSlotId tid) const *params, *state, [&] { // Get the vacancy from the back in case there // are more vacancies than photons to generate - TrackSlotId idx{ - index_before(counters->num_vacancies, ThreadId(tid.get()))}; + TrackSlotId idx{index_before(counters->num_vacancies, tid)}; return state->init.vacancies[idx]; }()}; if (!dist.material) { // If the optical material hasn't been set, initialize a temporary - // geometry state at the pre-step point and use it to find the optical - // material ID + // geometry state at the pre-step point and use it to find the + // optical material ID auto geo = vacancy.geometry(); geo = GeoTrackInitializer{dist.points[StepPoint::pre].pos, {1, 0, 0}}; dist.material = vacancy.material_record(geo).material_id(); diff --git a/src/celeritas/optical/gen/detail/PrimaryGeneratorExecutor.hh b/src/celeritas/optical/gen/detail/PrimaryGeneratorExecutor.hh index fc118decd6..3ae0a5f0ff 100644 --- a/src/celeritas/optical/gen/detail/PrimaryGeneratorExecutor.hh +++ b/src/celeritas/optical/gen/detail/PrimaryGeneratorExecutor.hh @@ -39,11 +39,7 @@ struct PrimaryGeneratorExecutor //// FUNCTIONS //// // Generate optical photons - inline CELER_FUNCTION void operator()(TrackSlotId tid) const; - CELER_FORCEINLINE_FUNCTION void operator()(ThreadId tid) const - { - return (*this)(TrackSlotId{tid.unchecked_get()}); - } + inline CELER_FUNCTION void operator()(ThreadId tid) const; }; //---------------------------------------------------------------------------// @@ -52,26 +48,34 @@ struct PrimaryGeneratorExecutor /*! * Generate photons from optical distribution data. */ -CELER_FUNCTION void PrimaryGeneratorExecutor::operator()(TrackSlotId tid) const +CELER_FUNCTION void PrimaryGeneratorExecutor::operator()(ThreadId tid) const { CELER_EXPECT(params); CELER_EXPECT(state); CELER_EXPECT(data); CELER_EXPECT(distributions); - CoreTrackView track(*params, *state, tid); - auto const& counters = track.counters(); + auto* counters = state->init.counters.data().get(); + // Original code set the number of threads to the minimum between of number + // of vacancies and the number of pending in the auxiliary state. To avoid + // accessing the state counters to compute this min, we skip the extra + // threads if counters.num_vacancies < aux_state.counters.num_pending + if (!(tid < counters->num_vacancies)) + { + return; + } // Create the view to the new track to be initialized - CoreTrackView vacancy{*params, *state, [&] { - // Get the vacancy from the back in case there - // are more vacancies than photons to generate - TrackSlotId idx{index_before( - counters.num_vacancies, ThreadId(tid.get()))}; - return state->init.vacancies[idx]; - }()}; + CoreTrackView vacancy{ + *params, *state, [&] { + // Get the vacancy from the back in case there + // are more vacancies than photons to generate + TrackSlotId idx{index_before(counters->num_vacancies, tid)}; + return state->init.vacancies[idx]; + }()}; // Generate one primary from the distribution + CoreTrackView track(*params, *state, TrackSlotId{tid.get()}); auto rng = track.rng(); vacancy = PrimaryGenerator(distributions, data)(rng); } diff --git a/src/celeritas/optical/gen/detail/UpdatePendingExecutor.hh b/src/celeritas/optical/gen/detail/UpdatePendingExecutor.hh new file mode 100644 index 0000000000..d5eafde19d --- /dev/null +++ b/src/celeritas/optical/gen/detail/UpdatePendingExecutor.hh @@ -0,0 +1,57 @@ +//------------------------------- -*- C++ -*- -------------------------------// +// Copyright Celeritas contributors: see top-level COPYRIGHT file for details +// SPDX-License-Identifier: (Apache-2.0 OR MIT) +//---------------------------------------------------------------------------// +//! \file celeritas/optical/gen/detail/UpdatePendingExecutor.hh +//---------------------------------------------------------------------------// +#pragma once + +#include "corecel/Macros.hh" +#include "corecel/Types.hh" +#include "corecel/sys/ThreadId.hh" +#include "celeritas/optical/CoreTrackView.hh" + +namespace celeritas +{ +namespace optical +{ +namespace detail +{ +//---------------------------------------------------------------------------// +// LAUNCHER +//---------------------------------------------------------------------------// +/*! + * Update the num_pending counter based on the generated photons from buffered + * optical distribution data. + */ +struct UpdatePendingExecutor +{ + //// DATA //// + + size_type num_photons; + + //// FUNCTIONS //// + + // Update number of of primaries waiting to be generated + CELER_FORCEINLINE_FUNCTION void operator()(CoreTrackView& track); +}; + +//---------------------------------------------------------------------------// +// INLINE DEFINITIONS +//---------------------------------------------------------------------------// +/*! + * Update number of primaries to be generated to include the buffered optical + * photons. + */ +CELER_FORCEINLINE_FUNCTION void +UpdatePendingExecutor::operator()(CoreTrackView& track) +{ + CELER_EXPECT(track.thread_id() == ThreadId{0}); // single thread kernel + + track.counters().num_pending += num_photons; +} + +//---------------------------------------------------------------------------// +} // namespace detail +} // namespace optical +} // namespace celeritas diff --git a/src/celeritas/optical/gen/detail/UpdateSumExecutor.hh b/src/celeritas/optical/gen/detail/UpdateSumExecutor.hh index 49d54a1711..7eaea26c14 100644 --- a/src/celeritas/optical/gen/detail/UpdateSumExecutor.hh +++ b/src/celeritas/optical/gen/detail/UpdateSumExecutor.hh @@ -8,6 +8,7 @@ #include "corecel/Macros.hh" #include "corecel/Types.hh" +#include "celeritas/track/CoreStateCounters.hh" #include "../GeneratorData.hh" @@ -27,6 +28,7 @@ struct UpdateSumExecutor { //// DATA //// + RefPtr state; NativeRef const offload; size_type num_gen{}; @@ -48,18 +50,25 @@ struct UpdateSumExecutor */ CELER_FUNCTION void UpdateSumExecutor::operator()(TrackSlotId tid) const { + CELER_EXPECT(state); CELER_EXPECT(offload); CELER_EXPECT(num_gen > 0); CELER_EXPECT(tid < offload.offsets.size()); + // We deferred the check for the number of vacancies, but capped the + // updates at num_vacancies in the GeneratorExecutor functor if it was + // less than num_gen, so make the same adjustment here. + auto* counters = state->init.counters.data().get(); + size_type num_generated = min(num_gen, counters->num_vacancies); + auto& offset = offload.offsets[ItemId(tid.get())]; - if (offset < num_gen) + if (offset < num_generated) { offset = 0; } else { - offset -= num_gen; + offset -= num_generated; } } diff --git a/src/celeritas/optical/gen/detail/WlsGeneratorExecutor.hh b/src/celeritas/optical/gen/detail/WlsGeneratorExecutor.hh index a637dbfbaa..a10e39a6b5 100644 --- a/src/celeritas/optical/gen/detail/WlsGeneratorExecutor.hh +++ b/src/celeritas/optical/gen/detail/WlsGeneratorExecutor.hh @@ -43,11 +43,7 @@ struct WlsGeneratorExecutor //// FUNCTIONS //// // Generate optical photons - inline CELER_FUNCTION void operator()(TrackSlotId tid) const; - CELER_FORCEINLINE_FUNCTION void operator()(ThreadId tid) const - { - return (*this)(TrackSlotId{tid.unchecked_get()}); - } + inline CELER_FUNCTION void operator()(ThreadId tid) const; }; //---------------------------------------------------------------------------// @@ -56,7 +52,7 @@ struct WlsGeneratorExecutor /*! * Generate WLS photons from optical distribution data. */ -CELER_FUNCTION void WlsGeneratorExecutor::operator()(TrackSlotId tid) const +CELER_FUNCTION void WlsGeneratorExecutor::operator()(ThreadId tid) const { CELER_EXPECT(state); CELER_EXPECT(data); @@ -65,9 +61,17 @@ CELER_FUNCTION void WlsGeneratorExecutor::operator()(TrackSlotId tid) const auto* counters = state->init.counters.data().get(); - // Get the cumulative sum of the number of photons in the distributions. - // The values are used to determine which threads will generate from the - // corresponding distribution + // Original code set the number of threads to the minimum between of number + // of vacancies and the number of pending in the auxiliary state. To avoid + // accessing the state counters to compute this min, we skip the extra + // threads if state.counters.num_vacancies < aux_state.counters.num_pending + if (!(tid < counters->num_vacancies)) + { + return; + } + // Get the cumulative sum of the number of photons in the + // distributions. The values are used to determine which threads will + // generate from the corresponding distribution auto offsets = data.offsets[ItemRange( ItemId(0), ItemId(buffer_size))]; @@ -80,10 +84,9 @@ CELER_FUNCTION void WlsGeneratorExecutor::operator()(TrackSlotId tid) const // Create the view to the new track to be initialized CoreTrackView vacancy{ *params, *state, [&] { - // Get the vacancy from the back in case there are more vacancies - // than photons to generate - TrackSlotId idx{ - index_before(counters->num_vacancies, ThreadId(tid.get()))}; + // Get the vacancy from the back in case there are more + // vacancies than photons to generate + TrackSlotId idx{index_before(counters->num_vacancies, tid)}; return state->init.vacancies[idx]; }()}; diff --git a/src/celeritas/track/ExtendFromPrimariesAction.cc b/src/celeritas/track/ExtendFromPrimariesAction.cc index 305fc5d9b5..0e7a08740a 100644 --- a/src/celeritas/track/ExtendFromPrimariesAction.cc +++ b/src/celeritas/track/ExtendFromPrimariesAction.cc @@ -15,10 +15,12 @@ #include "celeritas/global/ActionLauncher.hh" #include "celeritas/global/CoreParams.hh" #include "celeritas/global/CoreState.hh" +#include "celeritas/global/TrackExecutor.hh" #include "TrackInitParams.hh" #include "detail/ProcessPrimariesExecutor.hh" // IWYU pragma: associated +#include "detail/UpdateCountersExecutor.hh" // IWYU pragma: associated namespace celeritas { @@ -184,18 +186,9 @@ void ExtendFromPrimariesAction::step_impl(CoreParams const& params, CoreState& state) const { auto& primaries = get>(state.aux(), aux_id_); - auto counters = state.sync_get_counters(); - - // Create track initializers from primaries - counters.num_initializers += primaries.count; - state.sync_put_counters(counters); this->process_primaries(params, state, primaries); - - // Mark that the primaries have been processed - counters.num_generated += primaries.count; - counters.num_pending = 0; + this->update_counters(params, state, primaries.count); primaries.count = 0; - state.sync_put_counters(counters); } //---------------------------------------------------------------------------// @@ -209,10 +202,26 @@ void ExtendFromPrimariesAction::process_primaries( { auto primaries = pstate.primaries(); detail::ProcessPrimariesExecutor execute{ - params.ptr(), state.ptr(), primaries}; + params.ptr(), state.ptr(), primaries, pstate.count}; return launch_action(*this, primaries.size(), params, state, execute); } +//---------------------------------------------------------------------------// +/*! + * Launch a (host) kernel to update state counters for number of primary + * particles. + */ +void ExtendFromPrimariesAction::update_counters(CoreParams const& params, + CoreStateHost& state, + size_type num_primaries) const +{ + auto execute_thread = make_single_track_executor( + params.ptr(), + state.ptr(), + detail::UpdateCountersExecutor{num_primaries}); + launch_core(1, "update-counters", params, state, execute_thread); +} + //---------------------------------------------------------------------------// #if !CELER_USE_DEVICE void ExtendFromPrimariesAction::process_primaries( @@ -222,6 +231,13 @@ void ExtendFromPrimariesAction::process_primaries( { CELER_NOT_CONFIGURED("CUDA OR HIP"); } + +void ExtendFromPrimariesAction::update_counters(CoreParams const&, + CoreStateDevice&, + size_type) const +{ + CELER_NOT_CONFIGURED("CUDA OR HIP"); +} #endif //---------------------------------------------------------------------------// diff --git a/src/celeritas/track/ExtendFromPrimariesAction.cu b/src/celeritas/track/ExtendFromPrimariesAction.cu index c528dbc3e5..c6ab27564d 100644 --- a/src/celeritas/track/ExtendFromPrimariesAction.cu +++ b/src/celeritas/track/ExtendFromPrimariesAction.cu @@ -9,8 +9,10 @@ #include "celeritas/global/ActionLauncher.device.hh" #include "celeritas/global/CoreParams.hh" #include "celeritas/global/CoreState.hh" +#include "celeritas/global/TrackExecutor.hh" #include "detail/ProcessPrimariesExecutor.hh" +#include "detail/UpdateCountersExecutor.hh" namespace celeritas { @@ -25,7 +27,7 @@ void ExtendFromPrimariesAction::process_primaries( { auto primaries = pstate.primaries(); detail::ProcessPrimariesExecutor execute_thread{ - params.ptr(), state.ptr(), primaries}; + params.ptr(), state.ptr(), primaries, pstate.count}; static ActionLauncher const launch_kernel(*this); if (!primaries.empty()) { @@ -33,5 +35,22 @@ void ExtendFromPrimariesAction::process_primaries( } } +//---------------------------------------------------------------------------// +/*! + * Launch a kernel to update state counters for number of primary particles. + */ +void ExtendFromPrimariesAction::update_counters(CoreParams const& params, + CoreStateDevice& state, + size_type num_primaries) const +{ + auto execute_thread = make_single_track_executor( + params.ptr(), + state.ptr(), + detail::UpdateCountersExecutor{num_primaries}); + static KernelLauncher const launch_kernel( + "update-counters"); + launch_kernel(1, state.stream_id(), execute_thread); +} + //---------------------------------------------------------------------------// } // namespace celeritas diff --git a/src/celeritas/track/ExtendFromPrimariesAction.hh b/src/celeritas/track/ExtendFromPrimariesAction.hh index 80d1fcb3ad..d245d2992c 100644 --- a/src/celeritas/track/ExtendFromPrimariesAction.hh +++ b/src/celeritas/track/ExtendFromPrimariesAction.hh @@ -99,6 +99,9 @@ class ExtendFromPrimariesAction final : public CoreStepActionInterface, void process_primaries(CoreParams const&, CoreStateDevice&, PrimaryStateData const&) const; + + void update_counters(CoreParams const&, CoreStateHost&, size_type) const; + void update_counters(CoreParams const&, CoreStateDevice&, size_type) const; }; template diff --git a/src/celeritas/track/InitializeTracksAction.cc b/src/celeritas/track/InitializeTracksAction.cc index a14930fc62..94068f8633 100644 --- a/src/celeritas/track/InitializeTracksAction.cc +++ b/src/celeritas/track/InitializeTracksAction.cc @@ -14,11 +14,13 @@ #include "celeritas/global/ActionLauncher.hh" #include "celeritas/global/CoreParams.hh" #include "celeritas/global/CoreState.hh" +#include "celeritas/global/TrackExecutor.hh" #include "TrackInitParams.hh" #include "detail/InitTracksExecutor.hh" // IWYU pragma: associated #include "detail/TrackInitAlgorithms.hh" +#include "detail/UpdateNumActiveExecutor.hh" // IWYU pragma: associated namespace celeritas { @@ -55,55 +57,52 @@ template void InitializeTracksAction::step_impl(CoreParams const& core_params, CoreState& core_state) const { - auto counters = core_state.sync_get_counters(); - // The number of new tracks to initialize is the smaller of the number of - // empty slots in the track vector and the number of track initializers - size_type num_new_tracks - = std::min(counters.num_vacancies, counters.num_initializers); - if (num_new_tracks > 0) + // empty slots in the track vector and the number of track initializers. + // To avoid synchronizing the core state counters, we let the kernels + // calculate the number of new tracks and proceed accordingly. This means + // the code might sometimes call these functions when there is no work + // to do, but that's quickly determined so the overhead should be minimal. + if (core_params.init()->track_order() == TrackOrder::init_charge) { - if (core_params.init()->track_order() == TrackOrder::init_charge) - { - // Reset track initializer indices - fill_sequence(&core_state.ref().init.indices, - core_state.stream_id()); - - // Partition indices by whether tracks are charged or neutral - detail::partition_initializers(core_params, - core_state.ref().init, - num_new_tracks, - core_state.stream_id()); - } + // Reset track initializer indices + fill_sequence(&core_state.ref().init.indices, core_state.stream_id()); - // Launch a kernel to initialize tracks - this->step_impl(core_params, core_state, num_new_tracks); - - // Update initializers/vacancies - counters.num_initializers -= num_new_tracks; - counters.num_vacancies -= num_new_tracks; + // Partition indices by whether tracks are charged or neutral + detail::partition_initializers( + core_params, core_state.ref().init, core_state.stream_id()); } - // Store number of active tracks at the start of the loop - counters.num_active = core_state.size() - counters.num_vacancies; - core_state.sync_put_counters(counters); + // Launch a kernel to initialize tracks, using the largest possible + // number and computing the actual number in the kernel. + this->step_impl(core_params, core_state, core_state.size()); } //---------------------------------------------------------------------------// /*! - * Launch a (host) kernel to initialize tracks. + * Launch (host) kernels to initialize tracks and to update the corresponding + * counters. * * The thread index here corresponds to initializer indices, not track slots * (or indices into the track slot indirection array). */ void InitializeTracksAction::step_impl(CoreParams const& core_params, CoreStateHost& core_state, - size_type num_new_tracks) const + size_type max_new_tracks) const { - detail::InitTracksExecutor execute{ - core_params.ptr(), core_state.ptr(), num_new_tracks}; - return launch_action( - *this, num_new_tracks, core_params, core_state, execute); + { + detail::InitTracksExecutor execute{core_params.ptr(), + core_state.ptr()}; + launch_action(*this, max_new_tracks, core_params, core_state, execute); + } + { + auto execute_thread = make_single_track_executor( + core_params.ptr(), + core_state.ptr(), + detail::UpdateNumActiveExecutor{core_state.size()}); + launch_core( + 1, "update-active", core_params, core_state, execute_thread); + } } //---------------------------------------------------------------------------// diff --git a/src/celeritas/track/InitializeTracksAction.cu b/src/celeritas/track/InitializeTracksAction.cu index da65606798..95fba10863 100644 --- a/src/celeritas/track/InitializeTracksAction.cu +++ b/src/celeritas/track/InitializeTracksAction.cu @@ -9,23 +9,37 @@ #include "celeritas/global/ActionLauncher.device.hh" #include "celeritas/global/CoreParams.hh" #include "celeritas/global/CoreState.hh" +#include "celeritas/global/TrackExecutor.hh" #include "detail/InitTracksExecutor.hh" +#include "detail/UpdateNumActiveExecutor.hh" namespace celeritas { //---------------------------------------------------------------------------// /*! - * Launch a kernel to initialize tracks. + * Launch (device) kernels to initialize tracks and to update the corresponding + * counters. */ void InitializeTracksAction::step_impl(CoreParams const& params, CoreStateDevice& state, size_type num_new_tracks) const { - detail::InitTracksExecutor execute_thread{ - params.ptr(), state.ptr(), num_new_tracks}; - static ActionLauncher const launch_kernel(*this); - launch_kernel(num_new_tracks, state.stream_id(), execute_thread); + { + detail::InitTracksExecutor execute{params.ptr(), + state.ptr()}; + static ActionLauncher const launch_kernel(*this); + launch_kernel(num_new_tracks, state.stream_id(), execute); + } + { + auto execute_thread = make_single_track_executor( + params.ptr(), + state.ptr(), + detail::UpdateNumActiveExecutor{state.size()}); + static KernelLauncher const launch_kernel( + "update-active"); + launch_kernel(1, state.stream_id(), execute_thread); + } } //---------------------------------------------------------------------------// diff --git a/src/celeritas/track/TrackFunctors.hh b/src/celeritas/track/TrackFunctors.hh index b12ab6a610..cd1c5e963b 100644 --- a/src/celeritas/track/TrackFunctors.hh +++ b/src/celeritas/track/TrackFunctors.hh @@ -27,6 +27,16 @@ struct AppliesValid } }; +//! Launch on only a single thread +struct IsThreadZero +{ + template + CELER_FUNCTION bool operator()(T const& track) const + { + return track.thread_id() == ThreadId{0}; + } +}; + //---------------------------------------------------------------------------// /*! * Apply only to tracks with the given post-step action ID. diff --git a/src/celeritas/track/TrackInitData.hh b/src/celeritas/track/TrackInitData.hh index 55aca8e626..7ab20f1a28 100644 --- a/src/celeritas/track/TrackInitData.hh +++ b/src/celeritas/track/TrackInitData.hh @@ -89,7 +89,7 @@ struct TrackInitializer * \c max_events. * - \c initializers stores the data for primaries and secondaries waiting to * be turned into new tracks and can be any size up to \c capacity. - * - \c vacancies stores the \c TrackSlotid of the tracks that have been + * - \c vacancies stores the \c TrackSlotId of the tracks that have been * killed; the size will be <= the number of track states. * - \c track_counters stores the total number of particles that have been * created per event. diff --git a/src/celeritas/track/detail/InitTracksExecutor.hh b/src/celeritas/track/detail/InitTracksExecutor.hh index 0414fe1dce..9e588df34e 100644 --- a/src/celeritas/track/detail/InitTracksExecutor.hh +++ b/src/celeritas/track/detail/InitTracksExecutor.hh @@ -45,7 +45,6 @@ struct InitTracksExecutor ParamsPtr params; StatePtr state; - size_type num_init{}; //// FUNCTIONS //// @@ -63,46 +62,54 @@ struct InitTracksExecutor */ CELER_FUNCTION void InitTracksExecutor::operator()(ThreadId tid) const { - CELER_EXPECT(tid < num_init); + CELER_EXPECT(params); + CELER_EXPECT(state); auto const& data = state->init; auto* counters = state->init.counters.data().get(); - // Get the track initializer from the back of the vector. Since new - // initializers are pushed to the back of the vector, these will be the - // most recently added and therefore the ones that still might have a - // parent they can copy the geometry state from. - TrackInitializer& init = data.initializers[ItemId([&] { - if (params->init.track_order == TrackOrder::init_charge) - { - // Get the index into the track initializer or parent track slot ID - // array from the sorted indices - return data.indices[TrackSlotId(index_before(num_init, tid))] - + counters->num_initializers - num_init; - } - return index_before(counters->num_initializers, tid); - }())]; - - // View to the new track to be initialized - CoreTrackView vacancy{ - *params, *state, [&] { - if (params->init.track_order == TrackOrder::init_charge - && IsNeutral{params}(init)) + size_type num_init + = min(counters->num_vacancies, counters->num_initializers); + CELER_EXPECT(num_init <= state->size()); + if (tid < num_init) + { + // Get the track initializer from the back of the vector. Since new + // initializers are pushed to the back of the vector, these will be the + // most recently added and therefore the ones that still might have a + // parent they can copy the geometry state from. + TrackInitializer& init = data.initializers[ItemId([&] { + if (params->init.track_order == TrackOrder::init_charge) { - // Get the vacancy from the front of the track state - return data.vacancies[TrackSlotId(index_before(num_init, tid))]; + // Get the index into the track initializer or parent track + // slot ID array from the sorted indices + return data.indices[TrackSlotId(index_before(num_init, tid))] + + counters->num_initializers - num_init; } - // Get the vacancy from the back of the track state - return data.vacancies[TrackSlotId( - index_before(counters->num_vacancies, tid))]; - }()}; + return index_before(counters->num_initializers, tid); + }())]; - // Clear parent IDs if new primaries were added this step - if (counters->num_generated) - { - init.geo.parent = {}; - } + // View to the new track to be initialized + CoreTrackView vacancy{ + *params, *state, [&] { + if (params->init.track_order == TrackOrder::init_charge + && IsNeutral{params}(init)) + { + // Get the vacancy from the front of the track state + return data + .vacancies[TrackSlotId(index_before(num_init, tid))]; + } + // Get the vacancy from the back of the track state + return data.vacancies[TrackSlotId( + index_before(counters->num_vacancies, tid))]; + }()}; - vacancy = init; + // Clear parent IDs if new primaries were added this step + if (counters->num_generated) + { + init.geo.parent = {}; + } + + vacancy = init; + } } //---------------------------------------------------------------------------// diff --git a/src/celeritas/track/detail/ProcessPrimariesExecutor.hh b/src/celeritas/track/detail/ProcessPrimariesExecutor.hh index 0addafb95f..7bc6105cbe 100644 --- a/src/celeritas/track/detail/ProcessPrimariesExecutor.hh +++ b/src/celeritas/track/detail/ProcessPrimariesExecutor.hh @@ -41,6 +41,7 @@ struct ProcessPrimariesExecutor StatePtr state; Span primaries; + size_type num_primaries; //// FUNCTIONS //// @@ -56,7 +57,8 @@ CELER_FUNCTION void ProcessPrimariesExecutor::operator()(ThreadId tid) const { CELER_EXPECT(tid < primaries.size()); auto* counters = state->init.counters.data().get(); - CELER_EXPECT(primaries.size() <= counters->num_initializers + tid.get()); + CELER_EXPECT(primaries.size() + <= counters->num_initializers + num_primaries + tid.get()); Primary const& primary = primaries[tid.unchecked_get()]; @@ -83,7 +85,8 @@ CELER_FUNCTION void ProcessPrimariesExecutor::operator()(ThreadId tid) const } // Store the initializer - size_type idx = counters->num_initializers - primaries.size() + tid.get(); + size_type idx = counters->num_initializers + num_primaries + - primaries.size() + tid.get(); state->init.initializers[ItemId(idx)] = ti; } diff --git a/src/celeritas/track/detail/TrackInitAlgorithms.cc b/src/celeritas/track/detail/TrackInitAlgorithms.cc index f34b45b2e9..902ab24831 100644 --- a/src/celeritas/track/detail/TrackInitAlgorithms.cc +++ b/src/celeritas/track/detail/TrackInitAlgorithms.cc @@ -81,13 +81,15 @@ size_type exclusive_scan_counts( void partition_initializers( CoreParams const& params, TrackInitStateData const& init, - size_type count, StreamId) { // Partition the indices based on the track initializer charge + auto* counters = init.counters.data().get(); + auto count = std::min(counters->num_vacancies, counters->num_initializers); + if (count == 0) + return; auto* start = init.indices.data().get(); auto* end = start + count; - auto* counters = init.counters.data().get(); auto* stencil = init.initializers.data().get() + counters->num_initializers - count; std::stable_partition( diff --git a/src/celeritas/track/detail/TrackInitAlgorithms.cu b/src/celeritas/track/detail/TrackInitAlgorithms.cu index 2f19f2e419..7fc7483e8a 100644 --- a/src/celeritas/track/detail/TrackInitAlgorithms.cu +++ b/src/celeritas/track/detail/TrackInitAlgorithms.cu @@ -95,8 +95,6 @@ void remove_if_alive( Copier copy{{counters.get(), 1}, stream_id}; copy(MemSpace::host, {&host_counters, 1}); - stream.sync(); - return; #else auto& stream = device().stream(stream_id); // Calling with nullptr causes the function to return the amount of working @@ -125,8 +123,9 @@ void remove_if_alive( stream.get()); CELER_DISCARD(cub_error_code); CELER_DEVICE_API_CALL(PeekAtLastError()); - return; #endif + stream.sync(); + return; } //---------------------------------------------------------------------------// @@ -155,9 +154,6 @@ size_type exclusive_scan_counts( // Copy the last element (accumulated total) back to host auto result = ItemCopier{stream_id}(stop.get() - 1); - - stream.sync(); - return result; #else // Calling with nullptr causes the function to return the amount of working // space needed instead of invoking the kernel. @@ -175,18 +171,15 @@ size_type exclusive_scan_counts( data, counts.size(), stream.get()); - // Set the counter similar to the following - // counters.num_secondaries = "last value in the counts object; CELER_DISCARD(cub_error_code); CELER_DEVICE_API_CALL(PeekAtLastError()); // Copy the last element (accumulated total) back to host auto result = ItemCopier{stream_id}(data.get() + counts.size() - 1); - +#endif stream.sync(); return result; -#endif } //---------------------------------------------------------------------------// @@ -199,26 +192,27 @@ size_type exclusive_scan_counts( void partition_initializers( CoreParams const& params, TrackInitStateData const& init, - size_type count, StreamId stream_id) { - CELER_EXPECT(count != 0); - ScopedProfiling profile_this{"partition-initializers"}; -#if CELER_USE_THRUST // Partition the indices based on the track initializer charge - auto start = device_pointer_cast(init.indices.data()); - auto end = start + count; auto counters = device_pointer_cast(init.counters.data()); auto cpucntrs = ItemCopier{stream_id}(counters.get()); + size_type count = min(cpucntrs.num_vacancies, cpucntrs.num_initializers); + if (count == 0) + return; + // The initializers array is large. Use stencil to point to the start where + // this array is being used auto stencil = static_cast(init.initializers.data()) + cpucntrs.num_initializers - count; +#if CELER_USE_THRUST + auto* start = device_pointer_cast(init.indices.data()); + auto* end = start + count; thrust::stable_partition( thrust_execute_on(stream_id), start, end, IsNeutralStencil{params.ptr(), stencil}); - CELER_DEVICE_API_CALL(PeekAtLastError()); #else auto& stream = device().stream(stream_id); // CUB doesn't have a partition function that allows the user to specify @@ -227,13 +221,6 @@ void partition_initializers( // instead we create an iterator by using a functor to transform the // stencil values into boolean flags that determine how to partition // the indices. - // - // The initializers array is large. Use stencil to point to the start where - // this array is being used - auto counters = device_pointer_cast(init.counters.data()); - auto cpucntrs = ItemCopier{stream_id}(counters.get()); - auto stencil = static_cast(init.initializers.data()) - + cpucntrs.num_initializers - count; DeviceVector flags{count, stream_id}; # if CELER_CUB_HAS_TRANSFORM || CELER_HIPCUB_HAS_TRANSFORM // HIP defines hipCUB functions as [[nodiscard]], but we defer error checks @@ -254,7 +241,7 @@ void partition_initializers( IsNeutral{params.ptr()}); # endif // Calling with nullptr causes the function to return the amount of working - // space needed instead of invoking the kernel. + // space needed instead of invoking the kernel size_t temp_storage_bytes = 0; // CUB doesn't support in-place partitioning, so use a counting iterator // because the indices are always sequential from zero @@ -282,8 +269,9 @@ void partition_initializers( count, stream.get()); CELER_DISCARD(cub_error_code); - CELER_DEVICE_API_CALL(PeekAtLastError()); #endif + CELER_DEVICE_API_CALL(PeekAtLastError()); + stream.sync(); } //---------------------------------------------------------------------------// diff --git a/src/celeritas/track/detail/TrackInitAlgorithms.hh b/src/celeritas/track/detail/TrackInitAlgorithms.hh index 81d0b53201..5eaad1022f 100644 --- a/src/celeritas/track/detail/TrackInitAlgorithms.hh +++ b/src/celeritas/track/detail/TrackInitAlgorithms.hh @@ -59,12 +59,10 @@ size_type exclusive_scan_counts( void partition_initializers( CoreParams const&, TrackInitStateData const&, - size_type, StreamId); void partition_initializers( CoreParams const&, TrackInitStateData const&, - size_type, StreamId); //---------------------------------------------------------------------------// @@ -87,7 +85,6 @@ inline size_type exclusive_scan_counts( inline void partition_initializers( CoreParams const&, TrackInitStateData const&, - size_type, StreamId) { CELER_NOT_CONFIGURED("CUDA or HIP"); diff --git a/src/celeritas/track/detail/UpdateCountersExecutor.hh b/src/celeritas/track/detail/UpdateCountersExecutor.hh new file mode 100644 index 0000000000..b0745c53a7 --- /dev/null +++ b/src/celeritas/track/detail/UpdateCountersExecutor.hh @@ -0,0 +1,54 @@ +//------------------------------- -*- C++ -*- -------------------------------// +// Copyright Celeritas contributors: see top-level COPYRIGHT file for details +// SPDX-License-Identifier: (Apache-2.0 OR MIT) +//---------------------------------------------------------------------------// +//! \file celeritas/track/detail/UpdateCountersExecutor.hh +//---------------------------------------------------------------------------// +#pragma once + +#include "corecel/Assert.hh" +#include "corecel/Macros.hh" +#include "celeritas/Types.hh" +#include "celeritas/global/CoreTrackView.hh" + +#include "../TrackInitData.hh" + +namespace celeritas +{ +namespace detail +{ +//---------------------------------------------------------------------------// +/*! + * Create track initializers from primary particles. + */ +struct UpdateCountersExecutor +{ + //// DATA //// + + size_type num_primaries; + + //// FUNCTIONS //// + + // Update state counters based on the number of primaries + CELER_FORCEINLINE_FUNCTION void operator()(CoreTrackView& track); +}; + +//---------------------------------------------------------------------------// +/*! + * Update state counters based on the number of primaries. + */ +CELER_FORCEINLINE_FUNCTION void +UpdateCountersExecutor::operator()(CoreTrackView& track) +{ + CELER_EXPECT(track.thread_id() == ThreadId{0}); // single thread kernel + + // Update track initializers from primaries + track.counters().num_initializers += num_primaries; + // Mark that the primaries have been processed + track.counters().num_generated += num_primaries; + track.counters().num_pending = 0; +} + +//---------------------------------------------------------------------------// +} // namespace detail +} // namespace celeritas diff --git a/src/celeritas/track/detail/UpdateNumActiveExecutor.hh b/src/celeritas/track/detail/UpdateNumActiveExecutor.hh new file mode 100644 index 0000000000..cbef53fb41 --- /dev/null +++ b/src/celeritas/track/detail/UpdateNumActiveExecutor.hh @@ -0,0 +1,58 @@ +//------------------------------- -*- C++ -*- -------------------------------// +// Copyright Celeritas contributors: see top-level COPYRIGHT file for details +// SPDX-License-Identifier: (Apache-2.0 OR MIT) +//---------------------------------------------------------------------------// +//! \file celeritas/track/detail/UpdateNumActiveExecutor.hh +//---------------------------------------------------------------------------// +#pragma once + +#include "corecel/Assert.hh" +#include "corecel/Macros.hh" +#include "celeritas/Types.hh" + +#include "../TrackInitData.hh" + +namespace celeritas +{ +namespace detail +{ +//---------------------------------------------------------------------------// +/*! + * Update num_active state counter based on the number of vacancies. + */ +struct UpdateNumActiveExecutor +{ + //// DATA //// + + size_type state_size; + + //// FUNCTIONS //// + + // Update state counters based on the number of primaries + CELER_FORCEINLINE_FUNCTION void operator()(CoreTrackView& track); +}; + +//---------------------------------------------------------------------------// +/*! + * Update number of active trackes based on the number of vacancies. + */ +CELER_FORCEINLINE_FUNCTION void +UpdateNumActiveExecutor::operator()(CoreTrackView& track) +{ + CELER_EXPECT(track.thread_id() == ThreadId{0}); // single thread kernel + + size_type num_new_tracks = min(track.counters().num_vacancies, + track.counters().num_initializers); + if (num_new_tracks > 0) + { + // Update initializers/vacancies + track.counters().num_initializers -= num_new_tracks; + track.counters().num_vacancies -= num_new_tracks; + } + // Store number of active tracks at the start of the loop + track.counters().num_active = state_size - track.counters().num_vacancies; +} + +//---------------------------------------------------------------------------// +} // namespace detail +} // namespace celeritas diff --git a/test/celeritas/optical/OpticalUtils.test.cc b/test/celeritas/optical/OpticalUtils.test.cc index b14af49bc9..d645136532 100644 --- a/test/celeritas/optical/OpticalUtils.test.cc +++ b/test/celeritas/optical/OpticalUtils.test.cc @@ -16,6 +16,7 @@ #include "corecel/data/CollectionBuilder.hh" #include "corecel/data/Ref.hh" #include "corecel/math/Algorithms.hh" +#include "celeritas/optical/CoreState.hh" #include "celeritas/optical/action/detail/TrackInitAlgorithms.hh" #include "celeritas/optical/gen/detail/GeneratorAlgorithms.hh" @@ -42,19 +43,21 @@ std::vector locate_vacancies(std::vector const& input) StateVal host_status; make_builder(&host_status).insert_back(input.begin(), input.end()); StateVal status(host_status); + optical::TrackInitStateData init; - StateVal vacancies; - resize(&vacancies, status.size()); + resize(&init, StreamId{0}, host_status.size()); StateRef status_ref(status); - StateRef vacancies_ref(vacancies); - size_type num_vacancies = optical::detail::copy_if_vacant( - status_ref, vacancies_ref, StreamId{0}); + optical::TrackInitStateData init_ref; + init_ref = init; + optical::detail::copy_if_vacant(status_ref, init_ref, StreamId{0}); - auto host_vacancies = copy_to_host(vacancies); + auto host_vacancies = copy_to_host(init.vacancies); + auto host_counters_copy = copy_to_host(init.counters); + auto* host_counters = host_counters_copy.data().get(); std::vector result; - for (auto tid : range(TrackSlotId{num_vacancies})) + for (auto tid : range(TrackSlotId{host_counters->num_vacancies})) { result.push_back(static_cast(host_vacancies[tid].unchecked_get())); }