BNLNPPS · plexoos · Jun 22, 2026
diff --git a/.github/workflows/build-pull-request.yaml b/.github/workflows/build-pull-request.yaml
@@ -183,6 +183,22 @@ jobs:
         run: |
           docker pull ${{ env.IMAGE_NAME }}:${{ env.IMAGE_TAG }}
 
+      - name: Build and run external example
+        run: |
+          docker run --rm --gpus 'device=1' ${{ env.IMAGE_NAME }}:${{ env.IMAGE_TAG }} bash -lc '
+            set -euo pipefail
+            build=/tmp/simphony-examples-build
+            prefix=/tmp/simphony-examples-install
+            cmake -S "$SIMPHONY_HOME/examples" -B "$build" -DCMAKE_PREFIX_PATH="$SIMPHONY_PREFIX" -DCMAKE_INSTALL_PREFIX="$prefix"
+            cmake --build "$build" --target install
+            cd /tmp
+            "$prefix/bin/simphox" --cpu
+            test -f /tmp/out/photons.npy
+            rm -f /tmp/out/photons.npy
+            "$prefix/bin/simphox" --gpu
+            test -f /tmp/out/photons.npy
+          '
+
       - name: Run tests
         run: |
           docker run --rm --gpus 'device=1' ${{ env.IMAGE_NAME }}:${{ env.IMAGE_TAG }} bash -lc 'ctest --test-dir "$SIMPHONY_BUILD" --output-on-failure'

diff --git a/cmake/Config.cmake.in b/cmake/Config.cmake.in
@@ -9,6 +9,7 @@ find_dependency(nlohmann_json REQUIRED)
 find_dependency(plog REQUIRED)
 find_dependency(GLEW REQUIRED)
 find_dependency(glfw3 REQUIRED)
+find_dependency(glm REQUIRED)
 find_dependency(CUDAToolkit REQUIRED)
 find_dependency(OptiX 7 REQUIRED)
 find_dependency(Geant4 REQUIRED)

diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
@@ -1,14 +1,17 @@
 cmake_minimum_required(VERSION 3.22)
 
-project(simphox LANGUAGES CXX)
+project(simphox LANGUAGES CXX CUDA)
 
-set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD 20)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_CXX_EXTENSIONS OFF)
+set(CMAKE_CUDA_STANDARD 17)
+set(CMAKE_CUDA_STANDARD_REQUIRED ON)
+set(CMAKE_CUDA_EXTENSIONS OFF)
 
 find_package(simphony REQUIRED)
 
-add_executable(simphox simphox.cpp)
+add_executable(simphox simphox.cpp simphox_gpu.cu)
 target_link_libraries(simphox simphony::G4CX simphony::SysRap)
 
 install(TARGETS simphox)
diff --git a/examples/README.md b/examples/README.md
@@ -5,11 +5,14 @@ example by simply doing from this directory:
 cmake -S . -B build
 cmake --build build
 ./simphox
+./simphox --gpu
 ```
 
 It generates a configurable set of optical photons using the built-in torch
 configuration, converts them into an NP array, prints the data, and saves it as
-`out/photons.npy`.
+`out/photons.npy`. The default `cpu` backend generates photons on the host. The
+`gpu` backend generates the same kind of torch photons in a CUDA kernel and
+copies them back before writing the same output file.
 
 
 ## Examples

diff --git a/examples/simphox.cpp b/examples/simphox.cpp
@@ -1,23 +1,74 @@
 #include <iostream>
+#include <stdexcept>
 #include <string>
 #include <vector>
 
-#include "simphony/g4cx/G4CXOpticks.hh"
 #include "simphony/sysrap/NP.hh"
 #include "simphony/sysrap/sphoton.h"
 
 #include "simphony/sysrap/config.h"
 #include "simphony/sysrap/torch.h"
 
+#include "simphox_gpu.h"
+
 using namespace std;
 
 int main(int argc, char **argv)
 {
+    bool                   use_gpu = false;
+    constexpr unsigned int seed = 42;
+    unsigned int           num_photons = 0;
+
+    for (int i = 1; i < argc; i++)
+    {
+        string arg = argv[i];
+        if (arg == "--cpu")
+        {
+            use_gpu = false;
+        }
+        else if (arg == "--gpu")
+        {
+            use_gpu = true;
+        }
+        else if (arg == "--num-photons" && i + 1 < argc)
+        {
+            num_photons = static_cast<unsigned int>(stoul(argv[++i]));
+        }
+        else if (arg == "-h" || arg == "--help")
+        {
+            cout << "Usage: simphox [--cpu] [--gpu] [--num-photons N]" << endl;
+            return EXIT_SUCCESS;
+        }
+        else
+        {
+            cerr << "Unknown argument: " << arg << endl;
+            return EXIT_FAILURE;
+        }
+    }
+
     gphox::Config config("dev");
 
     cout << config.torch.desc() << endl;
+    cout << "backend " << (use_gpu ? "gpu" : "cpu") << endl;
+    cout << "seed " << seed << endl;
 
-    vector<sphoton> phs = generate_photons(config.torch);
+    vector<sphoton> phs;
+    if (!use_gpu)
+    {
+        phs = generate_photons(config.torch, num_photons, seed);
+    }
+    else
+    {
+        try
+        {
+            phs = generate_photons_gpu(config.torch, num_photons, seed);
+        }
+        catch (const exception& err)
+        {
+            cerr << "GPU photon generation failed: " << err.what() << endl;
+            return EXIT_FAILURE;
+        }
+    }
 
     size_t num_floats = phs.size() * 4 * 4;
     float *data = reinterpret_cast<float *>(phs.data());

diff --git a/examples/simphox_gpu.cu b/examples/simphox_gpu.cu
@@ -0,0 +1,94 @@
+#include "simphox_gpu.h"
+
+#include <cstddef>
+#include <memory>
+
+#include <cuda_runtime.h>
+#include <curand_kernel.h>
+
+#include "simphony/sysrap/CUDA_CHECK.h"
+#include "simphony/sysrap/srng.h"
+
+namespace
+{
+
+/// Deleter for device allocations held in a std::unique_ptr. It only runs when
+/// an exception unwinds past the owner, so the cudaFree status is not checked.
+struct DeviceFree
+{
+    void operator()(sphoton* ptr) const noexcept
+    {
+        cudaFree(ptr);
+    }
+};
+
+/// Generates one torch photon per thread and stores it in photons[idx], where
+/// idx is the global thread index. Threads with idx >= num_photons do nothing.
+__global__ void generate_photons_kernel(storch torch, sphoton* photons, unsigned int num_photons, unsigned int seed)
+{
+    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= num_photons)
+    {
+        return;
+    }
+
+    // Every thread shares the seed and uses its photon index as the cuRAND subsequence.
+    curandStatePhilox4_32_10 rng;
+    curand_init(seed, idx, 0, &rng);
+
+    // NOTE(review): qtorch appears to expose the torch (qt.t) in the form that
+    // storch::generate consumes (qt.q); confirm against storch.h.
+    qtorch qt;
+    qt.t = torch;
+
+    sphoton photon;
+    storch::generate(photon, rng, qt.q, idx, 0);
+    photons[idx] = photon;
+}
+
+} // namespace
+
+/// Generates torch photons in a CUDA kernel and copies them back to the host.
+///
+/// @param torch        Torch configuration passed by value to the kernel.
+/// @param num_photons  Number of photons; 0 selects torch.numphoton.
+/// @param seed         cuRAND seed shared by all photons.
+/// @return The generated photons; empty when the resolved count is 0.
+///
+/// NOTE(review): CUDA_CHECK / CUDA_SYNC_CHECK are assumed to throw on failure
+/// (simphox.cpp catches std::exception around this call); confirm in CUDA_CHECK.h.
+std::vector<sphoton> generate_photons_gpu(const storch& torch, unsigned int num_photons, unsigned int seed)
+{
+    if (num_photons == 0)
+    {
+        num_photons = torch.numphoton;
+    }
+
+    std::vector<sphoton> photons(num_photons);
+    if (num_photons == 0)
+    {
+        return photons;
+    }
+
+    const std::size_t num_bytes = photons.size() * sizeof(sphoton);
+
+    sphoton* raw_photons = nullptr;
+    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&raw_photons), num_bytes));
+    // Own the buffer from here on so a failed launch or copy does not leak it.
+    std::unique_ptr<sphoton, DeviceFree> d_photons(raw_photons);
+
+    constexpr unsigned int threads_per_block = 256;
+    // Ceiling division written so it cannot wrap for counts near UINT_MAX.
+    const unsigned int blocks = num_photons / threads_per_block + (num_photons % threads_per_block != 0 ? 1u : 0u);
+
+    generate_photons_kernel<<<blocks, threads_per_block>>>(torch, d_photons.get(), num_photons, seed);
+    CUDA_SYNC_CHECK();
+
+    CUDA_CHECK(cudaMemcpy(photons.data(), d_photons.get(), num_bytes, cudaMemcpyDeviceToHost));
+
+    // Release ownership before the checked free so the deleter cannot free it again.
+    sphoton* released = d_photons.release();
+    CUDA_CHECK(cudaFree(released));
+
+    return photons;
+}
diff --git a/examples/simphox_gpu.h b/examples/simphox_gpu.h
@@ -0,0 +1,8 @@
+#pragma once
+
+#include <vector>
+
+#include "simphony/sysrap/sphoton.h"
+#include "simphony/sysrap/storch.h"
+/// Generates torch photons on the GPU; num_photons == 0 selects torch.numphoton (see simphox_gpu.cu).
+std::vector<sphoton> generate_photons_gpu(const storch& torch, unsigned int num_photons = 0, unsigned int seed = 0);
diff --git a/sysrap/CMakeLists.txt b/sysrap/CMakeLists.txt
@@ -271,6 +271,7 @@ set(HEADERS
     tcomplex.h
 
     srng.h
+    srng_traits.h
     srngcpu.h
     scurand.h  
 

diff --git a/sysrap/storch.h b/sysrap/storch.h
@@ -35,6 +35,7 @@ Techniques to implement the spirit of the old torch gensteps in much less code
 //#include "scurand.h"
 #include "smath.h"
 #include "sphoton.h"
+#include "squad.h"
 #include "srng_traits.h"
 #include "storchtype.h"