diff --git a/core/src/CMakeLists.txt b/core/src/CMakeLists.txt
index 81f9db0aa..299eb6aa6 100644
--- a/core/src/CMakeLists.txt
+++ b/core/src/CMakeLists.txt
@@ -32,6 +32,7 @@ set(HEADERS_PUBLIC
   Cabana_Utils.hpp
   Cabana_VerletList.hpp
   Cabana_Version.hpp
+  Cabana_GaussianMixtureModel.hpp
   )
 
 if(Cabana_ENABLE_ARBORX)
@@ -58,7 +59,11 @@ endif()
 
 set(HEADERS_IMPL
   impl/Cabana_CartesianGrid.hpp
+  impl/Cabana_Erfinv.hpp
   impl/Cabana_Index.hpp
+  impl/Cabana_GaussianWeight.hpp
+  impl/Cabana_Hammersley.hpp
+  impl/Cabana_Matrix2d.hpp
   impl/Cabana_PerformanceTraits.hpp
   impl/Cabana_TypeTraits.hpp
   )
diff --git a/core/src/Cabana_Core.hpp b/core/src/Cabana_Core.hpp
index 393562327..fdda1896c 100644
--- a/core/src/Cabana_Core.hpp
+++ b/core/src/Cabana_Core.hpp
@@ -21,6 +21,7 @@
 #include <Cabana_AoSoA.hpp>
 #include <Cabana_DeepCopy.hpp>
 #include <Cabana_Fields.hpp>
+#include <Cabana_GaussianMixtureModel.hpp>
 #include <Cabana_LinkedCellList.hpp>
 #include <Cabana_MemberTypes.hpp>
 #include <Cabana_NeighborList.hpp>
diff --git a/core/src/Cabana_GaussianMixtureModel.hpp b/core/src/Cabana_GaussianMixtureModel.hpp
new file mode 100644
index 000000000..fc2ab68e5
--- /dev/null
+++ b/core/src/Cabana_GaussianMixtureModel.hpp
@@ -0,0 +1,1059 @@
+/****************************************************************************
+ * Copyright (c) 2023 by the Cabana authors                                 *
+ * All rights reserved.                                                     *
+ *                                                                          *
+ * This file is part of the Cabana library. Cabana is distributed under a   *
+ * BSD 3-clause license. For the licensing terms see the LICENSE file in    *
+ * the top-level directory.                                                 *
+ *                                                                          *
+ * SPDX-License-Identifier: BSD-3-Clause                                    *
+ ****************************************************************************/
+
+/*!
+  \file Cabana_GaussianMixtureModel.hpp
+  \brief Creation of a Gaussian Mixture Model
+*/
+#ifndef CABANA_GMM_HPP
+#define CABANA_GMM_HPP
+
+#include <typeinfo>
+
+#include <Kokkos_Core.hpp>
+#include <Kokkos_Random.hpp>
+#include <Cabana_Parallel.hpp>
+#include <impl/Cabana_GaussianWeight.hpp>
+
+/*!
+  Possible parameters to describe a Gaussian
+
+  In one dimension we use MuX and Cxx. In two dimension we assume a cylindrical
+  velocity space and use MuPar, MuPer, Cparpar and Cperper. The off-diagonal
+  elements Cparper and Cperpar are not currently in use. In three dimensions we
+  use all MuX, MuY, MuZ, as well as a full (symmetric) 3-by-3 covariance matrix.
+*/
+enum GaussianFields {
+    Weight,
+    // Can we make only the relevant fields vissible based on <dims>?
+    MuPar, MuPer,
+    Cparpar, Cparper,
+    Cperpar, Cperper,
+    MuX, MuY, MuZ,
+    Cxx, Cxy, Cxz,
+    Cyx, Cyy, Cyz,
+    Czx, Czy, Czz,
+    n_gaussian_param
+};
+
+
+namespace Cabana {
+
+// Everything that is implementation dependent and not for the user is in a
+// class indicating as much. The nicer high level user facing functions are at
+// the vbottom of the file
+
+template<unsigned int dims>
+class GMMImpl {
+public:
+
+/*!
+  Do a scan of the particles and compute the variance of the 1d velocity data
+
+  This is done for the particles where the entry in the cell slide matches c,
+  respecting the possibly different weights in the macro slice and the result is
+  reported in cov[0][0].
+*/
+template <typename CellSliceType, typename WeightSliceType, typename VelocitySliceType>
+static void variance(const CellSliceType& cell, const WeightSliceType& macro, const VelocitySliceType& velocity_x, const int c, double(&cov)[1][1]) {
+	using gmm_float_type = typename WeightSliceType::value_type;
+
+	gmm_float_type sum_x = 0.;
+	gmm_float_type sum_xx = 0.;
+	gmm_float_type N = 0;
+
+	// How does a single particle contribute to the total
+	auto _add = KOKKOS_LAMBDA(const int& i, gmm_float_type& local_N,
+	                          gmm_float_type& local_sum_x, gmm_float_type& local_sum_xx) {
+		if(cell(i) == c) {
+			local_N      += macro(i);
+			local_sum_x  += macro(i) * velocity_x(i);
+			local_sum_xx += macro(i) * velocity_x(i)*velocity_x(i);
+		}
+	};
+
+	// Execute
+	Kokkos::parallel_reduce("Particle Scan", velocity_x.size(), _add, N, sum_x, sum_xx);
+	const gmm_float_type mu_x = sum_x/gmm_float_type(N);
+	const gmm_float_type var_x = sum_xx/gmm_float_type(N) - mu_x*mu_x;
+
+	cov[0][0] = var_x;
+}
+
+/*!
+  Do a scan of the particles and compute the variance of the 2d velocity data
+
+  This is done for the particles where the entry in the cell slide matches c,
+  respecting the possibly different weights in the macro slice and the result is
+  reported in cov[0..1][0..1].
+*/
+template <typename CellSliceType, typename WeightSliceType, typename VelocitySliceType>
+static void variance(const CellSliceType& cell, const WeightSliceType& macro, const VelocitySliceType& velocity_par, const VelocitySliceType& velocity_per, const int c, double(&cov)[2][2]) {
+	using gmm_float_type = typename WeightSliceType::value_type;
+
+	gmm_float_type sum_par = 0.;
+	gmm_float_type sum_per = 0.;
+	gmm_float_type sum_parpar = 0.;
+	gmm_float_type sum_perper = 0.;
+	gmm_float_type sum_parper = 0.;
+	gmm_float_type N = 0;
+
+	// How does a single particle contribute to the total
+	auto _add = KOKKOS_LAMBDA(const int& i, gmm_float_type& local_N,
+	                          gmm_float_type& local_sum_par,  gmm_float_type& local_sum_per,
+	                          gmm_float_type& local_sum_parpar, gmm_float_type& local_sum_perper, gmm_float_type& local_sum_parper) {
+		if(cell(i) == c) {
+			local_N     += macro(i);
+			local_sum_par  += macro(i) * velocity_par(i);
+			local_sum_per  += macro(i) * velocity_per(i);
+			local_sum_parpar += macro(i) * velocity_par(i)*velocity_par(i);
+			local_sum_perper += macro(i) * velocity_per(i)*velocity_per(i);
+			local_sum_parper += macro(i) * velocity_par(i)*velocity_per(i);
+		}
+	};
+
+	// Execute
+	Kokkos::parallel_reduce("Particle Scan", velocity_par.size(), _add, N, sum_par, sum_per, sum_parpar, sum_perper, sum_parper);
+
+	printf("N = %e, sum_par=%e, sum_per = %e, sum_parpar = %e, sum_perper = %e, sum_parper = %e\n", N, sum_par, sum_per, sum_parpar, sum_perper, sum_parper);
+	const gmm_float_type mu_par = sum_par/gmm_float_type(N);
+	const gmm_float_type mu_per = 0.; // in cylindrical coordinates we don't expect this to be the perpendicular drift speed but sqrt(pi/2)*vthper
+	const gmm_float_type var_par = sum_parpar/gmm_float_type(N) - mu_par*mu_par;
+	const gmm_float_type var_per = sum_perper/gmm_float_type(N);// - mu_per*mu_per;
+	printf("var_par = %e, var_per = %e\n", var_par, var_per);
+
+	cov[0][0] = var_par;  cov[0][1] = 0.;
+	cov[1][0] = 0.;       cov[1][1] = var_per;
+
+	printf("variance: cell %d. var = %e %e, %e %e\n", c, cov[0][0], cov[0][1], cov[1][0], cov[1][1]);
+}
+
+/*!
+  Do a scan of the particles and compute the variance of the 3d velocity data
+
+  This is done for the particles where the entry in the cell slide matches c,
+  respecting the possibly different weights in the macro slice and the result is
+  reported in the 3x3 covariance matrix cov
+*/
+template <typename CellSliceType, typename WeightSliceType, typename VelocitySliceType>
+static void variance(const CellSliceType& cell, const WeightSliceType& macro, const VelocitySliceType& velocity_x, const VelocitySliceType& velocity_y, const VelocitySliceType& velocity_z, const int c, double(&cov)[3][3]) {
+	using gmm_float_type = typename WeightSliceType::value_type;
+
+	gmm_float_type sum_x = 0.;
+	gmm_float_type sum_y = 0.;
+	gmm_float_type sum_z = 0.;
+	gmm_float_type sum_xx = 0.;
+	gmm_float_type sum_yy = 0.;
+	gmm_float_type sum_zz = 0.;
+	gmm_float_type sum_xy = 0.;
+	gmm_float_type sum_xz = 0.;
+	gmm_float_type sum_yz = 0.;
+	gmm_float_type N = 0;
+
+	// How does a single particle contribute to the total
+	auto _add = KOKKOS_LAMBDA(const int& i, gmm_float_type& local_N,
+	                          gmm_float_type& local_sum_x,  gmm_float_type& local_sum_y,  gmm_float_type& local_sum_z,
+	                          gmm_float_type& local_sum_xx, gmm_float_type& local_sum_yy, gmm_float_type& local_sum_zz,
+	                          gmm_float_type& local_sum_xy, gmm_float_type& local_sum_xz, gmm_float_type& local_sum_yz) {
+		if(cell(i) == c) {
+			local_N     += macro(i);
+			local_sum_x  += macro(i) * velocity_x(i);
+			local_sum_y  += macro(i) * velocity_y(i);
+			local_sum_z  += macro(i) * velocity_z(i);
+			local_sum_xx += macro(i) * velocity_x(i)*velocity_x(i);
+			local_sum_yy += macro(i) * velocity_y(i)*velocity_y(i);
+			local_sum_zz += macro(i) * velocity_z(i)*velocity_z(i);
+			local_sum_xy += macro(i) * velocity_x(i)*velocity_y(i);
+			local_sum_xz += macro(i) * velocity_x(i)*velocity_z(i);
+			local_sum_yz += macro(i) * velocity_y(i)*velocity_z(i);
+		}
+	};
+
+	// Execute
+	Kokkos::parallel_reduce("Particle Scan", velocity_x.size(), _add, N, sum_x, sum_y, sum_z, sum_xx, sum_yy, sum_zz, sum_xy, sum_xz, sum_yz);
+	const gmm_float_type mu_x = sum_x/gmm_float_type(N);
+	const gmm_float_type mu_y = sum_y/gmm_float_type(N);
+	const gmm_float_type mu_z = sum_z/gmm_float_type(N);
+	const gmm_float_type var_x = sum_xx/gmm_float_type(N) - mu_x*mu_x;
+	const gmm_float_type var_y = sum_yy/gmm_float_type(N) - mu_y*mu_y;
+	const gmm_float_type var_z = sum_zz/gmm_float_type(N) - mu_z*mu_z;
+	const gmm_float_type cov_xy = sum_xy/gmm_float_type(N) - mu_x*mu_y;
+	const gmm_float_type cov_xz = sum_xz/gmm_float_type(N) - mu_x*mu_z;
+	const gmm_float_type cov_yz = sum_yz/gmm_float_type(N) - mu_y*mu_z;
+	//printf("N = %e, sum_x=%e, sum_y = %e, sum_z = %e, sum_xx = %e, sum_yy = %e, sum_zz = %e\n", N, sum_x, sum_y, sum_z, sum_xx, sum_yy, sum_zz);
+	//printf("var_x = %e, var_y = %e var_z = %e\n", var_x, var_y, var_z);
+	printf("var_iance: cell %d. sum_ = %e %e %e\n", c, sum_x,sum_y,sum_z);
+	printf("var_iance: cell %d. sum_^2 = %e %e %e\n", c, sum_xx,sum_yy,sum_zz);
+	printf("var_iance: cell %d. sum_ offdiag = %e %e %e\n", c, sum_xy,sum_xz,sum_yz);
+
+	printf("var_iance: cell %d. mu_ = %e %e %e\n", c, mu_x,mu_y,mu_z);
+	printf("var_iance: cell %d. var_ = %e %e %e\n", c, var_x,var_y,var_z);
+	printf("var_iance: cell %d. cov_ = %e %e %e\n", c, cov_xy,cov_xz,cov_yz);
+
+	cov[0][0] = var_x;  cov[0][1] = cov_xy; cov[0][2] = cov_xz;
+	cov[1][0] = cov_xy; cov[1][1] = var_y;  cov[1][2] = cov_yz;
+	cov[2][0] = cov_xz; cov[2][1] = cov_yz; cov[2][2] = var_z;
+
+	printf("variance: cell %d. var = %e %e %e, %e %e %e, %e %e %e\n", c, cov[0][0], cov[0][1], cov[0][2], cov[1][0], cov[1][1], cov[1][2], cov[2][0], cov[2][1], cov[2][2]);
+
+	//return (var_x+var_y+var_z)/3. ;
+}
+
+/*!
+  Compute the sum of alpha and return a version normalized to 1 in alpha_norm
+*/
+template<typename AlphaType>
+static void normalize(AlphaType& alpha_norm, const AlphaType& alpha) {
+	using gmm_float_type = typename AlphaType::value_type;
+
+	for(size_t c = 0; c < alpha.extent(0); c++) {
+		gmm_float_type sum = 0.;
+		int k_max = alpha.extent(1);
+		auto _add = KOKKOS_LAMBDA(const int& m, gmm_float_type& lsum) {
+			lsum += alpha(c,m);
+		};
+		Kokkos::parallel_reduce("Sum alpha", k_max, _add, sum);
+
+		auto _norm = KOKKOS_LAMBDA(const int&m) {
+			alpha_norm(c,m) = alpha(c,m) / sum;
+		};
+		Kokkos::parallel_for("Norm alpha", k_max, _norm);
+	}
+}
+
+
+/*!
+  Compute the w for each combination of a Gaussian and a particle by multiplying  alpha into u
+*/
+template<typename WType, typename AlphaType, typename UType>
+static void prefillW(WType w, const AlphaType& alpha_norm, const UType& u) {
+	using gmm_float_type = typename AlphaType::value_type;
+
+	auto _mult = KOKKOS_LAMBDA(const int& i) {
+		for(size_t c = 0; c < w.extent(0); c++) {
+			for(size_t m = 0; m < w.extent(1); m++) {
+				w(c,m,i) = alpha_norm(c,m) * u(c,m,i);
+			}
+		}
+	};
+	// We have way more particles than gaussians, so that is what we parallelize over
+	Kokkos::parallel_for("Prefill w", w.extent(2), _mult);
+}
+
+/*!
+  Compute the w_norm by normalizing w such that the sum over all Gaussians for a given particle is one
+*/
+template<typename WType>
+static void prefillWNorm(WType& w_norm, const WType& w) {
+	using gmm_float_type = typename WType::value_type;
+
+	auto _norm = KOKKOS_LAMBDA(const int& i) {
+		for(size_t c = 0; c < w.extent(0); c++) {
+			gmm_float_type sum = 0.;
+			for(size_t m = 0; m < w.extent(1); m++) {
+				sum += w(c,m,i);
+			}
+			for(size_t m = 0; m < w.extent(1); m++) {
+				if(sum == 0.) {
+					w_norm(c,m,i) = 0.;
+				} else {
+					w_norm(c,m,i) = w(c,m,i)/sum;
+				}
+			}
+		}
+	};
+	// We have way more particles than gaussians, so that is what we parallelize over
+	Kokkos::parallel_for("Prefill wnorm", w.extent(2), _norm);
+}
+
+
+/*!
+  recompute alpha as per line 10
+*/
+template<typename AlphaType, typename WType>
+static void updateAlpha(AlphaType& alpha, const WType& w_norm, const int c, const int m) {
+	using gmm_float_type = typename AlphaType::value_type;
+
+	int N = dims + dims*(dims+1)/2; // Number of actually independent degrees of freedom
+	if (dims == 1) {
+		N = 1 + 1*(1+1)/2;          // Number of actually independent degrees of freedom for each 1d Gaussian
+	} else if (dims == 2) {
+		N = 2 + 2;                  // Number of actually independent degrees of freedom for each uncorrelated(!) 2d Gaussian
+	} else if (dims == 3) {
+		N = 3 + 3*(3+1)/2;          // Number of actually independent degrees of freedom for each 3d Gaussian
+	}
+
+	Kokkos::View<gmm_float_type*> tmp1("tmp1", w_norm.extent(1));
+	auto _sett = KOKKOS_LAMBDA(const int j) {
+		gmm_float_type sum = 0.;
+		for(size_t n = 0; n < w_norm.extent(2); n++) {
+			sum += w_norm(c,j,n);
+		}
+		tmp1(j) = Kokkos::max(0., sum-N/2.);
+	};
+	Kokkos::parallel_for("compute tmp1", w_norm.extent(1), _sett);
+
+	gmm_float_type sum = 0.;
+	auto _sum = KOKKOS_LAMBDA(const int n, gmm_float_type& lsum) {
+		lsum += tmp1(n);
+	};
+	Kokkos::parallel_reduce("sum tmp1", w_norm.extent(1), _sum, sum);
+
+	auto _seta = KOKKOS_LAMBDA(const int j) {
+		if(j == m) {
+			alpha(c,j) = tmp1(j) / sum;
+		}
+	};
+
+	// Not actually parallel but should run on-device
+	Kokkos::parallel_for("update alpha", w_norm.extent(1), _seta);
+}
+
+
+/*!
+  compute weights u from the probability to get particles given one gaussian
+*/
+template <typename GaussianType, typename CellSliceType, typename VelocitySliceType, typename weight_type>
+static void updateWeights(weight_type u, const CellSliceType& cell, const VelocitySliceType vx, const VelocitySliceType& vy, const VelocitySliceType vz, const GaussianType& g_dev, const int c, const int m) {
+
+	using gmm_float_type = typename GaussianType::value_type;
+
+	if(u.extent(0) != g_dev.extent(0)) {
+		fprintf(stderr, "We need a separate value of the first index in u for each cell\n");
+		exit(1);
+	}
+	if(u.extent(1) != g_dev.extent(1)) {
+		fprintf(stderr, "We need a separate value of the second index in u for each gaussian\n");
+		exit(1);
+	}
+	if(u.extent(2) != vx.size()) {
+		fprintf(stderr, "We need a separate value of the third index in u for each particle\n");
+		exit(1);
+	}
+
+	if (dims == 1) {
+		VelocitySliceType velocity_x = vx;
+
+		// The value of u is given by the probability to draw a single particle from a Gaussian
+		auto _weight = KOKKOS_LAMBDA(const int s, const int i) {
+			int n = (s)*cell.vector_length + i;
+			if(cell(n) == c) {
+				gmm_float_type p = 0.;
+				gmm_float_type v[1] = {velocity_x(n)};
+				gmm_float_type Mu[1] = {g_dev(c,m,MuX)};
+				gmm_float_type C[1][1] = {{g_dev(c,m,Cxx)}};
+				p = Impl::GaussianWeight<gmm_float_type>::weight_1d(v, Mu, C);
+
+				u(c,m,n) = p;
+			} else {
+				u(c,m,n) = 0.;
+			}
+		};
+
+		// Define an execution policy
+		SimdPolicy<cell.vector_length, Kokkos::DefaultExecutionSpace> vec_policy(0, cell.size());
+
+		// Execute for all particles in parallel
+		simd_parallel_for(vec_policy, _weight, "weight()");
+
+	} else if (dims == 2) {
+		VelocitySliceType velocity_par = vx;
+		VelocitySliceType velocity_per = vy;
+
+		// The value of u is given by the probability to draw a single particle from a Gaussian
+		auto _weight = KOKKOS_LAMBDA(const int s, const int i) {
+			int n = (s)*cell.vector_length + i;
+			if(cell(n) == c) {
+				gmm_float_type p = 0.;
+				gmm_float_type v[2] = {velocity_par(n), velocity_per(n)};
+				gmm_float_type Mu[2] = {g_dev(c,m,MuPar), g_dev(c,m,MuPer)};
+				gmm_float_type C[2][2] = {{g_dev(c,m,Cparpar), g_dev(c,m,Cparper)},
+										{g_dev(c,m,Cperpar), g_dev(c,m,Cperper)}};
+				p = Impl::GaussianWeight<gmm_float_type>::weight_2d(v, Mu, C);
+
+				u(c,m,n) = p;
+			} else {
+				u(c,m,n) = 0.;
+			}
+		};
+
+		// Define an execution policy
+		SimdPolicy<cell.vector_length, Kokkos::DefaultExecutionSpace> vec_policy(0, cell.size());
+
+		// Execute for all particles in parallel
+		simd_parallel_for(vec_policy, _weight, "weight()");
+
+	} else if (dims == 3) {
+		VelocitySliceType velocity_x = vx;
+		VelocitySliceType velocity_y = vy;
+		VelocitySliceType velocity_z = vz;
+
+		// The value of u is given by the probability to draw a single particle from a Gaussian
+		auto _weight = KOKKOS_LAMBDA(const int s, const int i) {
+			int n = (s)*cell.vector_length + i;
+			if(cell(n) == c) {
+				gmm_float_type p = 0.;
+				gmm_float_type v[3] = {velocity_x(n), velocity_y(n), velocity_z(n)};
+				gmm_float_type Mu[3] = {g_dev(c,m,MuX), g_dev(c,m,MuY), g_dev(c,m,MuZ)};
+				gmm_float_type C[3][3] = {{g_dev(c,m,Cxx), g_dev(c,m,Cxy), g_dev(c,m,Cxz)},
+										{g_dev(c,m,Cyx), g_dev(c,m,Cyy), g_dev(c,m,Cyz)},
+										{g_dev(c,m,Czx), g_dev(c,m,Czy), g_dev(c,m,Czz)}};
+				p = Impl::GaussianWeight<gmm_float_type>::weight_3d(v, Mu, C);
+
+				u(c,m,n) = p;
+			} else {
+				u(c,m,n) = 0.;
+			}
+		};
+
+		// Define an execution policy
+		SimdPolicy<cell.vector_length, Kokkos::DefaultExecutionSpace> vec_policy(0, cell.size());
+
+		// Execute for all particles in parallel
+		simd_parallel_for(vec_policy, _weight, "weight()");
+	}
+
+}
+
+/*!
+  update gaussian m from the particles where we consider weights w_norm for the impact
+*/
+template <typename GaussianType, typename CellSliceType, typename VelocitySliceType, typename weight_type>
+static void updateGMM(GaussianType& g_dev, const CellSliceType& cell, const VelocitySliceType& vx, const VelocitySliceType& vy, const VelocitySliceType& vz, const weight_type& w_norm, const int c, const int m) {
+	using gmm_float_type = typename GaussianType::value_type;
+
+	if (dims == 1) {
+		auto velocity_x = vx;
+
+		auto _moments = KOKKOS_LAMBDA(const int i, gmm_float_type& lM0, gmm_float_type& lM1x, gmm_float_type& lM2xx) {
+			if(cell(i) == c) {
+				lM0   +=                               w_norm(c,m,i);
+				lM1x  += velocity_x(i)               * w_norm(c,m,i);
+				lM2xx += velocity_x(i)*velocity_x(i) * w_norm(c,m,i);
+			}
+		};
+
+		// Compute moments
+		gmm_float_type M0   = 0.;
+		gmm_float_type M1x  = 0.;
+		gmm_float_type M2xx = 0.;
+		Kokkos::parallel_reduce("update gaussian", w_norm.extent(2), _moments, M0, M1x, M2xx);
+
+		// Compute underlying parameters
+		auto _update = KOKKOS_LAMBDA(const int j) {
+			if(j == m) {
+				g_dev(c,m,MuX) = M1x/M0;
+				g_dev(c,m,Cxx) = M2xx/M0 - (M1x/M0)*(M1x/M0);
+			}
+		};
+
+		// Not actually parallel but should be on-device
+		Kokkos::parallel_for("update Gaussian", g_dev.extent(1), _update);
+	} else if (dims == 2) {
+		auto velocity_par = vx;
+		auto velocity_per = vy;
+
+		auto _moments = KOKKOS_LAMBDA(const int i, gmm_float_type& lM0,
+								   gmm_float_type& lM1par, gmm_float_type& lM1per,
+								   gmm_float_type& lM2parpar, gmm_float_type& lM2parper, gmm_float_type& lM2perper) {
+			if(cell(i) == c) {
+				lM0       +=                                   w_norm(c,m,i) / velocity_per(i);
+				lM1par    += velocity_par(i)                 * w_norm(c,m,i) / velocity_per(i);
+				lM1per    += velocity_per(i)                 * w_norm(c,m,i) / velocity_per(i);
+				lM2parpar += velocity_par(i)*velocity_par(i) * w_norm(c,m,i) / velocity_per(i);
+				lM2parper += velocity_par(i)*velocity_per(i) * w_norm(c,m,i) / velocity_per(i);
+				lM2perper += velocity_per(i)*velocity_per(i) * w_norm(c,m,i) / velocity_per(i);
+			}
+		};
+
+		// Compute moments
+		gmm_float_type M0 = 0.;
+		gmm_float_type M1par = 0.;
+		gmm_float_type M1per = 0.;
+		gmm_float_type M2parpar = 0.;
+		gmm_float_type M2parper = 0.;
+		gmm_float_type M2perper = 0.;
+		Kokkos::parallel_reduce("update gaussian", w_norm.extent(2), _moments, M0, M1par,M1per, M2parpar,M2parper,M2perper);
+
+		// Compute underlying parameters
+		auto _update = KOKKOS_LAMBDA(const int j) {
+			if(j == m) {
+				g_dev(c,m,MuPar) = M1par / M0;
+				g_dev(c,m,MuPer) = 0.;
+				g_dev(c,m,Cparpar) = M2parpar/M0 - (M1par/M0)*(M1par/M0);
+				g_dev(c,m,Cparper) = 0.;
+				g_dev(c,m,Cperpar) = 0.;
+				g_dev(c,m,Cperper) = M2perper/M0; // Eq. 93 has a factor 2 here
+
+				gmm_float_type disc = 4.*M1per*M1per - M_PI*M0*M2perper;
+				//printf("c=%d,m=%d, M0=%f,M1per=%f,M2perper=%f, disc=%f\n", c,m, M0, M1per, M2perper, disc);
+				gmm_float_type alpha, cperper;
+				if(disc < 0.) {
+					alpha = 0.;
+					cperper =  M2perper/M0;
+				} else {
+					gmm_float_type rad = (disc + 2.*M1per*Kokkos::sqrt(disc)) / (M_PI*M_PI*M0*M2perper);
+					alpha = Kokkos::sqrt(rad);
+					cperper = 0.5*M_PI*M2perper*M2perper / (8.*M1per*M1per - M_PI*M0*M2perper + 4.*M1per*Kokkos::sqrt(disc));
+				}
+				gmm_float_type uper = 2.*alpha*cperper;
+				//printf("c=%d,m=%d, alpha=%f, MuPer=%f, Cperper=%f\n", c,m, alpha, uper, cperper);
+				if(alpha > 0.01) {
+					// Iterative solution
+					double I0 = Kokkos::Experimental::cyl_bessel_i0<Kokkos::complex<double>, double, int>(Kokkos::complex(0.25*uper*uper/cperper)).real();
+					double I1 = Kokkos::Experimental::cyl_bessel_i1<Kokkos::complex<double>, double, int>(Kokkos::complex(0.25*uper*uper/cperper)).real();
+
+					double Exp = Kokkos::exp(-0.25*uper*uper/cperper);
+					for(int i = 0; i<500; i++) {
+						uper    = Kokkos::sqrt((-M_PI*Exp*Exp*I0*I1*M0*M2perper + 2*M1per*(-M1per + Kokkos::sqrt(M_PI*Exp*Exp*I0*I1*M0*M2perper + M_PI*Exp*Exp*I1*I1*M0*M2perper + M1per*M1per)))/M0/M0)/(Kokkos::sqrt(M_PI)*Exp*I1);
+						cperper = (M_PI*Exp*Exp*I0*I1*M0*M2perper + M_PI*Exp*Exp*I1*I1*M0*M2perper + 2*M1per*(M1per - Kokkos::sqrt(M_PI*Exp*Exp*I0*I1*M0*M2perper + M_PI*Exp*Exp*I1*I1*M0*M2perper + M1per*M1per)))/(2*M_PI*Exp*Exp*I1*I1*M0*M0);
+						I0  = Kokkos::Experimental::cyl_bessel_i0<Kokkos::complex<double>, double, int>(Kokkos::complex(0.25*uper*uper/cperper)).real();
+						I1  = Kokkos::Experimental::cyl_bessel_i1<Kokkos::complex<double>, double, int>(Kokkos::complex(0.25*uper*uper/cperper)).real();
+						Exp = Kokkos::exp(-0.25*uper*uper/cperper);
+					}
+					//printf("c=%d,m=%d, alpha=%f, MuPer=%f, Cperper=%f\n", c,m, alpha, uper, cperper);
+				}
+				g_dev(c,m,MuPer)  = uper;
+				g_dev(c,m,Cperper) = cperper;
+			}
+		};
+
+		// This shouldn't actually be parallel, but it should be on-device
+		Kokkos::parallel_for("update Gaussian", g_dev.extent(1), _update);
+	} else if (dims == 3) {
+		auto velocity_x = vx;
+		auto velocity_y = vy;
+		auto velocity_z = vz;
+
+		auto _moments = KOKKOS_LAMBDA(const int i, gmm_float_type& lM0,
+								   gmm_float_type& lM1x, gmm_float_type& lM1y, gmm_float_type& lM1z,
+								   gmm_float_type& lM2xx, gmm_float_type& lM2xy, gmm_float_type& lM2xz,
+								   gmm_float_type& lM2yx, gmm_float_type& lM2yy, gmm_float_type& lM2yz,
+								   gmm_float_type& lM2zx, gmm_float_type& lM2zy, gmm_float_type& lM2zz) {
+			if(cell(i) == c) {
+				lM0   +=                               w_norm(c,m,i);
+				lM1x  += velocity_x(i)               * w_norm(c,m,i);
+				lM1y  += velocity_y(i)               * w_norm(c,m,i);
+				lM1z  += velocity_z(i)               * w_norm(c,m,i);
+				lM2xx += velocity_x(i)*velocity_x(i) * w_norm(c,m,i);
+				lM2xy += velocity_x(i)*velocity_y(i) * w_norm(c,m,i);
+				lM2xz += velocity_x(i)*velocity_z(i) * w_norm(c,m,i);
+				lM2yx += velocity_y(i)*velocity_x(i) * w_norm(c,m,i);
+				lM2yy += velocity_y(i)*velocity_y(i) * w_norm(c,m,i);
+				lM2yz += velocity_y(i)*velocity_z(i) * w_norm(c,m,i);
+				lM2zx += velocity_z(i)*velocity_x(i) * w_norm(c,m,i);
+				lM2zy += velocity_z(i)*velocity_y(i) * w_norm(c,m,i);
+				lM2zz += velocity_z(i)*velocity_z(i) * w_norm(c,m,i);
+			}
+		};
+
+		// Compute moments
+		gmm_float_type M0 = 0.;
+		gmm_float_type M1x = 0.;
+		gmm_float_type M1y = 0.;
+		gmm_float_type M1z = 0.;
+		gmm_float_type M2xx = 0.;
+		gmm_float_type M2xy = 0.;
+		gmm_float_type M2xz = 0.;
+		gmm_float_type M2yx = 0.;
+		gmm_float_type M2yy = 0.;
+		gmm_float_type M2yz = 0.;
+		gmm_float_type M2zx = 0.;
+		gmm_float_type M2zy = 0.;
+		gmm_float_type M2zz = 0.;
+		Kokkos::parallel_reduce("update gaussian", w_norm.extent(2), _moments, M0, M1x,M1y,M1z, M2xx,M2xy,M2xz, M2yx,M2yy,M2yz, M2zx,M2zy,M2zz);
+
+		// Compute underlying parameters
+		auto _update = KOKKOS_LAMBDA(const int j) {
+			if(j == m) {
+				g_dev(c,m,MuX) = M1x/M0;
+				g_dev(c,m,MuY) = M1y/M0;
+				g_dev(c,m,MuZ) = M1z/M0;
+				g_dev(c,m,Cxx) = M2xx/M0 - (M1x/M0)*(M1x/M0);
+				g_dev(c,m,Cxy) = M2xy/M0 - (M1x/M0)*(M1y/M0);
+				g_dev(c,m,Cxz) = M2xz/M0 - (M1x/M0)*(M1z/M0);
+				g_dev(c,m,Cyx) = M2yx/M0 - (M1y/M0)*(M1x/M0);
+				g_dev(c,m,Cyy) = M2yy/M0 - (M1y/M0)*(M1y/M0);
+				g_dev(c,m,Cyz) = M2yz/M0 - (M1y/M0)*(M1z/M0);
+				g_dev(c,m,Czx) = M2zx/M0 - (M1z/M0)*(M1x/M0);
+				g_dev(c,m,Czy) = M2zy/M0 - (M1z/M0)*(M1y/M0);
+				g_dev(c,m,Czz) = M2zz/M0 - (M1z/M0)*(M1z/M0);
+			}
+		};
+
+		// This shouldn't actually be parallel, but it should be on-device
+		Kokkos::parallel_for("update Gaussian", g_dev.extent(1), _update);
+	}
+}
+
+
+/*!
+  Find the component that has the smallest remaining non-zero alpha_norm
+*/
+template<typename AlphaType>
+static size_t findWeakestComponent(const AlphaType& alpha_norm, const int c) {
+	using gmm_float_type = typename AlphaType::value_type;
+
+	auto alpha_host = Kokkos::create_mirror_view(Kokkos::HostSpace(), alpha_norm);
+	Kokkos::deep_copy(alpha_host, alpha_norm);
+	size_t idx = alpha_host.extent(1) + 1;
+	gmm_float_type min_weight = std::numeric_limits<gmm_float_type>::infinity();
+	for(size_t m = 0; m < alpha_host.extent(1); m++) {
+		if(alpha_host(c,m) > 0. && alpha_host(c,m) < min_weight) {
+			min_weight = alpha_host(c,m);
+			idx = m;
+		}
+	}
+
+	if(idx == alpha_host.extent(1) + 1) {
+		printf("No component left to remove\n");
+		exit(1);
+	}
+
+	return idx;
+}
+
+
+/*!
+  remove the selected component from the Gaussian Mixture Model
+*/
+template <typename AlphaType, typename WType>
+static void removeGaussianComponent(AlphaType& alpha, AlphaType& alpha_norm, WType& w, WType& w_norm, WType& u, const size_t c, const size_t m) {
+	printf("removing component m = %zd from cell %zd\n", m, c);
+
+	auto _remove = KOKKOS_LAMBDA(const int n) {
+		if(n == 0) { // Should we allow conflicting writes instead of branching?
+			alpha(c,m) = 0.;
+			alpha_norm(c,m) = 0.;
+		}
+		w(c,m,n) = 0.;
+		w_norm(c,m,n) = 0.;
+		u(c,m,n) = 0.;
+	};
+
+	Kokkos::parallel_for("remove smallest Gaussian", w_norm.extent(2), _remove);
+}
+
+/*!
+  Compute the Gaussian Mixture Model given the particle information
+*/
+template <typename GaussianType, typename CellSliceType, typename WeightSliceType, typename VelocitySliceType>
+static void implReconstructGMM(GaussianType& gaussians, const double eps, const int seed, CellSliceType const& cell, WeightSliceType const& weight, VelocitySliceType const& vx, VelocitySliceType const& vy, VelocitySliceType const& vz) {
+	using gmm_float_type = typename GaussianType::value_type;
+
+	const int Nparticles = vx.size();
+	const int c_max  = gaussians.extent(0); // Maximum number of Cells
+	const size_t k_max  = gaussians.extent(1); // Maximum number of Gaussians
+	int N;
+	if(dims == 1) {
+		N     = 1 + 1*(1+1)/2;       // Number of actually independent degrees of freedom for each 1d Gaussian
+	} else if (dims == 2) {
+		N     = 2 + 2;               // Number of actually independent degrees of freedom for each uncorrelated(!) 2d Gaussian
+	} else if (dims == 3) {
+		N     = 3 + 3*(3+1)/2;       // Number of actually independent degrees of freedom for each 3d Gaussian
+	}
+
+	// Line 1 in Fig. 2
+
+	Kokkos::View<gmm_float_type**> alpha("alpha", c_max, k_max);
+	Kokkos::View<gmm_float_type**> alpha_norm("alpha norm", c_max, k_max);
+
+	auto theta = Kokkos::create_mirror_view(Kokkos::DefaultExecutionSpace::memory_space(), gaussians);
+	Kokkos::deep_copy(theta, gaussians);
+
+	// Parameters for the case where we have the best number of gaussians
+	auto theta_best = Kokkos::create_mirror(theta);
+	auto alpha_best = Kokkos::create_mirror(alpha_norm);
+
+	// Generate initial guesses for all Gaussians
+	Kokkos::Random_XorShift64_Pool<> random_pool(seed);
+
+	if (dims == 1) {
+		VelocitySliceType velocity_x = vx;
+
+		for(int c = 0; c < c_max; c++) {
+			double cov[1][1];
+			GMMImpl<1>::variance(cell, weight, velocity_x, c, cov);
+			printf("cell %d. var = %e\n", c, cov[0][0]);
+
+			// Define how to initialize one Gaussian
+			auto _init = KOKKOS_LAMBDA(const int m) {
+				auto generator = random_pool.get_state();
+				theta(c,m,Weight) = 0.; // can we use this instead of alpha?
+				// Are we worried that me might draw the same particle multiple times?
+				theta(c,m,MuX) = velocity_x(generator.drand(0, Nparticles)); // FIXME: Draw a particle in the right cell
+				theta(c,m,Cxx) = cov[0][0];
+				alpha(c,m) = 1.;
+				random_pool.free_state(generator);
+			};
+
+			// execute for all Gaussians
+			Kokkos::parallel_for("initial guesses", k_max, _init);
+		}
+	} else if (dims == 2) {
+		VelocitySliceType velocity_par = vx;
+		VelocitySliceType velocity_per = vz;
+
+		for(int c = 0; c < c_max; c++) {
+			double cov[2][2];
+			GMMImpl<2>::variance(cell, weight, velocity_par, velocity_per, c, cov);
+			printf("cell %d. var = %e %e, %e %e\n", c, cov[0][0], cov[0][1], cov[1][0], cov[1][1]);
+
+			// Define how to initialize one Gaussian
+			auto _init = KOKKOS_LAMBDA(const int m) {
+				auto generator = random_pool.get_state();
+				theta(c,m,Weight) = 0.; // can we use this instead of alpha?
+				// Are we worried that me might draw the same particle multiple times?
+				const int particle_idx = generator.drand(0, Nparticles); // FIXME: Draw a particle in the right cell
+				theta(c,m,MuPar) = velocity_par(particle_idx);
+				theta(c,m,MuPer) = velocity_per(particle_idx);
+				theta(c,m,Cparpar) = cov[0][0];
+				theta(c,m,Cparper) = cov[0][1];
+				theta(c,m,Cperpar) = cov[1][0];
+				theta(c,m,Cperper) = cov[1][1];
+				alpha(c,m) = 1.;
+				random_pool.free_state(generator);
+			};
+
+			// execute for all Gaussians
+			Kokkos::parallel_for("initial guesses", k_max, _init);
+		}
+	} else if (dims == 3) {
+		VelocitySliceType velocity_x = vx;
+		VelocitySliceType velocity_y = vy;
+		VelocitySliceType velocity_z = vz;
+
+		for(int c = 0; c < c_max; c++) {
+			double cov[3][3];
+			GMMImpl<3>::variance(cell, weight, velocity_x, velocity_y, velocity_z, c, cov);
+			printf("cell %d. var = %e %e %e, %e %e %e, %e %e %e\n", c, cov[0][0], cov[0][1], cov[0][2], cov[1][0], cov[1][1], cov[1][2], cov[2][0], cov[2][1], cov[2][2]);
+
+			// Define how to initialize one Gaussian
+			auto _init = KOKKOS_LAMBDA(const int m) {
+				auto generator = random_pool.get_state();
+				theta(c,m,Weight) = 0.; // can we use this instead of alpha?
+				// Are we worried that me might draw the same particle multiple times?
+				const int particle_idx = generator.drand(0, Nparticles); // FIXME: Draw a particle in the right cell
+				theta(c,m,MuX) = velocity_x(particle_idx);
+				theta(c,m,MuY) = velocity_y(particle_idx);
+				theta(c,m,MuZ) = velocity_z(particle_idx);
+				theta(c,m,Cxx) = cov[0][0];
+				theta(c,m,Cxy) = cov[0][1];
+				theta(c,m,Cxz) = cov[0][2];
+				theta(c,m,Cyx) = cov[1][0];
+				theta(c,m,Cyy) = cov[1][1];
+				theta(c,m,Cyz) = cov[1][2];
+				theta(c,m,Czx) = cov[2][0];
+				theta(c,m,Czy) = cov[2][1];
+				theta(c,m,Czz) = cov[2][2];
+			};
+
+			// execute for all Gaussians
+			Kokkos::parallel_for("initial guesses", k_max, _init);
+		}
+	}
+
+	// print initial gaussians
+	auto thetahost = Kokkos::create_mirror(theta);
+	Kokkos::deep_copy(thetahost, theta);
+	printf("Initital Guess for gaussians (using diag entries):\n");
+	for(size_t c = 0; c < thetahost.extent(0); c++) {
+		for(size_t m = 0; m < k_max; m++) {
+#if VelocityDimensions == 1
+			printf("Gaussian(%zd) in cell %zd weight=%f, Mu=%f, Cov=%f\n", m, c, thetahost(c,m,Weight), thetahost(c,m,MuX), thetahost(c,m,Cxx));
+#elif VelocityDimensions == 2
+			printf("Gaussian(%zd) in cell %zd weight=%f, Mu=(%f,%f), Cov=((%f,%f),(%f,%f))\n", m, c, thetahost(c,m,Weight), thetahost(c,m,MuPar),thetahost(c,m,MuPer),
+				thetahost(c,m,Cparpar), thetahost(c,m,Cparper),
+				thetahost(c,m,Cperpar), thetahost(c,m,Cperper));
+#elif VelocityDimensions == 3
+			printf("Gaussian(%zd) in cell %zd weight=%f, Mu=(%f,%f,%f), Cov=((%f,%f,%f),(%f,%f,%f),(%f,%f,%f))\n", m, c, thetahost(c,m,Weight), thetahost(c,m,MuX),thetahost(c,m,MuY),thetahost(c,m,MuZ),
+				thetahost(c,m,Cxx), thetahost(c,m,Cxy), thetahost(c,m,Cxz),
+				thetahost(c,m,Cyx), thetahost(c,m,Cyy), thetahost(c,m,Cyz),
+				thetahost(c,m,Czx), thetahost(c,m,Czy), thetahost(c,m,Czz));
+#endif
+		}
+	}
+
+
+	// normalize alpha to sum 1
+	normalize(alpha_norm, alpha);
+
+	// at some points we need to have alpha_norm on the host
+	auto alpha_host = Kokkos::create_mirror_view(Kokkos::HostSpace(), alpha_norm);
+
+	// Line 4 in Fig. 2
+	// Compute the probabilities of all particles relative to all gaussians
+	Kokkos::View<gmm_float_type***> u("u", c_max,k_max,Nparticles);
+	for(int c = 0; c < c_max; c++) {
+		for(size_t m = 0; m < k_max; m++) {
+			updateWeights(u, cell, vx,vy,vz, theta, c,m);
+		}
+	}
+
+	// Compute weights
+	Kokkos::View<gmm_float_type***> w("w", c_max,k_max,Nparticles);
+	prefillW(w, alpha_norm, u);
+
+	// Normalize weights
+	Kokkos::View<gmm_float_type***> w_norm("w_norm", c_max,k_max,Nparticles);
+	prefillWNorm(w_norm, w);
+
+	for(int c = 0; c < c_max; c++) { // For now we just do things cell by cell
+		printf("#\n# c = %d\n#\n", c);
+
+		// Line 3 in Fig. 2
+		int knz = k_max;
+		gmm_float_type Lmin = std::numeric_limits<gmm_float_type>::infinity();
+
+		// Line 5 in Fig. 2
+		while(knz>0) {
+			int t = 0;
+			printf("\nknz = %d\n", knz);
+			fflush(NULL);
+
+			bool is_first_iter = true;
+			gmm_float_type L = std::numeric_limits<gmm_float_type>::infinity();
+			gmm_float_type Lold = std::numeric_limits<gmm_float_type>::infinity();
+			// Line 6 in Fig. 2
+			bool converged = false;
+			while(!converged) {
+				// Line 7 in Fig. 2
+				t += 1;
+
+				// Line 8 in Fig. 2
+				for(size_t m = 0; m < k_max; m++) {
+					// Get a current copy of alpha_norm
+					Kokkos::deep_copy(alpha_host, alpha_norm);
+					if(alpha_host(c,m) <= 0.) {
+						continue; // with next m
+					}
+
+					// Line 9 of Fig. 2
+					auto _updatew = KOKKOS_LAMBDA(const int& i) {
+						w(c,m,i) = alpha_norm(c,m) * u(c,m,i);
+					};
+					Kokkos::parallel_for("Update w", Nparticles, _updatew);
+					auto _updatewnorm = KOKKOS_LAMBDA(const int& i) {
+						gmm_float_type sum = 0.;
+						for(size_t mprime = 0; mprime < k_max; mprime++) {
+							sum += w(c,mprime,i);
+						}
+						if(sum == 0.) {
+							w_norm(c,m,i) = 0.;
+						} else {
+							w_norm(c,m,i) = weight(i) * w(c,m,i) / sum;
+						}
+						};
+					Kokkos::parallel_for("Update wnorm", Nparticles, _updatewnorm);
+
+					// Line 10 of Fig. 2
+					updateAlpha(alpha, w_norm, c,m);
+
+					// Line 11 of Fig. 2
+					// normalize alpha to sum 1
+					normalize(alpha_norm, alpha);
+
+					Kokkos::deep_copy(alpha_host, alpha_norm);
+
+					// Line 12 of Fig. 2
+					// Get a current copy of alpha_norm
+					Kokkos::deep_copy(alpha_host, alpha_norm);
+					if(alpha_host(c,m) > 0.) {
+						// Line 13 of Fig. 2
+						updateGMM(theta, cell, vx,vy,vz, w_norm, c,m);
+
+						// Line 14 of Fig. 2
+						// update u
+						updateWeights(u, cell, vx,vy,vz, theta, c,m);
+						// update w
+						Kokkos::parallel_for("Update w", Nparticles, _updatew);
+						// update w_norm
+						Kokkos::parallel_for("Update wnorm", Nparticles, _updatewnorm);
+					} else { // Line 15 of Fig. 2
+						printf("alpha(%d,%lu) <= 0\n", c,m);
+						removeGaussianComponent(alpha, alpha_norm, w, w_norm, u, c,m);
+						// Line 16 of Fig. 2
+						// count how many gaussians have alpha_norm(m)>0
+						int new_knz = 0;
+						for(size_t mprime = 0; mprime < k_max; mprime++) {
+							if(alpha_host(c,mprime) > 0.) {
+								new_knz++;
+							}
+						}
+						knz = new_knz;
+						printf("knz = %d\n", knz);
+						if(knz <= 0) {
+							Kokkos::abort("Unexpectedly all gaussian weights have fallen below zero. Construction of Gaussian mixture model failed.");
+						}
+					} // Line 17 of Fig. 2
+				} // Line 18 of Fig. 2
+				// Line 19 of Fig. 2
+				auto thetat = Kokkos::create_mirror(theta);
+				Kokkos::deep_copy(thetat, theta);
+
+				// Line 20 of Fig. 2
+				gmm_float_type term1b = 0.;
+				Kokkos::deep_copy(alpha_host, alpha_norm);
+				for(size_t mprime = 0; mprime < k_max; mprime++) {
+					if(alpha_host(c,mprime) > 0.) {
+						term1b += std::log(alpha_host(c,mprime));
+					}
+				}
+
+				gmm_float_type term2 = 0.;
+				auto _term2 = KOKKOS_LAMBDA(const int i, gmm_float_type& lsum) {
+					gmm_float_type tmp2 = 0.;
+					for(size_t mprime = 0; mprime < k_max; mprime++) {
+						if(alpha_norm(c,mprime) > 0.) {
+							tmp2 += alpha_norm(c,mprime) * u(c,mprime,i);
+						}
+					}
+					if(tmp2 > 0.) {
+						lsum += weight(i) * Kokkos::log(tmp2);
+					}
+				};
+				Kokkos::parallel_reduce("get term2", Nparticles, _term2, term2);
+
+				Lold = L;
+				gmm_float_type d = knz * N + knz - 1.;
+				L = N/2. * term1b + 0.5*d*std::log(Nparticles) - term2;
+
+				// Line 21 of Fig 2.
+				if(is_first_iter) {
+					is_first_iter = false;
+				} else {
+					//printf("knz = %d, Lold = %f, L = %f\n", knz, Lold, L);
+					if(fabs(Lold-L) < eps*fabs(L)) {
+						printf("converged on %d gaussians with L= %f in %d iterations!\n", knz, L, t);
+						converged = true;
+					}
+				}
+			}
+
+			// Line 22 of Fig. 2
+			if(L < Lmin) {
+				size_t mu_min = findWeakestComponent(alpha_norm, c);
+				Kokkos::deep_copy(alpha_host, alpha_norm);
+				if(alpha_host(c,mu_min) * Nparticles >= pow(2,dims)) {
+					// Line 23
+					Lmin = L;
+					// Line 24
+					Kokkos::deep_copy(theta_best, theta);
+					Kokkos::deep_copy(alpha_best, alpha_norm);
+					printf("new Lmin = %f for knz = %d\n", Lmin, knz);
+				}
+			} // Line 25
+
+			// Line 26 of Fig. 2
+			if(knz > 1) { // don't remove the last Gaussian
+				size_t mu_min = findWeakestComponent(alpha_norm, c);
+				Kokkos::deep_copy(alpha_host, alpha_norm);
+				if(alpha_host(c,mu_min) * Nparticles < pow(2,dims)) {
+					printf("component predicts zero particles, removing\n");
+				} else {
+					printf("weakest component isn't that weak, why should this make anything better?\n");
+				}
+				removeGaussianComponent(alpha, alpha_norm, w, w_norm, u, c, mu_min);
+			}
+			knz--;
+
+		} // Line 27
+
+		Kokkos::deep_copy(gaussians, theta_best); // When we are done, copy the gaussians back to the host
+
+		// Copy back alpha norm and set the weights of the Gaussians from that
+		Kokkos::deep_copy(alpha_host, alpha_best);
+		for(size_t m = 0; m < k_max; m++) {
+			gaussians(c,m,Weight) = alpha_host(c,m);
+		}
+
+		// copy that value of theta_best back to the device, because we trashed
+		// out theta there, trying smaller numbers of gaussians in that cell
+		Kokkos::deep_copy(theta, gaussians);
+	}
+
+	printf("Reconstruction done\n");
+}
+
+
+}; // end of GMMImpl class
+
+
+/*!
+  Reconstruct a Gaussian Mixture Model in one dimension
+
+  eps set the limit on relative change of the penalized log liklyhood function
+  when we consider the reconstruction to be converged. seed is used when drawing
+  the initial random Gaussians that the algorithm starts from. Reconstruction is
+  done separately for each unique value in cell. This slice is assume to be a
+  dense set of integers from 0 to gaussians.extent(0). The entries in weight can
+  be all equal or can be different for particles with different statistical
+  weight / macro factor, but it is assumed that the sum of all weights is equal
+  to the number of particles. All particle slices have to have the same extent.
+*/
+template <typename GaussianType, typename CellSliceType, typename WeightSliceType, typename VelocitySliceType>
+void reconstructGMM(GaussianType& gaussians, const double eps, const int seed, CellSliceType const& cell, WeightSliceType const& weight, VelocitySliceType const& vx) {
+	GMMImpl<1>::implReconstructGMM(gaussians, eps, seed, cell, weight, vx, vx, vx);
+}
+
+/*!
+  Reconstruct a Gaussian Mixture Model in two (cylindrical) dimensions
+
+  eps set the limit on relative change of the penalized log liklyhood function
+  when we consider the reconstruction to be converged. seed is used when drawing
+  the initial random Gaussians that the algorithm starts from. Reconstruction is
+  done separately for each unique value in cell. This slice is assume to be a
+  dense set of integers from 0 to gaussians.extent(0). The entries in weight can
+  be all equal or can be different for particles with different statistical
+  weight / macro factor, but it is assumed that the sum of all weights is equal
+  to the number of particles. All particle slices have to have the same extent.
+*/
+template <typename GaussianType, typename CellSliceType, typename WeightSliceType, typename VelocitySliceType>
+void reconstructGMM(GaussianType& gaussians, const double eps, const int seed, CellSliceType const& cell, WeightSliceType const& weight, VelocitySliceType const& vpar, VelocitySliceType const& vper) {
+	GMMImpl<2>::implReconstructGMM(gaussians, eps, seed, cell, weight, vpar, vper, vper);
+}
+
+/*!
+  Reconstruct a Gaussian Mixture Model in three (cartesian) dimensions
+
+  eps set the limit on relative change of the penalized log liklyhood function
+  when we consider the reconstruction to be converged. seed is used when drawing
+  the initial random Gaussians that the algorithm starts from. Reconstruction is
+  done separately for each unique value in cell. This slice is assume to be a
+  dense set of integers from 0 to gaussians.extent(0). The entries in weight can
+  be all equal or can be different for particles with different statistical
+  weight / macro factor, but it is assumed that the sum of all weights is equal
+  to the number of particles. All particle slices have to have the same extent.
+*/
+template <typename GaussianType, typename CellSliceType, typename WeightSliceType, typename VelocitySliceType>
+void reconstructGMM(GaussianType& gaussians, const double eps, const int seed, CellSliceType const& cell, WeightSliceType const& weight, VelocitySliceType const& vx, VelocitySliceType const& vy, VelocitySliceType const& vz) {
+	GMMImpl<3>::implReconstructGMM(gaussians, eps, seed, cell, weight, vx, vy, vz);
+}
+
+
+
+} // end namespace Cabana
+
+#endif // end CABANA_GMM_HPP
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/core/src/Cabana_ParticleInit.hpp b/core/src/Cabana_ParticleInit.hpp
index aaae4592f..9fb4549ac 100644
--- a/core/src/Cabana_ParticleInit.hpp
+++ b/core/src/Cabana_ParticleInit.hpp
@@ -18,6 +18,10 @@
 
 #include <Kokkos_Core.hpp>
 #include <Kokkos_Random.hpp>
+#include <Cabana_GaussianMixtureModel.hpp>
+#include <impl/Cabana_Erfinv.hpp>
+#include <impl/Cabana_Hammersley.hpp>
+#include <impl/Cabana_Matrix2d.hpp>
 
 #include <Cabana_ParticleList.hpp>
 #include <Cabana_Slice.hpp>
@@ -494,6 +498,1194 @@ createRandomParticlesMinDistance( ExecutionSpace exec_space,
     Kokkos::fence();
 }
 
+/*!
+  Populate the particles based on the description of the distribution function
+  in gaussians. Particles are places in 1d1v phase space.
+*/
+template <typename CellSliceType, typename WeightSliceType,
+          typename PositionSliceType, typename VelocitySliceType,
+          typename GaussianType>
+size_t initializeRandomParticles( CellSliceType& cell, WeightSliceType& macro,
+                                  PositionSliceType& position_x,
+                                  VelocitySliceType& velocity_x,
+                                  const GaussianType& gaussians,
+                                  const int seed )
+{
+    using gmm_float_type = typename GaussianType::value_type;
+    using particle_float_type = typename WeightSliceType::value_type;
+    Kokkos::Random_XorShift64_Pool<> random_pool( seed );
+
+    auto g_dev = Kokkos::create_mirror_view(
+        Kokkos::DefaultExecutionSpace::memory_space(), gaussians );
+    Kokkos::deep_copy( g_dev, gaussians );
+
+    // Define how to create ONE particle
+    auto _init = KOKKOS_LAMBDA( const int s, const int i )
+    {
+        // acquire the state of the random number generator engine
+        auto generator = random_pool.get_state();
+
+        // Figure out which Cell this particle is in
+        const int c = generator.drand( 0., g_dev.extent( 0 ) );
+
+        // Pick a random location in that cell
+        const particle_float_type x = generator.drand( c, c + 1. );
+
+        // Figure out which Gaussian to draw this particle from
+        const gmm_float_type r = generator.drand( 0., 1. );
+        unsigned int j = 0;
+        gmm_float_type sum = g_dev( c, j, Weight );
+        while ( ( sum < r ) && ( j < g_dev.extent( 1 ) - 1 ) )
+        {
+            j++;
+            sum += g_dev( c, j, Weight );
+        }
+
+        // Generate standard normal random variables
+        const gmm_float_type rx = generator.normal( 0., 1. );
+
+        velocity_x.access( s, i ) =
+            g_dev( c, j, MuX ) + Kokkos::sqrt( g_dev( c, j, Cxx ) ) * rx;
+
+        // Store the cell index
+        cell.access( s, i ) = c;
+
+        // Assign uniform particle weight
+        macro.access( s, i ) = 1.;
+
+        // Store particle position;
+        position_x.access( s, i ) = x;
+
+        // do not forget to release the state of the engine
+        random_pool.free_state( generator );
+    };
+
+    // Define an execution policy
+    Cabana::SimdPolicy<cell.vector_length, Kokkos::DefaultExecutionSpace>
+        vec_policy( 0, cell.size() );
+
+    // Execute
+    Cabana::simd_parallel_for( vec_policy, _init, "init()" );
+
+    return cell.size();
+}
+
+/*!
+  Populate the particles based on the description of the distribution function
+  in gaussians. Particles are places in 1d2v phase space with cylindrical
+  velocity space.
+*/
+template <typename CellSliceType, typename WeightSliceType,
+          typename PositionSliceType, typename VelocitySliceType,
+          typename GaussianType>
+size_t initializeRandomParticles( CellSliceType& cell, WeightSliceType& macro,
+                                  PositionSliceType& position_x,
+                                  VelocitySliceType& velocity_par,
+                                  VelocitySliceType& velocity_per,
+                                  const GaussianType& gaussians,
+                                  const int seed )
+{
+    using gmm_float_type = typename GaussianType::value_type;
+    using particle_float_type = typename WeightSliceType::value_type;
+    Kokkos::Random_XorShift64_Pool<> random_pool( seed );
+
+    auto g_dev = Kokkos::create_mirror_view(
+        Kokkos::DefaultExecutionSpace::memory_space(), gaussians );
+    Kokkos::deep_copy( g_dev, gaussians );
+
+    // Define how to create ONE particle
+    auto _init = KOKKOS_LAMBDA( const int s, const int i )
+    {
+        // acquire the state of the random number generator engine
+        auto generator = random_pool.get_state();
+
+        // Figure out which Cell this particle is in
+        const int c = generator.drand( 0., g_dev.extent( 0 ) );
+
+        // Pick a random location in that cell
+        const particle_float_type x = generator.drand( c, c + 1. );
+
+        // Figure out which Gaussian to draw this particle from
+        const gmm_float_type r = generator.drand( 0., 1. );
+        unsigned int j = 0;
+        gmm_float_type sum = g_dev( c, j, Weight );
+        while ( ( sum < r ) && ( j < g_dev.extent( 1 ) - 1 ) )
+        {
+            j++;
+            sum += g_dev( c, j, Weight );
+        }
+
+        // TODO: Extend this to ring distributions that allow for correlations
+        // between parallel and perpendicular velocity components
+        //
+        //  Put Covariance matrix of the jth Gaussian into 3x3 Matrix
+        const gmm_float_type C[3][3] = { { g_dev( c, j, Cparpar ), 0, 0 },
+                                         { 0, g_dev( c, j, Cperper ), 0 },
+                                         { 0, 0, g_dev( c, j, Cperper ) } };
+        gmm_float_type B[3][3];
+        // Get Cholesky decomposition
+        Cabana::Impl::Matrix2d<gmm_float_type, 3>::cholesky( B, C );
+
+        // Generate standard normal random variables
+        const gmm_float_type rx = generator.normal( 0., 1. );
+        const gmm_float_type ry = generator.normal( 0., 1. );
+        const gmm_float_type rz = generator.normal( 0., 1. );
+
+        const particle_float_type vx =
+            g_dev( c, j, MuPar ) + B[0][0] * rx + B[0][1] * ry + B[0][2] * rz;
+        const particle_float_type vy =
+            g_dev( c, j, MuPer ) + B[1][0] * rx + B[1][1] * ry + B[1][2] * rz;
+        const particle_float_type vz =
+            B[2][0] * rx + B[2][1] * ry + B[2][2] * rz;
+
+        velocity_par.access( s, i ) = vx;
+        velocity_per.access( s, i ) = Kokkos::sqrt( vy * vy + vz * vz );
+
+        // Store the cell index
+        cell.access( s, i ) = c;
+
+        // Assign uniform particle weight
+        macro.access( s, i ) = 1.;
+
+        // Store particle position;
+        position_x.access( s, i ) = x;
+
+        // do not forget to release the state of the engine
+        random_pool.free_state( generator );
+    };
+
+    // Define an execution policy
+    Cabana::SimdPolicy<cell.vector_length, Kokkos::DefaultExecutionSpace>
+        vec_policy( 0, cell.size() );
+
+    // Execute
+    Cabana::simd_parallel_for( vec_policy, _init, "init()" );
+
+    return cell.size();
+}
+
+/*!
+  Populate the particles based on the description of the distribution function
+  in gaussians. Particles are places in 1d3v phase space.
+*/
+template <typename CellSliceType, typename WeightSliceType,
+          typename PositionSliceType, typename VelocitySliceType,
+          typename GaussianType>
+size_t initializeRandomParticles( CellSliceType& cell, WeightSliceType& macro,
+                                  PositionSliceType& position_x,
+                                  VelocitySliceType& velocity_x,
+                                  VelocitySliceType& velocity_y,
+                                  VelocitySliceType& velocity_z,
+                                  const GaussianType& gaussians,
+                                  const int seed )
+{
+    using gmm_float_type = typename GaussianType::value_type;
+    using particle_float_type = typename WeightSliceType::value_type;
+    Kokkos::Random_XorShift64_Pool<> random_pool( seed );
+
+    auto g_dev = Kokkos::create_mirror_view(
+        Kokkos::DefaultExecutionSpace::memory_space(), gaussians );
+    Kokkos::deep_copy( g_dev, gaussians );
+
+    // Define how to create ONE particle
+    auto _init = KOKKOS_LAMBDA( const int s, const int i )
+    {
+        // acquire the state of the random number generator engine
+        auto generator = random_pool.get_state();
+
+        // Figure out which Cell this particle is in
+        const int c = generator.drand( 0., g_dev.extent( 0 ) );
+
+        // Pick a random location in that cell
+        const particle_float_type x = generator.drand( c, c + 1. );
+
+        // Figure out which Gaussian to draw this particle from
+        const gmm_float_type r = generator.drand( 0., 1. );
+        unsigned int j = 0;
+        gmm_float_type sum = g_dev( c, j, Weight );
+        while ( ( sum < r ) && ( j < g_dev.extent( 1 ) - 1 ) )
+        {
+            j++;
+            sum += g_dev( c, j, Weight );
+        }
+
+        // Put Covariance matrix of the jth Gaussian into 3x3 Matrix
+        const gmm_float_type C[3][3] = {
+            { g_dev( c, j, Cxx ), g_dev( c, j, Cxy ), g_dev( c, j, Cxz ) },
+            { g_dev( c, j, Cyx ), g_dev( c, j, Cyy ), g_dev( c, j, Cyz ) },
+            { g_dev( c, j, Czx ), g_dev( c, j, Czy ), g_dev( c, j, Czz ) } };
+        gmm_float_type B[3][3];
+        // Get Cholesky decomposition
+        Cabana::Impl::Matrix2d<gmm_float_type, 3>::cholesky( B, C );
+
+        // Generate standard normal random variables
+        const particle_float_type rx = generator.normal( 0., 1. );
+        const particle_float_type ry = generator.normal( 0., 1. );
+        const particle_float_type rz = generator.normal( 0., 1. );
+
+        velocity_x.access( s, i ) =
+            g_dev( c, j, MuX ) + B[0][0] * rx + B[0][1] * ry + B[0][2] * rz;
+        velocity_y.access( s, i ) =
+            g_dev( c, j, MuY ) + B[1][0] * rx + B[1][1] * ry + B[1][2] * rz;
+        velocity_z.access( s, i ) =
+            g_dev( c, j, MuZ ) + B[2][0] * rx + B[2][1] * ry + B[2][2] * rz;
+
+        // Store the cell index
+        cell.access( s, i ) = c;
+
+        // Assign uniform particle weight
+        macro.access( s, i ) = 1.;
+
+        // Store particle position;
+        position_x.access( s, i ) = x;
+
+        // do not forget to release the state of the engine
+        random_pool.free_state( generator );
+    };
+
+    // Define an execution policy
+    Cabana::SimdPolicy<cell.vector_length, Kokkos::DefaultExecutionSpace>
+        vec_policy( 0, cell.size() );
+
+    // Execute
+    Cabana::simd_parallel_for( vec_policy, _init, "init()" );
+
+    return cell.size();
+}
+
+/*!
+ Populate the particles based on the description of the distribution function in
+ gaussians
+*/
+template <typename CellSliceType, typename WeightSliceType,
+          typename PositionSliceType, typename VelocitySliceType,
+          typename GaussianType>
+size_t initializeParticlesFromCDF( CellSliceType& cell, WeightSliceType& macro,
+                                   PositionSliceType& position_x,
+                                   VelocitySliceType& velocity_x,
+                                   const GaussianType& gaussians,
+                                   const int seed )
+{
+    using gmm_float_type = typename WeightSliceType::value_type;
+    auto g_dev = Kokkos::create_mirror_view(
+        Kokkos::DefaultExecutionSpace::memory_space(), gaussians );
+    Kokkos::deep_copy( g_dev, gaussians );
+    Kokkos::Random_XorShift64_Pool<> random_pool( seed );
+
+    const int N_cells = gaussians.extent( 0 );
+    const int N_gauss = gaussians.extent( 1 );
+    const int N_particles = cell.size();
+
+    int start = 0;
+    for ( int c = 0; c < N_cells; c++ )
+    {
+        for ( int m = 0; m < N_gauss; m++ )
+        {
+            int Np = int(
+                N_particles * gaussians( c, m, Weight ) /
+                gmm_float_type(
+                    N_cells ) ); // number of particles to add for that gaussian
+
+            // Define how to create ONE particle in cell c, gaussian m
+            auto _init = KOKKOS_LAMBDA( const int s, const int i )
+            {
+                // acquire the state of the random number generator engine
+                auto generator = random_pool.get_state();
+
+                int id = (s)*cell.vector_length + i;
+
+                gmm_float_type cdf =
+                    ( ( id - start ) + 1 ) / gmm_float_type( Np + 1 );
+
+                // solution to 1/2 * (1+erf((x-mu)/(sqrt(2)*sigma))) == cdf
+                gmm_float_type vx =
+                    g_dev( c, m, MuX ) +
+                    Kokkos::sqrt( g_dev( c, m, Cxx ) ) * Cabana::Impl::ppnd7( cdf );
+                velocity_x.access( s, i ) = vx;
+
+                // Store the cell index
+                cell.access( s, i ) = c;
+
+                // Assign uniform particle weight
+                macro.access( s, i ) = 1.;
+
+                // Generate and store a random position in that cell
+                position_x.access( s, i ) = generator.drand( c, c + 1. );
+
+                // do not forget to release the state of the engine
+                random_pool.free_state( generator );
+            };
+
+            // Define an execution policy
+            Cabana::SimdPolicy<cell.vector_length,
+                               Kokkos::DefaultExecutionSpace>
+                vec_policy( start, start + Np );
+
+            // printf("Generating %d particles from %d to %d from Gaussian %d in
+            // cell %d\n", Np, start, start+Np, m, c);
+
+            // Execute
+            Cabana::simd_parallel_for( vec_policy, _init, "init()" );
+
+            start += Np;
+        }
+    }
+
+    // TODO: we may not have created exactly N_particles particles. Do we want
+    // to bother creating a few more to fix that?
+    return ( start );
+}
+
+/*!
+ Populate the particles based on the description of the distribution function in
+ gaussians
+*/
+template <typename CellSliceType, typename WeightSliceType,
+          typename PositionSliceType, typename VelocitySliceType,
+          typename GaussianType>
+size_t initializeEqualDensityParticlesWithHammersley(
+    CellSliceType& cell, WeightSliceType& macro, PositionSliceType& position_x,
+    VelocitySliceType& velocity_x, const GaussianType& gaussians )
+{
+    using gmm_float_type = typename WeightSliceType::value_type;
+    auto g_dev = Kokkos::create_mirror_view(
+        Kokkos::DefaultExecutionSpace::memory_space(), gaussians );
+    Kokkos::deep_copy( g_dev, gaussians );
+
+    const int N_cells = gaussians.extent( 0 ); // Number of cells
+    const int N_gauss =
+        gaussians.extent( 1 ); // Maximum number of Gaussians per cell
+    const int N_particles_in_total =
+        cell.size(); // Total number of particles the weight off all particles
+                     // _also_ needs to add to this number
+
+    int start = 0;
+    for ( int c = 0; c < N_cells; c++ )
+    {
+        // number of physical particles in this cell
+        // We assume that these get evenly distributed to all cells. We might
+        // change that in the future to be proportional to the plasma density
+        // in each cell.
+        int N_particles_in_cell = N_particles_in_total / N_cells;
+
+        // number of computational particles to add for each Gaussian
+        Kokkos::View<double*> Ncd( "Nc double", N_gauss );
+        Kokkos::View<int*> Nc( "Nc int", N_gauss );
+        auto Ncd_host =
+            Kokkos::create_mirror_view_and_copy( Kokkos::HostSpace(), Ncd );
+        auto Nc_host =
+            Kokkos::create_mirror_view_and_copy( Kokkos::HostSpace(), Nc );
+
+        double sum = 0.;
+        for ( int m = 0; m < N_gauss; m++ )
+        {
+            if ( gaussians( c, m, Weight ) > 0. )
+            {
+                Ncd_host( m ) = Kokkos::pow( gaussians( c, m, Weight ), 1.0 );
+                sum += Ncd_host( m );
+            }
+            else
+            {
+                Ncd_host( m ) = 0.;
+            }
+        }
+
+        double norm = N_particles_in_cell / sum;
+
+        N_particles_in_cell = 0;
+        for ( int m = 0; m < N_gauss; m++ )
+        {
+            Nc_host( m ) = int( Ncd_host( m ) * norm );
+            N_particles_in_cell += Nc_host( m );
+            // if(Nc_host(m) > 0) {
+            //    printf("Nc(%d,%d) = %d\n", c,m, Nc_host(m));
+            // }
+        }
+        Kokkos::deep_copy( Nc, Nc_host );
+
+        // number of physical particles for each Gaussian
+        Kokkos::View<double*> Npd( "Np double", N_gauss );
+        auto Npd_host =
+            Kokkos::create_mirror_view_and_copy( Kokkos::HostSpace(), Npd );
+        for ( int m = 0; m < N_gauss; m++ )
+        {
+            if ( gaussians( c, m, Weight ) > 0. )
+            {
+                Npd_host( m ) = gaussians( c, m, Weight ) * N_particles_in_cell;
+                // printf("Npd(%d,%d) = %f\n", c,m, Npd_host(m));
+            }
+            else
+            {
+                Npd_host( m ) = 0.;
+            }
+        }
+        Kokkos::deep_copy( Npd, Npd_host );
+
+        for ( int m = 0; m < N_gauss; m++ )
+        {
+            if ( gaussians( c, m, Weight ) <= 0. )
+            {
+                continue;
+            }
+            // printf("\n");
+
+            // Define how to create ONE particle in cell c, gaussian m
+            auto _init = KOKKOS_LAMBDA( const int s, const int i )
+            {
+                int id = (s)*cell.vector_length + i;
+
+                const gmm_float_type vx =
+                    g_dev( c, m, MuX ) +
+                    5. * Kokkos::sqrt( g_dev( c, m, Cxx ) ) *
+                        ( 2. * Cabana::Impl::hammersley( 1, id - start, Nc( m ) ) -
+                          1. );
+                const gmm_float_type dv = 10. * Kokkos::sqrt( g_dev( c, m, Cxx ) );
+                const gmm_float_type v[1] = { vx };
+                const gmm_float_type Mu[1] = { g_dev( c, m, MuX ) };
+                const gmm_float_type C[1][1] = { { g_dev( c, m, Cxx ) } };
+                const gmm_float_type p =
+                    Cabana::Impl::GaussianWeight<gmm_float_type>::weight_1d(
+                        v, Mu, C );
+
+                velocity_x.access( s, i ) = vx;
+
+                // Store the cell index
+                cell.access( s, i ) = c;
+
+                // Assign uniform particle weight
+                macro.access( s, i ) =
+                    p * Npd( m ) / (gmm_float_type)Nc( m ) * dv;
+
+                // Store particle position
+                position_x.access( s, i ) =
+                    c + Cabana::Impl::hammersley( 0, id - start + 1, Nc( m ) + 1 );
+            };
+
+            // Define an execution policy
+            Cabana::SimdPolicy<cell.vector_length,
+                               Kokkos::DefaultExecutionSpace>
+                vec_policy( start, start + Nc_host( m ) );
+
+            // printf("Generating %d particles from %d to %d from Gaussian %d in
+            // cell %d\n", Nc_host(m), start, start+Nc_host(m), m, c);
+
+            // Execute
+            Cabana::simd_parallel_for( vec_policy, _init, "init()" );
+
+            // Count weight of generated particles
+            double count_of_gaussian = 0.;
+            double weight_of_gaussian = 0.;
+            auto _sum =
+                KOKKOS_LAMBDA( const int i, double& lcount, double& lsum )
+            {
+                if ( ( start <= i ) && ( i < start + Nc( m ) ) )
+                {
+                    lcount += 1.;
+                    lsum += macro( i );
+                }
+            };
+            // printf("Checking particles %d to %d\n", start, start+Nc_host(m));
+            Kokkos::parallel_reduce( "Particle Scan", N_particles_in_total,
+                                     _sum, count_of_gaussian,
+                                     weight_of_gaussian );
+            // printf("Generated %f particles have weight %f\n",
+            // count_of_gaussian, weight_of_gaussian);
+
+            start += Nc_host( m );
+        }
+        // printf("\n");
+    }
+
+    // TODO: we may not have created exactly N_particles_in_total particles. Do
+    // we want to bother creating a few more to fix that?
+    return ( start );
+}
+
+/*!
+  Populate the particles based on the description of the distribution function
+  in gaussians. Particles are places in 1d2v phase space.
+*/
+template <typename CellSliceType, typename WeightSliceType,
+          typename PositionSliceType, typename VelocitySliceType,
+          typename GaussianType>
+size_t initializeEqualDensityParticlesWithHammersley(
+    CellSliceType& cell, WeightSliceType& macro, PositionSliceType& position_x,
+    VelocitySliceType& velocity_par, VelocitySliceType& velocity_per,
+    const GaussianType& gaussians )
+{
+    using gmm_float_type = typename WeightSliceType::value_type;
+    auto g_dev = Kokkos::create_mirror_view(
+        Kokkos::DefaultExecutionSpace::memory_space(), gaussians );
+    Kokkos::deep_copy( g_dev, gaussians );
+
+    const int N_cells = gaussians.extent( 0 ); // Number of cells
+    const int N_gauss =
+        gaussians.extent( 1 ); // Maximum number of Gaussians per cell
+    const int N_particles_in_total =
+        cell.size(); // Total number of particles the weight off all particles
+                     // _also_ needs to add to this number
+
+    int start = 0;
+    for ( int c = 0; c < N_cells; c++ )
+    {
+        // number of physical particles in this cell
+        // We assume that these get evenly distributed to all cells. We might
+        // change that in the future to be proportional to the plasma density
+        // in each cell.
+        int N_particles_in_cell = N_particles_in_total / N_cells;
+
+        // number of computational particles to add for each Gaussian
+        Kokkos::View<double*> Ncd( "Nc double", N_gauss );
+        Kokkos::View<int*> Nc( "Nc int", N_gauss );
+        auto Ncd_host =
+            Kokkos::create_mirror_view_and_copy( Kokkos::HostSpace(), Ncd );
+        auto Nc_host =
+            Kokkos::create_mirror_view_and_copy( Kokkos::HostSpace(), Nc );
+
+        double sum = 0.;
+        for ( int m = 0; m < N_gauss; m++ )
+        {
+            if ( gaussians( c, m, Weight ) > 0. )
+            {
+                Ncd_host( m ) = Kokkos::pow( gaussians( c, m, Weight ), 1.0 );
+                sum += Ncd_host( m );
+            }
+            else
+            {
+                Ncd_host( m ) = 0.;
+            }
+        }
+
+        double norm = N_particles_in_cell / sum;
+
+        N_particles_in_cell = 0;
+        for ( int m = 0; m < N_gauss; m++ )
+        {
+            Nc_host( m ) = int( Ncd_host( m ) * norm );
+            N_particles_in_cell += Nc_host( m );
+            // if(Nc_host(m) > 0) {
+            //    printf("Nc(%d,%d) = %d\n", c,m, Nc_host(m));
+            // }
+        }
+        Kokkos::deep_copy( Nc, Nc_host );
+
+        // number of physical particles for each Gaussian
+        Kokkos::View<double*> Npd( "Np double", N_gauss );
+        auto Npd_host =
+            Kokkos::create_mirror_view_and_copy( Kokkos::HostSpace(), Npd );
+        for ( int m = 0; m < N_gauss; m++ )
+        {
+            if ( gaussians( c, m, Weight ) > 0. )
+            {
+                Npd_host( m ) = gaussians( c, m, Weight ) * N_particles_in_cell;
+                // printf("Npd(%d,%d) = %f\n", c,m, Npd_host(m));
+            }
+            else
+            {
+                Npd_host( m ) = 0.;
+            }
+        }
+        Kokkos::deep_copy( Npd, Npd_host );
+
+        for ( int m = 0; m < N_gauss; m++ )
+        {
+            if ( gaussians( c, m, Weight ) <= 0. )
+            {
+                continue;
+            }
+            // printf("\n");
+
+            // Define how to create ONE particle in cell c, gaussian m
+            auto _init = KOKKOS_LAMBDA( const int s, const int i )
+            {
+                int id = (s)*cell.vector_length + i;
+
+                const gmm_float_type vpar =
+                    g_dev( c, m, MuPar ) +
+                    5. * Kokkos::sqrt( g_dev( c, m, Cparpar ) ) *
+                        ( 2. * Cabana::Impl::hammersley( 1, id - start + 1,
+                                                   Nc( m ) + 1 ) -
+                          1. );
+                const gmm_float_type vpermin =
+                    Kokkos::max( 0., g_dev( c, m, MuPer ) -
+                                         5. * Kokkos::sqrt( g_dev( c, m, Cperper ) ) );
+                const gmm_float_type vpermax =
+                    g_dev( c, m, MuPer ) + 5. * Kokkos::sqrt( g_dev( c, m, Cperper ) );
+                const gmm_float_type vper =
+                    vpermin +
+                    ( vpermax - vpermin ) *
+                        Cabana::Impl::hammersley( 2, id - start + 1, Nc( m ) + 1 );
+                const gmm_float_type dvpar =
+                    10. * Kokkos::sqrt( g_dev( c, m, Cparpar ) );
+                const gmm_float_type dvper = ( vpermax - vpermin );
+                const gmm_float_type v[2] = { vpar, vper };
+                const gmm_float_type Mu[2] = { g_dev( c, m, MuPar ),
+                                               g_dev( c, m, MuPer ) };
+                const gmm_float_type C[2][2] = {
+                    { g_dev( c, m, Cparpar ), g_dev( c, m, Cparper ) },
+                    { g_dev( c, m, Cperpar ), g_dev( c, m, Cperper ) } };
+                const gmm_float_type p =
+                    Cabana::Impl::GaussianWeight<gmm_float_type>::weight_2d(
+                        v, Mu, C );
+
+                velocity_par.access( s, i ) = vpar;
+                velocity_per.access( s, i ) = vper;
+
+                // Store the cell index
+                cell.access( s, i ) = c;
+
+                // Assign uniform particle weight
+                macro.access( s, i ) =
+                    p * Npd( m ) / (gmm_float_type)Nc( m ) * dvpar * dvper;
+
+                // Store particle position
+                position_x.access( s, i ) =
+                    c + Cabana::Impl::hammersley( 0, id - start + 1, Nc( m ) + 1 );
+            };
+
+            // Define an execution policy
+            Cabana::SimdPolicy<cell.vector_length,
+                               Kokkos::DefaultExecutionSpace>
+                vec_policy( start, start + Nc_host( m ) );
+
+            // printf("Generating %d particles from %d to %d from Gaussian %d in
+            // cell %d\n", Nc_host(m), start, start+Nc_host(m), m, c);
+
+            // Execute
+            Cabana::simd_parallel_for( vec_policy, _init, "init()" );
+
+            // Count weight of generated particles
+            double count_of_gaussian = 0.;
+            double weight_of_gaussian = 0.;
+            auto _sum =
+                KOKKOS_LAMBDA( const int i, double& lcount, double& lsum )
+            {
+                if ( ( start <= i ) && ( i < start + Nc( m ) ) )
+                {
+                    lcount += 1.;
+                    lsum += macro( i );
+                }
+            };
+            // printf("Checking particles %d to %d\n", start, start+Nc_host(m));
+            Kokkos::parallel_reduce( "Particle Scan", N_particles_in_total,
+                                     _sum, count_of_gaussian,
+                                     weight_of_gaussian );
+            // printf("Generated %f particles have weight %f\n",
+            // count_of_gaussian, weight_of_gaussian);
+
+            start += Nc_host( m );
+        }
+        // printf("\n");
+    }
+
+    // TODO: we may not have created exactly N_particles_in_total particles. Do
+    // we want to bother creating a few more to fix that?
+    return ( start );
+}
+
+/*!
+  Populate the particles based on the description of the distribution function
+  in gaussians. Particles are places in 1d3v phase space.
+*/
+template <typename CellSliceType, typename WeightSliceType,
+          typename PositionSliceType, typename VelocitySliceType,
+          typename GaussianType>
+size_t initializeEqualDensityParticlesWithHammersley(
+    CellSliceType& cell, WeightSliceType& macro, PositionSliceType& position_x,
+    VelocitySliceType& velocity_x, VelocitySliceType& velocity_y,
+    VelocitySliceType& velocity_z, const GaussianType& gaussians )
+{
+    using gmm_float_type = typename WeightSliceType::value_type;
+    auto g_dev = Kokkos::create_mirror_view(
+        Kokkos::DefaultExecutionSpace::memory_space(), gaussians );
+    Kokkos::deep_copy( g_dev, gaussians );
+
+    const int N_cells = gaussians.extent( 0 ); // Number of cells
+    const int N_gauss =
+        gaussians.extent( 1 ); // Maximum number of Gaussians per cell
+    const int N_particles_in_total =
+        cell.size(); // Total number of particles the weight off all particles
+                     // _also_ needs to add to this number
+
+    int start = 0;
+    for ( int c = 0; c < N_cells; c++ )
+    {
+        // number of physical particles in this cell
+        // We assume that these get evenly distributed to all cells. We might
+        // change that in the future to be proportional to the plasma density
+        // in each cell.
+        int N_particles_in_cell = N_particles_in_total / N_cells;
+
+        // number of computational particles to add for each Gaussian
+        Kokkos::View<double*> Ncd( "Nc double", N_gauss );
+        Kokkos::View<int*> Nc( "Nc int", N_gauss );
+        auto Ncd_host =
+            Kokkos::create_mirror_view_and_copy( Kokkos::HostSpace(), Ncd );
+        auto Nc_host =
+            Kokkos::create_mirror_view_and_copy( Kokkos::HostSpace(), Nc );
+
+        double sum = 0.;
+        for ( int m = 0; m < N_gauss; m++ )
+        {
+            if ( gaussians( c, m, Weight ) > 0. )
+            {
+                Ncd_host( m ) = Kokkos::pow( gaussians( c, m, Weight ), 1.0 );
+                sum += Ncd_host( m );
+            }
+            else
+            {
+                Ncd_host( m ) = 0.;
+            }
+        }
+
+        double norm = N_particles_in_cell / sum;
+
+        N_particles_in_cell = 0;
+        for ( int m = 0; m < N_gauss; m++ )
+        {
+            Nc_host( m ) = int( Ncd_host( m ) * norm );
+            N_particles_in_cell += Nc_host( m );
+            // if(Nc_host(m) > 0) {
+            //    printf("Nc(%d,%d) = %d\n", c,m, Nc_host(m));
+            // }
+        }
+        Kokkos::deep_copy( Nc, Nc_host );
+
+        // number of physical particles for each Gaussian
+        Kokkos::View<double*> Npd( "Np double", N_gauss );
+        auto Npd_host =
+            Kokkos::create_mirror_view_and_copy( Kokkos::HostSpace(), Npd );
+        for ( int m = 0; m < N_gauss; m++ )
+        {
+            if ( gaussians( c, m, Weight ) > 0. )
+            {
+                Npd_host( m ) = gaussians( c, m, Weight ) * N_particles_in_cell;
+                // printf("Npd(%d,%d) = %f\n", c,m, Npd_host(m));
+            }
+            else
+            {
+                Npd_host( m ) = 0.;
+            }
+        }
+        Kokkos::deep_copy( Npd, Npd_host );
+
+        for ( int m = 0; m < N_gauss; m++ )
+        {
+            if ( gaussians( c, m, Weight ) <= 0. )
+            {
+                continue;
+            }
+            // printf("\n");
+
+            // Define how to create ONE particle in cell c, gaussian m
+            auto _init = KOKKOS_LAMBDA( const int s, const int i )
+            {
+                int id = (s)*cell.vector_length + i;
+
+                const gmm_float_type vx =
+                    g_dev( c, m, MuX ) +
+                    3. * Kokkos::sqrt( g_dev( c, m, Cxx ) ) *
+                        ( 2. * Cabana::Impl::hammersley( 1, id - start + 1,
+                                                   Nc( m ) + 1 ) -
+                          1. );
+                const gmm_float_type vy =
+                    g_dev( c, m, MuY ) +
+                    3. * Kokkos::sqrt( g_dev( c, m, Cyy ) ) *
+                        ( 2. * Cabana::Impl::hammersley( 2, id - start + 1,
+                                                   Nc( m ) + 1 ) -
+                          1. );
+                const gmm_float_type vz =
+                    g_dev( c, m, MuZ ) +
+                    3. * Kokkos::sqrt( g_dev( c, m, Czz ) ) *
+                        ( 2. * Cabana::Impl::hammersley( 3, id - start + 1,
+                                                   Nc( m ) + 1 ) -
+                          1. );
+                const gmm_float_type dvx = 6. * Kokkos::sqrt( g_dev( c, m, Cxx ) );
+                const gmm_float_type dvy = 6. * Kokkos::sqrt( g_dev( c, m, Cyy ) );
+                const gmm_float_type dvz = 6. * Kokkos::sqrt( g_dev( c, m, Czz ) );
+
+                const gmm_float_type v[3] = { vx, vy, vz };
+                const gmm_float_type Mu[3] = { g_dev( c, m, MuX ),
+                                               g_dev( c, m, MuY ),
+                                               g_dev( c, m, MuZ ) };
+                const gmm_float_type C[3][3] = {
+                    { g_dev( c, m, Cxx ), g_dev( c, m, Cxy ),
+                      g_dev( c, m, Cxz ) },
+                    { g_dev( c, m, Cyx ), g_dev( c, m, Cyy ),
+                      g_dev( c, m, Cyz ) },
+                    { g_dev( c, m, Czx ), g_dev( c, m, Czy ),
+                      g_dev( c, m, Czz ) } };
+                const gmm_float_type p =
+                    Cabana::Impl::GaussianWeight<gmm_float_type>::weight_3d(
+                        v, Mu, C );
+
+                velocity_x.access( s, i ) = vx;
+                velocity_y.access( s, i ) = vy;
+                velocity_z.access( s, i ) = vz;
+
+                // Store the cell index
+                cell.access( s, i ) = c;
+
+                // Assign particle weight
+                macro.access( s, i ) =
+                    p * Npd( m ) / (gmm_float_type)Nc( m ) * dvx * dvy * dvz;
+
+                // Store particle position
+                position_x.access( s, i ) =
+                    c + Cabana::Impl::hammersley( 0, id - start + 1, Nc( m ) + 1 );
+            };
+
+            // Define an execution policy
+            Cabana::SimdPolicy<cell.vector_length,
+                               Kokkos::DefaultExecutionSpace>
+                vec_policy( start, start + Nc_host( m ) );
+
+            // printf("Generating %d particles from %d to %d from Gaussian %d in
+            // cell %d\n", Nc_host(m), start, start+Nc_host(m), m, c);
+
+            // Execute
+            Cabana::simd_parallel_for( vec_policy, _init, "init()" );
+
+            // Count weight of generated particles
+            double count_of_gaussian = 0.;
+            double weight_of_gaussian = 0.;
+            auto _sum =
+                KOKKOS_LAMBDA( const int i, double& lcount, double& lsum )
+            {
+                if ( ( start <= i ) && ( i < start + Nc( m ) ) )
+                {
+                    lcount += 1.;
+                    lsum += macro( i );
+                }
+            };
+            // printf("Checking particles %d to %d\n", start, start+Nc_host(m));
+            Kokkos::parallel_reduce( "Particle Scan", N_particles_in_total,
+                                     _sum, count_of_gaussian,
+                                     weight_of_gaussian );
+            // printf("Generated %f particles have weight %f\n",
+            // count_of_gaussian, weight_of_gaussian);
+
+            start += Nc_host( m );
+        }
+        // printf("\n");
+    }
+
+    // TODO: we may not have created exactly N_particles_in_total particles. Do
+    // we want to bother creating a few more to fix that?
+    return ( start );
+}
+
+/*!
+  Populate the particles based on the description of the distribution function
+  in gaussians
+*/
+template <typename CellSliceType, typename WeightSliceType,
+          typename PositionSliceType, typename VelocitySliceType,
+          typename GaussianType>
+size_t initializeEqualWeightParticlesWithHammersley(
+    CellSliceType& cell, WeightSliceType& macro, PositionSliceType& position_x,
+    VelocitySliceType& velocity_x, const GaussianType& gaussians )
+{
+    using gmm_float_type = typename WeightSliceType::value_type;
+    auto g_dev = Kokkos::create_mirror_view(
+        Kokkos::DefaultExecutionSpace::memory_space(), gaussians );
+    Kokkos::deep_copy( g_dev, gaussians );
+
+    const int N_cells = gaussians.extent( 0 );
+    const int N_gauss = gaussians.extent( 1 );
+    const int N_particles = cell.size();
+
+    int start = 0;
+    for ( int c = 0; c < N_cells; c++ )
+    {
+        for ( int m = 0; m < N_gauss; m++ )
+        {
+            if ( gaussians( c, m, Weight ) <= 0. )
+            {
+                continue;
+            }
+
+            int Np = int(
+                N_particles * gaussians( c, m, Weight ) /
+                gmm_float_type(
+                    N_cells ) ); // number of particles to add for that gaussian
+
+            // Define how to create ONE particle in cell c, gaussian m
+            auto _init = KOKKOS_LAMBDA( const int s, const int i )
+            {
+                int id = (s)*cell.vector_length + i;
+
+                // Generate uniform pseudo-radom variables
+                const gmm_float_type w =
+                    Cabana::Impl::hammersley( 1, id - start + 1, Np + 1 );
+                const gmm_float_type x =
+                    Cabana::Impl::hammersley( 2, id - start + 1, Np + 1 );
+
+                // Generate standard normal random variables
+                const gmm_float_type rx =
+                    Kokkos::sqrt( -2. * Kokkos::log( w ) ) * Kokkos::sin( 2. * M_PI * x );
+
+                velocity_x.access( s, i ) =
+                    g_dev( c, m, MuX ) + Kokkos::sqrt( g_dev( c, m, Cxx ) ) * rx;
+
+                // Store the cell index
+                cell.access( s, i ) = c;
+
+                // Assign uniform particle weight
+                macro.access( s, i ) = 1.;
+
+                // Store particle position
+                position_x.access( s, i ) =
+                    c + Cabana::Impl::hammersley( 0, id - start, Np );
+            };
+
+            // Define an execution policy
+            Cabana::SimdPolicy<cell.vector_length,
+                               Kokkos::DefaultExecutionSpace>
+                vec_policy( start, start + Np );
+
+            // printf("Generating %d particles from %d to %d from Gaussian %d in
+            // cell %d\n", Np, start, start+Np, m, c);
+
+            // Execute
+            Cabana::simd_parallel_for( vec_policy, _init, "init()" );
+
+            start += Np;
+        }
+    }
+
+    // TODO: we may not have created exactly N_particles particles. Do we want
+    // to bother creating a few more to fix that?
+    return ( start );
+}
+
+/*!
+  Populate the particles based on the description of the distribution function
+  in gaussians. Particles are places in 1d2v phase space.
+*/
+template <typename CellSliceType, typename WeightSliceType,
+          typename PositionSliceType, typename VelocitySliceType,
+          typename GaussianType>
+size_t initializeEqualWeightParticlesWithHammersley(
+    CellSliceType& cell, WeightSliceType& macro, PositionSliceType& position_x,
+    VelocitySliceType& velocity_par, VelocitySliceType& velocity_per,
+    const GaussianType& gaussians )
+{
+    using gmm_float_type = typename WeightSliceType::value_type;
+    auto g_dev = Kokkos::create_mirror_view(
+        Kokkos::DefaultExecutionSpace::memory_space(), gaussians );
+    Kokkos::deep_copy( g_dev, gaussians );
+
+    const int N_cells = gaussians.extent( 0 );
+    const int N_gauss = gaussians.extent( 1 );
+    const int N_particles = cell.size();
+
+    int start = 0;
+    for ( int c = 0; c < N_cells; c++ )
+    {
+        for ( int m = 0; m < N_gauss; m++ )
+        {
+            if ( gaussians( c, m, Weight ) <= 0. )
+            {
+                continue;
+            }
+
+            int Np = int(
+                N_particles * gaussians( c, m, Weight ) /
+                gmm_float_type(
+                    N_cells ) ); // number of particles to add for that gaussian
+
+            // Define how to create ONE particle in cell c, gaussian m
+            auto _init = KOKKOS_LAMBDA( const int s, const int i )
+            {
+                int id = (s)*cell.vector_length + i;
+
+                // TODO: Extend this to ring distributions that allow for correlations
+                // between parallel and perpendicular velocity components
+                //
+                //  Put Covariance matrix of the jth Gaussian into 3x3 Matrix
+                const gmm_float_type C[3][3] = {
+                    { g_dev( c, m, Cparpar ), 0, 0 },
+                    { 0, g_dev( c, m, Cperper ), 0 },
+                    { 0, 0, g_dev( c, m, Cperper ) } };
+                gmm_float_type B[3][3];
+                // Get Cholesky decomposition
+                Cabana::Impl::Matrix2d<gmm_float_type, 3>::cholesky( B, C );
+
+                // Generate uniform pseudo-radom variables
+                const gmm_float_type w =
+                    Cabana::Impl::hammersley( 1, id - start + 1, Np + 1 );
+                const gmm_float_type x =
+                    Cabana::Impl::hammersley( 2, id - start + 1, Np + 1 );
+                const gmm_float_type y =
+                    Cabana::Impl::hammersley( 3, id - start + 1, Np + 1 );
+                const gmm_float_type z =
+                    Cabana::Impl::hammersley( 4, id - start + 1, Np + 1 );
+
+                // Generate standard normal random variables
+                const gmm_float_type rx =
+                    Kokkos::sqrt( -2. * Kokkos::log( w ) ) * Kokkos::sin( 2. * M_PI * x );
+                const gmm_float_type ry =
+                    Kokkos::sqrt( -2. * Kokkos::log( w ) ) * Kokkos::cos( 2. * M_PI * x );
+                const gmm_float_type rz =
+                    Kokkos::sqrt( -2. * Kokkos::log( y ) ) * Kokkos::sin( 2. * M_PI * z );
+
+                const gmm_float_type vx = g_dev( c, m, MuPar ) + B[0][0] * rx +
+                                          B[0][1] * ry + B[0][2] * rz;
+                const gmm_float_type vy = g_dev( c, m, MuPer ) + B[1][0] * rx +
+                                          B[1][1] * ry + B[1][2] * rz;
+                const gmm_float_type vz =
+                    B[2][0] * rx + B[2][1] * ry + B[2][2] * rz;
+
+                velocity_par.access( s, i ) = vx;
+                velocity_per.access( s, i ) = Kokkos::sqrt( vy * vy + vz * vz );
+
+                // Store the cell index
+                cell.access( s, i ) = c;
+
+                // Assign uniform particle weight
+                macro.access( s, i ) = 1.;
+
+                // Store particle position
+                position_x.access( s, i ) =
+                    c + Cabana::Impl::hammersley( 0, id - start, Np );
+            };
+
+            // Define an execution policy
+            Cabana::SimdPolicy<cell.vector_length,
+                               Kokkos::DefaultExecutionSpace>
+                vec_policy( start, start + Np );
+
+            // printf("Generating %d particles from %d to %d from Gaussian %d in
+            // cell %d\n", Np, start, start+Np, m, c);
+
+            // Execute
+            Cabana::simd_parallel_for( vec_policy, _init, "init()" );
+
+            start += Np;
+        }
+    }
+
+    // TODO: we may not have created exactly N_particles particles. Do we want
+    // to bother creating a few more to fix that?
+    return ( start );
+}
+
+/*!
+  Populate the particles based on the description of the distribution function
+  in gaussians. Particles are places in 1d3v phase space.
+*/
+template <typename CellSliceType, typename WeightSliceType,
+          typename PositionSliceType, typename VelocitySliceType,
+          typename GaussianType>
+size_t initializeEqualWeightParticlesWithHammersley(
+    CellSliceType& cell, WeightSliceType& macro, PositionSliceType& position_x,
+    VelocitySliceType& velocity_x, VelocitySliceType& velocity_y,
+    VelocitySliceType& velocity_z, const GaussianType& gaussians )
+{
+    using gmm_float_type = typename WeightSliceType::value_type;
+    auto g_dev = Kokkos::create_mirror_view(
+        Kokkos::DefaultExecutionSpace::memory_space(), gaussians );
+    Kokkos::deep_copy( g_dev, gaussians );
+
+    const int N_cells = gaussians.extent( 0 );
+    const int N_gauss = gaussians.extent( 1 );
+    const int N_particles = cell.size();
+
+    int start = 0;
+    for ( int c = 0; c < N_cells; c++ )
+    {
+        for ( int m = 0; m < N_gauss; m++ )
+        {
+            if ( gaussians( c, m, Weight ) <= 0. )
+            {
+                continue;
+            }
+
+            int Np = int(
+                N_particles * gaussians( c, m, Weight ) /
+                gmm_float_type(
+                    N_cells ) ); // number of particles to add for that gaussian
+
+            // Define how to create ONE particle in cell c, gaussian m
+            auto _init = KOKKOS_LAMBDA( const int s, const int i )
+            {
+                int id = (s)*cell.vector_length + i;
+
+                // Put Covariance matrix of the mth Gaussian into 3x3 Matrix
+                const gmm_float_type C[3][3] = {
+                    { g_dev( c, m, Cxx ), g_dev( c, m, Cxy ),
+                      g_dev( c, m, Cxz ) },
+                    { g_dev( c, m, Cyx ), g_dev( c, m, Cyy ),
+                      g_dev( c, m, Cyz ) },
+                    { g_dev( c, m, Czx ), g_dev( c, m, Czy ),
+                      g_dev( c, m, Czz ) } };
+                gmm_float_type B[3][3];
+                // Get Cholesky decomposition
+                Cabana::Impl::Matrix2d<gmm_float_type, 3>::cholesky( B, C );
+
+                // Generate uniform pseudo-radom variables
+                const gmm_float_type w =
+                    Cabana::Impl::hammersley( 1, id - start + 1, Np + 1 );
+                const gmm_float_type x =
+                    Cabana::Impl::hammersley( 2, id - start + 1, Np + 1 );
+                const gmm_float_type y =
+                    Cabana::Impl::hammersley( 3, id - start + 1, Np + 1 );
+                const gmm_float_type z =
+                    Cabana::Impl::hammersley( 4, id - start + 1, Np + 1 );
+
+                // Generate standard normal random variables
+                const gmm_float_type rx =
+                    Kokkos::sqrt( -2 * Kokkos::log( w ) ) * Kokkos::sin( 2. * M_PI * x );
+                const gmm_float_type ry =
+                    Kokkos::sqrt( -2 * Kokkos::log( w ) ) * Kokkos::cos( 2. * M_PI * x );
+                const gmm_float_type rz =
+                    Kokkos::sqrt( -2 * Kokkos::log( y ) ) * Kokkos::sin( 2. * M_PI * z );
+
+                const gmm_float_type vx = g_dev( c, m, MuX ) + B[0][0] * rx +
+                                          B[0][1] * ry + B[0][2] * rz;
+                const gmm_float_type vy = g_dev( c, m, MuY ) + B[1][0] * rx +
+                                          B[1][1] * ry + B[1][2] * rz;
+                const gmm_float_type vz = g_dev( c, m, MuZ ) + B[2][0] * rx +
+                                          B[2][1] * ry + B[2][2] * rz;
+
+                velocity_x.access( s, i ) = vx;
+                velocity_y.access( s, i ) = vy;
+                velocity_z.access( s, i ) = vz;
+
+                // Store the cell index
+                cell.access( s, i ) = c;
+
+                // Assign uniform particle weight
+                macro.access( s, i ) = 1.;
+
+                // Store particle position
+                position_x.access( s, i ) =
+                    c + Cabana::Impl::hammersley( 0, id - start, Np );
+            };
+
+            // Define an execution policy
+            Cabana::SimdPolicy<cell.vector_length,
+                               Kokkos::DefaultExecutionSpace>
+                vec_policy( start, start + Np );
+
+            // printf("Generating %d particles from %d to %d from Gaussian %d in
+            // cell %d\n", Np, start, start+Np, m, c);
+
+            // Execute
+            Cabana::simd_parallel_for( vec_policy, _init, "init()" );
+
+            start += Np;
+        }
+    }
+
+    // TODO: we may not have created exactly N_particles particles. Do we want
+    // to bother creating a few more to fix that?
+    return ( start );
+}
+
+
+
 } // namespace Cabana
 
 #endif
diff --git a/core/src/impl/Cabana_Erfinv.hpp b/core/src/impl/Cabana_Erfinv.hpp
new file mode 100644
index 000000000..def01caff
--- /dev/null
+++ b/core/src/impl/Cabana_Erfinv.hpp
@@ -0,0 +1,108 @@
+/****************************************************************************
+ * Copyright (c) 2023 by the Cabana authors                                 *
+ * All rights reserved.                                                     *
+ *                                                                          *
+ * This file is part of the Cabana library. Cabana is distributed under a   *
+ * BSD 3-clause license. For the licensing terms see the LICENSE file in    *
+ * the top-level directory.                                                 *
+ *                                                                          *
+ * SPDX-License-Identifier: BSD-3-Clause                                    *
+ ****************************************************************************/
+
+/*!
+  \file impl/Cabana_Erfinv.hpp
+*/
+#ifndef CABANA_ERFINV_HPP
+#define CABANA_ERFINV_HPP
+
+#include <cstdio>
+#include <cmath>
+
+namespace Cabana {
+namespace Impl {
+
+/*!
+ This produces the inverse of
+ Phi(zp) = integrate(1/sqrt(2*pi) * exp(-zeta**2/2), (zeta,-oo,zp))
+         = erf(sqrt(2)*zp/2)/2 + 1/2
+ This means that ppnd7(p) = sqrt(2)*erfinv(2*p - 1)
+*/
+double KOKKOS_INLINE_FUNCTION ppnd7(double p) {
+	/*
+	Algorithm AS 241    Appl. Statist. (1988) Vol. 37, No. 3
+	Produce the normal deviate  z  corresponding to a given lower
+	tail area of  p;  z  is accurate to about 1 part in 10**7
+	*/
+
+	const double split1 = 0.425E0;
+	const double split2 = 5.0E0;
+	const double const1 = 0.180625E0;
+	const double const2 = 1.6E0;
+
+	// coefficicents for  p  close to  1/2
+	const double A0 = 3.38713'27179E0;
+	const double A1 = 5.04342'71938E1;
+	const double A2 = 1.59291'13202E2;
+	const double A3 = 5.91093'74720E1;
+	const double B1 = 1.78951'69469E1;
+	const double B2 = 7.87577'57664E1;
+	const double B3 = 6.71875'63600E1;
+
+	// coefficients for  p  neither close to  1/2  nor 0 or 1
+	const double C0 = 1.42343'72777E0;
+	const double C1 = 2.75681'53900E0;
+	const double C2 = 1.30672'84816E0;
+	const double C3 = 1.70238'21103E-1;
+	const double D1 = 7.37001'64250E-1;
+	const double D2 = 1.20211'32975E-1;
+
+	// coefficients for  p  near 0 or 1
+	const double E0 = 6.65790'51150E0;
+	const double E1 = 3.08122'63860E0;
+	const double E2 = 4.28682'94337E-1;
+	const double E3 = 1.73372'03997E-2;
+	const double F1 = 2.41978'94225E-1;
+	const double F2 = 1.22582'02635E-2;
+
+	const double q = p - 0.5;
+	if(fabs(q) < split1) {
+		const double r = const1 - q*q;
+		const double out = q * (((A3 * r + A2) * r + A1) * r + A0) / (((B3 * r + B2) * r + B1) * r + 1.0);
+		return out;
+	} else {
+		double r, out;
+		if(q < 0.) {
+			r = p;
+		} else {
+			r = 1.0 - p;
+		}
+		if((r < 0.) || (r > 1.)) {
+			Kokkos::abort("unexpected problem in ppnd7() inside erfinv()");
+		}
+		r = Kokkos::sqrt(-log(r));
+		if(r < split2) {
+			r = r - const2;
+			out = (((C3 * r + C2) * r + C1) * r + C0) / ((D2 * r + D1) * r + 1.0);
+		} else {
+			r = r - split2;
+			out = (((E3 * r + E2) * r + E1) * r + E0) / ((F2 * r + F1) * r + 1.0);
+		}
+
+		if(q < 0.) {
+			out = -out;
+		}
+
+		return out;
+	}
+}
+
+double KOKKOS_INLINE_FUNCTION erfinv(double x) {
+	const double out = ppnd7((x+1.)/2.)/Kokkos::sqrt(2.);
+	return out;
+}
+
+
+} // end namespace Impl
+} // end namespace Cabana
+
+#endif // end CABANA_ERFINV_HPP
diff --git a/core/src/impl/Cabana_GaussianWeight.hpp b/core/src/impl/Cabana_GaussianWeight.hpp
new file mode 100644
index 000000000..5d726eaee
--- /dev/null
+++ b/core/src/impl/Cabana_GaussianWeight.hpp
@@ -0,0 +1,70 @@
+/****************************************************************************
+ * Copyright (c) 2023 by the Cabana authors                                 *
+ * All rights reserved.                                                     *
+ *                                                                          *
+ * This file is part of the Cabana library. Cabana is distributed under a   *
+ * BSD 3-clause license. For the licensing terms see the LICENSE file in    *
+ * the top-level directory.                                                 *
+ *                                                                          *
+ * SPDX-License-Identifier: BSD-3-Clause                                    *
+ ****************************************************************************/
+
+/*!
+  \file impl/Cabana_GaussianWeight.hpp
+*/
+#ifndef CABANA_GAUSSIAN_WEIGHT_HPP
+#define CABANA_GAUSSIAN_WEIGHT_HPP
+
+#include <impl/Cabana_Matrix2d.hpp>
+
+namespace Cabana {
+namespace Impl {
+
+template<typename GMMFloatType>
+class GaussianWeight {
+
+public:
+
+/*!
+  Compute the value of a 1d Gaussian distribution function with mean Mu and covariance C at velocity v
+*/
+static GMMFloatType KOKKOS_INLINE_FUNCTION weight_1d(const GMMFloatType (&v)[1], const GMMFloatType (&Mu)[1], const GMMFloatType (&C)[1][1]) {
+	GMMFloatType I[1][1];
+	Matrix2d<GMMFloatType,1>::invert(I,C);
+	const GMMFloatType det = Matrix2d<GMMFloatType,1>::determinant(C);
+
+	const GMMFloatType arg = (v[0]-Mu[0]) * I[0][0] * (v[0]-Mu[0]);
+	return pow(2.*M_PI, -0.5*1)/Kokkos::sqrt(det) * Kokkos::exp(-0.5*arg);
+}
+
+/*!
+  Compute the value of a 2d ring distribution function with mean Mu and covariance C at velocity v
+*/
+static GMMFloatType KOKKOS_INLINE_FUNCTION weight_2d(const GMMFloatType (&v)[2], const GMMFloatType (&Mu)[2], const GMMFloatType (&C)[2][2]) {
+	return v[1]/Kokkos::sqrt(2.*M_PI*C[0][0])/C[1][1] * Kokkos::exp(-0.5*Mu[1]*Mu[1]/C[1][1]) * Kokkos::exp(-0.5*(v[0]-Mu[0])*(v[0]-Mu[0])/C[0][0]) * Kokkos::exp(-0.5*v[1]*v[1]/C[1][1]) *
+	       Kokkos::Experimental::cyl_bessel_i0<Kokkos::complex<GMMFloatType>, double, int>(Kokkos::complex(v[1]*Mu[1]/C[1][1])).real();
+}
+
+/*!
+  Compute the value of a 3d Gaussian distribution function with mean Mu and covariance C at velocity v
+*/
+static GMMFloatType KOKKOS_INLINE_FUNCTION weight_3d(const GMMFloatType (&v)[3], const GMMFloatType (&Mu)[3], const GMMFloatType (&C)[3][3]) {
+	GMMFloatType I[3][3];
+	Matrix2d<GMMFloatType,3>::invert(I,C);
+	const GMMFloatType det = Matrix2d<GMMFloatType,3>::determinant(C);
+
+	const GMMFloatType rx = I[0][0]*(v[0]-Mu[0]) + I[0][1]*(v[1]-Mu[1]) + I[0][2]*(v[2]-Mu[2]);
+	const GMMFloatType ry = I[1][0]*(v[0]-Mu[0]) + I[1][1]*(v[1]-Mu[1]) + I[1][2]*(v[2]-Mu[2]);
+	const GMMFloatType rz = I[2][0]*(v[0]-Mu[0]) + I[2][1]*(v[1]-Mu[1]) + I[2][2]*(v[2]-Mu[2]);
+
+	const GMMFloatType arg = (v[0]-Mu[0])*rx + (v[1]-Mu[1])*ry + (v[2]-Mu[2])*rz;
+	return pow(2.*M_PI, -0.5*3)/Kokkos::sqrt(det) * Kokkos::exp(-0.5*arg);
+}
+
+};
+
+
+} // end namespace Impl
+} // end namespace Cabana
+
+#endif // end CABANA_GAUSSIAN_WEIGHT_HPP
diff --git a/core/src/impl/Cabana_Hammersley.hpp b/core/src/impl/Cabana_Hammersley.hpp
new file mode 100644
index 000000000..7f1b239c7
--- /dev/null
+++ b/core/src/impl/Cabana_Hammersley.hpp
@@ -0,0 +1,101 @@
+/****************************************************************************
+ * Copyright (c) 2023 by the Cabana authors                                 *
+ * All rights reserved.                                                     *
+ *                                                                          *
+ * This file is part of the Cabana library. Cabana is distributed under a   *
+ * BSD 3-clause license. For the licensing terms see the LICENSE file in    *
+ * the top-level directory.                                                 *
+ *                                                                          *
+ * SPDX-License-Identifier: BSD-3-Clause                                    *
+ ****************************************************************************/
+
+/*!
+  \file Cabana_Hammersley.hpp
+*/
+#ifndef CABANA_HAMMERSLEY_HPP
+#define CABANA_HAMMERSLEY_HPP
+
+#include <Kokkos_Vector.hpp>
+
+namespace Cabana {
+namespace Impl {
+
+/*!
+  Reverse the bits in the unsigned 64bit input
+*/
+uint64_t KOKKOS_INLINE_FUNCTION reverseBits64(uint64_t n) {
+	n = ((n & 0x00000000ffffffff) << 32) | ((n & 0xffffffff00000000) >> 32);
+	n = ((n & 0x0000ffff0000ffff) << 16) | ((n & 0xffff0000ffff0000) >> 16);
+	n = ((n & 0x00ff00ff00ff00ff) <<  8) | ((n & 0xff00ff00ff00ff00) >>  8);
+	n = ((n & 0x0f0f0f0f0f0f0f0f) <<  4) | ((n & 0xf0f0f0f0f0f0f0f0) >>  4);
+	n = ((n & 0x3333333333333333) <<  2) | ((n & 0xcccccccccccccccc) >>  2);
+	n = ((n & 0x5555555555555555) <<  1) | ((n & 0xaaaaaaaaaaaaaaaa) >>  1);
+	return n;
+}
+
+/*!
+  Reverse the digits of the input n in the number system with basis base to
+  compute the nth entry in the Van der Corput sequence.
+*/
+template <int base> struct Corput {
+static double KOKKOS_INLINE_FUNCTION value(int n){
+	const double inv_Base = (double)1. / (double)base;
+	uint64_t out = 0;
+	double power = 1.;
+	while (n > 0.) {
+		uint64_t next  = n / base;        // next value with the last digit removed / after the right shift
+		uint64_t digit = n - next * base; // last digit that we pop off
+		out = out * base + digit;         // shift output left and append digit
+		power *= inv_Base;                // keep track of how often we have done that
+		n = next;                         // shift input right
+	}
+	// At this point out is the reversed string when writing n as digits in base
+	return out * power;                   // get a value between 0 and 1
+}
+};
+
+/*!
+  In base 2 reversing the digits is just reversing the bits and then scaling by
+  the appropriate constant to shift things to the right of the decimal point.
+  This is much faster than the general case.
+*/
+template<> struct Corput<2> {
+static double KOKKOS_INLINE_FUNCTION value(int n) {
+	return reverseBits64(n) * 0x1p-64;
+}
+};
+
+
+/*!
+  Generate the nth sample (of N) of the hammersley sequence and return the value in dimension i
+*/
+double KOKKOS_INLINE_FUNCTION hammersley(const int i, const uint64_t n, const uint64_t N) {
+	if(n >= N) {
+		printf("Called hammersely with n = %zd, N=%zd, which is wrong\n", n, N);
+	}
+	if((i < 0) || (i > 5)) {
+		printf("Called hammersely with i = %d, which is wrong\n", i);
+	}
+
+	if(i == 0) {
+		return (double)n/(double)N;
+	} else if (i == 1) {
+		return Corput<2>::value(n);
+	} else if (i == 2) {
+		return Corput<3>::value(n);
+	} else if (i == 3) {
+		return Corput<5>::value(n);
+	} else if (i == 4) {
+		return Corput<7>::value(n);
+	} else if (i == 5) {
+		return Corput<11>::value(n);
+	} else {
+		return 0. / 0.;
+	}
+}
+
+
+} // end namespace Impl
+} // end namespace Cabana
+
+#endif // end CABANA_HAMMERSLEY_HPP
diff --git a/core/src/impl/Cabana_Matrix2d.hpp b/core/src/impl/Cabana_Matrix2d.hpp
new file mode 100644
index 000000000..6cb740f18
--- /dev/null
+++ b/core/src/impl/Cabana_Matrix2d.hpp
@@ -0,0 +1,231 @@
+/****************************************************************************
+ * Copyright (c) 2023 by the Cabana authors                                 *
+ * All rights reserved.                                                     *
+ *                                                                          *
+ * This file is part of the Cabana library. Cabana is distributed under a   *
+ * BSD 3-clause license. For the licensing terms see the LICENSE file in    *
+ * the top-level directory.                                                 *
+ *                                                                          *
+ * SPDX-License-Identifier: BSD-3-Clause                                    *
+ ****************************************************************************/
+
+/*!
+  \file impl/Cabana_Matrix2d.hpp
+  \brief
+  Linear algebra helpers. A lot of them are not implemented for generic
+  matricies and we probably shouldn't do that either, but call to some
+  appropriate linear algebra package. Special cases for small matricies are
+  implemented below.
+*/
+#ifndef CABANA_MATRIX2D_IMPL_HPP
+#define CABANA_MATRIX2D_IMPL_HPP
+
+#include <cmath>
+
+namespace Cabana {
+
+namespace Impl {
+
+template<typename FloatType, int dim>
+class Matrix2d {
+
+public:
+/*!
+  Compute the determinant of a dim-by-dim matrix
+*/
+	static FloatType KOKKOS_INLINE_FUNCTION determinant(const FloatType (&A)[dim][dim]) {
+		// I would use fprintf to stderr, but that isn't possible from device code
+		printf("Somebody should implement matrix determinants for larger than 3x3\n");
+		return 0./0.;
+	}
+
+/*!
+  Compute the Cholesky decomposition of a dim-by-dim matrix A and return B such that B * B.T = A
+*/
+	static void KOKKOS_INLINE_FUNCTION cholesky(FloatType (&B)[dim][dim], const FloatType (&A)[dim][dim]) {
+		printf("Somebody should implement Cholesky decomposition for larger than 3x3\n");
+		for(size_t i = 0; i < dim; i++) {
+			for(size_t j = 0; j < dim; j++) {
+				B[i][j] = 0./0.;
+			}
+		}
+	}
+
+/*!
+  Transpose a dim-by-dim matrix matrix A and returns B such that B.T = A
+*/
+	static void KOKKOS_INLINE_FUNCTION transpose(FloatType (&B)[dim][dim], const FloatType (&A)[dim][dim]) {
+		for(size_t i = 0; i < dim; i++) {
+			for(size_t j = 0; j < dim; j++) {
+				B[i][j] = A[j][i];
+			}
+		}
+	}
+
+/*!
+  Invert a dim-by-dim matrix matrix A and returns B such that B.A = 1
+*/
+	static void KOKKOS_INLINE_FUNCTION invert(FloatType (&B)[dim][dim], const FloatType (&A)[dim][dim]) {
+		printf("Somebody should implement matrix inversion for larger than 3x3\n");
+		for(size_t i = 0; i < dim; i++) {
+			for(size_t j = 0; j < dim; j++) {
+				B[i][j] = 0./0.;
+			}
+		}
+	}
+};
+
+//
+// The trivial 1x1 case
+//
+template<typename FloatType> struct Matrix2d<FloatType,1> {
+/*!
+  Compute the determinant of a trivial 1-by-1 matrix
+*/
+	static FloatType KOKKOS_INLINE_FUNCTION determinant(const FloatType (&A)[1][1]) {
+		return A[0][0];
+	}
+
+/*!
+  Compute the Cholesky decomposition of a trival 1-by-1 matrix A and return B such that B * B.T = A
+*/
+	static void KOKKOS_INLINE_FUNCTION cholesky(FloatType (&B)[1][1], const FloatType (&A)[1][1]) {
+		B[0][0] = Kokkos::sqrt(A[0][0]);
+	}
+
+/*!
+  Transpose a trivial 1-by-1 matrix matrix A and returns B such that B.T = A
+*/
+	static void KOKKOS_INLINE_FUNCTION transpose(FloatType (&B)[1][1], const FloatType (&A)[1][1]) {
+		B[0][0] = A[0][0];
+	}
+
+/*!
+  Invert a trivial 1-by-1 matrix matrix A and returns B such that B.A = 1
+*/
+	static void KOKKOS_INLINE_FUNCTION invert(FloatType (&B)[1][1], const FloatType (&A)[1][1]) {
+		// determinant
+		const FloatType det = determinant(A);
+
+		// elements of the inverse
+		B[0][0] =  1./det;
+	}
+};
+
+//
+// The 2x2 case
+//
+template<typename FloatType> struct Matrix2d<FloatType,2> {
+/*!
+  Compute the determinant of a 2-by-2 matrix
+*/
+	static FloatType KOKKOS_INLINE_FUNCTION determinant(const FloatType (&A)[2][2]) {
+		return A[0][0]*A[1][1] - A[1][0]*A[0][1];
+	}
+
+/*!
+  Compute the Cholesky decomposition of a 2-by-2 matrix A and return B such that B * B.T = A
+*/
+	static void KOKKOS_INLINE_FUNCTION cholesky(FloatType (&B)[2][2], const FloatType (&A)[2][2]) {
+		B[0][0] = Kokkos::sqrt(A[0][0]);
+		B[0][1] = 0.;
+		B[1][0] = A[1][0] / B[0][0];
+		B[1][1] = Kokkos::sqrt(A[1][1] - B[1][0]*B[1][0]);
+	}
+
+/*!
+  Transpose a 2-by-2 matrix matrix A and returns B such that B.T = A
+*/
+	static void KOKKOS_INLINE_FUNCTION transpose(FloatType (&B)[2][2], const FloatType (&A)[2][2]) {
+		B[0][0] = A[0][0];
+		B[0][1] = A[1][0];
+		B[1][0] = A[0][1];
+		B[1][1] = A[1][1];
+	}
+
+/*!
+  Invert a 2-by-2 matrix matrix A and returns B such that B.A = 1
+*/
+	static void KOKKOS_INLINE_FUNCTION invert(FloatType (&B)[2][2], const FloatType (&A)[2][2]) {
+		// determinant
+		const FloatType det = determinant(A);
+
+		// elements of the inverse
+		B[0][0] =  A[1][1]/det;
+		B[0][1] = -A[0][1]/det;
+		B[1][0] = -A[1][0]/det;
+		B[1][1] =  A[0][0]/det;
+	}
+};
+
+//
+// The 3x3 case
+//
+template<typename FloatType> struct Matrix2d<FloatType,3> {
+/*!
+  Compute the determinant of a 3-by-3 matrix
+*/
+	static FloatType KOKKOS_INLINE_FUNCTION determinant(const FloatType (&A)[3][3]) {
+		return A[0][0]*A[1][1]*A[2][2]
+			 + A[0][1]*A[1][2]*A[2][0]
+			 + A[0][2]*A[1][0]*A[2][1]
+			 - A[2][0]*A[1][1]*A[0][2]
+			 - A[2][1]*A[1][2]*A[0][0]
+			 - A[2][2]*A[1][0]*A[0][1];
+	}
+
+/*!
+  Compute the Cholesky decomposition of a 3-by-3 matrix A and return B such that B * B.T = A
+*/
+	static void KOKKOS_INLINE_FUNCTION cholesky(FloatType (&B)[3][3], const FloatType (&A)[3][3]) {
+		B[0][0] = Kokkos::sqrt(A[0][0]);
+		B[0][1] = 0.;
+		B[0][2] = 0.;
+		B[1][0] = A[1][0] / B[0][0];
+		B[1][1] = Kokkos::sqrt(A[1][1] - B[1][0]*B[1][0]);
+		B[1][2] = 0.;
+		B[2][0] = A[2][0] / B[0][0];
+		B[2][1] = (A[2][1] - B[2][0]*B[1][0]) / B[1][1];
+		B[2][2] = Kokkos::sqrt(A[2][2] - B[2][0]*B[2][0] - B[2][1]*B[2][1]);
+	}
+
+/*!
+  Transpose a 3-by-3 matrix matrix A and returns B such that B.T = A
+*/
+	static void KOKKOS_INLINE_FUNCTION transpose(FloatType (&B)[3][3], const FloatType (&A)[3][3]) {
+		B[0][0] = A[0][0];
+		B[0][1] = A[1][0];
+		B[0][2] = A[2][0];
+		B[1][0] = A[0][1];
+		B[1][1] = A[1][1];
+		B[1][2] = A[2][1];
+		B[2][0] = A[0][2];
+		B[2][1] = A[1][2];
+		B[2][2] = A[2][2];
+	}
+
+/*!
+  Invert a 3-by-3 matrix matrix A and returns B such that B.A = 1
+*/
+	static void KOKKOS_INLINE_FUNCTION invert(FloatType (&B)[3][3], const FloatType (&A)[3][3]) {
+		// determinant
+		const FloatType det = determinant(A);
+
+		// elements of the inverse
+		B[0][0] = (A[1][1]*A[2][2]-A[1][2]*A[2][1])/det;
+		B[0][1] = (A[0][2]*A[2][1]-A[0][1]*A[2][2])/det;
+		B[0][2] = (A[0][1]*A[1][2]-A[0][2]*A[1][1])/det;
+		B[1][0] = (A[1][2]*A[2][0]-A[1][0]*A[2][2])/det;
+		B[1][1] = (A[0][0]*A[2][2]-A[0][2]*A[2][0])/det;
+		B[1][2] = (A[0][2]*A[1][0]-A[0][0]*A[1][2])/det;
+		B[2][0] = (A[1][0]*A[2][1]-A[1][1]*A[2][0])/det;
+		B[2][1] = (A[0][1]*A[2][0]-A[0][0]*A[2][1])/det;
+		B[2][2] = (A[0][0]*A[1][1]-A[0][1]*A[1][0])/det;
+	}
+};
+
+} // end namespace Impl
+
+} // end namespace Cabana
+
+#endif // end CABANA_MATRIX2D_IMPL_HPP
diff --git a/core/unit_test/CMakeLists.txt b/core/unit_test/CMakeLists.txt
index 00aa49379..fc7c74cf9 100644
--- a/core/unit_test/CMakeLists.txt
+++ b/core/unit_test/CMakeLists.txt
@@ -16,6 +16,11 @@ set(NOBACKEND_TESTS
   Index
   CartesianGrid
   SoA
+  Erfinv
+  Hammersley
+  Matrix1
+  Matrix2
+  Matrix3
   )
 
 set(SERIAL_TESTS
diff --git a/core/unit_test/tstErfinv.cpp b/core/unit_test/tstErfinv.cpp
new file mode 100644
index 000000000..e209a4a5b
--- /dev/null
+++ b/core/unit_test/tstErfinv.cpp
@@ -0,0 +1,44 @@
+/****************************************************************************
+ * Copyright (c) 2023 by the Cabana authors                                 *
+ * All rights reserved.                                                     *
+ *                                                                          *
+ * This file is part of the Cabana library. Cabana is distributed under a   *
+ * BSD 3-clause license. For the licensing terms see the LICENSE file in    *
+ * the top-level directory.                                                 *
+ *                                                                          *
+ * SPDX-License-Identifier: BSD-3-Clause                                    *
+ ****************************************************************************/
+
+#include <Kokkos_Vector.hpp>
+#include <impl/Cabana_Erfinv.hpp>
+
+#include <gtest/gtest.h>
+
+namespace Test
+{
+
+//---------------------------------------------------------------------------//
+// Erfinv test
+void testErfinv()
+{
+	// Test values from the original paper
+	EXPECT_NEAR(Cabana::Impl::ppnd7(0.25),  -0.6744897501960817, 1e-7);
+	EXPECT_NEAR(Cabana::Impl::ppnd7(0.001), -3.090232306167814,  1e-6);
+	EXPECT_NEAR(Cabana::Impl::ppnd7(1e-20), -9.262340089798408,  1e-6);
+
+	// test that erfinv() is actually the inverse of erf()
+	double testpoints[] = {-0.999, -0.9, -0.5, -0.1, -1e-3, -1e-8, 1e-8, 1e-3, 0.1, 0.5, 0.9, 0.999};
+	for(int i = 0; i < sizeof(testpoints)/sizeof(testpoints[0]); i++) {
+		const double x = testpoints[i];
+		EXPECT_NEAR(x, Kokkos::erf(Cabana::Impl::erfinv(x)), 1e-7);
+	}
+}
+
+//---------------------------------------------------------------------------//
+// TESTS
+//---------------------------------------------------------------------------//
+TEST( cabana_erfinv, erfinv_test ) { testErfinv(); }
+
+//---------------------------------------------------------------------------//
+
+} // end namespace Test
diff --git a/core/unit_test/tstHammersley.cpp b/core/unit_test/tstHammersley.cpp
new file mode 100644
index 000000000..9120c6720
--- /dev/null
+++ b/core/unit_test/tstHammersley.cpp
@@ -0,0 +1,61 @@
+/****************************************************************************
+ * Copyright (c) 2023 by the Cabana authors                                 *
+ * All rights reserved.                                                     *
+ *                                                                          *
+ * This file is part of the Cabana library. Cabana is distributed under a   *
+ * BSD 3-clause license. For the licensing terms see the LICENSE file in    *
+ * the top-level directory.                                                 *
+ *                                                                          *
+ * SPDX-License-Identifier: BSD-3-Clause                                    *
+ ****************************************************************************/
+
+#include <Kokkos_Vector.hpp>
+#include <impl/Cabana_Hammersley.hpp>
+#include <cmath>
+#include <cstdio>
+#include <cstdint>
+#include <algorithm>
+
+#include <gtest/gtest.h>
+
+namespace Test
+{
+
+//---------------------------------------------------------------------------//
+// Corput test
+void testCorput()
+{
+	EXPECT_NEAR(Cabana::Impl::Corput<2>::value(0), 0./8., 1e-7);
+	EXPECT_NEAR(Cabana::Impl::Corput<2>::value(1), 4./8., 1e-7);
+	EXPECT_NEAR(Cabana::Impl::Corput<2>::value(2), 2./8., 1e-7);
+	EXPECT_NEAR(Cabana::Impl::Corput<2>::value(3), 6./8., 1e-7);
+	EXPECT_NEAR(Cabana::Impl::Corput<2>::value(4), 1./8., 1e-7);
+	EXPECT_NEAR(Cabana::Impl::Corput<2>::value(5), 5./8., 1e-7);
+	EXPECT_NEAR(Cabana::Impl::Corput<2>::value(6), 3./8., 1e-7);
+	EXPECT_NEAR(Cabana::Impl::Corput<2>::value(7), 7./8., 1e-7);
+
+	EXPECT_NEAR(Cabana::Impl::Corput<3>::value(0), 0./9., 1e-7);
+	EXPECT_NEAR(Cabana::Impl::Corput<3>::value(1), 3./9., 1e-7);
+	EXPECT_NEAR(Cabana::Impl::Corput<3>::value(2), 6./9., 1e-7);
+	EXPECT_NEAR(Cabana::Impl::Corput<3>::value(3), 1./9., 1e-7);
+	EXPECT_NEAR(Cabana::Impl::Corput<3>::value(4), 4./9., 1e-7);
+	EXPECT_NEAR(Cabana::Impl::Corput<3>::value(5), 7./9., 1e-7);
+	EXPECT_NEAR(Cabana::Impl::Corput<3>::value(6), 2./9., 1e-7);
+	EXPECT_NEAR(Cabana::Impl::Corput<3>::value(7), 5./9., 1e-7);
+	EXPECT_NEAR(Cabana::Impl::Corput<3>::value(8), 8./9., 1e-7);
+
+	const int N = 1020; // 34 * 2*3*5
+	auto _print_hammersley = KOKKOS_LAMBDA(const int& i) {
+		printf("%f %f %f %f\n", Cabana::Impl::hammersley(0,i,N), Cabana::Impl::hammersley(1,i,N), Cabana::Impl::hammersley(2,i,N), Cabana::Impl::hammersley(3,i,N));
+	};
+	Kokkos::parallel_for("print hammersley", N, _print_hammersley);
+}
+
+//---------------------------------------------------------------------------//
+// TESTS
+//---------------------------------------------------------------------------//
+TEST( cabana_corput, corput_test ) { testCorput(); }
+
+//---------------------------------------------------------------------------//
+
+} // end namespace Test
diff --git a/core/unit_test/tstMatrix1.cpp b/core/unit_test/tstMatrix1.cpp
new file mode 100644
index 000000000..c4114b8cb
--- /dev/null
+++ b/core/unit_test/tstMatrix1.cpp
@@ -0,0 +1,74 @@
+/****************************************************************************
+ * Copyright (c) 2023 by the Cabana authors                                 *
+ * All rights reserved.                                                     *
+ *                                                                          *
+ * This file is part of the Cabana library. Cabana is distributed under a   *
+ * BSD 3-clause license. For the licensing terms see the LICENSE file in    *
+ * the top-level directory.                                                 *
+ *                                                                          *
+ * SPDX-License-Identifier: BSD-3-Clause                                    *
+ ****************************************************************************/
+
+#include <Kokkos_Vector.hpp>
+#include <impl/Cabana_Matrix2d.hpp>
+
+#include <gtest/gtest.h>
+
+namespace Test
+{
+
+//---------------------------------------------------------------------------//
+// Matrix test
+void testMatrix()
+{
+	// Hard code a random matrix
+	const float A1[1][1] = {4.f};
+
+	// Check determinant
+	auto _determinant_A1 = KOKKOS_LAMBDA(const int& i) {
+		float det = Cabana::Impl::Matrix2d<float,1>::determinant(A1);
+
+		assert(det == 4.f);
+	};
+	Kokkos::parallel_for("Determinant A1", 1, _determinant_A1);
+
+	// Check transposition
+	auto _transpose_A1 = KOKKOS_LAMBDA(const int& i) {
+		float T1[1][1];
+		Cabana::Impl::Matrix2d<float,1>::transpose(T1, A1);
+
+		assert(T1[0][0] == 4.f);
+	};
+	Kokkos::parallel_for("Transpose A1", 1, _transpose_A1);
+
+	// Check inverse
+	auto _invert_A1 = KOKKOS_LAMBDA(const int& i) {
+		float I1[1][1];
+		Cabana::Impl::Matrix2d<float,1>::invert(I1, A1);
+
+		assert(I1[0][0] ==  0.25f);
+	};
+	Kokkos::parallel_for("Invert A1", 1, _invert_A1);
+
+	// Test cholesky decomposition
+	auto _cholesky_A1 = KOKKOS_LAMBDA(const int& i) {
+		float C1[1][1];
+		Cabana::Impl::Matrix2d<float,1>::cholesky(C1, A1);
+
+		assert(C1[0][0] ==  2.f);
+	};
+	Kokkos::parallel_for("Decompose A1", 1, _cholesky_A1);
+}
+
+//---------------------------------------------------------------------------//
+// TESTS
+//---------------------------------------------------------------------------//
+TEST( cabana_matrix, matrix_test ) { testMatrix(); }
+
+//---------------------------------------------------------------------------//
+
+} // end namespace Test
+
+
+
+
diff --git a/core/unit_test/tstMatrix2.cpp b/core/unit_test/tstMatrix2.cpp
new file mode 100644
index 000000000..0b7aaef39
--- /dev/null
+++ b/core/unit_test/tstMatrix2.cpp
@@ -0,0 +1,113 @@
+/****************************************************************************
+ * Copyright (c) 2023 by the Cabana authors                                 *
+ * All rights reserved.                                                     *
+ *                                                                          *
+ * This file is part of the Cabana library. Cabana is distributed under a   *
+ * BSD 3-clause license. For the licensing terms see the LICENSE file in    *
+ * the top-level directory.                                                 *
+ *                                                                          *
+ * SPDX-License-Identifier: BSD-3-Clause                                    *
+ ****************************************************************************/
+
+#include <Kokkos_Vector.hpp>
+#include <impl/Cabana_Matrix2d.hpp>
+
+#include <gtest/gtest.h>
+
+namespace Test
+{
+
+//---------------------------------------------------------------------------//
+// Matrix test
+void testMatrix()
+{
+	// Hard code a random matrix
+	const float A2[2][2] = {{1.94f, 1.14f},
+							{5.58f, 6.66f}};
+
+	// Hard code a second, symmetrix matrix
+	const float B2[2][2] = {{ 4.f, 12.f},
+							{12.f, 37.f}};
+
+	// Check determinant
+	auto _determinant_A2 = KOKKOS_LAMBDA(const int& i) {
+		float det = Cabana::Impl::Matrix2d<float,2>::determinant(A2);
+
+		assert(det == 6.5592f);
+	};
+	Kokkos::parallel_for("Determinant A2", 1, _determinant_A2);
+
+	auto _determinant_B2 = KOKKOS_LAMBDA(const int& i) {
+		float det = Cabana::Impl::Matrix2d<float,2>::determinant(B2);
+
+		assert(det == 4.f);
+	};
+	Kokkos::parallel_for("Determinant B2", 1, _determinant_B2);
+
+	// Check transposition
+	auto _transpose_A2 = KOKKOS_LAMBDA(const int& i) {
+		float T2[2][2];
+		Cabana::Impl::Matrix2d<float,2>::transpose(T2, A2);
+
+		assert(T2[0][0] == 1.94f);
+		assert(T2[0][1] == 5.58f);
+		assert(T2[1][0] == 1.14f);
+		assert(T2[1][1] == 6.66f);
+	};
+	Kokkos::parallel_for("Transpose A2", 1, _transpose_A2);
+
+	auto _transpose_B2 = KOKKOS_LAMBDA(const int& i) {
+		float T2[2][2];
+		Cabana::Impl::Matrix2d<float,2>::transpose(T2, B2);
+
+		assert(T2[0][0] == B2[0][0]);
+		assert(T2[0][1] == B2[0][1]);
+		assert(T2[1][0] == B2[1][0]);
+		assert(T2[1][1] == B2[1][1]);
+	};
+	Kokkos::parallel_for("Transpose B2", 1, _transpose_B2);
+
+	// Check inverse
+	auto _invert_A2 = KOKKOS_LAMBDA(const int& i) {
+		float I2[2][2];
+		Cabana::Impl::Matrix2d<float,2>::invert(I2, A2);
+
+		assert(I2[0][0] ==  1.015370f);
+		assert(I2[0][1] == -0.173802f);
+		assert(I2[1][0] == -0.850714f);
+		assert(I2[1][1] ==  0.295768f);
+	};
+	Kokkos::parallel_for("Invert A2", 1, _invert_A2);
+
+	auto _invert_B2 = KOKKOS_LAMBDA(const int& i) {
+		float I2[2][2];
+		Cabana::Impl::Matrix2d<float,2>::invert(I2, B2);
+
+		assert(I2[0][0] ==  9.25f);
+		assert(I2[0][1] == -3.00f);
+		assert(I2[1][0] == -3.00f);
+		assert(I2[1][1] ==  1.00f);
+	};
+	Kokkos::parallel_for("Invert B2", 1, _invert_B2);
+
+	// Test cholesky decomposition
+	auto _cholesky_B2 = KOKKOS_LAMBDA(const int& i) {
+		float C2[2][2];
+		Cabana::Impl::Matrix2d<float,2>::cholesky(C2, B2);
+
+		assert(C2[0][0] == 2.f);
+		assert(C2[0][1] == 0.f);
+		assert(C2[1][0] == 6.f);
+		assert(C2[1][1] == 1.f);
+	};
+	Kokkos::parallel_for("Decompose B2", 1, _cholesky_B2);
+}
+
+//---------------------------------------------------------------------------//
+// TESTS
+//---------------------------------------------------------------------------//
+TEST( cabana_matrix, matrix_test ) { testMatrix(); }
+
+//---------------------------------------------------------------------------//
+
+} // end namespace Test
diff --git a/core/unit_test/tstMatrix3.cpp b/core/unit_test/tstMatrix3.cpp
new file mode 100644
index 000000000..3a08efba0
--- /dev/null
+++ b/core/unit_test/tstMatrix3.cpp
@@ -0,0 +1,140 @@
+/****************************************************************************
+ * Copyright (c) 2023 by the Cabana authors                                 *
+ * All rights reserved.                                                     *
+ *                                                                          *
+ * This file is part of the Cabana library. Cabana is distributed under a   *
+ * BSD 3-clause license. For the licensing terms see the LICENSE file in    *
+ * the top-level directory.                                                 *
+ *                                                                          *
+ * SPDX-License-Identifier: BSD-3-Clause                                    *
+ ****************************************************************************/
+
+#include <Kokkos_Vector.hpp>
+#include <impl/Cabana_Matrix2d.hpp>
+
+#include <gtest/gtest.h>
+
+namespace Test
+{
+
+//---------------------------------------------------------------------------//
+// Matrix test
+void testMatrix()
+{
+	// Hard code a random matrix
+	const float A3[3][3] = {{1.94f, 1.14f, 1.47f},
+							{5.58f, 6.66f, 0.25f},
+							{8.16f, 0.83f, 8.81f}};
+
+	// Hard code a second, symmetrix matrix
+	const float B3[3][3] = {{  4.f,  12.f, -16.f},
+							{ 12.f,  37.f, -43.f},
+							{-16.f, -43.f,  98.f}};
+
+	// Check determinant
+	auto _determinant_A3 = KOKKOS_LAMBDA(const int& i) {
+		float det = Cabana::Impl::Matrix2d<float,3>::determinant(A3);
+
+		assert(det == -13.3703f);
+	};
+	Kokkos::parallel_for("Determinant A3", 1, _determinant_A3);
+
+	auto _determinant_B3 = KOKKOS_LAMBDA(const int& i) {
+		float det = Cabana::Impl::Matrix2d<float,3>::determinant(B3);
+
+		assert(det == 36.f);
+	};
+	Kokkos::parallel_for("Determinant B3", 1, _determinant_B3);
+
+	// Check transposition
+	auto _transpose_A3 = KOKKOS_LAMBDA(const int& i) {
+		float T3[3][3];
+		Cabana::Impl::Matrix2d<float,3>::transpose(T3, A3);
+
+		assert(T3[0][0] == 1.94f);
+		assert(T3[0][1] == 5.58f);
+		assert(T3[0][2] == 8.16f);
+		assert(T3[1][0] == 1.14f);
+		assert(T3[1][1] == 6.66f);
+		assert(T3[1][2] == 0.83f);
+		assert(T3[2][0] == 1.47f);
+		assert(T3[2][1] == 0.25f);
+		assert(T3[2][2] == 8.81f);
+	};
+	Kokkos::parallel_for("Transpose A3", 1, _transpose_A3);
+
+	auto _transpose_B3 = KOKKOS_LAMBDA(const int& i) {
+		float T3[3][3];
+		Cabana::Impl::Matrix2d<float,3>::transpose(T3, B3);
+
+		assert(T3[0][0] == B3[0][0]);
+		assert(T3[0][1] == B3[0][1]);
+		assert(T3[0][2] == B3[0][2]);
+		assert(T3[1][0] == B3[1][0]);
+		assert(T3[1][1] == B3[1][1]);
+		assert(T3[1][2] == B3[1][2]);
+		assert(T3[2][0] == B3[2][0]);
+		assert(T3[2][1] == B3[2][1]);
+		assert(T3[2][2] == B3[2][2]);
+	};
+	Kokkos::parallel_for("Transpose B3", 1, _transpose_B3);
+
+	// Check inverse
+	auto _invert_A3 = KOKKOS_LAMBDA(const int& i) {
+		float I3[3][3];
+		Cabana::Impl::Matrix2d<float,3>::invert(I3, A3);
+
+		assert(I3[0][0] == -4.372920f);
+		assert(I3[0][1] ==  0.659919f);
+		assert(I3[0][2] ==  0.710920f);
+		assert(I3[1][0] ==  3.524220f);
+		assert(I3[1][1] == -0.381159f);
+		assert(I3[1][2] == -0.577221f);
+		assert(I3[2][0] ==  3.718266f);
+		assert(I3[2][1] == -0.575321f);
+		assert(I3[2][2] == -0.490581f);
+	};
+	Kokkos::parallel_for("Invert A3", 1, _invert_A3);
+
+	auto _invert_B3 = KOKKOS_LAMBDA(const int& i) {
+		float I3[3][3];
+		Cabana::Impl::Matrix2d<float,3>::invert(I3, B3);
+
+		assert(I3[0][0] ==  49.361111f);
+		assert(I3[0][1] == -13.555555f);
+		assert(I3[0][2] ==   2.111111f);
+		assert(I3[1][0] == -13.555555f);
+		assert(I3[1][1] ==   3.777777f);
+		assert(I3[1][2] ==  -0.555555f);
+		assert(I3[2][0] ==   2.111111f);
+		assert(I3[2][1] ==  -0.555555f);
+		assert(I3[2][2] ==   0.111111f);
+	};
+	Kokkos::parallel_for("Invert B3", 1, _invert_B3);
+
+	// Test cholesky decomposition
+	auto _cholesky_B3 = KOKKOS_LAMBDA(const int& i) {
+		float C3[3][3];
+		Cabana::Impl::Matrix2d<float,3>::cholesky(C3, B3);
+
+		assert(C3[0][0] ==  2.f);
+		assert(C3[0][1] ==  0.f);
+		assert(C3[0][2] ==  0.f);
+		assert(C3[1][0] ==  6.f);
+		assert(C3[1][1] ==  1.f);
+		assert(C3[1][2] ==  0.f);
+		assert(C3[2][0] == -8.f);
+		assert(C3[2][1] ==  5.f);
+		assert(C3[2][2] ==  3.f);
+	};
+	Kokkos::parallel_for("Decompose B3", 1, _cholesky_B3);
+}
+
+//---------------------------------------------------------------------------//
+// TESTS
+//---------------------------------------------------------------------------//
+TEST( cabana_matrix, matrix_test ) { testMatrix(); }
+
+//---------------------------------------------------------------------------//
+
+} // end namespace Test
diff --git a/example/core_tutorial/14_gmm/CMakeLists.txt b/example/core_tutorial/14_gmm/CMakeLists.txt
new file mode 100644
index 000000000..e63d14cbe
--- /dev/null
+++ b/example/core_tutorial/14_gmm/CMakeLists.txt
@@ -0,0 +1,52 @@
+############################################################################
+# Copyright (c) 2018-2023 by the Cabana authors                            #
+# All rights reserved.                                                     #
+#                                                                          #
+# This file is part of the Cabana library. Cabana is distributed under a   #
+# BSD 3-clause license. For the licensing terms see the LICENSE file in    #
+# the top-level directory.                                                 #
+#                                                                          #
+# SPDX-License-Identifier: BSD-3-Clause                                    #
+############################################################################
+
+# the next for lines are just, so that we can use this as
+# a standalone CMakeLists.txt for testing of the Target export
+cmake_minimum_required(VERSION 3.16)
+project(Cabana_GMM)
+if(NOT TARGET Cabana::cabanacore)
+  find_package(Cabana)
+endif()
+
+# Dimensions of velocity space
+set(VelocityDimensions "3" CACHE STRING "Properties of the dimensions of velocity space")
+set(VelocityDimensionsValues 1 2 3) # 1d Cartesian, 2d cylindrical, 3d cartesian. If we ever want 2d cartestian we have to come up with something
+set_property(CACHE VelocityDimensions PROPERTY STRINGS ${VelocityDimensionsValues})
+message(STATUS "VelocityDimensions='${VelocityDimensions}'")
+
+# Floating point type for particle data
+set(ParticleFloatType "double" CACHE STRING "Floating point type for particle data")
+set(ParticleFloatTypeValues float double)
+set_property(CACHE ParticleFloatType PROPERTY STRINGS ${ParticleFloatTypeValues})
+message(STATUS "ParticleFloatType='${ParticleFloatType}'")
+
+# Floating point type for particle data
+set(GmmFloatType "double" CACHE STRING "Floating point type for particle data")
+set(GmmFloatTypeValues float double)
+set_property(CACHE GmmFloatType PROPERTY STRINGS ${GmmFloatTypeValues})
+message(STATUS "GmmFloatType='${GmmFloatType}'")
+
+file(GLOB HEADERS "*.h *hpp")
+file(GLOB SOURCES "*.cpp")
+
+install(FILES ${HEADERS} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
+
+set (CMAKE_CXX_FLAGS "-Wall -Wextra -Wshadow -g3")
+set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DVelocityDimensions=${VelocityDimensions}")
+set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DParticleFloatType=${ParticleFloatType}")
+set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DGmmFloatType=${GmmFloatType}")
+
+add_executable(gmm ./main.cpp)
+target_link_libraries(gmm PUBLIC Cabana::cabanacore)
+target_include_directories(gmm PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
+target_include_directories(gmm PUBLIC "${KokkosKernels_CMAKE_DIR}/../../../include")
+install(TARGETS gmm DESTINATION ${CMAKE_INSTALL_BINDIR})
diff --git a/example/core_tutorial/14_gmm/main.cpp b/example/core_tutorial/14_gmm/main.cpp
new file mode 100644
index 000000000..8b04881f6
--- /dev/null
+++ b/example/core_tutorial/14_gmm/main.cpp
@@ -0,0 +1,1090 @@
+#include <Cabana_Core.hpp>
+#include <Cabana_AoSoA.hpp>
+#include <Kokkos_Vector.hpp>
+
+#include <Cabana_ParticleInit.hpp>
+#include <limits>
+#include <cmath>
+
+
+using MemorySpace = Kokkos::DefaultExecutionSpace::memory_space;
+using ExecutionSpace = Kokkos::DefaultExecutionSpace;
+
+enum ParticleFields {
+    CellIndex,
+    MacroFactor,
+    PositionX,
+#if VelocityDimensions == 1
+    VelocityX
+#elif VelocityDimensions == 2
+    VelocityPar,
+    VelocityPer
+#elif VelocityDimensions == 3
+    VelocityX,
+    VelocityY,
+    VelocityZ
+#endif
+};
+
+
+// Designate the types that the particles will hold.
+using ParticleDataTypes =
+Cabana::MemberTypes<
+    int,                          // (0) Cell index
+    ParticleFloatType,            // (1) macro factor
+    ParticleFloatType,            // (2) x-position
+#if VelocityDimensions == 1
+    ParticleFloatType             // (3) x-velocity
+#elif VelocityDimensions == 2
+    ParticleFloatType,            // (3) parallel-velocity
+    ParticleFloatType             // (4) perp-velocity
+#elif VelocityDimensions == 3
+    ParticleFloatType,            // (3) x-velocity
+    ParticleFloatType,            // (4) y-velocity
+    ParticleFloatType             // (5) z-velocity
+#endif
+>;
+
+// Set the type for the particle AoSoA.
+using particle_list_t = Cabana::AoSoA<ParticleDataTypes,MemorySpace>;
+
+
+// Set the type for a list of Gaussians
+using gaussian_list_t = Kokkos::View<GmmFloatType**[n_gaussian_param],Kokkos::HostSpace>;
+using gaussian_dev_t  = Kokkos::View<GmmFloatType**[n_gaussian_param],Kokkos::LayoutRight>; // Should we force both host and device view to be LayoutLeft instead?
+
+
+enum ParticleGenerators {
+	RandomSampling,
+	CDFInversion,
+	HammersleyEqualDensity,
+	HammersleyEqualWeight
+};
+
+void check_gaussians(const gaussian_list_t& gaussians) {
+	int cmax = gaussians.extent(0);
+	int kmax = gaussians.extent(1);
+	Kokkos::View<GmmFloatType**[VelocityDimensions][VelocityDimensions]> C("C", cmax,kmax);
+	Kokkos::View<GmmFloatType**[VelocityDimensions][VelocityDimensions]> B("B", cmax,kmax);
+
+	auto g_dev = Kokkos::create_mirror_view(MemorySpace(), gaussians);
+	Kokkos::deep_copy(g_dev, gaussians);
+
+	auto _cholesky = KOKKOS_LAMBDA(const int& j) {
+		for(int c = 0; c < cmax; c++) {
+			if(g_dev(c,j,Weight)>0.) {
+#if VelocityDimensions == 1
+				C(c,j,0,0) = g_dev(c,j,Cxx);
+				B(c,j,0,0) = g_dev(c,j,Cxx);
+#elif VelocityDimensions == 2
+				C(c,j,0,0) = g_dev(c,j,Cparpar); C(c,j,0,1) = g_dev(c,j,Cparper);
+				C(c,j,1,0) = g_dev(c,j,Cperpar); C(c,j,1,1) = g_dev(c,j,Cperper);
+				GmmFloatType Cj[2][2] = {{g_dev(c,j,Cparpar), g_dev(c,j,Cparper)},
+				                         {g_dev(c,j,Cperpar), g_dev(c,j,Cperper)}};
+				GmmFloatType Bj[2][2];
+				Cabana::Impl::Matrix<GmmFloatType,2>::cholesky(Bj,Cj);
+				B(c,j,0,0) = Bj[0][0]; B(c,j,0,1) = Bj[0][1];
+				B(c,j,1,0) = Bj[1][0]; B(c,j,1,1) = Bj[1][1];
+#elif VelocityDimensions == 3
+				C(c,j,0,0) = g_dev(c,j,Cxx); C(c,j,0,1) = g_dev(c,j,Cxy); C(c,j,0,2) = g_dev(c,j,Cxz);
+				C(c,j,1,0) = g_dev(c,j,Cyx); C(c,j,1,1) = g_dev(c,j,Cyy); C(c,j,1,2) = g_dev(c,j,Cyz);
+				C(c,j,2,0) = g_dev(c,j,Czx); C(c,j,2,1) = g_dev(c,j,Czy); C(c,j,2,2) = g_dev(c,j,Czz);
+				GmmFloatType Cj[3][3] = {{g_dev(c,j,Cxx), g_dev(c,j,Cxy), g_dev(c,j,Cxz)},
+				                         {g_dev(c,j,Cyx), g_dev(c,j,Cyy), g_dev(c,j,Cyz)},
+				                         {g_dev(c,j,Czx), g_dev(c,j,Czy), g_dev(c,j,Czz)}};
+				GmmFloatType Bj[3][3];
+				Cabana::Impl::Matrix2d<GmmFloatType,3>::cholesky(Bj,Cj);
+				B(c,j,0,0) = Bj[0][0]; B(c,j,0,1) = Bj[0][1]; B(c,j,0,2) = Bj[0][2];
+				B(c,j,1,0) = Bj[1][0]; B(c,j,1,1) = Bj[1][1]; B(c,j,1,2) = Bj[1][2];
+				B(c,j,2,0) = Bj[1][0]; B(c,j,2,1) = Bj[2][1]; B(c,j,2,2) = Bj[2][2];
+#endif
+			}
+		}
+	};
+
+	Kokkos::parallel_for("Check gaussians", kmax, _cholesky);
+
+	auto B_host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), B);
+	auto C_host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), C);
+
+	for(int c = 0; c < cmax; c++) {
+		for(int m = 0; m < kmax; m++) {
+			if(gaussians(c,m,Weight)>0.) {
+#if VelocityDimensions == 1
+				if(std::isnan(B_host(c,m,0,0))) {
+					printf("Gaussian %d in cell %d has covariance matrix C=(%e), but is that really symmetric and positive definite, because I get a Cholesky decomposition of (%e)\n", m,c, C_host(c,m,0,0), B_host(c,m,0,0));
+					exit(1);
+				}
+#elif VelocityDimensions == 2
+				if(std::isnan(B_host(c,m,0,0))||std::isnan(B_host(c,m,0,1)) ||
+				   std::isnan(B_host(c,m,1,0))||std::isnan(B_host(c,m,1,1)) ) {
+					printf("Gaussian %d in cell %d has covariance matrix C=((%e,%e),(%e,%e)), but is that really symmetric and positive definite, because I get a Cholesky decomposition of ((%e,%e),(%e,%e))\n", m,c, C_host(c,m,0,0),C_host(c,m,0,1), C_host(c,m,1,0),C_host(c,m,1,1), B_host(c,m,0,0),B_host(c,m,0,1), B_host(c,m,1,0),B_host(c,m,1,1));
+					exit(1);
+				}
+#elif VelocityDimensions == 3
+				if(std::isnan(B_host(c,m,0,0))||std::isnan(B_host(c,m,0,1))||std::isnan(B_host(c,m,0,2)) ||
+				   std::isnan(B_host(c,m,1,0))||std::isnan(B_host(c,m,1,1))||std::isnan(B_host(c,m,1,2)) ||
+				   std::isnan(B_host(c,m,2,0))||std::isnan(B_host(c,m,2,1))||std::isnan(B_host(c,m,2,2)) ) {
+					printf("Gaussian %d in cell %d has covariance matrix C=((%e,%e,%e),(%e,%e,%e),(%e,%e,%e)), but is that really symmetric and positive definite, because I get a Cholesky decomposition of ((%e,%e,%e),(%e,%e,%e),(%e,%e,%e))\n", m,c, C_host(c,m,0,0),C_host(c,m,0,1),C_host(c,m,0,2), C_host(c,m,1,0),C_host(c,m,1,1),C_host(c,m,1,2), C_host(c,m,2,0),C_host(c,m,2,1),C_host(c,m,2,2), B_host(c,m,0,0),B_host(c,m,0,1),B_host(c,m,0,2), B_host(c,m,1,0),B_host(c,m,1,1),B_host(c,m,1,2), B_host(c,m,2,0),B_host(c,m,2,1),B_host(c,m,2,2));
+					exit(1);
+				}
+#endif
+			}
+		}
+	}
+}
+
+void initialize_particles(particle_list_t& particles, const gaussian_list_t& gaussians, const int seed, enum ParticleGenerators samplingtype = RandomSampling) {
+	size_t Np;
+
+	auto cell       = Cabana::slice<CellIndex>(particles);
+	auto macro      = Cabana::slice<MacroFactor>(particles);
+	auto position_x = Cabana::slice<PositionX>(particles);
+#if VelocityDimensions == 1
+	auto velocity_x = Cabana::slice<VelocityX>(particles);
+
+	if(samplingtype == RandomSampling) {
+		Np = Cabana::initializeRandomParticles(cell, macro, position_x, velocity_x, gaussians, seed);
+	} else if (samplingtype == CDFInversion) {
+		Np = Cabana::initializeParticlesFromCDF(cell, macro, position_x, velocity_x, gaussians, seed);
+	} else if (samplingtype == HammersleyEqualDensity) {
+		Np = Cabana::initializeEqualDensityParticlesWithHammersley(cell, macro, position_x, velocity_x, gaussians);
+	} else if (samplingtype == HammersleyEqualWeight) {
+		Np = Cabana::initializeEqualWeightParticlesWithHammersley(cell, macro, position_x, velocity_x, gaussians);
+	} else {
+		fprintf(stderr, "Unimplemented sampling type %d\n", samplingtype);
+		exit(1);
+	}
+
+#elif VelocityDimensions == 2
+	auto velocity_par = Cabana::slice<VelocityPar>(particles);
+	auto velocity_per = Cabana::slice<VelocityPer>(particles);
+	if(samplingtype == RandomSampling) {
+		Np = Cabana::initializeRandomParticles(cell, macro, position_x, velocity_par, velocity_per, gaussians, seed);
+	} else if (samplingtype == HammersleyEqualDensity) {
+		Np = Cabana::initializeEqualDensityParticlesWithHammersley(cell, macro, position_x, velocity_par, velocity_per, gaussians);
+	} else if (samplingtype == HammersleyEqualWeight) {
+		Np = Cabana::initializeEqualWeightParticlesWithHammersley(cell, macro, position_x, velocity_par, velocity_per, gaussians);
+	} else {
+		fprintf(stderr, "Unimplemented sampling type %d\n", samplingtype);
+		exit(1);
+	}
+
+#elif VelocityDimensions == 3
+	auto velocity_x = Cabana::slice<VelocityX>(particles);
+	auto velocity_y = Cabana::slice<VelocityY>(particles);
+	auto velocity_z = Cabana::slice<VelocityZ>(particles);
+
+	if(samplingtype == RandomSampling) {
+		Np = Cabana::initializeRandomParticles(cell, macro, position_x, velocity_x, velocity_y, velocity_z, gaussians, seed);
+	} else if (samplingtype == HammersleyEqualDensity) {
+		Np = Cabana::initializeEqualDensityParticlesWithHammersley(cell, macro, position_x, velocity_x, velocity_y, velocity_z, gaussians);
+	} else if (samplingtype == HammersleyEqualWeight) {
+		Np = Cabana::initializeEqualWeightParticlesWithHammersley(cell, macro, position_x, velocity_x, velocity_y, velocity_z, gaussians);
+	} else {
+		fprintf(stderr, "Unimplemented sampling type %d\n", samplingtype);
+		exit(1);
+	}
+
+#endif
+
+	particles.resize(Np);
+
+	GmmFloatType N = 0.;
+	auto _sum = KOKKOS_LAMBDA(const int& i, GmmFloatType& lN) {
+		lN += macro(i);
+	};
+	Kokkos::parallel_reduce("Particle Scan", particles.size(), _sum, N);
+
+	printf("We have %f particles with total weight %ld\n", N, Np);
+	printf("adjusting particle weights by a factor %f\n", Np/N);
+
+	auto _norm = KOKKOS_LAMBDA(const int s, const int i) {
+		macro.access(s,i) *= Np/N;
+	};
+	Cabana::SimdPolicy<particle_list_t::vector_length,ExecutionSpace> vec_policy(0, Np);
+	Cabana::simd_parallel_for(vec_policy, _norm, "norm()");
+
+}
+
+void save_particles(particle_list_t particles, const char* name) {
+	// Write out particles
+	FILE* fp = fopen(name, "w");
+	auto pl_host = Cabana::create_mirror_view_and_copy(Kokkos::HostSpace(), particles);
+	auto      macro_host = Cabana::slice<MacroFactor>(pl_host);
+	auto       cell_host = Cabana::slice<CellIndex>(pl_host);
+	auto position_x_host = Cabana::slice<PositionX>(pl_host);
+#if VelocityDimensions == 1
+	auto velocity_x_host = Cabana::slice<VelocityX>(pl_host);
+	for(size_t i = 0; i < pl_host.size(); i++) {
+		fprintf(fp, "%zd %d %f %f %e\n", i, cell_host(i), macro_host(i), position_x_host(i), velocity_x_host(i));
+	}
+#elif VelocityDimensions == 2
+	auto velocity_par_host = Cabana::slice<VelocityPar>(pl_host);
+	auto velocity_per_host = Cabana::slice<VelocityPer>(pl_host);
+	for(size_t i = 0; i < pl_host.size(); i++) {
+		fprintf(fp, "%zd %d %f %f %e %e\n", i, cell_host(i), macro_host(i), position_x_host(i), velocity_par_host(i), velocity_per_host(i));
+	}
+#elif VelocityDimensions == 3
+	auto velocity_x_host = Cabana::slice<VelocityX>(pl_host);
+	auto velocity_y_host = Cabana::slice<VelocityY>(pl_host);
+	auto velocity_z_host = Cabana::slice<VelocityZ>(pl_host);
+	for(size_t i = 0; i < pl_host.size(); i++) {
+		fprintf(fp, "%zd %d %f %f %e %e %e\n", i, cell_host(i), macro_host(i), position_x_host(i), velocity_x_host(i), velocity_y_host(i), velocity_z_host(i));
+	}
+#endif
+	fclose(fp);
+}
+
+// solve M.A = rho-rho2 for A
+void jacobi(const Kokkos::View<ParticleFloatType**>&M, Kokkos::View<ParticleFloatType*>& A, const Kokkos::View<ParticleFloatType*>& rho, const Kokkos::View<ParticleFloatType*>& rho2) {
+	// Square matrix
+	assert(M.extent(0) == M.extent(1));
+	// And matching sizes
+	assert(M.extent(0) == A.extent(0));
+	assert(M.extent(0) == rho.extent(0));
+	assert(M.extent(0) == rho2.extent(0));
+
+	// Initial guess for dA from diagonal elements
+	// M(i,i) * A(i) = rho(i) - rho2(i)
+	auto _init = KOKKOS_LAMBDA(const int& i) {
+		A(i) = (rho(i) - rho2(i)) / M(i,i);
+	};
+	Kokkos::parallel_for("guess A", A.extent(0), _init);
+
+	double error = 1.;
+	for(size_t k = 0; k < A.extent(0)*A.extent(0); k++) {
+		// Perform one jacobi iteration
+		Kokkos::View<ParticleFloatType*> Anext("Anext", A.extent(0));
+		auto _jacobi = KOKKOS_LAMBDA(const size_t& i) {
+			double dot = 0.;
+			for(size_t j = 0; j < Anext.extent(0); j++) {
+				dot += M(i,j) * A(j);
+			}
+			Anext(i) = (rho(i)-rho2(i) - dot + M(i,i)*A(i)) / M(i,i);
+		};
+
+		Kokkos::parallel_for("update A", A.extent(0), _jacobi);
+
+		// Meassure convergence
+		auto _sum = KOKKOS_LAMBDA(const int&i, double& lsum) {
+			lsum += (Anext(i)-A(i)) * (Anext(i)-A(i));
+		};
+		double sum = 0.;
+		Kokkos::parallel_reduce("squared difference", A.extent(0), _sum, sum);
+		//printf("summed squared differences = %e\n", sum);
+
+		// Overwrite A
+		auto _copy = KOKKOS_LAMBDA(const int& i) {
+			A(i) = Anext(i);
+		};
+		Kokkos::parallel_for("overwrite A", A.extent(0), _copy);
+
+		if(sum < error) {
+			error = sum;
+		} else {
+			break;
+		}
+	}
+}
+
+KOKKOS_INLINE_FUNCTION double W_TSC(const double x) {
+	if(fabs(x) < 0.5) {
+		return 0.75 - fabs(x)*fabs(x);
+	} else if (fabs(x) < 1.5) {
+		return 0.5 * (1.5-fabs(x)) * (1.5-fabs(x));
+	} else {
+		return 0.;
+	}
+}
+
+template<class macro_t, class cell_t, class pos_t, class result_t>
+	struct ChargeAdd {
+
+		using V = ParticleFloatType;
+		using size_type = size_t;
+		using value_type = V[];
+
+		KOKKOS_INLINE_FUNCTION
+			void operator()(const int particleindex, value_type rho) const {
+				for(int cellindex = cell_(particleindex)-2; cellindex <= cell_(particleindex)+2; cellindex++) {
+					// we have a ghost cell on the left which shifts indices of rho relative to cell indices
+					const double xg = cellindex;
+					rho[cellindex+2] += macro_(particleindex) * W_TSC(posx_(particleindex) - xg);
+				}
+			}
+
+		KOKKOS_INLINE_FUNCTION void join(value_type dst, const value_type src) const {
+			for(size_type i = 0; i < value_count; ++i) {
+				dst[i] += src[i];
+			}
+		}
+
+		KOKKOS_INLINE_FUNCTION void init(value_type dst) const {
+			for(size_type i = 0; i < value_count; ++i) {
+				dst[i] = 0.;
+			}
+		}
+
+		using has_final = std::true_type;
+		KOKKOS_INLINE_FUNCTION void final(value_type dst) const {
+			for(size_type i = 0; i < value_count; ++i) { results_(i) = dst[i]; }
+		}
+
+		macro_t macro_;
+		cell_t cell_;
+		pos_t posx_;
+		size_type particle_count;
+		size_type value_count; // cell count, but the name is special to kokkos
+		result_t results_; // total rho, but the name is special to kokkos
+	};
+
+// This version parallelizes over particles like a typical particle-in-cell
+// code. It needs some extra doing because the output is an array. For more
+// than one spatial dimension this leads to problems.
+void chargedensity(const particle_list_t& particles, Kokkos::View<ParticleFloatType*>& rho) {
+
+	const size_t numParticles = particles.size();
+	const size_t numCells = rho.extent(0);
+
+	auto macro = Cabana::slice<MacroFactor>(particles);
+	auto cell  = Cabana::slice<CellIndex>(particles);
+	auto posx  = Cabana::slice<PositionX>(particles);
+
+	using macro_t = typeof(macro);
+	using cell_t = typeof(cell);
+	using pos_t = typeof(posx);
+	using result_t = Kokkos::View<ParticleFloatType*>;
+
+	Kokkos::View<ParticleFloatType*> rhotmp("rhotmp", numCells+4);
+
+	Kokkos::parallel_reduce(numParticles, ChargeAdd<macro_t,cell_t,pos_t,result_t>{macro, cell, posx, numParticles, numCells+4, rhotmp});
+
+	auto rhotmp_host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), rhotmp);
+	FILE* fp = fopen("debug_cd1.dat", "w");
+	double sum = 0.;
+	for(size_t i = 0; i < rhotmp_host.extent(0); i++) {
+		fprintf(fp, "%ld %f\n", i, rhotmp_host(i));
+		sum += rhotmp_host(i);
+	}
+	fprintf(fp, "#sum = %f\n", sum);
+	fclose(fp);
+
+	// Periodic boundaries on rho
+	auto _copy = KOKKOS_LAMBDA(const size_t& i) {
+		// copy interior
+		rho(i) = rhotmp(i+2);
+		// fold ghost cells
+		if(i == 0) {
+			rho(i) += rhotmp(rho.extent(0)+2);
+		}
+		if(i == 1) {
+			rho(i) += rhotmp(rho.extent(0)+3);
+		}
+		if(i == rho.extent(0)-2) {
+			rho(i) += rhotmp(0);
+		}
+		if(i == rho.extent(0)-1) {
+			rho(i) += rhotmp(1);
+		}
+	};
+	Kokkos::parallel_for("boundary of rho", rho.extent(0), _copy);
+
+	auto rho_host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), rho);
+	fp = fopen("debug_cd1b.dat", "w");
+	sum = 0.;
+	for(size_t i = 0; i < rho_host.extent(0); i++) {
+		fprintf(fp, "%ld %f\n", i, rho_host(i));
+		sum += rho_host(i);
+	}
+	fprintf(fp, "#sum = %f\n", sum);
+	fclose(fp);
+}
+
+
+// This version parallelizes over cells in rho. It avoids write conflicts but
+// processes each particle multiple times which I suspected would be slower.
+// But it turns out to run faster in many cases.
+void chargedensity2(const particle_list_t& particles, Kokkos::View<ParticleFloatType*>& rho) {
+
+	const size_t numParticles = particles.size();
+	const size_t numCells = rho.extent(0);
+
+	auto macro = Cabana::slice<MacroFactor>(particles);
+	auto cell  = Cabana::slice<CellIndex>(particles);
+	auto posx  = Cabana::slice<PositionX>(particles);
+
+	Kokkos::View<ParticleFloatType*> rhotmp("rhotmp", numCells+4);
+
+	// how to compute a single cell
+	auto _deposit = KOKKOS_LAMBDA(const int& cellindex) {
+		rhotmp(cellindex) = 0.;
+		for(size_t n = 0; n < numParticles; n++) {
+			const double xg = cellindex - 2;
+			rhotmp[cellindex] += macro(n) * W_TSC(posx(n) - xg);
+		}
+	};
+
+	Kokkos::parallel_for("deposit rho", rhotmp.extent(0), _deposit);
+
+	auto rhotmp_host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), rhotmp);
+	FILE* fp = fopen("debug_cd2.dat", "w");
+	double sum = 0.;
+	for(size_t i = 0; i < rhotmp_host.extent(0); i++) {
+		fprintf(fp, "%ld %f\n", i, rhotmp_host(i));
+		sum += rhotmp_host(i);
+	}
+	fprintf(fp, "#sum = %f\n", sum);
+	fclose(fp);
+
+	// Periodic boundaries on rho
+	auto _copy = KOKKOS_LAMBDA(const size_t& i) {
+		// copy interior
+		rho(i) = rhotmp(i+2);
+		// fold ghost cells
+		if(i == 0) {
+			rho(i) += rhotmp(numCells+2);
+		}
+		if(i == 1) {
+			rho(i) += rhotmp(numCells+3);
+		}
+		if(i == numCells-2) {
+			rho(i) += rhotmp(0);
+		}
+		if(i == numCells-1) {
+			rho(i) += rhotmp(1);
+		}
+	};
+	Kokkos::parallel_for("boundary of rho", numCells, _copy);
+
+	auto rho_host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), rho);
+	fp = fopen("debug_cd2b.dat", "w");
+	sum = 0.;
+	fprintf(fp, "\n");
+	for(size_t i = 0; i < rho_host.extent(0); i++) {
+		fprintf(fp, "%ld %f\n", i, rho_host(i));
+		sum += rho_host(i);
+	}
+	fprintf(fp, "#sum = %f\n", sum);
+	fclose(fp);
+}
+
+
+// This version parallelizes over cells in rho. It avoids write conflicts but
+// processes each particle multiple times which I suspected would be slower.
+// But it turns out to run faster in many cases.
+template<typename FunctorType>
+void chargedensity3(FunctorType& deposit, const particle_list_t& particles, Kokkos::View<ParticleFloatType*>& rho, Kokkos::View<ParticleFloatType*>& rhotmp) {
+
+	const size_t numParticles = particles.size();
+	const size_t numCells = rho.extent(0);
+
+	Kokkos::parallel_for("deposit rho", rhotmp.extent(0), deposit);
+
+	auto rhotmp_host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), rhotmp);
+	FILE* fp = fopen("debug_cd3.dat", "w");
+	double sum = 0.;
+	for(size_t i = 0; i < rhotmp_host.extent(0); i++) {
+		fprintf(fp, "%ld %f\n", i, rhotmp_host(i));
+		sum += rhotmp_host(i);
+	}
+	fprintf(fp, "#sum = %f\n", sum);
+	fclose(fp);
+
+	// Periodic boundaries on rho
+	auto _copy = KOKKOS_LAMBDA(const size_t& i) {
+		// copy interior
+		rho(i) = rhotmp(i+2);
+		// fold ghost cells
+		if(i == 0) {
+			rho(i) += rhotmp(numCells+2);
+		}
+		if(i == 1) {
+			rho(i) += rhotmp(numCells+3);
+		}
+		if(i == numCells-2) {
+			rho(i) += rhotmp(0);
+		}
+		if(i == numCells-1) {
+			rho(i) += rhotmp(1);
+		}
+	};
+	Kokkos::parallel_for("boundary of rho", numCells, _copy);
+
+	auto rho_host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), rho);
+	fp = fopen("debug_cd2b.dat", "w");
+	sum = 0.;
+	fprintf(fp, "\n");
+	for(size_t i = 0; i < rho_host.extent(0); i++) {
+		fprintf(fp, "%ld %f\n", i, rho_host(i));
+		sum += rho_host(i);
+	}
+	fprintf(fp, "#sum = %f\n", sum);
+	fclose(fp);
+}
+
+void massmatrix(const particle_list_t& particles, Kokkos::View<ParticleFloatType**>& M) {
+
+	const size_t numParticles = particles.size();
+	const size_t numCells = M.extent(0);
+
+	auto macro = Cabana::slice<MacroFactor>(particles);
+	auto cell  = Cabana::slice<CellIndex>(particles);
+	auto posx  = Cabana::slice<PositionX>(particles);
+
+	// ghost cells in index i
+	Kokkos::View<ParticleFloatType**> Mtmp("Mtmp", numCells+4, M.extent(1));
+
+	// how to compute a single cell
+	auto _deposit = KOKKOS_LAMBDA(const int& cellindex) {
+		const double xg = cellindex - 2;
+		for(size_t n = 0; n < numParticles; n++) {
+			size_t j = floor(posx(n)+0.5);
+			if(j >= Mtmp.extent(1)) { j = 0; }
+			Mtmp(cellindex,j) += macro(n) * W_TSC(posx(n) - xg);
+		}
+	};
+
+	Kokkos::parallel_for("deposit M", Mtmp.extent(0), _deposit);
+
+	auto _copy = KOKKOS_LAMBDA(const int& j) {
+		// copy interior
+		for(size_t i = 0; i < numCells; i++) {
+			M(i,j) = Mtmp(i+2,j);
+		}
+		// fold ghost cells
+		M(0,j)             += Mtmp(numCells+2,j);
+		M(1,j)             += Mtmp(numCells+3,j);
+		M(M.extent(0)-2,j) += Mtmp(0,j);
+		M(M.extent(0)-1,j) += Mtmp(1,j);
+	};
+	Kokkos::parallel_for("boundary of M", M.extent(1), _copy);
+}
+
+
+void adjust_for_charge_conservation(particle_list_t& particles, const Kokkos::View<ParticleFloatType*>& dA) {
+
+	auto cell       = Cabana::slice<CellIndex>(particles);
+	auto macro      = Cabana::slice<MacroFactor>(particles);
+	auto posx       = Cabana::slice<PositionX>(particles);
+
+	auto _update = KOKKOS_LAMBDA(const int s, const int i) {
+		int n = (s)*particle_list_t::vector_length + i;
+		size_t j = floor(posx(n)+0.5);
+		if(j >= dA.extent(0)) { j = 0; }
+		ParticleFloatType dalpha_p = dA(j);
+		if(macro(n) > -dalpha_p) {
+			macro(n) += dalpha_p;
+		} else {
+			printf("Can't adjust a particle with alpha_p = %f using delta alpha_p = %f in cell %d\n", macro(n), dalpha_p, cell(n));
+		}
+	};
+
+	// Define an execution policy
+	Cabana::SimdPolicy<particle_list_t::vector_length,ExecutionSpace> vec_policy(0, particles.size());
+
+	// Execute for all particles in parallel
+	Cabana::simd_parallel_for(vec_policy, _update, "update()");
+}
+
+
+void chargeconservation(const particle_list_t& particles, particle_list_t new_particles, const int num_cells) {
+	Kokkos::View<ParticleFloatType*> rho("rho", num_cells); // ghost cells only temporary internally to the function
+	//chargedensity(particles, rho);
+	chargedensity2(particles, rho);
+	auto rho_host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), rho);
+
+	FILE* fp = fopen("rho.dat", "w");
+	for(size_t i = 0; i < rho_host.size(); i++) {
+		fprintf(fp, "%zd %f\n", i, rho_host(i));
+	}
+	fclose(fp);
+
+	Kokkos::View<ParticleFloatType*> rhoprime("rhoprime", num_cells);
+	//chargedensity(new_particles, rhoprime);
+	chargedensity2(new_particles, rhoprime);
+	auto rhoprime_host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), rhoprime);
+
+	fp = fopen("rhoprime.dat", "w");
+	for(size_t i = 0; i < rhoprime_host.size(); i++) {
+		fprintf(fp, "%zd %f\n", i, rhoprime_host(i));
+	}
+	fclose(fp);
+
+	Kokkos::View<ParticleFloatType**> M("M", num_cells, num_cells); // ghost cells only temporary internally to the function
+	massmatrix(new_particles, M);
+	auto M_host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), M);
+
+	fp = fopen("M.dat", "w");
+	for(size_t i = 0; i < M_host.extent(0); i++) {
+		double sum = 0.;
+		for(size_t j = 0; j < M_host.extent(1); j++) {
+			fprintf(fp, "%zd %zd %f\n", i, j, M_host(i,j));
+			sum += M_host(i,j);
+		}
+		fprintf(fp, "#sum = %f\n", sum);
+	}
+	fclose(fp);
+
+	Kokkos::View<ParticleFloatType*> dA("dA", num_cells);
+	jacobi(M, dA, rho, rhoprime);
+	auto dA_host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), dA);
+
+	fp = fopen("dA.dat", "w");
+	for(size_t i = 0; i < dA_host.size(); i++) {
+		fprintf(fp, "%zd %f\n", i, dA_host(i));
+	}
+	fclose(fp);
+
+	adjust_for_charge_conservation(new_particles, dA);
+
+	Kokkos::View<ParticleFloatType*> rhoadj("rhoadj", num_cells);
+	//chargedensity(new_particles, rhoadj);
+	chargedensity2(new_particles, rhoadj);
+	auto rhoadj_host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), rhoadj);
+
+	fp = fopen("rhoadj.dat", "w");
+	for(size_t i = 0; i < rhoadj_host.size(); i++) {
+		fprintf(fp, "%zd %f\n", i, rhoadj_host(i));
+	}
+	fclose(fp);
+}
+
+void get_stats(const particle_list_t& particles, Kokkos::View<double*>& Np, Kokkos::View<double**>& P, Kokkos::View<double**>& E) {
+	auto macro = Cabana::slice<MacroFactor>(particles);
+	auto cell  = Cabana::slice<CellIndex>(particles);
+#if VelocityDimensions == 1
+	auto velX  = Cabana::slice<VelocityX>(particles);
+#elif VelocityDimensions == 2
+	auto velPar  = Cabana::slice<VelocityPar>(particles);
+	auto velPer  = Cabana::slice<VelocityPer>(particles);
+#elif VelocityDimensions == 3
+	auto velX  = Cabana::slice<VelocityX>(particles);
+	auto velY  = Cabana::slice<VelocityY>(particles);
+	auto velZ  = Cabana::slice<VelocityZ>(particles);
+#endif
+
+	const int numParticles = particles.size();
+	const size_t numCells = Np.extent(0);
+
+	auto _moments = KOKKOS_LAMBDA(const int& cellindex) {
+		Np(cellindex) = 0.;
+		for(int d = 0; d < VelocityDimensions; d++) {
+			P(cellindex,d) = 0.;
+			E(cellindex,d) = 0.;
+		}
+		for(int n = 0; n < numParticles; n++) {
+			if(cell(n) == cellindex) {
+				Np(cellindex) += macro(n);
+#if VelocityDimensions == 1
+				P(cellindex,0) += macro(n) * velX(n);
+				E(cellindex,0) += 0.5*macro(n) * velX(n)*velX(n);
+#elif VelocityDimensions == 2
+				P(cellindex,0) += macro(n) * velPar(n);
+				P(cellindex,1) += macro(n) * velPer(n);
+				E(cellindex,0) += 0.5*macro(n) * velPar(n)*velPar(n);
+				E(cellindex,1) += 0.5*macro(n) * velPer(n)*velPer(n);
+#elif VelocityDimensions == 3
+				P(cellindex,0) += macro(n) * velX(n);
+				P(cellindex,1) += macro(n) * velY(n);
+				P(cellindex,2) += macro(n) * velZ(n);
+				E(cellindex,0) += 0.5*macro(n) * velX(n)*velX(n);
+				E(cellindex,1) += 0.5*macro(n) * velY(n)*velY(n);
+				E(cellindex,2) += 0.5*macro(n) * velZ(n)*velZ(n);
+#endif
+			}
+		}
+	};
+
+	Kokkos::parallel_for("compute stats", numCells, _moments);
+}
+
+void get_stats(const gaussian_list_t gaussians, const double nppc, Kokkos::View<double**>& P, Kokkos::View<double**>& E) {
+	int cmax = gaussians.extent(0);
+	int kmax = gaussians.extent(1);
+	auto g_dev = Kokkos::create_mirror_view(MemorySpace(), gaussians);
+	Kokkos::deep_copy(g_dev, gaussians);
+
+	auto _moments = KOKKOS_LAMBDA(const int& cellindex) {
+		for(int d = 0; d < VelocityDimensions; d++) {
+			P(cellindex,d) = 0.;
+			E(cellindex,d) = 0.;
+		}
+		for(int k = 0; k < kmax; k++) {
+			if(g_dev(cellindex,k,Weight)>0.) {
+#if VelocityDimensions == 1
+				P(cellindex,0) += nppc*g_dev(cellindex,k,Weight) * g_dev(cellindex,k,MuX);
+				E(cellindex,0) += 0.5*nppc * g_dev(cellindex,k,Weight) * (g_dev(cellindex,k,Cxx) + g_dev(cellindex,k,MuX)*g_dev(cellindex,k,MuX));
+#elif VelocityDimensions == 2
+				P(cellindex,0) += nppc * g_dev(cellindex,k,Weight) * g_dev(cellindex,k,MuPar);
+				P(cellindex,1) += nppc * g_dev(cellindex,k,Weight) * Kokkos::sqrt(0.5*M_PI)*Kokkos::sqrt(g_dev(cellindex,k,Cperper));
+				E(cellindex,0) += 0.5*nppc * g_dev(cellindex,k,Weight) * (g_dev(cellindex,k,Cparpar) + g_dev(cellindex,k,MuPar)*g_dev(cellindex,k,MuPar));
+				E(cellindex,1) += 0.5*nppc * g_dev(cellindex,k,Weight) * (2.*g_dev(cellindex,k,Cperper)                                                 );
+#elif VelocityDimensions == 3
+				P(cellindex,0) += nppc*g_dev(cellindex,k,Weight)*g_dev(cellindex,k,MuX);
+				P(cellindex,1) += nppc*g_dev(cellindex,k,Weight)*g_dev(cellindex,k,MuY);
+				P(cellindex,2) += nppc*g_dev(cellindex,k,Weight)*g_dev(cellindex,k,MuZ);
+				E(cellindex,0) += 0.5*nppc * g_dev(cellindex,k,Weight) * (g_dev(cellindex,k,Cxx) + g_dev(cellindex,k,MuX)*g_dev(cellindex,k,MuX));
+				E(cellindex,1) += 0.5*nppc * g_dev(cellindex,k,Weight) * (g_dev(cellindex,k,Cyy) + g_dev(cellindex,k,MuY)*g_dev(cellindex,k,MuY));
+				E(cellindex,2) += 0.5*nppc * g_dev(cellindex,k,Weight) * (g_dev(cellindex,k,Czz) + g_dev(cellindex,k,MuZ)*g_dev(cellindex,k,MuZ));
+#endif
+			}
+		}
+	};
+
+	Kokkos::parallel_for("compute stats", cmax, _moments);
+}
+
+
+void get_corrections(const Kokkos::View<double*>& Npprime, const Kokkos::View<double**>& Pprime, const Kokkos::View<double**>& Eprime,
+                                                           const Kokkos::View<double**>& P,      const Kokkos::View<double**>& E,
+                     Kokkos::View<double**>& a, Kokkos::View<double**>& b) {
+
+	auto _correct = KOKKOS_LAMBDA(const size_t c) {
+#if VelocityDimensions == 1
+		const double num = 2.*E(c,0)*Npprime(c) - P(c,0)*P(c,0);
+		if(num >= 0.) {
+			const double denom = 2.*Eprime(c,0)*Npprime(c) - Pprime(c,0)*Pprime(c,0);
+			a(c,0) = Kokkos::sqrt(num/denom);
+			b(c,0) = (P(c,0)-a(c,0)*Pprime(c,0)) / (a(c,0)*Npprime(c));
+		} else{
+			a(c,0) = 1.;
+			b(c,0) = 0.;
+		}
+#elif VelocityDimensions == 2
+		const double num = 2.*(E(c,0)+E(c,1))*Npprime(c) - 2.*P(c,1)*P(c,1)/Pprime(c,1)/Pprime(c,1)*Eprime(c,1)*Npprime(c) - P(c,0)*P(c,0);
+		const double denom = 2.*Eprime(c,0)*Npprime(c) - Pprime(c,0)*Pprime(c,0);
+		if(num/denom >= 0.) {
+			a(c,0) = Kokkos::sqrt(num/denom);
+			a(c,1) = P(c,1) / Pprime(c,1);
+			b(c,0) = (P(c,0) - a(c,0)*Pprime(c,0)) / (a(c,0)*Npprime(c));
+		} else {
+			a(c,0) = 1.;
+			a(c,1) = 1.;
+			b(c,0) = 0.;
+		}
+#elif VelocityDimensions == 3
+		const double num = 2. * (E(c,0)+E(c,1)+E(c,2)) * Npprime(c) - (P(c,0)*P(c,0) + P(c,1)*P(c,1) + P(c,2)*P(c,2));
+		if(num >= 0.) {
+			const double denom = 2. * (Eprime(c,0)+Eprime(c,1)+Eprime(c,2)) * Npprime(c) - (Pprime(c,0)*Pprime(c,0) + Pprime(c,1)*Pprime(c,1) + Pprime(c,2)*Pprime(c,2));
+			a(c,0) = Kokkos::sqrt(num/denom);
+			for(int d = 0; d < VelocityDimensions; d++) {
+				b(c,d) = (P(c,d)-a(c,0)*Pprime(c,d)) / (a(c,0)*Npprime(c));
+			}
+		} else{
+			a(c,0) = 1.;
+			for(int d = 0; d < VelocityDimensions; d++) {
+				b(c,d) = 0.;
+			}
+		}
+#endif
+	};
+
+	Kokkos::parallel_for("Compute Lemon corrections", Npprime.extent(0), _correct);
+}
+
+void scale_and_shift_particles(particle_list_t& particles, const Kokkos::View<double**>& a, const Kokkos::View<double**>& b) {
+	auto cell  = Cabana::slice<CellIndex>(particles);
+#if VelocityDimensions == 1
+	auto velX  = Cabana::slice<VelocityX>(particles);
+#elif VelocityDimensions == 2
+	auto velPar  = Cabana::slice<VelocityPar>(particles);
+	auto velPer  = Cabana::slice<VelocityPer>(particles);
+#elif VelocityDimensions == 3
+	auto velX  = Cabana::slice<VelocityX>(particles);
+	auto velY  = Cabana::slice<VelocityY>(particles);
+	auto velZ  = Cabana::slice<VelocityZ>(particles);
+#endif
+
+	// Define how to adjust ONE particle
+	auto _adjust = KOKKOS_LAMBDA(const int s, const int i) {
+		auto c = cell.access(s,i);
+#if VelocityDimensions == 1
+		velX.access(s,i) = a(c,0) * (velX.access(s,i) + b(c,0));
+#elif VelocityDimensions == 2
+		// No shift in vper, different scaling in the two dimensions
+		velPar.access(s,i) = a(c,0) * (velPar.access(s,i) + b(c,0));
+		velPer.access(s,i) = a(c,1) * (velPer.access(s,i)         );
+#elif VelocityDimensions == 3
+		// Same scaling for all three dimensions, different shifts in the three dimensions
+		velX.access(s,i) = a(c,0) * (velX.access(s,i) + b(c,0));
+		velY.access(s,i) = a(c,0) * (velY.access(s,i) + b(c,1));
+		velZ.access(s,i) = a(c,0) * (velZ.access(s,i) + b(c,2));
+#endif
+
+	};
+
+	// Define an execution policy
+	Cabana::SimdPolicy<particle_list_t::vector_length,ExecutionSpace> vec_policy(0, particles.size());
+
+	// Execute
+	Cabana::simd_parallel_for(vec_policy, _adjust, "adjust()");
+
+}
+
+//---------------------------------------------------------------------------//
+// Main
+//---------------------------------------------------------------------------//
+int main(int argc, char* argv[]) {
+	// Initialize the kokkos runtime.
+	Kokkos::ScopeGuard scope_guard( argc, argv );
+
+	const int seed = 12345;
+	const int num_cells = 10;
+	const int num_particles = 2429*num_cells;
+
+	{
+		// Do work here
+
+		// Build outselves a (set of) Gaussian(s)
+		gaussian_list_t true_f("true f", num_cells,2); // Ten cells, up to two Gaussians each
+
+#if VelocityDimensions == 1
+		// first cell
+		true_f(0,0,Weight) = 0.6;
+		true_f(0,0,MuX) =  2.0;
+		true_f(0,0,Cxx) =  1.5;
+
+		true_f(0,1,Weight) = 0.4;
+		true_f(0,1,MuX) = -1.0;
+		true_f(0,1,Cxx) =  1.0;
+
+		for(size_t c=1; c<num_cells; c++) {
+			true_f(c,0,Weight) = 1.0;
+			true_f(c,0,MuX)    = 0.0;
+			true_f(c,0,Cxx)    = 1.5;
+
+			true_f(c,1,Weight) = 0.0;
+		}
+		// Do we even have to set the other parameters for that Gaussian?
+#elif VelocityDimensions == 2
+
+		true_f(0,0,Weight) = 0.6;
+		true_f(0,0,MuPar)  =  2.0; true_f(0,0,MuPer) =  0.;
+		true_f(0,0,Cparpar) = 0.2; true_f(0,0,Cparper) = 0.0;
+		true_f(0,0,Cperpar) = 0.0; true_f(0,0,Cperper) = 2.0;
+
+		true_f(0,1,Weight) = 0.4;
+		true_f(0,1,MuPar)  =  0.0; true_f(0,1,MuPer) =  1.5;
+		true_f(0,1,Cparpar) = 0.05; true_f(0,1,Cparper) = 0.0;
+		true_f(0,1,Cperpar) = 0.0; true_f(0,1,Cperper) = 0.1;
+
+		for(size_t c=1; c<num_cells; c++) {
+			true_f(c,0,Weight) = 1.0;
+			true_f(c,0,MuPar)  =  0.0; true_f(c,0,MuPer) =  0.;
+			true_f(c,0,Cparpar) = 0.5; true_f(c,0,Cparper) = 0.0;
+			true_f(c,0,Cperpar) = 0.0; true_f(c,0,Cperper) = 0.5;
+
+			true_f(c,1,Weight) = 0.0;
+		}
+#elif VelocityDimensions == 3
+		// first cell
+		true_f(0,0,Weight) = 0.6;
+		true_f(0,0,MuX) =  2.; true_f(0,0,MuY) =  2.; true_f(0,0,MuZ) =  2.;
+		true_f(0,0,Cxx) = 1.5; true_f(0,0,Cxy) = 0.0; true_f(0,0,Cxz) = 0.5;
+		true_f(0,0,Cyx) = 0.0; true_f(0,0,Cyy) = 2.0; true_f(0,0,Cyz) = 0.0;
+		true_f(0,0,Czx) = 0.5; true_f(0,0,Czy) = 0.0; true_f(0,0,Czz) = 2.5;
+
+		true_f(0,1,Weight) = 0.4;
+		true_f(0,1,MuX) = -2.0; true_f(0,1,MuY) = -2.0; true_f(0,1,MuZ) = -2.;
+		true_f(0,1,Cxx) = 1.00; true_f(0,1,Cxy) = 0.25; true_f(0,1,Cxz) = 0.0;
+		true_f(0,1,Cyx) = 0.25; true_f(0,1,Cyy) = 2.00; true_f(0,1,Cyz) = 1.5;
+		true_f(0,1,Czx) = 0.00; true_f(0,1,Czy) = 1.50; true_f(0,1,Czz) = 3.0;
+
+		// second cell
+		for(size_t c=1; c<num_cells; c++) {
+			true_f(c,0,Weight) = 1.0;
+			true_f(c,0,MuX) =  1.; true_f(c,0,MuY) =  2.; true_f(c,0,MuZ) =  3.;
+			true_f(c,0,Cxx) = 1.5; true_f(c,0,Cxy) = 0.0; true_f(c,0,Cxz) = 0.0;
+			true_f(c,0,Cyx) = 0.0; true_f(c,0,Cyy) = 2.0; true_f(c,0,Cyz) = 0.0;
+			true_f(c,0,Czx) = 0.0; true_f(c,0,Czy) = 0.0; true_f(c,0,Czz) = 2.5;
+
+			true_f(c,1,Weight) = 0.0;
+		}
+		// Do we even have to set the other parameters for that Gaussian?
+#endif
+
+		// Make sure we have positive definite covariance matrices
+		check_gaussians(true_f);
+
+		FILE* fp = fopen("true_f.dat", "w");
+		for(size_t c = 0; c<num_cells; c++) {
+			for(size_t m = 0; m < true_f.extent(1); m++) {
+				if(true_f(c,m,Weight) > 0.) {
+#if VelocityDimensions == 1
+					fprintf(fp, "%zd %zd %e %e %e\n", c, m, true_f(c,m,Weight), true_f(c,m,MuX), true_f(c,m,Cxx));
+#elif VelocityDimensions == 2
+					fprintf(fp, "%zd %zd %e %e %e %e %e %e %e\n", c, m, true_f(c,m,Weight), true_f(c,m,MuPar), true_f(c,m,MuPer), true_f(c,m,Cparpar), true_f(c,m,Cparper), true_f(c,m,Cperpar), true_f(c,m,Cperper));
+#elif VelocityDimensions == 3
+					fprintf(fp, "%zd %zd %e %e %e %e %e %e %e %e %e %e %e %e %e\n", c, m, true_f(c,m,Weight), true_f(c,m,MuX), true_f(c,m,MuY), true_f(c,m,MuZ), true_f(c,m,Cxx), true_f(c,m,Cxy), true_f(c,m,Cxz), true_f(c,m,Cyx), true_f(c,m,Cyy), true_f(c,m,Cyz), true_f(c,m,Czx), true_f(c,m,Czy), true_f(c,m,Czz));
+#endif
+				}
+			}
+		}
+		fclose(fp);
+
+		// Create particles
+		particle_list_t particles("particles", num_particles);
+		printf("Initalize particles\n");
+		initialize_particles(particles, true_f, 12345, RandomSampling);
+
+		// Write out particles
+		save_particles(particles, "part.dat");
+
+		// Compute GMM for the particles
+		size_t kmax = 10;
+		const GmmFloatType eps = 1e-16; // eps says what relative decrease in the loss function do we consider CEM converged
+		gaussian_list_t gmm("reconstructed f", num_cells, kmax);
+		printf("Reconstruct GMM\n");
+		//reconstruct_gmm(particles, gmm, eps);
+
+		auto cell       = Cabana::slice<CellIndex>(particles);
+		auto macro      = Cabana::slice<MacroFactor>(particles);
+#if VelocityDimensions==1
+		auto velocity_x = Cabana::slice<VelocityX>(particles);
+		Cabana::reconstructGMM(gmm, eps, seed, cell, macro, velocity_x);
+#elif VelocityDimensions==2
+		auto velocity_par = Cabana::slice<VelocityPar>(particles);
+		auto velocity_per = Cabana::slice<VelocityPer>(particles);
+		Cabana::reconstructGMM(gmm, eps, seed, cell, macro, velocity_par, velocity_per);
+#elif VelocityDimensions==3
+		auto velocity_x = Cabana::slice<VelocityX>(particles);
+		auto velocity_y = Cabana::slice<VelocityY>(particles);
+		auto velocity_z = Cabana::slice<VelocityZ>(particles);
+		Cabana::reconstructGMM(gmm, eps, seed, cell, macro, velocity_x, velocity_y, velocity_z);
+#endif
+
+
+
+		printf("Checking results\n");
+
+		// Take a look at the gaussians
+		printf("Reconstructed distribution function:\n");
+		for(size_t c = 0; c < gmm.extent(0); c++) {
+			for(size_t m = 0; m < kmax; m++) {
+				if(gmm(c,m,Weight) > 0.) {
+#if VelocityDimensions == 1
+					printf("Gaussian(%zd) in cell %zd weight=%f, Mu=%f, Cov=%f\n", m, c, gmm(c,m,Weight), gmm(c,m,MuX), gmm(c,m,Cxx));
+#elif VelocityDimensions == 2
+					printf("Gaussian(%zd) in cell %zd weight=%f, Mu=(%f,%f), Cov=((%f,%f),(%f,%f))\n", m, c, gmm(c,m,Weight), gmm(c,m,MuPar),gmm(c,m,MuPer),
+						gmm(c,m,Cparpar), gmm(c,m,Cparper),
+						gmm(c,m,Cperpar), gmm(c,m,Cperper));
+#elif VelocityDimensions == 3
+					printf("Gaussian(%zd) in cell %zd weight=%f, Mu=(%f,%f,%f), Cov=((%f,%f,%f),(%f,%f,%f),(%f,%f,%f))\n", m, c, gmm(c,m,Weight), gmm(c,m,MuX),gmm(c,m,MuY),gmm(c,m,MuZ),
+						gmm(c,m,Cxx), gmm(c,m,Cxy), gmm(c,m,Cxz),
+						gmm(c,m,Cyx), gmm(c,m,Cyy), gmm(c,m,Cyz),
+						gmm(c,m,Czx), gmm(c,m,Czy), gmm(c,m,Czz));
+#endif
+				}
+			}
+		}
+		fp = fopen("reconstructed_f.dat", "w");
+		for(size_t c = 0; c < gmm.extent(0); c++) {
+			for(size_t m = 0; m < gmm.extent(1); m++) {
+				if(gmm(c,m,Weight) > 0.) {
+#if VelocityDimensions == 1
+					fprintf(fp, "%zd %zd %e %e %e\n", c,m, gmm(c,m,Weight), gmm(c,m,MuX), gmm(c,m,Cxx));
+#elif VelocityDimensions == 2
+					fprintf(fp, "%zd %zd %e %e %e %e %e %e %e\n", c,m, gmm(c,m,Weight), gmm(c,m,MuPar),gmm(c,m,MuPer), gmm(c,m,Cparpar),gmm(c,m,Cparper), gmm(c,m,Cperpar),gmm(c,m,Cperper));
+#elif VelocityDimensions == 3
+					fprintf(fp, "%zd %zd %e %e %e %e %e %e %e %e %e %e %e %e %e\n", c,m, gmm(c,m,Weight), gmm(c,m,MuX), gmm(c,m,MuY), gmm(c,m,MuZ), gmm(c,m,Cxx), gmm(c,m,Cxy), gmm(c,m,Cxz), gmm(c,m,Cyx), gmm(c,m,Cyy), gmm(c,m,Cyz), gmm(c,m,Czx), gmm(c,m,Czy), gmm(c,m,Czz));
+#endif
+				}
+			}
+		}
+		fclose(fp);
+
+
+		// Print ground truth for reference
+		printf("True distribution function:\n");
+		for(size_t c = 0; c<num_cells; c++) {
+			for(size_t m = 0; m < true_f.extent(1); m++) {
+				if(true_f(c,m,Weight) > 0.) {
+#if VelocityDimensions == 1
+					printf("Gaussian(%zd) in cell %zd weight=%f, Mu=%f, Cov=%f\n", m,c, true_f(c,m,Weight), true_f(c,m,MuX), true_f(c,m,Cxx));
+#elif VelocityDimensions == 2
+					printf("Gaussian(%zd) in cell %zd weight=%f, Mu=(%f,%f), Cov=((%f,%f)(%f,%f))\n", m,c, true_f(c,m,Weight), true_f(c,m,MuPar),true_f(c,m,MuPer),
+						true_f(c,m,Cparpar),true_f(c,m,Cparper),
+						true_f(c,m,Cperpar),true_f(c,m,Cperper));
+#elif VelocityDimensions == 3
+					printf("Gaussian(%zd) in cell %zd weight=%f, Mu=(%f,%f,%f), Cov=((%f,%f,%f),(%f,%f,%f),(%f,%f,%f))\n", m,c, true_f(c,m,Weight), true_f(c,m,MuX),true_f(c,m,MuY),true_f(c,m,MuZ),
+						true_f(c,m,Cxx), true_f(c,m,Cxy), true_f(c,m,Cxz),
+						true_f(c,m,Cyx), true_f(c,m,Cyy), true_f(c,m,Cyz),
+						true_f(c,m,Czx), true_f(c,m,Czy), true_f(c,m,Czz));
+#endif
+				}
+			}
+		}
+
+		// Create new particles from recovered gaussian distribution functions
+		particle_list_t new_particles("new particles", num_particles);
+		initialize_particles(new_particles, gmm, 23456, HammersleyEqualWeight);
+
+		// Ensure that the new particles represent the same charge density distribution on cell centers
+		chargeconservation(particles, new_particles, num_cells); // this breaks if you use different particle numbers
+
+		// Get statistics from particles
+		Kokkos::View<double**> P("P", num_cells, VelocityDimensions);
+		Kokkos::View<double**> E("E", num_cells, VelocityDimensions);
+		const double nppc = num_particles/(double)num_cells;
+		get_stats(gmm, nppc, P, E);
+
+		Kokkos::View<double*> Npprime("Np'", num_cells);
+		Kokkos::View<double**> Pprime("P'", num_cells, VelocityDimensions);
+		Kokkos::View<double**> Eprime("E'", num_cells, VelocityDimensions);
+		get_stats(new_particles, Npprime, Pprime, Eprime);
+
+		Kokkos::View<double**> a("a", num_cells, VelocityDimensions);
+		Kokkos::View<double**> b("b", num_cells, VelocityDimensions);
+		get_corrections(Npprime, Pprime, Eprime, P, E, a, b);
+
+		auto P_host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), P);
+		auto E_host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), E);
+		auto Npprime_host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), Npprime);
+		auto Pprime_host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), Pprime);
+		auto Eprime_host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), Eprime);
+		auto a_host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), a);
+		auto b_host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), b);
+
+		for(size_t c=0; c < num_cells; c++) {
+#if VelocityDimensions == 1
+			printf("N' = %f, P = %f, P' = %f, E = %f, E' = %f\n", Npprime_host(c), P_host(c,0), Pprime_host(c,0), E_host(c,0), Eprime_host(c,0));
+			printf("a = %f, b = %f\n", a_host(c,0), b_host(c,0));
+#elif VelocityDimensions == 2
+			printf("N' = %f, P = %f,%f, P' = %f,%f, E = %f, E' = %f\n", Npprime_host(c), P_host(c,0),P_host(c,1), Pprime_host(c,0),Pprime_host(c,1), E_host(c,0)+E_host(c,1), Eprime_host(c,0)+Eprime_host(c,1));
+			printf("a = %f,%f, b = %f\n", a_host(c,0),a_host(c,1), b_host(c,0));
+#elif VelocityDimensions == 3
+			printf("N' = %f, P = %f,%f,%f, P' = %f,%f,%f, E = %f, E' = %f\n", Npprime_host(c), P_host(c,0),P_host(c,1),P_host(c,2), Pprime_host(c,0),Pprime_host(c,1),Pprime_host(c,2), E_host(c,0)+E_host(c,1)+E_host(c,2), Eprime_host(c,0)+Eprime_host(c,1)+Eprime_host(c,2));
+			printf("a = %f,%f,%f b = %f,%f,%f\n", a_host(c,0),a_host(c,1),a_host(c,2), b_host(c,0),b_host(c,1),b_host(c,2));
+#endif
+		}
+
+		// Ensure conservation of momentum and energy
+		scale_and_shift_particles(new_particles, a, b);
+
+		// Check statistics again
+		get_stats(new_particles, Npprime, Pprime, Eprime);
+
+		Kokkos::deep_copy(Npprime_host, Npprime);
+		Kokkos::deep_copy(Pprime_host, Pprime);
+		Kokkos::deep_copy(Eprime_host, Eprime);
+
+		for(size_t c=0; c < num_cells; c++) {
+#if VelocityDimensions == 1
+			printf("N' = %f, P = %f, P' = %f, E = %f, E' = %f\n", Npprime_host(c), P_host(c,0), Pprime_host(c,0), E_host(c,0), Eprime_host(c,0));
+			printf("deltaP = %f, deltaE = %f\n", P_host(c,0)-Pprime_host(c,0), E_host(c,0)-Eprime_host(c,0));
+#elif VelocityDimensions == 2
+			printf("N' = %f, P = %f,%f, P' = %f,%f, E = %f, E' = %f\n", Npprime_host(c), P_host(c,0),P_host(c,1), Pprime_host(c,0),Pprime_host(c,1), E_host(c,0)+E_host(c,1), Eprime_host(c,0)+Eprime_host(c,1));
+			printf("deltaP = %f,%f, deltaE = %f\n", P_host(c,0)-Pprime_host(c,0), P_host(c,1)-Pprime_host(c,1), E_host(c,0)+E_host(c,1)-Eprime_host(c,0)-Eprime_host(c,1));
+#elif VelocityDimensions == 3
+			printf("N' = %f, P = %f,%f,%f, P' = %f,%f,%f, E = %f, E' = %f\n", Npprime_host(c), P_host(c,0),P_host(c,1),P_host(c,2), Pprime_host(c,0),Pprime_host(c,1),Pprime_host(c,2), E_host(c,0)+E_host(c,1)+E_host(c,2), Eprime_host(c,0)+Eprime_host(c,1)+Eprime_host(c,2));
+			printf("deltaP = %f,%f,%f, deltaE = %f\n", P_host(c,0)-Pprime_host(c,0), P_host(c,1)-Pprime_host(c,1), P_host(c,2)-Pprime_host(c,2), E_host(c,0)+E_host(c,1)+E_host(c,2)-Eprime_host(c,0)-Eprime_host(c,1)-Eprime_host(c,2));
+#endif
+		}
+
+		// Save particles
+		save_particles(new_particles, "part2.dat");
+	}
+
+	return 0;
+}
diff --git a/example/core_tutorial/CMakeLists.txt b/example/core_tutorial/CMakeLists.txt
index 26e0493ea..e1ac76836 100644
--- a/example/core_tutorial/CMakeLists.txt
+++ b/example/core_tutorial/CMakeLists.txt
@@ -38,3 +38,4 @@ endif()
 if(Cabana_ENABLE_HDF5)
   add_subdirectory(13_hdf5_output)
 endif()
+add_subdirectory(14_gmm)