diff --git a/README.md b/README.md
index 0e38ddb..d611fb0 100644
--- a/README.md
+++ b/README.md
@@ -3,12 +3,77 @@ CUDA Stream Compaction
**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
-* (TODO) YOUR NAME HERE
- * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Maya Diaz Huizar
+* Tested on: Windows 10, R7-5800X @ 3.8GHz 32GB, RTX 3080 10GB
-### (TODO: Your README)
+### Questions
+* Roughly optimize the block sizes of each of your implementations for minimal
+ run time on your GPU.
+ * Graphs:
+ * The optimal block size for the CPU implementation is N/A.
+ * 
+ * 
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+* Compare all of these GPU Scan implementations (Naive, Work-Efficient, and Thrust) to the serial CPU version of Scan. Plot a graph of the comparison (with array size on the independent axis).
+ * 
+ * 
+ * Write a brief explanation of the phenomena you see here.
+ * This generally makes sense, the efficient GPU scan and compact is much more efficient and more parallelizable, when compared to the naive approach. The CPU method is fast for small arrays and scales linearly, and thus is much worse at very large arrays when compared to the GPU implementation. Thrust almost certainly takes different approaches based on the size of the array, ensuring that it yields the best of both worlds, with fast small and large arrays. I also am wholly and entirely confident that the developers of the thrust library are more than capable of writing a faster library when compared to an undergrad CMPE major.
+* Paste the output of the test program into a triple-backtick block in your README.
+```
+The below tests results are from scanning and steam compacting 2^29 element arrays.
+****************
+** SCAN TESTS **
+****************
+ [ 42 34 32 22 8 16 34 39 37 30 7 2 14 ... 1 0 ]
+==== cpu scan, power-of-two ====
+ elapsed time: 246.098ms (std::chrono Measured)
+ [ 0 42 76 108 130 138 154 188 227 264 294 301 303 ... 264144619 264144620 ]
+==== cpu scan, non-power-of-two ====
+ elapsed time: 245.254ms (std::chrono Measured)
+ [ 0 42 76 108 130 138 154 188 227 264 294 301 303 ... 264144559 264144572 ]
+ passed
+==== naive scan, power-of-two ====
+ elapsed time: 378.275ms (CUDA Measured)
+ passed
+==== naive scan, non-power-of-two ====
+ elapsed time: 377.967ms (CUDA Measured)
+ passed
+==== work-efficient scan, power-of-two ====
+ elapsed time: 17.0086ms (CUDA Measured)
+ passed
+==== work-efficient scan, non-power-of-two ====
+ elapsed time: 16.9234ms (CUDA Measured)
+ passed
+==== thrust scan, power-of-two ====
+ elapsed time: 7.19872ms (CUDA Measured)
+ passed
+==== thrust scan, non-power-of-two ====
+ elapsed time: 7.27962ms (CUDA Measured)
+ passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+ [ 3 0 2 1 3 2 0 0 3 3 0 3 2 ... 0 0 ]
+==== cpu compact without scan, power-of-two ====
+ elapsed time: 743.234ms (std::chrono Measured)
+ passed
+==== cpu compact without scan, non-power-of-two ====
+ elapsed time: 743.523ms (std::chrono Measured)
+ passed
+==== cpu compact with scan ====
+ elapsed time: 2002.52ms (std::chrono Measured)
+ passed
+==== work-efficient compact, power-of-two ====
+ elapsed time: 1130.99ms (CUDA Measured)
+ passed
+==== work-efficient compact, non-power-of-two ====
+ elapsed time: 875.497ms (CUDA Measured)
+ passed
+```
+
+* Extra Credit
+ * My efficient GPU scan was efficient from the onset, but I also wasn't following the slides very closely. (5pt GPU approach)
+ * I also implemented improvements for memory access to better align and thus prevent bank conflicts, based upon the overview provided by GPU Gems 3 Ch 39.2.3.
diff --git a/img/Efficient GPU - Time (ms) vs Block Size (lower is better).png b/img/Efficient GPU - Time (ms) vs Block Size (lower is better).png
new file mode 100644
index 0000000..dbdb631
Binary files /dev/null and b/img/Efficient GPU - Time (ms) vs Block Size (lower is better).png differ
diff --git a/img/Naive GPU - Time (ms) vs Block Size (lower is better).png b/img/Naive GPU - Time (ms) vs Block Size (lower is better).png
new file mode 100644
index 0000000..d548f1f
Binary files /dev/null and b/img/Naive GPU - Time (ms) vs Block Size (lower is better).png differ
diff --git a/img/Stream Compaction - Time (ms) vs Element Count (lower is better).png b/img/Stream Compaction - Time (ms) vs Element Count (lower is better).png
new file mode 100644
index 0000000..07a347a
Binary files /dev/null and b/img/Stream Compaction - Time (ms) vs Element Count (lower is better).png differ
diff --git a/img/Various Scans - Time (ms) vs Element Count (lower is better).png b/img/Various Scans - Time (ms) vs Element Count (lower is better).png
new file mode 100644
index 0000000..e6180b2
Binary files /dev/null and b/img/Various Scans - Time (ms) vs Element Count (lower is better).png differ
diff --git a/src/main.cpp b/src/main.cpp
index 896ac2b..e97a47a 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -13,7 +13,7 @@
#include
#include "testing_helpers.hpp"
-const int SIZE = 1 << 8; // feel free to change the size of array
+const int SIZE = 1 << 29; // feel free to change the size of array
const int NPOT = SIZE - 3; // Non-Power-Of-Two
int *a = new int[SIZE];
int *b = new int[SIZE];
@@ -34,6 +34,7 @@ int main(int argc, char* argv[]) {
// initialize b using StreamCompaction::CPU::scan you implement
// We use b for further comparison. Make sure your StreamCompaction::CPU::scan is correct.
// At first all cases passed because b && c are all zeroes.
+
zeroArray(SIZE, b);
printDesc("cpu scan, power-of-two");
StreamCompaction::CPU::scan(SIZE, b, a);
@@ -46,6 +47,7 @@ int main(int argc, char* argv[]) {
printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
printArray(NPOT, b, true);
printCmpResult(NPOT, b, c);
+
zeroArray(SIZE, c);
printDesc("naive scan, power-of-two");
@@ -67,6 +69,7 @@ int main(int argc, char* argv[]) {
//printArray(SIZE, c, true);
printCmpResult(NPOT, b, c);
+
zeroArray(SIZE, c);
printDesc("work-efficient scan, power-of-two");
StreamCompaction::Efficient::scan(SIZE, c, a);
@@ -94,7 +97,7 @@ int main(int argc, char* argv[]) {
printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(NPOT, c, true);
printCmpResult(NPOT, b, c);
-
+
printf("\n");
printf("*****************************\n");
printf("** STREAM COMPACTION TESTS **\n");
@@ -110,12 +113,13 @@ int main(int argc, char* argv[]) {
// initialize b using StreamCompaction::CPU::compactWithoutScan you implement
// We use b for further comparison. Make sure your StreamCompaction::CPU::compactWithoutScan is correct.
+
zeroArray(SIZE, b);
printDesc("cpu compact without scan, power-of-two");
count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a);
printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
expectedCount = count;
- printArray(count, b, true);
+ //printArray(count, b, true);
printCmpLenResult(count, expectedCount, b, b);
zeroArray(SIZE, c);
@@ -123,20 +127,21 @@ int main(int argc, char* argv[]) {
count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a);
printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
expectedNPOT = count;
- printArray(count, c, true);
+ //printArray(count, c, true);
printCmpLenResult(count, expectedNPOT, b, c);
zeroArray(SIZE, c);
printDesc("cpu compact with scan");
count = StreamCompaction::CPU::compactWithScan(SIZE, c, a);
printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
- printArray(count, c, true);
+ //printArray(count, c, true);
printCmpLenResult(count, expectedCount, b, c);
zeroArray(SIZE, c);
printDesc("work-efficient compact, power-of-two");
count = StreamCompaction::Efficient::compact(SIZE, c, a);
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+ //printArray(count, b, true);
//printArray(count, c, true);
printCmpLenResult(count, expectedCount, b, c);
@@ -144,8 +149,8 @@ int main(int argc, char* argv[]) {
printDesc("work-efficient compact, non-power-of-two");
count = StreamCompaction::Efficient::compact(NPOT, c, a);
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
- //printArray(count, c, true);
printCmpLenResult(count, expectedNPOT, b, c);
+
system("pause"); // stop Win32 console from closing on exit
delete[] a;
diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu
index 2ed6d63..5d83ef1 100644
--- a/stream_compaction/common.cu
+++ b/stream_compaction/common.cu
@@ -22,18 +22,30 @@ namespace StreamCompaction {
* Maps an array to an array of 0s and 1s for stream compaction. Elements
* which map to 0 will be removed, and elements which map to 1 will be kept.
*/
- __global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
- // TODO
+ __global__ void kernMapToBoolean(int n, int* bools, const int* idata) {
+ int idx = threadIdx.x + blockIdx.x * blockDim.x;
+
+ if (idx < n) {
+ // Map to 1 if idata[idx] is non-zero, else map to 0
+ bools[idx] = (idata[idx] != 0) ? 1 : 0;
+ }
}
/**
* Performs scatter on an array. That is, for each element in idata,
* if bools[idx] == 1, it copies idata[idx] to odata[indices[idx]].
*/
- __global__ void kernScatter(int n, int *odata,
- const int *idata, const int *bools, const int *indices) {
- // TODO
+ __global__ void kernScatter(int n, int* odata, const int* idata, const int* bools, const int* indices) {
+ int idx = threadIdx.x + blockIdx.x * blockDim.x;
+
+ if (idx < n) {
+ // Perform scatter only if bools[idx] is 1
+ if (bools[idx] == 1) {
+ odata[indices[idx]] = idata[idx];
+ }
+ }
}
+
}
}
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index 719fa11..80c11da 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -19,32 +19,87 @@ namespace StreamCompaction {
*/
void scan(int n, int *odata, const int *idata) {
timer().startCpuTimer();
- // TODO
+
+ // make sure elements exist
+ if (n <= 0) {
+ timer().endCpuTimer();
+ return;
+ }
+
+ // add identity for exclusive scan
+ odata[0] = 0;
+ for (int i = 1; i < n; i++) {
+ // do scan in one big for loop :(
+ odata[i] = odata[i - 1] + idata[i - 1];
+ }
+
timer().endCpuTimer();
}
/**
* CPU stream compaction without using the scan function.
- *
+ * @param n number of elements in initial array
+ * @param idata input array, not modified
+ * @param odata output array
* @returns the number of elements remaining after compaction.
*/
int compactWithoutScan(int n, int *odata, const int *idata) {
timer().startCpuTimer();
- // TODO
+
+ int count = 0;
+ // loop over entire array as one big loop,
+ for (int i = 0; i < n; i++) {
+ // check if they are zero (throw out) or non-zero (keep)
+ if (idata[i] != 0) {
+ odata[count] = idata[i];
+ count++;
+ }
+ }
timer().endCpuTimer();
- return -1;
+ return count;
}
/**
- * CPU stream compaction using scan and scatter, like the parallel version.
- *
- * @returns the number of elements remaining after compaction.
- */
- int compactWithScan(int n, int *odata, const int *idata) {
+ * CPU stream compaction using scan and scatter, like the parallel version.
+ *
+ * @returns the number of elements remaining after compaction.
+ */
+ int compactWithScan(int n, int* odata, const int* idata) {
timer().startCpuTimer();
- // TODO
+
+ // create temporary array
+ int* temp = new int[n];
+
+ // loop over creating boolean array
+ for (int i = 0; i < n; ++i) {
+ temp[i] = (idata[i] != 0) ? 1 : 0;
+ }
+
+ // create array for scan result
+ int* scanResult = new int[n];
+ scanResult[0] = 0;
+ // loop, exclusive scan
+ for (int i = 1; i < n; ++i) {
+ scanResult[i] = scanResult[i - 1] + temp[i - 1];
+ }
+
+ // final loop, use scan result and boolean result to generate new array
+ int count = 0;
+ for (int i = 0; i < n; ++i) {
+ if (temp[i] == 1) {
+ odata[scanResult[i]] = idata[i];
+ count++;
+ }
+ }
+
+ // cleanup
+ delete[] temp;
+ delete[] scanResult;
+
timer().endCpuTimer();
- return -1;
+
+ return count;
}
+
}
}
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index 2db346e..3a6e8d1 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -2,6 +2,17 @@
#include
#include "common.h"
#include "efficient.h"
+#include "device_launch_parameters.h"
+#include
+
+#ifndef __CUDACC__
+#define __CUDACC__
+#endif
+
+#define NUM_BANKS 16
+#define LOG_NUM_BANKS 4
+#define CONFLICT_FREE_OFFSET(n) ((n) >> LOG_NUM_BANKS)
+
namespace StreamCompaction {
namespace Efficient {
@@ -11,30 +22,293 @@ namespace StreamCompaction {
static PerformanceTimer timer;
return timer;
}
+
+ // helper function to get next power of two (for host)
+ __host__ int h_nextPowerOfTwo(int n) {
+ int power = 1;
+ while (power < n)
+ power <<= 1;
+ return power;
+ }
+
+ // helper function to get next power of two (for device)
+ __device__ int d_nextPowerOfTwo(int x) {
+ if (x == 0) {
+ return 1;
+ }
+
+ x--;
+
+ x |= x >> 1;
+ x |= x >> 2;
+ x |= x >> 4;
+ x |= x >> 8;
+ x |= x >> 16;
+
+ return x + 1;
+ }
+
+ // do scan from GPU Gems
+ __global__ void scan_ker(int* g_odata, const int* g_idata, int* g_block_sums, int n) {
+ extern __shared__ int temp[]; // allocated on invocation
+ int thid = threadIdx.x;
+
+ int blockOffset = blockIdx.x * blockDim.x * 2;
+ int ai = thid;
+ int bi = thid + blockDim.x;
+
+ // number of elements to process in this block
+ int n_block = 2 * blockDim.x;
+
+ // next power of two greater or equal to n_block
+ int n_shared = n_block;
+ int bankOffsetA = CONFLICT_FREE_OFFSET(ai);
+ int bankOffsetB = CONFLICT_FREE_OFFSET(bi);
+
+ // load input into shared memory with padding
+ if (blockOffset + ai < n)
+ temp[ai + bankOffsetA] = g_idata[blockOffset + ai];
+ else
+ temp[ai + bankOffsetA] = 0;
+ if (blockOffset + bi < n)
+ temp[bi + bankOffsetB] = g_idata[blockOffset + bi];
+ else
+ temp[bi + bankOffsetB] = 0;
+
+ // build sum in place up the tree
+ int offset = 1;
+ for (int d = n_shared >> 1; d > 0; d >>= 1)
+ {
+ __syncthreads();
+ if (thid < d)
+ {
+ int ai = offset * (2 * thid + 1) - 1;
+ int bi = offset * (2 * thid + 2) - 1;
+
+ int bankOffsetA = CONFLICT_FREE_OFFSET(ai);
+ int bankOffsetB = CONFLICT_FREE_OFFSET(bi);
+
+ temp[bi + bankOffsetB] += temp[ai + bankOffsetA];
+ }
+ offset <<= 1;
+ }
+
+ // clear the last element
+ if (thid == 0) {
+ if (g_block_sums != NULL)
+ g_block_sums[blockIdx.x] = temp[n_shared - 1 + CONFLICT_FREE_OFFSET(n_shared - 1)];
+ temp[n_shared - 1 + CONFLICT_FREE_OFFSET(n_shared - 1)] = 0;
+ }
+
+ // traverse down tree & build scan
+ for (int d = 1; d < n_shared; d <<= 1)
+ {
+ offset >>= 1;
+ __syncthreads();
+ if (thid < d)
+ {
+ int ai = offset * (2 * thid + 1) - 1;
+ int bi = offset * (2 * thid + 2) - 1;
+
+ int bankOffsetA = CONFLICT_FREE_OFFSET(ai);
+ int bankOffsetB = CONFLICT_FREE_OFFSET(bi);
+
+ int t = temp[ai + bankOffsetA];
+ temp[ai + bankOffsetA] = temp[bi + bankOffsetB];
+ temp[bi + bankOffsetB] += t;
+ }
+ }
+ __syncthreads();
+
+ // write results to global memory
+ if (blockOffset + ai < n)
+ g_odata[blockOffset + ai] = temp[ai + bankOffsetA];
+ if (blockOffset + bi < n)
+ g_odata[blockOffset + bi] = temp[bi + bankOffsetB];
+ }
+
+ // kernel to add the scanned block sums to each block
+ __global__ void add_scanned_block_sums(int* g_data, const int* g_block_sums, int n) {
+ int index = threadIdx.x + blockIdx.x * blockDim.x * 2;
+ int offset = blockIdx.x;
+
+ if (offset == 0) return; // skip first block
+
+ int addValue = g_block_sums[offset];
+
+ if (index < n)
+ g_data[index] += addValue;
+ if (index + blockDim.x < n)
+ g_data[index + blockDim.x] += addValue;
+ }
+
+ // scan function
+ void scanRecursive(int n, int* d_odata, const int* d_idata) {
+ // base case
+ if (n <= 1024) {
+ int threadsPerBlock = (n + 1) / 2;
+ int sharedMemSize = h_nextPowerOfTwo(n) * sizeof(int);
+
+ scan_ker<<<1, threadsPerBlock, sharedMemSize>>>(d_odata, d_idata, NULL, n);
+ cudaDeviceSynchronize();
+ checkCUDAError("scan_ker base case kernel execution");
+ return;
+ }
+
+ // determine block and grid sizes
+ int threadsPerBlock = 512;
+ int elementsPerBlock = threadsPerBlock * 2;
+ int numBlocks = (n + elementsPerBlock - 1) / elementsPerBlock;
+
+ // allocate memory for block sums
+ int* d_block_sums;
+ cudaMalloc((void**)&d_block_sums, numBlocks * sizeof(int));
+ checkCUDAError("cudaMalloc d_block_sums");
+
+ // shared memory size per block
+ int sharedMemSize = 2 * threadsPerBlock * sizeof(int);
+
+ // launch the scan kernel
+ scan_ker <<>>(d_odata, d_idata, d_block_sums, n);
+ cudaDeviceSynchronize();
+ checkCUDAError("scan_ker kernel execution");
+
+ // if there is more than one block, we need to scan the block sums and add them to the data
+ if (numBlocks > 1) {
+ // allocate memory for scanned block sums
+ int* d_scanned_block_sums;
+ cudaMalloc((void**)&d_scanned_block_sums, numBlocks * sizeof(int));
+ checkCUDAError("cudaMalloc d_scanned_block_sums");
+
+ // recursively call scanRecursive on the block sums array
+ scanRecursive(numBlocks, d_scanned_block_sums, d_block_sums);
+
+ // launch kernel to add scanned block sums to data
+ add_scanned_block_sums<<>>(d_odata, d_scanned_block_sums, n);
+ cudaDeviceSynchronize();
+ checkCUDAError("add_scanned_block_sums kernel execution");
+
+ cudaFree(d_scanned_block_sums);
+ }
+
+ cudaFree(d_block_sums);
+ }
+
+ // scan (has timer)
+ void scan(int n, int* odata, const int* idata) {
+ // allocate device memory
+ int* d_idata, * d_odata;
+ cudaMalloc((void**)&d_idata, n * sizeof(int));
+ checkCUDAError("cudaMalloc d_idata");
+ cudaMalloc((void**)&d_odata, n * sizeof(int));
+ checkCUDAError("cudaMalloc d_odata");
+
+ // copy input data to device
+ cudaMemcpy(d_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+ checkCUDAError("cudaMemcpy to d_idata");
- /**
- * Performs prefix-sum (aka scan) on idata, storing the result into odata.
- */
- void scan(int n, int *odata, const int *idata) {
timer().startGpuTimer();
- // TODO
+
+ // call the recursive scan function
+ scanRecursive(n, d_odata, d_idata);
+
timer().endGpuTimer();
+
+ // copy result back to host
+ cudaMemcpy(odata, d_odata, n * sizeof(int), cudaMemcpyDeviceToHost);
+ checkCUDAError("cudaMemcpy to odata");
+
+ // free device memory
+ cudaFree(d_idata);
+ cudaFree(d_odata);
+ }
+
+ // same as above, but does not call the GPU timer, to be used within a larger call in stream compact (efficient)
+ void scan_no_timer(int n, int* odata, const int* idata) {
+ // allocate device memory
+ int* d_idata, * d_odata;
+ cudaMalloc((void**)&d_idata, n * sizeof(int));
+ checkCUDAError("cudaMalloc d_idata");
+ cudaMalloc((void**)&d_odata, n * sizeof(int));
+ checkCUDAError("cudaMalloc d_odata");
+
+ // copy input data to device
+ cudaMemcpy(d_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+ checkCUDAError("cudaMemcpy to d_idata");
+
+
+ // call the recursive scan function
+ scanRecursive(n, d_odata, d_idata);
+
+
+ // copy result back to host
+ cudaMemcpy(odata, d_odata, n * sizeof(int), cudaMemcpyDeviceToHost);
+ checkCUDAError("cudaMemcpy to odata");
+
+ // free device memory
+ cudaFree(d_idata);
+ cudaFree(d_odata);
}
- /**
- * Performs stream compaction on idata, storing the result into odata.
- * All zeroes are discarded.
- *
- * @param n The number of elements in idata.
- * @param odata The array into which to store elements.
- * @param idata The array of elements to compact.
- * @returns The number of elements remaining after compaction.
- */
- int compact(int n, int *odata, const int *idata) {
+ // compact
+ int compact(int n, int* odata, const int* idata) {
+ // allocate memory on the device
+ int* d_idata, * d_bools, * d_indices, * d_odata;
+ cudaMalloc((void**)&d_idata, n * sizeof(int));
+ cudaMalloc((void**)&d_bools, n * sizeof(int));
+ cudaMalloc((void**)&d_indices, n * sizeof(int));
+ cudaMalloc((void**)&d_odata, n * sizeof(int));
+
+ // copy input data to device
+ cudaMemcpy(d_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+
+ int blockSize = 1024;
+ int gridSize = (n + blockSize - 1) / blockSize; // Ensure all elements are covered
+
+ // start timing after memory allocations and copies
timer().startGpuTimer();
- // TODO
+
+ // map input data to boolean (1 for non-zero, 0 for zero)
+ StreamCompaction::Common::kernMapToBoolean<<>>(n, d_bools, d_idata);
+ cudaDeviceSynchronize();
+
+ // perform an exclusive prefix sum (scan) on the boolean array
+ scan_no_timer(n, d_indices, d_bools);
+ cudaDeviceSynchronize();
+
+ // scatter non-zero elements from idata to odata based on the scan results
+ StreamCompaction::Common::kernScatter<<>>(n, d_odata, d_idata, d_bools, d_indices);
+ cudaDeviceSynchronize();
+
+ // end timing before any device-to-host memory transfers
timer().endGpuTimer();
- return -1;
+
+ // retrieve the number of valid (non-zero) elements
+ int numValidElements;
+ cudaMemcpy(&numValidElements, &d_indices[n - 1], sizeof(int), cudaMemcpyDeviceToHost);
+
+ // check if the last element is valid (if bools[n - 1] is 1, add 1 to numValidElements)
+ int lastBool;
+ cudaMemcpy(&lastBool, &d_bools[n - 1], sizeof(int), cudaMemcpyDeviceToHost);
+
+ if (lastBool == 1) {
+ numValidElements += 1;
+ }
+
+ // copy the compacted data to the output array on the host
+ cudaMemcpy(odata, d_odata, numValidElements * sizeof(int), cudaMemcpyDeviceToHost);
+
+ // free device memory
+ cudaFree(d_idata);
+ cudaFree(d_bools);
+ cudaFree(d_indices);
+ cudaFree(d_odata);
+
+ // return the number of elements remaining after compaction
+ return numValidElements;
}
+
+
+
}
}
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 4308876..f712113 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -11,15 +11,75 @@ namespace StreamCompaction {
static PerformanceTimer timer;
return timer;
}
- // TODO: __global__
+
+ // naive scan (kernel)
+ __global__ void scanKernel(int* g_odata, const int* g_idata, int n, int offset) {
+ // get index
+ int index = threadIdx.x + blockIdx.x * blockDim.x;
+
+ // return early if bad val
+ if (index >= n) return;
+
+
+ if (index >= offset) {
+ g_odata[index] = g_idata[index] + g_idata[index - offset];
+
+ } else {
+ g_odata[index] = g_idata[index];
+ }
+ }
+
/**
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
- void scan(int n, int *odata, const int *idata) {
+ void scan(int n, int* odata, const int* idata) {
+ // allocate memory on device
+ int* d_ping, * d_pong;
+ cudaMalloc((void**)&d_ping, n * sizeof(int));
+ checkCUDAError("cudaMalloc d_ping");
+ cudaMalloc((void**)&d_pong, n * sizeof(int));
+ checkCUDAError("cudaMalloc d_pong");
+
+ // copy data over
+ cudaMemcpy(d_ping + 1, idata, (n - 1) * sizeof(int), cudaMemcpyHostToDevice);
+ checkCUDAError("cudaMemcpy to d_ping");
+
+ int zero = 0;
+ cudaMemcpy(d_ping, &zero, sizeof(int), cudaMemcpyHostToDevice);
+ checkCUDAError("cudaMemcpy identity to d_ping");
+
+ // setup info
+ int logn = ilog2ceil(n);
+ int blockSize = 384;
+ int numBlocks = (n + blockSize - 1) / blockSize;
+
timer().startGpuTimer();
- // TODO
+
+ // do scan on subset of array
+ for (int d = 0; d < logn; d++) {
+ int offset = 1 << d;
+
+ // launch kernel
+ scanKernel<<>>(d_pong, d_ping, n, offset);
+ cudaDeviceSynchronize();
+ checkCUDAError("scanKernel execution");
+
+ // swap pointers
+ int* temp = d_ping;
+ d_ping = d_pong;
+ d_pong = temp;
+ }
+
timer().endGpuTimer();
+
+ // copy result back to host memory
+ cudaMemcpy(odata, d_ping, n * sizeof(int), cudaMemcpyDeviceToHost);
+ checkCUDAError("cudaMemcpy to odata");
+
+ // free device memory
+ cudaFree(d_ping);
+ cudaFree(d_pong);
}
}
}
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index 1def45e..3a14394 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -17,12 +17,20 @@ namespace StreamCompaction {
/**
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
- void scan(int n, int *odata, const int *idata) {
+ void scan(int n, int* odata, const int* idata) {
+
+ // create thrust device vectors
+ thrust::device_vector d_idata(idata, idata + n);
+ thrust::device_vector d_odata(n);
timer().startGpuTimer();
- // TODO use `thrust::exclusive_scan`
- // example: for device_vectors dv_in and dv_out:
- // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+
+ // use thrust exclusive scan
+ thrust::exclusive_scan(d_idata.begin(), d_idata.end(), d_odata.begin());
+
timer().endGpuTimer();
+
+ // copy result back
+ thrust::copy(d_odata.begin(), d_odata.end(), odata);
}
}
}