diff --git a/README.md b/README.md index 0e38ddb..1ee66cd 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,62 @@ -CUDA Stream Compaction -====================== +# CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +- Jinxiang Wang +- Tested on: Windows 11, AMD Ryzen 9 8945HS w/ Radeon 780M Graphics 4.00 GHz 32GB, RTX 4070 Laptop 8 GB ### (TODO: Your README) -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +# 565hw2 +Owner: Andy Wang +Tags: CG General + +### Features Implemented + +1. CPU Scan & Stream Compaction +2. Naive GPU Scan Algorithm +3. Work-Efficient GPU Scan & Stream Compaction +4. Thrust Implementation + +### Results + +**Optimized Block Size:** + +![image.png](results/image.png) + +For different GPU scan implementations, applying different block size will achieve different performance. As indicated in the graph, the optimized block size of **Naive scan method** could be **1024**, and for **Work-Efficient method** it could be **256** + +**Scan Methods Performance Comparison** + +![image.png](results/image1.png) + +![image.png](results/image2.png) + +When dealing with array size less than 2^24, the difference is subtle between different methods. But as scan size increase exponentially, thrust implementations **out-performs** the CPU implementation at array size equals **2^24.** + +**Compact Methods Performance Comparison** + +![image.png](results/image3.png) + +The results from compact test is similar from what we had in scan test, where GPU implementation **out-performs** CPU implementation at array size equals **2^24.** + +**What does thrust do?** + +![Thrust.png](results/Thrust.png) + +![image.png](results/image4.png) + +By checking Nsight Compute, we can observe that Thrust implementation only take 2 step to finish the scan algorithm. The allocation and utilization of grid size and registers is much different from my implementation. + +**Bottleneck** + +![Bottleneck.png](results/Bottleneck.png) + +Comparing with the above shown results from Thrust implementation, there are lots of software calls in my algorithm. The ballance between compute throughput and memory thoughput is not optimized. A potential solution might be to implement this algorithm using shared memory to reduce memory throughput. + +**Result** + +Optimized Block Size, Array Size of $2^{24}$ + +![size24.png](results/size24.png) diff --git a/nsightoutput.ncu-rep b/nsightoutput.ncu-rep new file mode 100644 index 0000000..8493b95 Binary files /dev/null and b/nsightoutput.ncu-rep differ diff --git a/project2analysis/565hw2 1052caacc10180799430ec3107c815e1.md b/project2analysis/565hw2 1052caacc10180799430ec3107c815e1.md new file mode 100644 index 0000000..172587d --- /dev/null +++ b/project2analysis/565hw2 1052caacc10180799430ec3107c815e1.md @@ -0,0 +1,53 @@ +# 565hw2 + +Owner: Andy Wang +Tags: CG General + +### Features Implemented + +1. CPU Scan & Stream Compaction +2. Naive GPU Scan Algorithm +3. Work-Efficient GPU Scan & Stream Compaction +4. Thrust Implementation + +### Results + +**Optimized Block Size:** + +![image.png](image.png) + +For different GPU scan implementations, applying different block size will achieve different performance. As indicated in the graph, the optimized block size of **Naive scan method** could be **1024**, and for **Work-Efficient method** it could be **256** + +**Scan Methods Performance Comparison** + +![image.png](image%201.png) + +![image.png](image%202.png) + +When dealing with array size less than 2^24, the difference is subtle between different methods. But as scan size increase exponentially, thrust implementations **out-performs** the CPU implementation at array size equals **2^24.** + +**Compact Methods Performance Comparison** + +![image.png](image%203.png) + +The results from compact test is similar from what we had in scan test, where GPU implementation **out-performs** CPU implementation at array size equals **2^24.** + +**What does thrust do?** + +![Thrust.png](Thrust.png) + +![image.png](image%204.png) + +By checking Nsight Compute, we can observe that Thrust implementation only take 2 step to finish the scan algorithm. The allocation and utilization of grid size and registers is much different from my implementation. + +**Bottleneck** + +![Bottleneck.png](Bottleneck.png) + +Comparing with the above shown results from Thrust implementation, there are lots of software calls in my algorithm. The ballance between compute throughput and memory thoughput is not optimized. A potential solution might be to implement this algorithm using shared memory to reduce memory throughput. + +**Result** + +Optimized Block Size, Array Size of $2^{24}$ + +![size24.png](size24.png) \ No newline at end of file diff --git a/project2analysis/Bottleneck.png b/project2analysis/Bottleneck.png new file mode 100644 index 0000000..c91ac27 Binary files /dev/null and b/project2analysis/Bottleneck.png differ diff --git a/project2analysis/Thrust.png b/project2analysis/Thrust.png new file mode 100644 index 0000000..b2f7a16 Binary files /dev/null and b/project2analysis/Thrust.png differ diff --git a/project2analysis/image.png b/project2analysis/image.png new file mode 100644 index 0000000..4b01ddb Binary files /dev/null and b/project2analysis/image.png differ diff --git a/project2analysis/image1.png b/project2analysis/image1.png new file mode 100644 index 0000000..ce680e9 Binary files /dev/null and b/project2analysis/image1.png differ diff --git a/project2analysis/image2.png b/project2analysis/image2.png new file mode 100644 index 0000000..7fde85a Binary files /dev/null and b/project2analysis/image2.png differ diff --git a/project2analysis/image3.png b/project2analysis/image3.png new file mode 100644 index 0000000..b86d792 Binary files /dev/null and b/project2analysis/image3.png differ diff --git a/project2analysis/image4.png b/project2analysis/image4.png new file mode 100644 index 0000000..8b4f8c3 Binary files /dev/null and b/project2analysis/image4.png differ diff --git a/project2analysis/size24.png b/project2analysis/size24.png new file mode 100644 index 0000000..a59a887 Binary files /dev/null and b/project2analysis/size24.png differ diff --git a/results/Bottleneck.png b/results/Bottleneck.png new file mode 100644 index 0000000..c91ac27 Binary files /dev/null and b/results/Bottleneck.png differ diff --git a/results/Thrust.png b/results/Thrust.png new file mode 100644 index 0000000..b2f7a16 Binary files /dev/null and b/results/Thrust.png differ diff --git a/results/image.png b/results/image.png new file mode 100644 index 0000000..4b01ddb Binary files /dev/null and b/results/image.png differ diff --git a/results/image1.png b/results/image1.png new file mode 100644 index 0000000..ce680e9 Binary files /dev/null and b/results/image1.png differ diff --git a/results/image2.png b/results/image2.png new file mode 100644 index 0000000..7fde85a Binary files /dev/null and b/results/image2.png differ diff --git a/results/image3.png b/results/image3.png new file mode 100644 index 0000000..b86d792 Binary files /dev/null and b/results/image3.png differ diff --git a/results/image4.png b/results/image4.png new file mode 100644 index 0000000..8b4f8c3 Binary files /dev/null and b/results/image4.png differ diff --git a/results/size24.png b/results/size24.png new file mode 100644 index 0000000..a59a887 Binary files /dev/null and b/results/size24.png differ diff --git a/src/main.cpp b/src/main.cpp index 896ac2b..90fcea1 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,7 +13,7 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array +const int SIZE = 1 << 24; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 719fa11..317cbb7 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -18,11 +18,26 @@ namespace StreamCompaction { * (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first. */ void scan(int n, int *odata, const int *idata) { + timer().startCpuTimer(); // TODO + odata[0] = 0; + for (int i = 1; i < n; i++) { + odata[i] = odata[i - 1] + idata[i - 1]; + } + timer().endCpuTimer(); } + void scanWithoutTimer(int n, int* odata, const int* idata) { + + // TODO + odata[0] = 0; + for (int i = 1; i < n; i++) { + odata[i] = odata[i - 1] + idata[i - 1]; + } + + } /** * CPU stream compaction without using the scan function. * @@ -31,8 +46,15 @@ namespace StreamCompaction { int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + int count = 0; + for (int i = 0; i < n; i++) { + if (idata[i] != 0) { + odata[count] = idata[i]; + count++; + } + } timer().endCpuTimer(); - return -1; + return count; } /** @@ -43,8 +65,24 @@ namespace StreamCompaction { int compactWithScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + int* temp = new int[n]; + int* scanResult = new int[n]; + for (int i = 0; i < n; i++) { + temp[i] = (idata[i] == 0) ? 0 : 1; + } + scanWithoutTimer(n, scanResult, temp); + int count = 0; + for (int i = 0; i < n; i++) { + if (temp[i] != 0) { + odata[scanResult[i]] = idata[i]; + count++; + } + } + delete[] temp; + delete[] scanResult; + timer().endCpuTimer(); - return -1; + return count; } } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 2db346e..2cf0964 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -6,21 +6,89 @@ namespace StreamCompaction { namespace Efficient { using StreamCompaction::Common::PerformanceTimer; + #define blockSize 256 PerformanceTimer& timer() { static PerformanceTimer timer; return timer; } + __global__ void kernUpSweep(int n, int* odata, int d) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n || index % (1 << (d + 1)) != 0) return; + + odata[index + (1 << (d + 1)) - 1] += odata[index + (1 << d) - 1]; + } + + __global__ void kernDownSweep(int n, int* odata, int d) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n || index % (1 << (d + 1)) != 0) return; + + + int t = odata[index + (1 << d) - 1]; + odata[index + (1 << d) - 1] = odata[index + (1 << (d + 1)) - 1]; + odata[index + (1 << (d + 1)) - 1] += t; + } + + __global__ void computeTempArray(int n, int* odata, const int* idata) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) return; + + odata[index] = idata[index] == 0 ? 0 : 1; + } + + __global__ void scatter(int n, int* odata, const int* idata, const int* bools, const int* scan) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) return; + + if (bools[index] > 0) { + odata[scan[index]] = idata[index]; + } + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); // TODO - timer().endGpuTimer(); - } + //int blockSize = 128; + int npower2 = 1 << ilog2ceil(n); + int* dev_odata; + cudaMalloc((void**)&dev_odata, npower2 * sizeof(int)); + checkCUDAError("cudaMalloc dev_odata failed!"); + cudaMemset(dev_odata, 0, npower2 * sizeof(int)); + cudaMemcpy(dev_odata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy idata to dev_idata failed!"); + + dim3 fullBlocksPerGrid((npower2 + blockSize - 1) / blockSize); + timer().startGpuTimer(); + + // up sweep + for (int d = 0; d < ilog2ceil(n); d++) { + kernUpSweep << > > (npower2, dev_odata, d); + checkCUDAError("kernUpSweep failed!"); + cudaDeviceSynchronize(); + } + + // down sweep + cudaMemset(dev_odata + npower2 - 1, 0, sizeof(int)); + for (int d = ilog2ceil(npower2) - 1; d >= 0; d--) { + kernDownSweep << > > (npower2, dev_odata, d); + checkCUDAError("kernDownSweep failed!"); + cudaDeviceSynchronize(); + } + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpy dev_odata to odata failed!"); + + cudaFree(dev_odata); + + /*for (int i = 0; i < n; i++) { + printf("%d ", odata[i]); + }*/ + } /** * Performs stream compaction on idata, storing the result into odata. * All zeroes are discarded. @@ -30,11 +98,88 @@ namespace StreamCompaction { * @param idata The array of elements to compact. * @returns The number of elements remaining after compaction. */ - int compact(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); - return -1; - } + + + int compactPower2(int n, int* odata, const int* idata) { + // TODO + //int blockSize = 128; + + int* dev_tempArray; + int* dev_scanArray; + int* dev_idata; + int* dev_odata; + + cudaMalloc((void**)&dev_tempArray, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_tempArray failed!"); + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_idata failed!"); + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_odata failed!"); + cudaMalloc((void**)&dev_scanArray, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_scanArray failed!"); + + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy idata to dev_idata failed!"); + timer().startGpuTimer(); + + // compute tempArray + computeTempArray << <(n + blockSize - 1) / blockSize, blockSize >> > (n, dev_tempArray, dev_idata); + checkCUDAError("computeTempArray failed!"); + cudaDeviceSynchronize(); + + // up sweep and down sweep + cudaMemcpy(dev_scanArray, dev_tempArray, n * sizeof(int), cudaMemcpyDeviceToDevice); + for (int d = 0; d < ilog2ceil(n); d++) { + kernUpSweep << <(n + blockSize - 1) / blockSize, blockSize >> > (n, dev_scanArray, d); + checkCUDAError("kernUpSweep failed!"); + cudaDeviceSynchronize(); + } + + + cudaMemset(dev_scanArray + n - 1, 0, sizeof(int)); + for (int d = ilog2ceil(n) - 1; d >= 0; d--) { + kernDownSweep << <(n + blockSize - 1) / blockSize, blockSize >> > (n, dev_scanArray, d); + checkCUDAError("kernDownSweep failed!"); + cudaDeviceSynchronize(); + } + + // scatter + scatter << <(n + blockSize - 1) / blockSize, blockSize >> > (n, dev_odata, dev_idata, dev_tempArray, dev_scanArray); + checkCUDAError("scatter failed!"); + cudaDeviceSynchronize(); + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost); + + int* host_scanArray = new int[n]; + cudaMemcpy(host_scanArray, dev_scanArray, n * sizeof(int), cudaMemcpyDeviceToHost); + int count = host_scanArray[n - 1]; + + delete[] host_scanArray; + cudaFree(dev_tempArray); + cudaFree(dev_idata); + cudaFree(dev_odata); + cudaFree(dev_scanArray); + + return count; + } + + int compact(int n, int* odata, const int* idata) { + int npower2 = 1 << ilog2ceil(n); + int* idata_power2 = new int[npower2]; + memset(idata_power2, 0, npower2 * sizeof(int)); + memcpy(idata_power2, idata, n * sizeof(int)); + + int* odata_power2 = new int[npower2]; + memset(odata_power2, 0, npower2 * sizeof(int)); + + int count = compactPower2(npower2, odata_power2, idata_power2); + memcpy(odata, odata_power2, count * sizeof(int)); + + delete[] idata_power2; + delete[] odata_power2; + + return count; + } } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 4308876..b3c417f 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -6,20 +6,68 @@ namespace StreamCompaction { namespace Naive { using StreamCompaction::Common::PerformanceTimer; + #define blockSize 1024 PerformanceTimer& timer() { static PerformanceTimer timer; return timer; } // TODO: __global__ + __global__ void kernNaiveScan(int n, int* odata, const int* idata, int d) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) return; + + if (index >= (1 << d)) { + odata[index] = idata[index - (1 << d)] + idata[index]; + } + else { + odata[index] = idata[index]; + } + } + + __global__ void kernShiftRight(int n, int* odata, const int* idata) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) return; + + odata[index] = index == 0 ? 0 : idata[index - 1]; + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); // TODO - timer().endGpuTimer(); + + int* dev_idata; + int* dev_odata; + + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_idata failed!"); + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_odata failed!"); + + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy idata to dev_idata failed!"); + + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + timer().startGpuTimer(); + for (int d = 0; d < ilog2ceil(n); d++) { + kernNaiveScan << > > (n, dev_odata, dev_idata, d); + checkCUDAError("kernNaiveScan failed!"); + cudaMemcpy(dev_idata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToDevice); + } + + kernShiftRight << > > (n, dev_odata, dev_idata); + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpy dev_odata to odata failed!"); + + + + cudaFree(dev_idata); + cudaFree(dev_odata); + } } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 1def45e..d7d4f8f 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -18,11 +18,18 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::device_vector dv_in(idata, idata + n); + thrust::device_vector dv_out(n); + timer().startGpuTimer(); + thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); timer().endGpuTimer(); + + thrust::copy(dv_out.begin(), dv_out.end(), odata); + } + } }