CIS5650-Fall-2025 · printer83mph · Sep 7, 2025 · Sep 7, 2025 · Sep 8, 2025 · Sep 8, 2025
diff --git a/.clangd b/.clangd
@@ -0,0 +1,68 @@
+# CompileFlags:
+#   Add:
+#     - -std=c++11
+#     - --cuda-path=/usr/local/cuda
+#     - --cuda-gpu-arch=sm_75
+#     - -L/usr/local/cuda/lib64
+#     - -I/usr/local/cuda/include
+#   Remove:
+#     - "-forward-unknown-to-host-compiler"
+#     - "-arch=native"
+#     - "--expt-*"
+#     - "--options-file"
+#     - "-G"
+
+# Apply this config conditionally to all C files
+If:
+  PathMatch: .*\.(c|h)$
+CompileFlags:
+  Compiler: /usr/bin/gcc
+
+---
+
+# Apply this config conditionally to all C++ files
+If:
+  PathMatch: .*\.(c|h)pp
+CompileFlags:
+  Compiler: /usr/bin/g++
+
+---
+
+# Apply this config conditionally to all CUDA files
+If:
+  PathMatch: .*\.cuh?
+CompileFlags:
+  Compiler: /usr/local/cuda/bin/nvcc
+
+---
+
+# Tweak the clangd parse settings for all files
+CompileFlags:
+  Add:
+    # report all errors
+    - "-ferror-limit=0"
+  Remove:
+    # strip CUDA fatbin args
+    - "-Xfatbin*"
+    # strip CUDA arch flags
+    - "-gencode*"
+    - "--generate-code*"
+    # strip CUDA flags unknown to clang
+    - "-ccbin*"
+    - "--compiler-options*"
+    - "--expt-extended-lambda"
+    - "--expt-relaxed-constexpr"
+    - "-forward-unknown-to-host-compiler"
+    - "-Werror=cross-execution-space-call"
+    - "-arch=native"
+    - "--options-file"
+    - "-G"
+
+Hover:
+  ShowAKA: No
+InlayHints:
+  Enabled: No
+Diagnostics:
+  Suppress:
+    - "variadic_device_fn"
+    - "attributes_not_allowed"
diff --git a/.gitignore b/.gitignore
@@ -258,6 +258,11 @@ bld/
 # Uncomment if you have tasks that create the project's static files in wwwroot
 #wwwroot/
 
+.vscode/*
+!.vscode/launch.json
+!.vscode/extensions.json
+!.vscode/settings.json
+
 # MSTest test Results
 [Tt]est[Rr]esult*/
 [Bb]uild[Ll]og.*
@@ -271,6 +276,9 @@ TestResult.xml
 [Rr]eleasePS/
 dlldata.c
 
+# Clangd cache
+.cache/clangd
+
 # DNX
 project.lock.json
 artifacts/

diff --git a/.vscode/launch.json b/.vscode/launch.json
@@ -0,0 +1,25 @@
+{
+  "$schema": "vscode://schemas/launch",
+  // Use IntelliSense to learn about possible attributes.
+  // Hover to view descriptions of existing attributes.
+  // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+  "version": "0.2.0",
+  "configurations": [
+    {
+      "name": "CUDA C++: Launch",
+      "type": "cuda-gdb",
+      "request": "launch",
+      "environment": [
+        {"name": "WAYLAND_DISPLAY", "value": ""},
+        {"name": "XDG_SESSION_TYPE", "value": "x11"}
+      ],
+      "program": "${workspaceFolder}/build/bin/cis5650_boids",
+      "cwd": "${workspaceFolder}"
+    },
+    {
+      "name": "CUDA C++: Attach",
+      "type": "cuda-gdb",
+      "request": "attach"
+    }
+  ]
+}
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -0,0 +1,11 @@
+{
+  "files.associations": {
+    "*.cu": "cuda-cpp"
+  },
+  "[cpp]": {
+    "editor.defaultFormatter": "llvm-vs-code-extensions.vscode-clangd"
+  },
+  "[cuda-cpp]": {
+    "editor.defaultFormatter": "llvm-vs-code-extensions.vscode-clangd"
+  }
+}
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -53,7 +53,7 @@ find_package(CCCL REQUIRED)
 set(headers
     src/cudaMat4.hpp
     src/glslUtility.hpp
-    src/kernel.h
+    src/kernel.cuh
     src/main.hpp
     src/utilityCore.hpp
     )

diff --git a/README.md b/README.md
@@ -1,11 +1,27 @@
 **University of Pennsylvania, CIS 5650: GPU Programming and Architecture,
 Project 1 - Flocking**
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Thomas Shaw
+  * [LinkedIn](https://www.linkedin.com/in/thomas-shaw-54468b222), [personal website](https://tlshaw.me), [GitHub](https://github.com/printer83mph), etc.
+* Tested on: Fedora 42, Ryzen 7 5700x @ 4.67GHz, 32GB, RTX 2070 8GB
 
-### (TODO: Your README)
+# Boids!!
 
-Include screenshots, analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+![](images/boids_showcase.gif)
+
+## Performance Analysis
+
+![](images/data_changing_boid_count.png)
+
+When increasing boid count with our naive method (left), we get a clearly superlinear, roughly quadratic curve. This checks out, since each of the N boids searches through all N boids, giving O(N²) work. Moving over to the uniform grid implementation without contiguous data (center), we get a much lower curve, which appears to grow less steeply. However, it does seem to have more deviation in kernel timings, which is likely due to the more variable number of other boids that each boid must search through. If any single boid in a warp has a lot of neighbors, the timing of the entire warp will be affected. The contiguous data mode is similar, but appears closer to linear in nature, which would make sense up until we reach the limits of SM/SP count.
+
+![](images/data_changing_block_size.png)
+
+When changing block size, we can see a correlation between block size and kernel timing in the naive method (far left). In the uniform grid methods, both scattered and contiguous, there is no clear relation between block size and compute time, with numbers appearing flat throughout. This makes sense, seeing as the naive method simply gets limited by fewer and fewer warps being available, with each one taken up by some long, long raw computation and not I/O.
+
+Full data for these charts can be found in this [Google Sheets](https://docs.google.com/spreadsheets/d/1Y02QKcO3Mqwjt8nWyBKV0KpgBx1f-lHaxcIr9DEqbnA/edit?usp=sharing).
+
+## Extra Credit
+
+- Grid-Looping Optimization
+  - This can be found in [kernel.cu](https://github.com/printer83mph/CIS5650-Project1-CUDA-Flocking/blob/0a3297f7ecc6c78a996bea0d2d22e4d4d889d054/src/kernel.cu#L485), in which we push the search radius out from the boid's current position. These points, aligned to the grid, are used as boundaries for the three nested `for` loops.
diff --git a/images/boids_showcase.gif b/images/boids_showcase.gif
diff --git a/images/data_changing_block_size.png b/images/data_changing_block_size.png
diff --git a/images/data_changing_boid_count.png b/images/data_changing_boid_count.png
diff --git a/src/cudaMat4.hpp b/src/cudaMat4.hpp
@@ -1,8 +1,8 @@
 /**
  * @file      cudaMat4.h
  * @brief     This file includes code from:
- *            Yining Karl Li's TAKUA Render, a massively parallel pathtracing renderer:
- *            http://www.yiningkarlli.com
+ *            Yining Karl Li's TAKUA Render, a massively parallel pathtracing
+ *            renderer: http://www.yiningkarlli.com
  * @authors   Yining Karl Li
  * @date      2012
  * @copyright Yining Karl Li
@@ -13,14 +13,14 @@
 #include <glm/glm.hpp>
 
 struct cudaMat3 {
-    glm::vec3 x;
-    glm::vec3 y;
-    glm::vec3 z;
+  glm::vec3 x;
+  glm::vec3 y;
+  glm::vec3 z;
 };
 
 struct cudaMat4 {
-    glm::vec4 x;
-    glm::vec4 y;
-    glm::vec4 z;
-    glm::vec4 w;
+  glm::vec4 x;
+  glm::vec4 y;
+  glm::vec4 z;
+  glm::vec4 w;
 };