feat: add remaining tests in Chapter 3

2026-06-05 00:32:49 +08:00 · 2026-01-09 14:43:48 +08:00
parent ed2915e97b
commit b0fcce99c7
8 changed files with 319 additions and 1 deletions
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -82,7 +82,8 @@
    "terminal.integrated.profiles.windows": {
        "Command Prompt with oneAPI": {
            "path": "cmd.exe",
-            "args": ["/k", "${config:oneapi.root.windows}/setvars.bat", "&&", "powershell"]
+             // need to set VS2022INSTALLDIR envvar when WARNING: Visual Studio was not found in a standard install location
+            "args": ["/k", "${config:oneapi.root.windows}/setvars.bat", "intel64", "vs2022", "&&", "powershell"]
        }
    },
    "terminal.integrated.profiles.linux": {
--- a/README.md
+++ b/README.md
@@ -95,6 +95,63 @@ cmake --build .
 ctest
 ```

+### 4. Performance Analysis with Intel VTune
+
+Intel VTune Profiler is a powerful performance analysis tool that can help you identify bottlenecks and optimize the applications.
+
+#### Prerequisites
+
+- Intel VTune Profiler (included in Intel oneAPI Base Toolkit)
+- Compiled Base16384-SYCL application or tests with debug symbols (use `RelWithDebInfo` build type)
+
+#### Running VTune Analysis
+
+**1. Launch VTune GUI:**
+
+```bash
+vtune-gui
+```
+
+**2. Create a New Project:**
+
+- Click "New Project" in the welcome screen
+- Set project name and location
+- Configure the target application path
+
+**3. Configure Analysis Type:**
+
+Choose an analysis type based on your profiling goals:
+
+- **Hotspots Analysis**: Identify CPU-intensive functions
+- **GPU Offload Analysis**: Analyze GPU kernel performance and host-device data transfer
+- **Memory Consumption**: Track memory usage patterns
+- **Threading Analysis**: Detect threading issues and analyze parallelism
+
+**4. Run the Analysis:**
+
+- Click the "Start" button to begin profiling
+- VTune will execute your application and collect performance data
+
+**5. Analyze Results:**
+
+![VTune Analysis Results of basic test](./assets/vtune-b14-test-basic.png)
+
+**Key metrics to examine:**
+
+- **Kernel Execution Time**: Time spent in SYCL kernels
+- **Memory Transfer Overhead**: Host-to-device and device-to-host data transfer time
+- **CPU Utilization**: Host CPU usage during GPU operations
+- **GPU Utilization**: GPU compute unit occupancy
+
+#### Optimization Tips
+
+Based on VTune analysis, consider these optimization strategies:
+
+1. **Reduce Host-Device Transfer**: Minimize data copying between CPU and GPU
+2. **Increase Kernel Occupancy**: Optimize work-group sizes and global range
+3. **Use Shared Memory**: Leverage local memory for frequently accessed data
+4. **Batch Operations**: Process larger data chunks to amortize kernel launch overhead
+
 ## Build Configuration

 The project supports multiple build configurations:
--- a/assets/vtune-b14-test-basic.png
+++ b/assets/vtune-b14-test-basic.png
--- a/tests/fig-3-10_in-order-queue-usage.cpp
+++ b/tests/fig-3-10_in-order-queue-usage.cpp
@@ -0,0 +1,41 @@
+// Figure 3-10. In-order queue usage
+// from book - Data Parallel C++
+// https://link.springer.com/book/10.1007/978-1-4842-5574-2
+
+#include <sycl/sycl.hpp>
+
+constexpr int N = 4;
+int main() {
+  sycl::queue Q{sycl::property::queue::in_order()};
+  int* device_array = sycl::malloc_device<int>(N, Q);
+
+  // Task A
+  Q.submit(
+      [&](sycl::handler& h) { h.parallel_for(N, [=](sycl::id<1> i) { device_array[i] = 0; }); });
+  // Task B
+  Q.submit([&](sycl::handler& h) { h.parallel_for(N, [=](sycl::id<1> i) { device_array[i]++; }); });
+  // Task C
+  Q.submit(
+      [&](sycl::handler& h) { h.parallel_for(N, [=](sycl::id<1> i) { device_array[i] <<= 2; }); });
+
+  std::array<int, N> host_array;
+  Q.submit([&](sycl::handler& h) {
+    // copy deviceArray back to hostArray
+    h.memcpy(&host_array[0], device_array, N * sizeof(int));
+  });
+
+  Q.wait();
+
+  sycl::free(device_array, Q);
+
+  for (int i = 0; i < host_array.size(); i++) {
+    if (host_array[i] != 4) {
+      std::cerr << "Expect 4 at idx " << i << " but got " << host_array[i] << std::endl;
+      return -1;
+    }
+  }
+
+  std::cout << "Test Passed!!!" << std::endl;
+
+  return 0;
+}
--- a/tests/fig-3-11_using-events-and-depends-on.cpp
+++ b/tests/fig-3-11_using-events-and-depends-on.cpp
@@ -0,0 +1,69 @@
+// Figure 3-11. Using events and depends_on
+// from book - Data Parallel C++
+// https://link.springer.com/book/10.1007/978-1-4842-5574-2
+
+#include <sycl/sycl.hpp>
+
+constexpr int N = 4;
+int main() {
+  sycl::queue Q;
+  std::array<int, N> data1;
+  sycl::buffer B1{data1};
+  std::array<int, N> data2;
+  sycl::buffer B2{data2};
+
+  // Task A
+  auto eA = Q.submit([&](sycl::handler& h) {
+    sycl::accessor A1{B1, h};
+    sycl::accessor A2{B2, h};
+
+    h.parallel_for(N, [=](sycl::id<1> i) {
+      A1[i] = 233;
+      A2[i] = 666;
+    });
+  });
+  eA.wait();
+  // Task B
+  auto eB = Q.submit([&](sycl::handler& h) {
+    sycl::accessor A1{B1, h};
+    sycl::accessor A2{B2, h};
+
+    h.parallel_for(N, [=](sycl::id<1> i) {
+      A1[i] += i;      // 233 234 235 236
+      A2[i] += A1[i];  // 899 900 901 902
+    });
+  });
+  // Task C
+  auto eC = Q.submit([&](sycl::handler& h) {
+    sycl::accessor A2{B2, h};
+
+    h.depends_on(eB);
+    h.parallel_for(N, [=](sycl::id<1> i) {
+      A2[i] <<= 1;  // 1798 1800 1802 1804
+    });
+  });
+  // Task D
+  auto eD = Q.submit([&](sycl::handler& h) {
+    sycl::accessor A1{B1, h};
+    sycl::accessor A2{B2, h};
+
+    h.depends_on({eB, eC});
+    h.parallel_for(N, [=](sycl::id<1> i) {
+      A2[i] += A1[i] * i;  // 1798 2034 2272 2512
+    });
+  });
+
+  std::array<int, N> expected{1798, 2034, 2272, 2512};
+  sycl::host_accessor A2{B2};  // if use data2 directly, the data may have not been synced
+
+  for (int i = 0; i < expected.size(); i++) {
+    if (A2[i] != expected[i]) {
+      std::cerr << "Expect " << expected[i] << " at idx " << i << " but got " << A2[i] << std::endl;
+      return -1;
+    }
+  }
+
+  std::cout << "Test Passed!!!" << std::endl;
+
+  return 0;
+}
--- a/tests/fig-3-13_read-after-write.cpp
+++ b/tests/fig-3-13_read-after-write.cpp
@@ -0,0 +1,53 @@
+// Figure 3-13. Read-after-Write
+// from book - Data Parallel C++
+// https://link.springer.com/book/10.1007/978-1-4842-5574-2
+
+#include <sycl/sycl.hpp>
+
+constexpr int N = 42;
+int main() {
+  std::array<int, N> a, b, c;
+  for (int i = 0; i < N; i++) {
+    a[i] = 1;
+    b[i] = c[i] = 0;
+  }
+  sycl::queue Q;
+  // We will learn how to simplify this example later
+  sycl::buffer A{a};
+  sycl::buffer B{b};
+  sycl::buffer C{c};
+  Q.submit([&](sycl::handler& h) {
+    sycl::accessor accA(A, h, sycl::read_only);
+    sycl::accessor accB(B, h, sycl::write_only);
+    h.parallel_for(  // computeB
+        N, [=](sycl::id<1> i) { accB[i] = accA[i] + 1; });
+  });
+  int* datap = static_cast<int*>(sycl::malloc_shared(sizeof(int), Q));
+  Q.submit([&](sycl::handler& h) {
+    sycl::accessor accA(A, h, sycl::read_only);
+
+    h.parallel_for(  // readA
+        N, [=](sycl::id<1> i) {
+          // Useful only as an example
+          *datap = accA[i];
+        });
+  });
+  Q.submit([&](sycl::handler& h) {
+    // RAW of buffer B
+    sycl::accessor accB(B, h, sycl::read_only);
+    sycl::accessor accC(C, h, sycl::write_only);
+    h.parallel_for(  // computeC
+        N, [=](sycl::id<1> i) { accC[i] = accB[i] + 3; });
+  });
+  // read C on host
+  sycl::host_accessor host_accC(C, sycl::read_only);
+  for (int i = 0; i < N; i++) {
+    if (host_accC[i] != 5) {
+      std::cerr << "Expect 5 at idx " << i << " but got " << host_accC[i] << std::endl;
+      return -1;
+    }
+  }
+  std::cout << "readA: " << *datap << "\n";
+  std::cout << "Test Passed!!!" << std::endl;
+  return 0;
+}
--- a/tests/fig-3-15_write-after-read-and-write-after-write.cpp
+++ b/tests/fig-3-15_write-after-read-and-write-after-write.cpp
@@ -0,0 +1,50 @@
+// Figure 3-15. Write-after-Read and Write-after-Write
+// from book - Data Parallel C++
+// https://link.springer.com/book/10.1007/978-1-4842-5574-2
+
+#include <sycl/sycl.hpp>
+
+constexpr int N = 42;
+int main() {
+  std::array<int, N> a, b;
+  for (int i = 0; i < N; i++) {
+    a[i] = b[i] = 0;
+  }
+  sycl::queue Q;
+  sycl::buffer A{a};
+  sycl::buffer B{b};
+  Q.submit([&](sycl::handler& h) {
+    sycl::accessor accA(A, h, sycl::read_only);
+    sycl::accessor accB(B, h, sycl::write_only);
+    h.parallel_for(  // computeB
+        N, [=](sycl::id<1> i) { accB[i] = accA[i] + 1; });
+  });
+  Q.submit([&](sycl::handler& h) {
+    // WAR of buffer A
+    sycl::accessor accA(A, h, sycl::write_only);
+    h.parallel_for(  // rewriteA
+        N, [=](sycl::id<1> i) { accA[i] = 21; });
+  });
+  Q.submit([&](sycl::handler& h) {
+    // WAW of buffer B
+    sycl::accessor accB(B, h, sycl::write_only);
+    h.parallel_for(  // rewriteB
+        N, [=](sycl::id<1> i) { accB[i] = 30; });
+  });
+  sycl::host_accessor host_accA(A, sycl::read_only);
+  sycl::host_accessor host_accB(B, sycl::read_only);
+  for (int i = 0; i < N; i++) {
+    if (host_accA[i] != 21) {
+      std::cerr << "Expect host_accA[i] 21 at idx " << i << " but got " << host_accA[i]
+                << std::endl;
+      return -1;
+    }
+    if (host_accB[i] != 30) {
+      std::cerr << "Expect host_accB[i] 30 at idx " << i << " but got " << host_accB[i]
+                << std::endl;
+      return -1;
+    }
+  }
+  std::cout << "Test Passed!!!" << std::endl;
+  return 0;
+}
--- a/tests/fig-3-6_buffers-and-accessors.cpp
+++ b/tests/fig-3-6_buffers-and-accessors.cpp
@@ -0,0 +1,47 @@
+// Figure 3-6. Buffers and accessors
+// from book - Data Parallel C++
+// https://link.springer.com/book/10.1007/978-1-4842-5574-2
+
+#include <array>
+#include <sycl/sycl.hpp>
+
+constexpr int N = 42;
+
+int main() {
+  std::array<int, N> my_data{};  // filled with 0
+  {
+    sycl::queue q;
+    sycl::buffer my_buffer(my_data);
+
+    q.submit([&](sycl::handler& h) {
+      // create an accessor to update
+      // the buffer on the device
+      sycl::accessor my_accessor(my_buffer, h);
+
+      h.parallel_for(N, [=](sycl::id<1> i) { my_accessor[i]++; });
+    });
+
+    // create host accessor
+    sycl::host_accessor host_accessor(my_buffer);
+
+    std::cout << "host_accessor: ";
+    for (int i = 0; i < N; i++) {
+      // access myBuffer on host
+      std::cout << host_accessor[i] << " ";
+    }
+    std::cout << "\nmy_data_outsc: ";
+  }
+
+  // myData is updated when myBuffer is
+  // destroyed upon exiting scope
+  for (int i = 0; i < N; i++) {
+    std::cout << my_data[i] << " ";
+    if (my_data[i] != 1) {
+      std::cout << "Error at index " << i << ": expected " << 1 << ", got " << my_data[i]
+                << std::endl;
+      return 1;
+    }
+  }
+
+  std::cout << "\nTest Passed!!!" << std::endl;
+}