diff --git a/.vscode/settings.json b/.vscode/settings.json index 80e4973..d3b1153 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -82,7 +82,8 @@ "terminal.integrated.profiles.windows": { "Command Prompt with oneAPI": { "path": "cmd.exe", - "args": ["/k", "${config:oneapi.root.windows}/setvars.bat", "&&", "powershell"] + // need to set VS2022INSTALLDIR envvar when WARNING: Visual Studio was not found in a standard install location + "args": ["/k", "${config:oneapi.root.windows}/setvars.bat", "intel64", "vs2022", "&&", "powershell"] } }, "terminal.integrated.profiles.linux": { diff --git a/README.md b/README.md index 6f8e9d4..90b7375 100644 --- a/README.md +++ b/README.md @@ -95,6 +95,63 @@ cmake --build . ctest ``` +### 4. Performance Analysis with Intel VTune + +Intel VTune Profiler is a powerful performance analysis tool that can help you identify bottlenecks and optimize the applications. + +#### Prerequisites + +- Intel VTune Profiler (included in Intel oneAPI Base Toolkit) +- Compiled Base16384-SYCL application or tests with debug symbols (use `RelWithDebInfo` build type) + +#### Running VTune Analysis + +**1. Launch VTune GUI:** + +```bash +vtune-gui +``` + +**2. Create a New Project:** + +- Click "New Project" in the welcome screen +- Set project name and location +- Configure the target application path + +**3. Configure Analysis Type:** + +Choose an analysis type based on your profiling goals: + +- **Hotspots Analysis**: Identify CPU-intensive functions +- **GPU Offload Analysis**: Analyze GPU kernel performance and host-device data transfer +- **Memory Consumption**: Track memory usage patterns +- **Threading Analysis**: Detect threading issues and analyze parallelism + +**4. Run the Analysis:** + +- Click the "Start" button to begin profiling +- VTune will execute your application and collect performance data + +**5. Analyze Results:** + +![VTune Analysis Results of basic test](./assets/vtune-b14-test-basic.png) + +**Key metrics to examine:** + +- **Kernel Execution Time**: Time spent in SYCL kernels +- **Memory Transfer Overhead**: Host-to-device and device-to-host data transfer time +- **CPU Utilization**: Host CPU usage during GPU operations +- **GPU Utilization**: GPU compute unit occupancy + +#### Optimization Tips + +Based on VTune analysis, consider these optimization strategies: + +1. **Reduce Host-Device Transfer**: Minimize data copying between CPU and GPU +2. **Increase Kernel Occupancy**: Optimize work-group sizes and global range +3. **Use Shared Memory**: Leverage local memory for frequently accessed data +4. **Batch Operations**: Process larger data chunks to amortize kernel launch overhead + ## Build Configuration The project supports multiple build configurations: diff --git a/assets/vtune-b14-test-basic.png b/assets/vtune-b14-test-basic.png new file mode 100644 index 0000000..17a051d Binary files /dev/null and b/assets/vtune-b14-test-basic.png differ diff --git a/tests/fig-3-10_in-order-queue-usage.cpp b/tests/fig-3-10_in-order-queue-usage.cpp new file mode 100644 index 0000000..96c20a0 --- /dev/null +++ b/tests/fig-3-10_in-order-queue-usage.cpp @@ -0,0 +1,41 @@ +// Figure 3-10. In-order queue usage +// from book - Data Parallel C++ +// https://link.springer.com/book/10.1007/978-1-4842-5574-2 + +#include + +constexpr int N = 4; +int main() { + sycl::queue Q{sycl::property::queue::in_order()}; + int* device_array = sycl::malloc_device(N, Q); + + // Task A + Q.submit( + [&](sycl::handler& h) { h.parallel_for(N, [=](sycl::id<1> i) { device_array[i] = 0; }); }); + // Task B + Q.submit([&](sycl::handler& h) { h.parallel_for(N, [=](sycl::id<1> i) { device_array[i]++; }); }); + // Task C + Q.submit( + [&](sycl::handler& h) { h.parallel_for(N, [=](sycl::id<1> i) { device_array[i] <<= 2; }); }); + + std::array host_array; + Q.submit([&](sycl::handler& h) { + // copy deviceArray back to hostArray + h.memcpy(&host_array[0], device_array, N * sizeof(int)); + }); + + Q.wait(); + + sycl::free(device_array, Q); + + for (int i = 0; i < host_array.size(); i++) { + if (host_array[i] != 4) { + std::cerr << "Expect 4 at idx " << i << " but got " << host_array[i] << std::endl; + return -1; + } + } + + std::cout << "Test Passed!!!" << std::endl; + + return 0; +} \ No newline at end of file diff --git a/tests/fig-3-11_using-events-and-depends-on.cpp b/tests/fig-3-11_using-events-and-depends-on.cpp new file mode 100644 index 0000000..71a15d7 --- /dev/null +++ b/tests/fig-3-11_using-events-and-depends-on.cpp @@ -0,0 +1,69 @@ +// Figure 3-11. Using events and depends_on +// from book - Data Parallel C++ +// https://link.springer.com/book/10.1007/978-1-4842-5574-2 + +#include + +constexpr int N = 4; +int main() { + sycl::queue Q; + std::array data1; + sycl::buffer B1{data1}; + std::array data2; + sycl::buffer B2{data2}; + + // Task A + auto eA = Q.submit([&](sycl::handler& h) { + sycl::accessor A1{B1, h}; + sycl::accessor A2{B2, h}; + + h.parallel_for(N, [=](sycl::id<1> i) { + A1[i] = 233; + A2[i] = 666; + }); + }); + eA.wait(); + // Task B + auto eB = Q.submit([&](sycl::handler& h) { + sycl::accessor A1{B1, h}; + sycl::accessor A2{B2, h}; + + h.parallel_for(N, [=](sycl::id<1> i) { + A1[i] += i; // 233 234 235 236 + A2[i] += A1[i]; // 899 900 901 902 + }); + }); + // Task C + auto eC = Q.submit([&](sycl::handler& h) { + sycl::accessor A2{B2, h}; + + h.depends_on(eB); + h.parallel_for(N, [=](sycl::id<1> i) { + A2[i] <<= 1; // 1798 1800 1802 1804 + }); + }); + // Task D + auto eD = Q.submit([&](sycl::handler& h) { + sycl::accessor A1{B1, h}; + sycl::accessor A2{B2, h}; + + h.depends_on({eB, eC}); + h.parallel_for(N, [=](sycl::id<1> i) { + A2[i] += A1[i] * i; // 1798 2034 2272 2512 + }); + }); + + std::array expected{1798, 2034, 2272, 2512}; + sycl::host_accessor A2{B2}; // if use data2 directly, the data may have not been synced + + for (int i = 0; i < expected.size(); i++) { + if (A2[i] != expected[i]) { + std::cerr << "Expect " << expected[i] << " at idx " << i << " but got " << A2[i] << std::endl; + return -1; + } + } + + std::cout << "Test Passed!!!" << std::endl; + + return 0; +} \ No newline at end of file diff --git a/tests/fig-3-13_read-after-write.cpp b/tests/fig-3-13_read-after-write.cpp new file mode 100644 index 0000000..90473dd --- /dev/null +++ b/tests/fig-3-13_read-after-write.cpp @@ -0,0 +1,53 @@ +// Figure 3-13. Read-after-Write +// from book - Data Parallel C++ +// https://link.springer.com/book/10.1007/978-1-4842-5574-2 + +#include + +constexpr int N = 42; +int main() { + std::array a, b, c; + for (int i = 0; i < N; i++) { + a[i] = 1; + b[i] = c[i] = 0; + } + sycl::queue Q; + // We will learn how to simplify this example later + sycl::buffer A{a}; + sycl::buffer B{b}; + sycl::buffer C{c}; + Q.submit([&](sycl::handler& h) { + sycl::accessor accA(A, h, sycl::read_only); + sycl::accessor accB(B, h, sycl::write_only); + h.parallel_for( // computeB + N, [=](sycl::id<1> i) { accB[i] = accA[i] + 1; }); + }); + int* datap = static_cast(sycl::malloc_shared(sizeof(int), Q)); + Q.submit([&](sycl::handler& h) { + sycl::accessor accA(A, h, sycl::read_only); + + h.parallel_for( // readA + N, [=](sycl::id<1> i) { + // Useful only as an example + *datap = accA[i]; + }); + }); + Q.submit([&](sycl::handler& h) { + // RAW of buffer B + sycl::accessor accB(B, h, sycl::read_only); + sycl::accessor accC(C, h, sycl::write_only); + h.parallel_for( // computeC + N, [=](sycl::id<1> i) { accC[i] = accB[i] + 3; }); + }); + // read C on host + sycl::host_accessor host_accC(C, sycl::read_only); + for (int i = 0; i < N; i++) { + if (host_accC[i] != 5) { + std::cerr << "Expect 5 at idx " << i << " but got " << host_accC[i] << std::endl; + return -1; + } + } + std::cout << "readA: " << *datap << "\n"; + std::cout << "Test Passed!!!" << std::endl; + return 0; +} \ No newline at end of file diff --git a/tests/fig-3-15_write-after-read-and-write-after-write.cpp b/tests/fig-3-15_write-after-read-and-write-after-write.cpp new file mode 100644 index 0000000..df60c0e --- /dev/null +++ b/tests/fig-3-15_write-after-read-and-write-after-write.cpp @@ -0,0 +1,50 @@ +// Figure 3-15. Write-after-Read and Write-after-Write +// from book - Data Parallel C++ +// https://link.springer.com/book/10.1007/978-1-4842-5574-2 + +#include + +constexpr int N = 42; +int main() { + std::array a, b; + for (int i = 0; i < N; i++) { + a[i] = b[i] = 0; + } + sycl::queue Q; + sycl::buffer A{a}; + sycl::buffer B{b}; + Q.submit([&](sycl::handler& h) { + sycl::accessor accA(A, h, sycl::read_only); + sycl::accessor accB(B, h, sycl::write_only); + h.parallel_for( // computeB + N, [=](sycl::id<1> i) { accB[i] = accA[i] + 1; }); + }); + Q.submit([&](sycl::handler& h) { + // WAR of buffer A + sycl::accessor accA(A, h, sycl::write_only); + h.parallel_for( // rewriteA + N, [=](sycl::id<1> i) { accA[i] = 21; }); + }); + Q.submit([&](sycl::handler& h) { + // WAW of buffer B + sycl::accessor accB(B, h, sycl::write_only); + h.parallel_for( // rewriteB + N, [=](sycl::id<1> i) { accB[i] = 30; }); + }); + sycl::host_accessor host_accA(A, sycl::read_only); + sycl::host_accessor host_accB(B, sycl::read_only); + for (int i = 0; i < N; i++) { + if (host_accA[i] != 21) { + std::cerr << "Expect host_accA[i] 21 at idx " << i << " but got " << host_accA[i] + << std::endl; + return -1; + } + if (host_accB[i] != 30) { + std::cerr << "Expect host_accB[i] 30 at idx " << i << " but got " << host_accB[i] + << std::endl; + return -1; + } + } + std::cout << "Test Passed!!!" << std::endl; + return 0; +} \ No newline at end of file diff --git a/tests/fig-3-6_buffers-and-accessors.cpp b/tests/fig-3-6_buffers-and-accessors.cpp new file mode 100644 index 0000000..7c2feb0 --- /dev/null +++ b/tests/fig-3-6_buffers-and-accessors.cpp @@ -0,0 +1,47 @@ +// Figure 3-6. Buffers and accessors +// from book - Data Parallel C++ +// https://link.springer.com/book/10.1007/978-1-4842-5574-2 + +#include +#include + +constexpr int N = 42; + +int main() { + std::array my_data{}; // filled with 0 + { + sycl::queue q; + sycl::buffer my_buffer(my_data); + + q.submit([&](sycl::handler& h) { + // create an accessor to update + // the buffer on the device + sycl::accessor my_accessor(my_buffer, h); + + h.parallel_for(N, [=](sycl::id<1> i) { my_accessor[i]++; }); + }); + + // create host accessor + sycl::host_accessor host_accessor(my_buffer); + + std::cout << "host_accessor: "; + for (int i = 0; i < N; i++) { + // access myBuffer on host + std::cout << host_accessor[i] << " "; + } + std::cout << "\nmy_data_outsc: "; + } + + // myData is updated when myBuffer is + // destroyed upon exiting scope + for (int i = 0; i < N; i++) { + std::cout << my_data[i] << " "; + if (my_data[i] != 1) { + std::cout << "Error at index " << i << ": expected " << 1 << ", got " << my_data[i] + << std::endl; + return 1; + } + } + + std::cout << "\nTest Passed!!!" << std::endl; +} \ No newline at end of file