mirror of
https://github.com/fumiama/base16384-sycl.git
synced 2026-06-05 00:32:49 +08:00
feat: add remaining tests in Chapter 3
This commit is contained in:
3
.vscode/settings.json
vendored
3
.vscode/settings.json
vendored
@@ -82,7 +82,8 @@
|
||||
"terminal.integrated.profiles.windows": {
|
||||
"Command Prompt with oneAPI": {
|
||||
"path": "cmd.exe",
|
||||
"args": ["/k", "${config:oneapi.root.windows}/setvars.bat", "&&", "powershell"]
|
||||
// If setvars.bat warns "Visual Studio was not found in a standard install location", set the VS2022INSTALLDIR environment variable.
|
||||
"args": ["/k", "${config:oneapi.root.windows}/setvars.bat", "intel64", "vs2022", "&&", "powershell"]
|
||||
}
|
||||
},
|
||||
"terminal.integrated.profiles.linux": {
|
||||
|
||||
57
README.md
57
README.md
@@ -95,6 +95,63 @@ cmake --build .
|
||||
ctest
|
||||
```
|
||||
|
||||
### 4. Performance Analysis with Intel VTune
|
||||
|
||||
Intel VTune Profiler is a powerful performance analysis tool that can help you identify bottlenecks and optimize your application.
|
||||
|
||||
#### Prerequisites
|
||||
|
||||
- Intel VTune Profiler (included in Intel oneAPI Base Toolkit)
|
||||
- Compiled Base16384-SYCL application or tests with debug symbols (use `RelWithDebInfo` build type)
|
||||
|
||||
#### Running VTune Analysis
|
||||
|
||||
**1. Launch VTune GUI:**
|
||||
|
||||
```bash
|
||||
vtune-gui
|
||||
```
|
||||
|
||||
**2. Create a New Project:**
|
||||
|
||||
- Click "New Project" in the welcome screen
|
||||
- Set project name and location
|
||||
- Configure the target application path
|
||||
|
||||
**3. Configure Analysis Type:**
|
||||
|
||||
Choose an analysis type based on your profiling goals:
|
||||
|
||||
- **Hotspots Analysis**: Identify CPU-intensive functions
|
||||
- **GPU Offload Analysis**: Analyze GPU kernel performance and host-device data transfer
|
||||
- **Memory Consumption**: Track memory usage patterns
|
||||
- **Threading Analysis**: Detect threading issues and analyze parallelism
|
||||
|
||||
**4. Run the Analysis:**
|
||||
|
||||
- Click the "Start" button to begin profiling
|
||||
- VTune will execute your application and collect performance data
|
||||
|
||||
**5. Analyze Results:**
|
||||
|
||||

|
||||
|
||||
**Key metrics to examine:**
|
||||
|
||||
- **Kernel Execution Time**: Time spent in SYCL kernels
|
||||
- **Memory Transfer Overhead**: Host-to-device and device-to-host data transfer time
|
||||
- **CPU Utilization**: Host CPU usage during GPU operations
|
||||
- **GPU Utilization**: GPU compute unit occupancy
|
||||
|
||||
#### Optimization Tips
|
||||
|
||||
Based on VTune analysis, consider these optimization strategies:
|
||||
|
||||
1. **Reduce Host-Device Transfer**: Minimize data copying between CPU and GPU
|
||||
2. **Increase Kernel Occupancy**: Optimize work-group sizes and global range
|
||||
3. **Use Shared Memory**: Leverage work-group local memory (the SYCL counterpart of CUDA shared memory) for frequently accessed data
|
||||
4. **Batch Operations**: Process larger data chunks to amortize kernel launch overhead
|
||||
|
||||
## Build Configuration
|
||||
|
||||
The project supports multiple build configurations:
|
||||
|
||||
BIN
assets/vtune-b14-test-basic.png
Normal file
BIN
assets/vtune-b14-test-basic.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 157 KiB |
41
tests/fig-3-10_in-order-queue-usage.cpp
Normal file
41
tests/fig-3-10_in-order-queue-usage.cpp
Normal file
@@ -0,0 +1,41 @@
|
||||
// Figure 3-10. In-order queue usage
|
||||
// from book - Data Parallel C++
|
||||
// https://link.springer.com/book/10.1007/978-1-4842-5574-2
|
||||
|
||||
#include <sycl/sycl.hpp>
|
||||
|
||||
constexpr int N = 4;
|
||||
int main() {
|
||||
sycl::queue Q{sycl::property::queue::in_order()};
|
||||
int* device_array = sycl::malloc_device<int>(N, Q);
|
||||
|
||||
// Task A
|
||||
Q.submit(
|
||||
[&](sycl::handler& h) { h.parallel_for(N, [=](sycl::id<1> i) { device_array[i] = 0; }); });
|
||||
// Task B
|
||||
Q.submit([&](sycl::handler& h) { h.parallel_for(N, [=](sycl::id<1> i) { device_array[i]++; }); });
|
||||
// Task C
|
||||
Q.submit(
|
||||
[&](sycl::handler& h) { h.parallel_for(N, [=](sycl::id<1> i) { device_array[i] <<= 2; }); });
|
||||
|
||||
std::array<int, N> host_array;
|
||||
Q.submit([&](sycl::handler& h) {
|
||||
// copy deviceArray back to hostArray
|
||||
h.memcpy(&host_array[0], device_array, N * sizeof(int));
|
||||
});
|
||||
|
||||
Q.wait();
|
||||
|
||||
sycl::free(device_array, Q);
|
||||
|
||||
for (int i = 0; i < host_array.size(); i++) {
|
||||
if (host_array[i] != 4) {
|
||||
std::cerr << "Expect 4 at idx " << i << " but got " << host_array[i] << std::endl;
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << "Test Passed!!!" << std::endl;
|
||||
|
||||
return 0;
|
||||
}
|
||||
69
tests/fig-3-11_using-events-and-depends-on.cpp
Normal file
69
tests/fig-3-11_using-events-and-depends-on.cpp
Normal file
@@ -0,0 +1,69 @@
|
||||
// Figure 3-11. Using events and depends_on
|
||||
// from book - Data Parallel C++
|
||||
// https://link.springer.com/book/10.1007/978-1-4842-5574-2
|
||||
|
||||
#include <sycl/sycl.hpp>
|
||||
|
||||
constexpr int N = 4;
|
||||
int main() {
|
||||
sycl::queue Q;
|
||||
std::array<int, N> data1;
|
||||
sycl::buffer B1{data1};
|
||||
std::array<int, N> data2;
|
||||
sycl::buffer B2{data2};
|
||||
|
||||
// Task A
|
||||
auto eA = Q.submit([&](sycl::handler& h) {
|
||||
sycl::accessor A1{B1, h};
|
||||
sycl::accessor A2{B2, h};
|
||||
|
||||
h.parallel_for(N, [=](sycl::id<1> i) {
|
||||
A1[i] = 233;
|
||||
A2[i] = 666;
|
||||
});
|
||||
});
|
||||
eA.wait();
|
||||
// Task B
|
||||
auto eB = Q.submit([&](sycl::handler& h) {
|
||||
sycl::accessor A1{B1, h};
|
||||
sycl::accessor A2{B2, h};
|
||||
|
||||
h.parallel_for(N, [=](sycl::id<1> i) {
|
||||
A1[i] += i; // 233 234 235 236
|
||||
A2[i] += A1[i]; // 899 900 901 902
|
||||
});
|
||||
});
|
||||
// Task C
|
||||
auto eC = Q.submit([&](sycl::handler& h) {
|
||||
sycl::accessor A2{B2, h};
|
||||
|
||||
h.depends_on(eB);
|
||||
h.parallel_for(N, [=](sycl::id<1> i) {
|
||||
A2[i] <<= 1; // 1798 1800 1802 1804
|
||||
});
|
||||
});
|
||||
// Task D
|
||||
auto eD = Q.submit([&](sycl::handler& h) {
|
||||
sycl::accessor A1{B1, h};
|
||||
sycl::accessor A2{B2, h};
|
||||
|
||||
h.depends_on({eB, eC});
|
||||
h.parallel_for(N, [=](sycl::id<1> i) {
|
||||
A2[i] += A1[i] * i; // 1798 2034 2272 2512
|
||||
});
|
||||
});
|
||||
|
||||
std::array<int, N> expected{1798, 2034, 2272, 2512};
|
||||
sycl::host_accessor A2{B2}; // if use data2 directly, the data may have not been synced
|
||||
|
||||
for (int i = 0; i < expected.size(); i++) {
|
||||
if (A2[i] != expected[i]) {
|
||||
std::cerr << "Expect " << expected[i] << " at idx " << i << " but got " << A2[i] << std::endl;
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << "Test Passed!!!" << std::endl;
|
||||
|
||||
return 0;
|
||||
}
|
||||
53
tests/fig-3-13_read-after-write.cpp
Normal file
53
tests/fig-3-13_read-after-write.cpp
Normal file
@@ -0,0 +1,53 @@
|
||||
// Figure 3-13. Read-after-Write
|
||||
// from book - Data Parallel C++
|
||||
// https://link.springer.com/book/10.1007/978-1-4842-5574-2
|
||||
|
||||
#include <sycl/sycl.hpp>
|
||||
|
||||
constexpr int N = 42;
|
||||
int main() {
|
||||
std::array<int, N> a, b, c;
|
||||
for (int i = 0; i < N; i++) {
|
||||
a[i] = 1;
|
||||
b[i] = c[i] = 0;
|
||||
}
|
||||
sycl::queue Q;
|
||||
// We will learn how to simplify this example later
|
||||
sycl::buffer A{a};
|
||||
sycl::buffer B{b};
|
||||
sycl::buffer C{c};
|
||||
Q.submit([&](sycl::handler& h) {
|
||||
sycl::accessor accA(A, h, sycl::read_only);
|
||||
sycl::accessor accB(B, h, sycl::write_only);
|
||||
h.parallel_for( // computeB
|
||||
N, [=](sycl::id<1> i) { accB[i] = accA[i] + 1; });
|
||||
});
|
||||
int* datap = static_cast<int*>(sycl::malloc_shared(sizeof(int), Q));
|
||||
Q.submit([&](sycl::handler& h) {
|
||||
sycl::accessor accA(A, h, sycl::read_only);
|
||||
|
||||
h.parallel_for( // readA
|
||||
N, [=](sycl::id<1> i) {
|
||||
// Useful only as an example
|
||||
*datap = accA[i];
|
||||
});
|
||||
});
|
||||
Q.submit([&](sycl::handler& h) {
|
||||
// RAW of buffer B
|
||||
sycl::accessor accB(B, h, sycl::read_only);
|
||||
sycl::accessor accC(C, h, sycl::write_only);
|
||||
h.parallel_for( // computeC
|
||||
N, [=](sycl::id<1> i) { accC[i] = accB[i] + 3; });
|
||||
});
|
||||
// read C on host
|
||||
sycl::host_accessor host_accC(C, sycl::read_only);
|
||||
for (int i = 0; i < N; i++) {
|
||||
if (host_accC[i] != 5) {
|
||||
std::cerr << "Expect 5 at idx " << i << " but got " << host_accC[i] << std::endl;
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
std::cout << "readA: " << *datap << "\n";
|
||||
std::cout << "Test Passed!!!" << std::endl;
|
||||
return 0;
|
||||
}
|
||||
50
tests/fig-3-15_write-after-read-and-write-after-write.cpp
Normal file
50
tests/fig-3-15_write-after-read-and-write-after-write.cpp
Normal file
@@ -0,0 +1,50 @@
|
||||
// Figure 3-15. Write-after-Read and Write-after-Write
|
||||
// from book - Data Parallel C++
|
||||
// https://link.springer.com/book/10.1007/978-1-4842-5574-2
|
||||
|
||||
#include <sycl/sycl.hpp>
|
||||
|
||||
constexpr int N = 42;
|
||||
int main() {
|
||||
std::array<int, N> a, b;
|
||||
for (int i = 0; i < N; i++) {
|
||||
a[i] = b[i] = 0;
|
||||
}
|
||||
sycl::queue Q;
|
||||
sycl::buffer A{a};
|
||||
sycl::buffer B{b};
|
||||
Q.submit([&](sycl::handler& h) {
|
||||
sycl::accessor accA(A, h, sycl::read_only);
|
||||
sycl::accessor accB(B, h, sycl::write_only);
|
||||
h.parallel_for( // computeB
|
||||
N, [=](sycl::id<1> i) { accB[i] = accA[i] + 1; });
|
||||
});
|
||||
Q.submit([&](sycl::handler& h) {
|
||||
// WAR of buffer A
|
||||
sycl::accessor accA(A, h, sycl::write_only);
|
||||
h.parallel_for( // rewriteA
|
||||
N, [=](sycl::id<1> i) { accA[i] = 21; });
|
||||
});
|
||||
Q.submit([&](sycl::handler& h) {
|
||||
// WAW of buffer B
|
||||
sycl::accessor accB(B, h, sycl::write_only);
|
||||
h.parallel_for( // rewriteB
|
||||
N, [=](sycl::id<1> i) { accB[i] = 30; });
|
||||
});
|
||||
sycl::host_accessor host_accA(A, sycl::read_only);
|
||||
sycl::host_accessor host_accB(B, sycl::read_only);
|
||||
for (int i = 0; i < N; i++) {
|
||||
if (host_accA[i] != 21) {
|
||||
std::cerr << "Expect host_accA[i] 21 at idx " << i << " but got " << host_accA[i]
|
||||
<< std::endl;
|
||||
return -1;
|
||||
}
|
||||
if (host_accB[i] != 30) {
|
||||
std::cerr << "Expect host_accB[i] 30 at idx " << i << " but got " << host_accB[i]
|
||||
<< std::endl;
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
std::cout << "Test Passed!!!" << std::endl;
|
||||
return 0;
|
||||
}
|
||||
47
tests/fig-3-6_buffers-and-accessors.cpp
Normal file
47
tests/fig-3-6_buffers-and-accessors.cpp
Normal file
@@ -0,0 +1,47 @@
|
||||
// Figure 3-6. Buffers and accessors
|
||||
// from book - Data Parallel C++
|
||||
// https://link.springer.com/book/10.1007/978-1-4842-5574-2
|
||||
|
||||
#include <array>
|
||||
#include <sycl/sycl.hpp>
|
||||
|
||||
constexpr int N = 42;
|
||||
|
||||
int main() {
|
||||
std::array<int, N> my_data{}; // filled with 0
|
||||
{
|
||||
sycl::queue q;
|
||||
sycl::buffer my_buffer(my_data);
|
||||
|
||||
q.submit([&](sycl::handler& h) {
|
||||
// create an accessor to update
|
||||
// the buffer on the device
|
||||
sycl::accessor my_accessor(my_buffer, h);
|
||||
|
||||
h.parallel_for(N, [=](sycl::id<1> i) { my_accessor[i]++; });
|
||||
});
|
||||
|
||||
// create host accessor
|
||||
sycl::host_accessor host_accessor(my_buffer);
|
||||
|
||||
std::cout << "host_accessor: ";
|
||||
for (int i = 0; i < N; i++) {
|
||||
// access myBuffer on host
|
||||
std::cout << host_accessor[i] << " ";
|
||||
}
|
||||
std::cout << "\nmy_data_outsc: ";
|
||||
}
|
||||
|
||||
// myData is updated when myBuffer is
|
||||
// destroyed upon exiting scope
|
||||
for (int i = 0; i < N; i++) {
|
||||
std::cout << my_data[i] << " ";
|
||||
if (my_data[i] != 1) {
|
||||
std::cout << "Error at index " << i << ": expected " << 1 << ", got " << my_data[i]
|
||||
<< std::endl;
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << "\nTest Passed!!!" << std::endl;
|
||||
}
|
||||
Reference in New Issue
Block a user