mirror of
https://github.com/fumiama/base16384-sycl.git
synced 2026-06-11 05:30:27 +08:00
feat: add remaining tests in Chapter 3
This commit is contained in:
3
.vscode/settings.json
vendored
3
.vscode/settings.json
vendored
@@ -82,7 +82,8 @@
|
|||||||
"terminal.integrated.profiles.windows": {
|
"terminal.integrated.profiles.windows": {
|
||||||
"Command Prompt with oneAPI": {
|
"Command Prompt with oneAPI": {
|
||||||
"path": "cmd.exe",
|
"path": "cmd.exe",
|
||||||
"args": ["/k", "${config:oneapi.root.windows}/setvars.bat", "&&", "powershell"]
|
// need to set VS2022INSTALLDIR envvar when WARNING: Visual Studio was not found in a standard install location
|
||||||
|
"args": ["/k", "${config:oneapi.root.windows}/setvars.bat", "intel64", "vs2022", "&&", "powershell"]
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"terminal.integrated.profiles.linux": {
|
"terminal.integrated.profiles.linux": {
|
||||||
|
|||||||
57
README.md
57
README.md
@@ -95,6 +95,63 @@ cmake --build .
|
|||||||
ctest
|
ctest
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### 4. Performance Analysis with Intel VTune
|
||||||
|
|
||||||
|
Intel VTune Profiler is a powerful performance analysis tool that can help you identify bottlenecks and optimize the applications.
|
||||||
|
|
||||||
|
#### Prerequisites
|
||||||
|
|
||||||
|
- Intel VTune Profiler (included in Intel oneAPI Base Toolkit)
|
||||||
|
- Compiled Base16384-SYCL application or tests with debug symbols (use `RelWithDebInfo` build type)
|
||||||
|
|
||||||
|
#### Running VTune Analysis
|
||||||
|
|
||||||
|
**1. Launch VTune GUI:**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
vtune-gui
|
||||||
|
```
|
||||||
|
|
||||||
|
**2. Create a New Project:**
|
||||||
|
|
||||||
|
- Click "New Project" in the welcome screen
|
||||||
|
- Set project name and location
|
||||||
|
- Configure the target application path
|
||||||
|
|
||||||
|
**3. Configure Analysis Type:**
|
||||||
|
|
||||||
|
Choose an analysis type based on your profiling goals:
|
||||||
|
|
||||||
|
- **Hotspots Analysis**: Identify CPU-intensive functions
|
||||||
|
- **GPU Offload Analysis**: Analyze GPU kernel performance and host-device data transfer
|
||||||
|
- **Memory Consumption**: Track memory usage patterns
|
||||||
|
- **Threading Analysis**: Detect threading issues and analyze parallelism
|
||||||
|
|
||||||
|
**4. Run the Analysis:**
|
||||||
|
|
||||||
|
- Click the "Start" button to begin profiling
|
||||||
|
- VTune will execute your application and collect performance data
|
||||||
|
|
||||||
|
**5. Analyze Results:**
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
**Key metrics to examine:**
|
||||||
|
|
||||||
|
- **Kernel Execution Time**: Time spent in SYCL kernels
|
||||||
|
- **Memory Transfer Overhead**: Host-to-device and device-to-host data transfer time
|
||||||
|
- **CPU Utilization**: Host CPU usage during GPU operations
|
||||||
|
- **GPU Utilization**: GPU compute unit occupancy
|
||||||
|
|
||||||
|
#### Optimization Tips
|
||||||
|
|
||||||
|
Based on VTune analysis, consider these optimization strategies:
|
||||||
|
|
||||||
|
1. **Reduce Host-Device Transfer**: Minimize data copying between CPU and GPU
|
||||||
|
2. **Increase Kernel Occupancy**: Optimize work-group sizes and global range
|
||||||
|
3. **Use Shared Memory**: Leverage local memory for frequently accessed data
|
||||||
|
4. **Batch Operations**: Process larger data chunks to amortize kernel launch overhead
|
||||||
|
|
||||||
## Build Configuration
|
## Build Configuration
|
||||||
|
|
||||||
The project supports multiple build configurations:
|
The project supports multiple build configurations:
|
||||||
|
|||||||
BIN
assets/vtune-b14-test-basic.png
Normal file
BIN
assets/vtune-b14-test-basic.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 157 KiB |
41
tests/fig-3-10_in-order-queue-usage.cpp
Normal file
41
tests/fig-3-10_in-order-queue-usage.cpp
Normal file
@@ -0,0 +1,41 @@
|
|||||||
|
// Figure 3-10. In-order queue usage
|
||||||
|
// from book - Data Parallel C++
|
||||||
|
// https://link.springer.com/book/10.1007/978-1-4842-5574-2
|
||||||
|
|
||||||
|
#include <sycl/sycl.hpp>
|
||||||
|
|
||||||
|
constexpr int N = 4;
|
||||||
|
int main() {
|
||||||
|
sycl::queue Q{sycl::property::queue::in_order()};
|
||||||
|
int* device_array = sycl::malloc_device<int>(N, Q);
|
||||||
|
|
||||||
|
// Task A
|
||||||
|
Q.submit(
|
||||||
|
[&](sycl::handler& h) { h.parallel_for(N, [=](sycl::id<1> i) { device_array[i] = 0; }); });
|
||||||
|
// Task B
|
||||||
|
Q.submit([&](sycl::handler& h) { h.parallel_for(N, [=](sycl::id<1> i) { device_array[i]++; }); });
|
||||||
|
// Task C
|
||||||
|
Q.submit(
|
||||||
|
[&](sycl::handler& h) { h.parallel_for(N, [=](sycl::id<1> i) { device_array[i] <<= 2; }); });
|
||||||
|
|
||||||
|
std::array<int, N> host_array;
|
||||||
|
Q.submit([&](sycl::handler& h) {
|
||||||
|
// copy deviceArray back to hostArray
|
||||||
|
h.memcpy(&host_array[0], device_array, N * sizeof(int));
|
||||||
|
});
|
||||||
|
|
||||||
|
Q.wait();
|
||||||
|
|
||||||
|
sycl::free(device_array, Q);
|
||||||
|
|
||||||
|
for (int i = 0; i < host_array.size(); i++) {
|
||||||
|
if (host_array[i] != 4) {
|
||||||
|
std::cerr << "Expect 4 at idx " << i << " but got " << host_array[i] << std::endl;
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
std::cout << "Test Passed!!!" << std::endl;
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
69
tests/fig-3-11_using-events-and-depends-on.cpp
Normal file
69
tests/fig-3-11_using-events-and-depends-on.cpp
Normal file
@@ -0,0 +1,69 @@
|
|||||||
|
// Figure 3-11. Using events and depends_on
|
||||||
|
// from book - Data Parallel C++
|
||||||
|
// https://link.springer.com/book/10.1007/978-1-4842-5574-2
|
||||||
|
|
||||||
|
#include <sycl/sycl.hpp>
|
||||||
|
|
||||||
|
constexpr int N = 4;
|
||||||
|
int main() {
|
||||||
|
sycl::queue Q;
|
||||||
|
std::array<int, N> data1;
|
||||||
|
sycl::buffer B1{data1};
|
||||||
|
std::array<int, N> data2;
|
||||||
|
sycl::buffer B2{data2};
|
||||||
|
|
||||||
|
// Task A
|
||||||
|
auto eA = Q.submit([&](sycl::handler& h) {
|
||||||
|
sycl::accessor A1{B1, h};
|
||||||
|
sycl::accessor A2{B2, h};
|
||||||
|
|
||||||
|
h.parallel_for(N, [=](sycl::id<1> i) {
|
||||||
|
A1[i] = 233;
|
||||||
|
A2[i] = 666;
|
||||||
|
});
|
||||||
|
});
|
||||||
|
eA.wait();
|
||||||
|
// Task B
|
||||||
|
auto eB = Q.submit([&](sycl::handler& h) {
|
||||||
|
sycl::accessor A1{B1, h};
|
||||||
|
sycl::accessor A2{B2, h};
|
||||||
|
|
||||||
|
h.parallel_for(N, [=](sycl::id<1> i) {
|
||||||
|
A1[i] += i; // 233 234 235 236
|
||||||
|
A2[i] += A1[i]; // 899 900 901 902
|
||||||
|
});
|
||||||
|
});
|
||||||
|
// Task C
|
||||||
|
auto eC = Q.submit([&](sycl::handler& h) {
|
||||||
|
sycl::accessor A2{B2, h};
|
||||||
|
|
||||||
|
h.depends_on(eB);
|
||||||
|
h.parallel_for(N, [=](sycl::id<1> i) {
|
||||||
|
A2[i] <<= 1; // 1798 1800 1802 1804
|
||||||
|
});
|
||||||
|
});
|
||||||
|
// Task D
|
||||||
|
auto eD = Q.submit([&](sycl::handler& h) {
|
||||||
|
sycl::accessor A1{B1, h};
|
||||||
|
sycl::accessor A2{B2, h};
|
||||||
|
|
||||||
|
h.depends_on({eB, eC});
|
||||||
|
h.parallel_for(N, [=](sycl::id<1> i) {
|
||||||
|
A2[i] += A1[i] * i; // 1798 2034 2272 2512
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
std::array<int, N> expected{1798, 2034, 2272, 2512};
|
||||||
|
sycl::host_accessor A2{B2}; // if use data2 directly, the data may have not been synced
|
||||||
|
|
||||||
|
for (int i = 0; i < expected.size(); i++) {
|
||||||
|
if (A2[i] != expected[i]) {
|
||||||
|
std::cerr << "Expect " << expected[i] << " at idx " << i << " but got " << A2[i] << std::endl;
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
std::cout << "Test Passed!!!" << std::endl;
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
53
tests/fig-3-13_read-after-write.cpp
Normal file
53
tests/fig-3-13_read-after-write.cpp
Normal file
@@ -0,0 +1,53 @@
|
|||||||
|
// Figure 3-13. Read-after-Write
|
||||||
|
// from book - Data Parallel C++
|
||||||
|
// https://link.springer.com/book/10.1007/978-1-4842-5574-2
|
||||||
|
|
||||||
|
#include <sycl/sycl.hpp>
|
||||||
|
|
||||||
|
constexpr int N = 42;
|
||||||
|
int main() {
|
||||||
|
std::array<int, N> a, b, c;
|
||||||
|
for (int i = 0; i < N; i++) {
|
||||||
|
a[i] = 1;
|
||||||
|
b[i] = c[i] = 0;
|
||||||
|
}
|
||||||
|
sycl::queue Q;
|
||||||
|
// We will learn how to simplify this example later
|
||||||
|
sycl::buffer A{a};
|
||||||
|
sycl::buffer B{b};
|
||||||
|
sycl::buffer C{c};
|
||||||
|
Q.submit([&](sycl::handler& h) {
|
||||||
|
sycl::accessor accA(A, h, sycl::read_only);
|
||||||
|
sycl::accessor accB(B, h, sycl::write_only);
|
||||||
|
h.parallel_for( // computeB
|
||||||
|
N, [=](sycl::id<1> i) { accB[i] = accA[i] + 1; });
|
||||||
|
});
|
||||||
|
int* datap = static_cast<int*>(sycl::malloc_shared(sizeof(int), Q));
|
||||||
|
Q.submit([&](sycl::handler& h) {
|
||||||
|
sycl::accessor accA(A, h, sycl::read_only);
|
||||||
|
|
||||||
|
h.parallel_for( // readA
|
||||||
|
N, [=](sycl::id<1> i) {
|
||||||
|
// Useful only as an example
|
||||||
|
*datap = accA[i];
|
||||||
|
});
|
||||||
|
});
|
||||||
|
Q.submit([&](sycl::handler& h) {
|
||||||
|
// RAW of buffer B
|
||||||
|
sycl::accessor accB(B, h, sycl::read_only);
|
||||||
|
sycl::accessor accC(C, h, sycl::write_only);
|
||||||
|
h.parallel_for( // computeC
|
||||||
|
N, [=](sycl::id<1> i) { accC[i] = accB[i] + 3; });
|
||||||
|
});
|
||||||
|
// read C on host
|
||||||
|
sycl::host_accessor host_accC(C, sycl::read_only);
|
||||||
|
for (int i = 0; i < N; i++) {
|
||||||
|
if (host_accC[i] != 5) {
|
||||||
|
std::cerr << "Expect 5 at idx " << i << " but got " << host_accC[i] << std::endl;
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
std::cout << "readA: " << *datap << "\n";
|
||||||
|
std::cout << "Test Passed!!!" << std::endl;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
50
tests/fig-3-15_write-after-read-and-write-after-write.cpp
Normal file
50
tests/fig-3-15_write-after-read-and-write-after-write.cpp
Normal file
@@ -0,0 +1,50 @@
|
|||||||
|
// Figure 3-15. Write-after-Read and Write-after-Write
|
||||||
|
// from book - Data Parallel C++
|
||||||
|
// https://link.springer.com/book/10.1007/978-1-4842-5574-2
|
||||||
|
|
||||||
|
#include <sycl/sycl.hpp>
|
||||||
|
|
||||||
|
constexpr int N = 42;
|
||||||
|
int main() {
|
||||||
|
std::array<int, N> a, b;
|
||||||
|
for (int i = 0; i < N; i++) {
|
||||||
|
a[i] = b[i] = 0;
|
||||||
|
}
|
||||||
|
sycl::queue Q;
|
||||||
|
sycl::buffer A{a};
|
||||||
|
sycl::buffer B{b};
|
||||||
|
Q.submit([&](sycl::handler& h) {
|
||||||
|
sycl::accessor accA(A, h, sycl::read_only);
|
||||||
|
sycl::accessor accB(B, h, sycl::write_only);
|
||||||
|
h.parallel_for( // computeB
|
||||||
|
N, [=](sycl::id<1> i) { accB[i] = accA[i] + 1; });
|
||||||
|
});
|
||||||
|
Q.submit([&](sycl::handler& h) {
|
||||||
|
// WAR of buffer A
|
||||||
|
sycl::accessor accA(A, h, sycl::write_only);
|
||||||
|
h.parallel_for( // rewriteA
|
||||||
|
N, [=](sycl::id<1> i) { accA[i] = 21; });
|
||||||
|
});
|
||||||
|
Q.submit([&](sycl::handler& h) {
|
||||||
|
// WAW of buffer B
|
||||||
|
sycl::accessor accB(B, h, sycl::write_only);
|
||||||
|
h.parallel_for( // rewriteB
|
||||||
|
N, [=](sycl::id<1> i) { accB[i] = 30; });
|
||||||
|
});
|
||||||
|
sycl::host_accessor host_accA(A, sycl::read_only);
|
||||||
|
sycl::host_accessor host_accB(B, sycl::read_only);
|
||||||
|
for (int i = 0; i < N; i++) {
|
||||||
|
if (host_accA[i] != 21) {
|
||||||
|
std::cerr << "Expect host_accA[i] 21 at idx " << i << " but got " << host_accA[i]
|
||||||
|
<< std::endl;
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
if (host_accB[i] != 30) {
|
||||||
|
std::cerr << "Expect host_accB[i] 30 at idx " << i << " but got " << host_accB[i]
|
||||||
|
<< std::endl;
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
std::cout << "Test Passed!!!" << std::endl;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
47
tests/fig-3-6_buffers-and-accessors.cpp
Normal file
47
tests/fig-3-6_buffers-and-accessors.cpp
Normal file
@@ -0,0 +1,47 @@
|
|||||||
|
// Figure 3-6. Buffers and accessors
|
||||||
|
// from book - Data Parallel C++
|
||||||
|
// https://link.springer.com/book/10.1007/978-1-4842-5574-2
|
||||||
|
|
||||||
|
#include <array>
|
||||||
|
#include <sycl/sycl.hpp>
|
||||||
|
|
||||||
|
constexpr int N = 42;
|
||||||
|
|
||||||
|
int main() {
|
||||||
|
std::array<int, N> my_data{}; // filled with 0
|
||||||
|
{
|
||||||
|
sycl::queue q;
|
||||||
|
sycl::buffer my_buffer(my_data);
|
||||||
|
|
||||||
|
q.submit([&](sycl::handler& h) {
|
||||||
|
// create an accessor to update
|
||||||
|
// the buffer on the device
|
||||||
|
sycl::accessor my_accessor(my_buffer, h);
|
||||||
|
|
||||||
|
h.parallel_for(N, [=](sycl::id<1> i) { my_accessor[i]++; });
|
||||||
|
});
|
||||||
|
|
||||||
|
// create host accessor
|
||||||
|
sycl::host_accessor host_accessor(my_buffer);
|
||||||
|
|
||||||
|
std::cout << "host_accessor: ";
|
||||||
|
for (int i = 0; i < N; i++) {
|
||||||
|
// access myBuffer on host
|
||||||
|
std::cout << host_accessor[i] << " ";
|
||||||
|
}
|
||||||
|
std::cout << "\nmy_data_outsc: ";
|
||||||
|
}
|
||||||
|
|
||||||
|
// myData is updated when myBuffer is
|
||||||
|
// destroyed upon exiting scope
|
||||||
|
for (int i = 0; i < N; i++) {
|
||||||
|
std::cout << my_data[i] << " ";
|
||||||
|
if (my_data[i] != 1) {
|
||||||
|
std::cout << "Error at index " << i << ": expected " << 1 << ", got " << my_data[i]
|
||||||
|
<< std::endl;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
std::cout << "\nTest Passed!!!" << std::endl;
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user