1
0
mirror of https://github.com/fumiama/base16384-sycl.git synced 2026-06-05 00:32:49 +08:00

feat: add remaining tests in Chapter 3

This commit is contained in:
源文雨
2026-01-09 14:43:48 +08:00
parent ed2915e97b
commit b0fcce99c7
8 changed files with 319 additions and 1 deletions

View File

@@ -82,7 +82,8 @@
"terminal.integrated.profiles.windows": {
"Command Prompt with oneAPI": {
"path": "cmd.exe",
"args": ["/k", "${config:oneapi.root.windows}/setvars.bat", "&&", "powershell"]
// If setvars.bat prints "WARNING: Visual Studio was not found in a standard install location", set the VS2022INSTALLDIR environment variable first
"args": ["/k", "${config:oneapi.root.windows}/setvars.bat", "intel64", "vs2022", "&&", "powershell"]
}
},
"terminal.integrated.profiles.linux": {

View File

@@ -95,6 +95,63 @@ cmake --build .
ctest
```
### 4. Performance Analysis with Intel VTune
Intel VTune Profiler is a powerful performance analysis tool that helps you identify bottlenecks and optimize your applications.
#### Prerequisites
- Intel VTune Profiler (included in Intel oneAPI Base Toolkit)
- Compiled Base16384-SYCL application or tests with debug symbols (use `RelWithDebInfo` build type)
#### Running VTune Analysis
**1. Launch VTune GUI:**
```bash
vtune-gui
```
**2. Create a New Project:**
- Click "New Project" in the welcome screen
- Set project name and location
- Configure the target application path
**3. Configure Analysis Type:**
Choose an analysis type based on your profiling goals:
- **Hotspots Analysis**: Identify CPU-intensive functions
- **GPU Offload Analysis**: Analyze GPU kernel performance and host-device data transfer
- **Memory Consumption**: Track memory usage patterns
- **Threading Analysis**: Detect threading issues and analyze parallelism
**4. Run the Analysis:**
- Click the "Start" button to begin profiling
- VTune will execute your application and collect performance data
**5. Analyze Results:**
![VTune Analysis Results of basic test](./assets/vtune-b14-test-basic.png)
**Key metrics to examine:**
- **Kernel Execution Time**: Time spent in SYCL kernels
- **Memory Transfer Overhead**: Host-to-device and device-to-host data transfer time
- **CPU Utilization**: Host CPU usage during GPU operations
- **GPU Utilization**: GPU compute unit occupancy
#### Optimization Tips
Based on VTune analysis, consider these optimization strategies:
1. **Reduce Host-Device Transfer**: Minimize data copying between CPU and GPU
2. **Increase Kernel Occupancy**: Optimize work-group sizes and global range
3. **Use Local Memory**: Leverage work-group local memory (called shared memory in CUDA) for frequently accessed data
4. **Batch Operations**: Process larger data chunks to amortize kernel launch overhead
## Build Configuration
The project supports multiple build configurations:

Binary file not shown.

After

Width:  |  Height:  |  Size: 157 KiB

View File

@@ -0,0 +1,41 @@
// Figure 3-10. In-order queue usage
// from book - Data Parallel C++
// https://link.springer.com/book/10.1007/978-1-4842-5574-2
#include <sycl/sycl.hpp>
constexpr int N = 4;

// In-order queue example: three kernels and a copy-back are submitted to a
// queue created with the in_order property, so no explicit dependencies are
// declared between them.
//
// Returns 0 when every element read back equals 4, i.e. (0 + 1) << 2,
// and -1 on allocation failure or on the first mismatching element.
int main() {
  sycl::queue Q{sycl::property::queue::in_order()};

  int* device_array = sycl::malloc_device<int>(N, Q);
  if (device_array == nullptr) {
    // Without this check the kernels below would dereference a null pointer.
    std::cerr << "Failed to allocate device memory" << std::endl;
    return -1;
  }

  // Task A: zero the array
  Q.submit(
      [&](sycl::handler& h) { h.parallel_for(N, [=](sycl::id<1> i) { device_array[i] = 0; }); });

  // Task B: increment every element
  Q.submit([&](sycl::handler& h) { h.parallel_for(N, [=](sycl::id<1> i) { device_array[i]++; }); });

  // Task C: multiply every element by 4
  Q.submit(
      [&](sycl::handler& h) { h.parallel_for(N, [=](sycl::id<1> i) { device_array[i] <<= 2; }); });

  std::array<int, N> host_array;
  Q.submit([&](sycl::handler& h) {
    // copy device_array back to host_array
    h.memcpy(&host_array[0], device_array, N * sizeof(int));
  });

  // host_array is only read after everything submitted above has completed.
  Q.wait();

  sycl::free(device_array, Q);

  // Bound by N (host_array holds exactly N elements) to avoid comparing a
  // signed index against the unsigned size().
  for (int i = 0; i < N; i++) {
    if (host_array[i] != 4) {
      std::cerr << "Expect 4 at idx " << i << " but got " << host_array[i] << std::endl;
      return -1;
    }
  }

  std::cout << "Test Passed!!!" << std::endl;
  return 0;
}

View File

@@ -0,0 +1,69 @@
// Figure 3-11. Using events and depends_on
// from book - Data Parallel C++
// https://link.springer.com/book/10.1007/978-1-4842-5574-2
#include <sycl/sycl.hpp>
constexpr int N = 4;

// Events and depends_on example: four kernels operate on two buffers, with
// dependencies expressed through eA.wait() and handler::depends_on().
//
// Returns 0 when the second buffer holds {1798, 2034, 2272, 2512} after all
// tasks ran, and -1 on the first mismatching element.
int main() {
  sycl::queue Q;

  // Value-initialized so the read_write accessors below never expose
  // indeterminate host values to the buffers.
  std::array<int, N> data1{};
  sycl::buffer B1{data1};
  std::array<int, N> data2{};
  sycl::buffer B2{data2};

  // Task A: seed both buffers
  auto eA = Q.submit([&](sycl::handler& h) {
    sycl::accessor A1{B1, h};
    sycl::accessor A2{B2, h};
    h.parallel_for(N, [=](sycl::id<1> i) {
      A1[i] = 233;
      A2[i] = 666;
    });
  });
  eA.wait();

  // Task B
  auto eB = Q.submit([&](sycl::handler& h) {
    sycl::accessor A1{B1, h};
    sycl::accessor A2{B2, h};
    h.parallel_for(N, [=](sycl::id<1> i) {
      A1[i] += i;      // 233 234 235 236
      A2[i] += A1[i];  // 899 900 901 902
    });
  });

  // Task C: explicitly ordered after Task B
  auto eC = Q.submit([&](sycl::handler& h) {
    sycl::accessor A2{B2, h};
    h.depends_on(eB);
    h.parallel_for(N, [=](sycl::id<1> i) {
      A2[i] <<= 1;  // 1798 1800 1802 1804
    });
  });

  // Task D: explicitly ordered after Tasks B and C
  Q.submit([&](sycl::handler& h) {
    sycl::accessor A1{B1, h};
    sycl::accessor A2{B2, h};
    h.depends_on({eB, eC});
    h.parallel_for(N, [=](sycl::id<1> i) {
      A2[i] += A1[i] * i;  // 1798 2034 2272 2512
    });
  });

  std::array<int, N> expected{1798, 2034, 2272, 2512};
  sycl::host_accessor A2{B2};  // if use data2 directly, the data may have not been synced
  // Bound by N (expected holds exactly N elements) to avoid comparing a
  // signed index against the unsigned size().
  for (int i = 0; i < N; i++) {
    if (A2[i] != expected[i]) {
      std::cerr << "Expect " << expected[i] << " at idx " << i << " but got " << A2[i] << std::endl;
      return -1;
    }
  }

  std::cout << "Test Passed!!!" << std::endl;
  return 0;
}

View File

@@ -0,0 +1,53 @@
// Figure 3-13. Read-after-Write
// from book - Data Parallel C++
// https://link.springer.com/book/10.1007/978-1-4842-5574-2
#include <sycl/sycl.hpp>
constexpr int N = 42;
int main() {
std::array<int, N> a, b, c;
for (int i = 0; i < N; i++) {
a[i] = 1;
b[i] = c[i] = 0;
}
sycl::queue Q;
// We will learn how to simplify this example later
sycl::buffer A{a};
sycl::buffer B{b};
sycl::buffer C{c};
Q.submit([&](sycl::handler& h) {
sycl::accessor accA(A, h, sycl::read_only);
sycl::accessor accB(B, h, sycl::write_only);
h.parallel_for( // computeB
N, [=](sycl::id<1> i) { accB[i] = accA[i] + 1; });
});
int* datap = static_cast<int*>(sycl::malloc_shared(sizeof(int), Q));
Q.submit([&](sycl::handler& h) {
sycl::accessor accA(A, h, sycl::read_only);
h.parallel_for( // readA
N, [=](sycl::id<1> i) {
// Useful only as an example
*datap = accA[i];
});
});
Q.submit([&](sycl::handler& h) {
// RAW of buffer B
sycl::accessor accB(B, h, sycl::read_only);
sycl::accessor accC(C, h, sycl::write_only);
h.parallel_for( // computeC
N, [=](sycl::id<1> i) { accC[i] = accB[i] + 3; });
});
// read C on host
sycl::host_accessor host_accC(C, sycl::read_only);
for (int i = 0; i < N; i++) {
if (host_accC[i] != 5) {
std::cerr << "Expect 5 at idx " << i << " but got " << host_accC[i] << std::endl;
return -1;
}
}
std::cout << "readA: " << *datap << "\n";
std::cout << "Test Passed!!!" << std::endl;
return 0;
}

View File

@@ -0,0 +1,50 @@
// Figure 3-15. Write-after-Read and Write-after-Write
// from book - Data Parallel C++
// https://link.springer.com/book/10.1007/978-1-4842-5574-2
#include <sycl/sycl.hpp>
constexpr int N = 42;
int main() {
std::array<int, N> a, b;
for (int i = 0; i < N; i++) {
a[i] = b[i] = 0;
}
sycl::queue Q;
sycl::buffer A{a};
sycl::buffer B{b};
Q.submit([&](sycl::handler& h) {
sycl::accessor accA(A, h, sycl::read_only);
sycl::accessor accB(B, h, sycl::write_only);
h.parallel_for( // computeB
N, [=](sycl::id<1> i) { accB[i] = accA[i] + 1; });
});
Q.submit([&](sycl::handler& h) {
// WAR of buffer A
sycl::accessor accA(A, h, sycl::write_only);
h.parallel_for( // rewriteA
N, [=](sycl::id<1> i) { accA[i] = 21; });
});
Q.submit([&](sycl::handler& h) {
// WAW of buffer B
sycl::accessor accB(B, h, sycl::write_only);
h.parallel_for( // rewriteB
N, [=](sycl::id<1> i) { accB[i] = 30; });
});
sycl::host_accessor host_accA(A, sycl::read_only);
sycl::host_accessor host_accB(B, sycl::read_only);
for (int i = 0; i < N; i++) {
if (host_accA[i] != 21) {
std::cerr << "Expect host_accA[i] 21 at idx " << i << " but got " << host_accA[i]
<< std::endl;
return -1;
}
if (host_accB[i] != 30) {
std::cerr << "Expect host_accB[i] 30 at idx " << i << " but got " << host_accB[i]
<< std::endl;
return -1;
}
}
std::cout << "Test Passed!!!" << std::endl;
return 0;
}

View File

@@ -0,0 +1,47 @@
// Figure 3-6. Buffers and accessors
// from book - Data Parallel C++
// https://link.springer.com/book/10.1007/978-1-4842-5574-2
#include <array>
#include <sycl/sycl.hpp>
constexpr int N = 42;
int main() {
std::array<int, N> my_data{}; // filled with 0
{
sycl::queue q;
sycl::buffer my_buffer(my_data);
q.submit([&](sycl::handler& h) {
// create an accessor to update
// the buffer on the device
sycl::accessor my_accessor(my_buffer, h);
h.parallel_for(N, [=](sycl::id<1> i) { my_accessor[i]++; });
});
// create host accessor
sycl::host_accessor host_accessor(my_buffer);
std::cout << "host_accessor: ";
for (int i = 0; i < N; i++) {
// access myBuffer on host
std::cout << host_accessor[i] << " ";
}
std::cout << "\nmy_data_outsc: ";
}
// myData is updated when myBuffer is
// destroyed upon exiting scope
for (int i = 0; i < N; i++) {
std::cout << my_data[i] << " ";
if (my_data[i] != 1) {
std::cout << "Error at index " << i << ": expected " << 1 << ", got " << my_data[i]
<< std::endl;
return 1;
}
}
std::cout << "\nTest Passed!!!" << std::endl;
}