mirror of
https://github.com/fumiama/base16384-sycl.git
synced 2026-06-05 08:40:34 +08:00
Run on i7-14700K: ``` > .\tests\basic.exe 执行设备: Intel(R) UHD Graphics 770 设备类型: GPU Intel GPU 特性: XeCore 数量: 4 每个 XeCore 的向量引擎数: 8 向量引擎总数: 32 每个 XeCore 的硬件线程数: 56 每个向量引擎的硬件线程数: 7 硬件线程总数: 224 GPU 内存大小: 31712935936 B (29.53 GB) 每个工作组的共享本地内存: 65536 B 最大工作组大小: 512 支持的子组大小: 8 16 32 推荐选择子组大小: 8 100% 占用率工作组大小: 256 CPU (2944.7 ms): 54 85 110 101 0 102 101 101 85 86 85 86 110 110 110 206 94 110 104 198 110 102 102 206 94 110 110 94 104 94 206 102 101 102 94 54 101 110 254 86 86 104 102 198 104 110 0 104 110 86 0 110 110 198 110 110 94 85 110 0 254 101 101 101... GPU 基本并行 (471.7 ms): 54 85 110 101 0 102 101 101 85 86 85 86 110 110 110 206 94 110 104 198 110 102 102 206 94 110 110 94 104 94 206 102 101 102 94 54 101 110 254 86 86 104 102 198 104 110 0 104 110 86 0 110 110 198 110 110 94 85 110 0 254 101 101 101... GPU 高级并行 (448.2 ms): 54 85 110 101 0 102 101 101 85 86 85 86 110 110 110 206 94 110 104 198 110 102 102 206 94 110 110 94 104 94 206 102 101 102 94 54 101 110 254 86 86 104 102 198 104 110 0 104 110 86 0 110 110 198 110 110 94 85 110 0 254 101 101 101... ```
151 lines
4.8 KiB
C++
151 lines
4.8 KiB
C++
#include <stdint.h>
|
|
#ifdef _WIN32
|
|
#include <windows.h>
|
|
#undef min
|
|
#undef max
|
|
#endif
|
|
|
|
#include <chrono>
|
|
#include <iomanip>
|
|
#include <iostream>
|
|
#include <random>
|
|
#include <ranges>
|
|
#include <sycl/sycl.hpp>
|
|
#include <vector>
|
|
|
|
#include "errors.hpp"
|
|
#include "test.hpp"
|
|
#include "xeinfo.hpp"
|
|
|
|
constexpr int iter_count = 65536;
|
|
constexpr int N = 65536;
|
|
|
|
int main() {
|
|
#ifdef _WIN32
|
|
// Set console code page to UTF-8
|
|
SetConsoleOutputCP(CP_UTF8);
|
|
SetConsoleCP(CP_UTF8);
|
|
#endif
|
|
sycl::queue q;
|
|
|
|
const sycl::device device;
|
|
const auto device_name = device.get_info<sycl::info::device::name>();
|
|
std::cout << "执行设备: " << device_name << std::endl;
|
|
std::cout << "设备类型: ";
|
|
if (device.is_cpu()) {
|
|
std::cout << "CPU" << std::endl;
|
|
} else if (device.is_gpu()) {
|
|
std::cout << "GPU" << std::endl;
|
|
} else {
|
|
std::cout << "其他" << std::endl;
|
|
}
|
|
|
|
int work_group_size = 64;
|
|
if (device.is_gpu() && device_name.starts_with("Intel")) {
|
|
try {
|
|
auto xeinfo = base16384::xeinfo(device);
|
|
work_group_size = xeinfo.work_group_size;
|
|
std::cout << "\n" << xeinfo.string() << "\n\n";
|
|
} catch (const sycl::exception& e) {
|
|
std::cout << "获取Intel GPU信息失败 (可能不是Intel设备): " << e.what() << std::endl;
|
|
std::cout << "使用默认工作组大小: " << work_group_size << "\n\n";
|
|
}
|
|
}
|
|
|
|
// Generate random initial data
|
|
std::random_device rd;
|
|
std::mt19937 gen{rd()};
|
|
std::uniform_int_distribution<int> dis{0, 255};
|
|
|
|
std::vector<uint8_t> initial_data(N);
|
|
for (auto& byte : initial_data) {
|
|
byte = static_cast<uint8_t>(dis(gen));
|
|
}
|
|
|
|
// CPU baseline test
|
|
auto cpu_data = initial_data;
|
|
|
|
auto start_time = std::chrono::high_resolution_clock::now();
|
|
for (int j = 0; j < iter_count; j++) {
|
|
for (auto& byte : cpu_data) {
|
|
byte = base16384::test::kernels_basic(byte);
|
|
}
|
|
}
|
|
auto end_time = std::chrono::high_resolution_clock::now();
|
|
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time);
|
|
|
|
std::cout << "CPU (" << std::fixed << std::setprecision(1) << duration.count() / 1000.0
|
|
<< " ms):";
|
|
for (int i = 0; i < std::min(N, 64); i++) std::cout << " " << static_cast<int>(cpu_data[i]);
|
|
std::cout << "..." << std::endl;
|
|
|
|
auto* data = sycl::malloc_shared<std::uint8_t>(N, q);
|
|
std::copy(initial_data.cbegin(), initial_data.cend(), data);
|
|
|
|
// test basic parallel kernel
|
|
start_time = std::chrono::high_resolution_clock::now();
|
|
auto errn = base16384::errors::try_failed([&]() {
|
|
for (int j = 0; j < iter_count; j++) {
|
|
q.parallel_for(sycl::range<1>(N),
|
|
[=](sycl::id<1> i) { data[i] = base16384::test::kernels_basic(data[i]); });
|
|
}
|
|
q.wait();
|
|
});
|
|
end_time = std::chrono::high_resolution_clock::now();
|
|
duration = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time);
|
|
if (errn) return errn;
|
|
|
|
std::cout << "GPU 基本并行 (" << std::fixed << std::setprecision(1) << duration.count() / 1000.0
|
|
<< " ms):";
|
|
for (int i = 0; i < std::min(N, 64); i++) std::cout << " " << static_cast<int>(data[i]);
|
|
std::cout << "..." << std::endl;
|
|
|
|
// Verify GPU basic parallel result
|
|
for (int i = 0; i < N; i++) {
|
|
if (data[i] != cpu_data[i]) {
|
|
std::cerr << "GPU 基本并行结果验证失败:位置 " << i << " 期望值 "
|
|
<< static_cast<int>(cpu_data[i]) << " 实际值 " << static_cast<int>(data[i])
|
|
<< std::endl;
|
|
sycl::free(data, q);
|
|
return -1;
|
|
}
|
|
}
|
|
|
|
std::copy(initial_data.cbegin(), initial_data.cend(), data);
|
|
|
|
start_time = std::chrono::high_resolution_clock::now();
|
|
errn = base16384::errors::try_failed([&]() {
|
|
for (int j = 0; j < iter_count; j++) {
|
|
q.parallel_for(sycl::nd_range<1>(N, work_group_size),
|
|
[=](sycl::nd_item<1> item) { // sub-group size
|
|
const auto i = item.get_global_id(0);
|
|
data[i] = base16384::test::kernels_basic(data[i]);
|
|
});
|
|
}
|
|
q.wait();
|
|
});
|
|
end_time = std::chrono::high_resolution_clock::now();
|
|
duration = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time);
|
|
if (errn) return errn;
|
|
|
|
std::cout << "GPU 高级并行 (" << std::fixed << std::setprecision(1) << duration.count() / 1000.0
|
|
<< " ms):";
|
|
for (int i = 0; i < std::min(N, 64); i++) std::cout << " " << static_cast<int>(data[i]);
|
|
std::cout << "..." << std::endl;
|
|
|
|
// Verify GPU advanced parallel result
|
|
for (int i = 0; i < N; i++) {
|
|
if (data[i] != cpu_data[i]) {
|
|
std::cerr << "GPU 高级并行结果验证失败:位置 " << i << " 期望值 "
|
|
<< static_cast<int>(cpu_data[i]) << " 实际值 " << static_cast<int>(data[i])
|
|
<< std::endl;
|
|
sycl::free(data, q);
|
|
return -1;
|
|
}
|
|
}
|
|
|
|
sycl::free(data, q);
|
|
|
|
return 0;
|
|
}
|