mirror of
https://github.com/fumiama/base16384-sycl.git
synced 2026-06-05 00:32:49 +08:00
optimize: add xeinfo class & more compl. kernel
This commit is contained in:
7
.vscode/settings.json
vendored
7
.vscode/settings.json
vendored
@@ -70,7 +70,12 @@
|
||||
"xstring": "cpp",
|
||||
"xtr1common": "cpp",
|
||||
"xutility": "cpp",
|
||||
"chrono": "cpp"
|
||||
"chrono": "cpp",
|
||||
"forward_list": "cpp",
|
||||
"iomanip": "cpp",
|
||||
"random": "cpp",
|
||||
"*.def": "cpp",
|
||||
"ranges": "cpp"
|
||||
},
|
||||
"terminal.integrated.defaultProfile.windows": "Command Prompt with oneAPI",
|
||||
"terminal.integrated.defaultProfile.linux": "bash with oneAPI",
|
||||
|
||||
@@ -64,11 +64,7 @@ set(LINK_FLAGS "-fsycl")
|
||||
|
||||
# Release 模式链接优化
|
||||
if(CMAKE_BUILD_TYPE STREQUAL "Release")
|
||||
if(WIN32)
|
||||
set(LINK_FLAGS "${LINK_FLAGS} /LTCG /OPT:REF /OPT:ICF")
|
||||
else()
|
||||
set(LINK_FLAGS "${LINK_FLAGS} -flto -Wl,-O1 -Wl,--as-needed")
|
||||
endif()
|
||||
set(LINK_FLAGS "${LINK_FLAGS} -flto -fuse-ld=lld -Wl,-O1 -Wl,--as-needed")
|
||||
endif()
|
||||
|
||||
add_subdirectory(libs)
|
||||
|
||||
@@ -5,36 +5,56 @@
|
||||
#include <iostream>
|
||||
#include <sycl/sycl.hpp>
|
||||
|
||||
typedef enum {
|
||||
base16384_errors_code_ok,
|
||||
base16384_errors_code_sync_sycl_exception,
|
||||
base16384_errors_code_std_exception,
|
||||
base16384_errors_code_unknown_exception,
|
||||
} base16384_errors_code_enum_t;
|
||||
|
||||
template <typename T>
|
||||
concept base16384_has_what_concept_t = requires(T t) { t.what(); };
|
||||
|
||||
template <base16384_has_what_concept_t T>
|
||||
static void base16384_print_what(T e, std::string msg) {
|
||||
std::cerr << msg << e.what() << std::endl;
|
||||
}
|
||||
template <typename F>
|
||||
concept base16384_callable_concept_t = requires(F f) { f(); };
|
||||
|
||||
// failed try to exec fn, catch and print .what() when exception is thrown.
|
||||
static base16384_errors_code_enum_t base16384_try_failed(std::function<void(void)> fn) {
|
||||
try {
|
||||
fn();
|
||||
} catch (sycl::exception &e) {
|
||||
base16384_print_what(e, "Caught sync SYCL exception: ");
|
||||
return base16384_errors_code_sync_sycl_exception;
|
||||
} catch (std::exception &e) {
|
||||
base16384_print_what(e, "Caught std exception: ");
|
||||
return base16384_errors_code_std_exception;
|
||||
} catch (...) {
|
||||
std::cerr << "Caught unknown exception." << std::endl;
|
||||
return base16384_errors_code_unknown_exception;
|
||||
}
|
||||
return base16384_errors_code_ok;
|
||||
}
|
||||
namespace base16384 {
|
||||
class errors {
|
||||
private:
|
||||
errors() = default;
|
||||
|
||||
template <base16384_has_what_concept_t T>
|
||||
static void print_what(T e, std::string msg) {
|
||||
std::cerr << msg << e.what() << std::endl;
|
||||
};
|
||||
|
||||
public:
|
||||
errors(const errors &) = delete;
|
||||
errors(errors &&) = delete;
|
||||
errors &operator=(const errors &) = delete;
|
||||
errors &operator=(errors &&) = delete;
|
||||
auto operator<=>(const errors &) const = delete;
|
||||
~errors() noexcept = default;
|
||||
|
||||
typedef enum {
|
||||
code_ok,
|
||||
code_sync_sycl_exception,
|
||||
code_std_exception,
|
||||
code_unknown_exception,
|
||||
} code_enum_t;
|
||||
|
||||
// failed try to exec fn, catch and print .what() when exception is thrown.
|
||||
template <base16384_callable_concept_t F>
|
||||
static code_enum_t try_failed(F &&fn) {
|
||||
try {
|
||||
fn();
|
||||
} catch (sycl::exception &e) {
|
||||
print_what(e, "Caught sync SYCL exception: ");
|
||||
return code_sync_sycl_exception;
|
||||
} catch (std::exception &e) {
|
||||
print_what(e, "Caught std exception: ");
|
||||
return code_std_exception;
|
||||
} catch (...) {
|
||||
std::cerr << "Caught unknown exception." << std::endl;
|
||||
return code_unknown_exception;
|
||||
}
|
||||
return code_ok;
|
||||
};
|
||||
};
|
||||
|
||||
} // namespace base16384
|
||||
|
||||
#endif
|
||||
|
||||
101
include/xeinfo.hpp
Normal file
101
include/xeinfo.hpp
Normal file
@@ -0,0 +1,101 @@
|
||||
#ifndef _XEINFO_HPP_
|
||||
#define _XEINFO_HPP_
|
||||
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <sycl/sycl.hpp>
|
||||
#include <vector>
|
||||
|
||||
namespace base16384 {
|
||||
|
||||
class xeinfo {
|
||||
private:
|
||||
std::pair<size_t, int> calculate_optimal_sizes() const {
|
||||
size_t best_sub_group_size = sub_group_sizes[0];
|
||||
int best_work_group_size = 0;
|
||||
|
||||
for (auto sg_size : sub_group_sizes) {
|
||||
int wg_size = num_thread_per_xecore * sg_size;
|
||||
if (wg_size <= max_work_group_size && wg_size > best_work_group_size) {
|
||||
best_sub_group_size = sg_size;
|
||||
best_work_group_size = 1 << (31 - __builtin_clz(static_cast<unsigned>(wg_size)));
|
||||
}
|
||||
}
|
||||
|
||||
return {std::move(best_sub_group_size), std::move(best_work_group_size)};
|
||||
}
|
||||
|
||||
public:
|
||||
xeinfo(sycl::device device) noexcept
|
||||
: num_slices(device.get_info<sycl::ext::intel::info::device::gpu_slices>()),
|
||||
num_subslices_per_slice(
|
||||
device.get_info<sycl::ext::intel::info::device::gpu_subslices_per_slice>()),
|
||||
num_eus_per_subslice(
|
||||
device.get_info<sycl::ext::intel::info::device::gpu_eu_count_per_subslice>()),
|
||||
num_threads_per_eu(
|
||||
device.get_info<sycl::ext::intel::info::device::gpu_hw_threads_per_eu>()),
|
||||
global_mem_size(device.get_info<sycl::info::device::global_mem_size>()),
|
||||
local_mem_size(device.get_info<sycl::info::device::local_mem_size>()),
|
||||
max_work_group_size(device.get_info<sycl::info::device::max_work_group_size>()),
|
||||
sub_group_sizes(device.get_info<sycl::info::device::sub_group_sizes>()),
|
||||
num_thread_per_xecore(num_eus_per_subslice * num_threads_per_eu),
|
||||
total_xecores(num_slices * num_subslices_per_slice),
|
||||
total_vector_engines(num_slices * num_subslices_per_slice * num_eus_per_subslice),
|
||||
total_hardware_threads(num_slices * num_subslices_per_slice * num_eus_per_subslice *
|
||||
num_threads_per_eu),
|
||||
optimal_sizes(calculate_optimal_sizes()),
|
||||
sub_group_size(optimal_sizes.first),
|
||||
work_group_size(optimal_sizes.second) {}
|
||||
|
||||
xeinfo(const xeinfo&) = delete;
|
||||
xeinfo(xeinfo&&) = delete;
|
||||
xeinfo& operator=(const xeinfo&) = delete;
|
||||
xeinfo& operator=(xeinfo&&) = delete;
|
||||
auto operator<=>(const xeinfo&) const = delete;
|
||||
~xeinfo() noexcept = default;
|
||||
|
||||
const int num_slices;
|
||||
const int num_subslices_per_slice;
|
||||
const int num_eus_per_subslice;
|
||||
const int num_threads_per_eu;
|
||||
const int global_mem_size;
|
||||
const int local_mem_size;
|
||||
const int max_work_group_size;
|
||||
const std::vector<unsigned long long> sub_group_sizes;
|
||||
const int num_thread_per_xecore;
|
||||
const int total_xecores;
|
||||
const int total_vector_engines;
|
||||
const int total_hardware_threads;
|
||||
|
||||
private:
|
||||
const std::pair<size_t, int> optimal_sizes;
|
||||
|
||||
public:
|
||||
const size_t sub_group_size;
|
||||
const int work_group_size;
|
||||
|
||||
std::string string() const {
|
||||
std::ostringstream builder;
|
||||
builder << "Intel GPU 特性:\n";
|
||||
builder << " XeCore 数量: " << total_xecores << "\n";
|
||||
builder << " 每个 XeCore 的向量引擎数: " << num_eus_per_subslice << "\n";
|
||||
builder << " 向量引擎总数: " << total_vector_engines << "\n";
|
||||
builder << " 每个 XeCore 的硬件线程数: " << num_thread_per_xecore << "\n";
|
||||
builder << " 每个向量引擎的硬件线程数: " << num_threads_per_eu << "\n";
|
||||
builder << " 硬件线程总数: " << total_hardware_threads << "\n";
|
||||
builder << " GPU 内存大小: " << global_mem_size << " 字节\n";
|
||||
builder << " 每个工作组的共享本地内存: " << local_mem_size << " 字节\n";
|
||||
builder << " 最大工作组大小: " << max_work_group_size << "\n";
|
||||
builder << " 支持的子组大小:";
|
||||
for (size_t i = 0; i < sub_group_sizes.size(); i++) builder << " " << sub_group_sizes[i];
|
||||
builder << "\n";
|
||||
builder << " 推荐选择子组大小: " << sub_group_size << "\n";
|
||||
builder << " 100% 占用率工作组大小: " << work_group_size;
|
||||
return builder.str();
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace base16384
|
||||
|
||||
#endif // _XEINFO_HPP_
|
||||
135
tests/basic.cpp
135
tests/basic.cpp
@@ -1,15 +1,23 @@
|
||||
#include <chrono>
|
||||
#include <iostream>
|
||||
#include <sycl/sycl.hpp>
|
||||
#include <vector>
|
||||
#include <stdint.h>
|
||||
#ifdef _WIN32
|
||||
#include <windows.h>
|
||||
#undef min
|
||||
#undef max
|
||||
#endif
|
||||
|
||||
#include "errors.hpp"
|
||||
#include <chrono>
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <random>
|
||||
#include <ranges>
|
||||
#include <sycl/sycl.hpp>
|
||||
#include <vector>
|
||||
|
||||
static const int N = 65536;
|
||||
static const int work_group_size = 64;
|
||||
#include "errors.hpp"
|
||||
#include "xeinfo.hpp"
|
||||
|
||||
constexpr int iter_count = 65536;
|
||||
constexpr int N = 65536;
|
||||
|
||||
int main() {
|
||||
#ifdef _WIN32
|
||||
@@ -19,8 +27,9 @@ int main() {
|
||||
#endif
|
||||
sycl::queue q;
|
||||
|
||||
auto device = q.get_device();
|
||||
std::cout << "执行设备: " << device.get_info<sycl::info::device::name>() << std::endl;
|
||||
const sycl::device device;
|
||||
const auto device_name = device.get_info<sycl::info::device::name>();
|
||||
std::cout << "执行设备: " << device_name << std::endl;
|
||||
std::cout << "设备类型: ";
|
||||
if (device.is_cpu()) {
|
||||
std::cout << "CPU" << std::endl;
|
||||
@@ -30,47 +39,105 @@ int main() {
|
||||
std::cout << "其他" << std::endl;
|
||||
}
|
||||
|
||||
int work_group_size = 64;
|
||||
if (device.is_gpu() && device_name.starts_with("Intel")) {
|
||||
try {
|
||||
auto xeinfo = base16384::xeinfo(device);
|
||||
work_group_size = xeinfo.work_group_size;
|
||||
std::cout << "\n" << xeinfo.string() << "\n\n";
|
||||
} catch (const sycl::exception& e) {
|
||||
std::cout << "获取Intel GPU信息失败 (可能不是Intel设备): " << e.what() << std::endl;
|
||||
std::cout << "使用默认工作组大小: " << work_group_size << "\n\n";
|
||||
}
|
||||
}
|
||||
|
||||
// Generate random initial data
|
||||
std::random_device rd;
|
||||
std::mt19937 gen{rd()};
|
||||
std::uniform_int_distribution<int> dis{0, 255};
|
||||
|
||||
std::vector<uint8_t> initial_data(N);
|
||||
for (auto& byte : initial_data) {
|
||||
byte = static_cast<uint8_t>(dis(gen));
|
||||
}
|
||||
|
||||
// CPU baseline test
|
||||
std::vector<int> cpu_data(N);
|
||||
for (int i = 0; i < N; i++) cpu_data[i] = i;
|
||||
auto cpu_data = initial_data;
|
||||
|
||||
auto start_time = std::chrono::high_resolution_clock::now();
|
||||
for (int i = 0; i < N; i++) cpu_data[i] *= 2;
|
||||
for (int j = 0; j < iter_count; j++) {
|
||||
for (auto& byte : cpu_data) {
|
||||
// 复杂计算:多步数学运算组合
|
||||
uint8_t temp = byte;
|
||||
temp = (temp * temp) % 251; // 使用质数避免快速收敛
|
||||
temp = temp ^ (temp >> 2); // 位运算
|
||||
temp = (temp + 17) % 256; // 加法和模运算
|
||||
temp = temp * 3 % 256; // 乘法
|
||||
byte = temp ^ (temp << 1); // 最终位运算
|
||||
}
|
||||
}
|
||||
auto end_time = std::chrono::high_resolution_clock::now();
|
||||
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time);
|
||||
|
||||
std::cout << "CPU (" << duration.count() << " us):" << std::endl;
|
||||
for (int i = 0; i < min(N, 64); i++) std::cout << " " << cpu_data[i];
|
||||
std::cout << "CPU (" << std::fixed << std::setprecision(1) << duration.count() / 1000.0
|
||||
<< " ms):";
|
||||
for (int i = 0; i < std::min(N, 64); i++) std::cout << " " << static_cast<int>(cpu_data[i]);
|
||||
std::cout << "..." << std::endl;
|
||||
|
||||
int *data = sycl::malloc_shared<int>(N, q);
|
||||
for (int i = 0; i < N; i++) data[i] = i;
|
||||
auto* data = sycl::malloc_shared<std::uint8_t>(N, q);
|
||||
std::copy(initial_data.cbegin(), initial_data.cend(), data);
|
||||
|
||||
// test basic parallel kernel
|
||||
start_time = std::chrono::high_resolution_clock::now();
|
||||
auto errn = base16384_try_failed(
|
||||
[&]() { q.parallel_for(sycl::range<1>(N), [=](sycl::id<1> i) { data[i] *= 2; }).wait(); });
|
||||
end_time = std::chrono::high_resolution_clock::now();
|
||||
duration = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time);
|
||||
if (errn) return errn;
|
||||
|
||||
std::cout << "GPU基本并行 (" << duration.count() << " us):" << std::endl;
|
||||
for (int i = 0; i < min(N, 64); i++) std::cout << " " << data[i];
|
||||
std::cout << "..." << std::endl;
|
||||
|
||||
start_time = std::chrono::high_resolution_clock::now();
|
||||
errn = base16384_try_failed([&]() {
|
||||
q.parallel_for(sycl::nd_range<1>(N, work_group_size), [=](sycl::nd_item<1> item) {
|
||||
int i = item.get_global_id(0);
|
||||
data[i] /= 2;
|
||||
}).wait();
|
||||
auto errn = base16384::errors::try_failed([&]() {
|
||||
for (int j = 0; j < iter_count; j++) {
|
||||
q.parallel_for(sycl::range<1>(N), [=](sycl::id<1> i) {
|
||||
// 复杂计算:多步数学运算组合
|
||||
uint8_t temp = data[i];
|
||||
temp = (temp * temp) % 251; // 使用质数避免快速收敛
|
||||
temp = temp ^ (temp >> 2); // 位运算
|
||||
temp = (temp + 17) % 256; // 加法和模运算
|
||||
temp = temp * 3 % 256; // 乘法
|
||||
data[i] = temp ^ (temp << 1); // 最终位运算
|
||||
});
|
||||
}
|
||||
q.wait();
|
||||
});
|
||||
end_time = std::chrono::high_resolution_clock::now();
|
||||
duration = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time);
|
||||
if (errn) return errn;
|
||||
|
||||
std::cout << "GPU高级并行 (" << duration.count() << " us):" << std::endl;
|
||||
for (int i = 0; i < min(N, 64); i++) std::cout << " " << data[i];
|
||||
std::cout << "GPU 基本并行 (" << std::fixed << std::setprecision(1) << duration.count() / 1000.0
|
||||
<< " ms):";
|
||||
for (int i = 0; i < std::min(N, 64); i++) std::cout << " " << static_cast<int>(data[i]);
|
||||
std::cout << "..." << std::endl;
|
||||
|
||||
std::copy(initial_data.cbegin(), initial_data.cend(), data);
|
||||
|
||||
start_time = std::chrono::high_resolution_clock::now();
|
||||
errn = base16384::errors::try_failed([&]() {
|
||||
for (int j = 0; j < iter_count; j++) {
|
||||
q.parallel_for(sycl::nd_range<1>(N, work_group_size),
|
||||
[=](sycl::nd_item<1> item) { // sub-group size
|
||||
const auto i = item.get_global_id(0);
|
||||
// 复杂计算:多步数学运算组合
|
||||
uint8_t temp = data[i];
|
||||
temp = (temp * temp) % 251; // 使用质数避免快速收敛
|
||||
temp = temp ^ (temp >> 2); // 位运算
|
||||
temp = (temp + 17) % 256; // 加法和模运算
|
||||
temp = temp * 3 % 256; // 乘法
|
||||
data[i] = temp ^ (temp << 1); // 最终位运算
|
||||
});
|
||||
}
|
||||
q.wait();
|
||||
});
|
||||
end_time = std::chrono::high_resolution_clock::now();
|
||||
duration = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time);
|
||||
if (errn) return errn;
|
||||
|
||||
std::cout << "GPU 高级并行 (" << std::fixed << std::setprecision(1) << duration.count() / 1000.0
|
||||
<< " ms):";
|
||||
for (int i = 0; i < std::min(N, 64); i++) std::cout << " " << static_cast<int>(data[i]);
|
||||
std::cout << "..." << std::endl;
|
||||
|
||||
sycl::free(data, q);
|
||||
|
||||
Reference in New Issue
Block a user