1
0
mirror of https://github.com/fumiama/base16384-sycl.git synced 2026-06-08 12:00:36 +08:00

optimize: add xeinfo class & more compl. kernel

This commit is contained in:
源文雨
2025-09-29 17:01:27 +08:00
parent 78c6dea4c6
commit cbe9cda397
5 changed files with 256 additions and 67 deletions

View File

@@ -5,36 +5,56 @@
#include <iostream>
#include <sycl/sycl.hpp>
/// Status codes returned by base16384_try_failed: one value for success and
/// one for each category of exception it distinguishes.
enum base16384_errors_code_enum_t {
    base16384_errors_code_ok = 0,                   ///< the callable returned normally
    base16384_errors_code_sync_sycl_exception = 1,  ///< a sycl::exception was caught
    base16384_errors_code_std_exception = 2,        ///< another std::exception was caught
    base16384_errors_code_unknown_exception = 3,    ///< anything else was caught
};
/// Satisfied by every type for which calling `.what()` on an object of that
/// type is a well-formed expression.
template <typename Candidate>
concept base16384_has_what_concept_t = requires(Candidate candidate) {
    candidate.what();
};
template <base16384_has_what_concept_t T>
static void base16384_print_what(T e, std::string msg) {
std::cerr << msg << e.what() << std::endl;
}
/// Satisfied by every type whose objects can be invoked with no arguments.
template <typename Callable>
concept base16384_callable_concept_t = requires(Callable callable) {
    callable();
};
/// Runs `fn` and reports whether it threw.
///
/// Any exception escaping `fn` is caught here. For sycl::exception and
/// std::exception the `what()` text is printed to std::cerr with a prefix; for
/// any other exception a fixed message is printed.
///
/// @param fn Callable to execute. Taken by const reference so the wrapped
///           callable is not copied on every call.
/// @return base16384_errors_code_ok when `fn` returns normally, otherwise the
///         code matching the category of the caught exception.
static base16384_errors_code_enum_t base16384_try_failed(const std::function<void(void)> &fn) {
    try {
        fn();
    } catch (const sycl::exception &e) {
        // Must come before the std::exception handler so SYCL errors get their own code.
        base16384_print_what(e, "Caught sync SYCL exception: ");
        return base16384_errors_code_sync_sycl_exception;
    } catch (const std::exception &e) {
        base16384_print_what(e, "Caught std exception: ");
        return base16384_errors_code_std_exception;
    } catch (...) {
        std::cerr << "Caught unknown exception." << std::endl;
        return base16384_errors_code_unknown_exception;
    }
    return base16384_errors_code_ok;
}
namespace base16384 {
class errors {
private:
errors() = default;
template <base16384_has_what_concept_t T>
static void print_what(T e, std::string msg) {
std::cerr << msg << e.what() << std::endl;
};
public:
errors(const errors &) = delete;
errors(errors &&) = delete;
errors &operator=(const errors &) = delete;
errors &operator=(errors &&) = delete;
auto operator<=>(const errors &) const = delete;
~errors() noexcept = default;
typedef enum {
code_ok,
code_sync_sycl_exception,
code_std_exception,
code_unknown_exception,
} code_enum_t;
// failed try to exec fn, catch and print .what() when exception is thrown.
template <base16384_callable_concept_t F>
static code_enum_t try_failed(F &&fn) {
try {
fn();
} catch (sycl::exception &e) {
print_what(e, "Caught sync SYCL exception: ");
return code_sync_sycl_exception;
} catch (std::exception &e) {
print_what(e, "Caught std exception: ");
return code_std_exception;
} catch (...) {
std::cerr << "Caught unknown exception." << std::endl;
return code_unknown_exception;
}
return code_ok;
};
};
} // namespace base16384
#endif

101
include/xeinfo.hpp Normal file
View File

@@ -0,0 +1,101 @@
#ifndef _XEINFO_HPP_
#define _XEINFO_HPP_
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <sstream>
#include <string>
#include <sycl/sycl.hpp>
#include <utility>
#include <vector>
namespace base16384 {
class xeinfo {
private:
std::pair<size_t, int> calculate_optimal_sizes() const {
size_t best_sub_group_size = sub_group_sizes[0];
int best_work_group_size = 0;
for (auto sg_size : sub_group_sizes) {
int wg_size = num_thread_per_xecore * sg_size;
if (wg_size <= max_work_group_size && wg_size > best_work_group_size) {
best_sub_group_size = sg_size;
best_work_group_size = 1 << (31 - __builtin_clz(static_cast<unsigned>(wg_size)));
}
}
return {std::move(best_sub_group_size), std::move(best_work_group_size)};
}
public:
xeinfo(sycl::device device) noexcept
: num_slices(device.get_info<sycl::ext::intel::info::device::gpu_slices>()),
num_subslices_per_slice(
device.get_info<sycl::ext::intel::info::device::gpu_subslices_per_slice>()),
num_eus_per_subslice(
device.get_info<sycl::ext::intel::info::device::gpu_eu_count_per_subslice>()),
num_threads_per_eu(
device.get_info<sycl::ext::intel::info::device::gpu_hw_threads_per_eu>()),
global_mem_size(device.get_info<sycl::info::device::global_mem_size>()),
local_mem_size(device.get_info<sycl::info::device::local_mem_size>()),
max_work_group_size(device.get_info<sycl::info::device::max_work_group_size>()),
sub_group_sizes(device.get_info<sycl::info::device::sub_group_sizes>()),
num_thread_per_xecore(num_eus_per_subslice * num_threads_per_eu),
total_xecores(num_slices * num_subslices_per_slice),
total_vector_engines(num_slices * num_subslices_per_slice * num_eus_per_subslice),
total_hardware_threads(num_slices * num_subslices_per_slice * num_eus_per_subslice *
num_threads_per_eu),
optimal_sizes(calculate_optimal_sizes()),
sub_group_size(optimal_sizes.first),
work_group_size(optimal_sizes.second) {}
xeinfo(const xeinfo&) = delete;
xeinfo(xeinfo&&) = delete;
xeinfo& operator=(const xeinfo&) = delete;
xeinfo& operator=(xeinfo&&) = delete;
auto operator<=>(const xeinfo&) const = delete;
~xeinfo() noexcept = default;
const int num_slices;
const int num_subslices_per_slice;
const int num_eus_per_subslice;
const int num_threads_per_eu;
const int global_mem_size;
const int local_mem_size;
const int max_work_group_size;
const std::vector<unsigned long long> sub_group_sizes;
const int num_thread_per_xecore;
const int total_xecores;
const int total_vector_engines;
const int total_hardware_threads;
private:
const std::pair<size_t, int> optimal_sizes;
public:
const size_t sub_group_size;
const int work_group_size;
std::string string() const {
std::ostringstream builder;
builder << "Intel GPU 特性:\n";
builder << " XeCore 数量: " << total_xecores << "\n";
builder << " 每个 XeCore 的向量引擎数: " << num_eus_per_subslice << "\n";
builder << " 向量引擎总数: " << total_vector_engines << "\n";
builder << " 每个 XeCore 的硬件线程数: " << num_thread_per_xecore << "\n";
builder << " 每个向量引擎的硬件线程数: " << num_threads_per_eu << "\n";
builder << " 硬件线程总数: " << total_hardware_threads << "\n";
builder << " GPU 内存大小: " << global_mem_size << " 字节\n";
builder << " 每个工作组的共享本地内存: " << local_mem_size << " 字节\n";
builder << " 最大工作组大小: " << max_work_group_size << "\n";
builder << " 支持的子组大小:";
for (size_t i = 0; i < sub_group_sizes.size(); i++) builder << " " << sub_group_sizes[i];
builder << "\n";
builder << " 推荐选择子组大小: " << sub_group_size << "\n";
builder << " 100% 占用率工作组大小: " << work_group_size;
return builder.str();
}
};
} // namespace base16384
#endif // _XEINFO_HPP_