diff --git a/.vscode/settings.json b/.vscode/settings.json index 42e275f..80e4973 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -70,7 +70,12 @@ "xstring": "cpp", "xtr1common": "cpp", "xutility": "cpp", - "chrono": "cpp" + "chrono": "cpp", + "forward_list": "cpp", + "iomanip": "cpp", + "random": "cpp", + "*.def": "cpp", + "ranges": "cpp" }, "terminal.integrated.defaultProfile.windows": "Command Prompt with oneAPI", "terminal.integrated.defaultProfile.linux": "bash with oneAPI", diff --git a/CMakeLists.txt b/CMakeLists.txt index c913279..c2710f7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -64,11 +64,7 @@ set(LINK_FLAGS "-fsycl") # Release 模式链接优化 if(CMAKE_BUILD_TYPE STREQUAL "Release") - if(WIN32) - set(LINK_FLAGS "${LINK_FLAGS} /LTCG /OPT:REF /OPT:ICF") - else() - set(LINK_FLAGS "${LINK_FLAGS} -flto -Wl,-O1 -Wl,--as-needed") - endif() + set(LINK_FLAGS "${LINK_FLAGS} -flto -fuse-ld=lld -Wl,-O1 -Wl,--as-needed") endif() add_subdirectory(libs) diff --git a/include/errors.hpp b/include/errors.hpp index e451c2d..985e739 100644 --- a/include/errors.hpp +++ b/include/errors.hpp @@ -5,36 +5,56 @@ #include #include -typedef enum { - base16384_errors_code_ok, - base16384_errors_code_sync_sycl_exception, - base16384_errors_code_std_exception, - base16384_errors_code_unknown_exception, -} base16384_errors_code_enum_t; - template concept base16384_has_what_concept_t = requires(T t) { t.what(); }; -template -static void base16384_print_what(T e, std::string msg) { - std::cerr << msg << e.what() << std::endl; -} +template +concept base16384_callable_concept_t = requires(F f) { f(); }; -// failed try to exec fn, catch and print .what() when exception is thrown. -static base16384_errors_code_enum_t base16384_try_failed(std::function fn) { - try { - fn(); - } catch (sycl::exception &e) { - base16384_print_what(e, "Caught sync SYCL exception: "); - return base16384_errors_code_sync_sycl_exception; - } catch (std::exception &e) { - base16384_print_what(e, "Caught std exception: "); - return base16384_errors_code_std_exception; - } catch (...) { - std::cerr << "Caught unknown exception." << std::endl; - return base16384_errors_code_unknown_exception; - } - return base16384_errors_code_ok; -} +namespace base16384 { +class errors { + private: + errors() = default; + + template + static void print_what(T e, std::string msg) { + std::cerr << msg << e.what() << std::endl; + }; + + public: + errors(const errors &) = delete; + errors(errors &&) = delete; + errors &operator=(const errors &) = delete; + errors &operator=(errors &&) = delete; + auto operator<=>(const errors &) const = delete; + ~errors() noexcept = default; + + typedef enum { + code_ok, + code_sync_sycl_exception, + code_std_exception, + code_unknown_exception, + } code_enum_t; + + // failed try to exec fn, catch and print .what() when exception is thrown. + template + static code_enum_t try_failed(F &&fn) { + try { + fn(); + } catch (sycl::exception &e) { + print_what(e, "Caught sync SYCL exception: "); + return code_sync_sycl_exception; + } catch (std::exception &e) { + print_what(e, "Caught std exception: "); + return code_std_exception; + } catch (...) { + std::cerr << "Caught unknown exception." << std::endl; + return code_unknown_exception; + } + return code_ok; + }; +}; + +} // namespace base16384 #endif diff --git a/include/xeinfo.hpp b/include/xeinfo.hpp new file mode 100644 index 0000000..11e8968 --- /dev/null +++ b/include/xeinfo.hpp @@ -0,0 +1,101 @@ +#ifndef _XEINFO_HPP_ +#define _XEINFO_HPP_ + +#include +#include +#include +#include +#include + +namespace base16384 { + +class xeinfo { + private: + std::pair calculate_optimal_sizes() const { + size_t best_sub_group_size = sub_group_sizes[0]; + int best_work_group_size = 0; + + for (auto sg_size : sub_group_sizes) { + int wg_size = num_thread_per_xecore * sg_size; + if (wg_size <= max_work_group_size && wg_size > best_work_group_size) { + best_sub_group_size = sg_size; + best_work_group_size = 1 << (31 - __builtin_clz(static_cast(wg_size))); + } + } + + return {std::move(best_sub_group_size), std::move(best_work_group_size)}; + } + + public: + xeinfo(sycl::device device) noexcept + : num_slices(device.get_info()), + num_subslices_per_slice( + device.get_info()), + num_eus_per_subslice( + device.get_info()), + num_threads_per_eu( + device.get_info()), + global_mem_size(device.get_info()), + local_mem_size(device.get_info()), + max_work_group_size(device.get_info()), + sub_group_sizes(device.get_info()), + num_thread_per_xecore(num_eus_per_subslice * num_threads_per_eu), + total_xecores(num_slices * num_subslices_per_slice), + total_vector_engines(num_slices * num_subslices_per_slice * num_eus_per_subslice), + total_hardware_threads(num_slices * num_subslices_per_slice * num_eus_per_subslice * + num_threads_per_eu), + optimal_sizes(calculate_optimal_sizes()), + sub_group_size(optimal_sizes.first), + work_group_size(optimal_sizes.second) {} + + xeinfo(const xeinfo&) = delete; + xeinfo(xeinfo&&) = delete; + xeinfo& operator=(const xeinfo&) = delete; + xeinfo& operator=(xeinfo&&) = delete; + auto operator<=>(const xeinfo&) const = delete; + ~xeinfo() noexcept = default; + + const int num_slices; + const int num_subslices_per_slice; + const int num_eus_per_subslice; + const int num_threads_per_eu; + const int global_mem_size; + const int local_mem_size; + const int max_work_group_size; + const std::vector sub_group_sizes; + const int num_thread_per_xecore; + const int total_xecores; + const int total_vector_engines; + const int total_hardware_threads; + + private: + const std::pair optimal_sizes; + + public: + const size_t sub_group_size; + const int work_group_size; + + std::string string() const { + std::ostringstream builder; + builder << "Intel GPU 特性:\n"; + builder << " XeCore 数量: " << total_xecores << "\n"; + builder << " 每个 XeCore 的向量引擎数: " << num_eus_per_subslice << "\n"; + builder << " 向量引擎总数: " << total_vector_engines << "\n"; + builder << " 每个 XeCore 的硬件线程数: " << num_thread_per_xecore << "\n"; + builder << " 每个向量引擎的硬件线程数: " << num_threads_per_eu << "\n"; + builder << " 硬件线程总数: " << total_hardware_threads << "\n"; + builder << " GPU 内存大小: " << global_mem_size << " 字节\n"; + builder << " 每个工作组的共享本地内存: " << local_mem_size << " 字节\n"; + builder << " 最大工作组大小: " << max_work_group_size << "\n"; + builder << " 支持的子组大小:"; + for (size_t i = 0; i < sub_group_sizes.size(); i++) builder << " " << sub_group_sizes[i]; + builder << "\n"; + builder << " 推荐选择子组大小: " << sub_group_size << "\n"; + builder << " 100% 占用率工作组大小: " << work_group_size; + return builder.str(); + } +}; + +} // namespace base16384 + +#endif // _XEINFO_HPP_ \ No newline at end of file diff --git a/tests/basic.cpp b/tests/basic.cpp index a84dd00..64f272d 100644 --- a/tests/basic.cpp +++ b/tests/basic.cpp @@ -1,15 +1,23 @@ -#include -#include -#include -#include +#include #ifdef _WIN32 #include +#undef min +#undef max #endif -#include "errors.hpp" +#include +#include +#include +#include +#include +#include +#include -static const int N = 65536; -static const int work_group_size = 64; +#include "errors.hpp" +#include "xeinfo.hpp" + +constexpr int iter_count = 65536; +constexpr int N = 65536; int main() { #ifdef _WIN32 @@ -19,8 +27,9 @@ int main() { #endif sycl::queue q; - auto device = q.get_device(); - std::cout << "执行设备: " << device.get_info() << std::endl; + const sycl::device device; + const auto device_name = device.get_info(); + std::cout << "执行设备: " << device_name << std::endl; std::cout << "设备类型: "; if (device.is_cpu()) { std::cout << "CPU" << std::endl; @@ -30,47 +39,105 @@ int main() { std::cout << "其他" << std::endl; } + int work_group_size = 64; + if (device.is_gpu() && device_name.starts_with("Intel")) { + try { + auto xeinfo = base16384::xeinfo(device); + work_group_size = xeinfo.work_group_size; + std::cout << "\n" << xeinfo.string() << "\n\n"; + } catch (const sycl::exception& e) { + std::cout << "获取Intel GPU信息失败 (可能不是Intel设备): " << e.what() << std::endl; + std::cout << "使用默认工作组大小: " << work_group_size << "\n\n"; + } + } + + // Generate random initial data + std::random_device rd; + std::mt19937 gen{rd()}; + std::uniform_int_distribution dis{0, 255}; + + std::vector initial_data(N); + for (auto& byte : initial_data) { + byte = static_cast(dis(gen)); + } + // CPU baseline test - std::vector cpu_data(N); - for (int i = 0; i < N; i++) cpu_data[i] = i; + auto cpu_data = initial_data; auto start_time = std::chrono::high_resolution_clock::now(); - for (int i = 0; i < N; i++) cpu_data[i] *= 2; + for (int j = 0; j < iter_count; j++) { + for (auto& byte : cpu_data) { + // 复杂计算:多步数学运算组合 + uint8_t temp = byte; + temp = (temp * temp) % 251; // 使用质数避免快速收敛 + temp = temp ^ (temp >> 2); // 位运算 + temp = (temp + 17) % 256; // 加法和模运算 + temp = temp * 3 % 256; // 乘法 + byte = temp ^ (temp << 1); // 最终位运算 + } + } auto end_time = std::chrono::high_resolution_clock::now(); auto duration = std::chrono::duration_cast(end_time - start_time); - std::cout << "CPU (" << duration.count() << " us):" << std::endl; - for (int i = 0; i < min(N, 64); i++) std::cout << " " << cpu_data[i]; + std::cout << "CPU (" << std::fixed << std::setprecision(1) << duration.count() / 1000.0 + << " ms):"; + for (int i = 0; i < std::min(N, 64); i++) std::cout << " " << static_cast(cpu_data[i]); std::cout << "..." << std::endl; - int *data = sycl::malloc_shared(N, q); - for (int i = 0; i < N; i++) data[i] = i; + auto* data = sycl::malloc_shared(N, q); + std::copy(initial_data.cbegin(), initial_data.cend(), data); // test basic parallel kernel start_time = std::chrono::high_resolution_clock::now(); - auto errn = base16384_try_failed( - [&]() { q.parallel_for(sycl::range<1>(N), [=](sycl::id<1> i) { data[i] *= 2; }).wait(); }); - end_time = std::chrono::high_resolution_clock::now(); - duration = std::chrono::duration_cast(end_time - start_time); - if (errn) return errn; - - std::cout << "GPU基本并行 (" << duration.count() << " us):" << std::endl; - for (int i = 0; i < min(N, 64); i++) std::cout << " " << data[i]; - std::cout << "..." << std::endl; - - start_time = std::chrono::high_resolution_clock::now(); - errn = base16384_try_failed([&]() { - q.parallel_for(sycl::nd_range<1>(N, work_group_size), [=](sycl::nd_item<1> item) { - int i = item.get_global_id(0); - data[i] /= 2; - }).wait(); + auto errn = base16384::errors::try_failed([&]() { + for (int j = 0; j < iter_count; j++) { + q.parallel_for(sycl::range<1>(N), [=](sycl::id<1> i) { + // 复杂计算:多步数学运算组合 + uint8_t temp = data[i]; + temp = (temp * temp) % 251; // 使用质数避免快速收敛 + temp = temp ^ (temp >> 2); // 位运算 + temp = (temp + 17) % 256; // 加法和模运算 + temp = temp * 3 % 256; // 乘法 + data[i] = temp ^ (temp << 1); // 最终位运算 + }); + } + q.wait(); }); end_time = std::chrono::high_resolution_clock::now(); duration = std::chrono::duration_cast(end_time - start_time); if (errn) return errn; - std::cout << "GPU高级并行 (" << duration.count() << " us):" << std::endl; - for (int i = 0; i < min(N, 64); i++) std::cout << " " << data[i]; + std::cout << "GPU 基本并行 (" << std::fixed << std::setprecision(1) << duration.count() / 1000.0 + << " ms):"; + for (int i = 0; i < std::min(N, 64); i++) std::cout << " " << static_cast(data[i]); + std::cout << "..." << std::endl; + + std::copy(initial_data.cbegin(), initial_data.cend(), data); + + start_time = std::chrono::high_resolution_clock::now(); + errn = base16384::errors::try_failed([&]() { + for (int j = 0; j < iter_count; j++) { + q.parallel_for(sycl::nd_range<1>(N, work_group_size), + [=](sycl::nd_item<1> item) { // sub-group size + const auto i = item.get_global_id(0); + // 复杂计算:多步数学运算组合 + uint8_t temp = data[i]; + temp = (temp * temp) % 251; // 使用质数避免快速收敛 + temp = temp ^ (temp >> 2); // 位运算 + temp = (temp + 17) % 256; // 加法和模运算 + temp = temp * 3 % 256; // 乘法 + data[i] = temp ^ (temp << 1); // 最终位运算 + }); + } + q.wait(); + }); + end_time = std::chrono::high_resolution_clock::now(); + duration = std::chrono::duration_cast(end_time - start_time); + if (errn) return errn; + + std::cout << "GPU 高级并行 (" << std::fixed << std::setprecision(1) << duration.count() / 1000.0 + << " ms):"; + for (int i = 0; i < std::min(N, 64); i++) std::cout << " " << static_cast(data[i]); std::cout << "..." << std::endl; sycl::free(data, q);