optimize: add xeinfo class & more compl. kernel

2026-07-17 03:00:32 +08:00 · 2025-09-29 17:01:27 +08:00
parent 78c6dea4c6
commit cbe9cda397
5 changed files with 256 additions and 67 deletions
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -70,7 +70,12 @@
        "xstring": "cpp",
        "xtr1common": "cpp",
        "xutility": "cpp",
-        "chrono": "cpp"
+        "chrono": "cpp",
+        "forward_list": "cpp",
+        "iomanip": "cpp",
+        "random": "cpp",
+        "*.def": "cpp",
+        "ranges": "cpp"
    },
    "terminal.integrated.defaultProfile.windows": "Command Prompt with oneAPI",
    "terminal.integrated.defaultProfile.linux": "bash with oneAPI",
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -64,11 +64,7 @@ set(LINK_FLAGS "-fsycl")

 # Release 模式链接优化
 if(CMAKE_BUILD_TYPE STREQUAL "Release")
-  if(WIN32)
-    set(LINK_FLAGS "${LINK_FLAGS} /LTCG /OPT:REF /OPT:ICF")
-  else()
-    set(LINK_FLAGS "${LINK_FLAGS} -flto -Wl,-O1 -Wl,--as-needed")
-  endif()
+  set(LINK_FLAGS "${LINK_FLAGS} -flto -fuse-ld=lld -Wl,-O1 -Wl,--as-needed")
 endif()

 add_subdirectory(libs)
--- a/include/errors.hpp
+++ b/include/errors.hpp
@@ -5,36 +5,56 @@
 #include <iostream>
 #include <sycl/sycl.hpp>

-typedef enum {
-  base16384_errors_code_ok,
-  base16384_errors_code_sync_sycl_exception,
-  base16384_errors_code_std_exception,
-  base16384_errors_code_unknown_exception,
-} base16384_errors_code_enum_t;
-
 template <typename T>
 concept base16384_has_what_concept_t = requires(T t) { t.what(); };

-template <base16384_has_what_concept_t T>
-static void base16384_print_what(T e, std::string msg) {
-  std::cerr << msg << e.what() << std::endl;
-}
+template <typename F>
+concept base16384_callable_concept_t = requires(F f) { f(); };

-// failed try to exec fn, catch and print .what() when exception is thrown.
-static base16384_errors_code_enum_t base16384_try_failed(std::function<void(void)> fn) {
-  try {
-    fn();
-  } catch (sycl::exception &e) {
-    base16384_print_what(e, "Caught sync SYCL exception: ");
-    return base16384_errors_code_sync_sycl_exception;
-  } catch (std::exception &e) {
-    base16384_print_what(e, "Caught std exception: ");
-    return base16384_errors_code_std_exception;
-  } catch (...) {
-    std::cerr << "Caught unknown exception." << std::endl;
-    return base16384_errors_code_unknown_exception;
-  }
-  return base16384_errors_code_ok;
-}
+namespace base16384 {
+class errors {
+ private:
+  errors() = default;
+
+  template <base16384_has_what_concept_t T>
+  static void print_what(T e, std::string msg) {
+    std::cerr << msg << e.what() << std::endl;
+  };
+
+ public:
+  errors(const errors &) = delete;
+  errors(errors &&) = delete;
+  errors &operator=(const errors &) = delete;
+  errors &operator=(errors &&) = delete;
+  auto operator<=>(const errors &) const = delete;
+  ~errors() noexcept = default;
+
+  typedef enum {
+    code_ok,
+    code_sync_sycl_exception,
+    code_std_exception,
+    code_unknown_exception,
+  } code_enum_t;
+
+  // failed try to exec fn, catch and print .what() when exception is thrown.
+  template <base16384_callable_concept_t F>
+  static code_enum_t try_failed(F &&fn) {
+    try {
+      fn();
+    } catch (sycl::exception &e) {
+      print_what(e, "Caught sync SYCL exception: ");
+      return code_sync_sycl_exception;
+    } catch (std::exception &e) {
+      print_what(e, "Caught std exception: ");
+      return code_std_exception;
+    } catch (...) {
+      std::cerr << "Caught unknown exception." << std::endl;
+      return code_unknown_exception;
+    }
+    return code_ok;
+  };
+};
+
+}  // namespace base16384

 #endif
--- a/include/xeinfo.hpp
+++ b/include/xeinfo.hpp
@@ -0,0 +1,101 @@
+#ifndef _XEINFO_HPP_
+#define _XEINFO_HPP_
+
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <sycl/sycl.hpp>
+#include <vector>
+
+namespace base16384 {
+
+class xeinfo {
+ private:
+  std::pair<size_t, int> calculate_optimal_sizes() const {
+    size_t best_sub_group_size = sub_group_sizes[0];
+    int best_work_group_size = 0;
+
+    for (auto sg_size : sub_group_sizes) {
+      int wg_size = num_thread_per_xecore * sg_size;
+      if (wg_size <= max_work_group_size && wg_size > best_work_group_size) {
+        best_sub_group_size = sg_size;
+        best_work_group_size = 1 << (31 - __builtin_clz(static_cast<unsigned>(wg_size)));
+      }
+    }
+
+    return {std::move(best_sub_group_size), std::move(best_work_group_size)};
+  }
+
+ public:
+  xeinfo(sycl::device device) noexcept
+      : num_slices(device.get_info<sycl::ext::intel::info::device::gpu_slices>()),
+        num_subslices_per_slice(
+            device.get_info<sycl::ext::intel::info::device::gpu_subslices_per_slice>()),
+        num_eus_per_subslice(
+            device.get_info<sycl::ext::intel::info::device::gpu_eu_count_per_subslice>()),
+        num_threads_per_eu(
+            device.get_info<sycl::ext::intel::info::device::gpu_hw_threads_per_eu>()),
+        global_mem_size(device.get_info<sycl::info::device::global_mem_size>()),
+        local_mem_size(device.get_info<sycl::info::device::local_mem_size>()),
+        max_work_group_size(device.get_info<sycl::info::device::max_work_group_size>()),
+        sub_group_sizes(device.get_info<sycl::info::device::sub_group_sizes>()),
+        num_thread_per_xecore(num_eus_per_subslice * num_threads_per_eu),
+        total_xecores(num_slices * num_subslices_per_slice),
+        total_vector_engines(num_slices * num_subslices_per_slice * num_eus_per_subslice),
+        total_hardware_threads(num_slices * num_subslices_per_slice * num_eus_per_subslice *
+                               num_threads_per_eu),
+        optimal_sizes(calculate_optimal_sizes()),
+        sub_group_size(optimal_sizes.first),
+        work_group_size(optimal_sizes.second) {}
+
+  xeinfo(const xeinfo&) = delete;
+  xeinfo(xeinfo&&) = delete;
+  xeinfo& operator=(const xeinfo&) = delete;
+  xeinfo& operator=(xeinfo&&) = delete;
+  auto operator<=>(const xeinfo&) const = delete;
+  ~xeinfo() noexcept = default;
+
+  const int num_slices;
+  const int num_subslices_per_slice;
+  const int num_eus_per_subslice;
+  const int num_threads_per_eu;
+  const int global_mem_size;
+  const int local_mem_size;
+  const int max_work_group_size;
+  const std::vector<unsigned long long> sub_group_sizes;
+  const int num_thread_per_xecore;
+  const int total_xecores;
+  const int total_vector_engines;
+  const int total_hardware_threads;
+
+ private:
+  const std::pair<size_t, int> optimal_sizes;
+
+ public:
+  const size_t sub_group_size;
+  const int work_group_size;
+
+  std::string string() const {
+    std::ostringstream builder;
+    builder << "Intel GPU 特性:\n";
+    builder << "  XeCore 数量: " << total_xecores << "\n";
+    builder << "  每个 XeCore 的向量引擎数: " << num_eus_per_subslice << "\n";
+    builder << "  向量引擎总数: " << total_vector_engines << "\n";
+    builder << "  每个 XeCore 的硬件线程数: " << num_thread_per_xecore << "\n";
+    builder << "  每个向量引擎的硬件线程数: " << num_threads_per_eu << "\n";
+    builder << "  硬件线程总数: " << total_hardware_threads << "\n";
+    builder << "  GPU 内存大小: " << global_mem_size << " 字节\n";
+    builder << "  每个工作组的共享本地内存: " << local_mem_size << " 字节\n";
+    builder << "  最大工作组大小: " << max_work_group_size << "\n";
+    builder << "  支持的子组大小:";
+    for (size_t i = 0; i < sub_group_sizes.size(); i++) builder << " " << sub_group_sizes[i];
+    builder << "\n";
+    builder << "  推荐选择子组大小: " << sub_group_size << "\n";
+    builder << "  100% 占用率工作组大小: " << work_group_size;
+    return builder.str();
+  }
+};
+
+}  // namespace base16384
+
+#endif  // _XEINFO_HPP_
--- a/tests/basic.cpp
+++ b/tests/basic.cpp
@@ -1,15 +1,23 @@
-#include <chrono>
-#include <iostream>
-#include <sycl/sycl.hpp>
-#include <vector>
+#include <stdint.h>
 #ifdef _WIN32
 #include <windows.h>
+#undef min
+#undef max
 #endif

-#include "errors.hpp"
+#include <chrono>
+#include <iomanip>
+#include <iostream>
+#include <random>
+#include <ranges>
+#include <sycl/sycl.hpp>
+#include <vector>

-static const int N = 65536;
-static const int work_group_size = 64;
+#include "errors.hpp"
+#include "xeinfo.hpp"
+
+constexpr int iter_count = 65536;
+constexpr int N = 65536;

 int main() {
 #ifdef _WIN32
@@ -19,8 +27,9 @@ int main() {
 #endif
  sycl::queue q;

-  auto device = q.get_device();
-  std::cout << "执行设备: " << device.get_info<sycl::info::device::name>() << std::endl;
+  const sycl::device device;
+  const auto device_name = device.get_info<sycl::info::device::name>();
+  std::cout << "执行设备: " << device_name << std::endl;
  std::cout << "设备类型: ";
  if (device.is_cpu()) {
    std::cout << "CPU" << std::endl;
@@ -30,47 +39,105 @@ int main() {
    std::cout << "其他" << std::endl;
  }

+  int work_group_size = 64;
+  if (device.is_gpu() && device_name.starts_with("Intel")) {
+    try {
+      auto xeinfo = base16384::xeinfo(device);
+      work_group_size = xeinfo.work_group_size;
+      std::cout << "\n" << xeinfo.string() << "\n\n";
+    } catch (const sycl::exception& e) {
+      std::cout << "获取Intel GPU信息失败 (可能不是Intel设备): " << e.what() << std::endl;
+      std::cout << "使用默认工作组大小: " << work_group_size << "\n\n";
+    }
+  }
+
+  // Generate random initial data
+  std::random_device rd;
+  std::mt19937 gen{rd()};
+  std::uniform_int_distribution<int> dis{0, 255};
+
+  std::vector<uint8_t> initial_data(N);
+  for (auto& byte : initial_data) {
+    byte = static_cast<uint8_t>(dis(gen));
+  }
+
  // CPU baseline test
-  std::vector<int> cpu_data(N);
-  for (int i = 0; i < N; i++) cpu_data[i] = i;
+  auto cpu_data = initial_data;

  auto start_time = std::chrono::high_resolution_clock::now();
-  for (int i = 0; i < N; i++) cpu_data[i] *= 2;
+  for (int j = 0; j < iter_count; j++) {
+    for (auto& byte : cpu_data) {
+      // 复杂计算：多步数学运算组合
+      uint8_t temp = byte;
+      temp = (temp * temp) % 251;  // 使用质数避免快速收敛
+      temp = temp ^ (temp >> 2);   // 位运算
+      temp = (temp + 17) % 256;    // 加法和模运算
+      temp = temp * 3 % 256;       // 乘法
+      byte = temp ^ (temp << 1);   // 最终位运算
+    }
+  }
  auto end_time = std::chrono::high_resolution_clock::now();
  auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time);

-  std::cout << "CPU (" << duration.count() << " us):" << std::endl;
-  for (int i = 0; i < min(N, 64); i++) std::cout << " " << cpu_data[i];
+  std::cout << "CPU (" << std::fixed << std::setprecision(1) << duration.count() / 1000.0
+            << " ms):";
+  for (int i = 0; i < std::min(N, 64); i++) std::cout << " " << static_cast<int>(cpu_data[i]);
  std::cout << "..." << std::endl;

-  int *data = sycl::malloc_shared<int>(N, q);
-  for (int i = 0; i < N; i++) data[i] = i;
+  auto* data = sycl::malloc_shared<std::uint8_t>(N, q);
+  std::copy(initial_data.cbegin(), initial_data.cend(), data);

  // test basic parallel kernel
  start_time = std::chrono::high_resolution_clock::now();
-  auto errn = base16384_try_failed(
-      [&]() { q.parallel_for(sycl::range<1>(N), [=](sycl::id<1> i) { data[i] *= 2; }).wait(); });
-  end_time = std::chrono::high_resolution_clock::now();
-  duration = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time);
-  if (errn) return errn;
-
-  std::cout << "GPU基本并行 (" << duration.count() << " us):" << std::endl;
-  for (int i = 0; i < min(N, 64); i++) std::cout << " " << data[i];
-  std::cout << "..." << std::endl;
-
-  start_time = std::chrono::high_resolution_clock::now();
-  errn = base16384_try_failed([&]() {
-    q.parallel_for(sycl::nd_range<1>(N, work_group_size), [=](sycl::nd_item<1> item) {
-       int i = item.get_global_id(0);
-       data[i] /= 2;
-     }).wait();
+  auto errn = base16384::errors::try_failed([&]() {
+    for (int j = 0; j < iter_count; j++) {
+      q.parallel_for(sycl::range<1>(N), [=](sycl::id<1> i) {
+        // 复杂计算：多步数学运算组合
+        uint8_t temp = data[i];
+        temp = (temp * temp) % 251;    // 使用质数避免快速收敛
+        temp = temp ^ (temp >> 2);     // 位运算
+        temp = (temp + 17) % 256;      // 加法和模运算
+        temp = temp * 3 % 256;         // 乘法
+        data[i] = temp ^ (temp << 1);  // 最终位运算
+      });
+    }
+    q.wait();
  });
  end_time = std::chrono::high_resolution_clock::now();
  duration = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time);
  if (errn) return errn;

-  std::cout << "GPU高级并行 (" << duration.count() << " us):" << std::endl;
-  for (int i = 0; i < min(N, 64); i++) std::cout << " " << data[i];
+  std::cout << "GPU 基本并行 (" << std::fixed << std::setprecision(1) << duration.count() / 1000.0
+            << " ms):";
+  for (int i = 0; i < std::min(N, 64); i++) std::cout << " " << static_cast<int>(data[i]);
+  std::cout << "..." << std::endl;
+
+  std::copy(initial_data.cbegin(), initial_data.cend(), data);
+
+  start_time = std::chrono::high_resolution_clock::now();
+  errn = base16384::errors::try_failed([&]() {
+    for (int j = 0; j < iter_count; j++) {
+      q.parallel_for(sycl::nd_range<1>(N, work_group_size),
+                     [=](sycl::nd_item<1> item) {  // sub-group size
+                       const auto i = item.get_global_id(0);
+                       // 复杂计算：多步数学运算组合
+                       uint8_t temp = data[i];
+                       temp = (temp * temp) % 251;    // 使用质数避免快速收敛
+                       temp = temp ^ (temp >> 2);     // 位运算
+                       temp = (temp + 17) % 256;      // 加法和模运算
+                       temp = temp * 3 % 256;         // 乘法
+                       data[i] = temp ^ (temp << 1);  // 最终位运算
+                     });
+    }
+    q.wait();
+  });
+  end_time = std::chrono::high_resolution_clock::now();
+  duration = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time);
+  if (errn) return errn;
+
+  std::cout << "GPU 高级并行 (" << std::fixed << std::setprecision(1) << duration.count() / 1000.0
+            << " ms):";
+  for (int i = 0; i < std::min(N, 64); i++) std::cout << " " << static_cast<int>(data[i]);
  std::cout << "..." << std::endl;

  sycl::free(data, q);