mirror of
https://github.com/fumiama/gozel.git
synced 2026-06-05 00:10:24 +08:00
feat(examples): add image_scale (#7)
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
This commit is contained in:
60
examples/image_scale/README.md
Normal file
60
examples/image_scale/README.md
Normal file
@@ -0,0 +1,60 @@
|
||||
# Image Scaling — GPU Bilinear Resize with Sampler
|
||||
|
||||
Downscale an image on the GPU using Level Zero's native **image** and **sampler** objects. The sampler performs hardware-accelerated bilinear interpolation, producing a high-quality resized image in a single kernel dispatch.
|
||||
|
||||
## What It Does
|
||||
|
||||
1. Decodes an embedded WebP image (1272 × 855) and converts it to RGBA
|
||||
2. Computes the target dimensions (capped at 512 px on the longest side)
|
||||
3. Discovers a GPU device and prints its basic & compute properties
|
||||
4. Creates a SPIR-V module from an OpenCL C kernel compiled offline
|
||||
5. Uses `zeKernelSuggestGroupSize` to pick an optimal 2-D workgroup size
|
||||
6. Allocates host/device memory and two Level Zero **image objects** (input & output)
|
||||
7. Creates a **sampler** with clamp addressing and bilinear filtering
|
||||
8. Executes three command lists via a command queue:
|
||||
- **Pre**: copy host pixels → device buffer → input image
|
||||
- **Compute**: launch the `scale` kernel
|
||||
- **Post**: copy output image → device buffer → host memory
|
||||
9. Writes the result to `small.png`
|
||||
|
||||
## Run
|
||||
|
||||
```bash
|
||||
go run main.go
|
||||
```
|
||||
|
||||
## Result
|
||||
|
||||
| Before Scaling (1272 × 855) | After Scaling (512 × 344) |
|
||||
|:----------------------------:|:-------------------------:|
|
||||
|  |  |
|
||||
|
||||
### Console Output
|
||||
|
||||
```
|
||||
=============== Image Information ===============
|
||||
Image Format: webp
|
||||
Image W/H ratio: 1.4877
|
||||
Image Size: 1272 x 855
|
||||
Scale to Image Size: 512 x 344
|
||||
Scale ratio: 0.4025
|
||||
Image Data Size: 144802 bytes
|
||||
=============== Device Basic Properties ===============
|
||||
Running on device: ID = 32103 , Name = Intel(R) Graphics @ 0.00 GHz.
|
||||
=============== Device Compute Properties ===============
|
||||
Max Group Size (X, Y, Z): (1024, 1024, 1024)
|
||||
Max Group Count (X, Y, Z): (4294967295, 4294967295, 4294967295)
|
||||
Max Total Group Size: 1024
|
||||
Max Shared Local Memory: 65536
|
||||
Subgroup Sizes: [8 16 32]
|
||||
=============== Computation Configuration ===============
|
||||
Group Size (X, Y, Z): (64, 4, 1)
|
||||
Group Count (X, Y, Z): (8, 86, 1)
|
||||
Total Elements (srcN, dstN): (4350240, 704512)
|
||||
Source Buffer Size: 4248.28 KiB
|
||||
Dest Buffer Size: 688.00 KiB
|
||||
=============== Calculation Results ===============
|
||||
GPU Execution Time: 1.579000 ms
|
||||
GPU Throughput: 2.76 GiB/s
|
||||
Test Passed!!!
|
||||
```
|
||||
19
examples/image_scale/main.cl
Normal file
19
examples/image_scale/main.cl
Normal file
@@ -0,0 +1,19 @@
|
||||
kernel void scale(
|
||||
read_only image2d_t inputImg,
|
||||
sampler_t smp,
|
||||
write_only image2d_t outputImg)
|
||||
{
|
||||
uint x = get_global_id(0);
|
||||
uint y = get_global_id(1);
|
||||
uint outW = get_image_width(outputImg);
|
||||
uint outH = get_image_height(outputImg);
|
||||
|
||||
float2 normCoord = (float2)(
|
||||
(float)x / (float)outW,
|
||||
(float)y / (float)outH
|
||||
);
|
||||
|
||||
float4 pixel = read_imagef(inputImg, smp, normCoord);
|
||||
|
||||
write_imagef(outputImg, (int2)(x, y), pixel);
|
||||
}
|
||||
314
examples/image_scale/main.go
Normal file
314
examples/image_scale/main.go
Normal file
@@ -0,0 +1,314 @@
|
||||
// Package main demonstrates vector addition using the gozel Level Zero bindings.
|
||||
package main
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
_ "embed"
|
||||
"fmt"
|
||||
"image"
|
||||
"image/draw"
|
||||
"image/png"
|
||||
"math"
|
||||
"os"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
"unsafe"
|
||||
|
||||
_ "golang.org/x/image/webp"
|
||||
|
||||
"github.com/fumiama/gozel/gozel"
|
||||
"github.com/fumiama/gozel/ze"
|
||||
)
|
||||
|
||||
//go:generate ocloc compile -file main.cl -spv_only -options "-cl-mad-enable -cl-fast-relaxed-math -cl-finite-math-only -cl-single-precision-constant" -internal_options "-O3" -output main
|
||||
//go:generate llvm-spirv -to-text main_.spv -o main.spt
|
||||
|
||||
//go:embed main_.spv
|
||||
var kernelspv []byte
|
||||
|
||||
//go:embed 暖笺贺春.webp
|
||||
var imagebytes []byte
|
||||
|
||||
func main() {
|
||||
img, format, err := image.Decode(bytes.NewReader(imagebytes))
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
bounds := img.Bounds()
|
||||
width := bounds.Dx()
|
||||
height := bounds.Dy()
|
||||
ratio := float64(width) / float64(height)
|
||||
imgrgba := image.NewRGBA(bounds)
|
||||
draw.Draw(imgrgba, bounds, img, bounds.Min, draw.Src)
|
||||
dstw, dsth := width, height
|
||||
if dstw > 512 {
|
||||
dstw = 512
|
||||
dsth = int(float64(dstw) / ratio)
|
||||
}
|
||||
if dsth > 512 {
|
||||
dsth = 512
|
||||
dstw = int(float64(dsth) * ratio)
|
||||
}
|
||||
scaleRatio := float32(float64(dstw) / float64(width))
|
||||
|
||||
fmt.Println("=============== Image Information ===============")
|
||||
fmt.Printf("%-28s %s\n", "Image Format:", format)
|
||||
fmt.Printf("%-28s %.04f\n", "Image W/H ratio:", ratio)
|
||||
fmt.Printf("%-28s %d x %d\n", "Image Size:", width, height)
|
||||
fmt.Printf("%-28s %d x %d\n", "Scale to Image Size:", dstw, dsth)
|
||||
fmt.Printf("%-28s %.04f\n", "Scale ratio:", scaleRatio)
|
||||
fmt.Printf("%-28s %d bytes\n", "Image Data Size:", len(imagebytes))
|
||||
|
||||
gpus, err := ze.InitGPUDrivers()
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
if len(gpus) == 0 {
|
||||
panic("no gpu available")
|
||||
}
|
||||
gpu := gpus[0]
|
||||
|
||||
ctx, err := gpu.ContextCreate()
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
devs, err := gpu.DeviceGet()
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
if len(devs) == 0 {
|
||||
panic("no device available")
|
||||
}
|
||||
dev := devs[0]
|
||||
|
||||
prop, err := dev.DeviceGetProperties()
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
fmt.Println("=============== Device Basic Properties ===============")
|
||||
name, _, _ := strings.Cut(string(prop.Name[:]), "\x00")
|
||||
fmt.Println(
|
||||
"Running on device: ID =", prop.Deviceid, ", Name =", name,
|
||||
"@", strconv.FormatFloat(float64(prop.Coreclockrate)/1024/1024/1024, 'f', 2, 64), "GHz.",
|
||||
)
|
||||
|
||||
cprop, err := dev.DeviceGetComputeProperties()
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
fmt.Println("=============== Device Compute Properties ===============")
|
||||
fmt.Printf("%-28s (%d, %d, %d)\n", "Max Group Size (X, Y, Z):", cprop.Maxgroupsizex, cprop.Maxgroupsizey, cprop.Maxgroupsizez)
|
||||
fmt.Printf("%-28s (%d, %d, %d)\n", "Max Group Count (X, Y, Z):", cprop.Maxgroupcountx, cprop.Maxgroupcounty, cprop.Maxgroupcountz)
|
||||
fmt.Printf("%-28s %d\n", "Max Total Group Size:", cprop.Maxtotalgroupsize)
|
||||
fmt.Printf("%-28s %d\n", "Max Shared Local Memory:", cprop.Maxsharedlocalmemory)
|
||||
fmt.Printf("%-28s %v\n", "Subgroup Sizes:", cprop.Subgroupsizes[:cprop.Numsubgroupsizes])
|
||||
|
||||
mod, err := ctx.ModuleCreate(dev, kernelspv)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
defer mod.Destroy()
|
||||
|
||||
krn, err := mod.KernelCreate("scale")
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
defer krn.Destroy()
|
||||
|
||||
gX, gY, _, err := krn.SuggestGroupSize(uint32(dstw), uint32(dsth), 1)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
var (
|
||||
X = uintptr(gX)
|
||||
Y = uintptr(gY)
|
||||
groupCountX = uint32(math.Ceil(float64(dstw) / float64(X)))
|
||||
groupCountY = uint32(math.Ceil(float64(dsth) / float64(Y)))
|
||||
srcN = uintptr(width * height * 4) // 4 for RGBA
|
||||
dstN = X * uintptr(groupCountX) * Y * uintptr(groupCountY) * 4 // 4 for RGBA
|
||||
srcbufsz = srcN * unsafe.Sizeof(uint8(0))
|
||||
dstbufsz = dstN * unsafe.Sizeof(uint8(0))
|
||||
)
|
||||
fmt.Println("=============== Computation Configuration ===============")
|
||||
fmt.Printf("%-28s (%d, %d, %d)\n", "Group Size (X, Y, Z):", X, Y, 1)
|
||||
fmt.Printf("%-28s (%d, %d, %d)\n", "Group Count (X, Y, Z):", groupCountX, groupCountY, 1)
|
||||
fmt.Printf("%-28s (%d, %d)\n", "Total Elements (srcN, dstN):", srcN, dstN)
|
||||
fmt.Printf("%-28s %.02f KiB\n", "Source Buffer Size:", float64(srcbufsz)/1024)
|
||||
fmt.Printf("%-28s %.02f KiB\n", "Dest Buffer Size:", float64(dstbufsz)/1024)
|
||||
|
||||
q, err := ctx.CommandQueueCreate(dev, gozel.ZE_COMMAND_QUEUE_MODE_DEFAULT)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
defer q.Destroy()
|
||||
|
||||
hbuf, err := ctx.MemAllocHost(srcbufsz, 1)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
defer ctx.MemFree(hbuf)
|
||||
|
||||
dbuf, err := ctx.MemAllocDevice(dev, srcbufsz, 1)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
defer ctx.MemFree(dbuf)
|
||||
|
||||
himg := unsafe.Slice((*uint8)(hbuf), srcN)
|
||||
copy(himg, imgrgba.Pix)
|
||||
|
||||
rgbaFmt := gozel.ZeImageFormat{
|
||||
Layout: gozel.ZE_IMAGE_FORMAT_LAYOUT_8_8_8_8,
|
||||
Type: gozel.ZE_IMAGE_FORMAT_TYPE_UNORM, // UNORM: bilinear sampling returns float [0,1]
|
||||
X: gozel.ZE_IMAGE_FORMAT_SWIZZLE_R,
|
||||
Y: gozel.ZE_IMAGE_FORMAT_SWIZZLE_G,
|
||||
Z: gozel.ZE_IMAGE_FORMAT_SWIZZLE_B,
|
||||
W: gozel.ZE_IMAGE_FORMAT_SWIZZLE_A,
|
||||
}
|
||||
input, err := ctx.ImageCreate(dev, 0, rgbaFmt, uint64(width), uint32(height))
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
defer input.Destroy()
|
||||
|
||||
smp, err := ctx.SamplerCreate(
|
||||
dev, gozel.ZE_SAMPLER_ADDRESS_MODE_CLAMP,
|
||||
gozel.ZE_SAMPLER_FILTER_MODE_LINEAR, 1,
|
||||
)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
defer smp.Destroy()
|
||||
|
||||
output, err := ctx.ImageCreate(
|
||||
dev, gozel.ZE_IMAGE_FLAG_KERNEL_WRITE,
|
||||
rgbaFmt, uint64(dstw), uint32(dsth),
|
||||
)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
defer output.Destroy()
|
||||
|
||||
err = krn.SetArgumentValue(0, input)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
err = krn.SetArgumentValue(1, smp)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
err = krn.SetArgumentValue(2, output)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
err = krn.SetGroupSize(uint32(X), uint32(Y), 1)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
lstpre, err := ctx.CommandListCreate(dev)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
defer lstpre.Destroy()
|
||||
|
||||
err = lstpre.AppendMemoryCopy(dbuf, hbuf, srcbufsz, 0)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
err = lstpre.AppendBarrier(0)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
err = lstpre.AppendImageCopyFromMemory(input, dbuf, nil, 0)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
err = lstpre.AppendBarrier(0)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
err = lstpre.Close()
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
lstcalc, err := ctx.CommandListCreate(dev)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
defer lstcalc.Destroy()
|
||||
|
||||
err = lstcalc.AppendLaunchKernel(krn, &gozel.ZeGroupCount{
|
||||
Groupcountx: groupCountX, Groupcounty: groupCountY, Groupcountz: 1,
|
||||
}, 0)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
err = lstcalc.AppendBarrier(0)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
err = lstcalc.Close()
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
lstpost, err := ctx.CommandListCreate(dev)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
defer lstpost.Destroy()
|
||||
|
||||
err = lstpost.AppendImageCopyToMemory(dbuf, output, nil, 0)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
err = lstpost.AppendMemoryCopy(hbuf, dbuf, dstbufsz, 0)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
err = lstpost.Close()
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
start := time.Now()
|
||||
err = q.ExecuteCommandLists(lstpre, lstcalc, lstpost)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
err = q.Synchronize(math.MaxUint64)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
elapsed := time.Since(start)
|
||||
|
||||
fmt.Println("=============== Calculation Results ===============")
|
||||
fmt.Printf("%-28s %.6f ms\n", "GPU Execution Time:", elapsed.Seconds()*1000)
|
||||
fmt.Printf("%-28s %.2f GiB/s\n", "GPU Throughput:", float64(srcbufsz)/elapsed.Seconds()/1e9)
|
||||
|
||||
newimgrgba := image.NewRGBA(image.Rect(0, 0, dstw, dsth))
|
||||
copy(newimgrgba.Pix, himg)
|
||||
file, err := os.Create("small.png")
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
defer file.Close()
|
||||
err = png.Encode(file, newimgrgba)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
fmt.Println("Test Passed!!!")
|
||||
}
|
||||
119
examples/image_scale/main.spt
Normal file
119
examples/image_scale/main.spt
Normal file
@@ -0,0 +1,119 @@
|
||||
119734787 65536 393230 61 0
|
||||
2 Capability Addresses
|
||||
2 Capability Linkage
|
||||
2 Capability Kernel
|
||||
2 Capability Int64
|
||||
2 Capability ImageBasic
|
||||
5 ExtInstImport 1 "OpenCL.std"
|
||||
3 MemoryModel 2 2
|
||||
6 EntryPoint 6 53 "scale" 5
|
||||
16 String 59 "kernel_arg_type.scale.image2d_t,sampler_t,image2d_t,"
|
||||
10 String 60 "kernel_arg_type_qual.scale.,,,"
|
||||
3 Source 3 102000
|
||||
11 Name 5 "__spirv_BuiltInGlobalInvocationId"
|
||||
4 Name 11 "scale"
|
||||
5 Name 12 "inputImg"
|
||||
3 Name 13 "smp"
|
||||
5 Name 14 "outputImg"
|
||||
4 Name 15 "entry"
|
||||
4 Name 21 "call"
|
||||
4 Name 23 "conv"
|
||||
4 Name 26 "call1"
|
||||
4 Name 27 "conv2"
|
||||
4 Name 30 "call31"
|
||||
4 Name 31 "call3"
|
||||
4 Name 32 "call42"
|
||||
4 Name 33 "call4"
|
||||
4 Name 35 "conv5"
|
||||
4 Name 36 "conv6"
|
||||
3 Name 37 "div"
|
||||
4 Name 40 "vecinit"
|
||||
4 Name 41 "conv7"
|
||||
4 Name 42 "conv8"
|
||||
4 Name 43 "div9"
|
||||
5 Name 44 "vecinit10"
|
||||
7 Name 46 "TempSampledImage"
|
||||
4 Name 49 "call11"
|
||||
5 Name 51 "vecinit13"
|
||||
5 Name 52 "vecinit14"
|
||||
5 Name 54 "inputImg"
|
||||
3 Name 55 "smp"
|
||||
5 Name 56 "outputImg"
|
||||
|
||||
13 Decorate 5 LinkageAttributes "__spirv_BuiltInGlobalInvocationId" Import
|
||||
3 Decorate 5 Constant
|
||||
4 Decorate 5 BuiltIn 28
|
||||
6 Decorate 11 LinkageAttributes "scale" Export
|
||||
4 Decorate 37 FPFastMathMode 16
|
||||
4 Decorate 43 FPFastMathMode 16
|
||||
4 TypeInt 2 64 0
|
||||
4 TypeInt 22 32 0
|
||||
5 Constant 2 18 0 0
|
||||
4 Constant 22 29 0
|
||||
4 TypeVector 3 2 3
|
||||
4 TypePointer 4 1 3
|
||||
2 TypeVoid 6
|
||||
10 TypeImage 7 6 1 0 0 0 0 0 0
|
||||
2 TypeSampler 8
|
||||
10 TypeImage 9 6 1 0 0 0 0 0 1
|
||||
6 TypeFunction 10 6 7 8 9
|
||||
2 TypeBool 19
|
||||
4 TypeVector 28 22 2
|
||||
3 TypeFloat 34 32
|
||||
4 TypeVector 38 34 2
|
||||
3 TypeSampledImage 45 7
|
||||
4 TypeVector 47 34 4
|
||||
4 Variable 4 5 1
|
||||
3 ConstantTrue 19 20
|
||||
3 Undef 38 39
|
||||
4 Constant 34 48 0
|
||||
3 Undef 28 50
|
||||
|
||||
|
||||
|
||||
5 Function 6 11 0 10
|
||||
3 FunctionParameter 7 12
|
||||
3 FunctionParameter 8 13
|
||||
3 FunctionParameter 9 14
|
||||
|
||||
2 Label 15
|
||||
6 Load 3 16 5 2 32
|
||||
5 CompositeExtract 2 17 16 0
|
||||
6 Select 2 21 20 17 18
|
||||
4 UConvert 22 23 21
|
||||
6 Load 3 24 5 2 32
|
||||
5 CompositeExtract 2 25 24 1
|
||||
6 Select 2 26 20 25 18
|
||||
4 UConvert 22 27 26
|
||||
5 ImageQuerySizeLod 28 30 14 29
|
||||
5 CompositeExtract 22 31 30 0
|
||||
5 ImageQuerySizeLod 28 32 14 29
|
||||
5 CompositeExtract 22 33 32 1
|
||||
4 ConvertUToF 34 35 23
|
||||
4 ConvertUToF 34 36 31
|
||||
5 FDiv 34 37 35 36
|
||||
6 CompositeInsert 38 40 37 39 0
|
||||
4 ConvertUToF 34 41 27
|
||||
4 ConvertUToF 34 42 33
|
||||
5 FDiv 34 43 41 42
|
||||
6 CompositeInsert 38 44 43 40 1
|
||||
5 SampledImage 45 46 12 13
|
||||
7 ImageSampleExplicitLod 47 49 46 44 2 48
|
||||
6 CompositeInsert 28 51 23 50 0
|
||||
6 CompositeInsert 28 52 27 51 1
|
||||
4 ImageWrite 14 52 49
|
||||
1 Return
|
||||
|
||||
1 FunctionEnd
|
||||
|
||||
5 Function 6 53 0 10
|
||||
3 FunctionParameter 7 54
|
||||
3 FunctionParameter 8 55
|
||||
3 FunctionParameter 9 56
|
||||
|
||||
2 Label 57
|
||||
7 FunctionCall 6 58 11 54 55 56
|
||||
1 Return
|
||||
|
||||
1 FunctionEnd
|
||||
|
||||
BIN
examples/image_scale/main_.spv
Normal file
BIN
examples/image_scale/main_.spv
Normal file
Binary file not shown.
BIN
examples/image_scale/small.png
Normal file
BIN
examples/image_scale/small.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 251 KiB |
BIN
examples/image_scale/暖笺贺春.webp
Normal file
BIN
examples/image_scale/暖笺贺春.webp
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 141 KiB |
21
examples/quick_start/README.md
Normal file
21
examples/quick_start/README.md
Normal file
@@ -0,0 +1,21 @@
|
||||
# Quick Start — Device Enumeration
|
||||
|
||||
The simplest gozel example: initialize the Level Zero runtime, enumerate all available GPU drivers and their devices, and print device names.
|
||||
|
||||
## What It Does
|
||||
|
||||
- Initializes Level Zero and retrieves all GPU driver handles
|
||||
- Iterates over devices under each driver, queries and prints device properties (name)
|
||||
|
||||
## Run
|
||||
|
||||
```bash
|
||||
go run main.go
|
||||
```
|
||||
|
||||
## Sample Output
|
||||
|
||||
```
|
||||
Found 1 GPU driver(s)
|
||||
Device: Intel(R) Graphics
|
||||
```
|
||||
48
examples/vadd/README.md
Normal file
48
examples/vadd/README.md
Normal file
@@ -0,0 +1,48 @@
|
||||
# Vector Addition — Command Queue
|
||||
|
||||
> ![Tips]
|
||||
> **SYCL** is used to write this kernel, which is not a common practice.
|
||||
> Please also have a look at the **OpenCL** kernel examples like [image_scale](../image_scale/).
|
||||
|
||||
A classic GPU compute example: perform element-wise addition of two large float32 vectors on the GPU, then validate the result against a CPU reference.
|
||||
|
||||
## What It Does
|
||||
|
||||
1. Discovers a GPU device and prints its basic & compute properties
|
||||
2. Allocates host and device memory for two float32 vectors (256 MiB each)
|
||||
3. Fills both vectors with random values and copies them to device memory
|
||||
4. Loads a SPIR-V kernel (`vector_add`) that computes `a[i] += b[i]` in parallel
|
||||
5. Launches the kernel via a **command queue** with explicit command lists (pre-copy → compute → post-copy)
|
||||
6. Reads back the results and validates every element against the CPU reference
|
||||
7. Reports GPU vs. CPU execution time and throughput
|
||||
|
||||
## Run
|
||||
|
||||
```bash
|
||||
go run main.go
|
||||
```
|
||||
|
||||
## Sample Output
|
||||
|
||||
```
|
||||
=============== Device Basic Properties ===============
|
||||
Running on device: ID = 32103 , Name = Intel(R) Graphics @ 0.00 GHz.
|
||||
=============== Device Compute Properties ===============
|
||||
Max Group Size (X, Y, Z): (1024, 1024, 1024)
|
||||
Max Group Count (X, Y, Z): (4294967295, 4294967295, 4294967295)
|
||||
Max Total Group Size: 1024
|
||||
Max Shared Local Memory: 65536
|
||||
Subgroup Sizes: [8 16 32]
|
||||
=============== Computation Configuration ===============
|
||||
Group Size (X, Y, Z): (1024, 1, 1)
|
||||
Group Count: 65536
|
||||
Total Elements (N): 67108864
|
||||
Buffer Size: 256 MiB
|
||||
=============== Calculation Results ===============
|
||||
GPU Execution Time: 53.858600 ms
|
||||
GPU Throughput: 4.98 GiB/s
|
||||
=============== Validation Results ===============
|
||||
CPU Execution Time: 65.882900 ms
|
||||
CPU Throughput: 4.07 GiB/s
|
||||
Test Passed!!!
|
||||
```
|
||||
@@ -16,10 +16,11 @@ import (
|
||||
"github.com/fumiama/gozel/ze"
|
||||
)
|
||||
|
||||
//go:generate clang++ -fsycl -fsycl-device-only -fsycl-targets=spirv64 -Xclang -emit-llvm-bc main.cpp -o device_kern.bc
|
||||
//go:generate sycl-post-link -symbols -split=auto -o device_kern.table device_kern.bc
|
||||
//go:generate llvm-spirv -o main.spv device_kern_0.bc
|
||||
//go:generate clang++ -fsycl -fsycl-device-only -fno-sycl-instrument-device-code -fsycl-targets=spirv64 -Xclang -emit-llvm-bc main.cpp -o device_kern.bc
|
||||
//go:generate sycl-post-link -symbols -split=auto -emit-param-info -properties -o device_kern.table device_kern.bc
|
||||
//go:generate llvm-spirv --sycl-opt -o main.spv device_kern_0.bc
|
||||
//go:generate clang++ -target spirv64-unknown-unknown -S -emit-llvm -x ir device_kern_0.bc -o main.ll
|
||||
//go:generate llvm-spirv -to-text main.spv -o main.spt
|
||||
|
||||
//go:embed main.spv
|
||||
var kernelspv []byte
|
||||
|
||||
79
examples/vadd/main.spt
Normal file
79
examples/vadd/main.spt
Normal file
@@ -0,0 +1,79 @@
|
||||
119734787 66560 393230 34 0
|
||||
2 Capability Addresses
|
||||
2 Capability Linkage
|
||||
2 Capability Kernel
|
||||
2 Capability Int64
|
||||
5 ExtInstImport 1 "OpenCL.std"
|
||||
3 MemoryModel 2 2
|
||||
12 EntryPoint 6 29 "__sycl_kernel_vector_add" 5 6
|
||||
3 ExecutionMode 29 31
|
||||
3 Source 4 100000
|
||||
11 Name 5 "__spirv_BuiltInGlobalInvocationId"
|
||||
9 Name 6 "__spirv_BuiltInGlobalOffset"
|
||||
9 Name 11 "__sycl_kernel_vector_add"
|
||||
|
||||
13 Decorate 5 LinkageAttributes "__spirv_BuiltInGlobalInvocationId" Import
|
||||
3 Decorate 5 Constant
|
||||
4 Decorate 5 BuiltIn 28
|
||||
4 Decorate 5 Alignment 32
|
||||
11 Decorate 6 LinkageAttributes "__spirv_BuiltInGlobalOffset" Import
|
||||
3 Decorate 6 Constant
|
||||
4 Decorate 6 BuiltIn 33
|
||||
4 Decorate 6 Alignment 32
|
||||
11 Decorate 11 LinkageAttributes "__sycl_kernel_vector_add" Export
|
||||
4 Decorate 12 FuncParamAttr 5
|
||||
4 Decorate 12 Alignment 4
|
||||
4 Decorate 13 FuncParamAttr 5
|
||||
4 Decorate 13 FuncParamAttr 6
|
||||
4 Decorate 13 Alignment 4
|
||||
4 Decorate 30 FuncParamAttr 5
|
||||
4 Decorate 30 Alignment 4
|
||||
4 Decorate 31 FuncParamAttr 5
|
||||
4 Decorate 31 FuncParamAttr 6
|
||||
4 Decorate 31 Alignment 4
|
||||
4 TypeInt 2 64 0
|
||||
5 Constant 2 21 2147483648 0
|
||||
4 TypeVector 3 2 3
|
||||
4 TypePointer 4 5 3
|
||||
2 TypeVoid 7
|
||||
3 TypeFloat 8 32
|
||||
4 TypePointer 9 5 8
|
||||
5 TypeFunction 10 7 9 9
|
||||
4 TypePointer 15 5 2
|
||||
2 TypeBool 22
|
||||
4 Variable 4 5 5
|
||||
4 Variable 4 6 5
|
||||
|
||||
|
||||
|
||||
5 Function 7 11 0 10
|
||||
3 FunctionParameter 9 12
|
||||
3 FunctionParameter 9 13
|
||||
|
||||
2 Label 14
|
||||
4 Bitcast 15 16 5
|
||||
6 Load 2 17 16 2 32
|
||||
4 Bitcast 15 18 6
|
||||
6 Load 2 19 18 2 32
|
||||
5 ISub 2 20 17 19
|
||||
5 ULessThan 22 23 20 21
|
||||
5 InBoundsPtrAccessChain 9 24 13 20
|
||||
6 Load 8 25 24 2 4
|
||||
5 InBoundsPtrAccessChain 9 26 12 20
|
||||
6 Load 8 27 26 2 4
|
||||
5 FAdd 8 28 27 25
|
||||
5 Store 26 28 2 4
|
||||
1 Return
|
||||
|
||||
1 FunctionEnd
|
||||
|
||||
5 Function 7 29 0 10
|
||||
3 FunctionParameter 9 30
|
||||
3 FunctionParameter 9 31
|
||||
|
||||
2 Label 32
|
||||
6 FunctionCall 7 33 11 30 31
|
||||
1 Return
|
||||
|
||||
1 FunctionEnd
|
||||
|
||||
61
examples/vadd_event/README.md
Normal file
61
examples/vadd_event/README.md
Normal file
@@ -0,0 +1,61 @@
|
||||
# Vector Addition — Immediate Command List with Events
|
||||
|
||||
> ![Tips]
|
||||
> **SYCL** is used to write this kernel, which is not a common practice.
|
||||
> Please also have a look at the **OpenCL** kernel examples like [image_scale](../image_scale/).
|
||||
|
||||
The same vector addition workload as the `vadd` example, but driven by an **immediate command list** and **events** instead of explicit command queues. This demonstrates fine-grained dependency tracking: memory copies signal events, and the kernel launch waits on those events before executing.
|
||||
|
||||
## What It Does
|
||||
|
||||
1. Discovers a GPU device and prints its basic & compute properties
|
||||
2. Allocates host and device memory for two float32 vectors (256 MiB each)
|
||||
3. Fills both vectors with random values
|
||||
4. Loads a SPIR-V kernel (`vector_add`) that computes `a[i] += b[i]` in parallel
|
||||
5. Creates an **event pool** with 3 events to express data-flow dependencies
|
||||
6. Submits all work through a single **immediate command list**:
|
||||
- Two H→D copies, each signaling its own event
|
||||
- Kernel launch that **waits** on both copy events before executing
|
||||
- D→H copy that waits on the kernel event
|
||||
7. Synchronizes via `HostSynchronize` on the immediate command list
|
||||
8. Validates every element against the CPU reference
|
||||
|
||||
## Key Difference from `vadd`
|
||||
|
||||
| Aspect | `vadd` | `vadd_event` |
|
||||
|--------|--------|-------------|
|
||||
| Submission | 3 separate command lists executed on a command queue | 1 immediate command list |
|
||||
| Synchronization | `zeCommandQueueSynchronize` | `zeCommandListHostSynchronize` |
|
||||
| Dependencies | Implicit via command list ordering + barriers | Explicit via events (wait lists) |
|
||||
|
||||
## Run
|
||||
|
||||
```bash
|
||||
go run main.go
|
||||
```
|
||||
|
||||
## Sample Output
|
||||
|
||||
```
|
||||
=============== Device Basic Properties ===============
|
||||
Running on device: ID = 32103 , Name = Intel(R) Graphics @ 0.00 GHz.
|
||||
=============== Device Compute Properties ===============
|
||||
Max Group Size (X, Y, Z): (1024, 1024, 1024)
|
||||
Max Group Count (X, Y, Z): (4294967295, 4294967295, 4294967295)
|
||||
Max Total Group Size: 1024
|
||||
Max Shared Local Memory: 65536
|
||||
Num Subgroup Sizes: 3
|
||||
Subgroup Sizes: [8 16 32 0 0 0 0 0]
|
||||
=============== Computation Configuration ===============
|
||||
Group Size (X, Y, Z): (1024, 1, 1)
|
||||
Group Count: 65536
|
||||
Total Elements (N): 67108864
|
||||
Buffer Size: 256 MiB
|
||||
=============== Calculation Results ===============
|
||||
GPU Execution Time: 51.768500 ms
|
||||
GPU Throughput: 5.19 GiB/s
|
||||
=============== Validation Results ===============
|
||||
CPU Execution Time: 38.237400 ms
|
||||
CPU Throughput: 7.02 GiB/s
|
||||
Test Passed!!!
|
||||
```
|
||||
@@ -16,10 +16,11 @@ import (
|
||||
"github.com/fumiama/gozel/ze"
|
||||
)
|
||||
|
||||
//go:generate clang++ -fsycl -fsycl-device-only -fsycl-targets=spirv64 -Xclang -emit-llvm-bc main.cpp -o device_kern.bc
|
||||
//go:generate sycl-post-link -symbols -split=auto -o device_kern.table device_kern.bc
|
||||
//go:generate llvm-spirv -o main.spv device_kern_0.bc
|
||||
//go:generate clang++ -fsycl -fsycl-device-only -fno-sycl-instrument-device-code -fsycl-targets=spirv64 -Xclang -emit-llvm-bc main.cpp -o device_kern.bc
|
||||
//go:generate sycl-post-link -symbols -split=auto -emit-param-info -properties -o device_kern.table device_kern.bc
|
||||
//go:generate llvm-spirv --sycl-opt -o main.spv device_kern_0.bc
|
||||
//go:generate clang++ -target spirv64-unknown-unknown -S -emit-llvm -x ir device_kern_0.bc -o main.ll
|
||||
//go:generate llvm-spirv -to-text main.spv -o main.spt
|
||||
|
||||
//go:embed main.spv
|
||||
var kernelspv []byte
|
||||
|
||||
79
examples/vadd_event/main.spt
Normal file
79
examples/vadd_event/main.spt
Normal file
@@ -0,0 +1,79 @@
|
||||
119734787 66560 393230 34 0
|
||||
2 Capability Addresses
|
||||
2 Capability Linkage
|
||||
2 Capability Kernel
|
||||
2 Capability Int64
|
||||
5 ExtInstImport 1 "OpenCL.std"
|
||||
3 MemoryModel 2 2
|
||||
12 EntryPoint 6 29 "__sycl_kernel_vector_add" 5 6
|
||||
3 ExecutionMode 29 31
|
||||
3 Source 4 100000
|
||||
11 Name 5 "__spirv_BuiltInGlobalInvocationId"
|
||||
9 Name 6 "__spirv_BuiltInGlobalOffset"
|
||||
9 Name 11 "__sycl_kernel_vector_add"
|
||||
|
||||
13 Decorate 5 LinkageAttributes "__spirv_BuiltInGlobalInvocationId" Import
|
||||
3 Decorate 5 Constant
|
||||
4 Decorate 5 BuiltIn 28
|
||||
4 Decorate 5 Alignment 32
|
||||
11 Decorate 6 LinkageAttributes "__spirv_BuiltInGlobalOffset" Import
|
||||
3 Decorate 6 Constant
|
||||
4 Decorate 6 BuiltIn 33
|
||||
4 Decorate 6 Alignment 32
|
||||
11 Decorate 11 LinkageAttributes "__sycl_kernel_vector_add" Export
|
||||
4 Decorate 12 FuncParamAttr 5
|
||||
4 Decorate 12 Alignment 4
|
||||
4 Decorate 13 FuncParamAttr 5
|
||||
4 Decorate 13 FuncParamAttr 6
|
||||
4 Decorate 13 Alignment 4
|
||||
4 Decorate 30 FuncParamAttr 5
|
||||
4 Decorate 30 Alignment 4
|
||||
4 Decorate 31 FuncParamAttr 5
|
||||
4 Decorate 31 FuncParamAttr 6
|
||||
4 Decorate 31 Alignment 4
|
||||
4 TypeInt 2 64 0
|
||||
5 Constant 2 21 2147483648 0
|
||||
4 TypeVector 3 2 3
|
||||
4 TypePointer 4 5 3
|
||||
2 TypeVoid 7
|
||||
3 TypeFloat 8 32
|
||||
4 TypePointer 9 5 8
|
||||
5 TypeFunction 10 7 9 9
|
||||
4 TypePointer 15 5 2
|
||||
2 TypeBool 22
|
||||
4 Variable 4 5 5
|
||||
4 Variable 4 6 5
|
||||
|
||||
|
||||
|
||||
5 Function 7 11 0 10
|
||||
3 FunctionParameter 9 12
|
||||
3 FunctionParameter 9 13
|
||||
|
||||
2 Label 14
|
||||
4 Bitcast 15 16 5
|
||||
6 Load 2 17 16 2 32
|
||||
4 Bitcast 15 18 6
|
||||
6 Load 2 19 18 2 32
|
||||
5 ISub 2 20 17 19
|
||||
5 ULessThan 22 23 20 21
|
||||
5 InBoundsPtrAccessChain 9 24 13 20
|
||||
6 Load 8 25 24 2 4
|
||||
5 InBoundsPtrAccessChain 9 26 12 20
|
||||
6 Load 8 27 26 2 4
|
||||
5 FAdd 8 28 27 25
|
||||
5 Store 26 28 2 4
|
||||
1 Return
|
||||
|
||||
1 FunctionEnd
|
||||
|
||||
5 Function 7 29 0 10
|
||||
3 FunctionParameter 9 30
|
||||
3 FunctionParameter 9 31
|
||||
|
||||
2 Label 32
|
||||
6 FunctionCall 7 33 11 30 31
|
||||
1 Return
|
||||
|
||||
1 FunctionEnd
|
||||
|
||||
Reference in New Issue
Block a user