From e04c1a6ec7092f7a6cdc2f5609a4d39aa3cef503 Mon Sep 17 00:00:00 2001 From: zhoujingya Date: Mon, 6 Nov 2023 16:07:27 +0800 Subject: [PATCH 1/6] [#56][fix] Fix workitem function(enqueued_local_size & local_linear_id) bugs in libclc Support get_enqueued_local_size function and fix the calculation of get_local_linear_id --- libclc/riscv32/lib/SOURCES | 1 + .../lib/workitem/get_enqueued_local_size.cl | 7 ++++ .../lib/workitem/get_local_linear_id.cl | 16 ++++++-- libclc/riscv32/lib/workitem/workitem.S | 40 ++++++++----------- 4 files changed, 37 insertions(+), 27 deletions(-) create mode 100644 libclc/riscv32/lib/workitem/get_enqueued_local_size.cl diff --git a/libclc/riscv32/lib/SOURCES b/libclc/riscv32/lib/SOURCES index 912378857f20..afa07f7bb320 100644 --- a/libclc/riscv32/lib/SOURCES +++ b/libclc/riscv32/lib/SOURCES @@ -8,6 +8,7 @@ workitem/get_local_linear_id.cl workitem/get_local_size.cl workitem/get_num_groups.cl workitem/get_work_dim.cl +workitem/get_enqueued_local_size.cl compiler-rt/nextafterf.cl compiler-rt/adddf3.cl diff --git a/libclc/riscv32/lib/workitem/get_enqueued_local_size.cl b/libclc/riscv32/lib/workitem/get_enqueued_local_size.cl new file mode 100644 index 000000000000..9f150bc61cec --- /dev/null +++ b/libclc/riscv32/lib/workitem/get_enqueued_local_size.cl @@ -0,0 +1,7 @@ +#include + +// get_global_size(unit dim) / get_num_groups(unit dim) + +_CLC_DEF _CLC_OVERLOAD size_t get_enqueued_local_size(uint dim) { + return get_local_size(dim); +} diff --git a/libclc/riscv32/lib/workitem/get_local_linear_id.cl b/libclc/riscv32/lib/workitem/get_local_linear_id.cl index 8efc64fb3aa4..9fed98472282 100644 --- a/libclc/riscv32/lib/workitem/get_local_linear_id.cl +++ b/libclc/riscv32/lib/workitem/get_local_linear_id.cl @@ -1,7 +1,17 @@ #include -extern size_t __builtin_riscv_workitem_linear_id(); - _CLC_DEF _CLC_OVERLOAD size_t get_local_linear_id() { - return __builtin_riscv_workitem_linear_id(); + uint dim = get_work_dim() - 1; + switch (dim) { + case 0: + return get_local_id(0); + case 1: + return get_local_id(1) * get_local_size(0) + get_local_id(0); + case 2: + return (get_local_id(2) * get_local_size(1) + get_local_id(1)) * + get_local_size(0) + + get_local_id(0); + default: + return 0; + } } diff --git a/libclc/riscv32/lib/workitem/workitem.S b/libclc/riscv32/lib/workitem/workitem.S index 78e7c7519ca4..054106ac2f1b 100644 --- a/libclc/riscv32/lib/workitem/workitem.S +++ b/libclc/riscv32/lib/workitem/workitem.S @@ -23,9 +23,9 @@ * * * global_id (uniform methods in 1/2/3 dims): - * get_global_id(0) = CSR_GID_X * local_size_x + local_id_x - * get_global_id(1) = CSR_GID_Y * local_size_y + local_id_y - * get_global_id(2) = CSR_GID_Z * local_size_z + local_id_z + * get_global_id(0) = _global_offset_x + CSR_GID_X * local_size_x + local_id_x + * get_global_id(1) = _global_offset_y + CSR_GID_Y * local_size_y + local_id_y + * get_global_id(2) = _global_offset_z + CSR_GID_Z * local_size_z + local_id_z * * * global_linear_id: @@ -65,17 +65,6 @@ _local_id_z: .word 0 // End workaround for pocl driver - .text - .global __builtin_riscv_workitem_linear_id - .type __builtin_riscv_workitem_linear_id, @function -__builtin_riscv_workitem_linear_id: - csrr a0, CSR_KNL # Get kernel metadata buffer - csrr t1, CSR_TID # tid base offset for current warp - vid.v v2 # current thread offset - vadd.vx v0, v2, t1 # local_linear_id - ret - - .text .global __builtin_riscv_global_linear_id .type __builtin_riscv_global_linear_id, @function @@ -145,16 +134,17 @@ __builtin_riscv_workgroup_id_z: .global __builtin_riscv_workitem_id_x .type __builtin_riscv_workitem_id_x, @function __builtin_riscv_workitem_id_x: + addi sp, sp, 4 + sw ra, -4(sp) csrr a0, CSR_KNL # Get kernel metadata buffer lw t0, KNL_WORK_DIM(a0) # Get work_dim csrr t1, CSR_TID # tid base offset for current warp vid.v v2 # current thread offset vadd.vx v0, v2, t1 # local_id_x in 1 dim (local_linear_id) - li t2, 1 - beq t0, t2, .WIXR lw t3, KNL_LC_SIZE_X(a0) # local_size_x vremu.vx v0, v0, t3 # local_id_x = local_liner_id % local_size_x -.WIXR: + lw ra, -4(sp) + addi sp, sp, -4 ret @@ -162,22 +152,20 @@ __builtin_riscv_workitem_id_x: .global __builtin_riscv_workitem_id_y .type __builtin_riscv_workitem_id_y, @function __builtin_riscv_workitem_id_y: + addi sp, sp, 4 + sw ra, -4(sp) csrr a0, CSR_KNL # Get kernel metadata buffer lw t0, KNL_WORK_DIM(a0) # Get work_dim csrr t1, CSR_TID # tid base offset for current warp vid.v v2 # current thread offset vadd.vx v0, v2, t1 # local_linear_id lw t3, KNL_LC_SIZE_X(a0) # local_size_x offset in 2 work_dims - li t2, 3 - beq t0, t2, .WIY3 -.WIY2: # 2 dims - vdivu.vx v0, v0, t3 # local_id_y = local_liner_id / local_size_x - ret -.WIY3: # 3 dims - lw t4, KNL_LC_SIZE_Y(a0) # local_size_y + lw t4, KNL_LC_SIZE_Y(a0) # local_size_y offset in 2 work_dims mul t4, t4, t3 # local_size_x * local_size_y vremu.vx v0, v0, t4 # x = local_linear_id % (local_size_x * local_size_y) vdivu.vx v0, v0, t3 # x / local_size_x + lw ra, -4(sp) + addi sp, sp, -4 ret @@ -185,6 +173,8 @@ __builtin_riscv_workitem_id_y: .global __builtin_riscv_workitem_id_z .type __builtin_riscv_workitem_id_z, @function __builtin_riscv_workitem_id_z: + addi sp, sp, 4 + sw ra, -4(sp) csrr a0, CSR_KNL # Get kernel metadata buffer csrr t1, CSR_TID # tid base offset for current warp vid.v v2 # current thread offset @@ -193,6 +183,8 @@ __builtin_riscv_workitem_id_z: lw t4, KNL_LC_SIZE_Y(a0) # local_size_y mul t4, t4, t3 # local_size_x * local_size_y vdivu.vx v0, v0, t4 # local_linear_id / (local_size_x * local_size_y) + lw ra, -4(sp) + addi sp, sp, -4 7: ret From efd82b9d8612420e9963e403fdbb1fc9e8035056 Mon Sep 17 00:00:00 2001 From: zhoujingya Date: Fri, 10 Nov 2023 09:26:41 +0800 Subject: [PATCH 2/6] [#56][fix] Fix the implementation of get_local_linear_id --- .../lib/workitem/get_global_linear_id.cl | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/libclc/riscv32/lib/workitem/get_global_linear_id.cl b/libclc/riscv32/lib/workitem/get_global_linear_id.cl index b1005e6b719e..59dce04250ec 100644 --- a/libclc/riscv32/lib/workitem/get_global_linear_id.cl +++ b/libclc/riscv32/lib/workitem/get_global_linear_id.cl @@ -1,7 +1,20 @@ #include -extern size_t __builtin_riscv_global_linear_id(); - _CLC_DEF _CLC_OVERLOAD size_t get_global_linear_id() { - return __builtin_riscv_global_linear_id(); + uint dim = get_work_dim() - 1; + switch (dim) { + case 0: + return get_global_id(0) - get_global_offset(0); + ; + case 1: + return (get_global_id(1) - get_global_offset(1)) * get_global_size(0) + + (get_global_id(0) - get_global_offset(0)); + case 2: + return ((get_global_id(2) - get_global_offset(2)) * get_global_size(1) + + (get_global_id(1) - get_global_offset(1))) * + get_global_size(0) + + (get_global_id(0) - get_global_offset(0)); + default: + return 0; + } } From 03759b1bed75d49dc574577735d0c05ce1f7dc73 Mon Sep 17 00:00:00 2001 From: zhoujing Date: Wed, 31 Jan 2024 14:14:44 +0800 Subject: [PATCH 3/6] [VENTUS][fix] Fix get_local_id builtin function implementation --- libclc/riscv32/lib/workitem/workitem.S | 39 ++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 6 deletions(-) diff --git a/libclc/riscv32/lib/workitem/workitem.S b/libclc/riscv32/lib/workitem/workitem.S index 054106ac2f1b..9a214715a6b3 100644 --- a/libclc/riscv32/lib/workitem/workitem.S +++ b/libclc/riscv32/lib/workitem/workitem.S @@ -143,7 +143,16 @@ __builtin_riscv_workitem_id_x: vadd.vx v0, v2, t1 # local_id_x in 1 dim (local_linear_id) lw t3, KNL_LC_SIZE_X(a0) # local_size_x vremu.vx v0, v0, t3 # local_id_x = local_liner_id % local_size_x - lw ra, -4(sp) + vmv.v.x v1, t3 +.hi1: + auipc t1, %pcrel_hi(.end1) + setrpc zero, t1, %pcrel_lo(.hi1) + vblt v0, v1, .end1 + li t5, -1 + vadd.vx v0, v1, t5 +.end1: + join zero, zero, 0 + lw ra, -4(sp) addi sp, sp, -4 ret @@ -161,10 +170,19 @@ __builtin_riscv_workitem_id_y: vadd.vx v0, v2, t1 # local_linear_id lw t3, KNL_LC_SIZE_X(a0) # local_size_x offset in 2 work_dims lw t4, KNL_LC_SIZE_Y(a0) # local_size_y offset in 2 work_dims - mul t4, t4, t3 # local_size_x * local_size_y - vremu.vx v0, v0, t4 # x = local_linear_id % (local_size_x * local_size_y) + mul t5, t5, t3 # local_size_x * local_size_y + vremu.vx v0, v0, t5 # x = local_linear_id % (local_size_x * local_size_y) vdivu.vx v0, v0, t3 # x / local_size_x - lw ra, -4(sp) + vmv.v.x v1, t4 +.hi2: + auipc t1, %pcrel_hi(.end2) + setrpc zero, t1, %pcrel_lo(.hi2) + vblt v0, v1, .end2 + li t5, -1 + vadd.vx v0, v1, t5 +.end2: + join zero, zero, 0 + lw ra, -4(sp) addi sp, sp, -4 ret @@ -181,11 +199,20 @@ __builtin_riscv_workitem_id_z: vadd.vx v0, v2, t1 # local_linear_id lw t3, KNL_LC_SIZE_X(a0) # local_size_x lw t4, KNL_LC_SIZE_Y(a0) # local_size_y + lw t5, KNL_LC_SIZE_Z(a0) # local_size_y mul t4, t4, t3 # local_size_x * local_size_y vdivu.vx v0, v0, t4 # local_linear_id / (local_size_x * local_size_y) - lw ra, -4(sp) + vmv.v.x v1, t5 +.hi3: + auipc t1, %pcrel_hi(.end3) + setrpc zero, t1, %pcrel_lo(.hi3) + vblt v0, v1, .end3 + li t5, -1 + vadd.vx v0, v1, t5 +.end3: + join zero, zero, 0 + lw ra, -4(sp) addi sp, sp, -4 -7: ret From 3bd573e3b3652e4dd905f717317c68a806f6c8c7 Mon Sep 17 00:00:00 2001 From: zhoujing Date: Wed, 31 Jan 2024 15:27:21 +0800 Subject: [PATCH 4/6] [VENTUS][fix] Remove codes and fix wrong register error in workitem.s --- libclc/riscv32/lib/workitem/workitem.S | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/libclc/riscv32/lib/workitem/workitem.S b/libclc/riscv32/lib/workitem/workitem.S index 9a214715a6b3..2df8005fb1c4 100644 --- a/libclc/riscv32/lib/workitem/workitem.S +++ b/libclc/riscv32/lib/workitem/workitem.S @@ -143,15 +143,6 @@ __builtin_riscv_workitem_id_x: vadd.vx v0, v2, t1 # local_id_x in 1 dim (local_linear_id) lw t3, KNL_LC_SIZE_X(a0) # local_size_x vremu.vx v0, v0, t3 # local_id_x = local_liner_id % local_size_x - vmv.v.x v1, t3 -.hi1: - auipc t1, %pcrel_hi(.end1) - setrpc zero, t1, %pcrel_lo(.hi1) - vblt v0, v1, .end1 - li t5, -1 - vadd.vx v0, v1, t5 -.end1: - join zero, zero, 0 lw ra, -4(sp) addi sp, sp, -4 ret @@ -170,7 +161,7 @@ __builtin_riscv_workitem_id_y: vadd.vx v0, v2, t1 # local_linear_id lw t3, KNL_LC_SIZE_X(a0) # local_size_x offset in 2 work_dims lw t4, KNL_LC_SIZE_Y(a0) # local_size_y offset in 2 work_dims - mul t5, t5, t3 # local_size_x * local_size_y + mul t5, t4, t3 # local_size_x * local_size_y vremu.vx v0, v0, t5 # x = local_linear_id % (local_size_x * local_size_y) vdivu.vx v0, v0, t3 # x / local_size_x vmv.v.x v1, t4 From 4b258122601f39c8f6b7a916fe871f99f523a188 Mon Sep 17 00:00:00 2001 From: qinfan Date: Wed, 31 Jan 2024 15:44:16 +0800 Subject: [PATCH 5/6] [VENTUS] Fix some comments Fix some comments. --- libclc/riscv32/lib/workitem/workitem.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libclc/riscv32/lib/workitem/workitem.S b/libclc/riscv32/lib/workitem/workitem.S index 2df8005fb1c4..4c9a9752d52a 100644 --- a/libclc/riscv32/lib/workitem/workitem.S +++ b/libclc/riscv32/lib/workitem/workitem.S @@ -190,7 +190,7 @@ __builtin_riscv_workitem_id_z: vadd.vx v0, v2, t1 # local_linear_id lw t3, KNL_LC_SIZE_X(a0) # local_size_x lw t4, KNL_LC_SIZE_Y(a0) # local_size_y - lw t5, KNL_LC_SIZE_Z(a0) # local_size_y + lw t5, KNL_LC_SIZE_Z(a0) # local_size_z mul t4, t4, t3 # local_size_x * local_size_y vdivu.vx v0, v0, t4 # local_linear_id / (local_size_x * local_size_y) vmv.v.x v1, t5 From 6cac00d1411225bb778e42c3c1e0e639484b79ea Mon Sep 17 00:00:00 2001 From: zhoujing Date: Thu, 1 Feb 2024 15:03:53 +0800 Subject: [PATCH 6/6] [NFC] comment fix --- libclc/riscv32/lib/workitem/workitem.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libclc/riscv32/lib/workitem/workitem.S b/libclc/riscv32/lib/workitem/workitem.S index 4c9a9752d52a..2df8005fb1c4 100644 --- a/libclc/riscv32/lib/workitem/workitem.S +++ b/libclc/riscv32/lib/workitem/workitem.S @@ -190,7 +190,7 @@ __builtin_riscv_workitem_id_z: vadd.vx v0, v2, t1 # local_linear_id lw t3, KNL_LC_SIZE_X(a0) # local_size_x lw t4, KNL_LC_SIZE_Y(a0) # local_size_y - lw t5, KNL_LC_SIZE_Z(a0) # local_size_z + lw t5, KNL_LC_SIZE_Z(a0) # local_size_y mul t4, t4, t3 # local_size_x * local_size_y vdivu.vx v0, v0, t4 # local_linear_id / (local_size_x * local_size_y) vmv.v.x v1, t5