[#56][fix] Fix workitem function(enqueued_local_size & local_linear_id) bugs in libclc

Support get_enqueued_local_size function and fix the calculation of get_local_linear_id
This commit is contained in:
zhoujingya 2023-11-06 16:07:27 +08:00 committed by zhoujing
parent 49c039a902
commit e04c1a6ec7
4 changed files with 37 additions and 27 deletions

View File

@ -8,6 +8,7 @@ workitem/get_local_linear_id.cl
workitem/get_local_size.cl
workitem/get_num_groups.cl
workitem/get_work_dim.cl
workitem/get_enqueued_local_size.cl
compiler-rt/nextafterf.cl
compiler-rt/adddf3.cl

View File

@ -0,0 +1,7 @@
#include <clc/clc.h>
// get_global_size(unit dim) / get_num_groups(unit dim)
_CLC_DEF _CLC_OVERLOAD size_t get_enqueued_local_size(uint dim) {
return get_local_size(dim);
}

View File

@ -1,7 +1,17 @@
#include <clc/clc.h>
extern size_t __builtin_riscv_workitem_linear_id();
_CLC_DEF _CLC_OVERLOAD size_t get_local_linear_id() {
return __builtin_riscv_workitem_linear_id();
uint dim = get_work_dim() - 1;
switch (dim) {
case 0:
return get_local_id(0);
case 1:
return get_local_id(1) * get_local_size(0) + get_local_id(0);
case 2:
return (get_local_id(2) * get_local_size(1) + get_local_id(1)) *
get_local_size(0) +
get_local_id(0);
default:
return 0;
}
}

View File

@ -23,9 +23,9 @@
*
*
* global_id (uniform methods in 1/2/3 dims):
* get_global_id(0) = CSR_GID_X * local_size_x + local_id_x
* get_global_id(1) = CSR_GID_Y * local_size_y + local_id_y
* get_global_id(2) = CSR_GID_Z * local_size_z + local_id_z
* get_global_id(0) = _global_offset_x + CSR_GID_X * local_size_x + local_id_x
* get_global_id(1) = _global_offset_y + CSR_GID_Y * local_size_y + local_id_y
* get_global_id(2) = _global_offset_z + CSR_GID_Z * local_size_z + local_id_z
*
*
* global_linear_id:
@ -65,17 +65,6 @@ _local_id_z:
.word 0
// End workaround for pocl driver
.text
.global __builtin_riscv_workitem_linear_id
.type __builtin_riscv_workitem_linear_id, @function
__builtin_riscv_workitem_linear_id:
csrr a0, CSR_KNL # Get kernel metadata buffer
csrr t1, CSR_TID # tid base offset for current warp
vid.v v2 # current thread offset
vadd.vx v0, v2, t1 # local_linear_id
ret
.text
.global __builtin_riscv_global_linear_id
.type __builtin_riscv_global_linear_id, @function
@ -145,16 +134,17 @@ __builtin_riscv_workgroup_id_z:
.global __builtin_riscv_workitem_id_x
.type __builtin_riscv_workitem_id_x, @function
__builtin_riscv_workitem_id_x:
addi sp, sp, 4
sw ra, -4(sp)
csrr a0, CSR_KNL # Get kernel metadata buffer
lw t0, KNL_WORK_DIM(a0) # Get work_dim
csrr t1, CSR_TID # tid base offset for current warp
vid.v v2 # current thread offset
vadd.vx v0, v2, t1 # local_id_x in 1 dim (local_linear_id)
li t2, 1
beq t0, t2, .WIXR
lw t3, KNL_LC_SIZE_X(a0) # local_size_x
vremu.vx v0, v0, t3 # local_id_x = local_liner_id % local_size_x
.WIXR:
lw ra, -4(sp)
addi sp, sp, -4
ret
@ -162,22 +152,20 @@ __builtin_riscv_workitem_id_x:
.global __builtin_riscv_workitem_id_y
.type __builtin_riscv_workitem_id_y, @function
__builtin_riscv_workitem_id_y:
addi sp, sp, 4
sw ra, -4(sp)
csrr a0, CSR_KNL # Get kernel metadata buffer
lw t0, KNL_WORK_DIM(a0) # Get work_dim
csrr t1, CSR_TID # tid base offset for current warp
vid.v v2 # current thread offset
vadd.vx v0, v2, t1 # local_linear_id
lw t3, KNL_LC_SIZE_X(a0) # local_size_x offset in 2 work_dims
li t2, 3
beq t0, t2, .WIY3
.WIY2: # 2 dims
vdivu.vx v0, v0, t3 # local_id_y = local_liner_id / local_size_x
ret
.WIY3: # 3 dims
lw t4, KNL_LC_SIZE_Y(a0) # local_size_y
lw t4, KNL_LC_SIZE_Y(a0) # local_size_y offset in 2 work_dims
mul t4, t4, t3 # local_size_x * local_size_y
vremu.vx v0, v0, t4 # x = local_linear_id % (local_size_x * local_size_y)
vdivu.vx v0, v0, t3 # x / local_size_x
lw ra, -4(sp)
addi sp, sp, -4
ret
@ -185,6 +173,8 @@ __builtin_riscv_workitem_id_y:
.global __builtin_riscv_workitem_id_z
.type __builtin_riscv_workitem_id_z, @function
__builtin_riscv_workitem_id_z:
addi sp, sp, 4
sw ra, -4(sp)
csrr a0, CSR_KNL # Get kernel metadata buffer
csrr t1, CSR_TID # tid base offset for current warp
vid.v v2 # current thread offset
@ -193,6 +183,8 @@ __builtin_riscv_workitem_id_z:
lw t4, KNL_LC_SIZE_Y(a0) # local_size_y
mul t4, t4, t3 # local_size_x * local_size_y
vdivu.vx v0, v0, t4 # local_linear_id / (local_size_x * local_size_y)
lw ra, -4(sp)
addi sp, sp, -4
7:
ret