/*******************************************************************************
 *
 * MIT License
 *
 * Copyright (c) 2020-2021 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 *******************************************************************************/
; generated by igemm_codegen.py (2c97a12e9f49260c2ce1b87c12cf7e05adcaf403)
;
.include "igemm_bwd_gtcx35_nhwc_fp16_utils.inc"

;----------------------------------------------------------
; starting of kernel igemm_bwd_gtcx35_nhwc_fp16_bx0_ex1_bt256x256x32_wt32x32x8_ws2x2_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh
; tensor_layout              : 'nhwc'
; gemm_m_per_block           : 256
; gemm_n_per_block           : 256
; gemm_k_per_block           : 32
; wave_tile_m                : 32
; wave_step_m                : 2
; wave_repeat_m              : 2
; wave_tile_n                : 32
; wave_step_n                : 2
; wave_repeat_n              : 2
; wave_tile_k                : 8
; tensor_a_thread_lengths    : [1, 8, 4, 1]
; tensor_a_cluster_lengths   : [1, 4, 1, 64]
; tensor_b_thread_lengths    : [1, 8, 1, 4]
; tensor_b_cluster_lengths   : [1, 4, 1, 64]
; direction                  : 'bwd'
; precision                  : 'fp16'
; nxb                        : 0
; nxe                        : 1
; vector_c                   : 1
; 
; block_size                 : 256
; lds_total                  : 34816
; lds_buffer_num             : 1
; 
.set k_p_in, 0
.set k_p_wei, 8
.set k_p_out, 16
.set k_hi, 24
.set k_wi, 28
.set k_n, 32
.set k_k, 36
.set k_c, 40
.set k_ho, 44
.set k_wo, 48
.set k_stride_h, 52
.set k_stride_w, 56
.set k_dilation_h, 60
.set k_dilation_w, 64
.set k_pad_h, 68
.set k_pad_w, 72
.set k_y, 76
.set k_x, 80
.set k_dtile_iy, 84
.set k_dtile_ix, 88
.set k_dtile_dy, 92
.set k_dtile_dx, 96
.set k_dtile_y, 100
.set k_dtile_x, 104
.set k_dtile_h, 108
.set k_dtile_w, 112
.set k_dslice_y, 116
.set k_dslice_x, 120
.set k_dslice_h, 124
.set k_dslice_w, 128
.set k_dslice_h_left, 132
.set k_dslice_w_left, 136
.set k_group, 140
.set k_magic_0, 144
.set k_magic_1, 148
.set k_magic_2, 152
.set k_magic_3, 156
.set k_shift_pack_0, 160
.set k__pack_0, 164
.set k_end, 168
.set k_gload_out_k_stride, 16
.set k_gload_wei_c_stride, 8

.set s_ka, 0
.set s_bx, 2
.set s_by, 3
.set s_p_in, 4
.set s_p_wei, 8
.set s_p_out, 12
.set s_hi, 16
.set s_wi, 17
.set s_n, 18
.set s_k, 19
.set s_c, 20
.set s_ho, 21
.set s_wo, 22
.set s_stride_h, 23
.set s_stride_w, 24
.set s_dilation_h, 25
.set s_dilation_w, 26
.set s_pad_h, 27
.set s_pad_w, 28
.set s_y, 29
.set s_x, 30
.set s_dtile_iy, 31
.set s_dtile_ix, 32
.set s_dtile_dy, 33
.set s_dtile_dx, 34
.set s_dtile_y, 35
.set s_dtile_x, 36
.set s_dtile_h, 37
.set s_dtile_w, 38
.set s_dslice_y, 39
.set s_dslice_x, 40
.set s_dslice_h, 41
.set s_dslice_w, 42
.set s_dslice_h_left, 43
.set s_dslice_w_left, 44
.set s_group, 45
.set s_magic_0, 6
.set s_magic_1, 7
.set s_magic_2, 46
.set s_magic_3, 47
.set s_shift_m2, 37
.set s_shift_m3, 38
.set s_out_stride_wo, 48
.set s_out_stride_n, 49
.set s_wei_stride_k, 50
.set s_in_stride_wi, 51
.set s_in_stride_n, 52
.set s_block_gtc_ig, 53
.set s_block_gtc_ic, 54
.set s_block_gtc_inb, 55
.set s_move_slice_out_stride_k, 56
.set s_move_slice_wei_stride_k, 57
.set s_knum, 3
.set s_gemm_k_num_k, 58
.set s_dim_br, 59
.set s_dim_mp, 60
.set s_dim_mr, 61
.set s_dim_np, 62
.set s_wei_os_diff_acc_x_rst_k, 63
.set s_wei_os_diff_acc_y_rst_kx, 64
.set s_out_os_diff_acc_ho_rst_wo, 65
.set s_out_os_diff_acc_wo, 66
.set s_ho_diff_acc_y, 67
.set s_wo_diff_acc_x, 68
.set s_wo_diff_rst_x, 69
.set s_move_slice_k_ix, 70
.set s_flag_need_acc_yx, 71
.set s_shift_pack_0, 71
.set s_kitr, 1
.set s_out_offset, 72
.set s_wei_offset, 73
.set s_in_hi_sshift, 79
.set s_in_wi_sshift, 80
.set s_tmp, 82
.set s_end, 88

.set v_c, 0  ; coalescing:32, needed:0, resuable:66
.set v_a, 0
.set v_b, 16
.set v_gld_a, 32
.set v_gld_b, 48
.set v_sst_a_os, 64
.set v_sld_a_os, 65
.set v_sst_b_os, 66
.set v_sld_b_os, 67
.set v_out_os, 68
.set v_out_iho_list, 72
.set v_out_iwo_list, 76
.set v_out_flag, 80
.set v_out_flag_n, 84
.set v_out_ik, 85
.set v_out_inb, 86
.set v_out_in, 87
.set v_wei_os, 88
.set v_wei_ic, 89
.set v_wei_ik, 90
.set v_in_os, 32
.set v_in_in, 33
.set v_in_ihi, 34
.set v_in_iwi, 35
.set v_in_flag, 36
.set v_in_flag_c, 89
.set v_in_inb, 86
.set v_co_sst, 87
.set v_co_sld, 91
.set v_gemm_in, 92
.set v_gemm_im, 93
.set v_co_sub_m_index, 93
.set v_co_sub_n_index, 92
.set v_tmp, 94
.set v_wei_tmp_pack, 31
.set v_wei_flag, 100
.set v_pack_k_tmp, 94
.set v_in_hi_sshift, 98
.set v_in_wi_sshift, 99
.set v_end, 360

.set a_c, 0
.set a_end, 256

.text
.globl igemm_bwd_gtcx35_nhwc_fp16_bx0_ex1_bt256x256x32_wt32x32x8_ws2x2_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh
.p2align 8
.type igemm_bwd_gtcx35_nhwc_fp16_bx0_ex1_bt256x256x32_wt32x32x8_ws2x2_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh,@function
igemm_bwd_gtcx35_nhwc_fp16_bx0_ex1_bt256x256x32_wt32x32x8_ws2x2_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh:
    s_load_dwordx2  s[s_p_in+0:s_p_in+1],       s[s_ka+0:s_ka+1],    0+k_p_in
    s_load_dwordx2  s[s_p_wei+0:s_p_wei+1],      s[s_ka+0:s_ka+1],    0+k_p_wei
    s_load_dwordx2  s[s_p_out+0:s_p_out+1],      s[s_ka+0:s_ka+1],    0+k_p_out
    s_load_dwordx16 s[s_hi+0:s_hi+15],        s[s_ka+0:s_ka+1],    0+k_hi
    s_load_dwordx8  s[s_dtile_ix+0:s_dtile_ix+7],   s[s_ka+0:s_ka+1],    0+k_dtile_ix
    s_load_dwordx4  s[s_dslice_x+0:s_dslice_x+3],   s[s_ka+0:s_ka+1],    0+k_dslice_x
    s_load_dwordx2  s[s_dslice_w_left+0:s_dslice_w_left+1],   s[s_ka+0:s_ka+1],    0+k_dslice_w_left
    s_load_dwordx2 s[s_magic_0+0:s_magic_0+1],  s[s_ka+0:s_ka+1],  0+k_magic_0
    s_load_dwordx2 s[s_magic_2+0:s_magic_2+1],  s[s_ka+0:s_ka+1],  0+k_magic_2
    s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1],  0+k_shift_pack_0
    ; out(e, k, nb0, nb1) thread_lengths: 1x8x4x1, cluster_length: 1x4x1x64, k_pack:8
    ; wei(e, k, c0, c1) thread_length: 1x8x1x4, cluster_length: 1x4x1x64, k_pack:8
    v_mov_b32 v[v_tmp], v0
    v_and_b32 v[v_out_ik], 3, v[v_tmp]
    v_lshlrev_b32 v[v_out_ik], 3, v[v_out_ik]
    v_lshrrev_b32 v[v_tmp], 2, v[v_tmp]
    v_and_b32 v[v_out_inb], 63, v[v_tmp]
    v_mov_b32 v[v_tmp], v0
    v_and_b32 v[v_wei_ic], 63, v[v_tmp]
    v_lshlrev_b32 v[v_wei_ic], 2, v[v_wei_ic]
    v_lshrrev_b32 v[v_tmp], 6, v[v_tmp]
    v_and_b32 v[v_wei_ik], 3, v[v_tmp]
    v_lshlrev_b32 v[v_wei_ik], 3, v[v_wei_ik]

    s_waitcnt lgkmcnt(0)

    ; calculate index
    s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group]
    s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo]
    s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2]
    s_mul_i32 s[s_tmp], s[s_x], s[s_c]
    s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y]
    s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group]
    s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi]
    s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1]
    s_mul_i32  s[s_tmp], s[s_n], s[s_in_stride_n]
    s_mul_i32  s[s_tmp+1], s[s_n], s[s_out_stride_n]
    s_lshl_b32 s[s_tmp+4], s[s_tmp], 1
    s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1
    s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4]
    s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4]
    s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp]
    s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1]
    s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5]
    s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5]
    s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp]
    s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1]
    s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w]
    s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br]
    s_add_u32 s[s_tmp], 255, s[s_dim_mr]
    s_lshr_b32 s[s_tmp+1], s[s_tmp], 8
    s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 8
    s_add_u32 s[s_tmp], 255, s[s_c]
    s_lshr_b32 s[s_tmp+1], s[s_tmp], 8
    s_lshl_b32 s[s_dim_np], s[s_tmp+1], 8

    ; gemm_m_per_block:256, gemm_n_per_block:256, source_access_order:0
    s_lshr_b32 s[s_tmp], s[s_dim_mp], 8
    s_lshr_b32 s[s_tmp+1], s[s_dim_np], 8
    s_mul_i32 s[0], s[s_tmp+1], s[s_tmp]
    ; multihead dispatch code start
    s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x]
    s_cmp_eq_u32  1,  s[s_tmp]
    s_cbranch_scc1 L_igemm_bwd_gtcx35_nhwc_fp16_bx0_ex1_bt256x256x32_wt32x32x8_ws2x2_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_mh_dispatch_end
    s_mul_i32 s[s_tmp+2], s[0], s[s_group]
    .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp
    s_mov_b32 s[s_bx], s[s_tmp+4]
    .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp
    s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y]
    s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4]
    s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1
    .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp
    s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x]
    s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3]
    s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1
    .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp
    s_mov_b32 s[s_dtile_iy],  s[s_tmp+4]
    s_mov_b32 s[s_dtile_ix],  s[s_tmp+3]
    s_cmp_lt_u32 s[s_dtile_iy], s[s_y]
    s_cbranch_scc0 L_igemm_bwd_gtcx35_nhwc_fp16_bx0_ex1_bt256x256x32_wt32x32x8_ws2x2_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_out
    s_cmp_lt_u32 s[s_dtile_ix], s[s_x]
    s_cbranch_scc0 L_igemm_bwd_gtcx35_nhwc_fp16_bx0_ex1_bt256x256x32_wt32x32x8_ws2x2_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_out
    ; multihead dispatch code end
L_igemm_bwd_gtcx35_nhwc_fp16_bx0_ex1_bt256x256x32_wt32x32x8_ws2x2_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_mh_dispatch_end:

    s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y]
    s_mul_i32 s[s_knum], s[s_tmp], s[s_k]
    s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8
    .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp
    s_mov_b32 s[s_bx], s[s_tmp+4]
    s_lshr_b32 s[0], s[s_dim_np], 8
    s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8
    .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp
    ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im
    s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 8
    s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 8
    v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb]
    s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8
    .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp
    s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8
    .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp
    v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list]
    v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list]

    s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1
    ; calculate wei offset
    s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k]
    s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2]
    s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2]
    s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp]
    s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1]
    v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic]
    s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] 
    v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik]
    s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix]
    v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 1
    s_lshl_b32 s[s_tmp+1] s[s_c], 1
    v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5]
    s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1]
    v_cndmask_b32 v[v_wei_flag], 0, 1 vcc
    v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag]
    v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os]

    s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1
    s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k]
    s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k]
    s_mul_i32 s[s_wei_offset+2], 4, s[s_wei_stride_k]
    s_mul_i32 s[s_wei_offset+3], 5, s[s_wei_stride_k]
    s_mul_i32 s[s_wei_offset+4], 6, s[s_wei_stride_k]
    s_mul_i32 s[s_wei_offset+5], 7, s[s_wei_stride_k]

    .v_clear_nc v_gld_b, 16
    s_mov_b32 s[s_p_wei+2], 0xffffffff
    s_mov_b32 s[s_p_wei+3], 0x27000
    v_cmpx_le_u32 vcc, 1, v[v_wei_flag]
    buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0
    buffer_load_dwordx2 v[v_gld_b+2:v_gld_b+2+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0
    buffer_load_dwordx2 v[v_gld_b+4:v_gld_b+4+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0
    buffer_load_dwordx2 v[v_gld_b+6:v_gld_b+6+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0
    buffer_load_dwordx2 v[v_gld_b+8:v_gld_b+8+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+2] offen offset:0
    buffer_load_dwordx2 v[v_gld_b+10:v_gld_b+10+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0
    buffer_load_dwordx2 v[v_gld_b+12:v_gld_b+12+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0
    buffer_load_dwordx2 v[v_gld_b+14:v_gld_b+14+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+5] offen offset:0
    s_mov_b64 exec, -1

    v_cmp_gt_u32 vcc, s[s_n], v[v_out_in]
    v_cndmask_b32 v[v_tmp], 0, 1 vcc
    v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp]
    ; calculate output offset
    s_mov_b32 s[s_out_offset], 0
    s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k]
    s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k]
    s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp]
    s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1]

    v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in]
    s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1
    v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1
    v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list]
    v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp]
    v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp]
    v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp]
    v_bfe_u32 v[v_tmp+1], v[v_out_flag_n],  0, 1
    v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list]
    v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1] vcc
    v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list]
    v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag] vcc

    s_mov_b32 s1, 64
    v_add_u32 v[v_tmp], s1, v[v_out_inb]
    v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp]
    .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp
    .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp
    v_add_u32 v[v_out_iho_list+1], s[s_dslice_h_left], v[v_out_iho_list+1]
    v_add_u32 v[v_out_iwo_list+1], s[s_dslice_w_left], v[v_out_iwo_list+1]

    v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in]
    v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1
    v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+1]
    v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp]
    v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp]
    v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp]
    v_cmp_gt_u32 vcc, s[s_n], v[v_out_in]
    v_cndmask_b32 v[v_tmp], 0, 1 vcc
    v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n]
    v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1]
    v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp] vcc
    v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1]
    v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1] vcc
    s_mov_b32 s1, 128
    v_add_u32 v[v_tmp], s1, v[v_out_inb]
    v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp]
    .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp
    .mdiv_u32_rem_vs v_out_iwo_list+2,v_out_iho_list+2,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp
    v_add_u32 v[v_out_iho_list+2], s[s_dslice_h_left], v[v_out_iho_list+2]
    v_add_u32 v[v_out_iwo_list+2], s[s_dslice_w_left], v[v_out_iwo_list+2]

    v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in]
    v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1
    v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+2]
    v_add_u32 v[v_tmp], v[v_out_iwo_list+2], v[v_tmp]
    v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp]
    v_add_u32 v[v_out_os+2], v[v_tmp+4], v[v_tmp]
    v_cmp_gt_u32 vcc, s[s_n], v[v_out_in]
    v_cndmask_b32 v[v_tmp], 0, 1 vcc
    v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 2, v[v_out_flag_n]
    v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2]
    v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp] vcc
    v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2]
    v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2] vcc
    s_mov_b32 s1, 192
    v_add_u32 v[v_tmp], s1, v[v_out_inb]
    v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp]
    .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp
    .mdiv_u32_rem_vs v_out_iwo_list+3,v_out_iho_list+3,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp
    v_add_u32 v[v_out_iho_list+3], s[s_dslice_h_left], v[v_out_iho_list+3]
    v_add_u32 v[v_out_iwo_list+3], s[s_dslice_w_left], v[v_out_iwo_list+3]

    v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in]
    v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1
    v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+3]
    v_add_u32 v[v_tmp], v[v_out_iwo_list+3], v[v_tmp]
    v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp]
    v_add_u32 v[v_out_os+3], v[v_tmp+4], v[v_tmp]
    v_cmp_gt_u32 vcc, s[s_n], v[v_out_in]
    v_cndmask_b32 v[v_tmp], 0, 1 vcc
    v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 3, v[v_out_flag_n]
    v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3]
    v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp] vcc
    v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3]
    v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3] vcc
    s_mov_b32 s[s_p_out+2], 0xffffffff
    s_mov_b32 s[s_p_out+3], 0x27000
    ; load output, nxe:1
    .v_clear_nc v_gld_a, 16
    v_cmpx_le_u32 vcc, 1, v[v_out_flag]
    buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0
    s_mov_b64 exec, -1
    v_cmpx_le_u32 vcc, 1, v[v_out_flag+1]
    buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0
    s_mov_b64 exec, -1
    v_cmpx_le_u32 vcc, 1, v[v_out_flag+2]
    buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0
    s_mov_b64 exec, -1
    v_cmpx_le_u32 vcc, 1, v[v_out_flag+3]
    buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0
    s_mov_b64 exec, -1

    v_mov_b32 v[v_tmp+5], v0
    ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2
    v_and_b32 v[v_gemm_in], 31, v[v_tmp+5]           ; block_n index 
    v_and_b32 v[v_gemm_im], 31, v[v_tmp+5]           ; block_m index 
    v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in]   ; shift left k_pack:8
    v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im]   ; shift left k_pack:8
    v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5]
    v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5]          ; block_k_per_wave index
    v_lshl_or_b32 v[v_gemm_in],  v[v_tmp + 0], 2, v[v_gemm_in]  ; or lanegroup_k_per_thread:4
    v_lshl_or_b32 v[v_gemm_im],  v[v_tmp + 0], 2, v[v_gemm_im]  ; or lanegroup_k_per_thread:4
    v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5]
    v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5]  ; waves_per_n index
    v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 9, v[v_gemm_in]
    v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5]
    v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5]  ; waves_per_m index
    v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 9, v[v_gemm_im]

    v_mov_b32 v[v_tmp+5], v0
    ; xdlops mapping, get dst matrix gemm index
    v_and_b32 v[v_tmp+0], 31, v[v_tmp+5]
    v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5]
    v_and_b32 v[v_tmp+1], 1, v[v_tmp+5]
    v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5]
    v_mov_b32 v[v_co_sst], v[v_tmp+0]
    v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1]
    v_and_b32 v[v_tmp+0], 1, v[v_tmp+5]
    v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5]
    v_and_b32 v[v_tmp+1], 1, v[v_tmp+5]
    v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 6, v[v_co_sst]
    v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 6, v[v_co_sld]

    ; LDS store, out: e,k,nb0,nb1: 1x8x4x1, 1x4x1x64, k_pack:8, k_pack_gld_a:8, fp16
    v_lshlrev_b32 v[v_tmp+2], 3,  v[v_out_inb]
    v_lshrrev_b32 v[v_tmp+1], 3,  v[v_out_ik]
    v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 11, v[v_tmp+2]
    v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp]

    v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load out
    ; LDS store, wei: e,k,c: 1x8x1x4, 1x4x1x64, k_pack:8, k_pack_gld_b:8, fp16
    v_lshlrev_b32 v[v_tmp+2], 3,  v[v_wei_ic]
    v_lshrrev_b32 v[v_tmp+1], 3,  v[v_wei_ik]
    v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 11, v[v_tmp+2]
    v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp]
    v_lshrrev_b32 v[v_tmp], 7, v[v_sst_b_os]
    v_lshlrev_b32 v[v_tmp], 4, v[v_tmp]
    v_add_u32 v[v_sst_b_os], v[v_tmp], v[v_sst_b_os]
    v_add_u32 v[v_sst_b_os], 16384, v[v_sst_b_os]

    v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei
    v_lshrrev_b32 v[v_tmp], 7, v[v_sld_b_os]
    v_lshlrev_b32 v[v_tmp], 4, v[v_tmp]
    v_add_u32 v[v_sld_b_os], v[v_tmp], v[v_sld_b_os]

    v_add_u32 v[v_sld_b_os], 16384, v[v_sld_b_os]
    v_mov_b32 v[v_gemm_in], v[v_co_sst]
    v_mov_b32 v[v_gemm_im], v[v_co_sld]
    ; init_co_lds_offset for xdlops
    v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im]
    v_and_b32 v[v_tmp],  1 v[v_tmp]   ; thread id of lanegroup_m_per_cluster
    v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp]
    v_lshrrev_b32 v[v_tmp+2], 6, v[v_gemm_im]  ; thread id of waves_per_m
    v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst]
    v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 8, v[v_gemm_in]
    v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst]
    v_lshlrev_b32 v[v_co_sld], 4, v[0]
    ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x256 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7]
    ; g_mr:2, g_ms:2, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2
    ; nd_stride:[4, 2, 1, 4, 1, 2, 2, 1]
    v_lshlrev_b32 v[v_tmp], 3, v[0]
    v_lshrrev_b32 v[v_co_sub_m_index], 8, v[v_tmp]  ; get tid along m
    v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index]                   ; => x_mt
    v_lshrrev_b32 v[v_co_sub_m_index], 2  ,v[v_co_sub_m_index]
    v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index]                   ; => x_mc
    v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0]      ; => accumulate x_mt
    v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index]      ; => accumulate x_mc
    ; init_co_sub_n_index xdlops
    v_lshlrev_b32 v[v_tmp], 3, v[0]
    v_and_b32 v[v_co_sub_n_index], 255, v[v_tmp]

    v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index]
    v_cmp_gt_u32 vcc, s[s_c], v[v_tmp]
    v_cndmask_b32 v[v_in_flag_c], 0, 1 vcc
    ; input offset
    s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c]
    s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c]
    s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp]
    s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1]

    s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 1
    s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3]
    s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0

    s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1
    v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index]
    s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h]
    s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h]
    s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1]
    s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h]
    s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w]
    s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w]
    s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1]
    s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w]
    v_lshlrev_b32 v[v_co_sub_n_index], 1, v[v_co_sub_n_index]
    s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 1
    ; move slice stride
    s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 1
    s_mul_i32 s[s_tmp], s[s_k], s[s_wei_stride_k]
    s_lshl_b32 s[s_tmp+3], s[s_c], 1
    s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3]
    s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp]
    s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1
    s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3]
    s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x]
    s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3]
    s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3]
    s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2]
    s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp]
    v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1
    s_mov_b32 s[s_move_slice_out_stride_k], 64
    s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k]
    s_mov_b32 s[s_move_slice_k_ix], 0
    s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1
    s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx]
    s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo]
    s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3]
    s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy]
    s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx]
    s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo]
    s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho
    s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1]
    s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp]

    s_mov_b32 s[s_p_in+2], 0xffffffff
    s_mov_b32 s[s_p_in+3], 0x27000
    ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 2x2 step, k_pack:8
    s_waitcnt vmcnt(4)
    v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+2]
    v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+4], v[v_gld_b+6]
    v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+8], v[v_gld_b+10]
    v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+12], v[v_gld_b+14]
    ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] 
    v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+2] op_sel:[1, 1]
    v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+4], v[v_gld_b+6] op_sel:[1, 1]
    v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+8], v[v_gld_b+10] op_sel:[1, 1]
    v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+12], v[v_gld_b+14] op_sel:[1, 1]
    ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:16
    v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b+1], v[v_gld_b+3]
    v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+5], v[v_gld_b+7]
    v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+9], v[v_gld_b+11]
    v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+13], v[v_gld_b+15]
    ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:32
    v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b+1], v[v_gld_b+3] op_sel:[1, 1]
    v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+5], v[v_gld_b+7] op_sel:[1, 1]
    v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+9], v[v_gld_b+11] op_sel:[1, 1]
    v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+13], v[v_gld_b+15] op_sel:[1, 1]
    ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:48

    s_waitcnt vmcnt(0)
    ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] 
    ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024
    ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:2048
    ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:3072

    .v_clear_acc_c a_c, 256
    ; make sure acc WAR harzard, at least 1 nop for src_c
    s_sub_i32 s[s_kitr], s[s_knum], 32
    s_cmp_gt_i32 s[s_kitr], 0
    s_cbranch_scc0 L_igemm_bwd_gtcx35_nhwc_fp16_bx0_ex1_bt256x256x32_wt32x32x8_ws2x2_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_mfma_end

    s_add_u32 s[s_out_offset],  s[s_move_slice_out_stride_k], s[s_out_offset]
    v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os]
    s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset]
    s_cselect_b32 s[s_flag_need_acc_yx], 1, 0

    
    s_cmp_eq_u32 1, s[s_flag_need_acc_yx]
    s_cbranch_scc0 igemm_bwd_gtcx35_nhwc_fp16_bx0_ex1_bt256x256x32_wt32x32x8_ws2x2_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_acc_yx_end_0  ; no need do accumulate yx
igemm_bwd_gtcx35_nhwc_fp16_bx0_ex1_bt256x256x32_wt32x32x8_ws2x2_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_acc_yx_0:
    s_mov_b32 s[s_out_offset], 0
    s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix]
    s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix]
    s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x]
    v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list]
    v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1]
    v_add_u32 v[v_out_iwo_list+2], s[s_tmp], v[v_out_iwo_list+2]
    v_add_u32 v[v_out_iwo_list+3], s[s_tmp], v[v_out_iwo_list+3]
    s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo]
    v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os]
    v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1]
    v_add_u32 v[v_out_os+2], s[s_tmp], v[v_out_os+2]
    v_add_u32 v[v_out_os+3], s[s_tmp], v[v_out_os+3]
    s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k]
    v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os]
    s_cbranch_scc0 igemm_bwd_gtcx35_nhwc_fp16_bx0_ex1_bt256x256x32_wt32x32x8_ws2x2_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_acc_yx_x_end_0
    s_mov_b32 s[s_move_slice_k_ix], 0
    v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list]
    v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1]
    v_add_i32 v[v_out_iho_list+2], s[s_ho_diff_acc_y], v[v_out_iho_list+2]
    v_add_i32 v[v_out_iho_list+3], s[s_ho_diff_acc_y], v[v_out_iho_list+3]
igemm_bwd_gtcx35_nhwc_fp16_bx0_ex1_bt256x256x32_wt32x32x8_ws2x2_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_acc_yx_x_end_0:
    v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1   ; extract flag_n
    v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list]
    v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5] vcc
    v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list]
    v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag] vcc
    v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1   ; extract flag_n
    v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1]
    v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5] vcc
    v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1]
    v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1] vcc
    v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 2, 1   ; extract flag_n
    v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2]
    v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp+5] vcc
    v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2]
    v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2] vcc
    v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 3, 1   ; extract flag_n
    v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3]
    v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp+5] vcc
    v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3]
    v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3] vcc
igemm_bwd_gtcx35_nhwc_fp16_bx0_ex1_bt256x256x32_wt32x32x8_ws2x2_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_acc_yx_end_0:

    s_waitcnt lgkmcnt(0)
    s_barrier
    ds_read2_b64 v[v_a+0:v_a+3], v[v_sld_a_os], offset0:0, offset1:64
    ds_read2_b64 v[v_b+0:v_b+3], v[v_sld_b_os], offset0:0, offset1:72
    v_add_u32 v[v_tmp+5], 2304, v[v_sld_b_os]
    ds_read2_b64 v[v_b+4+0:v_b+4+3], v[v_tmp+5], offset0:0, offset1:72
    ds_read2st64_b64 v[v_a+4+0:v_a+4+3], v[v_sld_a_os], offset0:4, offset1:5
L_igemm_bwd_gtcx35_nhwc_fp16_bx0_ex1_bt256x256x32_wt32x32x8_ws2x2_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_mfma_body:
    ; do fma accumulate with unroll 32
    s_waitcnt lgkmcnt(2)
    v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15]     ; repeat:0x0, step:0x0, num_a_c:16
    v_cmpx_le_u32 vcc, 1, v[v_wei_flag]
    buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0
    s_mov_b64 exec, -1
    v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47]     ; repeat:0x0, step:1x0, num_a_c:16
    v_cmpx_le_u32 vcc, 1, v[v_wei_flag]
    buffer_load_dwordx2 v[v_gld_b+2:v_gld_b+2+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0
    s_mov_b64 exec, -1
    v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31]     ; repeat:0x0, step:0x1, num_a_c:16
    v_cmpx_le_u32 vcc, 1, v[v_wei_flag]
    buffer_load_dwordx2 v[v_gld_b+4:v_gld_b+4+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0
    s_mov_b64 exec, -1
    v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63]     ; repeat:0x0, step:1x1, num_a_c:16
    v_cmpx_le_u32 vcc, 1, v[v_wei_flag]
    buffer_load_dwordx2 v[v_gld_b+6:v_gld_b+6+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0
    s_mov_b64 exec, -1
    ds_read2st64_b64 v[v_a+8+0:v_a+8+3], v[v_sld_a_os], offset0:8, offset1:9 ; load i_k:1 into local buffer 1, repeat 0
    v_add_u32 v[v_tmp+5], 2304, v[v_tmp+5]
    ds_read2_b64 v[v_b+8+0:v_b+8+3], v[v_tmp+5], offset0:0, offset1:72 ; load i_k:1 into local buffer 1, repeat 0
    s_waitcnt lgkmcnt(3)
    v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+0:v_a+1], v[v_b+4:v_b+5], a[a_c+64:a_c+79]     ; repeat:0x1, step:0x0, num_a_c:16
    v_cmpx_le_u32 vcc, 1, v[v_wei_flag]
    buffer_load_dwordx2 v[v_gld_b+8:v_gld_b+8+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+2] offen offset:0
    s_mov_b64 exec, -1
    v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+96:a_c+111]     ; repeat:0x1, step:1x0, num_a_c:16
    v_cmpx_le_u32 vcc, 1, v[v_wei_flag]
    buffer_load_dwordx2 v[v_gld_b+10:v_gld_b+10+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0
    s_mov_b64 exec, -1
    v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+0:v_a+1], v[v_b+6:v_b+7], a[a_c+80:a_c+95]     ; repeat:0x1, step:0x1, num_a_c:16
    v_cmpx_le_u32 vcc, 1, v[v_wei_flag]
    buffer_load_dwordx2 v[v_gld_b+12:v_gld_b+12+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0
    s_mov_b64 exec, -1
    v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+112:a_c+127]     ; repeat:0x1, step:1x1, num_a_c:16
    v_cmpx_le_u32 vcc, 1, v[v_wei_flag]
    buffer_load_dwordx2 v[v_gld_b+14:v_gld_b+14+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+5] offen offset:0
    s_mov_b64 exec, -1
    v_add_u32 v[v_tmp+5], 2304, v[v_tmp+5]
    ds_read2_b64 v[v_b+12+0:v_b+12+3], v[v_tmp+5], offset0:0, offset1:72 ; load i_k:1 into local buffer 1, repeat 1
    ds_read2st64_b64 v[v_a+12+0:v_a+12+3], v[v_sld_a_os], offset0:12, offset1:13 ; load i_k:1 into local buffer 1, repeat 1
    s_waitcnt lgkmcnt(4)
    v_mfma_f32_32x32x8f16 a[a_c+128:a_c+143], v[v_a+4:v_a+5], v[v_b+0:v_b+1], a[a_c+128:a_c+143]     ; repeat:1x0, step:0x0, num_a_c:16
    .v_clear_nc v_gld_a, 16
    v_mfma_f32_32x32x8f16 a[a_c+160:a_c+175], v[v_a+6:v_a+7], v[v_b+0:v_b+1], a[a_c+160:a_c+175]     ; repeat:1x0, step:1x0, num_a_c:16
    v_cmpx_le_u32 vcc, 1, v[v_out_flag]
    buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0
    s_mov_b64 exec, -1
    v_mfma_f32_32x32x8f16 a[a_c+144:a_c+159], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+144:a_c+159]     ; repeat:1x0, step:0x1, num_a_c:16
    v_cmpx_le_u32 vcc, 1, v[v_out_flag+1]
    buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0
    s_mov_b64 exec, -1
    v_mfma_f32_32x32x8f16 a[a_c+176:a_c+191], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+176:a_c+191]     ; repeat:1x0, step:1x1, num_a_c:16
    v_cmpx_le_u32 vcc, 1, v[v_out_flag+2]
    buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0
    s_mov_b64 exec, -1
    ds_read2st64_b64 v[v_a+0:v_a+3], v[v_sld_a_os], offset0:16, offset1:17 ; load i_k:2 into local buffer 0, repeat 0
    v_add_u32 v[v_tmp+5], 2304, v[v_tmp+5]
    ds_read2_b64 v[v_b+0:v_b+3], v[v_tmp+5], offset0:0, offset1:72 ; load i_k:2 into local buffer 0, repeat 0
    v_mfma_f32_32x32x8f16 a[a_c+192:a_c+207], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+192:a_c+207]     ; repeat:1x1, step:0x0, num_a_c:16
    v_cmpx_le_u32 vcc, 1, v[v_out_flag+3]
    buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0
    s_mov_b64 exec, -1
    v_mfma_f32_32x32x8f16 a[a_c+224:a_c+239], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+224:a_c+239]     ; repeat:1x1, step:1x0, num_a_c:16
    s_add_u32 s[s_out_offset],  s[s_move_slice_out_stride_k], s[s_out_offset]
    v_mfma_f32_32x32x8f16 a[a_c+208:a_c+223], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+208:a_c+223]     ; repeat:1x1, step:0x1, num_a_c:16
    v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os]
    v_mfma_f32_32x32x8f16 a[a_c+240:a_c+255], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+240:a_c+255]     ; repeat:1x1, step:1x1, num_a_c:16
    s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset]
    v_add_u32 v[v_tmp+5], 2304, v[v_tmp+5]
    ds_read2_b64 v[v_b+4+0:v_b+4+3], v[v_tmp+5], offset0:0, offset1:72 ; load i_k:2 into local buffer 0, repeat 1
    s_waitcnt lgkmcnt(5)
    v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+8:v_a+9], v[v_b+8:v_b+9], a[a_c+0:a_c+15]     ; repeat:0x0, step:0x0, num_a_c:16
    s_cselect_b32 s[s_flag_need_acc_yx], 1, 0
    v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+10:v_a+11], v[v_b+8:v_b+9], a[a_c+32:a_c+47]     ; repeat:0x0, step:1x0, num_a_c:16
    
    v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+8:v_a+9], v[v_b+10:v_b+11], a[a_c+16:a_c+31]     ; repeat:0x0, step:0x1, num_a_c:16
    
    v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+10:v_a+11], v[v_b+10:v_b+11], a[a_c+48:a_c+63]     ; repeat:0x0, step:1x1, num_a_c:16
    
    ds_read2st64_b64 v[v_a+4+0:v_a+4+3], v[v_sld_a_os], offset0:20, offset1:21 ; load i_k:2 into local buffer 0, repeat 1
    s_waitcnt lgkmcnt(5)
    v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+8:v_a+9], v[v_b+12:v_b+13], a[a_c+64:a_c+79]     ; repeat:0x1, step:0x0, num_a_c:16
    
    v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+10:v_a+11], v[v_b+12:v_b+13], a[a_c+96:a_c+111]     ; repeat:0x1, step:1x0, num_a_c:16
    
    v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+8:v_a+9], v[v_b+14:v_b+15], a[a_c+80:a_c+95]     ; repeat:0x1, step:0x1, num_a_c:16
    
    v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+10:v_a+11], v[v_b+14:v_b+15], a[a_c+112:a_c+127]     ; repeat:0x1, step:1x1, num_a_c:16
    
    ds_read2st64_b64 v[v_a+8+0:v_a+8+3], v[v_sld_a_os], offset0:24, offset1:25 ; load i_k:3 into local buffer 1, repeat 0
    s_waitcnt lgkmcnt(5)
    v_mfma_f32_32x32x8f16 a[a_c+128:a_c+143], v[v_a+12:v_a+13], v[v_b+8:v_b+9], a[a_c+128:a_c+143]     ; repeat:1x0, step:0x0, num_a_c:16
    
    v_mfma_f32_32x32x8f16 a[a_c+160:a_c+175], v[v_a+14:v_a+15], v[v_b+8:v_b+9], a[a_c+160:a_c+175]     ; repeat:1x0, step:1x0, num_a_c:16
    
    v_mfma_f32_32x32x8f16 a[a_c+144:a_c+159], v[v_a+12:v_a+13], v[v_b+10:v_b+11], a[a_c+144:a_c+159]     ; repeat:1x0, step:0x1, num_a_c:16
    
    v_mfma_f32_32x32x8f16 a[a_c+176:a_c+191], v[v_a+14:v_a+15], v[v_b+10:v_b+11], a[a_c+176:a_c+191]     ; repeat:1x0, step:1x1, num_a_c:16
    
    v_add_u32 v[v_tmp+5], 2304, v[v_tmp+5]
    ds_read2_b64 v[v_b+8+0:v_b+8+3], v[v_tmp+5], offset0:0, offset1:72 ; load i_k:3 into local buffer 1, repeat 0
    v_mfma_f32_32x32x8f16 a[a_c+192:a_c+207], v[v_a+12:v_a+13], v[v_b+12:v_b+13], a[a_c+192:a_c+207]     ; repeat:1x1, step:0x0, num_a_c:16
    
    v_mfma_f32_32x32x8f16 a[a_c+224:a_c+239], v[v_a+14:v_a+15], v[v_b+12:v_b+13], a[a_c+224:a_c+239]     ; repeat:1x1, step:1x0, num_a_c:16
    
    v_mfma_f32_32x32x8f16 a[a_c+208:a_c+223], v[v_a+12:v_a+13], v[v_b+14:v_b+15], a[a_c+208:a_c+223]     ; repeat:1x1, step:0x1, num_a_c:16
    
    v_mfma_f32_32x32x8f16 a[a_c+240:a_c+255], v[v_a+14:v_a+15], v[v_b+14:v_b+15], a[a_c+240:a_c+255]     ; repeat:1x1, step:1x1, num_a_c:16
    
    v_add_u32 v[v_tmp+5], 2304, v[v_tmp+5]
    ds_read2_b64 v[v_b+12+0:v_b+12+3], v[v_tmp+5], offset0:0, offset1:72 ; load i_k:3 into local buffer 1, repeat 1
    ds_read2st64_b64 v[v_a+12+0:v_a+12+3], v[v_sld_a_os], offset0:28, offset1:29 ; load i_k:3 into local buffer 1, repeat 1
    s_waitcnt lgkmcnt(6)
    v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15]     ; repeat:0x0, step:0x0, num_a_c:16
    
    v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47]     ; repeat:0x0, step:1x0, num_a_c:16
    
    v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31]     ; repeat:0x0, step:0x1, num_a_c:16
    
    v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63]     ; repeat:0x0, step:1x1, num_a_c:16
    
    s_waitcnt lgkmcnt(5)
    v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+0:v_a+1], v[v_b+4:v_b+5], a[a_c+64:a_c+79]     ; repeat:0x1, step:0x0, num_a_c:16
    
    v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+96:a_c+111]     ; repeat:0x1, step:1x0, num_a_c:16
    
    v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+0:v_a+1], v[v_b+6:v_b+7], a[a_c+80:a_c+95]     ; repeat:0x1, step:0x1, num_a_c:16
    
    v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+112:a_c+127]     ; repeat:0x1, step:1x1, num_a_c:16
    
    s_waitcnt lgkmcnt(4)
    v_mfma_f32_32x32x8f16 a[a_c+128:a_c+143], v[v_a+4:v_a+5], v[v_b+0:v_b+1], a[a_c+128:a_c+143]     ; repeat:1x0, step:0x0, num_a_c:16
    
    v_mfma_f32_32x32x8f16 a[a_c+160:a_c+175], v[v_a+6:v_a+7], v[v_b+0:v_b+1], a[a_c+160:a_c+175]     ; repeat:1x0, step:1x0, num_a_c:16
    
    v_mfma_f32_32x32x8f16 a[a_c+144:a_c+159], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+144:a_c+159]     ; repeat:1x0, step:0x1, num_a_c:16
    
    v_mfma_f32_32x32x8f16 a[a_c+176:a_c+191], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+176:a_c+191]     ; repeat:1x0, step:1x1, num_a_c:16
    
    s_cmp_eq_u32 1, s[s_flag_need_acc_yx]
    s_cbranch_scc0 igemm_bwd_gtcx35_nhwc_fp16_bx0_ex1_bt256x256x32_wt32x32x8_ws2x2_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_acc_yx_end_1  ; no need do accumulate yx
igemm_bwd_gtcx35_nhwc_fp16_bx0_ex1_bt256x256x32_wt32x32x8_ws2x2_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_acc_yx_1:
    s_mov_b32 s[s_out_offset], 0
    s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix]
    s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix]
    s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x]
    v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list]
    v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1]
    v_add_u32 v[v_out_iwo_list+2], s[s_tmp], v[v_out_iwo_list+2]
    v_add_u32 v[v_out_iwo_list+3], s[s_tmp], v[v_out_iwo_list+3]
    s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo]
    v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os]
    v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1]
    v_add_u32 v[v_out_os+2], s[s_tmp], v[v_out_os+2]
    v_add_u32 v[v_out_os+3], s[s_tmp], v[v_out_os+3]
    s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k]
    v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os]
    s_cbranch_scc0 igemm_bwd_gtcx35_nhwc_fp16_bx0_ex1_bt256x256x32_wt32x32x8_ws2x2_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_acc_yx_x_end_1
    s_mov_b32 s[s_move_slice_k_ix], 0
    v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list]
    v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1]
    v_add_i32 v[v_out_iho_list+2], s[s_ho_diff_acc_y], v[v_out_iho_list+2]
    v_add_i32 v[v_out_iho_list+3], s[s_ho_diff_acc_y], v[v_out_iho_list+3]
igemm_bwd_gtcx35_nhwc_fp16_bx0_ex1_bt256x256x32_wt32x32x8_ws2x2_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_acc_yx_x_end_1:
    v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1   ; extract flag_n
    v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list]
    v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5] vcc
    v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list]
    v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag] vcc
    v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1   ; extract flag_n
    v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1]
    v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5] vcc
    v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1]
    v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1] vcc
    v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 2, 1   ; extract flag_n
    v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2]
    v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp+5] vcc
    v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2]
    v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2] vcc
    v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 3, 1   ; extract flag_n
    v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3]
    v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp+5] vcc
    v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3]
    v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3] vcc
igemm_bwd_gtcx35_nhwc_fp16_bx0_ex1_bt256x256x32_wt32x32x8_ws2x2_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_acc_yx_end_1:

    s_waitcnt lgkmcnt(0)
    s_barrier
    s_waitcnt vmcnt(4)
    v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+2]
    v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+4], v[v_gld_b+6]
    v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+8], v[v_gld_b+10]
    v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+12], v[v_gld_b+14]
    ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3]
    v_mfma_f32_32x32x8f16 a[a_c+192:a_c+207], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+192:a_c+207]     ; repeat:1x1, step:0x0, num_a_c:16
    v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+2] op_sel:[1, 1]
    v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+4], v[v_gld_b+6] op_sel:[1, 1]
    v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+8], v[v_gld_b+10] op_sel:[1, 1]
    v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+12], v[v_gld_b+14] op_sel:[1, 1]
    ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:16
    v_mfma_f32_32x32x8f16 a[a_c+224:a_c+239], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+224:a_c+239]     ; repeat:1x1, step:1x0, num_a_c:16
    v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b+1], v[v_gld_b+3]
    v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+5], v[v_gld_b+7]
    v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+9], v[v_gld_b+11]
    v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+13], v[v_gld_b+15]
    ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:32
    v_mfma_f32_32x32x8f16 a[a_c+208:a_c+223], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+208:a_c+223]     ; repeat:1x1, step:0x1, num_a_c:16
    v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b+1], v[v_gld_b+3] op_sel:[1, 1]
    v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+5], v[v_gld_b+7] op_sel:[1, 1]
    v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+9], v[v_gld_b+11] op_sel:[1, 1]
    v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+13], v[v_gld_b+15] op_sel:[1, 1]
    ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:48
    v_mfma_f32_32x32x8f16 a[a_c+240:a_c+255], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+240:a_c+255]     ; repeat:1x1, step:1x1, num_a_c:16
    s_waitcnt vmcnt(0)
    ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3]
    v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+8:v_a+9], v[v_b+8:v_b+9], a[a_c+0:a_c+15]     ; repeat:0x0, step:0x0, num_a_c:16
    ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024
    v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+10:v_a+11], v[v_b+8:v_b+9], a[a_c+32:a_c+47]     ; repeat:0x0, step:1x0, num_a_c:16
    ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:2048
    v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+8:v_a+9], v[v_b+10:v_b+11], a[a_c+16:a_c+31]     ; repeat:0x0, step:0x1, num_a_c:16
    ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:3072
    v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+10:v_a+11], v[v_b+10:v_b+11], a[a_c+48:a_c+63]     ; repeat:0x0, step:1x1, num_a_c:16
    v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+8:v_a+9], v[v_b+12:v_b+13], a[a_c+64:a_c+79]     ; repeat:0x1, step:0x0, num_a_c:16
    v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+10:v_a+11], v[v_b+12:v_b+13], a[a_c+96:a_c+111]     ; repeat:0x1, step:1x0, num_a_c:16
    v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+8:v_a+9], v[v_b+14:v_b+15], a[a_c+80:a_c+95]     ; repeat:0x1, step:0x1, num_a_c:16
    v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+10:v_a+11], v[v_b+14:v_b+15], a[a_c+112:a_c+127]     ; repeat:0x1, step:1x1, num_a_c:16
    s_sub_i32 s[s_kitr], s[s_kitr], 32
    s_cmp_gt_i32 s[s_kitr], 0
    s_cbranch_scc0 L_igemm_bwd_gtcx35_nhwc_fp16_bx0_ex1_bt256x256x32_wt32x32x8_ws2x2_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_mfma_finishing
    s_waitcnt lgkmcnt(0)
    s_barrier
    ds_read2_b64 v[v_a+0:v_a+3], v[v_sld_a_os], offset0:0, offset1:64
    ds_read2_b64 v[v_b+0:v_b+3], v[v_sld_b_os], offset0:0, offset1:72
    v_mfma_f32_32x32x8f16 a[a_c+128:a_c+143], v[v_a+12:v_a+13], v[v_b+8:v_b+9], a[a_c+128:a_c+143]     ; repeat:1x0, step:0x0, num_a_c:16
    v_mfma_f32_32x32x8f16 a[a_c+160:a_c+175], v[v_a+14:v_a+15], v[v_b+8:v_b+9], a[a_c+160:a_c+175]     ; repeat:1x0, step:1x0, num_a_c:16
    v_mfma_f32_32x32x8f16 a[a_c+144:a_c+159], v[v_a+12:v_a+13], v[v_b+10:v_b+11], a[a_c+144:a_c+159]     ; repeat:1x0, step:0x1, num_a_c:16
    v_mfma_f32_32x32x8f16 a[a_c+176:a_c+191], v[v_a+14:v_a+15], v[v_b+10:v_b+11], a[a_c+176:a_c+191]     ; repeat:1x0, step:1x1, num_a_c:16
    v_add_u32 v[v_tmp+5], 2304, v[v_sld_b_os]
    ds_read2_b64 v[v_b+4+0:v_b+4+3], v[v_tmp+5], offset0:0, offset1:72
    ds_read2st64_b64 v[v_a+4+0:v_a+4+3], v[v_sld_a_os], offset0:4, offset1:5
    v_mfma_f32_32x32x8f16 a[a_c+192:a_c+207], v[v_a+12:v_a+13], v[v_b+12:v_b+13], a[a_c+192:a_c+207]     ; repeat:1x1, step:0x0, num_a_c:16
    v_mfma_f32_32x32x8f16 a[a_c+224:a_c+239], v[v_a+14:v_a+15], v[v_b+12:v_b+13], a[a_c+224:a_c+239]     ; repeat:1x1, step:1x0, num_a_c:16
    v_mfma_f32_32x32x8f16 a[a_c+208:a_c+223], v[v_a+12:v_a+13], v[v_b+14:v_b+15], a[a_c+208:a_c+223]     ; repeat:1x1, step:0x1, num_a_c:16
    v_mfma_f32_32x32x8f16 a[a_c+240:a_c+255], v[v_a+14:v_a+15], v[v_b+14:v_b+15], a[a_c+240:a_c+255]     ; repeat:1x1, step:1x1, num_a_c:16
    s_branch L_igemm_bwd_gtcx35_nhwc_fp16_bx0_ex1_bt256x256x32_wt32x32x8_ws2x2_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_mfma_body
L_igemm_bwd_gtcx35_nhwc_fp16_bx0_ex1_bt256x256x32_wt32x32x8_ws2x2_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_mfma_finishing:
    v_mfma_f32_32x32x8f16 a[a_c+128:a_c+143], v[v_a+12:v_a+13], v[v_b+8:v_b+9], a[a_c+128:a_c+143]     ; repeat:1x0, step:0x0, num_a_c:16
    v_mfma_f32_32x32x8f16 a[a_c+160:a_c+175], v[v_a+14:v_a+15], v[v_b+8:v_b+9], a[a_c+160:a_c+175]     ; repeat:1x0, step:1x0, num_a_c:16
    v_mfma_f32_32x32x8f16 a[a_c+144:a_c+159], v[v_a+12:v_a+13], v[v_b+10:v_b+11], a[a_c+144:a_c+159]     ; repeat:1x0, step:0x1, num_a_c:16
    v_mfma_f32_32x32x8f16 a[a_c+176:a_c+191], v[v_a+14:v_a+15], v[v_b+10:v_b+11], a[a_c+176:a_c+191]     ; repeat:1x0, step:1x1, num_a_c:16

    v_mfma_f32_32x32x8f16 a[a_c+192:a_c+207], v[v_a+12:v_a+13], v[v_b+12:v_b+13], a[a_c+192:a_c+207]     ; repeat:1x1, step:0x0, num_a_c:16
    v_mfma_f32_32x32x8f16 a[a_c+224:a_c+239], v[v_a+14:v_a+15], v[v_b+12:v_b+13], a[a_c+224:a_c+239]     ; repeat:1x1, step:1x0, num_a_c:16
    v_mfma_f32_32x32x8f16 a[a_c+208:a_c+223], v[v_a+12:v_a+13], v[v_b+14:v_b+15], a[a_c+208:a_c+223]     ; repeat:1x1, step:0x1, num_a_c:16
    v_mfma_f32_32x32x8f16 a[a_c+240:a_c+255], v[v_a+14:v_a+15], v[v_b+14:v_b+15], a[a_c+240:a_c+255]     ; repeat:1x1, step:1x1, num_a_c:16

L_igemm_bwd_gtcx35_nhwc_fp16_bx0_ex1_bt256x256x32_wt32x32x8_ws2x2_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_mfma_end:
    s_waitcnt lgkmcnt(0)
    s_barrier
    ds_read2_b64 v[v_a+0:v_a+3], v[v_sld_a_os], offset0:0, offset1:64
    ds_read2_b64 v[v_b+0:v_b+3], v[v_sld_b_os], offset0:0, offset1:72
    v_add_u32 v[v_tmp+5], 2304, v[v_sld_b_os]
    ds_read2_b64 v[v_b+4+0:v_b+4+3], v[v_tmp+5], offset0:0, offset1:72
    ds_read2st64_b64 v[v_a+4+0:v_a+4+3], v[v_sld_a_os], offset0:4, offset1:5
    ; k iteration : 0
    s_waitcnt lgkmcnt(2)
    v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15]     ; repeat:0x0, step:0x0, num_a_c:16
    v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47]     ; repeat:0x0, step:1x0, num_a_c:16
    v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31]     ; repeat:0x0, step:0x1, num_a_c:16
    v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63]     ; repeat:0x0, step:1x1, num_a_c:16
    ds_read2st64_b64 v[v_a+8+0:v_a+8+3], v[v_sld_a_os], offset0:8, offset1:9 ; load i_k:1 into local buffer 1, repeat 0
    v_add_u32 v[v_tmp+5], 2304, v[v_tmp+5]
    ds_read2_b64 v[v_b+8+0:v_b+8+3], v[v_tmp+5], offset0:0, offset1:72 ; load i_k:1 into local buffer 1, repeat 0

    s_waitcnt lgkmcnt(3)
    v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+0:v_a+1], v[v_b+4:v_b+5], a[a_c+64:a_c+79]     ; repeat:0x1, step:0x0, num_a_c:16
    v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+96:a_c+111]     ; repeat:0x1, step:1x0, num_a_c:16
    v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+0:v_a+1], v[v_b+6:v_b+7], a[a_c+80:a_c+95]     ; repeat:0x1, step:0x1, num_a_c:16
    v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+112:a_c+127]     ; repeat:0x1, step:1x1, num_a_c:16
    v_add_u32 v[v_tmp+5], 2304, v[v_tmp+5]
    ds_read2_b64 v[v_b+12+0:v_b+12+3], v[v_tmp+5], offset0:0, offset1:72 ; load i_k:1 into local buffer 1, repeat 1
    ds_read2st64_b64 v[v_a+12+0:v_a+12+3], v[v_sld_a_os], offset0:12, offset1:13 ; load i_k:1 into local buffer 1, repeat 1

    s_waitcnt lgkmcnt(4)
    v_mfma_f32_32x32x8f16 a[a_c+128:a_c+143], v[v_a+4:v_a+5], v[v_b+0:v_b+1], a[a_c+128:a_c+143]     ; repeat:1x0, step:0x0, num_a_c:16
    v_mfma_f32_32x32x8f16 a[a_c+160:a_c+175], v[v_a+6:v_a+7], v[v_b+0:v_b+1], a[a_c+160:a_c+175]     ; repeat:1x0, step:1x0, num_a_c:16
    v_mfma_f32_32x32x8f16 a[a_c+144:a_c+159], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+144:a_c+159]     ; repeat:1x0, step:0x1, num_a_c:16
    v_mfma_f32_32x32x8f16 a[a_c+176:a_c+191], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+176:a_c+191]     ; repeat:1x0, step:1x1, num_a_c:16
    ds_read2st64_b64 v[v_a+0:v_a+3], v[v_sld_a_os], offset0:16, offset1:17 ; load i_k:2 into local buffer 0, repeat 0
    v_add_u32 v[v_tmp+5], 2304, v[v_tmp+5]
    ds_read2_b64 v[v_b+0:v_b+3], v[v_tmp+5], offset0:0, offset1:72 ; load i_k:2 into local buffer 0, repeat 0

    v_mfma_f32_32x32x8f16 a[a_c+192:a_c+207], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+192:a_c+207]     ; repeat:1x1, step:0x0, num_a_c:16
    v_mfma_f32_32x32x8f16 a[a_c+224:a_c+239], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+224:a_c+239]     ; repeat:1x1, step:1x0, num_a_c:16
    v_mfma_f32_32x32x8f16 a[a_c+208:a_c+223], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+208:a_c+223]     ; repeat:1x1, step:0x1, num_a_c:16
    v_mfma_f32_32x32x8f16 a[a_c+240:a_c+255], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+240:a_c+255]     ; repeat:1x1, step:1x1, num_a_c:16
    v_add_u32 v[v_tmp+5], 2304, v[v_tmp+5]
    ds_read2_b64 v[v_b+4+0:v_b+4+3], v[v_tmp+5], offset0:0, offset1:72 ; load i_k:2 into local buffer 0, repeat 1

    ; k iteration : 1
    s_waitcnt lgkmcnt(5)
    v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+8:v_a+9], v[v_b+8:v_b+9], a[a_c+0:a_c+15]     ; repeat:0x0, step:0x0, num_a_c:16
    v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+10:v_a+11], v[v_b+8:v_b+9], a[a_c+32:a_c+47]     ; repeat:0x0, step:1x0, num_a_c:16
    v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+8:v_a+9], v[v_b+10:v_b+11], a[a_c+16:a_c+31]     ; repeat:0x0, step:0x1, num_a_c:16
    v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+10:v_a+11], v[v_b+10:v_b+11], a[a_c+48:a_c+63]     ; repeat:0x0, step:1x1, num_a_c:16
    ds_read2st64_b64 v[v_a+4+0:v_a+4+3], v[v_sld_a_os], offset0:20, offset1:21 ; load i_k:2 into local buffer 0, repeat 1

    s_waitcnt lgkmcnt(5)
    v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+8:v_a+9], v[v_b+12:v_b+13], a[a_c+64:a_c+79]     ; repeat:0x1, step:0x0, num_a_c:16
    v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+10:v_a+11], v[v_b+12:v_b+13], a[a_c+96:a_c+111]     ; repeat:0x1, step:1x0, num_a_c:16
    v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+8:v_a+9], v[v_b+14:v_b+15], a[a_c+80:a_c+95]     ; repeat:0x1, step:0x1, num_a_c:16
    v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+10:v_a+11], v[v_b+14:v_b+15], a[a_c+112:a_c+127]     ; repeat:0x1, step:1x1, num_a_c:16
    ds_read2st64_b64 v[v_a+8+0:v_a+8+3], v[v_sld_a_os], offset0:24, offset1:25 ; load i_k:3 into local buffer 1, repeat 0

    s_waitcnt lgkmcnt(5)
    v_mfma_f32_32x32x8f16 a[a_c+128:a_c+143], v[v_a+12:v_a+13], v[v_b+8:v_b+9], a[a_c+128:a_c+143]     ; repeat:1x0, step:0x0, num_a_c:16
    v_mfma_f32_32x32x8f16 a[a_c+160:a_c+175], v[v_a+14:v_a+15], v[v_b+8:v_b+9], a[a_c+160:a_c+175]     ; repeat:1x0, step:1x0, num_a_c:16
    v_mfma_f32_32x32x8f16 a[a_c+144:a_c+159], v[v_a+12:v_a+13], v[v_b+10:v_b+11], a[a_c+144:a_c+159]     ; repeat:1x0, step:0x1, num_a_c:16
    v_mfma_f32_32x32x8f16 a[a_c+176:a_c+191], v[v_a+14:v_a+15], v[v_b+10:v_b+11], a[a_c+176:a_c+191]     ; repeat:1x0, step:1x1, num_a_c:16
    v_add_u32 v[v_tmp+5], 2304, v[v_tmp+5]
    ds_read2_b64 v[v_b+8+0:v_b+8+3], v[v_tmp+5], offset0:0, offset1:72 ; load i_k:3 into local buffer 1, repeat 0

    v_mfma_f32_32x32x8f16 a[a_c+192:a_c+207], v[v_a+12:v_a+13], v[v_b+12:v_b+13], a[a_c+192:a_c+207]     ; repeat:1x1, step:0x0, num_a_c:16
    v_mfma_f32_32x32x8f16 a[a_c+224:a_c+239], v[v_a+14:v_a+15], v[v_b+12:v_b+13], a[a_c+224:a_c+239]     ; repeat:1x1, step:1x0, num_a_c:16
    v_mfma_f32_32x32x8f16 a[a_c+208:a_c+223], v[v_a+12:v_a+13], v[v_b+14:v_b+15], a[a_c+208:a_c+223]     ; repeat:1x1, step:0x1, num_a_c:16
    v_mfma_f32_32x32x8f16 a[a_c+240:a_c+255], v[v_a+14:v_a+15], v[v_b+14:v_b+15], a[a_c+240:a_c+255]     ; repeat:1x1, step:1x1, num_a_c:16
    v_add_u32 v[v_tmp+5], 2304, v[v_tmp+5]
    ds_read2_b64 v[v_b+12+0:v_b+12+3], v[v_tmp+5], offset0:0, offset1:72 ; load i_k:3 into local buffer 1, repeat 1
    ds_read2st64_b64 v[v_a+12+0:v_a+12+3], v[v_sld_a_os], offset0:28, offset1:29 ; load i_k:3 into local buffer 1, repeat 1
    s_waitcnt lgkmcnt(6)
    v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15]     ; repeat:0x0, step:0x0, num_a_c:16
    v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47]     ; repeat:0x0, step:1x0, num_a_c:16
    v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31]     ; repeat:0x0, step:0x1, num_a_c:16
    v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63]     ; repeat:0x0, step:1x1, num_a_c:16

    s_waitcnt lgkmcnt(5)
    v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+0:v_a+1], v[v_b+4:v_b+5], a[a_c+64:a_c+79]     ; repeat:0x1, step:0x0, num_a_c:16
    v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+96:a_c+111]     ; repeat:0x1, step:1x0, num_a_c:16
    v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+0:v_a+1], v[v_b+6:v_b+7], a[a_c+80:a_c+95]     ; repeat:0x1, step:0x1, num_a_c:16
    v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+112:a_c+127]     ; repeat:0x1, step:1x1, num_a_c:16

    s_waitcnt lgkmcnt(4)
    v_mfma_f32_32x32x8f16 a[a_c+128:a_c+143], v[v_a+4:v_a+5], v[v_b+0:v_b+1], a[a_c+128:a_c+143]     ; repeat:1x0, step:0x0, num_a_c:16
    v_mfma_f32_32x32x8f16 a[a_c+160:a_c+175], v[v_a+6:v_a+7], v[v_b+0:v_b+1], a[a_c+160:a_c+175]     ; repeat:1x0, step:1x0, num_a_c:16
    v_mfma_f32_32x32x8f16 a[a_c+144:a_c+159], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+144:a_c+159]     ; repeat:1x0, step:0x1, num_a_c:16
    v_mfma_f32_32x32x8f16 a[a_c+176:a_c+191], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+176:a_c+191]     ; repeat:1x0, step:1x1, num_a_c:16

    ; k iteration : 30
    v_mfma_f32_32x32x8f16 a[a_c+192:a_c+207], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+192:a_c+207]     ; repeat:1x1, step:0x0, num_a_c:16
    v_mfma_f32_32x32x8f16 a[a_c+224:a_c+239], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+224:a_c+239]     ; repeat:1x1, step:1x0, num_a_c:16
    v_mfma_f32_32x32x8f16 a[a_c+208:a_c+223], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+208:a_c+223]     ; repeat:1x1, step:0x1, num_a_c:16
    v_mfma_f32_32x32x8f16 a[a_c+240:a_c+255], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+240:a_c+255]     ; repeat:1x1, step:1x1, num_a_c:16
    ; k iteration : 31
    s_waitcnt lgkmcnt(2)
    v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+8:v_a+9], v[v_b+8:v_b+9], a[a_c+0:a_c+15]     ; repeat:0x0, step:0x0, num_a_c:16
    v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+10:v_a+11], v[v_b+8:v_b+9], a[a_c+32:a_c+47]     ; repeat:0x0, step:1x0, num_a_c:16
    v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+8:v_a+9], v[v_b+10:v_b+11], a[a_c+16:a_c+31]     ; repeat:0x0, step:0x1, num_a_c:16
    v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+10:v_a+11], v[v_b+10:v_b+11], a[a_c+48:a_c+63]     ; repeat:0x0, step:1x1, num_a_c:16

    s_waitcnt lgkmcnt(1)
    v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+8:v_a+9], v[v_b+12:v_b+13], a[a_c+64:a_c+79]     ; repeat:0x1, step:0x0, num_a_c:16
    v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+10:v_a+11], v[v_b+12:v_b+13], a[a_c+96:a_c+111]     ; repeat:0x1, step:1x0, num_a_c:16
    v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+8:v_a+9], v[v_b+14:v_b+15], a[a_c+80:a_c+95]     ; repeat:0x1, step:0x1, num_a_c:16
    v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+10:v_a+11], v[v_b+14:v_b+15], a[a_c+112:a_c+127]     ; repeat:0x1, step:1x1, num_a_c:16

    s_waitcnt lgkmcnt(0)
    v_mfma_f32_32x32x8f16 a[a_c+128:a_c+143], v[v_a+12:v_a+13], v[v_b+8:v_b+9], a[a_c+128:a_c+143]     ; repeat:1x0, step:0x0, num_a_c:16
    v_mfma_f32_32x32x8f16 a[a_c+160:a_c+175], v[v_a+14:v_a+15], v[v_b+8:v_b+9], a[a_c+160:a_c+175]     ; repeat:1x0, step:1x0, num_a_c:16
    v_mfma_f32_32x32x8f16 a[a_c+144:a_c+159], v[v_a+12:v_a+13], v[v_b+10:v_b+11], a[a_c+144:a_c+159]     ; repeat:1x0, step:0x1, num_a_c:16
    v_mfma_f32_32x32x8f16 a[a_c+176:a_c+191], v[v_a+14:v_a+15], v[v_b+10:v_b+11], a[a_c+176:a_c+191]     ; repeat:1x0, step:1x1, num_a_c:16

    v_mfma_f32_32x32x8f16 a[a_c+192:a_c+207], v[v_a+12:v_a+13], v[v_b+12:v_b+13], a[a_c+192:a_c+207]     ; repeat:1x1, step:0x0, num_a_c:16
    v_mfma_f32_32x32x8f16 a[a_c+224:a_c+239], v[v_a+14:v_a+15], v[v_b+12:v_b+13], a[a_c+224:a_c+239]     ; repeat:1x1, step:1x0, num_a_c:16
    v_mfma_f32_32x32x8f16 a[a_c+208:a_c+223], v[v_a+12:v_a+13], v[v_b+14:v_b+15], a[a_c+208:a_c+223]     ; repeat:1x1, step:0x1, num_a_c:16
    v_mfma_f32_32x32x8f16 a[a_c+240:a_c+255], v[v_a+14:v_a+15], v[v_b+14:v_b+15], a[a_c+240:a_c+255]     ; repeat:1x1, step:1x1, num_a_c:16

    s_nop 15
    s_nop 2
    v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift]
    s_mov_b32 s[s_tmp], 0
    v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift]
    ; coalescing store, mapping:mt_m:256, mt_n:256, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:2, s_n:2 | 32x32x8, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1
    ; coalescing_groups:4, num_dword_per_group:64
    ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x256 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7]
    ; g_mr:2, g_ms:2, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2
    ; nd_stride:[2, 1, 4, 1, 2, 2, 1]
    ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0
    s_barrier
    v_accvgpr_read_b32 v[v_c], a[a_c]
    v_accvgpr_read_b32 v[v_c+1], a[a_c+1]
    v_accvgpr_read_b32 v[v_c+2], a[a_c+2]
    v_accvgpr_read_b32 v[v_c+3], a[a_c+3]
    v_cvt_f16_f32_e32 v[v_c], v[v_c]
    v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1]
    v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2]
    v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3]
    ds_write_b16 v[v_co_sst], v[v_c]  ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:0, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+1] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:0, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+2] offset:1024 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:0, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+3] offset:1536 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:0, i_ns:0, i_nw:0
    v_accvgpr_read_b32 v[v_c+4], a[a_c+16]
    v_accvgpr_read_b32 v[v_c+5], a[a_c+17]
    v_accvgpr_read_b32 v[v_c+6], a[a_c+18]
    v_accvgpr_read_b32 v[v_c+7], a[a_c+19]
    v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4]
    v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5]
    v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6]
    v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7]
    ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:0, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+5] offset:576 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:0, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+6] offset:1088 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:0, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+7] offset:1600 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:0, i_ns:1, i_nw:0
    v_accvgpr_read_b32 v[v_c+8], a[a_c+64]
    v_accvgpr_read_b32 v[v_c+9], a[a_c+65]
    v_accvgpr_read_b32 v[v_c+10], a[a_c+66]
    v_accvgpr_read_b32 v[v_c+11], a[a_c+67]
    v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8]
    v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9]
    v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10]
    v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11]
    ds_write_b16 v[v_co_sst], v[v_c+8] offset:256 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:1, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+9] offset:768 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:1, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+10] offset:1280 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:1, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+11] offset:1792 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:1, i_ns:0, i_nw:0
    v_accvgpr_read_b32 v[v_c+12], a[a_c+80]
    v_accvgpr_read_b32 v[v_c+13], a[a_c+81]
    v_accvgpr_read_b32 v[v_c+14], a[a_c+82]
    v_accvgpr_read_b32 v[v_c+15], a[a_c+83]
    v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12]
    v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13]
    v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14]
    v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15]
    ds_write_b16 v[v_co_sst], v[v_c+12] offset:320 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:1, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+13] offset:832 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:1, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+14] offset:1344 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:1, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+15] offset:1856 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:1, i_ns:1, i_nw:0
    v_accvgpr_read_b32 v[v_c+16], a[a_c+4]
    v_accvgpr_read_b32 v[v_c+17], a[a_c+5]
    v_accvgpr_read_b32 v[v_c+18], a[a_c+6]
    v_accvgpr_read_b32 v[v_c+19], a[a_c+7]
    v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16]
    v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17]
    v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18]
    v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19]
    ds_write_b16 v[v_co_sst], v[v_c+16] offset:4096 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:0, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+17] offset:4608 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:0, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+18] offset:5120 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:0, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+19] offset:5632 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:0, i_ns:0, i_nw:0
    v_accvgpr_read_b32 v[v_c+20], a[a_c+20]
    v_accvgpr_read_b32 v[v_c+21], a[a_c+21]
    v_accvgpr_read_b32 v[v_c+22], a[a_c+22]
    v_accvgpr_read_b32 v[v_c+23], a[a_c+23]
    v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20]
    v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21]
    v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22]
    v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23]
    ds_write_b16 v[v_co_sst], v[v_c+20] offset:4160 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:0, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+21] offset:4672 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:0, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+22] offset:5184 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:0, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+23] offset:5696 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:0, i_ns:1, i_nw:0
    v_accvgpr_read_b32 v[v_c+24], a[a_c+68]
    v_accvgpr_read_b32 v[v_c+25], a[a_c+69]
    v_accvgpr_read_b32 v[v_c+26], a[a_c+70]
    v_accvgpr_read_b32 v[v_c+27], a[a_c+71]
    v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24]
    v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25]
    v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26]
    v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27]
    ds_write_b16 v[v_co_sst], v[v_c+24] offset:4352 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:1, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+25] offset:4864 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:1, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+26] offset:5376 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:1, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+27] offset:5888 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:1, i_ns:0, i_nw:0
    v_accvgpr_read_b32 v[v_c+28], a[a_c+84]
    v_accvgpr_read_b32 v[v_c+29], a[a_c+85]
    v_accvgpr_read_b32 v[v_c+30], a[a_c+86]
    v_accvgpr_read_b32 v[v_c+31], a[a_c+87]
    v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28]
    v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29]
    v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30]
    v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31]
    ds_write_b16 v[v_co_sst], v[v_c+28] offset:4416 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:1, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+29] offset:4928 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:1, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+30] offset:5440 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:1, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+31] offset:5952 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:1, i_ns:1, i_nw:0
    v_accvgpr_read_b32 v[v_c], a[a_c+8]
    v_accvgpr_read_b32 v[v_c+1], a[a_c+9]
    v_accvgpr_read_b32 v[v_c+2], a[a_c+10]
    v_accvgpr_read_b32 v[v_c+3], a[a_c+11]
    v_cvt_f16_f32_e32 v[v_c], v[v_c]
    v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1]
    v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2]
    v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3]
    ds_write_b16 v[v_co_sst], v[v_c] offset:8192 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2  x  i_nr:0, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+1] offset:8704 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2  x  i_nr:0, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+2] offset:9216 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2  x  i_nr:0, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+3] offset:9728 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2  x  i_nr:0, i_ns:0, i_nw:0
    v_accvgpr_read_b32 v[v_c+4], a[a_c+24]
    v_accvgpr_read_b32 v[v_c+5], a[a_c+25]
    v_accvgpr_read_b32 v[v_c+6], a[a_c+26]
    v_accvgpr_read_b32 v[v_c+7], a[a_c+27]
    v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4]
    v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5]
    v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6]
    v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7]
    ds_write_b16 v[v_co_sst], v[v_c+4] offset:8256 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2  x  i_nr:0, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+5] offset:8768 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2  x  i_nr:0, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+6] offset:9280 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2  x  i_nr:0, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+7] offset:9792 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2  x  i_nr:0, i_ns:1, i_nw:0
    v_accvgpr_read_b32 v[v_c+8], a[a_c+72]
    v_accvgpr_read_b32 v[v_c+9], a[a_c+73]
    v_accvgpr_read_b32 v[v_c+10], a[a_c+74]
    v_accvgpr_read_b32 v[v_c+11], a[a_c+75]
    v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8]
    v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9]
    v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10]
    v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11]
    ds_write_b16 v[v_co_sst], v[v_c+8] offset:8448 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2  x  i_nr:1, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+9] offset:8960 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2  x  i_nr:1, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+10] offset:9472 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2  x  i_nr:1, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+11] offset:9984 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2  x  i_nr:1, i_ns:0, i_nw:0
    v_accvgpr_read_b32 v[v_c+12], a[a_c+88]
    v_accvgpr_read_b32 v[v_c+13], a[a_c+89]
    v_accvgpr_read_b32 v[v_c+14], a[a_c+90]
    v_accvgpr_read_b32 v[v_c+15], a[a_c+91]
    v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12]
    v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13]
    v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14]
    v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15]
    ds_write_b16 v[v_co_sst], v[v_c+12] offset:8512 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2  x  i_nr:1, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+13] offset:9024 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2  x  i_nr:1, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+14] offset:9536 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2  x  i_nr:1, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+15] offset:10048 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2  x  i_nr:1, i_ns:1, i_nw:0
    v_accvgpr_read_b32 v[v_c+16], a[a_c+12]
    v_accvgpr_read_b32 v[v_c+17], a[a_c+13]
    v_accvgpr_read_b32 v[v_c+18], a[a_c+14]
    v_accvgpr_read_b32 v[v_c+19], a[a_c+15]
    v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16]
    v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17]
    v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18]
    v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19]
    ds_write_b16 v[v_co_sst], v[v_c+16] offset:12288 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3  x  i_nr:0, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+17] offset:12800 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3  x  i_nr:0, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+18] offset:13312 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3  x  i_nr:0, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+19] offset:13824 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3  x  i_nr:0, i_ns:0, i_nw:0
    v_accvgpr_read_b32 v[v_c+20], a[a_c+28]
    v_accvgpr_read_b32 v[v_c+21], a[a_c+29]
    v_accvgpr_read_b32 v[v_c+22], a[a_c+30]
    v_accvgpr_read_b32 v[v_c+23], a[a_c+31]
    v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20]
    v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21]
    v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22]
    v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23]
    ds_write_b16 v[v_co_sst], v[v_c+20] offset:12352 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3  x  i_nr:0, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+21] offset:12864 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3  x  i_nr:0, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+22] offset:13376 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3  x  i_nr:0, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+23] offset:13888 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3  x  i_nr:0, i_ns:1, i_nw:0
    v_accvgpr_read_b32 v[v_c+24], a[a_c+76]
    v_accvgpr_read_b32 v[v_c+25], a[a_c+77]
    v_accvgpr_read_b32 v[v_c+26], a[a_c+78]
    v_accvgpr_read_b32 v[v_c+27], a[a_c+79]
    v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24]
    v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25]
    v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26]
    v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27]
    ds_write_b16 v[v_co_sst], v[v_c+24] offset:12544 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3  x  i_nr:1, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+25] offset:13056 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3  x  i_nr:1, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+26] offset:13568 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3  x  i_nr:1, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+27] offset:14080 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3  x  i_nr:1, i_ns:0, i_nw:0
    v_accvgpr_read_b32 v[v_c+28], a[a_c+92]
    v_accvgpr_read_b32 v[v_c+29], a[a_c+93]
    v_accvgpr_read_b32 v[v_c+30], a[a_c+94]
    v_accvgpr_read_b32 v[v_c+31], a[a_c+95]
    v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28]
    v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29]
    v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30]
    v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31]
    ds_write_b16 v[v_co_sst], v[v_c+28] offset:12608 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3  x  i_nr:1, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+29] offset:13120 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3  x  i_nr:1, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+30] offset:13632 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3  x  i_nr:1, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+31] offset:14144 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3  x  i_nr:1, i_ns:1, i_nw:0
    v_add_u32 v[v_tmp], 0, v[v_in_inb]
    .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1
    .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1
    v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift]
    v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift]
    v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi]
    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1]
    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index]
    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in]
    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os]
    v_cmp_gt_u32 vcc, s[s_n], v[v_in_in]
    v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c] vcc
    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi]
    v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1] vcc
    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi]
    v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag] vcc
    s_waitcnt lgkmcnt(0)
    s_barrier
    ;   load from lds, i_ssgroup:0, num_sld_per_ssgroup:4
    ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:0
    ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096
    ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192
    ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288
    ;   store to global, m index start from 0, m0:0, m1:0
    s_waitcnt lgkmcnt(3)
    v_cmpx_le_u32 vcc, 1, v[v_in_flag]
    buffer_store_dwordx4_m v[v_c:v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mov_b64 exec, -1
    v_add_u32 v[v_tmp], 8, v[v_in_inb]
    .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1
    .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1
    v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift]
    v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift]
    v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi]
    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1]
    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index]
    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in]
    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os]
    v_cmp_gt_u32 vcc, s[s_n], v[v_in_in]
    v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c] vcc
    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi]
    v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1] vcc
    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi]
    v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag] vcc
    s_waitcnt lgkmcnt(2)
    v_cmpx_le_u32 vcc, 1, v[v_in_flag]
    buffer_store_dwordx4_m v[v_c+4:v_c+4+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mov_b64 exec, -1
    v_add_u32 v[v_tmp], 16, v[v_in_inb]
    .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1
    .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1
    v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift]
    v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift]
    v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi]
    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1]
    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index]
    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in]
    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os]
    v_cmp_gt_u32 vcc, s[s_n], v[v_in_in]
    v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c] vcc
    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi]
    v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1] vcc
    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi]
    v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag] vcc
    s_waitcnt lgkmcnt(1)
    v_cmpx_le_u32 vcc, 1, v[v_in_flag]
    buffer_store_dwordx4_m v[v_c+8:v_c+8+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mov_b64 exec, -1
    v_add_u32 v[v_tmp], 24, v[v_in_inb]
    .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1
    .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1
    v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift]
    v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift]
    v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi]
    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1]
    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index]
    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in]
    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os]
    v_cmp_gt_u32 vcc, s[s_n], v[v_in_in]
    v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c] vcc
    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi]
    v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1] vcc
    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi]
    v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag] vcc
    s_waitcnt lgkmcnt(0)
    v_cmpx_le_u32 vcc, 1, v[v_in_flag]
    buffer_store_dwordx4_m v[v_c+12:v_c+12+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mov_b64 exec, -1
    v_add_u32 v[v_tmp], 64, v[v_in_inb]
    .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1
    .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1
    v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift]
    v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift]
    v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi]
    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1]
    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index]
    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in]
    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os]
    v_cmp_gt_u32 vcc, s[s_n], v[v_in_in]
    v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c] vcc
    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi]
    v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1] vcc
    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi]
    v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag] vcc
    ;   load from lds, i_ssgroup:1, num_sld_per_ssgroup:4
    ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384
    ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480
    ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576
    ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672
    ;   store to global, m index start from 0, m0:0, m1:0
    s_waitcnt lgkmcnt(3)
    v_cmpx_le_u32 vcc, 1, v[v_in_flag]
    buffer_store_dwordx4_m v[v_c:v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mov_b64 exec, -1
    v_add_u32 v[v_tmp], 72, v[v_in_inb]
    .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1
    .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1
    v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift]
    v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift]
    v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi]
    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1]
    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index]
    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in]
    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os]
    v_cmp_gt_u32 vcc, s[s_n], v[v_in_in]
    v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c] vcc
    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi]
    v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1] vcc
    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi]
    v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag] vcc
    s_waitcnt lgkmcnt(2)
    v_cmpx_le_u32 vcc, 1, v[v_in_flag]
    buffer_store_dwordx4_m v[v_c+4:v_c+4+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mov_b64 exec, -1
    v_add_u32 v[v_tmp], 80, v[v_in_inb]
    .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1
    .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1
    v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift]
    v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift]
    v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi]
    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1]
    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index]
    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in]
    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os]
    v_cmp_gt_u32 vcc, s[s_n], v[v_in_in]
    v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c] vcc
    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi]
    v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1] vcc
    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi]
    v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag] vcc
    s_waitcnt lgkmcnt(1)
    v_cmpx_le_u32 vcc, 1, v[v_in_flag]
    buffer_store_dwordx4_m v[v_c+8:v_c+8+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mov_b64 exec, -1
    v_add_u32 v[v_tmp], 88, v[v_in_inb]
    .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1
    .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1
    v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift]
    v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift]
    v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi]
    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1]
    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index]
    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in]
    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os]
    v_cmp_gt_u32 vcc, s[s_n], v[v_in_in]
    v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c] vcc
    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi]
    v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1] vcc
    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi]
    v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag] vcc
    s_waitcnt lgkmcnt(0)
    v_cmpx_le_u32 vcc, 1, v[v_in_flag]
    buffer_store_dwordx4_m v[v_c+12:v_c+12+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mov_b64 exec, -1
    ; start group 1, i_g_mr:0, i_g_ms:1, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 32
    s_barrier
    v_accvgpr_read_b32 v[v_c], a[a_c+32]
    v_accvgpr_read_b32 v[v_c+1], a[a_c+33]
    v_accvgpr_read_b32 v[v_c+2], a[a_c+34]
    v_accvgpr_read_b32 v[v_c+3], a[a_c+35]
    v_cvt_f16_f32_e32 v[v_c], v[v_c]
    v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1]
    v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2]
    v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3]
    ds_write_b16 v[v_co_sst], v[v_c]  ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:0, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+1] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:0, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+2] offset:1024 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:0, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+3] offset:1536 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:0, i_ns:0, i_nw:0
    v_accvgpr_read_b32 v[v_c+4], a[a_c+48]
    v_accvgpr_read_b32 v[v_c+5], a[a_c+49]
    v_accvgpr_read_b32 v[v_c+6], a[a_c+50]
    v_accvgpr_read_b32 v[v_c+7], a[a_c+51]
    v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4]
    v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5]
    v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6]
    v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7]
    ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:0, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+5] offset:576 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:0, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+6] offset:1088 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:0, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+7] offset:1600 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:0, i_ns:1, i_nw:0
    v_accvgpr_read_b32 v[v_c+8], a[a_c+96]
    v_accvgpr_read_b32 v[v_c+9], a[a_c+97]
    v_accvgpr_read_b32 v[v_c+10], a[a_c+98]
    v_accvgpr_read_b32 v[v_c+11], a[a_c+99]
    v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8]
    v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9]
    v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10]
    v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11]
    ds_write_b16 v[v_co_sst], v[v_c+8] offset:256 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:1, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+9] offset:768 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:1, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+10] offset:1280 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:1, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+11] offset:1792 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:1, i_ns:0, i_nw:0
    v_accvgpr_read_b32 v[v_c+12], a[a_c+112]
    v_accvgpr_read_b32 v[v_c+13], a[a_c+113]
    v_accvgpr_read_b32 v[v_c+14], a[a_c+114]
    v_accvgpr_read_b32 v[v_c+15], a[a_c+115]
    v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12]
    v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13]
    v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14]
    v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15]
    ds_write_b16 v[v_co_sst], v[v_c+12] offset:320 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:1, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+13] offset:832 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:1, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+14] offset:1344 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:1, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+15] offset:1856 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:1, i_ns:1, i_nw:0
    v_accvgpr_read_b32 v[v_c+16], a[a_c+36]
    v_accvgpr_read_b32 v[v_c+17], a[a_c+37]
    v_accvgpr_read_b32 v[v_c+18], a[a_c+38]
    v_accvgpr_read_b32 v[v_c+19], a[a_c+39]
    v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16]
    v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17]
    v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18]
    v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19]
    ds_write_b16 v[v_co_sst], v[v_c+16] offset:4096 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:0, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+17] offset:4608 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:0, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+18] offset:5120 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:0, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+19] offset:5632 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:0, i_ns:0, i_nw:0
    v_accvgpr_read_b32 v[v_c+20], a[a_c+52]
    v_accvgpr_read_b32 v[v_c+21], a[a_c+53]
    v_accvgpr_read_b32 v[v_c+22], a[a_c+54]
    v_accvgpr_read_b32 v[v_c+23], a[a_c+55]
    v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20]
    v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21]
    v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22]
    v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23]
    ds_write_b16 v[v_co_sst], v[v_c+20] offset:4160 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:0, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+21] offset:4672 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:0, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+22] offset:5184 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:0, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+23] offset:5696 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:0, i_ns:1, i_nw:0
    v_accvgpr_read_b32 v[v_c+24], a[a_c+100]
    v_accvgpr_read_b32 v[v_c+25], a[a_c+101]
    v_accvgpr_read_b32 v[v_c+26], a[a_c+102]
    v_accvgpr_read_b32 v[v_c+27], a[a_c+103]
    v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24]
    v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25]
    v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26]
    v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27]
    ds_write_b16 v[v_co_sst], v[v_c+24] offset:4352 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:1, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+25] offset:4864 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:1, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+26] offset:5376 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:1, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+27] offset:5888 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:1, i_ns:0, i_nw:0
    v_accvgpr_read_b32 v[v_c+28], a[a_c+116]
    v_accvgpr_read_b32 v[v_c+29], a[a_c+117]
    v_accvgpr_read_b32 v[v_c+30], a[a_c+118]
    v_accvgpr_read_b32 v[v_c+31], a[a_c+119]
    v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28]
    v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29]
    v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30]
    v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31]
    ds_write_b16 v[v_co_sst], v[v_c+28] offset:4416 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:1, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+29] offset:4928 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:1, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+30] offset:5440 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:1, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+31] offset:5952 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:1, i_ns:1, i_nw:0
    v_accvgpr_read_b32 v[v_c], a[a_c+40]
    v_accvgpr_read_b32 v[v_c+1], a[a_c+41]
    v_accvgpr_read_b32 v[v_c+2], a[a_c+42]
    v_accvgpr_read_b32 v[v_c+3], a[a_c+43]
    v_cvt_f16_f32_e32 v[v_c], v[v_c]
    v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1]
    v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2]
    v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3]
    ds_write_b16 v[v_co_sst], v[v_c] offset:8192 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2  x  i_nr:0, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+1] offset:8704 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2  x  i_nr:0, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+2] offset:9216 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2  x  i_nr:0, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+3] offset:9728 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2  x  i_nr:0, i_ns:0, i_nw:0
    v_accvgpr_read_b32 v[v_c+4], a[a_c+56]
    v_accvgpr_read_b32 v[v_c+5], a[a_c+57]
    v_accvgpr_read_b32 v[v_c+6], a[a_c+58]
    v_accvgpr_read_b32 v[v_c+7], a[a_c+59]
    v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4]
    v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5]
    v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6]
    v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7]
    ds_write_b16 v[v_co_sst], v[v_c+4] offset:8256 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2  x  i_nr:0, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+5] offset:8768 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2  x  i_nr:0, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+6] offset:9280 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2  x  i_nr:0, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+7] offset:9792 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2  x  i_nr:0, i_ns:1, i_nw:0
    v_accvgpr_read_b32 v[v_c+8], a[a_c+104]
    v_accvgpr_read_b32 v[v_c+9], a[a_c+105]
    v_accvgpr_read_b32 v[v_c+10], a[a_c+106]
    v_accvgpr_read_b32 v[v_c+11], a[a_c+107]
    v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8]
    v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9]
    v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10]
    v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11]
    ds_write_b16 v[v_co_sst], v[v_c+8] offset:8448 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2  x  i_nr:1, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+9] offset:8960 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2  x  i_nr:1, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+10] offset:9472 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2  x  i_nr:1, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+11] offset:9984 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2  x  i_nr:1, i_ns:0, i_nw:0
    v_accvgpr_read_b32 v[v_c+12], a[a_c+120]
    v_accvgpr_read_b32 v[v_c+13], a[a_c+121]
    v_accvgpr_read_b32 v[v_c+14], a[a_c+122]
    v_accvgpr_read_b32 v[v_c+15], a[a_c+123]
    v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12]
    v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13]
    v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14]
    v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15]
    ds_write_b16 v[v_co_sst], v[v_c+12] offset:8512 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2  x  i_nr:1, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+13] offset:9024 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2  x  i_nr:1, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+14] offset:9536 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2  x  i_nr:1, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+15] offset:10048 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2  x  i_nr:1, i_ns:1, i_nw:0
    v_accvgpr_read_b32 v[v_c+16], a[a_c+44]
    v_accvgpr_read_b32 v[v_c+17], a[a_c+45]
    v_accvgpr_read_b32 v[v_c+18], a[a_c+46]
    v_accvgpr_read_b32 v[v_c+19], a[a_c+47]
    v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16]
    v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17]
    v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18]
    v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19]
    ds_write_b16 v[v_co_sst], v[v_c+16] offset:12288 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3  x  i_nr:0, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+17] offset:12800 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3  x  i_nr:0, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+18] offset:13312 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3  x  i_nr:0, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+19] offset:13824 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3  x  i_nr:0, i_ns:0, i_nw:0
    v_accvgpr_read_b32 v[v_c+20], a[a_c+60]
    v_accvgpr_read_b32 v[v_c+21], a[a_c+61]
    v_accvgpr_read_b32 v[v_c+22], a[a_c+62]
    v_accvgpr_read_b32 v[v_c+23], a[a_c+63]
    v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20]
    v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21]
    v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22]
    v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23]
    ds_write_b16 v[v_co_sst], v[v_c+20] offset:12352 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3  x  i_nr:0, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+21] offset:12864 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3  x  i_nr:0, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+22] offset:13376 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3  x  i_nr:0, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+23] offset:13888 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3  x  i_nr:0, i_ns:1, i_nw:0
    v_accvgpr_read_b32 v[v_c+24], a[a_c+108]
    v_accvgpr_read_b32 v[v_c+25], a[a_c+109]
    v_accvgpr_read_b32 v[v_c+26], a[a_c+110]
    v_accvgpr_read_b32 v[v_c+27], a[a_c+111]
    v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24]
    v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25]
    v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26]
    v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27]
    ds_write_b16 v[v_co_sst], v[v_c+24] offset:12544 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3  x  i_nr:1, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+25] offset:13056 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3  x  i_nr:1, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+26] offset:13568 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3  x  i_nr:1, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+27] offset:14080 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3  x  i_nr:1, i_ns:0, i_nw:0
    v_accvgpr_read_b32 v[v_c+28], a[a_c+124]
    v_accvgpr_read_b32 v[v_c+29], a[a_c+125]
    v_accvgpr_read_b32 v[v_c+30], a[a_c+126]
    v_accvgpr_read_b32 v[v_c+31], a[a_c+127]
    v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28]
    v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29]
    v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30]
    v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31]
    ds_write_b16 v[v_co_sst], v[v_c+28] offset:12608 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3  x  i_nr:1, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+29] offset:13120 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3  x  i_nr:1, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+30] offset:13632 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3  x  i_nr:1, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+31] offset:14144 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3  x  i_nr:1, i_ns:1, i_nw:0
    v_add_u32 v[v_tmp], 32, v[v_in_inb]
    .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1
    .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1
    v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift]
    v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift]
    v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi]
    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1]
    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index]
    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in]
    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os]
    v_cmp_gt_u32 vcc, s[s_n], v[v_in_in]
    v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c] vcc
    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi]
    v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1] vcc
    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi]
    v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag] vcc
    s_waitcnt lgkmcnt(0)
    s_barrier
    ;   load from lds, i_ssgroup:0, num_sld_per_ssgroup:4
    ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:0
    ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096
    ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192
    ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288
    ;   store to global, m index start from 32, m0:0, m1:32
    s_waitcnt lgkmcnt(3)
    v_cmpx_le_u32 vcc, 1, v[v_in_flag]
    buffer_store_dwordx4_m v[v_c:v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mov_b64 exec, -1
    v_add_u32 v[v_tmp], 40, v[v_in_inb]
    .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1
    .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1
    v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift]
    v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift]
    v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi]
    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1]
    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index]
    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in]
    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os]
    v_cmp_gt_u32 vcc, s[s_n], v[v_in_in]
    v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c] vcc
    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi]
    v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1] vcc
    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi]
    v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag] vcc
    s_waitcnt lgkmcnt(2)
    v_cmpx_le_u32 vcc, 1, v[v_in_flag]
    buffer_store_dwordx4_m v[v_c+4:v_c+4+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mov_b64 exec, -1
    v_add_u32 v[v_tmp], 48, v[v_in_inb]
    .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1
    .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1
    v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift]
    v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift]
    v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi]
    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1]
    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index]
    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in]
    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os]
    v_cmp_gt_u32 vcc, s[s_n], v[v_in_in]
    v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c] vcc
    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi]
    v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1] vcc
    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi]
    v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag] vcc
    s_waitcnt lgkmcnt(1)
    v_cmpx_le_u32 vcc, 1, v[v_in_flag]
    buffer_store_dwordx4_m v[v_c+8:v_c+8+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mov_b64 exec, -1
    v_add_u32 v[v_tmp], 56, v[v_in_inb]
    .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1
    .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1
    v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift]
    v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift]
    v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi]
    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1]
    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index]
    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in]
    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os]
    v_cmp_gt_u32 vcc, s[s_n], v[v_in_in]
    v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c] vcc
    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi]
    v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1] vcc
    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi]
    v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag] vcc
    s_waitcnt lgkmcnt(0)
    v_cmpx_le_u32 vcc, 1, v[v_in_flag]
    buffer_store_dwordx4_m v[v_c+12:v_c+12+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mov_b64 exec, -1
    v_add_u32 v[v_tmp], 96, v[v_in_inb]
    .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1
    .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1
    v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift]
    v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift]
    v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi]
    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1]
    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index]
    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in]
    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os]
    v_cmp_gt_u32 vcc, s[s_n], v[v_in_in]
    v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c] vcc
    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi]
    v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1] vcc
    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi]
    v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag] vcc
    ;   load from lds, i_ssgroup:1, num_sld_per_ssgroup:4
    ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384
    ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480
    ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576
    ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672
    ;   store to global, m index start from 32, m0:0, m1:32
    s_waitcnt lgkmcnt(3)
    v_cmpx_le_u32 vcc, 1, v[v_in_flag]
    buffer_store_dwordx4_m v[v_c:v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mov_b64 exec, -1
    v_add_u32 v[v_tmp], 104, v[v_in_inb]
    .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1
    .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1
    v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift]
    v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift]
    v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi]
    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1]
    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index]
    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in]
    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os]
    v_cmp_gt_u32 vcc, s[s_n], v[v_in_in]
    v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c] vcc
    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi]
    v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1] vcc
    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi]
    v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag] vcc
    s_waitcnt lgkmcnt(2)
    v_cmpx_le_u32 vcc, 1, v[v_in_flag]
    buffer_store_dwordx4_m v[v_c+4:v_c+4+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mov_b64 exec, -1
    v_add_u32 v[v_tmp], 112, v[v_in_inb]
    .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1
    .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1
    v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift]
    v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift]
    v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi]
    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1]
    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index]
    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in]
    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os]
    v_cmp_gt_u32 vcc, s[s_n], v[v_in_in]
    v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c] vcc
    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi]
    v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1] vcc
    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi]
    v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag] vcc
    s_waitcnt lgkmcnt(1)
    v_cmpx_le_u32 vcc, 1, v[v_in_flag]
    buffer_store_dwordx4_m v[v_c+8:v_c+8+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mov_b64 exec, -1
    v_add_u32 v[v_tmp], 120, v[v_in_inb]
    .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1
    .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1
    v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift]
    v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift]
    v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi]
    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1]
    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index]
    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in]
    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os]
    v_cmp_gt_u32 vcc, s[s_n], v[v_in_in]
    v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c] vcc
    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi]
    v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1] vcc
    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi]
    v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag] vcc
    s_waitcnt lgkmcnt(0)
    v_cmpx_le_u32 vcc, 1, v[v_in_flag]
    buffer_store_dwordx4_m v[v_c+12:v_c+12+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mov_b64 exec, -1
    ; start group 2, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 128
    s_barrier
    v_accvgpr_read_b32 v[v_c], a[a_c+128]
    v_accvgpr_read_b32 v[v_c+1], a[a_c+129]
    v_accvgpr_read_b32 v[v_c+2], a[a_c+130]
    v_accvgpr_read_b32 v[v_c+3], a[a_c+131]
    v_cvt_f16_f32_e32 v[v_c], v[v_c]
    v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1]
    v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2]
    v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3]
    ds_write_b16 v[v_co_sst], v[v_c]  ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:0, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+1] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:0, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+2] offset:1024 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:0, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+3] offset:1536 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:0, i_ns:0, i_nw:0
    v_accvgpr_read_b32 v[v_c+4], a[a_c+144]
    v_accvgpr_read_b32 v[v_c+5], a[a_c+145]
    v_accvgpr_read_b32 v[v_c+6], a[a_c+146]
    v_accvgpr_read_b32 v[v_c+7], a[a_c+147]
    v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4]
    v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5]
    v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6]
    v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7]
    ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:0, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+5] offset:576 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:0, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+6] offset:1088 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:0, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+7] offset:1600 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:0, i_ns:1, i_nw:0
    v_accvgpr_read_b32 v[v_c+8], a[a_c+192]
    v_accvgpr_read_b32 v[v_c+9], a[a_c+193]
    v_accvgpr_read_b32 v[v_c+10], a[a_c+194]
    v_accvgpr_read_b32 v[v_c+11], a[a_c+195]
    v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8]
    v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9]
    v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10]
    v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11]
    ds_write_b16 v[v_co_sst], v[v_c+8] offset:256 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:1, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+9] offset:768 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:1, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+10] offset:1280 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:1, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+11] offset:1792 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:1, i_ns:0, i_nw:0
    v_accvgpr_read_b32 v[v_c+12], a[a_c+208]
    v_accvgpr_read_b32 v[v_c+13], a[a_c+209]
    v_accvgpr_read_b32 v[v_c+14], a[a_c+210]
    v_accvgpr_read_b32 v[v_c+15], a[a_c+211]
    v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12]
    v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13]
    v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14]
    v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15]
    ds_write_b16 v[v_co_sst], v[v_c+12] offset:320 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:1, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+13] offset:832 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:1, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+14] offset:1344 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:1, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+15] offset:1856 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:1, i_ns:1, i_nw:0
    v_accvgpr_read_b32 v[v_c+16], a[a_c+132]
    v_accvgpr_read_b32 v[v_c+17], a[a_c+133]
    v_accvgpr_read_b32 v[v_c+18], a[a_c+134]
    v_accvgpr_read_b32 v[v_c+19], a[a_c+135]
    v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16]
    v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17]
    v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18]
    v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19]
    ds_write_b16 v[v_co_sst], v[v_c+16] offset:4096 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:0, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+17] offset:4608 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:0, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+18] offset:5120 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:0, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+19] offset:5632 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:0, i_ns:0, i_nw:0
    v_accvgpr_read_b32 v[v_c+20], a[a_c+148]
    v_accvgpr_read_b32 v[v_c+21], a[a_c+149]
    v_accvgpr_read_b32 v[v_c+22], a[a_c+150]
    v_accvgpr_read_b32 v[v_c+23], a[a_c+151]
    v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20]
    v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21]
    v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22]
    v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23]
    ds_write_b16 v[v_co_sst], v[v_c+20] offset:4160 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:0, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+21] offset:4672 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:0, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+22] offset:5184 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:0, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+23] offset:5696 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:0, i_ns:1, i_nw:0
    v_accvgpr_read_b32 v[v_c+24], a[a_c+196]
    v_accvgpr_read_b32 v[v_c+25], a[a_c+197]
    v_accvgpr_read_b32 v[v_c+26], a[a_c+198]
    v_accvgpr_read_b32 v[v_c+27], a[a_c+199]
    v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24]
    v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25]
    v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26]
    v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27]
    ds_write_b16 v[v_co_sst], v[v_c+24] offset:4352 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:1, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+25] offset:4864 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:1, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+26] offset:5376 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:1, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+27] offset:5888 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:1, i_ns:0, i_nw:0
    v_accvgpr_read_b32 v[v_c+28], a[a_c+212]
    v_accvgpr_read_b32 v[v_c+29], a[a_c+213]
    v_accvgpr_read_b32 v[v_c+30], a[a_c+214]
    v_accvgpr_read_b32 v[v_c+31], a[a_c+215]
    v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28]
    v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29]
    v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30]
    v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31]
    ds_write_b16 v[v_co_sst], v[v_c+28] offset:4416 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:1, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+29] offset:4928 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:1, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+30] offset:5440 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:1, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+31] offset:5952 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:1, i_ns:1, i_nw:0
    v_accvgpr_read_b32 v[v_c], a[a_c+136]
    v_accvgpr_read_b32 v[v_c+1], a[a_c+137]
    v_accvgpr_read_b32 v[v_c+2], a[a_c+138]
    v_accvgpr_read_b32 v[v_c+3], a[a_c+139]
    v_cvt_f16_f32_e32 v[v_c], v[v_c]
    v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1]
    v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2]
    v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3]
    ds_write_b16 v[v_co_sst], v[v_c] offset:8192 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2  x  i_nr:0, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+1] offset:8704 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2  x  i_nr:0, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+2] offset:9216 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2  x  i_nr:0, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+3] offset:9728 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2  x  i_nr:0, i_ns:0, i_nw:0
    v_accvgpr_read_b32 v[v_c+4], a[a_c+152]
    v_accvgpr_read_b32 v[v_c+5], a[a_c+153]
    v_accvgpr_read_b32 v[v_c+6], a[a_c+154]
    v_accvgpr_read_b32 v[v_c+7], a[a_c+155]
    v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4]
    v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5]
    v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6]
    v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7]
    ds_write_b16 v[v_co_sst], v[v_c+4] offset:8256 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2  x  i_nr:0, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+5] offset:8768 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2  x  i_nr:0, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+6] offset:9280 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2  x  i_nr:0, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+7] offset:9792 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2  x  i_nr:0, i_ns:1, i_nw:0
    v_accvgpr_read_b32 v[v_c+8], a[a_c+200]
    v_accvgpr_read_b32 v[v_c+9], a[a_c+201]
    v_accvgpr_read_b32 v[v_c+10], a[a_c+202]
    v_accvgpr_read_b32 v[v_c+11], a[a_c+203]
    v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8]
    v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9]
    v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10]
    v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11]
    ds_write_b16 v[v_co_sst], v[v_c+8] offset:8448 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2  x  i_nr:1, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+9] offset:8960 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2  x  i_nr:1, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+10] offset:9472 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2  x  i_nr:1, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+11] offset:9984 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2  x  i_nr:1, i_ns:0, i_nw:0
    v_accvgpr_read_b32 v[v_c+12], a[a_c+216]
    v_accvgpr_read_b32 v[v_c+13], a[a_c+217]
    v_accvgpr_read_b32 v[v_c+14], a[a_c+218]
    v_accvgpr_read_b32 v[v_c+15], a[a_c+219]
    v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12]
    v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13]
    v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14]
    v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15]
    ds_write_b16 v[v_co_sst], v[v_c+12] offset:8512 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2  x  i_nr:1, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+13] offset:9024 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2  x  i_nr:1, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+14] offset:9536 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2  x  i_nr:1, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+15] offset:10048 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2  x  i_nr:1, i_ns:1, i_nw:0
    v_accvgpr_read_b32 v[v_c+16], a[a_c+140]
    v_accvgpr_read_b32 v[v_c+17], a[a_c+141]
    v_accvgpr_read_b32 v[v_c+18], a[a_c+142]
    v_accvgpr_read_b32 v[v_c+19], a[a_c+143]
    v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16]
    v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17]
    v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18]
    v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19]
    ds_write_b16 v[v_co_sst], v[v_c+16] offset:12288 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3  x  i_nr:0, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+17] offset:12800 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3  x  i_nr:0, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+18] offset:13312 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3  x  i_nr:0, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+19] offset:13824 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3  x  i_nr:0, i_ns:0, i_nw:0
    v_accvgpr_read_b32 v[v_c+20], a[a_c+156]
    v_accvgpr_read_b32 v[v_c+21], a[a_c+157]
    v_accvgpr_read_b32 v[v_c+22], a[a_c+158]
    v_accvgpr_read_b32 v[v_c+23], a[a_c+159]
    v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20]
    v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21]
    v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22]
    v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23]
    ds_write_b16 v[v_co_sst], v[v_c+20] offset:12352 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3  x  i_nr:0, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+21] offset:12864 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3  x  i_nr:0, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+22] offset:13376 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3  x  i_nr:0, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+23] offset:13888 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3  x  i_nr:0, i_ns:1, i_nw:0
    v_accvgpr_read_b32 v[v_c+24], a[a_c+204]
    v_accvgpr_read_b32 v[v_c+25], a[a_c+205]
    v_accvgpr_read_b32 v[v_c+26], a[a_c+206]
    v_accvgpr_read_b32 v[v_c+27], a[a_c+207]
    v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24]
    v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25]
    v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26]
    v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27]
    ds_write_b16 v[v_co_sst], v[v_c+24] offset:12544 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3  x  i_nr:1, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+25] offset:13056 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3  x  i_nr:1, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+26] offset:13568 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3  x  i_nr:1, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+27] offset:14080 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3  x  i_nr:1, i_ns:0, i_nw:0
    v_accvgpr_read_b32 v[v_c+28], a[a_c+220]
    v_accvgpr_read_b32 v[v_c+29], a[a_c+221]
    v_accvgpr_read_b32 v[v_c+30], a[a_c+222]
    v_accvgpr_read_b32 v[v_c+31], a[a_c+223]
    v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28]
    v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29]
    v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30]
    v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31]
    ds_write_b16 v[v_co_sst], v[v_c+28] offset:12608 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3  x  i_nr:1, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+29] offset:13120 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3  x  i_nr:1, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+30] offset:13632 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3  x  i_nr:1, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+31] offset:14144 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3  x  i_nr:1, i_ns:1, i_nw:0
    v_add_u32 v[v_tmp], 128, v[v_in_inb]
    .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1
    .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1
    v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift]
    v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift]
    v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi]
    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1]
    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index]
    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in]
    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os]
    v_cmp_gt_u32 vcc, s[s_n], v[v_in_in]
    v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c] vcc
    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi]
    v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1] vcc
    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi]
    v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag] vcc
    s_waitcnt lgkmcnt(0)
    s_barrier
    ;   load from lds, i_ssgroup:0, num_sld_per_ssgroup:4
    ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:0
    ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096
    ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192
    ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288
    ;   store to global, m index start from 128, m0:2, m1:0
    s_waitcnt lgkmcnt(3)
    v_cmpx_le_u32 vcc, 1, v[v_in_flag]
    buffer_store_dwordx4_m v[v_c:v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mov_b64 exec, -1
    v_add_u32 v[v_tmp], 136, v[v_in_inb]
    .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1
    .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1
    v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift]
    v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift]
    v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi]
    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1]
    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index]
    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in]
    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os]
    v_cmp_gt_u32 vcc, s[s_n], v[v_in_in]
    v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c] vcc
    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi]
    v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1] vcc
    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi]
    v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag] vcc
    s_waitcnt lgkmcnt(2)
    v_cmpx_le_u32 vcc, 1, v[v_in_flag]
    buffer_store_dwordx4_m v[v_c+4:v_c+4+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mov_b64 exec, -1
    v_add_u32 v[v_tmp], 144, v[v_in_inb]
    .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1
    .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1
    v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift]
    v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift]
    v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi]
    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1]
    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index]
    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in]
    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os]
    v_cmp_gt_u32 vcc, s[s_n], v[v_in_in]
    v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c] vcc
    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi]
    v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1] vcc
    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi]
    v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag] vcc
    s_waitcnt lgkmcnt(1)
    v_cmpx_le_u32 vcc, 1, v[v_in_flag]
    buffer_store_dwordx4_m v[v_c+8:v_c+8+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mov_b64 exec, -1
    v_add_u32 v[v_tmp], 152, v[v_in_inb]
    .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1
    .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1
    v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift]
    v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift]
    v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi]
    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1]
    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index]
    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in]
    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os]
    v_cmp_gt_u32 vcc, s[s_n], v[v_in_in]
    v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c] vcc
    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi]
    v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1] vcc
    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi]
    v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag] vcc
    s_waitcnt lgkmcnt(0)
    v_cmpx_le_u32 vcc, 1, v[v_in_flag]
    buffer_store_dwordx4_m v[v_c+12:v_c+12+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mov_b64 exec, -1
    v_add_u32 v[v_tmp], 192, v[v_in_inb]
    .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1
    .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1
    v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift]
    v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift]
    v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi]
    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1]
    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index]
    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in]
    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os]
    v_cmp_gt_u32 vcc, s[s_n], v[v_in_in]
    v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c] vcc
    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi]
    v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1] vcc
    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi]
    v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag] vcc
    ;   load from lds, i_ssgroup:1, num_sld_per_ssgroup:4
    ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384
    ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480
    ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576
    ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672
    ;   store to global, m index start from 128, m0:2, m1:0
    s_waitcnt lgkmcnt(3)
    v_cmpx_le_u32 vcc, 1, v[v_in_flag]
    buffer_store_dwordx4_m v[v_c:v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mov_b64 exec, -1
    v_add_u32 v[v_tmp], 200, v[v_in_inb]
    .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1
    .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1
    v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift]
    v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift]
    v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi]
    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1]
    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index]
    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in]
    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os]
    v_cmp_gt_u32 vcc, s[s_n], v[v_in_in]
    v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c] vcc
    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi]
    v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1] vcc
    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi]
    v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag] vcc
    s_waitcnt lgkmcnt(2)
    v_cmpx_le_u32 vcc, 1, v[v_in_flag]
    buffer_store_dwordx4_m v[v_c+4:v_c+4+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mov_b64 exec, -1
    v_add_u32 v[v_tmp], 208, v[v_in_inb]
    .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1
    .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1
    v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift]
    v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift]
    v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi]
    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1]
    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index]
    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in]
    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os]
    v_cmp_gt_u32 vcc, s[s_n], v[v_in_in]
    v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c] vcc
    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi]
    v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1] vcc
    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi]
    v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag] vcc
    s_waitcnt lgkmcnt(1)
    v_cmpx_le_u32 vcc, 1, v[v_in_flag]
    buffer_store_dwordx4_m v[v_c+8:v_c+8+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mov_b64 exec, -1
    v_add_u32 v[v_tmp], 216, v[v_in_inb]
    .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1
    .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1
    v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift]
    v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift]
    v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi]
    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1]
    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index]
    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in]
    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os]
    v_cmp_gt_u32 vcc, s[s_n], v[v_in_in]
    v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c] vcc
    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi]
    v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1] vcc
    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi]
    v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag] vcc
    s_waitcnt lgkmcnt(0)
    v_cmpx_le_u32 vcc, 1, v[v_in_flag]
    buffer_store_dwordx4_m v[v_c+12:v_c+12+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mov_b64 exec, -1
    ; start group 3, i_g_mr:1, i_g_ms:1, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 160
    s_barrier
    v_accvgpr_read_b32 v[v_c], a[a_c+160]
    v_accvgpr_read_b32 v[v_c+1], a[a_c+161]
    v_accvgpr_read_b32 v[v_c+2], a[a_c+162]
    v_accvgpr_read_b32 v[v_c+3], a[a_c+163]
    v_cvt_f16_f32_e32 v[v_c], v[v_c]
    v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1]
    v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2]
    v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3]
    ds_write_b16 v[v_co_sst], v[v_c]  ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:0, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+1] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:0, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+2] offset:1024 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:0, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+3] offset:1536 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:0, i_ns:0, i_nw:0
    v_accvgpr_read_b32 v[v_c+4], a[a_c+176]
    v_accvgpr_read_b32 v[v_c+5], a[a_c+177]
    v_accvgpr_read_b32 v[v_c+6], a[a_c+178]
    v_accvgpr_read_b32 v[v_c+7], a[a_c+179]
    v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4]
    v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5]
    v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6]
    v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7]
    ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:0, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+5] offset:576 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:0, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+6] offset:1088 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:0, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+7] offset:1600 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:0, i_ns:1, i_nw:0
    v_accvgpr_read_b32 v[v_c+8], a[a_c+224]
    v_accvgpr_read_b32 v[v_c+9], a[a_c+225]
    v_accvgpr_read_b32 v[v_c+10], a[a_c+226]
    v_accvgpr_read_b32 v[v_c+11], a[a_c+227]
    v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8]
    v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9]
    v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10]
    v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11]
    ds_write_b16 v[v_co_sst], v[v_c+8] offset:256 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:1, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+9] offset:768 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:1, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+10] offset:1280 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:1, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+11] offset:1792 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:1, i_ns:0, i_nw:0
    v_accvgpr_read_b32 v[v_c+12], a[a_c+240]
    v_accvgpr_read_b32 v[v_c+13], a[a_c+241]
    v_accvgpr_read_b32 v[v_c+14], a[a_c+242]
    v_accvgpr_read_b32 v[v_c+15], a[a_c+243]
    v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12]
    v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13]
    v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14]
    v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15]
    ds_write_b16 v[v_co_sst], v[v_c+12] offset:320 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:1, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+13] offset:832 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:1, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+14] offset:1344 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:1, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+15] offset:1856 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:1, i_ns:1, i_nw:0
    v_accvgpr_read_b32 v[v_c+16], a[a_c+164]
    v_accvgpr_read_b32 v[v_c+17], a[a_c+165]
    v_accvgpr_read_b32 v[v_c+18], a[a_c+166]
    v_accvgpr_read_b32 v[v_c+19], a[a_c+167]
    v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16]
    v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17]
    v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18]
    v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19]
    ds_write_b16 v[v_co_sst], v[v_c+16] offset:4096 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:0, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+17] offset:4608 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:0, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+18] offset:5120 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:0, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+19] offset:5632 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:0, i_ns:0, i_nw:0
    v_accvgpr_read_b32 v[v_c+20], a[a_c+180]
    v_accvgpr_read_b32 v[v_c+21], a[a_c+181]
    v_accvgpr_read_b32 v[v_c+22], a[a_c+182]
    v_accvgpr_read_b32 v[v_c+23], a[a_c+183]
    v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20]
    v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21]
    v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22]
    v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23]
    ds_write_b16 v[v_co_sst], v[v_c+20] offset:4160 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:0, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+21] offset:4672 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:0, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+22] offset:5184 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:0, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+23] offset:5696 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:0, i_ns:1, i_nw:0
    v_accvgpr_read_b32 v[v_c+24], a[a_c+228]
    v_accvgpr_read_b32 v[v_c+25], a[a_c+229]
    v_accvgpr_read_b32 v[v_c+26], a[a_c+230]
    v_accvgpr_read_b32 v[v_c+27], a[a_c+231]
    v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24]
    v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25]
    v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26]
    v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27]
    ds_write_b16 v[v_co_sst], v[v_c+24] offset:4352 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:1, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+25] offset:4864 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:1, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+26] offset:5376 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:1, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+27] offset:5888 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:1, i_ns:0, i_nw:0
    v_accvgpr_read_b32 v[v_c+28], a[a_c+244]
    v_accvgpr_read_b32 v[v_c+29], a[a_c+245]
    v_accvgpr_read_b32 v[v_c+30], a[a_c+246]
    v_accvgpr_read_b32 v[v_c+31], a[a_c+247]
    v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28]
    v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29]
    v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30]
    v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31]
    ds_write_b16 v[v_co_sst], v[v_c+28] offset:4416 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:1, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+29] offset:4928 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:1, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+30] offset:5440 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:1, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+31] offset:5952 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:1, i_ns:1, i_nw:0
    v_accvgpr_read_b32 v[v_c], a[a_c+168]
    v_accvgpr_read_b32 v[v_c+1], a[a_c+169]
    v_accvgpr_read_b32 v[v_c+2], a[a_c+170]
    v_accvgpr_read_b32 v[v_c+3], a[a_c+171]
    v_cvt_f16_f32_e32 v[v_c], v[v_c]
    v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1]
    v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2]
    v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3]
    ds_write_b16 v[v_co_sst], v[v_c] offset:8192 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2  x  i_nr:0, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+1] offset:8704 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2  x  i_nr:0, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+2] offset:9216 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2  x  i_nr:0, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+3] offset:9728 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2  x  i_nr:0, i_ns:0, i_nw:0
    v_accvgpr_read_b32 v[v_c+4], a[a_c+184]
    v_accvgpr_read_b32 v[v_c+5], a[a_c+185]
    v_accvgpr_read_b32 v[v_c+6], a[a_c+186]
    v_accvgpr_read_b32 v[v_c+7], a[a_c+187]
    v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4]
    v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5]
    v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6]
    v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7]
    ds_write_b16 v[v_co_sst], v[v_c+4] offset:8256 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2  x  i_nr:0, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+5] offset:8768 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2  x  i_nr:0, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+6] offset:9280 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2  x  i_nr:0, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+7] offset:9792 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2  x  i_nr:0, i_ns:1, i_nw:0
    v_accvgpr_read_b32 v[v_c+8], a[a_c+232]
    v_accvgpr_read_b32 v[v_c+9], a[a_c+233]
    v_accvgpr_read_b32 v[v_c+10], a[a_c+234]
    v_accvgpr_read_b32 v[v_c+11], a[a_c+235]
    v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8]
    v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9]
    v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10]
    v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11]
    ds_write_b16 v[v_co_sst], v[v_c+8] offset:8448 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2  x  i_nr:1, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+9] offset:8960 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2  x  i_nr:1, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+10] offset:9472 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2  x  i_nr:1, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+11] offset:9984 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2  x  i_nr:1, i_ns:0, i_nw:0
    v_accvgpr_read_b32 v[v_c+12], a[a_c+248]
    v_accvgpr_read_b32 v[v_c+13], a[a_c+249]
    v_accvgpr_read_b32 v[v_c+14], a[a_c+250]
    v_accvgpr_read_b32 v[v_c+15], a[a_c+251]
    v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12]
    v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13]
    v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14]
    v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15]
    ds_write_b16 v[v_co_sst], v[v_c+12] offset:8512 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2  x  i_nr:1, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+13] offset:9024 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2  x  i_nr:1, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+14] offset:9536 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2  x  i_nr:1, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+15] offset:10048 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2  x  i_nr:1, i_ns:1, i_nw:0
    v_accvgpr_read_b32 v[v_c+16], a[a_c+172]
    v_accvgpr_read_b32 v[v_c+17], a[a_c+173]
    v_accvgpr_read_b32 v[v_c+18], a[a_c+174]
    v_accvgpr_read_b32 v[v_c+19], a[a_c+175]
    v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16]
    v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17]
    v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18]
    v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19]
    ds_write_b16 v[v_co_sst], v[v_c+16] offset:12288 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3  x  i_nr:0, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+17] offset:12800 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3  x  i_nr:0, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+18] offset:13312 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3  x  i_nr:0, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+19] offset:13824 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3  x  i_nr:0, i_ns:0, i_nw:0
    v_accvgpr_read_b32 v[v_c+20], a[a_c+188]
    v_accvgpr_read_b32 v[v_c+21], a[a_c+189]
    v_accvgpr_read_b32 v[v_c+22], a[a_c+190]
    v_accvgpr_read_b32 v[v_c+23], a[a_c+191]
    v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20]
    v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21]
    v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22]
    v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23]
    ds_write_b16 v[v_co_sst], v[v_c+20] offset:12352 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3  x  i_nr:0, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+21] offset:12864 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3  x  i_nr:0, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+22] offset:13376 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3  x  i_nr:0, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+23] offset:13888 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3  x  i_nr:0, i_ns:1, i_nw:0
    v_accvgpr_read_b32 v[v_c+24], a[a_c+236]
    v_accvgpr_read_b32 v[v_c+25], a[a_c+237]
    v_accvgpr_read_b32 v[v_c+26], a[a_c+238]
    v_accvgpr_read_b32 v[v_c+27], a[a_c+239]
    v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24]
    v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25]
    v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26]
    v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27]
    ds_write_b16 v[v_co_sst], v[v_c+24] offset:12544 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3  x  i_nr:1, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+25] offset:13056 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3  x  i_nr:1, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+26] offset:13568 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3  x  i_nr:1, i_ns:0, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+27] offset:14080 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3  x  i_nr:1, i_ns:0, i_nw:0
    v_accvgpr_read_b32 v[v_c+28], a[a_c+252]
    v_accvgpr_read_b32 v[v_c+29], a[a_c+253]
    v_accvgpr_read_b32 v[v_c+30], a[a_c+254]
    v_accvgpr_read_b32 v[v_c+31], a[a_c+255]
    v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28]
    v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29]
    v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30]
    v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31]
    ds_write_b16 v[v_co_sst], v[v_c+28] offset:12608 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3  x  i_nr:1, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+29] offset:13120 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3  x  i_nr:1, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+30] offset:13632 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3  x  i_nr:1, i_ns:1, i_nw:0
    ds_write_b16 v[v_co_sst], v[v_c+31] offset:14144 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3  x  i_nr:1, i_ns:1, i_nw:0
    v_add_u32 v[v_tmp], 160, v[v_in_inb]
    .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1
    .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1
    v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift]
    v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift]
    v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi]
    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1]
    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index]
    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in]
    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os]
    v_cmp_gt_u32 vcc, s[s_n], v[v_in_in]
    v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c] vcc
    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi]
    v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1] vcc
    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi]
    v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag] vcc
    s_waitcnt lgkmcnt(0)
    s_barrier
    ;   load from lds, i_ssgroup:0, num_sld_per_ssgroup:4
    ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:0
    ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096
    ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192
    ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288
    ;   store to global, m index start from 160, m0:2, m1:32
    s_waitcnt lgkmcnt(3)
    v_cmpx_le_u32 vcc, 1, v[v_in_flag]
    buffer_store_dwordx4_m v[v_c:v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mov_b64 exec, -1
    v_add_u32 v[v_tmp], 168, v[v_in_inb]
    .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1
    .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1
    v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift]
    v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift]
    v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi]
    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1]
    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index]
    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in]
    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os]
    v_cmp_gt_u32 vcc, s[s_n], v[v_in_in]
    v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c] vcc
    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi]
    v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1] vcc
    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi]
    v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag] vcc
    s_waitcnt lgkmcnt(2)
    v_cmpx_le_u32 vcc, 1, v[v_in_flag]
    buffer_store_dwordx4_m v[v_c+4:v_c+4+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mov_b64 exec, -1
    v_add_u32 v[v_tmp], 176, v[v_in_inb]
    .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1
    .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1
    v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift]
    v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift]
    v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi]
    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1]
    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index]
    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in]
    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os]
    v_cmp_gt_u32 vcc, s[s_n], v[v_in_in]
    v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c] vcc
    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi]
    v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1] vcc
    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi]
    v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag] vcc
    s_waitcnt lgkmcnt(1)
    v_cmpx_le_u32 vcc, 1, v[v_in_flag]
    buffer_store_dwordx4_m v[v_c+8:v_c+8+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mov_b64 exec, -1
    v_add_u32 v[v_tmp], 184, v[v_in_inb]
    .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1
    .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1
    v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift]
    v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift]
    v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi]
    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1]
    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index]
    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in]
    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os]
    v_cmp_gt_u32 vcc, s[s_n], v[v_in_in]
    v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c] vcc
    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi]
    v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1] vcc
    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi]
    v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag] vcc
    s_waitcnt lgkmcnt(0)
    v_cmpx_le_u32 vcc, 1, v[v_in_flag]
    buffer_store_dwordx4_m v[v_c+12:v_c+12+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mov_b64 exec, -1
    v_add_u32 v[v_tmp], 224, v[v_in_inb]
    .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1
    .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1
    v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift]
    v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift]
    v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi]
    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1]
    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index]
    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in]
    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os]
    v_cmp_gt_u32 vcc, s[s_n], v[v_in_in]
    v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c] vcc
    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi]
    v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1] vcc
    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi]
    v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag] vcc
    ;   load from lds, i_ssgroup:1, num_sld_per_ssgroup:4
    ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384
    ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480
    ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576
    ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672
    ;   store to global, m index start from 160, m0:2, m1:32
    s_waitcnt lgkmcnt(3)
    v_cmpx_le_u32 vcc, 1, v[v_in_flag]
    buffer_store_dwordx4_m v[v_c:v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mov_b64 exec, -1
    v_add_u32 v[v_tmp], 232, v[v_in_inb]
    .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1
    .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1
    v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift]
    v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift]
    v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi]
    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1]
    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index]
    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in]
    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os]
    v_cmp_gt_u32 vcc, s[s_n], v[v_in_in]
    v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c] vcc
    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi]
    v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1] vcc
    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi]
    v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag] vcc
    s_waitcnt lgkmcnt(2)
    v_cmpx_le_u32 vcc, 1, v[v_in_flag]
    buffer_store_dwordx4_m v[v_c+4:v_c+4+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mov_b64 exec, -1
    v_add_u32 v[v_tmp], 240, v[v_in_inb]
    .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1
    .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1
    v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift]
    v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift]
    v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi]
    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1]
    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index]
    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in]
    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os]
    v_cmp_gt_u32 vcc, s[s_n], v[v_in_in]
    v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c] vcc
    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi]
    v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1] vcc
    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi]
    v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag] vcc
    s_waitcnt lgkmcnt(1)
    v_cmpx_le_u32 vcc, 1, v[v_in_flag]
    buffer_store_dwordx4_m v[v_c+8:v_c+8+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mov_b64 exec, -1
    v_add_u32 v[v_tmp], 248, v[v_in_inb]
    .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1
    .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1
    v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift]
    v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift]
    v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi]
    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1]
    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index]
    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in]
    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os]
    v_cmp_gt_u32 vcc, s[s_n], v[v_in_in]
    v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c] vcc
    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi]
    v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1] vcc
    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi]
    v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag] vcc
    s_waitcnt lgkmcnt(0)
    v_cmpx_le_u32 vcc, 1, v[v_in_flag]
    buffer_store_dwordx4_m v[v_c+12:v_c+12+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mov_b64 exec, -1
L_igemm_bwd_gtcx35_nhwc_fp16_bx0_ex1_bt256x256x32_wt32x32x8_ws2x2_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_out:
    s_endpgm
.rodata
.p2align 6
.amdhsa_kernel igemm_bwd_gtcx35_nhwc_fp16_bx0_ex1_bt256x256x32_wt32x32x8_ws2x2_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh
    .amdhsa_group_segment_fixed_size 34816
    .amdhsa_user_sgpr_kernarg_segment_ptr 1
    .amdhsa_system_sgpr_workgroup_id_x 1
    .amdhsa_system_sgpr_workgroup_id_y 1
    .amdhsa_system_vgpr_workitem_id 0
    .amdhsa_next_free_vgpr 360
    .amdhsa_next_free_sgpr 88
    .amdhsa_ieee_mode 1
    .amdhsa_dx10_clamp 1
    .amdhsa_float_round_mode_32 3
    .amdhsa_float_round_mode_16_64 3
    .amdhsa_tg_split 0
    .amdhsa_accum_offset 104
.end_amdhsa_kernel

.amdgpu_metadata
---
amdhsa.version: [ 1, 0 ]
amdhsa.kernels:
  - .name: igemm_bwd_gtcx35_nhwc_fp16_bx0_ex1_bt256x256x32_wt32x32x8_ws2x2_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh
    .symbol: igemm_bwd_gtcx35_nhwc_fp16_bx0_ex1_bt256x256x32_wt32x32x8_ws2x2_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh.kd
    .sgpr_count: 94
    .vgpr_count: 360
    .kernarg_segment_align: 8
    .kernarg_segment_size: 168
    .group_segment_fixed_size: 34816
    .private_segment_fixed_size: 0
    .wavefront_size: 64
    .reqd_workgroup_size : [256, 1, 1]
    .max_flat_workgroup_size: 256
    .args:
    - { .name: p_in_     , .size: 8, .offset:   0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false}
    - { .name: p_wei_    , .size: 8, .offset:   8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true}
    - { .name: p_out_    , .size: 8, .offset:  16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true}
    - { .name: hi_       , .size: 4, .offset:  24, .value_kind: by_value, .value_type: i32}
    - { .name: wi_       , .size: 4, .offset:  28, .value_kind: by_value, .value_type: i32}
    - { .name: n_        , .size: 4, .offset:  32, .value_kind: by_value, .value_type: i32}
    - { .name: k_        , .size: 4, .offset:  36, .value_kind: by_value, .value_type: i32}
    - { .name: c_        , .size: 4, .offset:  40, .value_kind: by_value, .value_type: i32}
    - { .name: ho_       , .size: 4, .offset:  44, .value_kind: by_value, .value_type: i32}
    - { .name: wo_       , .size: 4, .offset:  48, .value_kind: by_value, .value_type: i32}
    - { .name: stride_h_ , .size: 4, .offset:  52, .value_kind: by_value, .value_type: i32}
    - { .name: stride_w_ , .size: 4, .offset:  56, .value_kind: by_value, .value_type: i32}
    - { .name: dilation_h_, .size: 4, .offset:  60, .value_kind: by_value, .value_type: i32}
    - { .name: dilation_w_, .size: 4, .offset:  64, .value_kind: by_value, .value_type: i32}
    - { .name: pad_h_    , .size: 4, .offset:  68, .value_kind: by_value, .value_type: i32}
    - { .name: pad_w_    , .size: 4, .offset:  72, .value_kind: by_value, .value_type: i32}
    - { .name: y_        , .size: 4, .offset:  76, .value_kind: by_value, .value_type: i32}
    - { .name: x_        , .size: 4, .offset:  80, .value_kind: by_value, .value_type: i32}
    - { .name: dtile_iy_ , .size: 4, .offset:  84, .value_kind: by_value, .value_type: i32}
    - { .name: dtile_ix_ , .size: 4, .offset:  88, .value_kind: by_value, .value_type: i32}
    - { .name: dtile_dy_ , .size: 4, .offset:  92, .value_kind: by_value, .value_type: i32}
    - { .name: dtile_dx_ , .size: 4, .offset:  96, .value_kind: by_value, .value_type: i32}
    - { .name: dtile_y_  , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32}
    - { .name: dtile_x_  , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32}
    - { .name: dtile_h_  , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32}
    - { .name: dtile_w_  , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32}
    - { .name: dslice_y_ , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32}
    - { .name: dslice_x_ , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32}
    - { .name: dslice_h_ , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32}
    - { .name: dslice_w_ , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32}
    - { .name: dslice_h_left_, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32}
    - { .name: dslice_w_left_, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32}
    - { .name: group_    , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32}
    - { .name: magic_0_  , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32}
    - { .name: magic_1_  , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32}
    - { .name: magic_2_  , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32}
    - { .name: magic_3_  , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32}
    - { .name: shift_pack_0_, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32}
    - { .name: ks_       , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32}
...
.end_amdgpu_metadata
