
/******************************************/
/* Begin Kernel                           */
/******************************************/
/* Target ISA, symbol binding and 2^8-byte alignment for the kernel symbol. */
.amdgcn_target "amdgcn-amd-amdhsa--gfx950"
.text
.protected Custom_Cijk_Ailk_Bljk_S_MX_B_BIAS_HA_S_SAV_NTD_SK3_UserArgs_MT256x256x32_MI16x16x1_shortname0_gfx950
.globl Custom_Cijk_Ailk_Bljk_S_MX_B_BIAS_HA_S_SAV_NTD_SK3_UserArgs_MT256x256x32_MI16x16x1_shortname0_gfx950
.p2align 8
.type Custom_Cijk_Ailk_Bljk_S_MX_B_BIAS_HA_S_SAV_NTD_SK3_UserArgs_MT256x256x32_MI16x16x1_shortname0_gfx950,@function
/* Kernel descriptor: emitted into .rodata; the trailing .text switches back to code. */
.section .rodata,#alloc
.p2align 6
.amdhsa_kernel Custom_Cijk_Ailk_Bljk_S_MX_B_BIAS_HA_S_SAV_NTD_SK3_UserArgs_MT256x256x32_MI16x16x1_shortname0_gfx950
  .amdhsa_user_sgpr_kernarg_segment_ptr 1 // kernarg pointer; the code reads it as s[0:1] (sgprKernArgAddress)
  .amdhsa_accum_offset 256 // accvgpr offset
  .amdhsa_next_free_vgpr 512 // vgprs: 256 (accum offset) + 256 AccVGPRs
  .amdhsa_next_free_sgpr 91 // sgprs
  .amdhsa_group_segment_fixed_size 132096 // lds bytes (0x20400, the same value the code writes to m0)
  .amdhsa_private_segment_fixed_size 0
  .amdhsa_system_sgpr_workgroup_id_x 1 // the code reads workgroup ids x/y/z from s13/s14/s15
  .amdhsa_system_sgpr_workgroup_id_y 1
  .amdhsa_system_sgpr_workgroup_id_z 1
  .amdhsa_system_vgpr_workitem_id 0 // the code takes its thread serial id from v0
  .amdhsa_float_denorm_mode_32 3
  .amdhsa_float_denorm_mode_16_64 3
  .amdhsa_user_sgpr_count 13 // 2 (kernarg pointer pair) + 11 (preloaded kernarg dwords)
  .amdhsa_user_sgpr_kernarg_preload_length 11 // kernarg dwords preloaded into s2..s12 (consumed at label_Preload_Offset_Start)
  .amdhsa_user_sgpr_kernarg_preload_offset 0
.end_amdhsa_kernel
.text
/* Num VGPR   =249 */
/* Num AccVGPR=256 */
/* Num SGPR   =91 */

/******************************************/
/* Optimizations and Config:              */
/******************************************/
/* ThreadTile= 32 x 8 */
/* SubGroup= 8 x 32 */
/* VectorWidthA=4 */
/* VectorWidthB=4 */
/* GlobalReadVectorWidthA=4, GlobalReadVectorWidthB=4 */
/* DirectToLdsA=True */
/* DirectToLdsB=True */
/* UseSgprForGRO=False */
.amdgpu_metadata
---
custom.config:
  InternalSupportParams:
    KernArgsVersion: 2
  ProblemType:
      OperationType: GEMM
      DataType: s
      DestDataType: s
      F32XdlMathOp: X
      HighPrecisionAccumulate: False
      TransposeA: 0
      TransposeB: 0
      UseBeta: True
      Batched: True
      UseBias: 1
      BiasDataTypeList: [0]
      UseScaleAlphaVec: 1
      Activation: true
      ActivationType: hipblaslt_all
      ActivationFuncCall: true
  MIBlock: [16, 16, 32, 1, 1, 1]
  MatrixInstruction: [16, 16, 32, 1]
  WavefrontSize: 64
  WorkGroupMapping: 16
  WorkGroupMappingXCC: 2
  WorkGroupMappingXCCGroup: -1
  StaggerU: 0
  EnableMatrixInstruction: True
  MIWaveGroup: [2, 2]
  MIWaveTile: [8, 8]
  MIInputPerThread: 32
  MIInputPerThreadA: 32
  MIInputPerThreadB: 32
  DepthU: 32
  DirectToLds: 1
  LocalReadVectorWidth: 4
  GlobalReadVectorWidthA: 4
  GlobalReadVectorWidthB: 4
  GlobalSplitU: 0
  GlobalSplitUAlgorithm: MultipleBuffer
  GlobalSplitUCoalesced: false
  GlobalSplitUWorkGroupMappingRoundRobin: false
  PrefetchGlobalRead: 2
  PrefetchLocalRead: 1
  StreamK: 3
  StreamKAtomic: 0
  StreamKXCCMapping: 0
  TransposeLDS: 1
amdhsa.version:
  - 1
  - 1
amdhsa.kernels:
  - .name: Custom_Cijk_Ailk_Bljk_S_MX_B_BIAS_HA_S_SAV_NTD_SK3_UserArgs_MT256x256x32_MI16x16x1_shortname0_gfx950
    .symbol: 'Custom_Cijk_Ailk_Bljk_S_MX_B_BIAS_HA_S_SAV_NTD_SK3_UserArgs_MT256x256x32_MI16x16x1_shortname0_gfx950.kd'
    .language:                   OpenCL C
    .language_version:
      - 2
      - 0
    .args:
      - .name:            Gemm info
        .size:            4
        .offset:          0
        .value_kind:      by_value
        .value_type:      u32
      - .name:            kernel info0
        .size:            4
        .offset:          4
        .value_kind:      by_value
        .value_type:      u32
      - .name:            kernel info1
        .size:            4
        .offset:          8
        .value_kind:      by_value
        .value_type:      u32
      - .name:            numWG
        .size:            4
        .offset:          12
        .value_kind:      by_value
        .value_type:      u32
      - .name:            SizesFree0
        .size:            4
        .offset:          16
        .value_kind:      by_value
        .value_type:      u32
      - .name:            SizesFree1
        .size:            4
        .offset:          20
        .value_kind:      by_value
        .value_type:      u32
      - .name:            SizesFree2
        .size:            4
        .offset:          24
        .value_kind:      by_value
        .value_type:      u32
      - .name:            SizesSum0
        .size:            4
        .offset:          28
        .value_kind:      by_value
        .value_type:      u32
      - .name:            D
        .size:            8
        .offset:          32
        .value_kind:      global_buffer
        .value_type:      f32
        .address_space:   generic
      - .name:            C
        .size:            8
        .offset:          40
        .value_kind:      global_buffer
        .value_type:      f32
        .address_space:   generic
      - .name:            A
        .size:            8
        .offset:          48
        .value_kind:      global_buffer
        .value_type:      f32
        .address_space:   generic
      - .name:            B
        .size:            8
        .offset:          56
        .value_kind:      global_buffer
        .value_type:      f32
        .address_space:   generic
      - .name:            AddressWS
        .size:            8
        .offset:          64
        .value_kind:      global_buffer
        .value_type:      f32
        .address_space:   generic
      - .name:            AddressFlags
        .size:            8
        .offset:          72
        .value_kind:      global_buffer
        .value_type:      f32
        .address_space:   generic
      - .name:            strideD0
        .size:            4
        .offset:          80
        .value_kind:      by_value
        .value_type:      u32
      - .name:            strideD1
        .size:            4
        .offset:          84
        .value_kind:      by_value
        .value_type:      u32
      - .name:            strideC0
        .size:            4
        .offset:          88
        .value_kind:      by_value
        .value_type:      u32
      - .name:            strideC1
        .size:            4
        .offset:          92
        .value_kind:      by_value
        .value_type:      u32
      - .name:            strideA0
        .size:            4
        .offset:          96
        .value_kind:      by_value
        .value_type:      u32
      - .name:            strideA1
        .size:            4
        .offset:          100
        .value_kind:      by_value
        .value_type:      u32
      - .name:            strideB0
        .size:            4
        .offset:          104
        .value_kind:      by_value
        .value_type:      u32
      - .name:            strideB1
        .size:            4
        .offset:          108
        .value_kind:      by_value
        .value_type:      u32
      - .name:            alpha
        .size:            4
        .offset:          112
        .value_kind:      by_value
        .value_type:      f32
      - .name:            beta
        .size:            4
        .offset:          116
        .value_kind:      by_value
        .value_type:      f32
      - .name:            ItersPerTile
        .size:            4
        .offset:          120
        .value_kind:      by_value
        .value_type:      u32
      - .name:            TotalIters
        .size:            4
        .offset:          124
        .value_kind:      by_value
        .value_type:      u32
      - .name:            SKItersPerWG
        .size:            4
        .offset:          128
        .value_kind:      by_value
        .value_type:      u32
      - .name:            skGridAndTiles
        .size:            4
        .offset:          132
        .value_kind:      by_value
        .value_type:      u32
      - .name:            skExtraIters
        .size:            4
        .offset:          136
        .value_kind:      by_value
        .value_type:      u32
      - .name:            AddressScaleAlphaVec
        .size:            8
        .offset:          140
        .value_kind:      global_buffer
        .value_type:      f32
        .address_space:   generic
      - .name:            bias
        .size:            8
        .offset:          148
        .value_kind:      global_buffer
        .value_type:      void
        .address_space:   generic
      - .name:            biasType
        .size:            4
        .offset:          156
        .value_kind:      by_value
        .value_type:      u32
      - .name:            StrideBias
        .size:            4
        .offset:          160
        .value_kind:      by_value
        .value_type:      u32
      - .name:            activationAlpha
        .size:            4
        .offset:          164
        .value_kind:      by_value
        .value_type:      f32
      - .name:            activationBeta
        .size:            4
        .offset:          168
        .value_kind:      by_value
        .value_type:      f32
      - .name:            activationType
        .size:            4
        .offset:          172
        .value_kind:      by_value
        .value_type:      u32
    .group_segment_fixed_size:   132096
    .kernarg_segment_align:      8
    .kernarg_segment_size:       176
    .max_flat_workgroup_size:    256
    .private_segment_fixed_size: 0
    .sgpr_count:                 91
    .sgpr_spill_count:           0
    .vgpr_count:                 256
    .vgpr_spill_count:           0
    .wavefront_size:             64
...
.end_amdgpu_metadata
Custom_Cijk_Ailk_Bljk_S_MX_B_BIAS_HA_S_SAV_NTD_SK3_UserArgs_MT256x256x32_MI16x16x1_shortname0_gfx950:
label_ASM_Start:  /// Main body of the asm kernel
/* V_MAGIC_DIV (32-bit arithmetic):                                          */
/*   v[dst] = (mul_hi(dividend, magicNumber) + mul_lo(dividend, magicA))     */
/*            >> magicShift                                                  */
/* Writes v[vgprDstIdx] (result) and v[vgprDstIdx+1] (scratch).              */
.macro V_MAGIC_DIV vgprDstIdx:req, dividend:req, magicNumber:req, magicShift:req, magicA:req
    v_mul_hi_u32  v[\vgprDstIdx+1], \dividend, \magicNumber        // scratch = high 32 bits of dividend * magicNumber
    v_mul_lo_u32  v[\vgprDstIdx], \dividend, \magicA               // dst = low 32 bits of dividend * magicA
    v_add_u32     v[\vgprDstIdx], v[\vgprDstIdx], v[\vgprDstIdx+1] // dst += scratch
    v_lshrrev_b32 v[\vgprDstIdx], \magicShift, v[\vgprDstIdx]      // dst >>= magicShift
.endm

/******************************************/
/* VGPR Assignments                       */
/******************************************/
/* ValuC range: [0-0), serializedStore enabled */
/* Fixed VGPR indices, listed in ascending register order. */
.set vgprValuC,               0
.set vgprGlobalReadOffsetA,   0
.set vgprGlobalReadOffsetB,   8
.set vgprLocalReadAddrA,      16
.set vgprLocalReadAddrB,      17
.set vgprBase,                18
.set vgprLocalReadSwapAddrA,  146
.set vgprLocalReadSwapAddrB,  147
.set vgprSerial,              148

/******************************************/
/* VGPR Macro Assignments                 */
/* ValuA/B naming: Xn = PLR buffer idx,   */
/*                 In = InnerUnroll idx   */
/******************************************/
/* Operand buffers placed relative to vgprBase (A at +0, B at +64). */
.set vgprValuA_X0_I0_BASE,    vgprBase
.set vgprValuA_X0_I0,         vgprValuA_X0_I0_BASE
.set vgprValuB_X0_I0_BASE,    vgprBase+64
.set vgprValuB_X0_I0,         vgprValuB_X0_I0_BASE
/* Operand buffers at absolute indices. */
.set vgprValuA_T0_I0,         160 // 32 values
.set vgprValuB_T0_I0,         192 // 32 values
.set vgprValuA_T1_I0,         224 // 4 values


/******************************************/
/* SGPR Assignments                       */
/******************************************/
/* Fixed SGPR indices, listed in ascending register order. */

/* Kernarg pointer (64-bit pair) and per-workgroup state */
.set sgprKernArgAddress,      0
.set sgprWorkGroup0,          2
.set sgprWorkGroup1,          3
.set sgprWorkGroup2,          4
.set sgprArgType,             5
.set sgprStaggerU,            6
.set sgprWGM,                 7
.set sgprLoopCounterL,        8
.set sgprOrigLoopCounter,     9
.set sgprNumWorkGroups0,      10
.set sgprNumWorkGroups1,      11

/* Resource descriptors for D and C */
.set sgprSrdD,                12
.set sgprSrdC,                16

/* s20..s50: filled by the kernel-argument loads / preload moves */
.set sgprSizesFree,           20
.set sgprSizesSum,            23
.set sgprAddressD,            24
.set sgprAddressC,            26
.set sgprAddressA,            28
.set sgprAddressB,            30
.set sgprAddressWS,           32
.set sgprAddressFlags,        34
.set sgprStridesD,            36
.set sgprStridesC,            38
.set sgprStridesA,            40
.set sgprStridesB,            42
.set sgprAlpha,               44
.set sgprBeta,                45
.set sgprItersPerTile,        46
.set sgprTotalIters,          47
.set sgprSKItersPerWG,        48
.set sgprskGridAndTiles,      49
.set sgprskExtraIters,        50

/* Local-write / swap / StreamK bookkeeping */
.set sgprLocalWriteAddrA,     51
.set sgprLocalWriteAddrB,     52
.set sgprSwapA,               53
.set sgprSwapB,               54
.set sgprStreamKIdx,          55
.set sgprStreamKIter,         56
.set sgprStreamKIterEnd,      57
.set sgprStreamKLocalStart,   58
.set sgprStreamKLocalEnd,     59
.set sgprSrdWS,               60

/* Size aliases: I, J, K index into SizesFree; L is SizesSum[0] */
.set sgprSizeI,               sgprSizesFree
.set sgprSizeJ,               sgprSizesFree+1
.set sgprSizeK,               sgprSizesFree+2
.set sgprSizeL,               sgprSizesSum

/* Stride aliases: const* symbols are literal unit strides, */
/* sgpr* symbols index into the per-matrix stride pairs.    */
.set constStrideD0I,          1
.set constStrideC0I,          1
.set constStrideA0I,          1
.set constStrideBL,           1
.set sgprStrideD1J,           sgprStridesD
.set sgprStrideDK,            sgprStridesD+1
.set sgprStrideC1J,           sgprStridesC
.set sgprStrideCK,            sgprStridesC+1
.set sgprStrideAL,            sgprStridesA
.set sgprStrideAK,            sgprStridesA+1
.set sgprStrideB1J,           sgprStridesB
.set sgprStrideBK,            sgprStridesB+1

/* Macro-tile extents and unroll depth */
.set MT0,            256
.set MT1,            256
.set DepthU,         32

/* Bytes per element (and log2) for operand A; the *GR symbols are the global-read variants */
.set BpeA,           4
.set BpeALog2,       2
.set BpeAGR,         4
.set BpeAGRLog2,     2

/* Bytes per element (and log2) for operand B; the *GR symbols are the global-read variants */
.set BpeB,           4
.set BpeBLog2,       2
.set BpeBGR,         4
.set BpeBGRLog2,     2

/* Number of elements to shift-left SRD */
.set SrdShiftLeftA,  4
.set SrdShiftLeftB,  4

/* 2GB limit - set offsets to -1 to exceed this and clamp */
.set BufferLimit,    0xffffffff
.set BufferOOB,      0x80000000

/******************************************/
/* Bits 127:96 of SRD = 0x20000.          */
/* Field (width)         : value          */
/*   dst_sel_x (3b)      : 0              */
/*   dst_sel_y (3b)      : 0              */
/*   dst_sel_z (3b)      : 0              */
/*   dst_sel_w (3b)      : 0              */
/*   num_format (3b)     : 0              */
/*   data_format (4b)    : 4              */
/*   user_vm_enable (1b) : 0              */
/*   user_vm_mode (1b)   : 0              */
/*   index_stride (2b)   : 0              */
/*   add_tid_enable (1b) : 0              */
/*   _unusedA (3b)       : 0              */
/*   nv (1b)             : 0              */
/*   _unusedB (2b)       : 0              */
/*   type (2b)           : 0              */
/******************************************/
.set Srd127_96,      0x20000

/* Global Offset A (32-bit arithmetic):                                                   */
/*   v[vgprAddr] = (v[vgprOffset0I] + s[sgprStrideAL] * v[vgprOffsetL] + 4) << 2          */
/* Clobbers v[vgprTmp] and vcc.                                                           */
.macro GLOBAL_OFFSET_A vgprAddr:req, vgprOffset0I:req, vgprOffsetL:req, vgprTmp:req
    v_mul_lo_u32  v[\vgprTmp], s[sgprStrideAL], v[\vgprOffsetL]     // tmp = StrideAL * offsetL (low 32 bits)
    v_add_co_u32  v[\vgprAddr], vcc, v[\vgprOffset0I], v[\vgprTmp]  // addr = offset0I + tmp, carry-out to vcc
    v_add_u32     v[\vgprAddr], 0x4, v[\vgprAddr]                   // addr += 4: prepad for pointer shift
    v_lshlrev_b32 v[\vgprAddr], 2, v[\vgprAddr]                     // addr <<= 2: elements -> bytes
.endm

/* Global Offset B (32-bit arithmetic):                                                   */
/*   v[vgprAddr] = (v[vgprOffsetL] + s[sgprStrideB1J] * v[vgprOffset1J] + 4) << 2         */
/* Clobbers v[vgprTmp] and vcc.                                                           */
.macro GLOBAL_OFFSET_B vgprAddr:req, vgprOffsetL:req, vgprOffset1J:req, vgprTmp:req
    v_mul_lo_u32  v[\vgprTmp], s[sgprStrideB1J], v[\vgprOffset1J]   // tmp = StrideB1J * offset1J (low 32 bits)
    v_add_co_u32  v[\vgprAddr], vcc, v[\vgprOffsetL], v[\vgprTmp]   // addr = offsetL + tmp, carry-out to vcc
    v_add_u32     v[\vgprAddr], 0x4, v[\vgprAddr]                   // addr += 4: prepad for pointer shift
    v_lshlrev_b32 v[\vgprAddr], 2, v[\vgprAddr]                     // addr <<= 2: elements -> bytes
.endm

/******************************************/
/* Allocate Resources                     */
/******************************************/

/* Non-preload entry path: fetch the four common kernarg dwords, decode the  */
/* argument type, then either load the per-kernel arguments straight from    */
/* the kernarg segment (type 0) or replace the kernarg pointer with the      */
/* 64-bit address stored at byte offset 16 (any other type).                 */
/* Scratch: s64 = gemm info (gemm count after masking), s65 = arg type,      */
/*          s66 = packed kernel info0, s67 = number of workgroups.           */

/* Load num of Gemms */
s_load_dword s64, s[sgprKernArgAddress:sgprKernArgAddress+1], 0

/* Load packed kernel args (StaggerU/GSU) */
s_load_dword s66, s[sgprKernArgAddress:sgprKernArgAddress+1], 4

/* Load WGM data */
s_load_dword s[sgprWGM], s[sgprKernArgAddress:sgprKernArgAddress+1], 8

/* Load num of WGs */
s_load_dword s67, s[sgprKernArgAddress:sgprKernArgAddress+1], 12
s_waitcnt lgkmcnt(0)                               // load args
s_lshr_b32 s65, s64, 0x1e                          // Get arg type (top 2 bits of gemm info)
s_and_b32 s64, 0x3fffffff, s64                     // Get nums of gemm (low 30 bits)
s_cmp_eq_u32 s65, 0                                // Is kernel args
s_cbranch_scc0 label_HBMArgs
s_add_u32 s[sgprKernArgAddress], s[sgprKernArgAddress], 0x10 // Shift common args (skip the 16 bytes read above)
s_addc_u32 s[sgprKernArgAddress+1], s[sgprKernArgAddress+1], 0

/* Load Kernel Args (offsets are relative to the shifted pointer) */
s_load_dwordx16 s[20:35], s[sgprKernArgAddress:sgprKernArgAddress+1], 0 // 0: SizesFree0..2, SizesSum0, D, C, A, B, AddressWS, AddressFlags
s_load_dwordx8 s[36:43], s[sgprKernArgAddress:sgprKernArgAddress+1], 64 // 64: strides D, C, A, B
s_load_dwordx4 s[44:47], s[sgprKernArgAddress:sgprKernArgAddress+1], 96 // 96: alpha, beta, ItersPerTile, TotalIters
s_load_dwordx2 s[48:49], s[sgprKernArgAddress:sgprKernArgAddress+1], 112 // 112: SKItersPerWG, skGridAndTiles
s_load_dword s50, s[sgprKernArgAddress:sgprKernArgAddress+1], 120 // 120: skExtraIters
s_waitcnt lgkmcnt(0)                               // preload
s_branch label_LoadArgsEnd
label_HBMArgs:

/* Load address of kernel arguments */
s_load_dwordx2 s[sgprKernArgAddress:sgprKernArgAddress+1], s[sgprKernArgAddress:sgprKernArgAddress+1], 16
s_waitcnt lgkmcnt(0)                               // wait for args to load
label_LoadArgsEnd:
s_branch label_common_kernel_entry

/* pad 33 snops to satisfy 0x100 code size for Preload Backward Compatibility Prologue */
/* Do not add or remove instructions above without re-deriving this pad count.        */
/* NOTE(review): by a hand count of encodings (8 bytes per s_load_*, 8 for the        */
/* s_and_b32 with a 32-bit literal, 4 for every other instruction) the code above     */
/* totals 128 bytes, which with 33 s_nops would place the next label at 0x104 rather  */
/* than 0x100 -- confirm against a disassembly. If so, offset 0x100 falls on the      */
/* final s_nop, which simply falls through to the preload entry.                      */
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
/* Preload entry path: kernarg dwords 0..10 are taken from s2..s12 instead   */
/* of being loaded from memory.                                              */
/*   s2 = gemm info, s3 = kernel info0, s4 = kernel info1, s5 = numWG,       */
/*   s6..s9 = SizesFree0..2 + SizesSum0, s10:s11 = D, s12 = low dword of C.  */
/* The remaining arguments are still fetched with s_load_*; no s_waitcnt is  */
/* issued in this block, so those loads are in flight when control reaches   */
/* label_common_kernel_entry.                                                */
label_Preload_Offset_Start:
s_and_b32 s64, 0x3fffffff, s2                      // Get nums of gemm (low 30 bits)
s_lshr_b32 s65, s2, 0x1e                           // Get arg type (top 2 bits)
s_mov_b32 s66, s3                                  // Preload internal args
s_cmp_eq_u32 s65, 0                                // Is kernel args
s_cbranch_scc0 label_Preload_HBMArgs
s_add_u32 s[sgprKernArgAddress], s[sgprKernArgAddress], 0x10 // Shift common args
s_addc_u32 s[sgprKernArgAddress+1], s[sgprKernArgAddress+1], 0

/* Load Kernel Args (only the part that was not preloaded) */
s_load_dword s27, s[sgprKernArgAddress:sgprKernArgAddress+1], 28 // 28: upper dword of C (lower dword comes from s12)
s_load_dwordx16 s[28:43], s[sgprKernArgAddress:sgprKernArgAddress+1], 32 // 32: A, B, AddressWS, AddressFlags, strides D, C, A, B
s_load_dwordx4 s[44:47], s[sgprKernArgAddress:sgprKernArgAddress+1], 96 // 96: alpha, beta, ItersPerTile, TotalIters
s_load_dwordx2 s[48:49], s[sgprKernArgAddress:sgprKernArgAddress+1], 112 // 112: SKItersPerWG, skGridAndTiles
s_load_dword s50, s[sgprKernArgAddress:sgprKernArgAddress+1], 120 // 120: skExtraIters
s_mov_b64 s[20:21], s[6:7]                         // SizesFree0, SizesFree1: move preload data to correct sgpr
s_mov_b64 s[22:23], s[8:9]                         // SizesFree2, SizesSum0: move preload data to correct sgpr
s_mov_b64 s[24:25], s[10:11]                       // AddressD: move preload data to correct sgpr
s_mov_b32 s26, s12                                 // low dword of AddressC: move preload data to correct sgpr
s_branch label_Preload_LoadArgsEnd
label_Preload_HBMArgs:
s_mov_b64 s[sgprKernArgAddress:sgprKernArgAddress+1], s[6:7] // Load address of kernel arguments (preloaded kernarg bytes 16..23)
label_Preload_LoadArgsEnd:
s_mov_b32 s[sgprWGM], s4                           // Preload internal args2 (done after s[6:7] has been consumed, since sgprWGM is s7)
s_mov_b32 s67, s5                                  // Load num of WGs
/* Common entry: both argument paths arrive here with s64 = gemm count,      */
/* s65 = arg type, s66 = packed kernel info0 and s67 = number of workgroups. */
label_common_kernel_entry:  /// for both preload/non-preload common code
s_mov_b32 s[sgprWorkGroup0+0], s13                 // restore workgroup id (x)
s_mov_b32 s[sgprWorkGroup0+1], s14                 // restore workgroup id (y)
s_mov_b32 s[sgprWorkGroup0+2], s15                 // restore workgroup id (z)
s_and_b32 s[sgprStaggerU], s66, 0xffff0000         // Restore StaggerU related vars: keep the upper 16 bits of s66 ...
s_lshr_b32 s[sgprStaggerU], s[sgprStaggerU], 0x10  // ... and shift them down to bits 15:0
s_mov_b32 s[sgprArgType], s65                      // keep the arg type in its named register
s_mov_b32 m0, 0x20400                              // LDS clamp at 132096 bytes
v_mov_b32 v[vgprSerial], v0                        // thread serial id

/* remap workgroup to XCCs */
/* Inputs : sgprWGM (bits 16 and up hold WGMXCC, bits 22 and up hold CU_Count), */
/*          s67 = number of workgroups, sgprWorkGroup0 = workgroup id.          */
/* Output : sgprWorkGroup0 is rewritten when a remap applies.                   */
/* Scratch: s69..s73, v18, v19; exec is temporarily narrowed by v_cmpx and      */
/*          reset to -1 afterwards.                                             */
s_lshr_b32 s72, s[sgprWGM], 0x10                   // Get WGMXCC
s_ff1_i32_b32 s72, s72                             // Get log(WGMXCC): index of the lowest set bit
s_lshr_b32 s73, s[sgprWGM], 0x16                   // Get CU_Count
/* remap WGs if WGMXCC > 1 ( log(WGMXCC) > 0 ) */
s_cmp_gt_i32 s72, 0
s_cbranch_scc0 label_skip_WGMXCC
/* only remap WGs in the range */
s_lshr_b32 s69, s67, s72                           // s69 = WGs >> log(WGMXCC)
s_lshl_b32 s69, s69, s72                           // s69 = WGs rounded down to a multiple of WGMXCC
s_cmp_ge_u32 s[sgprWorkGroup0], s69
s_cbranch_scc1 label_skip_WGMXCC                   // workgroups at or beyond that bound keep their id
s_cmp_eq_u32 s73, 0                                // CU_Count == 0 ?
s_cbranch_scc0 label_XCCG_nonzero
/* CU_Count == 0: wg = (wg >> log) + (wg & (WGMXCC-1)) * (WGs >> log) */
s_lshr_b32 s69, s[sgprWorkGroup0], s72             // s69 = wg >> log(WGMXCC)
s_bfm_b32 s70, s72, 0                              // s70 = mask of the low log(WGMXCC) bits
s_and_b32 s70, s[sgprWorkGroup0], s70              // s70 = wg % WGMXCC
s_lshr_b32 s71, s67, s72                           // s71 = WGs >> log(WGMXCC)
s_mul_i32 s70, s70, s71
s_add_u32 s[sgprWorkGroup0], s69, s70
s_branch label_skip_WGMXCC
label_XCCG_nonzero:
/* temp0 = (wg//CU_Count)*CU_Count */
v_cvt_f32_u32 v18, s73                             // wg//CU_Count: v18 = float(CU_Count)
v_rcp_iflag_f32 v18, v18                           // wg//CU_Count: v18 = reciprocal of CU_Count
v_cvt_f32_u32 v19, s[sgprWorkGroup0]               // wg//CU_Count: v19 = float(wg)
v_mul_f32 v18, v18, v19                            // wg//CU_Count: v18 = quotient estimate (float)
v_cvt_u32_f32 v18, v18                             // wg//CU_Count: v18 = quotient estimate (u32)
v_mul_u32_u24 v19, v18, s73                        // wg//CU_Count: v19 = quotient * CU_Count (24-bit multiply)
v_sub_u32 v19, s[sgprWorkGroup0], v19              // wg//CU_Count: v19 = remainder
v_cmpx_eq_u32 exec, v19, s73                       // wg//CU_Count: select lanes where remainder == CU_Count
v_add_u32 v18, 1, v18                              // wg//CU_Count: quotient + 1 in those lanes
v_mov_b32 v19, 0                                   // wg//CU_Count: remainder = 0 in those lanes
s_mov_b64 exec, -1                                 // Reset exec
v_cmpx_gt_u32 exec, v19, s73                       // overflow happened in remainder
v_sub_u32 v18, v18, 1                              // quotient - 1
v_mul_u32_u24 v19, v18, s73                        // re-calculate remainder
v_sub_u32 v19, s[sgprWorkGroup0], v19              // re-calculate remainder
s_mov_b64 exec, -1                                 // Reset exec
v_readfirstlane_b32 s69, v18                       // quotient
v_readfirstlane_b32 s70, v19                       // remainder
s_mul_i32 s69, s69, s73
/* temp1 = (wg%CU_Count)//WGMXCC */
s_lshr_b32 s70, s70, s72
/* temp0 = temp0 + temp1 */
s_add_u32 s69, s69, s70
/* temp1 = (wg%WGMXCC) * ((WGs - (WGs//CU_Count) * CU_Count) if (wg > (WGs//CU_Count) * CU_Count) else CU_Count)//WGMXCC */
v_cvt_f32_u32 v18, s73                             // WGs//CU_Count: v18 = float(CU_Count)
v_rcp_iflag_f32 v18, v18                           // WGs//CU_Count: v18 = reciprocal of CU_Count
v_cvt_f32_u32 v19, s67                             // WGs//CU_Count: v19 = float(WGs)
v_mul_f32 v18, v18, v19                            // WGs//CU_Count: v18 = quotient estimate (float)
v_cvt_u32_f32 v18, v18                             // WGs//CU_Count: v18 = quotient estimate (u32)
v_mul_u32_u24 v19, v18, s73                        // WGs//CU_Count: v19 = quotient * CU_Count (24-bit multiply)
v_sub_u32 v19, s67, v19                            // WGs//CU_Count: v19 = remainder
v_cmpx_eq_u32 exec, v19, s73                       // WGs//CU_Count: select lanes where remainder == CU_Count
v_add_u32 v18, 1, v18                              // WGs//CU_Count: quotient + 1 in those lanes
s_mov_b64 exec, -1                                 // Reset exec
v_cmpx_gt_u32 exec, v19, s73                       // overflow happened in remainder
v_sub_u32 v18, v18, 1                              // quotient - 1
s_mov_b64 exec, -1                                 // Reset exec
v_readfirstlane_b32 s70, v18                       // quotient
s_mul_i32 s70, s70, s73                            // s70 = (WGs//CU_Count) * CU_Count
s_sub_u32 s71, s67, s70                            // s71 = WGs - (WGs//CU_Count) * CU_Count
s_cmp_gt_u32 s[sgprWorkGroup0], s70
s_cselect_b32 s70, s71, s73                        // s70 = s71 if wg > s70 else CU_Count
s_lshr_b32 s70, s70, s72                           // ... //WGMXCC
s_bfm_b32 s71, s72, 0                              // s71 = mask of the low log(WGMXCC) bits
s_and_b32 s71, s[sgprWorkGroup0], s71              // s71 = wg % WGMXCC
s_mul_i32 s70, s70, s71
/* WorkGroup0 = temp0 + temp1 */
s_add_u32 s[sgprWorkGroup0], s69, s70
label_skip_WGMXCC:  /// skip WGMXCC if not enough WGs to remap
/* Arg type 0 (single GEMM): derive the workgroup grid from the problem sizes, */
/*   NumWorkGroups0 = ceil(SizesFree0 / MT0), NumWorkGroups1 = ceil(SizesFree1 / MT1). */
/* Any other arg type branches to label_MultiGemm. Scratch: v18..v21, vcc.     */
s_cmp_eq_u32 s65, 0
s_cbranch_scc0 label_MultiGemm
/* init: add vgpr [18...164) to pool */
/* init: add vgpr [0...0) to pool */
/* init: add agpr [0...256) to pool */
v_mov_b32 v20, MT0                                 // v20 = MT0 (divisor, held in a VGPR)
v_mov_b32 v19, s[sgprSizesFree+0]                  // set Free0 size
v_cvt_f32_u32 v18, v20                             // v18 = ceil(v19 / v20): float(divisor)
v_rcp_iflag_f32 v18, v18                           // v18 = ceil(v19 / v20): reciprocal of divisor
v_cvt_f32_u32 v21, v19                             // v18 = ceil(v19 / v20): float(dividend)
v_mul_f32 v18, v18, v21                            // v18 = ceil(v19 / v20): quotient estimate (float)
v_cvt_u32_f32 v18, v18                             // v18 = ceil(v19 / v20): quotient estimate (u32)
v_mul_u32_u24 v21, v18, v20                        // v18 = ceil(v19 / v20): quotient * divisor (24-bit multiply)
v_sub_u32 v21, v19, v21                            // v18 = ceil(v19 / v20): remainder
v_cmp_ne_u32 vcc, v21, 0                           // v18 = ceil(v19 / v20): vcc = remainder != 0
v_addc_co_u32 v18, vcc, v18, 0, vcc                // ceil: quotient + 1 when a remainder exists
v_mov_b32 v20, MT1                                 // v20 = MT1 (divisor, held in a VGPR)
v_mov_b32 v19, s[sgprSizesFree+1]                  // set Free1 size
v_readfirstlane_b32 s[sgprNumWorkGroups0], v18     // set back to numWorkGroup0
v_cvt_f32_u32 v18, v20                             // v18 = ceil(v19 / v20): float(divisor)
v_rcp_iflag_f32 v18, v18                           // v18 = ceil(v19 / v20): reciprocal of divisor
v_cvt_f32_u32 v21, v19                             // v18 = ceil(v19 / v20): float(dividend)
v_mul_f32 v18, v18, v21                            // v18 = ceil(v19 / v20): quotient estimate (float)
v_cvt_u32_f32 v18, v18                             // v18 = ceil(v19 / v20): quotient estimate (u32)
v_mul_u32_u24 v21, v18, v20                        // v18 = ceil(v19 / v20): quotient * divisor (24-bit multiply)
v_sub_u32 v21, v19, v21                            // v18 = ceil(v19 / v20): remainder
v_cmp_ne_u32 vcc, v21, 0                           // v18 = ceil(v19 / v20): vcc = remainder != 0
v_addc_co_u32 v18, vcc, v18, 0, vcc                // ceil: quotient + 1 when a remainder exists
s_nop 0                                            // 1 wait states
v_readfirstlane_b32 s[sgprNumWorkGroups1], v18     // set back to numWorkGroup1
s_waitcnt lgkmcnt(0)                               // wait for 80/0 bytes of kern args (covers loads still in flight from the preload path)
s_branch label_MultiGemmEnd
/* Grouped GEMM: walk the per-GEMM size records and accumulate tile counts    */
/* until the running total exceeds this workgroup's id.                       */
/* Registers used here:                                                       */
/*   s[66:67] = copy of the argument base address                             */
/*   s72      = byte offset of the record being read                          */
/*   s11      = byte stride between records (160, or 216 when ArgType == 2)   */
/*   s10      = 1-based index of the record being examined                    */
/*   s73      = running total of tiles, s70 = tiles of the current record     */
/*   s[20:23] = the four dwords loaded from the current record                */
/* Note: s10/s11 are sgprNumWorkGroups0/1 and s67 previously held numWG;      */
/* they are reused as scratch in this block.                                  */
label_MultiGemm:

/* Check if custom structure pointer is null */
s_cmp_eq_u32 s[sgprArgType], 2                     // ArgType == 2 ?
s_cbranch_scc1 label_IsExternalValid               // branch if ArgType == 2
s_mov_b32 s11, 160                                 // record stride in bytes
s_mul_i32 s72, s64, 4                              // first record starts after gemm_count * 4 bytes
s_mov_b64 s[66:67], s[sgprKernArgAddress:sgprKernArgAddress+1]
s_branch label_IsExternalValidEnd
label_IsExternalValid:
s_mov_b32 s11, 216                                 // record stride in bytes
s_mov_b32 s72, 0                                   // first record starts at offset 0
s_mov_b64 s[66:67], s[sgprKernArgAddress:sgprKernArgAddress+1]
label_IsExternalValidEnd:

/* Grouped Gemm:: prefetch 1 arg load */
s_mov_b32 s10, 1
s_mov_b32 s73, 0
s_load_dwordx4 s[20:23], s[66:67], s72
s_cmpk_eq_u32 s64, 1                               // if gemm_count is 1?
s_cbranch_scc1 label_wgTable_noLoadLoop

/* Grouped Gemm:: accumulate numTiles for each gemm */
/* Grouped Gemm:: loop start */
label_Loop_GemmCount:
s_waitcnt lgkmcnt(0)
s_lshr_b32 s70, s20, 8                             // s70 = s20 / 256
s_and_b32 s68, 255, s20                            // s68 = s20 % 256 (SCC = 1 when non-zero)
s_addc_u32 s70, s70, 0                             // s70 += SCC, i.e. ceil(s20 / 256)
s_lshr_b32 s71, s21, 8                             // s71 = s21 / 256
s_and_b32 s68, 255, s21                            // s68 = s21 % 256 (SCC = 1 when non-zero)
s_addc_u32 s71, s71, 0                             // s71 += SCC, i.e. ceil(s21 / 256)
s_mul_i32 s70, s70, s71
s_mul_i32 s70, s70, s22                            // s70 = tiles of this record (times s22)
s_add_u32 s73, s73, s70                            // running total
s_cmp_lt_u32 s[sgprWorkGroup0], s73
s_cbranch_scc1 label_FOUND                         // this workgroup falls inside the current record
s_add_u32 s72, s72, s11                            // advance to the next record
s_load_dwordx4 s[20:23], s[66:67], s72
s_add_u32 s10, s10, 1
s_cmp_lt_u32 s10, s64
s_cbranch_scc1 label_Loop_GemmCount

/* Grouped Gemm:: noLoadLoop */
/* Last record (or the only one): same accumulation, no further load issued. */
label_wgTable_noLoadLoop:
s_waitcnt lgkmcnt(0)
s_lshr_b32 s70, s20, 8                             // s70 = s20 / 256
s_and_b32 s68, 255, s20                            // s68 = s20 % 256 (SCC = 1 when non-zero)
s_addc_u32 s70, s70, 0                             // s70 += SCC, i.e. ceil(s20 / 256)
s_lshr_b32 s71, s21, 8                             // s71 = s21 / 256
s_and_b32 s68, 255, s21                            // s68 = s21 % 256 (SCC = 1 when non-zero)
s_addc_u32 s71, s71, 0                             // s71 += SCC, i.e. ceil(s21 / 256)
s_mul_i32 s70, s70, s71
s_mul_i32 s70, s70, s22                            // s70 = tiles of this record (times s22)
s_add_u32 s73, s73, s70                            // running total
/* Grouped Gemm:: gemmIndex found */
/* On entry: s10 = 1-based index of the selected GEMM record, s73 = running  */
/* tile total including that record, s70 = tiles of that record, s64 = gemm  */
/* count. Rebases sgprWorkGroup0 to the selected GEMM, points the kernarg    */
/* address at that GEMM's record and issues the argument loads. No s_waitcnt */
/* follows the final loads in this block; consumers must wait on lgkmcnt.    */
label_FOUND:
s_sub_u32 s67, s10, 1                              // s67 = zero-based gemm index
s_sub_u32 s66, s73, s70                            // s66 = tiles belonging to earlier gemms
s_sub_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s66 // workgroup id relative to the selected gemm
/* Check if custom structure pointer is null */
s_cmp_eq_u32 s[sgprArgType], 2                     // ArgType == 2 ?
s_cbranch_scc1 label_LoadExternalStruct            // branch if ArgType == 2

/* Grouped Gemm: offset argument address to gemm */
/* Grouped Gemm: offset address from wg_table_start to args_start */
s_lshl2_add_u32 s[sgprKernArgAddress], s64, s[sgprKernArgAddress] // address += gemm_count * 4
s_addc_u32 s[sgprKernArgAddress+1], s[sgprKernArgAddress+1], 0
/* Grouped Gemm: offset address from args_start to gemm_start */
s_mul_i32 s67, s67, 160                            // 160-byte record per gemm
s_add_u32 s[sgprKernArgAddress], s[sgprKernArgAddress], s67
s_addc_u32 s[sgprKernArgAddress+1], s[sgprKernArgAddress+1], 0

/* Load Kernel Args */
s_load_dwordx16 s[24:39], s[sgprKernArgAddress:sgprKernArgAddress+1], 16 // 16: D, C, A, B, AddressWS, AddressFlags, strides D, C
s_load_dwordx8 s[40:47], s[sgprKernArgAddress:sgprKernArgAddress+1], 80 // 80: strides A, B, alpha, beta, ItersPerTile, TotalIters
s_load_dwordx2 s[48:49], s[sgprKernArgAddress:sgprKernArgAddress+1], 112 // 112: SKItersPerWG, skGridAndTiles
s_load_dword s50, s[sgprKernArgAddress:sgprKernArgAddress+1], 120 // 120: skExtraIters
s_branch label_LoadExternalStructEnd
label_LoadExternalStruct:
/* Grouped Gemm: offset address from args_start to gemm_start */
s_mul_i32 s67, s67, 216                            // 216-byte record per gemm
s_add_u32 s[sgprKernArgAddress], s[sgprKernArgAddress], s67
s_addc_u32 s[sgprKernArgAddress+1], s[sgprKernArgAddress+1], 0
/* NOTE(review): this path loads s[24:49] and Beta but never s50             */
/* (sgprskExtraIters) -- confirm that is intended for the external layout.   */
s_load_dwordx16 s[24:39], s[sgprKernArgAddress:sgprKernArgAddress+1], 16 // 16
s_load_dwordx8 s[40:47], s[sgprKernArgAddress:sgprKernArgAddress+1], 80 // 80
s_load_dwordx2 s[48:49], s[sgprKernArgAddress:sgprKernArgAddress+1], 112 // 112
// Read Beta
// The dwordx8 load above also writes s45 (sgprBeta). Scalar loads may return
// out of order, so drain them first; otherwise the dword from offset 100
// could land after, and overwrite, the Beta value loaded from offset 132.
s_waitcnt lgkmcnt(0)                               // order the two writes to s45
s_load_dword s45, s[sgprKernArgAddress:sgprKernArgAddress+1], 132 // 132
label_LoadExternalStructEnd:
/* init: add vgpr [18...164) to pool */
/* init: add vgpr [0...0) to pool */
/* init: add agpr [0...256) to pool */
/* NumWorkGroups0 = ceil(SizesFree+0 / MT0), NumWorkGroups1 = ceil(SizesFree+1 / MT1).  */
/* Each division uses a float reciprocal estimate, then adds 1 when the integer         */
/* remainder is non-zero. v18..v21 are scratch.                                         */
/* NOTE(review): SizesFree is read before the s_waitcnt lgkmcnt(0) below, so it is      */
/* presumably valid on entry rather than produced by the loads just issued - confirm.   */
v_mov_b32 v20, MT0                                 // v20 = MT0 (divisor)
v_mov_b32 v19, s[sgprSizesFree+0]                  // v19 = Free0 size (dividend)
v_cvt_f32_u32 v18, v20                             // v18 = float(MT0)
v_rcp_iflag_f32 v18, v18                           // v18 = reciprocal of MT0
v_cvt_f32_u32 v21, v19                             // v21 = float(Free0)
v_mul_f32 v18, v18, v21                            // v18 = Free0 * (1 / MT0)
v_cvt_u32_f32 v18, v18                             // v18 = quotient estimate as u32
v_mul_u32_u24 v21, v18, v20                        // v21 = quotient * MT0 (24-bit multiply)
v_sub_u32 v21, v19, v21                            // v21 = Free0 - quotient * MT0 (remainder)
v_cmp_ne_u32 vcc, v21, 0                           // vcc = (remainder != 0)
v_addc_co_u32 v18, vcc, v18, 0, vcc                // ceil: quotient += 1 when remainder != 0
v_mov_b32 v20, MT1                                 // v20 = MT1 (divisor)
v_mov_b32 v19, s[sgprSizesFree+1]                  // v19 = Free1 size (dividend)
v_readfirstlane_b32 s[sgprNumWorkGroups0], v18     // NumWorkGroups0 = ceil(Free0 / MT0)
v_cvt_f32_u32 v18, v20                             // v18 = float(MT1)
v_rcp_iflag_f32 v18, v18                           // v18 = reciprocal of MT1
v_cvt_f32_u32 v21, v19                             // v21 = float(Free1)
v_mul_f32 v18, v18, v21                            // v18 = Free1 * (1 / MT1)
v_cvt_u32_f32 v18, v18                             // v18 = quotient estimate as u32
v_mul_u32_u24 v21, v18, v20                        // v21 = quotient * MT1 (24-bit multiply)
v_sub_u32 v21, v19, v21                            // v21 = Free1 - quotient * MT1 (remainder)
v_cmp_ne_u32 vcc, v21, 0                           // vcc = (remainder != 0)
v_addc_co_u32 v18, vcc, v18, 0, vcc                // ceil: quotient += 1 when remainder != 0
s_nop 0                                            // 1 wait states
v_readfirstlane_b32 s[sgprNumWorkGroups1], v18     // NumWorkGroups1 = ceil(Free1 / MT1)
s_waitcnt lgkmcnt(0)                               // wait for all outstanding scalar (kern arg) loads

/* Early stop if N(SizeFreeJ) == 0 */
s_cmp_eq_u32 s[sgprSizeJ], 0                       // SizeJ == 0 ?
s_cbranch_scc0 label_NoEarlyStop_N0                // continue when SizeJ != 0
label_EarlyStop_if_N_is_0:
s_endpgm                                           // nothing to compute: end the wave
label_NoEarlyStop_N0:

label_MultiGemmEnd:
/* SGPR index assignments for the main loop, from s64 upward. s64..s73 served as       */
/* temporaries in the argument-loading code above and are given new names from here on. */
/* The 2-register gaps after WrapUA/WrapUB suggest register pairs - TODO confirm.       */
.set sgprSrdA, 64
.set sgprSrdB, 68
.set sgprShadowLimitA, 72
.set sgprShadowLimitB, 74
.set sgprStaggerUIter, 76
.set sgprWrapUA, 77
.set sgprWrapUB, 79
.set sgprGlobalReadIncsA, 81
.set sgprGlobalReadIncsB, 82
/* Move both 64-bit base addresses back by 16 bytes (sub + subb borrows into the high dword) */
s_sub_u32 s[sgprAddressA+0], s[sgprAddressA+0], 16 // pre-pad to make room for possible pointer shift
s_subb_u32 s[sgprAddressA+1], s[sgprAddressA+1], 0 // pre-pad to make room for possible pointer shift
s_sub_u32 s[sgprAddressB+0], s[sgprAddressB+0], 16 // pre-pad to make room for possible pointer shift
s_subb_u32 s[sgprAddressB+1], s[sgprAddressB+1], 0 // pre-pad to make room for possible pointer shift

/* Short circuit condition if Alpha == 0, then sumDims=0 */
v_cmp_eq_f32 vcc, s[sgprAlpha], 0.0                // s[Alpha] == 0.0f ?
s_cbranch_vccz label_AlphaNonZero                  // vcc all-zero means Alpha != 0: keep the summation size
s_mov_b32 s[sgprSizesSum+0], 0                     // Set summation dim=0 if Alpha == 0
label_AlphaNonZero:
/* StreamK init: derive this workgroup's first iteration range [StreamKIter, StreamKIterEnd). */
/* Default is the DP range (Idx * ItersPerTile .. TotalIters). When skTiles * ItersPerTile    */
/* is not below TotalIters, the range is the workgroup's SK share, with workgroups whose      */
/* index is below skExtraIters taking one extra iteration.                                    */
/* s83/s84 are scratch; s[84:86] are reused for the long branch.                              */
s_mov_b32 s[sgprStreamKIdx], s[sgprWorkGroup0]     // Save original StreamK index
s_mul_i32 s[sgprStreamKIter], s[sgprStreamKIdx], s[sgprItersPerTile] // DP starting iteration (case: DP work to do)
s_mov_b32 s[sgprStreamKIterEnd], s[sgprTotalIters] // DP ending iteration (case: only DP work to do)
s_and_b32 s83, s[sgprskGridAndTiles], 0xffff       // Get skTiles (low 16 bits)
s_mul_i32 s83, s83, s[sgprItersPerTile]            // Total SK iters
s_cmp_lt_u32 s83, s[sgprTotalIters]                // Check if there are DP tiles to do
s_cbranch_scc1 label_SK_InitDone                   // Done init: keep the DP range
s_mul_i32 s[sgprStreamKIter], s[sgprStreamKIdx], s[sgprSKItersPerWG] // StreamK starting iteration (case: after extra iters)
s_add_u32 s[sgprStreamKIter], s[sgprStreamKIter], s[sgprskExtraIters] // Add extra iters
s_add_u32 s[sgprStreamKIterEnd], s[sgprStreamKIter], s[sgprSKItersPerWG] // StreamK ending iteration (case: after extra iters)
s_add_u32 s84, s[sgprSKItersPerWG], 1              // Spread out extra iterations: s84 = SKItersPerWG + 1
s_mul_i32 s83, s[sgprStreamKIdx], s84              // StreamK starting iteration (case: before extra iters)
s_add_u32 s84, s83, s84                            // StreamK ending iteration (case: before extra iters)
s_cmp_lt_u32 s[sgprStreamKIdx], s[sgprskExtraIters] // Check if this workgroup gets an extra iteration
s_cselect_b32 s[sgprStreamKIter], s83, s[sgprStreamKIter] // Set start iter (SCC from the compare above)
s_cselect_b32 s[sgprStreamKIterEnd], s84, s[sgprStreamKIterEnd] // Set end iter (same SCC)
s_and_b32 s83, s[sgprskGridAndTiles], 0xffff       // Get skTiles
s_mul_i32 s83, s83, s[sgprItersPerTile]            // Total SK iters
s_min_u32 s[sgprStreamKIterEnd], s[sgprStreamKIterEnd], s83 // Cap ending iter at total SK iters
label_SK_InitDone:
s_cmp_lt_u32 s[sgprStreamKIter], s[sgprTotalIters] // Make sure there's work to do
s_cbranch_scc1 label_NoBranch_T8JHFHKM7BO5OHXW     // Only branch on scc0
/* Long branch to label_KernelEnd via a computed PC, taken when StreamKIter >= TotalIters */
s_getpc_b64 s[84:85]                               // addr of next instr
s_add_i32 s86, label_KernelEnd, 4                  // target branch offset
s_add_u32 s84, s84, s86                            // add target branch offset
s_addc_u32 s85, s85, 0                             // add high and carry
s_setpc_b64 s[84:85]                               // branch to label_KernelEnd
label_NoBranch_T8JHFHKM7BO5OHXW:

/******************************************/
/* Persistent Loop Start                  */
/******************************************/
label_PersistentLoopStart:

/******************************************/
/* Begin setupNewTile                     */
/******************************************/

/* global read addresses: work-group */
/* graWorkGroup mapping */

/* localReadResetOffsets */
/* handled internally */
/* SwapAddr holds (buffer0 ^ buffer1), so Addr ^ SwapAddr is the other buffer's address; */
/* the signed min of the two selects the lower one.                                      */
/* NOTE(review): LocalReadAddrA/B are recomputed later in this tile setup (Local Read    */
/* Addresses section), so this reset looks overwritten before use - confirm.             */
v_xor_b32 v18, v[vgprLocalReadSwapAddrA], v[vgprLocalReadAddrA] // Get other lds buffer offset value
v_min_i32 v[vgprLocalReadAddrA], v[vgprLocalReadAddrA], v18 // Set LRA to first buffer offset

/* localReadResetOffsets */
/* handled internally */
v_xor_b32 v18, v[vgprLocalReadSwapAddrB], v[vgprLocalReadAddrB] // Get other lds buffer offset value
v_min_i32 v[vgprLocalReadAddrB], v[vgprLocalReadAddrB], v18 // Set LRA to first buffer offset
/* StreamK calculate tile idx and map to WG */
/* s84 = StreamKIter / ItersPerTile (tile index) via float reciprocal plus fix-ups.     */
/* Then compute the local [start, end) range inside that tile, and the iteration this   */
/* workgroup will resume at on the next pass (built in s85, stored at SK_UpdateDone).   */
/* v18/v19 and s84..s90 are scratch.                                                    */
v_cvt_f32_u32 v18, s[sgprItersPerTile]             // v18 = float(ItersPerTile)
v_rcp_iflag_f32 v18, v18                           // v18 = reciprocal of ItersPerTile
v_cvt_f32_u32 v19, s[sgprStreamKIter]              // v19 = float(StreamKIter)
v_mul_f32 v18, v18, v19                            // v18 = StreamKIter * (1 / ItersPerTile)
v_cvt_u32_f32 v18, v18                             // v18 = quotient estimate as u32
v_mul_u32_u24 v19, v18, s[sgprItersPerTile]        // v19 = quotient * ItersPerTile (24-bit multiply)
v_sub_u32 v19, s[sgprStreamKIter], v19             // v19 = remainder estimate
v_cmpx_eq_u32 exec, v19, s[sgprItersPerTile]       // exec = lanes where remainder == divisor (quotient one too small)
v_add_u32 v18, 1, v18                              // quotient + 1 in those lanes
s_mov_b64 exec, -1                                 // Reset exec
v_cmpx_gt_u32 exec, v19, s[sgprItersPerTile]       // overflow happened in remainder
v_sub_u32 v18, v18, 1                              // quotient - 1
s_mov_b64 exec, -1                                 // Reset exec
v_readfirstlane_b32 s84, v18                       // quotient: tile index
s_mul_i32 s85, s84, s[sgprItersPerTile]            // Tile start iteration
s_add_u32 s86, s85, s[sgprItersPerTile]            // Tile end iteration
s_sub_u32 s[sgprStreamKLocalStart], s[sgprStreamKIter], s85 // Local iteration start
s_min_u32 s[sgprStreamKLocalEnd], s[sgprStreamKIterEnd], s86 // 1. (Local) iteration end (SK tile)
s_sub_u32 s[sgprStreamKLocalEnd], s[sgprStreamKLocalEnd], s85 // 2. Local iteration end (SK tile)
s_and_b32 s87, s[sgprskGridAndTiles], 0xffff       // Get skTiles (low 16 bits)
s_mul_i32 s87, s87, s[sgprItersPerTile]            // Total SK iters
s_sub_u32 s87, s[sgprTotalIters], s87              // Offset to first SK tile
s_lshr_b32 s85, s[sgprskGridAndTiles], 0x10        // Get skGrid (high 16 bits)
s_mul_i32 s85, s85, s[sgprItersPerTile]            // DP iterations shift
s_add_u32 s85, s85, s[sgprStreamKIter]             // Add DP shift
s_cmp_lt_u32 s85, s87                              // Check if still in DP section
s_cbranch_scc1 label_SK_UpdateDone                 // Done update: next iter = s85
s_mov_b32 s85, s86                                 // SK iterations shift: next iter = end of this tile
s_cmp_le_u32 s87, s[sgprStreamKIter]               // Check if continuing in SK section
s_cbranch_scc1 label_SK_UpdateDone                 // Done update
/* Transition into the SK section: compute this workgroup's SK range (same formula as at init) */
s_mul_i32 s[sgprStreamKIter], s[sgprStreamKIdx], s[sgprSKItersPerWG] // StreamK starting iteration (case: after extra iters)
s_add_u32 s[sgprStreamKIter], s[sgprStreamKIter], s[sgprskExtraIters] // Add extra iters
s_add_u32 s[sgprStreamKIterEnd], s[sgprStreamKIter], s[sgprSKItersPerWG] // StreamK ending iteration (case: after extra iters)
s_add_u32 s89, s[sgprSKItersPerWG], 1              // Spread out extra iterations
s_mul_i32 s88, s[sgprStreamKIdx], s89              // StreamK starting iteration (case: before extra iters)
s_add_u32 s89, s88, s89                            // StreamK ending iteration (case: before extra iters)
s_cmp_lt_u32 s[sgprStreamKIdx], s[sgprskExtraIters] // Check if this workgroup gets an extra iteration
s_cselect_b32 s[sgprStreamKIter], s88, s[sgprStreamKIter] // Set start iter
s_cselect_b32 s[sgprStreamKIterEnd], s89, s[sgprStreamKIterEnd] // Set end iter
s_add_u32 s85, s[sgprStreamKIter], s87             // Offset to start of SK section
s_add_u32 s[sgprStreamKIterEnd], s[sgprStreamKIterEnd], s87 // Offset to start of SK section
s_min_u32 s[sgprStreamKIterEnd], s[sgprStreamKIterEnd], s[sgprTotalIters] // Cap ending iter at TotalIters
/* The compare below uses StreamKIter as set just above, i.e. without the s87 offset (the offset value is in s85) */
s_cmp_lt_u32 s[sgprStreamKIter], s[sgprTotalIters] // Make sure there's work to do
s_cbranch_scc1 label_NoBranch_S4FDBQ587JJL6NOU     // Only branch on scc0
s_getpc_b64 s[88:89]                               // addr of next instr
s_add_i32 s90, label_KernelEnd, 4                  // target branch offset
s_add_u32 s88, s88, s90                            // add target branch offset
s_addc_u32 s89, s89, 0                             // add high and carry
s_setpc_b64 s[88:89]                               // branch to label_KernelEnd
label_NoBranch_S4FDBQ587JJL6NOU:
label_SK_UpdateDone:
s_mov_b32 s[sgprStreamKIter], s85                  // Store the iteration to resume from on the next pass
/* Map StreamK tile index to wg0/1/2 */
/* Tile index s84 is split as:                                                          */
/*   WorkGroup2 = s84 / (nWG0 * nWG1),  s86 = s84 % (nWG0 * nWG1)                       */
/*   WorkGroup1 = s86 / nWG0,           WorkGroup0 = s86 % nWG0                         */
/* Each division is a float reciprocal estimate followed by two exec-masked fix-ups.    */
s_mul_i32 s85, s[sgprNumWorkGroups0], s[sgprNumWorkGroups1] // Total tiles
v_cvt_f32_u32 v18, s85                             // v18 = float(nWG0*nWG1)
v_rcp_iflag_f32 v18, v18                           // v18 = reciprocal of nWG0*nWG1
v_cvt_f32_u32 v19, s84                             // v19 = float(TileID)
v_mul_f32 v18, v18, v19                            // v18 = TileID * reciprocal
v_cvt_u32_f32 v18, v18                             // v18 = quotient estimate as u32
v_mul_u32_u24 v19, v18, s85                        // v19 = quotient * divisor (24-bit multiply)
v_sub_u32 v19, s84, v19                            // v19 = remainder estimate
v_cmpx_eq_u32 exec, v19, s85                       // exec = lanes where remainder == divisor
v_add_u32 v18, 1, v18                              // quotient + 1 in those lanes
v_mov_b32 v19, 0                                   // and remainder = 0 in those lanes
s_mov_b64 exec, -1                                 // Reset exec
v_cmpx_gt_u32 exec, v19, s85                       // overflow happened in remainder
v_sub_u32 v18, v18, 1                              // quotient - 1
v_mul_u32_u24 v19, v18, s85                        // re-calculate remainder
v_sub_u32 v19, s84, v19                            // re-calculate remainder
s_mov_b64 exec, -1                                 // Reset exec
v_readfirstlane_b32 s[sgprWorkGroup2], v18         // quotient
v_readfirstlane_b32 s86, v19                       // remainder
v_cvt_f32_u32 v18, s[sgprNumWorkGroups0]           // v18 = float(nWG0)
v_rcp_iflag_f32 v18, v18                           // v18 = reciprocal of nWG0
v_cvt_f32_u32 v19, s86                             // v19 = float(s86)
v_mul_f32 v18, v18, v19                            // v18 = s86 * reciprocal
v_cvt_u32_f32 v18, v18                             // v18 = quotient estimate as u32
v_mul_u32_u24 v19, v18, s[sgprNumWorkGroups0]      // v19 = quotient * nWG0 (24-bit multiply)
v_sub_u32 v19, s86, v19                            // v19 = remainder estimate
v_cmpx_eq_u32 exec, v19, s[sgprNumWorkGroups0]     // exec = lanes where remainder == divisor
v_add_u32 v18, 1, v18                              // quotient + 1 in those lanes
v_mov_b32 v19, 0                                   // and remainder = 0 in those lanes
s_mov_b64 exec, -1                                 // Reset exec
v_cmpx_gt_u32 exec, v19, s[sgprNumWorkGroups0]     // overflow happened in remainder
v_sub_u32 v18, v18, 1                              // quotient - 1
v_mul_u32_u24 v19, v18, s[sgprNumWorkGroups0]      // re-calculate remainder
v_sub_u32 v19, s86, v19                            // re-calculate remainder
s_mov_b64 exec, -1                                 // Reset exec
v_readfirstlane_b32 s[sgprWorkGroup1], v18         // quotient
v_readfirstlane_b32 s[sgprWorkGroup0], v19         // remainder

/* Alpha == 0 handling for StreamK:                                                     */
/*  - Alpha != 0: continue at label_SKAlphaCheck.                                       */
/*  - Alpha == 0 and this workgroup starts the tile (LocalStart == 0): set              */
/*    StreamKLocalEnd = ItersPerTile and continue.                                      */
/*  - Alpha == 0 and LocalStart != 0: long-branch to label_GW_End.                      */
v_cmp_eq_f32 vcc, s[sgprAlpha], 0.0                // s[Alpha] == 0.0f ?
s_cbranch_vccz label_SKAlphaCheck                  // vcc all-zero means Alpha != 0
s_cmp_eq_u32 s[sgprStreamKLocalStart], 0           // does wg start tile?
s_cbranch_scc1 label_NoBranch_UR8VN3A1SJCPC6PO     // Only branch on scc0
s_getpc_b64 s[88:89]                               // addr of next instr
s_add_i32 s90, label_GW_End, 4                     // target branch offset
s_add_u32 s88, s88, s90                            // add target branch offset
s_addc_u32 s89, s89, 0                             // add high and carry
s_setpc_b64 s[88:89]                               // branch to label_GW_End
label_NoBranch_UR8VN3A1SJCPC6PO:
s_mov_b32 s[sgprStreamKLocalEnd], s[sgprItersPerTile] // Skip iterations
label_SKAlphaCheck:
/* Work-group remapping controlled by WGM (sign-extended from its low 16 bits):         */
/*   WGM > 1  : label_WGMPositive, blocks of WGM along WorkGroup1.                      */
/*   WGM 0, 1 : no remap, jump straight to label_WGM.                                   */
/*   WGM < 0  : fall through, same scheme with wg0/wg1 swapped using abs(WGM).          */
/* In both remap paths: s87 = |WGM|, s83 = blockId, s86 = serial id inside the block,   */
/* s84 = width of this block (|WGM|, or the leftover count for the last block).         */
s_sext_i32_i16 s[sgprWGM], s[sgprWGM]              // Restore WGM (sign-extend low 16 bits)
s_cmp_gt_i32 s[sgprWGM], 1                         // WGM > 1 ?
s_cbranch_scc1 label_WGMPositive                   // branch if WGM > 1
s_cmp_ge_i32 s[sgprWGM], 0                         // WGM >= 0 ?
s_cbranch_scc1 label_WGM                           // branch if WGM >= 0
/* ---- WGM < 0: block along WorkGroup0 ---- */
s_abs_i32 s87, s[sgprWGM]                          // abs(WGM)
v_cvt_f32_u32 v18, s87                             // v18 = float(|WGM|)
v_rcp_iflag_f32 v18, v18                           // v18 = reciprocal of |WGM|
v_cvt_f32_u32 v19, s[sgprWorkGroup0]               // v19 = float(wg0)
v_mul_f32 v18, v18, v19                            // v18 = wg0 * reciprocal
v_cvt_u32_f32 v18, v18                             // v18 = quotient estimate as u32
v_mul_u32_u24 v19, v18, s87                        // v19 = quotient * |WGM| (24-bit multiply)
v_sub_u32 v19, s[sgprWorkGroup0], v19              // v19 = remainder estimate
v_cmpx_eq_u32 exec, v19, s87                       // exec = lanes where remainder == divisor
v_add_u32 v18, 1, v18                              // quotient + 1 in those lanes
s_mov_b64 exec, -1                                 // Reset exec
v_cmpx_gt_u32 exec, v19, s87                       // overflow happened in remainder
v_sub_u32 v18, v18, 1                              // quotient - 1
s_mov_b64 exec, -1                                 // Reset exec
v_readfirstlane_b32 s83, v18                       // quotient: blockId = wg0 / |WGM|
s_mul_i32 s86, s83, s87                            // quotient * non-magic divisor
s_sub_u32 s86, s[sgprWorkGroup0], s86              // s86 = wg0 % |WGM|
s_mul_i32 s86, s86, s[sgprNumWorkGroups1]          // (wg0 % WGM)*NumWorkGroups1
s_add_u32 s86, s86, s[sgprWorkGroup1]              // wgSerial = wg1 + (wg0 % WGM)*NumWorkGroups1
v_cvt_f32_u32 v18, s87                             // v18 = float(|WGM|)
v_rcp_iflag_f32 v18, v18                           // v18 = reciprocal of |WGM|
v_cvt_f32_u32 v19, s[sgprNumWorkGroups0]           // v19 = float(NumWorkGroups0)
v_mul_f32 v18, v18, v19                            // v18 = NumWorkGroups0 * reciprocal
v_cvt_u32_f32 v18, v18                             // v18 = quotient estimate as u32
v_mul_u32_u24 v19, v18, s87                        // v19 = quotient * |WGM| (24-bit multiply)
v_sub_u32 v19, s[sgprNumWorkGroups0], v19          // v19 = remainder estimate
v_cmpx_eq_u32 exec, v19, s87                       // exec = lanes where remainder == divisor
v_add_u32 v18, 1, v18                              // quotient + 1 in those lanes
s_mov_b64 exec, -1                                 // Reset exec
v_cmpx_gt_u32 exec, v19, s87                       // overflow happened in remainder
v_sub_u32 v18, v18, 1                              // quotient - 1
s_mov_b64 exec, -1                                 // Reset exec
v_readfirstlane_b32 s84, v18                       // quotient: numFullBlocks = NumWorkGroups0 / |WGM|
s_mul_i32 s85, s87, s84                            // quotient * non-magic divisor
s_sub_u32 s85, s[sgprNumWorkGroups0], s85          // s85 = NumWorkGroups0 % |WGM|
s_cmp_eq_u32 s85, 0                                // remainder == 0 ?
s_cmov_b32 s85, s87                                // remainder = WGM if remainder == 0
s_cmp_ge_u32 s83, s84                              // blockId >= numFullBlocks ?
s_cselect_b32 s84, s85, s87                        // block width: leftover count for the last block, else |WGM|
v_cvt_f32_u32 v18, s84                             // s[sgprWorkGroup1] = s86 / s84
v_rcp_iflag_f32 v18, v18                           // s[sgprWorkGroup1] = s86 / s84
v_cvt_f32_u32 v19, s86                             // s[sgprWorkGroup1] = s86 / s84
v_mul_f32 v18, v18, v19                            // s[sgprWorkGroup1] = s86 / s84
v_cvt_u32_f32 v18, v18                             // s[sgprWorkGroup1] = s86 / s84
v_mul_u32_u24 v19, v18, s84                        // s[sgprWorkGroup1] = s86 / s84
v_sub_u32 v19, s86, v19                            // s[sgprWorkGroup1] = s86 / s84
v_cmpx_eq_u32 exec, v19, s84                       // s[sgprWorkGroup1] = s86 / s84
v_add_u32 v18, 1, v18                              // s[sgprWorkGroup1] = s86 / s84
v_mov_b32 v19, 0                                   // s[sgprWorkGroup0] = s86 % s84
s_mov_b64 exec, -1                                 // Reset exec
v_cmpx_gt_u32 exec, v19, s84                       // overflow happened in remainder
v_sub_u32 v18, v18, 1                              // quotient - 1
v_mul_u32_u24 v19, v18, s84                        // re-calculate remainder
v_sub_u32 v19, s86, v19                            // re-calculate remainder
s_mov_b64 exec, -1                                 // Reset exec
v_readfirstlane_b32 s[sgprWorkGroup1], v18         // quotient
v_readfirstlane_b32 s[sgprWorkGroup0], v19         // remainder (recomputed in scalar form by the next two instructions)
s_mul_i32 s[sgprWorkGroup0], s[sgprWorkGroup1], s84 // quotient * non-magic divisor
s_sub_u32 s[sgprWorkGroup0], s86, s[sgprWorkGroup0] // WorkGroup0=remainder
s_mul_i32 s83, s83, s87                            // blockId * WGM
s_add_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s83 // wg0 += blockId * WGM
s_branch label_WGM
/* ---- WGM > 1: block along WorkGroup1 ---- */
label_WGMPositive:
s_mov_b32 s87, s[sgprWGM]                          // WGM
v_cvt_f32_u32 v18, s87                             // v18 = float(WGM)
v_rcp_iflag_f32 v18, v18                           // v18 = reciprocal of WGM
v_cvt_f32_u32 v19, s[sgprWorkGroup1]               // v19 = float(wg1)
v_mul_f32 v18, v18, v19                            // v18 = wg1 * reciprocal
v_cvt_u32_f32 v18, v18                             // v18 = quotient estimate as u32
v_mul_u32_u24 v19, v18, s87                        // v19 = quotient * WGM (24-bit multiply)
v_sub_u32 v19, s[sgprWorkGroup1], v19              // v19 = remainder estimate
v_cmpx_eq_u32 exec, v19, s87                       // exec = lanes where remainder == divisor
v_add_u32 v18, 1, v18                              // quotient + 1 in those lanes
s_mov_b64 exec, -1                                 // Reset exec
v_cmpx_gt_u32 exec, v19, s87                       // overflow happened in remainder
v_sub_u32 v18, v18, 1                              // quotient - 1
s_mov_b64 exec, -1                                 // Reset exec
v_readfirstlane_b32 s83, v18                       // quotient: blockId = wg1 / WGM
s_mul_i32 s86, s83, s87                            // quotient * non-magic divisor
s_sub_u32 s86, s[sgprWorkGroup1], s86              // s86 = wg1 % WGM
s_mul_i32 s86, s86, s[sgprNumWorkGroups0]          // (wg1 % WGM)*NumWorkGroups0
s_add_u32 s86, s86, s[sgprWorkGroup0]              // wgSerial = wg0 + (wg1 % WGM)*NumWorkGroups0
v_cvt_f32_u32 v18, s87                             // v18 = float(WGM)
v_rcp_iflag_f32 v18, v18                           // v18 = reciprocal of WGM
v_cvt_f32_u32 v19, s[sgprNumWorkGroups1]           // v19 = float(NumWorkGroups1)
v_mul_f32 v18, v18, v19                            // v18 = NumWorkGroups1 * reciprocal
v_cvt_u32_f32 v18, v18                             // v18 = quotient estimate as u32
v_mul_u32_u24 v19, v18, s87                        // v19 = quotient * WGM (24-bit multiply)
v_sub_u32 v19, s[sgprNumWorkGroups1], v19          // v19 = remainder estimate
v_cmpx_eq_u32 exec, v19, s87                       // exec = lanes where remainder == divisor
v_add_u32 v18, 1, v18                              // quotient + 1 in those lanes
s_mov_b64 exec, -1                                 // Reset exec
v_cmpx_gt_u32 exec, v19, s87                       // overflow happened in remainder
v_sub_u32 v18, v18, 1                              // quotient - 1
s_mov_b64 exec, -1                                 // Reset exec
v_readfirstlane_b32 s84, v18                       // quotient: numFullBlocks = NumWorkGroups1 / WGM
s_mul_i32 s85, s87, s84                            // quotient * non-magic divisor
s_sub_u32 s85, s[sgprNumWorkGroups1], s85          // s85 = NumWorkGroups1 % WGM
s_cmp_eq_u32 s85, 0                                // remainder == 0 ?
s_cmov_b32 s85, s87                                // remainder = WGM if remainder == 0
s_cmp_ge_u32 s83, s84                              // blockId >= numFullBlocks ?
s_cselect_b32 s84, s85, s87                        // block width: leftover count for the last block, else WGM
v_cvt_f32_u32 v18, s84                             // s[sgprWorkGroup0] = s86 / s84
v_rcp_iflag_f32 v18, v18                           // s[sgprWorkGroup0] = s86 / s84
v_cvt_f32_u32 v19, s86                             // s[sgprWorkGroup0] = s86 / s84
v_mul_f32 v18, v18, v19                            // s[sgprWorkGroup0] = s86 / s84
v_cvt_u32_f32 v18, v18                             // s[sgprWorkGroup0] = s86 / s84
v_mul_u32_u24 v19, v18, s84                        // s[sgprWorkGroup0] = s86 / s84
v_sub_u32 v19, s86, v19                            // s[sgprWorkGroup0] = s86 / s84
v_cmpx_eq_u32 exec, v19, s84                       // s[sgprWorkGroup0] = s86 / s84
v_add_u32 v18, 1, v18                              // s[sgprWorkGroup0] = s86 / s84
v_mov_b32 v19, 0                                   // s[sgprWorkGroup1] = s86 % s84
s_mov_b64 exec, -1                                 // Reset exec
v_cmpx_gt_u32 exec, v19, s84                       // overflow happened in remainder
v_sub_u32 v18, v18, 1                              // quotient - 1
v_mul_u32_u24 v19, v18, s84                        // re-calculate remainder
v_sub_u32 v19, s86, v19                            // re-calculate remainder
s_mov_b64 exec, -1                                 // Reset exec
v_readfirstlane_b32 s[sgprWorkGroup0], v18         // quotient
v_readfirstlane_b32 s[sgprWorkGroup1], v19         // remainder (recomputed in scalar form by the next two instructions)
s_mul_i32 s[sgprWorkGroup1], s[sgprWorkGroup0], s84 // quotient * non-magic divisor
s_sub_u32 s[sgprWorkGroup1], s86, s[sgprWorkGroup1] // WorkGroup1=remainder
s_mul_i32 s83, s83, s87                            // blockId * WGM
s_add_u32 s[sgprWorkGroup1], s[sgprWorkGroup1], s83 // wg1 += blockId * WGM
label_WGM:

/******************************************/
/* Local Read Addresses                   */
/******************************************/

/* local read addresses: tile assignments a/b */
/* Per-thread LDS read offsets built from vgprSerial (tid). In elements:                */
/*   A: v18 = (tid%16)*4 + ((tid%64)/16)*1024 + ((tid>>6)&1)*64                         */
/*   B: v19 = (tid%16)*32*4 + ((tid%64)/16)*4 + ((tid>>7)&1)*2048                       */
/* The "final offsets" steps add the LSU term and scale by 4 (shift left by 2).         */
/* lr0I */
v_and_b32 v19, 63, v[vgprSerial]                   // 0. thread id in wave: wtid = tid % wavelength(64)
v_and_b32 v18, 15, v19                             // 1. N offset: nIdx = wtid % MI_N(16)
                                                   // 1. N offset: nOffset = nIdx * nStride(1) (multiplier is 1, do nothing)
/* Skip. 2. block offset: bnOffset = 0 when num1DBlocks = 1 */
v_lshlrev_b32 v18, 2, v18                          // 4. apply VectorWidth: bnOffset = bnOffset * vw(4)
v_lshrrev_b32 v19, 4, v19                          // 5. K offset: kIdx = wtid / (MIN(16) * MIBB(1))
v_lshl_add_u32 v18, v19, 10, v18                   // 5. K offset: lrKOffset = kIdx * mStride(1024); 6. offset in wave: lrOffset = bnOffset + lrKOffset
v_lshrrev_b32 v22, 6, v[vgprSerial]                // 7. wave id: v22 = tid / 64
v_and_b32 v22, 1, v22                              // 7. wave index used for A: v22 = (tid / 64) & 1
v_lshl_add_u32 v18, v22, 6, v18                    // 7. wave offset: wOffset = v22 * W0Stride(64); final local read offset: flrOffset = lrOffset + wOffset
/* lr1J */
v_and_b32 v20, 63, v[vgprSerial]                   // 0. thread id in wave: wtid = tid % wavelength(64)
v_and_b32 v19, 15, v20                             // 1. N offset: nIdx = wtid % MI_N(16)
v_lshlrev_b32 v19, 5, v19                          // 1. N offset: nOffset = nIdx * nStride(32)
/* Skip. 2. block offset: bnOffset = 0 when num1DBlocks = 1 */
v_lshlrev_b32 v19, 2, v19                          // 4. apply VectorWidth: bnOffset = bnOffset * vw(4)
v_lshrrev_b32 v20, 4, v20                          // 5. K offset: kIdx = wtid / (MIN(16) * MIBB(1))
v_lshl_add_u32 v19, v20, 2, v19                    // 5. K offset: lrKOffset = kIdx * mStride(4); 6. offset in wave: lrOffset = bnOffset + lrKOffset
v_lshrrev_b32 v21, 7, v[vgprSerial]                // 7. v21 = tid / dividedForWaveId(128)
v_and_b32 v21, 1, v21                              // 7. wave index used for B: v21 = (tid / 128) & 1
v_lshl_add_u32 v19, v21, 11, v19                   // 7. wave offset: wOffset = v21 * W0Stride(2048); final local read offset: flrOffset = lrOffset + wOffset

/* local read addresses: final offsets a */
v_lshrrev_b32 v20, 6, v[vgprSerial]                // 20 = Serial / 64
v_lshrrev_b32 v20, 2, v20                          // LSU offset: Get LSU wave_id (= Serial / 256)
s_mov_b32 s83, 8192                                // LSU offset: stride = lsuStride(32)*(MT0(256) + PAD0(0))
v_mul_lo_u32 v20, s83, v20                         // LSU offset: lsuoffset = wave_id*lsuStride*(MT0+PAD)
v_add_lshl_u32 v[vgprLocalReadAddrA], v20, v18, 0x2 // Final Offset: offset = (lro0+lsuoffset)*bpeDS

/* local read addresses: final offsets b */
v_lshrrev_b32 v18, 6, v[vgprSerial]                // 18 = Serial / 64
v_lshrrev_b32 v18, 2, v18                          // LSU offset: Get LSU wave_id (= Serial / 256)
s_mov_b32 s83, 32                                  // LSU offset: stride = lsuStride(32) when umlds==True
v_mul_lo_u32 v18, s83, v18                         // LSU offset: lsuoffset = wave_id*lsuStride
v_add_lshl_u32 v[vgprLocalReadAddrB], v18, v19, 0x2 // Final Offset: offset = (lro1+lsuoffset)*bpeDS
v_lshrrev_b32 v20, 10, v[vgprLocalReadAddrB]       // Final Offset: v20 = offset / 1024 (block index for padding)
v_lshl_add_u32 v[vgprLocalReadAddrB], v20, 4, v[vgprLocalReadAddrB] // Final Offset: padding 16 per block 1024

/* local read addresses: declare addresses a */
/* N/A */

/* local read addresses: declare addresses b */
v_add_co_u32 v[vgprLocalReadAddrB+0], vcc, 0x8000, v[vgprLocalReadAddrB+0] //  += LdsOffsetB (lower)
/* SwapAddr = Addr ^ (Addr + 66048): xor-ing Addr with SwapAddr toggles between the two LDS buffers */
v_add_u32 v[vgprLocalReadSwapAddrA], 66048, v[vgprLocalReadAddrA] // Calculate starting lds addr of second buffer
v_xor_b32 v[vgprLocalReadSwapAddrA], v[vgprLocalReadSwapAddrA], v[vgprLocalReadAddrA] // xor both lds buffer offsets to enable swapping
v_add_u32 v[vgprLocalReadSwapAddrB], 66048, v[vgprLocalReadAddrB] // Calculate starting lds addr of second buffer
v_xor_b32 v[vgprLocalReadSwapAddrB], v[vgprLocalReadSwapAddrB], v[vgprLocalReadAddrB] // xor both lds buffer offsets to enable swapping

/******************************************/
/* Local Write Addresses                  */
/******************************************/
/* LVCA = 64 */
/* v19 = A-unroll = serial/LVCA */
/* Splits vgprSerial into (tile, unroll) coordinates for A and B, then derives the      */
/* first-lane LDS write address of each matrix into an SGPR, plus its xor swap mask.    */
v_lshrrev_b32 v19, 6, v[vgprSerial]                // 19 = Serial / 64
v_and_b32 v18, 63, v[vgprSerial]                   // 18 = Serial % 64
/* tile *= glvw */
v_lshlrev_b32 v18, 2, v18                          // v18 = v18 * 4
v_mov_b32 v22, v19                                 // copy for GlobalSplitU
/* LVCB = 8 */
/* v21 = B-unroll = serial%LVCB */
v_lshrrev_b32 v20, 3, v[vgprSerial]                // 20 = Serial / 8
v_and_b32 v21, 7, v[vgprSerial]                    // 21 = Serial % 8
/* unroll *= glvw */
v_lshlrev_b32 v21, 2, v21                          // v21 = v21 * 4
v_mov_b32 v23, v21                                 // copy for GlobalSplitU
/* lwaUnrollAssignmentA = v22 */
/* lwaUnrollAssignmentB = v23 */

/* local write addresses: first offset a */
v_mul_u32_u24 v24, 0x100, v22                      // lwAL*(MTA + PAD): unroll index * 256
v_add_lshl_u32 v24, v18, v24, 0x2                  // lwFOA = (lwAA + lwAL*(MT0I+PAD))*bpeDS
s_nop 0                                            // 1 wait states required before reading vgpr by lane
v_readfirstlane_b32 s[sgprLocalWriteAddrA], v24    // Copy lds write address VGPR to SGPR (first active lane's value)
s_nop 0                                            // 1 wait states
s_add_u32 s[sgprSwapA], s[sgprLocalWriteAddrA], 66048 // Calculate starting lds addr of second buffer
s_xor_b32 s[sgprSwapA], s[sgprSwapA], s[sgprLocalWriteAddrA] // xor both lds buffer offsets to enable swapping

/* local write addresses: first offset b */
v_mul_u32_u24 v24, 0x20, v20                       // lwBL*(DepthU_Compute + PAD): tile index * 32
v_add_lshl_u32 v24, v23, v24, 0x2                  // lwFOB = (lwBB + lwBL*(DepthU+PAD))*bpeDS
v_lshrrev_b32 v26, 10, v24                         // v26 = offset / 1024 (block index for padding)
v_lshl_add_u32 v24, v26, 4, v24                    // padding 16 per block 1024
v_add_co_u32 v24, vcc, 0x8000, v24                 // lwFOB += LDS_OFFSET_B=32768
s_nop 0                                            // 1 wait states required before reading vgpr by lane
v_readfirstlane_b32 s[sgprLocalWriteAddrB], v24    // Copy lds write address VGPR to SGPR (first active lane's value)
s_nop 0                                            // 1 wait states
s_add_u32 s[sgprSwapB], s[sgprLocalWriteAddrB], 66048 // Calculate starting lds addr of second buffer
s_xor_b32 s[sgprSwapB], s[sgprSwapB], s[sgprLocalWriteAddrB] // xor both lds buffer offsets to enable swapping

/* global read addresses: tile offset assignment a */
/* graTileAssignmentA = v18 */

/* global read addresses: tile offset assignment b */
/* graTileAssignmentB = v20 */

/* global read addresses: unroll assignment a */
/* v19 */

/* global read addresses: unroll assignment b */
/* v21 */

/* global read addresses: other free assignments */
/* s[sgprWorkGroup2] */

/* global read addresses: tile offsets a */
/* A uses a single tile offset (v24) */
v_mov_b32 v24, v18                                 // groA0I_0

/* global read addresses: tile offsets b */
/* B uses 8 tile offsets v25..v32, each 32 (LSPB) beyond the previous; the vcc carry-out is unused */
v_mov_b32 v25, v20                                 // groB1J_0
v_add_co_u32 v26, vcc, 32, v25                     // groB1J_1 += LSPB
v_add_co_u32 v27, vcc, 32, v26                     // groB1J_2 += LSPB
v_add_co_u32 v28, vcc, 32, v27                     // groB1J_3 += LSPB
v_add_co_u32 v29, vcc, 32, v28                     // groB1J_4 += LSPB
v_add_co_u32 v30, vcc, 32, v29                     // groB1J_5 += LSPB
v_add_co_u32 v31, vcc, 32, v30                     // groB1J_6 += LSPB
v_add_co_u32 v32, vcc, 32, v31                     // groB1J_7 += LSPB

/* global read addresses: unroll offsets a */
/* A uses 8 unroll offsets v33..v40, each 4 (LSPA) beyond the previous */
v_mov_b32 v33, v19                                 // groAL_0
v_add_co_u32 v34, vcc, 4, v33                      // groAL_1 + LSPA
v_add_co_u32 v35, vcc, 4, v34                      // groAL_2 + LSPA
v_add_co_u32 v36, vcc, 4, v35                      // groAL_3 + LSPA
v_add_co_u32 v37, vcc, 4, v36                      // groAL_4 + LSPA
v_add_co_u32 v38, vcc, 4, v37                      // groAL_5 + LSPA
v_add_co_u32 v39, vcc, 4, v38                      // groAL_6 + LSPA
v_add_co_u32 v40, vcc, 4, v39                      // groAL_7 + LSPA

/* global read addresses: unroll offsets b */
/* B uses a single unroll offset (v41) */
v_mov_b32 v41, v21                                 // groBL_0

/* global read addresses: shift a */
/* Clamp the A tile offset (v24) so it never exceeds                         */
/*   edge = SizeI - WorkGroup0*256 - 4                                       */
/* The scalar subtractions are unsigned, but the clamp itself is a signed    */
/* minimum (v_min_i32).                                                      */
s_mul_i32 s83, s[sgprWorkGroup0], 256              // WorkGroup[01] * MT
s_sub_u32 s83, s[sgprSizeI], s83                   // edge = Size0I - WG*MT
s_sub_u32 s83, s83, 4                              // edge -= margin(4)
v_mov_b32 v42, s83                                 // edge vgpr = Size0I- WG*MT - margin(4)
v_min_i32 v24, v42, v24                            // offset = (offset < edge) ? offset(v24) : edge(v42)

/* global read addresses: final offsets a */
/* One GLOBAL_OFFSET_A invocation per A load: every call passes the single   */
/* tile-offset register v24 and one of the eight unroll-offset registers     */
/* v33..v40, writing vgprGlobalReadOffsetA+0..7.                             */
/* NOTE(review): the macro is defined outside this section; the last operand */
/* (v42) is presumably a scratch register -- confirm against the macro body. */
GLOBAL_OFFSET_A vgprGlobalReadOffsetA+0, 24, 33, 42 // gROA_0_0_0_0
GLOBAL_OFFSET_A vgprGlobalReadOffsetA+1, 24, 34, 42 // gROA_0_0_1_0
GLOBAL_OFFSET_A vgprGlobalReadOffsetA+2, 24, 35, 42 // gROA_0_0_2_0
GLOBAL_OFFSET_A vgprGlobalReadOffsetA+3, 24, 36, 42 // gROA_0_0_3_0
GLOBAL_OFFSET_A vgprGlobalReadOffsetA+4, 24, 37, 42 // gROA_0_0_4_0
GLOBAL_OFFSET_A vgprGlobalReadOffsetA+5, 24, 38, 42 // gROA_0_0_5_0
GLOBAL_OFFSET_A vgprGlobalReadOffsetA+6, 24, 39, 42 // gROA_0_0_6_0
GLOBAL_OFFSET_A vgprGlobalReadOffsetA+7, 24, 40, 42 // gROA_0_0_7_0

/* global read addresses: final offsets b */
/* One GLOBAL_OFFSET_B invocation per B load: every call passes the single   */
/* unroll-offset register v41 and one of the eight tile-offset registers     */
/* v25..v32, writing vgprGlobalReadOffsetB+0..7.                             */
/* NOTE(review): the last operand here is v33, which held groAL_0 for the A  */
/* offsets above. If it is a scratch operand (as it appears to be), v33 is   */
/* clobbered from this point on -- confirm nothing later expects groAL_0.    */
GLOBAL_OFFSET_B vgprGlobalReadOffsetB+0, 41, 25, 33 // gROB_0_0_0_0
GLOBAL_OFFSET_B vgprGlobalReadOffsetB+1, 41, 26, 33 // gROB_0_0_1_0
GLOBAL_OFFSET_B vgprGlobalReadOffsetB+2, 41, 27, 33 // gROB_0_0_2_0
GLOBAL_OFFSET_B vgprGlobalReadOffsetB+3, 41, 28, 33 // gROB_0_0_3_0
GLOBAL_OFFSET_B vgprGlobalReadOffsetB+4, 41, 29, 33 // gROB_0_0_4_0
GLOBAL_OFFSET_B vgprGlobalReadOffsetB+5, 41, 30, 33 // gROB_0_0_5_0
GLOBAL_OFFSET_B vgprGlobalReadOffsetB+6, 41, 31, 33 // gROB_0_0_6_0
GLOBAL_OFFSET_B vgprGlobalReadOffsetB+7, 41, 32, 33 // gROB_0_0_7_0

/* global read addresses: addresses a */
/* Builds the 4-dword buffer resource descriptor SrdA and its 64-bit shadow  */
/* limit. Scratch registers: s84/s85 (64-bit products), s[86:87] (tileStart).*/
/*                                                                           */
/*   tileStart (elements) = WorkGroup0*256                                   */
/*                        + StreamKLocalStart*DepthU*StrideAL                */
/*   ShadowLimitA (bytes) = (1 + (SizeI-1)*constStrideA0I                    */
/*                             + (SizeL-1)*StrideAL - tileStart) * 4 + 16    */
/*   SrdA[2]              = low dword of ShadowLimitA if its high dword is 0,*/
/*                          otherwise BufferLimit                            */
/*   SrdA[0:1]            = AddressA + (tileStart + StrideAK*WorkGroup2) * 4 */
/*   SrdA[3]              = Srd127_96                                        */
/* max read offset = size[n] * stride[n-1] */
s_mul_hi_u32 s87, s[sgprWorkGroup0], 256           // WorkGroup[01] * MT
s_mul_i32 s86, s[sgprWorkGroup0], 256              // WorkGroup[01] * MT
s_mul_i32 s84, s[sgprStreamKLocalStart], DepthU    // StreamK tile start offset
s_mul_hi_u32 s85, s84, s[sgprStrideAL]             // StreamK tile start offset
s_mul_i32 s84, s84, s[sgprStrideAL]                // StreamK tile start offset
s_add_u32 s86, s86, s84                            // accum GsuOffset term to tilestart
s_addc_u32 s87, s87, s85                           // accum GsuOffset term to tilestart
s_mov_b64 s[sgprShadowLimitA+0:sgprShadowLimitA+0+1], 1 // Init tensor size
s_sub_u32 s84, s[sgprSizeI], 1                     // (size-1)
s_mul_hi_u32 s85, constStrideA0I, s84              // stride x (size-1)
s_mul_i32 s84, constStrideA0I, s84                 // stride x (size-1)
s_add_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s84 // sum tensor size
s_addc_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s85 // sum tensor size
s_sub_u32 s84, s[sgprSizeL], 1                     // (size-1)
s_mul_hi_u32 s85, s[sgprStrideAL], s84             // stride x (size-1)
s_mul_i32 s84, s[sgprStrideAL], s84                // stride x (size-1)
s_add_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s84 // sum tensor size
s_addc_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s85 // sum tensor size
s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s86 // sub tileStart
s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s87 // sub tileStart
s_lshl_b64 s[sgprShadowLimitA:sgprShadowLimitA+1], s[sgprShadowLimitA:sgprShadowLimitA+1], 0x2 // Set limit to use bytes
s_add_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], 16 // extend limit for pre-pad
s_addc_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], 0 // extend limit for pre-pad
s_cmp_eq_u32 s[sgprShadowLimitA+1], 0              // are we within 2^32?
s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32
// The batch term below is added to tileStart only after the limit was computed,
// so it moves the SRD base without shrinking the shadow limit.
s_mul_hi_u32 s85, s[sgprStrideAK], s[sgprWorkGroup2] // Stride*WG
s_mul_i32 s84, s[sgprStrideAK], s[sgprWorkGroup2]  // Stride*WG
s_add_u32 s86, s86, s84                            // accum wg term to tilestart
s_addc_u32 s87, s87, s85                           // accum wg term to tilestart
s_lshl_b64 s[86:87], s[86:87], 2                   // tileStart *= BPE
s_add_u32 s[sgprSrdA+0], s[sgprAddressA+0], s86    // SRD base = Address+ tileStart0
s_addc_u32 s[sgprSrdA+1], s[sgprAddressA+1], s87   // SRD base = Address+ tileStart1
s_mov_b32 s[sgprSrdA+3], Srd127_96                 // Set bits 127_96 in SRD

/* global read addresses: addresses b */
/* Builds the 4-dword buffer resource descriptor SrdB and its 64-bit shadow  */
/* limit. Scratch registers: s84/s85 (64-bit products), s[86:87] (tileStart).*/
/*                                                                           */
/*   tileStart (elements) = (WorkGroup1*256)*StrideB1J                       */
/*                        + StreamKLocalStart*DepthU*constStrideBL           */
/*   ShadowLimitB (bytes) = (1 + (SizeL-1)*constStrideBL                     */
/*                             + (SizeJ-1)*StrideB1J - tileStart) * 4 + 16   */
/*   SrdB[2]              = low dword of ShadowLimitB if its high dword is 0,*/
/*                          otherwise BufferLimit                            */
/*   SrdB[0:1]            = AddressB + (tileStart + StrideBK*WorkGroup2) * 4 */
/*   SrdB[3]              = Srd127_96                                        */
/* max read offset = size[n] * stride[n-1] */
// NOTE: the s87 written by the next instruction is never read; it is
// overwritten two instructions later by the stride-scaled high product, which
// uses only the low 32 bits (s86) of WorkGroup1*256.
s_mul_hi_u32 s87, s[sgprWorkGroup1], 256           // WorkGroup[01] * MT
s_mul_i32 s86, s[sgprWorkGroup1], 256              // WorkGroup[01] * MT
s_mul_hi_u32 s87, s86, s[sgprStrideB1J]            // tlu=0, scaled tile-offset by stride
s_mul_i32 s86, s86, s[sgprStrideB1J]               // tlu=0, scaled tile-offset by stride
s_mul_i32 s84, s[sgprStreamKLocalStart], DepthU    // StreamK tile start offset
s_mul_hi_u32 s85, s84, constStrideBL               // StreamK tile start offset
s_mul_i32 s84, s84, constStrideBL                  // StreamK tile start offset
s_add_u32 s86, s86, s84                            // accum GsuOffset term to tilestart
s_addc_u32 s87, s87, s85                           // accum GsuOffset term to tilestart
s_mov_b64 s[sgprShadowLimitB+0:sgprShadowLimitB+0+1], 1 // Init tensor size
s_sub_u32 s84, s[sgprSizeL], 1                     // (size-1)
s_mul_hi_u32 s85, constStrideBL, s84               // stride x (size-1)
s_mul_i32 s84, constStrideBL, s84                  // stride x (size-1)
s_add_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s84 // sum tensor size
s_addc_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s85 // sum tensor size
s_sub_u32 s84, s[sgprSizeJ], 1                     // (size-1)
s_mul_hi_u32 s85, s[sgprStrideB1J], s84            // stride x (size-1)
s_mul_i32 s84, s[sgprStrideB1J], s84               // stride x (size-1)
s_add_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s84 // sum tensor size
s_addc_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s85 // sum tensor size
s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s86 // sub tileStart
s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s87 // sub tileStart
s_lshl_b64 s[sgprShadowLimitB:sgprShadowLimitB+1], s[sgprShadowLimitB:sgprShadowLimitB+1], 0x2 // Set limit to use bytes
s_add_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], 16 // extend limit for pre-pad
s_addc_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], 0 // extend limit for pre-pad
s_cmp_eq_u32 s[sgprShadowLimitB+1], 0              // are we within 2^32?
s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32
// The batch term below is added to tileStart only after the limit was computed,
// so it moves the SRD base without shrinking the shadow limit.
s_mul_hi_u32 s85, s[sgprStrideBK], s[sgprWorkGroup2] // Stride*WG
s_mul_i32 s84, s[sgprStrideBK], s[sgprWorkGroup2]  // Stride*WG
s_add_u32 s86, s86, s84                            // accum wg term to tilestart
s_addc_u32 s87, s87, s85                           // accum wg term to tilestart
s_lshl_b64 s[86:87], s[86:87], 2                   // tileStart *= BPE
s_add_u32 s[sgprSrdB+0], s[sgprAddressB+0], s86    // SRD base = Address+ tileStart0
s_addc_u32 s[sgprSrdB+1], s[sgprAddressB+1], s87   // SRD base = Address+ tileStart1
s_mov_b32 s[sgprSrdB+3], Srd127_96                 // Set bits 127_96 in SRD

/* global read addresses: increments a */
/* Per-iteration byte step for A along the unroll index: depends on the      */
/* runtime stride StrideAL, so it is computed with a multiply.               */
s_mul_i32 s[sgprGlobalReadIncsA+0], DepthU*BpeAGR, s[sgprStrideAL] // incrA (unrollIdx)

/* global read addresses: increments b */
/* Per-iteration byte step for B along the unroll index: an assemble-time    */
/* constant, DepthU*BpeBGR.                                                  */
s_mov_b32 s[sgprGlobalReadIncsB+0], DepthU*BpeBGR  // incrB (unrollIdx)
/* declare loop num iterations */
/* LoopCounterL = number of full unroll iterations this workgroup executes:  */
/*   (StreamKLocalEnd - StreamKLocalStart)                                   */
/*   minus 1 when the K extent is not a multiple of 32 AND this workgroup    */
/*   processes the final iteration of the tile (StreamKLocalEnd ==           */
/*   ItersPerTile), because that iteration is the tail loop.                 */
/* When Alpha == 0.0 the counter is forced to 0 so the unroll loop is        */
/* skipped.                                                                  */
/*                                                                           */
/* Ordering matters: the Alpha == 0 override must come AFTER the tail-loop   */
/* adjustment. If the counter were zeroed first, the adjustment would        */
/* compute 0 - 1 = 0xFFFFFFFF (unsigned wrap) and every later                */
/* "LoopCounterL == 0" check would fail.                                     */
/* Clobbers: s84, s85, vcc, scc.                                             */
s_sub_u32 s[sgprLoopCounterL], s[sgprStreamKLocalEnd], s[sgprStreamKLocalStart] // StreamK loop counter = localEnd - localStart
s_and_b32 s85, 31, s[sgprSizesSum+0]               // s85 = s[sgprSizesSum+0] % 32
s_cmp_eq_u32 s85, 0                                // numIterL == 0
s_cselect_b32 s84, 0, 1                            // check if size uses tail loop
s_cmp_eq_u32 s[sgprStreamKLocalEnd], s[sgprItersPerTile] // Check if WG processes final iteration of tile
s_cselect_b32 s84, s84, 0                          // this WG runs tail loop
s_sub_u32 s[sgprLoopCounterL], s[sgprLoopCounterL], s84 // Adjust loop counter for tail loop
v_cmp_eq_f32 vcc, s[sgprAlpha], 0.0                // s[Alpha] == 0.0f ?
s_cbranch_vccz label_SKAlphaCheck2                 // branch if s[Alpha] != 0
s_mov_b32 s[sgprLoopCounterL], 0                   // Skip iterations (applied last so it cannot underflow)
label_SKAlphaCheck2:
s_mov_b32 s[sgprOrigLoopCounter], s[sgprLoopCounterL] // copy loop counter
/* StaggerU setup.                                                           */
/* s[sgprStaggerU] arrives as a packed value and is split into three fields: */
/*   bits  7:0  -> kept in s[sgprStaggerU] (stagger count)                   */
/*   bits 12:8  -> s86, shifted down by 8  (shift amount, "StaggerUStride")  */
/*   bits 15:13 -> s87, left in place      (mapping selector, 0x0000..0x8000)*/
/* The result, s[sgprStaggerUIter], is the stagger start for this tile.      */
/* Scratch registers: s84, s85, s86, s87.                                    */
s_and_b32 s86, s[sgprStaggerU], 0x1f00
s_lshr_b32 s86, s86, 0x8
s_and_b32 s87, s[sgprStaggerU], 0xe000
s_and_b32 s[sgprStaggerU], s[sgprStaggerU], 0xff
s_mov_b32 s84, s[sgprStaggerU]                     // init staggerU
// Halve the candidate stagger (s84) until (s84 << s86) <= OrigLoopCounter.
// The loop terminates because s84 eventually reaches 0.
label_beginStaggerUIter:
s_lshl_b32 s85, s84, s86                           // shift by StaggerUStride
s_cmp_ge_u32 s[sgprOrigLoopCounter], s85           // loopCount >= current shift Count
s_cbranch_scc1 label_endStaggerUIter               // jump to end
s_lshr_b32 s84, s84, 1                             // step down to smaller stagger
s_branch label_beginStaggerUIter                   // jump to begin
label_endStaggerUIter:
// Mask = s84 - 1 when s84 >= 1, otherwise 0.
s_sub_u32 s85, s84, 1                              // staggerU mask
s_cmp_ge_u32 s84, 1                                // if current staggerU >= 1
s_cselect_b32 s[sgprStaggerUIter], s85, 0          // set Mask
// Select the value (s84) that is ANDed with the mask, based on s87.
// NOTE(review): every test below branches to the NEXT mapping label when the
// comparison is EQUAL (scc1). As written: s87 == 0 jumps to Mapping_1, fails
// the 0x2000 test and selects WorkGroup1; any s87 != 0 selects WorkGroup0;
// label_StaggerUMapping_2 and later are only reachable if s87 equals both 0
// and 0x2000, i.e. never. If an if / else-if chain on the selector was
// intended, the branches would need to be s_cbranch_scc0 -- confirm against
// the generator before changing.
s_cmp_eq_u32 s87, 0x0
s_cbranch_scc1 label_StaggerUMapping_1
s_mov_b32 s84, s[sgprWorkGroup0]
s_branch label_staggerInputEnd
label_StaggerUMapping_1:
s_cmp_eq_u32 s87, 0x2000
s_cbranch_scc1 label_StaggerUMapping_2
s_mov_b32 s84, s[sgprWorkGroup1]
s_branch label_staggerInputEnd
label_StaggerUMapping_2:
s_cmp_eq_u32 s87, 0x4000
s_cbranch_scc1 label_StaggerUMapping_3
s_mov_b32 s84, -0x1
s_branch label_staggerInputEnd
label_StaggerUMapping_3:
s_cmp_eq_u32 s87, 0x6000
s_cbranch_scc1 label_StaggerUMapping_4
// This path accumulates into s84 without re-initialising it, so the result
// includes whatever s84 held when the halving loop above finished.
s_mul_i32 s85, s[sgprNumWorkGroups0], s[sgprWorkGroup1]
s_add_u32 s84, s84, s85
s_add_u32 s84, s84, s[sgprWorkGroup0]
s_branch label_staggerInputEnd
label_StaggerUMapping_4:
s_cmp_eq_u32 s87, 0x8000
s_cbranch_scc1 label_staggerInputEnd
s_mov_b32 s84, -0x1
s_branch label_staggerInputEnd
label_staggerInputEnd:
s_and_b32 s[sgprStaggerUIter], s[sgprStaggerUIter], s84 // Compute actual stagger start for this tile
s_lshl_b32 s[sgprStaggerUIter], s[sgprStaggerUIter], s86 // shift by StaggerUStride
// Staggering is disabled (forced to 0) unless this workgroup covers the whole
// tile, i.e. StreamKLocalStart == 0 and StreamKLocalEnd >= ItersPerTile.
s_cmp_gt_u32 s[sgprStreamKLocalStart], 0           // does wg start tile?
s_cmov_b32 s[sgprStaggerUIter], 0                  // set stagger=0 for partial tiles
s_cmp_lt_u32 s[sgprStreamKLocalEnd], s[sgprItersPerTile] // does wg finish tile?
s_cmov_b32 s[sgprStaggerUIter], 0                  // set stagger=0 for partial tiles

/* SRDs += (StaggerUIter) * GlobalReadIncsA+0 */
/* Advances the A SRD base by the stagger byte offset                        */
/*   s[84:85] = StaggerUIter * GlobalReadIncsA (signed 64-bit product)       */
/* and shrinks ShadowLimitA by the same amount, then refreshes SrdA[2].      */
/* It also precomputes the 64-bit wrap increment                             */
/*   WrapUA = GlobalReadIncsA - LoopCounterL * GlobalReadIncsA               */
s_mul_hi_i32 s85, s[sgprStaggerUIter], s[sgprGlobalReadIncsA+0] //  stagger byte offset
s_mul_i32 s84, s[sgprStaggerUIter], s[sgprGlobalReadIncsA+0] //  stagger byte offset
s_mul_hi_i32 s[sgprWrapUA+1], s[sgprLoopCounterL], s[sgprGlobalReadIncsA+0] // Number of bytes accessed by the unroll loop
s_mul_i32 s[sgprWrapUA+0], s[sgprLoopCounterL], s[sgprGlobalReadIncsA+0] // Number of bytes accessed by the unroll loop
s_sub_u32 s[sgprWrapUA+0], s[sgprGlobalReadIncsA+0], s[sgprWrapUA+0] // remove one iteration
s_subb_u32 s[sgprWrapUA+1], 0, s[sgprWrapUA+1]     // remove one iteration
s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s84        // gra SRD += inc(lower)
s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s85       // gra SRD += inc(upper)
s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s84 // limit -= inc)
s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s85 // limit -= inc)
s_cmp_eq_u32 s[sgprShadowLimitA+1], 0              // are we within 2^32?
s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32

/* SRDs += (StaggerUIter) * GlobalReadIncsB+0 */
/* Same sequence for B: SRD base += StaggerUIter * GlobalReadIncsB,          */
/* ShadowLimitB -= the same amount, SrdB[2] refreshed, and                   */
/*   WrapUB = GlobalReadIncsB - LoopCounterL * GlobalReadIncsB               */
s_mul_hi_i32 s85, s[sgprStaggerUIter], s[sgprGlobalReadIncsB+0] //  stagger byte offset
s_mul_i32 s84, s[sgprStaggerUIter], s[sgprGlobalReadIncsB+0] //  stagger byte offset
s_mul_hi_i32 s[sgprWrapUB+1], s[sgprLoopCounterL], s[sgprGlobalReadIncsB+0] // Number of bytes accessed by the unroll loop
s_mul_i32 s[sgprWrapUB+0], s[sgprLoopCounterL], s[sgprGlobalReadIncsB+0] // Number of bytes accessed by the unroll loop
s_sub_u32 s[sgprWrapUB+0], s[sgprGlobalReadIncsB+0], s[sgprWrapUB+0] // remove one iteration
s_subb_u32 s[sgprWrapUB+1], 0, s[sgprWrapUB+1]     // remove one iteration
s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s84        // gra SRD += inc(lower)
s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s85       // gra SRD += inc(upper)
s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s84 // limit -= inc)
s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s85 // limit -= inc)
s_cmp_eq_u32 s[sgprShadowLimitB+1], 0              // are we within 2^32?
s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32
// NOTE(review): the generator comment on the next line says "Subtract
// (PGR-1)", but the instruction ADDS 2 to StaggerUIter. The comment is kept
// verbatim; confirm which is intended.
s_add_u32 s[sgprStaggerUIter], s[sgprStaggerUIter], 2 // Subtract (PGR-1); StaggerUIter now contains target iteration to wrap
/* local read addresses: init pointers a */

/* localReadInitPointers */
/* local read addresses: init pointers b */

/* localReadInitPointers */

/* prefetch: global -> local */
/* First prefetch. Skipped entirely (jump to label_ShadowInitStart) when     */
/* LoopCounterL == 0. Otherwise issues 8 A loads and 8 B loads with the      */
/* "lds" modifier; m0 carries the LDS write address and is advanced by hand  */
/* between loads: 4096 bytes per A load, 4160 bytes per B load.              */
s_cmp_eq_u32 s[sgprLoopCounterL], 0                // at last iteration?
s_cbranch_scc1 label_ShadowInitStart               // skip to ShadowInitStart iter b/c numIter==0
s_mov_b32 m0, s[sgprLocalWriteAddrA]               // m0 <- LDS write address
/* before DirectToLds load, ensure prior ds_reads have finished */
s_waitcnt lgkmcnt(0)
s_barrier
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0, lds // G -> Reg 0_0_0_0
s_add_u32 m0, m0, 4096                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+1], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0, lds // G -> Reg 0_0_1_0
s_add_u32 m0, m0, 4096                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+2], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0, lds // G -> Reg 0_0_2_0
s_add_u32 m0, m0, 4096                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+3], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0, lds // G -> Reg 0_0_3_0
s_add_u32 m0, m0, 4096                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+4], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0, lds // G -> Reg 0_0_4_0
s_add_u32 m0, m0, 4096                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+5], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0, lds // G -> Reg 0_0_5_0
s_add_u32 m0, m0, 4096                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+6], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0, lds // G -> Reg 0_0_6_0
s_add_u32 m0, m0, 4096                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+7], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0, lds // G -> Reg 0_0_7_0
// The restore below is immediately overwritten by the B write address on the
// following line, so only the restore after the B loads takes effect.
s_mov_b32 m0, 0x20400                              // Restore LDS clamp at 132096 bytes
s_mov_b32 m0, s[sgprLocalWriteAddrB]               // m0 <- LDS write address
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], 0 offen offset:0, lds // G -> Reg 0_0_0_0
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+1], s[sgprSrdB:sgprSrdB+3], 0 offen offset:0, lds // G -> Reg 0_0_1_0
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+2], s[sgprSrdB:sgprSrdB+3], 0 offen offset:0, lds // G -> Reg 0_0_2_0
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+3], s[sgprSrdB:sgprSrdB+3], 0 offen offset:0, lds // G -> Reg 0_0_3_0
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+4], s[sgprSrdB:sgprSrdB+3], 0 offen offset:0, lds // G -> Reg 0_0_4_0
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+5], s[sgprSrdB:sgprSrdB+3], 0 offen offset:0, lds // G -> Reg 0_0_5_0
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+6], s[sgprSrdB:sgprSrdB+3], 0 offen offset:0, lds // G -> Reg 0_0_6_0
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+7], s[sgprSrdB:sgprSrdB+3], 0 offen offset:0, lds // G -> Reg 0_0_7_0
s_mov_b32 m0, 0x20400                              // Restore LDS clamp at 132096 bytes

/* global read inc A loopL */
/* Advance the A SRD past the data just prefetched. The 64-bit increment     */
/* s[84:85] is WrapUA when StaggerUIter == LoopCounterL + 1 (the wrap        */
/* iteration), otherwise GlobalReadIncsA with a zero upper dword. The SRD    */
/* base grows by the increment, ShadowLimitA shrinks by it, and SrdA[2] is   */
/* refreshed from the shadow limit. Scratch registers: s84, s85, s86.        */
s_add_u32 s86, s[sgprLoopCounterL], 1              // remove pf(1)
s_cmp_eq_u32 s[sgprStaggerUIter], s86              // Is this wrapIter? (pf)
s_cselect_b32 s84, s[sgprWrapUA+0], s[sgprGlobalReadIncsA+0] // incLower <- ?
s_cselect_b32 s85, s[sgprWrapUA+1], 0              // incUpper <- ?
s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s84        // gra SRD += inc(lower)
s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s85       // gra SRD += inc(upper)
s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s84 // limit -= inc)
s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s85 // limit -= inc)
s_cmp_eq_u32 s[sgprShadowLimitA+1], 0              // are we within 2^32?
s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32

/* global read inc B loopL */
/* Same sequence for B, using WrapUB / GlobalReadIncsB, SrdB and             */
/* ShadowLimitB. s86 is recomputed rather than reused from the A sequence.   */
s_add_u32 s86, s[sgprLoopCounterL], 1              // remove pf(1)
s_cmp_eq_u32 s[sgprStaggerUIter], s86              // Is this wrapIter? (pf)
s_cselect_b32 s84, s[sgprWrapUB+0], s[sgprGlobalReadIncsB+0] // incLower <- ?
s_cselect_b32 s85, s[sgprWrapUB+1], 0              // incUpper <- ?
s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s84        // gra SRD += inc(lower)
s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s85       // gra SRD += inc(upper)
s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s84 // limit -= inc)
s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s85 // limit -= inc)
s_cmp_eq_u32 s[sgprShadowLimitB+1], 0              // are we within 2^32?
s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32

/******************************************/
/* End setupNewTile                       */
/******************************************/
/* Build the post-loop buffer descriptors SrdC and SrdD.                     */
/*   Srd[2] = BufferOOB, Srd[3] = Srd127_96                                  */
/*   SrdC[0:1] = AddressC + (MT1*WorkGroup1*StrideC1J                        */
/*                           + WorkGroup2*StrideCK) * 4                      */
/*   SrdD[0:1] = AddressD + (MT1*WorkGroup1*StrideD1J                        */
/*                           + WorkGroup2*StrideDK) * 4                      */
/* This label is also the target of the numIter == 0 skip in the prefetch    */
/* code. Scratch registers: s84, s85, s86.                                   */
label_ShadowInitStart:
// The base addresses copied here are recomputed from Address{C,D} further
// down, so only the Srd[2] and Srd[3] writes of this group survive.
s_mov_b64 s[sgprSrdD+0:sgprSrdD+0+1], s[sgprAddressD+0:sgprAddressD+0+1] // init SRD base address
s_mov_b32 s[sgprSrdD+2], BufferOOB
s_mov_b32 s[sgprSrdD+3], Srd127_96                 // Set bits 127_96 in post-loop SRD

s_mov_b64 s[sgprSrdC+0:sgprSrdC+0+1], s[sgprAddressC+0:sgprAddressC+0+1] // init SRD base address
s_mov_b32 s[sgprSrdC+2], BufferOOB
s_mov_b32 s[sgprSrdC+3], Srd127_96                 // Set bits 127_96 in post-loop SRD


// Row term: s86 = MT1 * WorkGroup1 (32-bit), scaled by the C and D strides
// into 64-bit element offsets and then shifted left by 2 to get bytes.
s_mul_i32 s86, MT1, s[sgprWorkGroup1]              // <- wg1*MT1
s_mul_hi_u32 s85, s86, s[sgprStrideC1J]            // ScaleC s86 by Stride
s_mul_i32 s84, s86, s[sgprStrideC1J]               // ScaleC s86 by Stride
s_lshl_b64 s[84:85], s[84:85], 2                   // scale by bpe
s_add_u32 s[sgprSrdC+0], s[sgprAddressC+0], s84    // add lo to SRD
s_addc_u32 s[sgprSrdC+1], s[sgprAddressC+1], s85   // add hi to SRD
s_mul_hi_u32 s85, s86, s[sgprStrideD1J]            // ScaleD s86 by Stride
s_mul_i32 s84, s86, s[sgprStrideD1J]               // ScaleD s86 by Stride
s_lshl_b64 s[84:85], s[84:85], 2                   // scale by bpe
s_add_u32 s[sgprSrdD+0], s[sgprAddressD+0], s84    // add lo to SRD
s_addc_u32 s[sgprSrdD+1], s[sgprAddressD+1], s85   // add hi to SRD

// WorkGroup2 term: added on top of the SRD bases computed just above.
s_mul_hi_u32 s85, s[sgprWorkGroup2], s[sgprStrideCK] // ScaleC s[sgprWorkGroup2] by Stride
s_mul_i32 s84, s[sgprWorkGroup2], s[sgprStrideCK]  // ScaleC s[sgprWorkGroup2] by Stride
s_lshl_b64 s[84:85], s[84:85], 2                   // scale by bpe
s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s84        // add lo to SRD
s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], s85       // add hi to SRD
s_mul_hi_u32 s85, s[sgprWorkGroup2], s[sgprStrideDK] // ScaleD s[sgprWorkGroup2] by Stride
s_mul_i32 s84, s[sgprWorkGroup2], s[sgprStrideDK]  // ScaleD s[sgprWorkGroup2] by Stride
s_lshl_b64 s[84:85], s[84:85], 2                   // scale by bpe
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s84        // add lo to SRD
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], s85       // add hi to SRD


/* initC: remove ValuC vgpr buffer [0...0) from pool */

/* initC: remove acc vgpr buffer [0...256) from pool */

/* initC: remove ValuA/B vgpr buffer [18...146) from pool */
// Init C
// Zero all 256 accumulator registers. acc0..acc15 are written with 0
// directly; the remaining fifteen 16-register groups (acc16..acc255) are each
// produced by one MFMA whose A and B operands are v[246:247] = 0 and whose
// accumulator input is acc[0:15], so each result is 0*0 + 0.
v_mov_b64 v[246:247], 0
v_accvgpr_write acc0, 0                            // initC
v_accvgpr_write acc1, 0                            // initC
v_accvgpr_write acc2, 0                            // initC
v_accvgpr_write acc3, 0                            // initC
v_accvgpr_write acc4, 0                            // initC
v_accvgpr_write acc5, 0                            // initC
v_accvgpr_write acc6, 0                            // initC
v_accvgpr_write acc7, 0                            // initC
v_accvgpr_write acc8, 0                            // initC
v_accvgpr_write acc9, 0                            // initC
v_accvgpr_write acc10, 0                           // initC
v_accvgpr_write acc11, 0                           // initC
v_accvgpr_write acc12, 0                           // initC
v_accvgpr_write acc13, 0                           // initC
v_accvgpr_write acc14, 0                           // initC
v_accvgpr_write acc15, 0                           // initC
v_mfma_i32_32x32x16_i8 acc[16:31], v[246:247], v[246:247], acc[0:15]
v_mfma_i32_32x32x16_i8 acc[32:47], v[246:247], v[246:247], acc[0:15]
v_mfma_i32_32x32x16_i8 acc[48:63], v[246:247], v[246:247], acc[0:15]
v_mfma_i32_32x32x16_i8 acc[64:79], v[246:247], v[246:247], acc[0:15]
v_mfma_i32_32x32x16_i8 acc[80:95], v[246:247], v[246:247], acc[0:15]
v_mfma_i32_32x32x16_i8 acc[96:111], v[246:247], v[246:247], acc[0:15]
v_mfma_i32_32x32x16_i8 acc[112:127], v[246:247], v[246:247], acc[0:15]
v_mfma_i32_32x32x16_i8 acc[128:143], v[246:247], v[246:247], acc[0:15]
v_mfma_i32_32x32x16_i8 acc[144:159], v[246:247], v[246:247], acc[0:15]
v_mfma_i32_32x32x16_i8 acc[160:175], v[246:247], v[246:247], acc[0:15]
v_mfma_i32_32x32x16_i8 acc[176:191], v[246:247], v[246:247], acc[0:15]
v_mfma_i32_32x32x16_i8 acc[192:207], v[246:247], v[246:247], acc[0:15]
v_mfma_i32_32x32x16_i8 acc[208:223], v[246:247], v[246:247], acc[0:15]
v_mfma_i32_32x32x16_i8 acc[224:239], v[246:247], v[246:247], acc[0:15]
v_mfma_i32_32x32x16_i8 acc[240:255], v[246:247], v[246:247], acc[0:15]
s_cmp_eq_u32 s[sgprLoopCounterL], 0                // at last iteration?

/* after InitC, skip to end of prefetch last iter if numIter==0 */
// The jump is made through s_getpc_b64 / s_setpc_b64 instead of a direct
// s_cbranch to label_PrefetchGlobalLastIterEnd; s[84:85] holds the computed
// target and s86 the offset operand.
// NOTE(review): presumably a long-branch sequence because the target is out
// of range for a direct branch, with the "+4" accounting for the pc value
// returned by s_getpc_b64 -- confirm against the assembler's label arithmetic.
s_cbranch_scc0 label_NoBranch_8S4L1KCK9VFC7AQU     // Only branch on scc1
s_getpc_b64 s[84:85]                               // addr of next instr
s_add_i32 s86, label_PrefetchGlobalLastIterEnd, 4  // target branch offset
s_add_u32 s84, s84, s86                            // add target branch offset
s_addc_u32 s85, s85, 0                             // add high and carry
s_setpc_b64 s[84:85]                               // branch to label_PrefetchGlobalLastIterEnd
label_NoBranch_8S4L1KCK9VFC7AQU:
s_waitcnt vmcnt(0)                                 // wait for global read
s_barrier                                          // For stream-k / persistent loop

/* local write a */

/* local write b */

/* local write swap a */
/* Second prefetch (PGR=2). Both LDS write addresses are first toggled by    */
/* XOR with their Swap value. If LoopCounterL == 1 the second set of loads   */
/* is skipped (jump to label_skipPGR2), leaving the addresses toggled once.  */
/* Otherwise 8 A loads and 8 B loads are issued with the same m0 stepping as */
/* the first prefetch (4096 bytes per A load, 4160 per B load) and the write */
/* addresses are toggled a second time.                                      */
s_xor_b32 s[sgprLocalWriteAddrA], s[sgprSwapA], s[sgprLocalWriteAddrA] // swap Red Blk SGPR

/* local write swap b */
s_xor_b32 s[sgprLocalWriteAddrB], s[sgprSwapB], s[sgprLocalWriteAddrB] // swap Red Blk SGPR
s_cmp_eq_u32 s[sgprLoopCounterL], 0x1              // PGR=2 but only 1 loop
s_cbranch_scc1 label_skipPGR2                      // PGR=2 but only 1 loop
s_mov_b32 m0, s[sgprLocalWriteAddrA]               // m0 <- LDS write address
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0, lds // G -> Reg 0_0_0_0
s_add_u32 m0, m0, 4096                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+1], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0, lds // G -> Reg 0_0_1_0
s_add_u32 m0, m0, 4096                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+2], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0, lds // G -> Reg 0_0_2_0
s_add_u32 m0, m0, 4096                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+3], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0, lds // G -> Reg 0_0_3_0
s_add_u32 m0, m0, 4096                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+4], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0, lds // G -> Reg 0_0_4_0
s_add_u32 m0, m0, 4096                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+5], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0, lds // G -> Reg 0_0_5_0
s_add_u32 m0, m0, 4096                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+6], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0, lds // G -> Reg 0_0_6_0
s_add_u32 m0, m0, 4096                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+7], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0, lds // G -> Reg 0_0_7_0
s_mov_b32 m0, s[sgprLocalWriteAddrB]               // m0 <- LDS write address
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], 0 offen offset:0, lds // G -> Reg 0_0_0_0
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+1], s[sgprSrdB:sgprSrdB+3], 0 offen offset:0, lds // G -> Reg 0_0_1_0
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+2], s[sgprSrdB:sgprSrdB+3], 0 offen offset:0, lds // G -> Reg 0_0_2_0
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+3], s[sgprSrdB:sgprSrdB+3], 0 offen offset:0, lds // G -> Reg 0_0_3_0
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+4], s[sgprSrdB:sgprSrdB+3], 0 offen offset:0, lds // G -> Reg 0_0_4_0
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+5], s[sgprSrdB:sgprSrdB+3], 0 offen offset:0, lds // G -> Reg 0_0_5_0
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+6], s[sgprSrdB:sgprSrdB+3], 0 offen offset:0, lds // G -> Reg 0_0_6_0
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+7], s[sgprSrdB:sgprSrdB+3], 0 offen offset:0, lds // G -> Reg 0_0_7_0

/* local write swap a */
s_xor_b32 s[sgprLocalWriteAddrA], s[sgprSwapA], s[sgprLocalWriteAddrA] // swap Red Blk SGPR

/* local write swap b */
s_xor_b32 s[sgprLocalWriteAddrB], s[sgprSwapB], s[sgprLocalWriteAddrB] // swap Red Blk SGPR
label_skipPGR2:

// Both paths (second prefetch issued or skipped) meet here and synchronise
// the workgroup before the first LDS reads.
s_waitcnt lgkmcnt(0) // This is needed.
s_barrier


// Initial local reads (LDS -> registers) for buffer 0.
// A: eight 128-bit reads from vgprLocalReadAddrA into ValuA_T0_I0+0..31, at
//    byte offsets 0/1024/2048/3072 and 16384/17408/18432/19456.
// B: eight 128-bit reads from vgprLocalReadAddrB into ValuB_T0_I0+0..31, at
//    byte offsets 0..448 in steps of 64.
// The trailing s_waitcnt makes all sixteen results available to the
// conversion code that follows.
ds_read_b128 v[vgprValuA_T0_I0+0:vgprValuA_T0_I0+3], v[vgprLocalReadAddrA] offset:0 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuA_T0_I0+4:vgprValuA_T0_I0+7], v[vgprLocalReadAddrA] offset:1024 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=0 eIdx=0 rIdx=1 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuA_T0_I0+8:vgprValuA_T0_I0+11], v[vgprLocalReadAddrA] offset:2048 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=0 eIdx=0 rIdx=2 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuA_T0_I0+12:vgprValuA_T0_I0+15], v[vgprLocalReadAddrA] offset:3072 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=0 eIdx=0 rIdx=3 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuA_T0_I0+16:vgprValuA_T0_I0+19], v[vgprLocalReadAddrA] offset:16384 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=0 eIdx=0 rIdx=4 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuA_T0_I0+20:vgprValuA_T0_I0+23], v[vgprLocalReadAddrA] offset:17408 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=0 eIdx=0 rIdx=5 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuA_T0_I0+24:vgprValuA_T0_I0+27], v[vgprLocalReadAddrA] offset:18432 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=0 eIdx=0 rIdx=6 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuA_T0_I0+28:vgprValuA_T0_I0+31], v[vgprLocalReadAddrA] offset:19456 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=0 eIdx=0 rIdx=7 oIdx=0 buffer=0 iui=0
  
ds_read_b128 v[vgprValuB_T0_I0+0:vgprValuB_T0_I0+0+3], v[vgprLocalReadAddrB] offset:0 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_T0_I0+4:vgprValuB_T0_I0+4+3], v[vgprLocalReadAddrB] offset:64 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=0 eIdx=0 rIdx=1 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_T0_I0+8:vgprValuB_T0_I0+8+3], v[vgprLocalReadAddrB] offset:128 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_T0_I0+12:vgprValuB_T0_I0+12+3], v[vgprLocalReadAddrB] offset:192 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=0 eIdx=1 rIdx=1 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_T0_I0+16:vgprValuB_T0_I0+16+3], v[vgprLocalReadAddrB] offset:256 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_T0_I0+20:vgprValuB_T0_I0+20+3], v[vgprLocalReadAddrB] offset:320 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=0 eIdx=2 rIdx=1 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_T0_I0+24:vgprValuB_T0_I0+24+3], v[vgprLocalReadAddrB] offset:384 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_T0_I0+28:vgprValuB_T0_I0+28+3], v[vgprLocalReadAddrB] offset:448 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=0 eIdx=3 rIdx=1 oIdx=0 buffer=0 iui=0

s_waitcnt lgkmcnt(0)                               // Wait for dependent lr

// Pack the 32 f32 values just read for A (ValuA_T0_I0+0..31) into 16 dwords
// of bf16 pairs in ValuA_X0_I0. Indexing, with j = dword within each 128-bit
// read (0..3) and k = pair index (0..3):
//   ValuA_X0_I0[8*j + k] = pack(ValuA_T0_I0[8*k + j], ValuA_T0_I0[8*k + 4 + j])
// Only destination slots 8*j+0..3 are written here; slots 8*j+4..7 are not
// touched by this sequence.
v_cvt_pk_bf16_f32 v[vgprValuA_X0_I0+0], v[vgprValuA_T0_I0+0], v[vgprValuA_T0_I0+4] // 0, 1 HIs
v_cvt_pk_bf16_f32 v[vgprValuA_X0_I0+0+1], v[vgprValuA_T0_I0+8], v[vgprValuA_T0_I0+12] // 2, 3 HIs
v_cvt_pk_bf16_f32 v[vgprValuA_X0_I0+0+2], v[vgprValuA_T0_I0+16], v[vgprValuA_T0_I0+20] // 4, 5 HIs
v_cvt_pk_bf16_f32 v[vgprValuA_X0_I0+0+3], v[vgprValuA_T0_I0+24], v[vgprValuA_T0_I0+28] // 6, 7 HIs
v_cvt_pk_bf16_f32 v[vgprValuA_X0_I0+8], v[vgprValuA_T0_I0+0+1], v[vgprValuA_T0_I0+4+1] // 0, 1 HIs
v_cvt_pk_bf16_f32 v[vgprValuA_X0_I0+8+1], v[vgprValuA_T0_I0+8+1], v[vgprValuA_T0_I0+12+1] // 2, 3 HIs
v_cvt_pk_bf16_f32 v[vgprValuA_X0_I0+8+2], v[vgprValuA_T0_I0+16+1], v[vgprValuA_T0_I0+20+1] // 4, 5 HIs
v_cvt_pk_bf16_f32 v[vgprValuA_X0_I0+8+3], v[vgprValuA_T0_I0+24+1], v[vgprValuA_T0_I0+28+1] // 6, 7 HIs
v_cvt_pk_bf16_f32 v[vgprValuA_X0_I0+16], v[vgprValuA_T0_I0+0+2], v[vgprValuA_T0_I0+4+2] // 0, 1 HIs
v_cvt_pk_bf16_f32 v[vgprValuA_X0_I0+16+1], v[vgprValuA_T0_I0+8+2], v[vgprValuA_T0_I0+12+2] // 2, 3 HIs
v_cvt_pk_bf16_f32 v[vgprValuA_X0_I0+16+2], v[vgprValuA_T0_I0+16+2], v[vgprValuA_T0_I0+20+2] // 4, 5 HIs
v_cvt_pk_bf16_f32 v[vgprValuA_X0_I0+16+3], v[vgprValuA_T0_I0+24+2], v[vgprValuA_T0_I0+28+2] // 6, 7 HIs
v_cvt_pk_bf16_f32 v[vgprValuA_X0_I0+24], v[vgprValuA_T0_I0+0+3], v[vgprValuA_T0_I0+4+3] // 0, 1 HIs
v_cvt_pk_bf16_f32 v[vgprValuA_X0_I0+24+1], v[vgprValuA_T0_I0+8+3], v[vgprValuA_T0_I0+12+3] // 2, 3 HIs
v_cvt_pk_bf16_f32 v[vgprValuA_X0_I0+24+2], v[vgprValuA_T0_I0+16+3], v[vgprValuA_T0_I0+20+3] // 4, 5 HIs
v_cvt_pk_bf16_f32 v[vgprValuA_X0_I0+24+3], v[vgprValuA_T0_I0+24+3], v[vgprValuA_T0_I0+28+3] // 6, 7 HIs

v_dot2c_f32_bf16 v[vgprValuA_T0_I0+0], 0x8000bf80, v[vgprValuA_X0_I0+0]
v_dot2c_f32_bf16 v[vgprValuA_T0_I0+4], 0xbf800000, v[vgprValuA_X0_I0+0]
v_dot2c_f32_bf16 v[vgprValuA_T0_I0+8], 0x8000bf80, v[vgprValuA_X0_I0+0+1]
v_dot2c_f32_bf16 v[vgprValuA_T0_I0+12], 0xbf800000, v[vgprValuA_X0_I0+0+1]
v_dot2c_f32_bf16 v[vgprValuA_T0_I0+16], 0x8000bf80, v[vgprValuA_X0_I0+0+2]
v_dot2c_f32_bf16 v[vgprValuA_T0_I0+20], 0xbf800000, v[vgprValuA_X0_I0+0+2]
v_dot2c_f32_bf16 v[vgprValuA_T0_I0+24], 0x8000bf80, v[vgprValuA_X0_I0+0+3]
v_dot2c_f32_bf16 v[vgprValuA_T0_I0+28], 0xbf800000, v[vgprValuA_X0_I0+0+3]
v_dot2c_f32_bf16 v[vgprValuA_T0_I0+0+1], 0x8000bf80, v[vgprValuA_X0_I0+8]
v_dot2c_f32_bf16 v[vgprValuA_T0_I0+4+1], 0xbf800000, v[vgprValuA_X0_I0+8]
v_dot2c_f32_bf16 v[vgprValuA_T0_I0+8+1], 0x8000bf80, v[vgprValuA_X0_I0+8+1]
v_dot2c_f32_bf16 v[vgprValuA_T0_I0+12+1], 0xbf800000, v[vgprValuA_X0_I0+8+1]
v_dot2c_f32_bf16 v[vgprValuA_T0_I0+16+1], 0x8000bf80, v[vgprValuA_X0_I0+8+2]
v_dot2c_f32_bf16 v[vgprValuA_T0_I0+20+1], 0xbf800000, v[vgprValuA_X0_I0+8+2]
v_dot2c_f32_bf16 v[vgprValuA_T0_I0+24+1], 0x8000bf80, v[vgprValuA_X0_I0+8+3]
v_dot2c_f32_bf16 v[vgprValuA_T0_I0+28+1], 0xbf800000, v[vgprValuA_X0_I0+8+3]
v_dot2c_f32_bf16 v[vgprValuA_T0_I0+0+2], 0x8000bf80, v[vgprValuA_X0_I0+16]
v_dot2c_f32_bf16 v[vgprValuA_T0_I0+4+2], 0xbf800000, v[vgprValuA_X0_I0+16]
v_dot2c_f32_bf16 v[vgprValuA_T0_I0+8+2], 0x8000bf80, v[vgprValuA_X0_I0+16+1]
v_dot2c_f32_bf16 v[vgprValuA_T0_I0+12+2], 0xbf800000, v[vgprValuA_X0_I0+16+1]
v_dot2c_f32_bf16 v[vgprValuA_T0_I0+16+2], 0x8000bf80, v[vgprValuA_X0_I0+16+2]
v_dot2c_f32_bf16 v[vgprValuA_T0_I0+20+2], 0xbf800000, v[vgprValuA_X0_I0+16+2]
v_dot2c_f32_bf16 v[vgprValuA_T0_I0+24+2], 0x8000bf80, v[vgprValuA_X0_I0+16+3]
v_dot2c_f32_bf16 v[vgprValuA_T0_I0+28+2], 0xbf800000, v[vgprValuA_X0_I0+16+3]
v_dot2c_f32_bf16 v[vgprValuA_T0_I0+0+3], 0x8000bf80, v[vgprValuA_X0_I0+24]
v_dot2c_f32_bf16 v[vgprValuA_T0_I0+4+3], 0xbf800000, v[vgprValuA_X0_I0+24]
v_dot2c_f32_bf16 v[vgprValuA_T0_I0+8+3], 0x8000bf80, v[vgprValuA_X0_I0+24+1]
v_dot2c_f32_bf16 v[vgprValuA_T0_I0+12+3], 0xbf800000, v[vgprValuA_X0_I0+24+1]
v_dot2c_f32_bf16 v[vgprValuA_T0_I0+16+3], 0x8000bf80, v[vgprValuA_X0_I0+24+2]
v_dot2c_f32_bf16 v[vgprValuA_T0_I0+20+3], 0xbf800000, v[vgprValuA_X0_I0+24+2]
v_dot2c_f32_bf16 v[vgprValuA_T0_I0+24+3], 0x8000bf80, v[vgprValuA_X0_I0+24+3]
v_dot2c_f32_bf16 v[vgprValuA_T0_I0+28+3], 0xbf800000, v[vgprValuA_X0_I0+24+3]

v_cvt_pk_bf16_f32 v[vgprValuA_X0_I0+0+4], v[vgprValuA_T0_I0+0], v[vgprValuA_T0_I0+4]
v_cvt_pk_bf16_f32 v[vgprValuA_X0_I0+0+5], v[vgprValuA_T0_I0+8], v[vgprValuA_T0_I0+12]
v_cvt_pk_bf16_f32 v[vgprValuA_X0_I0+0+6], v[vgprValuA_T0_I0+16], v[vgprValuA_T0_I0+20]
v_cvt_pk_bf16_f32 v[vgprValuA_X0_I0+0+7], v[vgprValuA_T0_I0+24], v[vgprValuA_T0_I0+28]
v_cvt_pk_bf16_f32 v[vgprValuA_X0_I0+8+4], v[vgprValuA_T0_I0+0+1], v[vgprValuA_T0_I0+4+1]
v_cvt_pk_bf16_f32 v[vgprValuA_X0_I0+8+5], v[vgprValuA_T0_I0+8+1], v[vgprValuA_T0_I0+12+1]
v_cvt_pk_bf16_f32 v[vgprValuA_X0_I0+8+6], v[vgprValuA_T0_I0+16+1], v[vgprValuA_T0_I0+20+1]
v_cvt_pk_bf16_f32 v[vgprValuA_X0_I0+8+7], v[vgprValuA_T0_I0+24+1], v[vgprValuA_T0_I0+28+1]
v_cvt_pk_bf16_f32 v[vgprValuA_X0_I0+16+4], v[vgprValuA_T0_I0+0+2], v[vgprValuA_T0_I0+4+2]
v_cvt_pk_bf16_f32 v[vgprValuA_X0_I0+16+5], v[vgprValuA_T0_I0+8+2], v[vgprValuA_T0_I0+12+2]
v_cvt_pk_bf16_f32 v[vgprValuA_X0_I0+16+6], v[vgprValuA_T0_I0+16+2], v[vgprValuA_T0_I0+20+2]
v_cvt_pk_bf16_f32 v[vgprValuA_X0_I0+16+7], v[vgprValuA_T0_I0+24+2], v[vgprValuA_T0_I0+28+2]
v_cvt_pk_bf16_f32 v[vgprValuA_X0_I0+24+4], v[vgprValuA_T0_I0+0+3], v[vgprValuA_T0_I0+4+3]
v_cvt_pk_bf16_f32 v[vgprValuA_X0_I0+24+5], v[vgprValuA_T0_I0+8+3], v[vgprValuA_T0_I0+12+3]
v_cvt_pk_bf16_f32 v[vgprValuA_X0_I0+24+6], v[vgprValuA_T0_I0+16+3], v[vgprValuA_T0_I0+20+3]
v_cvt_pk_bf16_f32 v[vgprValuA_X0_I0+24+7], v[vgprValuA_T0_I0+24+3], v[vgprValuA_T0_I0+28+3]
  
// Prologue, B operand: same HI/LOW bf16 split as for A, but the two f32 sources
// of each pack are adjacent registers (same pairing as the CVT macro below).
// Step 1: pack the HI halves. For g = 0..3 and k = 0..3:
//   X0[8*g + k] = pack_bf16(T0[8*g + 2*k], T0[8*g + 2*k + 1])
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+0+0], v[vgprValuB_T0_I0+0+0], v[vgprValuB_T0_I0+0+1]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+0+1], v[vgprValuB_T0_I0+0+2], v[vgprValuB_T0_I0+0+3]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+0+2], v[vgprValuB_T0_I0+0+4], v[vgprValuB_T0_I0+0+5]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+0+3], v[vgprValuB_T0_I0+0+6], v[vgprValuB_T0_I0+0+7]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+8+0], v[vgprValuB_T0_I0+8+0], v[vgprValuB_T0_I0+8+1]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+8+1], v[vgprValuB_T0_I0+8+2], v[vgprValuB_T0_I0+8+3]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+8+2], v[vgprValuB_T0_I0+8+4], v[vgprValuB_T0_I0+8+5]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+8+3], v[vgprValuB_T0_I0+8+6], v[vgprValuB_T0_I0+8+7]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+16+0], v[vgprValuB_T0_I0+16+0], v[vgprValuB_T0_I0+16+1]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+16+1], v[vgprValuB_T0_I0+16+2], v[vgprValuB_T0_I0+16+3]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+16+2], v[vgprValuB_T0_I0+16+4], v[vgprValuB_T0_I0+16+5]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+16+3], v[vgprValuB_T0_I0+16+6], v[vgprValuB_T0_I0+16+7]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+24+0], v[vgprValuB_T0_I0+24+0], v[vgprValuB_T0_I0+24+1]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+24+1], v[vgprValuB_T0_I0+24+2], v[vgprValuB_T0_I0+24+3]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+24+2], v[vgprValuB_T0_I0+24+4], v[vgprValuB_T0_I0+24+5]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+24+3], v[vgprValuB_T0_I0+24+6], v[vgprValuB_T0_I0+24+7]

// Step 2: replace each f32 in T0 by its residual (value minus its bf16 HI part).
// The literal is a packed bf16 pair: 0x8000bf80 has -1.0 in word0 (and -0.0 in
// word1), 0xbf800000 has -1.0 in word1 (and +0.0 in word0), so even T0 registers
// subtract word0 of the matching X0 register and odd ones subtract word1.
// Assumes dot2c accumulates D += S0.w0*S1.w0 + S0.w1*S1.w1 - confirm against the ISA guide.
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+0+0], 0x8000bf80, v[vgprValuB_X0_I0+0+0]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+0+1], 0xbf800000, v[vgprValuB_X0_I0+0+0]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+0+2], 0x8000bf80, v[vgprValuB_X0_I0+0+1]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+0+3], 0xbf800000, v[vgprValuB_X0_I0+0+1]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+0+4], 0x8000bf80, v[vgprValuB_X0_I0+0+2]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+0+5], 0xbf800000, v[vgprValuB_X0_I0+0+2]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+0+6], 0x8000bf80, v[vgprValuB_X0_I0+0+3]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+0+7], 0xbf800000, v[vgprValuB_X0_I0+0+3]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+8+0], 0x8000bf80, v[vgprValuB_X0_I0+8+0]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+8+1], 0xbf800000, v[vgprValuB_X0_I0+8+0]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+8+2], 0x8000bf80, v[vgprValuB_X0_I0+8+1]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+8+3], 0xbf800000, v[vgprValuB_X0_I0+8+1]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+8+4], 0x8000bf80, v[vgprValuB_X0_I0+8+2]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+8+5], 0xbf800000, v[vgprValuB_X0_I0+8+2]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+8+6], 0x8000bf80, v[vgprValuB_X0_I0+8+3]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+8+7], 0xbf800000, v[vgprValuB_X0_I0+8+3]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+16+0], 0x8000bf80, v[vgprValuB_X0_I0+16+0]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+16+1], 0xbf800000, v[vgprValuB_X0_I0+16+0]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+16+2], 0x8000bf80, v[vgprValuB_X0_I0+16+1]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+16+3], 0xbf800000, v[vgprValuB_X0_I0+16+1]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+16+4], 0x8000bf80, v[vgprValuB_X0_I0+16+2]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+16+5], 0xbf800000, v[vgprValuB_X0_I0+16+2]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+16+6], 0x8000bf80, v[vgprValuB_X0_I0+16+3]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+16+7], 0xbf800000, v[vgprValuB_X0_I0+16+3]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+24+0], 0x8000bf80, v[vgprValuB_X0_I0+24+0]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+24+1], 0xbf800000, v[vgprValuB_X0_I0+24+0]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+24+2], 0x8000bf80, v[vgprValuB_X0_I0+24+1]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+24+3], 0xbf800000, v[vgprValuB_X0_I0+24+1]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+24+4], 0x8000bf80, v[vgprValuB_X0_I0+24+2]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+24+5], 0xbf800000, v[vgprValuB_X0_I0+24+2]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+24+6], 0x8000bf80, v[vgprValuB_X0_I0+24+3]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+24+7], 0xbf800000, v[vgprValuB_X0_I0+24+3]

// Step 3: pack the residuals as LOW halves into X0[8*g + 4 .. 8*g + 7].
// Only groups g = 0 (X0+0) and g = 1 (X0+8) are packed here; the LOW packs for
// X0+16 and X0+24 are issued inside MAINLOOP, interleaved with the first MFMAs.
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+0+7], v[vgprValuB_T0_I0+0+6], v[vgprValuB_T0_I0+0+7]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+0+6], v[vgprValuB_T0_I0+0+4], v[vgprValuB_T0_I0+0+5]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+0+5], v[vgprValuB_T0_I0+0+2], v[vgprValuB_T0_I0+0+3]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+0+4], v[vgprValuB_T0_I0+0+0], v[vgprValuB_T0_I0+0+1]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+8+7], v[vgprValuB_T0_I0+8+6], v[vgprValuB_T0_I0+8+7]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+8+6], v[vgprValuB_T0_I0+8+4], v[vgprValuB_T0_I0+8+5]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+8+5], v[vgprValuB_T0_I0+8+2], v[vgprValuB_T0_I0+8+3]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+8+4], v[vgprValuB_T0_I0+8+0], v[vgprValuB_T0_I0+8+1]
  
/******************************************/
/* Unrolled Loop(s) - Begin               */
/******************************************/
// Loop entry guards: decide from the remaining iteration count whether the
// unrolled main loop body is entered at all.
label_openLoopL:
s_cmp_eq_u32 s[sgprLoopCounterL], 0x1              // SCC = (LoopCounterL == 1)
s_cbranch_scc1 label_toPGR1                        // PGR=2 but only 1 loop, toPGR1
s_cmp_le_u32 s[sgprLoopCounterL], 0x2              // SCC = (LoopCounterL <= 2), unsigned compare
s_cbranch_scc1 label_LoopEndL                      // do not enter LoopL
label_LoopBeginL:

/******************************************/
/* Unrolled Loop 1/1 - Begin              */
/******************************************/
/* iter 0 (reset local read pointers iteration)  (swap and reset local write pointers iteration)  (swap local read pointers iteration)  */


// GRINC base, id
//   Emits exactly ONE scalar instruction of the global-read pointer increment
//   for matrix <base>; the caller interleaves steps id = 0..8 with other work.
//   Any other id emits nothing.
//     0      compare loop counter against the stagger wrap iteration (sets SCC)
//     1, 2   select the 64-bit increment into s84 (low) / s85 (high) from SCC
//     3, 4   SRD base address += increment (add, then add-with-carry)
//     5, 6   shadow limit     -= increment (sub, then sub-with-borrow)
//     7, 8   if the high limit dword is zero, copy the low limit into the SRD
//            size field, otherwise use BufferLimit
//   Scratch: s84, s85 (hard-coded) and SCC. Steps 0->1->2, 3->4, 5->6 and 7->8
//   communicate through SCC, so no SCC-writing instruction may be scheduled
//   between the members of each chain.
.macro GRINC base id=0
.if \id == 0
    s_cmp_eq_u32 s[sgprLoopCounterL], s[sgprStaggerUIter] // wrap iteration? -> SCC
.endif
.if \id == 1
    s_cselect_b32 s84, s[sgprWrapU\base+0], s[sgprGlobalReadIncs\base+0] // inc.lo = SCC ? wrap.lo : step
.endif
.if \id == 2
    s_cselect_b32 s85, s[sgprWrapU\base+1], 0 // inc.hi = SCC ? wrap.hi : 0
.endif
.if \id == 3
    s_add_u32 s[sgprSrd\base+0], s[sgprSrd\base+0], s84 // SRD base.lo += inc.lo
.endif
.if \id == 4
    s_addc_u32 s[sgprSrd\base+1], s[sgprSrd\base+1], s85 // SRD base.hi += inc.hi + carry
.endif
.if \id == 5
    s_sub_u32 s[sgprShadowLimit\base+0], s[sgprShadowLimit\base+0], s84 // limit.lo -= inc.lo
.endif
.if \id == 6
    s_subb_u32 s[sgprShadowLimit\base+1], s[sgprShadowLimit\base+1], s85 // limit.hi -= inc.hi + borrow
.endif
.if \id == 7
    s_cmp_eq_u32 s[sgprShadowLimit\base+1], 0 // remaining range fits in 32 bits?
.endif
.if \id == 8
    s_cselect_b32 s[sgprSrd\base+2], s[sgprShadowLimit\base+0], BufferLimit // yes: real limit, no: BufferLimit
.endif
.endm


// LR0 base, id   (TLU=0 local-read layout)
//   Emits ONE ds_read_b128 from LDS into four consecutive registers of
//   vgprValu<base>_T0_I0, addressed by vgprLocalReadAddr<base>.
//     id 0..7   -> registers 4*id     .. +3, byte offset 64*id
//     id 8..15  -> registers 4*(id-8) .. +3, byte offset 16640 + 64*(id-8)
//   Ids 8..15 therefore refill the same 32 registers from the second LDS region.
//   Any other id emits nothing.
.macro LR0 base id=0
.if \id < 0
    // out of range: nothing to emit
.elseif \id < 8
    ds_read_b128 v[vgprValu\base\()_T0_I0+4*(\id):vgprValu\base\()_T0_I0+4*(\id)+3], v[vgprLocalReadAddr\base] offset:64*(\id)
.elseif \id < 16
    ds_read_b128 v[vgprValu\base\()_T0_I0+4*(\id-8):vgprValu\base\()_T0_I0+4*(\id-8)+3], v[vgprLocalReadAddr\base] offset:16640+64*(\id-8)
.endif
.endm
  
// LR1 base, id   (TLU=1 local-read layout)
//   Emits ONE ds_read_b128 from LDS into four consecutive registers of
//   vgprValu<base>_T0_I0, addressed by vgprLocalReadAddr<base>.
//   Destination registers: 4*(id mod 8) .. +3. Byte offsets:
//     id  0..3   ->     0 + 1024*id
//     id  4..7   -> 16384 + 1024*(id-4)
//     id  8..11  ->   512 + 1024*(id-8)
//     id 12..15  -> 16896 + 1024*(id-12)
//   i.e. ids 8..15 repeat ids 0..7 shifted by 512 bytes. Any other id emits nothing.
.macro LR1 base id=0
.if \id < 0
    // out of range: nothing to emit
.elseif \id < 4
    ds_read_b128 v[vgprValu\base\()_T0_I0+4*(\id):vgprValu\base\()_T0_I0+4*(\id)+3], v[vgprLocalReadAddr\base] offset:1024*(\id)
.elseif \id < 8
    ds_read_b128 v[vgprValu\base\()_T0_I0+4*(\id):vgprValu\base\()_T0_I0+4*(\id)+3], v[vgprLocalReadAddr\base] offset:16384+1024*(\id-4)
.elseif \id < 12
    ds_read_b128 v[vgprValu\base\()_T0_I0+4*(\id-8):vgprValu\base\()_T0_I0+4*(\id-8)+3], v[vgprLocalReadAddr\base] offset:512+1024*(\id-8)
.elseif \id < 16
    ds_read_b128 v[vgprValu\base\()_T0_I0+4*(\id-8):vgprValu\base\()_T0_I0+4*(\id-8)+3], v[vgprLocalReadAddr\base] offset:16896+1024*(\id-12)
.endif
.endm


  

// GR base, id, off
//   Emits ONE instruction of the direct-to-LDS global read for matrix <base>.
//     id 0            m0 = sgprLocalWriteAddr<base>        (LDS write cursor)
//     id 1,3,..,15    buffer_load_dwordx4 ... lds, using address register
//                     vgprGlobalReadOffset<base> + (id-1)/2
//     id 2,4,..,14    m0 += off                            (advance LDS cursor)
//   Any other id emits nothing. m0 is live across the whole 0..15 sequence, so
//   nothing else may write m0 while a sequence is in flight.
.macro GR base id off
.if \id == 0
    s_mov_b32 m0, s[sgprLocalWriteAddr\base] // start of this tile in LDS
.elseif \id < 0
    // out of range: nothing to emit
.elseif \id > 15
    // out of range: nothing to emit
.elseif 2*((\id)/2) == (\id)
    s_add_u32 m0, m0, \off // even step: move LDS write address to the next line
.else
    buffer_load_dwordx4 v[vgprGlobalReadOffset\base+((\id)-1)/2], s[sgprSrd\base:sgprSrd\base+3], 0 offen offset:0, lds // odd step: global -> LDS at m0
.endif
.endm


// CVT base, baseID, group, id
//   Emits ONE instruction of the f32 -> (bf16 HI, bf16 LOW) split for eight
//   consecutive f32 registers vgprValu<base>_T0_I0+group+0..7; results go to
//   vgprValu<base>_X0_I0+baseID+0..3 (HI) and +4..7 (LOW).
//     id  0..1   pack HI of elements 0..3 into X0+baseID+0, +1
//     id  2..9   expand each HI back to f32 (v232..v235) and subtract it from the
//                source in place, leaving the residual in T0
//     id 10..11  pack HI of elements 4..7 into X0+baseID+2, +3
//     id 12..19  residuals of elements 4..7 (temporaries v236..v239)
//     id 20..23  pack the residuals as LOW into X0+baseID+4..7
//   Any other id emits nothing.
//   Scratch: v232..v239 are hard-coded temporaries; each v_cvt_f32_bf16 must be
//   issued before the v_sub_f32 that reads it. The T0 sources are overwritten.
.macro CVT base baseID group id
.if \id == 0
v_cvt_pk_bf16_f32 v[vgprValu\base\()_X0_I0+\baseID], v[vgprValu\base\()_T0_I0+\group+0], v[vgprValu\base\()_T0_I0+\group+1] // 0, 1 HIs
.elseif \id == 1
v_cvt_pk_bf16_f32 v[vgprValu\base\()_X0_I0+\baseID+1], v[vgprValu\base\()_T0_I0+\group+2], v[vgprValu\base\()_T0_I0+\group+3] // 2, 3 HIs
.elseif \id == 2
v_cvt_f32_bf16 v232, v[vgprValu\base\()_X0_I0+\baseID] // HI of element 0 (word 0) back to f32
.elseif \id == 3
v_sub_f32 v[vgprValu\base\()_T0_I0+\group+0], v[vgprValu\base\()_T0_I0+\group+0], v232 // 0 Low
.elseif \id == 4
v_cvt_f32_bf16 v233, v[vgprValu\base\()_X0_I0+\baseID] src0_sel:WORD_1 // HI of element 1 (word 1) back to f32
.elseif \id == 5
v_sub_f32 v[vgprValu\base\()_T0_I0+\group+1], v[vgprValu\base\()_T0_I0+\group+1], v233 // 1 Low
.elseif \id == 6
v_cvt_f32_bf16 v234, v[vgprValu\base\()_X0_I0+\baseID+1] // HI of element 2 (word 0) back to f32
.elseif \id == 7
v_sub_f32 v[vgprValu\base\()_T0_I0+\group+2], v[vgprValu\base\()_T0_I0+\group+2], v234 // 2 Low
.elseif \id == 8
v_cvt_f32_bf16 v235, v[vgprValu\base\()_X0_I0+\baseID+1] src0_sel:WORD_1 // HI of element 3 (word 1) back to f32
.elseif \id == 9
v_sub_f32 v[vgprValu\base\()_T0_I0+\group+3], v[vgprValu\base\()_T0_I0+0+\group+3], v235 // 3 Low
.elseif \id == 10 // Requires second load to be ready
v_cvt_pk_bf16_f32 v[vgprValu\base\()_X0_I0+\baseID+2], v[vgprValu\base\()_T0_I0+\group+4], v[vgprValu\base\()_T0_I0+\group+5] // 4, 5 HIs
.elseif \id == 11
v_cvt_pk_bf16_f32 v[vgprValu\base\()_X0_I0+\baseID+3], v[vgprValu\base\()_T0_I0+\group+6], v[vgprValu\base\()_T0_I0+\group+7] // 6, 7 HIs
.elseif \id == 12
v_cvt_f32_bf16 v236, v[vgprValu\base\()_X0_I0+\baseID+2] // HI of element 4 (word 0) back to f32
.elseif \id == 13
v_sub_f32 v[vgprValu\base\()_T0_I0+\group+4], v[vgprValu\base\()_T0_I0+\group+4], v236 // 4 Low
.elseif \id == 14
v_cvt_f32_bf16 v237, v[vgprValu\base\()_X0_I0+\baseID+2] src0_sel:WORD_1 // HI of element 5 (word 1) back to f32
.elseif \id == 15
v_sub_f32 v[vgprValu\base\()_T0_I0+\group+5], v[vgprValu\base\()_T0_I0+\group+5], v237 // 5 Low
.elseif \id == 16
v_cvt_f32_bf16 v238, v[vgprValu\base\()_X0_I0+\baseID+3] // HI of element 6 (word 0) back to f32
.elseif \id == 17
v_sub_f32 v[vgprValu\base\()_T0_I0+\group+6], v[vgprValu\base\()_T0_I0+\group+6], v238 // 6 Low
.elseif \id == 18
v_cvt_f32_bf16 v239, v[vgprValu\base\()_X0_I0+\baseID+3] src0_sel:WORD_1 // HI of element 7 (word 1) back to f32
.elseif \id == 19
v_sub_f32 v[vgprValu\base\()_T0_I0+\group+7], v[vgprValu\base\()_T0_I0+\group+7], v239 // 7 Low
.elseif \id == 20
v_cvt_pk_bf16_f32 v[vgprValu\base\()_X0_I0+\baseID+4], v[vgprValu\base\()_T0_I0+\group+0], v[vgprValu\base\()_T0_I0+\group+1] // 0, 1 LOWs
.elseif \id == 21
v_cvt_pk_bf16_f32 v[vgprValu\base\()_X0_I0+\baseID+5], v[vgprValu\base\()_T0_I0+\group+2], v[vgprValu\base\()_T0_I0+\group+3] // 2, 3 LOWs
.elseif \id == 22
v_cvt_pk_bf16_f32 v[vgprValu\base\()_X0_I0+\baseID+6], v[vgprValu\base\()_T0_I0+\group+4], v[vgprValu\base\()_T0_I0+\group+5] // 4, 5 LOWs
.elseif \id == 23
v_cvt_pk_bf16_f32 v[vgprValu\base\()_X0_I0+\baseID+7], v[vgprValu\base\()_T0_I0+\group+6], v[vgprValu\base\()_T0_I0+\group+7] // 6, 7 LOWs
.endif
.endm
  
// PERM base, baseID, group, id
//   Same f32 -> (bf16 HI, bf16 LOW) split as CVT, but the eight f32 sources are
//   4 registers apart: vgprValu<base>_T0_I0 + {0,4,8,...,28} + group. Results go
//   to vgprValu<base>_X0_I0+baseID+0..3 (HI) and +4..7 (LOW). ONE instruction per id.
//     id  0..1   pack HI of sources 0,4 / 8,12 into X0+baseID+0, +1
//     id  2..9   residuals of those four sources (temporaries v232..v235)
//     id 10..11  pack their LOW parts into X0+baseID+4, +5
//     id 12..13  pack HI of sources 16,20 / 24,28 into X0+baseID+2, +3
//     id 14..21  residuals of those four sources (temporaries v236..v239)
//     id 22..23  pack their LOW parts into X0+baseID+6, +7
//   Any other id emits nothing.
//   Ids 0..11 only touch sources +0..+12, so they can be issued once the first
//   four local reads have returned; ids 12 and up need all eight.
//   Scratch: v232..v239 are hard-coded temporaries. The T0 sources are overwritten.
.macro PERM base baseID group id
.if \id == 0
v_cvt_pk_bf16_f32 v[vgprValu\base\()_X0_I0+\baseID], v[vgprValu\base\()_T0_I0+0+\group], v[vgprValu\base\()_T0_I0+4+\group] // 0, 1 HIs
.elseif \id == 1
v_cvt_pk_bf16_f32 v[vgprValu\base\()_X0_I0+\baseID+1], v[vgprValu\base\()_T0_I0+8+\group], v[vgprValu\base\()_T0_I0+12+\group] // 2, 3 HIs
.elseif \id == 2
v_cvt_f32_bf16 v232, v[vgprValu\base\()_X0_I0+\baseID] // HI of element 0 (word 0) back to f32
.elseif \id == 3
v_sub_f32 v[vgprValu\base\()_T0_I0+0+\group], v[vgprValu\base\()_T0_I0+0+\group], v232 // 0 Low
.elseif \id == 4
v_cvt_f32_bf16 v233, v[vgprValu\base\()_X0_I0+\baseID] src0_sel:WORD_1 // HI of element 1 (word 1) back to f32
.elseif \id == 5
v_sub_f32 v[vgprValu\base\()_T0_I0+4+\group], v[vgprValu\base\()_T0_I0+0+4+\group], v233 // 1 Low
.elseif \id == 6
v_cvt_f32_bf16 v234, v[vgprValu\base\()_X0_I0+\baseID+1] // HI of element 2 (word 0) back to f32
.elseif \id == 7
v_sub_f32 v[vgprValu\base\()_T0_I0+8+\group], v[vgprValu\base\()_T0_I0+8+\group], v234 // 2 Low
.elseif \id == 8
v_cvt_f32_bf16 v235, v[vgprValu\base\()_X0_I0+\baseID+1] src0_sel:WORD_1 // HI of element 3 (word 1) back to f32
.elseif \id == 9
v_sub_f32 v[vgprValu\base\()_T0_I0+12+\group], v[vgprValu\base\()_T0_I0+0+12+\group], v235 // 3 Low
.elseif \id == 10
v_cvt_pk_bf16_f32 v[vgprValu\base\()_X0_I0+\baseID+4], v[vgprValu\base\()_T0_I0+0+\group], v[vgprValu\base\()_T0_I0+4+\group] // 0, 1 LOWs
.elseif \id == 11
v_cvt_pk_bf16_f32 v[vgprValu\base\()_X0_I0+\baseID+5], v[vgprValu\base\()_T0_I0+8+\group], v[vgprValu\base\()_T0_I0+12+\group] // 2, 3 LOWs
.elseif \id == 12 // Needs all LR to be done
v_cvt_pk_bf16_f32 v[vgprValu\base\()_X0_I0+\baseID+2], v[vgprValu\base\()_T0_I0+16+\group], v[vgprValu\base\()_T0_I0+20+\group] // 4, 5 HIs
.elseif \id == 13
v_cvt_pk_bf16_f32 v[vgprValu\base\()_X0_I0+\baseID+3], v[vgprValu\base\()_T0_I0+24+\group], v[vgprValu\base\()_T0_I0+28+\group] // 6, 7 HIs
.elseif \id == 14
v_cvt_f32_bf16 v236, v[vgprValu\base\()_X0_I0+\baseID+2] // HI of element 4 (word 0) back to f32
.elseif \id == 15
v_sub_f32 v[vgprValu\base\()_T0_I0+16+\group], v[vgprValu\base\()_T0_I0+16+\group], v236 // 4 Low
.elseif \id == 16
v_cvt_f32_bf16 v237, v[vgprValu\base\()_X0_I0+\baseID+2] src0_sel:WORD_1 // HI of element 5 (word 1) back to f32
.elseif \id == 17
v_sub_f32 v[vgprValu\base\()_T0_I0+20+\group], v[vgprValu\base\()_T0_I0+0+20+\group], v237 // 5 Low
.elseif \id == 18
v_cvt_f32_bf16 v238, v[vgprValu\base\()_X0_I0+\baseID+3] // HI of element 6 (word 0) back to f32
.elseif \id == 19
v_sub_f32 v[vgprValu\base\()_T0_I0+24+\group], v[vgprValu\base\()_T0_I0+24+\group], v238 // 6 Low
.elseif \id == 20
v_cvt_f32_bf16 v239, v[vgprValu\base\()_X0_I0+\baseID+3] src0_sel:WORD_1 // HI of element 7 (word 1) back to f32
.elseif \id == 21
v_sub_f32 v[vgprValu\base\()_T0_I0+28+\group], v[vgprValu\base\()_T0_I0+0+28+\group], v239 // 7 Low
.elseif \id == 22
v_cvt_pk_bf16_f32 v[vgprValu\base\()_X0_I0+\baseID+6], v[vgprValu\base\()_T0_I0+16+\group], v[vgprValu\base\()_T0_I0+20+\group] // 4, 5 LOWs
.elseif \id == 23
v_cvt_pk_bf16_f32 v[vgprValu\base\()_X0_I0+\baseID+7], v[vgprValu\base\()_T0_I0+24+\group], v[vgprValu\base\()_T0_I0+28+\group] // 6, 7 LOWs
.endif
.endm


.macro MAINLOOP isOdd
// A0 B0
/*  mfmaIndex:0  */
v_mfma_f32_16x16x32_bf16 acc[0:3], v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprValuA_X0_I0+0+4:vgprValuA_X0_I0+0+4+3], acc[0:3] // left value = acc[0+0:3+0]
GRINC A 0
.if \isOdd == 0
LR1 A 8
LR1 A 9
.else
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+16+7], v[vgprValuB_T0_I0+16+6], v[vgprValuB_T0_I0+16+7]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+16+6], v[vgprValuB_T0_I0+16+4], v[vgprValuB_T0_I0+16+5]
.endif  
/*  mfmaIndex:1  */
v_mfma_f32_16x16x32_bf16 acc[0:3], v[vgprValuB_X0_I0+0+4:vgprValuB_X0_I0+0+4+3], v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], acc[0:3] // left value = acc[0+0:3+0]
GRINC A 1
.if \isOdd == 1
LR1 A 8
LR1 A 9
.else
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+16+7], v[vgprValuB_T0_I0+16+6], v[vgprValuB_T0_I0+16+7]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+16+6], v[vgprValuB_T0_I0+16+4], v[vgprValuB_T0_I0+16+5]
.endif
/*  mfmaIndex:2  */
v_mfma_f32_16x16x32_bf16 acc[0:3], v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], acc[0:3] // left value = acc[0+0:3+0]
GRINC A 2
.if \isOdd == 0
LR1 A 10
LR1 A 11
.else
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+16+5], v[vgprValuB_T0_I0+16+2], v[vgprValuB_T0_I0+16+3]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+16+4], v[vgprValuB_T0_I0+16+0], v[vgprValuB_T0_I0+16+1]
.endif
/*  mfmaIndex:3  */
v_mfma_f32_16x16x32_bf16 acc[4:7], v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprValuA_X0_I0+8+4:vgprValuA_X0_I0+8+4+3], acc[4:7] // left value = acc[4+0:7+0]
GRINC A 3
.if \isOdd == 1
LR1 A 10
LR1 A 11
.else
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+16+5], v[vgprValuB_T0_I0+16+2], v[vgprValuB_T0_I0+16+3]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+16+4], v[vgprValuB_T0_I0+16+0], v[vgprValuB_T0_I0+16+1]
.endif
/*  mfmaIndex:4  */
v_mfma_f32_16x16x32_bf16 acc[4:7], v[vgprValuB_X0_I0+0+4:vgprValuB_X0_I0+0+4+3], v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], acc[4:7] // left value = acc[4+0:7+0]
GRINC A 4
.if \isOdd == 0
LR1 A 12
LR1 A 13
.else
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+24+7], v[vgprValuB_T0_I0+24+6], v[vgprValuB_T0_I0+24+7]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+24+6], v[vgprValuB_T0_I0+24+4], v[vgprValuB_T0_I0+24+5]
.endif
/*  mfmaIndex:5  */
v_mfma_f32_16x16x32_bf16 acc[4:7], v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], acc[4:7] // left value = acc[4+0:7+0]
GRINC A 5
.if \isOdd == 1
LR1 A 12
LR1 A 13
.else
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+24+7], v[vgprValuB_T0_I0+24+6], v[vgprValuB_T0_I0+24+7]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+24+6], v[vgprValuB_T0_I0+24+4], v[vgprValuB_T0_I0+24+5]
.endif
/*  mfmaIndex:6  */
v_mfma_f32_16x16x32_bf16 acc[8:11], v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprValuA_X0_I0+16+4:vgprValuA_X0_I0+16+4+3], acc[8:11] // left value = acc[8+0:11+0]
GRINC A 6
.if \isOdd == 0
LR1 A 14
LR1 A 15
.else
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+24+5], v[vgprValuB_T0_I0+24+2], v[vgprValuB_T0_I0+24+3]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+24+4], v[vgprValuB_T0_I0+24+0], v[vgprValuB_T0_I0+24+1]
.endif
/*  mfmaIndex:7  */
v_mfma_f32_16x16x32_bf16 acc[8:11], v[vgprValuB_X0_I0+0+4:vgprValuB_X0_I0+0+4+3], v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], acc[8:11] // left value = acc[8+0:11+0]
GRINC A 7
.if \isOdd == 1
LR1 A 14
LR1 A 15
.else
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+24+5], v[vgprValuB_T0_I0+24+2], v[vgprValuB_T0_I0+24+3]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+24+4], v[vgprValuB_T0_I0+24+0], v[vgprValuB_T0_I0+24+1]
.endif
/*  mfmaIndex:8  */
v_mfma_f32_16x16x32_bf16 acc[8:11], v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], acc[8:11] // left value = acc[8+0:11+0]
s_waitcnt lgkmcnt(4) // 8x LRA1 issue, wait for 4x LR A1 to be done.
PERM A 32 0 0
PERM A 32 0 1
/*  mfmaIndex:9  */
v_mfma_f32_16x16x32_bf16 acc[12:15], v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprValuA_X0_I0+24+4:vgprValuA_X0_I0+24+4+3], acc[12:15] // left value = acc[12+0:15+0]
GRINC A 8
PERM A 32 0 2
PERM A 32 0 3
/*  mfmaIndex:10  */
v_mfma_f32_16x16x32_bf16 acc[12:15], v[vgprValuB_X0_I0+0+4:vgprValuB_X0_I0+0+4+3], v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], acc[12:15] // left value = acc[12+0:15+0]
PERM A 32 0 4
PERM A 32 0 5
/*  mfmaIndex:11  */
v_mfma_f32_16x16x32_bf16 acc[12:15], v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], acc[12:15] // left value = acc[12+0:15+0]
s_waitcnt lgkmcnt(0) // 8x LR A1 done.
PERM A 32 0 6
PERM A 32 0 7
/*  mfmaIndex:12  */
v_mfma_f32_16x16x32_bf16 acc[32:35], v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprValuA_X0_I0+0+4:vgprValuA_X0_I0+0+4+3], acc[32:35] // left value = acc[32+0:35+0]
s_barrier // Can start global A read
/*  mfmaIndex:13  */
v_mfma_f32_16x16x32_bf16 acc[32:35], v[vgprValuB_X0_I0+8+4:vgprValuB_X0_I0+8+4+3], v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], acc[32:35] // left value = acc[32+0:35+0]
.if \isOdd == 0
GR A 0 4096
v_xor_b32 v[vgprLocalReadAddrA], v[vgprLocalReadSwapAddrA], v[vgprLocalReadAddrA] // swap Red Blk
GR A 1 4096
.else
LR0 B 8
.endif
PERM A 32 0 8
/*  mfmaIndex:14  */
v_mfma_f32_16x16x32_bf16 acc[32:35], v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], acc[32:35] // left value = acc[32+0:35+0]
.if \isOdd == 1
GR A 0 4096
v_xor_b32 v[vgprLocalReadAddrA], v[vgprLocalReadSwapAddrA], v[vgprLocalReadAddrA] // swap Red Blk
GR A 1 4096
.else
LR0 B 8
.endif
PERM A 32 0 9
/*  mfmaIndex:15  */
v_mfma_f32_16x16x32_bf16 acc[36:39], v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprValuA_X0_I0+8+4:vgprValuA_X0_I0+8+4+3], acc[36:39] // left value = acc[36+0:39+0]
.if \isOdd == 0
GR A 2 4096
GR A 3 4096
.else
LR0 B 9
.endif
PERM A 32 0 10
/*  mfmaIndex:16  */
v_mfma_f32_16x16x32_bf16 acc[36:39], v[vgprValuB_X0_I0+8+4:vgprValuB_X0_I0+8+4+3], v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], acc[36:39] // left value = acc[36+0:39+0]
.if \isOdd == 1
GR A 2 4096
GR A 3 4096
.else
LR0 B 9
.endif
PERM A 32 0 11
/*  mfmaIndex:17  */
v_mfma_f32_16x16x32_bf16 acc[36:39], v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], acc[36:39] // left value = acc[36+0:39+0]
.if \isOdd == 0
GR A 4 4096
GR A 5 4096
.else
LR0 B 10
.endif
PERM A 32 0 12
/*  mfmaIndex:18  */
v_mfma_f32_16x16x32_bf16 acc[40:43], v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprValuA_X0_I0+16+4:vgprValuA_X0_I0+16+4+3], acc[40:43] // left value = acc[40+0:43+0]
.if \isOdd == 1
GR A 4 4096
GR A 5 4096
.else
LR0 B 10
.endif
PERM A 32 0 13
/*  mfmaIndex:19  */
v_mfma_f32_16x16x32_bf16 acc[40:43], v[vgprValuB_X0_I0+8+4:vgprValuB_X0_I0+8+4+3], v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], acc[40:43] // left value = acc[40+0:43+0]
.if \isOdd == 0
GR A 6 4096
GR A 7 4096
.else
LR0 B 11
.endif
PERM A 32 0 14
/*  mfmaIndex:20  */
v_mfma_f32_16x16x32_bf16 acc[40:43], v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], acc[40:43] // left value = acc[40+0:43+0]
.if \isOdd == 1
GR A 6 4096
GR A 7 4096
.else
LR0 B 11
.endif
PERM A 32 0 15
/*  mfmaIndex:21  */
v_mfma_f32_16x16x32_bf16 acc[44:47], v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprValuA_X0_I0+24+4:vgprValuA_X0_I0+24+4+3], acc[44:47] // left value = acc[44+0:47+0]
.if \isOdd == 0
LR0 B 12
.else
PERM A 32 0 16
PERM A 32 0 17
.endif
/*  mfmaIndex:22  */
v_mfma_f32_16x16x32_bf16 acc[44:47], v[vgprValuB_X0_I0+8+4:vgprValuB_X0_I0+8+4+3], v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], acc[44:47] // left value = acc[44+0:47+0]
.if \isOdd == 1
LR0 B 12
.else
PERM A 32 0 16
PERM A 32 0 17
.endif
/*  mfmaIndex:23  */
v_mfma_f32_16x16x32_bf16 acc[44:47], v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], acc[44:47] // left value = acc[44+0:47+0]
PERM A 32 0 18
PERM A 32 0 19
/*  mfmaIndex:24  */
v_mfma_f32_16x16x32_bf16 acc[64:67], v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprValuA_X0_I0+0+4:vgprValuA_X0_I0+0+4+3], acc[64:67] // left value = acc[64+0:67+0]
.if \isOdd == 0
GR A 8 4096
GR A 9 4096
.else
LR0 B 13  
.endif
PERM A 32 0 20
/*  mfmaIndex:25  */
v_mfma_f32_16x16x32_bf16 acc[64:67], v[vgprValuB_X0_I0+16+4:vgprValuB_X0_I0+16+4+3], v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], acc[64:67] // left value = acc[64+0:67+0]
.if \isOdd == 1
GR A 8 4096
GR A 9 4096
.else
LR0 B 13  
.endif
PERM A 32 0 21
/*  mfmaIndex:26  */
v_mfma_f32_16x16x32_bf16 acc[64:67], v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], acc[64:67] // left value = acc[64+0:67+0]
.if \isOdd == 0
LR0 B 14
.else
PERM A 32 0 22
PERM A 32 0 23  
.endif
/*  mfmaIndex:27  */
v_mfma_f32_16x16x32_bf16 acc[68:71], v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprValuA_X0_I0+8+4:vgprValuA_X0_I0+8+4+3], acc[68:71] // left value = acc[68+0:71+0]
.if \isOdd == 1
LR0 B 14
.else
PERM A 32 0 22
PERM A 32 0 23  
.endif
/*  mfmaIndex:28  */
v_mfma_f32_16x16x32_bf16 acc[68:71], v[vgprValuB_X0_I0+16+4:vgprValuB_X0_I0+16+4+3], v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], acc[68:71] // left value = acc[68+0:71+0]
PERM A 40 1 0
PERM A 40 1 1
/*  mfmaIndex:29  */
v_mfma_f32_16x16x32_bf16 acc[68:71], v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], acc[68:71] // left value = acc[68+0:71+0]
PERM A 40 1 2
PERM A 40 1 3
/*  mfmaIndex:30  */
v_mfma_f32_16x16x32_bf16 acc[72:75], v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprValuA_X0_I0+16+4:vgprValuA_X0_I0+16+4+3], acc[72:75] // left value = acc[72+0:75+0]
.if \isOdd == 0
GR A 10 4096
GR A 11 4096
.else
LR0 B 15  
.endif
PERM A 40 1 4
/*  mfmaIndex:31  */
v_mfma_f32_16x16x32_bf16 acc[72:75], v[vgprValuB_X0_I0+16+4:vgprValuB_X0_I0+16+4+3], v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], acc[72:75] // left value = acc[72+0:75+0]
.if \isOdd == 1
GR A 10 4096
GR A 11 4096
.else
LR0 B 15  
.endif
PERM A 40 1 5
/*  mfmaIndex:32  */
v_mfma_f32_16x16x32_bf16 acc[72:75], v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], acc[72:75] // left value = acc[72+0:75+0]
PERM A 40 1 6
PERM A 40 1 7
/*  mfmaIndex:33  */
v_mfma_f32_16x16x32_bf16 acc[76:79], v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprValuA_X0_I0+24+4:vgprValuA_X0_I0+24+4+3], acc[76:79] // left value = acc[76+0:79+0]
PERM A 40 1 8
PERM A 40 1 9
/*  mfmaIndex:34  */
v_mfma_f32_16x16x32_bf16 acc[76:79], v[vgprValuB_X0_I0+16+4:vgprValuB_X0_I0+16+4+3], v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], acc[76:79] // left value = acc[76+0:79+0]
PERM A 40 1 10
PERM A 40 1 11
/*  mfmaIndex:35  */
v_mfma_f32_16x16x32_bf16 acc[76:79], v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], acc[76:79] // left value = acc[76+0:79+0]
PERM A 40 1 12
PERM A 40 1 13
/*  mfmaIndex:36  */
v_mfma_f32_16x16x32_bf16 acc[96:99], v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprValuA_X0_I0+0+4:vgprValuA_X0_I0+0+4+3], acc[96:99] // left value = acc[96+0:99+0]
.if \isOdd == 0
GR A 12 4096
GR A 13 4096
.endif
PERM A 40 1 14
/*  mfmaIndex:37  */
v_mfma_f32_16x16x32_bf16 acc[96:99], v[vgprValuB_X0_I0+24+4:vgprValuB_X0_I0+24+4+3], v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], acc[96:99] // left value = acc[96+0:99+0]
.if \isOdd == 1
GR A 12 4096
GR A 13 4096
.endif
PERM A 40 1 15
/*  mfmaIndex:38  */
v_mfma_f32_16x16x32_bf16 acc[96:99], v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], acc[96:99] // left value = acc[96+0:99+0]
PERM A 40 1 16
PERM A 40 1 17
/*  mfmaIndex:39  */
v_mfma_f32_16x16x32_bf16 acc[100:103], v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprValuA_X0_I0+8+4:vgprValuA_X0_I0+8+4+3], acc[100:103] // left value = acc[100+0:103+0]
PERM A 40 1 18
PERM A 40 1 19
/*  mfmaIndex:40  */
v_mfma_f32_16x16x32_bf16 acc[100:103], v[vgprValuB_X0_I0+24+4:vgprValuB_X0_I0+24+4+3], v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], acc[100:103] // left value = acc[100+0:103+0]
PERM A 40 1 20
PERM A 40 1 21
/*  mfmaIndex:41  */
v_mfma_f32_16x16x32_bf16 acc[100:103], v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], acc[100:103] // left value = acc[100+0:103+0]
.if \isOdd == 0
GR A 14 4096
GR A 15 4096
.else
PERM A 40 1 22
PERM A 40 1 23
.endif
/*  mfmaIndex:42  */
v_mfma_f32_16x16x32_bf16 acc[104:107], v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprValuA_X0_I0+16+4:vgprValuA_X0_I0+16+4+3], acc[104:107] // left value = acc[104+0:107+0]
.if \isOdd == 1
GR A 14 4096
GR A 15 4096
.else
PERM A 40 1 22
PERM A 40 1 23
.endif
/*  mfmaIndex:43  */
v_mfma_f32_16x16x32_bf16 acc[104:107], v[vgprValuB_X0_I0+24+4:vgprValuB_X0_I0+24+4+3], v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], acc[104:107] // left value = acc[104+0:107+0]
s_xor_b32 s[sgprLocalWriteAddrA], s[sgprSwapA], s[sgprLocalWriteAddrA] // swap Red Blk SGPR
PERM A 48 2 0
PERM A 48 2 1
/*  mfmaIndex:44  */
v_mfma_f32_16x16x32_bf16 acc[104:107], v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], acc[104:107] // left value = acc[104+0:107+0]
GRINC B 0
PERM A 48 2 2
PERM A 48 2 3
/*  mfmaIndex:45  */
v_mfma_f32_16x16x32_bf16 acc[108:111], v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprValuA_X0_I0+24+4:vgprValuA_X0_I0+24+4+3], acc[108:111] // left value = acc[108+0:111+0]
GRINC B 1
PERM A 48 2 4
PERM A 48 2 5
/*  mfmaIndex:46  */
v_mfma_f32_16x16x32_bf16 acc[108:111], v[vgprValuB_X0_I0+24+4:vgprValuB_X0_I0+24+4+3], v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], acc[108:111] // left value = acc[108+0:111+0]
GRINC B 2
PERM A 48 2 6
PERM A 48 2 7
/*  mfmaIndex:47  */
v_mfma_f32_16x16x32_bf16 acc[108:111], v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], acc[108:111] // left value = acc[108+0:111+0]
GRINC B 3
PERM A 48 2 8
PERM A 48 2 9

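// B0 A1 -- MFMAs are issued out of mfmaIndex order (48-53, 60-65, 72-77, 84-89, then 54-59, 66-71, 78-83, 90-95): the ones reading A_X0+48.. and A_X0+56.. come last, to allow more time for PERM A1 to complete.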

/*  mfmaIndex:48  */
v_mfma_f32_16x16x32_bf16 acc[16:19], v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprValuA_X0_I0+32+4:vgprValuA_X0_I0+32+4+3], acc[16:19] // left value = acc[16+0:19+0]
GRINC B 4
PERM A 48 2 10
PERM A 48 2 11
/*  mfmaIndex:49  */
v_mfma_f32_16x16x32_bf16 acc[16:19], v[vgprValuB_X0_I0+0+4:vgprValuB_X0_I0+0+4+3], v[vgprValuA_X0_I0+32:vgprValuA_X0_I0+32+3], acc[16:19] // left value = acc[16+0:19+0]
GRINC B 5
PERM A 48 2 12
PERM A 48 2 13
/*  mfmaIndex:50  */
v_mfma_f32_16x16x32_bf16 acc[16:19], v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprValuA_X0_I0+32:vgprValuA_X0_I0+32+3], acc[16:19] // left value = acc[16+0:19+0]
GRINC B 6
PERM A 48 2 14
PERM A 48 2 15
/*  mfmaIndex:51  */
v_mfma_f32_16x16x32_bf16 acc[20:23], v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprValuA_X0_I0+40+4:vgprValuA_X0_I0+40+4+3], acc[20:23] // left value = acc[20+0:23+0]
GRINC B 7
PERM A 48 2 16
PERM A 48 2 17
/*  mfmaIndex:52  */
v_mfma_f32_16x16x32_bf16 acc[20:23], v[vgprValuB_X0_I0+0+4:vgprValuB_X0_I0+0+4+3], v[vgprValuA_X0_I0+40:vgprValuA_X0_I0+40+3], acc[20:23] // left value = acc[20+0:23+0]
GRINC B 8
PERM A 48 2 18
PERM A 48 2 19
/*  mfmaIndex:53  */
v_mfma_f32_16x16x32_bf16 acc[20:23], v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprValuA_X0_I0+40:vgprValuA_X0_I0+40+3], acc[20:23] // left value = acc[20+0:23+0]
PERM A 48 2 20
PERM A 48 2 21
/*  mfmaIndex:60  */
v_mfma_f32_16x16x32_bf16 acc[48:51], v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprValuA_X0_I0+32+4:vgprValuA_X0_I0+32+4+3], acc[48:51] // left value = acc[48+0:51+0]
PERM A 48 2 22
PERM A 48 2 23
/*  mfmaIndex:61  */
v_mfma_f32_16x16x32_bf16 acc[48:51], v[vgprValuB_X0_I0+8+4:vgprValuB_X0_I0+8+4+3], v[vgprValuA_X0_I0+32:vgprValuA_X0_I0+32+3], acc[48:51] // left value = acc[48+0:51+0]
PERM A 56 3 0
PERM A 56 3 1
/*  mfmaIndex:62  */
v_mfma_f32_16x16x32_bf16 acc[48:51], v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprValuA_X0_I0+32:vgprValuA_X0_I0+32+3], acc[48:51] // left value = acc[48+0:51+0]
PERM A 56 3 2
PERM A 56 3 3
/*  mfmaIndex:63  */
v_mfma_f32_16x16x32_bf16 acc[52:55], v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprValuA_X0_I0+40+4:vgprValuA_X0_I0+40+4+3], acc[52:55] // left value = acc[52+0:55+0]
PERM A 56 3 4
PERM A 56 3 5
/*  mfmaIndex:64  */
v_mfma_f32_16x16x32_bf16 acc[52:55], v[vgprValuB_X0_I0+8+4:vgprValuB_X0_I0+8+4+3], v[vgprValuA_X0_I0+40:vgprValuA_X0_I0+40+3], acc[52:55] // left value = acc[52+0:55+0]
PERM A 56 3 6
PERM A 56 3 7
/*  mfmaIndex:65  */
v_mfma_f32_16x16x32_bf16 acc[52:55], v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprValuA_X0_I0+40:vgprValuA_X0_I0+40+3], acc[52:55] // left value = acc[52+0:55+0]
PERM A 56 3 8
PERM A 56 3 9
/*  mfmaIndex:72  */
v_mfma_f32_16x16x32_bf16 acc[80:83], v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprValuA_X0_I0+32+4:vgprValuA_X0_I0+32+4+3], acc[80:83] // left value = acc[80+0:83+0]
PERM A 56 3 10
PERM A 56 3 11
/*  mfmaIndex:73  */
v_mfma_f32_16x16x32_bf16 acc[80:83], v[vgprValuB_X0_I0+16+4:vgprValuB_X0_I0+16+4+3], v[vgprValuA_X0_I0+32:vgprValuA_X0_I0+32+3], acc[80:83] // left value = acc[80+0:83+0]
PERM A 56 3 12
PERM A 56 3 13
/*  mfmaIndex:74  */
v_mfma_f32_16x16x32_bf16 acc[80:83], v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprValuA_X0_I0+32:vgprValuA_X0_I0+32+3], acc[80:83] // left value = acc[80+0:83+0]
PERM A 56 3 14
PERM A 56 3 15
/*  mfmaIndex:75  */
v_mfma_f32_16x16x32_bf16 acc[84:87], v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprValuA_X0_I0+40+4:vgprValuA_X0_I0+40+4+3], acc[84:87] // left value = acc[84+0:87+0]
PERM A 56 3 16
PERM A 56 3 17
/*  mfmaIndex:76  */
v_mfma_f32_16x16x32_bf16 acc[84:87], v[vgprValuB_X0_I0+16+4:vgprValuB_X0_I0+16+4+3], v[vgprValuA_X0_I0+40:vgprValuA_X0_I0+40+3], acc[84:87] // left value = acc[84+0:87+0]
PERM A 56 3 18
PERM A 56 3 19
/*  mfmaIndex:77  */
v_mfma_f32_16x16x32_bf16 acc[84:87], v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprValuA_X0_I0+40:vgprValuA_X0_I0+40+3], acc[84:87] // left value = acc[84+0:87+0]
PERM A 56 3 20
PERM A 56 3 21
/*  mfmaIndex:84  */
v_mfma_f32_16x16x32_bf16 acc[112:115], v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprValuA_X0_I0+32+4:vgprValuA_X0_I0+32+4+3], acc[112:115] // left value = acc[112+0:115+0]
PERM A 56 3 22
PERM A 56 3 23
/*  mfmaIndex:85  */
v_mfma_f32_16x16x32_bf16 acc[112:115], v[vgprValuB_X0_I0+24+4:vgprValuB_X0_I0+24+4+3], v[vgprValuA_X0_I0+32:vgprValuA_X0_I0+32+3], acc[112:115] // left value = acc[112+0:115+0]
s_waitcnt lgkmcnt(0) // Wait for LR B1 to finish
CVT B 32 0 0
CVT B 32 0 1
/*  mfmaIndex:86  */
v_mfma_f32_16x16x32_bf16 acc[112:115], v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprValuA_X0_I0+32:vgprValuA_X0_I0+32+3], acc[112:115] // left value = acc[112+0:115+0]
CVT B 32 0 2
CVT B 32 0 3
/*  mfmaIndex:87  */
v_mfma_f32_16x16x32_bf16 acc[116:119], v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprValuA_X0_I0+40+4:vgprValuA_X0_I0+40+4+3], acc[116:119] // left value = acc[116+0:119+0]
CVT B 32 0 4
CVT B 32 0 5
/*  mfmaIndex:88  */
v_mfma_f32_16x16x32_bf16 acc[116:119], v[vgprValuB_X0_I0+24+4:vgprValuB_X0_I0+24+4+3], v[vgprValuA_X0_I0+40:vgprValuA_X0_I0+40+3], acc[116:119] // left value = acc[116+0:119+0]
CVT B 32 0 6
CVT B 32 0 7
/*  mfmaIndex:89  */
v_mfma_f32_16x16x32_bf16 acc[116:119], v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprValuA_X0_I0+40:vgprValuA_X0_I0+40+3], acc[116:119] // left value = acc[116+0:119+0]
CVT B 32 0 8
CVT B 32 0 9
/*  mfmaIndex:54  */
v_mfma_f32_16x16x32_bf16 acc[24:27], v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprValuA_X0_I0+48+4:vgprValuA_X0_I0+48+4+3], acc[24:27] // left value = acc[24+0:27+0]
CVT B 32 0 10
CVT B 32 0 11
/*  mfmaIndex:55  */
v_mfma_f32_16x16x32_bf16 acc[24:27], v[vgprValuB_X0_I0+0+4:vgprValuB_X0_I0+0+4+3], v[vgprValuA_X0_I0+48:vgprValuA_X0_I0+48+3], acc[24:27] // left value = acc[24+0:27+0]
CVT B 32 0 12
CVT B 32 0 13
/*  mfmaIndex:56  */
v_mfma_f32_16x16x32_bf16 acc[24:27], v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprValuA_X0_I0+48:vgprValuA_X0_I0+48+3], acc[24:27] // left value = acc[24+0:27+0]
CVT B 32 0 14
CVT B 32 0 15
/*  mfmaIndex:57  */
v_mfma_f32_16x16x32_bf16 acc[28:31], v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprValuA_X0_I0+56+4:vgprValuA_X0_I0+56+4+3], acc[28:31] // left value = acc[28+0:31+0]
CVT B 32 0 16
CVT B 32 0 17
/*  mfmaIndex:58  */
v_mfma_f32_16x16x32_bf16 acc[28:31], v[vgprValuB_X0_I0+0+4:vgprValuB_X0_I0+0+4+3], v[vgprValuA_X0_I0+56:vgprValuA_X0_I0+56+3], acc[28:31] // left value = acc[28+0:31+0]
CVT B 32 0 18
CVT B 32 0 19
/*  mfmaIndex:59  */
v_mfma_f32_16x16x32_bf16 acc[28:31], v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprValuA_X0_I0+56:vgprValuA_X0_I0+56+3], acc[28:31] // left value = acc[28+0:31+0]
CVT B 32 0 20
CVT B 32 0 21
/*  mfmaIndex:66  */
v_mfma_f32_16x16x32_bf16 acc[56:59], v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprValuA_X0_I0+48+4:vgprValuA_X0_I0+48+4+3], acc[56:59] // left value = acc[56+0:59+0]
CVT B 32 0 22
CVT B 32 0 23
/*  mfmaIndex:67  */
v_mfma_f32_16x16x32_bf16 acc[56:59], v[vgprValuB_X0_I0+8+4:vgprValuB_X0_I0+8+4+3], v[vgprValuA_X0_I0+48:vgprValuA_X0_I0+48+3], acc[56:59] // left value = acc[56+0:59+0]
CVT B 40 8 0
CVT B 40 8 1
/*  mfmaIndex:68  */
v_mfma_f32_16x16x32_bf16 acc[56:59], v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprValuA_X0_I0+48:vgprValuA_X0_I0+48+3], acc[56:59] // left value = acc[56+0:59+0]
CVT B 40 8 2
CVT B 40 8 3
/*  mfmaIndex:69  */
v_mfma_f32_16x16x32_bf16 acc[60:63], v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprValuA_X0_I0+56+4:vgprValuA_X0_I0+56+4+3], acc[60:63] // left value = acc[60+0:63+0]
CVT B 40 8 4
CVT B 40 8 5
/*  mfmaIndex:70  */
v_mfma_f32_16x16x32_bf16 acc[60:63], v[vgprValuB_X0_I0+8+4:vgprValuB_X0_I0+8+4+3], v[vgprValuA_X0_I0+56:vgprValuA_X0_I0+56+3], acc[60:63] // left value = acc[60+0:63+0]
CVT B 40 8 6
CVT B 40 8 7
/*  mfmaIndex:71  */
v_mfma_f32_16x16x32_bf16 acc[60:63], v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprValuA_X0_I0+56:vgprValuA_X0_I0+56+3], acc[60:63] // left value = acc[60+0:63+0]
CVT B 40 8 8
CVT B 40 8 9
/*  mfmaIndex:78  */
v_mfma_f32_16x16x32_bf16 acc[88:91], v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprValuA_X0_I0+48+4:vgprValuA_X0_I0+48+4+3], acc[88:91] // left value = acc[88+0:91+0]
s_waitcnt vmcnt(16) // Wait for the previous GR A to finish so LR A0 can start. Issued earlier: 8A + 8B; just issued: 8A. Allowing 16 outstanding lets the earlier 8B plus the just-issued 8A stay in flight.
CVT B 40 8 10
CVT B 40 8 11
/*  mfmaIndex:79  */
v_mfma_f32_16x16x32_bf16 acc[88:91], v[vgprValuB_X0_I0+16+4:vgprValuB_X0_I0+16+4+3], v[vgprValuA_X0_I0+48:vgprValuA_X0_I0+48+3], acc[88:91] // left value = acc[88+0:91+0]
s_barrier // barrier precedes the GR B / LR1 A sequence that is issued starting after mfmaIndex:80
/*  mfmaIndex:80  */
v_mfma_f32_16x16x32_bf16 acc[88:91], v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprValuA_X0_I0+48:vgprValuA_X0_I0+48+3], acc[88:91] // left value = acc[88+0:91+0]
.if \isOdd == 0
GR B 0 4160
v_xor_b32 v[vgprLocalReadAddrB], v[vgprLocalReadSwapAddrB], v[vgprLocalReadAddrB] // swap Red Blk
GR B 1 4160
.else
LR1 A 0 // LR A0 reads to A_T0, so can use A_X0
LR1 A 1
.endif
/*  mfmaIndex:81  */
v_mfma_f32_16x16x32_bf16 acc[92:95], v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprValuA_X0_I0+56+4:vgprValuA_X0_I0+56+4+3], acc[92:95] // left value = acc[92+0:95+0]
.if \isOdd == 1
GR B 0 4160
v_xor_b32 v[vgprLocalReadAddrB], v[vgprLocalReadSwapAddrB], v[vgprLocalReadAddrB] // swap Red Blk
GR B 1 4160
.else
LR1 A 0
LR1 A 1
.endif
/*  mfmaIndex:82  */
v_mfma_f32_16x16x32_bf16 acc[92:95], v[vgprValuB_X0_I0+16+4:vgprValuB_X0_I0+16+4+3], v[vgprValuA_X0_I0+56:vgprValuA_X0_I0+56+3], acc[92:95] // left value = acc[92+0:95+0]
.if \isOdd == 0
GR B 2 4160
GR B 3 4160
.else
LR1 A 2
LR1 A 3
.endif
/*  mfmaIndex:83  */
v_mfma_f32_16x16x32_bf16 acc[92:95], v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprValuA_X0_I0+56:vgprValuA_X0_I0+56+3], acc[92:95] // left value = acc[92+0:95+0]
.if \isOdd == 1
GR B 2 4160
GR B 3 4160
.else
LR1 A 2
LR1 A 3
.endif
/*  mfmaIndex:90  */
v_mfma_f32_16x16x32_bf16 acc[120:123], v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprValuA_X0_I0+48+4:vgprValuA_X0_I0+48+4+3], acc[120:123] // left value = acc[120+0:123+0]
.if \isOdd == 0
GR B 4 4160
GR B 5 4160
.else
LR1 A 4
LR1 A 5
.endif
/*  mfmaIndex:91  */
v_mfma_f32_16x16x32_bf16 acc[120:123], v[vgprValuB_X0_I0+24+4:vgprValuB_X0_I0+24+4+3], v[vgprValuA_X0_I0+48:vgprValuA_X0_I0+48+3], acc[120:123] // left value = acc[120+0:123+0]
.if \isOdd == 1
GR B 4 4160
GR B 5 4160
.else
LR1 A 4
LR1 A 5
.endif
/*  mfmaIndex:92  */
v_mfma_f32_16x16x32_bf16 acc[120:123], v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprValuA_X0_I0+48:vgprValuA_X0_I0+48+3], acc[120:123] // left value = acc[120+0:123+0]
.if \isOdd == 0
GR B 6 4160
GR B 7 4160
.else
LR1 A 6
LR1 A 7
.endif
/*  mfmaIndex:93  */
v_mfma_f32_16x16x32_bf16 acc[124:127], v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprValuA_X0_I0+56+4:vgprValuA_X0_I0+56+4+3], acc[124:127] // left value = acc[124+0:127+0]
.if \isOdd == 1
GR B 6 4160
GR B 7 4160
.else
LR1 A 6
LR1 A 7
.endif
/*  mfmaIndex:94  */
v_mfma_f32_16x16x32_bf16 acc[124:127], v[vgprValuB_X0_I0+24+4:vgprValuB_X0_I0+24+4+3], v[vgprValuA_X0_I0+56:vgprValuA_X0_I0+56+3], acc[124:127] // left value = acc[124+0:127+0]
.if \isOdd == 0
GR B 8 4160
GR B 9 4160
.else
CVT B 40 8 12
CVT B 40 8 13
.endif
/*  mfmaIndex:95  */
v_mfma_f32_16x16x32_bf16 acc[124:127], v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprValuA_X0_I0+56:vgprValuA_X0_I0+56+3], acc[124:127] // left value = acc[124+0:127+0]
.if \isOdd == 1
GR B 8 4160
GR B 9 4160
.else
CVT B 40 8 12
CVT B 40 8 13
.endif


// B1 A0

/*  mfmaIndex:96  */
v_mfma_f32_16x16x32_bf16 acc[128:131], v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprValuA_X0_I0+0+4:vgprValuA_X0_I0+0+4+3], acc[128:131] // left value = acc[128+0:131+0]
CVT B 40 8 14
CVT B 40 8 15
/*  mfmaIndex:97  */
v_mfma_f32_16x16x32_bf16 acc[128:131], v[vgprValuB_X0_I0+32+4:vgprValuB_X0_I0+32+4+3], v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], acc[128:131] // left value = acc[128+0:131+0]
CVT B 40 8 16
CVT B 40 8 17
/*  mfmaIndex:98  */
v_mfma_f32_16x16x32_bf16 acc[128:131], v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], acc[128:131] // left value = acc[128+0:131+0]
CVT B 40 8 18
CVT B 40 8 19
/*  mfmaIndex:99  */
v_mfma_f32_16x16x32_bf16 acc[132:135], v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprValuA_X0_I0+8+4:vgprValuA_X0_I0+8+4+3], acc[132:135] // left value = acc[132+0:135+0]
CVT B 40 8 20
CVT B 40 8 21
/*  mfmaIndex:100  */
v_mfma_f32_16x16x32_bf16 acc[132:135], v[vgprValuB_X0_I0+32+4:vgprValuB_X0_I0+32+4+3], v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], acc[132:135] // left value = acc[132+0:135+0]
CVT B 40 8 22
CVT B 40 8 23
/*  mfmaIndex:101  */
v_mfma_f32_16x16x32_bf16 acc[132:135], v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], acc[132:135] // left value = acc[132+0:135+0]
CVT B 48 16 0
CVT B 48 16 1
/*  mfmaIndex:102  */
v_mfma_f32_16x16x32_bf16 acc[136:139], v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprValuA_X0_I0+16+4:vgprValuA_X0_I0+16+4+3], acc[136:139] // left value = acc[136+0:139+0]
CVT B 48 16 2
CVT B 48 16 3
/*  mfmaIndex:103  */
v_mfma_f32_16x16x32_bf16 acc[136:139], v[vgprValuB_X0_I0+32+4:vgprValuB_X0_I0+32+4+3], v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], acc[136:139] // left value = acc[136+0:139+0]
CVT B 48 16 4
CVT B 48 16 5
/*  mfmaIndex:104  */
v_mfma_f32_16x16x32_bf16 acc[136:139], v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], acc[136:139] // left value = acc[136+0:139+0]
CVT B 48 16 6
CVT B 48 16 7
/*  mfmaIndex:105  */
v_mfma_f32_16x16x32_bf16 acc[140:143], v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprValuA_X0_I0+24+4:vgprValuA_X0_I0+24+4+3], acc[140:143] // left value = acc[140+0:143+0]
CVT B 48 16 8
CVT B 48 16 9
/*  mfmaIndex:106  */
v_mfma_f32_16x16x32_bf16 acc[140:143], v[vgprValuB_X0_I0+32+4:vgprValuB_X0_I0+32+4+3], v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], acc[140:143] // left value = acc[140+0:143+0]
CVT B 48 16 10
CVT B 48 16 11
/*  mfmaIndex:107  */
v_mfma_f32_16x16x32_bf16 acc[140:143], v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], acc[140:143] // left value = acc[140+0:143+0]
CVT B 48 16 12
CVT B 48 16 13
/*  mfmaIndex:108  */
v_mfma_f32_16x16x32_bf16 acc[160:163], v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprValuA_X0_I0+0+4:vgprValuA_X0_I0+0+4+3], acc[160:163] // left value = acc[160+0:163+0]
CVT B 48 16 14
CVT B 48 16 15
/*  mfmaIndex:109  */
v_mfma_f32_16x16x32_bf16 acc[160:163], v[vgprValuB_X0_I0+40+4:vgprValuB_X0_I0+40+4+3], v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], acc[160:163] // left value = acc[160+0:163+0]
CVT B 48 16 16
CVT B 48 16 17
/*  mfmaIndex:110  */
v_mfma_f32_16x16x32_bf16 acc[160:163], v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], acc[160:163] // left value = acc[160+0:163+0]
CVT B 48 16 18
CVT B 48 16 19
/*  mfmaIndex:111  */
v_mfma_f32_16x16x32_bf16 acc[164:167], v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprValuA_X0_I0+8+4:vgprValuA_X0_I0+8+4+3], acc[164:167] // left value = acc[164+0:167+0]
CVT B 48 16 20
CVT B 48 16 21
/*  mfmaIndex:112  */
v_mfma_f32_16x16x32_bf16 acc[164:167], v[vgprValuB_X0_I0+40+4:vgprValuB_X0_I0+40+4+3], v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], acc[164:167] // left value = acc[164+0:167+0]
CVT B 48 16 22
CVT B 48 16 23
/*  mfmaIndex:113  */
v_mfma_f32_16x16x32_bf16 acc[164:167], v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], acc[164:167] // left value = acc[164+0:167+0]
CVT B 56 24 0
CVT B 56 24 1
/*  mfmaIndex:114  */
v_mfma_f32_16x16x32_bf16 acc[168:171], v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprValuA_X0_I0+16+4:vgprValuA_X0_I0+16+4+3], acc[168:171] // left value = acc[168+0:171+0]
CVT B 56 24 2
CVT B 56 24 3
/*  mfmaIndex:115  */
v_mfma_f32_16x16x32_bf16 acc[168:171], v[vgprValuB_X0_I0+40+4:vgprValuB_X0_I0+40+4+3], v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], acc[168:171] // left value = acc[168+0:171+0]
CVT B 56 24 4
CVT B 56 24 5
/*  mfmaIndex:116  */
v_mfma_f32_16x16x32_bf16 acc[168:171], v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], acc[168:171] // left value = acc[168+0:171+0]
CVT B 56 24 6
CVT B 56 24 7
/*  mfmaIndex:117  */
v_mfma_f32_16x16x32_bf16 acc[172:175], v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprValuA_X0_I0+24+4:vgprValuA_X0_I0+24+4+3], acc[172:175] // left value = acc[172+0:175+0]
CVT B 56 24 8
CVT B 56 24 9
/*  mfmaIndex:118  */
v_mfma_f32_16x16x32_bf16 acc[172:175], v[vgprValuB_X0_I0+40+4:vgprValuB_X0_I0+40+4+3], v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], acc[172:175] // left value = acc[172+0:175+0]
CVT B 56 24 10
CVT B 56 24 11
/*  mfmaIndex:119  */
v_mfma_f32_16x16x32_bf16 acc[172:175], v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], acc[172:175] // left value = acc[172+0:175+0]
CVT B 56 24 12
CVT B 56 24 13
/*  mfmaIndex:120  */
v_mfma_f32_16x16x32_bf16 acc[192:195], v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprValuA_X0_I0+0+4:vgprValuA_X0_I0+0+4+3], acc[192:195] // left value = acc[192+0:195+0]
CVT B 56 24 14
CVT B 56 24 15
/*  mfmaIndex:121  */
v_mfma_f32_16x16x32_bf16 acc[192:195], v[vgprValuB_X0_I0+48+4:vgprValuB_X0_I0+48+4+3], v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], acc[192:195] // left value = acc[192+0:195+0]
CVT B 56 24 16
CVT B 56 24 17
/*  mfmaIndex:122  */
v_mfma_f32_16x16x32_bf16 acc[192:195], v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], acc[192:195] // left value = acc[192+0:195+0]
CVT B 56 24 18
CVT B 56 24 19
/*  mfmaIndex:123  */
v_mfma_f32_16x16x32_bf16 acc[196:199], v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprValuA_X0_I0+8+4:vgprValuA_X0_I0+8+4+3], acc[196:199] // left value = acc[196+0:199+0]
s_waitcnt vmcnt(8 + 5) & lgkmcnt(0) // Wait for the previous GR B and the previous LR A0 to finish. Just issued: 8A + 5B (the 8 + 5 that may stay in flight); issued earlier: 8A + 8B.
CVT B 56 24 20
CVT B 56 24 21
/*  mfmaIndex:124  */
v_mfma_f32_16x16x32_bf16 acc[196:199], v[vgprValuB_X0_I0+48+4:vgprValuB_X0_I0+48+4+3], v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], acc[196:199] // left value = acc[196+0:199+0]
s_barrier // barrier precedes the GR B 10.. / LR0 B 0.. sequence that is issued starting after mfmaIndex:125
/*  mfmaIndex:125  */
v_mfma_f32_16x16x32_bf16 acc[196:199], v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], acc[196:199] // left value = acc[196+0:199+0]
.if \isOdd == 0
GR B 10 4160
GR B 11 4160
.else
LR0 B 0
.endif
CVT B 56 24 22
/*  mfmaIndex:126  */
v_mfma_f32_16x16x32_bf16 acc[200:203], v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprValuA_X0_I0+16+4:vgprValuA_X0_I0+16+4+3], acc[200:203] // left value = acc[200+0:203+0]
.if \isOdd == 1
GR B 10 4160
GR B 11 4160
.else
LR0 B 0
.endif
CVT B 56 24 23
/*  mfmaIndex:127  */
v_mfma_f32_16x16x32_bf16 acc[200:203], v[vgprValuB_X0_I0+48+4:vgprValuB_X0_I0+48+4+3], v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], acc[200:203] // left value = acc[200+0:203+0]
.if \isOdd == 0
GR B 12 4160
GR B 13 4160
.else
LR0 B 1
.endif
// First of four v_mov_b64 copies (after mfmaIndex 127-130) that stage A_X0+0..+7 into A_T1+0..+7.
// mfmaIndex 132-134 read their A operand from vgprValuA_T1_I0 instead of vgprValuA_X0_I0.
// NOTE(review): presumably needed because the PERM A 0 0 n steps that start after mfmaIndex 131 overwrite A_X0+0.. -- confirm against the PERM macro.
v_mov_b64 v[vgprValuA_T1_I0+0:vgprValuA_T1_I0+1], v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+1]
/*  mfmaIndex:128  */
v_mfma_f32_16x16x32_bf16 acc[200:203], v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], acc[200:203] // left value = acc[200+0:203+0]
.if \isOdd == 1
GR B 12 4160
GR B 13 4160
.else
LR0 B 1
.endif
v_mov_b64 v[vgprValuA_T1_I0+2:vgprValuA_T1_I0+3], v[vgprValuA_X0_I0+2:vgprValuA_X0_I0+3]
/*  mfmaIndex:129  */
v_mfma_f32_16x16x32_bf16 acc[204:207], v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprValuA_X0_I0+24+4:vgprValuA_X0_I0+24+4+3], acc[204:207] // left value = acc[204+0:207+0]
.if \isOdd == 0
GR B 14 4160
GR B 15 4160
.else
LR0 B 2
.endif
v_mov_b64 v[vgprValuA_T1_I0+4:vgprValuA_T1_I0+5], v[vgprValuA_X0_I0+4:vgprValuA_X0_I0+5]
/*  mfmaIndex:130  */
v_mfma_f32_16x16x32_bf16 acc[204:207], v[vgprValuB_X0_I0+48+4:vgprValuB_X0_I0+48+4+3], v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], acc[204:207] // left value = acc[204+0:207+0]
.if \isOdd == 1
GR B 14 4160
GR B 15 4160
.else
LR0 B 2
.endif
v_mov_b64 v[vgprValuA_T1_I0+6:vgprValuA_T1_I0+7], v[vgprValuA_X0_I0+6:vgprValuA_X0_I0+7]
/*  mfmaIndex:131  */
v_mfma_f32_16x16x32_bf16 acc[204:207], v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], acc[204:207] // left value = acc[204+0:207+0]
s_xor_b32 s[sgprLocalWriteAddrB], s[sgprSwapB], s[sgprLocalWriteAddrB] // swap Red Blk SGPR
.if \isOdd == 0
LR0 B 3
PERM A 0 0 0
.else
PERM A 0 0 0
PERM A 0 0 1
.endif
/*  mfmaIndex:132  */
v_mfma_f32_16x16x32_bf16 acc[224:227], v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprValuA_T1_I0+0+4:vgprValuA_T1_I0+0+4+3], acc[224:227] // left value = acc[224+0:227+0]
.if \isOdd == 1
LR0 B 3
PERM A 0 0 2
.else
PERM A 0 0 1
PERM A 0 0 2
.endif
/*  mfmaIndex:133  */
v_mfma_f32_16x16x32_bf16 acc[224:227], v[vgprValuB_X0_I0+56+4:vgprValuB_X0_I0+56+4+3], v[vgprValuA_T1_I0+0:vgprValuA_T1_I0+0+3], acc[224:227] // left value = acc[224+0:227+0]
.if \isOdd == 0
LR0 B 4
PERM A 0 0 3
.else
PERM A 0 0 3
PERM A 0 0 4
.endif
/*  mfmaIndex:134  */
v_mfma_f32_16x16x32_bf16 acc[224:227], v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprValuA_T1_I0+0:vgprValuA_T1_I0+0+3], acc[224:227] // left value = acc[224+0:227+0]
.if \isOdd == 1
LR0 B 4
PERM A 0 0 5
.else
PERM A 0 0 4
PERM A 0 0 5
.endif
/*  mfmaIndex:135  */
v_mfma_f32_16x16x32_bf16 acc[228:231], v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprValuA_X0_I0+8+4:vgprValuA_X0_I0+8+4+3], acc[228:231] // left value = acc[228+0:231+0]
.if \isOdd == 0
LR0 B 5
PERM A 0 0 6
.else
PERM A 0 0 6
PERM A 0 0 7
.endif
/*  mfmaIndex:136  */
v_mfma_f32_16x16x32_bf16 acc[228:231], v[vgprValuB_X0_I0+56+4:vgprValuB_X0_I0+56+4+3], v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], acc[228:231] // left value = acc[228+0:231+0]
.if \isOdd == 1
LR0 B 5
PERM A 0 0 8
.else
PERM A 0 0 7
PERM A 0 0 8
.endif
/*  mfmaIndex:137  */
v_mfma_f32_16x16x32_bf16 acc[228:231], v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], acc[228:231] // left value = acc[228+0:231+0]
.if \isOdd == 0
LR0 B 6
PERM A 0 0 9
.else
PERM A 0 0 9
PERM A 0 0 10
.endif
/*  mfmaIndex:138  */
v_mfma_f32_16x16x32_bf16 acc[232:235], v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprValuA_X0_I0+16+4:vgprValuA_X0_I0+16+4+3], acc[232:235] // left value = acc[232+0:235+0]
.if \isOdd == 1
LR0 B 6
PERM A 0 0 11
.else
PERM A 0 0 10
PERM A 0 0 11
.endif
/*  mfmaIndex:139  */
v_mfma_f32_16x16x32_bf16 acc[232:235], v[vgprValuB_X0_I0+56+4:vgprValuB_X0_I0+56+4+3], v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], acc[232:235] // left value = acc[232+0:235+0]
PERM A 0 0 12
PERM A 0 0 13
/*  mfmaIndex:140  */
v_mfma_f32_16x16x32_bf16 acc[232:235], v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], acc[232:235] // left value = acc[232+0:235+0]
.if \isOdd == 0
LR0 B 7
PERM A 0 0 14
.else
PERM A 0 0 14
PERM A 0 0 15
.endif
/*  mfmaIndex:141  */
v_mfma_f32_16x16x32_bf16 acc[236:239], v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprValuA_X0_I0+24+4:vgprValuA_X0_I0+24+4+3], acc[236:239] // left value = acc[236+0:239+0]
.if \isOdd == 1
LR0 B 7
PERM A 0 0 16
.else
PERM A 0 0 15
PERM A 0 0 16
.endif
/*  mfmaIndex:142  */
v_mfma_f32_16x16x32_bf16 acc[236:239], v[vgprValuB_X0_I0+56+4:vgprValuB_X0_I0+56+4+3], v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], acc[236:239] // left value = acc[236+0:239+0]
PERM A 0 0 17
PERM A 0 0 18
/*  mfmaIndex:143  */
v_mfma_f32_16x16x32_bf16 acc[236:239], v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], acc[236:239] // left value = acc[236+0:239+0]
PERM A 0 0 19
PERM A 0 0 20
// A1 B1
/*  mfmaIndex:144  */
v_mfma_f32_16x16x32_bf16 acc[144:147], v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprValuA_X0_I0+32+4:vgprValuA_X0_I0+32+4+3], acc[144:147] // left value = acc[144+0:147+0]
PERM A 0 0 21
PERM A 0 0 22
PERM A 0 0 23
/*  mfmaIndex:145  */
v_mfma_f32_16x16x32_bf16 acc[144:147], v[vgprValuB_X0_I0+32+4:vgprValuB_X0_I0+32+4+3], v[vgprValuA_X0_I0+32:vgprValuA_X0_I0+32+3], acc[144:147] // left value = acc[144+0:147+0]
PERM A 8 1 0
PERM A 8 1 1
/*  mfmaIndex:146  */
v_mfma_f32_16x16x32_bf16 acc[144:147], v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprValuA_X0_I0+32:vgprValuA_X0_I0+32+3], acc[144:147] // left value = acc[144+0:147+0]
PERM A 8 1 2
PERM A 8 1 3
/*  mfmaIndex:147  */
v_mfma_f32_16x16x32_bf16 acc[148:151], v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprValuA_X0_I0+40+4:vgprValuA_X0_I0+40+4+3], acc[148:151] // left value = acc[148+0:151+0]
PERM A 8 1 4
PERM A 8 1 5
/*  mfmaIndex:148  */
v_mfma_f32_16x16x32_bf16 acc[148:151], v[vgprValuB_X0_I0+32+4:vgprValuB_X0_I0+32+4+3], v[vgprValuA_X0_I0+40:vgprValuA_X0_I0+40+3], acc[148:151] // left value = acc[148+0:151+0]
PERM A 8 1 6
PERM A 8 1 7
/*  mfmaIndex:149  */
v_mfma_f32_16x16x32_bf16 acc[148:151], v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprValuA_X0_I0+40:vgprValuA_X0_I0+40+3], acc[148:151] // left value = acc[148+0:151+0]
PERM A 8 1 8
PERM A 8 1 9
/*  mfmaIndex:150  */
v_mfma_f32_16x16x32_bf16 acc[152:155], v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprValuA_X0_I0+48+4:vgprValuA_X0_I0+48+4+3], acc[152:155] // left value = acc[152+0:155+0]
PERM A 8 1 10
PERM A 8 1 11
/*  mfmaIndex:151  */
v_mfma_f32_16x16x32_bf16 acc[152:155], v[vgprValuB_X0_I0+32+4:vgprValuB_X0_I0+32+4+3], v[vgprValuA_X0_I0+48:vgprValuA_X0_I0+48+3], acc[152:155] // left value = acc[152+0:155+0]
PERM A 8 1 12
PERM A 8 1 13
/*  mfmaIndex:152  */
v_mfma_f32_16x16x32_bf16 acc[152:155], v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprValuA_X0_I0+48:vgprValuA_X0_I0+48+3], acc[152:155] // left value = acc[152+0:155+0]
PERM A 8 1 14
PERM A 8 1 15
/*  mfmaIndex:153  */
v_mfma_f32_16x16x32_bf16 acc[156:159], v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprValuA_X0_I0+56+4:vgprValuA_X0_I0+56+4+3], acc[156:159] // left value = acc[156+0:159+0]
PERM A 8 1 16
PERM A 8 1 17
/*  mfmaIndex:154  */
v_mfma_f32_16x16x32_bf16 acc[156:159], v[vgprValuB_X0_I0+32+4:vgprValuB_X0_I0+32+4+3], v[vgprValuA_X0_I0+56:vgprValuA_X0_I0+56+3], acc[156:159] // left value = acc[156+0:159+0]
PERM A 8 1 18
PERM A 8 1 19
/*  mfmaIndex:155  */
v_mfma_f32_16x16x32_bf16 acc[156:159], v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprValuA_X0_I0+56:vgprValuA_X0_I0+56+3], acc[156:159] // left value = acc[156+0:159+0]
PERM A 8 1 20
PERM A 8 1 21
/*  mfmaIndex:156  */
v_mfma_f32_16x16x32_bf16 acc[176:179], v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprValuA_X0_I0+32+4:vgprValuA_X0_I0+32+4+3], acc[176:179] // left value = acc[176+0:179+0]
PERM A 8 1 22
PERM A 8 1 23
/*  mfmaIndex:157  */
v_mfma_f32_16x16x32_bf16 acc[176:179], v[vgprValuB_X0_I0+40+4:vgprValuB_X0_I0+40+4+3], v[vgprValuA_X0_I0+32:vgprValuA_X0_I0+32+3], acc[176:179] // left value = acc[176+0:179+0]
PERM A 16 2 0
PERM A 16 2 1
/*  mfmaIndex:158  */
v_mfma_f32_16x16x32_bf16 acc[176:179], v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprValuA_X0_I0+32:vgprValuA_X0_I0+32+3], acc[176:179] // left value = acc[176+0:179+0]
PERM A 16 2 2
PERM A 16 2 3
/*  mfmaIndex:159  */
v_mfma_f32_16x16x32_bf16 acc[180:183], v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprValuA_X0_I0+40+4:vgprValuA_X0_I0+40+4+3], acc[180:183] // left value = acc[180+0:183+0]
PERM A 16 2 4
PERM A 16 2 5
/*  mfmaIndex:160  */
v_mfma_f32_16x16x32_bf16 acc[180:183], v[vgprValuB_X0_I0+40+4:vgprValuB_X0_I0+40+4+3], v[vgprValuA_X0_I0+40:vgprValuA_X0_I0+40+3], acc[180:183] // left value = acc[180+0:183+0]
PERM A 16 2 6
PERM A 16 2 7
/*  mfmaIndex:161  */
v_mfma_f32_16x16x32_bf16 acc[180:183], v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprValuA_X0_I0+40:vgprValuA_X0_I0+40+3], acc[180:183] // left value = acc[180+0:183+0]
PERM A 16 2 8
PERM A 16 2 9
/*  mfmaIndex:162  */
v_mfma_f32_16x16x32_bf16 acc[184:187], v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprValuA_X0_I0+48+4:vgprValuA_X0_I0+48+4+3], acc[184:187] // left value = acc[184+0:187+0]
PERM A 16 2 10
PERM A 16 2 11
/*  mfmaIndex:163  */
v_mfma_f32_16x16x32_bf16 acc[184:187], v[vgprValuB_X0_I0+40+4:vgprValuB_X0_I0+40+4+3], v[vgprValuA_X0_I0+48:vgprValuA_X0_I0+48+3], acc[184:187] // left value = acc[184+0:187+0]
PERM A 16 2 12
PERM A 16 2 13
/*  mfmaIndex:164  */
v_mfma_f32_16x16x32_bf16 acc[184:187], v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprValuA_X0_I0+48:vgprValuA_X0_I0+48+3], acc[184:187] // left value = acc[184+0:187+0]
PERM A 16 2 14
PERM A 16 2 15
/*  mfmaIndex:165  */
v_mfma_f32_16x16x32_bf16 acc[188:191], v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprValuA_X0_I0+56+4:vgprValuA_X0_I0+56+4+3], acc[188:191] // left value = acc[188+0:191+0]
PERM A 16 2 16
PERM A 16 2 17
/*  mfmaIndex:166  */
v_mfma_f32_16x16x32_bf16 acc[188:191], v[vgprValuB_X0_I0+40+4:vgprValuB_X0_I0+40+4+3], v[vgprValuA_X0_I0+56:vgprValuA_X0_I0+56+3], acc[188:191] // left value = acc[188+0:191+0]
PERM A 16 2 18
PERM A 16 2 19
/*  mfmaIndex:167  */
v_mfma_f32_16x16x32_bf16 acc[188:191], v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprValuA_X0_I0+56:vgprValuA_X0_I0+56+3], acc[188:191] // left value = acc[188+0:191+0]
PERM A 16 2 20
PERM A 16 2 21
/*  mfmaIndex:168  */
v_mfma_f32_16x16x32_bf16 acc[208:211], v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprValuA_X0_I0+32+4:vgprValuA_X0_I0+32+4+3], acc[208:211] // left value = acc[208+0:211+0]
PERM A 16 2 22
PERM A 16 2 23
/*  mfmaIndex:169  */
v_mfma_f32_16x16x32_bf16 acc[208:211], v[vgprValuB_X0_I0+48+4:vgprValuB_X0_I0+48+4+3], v[vgprValuA_X0_I0+32:vgprValuA_X0_I0+32+3], acc[208:211] // left value = acc[208+0:211+0]
PERM A 24 3 0
PERM A 24 3 1
/*  mfmaIndex:170  */
v_mfma_f32_16x16x32_bf16 acc[208:211], v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprValuA_X0_I0+32:vgprValuA_X0_I0+32+3], acc[208:211] // left value = acc[208+0:211+0]
PERM A 24 3 2
PERM A 24 3 3
/*  mfmaIndex:171  */
v_mfma_f32_16x16x32_bf16 acc[212:215], v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprValuA_X0_I0+40+4:vgprValuA_X0_I0+40+4+3], acc[212:215] // left value = acc[212+0:215+0]
PERM A 24 3 4
PERM A 24 3 5
/*  mfmaIndex:172  */
v_mfma_f32_16x16x32_bf16 acc[212:215], v[vgprValuB_X0_I0+48+4:vgprValuB_X0_I0+48+4+3], v[vgprValuA_X0_I0+40:vgprValuA_X0_I0+40+3], acc[212:215] // left value = acc[212+0:215+0]
PERM A 24 3 6
PERM A 24 3 7
/*  mfmaIndex:173  */
v_mfma_f32_16x16x32_bf16 acc[212:215], v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprValuA_X0_I0+40:vgprValuA_X0_I0+40+3], acc[212:215] // left value = acc[212+0:215+0]
PERM A 24 3 8
PERM A 24 3 9
/*  mfmaIndex:174  */
v_mfma_f32_16x16x32_bf16 acc[216:219], v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprValuA_X0_I0+48+4:vgprValuA_X0_I0+48+4+3], acc[216:219] // left value = acc[216+0:219+0]
PERM A 24 3 10
PERM A 24 3 11
/*  mfmaIndex:175  */
v_mfma_f32_16x16x32_bf16 acc[216:219], v[vgprValuB_X0_I0+48+4:vgprValuB_X0_I0+48+4+3], v[vgprValuA_X0_I0+48:vgprValuA_X0_I0+48+3], acc[216:219] // left value = acc[216+0:219+0]
PERM A 24 3 12
PERM A 24 3 13
/*  mfmaIndex:176  */
v_mfma_f32_16x16x32_bf16 acc[216:219], v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprValuA_X0_I0+48:vgprValuA_X0_I0+48+3], acc[216:219] // left value = acc[216+0:219+0]
PERM A 24 3 14
PERM A 24 3 15
/*  mfmaIndex:177  */
v_mfma_f32_16x16x32_bf16 acc[220:223], v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprValuA_X0_I0+56+4:vgprValuA_X0_I0+56+4+3], acc[220:223] // left value = acc[220+0:223+0]
PERM A 24 3 16
PERM A 24 3 17
/*  mfmaIndex:178  */
v_mfma_f32_16x16x32_bf16 acc[220:223], v[vgprValuB_X0_I0+48+4:vgprValuB_X0_I0+48+4+3], v[vgprValuA_X0_I0+56:vgprValuA_X0_I0+56+3], acc[220:223] // left value = acc[220+0:223+0]
PERM A 24 3 18
PERM A 24 3 19
/*  mfmaIndex:179  */
v_mfma_f32_16x16x32_bf16 acc[220:223], v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprValuA_X0_I0+56:vgprValuA_X0_I0+56+3], acc[220:223] // left value = acc[220+0:223+0]
PERM A 24 3 20
PERM A 24 3 21
/*  mfmaIndex:180  */
v_mfma_f32_16x16x32_bf16 acc[240:243], v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprValuA_X0_I0+32+4:vgprValuA_X0_I0+32+4+3], acc[240:243] // left value = acc[240+0:243+0]
PERM A 24 3 22
PERM A 24 3 23
/*  mfmaIndex:181  */
v_mfma_f32_16x16x32_bf16 acc[240:243], v[vgprValuB_X0_I0+56+4:vgprValuB_X0_I0+56+4+3], v[vgprValuA_X0_I0+32:vgprValuA_X0_I0+32+3], acc[240:243] // left value = acc[240+0:243+0]
s_waitcnt lgkmcnt(0) // Wait for LR B0
CVT B 0 0 0
CVT B 0 0 1
/*  mfmaIndex:182  */
v_mfma_f32_16x16x32_bf16 acc[240:243], v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprValuA_X0_I0+32:vgprValuA_X0_I0+32+3], acc[240:243] // left value = acc[240+0:243+0]
CVT B 0 0 2
CVT B 0 0 3
/*  mfmaIndex:183  */
v_mfma_f32_16x16x32_bf16 acc[244:247], v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprValuA_X0_I0+40+4:vgprValuA_X0_I0+40+4+3], acc[244:247] // left value = acc[244+0:247+0]
CVT B 0 0 4
CVT B 0 0 5
/*  mfmaIndex:184  */
v_mfma_f32_16x16x32_bf16 acc[244:247], v[vgprValuB_X0_I0+56+4:vgprValuB_X0_I0+56+4+3], v[vgprValuA_X0_I0+40:vgprValuA_X0_I0+40+3], acc[244:247] // left value = acc[244+0:247+0]
CVT B 0 0 6
CVT B 0 0 7
/*  mfmaIndex:185  */
v_mfma_f32_16x16x32_bf16 acc[244:247], v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprValuA_X0_I0+40:vgprValuA_X0_I0+40+3], acc[244:247] // left value = acc[244+0:247+0]
CVT B 0 0 8
CVT B 0 0 9
/*  mfmaIndex:186  */
v_mfma_f32_16x16x32_bf16 acc[248:251], v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprValuA_X0_I0+48+4:vgprValuA_X0_I0+48+4+3], acc[248:251] // left value = acc[248+0:251+0]
CVT B 0 0 10
CVT B 0 0 11
/*  mfmaIndex:187  */
v_mfma_f32_16x16x32_bf16 acc[248:251], v[vgprValuB_X0_I0+56+4:vgprValuB_X0_I0+56+4+3], v[vgprValuA_X0_I0+48:vgprValuA_X0_I0+48+3], acc[248:251] // left value = acc[248+0:251+0]
CVT B 0 0 12
CVT B 0 0 13
/*  mfmaIndex:188  */
v_mfma_f32_16x16x32_bf16 acc[248:251], v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprValuA_X0_I0+48:vgprValuA_X0_I0+48+3], acc[248:251] // left value = acc[248+0:251+0]
CVT B 0 0 14
CVT B 0 0 15
/*  mfmaIndex:189  */
v_mfma_f32_16x16x32_bf16 acc[252:255], v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprValuA_X0_I0+56+4:vgprValuA_X0_I0+56+4+3], acc[252:255] // left value = acc[252+0:255+0]
s_sub_u32 s[sgprLoopCounterL], s[sgprLoopCounterL], 1 // dec counterL
CVT B 0 0 16
CVT B 0 0 17
/*  mfmaIndex:190  */
v_mfma_f32_16x16x32_bf16 acc[252:255], v[vgprValuB_X0_I0+56+4:vgprValuB_X0_I0+56+4+3], v[vgprValuA_X0_I0+56:vgprValuA_X0_I0+56+3], acc[252:255] // left value = acc[252+0:255+0]
s_cmp_eq_i32 s[sgprLoopCounterL], 0x2              // counterL==2
CVT B 0 0 18
CVT B 0 0 19
CVT B 0 0 20
CVT B 0 0 21
CVT B 0 0 22
CVT B 0 0 23
.if \isOdd == 0  
s_nop 0
.endif
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+8+0], v[vgprValuB_T0_I0+8+0], v[vgprValuB_T0_I0+8+1]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+8+1], v[vgprValuB_T0_I0+8+2], v[vgprValuB_T0_I0+8+3]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+8+2], v[vgprValuB_T0_I0+8+4], v[vgprValuB_T0_I0+8+5]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+8+3], v[vgprValuB_T0_I0+8+6], v[vgprValuB_T0_I0+8+7]
.if \isOdd == 1
s_nop 0
.endif 
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+16+0], v[vgprValuB_T0_I0+16+0], v[vgprValuB_T0_I0+16+1]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+16+1], v[vgprValuB_T0_I0+16+2], v[vgprValuB_T0_I0+16+3]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+16+2], v[vgprValuB_T0_I0+16+4], v[vgprValuB_T0_I0+16+5]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+16+3], v[vgprValuB_T0_I0+16+6], v[vgprValuB_T0_I0+16+7]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+24+0], v[vgprValuB_T0_I0+24+0], v[vgprValuB_T0_I0+24+1]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+24+1], v[vgprValuB_T0_I0+24+2], v[vgprValuB_T0_I0+24+3]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+24+2], v[vgprValuB_T0_I0+24+4], v[vgprValuB_T0_I0+24+5]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+24+3], v[vgprValuB_T0_I0+24+6], v[vgprValuB_T0_I0+24+7]

v_dot2c_f32_bf16 v[vgprValuB_T0_I0+8+0], 0x8000bf80, v[vgprValuB_X0_I0+8+0]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+8+1], 0xbf800000, v[vgprValuB_X0_I0+8+0]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+8+2], 0x8000bf80, v[vgprValuB_X0_I0+8+1]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+8+3], 0xbf800000, v[vgprValuB_X0_I0+8+1]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+8+4], 0x8000bf80, v[vgprValuB_X0_I0+8+2]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+8+5], 0xbf800000, v[vgprValuB_X0_I0+8+2]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+8+6], 0x8000bf80, v[vgprValuB_X0_I0+8+3]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+8+7], 0xbf800000, v[vgprValuB_X0_I0+8+3]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+16+0], 0x8000bf80, v[vgprValuB_X0_I0+16+0]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+16+1], 0xbf800000, v[vgprValuB_X0_I0+16+0]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+16+2], 0x8000bf80, v[vgprValuB_X0_I0+16+1]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+16+3], 0xbf800000, v[vgprValuB_X0_I0+16+1]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+16+4], 0x8000bf80, v[vgprValuB_X0_I0+16+2]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+16+5], 0xbf800000, v[vgprValuB_X0_I0+16+2]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+16+6], 0x8000bf80, v[vgprValuB_X0_I0+16+3]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+16+7], 0xbf800000, v[vgprValuB_X0_I0+16+3]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+24+0], 0x8000bf80, v[vgprValuB_X0_I0+24+0]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+24+1], 0xbf800000, v[vgprValuB_X0_I0+24+0]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+24+2], 0x8000bf80, v[vgprValuB_X0_I0+24+1]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+24+3], 0xbf800000, v[vgprValuB_X0_I0+24+1]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+24+4], 0x8000bf80, v[vgprValuB_X0_I0+24+2]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+24+5], 0xbf800000, v[vgprValuB_X0_I0+24+2]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+24+6], 0x8000bf80, v[vgprValuB_X0_I0+24+3]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+24+7], 0xbf800000, v[vgprValuB_X0_I0+24+3]

v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+8+7], v[vgprValuB_T0_I0+8+6], v[vgprValuB_T0_I0+8+7]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+8+6], v[vgprValuB_T0_I0+8+4], v[vgprValuB_T0_I0+8+5]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+8+5], v[vgprValuB_T0_I0+8+2], v[vgprValuB_T0_I0+8+3]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+8+4], v[vgprValuB_T0_I0+8+0], v[vgprValuB_T0_I0+8+1]
/*  mfmaIndex:191  */
v_mfma_f32_16x16x32_bf16 acc[252:255], v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprValuA_X0_I0+56:vgprValuA_X0_I0+56+3], acc[252:255] // left value = acc[252+0:255+0]
.endm


// Main-loop dispatch: pick one of two MAINLOOP instantiations per wave.
// EVEN SIMDID takes the label_LoopBeginL0 path (MAINLOOP 0), ODD SIMDID takes
// the label_LoopBeginL1 path (MAINLOOP 1).
// The macro body that ends just above tests its isOdd argument to place some
// LR0 B / PERM A / s_nop slots differently in the two variants.
//
// Read a 1-bit field at bit offset 4 of HW_REG_HW_ID into s84; per the comment
// above this is the SIMD id parity (0 = even, 1 = odd).
// NOTE(review): s84 is a raw register number used as scratch here, not a named
// sgpr symbol -- confirm nothing live is allocated to s84 at this point.
s_getreg_b32 s84, hwreg(HW_REG_HW_ID, 4, 1)
// Debug aid: uncomment to override the parity and send every wave down path 0.
//s_mov_b32 s84, 0 // Force one code-path
// SCC = (s84 == 0); branch-on-SCC0 therefore jumps when the bit is set (odd).
s_cmp_eq_u32 s84, 0
s_cbranch_scc0 label_LoopBeginL1

/******************************************/
/* Unrolled Loop 1/1 - Begin (Even SIMD)  */
/******************************************/
label_LoopBeginL0:
MAINLOOP 0
/* closeLoop loopL finalLoop=1 tailLoop=0 */
// SCC consumed here comes from the macro tail, which decrements
// s[sgprLoopCounterL] and then does s_cmp_eq_i32 s[sgprLoopCounterL], 0x2.
// So the loop restarts while the decremented counter != 2 and exits when it
// reaches 2.
// NOTE(review): assumes nothing emitted after that compare inside the macro
// (e.g. the CVT helper macros) writes SCC -- confirm.
s_cbranch_scc0 label_LoopBeginL0                    // restart LoopL
// Even path is done: jump over the odd-SIMD copy of the loop.
s_branch label_LoopEndL

/******************************************/
/* Unrolled Loop 1/1 - Begin (Odd SIMD)   */
/******************************************/
label_LoopBeginL1:
MAINLOOP 1
/* closeLoop loopL finalLoop=1 tailLoop=0 */
// Same exit condition as the even path (SCC from the counterL==2 compare).
s_cbranch_scc0 label_LoopBeginL1                    // restart LoopL

// Odd path falls through to the common exit; even path arrives via s_branch.
label_LoopEndL:

/* Before NLL: Check VGPR.checkin for INT8 LW */

/******************************************/
/* Ord. NoGlobalLoadLoop - Begin          */
/* (NLL: copy of the MFMA schedule that   */
/*  runs after the unrolled loop exits)   */
/******************************************/
// Block until this wave's vector-memory counter reaches zero, i.e. every
// outstanding global read has returned.
// NOTE(review): the kernel config header lists DirectToLdsA/B=True, so these
// reads presumably land directly in LDS -- confirm.
s_waitcnt vmcnt(0)                                // 10wait for global read
// Skip force waitcnt0
// Workgroup barrier: no wave proceeds until all waves have arrived here, i.e.
// until every wave has also passed its own vmcnt(0) wait above.
s_barrier

/* iter 0 (reset local read pointers iteration)  (swap and reset local write pointers iteration)  (swap local read pointers iteration)  */
/*  grEndMfmaIndex:6, lwStartMfmaIndex:54, lwEndMfmaIndex:191  */
/*  numMfmaForLR:52, syncPlrMfmaIndex:0  */
// A0 B0
/*  mfmaIndex:0  */
v_mfma_f32_16x16x32_bf16 acc[0:3], v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprValuA_X0_I0+0+4:vgprValuA_X0_I0+0+4+3], acc[0:3] // left value = acc[0+0:3+0]
GRINC A 0
LR1 A 8
LR1 A 9
/*  mfmaIndex:1  */
v_mfma_f32_16x16x32_bf16 acc[0:3], v[vgprValuB_X0_I0+0+4:vgprValuB_X0_I0+0+4+3], v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], acc[0:3] // left value = acc[0+0:3+0]
GRINC A 1
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+16+7], v[vgprValuB_T0_I0+16+6], v[vgprValuB_T0_I0+16+7]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+16+6], v[vgprValuB_T0_I0+16+4], v[vgprValuB_T0_I0+16+5]
/*  mfmaIndex:2  */
v_mfma_f32_16x16x32_bf16 acc[0:3], v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], acc[0:3] // left value = acc[0+0:3+0]
GRINC A 2
LR1 A 10
LR1 A 11
/*  mfmaIndex:3  */
v_mfma_f32_16x16x32_bf16 acc[4:7], v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprValuA_X0_I0+8+4:vgprValuA_X0_I0+8+4+3], acc[4:7] // left value = acc[4+0:7+0]
GRINC A 3
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+16+5], v[vgprValuB_T0_I0+16+2], v[vgprValuB_T0_I0+16+3]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+16+4], v[vgprValuB_T0_I0+16+0], v[vgprValuB_T0_I0+16+1]
/*  mfmaIndex:4  */
v_mfma_f32_16x16x32_bf16 acc[4:7], v[vgprValuB_X0_I0+0+4:vgprValuB_X0_I0+0+4+3], v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], acc[4:7] // left value = acc[4+0:7+0]
GRINC A 4
LR1 A 12
LR1 A 13
/*  mfmaIndex:5  */
v_mfma_f32_16x16x32_bf16 acc[4:7], v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], acc[4:7] // left value = acc[4+0:7+0]
GRINC A 5
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+24+7], v[vgprValuB_T0_I0+24+6], v[vgprValuB_T0_I0+24+7]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+24+6], v[vgprValuB_T0_I0+24+4], v[vgprValuB_T0_I0+24+5]
/*  mfmaIndex:6  */
v_mfma_f32_16x16x32_bf16 acc[8:11], v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprValuA_X0_I0+16+4:vgprValuA_X0_I0+16+4+3], acc[8:11] // left value = acc[8+0:11+0]
GRINC A 6
LR1 A 14
LR1 A 15
/*  mfmaIndex:7  */
v_mfma_f32_16x16x32_bf16 acc[8:11], v[vgprValuB_X0_I0+0+4:vgprValuB_X0_I0+0+4+3], v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], acc[8:11] // left value = acc[8+0:11+0]
GRINC A 7
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+24+5], v[vgprValuB_T0_I0+24+2], v[vgprValuB_T0_I0+24+3]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+24+4], v[vgprValuB_T0_I0+24+0], v[vgprValuB_T0_I0+24+1]
/*  mfmaIndex:8  */
v_mfma_f32_16x16x32_bf16 acc[8:11], v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], acc[8:11] // left value = acc[8+0:11+0]
s_waitcnt lgkmcnt(4) // 8x LRA1 issue, wait for 4x LR A1 to be done.
PERM A 32 0 0
PERM A 32 0 1
/*  mfmaIndex:9  */
v_mfma_f32_16x16x32_bf16 acc[12:15], v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprValuA_X0_I0+24+4:vgprValuA_X0_I0+24+4+3], acc[12:15] // left value = acc[12+0:15+0]
GRINC A 8
PERM A 32 0 2
PERM A 32 0 3
/*  mfmaIndex:10  */
v_mfma_f32_16x16x32_bf16 acc[12:15], v[vgprValuB_X0_I0+0+4:vgprValuB_X0_I0+0+4+3], v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], acc[12:15] // left value = acc[12+0:15+0]
PERM A 32 0 4
PERM A 32 0 5
/*  mfmaIndex:11  */
v_mfma_f32_16x16x32_bf16 acc[12:15], v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], acc[12:15] // left value = acc[12+0:15+0]
s_waitcnt lgkmcnt(0) // 8x LR A1 done.
PERM A 32 0 6
PERM A 32 0 7
/*  mfmaIndex:12  */
v_mfma_f32_16x16x32_bf16 acc[32:35], v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprValuA_X0_I0+0+4:vgprValuA_X0_I0+0+4+3], acc[32:35] // left value = acc[32+0:35+0]
s_barrier // Can start global A read
/*  mfmaIndex:13  */
v_mfma_f32_16x16x32_bf16 acc[32:35], v[vgprValuB_X0_I0+8+4:vgprValuB_X0_I0+8+4+3], v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], acc[32:35] // left value = acc[32+0:35+0]
v_xor_b32 v[vgprLocalReadAddrA], v[vgprLocalReadSwapAddrA], v[vgprLocalReadAddrA] // swap Red Blk
PERM A 32 0 8
/*  mfmaIndex:14  */
v_mfma_f32_16x16x32_bf16 acc[32:35], v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], acc[32:35] // left value = acc[32+0:35+0]
LR0 B 8
PERM A 32 0 9
/*  mfmaIndex:15  */
v_mfma_f32_16x16x32_bf16 acc[36:39], v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprValuA_X0_I0+8+4:vgprValuA_X0_I0+8+4+3], acc[36:39] // left value = acc[36+0:39+0]
PERM A 32 0 10
/*  mfmaIndex:16  */
v_mfma_f32_16x16x32_bf16 acc[36:39], v[vgprValuB_X0_I0+8+4:vgprValuB_X0_I0+8+4+3], v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], acc[36:39] // left value = acc[36+0:39+0]
LR0 B 9
PERM A 32 0 11
/*  mfmaIndex:17  */
v_mfma_f32_16x16x32_bf16 acc[36:39], v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], acc[36:39] // left value = acc[36+0:39+0]
PERM A 32 0 12
/*  mfmaIndex:18  */
v_mfma_f32_16x16x32_bf16 acc[40:43], v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprValuA_X0_I0+16+4:vgprValuA_X0_I0+16+4+3], acc[40:43] // left value = acc[40+0:43+0]
LR0 B 10
PERM A 32 0 13
/*  mfmaIndex:19  */
v_mfma_f32_16x16x32_bf16 acc[40:43], v[vgprValuB_X0_I0+8+4:vgprValuB_X0_I0+8+4+3], v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], acc[40:43] // left value = acc[40+0:43+0]
PERM A 32 0 14
/*  mfmaIndex:20  */
v_mfma_f32_16x16x32_bf16 acc[40:43], v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], acc[40:43] // left value = acc[40+0:43+0]
LR0 B 11
PERM A 32 0 15
/*  mfmaIndex:21  */
v_mfma_f32_16x16x32_bf16 acc[44:47], v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprValuA_X0_I0+24+4:vgprValuA_X0_I0+24+4+3], acc[44:47] // left value = acc[44+0:47+0]
LR0 B 12
/*  mfmaIndex:22  */
v_mfma_f32_16x16x32_bf16 acc[44:47], v[vgprValuB_X0_I0+8+4:vgprValuB_X0_I0+8+4+3], v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], acc[44:47] // left value = acc[44+0:47+0]
PERM A 32 0 16
PERM A 32 0 17
/*  mfmaIndex:23  */
v_mfma_f32_16x16x32_bf16 acc[44:47], v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], acc[44:47] // left value = acc[44+0:47+0]
PERM A 32 0 18
PERM A 32 0 19
/*  mfmaIndex:24  */
v_mfma_f32_16x16x32_bf16 acc[64:67], v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprValuA_X0_I0+0+4:vgprValuA_X0_I0+0+4+3], acc[64:67] // left value = acc[64+0:67+0]
PERM A 32 0 20
/*  mfmaIndex:25  */
v_mfma_f32_16x16x32_bf16 acc[64:67], v[vgprValuB_X0_I0+16+4:vgprValuB_X0_I0+16+4+3], v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], acc[64:67] // left value = acc[64+0:67+0]
LR0 B 13  
PERM A 32 0 21
/*  mfmaIndex:26  */
v_mfma_f32_16x16x32_bf16 acc[64:67], v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], acc[64:67] // left value = acc[64+0:67+0]
LR0 B 14
/*  mfmaIndex:27  */
v_mfma_f32_16x16x32_bf16 acc[68:71], v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprValuA_X0_I0+8+4:vgprValuA_X0_I0+8+4+3], acc[68:71] // left value = acc[68+0:71+0]
PERM A 32 0 22
PERM A 32 0 23  
/*  mfmaIndex:28  */
v_mfma_f32_16x16x32_bf16 acc[68:71], v[vgprValuB_X0_I0+16+4:vgprValuB_X0_I0+16+4+3], v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], acc[68:71] // left value = acc[68+0:71+0]
PERM A 40 1 0
PERM A 40 1 1
/*  mfmaIndex:29  */
v_mfma_f32_16x16x32_bf16 acc[68:71], v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], acc[68:71] // left value = acc[68+0:71+0]
PERM A 40 1 2
PERM A 40 1 3
/*  mfmaIndex:30  */
v_mfma_f32_16x16x32_bf16 acc[72:75], v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprValuA_X0_I0+16+4:vgprValuA_X0_I0+16+4+3], acc[72:75] // left value = acc[72+0:75+0]
PERM A 40 1 4
/*  mfmaIndex:31  */
v_mfma_f32_16x16x32_bf16 acc[72:75], v[vgprValuB_X0_I0+16+4:vgprValuB_X0_I0+16+4+3], v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], acc[72:75] // left value = acc[72+0:75+0]
LR0 B 15  
PERM A 40 1 5
/*  mfmaIndex:32  */
v_mfma_f32_16x16x32_bf16 acc[72:75], v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], acc[72:75] // left value = acc[72+0:75+0]
PERM A 40 1 6
PERM A 40 1 7
/*  mfmaIndex:33  */
v_mfma_f32_16x16x32_bf16 acc[76:79], v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprValuA_X0_I0+24+4:vgprValuA_X0_I0+24+4+3], acc[76:79] // left value = acc[76+0:79+0]
PERM A 40 1 8
PERM A 40 1 9
/*  mfmaIndex:34  */
v_mfma_f32_16x16x32_bf16 acc[76:79], v[vgprValuB_X0_I0+16+4:vgprValuB_X0_I0+16+4+3], v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], acc[76:79] // left value = acc[76+0:79+0]
PERM A 40 1 10
PERM A 40 1 11
/*  mfmaIndex:35  */
v_mfma_f32_16x16x32_bf16 acc[76:79], v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], acc[76:79] // left value = acc[76+0:79+0]
PERM A 40 1 12
PERM A 40 1 13
/*  mfmaIndex:36  */
v_mfma_f32_16x16x32_bf16 acc[96:99], v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprValuA_X0_I0+0+4:vgprValuA_X0_I0+0+4+3], acc[96:99] // left value = acc[96+0:99+0]
PERM A 40 1 14
/*  mfmaIndex:37  */
v_mfma_f32_16x16x32_bf16 acc[96:99], v[vgprValuB_X0_I0+24+4:vgprValuB_X0_I0+24+4+3], v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], acc[96:99] // left value = acc[96+0:99+0]
PERM A 40 1 15
/*  mfmaIndex:38  */
v_mfma_f32_16x16x32_bf16 acc[96:99], v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], acc[96:99] // left value = acc[96+0:99+0]
PERM A 40 1 16
PERM A 40 1 17
/*  mfmaIndex:39  */
v_mfma_f32_16x16x32_bf16 acc[100:103], v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprValuA_X0_I0+8+4:vgprValuA_X0_I0+8+4+3], acc[100:103] // left value = acc[100+0:103+0]
PERM A 40 1 18
PERM A 40 1 19
/*  mfmaIndex:40  */
v_mfma_f32_16x16x32_bf16 acc[100:103], v[vgprValuB_X0_I0+24+4:vgprValuB_X0_I0+24+4+3], v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], acc[100:103] // left value = acc[100+0:103+0]
PERM A 40 1 20
PERM A 40 1 21
/*  mfmaIndex:41  */
v_mfma_f32_16x16x32_bf16 acc[100:103], v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], acc[100:103] // left value = acc[100+0:103+0]
/*  mfmaIndex:42  */
v_mfma_f32_16x16x32_bf16 acc[104:107], v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprValuA_X0_I0+16+4:vgprValuA_X0_I0+16+4+3], acc[104:107] // left value = acc[104+0:107+0]
PERM A 40 1 22
PERM A 40 1 23
/*  mfmaIndex:43  */
v_mfma_f32_16x16x32_bf16 acc[104:107], v[vgprValuB_X0_I0+24+4:vgprValuB_X0_I0+24+4+3], v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], acc[104:107] // left value = acc[104+0:107+0]
s_xor_b32 s[sgprLocalWriteAddrA], s[sgprSwapA], s[sgprLocalWriteAddrA] // swap Red Blk SGPR
PERM A 48 2 0
PERM A 48 2 1
/*  mfmaIndex:44  */
v_mfma_f32_16x16x32_bf16 acc[104:107], v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], acc[104:107] // left value = acc[104+0:107+0]
GRINC B 0
PERM A 48 2 2
PERM A 48 2 3
/*  mfmaIndex:45  */
v_mfma_f32_16x16x32_bf16 acc[108:111], v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprValuA_X0_I0+24+4:vgprValuA_X0_I0+24+4+3], acc[108:111] // left value = acc[108+0:111+0]
GRINC B 1
PERM A 48 2 4
PERM A 48 2 5
/*  mfmaIndex:46  */
v_mfma_f32_16x16x32_bf16 acc[108:111], v[vgprValuB_X0_I0+24+4:vgprValuB_X0_I0+24+4+3], v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], acc[108:111] // left value = acc[108+0:111+0]
GRINC B 2
PERM A 48 2 6
PERM A 48 2 7
/*  mfmaIndex:47  */
v_mfma_f32_16x16x32_bf16 acc[108:111], v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], acc[108:111] // left value = acc[108+0:111+0]
GRINC B 3
PERM A 48 2 8
PERM A 48 2 9

// B0 A1 // MFMAs reordered to allow more time for PERM A1 to complete.
//
// Tile group: B registers vgprValuB_X0_I0+0..+31 against A registers
// vgprValuA_X0_I0+32..+63, accumulating into acc[16:31], acc[48:63],
// acc[80:95] and acc[112:127].
// Every 4-register accumulator receives three consecutive MFMAs whose operand
// pairs are (B, A+4), (B+4, A), (B, A).
// Issue order (see the mfmaIndex comments): 48-53, 60-65, 72-77, 84-89 read A
// offsets +32/+40 and are issued first; 54-59, 66-71, 78-83, 90-95 read A
// offsets +48/+56 and are issued afterwards, i.e. after the `PERM A 48 2 *`
// and `PERM A 56 3 *` sequences have been issued.
// PERM / GRINC / CVT / LR1 are macros defined elsewhere in this file; their
// bodies are not visible here, so their exact register effects should be
// confirmed against the macro definitions.

/*  mfmaIndex:48  */
v_mfma_f32_16x16x32_bf16 acc[16:19], v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprValuA_X0_I0+32+4:vgprValuA_X0_I0+32+4+3], acc[16:19] // left value = acc[16+0:19+0]
GRINC B 4
PERM A 48 2 10
PERM A 48 2 11
/*  mfmaIndex:49  */
v_mfma_f32_16x16x32_bf16 acc[16:19], v[vgprValuB_X0_I0+0+4:vgprValuB_X0_I0+0+4+3], v[vgprValuA_X0_I0+32:vgprValuA_X0_I0+32+3], acc[16:19] // left value = acc[16+0:19+0]
GRINC B 5
PERM A 48 2 12
PERM A 48 2 13
/*  mfmaIndex:50  */
v_mfma_f32_16x16x32_bf16 acc[16:19], v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprValuA_X0_I0+32:vgprValuA_X0_I0+32+3], acc[16:19] // left value = acc[16+0:19+0]
GRINC B 6
PERM A 48 2 14
PERM A 48 2 15
/*  mfmaIndex:51  */
v_mfma_f32_16x16x32_bf16 acc[20:23], v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprValuA_X0_I0+40+4:vgprValuA_X0_I0+40+4+3], acc[20:23] // left value = acc[20+0:23+0]
GRINC B 7
PERM A 48 2 16
PERM A 48 2 17
/*  mfmaIndex:52  */
v_mfma_f32_16x16x32_bf16 acc[20:23], v[vgprValuB_X0_I0+0+4:vgprValuB_X0_I0+0+4+3], v[vgprValuA_X0_I0+40:vgprValuA_X0_I0+40+3], acc[20:23] // left value = acc[20+0:23+0]
GRINC B 8
PERM A 48 2 18
PERM A 48 2 19
/*  mfmaIndex:53  */
v_mfma_f32_16x16x32_bf16 acc[20:23], v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprValuA_X0_I0+40:vgprValuA_X0_I0+40+3], acc[20:23] // left value = acc[20+0:23+0]
PERM A 48 2 20
PERM A 48 2 21
// mfmaIndex jumps from 53 to 60 here: the A+48/A+56 MFMAs (54-59) are deferred.
/*  mfmaIndex:60  */
v_mfma_f32_16x16x32_bf16 acc[48:51], v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprValuA_X0_I0+32+4:vgprValuA_X0_I0+32+4+3], acc[48:51] // left value = acc[48+0:51+0]
PERM A 48 2 22
PERM A 48 2 23
/*  mfmaIndex:61  */
v_mfma_f32_16x16x32_bf16 acc[48:51], v[vgprValuB_X0_I0+8+4:vgprValuB_X0_I0+8+4+3], v[vgprValuA_X0_I0+32:vgprValuA_X0_I0+32+3], acc[48:51] // left value = acc[48+0:51+0]
PERM A 56 3 0
PERM A 56 3 1
/*  mfmaIndex:62  */
v_mfma_f32_16x16x32_bf16 acc[48:51], v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprValuA_X0_I0+32:vgprValuA_X0_I0+32+3], acc[48:51] // left value = acc[48+0:51+0]
PERM A 56 3 2
PERM A 56 3 3
/*  mfmaIndex:63  */
v_mfma_f32_16x16x32_bf16 acc[52:55], v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprValuA_X0_I0+40+4:vgprValuA_X0_I0+40+4+3], acc[52:55] // left value = acc[52+0:55+0]
PERM A 56 3 4
PERM A 56 3 5
/*  mfmaIndex:64  */
v_mfma_f32_16x16x32_bf16 acc[52:55], v[vgprValuB_X0_I0+8+4:vgprValuB_X0_I0+8+4+3], v[vgprValuA_X0_I0+40:vgprValuA_X0_I0+40+3], acc[52:55] // left value = acc[52+0:55+0]
PERM A 56 3 6
PERM A 56 3 7
/*  mfmaIndex:65  */
v_mfma_f32_16x16x32_bf16 acc[52:55], v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprValuA_X0_I0+40:vgprValuA_X0_I0+40+3], acc[52:55] // left value = acc[52+0:55+0]
PERM A 56 3 8
PERM A 56 3 9
/*  mfmaIndex:72  */
v_mfma_f32_16x16x32_bf16 acc[80:83], v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprValuA_X0_I0+32+4:vgprValuA_X0_I0+32+4+3], acc[80:83] // left value = acc[80+0:83+0]
PERM A 56 3 10
PERM A 56 3 11
/*  mfmaIndex:73  */
v_mfma_f32_16x16x32_bf16 acc[80:83], v[vgprValuB_X0_I0+16+4:vgprValuB_X0_I0+16+4+3], v[vgprValuA_X0_I0+32:vgprValuA_X0_I0+32+3], acc[80:83] // left value = acc[80+0:83+0]
PERM A 56 3 12
PERM A 56 3 13
/*  mfmaIndex:74  */
v_mfma_f32_16x16x32_bf16 acc[80:83], v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprValuA_X0_I0+32:vgprValuA_X0_I0+32+3], acc[80:83] // left value = acc[80+0:83+0]
PERM A 56 3 14
PERM A 56 3 15
/*  mfmaIndex:75  */
v_mfma_f32_16x16x32_bf16 acc[84:87], v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprValuA_X0_I0+40+4:vgprValuA_X0_I0+40+4+3], acc[84:87] // left value = acc[84+0:87+0]
PERM A 56 3 16
PERM A 56 3 17
/*  mfmaIndex:76  */
v_mfma_f32_16x16x32_bf16 acc[84:87], v[vgprValuB_X0_I0+16+4:vgprValuB_X0_I0+16+4+3], v[vgprValuA_X0_I0+40:vgprValuA_X0_I0+40+3], acc[84:87] // left value = acc[84+0:87+0]
PERM A 56 3 18
PERM A 56 3 19
/*  mfmaIndex:77  */
v_mfma_f32_16x16x32_bf16 acc[84:87], v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprValuA_X0_I0+40:vgprValuA_X0_I0+40+3], acc[84:87] // left value = acc[84+0:87+0]
PERM A 56 3 20
PERM A 56 3 21
/*  mfmaIndex:84  */
v_mfma_f32_16x16x32_bf16 acc[112:115], v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprValuA_X0_I0+32+4:vgprValuA_X0_I0+32+4+3], acc[112:115] // left value = acc[112+0:115+0]
PERM A 56 3 22
PERM A 56 3 23
/*  mfmaIndex:85  */
v_mfma_f32_16x16x32_bf16 acc[112:115], v[vgprValuB_X0_I0+24+4:vgprValuB_X0_I0+24+4+3], v[vgprValuA_X0_I0+32:vgprValuA_X0_I0+32+3], acc[112:115] // left value = acc[112+0:115+0]
// Drain every outstanding LGKM-counted operation before the first
// `CVT B 32 0 *` invocation below.
s_waitcnt lgkmcnt(0) // Wait for LR B1 to finish
CVT B 32 0 0
CVT B 32 0 1
/*  mfmaIndex:86  */
v_mfma_f32_16x16x32_bf16 acc[112:115], v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprValuA_X0_I0+32:vgprValuA_X0_I0+32+3], acc[112:115] // left value = acc[112+0:115+0]
CVT B 32 0 2
CVT B 32 0 3
/*  mfmaIndex:87  */
v_mfma_f32_16x16x32_bf16 acc[116:119], v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprValuA_X0_I0+40+4:vgprValuA_X0_I0+40+4+3], acc[116:119] // left value = acc[116+0:119+0]
CVT B 32 0 4
CVT B 32 0 5
/*  mfmaIndex:88  */
v_mfma_f32_16x16x32_bf16 acc[116:119], v[vgprValuB_X0_I0+24+4:vgprValuB_X0_I0+24+4+3], v[vgprValuA_X0_I0+40:vgprValuA_X0_I0+40+3], acc[116:119] // left value = acc[116+0:119+0]
CVT B 32 0 6
CVT B 32 0 7
/*  mfmaIndex:89  */
v_mfma_f32_16x16x32_bf16 acc[116:119], v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprValuA_X0_I0+40:vgprValuA_X0_I0+40+3], acc[116:119] // left value = acc[116+0:119+0]
CVT B 32 0 8
CVT B 32 0 9
// Deferred MFMAs start here (mfmaIndex 54 onwards): first use of A offsets
// +48 / +56 in this group.
/*  mfmaIndex:54  */
v_mfma_f32_16x16x32_bf16 acc[24:27], v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprValuA_X0_I0+48+4:vgprValuA_X0_I0+48+4+3], acc[24:27] // left value = acc[24+0:27+0]
CVT B 32 0 10
CVT B 32 0 11
/*  mfmaIndex:55  */
v_mfma_f32_16x16x32_bf16 acc[24:27], v[vgprValuB_X0_I0+0+4:vgprValuB_X0_I0+0+4+3], v[vgprValuA_X0_I0+48:vgprValuA_X0_I0+48+3], acc[24:27] // left value = acc[24+0:27+0]
CVT B 32 0 12
CVT B 32 0 13
/*  mfmaIndex:56  */
v_mfma_f32_16x16x32_bf16 acc[24:27], v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprValuA_X0_I0+48:vgprValuA_X0_I0+48+3], acc[24:27] // left value = acc[24+0:27+0]
CVT B 32 0 14
CVT B 32 0 15
/*  mfmaIndex:57  */
v_mfma_f32_16x16x32_bf16 acc[28:31], v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprValuA_X0_I0+56+4:vgprValuA_X0_I0+56+4+3], acc[28:31] // left value = acc[28+0:31+0]
CVT B 32 0 16
CVT B 32 0 17
/*  mfmaIndex:58  */
v_mfma_f32_16x16x32_bf16 acc[28:31], v[vgprValuB_X0_I0+0+4:vgprValuB_X0_I0+0+4+3], v[vgprValuA_X0_I0+56:vgprValuA_X0_I0+56+3], acc[28:31] // left value = acc[28+0:31+0]
CVT B 32 0 18
CVT B 32 0 19
/*  mfmaIndex:59  */
v_mfma_f32_16x16x32_bf16 acc[28:31], v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprValuA_X0_I0+56:vgprValuA_X0_I0+56+3], acc[28:31] // left value = acc[28+0:31+0]
CVT B 32 0 20
CVT B 32 0 21
/*  mfmaIndex:66  */
v_mfma_f32_16x16x32_bf16 acc[56:59], v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprValuA_X0_I0+48+4:vgprValuA_X0_I0+48+4+3], acc[56:59] // left value = acc[56+0:59+0]
CVT B 32 0 22
CVT B 32 0 23
/*  mfmaIndex:67  */
v_mfma_f32_16x16x32_bf16 acc[56:59], v[vgprValuB_X0_I0+8+4:vgprValuB_X0_I0+8+4+3], v[vgprValuA_X0_I0+48:vgprValuA_X0_I0+48+3], acc[56:59] // left value = acc[56+0:59+0]
CVT B 40 8 0
CVT B 40 8 1
/*  mfmaIndex:68  */
v_mfma_f32_16x16x32_bf16 acc[56:59], v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprValuA_X0_I0+48:vgprValuA_X0_I0+48+3], acc[56:59] // left value = acc[56+0:59+0]
CVT B 40 8 2
CVT B 40 8 3
/*  mfmaIndex:69  */
v_mfma_f32_16x16x32_bf16 acc[60:63], v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprValuA_X0_I0+56+4:vgprValuA_X0_I0+56+4+3], acc[60:63] // left value = acc[60+0:63+0]
CVT B 40 8 4
CVT B 40 8 5
/*  mfmaIndex:70  */
v_mfma_f32_16x16x32_bf16 acc[60:63], v[vgprValuB_X0_I0+8+4:vgprValuB_X0_I0+8+4+3], v[vgprValuA_X0_I0+56:vgprValuA_X0_I0+56+3], acc[60:63] // left value = acc[60+0:63+0]
CVT B 40 8 6
CVT B 40 8 7
/*  mfmaIndex:71  */
v_mfma_f32_16x16x32_bf16 acc[60:63], v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprValuA_X0_I0+56:vgprValuA_X0_I0+56+3], acc[60:63] // left value = acc[60+0:63+0]
CVT B 40 8 8
CVT B 40 8 9
/*  mfmaIndex:78  */
v_mfma_f32_16x16x32_bf16 acc[88:91], v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprValuA_X0_I0+48+4:vgprValuA_X0_I0+48+4+3], acc[88:91] // left value = acc[88+0:91+0]
// Allow at most 16 vector-memory operations to remain outstanding.
// NOTE(review): the comment below refers to "LR A0", while the local-read macro
// invoked after the barrier is `LR1 A` -- confirm the naming against the macro
// definition.
s_waitcnt vmcnt(16) // Wait for previous A to finish to start LR A0, 8A + 8B issued prior, 8A just issued, 8B prior + 8A just can still be in flight.
CVT B 40 8 10
CVT B 40 8 11
/*  mfmaIndex:79  */
v_mfma_f32_16x16x32_bf16 acc[88:91], v[vgprValuB_X0_I0+16+4:vgprValuB_X0_I0+16+4+3], v[vgprValuA_X0_I0+48:vgprValuA_X0_I0+48+3], acc[88:91] // left value = acc[88+0:91+0]
// Workgroup barrier, placed after the vmcnt wait above and before the
// `LR1 A *` sequence below.
s_barrier
/*  mfmaIndex:80  */
v_mfma_f32_16x16x32_bf16 acc[88:91], v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprValuA_X0_I0+48:vgprValuA_X0_I0+48+3], acc[88:91] // left value = acc[88+0:91+0]
v_xor_b32 v[vgprLocalReadAddrB], v[vgprLocalReadSwapAddrB], v[vgprLocalReadAddrB] // swap Red Blk
/*  mfmaIndex:81  */
v_mfma_f32_16x16x32_bf16 acc[92:95], v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprValuA_X0_I0+56+4:vgprValuA_X0_I0+56+4+3], acc[92:95] // left value = acc[92+0:95+0]
// From here on, two `LR1 A` invocations are issued after every second MFMA.
LR1 A 0
LR1 A 1
/*  mfmaIndex:82  */
v_mfma_f32_16x16x32_bf16 acc[92:95], v[vgprValuB_X0_I0+16+4:vgprValuB_X0_I0+16+4+3], v[vgprValuA_X0_I0+56:vgprValuA_X0_I0+56+3], acc[92:95] // left value = acc[92+0:95+0]
/*  mfmaIndex:83  */
v_mfma_f32_16x16x32_bf16 acc[92:95], v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprValuA_X0_I0+56:vgprValuA_X0_I0+56+3], acc[92:95] // left value = acc[92+0:95+0]
LR1 A 2
LR1 A 3
/*  mfmaIndex:90  */
v_mfma_f32_16x16x32_bf16 acc[120:123], v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprValuA_X0_I0+48+4:vgprValuA_X0_I0+48+4+3], acc[120:123] // left value = acc[120+0:123+0]
/*  mfmaIndex:91  */
v_mfma_f32_16x16x32_bf16 acc[120:123], v[vgprValuB_X0_I0+24+4:vgprValuB_X0_I0+24+4+3], v[vgprValuA_X0_I0+48:vgprValuA_X0_I0+48+3], acc[120:123] // left value = acc[120+0:123+0]
LR1 A 4
LR1 A 5
/*  mfmaIndex:92  */
v_mfma_f32_16x16x32_bf16 acc[120:123], v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprValuA_X0_I0+48:vgprValuA_X0_I0+48+3], acc[120:123] // left value = acc[120+0:123+0]
/*  mfmaIndex:93  */
v_mfma_f32_16x16x32_bf16 acc[124:127], v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprValuA_X0_I0+56+4:vgprValuA_X0_I0+56+4+3], acc[124:127] // left value = acc[124+0:127+0]
LR1 A 6
LR1 A 7
/*  mfmaIndex:94  */
v_mfma_f32_16x16x32_bf16 acc[124:127], v[vgprValuB_X0_I0+24+4:vgprValuB_X0_I0+24+4+3], v[vgprValuA_X0_I0+56:vgprValuA_X0_I0+56+3], acc[124:127] // left value = acc[124+0:127+0]
/*  mfmaIndex:95  */
v_mfma_f32_16x16x32_bf16 acc[124:127], v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprValuA_X0_I0+56:vgprValuA_X0_I0+56+3], acc[124:127] // left value = acc[124+0:127+0]
CVT B 40 8 12
CVT B 40 8 13

// B1 A0
//
// Tile group: B registers vgprValuB_X0_I0+32..+63 against A registers
// vgprValuA_X0_I0+0..+31, accumulating into acc[128:143], acc[160:175],
// acc[192:207] and acc[224:239]. MFMAs are issued in mfmaIndex order 96..143.
// Every 4-register accumulator receives three consecutive MFMAs whose operand
// pairs are (B, A+4), (B+4, A), (B, A).
// The remaining `CVT B 40/48/56 *` invocations, the `LR0 B *` sequence and the
// first `PERM A 0 0 *` invocations are interleaved between the MFMAs.
// Exception to the register pattern: MFMAs 132-134 (acc[224:227]) read their A
// operand from vgprValuA_T1_I0+0..+7, a copy made by the four v_mov_b64
// instructions that follow mfmaIndex 127..130.
// CVT / LR0 / PERM are macros defined elsewhere in this file; their bodies are
// not visible here.

/*  mfmaIndex:96  */
v_mfma_f32_16x16x32_bf16 acc[128:131], v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprValuA_X0_I0+0+4:vgprValuA_X0_I0+0+4+3], acc[128:131] // left value = acc[128+0:131+0]
CVT B 40 8 14
CVT B 40 8 15
/*  mfmaIndex:97  */
v_mfma_f32_16x16x32_bf16 acc[128:131], v[vgprValuB_X0_I0+32+4:vgprValuB_X0_I0+32+4+3], v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], acc[128:131] // left value = acc[128+0:131+0]
CVT B 40 8 16
CVT B 40 8 17
/*  mfmaIndex:98  */
v_mfma_f32_16x16x32_bf16 acc[128:131], v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], acc[128:131] // left value = acc[128+0:131+0]
CVT B 40 8 18
CVT B 40 8 19
/*  mfmaIndex:99  */
v_mfma_f32_16x16x32_bf16 acc[132:135], v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprValuA_X0_I0+8+4:vgprValuA_X0_I0+8+4+3], acc[132:135] // left value = acc[132+0:135+0]
CVT B 40 8 20
CVT B 40 8 21
/*  mfmaIndex:100  */
v_mfma_f32_16x16x32_bf16 acc[132:135], v[vgprValuB_X0_I0+32+4:vgprValuB_X0_I0+32+4+3], v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], acc[132:135] // left value = acc[132+0:135+0]
CVT B 40 8 22
CVT B 40 8 23
/*  mfmaIndex:101  */
v_mfma_f32_16x16x32_bf16 acc[132:135], v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], acc[132:135] // left value = acc[132+0:135+0]
CVT B 48 16 0
CVT B 48 16 1
/*  mfmaIndex:102  */
v_mfma_f32_16x16x32_bf16 acc[136:139], v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprValuA_X0_I0+16+4:vgprValuA_X0_I0+16+4+3], acc[136:139] // left value = acc[136+0:139+0]
CVT B 48 16 2
CVT B 48 16 3
/*  mfmaIndex:103  */
v_mfma_f32_16x16x32_bf16 acc[136:139], v[vgprValuB_X0_I0+32+4:vgprValuB_X0_I0+32+4+3], v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], acc[136:139] // left value = acc[136+0:139+0]
CVT B 48 16 4
CVT B 48 16 5
/*  mfmaIndex:104  */
v_mfma_f32_16x16x32_bf16 acc[136:139], v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], acc[136:139] // left value = acc[136+0:139+0]
CVT B 48 16 6
CVT B 48 16 7
/*  mfmaIndex:105  */
v_mfma_f32_16x16x32_bf16 acc[140:143], v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprValuA_X0_I0+24+4:vgprValuA_X0_I0+24+4+3], acc[140:143] // left value = acc[140+0:143+0]
CVT B 48 16 8
CVT B 48 16 9
/*  mfmaIndex:106  */
v_mfma_f32_16x16x32_bf16 acc[140:143], v[vgprValuB_X0_I0+32+4:vgprValuB_X0_I0+32+4+3], v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], acc[140:143] // left value = acc[140+0:143+0]
CVT B 48 16 10
CVT B 48 16 11
/*  mfmaIndex:107  */
v_mfma_f32_16x16x32_bf16 acc[140:143], v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], acc[140:143] // left value = acc[140+0:143+0]
CVT B 48 16 12
CVT B 48 16 13
/*  mfmaIndex:108  */
v_mfma_f32_16x16x32_bf16 acc[160:163], v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprValuA_X0_I0+0+4:vgprValuA_X0_I0+0+4+3], acc[160:163] // left value = acc[160+0:163+0]
CVT B 48 16 14
CVT B 48 16 15
/*  mfmaIndex:109  */
v_mfma_f32_16x16x32_bf16 acc[160:163], v[vgprValuB_X0_I0+40+4:vgprValuB_X0_I0+40+4+3], v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], acc[160:163] // left value = acc[160+0:163+0]
CVT B 48 16 16
CVT B 48 16 17
/*  mfmaIndex:110  */
v_mfma_f32_16x16x32_bf16 acc[160:163], v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], acc[160:163] // left value = acc[160+0:163+0]
CVT B 48 16 18
CVT B 48 16 19
/*  mfmaIndex:111  */
v_mfma_f32_16x16x32_bf16 acc[164:167], v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprValuA_X0_I0+8+4:vgprValuA_X0_I0+8+4+3], acc[164:167] // left value = acc[164+0:167+0]
CVT B 48 16 20
CVT B 48 16 21
/*  mfmaIndex:112  */
v_mfma_f32_16x16x32_bf16 acc[164:167], v[vgprValuB_X0_I0+40+4:vgprValuB_X0_I0+40+4+3], v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], acc[164:167] // left value = acc[164+0:167+0]
CVT B 48 16 22
CVT B 48 16 23
/*  mfmaIndex:113  */
v_mfma_f32_16x16x32_bf16 acc[164:167], v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], acc[164:167] // left value = acc[164+0:167+0]
CVT B 56 24 0
CVT B 56 24 1
/*  mfmaIndex:114  */
v_mfma_f32_16x16x32_bf16 acc[168:171], v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprValuA_X0_I0+16+4:vgprValuA_X0_I0+16+4+3], acc[168:171] // left value = acc[168+0:171+0]
CVT B 56 24 2
CVT B 56 24 3
/*  mfmaIndex:115  */
v_mfma_f32_16x16x32_bf16 acc[168:171], v[vgprValuB_X0_I0+40+4:vgprValuB_X0_I0+40+4+3], v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], acc[168:171] // left value = acc[168+0:171+0]
CVT B 56 24 4
CVT B 56 24 5
/*  mfmaIndex:116  */
v_mfma_f32_16x16x32_bf16 acc[168:171], v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], acc[168:171] // left value = acc[168+0:171+0]
CVT B 56 24 6
CVT B 56 24 7
/*  mfmaIndex:117  */
v_mfma_f32_16x16x32_bf16 acc[172:175], v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprValuA_X0_I0+24+4:vgprValuA_X0_I0+24+4+3], acc[172:175] // left value = acc[172+0:175+0]
CVT B 56 24 8
CVT B 56 24 9
/*  mfmaIndex:118  */
v_mfma_f32_16x16x32_bf16 acc[172:175], v[vgprValuB_X0_I0+40+4:vgprValuB_X0_I0+40+4+3], v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], acc[172:175] // left value = acc[172+0:175+0]
CVT B 56 24 10
CVT B 56 24 11
/*  mfmaIndex:119  */
v_mfma_f32_16x16x32_bf16 acc[172:175], v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], acc[172:175] // left value = acc[172+0:175+0]
CVT B 56 24 12
CVT B 56 24 13
/*  mfmaIndex:120  */
v_mfma_f32_16x16x32_bf16 acc[192:195], v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprValuA_X0_I0+0+4:vgprValuA_X0_I0+0+4+3], acc[192:195] // left value = acc[192+0:195+0]
CVT B 56 24 14
CVT B 56 24 15
/*  mfmaIndex:121  */
v_mfma_f32_16x16x32_bf16 acc[192:195], v[vgprValuB_X0_I0+48+4:vgprValuB_X0_I0+48+4+3], v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], acc[192:195] // left value = acc[192+0:195+0]
CVT B 56 24 16
CVT B 56 24 17
/*  mfmaIndex:122  */
v_mfma_f32_16x16x32_bf16 acc[192:195], v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], acc[192:195] // left value = acc[192+0:195+0]
CVT B 56 24 18
CVT B 56 24 19
/*  mfmaIndex:123  */
v_mfma_f32_16x16x32_bf16 acc[196:199], v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprValuA_X0_I0+8+4:vgprValuA_X0_I0+8+4+3], acc[196:199] // left value = acc[196+0:199+0]
// Combined wait: at most 8 + 5 = 13 vector-memory operations may remain
// outstanding, and every LGKM-counted operation must have completed.
// NOTE(review): the comment below mentions "LR A0", while the most recent A
// local-read macro issued before this point in the file is `LR1 A` -- confirm
// the naming against the macro definition.
s_waitcnt vmcnt(8 + 5) & lgkmcnt(0) // Wait for previous GR B to finish and previous LR A0, 8A + 5B just issue, 8A + 8B issue prior
CVT B 56 24 20
CVT B 56 24 21
/*  mfmaIndex:124  */
v_mfma_f32_16x16x32_bf16 acc[196:199], v[vgprValuB_X0_I0+48+4:vgprValuB_X0_I0+48+4+3], v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], acc[196:199] // left value = acc[196+0:199+0]
// Workgroup barrier, placed after the combined wait above and before the
// `LR0 B *` sequence below.
s_barrier
/*  mfmaIndex:125  */
v_mfma_f32_16x16x32_bf16 acc[196:199], v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], acc[196:199] // left value = acc[196+0:199+0]
CVT B 56 24 22
/*  mfmaIndex:126  */
v_mfma_f32_16x16x32_bf16 acc[200:203], v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprValuA_X0_I0+16+4:vgprValuA_X0_I0+16+4+3], acc[200:203] // left value = acc[200+0:203+0]
LR0 B 0
CVT B 56 24 23
/*  mfmaIndex:127  */
v_mfma_f32_16x16x32_bf16 acc[200:203], v[vgprValuB_X0_I0+48+4:vgprValuB_X0_I0+48+4+3], v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], acc[200:203] // left value = acc[200+0:203+0]
// Copy vgprValuA_X0_I0+0..+7 into vgprValuA_T1_I0+0..+7, two registers per
// v_mov_b64 (four moves in total, one after each of mfmaIndex 127..130).
// MFMAs 132-134 read the T1 copy; the moves all precede `PERM A 0 0 0`.
// Presumably `PERM A 0 0 *` overwrites vgprValuA_X0_I0+0..+7 -- TODO confirm
// against the PERM macro definition.
v_mov_b64 v[vgprValuA_T1_I0+0:vgprValuA_T1_I0+1], v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+1]
/*  mfmaIndex:128  */
v_mfma_f32_16x16x32_bf16 acc[200:203], v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], acc[200:203] // left value = acc[200+0:203+0]
LR0 B 1
v_mov_b64 v[vgprValuA_T1_I0+2:vgprValuA_T1_I0+3], v[vgprValuA_X0_I0+2:vgprValuA_X0_I0+3]
/*  mfmaIndex:129  */
v_mfma_f32_16x16x32_bf16 acc[204:207], v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprValuA_X0_I0+24+4:vgprValuA_X0_I0+24+4+3], acc[204:207] // left value = acc[204+0:207+0]
v_mov_b64 v[vgprValuA_T1_I0+4:vgprValuA_T1_I0+5], v[vgprValuA_X0_I0+4:vgprValuA_X0_I0+5]
/*  mfmaIndex:130  */
v_mfma_f32_16x16x32_bf16 acc[204:207], v[vgprValuB_X0_I0+48+4:vgprValuB_X0_I0+48+4+3], v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], acc[204:207] // left value = acc[204+0:207+0]
LR0 B 2
v_mov_b64 v[vgprValuA_T1_I0+6:vgprValuA_T1_I0+7], v[vgprValuA_X0_I0+6:vgprValuA_X0_I0+7]
/*  mfmaIndex:131  */
v_mfma_f32_16x16x32_bf16 acc[204:207], v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], acc[204:207] // left value = acc[204+0:207+0]
LR0 B 3
PERM A 0 0 0
// The next three MFMAs take their A operand from the T1 copy.
/*  mfmaIndex:132  */
v_mfma_f32_16x16x32_bf16 acc[224:227], v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprValuA_T1_I0+0+4:vgprValuA_T1_I0+0+4+3], acc[224:227] // left value = acc[224+0:227+0]
PERM A 0 0 1
PERM A 0 0 2
/*  mfmaIndex:133  */
v_mfma_f32_16x16x32_bf16 acc[224:227], v[vgprValuB_X0_I0+56+4:vgprValuB_X0_I0+56+4+3], v[vgprValuA_T1_I0+0:vgprValuA_T1_I0+0+3], acc[224:227] // left value = acc[224+0:227+0]
LR0 B 4
PERM A 0 0 3
/*  mfmaIndex:134  */
v_mfma_f32_16x16x32_bf16 acc[224:227], v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprValuA_T1_I0+0:vgprValuA_T1_I0+0+3], acc[224:227] // left value = acc[224+0:227+0]
PERM A 0 0 4
PERM A 0 0 5
/*  mfmaIndex:135  */
v_mfma_f32_16x16x32_bf16 acc[228:231], v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprValuA_X0_I0+8+4:vgprValuA_X0_I0+8+4+3], acc[228:231] // left value = acc[228+0:231+0]
LR0 B 5
PERM A 0 0 6
/*  mfmaIndex:136  */
v_mfma_f32_16x16x32_bf16 acc[228:231], v[vgprValuB_X0_I0+56+4:vgprValuB_X0_I0+56+4+3], v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], acc[228:231] // left value = acc[228+0:231+0]
PERM A 0 0 7
PERM A 0 0 8
/*  mfmaIndex:137  */
v_mfma_f32_16x16x32_bf16 acc[228:231], v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], acc[228:231] // left value = acc[228+0:231+0]
LR0 B 6
PERM A 0 0 9
/*  mfmaIndex:138  */
v_mfma_f32_16x16x32_bf16 acc[232:235], v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprValuA_X0_I0+16+4:vgprValuA_X0_I0+16+4+3], acc[232:235] // left value = acc[232+0:235+0]
PERM A 0 0 10
PERM A 0 0 11
/*  mfmaIndex:139  */
v_mfma_f32_16x16x32_bf16 acc[232:235], v[vgprValuB_X0_I0+56+4:vgprValuB_X0_I0+56+4+3], v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], acc[232:235] // left value = acc[232+0:235+0]
PERM A 0 0 12
PERM A 0 0 13
/*  mfmaIndex:140  */
v_mfma_f32_16x16x32_bf16 acc[232:235], v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], acc[232:235] // left value = acc[232+0:235+0]
LR0 B 7
PERM A 0 0 14
/*  mfmaIndex:141  */
v_mfma_f32_16x16x32_bf16 acc[236:239], v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprValuA_X0_I0+24+4:vgprValuA_X0_I0+24+4+3], acc[236:239] // left value = acc[236+0:239+0]
PERM A 0 0 15
PERM A 0 0 16
/*  mfmaIndex:142  */
v_mfma_f32_16x16x32_bf16 acc[236:239], v[vgprValuB_X0_I0+56+4:vgprValuB_X0_I0+56+4+3], v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], acc[236:239] // left value = acc[236+0:239+0]
PERM A 0 0 17
PERM A 0 0 18
/*  mfmaIndex:143  */
v_mfma_f32_16x16x32_bf16 acc[236:239], v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], acc[236:239] // left value = acc[236+0:239+0]
PERM A 0 0 19
PERM A 0 0 20
// B1 A1
//
// Tile group: B registers vgprValuB_X0_I0+32..+63 against A registers
// vgprValuA_X0_I0+32..+63, accumulating into acc[144:159], acc[176:191],
// acc[208:223] and acc[240:255]. MFMAs are issued in mfmaIndex order 144..190.
// Every 4-register accumulator receives three consecutive MFMAs whose operand
// pairs are (B, A+4), (B+4, A), (B, A).
// Interleaved between the MFMAs: the rest of `PERM A 0 0 *`, then
// `PERM A 8 1 *`, `PERM A 16 2 *`, `PERM A 24 3 *`, and, after the lgkmcnt(0)
// wait, `CVT B 0 0 *`. PERM / CVT are macros defined elsewhere in this file.
// NOTE(review): acc[252:255] receives only the (B, A+4) and (B+4, A) MFMAs
// (mfmaIndex 189 and 190) within this span; the closing (B, A) MFMA
// (mfmaIndex 191) is not visible here -- confirm that it is issued later in
// the loop.
/*  mfmaIndex:144  */
v_mfma_f32_16x16x32_bf16 acc[144:147], v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprValuA_X0_I0+32+4:vgprValuA_X0_I0+32+4+3], acc[144:147] // left value = acc[144+0:147+0]
PERM A 0 0 21
PERM A 0 0 22
PERM A 0 0 23
/*  mfmaIndex:145  */
v_mfma_f32_16x16x32_bf16 acc[144:147], v[vgprValuB_X0_I0+32+4:vgprValuB_X0_I0+32+4+3], v[vgprValuA_X0_I0+32:vgprValuA_X0_I0+32+3], acc[144:147] // left value = acc[144+0:147+0]
PERM A 8 1 0
PERM A 8 1 1
/*  mfmaIndex:146  */
v_mfma_f32_16x16x32_bf16 acc[144:147], v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprValuA_X0_I0+32:vgprValuA_X0_I0+32+3], acc[144:147] // left value = acc[144+0:147+0]
PERM A 8 1 2
PERM A 8 1 3
/*  mfmaIndex:147  */
v_mfma_f32_16x16x32_bf16 acc[148:151], v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprValuA_X0_I0+40+4:vgprValuA_X0_I0+40+4+3], acc[148:151] // left value = acc[148+0:151+0]
PERM A 8 1 4
PERM A 8 1 5
/*  mfmaIndex:148  */
v_mfma_f32_16x16x32_bf16 acc[148:151], v[vgprValuB_X0_I0+32+4:vgprValuB_X0_I0+32+4+3], v[vgprValuA_X0_I0+40:vgprValuA_X0_I0+40+3], acc[148:151] // left value = acc[148+0:151+0]
PERM A 8 1 6
PERM A 8 1 7
/*  mfmaIndex:149  */
v_mfma_f32_16x16x32_bf16 acc[148:151], v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprValuA_X0_I0+40:vgprValuA_X0_I0+40+3], acc[148:151] // left value = acc[148+0:151+0]
PERM A 8 1 8
PERM A 8 1 9
/*  mfmaIndex:150  */
v_mfma_f32_16x16x32_bf16 acc[152:155], v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprValuA_X0_I0+48+4:vgprValuA_X0_I0+48+4+3], acc[152:155] // left value = acc[152+0:155+0]
PERM A 8 1 10
PERM A 8 1 11
/*  mfmaIndex:151  */
v_mfma_f32_16x16x32_bf16 acc[152:155], v[vgprValuB_X0_I0+32+4:vgprValuB_X0_I0+32+4+3], v[vgprValuA_X0_I0+48:vgprValuA_X0_I0+48+3], acc[152:155] // left value = acc[152+0:155+0]
PERM A 8 1 12
PERM A 8 1 13
/*  mfmaIndex:152  */
v_mfma_f32_16x16x32_bf16 acc[152:155], v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprValuA_X0_I0+48:vgprValuA_X0_I0+48+3], acc[152:155] // left value = acc[152+0:155+0]
PERM A 8 1 14
PERM A 8 1 15
/*  mfmaIndex:153  */
v_mfma_f32_16x16x32_bf16 acc[156:159], v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprValuA_X0_I0+56+4:vgprValuA_X0_I0+56+4+3], acc[156:159] // left value = acc[156+0:159+0]
PERM A 8 1 16
PERM A 8 1 17
/*  mfmaIndex:154  */
v_mfma_f32_16x16x32_bf16 acc[156:159], v[vgprValuB_X0_I0+32+4:vgprValuB_X0_I0+32+4+3], v[vgprValuA_X0_I0+56:vgprValuA_X0_I0+56+3], acc[156:159] // left value = acc[156+0:159+0]
PERM A 8 1 18
PERM A 8 1 19
/*  mfmaIndex:155  */
v_mfma_f32_16x16x32_bf16 acc[156:159], v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprValuA_X0_I0+56:vgprValuA_X0_I0+56+3], acc[156:159] // left value = acc[156+0:159+0]
PERM A 8 1 20
PERM A 8 1 21
/*  mfmaIndex:156  */
v_mfma_f32_16x16x32_bf16 acc[176:179], v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprValuA_X0_I0+32+4:vgprValuA_X0_I0+32+4+3], acc[176:179] // left value = acc[176+0:179+0]
PERM A 8 1 22
PERM A 8 1 23
/*  mfmaIndex:157  */
v_mfma_f32_16x16x32_bf16 acc[176:179], v[vgprValuB_X0_I0+40+4:vgprValuB_X0_I0+40+4+3], v[vgprValuA_X0_I0+32:vgprValuA_X0_I0+32+3], acc[176:179] // left value = acc[176+0:179+0]
PERM A 16 2 0
PERM A 16 2 1
/*  mfmaIndex:158  */
v_mfma_f32_16x16x32_bf16 acc[176:179], v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprValuA_X0_I0+32:vgprValuA_X0_I0+32+3], acc[176:179] // left value = acc[176+0:179+0]
PERM A 16 2 2
PERM A 16 2 3
/*  mfmaIndex:159  */
v_mfma_f32_16x16x32_bf16 acc[180:183], v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprValuA_X0_I0+40+4:vgprValuA_X0_I0+40+4+3], acc[180:183] // left value = acc[180+0:183+0]
PERM A 16 2 4
PERM A 16 2 5
/*  mfmaIndex:160  */
v_mfma_f32_16x16x32_bf16 acc[180:183], v[vgprValuB_X0_I0+40+4:vgprValuB_X0_I0+40+4+3], v[vgprValuA_X0_I0+40:vgprValuA_X0_I0+40+3], acc[180:183] // left value = acc[180+0:183+0]
PERM A 16 2 6
PERM A 16 2 7
/*  mfmaIndex:161  */
v_mfma_f32_16x16x32_bf16 acc[180:183], v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprValuA_X0_I0+40:vgprValuA_X0_I0+40+3], acc[180:183] // left value = acc[180+0:183+0]
PERM A 16 2 8
PERM A 16 2 9
/*  mfmaIndex:162  */
v_mfma_f32_16x16x32_bf16 acc[184:187], v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprValuA_X0_I0+48+4:vgprValuA_X0_I0+48+4+3], acc[184:187] // left value = acc[184+0:187+0]
PERM A 16 2 10
PERM A 16 2 11
/*  mfmaIndex:163  */
v_mfma_f32_16x16x32_bf16 acc[184:187], v[vgprValuB_X0_I0+40+4:vgprValuB_X0_I0+40+4+3], v[vgprValuA_X0_I0+48:vgprValuA_X0_I0+48+3], acc[184:187] // left value = acc[184+0:187+0]
PERM A 16 2 12
PERM A 16 2 13
/*  mfmaIndex:164  */
v_mfma_f32_16x16x32_bf16 acc[184:187], v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprValuA_X0_I0+48:vgprValuA_X0_I0+48+3], acc[184:187] // left value = acc[184+0:187+0]
PERM A 16 2 14
PERM A 16 2 15
/*  mfmaIndex:165  */
v_mfma_f32_16x16x32_bf16 acc[188:191], v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprValuA_X0_I0+56+4:vgprValuA_X0_I0+56+4+3], acc[188:191] // left value = acc[188+0:191+0]
PERM A 16 2 16
PERM A 16 2 17
/*  mfmaIndex:166  */
v_mfma_f32_16x16x32_bf16 acc[188:191], v[vgprValuB_X0_I0+40+4:vgprValuB_X0_I0+40+4+3], v[vgprValuA_X0_I0+56:vgprValuA_X0_I0+56+3], acc[188:191] // left value = acc[188+0:191+0]
PERM A 16 2 18
PERM A 16 2 19
/*  mfmaIndex:167  */
v_mfma_f32_16x16x32_bf16 acc[188:191], v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprValuA_X0_I0+56:vgprValuA_X0_I0+56+3], acc[188:191] // left value = acc[188+0:191+0]
PERM A 16 2 20
PERM A 16 2 21
/*  mfmaIndex:168  */
v_mfma_f32_16x16x32_bf16 acc[208:211], v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprValuA_X0_I0+32+4:vgprValuA_X0_I0+32+4+3], acc[208:211] // left value = acc[208+0:211+0]
PERM A 16 2 22
PERM A 16 2 23
/*  mfmaIndex:169  */
v_mfma_f32_16x16x32_bf16 acc[208:211], v[vgprValuB_X0_I0+48+4:vgprValuB_X0_I0+48+4+3], v[vgprValuA_X0_I0+32:vgprValuA_X0_I0+32+3], acc[208:211] // left value = acc[208+0:211+0]
PERM A 24 3 0
PERM A 24 3 1
/*  mfmaIndex:170  */
v_mfma_f32_16x16x32_bf16 acc[208:211], v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprValuA_X0_I0+32:vgprValuA_X0_I0+32+3], acc[208:211] // left value = acc[208+0:211+0]
PERM A 24 3 2
PERM A 24 3 3
/*  mfmaIndex:171  */
v_mfma_f32_16x16x32_bf16 acc[212:215], v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprValuA_X0_I0+40+4:vgprValuA_X0_I0+40+4+3], acc[212:215] // left value = acc[212+0:215+0]
PERM A 24 3 4
PERM A 24 3 5
/*  mfmaIndex:172  */
v_mfma_f32_16x16x32_bf16 acc[212:215], v[vgprValuB_X0_I0+48+4:vgprValuB_X0_I0+48+4+3], v[vgprValuA_X0_I0+40:vgprValuA_X0_I0+40+3], acc[212:215] // left value = acc[212+0:215+0]
PERM A 24 3 6
PERM A 24 3 7
/*  mfmaIndex:173  */
v_mfma_f32_16x16x32_bf16 acc[212:215], v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprValuA_X0_I0+40:vgprValuA_X0_I0+40+3], acc[212:215] // left value = acc[212+0:215+0]
PERM A 24 3 8
PERM A 24 3 9
/*  mfmaIndex:174  */
v_mfma_f32_16x16x32_bf16 acc[216:219], v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprValuA_X0_I0+48+4:vgprValuA_X0_I0+48+4+3], acc[216:219] // left value = acc[216+0:219+0]
PERM A 24 3 10
PERM A 24 3 11
/*  mfmaIndex:175  */
v_mfma_f32_16x16x32_bf16 acc[216:219], v[vgprValuB_X0_I0+48+4:vgprValuB_X0_I0+48+4+3], v[vgprValuA_X0_I0+48:vgprValuA_X0_I0+48+3], acc[216:219] // left value = acc[216+0:219+0]
PERM A 24 3 12
PERM A 24 3 13
/*  mfmaIndex:176  */
v_mfma_f32_16x16x32_bf16 acc[216:219], v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprValuA_X0_I0+48:vgprValuA_X0_I0+48+3], acc[216:219] // left value = acc[216+0:219+0]
PERM A 24 3 14
PERM A 24 3 15
/*  mfmaIndex:177  */
v_mfma_f32_16x16x32_bf16 acc[220:223], v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprValuA_X0_I0+56+4:vgprValuA_X0_I0+56+4+3], acc[220:223] // left value = acc[220+0:223+0]
PERM A 24 3 16
PERM A 24 3 17
/*  mfmaIndex:178  */
v_mfma_f32_16x16x32_bf16 acc[220:223], v[vgprValuB_X0_I0+48+4:vgprValuB_X0_I0+48+4+3], v[vgprValuA_X0_I0+56:vgprValuA_X0_I0+56+3], acc[220:223] // left value = acc[220+0:223+0]
PERM A 24 3 18
PERM A 24 3 19
/*  mfmaIndex:179  */
v_mfma_f32_16x16x32_bf16 acc[220:223], v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprValuA_X0_I0+56:vgprValuA_X0_I0+56+3], acc[220:223] // left value = acc[220+0:223+0]
PERM A 24 3 20
PERM A 24 3 21
/*  mfmaIndex:180  */
v_mfma_f32_16x16x32_bf16 acc[240:243], v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprValuA_X0_I0+32+4:vgprValuA_X0_I0+32+4+3], acc[240:243] // left value = acc[240+0:243+0]
PERM A 24 3 22
PERM A 24 3 23
/*  mfmaIndex:181  */
v_mfma_f32_16x16x32_bf16 acc[240:243], v[vgprValuB_X0_I0+56+4:vgprValuB_X0_I0+56+4+3], v[vgprValuA_X0_I0+32:vgprValuA_X0_I0+32+3], acc[240:243] // left value = acc[240+0:243+0]
// Drain every outstanding LGKM-counted operation before the first
// `CVT B 0 0 *` invocation below.
s_waitcnt lgkmcnt(0) // Wait for LR B0
CVT B 0 0 0
CVT B 0 0 1
/*  mfmaIndex:182  */
v_mfma_f32_16x16x32_bf16 acc[240:243], v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprValuA_X0_I0+32:vgprValuA_X0_I0+32+3], acc[240:243] // left value = acc[240+0:243+0]
CVT B 0 0 2
CVT B 0 0 3
/*  mfmaIndex:183  */
v_mfma_f32_16x16x32_bf16 acc[244:247], v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprValuA_X0_I0+40+4:vgprValuA_X0_I0+40+4+3], acc[244:247] // left value = acc[244+0:247+0]
CVT B 0 0 4
CVT B 0 0 5
/*  mfmaIndex:184  */
v_mfma_f32_16x16x32_bf16 acc[244:247], v[vgprValuB_X0_I0+56+4:vgprValuB_X0_I0+56+4+3], v[vgprValuA_X0_I0+40:vgprValuA_X0_I0+40+3], acc[244:247] // left value = acc[244+0:247+0]
CVT B 0 0 6
CVT B 0 0 7
/*  mfmaIndex:185  */
v_mfma_f32_16x16x32_bf16 acc[244:247], v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprValuA_X0_I0+40:vgprValuA_X0_I0+40+3], acc[244:247] // left value = acc[244+0:247+0]
CVT B 0 0 8
CVT B 0 0 9
/*  mfmaIndex:186  */
v_mfma_f32_16x16x32_bf16 acc[248:251], v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprValuA_X0_I0+48+4:vgprValuA_X0_I0+48+4+3], acc[248:251] // left value = acc[248+0:251+0]
CVT B 0 0 10
CVT B 0 0 11
/*  mfmaIndex:187  */
v_mfma_f32_16x16x32_bf16 acc[248:251], v[vgprValuB_X0_I0+56+4:vgprValuB_X0_I0+56+4+3], v[vgprValuA_X0_I0+48:vgprValuA_X0_I0+48+3], acc[248:251] // left value = acc[248+0:251+0]
CVT B 0 0 12
CVT B 0 0 13
/*  mfmaIndex:188  */
v_mfma_f32_16x16x32_bf16 acc[248:251], v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprValuA_X0_I0+48:vgprValuA_X0_I0+48+3], acc[248:251] // left value = acc[248+0:251+0]
CVT B 0 0 14
CVT B 0 0 15
/*  mfmaIndex:189  */
v_mfma_f32_16x16x32_bf16 acc[252:255], v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprValuA_X0_I0+56+4:vgprValuA_X0_I0+56+4+3], acc[252:255] // left value = acc[252+0:255+0]
CVT B 0 0 16
CVT B 0 0 17
/*  mfmaIndex:190  */
v_mfma_f32_16x16x32_bf16 acc[252:255], v[vgprValuB_X0_I0+56+4:vgprValuB_X0_I0+56+4+3], v[vgprValuA_X0_I0+56:vgprValuA_X0_I0+56+3], acc[252:255] // left value = acc[252+0:255+0]
// The remaining `CVT B 0 0 *` invocations are issued back to back, with no
// MFMA between them.
CVT B 0 0 18
CVT B 0 0 19
CVT B 0 0 20
CVT B 0 0 21
CVT B 0 0 22
CVT B 0 0 23
/* Explicit f32 -> split-bf16 conversion of B for register groups +8, +16 and +24. */
/* Step 1: pack each consecutive f32 pair of ValuB_T0 into one packed-bf16 register */
/* of ValuB_X0 (registers +0..+3 of every group). These are the registers later read */
/* as v[vgprValuB_X0_I0+G : vgprValuB_X0_I0+G+3] by the MFMAs after label_toPGR1. */
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+8+0], v[vgprValuB_T0_I0+8+0], v[vgprValuB_T0_I0+8+1]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+8+1], v[vgprValuB_T0_I0+8+2], v[vgprValuB_T0_I0+8+3]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+8+2], v[vgprValuB_T0_I0+8+4], v[vgprValuB_T0_I0+8+5]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+8+3], v[vgprValuB_T0_I0+8+6], v[vgprValuB_T0_I0+8+7]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+16+0], v[vgprValuB_T0_I0+16+0], v[vgprValuB_T0_I0+16+1]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+16+1], v[vgprValuB_T0_I0+16+2], v[vgprValuB_T0_I0+16+3]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+16+2], v[vgprValuB_T0_I0+16+4], v[vgprValuB_T0_I0+16+5]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+16+3], v[vgprValuB_T0_I0+16+6], v[vgprValuB_T0_I0+16+7]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+24+0], v[vgprValuB_T0_I0+24+0], v[vgprValuB_T0_I0+24+1]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+24+1], v[vgprValuB_T0_I0+24+2], v[vgprValuB_T0_I0+24+3]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+24+2], v[vgprValuB_T0_I0+24+4], v[vgprValuB_T0_I0+24+5]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+24+3], v[vgprValuB_T0_I0+24+6], v[vgprValuB_T0_I0+24+7]

/* Step 2: subtract the packed bf16 value back out of each f32 source, in place. */
/* v_dot2c accumulates into its destination (D += S0.lo*S1.lo + S0.hi*S1.hi per the ISA */
/* definition -- TODO confirm against the gfx950 ISA guide). */
/*   0x8000bf80: low half 0xbf80 = bf16 -1.0, high half 0x8000 = bf16 -0.0 -> T -= low  bf16 of X */
/*   0xbf800000: high half 0xbf80 = bf16 -1.0, low half 0x0000 = bf16 +0.0 -> T -= high bf16 of X */
/* Even T registers use the low half and odd T registers the high half, matching the */
/* source order of the packs in step 1 (presumably src0 -> low half, src1 -> high half). */
/* Afterwards each ValuB_T0 register holds what is left of its f32 after removing the bf16 term. */
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+8+0], 0x8000bf80, v[vgprValuB_X0_I0+8+0]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+8+1], 0xbf800000, v[vgprValuB_X0_I0+8+0]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+8+2], 0x8000bf80, v[vgprValuB_X0_I0+8+1]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+8+3], 0xbf800000, v[vgprValuB_X0_I0+8+1]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+8+4], 0x8000bf80, v[vgprValuB_X0_I0+8+2]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+8+5], 0xbf800000, v[vgprValuB_X0_I0+8+2]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+8+6], 0x8000bf80, v[vgprValuB_X0_I0+8+3]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+8+7], 0xbf800000, v[vgprValuB_X0_I0+8+3]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+16+0], 0x8000bf80, v[vgprValuB_X0_I0+16+0]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+16+1], 0xbf800000, v[vgprValuB_X0_I0+16+0]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+16+2], 0x8000bf80, v[vgprValuB_X0_I0+16+1]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+16+3], 0xbf800000, v[vgprValuB_X0_I0+16+1]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+16+4], 0x8000bf80, v[vgprValuB_X0_I0+16+2]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+16+5], 0xbf800000, v[vgprValuB_X0_I0+16+2]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+16+6], 0x8000bf80, v[vgprValuB_X0_I0+16+3]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+16+7], 0xbf800000, v[vgprValuB_X0_I0+16+3]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+24+0], 0x8000bf80, v[vgprValuB_X0_I0+24+0]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+24+1], 0xbf800000, v[vgprValuB_X0_I0+24+0]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+24+2], 0x8000bf80, v[vgprValuB_X0_I0+24+1]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+24+3], 0xbf800000, v[vgprValuB_X0_I0+24+1]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+24+4], 0x8000bf80, v[vgprValuB_X0_I0+24+2]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+24+5], 0xbf800000, v[vgprValuB_X0_I0+24+2]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+24+6], 0x8000bf80, v[vgprValuB_X0_I0+24+3]
v_dot2c_f32_bf16 v[vgprValuB_T0_I0+24+7], 0xbf800000, v[vgprValuB_X0_I0+24+3]

/* Step 3 (group +8 only): pack the remainders from step 2 into registers +4..+7 of the */
/* group, issued in descending register order. The same packing for groups +16 and +24 */
/* is not done here; it is interleaved with mfmaIndex 1..7 after label_toPGR1. */
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+8+7], v[vgprValuB_T0_I0+8+6], v[vgprValuB_T0_I0+8+7]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+8+6], v[vgprValuB_T0_I0+8+4], v[vgprValuB_T0_I0+8+5]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+8+5], v[vgprValuB_T0_I0+8+2], v[vgprValuB_T0_I0+8+3]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+8+4], v[vgprValuB_T0_I0+8+0], v[vgprValuB_T0_I0+8+1]
/*  mfmaIndex:191  */
v_mfma_f32_16x16x32_bf16 acc[252:255], v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprValuA_X0_I0+56:vgprValuA_X0_I0+56+3], acc[252:255] // left value = acc[252+0:255+0]
/* numPrefetchIter=0 */
/* dataAtIterA=0 numReadsIterA=1 skipReadsIterA=0 readsPerIterA=64 */
/* dataAtIterB=0 numReadsIterB=1 skipReadsIterB=0 readsPerIterB=16 */
label_toPGR1:

/******************************************/
/* Ord. NoLoadLoop - Begin                */
/******************************************/
// Entry synchronization for the no-load loop:
//   vmcnt(0)  - this wave waits until none of its vector-memory operations are outstanding
//   s_barrier - then waits for the other waves of the workgroup to reach this point
// The "10" in the next comment is kept as found; it looks like a generator tag, not a count -- TODO confirm.
s_waitcnt vmcnt(0)                                 // 10wait for global read
// Skip force waitcnt0
s_barrier

/* iter 0 (last unrolled loop) */
/*  grEndMfmaIndex:0, lwStartMfmaIndex:191, lwEndMfmaIndex:191  */
/*  numMfmaForLR:52, syncPlrMfmaIndex:0  */
// A0 B0
/*  mfmaIndex:0  */
v_mfma_f32_16x16x32_bf16 acc[0:3], v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprValuA_X0_I0+0+4:vgprValuA_X0_I0+0+4+3], acc[0:3] // left value = acc[0+0:3+0]
LR1 A 8
LR1 A 9
/*  mfmaIndex:1  */
v_mfma_f32_16x16x32_bf16 acc[0:3], v[vgprValuB_X0_I0+0+4:vgprValuB_X0_I0+0+4+3], v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], acc[0:3] // left value = acc[0+0:3+0]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+16+7], v[vgprValuB_T0_I0+16+6], v[vgprValuB_T0_I0+16+7]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+16+6], v[vgprValuB_T0_I0+16+4], v[vgprValuB_T0_I0+16+5]
/*  mfmaIndex:2  */
v_mfma_f32_16x16x32_bf16 acc[0:3], v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], acc[0:3] // left value = acc[0+0:3+0]
LR1 A 10
LR1 A 11
/*  mfmaIndex:3  */
v_mfma_f32_16x16x32_bf16 acc[4:7], v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprValuA_X0_I0+8+4:vgprValuA_X0_I0+8+4+3], acc[4:7] // left value = acc[4+0:7+0]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+16+5], v[vgprValuB_T0_I0+16+2], v[vgprValuB_T0_I0+16+3]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+16+4], v[vgprValuB_T0_I0+16+0], v[vgprValuB_T0_I0+16+1]
/*  mfmaIndex:4  */
v_mfma_f32_16x16x32_bf16 acc[4:7], v[vgprValuB_X0_I0+0+4:vgprValuB_X0_I0+0+4+3], v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], acc[4:7] // left value = acc[4+0:7+0]
LR1 A 12
LR1 A 13
/*  mfmaIndex:5  */
v_mfma_f32_16x16x32_bf16 acc[4:7], v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], acc[4:7] // left value = acc[4+0:7+0]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+24+7], v[vgprValuB_T0_I0+24+6], v[vgprValuB_T0_I0+24+7]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+24+6], v[vgprValuB_T0_I0+24+4], v[vgprValuB_T0_I0+24+5]
/*  mfmaIndex:6  */
v_mfma_f32_16x16x32_bf16 acc[8:11], v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprValuA_X0_I0+16+4:vgprValuA_X0_I0+16+4+3], acc[8:11] // left value = acc[8+0:11+0]
LR1 A 14
LR1 A 15
/*  mfmaIndex:7  */
v_mfma_f32_16x16x32_bf16 acc[8:11], v[vgprValuB_X0_I0+0+4:vgprValuB_X0_I0+0+4+3], v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], acc[8:11] // left value = acc[8+0:11+0]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+24+5], v[vgprValuB_T0_I0+24+2], v[vgprValuB_T0_I0+24+3]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+24+4], v[vgprValuB_T0_I0+24+0], v[vgprValuB_T0_I0+24+1]
/*  mfmaIndex:8  */
v_mfma_f32_16x16x32_bf16 acc[8:11], v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], acc[8:11] // left value = acc[8+0:11+0]
// 8x LR A1 were issued above (LR1 A 8..15). lgkmcnt(4) lets at most 4 LGKM operations
// remain outstanding, i.e. the first 4 LR A1 are done before PERM A 32 starts
// (assumes each LR1 macro issues exactly one LGKM operation -- TODO confirm against the macro body).
s_waitcnt lgkmcnt(4) // 8x LR A1 issued, wait for 4x LR A1 to be done.
PERM A 32 0 0
PERM A 32 0 1
/*  mfmaIndex:9  */
v_mfma_f32_16x16x32_bf16 acc[12:15], v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprValuA_X0_I0+24+4:vgprValuA_X0_I0+24+4+3], acc[12:15] // left value = acc[12+0:15+0]
PERM A 32 0 2
PERM A 32 0 3
/*  mfmaIndex:10  */
v_mfma_f32_16x16x32_bf16 acc[12:15], v[vgprValuB_X0_I0+0+4:vgprValuB_X0_I0+0+4+3], v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], acc[12:15] // left value = acc[12+0:15+0]
PERM A 32 0 4
PERM A 32 0 5
/*  mfmaIndex:11  */
v_mfma_f32_16x16x32_bf16 acc[12:15], v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], acc[12:15] // left value = acc[12+0:15+0]
s_waitcnt lgkmcnt(0) // 8x LR A1 done.
PERM A 32 0 6
PERM A 32 0 7
/*  mfmaIndex:12  */
v_mfma_f32_16x16x32_bf16 acc[32:35], v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprValuA_X0_I0+0+4:vgprValuA_X0_I0+0+4+3], acc[32:35] // left value = acc[32+0:35+0]
/*  mfmaIndex:13  */
v_mfma_f32_16x16x32_bf16 acc[32:35], v[vgprValuB_X0_I0+8+4:vgprValuB_X0_I0+8+4+3], v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], acc[32:35] // left value = acc[32+0:35+0]
PERM A 32 0 8
/*  mfmaIndex:14  */
v_mfma_f32_16x16x32_bf16 acc[32:35], v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], acc[32:35] // left value = acc[32+0:35+0]
LR0 B 8
PERM A 32 0 9
/*  mfmaIndex:15  */
v_mfma_f32_16x16x32_bf16 acc[36:39], v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprValuA_X0_I0+8+4:vgprValuA_X0_I0+8+4+3], acc[36:39] // left value = acc[36+0:39+0]
PERM A 32 0 10
/*  mfmaIndex:16  */
v_mfma_f32_16x16x32_bf16 acc[36:39], v[vgprValuB_X0_I0+8+4:vgprValuB_X0_I0+8+4+3], v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], acc[36:39] // left value = acc[36+0:39+0]
LR0 B 9
PERM A 32 0 11
/*  mfmaIndex:17  */
v_mfma_f32_16x16x32_bf16 acc[36:39], v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], acc[36:39] // left value = acc[36+0:39+0]
PERM A 32 0 12
/*  mfmaIndex:18  */
v_mfma_f32_16x16x32_bf16 acc[40:43], v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprValuA_X0_I0+16+4:vgprValuA_X0_I0+16+4+3], acc[40:43] // left value = acc[40+0:43+0]
LR0 B 10
PERM A 32 0 13
/*  mfmaIndex:19  */
v_mfma_f32_16x16x32_bf16 acc[40:43], v[vgprValuB_X0_I0+8+4:vgprValuB_X0_I0+8+4+3], v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], acc[40:43] // left value = acc[40+0:43+0]
PERM A 32 0 14
/*  mfmaIndex:20  */
v_mfma_f32_16x16x32_bf16 acc[40:43], v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], acc[40:43] // left value = acc[40+0:43+0]
LR0 B 11
PERM A 32 0 15
/*  mfmaIndex:21  */
v_mfma_f32_16x16x32_bf16 acc[44:47], v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprValuA_X0_I0+24+4:vgprValuA_X0_I0+24+4+3], acc[44:47] // left value = acc[44+0:47+0]
LR0 B 12
/*  mfmaIndex:22  */
v_mfma_f32_16x16x32_bf16 acc[44:47], v[vgprValuB_X0_I0+8+4:vgprValuB_X0_I0+8+4+3], v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], acc[44:47] // left value = acc[44+0:47+0]
PERM A 32 0 16
PERM A 32 0 17
/*  mfmaIndex:23  */
v_mfma_f32_16x16x32_bf16 acc[44:47], v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], acc[44:47] // left value = acc[44+0:47+0]
PERM A 32 0 18
PERM A 32 0 19
/*  mfmaIndex:24  */
v_mfma_f32_16x16x32_bf16 acc[64:67], v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprValuA_X0_I0+0+4:vgprValuA_X0_I0+0+4+3], acc[64:67] // left value = acc[64+0:67+0]
PERM A 32 0 20
/*  mfmaIndex:25  */
v_mfma_f32_16x16x32_bf16 acc[64:67], v[vgprValuB_X0_I0+16+4:vgprValuB_X0_I0+16+4+3], v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], acc[64:67] // left value = acc[64+0:67+0]
LR0 B 13  
PERM A 32 0 21
/*  mfmaIndex:26  */
v_mfma_f32_16x16x32_bf16 acc[64:67], v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], acc[64:67] // left value = acc[64+0:67+0]
LR0 B 14
/*  mfmaIndex:27  */
v_mfma_f32_16x16x32_bf16 acc[68:71], v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprValuA_X0_I0+8+4:vgprValuA_X0_I0+8+4+3], acc[68:71] // left value = acc[68+0:71+0]
PERM A 32 0 22
PERM A 32 0 23  
/*  mfmaIndex:28  */
v_mfma_f32_16x16x32_bf16 acc[68:71], v[vgprValuB_X0_I0+16+4:vgprValuB_X0_I0+16+4+3], v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], acc[68:71] // left value = acc[68+0:71+0]
PERM A 40 1 0
PERM A 40 1 1
/*  mfmaIndex:29  */
v_mfma_f32_16x16x32_bf16 acc[68:71], v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], acc[68:71] // left value = acc[68+0:71+0]
PERM A 40 1 2
PERM A 40 1 3
/*  mfmaIndex:30  */
v_mfma_f32_16x16x32_bf16 acc[72:75], v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprValuA_X0_I0+16+4:vgprValuA_X0_I0+16+4+3], acc[72:75] // left value = acc[72+0:75+0]
PERM A 40 1 4
/*  mfmaIndex:31  */
v_mfma_f32_16x16x32_bf16 acc[72:75], v[vgprValuB_X0_I0+16+4:vgprValuB_X0_I0+16+4+3], v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], acc[72:75] // left value = acc[72+0:75+0]
LR0 B 15  
PERM A 40 1 5
/*  mfmaIndex:32  */
v_mfma_f32_16x16x32_bf16 acc[72:75], v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], acc[72:75] // left value = acc[72+0:75+0]
PERM A 40 1 6
PERM A 40 1 7
/*  mfmaIndex:33  */
v_mfma_f32_16x16x32_bf16 acc[76:79], v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprValuA_X0_I0+24+4:vgprValuA_X0_I0+24+4+3], acc[76:79] // left value = acc[76+0:79+0]
PERM A 40 1 8
PERM A 40 1 9
/*  mfmaIndex:34  */
v_mfma_f32_16x16x32_bf16 acc[76:79], v[vgprValuB_X0_I0+16+4:vgprValuB_X0_I0+16+4+3], v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], acc[76:79] // left value = acc[76+0:79+0]
PERM A 40 1 10
PERM A 40 1 11
/*  mfmaIndex:35  */
v_mfma_f32_16x16x32_bf16 acc[76:79], v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], acc[76:79] // left value = acc[76+0:79+0]
PERM A 40 1 12
PERM A 40 1 13
/*  mfmaIndex:36  */
v_mfma_f32_16x16x32_bf16 acc[96:99], v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprValuA_X0_I0+0+4:vgprValuA_X0_I0+0+4+3], acc[96:99] // left value = acc[96+0:99+0]
PERM A 40 1 14
/*  mfmaIndex:37  */
v_mfma_f32_16x16x32_bf16 acc[96:99], v[vgprValuB_X0_I0+24+4:vgprValuB_X0_I0+24+4+3], v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], acc[96:99] // left value = acc[96+0:99+0]
PERM A 40 1 15
/*  mfmaIndex:38  */
v_mfma_f32_16x16x32_bf16 acc[96:99], v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], acc[96:99] // left value = acc[96+0:99+0]
PERM A 40 1 16
PERM A 40 1 17
/*  mfmaIndex:39  */
v_mfma_f32_16x16x32_bf16 acc[100:103], v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprValuA_X0_I0+8+4:vgprValuA_X0_I0+8+4+3], acc[100:103] // left value = acc[100+0:103+0]
PERM A 40 1 18
PERM A 40 1 19
/*  mfmaIndex:40  */
v_mfma_f32_16x16x32_bf16 acc[100:103], v[vgprValuB_X0_I0+24+4:vgprValuB_X0_I0+24+4+3], v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], acc[100:103] // left value = acc[100+0:103+0]
PERM A 40 1 20
PERM A 40 1 21
/*  mfmaIndex:41  */
v_mfma_f32_16x16x32_bf16 acc[100:103], v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], acc[100:103] // left value = acc[100+0:103+0]
/*  mfmaIndex:42  */
v_mfma_f32_16x16x32_bf16 acc[104:107], v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprValuA_X0_I0+16+4:vgprValuA_X0_I0+16+4+3], acc[104:107] // left value = acc[104+0:107+0]
PERM A 40 1 22
PERM A 40 1 23
/*  mfmaIndex:43  */
v_mfma_f32_16x16x32_bf16 acc[104:107], v[vgprValuB_X0_I0+24+4:vgprValuB_X0_I0+24+4+3], v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], acc[104:107] // left value = acc[104+0:107+0]
PERM A 48 2 0
PERM A 48 2 1
/*  mfmaIndex:44  */
v_mfma_f32_16x16x32_bf16 acc[104:107], v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], acc[104:107] // left value = acc[104+0:107+0]
PERM A 48 2 2
PERM A 48 2 3
/*  mfmaIndex:45  */
v_mfma_f32_16x16x32_bf16 acc[108:111], v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprValuA_X0_I0+24+4:vgprValuA_X0_I0+24+4+3], acc[108:111] // left value = acc[108+0:111+0]
PERM A 48 2 4
PERM A 48 2 5
/*  mfmaIndex:46  */
v_mfma_f32_16x16x32_bf16 acc[108:111], v[vgprValuB_X0_I0+24+4:vgprValuB_X0_I0+24+4+3], v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], acc[108:111] // left value = acc[108+0:111+0]
PERM A 48 2 6
PERM A 48 2 7
/*  mfmaIndex:47  */
v_mfma_f32_16x16x32_bf16 acc[108:111], v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], acc[108:111] // left value = acc[108+0:111+0]
PERM A 48 2 8
PERM A 48 2 9

// B0 A1 -- MFMAs reordered (mfmaIndex order below is 48-53, 60-65, 72-77, 84-89, then 54-59, 66-71, 78-83, 90-95) to allow more time for PERM A1 to complete.

/*  mfmaIndex:48  */
v_mfma_f32_16x16x32_bf16 acc[16:19], v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprValuA_X0_I0+32+4:vgprValuA_X0_I0+32+4+3], acc[16:19] // left value = acc[16+0:19+0]
PERM A 48 2 10
PERM A 48 2 11
/*  mfmaIndex:49  */
v_mfma_f32_16x16x32_bf16 acc[16:19], v[vgprValuB_X0_I0+0+4:vgprValuB_X0_I0+0+4+3], v[vgprValuA_X0_I0+32:vgprValuA_X0_I0+32+3], acc[16:19] // left value = acc[16+0:19+0]
PERM A 48 2 12
PERM A 48 2 13
/*  mfmaIndex:50  */
v_mfma_f32_16x16x32_bf16 acc[16:19], v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprValuA_X0_I0+32:vgprValuA_X0_I0+32+3], acc[16:19] // left value = acc[16+0:19+0]
PERM A 48 2 14
PERM A 48 2 15
  /*  mfmaIndex:51  */
v_mfma_f32_16x16x32_bf16 acc[20:23], v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprValuA_X0_I0+40+4:vgprValuA_X0_I0+40+4+3], acc[20:23] // left value = acc[20+0:23+0]
PERM A 48 2 16
PERM A 48 2 17
/*  mfmaIndex:52  */
v_mfma_f32_16x16x32_bf16 acc[20:23], v[vgprValuB_X0_I0+0+4:vgprValuB_X0_I0+0+4+3], v[vgprValuA_X0_I0+40:vgprValuA_X0_I0+40+3], acc[20:23] // left value = acc[20+0:23+0]
PERM A 48 2 18
PERM A 48 2 19
/*  mfmaIndex:53  */
v_mfma_f32_16x16x32_bf16 acc[20:23], v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprValuA_X0_I0+40:vgprValuA_X0_I0+40+3], acc[20:23] // left value = acc[20+0:23+0]
PERM A 48 2 20
PERM A 48 2 21
/*  mfmaIndex:60  */
v_mfma_f32_16x16x32_bf16 acc[48:51], v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprValuA_X0_I0+32+4:vgprValuA_X0_I0+32+4+3], acc[48:51] // left value = acc[48+0:51+0]
PERM A 48 2 22
PERM A 48 2 23
/*  mfmaIndex:61  */
v_mfma_f32_16x16x32_bf16 acc[48:51], v[vgprValuB_X0_I0+8+4:vgprValuB_X0_I0+8+4+3], v[vgprValuA_X0_I0+32:vgprValuA_X0_I0+32+3], acc[48:51] // left value = acc[48+0:51+0]
PERM A 56 3 0
PERM A 56 3 1
/*  mfmaIndex:62  */
v_mfma_f32_16x16x32_bf16 acc[48:51], v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprValuA_X0_I0+32:vgprValuA_X0_I0+32+3], acc[48:51] // left value = acc[48+0:51+0]
PERM A 56 3 2
PERM A 56 3 3
/*  mfmaIndex:63  */
v_mfma_f32_16x16x32_bf16 acc[52:55], v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprValuA_X0_I0+40+4:vgprValuA_X0_I0+40+4+3], acc[52:55] // left value = acc[52+0:55+0]
PERM A 56 3 4
PERM A 56 3 5
/*  mfmaIndex:64  */
v_mfma_f32_16x16x32_bf16 acc[52:55], v[vgprValuB_X0_I0+8+4:vgprValuB_X0_I0+8+4+3], v[vgprValuA_X0_I0+40:vgprValuA_X0_I0+40+3], acc[52:55] // left value = acc[52+0:55+0]
PERM A 56 3 6
PERM A 56 3 7
/*  mfmaIndex:65  */
v_mfma_f32_16x16x32_bf16 acc[52:55], v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprValuA_X0_I0+40:vgprValuA_X0_I0+40+3], acc[52:55] // left value = acc[52+0:55+0]
PERM A 56 3 8
PERM A 56 3 9
/*  mfmaIndex:72  */
v_mfma_f32_16x16x32_bf16 acc[80:83], v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprValuA_X0_I0+32+4:vgprValuA_X0_I0+32+4+3], acc[80:83] // left value = acc[80+0:83+0]
PERM A 56 3 10
PERM A 56 3 11
/*  mfmaIndex:73  */
v_mfma_f32_16x16x32_bf16 acc[80:83], v[vgprValuB_X0_I0+16+4:vgprValuB_X0_I0+16+4+3], v[vgprValuA_X0_I0+32:vgprValuA_X0_I0+32+3], acc[80:83] // left value = acc[80+0:83+0]
PERM A 56 3 12
PERM A 56 3 13
/*  mfmaIndex:74  */
v_mfma_f32_16x16x32_bf16 acc[80:83], v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprValuA_X0_I0+32:vgprValuA_X0_I0+32+3], acc[80:83] // left value = acc[80+0:83+0]
PERM A 56 3 14
PERM A 56 3 15
/*  mfmaIndex:75  */
v_mfma_f32_16x16x32_bf16 acc[84:87], v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprValuA_X0_I0+40+4:vgprValuA_X0_I0+40+4+3], acc[84:87] // left value = acc[84+0:87+0]
PERM A 56 3 16
PERM A 56 3 17
/*  mfmaIndex:76  */
v_mfma_f32_16x16x32_bf16 acc[84:87], v[vgprValuB_X0_I0+16+4:vgprValuB_X0_I0+16+4+3], v[vgprValuA_X0_I0+40:vgprValuA_X0_I0+40+3], acc[84:87] // left value = acc[84+0:87+0]
PERM A 56 3 18
PERM A 56 3 19
/*  mfmaIndex:77  */
v_mfma_f32_16x16x32_bf16 acc[84:87], v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprValuA_X0_I0+40:vgprValuA_X0_I0+40+3], acc[84:87] // left value = acc[84+0:87+0]
PERM A 56 3 20
PERM A 56 3 21
/*  mfmaIndex:84  */
v_mfma_f32_16x16x32_bf16 acc[112:115], v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprValuA_X0_I0+32+4:vgprValuA_X0_I0+32+4+3], acc[112:115] // left value = acc[112+0:115+0]
PERM A 56 3 22
PERM A 56 3 23
/*  mfmaIndex:85  */
v_mfma_f32_16x16x32_bf16 acc[112:115], v[vgprValuB_X0_I0+24+4:vgprValuB_X0_I0+24+4+3], v[vgprValuA_X0_I0+32:vgprValuA_X0_I0+32+3], acc[112:115] // left value = acc[112+0:115+0]
s_waitcnt lgkmcnt(0) // Wait for LR B1 to finish
CVT B 32 0 0
CVT B 32 0 1
/*  mfmaIndex:86  */
v_mfma_f32_16x16x32_bf16 acc[112:115], v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprValuA_X0_I0+32:vgprValuA_X0_I0+32+3], acc[112:115] // left value = acc[112+0:115+0]
CVT B 32 0 2
CVT B 32 0 3
/*  mfmaIndex:87  */
v_mfma_f32_16x16x32_bf16 acc[116:119], v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprValuA_X0_I0+40+4:vgprValuA_X0_I0+40+4+3], acc[116:119] // left value = acc[116+0:119+0]
CVT B 32 0 4
CVT B 32 0 5
/*  mfmaIndex:88  */
v_mfma_f32_16x16x32_bf16 acc[116:119], v[vgprValuB_X0_I0+24+4:vgprValuB_X0_I0+24+4+3], v[vgprValuA_X0_I0+40:vgprValuA_X0_I0+40+3], acc[116:119] // left value = acc[116+0:119+0]
CVT B 32 0 6
CVT B 32 0 7
/*  mfmaIndex:89  */
v_mfma_f32_16x16x32_bf16 acc[116:119], v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprValuA_X0_I0+40:vgprValuA_X0_I0+40+3], acc[116:119] // left value = acc[116+0:119+0]
CVT B 32 0 8
CVT B 32 0 9
/*  mfmaIndex:54  */
v_mfma_f32_16x16x32_bf16 acc[24:27], v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprValuA_X0_I0+48+4:vgprValuA_X0_I0+48+4+3], acc[24:27] // left value = acc[24+0:27+0]
CVT B 32 0 10
CVT B 32 0 11
/*  mfmaIndex:55  */
v_mfma_f32_16x16x32_bf16 acc[24:27], v[vgprValuB_X0_I0+0+4:vgprValuB_X0_I0+0+4+3], v[vgprValuA_X0_I0+48:vgprValuA_X0_I0+48+3], acc[24:27] // left value = acc[24+0:27+0]
CVT B 32 0 12
CVT B 32 0 13
/*  mfmaIndex:56  */
v_mfma_f32_16x16x32_bf16 acc[24:27], v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprValuA_X0_I0+48:vgprValuA_X0_I0+48+3], acc[24:27] // left value = acc[24+0:27+0]
CVT B 32 0 14
CVT B 32 0 15
/*  mfmaIndex:57  */
v_mfma_f32_16x16x32_bf16 acc[28:31], v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprValuA_X0_I0+56+4:vgprValuA_X0_I0+56+4+3], acc[28:31] // left value = acc[28+0:31+0]
CVT B 32 0 16
CVT B 32 0 17
/*  mfmaIndex:58  */
v_mfma_f32_16x16x32_bf16 acc[28:31], v[vgprValuB_X0_I0+0+4:vgprValuB_X0_I0+0+4+3], v[vgprValuA_X0_I0+56:vgprValuA_X0_I0+56+3], acc[28:31] // left value = acc[28+0:31+0]
CVT B 32 0 18
CVT B 32 0 19
/*  mfmaIndex:59  */
v_mfma_f32_16x16x32_bf16 acc[28:31], v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprValuA_X0_I0+56:vgprValuA_X0_I0+56+3], acc[28:31] // left value = acc[28+0:31+0]
CVT B 32 0 20
CVT B 32 0 21
/*  mfmaIndex:66  */
v_mfma_f32_16x16x32_bf16 acc[56:59], v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprValuA_X0_I0+48+4:vgprValuA_X0_I0+48+4+3], acc[56:59] // left value = acc[56+0:59+0]
CVT B 32 0 22
CVT B 32 0 23
/*  mfmaIndex:67  */
v_mfma_f32_16x16x32_bf16 acc[56:59], v[vgprValuB_X0_I0+8+4:vgprValuB_X0_I0+8+4+3], v[vgprValuA_X0_I0+48:vgprValuA_X0_I0+48+3], acc[56:59] // left value = acc[56+0:59+0]
CVT B 40 8 0
CVT B 40 8 1
/*  mfmaIndex:68  */
v_mfma_f32_16x16x32_bf16 acc[56:59], v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprValuA_X0_I0+48:vgprValuA_X0_I0+48+3], acc[56:59] // left value = acc[56+0:59+0]
CVT B 40 8 2
CVT B 40 8 3
/*  mfmaIndex:69  */
v_mfma_f32_16x16x32_bf16 acc[60:63], v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprValuA_X0_I0+56+4:vgprValuA_X0_I0+56+4+3], acc[60:63] // left value = acc[60+0:63+0]
CVT B 40 8 4
CVT B 40 8 5
/*  mfmaIndex:70  */
v_mfma_f32_16x16x32_bf16 acc[60:63], v[vgprValuB_X0_I0+8+4:vgprValuB_X0_I0+8+4+3], v[vgprValuA_X0_I0+56:vgprValuA_X0_I0+56+3], acc[60:63] // left value = acc[60+0:63+0]
CVT B 40 8 6
CVT B 40 8 7
/*  mfmaIndex:71  */
v_mfma_f32_16x16x32_bf16 acc[60:63], v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprValuA_X0_I0+56:vgprValuA_X0_I0+56+3], acc[60:63] // left value = acc[60+0:63+0]
CVT B 40 8 8
CVT B 40 8 9
/*  mfmaIndex:78  */
v_mfma_f32_16x16x32_bf16 acc[88:91], v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprValuA_X0_I0+48+4:vgprValuA_X0_I0+48+4+3], acc[88:91] // left value = acc[88+0:91+0]
// NOTE(review): this wait and its comment describe global reads being in flight and an
// upcoming "LR A0". In the visible part of this no-load loop there is a vmcnt(0) at loop
// entry, no explicit global-read instruction before this point, and no LR macro after it
// (macro bodies are not visible here). The wait therefore looks inherited from the
// main-loop schedule and may be a no-op in this loop -- confirm before relying on it.
s_waitcnt vmcnt(16) // Wait for the previous A global reads to finish before starting LR A0: 8A + 8B were issued earlier and 8A were just issued, so the earlier 8B plus the new 8A (16 total) may still be in flight.
CVT B 40 8 10
CVT B 40 8 11
/*  mfmaIndex:79  */
v_mfma_f32_16x16x32_bf16 acc[88:91], v[vgprValuB_X0_I0+16+4:vgprValuB_X0_I0+16+4+3], v[vgprValuA_X0_I0+48:vgprValuA_X0_I0+48+3], acc[88:91] // left value = acc[88+0:91+0]
/*  mfmaIndex:80  */
v_mfma_f32_16x16x32_bf16 acc[88:91], v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprValuA_X0_I0+48:vgprValuA_X0_I0+48+3], acc[88:91] // left value = acc[88+0:91+0]
/*  mfmaIndex:81  */
v_mfma_f32_16x16x32_bf16 acc[92:95], v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprValuA_X0_I0+56+4:vgprValuA_X0_I0+56+4+3], acc[92:95] // left value = acc[92+0:95+0]
/*  mfmaIndex:82  */
v_mfma_f32_16x16x32_bf16 acc[92:95], v[vgprValuB_X0_I0+16+4:vgprValuB_X0_I0+16+4+3], v[vgprValuA_X0_I0+56:vgprValuA_X0_I0+56+3], acc[92:95] // left value = acc[92+0:95+0]
/*  mfmaIndex:83  */
v_mfma_f32_16x16x32_bf16 acc[92:95], v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprValuA_X0_I0+56:vgprValuA_X0_I0+56+3], acc[92:95] // left value = acc[92+0:95+0]
/*  mfmaIndex:90  */
v_mfma_f32_16x16x32_bf16 acc[120:123], v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprValuA_X0_I0+48+4:vgprValuA_X0_I0+48+4+3], acc[120:123] // left value = acc[120+0:123+0]
/*  mfmaIndex:91  */
v_mfma_f32_16x16x32_bf16 acc[120:123], v[vgprValuB_X0_I0+24+4:vgprValuB_X0_I0+24+4+3], v[vgprValuA_X0_I0+48:vgprValuA_X0_I0+48+3], acc[120:123] // left value = acc[120+0:123+0]
/*  mfmaIndex:92  */
v_mfma_f32_16x16x32_bf16 acc[120:123], v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprValuA_X0_I0+48:vgprValuA_X0_I0+48+3], acc[120:123] // left value = acc[120+0:123+0]
/*  mfmaIndex:93  */
v_mfma_f32_16x16x32_bf16 acc[124:127], v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprValuA_X0_I0+56+4:vgprValuA_X0_I0+56+4+3], acc[124:127] // left value = acc[124+0:127+0]
/*  mfmaIndex:94  */
v_mfma_f32_16x16x32_bf16 acc[124:127], v[vgprValuB_X0_I0+24+4:vgprValuB_X0_I0+24+4+3], v[vgprValuA_X0_I0+56:vgprValuA_X0_I0+56+3], acc[124:127] // left value = acc[124+0:127+0]
/*  mfmaIndex:95  */
v_mfma_f32_16x16x32_bf16 acc[124:127], v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprValuA_X0_I0+56:vgprValuA_X0_I0+56+3], acc[124:127] // left value = acc[124+0:127+0]
CVT B 40 8 12
CVT B 40 8 13
// B1 A0
/*  mfmaIndex:96  */
v_mfma_f32_16x16x32_bf16 acc[128:131], v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprValuA_X0_I0+0+4:vgprValuA_X0_I0+0+4+3], acc[128:131] // left value = acc[128+0:131+0]
CVT B 40 8 14
CVT B 40 8 15
/*  mfmaIndex:97  */
v_mfma_f32_16x16x32_bf16 acc[128:131], v[vgprValuB_X0_I0+32+4:vgprValuB_X0_I0+32+4+3], v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], acc[128:131] // left value = acc[128+0:131+0]
CVT B 40 8 16
CVT B 40 8 17
/*  mfmaIndex:98  */
v_mfma_f32_16x16x32_bf16 acc[128:131], v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], acc[128:131] // left value = acc[128+0:131+0]
CVT B 40 8 18
CVT B 40 8 19
/*  mfmaIndex:99  */
v_mfma_f32_16x16x32_bf16 acc[132:135], v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprValuA_X0_I0+8+4:vgprValuA_X0_I0+8+4+3], acc[132:135] // left value = acc[132+0:135+0]
CVT B 40 8 20
CVT B 40 8 21
/*  mfmaIndex:100  */
v_mfma_f32_16x16x32_bf16 acc[132:135], v[vgprValuB_X0_I0+32+4:vgprValuB_X0_I0+32+4+3], v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], acc[132:135] // left value = acc[132+0:135+0]
CVT B 40 8 22
CVT B 40 8 23
/*  mfmaIndex:101  */
v_mfma_f32_16x16x32_bf16 acc[132:135], v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], acc[132:135] // left value = acc[132+0:135+0]
CVT B 48 16 0
CVT B 48 16 1
/*  mfmaIndex:102  */
v_mfma_f32_16x16x32_bf16 acc[136:139], v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprValuA_X0_I0+16+4:vgprValuA_X0_I0+16+4+3], acc[136:139] // left value = acc[136+0:139+0]
CVT B 48 16 2
CVT B 48 16 3
/*  mfmaIndex:103  */
v_mfma_f32_16x16x32_bf16 acc[136:139], v[vgprValuB_X0_I0+32+4:vgprValuB_X0_I0+32+4+3], v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], acc[136:139] // left value = acc[136+0:139+0]
CVT B 48 16 4
CVT B 48 16 5
/*  mfmaIndex:104  */
v_mfma_f32_16x16x32_bf16 acc[136:139], v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], acc[136:139] // left value = acc[136+0:139+0]
CVT B 48 16 6
CVT B 48 16 7
/*  mfmaIndex:105  */
v_mfma_f32_16x16x32_bf16 acc[140:143], v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprValuA_X0_I0+24+4:vgprValuA_X0_I0+24+4+3], acc[140:143] // left value = acc[140+0:143+0]
CVT B 48 16 8
CVT B 48 16 9
/*  mfmaIndex:106  */
v_mfma_f32_16x16x32_bf16 acc[140:143], v[vgprValuB_X0_I0+32+4:vgprValuB_X0_I0+32+4+3], v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], acc[140:143] // left value = acc[140+0:143+0]
CVT B 48 16 10
CVT B 48 16 11
/*  mfmaIndex:107  */
v_mfma_f32_16x16x32_bf16 acc[140:143], v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], acc[140:143] // left value = acc[140+0:143+0]
CVT B 48 16 12
CVT B 48 16 13
/*  mfmaIndex:108  */
v_mfma_f32_16x16x32_bf16 acc[160:163], v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprValuA_X0_I0+0+4:vgprValuA_X0_I0+0+4+3], acc[160:163] // left value = acc[160+0:163+0]
CVT B 48 16 14
CVT B 48 16 15
/*  mfmaIndex:109  */
v_mfma_f32_16x16x32_bf16 acc[160:163], v[vgprValuB_X0_I0+40+4:vgprValuB_X0_I0+40+4+3], v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], acc[160:163] // left value = acc[160+0:163+0]
CVT B 48 16 16
CVT B 48 16 17
/*  mfmaIndex:110  */
v_mfma_f32_16x16x32_bf16 acc[160:163], v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], acc[160:163] // left value = acc[160+0:163+0]
CVT B 48 16 18
CVT B 48 16 19
/*  mfmaIndex:111  */
v_mfma_f32_16x16x32_bf16 acc[164:167], v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprValuA_X0_I0+8+4:vgprValuA_X0_I0+8+4+3], acc[164:167] // left value = acc[164+0:167+0]
CVT B 48 16 20
CVT B 48 16 21
/*  mfmaIndex:112  */
v_mfma_f32_16x16x32_bf16 acc[164:167], v[vgprValuB_X0_I0+40+4:vgprValuB_X0_I0+40+4+3], v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], acc[164:167] // left value = acc[164+0:167+0]
CVT B 48 16 22
CVT B 48 16 23
/*  mfmaIndex:113  */
v_mfma_f32_16x16x32_bf16 acc[164:167], v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], acc[164:167] // left value = acc[164+0:167+0]
CVT B 56 24 0
CVT B 56 24 1
/*  mfmaIndex:114  */
v_mfma_f32_16x16x32_bf16 acc[168:171], v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprValuA_X0_I0+16+4:vgprValuA_X0_I0+16+4+3], acc[168:171] // left value = acc[168+0:171+0]
CVT B 56 24 2
CVT B 56 24 3
/*  mfmaIndex:115  */
v_mfma_f32_16x16x32_bf16 acc[168:171], v[vgprValuB_X0_I0+40+4:vgprValuB_X0_I0+40+4+3], v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], acc[168:171] // left value = acc[168+0:171+0]
CVT B 56 24 4
CVT B 56 24 5
/*  mfmaIndex:116  */
v_mfma_f32_16x16x32_bf16 acc[168:171], v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], acc[168:171] // left value = acc[168+0:171+0]
CVT B 56 24 6
CVT B 56 24 7
/*  mfmaIndex:117  */
v_mfma_f32_16x16x32_bf16 acc[172:175], v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprValuA_X0_I0+24+4:vgprValuA_X0_I0+24+4+3], acc[172:175] // left value = acc[172+0:175+0]
CVT B 56 24 8
CVT B 56 24 9
/*  mfmaIndex:118  */
v_mfma_f32_16x16x32_bf16 acc[172:175], v[vgprValuB_X0_I0+40+4:vgprValuB_X0_I0+40+4+3], v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], acc[172:175] // left value = acc[172+0:175+0]
CVT B 56 24 10
CVT B 56 24 11
/*  mfmaIndex:119  */
v_mfma_f32_16x16x32_bf16 acc[172:175], v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], acc[172:175] // left value = acc[172+0:175+0]
CVT B 56 24 12
CVT B 56 24 13
/*  mfmaIndex:120  */
v_mfma_f32_16x16x32_bf16 acc[192:195], v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprValuA_X0_I0+0+4:vgprValuA_X0_I0+0+4+3], acc[192:195] // left value = acc[192+0:195+0]
CVT B 56 24 14
CVT B 56 24 15
/*  mfmaIndex:121  */
v_mfma_f32_16x16x32_bf16 acc[192:195], v[vgprValuB_X0_I0+48+4:vgprValuB_X0_I0+48+4+3], v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], acc[192:195] // left value = acc[192+0:195+0]
CVT B 56 24 16
CVT B 56 24 17
/*  mfmaIndex:122  */
v_mfma_f32_16x16x32_bf16 acc[192:195], v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], acc[192:195] // left value = acc[192+0:195+0]
CVT B 56 24 18
CVT B 56 24 19
/*  mfmaIndex:123  */
v_mfma_f32_16x16x32_bf16 acc[196:199], v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprValuA_X0_I0+8+4:vgprValuA_X0_I0+8+4+3], acc[196:199] // left value = acc[196+0:199+0]
CVT B 56 24 20
/*  mfmaIndex:124  */
v_mfma_f32_16x16x32_bf16 acc[196:199], v[vgprValuB_X0_I0+48+4:vgprValuB_X0_I0+48+4+3], v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], acc[196:199] // left value = acc[196+0:199+0]
CVT B 56 24 21
/*  mfmaIndex:125  */
v_mfma_f32_16x16x32_bf16 acc[196:199], v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], acc[196:199] // left value = acc[196+0:199+0]
CVT B 56 24 22
/*  mfmaIndex:126  */
v_mfma_f32_16x16x32_bf16 acc[200:203], v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprValuA_X0_I0+16+4:vgprValuA_X0_I0+16+4+3], acc[200:203] // left value = acc[200+0:203+0]
CVT B 56 24 23
/*  mfmaIndex:127  */
v_mfma_f32_16x16x32_bf16 acc[200:203], v[vgprValuB_X0_I0+48+4:vgprValuB_X0_I0+48+4+3], v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], acc[200:203] // left value = acc[200+0:203+0]
v_mov_b64 v[vgprValuA_T1_I0+0:vgprValuA_T1_I0+1], v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+1]
/*  mfmaIndex:128  */
v_mfma_f32_16x16x32_bf16 acc[200:203], v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], acc[200:203] // left value = acc[200+0:203+0]
v_mov_b64 v[vgprValuA_T1_I0+2:vgprValuA_T1_I0+3], v[vgprValuA_X0_I0+2:vgprValuA_X0_I0+3]
/*  mfmaIndex:129  */
v_mfma_f32_16x16x32_bf16 acc[204:207], v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprValuA_X0_I0+24+4:vgprValuA_X0_I0+24+4+3], acc[204:207] // left value = acc[204+0:207+0]
v_mov_b64 v[vgprValuA_T1_I0+4:vgprValuA_T1_I0+5], v[vgprValuA_X0_I0+4:vgprValuA_X0_I0+5]
/*  mfmaIndex:130  */
v_mfma_f32_16x16x32_bf16 acc[204:207], v[vgprValuB_X0_I0+48+4:vgprValuB_X0_I0+48+4+3], v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], acc[204:207] // left value = acc[204+0:207+0]
v_mov_b64 v[vgprValuA_T1_I0+6:vgprValuA_T1_I0+7], v[vgprValuA_X0_I0+6:vgprValuA_X0_I0+7]
/*  mfmaIndex:131  */
v_mfma_f32_16x16x32_bf16 acc[204:207], v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], acc[204:207] // left value = acc[204+0:207+0]
/*  mfmaIndex:132  */
v_mfma_f32_16x16x32_bf16 acc[224:227], v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprValuA_T1_I0+0+4:vgprValuA_T1_I0+0+4+3], acc[224:227] // left value = acc[224+0:227+0]
/*  mfmaIndex:133  */
v_mfma_f32_16x16x32_bf16 acc[224:227], v[vgprValuB_X0_I0+56+4:vgprValuB_X0_I0+56+4+3], v[vgprValuA_T1_I0+0:vgprValuA_T1_I0+0+3], acc[224:227] // left value = acc[224+0:227+0]
/*  mfmaIndex:134  */
v_mfma_f32_16x16x32_bf16 acc[224:227], v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprValuA_T1_I0+0:vgprValuA_T1_I0+0+3], acc[224:227] // left value = acc[224+0:227+0]
/*  mfmaIndex:135  */
v_mfma_f32_16x16x32_bf16 acc[228:231], v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprValuA_X0_I0+8+4:vgprValuA_X0_I0+8+4+3], acc[228:231] // left value = acc[228+0:231+0]
/*  mfmaIndex:136  */
v_mfma_f32_16x16x32_bf16 acc[228:231], v[vgprValuB_X0_I0+56+4:vgprValuB_X0_I0+56+4+3], v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], acc[228:231] // left value = acc[228+0:231+0]
/*  mfmaIndex:137  */
v_mfma_f32_16x16x32_bf16 acc[228:231], v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], acc[228:231] // left value = acc[228+0:231+0]
/*  mfmaIndex:138  */
v_mfma_f32_16x16x32_bf16 acc[232:235], v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprValuA_X0_I0+16+4:vgprValuA_X0_I0+16+4+3], acc[232:235] // left value = acc[232+0:235+0]
/*  mfmaIndex:139  */
v_mfma_f32_16x16x32_bf16 acc[232:235], v[vgprValuB_X0_I0+56+4:vgprValuB_X0_I0+56+4+3], v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], acc[232:235] // left value = acc[232+0:235+0]
/*  mfmaIndex:140  */
v_mfma_f32_16x16x32_bf16 acc[232:235], v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], acc[232:235] // left value = acc[232+0:235+0]
/*  mfmaIndex:141  */
v_mfma_f32_16x16x32_bf16 acc[236:239], v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprValuA_X0_I0+24+4:vgprValuA_X0_I0+24+4+3], acc[236:239] // left value = acc[236+0:239+0]
/*  mfmaIndex:142  */
v_mfma_f32_16x16x32_bf16 acc[236:239], v[vgprValuB_X0_I0+56+4:vgprValuB_X0_I0+56+4+3], v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], acc[236:239] // left value = acc[236+0:239+0]
/*  mfmaIndex:143  */
v_mfma_f32_16x16x32_bf16 acc[236:239], v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], acc[236:239] // left value = acc[236+0:239+0]
// A1 B1
/*
 * MFMA section for the A operand groups +32..+63 against the B operand
 * groups +32..+63 (mfmaIndex 144..191).
 *
 * Sixteen 4-register accumulator tiles are updated in place:
 *   B group +32 -> acc[144:159]     B group +40 -> acc[176:191]
 *   B group +48 -> acc[208:223]     B group +56 -> acc[240:255]
 * and within each B group the A groups +32, +40, +48, +56 select the
 * four consecutive tiles.
 *
 * Every tile receives the same three accumulating MFMAs, where b/a are the
 * 8-register group bases of B and A:
 *   1) acc += mfma(B[b+0 : b+3], A[a+4 : a+7])
 *   2) acc += mfma(B[b+4 : b+7], A[a+0 : a+3])
 *   3) acc += mfma(B[b+0 : b+3], A[a+0 : a+3])
 * The pairing (B[b+4 : b+7], A[a+4 : a+7]) is never issued.
 *
 * NOTE(review): presumably each 8-register group holds two bf16 planes of
 * the same fp32 data (a split produced elsewhere, e.g. by the CVT macro) and
 * the three products emulate an fp32 multiply -- confirm against the CVT
 * macro definition and the F32XdlMathOp setting.
 *
 * Unlike the MFMAs immediately before this section, no other instructions
 * are interleaved between the MFMAs here.
 */
/* ---- B group +32 : acc[144:159] ---- */
/*  mfmaIndex:144  */
v_mfma_f32_16x16x32_bf16 acc[144:147], v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprValuA_X0_I0+32+4:vgprValuA_X0_I0+32+4+3], acc[144:147] // left value = acc[144+0:147+0]
/*  mfmaIndex:145  */
v_mfma_f32_16x16x32_bf16 acc[144:147], v[vgprValuB_X0_I0+32+4:vgprValuB_X0_I0+32+4+3], v[vgprValuA_X0_I0+32:vgprValuA_X0_I0+32+3], acc[144:147] // left value = acc[144+0:147+0]
/*  mfmaIndex:146  */
v_mfma_f32_16x16x32_bf16 acc[144:147], v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprValuA_X0_I0+32:vgprValuA_X0_I0+32+3], acc[144:147] // left value = acc[144+0:147+0]
/*  mfmaIndex:147  */
v_mfma_f32_16x16x32_bf16 acc[148:151], v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprValuA_X0_I0+40+4:vgprValuA_X0_I0+40+4+3], acc[148:151] // left value = acc[148+0:151+0]
/*  mfmaIndex:148  */
v_mfma_f32_16x16x32_bf16 acc[148:151], v[vgprValuB_X0_I0+32+4:vgprValuB_X0_I0+32+4+3], v[vgprValuA_X0_I0+40:vgprValuA_X0_I0+40+3], acc[148:151] // left value = acc[148+0:151+0]
/*  mfmaIndex:149  */
v_mfma_f32_16x16x32_bf16 acc[148:151], v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprValuA_X0_I0+40:vgprValuA_X0_I0+40+3], acc[148:151] // left value = acc[148+0:151+0]
/*  mfmaIndex:150  */
v_mfma_f32_16x16x32_bf16 acc[152:155], v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprValuA_X0_I0+48+4:vgprValuA_X0_I0+48+4+3], acc[152:155] // left value = acc[152+0:155+0]
/*  mfmaIndex:151  */
v_mfma_f32_16x16x32_bf16 acc[152:155], v[vgprValuB_X0_I0+32+4:vgprValuB_X0_I0+32+4+3], v[vgprValuA_X0_I0+48:vgprValuA_X0_I0+48+3], acc[152:155] // left value = acc[152+0:155+0]
/*  mfmaIndex:152  */
v_mfma_f32_16x16x32_bf16 acc[152:155], v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprValuA_X0_I0+48:vgprValuA_X0_I0+48+3], acc[152:155] // left value = acc[152+0:155+0]
/*  mfmaIndex:153  */
v_mfma_f32_16x16x32_bf16 acc[156:159], v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprValuA_X0_I0+56+4:vgprValuA_X0_I0+56+4+3], acc[156:159] // left value = acc[156+0:159+0]
/*  mfmaIndex:154  */
v_mfma_f32_16x16x32_bf16 acc[156:159], v[vgprValuB_X0_I0+32+4:vgprValuB_X0_I0+32+4+3], v[vgprValuA_X0_I0+56:vgprValuA_X0_I0+56+3], acc[156:159] // left value = acc[156+0:159+0]
/*  mfmaIndex:155  */
v_mfma_f32_16x16x32_bf16 acc[156:159], v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprValuA_X0_I0+56:vgprValuA_X0_I0+56+3], acc[156:159] // left value = acc[156+0:159+0]
/* ---- B group +40 : acc[176:191] ---- */
/*  mfmaIndex:156  */
v_mfma_f32_16x16x32_bf16 acc[176:179], v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprValuA_X0_I0+32+4:vgprValuA_X0_I0+32+4+3], acc[176:179] // left value = acc[176+0:179+0]
/*  mfmaIndex:157  */
v_mfma_f32_16x16x32_bf16 acc[176:179], v[vgprValuB_X0_I0+40+4:vgprValuB_X0_I0+40+4+3], v[vgprValuA_X0_I0+32:vgprValuA_X0_I0+32+3], acc[176:179] // left value = acc[176+0:179+0]
/*  mfmaIndex:158  */
v_mfma_f32_16x16x32_bf16 acc[176:179], v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprValuA_X0_I0+32:vgprValuA_X0_I0+32+3], acc[176:179] // left value = acc[176+0:179+0]
/*  mfmaIndex:159  */
v_mfma_f32_16x16x32_bf16 acc[180:183], v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprValuA_X0_I0+40+4:vgprValuA_X0_I0+40+4+3], acc[180:183] // left value = acc[180+0:183+0]
/*  mfmaIndex:160  */
v_mfma_f32_16x16x32_bf16 acc[180:183], v[vgprValuB_X0_I0+40+4:vgprValuB_X0_I0+40+4+3], v[vgprValuA_X0_I0+40:vgprValuA_X0_I0+40+3], acc[180:183] // left value = acc[180+0:183+0]
/*  mfmaIndex:161  */
v_mfma_f32_16x16x32_bf16 acc[180:183], v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprValuA_X0_I0+40:vgprValuA_X0_I0+40+3], acc[180:183] // left value = acc[180+0:183+0]
/*  mfmaIndex:162  */
v_mfma_f32_16x16x32_bf16 acc[184:187], v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprValuA_X0_I0+48+4:vgprValuA_X0_I0+48+4+3], acc[184:187] // left value = acc[184+0:187+0]
/*  mfmaIndex:163  */
v_mfma_f32_16x16x32_bf16 acc[184:187], v[vgprValuB_X0_I0+40+4:vgprValuB_X0_I0+40+4+3], v[vgprValuA_X0_I0+48:vgprValuA_X0_I0+48+3], acc[184:187] // left value = acc[184+0:187+0]
/*  mfmaIndex:164  */
v_mfma_f32_16x16x32_bf16 acc[184:187], v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprValuA_X0_I0+48:vgprValuA_X0_I0+48+3], acc[184:187] // left value = acc[184+0:187+0]
/*  mfmaIndex:165  */
v_mfma_f32_16x16x32_bf16 acc[188:191], v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprValuA_X0_I0+56+4:vgprValuA_X0_I0+56+4+3], acc[188:191] // left value = acc[188+0:191+0]
/*  mfmaIndex:166  */
v_mfma_f32_16x16x32_bf16 acc[188:191], v[vgprValuB_X0_I0+40+4:vgprValuB_X0_I0+40+4+3], v[vgprValuA_X0_I0+56:vgprValuA_X0_I0+56+3], acc[188:191] // left value = acc[188+0:191+0]
/*  mfmaIndex:167  */
v_mfma_f32_16x16x32_bf16 acc[188:191], v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprValuA_X0_I0+56:vgprValuA_X0_I0+56+3], acc[188:191] // left value = acc[188+0:191+0]
/* ---- B group +48 : acc[208:223] ---- */
/*  mfmaIndex:168  */
v_mfma_f32_16x16x32_bf16 acc[208:211], v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprValuA_X0_I0+32+4:vgprValuA_X0_I0+32+4+3], acc[208:211] // left value = acc[208+0:211+0]
/*  mfmaIndex:169  */
v_mfma_f32_16x16x32_bf16 acc[208:211], v[vgprValuB_X0_I0+48+4:vgprValuB_X0_I0+48+4+3], v[vgprValuA_X0_I0+32:vgprValuA_X0_I0+32+3], acc[208:211] // left value = acc[208+0:211+0]
/*  mfmaIndex:170  */
v_mfma_f32_16x16x32_bf16 acc[208:211], v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprValuA_X0_I0+32:vgprValuA_X0_I0+32+3], acc[208:211] // left value = acc[208+0:211+0]
/*  mfmaIndex:171  */
v_mfma_f32_16x16x32_bf16 acc[212:215], v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprValuA_X0_I0+40+4:vgprValuA_X0_I0+40+4+3], acc[212:215] // left value = acc[212+0:215+0]
/*  mfmaIndex:172  */
v_mfma_f32_16x16x32_bf16 acc[212:215], v[vgprValuB_X0_I0+48+4:vgprValuB_X0_I0+48+4+3], v[vgprValuA_X0_I0+40:vgprValuA_X0_I0+40+3], acc[212:215] // left value = acc[212+0:215+0]
/*  mfmaIndex:173  */
v_mfma_f32_16x16x32_bf16 acc[212:215], v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprValuA_X0_I0+40:vgprValuA_X0_I0+40+3], acc[212:215] // left value = acc[212+0:215+0]
/*  mfmaIndex:174  */
v_mfma_f32_16x16x32_bf16 acc[216:219], v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprValuA_X0_I0+48+4:vgprValuA_X0_I0+48+4+3], acc[216:219] // left value = acc[216+0:219+0]
/*  mfmaIndex:175  */
v_mfma_f32_16x16x32_bf16 acc[216:219], v[vgprValuB_X0_I0+48+4:vgprValuB_X0_I0+48+4+3], v[vgprValuA_X0_I0+48:vgprValuA_X0_I0+48+3], acc[216:219] // left value = acc[216+0:219+0]
/*  mfmaIndex:176  */
v_mfma_f32_16x16x32_bf16 acc[216:219], v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprValuA_X0_I0+48:vgprValuA_X0_I0+48+3], acc[216:219] // left value = acc[216+0:219+0]
/*  mfmaIndex:177  */
v_mfma_f32_16x16x32_bf16 acc[220:223], v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprValuA_X0_I0+56+4:vgprValuA_X0_I0+56+4+3], acc[220:223] // left value = acc[220+0:223+0]
/*  mfmaIndex:178  */
v_mfma_f32_16x16x32_bf16 acc[220:223], v[vgprValuB_X0_I0+48+4:vgprValuB_X0_I0+48+4+3], v[vgprValuA_X0_I0+56:vgprValuA_X0_I0+56+3], acc[220:223] // left value = acc[220+0:223+0]
/*  mfmaIndex:179  */
v_mfma_f32_16x16x32_bf16 acc[220:223], v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprValuA_X0_I0+56:vgprValuA_X0_I0+56+3], acc[220:223] // left value = acc[220+0:223+0]
/* ---- B group +56 : acc[240:255] ---- */
/*  mfmaIndex:180  */
v_mfma_f32_16x16x32_bf16 acc[240:243], v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprValuA_X0_I0+32+4:vgprValuA_X0_I0+32+4+3], acc[240:243] // left value = acc[240+0:243+0]
/*  mfmaIndex:181  */
v_mfma_f32_16x16x32_bf16 acc[240:243], v[vgprValuB_X0_I0+56+4:vgprValuB_X0_I0+56+4+3], v[vgprValuA_X0_I0+32:vgprValuA_X0_I0+32+3], acc[240:243] // left value = acc[240+0:243+0]
/*  mfmaIndex:182  */
v_mfma_f32_16x16x32_bf16 acc[240:243], v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprValuA_X0_I0+32:vgprValuA_X0_I0+32+3], acc[240:243] // left value = acc[240+0:243+0]
/*  mfmaIndex:183  */
v_mfma_f32_16x16x32_bf16 acc[244:247], v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprValuA_X0_I0+40+4:vgprValuA_X0_I0+40+4+3], acc[244:247] // left value = acc[244+0:247+0]
/*  mfmaIndex:184  */
v_mfma_f32_16x16x32_bf16 acc[244:247], v[vgprValuB_X0_I0+56+4:vgprValuB_X0_I0+56+4+3], v[vgprValuA_X0_I0+40:vgprValuA_X0_I0+40+3], acc[244:247] // left value = acc[244+0:247+0]
/*  mfmaIndex:185  */
v_mfma_f32_16x16x32_bf16 acc[244:247], v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprValuA_X0_I0+40:vgprValuA_X0_I0+40+3], acc[244:247] // left value = acc[244+0:247+0]
/*  mfmaIndex:186  */
v_mfma_f32_16x16x32_bf16 acc[248:251], v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprValuA_X0_I0+48+4:vgprValuA_X0_I0+48+4+3], acc[248:251] // left value = acc[248+0:251+0]
/*  mfmaIndex:187  */
v_mfma_f32_16x16x32_bf16 acc[248:251], v[vgprValuB_X0_I0+56+4:vgprValuB_X0_I0+56+4+3], v[vgprValuA_X0_I0+48:vgprValuA_X0_I0+48+3], acc[248:251] // left value = acc[248+0:251+0]
/*  mfmaIndex:188  */
v_mfma_f32_16x16x32_bf16 acc[248:251], v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprValuA_X0_I0+48:vgprValuA_X0_I0+48+3], acc[248:251] // left value = acc[248+0:251+0]
/*  mfmaIndex:189  */
v_mfma_f32_16x16x32_bf16 acc[252:255], v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprValuA_X0_I0+56+4:vgprValuA_X0_I0+56+4+3], acc[252:255] // left value = acc[252+0:255+0]
/*  mfmaIndex:190  */
v_mfma_f32_16x16x32_bf16 acc[252:255], v[vgprValuB_X0_I0+56+4:vgprValuB_X0_I0+56+4+3], v[vgprValuA_X0_I0+56:vgprValuA_X0_I0+56+3], acc[252:255] // left value = acc[252+0:255+0]
/*  mfmaIndex:191  */
v_mfma_f32_16x16x32_bf16 acc[252:255], v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprValuA_X0_I0+56:vgprValuA_X0_I0+56+3], acc[252:255] // left value = acc[252+0:255+0]
/* numPrefetchIter=0 */
/* dataAtIterA=0 numReadsIterA=1 skipReadsIterA=0 readsPerIterA=64 */
/* dataAtIterB=0 numReadsIterB=1 skipReadsIterB=0 readsPerIterB=16 */
/* Two labels for the same address; their branch sites are not in this part
 * of the file. Execution continues straight into the tail-loop setup. */
label_toPGR1end_OrdNLL:
label_PrefetchGlobalLastIterEnd:

/* Tail: add ValuA/B vgpr buffer [18...146) to pool */

/* Tail: add address/G2L vgpr [146...146) to pool */

/******************************************/
/* Tail Loop                              */
/******************************************/
/*
 * Setup performed here:
 *   1. reset the A and B LDS write addresses to the lower of their two values,
 *   2. re-assign VGPRs 18.. and 50.. as the G2LA / G2LB register bases,
 *   3. compute the tail iteration count and skip the tail loop when it is 0.
 * s83 is used as a scratch SGPR.
 */

/* local write reset offsets a */
/* s83 = SwapA ^ LocalWriteAddrA; LocalWriteAddrA = min_u32(LocalWriteAddrA, s83).
 * NOTE(review): presumably SwapA is the XOR of the two LDS buffer offsets, so
 * s83 is the other buffer's address and the min picks the first buffer --
 * confirm where sgprSwapA is initialised. */
s_xor_b32 s83, s[sgprSwapA], s[sgprLocalWriteAddrA] // Get other lds buffer offset value
s_min_u32 s[sgprLocalWriteAddrA], s[sgprLocalWriteAddrA], s83 // Set LWA to first buffer offset

/* local write reset offsets b */
/* Same xor/min sequence as for A, applied to the B write address. */
s_xor_b32 s83, s[sgprSwapB], s[sgprLocalWriteAddrB] // Get other lds buffer offset value
s_min_u32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrB], s83 // Set LWB to first buffer offset
/* Check out VGPR (numG2LA,numG2LB,numG2LMetadata) = (32,32,0) */
/* Assembler symbols only (no instructions): G2LA = v18.., G2LB = v50.. */
.set vgprG2LA_BASE, 18
.set vgprG2LB_BASE, 50

// numIterL = LOCAL_SPLITU * min(sizeL % LOCAL_DEPTHU, DEPTHU / LOCAL_SPLITU)
/*
 * Tail iteration count:
 *   LoopCounterL = SizesSum[0] & 31                        (remainder mod 32)
 *   if (StreamKLocalEnd < ItersPerTile) LoopCounterL = 0   (unsigned compare;
 *       s_cmov_b32 only writes when SCC is set by the preceding s_cmp)
 *   if (LoopCounterL == 0) goto label_SkipTailLoopL
 * The s_mov_b32 between the final compare and the branch does not modify
 * SCC, so the branch still tests the s_cmp_eq_u32 result.
 */
s_and_b32 s[sgprLoopCounterL], 31, s[sgprSizesSum+0] // s[sgprLoopCounterL] = s[sgprSizesSum+0] % 32
s_cmp_lt_u32 s[sgprStreamKLocalEnd], s[sgprItersPerTile] // Check if WG processes final iteration of tile
s_cmov_b32 s[sgprLoopCounterL], 0                  // This WG not completing tile
s_cmp_eq_u32 s[sgprLoopCounterL], 0                // numIterL == 0
s_mov_b32 s[sgprOrigLoopCounter], 0                // repurpose to count each localRead increment
s_cbranch_scc1 label_SkipTailLoopL                 // skip to end of tail loop b/c numIter==0

/* remove stagger offsets for tail loop */
/*
 * For each of A and B the same sequence is executed (s84/s85 are scratch and
 * together hold a 64-bit value, s84 = low word, s85 = high word):
 *
 *   n   = 3 - StaggerUIter                       (signed 32-bit)
 *   S   = n * GlobalReadIncs[0]                  (64-bit result, see below)
 *   S  -= WrapU                                  (64-bit subtract with borrow)
 *   Srd[0:1]         += S                        (64-bit add with carry)
 *   ShadowLimit[0:1] -= S                        (64-bit subtract with borrow)
 *   Srd[2] = (ShadowLimit[1] == 0) ? ShadowLimit[0] : BufferLimit
 *
 * The multiply uses the unsigned s_mul_hi_u32 / s_mul_i32 pair, so a negative
 * n is handled on a separate path: multiply |n|, then negate the 64-bit
 * product in two's complement (invert both words, add 1 with carry).
 * GlobalReadIncs[0] is always treated as unsigned by s_mul_hi_u32.
 */

/* ---- A ---- */
s_sub_i32 s84, 3, s[sgprStaggerUIter]
s_cmp_ge_i32 s84, 0
s_cbranch_scc0 label_Negative_J5DQFVGFWLXU2DUR
/* n >= 0: plain unsigned 32x32 -> 64 multiply */
s_mul_hi_u32 s85, s84, s[sgprGlobalReadIncsA+0]    // start offset S in bytes
s_mul_i32 s84, s84, s[sgprGlobalReadIncsA+0]       // start offset S in bytes
s_branch label_MultiplyDone_DLSAQLEVYLOBCPNL
label_Negative_J5DQFVGFWLXU2DUR:
/* n < 0: multiply |n|, then negate the 64-bit product */
s_abs_i32 s84, s84
s_mul_hi_u32 s85, s84, s[sgprGlobalReadIncsA+0]    // start offset S in bytes
s_mul_i32 s84, s84, s[sgprGlobalReadIncsA+0]       // start offset S in bytes
s_xor_b32 s84, s84, 0xffffffff
s_xor_b32 s85, s85, 0xffffffff
s_add_u32 s84, s84, 0x1
s_addc_u32 s85, s85, 0
label_MultiplyDone_DLSAQLEVYLOBCPNL:
s_sub_u32 s84, s84, s[sgprWrapUA]                  // S - WrapU
s_subb_u32 s85, s85, s[sgprWrapUA+1]               // S - WrapU
s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s84        // gra SRD += inc(lower)
s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s85       // gra SRD += inc(upper)
s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s84 // limit -= inc (lower)
s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s85 // limit -= inc (upper)
s_cmp_eq_u32 s[sgprShadowLimitA+1], 0              // are we within 2^32?
s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32

/* ---- B ---- */
/* n is recomputed because s84 was overwritten by the A sequence above. */
s_sub_i32 s84, 3, s[sgprStaggerUIter]
s_cmp_ge_i32 s84, 0
s_cbranch_scc0 label_Negative_LQI6BOBE0EY8XIP1
/* n >= 0: plain unsigned 32x32 -> 64 multiply */
s_mul_hi_u32 s85, s84, s[sgprGlobalReadIncsB+0]    // start offset S in bytes
s_mul_i32 s84, s84, s[sgprGlobalReadIncsB+0]       // start offset S in bytes
s_branch label_MultiplyDone_9N1QELR2XL4Z0HRB
label_Negative_LQI6BOBE0EY8XIP1:
/* n < 0: multiply |n|, then negate the 64-bit product */
s_abs_i32 s84, s84
s_mul_hi_u32 s85, s84, s[sgprGlobalReadIncsB+0]    // start offset S in bytes
s_mul_i32 s84, s84, s[sgprGlobalReadIncsB+0]       // start offset S in bytes
s_xor_b32 s84, s84, 0xffffffff
s_xor_b32 s85, s85, 0xffffffff
s_add_u32 s84, s84, 0x1
s_addc_u32 s85, s85, 0
label_MultiplyDone_9N1QELR2XL4Z0HRB:
s_sub_u32 s84, s84, s[sgprWrapUB]                  // S - WrapU
s_subb_u32 s85, s85, s[sgprWrapUB+1]               // S - WrapU
s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s84        // gra SRD += inc(lower)
s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s85       // gra SRD += inc(upper)
s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s84 // limit -= inc (lower)
s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s85 // limit -= inc (upper)
s_cmp_eq_u32 s[sgprShadowLimitB+1], 0              // are we within 2^32?
s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32

/*
 * Tail global reads, A then B, issued as buffer loads with the `lds`
 * modifier (the kernel header lists DirectToLdsA/B = True).
 *
 * For each matrix:
 *   - M0 is loaded from the LDS write address SGPR,
 *   - eight buffer_load_dwordx4 are issued, one per GlobalReadOffset VGPR
 *     (+0 .. +7), all through the same buffer descriptor,
 *   - M0 is advanced between consecutive loads by 4096 (A) or 4160 (B),
 *   - M0 is finally set to 0x20400 (= 132096, the value of
 *     .amdhsa_group_segment_fixed_size in the kernel descriptor).
 * After both matrices, s_waitcnt vmcnt(0) waits for all loads and a barrier
 * follows.
 */

/* Update M0 for DTLDS */
s_mov_b32 m0, s[sgprLocalWriteAddrA]               // m0 <- LDS write address
/* before DirectToLds load, ensure prior ds_reads have finished */
s_waitcnt lgkmcnt(0)
s_barrier

/* Tail global read A */
/* g2l=0, load component 0 */
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0, lds // load one buffer value
/* g2l=4, load component 0 */
s_add_u32 m0, m0, 4096                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+1], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0, lds // load one buffer value
/* g2l=8, load component 0 */
s_add_u32 m0, m0, 4096                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+2], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0, lds // load one buffer value
/* g2l=12, load component 0 */
s_add_u32 m0, m0, 4096                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+3], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0, lds // load one buffer value
/* g2l=16, load component 0 */
s_add_u32 m0, m0, 4096                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+4], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0, lds // load one buffer value
/* g2l=20, load component 0 */
s_add_u32 m0, m0, 4096                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+5], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0, lds // load one buffer value
/* g2l=24, load component 0 */
s_add_u32 m0, m0, 4096                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+6], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0, lds // load one buffer value
/* g2l=28, load component 0 */
s_add_u32 m0, m0, 4096                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+7], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0, lds // load one buffer value
/* NOTE(review): this M0 write is overwritten by the s_mov_b32 below before
 * any instruction in between reads M0, so it looks redundant here. */
s_mov_b32 m0, 0x20400                              // Restore LDS clamp at 132096 bytes HERE

/* Update M0 for DTLDS */
s_mov_b32 m0, s[sgprLocalWriteAddrB]               // m0 <- LDS write address

/* Tail global read B */
/* g2l=0, load component 0 */
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], 0 offen offset:0, lds // load one buffer value
/* g2l=4, load component 0 */
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+1], s[sgprSrdB:sgprSrdB+3], 0 offen offset:0, lds // load one buffer value
/* g2l=8, load component 0 */
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+2], s[sgprSrdB:sgprSrdB+3], 0 offen offset:0, lds // load one buffer value
/* g2l=12, load component 0 */
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+3], s[sgprSrdB:sgprSrdB+3], 0 offen offset:0, lds // load one buffer value
/* g2l=16, load component 0 */
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+4], s[sgprSrdB:sgprSrdB+3], 0 offen offset:0, lds // load one buffer value
/* g2l=20, load component 0 */
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+5], s[sgprSrdB:sgprSrdB+3], 0 offen offset:0, lds // load one buffer value
/* g2l=24, load component 0 */
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+6], s[sgprSrdB:sgprSrdB+3], 0 offen offset:0, lds // load one buffer value
/* g2l=28, load component 0 */
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+7], s[sgprSrdB:sgprSrdB+3], 0 offen offset:0, lds // load one buffer value
s_mov_b32 m0, 0x20400                              // Restore LDS clamp at 132096 bytes HERE
/* Wait until every outstanding vector-memory load has completed, then
 * synchronise the workgroup before LDS is read. */
s_waitcnt vmcnt(0)                                 // 2wait for global read
// Skip force waitcnt0
s_barrier

/* Recalc local read offsets */
/* Drains outstanding LDS operations, synchronizes again, then rebinds the assembler symbols
 * used by the tail loop: the G2L bases are set to UNDEF, A values are based at v18 and
 * B values at v82. The tail-loop local reads below write ValuA+0..+63 (v18..v81) and
 * ValuB+0..+63 (v82..v145). */
s_waitcnt lgkmcnt(0)                               // wait for outstanding LDS/scalar-memory operations (lgkmcnt == 0)
// Skip force waitcnt0
s_barrier                                          // second workgroup barrier before the tail-loop local reads
.set vgprG2LA_BASE, UNDEF                          // G2L A symbol is marked undefined from here on
.set vgprG2LB_BASE, UNDEF                          // G2L B symbol is marked undefined from here on
.set vgprValuA_X0_I0_BASE, 18                      // A operand registers start at v18
.set vgprValuA_X0_I0, vgprValuA_X0_I0_BASE+0
.set vgprValuB_X0_I0_BASE, 82                      // B operand registers start at v82
.set vgprValuB_X0_I0, vgprValuB_X0_I0_BASE+0

/* Tail: local read reset offsets a */

/* localReadResetOffsets */
/* handled internally */
/* v149 = SwapAddrA XOR ReadAddrA, then ReadAddrA = signed min(ReadAddrA, v149).
 * NOTE(review): presumably the swap value is the XOR of the two LDS buffer offsets, so the XOR
 * yields the other buffer's address and the min selects the lower one -- confirm against the
 * code that initializes vgprLocalReadSwapAddrA. v149 is used as a scratch register. */
v_xor_b32 v149, v[vgprLocalReadSwapAddrA], v[vgprLocalReadAddrA] // Get other lds buffer offset value
v_min_i32 v[vgprLocalReadAddrA], v[vgprLocalReadAddrA], v149 // Set LRA to first buffer offset (the smaller of the two values)

/* Tail: local read reset offsets b */

/* localReadResetOffsets */
/* handled internally */
/* Same XOR + signed-min sequence as for A, applied to the B read address; v149 is reused as scratch. */
v_xor_b32 v149, v[vgprLocalReadSwapAddrB], v[vgprLocalReadAddrB] // Get other lds buffer offset value
v_min_i32 v[vgprLocalReadAddrB], v[vgprLocalReadAddrB], v149 // Set LRA to first buffer offset (the smaller of the two values)

/* Tail: local read init pointers a */

/* localReadInitPointers */
/* (no instructions emitted for this step) */

/* Tail: local read init pointers b */

/* localReadInitPointers */
/* (no instructions emitted for this step) */
/* tail loop: macs */
label_TailLoopBeginL:

/* local read a */
/* 64 single-dword LDS reads fill ValuA+0 .. ValuA+63 from the per-lane base address in
 * vgprLocalReadAddrA. The immediate offsets follow the vIdx/eIdx/rIdx indices noted in the
 * trailing comments:
 *   vIdx (0..1) adds 512 bytes,
 *   eIdx (0..3) adds 4 bytes,
 *   rIdx 0..3 adds 1024 bytes each; rIdx 4..7 restart at 16384 and again step by 1024.
 * The reads are issued back to back; the s_waitcnt lgkmcnt(0) that covers them comes after the
 * local read address increments further down. */
ds_read_b32 v[vgprValuA_X0_I0+0], v[vgprLocalReadAddrA] offset:0 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b32 v[vgprValuA_X0_I0+1], v[vgprLocalReadAddrA] offset:1024 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=0 eIdx=0 rIdx=1 oIdx=0 buffer=0 iui=0
ds_read_b32 v[vgprValuA_X0_I0+2], v[vgprLocalReadAddrA] offset:2048 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=0 eIdx=0 rIdx=2 oIdx=0 buffer=0 iui=0
ds_read_b32 v[vgprValuA_X0_I0+3], v[vgprLocalReadAddrA] offset:3072 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=0 eIdx=0 rIdx=3 oIdx=0 buffer=0 iui=0
ds_read_b32 v[vgprValuA_X0_I0+4], v[vgprLocalReadAddrA] offset:16384 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=0 eIdx=0 rIdx=4 oIdx=0 buffer=0 iui=0
ds_read_b32 v[vgprValuA_X0_I0+5], v[vgprLocalReadAddrA] offset:17408 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=0 eIdx=0 rIdx=5 oIdx=0 buffer=0 iui=0
ds_read_b32 v[vgprValuA_X0_I0+6], v[vgprLocalReadAddrA] offset:18432 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=0 eIdx=0 rIdx=6 oIdx=0 buffer=0 iui=0
ds_read_b32 v[vgprValuA_X0_I0+7], v[vgprLocalReadAddrA] offset:19456 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=0 eIdx=0 rIdx=7 oIdx=0 buffer=0 iui=0
ds_read_b32 v[vgprValuA_X0_I0+8], v[vgprLocalReadAddrA] offset:4 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b32 v[vgprValuA_X0_I0+9], v[vgprLocalReadAddrA] offset:1028 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=0 eIdx=1 rIdx=1 oIdx=0 buffer=0 iui=0
ds_read_b32 v[vgprValuA_X0_I0+10], v[vgprLocalReadAddrA] offset:2052 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=0 eIdx=1 rIdx=2 oIdx=0 buffer=0 iui=0
ds_read_b32 v[vgprValuA_X0_I0+11], v[vgprLocalReadAddrA] offset:3076 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=0 eIdx=1 rIdx=3 oIdx=0 buffer=0 iui=0
ds_read_b32 v[vgprValuA_X0_I0+12], v[vgprLocalReadAddrA] offset:16388 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=0 eIdx=1 rIdx=4 oIdx=0 buffer=0 iui=0
ds_read_b32 v[vgprValuA_X0_I0+13], v[vgprLocalReadAddrA] offset:17412 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=0 eIdx=1 rIdx=5 oIdx=0 buffer=0 iui=0
ds_read_b32 v[vgprValuA_X0_I0+14], v[vgprLocalReadAddrA] offset:18436 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=0 eIdx=1 rIdx=6 oIdx=0 buffer=0 iui=0
ds_read_b32 v[vgprValuA_X0_I0+15], v[vgprLocalReadAddrA] offset:19460 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=0 eIdx=1 rIdx=7 oIdx=0 buffer=0 iui=0
ds_read_b32 v[vgprValuA_X0_I0+16], v[vgprLocalReadAddrA] offset:8 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b32 v[vgprValuA_X0_I0+17], v[vgprLocalReadAddrA] offset:1032 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=0 eIdx=2 rIdx=1 oIdx=0 buffer=0 iui=0
ds_read_b32 v[vgprValuA_X0_I0+18], v[vgprLocalReadAddrA] offset:2056 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=0 eIdx=2 rIdx=2 oIdx=0 buffer=0 iui=0
ds_read_b32 v[vgprValuA_X0_I0+19], v[vgprLocalReadAddrA] offset:3080 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=0 eIdx=2 rIdx=3 oIdx=0 buffer=0 iui=0
ds_read_b32 v[vgprValuA_X0_I0+20], v[vgprLocalReadAddrA] offset:16392 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=0 eIdx=2 rIdx=4 oIdx=0 buffer=0 iui=0
ds_read_b32 v[vgprValuA_X0_I0+21], v[vgprLocalReadAddrA] offset:17416 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=0 eIdx=2 rIdx=5 oIdx=0 buffer=0 iui=0
ds_read_b32 v[vgprValuA_X0_I0+22], v[vgprLocalReadAddrA] offset:18440 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=0 eIdx=2 rIdx=6 oIdx=0 buffer=0 iui=0
ds_read_b32 v[vgprValuA_X0_I0+23], v[vgprLocalReadAddrA] offset:19464 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=0 eIdx=2 rIdx=7 oIdx=0 buffer=0 iui=0
ds_read_b32 v[vgprValuA_X0_I0+24], v[vgprLocalReadAddrA] offset:12 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b32 v[vgprValuA_X0_I0+25], v[vgprLocalReadAddrA] offset:1036 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=0 eIdx=3 rIdx=1 oIdx=0 buffer=0 iui=0
ds_read_b32 v[vgprValuA_X0_I0+26], v[vgprLocalReadAddrA] offset:2060 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=0 eIdx=3 rIdx=2 oIdx=0 buffer=0 iui=0
ds_read_b32 v[vgprValuA_X0_I0+27], v[vgprLocalReadAddrA] offset:3084 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=0 eIdx=3 rIdx=3 oIdx=0 buffer=0 iui=0
ds_read_b32 v[vgprValuA_X0_I0+28], v[vgprLocalReadAddrA] offset:16396 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=0 eIdx=3 rIdx=4 oIdx=0 buffer=0 iui=0
ds_read_b32 v[vgprValuA_X0_I0+29], v[vgprLocalReadAddrA] offset:17420 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=0 eIdx=3 rIdx=5 oIdx=0 buffer=0 iui=0
ds_read_b32 v[vgprValuA_X0_I0+30], v[vgprLocalReadAddrA] offset:18444 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=0 eIdx=3 rIdx=6 oIdx=0 buffer=0 iui=0
ds_read_b32 v[vgprValuA_X0_I0+31], v[vgprLocalReadAddrA] offset:19468 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=0 eIdx=3 rIdx=7 oIdx=0 buffer=0 iui=0
ds_read_b32 v[vgprValuA_X0_I0+32], v[vgprLocalReadAddrA] offset:512 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=1 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b32 v[vgprValuA_X0_I0+33], v[vgprLocalReadAddrA] offset:1536 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=1 eIdx=0 rIdx=1 oIdx=0 buffer=0 iui=0
ds_read_b32 v[vgprValuA_X0_I0+34], v[vgprLocalReadAddrA] offset:2560 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=1 eIdx=0 rIdx=2 oIdx=0 buffer=0 iui=0
ds_read_b32 v[vgprValuA_X0_I0+35], v[vgprLocalReadAddrA] offset:3584 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=1 eIdx=0 rIdx=3 oIdx=0 buffer=0 iui=0
ds_read_b32 v[vgprValuA_X0_I0+36], v[vgprLocalReadAddrA] offset:16896 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=1 eIdx=0 rIdx=4 oIdx=0 buffer=0 iui=0
ds_read_b32 v[vgprValuA_X0_I0+37], v[vgprLocalReadAddrA] offset:17920 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=1 eIdx=0 rIdx=5 oIdx=0 buffer=0 iui=0
ds_read_b32 v[vgprValuA_X0_I0+38], v[vgprLocalReadAddrA] offset:18944 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=1 eIdx=0 rIdx=6 oIdx=0 buffer=0 iui=0
ds_read_b32 v[vgprValuA_X0_I0+39], v[vgprLocalReadAddrA] offset:19968 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=1 eIdx=0 rIdx=7 oIdx=0 buffer=0 iui=0
ds_read_b32 v[vgprValuA_X0_I0+40], v[vgprLocalReadAddrA] offset:516 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=1 eIdx=1 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b32 v[vgprValuA_X0_I0+41], v[vgprLocalReadAddrA] offset:1540 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=1 eIdx=1 rIdx=1 oIdx=0 buffer=0 iui=0
ds_read_b32 v[vgprValuA_X0_I0+42], v[vgprLocalReadAddrA] offset:2564 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=1 eIdx=1 rIdx=2 oIdx=0 buffer=0 iui=0
ds_read_b32 v[vgprValuA_X0_I0+43], v[vgprLocalReadAddrA] offset:3588 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=1 eIdx=1 rIdx=3 oIdx=0 buffer=0 iui=0
ds_read_b32 v[vgprValuA_X0_I0+44], v[vgprLocalReadAddrA] offset:16900 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=1 eIdx=1 rIdx=4 oIdx=0 buffer=0 iui=0
ds_read_b32 v[vgprValuA_X0_I0+45], v[vgprLocalReadAddrA] offset:17924 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=1 eIdx=1 rIdx=5 oIdx=0 buffer=0 iui=0
ds_read_b32 v[vgprValuA_X0_I0+46], v[vgprLocalReadAddrA] offset:18948 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=1 eIdx=1 rIdx=6 oIdx=0 buffer=0 iui=0
ds_read_b32 v[vgprValuA_X0_I0+47], v[vgprLocalReadAddrA] offset:19972 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=1 eIdx=1 rIdx=7 oIdx=0 buffer=0 iui=0
ds_read_b32 v[vgprValuA_X0_I0+48], v[vgprLocalReadAddrA] offset:520 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=1 eIdx=2 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b32 v[vgprValuA_X0_I0+49], v[vgprLocalReadAddrA] offset:1544 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=1 eIdx=2 rIdx=1 oIdx=0 buffer=0 iui=0
ds_read_b32 v[vgprValuA_X0_I0+50], v[vgprLocalReadAddrA] offset:2568 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=1 eIdx=2 rIdx=2 oIdx=0 buffer=0 iui=0
ds_read_b32 v[vgprValuA_X0_I0+51], v[vgprLocalReadAddrA] offset:3592 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=1 eIdx=2 rIdx=3 oIdx=0 buffer=0 iui=0
ds_read_b32 v[vgprValuA_X0_I0+52], v[vgprLocalReadAddrA] offset:16904 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=1 eIdx=2 rIdx=4 oIdx=0 buffer=0 iui=0
ds_read_b32 v[vgprValuA_X0_I0+53], v[vgprLocalReadAddrA] offset:17928 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=1 eIdx=2 rIdx=5 oIdx=0 buffer=0 iui=0
ds_read_b32 v[vgprValuA_X0_I0+54], v[vgprLocalReadAddrA] offset:18952 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=1 eIdx=2 rIdx=6 oIdx=0 buffer=0 iui=0
ds_read_b32 v[vgprValuA_X0_I0+55], v[vgprLocalReadAddrA] offset:19976 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=1 eIdx=2 rIdx=7 oIdx=0 buffer=0 iui=0
ds_read_b32 v[vgprValuA_X0_I0+56], v[vgprLocalReadAddrA] offset:524 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=1 eIdx=3 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b32 v[vgprValuA_X0_I0+57], v[vgprLocalReadAddrA] offset:1548 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=1 eIdx=3 rIdx=1 oIdx=0 buffer=0 iui=0
ds_read_b32 v[vgprValuA_X0_I0+58], v[vgprLocalReadAddrA] offset:2572 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=1 eIdx=3 rIdx=2 oIdx=0 buffer=0 iui=0
ds_read_b32 v[vgprValuA_X0_I0+59], v[vgprLocalReadAddrA] offset:3596 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=1 eIdx=3 rIdx=3 oIdx=0 buffer=0 iui=0
ds_read_b32 v[vgprValuA_X0_I0+60], v[vgprLocalReadAddrA] offset:16908 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=1 eIdx=3 rIdx=4 oIdx=0 buffer=0 iui=0
ds_read_b32 v[vgprValuA_X0_I0+61], v[vgprLocalReadAddrA] offset:17932 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=1 eIdx=3 rIdx=5 oIdx=0 buffer=0 iui=0
ds_read_b32 v[vgprValuA_X0_I0+62], v[vgprLocalReadAddrA] offset:18956 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=1 eIdx=3 rIdx=6 oIdx=0 buffer=0 iui=0
ds_read_b32 v[vgprValuA_X0_I0+63], v[vgprLocalReadAddrA] offset:19980 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=1 eIdx=3 rIdx=7 oIdx=0 buffer=0 iui=0

/* local read b */
/* 16 four-dword LDS reads fill ValuB+0 .. ValuB+63 (four consecutive registers per read) from
 * the per-lane base address in vgprLocalReadAddrB. Within one vIdx the immediate offset steps
 * by 64 bytes per read (0..448); the vIdx=1 half starts at 16640 and steps by 64 again. */
ds_read_b128 v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprLocalReadAddrB] offset:0 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+4:vgprValuB_X0_I0+4+3], v[vgprLocalReadAddrB] offset:64 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=0 eIdx=0 rIdx=1 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprLocalReadAddrB] offset:128 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+12:vgprValuB_X0_I0+12+3], v[vgprLocalReadAddrB] offset:192 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=0 eIdx=1 rIdx=1 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprLocalReadAddrB] offset:256 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+20:vgprValuB_X0_I0+20+3], v[vgprLocalReadAddrB] offset:320 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=0 eIdx=2 rIdx=1 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprLocalReadAddrB] offset:384 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+28:vgprValuB_X0_I0+28+3], v[vgprLocalReadAddrB] offset:448 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=0 eIdx=3 rIdx=1 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprLocalReadAddrB] offset:16640 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=1 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+36:vgprValuB_X0_I0+36+3], v[vgprLocalReadAddrB] offset:16704 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=1 eIdx=0 rIdx=1 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprLocalReadAddrB] offset:16768 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=1 eIdx=1 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+44:vgprValuB_X0_I0+44+3], v[vgprLocalReadAddrB] offset:16832 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=1 eIdx=1 rIdx=1 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprLocalReadAddrB] offset:16896 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=1 eIdx=2 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+52:vgprValuB_X0_I0+52+3], v[vgprLocalReadAddrB] offset:16960 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=1 eIdx=2 rIdx=1 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprLocalReadAddrB] offset:17024 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=1 eIdx=3 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+60:vgprValuB_X0_I0+60+3], v[vgprLocalReadAddrB] offset:17088 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=1 eIdx=3 rIdx=1 oIdx=0 buffer=0 iui=0

/* local read inc a */
/* Advance both local read addresses for the next tail-loop iteration, then wait for the LDS
 * reads issued above. s83 is a scratch SGPR holding the increment; the carry-out of each add
 * is written to vcc and is not consumed by the instructions that follow in this section. */
s_mov_b32 s83, 0x8000                              // inc (0x8000 = 32768 bytes)
v_add_co_u32 v[vgprLocalReadAddrA+0], vcc, s83, v[vgprLocalReadAddrA+0] // lrA += 32768 ((MT+PAD)*bpeDS)

/* local read inc b */
s_mov_b32 s83, 0x80                                // inc (0x80 = 128 bytes)
v_add_co_u32 v[vgprLocalReadAddrB+0], vcc, s83, v[vgprLocalReadAddrB+0] // lrB += 128 (bpeDS)
s_waitcnt lgkmcnt(0)                               // wait for the local reads of A and B to complete before their values are used
/* Tail-loop K masking for A: zero every A value whose K index is at or beyond sgprLoopCounterL.
 *   v149 = ((vgprSerial % 64) / 16) * 4     per-lane K base (0, 4, 8 or 12)
 *   v150 = running K index, stepped +0, +2, +14, +2  => v149+0, v149+2, v149+16, v149+18
 *   s[84:85] = per-lane mask (v150 >= sgprLoopCounterL, signed compare)
 * Each v_cndmask_b32 keeps the register where the mask bit is clear and writes 0 where it is set.
 * Register index = group base (0, 8, ..., 56) + element (0..7):
 *   elements 0,1 use K = v149+0;  elements 2,3 use K = v149+2;
 *   elements 4,5 use K = v149+16; elements 6,7 use K = v149+18. */
v_and_b32 v149, 63, v[vgprSerial]                  // v149 = v[vgprSerial] % 64
v_lshrrev_b32 v149, 4, v149                        // v149 = v149 / 16
v_lshlrev_b32 v149, 2, v149                        // v149 = v149 * 4
v_add_u32 v150, v149, 0                            // v150 = v149 (K index for elements 0,1)
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+0], v[vgprValuA_X0_I0+0+0+0+0], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+0], v[vgprValuA_X0_I0+8+0+0+0], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+0], v[vgprValuA_X0_I0+16+0+0+0], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+0], v[vgprValuA_X0_I0+24+0+0+0], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+0], v[vgprValuA_X0_I0+32+0+0+0], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+0], v[vgprValuA_X0_I0+40+0+0+0], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+0], v[vgprValuA_X0_I0+48+0+0+0], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+0], v[vgprValuA_X0_I0+56+0+0+0], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+1], v[vgprValuA_X0_I0+0+0+0+1], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+1], v[vgprValuA_X0_I0+8+0+0+1], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+1], v[vgprValuA_X0_I0+16+0+0+1], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+1], v[vgprValuA_X0_I0+24+0+0+1], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+1], v[vgprValuA_X0_I0+32+0+0+1], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+1], v[vgprValuA_X0_I0+40+0+0+1], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+1], v[vgprValuA_X0_I0+48+0+0+1], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+1], v[vgprValuA_X0_I0+56+0+0+1], 0, s[84:85] // set 0 if K_idx >= sizeL
v_add_u32 v150, v150, 2                            // add part of K (v150 = v149 + 2, elements 2,3)
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+2], v[vgprValuA_X0_I0+0+0+0+2], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+2], v[vgprValuA_X0_I0+8+0+0+2], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+2], v[vgprValuA_X0_I0+16+0+0+2], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+2], v[vgprValuA_X0_I0+24+0+0+2], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+2], v[vgprValuA_X0_I0+32+0+0+2], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+2], v[vgprValuA_X0_I0+40+0+0+2], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+2], v[vgprValuA_X0_I0+48+0+0+2], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+2], v[vgprValuA_X0_I0+56+0+0+2], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+0+0+0+3], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+8+0+0+3], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+16+0+0+3], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+24+0+0+3], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+3], v[vgprValuA_X0_I0+32+0+0+3], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+3], v[vgprValuA_X0_I0+40+0+0+3], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+3], v[vgprValuA_X0_I0+48+0+0+3], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+3], v[vgprValuA_X0_I0+56+0+0+3], 0, s[84:85] // set 0 if K_idx >= sizeL
v_add_u32 v150, v150, 14                           // add part of K (v150 = v149 + 16, elements 4,5)
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+4], v[vgprValuA_X0_I0+0+0+0+4], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+4], v[vgprValuA_X0_I0+8+0+0+4], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+4], v[vgprValuA_X0_I0+16+0+0+4], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+4], v[vgprValuA_X0_I0+24+0+0+4], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+4], v[vgprValuA_X0_I0+32+0+0+4], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+4], v[vgprValuA_X0_I0+40+0+0+4], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+4], v[vgprValuA_X0_I0+48+0+0+4], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+4], v[vgprValuA_X0_I0+56+0+0+4], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+5], v[vgprValuA_X0_I0+0+0+0+5], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+5], v[vgprValuA_X0_I0+8+0+0+5], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+5], v[vgprValuA_X0_I0+16+0+0+5], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+5], v[vgprValuA_X0_I0+24+0+0+5], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+5], v[vgprValuA_X0_I0+32+0+0+5], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+5], v[vgprValuA_X0_I0+40+0+0+5], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+5], v[vgprValuA_X0_I0+48+0+0+5], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+5], v[vgprValuA_X0_I0+56+0+0+5], 0, s[84:85] // set 0 if K_idx >= sizeL
v_add_u32 v150, v150, 2                            // add part of K (v150 = v149 + 18, elements 6,7)
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+6], v[vgprValuA_X0_I0+0+0+0+6], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+6], v[vgprValuA_X0_I0+8+0+0+6], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+6], v[vgprValuA_X0_I0+16+0+0+6], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+6], v[vgprValuA_X0_I0+24+0+0+6], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+6], v[vgprValuA_X0_I0+32+0+0+6], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+6], v[vgprValuA_X0_I0+40+0+0+6], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+6], v[vgprValuA_X0_I0+48+0+0+6], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+6], v[vgprValuA_X0_I0+56+0+0+6], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+7], v[vgprValuA_X0_I0+0+0+0+7], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+7], v[vgprValuA_X0_I0+8+0+0+7], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+7], v[vgprValuA_X0_I0+16+0+0+7], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+7], v[vgprValuA_X0_I0+24+0+0+7], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+7], v[vgprValuA_X0_I0+32+0+0+7], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+7], v[vgprValuA_X0_I0+40+0+0+7], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+7], v[vgprValuA_X0_I0+48+0+0+7], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+7], v[vgprValuA_X0_I0+56+0+0+7], 0, s[84:85] // set 0 if K_idx >= sizeL
/* Tail-loop K masking for B: same scheme as the A masking, applied to the ValuB registers.
 *   v149 = ((vgprSerial % 64) / 16) * 4     per-lane K base, recomputed with the same three
 *                                           instructions used before the A masking
 *   v150 = running K index, stepped +0, +2, +14, +2  => v149+0, v149+2, v149+16, v149+18
 *   s[84:85] = per-lane mask (v150 >= sgprLoopCounterL, signed compare)
 * Each v_cndmask_b32 keeps the register where the mask bit is clear and writes 0 where it is set.
 * Register index = group base (0, 8, ..., 56) + element (0..7):
 *   elements 0,1 use K = v149+0;  elements 2,3 use K = v149+2;
 *   elements 4,5 use K = v149+16; elements 6,7 use K = v149+18. */
v_and_b32 v149, 63, v[vgprSerial]                  // v149 = v[vgprSerial] % 64
v_lshrrev_b32 v149, 4, v149                        // v149 = v149 / 16
v_lshlrev_b32 v149, 2, v149                        // v149 = v149 * 4
v_add_u32 v150, v149, 0                            // v150 = v149 (K index for elements 0,1)
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+0], v[vgprValuB_X0_I0+0+0+0+0], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+0], v[vgprValuB_X0_I0+8+0+0+0], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+0], v[vgprValuB_X0_I0+16+0+0+0], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+0], v[vgprValuB_X0_I0+24+0+0+0], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+0], v[vgprValuB_X0_I0+32+0+0+0], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+0], v[vgprValuB_X0_I0+40+0+0+0], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+0], v[vgprValuB_X0_I0+48+0+0+0], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+0], v[vgprValuB_X0_I0+56+0+0+0], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+1], v[vgprValuB_X0_I0+0+0+0+1], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+1], v[vgprValuB_X0_I0+8+0+0+1], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+1], v[vgprValuB_X0_I0+16+0+0+1], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+1], v[vgprValuB_X0_I0+24+0+0+1], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+1], v[vgprValuB_X0_I0+32+0+0+1], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+1], v[vgprValuB_X0_I0+40+0+0+1], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+1], v[vgprValuB_X0_I0+48+0+0+1], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+1], v[vgprValuB_X0_I0+56+0+0+1], 0, s[84:85] // set 0 if K_idx >= sizeL
v_add_u32 v150, v150, 2                            // add part of K (v150 = v149 + 2, elements 2,3)
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+2], v[vgprValuB_X0_I0+0+0+0+2], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+2], v[vgprValuB_X0_I0+8+0+0+2], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+2], v[vgprValuB_X0_I0+16+0+0+2], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+2], v[vgprValuB_X0_I0+24+0+0+2], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+2], v[vgprValuB_X0_I0+32+0+0+2], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+2], v[vgprValuB_X0_I0+40+0+0+2], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+2], v[vgprValuB_X0_I0+48+0+0+2], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+2], v[vgprValuB_X0_I0+56+0+0+2], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+3], v[vgprValuB_X0_I0+0+0+0+3], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+3], v[vgprValuB_X0_I0+8+0+0+3], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+3], v[vgprValuB_X0_I0+16+0+0+3], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+3], v[vgprValuB_X0_I0+24+0+0+3], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+3], v[vgprValuB_X0_I0+32+0+0+3], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+3], v[vgprValuB_X0_I0+40+0+0+3], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+3], v[vgprValuB_X0_I0+48+0+0+3], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+3], v[vgprValuB_X0_I0+56+0+0+3], 0, s[84:85] // set 0 if K_idx >= sizeL
v_add_u32 v150, v150, 14                           // add part of K (v150 = v149 + 16, elements 4,5)
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+4], v[vgprValuB_X0_I0+0+0+0+4], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+4], v[vgprValuB_X0_I0+8+0+0+4], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+4], v[vgprValuB_X0_I0+16+0+0+4], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+4], v[vgprValuB_X0_I0+24+0+0+4], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+4], v[vgprValuB_X0_I0+32+0+0+4], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+4], v[vgprValuB_X0_I0+40+0+0+4], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+4], v[vgprValuB_X0_I0+48+0+0+4], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+4], v[vgprValuB_X0_I0+56+0+0+4], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+5], v[vgprValuB_X0_I0+0+0+0+5], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+5], v[vgprValuB_X0_I0+8+0+0+5], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+5], v[vgprValuB_X0_I0+16+0+0+5], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+5], v[vgprValuB_X0_I0+24+0+0+5], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+5], v[vgprValuB_X0_I0+32+0+0+5], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+5], v[vgprValuB_X0_I0+40+0+0+5], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+5], v[vgprValuB_X0_I0+48+0+0+5], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+5], v[vgprValuB_X0_I0+56+0+0+5], 0, s[84:85] // set 0 if K_idx >= sizeL
v_add_u32 v150, v150, 2                            // add part of K (v150 = v149 + 18, elements 6,7)
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+6], v[vgprValuB_X0_I0+0+0+0+6], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+6], v[vgprValuB_X0_I0+8+0+0+6], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+6], v[vgprValuB_X0_I0+16+0+0+6], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+6], v[vgprValuB_X0_I0+24+0+0+6], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+6], v[vgprValuB_X0_I0+32+0+0+6], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+6], v[vgprValuB_X0_I0+40+0+0+6], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+6], v[vgprValuB_X0_I0+48+0+0+6], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+6], v[vgprValuB_X0_I0+56+0+0+6], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+7], v[vgprValuB_X0_I0+0+0+0+7], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+7], v[vgprValuB_X0_I0+8+0+0+7], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+7], v[vgprValuB_X0_I0+16+0+0+7], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+7], v[vgprValuB_X0_I0+24+0+0+7], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+7], v[vgprValuB_X0_I0+32+0+0+7], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+7], v[vgprValuB_X0_I0+40+0+0+7], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+7], v[vgprValuB_X0_I0+48+0+0+7], 0, s[84:85] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+7], v[vgprValuB_X0_I0+56+0+0+7], 0, s[84:85] // set 0 if K_idx >= sizeL
/* Edge handling for A, register group +0 (ValuA+0 .. ValuA+7).
 *   s83 = (8 - (sgprLoopCounterL & 7)) << 5       shift count, a multiple of 32 in 32..256
 *   v[152:159] = the four 64-bit register pairs of the group, each shifted left by s83
 *   v150 = v149+2, v149+4, v149+18, v149+20       (v149 is the per-lane K base computed for the
 *                                                  masking above and is not modified in between)
 * For each element, v_cndmask_b32 keeps the current value where v150 < sgprLoopCounterL and
 * takes the matching shifted dword (v152..v159) where v150 >= sgprLoopCounterL.
 * NOTE(review): 64-bit shifts presumably use only the low 6 bits of the count, which would make
 * the effective shift 0 or 32 bits -- confirm against the ISA reference.
 * NOTE(review): the second v_cmp_ge_i32 in each pair recomputes the same mask from unchanged
 * inputs (v150 and sgprLoopCounterL are not written between the two compares). */
s_and_b32 s83, s[sgprLoopCounterL], 7              // get inputs for edge thread: s83 = LoopCounterL % 8
s_sub_u32 s83, 8, s83                              // use shift to fill 0 for outside element: s83 = 8 - s83
s_lshl_b32 s83, s83, 5                             // use shift to fill 0 for outside element: s83 *= 32
v_lshlrev_b64 v[152:153], s83, v[vgprValuA_X0_I0+0+0+0+0:vgprValuA_X0_I0+0+0+0+0+1] // shifted copy of elements 0,1
v_lshlrev_b64 v[154:155], s83, v[vgprValuA_X0_I0+0+0+0+2:vgprValuA_X0_I0+0+0+0+2+1] // shifted copy of elements 2,3
v_lshlrev_b64 v[156:157], s83, v[vgprValuA_X0_I0+0+0+0+4:vgprValuA_X0_I0+0+0+0+4+1] // shifted copy of elements 4,5
v_lshlrev_b64 v[158:159], s83, v[vgprValuA_X0_I0+0+0+0+6:vgprValuA_X0_I0+0+0+0+6+1] // shifted copy of elements 6,7
v_add_u32 v150, v149, 2                            // add part of K (v150 = v149 + 2)
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+0], v[vgprValuA_X0_I0+0+0+0+0], v152, s[84:85] // take shifted low dword where mask is set
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+1], v[vgprValuA_X0_I0+0+0+0+1], v153, s[84:85] // take shifted high dword where mask is set
v_add_u32 v150, v150, 2                            // add part of K (v150 = v149 + 4)
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+2], v[vgprValuA_X0_I0+0+0+0+2], v154, s[84:85] // take shifted low dword where mask is set
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+0+0+0+3], v155, s[84:85] // take shifted high dword where mask is set
v_add_u32 v150, v150, 14                           // add part of K (v150 = v149 + 18)
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+4], v[vgprValuA_X0_I0+0+0+0+4], v156, s[84:85] // take shifted low dword where mask is set
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+5], v[vgprValuA_X0_I0+0+0+0+5], v157, s[84:85] // take shifted high dword where mask is set
v_add_u32 v150, v150, 2                            // add part of K (v150 = v149 + 20)
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+6], v[vgprValuA_X0_I0+0+0+0+6], v158, s[84:85] // take shifted low dword where mask is set
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+7], v[vgprValuA_X0_I0+0+0+0+7], v159, s[84:85] // take shifted high dword where mask is set
/* Edge handling for A, register group +8 (ValuA+8 .. ValuA+15).
 * Same sequence as for group +0: s83 still holds the shift count
 * (8 - (sgprLoopCounterL & 7)) << 5 and v149 the per-lane K base; v[152:159] are reused as
 * scratch for the shifted register pairs. v150 steps through v149+2, v149+4, v149+18, v149+20,
 * and each v_cndmask_b32 takes the shifted dword where v150 >= sgprLoopCounterL.
 * NOTE(review): the second v_cmp_ge_i32 in each pair recomputes the same mask from unchanged
 * inputs. */
v_lshlrev_b64 v[152:153], s83, v[vgprValuA_X0_I0+8+0+0+0:vgprValuA_X0_I0+8+0+0+0+1] // shifted copy of elements 0,1
v_lshlrev_b64 v[154:155], s83, v[vgprValuA_X0_I0+8+0+0+2:vgprValuA_X0_I0+8+0+0+2+1] // shifted copy of elements 2,3
v_lshlrev_b64 v[156:157], s83, v[vgprValuA_X0_I0+8+0+0+4:vgprValuA_X0_I0+8+0+0+4+1] // shifted copy of elements 4,5
v_lshlrev_b64 v[158:159], s83, v[vgprValuA_X0_I0+8+0+0+6:vgprValuA_X0_I0+8+0+0+6+1] // shifted copy of elements 6,7
v_add_u32 v150, v149, 2                            // add part of K (v150 = v149 + 2)
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+0], v[vgprValuA_X0_I0+8+0+0+0], v152, s[84:85] // take shifted low dword where mask is set
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+1], v[vgprValuA_X0_I0+8+0+0+1], v153, s[84:85] // take shifted high dword where mask is set
v_add_u32 v150, v150, 2                            // add part of K (v150 = v149 + 4)
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+2], v[vgprValuA_X0_I0+8+0+0+2], v154, s[84:85] // take shifted low dword where mask is set
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+8+0+0+3], v155, s[84:85] // take shifted high dword where mask is set
v_add_u32 v150, v150, 14                           // add part of K (v150 = v149 + 18)
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+4], v[vgprValuA_X0_I0+8+0+0+4], v156, s[84:85] // take shifted low dword where mask is set
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+5], v[vgprValuA_X0_I0+8+0+0+5], v157, s[84:85] // take shifted high dword where mask is set
v_add_u32 v150, v150, 2                            // add part of K (v150 = v149 + 20)
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+6], v[vgprValuA_X0_I0+8+0+0+6], v158, s[84:85] // take shifted low dword where mask is set
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+7], v[vgprValuA_X0_I0+8+0+0+7], v159, s[84:85] // take shifted high dword where mask is set
// ---- K-bound masking of vgprValuA_X0_I0+16 .. +16+7 ----
// v[152:159] = the four 64-bit register pairs of this group, each shifted left
// by s83 bits (s83 holds the shift count prepared before this group).
v_lshlrev_b64 v[152:153], s83, v[vgprValuA_X0_I0+16+0+0+0:vgprValuA_X0_I0+16+0+0+0+1]
v_lshlrev_b64 v[154:155], s83, v[vgprValuA_X0_I0+16+0+0+2:vgprValuA_X0_I0+16+0+0+2+1]
v_lshlrev_b64 v[156:157], s83, v[vgprValuA_X0_I0+16+0+0+4:vgprValuA_X0_I0+16+0+0+4+1]
v_lshlrev_b64 v[158:159], s83, v[vgprValuA_X0_I0+16+0+0+6:vgprValuA_X0_I0+16+0+0+6+1]
// Per dword: s[84:85] = per-lane mask (v150 >= sgprLoopCounterL); lanes with the
// mask set take the shifted dword, the other lanes keep their current value.
// v149 is set before this section; each v_cmp is issued twice with identical operands.
// dwords +0,+1 are tested against v149+2
v_add_u32 v150, v149, 2                            // add part of K
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+0], v[vgprValuA_X0_I0+16+0+0+0], v152, s[84:85]
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+1], v[vgprValuA_X0_I0+16+0+0+1], v153, s[84:85]
// dwords +2,+3 are tested against v149+4
v_add_u32 v150, v150, 2                            // add part of K
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+2], v[vgprValuA_X0_I0+16+0+0+2], v154, s[84:85]
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+16+0+0+3], v155, s[84:85]
// dwords +4,+5 are tested against v149+18
v_add_u32 v150, v150, 14                           // add part of K
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+4], v[vgprValuA_X0_I0+16+0+0+4], v156, s[84:85]
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+5], v[vgprValuA_X0_I0+16+0+0+5], v157, s[84:85]
// dwords +6,+7 are tested against v149+20
v_add_u32 v150, v150, 2                            // add part of K
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+6], v[vgprValuA_X0_I0+16+0+0+6], v158, s[84:85]
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+7], v[vgprValuA_X0_I0+16+0+0+7], v159, s[84:85]
// ---- K-bound masking of vgprValuA_X0_I0+24 .. +24+7 ----
// v[152:159] = the four 64-bit register pairs of this group, each shifted left
// by s83 bits (s83 holds the shift count prepared before this group).
v_lshlrev_b64 v[152:153], s83, v[vgprValuA_X0_I0+24+0+0+0:vgprValuA_X0_I0+24+0+0+0+1]
v_lshlrev_b64 v[154:155], s83, v[vgprValuA_X0_I0+24+0+0+2:vgprValuA_X0_I0+24+0+0+2+1]
v_lshlrev_b64 v[156:157], s83, v[vgprValuA_X0_I0+24+0+0+4:vgprValuA_X0_I0+24+0+0+4+1]
v_lshlrev_b64 v[158:159], s83, v[vgprValuA_X0_I0+24+0+0+6:vgprValuA_X0_I0+24+0+0+6+1]
// Per dword: s[84:85] = per-lane mask (v150 >= sgprLoopCounterL); lanes with the
// mask set take the shifted dword, the other lanes keep their current value.
// v149 is set before this section; each v_cmp is issued twice with identical operands.
// dwords +0,+1 are tested against v149+2
v_add_u32 v150, v149, 2                            // add part of K
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+0], v[vgprValuA_X0_I0+24+0+0+0], v152, s[84:85]
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+1], v[vgprValuA_X0_I0+24+0+0+1], v153, s[84:85]
// dwords +2,+3 are tested against v149+4
v_add_u32 v150, v150, 2                            // add part of K
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+2], v[vgprValuA_X0_I0+24+0+0+2], v154, s[84:85]
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+24+0+0+3], v155, s[84:85]
// dwords +4,+5 are tested against v149+18
v_add_u32 v150, v150, 14                           // add part of K
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+4], v[vgprValuA_X0_I0+24+0+0+4], v156, s[84:85]
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+5], v[vgprValuA_X0_I0+24+0+0+5], v157, s[84:85]
// dwords +6,+7 are tested against v149+20
v_add_u32 v150, v150, 2                            // add part of K
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+6], v[vgprValuA_X0_I0+24+0+0+6], v158, s[84:85]
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+7], v[vgprValuA_X0_I0+24+0+0+7], v159, s[84:85]
// ---- K-bound masking of vgprValuA_X0_I0+32 .. +32+7 ----
// v[152:159] = the four 64-bit register pairs of this group, each shifted left
// by s83 bits (s83 holds the shift count prepared before this group).
v_lshlrev_b64 v[152:153], s83, v[vgprValuA_X0_I0+32+0+0+0:vgprValuA_X0_I0+32+0+0+0+1]
v_lshlrev_b64 v[154:155], s83, v[vgprValuA_X0_I0+32+0+0+2:vgprValuA_X0_I0+32+0+0+2+1]
v_lshlrev_b64 v[156:157], s83, v[vgprValuA_X0_I0+32+0+0+4:vgprValuA_X0_I0+32+0+0+4+1]
v_lshlrev_b64 v[158:159], s83, v[vgprValuA_X0_I0+32+0+0+6:vgprValuA_X0_I0+32+0+0+6+1]
// Per dword: s[84:85] = per-lane mask (v150 >= sgprLoopCounterL); lanes with the
// mask set take the shifted dword, the other lanes keep their current value.
// v149 is set before this section; each v_cmp is issued twice with identical operands.
// dwords +0,+1 are tested against v149+2
v_add_u32 v150, v149, 2                            // add part of K
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+0], v[vgprValuA_X0_I0+32+0+0+0], v152, s[84:85]
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+1], v[vgprValuA_X0_I0+32+0+0+1], v153, s[84:85]
// dwords +2,+3 are tested against v149+4
v_add_u32 v150, v150, 2                            // add part of K
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+2], v[vgprValuA_X0_I0+32+0+0+2], v154, s[84:85]
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+3], v[vgprValuA_X0_I0+32+0+0+3], v155, s[84:85]
// dwords +4,+5 are tested against v149+18
v_add_u32 v150, v150, 14                           // add part of K
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+4], v[vgprValuA_X0_I0+32+0+0+4], v156, s[84:85]
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+5], v[vgprValuA_X0_I0+32+0+0+5], v157, s[84:85]
// dwords +6,+7 are tested against v149+20
v_add_u32 v150, v150, 2                            // add part of K
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+6], v[vgprValuA_X0_I0+32+0+0+6], v158, s[84:85]
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+7], v[vgprValuA_X0_I0+32+0+0+7], v159, s[84:85]
// ---- K-bound masking of vgprValuA_X0_I0+40 .. +40+7 ----
// v[152:159] = the four 64-bit register pairs of this group, each shifted left
// by s83 bits (s83 holds the shift count prepared before this group).
v_lshlrev_b64 v[152:153], s83, v[vgprValuA_X0_I0+40+0+0+0:vgprValuA_X0_I0+40+0+0+0+1]
v_lshlrev_b64 v[154:155], s83, v[vgprValuA_X0_I0+40+0+0+2:vgprValuA_X0_I0+40+0+0+2+1]
v_lshlrev_b64 v[156:157], s83, v[vgprValuA_X0_I0+40+0+0+4:vgprValuA_X0_I0+40+0+0+4+1]
v_lshlrev_b64 v[158:159], s83, v[vgprValuA_X0_I0+40+0+0+6:vgprValuA_X0_I0+40+0+0+6+1]
// Per dword: s[84:85] = per-lane mask (v150 >= sgprLoopCounterL); lanes with the
// mask set take the shifted dword, the other lanes keep their current value.
// v149 is set before this section; each v_cmp is issued twice with identical operands.
// dwords +0,+1 are tested against v149+2
v_add_u32 v150, v149, 2                            // add part of K
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+0], v[vgprValuA_X0_I0+40+0+0+0], v152, s[84:85]
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+1], v[vgprValuA_X0_I0+40+0+0+1], v153, s[84:85]
// dwords +2,+3 are tested against v149+4
v_add_u32 v150, v150, 2                            // add part of K
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+2], v[vgprValuA_X0_I0+40+0+0+2], v154, s[84:85]
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+3], v[vgprValuA_X0_I0+40+0+0+3], v155, s[84:85]
// dwords +4,+5 are tested against v149+18
v_add_u32 v150, v150, 14                           // add part of K
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+4], v[vgprValuA_X0_I0+40+0+0+4], v156, s[84:85]
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+5], v[vgprValuA_X0_I0+40+0+0+5], v157, s[84:85]
// dwords +6,+7 are tested against v149+20
v_add_u32 v150, v150, 2                            // add part of K
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+6], v[vgprValuA_X0_I0+40+0+0+6], v158, s[84:85]
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+7], v[vgprValuA_X0_I0+40+0+0+7], v159, s[84:85]
// ---- K-bound masking of vgprValuA_X0_I0+48 .. +48+7 ----
// v[152:159] = the four 64-bit register pairs of this group, each shifted left
// by s83 bits (s83 holds the shift count prepared before this group).
v_lshlrev_b64 v[152:153], s83, v[vgprValuA_X0_I0+48+0+0+0:vgprValuA_X0_I0+48+0+0+0+1]
v_lshlrev_b64 v[154:155], s83, v[vgprValuA_X0_I0+48+0+0+2:vgprValuA_X0_I0+48+0+0+2+1]
v_lshlrev_b64 v[156:157], s83, v[vgprValuA_X0_I0+48+0+0+4:vgprValuA_X0_I0+48+0+0+4+1]
v_lshlrev_b64 v[158:159], s83, v[vgprValuA_X0_I0+48+0+0+6:vgprValuA_X0_I0+48+0+0+6+1]
// Per dword: s[84:85] = per-lane mask (v150 >= sgprLoopCounterL); lanes with the
// mask set take the shifted dword, the other lanes keep their current value.
// v149 is set before this section; each v_cmp is issued twice with identical operands.
// dwords +0,+1 are tested against v149+2
v_add_u32 v150, v149, 2                            // add part of K
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+0], v[vgprValuA_X0_I0+48+0+0+0], v152, s[84:85]
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+1], v[vgprValuA_X0_I0+48+0+0+1], v153, s[84:85]
// dwords +2,+3 are tested against v149+4
v_add_u32 v150, v150, 2                            // add part of K
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+2], v[vgprValuA_X0_I0+48+0+0+2], v154, s[84:85]
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+3], v[vgprValuA_X0_I0+48+0+0+3], v155, s[84:85]
// dwords +4,+5 are tested against v149+18
v_add_u32 v150, v150, 14                           // add part of K
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+4], v[vgprValuA_X0_I0+48+0+0+4], v156, s[84:85]
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+5], v[vgprValuA_X0_I0+48+0+0+5], v157, s[84:85]
// dwords +6,+7 are tested against v149+20
v_add_u32 v150, v150, 2                            // add part of K
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+6], v[vgprValuA_X0_I0+48+0+0+6], v158, s[84:85]
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+7], v[vgprValuA_X0_I0+48+0+0+7], v159, s[84:85]
// ---- K-bound masking of vgprValuA_X0_I0+56 .. +56+7 ----
// v[152:159] = the four 64-bit register pairs of this group, each shifted left
// by s83 bits (s83 holds the shift count prepared before this group).
v_lshlrev_b64 v[152:153], s83, v[vgprValuA_X0_I0+56+0+0+0:vgprValuA_X0_I0+56+0+0+0+1]
v_lshlrev_b64 v[154:155], s83, v[vgprValuA_X0_I0+56+0+0+2:vgprValuA_X0_I0+56+0+0+2+1]
v_lshlrev_b64 v[156:157], s83, v[vgprValuA_X0_I0+56+0+0+4:vgprValuA_X0_I0+56+0+0+4+1]
v_lshlrev_b64 v[158:159], s83, v[vgprValuA_X0_I0+56+0+0+6:vgprValuA_X0_I0+56+0+0+6+1]
// Per dword: s[84:85] = per-lane mask (v150 >= sgprLoopCounterL); lanes with the
// mask set take the shifted dword, the other lanes keep their current value.
// v149 is set before this section; each v_cmp is issued twice with identical operands.
// dwords +0,+1 are tested against v149+2
v_add_u32 v150, v149, 2                            // add part of K
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+0], v[vgprValuA_X0_I0+56+0+0+0], v152, s[84:85]
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+1], v[vgprValuA_X0_I0+56+0+0+1], v153, s[84:85]
// dwords +2,+3 are tested against v149+4
v_add_u32 v150, v150, 2                            // add part of K
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+2], v[vgprValuA_X0_I0+56+0+0+2], v154, s[84:85]
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+3], v[vgprValuA_X0_I0+56+0+0+3], v155, s[84:85]
// dwords +4,+5 are tested against v149+18
v_add_u32 v150, v150, 14                           // add part of K
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+4], v[vgprValuA_X0_I0+56+0+0+4], v156, s[84:85]
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+5], v[vgprValuA_X0_I0+56+0+0+5], v157, s[84:85]
// dwords +6,+7 are tested against v149+20
v_add_u32 v150, v150, 2                            // add part of K
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+6], v[vgprValuA_X0_I0+56+0+0+6], v158, s[84:85]
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+7], v[vgprValuA_X0_I0+56+0+0+7], v159, s[84:85]
// ---- K-bound masking of vgprValuB_X0_I0+0 .. +7 ----
// v[152:159] = the four 64-bit register pairs of this group, each shifted left
// by s83 bits (s83 holds the shift count prepared before this group).
v_lshlrev_b64 v[152:153], s83, v[vgprValuB_X0_I0+0+0+0+0:vgprValuB_X0_I0+0+0+0+0+1]
v_lshlrev_b64 v[154:155], s83, v[vgprValuB_X0_I0+0+0+0+2:vgprValuB_X0_I0+0+0+0+2+1]
v_lshlrev_b64 v[156:157], s83, v[vgprValuB_X0_I0+0+0+0+4:vgprValuB_X0_I0+0+0+0+4+1]
v_lshlrev_b64 v[158:159], s83, v[vgprValuB_X0_I0+0+0+0+6:vgprValuB_X0_I0+0+0+0+6+1]
// Per dword: s[84:85] = per-lane mask (v150 >= sgprLoopCounterL); lanes with the
// mask set take the shifted dword, the other lanes keep their current value.
// v149 is set before this section; each v_cmp is issued twice with identical operands.
// dwords +0,+1 are tested against v149+2
v_add_u32 v150, v149, 2                            // add part of K
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+0], v[vgprValuB_X0_I0+0+0+0+0], v152, s[84:85]
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+1], v[vgprValuB_X0_I0+0+0+0+1], v153, s[84:85]
// dwords +2,+3 are tested against v149+4
v_add_u32 v150, v150, 2                            // add part of K
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+2], v[vgprValuB_X0_I0+0+0+0+2], v154, s[84:85]
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+3], v[vgprValuB_X0_I0+0+0+0+3], v155, s[84:85]
// dwords +4,+5 are tested against v149+18
v_add_u32 v150, v150, 14                           // add part of K
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+4], v[vgprValuB_X0_I0+0+0+0+4], v156, s[84:85]
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+5], v[vgprValuB_X0_I0+0+0+0+5], v157, s[84:85]
// dwords +6,+7 are tested against v149+20
v_add_u32 v150, v150, 2                            // add part of K
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+6], v[vgprValuB_X0_I0+0+0+0+6], v158, s[84:85]
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+7], v[vgprValuB_X0_I0+0+0+0+7], v159, s[84:85]
// ---- K-bound masking of vgprValuB_X0_I0+8 .. +8+7 ----
// v[152:159] = the four 64-bit register pairs of this group, each shifted left
// by s83 bits (s83 holds the shift count prepared before this group).
v_lshlrev_b64 v[152:153], s83, v[vgprValuB_X0_I0+8+0+0+0:vgprValuB_X0_I0+8+0+0+0+1]
v_lshlrev_b64 v[154:155], s83, v[vgprValuB_X0_I0+8+0+0+2:vgprValuB_X0_I0+8+0+0+2+1]
v_lshlrev_b64 v[156:157], s83, v[vgprValuB_X0_I0+8+0+0+4:vgprValuB_X0_I0+8+0+0+4+1]
v_lshlrev_b64 v[158:159], s83, v[vgprValuB_X0_I0+8+0+0+6:vgprValuB_X0_I0+8+0+0+6+1]
// Per dword: s[84:85] = per-lane mask (v150 >= sgprLoopCounterL); lanes with the
// mask set take the shifted dword, the other lanes keep their current value.
// v149 is set before this section; each v_cmp is issued twice with identical operands.
// dwords +0,+1 are tested against v149+2
v_add_u32 v150, v149, 2                            // add part of K
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+0], v[vgprValuB_X0_I0+8+0+0+0], v152, s[84:85]
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+1], v[vgprValuB_X0_I0+8+0+0+1], v153, s[84:85]
// dwords +2,+3 are tested against v149+4
v_add_u32 v150, v150, 2                            // add part of K
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+2], v[vgprValuB_X0_I0+8+0+0+2], v154, s[84:85]
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+3], v[vgprValuB_X0_I0+8+0+0+3], v155, s[84:85]
// dwords +4,+5 are tested against v149+18
v_add_u32 v150, v150, 14                           // add part of K
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+4], v[vgprValuB_X0_I0+8+0+0+4], v156, s[84:85]
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+5], v[vgprValuB_X0_I0+8+0+0+5], v157, s[84:85]
// dwords +6,+7 are tested against v149+20
v_add_u32 v150, v150, 2                            // add part of K
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+6], v[vgprValuB_X0_I0+8+0+0+6], v158, s[84:85]
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+7], v[vgprValuB_X0_I0+8+0+0+7], v159, s[84:85]
// ---- K-bound masking of vgprValuB_X0_I0+16 .. +16+7 ----
// v[152:159] = the four 64-bit register pairs of this group, each shifted left
// by s83 bits (s83 holds the shift count prepared before this group).
v_lshlrev_b64 v[152:153], s83, v[vgprValuB_X0_I0+16+0+0+0:vgprValuB_X0_I0+16+0+0+0+1]
v_lshlrev_b64 v[154:155], s83, v[vgprValuB_X0_I0+16+0+0+2:vgprValuB_X0_I0+16+0+0+2+1]
v_lshlrev_b64 v[156:157], s83, v[vgprValuB_X0_I0+16+0+0+4:vgprValuB_X0_I0+16+0+0+4+1]
v_lshlrev_b64 v[158:159], s83, v[vgprValuB_X0_I0+16+0+0+6:vgprValuB_X0_I0+16+0+0+6+1]
// Per dword: s[84:85] = per-lane mask (v150 >= sgprLoopCounterL); lanes with the
// mask set take the shifted dword, the other lanes keep their current value.
// v149 is set before this section; each v_cmp is issued twice with identical operands.
// dwords +0,+1 are tested against v149+2
v_add_u32 v150, v149, 2                            // add part of K
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+0], v[vgprValuB_X0_I0+16+0+0+0], v152, s[84:85]
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+1], v[vgprValuB_X0_I0+16+0+0+1], v153, s[84:85]
// dwords +2,+3 are tested against v149+4
v_add_u32 v150, v150, 2                            // add part of K
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+2], v[vgprValuB_X0_I0+16+0+0+2], v154, s[84:85]
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+3], v[vgprValuB_X0_I0+16+0+0+3], v155, s[84:85]
// dwords +4,+5 are tested against v149+18
v_add_u32 v150, v150, 14                           // add part of K
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+4], v[vgprValuB_X0_I0+16+0+0+4], v156, s[84:85]
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+5], v[vgprValuB_X0_I0+16+0+0+5], v157, s[84:85]
// dwords +6,+7 are tested against v149+20
v_add_u32 v150, v150, 2                            // add part of K
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+6], v[vgprValuB_X0_I0+16+0+0+6], v158, s[84:85]
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+7], v[vgprValuB_X0_I0+16+0+0+7], v159, s[84:85]
// ---- K-bound masking of vgprValuB_X0_I0+24 .. +24+7 ----
// v[152:159] = the four 64-bit register pairs of this group, each shifted left
// by s83 bits (s83 holds the shift count prepared before this group).
v_lshlrev_b64 v[152:153], s83, v[vgprValuB_X0_I0+24+0+0+0:vgprValuB_X0_I0+24+0+0+0+1]
v_lshlrev_b64 v[154:155], s83, v[vgprValuB_X0_I0+24+0+0+2:vgprValuB_X0_I0+24+0+0+2+1]
v_lshlrev_b64 v[156:157], s83, v[vgprValuB_X0_I0+24+0+0+4:vgprValuB_X0_I0+24+0+0+4+1]
v_lshlrev_b64 v[158:159], s83, v[vgprValuB_X0_I0+24+0+0+6:vgprValuB_X0_I0+24+0+0+6+1]
// Per dword: s[84:85] = per-lane mask (v150 >= sgprLoopCounterL); lanes with the
// mask set take the shifted dword, the other lanes keep their current value.
// v149 is set before this section; each v_cmp is issued twice with identical operands.
// dwords +0,+1 are tested against v149+2
v_add_u32 v150, v149, 2                            // add part of K
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+0], v[vgprValuB_X0_I0+24+0+0+0], v152, s[84:85]
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+1], v[vgprValuB_X0_I0+24+0+0+1], v153, s[84:85]
// dwords +2,+3 are tested against v149+4
v_add_u32 v150, v150, 2                            // add part of K
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+2], v[vgprValuB_X0_I0+24+0+0+2], v154, s[84:85]
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+3], v[vgprValuB_X0_I0+24+0+0+3], v155, s[84:85]
// dwords +4,+5 are tested against v149+18
v_add_u32 v150, v150, 14                           // add part of K
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+4], v[vgprValuB_X0_I0+24+0+0+4], v156, s[84:85]
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+5], v[vgprValuB_X0_I0+24+0+0+5], v157, s[84:85]
// dwords +6,+7 are tested against v149+20
v_add_u32 v150, v150, 2                            // add part of K
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+6], v[vgprValuB_X0_I0+24+0+0+6], v158, s[84:85]
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+7], v[vgprValuB_X0_I0+24+0+0+7], v159, s[84:85]
// ---- K-bound masking of vgprValuB_X0_I0+32 .. +32+7 ----
// v[152:159] = the four 64-bit register pairs of this group, each shifted left
// by s83 bits (s83 holds the shift count prepared before this group).
v_lshlrev_b64 v[152:153], s83, v[vgprValuB_X0_I0+32+0+0+0:vgprValuB_X0_I0+32+0+0+0+1]
v_lshlrev_b64 v[154:155], s83, v[vgprValuB_X0_I0+32+0+0+2:vgprValuB_X0_I0+32+0+0+2+1]
v_lshlrev_b64 v[156:157], s83, v[vgprValuB_X0_I0+32+0+0+4:vgprValuB_X0_I0+32+0+0+4+1]
v_lshlrev_b64 v[158:159], s83, v[vgprValuB_X0_I0+32+0+0+6:vgprValuB_X0_I0+32+0+0+6+1]
// Per dword: s[84:85] = per-lane mask (v150 >= sgprLoopCounterL); lanes with the
// mask set take the shifted dword, the other lanes keep their current value.
// v149 is set before this section; each v_cmp is issued twice with identical operands.
// dwords +0,+1 are tested against v149+2
v_add_u32 v150, v149, 2                            // add part of K
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+0], v[vgprValuB_X0_I0+32+0+0+0], v152, s[84:85]
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+1], v[vgprValuB_X0_I0+32+0+0+1], v153, s[84:85]
// dwords +2,+3 are tested against v149+4
v_add_u32 v150, v150, 2                            // add part of K
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+2], v[vgprValuB_X0_I0+32+0+0+2], v154, s[84:85]
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+3], v[vgprValuB_X0_I0+32+0+0+3], v155, s[84:85]
// dwords +4,+5 are tested against v149+18
v_add_u32 v150, v150, 14                           // add part of K
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+4], v[vgprValuB_X0_I0+32+0+0+4], v156, s[84:85]
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+5], v[vgprValuB_X0_I0+32+0+0+5], v157, s[84:85]
// dwords +6,+7 are tested against v149+20
v_add_u32 v150, v150, 2                            // add part of K
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+6], v[vgprValuB_X0_I0+32+0+0+6], v158, s[84:85]
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+7], v[vgprValuB_X0_I0+32+0+0+7], v159, s[84:85]
// ---- K-bound masking of vgprValuB_X0_I0+40 .. +40+7 ----
// v[152:159] = the four 64-bit register pairs of this group, each shifted left
// by s83 bits (s83 holds the shift count prepared before this group).
v_lshlrev_b64 v[152:153], s83, v[vgprValuB_X0_I0+40+0+0+0:vgprValuB_X0_I0+40+0+0+0+1]
v_lshlrev_b64 v[154:155], s83, v[vgprValuB_X0_I0+40+0+0+2:vgprValuB_X0_I0+40+0+0+2+1]
v_lshlrev_b64 v[156:157], s83, v[vgprValuB_X0_I0+40+0+0+4:vgprValuB_X0_I0+40+0+0+4+1]
v_lshlrev_b64 v[158:159], s83, v[vgprValuB_X0_I0+40+0+0+6:vgprValuB_X0_I0+40+0+0+6+1]
// Per dword: s[84:85] = per-lane mask (v150 >= sgprLoopCounterL); lanes with the
// mask set take the shifted dword, the other lanes keep their current value.
// v149 is set before this section; each v_cmp is issued twice with identical operands.
// dwords +0,+1 are tested against v149+2
v_add_u32 v150, v149, 2                            // add part of K
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+0], v[vgprValuB_X0_I0+40+0+0+0], v152, s[84:85]
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+1], v[vgprValuB_X0_I0+40+0+0+1], v153, s[84:85]
// dwords +2,+3 are tested against v149+4
v_add_u32 v150, v150, 2                            // add part of K
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+2], v[vgprValuB_X0_I0+40+0+0+2], v154, s[84:85]
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+3], v[vgprValuB_X0_I0+40+0+0+3], v155, s[84:85]
// dwords +4,+5 are tested against v149+18
v_add_u32 v150, v150, 14                           // add part of K
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+4], v[vgprValuB_X0_I0+40+0+0+4], v156, s[84:85]
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+5], v[vgprValuB_X0_I0+40+0+0+5], v157, s[84:85]
// dwords +6,+7 are tested against v149+20
v_add_u32 v150, v150, 2                            // add part of K
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+6], v[vgprValuB_X0_I0+40+0+0+6], v158, s[84:85]
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+7], v[vgprValuB_X0_I0+40+0+0+7], v159, s[84:85]
// ---- K-bound masking of vgprValuB_X0_I0+48 .. +48+7 ----
// v[152:159] = the four 64-bit register pairs of this group, each shifted left
// by s83 bits (s83 holds the shift count prepared before this group).
v_lshlrev_b64 v[152:153], s83, v[vgprValuB_X0_I0+48+0+0+0:vgprValuB_X0_I0+48+0+0+0+1]
v_lshlrev_b64 v[154:155], s83, v[vgprValuB_X0_I0+48+0+0+2:vgprValuB_X0_I0+48+0+0+2+1]
v_lshlrev_b64 v[156:157], s83, v[vgprValuB_X0_I0+48+0+0+4:vgprValuB_X0_I0+48+0+0+4+1]
v_lshlrev_b64 v[158:159], s83, v[vgprValuB_X0_I0+48+0+0+6:vgprValuB_X0_I0+48+0+0+6+1]
// Per dword: s[84:85] = per-lane mask (v150 >= sgprLoopCounterL); lanes with the
// mask set take the shifted dword, the other lanes keep their current value.
// v149 is set before this section; each v_cmp is issued twice with identical operands.
// dwords +0,+1 are tested against v149+2
v_add_u32 v150, v149, 2                            // add part of K
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+0], v[vgprValuB_X0_I0+48+0+0+0], v152, s[84:85]
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+1], v[vgprValuB_X0_I0+48+0+0+1], v153, s[84:85]
// dwords +2,+3 are tested against v149+4
v_add_u32 v150, v150, 2                            // add part of K
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+2], v[vgprValuB_X0_I0+48+0+0+2], v154, s[84:85]
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+3], v[vgprValuB_X0_I0+48+0+0+3], v155, s[84:85]
// dwords +4,+5 are tested against v149+18
v_add_u32 v150, v150, 14                           // add part of K
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+4], v[vgprValuB_X0_I0+48+0+0+4], v156, s[84:85]
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+5], v[vgprValuB_X0_I0+48+0+0+5], v157, s[84:85]
// dwords +6,+7 are tested against v149+20
v_add_u32 v150, v150, 2                            // add part of K
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+6], v[vgprValuB_X0_I0+48+0+0+6], v158, s[84:85]
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+7], v[vgprValuB_X0_I0+48+0+0+7], v159, s[84:85]
// ---- K-bound masking of vgprValuB_X0_I0+56 .. +56+7 ----
// v[152:159] = the four 64-bit register pairs of this group, each shifted left
// by s83 bits (s83 holds the shift count prepared before this group).
v_lshlrev_b64 v[152:153], s83, v[vgprValuB_X0_I0+56+0+0+0:vgprValuB_X0_I0+56+0+0+0+1]
v_lshlrev_b64 v[154:155], s83, v[vgprValuB_X0_I0+56+0+0+2:vgprValuB_X0_I0+56+0+0+2+1]
v_lshlrev_b64 v[156:157], s83, v[vgprValuB_X0_I0+56+0+0+4:vgprValuB_X0_I0+56+0+0+4+1]
v_lshlrev_b64 v[158:159], s83, v[vgprValuB_X0_I0+56+0+0+6:vgprValuB_X0_I0+56+0+0+6+1]
// Per dword: s[84:85] = per-lane mask (v150 >= sgprLoopCounterL); lanes with the
// mask set take the shifted dword, the other lanes keep their current value.
// v149 is set before this section; each v_cmp is issued twice with identical operands.
// dwords +0,+1 are tested against v149+2
v_add_u32 v150, v149, 2                            // add part of K
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+0], v[vgprValuB_X0_I0+56+0+0+0], v152, s[84:85]
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+1], v[vgprValuB_X0_I0+56+0+0+1], v153, s[84:85]
// dwords +2,+3 are tested against v149+4
v_add_u32 v150, v150, 2                            // add part of K
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+2], v[vgprValuB_X0_I0+56+0+0+2], v154, s[84:85]
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+3], v[vgprValuB_X0_I0+56+0+0+3], v155, s[84:85]
// dwords +4,+5 are tested against v149+18
v_add_u32 v150, v150, 14                           // add part of K
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+4], v[vgprValuB_X0_I0+56+0+0+4], v156, s[84:85]
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+5], v[vgprValuB_X0_I0+56+0+0+5], v157, s[84:85]
// dwords +6,+7 are tested against v149+20
v_add_u32 v150, v150, 2                            // add part of K
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+6], v[vgprValuB_X0_I0+56+0+0+6], v158, s[84:85]
v_cmp_ge_i32 s[84:85], v150, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+7], v[vgprValuB_X0_I0+56+0+0+7], v159, s[84:85]
// ---- Split the 8 f32 values in vgprValuA_X0_I0+0 .. +7 into bf16 + bf16 residual ----
// On exit: +0..+3 = packed bf16 conversions of elements (0,1),(2,3),(4,5),(6,7);
//          +4..+7 = packed bf16 conversions of the residuals x - f32(bf16(x)), same pairing.
// v149 (the K base used by the masking code above) and v152..v155 are reused as scratch.
// elements 0,1: v150 = packed bf16 pair; each element is replaced by its residual
v_cvt_pk_bf16_f32 v150, v[vgprValuA_X0_I0+0], v[vgprValuA_X0_I0+0+1]
v_cvt_f32_bf16 v149, v150
v_sub_f32 v[vgprValuA_X0_I0+0], v[vgprValuA_X0_I0+0], v149
v_cvt_f32_bf16 v152, v150 src0_sel:WORD_1          // cvt bf16 to f32
v_sub_f32 v[vgprValuA_X0_I0+0+1], v[vgprValuA_X0_I0+0+1], v152
// elements 2,3: v151 = packed bf16 pair; v153 is a temporary for both halves
v_cvt_pk_bf16_f32 v151, v[vgprValuA_X0_I0+0+2], v[vgprValuA_X0_I0+0+3]
v_cvt_f32_bf16 v153, v151
v_sub_f32 v[vgprValuA_X0_I0+0+2], v[vgprValuA_X0_I0+0+2], v153
v_cvt_f32_bf16 v153, v151 src0_sel:WORD_1          // cvt bf16 to f32
v_sub_f32 v[vgprValuA_X0_I0+0+3], v[vgprValuA_X0_I0+0+3], v153
// elements 4,5: v152 (its earlier temporary value is no longer needed) = packed bf16 pair
v_cvt_pk_bf16_f32 v152, v[vgprValuA_X0_I0+4], v[vgprValuA_X0_I0+4+1]
v_cvt_f32_bf16 v149, v152
v_sub_f32 v[vgprValuA_X0_I0+4], v[vgprValuA_X0_I0+4], v149
v_cvt_f32_bf16 v154, v152 src0_sel:WORD_1          // cvt bf16 to f32
v_sub_f32 v[vgprValuA_X0_I0+4+1], v[vgprValuA_X0_I0+4+1], v154
// elements 6,7: v153 = packed bf16 pair; v155 is a temporary for both halves
v_cvt_pk_bf16_f32 v153, v[vgprValuA_X0_I0+4+2], v[vgprValuA_X0_I0+4+3]
v_cvt_f32_bf16 v155, v153
v_sub_f32 v[vgprValuA_X0_I0+4+2], v[vgprValuA_X0_I0+4+2], v155
v_cvt_f32_bf16 v155, v153 src0_sel:WORD_1          // cvt bf16 to f32
v_sub_f32 v[vgprValuA_X0_I0+4+3], v[vgprValuA_X0_I0+4+3], v155
// Pack the residuals into +4..+7. Destinations are written from +7 down to +4,
// so every source register is read before it is overwritten.
v_cvt_pk_bf16_f32 v[vgprValuA_X0_I0+0+7], v[vgprValuA_X0_I0+0+6], v[vgprValuA_X0_I0+0+7]
v_cvt_pk_bf16_f32 v[vgprValuA_X0_I0+0+6], v[vgprValuA_X0_I0+0+4], v[vgprValuA_X0_I0+0+5]
v_cvt_pk_bf16_f32 v[vgprValuA_X0_I0+0+5], v[vgprValuA_X0_I0+0+2], v[vgprValuA_X0_I0+0+3]
v_cvt_pk_bf16_f32 v[vgprValuA_X0_I0+0+4], v[vgprValuA_X0_I0+0+0], v[vgprValuA_X0_I0+0+1]
// Copy the four packed bf16 words held in v150..v153 into +0..+3.
v_mov_b64 v[vgprValuA_X0_I0+0+0:vgprValuA_X0_I0+0+0+1], v[150:151]
v_mov_b64 v[vgprValuA_X0_I0+0+2:vgprValuA_X0_I0+0+2+1], v[152:153]
// ---- Split the 8 f32 values in vgprValuA_X0_I0+8 .. +15 into bf16 + bf16 residual ----
// On exit: +8..+11 = packed bf16 conversions of elements (0,1),(2,3),(4,5),(6,7) of this group;
//          +12..+15 = packed bf16 conversions of the residuals x - f32(bf16(x)), same pairing.
// v149 and v150..v155 are scratch here.
// elements 0,1: v150 = packed bf16 pair; each element is replaced by its residual
v_cvt_pk_bf16_f32 v150, v[vgprValuA_X0_I0+8], v[vgprValuA_X0_I0+8+1]
v_cvt_f32_bf16 v149, v150
v_sub_f32 v[vgprValuA_X0_I0+8], v[vgprValuA_X0_I0+8], v149
v_cvt_f32_bf16 v152, v150 src0_sel:WORD_1          // cvt bf16 to f32
v_sub_f32 v[vgprValuA_X0_I0+8+1], v[vgprValuA_X0_I0+8+1], v152
// elements 2,3: v151 = packed bf16 pair; v153 is a temporary for both halves
v_cvt_pk_bf16_f32 v151, v[vgprValuA_X0_I0+8+2], v[vgprValuA_X0_I0+8+3]
v_cvt_f32_bf16 v153, v151
v_sub_f32 v[vgprValuA_X0_I0+8+2], v[vgprValuA_X0_I0+8+2], v153
v_cvt_f32_bf16 v153, v151 src0_sel:WORD_1          // cvt bf16 to f32
v_sub_f32 v[vgprValuA_X0_I0+8+3], v[vgprValuA_X0_I0+8+3], v153
// elements 4,5: v152 (its earlier temporary value is no longer needed) = packed bf16 pair
v_cvt_pk_bf16_f32 v152, v[vgprValuA_X0_I0+12], v[vgprValuA_X0_I0+12+1]
v_cvt_f32_bf16 v149, v152
v_sub_f32 v[vgprValuA_X0_I0+12], v[vgprValuA_X0_I0+12], v149
v_cvt_f32_bf16 v154, v152 src0_sel:WORD_1          // cvt bf16 to f32
v_sub_f32 v[vgprValuA_X0_I0+12+1], v[vgprValuA_X0_I0+12+1], v154
// elements 6,7: v153 = packed bf16 pair; v155 is a temporary for both halves
v_cvt_pk_bf16_f32 v153, v[vgprValuA_X0_I0+12+2], v[vgprValuA_X0_I0+12+3]
v_cvt_f32_bf16 v155, v153
v_sub_f32 v[vgprValuA_X0_I0+12+2], v[vgprValuA_X0_I0+12+2], v155
v_cvt_f32_bf16 v155, v153 src0_sel:WORD_1          // cvt bf16 to f32
v_sub_f32 v[vgprValuA_X0_I0+12+3], v[vgprValuA_X0_I0+12+3], v155
// Pack the residuals into +12..+15. Destinations are written from +15 down to +12,
// so every source register is read before it is overwritten.
v_cvt_pk_bf16_f32 v[vgprValuA_X0_I0+8+7], v[vgprValuA_X0_I0+8+6], v[vgprValuA_X0_I0+8+7]
v_cvt_pk_bf16_f32 v[vgprValuA_X0_I0+8+6], v[vgprValuA_X0_I0+8+4], v[vgprValuA_X0_I0+8+5]
v_cvt_pk_bf16_f32 v[vgprValuA_X0_I0+8+5], v[vgprValuA_X0_I0+8+2], v[vgprValuA_X0_I0+8+3]
v_cvt_pk_bf16_f32 v[vgprValuA_X0_I0+8+4], v[vgprValuA_X0_I0+8+0], v[vgprValuA_X0_I0+8+1]
// Copy the four packed bf16 words held in v150..v153 into +8..+11.
v_mov_b64 v[vgprValuA_X0_I0+8+0:vgprValuA_X0_I0+8+0+1], v[150:151]
v_mov_b64 v[vgprValuA_X0_I0+8+2:vgprValuA_X0_I0+8+2+1], v[152:153]
/* ValuA_X0_I0+16 .. +23: split eight f32 values into a bf16 value plus a bf16 residual, in place. */
/* Result: +16..+19 hold the packed bf16(x) pairs, +20..+23 hold the packed bf16(x - f32(bf16(x))) pairs. */
/* v149..v155 are used as temporaries. */
/* Registers 0,1 of the group: v150 = packed bf16 pair; each source is then replaced by its residual. */
v_cvt_pk_bf16_f32 v150, v[vgprValuA_X0_I0+16], v[vgprValuA_X0_I0+16+1]
v_cvt_f32_bf16 v149, v150
v_sub_f32 v[vgprValuA_X0_I0+16], v[vgprValuA_X0_I0+16], v149
v_cvt_f32_bf16 v152, v150 src0_sel:WORD_1          // cvt bf16 to f32
v_sub_f32 v[vgprValuA_X0_I0+16+1], v[vgprValuA_X0_I0+16+1], v152
/* Registers 2,3: packed pair goes to v151 (v153 is only a temporary here). */
v_cvt_pk_bf16_f32 v151, v[vgprValuA_X0_I0+16+2], v[vgprValuA_X0_I0+16+3]
v_cvt_f32_bf16 v153, v151
v_sub_f32 v[vgprValuA_X0_I0+16+2], v[vgprValuA_X0_I0+16+2], v153
v_cvt_f32_bf16 v153, v151 src0_sel:WORD_1          // cvt bf16 to f32
v_sub_f32 v[vgprValuA_X0_I0+16+3], v[vgprValuA_X0_I0+16+3], v153
/* Registers 4,5: packed pair goes to v152 (its earlier temporary value is no longer needed). */
v_cvt_pk_bf16_f32 v152, v[vgprValuA_X0_I0+20], v[vgprValuA_X0_I0+20+1]
v_cvt_f32_bf16 v149, v152
v_sub_f32 v[vgprValuA_X0_I0+20], v[vgprValuA_X0_I0+20], v149
v_cvt_f32_bf16 v154, v152 src0_sel:WORD_1          // cvt bf16 to f32
v_sub_f32 v[vgprValuA_X0_I0+20+1], v[vgprValuA_X0_I0+20+1], v154
/* Registers 6,7: packed pair goes to v153. */
v_cvt_pk_bf16_f32 v153, v[vgprValuA_X0_I0+20+2], v[vgprValuA_X0_I0+20+3]
v_cvt_f32_bf16 v155, v153
v_sub_f32 v[vgprValuA_X0_I0+20+2], v[vgprValuA_X0_I0+20+2], v155
v_cvt_f32_bf16 v155, v153 src0_sel:WORD_1          // cvt bf16 to f32
v_sub_f32 v[vgprValuA_X0_I0+20+3], v[vgprValuA_X0_I0+20+3], v155
/* Pack the residuals into registers 4..7. Highest destination first, so each source pair is read before it is overwritten. */
v_cvt_pk_bf16_f32 v[vgprValuA_X0_I0+16+7], v[vgprValuA_X0_I0+16+6], v[vgprValuA_X0_I0+16+7]
v_cvt_pk_bf16_f32 v[vgprValuA_X0_I0+16+6], v[vgprValuA_X0_I0+16+4], v[vgprValuA_X0_I0+16+5]
v_cvt_pk_bf16_f32 v[vgprValuA_X0_I0+16+5], v[vgprValuA_X0_I0+16+2], v[vgprValuA_X0_I0+16+3]
v_cvt_pk_bf16_f32 v[vgprValuA_X0_I0+16+4], v[vgprValuA_X0_I0+16+0], v[vgprValuA_X0_I0+16+1]
/* Copy the packed bf16 values v150..v153 into registers 0..3 of the group. */
v_mov_b64 v[vgprValuA_X0_I0+16+0:vgprValuA_X0_I0+16+0+1], v[150:151]
v_mov_b64 v[vgprValuA_X0_I0+16+2:vgprValuA_X0_I0+16+2+1], v[152:153]
/* ValuA_X0_I0+24 .. +31: split eight f32 values into a bf16 value plus a bf16 residual, in place. */
/* Result: +24..+27 hold the packed bf16(x) pairs, +28..+31 hold the packed bf16(x - f32(bf16(x))) pairs. */
/* v149..v155 are used as temporaries. */
/* Registers 0,1 of the group: v150 = packed bf16 pair; each source is then replaced by its residual. */
v_cvt_pk_bf16_f32 v150, v[vgprValuA_X0_I0+24], v[vgprValuA_X0_I0+24+1]
v_cvt_f32_bf16 v149, v150
v_sub_f32 v[vgprValuA_X0_I0+24], v[vgprValuA_X0_I0+24], v149
v_cvt_f32_bf16 v152, v150 src0_sel:WORD_1          // cvt bf16 to f32
v_sub_f32 v[vgprValuA_X0_I0+24+1], v[vgprValuA_X0_I0+24+1], v152
/* Registers 2,3: packed pair goes to v151 (v153 is only a temporary here). */
v_cvt_pk_bf16_f32 v151, v[vgprValuA_X0_I0+24+2], v[vgprValuA_X0_I0+24+3]
v_cvt_f32_bf16 v153, v151
v_sub_f32 v[vgprValuA_X0_I0+24+2], v[vgprValuA_X0_I0+24+2], v153
v_cvt_f32_bf16 v153, v151 src0_sel:WORD_1          // cvt bf16 to f32
v_sub_f32 v[vgprValuA_X0_I0+24+3], v[vgprValuA_X0_I0+24+3], v153
/* Registers 4,5: packed pair goes to v152 (its earlier temporary value is no longer needed). */
v_cvt_pk_bf16_f32 v152, v[vgprValuA_X0_I0+28], v[vgprValuA_X0_I0+28+1]
v_cvt_f32_bf16 v149, v152
v_sub_f32 v[vgprValuA_X0_I0+28], v[vgprValuA_X0_I0+28], v149
v_cvt_f32_bf16 v154, v152 src0_sel:WORD_1          // cvt bf16 to f32
v_sub_f32 v[vgprValuA_X0_I0+28+1], v[vgprValuA_X0_I0+28+1], v154
/* Registers 6,7: packed pair goes to v153. */
v_cvt_pk_bf16_f32 v153, v[vgprValuA_X0_I0+28+2], v[vgprValuA_X0_I0+28+3]
v_cvt_f32_bf16 v155, v153
v_sub_f32 v[vgprValuA_X0_I0+28+2], v[vgprValuA_X0_I0+28+2], v155
v_cvt_f32_bf16 v155, v153 src0_sel:WORD_1          // cvt bf16 to f32
v_sub_f32 v[vgprValuA_X0_I0+28+3], v[vgprValuA_X0_I0+28+3], v155
/* Pack the residuals into registers 4..7. Highest destination first, so each source pair is read before it is overwritten. */
v_cvt_pk_bf16_f32 v[vgprValuA_X0_I0+24+7], v[vgprValuA_X0_I0+24+6], v[vgprValuA_X0_I0+24+7]
v_cvt_pk_bf16_f32 v[vgprValuA_X0_I0+24+6], v[vgprValuA_X0_I0+24+4], v[vgprValuA_X0_I0+24+5]
v_cvt_pk_bf16_f32 v[vgprValuA_X0_I0+24+5], v[vgprValuA_X0_I0+24+2], v[vgprValuA_X0_I0+24+3]
v_cvt_pk_bf16_f32 v[vgprValuA_X0_I0+24+4], v[vgprValuA_X0_I0+24+0], v[vgprValuA_X0_I0+24+1]
/* Copy the packed bf16 values v150..v153 into registers 0..3 of the group. */
v_mov_b64 v[vgprValuA_X0_I0+24+0:vgprValuA_X0_I0+24+0+1], v[150:151]
v_mov_b64 v[vgprValuA_X0_I0+24+2:vgprValuA_X0_I0+24+2+1], v[152:153]
/* ValuA_X0_I0+32 .. +39: split eight f32 values into a bf16 value plus a bf16 residual, in place. */
/* Result: +32..+35 hold the packed bf16(x) pairs, +36..+39 hold the packed bf16(x - f32(bf16(x))) pairs. */
/* v149..v155 are used as temporaries. */
/* Registers 0,1 of the group: v150 = packed bf16 pair; each source is then replaced by its residual. */
v_cvt_pk_bf16_f32 v150, v[vgprValuA_X0_I0+32], v[vgprValuA_X0_I0+32+1]
v_cvt_f32_bf16 v149, v150
v_sub_f32 v[vgprValuA_X0_I0+32], v[vgprValuA_X0_I0+32], v149
v_cvt_f32_bf16 v152, v150 src0_sel:WORD_1          // cvt bf16 to f32
v_sub_f32 v[vgprValuA_X0_I0+32+1], v[vgprValuA_X0_I0+32+1], v152
/* Registers 2,3: packed pair goes to v151 (v153 is only a temporary here). */
v_cvt_pk_bf16_f32 v151, v[vgprValuA_X0_I0+32+2], v[vgprValuA_X0_I0+32+3]
v_cvt_f32_bf16 v153, v151
v_sub_f32 v[vgprValuA_X0_I0+32+2], v[vgprValuA_X0_I0+32+2], v153
v_cvt_f32_bf16 v153, v151 src0_sel:WORD_1          // cvt bf16 to f32
v_sub_f32 v[vgprValuA_X0_I0+32+3], v[vgprValuA_X0_I0+32+3], v153
/* Registers 4,5: packed pair goes to v152 (its earlier temporary value is no longer needed). */
v_cvt_pk_bf16_f32 v152, v[vgprValuA_X0_I0+36], v[vgprValuA_X0_I0+36+1]
v_cvt_f32_bf16 v149, v152
v_sub_f32 v[vgprValuA_X0_I0+36], v[vgprValuA_X0_I0+36], v149
v_cvt_f32_bf16 v154, v152 src0_sel:WORD_1          // cvt bf16 to f32
v_sub_f32 v[vgprValuA_X0_I0+36+1], v[vgprValuA_X0_I0+36+1], v154
/* Registers 6,7: packed pair goes to v153. */
v_cvt_pk_bf16_f32 v153, v[vgprValuA_X0_I0+36+2], v[vgprValuA_X0_I0+36+3]
v_cvt_f32_bf16 v155, v153
v_sub_f32 v[vgprValuA_X0_I0+36+2], v[vgprValuA_X0_I0+36+2], v155
v_cvt_f32_bf16 v155, v153 src0_sel:WORD_1          // cvt bf16 to f32
v_sub_f32 v[vgprValuA_X0_I0+36+3], v[vgprValuA_X0_I0+36+3], v155
/* Pack the residuals into registers 4..7. Highest destination first, so each source pair is read before it is overwritten. */
v_cvt_pk_bf16_f32 v[vgprValuA_X0_I0+32+7], v[vgprValuA_X0_I0+32+6], v[vgprValuA_X0_I0+32+7]
v_cvt_pk_bf16_f32 v[vgprValuA_X0_I0+32+6], v[vgprValuA_X0_I0+32+4], v[vgprValuA_X0_I0+32+5]
v_cvt_pk_bf16_f32 v[vgprValuA_X0_I0+32+5], v[vgprValuA_X0_I0+32+2], v[vgprValuA_X0_I0+32+3]
v_cvt_pk_bf16_f32 v[vgprValuA_X0_I0+32+4], v[vgprValuA_X0_I0+32+0], v[vgprValuA_X0_I0+32+1]
/* Copy the packed bf16 values v150..v153 into registers 0..3 of the group. */
v_mov_b64 v[vgprValuA_X0_I0+32+0:vgprValuA_X0_I0+32+0+1], v[150:151]
v_mov_b64 v[vgprValuA_X0_I0+32+2:vgprValuA_X0_I0+32+2+1], v[152:153]
/* ValuA_X0_I0+40 .. +47: split eight f32 values into a bf16 value plus a bf16 residual, in place. */
/* Result: +40..+43 hold the packed bf16(x) pairs, +44..+47 hold the packed bf16(x - f32(bf16(x))) pairs. */
/* v149..v155 are used as temporaries. */
/* Registers 0,1 of the group: v150 = packed bf16 pair; each source is then replaced by its residual. */
v_cvt_pk_bf16_f32 v150, v[vgprValuA_X0_I0+40], v[vgprValuA_X0_I0+40+1]
v_cvt_f32_bf16 v149, v150
v_sub_f32 v[vgprValuA_X0_I0+40], v[vgprValuA_X0_I0+40], v149
v_cvt_f32_bf16 v152, v150 src0_sel:WORD_1          // cvt bf16 to f32
v_sub_f32 v[vgprValuA_X0_I0+40+1], v[vgprValuA_X0_I0+40+1], v152
/* Registers 2,3: packed pair goes to v151 (v153 is only a temporary here). */
v_cvt_pk_bf16_f32 v151, v[vgprValuA_X0_I0+40+2], v[vgprValuA_X0_I0+40+3]
v_cvt_f32_bf16 v153, v151
v_sub_f32 v[vgprValuA_X0_I0+40+2], v[vgprValuA_X0_I0+40+2], v153
v_cvt_f32_bf16 v153, v151 src0_sel:WORD_1          // cvt bf16 to f32
v_sub_f32 v[vgprValuA_X0_I0+40+3], v[vgprValuA_X0_I0+40+3], v153
/* Registers 4,5: packed pair goes to v152 (its earlier temporary value is no longer needed). */
v_cvt_pk_bf16_f32 v152, v[vgprValuA_X0_I0+44], v[vgprValuA_X0_I0+44+1]
v_cvt_f32_bf16 v149, v152
v_sub_f32 v[vgprValuA_X0_I0+44], v[vgprValuA_X0_I0+44], v149
v_cvt_f32_bf16 v154, v152 src0_sel:WORD_1          // cvt bf16 to f32
v_sub_f32 v[vgprValuA_X0_I0+44+1], v[vgprValuA_X0_I0+44+1], v154
/* Registers 6,7: packed pair goes to v153. */
v_cvt_pk_bf16_f32 v153, v[vgprValuA_X0_I0+44+2], v[vgprValuA_X0_I0+44+3]
v_cvt_f32_bf16 v155, v153
v_sub_f32 v[vgprValuA_X0_I0+44+2], v[vgprValuA_X0_I0+44+2], v155
v_cvt_f32_bf16 v155, v153 src0_sel:WORD_1          // cvt bf16 to f32
v_sub_f32 v[vgprValuA_X0_I0+44+3], v[vgprValuA_X0_I0+44+3], v155
/* Pack the residuals into registers 4..7. Highest destination first, so each source pair is read before it is overwritten. */
v_cvt_pk_bf16_f32 v[vgprValuA_X0_I0+40+7], v[vgprValuA_X0_I0+40+6], v[vgprValuA_X0_I0+40+7]
v_cvt_pk_bf16_f32 v[vgprValuA_X0_I0+40+6], v[vgprValuA_X0_I0+40+4], v[vgprValuA_X0_I0+40+5]
v_cvt_pk_bf16_f32 v[vgprValuA_X0_I0+40+5], v[vgprValuA_X0_I0+40+2], v[vgprValuA_X0_I0+40+3]
v_cvt_pk_bf16_f32 v[vgprValuA_X0_I0+40+4], v[vgprValuA_X0_I0+40+0], v[vgprValuA_X0_I0+40+1]
/* Copy the packed bf16 values v150..v153 into registers 0..3 of the group. */
v_mov_b64 v[vgprValuA_X0_I0+40+0:vgprValuA_X0_I0+40+0+1], v[150:151]
v_mov_b64 v[vgprValuA_X0_I0+40+2:vgprValuA_X0_I0+40+2+1], v[152:153]
/* ValuA_X0_I0+48 .. +55: split eight f32 values into a bf16 value plus a bf16 residual, in place. */
/* Result: +48..+51 hold the packed bf16(x) pairs, +52..+55 hold the packed bf16(x - f32(bf16(x))) pairs. */
/* v149..v155 are used as temporaries. */
/* Registers 0,1 of the group: v150 = packed bf16 pair; each source is then replaced by its residual. */
v_cvt_pk_bf16_f32 v150, v[vgprValuA_X0_I0+48], v[vgprValuA_X0_I0+48+1]
v_cvt_f32_bf16 v149, v150
v_sub_f32 v[vgprValuA_X0_I0+48], v[vgprValuA_X0_I0+48], v149
v_cvt_f32_bf16 v152, v150 src0_sel:WORD_1          // cvt bf16 to f32
v_sub_f32 v[vgprValuA_X0_I0+48+1], v[vgprValuA_X0_I0+48+1], v152
/* Registers 2,3: packed pair goes to v151 (v153 is only a temporary here). */
v_cvt_pk_bf16_f32 v151, v[vgprValuA_X0_I0+48+2], v[vgprValuA_X0_I0+48+3]
v_cvt_f32_bf16 v153, v151
v_sub_f32 v[vgprValuA_X0_I0+48+2], v[vgprValuA_X0_I0+48+2], v153
v_cvt_f32_bf16 v153, v151 src0_sel:WORD_1          // cvt bf16 to f32
v_sub_f32 v[vgprValuA_X0_I0+48+3], v[vgprValuA_X0_I0+48+3], v153
/* Registers 4,5: packed pair goes to v152 (its earlier temporary value is no longer needed). */
v_cvt_pk_bf16_f32 v152, v[vgprValuA_X0_I0+52], v[vgprValuA_X0_I0+52+1]
v_cvt_f32_bf16 v149, v152
v_sub_f32 v[vgprValuA_X0_I0+52], v[vgprValuA_X0_I0+52], v149
v_cvt_f32_bf16 v154, v152 src0_sel:WORD_1          // cvt bf16 to f32
v_sub_f32 v[vgprValuA_X0_I0+52+1], v[vgprValuA_X0_I0+52+1], v154
/* Registers 6,7: packed pair goes to v153. */
v_cvt_pk_bf16_f32 v153, v[vgprValuA_X0_I0+52+2], v[vgprValuA_X0_I0+52+3]
v_cvt_f32_bf16 v155, v153
v_sub_f32 v[vgprValuA_X0_I0+52+2], v[vgprValuA_X0_I0+52+2], v155
v_cvt_f32_bf16 v155, v153 src0_sel:WORD_1          // cvt bf16 to f32
v_sub_f32 v[vgprValuA_X0_I0+52+3], v[vgprValuA_X0_I0+52+3], v155
/* Pack the residuals into registers 4..7. Highest destination first, so each source pair is read before it is overwritten. */
v_cvt_pk_bf16_f32 v[vgprValuA_X0_I0+48+7], v[vgprValuA_X0_I0+48+6], v[vgprValuA_X0_I0+48+7]
v_cvt_pk_bf16_f32 v[vgprValuA_X0_I0+48+6], v[vgprValuA_X0_I0+48+4], v[vgprValuA_X0_I0+48+5]
v_cvt_pk_bf16_f32 v[vgprValuA_X0_I0+48+5], v[vgprValuA_X0_I0+48+2], v[vgprValuA_X0_I0+48+3]
v_cvt_pk_bf16_f32 v[vgprValuA_X0_I0+48+4], v[vgprValuA_X0_I0+48+0], v[vgprValuA_X0_I0+48+1]
/* Copy the packed bf16 values v150..v153 into registers 0..3 of the group. */
v_mov_b64 v[vgprValuA_X0_I0+48+0:vgprValuA_X0_I0+48+0+1], v[150:151]
v_mov_b64 v[vgprValuA_X0_I0+48+2:vgprValuA_X0_I0+48+2+1], v[152:153]
/* ValuA_X0_I0+56 .. +63: split eight f32 values into a bf16 value plus a bf16 residual, in place. */
/* Result: +56..+59 hold the packed bf16(x) pairs, +60..+63 hold the packed bf16(x - f32(bf16(x))) pairs. */
/* v149..v155 are used as temporaries. */
/* Registers 0,1 of the group: v150 = packed bf16 pair; each source is then replaced by its residual. */
v_cvt_pk_bf16_f32 v150, v[vgprValuA_X0_I0+56], v[vgprValuA_X0_I0+56+1]
v_cvt_f32_bf16 v149, v150
v_sub_f32 v[vgprValuA_X0_I0+56], v[vgprValuA_X0_I0+56], v149
v_cvt_f32_bf16 v152, v150 src0_sel:WORD_1          // cvt bf16 to f32
v_sub_f32 v[vgprValuA_X0_I0+56+1], v[vgprValuA_X0_I0+56+1], v152
/* Registers 2,3: packed pair goes to v151 (v153 is only a temporary here). */
v_cvt_pk_bf16_f32 v151, v[vgprValuA_X0_I0+56+2], v[vgprValuA_X0_I0+56+3]
v_cvt_f32_bf16 v153, v151
v_sub_f32 v[vgprValuA_X0_I0+56+2], v[vgprValuA_X0_I0+56+2], v153
v_cvt_f32_bf16 v153, v151 src0_sel:WORD_1          // cvt bf16 to f32
v_sub_f32 v[vgprValuA_X0_I0+56+3], v[vgprValuA_X0_I0+56+3], v153
/* Registers 4,5: packed pair goes to v152 (its earlier temporary value is no longer needed). */
v_cvt_pk_bf16_f32 v152, v[vgprValuA_X0_I0+60], v[vgprValuA_X0_I0+60+1]
v_cvt_f32_bf16 v149, v152
v_sub_f32 v[vgprValuA_X0_I0+60], v[vgprValuA_X0_I0+60], v149
v_cvt_f32_bf16 v154, v152 src0_sel:WORD_1          // cvt bf16 to f32
v_sub_f32 v[vgprValuA_X0_I0+60+1], v[vgprValuA_X0_I0+60+1], v154
/* Registers 6,7: packed pair goes to v153. */
v_cvt_pk_bf16_f32 v153, v[vgprValuA_X0_I0+60+2], v[vgprValuA_X0_I0+60+3]
v_cvt_f32_bf16 v155, v153
v_sub_f32 v[vgprValuA_X0_I0+60+2], v[vgprValuA_X0_I0+60+2], v155
v_cvt_f32_bf16 v155, v153 src0_sel:WORD_1          // cvt bf16 to f32
v_sub_f32 v[vgprValuA_X0_I0+60+3], v[vgprValuA_X0_I0+60+3], v155
/* Pack the residuals into registers 4..7. Highest destination first, so each source pair is read before it is overwritten. */
v_cvt_pk_bf16_f32 v[vgprValuA_X0_I0+56+7], v[vgprValuA_X0_I0+56+6], v[vgprValuA_X0_I0+56+7]
v_cvt_pk_bf16_f32 v[vgprValuA_X0_I0+56+6], v[vgprValuA_X0_I0+56+4], v[vgprValuA_X0_I0+56+5]
v_cvt_pk_bf16_f32 v[vgprValuA_X0_I0+56+5], v[vgprValuA_X0_I0+56+2], v[vgprValuA_X0_I0+56+3]
v_cvt_pk_bf16_f32 v[vgprValuA_X0_I0+56+4], v[vgprValuA_X0_I0+56+0], v[vgprValuA_X0_I0+56+1]
/* Copy the packed bf16 values v150..v153 into registers 0..3 of the group. */
v_mov_b64 v[vgprValuA_X0_I0+56+0:vgprValuA_X0_I0+56+0+1], v[150:151]
v_mov_b64 v[vgprValuA_X0_I0+56+2:vgprValuA_X0_I0+56+2+1], v[152:153]
/* ValuB_X0_I0+0 .. +7: split eight f32 values into a bf16 value plus a bf16 residual, in place. */
/* Result: +0..+3 hold the packed bf16(x) pairs, +4..+7 hold the packed bf16(x - f32(bf16(x))) pairs. */
/* v149..v155 are used as temporaries. */
/* Registers 0,1 of the group: v150 = packed bf16 pair; each source is then replaced by its residual. */
v_cvt_pk_bf16_f32 v150, v[vgprValuB_X0_I0+0], v[vgprValuB_X0_I0+0+1]
v_cvt_f32_bf16 v149, v150
v_sub_f32 v[vgprValuB_X0_I0+0], v[vgprValuB_X0_I0+0], v149
v_cvt_f32_bf16 v152, v150 src0_sel:WORD_1          // cvt bf16 to f32
v_sub_f32 v[vgprValuB_X0_I0+0+1], v[vgprValuB_X0_I0+0+1], v152
/* Registers 2,3: packed pair goes to v151 (v153 is only a temporary here). */
v_cvt_pk_bf16_f32 v151, v[vgprValuB_X0_I0+0+2], v[vgprValuB_X0_I0+0+3]
v_cvt_f32_bf16 v153, v151
v_sub_f32 v[vgprValuB_X0_I0+0+2], v[vgprValuB_X0_I0+0+2], v153
v_cvt_f32_bf16 v153, v151 src0_sel:WORD_1          // cvt bf16 to f32
v_sub_f32 v[vgprValuB_X0_I0+0+3], v[vgprValuB_X0_I0+0+3], v153
/* Registers 4,5: packed pair goes to v152 (its earlier temporary value is no longer needed). */
v_cvt_pk_bf16_f32 v152, v[vgprValuB_X0_I0+4], v[vgprValuB_X0_I0+4+1]
v_cvt_f32_bf16 v149, v152
v_sub_f32 v[vgprValuB_X0_I0+4], v[vgprValuB_X0_I0+4], v149
v_cvt_f32_bf16 v154, v152 src0_sel:WORD_1          // cvt bf16 to f32
v_sub_f32 v[vgprValuB_X0_I0+4+1], v[vgprValuB_X0_I0+4+1], v154
/* Registers 6,7: packed pair goes to v153. */
v_cvt_pk_bf16_f32 v153, v[vgprValuB_X0_I0+4+2], v[vgprValuB_X0_I0+4+3]
v_cvt_f32_bf16 v155, v153
v_sub_f32 v[vgprValuB_X0_I0+4+2], v[vgprValuB_X0_I0+4+2], v155
v_cvt_f32_bf16 v155, v153 src0_sel:WORD_1          // cvt bf16 to f32
v_sub_f32 v[vgprValuB_X0_I0+4+3], v[vgprValuB_X0_I0+4+3], v155
/* Pack the residuals into registers 4..7. Highest destination first, so each source pair is read before it is overwritten. */
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+0+7], v[vgprValuB_X0_I0+0+6], v[vgprValuB_X0_I0+0+7]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+0+6], v[vgprValuB_X0_I0+0+4], v[vgprValuB_X0_I0+0+5]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+0+5], v[vgprValuB_X0_I0+0+2], v[vgprValuB_X0_I0+0+3]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+0+4], v[vgprValuB_X0_I0+0+0], v[vgprValuB_X0_I0+0+1]
/* Copy the packed bf16 values v150..v153 into registers 0..3 of the group. */
v_mov_b64 v[vgprValuB_X0_I0+0+0:vgprValuB_X0_I0+0+0+1], v[150:151]
v_mov_b64 v[vgprValuB_X0_I0+0+2:vgprValuB_X0_I0+0+2+1], v[152:153]
/* ValuB_X0_I0+8 .. +15: split eight f32 values into a bf16 value plus a bf16 residual, in place. */
/* Result: +8..+11 hold the packed bf16(x) pairs, +12..+15 hold the packed bf16(x - f32(bf16(x))) pairs. */
/* v149..v155 are used as temporaries. */
/* Registers 0,1 of the group: v150 = packed bf16 pair; each source is then replaced by its residual. */
v_cvt_pk_bf16_f32 v150, v[vgprValuB_X0_I0+8], v[vgprValuB_X0_I0+8+1]
v_cvt_f32_bf16 v149, v150
v_sub_f32 v[vgprValuB_X0_I0+8], v[vgprValuB_X0_I0+8], v149
v_cvt_f32_bf16 v152, v150 src0_sel:WORD_1          // cvt bf16 to f32
v_sub_f32 v[vgprValuB_X0_I0+8+1], v[vgprValuB_X0_I0+8+1], v152
/* Registers 2,3: packed pair goes to v151 (v153 is only a temporary here). */
v_cvt_pk_bf16_f32 v151, v[vgprValuB_X0_I0+8+2], v[vgprValuB_X0_I0+8+3]
v_cvt_f32_bf16 v153, v151
v_sub_f32 v[vgprValuB_X0_I0+8+2], v[vgprValuB_X0_I0+8+2], v153
v_cvt_f32_bf16 v153, v151 src0_sel:WORD_1          // cvt bf16 to f32
v_sub_f32 v[vgprValuB_X0_I0+8+3], v[vgprValuB_X0_I0+8+3], v153
/* Registers 4,5: packed pair goes to v152 (its earlier temporary value is no longer needed). */
v_cvt_pk_bf16_f32 v152, v[vgprValuB_X0_I0+12], v[vgprValuB_X0_I0+12+1]
v_cvt_f32_bf16 v149, v152
v_sub_f32 v[vgprValuB_X0_I0+12], v[vgprValuB_X0_I0+12], v149
v_cvt_f32_bf16 v154, v152 src0_sel:WORD_1          // cvt bf16 to f32
v_sub_f32 v[vgprValuB_X0_I0+12+1], v[vgprValuB_X0_I0+12+1], v154
/* Registers 6,7: packed pair goes to v153. */
v_cvt_pk_bf16_f32 v153, v[vgprValuB_X0_I0+12+2], v[vgprValuB_X0_I0+12+3]
v_cvt_f32_bf16 v155, v153
v_sub_f32 v[vgprValuB_X0_I0+12+2], v[vgprValuB_X0_I0+12+2], v155
v_cvt_f32_bf16 v155, v153 src0_sel:WORD_1          // cvt bf16 to f32
v_sub_f32 v[vgprValuB_X0_I0+12+3], v[vgprValuB_X0_I0+12+3], v155
/* Pack the residuals into registers 4..7. Highest destination first, so each source pair is read before it is overwritten. */
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+8+7], v[vgprValuB_X0_I0+8+6], v[vgprValuB_X0_I0+8+7]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+8+6], v[vgprValuB_X0_I0+8+4], v[vgprValuB_X0_I0+8+5]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+8+5], v[vgprValuB_X0_I0+8+2], v[vgprValuB_X0_I0+8+3]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+8+4], v[vgprValuB_X0_I0+8+0], v[vgprValuB_X0_I0+8+1]
/* Copy the packed bf16 values v150..v153 into registers 0..3 of the group. */
v_mov_b64 v[vgprValuB_X0_I0+8+0:vgprValuB_X0_I0+8+0+1], v[150:151]
v_mov_b64 v[vgprValuB_X0_I0+8+2:vgprValuB_X0_I0+8+2+1], v[152:153]
/* ValuB_X0_I0+16 .. +23: split eight f32 values into a bf16 value plus a bf16 residual, in place. */
/* Result: +16..+19 hold the packed bf16(x) pairs, +20..+23 hold the packed bf16(x - f32(bf16(x))) pairs. */
/* v149..v155 are used as temporaries. */
/* Registers 0,1 of the group: v150 = packed bf16 pair; each source is then replaced by its residual. */
v_cvt_pk_bf16_f32 v150, v[vgprValuB_X0_I0+16], v[vgprValuB_X0_I0+16+1]
v_cvt_f32_bf16 v149, v150
v_sub_f32 v[vgprValuB_X0_I0+16], v[vgprValuB_X0_I0+16], v149
v_cvt_f32_bf16 v152, v150 src0_sel:WORD_1          // cvt bf16 to f32
v_sub_f32 v[vgprValuB_X0_I0+16+1], v[vgprValuB_X0_I0+16+1], v152
/* Registers 2,3: packed pair goes to v151 (v153 is only a temporary here). */
v_cvt_pk_bf16_f32 v151, v[vgprValuB_X0_I0+16+2], v[vgprValuB_X0_I0+16+3]
v_cvt_f32_bf16 v153, v151
v_sub_f32 v[vgprValuB_X0_I0+16+2], v[vgprValuB_X0_I0+16+2], v153
v_cvt_f32_bf16 v153, v151 src0_sel:WORD_1          // cvt bf16 to f32
v_sub_f32 v[vgprValuB_X0_I0+16+3], v[vgprValuB_X0_I0+16+3], v153
/* Registers 4,5: packed pair goes to v152 (its earlier temporary value is no longer needed). */
v_cvt_pk_bf16_f32 v152, v[vgprValuB_X0_I0+20], v[vgprValuB_X0_I0+20+1]
v_cvt_f32_bf16 v149, v152
v_sub_f32 v[vgprValuB_X0_I0+20], v[vgprValuB_X0_I0+20], v149
v_cvt_f32_bf16 v154, v152 src0_sel:WORD_1          // cvt bf16 to f32
v_sub_f32 v[vgprValuB_X0_I0+20+1], v[vgprValuB_X0_I0+20+1], v154
/* Registers 6,7: packed pair goes to v153. */
v_cvt_pk_bf16_f32 v153, v[vgprValuB_X0_I0+20+2], v[vgprValuB_X0_I0+20+3]
v_cvt_f32_bf16 v155, v153
v_sub_f32 v[vgprValuB_X0_I0+20+2], v[vgprValuB_X0_I0+20+2], v155
v_cvt_f32_bf16 v155, v153 src0_sel:WORD_1          // cvt bf16 to f32
v_sub_f32 v[vgprValuB_X0_I0+20+3], v[vgprValuB_X0_I0+20+3], v155
/* Pack the residuals into registers 4..7. Highest destination first, so each source pair is read before it is overwritten. */
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+16+7], v[vgprValuB_X0_I0+16+6], v[vgprValuB_X0_I0+16+7]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+16+6], v[vgprValuB_X0_I0+16+4], v[vgprValuB_X0_I0+16+5]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+16+5], v[vgprValuB_X0_I0+16+2], v[vgprValuB_X0_I0+16+3]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+16+4], v[vgprValuB_X0_I0+16+0], v[vgprValuB_X0_I0+16+1]
/* Copy the packed bf16 values v150..v153 into registers 0..3 of the group. */
v_mov_b64 v[vgprValuB_X0_I0+16+0:vgprValuB_X0_I0+16+0+1], v[150:151]
v_mov_b64 v[vgprValuB_X0_I0+16+2:vgprValuB_X0_I0+16+2+1], v[152:153]
/* ValuB_X0_I0+24 .. +31: split eight f32 values into a bf16 value plus a bf16 residual, in place. */
/* Result: +24..+27 hold the packed bf16(x) pairs, +28..+31 hold the packed bf16(x - f32(bf16(x))) pairs. */
/* v149..v155 are used as temporaries. */
/* Registers 0,1 of the group: v150 = packed bf16 pair; each source is then replaced by its residual. */
v_cvt_pk_bf16_f32 v150, v[vgprValuB_X0_I0+24], v[vgprValuB_X0_I0+24+1]
v_cvt_f32_bf16 v149, v150
v_sub_f32 v[vgprValuB_X0_I0+24], v[vgprValuB_X0_I0+24], v149
v_cvt_f32_bf16 v152, v150 src0_sel:WORD_1          // cvt bf16 to f32
v_sub_f32 v[vgprValuB_X0_I0+24+1], v[vgprValuB_X0_I0+24+1], v152
/* Registers 2,3: packed pair goes to v151 (v153 is only a temporary here). */
v_cvt_pk_bf16_f32 v151, v[vgprValuB_X0_I0+24+2], v[vgprValuB_X0_I0+24+3]
v_cvt_f32_bf16 v153, v151
v_sub_f32 v[vgprValuB_X0_I0+24+2], v[vgprValuB_X0_I0+24+2], v153
v_cvt_f32_bf16 v153, v151 src0_sel:WORD_1          // cvt bf16 to f32
v_sub_f32 v[vgprValuB_X0_I0+24+3], v[vgprValuB_X0_I0+24+3], v153
/* Registers 4,5: packed pair goes to v152 (its earlier temporary value is no longer needed). */
v_cvt_pk_bf16_f32 v152, v[vgprValuB_X0_I0+28], v[vgprValuB_X0_I0+28+1]
v_cvt_f32_bf16 v149, v152
v_sub_f32 v[vgprValuB_X0_I0+28], v[vgprValuB_X0_I0+28], v149
v_cvt_f32_bf16 v154, v152 src0_sel:WORD_1          // cvt bf16 to f32
v_sub_f32 v[vgprValuB_X0_I0+28+1], v[vgprValuB_X0_I0+28+1], v154
/* Registers 6,7: packed pair goes to v153. */
v_cvt_pk_bf16_f32 v153, v[vgprValuB_X0_I0+28+2], v[vgprValuB_X0_I0+28+3]
v_cvt_f32_bf16 v155, v153
v_sub_f32 v[vgprValuB_X0_I0+28+2], v[vgprValuB_X0_I0+28+2], v155
v_cvt_f32_bf16 v155, v153 src0_sel:WORD_1          // cvt bf16 to f32
v_sub_f32 v[vgprValuB_X0_I0+28+3], v[vgprValuB_X0_I0+28+3], v155
/* Pack the residuals into registers 4..7. Highest destination first, so each source pair is read before it is overwritten. */
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+24+7], v[vgprValuB_X0_I0+24+6], v[vgprValuB_X0_I0+24+7]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+24+6], v[vgprValuB_X0_I0+24+4], v[vgprValuB_X0_I0+24+5]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+24+5], v[vgprValuB_X0_I0+24+2], v[vgprValuB_X0_I0+24+3]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+24+4], v[vgprValuB_X0_I0+24+0], v[vgprValuB_X0_I0+24+1]
/* Copy the packed bf16 values v150..v153 into registers 0..3 of the group. */
v_mov_b64 v[vgprValuB_X0_I0+24+0:vgprValuB_X0_I0+24+0+1], v[150:151]
v_mov_b64 v[vgprValuB_X0_I0+24+2:vgprValuB_X0_I0+24+2+1], v[152:153]
/* ValuB_X0_I0+32 .. +39: split eight f32 values into a bf16 value plus a bf16 residual, in place. */
/* Result: +32..+35 hold the packed bf16(x) pairs, +36..+39 hold the packed bf16(x - f32(bf16(x))) pairs. */
/* v149..v155 are used as temporaries. */
/* Registers 0,1 of the group: v150 = packed bf16 pair; each source is then replaced by its residual. */
v_cvt_pk_bf16_f32 v150, v[vgprValuB_X0_I0+32], v[vgprValuB_X0_I0+32+1]
v_cvt_f32_bf16 v149, v150
v_sub_f32 v[vgprValuB_X0_I0+32], v[vgprValuB_X0_I0+32], v149
v_cvt_f32_bf16 v152, v150 src0_sel:WORD_1          // cvt bf16 to f32
v_sub_f32 v[vgprValuB_X0_I0+32+1], v[vgprValuB_X0_I0+32+1], v152
/* Registers 2,3: packed pair goes to v151 (v153 is only a temporary here). */
v_cvt_pk_bf16_f32 v151, v[vgprValuB_X0_I0+32+2], v[vgprValuB_X0_I0+32+3]
v_cvt_f32_bf16 v153, v151
v_sub_f32 v[vgprValuB_X0_I0+32+2], v[vgprValuB_X0_I0+32+2], v153
v_cvt_f32_bf16 v153, v151 src0_sel:WORD_1          // cvt bf16 to f32
v_sub_f32 v[vgprValuB_X0_I0+32+3], v[vgprValuB_X0_I0+32+3], v153
/* Registers 4,5: packed pair goes to v152 (its earlier temporary value is no longer needed). */
v_cvt_pk_bf16_f32 v152, v[vgprValuB_X0_I0+36], v[vgprValuB_X0_I0+36+1]
v_cvt_f32_bf16 v149, v152
v_sub_f32 v[vgprValuB_X0_I0+36], v[vgprValuB_X0_I0+36], v149
v_cvt_f32_bf16 v154, v152 src0_sel:WORD_1          // cvt bf16 to f32
v_sub_f32 v[vgprValuB_X0_I0+36+1], v[vgprValuB_X0_I0+36+1], v154
/* Registers 6,7: packed pair goes to v153. */
v_cvt_pk_bf16_f32 v153, v[vgprValuB_X0_I0+36+2], v[vgprValuB_X0_I0+36+3]
v_cvt_f32_bf16 v155, v153
v_sub_f32 v[vgprValuB_X0_I0+36+2], v[vgprValuB_X0_I0+36+2], v155
v_cvt_f32_bf16 v155, v153 src0_sel:WORD_1          // cvt bf16 to f32
v_sub_f32 v[vgprValuB_X0_I0+36+3], v[vgprValuB_X0_I0+36+3], v155
/* Pack the residuals into registers 4..7. Highest destination first, so each source pair is read before it is overwritten. */
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+32+7], v[vgprValuB_X0_I0+32+6], v[vgprValuB_X0_I0+32+7]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+32+6], v[vgprValuB_X0_I0+32+4], v[vgprValuB_X0_I0+32+5]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+32+5], v[vgprValuB_X0_I0+32+2], v[vgprValuB_X0_I0+32+3]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+32+4], v[vgprValuB_X0_I0+32+0], v[vgprValuB_X0_I0+32+1]
/* Copy the packed bf16 values v150..v153 into registers 0..3 of the group. */
v_mov_b64 v[vgprValuB_X0_I0+32+0:vgprValuB_X0_I0+32+0+1], v[150:151]
v_mov_b64 v[vgprValuB_X0_I0+32+2:vgprValuB_X0_I0+32+2+1], v[152:153]
/* ValuB_X0_I0+40 .. +47: split eight f32 values into a bf16 value plus a bf16 residual, in place. */
/* Result: +40..+43 hold the packed bf16(x) pairs, +44..+47 hold the packed bf16(x - f32(bf16(x))) pairs. */
/* v149..v155 are used as temporaries. */
/* Registers 0,1 of the group: v150 = packed bf16 pair; each source is then replaced by its residual. */
v_cvt_pk_bf16_f32 v150, v[vgprValuB_X0_I0+40], v[vgprValuB_X0_I0+40+1]
v_cvt_f32_bf16 v149, v150
v_sub_f32 v[vgprValuB_X0_I0+40], v[vgprValuB_X0_I0+40], v149
v_cvt_f32_bf16 v152, v150 src0_sel:WORD_1          // cvt bf16 to f32
v_sub_f32 v[vgprValuB_X0_I0+40+1], v[vgprValuB_X0_I0+40+1], v152
/* Registers 2,3: packed pair goes to v151 (v153 is only a temporary here). */
v_cvt_pk_bf16_f32 v151, v[vgprValuB_X0_I0+40+2], v[vgprValuB_X0_I0+40+3]
v_cvt_f32_bf16 v153, v151
v_sub_f32 v[vgprValuB_X0_I0+40+2], v[vgprValuB_X0_I0+40+2], v153
v_cvt_f32_bf16 v153, v151 src0_sel:WORD_1          // cvt bf16 to f32
v_sub_f32 v[vgprValuB_X0_I0+40+3], v[vgprValuB_X0_I0+40+3], v153
/* Registers 4,5: packed pair goes to v152 (its earlier temporary value is no longer needed). */
v_cvt_pk_bf16_f32 v152, v[vgprValuB_X0_I0+44], v[vgprValuB_X0_I0+44+1]
v_cvt_f32_bf16 v149, v152
v_sub_f32 v[vgprValuB_X0_I0+44], v[vgprValuB_X0_I0+44], v149
v_cvt_f32_bf16 v154, v152 src0_sel:WORD_1          // cvt bf16 to f32
v_sub_f32 v[vgprValuB_X0_I0+44+1], v[vgprValuB_X0_I0+44+1], v154
/* Registers 6,7: packed pair goes to v153. */
v_cvt_pk_bf16_f32 v153, v[vgprValuB_X0_I0+44+2], v[vgprValuB_X0_I0+44+3]
v_cvt_f32_bf16 v155, v153
v_sub_f32 v[vgprValuB_X0_I0+44+2], v[vgprValuB_X0_I0+44+2], v155
v_cvt_f32_bf16 v155, v153 src0_sel:WORD_1          // cvt bf16 to f32
v_sub_f32 v[vgprValuB_X0_I0+44+3], v[vgprValuB_X0_I0+44+3], v155
/* Pack the residuals into registers 4..7. Highest destination first, so each source pair is read before it is overwritten. */
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+40+7], v[vgprValuB_X0_I0+40+6], v[vgprValuB_X0_I0+40+7]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+40+6], v[vgprValuB_X0_I0+40+4], v[vgprValuB_X0_I0+40+5]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+40+5], v[vgprValuB_X0_I0+40+2], v[vgprValuB_X0_I0+40+3]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+40+4], v[vgprValuB_X0_I0+40+0], v[vgprValuB_X0_I0+40+1]
/* Copy the packed bf16 values v150..v153 into registers 0..3 of the group. */
v_mov_b64 v[vgprValuB_X0_I0+40+0:vgprValuB_X0_I0+40+0+1], v[150:151]
v_mov_b64 v[vgprValuB_X0_I0+40+2:vgprValuB_X0_I0+40+2+1], v[152:153]
/* ValuB_X0_I0+48 .. +55: split eight f32 values into a bf16 value plus a bf16 residual, in place. */
/* Result: +48..+51 hold the packed bf16(x) pairs, +52..+55 hold the packed bf16(x - f32(bf16(x))) pairs. */
/* v149..v155 are used as temporaries. */
/* Registers 0,1 of the group: v150 = packed bf16 pair; each source is then replaced by its residual. */
v_cvt_pk_bf16_f32 v150, v[vgprValuB_X0_I0+48], v[vgprValuB_X0_I0+48+1]
v_cvt_f32_bf16 v149, v150
v_sub_f32 v[vgprValuB_X0_I0+48], v[vgprValuB_X0_I0+48], v149
v_cvt_f32_bf16 v152, v150 src0_sel:WORD_1          // cvt bf16 to f32
v_sub_f32 v[vgprValuB_X0_I0+48+1], v[vgprValuB_X0_I0+48+1], v152
/* Registers 2,3: packed pair goes to v151 (v153 is only a temporary here). */
v_cvt_pk_bf16_f32 v151, v[vgprValuB_X0_I0+48+2], v[vgprValuB_X0_I0+48+3]
v_cvt_f32_bf16 v153, v151
v_sub_f32 v[vgprValuB_X0_I0+48+2], v[vgprValuB_X0_I0+48+2], v153
v_cvt_f32_bf16 v153, v151 src0_sel:WORD_1          // cvt bf16 to f32
v_sub_f32 v[vgprValuB_X0_I0+48+3], v[vgprValuB_X0_I0+48+3], v153
/* Registers 4,5: packed pair goes to v152 (its earlier temporary value is no longer needed). */
v_cvt_pk_bf16_f32 v152, v[vgprValuB_X0_I0+52], v[vgprValuB_X0_I0+52+1]
v_cvt_f32_bf16 v149, v152
v_sub_f32 v[vgprValuB_X0_I0+52], v[vgprValuB_X0_I0+52], v149
v_cvt_f32_bf16 v154, v152 src0_sel:WORD_1          // cvt bf16 to f32
v_sub_f32 v[vgprValuB_X0_I0+52+1], v[vgprValuB_X0_I0+52+1], v154
/* Registers 6,7: packed pair goes to v153. */
v_cvt_pk_bf16_f32 v153, v[vgprValuB_X0_I0+52+2], v[vgprValuB_X0_I0+52+3]
v_cvt_f32_bf16 v155, v153
v_sub_f32 v[vgprValuB_X0_I0+52+2], v[vgprValuB_X0_I0+52+2], v155
v_cvt_f32_bf16 v155, v153 src0_sel:WORD_1          // cvt bf16 to f32
v_sub_f32 v[vgprValuB_X0_I0+52+3], v[vgprValuB_X0_I0+52+3], v155
/* Pack the residuals into registers 4..7. Highest destination first, so each source pair is read before it is overwritten. */
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+48+7], v[vgprValuB_X0_I0+48+6], v[vgprValuB_X0_I0+48+7]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+48+6], v[vgprValuB_X0_I0+48+4], v[vgprValuB_X0_I0+48+5]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+48+5], v[vgprValuB_X0_I0+48+2], v[vgprValuB_X0_I0+48+3]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+48+4], v[vgprValuB_X0_I0+48+0], v[vgprValuB_X0_I0+48+1]
/* Copy the packed bf16 values v150..v153 into registers 0..3 of the group. */
v_mov_b64 v[vgprValuB_X0_I0+48+0:vgprValuB_X0_I0+48+0+1], v[150:151]
v_mov_b64 v[vgprValuB_X0_I0+48+2:vgprValuB_X0_I0+48+2+1], v[152:153]
/* ValuB_X0_I0+56 .. +63: split eight f32 values into a bf16 value plus a bf16 residual, in place. */
/* Result: +56..+59 hold the packed bf16(x) pairs, +60..+63 hold the packed bf16(x - f32(bf16(x))) pairs. */
/* v149..v155 are used as temporaries. */
/* Registers 0,1 of the group: v150 = packed bf16 pair; each source is then replaced by its residual. */
v_cvt_pk_bf16_f32 v150, v[vgprValuB_X0_I0+56], v[vgprValuB_X0_I0+56+1]
v_cvt_f32_bf16 v149, v150
v_sub_f32 v[vgprValuB_X0_I0+56], v[vgprValuB_X0_I0+56], v149
v_cvt_f32_bf16 v152, v150 src0_sel:WORD_1          // cvt bf16 to f32
v_sub_f32 v[vgprValuB_X0_I0+56+1], v[vgprValuB_X0_I0+56+1], v152
/* Registers 2,3: packed pair goes to v151 (v153 is only a temporary here). */
v_cvt_pk_bf16_f32 v151, v[vgprValuB_X0_I0+56+2], v[vgprValuB_X0_I0+56+3]
v_cvt_f32_bf16 v153, v151
v_sub_f32 v[vgprValuB_X0_I0+56+2], v[vgprValuB_X0_I0+56+2], v153
v_cvt_f32_bf16 v153, v151 src0_sel:WORD_1          // cvt bf16 to f32
v_sub_f32 v[vgprValuB_X0_I0+56+3], v[vgprValuB_X0_I0+56+3], v153
/* Registers 4,5: packed pair goes to v152 (its earlier temporary value is no longer needed). */
v_cvt_pk_bf16_f32 v152, v[vgprValuB_X0_I0+60], v[vgprValuB_X0_I0+60+1]
v_cvt_f32_bf16 v149, v152
v_sub_f32 v[vgprValuB_X0_I0+60], v[vgprValuB_X0_I0+60], v149
v_cvt_f32_bf16 v154, v152 src0_sel:WORD_1          // cvt bf16 to f32
v_sub_f32 v[vgprValuB_X0_I0+60+1], v[vgprValuB_X0_I0+60+1], v154
/* Registers 6,7: packed pair goes to v153. */
v_cvt_pk_bf16_f32 v153, v[vgprValuB_X0_I0+60+2], v[vgprValuB_X0_I0+60+3]
v_cvt_f32_bf16 v155, v153
v_sub_f32 v[vgprValuB_X0_I0+60+2], v[vgprValuB_X0_I0+60+2], v155
v_cvt_f32_bf16 v155, v153 src0_sel:WORD_1          // cvt bf16 to f32
v_sub_f32 v[vgprValuB_X0_I0+60+3], v[vgprValuB_X0_I0+60+3], v155
/* Pack the residuals into registers 4..7. Highest destination first, so each source pair is read before it is overwritten. */
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+56+7], v[vgprValuB_X0_I0+56+6], v[vgprValuB_X0_I0+56+7]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+56+6], v[vgprValuB_X0_I0+56+4], v[vgprValuB_X0_I0+56+5]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+56+5], v[vgprValuB_X0_I0+56+2], v[vgprValuB_X0_I0+56+3]
v_cvt_pk_bf16_f32 v[vgprValuB_X0_I0+56+4], v[vgprValuB_X0_I0+56+0], v[vgprValuB_X0_I0+56+1]
/* Copy the packed bf16 values v150..v153 into registers 0..3 of the group. */
v_mov_b64 v[vgprValuB_X0_I0+56+0:vgprValuB_X0_I0+56+0+1], v[150:151]
v_mov_b64 v[vgprValuB_X0_I0+56+2:vgprValuB_X0_I0+56+2+1], v[152:153]
/* MFMA accumulation of ValuB_X0_I0+0..+7 against each 8-register ValuA group -> acc[0:31]. */
/* Each 4-register accumulator tile receives three v_mfma_f32_16x16x32_bf16 in a row: */
/*   1) B[+0..+3] x A[+4..+7]   2) B[+4..+7] x A[+0..+3]   3) B[+0..+3] x A[+0..+3] */
/* The split sequences earlier in this section leave the packed bf16 values in registers +0..+3 */
/* of a group and the packed bf16 residuals in +4..+7, so the three terms are */
/* value x residual, residual x value and value x value. No residual x residual product is issued. */
/* NOTE(review): the s_nop presumably covers the wait states needed between the VALU writes just */
/* above and the first MFMA that reads those registers - confirm against the gfx950 hazard rules. */
s_nop 1
v_mfma_f32_16x16x32_bf16 acc[0:3], v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprValuA_X0_I0+0+4:vgprValuA_X0_I0+0+4+3], acc[0:3] // left value = acc[0+0:3+0]
v_mfma_f32_16x16x32_bf16 acc[0:3], v[vgprValuB_X0_I0+0+4:vgprValuB_X0_I0+0+4+3], v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], acc[0:3] // left value = acc[0+0:3+0]
v_mfma_f32_16x16x32_bf16 acc[0:3], v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], acc[0:3] // left value = acc[0+0:3+0]
v_mfma_f32_16x16x32_bf16 acc[4:7], v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprValuA_X0_I0+8+4:vgprValuA_X0_I0+8+4+3], acc[4:7] // left value = acc[4+0:7+0]
v_mfma_f32_16x16x32_bf16 acc[4:7], v[vgprValuB_X0_I0+0+4:vgprValuB_X0_I0+0+4+3], v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], acc[4:7] // left value = acc[4+0:7+0]
v_mfma_f32_16x16x32_bf16 acc[4:7], v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], acc[4:7] // left value = acc[4+0:7+0]
v_mfma_f32_16x16x32_bf16 acc[8:11], v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprValuA_X0_I0+16+4:vgprValuA_X0_I0+16+4+3], acc[8:11] // left value = acc[8+0:11+0]
v_mfma_f32_16x16x32_bf16 acc[8:11], v[vgprValuB_X0_I0+0+4:vgprValuB_X0_I0+0+4+3], v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], acc[8:11] // left value = acc[8+0:11+0]
v_mfma_f32_16x16x32_bf16 acc[8:11], v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], acc[8:11] // left value = acc[8+0:11+0]
v_mfma_f32_16x16x32_bf16 acc[12:15], v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprValuA_X0_I0+24+4:vgprValuA_X0_I0+24+4+3], acc[12:15] // left value = acc[12+0:15+0]
v_mfma_f32_16x16x32_bf16 acc[12:15], v[vgprValuB_X0_I0+0+4:vgprValuB_X0_I0+0+4+3], v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], acc[12:15] // left value = acc[12+0:15+0]
v_mfma_f32_16x16x32_bf16 acc[12:15], v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], acc[12:15] // left value = acc[12+0:15+0]
v_mfma_f32_16x16x32_bf16 acc[16:19], v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprValuA_X0_I0+32+4:vgprValuA_X0_I0+32+4+3], acc[16:19] // left value = acc[16+0:19+0]
v_mfma_f32_16x16x32_bf16 acc[16:19], v[vgprValuB_X0_I0+0+4:vgprValuB_X0_I0+0+4+3], v[vgprValuA_X0_I0+32:vgprValuA_X0_I0+32+3], acc[16:19] // left value = acc[16+0:19+0]
v_mfma_f32_16x16x32_bf16 acc[16:19], v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprValuA_X0_I0+32:vgprValuA_X0_I0+32+3], acc[16:19] // left value = acc[16+0:19+0]
v_mfma_f32_16x16x32_bf16 acc[20:23], v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprValuA_X0_I0+40+4:vgprValuA_X0_I0+40+4+3], acc[20:23] // left value = acc[20+0:23+0]
v_mfma_f32_16x16x32_bf16 acc[20:23], v[vgprValuB_X0_I0+0+4:vgprValuB_X0_I0+0+4+3], v[vgprValuA_X0_I0+40:vgprValuA_X0_I0+40+3], acc[20:23] // left value = acc[20+0:23+0]
v_mfma_f32_16x16x32_bf16 acc[20:23], v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprValuA_X0_I0+40:vgprValuA_X0_I0+40+3], acc[20:23] // left value = acc[20+0:23+0]
v_mfma_f32_16x16x32_bf16 acc[24:27], v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprValuA_X0_I0+48+4:vgprValuA_X0_I0+48+4+3], acc[24:27] // left value = acc[24+0:27+0]
v_mfma_f32_16x16x32_bf16 acc[24:27], v[vgprValuB_X0_I0+0+4:vgprValuB_X0_I0+0+4+3], v[vgprValuA_X0_I0+48:vgprValuA_X0_I0+48+3], acc[24:27] // left value = acc[24+0:27+0]
v_mfma_f32_16x16x32_bf16 acc[24:27], v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprValuA_X0_I0+48:vgprValuA_X0_I0+48+3], acc[24:27] // left value = acc[24+0:27+0]
v_mfma_f32_16x16x32_bf16 acc[28:31], v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprValuA_X0_I0+56+4:vgprValuA_X0_I0+56+4+3], acc[28:31] // left value = acc[28+0:31+0]
v_mfma_f32_16x16x32_bf16 acc[28:31], v[vgprValuB_X0_I0+0+4:vgprValuB_X0_I0+0+4+3], v[vgprValuA_X0_I0+56:vgprValuA_X0_I0+56+3], acc[28:31] // left value = acc[28+0:31+0]
v_mfma_f32_16x16x32_bf16 acc[28:31], v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprValuA_X0_I0+56:vgprValuA_X0_I0+56+3], acc[28:31] // left value = acc[28+0:31+0]
/* MFMA accumulation of ValuB_X0_I0+8..+15 against each 8-register ValuA group -> acc[32:63]. */
/* Each 4-register accumulator tile receives three v_mfma_f32_16x16x32_bf16 in a row: */
/*   1) B[+0..+3] x A[+4..+7]   2) B[+4..+7] x A[+0..+3]   3) B[+0..+3] x A[+0..+3] */
/* (offsets relative to the group base). Registers +0..+3 of a group hold the packed bf16 values */
/* and +4..+7 the packed bf16 residuals written by the split sequences earlier in this section; */
/* no residual x residual product is issued. */
v_mfma_f32_16x16x32_bf16 acc[32:35], v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprValuA_X0_I0+0+4:vgprValuA_X0_I0+0+4+3], acc[32:35] // left value = acc[32+0:35+0]
v_mfma_f32_16x16x32_bf16 acc[32:35], v[vgprValuB_X0_I0+8+4:vgprValuB_X0_I0+8+4+3], v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], acc[32:35] // left value = acc[32+0:35+0]
v_mfma_f32_16x16x32_bf16 acc[32:35], v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], acc[32:35] // left value = acc[32+0:35+0]
v_mfma_f32_16x16x32_bf16 acc[36:39], v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprValuA_X0_I0+8+4:vgprValuA_X0_I0+8+4+3], acc[36:39] // left value = acc[36+0:39+0]
v_mfma_f32_16x16x32_bf16 acc[36:39], v[vgprValuB_X0_I0+8+4:vgprValuB_X0_I0+8+4+3], v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], acc[36:39] // left value = acc[36+0:39+0]
v_mfma_f32_16x16x32_bf16 acc[36:39], v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], acc[36:39] // left value = acc[36+0:39+0]
v_mfma_f32_16x16x32_bf16 acc[40:43], v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprValuA_X0_I0+16+4:vgprValuA_X0_I0+16+4+3], acc[40:43] // left value = acc[40+0:43+0]
v_mfma_f32_16x16x32_bf16 acc[40:43], v[vgprValuB_X0_I0+8+4:vgprValuB_X0_I0+8+4+3], v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], acc[40:43] // left value = acc[40+0:43+0]
v_mfma_f32_16x16x32_bf16 acc[40:43], v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], acc[40:43] // left value = acc[40+0:43+0]
v_mfma_f32_16x16x32_bf16 acc[44:47], v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprValuA_X0_I0+24+4:vgprValuA_X0_I0+24+4+3], acc[44:47] // left value = acc[44+0:47+0]
v_mfma_f32_16x16x32_bf16 acc[44:47], v[vgprValuB_X0_I0+8+4:vgprValuB_X0_I0+8+4+3], v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], acc[44:47] // left value = acc[44+0:47+0]
v_mfma_f32_16x16x32_bf16 acc[44:47], v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], acc[44:47] // left value = acc[44+0:47+0]
v_mfma_f32_16x16x32_bf16 acc[48:51], v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprValuA_X0_I0+32+4:vgprValuA_X0_I0+32+4+3], acc[48:51] // left value = acc[48+0:51+0]
v_mfma_f32_16x16x32_bf16 acc[48:51], v[vgprValuB_X0_I0+8+4:vgprValuB_X0_I0+8+4+3], v[vgprValuA_X0_I0+32:vgprValuA_X0_I0+32+3], acc[48:51] // left value = acc[48+0:51+0]
v_mfma_f32_16x16x32_bf16 acc[48:51], v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprValuA_X0_I0+32:vgprValuA_X0_I0+32+3], acc[48:51] // left value = acc[48+0:51+0]
v_mfma_f32_16x16x32_bf16 acc[52:55], v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprValuA_X0_I0+40+4:vgprValuA_X0_I0+40+4+3], acc[52:55] // left value = acc[52+0:55+0]
v_mfma_f32_16x16x32_bf16 acc[52:55], v[vgprValuB_X0_I0+8+4:vgprValuB_X0_I0+8+4+3], v[vgprValuA_X0_I0+40:vgprValuA_X0_I0+40+3], acc[52:55] // left value = acc[52+0:55+0]
v_mfma_f32_16x16x32_bf16 acc[52:55], v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprValuA_X0_I0+40:vgprValuA_X0_I0+40+3], acc[52:55] // left value = acc[52+0:55+0]
v_mfma_f32_16x16x32_bf16 acc[56:59], v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprValuA_X0_I0+48+4:vgprValuA_X0_I0+48+4+3], acc[56:59] // left value = acc[56+0:59+0]
v_mfma_f32_16x16x32_bf16 acc[56:59], v[vgprValuB_X0_I0+8+4:vgprValuB_X0_I0+8+4+3], v[vgprValuA_X0_I0+48:vgprValuA_X0_I0+48+3], acc[56:59] // left value = acc[56+0:59+0]
v_mfma_f32_16x16x32_bf16 acc[56:59], v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprValuA_X0_I0+48:vgprValuA_X0_I0+48+3], acc[56:59] // left value = acc[56+0:59+0]
v_mfma_f32_16x16x32_bf16 acc[60:63], v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprValuA_X0_I0+56+4:vgprValuA_X0_I0+56+4+3], acc[60:63] // left value = acc[60+0:63+0]
v_mfma_f32_16x16x32_bf16 acc[60:63], v[vgprValuB_X0_I0+8+4:vgprValuB_X0_I0+8+4+3], v[vgprValuA_X0_I0+56:vgprValuA_X0_I0+56+3], acc[60:63] // left value = acc[60+0:63+0]
v_mfma_f32_16x16x32_bf16 acc[60:63], v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprValuA_X0_I0+56:vgprValuA_X0_I0+56+3], acc[60:63] // left value = acc[60+0:63+0]
/* MFMA accumulation of ValuB_X0_I0+16..+23 against the ValuA groups, starting at acc[64:67]. */
/* Same three-instruction pattern per accumulator tile as for the preceding ValuB groups: */
/*   1) B[+0..+3] x A[+4..+7]   2) B[+4..+7] x A[+0..+3]   3) B[+0..+3] x A[+0..+3] */
/* (offsets relative to the group base; +0..+3 = packed bf16 values, +4..+7 = packed bf16 residuals). */
v_mfma_f32_16x16x32_bf16 acc[64:67], v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprValuA_X0_I0+0+4:vgprValuA_X0_I0+0+4+3], acc[64:67] // left value = acc[64+0:67+0]
v_mfma_f32_16x16x32_bf16 acc[64:67], v[vgprValuB_X0_I0+16+4:vgprValuB_X0_I0+16+4+3], v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], acc[64:67] // left value = acc[64+0:67+0]
v_mfma_f32_16x16x32_bf16 acc[64:67], v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], acc[64:67] // left value = acc[64+0:67+0]
v_mfma_f32_16x16x32_bf16 acc[68:71], v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprValuA_X0_I0+8+4:vgprValuA_X0_I0+8+4+3], acc[68:71] // left value = acc[68+0:71+0]
v_mfma_f32_16x16x32_bf16 acc[68:71], v[vgprValuB_X0_I0+16+4:vgprValuB_X0_I0+16+4+3], v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], acc[68:71] // left value = acc[68+0:71+0]
v_mfma_f32_16x16x32_bf16 acc[68:71], v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], acc[68:71] // left value = acc[68+0:71+0]
v_mfma_f32_16x16x32_bf16 acc[72:75], v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprValuA_X0_I0+16+4:vgprValuA_X0_I0+16+4+3], acc[72:75] // left value = acc[72+0:75+0]
v_mfma_f32_16x16x32_bf16 acc[72:75], v[vgprValuB_X0_I0+16+4:vgprValuB_X0_I0+16+4+3], v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], acc[72:75] // left value = acc[72+0:75+0]
v_mfma_f32_16x16x32_bf16 acc[72:75], v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], acc[72:75] // left value = acc[72+0:75+0]
v_mfma_f32_16x16x32_bf16 acc[76:79], v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprValuA_X0_I0+24+4:vgprValuA_X0_I0+24+4+3], acc[76:79] // left value = acc[76+0:79+0]
v_mfma_f32_16x16x32_bf16 acc[76:79], v[vgprValuB_X0_I0+16+4:vgprValuB_X0_I0+16+4+3], v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], acc[76:79] // left value = acc[76+0:79+0]
v_mfma_f32_16x16x32_bf16 acc[76:79], v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], acc[76:79] // left value = acc[76+0:79+0]
v_mfma_f32_16x16x32_bf16 acc[80:83], v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprValuA_X0_I0+32+4:vgprValuA_X0_I0+32+4+3], acc[80:83] // left value = acc[80+0:83+0]
v_mfma_f32_16x16x32_bf16 acc[80:83], v[vgprValuB_X0_I0+16+4:vgprValuB_X0_I0+16+4+3], v[vgprValuA_X0_I0+32:vgprValuA_X0_I0+32+3], acc[80:83] // left value = acc[80+0:83+0]
v_mfma_f32_16x16x32_bf16 acc[80:83], v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprValuA_X0_I0+32:vgprValuA_X0_I0+32+3], acc[80:83] // left value = acc[80+0:83+0]
v_mfma_f32_16x16x32_bf16 acc[84:87], v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprValuA_X0_I0+40+4:vgprValuA_X0_I0+40+4+3], acc[84:87] // left value = acc[84+0:87+0]
v_mfma_f32_16x16x32_bf16 acc[84:87], v[vgprValuB_X0_I0+16+4:vgprValuB_X0_I0+16+4+3], v[vgprValuA_X0_I0+40:vgprValuA_X0_I0+40+3], acc[84:87] // left value = acc[84+0:87+0]
v_mfma_f32_16x16x32_bf16 acc[84:87], v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprValuA_X0_I0+40:vgprValuA_X0_I0+40+3], acc[84:87] // left value = acc[84+0:87+0]
v_mfma_f32_16x16x32_bf16 acc[88:91], v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprValuA_X0_I0+48+4:vgprValuA_X0_I0+48+4+3], acc[88:91] // left value = acc[88+0:91+0]
v_mfma_f32_16x16x32_bf16 acc[88:91], v[vgprValuB_X0_I0+16+4:vgprValuB_X0_I0+16+4+3], v[vgprValuA_X0_I0+48:vgprValuA_X0_I0+48+3], acc[88:91] // left value = acc[88+0:91+0]
v_mfma_f32_16x16x32_bf16 acc[88:91], v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprValuA_X0_I0+48:vgprValuA_X0_I0+48+3], acc[88:91] // left value = acc[88+0:91+0]
v_mfma_f32_16x16x32_bf16 acc[92:95], v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprValuA_X0_I0+56+4:vgprValuA_X0_I0+56+4+3], acc[92:95] // left value = acc[92+0:95+0]
v_mfma_f32_16x16x32_bf16 acc[92:95], v[vgprValuB_X0_I0+16+4:vgprValuB_X0_I0+16+4+3], v[vgprValuA_X0_I0+56:vgprValuA_X0_I0+56+3], acc[92:95] // left value = acc[92+0:95+0]
v_mfma_f32_16x16x32_bf16 acc[92:95], v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprValuA_X0_I0+56:vgprValuA_X0_I0+56+3], acc[92:95] // left value = acc[92+0:95+0]
v_mfma_f32_16x16x32_bf16 acc[96:99], v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprValuA_X0_I0+0+4:vgprValuA_X0_I0+0+4+3], acc[96:99] // left value = acc[96+0:99+0]
v_mfma_f32_16x16x32_bf16 acc[96:99], v[vgprValuB_X0_I0+24+4:vgprValuB_X0_I0+24+4+3], v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], acc[96:99] // left value = acc[96+0:99+0]
v_mfma_f32_16x16x32_bf16 acc[96:99], v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], acc[96:99] // left value = acc[96+0:99+0]
v_mfma_f32_16x16x32_bf16 acc[100:103], v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprValuA_X0_I0+8+4:vgprValuA_X0_I0+8+4+3], acc[100:103] // left value = acc[100+0:103+0]
v_mfma_f32_16x16x32_bf16 acc[100:103], v[vgprValuB_X0_I0+24+4:vgprValuB_X0_I0+24+4+3], v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], acc[100:103] // left value = acc[100+0:103+0]
v_mfma_f32_16x16x32_bf16 acc[100:103], v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], acc[100:103] // left value = acc[100+0:103+0]
v_mfma_f32_16x16x32_bf16 acc[104:107], v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprValuA_X0_I0+16+4:vgprValuA_X0_I0+16+4+3], acc[104:107] // left value = acc[104+0:107+0]
v_mfma_f32_16x16x32_bf16 acc[104:107], v[vgprValuB_X0_I0+24+4:vgprValuB_X0_I0+24+4+3], v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], acc[104:107] // left value = acc[104+0:107+0]
v_mfma_f32_16x16x32_bf16 acc[104:107], v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], acc[104:107] // left value = acc[104+0:107+0]
v_mfma_f32_16x16x32_bf16 acc[108:111], v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprValuA_X0_I0+24+4:vgprValuA_X0_I0+24+4+3], acc[108:111] // left value = acc[108+0:111+0]
v_mfma_f32_16x16x32_bf16 acc[108:111], v[vgprValuB_X0_I0+24+4:vgprValuB_X0_I0+24+4+3], v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], acc[108:111] // left value = acc[108+0:111+0]
v_mfma_f32_16x16x32_bf16 acc[108:111], v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], acc[108:111] // left value = acc[108+0:111+0]
v_mfma_f32_16x16x32_bf16 acc[112:115], v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprValuA_X0_I0+32+4:vgprValuA_X0_I0+32+4+3], acc[112:115] // left value = acc[112+0:115+0]
v_mfma_f32_16x16x32_bf16 acc[112:115], v[vgprValuB_X0_I0+24+4:vgprValuB_X0_I0+24+4+3], v[vgprValuA_X0_I0+32:vgprValuA_X0_I0+32+3], acc[112:115] // left value = acc[112+0:115+0]
v_mfma_f32_16x16x32_bf16 acc[112:115], v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprValuA_X0_I0+32:vgprValuA_X0_I0+32+3], acc[112:115] // left value = acc[112+0:115+0]
v_mfma_f32_16x16x32_bf16 acc[116:119], v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprValuA_X0_I0+40+4:vgprValuA_X0_I0+40+4+3], acc[116:119] // left value = acc[116+0:119+0]
v_mfma_f32_16x16x32_bf16 acc[116:119], v[vgprValuB_X0_I0+24+4:vgprValuB_X0_I0+24+4+3], v[vgprValuA_X0_I0+40:vgprValuA_X0_I0+40+3], acc[116:119] // left value = acc[116+0:119+0]
v_mfma_f32_16x16x32_bf16 acc[116:119], v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprValuA_X0_I0+40:vgprValuA_X0_I0+40+3], acc[116:119] // left value = acc[116+0:119+0]
v_mfma_f32_16x16x32_bf16 acc[120:123], v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprValuA_X0_I0+48+4:vgprValuA_X0_I0+48+4+3], acc[120:123] // left value = acc[120+0:123+0]
v_mfma_f32_16x16x32_bf16 acc[120:123], v[vgprValuB_X0_I0+24+4:vgprValuB_X0_I0+24+4+3], v[vgprValuA_X0_I0+48:vgprValuA_X0_I0+48+3], acc[120:123] // left value = acc[120+0:123+0]
v_mfma_f32_16x16x32_bf16 acc[120:123], v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprValuA_X0_I0+48:vgprValuA_X0_I0+48+3], acc[120:123] // left value = acc[120+0:123+0]
v_mfma_f32_16x16x32_bf16 acc[124:127], v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprValuA_X0_I0+56+4:vgprValuA_X0_I0+56+4+3], acc[124:127] // left value = acc[124+0:127+0]
v_mfma_f32_16x16x32_bf16 acc[124:127], v[vgprValuB_X0_I0+24+4:vgprValuB_X0_I0+24+4+3], v[vgprValuA_X0_I0+56:vgprValuA_X0_I0+56+3], acc[124:127] // left value = acc[124+0:127+0]
v_mfma_f32_16x16x32_bf16 acc[124:127], v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprValuA_X0_I0+56:vgprValuA_X0_I0+56+3], acc[124:127] // left value = acc[124+0:127+0]
v_mfma_f32_16x16x32_bf16 acc[128:131], v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprValuA_X0_I0+0+4:vgprValuA_X0_I0+0+4+3], acc[128:131] // left value = acc[128+0:131+0]
v_mfma_f32_16x16x32_bf16 acc[128:131], v[vgprValuB_X0_I0+32+4:vgprValuB_X0_I0+32+4+3], v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], acc[128:131] // left value = acc[128+0:131+0]
v_mfma_f32_16x16x32_bf16 acc[128:131], v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], acc[128:131] // left value = acc[128+0:131+0]
v_mfma_f32_16x16x32_bf16 acc[132:135], v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprValuA_X0_I0+8+4:vgprValuA_X0_I0+8+4+3], acc[132:135] // left value = acc[132+0:135+0]
v_mfma_f32_16x16x32_bf16 acc[132:135], v[vgprValuB_X0_I0+32+4:vgprValuB_X0_I0+32+4+3], v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], acc[132:135] // left value = acc[132+0:135+0]
v_mfma_f32_16x16x32_bf16 acc[132:135], v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], acc[132:135] // left value = acc[132+0:135+0]
v_mfma_f32_16x16x32_bf16 acc[136:139], v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprValuA_X0_I0+16+4:vgprValuA_X0_I0+16+4+3], acc[136:139] // left value = acc[136+0:139+0]
v_mfma_f32_16x16x32_bf16 acc[136:139], v[vgprValuB_X0_I0+32+4:vgprValuB_X0_I0+32+4+3], v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], acc[136:139] // left value = acc[136+0:139+0]
v_mfma_f32_16x16x32_bf16 acc[136:139], v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], acc[136:139] // left value = acc[136+0:139+0]
v_mfma_f32_16x16x32_bf16 acc[140:143], v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprValuA_X0_I0+24+4:vgprValuA_X0_I0+24+4+3], acc[140:143] // left value = acc[140+0:143+0]
v_mfma_f32_16x16x32_bf16 acc[140:143], v[vgprValuB_X0_I0+32+4:vgprValuB_X0_I0+32+4+3], v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], acc[140:143] // left value = acc[140+0:143+0]
v_mfma_f32_16x16x32_bf16 acc[140:143], v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], acc[140:143] // left value = acc[140+0:143+0]
v_mfma_f32_16x16x32_bf16 acc[144:147], v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprValuA_X0_I0+32+4:vgprValuA_X0_I0+32+4+3], acc[144:147] // left value = acc[144+0:147+0]
v_mfma_f32_16x16x32_bf16 acc[144:147], v[vgprValuB_X0_I0+32+4:vgprValuB_X0_I0+32+4+3], v[vgprValuA_X0_I0+32:vgprValuA_X0_I0+32+3], acc[144:147] // left value = acc[144+0:147+0]
v_mfma_f32_16x16x32_bf16 acc[144:147], v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprValuA_X0_I0+32:vgprValuA_X0_I0+32+3], acc[144:147] // left value = acc[144+0:147+0]
v_mfma_f32_16x16x32_bf16 acc[148:151], v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprValuA_X0_I0+40+4:vgprValuA_X0_I0+40+4+3], acc[148:151] // left value = acc[148+0:151+0]
v_mfma_f32_16x16x32_bf16 acc[148:151], v[vgprValuB_X0_I0+32+4:vgprValuB_X0_I0+32+4+3], v[vgprValuA_X0_I0+40:vgprValuA_X0_I0+40+3], acc[148:151] // left value = acc[148+0:151+0]
v_mfma_f32_16x16x32_bf16 acc[148:151], v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprValuA_X0_I0+40:vgprValuA_X0_I0+40+3], acc[148:151] // left value = acc[148+0:151+0]
v_mfma_f32_16x16x32_bf16 acc[152:155], v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprValuA_X0_I0+48+4:vgprValuA_X0_I0+48+4+3], acc[152:155] // left value = acc[152+0:155+0]
v_mfma_f32_16x16x32_bf16 acc[152:155], v[vgprValuB_X0_I0+32+4:vgprValuB_X0_I0+32+4+3], v[vgprValuA_X0_I0+48:vgprValuA_X0_I0+48+3], acc[152:155] // left value = acc[152+0:155+0]
v_mfma_f32_16x16x32_bf16 acc[152:155], v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprValuA_X0_I0+48:vgprValuA_X0_I0+48+3], acc[152:155] // left value = acc[152+0:155+0]
v_mfma_f32_16x16x32_bf16 acc[156:159], v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprValuA_X0_I0+56+4:vgprValuA_X0_I0+56+4+3], acc[156:159] // left value = acc[156+0:159+0]
v_mfma_f32_16x16x32_bf16 acc[156:159], v[vgprValuB_X0_I0+32+4:vgprValuB_X0_I0+32+4+3], v[vgprValuA_X0_I0+56:vgprValuA_X0_I0+56+3], acc[156:159] // left value = acc[156+0:159+0]
v_mfma_f32_16x16x32_bf16 acc[156:159], v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprValuA_X0_I0+56:vgprValuA_X0_I0+56+3], acc[156:159] // left value = acc[156+0:159+0]
v_mfma_f32_16x16x32_bf16 acc[160:163], v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprValuA_X0_I0+0+4:vgprValuA_X0_I0+0+4+3], acc[160:163] // left value = acc[160+0:163+0]
v_mfma_f32_16x16x32_bf16 acc[160:163], v[vgprValuB_X0_I0+40+4:vgprValuB_X0_I0+40+4+3], v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], acc[160:163] // left value = acc[160+0:163+0]
v_mfma_f32_16x16x32_bf16 acc[160:163], v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], acc[160:163] // left value = acc[160+0:163+0]
v_mfma_f32_16x16x32_bf16 acc[164:167], v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprValuA_X0_I0+8+4:vgprValuA_X0_I0+8+4+3], acc[164:167] // left value = acc[164+0:167+0]
v_mfma_f32_16x16x32_bf16 acc[164:167], v[vgprValuB_X0_I0+40+4:vgprValuB_X0_I0+40+4+3], v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], acc[164:167] // left value = acc[164+0:167+0]
v_mfma_f32_16x16x32_bf16 acc[164:167], v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], acc[164:167] // left value = acc[164+0:167+0]
v_mfma_f32_16x16x32_bf16 acc[168:171], v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprValuA_X0_I0+16+4:vgprValuA_X0_I0+16+4+3], acc[168:171] // left value = acc[168+0:171+0]
v_mfma_f32_16x16x32_bf16 acc[168:171], v[vgprValuB_X0_I0+40+4:vgprValuB_X0_I0+40+4+3], v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], acc[168:171] // left value = acc[168+0:171+0]
v_mfma_f32_16x16x32_bf16 acc[168:171], v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], acc[168:171] // left value = acc[168+0:171+0]
v_mfma_f32_16x16x32_bf16 acc[172:175], v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprValuA_X0_I0+24+4:vgprValuA_X0_I0+24+4+3], acc[172:175] // left value = acc[172+0:175+0]
v_mfma_f32_16x16x32_bf16 acc[172:175], v[vgprValuB_X0_I0+40+4:vgprValuB_X0_I0+40+4+3], v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], acc[172:175] // left value = acc[172+0:175+0]
v_mfma_f32_16x16x32_bf16 acc[172:175], v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], acc[172:175] // left value = acc[172+0:175+0]
v_mfma_f32_16x16x32_bf16 acc[176:179], v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprValuA_X0_I0+32+4:vgprValuA_X0_I0+32+4+3], acc[176:179] // left value = acc[176+0:179+0]
v_mfma_f32_16x16x32_bf16 acc[176:179], v[vgprValuB_X0_I0+40+4:vgprValuB_X0_I0+40+4+3], v[vgprValuA_X0_I0+32:vgprValuA_X0_I0+32+3], acc[176:179] // left value = acc[176+0:179+0]
v_mfma_f32_16x16x32_bf16 acc[176:179], v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprValuA_X0_I0+32:vgprValuA_X0_I0+32+3], acc[176:179] // left value = acc[176+0:179+0]
v_mfma_f32_16x16x32_bf16 acc[180:183], v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprValuA_X0_I0+40+4:vgprValuA_X0_I0+40+4+3], acc[180:183] // left value = acc[180+0:183+0]
v_mfma_f32_16x16x32_bf16 acc[180:183], v[vgprValuB_X0_I0+40+4:vgprValuB_X0_I0+40+4+3], v[vgprValuA_X0_I0+40:vgprValuA_X0_I0+40+3], acc[180:183] // left value = acc[180+0:183+0]
v_mfma_f32_16x16x32_bf16 acc[180:183], v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprValuA_X0_I0+40:vgprValuA_X0_I0+40+3], acc[180:183] // left value = acc[180+0:183+0]
v_mfma_f32_16x16x32_bf16 acc[184:187], v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprValuA_X0_I0+48+4:vgprValuA_X0_I0+48+4+3], acc[184:187] // left value = acc[184+0:187+0]
v_mfma_f32_16x16x32_bf16 acc[184:187], v[vgprValuB_X0_I0+40+4:vgprValuB_X0_I0+40+4+3], v[vgprValuA_X0_I0+48:vgprValuA_X0_I0+48+3], acc[184:187] // left value = acc[184+0:187+0]
v_mfma_f32_16x16x32_bf16 acc[184:187], v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprValuA_X0_I0+48:vgprValuA_X0_I0+48+3], acc[184:187] // left value = acc[184+0:187+0]
v_mfma_f32_16x16x32_bf16 acc[188:191], v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprValuA_X0_I0+56+4:vgprValuA_X0_I0+56+4+3], acc[188:191] // left value = acc[188+0:191+0]
v_mfma_f32_16x16x32_bf16 acc[188:191], v[vgprValuB_X0_I0+40+4:vgprValuB_X0_I0+40+4+3], v[vgprValuA_X0_I0+56:vgprValuA_X0_I0+56+3], acc[188:191] // left value = acc[188+0:191+0]
v_mfma_f32_16x16x32_bf16 acc[188:191], v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprValuA_X0_I0+56:vgprValuA_X0_I0+56+3], acc[188:191] // left value = acc[188+0:191+0]
v_mfma_f32_16x16x32_bf16 acc[192:195], v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprValuA_X0_I0+0+4:vgprValuA_X0_I0+0+4+3], acc[192:195] // left value = acc[192+0:195+0]
v_mfma_f32_16x16x32_bf16 acc[192:195], v[vgprValuB_X0_I0+48+4:vgprValuB_X0_I0+48+4+3], v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], acc[192:195] // left value = acc[192+0:195+0]
v_mfma_f32_16x16x32_bf16 acc[192:195], v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], acc[192:195] // left value = acc[192+0:195+0]
v_mfma_f32_16x16x32_bf16 acc[196:199], v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprValuA_X0_I0+8+4:vgprValuA_X0_I0+8+4+3], acc[196:199] // left value = acc[196+0:199+0]
v_mfma_f32_16x16x32_bf16 acc[196:199], v[vgprValuB_X0_I0+48+4:vgprValuB_X0_I0+48+4+3], v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], acc[196:199] // left value = acc[196+0:199+0]
v_mfma_f32_16x16x32_bf16 acc[196:199], v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], acc[196:199] // left value = acc[196+0:199+0]
v_mfma_f32_16x16x32_bf16 acc[200:203], v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprValuA_X0_I0+16+4:vgprValuA_X0_I0+16+4+3], acc[200:203] // left value = acc[200+0:203+0]
v_mfma_f32_16x16x32_bf16 acc[200:203], v[vgprValuB_X0_I0+48+4:vgprValuB_X0_I0+48+4+3], v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], acc[200:203] // left value = acc[200+0:203+0]
v_mfma_f32_16x16x32_bf16 acc[200:203], v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], acc[200:203] // left value = acc[200+0:203+0]
v_mfma_f32_16x16x32_bf16 acc[204:207], v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprValuA_X0_I0+24+4:vgprValuA_X0_I0+24+4+3], acc[204:207] // left value = acc[204+0:207+0]
v_mfma_f32_16x16x32_bf16 acc[204:207], v[vgprValuB_X0_I0+48+4:vgprValuB_X0_I0+48+4+3], v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], acc[204:207] // left value = acc[204+0:207+0]
v_mfma_f32_16x16x32_bf16 acc[204:207], v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], acc[204:207] // left value = acc[204+0:207+0]
v_mfma_f32_16x16x32_bf16 acc[208:211], v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprValuA_X0_I0+32+4:vgprValuA_X0_I0+32+4+3], acc[208:211] // left value = acc[208+0:211+0]
v_mfma_f32_16x16x32_bf16 acc[208:211], v[vgprValuB_X0_I0+48+4:vgprValuB_X0_I0+48+4+3], v[vgprValuA_X0_I0+32:vgprValuA_X0_I0+32+3], acc[208:211] // left value = acc[208+0:211+0]
v_mfma_f32_16x16x32_bf16 acc[208:211], v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprValuA_X0_I0+32:vgprValuA_X0_I0+32+3], acc[208:211] // left value = acc[208+0:211+0]
v_mfma_f32_16x16x32_bf16 acc[212:215], v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprValuA_X0_I0+40+4:vgprValuA_X0_I0+40+4+3], acc[212:215] // left value = acc[212+0:215+0]
v_mfma_f32_16x16x32_bf16 acc[212:215], v[vgprValuB_X0_I0+48+4:vgprValuB_X0_I0+48+4+3], v[vgprValuA_X0_I0+40:vgprValuA_X0_I0+40+3], acc[212:215] // left value = acc[212+0:215+0]
v_mfma_f32_16x16x32_bf16 acc[212:215], v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprValuA_X0_I0+40:vgprValuA_X0_I0+40+3], acc[212:215] // left value = acc[212+0:215+0]
v_mfma_f32_16x16x32_bf16 acc[216:219], v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprValuA_X0_I0+48+4:vgprValuA_X0_I0+48+4+3], acc[216:219] // left value = acc[216+0:219+0]
v_mfma_f32_16x16x32_bf16 acc[216:219], v[vgprValuB_X0_I0+48+4:vgprValuB_X0_I0+48+4+3], v[vgprValuA_X0_I0+48:vgprValuA_X0_I0+48+3], acc[216:219] // left value = acc[216+0:219+0]
v_mfma_f32_16x16x32_bf16 acc[216:219], v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprValuA_X0_I0+48:vgprValuA_X0_I0+48+3], acc[216:219] // left value = acc[216+0:219+0]
v_mfma_f32_16x16x32_bf16 acc[220:223], v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprValuA_X0_I0+56+4:vgprValuA_X0_I0+56+4+3], acc[220:223] // left value = acc[220+0:223+0]
v_mfma_f32_16x16x32_bf16 acc[220:223], v[vgprValuB_X0_I0+48+4:vgprValuB_X0_I0+48+4+3], v[vgprValuA_X0_I0+56:vgprValuA_X0_I0+56+3], acc[220:223] // left value = acc[220+0:223+0]
v_mfma_f32_16x16x32_bf16 acc[220:223], v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprValuA_X0_I0+56:vgprValuA_X0_I0+56+3], acc[220:223] // left value = acc[220+0:223+0]
v_mfma_f32_16x16x32_bf16 acc[224:227], v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprValuA_X0_I0+0+4:vgprValuA_X0_I0+0+4+3], acc[224:227] // left value = acc[224+0:227+0]
v_mfma_f32_16x16x32_bf16 acc[224:227], v[vgprValuB_X0_I0+56+4:vgprValuB_X0_I0+56+4+3], v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], acc[224:227] // left value = acc[224+0:227+0]
v_mfma_f32_16x16x32_bf16 acc[224:227], v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], acc[224:227] // left value = acc[224+0:227+0]
v_mfma_f32_16x16x32_bf16 acc[228:231], v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprValuA_X0_I0+8+4:vgprValuA_X0_I0+8+4+3], acc[228:231] // left value = acc[228+0:231+0]
v_mfma_f32_16x16x32_bf16 acc[228:231], v[vgprValuB_X0_I0+56+4:vgprValuB_X0_I0+56+4+3], v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], acc[228:231] // left value = acc[228+0:231+0]
v_mfma_f32_16x16x32_bf16 acc[228:231], v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], acc[228:231] // left value = acc[228+0:231+0]
v_mfma_f32_16x16x32_bf16 acc[232:235], v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprValuA_X0_I0+16+4:vgprValuA_X0_I0+16+4+3], acc[232:235] // left value = acc[232+0:235+0]
v_mfma_f32_16x16x32_bf16 acc[232:235], v[vgprValuB_X0_I0+56+4:vgprValuB_X0_I0+56+4+3], v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], acc[232:235] // left value = acc[232+0:235+0]
v_mfma_f32_16x16x32_bf16 acc[232:235], v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], acc[232:235] // left value = acc[232+0:235+0]
v_mfma_f32_16x16x32_bf16 acc[236:239], v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprValuA_X0_I0+24+4:vgprValuA_X0_I0+24+4+3], acc[236:239] // left value = acc[236+0:239+0]
v_mfma_f32_16x16x32_bf16 acc[236:239], v[vgprValuB_X0_I0+56+4:vgprValuB_X0_I0+56+4+3], v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], acc[236:239] // left value = acc[236+0:239+0]
v_mfma_f32_16x16x32_bf16 acc[236:239], v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], acc[236:239] // left value = acc[236+0:239+0]
v_mfma_f32_16x16x32_bf16 acc[240:243], v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprValuA_X0_I0+32+4:vgprValuA_X0_I0+32+4+3], acc[240:243] // left value = acc[240+0:243+0]
v_mfma_f32_16x16x32_bf16 acc[240:243], v[vgprValuB_X0_I0+56+4:vgprValuB_X0_I0+56+4+3], v[vgprValuA_X0_I0+32:vgprValuA_X0_I0+32+3], acc[240:243] // left value = acc[240+0:243+0]
v_mfma_f32_16x16x32_bf16 acc[240:243], v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprValuA_X0_I0+32:vgprValuA_X0_I0+32+3], acc[240:243] // left value = acc[240+0:243+0]
v_mfma_f32_16x16x32_bf16 acc[244:247], v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprValuA_X0_I0+40+4:vgprValuA_X0_I0+40+4+3], acc[244:247] // left value = acc[244+0:247+0]
v_mfma_f32_16x16x32_bf16 acc[244:247], v[vgprValuB_X0_I0+56+4:vgprValuB_X0_I0+56+4+3], v[vgprValuA_X0_I0+40:vgprValuA_X0_I0+40+3], acc[244:247] // left value = acc[244+0:247+0]
v_mfma_f32_16x16x32_bf16 acc[244:247], v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprValuA_X0_I0+40:vgprValuA_X0_I0+40+3], acc[244:247] // left value = acc[244+0:247+0]
v_mfma_f32_16x16x32_bf16 acc[248:251], v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprValuA_X0_I0+48+4:vgprValuA_X0_I0+48+4+3], acc[248:251] // left value = acc[248+0:251+0]
v_mfma_f32_16x16x32_bf16 acc[248:251], v[vgprValuB_X0_I0+56+4:vgprValuB_X0_I0+56+4+3], v[vgprValuA_X0_I0+48:vgprValuA_X0_I0+48+3], acc[248:251] // left value = acc[248+0:251+0]
v_mfma_f32_16x16x32_bf16 acc[248:251], v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprValuA_X0_I0+48:vgprValuA_X0_I0+48+3], acc[248:251] // left value = acc[248+0:251+0]
v_mfma_f32_16x16x32_bf16 acc[252:255], v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprValuA_X0_I0+56+4:vgprValuA_X0_I0+56+4+3], acc[252:255] // left value = acc[252+0:255+0]
v_mfma_f32_16x16x32_bf16 acc[252:255], v[vgprValuB_X0_I0+56+4:vgprValuB_X0_I0+56+4+3], v[vgprValuA_X0_I0+56:vgprValuA_X0_I0+56+3], acc[252:255] // left value = acc[252+0:255+0]
v_mfma_f32_16x16x32_bf16 acc[252:255], v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprValuA_X0_I0+56:vgprValuA_X0_I0+56+3], acc[252:255] // left value = acc[252+0:255+0]

/* closeLoop loopL finalLoop=1 tailLoop=1 */
/* Tail-loop back edge. Every pass steps two counters by 0x20:              */
/*   LoopCounterL     counts down and drives the exit test;                 */
/*   OrigLoopCounter  counts up and, in this section, is consumed only by   */
/*                    the address restore after label_TailLoopEndL.         */
s_sub_i32 s[sgprLoopCounterL], s[sgprLoopCounterL], 0x20 // dec counterL (tailLoop): counterL -= 0x20
s_add_u32 s[sgprOrigLoopCounter], s[sgprOrigLoopCounter], 0x20 // OrigLoopCounter += 0x20 (running total of the steps taken; not counterL)
s_cmp_le_i32 s[sgprLoopCounterL], 0x0              // SCC = (counterL <= 0), signed compare; overwrites SCC from the add above
s_cbranch_scc0 label_TailLoopBeginL                // SCC == 0 (counterL > 0): restart LoopL
label_TailLoopEndL:
/* Rewind the local-read addresses, using s83 as scalar scratch:            */
/*   LocalReadAddrA -= OrigLoopCounter * 1024                               */
/*   LocalReadAddrB -= OrigLoopCounter * 4                                  */
/* NOTE(review): this is only an exact undo if OrigLoopCounter was 0 when   */
/* the tail loop started and the loop body advanced A/B by 1024 and 4 bytes */
/* per counted unit -- neither is visible in this section; confirm.         */
s_mov_b32 s83, 1024                                // per-unit rewind factor for A (tailloop lds offset)
s_mul_i32 s83, s[sgprOrigLoopCounter], s83         // s83 = OrigLoopCounter * 1024
v_sub_u32 v[vgprLocalReadAddrA], v[vgprLocalReadAddrA], s83 // remove lro damage from A
s_mov_b32 s83, 4                                   // per-unit rewind factor for B (tailloop lds offset)
s_mul_i32 s83, s[sgprOrigLoopCounter], s83         // s83 = OrigLoopCounter * 4
v_sub_u32 v[vgprLocalReadAddrB], v[vgprLocalReadAddrB], s83 // remove lro damage from B
label_SkipTailLoopL:
/* Rebind the ValuA/ValuB register symbols to UNDEF: the operand registers  */
/* are not referenced under these names past this point.                    */
/* (UNDEF is a symbol defined outside this section.)                        */
.set vgprValuA_X0_I0_BASE, UNDEF
.set vgprValuA_X0_I0, UNDEF
.set vgprValuB_X0_I0_BASE, UNDEF
.set vgprValuB_X0_I0, UNDEF

/* Tail: add MISC Vgpr [0...18) to pool */
/* End of summation. This section                                           */
/*   1. rebinds the main-loop SGPR symbols to UNDEF,                        */
/*   2. binds the epilogue SGPR symbols to s64..s72,                        */
/*   3. fills s[64:72] from the kernarg segment, using one of two layouts   */
/*      selected by ArgType.                                                */
label_Summation_End_DZOUDPYJU2HHRCOQ:
.set sgprLoopCounterL, UNDEF
.set sgprOrigLoopCounter, UNDEF
.set sgprSrdA, UNDEF
.set sgprSrdB, UNDEF
.set sgprShadowLimitA, UNDEF
.set sgprShadowLimitB, UNDEF
.set sgprStaggerUIter, UNDEF
.set sgprWrapUA, UNDEF
.set sgprWrapUB, UNDEF
.set sgprGlobalReadIncsA, UNDEF
.set sgprGlobalReadIncsB, UNDEF
/* load store sgprs */
/* Destination map for the loads below:                                     */
/*   s[64:65] AddressScaleAlphaVec   s[66:67] AddressBias                   */
/*   s68 BiasType   s69 BiasStride   s70 activationAlpha                    */
/*   s71 activationBeta              s72 ActivationType                     */
/* NOTE(review): presumably s64..s72 reuse registers released by the UNDEF  */
/* block above -- the previous numeric bindings are not visible; confirm.   */
.set sgprAddressScaleAlphaVec, 64
.set sgprAddressBias, 66
.set sgprBiasType, 68
.set sgprBiasStride, 69
.set sgpractivationAlpha, 70
.set sgpractivationBeta, 71
.set sgprActivationType, 72
/* Select the kernarg layout by argument type (the code tests ArgType == 2; */
/* it does not test a pointer for null).                                    */
s_cmp_eq_u32 s[sgprArgType], 2                     // ArgType == 2 ?
s_cbranch_scc1 label_LoadExternalEpilogueStruct    // branch if ArgType == 2
/* ArgType != 2: the nine dwords are contiguous at byte offsets 124..159.   */
s_load_dwordx8 s[64:71], s[sgprKernArgAddress:sgprKernArgAddress+1], 124 // offset 124: s[64:71]
s_load_dword s72, s[sgprKernArgAddress:sgprKernArgAddress+1], 156 // offset 156: s72 (ActivationType)
s_branch label_LoadExternalEpilogueStructEnd
label_LoadExternalEpilogueStruct:
/* ArgType == 2: same destinations, different offsets. Bytes 204..219 are   */
/* not loaded here (gap between the second and third load).                 */
s_load_dwordx4 s[64:67], s[sgprKernArgAddress:sgprKernArgAddress+1], 180 // offset 180: AddressScaleAlphaVec, AddressBias
s_load_dwordx2 s[68:69], s[sgprKernArgAddress:sgprKernArgAddress+1], 196 // offset 196: BiasType, BiasStride
s_load_dwordx2 s[70:71], s[sgprKernArgAddress:sgprKernArgAddress+1], 220 // offset 220: activationAlpha, activationBeta
s_load_dword s72, s[sgprKernArgAddress:sgprKernArgAddress+1], 228 // offset 228: ActivationType
label_LoadExternalEpilogueStructEnd:
/* No s_waitcnt is issued in this section: the scalar loads above are still */
/* in flight here, so any reader of s[64:72] must wait on lgkmcnt first.    */
/* The two symbols below only name SGPR bases; nothing is written to them   */
/* in this section.                                                         */
.set sgprSrdScaleAlphaVec, 76
.set sgprSrdBias, 80

/* Mapping of Acc register -> C Vgpr register */

/* shift vector components d0 */
/* Computes the per-lane selectors for the d0 shift routines, then          */
/* dispatches on the remainder v3 % 4. Values left behind for the targets:  */
/*   v3 = min(Size0 - wg*0x100, 0x100), forced to 0x100 on lanes whose      */
/*        (Serial/64)%2 differs from (v3/64)%2                              */
/*   v4 = mbReg  = (v3 >> 6) - ((Serial >> 6) & 1)                          */
/*   v6 = gbReg - tgbReg = (v3 >> 2) - ((Serial & 15) + 16*((Serial>>6)&1)) */
/*   v5 = vwReg  = (v3 & 3) >> 2                                            */
/*   v7 = v3 & 3                                                            */
/* Clobbers v3..v7, vcc and s[8:9].                                         */
v_mov_b32 v3, s[sgprWorkGroup0]                    // v3 = wg
v_mul_i32_i24 v3, -0x100, v3                       // v3 = -(wg*MT); 24-bit multiply, MT = 0x100
v_add_co_u32 v3, vcc, s[sgprSizesFree+0], v3       // wgMT = Size - wg*MT (carry-out lands in vcc, unused)
v_mov_b32 v4, 0x100                                // MT
v_cmp_lt_u32 s[8:9], v3, v4                        // wgMT < MT
v_cndmask_b32 v3, v4, v3, s[8:9]                   // wgMT = (wgMT < MT) ? wgMT : MT
v_lshrrev_b32 v5, 6, v[vgprSerial]                 // 5 = Serial / 64
v_and_b32 v5, 1, v5                                // v5 = v5 % 2
v_lshrrev_b32 v6, 6, v3                            // 6 = 3 / 64
v_and_b32 v6, 1, v6                                // v6 = v6 % 2
v_cmp_eq_u32 s[8:9], v6, v5                        // wave_id == block_belong_to_wave?
v_cndmask_b32 v3, v4, v3, s[8:9]                   // wgMT = (parity matches) ? wgMT : MT; MT % 4 == 0, so non-matching lanes dispatch to "no shifting"

/* mbReg: which mb block need to shift, mb(matrixInstCoal(16) * VectorWidth(4)) */
v_lshrrev_b32 v4, 6, v3                            // 4 = 3 / 64
v_lshlrev_b32 v6, 0, v5                            // v6 = v5 * 1
v_sub_u32 v4, v4, v6                               // mbReg = v3/64 - (Serial/64)%2

/* gbReg: glvw block id */
v_lshrrev_b32 v6, 2, v3                            // 6 = 3 / 4

/* tgbReg: glvw block id */
v_lshrrev_b32 v7, 0, v[vgprSerial]                 // 7 = Serial / 1
v_and_b32 v7, 15, v7                               // v7 = v7 % 16
v_lshlrev_b32 v7, 2, v7                            // v7 = v7 * 4
v_lshrrev_b32 v7, 2, v7                            // 7 = 7 / 4
v_lshlrev_b32 v5, 4, v5                            // v5 = v5 * 16
v_add_co_u32 v7, vcc, v5, v7                       // tgbReg = (tid_coal * continOut) / GLVW
v_sub_u32 v6, v6, v7                               // v6 = gbReg - tgbReg (tested against 0 by the shift routines)

/* vwReg: glvw in which vw block? */
v_and_b32 v5, 3, v3                                // v5 = v3 % 4 (replaces the wave-parity value held in v5)
v_lshrrev_b32 v5, 2, v5                            // v5 = v5 / 4; always 0, because v5 <= 3 here

/* rReg : reminder of M_size % GlobalReadVectorWidth */
/* Each s_cbranch_vccnz below is taken when at least one active lane        */
/* matched the preceding compare; if none of 1/2/3 match, take GLVW0.       */
v_and_b32 v7, 3, v3                                // v7 = v3 % 4
v_cmp_eq_u32 vcc, v7, 0x1                          // wgMT%VW == 1
s_cbranch_vccnz label_ShiftVectorComponents0_GLVW1 // branch to shift d0 r=1
v_cmp_eq_u32 vcc, v7, 0x2                          // wgMT%VW == 2
s_cbranch_vccnz label_ShiftVectorComponents0_GLVW2 // branch to shift d0 r=2
v_cmp_eq_u32 vcc, v7, 0x3                          // wgMT%VW == 3
s_cbranch_vccnz label_ShiftVectorComponents0_GLVW3 // branch to shift d0 r=3
s_branch label_ShiftVectorComponents0_GLVW0        // no shifting

/******************************************/
/* shift d0 r=1                           */
/******************************************/
/* Second-level dispatch on v4 (mbReg): 0 -> _BM0, 2 -> _BM1.               */
/* Per lane, a non-zero remainder (v3 % 4) means v3 < 0x100 and             */
/* (v3 >> 6) has the same low bit as the value subtracted from it, so v4    */
/* is 0 or 2 on the lanes that selected this path.                          */
/* NOTE(review): no branch follows the last test; if neither compare sets   */
/* any vcc bit, control falls through into the r=2 tests below. Presumably  */
/* unreachable -- confirm.                                                  */
label_ShiftVectorComponents0_GLVW1:
v_cmp_eq_u32 vcc, v4, 0x0                          // mbReg == 0 ?
s_cbranch_vccnz label_ShiftVectorComponents0_GLVW1_BM0 // branch to shift d0 r1 mb0
v_cmp_eq_u32 vcc, v4, 0x2                          // mbReg == 2 ?
s_cbranch_vccnz label_ShiftVectorComponents0_GLVW1_BM1 // branch to shift d0 r1 mb1

/******************************************/
/* shift d0 r=2                           */
/******************************************/
/* Same dispatch for remainder 2; falls through into the r=3 tests if       */
/* neither compare matches on any lane.                                     */
label_ShiftVectorComponents0_GLVW2:
v_cmp_eq_u32 vcc, v4, 0x0                          // mbReg == 0 ?
s_cbranch_vccnz label_ShiftVectorComponents0_GLVW2_BM0 // branch to shift d0 r2 mb0
v_cmp_eq_u32 vcc, v4, 0x2                          // mbReg == 2 ?
s_cbranch_vccnz label_ShiftVectorComponents0_GLVW2_BM1 // branch to shift d0 r2 mb1

/******************************************/
/* shift d0 r=3                           */
/******************************************/
/* Same dispatch for remainder 3; if neither compare matches on any lane,   */
/* control falls through into the "r=1 mb=0" section that follows.          */
label_ShiftVectorComponents0_GLVW3:
v_cmp_eq_u32 vcc, v4, 0x0                          // mbReg == 0 ?
s_cbranch_vccnz label_ShiftVectorComponents0_GLVW3_BM0 // branch to shift d0 r3 mb0
v_cmp_eq_u32 vcc, v4, 0x2                          // mbReg == 2 ?
s_cbranch_vccnz label_ShiftVectorComponents0_GLVW3_BM1 // branch to shift d0 r3 mb1

/******************************************/
/* shift d0 r=1 mb=0                      */
/******************************************/
label_ShiftVectorComponents0_GLVW1_BM0:  /// r1 mb0
v_cmp_eq_u32 vcc, v5, 0x0
s_cbranch_vccnz label_ShiftVectorComponents0_GLVW1_BM0_VW0 // branch to shift d0 r1 mb0 vw0

/******************************************/
/* shift d0 r=1 mb=1                      */
/******************************************/
label_ShiftVectorComponents0_GLVW1_BM1:  /// r1 mb1
v_cmp_eq_u32 vcc, v5, 0x0
s_cbranch_vccnz label_ShiftVectorComponents0_GLVW1_BM1_VW0 // branch to shift d0 r1 mb1 vw0

/******************************************/
/* shift d0 r=2 mb=0                      */
/******************************************/
label_ShiftVectorComponents0_GLVW2_BM0:  /// r2 mb0
v_cmp_eq_u32 vcc, v5, 0x0
s_cbranch_vccnz label_ShiftVectorComponents0_GLVW2_BM0_VW0 // branch to shift d0 r2 mb0 vw0

/******************************************/
/* shift d0 r=2 mb=1                      */
/******************************************/
label_ShiftVectorComponents0_GLVW2_BM1:  /// r2 mb1
v_cmp_eq_u32 vcc, v5, 0x0
s_cbranch_vccnz label_ShiftVectorComponents0_GLVW2_BM1_VW0 // branch to shift d0 r2 mb1 vw0

/******************************************/
/* shift d0 r=3 mb=0                      */
/******************************************/
label_ShiftVectorComponents0_GLVW3_BM0:  /// r3 mb0
// Second-level dispatch for r=3, mb=0: v5 selects the "vw" variant.
// Only v5 == 0 is tested; the branch is taken when any active lane matches.
v_cmp_eq_u32 vcc, v5, 0x0
s_cbranch_vccnz label_ShiftVectorComponents0_GLVW3_BM0_VW0 // branch to shift d0 r3 mb0 vw0
// NOTE(review): a non-zero v5 falls through into the code that follows this
// block; presumably v5 is always 0 at this point - confirm.

/******************************************/
/* shift d0 r=3 mb=1                      */
/******************************************/
label_ShiftVectorComponents0_GLVW3_BM1:  /// r3 mb1
// Second-level dispatch for r=3, mb=1: v5 selects the "vw" variant.
// Only v5 == 0 is tested; the branch is taken when any active lane matches.
v_cmp_eq_u32 vcc, v5, 0x0
s_cbranch_vccnz label_ShiftVectorComponents0_GLVW3_BM1_VW0 // branch to shift d0 r3 mb1 vw0
// NOTE(review): a non-zero v5 falls through into the code that follows this
// block (the r=1 mb=0 vw0 copy pass); presumably v5 is always 0 at this
// point - confirm.

/******************************************/
/* shift d0 r=1 mb=0 vw0                  */
/******************************************/
label_ShiftVectorComponents0_GLVW1_BM0_VW0:  /// r1 mb0 vw0
// Copy pass for the r=1 / mb=0 / vw0 case.
//   1. Narrow EXEC to the lanes whose v6 equals 0 (v_cmpx updates EXEC in
//      addition to writing the result mask to s[8:9]).
//   2. For each of the 32 "tt1" entries, copy one accumulator register down
//      by 12 (acc[n+12] -> acc[n]) through the temporary v7. Only lanes
//      enabled in step 1 are modified.
//   3. Re-enable all lanes and jump to label_ShiftVectorComponents0_GLVW0.
// Scratch registers written here: s[8:9], v0, v7, vcc.
s_mov_b32 s8, 0
v_cmpx_eq_u32 s[8:9], v6, s8                       // is thread in edge glvw region
// v0 = (serial & 63) << 2. v0 is not read again inside this block.
// NOTE(review): confirm whether any later code depends on this value.
v_and_b32 v0, 63, v[vgprSerial]                    // permute register between threads
v_lshlrev_b32 v0, 2, v0                            // permute register between threads
// tt1 entries 0..15: sources acc12..acc111, destinations acc0..acc99.
v_accvgpr_read_b32 v7, acc12                       // glvw 1 mb 0 tt1 0 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc0, v7
v_accvgpr_read_b32 v7, acc44                       // glvw 1 mb 0 tt1 1 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc32, v7
v_accvgpr_read_b32 v7, acc76                       // glvw 1 mb 0 tt1 2 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc64, v7
v_accvgpr_read_b32 v7, acc108                      // glvw 1 mb 0 tt1 3 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc96, v7
v_accvgpr_read_b32 v7, acc13                       // glvw 1 mb 0 tt1 4 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc1, v7
v_accvgpr_read_b32 v7, acc45                       // glvw 1 mb 0 tt1 5 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc33, v7
v_accvgpr_read_b32 v7, acc77                       // glvw 1 mb 0 tt1 6 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc65, v7
v_accvgpr_read_b32 v7, acc109                      // glvw 1 mb 0 tt1 7 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc97, v7
v_accvgpr_read_b32 v7, acc14                       // glvw 1 mb 0 tt1 8 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc2, v7
v_accvgpr_read_b32 v7, acc46                       // glvw 1 mb 0 tt1 9 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc34, v7
v_accvgpr_read_b32 v7, acc78                       // glvw 1 mb 0 tt1 10 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc66, v7
v_accvgpr_read_b32 v7, acc110                      // glvw 1 mb 0 tt1 11 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc98, v7
v_accvgpr_read_b32 v7, acc15                       // glvw 1 mb 0 tt1 12 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc3, v7
v_accvgpr_read_b32 v7, acc47                       // glvw 1 mb 0 tt1 13 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc35, v7
v_accvgpr_read_b32 v7, acc79                       // glvw 1 mb 0 tt1 14 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc67, v7
v_accvgpr_read_b32 v7, acc111                      // glvw 1 mb 0 tt1 15 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc99, v7
// tt1 entries 16..31: same pattern with every accumulator index raised by 128.
v_accvgpr_read_b32 v7, acc140                      // glvw 1 mb 0 tt1 16 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc128, v7
v_accvgpr_read_b32 v7, acc172                      // glvw 1 mb 0 tt1 17 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc160, v7
v_accvgpr_read_b32 v7, acc204                      // glvw 1 mb 0 tt1 18 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc192, v7
v_accvgpr_read_b32 v7, acc236                      // glvw 1 mb 0 tt1 19 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc224, v7
v_accvgpr_read_b32 v7, acc141                      // glvw 1 mb 0 tt1 20 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc129, v7
v_accvgpr_read_b32 v7, acc173                      // glvw 1 mb 0 tt1 21 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc161, v7
v_accvgpr_read_b32 v7, acc205                      // glvw 1 mb 0 tt1 22 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc193, v7
v_accvgpr_read_b32 v7, acc237                      // glvw 1 mb 0 tt1 23 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc225, v7
v_accvgpr_read_b32 v7, acc142                      // glvw 1 mb 0 tt1 24 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc130, v7
v_accvgpr_read_b32 v7, acc174                      // glvw 1 mb 0 tt1 25 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc162, v7
v_accvgpr_read_b32 v7, acc206                      // glvw 1 mb 0 tt1 26 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc194, v7
v_accvgpr_read_b32 v7, acc238                      // glvw 1 mb 0 tt1 27 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc226, v7
v_accvgpr_read_b32 v7, acc143                      // glvw 1 mb 0 tt1 28 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc131, v7
v_accvgpr_read_b32 v7, acc175                      // glvw 1 mb 0 tt1 29 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc163, v7
v_accvgpr_read_b32 v7, acc207                      // glvw 1 mb 0 tt1 30 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc195, v7
v_accvgpr_read_b32 v7, acc239                      // glvw 1 mb 0 tt1 31 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc227, v7
// Undo the predication: s_or_saveexec copies the narrowed EXEC into vcc and
// then ORs s[8:9] (all ones) into EXEC, so every lane is active again.
s_mov_b64 s[8:9], 0xFFFFFFFFFFFFFFFF               // to restore all threads active
s_or_saveexec_b64 vcc, s[8:9]                      // all threads active
s_branch label_ShiftVectorComponents0_GLVW0        // done shifting


/******************************************/
/* shift d0 r=1 mb=1 vw0                  */
/******************************************/
label_ShiftVectorComponents0_GLVW1_BM1_VW0:  /// r1 mb1 vw0
// Copy pass for the r=1 / mb=1 / vw0 case.
//   1. Narrow EXEC to the lanes whose v6 equals 32 (v_cmpx updates EXEC in
//      addition to writing the result mask to s[8:9]).
//   2. For each of the 32 "tt1" entries, copy one accumulator register down
//      by 12 (acc[n+12] -> acc[n]) through the temporary v7. Only lanes
//      enabled in step 1 are modified.
//   3. Re-enable all lanes and jump to label_ShiftVectorComponents0_GLVW0.
// Scratch registers written here: s[8:9], v0, v7, vcc.
s_mov_b32 s8, 32
v_cmpx_eq_u32 s[8:9], v6, s8                       // is thread in edge glvw region
// v0 = (serial & 63) << 2. v0 is not read again inside this block.
// NOTE(review): confirm whether any later code depends on this value.
v_and_b32 v0, 63, v[vgprSerial]                    // permute register between threads
v_lshlrev_b32 v0, 2, v0                            // permute register between threads
// tt1 entries 0..15: sources acc28..acc127, destinations acc16..acc115.
v_accvgpr_read_b32 v7, acc28                       // glvw 1 mb 1 tt1 0 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc16, v7
v_accvgpr_read_b32 v7, acc60                       // glvw 1 mb 1 tt1 1 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc48, v7
v_accvgpr_read_b32 v7, acc92                       // glvw 1 mb 1 tt1 2 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc80, v7
v_accvgpr_read_b32 v7, acc124                      // glvw 1 mb 1 tt1 3 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc112, v7
v_accvgpr_read_b32 v7, acc29                       // glvw 1 mb 1 tt1 4 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc17, v7
v_accvgpr_read_b32 v7, acc61                       // glvw 1 mb 1 tt1 5 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc49, v7
v_accvgpr_read_b32 v7, acc93                       // glvw 1 mb 1 tt1 6 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc81, v7
v_accvgpr_read_b32 v7, acc125                      // glvw 1 mb 1 tt1 7 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc113, v7
v_accvgpr_read_b32 v7, acc30                       // glvw 1 mb 1 tt1 8 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc18, v7
v_accvgpr_read_b32 v7, acc62                       // glvw 1 mb 1 tt1 9 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc50, v7
v_accvgpr_read_b32 v7, acc94                       // glvw 1 mb 1 tt1 10 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc82, v7
v_accvgpr_read_b32 v7, acc126                      // glvw 1 mb 1 tt1 11 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc114, v7
v_accvgpr_read_b32 v7, acc31                       // glvw 1 mb 1 tt1 12 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc19, v7
v_accvgpr_read_b32 v7, acc63                       // glvw 1 mb 1 tt1 13 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc51, v7
v_accvgpr_read_b32 v7, acc95                       // glvw 1 mb 1 tt1 14 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc83, v7
v_accvgpr_read_b32 v7, acc127                      // glvw 1 mb 1 tt1 15 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc115, v7
// tt1 entries 16..31: same pattern with every accumulator index raised by 128.
v_accvgpr_read_b32 v7, acc156                      // glvw 1 mb 1 tt1 16 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc144, v7
v_accvgpr_read_b32 v7, acc188                      // glvw 1 mb 1 tt1 17 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc176, v7
v_accvgpr_read_b32 v7, acc220                      // glvw 1 mb 1 tt1 18 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc208, v7
v_accvgpr_read_b32 v7, acc252                      // glvw 1 mb 1 tt1 19 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc240, v7
v_accvgpr_read_b32 v7, acc157                      // glvw 1 mb 1 tt1 20 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc145, v7
v_accvgpr_read_b32 v7, acc189                      // glvw 1 mb 1 tt1 21 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc177, v7
v_accvgpr_read_b32 v7, acc221                      // glvw 1 mb 1 tt1 22 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc209, v7
v_accvgpr_read_b32 v7, acc253                      // glvw 1 mb 1 tt1 23 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc241, v7
v_accvgpr_read_b32 v7, acc158                      // glvw 1 mb 1 tt1 24 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc146, v7
v_accvgpr_read_b32 v7, acc190                      // glvw 1 mb 1 tt1 25 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc178, v7
v_accvgpr_read_b32 v7, acc222                      // glvw 1 mb 1 tt1 26 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc210, v7
v_accvgpr_read_b32 v7, acc254                      // glvw 1 mb 1 tt1 27 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc242, v7
v_accvgpr_read_b32 v7, acc159                      // glvw 1 mb 1 tt1 28 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc147, v7
v_accvgpr_read_b32 v7, acc191                      // glvw 1 mb 1 tt1 29 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc179, v7
v_accvgpr_read_b32 v7, acc223                      // glvw 1 mb 1 tt1 30 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc211, v7
v_accvgpr_read_b32 v7, acc255                      // glvw 1 mb 1 tt1 31 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc243, v7
// Undo the predication: s_or_saveexec copies the narrowed EXEC into vcc and
// then ORs s[8:9] (all ones) into EXEC, so every lane is active again.
s_mov_b64 s[8:9], 0xFFFFFFFFFFFFFFFF               // to restore all threads active
s_or_saveexec_b64 vcc, s[8:9]                      // all threads active
s_branch label_ShiftVectorComponents0_GLVW0        // done shifting


/******************************************/
/* shift d0 r=2 mb=0 vw0                  */
/******************************************/
label_ShiftVectorComponents0_GLVW2_BM0_VW0:  /// r2 mb0 vw0
// Copy pass for the r=2 / mb=0 / vw0 case.
//   1. Narrow EXEC to the lanes whose v6 equals 0 (v_cmpx updates EXEC in
//      addition to writing the result mask to s[8:9]).
//   2. For each of the 32 "tt1" entries, copy two accumulator registers down
//      by 8 (acc[n+8] -> acc[n] and acc[n+12] -> acc[n+4]) through the
//      temporaries v7 and v8. Only lanes enabled in step 1 are modified.
//   3. Re-enable all lanes and jump to label_ShiftVectorComponents0_GLVW0.
// Scratch registers written here: s[8:9], v0, v7, v8, vcc.
s_mov_b32 s8, 0
v_cmpx_eq_u32 s[8:9], v6, s8                       // is thread in edge glvw region
// v0 = (serial & 63) << 2. v0 is not read again inside this block.
// NOTE(review): confirm whether any later code depends on this value.
v_and_b32 v0, 63, v[vgprSerial]                    // permute register between threads
v_lshlrev_b32 v0, 2, v0                            // permute register between threads
// tt1 entries 0..15: sources acc8..acc111, destinations acc0..acc103.
v_accvgpr_read_b32 v7, acc8                        // glvw 2 mb 0 tt1 0 r 0
v_accvgpr_read_b32 v8, acc12                       // glvw 2 mb 0 tt1 0 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc0, v7
v_accvgpr_write_b32 acc4, v8
v_accvgpr_read_b32 v7, acc40                       // glvw 2 mb 0 tt1 1 r 0
v_accvgpr_read_b32 v8, acc44                       // glvw 2 mb 0 tt1 1 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc32, v7
v_accvgpr_write_b32 acc36, v8
v_accvgpr_read_b32 v7, acc72                       // glvw 2 mb 0 tt1 2 r 0
v_accvgpr_read_b32 v8, acc76                       // glvw 2 mb 0 tt1 2 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc64, v7
v_accvgpr_write_b32 acc68, v8
v_accvgpr_read_b32 v7, acc104                      // glvw 2 mb 0 tt1 3 r 0
v_accvgpr_read_b32 v8, acc108                      // glvw 2 mb 0 tt1 3 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc96, v7
v_accvgpr_write_b32 acc100, v8
v_accvgpr_read_b32 v7, acc9                        // glvw 2 mb 0 tt1 4 r 0
v_accvgpr_read_b32 v8, acc13                       // glvw 2 mb 0 tt1 4 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc1, v7
v_accvgpr_write_b32 acc5, v8
v_accvgpr_read_b32 v7, acc41                       // glvw 2 mb 0 tt1 5 r 0
v_accvgpr_read_b32 v8, acc45                       // glvw 2 mb 0 tt1 5 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc33, v7
v_accvgpr_write_b32 acc37, v8
v_accvgpr_read_b32 v7, acc73                       // glvw 2 mb 0 tt1 6 r 0
v_accvgpr_read_b32 v8, acc77                       // glvw 2 mb 0 tt1 6 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc65, v7
v_accvgpr_write_b32 acc69, v8
v_accvgpr_read_b32 v7, acc105                      // glvw 2 mb 0 tt1 7 r 0
v_accvgpr_read_b32 v8, acc109                      // glvw 2 mb 0 tt1 7 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc97, v7
v_accvgpr_write_b32 acc101, v8
v_accvgpr_read_b32 v7, acc10                       // glvw 2 mb 0 tt1 8 r 0
v_accvgpr_read_b32 v8, acc14                       // glvw 2 mb 0 tt1 8 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc2, v7
v_accvgpr_write_b32 acc6, v8
v_accvgpr_read_b32 v7, acc42                       // glvw 2 mb 0 tt1 9 r 0
v_accvgpr_read_b32 v8, acc46                       // glvw 2 mb 0 tt1 9 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc34, v7
v_accvgpr_write_b32 acc38, v8
v_accvgpr_read_b32 v7, acc74                       // glvw 2 mb 0 tt1 10 r 0
v_accvgpr_read_b32 v8, acc78                       // glvw 2 mb 0 tt1 10 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc66, v7
v_accvgpr_write_b32 acc70, v8
v_accvgpr_read_b32 v7, acc106                      // glvw 2 mb 0 tt1 11 r 0
v_accvgpr_read_b32 v8, acc110                      // glvw 2 mb 0 tt1 11 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc98, v7
v_accvgpr_write_b32 acc102, v8
v_accvgpr_read_b32 v7, acc11                       // glvw 2 mb 0 tt1 12 r 0
v_accvgpr_read_b32 v8, acc15                       // glvw 2 mb 0 tt1 12 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc3, v7
v_accvgpr_write_b32 acc7, v8
v_accvgpr_read_b32 v7, acc43                       // glvw 2 mb 0 tt1 13 r 0
v_accvgpr_read_b32 v8, acc47                       // glvw 2 mb 0 tt1 13 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc35, v7
v_accvgpr_write_b32 acc39, v8
v_accvgpr_read_b32 v7, acc75                       // glvw 2 mb 0 tt1 14 r 0
v_accvgpr_read_b32 v8, acc79                       // glvw 2 mb 0 tt1 14 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc67, v7
v_accvgpr_write_b32 acc71, v8
v_accvgpr_read_b32 v7, acc107                      // glvw 2 mb 0 tt1 15 r 0
v_accvgpr_read_b32 v8, acc111                      // glvw 2 mb 0 tt1 15 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc99, v7
v_accvgpr_write_b32 acc103, v8
// tt1 entries 16..31: same pattern with every accumulator index raised by 128.
v_accvgpr_read_b32 v7, acc136                      // glvw 2 mb 0 tt1 16 r 0
v_accvgpr_read_b32 v8, acc140                      // glvw 2 mb 0 tt1 16 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc128, v7
v_accvgpr_write_b32 acc132, v8
v_accvgpr_read_b32 v7, acc168                      // glvw 2 mb 0 tt1 17 r 0
v_accvgpr_read_b32 v8, acc172                      // glvw 2 mb 0 tt1 17 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc160, v7
v_accvgpr_write_b32 acc164, v8
v_accvgpr_read_b32 v7, acc200                      // glvw 2 mb 0 tt1 18 r 0
v_accvgpr_read_b32 v8, acc204                      // glvw 2 mb 0 tt1 18 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc192, v7
v_accvgpr_write_b32 acc196, v8
v_accvgpr_read_b32 v7, acc232                      // glvw 2 mb 0 tt1 19 r 0
v_accvgpr_read_b32 v8, acc236                      // glvw 2 mb 0 tt1 19 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc224, v7
v_accvgpr_write_b32 acc228, v8
v_accvgpr_read_b32 v7, acc137                      // glvw 2 mb 0 tt1 20 r 0
v_accvgpr_read_b32 v8, acc141                      // glvw 2 mb 0 tt1 20 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc129, v7
v_accvgpr_write_b32 acc133, v8
v_accvgpr_read_b32 v7, acc169                      // glvw 2 mb 0 tt1 21 r 0
v_accvgpr_read_b32 v8, acc173                      // glvw 2 mb 0 tt1 21 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc161, v7
v_accvgpr_write_b32 acc165, v8
v_accvgpr_read_b32 v7, acc201                      // glvw 2 mb 0 tt1 22 r 0
v_accvgpr_read_b32 v8, acc205                      // glvw 2 mb 0 tt1 22 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc193, v7
v_accvgpr_write_b32 acc197, v8
v_accvgpr_read_b32 v7, acc233                      // glvw 2 mb 0 tt1 23 r 0
v_accvgpr_read_b32 v8, acc237                      // glvw 2 mb 0 tt1 23 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc225, v7
v_accvgpr_write_b32 acc229, v8
v_accvgpr_read_b32 v7, acc138                      // glvw 2 mb 0 tt1 24 r 0
v_accvgpr_read_b32 v8, acc142                      // glvw 2 mb 0 tt1 24 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc130, v7
v_accvgpr_write_b32 acc134, v8
v_accvgpr_read_b32 v7, acc170                      // glvw 2 mb 0 tt1 25 r 0
v_accvgpr_read_b32 v8, acc174                      // glvw 2 mb 0 tt1 25 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc162, v7
v_accvgpr_write_b32 acc166, v8
v_accvgpr_read_b32 v7, acc202                      // glvw 2 mb 0 tt1 26 r 0
v_accvgpr_read_b32 v8, acc206                      // glvw 2 mb 0 tt1 26 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc194, v7
v_accvgpr_write_b32 acc198, v8
v_accvgpr_read_b32 v7, acc234                      // glvw 2 mb 0 tt1 27 r 0
v_accvgpr_read_b32 v8, acc238                      // glvw 2 mb 0 tt1 27 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc226, v7
v_accvgpr_write_b32 acc230, v8
v_accvgpr_read_b32 v7, acc139                      // glvw 2 mb 0 tt1 28 r 0
v_accvgpr_read_b32 v8, acc143                      // glvw 2 mb 0 tt1 28 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc131, v7
v_accvgpr_write_b32 acc135, v8
v_accvgpr_read_b32 v7, acc171                      // glvw 2 mb 0 tt1 29 r 0
v_accvgpr_read_b32 v8, acc175                      // glvw 2 mb 0 tt1 29 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc163, v7
v_accvgpr_write_b32 acc167, v8
v_accvgpr_read_b32 v7, acc203                      // glvw 2 mb 0 tt1 30 r 0
v_accvgpr_read_b32 v8, acc207                      // glvw 2 mb 0 tt1 30 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc195, v7
v_accvgpr_write_b32 acc199, v8
v_accvgpr_read_b32 v7, acc235                      // glvw 2 mb 0 tt1 31 r 0
v_accvgpr_read_b32 v8, acc239                      // glvw 2 mb 0 tt1 31 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc227, v7
v_accvgpr_write_b32 acc231, v8
// Undo the predication: s_or_saveexec copies the narrowed EXEC into vcc and
// then ORs s[8:9] (all ones) into EXEC, so every lane is active again.
s_mov_b64 s[8:9], 0xFFFFFFFFFFFFFFFF               // to restore all threads active
s_or_saveexec_b64 vcc, s[8:9]                      // all threads active
s_branch label_ShiftVectorComponents0_GLVW0        // done shifting


/******************************************/
/* shift d0 r=2 mb=1 vw0                  */
/******************************************/
label_ShiftVectorComponents0_GLVW2_BM1_VW0:  /// r2 mb1 vw0
s_mov_b32 s8, 32
v_cmpx_eq_u32 s[8:9], v6, s8                       // is thread in edge glvw region
v_and_b32 v0, 63, v[vgprSerial]                    // permute register between threads
v_lshlrev_b32 v0, 2, v0                            // permute register between threads
v_accvgpr_read_b32 v7, acc24                       // glvw 2 mb 1 tt1 0 r 0
v_accvgpr_read_b32 v8, acc28                       // glvw 2 mb 1 tt1 0 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc16, v7
v_accvgpr_write_b32 acc20, v8
v_accvgpr_read_b32 v7, acc56                       // glvw 2 mb 1 tt1 1 r 0
v_accvgpr_read_b32 v8, acc60                       // glvw 2 mb 1 tt1 1 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc48, v7
v_accvgpr_write_b32 acc52, v8
v_accvgpr_read_b32 v7, acc88                       // glvw 2 mb 1 tt1 2 r 0
v_accvgpr_read_b32 v8, acc92                       // glvw 2 mb 1 tt1 2 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc80, v7
v_accvgpr_write_b32 acc84, v8
v_accvgpr_read_b32 v7, acc120                      // glvw 2 mb 1 tt1 3 r 0
v_accvgpr_read_b32 v8, acc124                      // glvw 2 mb 1 tt1 3 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc112, v7
v_accvgpr_write_b32 acc116, v8
v_accvgpr_read_b32 v7, acc25                       // glvw 2 mb 1 tt1 4 r 0
v_accvgpr_read_b32 v8, acc29                       // glvw 2 mb 1 tt1 4 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc17, v7
v_accvgpr_write_b32 acc21, v8
v_accvgpr_read_b32 v7, acc57                       // glvw 2 mb 1 tt1 5 r 0
v_accvgpr_read_b32 v8, acc61                       // glvw 2 mb 1 tt1 5 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc49, v7
v_accvgpr_write_b32 acc53, v8
v_accvgpr_read_b32 v7, acc89                       // glvw 2 mb 1 tt1 6 r 0
v_accvgpr_read_b32 v8, acc93                       // glvw 2 mb 1 tt1 6 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc81, v7
v_accvgpr_write_b32 acc85, v8
v_accvgpr_read_b32 v7, acc121                      // glvw 2 mb 1 tt1 7 r 0
v_accvgpr_read_b32 v8, acc125                      // glvw 2 mb 1 tt1 7 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc113, v7
v_accvgpr_write_b32 acc117, v8
v_accvgpr_read_b32 v7, acc26                       // glvw 2 mb 1 tt1 8 r 0
v_accvgpr_read_b32 v8, acc30                       // glvw 2 mb 1 tt1 8 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc18, v7
v_accvgpr_write_b32 acc22, v8
v_accvgpr_read_b32 v7, acc58                       // glvw 2 mb 1 tt1 9 r 0
v_accvgpr_read_b32 v8, acc62                       // glvw 2 mb 1 tt1 9 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc50, v7
v_accvgpr_write_b32 acc54, v8
v_accvgpr_read_b32 v7, acc90                       // glvw 2 mb 1 tt1 10 r 0
v_accvgpr_read_b32 v8, acc94                       // glvw 2 mb 1 tt1 10 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc82, v7
v_accvgpr_write_b32 acc86, v8
v_accvgpr_read_b32 v7, acc122                      // glvw 2 mb 1 tt1 11 r 0
v_accvgpr_read_b32 v8, acc126                      // glvw 2 mb 1 tt1 11 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc114, v7
v_accvgpr_write_b32 acc118, v8
v_accvgpr_read_b32 v7, acc27                       // glvw 2 mb 1 tt1 12 r 0
v_accvgpr_read_b32 v8, acc31                       // glvw 2 mb 1 tt1 12 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc19, v7
v_accvgpr_write_b32 acc23, v8
v_accvgpr_read_b32 v7, acc59                       // glvw 2 mb 1 tt1 13 r 0
v_accvgpr_read_b32 v8, acc63                       // glvw 2 mb 1 tt1 13 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc51, v7
v_accvgpr_write_b32 acc55, v8
v_accvgpr_read_b32 v7, acc91                       // glvw 2 mb 1 tt1 14 r 0
v_accvgpr_read_b32 v8, acc95                       // glvw 2 mb 1 tt1 14 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc83, v7
v_accvgpr_write_b32 acc87, v8
v_accvgpr_read_b32 v7, acc123                      // glvw 2 mb 1 tt1 15 r 0
v_accvgpr_read_b32 v8, acc127                      // glvw 2 mb 1 tt1 15 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc115, v7
v_accvgpr_write_b32 acc119, v8
v_accvgpr_read_b32 v7, acc152                      // glvw 2 mb 1 tt1 16 r 0
v_accvgpr_read_b32 v8, acc156                      // glvw 2 mb 1 tt1 16 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc144, v7
v_accvgpr_write_b32 acc148, v8
v_accvgpr_read_b32 v7, acc184                      // glvw 2 mb 1 tt1 17 r 0
v_accvgpr_read_b32 v8, acc188                      // glvw 2 mb 1 tt1 17 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc176, v7
v_accvgpr_write_b32 acc180, v8
v_accvgpr_read_b32 v7, acc216                      // glvw 2 mb 1 tt1 18 r 0
v_accvgpr_read_b32 v8, acc220                      // glvw 2 mb 1 tt1 18 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc208, v7
v_accvgpr_write_b32 acc212, v8
v_accvgpr_read_b32 v7, acc248                      // glvw 2 mb 1 tt1 19 r 0
v_accvgpr_read_b32 v8, acc252                      // glvw 2 mb 1 tt1 19 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc240, v7
v_accvgpr_write_b32 acc244, v8
v_accvgpr_read_b32 v7, acc153                      // glvw 2 mb 1 tt1 20 r 0
v_accvgpr_read_b32 v8, acc157                      // glvw 2 mb 1 tt1 20 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc145, v7
v_accvgpr_write_b32 acc149, v8
v_accvgpr_read_b32 v7, acc185                      // glvw 2 mb 1 tt1 21 r 0
v_accvgpr_read_b32 v8, acc189                      // glvw 2 mb 1 tt1 21 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc177, v7
v_accvgpr_write_b32 acc181, v8
v_accvgpr_read_b32 v7, acc217                      // glvw 2 mb 1 tt1 22 r 0
v_accvgpr_read_b32 v8, acc221                      // glvw 2 mb 1 tt1 22 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc209, v7
v_accvgpr_write_b32 acc213, v8
v_accvgpr_read_b32 v7, acc249                      // glvw 2 mb 1 tt1 23 r 0
v_accvgpr_read_b32 v8, acc253                      // glvw 2 mb 1 tt1 23 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc241, v7
v_accvgpr_write_b32 acc245, v8
v_accvgpr_read_b32 v7, acc154                      // glvw 2 mb 1 tt1 24 r 0
v_accvgpr_read_b32 v8, acc158                      // glvw 2 mb 1 tt1 24 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc146, v7
v_accvgpr_write_b32 acc150, v8
v_accvgpr_read_b32 v7, acc186                      // glvw 2 mb 1 tt1 25 r 0
v_accvgpr_read_b32 v8, acc190                      // glvw 2 mb 1 tt1 25 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc178, v7
v_accvgpr_write_b32 acc182, v8
v_accvgpr_read_b32 v7, acc218                      // glvw 2 mb 1 tt1 26 r 0
v_accvgpr_read_b32 v8, acc222                      // glvw 2 mb 1 tt1 26 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc210, v7
v_accvgpr_write_b32 acc214, v8
v_accvgpr_read_b32 v7, acc250                      // glvw 2 mb 1 tt1 27 r 0
v_accvgpr_read_b32 v8, acc254                      // glvw 2 mb 1 tt1 27 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc242, v7
v_accvgpr_write_b32 acc246, v8
v_accvgpr_read_b32 v7, acc155                      // glvw 2 mb 1 tt1 28 r 0
v_accvgpr_read_b32 v8, acc159                      // glvw 2 mb 1 tt1 28 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc147, v7
v_accvgpr_write_b32 acc151, v8
v_accvgpr_read_b32 v7, acc187                      // glvw 2 mb 1 tt1 29 r 0
v_accvgpr_read_b32 v8, acc191                      // glvw 2 mb 1 tt1 29 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc179, v7
v_accvgpr_write_b32 acc183, v8
v_accvgpr_read_b32 v7, acc219                      // glvw 2 mb 1 tt1 30 r 0
v_accvgpr_read_b32 v8, acc223                      // glvw 2 mb 1 tt1 30 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc211, v7
v_accvgpr_write_b32 acc215, v8
v_accvgpr_read_b32 v7, acc251                      // glvw 2 mb 1 tt1 31 r 0
v_accvgpr_read_b32 v8, acc255                      // glvw 2 mb 1 tt1 31 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc243, v7
v_accvgpr_write_b32 acc247, v8
s_mov_b64 s[8:9], 0xFFFFFFFFFFFFFFFF               // to restore all threads active
s_or_saveexec_b64 vcc, s[8:9]                      // all threads active
s_branch label_ShiftVectorComponents0_GLVW0        // done shifting


/******************************************/
/* shift d0 r=3 mb=0 vw0                  */
/******************************************/
label_ShiftVectorComponents0_GLVW3_BM0_VW0:  /// r3 mb0 vw0
/* Purpose: in every lane whose v6 equals 0, move accumulator values down by  */
/* four registers. Each of the 32 groups below (tt1 0..31) with base register */
/* b performs:                                                                */
/*     acc[b] = acc[b+4];  acc[b+4] = acc[b+8];  acc[b+8] = acc[b+12]         */
/* and leaves acc[b+12] unchanged. In the emitted sequence                    */
/*     b = 128*(tt1/16) + 32*(tt1%4) + (tt1%16)/4      (integer division)     */
/* v7..v9 are staging VGPRs. All three reads of a group are issued before its */
/* writes, so the overlapping source/destination registers (b+4, b+8) are     */
/* captured before they are overwritten. Do not reorder within a group.       */
/* Each "s_nop 1" inserts 2 wait states between the accvgpr reads that write  */
/* v7..v9 and the accvgpr writes that consume them.                           */
/* Clobbers: s[8:9], vcc, EXEC (restored to all ones on exit), v0, v7..v9.    */
/* Exit: unconditional branch to label_ShiftVectorComponents0_GLVW0.          */
/* NOTE(review): v6 is produced before this routine; its interpretation as an */
/* "edge glvw region" selector is taken from the generator's comment - confirm */
/* against the code that computes it.                                         */
s_mov_b32 s8, 0
// s8 = 0 is the compare value; the compare result mask goes to s[8:9] and EXEC,
// so only the matching lanes execute the copies below.
v_cmpx_eq_u32 s[8:9], v6, s8                       // is thread in edge glvw region
v_and_b32 v0, 63, v[vgprSerial]                    // permute register between threads
v_lshlrev_b32 v0, 2, v0                            // permute register between threads
// v0 = (serial & 63) << 2; v0 is not read again inside this routine.
v_accvgpr_read_b32 v7, acc4                        // glvw 3 mb 0 tt1 0 r 0
v_accvgpr_read_b32 v8, acc8                        // glvw 3 mb 0 tt1 0 r 0
v_accvgpr_read_b32 v9, acc12                       // glvw 3 mb 0 tt1 0 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc0, v7
v_accvgpr_write_b32 acc4, v8
v_accvgpr_write_b32 acc8, v9
v_accvgpr_read_b32 v7, acc36                       // glvw 3 mb 0 tt1 1 r 0
v_accvgpr_read_b32 v8, acc40                       // glvw 3 mb 0 tt1 1 r 0
v_accvgpr_read_b32 v9, acc44                       // glvw 3 mb 0 tt1 1 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc32, v7
v_accvgpr_write_b32 acc36, v8
v_accvgpr_write_b32 acc40, v9
v_accvgpr_read_b32 v7, acc68                       // glvw 3 mb 0 tt1 2 r 0
v_accvgpr_read_b32 v8, acc72                       // glvw 3 mb 0 tt1 2 r 0
v_accvgpr_read_b32 v9, acc76                       // glvw 3 mb 0 tt1 2 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc64, v7
v_accvgpr_write_b32 acc68, v8
v_accvgpr_write_b32 acc72, v9
v_accvgpr_read_b32 v7, acc100                      // glvw 3 mb 0 tt1 3 r 0
v_accvgpr_read_b32 v8, acc104                      // glvw 3 mb 0 tt1 3 r 0
v_accvgpr_read_b32 v9, acc108                      // glvw 3 mb 0 tt1 3 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc96, v7
v_accvgpr_write_b32 acc100, v8
v_accvgpr_write_b32 acc104, v9
v_accvgpr_read_b32 v7, acc5                        // glvw 3 mb 0 tt1 4 r 0
v_accvgpr_read_b32 v8, acc9                        // glvw 3 mb 0 tt1 4 r 0
v_accvgpr_read_b32 v9, acc13                       // glvw 3 mb 0 tt1 4 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc1, v7
v_accvgpr_write_b32 acc5, v8
v_accvgpr_write_b32 acc9, v9
v_accvgpr_read_b32 v7, acc37                       // glvw 3 mb 0 tt1 5 r 0
v_accvgpr_read_b32 v8, acc41                       // glvw 3 mb 0 tt1 5 r 0
v_accvgpr_read_b32 v9, acc45                       // glvw 3 mb 0 tt1 5 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc33, v7
v_accvgpr_write_b32 acc37, v8
v_accvgpr_write_b32 acc41, v9
v_accvgpr_read_b32 v7, acc69                       // glvw 3 mb 0 tt1 6 r 0
v_accvgpr_read_b32 v8, acc73                       // glvw 3 mb 0 tt1 6 r 0
v_accvgpr_read_b32 v9, acc77                       // glvw 3 mb 0 tt1 6 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc65, v7
v_accvgpr_write_b32 acc69, v8
v_accvgpr_write_b32 acc73, v9
v_accvgpr_read_b32 v7, acc101                      // glvw 3 mb 0 tt1 7 r 0
v_accvgpr_read_b32 v8, acc105                      // glvw 3 mb 0 tt1 7 r 0
v_accvgpr_read_b32 v9, acc109                      // glvw 3 mb 0 tt1 7 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc97, v7
v_accvgpr_write_b32 acc101, v8
v_accvgpr_write_b32 acc105, v9
v_accvgpr_read_b32 v7, acc6                        // glvw 3 mb 0 tt1 8 r 0
v_accvgpr_read_b32 v8, acc10                       // glvw 3 mb 0 tt1 8 r 0
v_accvgpr_read_b32 v9, acc14                       // glvw 3 mb 0 tt1 8 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc2, v7
v_accvgpr_write_b32 acc6, v8
v_accvgpr_write_b32 acc10, v9
v_accvgpr_read_b32 v7, acc38                       // glvw 3 mb 0 tt1 9 r 0
v_accvgpr_read_b32 v8, acc42                       // glvw 3 mb 0 tt1 9 r 0
v_accvgpr_read_b32 v9, acc46                       // glvw 3 mb 0 tt1 9 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc34, v7
v_accvgpr_write_b32 acc38, v8
v_accvgpr_write_b32 acc42, v9
v_accvgpr_read_b32 v7, acc70                       // glvw 3 mb 0 tt1 10 r 0
v_accvgpr_read_b32 v8, acc74                       // glvw 3 mb 0 tt1 10 r 0
v_accvgpr_read_b32 v9, acc78                       // glvw 3 mb 0 tt1 10 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc66, v7
v_accvgpr_write_b32 acc70, v8
v_accvgpr_write_b32 acc74, v9
v_accvgpr_read_b32 v7, acc102                      // glvw 3 mb 0 tt1 11 r 0
v_accvgpr_read_b32 v8, acc106                      // glvw 3 mb 0 tt1 11 r 0
v_accvgpr_read_b32 v9, acc110                      // glvw 3 mb 0 tt1 11 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc98, v7
v_accvgpr_write_b32 acc102, v8
v_accvgpr_write_b32 acc106, v9
v_accvgpr_read_b32 v7, acc7                        // glvw 3 mb 0 tt1 12 r 0
v_accvgpr_read_b32 v8, acc11                       // glvw 3 mb 0 tt1 12 r 0
v_accvgpr_read_b32 v9, acc15                       // glvw 3 mb 0 tt1 12 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc3, v7
v_accvgpr_write_b32 acc7, v8
v_accvgpr_write_b32 acc11, v9
v_accvgpr_read_b32 v7, acc39                       // glvw 3 mb 0 tt1 13 r 0
v_accvgpr_read_b32 v8, acc43                       // glvw 3 mb 0 tt1 13 r 0
v_accvgpr_read_b32 v9, acc47                       // glvw 3 mb 0 tt1 13 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc35, v7
v_accvgpr_write_b32 acc39, v8
v_accvgpr_write_b32 acc43, v9
v_accvgpr_read_b32 v7, acc71                       // glvw 3 mb 0 tt1 14 r 0
v_accvgpr_read_b32 v8, acc75                       // glvw 3 mb 0 tt1 14 r 0
v_accvgpr_read_b32 v9, acc79                       // glvw 3 mb 0 tt1 14 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc67, v7
v_accvgpr_write_b32 acc71, v8
v_accvgpr_write_b32 acc75, v9
v_accvgpr_read_b32 v7, acc103                      // glvw 3 mb 0 tt1 15 r 0
v_accvgpr_read_b32 v8, acc107                      // glvw 3 mb 0 tt1 15 r 0
v_accvgpr_read_b32 v9, acc111                      // glvw 3 mb 0 tt1 15 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc99, v7
v_accvgpr_write_b32 acc103, v8
v_accvgpr_write_b32 acc107, v9
// Groups tt1 16..31 repeat the same pattern on accumulators acc128..acc239.
v_accvgpr_read_b32 v7, acc132                      // glvw 3 mb 0 tt1 16 r 0
v_accvgpr_read_b32 v8, acc136                      // glvw 3 mb 0 tt1 16 r 0
v_accvgpr_read_b32 v9, acc140                      // glvw 3 mb 0 tt1 16 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc128, v7
v_accvgpr_write_b32 acc132, v8
v_accvgpr_write_b32 acc136, v9
v_accvgpr_read_b32 v7, acc164                      // glvw 3 mb 0 tt1 17 r 0
v_accvgpr_read_b32 v8, acc168                      // glvw 3 mb 0 tt1 17 r 0
v_accvgpr_read_b32 v9, acc172                      // glvw 3 mb 0 tt1 17 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc160, v7
v_accvgpr_write_b32 acc164, v8
v_accvgpr_write_b32 acc168, v9
v_accvgpr_read_b32 v7, acc196                      // glvw 3 mb 0 tt1 18 r 0
v_accvgpr_read_b32 v8, acc200                      // glvw 3 mb 0 tt1 18 r 0
v_accvgpr_read_b32 v9, acc204                      // glvw 3 mb 0 tt1 18 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc192, v7
v_accvgpr_write_b32 acc196, v8
v_accvgpr_write_b32 acc200, v9
v_accvgpr_read_b32 v7, acc228                      // glvw 3 mb 0 tt1 19 r 0
v_accvgpr_read_b32 v8, acc232                      // glvw 3 mb 0 tt1 19 r 0
v_accvgpr_read_b32 v9, acc236                      // glvw 3 mb 0 tt1 19 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc224, v7
v_accvgpr_write_b32 acc228, v8
v_accvgpr_write_b32 acc232, v9
v_accvgpr_read_b32 v7, acc133                      // glvw 3 mb 0 tt1 20 r 0
v_accvgpr_read_b32 v8, acc137                      // glvw 3 mb 0 tt1 20 r 0
v_accvgpr_read_b32 v9, acc141                      // glvw 3 mb 0 tt1 20 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc129, v7
v_accvgpr_write_b32 acc133, v8
v_accvgpr_write_b32 acc137, v9
v_accvgpr_read_b32 v7, acc165                      // glvw 3 mb 0 tt1 21 r 0
v_accvgpr_read_b32 v8, acc169                      // glvw 3 mb 0 tt1 21 r 0
v_accvgpr_read_b32 v9, acc173                      // glvw 3 mb 0 tt1 21 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc161, v7
v_accvgpr_write_b32 acc165, v8
v_accvgpr_write_b32 acc169, v9
v_accvgpr_read_b32 v7, acc197                      // glvw 3 mb 0 tt1 22 r 0
v_accvgpr_read_b32 v8, acc201                      // glvw 3 mb 0 tt1 22 r 0
v_accvgpr_read_b32 v9, acc205                      // glvw 3 mb 0 tt1 22 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc193, v7
v_accvgpr_write_b32 acc197, v8
v_accvgpr_write_b32 acc201, v9
v_accvgpr_read_b32 v7, acc229                      // glvw 3 mb 0 tt1 23 r 0
v_accvgpr_read_b32 v8, acc233                      // glvw 3 mb 0 tt1 23 r 0
v_accvgpr_read_b32 v9, acc237                      // glvw 3 mb 0 tt1 23 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc225, v7
v_accvgpr_write_b32 acc229, v8
v_accvgpr_write_b32 acc233, v9
v_accvgpr_read_b32 v7, acc134                      // glvw 3 mb 0 tt1 24 r 0
v_accvgpr_read_b32 v8, acc138                      // glvw 3 mb 0 tt1 24 r 0
v_accvgpr_read_b32 v9, acc142                      // glvw 3 mb 0 tt1 24 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc130, v7
v_accvgpr_write_b32 acc134, v8
v_accvgpr_write_b32 acc138, v9
v_accvgpr_read_b32 v7, acc166                      // glvw 3 mb 0 tt1 25 r 0
v_accvgpr_read_b32 v8, acc170                      // glvw 3 mb 0 tt1 25 r 0
v_accvgpr_read_b32 v9, acc174                      // glvw 3 mb 0 tt1 25 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc162, v7
v_accvgpr_write_b32 acc166, v8
v_accvgpr_write_b32 acc170, v9
v_accvgpr_read_b32 v7, acc198                      // glvw 3 mb 0 tt1 26 r 0
v_accvgpr_read_b32 v8, acc202                      // glvw 3 mb 0 tt1 26 r 0
v_accvgpr_read_b32 v9, acc206                      // glvw 3 mb 0 tt1 26 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc194, v7
v_accvgpr_write_b32 acc198, v8
v_accvgpr_write_b32 acc202, v9
v_accvgpr_read_b32 v7, acc230                      // glvw 3 mb 0 tt1 27 r 0
v_accvgpr_read_b32 v8, acc234                      // glvw 3 mb 0 tt1 27 r 0
v_accvgpr_read_b32 v9, acc238                      // glvw 3 mb 0 tt1 27 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc226, v7
v_accvgpr_write_b32 acc230, v8
v_accvgpr_write_b32 acc234, v9
v_accvgpr_read_b32 v7, acc135                      // glvw 3 mb 0 tt1 28 r 0
v_accvgpr_read_b32 v8, acc139                      // glvw 3 mb 0 tt1 28 r 0
v_accvgpr_read_b32 v9, acc143                      // glvw 3 mb 0 tt1 28 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc131, v7
v_accvgpr_write_b32 acc135, v8
v_accvgpr_write_b32 acc139, v9
v_accvgpr_read_b32 v7, acc167                      // glvw 3 mb 0 tt1 29 r 0
v_accvgpr_read_b32 v8, acc171                      // glvw 3 mb 0 tt1 29 r 0
v_accvgpr_read_b32 v9, acc175                      // glvw 3 mb 0 tt1 29 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc163, v7
v_accvgpr_write_b32 acc167, v8
v_accvgpr_write_b32 acc171, v9
v_accvgpr_read_b32 v7, acc199                      // glvw 3 mb 0 tt1 30 r 0
v_accvgpr_read_b32 v8, acc203                      // glvw 3 mb 0 tt1 30 r 0
v_accvgpr_read_b32 v9, acc207                      // glvw 3 mb 0 tt1 30 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc195, v7
v_accvgpr_write_b32 acc199, v8
v_accvgpr_write_b32 acc203, v9
v_accvgpr_read_b32 v7, acc231                      // glvw 3 mb 0 tt1 31 r 0
v_accvgpr_read_b32 v8, acc235                      // glvw 3 mb 0 tt1 31 r 0
v_accvgpr_read_b32 v9, acc239                      // glvw 3 mb 0 tt1 31 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc227, v7
v_accvgpr_write_b32 acc231, v8
v_accvgpr_write_b32 acc235, v9
// Re-enable every lane: s[8:9] = all ones, then EXEC |= s[8:9]. The previous
// (masked) EXEC value is saved into vcc as a side effect of s_or_saveexec.
s_mov_b64 s[8:9], 0xFFFFFFFFFFFFFFFF               // to restore all threads active
s_or_saveexec_b64 vcc, s[8:9]                      // all threads active
s_branch label_ShiftVectorComponents0_GLVW0        // done shifting


/******************************************/
/* shift d0 r=3 mb=1 vw0                  */
/******************************************/
label_ShiftVectorComponents0_GLVW3_BM1_VW0:  /// r3 mb1 vw0
s_mov_b32 s8, 32
v_cmpx_eq_u32 s[8:9], v6, s8                       // is thread in edge glvw region
v_and_b32 v0, 63, v[vgprSerial]                    // permute register between threads
v_lshlrev_b32 v0, 2, v0                            // permute register between threads
v_accvgpr_read_b32 v7, acc20                       // glvw 3 mb 1 tt1 0 r 0
v_accvgpr_read_b32 v8, acc24                       // glvw 3 mb 1 tt1 0 r 0
v_accvgpr_read_b32 v9, acc28                       // glvw 3 mb 1 tt1 0 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc16, v7
v_accvgpr_write_b32 acc20, v8
v_accvgpr_write_b32 acc24, v9
v_accvgpr_read_b32 v7, acc52                       // glvw 3 mb 1 tt1 1 r 0
v_accvgpr_read_b32 v8, acc56                       // glvw 3 mb 1 tt1 1 r 0
v_accvgpr_read_b32 v9, acc60                       // glvw 3 mb 1 tt1 1 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc48, v7
v_accvgpr_write_b32 acc52, v8
v_accvgpr_write_b32 acc56, v9
v_accvgpr_read_b32 v7, acc84                       // glvw 3 mb 1 tt1 2 r 0
v_accvgpr_read_b32 v8, acc88                       // glvw 3 mb 1 tt1 2 r 0
v_accvgpr_read_b32 v9, acc92                       // glvw 3 mb 1 tt1 2 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc80, v7
v_accvgpr_write_b32 acc84, v8
v_accvgpr_write_b32 acc88, v9
v_accvgpr_read_b32 v7, acc116                      // glvw 3 mb 1 tt1 3 r 0
v_accvgpr_read_b32 v8, acc120                      // glvw 3 mb 1 tt1 3 r 0
v_accvgpr_read_b32 v9, acc124                      // glvw 3 mb 1 tt1 3 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc112, v7
v_accvgpr_write_b32 acc116, v8
v_accvgpr_write_b32 acc120, v9
v_accvgpr_read_b32 v7, acc21                       // glvw 3 mb 1 tt1 4 r 0
v_accvgpr_read_b32 v8, acc25                       // glvw 3 mb 1 tt1 4 r 0
v_accvgpr_read_b32 v9, acc29                       // glvw 3 mb 1 tt1 4 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc17, v7
v_accvgpr_write_b32 acc21, v8
v_accvgpr_write_b32 acc25, v9
v_accvgpr_read_b32 v7, acc53                       // glvw 3 mb 1 tt1 5 r 0
v_accvgpr_read_b32 v8, acc57                       // glvw 3 mb 1 tt1 5 r 0
v_accvgpr_read_b32 v9, acc61                       // glvw 3 mb 1 tt1 5 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc49, v7
v_accvgpr_write_b32 acc53, v8
v_accvgpr_write_b32 acc57, v9
v_accvgpr_read_b32 v7, acc85                       // glvw 3 mb 1 tt1 6 r 0
v_accvgpr_read_b32 v8, acc89                       // glvw 3 mb 1 tt1 6 r 0
v_accvgpr_read_b32 v9, acc93                       // glvw 3 mb 1 tt1 6 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc81, v7
v_accvgpr_write_b32 acc85, v8
v_accvgpr_write_b32 acc89, v9
v_accvgpr_read_b32 v7, acc117                      // glvw 3 mb 1 tt1 7 r 0
v_accvgpr_read_b32 v8, acc121                      // glvw 3 mb 1 tt1 7 r 0
v_accvgpr_read_b32 v9, acc125                      // glvw 3 mb 1 tt1 7 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc113, v7
v_accvgpr_write_b32 acc117, v8
v_accvgpr_write_b32 acc121, v9
v_accvgpr_read_b32 v7, acc22                       // glvw 3 mb 1 tt1 8 r 0
v_accvgpr_read_b32 v8, acc26                       // glvw 3 mb 1 tt1 8 r 0
v_accvgpr_read_b32 v9, acc30                       // glvw 3 mb 1 tt1 8 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc18, v7
v_accvgpr_write_b32 acc22, v8
v_accvgpr_write_b32 acc26, v9
v_accvgpr_read_b32 v7, acc54                       // glvw 3 mb 1 tt1 9 r 0
v_accvgpr_read_b32 v8, acc58                       // glvw 3 mb 1 tt1 9 r 0
v_accvgpr_read_b32 v9, acc62                       // glvw 3 mb 1 tt1 9 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc50, v7
v_accvgpr_write_b32 acc54, v8
v_accvgpr_write_b32 acc58, v9
v_accvgpr_read_b32 v7, acc86                       // glvw 3 mb 1 tt1 10 r 0
v_accvgpr_read_b32 v8, acc90                       // glvw 3 mb 1 tt1 10 r 0
v_accvgpr_read_b32 v9, acc94                       // glvw 3 mb 1 tt1 10 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc82, v7
v_accvgpr_write_b32 acc86, v8
v_accvgpr_write_b32 acc90, v9
v_accvgpr_read_b32 v7, acc118                      // glvw 3 mb 1 tt1 11 r 0
v_accvgpr_read_b32 v8, acc122                      // glvw 3 mb 1 tt1 11 r 0
v_accvgpr_read_b32 v9, acc126                      // glvw 3 mb 1 tt1 11 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc114, v7
v_accvgpr_write_b32 acc118, v8
v_accvgpr_write_b32 acc122, v9
v_accvgpr_read_b32 v7, acc23                       // glvw 3 mb 1 tt1 12 r 0
v_accvgpr_read_b32 v8, acc27                       // glvw 3 mb 1 tt1 12 r 0
v_accvgpr_read_b32 v9, acc31                       // glvw 3 mb 1 tt1 12 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc19, v7
v_accvgpr_write_b32 acc23, v8
v_accvgpr_write_b32 acc27, v9
v_accvgpr_read_b32 v7, acc55                       // glvw 3 mb 1 tt1 13 r 0
v_accvgpr_read_b32 v8, acc59                       // glvw 3 mb 1 tt1 13 r 0
v_accvgpr_read_b32 v9, acc63                       // glvw 3 mb 1 tt1 13 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc51, v7
v_accvgpr_write_b32 acc55, v8
v_accvgpr_write_b32 acc59, v9
v_accvgpr_read_b32 v7, acc87                       // glvw 3 mb 1 tt1 14 r 0
v_accvgpr_read_b32 v8, acc91                       // glvw 3 mb 1 tt1 14 r 0
v_accvgpr_read_b32 v9, acc95                       // glvw 3 mb 1 tt1 14 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc83, v7
v_accvgpr_write_b32 acc87, v8
v_accvgpr_write_b32 acc91, v9
v_accvgpr_read_b32 v7, acc119                      // glvw 3 mb 1 tt1 15 r 0
v_accvgpr_read_b32 v8, acc123                      // glvw 3 mb 1 tt1 15 r 0
v_accvgpr_read_b32 v9, acc127                      // glvw 3 mb 1 tt1 15 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc115, v7
v_accvgpr_write_b32 acc119, v8
v_accvgpr_write_b32 acc123, v9
v_accvgpr_read_b32 v7, acc148                      // glvw 3 mb 1 tt1 16 r 0
v_accvgpr_read_b32 v8, acc152                      // glvw 3 mb 1 tt1 16 r 0
v_accvgpr_read_b32 v9, acc156                      // glvw 3 mb 1 tt1 16 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc144, v7
v_accvgpr_write_b32 acc148, v8
v_accvgpr_write_b32 acc152, v9
v_accvgpr_read_b32 v7, acc180                      // glvw 3 mb 1 tt1 17 r 0
v_accvgpr_read_b32 v8, acc184                      // glvw 3 mb 1 tt1 17 r 0
v_accvgpr_read_b32 v9, acc188                      // glvw 3 mb 1 tt1 17 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc176, v7
v_accvgpr_write_b32 acc180, v8
v_accvgpr_write_b32 acc184, v9
v_accvgpr_read_b32 v7, acc212                      // glvw 3 mb 1 tt1 18 r 0
v_accvgpr_read_b32 v8, acc216                      // glvw 3 mb 1 tt1 18 r 0
v_accvgpr_read_b32 v9, acc220                      // glvw 3 mb 1 tt1 18 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc208, v7
v_accvgpr_write_b32 acc212, v8
v_accvgpr_write_b32 acc216, v9
v_accvgpr_read_b32 v7, acc244                      // glvw 3 mb 1 tt1 19 r 0
v_accvgpr_read_b32 v8, acc248                      // glvw 3 mb 1 tt1 19 r 0
v_accvgpr_read_b32 v9, acc252                      // glvw 3 mb 1 tt1 19 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc240, v7
v_accvgpr_write_b32 acc244, v8
v_accvgpr_write_b32 acc248, v9
v_accvgpr_read_b32 v7, acc149                      // glvw 3 mb 1 tt1 20 r 0
v_accvgpr_read_b32 v8, acc153                      // glvw 3 mb 1 tt1 20 r 0
v_accvgpr_read_b32 v9, acc157                      // glvw 3 mb 1 tt1 20 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc145, v7
v_accvgpr_write_b32 acc149, v8
v_accvgpr_write_b32 acc153, v9
v_accvgpr_read_b32 v7, acc181                      // glvw 3 mb 1 tt1 21 r 0
v_accvgpr_read_b32 v8, acc185                      // glvw 3 mb 1 tt1 21 r 0
v_accvgpr_read_b32 v9, acc189                      // glvw 3 mb 1 tt1 21 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc177, v7
v_accvgpr_write_b32 acc181, v8
v_accvgpr_write_b32 acc185, v9
v_accvgpr_read_b32 v7, acc213                      // glvw 3 mb 1 tt1 22 r 0
v_accvgpr_read_b32 v8, acc217                      // glvw 3 mb 1 tt1 22 r 0
v_accvgpr_read_b32 v9, acc221                      // glvw 3 mb 1 tt1 22 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc209, v7
v_accvgpr_write_b32 acc213, v8
v_accvgpr_write_b32 acc217, v9
v_accvgpr_read_b32 v7, acc245                      // glvw 3 mb 1 tt1 23 r 0
v_accvgpr_read_b32 v8, acc249                      // glvw 3 mb 1 tt1 23 r 0
v_accvgpr_read_b32 v9, acc253                      // glvw 3 mb 1 tt1 23 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc241, v7
v_accvgpr_write_b32 acc245, v8
v_accvgpr_write_b32 acc249, v9
v_accvgpr_read_b32 v7, acc150                      // glvw 3 mb 1 tt1 24 r 0
v_accvgpr_read_b32 v8, acc154                      // glvw 3 mb 1 tt1 24 r 0
v_accvgpr_read_b32 v9, acc158                      // glvw 3 mb 1 tt1 24 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc146, v7
v_accvgpr_write_b32 acc150, v8
v_accvgpr_write_b32 acc154, v9
v_accvgpr_read_b32 v7, acc182                      // glvw 3 mb 1 tt1 25 r 0
v_accvgpr_read_b32 v8, acc186                      // glvw 3 mb 1 tt1 25 r 0
v_accvgpr_read_b32 v9, acc190                      // glvw 3 mb 1 tt1 25 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc178, v7
v_accvgpr_write_b32 acc182, v8
v_accvgpr_write_b32 acc186, v9
v_accvgpr_read_b32 v7, acc214                      // glvw 3 mb 1 tt1 26 r 0
v_accvgpr_read_b32 v8, acc218                      // glvw 3 mb 1 tt1 26 r 0
v_accvgpr_read_b32 v9, acc222                      // glvw 3 mb 1 tt1 26 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc210, v7
v_accvgpr_write_b32 acc214, v8
v_accvgpr_write_b32 acc218, v9
v_accvgpr_read_b32 v7, acc246                      // glvw 3 mb 1 tt1 27 r 0
v_accvgpr_read_b32 v8, acc250                      // glvw 3 mb 1 tt1 27 r 0
v_accvgpr_read_b32 v9, acc254                      // glvw 3 mb 1 tt1 27 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc242, v7
v_accvgpr_write_b32 acc246, v8
v_accvgpr_write_b32 acc250, v9
v_accvgpr_read_b32 v7, acc151                      // glvw 3 mb 1 tt1 28 r 0
v_accvgpr_read_b32 v8, acc155                      // glvw 3 mb 1 tt1 28 r 0
v_accvgpr_read_b32 v9, acc159                      // glvw 3 mb 1 tt1 28 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc147, v7
v_accvgpr_write_b32 acc151, v8
v_accvgpr_write_b32 acc155, v9
v_accvgpr_read_b32 v7, acc183                      // glvw 3 mb 1 tt1 29 r 0
v_accvgpr_read_b32 v8, acc187                      // glvw 3 mb 1 tt1 29 r 0
v_accvgpr_read_b32 v9, acc191                      // glvw 3 mb 1 tt1 29 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc179, v7
v_accvgpr_write_b32 acc183, v8
v_accvgpr_write_b32 acc187, v9
v_accvgpr_read_b32 v7, acc215                      // glvw 3 mb 1 tt1 30 r 0
v_accvgpr_read_b32 v8, acc219                      // glvw 3 mb 1 tt1 30 r 0
v_accvgpr_read_b32 v9, acc223                      // glvw 3 mb 1 tt1 30 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc211, v7
v_accvgpr_write_b32 acc215, v8
v_accvgpr_write_b32 acc219, v9
v_accvgpr_read_b32 v7, acc247                      // glvw 3 mb 1 tt1 31 r 0
v_accvgpr_read_b32 v8, acc251                      // glvw 3 mb 1 tt1 31 r 0
v_accvgpr_read_b32 v9, acc255                      // glvw 3 mb 1 tt1 31 r 0
s_nop 1                                            // v_accvgpr read vgpr after write vgpr: 2 wait states
v_accvgpr_write_b32 acc243, v7
v_accvgpr_write_b32 acc247, v8
v_accvgpr_write_b32 acc251, v9
s_mov_b64 s[8:9], 0xFFFFFFFFFFFFFFFF               // to restore all threads active
s_or_saveexec_b64 vcc, s[8:9]                      // all threads active
s_branch label_ShiftVectorComponents0_GLVW0        // done shifting

label_ShiftVectorComponents0_GLVW0:  /// end shift0

/* not-LocalSplitU: global write indices */
/* computeStoreVgprs */
// Derives the per-thread output coordinates from vgprSerial.
// Results left in VGPRs by this section:
//   v0 = 256*WorkGroup0 + 4*((Serial % 16) + 16*((Serial / 64) % 2))       (coord 0)
//   v1 = 256*WorkGroup1 + 4*(4*((Serial % 64) / 16) + 16*((Serial / 64) / 2)) (coord 1)
//   v2 = (v1 before the WorkGroup1 term is added) * StrideC1J
//   v3 = (v1 before the WorkGroup1 term is added) * StrideD1J
// v4, v5 and s8 are scratch here.
v_lshrrev_b32 v4, 6, v[vgprSerial]                 // v4 = Serial / 64
v_lshrrev_b32 v5, 1, v4                            // v5 = v4 / 2
v_mul_lo_u32 v5, 0x10, v5                          // v5 = 16 * (v4 / 2): wave offset along dim 1
v_and_b32 v1, 63, v[vgprSerial]                    // v1 = v[vgprSerial] % 64
v_lshrrev_b32 v1, 4, v1                            // v1 = (Serial % 64) / 16
v_lshlrev_b32 v1, 2, v1                            // v1 = 4 * ((Serial % 64) / 16)
v_add_lshl_u32 v1, v5, v1, 2                       // v1 = (v5 + v1) * 4: coord 1 without the workgroup term
v_mul_lo_u32 v2, v1, s[sgprStrideC1J]              // v2 = v1 * StrideC1J (uses v1 before WorkGroup1 is added)
v_mul_lo_u32 v3, v1, s[sgprStrideD1J]              // v3 = v1 * StrideD1J (uses v1 before WorkGroup1 is added)
v_and_b32 v0, 1, v4                                // v0 = v4 % 2
v_mul_lo_u32 v0, 0x10, v0                          // v0 = 16 * (v4 % 2): wave offset along dim 0
v_and_b32 v5, 15, v[vgprSerial]                    // v5 = v[vgprSerial] % 16
v_add_lshl_u32 v0, v5, v0, 2                       // v0 = (v5 + v0) * 4: coord 0 without the workgroup term
s_mul_i32 s8, 256, s[sgprWorkGroup0]               // wgp0 * MT0
v_add_u32 v0, s8, v0                               // coord 0 = 256*WorkGroup0 + 4*((Serial % 16) + 16*(wave % 2))
s_mul_i32 s8, 256, s[sgprWorkGroup1]               // wgp1 * MT1
v_add_u32 v1, s8, v1                               // coord 1 = 256*WorkGroup1 + 4*(4*((Serial % 64) / 16) + 16*(wave / 2))

/* not-LocalSplitU: global write */

/******************************************/
/* Global Write Elements                  */
/******************************************/
s_waitcnt lgkmcnt(0)                               // wait for 36 bytes of kern args.

// ---- ScaleAlphaVec buffer descriptor ----
// Words 0-1 = AddressScaleAlphaVec, word 3 = Srd127_96,
// word 2 = SizeI * 4 bytes, or 0 when the pointer is null.
s_mov_b64 s[sgprSrdScaleAlphaVec+0:sgprSrdScaleAlphaVec+0+1], s[sgprAddressScaleAlphaVec+0:sgprAddressScaleAlphaVec+0+1] // init SRD base address
s_mov_b32 s[sgprSrdScaleAlphaVec+3], Srd127_96     // Set bits 127_96 in post-loop SRD
s_cmp_eq_u64 s[sgprAddressScaleAlphaVec:sgprAddressScaleAlphaVec+1], 0 // SCC = (AddressScaleAlphaVec == 0)
s_cselect_b32 s[sgprSrdScaleAlphaVec+2], 0, s[sgprSizeI] // null pointer -> 0 elements, otherwise SizeI
// The two labels below no longer guard any code; they are kept only so that a
// reference from outside this section (none is visible here) still assembles.
label_ScaleAlphaVecAddrValid:
label_ScaleAlphaVecAddrValid_End:

s_mul_i32 s[sgprSrdScaleAlphaVec+2], 0x4, s[sgprSrdScaleAlphaVec+2] // ScaleAlphaVec scaled by BPE

// ---- Bias buffer descriptor ----
// s8 = BiasStride * (WorkGroup2 + 1), replaced by SizeI when that product is 0.
// Word 2 = s8 (still in elements; it is scaled to bytes later), or 0 when the
// Bias pointer is null.
s_add_u32 s8, s[sgprWorkGroup2], 0x1               // wg + 1
s_mul_i32 s8, s[sgprBiasStride], s8                // stride * (wg+1)
s_cmp_eq_u32 s8, 0                                 // bias stride = 0?
s_cselect_b32 s8, s[sgprSizeI], s8                 // product == 0 -> use SizeI instead
s_mov_b64 s[sgprSrdBias+0:sgprSrdBias+0+1], s[sgprAddressBias+0:sgprAddressBias+0+1] // init SRD base address
s_mov_b32 s[sgprSrdBias+3], Srd127_96              // Set bits 127_96 in post-loop SRD
s_cmp_eq_u64 s[sgprAddressBias:sgprAddressBias+1], 0 // SCC = (AddressBias == 0)
s_cselect_b32 s[sgprSrdBias+2], 0, s8              // null pointer -> 0 elements, otherwise s8
// Labels retained for the same reason as the ScaleAlphaVec pair above.
label_BiasAddrValid:
label_BiasAddrValid_End:


/******************************************/
/* Read vector to LDS                     */
/******************************************/
// Each thread loads one Bias dword and one ScaleAlphaVec dword at element index
// 256*WorkGroup0 + Serial (Bias additionally offset by BiasStride*WorkGroup2) and
// stores them to LDS at byte offsets Serial*4 (bias) and Serial*4 + 1024 (scaleAlpha).
// When the ScaleAlphaVec SRD size word is 0, 1.0 is stored instead of the loaded value.
// Scratch: s8, v4-v8. sgprAddressScaleAlphaVec is reused as a lane mask and then,
// together with sgprSrdScaleAlphaVec, marked UNDEF.
s_mul_i32 s8, 256, s[sgprWorkGroup0]               // wgp0 * MT0
v_add_u32 v8, s8, v[vgprSerial]                    // coord 0 = wgp0 * MT0 + thread offset
s_mul_i32 s[sgprSrdBias+2], 0x4, s[sgprSrdBias+2]  // Bias SRD size word scaled by BPE (4 bytes)
s_mul_i32 s8, s[sgprBiasStride], s[sgprWorkGroup2] // Stride * WG
v_add_u32 v6, s8, v8                               // coord 0 = wgp0 * MT0 + thread offset + Stride * WG
v_lshlrev_b32 v6, 0x2, v6                          // Global bias address scaled by BPE
v_lshlrev_b32 v7, 0x2, v8                          // Global scaleAlpha address scaled by BPE
s_mul_i32 s8, 256, s[sgprWorkGroup1]               // wgp1 * MT1
v_add_u32 v8, s8, v[vgprSerial]                    // coord 1 = wgp1 * MT1 + thread offset; NOTE(review): v8 is overwritten below before it is read
buffer_load_dword v4, v6, s[sgprSrdBias:sgprSrdBias+3], 0 offen offset:0 // Load Bias
buffer_load_dword v5, v7, s[sgprSrdScaleAlphaVec:sgprSrdScaleAlphaVec+3], 0 offen offset:0 // Load ScaleAlphaVec
v_lshlrev_b32 v8, 0x2, v[vgprSerial]               // Local address scaled by BPE
s_barrier                                          // barrier ahead of the LDS writes below; the loads are waited on by the s_waitcnt lines, not here
s_waitcnt vmcnt(1)                                 // at most 1 load outstanding -> the Bias load (issued first) has returned
ds_write_b32 v8, v4 offset:0                       // store bias
v_cmp_gt_u32 s[sgprAddressScaleAlphaVec:sgprAddressScaleAlphaVec+1], s[sgprSrdScaleAlphaVec+2], 0 // mask = (ScaleAlphaVec SRD size word > 0)
s_waitcnt vmcnt(0)                                 // ScaleAlphaVec load has returned
v_cndmask_b32 v5, 1.0, v5, s[sgprAddressScaleAlphaVec:sgprAddressScaleAlphaVec+1] // keep loaded v5 where mask is set, otherwise use 1.0
ds_write_b32 v8, v5 offset:1024                    // store scaleAlpha
.set sgprAddressScaleAlphaVec, UNDEF
.set sgprSrdScaleAlphaVec, UNDEF
// StreamK dispatch for the finished accumulation:
//   StreamKLocalStart != 0                        -> long branch to label_SK_Partials
//   StreamKLocalStart == 0 and LocalEnd == ItersPerTile -> label_SK_Store
//   otherwise fall through: compute StreamKIterEnd / ItersPerTile and continue
//   into the fixup code that follows.
s_cmp_eq_u32 s[sgprStreamKLocalStart], 0           // does wg start tile?
s_cbranch_scc1 label_NoBranch_QWMA7J3AUDGL0X23     // Only branch on scc0
// PC-relative long branch: s[84:85] = PC from s_getpc + (label_SK_Partials + 4), s86 is scratch.
s_getpc_b64 s[84:85]                               // addr of next instr
s_add_i32 s86, label_SK_Partials, 4                // target branch offset
s_add_u32 s84, s84, s86                            // add target branch offset
s_addc_u32 s85, s85, 0                             // add high and carry
s_setpc_b64 s[84:85]                               // branch to label_SK_Partials
label_NoBranch_QWMA7J3AUDGL0X23:
s_cmp_eq_u32 s[sgprStreamKLocalEnd], s[sgprItersPerTile] // does wg finish tile?
s_cbranch_scc1 label_SK_Store                      // Branch if started and finished tile, go to regular store code
s_add_u32 s77, s[sgprStreamKIdx], 1                // input partial tile index
// Unsigned divide StreamKIterEnd by ItersPerTile via a float reciprocal estimate,
// followed by two exec-masked corrections. Quotient -> v9/s73, remainder -> v10/s78.
v_cvt_f32_u32 v9, s[sgprItersPerTile]              // v9 = float(ItersPerTile)
v_rcp_iflag_f32 v9, v9                             // v9 = 1.0 / ItersPerTile
v_cvt_f32_u32 v10, s[sgprStreamKIterEnd]           // v10 = float(StreamKIterEnd)
v_mul_f32 v9, v9, v10                              // v9 = StreamKIterEnd * (1.0 / ItersPerTile)
v_cvt_u32_f32 v9, v9                               // v9 = quotient estimate as u32
v_mul_u32_u24 v10, v9, s[sgprItersPerTile]         // v10 = quotient * ItersPerTile (24-bit multiply)
v_sub_u32 v10, s[sgprStreamKIterEnd], v10          // v10 = remainder estimate
v_cmpx_eq_u32 exec, v10, s[sgprItersPerTile]       // enable only lanes where remainder == ItersPerTile (estimate one too low)
v_add_u32 v9, 1, v9                                // those lanes: quotient += 1
v_mov_b32 v10, 0                                   // those lanes: remainder = 0
s_mov_b64 exec, -1                                 // Reset exec
v_cmpx_gt_u32 exec, v10, s[sgprItersPerTile]       // overflow happened in remainder
v_sub_u32 v9, v9, 1                                // quotient - 1
v_mul_u32_u24 v10, v9, s[sgprItersPerTile]         // re-calculate remainder
v_sub_u32 v10, s[sgprStreamKIterEnd], v10          // re-calculate remainder
s_mov_b64 exec, -1                                 // Reset exec
v_readfirstlane_b32 s73, v9                        // quotient; NOTE(review): s73 is rewritten at label_SK_Fixup before any visible use
v_readfirstlane_b32 s78, v10                       // remainder
// Poll the flag dword at AddressFlags + 4*s77 until it reads 1, then have the
// wave whose first active lane has Serial == 0 write 0 back to that flag.
// Scratch: s73 (byte offset), s75 (flag value, then first-lane Serial).
label_SK_Fixup:
s_lshl_b32 s73, s77, 2                             // flag offset based on CTA index
s_load_dword s75, s[sgprAddressFlags:sgprAddressFlags+1], s73 glc // get flag
s_waitcnt lgkmcnt(0)                               // wait for flag load
s_cmp_eq_u32 s75, 1                                // check if ready
s_cbranch_scc0 label_SK_Fixup                      // if flag not set, wait and check again
s_barrier                                          // every wave of this workgroup has seen the flag before it is reset
v_readfirstlane_b32 s75, v[vgprSerial]             // Wave 0 updates flags
s_cmp_eq_u32 s75, 0                                // Check for wave 0
s_cbranch_scc0 label_SK_SkipFlagReset              // Skip flag reset
s_store_dword s75, s[sgprAddressFlags:sgprAddressFlags+1], s73 glc // reset flag (s75 == 0 on this path)
label_SK_SkipFlagReset:
// Builds the workspace buffer descriptor sgprSrdWS: base = AddressWS + 0x40000 * s77,
// size word = BufferOOB, word 3 = Srd127_96. s74 is scratch.
label_Fixup_E0:

/* edge=0, allocate 2 sgpr. perBatchTmpS=2 perBatchMaskS=0 perElementMaskS=0 elementsPerBatch=28 */
s_mov_b64 s[sgprSrdWS+0:sgprSrdWS+0+1], s[sgprAddressWS+0:sgprAddressWS+0+1] // init SRD base address
s_mov_b32 s[sgprSrdWS+2], BufferOOB
s_mov_b32 s[sgprSrdWS+3], Srd127_96                // Set bits 127_96 in post-loop SRD

// 0x40000 = 256 * 256 * 4 bytes; presumably one 256x256 tile of 4-byte partials - TODO confirm.
// NOTE(review): s_mul_i32 keeps only the low 32 bits, so the offset wraps once s77 >= 0x4000.
s_mul_i32 s74, 0x40000, s77                        // Offset to correct partials tile
s_add_u32 s[sgprSrdWS+0], s[sgprSrdWS+0], s74      // add lo to SRD
s_addc_u32 s[sgprSrdWS+1], s[sgprSrdWS+1], 0       // add hi to SRD
/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */

/******************************************/
/* Fixup Batch #0 (d1,d0,vc1,vc0) =       */
/*      (0,0,0,0:vw4); (0,1,0,0:vw4); (0,0,1,0:vw4); (0,1,1,0:vw4); (0,0,2,0:vw4); (0,1,2,0:vw4); (0,0,3,0:vw4); (0,1,3,0:vw4); (0,0,4,0:vw4); (0,1,4,0:vw4); (0,0,5,0:vw4); (0,1,5,0:vw4); (0,0,6,0:vw4); (0,1,6,0:vw4); (0,0,7,0:vw4); (0,1,7,0:vw4); (0,0,8,0:vw4); (0,1,8,0:vw4); (0,0,9,0:vw4); (0,1,9,0:vw4); (0,0,10,0:vw4); (0,1,10,0:vw4); (0,0,11,0:vw4); (0,1,11,0:vw4); (0,0,12,0:vw4); (0,1,12,0:vw4); (0,0,13,0:vw4); (0,1,13,0:vw4) */
/******************************************/

/* calc coords, apply mask, and issue loads (if necessary) */
// Issues 28 back-to-back 16-byte workspace loads through sgprSrdWS.
// Per-thread byte offset is v10 = Serial * 16; the scalar offset s74 starts at 0 and
// advances by 4096 between loads, so the last load uses s74 = 27 * 4096.
// Destinations are v[124:143] and v[152:243]; v144-v151 are not written here.
// No s_waitcnt is issued in this run of loads.
v_lshlrev_b32 v10, 4, v[vgprSerial]                // v10 = v[vgprSerial] * 16
s_mov_b32 s74, 0                                   // Init sgpr offset
buffer_load_dwordx4 v[124:127], v10, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS
s_add_u32 s74, s74, 4096                           // Inc sgpr offset
buffer_load_dwordx4 v[128:131], v10, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS
s_add_u32 s74, s74, 4096                           // Inc sgpr offset
buffer_load_dwordx4 v[132:135], v10, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS
s_add_u32 s74, s74, 4096                           // Inc sgpr offset
buffer_load_dwordx4 v[136:139], v10, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS
s_add_u32 s74, s74, 4096                           // Inc sgpr offset
buffer_load_dwordx4 v[140:143], v10, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS
s_add_u32 s74, s74, 4096                           // Inc sgpr offset
buffer_load_dwordx4 v[152:155], v10, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS
s_add_u32 s74, s74, 4096                           // Inc sgpr offset
buffer_load_dwordx4 v[156:159], v10, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS
s_add_u32 s74, s74, 4096                           // Inc sgpr offset
buffer_load_dwordx4 v[160:163], v10, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS
s_add_u32 s74, s74, 4096                           // Inc sgpr offset
buffer_load_dwordx4 v[164:167], v10, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS
s_add_u32 s74, s74, 4096                           // Inc sgpr offset
buffer_load_dwordx4 v[168:171], v10, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS
s_add_u32 s74, s74, 4096                           // Inc sgpr offset
buffer_load_dwordx4 v[172:175], v10, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS
s_add_u32 s74, s74, 4096                           // Inc sgpr offset
buffer_load_dwordx4 v[176:179], v10, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS
s_add_u32 s74, s74, 4096                           // Inc sgpr offset
buffer_load_dwordx4 v[180:183], v10, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS
s_add_u32 s74, s74, 4096                           // Inc sgpr offset
buffer_load_dwordx4 v[184:187], v10, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS
s_add_u32 s74, s74, 4096                           // Inc sgpr offset
buffer_load_dwordx4 v[188:191], v10, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS
s_add_u32 s74, s74, 4096                           // Inc sgpr offset
buffer_load_dwordx4 v[192:195], v10, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS
s_add_u32 s74, s74, 4096                           // Inc sgpr offset
buffer_load_dwordx4 v[196:199], v10, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS
s_add_u32 s74, s74, 4096                           // Inc sgpr offset
buffer_load_dwordx4 v[200:203], v10, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS
s_add_u32 s74, s74, 4096                           // Inc sgpr offset
buffer_load_dwordx4 v[204:207], v10, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS
s_add_u32 s74, s74, 4096                           // Inc sgpr offset
buffer_load_dwordx4 v[208:211], v10, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS
s_add_u32 s74, s74, 4096                           // Inc sgpr offset
buffer_load_dwordx4 v[212:215], v10, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS
s_add_u32 s74, s74, 4096                           // Inc sgpr offset
buffer_load_dwordx4 v[216:219], v10, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS
s_add_u32 s74, s74, 4096                           // Inc sgpr offset
buffer_load_dwordx4 v[220:223], v10, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS
s_add_u32 s74, s74, 4096                           // Inc sgpr offset
buffer_load_dwordx4 v[224:227], v10, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS
s_add_u32 s74, s74, 4096                           // Inc sgpr offset
buffer_load_dwordx4 v[228:231], v10, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS
s_add_u32 s74, s74, 4096                           // Inc sgpr offset
buffer_load_dwordx4 v[232:235], v10, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS
s_add_u32 s74, s74, 4096                           // Inc sgpr offset
buffer_load_dwordx4 v[236:239], v10, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS
s_add_u32 s74, s74, 4096                           // Inc sgpr offset
buffer_load_dwordx4 v[240:243], v10, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS
// Copies 112 accumulator registers into v[vgprValuC+12 .. vgprValuC+123].
// For i = 0..111 the source is acc[(i % 32) * 4 + i / 32]: the first 32 copies
// read acc0, acc4, ..., acc124, the next 32 read acc1, acc5, ..., acc125, then
// acc2, ..., acc126, and the last 16 read acc3, acc7, ..., acc63.
// The trailing s_nop provides the wait states needed before these VGPRs are read.
v_accvgpr_read_b32 v[vgprValuC+12], acc0           // copy acc to vreg[0]
v_accvgpr_read_b32 v[vgprValuC+13], acc4           // copy acc to vreg[1]
v_accvgpr_read_b32 v[vgprValuC+14], acc8           // copy acc to vreg[2]
v_accvgpr_read_b32 v[vgprValuC+15], acc12          // copy acc to vreg[3]
v_accvgpr_read_b32 v[vgprValuC+16], acc16          // copy acc to vreg[4]
v_accvgpr_read_b32 v[vgprValuC+17], acc20          // copy acc to vreg[5]
v_accvgpr_read_b32 v[vgprValuC+18], acc24          // copy acc to vreg[6]
v_accvgpr_read_b32 v[vgprValuC+19], acc28          // copy acc to vreg[7]
v_accvgpr_read_b32 v[vgprValuC+20], acc32          // copy acc to vreg[8]
v_accvgpr_read_b32 v[vgprValuC+21], acc36          // copy acc to vreg[9]
v_accvgpr_read_b32 v[vgprValuC+22], acc40          // copy acc to vreg[10]
v_accvgpr_read_b32 v[vgprValuC+23], acc44          // copy acc to vreg[11]
v_accvgpr_read_b32 v[vgprValuC+24], acc48          // copy acc to vreg[12]
v_accvgpr_read_b32 v[vgprValuC+25], acc52          // copy acc to vreg[13]
v_accvgpr_read_b32 v[vgprValuC+26], acc56          // copy acc to vreg[14]
v_accvgpr_read_b32 v[vgprValuC+27], acc60          // copy acc to vreg[15]
v_accvgpr_read_b32 v[vgprValuC+28], acc64          // copy acc to vreg[16]
v_accvgpr_read_b32 v[vgprValuC+29], acc68          // copy acc to vreg[17]
v_accvgpr_read_b32 v[vgprValuC+30], acc72          // copy acc to vreg[18]
v_accvgpr_read_b32 v[vgprValuC+31], acc76          // copy acc to vreg[19]
v_accvgpr_read_b32 v[vgprValuC+32], acc80          // copy acc to vreg[20]
v_accvgpr_read_b32 v[vgprValuC+33], acc84          // copy acc to vreg[21]
v_accvgpr_read_b32 v[vgprValuC+34], acc88          // copy acc to vreg[22]
v_accvgpr_read_b32 v[vgprValuC+35], acc92          // copy acc to vreg[23]
v_accvgpr_read_b32 v[vgprValuC+36], acc96          // copy acc to vreg[24]
v_accvgpr_read_b32 v[vgprValuC+37], acc100         // copy acc to vreg[25]
v_accvgpr_read_b32 v[vgprValuC+38], acc104         // copy acc to vreg[26]
v_accvgpr_read_b32 v[vgprValuC+39], acc108         // copy acc to vreg[27]
v_accvgpr_read_b32 v[vgprValuC+40], acc112         // copy acc to vreg[28]
v_accvgpr_read_b32 v[vgprValuC+41], acc116         // copy acc to vreg[29]
v_accvgpr_read_b32 v[vgprValuC+42], acc120         // copy acc to vreg[30]
v_accvgpr_read_b32 v[vgprValuC+43], acc124         // copy acc to vreg[31]
v_accvgpr_read_b32 v[vgprValuC+44], acc1           // copy acc to vreg[32]
v_accvgpr_read_b32 v[vgprValuC+45], acc5           // copy acc to vreg[33]
v_accvgpr_read_b32 v[vgprValuC+46], acc9           // copy acc to vreg[34]
v_accvgpr_read_b32 v[vgprValuC+47], acc13          // copy acc to vreg[35]
v_accvgpr_read_b32 v[vgprValuC+48], acc17          // copy acc to vreg[36]
v_accvgpr_read_b32 v[vgprValuC+49], acc21          // copy acc to vreg[37]
v_accvgpr_read_b32 v[vgprValuC+50], acc25          // copy acc to vreg[38]
v_accvgpr_read_b32 v[vgprValuC+51], acc29          // copy acc to vreg[39]
v_accvgpr_read_b32 v[vgprValuC+52], acc33          // copy acc to vreg[40]
v_accvgpr_read_b32 v[vgprValuC+53], acc37          // copy acc to vreg[41]
v_accvgpr_read_b32 v[vgprValuC+54], acc41          // copy acc to vreg[42]
v_accvgpr_read_b32 v[vgprValuC+55], acc45          // copy acc to vreg[43]
v_accvgpr_read_b32 v[vgprValuC+56], acc49          // copy acc to vreg[44]
v_accvgpr_read_b32 v[vgprValuC+57], acc53          // copy acc to vreg[45]
v_accvgpr_read_b32 v[vgprValuC+58], acc57          // copy acc to vreg[46]
v_accvgpr_read_b32 v[vgprValuC+59], acc61          // copy acc to vreg[47]
v_accvgpr_read_b32 v[vgprValuC+60], acc65          // copy acc to vreg[48]
v_accvgpr_read_b32 v[vgprValuC+61], acc69          // copy acc to vreg[49]
v_accvgpr_read_b32 v[vgprValuC+62], acc73          // copy acc to vreg[50]
v_accvgpr_read_b32 v[vgprValuC+63], acc77          // copy acc to vreg[51]
v_accvgpr_read_b32 v[vgprValuC+64], acc81          // copy acc to vreg[52]
v_accvgpr_read_b32 v[vgprValuC+65], acc85          // copy acc to vreg[53]
v_accvgpr_read_b32 v[vgprValuC+66], acc89          // copy acc to vreg[54]
v_accvgpr_read_b32 v[vgprValuC+67], acc93          // copy acc to vreg[55]
v_accvgpr_read_b32 v[vgprValuC+68], acc97          // copy acc to vreg[56]
v_accvgpr_read_b32 v[vgprValuC+69], acc101         // copy acc to vreg[57]
v_accvgpr_read_b32 v[vgprValuC+70], acc105         // copy acc to vreg[58]
v_accvgpr_read_b32 v[vgprValuC+71], acc109         // copy acc to vreg[59]
v_accvgpr_read_b32 v[vgprValuC+72], acc113         // copy acc to vreg[60]
v_accvgpr_read_b32 v[vgprValuC+73], acc117         // copy acc to vreg[61]
v_accvgpr_read_b32 v[vgprValuC+74], acc121         // copy acc to vreg[62]
v_accvgpr_read_b32 v[vgprValuC+75], acc125         // copy acc to vreg[63]
v_accvgpr_read_b32 v[vgprValuC+76], acc2           // copy acc to vreg[64]
v_accvgpr_read_b32 v[vgprValuC+77], acc6           // copy acc to vreg[65]
v_accvgpr_read_b32 v[vgprValuC+78], acc10          // copy acc to vreg[66]
v_accvgpr_read_b32 v[vgprValuC+79], acc14          // copy acc to vreg[67]
v_accvgpr_read_b32 v[vgprValuC+80], acc18          // copy acc to vreg[68]
v_accvgpr_read_b32 v[vgprValuC+81], acc22          // copy acc to vreg[69]
v_accvgpr_read_b32 v[vgprValuC+82], acc26          // copy acc to vreg[70]
v_accvgpr_read_b32 v[vgprValuC+83], acc30          // copy acc to vreg[71]
v_accvgpr_read_b32 v[vgprValuC+84], acc34          // copy acc to vreg[72]
v_accvgpr_read_b32 v[vgprValuC+85], acc38          // copy acc to vreg[73]
v_accvgpr_read_b32 v[vgprValuC+86], acc42          // copy acc to vreg[74]
v_accvgpr_read_b32 v[vgprValuC+87], acc46          // copy acc to vreg[75]
v_accvgpr_read_b32 v[vgprValuC+88], acc50          // copy acc to vreg[76]
v_accvgpr_read_b32 v[vgprValuC+89], acc54          // copy acc to vreg[77]
v_accvgpr_read_b32 v[vgprValuC+90], acc58          // copy acc to vreg[78]
v_accvgpr_read_b32 v[vgprValuC+91], acc62          // copy acc to vreg[79]
v_accvgpr_read_b32 v[vgprValuC+92], acc66          // copy acc to vreg[80]
v_accvgpr_read_b32 v[vgprValuC+93], acc70          // copy acc to vreg[81]
v_accvgpr_read_b32 v[vgprValuC+94], acc74          // copy acc to vreg[82]
v_accvgpr_read_b32 v[vgprValuC+95], acc78          // copy acc to vreg[83]
v_accvgpr_read_b32 v[vgprValuC+96], acc82          // copy acc to vreg[84]
v_accvgpr_read_b32 v[vgprValuC+97], acc86          // copy acc to vreg[85]
v_accvgpr_read_b32 v[vgprValuC+98], acc90          // copy acc to vreg[86]
v_accvgpr_read_b32 v[vgprValuC+99], acc94          // copy acc to vreg[87]
v_accvgpr_read_b32 v[vgprValuC+100], acc98         // copy acc to vreg[88]
v_accvgpr_read_b32 v[vgprValuC+101], acc102        // copy acc to vreg[89]
v_accvgpr_read_b32 v[vgprValuC+102], acc106        // copy acc to vreg[90]
v_accvgpr_read_b32 v[vgprValuC+103], acc110        // copy acc to vreg[91]
v_accvgpr_read_b32 v[vgprValuC+104], acc114        // copy acc to vreg[92]
v_accvgpr_read_b32 v[vgprValuC+105], acc118        // copy acc to vreg[93]
v_accvgpr_read_b32 v[vgprValuC+106], acc122        // copy acc to vreg[94]
v_accvgpr_read_b32 v[vgprValuC+107], acc126        // copy acc to vreg[95]
v_accvgpr_read_b32 v[vgprValuC+108], acc3          // copy acc to vreg[96]
v_accvgpr_read_b32 v[vgprValuC+109], acc7          // copy acc to vreg[97]
v_accvgpr_read_b32 v[vgprValuC+110], acc11         // copy acc to vreg[98]
v_accvgpr_read_b32 v[vgprValuC+111], acc15         // copy acc to vreg[99]
v_accvgpr_read_b32 v[vgprValuC+112], acc19         // copy acc to vreg[100]
v_accvgpr_read_b32 v[vgprValuC+113], acc23         // copy acc to vreg[101]
v_accvgpr_read_b32 v[vgprValuC+114], acc27         // copy acc to vreg[102]
v_accvgpr_read_b32 v[vgprValuC+115], acc31         // copy acc to vreg[103]
v_accvgpr_read_b32 v[vgprValuC+116], acc35         // copy acc to vreg[104]
v_accvgpr_read_b32 v[vgprValuC+117], acc39         // copy acc to vreg[105]
v_accvgpr_read_b32 v[vgprValuC+118], acc43         // copy acc to vreg[106]
v_accvgpr_read_b32 v[vgprValuC+119], acc47         // copy acc to vreg[107]
v_accvgpr_read_b32 v[vgprValuC+120], acc51         // copy acc to vreg[108]
v_accvgpr_read_b32 v[vgprValuC+121], acc55         // copy acc to vreg[109]
v_accvgpr_read_b32 v[vgprValuC+122], acc59         // copy acc to vreg[110]
v_accvgpr_read_b32 v[vgprValuC+123], acc63         // copy acc to vreg[111]
s_nop 1                                            // 2 wait states required before reading vgpr

/* apply mask, calc new C and issue writes */

s_waitcnt vmcnt(27)                                // wait C (interleaved) 27 = 28 - 0 + 0 - 1
v_add_f32 v[vgprValuC+12], v[vgprValuC+12], v124   // accum partials
v_add_f32 v[vgprValuC+13], v[vgprValuC+13], v125   // accum partials
v_add_f32 v[vgprValuC+14], v[vgprValuC+14], v126   // accum partials
v_add_f32 v[vgprValuC+15], v[vgprValuC+15], v127   // accum partials

s_waitcnt vmcnt(26)                                // wait C (interleaved) 26 = 28 - 1 + 0 - 1
v_add_f32 v[vgprValuC+16], v[vgprValuC+16], v128   // accum partials
v_add_f32 v[vgprValuC+17], v[vgprValuC+17], v129   // accum partials
v_add_f32 v[vgprValuC+18], v[vgprValuC+18], v130   // accum partials
v_add_f32 v[vgprValuC+19], v[vgprValuC+19], v131   // accum partials

s_waitcnt vmcnt(25)                                // wait C (interleaved) 25 = 28 - 2 + 0 - 1
v_add_f32 v[vgprValuC+20], v[vgprValuC+20], v132   // accum partials
v_add_f32 v[vgprValuC+21], v[vgprValuC+21], v133   // accum partials
v_add_f32 v[vgprValuC+22], v[vgprValuC+22], v134   // accum partials
v_add_f32 v[vgprValuC+23], v[vgprValuC+23], v135   // accum partials

s_waitcnt vmcnt(24)                                // wait C (interleaved) 24 = 28 - 3 + 0 - 1
v_add_f32 v[vgprValuC+24], v[vgprValuC+24], v136   // accum partials
v_add_f32 v[vgprValuC+25], v[vgprValuC+25], v137   // accum partials
v_add_f32 v[vgprValuC+26], v[vgprValuC+26], v138   // accum partials
v_add_f32 v[vgprValuC+27], v[vgprValuC+27], v139   // accum partials

s_waitcnt vmcnt(23)                                // wait C (interleaved) 23 = 28 - 4 + 0 - 1
v_add_f32 v[vgprValuC+28], v[vgprValuC+28], v140   // accum partials
v_add_f32 v[vgprValuC+29], v[vgprValuC+29], v141   // accum partials
v_add_f32 v[vgprValuC+30], v[vgprValuC+30], v142   // accum partials
v_add_f32 v[vgprValuC+31], v[vgprValuC+31], v143   // accum partials

s_waitcnt vmcnt(22)                                // wait C (interleaved) 22 = 28 - 5 + 0 - 1
v_add_f32 v[vgprValuC+32], v[vgprValuC+32], v152   // accum partials
v_add_f32 v[vgprValuC+33], v[vgprValuC+33], v153   // accum partials
v_add_f32 v[vgprValuC+34], v[vgprValuC+34], v154   // accum partials
v_add_f32 v[vgprValuC+35], v[vgprValuC+35], v155   // accum partials

s_waitcnt vmcnt(21)                                // wait C (interleaved) 21 = 28 - 6 + 0 - 1
v_add_f32 v[vgprValuC+36], v[vgprValuC+36], v156   // accum partials
v_add_f32 v[vgprValuC+37], v[vgprValuC+37], v157   // accum partials
v_add_f32 v[vgprValuC+38], v[vgprValuC+38], v158   // accum partials
v_add_f32 v[vgprValuC+39], v[vgprValuC+39], v159   // accum partials

s_waitcnt vmcnt(20)                                // wait C (interleaved) 20 = 28 - 7 + 0 - 1
v_add_f32 v[vgprValuC+40], v[vgprValuC+40], v160   // accum partials
v_add_f32 v[vgprValuC+41], v[vgprValuC+41], v161   // accum partials
v_add_f32 v[vgprValuC+42], v[vgprValuC+42], v162   // accum partials
v_add_f32 v[vgprValuC+43], v[vgprValuC+43], v163   // accum partials

s_waitcnt vmcnt(19)                                // wait C (interleaved) 19 = 28 - 8 + 0 - 1
v_add_f32 v[vgprValuC+44], v[vgprValuC+44], v164   // accum partials
v_add_f32 v[vgprValuC+45], v[vgprValuC+45], v165   // accum partials
v_add_f32 v[vgprValuC+46], v[vgprValuC+46], v166   // accum partials
v_add_f32 v[vgprValuC+47], v[vgprValuC+47], v167   // accum partials

s_waitcnt vmcnt(18)                                // wait C (interleaved) 18 = 28 - 9 + 0 - 1
v_add_f32 v[vgprValuC+48], v[vgprValuC+48], v168   // accum partials
v_add_f32 v[vgprValuC+49], v[vgprValuC+49], v169   // accum partials
v_add_f32 v[vgprValuC+50], v[vgprValuC+50], v170   // accum partials
v_add_f32 v[vgprValuC+51], v[vgprValuC+51], v171   // accum partials

s_waitcnt vmcnt(17)                                // wait C (interleaved) 17 = 28 - 10 + 0 - 1
v_add_f32 v[vgprValuC+52], v[vgprValuC+52], v172   // accum partials
v_add_f32 v[vgprValuC+53], v[vgprValuC+53], v173   // accum partials
v_add_f32 v[vgprValuC+54], v[vgprValuC+54], v174   // accum partials
v_add_f32 v[vgprValuC+55], v[vgprValuC+55], v175   // accum partials

s_waitcnt vmcnt(16)                                // wait C (interleaved) 16 = 28 - 11 + 0 - 1
v_add_f32 v[vgprValuC+56], v[vgprValuC+56], v176   // accum partials
v_add_f32 v[vgprValuC+57], v[vgprValuC+57], v177   // accum partials
v_add_f32 v[vgprValuC+58], v[vgprValuC+58], v178   // accum partials
v_add_f32 v[vgprValuC+59], v[vgprValuC+59], v179   // accum partials

s_waitcnt vmcnt(15)                                // wait C (interleaved) 15 = 28 - 12 + 0 - 1
v_add_f32 v[vgprValuC+60], v[vgprValuC+60], v180   // accum partials
v_add_f32 v[vgprValuC+61], v[vgprValuC+61], v181   // accum partials
v_add_f32 v[vgprValuC+62], v[vgprValuC+62], v182   // accum partials
v_add_f32 v[vgprValuC+63], v[vgprValuC+63], v183   // accum partials

s_waitcnt vmcnt(14)                                // wait C (interleaved) 14 = 28 - 13 + 0 - 1
v_add_f32 v[vgprValuC+64], v[vgprValuC+64], v184   // accum partials
v_add_f32 v[vgprValuC+65], v[vgprValuC+65], v185   // accum partials
v_add_f32 v[vgprValuC+66], v[vgprValuC+66], v186   // accum partials
v_add_f32 v[vgprValuC+67], v[vgprValuC+67], v187   // accum partials

s_waitcnt vmcnt(13)                                // wait C (interleaved) 13 = 28 - 14 + 0 - 1
v_add_f32 v[vgprValuC+68], v[vgprValuC+68], v188   // accum partials
v_add_f32 v[vgprValuC+69], v[vgprValuC+69], v189   // accum partials
v_add_f32 v[vgprValuC+70], v[vgprValuC+70], v190   // accum partials
v_add_f32 v[vgprValuC+71], v[vgprValuC+71], v191   // accum partials

s_waitcnt vmcnt(12)                                // wait C (interleaved) 12 = 28 - 15 + 0 - 1
v_add_f32 v[vgprValuC+72], v[vgprValuC+72], v192   // accum partials
v_add_f32 v[vgprValuC+73], v[vgprValuC+73], v193   // accum partials
v_add_f32 v[vgprValuC+74], v[vgprValuC+74], v194   // accum partials
v_add_f32 v[vgprValuC+75], v[vgprValuC+75], v195   // accum partials

s_waitcnt vmcnt(11)                                // wait C (interleaved) 11 = 28 - 16 + 0 - 1
v_add_f32 v[vgprValuC+76], v[vgprValuC+76], v196   // accum partials
v_add_f32 v[vgprValuC+77], v[vgprValuC+77], v197   // accum partials
v_add_f32 v[vgprValuC+78], v[vgprValuC+78], v198   // accum partials
v_add_f32 v[vgprValuC+79], v[vgprValuC+79], v199   // accum partials

s_waitcnt vmcnt(10)                                // wait C (interleaved) 10 = 28 - 17 + 0 - 1
v_add_f32 v[vgprValuC+80], v[vgprValuC+80], v200   // accum partials
v_add_f32 v[vgprValuC+81], v[vgprValuC+81], v201   // accum partials
v_add_f32 v[vgprValuC+82], v[vgprValuC+82], v202   // accum partials
v_add_f32 v[vgprValuC+83], v[vgprValuC+83], v203   // accum partials

s_waitcnt vmcnt(9)                                 // wait C (interleaved) 9 = 28 - 18 + 0 - 1
v_add_f32 v[vgprValuC+84], v[vgprValuC+84], v204   // accum partials
v_add_f32 v[vgprValuC+85], v[vgprValuC+85], v205   // accum partials
v_add_f32 v[vgprValuC+86], v[vgprValuC+86], v206   // accum partials
v_add_f32 v[vgprValuC+87], v[vgprValuC+87], v207   // accum partials

s_waitcnt vmcnt(8)                                 // wait C (interleaved) 8 = 28 - 19 + 0 - 1
v_add_f32 v[vgprValuC+88], v[vgprValuC+88], v208   // accum partials
v_add_f32 v[vgprValuC+89], v[vgprValuC+89], v209   // accum partials
v_add_f32 v[vgprValuC+90], v[vgprValuC+90], v210   // accum partials
v_add_f32 v[vgprValuC+91], v[vgprValuC+91], v211   // accum partials

s_waitcnt vmcnt(7)                                 // wait C (interleaved) 7 = 28 - 20 + 0 - 1
v_add_f32 v[vgprValuC+92], v[vgprValuC+92], v212   // accum partials
v_add_f32 v[vgprValuC+93], v[vgprValuC+93], v213   // accum partials
v_add_f32 v[vgprValuC+94], v[vgprValuC+94], v214   // accum partials
v_add_f32 v[vgprValuC+95], v[vgprValuC+95], v215   // accum partials

s_waitcnt vmcnt(6)                                 // wait C (interleaved) 6 = 28 - 21 + 0 - 1
v_add_f32 v[vgprValuC+96], v[vgprValuC+96], v216   // accum partials
v_add_f32 v[vgprValuC+97], v[vgprValuC+97], v217   // accum partials
v_add_f32 v[vgprValuC+98], v[vgprValuC+98], v218   // accum partials
v_add_f32 v[vgprValuC+99], v[vgprValuC+99], v219   // accum partials

s_waitcnt vmcnt(5)                                 // wait C (interleaved) 5 = 28 - 22 + 0 - 1
v_add_f32 v[vgprValuC+100], v[vgprValuC+100], v220 // accum partials
v_add_f32 v[vgprValuC+101], v[vgprValuC+101], v221 // accum partials
v_add_f32 v[vgprValuC+102], v[vgprValuC+102], v222 // accum partials
v_add_f32 v[vgprValuC+103], v[vgprValuC+103], v223 // accum partials

s_waitcnt vmcnt(4)                                 // wait C (interleaved) 4 = 28 - 23 + 0 - 1
v_add_f32 v[vgprValuC+104], v[vgprValuC+104], v224 // accum partials
v_add_f32 v[vgprValuC+105], v[vgprValuC+105], v225 // accum partials
v_add_f32 v[vgprValuC+106], v[vgprValuC+106], v226 // accum partials
v_add_f32 v[vgprValuC+107], v[vgprValuC+107], v227 // accum partials

s_waitcnt vmcnt(3)                                 // wait C (interleaved) 3 = 28 - 24 + 0 - 1
v_add_f32 v[vgprValuC+108], v[vgprValuC+108], v228 // accum partials
v_add_f32 v[vgprValuC+109], v[vgprValuC+109], v229 // accum partials
v_add_f32 v[vgprValuC+110], v[vgprValuC+110], v230 // accum partials
v_add_f32 v[vgprValuC+111], v[vgprValuC+111], v231 // accum partials

s_waitcnt vmcnt(2)                                 // wait C (interleaved) 2 = 28 - 25 + 0 - 1
v_add_f32 v[vgprValuC+112], v[vgprValuC+112], v232 // accum partials
v_add_f32 v[vgprValuC+113], v[vgprValuC+113], v233 // accum partials
v_add_f32 v[vgprValuC+114], v[vgprValuC+114], v234 // accum partials
v_add_f32 v[vgprValuC+115], v[vgprValuC+115], v235 // accum partials

s_waitcnt vmcnt(1)                                 // wait C (interleaved) 1 = 28 - 26 + 0 - 1
v_add_f32 v[vgprValuC+116], v[vgprValuC+116], v236 // accum partials
v_add_f32 v[vgprValuC+117], v[vgprValuC+117], v237 // accum partials
v_add_f32 v[vgprValuC+118], v[vgprValuC+118], v238 // accum partials
v_add_f32 v[vgprValuC+119], v[vgprValuC+119], v239 // accum partials

s_waitcnt vmcnt(0)                                 // wait C (interleaved) 0 = 28 - 27 + 0 - 1
v_add_f32 v[vgprValuC+120], v[vgprValuC+120], v240 // accum partials
v_add_f32 v[vgprValuC+121], v[vgprValuC+121], v241 // accum partials
v_add_f32 v[vgprValuC+122], v[vgprValuC+122], v242 // accum partials
v_add_f32 v[vgprValuC+123], v[vgprValuC+123], v243 // accum partials
/* Write-back of the finished batch: copy the 112 values held in            */
/* v[vgprValuC+12 .. vgprValuC+123] into accumulation registers.            */
/* The destination acc index advances by 4 per copy and is walked in four   */
/* runs: acc0..acc124, acc1..acc125, acc2..acc126 and finally acc3..acc63.  */

/* run 1: v[vgprValuC+12..+43] -> acc0, acc4, ..., acc124 */
v_accvgpr_write_b32 acc0, v[vgprValuC+12]          // copy vreg[0] to acc
v_accvgpr_write_b32 acc4, v[vgprValuC+13]          // copy vreg[1] to acc
v_accvgpr_write_b32 acc8, v[vgprValuC+14]          // copy vreg[2] to acc
v_accvgpr_write_b32 acc12, v[vgprValuC+15]         // copy vreg[3] to acc
v_accvgpr_write_b32 acc16, v[vgprValuC+16]         // copy vreg[4] to acc
v_accvgpr_write_b32 acc20, v[vgprValuC+17]         // copy vreg[5] to acc
v_accvgpr_write_b32 acc24, v[vgprValuC+18]         // copy vreg[6] to acc
v_accvgpr_write_b32 acc28, v[vgprValuC+19]         // copy vreg[7] to acc
v_accvgpr_write_b32 acc32, v[vgprValuC+20]         // copy vreg[8] to acc
v_accvgpr_write_b32 acc36, v[vgprValuC+21]         // copy vreg[9] to acc
v_accvgpr_write_b32 acc40, v[vgprValuC+22]         // copy vreg[10] to acc
v_accvgpr_write_b32 acc44, v[vgprValuC+23]         // copy vreg[11] to acc
v_accvgpr_write_b32 acc48, v[vgprValuC+24]         // copy vreg[12] to acc
v_accvgpr_write_b32 acc52, v[vgprValuC+25]         // copy vreg[13] to acc
v_accvgpr_write_b32 acc56, v[vgprValuC+26]         // copy vreg[14] to acc
v_accvgpr_write_b32 acc60, v[vgprValuC+27]         // copy vreg[15] to acc
v_accvgpr_write_b32 acc64, v[vgprValuC+28]         // copy vreg[16] to acc
v_accvgpr_write_b32 acc68, v[vgprValuC+29]         // copy vreg[17] to acc
v_accvgpr_write_b32 acc72, v[vgprValuC+30]         // copy vreg[18] to acc
v_accvgpr_write_b32 acc76, v[vgprValuC+31]         // copy vreg[19] to acc
v_accvgpr_write_b32 acc80, v[vgprValuC+32]         // copy vreg[20] to acc
v_accvgpr_write_b32 acc84, v[vgprValuC+33]         // copy vreg[21] to acc
v_accvgpr_write_b32 acc88, v[vgprValuC+34]         // copy vreg[22] to acc
v_accvgpr_write_b32 acc92, v[vgprValuC+35]         // copy vreg[23] to acc
v_accvgpr_write_b32 acc96, v[vgprValuC+36]         // copy vreg[24] to acc
v_accvgpr_write_b32 acc100, v[vgprValuC+37]        // copy vreg[25] to acc
v_accvgpr_write_b32 acc104, v[vgprValuC+38]        // copy vreg[26] to acc
v_accvgpr_write_b32 acc108, v[vgprValuC+39]        // copy vreg[27] to acc
v_accvgpr_write_b32 acc112, v[vgprValuC+40]        // copy vreg[28] to acc
v_accvgpr_write_b32 acc116, v[vgprValuC+41]        // copy vreg[29] to acc
v_accvgpr_write_b32 acc120, v[vgprValuC+42]        // copy vreg[30] to acc
v_accvgpr_write_b32 acc124, v[vgprValuC+43]        // copy vreg[31] to acc
/* run 2: v[vgprValuC+44..+75] -> acc1, acc5, ..., acc125 */
v_accvgpr_write_b32 acc1, v[vgprValuC+44]          // copy vreg[32] to acc
v_accvgpr_write_b32 acc5, v[vgprValuC+45]          // copy vreg[33] to acc
v_accvgpr_write_b32 acc9, v[vgprValuC+46]          // copy vreg[34] to acc
v_accvgpr_write_b32 acc13, v[vgprValuC+47]         // copy vreg[35] to acc
v_accvgpr_write_b32 acc17, v[vgprValuC+48]         // copy vreg[36] to acc
v_accvgpr_write_b32 acc21, v[vgprValuC+49]         // copy vreg[37] to acc
v_accvgpr_write_b32 acc25, v[vgprValuC+50]         // copy vreg[38] to acc
v_accvgpr_write_b32 acc29, v[vgprValuC+51]         // copy vreg[39] to acc
v_accvgpr_write_b32 acc33, v[vgprValuC+52]         // copy vreg[40] to acc
v_accvgpr_write_b32 acc37, v[vgprValuC+53]         // copy vreg[41] to acc
v_accvgpr_write_b32 acc41, v[vgprValuC+54]         // copy vreg[42] to acc
v_accvgpr_write_b32 acc45, v[vgprValuC+55]         // copy vreg[43] to acc
v_accvgpr_write_b32 acc49, v[vgprValuC+56]         // copy vreg[44] to acc
v_accvgpr_write_b32 acc53, v[vgprValuC+57]         // copy vreg[45] to acc
v_accvgpr_write_b32 acc57, v[vgprValuC+58]         // copy vreg[46] to acc
v_accvgpr_write_b32 acc61, v[vgprValuC+59]         // copy vreg[47] to acc
v_accvgpr_write_b32 acc65, v[vgprValuC+60]         // copy vreg[48] to acc
v_accvgpr_write_b32 acc69, v[vgprValuC+61]         // copy vreg[49] to acc
v_accvgpr_write_b32 acc73, v[vgprValuC+62]         // copy vreg[50] to acc
v_accvgpr_write_b32 acc77, v[vgprValuC+63]         // copy vreg[51] to acc
v_accvgpr_write_b32 acc81, v[vgprValuC+64]         // copy vreg[52] to acc
v_accvgpr_write_b32 acc85, v[vgprValuC+65]         // copy vreg[53] to acc
v_accvgpr_write_b32 acc89, v[vgprValuC+66]         // copy vreg[54] to acc
v_accvgpr_write_b32 acc93, v[vgprValuC+67]         // copy vreg[55] to acc
v_accvgpr_write_b32 acc97, v[vgprValuC+68]         // copy vreg[56] to acc
v_accvgpr_write_b32 acc101, v[vgprValuC+69]        // copy vreg[57] to acc
v_accvgpr_write_b32 acc105, v[vgprValuC+70]        // copy vreg[58] to acc
v_accvgpr_write_b32 acc109, v[vgprValuC+71]        // copy vreg[59] to acc
v_accvgpr_write_b32 acc113, v[vgprValuC+72]        // copy vreg[60] to acc
v_accvgpr_write_b32 acc117, v[vgprValuC+73]        // copy vreg[61] to acc
v_accvgpr_write_b32 acc121, v[vgprValuC+74]        // copy vreg[62] to acc
v_accvgpr_write_b32 acc125, v[vgprValuC+75]        // copy vreg[63] to acc
/* run 3: v[vgprValuC+76..+107] -> acc2, acc6, ..., acc126 */
v_accvgpr_write_b32 acc2, v[vgprValuC+76]          // copy vreg[64] to acc
v_accvgpr_write_b32 acc6, v[vgprValuC+77]          // copy vreg[65] to acc
v_accvgpr_write_b32 acc10, v[vgprValuC+78]         // copy vreg[66] to acc
v_accvgpr_write_b32 acc14, v[vgprValuC+79]         // copy vreg[67] to acc
v_accvgpr_write_b32 acc18, v[vgprValuC+80]         // copy vreg[68] to acc
v_accvgpr_write_b32 acc22, v[vgprValuC+81]         // copy vreg[69] to acc
v_accvgpr_write_b32 acc26, v[vgprValuC+82]         // copy vreg[70] to acc
v_accvgpr_write_b32 acc30, v[vgprValuC+83]         // copy vreg[71] to acc
v_accvgpr_write_b32 acc34, v[vgprValuC+84]         // copy vreg[72] to acc
v_accvgpr_write_b32 acc38, v[vgprValuC+85]         // copy vreg[73] to acc
v_accvgpr_write_b32 acc42, v[vgprValuC+86]         // copy vreg[74] to acc
v_accvgpr_write_b32 acc46, v[vgprValuC+87]         // copy vreg[75] to acc
v_accvgpr_write_b32 acc50, v[vgprValuC+88]         // copy vreg[76] to acc
v_accvgpr_write_b32 acc54, v[vgprValuC+89]         // copy vreg[77] to acc
v_accvgpr_write_b32 acc58, v[vgprValuC+90]         // copy vreg[78] to acc
v_accvgpr_write_b32 acc62, v[vgprValuC+91]         // copy vreg[79] to acc
v_accvgpr_write_b32 acc66, v[vgprValuC+92]         // copy vreg[80] to acc
v_accvgpr_write_b32 acc70, v[vgprValuC+93]         // copy vreg[81] to acc
v_accvgpr_write_b32 acc74, v[vgprValuC+94]         // copy vreg[82] to acc
v_accvgpr_write_b32 acc78, v[vgprValuC+95]         // copy vreg[83] to acc
v_accvgpr_write_b32 acc82, v[vgprValuC+96]         // copy vreg[84] to acc
v_accvgpr_write_b32 acc86, v[vgprValuC+97]         // copy vreg[85] to acc
v_accvgpr_write_b32 acc90, v[vgprValuC+98]         // copy vreg[86] to acc
v_accvgpr_write_b32 acc94, v[vgprValuC+99]         // copy vreg[87] to acc
v_accvgpr_write_b32 acc98, v[vgprValuC+100]        // copy vreg[88] to acc
v_accvgpr_write_b32 acc102, v[vgprValuC+101]       // copy vreg[89] to acc
v_accvgpr_write_b32 acc106, v[vgprValuC+102]       // copy vreg[90] to acc
v_accvgpr_write_b32 acc110, v[vgprValuC+103]       // copy vreg[91] to acc
v_accvgpr_write_b32 acc114, v[vgprValuC+104]       // copy vreg[92] to acc
v_accvgpr_write_b32 acc118, v[vgprValuC+105]       // copy vreg[93] to acc
v_accvgpr_write_b32 acc122, v[vgprValuC+106]       // copy vreg[94] to acc
v_accvgpr_write_b32 acc126, v[vgprValuC+107]       // copy vreg[95] to acc
/* run 4 (16 copies only): v[vgprValuC+108..+123] -> acc3, acc7, ..., acc63 */
v_accvgpr_write_b32 acc3, v[vgprValuC+108]         // copy vreg[96] to acc
v_accvgpr_write_b32 acc7, v[vgprValuC+109]         // copy vreg[97] to acc
v_accvgpr_write_b32 acc11, v[vgprValuC+110]        // copy vreg[98] to acc
v_accvgpr_write_b32 acc15, v[vgprValuC+111]        // copy vreg[99] to acc
v_accvgpr_write_b32 acc19, v[vgprValuC+112]        // copy vreg[100] to acc
v_accvgpr_write_b32 acc23, v[vgprValuC+113]        // copy vreg[101] to acc
v_accvgpr_write_b32 acc27, v[vgprValuC+114]        // copy vreg[102] to acc
v_accvgpr_write_b32 acc31, v[vgprValuC+115]        // copy vreg[103] to acc
v_accvgpr_write_b32 acc35, v[vgprValuC+116]        // copy vreg[104] to acc
v_accvgpr_write_b32 acc39, v[vgprValuC+117]        // copy vreg[105] to acc
v_accvgpr_write_b32 acc43, v[vgprValuC+118]        // copy vreg[106] to acc
v_accvgpr_write_b32 acc47, v[vgprValuC+119]        // copy vreg[107] to acc
v_accvgpr_write_b32 acc51, v[vgprValuC+120]        // copy vreg[108] to acc
v_accvgpr_write_b32 acc55, v[vgprValuC+121]        // copy vreg[109] to acc
v_accvgpr_write_b32 acc59, v[vgprValuC+122]        // copy vreg[110] to acc
v_accvgpr_write_b32 acc63, v[vgprValuC+123]        // copy vreg[111] to acc
/* Generator-emitted padding before the following instructions.             */
/* NOTE(review): the instructions immediately before these nops are         */
/* v_accvgpr_write_b32 (acc writes), not VGPR writes or a dwordx4 store, so */
/* the wording of the two comments below does not obviously match this spot */
/* -- confirm which hazard each nop is meant to cover before removing any.  */
s_nop 1                                            // 2 wait states required before reading vgpr
s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */

/******************************************/
/* Fixup Batch #1 (d1,d0,vc1,vc0) =       */
/*      (0,0,14,0:vw4); (0,1,14,0:vw4); (0,0,15,0:vw4); (0,1,15,0:vw4); (1,0,0,0:vw4); (1,1,0,0:vw4); (1,0,1,0:vw4); (1,1,1,0:vw4); (1,0,2,0:vw4); (1,1,2,0:vw4); (1,0,3,0:vw4); (1,1,3,0:vw4); (1,0,4,0:vw4); (1,1,4,0:vw4); (1,0,5,0:vw4); (1,1,5,0:vw4); (1,0,6,0:vw4); (1,1,6,0:vw4); (1,0,7,0:vw4); (1,1,7,0:vw4); (1,0,8,0:vw4); (1,1,8,0:vw4); (1,0,9,0:vw4); (1,1,9,0:vw4); (1,0,10,0:vw4); (1,1,10,0:vw4); (1,0,11,0:vw4); (1,1,11,0:vw4) */
/******************************************/

/* calc coords, apply mask, and issue loads (if necessary) */
/* Issues the 28 "WS" loads of this batch back to back, one dwordx4 per     */
/* batch element. Every step first bumps the scalar offset s74 by 4096 and  */
/* then loads through the sgprSrdWS resource descriptor with v10 supplying  */
/* the VGPR offset (offen) and s74 the scalar offset.                       */
/* Destinations are consecutive VGPR quads: v[124:127] .. v[140:143], then  */
/* v[152:155] .. v[240:243]; v144-v151 are not written by this run.         */
/* There is no s_waitcnt inside this run, so none of the loaded registers   */
/* may be read until the vmcnt waits of the accumulate phase that follows.  */
/* NOTE(review): "WS" presumably refers to the partial-tile workspace       */
/* buffer -- confirm against the kernel argument setup.                     */
s_add_u32 s74, s74, 4096                           // Inc sgpr offset
buffer_load_dwordx4 v[124:127], v10, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS
s_add_u32 s74, s74, 4096                           // Inc sgpr offset
buffer_load_dwordx4 v[128:131], v10, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS
s_add_u32 s74, s74, 4096                           // Inc sgpr offset
buffer_load_dwordx4 v[132:135], v10, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS
s_add_u32 s74, s74, 4096                           // Inc sgpr offset
buffer_load_dwordx4 v[136:139], v10, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS
s_add_u32 s74, s74, 4096                           // Inc sgpr offset
buffer_load_dwordx4 v[140:143], v10, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS
/* destination numbering jumps from v143 to v152 here */
s_add_u32 s74, s74, 4096                           // Inc sgpr offset
buffer_load_dwordx4 v[152:155], v10, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS
s_add_u32 s74, s74, 4096                           // Inc sgpr offset
buffer_load_dwordx4 v[156:159], v10, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS
s_add_u32 s74, s74, 4096                           // Inc sgpr offset
buffer_load_dwordx4 v[160:163], v10, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS
s_add_u32 s74, s74, 4096                           // Inc sgpr offset
buffer_load_dwordx4 v[164:167], v10, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS
s_add_u32 s74, s74, 4096                           // Inc sgpr offset
buffer_load_dwordx4 v[168:171], v10, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS
s_add_u32 s74, s74, 4096                           // Inc sgpr offset
buffer_load_dwordx4 v[172:175], v10, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS
s_add_u32 s74, s74, 4096                           // Inc sgpr offset
buffer_load_dwordx4 v[176:179], v10, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS
s_add_u32 s74, s74, 4096                           // Inc sgpr offset
buffer_load_dwordx4 v[180:183], v10, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS
s_add_u32 s74, s74, 4096                           // Inc sgpr offset
buffer_load_dwordx4 v[184:187], v10, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS
s_add_u32 s74, s74, 4096                           // Inc sgpr offset
buffer_load_dwordx4 v[188:191], v10, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS
s_add_u32 s74, s74, 4096                           // Inc sgpr offset
buffer_load_dwordx4 v[192:195], v10, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS
s_add_u32 s74, s74, 4096                           // Inc sgpr offset
buffer_load_dwordx4 v[196:199], v10, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS
s_add_u32 s74, s74, 4096                           // Inc sgpr offset
buffer_load_dwordx4 v[200:203], v10, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS
s_add_u32 s74, s74, 4096                           // Inc sgpr offset
buffer_load_dwordx4 v[204:207], v10, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS
s_add_u32 s74, s74, 4096                           // Inc sgpr offset
buffer_load_dwordx4 v[208:211], v10, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS
s_add_u32 s74, s74, 4096                           // Inc sgpr offset
buffer_load_dwordx4 v[212:215], v10, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS
s_add_u32 s74, s74, 4096                           // Inc sgpr offset
buffer_load_dwordx4 v[216:219], v10, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS
s_add_u32 s74, s74, 4096                           // Inc sgpr offset
buffer_load_dwordx4 v[220:223], v10, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS
s_add_u32 s74, s74, 4096                           // Inc sgpr offset
buffer_load_dwordx4 v[224:227], v10, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS
s_add_u32 s74, s74, 4096                           // Inc sgpr offset
buffer_load_dwordx4 v[228:231], v10, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS
s_add_u32 s74, s74, 4096                           // Inc sgpr offset
buffer_load_dwordx4 v[232:235], v10, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS
s_add_u32 s74, s74, 4096                           // Inc sgpr offset
buffer_load_dwordx4 v[236:239], v10, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS
s_add_u32 s74, s74, 4096                           // Inc sgpr offset
buffer_load_dwordx4 v[240:243], v10, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS
/* Fetch this batch's 112 accumulator values into                           */
/* v[vgprValuC+12 .. vgprValuC+123] so they can be summed with the loaded   */
/* partials. The source acc index advances by 4 per copy and is walked in   */
/* four runs: acc67..acc127, acc128..acc252, acc129..acc253 and             */
/* acc130..acc254. These copies are placed after the buffer loads have been */
/* issued and before the first vmcnt wait.                                  */

/* run 1 (16 copies): acc67, acc71, ..., acc127 -> v[vgprValuC+12..+27] */
v_accvgpr_read_b32 v[vgprValuC+12], acc67          // copy acc to vreg[112]
v_accvgpr_read_b32 v[vgprValuC+13], acc71          // copy acc to vreg[113]
v_accvgpr_read_b32 v[vgprValuC+14], acc75          // copy acc to vreg[114]
v_accvgpr_read_b32 v[vgprValuC+15], acc79          // copy acc to vreg[115]
v_accvgpr_read_b32 v[vgprValuC+16], acc83          // copy acc to vreg[116]
v_accvgpr_read_b32 v[vgprValuC+17], acc87          // copy acc to vreg[117]
v_accvgpr_read_b32 v[vgprValuC+18], acc91          // copy acc to vreg[118]
v_accvgpr_read_b32 v[vgprValuC+19], acc95          // copy acc to vreg[119]
v_accvgpr_read_b32 v[vgprValuC+20], acc99          // copy acc to vreg[120]
v_accvgpr_read_b32 v[vgprValuC+21], acc103         // copy acc to vreg[121]
v_accvgpr_read_b32 v[vgprValuC+22], acc107         // copy acc to vreg[122]
v_accvgpr_read_b32 v[vgprValuC+23], acc111         // copy acc to vreg[123]
v_accvgpr_read_b32 v[vgprValuC+24], acc115         // copy acc to vreg[124]
v_accvgpr_read_b32 v[vgprValuC+25], acc119         // copy acc to vreg[125]
v_accvgpr_read_b32 v[vgprValuC+26], acc123         // copy acc to vreg[126]
v_accvgpr_read_b32 v[vgprValuC+27], acc127         // copy acc to vreg[127]
/* run 2: acc128, acc132, ..., acc252 -> v[vgprValuC+28..+59] */
v_accvgpr_read_b32 v[vgprValuC+28], acc128         // copy acc to vreg[128]
v_accvgpr_read_b32 v[vgprValuC+29], acc132         // copy acc to vreg[129]
v_accvgpr_read_b32 v[vgprValuC+30], acc136         // copy acc to vreg[130]
v_accvgpr_read_b32 v[vgprValuC+31], acc140         // copy acc to vreg[131]
v_accvgpr_read_b32 v[vgprValuC+32], acc144         // copy acc to vreg[132]
v_accvgpr_read_b32 v[vgprValuC+33], acc148         // copy acc to vreg[133]
v_accvgpr_read_b32 v[vgprValuC+34], acc152         // copy acc to vreg[134]
v_accvgpr_read_b32 v[vgprValuC+35], acc156         // copy acc to vreg[135]
v_accvgpr_read_b32 v[vgprValuC+36], acc160         // copy acc to vreg[136]
v_accvgpr_read_b32 v[vgprValuC+37], acc164         // copy acc to vreg[137]
v_accvgpr_read_b32 v[vgprValuC+38], acc168         // copy acc to vreg[138]
v_accvgpr_read_b32 v[vgprValuC+39], acc172         // copy acc to vreg[139]
v_accvgpr_read_b32 v[vgprValuC+40], acc176         // copy acc to vreg[140]
v_accvgpr_read_b32 v[vgprValuC+41], acc180         // copy acc to vreg[141]
v_accvgpr_read_b32 v[vgprValuC+42], acc184         // copy acc to vreg[142]
v_accvgpr_read_b32 v[vgprValuC+43], acc188         // copy acc to vreg[143]
v_accvgpr_read_b32 v[vgprValuC+44], acc192         // copy acc to vreg[144]
v_accvgpr_read_b32 v[vgprValuC+45], acc196         // copy acc to vreg[145]
v_accvgpr_read_b32 v[vgprValuC+46], acc200         // copy acc to vreg[146]
v_accvgpr_read_b32 v[vgprValuC+47], acc204         // copy acc to vreg[147]
v_accvgpr_read_b32 v[vgprValuC+48], acc208         // copy acc to vreg[148]
v_accvgpr_read_b32 v[vgprValuC+49], acc212         // copy acc to vreg[149]
v_accvgpr_read_b32 v[vgprValuC+50], acc216         // copy acc to vreg[150]
v_accvgpr_read_b32 v[vgprValuC+51], acc220         // copy acc to vreg[151]
v_accvgpr_read_b32 v[vgprValuC+52], acc224         // copy acc to vreg[152]
v_accvgpr_read_b32 v[vgprValuC+53], acc228         // copy acc to vreg[153]
v_accvgpr_read_b32 v[vgprValuC+54], acc232         // copy acc to vreg[154]
v_accvgpr_read_b32 v[vgprValuC+55], acc236         // copy acc to vreg[155]
v_accvgpr_read_b32 v[vgprValuC+56], acc240         // copy acc to vreg[156]
v_accvgpr_read_b32 v[vgprValuC+57], acc244         // copy acc to vreg[157]
v_accvgpr_read_b32 v[vgprValuC+58], acc248         // copy acc to vreg[158]
v_accvgpr_read_b32 v[vgprValuC+59], acc252         // copy acc to vreg[159]
/* run 3: acc129, acc133, ..., acc253 -> v[vgprValuC+60..+91] */
v_accvgpr_read_b32 v[vgprValuC+60], acc129         // copy acc to vreg[160]
v_accvgpr_read_b32 v[vgprValuC+61], acc133         // copy acc to vreg[161]
v_accvgpr_read_b32 v[vgprValuC+62], acc137         // copy acc to vreg[162]
v_accvgpr_read_b32 v[vgprValuC+63], acc141         // copy acc to vreg[163]
v_accvgpr_read_b32 v[vgprValuC+64], acc145         // copy acc to vreg[164]
v_accvgpr_read_b32 v[vgprValuC+65], acc149         // copy acc to vreg[165]
v_accvgpr_read_b32 v[vgprValuC+66], acc153         // copy acc to vreg[166]
v_accvgpr_read_b32 v[vgprValuC+67], acc157         // copy acc to vreg[167]
v_accvgpr_read_b32 v[vgprValuC+68], acc161         // copy acc to vreg[168]
v_accvgpr_read_b32 v[vgprValuC+69], acc165         // copy acc to vreg[169]
v_accvgpr_read_b32 v[vgprValuC+70], acc169         // copy acc to vreg[170]
v_accvgpr_read_b32 v[vgprValuC+71], acc173         // copy acc to vreg[171]
v_accvgpr_read_b32 v[vgprValuC+72], acc177         // copy acc to vreg[172]
v_accvgpr_read_b32 v[vgprValuC+73], acc181         // copy acc to vreg[173]
v_accvgpr_read_b32 v[vgprValuC+74], acc185         // copy acc to vreg[174]
v_accvgpr_read_b32 v[vgprValuC+75], acc189         // copy acc to vreg[175]
v_accvgpr_read_b32 v[vgprValuC+76], acc193         // copy acc to vreg[176]
v_accvgpr_read_b32 v[vgprValuC+77], acc197         // copy acc to vreg[177]
v_accvgpr_read_b32 v[vgprValuC+78], acc201         // copy acc to vreg[178]
v_accvgpr_read_b32 v[vgprValuC+79], acc205         // copy acc to vreg[179]
v_accvgpr_read_b32 v[vgprValuC+80], acc209         // copy acc to vreg[180]
v_accvgpr_read_b32 v[vgprValuC+81], acc213         // copy acc to vreg[181]
v_accvgpr_read_b32 v[vgprValuC+82], acc217         // copy acc to vreg[182]
v_accvgpr_read_b32 v[vgprValuC+83], acc221         // copy acc to vreg[183]
v_accvgpr_read_b32 v[vgprValuC+84], acc225         // copy acc to vreg[184]
v_accvgpr_read_b32 v[vgprValuC+85], acc229         // copy acc to vreg[185]
v_accvgpr_read_b32 v[vgprValuC+86], acc233         // copy acc to vreg[186]
v_accvgpr_read_b32 v[vgprValuC+87], acc237         // copy acc to vreg[187]
v_accvgpr_read_b32 v[vgprValuC+88], acc241         // copy acc to vreg[188]
v_accvgpr_read_b32 v[vgprValuC+89], acc245         // copy acc to vreg[189]
v_accvgpr_read_b32 v[vgprValuC+90], acc249         // copy acc to vreg[190]
v_accvgpr_read_b32 v[vgprValuC+91], acc253         // copy acc to vreg[191]
/* run 4: acc130, acc134, ..., acc254 -> v[vgprValuC+92..+123] */
v_accvgpr_read_b32 v[vgprValuC+92], acc130         // copy acc to vreg[192]
v_accvgpr_read_b32 v[vgprValuC+93], acc134         // copy acc to vreg[193]
v_accvgpr_read_b32 v[vgprValuC+94], acc138         // copy acc to vreg[194]
v_accvgpr_read_b32 v[vgprValuC+95], acc142         // copy acc to vreg[195]
v_accvgpr_read_b32 v[vgprValuC+96], acc146         // copy acc to vreg[196]
v_accvgpr_read_b32 v[vgprValuC+97], acc150         // copy acc to vreg[197]
v_accvgpr_read_b32 v[vgprValuC+98], acc154         // copy acc to vreg[198]
v_accvgpr_read_b32 v[vgprValuC+99], acc158         // copy acc to vreg[199]
v_accvgpr_read_b32 v[vgprValuC+100], acc162        // copy acc to vreg[200]
v_accvgpr_read_b32 v[vgprValuC+101], acc166        // copy acc to vreg[201]
v_accvgpr_read_b32 v[vgprValuC+102], acc170        // copy acc to vreg[202]
v_accvgpr_read_b32 v[vgprValuC+103], acc174        // copy acc to vreg[203]
v_accvgpr_read_b32 v[vgprValuC+104], acc178        // copy acc to vreg[204]
v_accvgpr_read_b32 v[vgprValuC+105], acc182        // copy acc to vreg[205]
v_accvgpr_read_b32 v[vgprValuC+106], acc186        // copy acc to vreg[206]
v_accvgpr_read_b32 v[vgprValuC+107], acc190        // copy acc to vreg[207]
v_accvgpr_read_b32 v[vgprValuC+108], acc194        // copy acc to vreg[208]
v_accvgpr_read_b32 v[vgprValuC+109], acc198        // copy acc to vreg[209]
v_accvgpr_read_b32 v[vgprValuC+110], acc202        // copy acc to vreg[210]
v_accvgpr_read_b32 v[vgprValuC+111], acc206        // copy acc to vreg[211]
v_accvgpr_read_b32 v[vgprValuC+112], acc210        // copy acc to vreg[212]
v_accvgpr_read_b32 v[vgprValuC+113], acc214        // copy acc to vreg[213]
v_accvgpr_read_b32 v[vgprValuC+114], acc218        // copy acc to vreg[214]
v_accvgpr_read_b32 v[vgprValuC+115], acc222        // copy acc to vreg[215]
v_accvgpr_read_b32 v[vgprValuC+116], acc226        // copy acc to vreg[216]
v_accvgpr_read_b32 v[vgprValuC+117], acc230        // copy acc to vreg[217]
v_accvgpr_read_b32 v[vgprValuC+118], acc234        // copy acc to vreg[218]
v_accvgpr_read_b32 v[vgprValuC+119], acc238        // copy acc to vreg[219]
v_accvgpr_read_b32 v[vgprValuC+120], acc242        // copy acc to vreg[220]
v_accvgpr_read_b32 v[vgprValuC+121], acc246        // copy acc to vreg[221]
v_accvgpr_read_b32 v[vgprValuC+122], acc250        // copy acc to vreg[222]
v_accvgpr_read_b32 v[vgprValuC+123], acc254        // copy acc to vreg[223]
/* padding between the last acc->vgpr copy and the v_add_f32 chain that reads these VGPRs */
s_nop 1                                            // 2 wait states required before reading vgpr

/* apply mask, calc new C and issue writes */

s_waitcnt vmcnt(27)                                // wait C (interleaved) 27 = 28 - 0 + 0 - 1
v_add_f32 v[vgprValuC+12], v[vgprValuC+12], v124   // accum partials
v_add_f32 v[vgprValuC+13], v[vgprValuC+13], v125   // accum partials
v_add_f32 v[vgprValuC+14], v[vgprValuC+14], v126   // accum partials
v_add_f32 v[vgprValuC+15], v[vgprValuC+15], v127   // accum partials

s_waitcnt vmcnt(26)                                // wait C (interleaved) 26 = 28 - 1 + 0 - 1
v_add_f32 v[vgprValuC+16], v[vgprValuC+16], v128   // accum partials
v_add_f32 v[vgprValuC+17], v[vgprValuC+17], v129   // accum partials
v_add_f32 v[vgprValuC+18], v[vgprValuC+18], v130   // accum partials
v_add_f32 v[vgprValuC+19], v[vgprValuC+19], v131   // accum partials

s_waitcnt vmcnt(25)                                // wait C (interleaved) 25 = 28 - 2 + 0 - 1
v_add_f32 v[vgprValuC+20], v[vgprValuC+20], v132   // accum partials
v_add_f32 v[vgprValuC+21], v[vgprValuC+21], v133   // accum partials
v_add_f32 v[vgprValuC+22], v[vgprValuC+22], v134   // accum partials
v_add_f32 v[vgprValuC+23], v[vgprValuC+23], v135   // accum partials

s_waitcnt vmcnt(24)                                // wait C (interleaved) 24 = 28 - 3 + 0 - 1
v_add_f32 v[vgprValuC+24], v[vgprValuC+24], v136   // accum partials
v_add_f32 v[vgprValuC+25], v[vgprValuC+25], v137   // accum partials
v_add_f32 v[vgprValuC+26], v[vgprValuC+26], v138   // accum partials
v_add_f32 v[vgprValuC+27], v[vgprValuC+27], v139   // accum partials

s_waitcnt vmcnt(23)                                // wait C (interleaved) 23 = 28 - 4 + 0 - 1
v_add_f32 v[vgprValuC+28], v[vgprValuC+28], v140   // accum partials
v_add_f32 v[vgprValuC+29], v[vgprValuC+29], v141   // accum partials
v_add_f32 v[vgprValuC+30], v[vgprValuC+30], v142   // accum partials
v_add_f32 v[vgprValuC+31], v[vgprValuC+31], v143   // accum partials

s_waitcnt vmcnt(22)                                // wait C (interleaved) 22 = 28 - 5 + 0 - 1
v_add_f32 v[vgprValuC+32], v[vgprValuC+32], v152   // accum partials
v_add_f32 v[vgprValuC+33], v[vgprValuC+33], v153   // accum partials
v_add_f32 v[vgprValuC+34], v[vgprValuC+34], v154   // accum partials
v_add_f32 v[vgprValuC+35], v[vgprValuC+35], v155   // accum partials

s_waitcnt vmcnt(21)                                // wait C (interleaved) 21 = 28 - 6 + 0 - 1
v_add_f32 v[vgprValuC+36], v[vgprValuC+36], v156   // accum partials
v_add_f32 v[vgprValuC+37], v[vgprValuC+37], v157   // accum partials
v_add_f32 v[vgprValuC+38], v[vgprValuC+38], v158   // accum partials
v_add_f32 v[vgprValuC+39], v[vgprValuC+39], v159   // accum partials

s_waitcnt vmcnt(20)                                // wait C (interleaved) 20 = 28 - 7 + 0 - 1
v_add_f32 v[vgprValuC+40], v[vgprValuC+40], v160   // accum partials
v_add_f32 v[vgprValuC+41], v[vgprValuC+41], v161   // accum partials
v_add_f32 v[vgprValuC+42], v[vgprValuC+42], v162   // accum partials
v_add_f32 v[vgprValuC+43], v[vgprValuC+43], v163   // accum partials

s_waitcnt vmcnt(19)                                // wait C (interleaved) 19 = 28 - 8 + 0 - 1
v_add_f32 v[vgprValuC+44], v[vgprValuC+44], v164   // accum partials
v_add_f32 v[vgprValuC+45], v[vgprValuC+45], v165   // accum partials
v_add_f32 v[vgprValuC+46], v[vgprValuC+46], v166   // accum partials
v_add_f32 v[vgprValuC+47], v[vgprValuC+47], v167   // accum partials

s_waitcnt vmcnt(18)                                // wait C (interleaved) 18 = 28 - 9 + 0 - 1
v_add_f32 v[vgprValuC+48], v[vgprValuC+48], v168   // accum partials
v_add_f32 v[vgprValuC+49], v[vgprValuC+49], v169   // accum partials
v_add_f32 v[vgprValuC+50], v[vgprValuC+50], v170   // accum partials
v_add_f32 v[vgprValuC+51], v[vgprValuC+51], v171   // accum partials

s_waitcnt vmcnt(17)                                // wait C (interleaved) 17 = 28 - 10 + 0 - 1
v_add_f32 v[vgprValuC+52], v[vgprValuC+52], v172   // accum partials
v_add_f32 v[vgprValuC+53], v[vgprValuC+53], v173   // accum partials
v_add_f32 v[vgprValuC+54], v[vgprValuC+54], v174   // accum partials
v_add_f32 v[vgprValuC+55], v[vgprValuC+55], v175   // accum partials

s_waitcnt vmcnt(16)                                // wait C (interleaved) 16 = 28 - 11 + 0 - 1
v_add_f32 v[vgprValuC+56], v[vgprValuC+56], v176   // accum partials
v_add_f32 v[vgprValuC+57], v[vgprValuC+57], v177   // accum partials
v_add_f32 v[vgprValuC+58], v[vgprValuC+58], v178   // accum partials
v_add_f32 v[vgprValuC+59], v[vgprValuC+59], v179   // accum partials

s_waitcnt vmcnt(15)                                // wait C (interleaved) 15 = 28 - 12 + 0 - 1
v_add_f32 v[vgprValuC+60], v[vgprValuC+60], v180   // accum partials
v_add_f32 v[vgprValuC+61], v[vgprValuC+61], v181   // accum partials
v_add_f32 v[vgprValuC+62], v[vgprValuC+62], v182   // accum partials
v_add_f32 v[vgprValuC+63], v[vgprValuC+63], v183   // accum partials

s_waitcnt vmcnt(14)                                // wait C (interleaved) 14 = 28 - 13 + 0 - 1
v_add_f32 v[vgprValuC+64], v[vgprValuC+64], v184   // accum partials
v_add_f32 v[vgprValuC+65], v[vgprValuC+65], v185   // accum partials
v_add_f32 v[vgprValuC+66], v[vgprValuC+66], v186   // accum partials
v_add_f32 v[vgprValuC+67], v[vgprValuC+67], v187   // accum partials

s_waitcnt vmcnt(13)                                // wait C (interleaved) 13 = 28 - 14 + 0 - 1
v_add_f32 v[vgprValuC+68], v[vgprValuC+68], v188   // accum partials
v_add_f32 v[vgprValuC+69], v[vgprValuC+69], v189   // accum partials
v_add_f32 v[vgprValuC+70], v[vgprValuC+70], v190   // accum partials
v_add_f32 v[vgprValuC+71], v[vgprValuC+71], v191   // accum partials

s_waitcnt vmcnt(12)                                // wait C (interleaved) 12 = 28 - 15 + 0 - 1
v_add_f32 v[vgprValuC+72], v[vgprValuC+72], v192   // accum partials
v_add_f32 v[vgprValuC+73], v[vgprValuC+73], v193   // accum partials
v_add_f32 v[vgprValuC+74], v[vgprValuC+74], v194   // accum partials
v_add_f32 v[vgprValuC+75], v[vgprValuC+75], v195   // accum partials

s_waitcnt vmcnt(11)                                // wait C (interleaved) 11 = 28 - 16 + 0 - 1
v_add_f32 v[vgprValuC+76], v[vgprValuC+76], v196   // accum partials
v_add_f32 v[vgprValuC+77], v[vgprValuC+77], v197   // accum partials
v_add_f32 v[vgprValuC+78], v[vgprValuC+78], v198   // accum partials
v_add_f32 v[vgprValuC+79], v[vgprValuC+79], v199   // accum partials

s_waitcnt vmcnt(10)                                // wait C (interleaved) 10 = 28 - 17 + 0 - 1
v_add_f32 v[vgprValuC+80], v[vgprValuC+80], v200   // accum partials
v_add_f32 v[vgprValuC+81], v[vgprValuC+81], v201   // accum partials
v_add_f32 v[vgprValuC+82], v[vgprValuC+82], v202   // accum partials
v_add_f32 v[vgprValuC+83], v[vgprValuC+83], v203   // accum partials

s_waitcnt vmcnt(9)                                 // wait C (interleaved) 9 = 28 - 18 + 0 - 1
v_add_f32 v[vgprValuC+84], v[vgprValuC+84], v204   // accum partials
v_add_f32 v[vgprValuC+85], v[vgprValuC+85], v205   // accum partials
v_add_f32 v[vgprValuC+86], v[vgprValuC+86], v206   // accum partials
v_add_f32 v[vgprValuC+87], v[vgprValuC+87], v207   // accum partials

s_waitcnt vmcnt(8)                                 // wait C (interleaved) 8 = 28 - 19 + 0 - 1
v_add_f32 v[vgprValuC+88], v[vgprValuC+88], v208   // accum partials
v_add_f32 v[vgprValuC+89], v[vgprValuC+89], v209   // accum partials
v_add_f32 v[vgprValuC+90], v[vgprValuC+90], v210   // accum partials
v_add_f32 v[vgprValuC+91], v[vgprValuC+91], v211   // accum partials

s_waitcnt vmcnt(7)                                 // wait C (interleaved) 7 = 28 - 20 + 0 - 1
v_add_f32 v[vgprValuC+92], v[vgprValuC+92], v212   // accum partials
v_add_f32 v[vgprValuC+93], v[vgprValuC+93], v213   // accum partials
v_add_f32 v[vgprValuC+94], v[vgprValuC+94], v214   // accum partials
v_add_f32 v[vgprValuC+95], v[vgprValuC+95], v215   // accum partials

s_waitcnt vmcnt(6)                                 // wait C (interleaved) 6 = 28 - 21 + 0 - 1
v_add_f32 v[vgprValuC+96], v[vgprValuC+96], v216   // accum partials
v_add_f32 v[vgprValuC+97], v[vgprValuC+97], v217   // accum partials
v_add_f32 v[vgprValuC+98], v[vgprValuC+98], v218   // accum partials
v_add_f32 v[vgprValuC+99], v[vgprValuC+99], v219   // accum partials

s_waitcnt vmcnt(5)                                 // wait C (interleaved) 5 = 28 - 22 + 0 - 1
v_add_f32 v[vgprValuC+100], v[vgprValuC+100], v220 // accum partials
v_add_f32 v[vgprValuC+101], v[vgprValuC+101], v221 // accum partials
v_add_f32 v[vgprValuC+102], v[vgprValuC+102], v222 // accum partials
v_add_f32 v[vgprValuC+103], v[vgprValuC+103], v223 // accum partials

s_waitcnt vmcnt(4)                                 // wait C (interleaved) 4 = 28 - 23 + 0 - 1
v_add_f32 v[vgprValuC+104], v[vgprValuC+104], v224 // accum partials
v_add_f32 v[vgprValuC+105], v[vgprValuC+105], v225 // accum partials
v_add_f32 v[vgprValuC+106], v[vgprValuC+106], v226 // accum partials
v_add_f32 v[vgprValuC+107], v[vgprValuC+107], v227 // accum partials

s_waitcnt vmcnt(3)                                 // wait C (interleaved) 3 = 28 - 24 + 0 - 1
v_add_f32 v[vgprValuC+108], v[vgprValuC+108], v228 // accum partials
v_add_f32 v[vgprValuC+109], v[vgprValuC+109], v229 // accum partials
v_add_f32 v[vgprValuC+110], v[vgprValuC+110], v230 // accum partials
v_add_f32 v[vgprValuC+111], v[vgprValuC+111], v231 // accum partials

s_waitcnt vmcnt(2)                                 // wait C (interleaved) 2 = 28 - 25 + 0 - 1
v_add_f32 v[vgprValuC+112], v[vgprValuC+112], v232 // accum partials
v_add_f32 v[vgprValuC+113], v[vgprValuC+113], v233 // accum partials
v_add_f32 v[vgprValuC+114], v[vgprValuC+114], v234 // accum partials
v_add_f32 v[vgprValuC+115], v[vgprValuC+115], v235 // accum partials

s_waitcnt vmcnt(1)                                 // wait C (interleaved) 1 = 28 - 26 + 0 - 1
v_add_f32 v[vgprValuC+116], v[vgprValuC+116], v236 // accum partials
v_add_f32 v[vgprValuC+117], v[vgprValuC+117], v237 // accum partials
v_add_f32 v[vgprValuC+118], v[vgprValuC+118], v238 // accum partials
v_add_f32 v[vgprValuC+119], v[vgprValuC+119], v239 // accum partials

s_waitcnt vmcnt(0)                                 // wait C (interleaved) 0 = 28 - 27 + 0 - 1
v_add_f32 v[vgprValuC+120], v[vgprValuC+120], v240 // accum partials
v_add_f32 v[vgprValuC+121], v[vgprValuC+121], v241 // accum partials
v_add_f32 v[vgprValuC+122], v[vgprValuC+122], v242 // accum partials
v_add_f32 v[vgprValuC+123], v[vgprValuC+123], v243 // accum partials
v_accvgpr_write_b32 acc67, v[vgprValuC+12]         // copy vreg[112] to acc
v_accvgpr_write_b32 acc71, v[vgprValuC+13]         // copy vreg[113] to acc
v_accvgpr_write_b32 acc75, v[vgprValuC+14]         // copy vreg[114] to acc
v_accvgpr_write_b32 acc79, v[vgprValuC+15]         // copy vreg[115] to acc
v_accvgpr_write_b32 acc83, v[vgprValuC+16]         // copy vreg[116] to acc
v_accvgpr_write_b32 acc87, v[vgprValuC+17]         // copy vreg[117] to acc
v_accvgpr_write_b32 acc91, v[vgprValuC+18]         // copy vreg[118] to acc
v_accvgpr_write_b32 acc95, v[vgprValuC+19]         // copy vreg[119] to acc
v_accvgpr_write_b32 acc99, v[vgprValuC+20]         // copy vreg[120] to acc
v_accvgpr_write_b32 acc103, v[vgprValuC+21]        // copy vreg[121] to acc
v_accvgpr_write_b32 acc107, v[vgprValuC+22]        // copy vreg[122] to acc
v_accvgpr_write_b32 acc111, v[vgprValuC+23]        // copy vreg[123] to acc
v_accvgpr_write_b32 acc115, v[vgprValuC+24]        // copy vreg[124] to acc
v_accvgpr_write_b32 acc119, v[vgprValuC+25]        // copy vreg[125] to acc
v_accvgpr_write_b32 acc123, v[vgprValuC+26]        // copy vreg[126] to acc
v_accvgpr_write_b32 acc127, v[vgprValuC+27]        // copy vreg[127] to acc
v_accvgpr_write_b32 acc128, v[vgprValuC+28]        // copy vreg[128] to acc
v_accvgpr_write_b32 acc132, v[vgprValuC+29]        // copy vreg[129] to acc
v_accvgpr_write_b32 acc136, v[vgprValuC+30]        // copy vreg[130] to acc
v_accvgpr_write_b32 acc140, v[vgprValuC+31]        // copy vreg[131] to acc
v_accvgpr_write_b32 acc144, v[vgprValuC+32]        // copy vreg[132] to acc
v_accvgpr_write_b32 acc148, v[vgprValuC+33]        // copy vreg[133] to acc
v_accvgpr_write_b32 acc152, v[vgprValuC+34]        // copy vreg[134] to acc
v_accvgpr_write_b32 acc156, v[vgprValuC+35]        // copy vreg[135] to acc
v_accvgpr_write_b32 acc160, v[vgprValuC+36]        // copy vreg[136] to acc
v_accvgpr_write_b32 acc164, v[vgprValuC+37]        // copy vreg[137] to acc
v_accvgpr_write_b32 acc168, v[vgprValuC+38]        // copy vreg[138] to acc
v_accvgpr_write_b32 acc172, v[vgprValuC+39]        // copy vreg[139] to acc
v_accvgpr_write_b32 acc176, v[vgprValuC+40]        // copy vreg[140] to acc
v_accvgpr_write_b32 acc180, v[vgprValuC+41]        // copy vreg[141] to acc
v_accvgpr_write_b32 acc184, v[vgprValuC+42]        // copy vreg[142] to acc
v_accvgpr_write_b32 acc188, v[vgprValuC+43]        // copy vreg[143] to acc
v_accvgpr_write_b32 acc192, v[vgprValuC+44]        // copy vreg[144] to acc
v_accvgpr_write_b32 acc196, v[vgprValuC+45]        // copy vreg[145] to acc
v_accvgpr_write_b32 acc200, v[vgprValuC+46]        // copy vreg[146] to acc
v_accvgpr_write_b32 acc204, v[vgprValuC+47]        // copy vreg[147] to acc
v_accvgpr_write_b32 acc208, v[vgprValuC+48]        // copy vreg[148] to acc
v_accvgpr_write_b32 acc212, v[vgprValuC+49]        // copy vreg[149] to acc
v_accvgpr_write_b32 acc216, v[vgprValuC+50]        // copy vreg[150] to acc
v_accvgpr_write_b32 acc220, v[vgprValuC+51]        // copy vreg[151] to acc
v_accvgpr_write_b32 acc224, v[vgprValuC+52]        // copy vreg[152] to acc
v_accvgpr_write_b32 acc228, v[vgprValuC+53]        // copy vreg[153] to acc
v_accvgpr_write_b32 acc232, v[vgprValuC+54]        // copy vreg[154] to acc
v_accvgpr_write_b32 acc236, v[vgprValuC+55]        // copy vreg[155] to acc
v_accvgpr_write_b32 acc240, v[vgprValuC+56]        // copy vreg[156] to acc
v_accvgpr_write_b32 acc244, v[vgprValuC+57]        // copy vreg[157] to acc
v_accvgpr_write_b32 acc248, v[vgprValuC+58]        // copy vreg[158] to acc
v_accvgpr_write_b32 acc252, v[vgprValuC+59]        // copy vreg[159] to acc
v_accvgpr_write_b32 acc129, v[vgprValuC+60]        // copy vreg[160] to acc
v_accvgpr_write_b32 acc133, v[vgprValuC+61]        // copy vreg[161] to acc
v_accvgpr_write_b32 acc137, v[vgprValuC+62]        // copy vreg[162] to acc
v_accvgpr_write_b32 acc141, v[vgprValuC+63]        // copy vreg[163] to acc
v_accvgpr_write_b32 acc145, v[vgprValuC+64]        // copy vreg[164] to acc
v_accvgpr_write_b32 acc149, v[vgprValuC+65]        // copy vreg[165] to acc
v_accvgpr_write_b32 acc153, v[vgprValuC+66]        // copy vreg[166] to acc
v_accvgpr_write_b32 acc157, v[vgprValuC+67]        // copy vreg[167] to acc
v_accvgpr_write_b32 acc161, v[vgprValuC+68]        // copy vreg[168] to acc
v_accvgpr_write_b32 acc165, v[vgprValuC+69]        // copy vreg[169] to acc
v_accvgpr_write_b32 acc169, v[vgprValuC+70]        // copy vreg[170] to acc
v_accvgpr_write_b32 acc173, v[vgprValuC+71]        // copy vreg[171] to acc
v_accvgpr_write_b32 acc177, v[vgprValuC+72]        // copy vreg[172] to acc
v_accvgpr_write_b32 acc181, v[vgprValuC+73]        // copy vreg[173] to acc
v_accvgpr_write_b32 acc185, v[vgprValuC+74]        // copy vreg[174] to acc
v_accvgpr_write_b32 acc189, v[vgprValuC+75]        // copy vreg[175] to acc
v_accvgpr_write_b32 acc193, v[vgprValuC+76]        // copy vreg[176] to acc
v_accvgpr_write_b32 acc197, v[vgprValuC+77]        // copy vreg[177] to acc
v_accvgpr_write_b32 acc201, v[vgprValuC+78]        // copy vreg[178] to acc
v_accvgpr_write_b32 acc205, v[vgprValuC+79]        // copy vreg[179] to acc
v_accvgpr_write_b32 acc209, v[vgprValuC+80]        // copy vreg[180] to acc
v_accvgpr_write_b32 acc213, v[vgprValuC+81]        // copy vreg[181] to acc
v_accvgpr_write_b32 acc217, v[vgprValuC+82]        // copy vreg[182] to acc
v_accvgpr_write_b32 acc221, v[vgprValuC+83]        // copy vreg[183] to acc
v_accvgpr_write_b32 acc225, v[vgprValuC+84]        // copy vreg[184] to acc
v_accvgpr_write_b32 acc229, v[vgprValuC+85]        // copy vreg[185] to acc
v_accvgpr_write_b32 acc233, v[vgprValuC+86]        // copy vreg[186] to acc
v_accvgpr_write_b32 acc237, v[vgprValuC+87]        // copy vreg[187] to acc
v_accvgpr_write_b32 acc241, v[vgprValuC+88]        // copy vreg[188] to acc
v_accvgpr_write_b32 acc245, v[vgprValuC+89]        // copy vreg[189] to acc
v_accvgpr_write_b32 acc249, v[vgprValuC+90]        // copy vreg[190] to acc
v_accvgpr_write_b32 acc253, v[vgprValuC+91]        // copy vreg[191] to acc
v_accvgpr_write_b32 acc130, v[vgprValuC+92]        // copy vreg[192] to acc
v_accvgpr_write_b32 acc134, v[vgprValuC+93]        // copy vreg[193] to acc
v_accvgpr_write_b32 acc138, v[vgprValuC+94]        // copy vreg[194] to acc
v_accvgpr_write_b32 acc142, v[vgprValuC+95]        // copy vreg[195] to acc
v_accvgpr_write_b32 acc146, v[vgprValuC+96]        // copy vreg[196] to acc
v_accvgpr_write_b32 acc150, v[vgprValuC+97]        // copy vreg[197] to acc
v_accvgpr_write_b32 acc154, v[vgprValuC+98]        // copy vreg[198] to acc
v_accvgpr_write_b32 acc158, v[vgprValuC+99]        // copy vreg[199] to acc
v_accvgpr_write_b32 acc162, v[vgprValuC+100]       // copy vreg[200] to acc
v_accvgpr_write_b32 acc166, v[vgprValuC+101]       // copy vreg[201] to acc
v_accvgpr_write_b32 acc170, v[vgprValuC+102]       // copy vreg[202] to acc
v_accvgpr_write_b32 acc174, v[vgprValuC+103]       // copy vreg[203] to acc
v_accvgpr_write_b32 acc178, v[vgprValuC+104]       // copy vreg[204] to acc
v_accvgpr_write_b32 acc182, v[vgprValuC+105]       // copy vreg[205] to acc
v_accvgpr_write_b32 acc186, v[vgprValuC+106]       // copy vreg[206] to acc
v_accvgpr_write_b32 acc190, v[vgprValuC+107]       // copy vreg[207] to acc
v_accvgpr_write_b32 acc194, v[vgprValuC+108]       // copy vreg[208] to acc
v_accvgpr_write_b32 acc198, v[vgprValuC+109]       // copy vreg[209] to acc
v_accvgpr_write_b32 acc202, v[vgprValuC+110]       // copy vreg[210] to acc
v_accvgpr_write_b32 acc206, v[vgprValuC+111]       // copy vreg[211] to acc
v_accvgpr_write_b32 acc210, v[vgprValuC+112]       // copy vreg[212] to acc
v_accvgpr_write_b32 acc214, v[vgprValuC+113]       // copy vreg[213] to acc
v_accvgpr_write_b32 acc218, v[vgprValuC+114]       // copy vreg[214] to acc
v_accvgpr_write_b32 acc222, v[vgprValuC+115]       // copy vreg[215] to acc
v_accvgpr_write_b32 acc226, v[vgprValuC+116]       // copy vreg[216] to acc
v_accvgpr_write_b32 acc230, v[vgprValuC+117]       // copy vreg[217] to acc
v_accvgpr_write_b32 acc234, v[vgprValuC+118]       // copy vreg[218] to acc
v_accvgpr_write_b32 acc238, v[vgprValuC+119]       // copy vreg[219] to acc
v_accvgpr_write_b32 acc242, v[vgprValuC+120]       // copy vreg[220] to acc
v_accvgpr_write_b32 acc246, v[vgprValuC+121]       // copy vreg[221] to acc
v_accvgpr_write_b32 acc250, v[vgprValuC+122]       // copy vreg[222] to acc
v_accvgpr_write_b32 acc254, v[vgprValuC+123]       // copy vreg[223] to acc
s_nop 1                                            // 2 wait states required before reading vgpr
s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */

/******************************************/
/* Fixup Batch #2 (d1,d0,vc1,vc0) =       */
/*      (1,0,12,0:vw4); (1,1,12,0:vw4); (1,0,13,0:vw4); (1,1,13,0:vw4); (1,0,14,0:vw4); (1,1,14,0:vw4); (1,0,15,0:vw4); (1,1,15,0:vw4) */
/******************************************/
/* Overview: 8 elements x 4 dwords = 32 values are fixed up in this batch:            */
/*   1. issue 8 buffer_load_dwordx4 from the workspace buffer (sgprSrdWS) into v[44:75] */
/*   2. copy acc131, acc135, ... acc255 (stride 4) into v[vgprValuC+12 .. vgprValuC+43] */
/*   3. add the loaded partials to those values, one dwordx4 group per vmcnt step      */
/*   4. write the 32 sums back to the same accumulator registers                       */
/* NOTE(review): assumes v[vgprValuC+12 .. +43] does not overlap the load destinations v[44:75] - confirm against the vgprValuC definition. */

/* calc coords, apply mask, and issue loads (if necessary) */
/* s74 is the running scalar byte offset into the workspace; it is advanced by 4096 before every load. */
/* v10 supplies the per-lane offset (offen). All 8 loads are issued back-to-back before any result is consumed. */
s_add_u32 s74, s74, 4096                           // Inc sgpr offset
buffer_load_dwordx4 v[44:47], v10, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS
s_add_u32 s74, s74, 4096                           // Inc sgpr offset
buffer_load_dwordx4 v[48:51], v10, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS
s_add_u32 s74, s74, 4096                           // Inc sgpr offset
buffer_load_dwordx4 v[52:55], v10, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS
s_add_u32 s74, s74, 4096                           // Inc sgpr offset
buffer_load_dwordx4 v[56:59], v10, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS
s_add_u32 s74, s74, 4096                           // Inc sgpr offset
buffer_load_dwordx4 v[60:63], v10, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS
s_add_u32 s74, s74, 4096                           // Inc sgpr offset
buffer_load_dwordx4 v[64:67], v10, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS
s_add_u32 s74, s74, 4096                           // Inc sgpr offset
buffer_load_dwordx4 v[68:71], v10, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS
s_add_u32 s74, s74, 4096                           // Inc sgpr offset
buffer_load_dwordx4 v[72:75], v10, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS
/* While the loads are in flight, stage this batch's local results: acc(131 + 4*i) -> v[vgprValuC+12+i], i = 0..31. */
v_accvgpr_read_b32 v[vgprValuC+12], acc131         // copy acc to vreg[224]
v_accvgpr_read_b32 v[vgprValuC+13], acc135         // copy acc to vreg[225]
v_accvgpr_read_b32 v[vgprValuC+14], acc139         // copy acc to vreg[226]
v_accvgpr_read_b32 v[vgprValuC+15], acc143         // copy acc to vreg[227]
v_accvgpr_read_b32 v[vgprValuC+16], acc147         // copy acc to vreg[228]
v_accvgpr_read_b32 v[vgprValuC+17], acc151         // copy acc to vreg[229]
v_accvgpr_read_b32 v[vgprValuC+18], acc155         // copy acc to vreg[230]
v_accvgpr_read_b32 v[vgprValuC+19], acc159         // copy acc to vreg[231]
v_accvgpr_read_b32 v[vgprValuC+20], acc163         // copy acc to vreg[232]
v_accvgpr_read_b32 v[vgprValuC+21], acc167         // copy acc to vreg[233]
v_accvgpr_read_b32 v[vgprValuC+22], acc171         // copy acc to vreg[234]
v_accvgpr_read_b32 v[vgprValuC+23], acc175         // copy acc to vreg[235]
v_accvgpr_read_b32 v[vgprValuC+24], acc179         // copy acc to vreg[236]
v_accvgpr_read_b32 v[vgprValuC+25], acc183         // copy acc to vreg[237]
v_accvgpr_read_b32 v[vgprValuC+26], acc187         // copy acc to vreg[238]
v_accvgpr_read_b32 v[vgprValuC+27], acc191         // copy acc to vreg[239]
v_accvgpr_read_b32 v[vgprValuC+28], acc195         // copy acc to vreg[240]
v_accvgpr_read_b32 v[vgprValuC+29], acc199         // copy acc to vreg[241]
v_accvgpr_read_b32 v[vgprValuC+30], acc203         // copy acc to vreg[242]
v_accvgpr_read_b32 v[vgprValuC+31], acc207         // copy acc to vreg[243]
v_accvgpr_read_b32 v[vgprValuC+32], acc211         // copy acc to vreg[244]
v_accvgpr_read_b32 v[vgprValuC+33], acc215         // copy acc to vreg[245]
v_accvgpr_read_b32 v[vgprValuC+34], acc219         // copy acc to vreg[246]
v_accvgpr_read_b32 v[vgprValuC+35], acc223         // copy acc to vreg[247]
v_accvgpr_read_b32 v[vgprValuC+36], acc227         // copy acc to vreg[248]
v_accvgpr_read_b32 v[vgprValuC+37], acc231         // copy acc to vreg[249]
v_accvgpr_read_b32 v[vgprValuC+38], acc235         // copy acc to vreg[250]
v_accvgpr_read_b32 v[vgprValuC+39], acc239         // copy acc to vreg[251]
v_accvgpr_read_b32 v[vgprValuC+40], acc243         // copy acc to vreg[252]
v_accvgpr_read_b32 v[vgprValuC+41], acc247         // copy acc to vreg[253]
v_accvgpr_read_b32 v[vgprValuC+42], acc251         // copy acc to vreg[254]
v_accvgpr_read_b32 v[vgprValuC+43], acc255         // copy acc to vreg[255]
s_nop 1                                            // 2 wait states required before reading vgpr

/* apply mask, calc new C and issue writes */
/* Each vmcnt(N) below waits until at most N of the 8 loads are still outstanding, so the k-th group of */
/* four adds only starts once the k-th dwordx4 load (v[44+4k : 47+4k]) has returned; later loads keep flying. */

s_waitcnt vmcnt(7)                                 // wait C (interleaved) 7 = 8 - 0 + 0 - 1
v_add_f32 v[vgprValuC+12], v[vgprValuC+12], v44    // accum partials
v_add_f32 v[vgprValuC+13], v[vgprValuC+13], v45    // accum partials
v_add_f32 v[vgprValuC+14], v[vgprValuC+14], v46    // accum partials
v_add_f32 v[vgprValuC+15], v[vgprValuC+15], v47    // accum partials

s_waitcnt vmcnt(6)                                 // wait C (interleaved) 6 = 8 - 1 + 0 - 1
v_add_f32 v[vgprValuC+16], v[vgprValuC+16], v48    // accum partials
v_add_f32 v[vgprValuC+17], v[vgprValuC+17], v49    // accum partials
v_add_f32 v[vgprValuC+18], v[vgprValuC+18], v50    // accum partials
v_add_f32 v[vgprValuC+19], v[vgprValuC+19], v51    // accum partials

s_waitcnt vmcnt(5)                                 // wait C (interleaved) 5 = 8 - 2 + 0 - 1
v_add_f32 v[vgprValuC+20], v[vgprValuC+20], v52    // accum partials
v_add_f32 v[vgprValuC+21], v[vgprValuC+21], v53    // accum partials
v_add_f32 v[vgprValuC+22], v[vgprValuC+22], v54    // accum partials
v_add_f32 v[vgprValuC+23], v[vgprValuC+23], v55    // accum partials

s_waitcnt vmcnt(4)                                 // wait C (interleaved) 4 = 8 - 3 + 0 - 1
v_add_f32 v[vgprValuC+24], v[vgprValuC+24], v56    // accum partials
v_add_f32 v[vgprValuC+25], v[vgprValuC+25], v57    // accum partials
v_add_f32 v[vgprValuC+26], v[vgprValuC+26], v58    // accum partials
v_add_f32 v[vgprValuC+27], v[vgprValuC+27], v59    // accum partials

s_waitcnt vmcnt(3)                                 // wait C (interleaved) 3 = 8 - 4 + 0 - 1
v_add_f32 v[vgprValuC+28], v[vgprValuC+28], v60    // accum partials
v_add_f32 v[vgprValuC+29], v[vgprValuC+29], v61    // accum partials
v_add_f32 v[vgprValuC+30], v[vgprValuC+30], v62    // accum partials
v_add_f32 v[vgprValuC+31], v[vgprValuC+31], v63    // accum partials

s_waitcnt vmcnt(2)                                 // wait C (interleaved) 2 = 8 - 5 + 0 - 1
v_add_f32 v[vgprValuC+32], v[vgprValuC+32], v64    // accum partials
v_add_f32 v[vgprValuC+33], v[vgprValuC+33], v65    // accum partials
v_add_f32 v[vgprValuC+34], v[vgprValuC+34], v66    // accum partials
v_add_f32 v[vgprValuC+35], v[vgprValuC+35], v67    // accum partials

s_waitcnt vmcnt(1)                                 // wait C (interleaved) 1 = 8 - 6 + 0 - 1
v_add_f32 v[vgprValuC+36], v[vgprValuC+36], v68    // accum partials
v_add_f32 v[vgprValuC+37], v[vgprValuC+37], v69    // accum partials
v_add_f32 v[vgprValuC+38], v[vgprValuC+38], v70    // accum partials
v_add_f32 v[vgprValuC+39], v[vgprValuC+39], v71    // accum partials

s_waitcnt vmcnt(0)                                 // wait C (interleaved) 0 = 8 - 7 + 0 - 1
v_add_f32 v[vgprValuC+40], v[vgprValuC+40], v72    // accum partials
v_add_f32 v[vgprValuC+41], v[vgprValuC+41], v73    // accum partials
v_add_f32 v[vgprValuC+42], v[vgprValuC+42], v74    // accum partials
v_add_f32 v[vgprValuC+43], v[vgprValuC+43], v75    // accum partials
/* Store the accumulated sums back: v[vgprValuC+12+i] -> acc(131 + 4*i), i = 0..31 (exact inverse of the reads above). */
v_accvgpr_write_b32 acc131, v[vgprValuC+12]        // copy vreg[224] to acc
v_accvgpr_write_b32 acc135, v[vgprValuC+13]        // copy vreg[225] to acc
v_accvgpr_write_b32 acc139, v[vgprValuC+14]        // copy vreg[226] to acc
v_accvgpr_write_b32 acc143, v[vgprValuC+15]        // copy vreg[227] to acc
v_accvgpr_write_b32 acc147, v[vgprValuC+16]        // copy vreg[228] to acc
v_accvgpr_write_b32 acc151, v[vgprValuC+17]        // copy vreg[229] to acc
v_accvgpr_write_b32 acc155, v[vgprValuC+18]        // copy vreg[230] to acc
v_accvgpr_write_b32 acc159, v[vgprValuC+19]        // copy vreg[231] to acc
v_accvgpr_write_b32 acc163, v[vgprValuC+20]        // copy vreg[232] to acc
v_accvgpr_write_b32 acc167, v[vgprValuC+21]        // copy vreg[233] to acc
v_accvgpr_write_b32 acc171, v[vgprValuC+22]        // copy vreg[234] to acc
v_accvgpr_write_b32 acc175, v[vgprValuC+23]        // copy vreg[235] to acc
v_accvgpr_write_b32 acc179, v[vgprValuC+24]        // copy vreg[236] to acc
v_accvgpr_write_b32 acc183, v[vgprValuC+25]        // copy vreg[237] to acc
v_accvgpr_write_b32 acc187, v[vgprValuC+26]        // copy vreg[238] to acc
v_accvgpr_write_b32 acc191, v[vgprValuC+27]        // copy vreg[239] to acc
v_accvgpr_write_b32 acc195, v[vgprValuC+28]        // copy vreg[240] to acc
v_accvgpr_write_b32 acc199, v[vgprValuC+29]        // copy vreg[241] to acc
v_accvgpr_write_b32 acc203, v[vgprValuC+30]        // copy vreg[242] to acc
v_accvgpr_write_b32 acc207, v[vgprValuC+31]        // copy vreg[243] to acc
v_accvgpr_write_b32 acc211, v[vgprValuC+32]        // copy vreg[244] to acc
v_accvgpr_write_b32 acc215, v[vgprValuC+33]        // copy vreg[245] to acc
v_accvgpr_write_b32 acc219, v[vgprValuC+34]        // copy vreg[246] to acc
v_accvgpr_write_b32 acc223, v[vgprValuC+35]        // copy vreg[247] to acc
v_accvgpr_write_b32 acc227, v[vgprValuC+36]        // copy vreg[248] to acc
v_accvgpr_write_b32 acc231, v[vgprValuC+37]        // copy vreg[249] to acc
v_accvgpr_write_b32 acc235, v[vgprValuC+38]        // copy vreg[250] to acc
v_accvgpr_write_b32 acc239, v[vgprValuC+39]        // copy vreg[251] to acc
v_accvgpr_write_b32 acc243, v[vgprValuC+40]        // copy vreg[252] to acc
v_accvgpr_write_b32 acc247, v[vgprValuC+41]        // copy vreg[253] to acc
v_accvgpr_write_b32 acc251, v[vgprValuC+42]        // copy vreg[254] to acc
v_accvgpr_write_b32 acc255, v[vgprValuC+43]        // copy vreg[255] to acc
/* Hazard padding emitted after the accumulator write-back. */
/* NOTE(review): the second comment mentions a dwordx4 store, but this batch only issues loads - the nop looks like generic generator padding; confirm before removing. */
s_nop 1                                            // 2 wait states required before reading vgpr
s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
/* Stream-K fixup loop control: advance to the next contributing workgroup's partial tile. */
/*   s77 = index of the partial tile / contributing WG being processed                    */
/*   s78 = running iteration count covered so far for this output tile                    */
/*   s73 = iterations done by the next WG: SKItersPerWG + 1 if s77 < skExtraIters, else SKItersPerWG */
/* The loop repeats while s78 < ItersPerTile, i.e. while partials still remain to be accumulated. */
/* NOTE(review): s77 is compared before it is incremented; presumably intentional given how s77 is initialised ahead of label_SK_Fixup (not visible here) - confirm. */
s_add_u32 s73, s[sgprSKItersPerWG], 1              // Add extra iter
s_cmp_lt_u32 s77, s[sgprskExtraIters]              // Check if next WG had an extra iteration
s_cselect_b32 s73, s73, s[sgprSKItersPerWG]        // Select correct number of iterations for next WG
s_add_u32 s78, s78, s73                            // next partial tile iteration
s_add_u32 s77, s77, 1                              // next partial tile index
s_cmp_lt_u32 s78, s[sgprItersPerTile]              // done loading partial tiles?
s_cbranch_scc1 label_SK_Fixup                      // Branch to continue fixup loop
/* label_SK_Store: entry to the final store path. Chooses between the Beta != 0 path and the Beta == 0 */
/* path, and within Beta == 0 decides whether this workgroup's tile needs edge (partial-tile) handling. */
/* Falls through to label_GW_B0_E0 when Beta == 0 and neither dimension has a remainder for this WG.    */
/* Clobbers s74, s75 and SCC.                                                                            */
label_SK_Store:
/* Integer compare on the raw 32-bit register: only the all-zero bit pattern takes the Beta == 0 path. */
s_cmpk_eq_u32 s[sgprBeta], 0                       // Beta == 0
s_cbranch_scc0 label_GW_Beta                       // Branch if Beta is not zero

/* Dimension I (macro tile 256): remainder is SizeI & 255; it only matters for the last workgroup along dim 0. */
s_and_b32 s74, 255, s[sgprSizeI]                   // s74 = s[sgprSizeI] % 256
s_add_u32 s75, -0x1, s[sgprNumWorkGroups0]
s_cmp_ge_u32 s[sgprWorkGroup0], s75                // wg0 >= nwg0-1 ?
s_cselect_b32 s74, s74, 0                          // set rMT0
s_cmpk_gt_u32 s74, 0                               // rMT0 > 0
s_cbranch_scc1 label_GW_B0_E1_M                    // jump if edges required
/* Dimension J: same test with SizeJ / NumWorkGroups1 / WorkGroup1. */
s_and_b32 s74, 255, s[sgprSizeJ]                   // s74 = s[sgprSizeJ] % 256
s_add_u32 s75, -0x1, s[sgprNumWorkGroups1]
s_cmp_ge_u32 s[sgprWorkGroup1], s75                // wg1 >= nwg1-1
s_cselect_b32 s74, s74, 0                          // set rMT1
s_cmpk_gt_u32 s74, 0                               // rMT1 > 0
s_cbranch_scc1 label_GW_B0_E1_N                    // jump if edges required
/* label_GW_B0_E0: Beta == 0, no-edge path. Dispatch on the runtime activation type:  */
/*   3 -> Gelu, 5 -> Relu, 10 -> Silu, 12 -> Clamp; any other value falls through to   */
/*   label_To_Activation_None_VW4_beta_0_edge_0, which immediately follows this block. */
/* Only SCC is modified here.                                                          */
label_GW_B0_E0:
s_cmpk_eq_u32 s[sgprActivationType], 3             // activationType == 3
s_cbranch_scc1 label_To_Activation_Gelu_VW4_beta_0_edge_0 // Branch if true
s_cmpk_eq_u32 s[sgprActivationType], 5             // activationType == 5
s_cbranch_scc1 label_To_Activation_Relu_VW4_beta_0_edge_0 // Branch if true
s_cmpk_eq_u32 s[sgprActivationType], 10            // activationType == 10
s_cbranch_scc1 label_To_Activation_Silu_VW4_beta_0_edge_0 // Branch if true
s_cmpk_eq_u32 s[sgprActivationType], 12            // activationType == 12
s_cbranch_scc1 label_To_Activation_Clamp_VW4_beta_0_edge_0 // Branch if true
/* Activation address setup (Beta == 0, no edge). Each stub below stores the address of one activation */
/* routine in s[8:9] and then joins at label_ActivationSetPCAddrEnd_5. Pattern per stub:                */
/*   s_getpc_b64  s[8:9]          - address of the instruction after s_getpc                            */
/*   s_add_i32    s73, <label>, 4 - 32-bit offset to the target, resolved by the assembler              */
/*   s_add_u32 / s_addc_u32       - 64-bit add of s73 into s[8:9]                                       */
/* Clobbers s73 and SCC.                                                                                */
/* NOTE(review): s[8:9] is presumably consumed later by an s_setpc/s_swappc-style jump that is not visible in this chunk - confirm. */
/* NOTE(review): the high half only receives the carry (s_addc_u32 ..., 0), so s73 is not sign-extended; */
/* this appears to rely on every label_Activation_* target lying at a higher address than the stub - confirm. */
label_To_Activation_None_VW4_beta_0_edge_0:
s_getpc_b64 s[8:9]                                 // addr of next instr
s_add_i32 s73, label_Activation_None_VW4, 4        // target branch offset
s_add_u32 s8, s8, s73                              // add target branch offset
s_addc_u32 s9, s9, 0                               // add high and carry
s_branch label_ActivationSetPCAddrEnd_5
label_To_Activation_Gelu_VW4_beta_0_edge_0:
s_getpc_b64 s[8:9]                                 // addr of next instr
s_add_i32 s73, label_Activation_Gelu_VW4, 4        // target branch offset
s_add_u32 s8, s8, s73                              // add target branch offset
s_addc_u32 s9, s9, 0                               // add high and carry
s_branch label_ActivationSetPCAddrEnd_5
label_To_Activation_Relu_VW4_beta_0_edge_0:
s_getpc_b64 s[8:9]                                 // addr of next instr
s_add_i32 s73, label_Activation_Relu_VW4, 4        // target branch offset
s_add_u32 s8, s8, s73                              // add target branch offset
s_addc_u32 s9, s9, 0                               // add high and carry
s_branch label_ActivationSetPCAddrEnd_5
label_To_Activation_Silu_VW4_beta_0_edge_0:
s_getpc_b64 s[8:9]                                 // addr of next instr
s_add_i32 s73, label_Activation_Silu_VW4, 4        // target branch offset
s_add_u32 s8, s8, s73                              // add target branch offset
s_addc_u32 s9, s9, 0                               // add high and carry
s_branch label_ActivationSetPCAddrEnd_5
label_To_Activation_Clamp_VW4_beta_0_edge_0:
s_getpc_b64 s[8:9]                                 // addr of next instr
s_add_i32 s73, label_Activation_Clamp_VW4, 4       // target branch offset
s_add_u32 s8, s8, s73                              // add target branch offset
s_addc_u32 s9, s9, 0                               // add high and carry
/* The branch below targets the label on the very next line; it is kept so all five stubs have the same shape. */
s_branch label_ActivationSetPCAddrEnd_5
label_ActivationSetPCAddrEnd_5:

/* edge=0, allocate 2 sgpr. perBatchTmpS=2 perBatchMaskS=0 perElementMaskS=0 elementsPerBatch=19 */
/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 factorDim=0 */

/******************************************/
/* Global Write Batch #0 (d1,d0,vc1,vc0) = */
/*    (0,0,0,0:vw4); (0,1,0,0:vw4); (0,0,1,0:vw4); (0,1,1,0:vw4); (0,0,2,0:vw4); (0,1,2,0:vw4); (0,0,3,0:vw4); (0,1,3,0:vw4); (0,0,4,0:vw4); (0,1,4,0:vw4); (0,0,5,0:vw4); (0,1,5,0:vw4); (0,0,6,0:vw4); (0,1,6,0:vw4); (0,0,7,0:vw4); (0,1,7,0:vw4); (0,0,8,0:vw4); (0,1,8,0:vw4); (0,0,9,0:vw4) */
/******************************************/

/* calc coords, apply mask, and issue loads (if necessary) */
/* (d1,vc1,d0,vc0)=(0,0,0,0) */
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v11, v0, s74
v_lshlrev_b32 v11, 0x2, v11                        // Bias address scaled by BPE
s_waitcnt lgkmcnt(0)                               // Wait for LDS write
s_barrier                                          // LDS write barrier
ds_read_b128 v[88:91], v11 offset:0                // load Bias
ds_read_b128 v[92:95], v11 offset:1024             // load scaleAlpha
/* (d1,vc1,d0,vc0)=(0,0,1,0) */
ds_read_b128 v[96:99], v11 offset:512              // load Bias
ds_read_b128 v[100:103], v11 offset:1536           // load scaleAlpha
/* (d1,vc1,d0,vc0)=(0,1,0,0) */
/* (d1,vc1,d0,vc0)=(0,1,1,0) */
/* (d1,vc1,d0,vc0)=(0,2,0,0) */
/* (d1,vc1,d0,vc0)=(0,2,1,0) */
/* (d1,vc1,d0,vc0)=(0,3,0,0) */
/* (d1,vc1,d0,vc0)=(0,3,1,0) */
/* (d1,vc1,d0,vc0)=(0,4,0,0) */
/* (d1,vc1,d0,vc0)=(0,4,1,0) */
/* (d1,vc1,d0,vc0)=(0,5,0,0) */
/* (d1,vc1,d0,vc0)=(0,5,1,0) */
/* (d1,vc1,d0,vc0)=(0,6,0,0) */
/* (d1,vc1,d0,vc0)=(0,6,1,0) */
/* (d1,vc1,d0,vc0)=(0,7,0,0) */
/* (d1,vc1,d0,vc0)=(0,7,1,0) */
/* (d1,vc1,d0,vc0)=(0,8,0,0) */
/* (d1,vc1,d0,vc0)=(0,8,1,0) */
/* (d1,vc1,d0,vc0)=(0,9,0,0) */
v_add_lshl_u32 v9, v3, v0, 0x2                     // optSingleColVgpr scaleToBpe: sharedAddrVgpr <- cinRowPtr + coord0, scaled by BPE. BSHERE:coord0=0, coord0Vgpr=0
v_accvgpr_read_b32 v[vgprValuC+12], acc0           // copy acc to vreg[0]
v_accvgpr_read_b32 v[vgprValuC+13], acc4           // copy acc to vreg[1]
v_accvgpr_read_b32 v[vgprValuC+14], acc8           // copy acc to vreg[2]
v_accvgpr_read_b32 v[vgprValuC+15], acc12          // copy acc to vreg[3]
v_accvgpr_read_b32 v[vgprValuC+16], acc16          // copy acc to vreg[4]
v_accvgpr_read_b32 v[vgprValuC+17], acc20          // copy acc to vreg[5]
v_accvgpr_read_b32 v[vgprValuC+18], acc24          // copy acc to vreg[6]
v_accvgpr_read_b32 v[vgprValuC+19], acc28          // copy acc to vreg[7]
v_accvgpr_read_b32 v[vgprValuC+20], acc32          // copy acc to vreg[8]
v_accvgpr_read_b32 v[vgprValuC+21], acc36          // copy acc to vreg[9]
v_accvgpr_read_b32 v[vgprValuC+22], acc40          // copy acc to vreg[10]
v_accvgpr_read_b32 v[vgprValuC+23], acc44          // copy acc to vreg[11]
v_accvgpr_read_b32 v[vgprValuC+24], acc48          // copy acc to vreg[12]
v_accvgpr_read_b32 v[vgprValuC+25], acc52          // copy acc to vreg[13]
v_accvgpr_read_b32 v[vgprValuC+26], acc56          // copy acc to vreg[14]
v_accvgpr_read_b32 v[vgprValuC+27], acc60          // copy acc to vreg[15]
v_accvgpr_read_b32 v[vgprValuC+28], acc64          // copy acc to vreg[16]
v_accvgpr_read_b32 v[vgprValuC+29], acc68          // copy acc to vreg[17]
v_accvgpr_read_b32 v[vgprValuC+30], acc72          // copy acc to vreg[18]
v_accvgpr_read_b32 v[vgprValuC+31], acc76          // copy acc to vreg[19]
v_accvgpr_read_b32 v[vgprValuC+32], acc80          // copy acc to vreg[20]
v_accvgpr_read_b32 v[vgprValuC+33], acc84          // copy acc to vreg[21]
v_accvgpr_read_b32 v[vgprValuC+34], acc88          // copy acc to vreg[22]
v_accvgpr_read_b32 v[vgprValuC+35], acc92          // copy acc to vreg[23]
v_accvgpr_read_b32 v[vgprValuC+36], acc96          // copy acc to vreg[24]
v_accvgpr_read_b32 v[vgprValuC+37], acc100         // copy acc to vreg[25]
v_accvgpr_read_b32 v[vgprValuC+38], acc104         // copy acc to vreg[26]
v_accvgpr_read_b32 v[vgprValuC+39], acc108         // copy acc to vreg[27]
v_accvgpr_read_b32 v[vgprValuC+40], acc112         // copy acc to vreg[28]
v_accvgpr_read_b32 v[vgprValuC+41], acc116         // copy acc to vreg[29]
v_accvgpr_read_b32 v[vgprValuC+42], acc120         // copy acc to vreg[30]
v_accvgpr_read_b32 v[vgprValuC+43], acc124         // copy acc to vreg[31]
v_accvgpr_read_b32 v[vgprValuC+44], acc1           // copy acc to vreg[32]
v_accvgpr_read_b32 v[vgprValuC+45], acc5           // copy acc to vreg[33]
v_accvgpr_read_b32 v[vgprValuC+46], acc9           // copy acc to vreg[34]
v_accvgpr_read_b32 v[vgprValuC+47], acc13          // copy acc to vreg[35]
v_accvgpr_read_b32 v[vgprValuC+48], acc17          // copy acc to vreg[36]
v_accvgpr_read_b32 v[vgprValuC+49], acc21          // copy acc to vreg[37]
v_accvgpr_read_b32 v[vgprValuC+50], acc25          // copy acc to vreg[38]
v_accvgpr_read_b32 v[vgprValuC+51], acc29          // copy acc to vreg[39]
v_accvgpr_read_b32 v[vgprValuC+52], acc33          // copy acc to vreg[40]
v_accvgpr_read_b32 v[vgprValuC+53], acc37          // copy acc to vreg[41]
v_accvgpr_read_b32 v[vgprValuC+54], acc41          // copy acc to vreg[42]
v_accvgpr_read_b32 v[vgprValuC+55], acc45          // copy acc to vreg[43]
v_accvgpr_read_b32 v[vgprValuC+56], acc49          // copy acc to vreg[44]
v_accvgpr_read_b32 v[vgprValuC+57], acc53          // copy acc to vreg[45]
v_accvgpr_read_b32 v[vgprValuC+58], acc57          // copy acc to vreg[46]
v_accvgpr_read_b32 v[vgprValuC+59], acc61          // copy acc to vreg[47]
v_accvgpr_read_b32 v[vgprValuC+60], acc65          // copy acc to vreg[48]
v_accvgpr_read_b32 v[vgprValuC+61], acc69          // copy acc to vreg[49]
v_accvgpr_read_b32 v[vgprValuC+62], acc73          // copy acc to vreg[50]
v_accvgpr_read_b32 v[vgprValuC+63], acc77          // copy acc to vreg[51]
v_accvgpr_read_b32 v[vgprValuC+64], acc81          // copy acc to vreg[52]
v_accvgpr_read_b32 v[vgprValuC+65], acc85          // copy acc to vreg[53]
v_accvgpr_read_b32 v[vgprValuC+66], acc89          // copy acc to vreg[54]
v_accvgpr_read_b32 v[vgprValuC+67], acc93          // copy acc to vreg[55]
v_accvgpr_read_b32 v[vgprValuC+68], acc97          // copy acc to vreg[56]
v_accvgpr_read_b32 v[vgprValuC+69], acc101         // copy acc to vreg[57]
v_accvgpr_read_b32 v[vgprValuC+70], acc105         // copy acc to vreg[58]
v_accvgpr_read_b32 v[vgprValuC+71], acc109         // copy acc to vreg[59]
v_accvgpr_read_b32 v[vgprValuC+72], acc113         // copy acc to vreg[60]
v_accvgpr_read_b32 v[vgprValuC+73], acc117         // copy acc to vreg[61]
v_accvgpr_read_b32 v[vgprValuC+74], acc121         // copy acc to vreg[62]
v_accvgpr_read_b32 v[vgprValuC+75], acc125         // copy acc to vreg[63]
v_accvgpr_read_b32 v[vgprValuC+76], acc2           // copy acc to vreg[64]
v_accvgpr_read_b32 v[vgprValuC+77], acc6           // copy acc to vreg[65]
v_accvgpr_read_b32 v[vgprValuC+78], acc10          // copy acc to vreg[66]
v_accvgpr_read_b32 v[vgprValuC+79], acc14          // copy acc to vreg[67]
v_accvgpr_read_b32 v[vgprValuC+80], acc18          // copy acc to vreg[68]
v_accvgpr_read_b32 v[vgprValuC+81], acc22          // copy acc to vreg[69]
v_accvgpr_read_b32 v[vgprValuC+82], acc26          // copy acc to vreg[70]
v_accvgpr_read_b32 v[vgprValuC+83], acc30          // copy acc to vreg[71]
v_accvgpr_read_b32 v[vgprValuC+84], acc34          // copy acc to vreg[72]
v_accvgpr_read_b32 v[vgprValuC+85], acc38          // copy acc to vreg[73]
v_accvgpr_read_b32 v[vgprValuC+86], acc42          // copy acc to vreg[74]
v_accvgpr_read_b32 v[vgprValuC+87], acc46          // copy acc to vreg[75]

/* rC *= alpha batchElements=[(0, 0, 0, 0), (0, 1, 0, 0), (0, 0, 1, 0), (0, 1, 1, 0), (0, 0, 2, 0), (0, 1, 2, 0), (0, 0, 3, 0), (0, 1, 3, 0), (0, 0, 4, 0), (0, 1, 4, 0), (0, 0, 5, 0), (0, 1, 5, 0), (0, 0, 6, 0), (0, 1, 6, 0), (0, 0, 7, 0), (0, 1, 7, 0), (0, 0, 8, 0), (0, 1, 8, 0), (0, 0, 9, 0)] */
/* Scales the 76 values held in v[vgprValuC+12 .. vgprValuC+87] by alpha, in place:   */
/* 19 batch elements x 4 dwords, handled as 38 packed multiplies of 2 x f32 each.     */
/* op_sel_hi:[0,1,1] makes the upper lane of src0 read the LOW dword of the SGPR pair, */
/* so both lanes are multiplied by s[sgprAlpha]; s[sgprAlpha+1] only completes the     */
/* 64-bit operand and is not used as a multiplier.                                     */
v_pk_mul_f32 v[vgprValuC+12:vgprValuC+12+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+12:vgprValuC+12+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+14:vgprValuC+14+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+14:vgprValuC+14+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+16:vgprValuC+16+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+16:vgprValuC+16+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+18:vgprValuC+18+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+18:vgprValuC+18+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+20:vgprValuC+20+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+20:vgprValuC+20+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+22:vgprValuC+22+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+22:vgprValuC+22+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+64:vgprValuC+64+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+68:vgprValuC+68+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+70:vgprValuC+70+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+72:vgprValuC+72+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+74:vgprValuC+74+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+74:vgprValuC+74+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+76:vgprValuC+76+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+76:vgprValuC+76+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+78:vgprValuC+78+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+78:vgprValuC+78+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+80:vgprValuC+80+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+82:vgprValuC+82+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+82:vgprValuC+82+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+84:vgprValuC+84+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+84:vgprValuC+84+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+86:vgprValuC+86+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+86:vgprValuC+86+1] op_sel_hi:[0,1,1] // *= alpha (pk)

/* apply mask, calc new C and issue writes */
/* Per-element epilogue; the same sequence is repeated for every element of the batch:   */
/*   1. multiply the alpha-scaled quad by a scaleAlphaVec quad (v[92:95] or v[100:103])  */
/*   2. add a bias quad (v[88:91] or v[96:99]); the sum is written to v[4:7]             */
/*   3. s_swappc_b64 jumps to the address held in s[8:9] and saves the return address    */
/*      in s[64:65]                                                                      */
/*      NOTE(review): presumably the activation routine -- its body and the registers    */
/*      it clobbers are not visible in this section; confirm before reordering anything  */
/*   4. copy v[4:7] back into the element's own four registers and store 4 dwords to D   */
/* NOTE(review): the copies and stores name v[12:15], v[16:19] literally; that matches   */
/* v[vgprValuC+12..] only if vgprValuC == 0 -- confirm against the register .set list.   */
/* No exec-mask manipulation appears in this sequence.                                   */

/* element 0 = (0, 0, 0, 0): needs only the first bias / scaleAlphaVec pair, so two LDS  */
/* reads may still be in flight (lgkmcnt(2)).                                            */
s_waitcnt lgkmcnt(2)                               // dscnt(2) = 4 - 1 (bias) - 1 (scaleAlphaVec) (interleaved)
v_pk_mul_f32 v[vgprValuC+12:vgprValuC+12+1], v[92:93], v[vgprValuC+12:vgprValuC+12+1] // *= ScaleAlphaVecVMulPK(92)(0)
v_pk_mul_f32 v[vgprValuC+14:vgprValuC+14+1], v[94:95], v[vgprValuC+14:vgprValuC+14+1] // *= ScaleAlphaVecVMulPK(92)(2)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+12:vgprValuC+12+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+14:vgprValuC+14+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[12:13], v[4:5]
v_mov_b64 v[14:15], v[6:7]
buffer_store_dwordx4 v[12:15], v9, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D

/* element 1 = (0, 1, 0, 0): uses the second bias / scaleAlphaVec pair, so every         */
/* outstanding LDS read must have returned (lgkmcnt(0)). Same row as element 0, stored   */
/* 512 bytes further along using the same v9 offset and the same SRD.                    */
s_waitcnt lgkmcnt(0)                               // dscnt(0) = 4 - 2 (bias) - 2 (scaleAlphaVec) (interleaved)
v_pk_mul_f32 v[vgprValuC+16:vgprValuC+16+1], v[100:101], v[vgprValuC+16:vgprValuC+16+1] // *= ScaleAlphaVecVMulPK(100)(0)
v_pk_mul_f32 v[vgprValuC+18:vgprValuC+18+1], v[102:103], v[vgprValuC+18:vgprValuC+18+1] // *= ScaleAlphaVecVMulPK(100)(2)
v_pk_add_f32 v[4:5], v[96:97], v[vgprValuC+16:vgprValuC+16+1] // C += bias
v_pk_add_f32 v[6:7], v[98:99], v[vgprValuC+18:vgprValuC+18+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[16:17], v[4:5]
v_mov_b64 v[18:19], v[6:7]
buffer_store_dwordx4 v[16:19], v9, s[sgprSrdD:sgprSrdD+3], 0 offen offset:512 // store D
/* Batch elements 2..7 (rows vc1 = 1..3), two elements per row.                          */
/* Each element: scaleAlphaVec multiply, bias add into v[4:7], s_swappc_b64 call to the  */
/* address in s[8:9] (return address in s[64:65]), copy v[4:7] back, 4-dword store.      */
/* The first element of a row (scale v[92:95], bias v[88:91]) also advances the D buffer */
/* descriptor base by StrideD1J * 4 bytes before storing at offset 0; the second         */
/* (scale v[100:103], bias v[96:99]) stores at offset 512 through the advanced base.     */
/* NOTE(review): s74 is recomputed before every row step; whether it could be computed   */
/* once depends on what the s_swappc_b64 callee clobbers, which is not visible here.     */

/* element 2 = (0, 0, 1, 0) */
v_pk_mul_f32 v[vgprValuC+20:vgprValuC+20+1], v[92:93], v[vgprValuC+20:vgprValuC+20+1] // *= ScaleAlphaVecVMulPK(92)(0)
v_pk_mul_f32 v[vgprValuC+22:vgprValuC+22+1], v[94:95], v[vgprValuC+22:vgprValuC+22+1] // *= ScaleAlphaVecVMulPK(92)(2)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+20:vgprValuC+20+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+22:vgprValuC+22+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[20:21], v[4:5]
v_mov_b64 v[22:23], v[6:7]
s_lshl_b32 s74, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[20:23], v9, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
/* element 3 = (0, 1, 1, 0) */
v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], v[100:101], v[vgprValuC+24:vgprValuC+24+1] // *= ScaleAlphaVecVMulPK(100)(0)
v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], v[102:103], v[vgprValuC+26:vgprValuC+26+1] // *= ScaleAlphaVecVMulPK(100)(2)
v_pk_add_f32 v[4:5], v[96:97], v[vgprValuC+24:vgprValuC+24+1] // C += bias
v_pk_add_f32 v[6:7], v[98:99], v[vgprValuC+26:vgprValuC+26+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[24:25], v[4:5]
v_mov_b64 v[26:27], v[6:7]
buffer_store_dwordx4 v[24:27], v9, s[sgprSrdD:sgprSrdD+3], 0 offen offset:512 // store D
/* element 4 = (0, 0, 2, 0) */
v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], v[92:93], v[vgprValuC+28:vgprValuC+28+1] // *= ScaleAlphaVecVMulPK(92)(0)
v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], v[94:95], v[vgprValuC+30:vgprValuC+30+1] // *= ScaleAlphaVecVMulPK(92)(2)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+28:vgprValuC+28+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+30:vgprValuC+30+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[28:29], v[4:5]
v_mov_b64 v[30:31], v[6:7]
s_lshl_b32 s74, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[28:31], v9, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
/* element 5 = (0, 1, 2, 0) */
v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], v[100:101], v[vgprValuC+32:vgprValuC+32+1] // *= ScaleAlphaVecVMulPK(100)(0)
v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], v[102:103], v[vgprValuC+34:vgprValuC+34+1] // *= ScaleAlphaVecVMulPK(100)(2)
v_pk_add_f32 v[4:5], v[96:97], v[vgprValuC+32:vgprValuC+32+1] // C += bias
v_pk_add_f32 v[6:7], v[98:99], v[vgprValuC+34:vgprValuC+34+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[32:33], v[4:5]
v_mov_b64 v[34:35], v[6:7]
buffer_store_dwordx4 v[32:35], v9, s[sgprSrdD:sgprSrdD+3], 0 offen offset:512 // store D
/* element 6 = (0, 0, 3, 0) */
v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], v[92:93], v[vgprValuC+36:vgprValuC+36+1] // *= ScaleAlphaVecVMulPK(92)(0)
v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], v[94:95], v[vgprValuC+38:vgprValuC+38+1] // *= ScaleAlphaVecVMulPK(92)(2)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+36:vgprValuC+36+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+38:vgprValuC+38+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[36:37], v[4:5]
v_mov_b64 v[38:39], v[6:7]
s_lshl_b32 s74, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[36:39], v9, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
/* element 7 = (0, 1, 3, 0) */
v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], v[100:101], v[vgprValuC+40:vgprValuC+40+1] // *= ScaleAlphaVecVMulPK(100)(0)
v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], v[102:103], v[vgprValuC+42:vgprValuC+42+1] // *= ScaleAlphaVecVMulPK(100)(2)
v_pk_add_f32 v[4:5], v[96:97], v[vgprValuC+40:vgprValuC+40+1] // C += bias
v_pk_add_f32 v[6:7], v[98:99], v[vgprValuC+42:vgprValuC+42+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[40:41], v[4:5]
v_mov_b64 v[42:43], v[6:7]
buffer_store_dwordx4 v[40:43], v9, s[sgprSrdD:sgprSrdD+3], 0 offen offset:512 // store D
/* Batch elements 8..13 (rows vc1 = 4..6), two elements per row.                         */
/* Each element: scaleAlphaVec multiply, bias add into v[4:7], s_swappc_b64 call to the  */
/* address in s[8:9] (return address in s[64:65]), copy v[4:7] back, 4-dword store.      */
/* The first element of a row (scale v[92:95], bias v[88:91]) also advances the D buffer */
/* descriptor base by StrideD1J * 4 bytes before storing at offset 0; the second         */
/* (scale v[100:103], bias v[96:99]) stores at offset 512 through the advanced base.     */
/* s_add_u32 / s_addc_u32 form a 64-bit add: the carry travels through SCC, so the two   */
/* instructions must stay adjacent and in this order.                                    */

/* element 8 = (0, 0, 4, 0) */
v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], v[92:93], v[vgprValuC+44:vgprValuC+44+1] // *= ScaleAlphaVecVMulPK(92)(0)
v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], v[94:95], v[vgprValuC+46:vgprValuC+46+1] // *= ScaleAlphaVecVMulPK(92)(2)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+44:vgprValuC+44+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+46:vgprValuC+46+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[44:45], v[4:5]
v_mov_b64 v[46:47], v[6:7]
s_lshl_b32 s74, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[44:47], v9, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
/* element 9 = (0, 1, 4, 0) */
v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], v[100:101], v[vgprValuC+48:vgprValuC+48+1] // *= ScaleAlphaVecVMulPK(100)(0)
v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], v[102:103], v[vgprValuC+50:vgprValuC+50+1] // *= ScaleAlphaVecVMulPK(100)(2)
v_pk_add_f32 v[4:5], v[96:97], v[vgprValuC+48:vgprValuC+48+1] // C += bias
v_pk_add_f32 v[6:7], v[98:99], v[vgprValuC+50:vgprValuC+50+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[48:49], v[4:5]
v_mov_b64 v[50:51], v[6:7]
buffer_store_dwordx4 v[48:51], v9, s[sgprSrdD:sgprSrdD+3], 0 offen offset:512 // store D
/* element 10 = (0, 0, 5, 0) */
v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], v[92:93], v[vgprValuC+52:vgprValuC+52+1] // *= ScaleAlphaVecVMulPK(92)(0)
v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], v[94:95], v[vgprValuC+54:vgprValuC+54+1] // *= ScaleAlphaVecVMulPK(92)(2)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+52:vgprValuC+52+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+54:vgprValuC+54+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[52:53], v[4:5]
v_mov_b64 v[54:55], v[6:7]
s_lshl_b32 s74, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[52:55], v9, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
/* element 11 = (0, 1, 5, 0) */
v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], v[100:101], v[vgprValuC+56:vgprValuC+56+1] // *= ScaleAlphaVecVMulPK(100)(0)
v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], v[102:103], v[vgprValuC+58:vgprValuC+58+1] // *= ScaleAlphaVecVMulPK(100)(2)
v_pk_add_f32 v[4:5], v[96:97], v[vgprValuC+56:vgprValuC+56+1] // C += bias
v_pk_add_f32 v[6:7], v[98:99], v[vgprValuC+58:vgprValuC+58+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[56:57], v[4:5]
v_mov_b64 v[58:59], v[6:7]
buffer_store_dwordx4 v[56:59], v9, s[sgprSrdD:sgprSrdD+3], 0 offen offset:512 // store D
/* element 12 = (0, 0, 6, 0) */
v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], v[92:93], v[vgprValuC+60:vgprValuC+60+1] // *= ScaleAlphaVecVMulPK(92)(0)
v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], v[94:95], v[vgprValuC+62:vgprValuC+62+1] // *= ScaleAlphaVecVMulPK(92)(2)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+60:vgprValuC+60+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+62:vgprValuC+62+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[60:61], v[4:5]
v_mov_b64 v[62:63], v[6:7]
s_lshl_b32 s74, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[60:63], v9, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
/* element 13 = (0, 1, 6, 0) */
v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], v[100:101], v[vgprValuC+64:vgprValuC+64+1] // *= ScaleAlphaVecVMulPK(100)(0)
v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], v[102:103], v[vgprValuC+66:vgprValuC+66+1] // *= ScaleAlphaVecVMulPK(100)(2)
v_pk_add_f32 v[4:5], v[96:97], v[vgprValuC+64:vgprValuC+64+1] // C += bias
v_pk_add_f32 v[6:7], v[98:99], v[vgprValuC+66:vgprValuC+66+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[64:65], v[4:5]
v_mov_b64 v[66:67], v[6:7]
buffer_store_dwordx4 v[64:67], v9, s[sgprSrdD:sgprSrdD+3], 0 offen offset:512 // store D
/* Batch elements 14..18 (rows vc1 = 7..9); row 9 contributes only its first element to  */
/* this batch.                                                                           */
/* Each element: scaleAlphaVec multiply, bias add into v[4:7], s_swappc_b64 call to the  */
/* address in s[8:9] (return address in s[64:65]), copy v[4:7] back, 4-dword store.      */
/* The first element of a row (scale v[92:95], bias v[88:91]) also advances the D buffer */
/* descriptor base by StrideD1J * 4 bytes before storing at offset 0; the second         */
/* (scale v[100:103], bias v[96:99]) stores at offset 512 through the advanced base.     */

/* element 14 = (0, 0, 7, 0) */
v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], v[92:93], v[vgprValuC+68:vgprValuC+68+1] // *= ScaleAlphaVecVMulPK(92)(0)
v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], v[94:95], v[vgprValuC+70:vgprValuC+70+1] // *= ScaleAlphaVecVMulPK(92)(2)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+68:vgprValuC+68+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+70:vgprValuC+70+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[68:69], v[4:5]
v_mov_b64 v[70:71], v[6:7]
s_lshl_b32 s74, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[68:71], v9, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
/* element 15 = (0, 1, 7, 0) */
v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], v[100:101], v[vgprValuC+72:vgprValuC+72+1] // *= ScaleAlphaVecVMulPK(100)(0)
v_pk_mul_f32 v[vgprValuC+74:vgprValuC+74+1], v[102:103], v[vgprValuC+74:vgprValuC+74+1] // *= ScaleAlphaVecVMulPK(100)(2)
v_pk_add_f32 v[4:5], v[96:97], v[vgprValuC+72:vgprValuC+72+1] // C += bias
v_pk_add_f32 v[6:7], v[98:99], v[vgprValuC+74:vgprValuC+74+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[72:73], v[4:5]
v_mov_b64 v[74:75], v[6:7]
buffer_store_dwordx4 v[72:75], v9, s[sgprSrdD:sgprSrdD+3], 0 offen offset:512 // store D
/* element 16 = (0, 0, 8, 0) */
v_pk_mul_f32 v[vgprValuC+76:vgprValuC+76+1], v[92:93], v[vgprValuC+76:vgprValuC+76+1] // *= ScaleAlphaVecVMulPK(92)(0)
v_pk_mul_f32 v[vgprValuC+78:vgprValuC+78+1], v[94:95], v[vgprValuC+78:vgprValuC+78+1] // *= ScaleAlphaVecVMulPK(92)(2)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+76:vgprValuC+76+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+78:vgprValuC+78+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[76:77], v[4:5]
v_mov_b64 v[78:79], v[6:7]
s_lshl_b32 s74, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[76:79], v9, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
/* element 17 = (0, 1, 8, 0) */
v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], v[100:101], v[vgprValuC+80:vgprValuC+80+1] // *= ScaleAlphaVecVMulPK(100)(0)
v_pk_mul_f32 v[vgprValuC+82:vgprValuC+82+1], v[102:103], v[vgprValuC+82:vgprValuC+82+1] // *= ScaleAlphaVecVMulPK(100)(2)
v_pk_add_f32 v[4:5], v[96:97], v[vgprValuC+80:vgprValuC+80+1] // C += bias
v_pk_add_f32 v[6:7], v[98:99], v[vgprValuC+82:vgprValuC+82+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[80:81], v[4:5]
v_mov_b64 v[82:83], v[6:7]
buffer_store_dwordx4 v[80:83], v9, s[sgprSrdD:sgprSrdD+3], 0 offen offset:512 // store D
/* element 18 = (0, 0, 9, 0): last element of this batch */
v_pk_mul_f32 v[vgprValuC+84:vgprValuC+84+1], v[92:93], v[vgprValuC+84:vgprValuC+84+1] // *= ScaleAlphaVecVMulPK(92)(0)
v_pk_mul_f32 v[vgprValuC+86:vgprValuC+86+1], v[94:95], v[vgprValuC+86:vgprValuC+86+1] // *= ScaleAlphaVecVMulPK(92)(2)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+84:vgprValuC+84+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+86:vgprValuC+86+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[84:85], v[4:5]
v_mov_b64 v[86:87], v[6:7]
s_lshl_b32 s74, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[84:87], v9, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
/* Spacer after the final store: the code that follows starts overwriting VGPRs that   */
/* were used as store data in this batch, so the nop must stay directly after it.      */
s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 factorDim=0 */

/******************************************/
/* Global Write Batch #1 (d1,d0,vc1,vc0) = */
/*    (0,1,9,0:vw4); (0,0,10,0:vw4); (0,1,10,0:vw4); (0,0,11,0:vw4); (0,1,11,0:vw4); (0,0,12,0:vw4); (0,1,12,0:vw4); (0,0,13,0:vw4); (0,1,13,0:vw4); (0,0,14,0:vw4); (0,1,14,0:vw4); (0,0,15,0:vw4); (0,1,15,0:vw4); (1,0,0,0:vw4); (1,1,0,0:vw4); (1,0,1,0:vw4); (1,1,1,0:vw4); (1,0,2,0:vw4); (1,1,2,0:vw4) */
/******************************************/

/* calc coords, apply mask, and issue loads (if necessary) */
/* Four 128-bit LDS reads, all addressed through v11, fetch one bias quad and one      */
/* scaleAlpha quad for each of the two d0 positions. Only the first two elements issue */
/* loads; the remaining 17 coordinate comments below have no instructions of their own.*/
/* NOTE(review): this batch starts on a d0=1 element, so v[88:95] now hold the d0=1    */
/* data (offsets 512 / 1536) and v[96:103] the d0=0 data (offsets 0 / 1024).           */
/* No s_waitcnt follows in this section; the four reads are still outstanding when the */
/* accumulator copies below begin.                                                     */
/* (d1,vc1,d0,vc0)=(0,9,1,0) */
ds_read_b128 v[88:91], v11 offset:512              // load Bias
ds_read_b128 v[92:95], v11 offset:1536             // load scaleAlpha
/* (d1,vc1,d0,vc0)=(0,10,0,0) */
ds_read_b128 v[96:99], v11 offset:0                // load Bias
ds_read_b128 v[100:103], v11 offset:1024           // load scaleAlpha
/* (d1,vc1,d0,vc0)=(0,10,1,0) */
/* (d1,vc1,d0,vc0)=(0,11,0,0) */
/* (d1,vc1,d0,vc0)=(0,11,1,0) */
/* (d1,vc1,d0,vc0)=(0,12,0,0) */
/* (d1,vc1,d0,vc0)=(0,12,1,0) */
/* (d1,vc1,d0,vc0)=(0,13,0,0) */
/* (d1,vc1,d0,vc0)=(0,13,1,0) */
/* (d1,vc1,d0,vc0)=(0,14,0,0) */
/* (d1,vc1,d0,vc0)=(0,14,1,0) */
/* (d1,vc1,d0,vc0)=(0,15,0,0) */
/* (d1,vc1,d0,vc0)=(0,15,1,0) */
/* (d1,vc1,d0,vc0)=(1,0,0,0) */
/* (d1,vc1,d0,vc0)=(1,0,1,0) */
/* (d1,vc1,d0,vc0)=(1,1,0,0) */
/* (d1,vc1,d0,vc0)=(1,1,1,0) */
/* (d1,vc1,d0,vc0)=(1,2,0,0) */
/* (d1,vc1,d0,vc0)=(1,2,1,0) */
/* Copy the 76 accumulator values of this write batch into v[vgprValuC+12 .. +87],     */
/* one v_accvgpr_read_b32 per dword, four consecutive destination VGPRs per element.   */
/* Source accumulator indices advance by 4 inside each run:                            */
/*   acc50 .. acc126  -> v[vgprValuC+12 .. +31]                                        */
/*   acc3  .. acc127  -> v[vgprValuC+32 .. +63]                                        */
/*   acc128 .. acc220 -> v[vgprValuC+64 .. +87]                                        */
/* NOTE(review): the vreg[N] number in each trailing comment looks like the            */
/* generator's running value index across batches (it starts at 76 here); it is not a  */
/* hardware register number.                                                           */
v_accvgpr_read_b32 v[vgprValuC+12], acc50          // copy acc to vreg[76]
v_accvgpr_read_b32 v[vgprValuC+13], acc54          // copy acc to vreg[77]
v_accvgpr_read_b32 v[vgprValuC+14], acc58          // copy acc to vreg[78]
v_accvgpr_read_b32 v[vgprValuC+15], acc62          // copy acc to vreg[79]
v_accvgpr_read_b32 v[vgprValuC+16], acc66          // copy acc to vreg[80]
v_accvgpr_read_b32 v[vgprValuC+17], acc70          // copy acc to vreg[81]
v_accvgpr_read_b32 v[vgprValuC+18], acc74          // copy acc to vreg[82]
v_accvgpr_read_b32 v[vgprValuC+19], acc78          // copy acc to vreg[83]
v_accvgpr_read_b32 v[vgprValuC+20], acc82          // copy acc to vreg[84]
v_accvgpr_read_b32 v[vgprValuC+21], acc86          // copy acc to vreg[85]
v_accvgpr_read_b32 v[vgprValuC+22], acc90          // copy acc to vreg[86]
v_accvgpr_read_b32 v[vgprValuC+23], acc94          // copy acc to vreg[87]
v_accvgpr_read_b32 v[vgprValuC+24], acc98          // copy acc to vreg[88]
v_accvgpr_read_b32 v[vgprValuC+25], acc102         // copy acc to vreg[89]
v_accvgpr_read_b32 v[vgprValuC+26], acc106         // copy acc to vreg[90]
v_accvgpr_read_b32 v[vgprValuC+27], acc110         // copy acc to vreg[91]
v_accvgpr_read_b32 v[vgprValuC+28], acc114         // copy acc to vreg[92]
v_accvgpr_read_b32 v[vgprValuC+29], acc118         // copy acc to vreg[93]
v_accvgpr_read_b32 v[vgprValuC+30], acc122         // copy acc to vreg[94]
v_accvgpr_read_b32 v[vgprValuC+31], acc126         // copy acc to vreg[95]
v_accvgpr_read_b32 v[vgprValuC+32], acc3           // copy acc to vreg[96]
v_accvgpr_read_b32 v[vgprValuC+33], acc7           // copy acc to vreg[97]
v_accvgpr_read_b32 v[vgprValuC+34], acc11          // copy acc to vreg[98]
v_accvgpr_read_b32 v[vgprValuC+35], acc15          // copy acc to vreg[99]
v_accvgpr_read_b32 v[vgprValuC+36], acc19          // copy acc to vreg[100]
v_accvgpr_read_b32 v[vgprValuC+37], acc23          // copy acc to vreg[101]
v_accvgpr_read_b32 v[vgprValuC+38], acc27          // copy acc to vreg[102]
v_accvgpr_read_b32 v[vgprValuC+39], acc31          // copy acc to vreg[103]
v_accvgpr_read_b32 v[vgprValuC+40], acc35          // copy acc to vreg[104]
v_accvgpr_read_b32 v[vgprValuC+41], acc39          // copy acc to vreg[105]
v_accvgpr_read_b32 v[vgprValuC+42], acc43          // copy acc to vreg[106]
v_accvgpr_read_b32 v[vgprValuC+43], acc47          // copy acc to vreg[107]
v_accvgpr_read_b32 v[vgprValuC+44], acc51          // copy acc to vreg[108]
v_accvgpr_read_b32 v[vgprValuC+45], acc55          // copy acc to vreg[109]
v_accvgpr_read_b32 v[vgprValuC+46], acc59          // copy acc to vreg[110]
v_accvgpr_read_b32 v[vgprValuC+47], acc63          // copy acc to vreg[111]
v_accvgpr_read_b32 v[vgprValuC+48], acc67          // copy acc to vreg[112]
v_accvgpr_read_b32 v[vgprValuC+49], acc71          // copy acc to vreg[113]
v_accvgpr_read_b32 v[vgprValuC+50], acc75          // copy acc to vreg[114]
v_accvgpr_read_b32 v[vgprValuC+51], acc79          // copy acc to vreg[115]
v_accvgpr_read_b32 v[vgprValuC+52], acc83          // copy acc to vreg[116]
v_accvgpr_read_b32 v[vgprValuC+53], acc87          // copy acc to vreg[117]
v_accvgpr_read_b32 v[vgprValuC+54], acc91          // copy acc to vreg[118]
v_accvgpr_read_b32 v[vgprValuC+55], acc95          // copy acc to vreg[119]
v_accvgpr_read_b32 v[vgprValuC+56], acc99          // copy acc to vreg[120]
v_accvgpr_read_b32 v[vgprValuC+57], acc103         // copy acc to vreg[121]
v_accvgpr_read_b32 v[vgprValuC+58], acc107         // copy acc to vreg[122]
v_accvgpr_read_b32 v[vgprValuC+59], acc111         // copy acc to vreg[123]
v_accvgpr_read_b32 v[vgprValuC+60], acc115         // copy acc to vreg[124]
v_accvgpr_read_b32 v[vgprValuC+61], acc119         // copy acc to vreg[125]
v_accvgpr_read_b32 v[vgprValuC+62], acc123         // copy acc to vreg[126]
v_accvgpr_read_b32 v[vgprValuC+63], acc127         // copy acc to vreg[127]
v_accvgpr_read_b32 v[vgprValuC+64], acc128         // copy acc to vreg[128]
v_accvgpr_read_b32 v[vgprValuC+65], acc132         // copy acc to vreg[129]
v_accvgpr_read_b32 v[vgprValuC+66], acc136         // copy acc to vreg[130]
v_accvgpr_read_b32 v[vgprValuC+67], acc140         // copy acc to vreg[131]
v_accvgpr_read_b32 v[vgprValuC+68], acc144         // copy acc to vreg[132]
v_accvgpr_read_b32 v[vgprValuC+69], acc148         // copy acc to vreg[133]
v_accvgpr_read_b32 v[vgprValuC+70], acc152         // copy acc to vreg[134]
v_accvgpr_read_b32 v[vgprValuC+71], acc156         // copy acc to vreg[135]
v_accvgpr_read_b32 v[vgprValuC+72], acc160         // copy acc to vreg[136]
v_accvgpr_read_b32 v[vgprValuC+73], acc164         // copy acc to vreg[137]
v_accvgpr_read_b32 v[vgprValuC+74], acc168         // copy acc to vreg[138]
v_accvgpr_read_b32 v[vgprValuC+75], acc172         // copy acc to vreg[139]
v_accvgpr_read_b32 v[vgprValuC+76], acc176         // copy acc to vreg[140]
v_accvgpr_read_b32 v[vgprValuC+77], acc180         // copy acc to vreg[141]
v_accvgpr_read_b32 v[vgprValuC+78], acc184         // copy acc to vreg[142]
v_accvgpr_read_b32 v[vgprValuC+79], acc188         // copy acc to vreg[143]
v_accvgpr_read_b32 v[vgprValuC+80], acc192         // copy acc to vreg[144]
v_accvgpr_read_b32 v[vgprValuC+81], acc196         // copy acc to vreg[145]
v_accvgpr_read_b32 v[vgprValuC+82], acc200         // copy acc to vreg[146]
v_accvgpr_read_b32 v[vgprValuC+83], acc204         // copy acc to vreg[147]
v_accvgpr_read_b32 v[vgprValuC+84], acc208         // copy acc to vreg[148]
v_accvgpr_read_b32 v[vgprValuC+85], acc212         // copy acc to vreg[149]
v_accvgpr_read_b32 v[vgprValuC+86], acc216         // copy acc to vreg[150]
v_accvgpr_read_b32 v[vgprValuC+87], acc220         // copy acc to vreg[151]

/* rC *= alpha batchElements=[(0, 1, 9, 0), (0, 0, 10, 0), (0, 1, 10, 0), (0, 0, 11, 0), (0, 1, 11, 0), (0, 0, 12, 0), (0, 1, 12, 0), (0, 0, 13, 0), (0, 1, 13, 0), (0, 0, 14, 0), (0, 1, 14, 0), (0, 0, 15, 0), (0, 1, 15, 0), (1, 0, 0, 0), (1, 1, 0, 0), (1, 0, 1, 0), (1, 1, 1, 0), (1, 0, 2, 0), (1, 1, 2, 0)] */
v_pk_mul_f32 v[vgprValuC+12:vgprValuC+12+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+12:vgprValuC+12+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+14:vgprValuC+14+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+14:vgprValuC+14+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+16:vgprValuC+16+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+16:vgprValuC+16+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+18:vgprValuC+18+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+18:vgprValuC+18+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+20:vgprValuC+20+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+20:vgprValuC+20+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+22:vgprValuC+22+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+22:vgprValuC+22+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+64:vgprValuC+64+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+68:vgprValuC+68+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+70:vgprValuC+70+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+72:vgprValuC+72+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+74:vgprValuC+74+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+74:vgprValuC+74+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+76:vgprValuC+76+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+76:vgprValuC+76+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+78:vgprValuC+78+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+78:vgprValuC+78+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+80:vgprValuC+80+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+82:vgprValuC+82+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+82:vgprValuC+82+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+84:vgprValuC+84+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+84:vgprValuC+84+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+86:vgprValuC+86+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+86:vgprValuC+86+1] op_sel_hi:[0,1,1] // *= alpha (pk)

/* apply mask, calc new C and issue writes */
/*
 * Per-element epilogue. The same sequence repeats for every group of four ValuC registers:
 *   1. v_pk_mul_f32 x2      : ValuC[n:n+3] *= scaleAlphaVec, taken from v[92:95] or v[100:103]
 *   2. v_pk_add_f32 x2      : v[4:7] = bias (v[88:91] or v[96:99]) + ValuC[n:n+3]
 *   3. s_swappc_b64         : jump to the address held in s[8:9]; the return address is saved in s[64:65]
 *                             (presumably the activation routine operating on v[4:7] -- confirm against its definition)
 *   4. v_mov_b64 x2         : copy v[4:7] into the registers named by the store
 *                             (the store uses literal v[n:n+3]; this assumes vgprValuC == 0 -- confirm against the symbol definition)
 *   5. buffer_store_dwordx4 : write four dwords through the D SRD at offset:512 or offset:0
 * Elements alternate between the (v[88:91], v[92:95], offset:512) set and the
 * (v[96:99], v[100:103], offset:0) set. Before every offset:0 store the SRD base is advanced by
 * StrideD1J << 2 bytes with carry into the upper dword, i.e. one row of D.
 * The two s_waitcnt instructions gate the first use of v[88:95] and of v[96:103] respectively;
 * the LDS reads that fill those registers are presumably issued earlier in this batch.
 */

s_waitcnt lgkmcnt(2)                               // dscnt(2) = 4 - 1 (bias) - 1 (scaleAlphaVec) (interleaved)
v_pk_mul_f32 v[vgprValuC+12:vgprValuC+12+1], v[92:93], v[vgprValuC+12:vgprValuC+12+1] // *= ScaleAlphaVecVMulPK(92)(0)
v_pk_mul_f32 v[vgprValuC+14:vgprValuC+14+1], v[94:95], v[vgprValuC+14:vgprValuC+14+1] // *= ScaleAlphaVecVMulPK(92)(2)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+12:vgprValuC+12+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+14:vgprValuC+14+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]                      // call through s[8:9]; return PC is saved in s[64:65]
v_mov_b64 v[12:13], v[4:5]                         // copy result pair 0 into the store data registers
v_mov_b64 v[14:15], v[6:7]                         // copy result pair 1 into the store data registers
buffer_store_dwordx4 v[12:15], v9, s[sgprSrdD:sgprSrdD+3], 0 offen offset:512 // store D

s_waitcnt lgkmcnt(0)                               // dscnt(0) = 4 - 2 (bias) - 2 (scaleAlphaVec) (interleaved)
v_pk_mul_f32 v[vgprValuC+16:vgprValuC+16+1], v[100:101], v[vgprValuC+16:vgprValuC+16+1] // *= ScaleAlphaVecVMulPK(100)(0)
v_pk_mul_f32 v[vgprValuC+18:vgprValuC+18+1], v[102:103], v[vgprValuC+18:vgprValuC+18+1] // *= ScaleAlphaVecVMulPK(100)(2)
v_pk_add_f32 v[4:5], v[96:97], v[vgprValuC+16:vgprValuC+16+1] // C += bias
v_pk_add_f32 v[6:7], v[98:99], v[vgprValuC+18:vgprValuC+18+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[16:17], v[4:5]
v_mov_b64 v[18:19], v[6:7]
s_lshl_b32 s74, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[16:19], v9, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_pk_mul_f32 v[vgprValuC+20:vgprValuC+20+1], v[92:93], v[vgprValuC+20:vgprValuC+20+1] // *= ScaleAlphaVecVMulPK(92)(0)
v_pk_mul_f32 v[vgprValuC+22:vgprValuC+22+1], v[94:95], v[vgprValuC+22:vgprValuC+22+1] // *= ScaleAlphaVecVMulPK(92)(2)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+20:vgprValuC+20+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+22:vgprValuC+22+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[20:21], v[4:5]
v_mov_b64 v[22:23], v[6:7]
buffer_store_dwordx4 v[20:23], v9, s[sgprSrdD:sgprSrdD+3], 0 offen offset:512 // store D
v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], v[100:101], v[vgprValuC+24:vgprValuC+24+1] // *= ScaleAlphaVecVMulPK(100)(0)
v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], v[102:103], v[vgprValuC+26:vgprValuC+26+1] // *= ScaleAlphaVecVMulPK(100)(2)
v_pk_add_f32 v[4:5], v[96:97], v[vgprValuC+24:vgprValuC+24+1] // C += bias
v_pk_add_f32 v[6:7], v[98:99], v[vgprValuC+26:vgprValuC+26+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[24:25], v[4:5]
v_mov_b64 v[26:27], v[6:7]
s_lshl_b32 s74, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[24:27], v9, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], v[92:93], v[vgprValuC+28:vgprValuC+28+1] // *= ScaleAlphaVecVMulPK(92)(0)
v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], v[94:95], v[vgprValuC+30:vgprValuC+30+1] // *= ScaleAlphaVecVMulPK(92)(2)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+28:vgprValuC+28+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+30:vgprValuC+30+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[28:29], v[4:5]
v_mov_b64 v[30:31], v[6:7]
buffer_store_dwordx4 v[28:31], v9, s[sgprSrdD:sgprSrdD+3], 0 offen offset:512 // store D
v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], v[100:101], v[vgprValuC+32:vgprValuC+32+1] // *= ScaleAlphaVecVMulPK(100)(0)
v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], v[102:103], v[vgprValuC+34:vgprValuC+34+1] // *= ScaleAlphaVecVMulPK(100)(2)
v_pk_add_f32 v[4:5], v[96:97], v[vgprValuC+32:vgprValuC+32+1] // C += bias
v_pk_add_f32 v[6:7], v[98:99], v[vgprValuC+34:vgprValuC+34+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[32:33], v[4:5]
v_mov_b64 v[34:35], v[6:7]
s_lshl_b32 s74, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[32:35], v9, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], v[92:93], v[vgprValuC+36:vgprValuC+36+1] // *= ScaleAlphaVecVMulPK(92)(0)
v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], v[94:95], v[vgprValuC+38:vgprValuC+38+1] // *= ScaleAlphaVecVMulPK(92)(2)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+36:vgprValuC+36+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+38:vgprValuC+38+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[36:37], v[4:5]
v_mov_b64 v[38:39], v[6:7]
buffer_store_dwordx4 v[36:39], v9, s[sgprSrdD:sgprSrdD+3], 0 offen offset:512 // store D
v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], v[100:101], v[vgprValuC+40:vgprValuC+40+1] // *= ScaleAlphaVecVMulPK(100)(0)
v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], v[102:103], v[vgprValuC+42:vgprValuC+42+1] // *= ScaleAlphaVecVMulPK(100)(2)
v_pk_add_f32 v[4:5], v[96:97], v[vgprValuC+40:vgprValuC+40+1] // C += bias
v_pk_add_f32 v[6:7], v[98:99], v[vgprValuC+42:vgprValuC+42+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[40:41], v[4:5]
v_mov_b64 v[42:43], v[6:7]
s_lshl_b32 s74, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[40:43], v9, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], v[92:93], v[vgprValuC+44:vgprValuC+44+1] // *= ScaleAlphaVecVMulPK(92)(0)
v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], v[94:95], v[vgprValuC+46:vgprValuC+46+1] // *= ScaleAlphaVecVMulPK(92)(2)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+44:vgprValuC+44+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+46:vgprValuC+46+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[44:45], v[4:5]
v_mov_b64 v[46:47], v[6:7]
buffer_store_dwordx4 v[44:47], v9, s[sgprSrdD:sgprSrdD+3], 0 offen offset:512 // store D
v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], v[100:101], v[vgprValuC+48:vgprValuC+48+1] // *= ScaleAlphaVecVMulPK(100)(0)
v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], v[102:103], v[vgprValuC+50:vgprValuC+50+1] // *= ScaleAlphaVecVMulPK(100)(2)
v_pk_add_f32 v[4:5], v[96:97], v[vgprValuC+48:vgprValuC+48+1] // C += bias
v_pk_add_f32 v[6:7], v[98:99], v[vgprValuC+50:vgprValuC+50+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[48:49], v[4:5]
v_mov_b64 v[50:51], v[6:7]
s_lshl_b32 s74, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[48:51], v9, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], v[92:93], v[vgprValuC+52:vgprValuC+52+1] // *= ScaleAlphaVecVMulPK(92)(0)
v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], v[94:95], v[vgprValuC+54:vgprValuC+54+1] // *= ScaleAlphaVecVMulPK(92)(2)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+52:vgprValuC+52+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+54:vgprValuC+54+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[52:53], v[4:5]
v_mov_b64 v[54:55], v[6:7]
buffer_store_dwordx4 v[52:55], v9, s[sgprSrdD:sgprSrdD+3], 0 offen offset:512 // store D
v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], v[100:101], v[vgprValuC+56:vgprValuC+56+1] // *= ScaleAlphaVecVMulPK(100)(0)
v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], v[102:103], v[vgprValuC+58:vgprValuC+58+1] // *= ScaleAlphaVecVMulPK(100)(2)
v_pk_add_f32 v[4:5], v[96:97], v[vgprValuC+56:vgprValuC+56+1] // C += bias
v_pk_add_f32 v[6:7], v[98:99], v[vgprValuC+58:vgprValuC+58+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[56:57], v[4:5]
v_mov_b64 v[58:59], v[6:7]
s_lshl_b32 s74, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[56:59], v9, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], v[92:93], v[vgprValuC+60:vgprValuC+60+1] // *= ScaleAlphaVecVMulPK(92)(0)
v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], v[94:95], v[vgprValuC+62:vgprValuC+62+1] // *= ScaleAlphaVecVMulPK(92)(2)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+60:vgprValuC+60+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+62:vgprValuC+62+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[60:61], v[4:5]
v_mov_b64 v[62:63], v[6:7]
buffer_store_dwordx4 v[60:63], v9, s[sgprSrdD:sgprSrdD+3], 0 offen offset:512 // store D
/*
 * Elements ValuC+64 .. ValuC+87. Each element scales by scaleAlphaVec, adds bias into v[4:7],
 * calls through s[8:9] (return address saved in s[64:65]), copies v[4:7] into the store data
 * registers, and issues one buffer_store_dwordx4 through the D SRD.
 * The first SRD advance in this run differs from the others: it adds StrideD1J * 452 bytes,
 * which the generator comment gives as 113 rows * 4 bytes, instead of a single row (StrideD1J << 2).
 * NOTE(review): this looks like the step from the last row of one d1 tile to the first row of
 * the next -- confirm against this batch's element list.
 */
v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], v[100:101], v[vgprValuC+64:vgprValuC+64+1] // *= ScaleAlphaVecVMulPK(100)(0)
v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], v[102:103], v[vgprValuC+66:vgprValuC+66+1] // *= ScaleAlphaVecVMulPK(100)(2)
v_pk_add_f32 v[4:5], v[96:97], v[vgprValuC+64:vgprValuC+64+1] // C += bias
v_pk_add_f32 v[6:7], v[98:99], v[vgprValuC+66:vgprValuC+66+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[64:65], v[4:5]
v_mov_b64 v[66:67], v[6:7]
s_mul_i32 s74, s[sgprStrideD1J], 452               // scale StrideD *= numRows(113) * bpe
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[64:67], v9, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], v[92:93], v[vgprValuC+68:vgprValuC+68+1] // *= ScaleAlphaVecVMulPK(92)(0)
v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], v[94:95], v[vgprValuC+70:vgprValuC+70+1] // *= ScaleAlphaVecVMulPK(92)(2)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+68:vgprValuC+68+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+70:vgprValuC+70+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[68:69], v[4:5]
v_mov_b64 v[70:71], v[6:7]
buffer_store_dwordx4 v[68:71], v9, s[sgprSrdD:sgprSrdD+3], 0 offen offset:512 // store D
v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], v[100:101], v[vgprValuC+72:vgprValuC+72+1] // *= ScaleAlphaVecVMulPK(100)(0)
v_pk_mul_f32 v[vgprValuC+74:vgprValuC+74+1], v[102:103], v[vgprValuC+74:vgprValuC+74+1] // *= ScaleAlphaVecVMulPK(100)(2)
v_pk_add_f32 v[4:5], v[96:97], v[vgprValuC+72:vgprValuC+72+1] // C += bias
v_pk_add_f32 v[6:7], v[98:99], v[vgprValuC+74:vgprValuC+74+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[72:73], v[4:5]
v_mov_b64 v[74:75], v[6:7]
s_lshl_b32 s74, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[72:75], v9, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_pk_mul_f32 v[vgprValuC+76:vgprValuC+76+1], v[92:93], v[vgprValuC+76:vgprValuC+76+1] // *= ScaleAlphaVecVMulPK(92)(0)
v_pk_mul_f32 v[vgprValuC+78:vgprValuC+78+1], v[94:95], v[vgprValuC+78:vgprValuC+78+1] // *= ScaleAlphaVecVMulPK(92)(2)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+76:vgprValuC+76+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+78:vgprValuC+78+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[76:77], v[4:5]
v_mov_b64 v[78:79], v[6:7]
buffer_store_dwordx4 v[76:79], v9, s[sgprSrdD:sgprSrdD+3], 0 offen offset:512 // store D
v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], v[100:101], v[vgprValuC+80:vgprValuC+80+1] // *= ScaleAlphaVecVMulPK(100)(0)
v_pk_mul_f32 v[vgprValuC+82:vgprValuC+82+1], v[102:103], v[vgprValuC+82:vgprValuC+82+1] // *= ScaleAlphaVecVMulPK(100)(2)
v_pk_add_f32 v[4:5], v[96:97], v[vgprValuC+80:vgprValuC+80+1] // C += bias
v_pk_add_f32 v[6:7], v[98:99], v[vgprValuC+82:vgprValuC+82+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[80:81], v[4:5]
v_mov_b64 v[82:83], v[6:7]
s_lshl_b32 s74, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[80:83], v9, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_pk_mul_f32 v[vgprValuC+84:vgprValuC+84+1], v[92:93], v[vgprValuC+84:vgprValuC+84+1] // *= ScaleAlphaVecVMulPK(92)(0)
v_pk_mul_f32 v[vgprValuC+86:vgprValuC+86+1], v[94:95], v[vgprValuC+86:vgprValuC+86+1] // *= ScaleAlphaVecVMulPK(92)(2)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+84:vgprValuC+84+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+86:vgprValuC+86+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[84:85], v[4:5]
v_mov_b64 v[86:87], v[6:7]
buffer_store_dwordx4 v[84:87], v9, s[sgprSrdD:sgprSrdD+3], 0 offen offset:512 // store D
/* Last store of this batch: the s_nop below is the generator's store-data hazard padding before the registers are reused. */
s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 factorDim=0 */

/******************************************/
/* Global Write Batch #2 (d1,d0,vc1,vc0) = */
/*    (1,0,3,0:vw4); (1,1,3,0:vw4); (1,0,4,0:vw4); (1,1,4,0:vw4); (1,0,5,0:vw4); (1,1,5,0:vw4); (1,0,6,0:vw4); (1,1,6,0:vw4); (1,0,7,0:vw4); (1,1,7,0:vw4); (1,0,8,0:vw4); (1,1,8,0:vw4); (1,0,9,0:vw4); (1,1,9,0:vw4); (1,0,10,0:vw4); (1,1,10,0:vw4); (1,0,11,0:vw4); (1,1,11,0:vw4); (1,0,12,0:vw4) */
/******************************************/

/* calc coords, apply mask, and issue loads (if necessary) */
/*
 * Four LDS reads, all addressed by v11, fetch the bias and scaleAlpha vectors for this batch:
 *   v[88:91]   = bias        at offset 0      v[92:95]   = scaleAlpha at offset 1024
 *   v[96:99]   = bias        at offset 512    v[100:103] = scaleAlpha at offset 1536
 * Only the first two elements issue loads. The remaining elements listed below have none, and the
 * write phase of this batch reuses these two register sets alternately.
 * NOTE(review): presumably bias/scaleAlpha depend only on the d0 coordinate, which takes just the
 * values 0 and 1 in this batch -- confirm against the generator.
 */
/* (d1,vc1,d0,vc0)=(1,3,0,0) */
ds_read_b128 v[88:91], v11 offset:0                // load Bias
ds_read_b128 v[92:95], v11 offset:1024             // load scaleAlpha
/* (d1,vc1,d0,vc0)=(1,3,1,0) */
ds_read_b128 v[96:99], v11 offset:512              // load Bias
ds_read_b128 v[100:103], v11 offset:1536           // load scaleAlpha
/* (d1,vc1,d0,vc0)=(1,4,0,0) */
/* (d1,vc1,d0,vc0)=(1,4,1,0) */
/* (d1,vc1,d0,vc0)=(1,5,0,0) */
/* (d1,vc1,d0,vc0)=(1,5,1,0) */
/* (d1,vc1,d0,vc0)=(1,6,0,0) */
/* (d1,vc1,d0,vc0)=(1,6,1,0) */
/* (d1,vc1,d0,vc0)=(1,7,0,0) */
/* (d1,vc1,d0,vc0)=(1,7,1,0) */
/* (d1,vc1,d0,vc0)=(1,8,0,0) */
/* (d1,vc1,d0,vc0)=(1,8,1,0) */
/* (d1,vc1,d0,vc0)=(1,9,0,0) */
/* (d1,vc1,d0,vc0)=(1,9,1,0) */
/* (d1,vc1,d0,vc0)=(1,10,0,0) */
/* (d1,vc1,d0,vc0)=(1,10,1,0) */
/* (d1,vc1,d0,vc0)=(1,11,0,0) */
/* (d1,vc1,d0,vc0)=(1,11,1,0) */
/* (d1,vc1,d0,vc0)=(1,12,0,0) */
/*
 * Copy 76 accumulator values into v[vgprValuC+12 .. vgprValuC+87], four consecutive destination
 * registers per vw4 element (19 elements in this batch).
 * Source accumulator indices advance by 4 within each run:
 *   acc224..acc252 (8 regs), acc129..acc253 (32 regs), acc130..acc254 (32 regs), acc131..acc143 (4 regs).
 * The vreg[n] number in each trailing comment is the generator's running element index (152..227).
 */
v_accvgpr_read_b32 v[vgprValuC+12], acc224         // copy acc to vreg[152]
v_accvgpr_read_b32 v[vgprValuC+13], acc228         // copy acc to vreg[153]
v_accvgpr_read_b32 v[vgprValuC+14], acc232         // copy acc to vreg[154]
v_accvgpr_read_b32 v[vgprValuC+15], acc236         // copy acc to vreg[155]
v_accvgpr_read_b32 v[vgprValuC+16], acc240         // copy acc to vreg[156]
v_accvgpr_read_b32 v[vgprValuC+17], acc244         // copy acc to vreg[157]
v_accvgpr_read_b32 v[vgprValuC+18], acc248         // copy acc to vreg[158]
v_accvgpr_read_b32 v[vgprValuC+19], acc252         // copy acc to vreg[159]
v_accvgpr_read_b32 v[vgprValuC+20], acc129         // copy acc to vreg[160]
v_accvgpr_read_b32 v[vgprValuC+21], acc133         // copy acc to vreg[161]
v_accvgpr_read_b32 v[vgprValuC+22], acc137         // copy acc to vreg[162]
v_accvgpr_read_b32 v[vgprValuC+23], acc141         // copy acc to vreg[163]
v_accvgpr_read_b32 v[vgprValuC+24], acc145         // copy acc to vreg[164]
v_accvgpr_read_b32 v[vgprValuC+25], acc149         // copy acc to vreg[165]
v_accvgpr_read_b32 v[vgprValuC+26], acc153         // copy acc to vreg[166]
v_accvgpr_read_b32 v[vgprValuC+27], acc157         // copy acc to vreg[167]
v_accvgpr_read_b32 v[vgprValuC+28], acc161         // copy acc to vreg[168]
v_accvgpr_read_b32 v[vgprValuC+29], acc165         // copy acc to vreg[169]
v_accvgpr_read_b32 v[vgprValuC+30], acc169         // copy acc to vreg[170]
v_accvgpr_read_b32 v[vgprValuC+31], acc173         // copy acc to vreg[171]
v_accvgpr_read_b32 v[vgprValuC+32], acc177         // copy acc to vreg[172]
v_accvgpr_read_b32 v[vgprValuC+33], acc181         // copy acc to vreg[173]
v_accvgpr_read_b32 v[vgprValuC+34], acc185         // copy acc to vreg[174]
v_accvgpr_read_b32 v[vgprValuC+35], acc189         // copy acc to vreg[175]
v_accvgpr_read_b32 v[vgprValuC+36], acc193         // copy acc to vreg[176]
v_accvgpr_read_b32 v[vgprValuC+37], acc197         // copy acc to vreg[177]
v_accvgpr_read_b32 v[vgprValuC+38], acc201         // copy acc to vreg[178]
v_accvgpr_read_b32 v[vgprValuC+39], acc205         // copy acc to vreg[179]
v_accvgpr_read_b32 v[vgprValuC+40], acc209         // copy acc to vreg[180]
v_accvgpr_read_b32 v[vgprValuC+41], acc213         // copy acc to vreg[181]
v_accvgpr_read_b32 v[vgprValuC+42], acc217         // copy acc to vreg[182]
v_accvgpr_read_b32 v[vgprValuC+43], acc221         // copy acc to vreg[183]
v_accvgpr_read_b32 v[vgprValuC+44], acc225         // copy acc to vreg[184]
v_accvgpr_read_b32 v[vgprValuC+45], acc229         // copy acc to vreg[185]
v_accvgpr_read_b32 v[vgprValuC+46], acc233         // copy acc to vreg[186]
v_accvgpr_read_b32 v[vgprValuC+47], acc237         // copy acc to vreg[187]
v_accvgpr_read_b32 v[vgprValuC+48], acc241         // copy acc to vreg[188]
v_accvgpr_read_b32 v[vgprValuC+49], acc245         // copy acc to vreg[189]
v_accvgpr_read_b32 v[vgprValuC+50], acc249         // copy acc to vreg[190]
v_accvgpr_read_b32 v[vgprValuC+51], acc253         // copy acc to vreg[191]
v_accvgpr_read_b32 v[vgprValuC+52], acc130         // copy acc to vreg[192]
v_accvgpr_read_b32 v[vgprValuC+53], acc134         // copy acc to vreg[193]
v_accvgpr_read_b32 v[vgprValuC+54], acc138         // copy acc to vreg[194]
v_accvgpr_read_b32 v[vgprValuC+55], acc142         // copy acc to vreg[195]
v_accvgpr_read_b32 v[vgprValuC+56], acc146         // copy acc to vreg[196]
v_accvgpr_read_b32 v[vgprValuC+57], acc150         // copy acc to vreg[197]
v_accvgpr_read_b32 v[vgprValuC+58], acc154         // copy acc to vreg[198]
v_accvgpr_read_b32 v[vgprValuC+59], acc158         // copy acc to vreg[199]
v_accvgpr_read_b32 v[vgprValuC+60], acc162         // copy acc to vreg[200]
v_accvgpr_read_b32 v[vgprValuC+61], acc166         // copy acc to vreg[201]
v_accvgpr_read_b32 v[vgprValuC+62], acc170         // copy acc to vreg[202]
v_accvgpr_read_b32 v[vgprValuC+63], acc174         // copy acc to vreg[203]
v_accvgpr_read_b32 v[vgprValuC+64], acc178         // copy acc to vreg[204]
v_accvgpr_read_b32 v[vgprValuC+65], acc182         // copy acc to vreg[205]
v_accvgpr_read_b32 v[vgprValuC+66], acc186         // copy acc to vreg[206]
v_accvgpr_read_b32 v[vgprValuC+67], acc190         // copy acc to vreg[207]
v_accvgpr_read_b32 v[vgprValuC+68], acc194         // copy acc to vreg[208]
v_accvgpr_read_b32 v[vgprValuC+69], acc198         // copy acc to vreg[209]
v_accvgpr_read_b32 v[vgprValuC+70], acc202         // copy acc to vreg[210]
v_accvgpr_read_b32 v[vgprValuC+71], acc206         // copy acc to vreg[211]
v_accvgpr_read_b32 v[vgprValuC+72], acc210         // copy acc to vreg[212]
v_accvgpr_read_b32 v[vgprValuC+73], acc214         // copy acc to vreg[213]
v_accvgpr_read_b32 v[vgprValuC+74], acc218         // copy acc to vreg[214]
v_accvgpr_read_b32 v[vgprValuC+75], acc222         // copy acc to vreg[215]
v_accvgpr_read_b32 v[vgprValuC+76], acc226         // copy acc to vreg[216]
v_accvgpr_read_b32 v[vgprValuC+77], acc230         // copy acc to vreg[217]
v_accvgpr_read_b32 v[vgprValuC+78], acc234         // copy acc to vreg[218]
v_accvgpr_read_b32 v[vgprValuC+79], acc238         // copy acc to vreg[219]
v_accvgpr_read_b32 v[vgprValuC+80], acc242         // copy acc to vreg[220]
v_accvgpr_read_b32 v[vgprValuC+81], acc246         // copy acc to vreg[221]
v_accvgpr_read_b32 v[vgprValuC+82], acc250         // copy acc to vreg[222]
v_accvgpr_read_b32 v[vgprValuC+83], acc254         // copy acc to vreg[223]
v_accvgpr_read_b32 v[vgprValuC+84], acc131         // copy acc to vreg[224]
v_accvgpr_read_b32 v[vgprValuC+85], acc135         // copy acc to vreg[225]
v_accvgpr_read_b32 v[vgprValuC+86], acc139         // copy acc to vreg[226]
v_accvgpr_read_b32 v[vgprValuC+87], acc143         // copy acc to vreg[227]

/* rC *= alpha batchElements=[(1, 0, 3, 0), (1, 1, 3, 0), (1, 0, 4, 0), (1, 1, 4, 0), (1, 0, 5, 0), (1, 1, 5, 0), (1, 0, 6, 0), (1, 1, 6, 0), (1, 0, 7, 0), (1, 1, 7, 0), (1, 0, 8, 0), (1, 1, 8, 0), (1, 0, 9, 0), (1, 1, 9, 0), (1, 0, 10, 0), (1, 1, 10, 0), (1, 0, 11, 0), (1, 1, 11, 0), (1, 0, 12, 0)] */
/*
 * In-place packed multiply of v[vgprValuC+12 .. vgprValuC+87] by alpha, two floats per instruction
 * (38 instructions for 76 values).
 * op_sel_hi:[0,1,1] clears the high-lane select for src0, so both packed lanes take the low dword
 * of the SGPR pair (s[sgprAlpha]); the ValuC source keeps the default low/high lane selection.
 */
v_pk_mul_f32 v[vgprValuC+12:vgprValuC+12+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+12:vgprValuC+12+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+14:vgprValuC+14+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+14:vgprValuC+14+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+16:vgprValuC+16+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+16:vgprValuC+16+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+18:vgprValuC+18+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+18:vgprValuC+18+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+20:vgprValuC+20+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+20:vgprValuC+20+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+22:vgprValuC+22+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+22:vgprValuC+22+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+64:vgprValuC+64+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+68:vgprValuC+68+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+70:vgprValuC+70+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+72:vgprValuC+72+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+74:vgprValuC+74+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+74:vgprValuC+74+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+76:vgprValuC+76+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+76:vgprValuC+76+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+78:vgprValuC+78+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+78:vgprValuC+78+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+80:vgprValuC+80+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+82:vgprValuC+82+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+82:vgprValuC+82+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+84:vgprValuC+84+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+84:vgprValuC+84+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+86:vgprValuC+86+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+86:vgprValuC+86+1] op_sel_hi:[0,1,1] // *= alpha (pk)

/* apply mask, calc new C and issue writes */
/*
 * Per-element epilogue for this batch; the same sequence repeats for every four ValuC registers:
 *   1. v_pk_mul_f32 x2      : ValuC[n:n+3] *= scaleAlphaVec, taken from v[92:95] or v[100:103]
 *   2. v_pk_add_f32 x2      : v[4:7] = bias (v[88:91] or v[96:99]) + ValuC[n:n+3]
 *   3. s_swappc_b64         : jump to the address held in s[8:9]; the return address is saved in s[64:65]
 *                             (presumably the activation routine operating on v[4:7] -- confirm against its definition)
 *   4. v_mov_b64 x2         : copy v[4:7] into the registers named by the store
 *                             (the store uses literal v[n:n+3]; this assumes vgprValuC == 0 -- confirm against the symbol definition)
 *   5. buffer_store_dwordx4 : write four dwords through the D SRD at offset:0 or offset:512
 * In this batch the v[88:95] elements advance the SRD base by StrideD1J << 2 bytes (one row) and
 * store at offset:0; the v[96:103] elements store at offset:512 without moving the SRD.
 * lgkmcnt(2) lets the first two of this batch's four ds_read_b128 results (v[88:95]) be consumed;
 * lgkmcnt(0) covers the remaining two (v[96:103]).
 */

s_waitcnt lgkmcnt(2)                               // dscnt(2) = 4 - 1 (bias) - 1 (scaleAlphaVec) (interleaved)
v_pk_mul_f32 v[vgprValuC+12:vgprValuC+12+1], v[92:93], v[vgprValuC+12:vgprValuC+12+1] // *= ScaleAlphaVecVMulPK(92)(0)
v_pk_mul_f32 v[vgprValuC+14:vgprValuC+14+1], v[94:95], v[vgprValuC+14:vgprValuC+14+1] // *= ScaleAlphaVecVMulPK(92)(2)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+12:vgprValuC+12+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+14:vgprValuC+14+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]                      // call through s[8:9]; return PC is saved in s[64:65]
v_mov_b64 v[12:13], v[4:5]                         // copy result pair 0 into the store data registers
v_mov_b64 v[14:15], v[6:7]                         // copy result pair 1 into the store data registers
s_lshl_b32 s74, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[12:15], v9, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D

s_waitcnt lgkmcnt(0)                               // dscnt(0) = 4 - 2 (bias) - 2 (scaleAlphaVec) (interleaved)
v_pk_mul_f32 v[vgprValuC+16:vgprValuC+16+1], v[100:101], v[vgprValuC+16:vgprValuC+16+1] // *= ScaleAlphaVecVMulPK(100)(0)
v_pk_mul_f32 v[vgprValuC+18:vgprValuC+18+1], v[102:103], v[vgprValuC+18:vgprValuC+18+1] // *= ScaleAlphaVecVMulPK(100)(2)
v_pk_add_f32 v[4:5], v[96:97], v[vgprValuC+16:vgprValuC+16+1] // C += bias
v_pk_add_f32 v[6:7], v[98:99], v[vgprValuC+18:vgprValuC+18+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[16:17], v[4:5]
v_mov_b64 v[18:19], v[6:7]
buffer_store_dwordx4 v[16:19], v9, s[sgprSrdD:sgprSrdD+3], 0 offen offset:512 // store D
v_pk_mul_f32 v[vgprValuC+20:vgprValuC+20+1], v[92:93], v[vgprValuC+20:vgprValuC+20+1] // *= ScaleAlphaVecVMulPK(92)(0)
v_pk_mul_f32 v[vgprValuC+22:vgprValuC+22+1], v[94:95], v[vgprValuC+22:vgprValuC+22+1] // *= ScaleAlphaVecVMulPK(92)(2)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+20:vgprValuC+20+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+22:vgprValuC+22+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[20:21], v[4:5]
v_mov_b64 v[22:23], v[6:7]
s_lshl_b32 s74, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[20:23], v9, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], v[100:101], v[vgprValuC+24:vgprValuC+24+1] // *= ScaleAlphaVecVMulPK(100)(0)
v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], v[102:103], v[vgprValuC+26:vgprValuC+26+1] // *= ScaleAlphaVecVMulPK(100)(2)
v_pk_add_f32 v[4:5], v[96:97], v[vgprValuC+24:vgprValuC+24+1] // C += bias
v_pk_add_f32 v[6:7], v[98:99], v[vgprValuC+26:vgprValuC+26+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[24:25], v[4:5]
v_mov_b64 v[26:27], v[6:7]
buffer_store_dwordx4 v[24:27], v9, s[sgprSrdD:sgprSrdD+3], 0 offen offset:512 // store D
v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], v[92:93], v[vgprValuC+28:vgprValuC+28+1] // *= ScaleAlphaVecVMulPK(92)(0)
v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], v[94:95], v[vgprValuC+30:vgprValuC+30+1] // *= ScaleAlphaVecVMulPK(92)(2)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+28:vgprValuC+28+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+30:vgprValuC+30+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[28:29], v[4:5]
v_mov_b64 v[30:31], v[6:7]
s_lshl_b32 s74, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[28:31], v9, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], v[100:101], v[vgprValuC+32:vgprValuC+32+1] // *= ScaleAlphaVecVMulPK(100)(0)
v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], v[102:103], v[vgprValuC+34:vgprValuC+34+1] // *= ScaleAlphaVecVMulPK(100)(2)
v_pk_add_f32 v[4:5], v[96:97], v[vgprValuC+32:vgprValuC+32+1] // C += bias
v_pk_add_f32 v[6:7], v[98:99], v[vgprValuC+34:vgprValuC+34+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[32:33], v[4:5]
v_mov_b64 v[34:35], v[6:7]
buffer_store_dwordx4 v[32:35], v9, s[sgprSrdD:sgprSrdD+3], 0 offen offset:512 // store D
v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], v[92:93], v[vgprValuC+36:vgprValuC+36+1] // *= ScaleAlphaVecVMulPK(92)(0)
v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], v[94:95], v[vgprValuC+38:vgprValuC+38+1] // *= ScaleAlphaVecVMulPK(92)(2)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+36:vgprValuC+36+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+38:vgprValuC+38+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[36:37], v[4:5]
v_mov_b64 v[38:39], v[6:7]
s_lshl_b32 s74, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[36:39], v9, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], v[100:101], v[vgprValuC+40:vgprValuC+40+1] // *= ScaleAlphaVecVMulPK(100)(0)
v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], v[102:103], v[vgprValuC+42:vgprValuC+42+1] // *= ScaleAlphaVecVMulPK(100)(2)
v_pk_add_f32 v[4:5], v[96:97], v[vgprValuC+40:vgprValuC+40+1] // C += bias
v_pk_add_f32 v[6:7], v[98:99], v[vgprValuC+42:vgprValuC+42+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[40:41], v[4:5]
v_mov_b64 v[42:43], v[6:7]
buffer_store_dwordx4 v[40:43], v9, s[sgprSrdD:sgprSrdD+3], 0 offen offset:512 // store D
v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], v[92:93], v[vgprValuC+44:vgprValuC+44+1] // *= ScaleAlphaVecVMulPK(92)(0)
v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], v[94:95], v[vgprValuC+46:vgprValuC+46+1] // *= ScaleAlphaVecVMulPK(92)(2)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+44:vgprValuC+44+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+46:vgprValuC+46+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[44:45], v[4:5]
v_mov_b64 v[46:47], v[6:7]
s_lshl_b32 s74, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[44:47], v9, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], v[100:101], v[vgprValuC+48:vgprValuC+48+1] // *= ScaleAlphaVecVMulPK(100)(0)
v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], v[102:103], v[vgprValuC+50:vgprValuC+50+1] // *= ScaleAlphaVecVMulPK(100)(2)
v_pk_add_f32 v[4:5], v[96:97], v[vgprValuC+48:vgprValuC+48+1] // C += bias
v_pk_add_f32 v[6:7], v[98:99], v[vgprValuC+50:vgprValuC+50+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[48:49], v[4:5]
v_mov_b64 v[50:51], v[6:7]
buffer_store_dwordx4 v[48:51], v9, s[sgprSrdD:sgprSrdD+3], 0 offen offset:512 // store D
v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], v[92:93], v[vgprValuC+52:vgprValuC+52+1] // *= ScaleAlphaVecVMulPK(92)(0)
v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], v[94:95], v[vgprValuC+54:vgprValuC+54+1] // *= ScaleAlphaVecVMulPK(92)(2)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+52:vgprValuC+52+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+54:vgprValuC+54+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[52:53], v[4:5]
v_mov_b64 v[54:55], v[6:7]
s_lshl_b32 s74, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[52:55], v9, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], v[100:101], v[vgprValuC+56:vgprValuC+56+1] // *= ScaleAlphaVecVMulPK(100)(0)
v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], v[102:103], v[vgprValuC+58:vgprValuC+58+1] // *= ScaleAlphaVecVMulPK(100)(2)
v_pk_add_f32 v[4:5], v[96:97], v[vgprValuC+56:vgprValuC+56+1] // C += bias
v_pk_add_f32 v[6:7], v[98:99], v[vgprValuC+58:vgprValuC+58+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[56:57], v[4:5]
v_mov_b64 v[58:59], v[6:7]
buffer_store_dwordx4 v[56:59], v9, s[sgprSrdD:sgprSrdD+3], 0 offen offset:512 // store D
v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], v[92:93], v[vgprValuC+60:vgprValuC+60+1] // *= ScaleAlphaVecVMulPK(92)(0)
v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], v[94:95], v[vgprValuC+62:vgprValuC+62+1] // *= ScaleAlphaVecVMulPK(92)(2)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+60:vgprValuC+60+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+62:vgprValuC+62+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[60:61], v[4:5]
v_mov_b64 v[62:63], v[6:7]
s_lshl_b32 s74, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[60:63], v9, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], v[100:101], v[vgprValuC+64:vgprValuC+64+1] // *= ScaleAlphaVecVMulPK(100)(0)
v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], v[102:103], v[vgprValuC+66:vgprValuC+66+1] // *= ScaleAlphaVecVMulPK(100)(2)
v_pk_add_f32 v[4:5], v[96:97], v[vgprValuC+64:vgprValuC+64+1] // C += bias
v_pk_add_f32 v[6:7], v[98:99], v[vgprValuC+66:vgprValuC+66+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[64:65], v[4:5]
v_mov_b64 v[66:67], v[6:7]
buffer_store_dwordx4 v[64:67], v9, s[sgprSrdD:sgprSrdD+3], 0 offen offset:512 // store D
v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], v[92:93], v[vgprValuC+68:vgprValuC+68+1] // *= ScaleAlphaVecVMulPK(92)(0)
v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], v[94:95], v[vgprValuC+70:vgprValuC+70+1] // *= ScaleAlphaVecVMulPK(92)(2)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+68:vgprValuC+68+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+70:vgprValuC+70+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[68:69], v[4:5]
v_mov_b64 v[70:71], v[6:7]
s_lshl_b32 s74, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[68:71], v9, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], v[100:101], v[vgprValuC+72:vgprValuC+72+1] // *= ScaleAlphaVecVMulPK(100)(0)
v_pk_mul_f32 v[vgprValuC+74:vgprValuC+74+1], v[102:103], v[vgprValuC+74:vgprValuC+74+1] // *= ScaleAlphaVecVMulPK(100)(2)
v_pk_add_f32 v[4:5], v[96:97], v[vgprValuC+72:vgprValuC+72+1] // C += bias
v_pk_add_f32 v[6:7], v[98:99], v[vgprValuC+74:vgprValuC+74+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[72:73], v[4:5]
v_mov_b64 v[74:75], v[6:7]
buffer_store_dwordx4 v[72:75], v9, s[sgprSrdD:sgprSrdD+3], 0 offen offset:512 // store D
v_pk_mul_f32 v[vgprValuC+76:vgprValuC+76+1], v[92:93], v[vgprValuC+76:vgprValuC+76+1] // *= ScaleAlphaVecVMulPK(92)(0)
v_pk_mul_f32 v[vgprValuC+78:vgprValuC+78+1], v[94:95], v[vgprValuC+78:vgprValuC+78+1] // *= ScaleAlphaVecVMulPK(92)(2)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+76:vgprValuC+76+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+78:vgprValuC+78+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[76:77], v[4:5]
v_mov_b64 v[78:79], v[6:7]
s_lshl_b32 s74, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[76:79], v9, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], v[100:101], v[vgprValuC+80:vgprValuC+80+1] // *= ScaleAlphaVecVMulPK(100)(0)
v_pk_mul_f32 v[vgprValuC+82:vgprValuC+82+1], v[102:103], v[vgprValuC+82:vgprValuC+82+1] // *= ScaleAlphaVecVMulPK(100)(2)
v_pk_add_f32 v[4:5], v[96:97], v[vgprValuC+80:vgprValuC+80+1] // C += bias
v_pk_add_f32 v[6:7], v[98:99], v[vgprValuC+82:vgprValuC+82+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[80:81], v[4:5]
v_mov_b64 v[82:83], v[6:7]
buffer_store_dwordx4 v[80:83], v9, s[sgprSrdD:sgprSrdD+3], 0 offen offset:512 // store D
v_pk_mul_f32 v[vgprValuC+84:vgprValuC+84+1], v[92:93], v[vgprValuC+84:vgprValuC+84+1] // *= ScaleAlphaVecVMulPK(92)(0)
v_pk_mul_f32 v[vgprValuC+86:vgprValuC+86+1], v[94:95], v[vgprValuC+86:vgprValuC+86+1] // *= ScaleAlphaVecVMulPK(92)(2)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+84:vgprValuC+84+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+86:vgprValuC+86+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[84:85], v[4:5]
v_mov_b64 v[86:87], v[6:7]
s_lshl_b32 s74, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[84:87], v9, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 factorDim=0 */

/******************************************/
/* Global Write Batch #3 (d1,d0,vc1,vc0) = */
/*    (1,1,12,0:vw4); (1,0,13,0:vw4); (1,1,13,0:vw4); (1,0,14,0:vw4); (1,1,14,0:vw4); (1,0,15,0:vw4); (1,1,15,0:vw4) */
/******************************************/

/* calc coords, apply mask, and issue loads (if necessary) */
/*
 * Batch #3 operand fetch.
 * Only two bias/scaleAlpha pairs are read from LDS, both addressed through
 * v11: one pair at offsets 512/1536 (element with d0=1) and one pair at
 * offsets 0/1024 (element with d0=0). The five remaining elements of the
 * batch issue no loads; the store sequence that follows alternates between
 * these two register sets (v[40:47] and v[48:55]).
 */
/* (d1,vc1,d0,vc0)=(1,12,1,0) */
ds_read_b128 v[40:43], v11 offset:512              // load Bias (d0=1 set) -> v[40:43]
ds_read_b128 v[44:47], v11 offset:1536             // load scaleAlpha (d0=1 set) -> v[44:47]
/* (d1,vc1,d0,vc0)=(1,13,0,0) */
ds_read_b128 v[48:51], v11 offset:0                // load Bias (d0=0 set) -> v[48:51]
ds_read_b128 v[52:55], v11 offset:1024             // load scaleAlpha (d0=0 set) -> v[52:55]
/* (d1,vc1,d0,vc0)=(1,13,1,0) */
/* (d1,vc1,d0,vc0)=(1,14,0,0) */
/* (d1,vc1,d0,vc0)=(1,14,1,0) */
/* (d1,vc1,d0,vc0)=(1,15,0,0) */
/* (d1,vc1,d0,vc0)=(1,15,1,0) */
/*
 * Copy 28 accumulator registers (acc147, acc151, ... acc255, every 4th one)
 * into vgprValuC+12 .. vgprValuC+39, i.e. 4 values for each of the 7 batch
 * elements. NOTE(review): the vreg[N] numbers in the trailing comments are
 * emitted by the generator; presumably they are logical element indices
 * rather than physical VGPR numbers -- confirm against the generator.
 */
v_accvgpr_read_b32 v[vgprValuC+12], acc147         // copy acc to vreg[228]
v_accvgpr_read_b32 v[vgprValuC+13], acc151         // copy acc to vreg[229]
v_accvgpr_read_b32 v[vgprValuC+14], acc155         // copy acc to vreg[230]
v_accvgpr_read_b32 v[vgprValuC+15], acc159         // copy acc to vreg[231]
v_accvgpr_read_b32 v[vgprValuC+16], acc163         // copy acc to vreg[232]
v_accvgpr_read_b32 v[vgprValuC+17], acc167         // copy acc to vreg[233]
v_accvgpr_read_b32 v[vgprValuC+18], acc171         // copy acc to vreg[234]
v_accvgpr_read_b32 v[vgprValuC+19], acc175         // copy acc to vreg[235]
v_accvgpr_read_b32 v[vgprValuC+20], acc179         // copy acc to vreg[236]
v_accvgpr_read_b32 v[vgprValuC+21], acc183         // copy acc to vreg[237]
v_accvgpr_read_b32 v[vgprValuC+22], acc187         // copy acc to vreg[238]
v_accvgpr_read_b32 v[vgprValuC+23], acc191         // copy acc to vreg[239]
v_accvgpr_read_b32 v[vgprValuC+24], acc195         // copy acc to vreg[240]
v_accvgpr_read_b32 v[vgprValuC+25], acc199         // copy acc to vreg[241]
v_accvgpr_read_b32 v[vgprValuC+26], acc203         // copy acc to vreg[242]
v_accvgpr_read_b32 v[vgprValuC+27], acc207         // copy acc to vreg[243]
v_accvgpr_read_b32 v[vgprValuC+28], acc211         // copy acc to vreg[244]
v_accvgpr_read_b32 v[vgprValuC+29], acc215         // copy acc to vreg[245]
v_accvgpr_read_b32 v[vgprValuC+30], acc219         // copy acc to vreg[246]
v_accvgpr_read_b32 v[vgprValuC+31], acc223         // copy acc to vreg[247]
v_accvgpr_read_b32 v[vgprValuC+32], acc227         // copy acc to vreg[248]
v_accvgpr_read_b32 v[vgprValuC+33], acc231         // copy acc to vreg[249]
v_accvgpr_read_b32 v[vgprValuC+34], acc235         // copy acc to vreg[250]
v_accvgpr_read_b32 v[vgprValuC+35], acc239         // copy acc to vreg[251]
v_accvgpr_read_b32 v[vgprValuC+36], acc243         // copy acc to vreg[252]
v_accvgpr_read_b32 v[vgprValuC+37], acc247         // copy acc to vreg[253]
v_accvgpr_read_b32 v[vgprValuC+38], acc251         // copy acc to vreg[254]
v_accvgpr_read_b32 v[vgprValuC+39], acc255         // copy acc to vreg[255]

/* rC *= alpha batchElements=[(1, 1, 12, 0), (1, 0, 13, 0), (1, 1, 13, 0), (1, 0, 14, 0), (1, 1, 14, 0), (1, 0, 15, 0), (1, 1, 15, 0)] */
/*
 * Scale all 28 values of this batch (vgprValuC+12 .. vgprValuC+39) by alpha,
 * two floats per packed instruction, in place.
 * op_sel_hi:[0,1,1] clears the high-lane select for source 0 only, so both
 * packed lanes read the low dword of the SGPR pair (s[sgprAlpha]); the
 * scalar alpha is thereby broadcast across the pair.
 */
v_pk_mul_f32 v[vgprValuC+12:vgprValuC+12+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+12:vgprValuC+12+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+14:vgprValuC+14+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+14:vgprValuC+14+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+16:vgprValuC+16+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+16:vgprValuC+16+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+18:vgprValuC+18+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+18:vgprValuC+18+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+20:vgprValuC+20+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+20:vgprValuC+20+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+22:vgprValuC+22+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+22:vgprValuC+22+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk)

/* apply mask, calc new C and issue writes */
/*
 * Per-element epilogue, repeated for the 7 elements of Batch #3:
 *   1. multiply the element's four values by a scaleAlpha vector
 *   2. add the bias vector, leaving the four sums in v[4:7]
 *   3. s_swappc_b64 to the address held in s[8:9], saving the return PC in
 *      s[64:65]. Presumably this is the selected activation routine working
 *      on v[4:7]; the code that sets s[8:9] for this path is not in this
 *      section -- confirm.
 *   4. copy v[4:7] into the store registers and issue one dwordx4 store to D
 * Elements alternate between two forms:
 *   - offset:512 stores use bias v[40:43] and scaleAlpha v[44:47];
 *   - offset:0 stores use bias v[48:51] and scaleAlpha v[52:55], and are
 *     preceded by advancing the SrdD base by StrideD1J << 2 bytes (carry
 *     propagated into SrdD+1), i.e. they move to the next row of D.
 * All stores use v9 as the per-lane byte offset.
 */

s_waitcnt lgkmcnt(2)                               // dscnt(2) = 4 - 1 (bias) - 1 (scaleAlphaVec) (interleaved): first pair v[40:47] is ready
v_pk_mul_f32 v[vgprValuC+12:vgprValuC+12+1], v[44:45], v[vgprValuC+12:vgprValuC+12+1] // *= ScaleAlphaVecVMulPK(44)(0)
v_pk_mul_f32 v[vgprValuC+14:vgprValuC+14+1], v[46:47], v[vgprValuC+14:vgprValuC+14+1] // *= ScaleAlphaVecVMulPK(44)(2)
v_pk_add_f32 v[4:5], v[40:41], v[vgprValuC+12:vgprValuC+12+1] // C += bias
v_pk_add_f32 v[6:7], v[42:43], v[vgprValuC+14:vgprValuC+14+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]                      // jump to address in s[8:9]; return PC saved in s[64:65]
v_mov_b64 v[12:13], v[4:5]                         // v[4:7] -> store registers
v_mov_b64 v[14:15], v[6:7]
buffer_store_dwordx4 v[12:15], v9, s[sgprSrdD:sgprSrdD+3], 0 offen offset:512 // store D

s_waitcnt lgkmcnt(0)                               // dscnt(0) = 4 - 2 (bias) - 2 (scaleAlphaVec) (interleaved): second pair v[48:55] is ready
v_pk_mul_f32 v[vgprValuC+16:vgprValuC+16+1], v[52:53], v[vgprValuC+16:vgprValuC+16+1] // *= ScaleAlphaVecVMulPK(52)(0)
v_pk_mul_f32 v[vgprValuC+18:vgprValuC+18+1], v[54:55], v[vgprValuC+18:vgprValuC+18+1] // *= ScaleAlphaVecVMulPK(52)(2)
v_pk_add_f32 v[4:5], v[48:49], v[vgprValuC+16:vgprValuC+16+1] // C += bias
v_pk_add_f32 v[6:7], v[50:51], v[vgprValuC+18:vgprValuC+18+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]                      // jump to address in s[8:9]; return PC saved in s[64:65]
v_mov_b64 v[16:17], v[4:5]
v_mov_b64 v[18:19], v[6:7]
s_lshl_b32 s74, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[16:19], v9, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_pk_mul_f32 v[vgprValuC+20:vgprValuC+20+1], v[44:45], v[vgprValuC+20:vgprValuC+20+1] // *= ScaleAlphaVecVMulPK(44)(0)
v_pk_mul_f32 v[vgprValuC+22:vgprValuC+22+1], v[46:47], v[vgprValuC+22:vgprValuC+22+1] // *= ScaleAlphaVecVMulPK(44)(2)
v_pk_add_f32 v[4:5], v[40:41], v[vgprValuC+20:vgprValuC+20+1] // C += bias
v_pk_add_f32 v[6:7], v[42:43], v[vgprValuC+22:vgprValuC+22+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]                      // jump to address in s[8:9]; return PC saved in s[64:65]
v_mov_b64 v[20:21], v[4:5]
v_mov_b64 v[22:23], v[6:7]
buffer_store_dwordx4 v[20:23], v9, s[sgprSrdD:sgprSrdD+3], 0 offen offset:512 // store D
v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], v[52:53], v[vgprValuC+24:vgprValuC+24+1] // *= ScaleAlphaVecVMulPK(52)(0)
v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], v[54:55], v[vgprValuC+26:vgprValuC+26+1] // *= ScaleAlphaVecVMulPK(52)(2)
v_pk_add_f32 v[4:5], v[48:49], v[vgprValuC+24:vgprValuC+24+1] // C += bias
v_pk_add_f32 v[6:7], v[50:51], v[vgprValuC+26:vgprValuC+26+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]                      // jump to address in s[8:9]; return PC saved in s[64:65]
v_mov_b64 v[24:25], v[4:5]
v_mov_b64 v[26:27], v[6:7]
s_lshl_b32 s74, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[24:27], v9, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], v[44:45], v[vgprValuC+28:vgprValuC+28+1] // *= ScaleAlphaVecVMulPK(44)(0)
v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], v[46:47], v[vgprValuC+30:vgprValuC+30+1] // *= ScaleAlphaVecVMulPK(44)(2)
v_pk_add_f32 v[4:5], v[40:41], v[vgprValuC+28:vgprValuC+28+1] // C += bias
v_pk_add_f32 v[6:7], v[42:43], v[vgprValuC+30:vgprValuC+30+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]                      // jump to address in s[8:9]; return PC saved in s[64:65]
v_mov_b64 v[28:29], v[4:5]
v_mov_b64 v[30:31], v[6:7]
buffer_store_dwordx4 v[28:31], v9, s[sgprSrdD:sgprSrdD+3], 0 offen offset:512 // store D
v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], v[52:53], v[vgprValuC+32:vgprValuC+32+1] // *= ScaleAlphaVecVMulPK(52)(0)
v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], v[54:55], v[vgprValuC+34:vgprValuC+34+1] // *= ScaleAlphaVecVMulPK(52)(2)
v_pk_add_f32 v[4:5], v[48:49], v[vgprValuC+32:vgprValuC+32+1] // C += bias
v_pk_add_f32 v[6:7], v[50:51], v[vgprValuC+34:vgprValuC+34+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]                      // jump to address in s[8:9]; return PC saved in s[64:65]
v_mov_b64 v[32:33], v[4:5]
v_mov_b64 v[34:35], v[6:7]
s_lshl_b32 s74, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[32:35], v9, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], v[44:45], v[vgprValuC+36:vgprValuC+36+1] // *= ScaleAlphaVecVMulPK(44)(0)
v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], v[46:47], v[vgprValuC+38:vgprValuC+38+1] // *= ScaleAlphaVecVMulPK(44)(2)
v_pk_add_f32 v[4:5], v[40:41], v[vgprValuC+36:vgprValuC+36+1] // C += bias
v_pk_add_f32 v[6:7], v[42:43], v[vgprValuC+38:vgprValuC+38+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]                      // jump to address in s[8:9]; return PC saved in s[64:65]
v_mov_b64 v[36:37], v[4:5]
v_mov_b64 v[38:39], v[6:7]
buffer_store_dwordx4 v[36:39], v9, s[sgprSrdD:sgprSrdD+3], 0 offen offset:512 // store D
s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
s_branch label_GW_End                              // last batch of this path done: jump to end
label_GW_B0_E1_N:
/*
 * Activation dispatch for this write path (label suffix beta_0_edge_1).
 * Selects a VW4 activation routine from sgprActivationType and leaves its
 * address in s[8:9]:
 *     3 -> Gelu, 5 -> Relu, 10 -> Silu, 12 -> Clamp, any other value -> None
 * s[8:9] is the jump-target operand of the s_swappc_b64 instructions used by
 * the global-write code in this file. s73 is used as scratch.
 *
 * Each stub forms the address the same way:
 *     s_getpc_b64   s[8:9] = address of the next instruction
 *     s_add_i32     s73    = assembler-resolved offset of the target label + 4
 *     s_add_u32 / s_addc_u32   s[8:9] += s73 (64-bit add with carry)
 * NOTE(review): the "+ 4" presumably compensates for where the label literal
 * sits inside the s_add_i32 encoding -- confirm against the assembler's
 * fixup rules.
 * NOTE(review): s73 is added without sign extension (the high half only
 * receives the carry), so this looks correct only when the offset is
 * non-negative, i.e. the activation routines are placed after these stubs --
 * confirm the layout.
 */
s_cmpk_eq_u32 s[sgprActivationType], 3             // activationType == 3
s_cbranch_scc1 label_To_Activation_Gelu_VW4_beta_0_edge_1 // Branch if true
s_cmpk_eq_u32 s[sgprActivationType], 5             // activationType == 5
s_cbranch_scc1 label_To_Activation_Relu_VW4_beta_0_edge_1 // Branch if true
s_cmpk_eq_u32 s[sgprActivationType], 10            // activationType == 10
s_cbranch_scc1 label_To_Activation_Silu_VW4_beta_0_edge_1 // Branch if true
s_cmpk_eq_u32 s[sgprActivationType], 12            // activationType == 12
s_cbranch_scc1 label_To_Activation_Clamp_VW4_beta_0_edge_1 // Branch if true
/* No compare matched: fall through to the "None" stub. */
label_To_Activation_None_VW4_beta_0_edge_1:
s_getpc_b64 s[8:9]                                 // addr of next instr
s_add_i32 s73, label_Activation_None_VW4, 4        // target branch offset
s_add_u32 s8, s8, s73                              // add target branch offset
s_addc_u32 s9, s9, 0                               // add high and carry
s_branch label_ActivationSetPCAddrEnd_4
label_To_Activation_Gelu_VW4_beta_0_edge_1:
s_getpc_b64 s[8:9]                                 // addr of next instr
s_add_i32 s73, label_Activation_Gelu_VW4, 4        // target branch offset
s_add_u32 s8, s8, s73                              // add target branch offset
s_addc_u32 s9, s9, 0                               // add high and carry
s_branch label_ActivationSetPCAddrEnd_4
label_To_Activation_Relu_VW4_beta_0_edge_1:
s_getpc_b64 s[8:9]                                 // addr of next instr
s_add_i32 s73, label_Activation_Relu_VW4, 4        // target branch offset
s_add_u32 s8, s8, s73                              // add target branch offset
s_addc_u32 s9, s9, 0                               // add high and carry
s_branch label_ActivationSetPCAddrEnd_4
label_To_Activation_Silu_VW4_beta_0_edge_1:
s_getpc_b64 s[8:9]                                 // addr of next instr
s_add_i32 s73, label_Activation_Silu_VW4, 4        // target branch offset
s_add_u32 s8, s8, s73                              // add target branch offset
s_addc_u32 s9, s9, 0                               // add high and carry
s_branch label_ActivationSetPCAddrEnd_4
label_To_Activation_Clamp_VW4_beta_0_edge_1:
s_getpc_b64 s[8:9]                                 // addr of next instr
s_add_i32 s73, label_Activation_Clamp_VW4, 4       // target branch offset
s_add_u32 s8, s8, s73                              // add target branch offset
s_addc_u32 s9, s9, 0                               // add high and carry
s_branch label_ActivationSetPCAddrEnd_4            // NOTE(review): target is the very next label; branch is redundant but harmless
label_ActivationSetPCAddrEnd_4:

/* edge=1, allocate 6 sgpr. perBatchTmpS=4 perBatchMaskS=2 perElementMaskS=0 elementsPerBatch=15 */
/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */

/******************************************/
/* Global Write Edge Batch #0 (d1,d0,vc1,vc0) = */
/*    (0,0,0,0:vw4); (0,1,0,0:vw4); (0,0,1,0:vw4); (0,1,1,0:vw4); (0,0,2,0:vw4); (0,1,2,0:vw4); (0,0,3,0:vw4); (0,1,3,0:vw4); (0,0,4,0:vw4); (0,1,4,0:vw4); (0,0,5,0:vw4); (0,1,5,0:vw4); (0,0,6,0:vw4); (0,1,6,0:vw4); (0,0,7,0:vw4) */
/******************************************/

/* calc coords, apply mask, and issue loads (if necessary) */
/*
 * Edge-path address setup for the first row (vc1=0), both d0 columns.
 * For each element this:
 *   - builds an in-bounds mask: coord0 < SizeI && coord1 < SizeJ
 *   - converts coord0 to a tile-local byte offset (coord0 - 256*WorkGroup0,
 *     times 4) and reads one bias vector and one scaleAlpha vector from LDS
 *     at that offset (+0 and +1024 respectively)
 *   - forms the per-lane byte offset (v3 + coord0) << 2, where v3 is the row
 *     pointer the later code calls coutRowPtrD, and replaces it with v8
 *     (BufferOOB) in lanes whose mask is false
 * s74 is reused as scratch: it is overwritten after the compare result in
 * s[74:75] has been folded into s[78:79].
 */
v_mov_b32 v8, BufferOOB                            // v8 = offset substituted for out-of-bounds lanes
/* (d1,vc1,d0,vc0)=(0,0,0,0) */
v_cmp_lt_u32 s[74:75], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v10, v0, s74                             // tile-local coord0
v_lshlrev_b32 v10, 0x2, v10                        // Bias address scaled by BPE
s_waitcnt lgkmcnt(0)                               // Wait for LDS write
s_barrier                                          // LDS write barrier
ds_read_b128 v[72:75], v10 offset:0                // load Bias
ds_read_b128 v[76:79], v10 offset:1024             // load scaleAlpha
v_add_lshl_u32 v9, v3, v0, 0x2                     // scaleToBpe: v9 = (v3 + coord0) << 2, byte offset for this element
v_cndmask_b32 v9, v8, v9, s[78:79]                 // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,0,1,0) */
s_mov_b32 s74, 128                                 // coordOffset0 d0=1 vc0=0
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v88, v4, s74                             // tile-local coord0 for the d0=1 column
v_lshlrev_b32 v88, 0x2, v88                        // Bias address scaled by BPE
ds_read_b128 v[80:83], v88 offset:0                // load Bias
ds_read_b128 v[84:87], v88 offset:1024             // load scaleAlpha
v_add_lshl_u32 v11, v3, v4, 0x2                    // scaleToBpe: v11 = (v3 + coord0) << 2, byte offset for this element
v_cndmask_b32 v11, v8, v11, s[78:79]               // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,1,0,0) */
/*
 * Row vc1=1: advance coord1 and both row pointers by one row, then rebuild
 * the mask and the clipped byte offset for the d0=0 (v89) and d0=1 (v91)
 * columns. The tile-local bias offsets (v90, v92) are recomputed, but no LDS
 * read is issued for this row in this section.
 */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[74:75], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0 (s74 is free again: mask already folded into s[78:79])
v_sub_u32 v90, v0, s74                             // tile-local coord0
v_lshlrev_b32 v90, 0x2, v90                        // Bias address scaled by BPE
v_add_lshl_u32 v89, v3, v0, 0x2                    // scaleToBpe: v89 = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v89, v8, v89, s[78:79]               // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,1,1,0) */
s_mov_b32 s74, 128                                 // coordOffset0 d0=1 vc0=0
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v92, v4, s74                             // tile-local coord0 for the d0=1 column
v_lshlrev_b32 v92, 0x2, v92                        // Bias address scaled by BPE
v_add_lshl_u32 v91, v3, v4, 0x2                    // scaleToBpe: v91 = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v91, v8, v91, s[78:79]               // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,2,0,0) */
/*
 * Row vc1=2: advance coord1 and both row pointers by one row, then rebuild
 * the mask and the clipped byte offset for the d0=0 (v93) and d0=1 (v95)
 * columns. The tile-local bias offsets (v94, v96) are recomputed, but no LDS
 * read is issued for this row in this section.
 */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[74:75], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0 (s74 is free again: mask already folded into s[78:79])
v_sub_u32 v94, v0, s74                             // tile-local coord0
v_lshlrev_b32 v94, 0x2, v94                        // Bias address scaled by BPE
v_add_lshl_u32 v93, v3, v0, 0x2                    // scaleToBpe: v93 = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v93, v8, v93, s[78:79]               // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,2,1,0) */
s_mov_b32 s74, 128                                 // coordOffset0 d0=1 vc0=0
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v96, v4, s74                             // tile-local coord0 for the d0=1 column
v_lshlrev_b32 v96, 0x2, v96                        // Bias address scaled by BPE
v_add_lshl_u32 v95, v3, v4, 0x2                    // scaleToBpe: v95 = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v95, v8, v95, s[78:79]               // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,3,0,0) */
/*
 * Row vc1=3: advance coord1 and both row pointers by one row, then rebuild
 * the mask and the clipped byte offset for the d0=0 (v97) and d0=1 (v99)
 * columns. The tile-local bias offsets (v98, v100) are recomputed, but no
 * LDS read is issued for this row in this section.
 */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[74:75], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0 (s74 is free again: mask already folded into s[78:79])
v_sub_u32 v98, v0, s74                             // tile-local coord0
v_lshlrev_b32 v98, 0x2, v98                        // Bias address scaled by BPE
v_add_lshl_u32 v97, v3, v0, 0x2                    // scaleToBpe: v97 = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v97, v8, v97, s[78:79]               // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,3,1,0) */
s_mov_b32 s74, 128                                 // coordOffset0 d0=1 vc0=0
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v100, v4, s74                            // tile-local coord0 for the d0=1 column
v_lshlrev_b32 v100, 0x2, v100                      // Bias address scaled by BPE
v_add_lshl_u32 v99, v3, v4, 0x2                    // scaleToBpe: v99 = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v99, v8, v99, s[78:79]               // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,4,0,0) */
/*
 * Row vc1=4: advance coord1 and both row pointers by one row, then rebuild
 * the mask and the clipped byte offset for the d0=0 (v101) and d0=1 (v103)
 * columns. The tile-local bias offsets (v102, v104) are recomputed, but no
 * LDS read is issued for this row in this section.
 */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[74:75], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0 (s74 is free again: mask already folded into s[78:79])
v_sub_u32 v102, v0, s74                            // tile-local coord0
v_lshlrev_b32 v102, 0x2, v102                      // Bias address scaled by BPE
v_add_lshl_u32 v101, v3, v0, 0x2                   // scaleToBpe: v101 = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v101, v8, v101, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,4,1,0) */
s_mov_b32 s74, 128                                 // coordOffset0 d0=1 vc0=0
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v104, v4, s74                            // tile-local coord0 for the d0=1 column
v_lshlrev_b32 v104, 0x2, v104                      // Bias address scaled by BPE
v_add_lshl_u32 v103, v3, v4, 0x2                   // scaleToBpe: v103 = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v103, v8, v103, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,5,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[74:75], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v106, v0, s74
v_lshlrev_b32 v106, 0x2, v106                      // Bias address scaled by BPE
v_add_lshl_u32 v105, v3, v0, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v105, v8, v105, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,5,1,0) */
s_mov_b32 s74, 128                                 // coordOffset0 d0=1 vc0=0
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v108, v4, s74
v_lshlrev_b32 v108, 0x2, v108                      // Bias address scaled by BPE
v_add_lshl_u32 v107, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v107, v8, v107, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,6,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[74:75], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v110, v0, s74
v_lshlrev_b32 v110, 0x2, v110                      // Bias address scaled by BPE
v_add_lshl_u32 v109, v3, v0, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v109, v8, v109, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,6,1,0) */
s_mov_b32 s74, 128                                 // coordOffset0 d0=1 vc0=0
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v112, v4, s74
v_lshlrev_b32 v112, 0x2, v112                      // Bias address scaled by BPE
v_add_lshl_u32 v111, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v111, v8, v111, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,7,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[74:75], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v114, v0, s74
v_lshlrev_b32 v114, 0x2, v114                      // Bias address scaled by BPE
v_add_lshl_u32 v113, v3, v0, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v113, v8, v113, s[78:79]             // LDD clip if OOB. offset
/* ------------------------------------------------------------------------
 * Copy this batch's 60 accumulator values into v[vgprValuC+12 .. vgprValuC+71]
 * (15 elements x 4 values each). Source accumulators are read with a stride
 * of 4: first acc0, acc4, ..., acc124 (32 values), then acc1, acc5, ...,
 * acc109 (28 values). Destination registers are consecutive.
 * ------------------------------------------------------------------------ */
/* acc index = 4*i + 0, i = 0..31 */
v_accvgpr_read_b32 v[vgprValuC+12], acc0           // copy acc to vreg[0]
v_accvgpr_read_b32 v[vgprValuC+13], acc4           // copy acc to vreg[1]
v_accvgpr_read_b32 v[vgprValuC+14], acc8           // copy acc to vreg[2]
v_accvgpr_read_b32 v[vgprValuC+15], acc12          // copy acc to vreg[3]
v_accvgpr_read_b32 v[vgprValuC+16], acc16          // copy acc to vreg[4]
v_accvgpr_read_b32 v[vgprValuC+17], acc20          // copy acc to vreg[5]
v_accvgpr_read_b32 v[vgprValuC+18], acc24          // copy acc to vreg[6]
v_accvgpr_read_b32 v[vgprValuC+19], acc28          // copy acc to vreg[7]
v_accvgpr_read_b32 v[vgprValuC+20], acc32          // copy acc to vreg[8]
v_accvgpr_read_b32 v[vgprValuC+21], acc36          // copy acc to vreg[9]
v_accvgpr_read_b32 v[vgprValuC+22], acc40          // copy acc to vreg[10]
v_accvgpr_read_b32 v[vgprValuC+23], acc44          // copy acc to vreg[11]
v_accvgpr_read_b32 v[vgprValuC+24], acc48          // copy acc to vreg[12]
v_accvgpr_read_b32 v[vgprValuC+25], acc52          // copy acc to vreg[13]
v_accvgpr_read_b32 v[vgprValuC+26], acc56          // copy acc to vreg[14]
v_accvgpr_read_b32 v[vgprValuC+27], acc60          // copy acc to vreg[15]
v_accvgpr_read_b32 v[vgprValuC+28], acc64          // copy acc to vreg[16]
v_accvgpr_read_b32 v[vgprValuC+29], acc68          // copy acc to vreg[17]
v_accvgpr_read_b32 v[vgprValuC+30], acc72          // copy acc to vreg[18]
v_accvgpr_read_b32 v[vgprValuC+31], acc76          // copy acc to vreg[19]
v_accvgpr_read_b32 v[vgprValuC+32], acc80          // copy acc to vreg[20]
v_accvgpr_read_b32 v[vgprValuC+33], acc84          // copy acc to vreg[21]
v_accvgpr_read_b32 v[vgprValuC+34], acc88          // copy acc to vreg[22]
v_accvgpr_read_b32 v[vgprValuC+35], acc92          // copy acc to vreg[23]
v_accvgpr_read_b32 v[vgprValuC+36], acc96          // copy acc to vreg[24]
v_accvgpr_read_b32 v[vgprValuC+37], acc100         // copy acc to vreg[25]
v_accvgpr_read_b32 v[vgprValuC+38], acc104         // copy acc to vreg[26]
v_accvgpr_read_b32 v[vgprValuC+39], acc108         // copy acc to vreg[27]
v_accvgpr_read_b32 v[vgprValuC+40], acc112         // copy acc to vreg[28]
v_accvgpr_read_b32 v[vgprValuC+41], acc116         // copy acc to vreg[29]
v_accvgpr_read_b32 v[vgprValuC+42], acc120         // copy acc to vreg[30]
v_accvgpr_read_b32 v[vgprValuC+43], acc124         // copy acc to vreg[31]
/* acc index = 4*i + 1, i = 0..27 */
v_accvgpr_read_b32 v[vgprValuC+44], acc1           // copy acc to vreg[32]
v_accvgpr_read_b32 v[vgprValuC+45], acc5           // copy acc to vreg[33]
v_accvgpr_read_b32 v[vgprValuC+46], acc9           // copy acc to vreg[34]
v_accvgpr_read_b32 v[vgprValuC+47], acc13          // copy acc to vreg[35]
v_accvgpr_read_b32 v[vgprValuC+48], acc17          // copy acc to vreg[36]
v_accvgpr_read_b32 v[vgprValuC+49], acc21          // copy acc to vreg[37]
v_accvgpr_read_b32 v[vgprValuC+50], acc25          // copy acc to vreg[38]
v_accvgpr_read_b32 v[vgprValuC+51], acc29          // copy acc to vreg[39]
v_accvgpr_read_b32 v[vgprValuC+52], acc33          // copy acc to vreg[40]
v_accvgpr_read_b32 v[vgprValuC+53], acc37          // copy acc to vreg[41]
v_accvgpr_read_b32 v[vgprValuC+54], acc41          // copy acc to vreg[42]
v_accvgpr_read_b32 v[vgprValuC+55], acc45          // copy acc to vreg[43]
v_accvgpr_read_b32 v[vgprValuC+56], acc49          // copy acc to vreg[44]
v_accvgpr_read_b32 v[vgprValuC+57], acc53          // copy acc to vreg[45]
v_accvgpr_read_b32 v[vgprValuC+58], acc57          // copy acc to vreg[46]
v_accvgpr_read_b32 v[vgprValuC+59], acc61          // copy acc to vreg[47]
v_accvgpr_read_b32 v[vgprValuC+60], acc65          // copy acc to vreg[48]
v_accvgpr_read_b32 v[vgprValuC+61], acc69          // copy acc to vreg[49]
v_accvgpr_read_b32 v[vgprValuC+62], acc73          // copy acc to vreg[50]
v_accvgpr_read_b32 v[vgprValuC+63], acc77          // copy acc to vreg[51]
v_accvgpr_read_b32 v[vgprValuC+64], acc81          // copy acc to vreg[52]
v_accvgpr_read_b32 v[vgprValuC+65], acc85          // copy acc to vreg[53]
v_accvgpr_read_b32 v[vgprValuC+66], acc89          // copy acc to vreg[54]
v_accvgpr_read_b32 v[vgprValuC+67], acc93          // copy acc to vreg[55]
v_accvgpr_read_b32 v[vgprValuC+68], acc97          // copy acc to vreg[56]
v_accvgpr_read_b32 v[vgprValuC+69], acc101         // copy acc to vreg[57]
v_accvgpr_read_b32 v[vgprValuC+70], acc105         // copy acc to vreg[58]
v_accvgpr_read_b32 v[vgprValuC+71], acc109         // copy acc to vreg[59]

/* rC *= alpha batchElements=[(0, 0, 0, 0), (0, 1, 0, 0), (0, 0, 1, 0), (0, 1, 1, 0), (0, 0, 2, 0), (0, 1, 2, 0), (0, 0, 3, 0), (0, 1, 3, 0), (0, 0, 4, 0), (0, 1, 4, 0), (0, 0, 5, 0), (0, 1, 5, 0), (0, 0, 6, 0), (0, 1, 6, 0), (0, 0, 7, 0)] */
/* Scale all 60 values copied above by alpha, two floats per instruction
 * (30 packed multiplies covering v[vgprValuC+12 .. vgprValuC+71]).
 * op_sel_hi:[0,1,1] makes the upper lane take the LOW dword of src0, so both
 * lanes are multiplied by s[sgprAlpha]; s[sgprAlpha+1] is named only because
 * the packed operand is a 64-bit pair. The VGPR operands use their natural
 * low/high dwords. */
v_pk_mul_f32 v[vgprValuC+12:vgprValuC+12+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+12:vgprValuC+12+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+14:vgprValuC+14+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+14:vgprValuC+14+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+16:vgprValuC+16+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+16:vgprValuC+16+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+18:vgprValuC+18+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+18:vgprValuC+18+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+20:vgprValuC+20+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+20:vgprValuC+20+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+22:vgprValuC+22+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+22:vgprValuC+22+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+64:vgprValuC+64+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+68:vgprValuC+68+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+70:vgprValuC+70+1] op_sel_hi:[0,1,1] // *= alpha (pk)
/* The code that follows reads v[72:87]; block until every outstanding LGKM
 * operation has completed so those LDS results are valid.
 * NOTE(review): the ds_read instructions for Batch #0 are issued before this chunk. */
s_waitcnt lgkmcnt(0)                               // wait for Bias LDS, ScaleAlphaVec

/* apply mask, calc new C and issue writes */
/* ------------------------------------------------------------------------
 * The same 8-instruction group is emitted for each of the 15 batch elements:
 *   2x v_pk_mul_f32  value[0..3] *= scaleAlphaVec   (v[76:79] or v[84:87])
 *   2x v_pk_add_f32  v[4:7] = bias + value[0..3]    (bias in v[72:75] or v[80:83])
 *   s_swappc_b64     jump to the address held in s[8:9]; return PC saved in s[64:65]
 *   2x v_mov_b64     copy v[4:7] back into the element's four value registers
 *   buffer_store     write 4 dwords to D at the element's offset VGPR
 * Groups alternate between the two bias/scale register sets: v[72:79] for the
 * 1st, 3rd, 5th... element and v[80:87] for the 2nd, 4th, 6th..., because those
 * vectors depend on coord0 only and d0 alternates 0/1 in the batch order.
 * Offset VGPRs were clipped to v8 for out-of-bounds lanes during address setup.
 * NOTE(review): the routine reached through s[8:9] is not visible here; presumably
 *               the activation function taking and returning v[4:7] -- confirm,
 *               including which registers it may clobber.
 * NOTE(review): the v_mov_b64 / buffer_store operands use literal register numbers
 *               (v[12:13], ...) while the multiplies use v[vgprValuC+12]; these
 *               agree only if vgprValuC == 0 -- confirm.
 * Element labels below follow the batchElements order, (d1,d0,vc1,vc0).
 * ------------------------------------------------------------------------ */
/* element (0,0,0,0) -> D offset v9 */
v_pk_mul_f32 v[vgprValuC+12:vgprValuC+12+1], v[76:77], v[vgprValuC+12:vgprValuC+12+1] // *= ScaleAlphaVecVMulPK(76)(0)
v_pk_mul_f32 v[vgprValuC+14:vgprValuC+14+1], v[78:79], v[vgprValuC+14:vgprValuC+14+1] // *= ScaleAlphaVecVMulPK(76)(2)
v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+12:vgprValuC+12+1] // C += bias
v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+14:vgprValuC+14+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[12:13], v[4:5]
v_mov_b64 v[14:15], v[6:7]
buffer_store_dwordx4 v[12:15], v9, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
/* element (0,1,0,0) -> D offset v11 */
v_pk_mul_f32 v[vgprValuC+16:vgprValuC+16+1], v[84:85], v[vgprValuC+16:vgprValuC+16+1] // *= ScaleAlphaVecVMulPK(84)(0)
v_pk_mul_f32 v[vgprValuC+18:vgprValuC+18+1], v[86:87], v[vgprValuC+18:vgprValuC+18+1] // *= ScaleAlphaVecVMulPK(84)(2)
v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+16:vgprValuC+16+1] // C += bias
v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+18:vgprValuC+18+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[16:17], v[4:5]
v_mov_b64 v[18:19], v[6:7]
buffer_store_dwordx4 v[16:19], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
/* element (0,0,1,0) -> D offset v89 */
v_pk_mul_f32 v[vgprValuC+20:vgprValuC+20+1], v[76:77], v[vgprValuC+20:vgprValuC+20+1] // *= ScaleAlphaVecVMulPK(76)(0)
v_pk_mul_f32 v[vgprValuC+22:vgprValuC+22+1], v[78:79], v[vgprValuC+22:vgprValuC+22+1] // *= ScaleAlphaVecVMulPK(76)(2)
v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+20:vgprValuC+20+1] // C += bias
v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+22:vgprValuC+22+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[20:21], v[4:5]
v_mov_b64 v[22:23], v[6:7]
buffer_store_dwordx4 v[20:23], v89, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
/* element (0,1,1,0) -> D offset v91 */
v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], v[84:85], v[vgprValuC+24:vgprValuC+24+1] // *= ScaleAlphaVecVMulPK(84)(0)
v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], v[86:87], v[vgprValuC+26:vgprValuC+26+1] // *= ScaleAlphaVecVMulPK(84)(2)
v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+24:vgprValuC+24+1] // C += bias
v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+26:vgprValuC+26+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[24:25], v[4:5]
v_mov_b64 v[26:27], v[6:7]
buffer_store_dwordx4 v[24:27], v91, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
/* element (0,0,2,0) -> D offset v93 */
v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], v[76:77], v[vgprValuC+28:vgprValuC+28+1] // *= ScaleAlphaVecVMulPK(76)(0)
v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], v[78:79], v[vgprValuC+30:vgprValuC+30+1] // *= ScaleAlphaVecVMulPK(76)(2)
v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+28:vgprValuC+28+1] // C += bias
v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+30:vgprValuC+30+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[28:29], v[4:5]
v_mov_b64 v[30:31], v[6:7]
buffer_store_dwordx4 v[28:31], v93, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
/* element (0,1,2,0) -> D offset v95 */
v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], v[84:85], v[vgprValuC+32:vgprValuC+32+1] // *= ScaleAlphaVecVMulPK(84)(0)
v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], v[86:87], v[vgprValuC+34:vgprValuC+34+1] // *= ScaleAlphaVecVMulPK(84)(2)
v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+32:vgprValuC+32+1] // C += bias
v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+34:vgprValuC+34+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[32:33], v[4:5]
v_mov_b64 v[34:35], v[6:7]
buffer_store_dwordx4 v[32:35], v95, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
/* element (0,0,3,0) -> D offset v97 */
v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], v[76:77], v[vgprValuC+36:vgprValuC+36+1] // *= ScaleAlphaVecVMulPK(76)(0)
v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], v[78:79], v[vgprValuC+38:vgprValuC+38+1] // *= ScaleAlphaVecVMulPK(76)(2)
v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+36:vgprValuC+36+1] // C += bias
v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+38:vgprValuC+38+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[36:37], v[4:5]
v_mov_b64 v[38:39], v[6:7]
buffer_store_dwordx4 v[36:39], v97, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
/* element (0,1,3,0) -> D offset v99 */
v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], v[84:85], v[vgprValuC+40:vgprValuC+40+1] // *= ScaleAlphaVecVMulPK(84)(0)
v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], v[86:87], v[vgprValuC+42:vgprValuC+42+1] // *= ScaleAlphaVecVMulPK(84)(2)
v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+40:vgprValuC+40+1] // C += bias
v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+42:vgprValuC+42+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[40:41], v[4:5]
v_mov_b64 v[42:43], v[6:7]
buffer_store_dwordx4 v[40:43], v99, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
/* element (0,0,4,0) -> D offset v101 */
v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], v[76:77], v[vgprValuC+44:vgprValuC+44+1] // *= ScaleAlphaVecVMulPK(76)(0)
v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], v[78:79], v[vgprValuC+46:vgprValuC+46+1] // *= ScaleAlphaVecVMulPK(76)(2)
v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+44:vgprValuC+44+1] // C += bias
v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+46:vgprValuC+46+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[44:45], v[4:5]
v_mov_b64 v[46:47], v[6:7]
buffer_store_dwordx4 v[44:47], v101, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
/* element (0,1,4,0) -> D offset v103 */
v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], v[84:85], v[vgprValuC+48:vgprValuC+48+1] // *= ScaleAlphaVecVMulPK(84)(0)
v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], v[86:87], v[vgprValuC+50:vgprValuC+50+1] // *= ScaleAlphaVecVMulPK(84)(2)
v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+48:vgprValuC+48+1] // C += bias
v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+50:vgprValuC+50+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[48:49], v[4:5]
v_mov_b64 v[50:51], v[6:7]
buffer_store_dwordx4 v[48:51], v103, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
/* element (0,0,5,0) -> D offset v105 */
v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], v[76:77], v[vgprValuC+52:vgprValuC+52+1] // *= ScaleAlphaVecVMulPK(76)(0)
v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], v[78:79], v[vgprValuC+54:vgprValuC+54+1] // *= ScaleAlphaVecVMulPK(76)(2)
v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+52:vgprValuC+52+1] // C += bias
v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+54:vgprValuC+54+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[52:53], v[4:5]
v_mov_b64 v[54:55], v[6:7]
buffer_store_dwordx4 v[52:55], v105, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
/* element (0,1,5,0) -> D offset v107 */
v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], v[84:85], v[vgprValuC+56:vgprValuC+56+1] // *= ScaleAlphaVecVMulPK(84)(0)
v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], v[86:87], v[vgprValuC+58:vgprValuC+58+1] // *= ScaleAlphaVecVMulPK(84)(2)
v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+56:vgprValuC+56+1] // C += bias
v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+58:vgprValuC+58+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[56:57], v[4:5]
v_mov_b64 v[58:59], v[6:7]
buffer_store_dwordx4 v[56:59], v107, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
/* element (0,0,6,0) -> D offset v109 */
v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], v[76:77], v[vgprValuC+60:vgprValuC+60+1] // *= ScaleAlphaVecVMulPK(76)(0)
v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], v[78:79], v[vgprValuC+62:vgprValuC+62+1] // *= ScaleAlphaVecVMulPK(76)(2)
v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+60:vgprValuC+60+1] // C += bias
v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+62:vgprValuC+62+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[60:61], v[4:5]
v_mov_b64 v[62:63], v[6:7]
buffer_store_dwordx4 v[60:63], v109, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
/* element (0,1,6,0) -> D offset v111 */
v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], v[84:85], v[vgprValuC+64:vgprValuC+64+1] // *= ScaleAlphaVecVMulPK(84)(0)
v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], v[86:87], v[vgprValuC+66:vgprValuC+66+1] // *= ScaleAlphaVecVMulPK(84)(2)
v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+64:vgprValuC+64+1] // C += bias
v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+66:vgprValuC+66+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[64:65], v[4:5]
v_mov_b64 v[66:67], v[6:7]
buffer_store_dwordx4 v[64:67], v111, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
/* element (0,0,7,0) -> D offset v113 */
v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], v[76:77], v[vgprValuC+68:vgprValuC+68+1] // *= ScaleAlphaVecVMulPK(76)(0)
v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], v[78:79], v[vgprValuC+70:vgprValuC+70+1] // *= ScaleAlphaVecVMulPK(76)(2)
v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+68:vgprValuC+68+1] // C += bias
v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+70:vgprValuC+70+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[68:69], v[4:5]
v_mov_b64 v[70:71], v[6:7]
buffer_store_dwordx4 v[68:71], v113, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */

/******************************************/
/* Global Write Edge Batch #1 (d1,d0,vc1,vc0) = */
/*    (0,1,7,0:vw4); (0,0,8,0:vw4); (0,1,8,0:vw4); (0,0,9,0:vw4); (0,1,9,0:vw4); (0,0,10,0:vw4); (0,1,10,0:vw4); (0,0,11,0:vw4); (0,1,11,0:vw4); (0,0,12,0:vw4); (0,1,12,0:vw4); (0,0,13,0:vw4); (0,1,13,0:vw4); (0,0,14,0:vw4); (0,1,14,0:vw4) */
/******************************************/

/* calc coords, apply mask, and issue loads (if necessary) */
/* ------------------------------------------------------------------------
 * Per-element recipe (identical for every element below):
 *   1. s[78:79] = (coord0 < SizeI) && (coord1 < SizeJ)      per-lane in-bounds mask
 *   2. LDS address VGPR = (coord0 - WorkGroup0*256) << 2
 *   3. D offset VGPR    = (v3 + coord0) << 2, replaced by v8 (BufferOOB) where the mask is 0
 * Register roles as used below: v0 = coord0 for d0=0, v4 = v0 + 128 for d0=1,
 * v1 = coord1, v2 / v3 = C / D row pointers (advanced by StrideC1J / StrideD1J per row).
 * s74 is the low half of s[74:75], which every coord0 v_cmp overwrites, so the
 * WorkGroup0*256 product has to be recomputed for each element.
 * Only the first two elements issue ds_read_b128 (bias at offset 0, scaleAlpha
 * at offset 1024): one pair for coord0 = v0 + 128 into v[72:79], one pair for
 * coord0 = v0 into v[80:87].
 * NOTE(review): the LDS address VGPRs computed for the remaining elements
 *               (v90, v92, ...) have no reader among the instructions visible
 *               in this chunk -- confirm they are needed.
 * ------------------------------------------------------------------------ */
v_mov_b32 v8, BufferOOB
/* (d1,vc1,d0,vc0)=(0,7,1,0) */
s_mov_b32 s74, 128                                 // coordOffset0 d0=1 vc0=0
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0 (s74 was clobbered by the v_cmp above)
v_sub_u32 v10, v4, s74                             // coord0 - wgp0*MT0
v_lshlrev_b32 v10, 0x2, v10                        // Bias address scaled by BPE
ds_read_b128 v[72:75], v10 offset:0                // load Bias
ds_read_b128 v[76:79], v10 offset:1024             // load scaleAlpha
v_add_lshl_u32 v9, v3, v4, 0x2                     // (coutRowPtrD + coord0) * bpe -> byte offset into D
v_cndmask_b32 v9, v8, v9, s[78:79]                 // in-bounds lanes keep the offset, others take v8
/* (d1,vc1,d0,vc0)=(0,8,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 += 1: advance to the next row (vc1)

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[74:75], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0 (s74 was clobbered by the v_cmp above)
v_sub_u32 v88, v0, s74                             // coord0 - wgp0*MT0
v_lshlrev_b32 v88, 0x2, v88                        // Bias address scaled by BPE
ds_read_b128 v[80:83], v88 offset:0                // load Bias
ds_read_b128 v[84:87], v88 offset:1024             // load scaleAlpha
v_add_lshl_u32 v11, v3, v0, 0x2                    // (coutRowPtrD + coord0) * bpe -> byte offset into D
v_cndmask_b32 v11, v8, v11, s[78:79]               // in-bounds lanes keep the offset, others take v8
/* (d1,vc1,d0,vc0)=(0,8,1,0) */
s_mov_b32 s74, 128                                 // coordOffset0 d0=1 vc0=0
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0 (s74 was clobbered by the v_cmp above)
v_sub_u32 v90, v4, s74                             // coord0 - wgp0*MT0
v_lshlrev_b32 v90, 0x2, v90                        // Bias address scaled by BPE
v_add_lshl_u32 v89, v3, v4, 0x2                    // (coutRowPtrD + coord0) * bpe -> byte offset into D
v_cndmask_b32 v89, v8, v89, s[78:79]               // in-bounds lanes keep the offset, others take v8
/* (d1,vc1,d0,vc0)=(0,9,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 += 1: advance to the next row (vc1)

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[74:75], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0 (s74 was clobbered by the v_cmp above)
v_sub_u32 v92, v0, s74                             // coord0 - wgp0*MT0
v_lshlrev_b32 v92, 0x2, v92                        // Bias address scaled by BPE
v_add_lshl_u32 v91, v3, v0, 0x2                    // (coutRowPtrD + coord0) * bpe -> byte offset into D
v_cndmask_b32 v91, v8, v91, s[78:79]               // in-bounds lanes keep the offset, others take v8
/* (d1,vc1,d0,vc0)=(0,9,1,0) */
s_mov_b32 s74, 128                                 // coordOffset0 d0=1 vc0=0
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0 (s74 was clobbered by the v_cmp above)
v_sub_u32 v94, v4, s74                             // coord0 - wgp0*MT0
v_lshlrev_b32 v94, 0x2, v94                        // Bias address scaled by BPE
v_add_lshl_u32 v93, v3, v4, 0x2                    // (coutRowPtrD + coord0) * bpe -> byte offset into D
v_cndmask_b32 v93, v8, v93, s[78:79]               // in-bounds lanes keep the offset, others take v8
/* (d1,vc1,d0,vc0)=(0,10,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 += 1: advance to the next row (vc1)

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[74:75], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0 (s74 was clobbered by the v_cmp above)
v_sub_u32 v96, v0, s74                             // coord0 - wgp0*MT0
v_lshlrev_b32 v96, 0x2, v96                        // Bias address scaled by BPE
v_add_lshl_u32 v95, v3, v0, 0x2                    // (coutRowPtrD + coord0) * bpe -> byte offset into D
v_cndmask_b32 v95, v8, v95, s[78:79]               // in-bounds lanes keep the offset, others take v8
/* (d1,vc1,d0,vc0)=(0,10,1,0) */
s_mov_b32 s74, 128                                 // coordOffset0 d0=1 vc0=0
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0 (s74 was clobbered by the v_cmp above)
v_sub_u32 v98, v4, s74                             // coord0 - wgp0*MT0
v_lshlrev_b32 v98, 0x2, v98                        // Bias address scaled by BPE
v_add_lshl_u32 v97, v3, v4, 0x2                    // (coutRowPtrD + coord0) * bpe -> byte offset into D
v_cndmask_b32 v97, v8, v97, s[78:79]               // in-bounds lanes keep the offset, others take v8
/* (d1,vc1,d0,vc0)=(0,11,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 += 1: advance to the next row (vc1)

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[74:75], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0 (s74 was clobbered by the v_cmp above)
v_sub_u32 v100, v0, s74                            // coord0 - wgp0*MT0
v_lshlrev_b32 v100, 0x2, v100                      // Bias address scaled by BPE
v_add_lshl_u32 v99, v3, v0, 0x2                    // (coutRowPtrD + coord0) * bpe -> byte offset into D
v_cndmask_b32 v99, v8, v99, s[78:79]               // in-bounds lanes keep the offset, others take v8
/* (d1,vc1,d0,vc0)=(0,11,1,0) */
s_mov_b32 s74, 128                                 // coordOffset0 d0=1 vc0=0
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0 (s74 was clobbered by the v_cmp above)
v_sub_u32 v102, v4, s74                            // coord0 - wgp0*MT0
v_lshlrev_b32 v102, 0x2, v102                      // Bias address scaled by BPE
v_add_lshl_u32 v101, v3, v4, 0x2                   // (coutRowPtrD + coord0) * bpe -> byte offset into D
v_cndmask_b32 v101, v8, v101, s[78:79]             // in-bounds lanes keep the offset, others take v8
/* (d1,vc1,d0,vc0)=(0,12,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 += 1: advance to the next row (vc1)

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[74:75], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0 (s74 was clobbered by the v_cmp above)
v_sub_u32 v104, v0, s74                            // coord0 - wgp0*MT0
v_lshlrev_b32 v104, 0x2, v104                      // Bias address scaled by BPE
v_add_lshl_u32 v103, v3, v0, 0x2                   // (coutRowPtrD + coord0) * bpe -> byte offset into D
v_cndmask_b32 v103, v8, v103, s[78:79]             // in-bounds lanes keep the offset, others take v8
/* (d1,vc1,d0,vc0)=(0,12,1,0) */
s_mov_b32 s74, 128                                 // coordOffset0 d0=1 vc0=0
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0 (s74 was clobbered by the v_cmp above)
v_sub_u32 v106, v4, s74                            // coord0 - wgp0*MT0
v_lshlrev_b32 v106, 0x2, v106                      // Bias address scaled by BPE
v_add_lshl_u32 v105, v3, v4, 0x2                   // (coutRowPtrD + coord0) * bpe -> byte offset into D
v_cndmask_b32 v105, v8, v105, s[78:79]             // in-bounds lanes keep the offset, others take v8
/* (d1,vc1,d0,vc0)=(0,13,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 += 1: advance to the next row (vc1)

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[74:75], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0 (s74 was clobbered by the v_cmp above)
v_sub_u32 v108, v0, s74                            // coord0 - wgp0*MT0
v_lshlrev_b32 v108, 0x2, v108                      // Bias address scaled by BPE
v_add_lshl_u32 v107, v3, v0, 0x2                   // (coutRowPtrD + coord0) * bpe -> byte offset into D
v_cndmask_b32 v107, v8, v107, s[78:79]             // in-bounds lanes keep the offset, others take v8
/* (d1,vc1,d0,vc0)=(0,13,1,0) */
s_mov_b32 s74, 128                                 // coordOffset0 d0=1 vc0=0
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0 (s74 was clobbered by the v_cmp above)
v_sub_u32 v110, v4, s74                            // coord0 - wgp0*MT0
v_lshlrev_b32 v110, 0x2, v110                      // Bias address scaled by BPE
v_add_lshl_u32 v109, v3, v4, 0x2                   // (coutRowPtrD + coord0) * bpe -> byte offset into D
v_cndmask_b32 v109, v8, v109, s[78:79]             // in-bounds lanes keep the offset, others take v8
/* (d1,vc1,d0,vc0)=(0,14,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[74:75], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v112, v0, s74
v_lshlrev_b32 v112, 0x2, v112                      // Bias address scaled by BPE
v_add_lshl_u32 v111, v3, v0, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v111, v8, v111, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,14,1,0) */
s_mov_b32 s74, 128                                 // coordOffset0 d0=1 vc0=0
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v114, v4, s74
v_lshlrev_b32 v114, 0x2, v114                      // Bias address scaled by BPE
v_add_lshl_u32 v113, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v113, v8, v113, s[78:79]             // LDD clip if OOB. offset
/* Copy this batch's accumulator results out of the AccVGPR file into ordinary VGPRs
 * v[vgprValuC+12 .. vgprValuC+71]: 60 dwords, i.e. 4 values for each of the 15 batch
 * elements. Within an element the source acc index advances by 4 per value; the
 * sequence wraps from acc125 to acc2, from acc126 to acc3, and so on.
 * The "vreg[N]" number in each trailing comment is the generator's running value index,
 * not a register number. */
v_accvgpr_read_b32 v[vgprValuC+12], acc113         // copy acc to vreg[60]
v_accvgpr_read_b32 v[vgprValuC+13], acc117         // copy acc to vreg[61]
v_accvgpr_read_b32 v[vgprValuC+14], acc121         // copy acc to vreg[62]
v_accvgpr_read_b32 v[vgprValuC+15], acc125         // copy acc to vreg[63]
v_accvgpr_read_b32 v[vgprValuC+16], acc2           // copy acc to vreg[64]
v_accvgpr_read_b32 v[vgprValuC+17], acc6           // copy acc to vreg[65]
v_accvgpr_read_b32 v[vgprValuC+18], acc10          // copy acc to vreg[66]
v_accvgpr_read_b32 v[vgprValuC+19], acc14          // copy acc to vreg[67]
v_accvgpr_read_b32 v[vgprValuC+20], acc18          // copy acc to vreg[68]
v_accvgpr_read_b32 v[vgprValuC+21], acc22          // copy acc to vreg[69]
v_accvgpr_read_b32 v[vgprValuC+22], acc26          // copy acc to vreg[70]
v_accvgpr_read_b32 v[vgprValuC+23], acc30          // copy acc to vreg[71]
v_accvgpr_read_b32 v[vgprValuC+24], acc34          // copy acc to vreg[72]
v_accvgpr_read_b32 v[vgprValuC+25], acc38          // copy acc to vreg[73]
v_accvgpr_read_b32 v[vgprValuC+26], acc42          // copy acc to vreg[74]
v_accvgpr_read_b32 v[vgprValuC+27], acc46          // copy acc to vreg[75]
v_accvgpr_read_b32 v[vgprValuC+28], acc50          // copy acc to vreg[76]
v_accvgpr_read_b32 v[vgprValuC+29], acc54          // copy acc to vreg[77]
v_accvgpr_read_b32 v[vgprValuC+30], acc58          // copy acc to vreg[78]
v_accvgpr_read_b32 v[vgprValuC+31], acc62          // copy acc to vreg[79]
v_accvgpr_read_b32 v[vgprValuC+32], acc66          // copy acc to vreg[80]
v_accvgpr_read_b32 v[vgprValuC+33], acc70          // copy acc to vreg[81]
v_accvgpr_read_b32 v[vgprValuC+34], acc74          // copy acc to vreg[82]
v_accvgpr_read_b32 v[vgprValuC+35], acc78          // copy acc to vreg[83]
v_accvgpr_read_b32 v[vgprValuC+36], acc82          // copy acc to vreg[84]
v_accvgpr_read_b32 v[vgprValuC+37], acc86          // copy acc to vreg[85]
v_accvgpr_read_b32 v[vgprValuC+38], acc90          // copy acc to vreg[86]
v_accvgpr_read_b32 v[vgprValuC+39], acc94          // copy acc to vreg[87]
v_accvgpr_read_b32 v[vgprValuC+40], acc98          // copy acc to vreg[88]
v_accvgpr_read_b32 v[vgprValuC+41], acc102         // copy acc to vreg[89]
v_accvgpr_read_b32 v[vgprValuC+42], acc106         // copy acc to vreg[90]
v_accvgpr_read_b32 v[vgprValuC+43], acc110         // copy acc to vreg[91]
v_accvgpr_read_b32 v[vgprValuC+44], acc114         // copy acc to vreg[92]
v_accvgpr_read_b32 v[vgprValuC+45], acc118         // copy acc to vreg[93]
v_accvgpr_read_b32 v[vgprValuC+46], acc122         // copy acc to vreg[94]
v_accvgpr_read_b32 v[vgprValuC+47], acc126         // copy acc to vreg[95]
v_accvgpr_read_b32 v[vgprValuC+48], acc3           // copy acc to vreg[96]
v_accvgpr_read_b32 v[vgprValuC+49], acc7           // copy acc to vreg[97]
v_accvgpr_read_b32 v[vgprValuC+50], acc11          // copy acc to vreg[98]
v_accvgpr_read_b32 v[vgprValuC+51], acc15          // copy acc to vreg[99]
v_accvgpr_read_b32 v[vgprValuC+52], acc19          // copy acc to vreg[100]
v_accvgpr_read_b32 v[vgprValuC+53], acc23          // copy acc to vreg[101]
v_accvgpr_read_b32 v[vgprValuC+54], acc27          // copy acc to vreg[102]
v_accvgpr_read_b32 v[vgprValuC+55], acc31          // copy acc to vreg[103]
v_accvgpr_read_b32 v[vgprValuC+56], acc35          // copy acc to vreg[104]
v_accvgpr_read_b32 v[vgprValuC+57], acc39          // copy acc to vreg[105]
v_accvgpr_read_b32 v[vgprValuC+58], acc43          // copy acc to vreg[106]
v_accvgpr_read_b32 v[vgprValuC+59], acc47          // copy acc to vreg[107]
v_accvgpr_read_b32 v[vgprValuC+60], acc51          // copy acc to vreg[108]
v_accvgpr_read_b32 v[vgprValuC+61], acc55          // copy acc to vreg[109]
v_accvgpr_read_b32 v[vgprValuC+62], acc59          // copy acc to vreg[110]
v_accvgpr_read_b32 v[vgprValuC+63], acc63          // copy acc to vreg[111]
v_accvgpr_read_b32 v[vgprValuC+64], acc67          // copy acc to vreg[112]
v_accvgpr_read_b32 v[vgprValuC+65], acc71          // copy acc to vreg[113]
v_accvgpr_read_b32 v[vgprValuC+66], acc75          // copy acc to vreg[114]
v_accvgpr_read_b32 v[vgprValuC+67], acc79          // copy acc to vreg[115]
v_accvgpr_read_b32 v[vgprValuC+68], acc83          // copy acc to vreg[116]
v_accvgpr_read_b32 v[vgprValuC+69], acc87          // copy acc to vreg[117]
v_accvgpr_read_b32 v[vgprValuC+70], acc91          // copy acc to vreg[118]
v_accvgpr_read_b32 v[vgprValuC+71], acc95          // copy acc to vreg[119]

/* Scale all 60 copied values (v[vgprValuC+12 .. +71]) by alpha, two floats per
 * instruction. The first source is the SGPR pair s[sgprAlpha:sgprAlpha+1]; with
 * op_sel_hi:[0,1,1] the high result lane also takes its first operand from the low
 * dword of that pair, so both packed lanes are multiplied by the same s[sgprAlpha]
 * value (packed-math operand selection as described in the CDNA ISA guide).
 * The closing s_waitcnt lgkmcnt(0) blocks until outstanding LDS/scalar-memory
 * operations have completed before the bias and scaleAlphaVec registers are used.
 * NOTE(review): the ds_read instructions for this batch are issued before this view
 * begins - presumably they fill v[72:87]; confirm. */
/* rC *= alpha batchElements=[(0, 1, 7, 0), (0, 0, 8, 0), (0, 1, 8, 0), (0, 0, 9, 0), (0, 1, 9, 0), (0, 0, 10, 0), (0, 1, 10, 0), (0, 0, 11, 0), (0, 1, 11, 0), (0, 0, 12, 0), (0, 1, 12, 0), (0, 0, 13, 0), (0, 1, 13, 0), (0, 0, 14, 0), (0, 1, 14, 0)] */
v_pk_mul_f32 v[vgprValuC+12:vgprValuC+12+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+12:vgprValuC+12+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+14:vgprValuC+14+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+14:vgprValuC+14+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+16:vgprValuC+16+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+16:vgprValuC+16+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+18:vgprValuC+18+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+18:vgprValuC+18+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+20:vgprValuC+20+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+20:vgprValuC+20+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+22:vgprValuC+22+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+22:vgprValuC+22+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+64:vgprValuC+64+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+68:vgprValuC+68+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+70:vgprValuC+70+1] op_sel_hi:[0,1,1] // *= alpha (pk)
s_waitcnt lgkmcnt(0)                               // wait for Bias LDS, ScaleAlphaVec

/* Finish and write out the 15 elements of Edge Batch #1. Each element is an identical
 * 8-instruction group operating on 4 consecutive ValuC registers:
 *   1. two v_pk_mul_f32: multiply the 4 values by the scaleAlphaVec registers
 *      (v[76:79] or v[84:87]);
 *   2. two v_pk_add_f32: add the bias registers (v[72:75] or v[80:83]) and place the
 *      sums in the scratch quad v[4:7] (v4 served as the d0=1 coord0 scratch during
 *      address setup and is free again here);
 *   3. s_swappc_b64 s[64:65], s[8:9]: store the return PC in s[64:65] and jump to the
 *      address held in s[8:9];
 *   4. two v_mov_b64: copy v[4:7] back into the element's own registers;
 *   5. buffer_store_dwordx4: write those 4 dwords to D through s[sgprSrdD:sgprSrdD+3],
 *      using the per-lane byte offset prepared during address setup
 *      (v9, v11, v89, v91, ..., v113).
 * Elements alternate between the two bias/scaleAlphaVec register sets in step with the
 * alternating d0 index of the batch list (d0=1 first, then d0=0, ...).
 * The destination registers are written as plain v[12:13] .. v[70:71] after the call
 * while the arithmetic uses vgprValuC+N; the two only line up if vgprValuC == 0.
 * NOTE(review): assumes the routine at s[8:9] consumes and returns its data in v[4:7]
 * and returns through s[64:65] - presumably the activation function; confirm against
 * the routine itself, which is not in view.
 * NOTE(review): out-of-range lanes carry the v8 substitute offset; presumably the buffer
 * descriptor range check drops those stores - confirm. */
/* apply mask, calc new C and issue writes */
v_pk_mul_f32 v[vgprValuC+12:vgprValuC+12+1], v[76:77], v[vgprValuC+12:vgprValuC+12+1] // *= ScaleAlphaVecVMulPK(76)(0)
v_pk_mul_f32 v[vgprValuC+14:vgprValuC+14+1], v[78:79], v[vgprValuC+14:vgprValuC+14+1] // *= ScaleAlphaVecVMulPK(76)(2)
v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+12:vgprValuC+12+1] // C += bias
v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+14:vgprValuC+14+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[12:13], v[4:5]
v_mov_b64 v[14:15], v[6:7]
buffer_store_dwordx4 v[12:15], v9, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_pk_mul_f32 v[vgprValuC+16:vgprValuC+16+1], v[84:85], v[vgprValuC+16:vgprValuC+16+1] // *= ScaleAlphaVecVMulPK(84)(0)
v_pk_mul_f32 v[vgprValuC+18:vgprValuC+18+1], v[86:87], v[vgprValuC+18:vgprValuC+18+1] // *= ScaleAlphaVecVMulPK(84)(2)
v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+16:vgprValuC+16+1] // C += bias
v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+18:vgprValuC+18+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[16:17], v[4:5]
v_mov_b64 v[18:19], v[6:7]
buffer_store_dwordx4 v[16:19], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_pk_mul_f32 v[vgprValuC+20:vgprValuC+20+1], v[76:77], v[vgprValuC+20:vgprValuC+20+1] // *= ScaleAlphaVecVMulPK(76)(0)
v_pk_mul_f32 v[vgprValuC+22:vgprValuC+22+1], v[78:79], v[vgprValuC+22:vgprValuC+22+1] // *= ScaleAlphaVecVMulPK(76)(2)
v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+20:vgprValuC+20+1] // C += bias
v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+22:vgprValuC+22+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[20:21], v[4:5]
v_mov_b64 v[22:23], v[6:7]
buffer_store_dwordx4 v[20:23], v89, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], v[84:85], v[vgprValuC+24:vgprValuC+24+1] // *= ScaleAlphaVecVMulPK(84)(0)
v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], v[86:87], v[vgprValuC+26:vgprValuC+26+1] // *= ScaleAlphaVecVMulPK(84)(2)
v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+24:vgprValuC+24+1] // C += bias
v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+26:vgprValuC+26+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[24:25], v[4:5]
v_mov_b64 v[26:27], v[6:7]
buffer_store_dwordx4 v[24:27], v91, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], v[76:77], v[vgprValuC+28:vgprValuC+28+1] // *= ScaleAlphaVecVMulPK(76)(0)
v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], v[78:79], v[vgprValuC+30:vgprValuC+30+1] // *= ScaleAlphaVecVMulPK(76)(2)
v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+28:vgprValuC+28+1] // C += bias
v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+30:vgprValuC+30+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[28:29], v[4:5]
v_mov_b64 v[30:31], v[6:7]
buffer_store_dwordx4 v[28:31], v93, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], v[84:85], v[vgprValuC+32:vgprValuC+32+1] // *= ScaleAlphaVecVMulPK(84)(0)
v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], v[86:87], v[vgprValuC+34:vgprValuC+34+1] // *= ScaleAlphaVecVMulPK(84)(2)
v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+32:vgprValuC+32+1] // C += bias
v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+34:vgprValuC+34+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[32:33], v[4:5]
v_mov_b64 v[34:35], v[6:7]
buffer_store_dwordx4 v[32:35], v95, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], v[76:77], v[vgprValuC+36:vgprValuC+36+1] // *= ScaleAlphaVecVMulPK(76)(0)
v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], v[78:79], v[vgprValuC+38:vgprValuC+38+1] // *= ScaleAlphaVecVMulPK(76)(2)
v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+36:vgprValuC+36+1] // C += bias
v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+38:vgprValuC+38+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[36:37], v[4:5]
v_mov_b64 v[38:39], v[6:7]
buffer_store_dwordx4 v[36:39], v97, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], v[84:85], v[vgprValuC+40:vgprValuC+40+1] // *= ScaleAlphaVecVMulPK(84)(0)
v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], v[86:87], v[vgprValuC+42:vgprValuC+42+1] // *= ScaleAlphaVecVMulPK(84)(2)
v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+40:vgprValuC+40+1] // C += bias
v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+42:vgprValuC+42+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[40:41], v[4:5]
v_mov_b64 v[42:43], v[6:7]
buffer_store_dwordx4 v[40:43], v99, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], v[76:77], v[vgprValuC+44:vgprValuC+44+1] // *= ScaleAlphaVecVMulPK(76)(0)
v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], v[78:79], v[vgprValuC+46:vgprValuC+46+1] // *= ScaleAlphaVecVMulPK(76)(2)
v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+44:vgprValuC+44+1] // C += bias
v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+46:vgprValuC+46+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[44:45], v[4:5]
v_mov_b64 v[46:47], v[6:7]
buffer_store_dwordx4 v[44:47], v101, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], v[84:85], v[vgprValuC+48:vgprValuC+48+1] // *= ScaleAlphaVecVMulPK(84)(0)
v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], v[86:87], v[vgprValuC+50:vgprValuC+50+1] // *= ScaleAlphaVecVMulPK(84)(2)
v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+48:vgprValuC+48+1] // C += bias
v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+50:vgprValuC+50+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[48:49], v[4:5]
v_mov_b64 v[50:51], v[6:7]
buffer_store_dwordx4 v[48:51], v103, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], v[76:77], v[vgprValuC+52:vgprValuC+52+1] // *= ScaleAlphaVecVMulPK(76)(0)
v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], v[78:79], v[vgprValuC+54:vgprValuC+54+1] // *= ScaleAlphaVecVMulPK(76)(2)
v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+52:vgprValuC+52+1] // C += bias
v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+54:vgprValuC+54+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[52:53], v[4:5]
v_mov_b64 v[54:55], v[6:7]
buffer_store_dwordx4 v[52:55], v105, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], v[84:85], v[vgprValuC+56:vgprValuC+56+1] // *= ScaleAlphaVecVMulPK(84)(0)
v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], v[86:87], v[vgprValuC+58:vgprValuC+58+1] // *= ScaleAlphaVecVMulPK(84)(2)
v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+56:vgprValuC+56+1] // C += bias
v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+58:vgprValuC+58+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[56:57], v[4:5]
v_mov_b64 v[58:59], v[6:7]
buffer_store_dwordx4 v[56:59], v107, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], v[76:77], v[vgprValuC+60:vgprValuC+60+1] // *= ScaleAlphaVecVMulPK(76)(0)
v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], v[78:79], v[vgprValuC+62:vgprValuC+62+1] // *= ScaleAlphaVecVMulPK(76)(2)
v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+60:vgprValuC+60+1] // C += bias
v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+62:vgprValuC+62+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[60:61], v[4:5]
v_mov_b64 v[62:63], v[6:7]
buffer_store_dwordx4 v[60:63], v109, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], v[84:85], v[vgprValuC+64:vgprValuC+64+1] // *= ScaleAlphaVecVMulPK(84)(0)
v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], v[86:87], v[vgprValuC+66:vgprValuC+66+1] // *= ScaleAlphaVecVMulPK(84)(2)
v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+64:vgprValuC+64+1] // C += bias
v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+66:vgprValuC+66+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[64:65], v[4:5]
v_mov_b64 v[66:67], v[6:7]
buffer_store_dwordx4 v[64:67], v111, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], v[76:77], v[vgprValuC+68:vgprValuC+68+1] // *= ScaleAlphaVecVMulPK(76)(0)
v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], v[78:79], v[vgprValuC+70:vgprValuC+70+1] // *= ScaleAlphaVecVMulPK(76)(2)
v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+68:vgprValuC+68+1] // C += bias
v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+70:vgprValuC+70+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[68:69], v[4:5]
v_mov_b64 v[70:71], v[6:7]
buffer_store_dwordx4 v[68:71], v113, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */

/******************************************/
/* Global Write Edge Batch #2 (d1,d0,vc1,vc0) = */
/*    (0,0,15,0:vw4); (0,1,15,0:vw4); (1,0,0,0:vw4); (1,1,0,0:vw4); (1,0,1,0:vw4); (1,1,1,0:vw4); (1,0,2,0:vw4); (1,1,2,0:vw4); (1,0,3,0:vw4); (1,1,3,0:vw4); (1,0,4,0:vw4); (1,1,4,0:vw4); (1,0,5,0:vw4); (1,1,5,0:vw4); (1,0,6,0:vw4) */
/******************************************/

/* Edge Batch #2, first two elements (row vc1=15 of d1=0, d0=0 and d0=1).
 * v8 is loaded with BufferOOB; the v_cndmask_b32 instructions below substitute it for
 * the store offset in every lane that fails the coord0 < SizeI && coord1 < SizeJ test.
 * These two elements are the only ones in view that issue LDS reads: each reads
 * 16 bytes of bias at its tile-relative address ((coord0 - 256*WorkGroup0) << 2) and
 * 16 bytes of scaleAlpha at the same address + 1024.
 *   d0=0 -> bias v[72:75], scaleAlpha v[76:79]
 *   d0=1 -> bias v[80:83], scaleAlpha v[84:87]
 * No s_waitcnt follows the ds_read instructions inside this view, so v[72:87] must not
 * be consumed until a later lgkmcnt wait.
 * NOTE(review): the 1024-byte offset equals 256 entries * 4 bytes - presumably the
 * scaleAlphaVec array sits directly after one macro-tile row of bias in LDS; confirm. */
/* calc coords, apply mask, and issue loads (if necessary) */
v_mov_b32 v8, BufferOOB
/* (d1,vc1,d0,vc0)=(0,15,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[74:75], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v10, v0, s74
v_lshlrev_b32 v10, 0x2, v10                        // Bias address scaled by BPE
ds_read_b128 v[72:75], v10 offset:0                // load Bias
ds_read_b128 v[76:79], v10 offset:1024             // load scaleAlpha
v_add_lshl_u32 v9, v3, v0, 0x2                     // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v9, v8, v9, s[78:79]                 // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,15,1,0) */
s_mov_b32 s74, 128                                 // coordOffset0 d0=1 vc0=0
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v88, v4, s74
v_lshlrev_b32 v88, 0x2, v88                        // Bias address scaled by BPE
ds_read_b128 v[80:83], v88 offset:0                // load Bias
ds_read_b128 v[84:87], v88 offset:1024             // load scaleAlpha
v_add_lshl_u32 v11, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v11, v8, v11, s[78:79]               // LDD clip if OOB. offset
/* Edge Batch #2 address setup for the d1=1 elements, rows vc1=0..5, both d0 halves.
 * The first element jumps rather than steps: coord1 (v1) += 113 and the row pointers
 * v2 / v3 advance by 113 * StrideC1J / 113 * StrideD1J. The generator's trailing comment
 * on that s_mov_b32 says "d1=0", but the element header is (d1,vc1,d0,vc0)=(1,0,0,0).
 * NOTE(review): 113 plus the 15 single-row steps taken for vc1=1..15 makes 128 rows -
 * presumably the distance between the d1=0 and d1=1 sub-tiles; confirm.
 * After the jump every element follows the same recipe:
 *   1. d0=0 elements step one row (v1 += 1, v2 += StrideC1J, v3 += StrideD1J);
 *      d0=1 elements form coord0 + 128 in the scratch register v4.
 *   2. s[78:79] = (coord0 < SizeI) && (coord1 < SizeJ), a per-lane in-bounds mask.
 *   3. (coord0 - 256*WorkGroup0) << 2 goes to an even-numbered VGPR ("Bias address").
 *   4. (v3 + coord0) << 2 goes to an odd-numbered VGPR (v89..v111); v_cndmask_b32 keeps
 *      it for in-bounds lanes and substitutes v8 otherwise.
 * v3 is the D row pointer ("coutRowPtrD"), so the odd-numbered results are offsets
 * relative to D even though the trailing comment says "Cin addr".
 * s74 is reused as scratch throughout, so statement order matters.
 * NOTE(review): the even-numbered "Bias address" VGPRs written here (v90..v112) are not
 * read by any instruction in view - possibly dead generator output; confirm against the
 * code that follows this chunk before removing. */
/* (d1,vc1,d0,vc0)=(1,0,0,0) */
s_mov_b32 s74, 113                                 // rowInc d1=0 vc1=0
v_add_co_u32 v1, vcc, v1, s74                      // coord1.2: coord1 += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
s_mul_i32 s74, s[sgprStrideC1J], 113               // scale stride
v_add_i32 v2, v2, s74                              // ROWINC- Move cinRowPtr to next row
s_mul_i32 s74, s[sgprStrideD1J], 113               // scale stride
v_add_i32 v3, v3, s74                              // Move coutRowPtrD to next row
v_cmp_lt_u32 s[74:75], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v90, v0, s74
v_lshlrev_b32 v90, 0x2, v90                        // Bias address scaled by BPE
v_add_lshl_u32 v89, v3, v0, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v89, v8, v89, s[78:79]               // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,0,1,0) */
s_mov_b32 s74, 128                                 // coordOffset0 d0=1 vc0=0
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v92, v4, s74
v_lshlrev_b32 v92, 0x2, v92                        // Bias address scaled by BPE
v_add_lshl_u32 v91, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v91, v8, v91, s[78:79]               // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,1,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[74:75], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v94, v0, s74
v_lshlrev_b32 v94, 0x2, v94                        // Bias address scaled by BPE
v_add_lshl_u32 v93, v3, v0, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v93, v8, v93, s[78:79]               // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,1,1,0) */
s_mov_b32 s74, 128                                 // coordOffset0 d0=1 vc0=0
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v96, v4, s74
v_lshlrev_b32 v96, 0x2, v96                        // Bias address scaled by BPE
v_add_lshl_u32 v95, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v95, v8, v95, s[78:79]               // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,2,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[74:75], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v98, v0, s74
v_lshlrev_b32 v98, 0x2, v98                        // Bias address scaled by BPE
v_add_lshl_u32 v97, v3, v0, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v97, v8, v97, s[78:79]               // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,2,1,0) */
s_mov_b32 s74, 128                                 // coordOffset0 d0=1 vc0=0
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v100, v4, s74
v_lshlrev_b32 v100, 0x2, v100                      // Bias address scaled by BPE
v_add_lshl_u32 v99, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v99, v8, v99, s[78:79]               // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,3,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[74:75], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v102, v0, s74
v_lshlrev_b32 v102, 0x2, v102                      // Bias address scaled by BPE
v_add_lshl_u32 v101, v3, v0, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v101, v8, v101, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,3,1,0) */
s_mov_b32 s74, 128                                 // coordOffset0 d0=1 vc0=0
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v104, v4, s74
v_lshlrev_b32 v104, 0x2, v104                      // Bias address scaled by BPE
v_add_lshl_u32 v103, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v103, v8, v103, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,4,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[74:75], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v106, v0, s74
v_lshlrev_b32 v106, 0x2, v106                      // Bias address scaled by BPE
v_add_lshl_u32 v105, v3, v0, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v105, v8, v105, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,4,1,0) */
s_mov_b32 s74, 128                                 // coordOffset0 d0=1 vc0=0
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v108, v4, s74
v_lshlrev_b32 v108, 0x2, v108                      // Bias address scaled by BPE
v_add_lshl_u32 v107, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v107, v8, v107, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,5,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[74:75], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v110, v0, s74
v_lshlrev_b32 v110, 0x2, v110                      // Bias address scaled by BPE
v_add_lshl_u32 v109, v3, v0, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v109, v8, v109, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,5,1,0) */
s_mov_b32 s74, 128                                 // coordOffset0 d0=1 vc0=0
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v112, v4, s74
v_lshlrev_b32 v112, 0x2, v112                      // Bias address scaled by BPE
v_add_lshl_u32 v111, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v111, v8, v111, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,6,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[74:75], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v114, v0, s74
v_lshlrev_b32 v114, 0x2, v114                      // Bias address scaled by BPE
v_add_lshl_u32 v113, v3, v0, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v113, v8, v113, s[78:79]             // LDD clip if OOB. offset
/*
 * Copy the 60 accumulator values of this batch into v[vgprValuC+12 .. +71].
 * Every four consecutive destination registers form one dwordx4 that is
 * scaled, biased and stored later in this batch (15 elements x 4 values).
 * Source registers step by 4 within a run: acc99..acc127, then
 * acc128..acc252, then acc129..acc205.
 * The "vreg[N]" numbers in the trailing comments are the generator's own
 * running index, not the destination VGPR number.
 */
v_accvgpr_read_b32 v[vgprValuC+12], acc99          // copy acc to vreg[120]
v_accvgpr_read_b32 v[vgprValuC+13], acc103         // copy acc to vreg[121]
v_accvgpr_read_b32 v[vgprValuC+14], acc107         // copy acc to vreg[122]
v_accvgpr_read_b32 v[vgprValuC+15], acc111         // copy acc to vreg[123]
v_accvgpr_read_b32 v[vgprValuC+16], acc115         // copy acc to vreg[124]
v_accvgpr_read_b32 v[vgprValuC+17], acc119         // copy acc to vreg[125]
v_accvgpr_read_b32 v[vgprValuC+18], acc123         // copy acc to vreg[126]
v_accvgpr_read_b32 v[vgprValuC+19], acc127         // copy acc to vreg[127]
/* second run: acc128, acc132, ... acc252 */
v_accvgpr_read_b32 v[vgprValuC+20], acc128         // copy acc to vreg[128]
v_accvgpr_read_b32 v[vgprValuC+21], acc132         // copy acc to vreg[129]
v_accvgpr_read_b32 v[vgprValuC+22], acc136         // copy acc to vreg[130]
v_accvgpr_read_b32 v[vgprValuC+23], acc140         // copy acc to vreg[131]
v_accvgpr_read_b32 v[vgprValuC+24], acc144         // copy acc to vreg[132]
v_accvgpr_read_b32 v[vgprValuC+25], acc148         // copy acc to vreg[133]
v_accvgpr_read_b32 v[vgprValuC+26], acc152         // copy acc to vreg[134]
v_accvgpr_read_b32 v[vgprValuC+27], acc156         // copy acc to vreg[135]
v_accvgpr_read_b32 v[vgprValuC+28], acc160         // copy acc to vreg[136]
v_accvgpr_read_b32 v[vgprValuC+29], acc164         // copy acc to vreg[137]
v_accvgpr_read_b32 v[vgprValuC+30], acc168         // copy acc to vreg[138]
v_accvgpr_read_b32 v[vgprValuC+31], acc172         // copy acc to vreg[139]
v_accvgpr_read_b32 v[vgprValuC+32], acc176         // copy acc to vreg[140]
v_accvgpr_read_b32 v[vgprValuC+33], acc180         // copy acc to vreg[141]
v_accvgpr_read_b32 v[vgprValuC+34], acc184         // copy acc to vreg[142]
v_accvgpr_read_b32 v[vgprValuC+35], acc188         // copy acc to vreg[143]
v_accvgpr_read_b32 v[vgprValuC+36], acc192         // copy acc to vreg[144]
v_accvgpr_read_b32 v[vgprValuC+37], acc196         // copy acc to vreg[145]
v_accvgpr_read_b32 v[vgprValuC+38], acc200         // copy acc to vreg[146]
v_accvgpr_read_b32 v[vgprValuC+39], acc204         // copy acc to vreg[147]
v_accvgpr_read_b32 v[vgprValuC+40], acc208         // copy acc to vreg[148]
v_accvgpr_read_b32 v[vgprValuC+41], acc212         // copy acc to vreg[149]
v_accvgpr_read_b32 v[vgprValuC+42], acc216         // copy acc to vreg[150]
v_accvgpr_read_b32 v[vgprValuC+43], acc220         // copy acc to vreg[151]
v_accvgpr_read_b32 v[vgprValuC+44], acc224         // copy acc to vreg[152]
v_accvgpr_read_b32 v[vgprValuC+45], acc228         // copy acc to vreg[153]
v_accvgpr_read_b32 v[vgprValuC+46], acc232         // copy acc to vreg[154]
v_accvgpr_read_b32 v[vgprValuC+47], acc236         // copy acc to vreg[155]
v_accvgpr_read_b32 v[vgprValuC+48], acc240         // copy acc to vreg[156]
v_accvgpr_read_b32 v[vgprValuC+49], acc244         // copy acc to vreg[157]
v_accvgpr_read_b32 v[vgprValuC+50], acc248         // copy acc to vreg[158]
v_accvgpr_read_b32 v[vgprValuC+51], acc252         // copy acc to vreg[159]
/* third run: acc129, acc133, ... acc205 */
v_accvgpr_read_b32 v[vgprValuC+52], acc129         // copy acc to vreg[160]
v_accvgpr_read_b32 v[vgprValuC+53], acc133         // copy acc to vreg[161]
v_accvgpr_read_b32 v[vgprValuC+54], acc137         // copy acc to vreg[162]
v_accvgpr_read_b32 v[vgprValuC+55], acc141         // copy acc to vreg[163]
v_accvgpr_read_b32 v[vgprValuC+56], acc145         // copy acc to vreg[164]
v_accvgpr_read_b32 v[vgprValuC+57], acc149         // copy acc to vreg[165]
v_accvgpr_read_b32 v[vgprValuC+58], acc153         // copy acc to vreg[166]
v_accvgpr_read_b32 v[vgprValuC+59], acc157         // copy acc to vreg[167]
v_accvgpr_read_b32 v[vgprValuC+60], acc161         // copy acc to vreg[168]
v_accvgpr_read_b32 v[vgprValuC+61], acc165         // copy acc to vreg[169]
v_accvgpr_read_b32 v[vgprValuC+62], acc169         // copy acc to vreg[170]
v_accvgpr_read_b32 v[vgprValuC+63], acc173         // copy acc to vreg[171]
v_accvgpr_read_b32 v[vgprValuC+64], acc177         // copy acc to vreg[172]
v_accvgpr_read_b32 v[vgprValuC+65], acc181         // copy acc to vreg[173]
v_accvgpr_read_b32 v[vgprValuC+66], acc185         // copy acc to vreg[174]
v_accvgpr_read_b32 v[vgprValuC+67], acc189         // copy acc to vreg[175]
v_accvgpr_read_b32 v[vgprValuC+68], acc193         // copy acc to vreg[176]
v_accvgpr_read_b32 v[vgprValuC+69], acc197         // copy acc to vreg[177]
v_accvgpr_read_b32 v[vgprValuC+70], acc201         // copy acc to vreg[178]
v_accvgpr_read_b32 v[vgprValuC+71], acc205         // copy acc to vreg[179]

/* rC *= alpha batchElements=[(0, 0, 15, 0), (0, 1, 15, 0), (1, 0, 0, 0), (1, 1, 0, 0), (1, 0, 1, 0), (1, 1, 1, 0), (1, 0, 2, 0), (1, 1, 2, 0), (1, 0, 3, 0), (1, 1, 3, 0), (1, 0, 4, 0), (1, 1, 4, 0), (1, 0, 5, 0), (1, 1, 5, 0), (1, 0, 6, 0)] */
/*
 * Multiply all 60 values in v[vgprValuC+12 .. +71] by alpha in place, two
 * floats per packed instruction (30 instructions).
 * op_sel_hi:[0,1,1] clears the high-select bit for source 0 only, so both
 * packed lanes read the low dword s[sgprAlpha]; s[sgprAlpha+1] is named only
 * to form the 64-bit operand. NOTE(review): confirm against the ISA guide.
 */
v_pk_mul_f32 v[vgprValuC+12:vgprValuC+12+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+12:vgprValuC+12+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+14:vgprValuC+14+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+14:vgprValuC+14+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+16:vgprValuC+16+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+16:vgprValuC+16+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+18:vgprValuC+18+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+18:vgprValuC+18+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+20:vgprValuC+20+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+20:vgprValuC+20+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+22:vgprValuC+22+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+22:vgprValuC+22+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+64:vgprValuC+64+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+68:vgprValuC+68+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+70:vgprValuC+70+1] op_sel_hi:[0,1,1] // *= alpha (pk)
/* The instructions that follow read v[72:87], the bias / scaleAlphaVec values fetched from LDS, so wait for those reads to complete first. */
s_waitcnt lgkmcnt(0)                               // wait for Bias LDS, ScaleAlphaVec

/* apply mask, calc new C and issue writes */
/*
 * Fifteen identical 8-instruction groups, one per dwordx4 element:
 *   1. value *= scaleAlphaVec   (v[76:79] and v[84:87] alternate per group)
 *   2. v[4:7] = bias + value    (v[72:75] and v[80:83] alternate per group)
 *   3. s_swappc_b64 jumps to the address held in s[8:9] and saves the return
 *      address in s[64:65]. NOTE(review): presumably the activation routine,
 *      working in place on v[4:7] - its body and clobbers are not visible
 *      here, confirm.
 *   4. v[4:7] is copied back into the element's value registers and written
 *      with buffer_store_dwordx4 through SrdD at the per-lane offset VGPR
 *      (v9, v11, v89, v91, ... v113) prepared by the address-setup code.
 * Groups alternate between the two bias/scale register sets, starting with
 * v[72:79]; the batch element list alternates d0=0 / d0=1 in the same order.
 * NOTE(review): the v_mov_b64 / store operands use literal register numbers
 * (v[12:13] ...) while the math uses v[vgprValuC+12 ...]; they only agree if
 * vgprValuC == 0 - confirm.
 */
v_pk_mul_f32 v[vgprValuC+12:vgprValuC+12+1], v[76:77], v[vgprValuC+12:vgprValuC+12+1] // *= ScaleAlphaVecVMulPK(76)(0)
v_pk_mul_f32 v[vgprValuC+14:vgprValuC+14+1], v[78:79], v[vgprValuC+14:vgprValuC+14+1] // *= ScaleAlphaVecVMulPK(76)(2)
v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+12:vgprValuC+12+1] // C += bias
v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+14:vgprValuC+14+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]                      // call routine at s[8:9]; return PC saved in s[64:65]
v_mov_b64 v[12:13], v[4:5]                         // copy result back from v[4:5]
v_mov_b64 v[14:15], v[6:7]                         // copy result back from v[6:7]
buffer_store_dwordx4 v[12:15], v9, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_pk_mul_f32 v[vgprValuC+16:vgprValuC+16+1], v[84:85], v[vgprValuC+16:vgprValuC+16+1] // *= ScaleAlphaVecVMulPK(84)(0)
v_pk_mul_f32 v[vgprValuC+18:vgprValuC+18+1], v[86:87], v[vgprValuC+18:vgprValuC+18+1] // *= ScaleAlphaVecVMulPK(84)(2)
v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+16:vgprValuC+16+1] // C += bias
v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+18:vgprValuC+18+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[16:17], v[4:5]
v_mov_b64 v[18:19], v[6:7]
buffer_store_dwordx4 v[16:19], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_pk_mul_f32 v[vgprValuC+20:vgprValuC+20+1], v[76:77], v[vgprValuC+20:vgprValuC+20+1] // *= ScaleAlphaVecVMulPK(76)(0)
v_pk_mul_f32 v[vgprValuC+22:vgprValuC+22+1], v[78:79], v[vgprValuC+22:vgprValuC+22+1] // *= ScaleAlphaVecVMulPK(76)(2)
v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+20:vgprValuC+20+1] // C += bias
v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+22:vgprValuC+22+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[20:21], v[4:5]
v_mov_b64 v[22:23], v[6:7]
buffer_store_dwordx4 v[20:23], v89, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], v[84:85], v[vgprValuC+24:vgprValuC+24+1] // *= ScaleAlphaVecVMulPK(84)(0)
v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], v[86:87], v[vgprValuC+26:vgprValuC+26+1] // *= ScaleAlphaVecVMulPK(84)(2)
v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+24:vgprValuC+24+1] // C += bias
v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+26:vgprValuC+26+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[24:25], v[4:5]
v_mov_b64 v[26:27], v[6:7]
buffer_store_dwordx4 v[24:27], v91, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], v[76:77], v[vgprValuC+28:vgprValuC+28+1] // *= ScaleAlphaVecVMulPK(76)(0)
v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], v[78:79], v[vgprValuC+30:vgprValuC+30+1] // *= ScaleAlphaVecVMulPK(76)(2)
v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+28:vgprValuC+28+1] // C += bias
v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+30:vgprValuC+30+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[28:29], v[4:5]
v_mov_b64 v[30:31], v[6:7]
buffer_store_dwordx4 v[28:31], v93, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], v[84:85], v[vgprValuC+32:vgprValuC+32+1] // *= ScaleAlphaVecVMulPK(84)(0)
v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], v[86:87], v[vgprValuC+34:vgprValuC+34+1] // *= ScaleAlphaVecVMulPK(84)(2)
v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+32:vgprValuC+32+1] // C += bias
v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+34:vgprValuC+34+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[32:33], v[4:5]
v_mov_b64 v[34:35], v[6:7]
buffer_store_dwordx4 v[32:35], v95, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], v[76:77], v[vgprValuC+36:vgprValuC+36+1] // *= ScaleAlphaVecVMulPK(76)(0)
v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], v[78:79], v[vgprValuC+38:vgprValuC+38+1] // *= ScaleAlphaVecVMulPK(76)(2)
v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+36:vgprValuC+36+1] // C += bias
v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+38:vgprValuC+38+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[36:37], v[4:5]
v_mov_b64 v[38:39], v[6:7]
buffer_store_dwordx4 v[36:39], v97, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], v[84:85], v[vgprValuC+40:vgprValuC+40+1] // *= ScaleAlphaVecVMulPK(84)(0)
v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], v[86:87], v[vgprValuC+42:vgprValuC+42+1] // *= ScaleAlphaVecVMulPK(84)(2)
v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+40:vgprValuC+40+1] // C += bias
v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+42:vgprValuC+42+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[40:41], v[4:5]
v_mov_b64 v[42:43], v[6:7]
buffer_store_dwordx4 v[40:43], v99, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], v[76:77], v[vgprValuC+44:vgprValuC+44+1] // *= ScaleAlphaVecVMulPK(76)(0)
v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], v[78:79], v[vgprValuC+46:vgprValuC+46+1] // *= ScaleAlphaVecVMulPK(76)(2)
v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+44:vgprValuC+44+1] // C += bias
v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+46:vgprValuC+46+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[44:45], v[4:5]
v_mov_b64 v[46:47], v[6:7]
buffer_store_dwordx4 v[44:47], v101, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], v[84:85], v[vgprValuC+48:vgprValuC+48+1] // *= ScaleAlphaVecVMulPK(84)(0)
v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], v[86:87], v[vgprValuC+50:vgprValuC+50+1] // *= ScaleAlphaVecVMulPK(84)(2)
v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+48:vgprValuC+48+1] // C += bias
v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+50:vgprValuC+50+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[48:49], v[4:5]
v_mov_b64 v[50:51], v[6:7]
buffer_store_dwordx4 v[48:51], v103, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], v[76:77], v[vgprValuC+52:vgprValuC+52+1] // *= ScaleAlphaVecVMulPK(76)(0)
v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], v[78:79], v[vgprValuC+54:vgprValuC+54+1] // *= ScaleAlphaVecVMulPK(76)(2)
v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+52:vgprValuC+52+1] // C += bias
v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+54:vgprValuC+54+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[52:53], v[4:5]
v_mov_b64 v[54:55], v[6:7]
buffer_store_dwordx4 v[52:55], v105, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], v[84:85], v[vgprValuC+56:vgprValuC+56+1] // *= ScaleAlphaVecVMulPK(84)(0)
v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], v[86:87], v[vgprValuC+58:vgprValuC+58+1] // *= ScaleAlphaVecVMulPK(84)(2)
v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+56:vgprValuC+56+1] // C += bias
v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+58:vgprValuC+58+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[56:57], v[4:5]
v_mov_b64 v[58:59], v[6:7]
buffer_store_dwordx4 v[56:59], v107, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], v[76:77], v[vgprValuC+60:vgprValuC+60+1] // *= ScaleAlphaVecVMulPK(76)(0)
v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], v[78:79], v[vgprValuC+62:vgprValuC+62+1] // *= ScaleAlphaVecVMulPK(76)(2)
v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+60:vgprValuC+60+1] // C += bias
v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+62:vgprValuC+62+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[60:61], v[4:5]
v_mov_b64 v[62:63], v[6:7]
buffer_store_dwordx4 v[60:63], v109, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], v[84:85], v[vgprValuC+64:vgprValuC+64+1] // *= ScaleAlphaVecVMulPK(84)(0)
v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], v[86:87], v[vgprValuC+66:vgprValuC+66+1] // *= ScaleAlphaVecVMulPK(84)(2)
v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+64:vgprValuC+64+1] // C += bias
v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+66:vgprValuC+66+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[64:65], v[4:5]
v_mov_b64 v[66:67], v[6:7]
buffer_store_dwordx4 v[64:67], v111, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], v[76:77], v[vgprValuC+68:vgprValuC+68+1] // *= ScaleAlphaVecVMulPK(76)(0)
v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], v[78:79], v[vgprValuC+70:vgprValuC+70+1] // *= ScaleAlphaVecVMulPK(76)(2)
v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+68:vgprValuC+68+1] // C += bias
v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+70:vgprValuC+70+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[68:69], v[4:5]
v_mov_b64 v[70:71], v[6:7]
buffer_store_dwordx4 v[68:71], v113, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */

/******************************************/
/* Global Write Edge Batch #3 (d1,d0,vc1,vc0) = */
/*    (1,1,6,0:vw4); (1,0,7,0:vw4); (1,1,7,0:vw4); (1,0,8,0:vw4); (1,1,8,0:vw4); (1,0,9,0:vw4); (1,1,9,0:vw4); (1,0,10,0:vw4); (1,1,10,0:vw4); (1,0,11,0:vw4); (1,1,11,0:vw4); (1,0,12,0:vw4); (1,1,12,0:vw4); (1,0,13,0:vw4); (1,1,13,0:vw4) */
/******************************************/

/* calc coords, apply mask, and issue loads (if necessary) */
/*
 * Address setup for the 15 elements of this batch, (1,6,1,0) .. (1,13,1,0).
 *
 * Register roles as used below:
 *   v0  coord0 (never written in this section)     v1  coord1, +1 per new row
 *   v2  C row pointer, += StrideC1J per new row    v3  D row pointer, += StrideD1J
 *   v4  coord0 + 128, used by the d0=1 elements    v8  BufferOOB clip value
 *   s74 scratch (128 step / low half of compare mask / 256*WorkGroup0)
 *   s[78:79] per-lane (coord0 < SizeI) && (coord1 < SizeJ)
 *
 * Per element the odd VGPR (v9, v11, v89 .. v113) receives the D byte offset
 * (v3 + coord0) << 2, or v8 for out-of-bounds lanes. The even VGPR (v10,
 * v88 .. v114) receives (coord0 - 256*WorkGroup0) << 2.
 *
 * Only the first d0=1 element (via v10) and the first d0=0 element (via v88)
 * issue ds_read_b128 for bias (offset 0) and scaleAlpha (offset 1024). The
 * later even VGPRs repeat one of those two values, because v0 does not change.
 * NOTE(review): 1024 presumably equals MT0 (256) * 4 bytes - confirm.
 * NOTE(review): v90 .. v114 are not read by any instruction visible in this
 * part of the file - confirm whether they are needed.
 */
v_mov_b32 v8, BufferOOB
/* (d1,vc1,d0,vc0)=(1,6,1,0) */
s_mov_b32 s74, 128                                 // coordOffset0 d0=1 vc0=0
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v10, v4, s74                             // (coord0 + 128) relative to wgp0 * MT0
v_lshlrev_b32 v10, 0x2, v10                        // Bias address scaled by BPE
ds_read_b128 v[72:75], v10 offset:0                // load Bias
ds_read_b128 v[76:79], v10 offset:1024             // load scaleAlpha
v_add_lshl_u32 v9, v3, v4, 0x2                     // D byte offset = (coutRowPtrD + coord0 + 128) << 2
v_cndmask_b32 v9, v8, v9, s[78:79]                 // LDD clip if OOB: keep offset when in-bounds, else v8
/* (d1,vc1,d0,vc0)=(1,7,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[74:75], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v88, v0, s74                             // coord0 relative to wgp0 * MT0
v_lshlrev_b32 v88, 0x2, v88                        // Bias address scaled by BPE
ds_read_b128 v[80:83], v88 offset:0                // load Bias
ds_read_b128 v[84:87], v88 offset:1024             // load scaleAlpha
v_add_lshl_u32 v11, v3, v0, 0x2                    // D byte offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v11, v8, v11, s[78:79]               // LDD clip if OOB: keep offset when in-bounds, else v8
/* (d1,vc1,d0,vc0)=(1,7,1,0) */
s_mov_b32 s74, 128                                 // coordOffset0 d0=1 vc0=0
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v90, v4, s74
v_lshlrev_b32 v90, 0x2, v90                        // Bias address scaled by BPE
v_add_lshl_u32 v89, v3, v4, 0x2                    // D byte offset = (coutRowPtrD + coord0 + 128) << 2
v_cndmask_b32 v89, v8, v89, s[78:79]               // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,8,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[74:75], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v92, v0, s74
v_lshlrev_b32 v92, 0x2, v92                        // Bias address scaled by BPE
v_add_lshl_u32 v91, v3, v0, 0x2                    // D byte offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v91, v8, v91, s[78:79]               // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,8,1,0) */
s_mov_b32 s74, 128                                 // coordOffset0 d0=1 vc0=0
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v94, v4, s74
v_lshlrev_b32 v94, 0x2, v94                        // Bias address scaled by BPE
v_add_lshl_u32 v93, v3, v4, 0x2                    // D byte offset = (coutRowPtrD + coord0 + 128) << 2
v_cndmask_b32 v93, v8, v93, s[78:79]               // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,9,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[74:75], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v96, v0, s74
v_lshlrev_b32 v96, 0x2, v96                        // Bias address scaled by BPE
v_add_lshl_u32 v95, v3, v0, 0x2                    // D byte offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v95, v8, v95, s[78:79]               // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,9,1,0) */
s_mov_b32 s74, 128                                 // coordOffset0 d0=1 vc0=0
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v98, v4, s74
v_lshlrev_b32 v98, 0x2, v98                        // Bias address scaled by BPE
v_add_lshl_u32 v97, v3, v4, 0x2                    // D byte offset = (coutRowPtrD + coord0 + 128) << 2
v_cndmask_b32 v97, v8, v97, s[78:79]               // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,10,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[74:75], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v100, v0, s74
v_lshlrev_b32 v100, 0x2, v100                      // Bias address scaled by BPE
v_add_lshl_u32 v99, v3, v0, 0x2                    // D byte offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v99, v8, v99, s[78:79]               // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,10,1,0) */
s_mov_b32 s74, 128                                 // coordOffset0 d0=1 vc0=0
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v102, v4, s74
v_lshlrev_b32 v102, 0x2, v102                      // Bias address scaled by BPE
v_add_lshl_u32 v101, v3, v4, 0x2                   // D byte offset = (coutRowPtrD + coord0 + 128) << 2
v_cndmask_b32 v101, v8, v101, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,11,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[74:75], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v104, v0, s74
v_lshlrev_b32 v104, 0x2, v104                      // Bias address scaled by BPE
v_add_lshl_u32 v103, v3, v0, 0x2                   // D byte offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v103, v8, v103, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,11,1,0) */
s_mov_b32 s74, 128                                 // coordOffset0 d0=1 vc0=0
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v106, v4, s74
v_lshlrev_b32 v106, 0x2, v106                      // Bias address scaled by BPE
v_add_lshl_u32 v105, v3, v4, 0x2                   // D byte offset = (coutRowPtrD + coord0 + 128) << 2
v_cndmask_b32 v105, v8, v105, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,12,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[74:75], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v108, v0, s74
v_lshlrev_b32 v108, 0x2, v108                      // Bias address scaled by BPE
v_add_lshl_u32 v107, v3, v0, 0x2                   // D byte offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v107, v8, v107, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,12,1,0) */
s_mov_b32 s74, 128                                 // coordOffset0 d0=1 vc0=0
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v110, v4, s74
v_lshlrev_b32 v110, 0x2, v110                      // Bias address scaled by BPE
v_add_lshl_u32 v109, v3, v4, 0x2                   // D byte offset = (coutRowPtrD + coord0 + 128) << 2
v_cndmask_b32 v109, v8, v109, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,13,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[74:75], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v112, v0, s74
v_lshlrev_b32 v112, 0x2, v112                      // Bias address scaled by BPE
v_add_lshl_u32 v111, v3, v0, 0x2                   // D byte offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v111, v8, v111, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,13,1,0) */
s_mov_b32 s74, 128                                 // coordOffset0 d0=1 vc0=0
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v114, v4, s74
v_lshlrev_b32 v114, 0x2, v114                      // Bias address scaled by BPE
v_add_lshl_u32 v113, v3, v4, 0x2                   // D byte offset = (coutRowPtrD + coord0 + 128) << 2
v_cndmask_b32 v113, v8, v113, s[78:79]             // LDD clip if OOB. offset
v_accvgpr_read_b32 v[vgprValuC+12], acc209         // copy acc to vreg[180]
v_accvgpr_read_b32 v[vgprValuC+13], acc213         // copy acc to vreg[181]
v_accvgpr_read_b32 v[vgprValuC+14], acc217         // copy acc to vreg[182]
v_accvgpr_read_b32 v[vgprValuC+15], acc221         // copy acc to vreg[183]
v_accvgpr_read_b32 v[vgprValuC+16], acc225         // copy acc to vreg[184]
v_accvgpr_read_b32 v[vgprValuC+17], acc229         // copy acc to vreg[185]
v_accvgpr_read_b32 v[vgprValuC+18], acc233         // copy acc to vreg[186]
v_accvgpr_read_b32 v[vgprValuC+19], acc237         // copy acc to vreg[187]
v_accvgpr_read_b32 v[vgprValuC+20], acc241         // copy acc to vreg[188]
v_accvgpr_read_b32 v[vgprValuC+21], acc245         // copy acc to vreg[189]
v_accvgpr_read_b32 v[vgprValuC+22], acc249         // copy acc to vreg[190]
v_accvgpr_read_b32 v[vgprValuC+23], acc253         // copy acc to vreg[191]
v_accvgpr_read_b32 v[vgprValuC+24], acc130         // copy acc to vreg[192]
v_accvgpr_read_b32 v[vgprValuC+25], acc134         // copy acc to vreg[193]
v_accvgpr_read_b32 v[vgprValuC+26], acc138         // copy acc to vreg[194]
v_accvgpr_read_b32 v[vgprValuC+27], acc142         // copy acc to vreg[195]
v_accvgpr_read_b32 v[vgprValuC+28], acc146         // copy acc to vreg[196]
v_accvgpr_read_b32 v[vgprValuC+29], acc150         // copy acc to vreg[197]
v_accvgpr_read_b32 v[vgprValuC+30], acc154         // copy acc to vreg[198]
v_accvgpr_read_b32 v[vgprValuC+31], acc158         // copy acc to vreg[199]
v_accvgpr_read_b32 v[vgprValuC+32], acc162         // copy acc to vreg[200]
v_accvgpr_read_b32 v[vgprValuC+33], acc166         // copy acc to vreg[201]
v_accvgpr_read_b32 v[vgprValuC+34], acc170         // copy acc to vreg[202]
v_accvgpr_read_b32 v[vgprValuC+35], acc174         // copy acc to vreg[203]
v_accvgpr_read_b32 v[vgprValuC+36], acc178         // copy acc to vreg[204]
v_accvgpr_read_b32 v[vgprValuC+37], acc182         // copy acc to vreg[205]
v_accvgpr_read_b32 v[vgprValuC+38], acc186         // copy acc to vreg[206]
v_accvgpr_read_b32 v[vgprValuC+39], acc190         // copy acc to vreg[207]
v_accvgpr_read_b32 v[vgprValuC+40], acc194         // copy acc to vreg[208]
v_accvgpr_read_b32 v[vgprValuC+41], acc198         // copy acc to vreg[209]
v_accvgpr_read_b32 v[vgprValuC+42], acc202         // copy acc to vreg[210]
v_accvgpr_read_b32 v[vgprValuC+43], acc206         // copy acc to vreg[211]
v_accvgpr_read_b32 v[vgprValuC+44], acc210         // copy acc to vreg[212]
v_accvgpr_read_b32 v[vgprValuC+45], acc214         // copy acc to vreg[213]
v_accvgpr_read_b32 v[vgprValuC+46], acc218         // copy acc to vreg[214]
v_accvgpr_read_b32 v[vgprValuC+47], acc222         // copy acc to vreg[215]
v_accvgpr_read_b32 v[vgprValuC+48], acc226         // copy acc to vreg[216]
v_accvgpr_read_b32 v[vgprValuC+49], acc230         // copy acc to vreg[217]
v_accvgpr_read_b32 v[vgprValuC+50], acc234         // copy acc to vreg[218]
v_accvgpr_read_b32 v[vgprValuC+51], acc238         // copy acc to vreg[219]
v_accvgpr_read_b32 v[vgprValuC+52], acc242         // copy acc to vreg[220]
v_accvgpr_read_b32 v[vgprValuC+53], acc246         // copy acc to vreg[221]
v_accvgpr_read_b32 v[vgprValuC+54], acc250         // copy acc to vreg[222]
v_accvgpr_read_b32 v[vgprValuC+55], acc254         // copy acc to vreg[223]
v_accvgpr_read_b32 v[vgprValuC+56], acc131         // copy acc to vreg[224]
v_accvgpr_read_b32 v[vgprValuC+57], acc135         // copy acc to vreg[225]
v_accvgpr_read_b32 v[vgprValuC+58], acc139         // copy acc to vreg[226]
v_accvgpr_read_b32 v[vgprValuC+59], acc143         // copy acc to vreg[227]
v_accvgpr_read_b32 v[vgprValuC+60], acc147         // copy acc to vreg[228]
v_accvgpr_read_b32 v[vgprValuC+61], acc151         // copy acc to vreg[229]
v_accvgpr_read_b32 v[vgprValuC+62], acc155         // copy acc to vreg[230]
v_accvgpr_read_b32 v[vgprValuC+63], acc159         // copy acc to vreg[231]
v_accvgpr_read_b32 v[vgprValuC+64], acc163         // copy acc to vreg[232]
v_accvgpr_read_b32 v[vgprValuC+65], acc167         // copy acc to vreg[233]
v_accvgpr_read_b32 v[vgprValuC+66], acc171         // copy acc to vreg[234]
v_accvgpr_read_b32 v[vgprValuC+67], acc175         // copy acc to vreg[235]
v_accvgpr_read_b32 v[vgprValuC+68], acc179         // copy acc to vreg[236]
v_accvgpr_read_b32 v[vgprValuC+69], acc183         // copy acc to vreg[237]
v_accvgpr_read_b32 v[vgprValuC+70], acc187         // copy acc to vreg[238]
v_accvgpr_read_b32 v[vgprValuC+71], acc191         // copy acc to vreg[239]

/* rC *= alpha batchElements=[(1, 1, 6, 0), (1, 0, 7, 0), (1, 1, 7, 0), (1, 0, 8, 0), (1, 1, 8, 0), (1, 0, 9, 0), (1, 1, 9, 0), (1, 0, 10, 0), (1, 1, 10, 0), (1, 0, 11, 0), (1, 1, 11, 0), (1, 0, 12, 0), (1, 1, 12, 0), (1, 0, 13, 0), (1, 1, 13, 0)] */
v_pk_mul_f32 v[vgprValuC+12:vgprValuC+12+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+12:vgprValuC+12+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+14:vgprValuC+14+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+14:vgprValuC+14+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+16:vgprValuC+16+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+16:vgprValuC+16+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+18:vgprValuC+18+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+18:vgprValuC+18+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+20:vgprValuC+20+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+20:vgprValuC+20+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+22:vgprValuC+22+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+22:vgprValuC+22+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+64:vgprValuC+64+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+68:vgprValuC+68+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+70:vgprValuC+70+1] op_sel_hi:[0,1,1] // *= alpha (pk)
s_waitcnt lgkmcnt(0)                               // wait for Bias LDS, ScaleAlphaVec

/* apply mask, calc new C and issue writes */
v_pk_mul_f32 v[vgprValuC+12:vgprValuC+12+1], v[76:77], v[vgprValuC+12:vgprValuC+12+1] // *= ScaleAlphaVecVMulPK(76)(0)
v_pk_mul_f32 v[vgprValuC+14:vgprValuC+14+1], v[78:79], v[vgprValuC+14:vgprValuC+14+1] // *= ScaleAlphaVecVMulPK(76)(2)
v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+12:vgprValuC+12+1] // C += bias
v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+14:vgprValuC+14+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[12:13], v[4:5]
v_mov_b64 v[14:15], v[6:7]
buffer_store_dwordx4 v[12:15], v9, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_pk_mul_f32 v[vgprValuC+16:vgprValuC+16+1], v[84:85], v[vgprValuC+16:vgprValuC+16+1] // *= ScaleAlphaVecVMulPK(84)(0)
v_pk_mul_f32 v[vgprValuC+18:vgprValuC+18+1], v[86:87], v[vgprValuC+18:vgprValuC+18+1] // *= ScaleAlphaVecVMulPK(84)(2)
v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+16:vgprValuC+16+1] // C += bias
v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+18:vgprValuC+18+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[16:17], v[4:5]
v_mov_b64 v[18:19], v[6:7]
buffer_store_dwordx4 v[16:19], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_pk_mul_f32 v[vgprValuC+20:vgprValuC+20+1], v[76:77], v[vgprValuC+20:vgprValuC+20+1] // *= ScaleAlphaVecVMulPK(76)(0)
v_pk_mul_f32 v[vgprValuC+22:vgprValuC+22+1], v[78:79], v[vgprValuC+22:vgprValuC+22+1] // *= ScaleAlphaVecVMulPK(76)(2)
v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+20:vgprValuC+20+1] // C += bias
v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+22:vgprValuC+22+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[20:21], v[4:5]
v_mov_b64 v[22:23], v[6:7]
buffer_store_dwordx4 v[20:23], v89, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], v[84:85], v[vgprValuC+24:vgprValuC+24+1] // *= ScaleAlphaVecVMulPK(84)(0)
v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], v[86:87], v[vgprValuC+26:vgprValuC+26+1] // *= ScaleAlphaVecVMulPK(84)(2)
v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+24:vgprValuC+24+1] // C += bias
v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+26:vgprValuC+26+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[24:25], v[4:5]
v_mov_b64 v[26:27], v[6:7]
buffer_store_dwordx4 v[24:27], v91, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], v[76:77], v[vgprValuC+28:vgprValuC+28+1] // *= ScaleAlphaVecVMulPK(76)(0)
v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], v[78:79], v[vgprValuC+30:vgprValuC+30+1] // *= ScaleAlphaVecVMulPK(76)(2)
v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+28:vgprValuC+28+1] // C += bias
v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+30:vgprValuC+30+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[28:29], v[4:5]
v_mov_b64 v[30:31], v[6:7]
buffer_store_dwordx4 v[28:31], v93, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], v[84:85], v[vgprValuC+32:vgprValuC+32+1] // *= ScaleAlphaVecVMulPK(84)(0)
v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], v[86:87], v[vgprValuC+34:vgprValuC+34+1] // *= ScaleAlphaVecVMulPK(84)(2)
v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+32:vgprValuC+32+1] // C += bias
v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+34:vgprValuC+34+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[32:33], v[4:5]
v_mov_b64 v[34:35], v[6:7]
buffer_store_dwordx4 v[32:35], v95, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], v[76:77], v[vgprValuC+36:vgprValuC+36+1] // *= ScaleAlphaVecVMulPK(76)(0)
v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], v[78:79], v[vgprValuC+38:vgprValuC+38+1] // *= ScaleAlphaVecVMulPK(76)(2)
v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+36:vgprValuC+36+1] // C += bias
v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+38:vgprValuC+38+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[36:37], v[4:5]
v_mov_b64 v[38:39], v[6:7]
buffer_store_dwordx4 v[36:39], v97, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], v[84:85], v[vgprValuC+40:vgprValuC+40+1] // *= ScaleAlphaVecVMulPK(84)(0)
v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], v[86:87], v[vgprValuC+42:vgprValuC+42+1] // *= ScaleAlphaVecVMulPK(84)(2)
v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+40:vgprValuC+40+1] // C += bias
v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+42:vgprValuC+42+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[40:41], v[4:5]
v_mov_b64 v[42:43], v[6:7]
buffer_store_dwordx4 v[40:43], v99, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], v[76:77], v[vgprValuC+44:vgprValuC+44+1] // *= ScaleAlphaVecVMulPK(76)(0)
v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], v[78:79], v[vgprValuC+46:vgprValuC+46+1] // *= ScaleAlphaVecVMulPK(76)(2)
v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+44:vgprValuC+44+1] // C += bias
v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+46:vgprValuC+46+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[44:45], v[4:5]
v_mov_b64 v[46:47], v[6:7]
buffer_store_dwordx4 v[44:47], v101, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], v[84:85], v[vgprValuC+48:vgprValuC+48+1] // *= ScaleAlphaVecVMulPK(84)(0)
v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], v[86:87], v[vgprValuC+50:vgprValuC+50+1] // *= ScaleAlphaVecVMulPK(84)(2)
v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+48:vgprValuC+48+1] // C += bias
v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+50:vgprValuC+50+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[48:49], v[4:5]
v_mov_b64 v[50:51], v[6:7]
buffer_store_dwordx4 v[48:51], v103, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], v[76:77], v[vgprValuC+52:vgprValuC+52+1] // *= ScaleAlphaVecVMulPK(76)(0)
v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], v[78:79], v[vgprValuC+54:vgprValuC+54+1] // *= ScaleAlphaVecVMulPK(76)(2)
v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+52:vgprValuC+52+1] // C += bias
v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+54:vgprValuC+54+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[52:53], v[4:5]
v_mov_b64 v[54:55], v[6:7]
buffer_store_dwordx4 v[52:55], v105, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], v[84:85], v[vgprValuC+56:vgprValuC+56+1] // *= ScaleAlphaVecVMulPK(84)(0)
v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], v[86:87], v[vgprValuC+58:vgprValuC+58+1] // *= ScaleAlphaVecVMulPK(84)(2)
v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+56:vgprValuC+56+1] // C += bias
v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+58:vgprValuC+58+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[56:57], v[4:5]
v_mov_b64 v[58:59], v[6:7]
buffer_store_dwordx4 v[56:59], v107, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], v[76:77], v[vgprValuC+60:vgprValuC+60+1] // *= ScaleAlphaVecVMulPK(76)(0)
v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], v[78:79], v[vgprValuC+62:vgprValuC+62+1] // *= ScaleAlphaVecVMulPK(76)(2)
v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+60:vgprValuC+60+1] // C += bias
v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+62:vgprValuC+62+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[60:61], v[4:5]
v_mov_b64 v[62:63], v[6:7]
buffer_store_dwordx4 v[60:63], v109, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], v[84:85], v[vgprValuC+64:vgprValuC+64+1] // *= ScaleAlphaVecVMulPK(84)(0)
v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], v[86:87], v[vgprValuC+66:vgprValuC+66+1] // *= ScaleAlphaVecVMulPK(84)(2)
v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+64:vgprValuC+64+1] // C += bias
v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+66:vgprValuC+66+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[64:65], v[4:5]
v_mov_b64 v[66:67], v[6:7]
buffer_store_dwordx4 v[64:67], v111, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], v[76:77], v[vgprValuC+68:vgprValuC+68+1] // *= ScaleAlphaVecVMulPK(76)(0)
v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], v[78:79], v[vgprValuC+70:vgprValuC+70+1] // *= ScaleAlphaVecVMulPK(76)(2)
v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+68:vgprValuC+68+1] // C += bias
v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+70:vgprValuC+70+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b64 v[68:69], v[4:5]
v_mov_b64 v[70:71], v[6:7]
buffer_store_dwordx4 v[68:71], v113, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */

/******************************************/
/* Global Write Edge Batch #4 (d1,d0,vc1,vc0) = */
/*    (1,0,14,0:vw4); (1,1,14,0:vw4); (1,0,15,0:vw4); (1,1,15,0:vw4) */
/******************************************/

/* calc coords, apply mask, and issue loads (if necessary) */
/* Edge batch #4, address setup. For each of the four elements (vc1=14,15 x d0=0,1) this computes: */
/*   - the in-bounds mask s[78:79] = (coord0 < SizeI) && (coord1 < SizeJ) */
/*   - an LDS byte address (coord0 - WorkGroup0*256) << 2 used for the Bias / scaleAlpha reads */
/*   - the D store byte offset (v3 + coord0) << 2, replaced by v8 when the mask bit is clear */
/* Per the comments below: v0 = coord0, v1 = coord1, v2 = C row pointer, v3 = D row pointer. */
/* v4 is the temporary coord0 for the d0=1 elements; s74 and s[74:75] are scratch. */
v_mov_b32 v8, BufferOOB                            // v8 = offset substituted by the v_cndmask lines when an element is masked out
/* (d1,vc1,d0,vc0)=(1,14,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[74:75], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
// s74 is reused below: the coord0 mask in s[74:75] has already been folded into s[78:79].
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v10, v0, s74                             // v10 = coord0 - wgp0*MT0
v_lshlrev_b32 v10, 0x2, v10                        // Bias address scaled by BPE
ds_read_b128 v[28:31], v10 offset:0                // load Bias
ds_read_b128 v[32:35], v10 offset:1024             // load scaleAlpha
v_add_lshl_u32 v9, v3, v0, 0x2                     // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v9, v8, v9, s[78:79]                 // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,14,1,0) */
s_mov_b32 s74, 128                                 // coordOffset0 d0=1 vc0=0
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v44, v4, s74                             // v44 = (coord0 + 128) - wgp0*MT0
v_lshlrev_b32 v44, 0x2, v44                        // Bias address scaled by BPE
ds_read_b128 v[36:39], v44 offset:0                // load Bias
ds_read_b128 v[40:43], v44 offset:1024             // load scaleAlpha
v_add_lshl_u32 v11, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v11, v8, v11, s[78:79]               // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,15,0,0) */
/* No ds_read is issued for the vc1=15 elements in this batch: the v46 / v48 addresses computed below */
/* are not used by any LDS read here, and the vc1=15 stores reuse v[28:35] and v[36:43] loaded above. */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[74:75], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v46, v0, s74                             // v46 = coord0 - wgp0*MT0
v_lshlrev_b32 v46, 0x2, v46                        // Bias address scaled by BPE
v_add_lshl_u32 v45, v3, v0, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v45, v8, v45, s[78:79]               // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,15,1,0) */
s_mov_b32 s74, 128                                 // coordOffset0 d0=1 vc0=0
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v48, v4, s74                             // v48 = (coord0 + 128) - wgp0*MT0
v_lshlrev_b32 v48, 0x2, v48                        // Bias address scaled by BPE
v_add_lshl_u32 v47, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v47, v8, v47, s[78:79]               // LDD clip if OOB. offset
/* Edge batch #4, accumulator read-back and alpha scaling. */
/* Copies 16 accumulator registers (acc195, acc199, ..., acc255, stepping by 4) into the consecutive */
/* VGPRs v[vgprValuC+12 .. vgprValuC+27], i.e. four dwords for each of the four elements of this batch. */
v_accvgpr_read_b32 v[vgprValuC+12], acc195         // copy acc to vreg[240]
v_accvgpr_read_b32 v[vgprValuC+13], acc199         // copy acc to vreg[241]
v_accvgpr_read_b32 v[vgprValuC+14], acc203         // copy acc to vreg[242]
v_accvgpr_read_b32 v[vgprValuC+15], acc207         // copy acc to vreg[243]
v_accvgpr_read_b32 v[vgprValuC+16], acc211         // copy acc to vreg[244]
v_accvgpr_read_b32 v[vgprValuC+17], acc215         // copy acc to vreg[245]
v_accvgpr_read_b32 v[vgprValuC+18], acc219         // copy acc to vreg[246]
v_accvgpr_read_b32 v[vgprValuC+19], acc223         // copy acc to vreg[247]
v_accvgpr_read_b32 v[vgprValuC+20], acc227         // copy acc to vreg[248]
v_accvgpr_read_b32 v[vgprValuC+21], acc231         // copy acc to vreg[249]
v_accvgpr_read_b32 v[vgprValuC+22], acc235         // copy acc to vreg[250]
v_accvgpr_read_b32 v[vgprValuC+23], acc239         // copy acc to vreg[251]
v_accvgpr_read_b32 v[vgprValuC+24], acc243         // copy acc to vreg[252]
v_accvgpr_read_b32 v[vgprValuC+25], acc247         // copy acc to vreg[253]
v_accvgpr_read_b32 v[vgprValuC+26], acc251         // copy acc to vreg[254]
v_accvgpr_read_b32 v[vgprValuC+27], acc255         // copy acc to vreg[255]

/* rC *= alpha batchElements=[(1, 0, 14, 0), (1, 1, 14, 0), (1, 0, 15, 0), (1, 1, 15, 0)] */
/* Packed f32 multiply, two values per instruction. op_sel_hi:[0,1,1] clears the high-lane select for */
/* src0, so both lanes multiply by the low dword of the s[sgprAlpha:sgprAlpha+1] pair. */
v_pk_mul_f32 v[vgprValuC+12:vgprValuC+12+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+12:vgprValuC+12+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+14:vgprValuC+14+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+14:vgprValuC+14+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+16:vgprValuC+16+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+16:vgprValuC+16+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+18:vgprValuC+18+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+18:vgprValuC+18+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+20:vgprValuC+20+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+20:vgprValuC+20+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+22:vgprValuC+22+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+22:vgprValuC+22+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk)
// The Bias / scaleAlpha ds_read_b128 results (v[28:43]) are consumed right after this wait.
s_waitcnt lgkmcnt(0)                               // wait for Bias LDS, ScaleAlphaVec

/* apply mask, calc new C and issue writes */
/* Edge batch #4, final math and stores. The same 8-instruction pattern repeats for each of the four elements: */
/*   1. multiply the four alpha-scaled values by the scaleAlpha vector (v[32:35] for d0=0, v[40:43] for d0=1) */
/*   2. add the Bias vector (v[28:31] for d0=0, v[36:39] for d0=1), writing the sums to v[4:7] */
/*   3. s_swappc_b64: jump to the address in s[8:9], saving the return address in s[64:65] */
/*   4. copy v[4:7] into the store registers and issue a 4-dword store to D at the per-element offset */
/* The vc1=15 elements reuse the Bias / scaleAlpha registers loaded for vc1=14. */
/* NOTE(review): s[8:9] is set outside this block; presumably it is the selected activation routine, which */
/* is expected to transform v[4:7] in place and return through s[64:65] - confirm against the routine bodies. */
/* Out-of-bounds elements are handled through the store offset (v9/v11/v45/v47), not through EXEC. */
/* element (1,0,14,0): store offset v9 */
v_pk_mul_f32 v[vgprValuC+12:vgprValuC+12+1], v[32:33], v[vgprValuC+12:vgprValuC+12+1] // *= ScaleAlphaVecVMulPK(32)(0)
v_pk_mul_f32 v[vgprValuC+14:vgprValuC+14+1], v[34:35], v[vgprValuC+14:vgprValuC+14+1] // *= ScaleAlphaVecVMulPK(32)(2)
v_pk_add_f32 v[4:5], v[28:29], v[vgprValuC+12:vgprValuC+12+1] // C += bias
v_pk_add_f32 v[6:7], v[30:31], v[vgprValuC+14:vgprValuC+14+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]                      // call routine at s[8:9]; return address -> s[64:65]
v_mov_b64 v[12:13], v[4:5]
v_mov_b64 v[14:15], v[6:7]
buffer_store_dwordx4 v[12:15], v9, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
/* element (1,1,14,0): store offset v11 */
v_pk_mul_f32 v[vgprValuC+16:vgprValuC+16+1], v[40:41], v[vgprValuC+16:vgprValuC+16+1] // *= ScaleAlphaVecVMulPK(40)(0)
v_pk_mul_f32 v[vgprValuC+18:vgprValuC+18+1], v[42:43], v[vgprValuC+18:vgprValuC+18+1] // *= ScaleAlphaVecVMulPK(40)(2)
v_pk_add_f32 v[4:5], v[36:37], v[vgprValuC+16:vgprValuC+16+1] // C += bias
v_pk_add_f32 v[6:7], v[38:39], v[vgprValuC+18:vgprValuC+18+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]                      // call routine at s[8:9]; return address -> s[64:65]
v_mov_b64 v[16:17], v[4:5]
v_mov_b64 v[18:19], v[6:7]
buffer_store_dwordx4 v[16:19], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
/* element (1,0,15,0): store offset v45, reuses Bias v[28:31] and scaleAlpha v[32:35] */
v_pk_mul_f32 v[vgprValuC+20:vgprValuC+20+1], v[32:33], v[vgprValuC+20:vgprValuC+20+1] // *= ScaleAlphaVecVMulPK(32)(0)
v_pk_mul_f32 v[vgprValuC+22:vgprValuC+22+1], v[34:35], v[vgprValuC+22:vgprValuC+22+1] // *= ScaleAlphaVecVMulPK(32)(2)
v_pk_add_f32 v[4:5], v[28:29], v[vgprValuC+20:vgprValuC+20+1] // C += bias
v_pk_add_f32 v[6:7], v[30:31], v[vgprValuC+22:vgprValuC+22+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]                      // call routine at s[8:9]; return address -> s[64:65]
v_mov_b64 v[20:21], v[4:5]
v_mov_b64 v[22:23], v[6:7]
buffer_store_dwordx4 v[20:23], v45, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
/* element (1,1,15,0): store offset v47, reuses Bias v[36:39] and scaleAlpha v[40:43] */
v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], v[40:41], v[vgprValuC+24:vgprValuC+24+1] // *= ScaleAlphaVecVMulPK(40)(0)
v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], v[42:43], v[vgprValuC+26:vgprValuC+26+1] // *= ScaleAlphaVecVMulPK(40)(2)
v_pk_add_f32 v[4:5], v[36:37], v[vgprValuC+24:vgprValuC+24+1] // C += bias
v_pk_add_f32 v[6:7], v[38:39], v[vgprValuC+26:vgprValuC+26+1] // C += bias
s_swappc_b64 s[64:65], s[8:9]                      // call routine at s[8:9]; return address -> s[64:65]
v_mov_b64 v[24:25], v[4:5]
v_mov_b64 v[26:27], v[6:7]
buffer_store_dwordx4 v[24:27], v47, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
// Last edge batch of this write path: skip the remaining write variants.
s_branch label_GW_End                              // jump to end
/* Activation dispatch for this global-write path (labels carry the suffix VW1_beta_0_edge_1). */
/* Compares s[sgprActivationType] against 3 (Gelu), 5 (Relu), 10 (Silu) and 12 (Clamp) and jumps to the */
/* matching stub; any other value falls through to the "None" stub. */
/* Each stub leaves the address of its label_Activation_*_VW1 routine in s[8:9], which is the register pair */
/* that the s_swappc_b64 calls in the write batches jump through. s73 is scratch. */
label_GW_B0_E1_M:
s_cmpk_eq_u32 s[sgprActivationType], 3             // activationType == 3
s_cbranch_scc1 label_To_Activation_Gelu_VW1_beta_0_edge_1 // Branch if true
s_cmpk_eq_u32 s[sgprActivationType], 5             // activationType == 5
s_cbranch_scc1 label_To_Activation_Relu_VW1_beta_0_edge_1 // Branch if true
s_cmpk_eq_u32 s[sgprActivationType], 10            // activationType == 10
s_cbranch_scc1 label_To_Activation_Silu_VW1_beta_0_edge_1 // Branch if true
s_cmpk_eq_u32 s[sgprActivationType], 12            // activationType == 12
s_cbranch_scc1 label_To_Activation_Clamp_VW1_beta_0_edge_1 // Branch if true
/* Default case: no compare matched, fall through. */
/* Stub pattern (identical in all five stubs): s[8:9] = PC of the next instruction, then add the 32-bit */
/* offset in s73 (label expression + 4) to the low dword and propagate the carry into the high dword. */
/* NOTE(review): s_addc_u32 adds only the carry, i.e. s73 is not sign-extended into s9. This presumably */
/* relies on the target label being located after the stub (non-negative offset) - confirm. */
label_To_Activation_None_VW1_beta_0_edge_1:
s_getpc_b64 s[8:9]                                 // addr of next instr
s_add_i32 s73, label_Activation_None_VW1, 4        // target branch offset
s_add_u32 s8, s8, s73                              // add target branch offset
s_addc_u32 s9, s9, 0                               // add high and carry
s_branch label_ActivationSetPCAddrEnd_3
label_To_Activation_Gelu_VW1_beta_0_edge_1:
s_getpc_b64 s[8:9]                                 // addr of next instr
s_add_i32 s73, label_Activation_Gelu_VW1, 4        // target branch offset
s_add_u32 s8, s8, s73                              // add target branch offset
s_addc_u32 s9, s9, 0                               // add high and carry
s_branch label_ActivationSetPCAddrEnd_3
label_To_Activation_Relu_VW1_beta_0_edge_1:
s_getpc_b64 s[8:9]                                 // addr of next instr
s_add_i32 s73, label_Activation_Relu_VW1, 4        // target branch offset
s_add_u32 s8, s8, s73                              // add target branch offset
s_addc_u32 s9, s9, 0                               // add high and carry
s_branch label_ActivationSetPCAddrEnd_3
label_To_Activation_Silu_VW1_beta_0_edge_1:
s_getpc_b64 s[8:9]                                 // addr of next instr
s_add_i32 s73, label_Activation_Silu_VW1, 4        // target branch offset
s_add_u32 s8, s8, s73                              // add target branch offset
s_addc_u32 s9, s9, 0                               // add high and carry
s_branch label_ActivationSetPCAddrEnd_3
label_To_Activation_Clamp_VW1_beta_0_edge_1:
s_getpc_b64 s[8:9]                                 // addr of next instr
s_add_i32 s73, label_Activation_Clamp_VW1, 4       // target branch offset
s_add_u32 s8, s8, s73                              // add target branch offset
s_addc_u32 s9, s9, 0                               // add high and carry
s_branch label_ActivationSetPCAddrEnd_3            // target is the label on the next line
/* Common join point: s[8:9] now holds the selected activation routine address. */
label_ActivationSetPCAddrEnd_3:

/* edge=1, allocate 6 sgpr. perBatchTmpS=4 perBatchMaskS=2 perElementMaskS=0 elementsPerBatch=47 */
/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */

/******************************************/
/* Global Write Edge Batch #0 (d1,d0,vc1,vc0) = */
/*    (0,0,0,0:vw1); (0,0,0,1:vw1); (0,0,0,2:vw1); (0,0,0,3:vw1); (0,1,0,0:vw1); (0,1,0,1:vw1); (0,1,0,2:vw1); (0,1,0,3:vw1); (0,0,1,0:vw1); (0,0,1,1:vw1); (0,0,1,2:vw1); (0,0,1,3:vw1); (0,1,1,0:vw1); (0,1,1,1:vw1); (0,1,1,2:vw1); (0,1,1,3:vw1); (0,0,2,0:vw1); (0,0,2,1:vw1); (0,0,2,2:vw1); (0,0,2,3:vw1); (0,1,2,0:vw1); (0,1,2,1:vw1); (0,1,2,2:vw1); (0,1,2,3:vw1); (0,0,3,0:vw1); (0,0,3,1:vw1); (0,0,3,2:vw1); (0,0,3,3:vw1); (0,1,3,0:vw1); (0,1,3,1:vw1); (0,1,3,2:vw1); (0,1,3,3:vw1); (0,0,4,0:vw1); (0,0,4,1:vw1); (0,0,4,2:vw1); (0,0,4,3:vw1); (0,1,4,0:vw1); (0,1,4,1:vw1); (0,1,4,2:vw1); (0,1,4,3:vw1); (0,0,5,0:vw1); (0,0,5,1:vw1); (0,0,5,2:vw1); (0,0,5,3:vw1); (0,1,5,0:vw1); (0,1,5,1:vw1); (0,1,5,2:vw1) */
/******************************************/

/* calc coords, apply mask, and issue loads (if necessary) */
/*
 * Row vc1=0 of the edge batch: eight elements at coord0 = v0 + {0,1,2,3,128,129,130,131}, all at coord1 = v1.
 * Per element:
 *   - s[78:79] = (coord0 < SizeI) && (coord1 < SizeJ), the per-lane in-bounds mask
 *   - LDS byte address = (coord0 - 256*WorkGroup0) << 2, used by two ds_read_b32 (offset 0 and offset 1024)
 *   - output byte offset = (v3 + coord0) << 2, replaced by v8 (BufferOOB) on lanes that fail the mask
 * s74 is a shared scalar temp: it is overwritten in turn by the coord0 offset (d0=1 elements), by the low half
 * of the compare result s[74:75], and by the wgp0*MT0 product, so instruction order inside an element matters.
 * NOTE(review): offset 1024 = 256 entries * 4 bytes, presumably the scaleAlpha vector stored right after the
 * MT0=256 Bias entries in LDS - confirm against the code that fills LDS.
 */
v_mov_b32 v8, BufferOOB                            // v8 = offset substituted for out-of-bounds lanes below
/* (d1,vc1,d0,vc0)=(0,0,0,0) */
v_cmp_lt_u32 s[74:75], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v59, v0, s74                             // coord0 relative to the start of this workgroup's tile
v_lshlrev_b32 v59, 0x2, v59                        // Bias address scaled by BPE
s_waitcnt lgkmcnt(0)                               // Wait for LDS write
s_barrier                                          // LDS write barrier (must precede the first ds_read of this batch)
ds_read_b32 v56, v59 offset:0                      // load Bias
ds_read_b32 v57, v59 offset:1024                   // load scaleAlpha
v_add_lshl_u32 v58, v3, v0, 0x2                    // scaleToBpe: (v3 rowPtrD + coord0) << 2 = byte offset into D
v_cndmask_b32 v58, v8, v58, s[78:79]               // LDD clip if OOB. offset (mask set: keep v58, else BufferOOB)
/* (d1,vc1,d0,vc0)=(0,0,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v63, v4, s74                             // coord0 relative to the start of this workgroup's tile
v_lshlrev_b32 v63, 0x2, v63                        // Bias address scaled by BPE
ds_read_b32 v60, v63 offset:0                      // load Bias
ds_read_b32 v61, v63 offset:1024                   // load scaleAlpha
v_add_lshl_u32 v62, v3, v4, 0x2                    // scaleToBpe: (v3 rowPtrD + coord0) << 2 = byte offset into D
v_cndmask_b32 v62, v8, v62, s[78:79]               // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,0,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v67, v4, s74                             // coord0 relative to the start of this workgroup's tile
v_lshlrev_b32 v67, 0x2, v67                        // Bias address scaled by BPE
ds_read_b32 v64, v67 offset:0                      // load Bias
ds_read_b32 v65, v67 offset:1024                   // load scaleAlpha
v_add_lshl_u32 v66, v3, v4, 0x2                    // scaleToBpe: (v3 rowPtrD + coord0) << 2 = byte offset into D
v_cndmask_b32 v66, v8, v66, s[78:79]               // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,0,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v71, v4, s74                             // coord0 relative to the start of this workgroup's tile
v_lshlrev_b32 v71, 0x2, v71                        // Bias address scaled by BPE
ds_read_b32 v68, v71 offset:0                      // load Bias
ds_read_b32 v69, v71 offset:1024                   // load scaleAlpha
v_add_lshl_u32 v70, v3, v4, 0x2                    // scaleToBpe: (v3 rowPtrD + coord0) << 2 = byte offset into D
v_cndmask_b32 v70, v8, v70, s[78:79]               // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,0,1,0) */
s_mov_b32 s74, 128                                 // coordOffset0 d0=1 vc0=0
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v75, v4, s74                             // coord0 relative to the start of this workgroup's tile
v_lshlrev_b32 v75, 0x2, v75                        // Bias address scaled by BPE
ds_read_b32 v72, v75 offset:0                      // load Bias
ds_read_b32 v73, v75 offset:1024                   // load scaleAlpha
v_add_lshl_u32 v74, v3, v4, 0x2                    // scaleToBpe: (v3 rowPtrD + coord0) << 2 = byte offset into D
v_cndmask_b32 v74, v8, v74, s[78:79]               // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,0,1,1) */
s_mov_b32 s74, 129                                 // coordOffset0 d0=1 vc0=1
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v79, v4, s74                             // coord0 relative to the start of this workgroup's tile
v_lshlrev_b32 v79, 0x2, v79                        // Bias address scaled by BPE
ds_read_b32 v76, v79 offset:0                      // load Bias
ds_read_b32 v77, v79 offset:1024                   // load scaleAlpha
v_add_lshl_u32 v78, v3, v4, 0x2                    // scaleToBpe: (v3 rowPtrD + coord0) << 2 = byte offset into D
v_cndmask_b32 v78, v8, v78, s[78:79]               // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,0,1,2) */
s_mov_b32 s74, 130                                 // coordOffset0 d0=1 vc0=2
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v83, v4, s74                             // coord0 relative to the start of this workgroup's tile
v_lshlrev_b32 v83, 0x2, v83                        // Bias address scaled by BPE
ds_read_b32 v80, v83 offset:0                      // load Bias
ds_read_b32 v81, v83 offset:1024                   // load scaleAlpha
v_add_lshl_u32 v82, v3, v4, 0x2                    // scaleToBpe: (v3 rowPtrD + coord0) << 2 = byte offset into D
v_cndmask_b32 v82, v8, v82, s[78:79]               // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,0,1,3) */
s_mov_b32 s74, 131                                 // coordOffset0 d0=1 vc0=3
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v87, v4, s74                             // coord0 relative to the start of this workgroup's tile
v_lshlrev_b32 v87, 0x2, v87                        // Bias address scaled by BPE
ds_read_b32 v84, v87 offset:0                      // load Bias
ds_read_b32 v85, v87 offset:1024                   // load scaleAlpha
v_add_lshl_u32 v86, v3, v4, 0x2                    // scaleToBpe: (v3 rowPtrD + coord0) << 2 = byte offset into D
v_cndmask_b32 v86, v8, v86, s[78:79]               // LDD clip if OOB. offset
/*
 * Row vc1=1: advance coord1 (v1) by one and both row pointers (v2, v3) by one row stride, then recompute the
 * in-bounds mask and the clipped output byte offset for coord0 = v0 + {0,1,2,3,128,129,130,131}.
 * Output offsets land in v88, v90, ..., v102; the LDS byte addresses land in v89, v91, ..., v103.
 * NOTE(review): no ds_read is issued for this row in the visible code, so the LDS addresses computed here are
 * not consumed in this section - presumably the Bias/scaleAlpha values loaded for vc1=0 are reused; confirm.
 * s74 is reused as coord0 offset, as low half of the s[74:75] compare mask, and as wgp0*MT0; keep the order.
 */
/* (d1,vc1,d0,vc0)=(0,1,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row (v2 is not read by the offset math in this row)
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[74:75], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v89, v0, s74                             // coord0 relative to the start of this workgroup's tile
v_lshlrev_b32 v89, 0x2, v89                        // Bias address scaled by BPE
v_add_lshl_u32 v88, v3, v0, 0x2                    // scaleToBpe: (v3 rowPtrD + coord0) << 2 = byte offset into D
v_cndmask_b32 v88, v8, v88, s[78:79]               // LDD clip if OOB. offset (mask set: keep v88, else v8)
/* (d1,vc1,d0,vc0)=(0,1,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v91, v4, s74                             // coord0 relative to the start of this workgroup's tile
v_lshlrev_b32 v91, 0x2, v91                        // Bias address scaled by BPE
v_add_lshl_u32 v90, v3, v4, 0x2                    // scaleToBpe: (v3 rowPtrD + coord0) << 2 = byte offset into D
v_cndmask_b32 v90, v8, v90, s[78:79]               // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,1,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v93, v4, s74                             // coord0 relative to the start of this workgroup's tile
v_lshlrev_b32 v93, 0x2, v93                        // Bias address scaled by BPE
v_add_lshl_u32 v92, v3, v4, 0x2                    // scaleToBpe: (v3 rowPtrD + coord0) << 2 = byte offset into D
v_cndmask_b32 v92, v8, v92, s[78:79]               // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,1,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v95, v4, s74                             // coord0 relative to the start of this workgroup's tile
v_lshlrev_b32 v95, 0x2, v95                        // Bias address scaled by BPE
v_add_lshl_u32 v94, v3, v4, 0x2                    // scaleToBpe: (v3 rowPtrD + coord0) << 2 = byte offset into D
v_cndmask_b32 v94, v8, v94, s[78:79]               // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,1,1,0) */
s_mov_b32 s74, 128                                 // coordOffset0 d0=1 vc0=0
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v97, v4, s74                             // coord0 relative to the start of this workgroup's tile
v_lshlrev_b32 v97, 0x2, v97                        // Bias address scaled by BPE
v_add_lshl_u32 v96, v3, v4, 0x2                    // scaleToBpe: (v3 rowPtrD + coord0) << 2 = byte offset into D
v_cndmask_b32 v96, v8, v96, s[78:79]               // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,1,1,1) */
s_mov_b32 s74, 129                                 // coordOffset0 d0=1 vc0=1
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v99, v4, s74                             // coord0 relative to the start of this workgroup's tile
v_lshlrev_b32 v99, 0x2, v99                        // Bias address scaled by BPE
v_add_lshl_u32 v98, v3, v4, 0x2                    // scaleToBpe: (v3 rowPtrD + coord0) << 2 = byte offset into D
v_cndmask_b32 v98, v8, v98, s[78:79]               // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,1,1,2) */
s_mov_b32 s74, 130                                 // coordOffset0 d0=1 vc0=2
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v101, v4, s74                            // coord0 relative to the start of this workgroup's tile
v_lshlrev_b32 v101, 0x2, v101                      // Bias address scaled by BPE
v_add_lshl_u32 v100, v3, v4, 0x2                   // scaleToBpe: (v3 rowPtrD + coord0) << 2 = byte offset into D
v_cndmask_b32 v100, v8, v100, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,1,1,3) */
s_mov_b32 s74, 131                                 // coordOffset0 d0=1 vc0=3
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v103, v4, s74                            // coord0 relative to the start of this workgroup's tile
v_lshlrev_b32 v103, 0x2, v103                      // Bias address scaled by BPE
v_add_lshl_u32 v102, v3, v4, 0x2                   // scaleToBpe: (v3 rowPtrD + coord0) << 2 = byte offset into D
v_cndmask_b32 v102, v8, v102, s[78:79]             // LDD clip if OOB. offset
/*
 * Row vc1=2: advance coord1 (v1) by one and both row pointers (v2, v3) by one row stride, then recompute the
 * in-bounds mask and the clipped output byte offset for coord0 = v0 + {0,1,2,3,128,129,130,131}.
 * Output offsets land in v104, v106, ..., v118; the LDS byte addresses land in v105, v107, ..., v119.
 * NOTE(review): no ds_read is issued for this row in the visible code, so the LDS addresses computed here are
 * not consumed in this section - presumably the Bias/scaleAlpha values loaded for vc1=0 are reused; confirm.
 * s74 is reused as coord0 offset, as low half of the s[74:75] compare mask, and as wgp0*MT0; keep the order.
 */
/* (d1,vc1,d0,vc0)=(0,2,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row (v2 is not read by the offset math in this row)
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[74:75], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v105, v0, s74                            // coord0 relative to the start of this workgroup's tile
v_lshlrev_b32 v105, 0x2, v105                      // Bias address scaled by BPE
v_add_lshl_u32 v104, v3, v0, 0x2                   // scaleToBpe: (v3 rowPtrD + coord0) << 2 = byte offset into D
v_cndmask_b32 v104, v8, v104, s[78:79]             // LDD clip if OOB. offset (mask set: keep v104, else v8)
/* (d1,vc1,d0,vc0)=(0,2,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v107, v4, s74                            // coord0 relative to the start of this workgroup's tile
v_lshlrev_b32 v107, 0x2, v107                      // Bias address scaled by BPE
v_add_lshl_u32 v106, v3, v4, 0x2                   // scaleToBpe: (v3 rowPtrD + coord0) << 2 = byte offset into D
v_cndmask_b32 v106, v8, v106, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,2,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v109, v4, s74                            // coord0 relative to the start of this workgroup's tile
v_lshlrev_b32 v109, 0x2, v109                      // Bias address scaled by BPE
v_add_lshl_u32 v108, v3, v4, 0x2                   // scaleToBpe: (v3 rowPtrD + coord0) << 2 = byte offset into D
v_cndmask_b32 v108, v8, v108, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,2,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v111, v4, s74                            // coord0 relative to the start of this workgroup's tile
v_lshlrev_b32 v111, 0x2, v111                      // Bias address scaled by BPE
v_add_lshl_u32 v110, v3, v4, 0x2                   // scaleToBpe: (v3 rowPtrD + coord0) << 2 = byte offset into D
v_cndmask_b32 v110, v8, v110, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,2,1,0) */
s_mov_b32 s74, 128                                 // coordOffset0 d0=1 vc0=0
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v113, v4, s74                            // coord0 relative to the start of this workgroup's tile
v_lshlrev_b32 v113, 0x2, v113                      // Bias address scaled by BPE
v_add_lshl_u32 v112, v3, v4, 0x2                   // scaleToBpe: (v3 rowPtrD + coord0) << 2 = byte offset into D
v_cndmask_b32 v112, v8, v112, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,2,1,1) */
s_mov_b32 s74, 129                                 // coordOffset0 d0=1 vc0=1
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v115, v4, s74                            // coord0 relative to the start of this workgroup's tile
v_lshlrev_b32 v115, 0x2, v115                      // Bias address scaled by BPE
v_add_lshl_u32 v114, v3, v4, 0x2                   // scaleToBpe: (v3 rowPtrD + coord0) << 2 = byte offset into D
v_cndmask_b32 v114, v8, v114, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,2,1,2) */
s_mov_b32 s74, 130                                 // coordOffset0 d0=1 vc0=2
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v117, v4, s74                            // coord0 relative to the start of this workgroup's tile
v_lshlrev_b32 v117, 0x2, v117                      // Bias address scaled by BPE
v_add_lshl_u32 v116, v3, v4, 0x2                   // scaleToBpe: (v3 rowPtrD + coord0) << 2 = byte offset into D
v_cndmask_b32 v116, v8, v116, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,2,1,3) */
s_mov_b32 s74, 131                                 // coordOffset0 d0=1 vc0=3
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v119, v4, s74                            // coord0 relative to the start of this workgroup's tile
v_lshlrev_b32 v119, 0x2, v119                      // Bias address scaled by BPE
v_add_lshl_u32 v118, v3, v4, 0x2                   // scaleToBpe: (v3 rowPtrD + coord0) << 2 = byte offset into D
v_cndmask_b32 v118, v8, v118, s[78:79]             // LDD clip if OOB. offset
/*
 * Row vc1=3: advance coord1 (v1) by one and both row pointers (v2, v3) by one row stride, then recompute the
 * in-bounds mask and the clipped output byte offset for coord0 = v0 + {0,1,2,3,128,129,130,131}.
 * Output offsets land in v120, v122, ..., v134; the LDS byte addresses land in v121, v123, ..., v135.
 * NOTE(review): no ds_read is issued for this row in the visible code, so the LDS addresses computed here are
 * not consumed in this section - presumably the Bias/scaleAlpha values loaded for vc1=0 are reused; confirm.
 * s74 is reused as coord0 offset, as low half of the s[74:75] compare mask, and as wgp0*MT0; keep the order.
 */
/* (d1,vc1,d0,vc0)=(0,3,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row (v2 is not read by the offset math in this row)
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[74:75], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v121, v0, s74                            // coord0 relative to the start of this workgroup's tile
v_lshlrev_b32 v121, 0x2, v121                      // Bias address scaled by BPE
v_add_lshl_u32 v120, v3, v0, 0x2                   // scaleToBpe: (v3 rowPtrD + coord0) << 2 = byte offset into D
v_cndmask_b32 v120, v8, v120, s[78:79]             // LDD clip if OOB. offset (mask set: keep v120, else v8)
/* (d1,vc1,d0,vc0)=(0,3,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v123, v4, s74                            // coord0 relative to the start of this workgroup's tile
v_lshlrev_b32 v123, 0x2, v123                      // Bias address scaled by BPE
v_add_lshl_u32 v122, v3, v4, 0x2                   // scaleToBpe: (v3 rowPtrD + coord0) << 2 = byte offset into D
v_cndmask_b32 v122, v8, v122, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,3,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v125, v4, s74                            // coord0 relative to the start of this workgroup's tile
v_lshlrev_b32 v125, 0x2, v125                      // Bias address scaled by BPE
v_add_lshl_u32 v124, v3, v4, 0x2                   // scaleToBpe: (v3 rowPtrD + coord0) << 2 = byte offset into D
v_cndmask_b32 v124, v8, v124, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,3,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v127, v4, s74                            // coord0 relative to the start of this workgroup's tile
v_lshlrev_b32 v127, 0x2, v127                      // Bias address scaled by BPE
v_add_lshl_u32 v126, v3, v4, 0x2                   // scaleToBpe: (v3 rowPtrD + coord0) << 2 = byte offset into D
v_cndmask_b32 v126, v8, v126, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,3,1,0) */
s_mov_b32 s74, 128                                 // coordOffset0 d0=1 vc0=0
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v129, v4, s74                            // coord0 relative to the start of this workgroup's tile
v_lshlrev_b32 v129, 0x2, v129                      // Bias address scaled by BPE
v_add_lshl_u32 v128, v3, v4, 0x2                   // scaleToBpe: (v3 rowPtrD + coord0) << 2 = byte offset into D
v_cndmask_b32 v128, v8, v128, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,3,1,1) */
s_mov_b32 s74, 129                                 // coordOffset0 d0=1 vc0=1
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v131, v4, s74                            // coord0 relative to the start of this workgroup's tile
v_lshlrev_b32 v131, 0x2, v131                      // Bias address scaled by BPE
v_add_lshl_u32 v130, v3, v4, 0x2                   // scaleToBpe: (v3 rowPtrD + coord0) << 2 = byte offset into D
v_cndmask_b32 v130, v8, v130, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,3,1,2) */
s_mov_b32 s74, 130                                 // coordOffset0 d0=1 vc0=2
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v133, v4, s74                            // coord0 relative to the start of this workgroup's tile
v_lshlrev_b32 v133, 0x2, v133                      // Bias address scaled by BPE
v_add_lshl_u32 v132, v3, v4, 0x2                   // scaleToBpe: (v3 rowPtrD + coord0) << 2 = byte offset into D
v_cndmask_b32 v132, v8, v132, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,3,1,3) */
s_mov_b32 s74, 131                                 // coordOffset0 d0=1 vc0=3
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v135, v4, s74                            // coord0 relative to the start of this workgroup's tile
v_lshlrev_b32 v135, 0x2, v135                      // Bias address scaled by BPE
v_add_lshl_u32 v134, v3, v4, 0x2                   // scaleToBpe: (v3 rowPtrD + coord0) << 2 = byte offset into D
v_cndmask_b32 v134, v8, v134, s[78:79]             // LDD clip if OOB. offset
/*
 * Row vc1=4: advance coord1 (v1) by one and both row pointers (v2, v3) by one row stride, then recompute the
 * in-bounds mask and the clipped output byte offset for coord0 = v0 + {0,1,2,3,128,129,130,131}.
 * Output offsets land in v136, v138, v140, v142, v144, v149, v151, v153; LDS byte addresses in the register
 * numbered one above each. The numbering is not contiguous: v146-v148 are not written by this row.
 * NOTE(review): no ds_read is issued for this row in the visible code, so the LDS addresses computed here are
 * not consumed in this section - presumably the Bias/scaleAlpha values loaded for vc1=0 are reused; confirm.
 * s74 is reused as coord0 offset, as low half of the s[74:75] compare mask, and as wgp0*MT0; keep the order.
 */
/* (d1,vc1,d0,vc0)=(0,4,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row (v2 is not read by the offset math in this row)
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[74:75], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v137, v0, s74                            // coord0 relative to the start of this workgroup's tile
v_lshlrev_b32 v137, 0x2, v137                      // Bias address scaled by BPE
v_add_lshl_u32 v136, v3, v0, 0x2                   // scaleToBpe: (v3 rowPtrD + coord0) << 2 = byte offset into D
v_cndmask_b32 v136, v8, v136, s[78:79]             // LDD clip if OOB. offset (mask set: keep v136, else v8)
/* (d1,vc1,d0,vc0)=(0,4,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v139, v4, s74                            // coord0 relative to the start of this workgroup's tile
v_lshlrev_b32 v139, 0x2, v139                      // Bias address scaled by BPE
v_add_lshl_u32 v138, v3, v4, 0x2                   // scaleToBpe: (v3 rowPtrD + coord0) << 2 = byte offset into D
v_cndmask_b32 v138, v8, v138, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,4,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v141, v4, s74                            // coord0 relative to the start of this workgroup's tile
v_lshlrev_b32 v141, 0x2, v141                      // Bias address scaled by BPE
v_add_lshl_u32 v140, v3, v4, 0x2                   // scaleToBpe: (v3 rowPtrD + coord0) << 2 = byte offset into D
v_cndmask_b32 v140, v8, v140, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,4,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v143, v4, s74                            // coord0 relative to the start of this workgroup's tile
v_lshlrev_b32 v143, 0x2, v143                      // Bias address scaled by BPE
v_add_lshl_u32 v142, v3, v4, 0x2                   // scaleToBpe: (v3 rowPtrD + coord0) << 2 = byte offset into D
v_cndmask_b32 v142, v8, v142, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,4,1,0) */
s_mov_b32 s74, 128                                 // coordOffset0 d0=1 vc0=0
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v145, v4, s74                            // coord0 relative to the start of this workgroup's tile
v_lshlrev_b32 v145, 0x2, v145                      // Bias address scaled by BPE
v_add_lshl_u32 v144, v3, v4, 0x2                   // scaleToBpe: (v3 rowPtrD + coord0) << 2 = byte offset into D
v_cndmask_b32 v144, v8, v144, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,4,1,1) */
s_mov_b32 s74, 129                                 // coordOffset0 d0=1 vc0=1
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v150, v4, s74                            // coord0 relative to the start of this workgroup's tile
v_lshlrev_b32 v150, 0x2, v150                      // Bias address scaled by BPE
v_add_lshl_u32 v149, v3, v4, 0x2                   // scaleToBpe: (v3 rowPtrD + coord0) << 2 = byte offset into D
v_cndmask_b32 v149, v8, v149, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,4,1,2) */
s_mov_b32 s74, 130                                 // coordOffset0 d0=1 vc0=2
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v152, v4, s74                            // coord0 relative to the start of this workgroup's tile
v_lshlrev_b32 v152, 0x2, v152                      // Bias address scaled by BPE
v_add_lshl_u32 v151, v3, v4, 0x2                   // scaleToBpe: (v3 rowPtrD + coord0) << 2 = byte offset into D
v_cndmask_b32 v151, v8, v151, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,4,1,3) */
s_mov_b32 s74, 131                                 // coordOffset0 d0=1 vc0=3
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v154, v4, s74                            // coord0 relative to the start of this workgroup's tile
v_lshlrev_b32 v154, 0x2, v154                      // Bias address scaled by BPE
v_add_lshl_u32 v153, v3, v4, 0x2                   // scaleToBpe: (v3 rowPtrD + coord0) << 2 = byte offset into D
v_cndmask_b32 v153, v8, v153, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,5,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[74:75], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v156, v0, s74
v_lshlrev_b32 v156, 0x2, v156                      // Bias address scaled by BPE
v_add_lshl_u32 v155, v3, v0, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v155, v8, v155, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,5,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v158, v4, s74
v_lshlrev_b32 v158, 0x2, v158                      // Bias address scaled by BPE
v_add_lshl_u32 v157, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v157, v8, v157, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,5,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v160, v4, s74
v_lshlrev_b32 v160, 0x2, v160                      // Bias address scaled by BPE
v_add_lshl_u32 v159, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v159, v8, v159, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,5,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v162, v4, s74
v_lshlrev_b32 v162, 0x2, v162                      // Bias address scaled by BPE
v_add_lshl_u32 v161, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v161, v8, v161, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,5,1,0) */
s_mov_b32 s74, 128                                 // coordOffset0 d0=1 vc0=0
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v164, v4, s74
v_lshlrev_b32 v164, 0x2, v164                      // Bias address scaled by BPE
v_add_lshl_u32 v163, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v163, v8, v163, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,5,1,1) */
s_mov_b32 s74, 129                                 // coordOffset0 d0=1 vc0=1
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v166, v4, s74
v_lshlrev_b32 v166, 0x2, v166                      // Bias address scaled by BPE
v_add_lshl_u32 v165, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v165, v8, v165, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,5,1,2) */
s_mov_b32 s74, 130                                 // coordOffset0 d0=1 vc0=2
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v168, v4, s74
v_lshlrev_b32 v168, 0x2, v168                      // Bias address scaled by BPE
v_add_lshl_u32 v167, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v167, v8, v167, s[78:79]             // LDD clip if OOB. offset
/*
 * Edge batch #0: copy the batch's 47 accumulator values out of the AccVGPR
 * file into v[vgprValuC+9 .. vgprValuC+55] so that ordinary VALU instructions
 * and the buffer stores below can use them.
 *
 * Source order is strided: acc0, acc4, ... acc124 (32 values), then
 * acc1, acc5, ... acc57 (15 values). Destinations are consecutive.
 */
v_accvgpr_read_b32 v[vgprValuC+9], acc0            // copy acc to vreg[0]
v_accvgpr_read_b32 v[vgprValuC+10], acc4           // copy acc to vreg[1]
v_accvgpr_read_b32 v[vgprValuC+11], acc8           // copy acc to vreg[2]
v_accvgpr_read_b32 v[vgprValuC+12], acc12          // copy acc to vreg[3]
v_accvgpr_read_b32 v[vgprValuC+13], acc16          // copy acc to vreg[4]
v_accvgpr_read_b32 v[vgprValuC+14], acc20          // copy acc to vreg[5]
v_accvgpr_read_b32 v[vgprValuC+15], acc24          // copy acc to vreg[6]
v_accvgpr_read_b32 v[vgprValuC+16], acc28          // copy acc to vreg[7]
v_accvgpr_read_b32 v[vgprValuC+17], acc32          // copy acc to vreg[8]
v_accvgpr_read_b32 v[vgprValuC+18], acc36          // copy acc to vreg[9]
v_accvgpr_read_b32 v[vgprValuC+19], acc40          // copy acc to vreg[10]
v_accvgpr_read_b32 v[vgprValuC+20], acc44          // copy acc to vreg[11]
v_accvgpr_read_b32 v[vgprValuC+21], acc48          // copy acc to vreg[12]
v_accvgpr_read_b32 v[vgprValuC+22], acc52          // copy acc to vreg[13]
v_accvgpr_read_b32 v[vgprValuC+23], acc56          // copy acc to vreg[14]
v_accvgpr_read_b32 v[vgprValuC+24], acc60          // copy acc to vreg[15]
v_accvgpr_read_b32 v[vgprValuC+25], acc64          // copy acc to vreg[16]
v_accvgpr_read_b32 v[vgprValuC+26], acc68          // copy acc to vreg[17]
v_accvgpr_read_b32 v[vgprValuC+27], acc72          // copy acc to vreg[18]
v_accvgpr_read_b32 v[vgprValuC+28], acc76          // copy acc to vreg[19]
v_accvgpr_read_b32 v[vgprValuC+29], acc80          // copy acc to vreg[20]
v_accvgpr_read_b32 v[vgprValuC+30], acc84          // copy acc to vreg[21]
v_accvgpr_read_b32 v[vgprValuC+31], acc88          // copy acc to vreg[22]
v_accvgpr_read_b32 v[vgprValuC+32], acc92          // copy acc to vreg[23]
v_accvgpr_read_b32 v[vgprValuC+33], acc96          // copy acc to vreg[24]
v_accvgpr_read_b32 v[vgprValuC+34], acc100         // copy acc to vreg[25]
v_accvgpr_read_b32 v[vgprValuC+35], acc104         // copy acc to vreg[26]
v_accvgpr_read_b32 v[vgprValuC+36], acc108         // copy acc to vreg[27]
v_accvgpr_read_b32 v[vgprValuC+37], acc112         // copy acc to vreg[28]
v_accvgpr_read_b32 v[vgprValuC+38], acc116         // copy acc to vreg[29]
v_accvgpr_read_b32 v[vgprValuC+39], acc120         // copy acc to vreg[30]
v_accvgpr_read_b32 v[vgprValuC+40], acc124         // copy acc to vreg[31]
v_accvgpr_read_b32 v[vgprValuC+41], acc1           // copy acc to vreg[32]
v_accvgpr_read_b32 v[vgprValuC+42], acc5           // copy acc to vreg[33]
v_accvgpr_read_b32 v[vgprValuC+43], acc9           // copy acc to vreg[34]
v_accvgpr_read_b32 v[vgprValuC+44], acc13          // copy acc to vreg[35]
v_accvgpr_read_b32 v[vgprValuC+45], acc17          // copy acc to vreg[36]
v_accvgpr_read_b32 v[vgprValuC+46], acc21          // copy acc to vreg[37]
v_accvgpr_read_b32 v[vgprValuC+47], acc25          // copy acc to vreg[38]
v_accvgpr_read_b32 v[vgprValuC+48], acc29          // copy acc to vreg[39]
v_accvgpr_read_b32 v[vgprValuC+49], acc33          // copy acc to vreg[40]
v_accvgpr_read_b32 v[vgprValuC+50], acc37          // copy acc to vreg[41]
v_accvgpr_read_b32 v[vgprValuC+51], acc41          // copy acc to vreg[42]
v_accvgpr_read_b32 v[vgprValuC+52], acc45          // copy acc to vreg[43]
v_accvgpr_read_b32 v[vgprValuC+53], acc49          // copy acc to vreg[44]
v_accvgpr_read_b32 v[vgprValuC+54], acc53          // copy acc to vreg[45]
v_accvgpr_read_b32 v[vgprValuC+55], acc57          // copy acc to vreg[46]

/*
 * Edge batch #0: multiply all 47 values in v[vgprValuC+9 .. vgprValuC+55] by
 * the scalar alpha.
 *
 * The first value (odd start index) uses a plain v_mul_f32. The remaining 46
 * are done two at a time with v_pk_mul_f32. op_sel_hi:[0,1,1] makes the upper
 * lane read the LOW dword of the s[sgprAlpha:sgprAlpha+1] operand, so both
 * lanes are multiplied by s[sgprAlpha]; the second SGPR of the pair is only
 * named to satisfy the 64-bit operand form.
 *
 * The trailing s_waitcnt lgkmcnt(0) drains outstanding LDS/scalar-memory
 * operations before the bias / scaleAlpha registers are consumed by the code
 * that follows.
 *
 * The tuples in the generated list below are in (d1, d0, vc1, vc0) order.
 */
/* rC *= alpha batchElements=[(0, 0, 0, 0), (0, 0, 0, 1), (0, 0, 0, 2), (0, 0, 0, 3), (0, 1, 0, 0), (0, 1, 0, 1), (0, 1, 0, 2), (0, 1, 0, 3), (0, 0, 1, 0), (0, 0, 1, 1), (0, 0, 1, 2), (0, 0, 1, 3), (0, 1, 1, 0), (0, 1, 1, 1), (0, 1, 1, 2), (0, 1, 1, 3), (0, 0, 2, 0), (0, 0, 2, 1), (0, 0, 2, 2), (0, 0, 2, 3), (0, 1, 2, 0), (0, 1, 2, 1), (0, 1, 2, 2), (0, 1, 2, 3), (0, 0, 3, 0), (0, 0, 3, 1), (0, 0, 3, 2), (0, 0, 3, 3), (0, 1, 3, 0), (0, 1, 3, 1), (0, 1, 3, 2), (0, 1, 3, 3), (0, 0, 4, 0), (0, 0, 4, 1), (0, 0, 4, 2), (0, 0, 4, 3), (0, 1, 4, 0), (0, 1, 4, 1), (0, 1, 4, 2), (0, 1, 4, 3), (0, 0, 5, 0), (0, 0, 5, 1), (0, 0, 5, 2), (0, 0, 5, 3), (0, 1, 5, 0), (0, 1, 5, 1), (0, 1, 5, 2)] */
v_mul_f32 v[vgprValuC+9], s[sgprAlpha], v[vgprValuC+9] // *= alpha
v_pk_mul_f32 v[vgprValuC+10:vgprValuC+10+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+10:vgprValuC+10+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+12:vgprValuC+12+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+12:vgprValuC+12+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+14:vgprValuC+14+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+14:vgprValuC+14+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+16:vgprValuC+16+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+16:vgprValuC+16+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+18:vgprValuC+18+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+18:vgprValuC+18+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+20:vgprValuC+20+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+20:vgprValuC+20+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+22:vgprValuC+22+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+22:vgprValuC+22+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] op_sel_hi:[0,1,1] // *= alpha (pk)
s_waitcnt lgkmcnt(0)                               // wait for Bias LDS, ScaleAlphaVec

/*
 * Edge batch #0: per-element epilogue and store for all 47 elements.
 *
 * Each element is five instructions:
 *   v_mul_f32   ValuC[i] *= scale register  (v57, v61, v65, v69, v73, v77, v81, v85)
 *   v_add_f32   v4 = bias register + ValuC[i] (v56, v60, v64, v68, v72, v76, v80, v84)
 *   s_swappc_b64 s[64:65], s[8:9]   jump to the address in s[8:9]; the return
 *                                   address is saved in s[64:65]
 *   v_mov_b32   copy v4 back into the element's register
 *   buffer_store_dword ... offen    write one dword to D at the per-element
 *                                   byte offset VGPR prepared by the address
 *                                   setup code
 *
 * The bias / scale register pair repeats with period 8, i.e. elements in the
 * same column of successive rows reuse the same pair.
 *
 * NOTE(review): the routine reached through s[8:9] is not visible in this
 * chunk; presumably it is the activation function, taking its input and
 * returning its result in v4 - confirm against where s[8:9] is set up.
 * NOTE(review): the v_mov / store lines use literal v9..v55 while the mul/add
 * lines use v[vgprValuC+9..55]; the two name the same registers only if
 * vgprValuC == 0 (defined outside this chunk).
 * The offset VGPRs are v58, v62, ... v86, then v88, v90, ... v144, then
 * v149, v151, ... v167.
 */
/* apply mask, calc new C and issue writes */
v_mul_f32 v[vgprValuC+9], v57, v[vgprValuC+9]      // *= ScaleAlphaVecVMul
v_add_f32 v4, v56, v[vgprValuC+9]                  // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v9, v4
buffer_store_dword v9, v58, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+10], v61, v[vgprValuC+10]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v60, v[vgprValuC+10]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v10, v4
buffer_store_dword v10, v62, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+11], v65, v[vgprValuC+11]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v64, v[vgprValuC+11]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v11, v4
buffer_store_dword v11, v66, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+12], v69, v[vgprValuC+12]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v68, v[vgprValuC+12]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v12, v4
buffer_store_dword v12, v70, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+13], v73, v[vgprValuC+13]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v72, v[vgprValuC+13]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v13, v4
buffer_store_dword v13, v74, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+14], v77, v[vgprValuC+14]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v76, v[vgprValuC+14]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v14, v4
buffer_store_dword v14, v78, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+15], v81, v[vgprValuC+15]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v80, v[vgprValuC+15]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v15, v4
buffer_store_dword v15, v82, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+16], v85, v[vgprValuC+16]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v84, v[vgprValuC+16]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v16, v4
buffer_store_dword v16, v86, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+17], v57, v[vgprValuC+17]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v56, v[vgprValuC+17]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v17, v4
buffer_store_dword v17, v88, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+18], v61, v[vgprValuC+18]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v60, v[vgprValuC+18]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v18, v4
buffer_store_dword v18, v90, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+19], v65, v[vgprValuC+19]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v64, v[vgprValuC+19]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v19, v4
buffer_store_dword v19, v92, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+20], v69, v[vgprValuC+20]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v68, v[vgprValuC+20]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v20, v4
buffer_store_dword v20, v94, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+21], v73, v[vgprValuC+21]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v72, v[vgprValuC+21]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v21, v4
buffer_store_dword v21, v96, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+22], v77, v[vgprValuC+22]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v76, v[vgprValuC+22]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v22, v4
buffer_store_dword v22, v98, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+23], v81, v[vgprValuC+23]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v80, v[vgprValuC+23]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v23, v4
buffer_store_dword v23, v100, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+24], v85, v[vgprValuC+24]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v84, v[vgprValuC+24]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v24, v4
buffer_store_dword v24, v102, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+25], v57, v[vgprValuC+25]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v56, v[vgprValuC+25]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v25, v4
buffer_store_dword v25, v104, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+26], v61, v[vgprValuC+26]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v60, v[vgprValuC+26]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v26, v4
buffer_store_dword v26, v106, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+27], v65, v[vgprValuC+27]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v64, v[vgprValuC+27]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v27, v4
buffer_store_dword v27, v108, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+28], v69, v[vgprValuC+28]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v68, v[vgprValuC+28]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v28, v4
buffer_store_dword v28, v110, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+29], v73, v[vgprValuC+29]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v72, v[vgprValuC+29]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v29, v4
buffer_store_dword v29, v112, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+30], v77, v[vgprValuC+30]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v76, v[vgprValuC+30]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v30, v4
buffer_store_dword v30, v114, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+31], v81, v[vgprValuC+31]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v80, v[vgprValuC+31]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v31, v4
buffer_store_dword v31, v116, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+32], v85, v[vgprValuC+32]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v84, v[vgprValuC+32]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v32, v4
buffer_store_dword v32, v118, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+33], v57, v[vgprValuC+33]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v56, v[vgprValuC+33]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v33, v4
buffer_store_dword v33, v120, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+34], v61, v[vgprValuC+34]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v60, v[vgprValuC+34]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v34, v4
buffer_store_dword v34, v122, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+35], v65, v[vgprValuC+35]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v64, v[vgprValuC+35]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v35, v4
buffer_store_dword v35, v124, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+36], v69, v[vgprValuC+36]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v68, v[vgprValuC+36]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v36, v4
buffer_store_dword v36, v126, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+37], v73, v[vgprValuC+37]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v72, v[vgprValuC+37]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v37, v4
buffer_store_dword v37, v128, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+38], v77, v[vgprValuC+38]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v76, v[vgprValuC+38]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v38, v4
buffer_store_dword v38, v130, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+39], v81, v[vgprValuC+39]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v80, v[vgprValuC+39]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v39, v4
buffer_store_dword v39, v132, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+40], v85, v[vgprValuC+40]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v84, v[vgprValuC+40]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v40, v4
buffer_store_dword v40, v134, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+41], v57, v[vgprValuC+41]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v56, v[vgprValuC+41]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v41, v4
buffer_store_dword v41, v136, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+42], v61, v[vgprValuC+42]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v60, v[vgprValuC+42]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v42, v4
buffer_store_dword v42, v138, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+43], v65, v[vgprValuC+43]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v64, v[vgprValuC+43]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v43, v4
buffer_store_dword v43, v140, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+44], v69, v[vgprValuC+44]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v68, v[vgprValuC+44]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v44, v4
buffer_store_dword v44, v142, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+45], v73, v[vgprValuC+45]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v72, v[vgprValuC+45]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v45, v4
buffer_store_dword v45, v144, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+46], v77, v[vgprValuC+46]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v76, v[vgprValuC+46]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v46, v4
buffer_store_dword v46, v149, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+47], v81, v[vgprValuC+47]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v80, v[vgprValuC+47]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v47, v4
buffer_store_dword v47, v151, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+48], v85, v[vgprValuC+48]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v84, v[vgprValuC+48]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v48, v4
buffer_store_dword v48, v153, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+49], v57, v[vgprValuC+49]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v56, v[vgprValuC+49]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v49, v4
buffer_store_dword v49, v155, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+50], v61, v[vgprValuC+50]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v60, v[vgprValuC+50]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v50, v4
buffer_store_dword v50, v157, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+51], v65, v[vgprValuC+51]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v64, v[vgprValuC+51]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v51, v4
buffer_store_dword v51, v159, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+52], v69, v[vgprValuC+52]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v68, v[vgprValuC+52]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v52, v4
buffer_store_dword v52, v161, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+53], v73, v[vgprValuC+53]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v72, v[vgprValuC+53]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v53, v4
buffer_store_dword v53, v163, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+54], v77, v[vgprValuC+54]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v76, v[vgprValuC+54]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v54, v4
buffer_store_dword v54, v165, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+55], v81, v[vgprValuC+55]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v80, v[vgprValuC+55]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v55, v4
buffer_store_dword v55, v167, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
s_nop 0                                            // 1 wait state before the next inst may overwrite VGPRs still held by the preceding buffer store (stores in this batch are single-dword, not dwordx4)
/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */

/******************************************/
/* Global Write Edge Batch #1 (d1,d0,vc1,vc0) = */
/*    (0,1,5,3:vw1); (0,0,6,0:vw1); (0,0,6,1:vw1); (0,0,6,2:vw1); (0,0,6,3:vw1); (0,1,6,0:vw1); (0,1,6,1:vw1); (0,1,6,2:vw1); (0,1,6,3:vw1); (0,0,7,0:vw1); (0,0,7,1:vw1); (0,0,7,2:vw1); (0,0,7,3:vw1); (0,1,7,0:vw1); (0,1,7,1:vw1); (0,1,7,2:vw1); (0,1,7,3:vw1); (0,0,8,0:vw1); (0,0,8,1:vw1); (0,0,8,2:vw1); (0,0,8,3:vw1); (0,1,8,0:vw1); (0,1,8,1:vw1); (0,1,8,2:vw1); (0,1,8,3:vw1); (0,0,9,0:vw1); (0,0,9,1:vw1); (0,0,9,2:vw1); (0,0,9,3:vw1); (0,1,9,0:vw1); (0,1,9,1:vw1); (0,1,9,2:vw1); (0,1,9,3:vw1); (0,0,10,0:vw1); (0,0,10,1:vw1); (0,0,10,2:vw1); (0,0,10,3:vw1); (0,1,10,0:vw1); (0,1,10,1:vw1); (0,1,10,2:vw1); (0,1,10,3:vw1); (0,0,11,0:vw1); (0,0,11,1:vw1); (0,0,11,2:vw1); (0,0,11,3:vw1); (0,1,11,0:vw1); (0,1,11,1:vw1) */
/******************************************/

/*
 * Address setup for the first nine elements of batch #1. Note that the banner
 * above lists tuples as (d1,d0,vc1,vc0) while the per-element markers below
 * use (d1,vc1,d0,vc0).
 *
 * Per element:
 *   - column = v0 + offset (0..3, or 128..131 staged through s74); row = v1.
 *   - s[78:79] = (column < SizeI) & (row < SizeJ), the per-lane in-bounds mask.
 *   - s74 is then reused for 256 * WorkGroup0; (column - s74) << 2 is the
 *     byte offset used to read the bias (offset:0) and the scaleAlpha value
 *     (offset:1024) from LDS. These LDS reads are issued for every lane; the
 *     bounds mask is not applied to them.
 *   - (v3 + column) << 2 is the D store offset; v_cndmask_b32 replaces it
 *     with v8 (= BufferOOB, set at the top of this block) for out-of-bounds
 *     lanes.
 *
 * The first eight elements cover the eight distinct column offsets
 * (131, 0, 1, 2, 3, 128, 129, 130) and load into v56/v57 .. v84/v85. The
 * ninth element, (0,6,1,3), again has column offset 131: it still computes an
 * LDS offset (v89) but issues no ds_read.
 *
 * s74 is written up to three times per element; each value is consumed before
 * the next write, so the order inside an element must be preserved.
 */
/* calc coords, apply mask, and issue loads (if necessary) */
v_mov_b32 v8, BufferOOB
/* (d1,vc1,d0,vc0)=(0,5,1,3) */
s_mov_b32 s74, 131                                 // coordOffset0 d0=1 vc0=3
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v59, v4, s74
v_lshlrev_b32 v59, 0x2, v59                        // Bias address scaled by BPE
ds_read_b32 v56, v59 offset:0                      // load Bias
ds_read_b32 v57, v59 offset:1024                   // load scaleAlpha
v_add_lshl_u32 v58, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v58, v8, v58, s[78:79]               // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,6,0,0) */
/* First element of a new row: advance the row coordinate and both row pointers. */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[74:75], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v63, v0, s74
v_lshlrev_b32 v63, 0x2, v63                        // Bias address scaled by BPE
ds_read_b32 v60, v63 offset:0                      // load Bias
ds_read_b32 v61, v63 offset:1024                   // load scaleAlpha
v_add_lshl_u32 v62, v3, v0, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v62, v8, v62, s[78:79]               // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,6,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v67, v4, s74
v_lshlrev_b32 v67, 0x2, v67                        // Bias address scaled by BPE
ds_read_b32 v64, v67 offset:0                      // load Bias
ds_read_b32 v65, v67 offset:1024                   // load scaleAlpha
v_add_lshl_u32 v66, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v66, v8, v66, s[78:79]               // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,6,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v71, v4, s74
v_lshlrev_b32 v71, 0x2, v71                        // Bias address scaled by BPE
ds_read_b32 v68, v71 offset:0                      // load Bias
ds_read_b32 v69, v71 offset:1024                   // load scaleAlpha
v_add_lshl_u32 v70, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v70, v8, v70, s[78:79]               // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,6,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v75, v4, s74
v_lshlrev_b32 v75, 0x2, v75                        // Bias address scaled by BPE
ds_read_b32 v72, v75 offset:0                      // load Bias
ds_read_b32 v73, v75 offset:1024                   // load scaleAlpha
v_add_lshl_u32 v74, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v74, v8, v74, s[78:79]               // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,6,1,0) */
s_mov_b32 s74, 128                                 // coordOffset0 d0=1 vc0=0
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v79, v4, s74
v_lshlrev_b32 v79, 0x2, v79                        // Bias address scaled by BPE
ds_read_b32 v76, v79 offset:0                      // load Bias
ds_read_b32 v77, v79 offset:1024                   // load scaleAlpha
v_add_lshl_u32 v78, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v78, v8, v78, s[78:79]               // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,6,1,1) */
s_mov_b32 s74, 129                                 // coordOffset0 d0=1 vc0=1
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v83, v4, s74
v_lshlrev_b32 v83, 0x2, v83                        // Bias address scaled by BPE
ds_read_b32 v80, v83 offset:0                      // load Bias
ds_read_b32 v81, v83 offset:1024                   // load scaleAlpha
v_add_lshl_u32 v82, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v82, v8, v82, s[78:79]               // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,6,1,2) */
s_mov_b32 s74, 130                                 // coordOffset0 d0=1 vc0=2
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v87, v4, s74
v_lshlrev_b32 v87, 0x2, v87                        // Bias address scaled by BPE
ds_read_b32 v84, v87 offset:0                      // load Bias
ds_read_b32 v85, v87 offset:1024                   // load scaleAlpha
v_add_lshl_u32 v86, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v86, v8, v86, s[78:79]               // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,6,1,3) */
/* Column offset 131 was already loaded into v56/v57 above; no ds_read here. */
s_mov_b32 s74, 131                                 // coordOffset0 d0=1 vc0=3
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v89, v4, s74
v_lshlrev_b32 v89, 0x2, v89                        // Bias address scaled by BPE
v_add_lshl_u32 v88, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v88, v8, v88, s[78:79]               // LDD clip if OOB. offset
/*
 * Address setup for output row vc1=7: 8 elements (d0 in {0,1} x vc0 in 0..3).
 * Row advance (once, before the first element):
 *   v1 += 1 (coord1), v2 += StrideC1J, v3 += StrideD1J.
 * Per element:
 *   coord0   = v0 for (d0=0,vc0=0), otherwise v4 = v0 + {1,2,3,128,129,130,131}
 *   s[78:79] = (coord0 < SizeI) && (v1 < SizeJ)    (per-lane in-bounds mask)
 *   biasAddr = (coord0 - 256*WorkGroup0) << 2      (v91, v93, ..., v105)
 *   outAddr  = s[78:79] ? (v3 + coord0) << 2 : v8  (v90, v92, ..., v104)
 * The inline "Cin addr" wording is the generator's; the base register used is
 * v3, which the row-advance comment labels coutRowPtrD.
 * No LDS reads are issued in this row; biasAddr is computed only.
 * NOTE(review): presumably these offsets match ones already loaded for an
 * earlier row of the batch - confirm.
 * NOTE(review): v8 is the value kept for out-of-bounds lanes; presumably an
 * out-of-range offset so the later access is clipped - confirm where v8 is set.
 * Scratch: s74 is reused (offset constant, low half of the s[74:75] compare
 * mask, then 256*WorkGroup0); vcc is overwritten by every v_add_co_u32.
 */
/* (d1,vc1,d0,vc0)=(0,7,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[74:75], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v91, v0, s74
v_lshlrev_b32 v91, 0x2, v91                        // Bias address scaled by BPE
v_add_lshl_u32 v90, v3, v0, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v90, v8, v90, s[78:79]               // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,7,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v93, v4, s74
v_lshlrev_b32 v93, 0x2, v93                        // Bias address scaled by BPE
v_add_lshl_u32 v92, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v92, v8, v92, s[78:79]               // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,7,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v95, v4, s74
v_lshlrev_b32 v95, 0x2, v95                        // Bias address scaled by BPE
v_add_lshl_u32 v94, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v94, v8, v94, s[78:79]               // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,7,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v97, v4, s74
v_lshlrev_b32 v97, 0x2, v97                        // Bias address scaled by BPE
v_add_lshl_u32 v96, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v96, v8, v96, s[78:79]               // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,7,1,0) */
s_mov_b32 s74, 128                                 // coordOffset0 d0=1 vc0=0
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v99, v4, s74
v_lshlrev_b32 v99, 0x2, v99                        // Bias address scaled by BPE
v_add_lshl_u32 v98, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v98, v8, v98, s[78:79]               // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,7,1,1) */
s_mov_b32 s74, 129                                 // coordOffset0 d0=1 vc0=1
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v101, v4, s74
v_lshlrev_b32 v101, 0x2, v101                      // Bias address scaled by BPE
v_add_lshl_u32 v100, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v100, v8, v100, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,7,1,2) */
s_mov_b32 s74, 130                                 // coordOffset0 d0=1 vc0=2
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v103, v4, s74
v_lshlrev_b32 v103, 0x2, v103                      // Bias address scaled by BPE
v_add_lshl_u32 v102, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v102, v8, v102, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,7,1,3) */
s_mov_b32 s74, 131                                 // coordOffset0 d0=1 vc0=3
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v105, v4, s74
v_lshlrev_b32 v105, 0x2, v105                      // Bias address scaled by BPE
v_add_lshl_u32 v104, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v104, v8, v104, s[78:79]             // LDD clip if OOB. offset
/*
 * Address setup for output row vc1=8: 8 elements (d0 in {0,1} x vc0 in 0..3).
 * Row advance (once, before the first element):
 *   v1 += 1 (coord1), v2 += StrideC1J, v3 += StrideD1J.
 * Per element:
 *   coord0   = v0 for (d0=0,vc0=0), otherwise v4 = v0 + {1,2,3,128,129,130,131}
 *   s[78:79] = (coord0 < SizeI) && (v1 < SizeJ)    (per-lane in-bounds mask)
 *   biasAddr = (coord0 - 256*WorkGroup0) << 2      (v107, v109, ..., v121)
 *   outAddr  = s[78:79] ? (v3 + coord0) << 2 : v8  (v106, v108, ..., v120)
 * No LDS reads are issued in this row; biasAddr is computed only.
 * NOTE(review): v8 is the value kept for out-of-bounds lanes; presumably an
 * out-of-range offset so the later access is clipped - confirm where v8 is set.
 * Scratch: s74 is reused (offset constant, low half of the s[74:75] compare
 * mask, then 256*WorkGroup0); vcc is overwritten by every v_add_co_u32.
 */
/* (d1,vc1,d0,vc0)=(0,8,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[74:75], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v107, v0, s74
v_lshlrev_b32 v107, 0x2, v107                      // Bias address scaled by BPE
v_add_lshl_u32 v106, v3, v0, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v106, v8, v106, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,8,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v109, v4, s74
v_lshlrev_b32 v109, 0x2, v109                      // Bias address scaled by BPE
v_add_lshl_u32 v108, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v108, v8, v108, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,8,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v111, v4, s74
v_lshlrev_b32 v111, 0x2, v111                      // Bias address scaled by BPE
v_add_lshl_u32 v110, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v110, v8, v110, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,8,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v113, v4, s74
v_lshlrev_b32 v113, 0x2, v113                      // Bias address scaled by BPE
v_add_lshl_u32 v112, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v112, v8, v112, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,8,1,0) */
s_mov_b32 s74, 128                                 // coordOffset0 d0=1 vc0=0
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v115, v4, s74
v_lshlrev_b32 v115, 0x2, v115                      // Bias address scaled by BPE
v_add_lshl_u32 v114, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v114, v8, v114, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,8,1,1) */
s_mov_b32 s74, 129                                 // coordOffset0 d0=1 vc0=1
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v117, v4, s74
v_lshlrev_b32 v117, 0x2, v117                      // Bias address scaled by BPE
v_add_lshl_u32 v116, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v116, v8, v116, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,8,1,2) */
s_mov_b32 s74, 130                                 // coordOffset0 d0=1 vc0=2
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v119, v4, s74
v_lshlrev_b32 v119, 0x2, v119                      // Bias address scaled by BPE
v_add_lshl_u32 v118, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v118, v8, v118, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,8,1,3) */
s_mov_b32 s74, 131                                 // coordOffset0 d0=1 vc0=3
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v121, v4, s74
v_lshlrev_b32 v121, 0x2, v121                      // Bias address scaled by BPE
v_add_lshl_u32 v120, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v120, v8, v120, s[78:79]             // LDD clip if OOB. offset
/*
 * Address setup for output row vc1=9: 8 elements (d0 in {0,1} x vc0 in 0..3).
 * Row advance (once, before the first element):
 *   v1 += 1 (coord1), v2 += StrideC1J, v3 += StrideD1J.
 * Per element:
 *   coord0   = v0 for (d0=0,vc0=0), otherwise v4 = v0 + {1,2,3,128,129,130,131}
 *   s[78:79] = (coord0 < SizeI) && (v1 < SizeJ)    (per-lane in-bounds mask)
 *   biasAddr = (coord0 - 256*WorkGroup0) << 2      (v123, v125, ..., v137)
 *   outAddr  = s[78:79] ? (v3 + coord0) << 2 : v8  (v122, v124, ..., v136)
 * No LDS reads are issued in this row; biasAddr is computed only.
 * NOTE(review): v8 is the value kept for out-of-bounds lanes; presumably an
 * out-of-range offset so the later access is clipped - confirm where v8 is set.
 * Scratch: s74 is reused (offset constant, low half of the s[74:75] compare
 * mask, then 256*WorkGroup0); vcc is overwritten by every v_add_co_u32.
 */
/* (d1,vc1,d0,vc0)=(0,9,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[74:75], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v123, v0, s74
v_lshlrev_b32 v123, 0x2, v123                      // Bias address scaled by BPE
v_add_lshl_u32 v122, v3, v0, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v122, v8, v122, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,9,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v125, v4, s74
v_lshlrev_b32 v125, 0x2, v125                      // Bias address scaled by BPE
v_add_lshl_u32 v124, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v124, v8, v124, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,9,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v127, v4, s74
v_lshlrev_b32 v127, 0x2, v127                      // Bias address scaled by BPE
v_add_lshl_u32 v126, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v126, v8, v126, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,9,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v129, v4, s74
v_lshlrev_b32 v129, 0x2, v129                      // Bias address scaled by BPE
v_add_lshl_u32 v128, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v128, v8, v128, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,9,1,0) */
s_mov_b32 s74, 128                                 // coordOffset0 d0=1 vc0=0
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v131, v4, s74
v_lshlrev_b32 v131, 0x2, v131                      // Bias address scaled by BPE
v_add_lshl_u32 v130, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v130, v8, v130, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,9,1,1) */
s_mov_b32 s74, 129                                 // coordOffset0 d0=1 vc0=1
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v133, v4, s74
v_lshlrev_b32 v133, 0x2, v133                      // Bias address scaled by BPE
v_add_lshl_u32 v132, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v132, v8, v132, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,9,1,2) */
s_mov_b32 s74, 130                                 // coordOffset0 d0=1 vc0=2
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v135, v4, s74
v_lshlrev_b32 v135, 0x2, v135                      // Bias address scaled by BPE
v_add_lshl_u32 v134, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v134, v8, v134, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,9,1,3) */
s_mov_b32 s74, 131                                 // coordOffset0 d0=1 vc0=3
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v137, v4, s74
v_lshlrev_b32 v137, 0x2, v137                      // Bias address scaled by BPE
v_add_lshl_u32 v136, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v136, v8, v136, s[78:79]             // LDD clip if OOB. offset
/*
 * Address setup for output row vc1=10: 8 elements (d0 in {0,1} x vc0 in 0..3).
 * Row advance (once, before the first element):
 *   v1 += 1 (coord1), v2 += StrideC1J, v3 += StrideD1J.
 * Per element:
 *   coord0   = v0 for (d0=0,vc0=0), otherwise v4 = v0 + {1,2,3,128,129,130,131}
 *   s[78:79] = (coord0 < SizeI) && (v1 < SizeJ)    (per-lane in-bounds mask)
 *   biasAddr = (coord0 - 256*WorkGroup0) << 2
 *   outAddr  = s[78:79] ? (v3 + coord0) << 2 : v8
 * Destination registers as (outAddr, biasAddr) pairs:
 *   d0=0: (v138,v139) (v140,v141) (v142,v143) (v144,v145)
 *   d0=1: (v149,v150) (v151,v152) (v153,v154) (v155,v156)
 * The numbering skips v146..v148 between the two groups.
 * NOTE(review): presumably v146..v148 are allocated to something else by the
 * generator - confirm against the register map.
 * No LDS reads are issued in this row; biasAddr is computed only.
 * NOTE(review): v8 is the value kept for out-of-bounds lanes; presumably an
 * out-of-range offset so the later access is clipped - confirm where v8 is set.
 * Scratch: s74 is reused (offset constant, low half of the s[74:75] compare
 * mask, then 256*WorkGroup0); vcc is overwritten by every v_add_co_u32.
 */
/* (d1,vc1,d0,vc0)=(0,10,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[74:75], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v139, v0, s74
v_lshlrev_b32 v139, 0x2, v139                      // Bias address scaled by BPE
v_add_lshl_u32 v138, v3, v0, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v138, v8, v138, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,10,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v141, v4, s74
v_lshlrev_b32 v141, 0x2, v141                      // Bias address scaled by BPE
v_add_lshl_u32 v140, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v140, v8, v140, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,10,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v143, v4, s74
v_lshlrev_b32 v143, 0x2, v143                      // Bias address scaled by BPE
v_add_lshl_u32 v142, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v142, v8, v142, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,10,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v145, v4, s74
v_lshlrev_b32 v145, 0x2, v145                      // Bias address scaled by BPE
v_add_lshl_u32 v144, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v144, v8, v144, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,10,1,0) */
s_mov_b32 s74, 128                                 // coordOffset0 d0=1 vc0=0
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v150, v4, s74
v_lshlrev_b32 v150, 0x2, v150                      // Bias address scaled by BPE
v_add_lshl_u32 v149, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v149, v8, v149, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,10,1,1) */
s_mov_b32 s74, 129                                 // coordOffset0 d0=1 vc0=1
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v152, v4, s74
v_lshlrev_b32 v152, 0x2, v152                      // Bias address scaled by BPE
v_add_lshl_u32 v151, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v151, v8, v151, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,10,1,2) */
s_mov_b32 s74, 130                                 // coordOffset0 d0=1 vc0=2
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v154, v4, s74
v_lshlrev_b32 v154, 0x2, v154                      // Bias address scaled by BPE
v_add_lshl_u32 v153, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v153, v8, v153, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,10,1,3) */
s_mov_b32 s74, 131                                 // coordOffset0 d0=1 vc0=3
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v156, v4, s74
v_lshlrev_b32 v156, 0x2, v156                      // Bias address scaled by BPE
v_add_lshl_u32 v155, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v155, v8, v155, s[78:79]             // LDD clip if OOB. offset
/*
 * Address setup for output row vc1=11: the six elements handled in this span
 * are d0=0 with vc0=0..3 and d0=1 with vc0=0..1.
 * Row advance (once, before the first element):
 *   v1 += 1 (coord1), v2 += StrideC1J, v3 += StrideD1J.
 * Per element:
 *   coord0   = v0 for (d0=0,vc0=0), otherwise v4 = v0 + {1,2,3,128,129}
 *   s[78:79] = (coord0 < SizeI) && (v1 < SizeJ)    (per-lane in-bounds mask)
 *   biasAddr = (coord0 - 256*WorkGroup0) << 2      (v158, v160, ..., v168)
 *   outAddr  = s[78:79] ? (v3 + coord0) << 2 : v8  (v157, v159, ..., v167)
 * No LDS reads are issued in this row; biasAddr is computed only.
 * NOTE(review): v8 is the value kept for out-of-bounds lanes; presumably an
 * out-of-range offset so the later access is clipped - confirm where v8 is set.
 * Scratch: s74 is reused (offset constant, low half of the s[74:75] compare
 * mask, then 256*WorkGroup0); vcc is overwritten by every v_add_co_u32.
 */
/* (d1,vc1,d0,vc0)=(0,11,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[74:75], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v158, v0, s74
v_lshlrev_b32 v158, 0x2, v158                      // Bias address scaled by BPE
v_add_lshl_u32 v157, v3, v0, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v157, v8, v157, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,11,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v160, v4, s74
v_lshlrev_b32 v160, 0x2, v160                      // Bias address scaled by BPE
v_add_lshl_u32 v159, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v159, v8, v159, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,11,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v162, v4, s74
v_lshlrev_b32 v162, 0x2, v162                      // Bias address scaled by BPE
v_add_lshl_u32 v161, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v161, v8, v161, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,11,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v164, v4, s74
v_lshlrev_b32 v164, 0x2, v164                      // Bias address scaled by BPE
v_add_lshl_u32 v163, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v163, v8, v163, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,11,1,0) */
s_mov_b32 s74, 128                                 // coordOffset0 d0=1 vc0=0
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v166, v4, s74
v_lshlrev_b32 v166, 0x2, v166                      // Bias address scaled by BPE
v_add_lshl_u32 v165, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v165, v8, v165, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,11,1,1) */
s_mov_b32 s74, 129                                 // coordOffset0 d0=1 vc0=1
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v168, v4, s74
v_lshlrev_b32 v168, 0x2, v168                      // Bias address scaled by BPE
v_add_lshl_u32 v167, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v167, v8, v167, s[78:79]             // LDD clip if OOB. offset
/*
 * Copy 47 accumulator registers into the consecutive VGPRs
 * v[vgprValuC+9] .. v[vgprValuC+55], one v_accvgpr_read_b32 per value.
 * Source order is stride-4 through the accumulator file:
 *   acc61, acc65, ..., acc125   -> vgprValuC+9  .. +25  (17 values)
 *   acc2,  acc6,  ..., acc118   -> vgprValuC+26 .. +55  (30 values)
 * NOTE(review): the "vreg[N]" numbers in the trailing comments (47..93) look
 * like a running element index rather than a VGPR number - confirm.
 */
v_accvgpr_read_b32 v[vgprValuC+9], acc61           // copy acc to vreg[47]
v_accvgpr_read_b32 v[vgprValuC+10], acc65          // copy acc to vreg[48]
v_accvgpr_read_b32 v[vgprValuC+11], acc69          // copy acc to vreg[49]
v_accvgpr_read_b32 v[vgprValuC+12], acc73          // copy acc to vreg[50]
v_accvgpr_read_b32 v[vgprValuC+13], acc77          // copy acc to vreg[51]
v_accvgpr_read_b32 v[vgprValuC+14], acc81          // copy acc to vreg[52]
v_accvgpr_read_b32 v[vgprValuC+15], acc85          // copy acc to vreg[53]
v_accvgpr_read_b32 v[vgprValuC+16], acc89          // copy acc to vreg[54]
v_accvgpr_read_b32 v[vgprValuC+17], acc93          // copy acc to vreg[55]
v_accvgpr_read_b32 v[vgprValuC+18], acc97          // copy acc to vreg[56]
v_accvgpr_read_b32 v[vgprValuC+19], acc101         // copy acc to vreg[57]
v_accvgpr_read_b32 v[vgprValuC+20], acc105         // copy acc to vreg[58]
v_accvgpr_read_b32 v[vgprValuC+21], acc109         // copy acc to vreg[59]
v_accvgpr_read_b32 v[vgprValuC+22], acc113         // copy acc to vreg[60]
v_accvgpr_read_b32 v[vgprValuC+23], acc117         // copy acc to vreg[61]
v_accvgpr_read_b32 v[vgprValuC+24], acc121         // copy acc to vreg[62]
v_accvgpr_read_b32 v[vgprValuC+25], acc125         // copy acc to vreg[63]
v_accvgpr_read_b32 v[vgprValuC+26], acc2           // copy acc to vreg[64]
v_accvgpr_read_b32 v[vgprValuC+27], acc6           // copy acc to vreg[65]
v_accvgpr_read_b32 v[vgprValuC+28], acc10          // copy acc to vreg[66]
v_accvgpr_read_b32 v[vgprValuC+29], acc14          // copy acc to vreg[67]
v_accvgpr_read_b32 v[vgprValuC+30], acc18          // copy acc to vreg[68]
v_accvgpr_read_b32 v[vgprValuC+31], acc22          // copy acc to vreg[69]
v_accvgpr_read_b32 v[vgprValuC+32], acc26          // copy acc to vreg[70]
v_accvgpr_read_b32 v[vgprValuC+33], acc30          // copy acc to vreg[71]
v_accvgpr_read_b32 v[vgprValuC+34], acc34          // copy acc to vreg[72]
v_accvgpr_read_b32 v[vgprValuC+35], acc38          // copy acc to vreg[73]
v_accvgpr_read_b32 v[vgprValuC+36], acc42          // copy acc to vreg[74]
v_accvgpr_read_b32 v[vgprValuC+37], acc46          // copy acc to vreg[75]
v_accvgpr_read_b32 v[vgprValuC+38], acc50          // copy acc to vreg[76]
v_accvgpr_read_b32 v[vgprValuC+39], acc54          // copy acc to vreg[77]
v_accvgpr_read_b32 v[vgprValuC+40], acc58          // copy acc to vreg[78]
v_accvgpr_read_b32 v[vgprValuC+41], acc62          // copy acc to vreg[79]
v_accvgpr_read_b32 v[vgprValuC+42], acc66          // copy acc to vreg[80]
v_accvgpr_read_b32 v[vgprValuC+43], acc70          // copy acc to vreg[81]
v_accvgpr_read_b32 v[vgprValuC+44], acc74          // copy acc to vreg[82]
v_accvgpr_read_b32 v[vgprValuC+45], acc78          // copy acc to vreg[83]
v_accvgpr_read_b32 v[vgprValuC+46], acc82          // copy acc to vreg[84]
v_accvgpr_read_b32 v[vgprValuC+47], acc86          // copy acc to vreg[85]
v_accvgpr_read_b32 v[vgprValuC+48], acc90          // copy acc to vreg[86]
v_accvgpr_read_b32 v[vgprValuC+49], acc94          // copy acc to vreg[87]
v_accvgpr_read_b32 v[vgprValuC+50], acc98          // copy acc to vreg[88]
v_accvgpr_read_b32 v[vgprValuC+51], acc102         // copy acc to vreg[89]
v_accvgpr_read_b32 v[vgprValuC+52], acc106         // copy acc to vreg[90]
v_accvgpr_read_b32 v[vgprValuC+53], acc110         // copy acc to vreg[91]
v_accvgpr_read_b32 v[vgprValuC+54], acc114         // copy acc to vreg[92]
v_accvgpr_read_b32 v[vgprValuC+55], acc118         // copy acc to vreg[93]

/* rC *= alpha batchElements=[(0, 1, 5, 3), (0, 0, 6, 0), (0, 0, 6, 1), (0, 0, 6, 2), (0, 0, 6, 3), (0, 1, 6, 0), (0, 1, 6, 1), (0, 1, 6, 2), (0, 1, 6, 3), (0, 0, 7, 0), (0, 0, 7, 1), (0, 0, 7, 2), (0, 0, 7, 3), (0, 1, 7, 0), (0, 1, 7, 1), (0, 1, 7, 2), (0, 1, 7, 3), (0, 0, 8, 0), (0, 0, 8, 1), (0, 0, 8, 2), (0, 0, 8, 3), (0, 1, 8, 0), (0, 1, 8, 1), (0, 1, 8, 2), (0, 1, 8, 3), (0, 0, 9, 0), (0, 0, 9, 1), (0, 0, 9, 2), (0, 0, 9, 3), (0, 1, 9, 0), (0, 1, 9, 1), (0, 1, 9, 2), (0, 1, 9, 3), (0, 0, 10, 0), (0, 0, 10, 1), (0, 0, 10, 2), (0, 0, 10, 3), (0, 1, 10, 0), (0, 1, 10, 1), (0, 1, 10, 2), (0, 1, 10, 3), (0, 0, 11, 0), (0, 0, 11, 1), (0, 0, 11, 2), (0, 0, 11, 3), (0, 1, 11, 0), (0, 1, 11, 1)] */
/* Multiplies the 47 values v[vgprValuC+9] .. v[vgprValuC+55] of this batch by alpha, in place. */
/* The first value (+9) is scaled on its own; the remaining 46 are scaled as 23 register pairs */
/* starting at +10, +12, ..., +54 with the packed-FP32 multiply. */
/* op_sel_hi:[0,1,1] clears the high-lane select for src0, so the upper lane also reads the LOW dword */
/* of the SGPR pair: both lanes are multiplied by s[sgprAlpha]; s[sgprAlpha+1] is not used as a factor. */
v_mul_f32 v[vgprValuC+9], s[sgprAlpha], v[vgprValuC+9] // *= alpha (single, unpaired leading element)
v_pk_mul_f32 v[vgprValuC+10:vgprValuC+10+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+10:vgprValuC+10+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+12:vgprValuC+12+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+12:vgprValuC+12+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+14:vgprValuC+14+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+14:vgprValuC+14+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+16:vgprValuC+16+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+16:vgprValuC+16+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+18:vgprValuC+18+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+18:vgprValuC+18+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+20:vgprValuC+20+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+20:vgprValuC+20+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+22:vgprValuC+22+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+22:vgprValuC+22+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] op_sel_hi:[0,1,1] // *= alpha (pk)
/* Drain the LGKM counter so the Bias / ScaleAlphaVec values read from LDS by earlier ds_read_b32 */
/* instructions are valid before the write section below consumes them. */
s_waitcnt lgkmcnt(0)                               // wait for Bias LDS, ScaleAlphaVec

/* apply mask, calc new C and issue writes */
/* One five-instruction group per output element N (N = 9 .. 55, 47 elements): */
/*   1. v[vgprValuC+N] *= per-column scale value (ScaleAlphaVec)              */
/*   2. v4 = bias + v[vgprValuC+N]                                            */
/*   3. s_swappc_b64: jump to the address held in s[8:9], saving the return   */
/*      PC in s[64:65]. The callee is not visible in this section; v4 is the  */
/*      only value handed over and copied back, so it presumably transforms   */
/*      v4 in place (activation?) -- TODO confirm.                            */
/*   4. vN = v4                                                               */
/*   5. buffer_store_dword vN to D at the per-element byte offset VGPR.       */
/* The (scale, bias) register pairs cycle with period 8:                      */
/*   (v57,v56) (v61,v60) (v65,v64) (v69,v68) (v73,v72) (v77,v76) (v81,v80) (v85,v84). */
/* No masking instruction is issued here; the offset VGPRs are used as-is, so */
/* any out-of-bounds clipping must already have been applied to them.         */
/* NOTE(review): steps 1-2 address the value as v[vgprValuC+N] while steps    */
/* 4-5 use the literal register vN; these only agree if vgprValuC == 0 --     */
/* confirm against the symbol definition.                                     */
v_mul_f32 v[vgprValuC+9], v57, v[vgprValuC+9]      // *= ScaleAlphaVecVMul
v_add_f32 v4, v56, v[vgprValuC+9]                  // C += bias
s_swappc_b64 s[64:65], s[8:9]                      // call routine at s[8:9]; return address saved in s[64:65]
v_mov_b32 v9, v4                                   // take the (possibly modified) value back from v4
buffer_store_dword v9, v58, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+10], v61, v[vgprValuC+10]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v60, v[vgprValuC+10]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v10, v4
buffer_store_dword v10, v62, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+11], v65, v[vgprValuC+11]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v64, v[vgprValuC+11]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v11, v4
buffer_store_dword v11, v66, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+12], v69, v[vgprValuC+12]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v68, v[vgprValuC+12]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v12, v4
buffer_store_dword v12, v70, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+13], v73, v[vgprValuC+13]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v72, v[vgprValuC+13]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v13, v4
buffer_store_dword v13, v74, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+14], v77, v[vgprValuC+14]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v76, v[vgprValuC+14]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v14, v4
buffer_store_dword v14, v78, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+15], v81, v[vgprValuC+15]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v80, v[vgprValuC+15]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v15, v4
buffer_store_dword v15, v82, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+16], v85, v[vgprValuC+16]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v84, v[vgprValuC+16]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v16, v4
buffer_store_dword v16, v86, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+17], v57, v[vgprValuC+17]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v56, v[vgprValuC+17]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v17, v4
buffer_store_dword v17, v88, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+18], v61, v[vgprValuC+18]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v60, v[vgprValuC+18]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v18, v4
buffer_store_dword v18, v90, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+19], v65, v[vgprValuC+19]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v64, v[vgprValuC+19]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v19, v4
buffer_store_dword v19, v92, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+20], v69, v[vgprValuC+20]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v68, v[vgprValuC+20]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v20, v4
buffer_store_dword v20, v94, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+21], v73, v[vgprValuC+21]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v72, v[vgprValuC+21]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v21, v4
buffer_store_dword v21, v96, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+22], v77, v[vgprValuC+22]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v76, v[vgprValuC+22]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v22, v4
buffer_store_dword v22, v98, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+23], v81, v[vgprValuC+23]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v80, v[vgprValuC+23]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v23, v4
buffer_store_dword v23, v100, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+24], v85, v[vgprValuC+24]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v84, v[vgprValuC+24]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v24, v4
buffer_store_dword v24, v102, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+25], v57, v[vgprValuC+25]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v56, v[vgprValuC+25]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v25, v4
buffer_store_dword v25, v104, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+26], v61, v[vgprValuC+26]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v60, v[vgprValuC+26]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v26, v4
buffer_store_dword v26, v106, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+27], v65, v[vgprValuC+27]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v64, v[vgprValuC+27]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v27, v4
buffer_store_dword v27, v108, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+28], v69, v[vgprValuC+28]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v68, v[vgprValuC+28]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v28, v4
buffer_store_dword v28, v110, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+29], v73, v[vgprValuC+29]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v72, v[vgprValuC+29]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v29, v4
buffer_store_dword v29, v112, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+30], v77, v[vgprValuC+30]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v76, v[vgprValuC+30]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v30, v4
buffer_store_dword v30, v114, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+31], v81, v[vgprValuC+31]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v80, v[vgprValuC+31]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v31, v4
buffer_store_dword v31, v116, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+32], v85, v[vgprValuC+32]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v84, v[vgprValuC+32]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v32, v4
buffer_store_dword v32, v118, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+33], v57, v[vgprValuC+33]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v56, v[vgprValuC+33]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v33, v4
buffer_store_dword v33, v120, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+34], v61, v[vgprValuC+34]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v60, v[vgprValuC+34]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v34, v4
buffer_store_dword v34, v122, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+35], v65, v[vgprValuC+35]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v64, v[vgprValuC+35]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v35, v4
buffer_store_dword v35, v124, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+36], v69, v[vgprValuC+36]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v68, v[vgprValuC+36]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v36, v4
buffer_store_dword v36, v126, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+37], v73, v[vgprValuC+37]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v72, v[vgprValuC+37]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v37, v4
buffer_store_dword v37, v128, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+38], v77, v[vgprValuC+38]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v76, v[vgprValuC+38]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v38, v4
buffer_store_dword v38, v130, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+39], v81, v[vgprValuC+39]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v80, v[vgprValuC+39]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v39, v4
buffer_store_dword v39, v132, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+40], v85, v[vgprValuC+40]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v84, v[vgprValuC+40]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v40, v4
buffer_store_dword v40, v134, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+41], v57, v[vgprValuC+41]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v56, v[vgprValuC+41]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v41, v4
buffer_store_dword v41, v136, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+42], v61, v[vgprValuC+42]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v60, v[vgprValuC+42]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v42, v4
buffer_store_dword v42, v138, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+43], v65, v[vgprValuC+43]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v64, v[vgprValuC+43]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v43, v4
buffer_store_dword v43, v140, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+44], v69, v[vgprValuC+44]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v68, v[vgprValuC+44]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v44, v4
buffer_store_dword v44, v142, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+45], v73, v[vgprValuC+45]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v72, v[vgprValuC+45]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v45, v4
buffer_store_dword v45, v144, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+46], v77, v[vgprValuC+46]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v76, v[vgprValuC+46]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v46, v4
buffer_store_dword v46, v149, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+47], v81, v[vgprValuC+47]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v80, v[vgprValuC+47]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v47, v4
buffer_store_dword v47, v151, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+48], v85, v[vgprValuC+48]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v84, v[vgprValuC+48]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v48, v4
buffer_store_dword v48, v153, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+49], v57, v[vgprValuC+49]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v56, v[vgprValuC+49]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v49, v4
buffer_store_dword v49, v155, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+50], v61, v[vgprValuC+50]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v60, v[vgprValuC+50]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v50, v4
buffer_store_dword v50, v157, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+51], v65, v[vgprValuC+51]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v64, v[vgprValuC+51]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v51, v4
buffer_store_dword v51, v159, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+52], v69, v[vgprValuC+52]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v68, v[vgprValuC+52]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v52, v4
buffer_store_dword v52, v161, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+53], v73, v[vgprValuC+53]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v72, v[vgprValuC+53]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v53, v4
buffer_store_dword v53, v163, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+54], v77, v[vgprValuC+54]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v76, v[vgprValuC+54]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v54, v4
buffer_store_dword v54, v165, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+55], v81, v[vgprValuC+55]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v80, v[vgprValuC+55]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v55, v4
buffer_store_dword v55, v167, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
/* NOTE(review): the generator's wording below mentions a dwordx4 store, but every store in this section is buffer_store_dword. */
s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */

/******************************************/
/* Global Write Edge Batch #2 (d1,d0,vc1,vc0) = */
/*    (0,1,11,2:vw1); (0,1,11,3:vw1); (0,0,12,0:vw1); (0,0,12,1:vw1); (0,0,12,2:vw1); (0,0,12,3:vw1); (0,1,12,0:vw1); (0,1,12,1:vw1); (0,1,12,2:vw1); (0,1,12,3:vw1); (0,0,13,0:vw1); (0,0,13,1:vw1); (0,0,13,2:vw1); (0,0,13,3:vw1); (0,1,13,0:vw1); (0,1,13,1:vw1); (0,1,13,2:vw1); (0,1,13,3:vw1); (0,0,14,0:vw1); (0,0,14,1:vw1); (0,0,14,2:vw1); (0,0,14,3:vw1); (0,1,14,0:vw1); (0,1,14,1:vw1); (0,1,14,2:vw1); (0,1,14,3:vw1); (0,0,15,0:vw1); (0,0,15,1:vw1); (0,0,15,2:vw1); (0,0,15,3:vw1); (0,1,15,0:vw1); (0,1,15,1:vw1); (0,1,15,2:vw1); (0,1,15,3:vw1); (1,0,0,0:vw1); (1,0,0,1:vw1); (1,0,0,2:vw1); (1,0,0,3:vw1); (1,1,0,0:vw1); (1,1,0,1:vw1); (1,1,0,2:vw1); (1,1,0,3:vw1); (1,0,1,0:vw1); (1,0,1,1:vw1); (1,0,1,2:vw1); (1,0,1,3:vw1); (1,1,1,0:vw1) */
/******************************************/

/* calc coords, apply mask, and issue loads (if necessary) */
/* Per-element recipe used throughout this batch:                              */
/*   v4        = v0 + coord0 offset (element column)                           */
/*   s[78:79]  = (v4 < SizeI) && (v1 < SizeJ), one bit per lane                */
/*   vBiasAddr = (v4 - 256*WorkGroup0) << 2, LDS byte address for bias; the    */
/*               scale value is read from the same address + 1024              */
/*   vOffset   = (v3 + v4) << 2, replaced by v8 (BufferOOB) on lanes whose     */
/*               mask bit is clear                                             */
/* s74 is written three times per element: first as the coord0 offset, then as */
/* the low half of the v_cmp result s[74:75], then as wgp0*MT0 -- which is why */
/* the product is recomputed for every element instead of being kept.          */
/* NOTE(review): the generator comment on v_add_lshl_u32 says "Cin addr", but  */
/* v3 is the row pointer the generator elsewhere calls coutRowPtrD, so the     */
/* result is presumably the D store offset -- confirm.                         */
v_mov_b32 v8, BufferOOB                            // v8 = offset substituted for out-of-bounds lanes
/* (d1,vc1,d0,vc0)=(0,11,1,2) */
s_mov_b32 s74, 130                                 // coordOffset0 d0=1 vc0=2
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0 (overwrites s74)
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v59, v4, s74                             // coord0 minus wgp0*MT0
v_lshlrev_b32 v59, 0x2, v59                        // Bias address scaled by BPE
ds_read_b32 v56, v59 offset:0                      // load Bias
ds_read_b32 v57, v59 offset:1024                   // load scaleAlpha
v_add_lshl_u32 v58, v3, v4, 0x2                    // scaleToBpe: (row pointer v3 + coord0) << 2
v_cndmask_b32 v58, v8, v58, s[78:79]               // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,11,1,3) */
s_mov_b32 s74, 131                                 // coordOffset0 d0=1 vc0=3
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v63, v4, s74
v_lshlrev_b32 v63, 0x2, v63                        // Bias address scaled by BPE
ds_read_b32 v60, v63 offset:0                      // load Bias
ds_read_b32 v61, v63 offset:1024                   // load scaleAlpha
v_add_lshl_u32 v62, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v62, v8, v62, s[78:79]               // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,12,0,0) */
/* Row step: advance coord1 (v1) by one and move both row pointers (v2, v3) by their strides, */
/* then build mask / LDS address / store offset for the eight columns of this row             */
/* (coord0 offsets 0,1,2,3 and 128,129,130,131 relative to v0).                               */
/* Only the first six columns issue ds_read_b32 (into v64..v85); the last two columns         */
/* (offsets 130, 131) compute an LDS address but issue no read here.                          */
/* NOTE(review): "Cin addr" on v_add_lshl_u32 is the generator's wording; v3 is the row       */
/* pointer annotated below as coutRowPtrD, so the result is presumably a D offset -- confirm. */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[74:75], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v67, v0, s74                             // coord0 minus wgp0*MT0
v_lshlrev_b32 v67, 0x2, v67                        // Bias address scaled by BPE
ds_read_b32 v64, v67 offset:0                      // load Bias
ds_read_b32 v65, v67 offset:1024                   // load scaleAlpha
v_add_lshl_u32 v66, v3, v0, 0x2                    // scaleToBpe: (row pointer v3 + coord0) << 2
v_cndmask_b32 v66, v8, v66, s[78:79]               // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,12,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v71, v4, s74
v_lshlrev_b32 v71, 0x2, v71                        // Bias address scaled by BPE
ds_read_b32 v68, v71 offset:0                      // load Bias
ds_read_b32 v69, v71 offset:1024                   // load scaleAlpha
v_add_lshl_u32 v70, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v70, v8, v70, s[78:79]               // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,12,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v75, v4, s74
v_lshlrev_b32 v75, 0x2, v75                        // Bias address scaled by BPE
ds_read_b32 v72, v75 offset:0                      // load Bias
ds_read_b32 v73, v75 offset:1024                   // load scaleAlpha
v_add_lshl_u32 v74, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v74, v8, v74, s[78:79]               // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,12,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v79, v4, s74
v_lshlrev_b32 v79, 0x2, v79                        // Bias address scaled by BPE
ds_read_b32 v76, v79 offset:0                      // load Bias
ds_read_b32 v77, v79 offset:1024                   // load scaleAlpha
v_add_lshl_u32 v78, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v78, v8, v78, s[78:79]               // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,12,1,0) */
s_mov_b32 s74, 128                                 // coordOffset0 d0=1 vc0=0
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v83, v4, s74
v_lshlrev_b32 v83, 0x2, v83                        // Bias address scaled by BPE
ds_read_b32 v80, v83 offset:0                      // load Bias
ds_read_b32 v81, v83 offset:1024                   // load scaleAlpha
v_add_lshl_u32 v82, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v82, v8, v82, s[78:79]               // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,12,1,1) */
s_mov_b32 s74, 129                                 // coordOffset0 d0=1 vc0=1
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v87, v4, s74
v_lshlrev_b32 v87, 0x2, v87                        // Bias address scaled by BPE
ds_read_b32 v84, v87 offset:0                      // load Bias
ds_read_b32 v85, v87 offset:1024                   // load scaleAlpha
v_add_lshl_u32 v86, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v86, v8, v86, s[78:79]               // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,12,1,2) */
/* NOTE(review): v89 (and v91 in the next element) receive an LDS address but no ds_read uses them within this row. */
s_mov_b32 s74, 130                                 // coordOffset0 d0=1 vc0=2
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v89, v4, s74
v_lshlrev_b32 v89, 0x2, v89                        // Bias address scaled by BPE
v_add_lshl_u32 v88, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v88, v8, v88, s[78:79]               // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,12,1,3) */
s_mov_b32 s74, 131                                 // coordOffset0 d0=1 vc0=3
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v91, v4, s74
v_lshlrev_b32 v91, 0x2, v91                        // Bias address scaled by BPE
v_add_lshl_u32 v90, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v90, v8, v90, s[78:79]               // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,13,0,0) */
/* Row step: advance coord1 (v1) by one and move both row pointers (v2, v3) by their strides, */
/* then build the in-bounds mask and the clipped store offset for the eight columns of this   */
/* row (coord0 offsets 0,1,2,3 and 128,129,130,131 relative to v0).                           */
/* Results: store offsets in v92, v94, ..., v106 (lanes outside SizeI x SizeJ get v8).        */
/* No ds_read_b32 is issued in this row. The odd registers v93, v95, ..., v107 still receive  */
/* the LDS bias address for each column, but nothing in this row reads them.                  */
/* NOTE(review): those address computations look unused from here -- confirm before relying   */
/* on v93..v107 being free.                                                                   */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[74:75], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v93, v0, s74                             // coord0 minus wgp0*MT0
v_lshlrev_b32 v93, 0x2, v93                        // Bias address scaled by BPE
v_add_lshl_u32 v92, v3, v0, 0x2                    // scaleToBpe: (row pointer v3 + coord0) << 2
v_cndmask_b32 v92, v8, v92, s[78:79]               // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,13,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v95, v4, s74
v_lshlrev_b32 v95, 0x2, v95                        // Bias address scaled by BPE
v_add_lshl_u32 v94, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v94, v8, v94, s[78:79]               // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,13,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v97, v4, s74
v_lshlrev_b32 v97, 0x2, v97                        // Bias address scaled by BPE
v_add_lshl_u32 v96, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v96, v8, v96, s[78:79]               // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,13,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v99, v4, s74
v_lshlrev_b32 v99, 0x2, v99                        // Bias address scaled by BPE
v_add_lshl_u32 v98, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v98, v8, v98, s[78:79]               // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,13,1,0) */
s_mov_b32 s74, 128                                 // coordOffset0 d0=1 vc0=0
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v101, v4, s74
v_lshlrev_b32 v101, 0x2, v101                      // Bias address scaled by BPE
v_add_lshl_u32 v100, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v100, v8, v100, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,13,1,1) */
s_mov_b32 s74, 129                                 // coordOffset0 d0=1 vc0=1
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v103, v4, s74
v_lshlrev_b32 v103, 0x2, v103                      // Bias address scaled by BPE
v_add_lshl_u32 v102, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v102, v8, v102, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,13,1,2) */
s_mov_b32 s74, 130                                 // coordOffset0 d0=1 vc0=2
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v105, v4, s74
v_lshlrev_b32 v105, 0x2, v105                      // Bias address scaled by BPE
v_add_lshl_u32 v104, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v104, v8, v104, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,13,1,3) */
s_mov_b32 s74, 131                                 // coordOffset0 d0=1 vc0=3
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v107, v4, s74
v_lshlrev_b32 v107, 0x2, v107                      // Bias address scaled by BPE
v_add_lshl_u32 v106, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v106, v8, v106, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,14,0,0) */
/* Row vc1=14: advance to the next row, then set up addresses for its eight    */
/* elements (coord0 = v0+0..3 and v0+128..131).                                */
/*   v1 += 1             : coord1 of the new row                               */
/*   v2 += StrideC1J     : C row pointer (v2 is not read again in this excerpt)*/
/*   v3 += StrideD1J     : D row pointer, the base of every even-numbered      */
/*                         offset register computed below                      */
/* Per element: even VGPR = (v3 + coord0) * 4 where in range, otherwise v8;    */
/*              odd  VGPR = (coord0 - WorkGroup0*256) * 4 (bias byte offset).  */
/* s74 is reused as immediate, mask low half and wgp0*MT0; keep the order.     */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[74:75], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v109, v0, s74                            // coord0 relative to tile start (coord0 - wgp0*MT0)
v_lshlrev_b32 v109, 0x2, v109                      // Bias address scaled by BPE
v_add_lshl_u32 v108, v3, v0, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v108, v8, v108, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,14,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v111, v4, s74                            // coord0 relative to tile start (coord0 - wgp0*MT0)
v_lshlrev_b32 v111, 0x2, v111                      // Bias address scaled by BPE
v_add_lshl_u32 v110, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v110, v8, v110, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,14,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v113, v4, s74                            // coord0 relative to tile start (coord0 - wgp0*MT0)
v_lshlrev_b32 v113, 0x2, v113                      // Bias address scaled by BPE
v_add_lshl_u32 v112, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v112, v8, v112, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,14,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v115, v4, s74                            // coord0 relative to tile start (coord0 - wgp0*MT0)
v_lshlrev_b32 v115, 0x2, v115                      // Bias address scaled by BPE
v_add_lshl_u32 v114, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v114, v8, v114, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,14,1,0) */
s_mov_b32 s74, 128                                 // coordOffset0 d0=1 vc0=0
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v117, v4, s74                            // coord0 relative to tile start (coord0 - wgp0*MT0)
v_lshlrev_b32 v117, 0x2, v117                      // Bias address scaled by BPE
v_add_lshl_u32 v116, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v116, v8, v116, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,14,1,1) */
s_mov_b32 s74, 129                                 // coordOffset0 d0=1 vc0=1
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v119, v4, s74                            // coord0 relative to tile start (coord0 - wgp0*MT0)
v_lshlrev_b32 v119, 0x2, v119                      // Bias address scaled by BPE
v_add_lshl_u32 v118, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v118, v8, v118, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,14,1,2) */
s_mov_b32 s74, 130                                 // coordOffset0 d0=1 vc0=2
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v121, v4, s74                            // coord0 relative to tile start (coord0 - wgp0*MT0)
v_lshlrev_b32 v121, 0x2, v121                      // Bias address scaled by BPE
v_add_lshl_u32 v120, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v120, v8, v120, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,14,1,3) */
s_mov_b32 s74, 131                                 // coordOffset0 d0=1 vc0=3
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v123, v4, s74                            // coord0 relative to tile start (coord0 - wgp0*MT0)
v_lshlrev_b32 v123, 0x2, v123                      // Bias address scaled by BPE
v_add_lshl_u32 v122, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v122, v8, v122, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,15,0,0) */
/* Row vc1=15: step coord1 (v1) and both row pointers (v2, v3) by one row,     */
/* then compute offsets for the row's eight elements.                          */
/*   even VGPRs v124..v138 : (v3 + coord0) * 4 where the lane is in range,     */
/*                           otherwise the value held in v8                    */
/*   odd  VGPRs v125..v139 : (coord0 - WorkGroup0*256) * 4, bias byte offset   */
/* The in-range mask is rebuilt for every element in s[78:79]; s[74:75] and    */
/* s74 are scratch and are overwritten several times per element.              */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[74:75], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v125, v0, s74                            // coord0 relative to tile start (coord0 - wgp0*MT0)
v_lshlrev_b32 v125, 0x2, v125                      // Bias address scaled by BPE
v_add_lshl_u32 v124, v3, v0, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v124, v8, v124, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,15,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v127, v4, s74                            // coord0 relative to tile start (coord0 - wgp0*MT0)
v_lshlrev_b32 v127, 0x2, v127                      // Bias address scaled by BPE
v_add_lshl_u32 v126, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v126, v8, v126, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,15,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v129, v4, s74                            // coord0 relative to tile start (coord0 - wgp0*MT0)
v_lshlrev_b32 v129, 0x2, v129                      // Bias address scaled by BPE
v_add_lshl_u32 v128, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v128, v8, v128, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,15,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v131, v4, s74                            // coord0 relative to tile start (coord0 - wgp0*MT0)
v_lshlrev_b32 v131, 0x2, v131                      // Bias address scaled by BPE
v_add_lshl_u32 v130, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v130, v8, v130, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,15,1,0) */
s_mov_b32 s74, 128                                 // coordOffset0 d0=1 vc0=0
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v133, v4, s74                            // coord0 relative to tile start (coord0 - wgp0*MT0)
v_lshlrev_b32 v133, 0x2, v133                      // Bias address scaled by BPE
v_add_lshl_u32 v132, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v132, v8, v132, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,15,1,1) */
s_mov_b32 s74, 129                                 // coordOffset0 d0=1 vc0=1
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v135, v4, s74                            // coord0 relative to tile start (coord0 - wgp0*MT0)
v_lshlrev_b32 v135, 0x2, v135                      // Bias address scaled by BPE
v_add_lshl_u32 v134, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v134, v8, v134, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,15,1,2) */
s_mov_b32 s74, 130                                 // coordOffset0 d0=1 vc0=2
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v137, v4, s74                            // coord0 relative to tile start (coord0 - wgp0*MT0)
v_lshlrev_b32 v137, 0x2, v137                      // Bias address scaled by BPE
v_add_lshl_u32 v136, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v136, v8, v136, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,15,1,3) */
s_mov_b32 s74, 131                                 // coordOffset0 d0=1 vc0=3
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v139, v4, s74                            // coord0 relative to tile start (coord0 - wgp0*MT0)
v_lshlrev_b32 v139, 0x2, v139                      // Bias address scaled by BPE
v_add_lshl_u32 v138, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v138, v8, v138, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,0,0,0) */
/* First row of the d1=1 group. Unlike the single-row steps used between       */
/* consecutive vc1 rows, this jump advances coord1 (v1) by 113 rows and both   */
/* row pointers by 113 * stride (StrideC1J for v2, StrideD1J for v3).          */
/* NOTE(review): the marker above says d1=1 while the generated "rowInc"       */
/* comment below says d1=0; the marker matches the 113-row jump, so the inline */
/* text looks stale. 113 is presumably 128 - 15, i.e. from the vc1=15 row of   */
/* d1=0 to the first row of d1=1 - TODO confirm against the generator.         */
/* NOTE(review): the offset register pairs run v140/v141, v142/v143,           */
/* v144/v145 and then continue at v149/v150; v146-v148 are skipped here for a  */
/* reason that is not visible in this excerpt.                                 */
s_mov_b32 s74, 113                                 // rowInc d1=0 vc1=0
v_add_co_u32 v1, vcc, v1, s74                      // coord1.2: coord1 += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
s_mul_i32 s74, s[sgprStrideC1J], 113               // scale stride
v_add_i32 v2, v2, s74                              // ROWINC- Move cinRowPtr to next row
s_mul_i32 s74, s[sgprStrideD1J], 113               // scale stride
v_add_i32 v3, v3, s74                              // Move coutRowPtrD to next row
v_cmp_lt_u32 s[74:75], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v141, v0, s74                            // coord0 relative to tile start (coord0 - wgp0*MT0)
v_lshlrev_b32 v141, 0x2, v141                      // Bias address scaled by BPE
v_add_lshl_u32 v140, v3, v0, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v140, v8, v140, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,0,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v143, v4, s74                            // coord0 relative to tile start (coord0 - wgp0*MT0)
v_lshlrev_b32 v143, 0x2, v143                      // Bias address scaled by BPE
v_add_lshl_u32 v142, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v142, v8, v142, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,0,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v145, v4, s74                            // coord0 relative to tile start (coord0 - wgp0*MT0)
v_lshlrev_b32 v145, 0x2, v145                      // Bias address scaled by BPE
v_add_lshl_u32 v144, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v144, v8, v144, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,0,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v150, v4, s74                            // coord0 relative to tile start (coord0 - wgp0*MT0)
v_lshlrev_b32 v150, 0x2, v150                      // Bias address scaled by BPE
v_add_lshl_u32 v149, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v149, v8, v149, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,0,1,0) */
s_mov_b32 s74, 128                                 // coordOffset0 d0=1 vc0=0
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v152, v4, s74                            // coord0 relative to tile start (coord0 - wgp0*MT0)
v_lshlrev_b32 v152, 0x2, v152                      // Bias address scaled by BPE
v_add_lshl_u32 v151, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v151, v8, v151, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,0,1,1) */
s_mov_b32 s74, 129                                 // coordOffset0 d0=1 vc0=1
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v154, v4, s74                            // coord0 relative to tile start (coord0 - wgp0*MT0)
v_lshlrev_b32 v154, 0x2, v154                      // Bias address scaled by BPE
v_add_lshl_u32 v153, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v153, v8, v153, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,0,1,2) */
s_mov_b32 s74, 130                                 // coordOffset0 d0=1 vc0=2
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v156, v4, s74                            // coord0 relative to tile start (coord0 - wgp0*MT0)
v_lshlrev_b32 v156, 0x2, v156                      // Bias address scaled by BPE
v_add_lshl_u32 v155, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v155, v8, v155, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,0,1,3) */
s_mov_b32 s74, 131                                 // coordOffset0 d0=1 vc0=3
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v158, v4, s74                            // coord0 relative to tile start (coord0 - wgp0*MT0)
v_lshlrev_b32 v158, 0x2, v158                      // Bias address scaled by BPE
v_add_lshl_u32 v157, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v157, v8, v157, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,1,0,0) */
/* Row d1=1, vc1=1: single-row step of v1/v2/v3, then offsets for five          */
/* elements (coord0 = v0+0..3 and v0+128). Address setup stops after the        */
/* (1,1,1,0) element; the accumulator copies follow immediately.                */
/*   even VGPRs v159..v167 : (v3 + coord0) * 4 where in range, otherwise v8     */
/*   odd  VGPRs v160..v168 : (coord0 - WorkGroup0*256) * 4, bias byte offset    */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[74:75], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v160, v0, s74                            // coord0 relative to tile start (coord0 - wgp0*MT0)
v_lshlrev_b32 v160, 0x2, v160                      // Bias address scaled by BPE
v_add_lshl_u32 v159, v3, v0, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v159, v8, v159, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,1,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v162, v4, s74                            // coord0 relative to tile start (coord0 - wgp0*MT0)
v_lshlrev_b32 v162, 0x2, v162                      // Bias address scaled by BPE
v_add_lshl_u32 v161, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v161, v8, v161, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,1,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v164, v4, s74                            // coord0 relative to tile start (coord0 - wgp0*MT0)
v_lshlrev_b32 v164, 0x2, v164                      // Bias address scaled by BPE
v_add_lshl_u32 v163, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v163, v8, v163, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,1,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v166, v4, s74                            // coord0 relative to tile start (coord0 - wgp0*MT0)
v_lshlrev_b32 v166, 0x2, v166                      // Bias address scaled by BPE
v_add_lshl_u32 v165, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v165, v8, v165, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,1,1,0) */
s_mov_b32 s74, 128                                 // coordOffset0 d0=1 vc0=0
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v168, v4, s74                            // coord0 relative to tile start (coord0 - wgp0*MT0)
v_lshlrev_b32 v168, 0x2, v168                      // Bias address scaled by BPE
v_add_lshl_u32 v167, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v167, v8, v167, s[78:79]             // LDD clip if OOB. offset
/* Copy 47 accumulator registers into the value VGPRs vgprValuC+9..+55.         */
/* Source pattern: acc122, acc126, then acc3..acc127 in steps of 4, then        */
/* acc128..acc176 in steps of 4.                                                */
/* NOTE(review): the "vreg[N]" numbers in the trailing comments (94..140) do    */
/* not match the destination indices (vgprValuC+9..+55); presumably N is a      */
/* running element index across store batches - TODO confirm.                   */
v_accvgpr_read_b32 v[vgprValuC+9], acc122          // copy acc to vreg[94]
v_accvgpr_read_b32 v[vgprValuC+10], acc126         // copy acc to vreg[95]
v_accvgpr_read_b32 v[vgprValuC+11], acc3           // copy acc to vreg[96]
v_accvgpr_read_b32 v[vgprValuC+12], acc7           // copy acc to vreg[97]
v_accvgpr_read_b32 v[vgprValuC+13], acc11          // copy acc to vreg[98]
v_accvgpr_read_b32 v[vgprValuC+14], acc15          // copy acc to vreg[99]
v_accvgpr_read_b32 v[vgprValuC+15], acc19          // copy acc to vreg[100]
v_accvgpr_read_b32 v[vgprValuC+16], acc23          // copy acc to vreg[101]
v_accvgpr_read_b32 v[vgprValuC+17], acc27          // copy acc to vreg[102]
v_accvgpr_read_b32 v[vgprValuC+18], acc31          // copy acc to vreg[103]
v_accvgpr_read_b32 v[vgprValuC+19], acc35          // copy acc to vreg[104]
v_accvgpr_read_b32 v[vgprValuC+20], acc39          // copy acc to vreg[105]
v_accvgpr_read_b32 v[vgprValuC+21], acc43          // copy acc to vreg[106]
v_accvgpr_read_b32 v[vgprValuC+22], acc47          // copy acc to vreg[107]
v_accvgpr_read_b32 v[vgprValuC+23], acc51          // copy acc to vreg[108]
v_accvgpr_read_b32 v[vgprValuC+24], acc55          // copy acc to vreg[109]
v_accvgpr_read_b32 v[vgprValuC+25], acc59          // copy acc to vreg[110]
v_accvgpr_read_b32 v[vgprValuC+26], acc63          // copy acc to vreg[111]
v_accvgpr_read_b32 v[vgprValuC+27], acc67          // copy acc to vreg[112]
v_accvgpr_read_b32 v[vgprValuC+28], acc71          // copy acc to vreg[113]
v_accvgpr_read_b32 v[vgprValuC+29], acc75          // copy acc to vreg[114]
v_accvgpr_read_b32 v[vgprValuC+30], acc79          // copy acc to vreg[115]
v_accvgpr_read_b32 v[vgprValuC+31], acc83          // copy acc to vreg[116]
v_accvgpr_read_b32 v[vgprValuC+32], acc87          // copy acc to vreg[117]
v_accvgpr_read_b32 v[vgprValuC+33], acc91          // copy acc to vreg[118]
v_accvgpr_read_b32 v[vgprValuC+34], acc95          // copy acc to vreg[119]
v_accvgpr_read_b32 v[vgprValuC+35], acc99          // copy acc to vreg[120]
v_accvgpr_read_b32 v[vgprValuC+36], acc103         // copy acc to vreg[121]
v_accvgpr_read_b32 v[vgprValuC+37], acc107         // copy acc to vreg[122]
v_accvgpr_read_b32 v[vgprValuC+38], acc111         // copy acc to vreg[123]
v_accvgpr_read_b32 v[vgprValuC+39], acc115         // copy acc to vreg[124]
v_accvgpr_read_b32 v[vgprValuC+40], acc119         // copy acc to vreg[125]
v_accvgpr_read_b32 v[vgprValuC+41], acc123         // copy acc to vreg[126]
v_accvgpr_read_b32 v[vgprValuC+42], acc127         // copy acc to vreg[127]
v_accvgpr_read_b32 v[vgprValuC+43], acc128         // copy acc to vreg[128]
v_accvgpr_read_b32 v[vgprValuC+44], acc132         // copy acc to vreg[129]
v_accvgpr_read_b32 v[vgprValuC+45], acc136         // copy acc to vreg[130]
v_accvgpr_read_b32 v[vgprValuC+46], acc140         // copy acc to vreg[131]
v_accvgpr_read_b32 v[vgprValuC+47], acc144         // copy acc to vreg[132]
v_accvgpr_read_b32 v[vgprValuC+48], acc148         // copy acc to vreg[133]
v_accvgpr_read_b32 v[vgprValuC+49], acc152         // copy acc to vreg[134]
v_accvgpr_read_b32 v[vgprValuC+50], acc156         // copy acc to vreg[135]
v_accvgpr_read_b32 v[vgprValuC+51], acc160         // copy acc to vreg[136]
v_accvgpr_read_b32 v[vgprValuC+52], acc164         // copy acc to vreg[137]
v_accvgpr_read_b32 v[vgprValuC+53], acc168         // copy acc to vreg[138]
v_accvgpr_read_b32 v[vgprValuC+54], acc172         // copy acc to vreg[139]
v_accvgpr_read_b32 v[vgprValuC+55], acc176         // copy acc to vreg[140]

/* rC *= alpha batchElements=[(0, 1, 11, 2), (0, 1, 11, 3), (0, 0, 12, 0), (0, 0, 12, 1), (0, 0, 12, 2), (0, 0, 12, 3), (0, 1, 12, 0), (0, 1, 12, 1), (0, 1, 12, 2), (0, 1, 12, 3), (0, 0, 13, 0), (0, 0, 13, 1), (0, 0, 13, 2), (0, 0, 13, 3), (0, 1, 13, 0), (0, 1, 13, 1), (0, 1, 13, 2), (0, 1, 13, 3), (0, 0, 14, 0), (0, 0, 14, 1), (0, 0, 14, 2), (0, 0, 14, 3), (0, 1, 14, 0), (0, 1, 14, 1), (0, 1, 14, 2), (0, 1, 14, 3), (0, 0, 15, 0), (0, 0, 15, 1), (0, 0, 15, 2), (0, 0, 15, 3), (0, 1, 15, 0), (0, 1, 15, 1), (0, 1, 15, 2), (0, 1, 15, 3), (1, 0, 0, 0), (1, 0, 0, 1), (1, 0, 0, 2), (1, 0, 0, 3), (1, 1, 0, 0), (1, 1, 0, 1), (1, 1, 0, 2), (1, 1, 0, 3), (1, 0, 1, 0), (1, 0, 1, 1), (1, 0, 1, 2), (1, 0, 1, 3), (1, 1, 1, 0)] */
/* Multiply the 47 values in vgprValuC+9..+55 by alpha: one scalar v_mul_f32    */
/* for +9, then 23 packed v_pk_mul_f32 on the pairs starting at +10, +12, ...,  */
/* +54.                                                                         */
/* NOTE(review): the tuples in the batchElements list above appear to be        */
/* ordered (d1,d0,vc1,vc0), unlike the (d1,vc1,d0,vc0) markers used by the      */
/* address-setup code; compare (0,1,11,2) and the ordering of the d1=1          */
/* entries - TODO confirm.                                                      */
/* NOTE(review): op_sel_hi:[0,1,1] should make the upper lane of the packed     */
/* multiply also read the low dword of the s[sgprAlpha:sgprAlpha+1] pair, i.e.  */
/* the same alpha for both lanes - verify against the ISA guide. The odd        */
/* element +9 is presumably peeled so that the packed pairs are even-aligned.   */
v_mul_f32 v[vgprValuC+9], s[sgprAlpha], v[vgprValuC+9] // *= alpha
v_pk_mul_f32 v[vgprValuC+10:vgprValuC+10+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+10:vgprValuC+10+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+12:vgprValuC+12+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+12:vgprValuC+12+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+14:vgprValuC+14+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+14:vgprValuC+14+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+16:vgprValuC+16+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+16:vgprValuC+16+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+18:vgprValuC+18+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+18:vgprValuC+18+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+20:vgprValuC+20+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+20:vgprValuC+20+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+22:vgprValuC+22+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+22:vgprValuC+22+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] op_sel_hi:[0,1,1] // *= alpha (pk)
s_waitcnt lgkmcnt(0)                               // wait for Bias LDS, ScaleAlphaVec

/* apply mask, calc new C and issue writes */
/* Elements +9..+16. Every element repeats the same five-instruction pattern: */
/*   v_mul_f32           scale the accumulator by its ScaleAlphaVec value */
/*   v_add_f32           add the bias value, leaving the sum in v4 */
/*   s_swappc_b64        jump to the address in s[8:9]; the return PC is saved in s[64:65] */
/*   v_mov_b32           copy v4 back into the element's own register */
/*   buffer_store_dword  write that register to D at the element's byte offset VGPR */
/* NOTE(review): the routine at s[8:9] is not visible here - presumably the activation */
/* function, taking and returning its value in v4; confirm. */
/* NOTE(review): v_mov/store name vN while mul/add name v[vgprValuC+N]; the two agree */
/* only if vgprValuC evaluates to 0 - confirm against the symbol definition. */
v_mul_f32 v[vgprValuC+9], v57, v[vgprValuC+9]      // *= ScaleAlphaVecVMul
v_add_f32 v4, v56, v[vgprValuC+9]                  // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v9, v4
buffer_store_dword v9, v58, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+10], v61, v[vgprValuC+10]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v60, v[vgprValuC+10]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v10, v4
buffer_store_dword v10, v62, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+11], v65, v[vgprValuC+11]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v64, v[vgprValuC+11]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v11, v4
buffer_store_dword v11, v66, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+12], v69, v[vgprValuC+12]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v68, v[vgprValuC+12]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v12, v4
buffer_store_dword v12, v70, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+13], v73, v[vgprValuC+13]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v72, v[vgprValuC+13]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v13, v4
buffer_store_dword v13, v74, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+14], v77, v[vgprValuC+14]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v76, v[vgprValuC+14]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v14, v4
buffer_store_dword v14, v78, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+15], v81, v[vgprValuC+15]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v80, v[vgprValuC+15]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v15, v4
buffer_store_dword v15, v82, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+16], v85, v[vgprValuC+16]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v84, v[vgprValuC+16]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v16, v4
buffer_store_dword v16, v86, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
/* Elements +17..+24: scale by ScaleAlphaVec, add bias into v4, jump through s[8:9] */
/* (return PC in s[64:65]), copy v4 back, then store the dword to D. */
/* The scale/bias registers are the same eight pairs (v57/v56 .. v85/v84) used for */
/* elements +9..+16; only the accumulator and the store-offset VGPR (v88..v102) advance. */
v_mul_f32 v[vgprValuC+17], v57, v[vgprValuC+17]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v56, v[vgprValuC+17]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v17, v4
buffer_store_dword v17, v88, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+18], v61, v[vgprValuC+18]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v60, v[vgprValuC+18]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v18, v4
buffer_store_dword v18, v90, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+19], v65, v[vgprValuC+19]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v64, v[vgprValuC+19]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v19, v4
buffer_store_dword v19, v92, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+20], v69, v[vgprValuC+20]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v68, v[vgprValuC+20]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v20, v4
buffer_store_dword v20, v94, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+21], v73, v[vgprValuC+21]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v72, v[vgprValuC+21]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v21, v4
buffer_store_dword v21, v96, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+22], v77, v[vgprValuC+22]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v76, v[vgprValuC+22]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v22, v4
buffer_store_dword v22, v98, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+23], v81, v[vgprValuC+23]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v80, v[vgprValuC+23]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v23, v4
buffer_store_dword v23, v100, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+24], v85, v[vgprValuC+24]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v84, v[vgprValuC+24]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v24, v4
buffer_store_dword v24, v102, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
/* Elements +25..+32: scale by ScaleAlphaVec, add bias into v4, jump through s[8:9] */
/* (return PC in s[64:65]), copy v4 back, then store the dword to D. */
/* Scale/bias come from the eight pairs v57/v56 .. v85/v84; store offsets are v104..v118. */
v_mul_f32 v[vgprValuC+25], v57, v[vgprValuC+25]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v56, v[vgprValuC+25]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v25, v4
buffer_store_dword v25, v104, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+26], v61, v[vgprValuC+26]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v60, v[vgprValuC+26]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v26, v4
buffer_store_dword v26, v106, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+27], v65, v[vgprValuC+27]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v64, v[vgprValuC+27]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v27, v4
buffer_store_dword v27, v108, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+28], v69, v[vgprValuC+28]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v68, v[vgprValuC+28]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v28, v4
buffer_store_dword v28, v110, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+29], v73, v[vgprValuC+29]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v72, v[vgprValuC+29]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v29, v4
buffer_store_dword v29, v112, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+30], v77, v[vgprValuC+30]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v76, v[vgprValuC+30]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v30, v4
buffer_store_dword v30, v114, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+31], v81, v[vgprValuC+31]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v80, v[vgprValuC+31]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v31, v4
buffer_store_dword v31, v116, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+32], v85, v[vgprValuC+32]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v84, v[vgprValuC+32]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v32, v4
buffer_store_dword v32, v118, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
/* Elements +33..+40: scale by ScaleAlphaVec, add bias into v4, jump through s[8:9] */
/* (return PC in s[64:65]), copy v4 back, then store the dword to D. */
/* Scale/bias come from the eight pairs v57/v56 .. v85/v84; store offsets are v120..v134. */
v_mul_f32 v[vgprValuC+33], v57, v[vgprValuC+33]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v56, v[vgprValuC+33]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v33, v4
buffer_store_dword v33, v120, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+34], v61, v[vgprValuC+34]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v60, v[vgprValuC+34]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v34, v4
buffer_store_dword v34, v122, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+35], v65, v[vgprValuC+35]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v64, v[vgprValuC+35]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v35, v4
buffer_store_dword v35, v124, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+36], v69, v[vgprValuC+36]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v68, v[vgprValuC+36]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v36, v4
buffer_store_dword v36, v126, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+37], v73, v[vgprValuC+37]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v72, v[vgprValuC+37]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v37, v4
buffer_store_dword v37, v128, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+38], v77, v[vgprValuC+38]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v76, v[vgprValuC+38]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v38, v4
buffer_store_dword v38, v130, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+39], v81, v[vgprValuC+39]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v80, v[vgprValuC+39]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v39, v4
buffer_store_dword v39, v132, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+40], v85, v[vgprValuC+40]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v84, v[vgprValuC+40]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v40, v4
buffer_store_dword v40, v134, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
/* Elements +41..+48: scale by ScaleAlphaVec, add bias into v4, jump through s[8:9] */
/* (return PC in s[64:65]), copy v4 back, then store the dword to D. */
/* Scale/bias come from the eight pairs v57/v56 .. v85/v84. */
/* Store offsets run v136..v144 and then continue at v149 (v146..v148 are not used as */
/* store offsets in this span). */
v_mul_f32 v[vgprValuC+41], v57, v[vgprValuC+41]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v56, v[vgprValuC+41]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v41, v4
buffer_store_dword v41, v136, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+42], v61, v[vgprValuC+42]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v60, v[vgprValuC+42]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v42, v4
buffer_store_dword v42, v138, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+43], v65, v[vgprValuC+43]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v64, v[vgprValuC+43]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v43, v4
buffer_store_dword v43, v140, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+44], v69, v[vgprValuC+44]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v68, v[vgprValuC+44]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v44, v4
buffer_store_dword v44, v142, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+45], v73, v[vgprValuC+45]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v72, v[vgprValuC+45]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v45, v4
buffer_store_dword v45, v144, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+46], v77, v[vgprValuC+46]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v76, v[vgprValuC+46]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v46, v4
buffer_store_dword v46, v149, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+47], v81, v[vgprValuC+47]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v80, v[vgprValuC+47]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v47, v4
buffer_store_dword v47, v151, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+48], v85, v[vgprValuC+48]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v84, v[vgprValuC+48]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v48, v4
buffer_store_dword v48, v153, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
/* Elements +49..+55: scale by ScaleAlphaVec, add bias into v4, jump through s[8:9] */
/* (return PC in s[64:65]), copy v4 back, then store the dword to D. */
/* Scale/bias come from the pairs v57/v56 .. v81/v80; store offsets are v155..v167. */
v_mul_f32 v[vgprValuC+49], v57, v[vgprValuC+49]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v56, v[vgprValuC+49]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v49, v4
buffer_store_dword v49, v155, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+50], v61, v[vgprValuC+50]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v60, v[vgprValuC+50]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v50, v4
buffer_store_dword v50, v157, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+51], v65, v[vgprValuC+51]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v64, v[vgprValuC+51]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v51, v4
buffer_store_dword v51, v159, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+52], v69, v[vgprValuC+52]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v68, v[vgprValuC+52]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v52, v4
buffer_store_dword v52, v161, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+53], v73, v[vgprValuC+53]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v72, v[vgprValuC+53]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v53, v4
buffer_store_dword v53, v163, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+54], v77, v[vgprValuC+54]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v76, v[vgprValuC+54]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v54, v4
buffer_store_dword v54, v165, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+55], v81, v[vgprValuC+55]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v80, v[vgprValuC+55]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v55, v4
buffer_store_dword v55, v167, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
/* NOTE(review): the generator's original comment on this s_nop cited a "dwordx4" store, */
/* but every store in this sequence is buffer_store_dword - confirm the nop is still needed. */
s_nop 0                                            // 1 wait state before the next inst writes vgprs held by the previous buffer store inst
/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */

/******************************************/
/* Global Write Edge Batch #3 (d1,d0,vc1,vc0) = */
/*    (1,1,1,1:vw1); (1,1,1,2:vw1); (1,1,1,3:vw1); (1,0,2,0:vw1); (1,0,2,1:vw1); (1,0,2,2:vw1); (1,0,2,3:vw1); (1,1,2,0:vw1); (1,1,2,1:vw1); (1,1,2,2:vw1); (1,1,2,3:vw1); (1,0,3,0:vw1); (1,0,3,1:vw1); (1,0,3,2:vw1); (1,0,3,3:vw1); (1,1,3,0:vw1); (1,1,3,1:vw1); (1,1,3,2:vw1); (1,1,3,3:vw1); (1,0,4,0:vw1); (1,0,4,1:vw1); (1,0,4,2:vw1); (1,0,4,3:vw1); (1,1,4,0:vw1); (1,1,4,1:vw1); (1,1,4,2:vw1); (1,1,4,3:vw1); (1,0,5,0:vw1); (1,0,5,1:vw1); (1,0,5,2:vw1); (1,0,5,3:vw1); (1,1,5,0:vw1); (1,1,5,1:vw1); (1,1,5,2:vw1); (1,1,5,3:vw1); (1,0,6,0:vw1); (1,0,6,1:vw1); (1,0,6,2:vw1); (1,0,6,3:vw1); (1,1,6,0:vw1); (1,1,6,1:vw1); (1,1,6,2:vw1); (1,1,6,3:vw1); (1,0,7,0:vw1); (1,0,7,1:vw1); (1,0,7,2:vw1); (1,0,7,3:vw1) */
/******************************************/

/* calc coords, apply mask, and issue loads (if necessary) */
/* v8 holds BufferOOB; it replaces the store offset of every lane that fails the bounds mask. */
/* NOTE(review): BufferOOB is defined outside this span - presumably an offset that makes */
/* the buffer store fall outside the descriptor range; confirm. */
v_mov_b32 v8, BufferOOB
/* Each element below follows the same recipe: */
/*   coord0       = v0 + column offset, kept in v4 */
/*   mask         = (coord0 < SizeI) && (v1 < SizeJ), left in s[78:79] */
/*   LDS address  = (coord0 - 256*WorkGroup0) << 2; Bias is read at +0, scaleAlpha at +1024 */
/*   store offset = (v3 + coord0) << 2, swapped for v8 in lanes where the mask is clear */
/* s74 is reused three times per element: column offset, then the low half of the */
/* compare result s[74:75], then 256*WorkGroup0. */
/* NOTE(review): the inline "Cin addr" wording looks stale - the base operand is v3, which */
/* the row-increment code later in this batch labels coutRowPtrD; confirm. */
/* (d1,vc1,d0,vc0)=(1,1,1,1) */
s_mov_b32 s74, 129                                 // coordOffset0 d0=1 vc0=1
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v59, v4, s74
v_lshlrev_b32 v59, 0x2, v59                        // Bias address scaled by BPE
ds_read_b32 v56, v59 offset:0                      // load Bias
ds_read_b32 v57, v59 offset:1024                   // load scaleAlpha
v_add_lshl_u32 v58, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v58, v8, v58, s[78:79]               // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,1,1,2) */
s_mov_b32 s74, 130                                 // coordOffset0 d0=1 vc0=2
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v63, v4, s74
v_lshlrev_b32 v63, 0x2, v63                        // Bias address scaled by BPE
ds_read_b32 v60, v63 offset:0                      // load Bias
ds_read_b32 v61, v63 offset:1024                   // load scaleAlpha
v_add_lshl_u32 v62, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v62, v8, v62, s[78:79]               // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,1,1,3) */
s_mov_b32 s74, 131                                 // coordOffset0 d0=1 vc0=3
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v67, v4, s74
v_lshlrev_b32 v67, 0x2, v67                        // Bias address scaled by BPE
ds_read_b32 v64, v67 offset:0                      // load Bias
ds_read_b32 v65, v67 offset:1024                   // load scaleAlpha
v_add_lshl_u32 v66, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v66, v8, v66, s[78:79]               // LDD clip if OOB. offset
/* Row vc1=2. The row starts by stepping coord1 (v1) by one and advancing the C and D */
/* row pointers (v2, v3) by their row strides. Each element then builds: */
/*   mask         = (coord0 < SizeI) && (v1 < SizeJ), left in s[78:79] */
/*   LDS address  = (coord0 - 256*WorkGroup0) << 2 */
/*   store offset = (v3 + coord0) << 2, swapped for v8 in lanes where the mask is clear */
/* Column offsets 0,1,2,3 and 128 also read Bias (+0) and scaleAlpha (+1024) from LDS. */
/* Offsets 129,130,131 compute the LDS address (v89, v91, v93) but issue no ds_read here. */
/* NOTE(review): presumably those three elements reuse the values already held in */
/* v56/v57, v60/v61, v64/v65 for the same column offsets; confirm against the store code. */
/* (d1,vc1,d0,vc0)=(1,2,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[74:75], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v71, v0, s74
v_lshlrev_b32 v71, 0x2, v71                        // Bias address scaled by BPE
ds_read_b32 v68, v71 offset:0                      // load Bias
ds_read_b32 v69, v71 offset:1024                   // load scaleAlpha
v_add_lshl_u32 v70, v3, v0, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v70, v8, v70, s[78:79]               // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,2,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v75, v4, s74
v_lshlrev_b32 v75, 0x2, v75                        // Bias address scaled by BPE
ds_read_b32 v72, v75 offset:0                      // load Bias
ds_read_b32 v73, v75 offset:1024                   // load scaleAlpha
v_add_lshl_u32 v74, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v74, v8, v74, s[78:79]               // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,2,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v79, v4, s74
v_lshlrev_b32 v79, 0x2, v79                        // Bias address scaled by BPE
ds_read_b32 v76, v79 offset:0                      // load Bias
ds_read_b32 v77, v79 offset:1024                   // load scaleAlpha
v_add_lshl_u32 v78, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v78, v8, v78, s[78:79]               // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,2,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v83, v4, s74
v_lshlrev_b32 v83, 0x2, v83                        // Bias address scaled by BPE
ds_read_b32 v80, v83 offset:0                      // load Bias
ds_read_b32 v81, v83 offset:1024                   // load scaleAlpha
v_add_lshl_u32 v82, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v82, v8, v82, s[78:79]               // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,2,1,0) */
s_mov_b32 s74, 128                                 // coordOffset0 d0=1 vc0=0
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v87, v4, s74
v_lshlrev_b32 v87, 0x2, v87                        // Bias address scaled by BPE
ds_read_b32 v84, v87 offset:0                      // load Bias
ds_read_b32 v85, v87 offset:1024                   // load scaleAlpha
v_add_lshl_u32 v86, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v86, v8, v86, s[78:79]               // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,2,1,1) */
s_mov_b32 s74, 129                                 // coordOffset0 d0=1 vc0=1
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v89, v4, s74
v_lshlrev_b32 v89, 0x2, v89                        // Bias address scaled by BPE
v_add_lshl_u32 v88, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v88, v8, v88, s[78:79]               // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,2,1,2) */
s_mov_b32 s74, 130                                 // coordOffset0 d0=1 vc0=2
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v91, v4, s74
v_lshlrev_b32 v91, 0x2, v91                        // Bias address scaled by BPE
v_add_lshl_u32 v90, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v90, v8, v90, s[78:79]               // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,2,1,3) */
s_mov_b32 s74, 131                                 // coordOffset0 d0=1 vc0=3
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v93, v4, s74
v_lshlrev_b32 v93, 0x2, v93                        // Bias address scaled by BPE
v_add_lshl_u32 v92, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v92, v8, v92, s[78:79]               // LDD clip if OOB. offset
/* Row vc1=3. The row starts by stepping coord1 (v1) by one and advancing the C and D */
/* row pointers (v2, v3) by their row strides. Each of the eight elements then builds: */
/*   mask         = (coord0 < SizeI) && (v1 < SizeJ), left in s[78:79] */
/*   LDS address  = (coord0 - 256*WorkGroup0) << 2, in the odd register v95..v109 */
/*   store offset = (v3 + coord0) << 2 in the even register v94..v108, swapped for v8 */
/*                  in lanes where the mask is clear */
/* No ds_read is issued in this row; the LDS address registers are written but not */
/* consumed within this span. */
/* NOTE(review): presumably bias/scaleAlpha for these column offsets are reused from */
/* registers loaded earlier in the batch; confirm against the store code. */
/* (d1,vc1,d0,vc0)=(1,3,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[74:75], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v95, v0, s74
v_lshlrev_b32 v95, 0x2, v95                        // Bias address scaled by BPE
v_add_lshl_u32 v94, v3, v0, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v94, v8, v94, s[78:79]               // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,3,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v97, v4, s74
v_lshlrev_b32 v97, 0x2, v97                        // Bias address scaled by BPE
v_add_lshl_u32 v96, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v96, v8, v96, s[78:79]               // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,3,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v99, v4, s74
v_lshlrev_b32 v99, 0x2, v99                        // Bias address scaled by BPE
v_add_lshl_u32 v98, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v98, v8, v98, s[78:79]               // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,3,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v101, v4, s74
v_lshlrev_b32 v101, 0x2, v101                      // Bias address scaled by BPE
v_add_lshl_u32 v100, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v100, v8, v100, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,3,1,0) */
s_mov_b32 s74, 128                                 // coordOffset0 d0=1 vc0=0
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v103, v4, s74
v_lshlrev_b32 v103, 0x2, v103                      // Bias address scaled by BPE
v_add_lshl_u32 v102, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v102, v8, v102, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,3,1,1) */
s_mov_b32 s74, 129                                 // coordOffset0 d0=1 vc0=1
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v105, v4, s74
v_lshlrev_b32 v105, 0x2, v105                      // Bias address scaled by BPE
v_add_lshl_u32 v104, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v104, v8, v104, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,3,1,2) */
s_mov_b32 s74, 130                                 // coordOffset0 d0=1 vc0=2
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v107, v4, s74
v_lshlrev_b32 v107, 0x2, v107                      // Bias address scaled by BPE
v_add_lshl_u32 v106, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v106, v8, v106, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,3,1,3) */
s_mov_b32 s74, 131                                 // coordOffset0 d0=1 vc0=3
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v109, v4, s74
v_lshlrev_b32 v109, 0x2, v109                      // Bias address scaled by BPE
v_add_lshl_u32 v108, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v108, v8, v108, s[78:79]             // LDD clip if OOB. offset
/* Row vc1=4, elements (1,4,0,0) through (1,4,1,2). The row starts by stepping coord1 */
/* (v1) by one and advancing the C and D row pointers (v2, v3) by their row strides. */
/* Each element then builds: */
/*   mask         = (coord0 < SizeI) && (v1 < SizeJ), left in s[78:79] */
/*   LDS address  = (coord0 - 256*WorkGroup0) << 2, in the odd register v111..v123 */
/*   store offset = (v3 + coord0) << 2 in the even register v110..v122, swapped for v8 */
/*                  in lanes where the mask is clear */
/* No ds_read is issued for these elements; the LDS address registers are written but */
/* not consumed within this span. */
/* (d1,vc1,d0,vc0)=(1,4,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[74:75], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v111, v0, s74
v_lshlrev_b32 v111, 0x2, v111                      // Bias address scaled by BPE
v_add_lshl_u32 v110, v3, v0, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v110, v8, v110, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,4,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v113, v4, s74
v_lshlrev_b32 v113, 0x2, v113                      // Bias address scaled by BPE
v_add_lshl_u32 v112, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v112, v8, v112, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,4,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v115, v4, s74
v_lshlrev_b32 v115, 0x2, v115                      // Bias address scaled by BPE
v_add_lshl_u32 v114, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v114, v8, v114, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,4,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v117, v4, s74
v_lshlrev_b32 v117, 0x2, v117                      // Bias address scaled by BPE
v_add_lshl_u32 v116, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v116, v8, v116, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,4,1,0) */
s_mov_b32 s74, 128                                 // coordOffset0 d0=1 vc0=0
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v119, v4, s74
v_lshlrev_b32 v119, 0x2, v119                      // Bias address scaled by BPE
v_add_lshl_u32 v118, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v118, v8, v118, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,4,1,1) */
s_mov_b32 s74, 129                                 // coordOffset0 d0=1 vc0=1
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v121, v4, s74
v_lshlrev_b32 v121, 0x2, v121                      // Bias address scaled by BPE
v_add_lshl_u32 v120, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v120, v8, v120, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,4,1,2) */
s_mov_b32 s74, 130                                 // coordOffset0 d0=1 vc0=2
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v123, v4, s74
v_lshlrev_b32 v123, 0x2, v123                      // Bias address scaled by BPE
v_add_lshl_u32 v122, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v122, v8, v122, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,4,1,3) */
s_mov_b32 s74, 131                                 // coordOffset0 d0=1 vc0=3
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v125, v4, s74
v_lshlrev_b32 v125, 0x2, v125                      // Bias address scaled by BPE
v_add_lshl_u32 v124, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v124, v8, v124, s[78:79]             // LDD clip if OOB. offset
// ---- Store-address setup: row (d1=1, vc1=5), eight columns ----
// Row advance first: v1 (coord1) += 1, v2 += StrideC1J, v3 += StrideD1J.
// Then, per element:
//   column    = v0 itself for vc0=0, otherwise v4 = v0 + offset
//               (1..3 as an immediate, 128..131 via s74 when d0=1)
//   s[78:79]  = (column < SizeI) && (v1 < SizeJ)   -> per-lane in-bounds mask
//   odd  vgpr = (column - 256*WorkGroup0) << 2     -> bias offset relative to the tile start
//   even vgpr = (v3 + column) << 2, replaced by v8 in lanes whose mask bit is clear
// s74 is reused as a scalar temp (column offset, then wgp0*MT0) on either side of
// the compare that writes s[74:75], so the instruction order must be preserved.
// NOTE(review): "Cin addr" in the per-line comments is generator text; the base
// added is v3 (coutRowPtrD per the row-advance comment below).
/* (d1,vc1,d0,vc0)=(1,5,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[74:75], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v127, v0, s74
v_lshlrev_b32 v127, 0x2, v127                      // Bias address scaled by BPE
v_add_lshl_u32 v126, v3, v0, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v126, v8, v126, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,5,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v129, v4, s74
v_lshlrev_b32 v129, 0x2, v129                      // Bias address scaled by BPE
v_add_lshl_u32 v128, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v128, v8, v128, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,5,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v131, v4, s74
v_lshlrev_b32 v131, 0x2, v131                      // Bias address scaled by BPE
v_add_lshl_u32 v130, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v130, v8, v130, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,5,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v133, v4, s74
v_lshlrev_b32 v133, 0x2, v133                      // Bias address scaled by BPE
v_add_lshl_u32 v132, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v132, v8, v132, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,5,1,0) */
s_mov_b32 s74, 128                                 // coordOffset0 d0=1 vc0=0
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v135, v4, s74
v_lshlrev_b32 v135, 0x2, v135                      // Bias address scaled by BPE
v_add_lshl_u32 v134, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v134, v8, v134, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,5,1,1) */
s_mov_b32 s74, 129                                 // coordOffset0 d0=1 vc0=1
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v137, v4, s74
v_lshlrev_b32 v137, 0x2, v137                      // Bias address scaled by BPE
v_add_lshl_u32 v136, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v136, v8, v136, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,5,1,2) */
s_mov_b32 s74, 130                                 // coordOffset0 d0=1 vc0=2
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v139, v4, s74
v_lshlrev_b32 v139, 0x2, v139                      // Bias address scaled by BPE
v_add_lshl_u32 v138, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v138, v8, v138, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,5,1,3) */
s_mov_b32 s74, 131                                 // coordOffset0 d0=1 vc0=3
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v141, v4, s74
v_lshlrev_b32 v141, 0x2, v141                      // Bias address scaled by BPE
v_add_lshl_u32 v140, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v140, v8, v140, s[78:79]             // LDD clip if OOB. offset
// ---- Store-address setup: row (d1=1, vc1=6), eight columns ----
// Row advance first: v1 (coord1) += 1, v2 += StrideC1J, v3 += StrideD1J.
// Then, per element:
//   column    = v0 itself for vc0=0, otherwise v4 = v0 + offset
//               (1..3 as an immediate, 128..131 via s74 when d0=1)
//   s[78:79]  = (column < SizeI) && (v1 < SizeJ)   -> per-lane in-bounds mask
//   bias vgpr = (column - 256*WorkGroup0) << 2     -> bias offset relative to the tile start
//   addr vgpr = (v3 + column) << 2, replaced by v8 in lanes whose mask bit is clear
// s74 is reused as a scalar temp (column offset, then wgp0*MT0) on either side of
// the compare that writes s[74:75], so the instruction order must be preserved.
// The destination register pairs are not contiguous in this row: (v144,v145) is
// followed by (v149,v150), so the addr/bias registers switch from even/odd to
// odd/even numbering. v146-v148 are not written here.
// NOTE(review): "Cin addr" in the per-line comments is generator text; the base
// added is v3 (coutRowPtrD per the row-advance comment below).
/* (d1,vc1,d0,vc0)=(1,6,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[74:75], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v143, v0, s74
v_lshlrev_b32 v143, 0x2, v143                      // Bias address scaled by BPE
v_add_lshl_u32 v142, v3, v0, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v142, v8, v142, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,6,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v145, v4, s74
v_lshlrev_b32 v145, 0x2, v145                      // Bias address scaled by BPE
v_add_lshl_u32 v144, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v144, v8, v144, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,6,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v150, v4, s74
v_lshlrev_b32 v150, 0x2, v150                      // Bias address scaled by BPE
v_add_lshl_u32 v149, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v149, v8, v149, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,6,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v152, v4, s74
v_lshlrev_b32 v152, 0x2, v152                      // Bias address scaled by BPE
v_add_lshl_u32 v151, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v151, v8, v151, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,6,1,0) */
s_mov_b32 s74, 128                                 // coordOffset0 d0=1 vc0=0
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v154, v4, s74
v_lshlrev_b32 v154, 0x2, v154                      // Bias address scaled by BPE
v_add_lshl_u32 v153, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v153, v8, v153, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,6,1,1) */
s_mov_b32 s74, 129                                 // coordOffset0 d0=1 vc0=1
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v156, v4, s74
v_lshlrev_b32 v156, 0x2, v156                      // Bias address scaled by BPE
v_add_lshl_u32 v155, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v155, v8, v155, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,6,1,2) */
s_mov_b32 s74, 130                                 // coordOffset0 d0=1 vc0=2
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v158, v4, s74
v_lshlrev_b32 v158, 0x2, v158                      // Bias address scaled by BPE
v_add_lshl_u32 v157, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v157, v8, v157, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,6,1,3) */
s_mov_b32 s74, 131                                 // coordOffset0 d0=1 vc0=3
v_add_co_u32 v4, vcc, v0, s74                      // coord0.2: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v160, v4, s74
v_lshlrev_b32 v160, 0x2, v160                      // Bias address scaled by BPE
v_add_lshl_u32 v159, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v159, v8, v159, s[78:79]             // LDD clip if OOB. offset
// ---- Store-address setup: row (d1=1, vc1=7), first four columns (d0=0) ----
// Row advance first: v1 (coord1) += 1, v2 += StrideC1J, v3 += StrideD1J.
// Then, per element:
//   column    = v0 itself for vc0=0, otherwise v4 = v0 + 1..3
//   s[78:79]  = (column < SizeI) && (v1 < SizeJ)   -> per-lane in-bounds mask
//   bias vgpr = (column - 256*WorkGroup0) << 2     -> bias offset relative to the tile start
//   addr vgpr = (v3 + column) << 2, replaced by v8 in lanes whose mask bit is clear
// s[74:75] holds the column compare result until the s_and; s74 is then
// overwritten with wgp0*MT0, so the instruction order must be preserved.
// NOTE(review): "Cin addr" in the per-line comments is generator text; the base
// added is v3 (coutRowPtrD per the row-advance comment below).
/* (d1,vc1,d0,vc0)=(1,7,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[74:75], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v162, v0, s74
v_lshlrev_b32 v162, 0x2, v162                      // Bias address scaled by BPE
v_add_lshl_u32 v161, v3, v0, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v161, v8, v161, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,7,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v164, v4, s74
v_lshlrev_b32 v164, 0x2, v164                      // Bias address scaled by BPE
v_add_lshl_u32 v163, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v163, v8, v163, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,7,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v166, v4, s74
v_lshlrev_b32 v166, 0x2, v166                      // Bias address scaled by BPE
v_add_lshl_u32 v165, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v165, v8, v165, s[78:79]             // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(1,7,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[78:79], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[78:79], s[74:75], s[78:79]             // in0 && in1
s_mul_i32 s74, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v168, v4, s74
v_lshlrev_b32 v168, 0x2, v168                      // Bias address scaled by BPE
v_add_lshl_u32 v167, v3, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v167, v8, v167, s[78:79]             // LDD clip if OOB. offset
// ---- Copy 47 accumulator results into v[vgprValuC+9 .. vgprValuC+55] ----
// Sources step by 4 accumulator registers per element: acc180..acc252 feed
// +9..+27, then the sequence restarts at acc129 and runs to acc237 for +28..+55.
// The vreg[N] index in each trailing comment (141..187) does not match the
// destination operand; presumably it is the generator's running element number
// across the whole store phase - confirm.
v_accvgpr_read_b32 v[vgprValuC+9], acc180          // copy acc to vreg[141]
v_accvgpr_read_b32 v[vgprValuC+10], acc184         // copy acc to vreg[142]
v_accvgpr_read_b32 v[vgprValuC+11], acc188         // copy acc to vreg[143]
v_accvgpr_read_b32 v[vgprValuC+12], acc192         // copy acc to vreg[144]
v_accvgpr_read_b32 v[vgprValuC+13], acc196         // copy acc to vreg[145]
v_accvgpr_read_b32 v[vgprValuC+14], acc200         // copy acc to vreg[146]
v_accvgpr_read_b32 v[vgprValuC+15], acc204         // copy acc to vreg[147]
v_accvgpr_read_b32 v[vgprValuC+16], acc208         // copy acc to vreg[148]
v_accvgpr_read_b32 v[vgprValuC+17], acc212         // copy acc to vreg[149]
v_accvgpr_read_b32 v[vgprValuC+18], acc216         // copy acc to vreg[150]
v_accvgpr_read_b32 v[vgprValuC+19], acc220         // copy acc to vreg[151]
v_accvgpr_read_b32 v[vgprValuC+20], acc224         // copy acc to vreg[152]
v_accvgpr_read_b32 v[vgprValuC+21], acc228         // copy acc to vreg[153]
v_accvgpr_read_b32 v[vgprValuC+22], acc232         // copy acc to vreg[154]
v_accvgpr_read_b32 v[vgprValuC+23], acc236         // copy acc to vreg[155]
v_accvgpr_read_b32 v[vgprValuC+24], acc240         // copy acc to vreg[156]
v_accvgpr_read_b32 v[vgprValuC+25], acc244         // copy acc to vreg[157]
v_accvgpr_read_b32 v[vgprValuC+26], acc248         // copy acc to vreg[158]
v_accvgpr_read_b32 v[vgprValuC+27], acc252         // copy acc to vreg[159]
v_accvgpr_read_b32 v[vgprValuC+28], acc129         // copy acc to vreg[160]
v_accvgpr_read_b32 v[vgprValuC+29], acc133         // copy acc to vreg[161]
v_accvgpr_read_b32 v[vgprValuC+30], acc137         // copy acc to vreg[162]
v_accvgpr_read_b32 v[vgprValuC+31], acc141         // copy acc to vreg[163]
v_accvgpr_read_b32 v[vgprValuC+32], acc145         // copy acc to vreg[164]
v_accvgpr_read_b32 v[vgprValuC+33], acc149         // copy acc to vreg[165]
v_accvgpr_read_b32 v[vgprValuC+34], acc153         // copy acc to vreg[166]
v_accvgpr_read_b32 v[vgprValuC+35], acc157         // copy acc to vreg[167]
v_accvgpr_read_b32 v[vgprValuC+36], acc161         // copy acc to vreg[168]
v_accvgpr_read_b32 v[vgprValuC+37], acc165         // copy acc to vreg[169]
v_accvgpr_read_b32 v[vgprValuC+38], acc169         // copy acc to vreg[170]
v_accvgpr_read_b32 v[vgprValuC+39], acc173         // copy acc to vreg[171]
v_accvgpr_read_b32 v[vgprValuC+40], acc177         // copy acc to vreg[172]
v_accvgpr_read_b32 v[vgprValuC+41], acc181         // copy acc to vreg[173]
v_accvgpr_read_b32 v[vgprValuC+42], acc185         // copy acc to vreg[174]
v_accvgpr_read_b32 v[vgprValuC+43], acc189         // copy acc to vreg[175]
v_accvgpr_read_b32 v[vgprValuC+44], acc193         // copy acc to vreg[176]
v_accvgpr_read_b32 v[vgprValuC+45], acc197         // copy acc to vreg[177]
v_accvgpr_read_b32 v[vgprValuC+46], acc201         // copy acc to vreg[178]
v_accvgpr_read_b32 v[vgprValuC+47], acc205         // copy acc to vreg[179]
v_accvgpr_read_b32 v[vgprValuC+48], acc209         // copy acc to vreg[180]
v_accvgpr_read_b32 v[vgprValuC+49], acc213         // copy acc to vreg[181]
v_accvgpr_read_b32 v[vgprValuC+50], acc217         // copy acc to vreg[182]
v_accvgpr_read_b32 v[vgprValuC+51], acc221         // copy acc to vreg[183]
v_accvgpr_read_b32 v[vgprValuC+52], acc225         // copy acc to vreg[184]
v_accvgpr_read_b32 v[vgprValuC+53], acc229         // copy acc to vreg[185]
v_accvgpr_read_b32 v[vgprValuC+54], acc233         // copy acc to vreg[186]
v_accvgpr_read_b32 v[vgprValuC+55], acc237         // copy acc to vreg[187]

// ---- Scale the 47 copied results by alpha, then wait for pending LDS/scalar loads ----
// Element +9 is multiplied alone with v_mul_f32; elements +10..+55 are handled
// two at a time with v_pk_mul_f32 on the pairs (+10,+11) ... (+54,+55).
// op_sel_hi:[0,1,1] clears the high-lane selector for src0, so both lanes are
// expected to use s[sgprAlpha] (the low dword of the SGPR pair) while the VGPR
// operands use their natural low/high dwords - see the VOP3P section of the ISA guide.
// NOTE(review): the lone scalar multiply for +9 is presumably there because the
// packed form wants an even-aligned register pair - confirm against vgprValuC.
/* rC *= alpha batchElements=[(1, 1, 1, 1), (1, 1, 1, 2), (1, 1, 1, 3), (1, 0, 2, 0), (1, 0, 2, 1), (1, 0, 2, 2), (1, 0, 2, 3), (1, 1, 2, 0), (1, 1, 2, 1), (1, 1, 2, 2), (1, 1, 2, 3), (1, 0, 3, 0), (1, 0, 3, 1), (1, 0, 3, 2), (1, 0, 3, 3), (1, 1, 3, 0), (1, 1, 3, 1), (1, 1, 3, 2), (1, 1, 3, 3), (1, 0, 4, 0), (1, 0, 4, 1), (1, 0, 4, 2), (1, 0, 4, 3), (1, 1, 4, 0), (1, 1, 4, 1), (1, 1, 4, 2), (1, 1, 4, 3), (1, 0, 5, 0), (1, 0, 5, 1), (1, 0, 5, 2), (1, 0, 5, 3), (1, 1, 5, 0), (1, 1, 5, 1), (1, 1, 5, 2), (1, 1, 5, 3), (1, 0, 6, 0), (1, 0, 6, 1), (1, 0, 6, 2), (1, 0, 6, 3), (1, 1, 6, 0), (1, 1, 6, 1), (1, 1, 6, 2), (1, 1, 6, 3), (1, 0, 7, 0), (1, 0, 7, 1), (1, 0, 7, 2), (1, 0, 7, 3)] */
v_mul_f32 v[vgprValuC+9], s[sgprAlpha], v[vgprValuC+9] // *= alpha
v_pk_mul_f32 v[vgprValuC+10:vgprValuC+10+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+10:vgprValuC+10+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+12:vgprValuC+12+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+12:vgprValuC+12+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+14:vgprValuC+14+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+14:vgprValuC+14+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+16:vgprValuC+16+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+16:vgprValuC+16+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+18:vgprValuC+18+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+18:vgprValuC+18+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+20:vgprValuC+20+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+20:vgprValuC+20+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+22:vgprValuC+22+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+22:vgprValuC+22+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] op_sel_hi:[0,1,1] // *= alpha (pk)
// Block until the LGKM counter reaches zero before the bias / ScaleAlphaVec
// VGPRs are consumed by the per-element code that follows.
s_waitcnt lgkmcnt(0)                               // wait for Bias LDS, ScaleAlphaVec

// ---- Per-element epilogue: scale, add bias, call out, store D ----
// Element i (i = 0,1,2,...) uses result register v[vgprValuC+9+i] and a group of
// VGPRs starting at v56 with stride 4:
//   v(57+4i) : ScaleAlphaVec factor   -> result *= factor
//   v(56+4i) : bias value             -> v4 = bias + result
//   v(58+4i) : byte offset for the buffer store into SrdD
// s_swappc_b64 jumps to the address held in s[8:9] and saves the return PC in
// s[64:65]. The called routine is outside this view; since v4 is written just
// before the call and copied out right after, it presumably transforms v4 in
// place (e.g. an activation function) - confirm.
// NOTE(review): the post-call copy and the store name the result register
// literally (v9, v10, ...), while the arithmetic uses v[vgprValuC+9+i]. The two
// agree only if vgprValuC is 0, which is defined outside this view - confirm.
/* apply mask, calc new C and issue writes */
v_mul_f32 v[vgprValuC+9], v57, v[vgprValuC+9]      // *= ScaleAlphaVecVMul
v_add_f32 v4, v56, v[vgprValuC+9]                  // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v9, v4
buffer_store_dword v9, v58, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+10], v61, v[vgprValuC+10]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v60, v[vgprValuC+10]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v10, v4
buffer_store_dword v10, v62, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+11], v65, v[vgprValuC+11]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v64, v[vgprValuC+11]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v11, v4
buffer_store_dword v11, v66, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+12], v69, v[vgprValuC+12]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v68, v[vgprValuC+12]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v12, v4
buffer_store_dword v12, v70, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+13], v73, v[vgprValuC+13]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v72, v[vgprValuC+13]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v13, v4
buffer_store_dword v13, v74, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+14], v77, v[vgprValuC+14]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v76, v[vgprValuC+14]                 // C += bias
s_swappc_b64 s[64:65], s[8:9]
v_mov_b32 v14, v4
buffer_store_dword v14, v78, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
v_mul_f32 v[vgprValuC+15], v81, v[vgprValuC+15]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v80, v[vgprValuC+15]                 // C += bias
s_swappc_b64 