[fastregalloc] Enhance the heuristics for liveout in self loop.
For the case below, the virtual register is defined twice in the self loop. We don't need to spill %0 after the third instruction, `%0 = def (tied %0)`, because %0 is defined again by the second instruction, `%0 = def`, before any use can be reached through the back edge.

  1 bb.1
  2   %0 = def
  3   %0 = def (tied %0)
  4   ...
  5   jmp bb.1

Reviewed By: MatzeB

Differential Revision: https://reviews.llvm.org/D125079
parent d883a02a7c
commit 44e8a205f4
@@ -359,7 +359,16 @@ bool RegAllocFast::mayLiveOut(Register VirtReg) {
   // If this block loops back to itself, it is necessary to check whether the
   // use comes after the def.
   if (MBB->isSuccessor(MBB)) {
-    SelfLoopDef = MRI->getUniqueVRegDef(VirtReg);
+    // Find the first def in the self loop MBB.
+    for (const MachineInstr &DefInst : MRI->def_instructions(VirtReg)) {
+      if (DefInst.getParent() != MBB) {
+        MayLiveAcrossBlocks.set(Register::virtReg2Index(VirtReg));
+        return true;
+      } else {
+        if (!SelfLoopDef || dominates(*MBB, DefInst.getIterator(), SelfLoopDef))
+          SelfLoopDef = &DefInst;
+      }
+    }
     if (!SelfLoopDef) {
       MayLiveAcrossBlocks.set(Register::virtReg2Index(VirtReg));
       return true;

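A minimal standalone model of the new heuristic, for readers outside the LLVM tree. The names and types below (`Instr`, `mayLiveOutSelfLoop`) are simplified stand-ins rather than the real `MachineInstr`/`MachineRegisterInfo` API: scan every def of the virtual register; a def in another block forces the conservative "may live out" answer, otherwise record the earliest def in the self-looping block so the caller can check whether any use precedes it.

    #include <vector>

    // Simplified stand-ins for MachineBasicBlock/MachineInstr.
    struct Instr {
      int Parent; // id of the containing basic block
      int Pos;    // position within that block; models the dominates() query
    };

    // Sketch of the patched mayLiveOut logic for a block that branches back
    // to itself: returns true when the register must conservatively be
    // treated as live out; otherwise FirstDef is left pointing at the
    // earliest def inside the block.
    bool mayLiveOutSelfLoop(int LoopBlock, const std::vector<Instr> &Defs,
                            const Instr *&FirstDef) {
      FirstDef = nullptr;
      for (const Instr &Def : Defs) {
        if (Def.Parent != LoopBlock)
          return true; // defined in another block: assume live out
        if (!FirstDef || Def.Pos < FirstDef->Pos)
          FirstDef = &Def; // keep the earliest def in the loop block
      }
      return FirstDef == nullptr; // no def at all: assume live out
    }

In the commit-message example both defs of %0 sit in bb.1, so FirstDef lands on the plain `%0 = def`; every use reached through the back edge sees that redefinition first, which is why the spill after the tied def can be dropped, as the test updates below show.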
@@ -257,43 +257,41 @@ define i128 @test_rmw_add_128(i128* %dst) {
 ;
 ; LSE-LABEL: test_rmw_add_128:
 ; LSE: // %bb.0: // %entry
-; LSE-NEXT: sub sp, sp, #80
-; LSE-NEXT: .cfi_def_cfa_offset 80
-; LSE-NEXT: str x0, [sp, #56] // 8-byte Folded Spill
+; LSE-NEXT: sub sp, sp, #48
+; LSE-NEXT: .cfi_def_cfa_offset 48
+; LSE-NEXT: str x0, [sp, #24] // 8-byte Folded Spill
 ; LSE-NEXT: ldr x8, [x0, #8]
 ; LSE-NEXT: ldr x9, [x0]
-; LSE-NEXT: str x9, [sp, #64] // 8-byte Folded Spill
-; LSE-NEXT: str x8, [sp, #72] // 8-byte Folded Spill
+; LSE-NEXT: str x9, [sp, #32] // 8-byte Folded Spill
+; LSE-NEXT: str x8, [sp, #40] // 8-byte Folded Spill
 ; LSE-NEXT: b .LBB4_1
 ; LSE-NEXT: .LBB4_1: // %atomicrmw.start
 ; LSE-NEXT: // =>This Inner Loop Header: Depth=1
-; LSE-NEXT: ldr x10, [sp, #72] // 8-byte Folded Reload
-; LSE-NEXT: ldr x8, [sp, #64] // 8-byte Folded Reload
-; LSE-NEXT: ldr x9, [sp, #56] // 8-byte Folded Reload
+; LSE-NEXT: ldr x10, [sp, #40] // 8-byte Folded Reload
+; LSE-NEXT: ldr x8, [sp, #32] // 8-byte Folded Reload
+; LSE-NEXT: ldr x9, [sp, #24] // 8-byte Folded Reload
 ; LSE-NEXT: mov x0, x8
 ; LSE-NEXT: mov x1, x10
-; LSE-NEXT: stp x0, x1, [sp, #8] // 16-byte Folded Spill
 ; LSE-NEXT: adds x2, x8, #1
 ; LSE-NEXT: cinc x11, x10, hs
 ; LSE-NEXT: // kill: def $x2 killed $x2 def $x2_x3
 ; LSE-NEXT: mov x3, x11
 ; LSE-NEXT: caspal x0, x1, x2, x3, [x9]
-; LSE-NEXT: stp x0, x1, [sp, #24] // 16-byte Folded Spill
 ; LSE-NEXT: mov x9, x1
-; LSE-NEXT: str x9, [sp, #40] // 8-byte Folded Spill
+; LSE-NEXT: str x9, [sp, #8] // 8-byte Folded Spill
 ; LSE-NEXT: eor x11, x9, x10
 ; LSE-NEXT: mov x10, x0
-; LSE-NEXT: str x10, [sp, #48] // 8-byte Folded Spill
+; LSE-NEXT: str x10, [sp, #16] // 8-byte Folded Spill
 ; LSE-NEXT: eor x8, x10, x8
 ; LSE-NEXT: orr x8, x8, x11
-; LSE-NEXT: str x10, [sp, #64] // 8-byte Folded Spill
-; LSE-NEXT: str x9, [sp, #72] // 8-byte Folded Spill
+; LSE-NEXT: str x10, [sp, #32] // 8-byte Folded Spill
+; LSE-NEXT: str x9, [sp, #40] // 8-byte Folded Spill
 ; LSE-NEXT: cbnz x8, .LBB4_1
 ; LSE-NEXT: b .LBB4_2
 ; LSE-NEXT: .LBB4_2: // %atomicrmw.end
-; LSE-NEXT: ldr x1, [sp, #40] // 8-byte Folded Reload
-; LSE-NEXT: ldr x0, [sp, #48] // 8-byte Folded Reload
-; LSE-NEXT: add sp, sp, #80
+; LSE-NEXT: ldr x1, [sp, #8] // 8-byte Folded Reload
+; LSE-NEXT: ldr x0, [sp, #16] // 8-byte Folded Reload
+; LSE-NEXT: add sp, sp, #48
 ; LSE-NEXT: ret
 entry:
   %res = atomicrmw add i128* %dst, i128 1 seq_cst

@@ -355,7 +353,6 @@ define i8 @test_rmw_nand_8(i8* %dst) {
 ; LSE-NEXT: orr w10, w8, #0xfffffffe
 ; LSE-NEXT: mov w8, w9
 ; LSE-NEXT: casalb w8, w10, [x11]
-; LSE-NEXT: str w8, [sp, #8] // 4-byte Folded Spill
 ; LSE-NEXT: subs w9, w8, w9, uxtb
 ; LSE-NEXT: cset w9, eq
 ; LSE-NEXT: str w8, [sp, #12] // 4-byte Folded Spill

@@ -428,7 +425,6 @@ define i16 @test_rmw_nand_16(i16* %dst) {
 ; LSE-NEXT: orr w10, w8, #0xfffffffe
 ; LSE-NEXT: mov w8, w9
 ; LSE-NEXT: casalh w8, w10, [x11]
-; LSE-NEXT: str w8, [sp, #8] // 4-byte Folded Spill
 ; LSE-NEXT: subs w9, w8, w9, uxth
 ; LSE-NEXT: cset w9, eq
 ; LSE-NEXT: str w8, [sp, #12] // 4-byte Folded Spill

@@ -501,7 +497,6 @@ define i32 @test_rmw_nand_32(i32* %dst) {
 ; LSE-NEXT: orr w10, w8, #0xfffffffe
 ; LSE-NEXT: mov w8, w9
 ; LSE-NEXT: casal w8, w10, [x11]
-; LSE-NEXT: str w8, [sp, #8] // 4-byte Folded Spill
 ; LSE-NEXT: subs w9, w8, w9
 ; LSE-NEXT: cset w9, eq
 ; LSE-NEXT: str w8, [sp, #12] // 4-byte Folded Spill

@@ -580,7 +575,6 @@ define i64 @test_rmw_nand_64(i64* %dst) {
 ; LSE-NEXT: orr x10, x8, #0xfffffffffffffffe
 ; LSE-NEXT: mov x8, x9
 ; LSE-NEXT: casal x8, x10, [x11]
-; LSE-NEXT: str x8, [sp] // 8-byte Folded Spill
 ; LSE-NEXT: subs x9, x8, x9
 ; LSE-NEXT: cset w9, eq
 ; LSE-NEXT: str x8, [sp, #8] // 8-byte Folded Spill

@@ -657,22 +651,21 @@ define i128 @test_rmw_nand_128(i128* %dst) {
 ;
 ; LSE-LABEL: test_rmw_nand_128:
 ; LSE: // %bb.0: // %entry
-; LSE-NEXT: sub sp, sp, #80
-; LSE-NEXT: .cfi_def_cfa_offset 80
-; LSE-NEXT: str x0, [sp, #56] // 8-byte Folded Spill
+; LSE-NEXT: sub sp, sp, #48
+; LSE-NEXT: .cfi_def_cfa_offset 48
+; LSE-NEXT: str x0, [sp, #24] // 8-byte Folded Spill
 ; LSE-NEXT: ldr x8, [x0, #8]
 ; LSE-NEXT: ldr x9, [x0]
-; LSE-NEXT: str x9, [sp, #64] // 8-byte Folded Spill
-; LSE-NEXT: str x8, [sp, #72] // 8-byte Folded Spill
+; LSE-NEXT: str x9, [sp, #32] // 8-byte Folded Spill
+; LSE-NEXT: str x8, [sp, #40] // 8-byte Folded Spill
 ; LSE-NEXT: b .LBB9_1
 ; LSE-NEXT: .LBB9_1: // %atomicrmw.start
 ; LSE-NEXT: // =>This Inner Loop Header: Depth=1
-; LSE-NEXT: ldr x10, [sp, #72] // 8-byte Folded Reload
-; LSE-NEXT: ldr x8, [sp, #64] // 8-byte Folded Reload
-; LSE-NEXT: ldr x9, [sp, #56] // 8-byte Folded Reload
+; LSE-NEXT: ldr x10, [sp, #40] // 8-byte Folded Reload
+; LSE-NEXT: ldr x8, [sp, #32] // 8-byte Folded Reload
+; LSE-NEXT: ldr x9, [sp, #24] // 8-byte Folded Reload
 ; LSE-NEXT: mov x0, x8
 ; LSE-NEXT: mov x1, x10
-; LSE-NEXT: stp x0, x1, [sp, #8] // 16-byte Folded Spill
 ; LSE-NEXT: mov w11, w8
 ; LSE-NEXT: mvn w12, w11
 ; LSE-NEXT: // implicit-def: $x11

@@ -682,22 +675,21 @@ define i128 @test_rmw_nand_128(i128* %dst) {
 ; LSE-NEXT: // kill: def $x2 killed $x2 def $x2_x3
 ; LSE-NEXT: mov x3, x11
 ; LSE-NEXT: caspal x0, x1, x2, x3, [x9]
-; LSE-NEXT: stp x0, x1, [sp, #24] // 16-byte Folded Spill
 ; LSE-NEXT: mov x9, x1
-; LSE-NEXT: str x9, [sp, #40] // 8-byte Folded Spill
+; LSE-NEXT: str x9, [sp, #8] // 8-byte Folded Spill
 ; LSE-NEXT: eor x11, x9, x10
 ; LSE-NEXT: mov x10, x0
-; LSE-NEXT: str x10, [sp, #48] // 8-byte Folded Spill
+; LSE-NEXT: str x10, [sp, #16] // 8-byte Folded Spill
 ; LSE-NEXT: eor x8, x10, x8
 ; LSE-NEXT: orr x8, x8, x11
-; LSE-NEXT: str x10, [sp, #64] // 8-byte Folded Spill
-; LSE-NEXT: str x9, [sp, #72] // 8-byte Folded Spill
+; LSE-NEXT: str x10, [sp, #32] // 8-byte Folded Spill
+; LSE-NEXT: str x9, [sp, #40] // 8-byte Folded Spill
 ; LSE-NEXT: cbnz x8, .LBB9_1
 ; LSE-NEXT: b .LBB9_2
 ; LSE-NEXT: .LBB9_2: // %atomicrmw.end
-; LSE-NEXT: ldr x1, [sp, #40] // 8-byte Folded Reload
-; LSE-NEXT: ldr x0, [sp, #48] // 8-byte Folded Reload
-; LSE-NEXT: add sp, sp, #80
+; LSE-NEXT: ldr x1, [sp, #8] // 8-byte Folded Reload
+; LSE-NEXT: ldr x0, [sp, #16] // 8-byte Folded Reload
+; LSE-NEXT: add sp, sp, #48
 ; LSE-NEXT: ret
 entry:
   %res = atomicrmw nand i128* %dst, i128 1 seq_cst

@@ -55,11 +55,11 @@ body: |
 ; GCN: renamable $vgpr2 = GLOBAL_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, implicit $exec
 ; GCN: GLOBAL_STORE_DWORD renamable $vgpr0_vgpr1, renamable $vgpr2, 0, 0, implicit $exec
 ; GCN: renamable $vgpr2 = GLOBAL_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, implicit $exec
-; GCN: SI_SPILL_V32_SAVE $vgpr2, %stack.1, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5)
-; GCN: GLOBAL_STORE_DWORD renamable $vgpr0_vgpr1, renamable $vgpr2, 0, 0, implicit $exec
+; GCN: GLOBAL_STORE_DWORD renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec
 ; GCN: S_CBRANCH_EXECZ %bb.1, implicit $exec
 ; GCN: bb.2:
 ; GCN: S_ENDPGM 0

 bb.0:
   liveins: $vgpr0_vgpr1
   %0:vreg_64 = COPY $vgpr0_vgpr1

@@ -226,7 +226,6 @@ define void @atomic_fetch_nand32(i32 %x) nounwind {
 ; X64-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload
 ; X64-NEXT: movl %eax, %ecx
 ; X64-NEXT: andl %edx, %ecx
-; X64-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; X64-NEXT: notl %ecx
 ; X64-NEXT: lock cmpxchgl %ecx, sc32(%rip)
 ; X64-NEXT: sete %cl

@@ -239,18 +238,17 @@ define void @atomic_fetch_nand32(i32 %x) nounwind {
 ;
 ; X86-LABEL: atomic_fetch_nand32:
 ; X86: # %bb.0:
-; X86-NEXT: subl $12, %esp
+; X86-NEXT: subl $8, %esp
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT: movl sc32, %eax
 ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT: .LBB5_1: # %atomicrmw.start
 ; X86-NEXT: # =>This Inner Loop Header: Depth=1
 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl (%esp), %edx # 4-byte Reload
 ; X86-NEXT: movl %eax, %ecx
 ; X86-NEXT: andl %edx, %ecx
-; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill
 ; X86-NEXT: notl %ecx
 ; X86-NEXT: lock cmpxchgl %ecx, sc32
 ; X86-NEXT: sete %cl

@@ -259,7 +257,7 @@ define void @atomic_fetch_nand32(i32 %x) nounwind {
 ; X86-NEXT: jne .LBB5_2
 ; X86-NEXT: jmp .LBB5_1
 ; X86-NEXT: .LBB5_2: # %atomicrmw.end
-; X86-NEXT: addl $12, %esp
+; X86-NEXT: addl $8, %esp
 ; X86-NEXT: retl
   %t1 = atomicrmw nand i32* @sc32, i32 %x acquire
   ret void

@@ -277,7 +275,6 @@ define void @atomic_fetch_max32(i32 %x) nounwind {
 ; X64-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
 ; X64-NEXT: movl %eax, %edx
 ; X64-NEXT: subl %ecx, %edx
-; X64-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; X64-NEXT: cmovgl %eax, %ecx
 ; X64-NEXT: lock cmpxchgl %ecx, sc32(%rip)
 ; X64-NEXT: sete %cl

@@ -290,18 +287,17 @@ define void @atomic_fetch_max32(i32 %x) nounwind {
 ;
 ; X86-CMOV-LABEL: atomic_fetch_max32:
 ; X86-CMOV: # %bb.0:
-; X86-CMOV-NEXT: subl $12, %esp
+; X86-CMOV-NEXT: subl $8, %esp
 ; X86-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-CMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-CMOV-NEXT: movl %eax, (%esp) # 4-byte Spill
 ; X86-CMOV-NEXT: movl sc32, %eax
 ; X86-CMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-CMOV-NEXT: .LBB6_1: # %atomicrmw.start
 ; X86-CMOV-NEXT: # =>This Inner Loop Header: Depth=1
 ; X86-CMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-CMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-CMOV-NEXT: movl (%esp), %ecx # 4-byte Reload
 ; X86-CMOV-NEXT: movl %eax, %edx
 ; X86-CMOV-NEXT: subl %ecx, %edx
-; X86-CMOV-NEXT: movl %edx, (%esp) # 4-byte Spill
 ; X86-CMOV-NEXT: cmovgl %eax, %ecx
 ; X86-CMOV-NEXT: lock cmpxchgl %ecx, sc32
 ; X86-CMOV-NEXT: sete %cl

@@ -310,7 +306,7 @@ define void @atomic_fetch_max32(i32 %x) nounwind {
 ; X86-CMOV-NEXT: jne .LBB6_2
 ; X86-CMOV-NEXT: jmp .LBB6_1
 ; X86-CMOV-NEXT: .LBB6_2: # %atomicrmw.end
-; X86-CMOV-NEXT: addl $12, %esp
+; X86-CMOV-NEXT: addl $8, %esp
 ; X86-CMOV-NEXT: retl
 ;
 ; X86-NOCMOV-LABEL: atomic_fetch_max32:

@@ -396,7 +392,6 @@ define void @atomic_fetch_min32(i32 %x) nounwind {
 ; X64-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
 ; X64-NEXT: movl %eax, %edx
 ; X64-NEXT: subl %ecx, %edx
-; X64-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; X64-NEXT: cmovlel %eax, %ecx
 ; X64-NEXT: lock cmpxchgl %ecx, sc32(%rip)
 ; X64-NEXT: sete %cl

@@ -409,18 +404,17 @@ define void @atomic_fetch_min32(i32 %x) nounwind {
 ;
 ; X86-CMOV-LABEL: atomic_fetch_min32:
 ; X86-CMOV: # %bb.0:
-; X86-CMOV-NEXT: subl $12, %esp
+; X86-CMOV-NEXT: subl $8, %esp
 ; X86-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-CMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-CMOV-NEXT: movl %eax, (%esp) # 4-byte Spill
 ; X86-CMOV-NEXT: movl sc32, %eax
 ; X86-CMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-CMOV-NEXT: .LBB7_1: # %atomicrmw.start
 ; X86-CMOV-NEXT: # =>This Inner Loop Header: Depth=1
 ; X86-CMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-CMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-CMOV-NEXT: movl (%esp), %ecx # 4-byte Reload
 ; X86-CMOV-NEXT: movl %eax, %edx
 ; X86-CMOV-NEXT: subl %ecx, %edx
-; X86-CMOV-NEXT: movl %edx, (%esp) # 4-byte Spill
 ; X86-CMOV-NEXT: cmovlel %eax, %ecx
 ; X86-CMOV-NEXT: lock cmpxchgl %ecx, sc32
 ; X86-CMOV-NEXT: sete %cl

@@ -429,7 +423,7 @@ define void @atomic_fetch_min32(i32 %x) nounwind {
 ; X86-CMOV-NEXT: jne .LBB7_2
 ; X86-CMOV-NEXT: jmp .LBB7_1
 ; X86-CMOV-NEXT: .LBB7_2: # %atomicrmw.end
-; X86-CMOV-NEXT: addl $12, %esp
+; X86-CMOV-NEXT: addl $8, %esp
 ; X86-CMOV-NEXT: retl
 ;
 ; X86-NOCMOV-LABEL: atomic_fetch_min32:

@@ -515,7 +509,6 @@ define void @atomic_fetch_umax32(i32 %x) nounwind {
 ; X64-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
 ; X64-NEXT: movl %eax, %edx
 ; X64-NEXT: subl %ecx, %edx
-; X64-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; X64-NEXT: cmoval %eax, %ecx
 ; X64-NEXT: lock cmpxchgl %ecx, sc32(%rip)
 ; X64-NEXT: sete %cl

@@ -528,18 +521,17 @@ define void @atomic_fetch_umax32(i32 %x) nounwind {
 ;
 ; X86-CMOV-LABEL: atomic_fetch_umax32:
 ; X86-CMOV: # %bb.0:
-; X86-CMOV-NEXT: subl $12, %esp
+; X86-CMOV-NEXT: subl $8, %esp
 ; X86-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-CMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-CMOV-NEXT: movl %eax, (%esp) # 4-byte Spill
 ; X86-CMOV-NEXT: movl sc32, %eax
 ; X86-CMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-CMOV-NEXT: .LBB8_1: # %atomicrmw.start
 ; X86-CMOV-NEXT: # =>This Inner Loop Header: Depth=1
 ; X86-CMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-CMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-CMOV-NEXT: movl (%esp), %ecx # 4-byte Reload
 ; X86-CMOV-NEXT: movl %eax, %edx
 ; X86-CMOV-NEXT: subl %ecx, %edx
-; X86-CMOV-NEXT: movl %edx, (%esp) # 4-byte Spill
 ; X86-CMOV-NEXT: cmoval %eax, %ecx
 ; X86-CMOV-NEXT: lock cmpxchgl %ecx, sc32
 ; X86-CMOV-NEXT: sete %cl

@@ -548,7 +540,7 @@ define void @atomic_fetch_umax32(i32 %x) nounwind {
 ; X86-CMOV-NEXT: jne .LBB8_2
 ; X86-CMOV-NEXT: jmp .LBB8_1
 ; X86-CMOV-NEXT: .LBB8_2: # %atomicrmw.end
-; X86-CMOV-NEXT: addl $12, %esp
+; X86-CMOV-NEXT: addl $8, %esp
 ; X86-CMOV-NEXT: retl
 ;
 ; X86-NOCMOV-LABEL: atomic_fetch_umax32:

@@ -634,7 +626,6 @@ define void @atomic_fetch_umin32(i32 %x) nounwind {
 ; X64-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
 ; X64-NEXT: movl %eax, %edx
 ; X64-NEXT: subl %ecx, %edx
-; X64-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; X64-NEXT: cmovbel %eax, %ecx
 ; X64-NEXT: lock cmpxchgl %ecx, sc32(%rip)
 ; X64-NEXT: sete %cl

@@ -647,18 +638,17 @@ define void @atomic_fetch_umin32(i32 %x) nounwind {
 ;
 ; X86-CMOV-LABEL: atomic_fetch_umin32:
 ; X86-CMOV: # %bb.0:
-; X86-CMOV-NEXT: subl $12, %esp
+; X86-CMOV-NEXT: subl $8, %esp
 ; X86-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-CMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-CMOV-NEXT: movl %eax, (%esp) # 4-byte Spill
 ; X86-CMOV-NEXT: movl sc32, %eax
 ; X86-CMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-CMOV-NEXT: .LBB9_1: # %atomicrmw.start
 ; X86-CMOV-NEXT: # =>This Inner Loop Header: Depth=1
 ; X86-CMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-CMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-CMOV-NEXT: movl (%esp), %ecx # 4-byte Reload
 ; X86-CMOV-NEXT: movl %eax, %edx
 ; X86-CMOV-NEXT: subl %ecx, %edx
-; X86-CMOV-NEXT: movl %edx, (%esp) # 4-byte Spill
 ; X86-CMOV-NEXT: cmovbel %eax, %ecx
 ; X86-CMOV-NEXT: lock cmpxchgl %ecx, sc32
 ; X86-CMOV-NEXT: sete %cl

@@ -667,7 +657,7 @@ define void @atomic_fetch_umin32(i32 %x) nounwind {
 ; X86-CMOV-NEXT: jne .LBB9_2
 ; X86-CMOV-NEXT: jmp .LBB9_1
 ; X86-CMOV-NEXT: .LBB9_2: # %atomicrmw.end
-; X86-CMOV-NEXT: addl $12, %esp
+; X86-CMOV-NEXT: addl $8, %esp
 ; X86-CMOV-NEXT: retl
 ;
 ; X86-NOCMOV-LABEL: atomic_fetch_umin32:

@@ -273,7 +273,6 @@ define void @atomic_fetch_nand64(i64 %x) nounwind {
 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
 ; X64-NEXT: movq %rax, %rcx
 ; X64-NEXT: andq %rdx, %rcx
-; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT: notq %rcx
 ; X64-NEXT: lock cmpxchgq %rcx, sc64(%rip)
 ; X64-NEXT: sete %cl

@@ -313,7 +312,6 @@ define void @atomic_fetch_max64(i64 %x) nounwind {
 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
 ; X64-NEXT: movq %rax, %rdx
 ; X64-NEXT: subq %rcx, %rdx
-; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT: cmovgq %rax, %rcx
 ; X64-NEXT: lock cmpxchgq %rcx, sc64(%rip)
 ; X64-NEXT: sete %cl

@@ -406,7 +404,6 @@ define void @atomic_fetch_min64(i64 %x) nounwind {
 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
 ; X64-NEXT: movq %rax, %rdx
 ; X64-NEXT: subq %rcx, %rdx
-; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT: cmovleq %rax, %rcx
 ; X64-NEXT: lock cmpxchgq %rcx, sc64(%rip)
 ; X64-NEXT: sete %cl

@@ -499,7 +496,6 @@ define void @atomic_fetch_umax64(i64 %x) nounwind {
 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
 ; X64-NEXT: movq %rax, %rdx
 ; X64-NEXT: subq %rcx, %rdx
-; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT: cmovaq %rax, %rcx
 ; X64-NEXT: lock cmpxchgq %rcx, sc64(%rip)
 ; X64-NEXT: sete %cl

@@ -592,7 +588,6 @@ define void @atomic_fetch_umin64(i64 %x) nounwind {
 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
 ; X64-NEXT: movq %rax, %rdx
 ; X64-NEXT: subq %rcx, %rdx
-; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT: cmovbeq %rax, %rcx
 ; X64-NEXT: lock cmpxchgq %rcx, sc64(%rip)
 ; X64-NEXT: sete %cl

@@ -8,7 +8,7 @@ define void @atomic_fetch_add64() nounwind {
 ; X32: # %bb.0: # %entry
 ; X32-NEXT: pushl %ebx
 ; X32-NEXT: pushl %esi
-; X32-NEXT: subl $72, %esp
+; X32-NEXT: subl $40, %esp
 ; X32-NEXT: movl sc64+4, %edx
 ; X32-NEXT: movl sc64, %eax
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill

@@ -20,10 +20,8 @@ define void @atomic_fetch_add64() nounwind {
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT: movl %eax, %ebx
 ; X32-NEXT: addl $1, %ebx
-; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl %edx, %ecx
 ; X32-NEXT: adcl $0, %ecx
-; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: lock cmpxchg8b sc64
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill

@@ -41,10 +39,8 @@ define void @atomic_fetch_add64() nounwind {
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT: movl %eax, %ebx
 ; X32-NEXT: addl $3, %ebx
-; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl %edx, %ecx
 ; X32-NEXT: adcl $0, %ecx
-; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: lock cmpxchg8b sc64
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill

@@ -62,10 +58,8 @@ define void @atomic_fetch_add64() nounwind {
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT: movl %eax, %ebx
 ; X32-NEXT: addl $5, %ebx
-; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl %edx, %ecx
 ; X32-NEXT: adcl $0, %ecx
-; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: lock cmpxchg8b sc64
 ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill

@@ -76,28 +70,26 @@ define void @atomic_fetch_add64() nounwind {
 ; X32-NEXT: .LBB0_6: # %atomicrmw.end7
 ; X32-NEXT: movl sc64+4, %edx
 ; X32-NEXT: movl sc64, %eax
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
 ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: jmp .LBB0_7
 ; X32-NEXT: .LBB0_7: # %atomicrmw.start14
 ; X32-NEXT: # =>This Inner Loop Header: Depth=1
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT: movl (%esp), %eax # 4-byte Reload
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT: movl %eax, %ebx
 ; X32-NEXT: addl %ecx, %ebx
-; X32-NEXT: movl %ebx, (%esp) # 4-byte Spill
 ; X32-NEXT: movl %edx, %ecx
 ; X32-NEXT: adcl %esi, %ecx
-; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: lock cmpxchg8b sc64
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
 ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: jne .LBB0_7
 ; X32-NEXT: jmp .LBB0_8
 ; X32-NEXT: .LBB0_8: # %atomicrmw.end13
-; X32-NEXT: addl $72, %esp
+; X32-NEXT: addl $40, %esp
 ; X32-NEXT: popl %esi
 ; X32-NEXT: popl %ebx
 ; X32-NEXT: retl

@@ -114,7 +106,7 @@ define void @atomic_fetch_sub64() nounwind {
 ; X32: # %bb.0:
 ; X32-NEXT: pushl %ebx
 ; X32-NEXT: pushl %esi
-; X32-NEXT: subl $72, %esp
+; X32-NEXT: subl $40, %esp
 ; X32-NEXT: movl sc64+4, %edx
 ; X32-NEXT: movl sc64, %eax
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill

@@ -126,10 +118,8 @@ define void @atomic_fetch_sub64() nounwind {
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT: movl %eax, %ebx
 ; X32-NEXT: addl $-1, %ebx
-; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl %edx, %ecx
 ; X32-NEXT: adcl $-1, %ecx
-; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: lock cmpxchg8b sc64
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill

@@ -147,10 +137,8 @@ define void @atomic_fetch_sub64() nounwind {
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT: movl %eax, %ebx
 ; X32-NEXT: addl $-3, %ebx
-; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl %edx, %ecx
 ; X32-NEXT: adcl $-1, %ecx
-; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: lock cmpxchg8b sc64
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill

@@ -168,10 +156,8 @@ define void @atomic_fetch_sub64() nounwind {
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT: movl %eax, %ebx
 ; X32-NEXT: addl $-5, %ebx
-; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl %edx, %ecx
 ; X32-NEXT: adcl $-1, %ecx
-; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: lock cmpxchg8b sc64
 ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill

@@ -182,28 +168,26 @@ define void @atomic_fetch_sub64() nounwind {
 ; X32-NEXT: .LBB1_6: # %atomicrmw.end7
 ; X32-NEXT: movl sc64+4, %edx
 ; X32-NEXT: movl sc64, %eax
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
 ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: jmp .LBB1_7
 ; X32-NEXT: .LBB1_7: # %atomicrmw.start14
 ; X32-NEXT: # =>This Inner Loop Header: Depth=1
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT: movl (%esp), %eax # 4-byte Reload
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT: movl %eax, %ebx
 ; X32-NEXT: subl %ecx, %ebx
-; X32-NEXT: movl %ebx, (%esp) # 4-byte Spill
 ; X32-NEXT: movl %edx, %ecx
 ; X32-NEXT: sbbl %esi, %ecx
-; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: lock cmpxchg8b sc64
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
 ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: jne .LBB1_7
 ; X32-NEXT: jmp .LBB1_8
 ; X32-NEXT: .LBB1_8: # %atomicrmw.end13
-; X32-NEXT: addl $72, %esp
+; X32-NEXT: addl $40, %esp
 ; X32-NEXT: popl %esi
 ; X32-NEXT: popl %ebx
 ; X32-NEXT: retl

@@ -219,7 +203,7 @@ define void @atomic_fetch_and64() nounwind {
 ; X32: # %bb.0:
 ; X32-NEXT: pushl %ebx
 ; X32-NEXT: pushl %esi
-; X32-NEXT: subl $52, %esp
+; X32-NEXT: subl $32, %esp
 ; X32-NEXT: movl sc64+4, %edx
 ; X32-NEXT: movl sc64, %eax
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill

@@ -231,7 +215,6 @@ define void @atomic_fetch_and64() nounwind {
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT: movl %eax, %ebx
 ; X32-NEXT: andl $3, %ebx
-; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: xorl %ecx, %ecx
 ; X32-NEXT: lock cmpxchg8b sc64
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill

@@ -250,10 +233,8 @@ define void @atomic_fetch_and64() nounwind {
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT: movl %eax, %ebx
 ; X32-NEXT: andl $1, %ebx
-; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl %edx, %ecx
 ; X32-NEXT: andl $1, %ecx
-; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: lock cmpxchg8b sc64
 ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill

@@ -264,28 +245,26 @@ define void @atomic_fetch_and64() nounwind {
 ; X32-NEXT: .LBB2_4: # %atomicrmw.end1
 ; X32-NEXT: movl sc64+4, %edx
 ; X32-NEXT: movl sc64, %eax
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
 ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: jmp .LBB2_5
 ; X32-NEXT: .LBB2_5: # %atomicrmw.start8
 ; X32-NEXT: # =>This Inner Loop Header: Depth=1
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT: movl (%esp), %eax # 4-byte Reload
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT: movl %eax, %ebx
 ; X32-NEXT: andl %ecx, %ebx
-; X32-NEXT: movl %ebx, (%esp) # 4-byte Spill
 ; X32-NEXT: movl %edx, %ecx
 ; X32-NEXT: andl %esi, %ecx
-; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: lock cmpxchg8b sc64
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
 ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: jne .LBB2_5
 ; X32-NEXT: jmp .LBB2_6
 ; X32-NEXT: .LBB2_6: # %atomicrmw.end7
-; X32-NEXT: addl $52, %esp
+; X32-NEXT: addl $32, %esp
 ; X32-NEXT: popl %esi
 ; X32-NEXT: popl %ebx
 ; X32-NEXT: retl

@@ -300,7 +279,7 @@ define void @atomic_fetch_or64() nounwind {
 ; X32: # %bb.0:
 ; X32-NEXT: pushl %ebx
 ; X32-NEXT: pushl %esi
-; X32-NEXT: subl $52, %esp
+; X32-NEXT: subl $32, %esp
 ; X32-NEXT: movl sc64+4, %edx
 ; X32-NEXT: movl sc64, %eax
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill

@@ -312,7 +291,6 @@ define void @atomic_fetch_or64() nounwind {
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT: movl %eax, %ebx
 ; X32-NEXT: orl $3, %ebx
-; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl %ecx, %edx
 ; X32-NEXT: lock cmpxchg8b sc64
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill

@@ -331,10 +309,8 @@ define void @atomic_fetch_or64() nounwind {
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT: movl %eax, %ebx
 ; X32-NEXT: orl $1, %ebx
-; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl %edx, %ecx
 ; X32-NEXT: orl $1, %ecx
-; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: lock cmpxchg8b sc64
 ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill

@@ -345,28 +321,26 @@ define void @atomic_fetch_or64() nounwind {
 ; X32-NEXT: .LBB3_4: # %atomicrmw.end1
 ; X32-NEXT: movl sc64+4, %edx
 ; X32-NEXT: movl sc64, %eax
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
 ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: jmp .LBB3_5
 ; X32-NEXT: .LBB3_5: # %atomicrmw.start8
 ; X32-NEXT: # =>This Inner Loop Header: Depth=1
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT: movl (%esp), %eax # 4-byte Reload
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT: movl %eax, %ebx
 ; X32-NEXT: orl %ecx, %ebx
-; X32-NEXT: movl %ebx, (%esp) # 4-byte Spill
 ; X32-NEXT: movl %edx, %ecx
 ; X32-NEXT: orl %esi, %ecx
-; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: lock cmpxchg8b sc64
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
 ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: jne .LBB3_5
 ; X32-NEXT: jmp .LBB3_6
 ; X32-NEXT: .LBB3_6: # %atomicrmw.end7
-; X32-NEXT: addl $52, %esp
+; X32-NEXT: addl $32, %esp
 ; X32-NEXT: popl %esi
 ; X32-NEXT: popl %ebx
 ; X32-NEXT: retl

@@ -381,7 +355,7 @@ define void @atomic_fetch_xor64() nounwind {
 ; X32: # %bb.0:
 ; X32-NEXT: pushl %ebx
 ; X32-NEXT: pushl %esi
-; X32-NEXT: subl $52, %esp
+; X32-NEXT: subl $32, %esp
 ; X32-NEXT: movl sc64+4, %edx
 ; X32-NEXT: movl sc64, %eax
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill

@@ -393,7 +367,6 @@ define void @atomic_fetch_xor64() nounwind {
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT: movl %eax, %ebx
 ; X32-NEXT: xorl $3, %ebx
-; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl %ecx, %edx
 ; X32-NEXT: lock cmpxchg8b sc64
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill

@@ -412,10 +385,8 @@ define void @atomic_fetch_xor64() nounwind {
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT: movl %eax, %ebx
 ; X32-NEXT: xorl $1, %ebx
-; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl %edx, %ecx
 ; X32-NEXT: xorl $1, %ecx
-; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: lock cmpxchg8b sc64
 ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill

@@ -426,28 +397,26 @@ define void @atomic_fetch_xor64() nounwind {
 ; X32-NEXT: .LBB4_4: # %atomicrmw.end1
 ; X32-NEXT: movl sc64+4, %edx
 ; X32-NEXT: movl sc64, %eax
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
 ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: jmp .LBB4_5
 ; X32-NEXT: .LBB4_5: # %atomicrmw.start8
 ; X32-NEXT: # =>This Inner Loop Header: Depth=1
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT: movl (%esp), %eax # 4-byte Reload
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT: movl %eax, %ebx
 ; X32-NEXT: xorl %ecx, %ebx
-; X32-NEXT: movl %ebx, (%esp) # 4-byte Spill
 ; X32-NEXT: movl %edx, %ecx
 ; X32-NEXT: xorl %esi, %ecx
-; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: lock cmpxchg8b sc64
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
 ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: jne .LBB4_5
 ; X32-NEXT: jmp .LBB4_6
 ; X32-NEXT: .LBB4_6: # %atomicrmw.end7
-; X32-NEXT: addl $52, %esp
+; X32-NEXT: addl $32, %esp
 ; X32-NEXT: popl %esi
 ; X32-NEXT: popl %ebx
 ; X32-NEXT: retl

@@ -463,9 +432,9 @@ define void @atomic_fetch_nand64(i64 %x) nounwind {
 ; X32-NEXT: pushl %ebx
 ; X32-NEXT: pushl %edi
 ; X32-NEXT: pushl %esi
-; X32-NEXT: subl $32, %esp
+; X32-NEXT: subl $16, %esp
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl sc64+4, %edx

@@ -478,24 +447,20 @@ define void @atomic_fetch_nand64(i64 %x) nounwind {
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: movl (%esp), %edi # 4-byte Reload
 ; X32-NEXT: movl %edx, %ecx
 ; X32-NEXT: andl %edi, %ecx
-; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill
 ; X32-NEXT: movl %eax, %ebx
 ; X32-NEXT: andl %esi, %ebx
-; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: notl %ebx
-; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: notl %ecx
-; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: lock cmpxchg8b sc64
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: jne .LBB5_1
 ; X32-NEXT: jmp .LBB5_2
 ; X32-NEXT: .LBB5_2: # %atomicrmw.end
-; X32-NEXT: addl $32, %esp
+; X32-NEXT: addl $16, %esp
 ; X32-NEXT: popl %esi
 ; X32-NEXT: popl %edi
 ; X32-NEXT: popl %ebx

@@ -509,9 +474,9 @@ define void @atomic_fetch_max64(i64 %x) nounwind {
 ; X32: # %bb.0:
 ; X32-NEXT: pushl %ebx
 ; X32-NEXT: pushl %esi
-; X32-NEXT: subl $32, %esp
+; X32-NEXT: subl $16, %esp
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl sc64+4, %edx

@@ -524,24 +489,20 @@ define void @atomic_fetch_max64(i64 %x) nounwind {
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: movl (%esp), %ecx # 4-byte Reload
 ; X32-NEXT: movl %ebx, %esi
 ; X32-NEXT: subl %eax, %esi
-; X32-NEXT: movl %esi, (%esp) # 4-byte Spill
 ; X32-NEXT: movl %ecx, %esi
 ; X32-NEXT: sbbl %edx, %esi
-; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: cmovll %edx, %ecx
-; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: cmovll %eax, %ebx
-; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: lock cmpxchg8b sc64
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: jne .LBB6_1
 ; X32-NEXT: jmp .LBB6_2
 ; X32-NEXT: .LBB6_2: # %atomicrmw.end
-; X32-NEXT: addl $32, %esp
+; X32-NEXT: addl $16, %esp
 ; X32-NEXT: popl %esi
 ; X32-NEXT: popl %ebx
 ; X32-NEXT: retl

@@ -554,9 +515,9 @@ define void @atomic_fetch_min64(i64 %x) nounwind {
 ; X32: # %bb.0:
 ; X32-NEXT: pushl %ebx
 ; X32-NEXT: pushl %esi
-; X32-NEXT: subl $32, %esp
+; X32-NEXT: subl $16, %esp
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl sc64+4, %edx

@@ -569,24 +530,20 @@ define void @atomic_fetch_min64(i64 %x) nounwind {
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: movl (%esp), %ecx # 4-byte Reload
 ; X32-NEXT: movl %ebx, %esi
 ; X32-NEXT: subl %eax, %esi
-; X32-NEXT: movl %esi, (%esp) # 4-byte Spill
 ; X32-NEXT: movl %ecx, %esi
 ; X32-NEXT: sbbl %edx, %esi
-; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: cmovgel %edx, %ecx
-; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: cmovgel %eax, %ebx
-; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: lock cmpxchg8b sc64
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: jne .LBB7_1
 ; X32-NEXT: jmp .LBB7_2
 ; X32-NEXT: .LBB7_2: # %atomicrmw.end
-; X32-NEXT: addl $32, %esp
+; X32-NEXT: addl $16, %esp
 ; X32-NEXT: popl %esi
 ; X32-NEXT: popl %ebx
 ; X32-NEXT: retl

@@ -599,9 +556,9 @@ define void @atomic_fetch_umax64(i64 %x) nounwind {
 ; X32: # %bb.0:
 ; X32-NEXT: pushl %ebx
 ; X32-NEXT: pushl %esi
-; X32-NEXT: subl $32, %esp
+; X32-NEXT: subl $16, %esp
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl sc64+4, %edx

@@ -614,24 +571,20 @@ define void @atomic_fetch_umax64(i64 %x) nounwind {
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: movl (%esp), %ecx # 4-byte Reload
 ; X32-NEXT: movl %ebx, %esi
 ; X32-NEXT: subl %eax, %esi
-; X32-NEXT: movl %esi, (%esp) # 4-byte Spill
 ; X32-NEXT: movl %ecx, %esi
 ; X32-NEXT: sbbl %edx, %esi
-; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: cmovbl %edx, %ecx
-; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: cmovbl %eax, %ebx
-; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: lock cmpxchg8b sc64
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: jne .LBB8_1
 ; X32-NEXT: jmp .LBB8_2
 ; X32-NEXT: .LBB8_2: # %atomicrmw.end
-; X32-NEXT: addl $32, %esp
+; X32-NEXT: addl $16, %esp
 ; X32-NEXT: popl %esi
 ; X32-NEXT: popl %ebx
 ; X32-NEXT: retl

@@ -644,9 +597,9 @@ define void @atomic_fetch_umin64(i64 %x) nounwind {
 ; X32: # %bb.0:
 ; X32-NEXT: pushl %ebx
 ; X32-NEXT: pushl %esi
-; X32-NEXT: subl $32, %esp
+; X32-NEXT: subl $16, %esp
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl sc64+4, %edx

@@ -659,24 +612,20 @@ define void @atomic_fetch_umin64(i64 %x) nounwind {
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: movl (%esp), %ecx # 4-byte Reload
 ; X32-NEXT: movl %ebx, %esi
 ; X32-NEXT: subl %eax, %esi
-; X32-NEXT: movl %esi, (%esp) # 4-byte Spill
 ; X32-NEXT: movl %ecx, %esi
 ; X32-NEXT: sbbl %edx, %esi
-; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: cmovael %edx, %ecx
-; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: cmovael %eax, %ebx
-; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: lock cmpxchg8b sc64
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: jne .LBB9_1
 ; X32-NEXT: jmp .LBB9_2
 ; X32-NEXT: .LBB9_2: # %atomicrmw.end
-; X32-NEXT: addl $32, %esp
+; X32-NEXT: addl $16, %esp
 ; X32-NEXT: popl %esi
 ; X32-NEXT: popl %ebx
 ; X32-NEXT: retl

@@ -14,11 +14,11 @@ stack:
 machineFunctionInfo: {}
 body: |
   bb.0.entry:
     ; CHECK: renamable $xmm1 = V_SET0
+    ; CHECK: renamable $xmm1 = V_SET0
     ; CHECK-NEXT: renamable $xmm0 = V_SET0
     ; CHECK-NEXT: renamable $xmm1 = PXORrr renamable $xmm1, renamable $xmm0
-    ; CHECK-NEXT: MOVAPSmr %stack.1, 1, $noreg, 0, $noreg, $xmm1 :: (store (s128) into %stack.1)
-    ; CHECK-NEXT: MOVAPSmr %stack.0, 1, $noreg, 0, $noreg, renamable $xmm1
+    ; CHECK-NEXT: MOVAPSmr %stack.0, 1, $noreg, 0, $noreg, killed renamable $xmm1
     ; CHECK-NEXT: MOVAPSmr %stack.0, 1, $noreg, 16, $noreg, killed renamable $xmm0

     %0:vr128 = V_SET0

@@ -983,7 +983,6 @@ define void @swifterror_isel(%swift.refcounted*) {
 ; CHECK-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 ## 8-byte Reload
 ; CHECK-O0-NEXT: ## implicit-def: $edi
 ; CHECK-O0-NEXT: movw %ax, %di
-; CHECK-O0-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
 ; CHECK-O0-NEXT: ## implicit-def: $rax
 ; CHECK-O0-NEXT: callq *%rax
 ; CHECK-O0-NEXT: ## implicit-def: $rax

@@ -51,7 +51,7 @@
 ; CHECK-NEXT: DW_LLE_end_of_list ()
 ; CHECK: [[E]]:
 ; CHECK-NEXT: DW_LLE_startx_length (0x00000004, 0x0000000b): DW_OP_reg0 RAX
-; CHECK-NEXT: DW_LLE_startx_length (0x00000005, 0x0000005a): DW_OP_breg7 RSP-48
+; CHECK-NEXT: DW_LLE_startx_length (0x00000005, 0x0000005a): DW_OP_breg7 RSP-36
 ; CHECK-NEXT: DW_LLE_end_of_list ()
 ; CHECK: [[B]]:
 ; CHECK-NEXT: DW_LLE_startx_length (0x00000006, 0x0000000b): DW_OP_reg0 RAX