ARM & AArch64: convert asm tests to LLVM IR and restrict optimizations.

This is mostly a one-time autoconversion of tests that checked assembly after
"-Owhatever" compiles so that they instead run "opt -mem2reg" and check the
LLVM IR. This should make them much more stable against changes in LLVM, so
they won't break on unrelated changes.
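
Concretely, the typical RUN-line change looks like this (taken from the
first file below):

  // Before: check assembly produced by the full -O3 pipeline.
  // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -S -O3 -o - %s | FileCheck %s

  // After: check the IR Clang emits, cleaned up only by mem2reg.
  // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s

FileCheck now matches what Clang itself generates rather than whatever the
whole optimization and codegen pipeline happens to produce.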

"opt -mem2reg" is a compromise designed to increase the readability of tests
that check dataflow, while minimizing dependency on LLVM. Hopefully mem2reg is
stable enough that no surpises will come along.
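
As a rough sketch of what mem2reg buys here (illustrative, not literal -O0
output): Clang's unoptimized IR funnels every parameter through a stack
slot, so a test like test_vand_s8 would otherwise begin with

  %a.addr = alloca <8 x i8>, align 8
  store <8 x i8> %a, <8 x i8>* %a.addr, align 8
  %0 = load <8 x i8>, <8 x i8>* %a.addr, align 8

mem2reg promotes those allocas to SSA values, leaving only the dataflow the
tests actually care about:

  %and.i = and <8 x i8> %a, %b
  ret <8 x i8> %and.i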

Should address http://llvm.org/PR26815.

git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@263048 91177308-0d34-0410-b5e6-96231b3b80d8
Author: Tim Northover, 2016-03-09 18:54:42 +00:00
commit 0ca63760b6 (parent e2e2605a56)
39 files changed, 48441 insertions(+), 11753 deletions(-)

(Diff for one file suppressed because it is too large.)


@@ -1,486 +1,597 @@
// REQUIRES: aarch64-registered-target
// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -S -O3 -o - %s | FileCheck %s
// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
// Test new aarch64 intrinsics and types
#include <arm_neon.h>
// CHECK-LABEL: define <8 x i8> @test_vand_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK: [[AND_I:%.*]] = and <8 x i8> %a, %b
// CHECK: ret <8 x i8> [[AND_I]]
int8x8_t test_vand_s8(int8x8_t a, int8x8_t b) {
// CHECK-LABEL: test_vand_s8
return vand_s8(a, b);
// CHECK: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
// CHECK-LABEL: define <16 x i8> @test_vandq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK: [[AND_I:%.*]] = and <16 x i8> %a, %b
// CHECK: ret <16 x i8> [[AND_I]]
int8x16_t test_vandq_s8(int8x16_t a, int8x16_t b) {
// CHECK-LABEL: test_vandq_s8
return vandq_s8(a, b);
// CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
}
// CHECK-LABEL: define <4 x i16> @test_vand_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK: [[AND_I:%.*]] = and <4 x i16> %a, %b
// CHECK: ret <4 x i16> [[AND_I]]
int16x4_t test_vand_s16(int16x4_t a, int16x4_t b) {
// CHECK-LABEL: test_vand_s16
return vand_s16(a, b);
// CHECK: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
// CHECK-LABEL: define <8 x i16> @test_vandq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK: [[AND_I:%.*]] = and <8 x i16> %a, %b
// CHECK: ret <8 x i16> [[AND_I]]
int16x8_t test_vandq_s16(int16x8_t a, int16x8_t b) {
// CHECK-LABEL: test_vandq_s16
return vandq_s16(a, b);
// CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
}
// CHECK-LABEL: define <2 x i32> @test_vand_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK: [[AND_I:%.*]] = and <2 x i32> %a, %b
// CHECK: ret <2 x i32> [[AND_I]]
int32x2_t test_vand_s32(int32x2_t a, int32x2_t b) {
// CHECK-LABEL: test_vand_s32
return vand_s32(a, b);
// CHECK: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
// CHECK-LABEL: define <4 x i32> @test_vandq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK: [[AND_I:%.*]] = and <4 x i32> %a, %b
// CHECK: ret <4 x i32> [[AND_I]]
int32x4_t test_vandq_s32(int32x4_t a, int32x4_t b) {
// CHECK-LABEL: test_vandq_s32
return vandq_s32(a, b);
// CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
}
// CHECK-LABEL: define <1 x i64> @test_vand_s64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK: [[AND_I:%.*]] = and <1 x i64> %a, %b
// CHECK: ret <1 x i64> [[AND_I]]
int64x1_t test_vand_s64(int64x1_t a, int64x1_t b) {
// CHECK-LABEL: test_vand_s64
return vand_s64(a, b);
// CHECK: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
// CHECK-LABEL: define <2 x i64> @test_vandq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK: [[AND_I:%.*]] = and <2 x i64> %a, %b
// CHECK: ret <2 x i64> [[AND_I]]
int64x2_t test_vandq_s64(int64x2_t a, int64x2_t b) {
// CHECK-LABEL: test_vandq_s64
return vandq_s64(a, b);
// CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
}
// CHECK-LABEL: define <8 x i8> @test_vand_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK: [[AND_I:%.*]] = and <8 x i8> %a, %b
// CHECK: ret <8 x i8> [[AND_I]]
uint8x8_t test_vand_u8(uint8x8_t a, uint8x8_t b) {
// CHECK-LABEL: test_vand_u8
return vand_u8(a, b);
// CHECK: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
// CHECK-LABEL: define <16 x i8> @test_vandq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK: [[AND_I:%.*]] = and <16 x i8> %a, %b
// CHECK: ret <16 x i8> [[AND_I]]
uint8x16_t test_vandq_u8(uint8x16_t a, uint8x16_t b) {
// CHECK-LABEL: test_vandq_u8
return vandq_u8(a, b);
// CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
}
// CHECK-LABEL: define <4 x i16> @test_vand_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK: [[AND_I:%.*]] = and <4 x i16> %a, %b
// CHECK: ret <4 x i16> [[AND_I]]
uint16x4_t test_vand_u16(uint16x4_t a, uint16x4_t b) {
// CHECK-LABEL: test_vand_u16
return vand_u16(a, b);
// CHECK: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
// CHECK-LABEL: define <8 x i16> @test_vandq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK: [[AND_I:%.*]] = and <8 x i16> %a, %b
// CHECK: ret <8 x i16> [[AND_I]]
uint16x8_t test_vandq_u16(uint16x8_t a, uint16x8_t b) {
// CHECK-LABEL: test_vandq_u16
return vandq_u16(a, b);
// CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
}
// CHECK-LABEL: define <2 x i32> @test_vand_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK: [[AND_I:%.*]] = and <2 x i32> %a, %b
// CHECK: ret <2 x i32> [[AND_I]]
uint32x2_t test_vand_u32(uint32x2_t a, uint32x2_t b) {
// CHECK-LABEL: test_vand_u32
return vand_u32(a, b);
// CHECK: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
// CHECK-LABEL: define <4 x i32> @test_vandq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK: [[AND_I:%.*]] = and <4 x i32> %a, %b
// CHECK: ret <4 x i32> [[AND_I]]
uint32x4_t test_vandq_u32(uint32x4_t a, uint32x4_t b) {
// CHECK-LABEL: test_vandq_u32
return vandq_u32(a, b);
// CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
}
// CHECK-LABEL: define <1 x i64> @test_vand_u64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK: [[AND_I:%.*]] = and <1 x i64> %a, %b
// CHECK: ret <1 x i64> [[AND_I]]
uint64x1_t test_vand_u64(uint64x1_t a, uint64x1_t b) {
// CHECK-LABEL: test_vand_u64
return vand_u64(a, b);
// CHECK: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
// CHECK-LABEL: define <2 x i64> @test_vandq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK: [[AND_I:%.*]] = and <2 x i64> %a, %b
// CHECK: ret <2 x i64> [[AND_I]]
uint64x2_t test_vandq_u64(uint64x2_t a, uint64x2_t b) {
// CHECK-LABEL: test_vandq_u64
return vandq_u64(a, b);
// CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
}
// CHECK-LABEL: define <8 x i8> @test_vorr_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK: [[OR_I:%.*]] = or <8 x i8> %a, %b
// CHECK: ret <8 x i8> [[OR_I]]
int8x8_t test_vorr_s8(int8x8_t a, int8x8_t b) {
// CHECK-LABEL: test_vorr_s8
return vorr_s8(a, b);
// CHECK: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
// CHECK-LABEL: define <16 x i8> @test_vorrq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK: [[OR_I:%.*]] = or <16 x i8> %a, %b
// CHECK: ret <16 x i8> [[OR_I]]
int8x16_t test_vorrq_s8(int8x16_t a, int8x16_t b) {
// CHECK-LABEL: test_vorrq_s8
return vorrq_s8(a, b);
// CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
}
// CHECK-LABEL: define <4 x i16> @test_vorr_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK: [[OR_I:%.*]] = or <4 x i16> %a, %b
// CHECK: ret <4 x i16> [[OR_I]]
int16x4_t test_vorr_s16(int16x4_t a, int16x4_t b) {
// CHECK-LABEL: test_vorr_s16
return vorr_s16(a, b);
// CHECK: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
// CHECK-LABEL: define <8 x i16> @test_vorrq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK: [[OR_I:%.*]] = or <8 x i16> %a, %b
// CHECK: ret <8 x i16> [[OR_I]]
int16x8_t test_vorrq_s16(int16x8_t a, int16x8_t b) {
// CHECK-LABEL: test_vorrq_s16
return vorrq_s16(a, b);
// CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
}
// CHECK-LABEL: define <2 x i32> @test_vorr_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK: [[OR_I:%.*]] = or <2 x i32> %a, %b
// CHECK: ret <2 x i32> [[OR_I]]
int32x2_t test_vorr_s32(int32x2_t a, int32x2_t b) {
// CHECK-LABEL: test_vorr_s32
return vorr_s32(a, b);
// CHECK: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
// CHECK-LABEL: define <4 x i32> @test_vorrq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK: [[OR_I:%.*]] = or <4 x i32> %a, %b
// CHECK: ret <4 x i32> [[OR_I]]
int32x4_t test_vorrq_s32(int32x4_t a, int32x4_t b) {
// CHECK-LABEL: test_vorrq_s32
return vorrq_s32(a, b);
// CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
}
// CHECK-LABEL: define <1 x i64> @test_vorr_s64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK: [[OR_I:%.*]] = or <1 x i64> %a, %b
// CHECK: ret <1 x i64> [[OR_I]]
int64x1_t test_vorr_s64(int64x1_t a, int64x1_t b) {
// CHECK-LABEL: test_vorr_s64
return vorr_s64(a, b);
// CHECK: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
// CHECK-LABEL: define <2 x i64> @test_vorrq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK: [[OR_I:%.*]] = or <2 x i64> %a, %b
// CHECK: ret <2 x i64> [[OR_I]]
int64x2_t test_vorrq_s64(int64x2_t a, int64x2_t b) {
// CHECK-LABEL: test_vorrq_s64
return vorrq_s64(a, b);
// CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
}
// CHECK-LABEL: define <8 x i8> @test_vorr_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK: [[OR_I:%.*]] = or <8 x i8> %a, %b
// CHECK: ret <8 x i8> [[OR_I]]
uint8x8_t test_vorr_u8(uint8x8_t a, uint8x8_t b) {
// CHECK-LABEL: test_vorr_u8
return vorr_u8(a, b);
// CHECK: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
// CHECK-LABEL: define <16 x i8> @test_vorrq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK: [[OR_I:%.*]] = or <16 x i8> %a, %b
// CHECK: ret <16 x i8> [[OR_I]]
uint8x16_t test_vorrq_u8(uint8x16_t a, uint8x16_t b) {
// CHECK-LABEL: test_vorrq_u8
return vorrq_u8(a, b);
// CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
}
// CHECK-LABEL: define <4 x i16> @test_vorr_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK: [[OR_I:%.*]] = or <4 x i16> %a, %b
// CHECK: ret <4 x i16> [[OR_I]]
uint16x4_t test_vorr_u16(uint16x4_t a, uint16x4_t b) {
// CHECK-LABEL: test_vorr_u16
return vorr_u16(a, b);
// CHECK: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
// CHECK-LABEL: define <8 x i16> @test_vorrq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK: [[OR_I:%.*]] = or <8 x i16> %a, %b
// CHECK: ret <8 x i16> [[OR_I]]
uint16x8_t test_vorrq_u16(uint16x8_t a, uint16x8_t b) {
// CHECK-LABEL: test_vorrq_u16
return vorrq_u16(a, b);
// CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
}
// CHECK-LABEL: define <2 x i32> @test_vorr_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK: [[OR_I:%.*]] = or <2 x i32> %a, %b
// CHECK: ret <2 x i32> [[OR_I]]
uint32x2_t test_vorr_u32(uint32x2_t a, uint32x2_t b) {
// CHECK-LABEL: test_vorr_u32
return vorr_u32(a, b);
// CHECK: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
// CHECK-LABEL: define <4 x i32> @test_vorrq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK: [[OR_I:%.*]] = or <4 x i32> %a, %b
// CHECK: ret <4 x i32> [[OR_I]]
uint32x4_t test_vorrq_u32(uint32x4_t a, uint32x4_t b) {
// CHECK-LABEL: test_vorrq_u32
return vorrq_u32(a, b);
// CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
}
// CHECK-LABEL: define <1 x i64> @test_vorr_u64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK: [[OR_I:%.*]] = or <1 x i64> %a, %b
// CHECK: ret <1 x i64> [[OR_I]]
uint64x1_t test_vorr_u64(uint64x1_t a, uint64x1_t b) {
// CHECK-LABEL: test_vorr_u64
return vorr_u64(a, b);
// CHECK: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
// CHECK-LABEL: define <2 x i64> @test_vorrq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK: [[OR_I:%.*]] = or <2 x i64> %a, %b
// CHECK: ret <2 x i64> [[OR_I]]
uint64x2_t test_vorrq_u64(uint64x2_t a, uint64x2_t b) {
// CHECK-LABEL: test_vorrq_u64
return vorrq_u64(a, b);
// CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
}
// CHECK-LABEL: define <8 x i8> @test_veor_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK: [[XOR_I:%.*]] = xor <8 x i8> %a, %b
// CHECK: ret <8 x i8> [[XOR_I]]
int8x8_t test_veor_s8(int8x8_t a, int8x8_t b) {
// CHECK-LABEL: test_veor_s8
return veor_s8(a, b);
// CHECK: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
// CHECK-LABEL: define <16 x i8> @test_veorq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK: [[XOR_I:%.*]] = xor <16 x i8> %a, %b
// CHECK: ret <16 x i8> [[XOR_I]]
int8x16_t test_veorq_s8(int8x16_t a, int8x16_t b) {
// CHECK-LABEL: test_veorq_s8
return veorq_s8(a, b);
// CHECK: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
}
// CHECK-LABEL: define <4 x i16> @test_veor_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK: [[XOR_I:%.*]] = xor <4 x i16> %a, %b
// CHECK: ret <4 x i16> [[XOR_I]]
int16x4_t test_veor_s16(int16x4_t a, int16x4_t b) {
// CHECK-LABEL: test_veor_s16
return veor_s16(a, b);
// CHECK: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
// CHECK-LABEL: define <8 x i16> @test_veorq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK: [[XOR_I:%.*]] = xor <8 x i16> %a, %b
// CHECK: ret <8 x i16> [[XOR_I]]
int16x8_t test_veorq_s16(int16x8_t a, int16x8_t b) {
// CHECK-LABEL: test_veorq_s16
return veorq_s16(a, b);
// CHECK: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
}
// CHECK-LABEL: define <2 x i32> @test_veor_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK: [[XOR_I:%.*]] = xor <2 x i32> %a, %b
// CHECK: ret <2 x i32> [[XOR_I]]
int32x2_t test_veor_s32(int32x2_t a, int32x2_t b) {
// CHECK-LABEL: test_veor_s32
return veor_s32(a, b);
// CHECK: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
// CHECK-LABEL: define <4 x i32> @test_veorq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK: [[XOR_I:%.*]] = xor <4 x i32> %a, %b
// CHECK: ret <4 x i32> [[XOR_I]]
int32x4_t test_veorq_s32(int32x4_t a, int32x4_t b) {
// CHECK-LABEL: test_veorq_s32
return veorq_s32(a, b);
// CHECK: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
}
// CHECK-LABEL: define <1 x i64> @test_veor_s64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK: [[XOR_I:%.*]] = xor <1 x i64> %a, %b
// CHECK: ret <1 x i64> [[XOR_I]]
int64x1_t test_veor_s64(int64x1_t a, int64x1_t b) {
// CHECK-LABEL: test_veor_s64
return veor_s64(a, b);
// CHECK: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
// CHECK-LABEL: define <2 x i64> @test_veorq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK: [[XOR_I:%.*]] = xor <2 x i64> %a, %b
// CHECK: ret <2 x i64> [[XOR_I]]
int64x2_t test_veorq_s64(int64x2_t a, int64x2_t b) {
// CHECK-LABEL: test_veorq_s64
return veorq_s64(a, b);
// CHECK: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
}
// CHECK-LABEL: define <8 x i8> @test_veor_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK: [[XOR_I:%.*]] = xor <8 x i8> %a, %b
// CHECK: ret <8 x i8> [[XOR_I]]
uint8x8_t test_veor_u8(uint8x8_t a, uint8x8_t b) {
// CHECK-LABEL: test_veor_u8
return veor_u8(a, b);
// CHECK: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
// CHECK-LABEL: define <16 x i8> @test_veorq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK: [[XOR_I:%.*]] = xor <16 x i8> %a, %b
// CHECK: ret <16 x i8> [[XOR_I]]
uint8x16_t test_veorq_u8(uint8x16_t a, uint8x16_t b) {
// CHECK-LABEL: test_veorq_u8
return veorq_u8(a, b);
// CHECK: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
}
// CHECK-LABEL: define <4 x i16> @test_veor_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK: [[XOR_I:%.*]] = xor <4 x i16> %a, %b
// CHECK: ret <4 x i16> [[XOR_I]]
uint16x4_t test_veor_u16(uint16x4_t a, uint16x4_t b) {
// CHECK-LABEL: test_veor_u16
return veor_u16(a, b);
// CHECK: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
// CHECK-LABEL: define <8 x i16> @test_veorq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK: [[XOR_I:%.*]] = xor <8 x i16> %a, %b
// CHECK: ret <8 x i16> [[XOR_I]]
uint16x8_t test_veorq_u16(uint16x8_t a, uint16x8_t b) {
// CHECK-LABEL: test_veorq_u16
return veorq_u16(a, b);
// CHECK: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
}
// CHECK-LABEL: define <2 x i32> @test_veor_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK: [[XOR_I:%.*]] = xor <2 x i32> %a, %b
// CHECK: ret <2 x i32> [[XOR_I]]
uint32x2_t test_veor_u32(uint32x2_t a, uint32x2_t b) {
// CHECK-LABEL: test_veor_u32
return veor_u32(a, b);
// CHECK: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
// CHECK-LABEL: define <4 x i32> @test_veorq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK: [[XOR_I:%.*]] = xor <4 x i32> %a, %b
// CHECK: ret <4 x i32> [[XOR_I]]
uint32x4_t test_veorq_u32(uint32x4_t a, uint32x4_t b) {
// CHECK-LABEL: test_veorq_u32
return veorq_u32(a, b);
// CHECK: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
}
// CHECK-LABEL: define <1 x i64> @test_veor_u64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK: [[XOR_I:%.*]] = xor <1 x i64> %a, %b
// CHECK: ret <1 x i64> [[XOR_I]]
uint64x1_t test_veor_u64(uint64x1_t a, uint64x1_t b) {
// CHECK-LABEL: test_veor_u64
return veor_u64(a, b);
// CHECK: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
// CHECK-LABEL: define <2 x i64> @test_veorq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK: [[XOR_I:%.*]] = xor <2 x i64> %a, %b
// CHECK: ret <2 x i64> [[XOR_I]]
uint64x2_t test_veorq_u64(uint64x2_t a, uint64x2_t b) {
// CHECK-LABEL: test_veorq_u64
return veorq_u64(a, b);
// CHECK: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
}
// CHECK-LABEL: define <8 x i8> @test_vbic_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
// CHECK: [[AND_I:%.*]] = and <8 x i8> %a, [[NEG_I]]
// CHECK: ret <8 x i8> [[AND_I]]
int8x8_t test_vbic_s8(int8x8_t a, int8x8_t b) {
// CHECK-LABEL: test_vbic_s8
return vbic_s8(a, b);
// CHECK: bic {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
// CHECK-LABEL: define <16 x i8> @test_vbicq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
// CHECK: [[AND_I:%.*]] = and <16 x i8> %a, [[NEG_I]]
// CHECK: ret <16 x i8> [[AND_I]]
int8x16_t test_vbicq_s8(int8x16_t a, int8x16_t b) {
// CHECK-LABEL: test_vbicq_s8
return vbicq_s8(a, b);
// CHECK: bic {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
}
// CHECK-LABEL: define <4 x i16> @test_vbic_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK: [[NEG_I:%.*]] = xor <4 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1>
// CHECK: [[AND_I:%.*]] = and <4 x i16> %a, [[NEG_I]]
// CHECK: ret <4 x i16> [[AND_I]]
int16x4_t test_vbic_s16(int16x4_t a, int16x4_t b) {
// CHECK-LABEL: test_vbic_s16
return vbic_s16(a, b);
// CHECK: bic {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
// CHECK-LABEL: define <8 x i16> @test_vbicq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK: [[NEG_I:%.*]] = xor <8 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
// CHECK: [[AND_I:%.*]] = and <8 x i16> %a, [[NEG_I]]
// CHECK: ret <8 x i16> [[AND_I]]
int16x8_t test_vbicq_s16(int16x8_t a, int16x8_t b) {
// CHECK-LABEL: test_vbicq_s16
return vbicq_s16(a, b);
// CHECK: bic {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
}
// CHECK-LABEL: define <2 x i32> @test_vbic_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK: [[NEG_I:%.*]] = xor <2 x i32> %b, <i32 -1, i32 -1>
// CHECK: [[AND_I:%.*]] = and <2 x i32> %a, [[NEG_I]]
// CHECK: ret <2 x i32> [[AND_I]]
int32x2_t test_vbic_s32(int32x2_t a, int32x2_t b) {
// CHECK-LABEL: test_vbic_s32
return vbic_s32(a, b);
// CHECK: bic {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
// CHECK-LABEL: define <4 x i32> @test_vbicq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK: [[NEG_I:%.*]] = xor <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1>
// CHECK: [[AND_I:%.*]] = and <4 x i32> %a, [[NEG_I]]
// CHECK: ret <4 x i32> [[AND_I]]
int32x4_t test_vbicq_s32(int32x4_t a, int32x4_t b) {
// CHECK-LABEL: test_vbicq_s32
return vbicq_s32(a, b);
// CHECK: bic {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
}
// CHECK-LABEL: define <1 x i64> @test_vbic_s64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK: [[NEG_I:%.*]] = xor <1 x i64> %b, <i64 -1>
// CHECK: [[AND_I:%.*]] = and <1 x i64> %a, [[NEG_I]]
// CHECK: ret <1 x i64> [[AND_I]]
int64x1_t test_vbic_s64(int64x1_t a, int64x1_t b) {
// CHECK-LABEL: test_vbic_s64
return vbic_s64(a, b);
// CHECK: bic {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
// CHECK-LABEL: define <2 x i64> @test_vbicq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK: [[NEG_I:%.*]] = xor <2 x i64> %b, <i64 -1, i64 -1>
// CHECK: [[AND_I:%.*]] = and <2 x i64> %a, [[NEG_I]]
// CHECK: ret <2 x i64> [[AND_I]]
int64x2_t test_vbicq_s64(int64x2_t a, int64x2_t b) {
// CHECK-LABEL: test_vbicq_s64
return vbicq_s64(a, b);
// CHECK: bic {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
}
// CHECK-LABEL: define <8 x i8> @test_vbic_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
// CHECK: [[AND_I:%.*]] = and <8 x i8> %a, [[NEG_I]]
// CHECK: ret <8 x i8> [[AND_I]]
uint8x8_t test_vbic_u8(uint8x8_t a, uint8x8_t b) {
// CHECK-LABEL: test_vbic_u8
return vbic_u8(a, b);
// CHECK: bic {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
// CHECK-LABEL: define <16 x i8> @test_vbicq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
// CHECK: [[AND_I:%.*]] = and <16 x i8> %a, [[NEG_I]]
// CHECK: ret <16 x i8> [[AND_I]]
uint8x16_t test_vbicq_u8(uint8x16_t a, uint8x16_t b) {
// CHECK-LABEL: test_vbicq_u8
return vbicq_u8(a, b);
// CHECK: bic {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
}
// CHECK-LABEL: define <4 x i16> @test_vbic_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK: [[NEG_I:%.*]] = xor <4 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1>
// CHECK: [[AND_I:%.*]] = and <4 x i16> %a, [[NEG_I]]
// CHECK: ret <4 x i16> [[AND_I]]
uint16x4_t test_vbic_u16(uint16x4_t a, uint16x4_t b) {
// CHECK-LABEL: test_vbic_u16
return vbic_u16(a, b);
// CHECK: bic {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
// CHECK-LABEL: define <8 x i16> @test_vbicq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK: [[NEG_I:%.*]] = xor <8 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
// CHECK: [[AND_I:%.*]] = and <8 x i16> %a, [[NEG_I]]
// CHECK: ret <8 x i16> [[AND_I]]
uint16x8_t test_vbicq_u16(uint16x8_t a, uint16x8_t b) {
// CHECK-LABEL: test_vbicq_u16
return vbicq_u16(a, b);
// CHECK: bic {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
}
// CHECK-LABEL: define <2 x i32> @test_vbic_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK: [[NEG_I:%.*]] = xor <2 x i32> %b, <i32 -1, i32 -1>
// CHECK: [[AND_I:%.*]] = and <2 x i32> %a, [[NEG_I]]
// CHECK: ret <2 x i32> [[AND_I]]
uint32x2_t test_vbic_u32(uint32x2_t a, uint32x2_t b) {
// CHECK-LABEL: test_vbic_u32
return vbic_u32(a, b);
// CHECK: bic {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
// CHECK-LABEL: define <4 x i32> @test_vbicq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK: [[NEG_I:%.*]] = xor <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1>
// CHECK: [[AND_I:%.*]] = and <4 x i32> %a, [[NEG_I]]
// CHECK: ret <4 x i32> [[AND_I]]
uint32x4_t test_vbicq_u32(uint32x4_t a, uint32x4_t b) {
// CHECK-LABEL: test_vbicq_u32
return vbicq_u32(a, b);
// CHECK: bic {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
}
// CHECK-LABEL: define <1 x i64> @test_vbic_u64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK: [[NEG_I:%.*]] = xor <1 x i64> %b, <i64 -1>
// CHECK: [[AND_I:%.*]] = and <1 x i64> %a, [[NEG_I]]
// CHECK: ret <1 x i64> [[AND_I]]
uint64x1_t test_vbic_u64(uint64x1_t a, uint64x1_t b) {
// CHECK-LABEL: test_vbic_u64
return vbic_u64(a, b);
// CHECK: bic {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
// CHECK-LABEL: define <2 x i64> @test_vbicq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK: [[NEG_I:%.*]] = xor <2 x i64> %b, <i64 -1, i64 -1>
// CHECK: [[AND_I:%.*]] = and <2 x i64> %a, [[NEG_I]]
// CHECK: ret <2 x i64> [[AND_I]]
uint64x2_t test_vbicq_u64(uint64x2_t a, uint64x2_t b) {
// CHECK-LABEL: test_vbicq_u64
return vbicq_u64(a, b);
// CHECK: bic {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
}
// CHECK-LABEL: define <8 x i8> @test_vorn_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
// CHECK: [[OR_I:%.*]] = or <8 x i8> %a, [[NEG_I]]
// CHECK: ret <8 x i8> [[OR_I]]
int8x8_t test_vorn_s8(int8x8_t a, int8x8_t b) {
// CHECK-LABEL: test_vorn_s8
return vorn_s8(a, b);
// CHECK: orn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
// CHECK-LABEL: define <16 x i8> @test_vornq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
// CHECK: [[OR_I:%.*]] = or <16 x i8> %a, [[NEG_I]]
// CHECK: ret <16 x i8> [[OR_I]]
int8x16_t test_vornq_s8(int8x16_t a, int8x16_t b) {
// CHECK-LABEL: test_vornq_s8
return vornq_s8(a, b);
// CHECK: orn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
}
// CHECK-LABEL: define <4 x i16> @test_vorn_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK: [[NEG_I:%.*]] = xor <4 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1>
// CHECK: [[OR_I:%.*]] = or <4 x i16> %a, [[NEG_I]]
// CHECK: ret <4 x i16> [[OR_I]]
int16x4_t test_vorn_s16(int16x4_t a, int16x4_t b) {
// CHECK-LABEL: test_vorn_s16
return vorn_s16(a, b);
// CHECK: orn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
// CHECK-LABEL: define <8 x i16> @test_vornq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK: [[NEG_I:%.*]] = xor <8 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
// CHECK: [[OR_I:%.*]] = or <8 x i16> %a, [[NEG_I]]
// CHECK: ret <8 x i16> [[OR_I]]
int16x8_t test_vornq_s16(int16x8_t a, int16x8_t b) {
// CHECK-LABEL: test_vornq_s16
return vornq_s16(a, b);
// CHECK: orn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
}
// CHECK-LABEL: define <2 x i32> @test_vorn_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK: [[NEG_I:%.*]] = xor <2 x i32> %b, <i32 -1, i32 -1>
// CHECK: [[OR_I:%.*]] = or <2 x i32> %a, [[NEG_I]]
// CHECK: ret <2 x i32> [[OR_I]]
int32x2_t test_vorn_s32(int32x2_t a, int32x2_t b) {
// CHECK-LABEL: test_vorn_s32
return vorn_s32(a, b);
// CHECK: orn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
// CHECK-LABEL: define <4 x i32> @test_vornq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK: [[NEG_I:%.*]] = xor <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1>
// CHECK: [[OR_I:%.*]] = or <4 x i32> %a, [[NEG_I]]
// CHECK: ret <4 x i32> [[OR_I]]
int32x4_t test_vornq_s32(int32x4_t a, int32x4_t b) {
// CHECK-LABEL: test_vornq_s32
return vornq_s32(a, b);
// CHECK: orn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
}
// CHECK-LABEL: define <1 x i64> @test_vorn_s64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK: [[NEG_I:%.*]] = xor <1 x i64> %b, <i64 -1>
// CHECK: [[OR_I:%.*]] = or <1 x i64> %a, [[NEG_I]]
// CHECK: ret <1 x i64> [[OR_I]]
int64x1_t test_vorn_s64(int64x1_t a, int64x1_t b) {
// CHECK-LABEL: test_vorn_s64
return vorn_s64(a, b);
// CHECK: orn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
// CHECK-LABEL: define <2 x i64> @test_vornq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK: [[NEG_I:%.*]] = xor <2 x i64> %b, <i64 -1, i64 -1>
// CHECK: [[OR_I:%.*]] = or <2 x i64> %a, [[NEG_I]]
// CHECK: ret <2 x i64> [[OR_I]]
int64x2_t test_vornq_s64(int64x2_t a, int64x2_t b) {
// CHECK-LABEL: test_vornq_s64
return vornq_s64(a, b);
// CHECK: orn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
}
// CHECK-LABEL: define <8 x i8> @test_vorn_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
// CHECK: [[OR_I:%.*]] = or <8 x i8> %a, [[NEG_I]]
// CHECK: ret <8 x i8> [[OR_I]]
uint8x8_t test_vorn_u8(uint8x8_t a, uint8x8_t b) {
// CHECK-LABEL: test_vorn_u8
return vorn_u8(a, b);
// CHECK: orn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
// CHECK-LABEL: define <16 x i8> @test_vornq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
// CHECK: [[OR_I:%.*]] = or <16 x i8> %a, [[NEG_I]]
// CHECK: ret <16 x i8> [[OR_I]]
uint8x16_t test_vornq_u8(uint8x16_t a, uint8x16_t b) {
// CHECK-LABEL: test_vornq_u8
return vornq_u8(a, b);
// CHECK: orn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
}
// CHECK-LABEL: define <4 x i16> @test_vorn_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK: [[NEG_I:%.*]] = xor <4 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1>
// CHECK: [[OR_I:%.*]] = or <4 x i16> %a, [[NEG_I]]
// CHECK: ret <4 x i16> [[OR_I]]
uint16x4_t test_vorn_u16(uint16x4_t a, uint16x4_t b) {
// CHECK-LABEL: test_vorn_u16
return vorn_u16(a, b);
// CHECK: orn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
// CHECK-LABEL: define <8 x i16> @test_vornq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK: [[NEG_I:%.*]] = xor <8 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
// CHECK: [[OR_I:%.*]] = or <8 x i16> %a, [[NEG_I]]
// CHECK: ret <8 x i16> [[OR_I]]
uint16x8_t test_vornq_u16(uint16x8_t a, uint16x8_t b) {
// CHECK-LABEL: test_vornq_u16
return vornq_u16(a, b);
// CHECK: orn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
}
// CHECK-LABEL: define <2 x i32> @test_vorn_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK: [[NEG_I:%.*]] = xor <2 x i32> %b, <i32 -1, i32 -1>
// CHECK: [[OR_I:%.*]] = or <2 x i32> %a, [[NEG_I]]
// CHECK: ret <2 x i32> [[OR_I]]
uint32x2_t test_vorn_u32(uint32x2_t a, uint32x2_t b) {
// CHECK-LABEL: test_vorn_u32
return vorn_u32(a, b);
// CHECK: orn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
// CHECK-LABEL: define <4 x i32> @test_vornq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK: [[NEG_I:%.*]] = xor <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1>
// CHECK: [[OR_I:%.*]] = or <4 x i32> %a, [[NEG_I]]
// CHECK: ret <4 x i32> [[OR_I]]
uint32x4_t test_vornq_u32(uint32x4_t a, uint32x4_t b) {
// CHECK-LABEL: test_vornq_u32
return vornq_u32(a, b);
// CHECK: orn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
}
// CHECK-LABEL: define <1 x i64> @test_vorn_u64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK: [[NEG_I:%.*]] = xor <1 x i64> %b, <i64 -1>
// CHECK: [[OR_I:%.*]] = or <1 x i64> %a, [[NEG_I]]
// CHECK: ret <1 x i64> [[OR_I]]
uint64x1_t test_vorn_u64(uint64x1_t a, uint64x1_t b) {
// CHECK-LABEL: test_vorn_u64
return vorn_u64(a, b);
// CHECK: orn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
// CHECK-LABEL: define <2 x i64> @test_vornq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK: [[NEG_I:%.*]] = xor <2 x i64> %b, <i64 -1, i64 -1>
// CHECK: [[OR_I:%.*]] = or <2 x i64> %a, [[NEG_I]]
// CHECK: ret <2 x i64> [[OR_I]]
uint64x2_t test_vornq_u64(uint64x2_t a, uint64x2_t b) {
// CHECK-LABEL: test_vornq_u64
return vornq_u64(a, b);
// CHECK: orn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
}


@@ -1,271 +1,398 @@
// REQUIRES: aarch64-registered-target
// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
// RUN: -ffp-contract=fast -S -O3 -o - %s | FileCheck %s
// RUN: -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
// Test new aarch64 intrinsics and types
#include <arm_neon.h>
// CHECK-LABEL: define i16 @test_vaddlv_s8(<8 x i8> %a) #0 {
// CHECK: [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.saddlv.i32.v8i8(<8 x i8> %a) #2
// CHECK: [[TMP0:%.*]] = trunc i32 [[VADDLV_I]] to i16
// CHECK: ret i16 [[TMP0]]
int16_t test_vaddlv_s8(int8x8_t a) {
// CHECK-LABEL: test_vaddlv_s8
return vaddlv_s8(a);
// CHECK: saddlv {{h[0-9]+}}, {{v[0-9]+}}.8b
}
// CHECK-LABEL: define i32 @test_vaddlv_s16(<4 x i16> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.saddlv.i32.v4i16(<4 x i16> [[TMP1]]) #2
// CHECK: ret i32 [[VADDLV_I]]
int32_t test_vaddlv_s16(int16x4_t a) {
// CHECK-LABEL: test_vaddlv_s16
return vaddlv_s16(a);
// CHECK: saddlv {{s[0-9]+}}, {{v[0-9]+}}.4h
}
// CHECK-LABEL: define i16 @test_vaddlv_u8(<8 x i8> %a) #0 {
// CHECK: [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddlv.i32.v8i8(<8 x i8> %a) #2
// CHECK: [[TMP0:%.*]] = trunc i32 [[VADDLV_I]] to i16
// CHECK: ret i16 [[TMP0]]
uint16_t test_vaddlv_u8(uint8x8_t a) {
// CHECK-LABEL: test_vaddlv_u8
return vaddlv_u8(a);
// CHECK: uaddlv {{h[0-9]+}}, {{v[0-9]+}}.8b
}
// CHECK-LABEL: define i32 @test_vaddlv_u16(<4 x i16> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddlv.i32.v4i16(<4 x i16> [[TMP1]]) #2
// CHECK: ret i32 [[VADDLV_I]]
uint32_t test_vaddlv_u16(uint16x4_t a) {
// CHECK-LABEL: test_vaddlv_u16
return vaddlv_u16(a);
// CHECK: uaddlv {{s[0-9]+}}, {{v[0-9]+}}.4h
}
// CHECK-LABEL: define i16 @test_vaddlvq_s8(<16 x i8> %a) #0 {
// CHECK: [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.saddlv.i32.v16i8(<16 x i8> %a) #2
// CHECK: [[TMP0:%.*]] = trunc i32 [[VADDLV_I]] to i16
// CHECK: ret i16 [[TMP0]]
int16_t test_vaddlvq_s8(int8x16_t a) {
// CHECK-LABEL: test_vaddlvq_s8
return vaddlvq_s8(a);
// CHECK: saddlv {{h[0-9]+}}, {{v[0-9]+}}.16b
}
// CHECK-LABEL: define i32 @test_vaddlvq_s16(<8 x i16> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.saddlv.i32.v8i16(<8 x i16> [[TMP1]]) #2
// CHECK: ret i32 [[VADDLV_I]]
int32_t test_vaddlvq_s16(int16x8_t a) {
// CHECK-LABEL: test_vaddlvq_s16
return vaddlvq_s16(a);
// CHECK: saddlv {{s[0-9]+}}, {{v[0-9]+}}.8h
}
// CHECK-LABEL: define i64 @test_vaddlvq_s32(<4 x i32> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VADDLVQ_S32_I:%.*]] = call i64 @llvm.aarch64.neon.saddlv.i64.v4i32(<4 x i32> [[TMP1]]) #2
// CHECK: ret i64 [[VADDLVQ_S32_I]]
int64_t test_vaddlvq_s32(int32x4_t a) {
// CHECK-LABEL: test_vaddlvq_s32
return vaddlvq_s32(a);
// CHECK: saddlv {{d[0-9]+}}, {{v[0-9]+}}.4s
}
// CHECK-LABEL: define i16 @test_vaddlvq_u8(<16 x i8> %a) #0 {
// CHECK: [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddlv.i32.v16i8(<16 x i8> %a) #2
// CHECK: [[TMP0:%.*]] = trunc i32 [[VADDLV_I]] to i16
// CHECK: ret i16 [[TMP0]]
uint16_t test_vaddlvq_u8(uint8x16_t a) {
// CHECK-LABEL: test_vaddlvq_u8
return vaddlvq_u8(a);
// CHECK: uaddlv {{h[0-9]+}}, {{v[0-9]+}}.16b
}
// CHECK-LABEL: define i32 @test_vaddlvq_u16(<8 x i16> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddlv.i32.v8i16(<8 x i16> [[TMP1]]) #2
// CHECK: ret i32 [[VADDLV_I]]
uint32_t test_vaddlvq_u16(uint16x8_t a) {
// CHECK-LABEL: test_vaddlvq_u16
return vaddlvq_u16(a);
// CHECK: uaddlv {{s[0-9]+}}, {{v[0-9]+}}.8h
}
// CHECK-LABEL: define i64 @test_vaddlvq_u32(<4 x i32> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VADDLVQ_U32_I:%.*]] = call i64 @llvm.aarch64.neon.uaddlv.i64.v4i32(<4 x i32> [[TMP1]]) #2
// CHECK: ret i64 [[VADDLVQ_U32_I]]
uint64_t test_vaddlvq_u32(uint32x4_t a) {
// CHECK-LABEL: test_vaddlvq_u32
return vaddlvq_u32(a);
// CHECK: uaddlv {{d[0-9]+}}, {{v[0-9]+}}.4s
}
// CHECK-LABEL: define i8 @test_vmaxv_s8(<8 x i8> %a) #0 {
// CHECK: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v8i8(<8 x i8> %a) #2
// CHECK: [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i8
// CHECK: ret i8 [[TMP0]]
int8_t test_vmaxv_s8(int8x8_t a) {
// CHECK-LABEL: test_vmaxv_s8
return vmaxv_s8(a);
// CHECK: smaxv {{b[0-9]+}}, {{v[0-9]+}}.8b
}
// CHECK-LABEL: define i16 @test_vmaxv_s16(<4 x i16> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v4i16(<4 x i16> [[TMP1]]) #2
// CHECK: [[TMP2:%.*]] = trunc i32 [[VMAXV_I]] to i16
// CHECK: ret i16 [[TMP2]]
int16_t test_vmaxv_s16(int16x4_t a) {
// CHECK-LABEL: test_vmaxv_s16
return vmaxv_s16(a);
// CHECK: smaxv {{h[0-9]+}}, {{v[0-9]+}}.4h
}
// CHECK-LABEL: define i8 @test_vmaxv_u8(<8 x i8> %a) #0 {
// CHECK: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v8i8(<8 x i8> %a) #2
// CHECK: [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i8
// CHECK: ret i8 [[TMP0]]
uint8_t test_vmaxv_u8(uint8x8_t a) {
// CHECK-LABEL: test_vmaxv_u8
return vmaxv_u8(a);
// CHECK: umaxv {{b[0-9]+}}, {{v[0-9]+}}.8b
}
// CHECK-LABEL: define i16 @test_vmaxv_u16(<4 x i16> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v4i16(<4 x i16> [[TMP1]]) #2
// CHECK: [[TMP2:%.*]] = trunc i32 [[VMAXV_I]] to i16
// CHECK: ret i16 [[TMP2]]
uint16_t test_vmaxv_u16(uint16x4_t a) {
// CHECK-LABEL: test_vmaxv_u16
return vmaxv_u16(a);
// CHECK: umaxv {{h[0-9]+}}, {{v[0-9]+}}.4h
}
// CHECK-LABEL: define i8 @test_vmaxvq_s8(<16 x i8> %a) #0 {
// CHECK: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v16i8(<16 x i8> %a) #2
// CHECK: [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i8
// CHECK: ret i8 [[TMP0]]
int8_t test_vmaxvq_s8(int8x16_t a) {
// CHECK-LABEL: test_vmaxvq_s8
return vmaxvq_s8(a);
// CHECK: smaxv {{b[0-9]+}}, {{v[0-9]+}}.16b
}
// CHECK-LABEL: define i16 @test_vmaxvq_s16(<8 x i16> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v8i16(<8 x i16> [[TMP1]]) #2
// CHECK: [[TMP2:%.*]] = trunc i32 [[VMAXV_I]] to i16
// CHECK: ret i16 [[TMP2]]
int16_t test_vmaxvq_s16(int16x8_t a) {
// CHECK-LABEL: test_vmaxvq_s16
return vmaxvq_s16(a);
// CHECK: smaxv {{h[0-9]+}}, {{v[0-9]+}}.8h
}
// CHECK-LABEL: define i32 @test_vmaxvq_s32(<4 x i32> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VMAXVQ_S32_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v4i32(<4 x i32> [[TMP1]]) #2
// CHECK: ret i32 [[VMAXVQ_S32_I]]
int32_t test_vmaxvq_s32(int32x4_t a) {
// CHECK-LABEL: test_vmaxvq_s32
return vmaxvq_s32(a);
// CHECK: smaxv {{s[0-9]+}}, {{v[0-9]+}}.4s
}
// CHECK-LABEL: define i8 @test_vmaxvq_u8(<16 x i8> %a) #0 {
// CHECK: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v16i8(<16 x i8> %a) #2
// CHECK: [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i8
// CHECK: ret i8 [[TMP0]]
uint8_t test_vmaxvq_u8(uint8x16_t a) {
// CHECK-LABEL: test_vmaxvq_u8
return vmaxvq_u8(a);
// CHECK: umaxv {{b[0-9]+}}, {{v[0-9]+}}.16b
}
// CHECK-LABEL: define i16 @test_vmaxvq_u16(<8 x i16> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v8i16(<8 x i16> [[TMP1]]) #2
// CHECK: [[TMP2:%.*]] = trunc i32 [[VMAXV_I]] to i16
// CHECK: ret i16 [[TMP2]]
uint16_t test_vmaxvq_u16(uint16x8_t a) {
// CHECK-LABEL: test_vmaxvq_u16
return vmaxvq_u16(a);
// CHECK: umaxv {{h[0-9]+}}, {{v[0-9]+}}.8h
}
// CHECK-LABEL: define i32 @test_vmaxvq_u32(<4 x i32> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VMAXVQ_U32_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v4i32(<4 x i32> [[TMP1]]) #2
// CHECK: ret i32 [[VMAXVQ_U32_I]]
uint32_t test_vmaxvq_u32(uint32x4_t a) {
// CHECK-LABEL: test_vmaxvq_u32
return vmaxvq_u32(a);
// CHECK: umaxv {{s[0-9]+}}, {{v[0-9]+}}.4s
}
// CHECK-LABEL: define i8 @test_vminv_s8(<8 x i8> %a) #0 {
// CHECK: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v8i8(<8 x i8> %a) #2
// CHECK: [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i8
// CHECK: ret i8 [[TMP0]]
int8_t test_vminv_s8(int8x8_t a) {
// CHECK-LABEL: test_vminv_s8
return vminv_s8(a);
// CHECK: sminv {{b[0-9]+}}, {{v[0-9]+}}.8b
}
// CHECK-LABEL: define i16 @test_vminv_s16(<4 x i16> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v4i16(<4 x i16> [[TMP1]]) #2
// CHECK: [[TMP2:%.*]] = trunc i32 [[VMINV_I]] to i16
// CHECK: ret i16 [[TMP2]]
int16_t test_vminv_s16(int16x4_t a) {
// CHECK-LABEL: test_vminv_s16
return vminv_s16(a);
// CHECK: sminv {{h[0-9]+}}, {{v[0-9]+}}.4h
}
// CHECK-LABEL: define i8 @test_vminv_u8(<8 x i8> %a) #0 {
// CHECK: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v8i8(<8 x i8> %a) #2
// CHECK: [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i8
// CHECK: ret i8 [[TMP0]]
uint8_t test_vminv_u8(uint8x8_t a) {
// CHECK-LABEL: test_vminv_u8
return vminv_u8(a);
// CHECK: uminv {{b[0-9]+}}, {{v[0-9]+}}.8b
}
// CHECK-LABEL: define i16 @test_vminv_u16(<4 x i16> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v4i16(<4 x i16> [[TMP1]]) #2
// CHECK: [[TMP2:%.*]] = trunc i32 [[VMINV_I]] to i16
// CHECK: ret i16 [[TMP2]]
uint16_t test_vminv_u16(uint16x4_t a) {
// CHECK-LABEL: test_vminv_u16
return vminv_u16(a);
// CHECK: uminv {{h[0-9]+}}, {{v[0-9]+}}.4h
}
// CHECK-LABEL: define i8 @test_vminvq_s8(<16 x i8> %a) #0 {
// CHECK: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v16i8(<16 x i8> %a) #2
// CHECK: [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i8
// CHECK: ret i8 [[TMP0]]
int8_t test_vminvq_s8(int8x16_t a) {
// CHECK-LABEL: test_vminvq_s8
return vminvq_s8(a);
// CHECK: sminv {{b[0-9]+}}, {{v[0-9]+}}.16b
}
// CHECK-LABEL: define i16 @test_vminvq_s16(<8 x i16> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v8i16(<8 x i16> [[TMP1]]) #2
// CHECK: [[TMP2:%.*]] = trunc i32 [[VMINV_I]] to i16
// CHECK: ret i16 [[TMP2]]
int16_t test_vminvq_s16(int16x8_t a) {
// CHECK-LABEL: test_vminvq_s16
return vminvq_s16(a);
// CHECK: sminv {{h[0-9]+}}, {{v[0-9]+}}.8h
}
// CHECK-LABEL: define i32 @test_vminvq_s32(<4 x i32> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VMINVQ_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v4i32(<4 x i32> [[TMP1]]) #2
// CHECK: ret i32 [[VMINVQ_S32_I]]
int32_t test_vminvq_s32(int32x4_t a) {
// CHECK-LABEL: test_vminvq_s32
return vminvq_s32(a);
// CHECK: sminv {{s[0-9]+}}, {{v[0-9]+}}.4s
}
// CHECK-LABEL: define i8 @test_vminvq_u8(<16 x i8> %a) #0 {
// CHECK: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v16i8(<16 x i8> %a) #2
// CHECK: [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i8
// CHECK: ret i8 [[TMP0]]
uint8_t test_vminvq_u8(uint8x16_t a) {
// CHECK-LABEL: test_vminvq_u8
return vminvq_u8(a);
// CHECK: uminv {{b[0-9]+}}, {{v[0-9]+}}.16b
}
// CHECK-LABEL: define i16 @test_vminvq_u16(<8 x i16> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v8i16(<8 x i16> [[TMP1]]) #2
// CHECK: [[TMP2:%.*]] = trunc i32 [[VMINV_I]] to i16
// CHECK: ret i16 [[TMP2]]
uint16_t test_vminvq_u16(uint16x8_t a) {
// CHECK-LABEL: test_vminvq_u16
return vminvq_u16(a);
// CHECK: uminv {{h[0-9]+}}, {{v[0-9]+}}.8h
}
// CHECK-LABEL: define i32 @test_vminvq_u32(<4 x i32> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VMINVQ_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v4i32(<4 x i32> [[TMP1]]) #2
// CHECK: ret i32 [[VMINVQ_U32_I]]
uint32_t test_vminvq_u32(uint32x4_t a) {
// CHECK-LABEL: test_vminvq_u32
return vminvq_u32(a);
// CHECK: uminv {{s[0-9]+}}, {{v[0-9]+}}.4s
}
// CHECK-LABEL: define i8 @test_vaddv_s8(<8 x i8> %a) #0 {
// CHECK: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v8i8(<8 x i8> %a) #2
// CHECK: [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i8
// CHECK: ret i8 [[TMP0]]
int8_t test_vaddv_s8(int8x8_t a) {
// CHECK-LABEL: test_vaddv_s8
return vaddv_s8(a);
// CHECK: addv {{b[0-9]+}}, {{v[0-9]+}}.8b
}
// CHECK-LABEL: define i16 @test_vaddv_s16(<4 x i16> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v4i16(<4 x i16> [[TMP1]]) #2
// CHECK: [[TMP2:%.*]] = trunc i32 [[VADDV_I]] to i16
// CHECK: ret i16 [[TMP2]]
int16_t test_vaddv_s16(int16x4_t a) {
// CHECK-LABEL: test_vaddv_s16
return vaddv_s16(a);
// CHECK: addv {{h[0-9]+}}, {{v[0-9]+}}.4h
}
// CHECK-LABEL: define i8 @test_vaddv_u8(<8 x i8> %a) #0 {
// CHECK: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v8i8(<8 x i8> %a) #2
// CHECK: [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i8
// CHECK: ret i8 [[TMP0]]
uint8_t test_vaddv_u8(uint8x8_t a) {
// CHECK-LABEL: test_vaddv_u8
return vaddv_u8(a);
// CHECK: addv {{b[0-9]+}}, {{v[0-9]+}}.8b
}
// CHECK-LABEL: define i16 @test_vaddv_u16(<4 x i16> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v4i16(<4 x i16> [[TMP1]]) #2
// CHECK: [[TMP2:%.*]] = trunc i32 [[VADDV_I]] to i16
// CHECK: ret i16 [[TMP2]]
uint16_t test_vaddv_u16(uint16x4_t a) {
// CHECK-LABEL: test_vaddv_u16
return vaddv_u16(a);
// CHECK: addv {{h[0-9]+}}, {{v[0-9]+}}.4h
}
// CHECK-LABEL: define i8 @test_vaddvq_s8(<16 x i8> %a) #0 {
// CHECK: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v16i8(<16 x i8> %a) #2
// CHECK: [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i8
// CHECK: ret i8 [[TMP0]]
int8_t test_vaddvq_s8(int8x16_t a) {
// CHECK-LABEL: test_vaddvq_s8
return vaddvq_s8(a);
// CHECK: addv {{b[0-9]+}}, {{v[0-9]+}}.16b
}
// CHECK-LABEL: define i16 @test_vaddvq_s16(<8 x i16> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v8i16(<8 x i16> [[TMP1]]) #2
// CHECK: [[TMP2:%.*]] = trunc i32 [[VADDV_I]] to i16
// CHECK: ret i16 [[TMP2]]
int16_t test_vaddvq_s16(int16x8_t a) {
// CHECK-LABEL: test_vaddvq_s16
return vaddvq_s16(a);
// CHECK: addv {{h[0-9]+}}, {{v[0-9]+}}.8h
}
// CHECK-LABEL: define i32 @test_vaddvq_s32(<4 x i32> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VADDVQ_S32_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v4i32(<4 x i32> [[TMP1]]) #2
// CHECK: ret i32 [[VADDVQ_S32_I]]
int32_t test_vaddvq_s32(int32x4_t a) {
// CHECK-LABEL: test_vaddvq_s32
return vaddvq_s32(a);
// CHECK: addv {{s[0-9]+}}, {{v[0-9]+}}.4s
}
// CHECK-LABEL: define i8 @test_vaddvq_u8(<16 x i8> %a) #0 {
// CHECK: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v16i8(<16 x i8> %a) #2
// CHECK: [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i8
// CHECK: ret i8 [[TMP0]]
uint8_t test_vaddvq_u8(uint8x16_t a) {
// CHECK-LABEL: test_vaddvq_u8
return vaddvq_u8(a);
// CHECK: addv {{b[0-9]+}}, {{v[0-9]+}}.16b
}
// CHECK-LABEL: define i16 @test_vaddvq_u16(<8 x i16> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v8i16(<8 x i16> [[TMP1]]) #2
// CHECK: [[TMP2:%.*]] = trunc i32 [[VADDV_I]] to i16
// CHECK: ret i16 [[TMP2]]
uint16_t test_vaddvq_u16(uint16x8_t a) {
// CHECK-LABEL: test_vaddvq_u16
return vaddvq_u16(a);
// CHECK: addv {{h[0-9]+}}, {{v[0-9]+}}.8h
}
// CHECK-LABEL: define i32 @test_vaddvq_u32(<4 x i32> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VADDVQ_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v4i32(<4 x i32> [[TMP1]]) #2
// CHECK: ret i32 [[VADDVQ_U32_I]]
uint32_t test_vaddvq_u32(uint32x4_t a) {
// CHECK-LABEL: test_vaddvq_u32
return vaddvq_u32(a);
// CHECK: addv {{s[0-9]+}}, {{v[0-9]+}}.4s
}
// CHECK-LABEL: define float @test_vmaxvq_f32(<4 x float> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK: [[VMAXVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxv.f32.v4f32(<4 x float> [[TMP1]]) #2
// CHECK: ret float [[VMAXVQ_F32_I]]
float32_t test_vmaxvq_f32(float32x4_t a) {
// CHECK-LABEL: test_vmaxvq_f32
return vmaxvq_f32(a);
// CHECK: fmaxv {{s[0-9]+}}, {{v[0-9]+}}.4s
}
// CHECK-LABEL: define float @test_vminvq_f32(<4 x float> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK: [[VMINVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.fminv.f32.v4f32(<4 x float> [[TMP1]]) #2
// CHECK: ret float [[VMINVQ_F32_I]]
float32_t test_vminvq_f32(float32x4_t a) {
// CHECK-LABEL: test_vminvq_f32
return vminvq_f32(a);
// CHECK: fminv {{s[0-9]+}}, {{v[0-9]+}}.4s
}
// CHECK-LABEL: define float @test_vmaxnmvq_f32(<4 x float> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK: [[VMAXNMVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxnmv.f32.v4f32(<4 x float> [[TMP1]]) #2
// CHECK: ret float [[VMAXNMVQ_F32_I]]
float32_t test_vmaxnmvq_f32(float32x4_t a) {
// CHECK-LABEL: test_vmaxnmvq_f32
return vmaxnmvq_f32(a);
// CHECK: fmaxnmv {{s[0-9]+}}, {{v[0-9]+}}.4s
}
// CHECK-LABEL: define float @test_vminnmvq_f32(<4 x float> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK: [[VMINNMVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.fminnmv.f32.v4f32(<4 x float> [[TMP1]]) #2
// CHECK: ret float [[VMINNMVQ_F32_I]]
float32_t test_vminnmvq_f32(float32x4_t a) {
// CHECK-LABEL: test_vminnmvq_f32
return vminnmvq_f32(a);
// CHECK: fminnmv {{s[0-9]+}}, {{v[0-9]+}}.4s
}


@@ -1,148 +1,247 @@
// REQUIRES: aarch64-registered-target
// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
// RUN: -ffp-contract=fast -S -O3 -o - %s | FileCheck %s
// RUN: -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
// Test new aarch64 intrinsics and types
#include <arm_neon.h>
// CHECK-LABEL: define <8 x i8> @test_vext_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK: [[VEXT:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
// CHECK: ret <8 x i8> [[VEXT]]
int8x8_t test_vext_s8(int8x8_t a, int8x8_t b) {
// CHECK-LABEL: test_vext_s8
return vext_s8(a, b, 2);
// CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{(0x)?2}}
}
// CHECK-LABEL: define <4 x i16> @test_vext_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
// CHECK: ret <4 x i16> [[VEXT]]
int16x4_t test_vext_s16(int16x4_t a, int16x4_t b) {
// CHECK-LABEL: test_vext_s16
return vext_s16(a, b, 3);
// CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{(0x)?6}}
}
// CHECK-LABEL: define <2 x i32> @test_vext_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[VEXT:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> <i32 1, i32 2>
// CHECK: ret <2 x i32> [[VEXT]]
int32x2_t test_vext_s32(int32x2_t a, int32x2_t b) {
// CHECK-LABEL: test_vext_s32
return vext_s32(a, b, 1);
// CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{(0x)?4}}
}
// CHECK-LABEL: define <1 x i64> @test_vext_s64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK: [[VEXT:%.*]] = shufflevector <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
// CHECK: ret <1 x i64> [[VEXT]]
int64x1_t test_vext_s64(int64x1_t a, int64x1_t b) {
// CHECK-LABEL: test_vext_s64
return vext_s64(a, b, 0);
}
// CHECK-LABEL: define <16 x i8> @test_vextq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK: [[VEXT:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17>
// CHECK: ret <16 x i8> [[VEXT]]
int8x16_t test_vextq_s8(int8x16_t a, int8x16_t b) {
// CHECK-LABEL: test_vextq_s8
return vextq_s8(a, b, 2);
// CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?2}}
}
// CHECK-LABEL: define <8 x i16> @test_vextq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK: [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
// CHECK: ret <8 x i16> [[VEXT]]
int16x8_t test_vextq_s16(int16x8_t a, int16x8_t b) {
// CHECK-LABEL: test_vextq_s16
return vextq_s16(a, b, 3);
// CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?6}}
}
// CHECK-LABEL: define <4 x i32> @test_vextq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK: [[VEXT:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> <i32 1, i32 2, i32 3, i32 4>
// CHECK: ret <4 x i32> [[VEXT]]
int32x4_t test_vextq_s32(int32x4_t a, int32x4_t b) {
// CHECK-LABEL: test_vextq_s32
return vextq_s32(a, b, 1);
// CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?4}}
}
// CHECK-LABEL: define <2 x i64> @test_vextq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK: [[VEXT:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i32> <i32 1, i32 2>
// CHECK: ret <2 x i64> [[VEXT]]
int64x2_t test_vextq_s64(int64x2_t a, int64x2_t b) {
// CHECK-LABEL: test_vextq_s64
return vextq_s64(a, b, 1);
// CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?8}}
}
// CHECK-LABEL: define <8 x i8> @test_vext_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK: [[VEXT:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
// CHECK: ret <8 x i8> [[VEXT]]
uint8x8_t test_vext_u8(uint8x8_t a, uint8x8_t b) {
// CHECK-LABEL: test_vext_u8
return vext_u8(a, b, 2);
// CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{(0x)?2}}
}
// CHECK-LABEL: define <4 x i16> @test_vext_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
// CHECK: ret <4 x i16> [[VEXT]]
uint16x4_t test_vext_u16(uint16x4_t a, uint16x4_t b) {
// CHECK-LABEL: test_vext_u16
return vext_u16(a, b, 3);
// CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{(0x)?6}}
}
// CHECK-LABEL: define <2 x i32> @test_vext_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[VEXT:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> <i32 1, i32 2>
// CHECK: ret <2 x i32> [[VEXT]]
uint32x2_t test_vext_u32(uint32x2_t a, uint32x2_t b) {
// CHECK-LABEL: test_vext_u32
return vext_u32(a, b, 1);
// CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{(0x)?4}}
}
// CHECK-LABEL: define <1 x i64> @test_vext_u64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK: [[VEXT:%.*]] = shufflevector <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
// CHECK: ret <1 x i64> [[VEXT]]
uint64x1_t test_vext_u64(uint64x1_t a, uint64x1_t b) {
// CHECK-LABEL: test_vext_u64
return vext_u64(a, b, 0);
}
// CHECK-LABEL: define <16 x i8> @test_vextq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK: [[VEXT:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17>
// CHECK: ret <16 x i8> [[VEXT]]
uint8x16_t test_vextq_u8(uint8x16_t a, uint8x16_t b) {
// CHECK-LABEL: test_vextq_u8
return vextq_u8(a, b, 2);
// CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?2}}
}
// CHECK-LABEL: define <8 x i16> @test_vextq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK: [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
// CHECK: ret <8 x i16> [[VEXT]]
uint16x8_t test_vextq_u16(uint16x8_t a, uint16x8_t b) {
// CHECK-LABEL: test_vextq_u16
return vextq_u16(a, b, 3);
// CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?6}}
}
// CHECK-LABEL: define <4 x i32> @test_vextq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK: [[VEXT:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> <i32 1, i32 2, i32 3, i32 4>
// CHECK: ret <4 x i32> [[VEXT]]
uint32x4_t test_vextq_u32(uint32x4_t a, uint32x4_t b) {
// CHECK-LABEL: test_vextq_u32
return vextq_u32(a, b, 1);
// CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?4}}
}
// CHECK-LABEL: define <2 x i64> @test_vextq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK: [[VEXT:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i32> <i32 1, i32 2>
// CHECK: ret <2 x i64> [[VEXT]]
uint64x2_t test_vextq_u64(uint64x2_t a, uint64x2_t b) {
// CHECK-LABEL: test_vextq_u64
return vextq_u64(a, b, 1);
// CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?8}}
}
// CHECK-LABEL: define <2 x float> @test_vext_f32(<2 x float> %a, <2 x float> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK: [[VEXT:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP3]], <2 x i32> <i32 1, i32 2>
// CHECK: ret <2 x float> [[VEXT]]
float32x2_t test_vext_f32(float32x2_t a, float32x2_t b) {
// CHECK-LABEL: test_vext_f32
return vext_f32(a, b, 1);
// CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{(0x)?4}}
}
// CHECK-LABEL: define <1 x double> @test_vext_f64(<1 x double> %a, <1 x double> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
// CHECK: [[VEXT:%.*]] = shufflevector <1 x double> [[TMP2]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer
// CHECK: ret <1 x double> [[VEXT]]
float64x1_t test_vext_f64(float64x1_t a, float64x1_t b) {
// CHECK-LABEL: test_vext_f64
return vext_f64(a, b, 0);
}
// CHECK-LABEL: define <4 x float> @test_vextq_f32(<4 x float> %a, <4 x float> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK: [[VEXT:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <4 x i32> <i32 1, i32 2, i32 3, i32 4>
// CHECK: ret <4 x float> [[VEXT]]
float32x4_t test_vextq_f32(float32x4_t a, float32x4_t b) {
// CHECK-LABEL: test_vextq_f32
return vextq_f32(a, b, 1);
// CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?4}}
}
// CHECK-LABEL: define <2 x double> @test_vextq_f64(<2 x double> %a, <2 x double> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
// CHECK: [[VEXT:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP3]], <2 x i32> <i32 1, i32 2>
// CHECK: ret <2 x double> [[VEXT]]
float64x2_t test_vextq_f64(float64x2_t a, float64x2_t b) {
// CHECK-LABEL: test_vextq_f64
return vextq_f64(a, b, 1);
// CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?8}}
}
// CHECK-LABEL: define <8 x i8> @test_vext_p8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK: [[VEXT:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
// CHECK: ret <8 x i8> [[VEXT]]
poly8x8_t test_vext_p8(poly8x8_t a, poly8x8_t b) {
// CHECK-LABEL: test_vext_p8
return vext_p8(a, b, 2);
// CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{(0x)?2}}
}
// CHECK-LABEL: define <4 x i16> @test_vext_p16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
// CHECK: ret <4 x i16> [[VEXT]]
poly16x4_t test_vext_p16(poly16x4_t a, poly16x4_t b) {
// CHECK-LABEL: test_vext_p16
return vext_p16(a, b, 3);
// CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{(0x)?6}}
}
// CHECK-LABEL: define <16 x i8> @test_vextq_p8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK: [[VEXT:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17>
// CHECK: ret <16 x i8> [[VEXT]]
poly8x16_t test_vextq_p8(poly8x16_t a, poly8x16_t b) {
// CHECK-LABEL: test_vextq_p8
return vextq_p8(a, b, 2);
// CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?2}}
}
// CHECK-LABEL: define <8 x i16> @test_vextq_p16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK: [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
// CHECK: ret <8 x i16> [[VEXT]]
poly16x8_t test_vextq_p16(poly16x8_t a, poly16x8_t b) {
// CHECK-LABEL: test_vextq_p16
return vextq_p16(a, b, 3);
// CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?6}}
}
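// Editorial note (a sketch added for clarity, not part of the original test):
// in every "ext" CHECK line above, the "#N" byte immediate is the lane index
// scaled by the element size in bytes, e.g. vextq_s16(a, b, 3) -> #6 and
// vextq_s64(a, b, 1) -> #8. A hypothetical helper making the scaling explicit:
static inline int vext_byte_imm(int lane, int elem_size_bytes) {
  // vext_byte_imm(3, 2) == 6, matching the vextq_s16 check above.
  return lane * elem_size_bytes;
}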


@@ -1,133 +1,153 @@
// REQUIRES: aarch64-registered-target
// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
// RUN: -ffp-contract=fast -S -O3 -o - %s | FileCheck %s
// RUN: -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
// Test new aarch64 intrinsics and types
#include <arm_neon.h>
// CHECK-LABEL: define float @test_vcvtxd_f32_f64(double %a) #0 {
// CHECK: [[VCVTXD_F32_F64_I:%.*]] = call float @llvm.aarch64.sisd.fcvtxn(double %a) #2
// CHECK: ret float [[VCVTXD_F32_F64_I]]
float32_t test_vcvtxd_f32_f64(float64_t a) {
// CHECK-LABEL: test_vcvtxd_f32_f64
// CHECK: fcvtxn {{s[0-9]+}}, {{d[0-9]+}}
return (float32_t)vcvtxd_f32_f64(a);
}
// CHECK-LABEL: define i32 @test_vcvtas_s32_f32(float %a) #0 {
// CHECK: [[VCVTAS_S32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtas.i32.f32(float %a) #2
// CHECK: ret i32 [[VCVTAS_S32_F32_I]]
int32_t test_vcvtas_s32_f32(float32_t a) {
// CHECK-LABEL: test_vcvtas_s32_f32
// CHECK: fcvtas {{[ws][0-9]+}}, {{s[0-9]+}}
return (int32_t)vcvtas_s32_f32(a);
}
// CHECK-LABEL: define i64 @test_vcvtad_s64_f64(double %a) #0 {
// CHECK: [[VCVTAD_S64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtas.i64.f64(double %a) #2
// CHECK: ret i64 [[VCVTAD_S64_F64_I]]
int64_t test_vcvtad_s64_f64(float64_t a) {
// CHECK-LABEL: test_vcvtad_s64_f64
// CHECK: fcvtas {{[dx][0-9]+}}, {{d[0-9]+}}
return (int64_t)vcvtad_s64_f64(a);
}
// CHECK-LABEL: define i32 @test_vcvtas_u32_f32(float %a) #0 {
// CHECK: [[VCVTAS_U32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtau.i32.f32(float %a) #2
// CHECK: ret i32 [[VCVTAS_U32_F32_I]]
uint32_t test_vcvtas_u32_f32(float32_t a) {
// CHECK-LABEL: test_vcvtas_u32_f32
// CHECK: fcvtau {{[ws][0-9]+}}, {{s[0-9]+}}
return (uint32_t)vcvtas_u32_f32(a);
}
// CHECK-LABEL: define i64 @test_vcvtad_u64_f64(double %a) #0 {
// CHECK: [[VCVTAD_U64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtau.i64.f64(double %a) #2
// CHECK: ret i64 [[VCVTAD_U64_F64_I]]
uint64_t test_vcvtad_u64_f64(float64_t a) {
// CHECK-LABEL: test_vcvtad_u64_f64
// CHECK: fcvtau {{[xd][0-9]+}}, {{d[0-9]+}}
return (uint64_t)vcvtad_u64_f64(a);
}
// CHECK-LABEL: define i32 @test_vcvtms_s32_f32(float %a) #0 {
// CHECK: [[VCVTMS_S32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtms.i32.f32(float %a) #2
// CHECK: ret i32 [[VCVTMS_S32_F32_I]]
int32_t test_vcvtms_s32_f32(float32_t a) {
// CHECK-LABEL: test_vcvtms_s32_f32
// CHECK: fcvtms {{[sw][0-9]+}}, {{s[0-9]+}}
return (int32_t)vcvtms_s32_f32(a);
}
// CHECK-LABEL: define i64 @test_vcvtmd_s64_f64(double %a) #0 {
// CHECK: [[VCVTMD_S64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtms.i64.f64(double %a) #2
// CHECK: ret i64 [[VCVTMD_S64_F64_I]]
int64_t test_vcvtmd_s64_f64(float64_t a) {
// CHECK-LABEL: test_vcvtmd_s64_f64
// CHECK: fcvtms {{[dx][0-9]+}}, {{d[0-9]+}}
return (int64_t)vcvtmd_s64_f64(a);
}
// CHECK-LABEL: define i32 @test_vcvtms_u32_f32(float %a) #0 {
// CHECK: [[VCVTMS_U32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtmu.i32.f32(float %a) #2
// CHECK: ret i32 [[VCVTMS_U32_F32_I]]
uint32_t test_vcvtms_u32_f32(float32_t a) {
// CHECK-LABEL: test_vcvtms_u32_f32
// CHECK: fcvtmu {{[ws][0-9]+}}, {{s[0-9]+}}
return (uint32_t)vcvtms_u32_f32(a);
}
// CHECK-LABEL: define i64 @test_vcvtmd_u64_f64(double %a) #0 {
// CHECK: [[VCVTMD_U64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtmu.i64.f64(double %a) #2
// CHECK: ret i64 [[VCVTMD_U64_F64_I]]
uint64_t test_vcvtmd_u64_f64(float64_t a) {
// CHECK-LABEL: test_vcvtmd_u64_f64
// CHECK: fcvtmu {{[xd][0-9]+}}, {{d[0-9]+}}
return (uint64_t)vcvtmd_u64_f64(a);
}
// CHECK-LABEL: define i32 @test_vcvtns_s32_f32(float %a) #0 {
// CHECK: [[VCVTNS_S32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtns.i32.f32(float %a) #2
// CHECK: ret i32 [[VCVTNS_S32_F32_I]]
int32_t test_vcvtns_s32_f32(float32_t a) {
// CHECK-LABEL: test_vcvtns_s32_f32
// CHECK: fcvtns {{[sw][0-9]+}}, {{s[0-9]+}}
return (int32_t)vcvtns_s32_f32(a);
}
// CHECK-LABEL: define i64 @test_vcvtnd_s64_f64(double %a) #0 {
// CHECK: [[VCVTND_S64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtns.i64.f64(double %a) #2
// CHECK: ret i64 [[VCVTND_S64_F64_I]]
int64_t test_vcvtnd_s64_f64(float64_t a) {
// CHECK-LABEL: test_vcvtnd_s64_f64
// CHECK: fcvtns {{[dx][0-9]+}}, {{d[0-9]+}}
return (int64_t)vcvtnd_s64_f64(a);
}
// CHECK-LABEL: define i32 @test_vcvtns_u32_f32(float %a) #0 {
// CHECK: [[VCVTNS_U32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtnu.i32.f32(float %a) #2
// CHECK: ret i32 [[VCVTNS_U32_F32_I]]
uint32_t test_vcvtns_u32_f32(float32_t a) {
// CHECK-LABEL: test_vcvtns_u32_f32
// CHECK: fcvtnu {{[sw][0-9]+}}, {{s[0-9]+}}
return (uint32_t)vcvtns_u32_f32(a);
}
// CHECK-LABEL: define i64 @test_vcvtnd_u64_f64(double %a) #0 {
// CHECK: [[VCVTND_U64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtnu.i64.f64(double %a) #2
// CHECK: ret i64 [[VCVTND_U64_F64_I]]
uint64_t test_vcvtnd_u64_f64(float64_t a) {
// CHECK-LABEL: test_vcvtnd_u64_f64
// CHECK: fcvtnu {{[dx][0-9]+}}, {{d[0-9]+}}
return (uint64_t)vcvtnd_u64_f64(a);
}
// CHECK-LABEL: define i32 @test_vcvtps_s32_f32(float %a) #0 {
// CHECK: [[VCVTPS_S32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtps.i32.f32(float %a) #2
// CHECK: ret i32 [[VCVTPS_S32_F32_I]]
int32_t test_vcvtps_s32_f32(float32_t a) {
// CHECK-LABEL: test_vcvtps_s32_f32
// CHECK: fcvtps {{[sw][0-9]+}}, {{s[0-9]+}}
return (int32_t)vcvtps_s32_f32(a);
}
// CHECK-LABEL: define i64 @test_vcvtpd_s64_f64(double %a) #0 {
// CHECK: [[VCVTPD_S64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtps.i64.f64(double %a) #2
// CHECK: ret i64 [[VCVTPD_S64_F64_I]]
int64_t test_vcvtpd_s64_f64(float64_t a) {
// CHECK-LABEL: test_vcvtpd_s64_f64
// CHECK: fcvtps {{[dx][0-9]+}}, {{d[0-9]+}}
return (int64_t)vcvtpd_s64_f64(a);
}
// CHECK-LABEL: define i32 @test_vcvtps_u32_f32(float %a) #0 {
// CHECK: [[VCVTPS_U32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtpu.i32.f32(float %a) #2
// CHECK: ret i32 [[VCVTPS_U32_F32_I]]
uint32_t test_vcvtps_u32_f32(float32_t a) {
// CHECK-LABEL: test_vcvtps_u32_f32
// CHECK: fcvtpu {{[sw][0-9]+}}, {{s[0-9]+}}
return (uint32_t)vcvtps_u32_f32(a);
}
// CHECK-LABEL: define i64 @test_vcvtpd_u64_f64(double %a) #0 {
// CHECK: [[VCVTPD_U64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtpu.i64.f64(double %a) #2
// CHECK: ret i64 [[VCVTPD_U64_F64_I]]
uint64_t test_vcvtpd_u64_f64(float64_t a) {
// CHECK-LABEL: test_vcvtpd_u64_f64
// CHECK: fcvtpu {{[dx][0-9]+}}, {{d[0-9]+}}
return (uint64_t)vcvtpd_u64_f64(a);
}
// CHECK-LABEL: define i32 @test_vcvts_s32_f32(float %a) #0 {
// CHECK: [[TMP0:%.*]] = fptosi float %a to i32
// CHECK: ret i32 [[TMP0]]
int32_t test_vcvts_s32_f32(float32_t a) {
// CHECK-LABEL: test_vcvts_s32_f32
// CHECK: fcvtzs {{[sw][0-9]+}}, {{s[0-9]+}}
return (int32_t)vcvts_s32_f32(a);
}
// CHECK-LABEL: define i64 @test_vcvtd_s64_f64(double %a) #0 {
// CHECK: [[TMP0:%.*]] = fptosi double %a to i64
// CHECK: ret i64 [[TMP0]]
int64_t test_vcvtd_s64_f64(float64_t a) {
// CHECK-LABEL: test_vcvtd_s64_f64
// CHECK: fcvtzs {{[dx][0-9]+}}, {{d[0-9]+}}
return (int64_t)vcvtd_s64_f64(a);
}
// CHECK-LABEL: define i32 @test_vcvts_u32_f32(float %a) #0 {
// CHECK: [[TMP0:%.*]] = fptoui float %a to i32
// CHECK: ret i32 [[TMP0]]
uint32_t test_vcvts_u32_f32(float32_t a) {
// CHECK-LABEL: test_vcvts_u32_f32
// CHECK: fcvtzu {{[sw][0-9]+}}, {{s[0-9]+}}
return (uint32_t)vcvts_u32_f32(a);
}
// CHECK-LABEL: define i64 @test_vcvtd_u64_f64(double %a) #0 {
// CHECK: [[TMP0:%.*]] = fptoui double %a to i64
// CHECK: ret i64 [[TMP0]]
uint64_t test_vcvtd_u64_f64(float64_t a) {
// CHECK-LABEL: test_vcvtd_u64_f64
// CHECK: fcvtzu {{[dx][0-9]+}}, {{d[0-9]+}}
return (uint64_t)vcvtd_u64_f64(a);
}
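// Editorial note (a sketch added for clarity, not part of the original test):
// the fcvt{a,m,n,p} forms differ only in rounding mode -- 'a' is to-nearest
// with ties away from zero, 'm' is toward -inf, 'n' is to-nearest ties-to-even,
// 'p' is toward +inf -- while the plain vcvt{s,d}_* intrinsics truncate toward
// zero, which is why they lower to a bare fptosi/fptoui. A rough C analogue,
// ignoring the saturation the hardware performs on out-of-range inputs:
#include <math.h>
static long long cvt_m_s64(double x) { return (long long)floor(x); } // fcvtms
static long long cvt_p_s64(double x) { return (long long)ceil(x); }  // fcvtps
static long long cvt_z_s64(double x) { return (long long)x; }        // fcvtzs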


@@ -1,199 +1,243 @@
// REQUIRES: aarch64-registered-target
// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -S -O3 -o - %s | FileCheck %s
// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
// Test new aarch64 intrinsics and types
#include <arm_neon.h>
// CHECK-LABEL: define <2 x float> @test_vmla_n_f32(<2 x float> %a, <2 x float> %b, float %c) #0 {
// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %c, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %c, i32 1
// CHECK: [[MUL_I:%.*]] = fmul <2 x float> %b, [[VECINIT1_I]]
// CHECK: [[ADD_I:%.*]] = fadd <2 x float> %a, [[MUL_I]]
// CHECK: ret <2 x float> [[ADD_I]]
float32x2_t test_vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c) {
// CHECK-LABEL: test_vmla_n_f32
return vmla_n_f32(a, b, c);
// CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
// CHECK: fadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
// CHECK-FMA: dup {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
// CHECK-FMA: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
}
// CHECK-LABEL: define <4 x float> @test_vmlaq_n_f32(<4 x float> %a, <4 x float> %b, float %c) #0 {
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %c, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %c, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %c, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %c, i32 3
// CHECK: [[MUL_I:%.*]] = fmul <4 x float> %b, [[VECINIT3_I]]
// CHECK: [[ADD_I:%.*]] = fadd <4 x float> %a, [[MUL_I]]
// CHECK: ret <4 x float> [[ADD_I]]
float32x4_t test_vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c) {
// CHECK-LABEL: test_vmlaq_n_f32
return vmlaq_n_f32(a, b, c);
// CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
// CHECK: fadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
// CHECK-FMA: dup {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
// CHECK-FMA: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
}
// CHECK-LABEL: define <2 x double> @test_vmlaq_n_f64(<2 x double> %a, <2 x double> %b, double %c) #0 {
// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x double> undef, double %c, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double %c, i32 1
// CHECK: [[MUL_I:%.*]] = fmul <2 x double> %b, [[VECINIT1_I]]
// CHECK: [[ADD_I:%.*]] = fadd <2 x double> %a, [[MUL_I]]
// CHECK: ret <2 x double> [[ADD_I]]
float64x2_t test_vmlaq_n_f64(float64x2_t a, float64x2_t b, float64_t c) {
// CHECK-LABEL: test_vmlaq_n_f64
return vmlaq_n_f64(a, b, c);
// CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
// CHECK: fadd {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
// CHECK-FMA: dup {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
// CHECK-FMA: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
}
// CHECK-LABEL: define <4 x float> @test_vmlsq_n_f32(<4 x float> %a, <4 x float> %b, float %c) #0 {
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %c, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %c, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %c, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %c, i32 3
// CHECK: [[MUL_I:%.*]] = fmul <4 x float> %b, [[VECINIT3_I]]
// CHECK: [[SUB_I:%.*]] = fsub <4 x float> %a, [[MUL_I]]
// CHECK: ret <4 x float> [[SUB_I]]
float32x4_t test_vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c) {
// CHECK-LABEL: test_vmlsq_n_f32
return vmlsq_n_f32(a, b, c);
// CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
// CHECK: fsub {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
// CHECK-FMA: dup {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
// CHECK-FMA: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
}
// CHECK-LABEL: define <2 x float> @test_vmls_n_f32(<2 x float> %a, <2 x float> %b, float %c) #0 {
// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %c, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %c, i32 1
// CHECK: [[MUL_I:%.*]] = fmul <2 x float> %b, [[VECINIT1_I]]
// CHECK: [[SUB_I:%.*]] = fsub <2 x float> %a, [[MUL_I]]
// CHECK: ret <2 x float> [[SUB_I]]
float32x2_t test_vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c) {
// CHECK-LABEL: test_vmls_n_f32
return vmls_n_f32(a, b, c);
// CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
// CHECK: fsub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
// CHECK-FMA: dup {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
// CHECK-FMA: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
}
// CHECK-LABEL: define <2 x double> @test_vmlsq_n_f64(<2 x double> %a, <2 x double> %b, double %c) #0 {
// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x double> undef, double %c, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double %c, i32 1
// CHECK: [[MUL_I:%.*]] = fmul <2 x double> %b, [[VECINIT1_I]]
// CHECK: [[SUB_I:%.*]] = fsub <2 x double> %a, [[MUL_I]]
// CHECK: ret <2 x double> [[SUB_I]]
float64x2_t test_vmlsq_n_f64(float64x2_t a, float64x2_t b, float64_t c) {
// CHECK-LABEL: test_vmlsq_n_f64
return vmlsq_n_f64(a, b, c);
// CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
// CHECK: fsub {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
// CHECK-FMA: dup {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
// CHECK-FMA: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
}
// CHECK-LABEL: define <2 x float> @test_vmla_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]]
// CHECK: [[ADD:%.*]] = fadd <2 x float> %a, [[MUL]]
// CHECK: ret <2 x float> [[ADD]]
float32x2_t test_vmla_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) {
// CHECK-LABEL: test_vmla_lane_f32_0
return vmla_lane_f32(a, b, v, 0);
// CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
// CHECK: fadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
// CHECK-FMA: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
}
// CHECK-LABEL: define <4 x float> @test_vmlaq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]]
// CHECK: [[ADD:%.*]] = fadd <4 x float> %a, [[MUL]]
// CHECK: ret <4 x float> [[ADD]]
float32x4_t test_vmlaq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) {
// CHECK-LABEL: test_vmlaq_lane_f32_0
return vmlaq_lane_f32(a, b, v, 0);
// CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
// CHECK: fadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
// CHECK-FMA: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
}
// CHECK-LABEL: define <2 x float> @test_vmla_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]]
// CHECK: [[ADD:%.*]] = fadd <2 x float> %a, [[MUL]]
// CHECK: ret <2 x float> [[ADD]]
float32x2_t test_vmla_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) {
// CHECK-LABEL: test_vmla_laneq_f32_0
return vmla_laneq_f32(a, b, v, 0);
// CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
// CHECK: fadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
// CHECK-FMA: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
}
// CHECK-LABEL: define <4 x float> @test_vmlaq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]]
// CHECK: [[ADD:%.*]] = fadd <4 x float> %a, [[MUL]]
// CHECK: ret <4 x float> [[ADD]]
float32x4_t test_vmlaq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) {
// CHECK-LABEL: test_vmlaq_laneq_f32_0
return vmlaq_laneq_f32(a, b, v, 0);
// CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
// CHECK: fadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
// CHECK-FMA: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
}
// CHECK-LABEL: define <2 x float> @test_vmls_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]]
// CHECK: [[SUB:%.*]] = fsub <2 x float> %a, [[MUL]]
// CHECK: ret <2 x float> [[SUB]]
float32x2_t test_vmls_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) {
// CHECK-LABEL: test_vmls_lane_f32_0
return vmls_lane_f32(a, b, v, 0);
// CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
// CHECK: fsub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
// CHECK-FMA: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
}
// CHECK-LABEL: define <4 x float> @test_vmlsq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]]
// CHECK: [[SUB:%.*]] = fsub <4 x float> %a, [[MUL]]
// CHECK: ret <4 x float> [[SUB]]
float32x4_t test_vmlsq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) {
// CHECK-LABEL: test_vmlsq_lane_f32_0
return vmlsq_lane_f32(a, b, v, 0);
// CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
// CHECK: fsub {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
// CHECK-FMA: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
}
// CHECK-LABEL: define <2 x float> @test_vmls_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]]
// CHECK: [[SUB:%.*]] = fsub <2 x float> %a, [[MUL]]
// CHECK: ret <2 x float> [[SUB]]
float32x2_t test_vmls_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) {
// CHECK-LABEL: test_vmls_laneq_f32_0
return vmls_laneq_f32(a, b, v, 0);
// CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
// CHECK: fsub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
// CHECK-FMA: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
}
// CHECK-LABEL: define <4 x float> @test_vmlsq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]]
// CHECK: [[SUB:%.*]] = fsub <4 x float> %a, [[MUL]]
// CHECK: ret <4 x float> [[SUB]]
float32x4_t test_vmlsq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) {
// CHECK-LABEL: test_vmlsq_laneq_f32_0
return vmlsq_laneq_f32(a, b, v, 0);
// CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
// CHECK: fsub {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
// CHECK-FMA: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
}
// CHECK-LABEL: define <2 x float> @test_vmla_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> <i32 1, i32 1>
// CHECK: [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]]
// CHECK: [[ADD:%.*]] = fadd <2 x float> %a, [[MUL]]
// CHECK: ret <2 x float> [[ADD]]
float32x2_t test_vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
// CHECK-LABEL: test_vmla_lane_f32
return vmla_lane_f32(a, b, v, 1);
// CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
// CHECK: fadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
// CHECK-FMA: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
}
// CHECK-LABEL: define <4 x float> @test_vmlaq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK: [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]]
// CHECK: [[ADD:%.*]] = fadd <4 x float> %a, [[MUL]]
// CHECK: ret <4 x float> [[ADD]]
float32x4_t test_vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
// CHECK-LABEL: test_vmlaq_lane_f32
return vmlaq_lane_f32(a, b, v, 1);
// CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
// CHECK: fadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
// CHECK-FMA: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
}
// CHECK-LABEL: define <2 x float> @test_vmla_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> <i32 3, i32 3>
// CHECK: [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]]
// CHECK: [[ADD:%.*]] = fadd <2 x float> %a, [[MUL]]
// CHECK: ret <2 x float> [[ADD]]
float32x2_t test_vmla_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
// CHECK-LABEL: test_vmla_laneq_f32
return vmla_laneq_f32(a, b, v, 3);
// CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
// CHECK: fadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
// CHECK-FMA: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
}
// CHECK-LABEL: define <4 x float> @test_vmlaq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]]
// CHECK: [[ADD:%.*]] = fadd <4 x float> %a, [[MUL]]
// CHECK: ret <4 x float> [[ADD]]
float32x4_t test_vmlaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
// CHECK-LABEL: test_vmlaq_laneq_f32
return vmlaq_laneq_f32(a, b, v, 3);
// CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
// CHECK: fadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
// CHECK-FMA: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
}
// CHECK-LABEL: define <2 x float> @test_vmls_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> <i32 1, i32 1>
// CHECK: [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]]
// CHECK: [[SUB:%.*]] = fsub <2 x float> %a, [[MUL]]
// CHECK: ret <2 x float> [[SUB]]
float32x2_t test_vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
// CHECK-LABEL: test_vmls_lane_f32
return vmls_lane_f32(a, b, v, 1);
// CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
// CHECK: fsub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
// CHECK-FMA: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
}
// CHECK-LABEL: define <4 x float> @test_vmlsq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK: [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]]
// CHECK: [[SUB:%.*]] = fsub <4 x float> %a, [[MUL]]
// CHECK: ret <4 x float> [[SUB]]
float32x4_t test_vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
// CHECK-LABEL: test_vmlsq_lane_f32
return vmlsq_lane_f32(a, b, v, 1);
// CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
// CHECK: fsub {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
// CHECK-FMA: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
}
// CHECK-LABEL: define <2 x float> @test_vmls_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> <i32 3, i32 3>
// CHECK: [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]]
// CHECK: [[SUB:%.*]] = fsub <2 x float> %a, [[MUL]]
// CHECK: ret <2 x float> [[SUB]]
float32x2_t test_vmls_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
// CHECK-LABEL: test_vmls_laneq_f32
return vmls_laneq_f32(a, b, v, 3);
// CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
// CHECK: fsub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
// CHECK-FMA: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
}
// CHECK-LABEL: define <4 x float> @test_vmlsq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]]
// CHECK: [[SUB:%.*]] = fsub <4 x float> %a, [[MUL]]
// CHECK: ret <4 x float> [[SUB]]
float32x4_t test_vmlsq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
// CHECK-LABEL: test_vmlsq_laneq_f32
return vmlsq_laneq_f32(a, b, v, 3);
// CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
// CHECK: fsub {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
// CHECK-FMA: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
}
// CHECK-LABEL: define <2 x double> @test_vfmaq_n_f64(<2 x double> %a, <2 x double> %b, double %c) #0 {
// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x double> undef, double %c, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double %c, i32 1
// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x double> [[VECINIT1_I]] to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
// CHECK: [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[TMP4]], <2 x double> [[TMP5]], <2 x double> [[TMP3]]) #2
// CHECK: ret <2 x double> [[TMP6]]
float64x2_t test_vfmaq_n_f64(float64x2_t a, float64x2_t b, float64_t c) {
// CHECK-LABEL: test_vfmaq_n_f64:
return vfmaq_n_f64(a, b, c);
// CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+\.2d|v[0-9]+\.d\[0\]}}
}
// CHECK-LABEL: define <2 x double> @test_vfmsq_n_f64(<2 x double> %a, <2 x double> %b, double %c) #0 {
// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x double> undef, double %c, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double %c, i32 1
// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x double> [[VECINIT1_I]] to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
// CHECK: [[TMP4:%.*]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, [[TMP3]]
// CHECK: [[FMLS_I_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
// CHECK: [[FMLS1_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
// CHECK: [[FMLS2_I_I:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[FMLS_I_I]], <2 x double> [[TMP4]], <2 x double> [[FMLS1_I_I]]) #2
// CHECK: ret <2 x double> [[FMLS2_I_I]]
float64x2_t test_vfmsq_n_f64(float64x2_t a, float64x2_t b, float64_t c) {
// CHECK-LABEL: test_vfmsq_n_f64:
return vfmsq_n_f64(a, b, c);
// CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+\.2d|v[0-9]+\.d\[0\]}}
}
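// Editorial note (not part of the original test): vmla_n and vmla_lane expand
// to an explicit fmul followed by fadd (splat the scalar or lane, multiply,
// accumulate), so they fuse into fmla only when contraction is permitted; the
// vfma*_n variants instead call llvm.fma directly, as test_vfmaq_n_f64 shows.
// Roughly, under that reading:
//   vmla_n_f32(a, b, c)  ~  vadd_f32(a, vmul_f32(b, vdup_n_f32(c)))  // two rounded ops
//   vfma_n_f32(a, b, c)  ~  a single fused multiply-add, one rounding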

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -1,173 +1,228 @@
// REQUIRES: aarch64-registered-target
// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
// RUN: -ffp-contract=fast -S -O3 -o - %s | FileCheck %s
// RUN: -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
#include <arm_neon.h>
// CHECK-LABEL: test_vdups_lane_f32
// CHECK-LABEL: define float @test_vdups_lane_f32(<2 x float> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK: [[VDUPS_LANE:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
// CHECK: ret float [[VDUPS_LANE]]
float32_t test_vdups_lane_f32(float32x2_t a) {
return vdups_lane_f32(a, 1);
// CHECK: ret
// CHECK-NOT: dup {{s[0-9]+}}, {{v[0-9]+}}.s[1]
}
// CHECK-LABEL: test_vdupd_lane_f64
// CHECK-LABEL: define double @test_vdupd_lane_f64(<1 x double> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK: [[VDUPD_LANE:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
// CHECK: ret double [[VDUPD_LANE]]
float64_t test_vdupd_lane_f64(float64x1_t a) {
return vdupd_lane_f64(a, 0);
// CHECK: ret
// CHECK-NOT: dup {{d[0-9]+}}, {{v[0-9]+}}.d[0]
}
// CHECK-LABEL: test_vdups_laneq_f32
// CHECK-LABEL: define float @test_vdups_laneq_f32(<4 x float> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
// CHECK: ret float [[VGETQ_LANE]]
float32_t test_vdups_laneq_f32(float32x4_t a) {
return vdups_laneq_f32(a, 3);
// CHECK: ret
// CHECK-NOT: dup {{s[0-9]+}}, {{v[0-9]+}}.s[3]
}
// CHECK-LABEL: test_vdupd_laneq_f64
// CHECK-LABEL: define double @test_vdupd_laneq_f64(<2 x double> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
// CHECK: ret double [[VGETQ_LANE]]
float64_t test_vdupd_laneq_f64(float64x2_t a) {
return vdupd_laneq_f64(a, 1);
// CHECK: ret
// CHECK-NOT: dup {{d[0-9]+}}, {{v[0-9]+}}.d[1]
}
// CHECK-LABEL: test_vdupb_lane_s8
// CHECK-LABEL: define i8 @test_vdupb_lane_s8(<8 x i8> %a) #0 {
// CHECK: [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7
// CHECK: ret i8 [[VGET_LANE]]
int8_t test_vdupb_lane_s8(int8x8_t a) {
return vdupb_lane_s8(a, 7);
// CHECK: {{umov|smov}} {{w[0-9]+}}, {{v[0-9]+}}.b[7]
}
// CHECK-LABEL: test_vduph_lane_s16
// CHECK-LABEL: define i16 @test_vduph_lane_s16(<4 x i16> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
// CHECK: ret i16 [[VGET_LANE]]
int16_t test_vduph_lane_s16(int16x4_t a) {
return vduph_lane_s16(a, 3);
// CHECK: {{umov|smov}} {{w[0-9]+}}, {{v[0-9]+}}.h[3]
}
// CHECK-LABEL: test_vdups_lane_s32
// CHECK-LABEL: define i32 @test_vdups_lane_s32(<2 x i32> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
// CHECK: ret i32 [[VGET_LANE]]
int32_t test_vdups_lane_s32(int32x2_t a) {
return vdups_lane_s32(a, 1);
// CHECK: {{mov|umov}} {{w[0-9]+}}, {{v[0-9]+}}.s[1]
}
// CHECK-LABEL: test_vdupd_lane_s64
// CHECK-LABEL: define i64 @test_vdupd_lane_s64(<1 x i64> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0
// CHECK: ret i64 [[VGET_LANE]]
int64_t test_vdupd_lane_s64(int64x1_t a) {
return vdupd_lane_s64(a, 0);
// CHECK: fmov {{x[0-9]+}}, {{d[0-9]+}}
}
// CHECK-LABEL: test_vdupb_lane_u8
// CHECK-LABEL: define i8 @test_vdupb_lane_u8(<8 x i8> %a) #0 {
// CHECK: [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7
// CHECK: ret i8 [[VGET_LANE]]
uint8_t test_vdupb_lane_u8(uint8x8_t a) {
return vdupb_lane_u8(a, 7);
// CHECK: {{mov|umov}} {{w[0-9]+}}, {{v[0-9]+}}.b[7]
}
// CHECK-LABEL: test_vduph_lane_u16
// CHECK-LABEL: define i16 @test_vduph_lane_u16(<4 x i16> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
// CHECK: ret i16 [[VGET_LANE]]
uint16_t test_vduph_lane_u16(uint16x4_t a) {
return vduph_lane_u16(a, 3);
// CHECK: {{mov|umov}} {{w[0-9]+}}, {{v[0-9]+}}.h[3]
}
// CHECK-LABEL: test_vdups_lane_u32
// CHECK-LABEL: define i32 @test_vdups_lane_u32(<2 x i32> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
// CHECK: ret i32 [[VGET_LANE]]
uint32_t test_vdups_lane_u32(uint32x2_t a) {
return vdups_lane_u32(a, 1);
// CHECK: {{mov|umov}} {{w[0-9]+}}, {{v[0-9]+}}.s[1]
}
// CHECK-LABEL: test_vdupd_lane_u64
// CHECK-LABEL: define i64 @test_vdupd_lane_u64(<1 x i64> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0
// CHECK: ret i64 [[VGET_LANE]]
uint64_t test_vdupd_lane_u64(uint64x1_t a) {
return vdupd_lane_u64(a, 0);
// CHECK: fmov {{x[0-9]+}}, {{d[0-9]+}}
}
// CHECK-LABEL: test_vdupb_laneq_s8
// CHECK-LABEL: define i8 @test_vdupb_laneq_s8(<16 x i8> %a) #0 {
// CHECK: [[VGETQ_LANE:%.*]] = extractelement <16 x i8> %a, i32 15
// CHECK: ret i8 [[VGETQ_LANE]]
int8_t test_vdupb_laneq_s8(int8x16_t a) {
return vdupb_laneq_s8(a, 15);
// CHECK: {{umov|smov}} {{w[0-9]+}}, {{v[0-9]+}}.b[15]
}
// CHECK-LABEL: test_vduph_laneq_s16
// CHECK-LABEL: define i16 @test_vduph_laneq_s16(<8 x i16> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
// CHECK: ret i16 [[VGETQ_LANE]]
int16_t test_vduph_laneq_s16(int16x8_t a) {
return vduph_laneq_s16(a, 7);
// CHECK: {{umov|smov}} {{w[0-9]+}}, {{v[0-9]+}}.h[7]
}
// CHECK-LABEL: test_vdups_laneq_s32
// CHECK-LABEL: define i32 @test_vdups_laneq_s32(<4 x i32> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
// CHECK: ret i32 [[VGETQ_LANE]]
int32_t test_vdups_laneq_s32(int32x4_t a) {
return vdups_laneq_s32(a, 3);
// CHECK: {{mov|umov}} {{w[0-9]+}}, {{v[0-9]+}}.s[3]
}
// CHECK-LABEL: test_vdupd_laneq_s64
// CHECK-LABEL: define i64 @test_vdupd_laneq_s64(<2 x i64> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
// CHECK: ret i64 [[VGETQ_LANE]]
int64_t test_vdupd_laneq_s64(int64x2_t a) {
return vdupd_laneq_s64(a, 1);
// CHECK: {{mov|umov}} {{x[0-9]+}}, {{v[0-9]+}}.d[1]
}
// CHECK-LABEL: test_vdupb_laneq_u8
// CHECK-LABEL: define i8 @test_vdupb_laneq_u8(<16 x i8> %a) #0 {
// CHECK: [[VGETQ_LANE:%.*]] = extractelement <16 x i8> %a, i32 15
// CHECK: ret i8 [[VGETQ_LANE]]
uint8_t test_vdupb_laneq_u8(uint8x16_t a) {
return vdupb_laneq_u8(a, 15);
// CHECK: {{mov|umov}} {{w[0-9]+}}, {{v[0-9]+}}.b[15]
}
// CHECK-LABEL: test_vduph_laneq_u16
// CHECK-LABEL: define i16 @test_vduph_laneq_u16(<8 x i16> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
// CHECK: ret i16 [[VGETQ_LANE]]
uint16_t test_vduph_laneq_u16(uint16x8_t a) {
return vduph_laneq_u16(a, 7);
// CHECK: {{mov|umov}} {{w[0-9]+}}, {{v[0-9]+}}.h[7]
}
// CHECK-LABEL: test_vdups_laneq_u32
// CHECK-LABEL: define i32 @test_vdups_laneq_u32(<4 x i32> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
// CHECK: ret i32 [[VGETQ_LANE]]
uint32_t test_vdups_laneq_u32(uint32x4_t a) {
return vdups_laneq_u32(a, 3);
// CHECK: {{mov|umov}} {{w[0-9]+}}, {{v[0-9]+}}.s[3]
}
// CHECK-LABEL: test_vdupd_laneq_u64
// CHECK-LABEL: define i64 @test_vdupd_laneq_u64(<2 x i64> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
// CHECK: ret i64 [[VGETQ_LANE]]
uint64_t test_vdupd_laneq_u64(uint64x2_t a) {
return vdupd_laneq_u64(a, 1);
// CHECK: {{mov|umov}} {{x[0-9]+}}, {{v[0-9]+}}.d[1]
}
// CHECK-LABEL: test_vdupb_lane_p8
// CHECK-LABEL: define i8 @test_vdupb_lane_p8(<8 x i8> %a) #0 {
// CHECK: [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7
// CHECK: ret i8 [[VGET_LANE]]
poly8_t test_vdupb_lane_p8(poly8x8_t a) {
return vdupb_lane_p8(a, 7);
// CHECK: {{mov|umov}} {{w[0-9]+}}, {{v[0-9]+}}.b[7]
}
// CHECK-LABEL: test_vduph_lane_p16
// CHECK-LABEL: define i16 @test_vduph_lane_p16(<4 x i16> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
// CHECK: ret i16 [[VGET_LANE]]
poly16_t test_vduph_lane_p16(poly16x4_t a) {
return vduph_lane_p16(a, 3);
// CHECK: {{mov|umov}} {{w[0-9]+}}, {{v[0-9]+}}.h[3]
}
// CHECK-LABEL: test_vdupb_laneq_p8
// CHECK-LABEL: define i8 @test_vdupb_laneq_p8(<16 x i8> %a) #0 {
// CHECK: [[VGETQ_LANE:%.*]] = extractelement <16 x i8> %a, i32 15
// CHECK: ret i8 [[VGETQ_LANE]]
poly8_t test_vdupb_laneq_p8(poly8x16_t a) {
return vdupb_laneq_p8(a, 15);
// CHECK: {{mov|umov}} {{w[0-9]+}}, {{v[0-9]+}}.b[15]
}
// CHECK-LABEL: test_vduph_laneq_p16
// CHECK-LABEL: define i16 @test_vduph_laneq_p16(<8 x i16> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
// CHECK: ret i16 [[VGETQ_LANE]]
poly16_t test_vduph_laneq_p16(poly16x8_t a) {
return vduph_laneq_p16(a, 7);
// CHECK: {{mov|umov}} {{w[0-9]+}}, {{v[0-9]+}}.h[7]
}
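// Editorial note (not part of the original test): each scalar
// vdup{b,h,s,d}_lane(q) call above lowers to a single extractelement, making
// it equivalent in IR to the corresponding vget_lane intrinsic:
//   vdups_lane_f32(v, 1)   ~  vget_lane_f32(v, 1)
//   vdupd_laneq_u64(v, 1)  ~  vgetq_lane_u64(v, 1)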


@@ -1,259 +1,509 @@
// REQUIRES: aarch64-registered-target
// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-cpu cyclone \
// RUN: -ffp-contract=fast -S -O3 -o - %s | FileCheck %s
// RUN: -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
// Test new aarch64 intrinsics and types
#include <arm_neon.h>
// CHECK-LABEL: define float @test_vmuls_lane_f32(float %a, <2 x float> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
// CHECK: [[MUL:%.*]] = fmul float %a, [[VGET_LANE]]
// CHECK: ret float [[MUL]]
float32_t test_vmuls_lane_f32(float32_t a, float32x2_t b) {
// CHECK-LABEL: test_vmuls_lane_f32
return vmuls_lane_f32(a, b, 1);
// CHECK: fmul {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
}
// CHECK-LABEL: define double @test_vmuld_lane_f64(double %a, <1 x double> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
// CHECK: [[MUL:%.*]] = fmul double %a, [[VGET_LANE]]
// CHECK: ret double [[MUL]]
float64_t test_vmuld_lane_f64(float64_t a, float64x1_t b) {
// CHECK-LABEL: test_vmuld_lane_f64
return vmuld_lane_f64(a, b, 0);
// CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+.d\[0\]|d[0-9]+}}
}
// CHECK-LABEL: define float @test_vmuls_laneq_f32(float %a, <4 x float> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
// CHECK: [[MUL:%.*]] = fmul float %a, [[VGETQ_LANE]]
// CHECK: ret float [[MUL]]
float32_t test_vmuls_laneq_f32(float32_t a, float32x4_t b) {
// CHECK-LABEL: test_vmuls_laneq_f32
return vmuls_laneq_f32(a, b, 3);
// CHECK: fmul {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
}
// CHECK-LABEL: define double @test_vmuld_laneq_f64(double %a, <2 x double> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %b to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
// CHECK: [[MUL:%.*]] = fmul double %a, [[VGETQ_LANE]]
// CHECK: ret double [[MUL]]
float64_t test_vmuld_laneq_f64(float64_t a, float64x2_t b) {
// CHECK-LABEL: test_vmuld_laneq_f64
return vmuld_laneq_f64(a, b, 1);
// CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
}
// CHECK-LABEL: define <1 x double> @test_vmul_n_f64(<1 x double> %a, double %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK: [[TMP2:%.*]] = bitcast <1 x double> [[TMP1]] to double
// CHECK: [[TMP3:%.*]] = fmul double [[TMP2]], %b
// CHECK: [[TMP4:%.*]] = bitcast double [[TMP3]] to <1 x double>
// CHECK: ret <1 x double> [[TMP4]]
float64x1_t test_vmul_n_f64(float64x1_t a, float64_t b) {
// CHECK-LABEL: test_vmul_n_f64
return vmul_n_f64(a, b);
// CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+.d\[0\]|d[0-9]+}}
}
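// Editorial note (not part of the original test): for the single-lane 64-bit
// case the _n multiply stays scalar -- the <1 x double> operand is bitcast to
// double, multiplied with a plain fmul, and bitcast back -- so no dup or
// shuffle is needed.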
// CHECK-LABEL: define float @test_vmulxs_lane_f32(float %a, <2 x float> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
// CHECK: [[VMULXS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmulx.f32(float %a, float [[VGET_LANE]]) #2
// CHECK: ret float [[VMULXS_F32_I]]
float32_t test_vmulxs_lane_f32(float32_t a, float32x2_t b) {
// CHECK-LABEL: test_vmulxs_lane_f32
return vmulxs_lane_f32(a, b, 1);
// CHECK: fmulx {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
}
// CHECK-LABEL: define float @test_vmulxs_laneq_f32(float %a, <4 x float> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
// CHECK: [[VMULXS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmulx.f32(float %a, float [[VGETQ_LANE]]) #2
// CHECK: ret float [[VMULXS_F32_I]]
float32_t test_vmulxs_laneq_f32(float32_t a, float32x4_t b) {
// CHECK-LABEL: test_vmulxs_laneq_f32
return vmulxs_laneq_f32(a, b, 3);
// CHECK: fmulx {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
}
// CHECK-LABEL: define double @test_vmulxd_lane_f64(double %a, <1 x double> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
// CHECK: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double %a, double [[VGET_LANE]]) #2
// CHECK: ret double [[VMULXD_F64_I]]
float64_t test_vmulxd_lane_f64(float64_t a, float64x1_t b) {
// CHECK-LABEL: test_vmulxd_lane_f64
return vmulxd_lane_f64(a, b, 0);
// CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+.d\[0\]|d[0-9]+}}
}
// CHECK-LABEL: define double @test_vmulxd_laneq_f64(double %a, <2 x double> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %b to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
// CHECK: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double %a, double [[VGETQ_LANE]]) #2
// CHECK: ret double [[VMULXD_F64_I]]
float64_t test_vmulxd_laneq_f64(float64_t a, float64x2_t b) {
// CHECK-LABEL: test_vmulxd_laneq_f64
return vmulxd_laneq_f64(a, b, 1);
// CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
}
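// Editorial note (not part of the original test): fmulx matches fmul except
// that infinity * 0.0 returns +/-2.0 rather than NaN, so these lane variants
// must go through the llvm.aarch64.neon.fmulx intrinsic instead of folding to
// a plain fmul the way the vmul lane tests above do.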
// CHECK-LABEL: test_vmulx_lane_f64
// CHECK-LABEL: define <1 x double> @test_vmulx_lane_f64(<1 x double> %a, <1 x double> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
// CHECK: [[TMP2:%.*]] = bitcast <1 x double> %b to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
// CHECK: [[VGET_LANE6:%.*]] = extractelement <1 x double> [[TMP3]], i32 0
// CHECK: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGET_LANE6]]) #2
// CHECK: [[TMP4:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP5]], double [[VMULXD_F64_I]], i32 0
// CHECK: ret <1 x double> [[VSET_LANE]]
float64x1_t test_vmulx_lane_f64(float64x1_t a, float64x1_t b) {
return vmulx_lane_f64(a, b, 0);
// CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+.d\[0\]|d[0-9]+}}
}
// CHECK-LABEL: test_vmulx_laneq_f64_0
// CHECK-LABEL: define <1 x double> @test_vmulx_laneq_f64_0(<1 x double> %a, <2 x double> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
// CHECK: [[TMP2:%.*]] = bitcast <2 x double> %b to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
// CHECK: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGETQ_LANE]]) #2
// CHECK: [[TMP4:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP5]], double [[VMULXD_F64_I]], i32 0
// CHECK: ret <1 x double> [[VSET_LANE]]
float64x1_t test_vmulx_laneq_f64_0(float64x1_t a, float64x2_t b) {
return vmulx_laneq_f64(a, b, 0);
// CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0]
}
// CHECK-LABEL: test_vmulx_laneq_f64_1
// CHECK-LABEL: define <1 x double> @test_vmulx_laneq_f64_1(<1 x double> %a, <2 x double> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
// CHECK: [[TMP2:%.*]] = bitcast <2 x double> %b to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
// CHECK: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGETQ_LANE]]) #2
// CHECK: [[TMP4:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP5]], double [[VMULXD_F64_I]], i32 0
// CHECK: ret <1 x double> [[VSET_LANE]]
float64x1_t test_vmulx_laneq_f64_1(float64x1_t a, float64x2_t b) {
return vmulx_laneq_f64(a, b, 1);
// CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
}
// CHECK-LABEL: test_vfmas_lane_f32
// CHECK-LABEL: define float @test_vfmas_lane_f32(float %a, float %b, <2 x float> %c) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %c to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK: [[EXTRACT:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
// CHECK: [[TMP2:%.*]] = call float @llvm.fma.f32(float %b, float [[EXTRACT]], float %a)
// CHECK: ret float [[TMP2]]
float32_t test_vfmas_lane_f32(float32_t a, float32_t b, float32x2_t c) {
return vfmas_lane_f32(a, b, c, 1);
// CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
}
// CHECK-LABEL: test_vfmad_lane_f64
// CHECK-LABEL: define double @test_vfmad_lane_f64(double %a, double %b, <1 x double> %c) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %c to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK: [[EXTRACT:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
// CHECK: [[TMP2:%.*]] = call double @llvm.fma.f64(double %b, double [[EXTRACT]], double %a)
// CHECK: ret double [[TMP2]]
float64_t test_vfmad_lane_f64(float64_t a, float64_t b, float64x1_t c) {
return vfmad_lane_f64(a, b, c, 0);
// CHECK: {{fmla|fmadd}} {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+.d\[0\]|d[0-9]+}}
}
// CHECK-LABEL: test_vfmad_laneq_f64
// CHECK-LABEL: define double @test_vfmad_laneq_f64(double %a, double %b, <2 x double> %c) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %c to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
// CHECK: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
// CHECK: [[TMP2:%.*]] = call double @llvm.fma.f64(double %b, double [[EXTRACT]], double %a)
// CHECK: ret double [[TMP2]]
float64_t test_vfmad_laneq_f64(float64_t a, float64_t b, float64x2_t c) {
return vfmad_laneq_f64(a, b, c, 1);
// CHECK: fmla {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
}
// CHECK-LABEL: test_vfmss_lane_f32
// CHECK-LABEL: define float @test_vfmss_lane_f32(float %a, float %b, <2 x float> %c) #0 {
// CHECK: [[SUB:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %c
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[SUB]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK: [[EXTRACT:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
// CHECK: [[TMP2:%.*]] = call float @llvm.fma.f32(float %b, float [[EXTRACT]], float %a)
// CHECK: ret float [[TMP2]]
float32_t test_vfmss_lane_f32(float32_t a, float32_t b, float32x2_t c) {
return vfmss_lane_f32(a, b, c, 1);
// CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
}
// CHECK-LABEL: test_vfma_lane_f64
// CHECK-LABEL: define <1 x double> @test_vfma_lane_f64(<1 x double> %a, <1 x double> %b, <1 x double> %v) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <1 x double> %v to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
// CHECK: [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer
// CHECK: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
// CHECK: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK: [[FMLA2:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FMLA]], <1 x double> [[LANE]], <1 x double> [[FMLA1]])
// CHECK: ret <1 x double> [[FMLA2]]
float64x1_t test_vfma_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
return vfma_lane_f64(a, b, v, 0);
// CHECK: {{fmla|fmadd}} {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+.d\[0\]|d[0-9]+}}
}
// CHECK-LABEL: test_vfms_lane_f64
// CHECK-LABEL: define <1 x double> @test_vfms_lane_f64(<1 x double> %a, <1 x double> %b, <1 x double> %v) #0 {
// CHECK: [[SUB:%.*]] = fsub <1 x double> <double -0.000000e+00>, %v
// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <1 x double> [[SUB]] to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
// CHECK: [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer
// CHECK: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
// CHECK: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK: [[FMLA2:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FMLA]], <1 x double> [[LANE]], <1 x double> [[FMLA1]])
// CHECK: ret <1 x double> [[FMLA2]]
float64x1_t test_vfms_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
return vfms_lane_f64(a, b, v, 0);
// CHECK: {{fmls|fmsub}} {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+.d\[0\]|d[0-9]+}}
}
// CHECK-LABEL: test_vfma_laneq_f64
// CHECK-LABEL: define <1 x double> @test_vfma_laneq_f64(<1 x double> %a, <1 x double> %b, <2 x double> %v) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to double
// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to double
// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
// CHECK: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
// CHECK: [[TMP6:%.*]] = call double @llvm.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]])
// CHECK: [[TMP7:%.*]] = bitcast double [[TMP6]] to <1 x double>
// CHECK: ret <1 x double> [[TMP7]]
float64x1_t test_vfma_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {
return vfma_laneq_f64(a, b, v, 0);
// CHECK: fmla {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0]
}
// CHECK-LABEL: test_vfms_laneq_f64
// CHECK-LABEL: define <1 x double> @test_vfms_laneq_f64(<1 x double> %a, <1 x double> %b, <2 x double> %v) #0 {
// CHECK: [[SUB:%.*]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %v
// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x double> [[SUB]] to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to double
// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to double
// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
// CHECK: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
// CHECK: [[TMP6:%.*]] = call double @llvm.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]])
// CHECK: [[TMP7:%.*]] = bitcast double [[TMP6]] to <1 x double>
// CHECK: ret <1 x double> [[TMP7]]
float64x1_t test_vfms_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {
return vfms_laneq_f64(a, b, v, 0);
// CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0]
}
// CHECK-LABEL: test_vqdmullh_lane_s16
// CHECK-LABEL: define i32 @test_vqdmullh_lane_s16(i16 %a, <4 x i16> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[VGET_LANE]], i64 0
// CHECK: [[VQDMULLH_S16_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]]) #2
// CHECK: [[TMP4:%.*]] = extractelement <4 x i32> [[VQDMULLH_S16_I]], i64 0
// CHECK: ret i32 [[TMP4]]
int32_t test_vqdmullh_lane_s16(int16_t a, int16x4_t b) {
return vqdmullh_lane_s16(a, b, 3);
// CHECK: sqdmull {{s[0-9]+|v[0-9]+.4s}}, {{h[0-9]+|v[0-9]+.4h}}, {{v[0-9]+}}.h[3]
}
// CHECK-LABEL: test_vqdmulls_lane_s32
// CHECK-LABEL: define i64 @test_vqdmulls_lane_s32(i32 %a, <2 x i32> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
// CHECK: [[VQDMULLS_S32_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %a, i32 [[VGET_LANE]]) #2
// CHECK: ret i64 [[VQDMULLS_S32_I]]
int64_t test_vqdmulls_lane_s32(int32_t a, int32x2_t b) {
return vqdmulls_lane_s32(a, b, 1);
// CHECK: sqdmull {{d[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
}
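// Scalar sketch of the sqdmulls.scalar operation checked above (hypothetical
// helper, not part of the test): widen, double, saturate. Only
// INT32_MIN * INT32_MIN overflows the doubling step, so it is the one case
// that needs clamping.
#include <stdint.h>
static int64_t sqdmulls_ref(int32_t a, int32_t b) {
  if (a == INT32_MIN && b == INT32_MIN)
    return INT64_MAX;                 // 2 * 2^62 does not fit in i64
  return 2 * (int64_t)a * (int64_t)b; // exact otherwise
}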
// CHECK-LABEL: test_vqdmullh_laneq_s16
// CHECK-LABEL: define i32 @test_vqdmullh_laneq_s16(i16 %a, <8 x i16> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[VGETQ_LANE]], i64 0
// CHECK: [[VQDMULLH_S16_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]]) #2
// CHECK: [[TMP4:%.*]] = extractelement <4 x i32> [[VQDMULLH_S16_I]], i64 0
// CHECK: ret i32 [[TMP4]]
int32_t test_vqdmullh_laneq_s16(int16_t a, int16x8_t b) {
return vqdmullh_laneq_s16(a, b, 7);
// CHECK: sqdmull {{s[0-9]+|v[0-9]+.4s}}, {{h[0-9]+|v[0-9]+.4h}}, {{v[0-9]+}}.h[7]
}
// CHECK-LABEL: test_vqdmulls_laneq_s32
// CHECK-LABEL: define i64 @test_vqdmulls_laneq_s32(i32 %a, <4 x i32> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
// CHECK: [[VQDMULLS_S32_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %a, i32 [[VGETQ_LANE]]) #2
// CHECK: ret i64 [[VQDMULLS_S32_I]]
int64_t test_vqdmulls_laneq_s32(int32_t a, int32x4_t b) {
return vqdmulls_laneq_s32(a, b, 3);
// CHECK: sqdmull {{d[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
}
// CHECK-LABEL: test_vqdmulhh_lane_s16
// CHECK-LABEL: define i16 @test_vqdmulhh_lane_s16(i16 %a, <4 x i16> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[VGET_LANE]], i64 0
// CHECK: [[VQDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]]) #2
// CHECK: [[TMP4:%.*]] = extractelement <4 x i16> [[VQDMULHH_S16_I]], i64 0
// CHECK: ret i16 [[TMP4]]
int16_t test_vqdmulhh_lane_s16(int16_t a, int16x4_t b) {
return vqdmulhh_lane_s16(a, b, 3);
// CHECK: sqdmulh {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{v[0-9]+}}.h[3]
}
// CHECK-LABEL: test_vqdmulhs_lane_s32
// CHECK-LABEL: define i32 @test_vqdmulhs_lane_s32(i32 %a, <2 x i32> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
// CHECK: [[VQDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %a, i32 [[VGET_LANE]]) #2
// CHECK: ret i32 [[VQDMULHS_S32_I]]
int32_t test_vqdmulhs_lane_s32(int32_t a, int32x2_t b) {
return vqdmulhs_lane_s32(a, b, 1);
// CHECK: sqdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
}
// CHECK-LABEL: test_vqdmulhh_laneq_s16
// CHECK-LABEL: define i16 @test_vqdmulhh_laneq_s16(i16 %a, <8 x i16> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[VGETQ_LANE]], i64 0
// CHECK: [[VQDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]]) #2
// CHECK: [[TMP4:%.*]] = extractelement <4 x i16> [[VQDMULHH_S16_I]], i64 0
// CHECK: ret i16 [[TMP4]]
int16_t test_vqdmulhh_laneq_s16(int16_t a, int16x8_t b) {
return vqdmulhh_laneq_s16(a, b, 7);
// CHECK: sqdmulh {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{v[0-9]+}}.h[7]
}
// CHECK-LABEL: test_vqdmulhs_laneq_s32
// CHECK-LABEL: define i32 @test_vqdmulhs_laneq_s32(i32 %a, <4 x i32> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
// CHECK: [[VQDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %a, i32 [[VGETQ_LANE]]) #2
// CHECK: ret i32 [[VQDMULHS_S32_I]]
int32_t test_vqdmulhs_laneq_s32(int32_t a, int32x4_t b) {
return vqdmulhs_laneq_s32(a, b, 3);
// CHECK: sqdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
}
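// Scalar sketch of sqdmulh (hypothetical helper, not part of the test):
// saturating doubling multiply, keeping the high half of the 64-bit
// product. As with sqdmull, saturation only triggers for
// INT32_MIN * INT32_MIN.
static int32_t sqdmulh_ref(int32_t a, int32_t b) {
  if (a == INT32_MIN && b == INT32_MIN)
    return INT32_MAX;
  return (int32_t)((2 * (int64_t)a * (int64_t)b) >> 32); // high half
}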
// CHECK-LABEL: test_vqrdmulhh_lane_s16
// CHECK-LABEL: define i16 @test_vqrdmulhh_lane_s16(i16 %a, <4 x i16> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[VGET_LANE]], i64 0
// CHECK: [[VQRDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]]) #2
// CHECK: [[TMP4:%.*]] = extractelement <4 x i16> [[VQRDMULHH_S16_I]], i64 0
// CHECK: ret i16 [[TMP4]]
int16_t test_vqrdmulhh_lane_s16(int16_t a, int16x4_t b) {
return vqrdmulhh_lane_s16(a, b, 3);
// CHECK: sqrdmulh {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{v[0-9]+}}.h[3]
}
// CHECK-LABEL: test_vqrdmulhs_lane_s32
// CHECK-LABEL: define i32 @test_vqrdmulhs_lane_s32(i32 %a, <2 x i32> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
// CHECK: [[VQRDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %a, i32 [[VGET_LANE]]) #2
// CHECK: ret i32 [[VQRDMULHS_S32_I]]
int32_t test_vqrdmulhs_lane_s32(int32_t a, int32x2_t b) {
return vqrdmulhs_lane_s32(a, b, 1);
// CHECK: sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
}
// CHECK-LABEL: test_vqrdmulhh_laneq_s16
// CHECK-LABEL: define i16 @test_vqrdmulhh_laneq_s16(i16 %a, <8 x i16> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[VGETQ_LANE]], i64 0
// CHECK: [[VQRDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]]) #2
// CHECK: [[TMP4:%.*]] = extractelement <4 x i16> [[VQRDMULHH_S16_I]], i64 0
// CHECK: ret i16 [[TMP4]]
int16_t test_vqrdmulhh_laneq_s16(int16_t a, int16x8_t b) {
return vqrdmulhh_laneq_s16(a, b, 7);
// CHECK: sqrdmulh {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{v[0-9]+}}.h[7]
}
// CHECK-LABEL: test_vqrdmulhs_laneq_s32
// CHECK-LABEL: define i32 @test_vqrdmulhs_laneq_s32(i32 %a, <4 x i32> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
// CHECK: [[VQRDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %a, i32 [[VGETQ_LANE]]) #2
// CHECK: ret i32 [[VQRDMULHS_S32_I]]
int32_t test_vqrdmulhs_laneq_s32(int32_t a, int32x4_t b) {
return vqrdmulhs_laneq_s32(a, b, 3);
// CHECK: sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
}
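// Scalar sketch of sqrdmulh (hypothetical helper): identical to sqdmulh
// except that a rounding constant of 1 << 31 is added before the high half
// is taken.
static int32_t sqrdmulh_ref(int32_t a, int32_t b) {
  if (a == INT32_MIN && b == INT32_MIN)
    return INT32_MAX;
  return (int32_t)((2 * (int64_t)a * (int64_t)b + (1LL << 31)) >> 32);
}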
// CHECK-LABEL: test_vqdmlalh_lane_s16
// CHECK-LABEL: define i32 @test_vqdmlalh_lane_s16(i32 %a, i16 %b, <4 x i16> %c) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %c to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[LANE]], i64 0
// CHECK: [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
// CHECK: [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0
// CHECK: [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %a, i32 [[LANE0]])
// CHECK: ret i32 [[VQDMLXL1]]
int32_t test_vqdmlalh_lane_s16(int32_t a, int16_t b, int16x4_t c) {
return vqdmlalh_lane_s16(a, b, c, 3);
// CHECK: sqdmlal {{s[0-9]+|v[0-9]+.4s}}, {{h[0-9]+|v[0-9]+.4h}}, {{v[0-9]+}}.h[3]
}
// CHECK-LABEL: test_vqdmlals_lane_s32
// CHECK-LABEL: define i64 @test_vqdmlals_lane_s32(i64 %a, i32 %b, <2 x i32> %c) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %c to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
// CHECK: [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 [[LANE]])
// CHECK: [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %a, i64 [[VQDMLXL]])
// CHECK: ret i64 [[VQDMLXL1]]
int64_t test_vqdmlals_lane_s32(int64_t a, int32_t b, int32x2_t c) {
return vqdmlals_lane_s32(a, b, c, 1);
// CHECK: sqdmlal {{d[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
}
// CHECK-LABEL: test_vqdmlalh_laneq_s16
// CHECK-LABEL: define i32 @test_vqdmlalh_laneq_s16(i32 %a, i16 %b, <8 x i16> %c) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %c to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[LANE]], i64 0
// CHECK: [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
// CHECK: [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0
// CHECK: [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %a, i32 [[LANE0]])
// CHECK: ret i32 [[VQDMLXL1]]
int32_t test_vqdmlalh_laneq_s16(int32_t a, int16_t b, int16x8_t c) {
return vqdmlalh_laneq_s16(a, b, c, 7);
// CHECK: sqdmlal {{s[0-9]+|v[0-9]+.4s}}, {{h[0-9]+|v[0-9]+.4h}}, {{v[0-9]+}}.h[7]
}
// CHECK-LABEL: test_vqdmlals_laneq_s32
// CHECK-LABEL: define i64 @test_vqdmlals_laneq_s32(i64 %a, i32 %b, <4 x i32> %c) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %c to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
// CHECK: [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 [[LANE]])
// CHECK: [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %a, i64 [[VQDMLXL]])
// CHECK: ret i64 [[VQDMLXL1]]
int64_t test_vqdmlals_laneq_s32(int64_t a, int32_t b, int32x4_t c) {
return vqdmlals_laneq_s32(a, b, c, 3);
// CHECK: sqdmlal {{d[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
}
// CHECK-LABEL: test_vqdmlslh_lane_s16
// CHECK-LABEL: define i32 @test_vqdmlslh_lane_s16(i32 %a, i16 %b, <4 x i16> %c) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %c to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[LANE]], i64 0
// CHECK: [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
// CHECK: [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0
// CHECK: [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %a, i32 [[LANE0]])
// CHECK: ret i32 [[VQDMLXL1]]
int32_t test_vqdmlslh_lane_s16(int32_t a, int16_t b, int16x4_t c) {
return vqdmlslh_lane_s16(a, b, c, 3);
// CHECK: sqdmlsl {{s[0-9]+|v[0-9]+.4s}}, {{h[0-9]+|v[0-9]+.4h}}, {{v[0-9]+}}.h[3]
}
// CHECK-LABEL: test_vqdmlsls_lane_s32
// CHECK-LABEL: define i64 @test_vqdmlsls_lane_s32(i64 %a, i32 %b, <2 x i32> %c) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %c to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
// CHECK: [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 [[LANE]])
// CHECK: [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %a, i64 [[VQDMLXL]])
// CHECK: ret i64 [[VQDMLXL1]]
int64_t test_vqdmlsls_lane_s32(int64_t a, int32_t b, int32x2_t c) {
return vqdmlsls_lane_s32(a, b, c, 1);
// CHECK: sqdmlsl {{d[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
}
// CHECK-LABEL: test_vqdmlslh_laneq_s16
// CHECK-LABEL: define i32 @test_vqdmlslh_laneq_s16(i32 %a, i16 %b, <8 x i16> %c) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %c to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[LANE]], i64 0
// CHECK: [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
// CHECK: [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0
// CHECK: [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %a, i32 [[LANE0]])
// CHECK: ret i32 [[VQDMLXL1]]
int32_t test_vqdmlslh_laneq_s16(int32_t a, int16_t b, int16x8_t c) {
return vqdmlslh_laneq_s16(a, b, c, 7);
// CHECK: sqdmlsl {{s[0-9]+|v[0-9]+.4s}}, {{h[0-9]+|v[0-9]+.4h}}, {{v[0-9]+}}.h[7]
}
// CHECK-LABEL: test_vqdmlsls_laneq_s32
// CHECK-LABEL: define i64 @test_vqdmlsls_laneq_s32(i64 %a, i32 %b, <4 x i32> %c) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %c to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
// CHECK: [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 [[LANE]])
// CHECK: [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %a, i64 [[VQDMLXL]])
// CHECK: ret i64 [[VQDMLXL1]]
int64_t test_vqdmlsls_laneq_s32(int64_t a, int32_t b, int32x4_t c) {
return vqdmlsls_laneq_s32(a, b, c, 3);
// CHECK: sqdmlsl {{d[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
}
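// The vqdmlal*/vqdmlsl* scalar forms above compose sqdmull with a saturating
// accumulate, which the IR shows as sqdmull followed by sqadd/sqsub. A sketch
// in terms of the public intrinsics (hypothetical helper names):
static int64_t vqdmlals_ref(int64_t a, int32_t b, int32_t c) {
  return vqaddd_s64(a, vqdmulls_s32(b, c)); // sqadd(a, sqdmull(b, c))
}
static int64_t vqdmlsls_ref(int64_t a, int32_t b, int32_t c) {
  return vqsubd_s64(a, vqdmulls_s32(b, c)); // sqsub(a, sqdmull(b, c))
}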
// CHECK-LABEL: test_vmulx_lane_f64_0:
// CHECK-LABEL: define <1 x double> @test_vmulx_lane_f64_0() #0 {
// CHECK: [[TMP0:%.*]] = bitcast i64 4599917171378402754 to <1 x double>
// CHECK: [[TMP1:%.*]] = bitcast i64 4606655882138939123 to <1 x double>
// CHECK: [[TMP2:%.*]] = bitcast <1 x double> [[TMP0]] to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP3]], i32 0
// CHECK: [[TMP4:%.*]] = bitcast <1 x double> [[TMP1]] to <8 x i8>
// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
// CHECK: [[VGET_LANE7:%.*]] = extractelement <1 x double> [[TMP5]], i32 0
// CHECK: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGET_LANE7]]) #2
// CHECK: [[TMP6:%.*]] = bitcast <1 x double> [[TMP0]] to <8 x i8>
// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP7]], double [[VMULXD_F64_I]], i32 0
// CHECK: ret <1 x double> [[VSET_LANE]]
float64x1_t test_vmulx_lane_f64_0() {
float64x1_t arg1;
float64x1_t arg2;
@@ -262,15 +512,24 @@ float64x1_t test_vmulx_lane_f64_0() {
arg1 = vcreate_f64(UINT64_C(0x3fd6304bc43ab5c2));
arg2 = vcreate_f64(UINT64_C(0x3fee211e215aeef3));
result = vmulx_lane_f64(arg1, arg2, 0);
// CHECK: adrp x[[ADDRLO:[0-9]+]]
// CHECK: ldr d0, [x[[ADDRLO]],
// CHECK: adrp x[[ADDRLO:[0-9]+]]
// CHECK: ldr d1, [x[[ADDRLO]],
// CHECK: fmulx d0, d1, d0
return result;
}
// CHECK-LABEL: test_vmulx_laneq_f64_2:
// CHECK-LABEL: define <1 x double> @test_vmulx_laneq_f64_2() #0 {
// CHECK: [[TMP0:%.*]] = bitcast i64 4599917171378402754 to <1 x double>
// CHECK: [[TMP1:%.*]] = bitcast i64 4606655882138939123 to <1 x double>
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <1 x double> [[TMP0]], <1 x double> [[TMP1]], <2 x i32> <i32 0, i32 1>
// CHECK: [[TMP2:%.*]] = bitcast <1 x double> [[TMP0]] to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP3]], i32 0
// CHECK: [[TMP4:%.*]] = bitcast <2 x double> [[SHUFFLE_I]] to <16 x i8>
// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[TMP5]], i32 1
// CHECK: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGETQ_LANE]]) #2
// CHECK: [[TMP6:%.*]] = bitcast <1 x double> [[TMP0]] to <8 x i8>
// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP7]], double [[VMULXD_F64_I]], i32 0
// CHECK: ret <1 x double> [[VSET_LANE]]
float64x1_t test_vmulx_laneq_f64_2() {
float64x1_t arg1;
float64x1_t arg2;
@@ -281,10 +540,5 @@ float64x1_t test_vmulx_laneq_f64_2() {
arg2 = vcreate_f64(UINT64_C(0x3fee211e215aeef3));
arg3 = vcombine_f64(arg1, arg2);
result = vmulx_laneq_f64(arg1, arg3, 1);
// CHECK: adrp x[[ADDRLO:[0-9]+]]
// CHECK: ldr d0, [x[[ADDRLO]],
// CHECK: adrp x[[ADDRLO:[0-9]+]]
// CHECK: ldr d1, [x[[ADDRLO]],
// CHECK: fmulx d0, d1, d0
return result;
}
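// fmulx, exercised throughout this file, multiplies like fmul except that
// 0 * +/-infinity yields +/-2.0 rather than NaN. A scalar sketch of that
// special case (hypothetical helper, assuming <math.h>):
#include <math.h>
static double fmulx_ref(double a, double b) {
  if ((a == 0.0 && isinf(b)) || (isinf(a) && b == 0.0))
    return copysign(2.0, a) * copysign(1.0, b); // sign is the XOR of signs
  return a * b;
}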


@@ -1,6 +1,5 @@
// REQUIRES: aarch64-registered-target
// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
// RUN: -ffp-contract=fast -S -emit-llvm -O1 -o - %s | FileCheck %s
// RUN: -ffp-contract=fast -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
#include <arm_neon.h>
@@ -25,19 +24,20 @@ uint8x8_t test_shift_vshr_umax(uint8x8_t a) {
uint8x8_t test_shift_vsra(uint8x8_t a, uint8x8_t b) {
// CHECK-LABEL: test_shift_vsra
// CHECK: %[[SHR:.*]] = lshr <8 x i8> %b, <i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5>
// CHECK: %{{.*}} = add <8 x i8> %[[SHR]], %a
// CHECK: %{{.*}} = add <8 x i8> %a, %[[SHR]]
return vsra_n_u8(a, b, 5);
}
int8x8_t test_shift_vsra_smax(int8x8_t a, int8x8_t b) {
// CHECK-LABEL: test_shift_vsra_smax
// CHECK: %[[SHR:.*]] = ashr <8 x i8> %b, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
// CHECK: %{{.*}} = add <8 x i8> %[[SHR]], %a
// CHECK: %{{.*}} = add <8 x i8> %a, %[[SHR]]
return vsra_n_s8(a, b, 8);
}
uint8x8_t test_shift_vsra_umax(uint8x8_t a, uint8x8_t b) {
// CHECK-LABEL: test_shift_vsra_umax
// CHECK: ret <8 x i8> %a
// CHECK: [[RES:%.*]] = add <8 x i8> %a, zeroinitializer
// CHECK: ret <8 x i8> [[RES]]
return vsra_n_u8(a, b, 8);
}
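// Per-lane model of vsra_n_u8 (hypothetical helper, not part of the test):
// r[i] = a[i] + (b[i] >> n). For n == 8, the full element width, the
// unsigned shifted addend is zero, which is why test_shift_vsra_umax above
// reduces to "add <8 x i8> %a, zeroinitializer".
static uint8x8_t vsra_n_u8_ref(uint8x8_t a, uint8x8_t b, unsigned n) {
  uint8_t av[8], bv[8], rv[8];
  vst1_u8(av, a);
  vst1_u8(bv, b);
  for (int i = 0; i < 8; ++i)
    rv[i] = (uint8_t)(av[i] + (n >= 8 ? 0u : (uint8_t)(bv[i] >> n)));
  return vld1_u8(rv);
}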

File diff suppressed because it is too large


@@ -1,90 +1,103 @@
// REQUIRES: aarch64-registered-target
// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -S -O3 -o - %s | FileCheck %s
// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -fallow-half-arguments-and-returns -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
// Test new aarch64 intrinsics and types
#include <arm_neon.h>
// CHECK-LABEL: define <16 x i8> @test_vcombine_s8(<8 x i8> %low, <8 x i8> %high) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %low, <8 x i8> %high, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK: ret <16 x i8> [[SHUFFLE_I]]
int8x16_t test_vcombine_s8(int8x8_t low, int8x8_t high) {
// CHECK-LABEL: test_vcombine_s8:
return vcombine_s8(low, high);
// CHECK: ins v0.d[1], v1.d[0]
}
// CHECK-LABEL: define <8 x i16> @test_vcombine_s16(<4 x i16> %low, <4 x i16> %high) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %low, <4 x i16> %high, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// CHECK: ret <8 x i16> [[SHUFFLE_I]]
int16x8_t test_vcombine_s16(int16x4_t low, int16x4_t high) {
// CHECK-LABEL: test_vcombine_s16:
return vcombine_s16(low, high);
// CHECK: ins v0.d[1], v1.d[0]
}
// CHECK-LABEL: define <4 x i32> @test_vcombine_s32(<2 x i32> %low, <2 x i32> %high) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %low, <2 x i32> %high, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK: ret <4 x i32> [[SHUFFLE_I]]
int32x4_t test_vcombine_s32(int32x2_t low, int32x2_t high) {
// CHECK-LABEL: test_vcombine_s32:
return vcombine_s32(low, high);
// CHECK: ins v0.d[1], v1.d[0]
}
// CHECK-LABEL: define <2 x i64> @test_vcombine_s64(<1 x i64> %low, <1 x i64> %high) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <1 x i64> %low, <1 x i64> %high, <2 x i32> <i32 0, i32 1>
// CHECK: ret <2 x i64> [[SHUFFLE_I]]
int64x2_t test_vcombine_s64(int64x1_t low, int64x1_t high) {
// CHECK-LABEL: test_vcombine_s64:
return vcombine_s64(low, high);
// CHECK: ins v0.d[1], v1.d[0]
}
// CHECK-LABEL: define <16 x i8> @test_vcombine_u8(<8 x i8> %low, <8 x i8> %high) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %low, <8 x i8> %high, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK: ret <16 x i8> [[SHUFFLE_I]]
uint8x16_t test_vcombine_u8(uint8x8_t low, uint8x8_t high) {
// CHECK-LABEL: test_vcombine_u8:
return vcombine_u8(low, high);
// CHECK: ins v0.d[1], v1.d[0]
}
// CHECK-LABEL: define <8 x i16> @test_vcombine_u16(<4 x i16> %low, <4 x i16> %high) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %low, <4 x i16> %high, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// CHECK: ret <8 x i16> [[SHUFFLE_I]]
uint16x8_t test_vcombine_u16(uint16x4_t low, uint16x4_t high) {
// CHECK-LABEL: test_vcombine_u16:
return vcombine_u16(low, high);
// CHECK: ins v0.d[1], v1.d[0]
}
// CHECK-LABEL: define <4 x i32> @test_vcombine_u32(<2 x i32> %low, <2 x i32> %high) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %low, <2 x i32> %high, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK: ret <4 x i32> [[SHUFFLE_I]]
uint32x4_t test_vcombine_u32(uint32x2_t low, uint32x2_t high) {
// CHECK-LABEL: test_vcombine_u32:
return vcombine_u32(low, high);
// CHECK: ins v0.d[1], v1.d[0]
}
// CHECK-LABEL: define <2 x i64> @test_vcombine_u64(<1 x i64> %low, <1 x i64> %high) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <1 x i64> %low, <1 x i64> %high, <2 x i32> <i32 0, i32 1>
// CHECK: ret <2 x i64> [[SHUFFLE_I]]
uint64x2_t test_vcombine_u64(uint64x1_t low, uint64x1_t high) {
// CHECK-LABEL: test_vcombine_u64:
return vcombine_u64(low, high);
// CHECK: ins v0.d[1], v1.d[0]
}
// CHECK-LABEL: define <2 x i64> @test_vcombine_p64(<1 x i64> %low, <1 x i64> %high) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <1 x i64> %low, <1 x i64> %high, <2 x i32> <i32 0, i32 1>
// CHECK: ret <2 x i64> [[SHUFFLE_I]]
poly64x2_t test_vcombine_p64(poly64x1_t low, poly64x1_t high) {
// CHECK-LABEL: test_vcombine_p64:
return vcombine_p64(low, high);
// CHECK: ins v0.d[1], v1.d[0]
}
// CHECK-LABEL: define <8 x half> @test_vcombine_f16(<4 x half> %low, <4 x half> %high) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x half> %low, <4 x half> %high, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// CHECK: ret <8 x half> [[SHUFFLE_I]]
float16x8_t test_vcombine_f16(float16x4_t low, float16x4_t high) {
// CHECK-LABEL: test_vcombine_f16:
return vcombine_f16(low, high);
// CHECK: ins v0.d[1], v1.d[0]
}
// CHECK-LABEL: define <4 x float> @test_vcombine_f32(<2 x float> %low, <2 x float> %high) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %low, <2 x float> %high, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK: ret <4 x float> [[SHUFFLE_I]]
float32x4_t test_vcombine_f32(float32x2_t low, float32x2_t high) {
// CHECK-LABEL: test_vcombine_f32:
return vcombine_f32(low, high);
// CHECK: ins v0.d[1], v1.d[0]
}
// CHECK-LABEL: define <16 x i8> @test_vcombine_p8(<8 x i8> %low, <8 x i8> %high) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %low, <8 x i8> %high, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK: ret <16 x i8> [[SHUFFLE_I]]
poly8x16_t test_vcombine_p8(poly8x8_t low, poly8x8_t high) {
// CHECK-LABEL: test_vcombine_p8:
return vcombine_p8(low, high);
// CHECK: ins v0.d[1], v1.d[0]
}
// CHECK-LABEL: define <8 x i16> @test_vcombine_p16(<4 x i16> %low, <4 x i16> %high) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %low, <4 x i16> %high, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// CHECK: ret <8 x i16> [[SHUFFLE_I]]
poly16x8_t test_vcombine_p16(poly16x4_t low, poly16x4_t high) {
// CHECK-LABEL: test_vcombine_p16:
return vcombine_p16(low, high);
// CHECK: ins v0.d[1], v1.d[0]
}
// CHECK-LABEL: define <2 x double> @test_vcombine_f64(<1 x double> %low, <1 x double> %high) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <1 x double> %low, <1 x double> %high, <2 x i32> <i32 0, i32 1>
// CHECK: ret <2 x double> [[SHUFFLE_I]]
float64x2_t test_vcombine_f64(float64x1_t low, float64x1_t high) {
// CHECK-LABEL: test_vcombine_f64:
return vcombine_f64(low, high);
// CHECK: ins v0.d[1], v1.d[0]
}
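// vcombine_* concatenates two 64-bit vectors into one 128-bit vector, which
// is exactly the shufflevector pattern checked above. A memory-based sketch
// (hypothetical helper, not part of the test):
static int8x16_t vcombine_s8_ref(int8x8_t low, int8x8_t high) {
  int8_t v[16];
  vst1_s8(v, low);      // lanes 0..7
  vst1_s8(v + 8, high); // lanes 8..15
  return vld1q_s8(v);
}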


@@ -1,176 +1,203 @@
// REQUIRES: aarch64-registered-target
// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
// RUN: -ffp-contract=fast -S -O3 -o - %s | FileCheck %s --check-prefix CHECK-COMMON --check-prefix CHECK-ARM64
// RUN: -fallow-half-arguments-and-returns -emit-llvm -o - %s \
// RUN: | opt -S -mem2reg | FileCheck %s
// Test new aarch64 intrinsics and types
#include <arm_neon.h>
// CHECK-LABEL: define <8 x i8> @test_vget_high_s8(<16 x i8> %a) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK: ret <8 x i8> [[SHUFFLE_I]]
int8x8_t test_vget_high_s8(int8x16_t a) {
// CHECK-COMMON-LABEL: test_vget_high_s8:
return vget_high_s8(a);
// CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8
}
// CHECK-LABEL: define <4 x i16> @test_vget_high_s16(<8 x i16> %a) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: ret <4 x i16> [[SHUFFLE_I]]
int16x4_t test_vget_high_s16(int16x8_t a) {
// CHECK-COMMON-LABEL: test_vget_high_s16:
return vget_high_s16(a);
// CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8
}
// CHECK-LABEL: define <2 x i32> @test_vget_high_s32(<4 x i32> %a) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
// CHECK: ret <2 x i32> [[SHUFFLE_I]]
int32x2_t test_vget_high_s32(int32x4_t a) {
// CHECK-COMMON-LABEL: test_vget_high_s32:
return vget_high_s32(a);
// CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8
}
// CHECK-LABEL: define <1 x i64> @test_vget_high_s64(<2 x i64> %a) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> <i32 1>
// CHECK: ret <1 x i64> [[SHUFFLE_I]]
int64x1_t test_vget_high_s64(int64x2_t a) {
// CHECK-COMMON-LABEL: test_vget_high_s64:
return vget_high_s64(a);
// CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8
}
// CHECK-LABEL: define <8 x i8> @test_vget_high_u8(<16 x i8> %a) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK: ret <8 x i8> [[SHUFFLE_I]]
uint8x8_t test_vget_high_u8(uint8x16_t a) {
// CHECK-COMMON-LABEL: test_vget_high_u8:
return vget_high_u8(a);
// CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8
}
// CHECK-LABEL: define <4 x i16> @test_vget_high_u16(<8 x i16> %a) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: ret <4 x i16> [[SHUFFLE_I]]
uint16x4_t test_vget_high_u16(uint16x8_t a) {
// CHECK-COMMON-LABEL: test_vget_high_u16:
return vget_high_u16(a);
// CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8
}
// CHECK-LABEL: define <2 x i32> @test_vget_high_u32(<4 x i32> %a) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
// CHECK: ret <2 x i32> [[SHUFFLE_I]]
uint32x2_t test_vget_high_u32(uint32x4_t a) {
// CHECK-COMMON-LABEL: test_vget_high_u32:
return vget_high_u32(a);
// CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8
}
// CHECK-LABEL: define <1 x i64> @test_vget_high_u64(<2 x i64> %a) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> <i32 1>
// CHECK: ret <1 x i64> [[SHUFFLE_I]]
uint64x1_t test_vget_high_u64(uint64x2_t a) {
// CHECK-COMMON-LABEL: test_vget_high_u64:
return vget_high_u64(a);
// CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8
}
// CHECK-LABEL: define <1 x i64> @test_vget_high_p64(<2 x i64> %a) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> <i32 1>
// CHECK: ret <1 x i64> [[SHUFFLE_I]]
poly64x1_t test_vget_high_p64(poly64x2_t a) {
// CHECK-COMMON-LABEL: test_vget_high_p64:
return vget_high_p64(a);
// CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8
}
// CHECK-LABEL: define <4 x half> @test_vget_high_f16(<8 x half> %a) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: ret <4 x half> [[SHUFFLE_I]]
float16x4_t test_vget_high_f16(float16x8_t a) {
// CHECK-COMMON-LABEL: test_vget_high_f16:
return vget_high_f16(a);
// CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8
}
// CHECK-LABEL: define <2 x float> @test_vget_high_f32(<4 x float> %a) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %a, <2 x i32> <i32 2, i32 3>
// CHECK: ret <2 x float> [[SHUFFLE_I]]
float32x2_t test_vget_high_f32(float32x4_t a) {
// CHECK-COMMON-LABEL: test_vget_high_f32:
return vget_high_f32(a);
// CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8
}
// CHECK-LABEL: define <8 x i8> @test_vget_high_p8(<16 x i8> %a) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK: ret <8 x i8> [[SHUFFLE_I]]
poly8x8_t test_vget_high_p8(poly8x16_t a) {
// CHECK-COMMON-LABEL: test_vget_high_p8:
return vget_high_p8(a);
// CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8
}
// CHECK-LABEL: define <4 x i16> @test_vget_high_p16(<8 x i16> %a) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: ret <4 x i16> [[SHUFFLE_I]]
poly16x4_t test_vget_high_p16(poly16x8_t a) {
// CHECK-COMMON-LABEL: test_vget_high_p16
return vget_high_p16(a);
// CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8
}
// CHECK-LABEL: define <1 x double> @test_vget_high_f64(<2 x double> %a) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> %a, <2 x double> %a, <1 x i32> <i32 1>
// CHECK: ret <1 x double> [[SHUFFLE_I]]
float64x1_t test_vget_high_f64(float64x2_t a) {
// CHECK-COMMON-LABEL: test_vget_high_f64
return vget_high_f64(a);
// CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8
}
// CHECK-LABEL: define <8 x i8> @test_vget_low_s8(<16 x i8> %a) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// CHECK: ret <8 x i8> [[SHUFFLE_I]]
int8x8_t test_vget_low_s8(int8x16_t a) {
// CHECK-COMMON-LABEL: test_vget_low_s8:
return vget_low_s8(a);
// CHECK-COMMON-NEXT: ret
}
// CHECK-LABEL: define <4 x i16> @test_vget_low_s16(<8 x i16> %a) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK: ret <4 x i16> [[SHUFFLE_I]]
int16x4_t test_vget_low_s16(int16x8_t a) {
// CHECK-COMMON-LABEL: test_vget_low_s16:
return vget_low_s16(a);
// CHECK-COMMON-NEXT: ret
}
// CHECK-LABEL: define <2 x i32> @test_vget_low_s32(<4 x i32> %a) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 0, i32 1>
// CHECK: ret <2 x i32> [[SHUFFLE_I]]
int32x2_t test_vget_low_s32(int32x4_t a) {
// CHECK-COMMON-LABEL: test_vget_low_s32:
return vget_low_s32(a);
// CHECK-COMMON-NEXT: ret
}
// CHECK-LABEL: define <1 x i64> @test_vget_low_s64(<2 x i64> %a) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> zeroinitializer
// CHECK: ret <1 x i64> [[SHUFFLE_I]]
int64x1_t test_vget_low_s64(int64x2_t a) {
// CHECK-COMMON-LABEL: test_vget_low_s64:
return vget_low_s64(a);
// CHECK-COMMON-NEXT: ret
}
// CHECK-LABEL: define <8 x i8> @test_vget_low_u8(<16 x i8> %a) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// CHECK: ret <8 x i8> [[SHUFFLE_I]]
uint8x8_t test_vget_low_u8(uint8x16_t a) {
// CHECK-COMMON-LABEL: test_vget_low_u8:
return vget_low_u8(a);
// CHECK-COMMON-NEXT: ret
}
// CHECK-LABEL: define <4 x i16> @test_vget_low_u16(<8 x i16> %a) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK: ret <4 x i16> [[SHUFFLE_I]]
uint16x4_t test_vget_low_u16(uint16x8_t a) {
// CHECK-COMMON-LABEL: test_vget_low_u16:
return vget_low_u16(a);
// CHECK-COMMON-NEXT: ret
}
// CHECK-LABEL: define <2 x i32> @test_vget_low_u32(<4 x i32> %a) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 0, i32 1>
// CHECK: ret <2 x i32> [[SHUFFLE_I]]
uint32x2_t test_vget_low_u32(uint32x4_t a) {
// CHECK-COMMON-LABEL: test_vget_low_u32:
return vget_low_u32(a);
// CHECK-COMMON-NEXT: ret
}
// CHECK-LABEL: define <1 x i64> @test_vget_low_u64(<2 x i64> %a) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> zeroinitializer
// CHECK: ret <1 x i64> [[SHUFFLE_I]]
uint64x1_t test_vget_low_u64(uint64x2_t a) {
// CHECK-COMMON-LABEL: test_vget_low_u64:
return vget_low_u64(a);
// CHECK-COMMON-NEXT: ret
}
// CHECK-LABEL: define <1 x i64> @test_vget_low_p64(<2 x i64> %a) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> zeroinitializer
// CHECK: ret <1 x i64> [[SHUFFLE_I]]
poly64x1_t test_vget_low_p64(poly64x2_t a) {
// CHECK-COMMON-LABEL: test_vget_low_p64:
return vget_low_p64(a);
// CHECK-COMMON-NEXT: ret
}
// CHECK-LABEL: define <4 x half> @test_vget_low_f16(<8 x half> %a) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK: ret <4 x half> [[SHUFFLE_I]]
float16x4_t test_vget_low_f16(float16x8_t a) {
// CHECK-COMMON-LABEL: test_vget_low_f16:
return vget_low_f16(a);
// CHECK-COMMON-NEXT: ret
}
// CHECK-LABEL: define <2 x float> @test_vget_low_f32(<4 x float> %a) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %a, <2 x i32> <i32 0, i32 1>
// CHECK: ret <2 x float> [[SHUFFLE_I]]
float32x2_t test_vget_low_f32(float32x4_t a) {
// CHECK-COMMON-LABEL: test_vget_low_f32:
return vget_low_f32(a);
// CHECK-COMMON-NEXT: ret
}
// CHECK-LABEL: define <8 x i8> @test_vget_low_p8(<16 x i8> %a) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// CHECK: ret <8 x i8> [[SHUFFLE_I]]
poly8x8_t test_vget_low_p8(poly8x16_t a) {
// CHECK-COMMON-LABEL: test_vget_low_p8:
return vget_low_p8(a);
// CHECK-COMMON-NEXT: ret
}
// CHECK-LABEL: define <4 x i16> @test_vget_low_p16(<8 x i16> %a) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK: ret <4 x i16> [[SHUFFLE_I]]
poly16x4_t test_vget_low_p16(poly16x8_t a) {
// CHECK-COMMON-LABEL: test_vget_low_p16:
return vget_low_p16(a);
// CHECK-COMMON-NEXT: ret
}
// CHECK-LABEL: define <1 x double> @test_vget_low_f64(<2 x double> %a) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> %a, <2 x double> %a, <1 x i32> zeroinitializer
// CHECK: ret <1 x double> [[SHUFFLE_I]]
float64x1_t test_vget_low_f64(float64x2_t a) {
// CHECK-COMMON-LABEL: test_vget_low_f64:
return vget_low_f64(a);
// CHECK-COMMON-NEXT: ret
}
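// vget_high_* and vget_low_* select the upper or lower 64-bit half of a
// 128-bit vector; the shufflevector masks above (lanes 8..15 versus 0..7)
// encode exactly that. Memory-based sketches (hypothetical helpers):
static int8x8_t vget_high_s8_ref(int8x16_t a) {
  int8_t v[16];
  vst1q_s8(v, a);
  return vld1_s8(v + 8); // lanes 8..15
}
static int8x8_t vget_low_s8_ref(int8x16_t a) {
  int8_t v[16];
  vst1q_s8(v, a);
  return vld1_s8(v);     // lanes 0..7
}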


@@ -1,348 +1,458 @@
// REQUIRES: aarch64-registered-target
// RUN: %clang_cc1 -triple arm64-apple-darwin -target-feature +neon \
// RUN: -ffp-contract=fast -S -O3 -o - %s | FileCheck %s
// RUN: -fallow-half-arguments-and-returns -emit-llvm -o - %s \
// RUN: | opt -S -mem2reg | FileCheck %s
#include <arm_neon.h>
// CHECK-LABEL: define i8 @test_vget_lane_u8(<8 x i8> %a) #0 {
// CHECK: [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7
// CHECK: ret i8 [[VGET_LANE]]
uint8_t test_vget_lane_u8(uint8x8_t a) {
// CHECK-LABEL: test_vget_lane_u8:
// CHECK-NEXT: umov.b w0, v0[7]
// CHECK-NEXT: ret
return vget_lane_u8(a, 7);
}
// CHECK-LABEL: define i16 @test_vget_lane_u16(<4 x i16> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
// CHECK: ret i16 [[VGET_LANE]]
uint16_t test_vget_lane_u16(uint16x4_t a) {
// CHECK-LABEL: test_vget_lane_u16:
// CHECK-NEXT: umov.h w0, v0[3]
// CHECK-NEXT: ret
return vget_lane_u16(a, 3);
}
// CHECK-LABEL: define i32 @test_vget_lane_u32(<2 x i32> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
// CHECK: ret i32 [[VGET_LANE]]
uint32_t test_vget_lane_u32(uint32x2_t a) {
// CHECK-LABEL: test_vget_lane_u32:
// CHECK-NEXT: mov.s w0, v0[1]
// CHECK-NEXT: ret
return vget_lane_u32(a, 1);
}
// CHECK-LABEL: define i8 @test_vget_lane_s8(<8 x i8> %a) #0 {
// CHECK: [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7
// CHECK: ret i8 [[VGET_LANE]]
int8_t test_vget_lane_s8(int8x8_t a) {
// CHECK-LABEL: test_vget_lane_s8:
// CHECK-NEXT: umov.b w0, v0[7]
// CHECK-NEXT: ret
return vget_lane_s8(a, 7);
}
// CHECK-LABEL: define i16 @test_vget_lane_s16(<4 x i16> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
// CHECK: ret i16 [[VGET_LANE]]
int16_t test_vget_lane_s16(int16x4_t a) {
// CHECK-LABEL: test_vget_lane_s16:
// CHECK-NEXT: umov.h w0, v0[3]
// CHECK-NEXT: ret
return vget_lane_s16(a, 3);
}
// CHECK-LABEL: define i32 @test_vget_lane_s32(<2 x i32> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
// CHECK: ret i32 [[VGET_LANE]]
int32_t test_vget_lane_s32(int32x2_t a) {
// CHECK-LABEL: test_vget_lane_s32:
// CHECK-NEXT: mov.s w0, v0[1]
// CHECK-NEXT: ret
return vget_lane_s32(a, 1);
}
// CHECK-LABEL: define i8 @test_vget_lane_p8(<8 x i8> %a) #0 {
// CHECK: [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7
// CHECK: ret i8 [[VGET_LANE]]
poly8_t test_vget_lane_p8(poly8x8_t a) {
// CHECK-LABEL: test_vget_lane_p8:
// CHECK-NEXT: umov.b w0, v0[7]
// CHECK-NEXT: ret
return vget_lane_p8(a, 7);
}
// CHECK-LABEL: define i16 @test_vget_lane_p16(<4 x i16> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
// CHECK: ret i16 [[VGET_LANE]]
poly16_t test_vget_lane_p16(poly16x4_t a) {
// CHECK-LABEL: test_vget_lane_p16:
// CHECK-NEXT: umov.h w0, v0[3]
// CHECK-NEXT: ret
return vget_lane_p16(a, 3);
}
// CHECK-LABEL: define float @test_vget_lane_f32(<2 x float> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
// CHECK: ret float [[VGET_LANE]]
float32_t test_vget_lane_f32(float32x2_t a) {
// CHECK-LABEL: test_vget_lane_f32:
// CHECK-NEXT: mov s0, v0[1]
// CHECK-NEXT: ret
return vget_lane_f32(a, 1);
}
// CHECK-LABEL: define float @test_vget_lane_f16(<4 x half> %a) #0 {
// CHECK: [[__REINT_242:%.*]] = alloca <4 x half>, align 8
// CHECK: [[__REINT1_242:%.*]] = alloca i16, align 2
// CHECK: store <4 x half> %a, <4 x half>* [[__REINT_242]], align 8
// CHECK: [[TMP0:%.*]] = bitcast <4 x half>* [[__REINT_242]] to <4 x i16>*
// CHECK: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 8
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP3]], i32 1
// CHECK: store i16 [[VGET_LANE]], i16* [[__REINT1_242]], align 2
// CHECK: [[TMP4:%.*]] = bitcast i16* [[__REINT1_242]] to half*
// CHECK: [[TMP5:%.*]] = load half, half* [[TMP4]], align 2
// CHECK: [[CONV:%.*]] = fpext half [[TMP5]] to float
// CHECK: ret float [[CONV]]
float32_t test_vget_lane_f16(float16x4_t a) {
// CHECK-LABEL: test_vget_lane_f16:
// CHECK-NEXT: umov.h w8, v0[1]
// CHECK-NEXT: fmov s0, w8
// CHECK-NEXT: fcvt s0, h0
// CHECK-NEXT: ret
return vget_lane_f16(a, 1);
}
// CHECK-LABEL: define i8 @test_vgetq_lane_u8(<16 x i8> %a) #0 {
// CHECK: [[VGETQ_LANE:%.*]] = extractelement <16 x i8> %a, i32 15
// CHECK: ret i8 [[VGETQ_LANE]]
uint8_t test_vgetq_lane_u8(uint8x16_t a) {
// CHECK-LABEL: test_vgetq_lane_u8:
// CHECK-NEXT: umov.b w0, v0[15]
// CHECK-NEXT: ret
return vgetq_lane_u8(a, 15);
}
// CHECK-LABEL: define i16 @test_vgetq_lane_u16(<8 x i16> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
// CHECK: ret i16 [[VGETQ_LANE]]
uint16_t test_vgetq_lane_u16(uint16x8_t a) {
// CHECK-LABEL: test_vgetq_lane_u16:
// CHECK-NEXT: umov.h w0, v0[7]
// CHECK-NEXT: ret
return vgetq_lane_u16(a, 7);
}
// CHECK-LABEL: define i32 @test_vgetq_lane_u32(<4 x i32> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
// CHECK: ret i32 [[VGETQ_LANE]]
uint32_t test_vgetq_lane_u32(uint32x4_t a) {
// CHECK-LABEL: test_vgetq_lane_u32:
// CHECK-NEXT: mov.s w0, v0[3]
// CHECK-NEXT: ret
return vgetq_lane_u32(a, 3);
}
// CHECK-LABEL: define i8 @test_vgetq_lane_s8(<16 x i8> %a) #0 {
// CHECK: [[VGETQ_LANE:%.*]] = extractelement <16 x i8> %a, i32 15
// CHECK: ret i8 [[VGETQ_LANE]]
int8_t test_vgetq_lane_s8(int8x16_t a) {
// CHECK-LABEL: test_vgetq_lane_s8:
// CHECK-NEXT: umov.b w0, v0[15]
// CHECK-NEXT: ret
return vgetq_lane_s8(a, 15);
}
// CHECK-LABEL: define i16 @test_vgetq_lane_s16(<8 x i16> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
// CHECK: ret i16 [[VGETQ_LANE]]
int16_t test_vgetq_lane_s16(int16x8_t a) {
// CHECK-LABEL: test_vgetq_lane_s16:
// CHECK-NEXT: umov.h w0, v0[7]
// CHECK-NEXT: ret
return vgetq_lane_s16(a, 7);
}
// CHECK-LABEL: define i32 @test_vgetq_lane_s32(<4 x i32> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
// CHECK: ret i32 [[VGETQ_LANE]]
int32_t test_vgetq_lane_s32(int32x4_t a) {
// CHECK-LABEL: test_vgetq_lane_s32:
// CHECK-NEXT: mov.s w0, v0[3]
// CHECK-NEXT: ret
return vgetq_lane_s32(a, 3);
}
// CHECK-LABEL: define i8 @test_vgetq_lane_p8(<16 x i8> %a) #0 {
// CHECK: [[VGETQ_LANE:%.*]] = extractelement <16 x i8> %a, i32 15
// CHECK: ret i8 [[VGETQ_LANE]]
poly8_t test_vgetq_lane_p8(poly8x16_t a) {
// CHECK-LABEL: test_vgetq_lane_p8:
// CHECK-NEXT: umov.b w0, v0[15]
// CHECK-NEXT: ret
return vgetq_lane_p8(a, 15);
}
// CHECK-LABEL: define i16 @test_vgetq_lane_p16(<8 x i16> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
// CHECK: ret i16 [[VGETQ_LANE]]
poly16_t test_vgetq_lane_p16(poly16x8_t a) {
// CHECK-LABEL: test_vgetq_lane_p16:
// CHECK-NEXT: umov.h w0, v0[7]
// CHECK-NEXT: ret
return vgetq_lane_p16(a, 7);
}
// CHECK-LABEL: define float @test_vgetq_lane_f32(<4 x float> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
// CHECK: ret float [[VGETQ_LANE]]
float32_t test_vgetq_lane_f32(float32x4_t a) {
// CHECK-LABEL: test_vgetq_lane_f32:
// CHECK-NEXT: mov s0, v0[3]
// CHECK-NEXT: ret
return vgetq_lane_f32(a, 3);
}
// CHECK-LABEL: define float @test_vgetq_lane_f16(<8 x half> %a) #0 {
// CHECK: [[__REINT_244:%.*]] = alloca <8 x half>, align 16
// CHECK: [[__REINT1_244:%.*]] = alloca i16, align 2
// CHECK: store <8 x half> %a, <8 x half>* [[__REINT_244]], align 16
// CHECK: [[TMP0:%.*]] = bitcast <8 x half>* [[__REINT_244]] to <8 x i16>*
// CHECK: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 16
// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
// CHECK: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3
// CHECK: store i16 [[VGETQ_LANE]], i16* [[__REINT1_244]], align 2
// CHECK: [[TMP4:%.*]] = bitcast i16* [[__REINT1_244]] to half*
// CHECK: [[TMP5:%.*]] = load half, half* [[TMP4]], align 2
// CHECK: [[CONV:%.*]] = fpext half [[TMP5]] to float
// CHECK: ret float [[CONV]]
float32_t test_vgetq_lane_f16(float16x8_t a) {
// CHECK-LABEL: test_vgetq_lane_f16:
// CHECK-NEXT: umov.h w8, v0[3]
// CHECK-NEXT: fmov s0, w8
// CHECK-NEXT: fcvt s0, h0
// CHECK-NEXT: ret
return vgetq_lane_f16(a, 3);
}
// CHECK-LABEL: define i64 @test_vget_lane_s64(<1 x i64> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0
// CHECK: ret i64 [[VGET_LANE]]
int64_t test_vget_lane_s64(int64x1_t a) {
// CHECK-LABEL: test_vget_lane_s64:
// CHECK-NEXT: fmov x0, d0
// CHECK-NEXT: ret
return vget_lane_s64(a, 0);
}
// CHECK-LABEL: define i64 @test_vget_lane_u64(<1 x i64> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0
// CHECK: ret i64 [[VGET_LANE]]
uint64_t test_vget_lane_u64(uint64x1_t a) {
// CHECK-LABEL: test_vget_lane_u64:
// CHECK-NEXT: fmov x0, d0
// CHECK-NEXT: ret
return vget_lane_u64(a, 0);
}
// CHECK-LABEL: define i64 @test_vgetq_lane_s64(<2 x i64> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
// CHECK: ret i64 [[VGETQ_LANE]]
int64_t test_vgetq_lane_s64(int64x2_t a) {
// CHECK-LABEL: test_vgetq_lane_s64:
// CHECK-NEXT: mov.d x0, v0[1]
// CHECK-NEXT: ret
return vgetq_lane_s64(a, 1);
}
// CHECK-LABEL: define i64 @test_vgetq_lane_u64(<2 x i64> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
// CHECK: ret i64 [[VGETQ_LANE]]
uint64_t test_vgetq_lane_u64(uint64x2_t a) {
// CHECK-LABEL: test_vgetq_lane_u64:
// CHECK-NEXT: mov.d x0, v0[1]
// CHECK-NEXT: ret
return vgetq_lane_u64(a, 1);
}
// CHECK-LABEL: define <8 x i8> @test_vset_lane_u8(i8 %a, <8 x i8> %b) #0 {
// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i8> %b, i8 %a, i32 7
// CHECK: ret <8 x i8> [[VSET_LANE]]
uint8x8_t test_vset_lane_u8(uint8_t a, uint8x8_t b) {
// CHECK-LABEL: test_vset_lane_u8:
// CHECK-NEXT: ins.b v0[7], w0
// CHECK-NEXT: ret
return vset_lane_u8(a, b, 7);
}
// CHECK-LABEL: define <4 x i16> @test_vset_lane_u16(i16 %a, <4 x i16> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x i16> [[TMP1]], i16 %a, i32 3
// CHECK: ret <4 x i16> [[VSET_LANE]]
uint16x4_t test_vset_lane_u16(uint16_t a, uint16x4_t b) {
// CHECK-LABEL: test_vset_lane_u16:
// CHECK-NEXT: ins.h v0[3], w0
// CHECK-NEXT: ret
return vset_lane_u16(a, b, 3);
}
// CHECK-LABEL: define <2 x i32> @test_vset_lane_u32(i32 %a, <2 x i32> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x i32> [[TMP1]], i32 %a, i32 1
// CHECK: ret <2 x i32> [[VSET_LANE]]
uint32x2_t test_vset_lane_u32(uint32_t a, uint32x2_t b) {
// CHECK-LABEL: test_vset_lane_u32:
// CHECK-NEXT: ins.s v0[1], w0
// CHECK-NEXT: ret
return vset_lane_u32(a, b, 1);
}
// CHECK-LABEL: define <8 x i8> @test_vset_lane_s8(i8 %a, <8 x i8> %b) #0 {
// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i8> %b, i8 %a, i32 7
// CHECK: ret <8 x i8> [[VSET_LANE]]
int8x8_t test_vset_lane_s8(int8_t a, int8x8_t b) {
// CHECK-LABEL: test_vset_lane_s8:
// CHECK-NEXT: ins.b v0[7], w0
// CHECK-NEXT: ret
return vset_lane_s8(a, b, 7);
}
// CHECK-LABEL: define <4 x i16> @test_vset_lane_s16(i16 %a, <4 x i16> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x i16> [[TMP1]], i16 %a, i32 3
// CHECK: ret <4 x i16> [[VSET_LANE]]
int16x4_t test_vset_lane_s16(int16_t a, int16x4_t b) {
// CHECK-LABEL: test_vset_lane_s16:
// CHECK-NEXT: ins.h v0[3], w0
// CHECK-NEXT: ret
return vset_lane_s16(a, b, 3);
}
// CHECK-LABEL: define <2 x i32> @test_vset_lane_s32(i32 %a, <2 x i32> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x i32> [[TMP1]], i32 %a, i32 1
// CHECK: ret <2 x i32> [[VSET_LANE]]
int32x2_t test_vset_lane_s32(int32_t a, int32x2_t b) {
// CHECK-LABEL: test_vset_lane_s32:
// CHECK-NEXT: ins.s v0[1], w0
// CHECK-NEXT: ret
return vset_lane_s32(a, b, 1);
}
// CHECK-LABEL: define <8 x i8> @test_vset_lane_p8(i8 %a, <8 x i8> %b) #0 {
// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i8> %b, i8 %a, i32 7
// CHECK: ret <8 x i8> [[VSET_LANE]]
poly8x8_t test_vset_lane_p8(poly8_t a, poly8x8_t b) {
// CHECK-LABEL: test_vset_lane_p8:
// CHECK-NEXT: ins.b v0[7], w0
// CHECK-NEXT: ret
return vset_lane_p8(a, b, 7);
}
// CHECK-LABEL: define <4 x i16> @test_vset_lane_p16(i16 %a, <4 x i16> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x i16> [[TMP1]], i16 %a, i32 3
// CHECK: ret <4 x i16> [[VSET_LANE]]
poly16x4_t test_vset_lane_p16(poly16_t a, poly16x4_t b) {
// CHECK-LABEL: test_vset_lane_p16:
// CHECK-NEXT: ins.h v0[3], w0
// CHECK-NEXT: ret
return vset_lane_p16(a, b, 3);
}
// CHECK-LABEL: define <2 x float> @test_vset_lane_f32(float %a, <2 x float> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x float> [[TMP1]], float %a, i32 1
// CHECK: ret <2 x float> [[VSET_LANE]]
float32x2_t test_vset_lane_f32(float32_t a, float32x2_t b) {
// CHECK-LABEL: test_vset_lane_f32:
// CHECK-NEXT: ins.s v1[1], v0[0]
// CHECK-NEXT: mov.16b v0, v1
// CHECK-NEXT: ret
return vset_lane_f32(a, b, 1);
}
// CHECK-LABEL: define <4 x half> @test_vset_lane_f16(half* %a, <4 x half> %b) #0 {
// CHECK: [[__REINT_246:%.*]] = alloca half, align 2
// CHECK: [[__REINT1_246:%.*]] = alloca <4 x half>, align 8
// CHECK: [[__REINT2_246:%.*]] = alloca <4 x i16>, align 8
// CHECK: [[TMP0:%.*]] = load half, half* %a, align 2
// CHECK: store half [[TMP0]], half* [[__REINT_246]], align 2
// CHECK: store <4 x half> %b, <4 x half>* [[__REINT1_246]], align 8
// CHECK: [[TMP1:%.*]] = bitcast half* [[__REINT_246]] to i16*
// CHECK: [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
// CHECK: [[TMP3:%.*]] = bitcast <4 x half>* [[__REINT1_246]] to <4 x i16>*
// CHECK: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[TMP3]], align 8
// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
// CHECK: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x i16> [[TMP6]], i16 [[TMP2]], i32 3
// CHECK: store <4 x i16> [[VSET_LANE]], <4 x i16>* [[__REINT2_246]], align 8
// CHECK: [[TMP7:%.*]] = bitcast <4 x i16>* [[__REINT2_246]] to <4 x half>*
// CHECK: [[TMP8:%.*]] = load <4 x half>, <4 x half>* [[TMP7]], align 8
// CHECK: ret <4 x half> [[TMP8]]
float16x4_t test_vset_lane_f16(float16_t *a, float16x4_t b) {
// CHECK-LABEL: test_vset_lane_f16:
// CHECK-NEXT: ld1.h { v0 }[3], [x0]
// CHECK-NEXT: ret
return vset_lane_f16(*a, b, 3);
}
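// Editor's note (illustrative sketch, not part of the original test): with no
// first-class __fp16 arithmetic here, the lane insert above round-trips the
// half value through i16 memory. The same effect, spelled with explicit
// reinterprets, would look like this hypothetical helper:
static float16x4_t set_f16_lane3(const float16_t *a, float16x4_t b) {
  uint16x4_t bits = vreinterpret_u16_f16(b);           // view lanes as raw i16
  bits = vset_lane_u16(*(const uint16_t *)a, bits, 3); // insert the raw half bits
  return vreinterpret_f16_u16(bits);                   // view back as f16
}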
// CHECK-LABEL: define <16 x i8> @test_vsetq_lane_u8(i8 %a, <16 x i8> %b) #0 {
// CHECK: [[VSET_LANE:%.*]] = insertelement <16 x i8> %b, i8 %a, i32 15
// CHECK: ret <16 x i8> [[VSET_LANE]]
uint8x16_t test_vsetq_lane_u8(uint8_t a, uint8x16_t b) {
// CHECK-LABEL: test_vsetq_lane_u8:
// CHECK-NEXT: ins.b v0[15], w0
// CHECK-NEXT: ret
return vsetq_lane_u8(a, b, 15);
}
// CHECK-LABEL: define <8 x i16> @test_vsetq_lane_u16(i16 %a, <8 x i16> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP1]], i16 %a, i32 7
// CHECK: ret <8 x i16> [[VSET_LANE]]
uint16x8_t test_vsetq_lane_u16(uint16_t a, uint16x8_t b) {
// CHECK-LABEL: test_vsetq_lane_u16:
// CHECK-NEXT: ins.h v0[7], w0
// CHECK-NEXT: ret
return vsetq_lane_u16(a, b, 7);
}
// CHECK-LABEL: define <4 x i32> @test_vsetq_lane_u32(i32 %a, <4 x i32> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x i32> [[TMP1]], i32 %a, i32 3
// CHECK: ret <4 x i32> [[VSET_LANE]]
uint32x4_t test_vsetq_lane_u32(uint32_t a, uint32x4_t b) {
// CHECK-LABEL: test_vsetq_lane_u32:
// CHECK-NEXT: ins.s v0[3], w0
// CHECK-NEXT: ret
return vsetq_lane_u32(a, b, 3);
}
// CHECK-LABEL: define <16 x i8> @test_vsetq_lane_s8(i8 %a, <16 x i8> %b) #0 {
// CHECK: [[VSET_LANE:%.*]] = insertelement <16 x i8> %b, i8 %a, i32 15
// CHECK: ret <16 x i8> [[VSET_LANE]]
int8x16_t test_vsetq_lane_s8(int8_t a, int8x16_t b) {
// CHECK-LABEL: test_vsetq_lane_s8:
// CHECK-NEXT: ins.b v0[15], w0
// CHECK-NEXT: ret
return vsetq_lane_s8(a, b, 15);
}
// CHECK-LABEL: define <8 x i16> @test_vsetq_lane_s16(i16 %a, <8 x i16> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP1]], i16 %a, i32 7
// CHECK: ret <8 x i16> [[VSET_LANE]]
int16x8_t test_vsetq_lane_s16(int16_t a, int16x8_t b) {
// CHECK-LABEL: test_vsetq_lane_s16:
// CHECK-NEXT: ins.h v0[7], w0
// CHECK-NEXT: ret
return vsetq_lane_s16(a, b, 7);
}
// CHECK-LABEL: define <4 x i32> @test_vsetq_lane_s32(i32 %a, <4 x i32> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x i32> [[TMP1]], i32 %a, i32 3
// CHECK: ret <4 x i32> [[VSET_LANE]]
int32x4_t test_vsetq_lane_s32(int32_t a, int32x4_t b) {
// CHECK-LABEL: test_vsetq_lane_s32:
// CHECK-NEXT: ins.s v0[3], w0
// CHECK-NEXT: ret
return vsetq_lane_s32(a, b, 3);
}
// CHECK-LABEL: define <16 x i8> @test_vsetq_lane_p8(i8 %a, <16 x i8> %b) #0 {
// CHECK: [[VSET_LANE:%.*]] = insertelement <16 x i8> %b, i8 %a, i32 15
// CHECK: ret <16 x i8> [[VSET_LANE]]
poly8x16_t test_vsetq_lane_p8(poly8_t a, poly8x16_t b) {
// CHECK-LABEL: test_vsetq_lane_p8:
// CHECK-NEXT: ins.b v0[15], w0
// CHECK-NEXT: ret
return vsetq_lane_p8(a, b, 15);
}
// CHECK-LABEL: define <8 x i16> @test_vsetq_lane_p16(i16 %a, <8 x i16> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP1]], i16 %a, i32 7
// CHECK: ret <8 x i16> [[VSET_LANE]]
poly16x8_t test_vsetq_lane_p16(poly16_t a, poly16x8_t b) {
// CHECK-LABEL: test_vsetq_lane_p16:
// CHECK-NEXT: ins.h v0[7], w0
// CHECK-NEXT: ret
return vsetq_lane_p16(a, b, 7);
}
// CHECK-LABEL: define <4 x float> @test_vsetq_lane_f32(float %a, <4 x float> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x float> [[TMP1]], float %a, i32 3
// CHECK: ret <4 x float> [[VSET_LANE]]
float32x4_t test_vsetq_lane_f32(float32_t a, float32x4_t b) {
// CHECK-LABEL: test_vsetq_lane_f32:
// CHECK-NEXT: ins.s v1[3], v0[0]
// CHECK-NEXT: mov.16b v0, v1
// CHECK-NEXT: ret
return vsetq_lane_f32(a, b, 3);
}
// CHECK-LABEL: define <8 x half> @test_vsetq_lane_f16(half* %a, <8 x half> %b) #0 {
// CHECK: [[__REINT_248:%.*]] = alloca half, align 2
// CHECK: [[__REINT1_248:%.*]] = alloca <8 x half>, align 16
// CHECK: [[__REINT2_248:%.*]] = alloca <8 x i16>, align 16
// CHECK: [[TMP0:%.*]] = load half, half* %a, align 2
// CHECK: store half [[TMP0]], half* [[__REINT_248]], align 2
// CHECK: store <8 x half> %b, <8 x half>* [[__REINT1_248]], align 16
// CHECK: [[TMP1:%.*]] = bitcast half* [[__REINT_248]] to i16*
// CHECK: [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
// CHECK: [[TMP3:%.*]] = bitcast <8 x half>* [[__REINT1_248]] to <8 x i16>*
// CHECK: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[TMP3]], align 16
// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
// CHECK: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP6]], i16 [[TMP2]], i32 7
// CHECK: store <8 x i16> [[VSET_LANE]], <8 x i16>* [[__REINT2_248]], align 16
// CHECK: [[TMP7:%.*]] = bitcast <8 x i16>* [[__REINT2_248]] to <8 x half>*
// CHECK: [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[TMP7]], align 16
// CHECK: ret <8 x half> [[TMP8]]
float16x8_t test_vsetq_lane_f16(float16_t *a, float16x8_t b) {
// CHECK-LABEL: test_vsetq_lane_f16:
// CHECK-NEXT: ld1.h { v0 }[7], [x0]
// CHECK-NEXT: ret
return vsetq_lane_f16(*a, b, 7);
}
// CHECK-LABEL: define <1 x i64> @test_vset_lane_s64(i64 %a, <1 x i64> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x i64> [[TMP1]], i64 %a, i32 0
// CHECK: ret <1 x i64> [[VSET_LANE]]
int64x1_t test_vset_lane_s64(int64_t a, int64x1_t b) {
// CHECK-LABEL: test_vset_lane_s64:
// CHECK-NEXT: fmov d0, x0
// CHECK-NEXT: ret
return vset_lane_s64(a, b, 0);
}
// CHECK-LABEL: define <1 x i64> @test_vset_lane_u64(i64 %a, <1 x i64> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x i64> [[TMP1]], i64 %a, i32 0
// CHECK: ret <1 x i64> [[VSET_LANE]]
uint64x1_t test_vset_lane_u64(uint64_t a, uint64x1_t b) {
// CHECK-LABEL: test_vset_lane_u64:
// CHECK-NEXT: fmov d0, x0
// CHECK-NEXT: ret
return vset_lane_u64(a, b, 0);
}
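// Editor's note (illustrative, not part of the original test): for a
// single-lane vector, inserting lane 0 replaces the whole value, so the
// insertelement above can lower to a plain fmov rather than an ins; e.g.
// vset_lane_u64(x, v, 0) behaves like this hypothetical equivalent:
static uint64x1_t set_lane0_equiv(uint64_t x) { return vcreate_u64(x); }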
// CHECK-LABEL: define <2 x i64> @test_vsetq_lane_s64(i64 %a, <2 x i64> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x i64> [[TMP1]], i64 %a, i32 1
// CHECK: ret <2 x i64> [[VSET_LANE]]
int64x2_t test_vsetq_lane_s64(int64_t a, int64x2_t b) {
// CHECK-LABEL: test_vsetq_lane_s64:
// CHECK-NEXT: ins.d v0[1], x0
// CHECK-NEXT: ret
return vsetq_lane_s64(a, b, 1);
}
// CHECK-LABEL: define <2 x i64> @test_vsetq_lane_u64(i64 %a, <2 x i64> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x i64> [[TMP1]], i64 %a, i32 1
// CHECK: ret <2 x i64> [[VSET_LANE]]
uint64x2_t test_vsetq_lane_u64(uint64_t a, uint64x2_t b) {
// CHECK-LABEL: test_vsetq_lane_u64:
// CHECK-NEXT: ins.d v0[1], x0
// CHECK-NEXT: ret
return vsetq_lane_u64(a, b, 1);
}


@@ -1,7 +1,7 @@
// REQUIRES: aarch64-registered-target
// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
// RUN: -ffp-contract=fast -S -O3 -o - %s | FileCheck %s --check-prefix=CHECK \
// RUN: --check-prefix=CHECK-ARM64
// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
// RUN: -ffp-contract=fast -emit-llvm -o - %s | opt -S -mem2reg \
// RUN: | FileCheck %s
// Test new aarch64 intrinsics with poly128
// FIXME: Currently, poly128_t equals uint128, which will be split into
@@ -12,192 +12,238 @@
#include <arm_neon.h>
// CHECK-LABEL: define void @test_vstrq_p128(i128* %ptr, i128 %val) #0 {
// CHECK: [[TMP0:%.*]] = bitcast i128* %ptr to i8*
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i128*
// CHECK: store i128 %val, i128* [[TMP1]]
// CHECK: ret void
void test_vstrq_p128(poly128_t * ptr, poly128_t val) {
// CHECK-LABEL: test_vstrq_p128
vstrq_p128(ptr, val);
// CHECK-ARM64: stp {{x[0-9]+}}, {{x[0-9]+}}, [x0]
}
// CHECK-LABEL: define i128 @test_vldrq_p128(i128* %ptr) #0 {
// CHECK: [[TMP0:%.*]] = bitcast i128* %ptr to i8*
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i128*
// CHECK: [[TMP2:%.*]] = load i128, i128* [[TMP1]]
// CHECK: ret i128 [[TMP2]]
poly128_t test_vldrq_p128(poly128_t * ptr) {
// CHECK-LABEL: test_vldrq_p128
return vldrq_p128(ptr);
// CHECK-ARM64: ldp {{x[0-9]+}}, {{x[0-9]+}}, [x0]
}
// CHECK-LABEL: define void @test_ld_st_p128(i128* %ptr) #0 {
// CHECK: [[TMP0:%.*]] = bitcast i128* %ptr to i8*
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i128*
// CHECK: [[TMP2:%.*]] = load i128, i128* [[TMP1]]
// CHECK: [[ADD_PTR:%.*]] = getelementptr inbounds i128, i128* %ptr, i64 1
// CHECK: [[TMP3:%.*]] = bitcast i128* [[ADD_PTR]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i128*
// CHECK: store i128 [[TMP2]], i128* [[TMP4]]
// CHECK: ret void
void test_ld_st_p128(poly128_t * ptr) {
// CHECK-LABEL: test_ld_st_p128
vstrq_p128(ptr+1, vldrq_p128(ptr));
// CHECK-ARM64: ldp [[PLO:x[0-9]+]], [[PHI:x[0-9]+]], [{{x[0-9]+}}]
// CHECK-ARM64-NEXT: stp [[PLO]], [[PHI]], [{{x[0-9]+}}, #16]
}
// CHECK-LABEL: define i128 @test_vmull_p64(i64 %a, i64 %b) #0 {
// CHECK: [[VMULL_P64_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %a, i64 %b) #2
// CHECK: [[VMULL_P641_I:%.*]] = bitcast <16 x i8> [[VMULL_P64_I]] to i128
// CHECK: ret i128 [[VMULL_P641_I]]
poly128_t test_vmull_p64(poly64_t a, poly64_t b) {
// CHECK-LABEL: test_vmull_p64
return vmull_p64(a, b);
// CHECK: pmull {{v[0-9]+}}.1q, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d
}
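// Editor's note (illustrative, not part of the original test): vmull_p64 is a
// carry-less (polynomial, GF(2)) 64x64 -> 128-bit multiply, the primitive
// behind GHASH/CLMUL-style code. For example, 0b10 * 0b10 = 0b100 (x * x =
// x^2), with no carry propagation between bit positions:
static poly128_t square_p64(poly64_t a) {
  return vmull_p64(a, a); // carry-less square: a's bits, spread out with zeros
}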
// CHECK-LABEL: define i128 @test_vmull_high_p64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> <i32 1>
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> [[SHUFFLE_I_I]] to i64
// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <2 x i64> %b, <2 x i64> %b, <1 x i32> <i32 1>
// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> [[SHUFFLE_I7_I]] to i64
// CHECK: [[VMULL_P64_I_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 [[TMP0]], i64 [[TMP1]]) #2
// CHECK: [[VMULL_P641_I_I:%.*]] = bitcast <16 x i8> [[VMULL_P64_I_I]] to i128
// CHECK: ret i128 [[VMULL_P641_I_I]]
poly128_t test_vmull_high_p64(poly64x2_t a, poly64x2_t b) {
// CHECK-LABEL: test_vmull_high_p64
return vmull_high_p64(a, b);
// CHECK: pmull2 {{v[0-9]+}}.1q, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
}
// CHECK-LABEL: test_vreinterpretq_p128_s8
// CHECK: ret
// CHECK-LABEL: define i128 @test_vreinterpretq_p128_s8(<16 x i8> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to i128
// CHECK: ret i128 [[TMP0]]
poly128_t test_vreinterpretq_p128_s8(int8x16_t a) {
return vreinterpretq_p128_s8(a);
}
// CHECK-LABEL: test_vreinterpretq_p128_s16
// CHECK: ret
// CHECK-LABEL: define i128 @test_vreinterpretq_p128_s16(<8 x i16> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to i128
// CHECK: ret i128 [[TMP0]]
poly128_t test_vreinterpretq_p128_s16(int16x8_t a) {
return vreinterpretq_p128_s16(a);
}
// CHECK-LABEL: test_vreinterpretq_p128_s32
// CHECK: ret
// CHECK-LABEL: define i128 @test_vreinterpretq_p128_s32(<4 x i32> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to i128
// CHECK: ret i128 [[TMP0]]
poly128_t test_vreinterpretq_p128_s32(int32x4_t a) {
return vreinterpretq_p128_s32(a);
}
// CHECK-LABEL: test_vreinterpretq_p128_s64
// CHECK: ret
// CHECK-LABEL: define i128 @test_vreinterpretq_p128_s64(<2 x i64> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to i128
// CHECK: ret i128 [[TMP0]]
poly128_t test_vreinterpretq_p128_s64(int64x2_t a) {
return vreinterpretq_p128_s64(a);
}
// CHECK-LABEL: test_vreinterpretq_p128_u8
// CHECK: ret
// CHECK-LABEL: define i128 @test_vreinterpretq_p128_u8(<16 x i8> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to i128
// CHECK: ret i128 [[TMP0]]
poly128_t test_vreinterpretq_p128_u8(uint8x16_t a) {
return vreinterpretq_p128_u8(a);
}
// CHECK-LABEL: test_vreinterpretq_p128_u16
// CHECK: ret
// CHECK-LABEL: define i128 @test_vreinterpretq_p128_u16(<8 x i16> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to i128
// CHECK: ret i128 [[TMP0]]
poly128_t test_vreinterpretq_p128_u16(uint16x8_t a) {
return vreinterpretq_p128_u16(a);
}
// CHECK-LABEL: test_vreinterpretq_p128_u32
// CHECK: ret
// CHECK-LABEL: define i128 @test_vreinterpretq_p128_u32(<4 x i32> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to i128
// CHECK: ret i128 [[TMP0]]
poly128_t test_vreinterpretq_p128_u32(uint32x4_t a) {
return vreinterpretq_p128_u32(a);
}
// CHECK-LABEL: test_vreinterpretq_p128_u64
// CHECK: ret
// CHECK-LABEL: define i128 @test_vreinterpretq_p128_u64(<2 x i64> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to i128
// CHECK: ret i128 [[TMP0]]
poly128_t test_vreinterpretq_p128_u64(uint64x2_t a) {
return vreinterpretq_p128_u64(a);
}
// CHECK-LABEL: test_vreinterpretq_p128_f32
// CHECK: ret
// CHECK-LABEL: define i128 @test_vreinterpretq_p128_f32(<4 x float> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to i128
// CHECK: ret i128 [[TMP0]]
poly128_t test_vreinterpretq_p128_f32(float32x4_t a) {
return vreinterpretq_p128_f32(a);
}
// CHECK-LABEL: test_vreinterpretq_p128_f64
// CHECK: ret
// CHECK-LABEL: define i128 @test_vreinterpretq_p128_f64(<2 x double> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to i128
// CHECK: ret i128 [[TMP0]]
poly128_t test_vreinterpretq_p128_f64(float64x2_t a) {
return vreinterpretq_p128_f64(a);
}
// CHECK-LABEL: test_vreinterpretq_p128_p8
// CHECK: ret
// CHECK-LABEL: define i128 @test_vreinterpretq_p128_p8(<16 x i8> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to i128
// CHECK: ret i128 [[TMP0]]
poly128_t test_vreinterpretq_p128_p8(poly8x16_t a) {
return vreinterpretq_p128_p8(a);
}
// CHECK-LABEL: test_vreinterpretq_p128_p16
// CHECK: ret
// CHECK-LABEL: define i128 @test_vreinterpretq_p128_p16(<8 x i16> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to i128
// CHECK: ret i128 [[TMP0]]
poly128_t test_vreinterpretq_p128_p16(poly16x8_t a) {
return vreinterpretq_p128_p16(a);
}
// CHECK-LABEL: test_vreinterpretq_p128_p64
// CHECK: ret
// CHECK-LABEL: define i128 @test_vreinterpretq_p128_p64(<2 x i64> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to i128
// CHECK: ret i128 [[TMP0]]
poly128_t test_vreinterpretq_p128_p64(poly64x2_t a) {
return vreinterpretq_p128_p64(a);
}
// CHECK-LABEL: test_vreinterpretq_s8_p128
// CHECK: ret
// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_p128(i128 %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast i128 %a to <16 x i8>
// CHECK: ret <16 x i8> [[TMP0]]
int8x16_t test_vreinterpretq_s8_p128(poly128_t a) {
return vreinterpretq_s8_p128(a);
}
// CHECK-LABEL: test_vreinterpretq_s16_p128
// CHECK: ret
// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_p128(i128 %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast i128 %a to <8 x i16>
// CHECK: ret <8 x i16> [[TMP0]]
int16x8_t test_vreinterpretq_s16_p128(poly128_t a) {
return vreinterpretq_s16_p128(a);
}
// CHECK-LABEL: test_vreinterpretq_s32_p128
// CHECK: ret
// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_p128(i128 %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast i128 %a to <4 x i32>
// CHECK: ret <4 x i32> [[TMP0]]
int32x4_t test_vreinterpretq_s32_p128(poly128_t a) {
return vreinterpretq_s32_p128(a);
}
// CHECK-LABEL: test_vreinterpretq_s64_p128
// CHECK: ret
// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_p128(i128 %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast i128 %a to <2 x i64>
// CHECK: ret <2 x i64> [[TMP0]]
int64x2_t test_vreinterpretq_s64_p128(poly128_t a) {
return vreinterpretq_s64_p128(a);
}
// CHECK-LABEL: test_vreinterpretq_u8_p128
// CHECK: ret
// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_p128(i128 %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast i128 %a to <16 x i8>
// CHECK: ret <16 x i8> [[TMP0]]
uint8x16_t test_vreinterpretq_u8_p128(poly128_t a) {
return vreinterpretq_u8_p128(a);
}
// CHECK-LABEL: test_vreinterpretq_u16_p128
// CHECK: ret
// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_p128(i128 %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast i128 %a to <8 x i16>
// CHECK: ret <8 x i16> [[TMP0]]
uint16x8_t test_vreinterpretq_u16_p128(poly128_t a) {
return vreinterpretq_u16_p128(a);
}
// CHECK-LABEL: test_vreinterpretq_u32_p128
// CHECK: ret
// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_p128(i128 %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast i128 %a to <4 x i32>
// CHECK: ret <4 x i32> [[TMP0]]
uint32x4_t test_vreinterpretq_u32_p128(poly128_t a) {
return vreinterpretq_u32_p128(a);
}
// CHECK-LABEL: test_vreinterpretq_u64_p128
// CHECK: ret
// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_p128(i128 %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast i128 %a to <2 x i64>
// CHECK: ret <2 x i64> [[TMP0]]
uint64x2_t test_vreinterpretq_u64_p128(poly128_t a) {
return vreinterpretq_u64_p128(a);
}
// CHECK-LABEL: test_vreinterpretq_f32_p128
// CHECK: ret
// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_p128(i128 %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast i128 %a to <4 x float>
// CHECK: ret <4 x float> [[TMP0]]
float32x4_t test_vreinterpretq_f32_p128(poly128_t a) {
return vreinterpretq_f32_p128(a);
}
// CHECK-LABEL: test_vreinterpretq_f64_p128
// CHECK: ret
// CHECK-LABEL: define <2 x double> @test_vreinterpretq_f64_p128(i128 %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast i128 %a to <2 x double>
// CHECK: ret <2 x double> [[TMP0]]
float64x2_t test_vreinterpretq_f64_p128(poly128_t a) {
return vreinterpretq_f64_p128(a);
}
// CHECK-LABEL: test_vreinterpretq_p8_p128
// CHECK: ret
// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_p128(i128 %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast i128 %a to <16 x i8>
// CHECK: ret <16 x i8> [[TMP0]]
poly8x16_t test_vreinterpretq_p8_p128(poly128_t a) {
return vreinterpretq_p8_p128(a);
}
// CHECK-LABEL: test_vreinterpretq_p16_p128
// CHECK: ret
// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_p128(i128 %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast i128 %a to <8 x i16>
// CHECK: ret <8 x i16> [[TMP0]]
poly16x8_t test_vreinterpretq_p16_p128(poly128_t a) {
return vreinterpretq_p16_p128(a);
}
// CHECK-LABEL: test_vreinterpretq_p64_p128
// CHECK: ret
// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_p64_p128(i128 %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast i128 %a to <2 x i64>
// CHECK: ret <2 x i64> [[TMP0]]
poly64x2_t test_vreinterpretq_p64_p128(poly128_t a) {
return vreinterpretq_p64_p128(a);
}


@@ -1,299 +1,634 @@
// FIXME: This is a front-end test that depends on LLVM optimizations (-O3).
// It should be split into separate files for front/middle/back-end testing.
// REQUIRES: aarch64-registered-target
// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
// RUN: -ffp-contract=fast -S -O3 -o - %s | FileCheck %s --check-prefix=CHECK \
// RUN: --check-prefix=CHECK-ARM64
// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
// RUN: -ffp-contract=fast -emit-llvm -o - %s | opt -S -mem2reg \
// RUN: | FileCheck %s
// Test new aarch64 intrinsics with poly64
#include <arm_neon.h>
// CHECK-LABEL: define <1 x i64> @test_vceq_p64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK: [[CMP_I:%.*]] = icmp eq <1 x i64> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
// CHECK: ret <1 x i64> [[SEXT_I]]
uint64x1_t test_vceq_p64(poly64x1_t a, poly64x1_t b) {
// CHECK-LABEL: test_vceq_p64
return vceq_p64(a, b);
// CHECK: cmeq {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
}
// CHECK-LABEL: define <2 x i64> @test_vceqq_p64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK: [[CMP_I:%.*]] = icmp eq <2 x i64> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
// CHECK: ret <2 x i64> [[SEXT_I]]
uint64x2_t test_vceqq_p64(poly64x2_t a, poly64x2_t b) {
// CHECK-LABEL: test_vceqq_p64
return vceqq_p64(a, b);
// CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
}
// CHECK-LABEL: define <1 x i64> @test_vtst_p64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK: [[TMP4:%.*]] = and <1 x i64> [[TMP2]], [[TMP3]]
// CHECK: [[TMP5:%.*]] = icmp ne <1 x i64> [[TMP4]], zeroinitializer
// CHECK: [[VTST_I:%.*]] = sext <1 x i1> [[TMP5]] to <1 x i64>
// CHECK: ret <1 x i64> [[VTST_I]]
uint64x1_t test_vtst_p64(poly64x1_t a, poly64x1_t b) {
// CHECK-LABEL: test_vtst_p64
return vtst_p64(a, b);
// CHECK: cmtst {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
}
// CHECK-LABEL: define <2 x i64> @test_vtstq_p64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK: [[TMP4:%.*]] = and <2 x i64> [[TMP2]], [[TMP3]]
// CHECK: [[TMP5:%.*]] = icmp ne <2 x i64> [[TMP4]], zeroinitializer
// CHECK: [[VTST_I:%.*]] = sext <2 x i1> [[TMP5]] to <2 x i64>
// CHECK: ret <2 x i64> [[VTST_I]]
uint64x2_t test_vtstq_p64(poly64x2_t a, poly64x2_t b) {
// CHECK-LABEL: test_vtstq_p64
return vtstq_p64(a, b);
// CHECK: cmtst {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
}
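// Editor's note (illustrative, not part of the original test): vtst is a
// per-lane "any bits in common?" test, i.e. (a & b) != 0 sign-extended to an
// all-ones/all-zeros mask, which is exactly the and/icmp ne/sext sequence
// checked above. A hypothetical re-expression with basic vector ops:
static uint64x1_t tst_equiv_p64(poly64x1_t a, poly64x1_t b) {
  uint64x1_t anded = vand_u64(vreinterpret_u64_p64(a), vreinterpret_u64_p64(b));
  // The (x != 0) mask is the complement of the (x == 0) mask from vceqz.
  return veor_u64(vceqz_u64(anded), vdup_n_u64(~0ULL));
}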
// CHECK-LABEL: define <1 x i64> @test_vbsl_p64(<1 x i64> %a, <1 x i64> %b, <1 x i64> %c) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <1 x i64> %c to <8 x i8>
// CHECK: [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK: [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK: [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64>
// CHECK: [[VBSL3_I:%.*]] = and <1 x i64> [[VBSL_I]], [[VBSL1_I]]
// CHECK: [[TMP3:%.*]] = xor <1 x i64> [[VBSL_I]], <i64 -1>
// CHECK: [[VBSL4_I:%.*]] = and <1 x i64> [[TMP3]], [[VBSL2_I]]
// CHECK: [[VBSL5_I:%.*]] = or <1 x i64> [[VBSL3_I]], [[VBSL4_I]]
// CHECK: ret <1 x i64> [[VBSL5_I]]
poly64x1_t test_vbsl_p64(poly64x1_t a, poly64x1_t b, poly64x1_t c) {
// CHECK-LABEL: test_vbsl_p64
return vbsl_p64(a, b, c);
// CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
// CHECK-LABEL: define <2 x i64> @test_vbslq_p64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i64> %c to <16 x i8>
// CHECK: [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK: [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
// CHECK: [[VBSL3_I:%.*]] = and <2 x i64> [[VBSL_I]], [[VBSL1_I]]
// CHECK: [[TMP3:%.*]] = xor <2 x i64> [[VBSL_I]], <i64 -1, i64 -1>
// CHECK: [[VBSL4_I:%.*]] = and <2 x i64> [[TMP3]], [[VBSL2_I]]
// CHECK: [[VBSL5_I:%.*]] = or <2 x i64> [[VBSL3_I]], [[VBSL4_I]]
// CHECK: ret <2 x i64> [[VBSL5_I]]
poly64x2_t test_vbslq_p64(poly64x2_t a, poly64x2_t b, poly64x2_t c) {
// CHECK-LABEL: test_vbslq_p64
return vbslq_p64(a, b, c);
// CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
}
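// Editor's note (illustrative, not part of the original test): vbsl is the
// bitwise select (a & b) | (~a & c): each result bit comes from b where the
// mask a has a 1 and from c where it has a 0, matching the and/xor/and/or IR
// above. A hypothetical equivalent using basic vector ops:
static poly64x2_t bsl_equiv_p64(poly64x2_t a, poly64x2_t b, poly64x2_t c) {
  uint64x2_t ua = vreinterpretq_u64_p64(a);
  uint64x2_t ub = vreinterpretq_u64_p64(b);
  uint64x2_t uc = vreinterpretq_u64_p64(c);
  // vbicq_u64(x, y) computes x & ~y.
  return vreinterpretq_p64_u64(vorrq_u64(vandq_u64(ua, ub), vbicq_u64(uc, ua)));
}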
// CHECK-LABEL: define i64 @test_vget_lane_p64(<1 x i64> %v) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %v to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0
// CHECK: ret i64 [[VGET_LANE]]
poly64_t test_vget_lane_p64(poly64x1_t v) {
// CHECK-LABEL: test_vget_lane_p64
return vget_lane_p64(v, 0);
// CHECK: fmov {{x[0-9]+}}, {{d[0-9]+}}
}
// CHECK-LABEL: define i64 @test_vgetq_lane_p64(<2 x i64> %v) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %v to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
// CHECK: ret i64 [[VGETQ_LANE]]
poly64_t test_vgetq_lane_p64(poly64x2_t v) {
// CHECK-LABEL: test_vgetq_lane_p64
return vgetq_lane_p64(v, 1);
// CHECK: {{mov|umov}} {{x[0-9]+}}, {{v[0-9]+}}.d[1]
}
// CHECK-LABEL: define <1 x i64> @test_vset_lane_p64(i64 %a, <1 x i64> %v) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %v to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x i64> [[TMP1]], i64 %a, i32 0
// CHECK: ret <1 x i64> [[VSET_LANE]]
poly64x1_t test_vset_lane_p64(poly64_t a, poly64x1_t v) {
// CHECK-LABEL: test_vset_lane_p64
return vset_lane_p64(a, v, 0);
// CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}}
}
// CHECK-LABEL: define <2 x i64> @test_vsetq_lane_p64(i64 %a, <2 x i64> %v) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %v to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x i64> [[TMP1]], i64 %a, i32 1
// CHECK: ret <2 x i64> [[VSET_LANE]]
poly64x2_t test_vsetq_lane_p64(poly64_t a, poly64x2_t v) {
// CHECK-LABEL: test_vsetq_lane_p64
return vsetq_lane_p64(a, v, 1);
// CHECK: ins {{v[0-9]+}}.d[1], {{x[0-9]+}}
}
// CHECK-LABEL: define <1 x i64> @test_vcopy_lane_p64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0
// CHECK: [[TMP2:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64>
// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x i64> [[TMP3]], i64 [[VGET_LANE]], i32 0
// CHECK: ret <1 x i64> [[VSET_LANE]]
poly64x1_t test_vcopy_lane_p64(poly64x1_t a, poly64x1_t b) {
// CHECK-LABEL: test_vcopy_lane_p64
return vcopy_lane_p64(a, 0, b, 0);
// CHECK-ARM64: mov v0.16b, v1.16b
}
// CHECK-LABEL: define <2 x i64> @test_vcopyq_lane_p64(<2 x i64> %a, <1 x i64> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0
// CHECK: [[TMP2:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[VGET_LANE]], i32 1
// CHECK: ret <2 x i64> [[VSET_LANE]]
poly64x2_t test_vcopyq_lane_p64(poly64x2_t a, poly64x1_t b) {
// CHECK-LABEL: test_vcopyq_lane_p64
return vcopyq_lane_p64(a, 1, b, 0);
// CHECK: zip1 v0.2d, v0.2d, v1.2d
}
// CHECK-LABEL: define <2 x i64> @test_vcopyq_laneq_p64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
// CHECK: [[TMP2:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[VGETQ_LANE]], i32 1
// CHECK: ret <2 x i64> [[VSET_LANE]]
poly64x2_t test_vcopyq_laneq_p64(poly64x2_t a, poly64x2_t b) {
// CHECK-LABEL: test_vcopyq_laneq_p64
return vcopyq_laneq_p64(a, 1, b, 1);
}
// CHECK-LABEL: define <1 x i64> @test_vcreate_p64(i64 %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <1 x i64>
// CHECK: ret <1 x i64> [[TMP0]]
poly64x1_t test_vcreate_p64(uint64_t a) {
// CHECK-LABEL: test_vcreate_p64
return vcreate_p64(a);
// CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}}
}
// CHECK-LABEL: define <1 x i64> @test_vdup_n_p64(i64 %a) #0 {
// CHECK: [[VECINIT_I:%.*]] = insertelement <1 x i64> undef, i64 %a, i32 0
// CHECK: ret <1 x i64> [[VECINIT_I]]
poly64x1_t test_vdup_n_p64(poly64_t a) {
// CHECK-LABEL: test_vdup_n_p64
return vdup_n_p64(a);
// CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}}
}
// CHECK-LABEL: define <2 x i64> @test_vdupq_n_p64(i64 %a) #0 {
// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i64> undef, i64 %a, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1
// CHECK: ret <2 x i64> [[VECINIT1_I]]
poly64x2_t test_vdupq_n_p64(poly64_t a) {
// CHECK-LABEL: test_vdupq_n_p64
return vdupq_n_p64(a);
// CHECK: dup {{v[0-9]+}}.2d, {{x[0-9]+}}
}
// CHECK-LABEL: define <1 x i64> @test_vmov_n_p64(i64 %a) #0 {
// CHECK: [[VECINIT_I:%.*]] = insertelement <1 x i64> undef, i64 %a, i32 0
// CHECK: ret <1 x i64> [[VECINIT_I]]
poly64x1_t test_vmov_n_p64(poly64_t a) {
// CHECK-LABEL: test_vmov_n_p64
return vmov_n_p64(a);
// CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}}
}
// CHECK-LABEL: define <2 x i64> @test_vmovq_n_p64(i64 %a) #0 {
// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i64> undef, i64 %a, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1
// CHECK: ret <2 x i64> [[VECINIT1_I]]
poly64x2_t test_vmovq_n_p64(poly64_t a) {
// CHECK-LABEL: test_vmovq_n_p64
return vmovq_n_p64(a);
// CHECK: dup {{v[0-9]+}}.2d, {{x[0-9]+}}
}
// CHECK-LABEL: define <1 x i64> @test_vdup_lane_p64(<1 x i64> %vec) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x i64> %vec, <1 x i64> %vec, <1 x i32> zeroinitializer
// CHECK: ret <1 x i64> [[SHUFFLE]]
poly64x1_t test_vdup_lane_p64(poly64x1_t vec) {
// CHECK-LABEL: test_vdup_lane_p64
return vdup_lane_p64(vec, 0);
// CHECK: ret
}
// CHECK-LABEL: define <2 x i64> @test_vdupq_lane_p64(<1 x i64> %vec) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x i64> %vec, <1 x i64> %vec, <2 x i32> zeroinitializer
// CHECK: ret <2 x i64> [[SHUFFLE]]
poly64x2_t test_vdupq_lane_p64(poly64x1_t vec) {
// CHECK-LABEL: test_vdupq_lane_p64
return vdupq_lane_p64(vec, 0);
// CHECK: dup {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
}
// CHECK-LABEL: define <2 x i64> @test_vdupq_laneq_p64(<2 x i64> %vec) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i64> %vec, <2 x i64> %vec, <2 x i32> <i32 1, i32 1>
// CHECK: ret <2 x i64> [[SHUFFLE]]
poly64x2_t test_vdupq_laneq_p64(poly64x2_t vec) {
// CHECK-LABEL: test_vdupq_laneq_p64
return vdupq_laneq_p64(vec, 1);
// CHECK: dup {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
}
// CHECK-LABEL: define <2 x i64> @test_vcombine_p64(<1 x i64> %low, <1 x i64> %high) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <1 x i64> %low, <1 x i64> %high, <2 x i32> <i32 0, i32 1>
// CHECK: ret <2 x i64> [[SHUFFLE_I]]
poly64x2_t test_vcombine_p64(poly64x1_t low, poly64x1_t high) {
// CHECK-LABEL: test_vcombine_p64
return vcombine_p64(low, high);
// CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
}
// CHECK-LABEL: define <1 x i64> @test_vld1_p64(i64* %ptr) #0 {
// CHECK: [[TMP0:%.*]] = bitcast i64* %ptr to i8*
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <1 x i64>*
// CHECK: [[TMP2:%.*]] = load <1 x i64>, <1 x i64>* [[TMP1]]
// CHECK: ret <1 x i64> [[TMP2]]
poly64x1_t test_vld1_p64(poly64_t const * ptr) {
// CHECK-LABEL: test_vld1_p64
return vld1_p64(ptr);
// CHECK-ARM64: ldr {{d[0-9]+}}, [{{x[0-9]+|sp}}]
}
// CHECK-LABEL: define <2 x i64> @test_vld1q_p64(i64* %ptr) #0 {
// CHECK: [[TMP0:%.*]] = bitcast i64* %ptr to i8*
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <2 x i64>*
// CHECK: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]]
// CHECK: ret <2 x i64> [[TMP2]]
poly64x2_t test_vld1q_p64(poly64_t const * ptr) {
// CHECK-LABEL: test_vld1q_p64
return vld1q_p64(ptr);
// CHECK-ARM64: ldr {{q[0-9]+}}, [{{x[0-9]+|sp}}]
}
// CHECK-LABEL: define void @test_vst1_p64(i64* %ptr, <1 x i64> %val) #0 {
// CHECK: [[TMP0:%.*]] = bitcast i64* %ptr to i8*
// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %val to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <1 x i64>*
// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK: store <1 x i64> [[TMP3]], <1 x i64>* [[TMP2]]
// CHECK: ret void
void test_vst1_p64(poly64_t * ptr, poly64x1_t val) {
// CHECK-LABEL: test_vst1_p64
return vst1_p64(ptr, val);
// CHECK-ARM64: str {{d[0-9]+}}, [{{x[0-9]+|sp}}]
}
// CHECK-LABEL: define void @test_vst1q_p64(i64* %ptr, <2 x i64> %val) #0 {
// CHECK: [[TMP0:%.*]] = bitcast i64* %ptr to i8*
// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %val to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <2 x i64>*
// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK: store <2 x i64> [[TMP3]], <2 x i64>* [[TMP2]]
// CHECK: ret void
void test_vst1q_p64(poly64_t * ptr, poly64x2_t val) {
// CHECK-LABEL: test_vst1q_p64
return vst1q_p64(ptr, val);
// CHECK-ARM64: str {{q[0-9]+}}, [{{x[0-9]+|sp}}]
}
// CHECK-LABEL: define %struct.poly64x1x2_t @test_vld2_p64(i64* %ptr) #0 {
// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x2_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x2_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x2_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i64* %ptr to i8*
// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x i64>*
// CHECK: [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2.v1i64.p0v1i64(<1 x i64>* [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2]], { <1 x i64>, <1 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x1x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x1x2_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.poly64x1x2_t [[TMP6]]
poly64x1x2_t test_vld2_p64(poly64_t const * ptr) {
// CHECK-LABEL: test_vld2_p64
return vld2_p64(ptr);
// CHECK: ld1 {{{ *v[0-9]+.1d, v[0-9]+.1d *}}}, [{{x[0-9]+|sp}}]
}
// CHECK-LABEL: define %struct.poly64x2x2_t @test_vld2q_p64(i64* %ptr) #0 {
// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x2_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x2_t, align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x2_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i64* %ptr to i8*
// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i64>*
// CHECK: [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0v2i64(<2 x i64>* [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64> }*
// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2]], { <2 x i64>, <2 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x2x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x2x2_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.poly64x2x2_t [[TMP6]]
poly64x2x2_t test_vld2q_p64(poly64_t const * ptr) {
// CHECK-LABEL: test_vld2q_p64
return vld2q_p64(ptr);
// CHECK: ld2 {{{ *v[0-9]+.2d, v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}]
}
// CHECK-LABEL: define %struct.poly64x1x3_t @test_vld3_p64(i64* %ptr) #0 {
// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x3_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x3_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x3_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i64* %ptr to i8*
// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x i64>*
// CHECK: [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3.v1i64.p0v1i64(<1 x i64>* [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x1x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x1x3_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.poly64x1x3_t [[TMP6]]
poly64x1x3_t test_vld3_p64(poly64_t const * ptr) {
// CHECK-LABEL: test_vld3_p64
return vld3_p64(ptr);
// CHECK: ld1 {{{ *v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d *}}}, [{{x[0-9]+|sp}}]
}
// CHECK-LABEL: define %struct.poly64x2x3_t @test_vld3q_p64(i64* %ptr) #0 {
// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x3_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x3_t, align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x3_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i64* %ptr to i8*
// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i64>*
// CHECK: [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3.v2i64.p0v2i64(<2 x i64>* [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64> }*
// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x2x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x2x3_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.poly64x2x3_t [[TMP6]]
poly64x2x3_t test_vld3q_p64(poly64_t const * ptr) {
// CHECK-LABEL: test_vld3q_p64
return vld3q_p64(ptr);
// CHECK: ld3 {{{ *v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}]
}
// CHECK-LABEL: define %struct.poly64x1x4_t @test_vld4_p64(i64* %ptr) #0 {
// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x4_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x4_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x4_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i64* %ptr to i8*
// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x i64>*
// CHECK: [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4.v1i64.p0v1i64(<1 x i64>* [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x1x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x1x4_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.poly64x1x4_t [[TMP6]]
poly64x1x4_t test_vld4_p64(poly64_t const * ptr) {
// CHECK-LABEL: test_vld4_p64
return vld4_p64(ptr);
// CHECK: ld1 {{{ *v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d *}}}, [{{x[0-9]+|sp}}]
}
// CHECK-LABEL: define %struct.poly64x2x4_t @test_vld4q_p64(i64* %ptr) #0 {
// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x4_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x4_t, align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x4_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i64* %ptr to i8*
// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i64>*
// CHECK: [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4.v2i64.p0v2i64(<2 x i64>* [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }*
// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x2x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x2x4_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.poly64x2x4_t [[TMP6]]
poly64x2x4_t test_vld4q_p64(poly64_t const * ptr) {
// CHECK-LABEL: test_vld4q_p64
return vld4q_p64(ptr);
// CHECK: ld4 {{{ *v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}]
}
// CHECK-LABEL: define void @test_vst2_p64(i64* %ptr, [2 x <1 x i64>] %val.coerce) #0 {
// CHECK: [[VAL:%.*]] = alloca %struct.poly64x1x2_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x2_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[VAL]], i32 0, i32 0
// CHECK: store [2 x <1 x i64>] %val.coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x1x2_t* [[VAL]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %ptr to i8*
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i64 0, i64 0
// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
// CHECK: [[VAL2:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX3:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL2]], i64 0, i64 1
// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX3]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
// CHECK: call void @llvm.aarch64.neon.st2.v1i64.p0i8(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i8* [[TMP2]])
// CHECK: ret void
void test_vst2_p64(poly64_t * ptr, poly64x1x2_t val) {
// CHECK-LABEL: test_vst2_p64
return vst2_p64(ptr, val);
// CHECK: st1 {{{ *v[0-9]+.1d, v[0-9]+.1d *}}}, [{{x[0-9]+|sp}}]
}
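// Editor's note (illustrative, not part of the original test): the alloca +
// memcpy sequence above is just the calling convention for the poly64x1x2_t
// aggregate ([2 x <1 x i64>] coerced through memory); user code simply pairs
// the interleaving store with its matching load, as in this sketch:
static void copy2_p64(poly64_t *dst, const poly64_t *src) {
  poly64x1x2_t tmp = vld2_p64(src); // de-interleaving load of two elements
  vst2_p64(dst, tmp);               // matching re-interleaving store
}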
// CHECK-LABEL: define void @test_vst2q_p64(i64* %ptr, [2 x <2 x i64>] %val.coerce) #0 {
// CHECK: [[VAL:%.*]] = alloca %struct.poly64x2x2_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x2_t, align 16
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[VAL]], i32 0, i32 0
// CHECK: store [2 x <2 x i64>] %val.coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x2x2_t* [[VAL]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %ptr to i8*
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], i64 0, i64 0
// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
// CHECK: [[VAL2:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX3:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL2]], i64 0, i64 1
// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX3]], align 16
// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
// CHECK: call void @llvm.aarch64.neon.st2.v2i64.p0i8(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i8* [[TMP2]])
// CHECK: ret void
void test_vst2q_p64(poly64_t * ptr, poly64x2x2_t val) {
// CHECK-LABEL: test_vst2q_p64
return vst2q_p64(ptr, val);
// CHECK: st2 {{{ *v[0-9]+.2d, v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}]
}
// CHECK-LABEL: define void @test_vst3_p64(i64* %ptr, [3 x <1 x i64>] %val.coerce) #0 {
// CHECK: [[VAL:%.*]] = alloca %struct.poly64x1x3_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x3_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[VAL]], i32 0, i32 0
// CHECK: store [3 x <1 x i64>] %val.coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x1x3_t* [[VAL]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %ptr to i8*
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 0
// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
// CHECK: [[VAL2:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX3:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL2]], i64 0, i64 1
// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX3]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
// CHECK: [[VAL4:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX5:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL4]], i64 0, i64 2
// CHECK: [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX5]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
// CHECK: call void @llvm.aarch64.neon.st3.v1i64.p0i8(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i8* [[TMP2]])
// CHECK: ret void
void test_vst3_p64(poly64_t * ptr, poly64x1x3_t val) {
// CHECK-LABEL: test_vst3_p64
return vst3_p64(ptr, val);
// CHECK: st1 {{{ *v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d *}}}, [{{x[0-9]+|sp}}]
}
// CHECK-LABEL: define void @test_vst3q_p64(i64* %ptr, [3 x <2 x i64>] %val.coerce) #0 {
// CHECK: [[VAL:%.*]] = alloca %struct.poly64x2x3_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x3_t, align 16
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[VAL]], i32 0, i32 0
// CHECK: store [3 x <2 x i64>] %val.coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x2x3_t* [[VAL]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %ptr to i8*
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 0
// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
// CHECK: [[VAL2:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX3:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL2]], i64 0, i64 1
// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX3]], align 16
// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
// CHECK: [[VAL4:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX5:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL4]], i64 0, i64 2
// CHECK: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX5]], align 16
// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
// CHECK: call void @llvm.aarch64.neon.st3.v2i64.p0i8(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i8* [[TMP2]])
// CHECK: ret void
void test_vst3q_p64(poly64_t * ptr, poly64x2x3_t val) {
// CHECK-LABEL: test_vst3q_p64
return vst3q_p64(ptr, val);
// CHECK: st3 {{{ *v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}]
}
// CHECK-LABEL: define void @test_vst4_p64(i64* %ptr, [4 x <1 x i64>] %val.coerce) #0 {
// CHECK: [[VAL:%.*]] = alloca %struct.poly64x1x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[VAL]], i32 0, i32 0
// CHECK: store [4 x <1 x i64>] %val.coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x1x4_t* [[VAL]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %ptr to i8*
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i64 0, i64 0
// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
// CHECK: [[VAL2:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX3:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL2]], i64 0, i64 1
// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX3]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
// CHECK: [[VAL4:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX5:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL4]], i64 0, i64 2
// CHECK: [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX5]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
// CHECK: [[VAL6:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX7:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL6]], i64 0, i64 3
// CHECK: [[TMP9:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX7]], align 8
// CHECK: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
// CHECK: call void @llvm.aarch64.neon.st4.v1i64.p0i8(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i8* [[TMP2]])
// CHECK: ret void
void test_vst4_p64(poly64_t * ptr, poly64x1x4_t val) {
// CHECK-LABEL: test_vst4_p64
return vst4_p64(ptr, val);
// CHECK: st1 {{{ *v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d *}}}, [{{x[0-9]+|sp}}]
}
// CHECK-LABEL: define void @test_vst4q_p64(i64* %ptr, [4 x <2 x i64>] %val.coerce) #0 {
// CHECK: [[VAL:%.*]] = alloca %struct.poly64x2x4_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x4_t, align 16
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[VAL]], i32 0, i32 0
// CHECK: store [4 x <2 x i64>] %val.coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x2x4_t* [[VAL]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %ptr to i8*
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 0
// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
// CHECK: [[VAL2:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX3:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL2]], i64 0, i64 1
// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX3]], align 16
// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
// CHECK: [[VAL4:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX5:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL4]], i64 0, i64 2
// CHECK: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX5]], align 16
// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
// CHECK: [[VAL6:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX7:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL6]], i64 0, i64 3
// CHECK: [[TMP9:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX7]], align 16
// CHECK: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
// CHECK: call void @llvm.aarch64.neon.st4.v2i64.p0i8(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], i8* [[TMP2]])
// CHECK: ret void
void test_vst4q_p64(poly64_t * ptr, poly64x2x4_t val) {
// CHECK-LABEL: test_vst4q_p64
return vst4q_p64(ptr, val);
// CHECK: st4 {{{ *v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}]
}
// CHECK-LABEL: define <1 x i64> @test_vext_p64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK: [[VEXT:%.*]] = shufflevector <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
// CHECK: ret <1 x i64> [[VEXT]]
poly64x1_t test_vext_p64(poly64x1_t a, poly64x1_t b) {
// CHECK-LABEL: test_vext_p64
return vext_u64(a, b, 0);
}
// CHECK-LABEL: define <2 x i64> @test_vextq_p64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK: [[VEXT:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i32> <i32 1, i32 2>
// CHECK: ret <2 x i64> [[VEXT]]
poly64x2_t test_vextq_p64(poly64x2_t a, poly64x2_t b) {
// CHECK-LABEL: test_vextq_p64
return vextq_p64(a, b, 1);
// CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{#0x8|#8}}
}
// CHECK-LABEL: define <2 x i64> @test_vzip1q_p64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
// CHECK: ret <2 x i64> [[SHUFFLE_I]]
poly64x2_t test_vzip1q_p64(poly64x2_t a, poly64x2_t b) {
// CHECK-LABEL: test_vzip1q_p64
return vzip1q_p64(a, b);
// CHECK-ARM64: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
}
// CHECK-LABEL: define <2 x i64> @test_vzip2q_p64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
// CHECK: ret <2 x i64> [[SHUFFLE_I]]
poly64x2_t test_vzip2q_p64(poly64x2_t a, poly64x2_t b) {
// CHECK-LABEL: test_vzip2q_p64
return vzip2q_u64(a, b);
// CHECK-ARM64: zip2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
}
// CHECK-LABEL: define <2 x i64> @test_vuzp1q_p64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
// CHECK: ret <2 x i64> [[SHUFFLE_I]]
poly64x2_t test_vuzp1q_p64(poly64x2_t a, poly64x2_t b) {
// CHECK-LABEL: test_vuzp1q_p64
return vuzp1q_p64(a, b);
// CHECK-ARM64: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
}
// CHECK-LABEL: define <2 x i64> @test_vuzp2q_p64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
// CHECK: ret <2 x i64> [[SHUFFLE_I]]
poly64x2_t test_vuzp2q_p64(poly64x2_t a, poly64x2_t b) {
// CHECK-LABEL: test_vuzp2q_p64
return vuzp2q_u64(a, b);
// CHECK-ARM64: zip2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
}
// CHECK-LABEL: define <2 x i64> @test_vtrn1q_p64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
// CHECK: ret <2 x i64> [[SHUFFLE_I]]
poly64x2_t test_vtrn1q_p64(poly64x2_t a, poly64x2_t b) {
// CHECK-LABEL: test_vtrn1q_p64
return vtrn1q_p64(a, b);
// CHECK-ARM64: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
}
// CHECK-LABEL: define <2 x i64> @test_vtrn2q_p64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
// CHECK: ret <2 x i64> [[SHUFFLE_I]]
poly64x2_t test_vtrn2q_p64(poly64x2_t a, poly64x2_t b) {
// CHECK-LABEL: test_vtrn2q_p64
return vtrn2q_u64(a, b);
// CHECK-ARM64: zip2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
}
// CHECK-LABEL: define <1 x i64> @test_vsri_n_p64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK: [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK: [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK: [[VSRI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> [[VSRI_N]], <1 x i64> [[VSRI_N1]], i32 33)
// CHECK: ret <1 x i64> [[VSRI_N2]]
poly64x1_t test_vsri_n_p64(poly64x1_t a, poly64x1_t b) {
// CHECK-LABEL: test_vsri_n_p64
return vsri_n_p64(a, b, 33);
// CHECK: sri {{d[0-9]+}}, {{d[0-9]+}}, #33
}
// CHECK-LABEL: define <2 x i64> @test_vsriq_n_p64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK: [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK: [[VSRI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsri.v2i64(<2 x i64> [[VSRI_N]], <2 x i64> [[VSRI_N1]], i32 64)
// CHECK: ret <2 x i64> [[VSRI_N2]]
poly64x2_t test_vsriq_n_p64(poly64x2_t a, poly64x2_t b) {
// CHECK-LABEL: test_vsriq_n_p64
return vsriq_n_p64(a, b, 64);
// CHECK: sri {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #64
}

View File

@ -1,5 +1,5 @@
// RUN: %clang_cc1 -triple arm-none-eabi -ffreestanding -emit-llvm -o - -O3 %s | FileCheck %s
// RUN: %clang_cc1 -triple aarch64 -ffreestanding -emit-llvm -o - -O3 %s | FileCheck %s
// RUN: %clang_cc1 -triple arm-none-eabi -ffreestanding -emit-llvm -o - %s | FileCheck %s
// RUN: %clang_cc1 -triple aarch64 -ffreestanding -emit-llvm -o - %s | FileCheck %s
extern struct T {
int b0 : 8;

View File

@ -1,6 +1,5 @@
// REQUIRES: arm-registered-target
// RUN: %clang_cc1 -triple armv8-none-linux-gnueabi \
// RUN: -O3 -S -emit-llvm -o - %s | FileCheck %s
// RUN: -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
int crc32b(int a, char b)
{
@ -48,7 +47,7 @@ int crc32d(int a, long long b)
// CHECK: [[T0:%[0-9]+]] = trunc i64 %b to i32
// CHECK: [[T1:%[0-9]+]] = lshr i64 %b, 32
// CHECK: [[T2:%[0-9]+]] = trunc i64 [[T1]] to i32
// CHECK: [[T3:%[0-9]+]] = tail call i32 @llvm.arm.crc32w(i32 %a, i32 [[T0]])
// CHECK: [[T3:%[0-9]+]] = call i32 @llvm.arm.crc32w(i32 %a, i32 [[T0]])
// CHECK: call i32 @llvm.arm.crc32w(i32 [[T3]], i32 [[T2]])
}
@ -58,6 +57,6 @@ int crc32cd(int a, long long b)
// CHECK: [[T0:%[0-9]+]] = trunc i64 %b to i32
// CHECK: [[T1:%[0-9]+]] = lshr i64 %b, 32
// CHECK: [[T2:%[0-9]+]] = trunc i64 [[T1]] to i32
// CHECK: [[T3:%[0-9]+]] = tail call i32 @llvm.arm.crc32cw(i32 %a, i32 [[T0]])
// CHECK: [[T3:%[0-9]+]] = call i32 @llvm.arm.crc32cw(i32 %a, i32 [[T0]])
// CHECK: call i32 @llvm.arm.crc32cw(i32 [[T3]], i32 [[T2]])
}

View File

@ -1,75 +1,135 @@
// RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a57 -ffreestanding -O1 -emit-llvm %s -o - | FileCheck %s
// RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a57 -ffreestanding -emit-llvm %s -o - | opt -S -mem2reg | FileCheck %s
#include <arm_neon.h>
// CHECK-LABEL: define <2 x float> @test_vrnda_f32(<2 x float> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[VRNDA_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK: [[VRNDA_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrinta.v2f32(<2 x float> [[VRNDA_V_I]]) #2
// CHECK: [[VRNDA_V2_I:%.*]] = bitcast <2 x float> [[VRNDA_V1_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VRNDA_V2_I]] to <2 x float>
// CHECK: ret <2 x float> [[TMP1]]
float32x2_t test_vrnda_f32(float32x2_t a) {
// CHECK-LABEL: test_vrnda_f32
// CHECK: call <2 x float> @llvm.arm.neon.vrinta.v2f32(<2 x float> %a)
return vrnda_f32(a);
}
// CHECK-LABEL: define <4 x float> @test_vrndaq_f32(<4 x float> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[VRNDAQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK: [[VRNDAQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrinta.v4f32(<4 x float> [[VRNDAQ_V_I]]) #2
// CHECK: [[VRNDAQ_V2_I:%.*]] = bitcast <4 x float> [[VRNDAQ_V1_I]] to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VRNDAQ_V2_I]] to <4 x float>
// CHECK: ret <4 x float> [[TMP1]]
float32x4_t test_vrndaq_f32(float32x4_t a) {
// CHECK-LABEL: test_vrndaq_f32
// CHECK: call <4 x float> @llvm.arm.neon.vrinta.v4f32(<4 x float> %a)
return vrndaq_f32(a);
}
// CHECK-LABEL: define <2 x float> @test_vrndm_f32(<2 x float> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[VRNDM_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK: [[VRNDM_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintm.v2f32(<2 x float> [[VRNDM_V_I]]) #2
// CHECK: [[VRNDM_V2_I:%.*]] = bitcast <2 x float> [[VRNDM_V1_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VRNDM_V2_I]] to <2 x float>
// CHECK: ret <2 x float> [[TMP1]]
float32x2_t test_vrndm_f32(float32x2_t a) {
// CHECK-LABEL: test_vrndm_f32
// CHECK: call <2 x float> @llvm.arm.neon.vrintm.v2f32(<2 x float> %a)
return vrndm_f32(a);
}
// CHECK-LABEL: define <4 x float> @test_vrndmq_f32(<4 x float> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[VRNDMQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK: [[VRNDMQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintm.v4f32(<4 x float> [[VRNDMQ_V_I]]) #2
// CHECK: [[VRNDMQ_V2_I:%.*]] = bitcast <4 x float> [[VRNDMQ_V1_I]] to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VRNDMQ_V2_I]] to <4 x float>
// CHECK: ret <4 x float> [[TMP1]]
float32x4_t test_vrndmq_f32(float32x4_t a) {
// CHECK-LABEL: test_vrndmq_f32
// CHECK: call <4 x float> @llvm.arm.neon.vrintm.v4f32(<4 x float> %a)
return vrndmq_f32(a);
}
// CHECK-LABEL: define <2 x float> @test_vrndn_f32(<2 x float> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[VRNDN_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK: [[VRNDN_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintn.v2f32(<2 x float> [[VRNDN_V_I]]) #2
// CHECK: [[VRNDN_V2_I:%.*]] = bitcast <2 x float> [[VRNDN_V1_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VRNDN_V2_I]] to <2 x float>
// CHECK: ret <2 x float> [[TMP1]]
float32x2_t test_vrndn_f32(float32x2_t a) {
// CHECK-LABEL: test_vrndn_f32
// CHECK: call <2 x float> @llvm.arm.neon.vrintn.v2f32(<2 x float> %a)
return vrndn_f32(a);
}
// CHECK-LABEL: define <4 x float> @test_vrndnq_f32(<4 x float> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[VRNDNQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK: [[VRNDNQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintn.v4f32(<4 x float> [[VRNDNQ_V_I]]) #2
// CHECK: [[VRNDNQ_V2_I:%.*]] = bitcast <4 x float> [[VRNDNQ_V1_I]] to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VRNDNQ_V2_I]] to <4 x float>
// CHECK: ret <4 x float> [[TMP1]]
float32x4_t test_vrndnq_f32(float32x4_t a) {
// CHECK-LABEL: test_vrndnq_f32
// CHECK: call <4 x float> @llvm.arm.neon.vrintn.v4f32(<4 x float> %a)
return vrndnq_f32(a);
}
// CHECK-LABEL: define <2 x float> @test_vrndp_f32(<2 x float> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[VRNDP_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK: [[VRNDP_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintp.v2f32(<2 x float> [[VRNDP_V_I]]) #2
// CHECK: [[VRNDP_V2_I:%.*]] = bitcast <2 x float> [[VRNDP_V1_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VRNDP_V2_I]] to <2 x float>
// CHECK: ret <2 x float> [[TMP1]]
float32x2_t test_vrndp_f32(float32x2_t a) {
// CHECK-LABEL: test_vrndp_f32
// CHECK: call <2 x float> @llvm.arm.neon.vrintp.v2f32(<2 x float> %a)
return vrndp_f32(a);
}
// CHECK-LABEL: define <4 x float> @test_vrndpq_f32(<4 x float> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[VRNDPQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK: [[VRNDPQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintp.v4f32(<4 x float> [[VRNDPQ_V_I]]) #2
// CHECK: [[VRNDPQ_V2_I:%.*]] = bitcast <4 x float> [[VRNDPQ_V1_I]] to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VRNDPQ_V2_I]] to <4 x float>
// CHECK: ret <4 x float> [[TMP1]]
float32x4_t test_vrndpq_f32(float32x4_t a) {
// CHECK-LABEL: test_vrndpq_f32
// CHECK: call <4 x float> @llvm.arm.neon.vrintp.v4f32(<4 x float> %a)
return vrndpq_f32(a);
}
// CHECK-LABEL: define <2 x float> @test_vrndx_f32(<2 x float> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[VRNDX_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK: [[VRNDX_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintx.v2f32(<2 x float> [[VRNDX_V_I]]) #2
// CHECK: [[VRNDX_V2_I:%.*]] = bitcast <2 x float> [[VRNDX_V1_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VRNDX_V2_I]] to <2 x float>
// CHECK: ret <2 x float> [[TMP1]]
float32x2_t test_vrndx_f32(float32x2_t a) {
// CHECK-LABEL: test_vrndx_f32
// CHECK: call <2 x float> @llvm.arm.neon.vrintx.v2f32(<2 x float> %a)
return vrndx_f32(a);
}
// CHECK-LABEL: define <4 x float> @test_vrndxq_f32(<4 x float> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[VRNDXQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK: [[VRNDXQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintx.v4f32(<4 x float> [[VRNDXQ_V_I]]) #2
// CHECK: [[VRNDXQ_V2_I:%.*]] = bitcast <4 x float> [[VRNDXQ_V1_I]] to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VRNDXQ_V2_I]] to <4 x float>
// CHECK: ret <4 x float> [[TMP1]]
float32x4_t test_vrndxq_f32(float32x4_t a) {
// CHECK-LABEL: test_vrndxq_f32
// CHECK: call <4 x float> @llvm.arm.neon.vrintx.v4f32(<4 x float> %a)
return vrndxq_f32(a);
}
// CHECK-LABEL: define <2 x float> @test_vrnd_f32(<2 x float> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[VRND_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK: [[VRND_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintz.v2f32(<2 x float> [[VRND_V_I]]) #2
// CHECK: [[VRND_V2_I:%.*]] = bitcast <2 x float> [[VRND_V1_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VRND_V2_I]] to <2 x float>
// CHECK: ret <2 x float> [[TMP1]]
float32x2_t test_vrnd_f32(float32x2_t a) {
// CHECK-LABEL: test_vrnd_f32
// CHECK: call <2 x float> @llvm.arm.neon.vrintz.v2f32(<2 x float> %a)
return vrnd_f32(a);
}
// CHECK-LABEL: define <4 x float> @test_vrndq_f32(<4 x float> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[VRNDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK: [[VRNDQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintz.v4f32(<4 x float> [[VRNDQ_V_I]]) #2
// CHECK: [[VRNDQ_V2_I:%.*]] = bitcast <4 x float> [[VRNDQ_V1_I]] to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VRNDQ_V2_I]] to <4 x float>
// CHECK: ret <4 x float> [[TMP1]]
float32x4_t test_vrndq_f32(float32x4_t a) {
// CHECK-LABEL: test_vrndq_f32
// CHECK: call <4 x float> @llvm.arm.neon.vrintz.v4f32(<4 x float> %a)
return vrndq_f32(a);
}

View File

@ -1,19 +1,34 @@
// REQUIRES: arm-registered-target
// RUN: %clang_cc1 -triple thumbv7-none-linux-gnueabihf \
// RUN: -target-abi aapcs \
// RUN: -target-cpu cortex-a8 \
// RUN: -mfloat-abi hard \
// RUN: -ffreestanding \
// RUN: -O3 -S -emit-llvm -o - %s | FileCheck %s
// RUN: -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
#include <arm_neon.h>
// CHECK-LABEL: define <2 x float> @test_fma_order(<2 x float> %accum, <2 x float> %lhs, <2 x float> %rhs) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %accum to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %lhs to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %rhs to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
// CHECK: [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x float> [[TMP3]]) #2
// CHECK: ret <2 x float> [[TMP6]]
float32x2_t test_fma_order(float32x2_t accum, float32x2_t lhs, float32x2_t rhs) {
return vfma_f32(accum, lhs, rhs);
// CHECK: call <2 x float> @llvm.fma.v2f32(<2 x float> %lhs, <2 x float> %rhs, <2 x float> %accum)
}
// CHECK-LABEL: define <4 x float> @test_fmaq_order(<4 x float> %accum, <4 x float> %lhs, <4 x float> %rhs) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %accum to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %lhs to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %rhs to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
// CHECK: [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x float> [[TMP3]]) #2
// CHECK: ret <4 x float> [[TMP6]]
float32x4_t test_fmaq_order(float32x4_t accum, float32x4_t lhs, float32x4_t rhs) {
return vfmaq_f32(accum, lhs, rhs);
// CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %lhs, <4 x float> %rhs, <4 x float> %accum)
}

View File

@ -1,27 +1,55 @@
// RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a57 -ffreestanding -O1 -emit-llvm %s -o - | FileCheck %s
// RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a57 -ffreestanding -emit-llvm %s -o - | opt -S -mem2reg | FileCheck %s
#include <arm_neon.h>
// CHECK-LABEL: define <2 x float> @test_vmaxnm_f32(<2 x float> %a, <2 x float> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK: [[VMAXNM_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK: [[VMAXNM_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK: [[VMAXNM_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vmaxnm.v2f32(<2 x float> [[VMAXNM_V_I]], <2 x float> [[VMAXNM_V1_I]]) #2
// CHECK: [[VMAXNM_V3_I:%.*]] = bitcast <2 x float> [[VMAXNM_V2_I]] to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VMAXNM_V3_I]] to <2 x float>
// CHECK: ret <2 x float> [[TMP2]]
float32x2_t test_vmaxnm_f32(float32x2_t a, float32x2_t b) {
// CHECK-LABEL: test_vmaxnm_f32
// CHECK: call <2 x float> @llvm.arm.neon.vmaxnm.v2f32(<2 x float> %a, <2 x float> %b)
return vmaxnm_f32(a, b);
}
// CHECK-LABEL: define <4 x float> @test_vmaxnmq_f32(<4 x float> %a, <4 x float> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK: [[VMAXNMQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK: [[VMAXNMQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK: [[VMAXNMQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vmaxnm.v4f32(<4 x float> [[VMAXNMQ_V_I]], <4 x float> [[VMAXNMQ_V1_I]]) #2
// CHECK: [[VMAXNMQ_V3_I:%.*]] = bitcast <4 x float> [[VMAXNMQ_V2_I]] to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VMAXNMQ_V3_I]] to <4 x float>
// CHECK: ret <4 x float> [[TMP2]]
float32x4_t test_vmaxnmq_f32(float32x4_t a, float32x4_t b) {
// CHECK-LABEL: test_vmaxnmq_f32
// CHECK: call <4 x float> @llvm.arm.neon.vmaxnm.v4f32(<4 x float> %a, <4 x float> %b)
return vmaxnmq_f32(a, b);
}
// CHECK-LABEL: define <2 x float> @test_vminnm_f32(<2 x float> %a, <2 x float> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK: [[VMINNM_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK: [[VMINNM_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK: [[VMINNM_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vminnm.v2f32(<2 x float> [[VMINNM_V_I]], <2 x float> [[VMINNM_V1_I]]) #2
// CHECK: [[VMINNM_V3_I:%.*]] = bitcast <2 x float> [[VMINNM_V2_I]] to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VMINNM_V3_I]] to <2 x float>
// CHECK: ret <2 x float> [[TMP2]]
float32x2_t test_vminnm_f32(float32x2_t a, float32x2_t b) {
// CHECK-LABEL: test_vminnm_f32
// CHECK: call <2 x float> @llvm.arm.neon.vminnm.v2f32(<2 x float> %a, <2 x float> %b)
return vminnm_f32(a, b);
}
// CHECK-LABEL: define <4 x float> @test_vminnmq_f32(<4 x float> %a, <4 x float> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK: [[VMINNMQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK: [[VMINNMQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK: [[VMINNMQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vminnm.v4f32(<4 x float> [[VMINNMQ_V_I]], <4 x float> [[VMINNMQ_V1_I]]) #2
// CHECK: [[VMINNMQ_V3_I:%.*]] = bitcast <4 x float> [[VMINNMQ_V2_I]] to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VMINNMQ_V3_I]] to <4 x float>
// CHECK: ret <4 x float> [[TMP2]]
float32x4_t test_vminnmq_f32(float32x4_t a, float32x4_t b) {
// CHECK-LABEL: test_vminnmq_f32
// CHECK: call <4 x float> @llvm.arm.neon.vminnm.v4f32(<4 x float> %a, <4 x float> %b)
return vminnmq_f32(a, b);
}

View File

@ -2,7 +2,7 @@
// RUN: %clang_cc1 -triple thumbv7-apple-darwin \
// RUN: -target-cpu cortex-a8 \
// RUN: -ffreestanding \
// RUN: -emit-llvm -w -O1 -o - %s | FileCheck %s
// RUN: -emit-llvm -w -o - %s | opt -S -mem2reg | FileCheck %s
#include <arm_neon.h>
@ -27,19 +27,20 @@ uint8x8_t test_shift_vshr_umax(uint8x8_t a) {
uint8x8_t test_shift_vsra(uint8x8_t a, uint8x8_t b) {
// CHECK-LABEL: test_shift_vsra
// CHECK: %[[SHR:.*]] = lshr <8 x i8> %b, <i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5>
// CHECK: %{{.*}} = add <8 x i8> %[[SHR]], %a
// CHECK: %{{.*}} = add <8 x i8> %a, %[[SHR]]
return vsra_n_u8(a, b, 5);
}
int8x8_t test_shift_vsra_smax(int8x8_t a, int8x8_t b) {
// CHECK-LABEL: test_shift_vsra_smax
// CHECK: %[[SHR:.*]] = ashr <8 x i8> %b, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
// CHECK: %{{.*}} = add <8 x i8> %[[SHR]], %a
// CHECK: %{{.*}} = add <8 x i8> %a, %[[SHR]]
return vsra_n_s8(a, b, 8);
}
uint8x8_t test_shift_vsra_umax(uint8x8_t a, uint8x8_t b) {
// CHECK-LABEL: test_shift_vsra_umax
// CHECK: ret <8 x i8> %a
// CHECK: [[RES:%.*]] = add <8 x i8> %a, zeroinitializer
// CHECK: ret <8 x i8> [[RES]]
return vsra_n_u8(a, b, 8);
}

View File

@ -1,99 +1,147 @@
// RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a57 -ffreestanding -O1 -emit-llvm %s -o - | FileCheck %s
// RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a57 -ffreestanding -emit-llvm %s -o - | opt -S -mem2reg | FileCheck %s
#include <arm_neon.h>
// CHECK-LABEL: define <2 x i32> @test_vcvta_s32_f32(<2 x float> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[VCVTA_S32_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK: [[VCVTA_S32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtas.v2i32.v2f32(<2 x float> [[VCVTA_S32_V_I]]) #2
// CHECK: ret <2 x i32> [[VCVTA_S32_V1_I]]
int32x2_t test_vcvta_s32_f32(float32x2_t a) {
// CHECK-LABEL: test_vcvta_s32_f32
// CHECK: call <2 x i32> @llvm.arm.neon.vcvtas.v2i32.v2f32(<2 x float> %a)
return vcvta_s32_f32(a);
}
// CHECK-LABEL: define <2 x i32> @test_vcvta_u32_f32(<2 x float> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[VCVTA_U32_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK: [[VCVTA_U32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtau.v2i32.v2f32(<2 x float> [[VCVTA_U32_V_I]]) #2
// CHECK: ret <2 x i32> [[VCVTA_U32_V1_I]]
uint32x2_t test_vcvta_u32_f32(float32x2_t a) {
// CHECK-LABEL: test_vcvta_u32_f32
// CHECK: call <2 x i32> @llvm.arm.neon.vcvtau.v2i32.v2f32(<2 x float> %a)
return vcvta_u32_f32(a);
}
// CHECK-LABEL: define <4 x i32> @test_vcvtaq_s32_f32(<4 x float> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[VCVTAQ_S32_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK: [[VCVTAQ_S32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtas.v4i32.v4f32(<4 x float> [[VCVTAQ_S32_V_I]]) #2
// CHECK: ret <4 x i32> [[VCVTAQ_S32_V1_I]]
int32x4_t test_vcvtaq_s32_f32(float32x4_t a) {
// CHECK-LABEL: test_vcvtaq_s32_f32
// CHECK: call <4 x i32> @llvm.arm.neon.vcvtas.v4i32.v4f32(<4 x float> %a)
return vcvtaq_s32_f32(a);
}
// CHECK-LABEL: define <4 x i32> @test_vcvtaq_u32_f32(<4 x float> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[VCVTAQ_U32_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK: [[VCVTAQ_U32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtau.v4i32.v4f32(<4 x float> [[VCVTAQ_U32_V_I]]) #2
// CHECK: ret <4 x i32> [[VCVTAQ_U32_V1_I]]
uint32x4_t test_vcvtaq_u32_f32(float32x4_t a) {
// CHECK-LABEL: test_vcvtaq_u32_f32
// CHECK: call <4 x i32> @llvm.arm.neon.vcvtau.v4i32.v4f32(<4 x float> %a)
return vcvtaq_u32_f32(a);
}
// CHECK-LABEL: define <2 x i32> @test_vcvtn_s32_f32(<2 x float> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[VCVTN_S32_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK: [[VCVTN_S32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtns.v2i32.v2f32(<2 x float> [[VCVTN_S32_V_I]]) #2
// CHECK: ret <2 x i32> [[VCVTN_S32_V1_I]]
int32x2_t test_vcvtn_s32_f32(float32x2_t a) {
// CHECK-LABEL: test_vcvtn_s32_f32
// CHECK: call <2 x i32> @llvm.arm.neon.vcvtns.v2i32.v2f32(<2 x float> %a)
return vcvtn_s32_f32(a);
}
// CHECK-LABEL: define <2 x i32> @test_vcvtn_u32_f32(<2 x float> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[VCVTN_U32_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK: [[VCVTN_U32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtnu.v2i32.v2f32(<2 x float> [[VCVTN_U32_V_I]]) #2
// CHECK: ret <2 x i32> [[VCVTN_U32_V1_I]]
uint32x2_t test_vcvtn_u32_f32(float32x2_t a) {
// CHECK-LABEL: test_vcvtn_u32_f32
// CHECK: call <2 x i32> @llvm.arm.neon.vcvtnu.v2i32.v2f32(<2 x float> %a)
return vcvtn_u32_f32(a);
}
// CHECK-LABEL: define <4 x i32> @test_vcvtnq_s32_f32(<4 x float> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[VCVTNQ_S32_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK: [[VCVTNQ_S32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtns.v4i32.v4f32(<4 x float> [[VCVTNQ_S32_V_I]]) #2
// CHECK: ret <4 x i32> [[VCVTNQ_S32_V1_I]]
int32x4_t test_vcvtnq_s32_f32(float32x4_t a) {
// CHECK-LABEL: test_vcvtnq_s32_f32
// CHECK: call <4 x i32> @llvm.arm.neon.vcvtns.v4i32.v4f32(<4 x float> %a)
return vcvtnq_s32_f32(a);
}
// CHECK-LABEL: define <4 x i32> @test_vcvtnq_u32_f32(<4 x float> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[VCVTNQ_U32_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK: [[VCVTNQ_U32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtnu.v4i32.v4f32(<4 x float> [[VCVTNQ_U32_V_I]]) #2
// CHECK: ret <4 x i32> [[VCVTNQ_U32_V1_I]]
uint32x4_t test_vcvtnq_u32_f32(float32x4_t a) {
// CHECK-LABEL: test_vcvtnq_u32_f32
// CHECK: call <4 x i32> @llvm.arm.neon.vcvtnu.v4i32.v4f32(<4 x float> %a)
return vcvtnq_u32_f32(a);
}
// CHECK-LABEL: define <2 x i32> @test_vcvtp_s32_f32(<2 x float> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[VCVTP_S32_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK: [[VCVTP_S32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtps.v2i32.v2f32(<2 x float> [[VCVTP_S32_V_I]]) #2
// CHECK: ret <2 x i32> [[VCVTP_S32_V1_I]]
int32x2_t test_vcvtp_s32_f32(float32x2_t a) {
// CHECK-LABEL: test_vcvtp_s32_f32
// CHECK: call <2 x i32> @llvm.arm.neon.vcvtps.v2i32.v2f32(<2 x float> %a)
return vcvtp_s32_f32(a);
}
// CHECK-LABEL: define <2 x i32> @test_vcvtp_u32_f32(<2 x float> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[VCVTP_U32_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK: [[VCVTP_U32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtpu.v2i32.v2f32(<2 x float> [[VCVTP_U32_V_I]]) #2
// CHECK: ret <2 x i32> [[VCVTP_U32_V1_I]]
uint32x2_t test_vcvtp_u32_f32(float32x2_t a) {
// CHECK-LABEL: test_vcvtp_u32_f32
// CHECK: call <2 x i32> @llvm.arm.neon.vcvtpu.v2i32.v2f32(<2 x float> %a)
return vcvtp_u32_f32(a);
}
// CHECK-LABEL: define <4 x i32> @test_vcvtpq_s32_f32(<4 x float> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[VCVTPQ_S32_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK: [[VCVTPQ_S32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtps.v4i32.v4f32(<4 x float> [[VCVTPQ_S32_V_I]]) #2
// CHECK: ret <4 x i32> [[VCVTPQ_S32_V1_I]]
int32x4_t test_vcvtpq_s32_f32(float32x4_t a) {
// CHECK-LABEL: test_vcvtpq_s32_f32
// CHECK: call <4 x i32> @llvm.arm.neon.vcvtps.v4i32.v4f32(<4 x float> %a)
return vcvtpq_s32_f32(a);
}
// CHECK-LABEL: define <4 x i32> @test_vcvtpq_u32_f32(<4 x float> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[VCVTPQ_U32_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK: [[VCVTPQ_U32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtpu.v4i32.v4f32(<4 x float> [[VCVTPQ_U32_V_I]]) #2
// CHECK: ret <4 x i32> [[VCVTPQ_U32_V1_I]]
uint32x4_t test_vcvtpq_u32_f32(float32x4_t a) {
// CHECK-LABEL: test_vcvtpq_u32_f32
// CHECK: call <4 x i32> @llvm.arm.neon.vcvtpu.v4i32.v4f32(<4 x float> %a)
return vcvtpq_u32_f32(a);
}
// CHECK-LABEL: define <2 x i32> @test_vcvtm_s32_f32(<2 x float> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[VCVTM_S32_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK: [[VCVTM_S32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtms.v2i32.v2f32(<2 x float> [[VCVTM_S32_V_I]]) #2
// CHECK: ret <2 x i32> [[VCVTM_S32_V1_I]]
int32x2_t test_vcvtm_s32_f32(float32x2_t a) {
// CHECK-LABEL: test_vcvtm_s32_f32
// CHECK: call <2 x i32> @llvm.arm.neon.vcvtms.v2i32.v2f32(<2 x float> %a)
return vcvtm_s32_f32(a);
}
// CHECK-LABEL: define <2 x i32> @test_vcvtm_u32_f32(<2 x float> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[VCVTM_U32_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK: [[VCVTM_U32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtmu.v2i32.v2f32(<2 x float> [[VCVTM_U32_V_I]]) #2
// CHECK: ret <2 x i32> [[VCVTM_U32_V1_I]]
uint32x2_t test_vcvtm_u32_f32(float32x2_t a) {
// CHECK-LABEL: test_vcvtm_u32_f32
// CHECK: call <2 x i32> @llvm.arm.neon.vcvtmu.v2i32.v2f32(<2 x float> %a)
return vcvtm_u32_f32(a);
}
// CHECK-LABEL: define <4 x i32> @test_vcvtmq_s32_f32(<4 x float> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[VCVTMQ_S32_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK: [[VCVTMQ_S32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtms.v4i32.v4f32(<4 x float> [[VCVTMQ_S32_V_I]]) #2
// CHECK: ret <4 x i32> [[VCVTMQ_S32_V1_I]]
int32x4_t test_vcvtmq_s32_f32(float32x4_t a) {
// CHECK-LABEL: test_vcvtmq_s32_f32
// CHECK: call <4 x i32> @llvm.arm.neon.vcvtms.v4i32.v4f32(<4 x float> %a)
return vcvtmq_s32_f32(a);
}
// CHECK-LABEL: define <4 x i32> @test_vcvtmq_u32_f32(<4 x float> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[VCVTMQ_U32_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK: [[VCVTMQ_U32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtmu.v4i32.v4f32(<4 x float> [[VCVTMQ_U32_V_I]]) #2
// CHECK: ret <4 x i32> [[VCVTMQ_U32_V1_I]]
uint32x4_t test_vcvtmq_u32_f32(float32x4_t a) {
// CHECK-LABEL: test_vcvtmq_u32_f32
// CHECK: call <4 x i32> @llvm.arm.neon.vcvtmu.v4i32.v4f32(<4 x float> %a)
return vcvtmq_u32_f32(a);
}

View File

@ -1,124 +1,123 @@
// REQUIRES: arm-registered-target
// RUN: %clang_cc1 -triple thumbv7-apple-darwin \
// RUN: -target-abi apcs-gnu \
// RUN: -target-cpu cortex-a8 \
// RUN: -mfloat-abi soft \
// RUN: -target-feature +soft-float-abi \
// RUN: -ffreestanding \
// RUN: -emit-llvm -w -O1 -o - %s | FileCheck %s
// RUN: -emit-llvm -w -o - %s | opt -S -mem2reg | FileCheck %s
#include <arm_neon.h>
// Check that the vget_low/vget_high intrinsics generate a single shuffle
// without any bitcasting.
int8x8_t low_s8(int8x16_t a) {
// CHECK: shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// CHECK: shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
return vget_low_s8(a);
}
uint8x8_t low_u8 (uint8x16_t a) {
// CHECK: shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// CHECK: shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
return vget_low_u8(a);
}
int16x4_t low_s16( int16x8_t a) {
// CHECK: shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK: shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
return vget_low_s16(a);
}
uint16x4_t low_u16(uint16x8_t a) {
// CHECK: shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK: shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
return vget_low_u16(a);
}
int32x2_t low_s32( int32x4_t a) {
// CHECK: shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
// CHECK: shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 0, i32 1>
return vget_low_s32(a);
}
uint32x2_t low_u32(uint32x4_t a) {
// CHECK: shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
// CHECK: shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 0, i32 1>
return vget_low_u32(a);
}
int64x1_t low_s64( int64x2_t a) {
// CHECK: shufflevector <2 x i64> %a, <2 x i64> undef, <1 x i32> zeroinitializer
// CHECK: shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> zeroinitializer
return vget_low_s64(a);
}
uint64x1_t low_u64(uint64x2_t a) {
// CHECK: shufflevector <2 x i64> %a, <2 x i64> undef, <1 x i32> zeroinitializer
// CHECK: shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> zeroinitializer
return vget_low_u64(a);
}
poly8x8_t low_p8 (poly8x16_t a) {
// CHECK: shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// CHECK: shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
return vget_low_p8(a);
}
poly16x4_t low_p16(poly16x8_t a) {
// CHECK: shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK: shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
return vget_low_p16(a);
}
float32x2_t low_f32(float32x4_t a) {
// CHECK: shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 1>
// CHECK: shufflevector <4 x float> %a, <4 x float> %a, <2 x i32> <i32 0, i32 1>
return vget_low_f32(a);
}
int8x8_t high_s8(int8x16_t a) {
// CHECK: shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK: shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
return vget_high_s8(a);
}
uint8x8_t high_u8 (uint8x16_t a) {
// CHECK: shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK: shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
return vget_high_u8(a);
}
int16x4_t high_s16( int16x8_t a) {
// CHECK: shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
return vget_high_s16(a);
}
uint16x4_t high_u16(uint16x8_t a) {
// CHECK: shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
return vget_high_u16(a);
}
int32x2_t high_s32( int32x4_t a) {
// CHECK: shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
// CHECK: shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
return vget_high_s32(a);
}
uint32x2_t high_u32(uint32x4_t a) {
// CHECK: shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
// CHECK: shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
return vget_high_u32(a);
}
int64x1_t high_s64( int64x2_t a) {
// CHECK: shufflevector <2 x i64> %a, <2 x i64> undef, <1 x i32> <i32 1>
// CHECK: shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> <i32 1>
return vget_high_s64(a);
}
uint64x1_t high_u64(uint64x2_t a) {
// CHECK: shufflevector <2 x i64> %a, <2 x i64> undef, <1 x i32> <i32 1>
// CHECK: shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> <i32 1>
return vget_high_u64(a);
}
poly8x8_t high_p8 (poly8x16_t a) {
// CHECK: shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK: shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
return vget_high_p8(a);
}
poly16x4_t high_p16(poly16x8_t a) {
// CHECK: shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
return vget_high_p16(a);
}
float32x2_t high_f32(float32x4_t a) {
// CHECK: shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 2, i32 3>
// CHECK: shufflevector <4 x float> %a, <4 x float> %a, <2 x i32> <i32 2, i32 3>
return vget_high_f32(a);
}

View File

@ -1,6 +1,4 @@
// REQUIRES: aarch64-registered-target
// RUN: %clang_cc1 -triple aarch64_be-linux-gnu -ffreestanding -emit-llvm -O0 -o - %s | FileCheck --check-prefix IR %s
// RUN: %clang_cc1 -triple aarch64_be-linux-gnu -ffreestanding -S -O1 -o - %s | FileCheck --check-prefix ARM %s
struct bt3 { signed b2:10; signed b3:10; } b16;
@ -10,6 +8,5 @@ signed callee_b0f(struct bt3 bp11) {
// IR: store i64 [[ARG]], i64* [[PTR:%.*]], align 8
// IR: [[BITCAST:%.*]] = bitcast i64* [[PTR]] to i8*
// IR: call void @llvm.memcpy.p0i8.p0i8.i64(i8* {{.*}}, i8* [[BITCAST]], i64 4
// ARM: asr x0, x0, #54
return bp11.b2;
}

View File

@ -1,6 +1,6 @@
// REQUIRES: aarch64-registered-target
// RUN: %clang_cc1 -triple arm64-none-linux-gnu \
// RUN: -O3 -S -emit-llvm -o - %s | FileCheck %s
// RUN: -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
int crc32b(int a, char b)
{

View File

@ -1,74 +1,127 @@
// RUN: %clang_cc1 -O3 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -emit-llvm -o - %s | FileCheck %s
// RUN: %clang_cc1 -O3 -triple aarch64_be-linux-gnu -target-feature +neon -ffreestanding -emit-llvm -o - %s | FileCheck %s --check-prefix CHECK-BE
// RUN: %clang_cc1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
// RUN: %clang_cc1 -triple aarch64_be-linux-gnu -target-feature +neon -ffreestanding -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s --check-prefix CHECK-BE
#include <arm_neon.h>
// CHECK-LABEL: @test_vdupb_lane_s8
int8_t test_vdupb_lane_s8(int8x8_t src) {
return vdupb_lane_s8(src, 2);
// CHECK-LABEL: @test_vdupb_lane_s8
// CHECK: extractelement <8 x i8> %src, i32 2
// CHECK-BE: extractelement <8 x i8> %src, i32 5
// CHECK-BE-LABEL: @test_vdupb_lane_s8
// CHECK-BE: [[REV:%.*]] = shufflevector <8 x i8> {{.*}}, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
// CHECK-BE: extractelement <8 x i8> [[REV]], i32 2
}
// CHECK-LABEL: @test_vdupb_lane_u8
uint8_t test_vdupb_lane_u8(uint8x8_t src) {
return vdupb_lane_u8(src, 2);
// CHECK-LABEL: @test_vdupb_lane_u8
// CHECK: extractelement <8 x i8> %src, i32 2
// CHECK-BE: extractelement <8 x i8> %src, i32 5
// CHECK-BE-LABEL: @test_vdupb_lane_u8
// CHECK-BE: [[REV:%.*]] = shufflevector <8 x i8> {{.*}}, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
// CHECK-BE: extractelement <8 x i8> [[REV]], i32 2
}
// CHECK-LABEL: @test_vduph_lane_s16
int16_t test_vduph_lane_s16(int16x4_t src) {
return vduph_lane_s16(src, 2);
// CHECK: extractelement <4 x i16> %src, i32 2
// CHECK-BE: extractelement <4 x i16> %src, i32 1
// CHECK-LABEL: @test_vduph_lane_s16
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %src to [[TYPE:.*]]
// CHECK: [[TMP2:%.*]] = bitcast [[TYPE]] [[TMP1]] to <4 x i16>
// CHECK: extractelement <4 x i16> [[TMP2]], i32 2
// CHECK-BE-LABEL: @test_vduph_lane_s16
// CHECK-BE: [[REV:%.*]] = shufflevector <4 x i16> {{.*}}, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
// CHECK-BE: [[TMP1:%.*]] = bitcast <4 x i16> [[REV]] to [[TYPE:.*]]
// CHECK-BE: [[TMP2:%.*]] = bitcast [[TYPE]] [[TMP1]] to <4 x i16>
// CHECK-BE: extractelement <4 x i16> [[TMP2]], i32 2
}
// CHECK-LABEL: @test_vduph_lane_u16
uint16_t test_vduph_lane_u16(uint16x4_t src) {
return vduph_lane_u16(src, 2);
// CHECK: extractelement <4 x i16> %src, i32 2
// CHECK-BE: extractelement <4 x i16> %src, i32 1
// CHECK-LABEL: @test_vduph_lane_u16
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %src to [[TYPE:.*]]
// CHECK: [[TMP2:%.*]] = bitcast [[TYPE]] [[TMP1]] to <4 x i16>
// CHECK: extractelement <4 x i16> [[TMP2]], i32 2
// CHECK-BE-LABEL: @test_vduph_lane_u16
// CHECK-BE: [[REV:%.*]] = shufflevector <4 x i16> {{.*}}, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
// CHECK-BE: [[TMP1:%.*]] = bitcast <4 x i16> [[REV]] to [[TYPE:.*]]
// CHECK-BE: [[TMP2:%.*]] = bitcast [[TYPE]] [[TMP1]] to <4 x i16>
// CHECK-BE: extractelement <4 x i16> [[TMP2]], i32 2
}
// CHECK-LABEL: @test_vdups_lane_s32
int32_t test_vdups_lane_s32(int32x2_t src) {
return vdups_lane_s32(src, 0);
// CHECK: extractelement <2 x i32> %src, i32 0
// CHECK-BE: extractelement <2 x i32> %src, i32 1
// CHECK-LABEL: @test_vdups_lane_s32
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %src to [[TYPE:.*]]
// CHECK: [[TMP2:%.*]] = bitcast [[TYPE]] [[TMP1]] to <2 x i32>
// CHECK: extractelement <2 x i32> [[TMP2]], i32 0
// CHECK-BE-LABEL: @test_vdups_lane_s32
// CHECK-BE: [[REV:%.*]] = shufflevector <2 x i32> {{.*}}, <2 x i32> <i32 1, i32 0>
// CHECK-BE: [[TMP1:%.*]] = bitcast <2 x i32> [[REV]] to [[TYPE:.*]]
// CHECK-BE: [[TMP2:%.*]] = bitcast [[TYPE]] [[TMP1]] to <2 x i32>
// CHECK-BE: extractelement <2 x i32> [[TMP2]], i32 0
}
// CHECK-LABEL: @test_vdups_lane_u32
uint32_t test_vdups_lane_u32(uint32x2_t src) {
return vdups_lane_u32(src, 0);
// CHECK: extractelement <2 x i32> %src, i32 0
// CHECK-BE: extractelement <2 x i32> %src, i32 1
// CHECK-LABEL: @test_vdups_lane_u32
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %src to [[TYPE:.*]]
// CHECK: [[TMP2:%.*]] = bitcast [[TYPE]] [[TMP1]] to <2 x i32>
// CHECK: extractelement <2 x i32> [[TMP2]], i32 0
// CHECK-BE-LABEL: @test_vdups_lane_u32
// CHECK-BE: [[REV:%.*]] = shufflevector <2 x i32> {{.*}}, <2 x i32> <i32 1, i32 0>
// CHECK-BE: [[TMP1:%.*]] = bitcast <2 x i32> [[REV]] to [[TYPE:.*]]
// CHECK-BE: [[TMP2:%.*]] = bitcast [[TYPE]] [[TMP1]] to <2 x i32>
// CHECK-BE: extractelement <2 x i32> [[TMP2]], i32 0
}
// CHECK-LABEL: @test_vdups_lane_f32
float32_t test_vdups_lane_f32(float32x2_t src) {
return vdups_lane_f32(src, 0);
// CHECK: extractelement <2 x float> %src, i32 0
// CHECK-BE: extractelement <2 x float> %src, i32 1
// CHECK-LABEL: @test_vdups_lane_f32
// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %src to [[TYPE:.*]]
// CHECK: [[TMP2:%.*]] = bitcast [[TYPE]] [[TMP1]] to <2 x float>
// CHECK: extractelement <2 x float> [[TMP2]], i32 0
// CHECK-BE-LABEL: @test_vdups_lane_f32
// CHECK-BE: [[REV:%.*]] = shufflevector <2 x float> {{.*}}, <2 x i32> <i32 1, i32 0>
// CHECK-BE: [[TMP1:%.*]] = bitcast <2 x float> [[REV]] to [[TYPE:.*]]
// CHECK-BE: [[TMP2:%.*]] = bitcast [[TYPE]] [[TMP1]] to <2 x float>
// CHECK-BE: extractelement <2 x float> [[TMP2]], i32 0
}
// CHECK-LABEL: @test_vdupd_lane_s64
int64_t test_vdupd_lane_s64(int64x1_t src) {
return vdupd_lane_s64(src, 0);
// CHECK: extractelement <1 x i64> %src, i32 0
// CHECK-BE: extractelement <1 x i64> %src, i32 0
// CHECK-LABEL: @test_vdupd_lane_s64
// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %src to [[TYPE:.*]]
// CHECK: [[TMP2:%.*]] = bitcast [[TYPE]] [[TMP1]] to <1 x i64>
// CHECK: extractelement <1 x i64> [[TMP2]], i32 0
// CHECK-BE-LABEL: @test_vdupd_lane_s64
// CHECK-BE: extractelement <1 x i64> {{.*}}, i32 0
}
// CHECK-LABEL: @test_vdupd_lane_u64
uint64_t test_vdupd_lane_u64(uint64x1_t src) {
return vdupd_lane_u64(src, 0);
// CHECK: extractelement <1 x i64> %src, i32 0
// CHECK-BE: extractelement <1 x i64> %src, i32 0
// CHECK-LABEL: @test_vdupd_lane_u64
// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %src to [[TYPE:.*]]
// CHECK: [[TMP2:%.*]] = bitcast [[TYPE]] [[TMP1]] to <1 x i64>
// CHECK: extractelement <1 x i64> [[TMP2]], i32 0
// CHECK-BE-LABEL: @test_vdupd_lane_u64
// CHECK-BE: extractelement <1 x i64> {{.*}}, i32 0
}
// CHECK-LABEL: @test_vdupd_lane_f64
float64_t test_vdupd_lane_f64(float64x1_t src) {
return vdupd_lane_f64(src, 0);
// CHECK: extractelement <1 x double> %src, i32 0
// CHECK-BE: extractelement <1 x double> %src, i32 0
// CHECK-LABEL: @test_vdupd_lane_f64
// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %src to [[TYPE:.*]]
// CHECK: [[TMP2:%.*]] = bitcast [[TYPE]] [[TMP1]] to <1 x double>
// CHECK: extractelement <1 x double> [[TMP2]], i32 0
// CHECK-BE-LABEL: @test_vdupd_lane_f64
// CHECK-BE: extractelement <1 x double> {{.*}}, i32 0
}

View File

@ -1,69 +1,121 @@
// RUN: %clang_cc1 -O1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -S -o - -emit-llvm %s | FileCheck %s
// RUN: %clang_cc1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -S -o - -emit-llvm %s | opt -S -mem2reg | FileCheck %s
// Test ARM64 SIMD copy vector element to vector element: vcopyq_lane*
#include <arm_neon.h>
// CHECK-LABEL: define <16 x i8> @test_vcopyq_laneq_s8(<16 x i8> %a1, <16 x i8> %a2) #0 {
// CHECK: [[VGETQ_LANE:%.*]] = extractelement <16 x i8> %a2, i32 13
// CHECK: [[VSET_LANE:%.*]] = insertelement <16 x i8> %a1, i8 [[VGETQ_LANE]], i32 3
// CHECK: ret <16 x i8> [[VSET_LANE]]
int8x16_t test_vcopyq_laneq_s8(int8x16_t a1, int8x16_t a2) {
// CHECK-LABEL: test_vcopyq_laneq_s8
return vcopyq_laneq_s8(a1, (int64_t) 3, a2, (int64_t) 13);
// CHECK: shufflevector <16 x i8> %a1, <16 x i8> %a2, <16 x i32> <i32 0, i32 1, i32 2, i32 29, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
}
// CHECK-LABEL: define <16 x i8> @test_vcopyq_laneq_u8(<16 x i8> %a1, <16 x i8> %a2) #0 {
// CHECK: [[VGETQ_LANE:%.*]] = extractelement <16 x i8> %a2, i32 13
// CHECK: [[VSET_LANE:%.*]] = insertelement <16 x i8> %a1, i8 [[VGETQ_LANE]], i32 3
// CHECK: ret <16 x i8> [[VSET_LANE]]
uint8x16_t test_vcopyq_laneq_u8(uint8x16_t a1, uint8x16_t a2) {
// CHECK-LABEL: test_vcopyq_laneq_u8
return vcopyq_laneq_u8(a1, (int64_t) 3, a2, (int64_t) 13);
// CHECK: shufflevector <16 x i8> %a1, <16 x i8> %a2, <16 x i32> <i32 0, i32 1, i32 2, i32 29, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
}
// CHECK-LABEL: define <8 x i16> @test_vcopyq_laneq_s16(<8 x i16> %a1, <8 x i16> %a2) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a2 to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %a1 to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP3]], i16 [[VGETQ_LANE]], i32 3
// CHECK: ret <8 x i16> [[VSET_LANE]]
int16x8_t test_vcopyq_laneq_s16(int16x8_t a1, int16x8_t a2) {
// CHECK-LABEL: test_vcopyq_laneq_s16
return vcopyq_laneq_s16(a1, (int64_t) 3, a2, (int64_t) 7);
// CHECK: shufflevector <8 x i16> %a1, <8 x i16> %a2, <8 x i32> <i32 0, i32 1, i32 2, i32 15, i32 4, i32 5, i32 6, i32 7>
}
// CHECK-LABEL: define <8 x i16> @test_vcopyq_laneq_u16(<8 x i16> %a1, <8 x i16> %a2) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a2 to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %a1 to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP3]], i16 [[VGETQ_LANE]], i32 3
// CHECK: ret <8 x i16> [[VSET_LANE]]
uint16x8_t test_vcopyq_laneq_u16(uint16x8_t a1, uint16x8_t a2) {
// CHECK-LABEL: test_vcopyq_laneq_u16
return vcopyq_laneq_u16(a1, (int64_t) 3, a2, (int64_t) 7);
// CHECK: shufflevector <8 x i16> %a1, <8 x i16> %a2, <8 x i32> <i32 0, i32 1, i32 2, i32 15, i32 4, i32 5, i32 6, i32 7>
}
// CHECK-LABEL: define <4 x i32> @test_vcopyq_laneq_s32(<4 x i32> %a1, <4 x i32> %a2) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a2 to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %a1 to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[VGETQ_LANE]], i32 3
// CHECK: ret <4 x i32> [[VSET_LANE]]
int32x4_t test_vcopyq_laneq_s32(int32x4_t a1, int32x4_t a2) {
// CHECK-LABEL: test_vcopyq_laneq_s32
return vcopyq_laneq_s32(a1, (int64_t) 3, a2, (int64_t) 3);
// CHECK: shufflevector <4 x i32> %a1, <4 x i32> %a2, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
}
// CHECK-LABEL: define <4 x i32> @test_vcopyq_laneq_u32(<4 x i32> %a1, <4 x i32> %a2) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a2 to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %a1 to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[VGETQ_LANE]], i32 3
// CHECK: ret <4 x i32> [[VSET_LANE]]
uint32x4_t test_vcopyq_laneq_u32(uint32x4_t a1, uint32x4_t a2) {
// CHECK-LABEL: test_vcopyq_laneq_u32
return vcopyq_laneq_u32(a1, (int64_t) 3, a2, (int64_t) 3);
// CHECK: shufflevector <4 x i32> %a1, <4 x i32> %a2, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
}
// CHECK-LABEL: define <2 x i64> @test_vcopyq_laneq_s64(<2 x i64> %a1, <2 x i64> %a2) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a2 to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
// CHECK: [[TMP2:%.*]] = bitcast <2 x i64> %a1 to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[VGETQ_LANE]], i32 0
// CHECK: ret <2 x i64> [[VSET_LANE]]
int64x2_t test_vcopyq_laneq_s64(int64x2_t a1, int64x2_t a2) {
// CHECK-LABEL: test_vcopyq_laneq_s64
return vcopyq_laneq_s64(a1, (int64_t) 0, a2, (int64_t) 1);
// CHECK: shufflevector <2 x i64> %a1, <2 x i64> %a2, <2 x i32> <i32 3, i32 1>
}
// CHECK-LABEL: define <2 x i64> @test_vcopyq_laneq_u64(<2 x i64> %a1, <2 x i64> %a2) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a2 to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
// CHECK: [[TMP2:%.*]] = bitcast <2 x i64> %a1 to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[VGETQ_LANE]], i32 0
// CHECK: ret <2 x i64> [[VSET_LANE]]
uint64x2_t test_vcopyq_laneq_u64(uint64x2_t a1, uint64x2_t a2) {
// CHECK-LABEL: test_vcopyq_laneq_u64
return vcopyq_laneq_u64(a1, (int64_t) 0, a2, (int64_t) 1);
// CHECK: shufflevector <2 x i64> %a1, <2 x i64> %a2, <2 x i32> <i32 3, i32 1>
}
// CHECK-LABEL: define <4 x float> @test_vcopyq_laneq_f32(<4 x float> %a1, <4 x float> %a2) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a2 to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %a1 to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x float> [[TMP3]], float [[VGETQ_LANE]], i32 0
// CHECK: ret <4 x float> [[VSET_LANE]]
float32x4_t test_vcopyq_laneq_f32(float32x4_t a1, float32x4_t a2) {
// CHECK-LABEL: test_vcopyq_laneq_f32
return vcopyq_laneq_f32(a1, 0, a2, 3);
// CHECK: shufflevector <4 x float> %a1, <4 x float> %a2, <4 x i32> <i32 7, i32 1, i32 2, i32 3>
}
// CHECK-LABEL: define <2 x double> @test_vcopyq_laneq_f64(<2 x double> %a1, <2 x double> %a2) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a2 to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
// CHECK: [[TMP2:%.*]] = bitcast <2 x double> %a1 to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x double> [[TMP3]], double [[VGETQ_LANE]], i32 0
// CHECK: ret <2 x double> [[VSET_LANE]]
float64x2_t test_vcopyq_laneq_f64(float64x2_t a1, float64x2_t a2) {
// CHECK-LABEL: test_vcopyq_laneq_f64
return vcopyq_laneq_f64(a1, 0, a2, 1);
// CHECK: shufflevector <2 x double> %a1, <2 x double> %a2, <2 x i32> <i32 3, i32 1>
}

View File

@ -1,7 +1,6 @@
// RUN: %clang_cc1 -O1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -S -o - -emit-llvm %s | FileCheck %s
// RUN: %clang_cc1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -S -o - -emit-llvm %s | opt -S -mem2reg | FileCheck %s
// Test ARM64 SIMD vcreate intrinsics
/*#include <arm_neon.h>*/
#include <arm_neon.h>
float32x2_t test_vcreate_f32(uint64_t a1) {
@ -10,14 +9,3 @@ float32x2_t test_vcreate_f32(uint64_t a1) {
// CHECK: bitcast {{.*}} to <2 x float>
// CHECK-NEXT: ret
}
// FIXME enable when scalar_to_vector in backend is fixed. Also, change
// CHECK@ to CHECK<colon> and CHECK-NEXT@ to CHECK-NEXT<colon>
/*
float64x1_t test_vcreate_f64(uint64_t a1) {
// CHECK@ test_vcreate_f64
return vcreate_f64(a1);
// CHECK@ llvm.aarch64.neon.saddlv.i64.v2i32
// CHECK-NEXT@ ret
}
*/

View File

@ -1,88 +1,78 @@
// RUN: %clang_cc1 -O3 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -S -o - %s | FileCheck %s
// RUN: %clang_cc1 -O3 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -S -o - -emit-llvm %s | \
// RUN: FileCheck -check-prefix=CHECK-IR %s
// REQUIRES: aarch64-registered-target
/// Test vdupq_n_f64 and vmovq_n_f64 ARM64 intrinsics
// <rdar://problem/11778405> ARM64: vdupq_n_f64 and vdupq_lane_f64 intrinsics
// missing
// RUN: %clang_cc1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -fallow-half-arguments-and-returns -S -o - -emit-llvm %s | opt -S -mem2reg | FileCheck %s
#include <arm_neon.h>
// vdupq_n_f64 -> dup.2d v0, v0[0]
//
float64x2_t test_vdupq_n_f64(float64_t w)
{
// CHECK-LABEL: define <2 x double> @test_vdupq_n_f64(double %w) #0 {
// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x double> undef, double %w, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double %w, i32 1
// CHECK: ret <2 x double> [[VECINIT1_I]]
float64x2_t test_vdupq_n_f64(float64_t w) {
return vdupq_n_f64(w);
// CHECK-LABEL: test_vdupq_n_f64:
// CHECK: dup.2d v0, v0[0]
// CHECK-NEXT: ret
}
// might as well test this while we're here
// vdupq_n_f32 -> dup.4s v0, v0[0]
float32x4_t test_vdupq_n_f32(float32_t w)
{
// CHECK-LABEL: define <4 x float> @test_vdupq_n_f32(float %w) #0 {
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %w, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %w, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %w, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %w, i32 3
// CHECK: ret <4 x float> [[VECINIT3_I]]
float32x4_t test_vdupq_n_f32(float32_t w) {
return vdupq_n_f32(w);
// CHECK-LABEL: test_vdupq_n_f32:
// CHECK: dup.4s v0, v0[0]
// CHECK-NEXT: ret
}
// vdupq_lane_f64 -> dup.2d v0, v0[0]
// this was in <rdar://problem/11778405>, but had already been implemented,
// test anyway
float64x2_t test_vdupq_lane_f64(float64x1_t V)
{
// CHECK-LABEL: define <2 x double> @test_vdupq_lane_f64(<1 x double> %V) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x double> %V, <1 x double> %V, <2 x i32> zeroinitializer
// CHECK: ret <2 x double> [[SHUFFLE]]
float64x2_t test_vdupq_lane_f64(float64x1_t V) {
return vdupq_lane_f64(V, 0);
// CHECK-LABEL: test_vdupq_lane_f64:
// CHECK: dup.2d v0, v0[0]
// CHECK-NEXT: ret
}
// vmovq_n_f64 -> dup Vd.2d,X0
// this wasn't in <rdar://problem/11778405>, but it was between the vdups
float64x2_t test_vmovq_n_f64(float64_t w)
{
// CHECK-LABEL: define <2 x double> @test_vmovq_n_f64(double %w) #0 {
// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x double> undef, double %w, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double %w, i32 1
// CHECK: ret <2 x double> [[VECINIT1_I]]
float64x2_t test_vmovq_n_f64(float64_t w) {
return vmovq_n_f64(w);
// CHECK-LABEL: test_vmovq_n_f64:
// CHECK: dup.2d v0, v0[0]
// CHECK-NEXT: ret
}
float16x4_t test_vmov_n_f16(float16_t *a1)
{
// CHECK-IR-LABEL: test_vmov_n_f16
// CHECK-LABEL: define <4 x half> @test_vmov_n_f16(half* %a1) #0 {
// CHECK: [[TMP0:%.*]] = load half, half* %a1, align 2
// CHECK: [[VECINIT:%.*]] = insertelement <4 x half> undef, half [[TMP0]], i32 0
// CHECK: [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP0]], i32 1
// CHECK: [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[TMP0]], i32 2
// CHECK: [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[TMP0]], i32 3
// CHECK: ret <4 x half> [[VECINIT3]]
float16x4_t test_vmov_n_f16(float16_t *a1) {
return vmov_n_f16(*a1);
// CHECK-IR: insertelement {{.*}} i32 0{{ *$}}
// CHECK-IR: insertelement {{.*}} i32 1{{ *$}}
// CHECK-IR: insertelement {{.*}} i32 2{{ *$}}
// CHECK-IR: insertelement {{.*}} i32 3{{ *$}}
}
// Disable until scalar problem in backend is fixed. Change CHECK-IR@ to
// CHECK-IR<colon>
/*
float64x1_t test_vmov_n_f64(float64_t a1)
{
// CHECK-IR@ test_vmov_n_f64
float64x1_t test_vmov_n_f64(float64_t a1) {
return vmov_n_f64(a1);
// CHECK-IR@ insertelement {{.*}} i32 0{{ *$}}
}
*/
float16x8_t test_vmovq_n_f16(float16_t *a1)
{
// CHECK-IR-LABEL: test_vmovq_n_f16
// CHECK-LABEL: define <8 x half> @test_vmovq_n_f16(half* %a1) #0 {
// CHECK: [[TMP0:%.*]] = load half, half* %a1, align 2
// CHECK: [[VECINIT:%.*]] = insertelement <8 x half> undef, half [[TMP0]], i32 0
// CHECK: [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP0]], i32 1
// CHECK: [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[TMP0]], i32 2
// CHECK: [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[TMP0]], i32 3
// CHECK: [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[TMP0]], i32 4
// CHECK: [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[TMP0]], i32 5
// CHECK: [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[TMP0]], i32 6
// CHECK: [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[TMP0]], i32 7
// CHECK: ret <8 x half> [[VECINIT7]]
float16x8_t test_vmovq_n_f16(float16_t *a1) {
return vmovq_n_f16(*a1);
// CHECK-IR: insertelement {{.*}} i32 0{{ *$}}
// CHECK-IR: insertelement {{.*}} i32 1{{ *$}}
// CHECK-IR: insertelement {{.*}} i32 2{{ *$}}
// CHECK-IR: insertelement {{.*}} i32 3{{ *$}}
// CHECK-IR: insertelement {{.*}} i32 4{{ *$}}
// CHECK-IR: insertelement {{.*}} i32 5{{ *$}}
// CHECK-IR: insertelement {{.*}} i32 6{{ *$}}
// CHECK-IR: insertelement {{.*}} i32 7{{ *$}}
}

File diff suppressed because it is too large


@ -1,32 +1,6 @@
// REQUIRES: arm-registered-target
// RUN: %clang_cc1 -Wall -Werror -triple thumbv8-linux-gnueabi -fno-signed-char -O3 -emit-llvm -o - %s | FileCheck %s
// RUN: %clang_cc1 -Wall -Werror -triple arm64-apple-ios7.0 -O3 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-ARM64
// RUN: %clang_cc1 -Wall -Werror -triple thumbv8-linux-gnueabi -fno-signed-char -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
// RUN: %clang_cc1 -Wall -Werror -triple arm64-apple-ios7.0 -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s --check-prefix=CHECK-ARM64
// Make sure the canonical use works before going into smaller details:
int atomic_inc(int *addr) {
int Failure, OldVal;
do {
OldVal = __builtin_arm_ldrex(addr);
Failure = __builtin_arm_strex(OldVal + 1, addr);
} while (Failure);
return OldVal;
}
// CHECK-LABEL: @atomic_inc
// CHECK: [[OLDVAL:%.*]] = tail call i32 @llvm.arm.ldrex.p0i32(i32* %addr)
// CHECK: [[INC:%.*]] = add nsw i32 [[OLDVAL]], 1
// CHECK: [[FAILURE:%.*]] = tail call i32 @llvm.arm.strex.p0i32(i32 [[INC]], i32* %addr)
// CHECK: [[TST:%.*]] = icmp eq i32 [[FAILURE]], 0
// CHECK: br i1 [[TST]], label {{%[a-zA-Z0-9.]+}}, label {{%[a-zA-Z0-9.]+}}
// CHECK-ARM64-LABEL: @atomic_inc
// CHECK-ARM64: [[OLDVAL:%.*]] = tail call i64 @llvm.aarch64.ldxr.p0i32(i32* %addr)
// CHECK-ARM64: [[INC:%.*]] = add i64 [[OLDVAL]], 1
// CHECK-ARM64: [[TRUNC:%.*]] = and i64 [[INC]], 4294967295
// CHECK-ARM64: [[FAILURE:%.*]] = tail call i32 @llvm.aarch64.stxr.p0i32(i64 [[TRUNC]], i32* %addr)
// CHECK-ARM64: [[TST:%.*]] = icmp eq i32 [[FAILURE]], 0
// CHECK-ARM64: br i1 [[TST]], label {{%[a-zA-Z0-9.]+}}, label {{%[a-zA-Z0-9.]+}}
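The same ldrex/strex retry loop generalizes to other read-modify-write patterns; as a hedged sketch (illustrative, not part of the test), a compare-and-swap built from the same builtins:

// Illustrative sketch: returns 1 if *addr was swapped from expected to desired.
int atomic_cas(int *addr, int expected, int desired) {
  int old;
  do {
    old = __builtin_arm_ldrex(addr);
    if (old != expected) {
      __builtin_arm_clrex(); // drop the exclusive monitor before bailing out
      return 0;
    }
  } while (__builtin_arm_strex(desired, addr)); // strex returns 0 on success
  return 1;
}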
struct Simple {
char a, b;
@ -37,36 +11,33 @@ int test_ldrex(char *addr, long long *addr64, float *addrfloat) {
// CHECK-ARM64-LABEL: @test_ldrex
int sum = 0;
sum += __builtin_arm_ldrex(addr);
// CHECK: [[INTRES:%.*]] = tail call i32 @llvm.arm.ldrex.p0i8(i8* %addr)
// CHECK: and i32 [[INTRES]], 255
// CHECK: [[INTRES:%.*]] = call i32 @llvm.arm.ldrex.p0i8(i8* %addr)
// CHECK: trunc i32 [[INTRES]] to i8
// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.aarch64.ldxr.p0i8(i8* %addr)
// CHECK-ARM64: [[TRUNCRES:%.*]] = trunc i64 [[INTRES]] to i32
// CHECK-ARM64: [[SEXTTMP:%.*]] = shl i32 [[TRUNCRES]], 24
// CHECK-ARM64: ashr exact i32 [[SEXTTMP]], 24
// CHECK-ARM64: [[INTRES:%.*]] = call i64 @llvm.aarch64.ldxr.p0i8(i8* %addr)
// CHECK-ARM64: trunc i64 [[INTRES]] to i8
sum += __builtin_arm_ldrex((short *)addr);
// CHECK: [[ADDR16:%.*]] = bitcast i8* %addr to i16*
// CHECK: [[INTRES:%.*]] = tail call i32 @llvm.arm.ldrex.p0i16(i16* [[ADDR16]])
// CHECK: [[TMPSEXT:%.*]] = shl i32 [[INTRES]], 16
// CHECK: ashr exact i32 [[TMPSEXT]], 16
// CHECK: [[INTRES:%.*]] = call i32 @llvm.arm.ldrex.p0i16(i16* [[ADDR16]])
// CHECK: trunc i32 [[INTRES]] to i16
// CHECK-ARM64: [[ADDR16:%.*]] = bitcast i8* %addr to i16*
// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.aarch64.ldxr.p0i16(i16* [[ADDR16]])
// CHECK-ARM64: [[TRUNCRES:%.*]] = trunc i64 [[INTRES]] to i32
// CHECK-ARM64: [[TMPSEXT:%.*]] = shl i32 [[TRUNCRES]], 16
// CHECK-ARM64: ashr exact i32 [[TMPSEXT]], 16
// CHECK-ARM64: [[INTRES:%.*]] = call i64 @llvm.aarch64.ldxr.p0i16(i16* [[ADDR16]])
// CHECK-ARM64: trunc i64 [[INTRES]] to i16
sum += __builtin_arm_ldrex((int *)addr);
// CHECK: [[ADDR32:%.*]] = bitcast i8* %addr to i32*
// CHECK: call i32 @llvm.arm.ldrex.p0i32(i32* [[ADDR32]])
// CHECK: call i32 @llvm.arm.ldrex.p0i32(i32* [[ADDR32]])
// CHECK-ARM64: [[ADDR32:%.*]] = bitcast i8* %addr to i32*
// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.aarch64.ldxr.p0i32(i32* [[ADDR32]])
// CHECK-ARM64: [[INTRES:%.*]] = call i64 @llvm.aarch64.ldxr.p0i32(i32* [[ADDR32]])
// CHECK-ARM64: trunc i64 [[INTRES]] to i32
sum += __builtin_arm_ldrex((long long *)addr);
// CHECK: call { i32, i32 } @llvm.arm.ldrexd(i8* %addr)
// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to i64*
// CHECK: [[TMP5:%.*]] = bitcast i64* [[TMP4]] to i8*
// CHECK: call { i32, i32 } @llvm.arm.ldrexd(i8* [[TMP5]])
// CHECK-ARM64: [[ADDR64:%.*]] = bitcast i8* %addr to i64*
// CHECK-ARM64: call i64 @llvm.aarch64.ldxr.p0i64(i64* [[ADDR64]])
@ -79,16 +50,18 @@ int test_ldrex(char *addr, long long *addr64, float *addrfloat) {
sum += __builtin_arm_ldrex(addrfloat);
// CHECK: [[INTADDR:%.*]] = bitcast float* %addrfloat to i32*
// CHECK: [[INTRES:%.*]] = tail call i32 @llvm.arm.ldrex.p0i32(i32* [[INTADDR]])
// CHECK: [[INTRES:%.*]] = call i32 @llvm.arm.ldrex.p0i32(i32* [[INTADDR]])
// CHECK: bitcast i32 [[INTRES]] to float
// CHECK-ARM64: [[INTADDR:%.*]] = bitcast float* %addrfloat to i32*
// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.aarch64.ldxr.p0i32(i32* [[INTADDR]])
// CHECK-ARM64: [[INTRES:%.*]] = call i64 @llvm.aarch64.ldxr.p0i32(i32* [[INTADDR]])
// CHECK-ARM64: [[TRUNCRES:%.*]] = trunc i64 [[INTRES]] to i32
// CHECK-ARM64: bitcast i32 [[TRUNCRES]] to float
sum += __builtin_arm_ldrex((double *)addr);
// CHECK: [[STRUCTRES:%.*]] = tail call { i32, i32 } @llvm.arm.ldrexd(i8* %addr)
// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to double*
// CHECK: [[TMP5:%.*]] = bitcast double* [[TMP4]] to i8*
// CHECK: [[STRUCTRES:%.*]] = call { i32, i32 } @llvm.arm.ldrexd(i8* [[TMP5]])
// CHECK: [[RESHI:%.*]] = extractvalue { i32, i32 } [[STRUCTRES]], 1
// CHECK: [[RESLO:%.*]] = extractvalue { i32, i32 } [[STRUCTRES]], 0
// CHECK: [[RESHI64:%.*]] = zext i32 [[RESHI]] to i64
@ -97,21 +70,31 @@ int test_ldrex(char *addr, long long *addr64, float *addrfloat) {
// CHECK: [[INTRES:%.*]] = or i64 [[RESHIHI]], [[RESLO64]]
// CHECK: bitcast i64 [[INTRES]] to double
// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.aarch64.ldxr.p0i64(i64* [[ADDR64]])
// CHECK-ARM64: [[TMP4:%.*]] = bitcast i8* %addr to double*
// CHECK-ARM64: [[TMP5:%.*]] = bitcast double* [[TMP4]] to i64*
// CHECK-ARM64: [[INTRES:%.*]] = call i64 @llvm.aarch64.ldxr.p0i64(i64* [[TMP5]])
// CHECK-ARM64: bitcast i64 [[INTRES]] to double
sum += *__builtin_arm_ldrex((int **)addr);
// CHECK: [[INTRES:%.*]] = tail call i32 @llvm.arm.ldrex.p0i32(i32* [[ADDR32]])
// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to i32**
// CHECK: [[TMP5:%.*]] = bitcast i32** [[TMP4]] to i32*
// CHECK: [[INTRES:%.*]] = call i32 @llvm.arm.ldrex.p0i32(i32* [[TMP5]])
// CHECK: inttoptr i32 [[INTRES]] to i32*
// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.aarch64.ldxr.p0i64(i64* [[ADDR64]])
// CHECK-ARM64: [[TMP4:%.*]] = bitcast i8* %addr to i32**
// CHECK-ARM64: [[TMP5:%.*]] = bitcast i32** [[TMP4]] to i64*
// CHECK-ARM64: [[INTRES:%.*]] = call i64 @llvm.aarch64.ldxr.p0i64(i64* [[TMP5]])
// CHECK-ARM64: inttoptr i64 [[INTRES]] to i32*
sum += __builtin_arm_ldrex((struct Simple **)addr)->a;
// CHECK: [[INTRES:%.*]] = tail call i32 @llvm.arm.ldrex.p0i32(i32* [[ADDR32]])
// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to %struct.Simple**
// CHECK: [[TMP5:%.*]] = bitcast %struct.Simple** [[TMP4]] to i32*
// CHECK: [[INTRES:%.*]] = call i32 @llvm.arm.ldrex.p0i32(i32* [[TMP5]])
// CHECK: inttoptr i32 [[INTRES]] to %struct.Simple*
// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.aarch64.ldxr.p0i64(i64* [[ADDR64]])
// CHECK-ARM64: [[TMP4:%.*]] = bitcast i8* %addr to %struct.Simple**
// CHECK-ARM64: [[TMP5:%.*]] = bitcast %struct.Simple** [[TMP4]] to i64*
// CHECK-ARM64: [[INTRES:%.*]] = call i64 @llvm.aarch64.ldxr.p0i64(i64* [[TMP5]])
// CHECK-ARM64: inttoptr i64 [[INTRES]] to %struct.Simple*
return sum;
}
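As the checks above show, __builtin_arm_ldrex picks its exclusive-load width from the pointee type; a small illustrative sketch of that dispatch (these helpers are not part of the test):

// Each call lowers to an exclusive load of the pointee's width.
short ldx_short(short *p) { return __builtin_arm_ldrex(p); } // 16-bit exclusive load
int   ldx_int(int *p)     { return __builtin_arm_ldrex(p); } // 32-bit exclusive load
float ldx_float(float *p) { return __builtin_arm_ldrex(p); } // 32-bit load, bitcast to float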
@ -121,36 +104,33 @@ int test_ldaex(char *addr, long long *addr64, float *addrfloat) {
// CHECK-ARM64-LABEL: @test_ldaex
int sum = 0;
sum += __builtin_arm_ldaex(addr);
// CHECK: [[INTRES:%.*]] = tail call i32 @llvm.arm.ldaex.p0i8(i8* %addr)
// CHECK: and i32 [[INTRES]], 255
// CHECK: [[INTRES:%.*]] = call i32 @llvm.arm.ldaex.p0i8(i8* %addr)
// CHECK: trunc i32 [[INTRES]] to i8
// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.aarch64.ldaxr.p0i8(i8* %addr)
// CHECK-ARM64: [[TRUNCRES:%.*]] = trunc i64 [[INTRES]] to i32
// CHECK-ARM64: [[SEXTTMP:%.*]] = shl i32 [[TRUNCRES]], 24
// CHECK-ARM64: ashr exact i32 [[SEXTTMP]], 24
// CHECK-ARM64: [[INTRES:%.*]] = call i64 @llvm.aarch64.ldaxr.p0i8(i8* %addr)
// CHECK-ARM64: trunc i64 [[INTRES]] to i8
sum += __builtin_arm_ldaex((short *)addr);
// CHECK: [[ADDR16:%.*]] = bitcast i8* %addr to i16*
// CHECK: [[INTRES:%.*]] = tail call i32 @llvm.arm.ldaex.p0i16(i16* [[ADDR16]])
// CHECK: [[TMPSEXT:%.*]] = shl i32 [[INTRES]], 16
// CHECK: ashr exact i32 [[TMPSEXT]], 16
// CHECK: [[INTRES:%.*]] = call i32 @llvm.arm.ldaex.p0i16(i16* [[ADDR16]])
// CHECK: trunc i32 [[INTRES]] to i16
// CHECK-ARM64: [[ADDR16:%.*]] = bitcast i8* %addr to i16*
// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.aarch64.ldaxr.p0i16(i16* [[ADDR16]])
// CHECK-ARM64: [[TRUNCRES:%.*]] = trunc i64 [[INTRES]] to i32
// CHECK-ARM64: [[TMPSEXT:%.*]] = shl i32 [[TRUNCRES]], 16
// CHECK-ARM64: ashr exact i32 [[TMPSEXT]], 16
// CHECK-ARM64: [[INTRES:%.*]] = call i64 @llvm.aarch64.ldaxr.p0i16(i16* [[ADDR16]])
// CHECK-ARM64: [[TRUNCRES:%.*]] = trunc i64 [[INTRES]] to i16
sum += __builtin_arm_ldaex((int *)addr);
// CHECK: [[ADDR32:%.*]] = bitcast i8* %addr to i32*
// CHECK: call i32 @llvm.arm.ldaex.p0i32(i32* [[ADDR32]])
// CHECK-ARM64: [[ADDR32:%.*]] = bitcast i8* %addr to i32*
// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.aarch64.ldaxr.p0i32(i32* [[ADDR32]])
// CHECK-ARM64: [[INTRES:%.*]] = call i64 @llvm.aarch64.ldaxr.p0i32(i32* [[ADDR32]])
// CHECK-ARM64: trunc i64 [[INTRES]] to i32
sum += __builtin_arm_ldaex((long long *)addr);
// CHECK: call { i32, i32 } @llvm.arm.ldaexd(i8* %addr)
// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to i64*
// CHECK: [[TMP5:%.*]] = bitcast i64* [[TMP4]] to i8*
// CHECK: call { i32, i32 } @llvm.arm.ldaexd(i8* [[TMP5]])
// CHECK-ARM64: [[ADDR64:%.*]] = bitcast i8* %addr to i64*
// CHECK-ARM64: call i64 @llvm.aarch64.ldaxr.p0i64(i64* [[ADDR64]])
@ -163,16 +143,18 @@ int test_ldaex(char *addr, long long *addr64, float *addrfloat) {
sum += __builtin_arm_ldaex(addrfloat);
// CHECK: [[INTADDR:%.*]] = bitcast float* %addrfloat to i32*
// CHECK: [[INTRES:%.*]] = tail call i32 @llvm.arm.ldaex.p0i32(i32* [[INTADDR]])
// CHECK: [[INTRES:%.*]] = call i32 @llvm.arm.ldaex.p0i32(i32* [[INTADDR]])
// CHECK: bitcast i32 [[INTRES]] to float
// CHECK-ARM64: [[INTADDR:%.*]] = bitcast float* %addrfloat to i32*
// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.aarch64.ldaxr.p0i32(i32* [[INTADDR]])
// CHECK-ARM64: [[INTRES:%.*]] = call i64 @llvm.aarch64.ldaxr.p0i32(i32* [[INTADDR]])
// CHECK-ARM64: [[TRUNCRES:%.*]] = trunc i64 [[INTRES]] to i32
// CHECK-ARM64: bitcast i32 [[TRUNCRES]] to float
sum += __builtin_arm_ldaex((double *)addr);
// CHECK: [[STRUCTRES:%.*]] = tail call { i32, i32 } @llvm.arm.ldaexd(i8* %addr)
// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to double*
// CHECK: [[TMP5:%.*]] = bitcast double* [[TMP4]] to i8*
// CHECK: [[STRUCTRES:%.*]] = call { i32, i32 } @llvm.arm.ldaexd(i8* [[TMP5]])
// CHECK: [[RESHI:%.*]] = extractvalue { i32, i32 } [[STRUCTRES]], 1
// CHECK: [[RESLO:%.*]] = extractvalue { i32, i32 } [[STRUCTRES]], 0
// CHECK: [[RESHI64:%.*]] = zext i32 [[RESHI]] to i64
@ -181,21 +163,31 @@ int test_ldaex(char *addr, long long *addr64, float *addrfloat) {
// CHECK: [[INTRES:%.*]] = or i64 [[RESHIHI]], [[RESLO64]]
// CHECK: bitcast i64 [[INTRES]] to double
// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.aarch64.ldaxr.p0i64(i64* [[ADDR64]])
// CHECK-ARM64: [[TMP4:%.*]] = bitcast i8* %addr to double*
// CHECK-ARM64: [[TMP5:%.*]] = bitcast double* [[TMP4]] to i64*
// CHECK-ARM64: [[INTRES:%.*]] = call i64 @llvm.aarch64.ldaxr.p0i64(i64* [[TMP5]])
// CHECK-ARM64: bitcast i64 [[INTRES]] to double
sum += *__builtin_arm_ldaex((int **)addr);
// CHECK: [[INTRES:%.*]] = tail call i32 @llvm.arm.ldaex.p0i32(i32* [[ADDR32]])
// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to i32**
// CHECK: [[TMP5:%.*]] = bitcast i32** [[TMP4]] to i32*
// CHECK: [[INTRES:%.*]] = call i32 @llvm.arm.ldaex.p0i32(i32* [[TMP5]])
// CHECK: inttoptr i32 [[INTRES]] to i32*
// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.aarch64.ldaxr.p0i64(i64* [[ADDR64]])
// CHECK-ARM64: [[TMP4:%.*]] = bitcast i8* %addr to i32**
// CHECK-ARM64: [[TMP5:%.*]] = bitcast i32** [[TMP4]] to i64*
// CHECK-ARM64: [[INTRES:%.*]] = call i64 @llvm.aarch64.ldaxr.p0i64(i64* [[TMP5]])
// CHECK-ARM64: inttoptr i64 [[INTRES]] to i32*
sum += __builtin_arm_ldaex((struct Simple **)addr)->a;
// CHECK: [[INTRES:%.*]] = tail call i32 @llvm.arm.ldaex.p0i32(i32* [[ADDR32]])
// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to %struct.Simple**
// CHECK: [[TMP5:%.*]] = bitcast %struct.Simple** [[TMP4]] to i32*
// CHECK: [[INTRES:%.*]] = call i32 @llvm.arm.ldaex.p0i32(i32* [[TMP5]])
// CHECK: inttoptr i32 [[INTRES]] to %struct.Simple*
// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.aarch64.ldaxr.p0i64(i64* [[ADDR64]])
// CHECK-ARM64: [[TMP4:%.*]] = bitcast i8* %addr to %struct.Simple**
// CHECK-ARM64: [[TMP5:%.*]] = bitcast %struct.Simple** [[TMP4]] to i64*
// CHECK-ARM64: [[INTRES:%.*]] = call i64 @llvm.aarch64.ldaxr.p0i64(i64* [[TMP5]])
// CHECK-ARM64: inttoptr i64 [[INTRES]] to %struct.Simple*
return sum;
}
@ -225,27 +217,51 @@ int test_strex(char *addr) {
// CHECK-ARM64: call i32 @llvm.aarch64.stxr.p0i32(i64 42, i32* [[ADDR32]])
res |= __builtin_arm_strex(42, (long long *)addr);
// CHECK: call i32 @llvm.arm.strexd(i32 42, i32 0, i8* %addr)
// CHECK: store i64 42, i64* [[TMP:%.*]], align 8
// CHECK: [[LOHI_ADDR:%.*]] = bitcast i64* [[TMP]] to { i32, i32 }*
// CHECK: [[LOHI:%.*]] = load { i32, i32 }, { i32, i32 }* [[LOHI_ADDR]]
// CHECK: [[LO:%.*]] = extractvalue { i32, i32 } [[LOHI]], 0
// CHECK: [[HI:%.*]] = extractvalue { i32, i32 } [[LOHI]], 1
// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to i64*
// CHECK: [[TMP5:%.*]] = bitcast i64* [[TMP4]] to i8*
// CHECK: call i32 @llvm.arm.strexd(i32 [[LO]], i32 [[HI]], i8* [[TMP5]])
// CHECK-ARM64: [[ADDR64:%.*]] = bitcast i8* %addr to i64*
// CHECK-ARM64: call i32 @llvm.aarch64.stxr.p0i64(i64 42, i64* [[ADDR64]])
res |= __builtin_arm_strex(2.71828f, (float *)addr);
// CHECK: call i32 @llvm.arm.strex.p0i32(i32 1076754509, i32* [[ADDR32]])
// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to float*
// CHECK: [[TMP5:%.*]] = bitcast float* [[TMP4]] to i32*
// CHECK: call i32 @llvm.arm.strex.p0i32(i32 1076754509, i32* [[TMP5]])
// CHECK-ARM64: call i32 @llvm.aarch64.stxr.p0i32(i64 1076754509, i32* [[ADDR32]])
// CHECK-ARM64: [[TMP4:%.*]] = bitcast i8* %addr to float*
// CHECK-ARM64: [[TMP5:%.*]] = bitcast float* [[TMP4]] to i32*
// CHECK-ARM64: call i32 @llvm.aarch64.stxr.p0i32(i64 1076754509, i32* [[TMP5]])
res |= __builtin_arm_strex(3.14159, (double *)addr);
// CHECK: call i32 @llvm.arm.strexd(i32 -266631570, i32 1074340345, i8* %addr)
// CHECK: store double 3.141590e+00, double* [[TMP:%.*]], align 8
// CHECK: [[LOHI_ADDR:%.*]] = bitcast double* [[TMP]] to { i32, i32 }*
// CHECK: [[LOHI:%.*]] = load { i32, i32 }, { i32, i32 }* [[LOHI_ADDR]]
// CHECK: [[LO:%.*]] = extractvalue { i32, i32 } [[LOHI]], 0
// CHECK: [[HI:%.*]] = extractvalue { i32, i32 } [[LOHI]], 1
// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to double*
// CHECK: [[TMP5:%.*]] = bitcast double* [[TMP4]] to i8*
// CHECK: call i32 @llvm.arm.strexd(i32 [[LO]], i32 [[HI]], i8* [[TMP5]])
// CHECK-ARM64: call i32 @llvm.aarch64.stxr.p0i64(i64 4614256650576692846, i64* [[ADDR64]])
// CHECK-ARM64: [[TMP4:%.*]] = bitcast i8* %addr to double*
// CHECK-ARM64: [[TMP5:%.*]] = bitcast double* [[TMP4]] to i64*
// CHECK-ARM64: call i32 @llvm.aarch64.stxr.p0i64(i64 4614256650576692846, i64* [[TMP5]])
res |= __builtin_arm_strex(&var, (struct Simple **)addr);
// CHECK: [[INTVAL:%.*]] = ptrtoint i16* %var to i32
// CHECK: call i32 @llvm.arm.strex.p0i32(i32 [[INTVAL]], i32* [[ADDR32]])
// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to %struct.Simple**
// CHECK: [[TMP5:%.*]] = bitcast %struct.Simple** [[TMP4]] to i32*
// CHECK: [[INTVAL:%.*]] = ptrtoint %struct.Simple* %var to i32
// CHECK: call i32 @llvm.arm.strex.p0i32(i32 [[INTVAL]], i32* [[TMP5]])
// CHECK-ARM64: [[INTVAL:%.*]] = ptrtoint i16* %var to i64
// CHECK-ARM64: call i32 @llvm.aarch64.stxr.p0i64(i64 [[INTVAL]], i64* [[ADDR64]])
// CHECK-ARM64: [[TMP4:%.*]] = bitcast i8* %addr to %struct.Simple**
// CHECK-ARM64: [[TMP5:%.*]] = bitcast %struct.Simple** [[TMP4]] to i64*
// CHECK-ARM64: [[INTVAL:%.*]] = ptrtoint %struct.Simple* %var to i64
// CHECK-ARM64: call i32 @llvm.aarch64.stxr.p0i64(i64 [[INTVAL]], i64* [[TMP5]])
return res;
}
@ -275,27 +291,51 @@ int test_stlex(char *addr) {
// CHECK-ARM64: call i32 @llvm.aarch64.stlxr.p0i32(i64 42, i32* [[ADDR32]])
res |= __builtin_arm_stlex(42, (long long *)addr);
// CHECK: call i32 @llvm.arm.stlexd(i32 42, i32 0, i8* %addr)
// CHECK: store i64 42, i64* [[TMP:%.*]], align 8
// CHECK: [[LOHI_ADDR:%.*]] = bitcast i64* [[TMP]] to { i32, i32 }*
// CHECK: [[LOHI:%.*]] = load { i32, i32 }, { i32, i32 }* [[LOHI_ADDR]]
// CHECK: [[LO:%.*]] = extractvalue { i32, i32 } [[LOHI]], 0
// CHECK: [[HI:%.*]] = extractvalue { i32, i32 } [[LOHI]], 1
// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to i64*
// CHECK: [[TMP5:%.*]] = bitcast i64* [[TMP4]] to i8*
// CHECK: call i32 @llvm.arm.stlexd(i32 [[LO]], i32 [[HI]], i8* [[TMP5]])
// CHECK-ARM64: [[ADDR64:%.*]] = bitcast i8* %addr to i64*
// CHECK-ARM64: call i32 @llvm.aarch64.stlxr.p0i64(i64 42, i64* [[ADDR64]])
res |= __builtin_arm_stlex(2.71828f, (float *)addr);
// CHECK: call i32 @llvm.arm.stlex.p0i32(i32 1076754509, i32* [[ADDR32]])
// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to float*
// CHECK: [[TMP5:%.*]] = bitcast float* [[TMP4]] to i32*
// CHECK: call i32 @llvm.arm.stlex.p0i32(i32 1076754509, i32* [[TMP5]])
// CHECK-ARM64: call i32 @llvm.aarch64.stlxr.p0i32(i64 1076754509, i32* [[ADDR32]])
// CHECK-ARM64: [[TMP4:%.*]] = bitcast i8* %addr to float*
// CHECK-ARM64: [[TMP5:%.*]] = bitcast float* [[TMP4]] to i32*
// CHECK-ARM64: call i32 @llvm.aarch64.stlxr.p0i32(i64 1076754509, i32* [[TMP5]])
res |= __builtin_arm_stlex(3.14159, (double *)addr);
// CHECK: call i32 @llvm.arm.stlexd(i32 -266631570, i32 1074340345, i8* %addr)
// CHECK: store double 3.141590e+00, double* [[TMP:%.*]], align 8
// CHECK: [[LOHI_ADDR:%.*]] = bitcast double* [[TMP]] to { i32, i32 }*
// CHECK: [[LOHI:%.*]] = load { i32, i32 }, { i32, i32 }* [[LOHI_ADDR]]
// CHECK: [[LO:%.*]] = extractvalue { i32, i32 } [[LOHI]], 0
// CHECK: [[HI:%.*]] = extractvalue { i32, i32 } [[LOHI]], 1
// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to double*
// CHECK: [[TMP5:%.*]] = bitcast double* [[TMP4]] to i8*
// CHECK: call i32 @llvm.arm.stlexd(i32 [[LO]], i32 [[HI]], i8* [[TMP5]])
// CHECK-ARM64: call i32 @llvm.aarch64.stlxr.p0i64(i64 4614256650576692846, i64* [[ADDR64]])
// CHECK-ARM64: [[TMP4:%.*]] = bitcast i8* %addr to double*
// CHECK-ARM64: [[TMP5:%.*]] = bitcast double* [[TMP4]] to i64*
// CHECK-ARM64: call i32 @llvm.aarch64.stlxr.p0i64(i64 4614256650576692846, i64* [[TMP5]])
res |= __builtin_arm_stlex(&var, (struct Simple **)addr);
// CHECK: [[INTVAL:%.*]] = ptrtoint i16* %var to i32
// CHECK: call i32 @llvm.arm.stlex.p0i32(i32 [[INTVAL]], i32* [[ADDR32]])
// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to %struct.Simple**
// CHECK: [[TMP5:%.*]] = bitcast %struct.Simple** [[TMP4]] to i32*
// CHECK: [[INTVAL:%.*]] = ptrtoint %struct.Simple* %var to i32
// CHECK: call i32 @llvm.arm.stlex.p0i32(i32 [[INTVAL]], i32* [[TMP5]])
// CHECK-ARM64: [[INTVAL:%.*]] = ptrtoint i16* %var to i64
// CHECK-ARM64: call i32 @llvm.aarch64.stlxr.p0i64(i64 [[INTVAL]], i64* [[ADDR64]])
// CHECK-ARM64: [[TMP4:%.*]] = bitcast i8* %addr to %struct.Simple**
// CHECK-ARM64: [[TMP5:%.*]] = bitcast %struct.Simple** [[TMP4]] to i64*
// CHECK-ARM64: [[INTVAL:%.*]] = ptrtoint %struct.Simple* %var to i64
// CHECK-ARM64: call i32 @llvm.aarch64.stlxr.p0i64(i64 [[INTVAL]], i64* [[TMP5]])
return res;
}
@ -317,7 +357,7 @@ __int128 test_ldrex_128(__int128 *addr) {
return __builtin_arm_ldrex(addr);
// CHECK-ARM64: [[ADDR8:%.*]] = bitcast i128* %addr to i8*
// CHECK-ARM64: [[STRUCTRES:%.*]] = tail call { i64, i64 } @llvm.aarch64.ldxp(i8* [[ADDR8]])
// CHECK-ARM64: [[STRUCTRES:%.*]] = call { i64, i64 } @llvm.aarch64.ldxp(i8* [[ADDR8]])
// CHECK-ARM64: [[RESHI:%.*]] = extractvalue { i64, i64 } [[STRUCTRES]], 1
// CHECK-ARM64: [[RESLO:%.*]] = extractvalue { i64, i64 } [[STRUCTRES]], 0
// CHECK-ARM64: [[RESHI64:%.*]] = zext i64 [[RESHI]] to i128
@ -331,11 +371,13 @@ int test_strex_128(__int128 *addr, __int128 val) {
// CHECK-ARM64-LABEL: @test_strex_128
return __builtin_arm_strex(val, addr);
// CHECK-ARM64: [[VALLO:%.*]] = trunc i128 %val to i64
// CHECK-ARM64: [[VALHI128:%.*]] = lshr i128 %val, 64
// CHECK-ARM64: [[VALHI:%.*]] = trunc i128 [[VALHI128]] to i64
// CHECK-ARM64: store i128 %val, i128* [[TMP:%.*]], align 16
// CHECK-ARM64: [[LOHI_ADDR:%.*]] = bitcast i128* [[TMP]] to { i64, i64 }*
// CHECK-ARM64: [[LOHI:%.*]] = load { i64, i64 }, { i64, i64 }* [[LOHI_ADDR]]
// CHECK-ARM64: [[LO:%.*]] = extractvalue { i64, i64 } [[LOHI]], 0
// CHECK-ARM64: [[HI:%.*]] = extractvalue { i64, i64 } [[LOHI]], 1
// CHECK-ARM64: [[ADDR8:%.*]] = bitcast i128* %addr to i8*
// CHECK-ARM64: [[RES:%.*]] = tail call i32 @llvm.aarch64.stxp(i64 [[VALLO]], i64 [[VALHI]], i8* [[ADDR8]])
// CHECK-ARM64: call i32 @llvm.aarch64.stxp(i64 [[LO]], i64 [[HI]], i8* [[ADDR8]])
}
__int128 test_ldaex_128(__int128 *addr) {
@ -343,7 +385,7 @@ __int128 test_ldaex_128(__int128 *addr) {
return __builtin_arm_ldaex(addr);
// CHECK-ARM64: [[ADDR8:%.*]] = bitcast i128* %addr to i8*
// CHECK-ARM64: [[STRUCTRES:%.*]] = tail call { i64, i64 } @llvm.aarch64.ldaxp(i8* [[ADDR8]])
// CHECK-ARM64: [[STRUCTRES:%.*]] = call { i64, i64 } @llvm.aarch64.ldaxp(i8* [[ADDR8]])
// CHECK-ARM64: [[RESHI:%.*]] = extractvalue { i64, i64 } [[STRUCTRES]], 1
// CHECK-ARM64: [[RESLO:%.*]] = extractvalue { i64, i64 } [[STRUCTRES]], 0
// CHECK-ARM64: [[RESHI64:%.*]] = zext i64 [[RESHI]] to i128
@ -357,11 +399,13 @@ int test_stlex_128(__int128 *addr, __int128 val) {
// CHECK-ARM64-LABEL: @test_stlex_128
return __builtin_arm_stlex(val, addr);
// CHECK-ARM64: [[VALLO:%.*]] = trunc i128 %val to i64
// CHECK-ARM64: [[VALHI128:%.*]] = lshr i128 %val, 64
// CHECK-ARM64: [[VALHI:%.*]] = trunc i128 [[VALHI128]] to i64
// CHECK-ARM64: store i128 %val, i128* [[TMP:%.*]], align 16
// CHECK-ARM64: [[LOHI_ADDR:%.*]] = bitcast i128* [[TMP]] to { i64, i64 }*
// CHECK-ARM64: [[LOHI:%.*]] = load { i64, i64 }, { i64, i64 }* [[LOHI_ADDR]]
// CHECK-ARM64: [[LO:%.*]] = extractvalue { i64, i64 } [[LOHI]], 0
// CHECK-ARM64: [[HI:%.*]] = extractvalue { i64, i64 } [[LOHI]], 1
// CHECK-ARM64: [[ADDR8:%.*]] = bitcast i128* %addr to i8*
// CHECK-ARM64: [[RES:%.*]] = tail call i32 @llvm.aarch64.stlxp(i64 [[VALLO]], i64 [[VALHI]], i8* [[ADDR8]])
// CHECK-ARM64: [[RES:%.*]] = call i32 @llvm.aarch64.stlxp(i64 [[LO]], i64 [[HI]], i8* [[ADDR8]])
}
#endif
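The ldaex/stlex builtins checked above pair naturally into an acquire/release exchange; a hedged sketch (illustrative, not part of the test):

// Illustrative: swap in a new value with load-acquire / store-release semantics.
int atomic_xchg_acq_rel(int *addr, int newval) {
  int old;
  do {
    old = __builtin_arm_ldaex(addr);           // load-acquire exclusive
  } while (__builtin_arm_stlex(newval, addr)); // store-release exclusive, 0 on success
  return old;
}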


@ -1,5 +1,4 @@
// REQUIRES: arm-registered-target
// RUN: %clang_cc1 -Wall -Werror -triple thumbv7-eabi -target-cpu cortex-a8 -O3 -emit-llvm -o - %s | FileCheck %s
// RUN: %clang_cc1 -Wall -Werror -triple thumbv7-eabi -target-cpu cortex-a8 -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
void *f0()
{
@ -87,14 +86,14 @@ void prefetch(int i) {
unsigned mrc() {
// CHECK: define i32 @mrc()
// CHECK: [[R:%.*]] = {{.*}} call i32 @llvm.arm.mrc(i32 15, i32 0, i32 13, i32 0, i32 3)
// CHECK: [[R:%.*]] = call i32 @llvm.arm.mrc(i32 15, i32 0, i32 13, i32 0, i32 3)
// CHECK-NEXT: ret i32 [[R]]
return __builtin_arm_mrc(15, 0, 13, 0, 3);
}
unsigned mrc2() {
// CHECK: define i32 @mrc2()
// CHECK: [[R:%.*]] = {{.*}} call i32 @llvm.arm.mrc2(i32 15, i32 0, i32 13, i32 0, i32 3)
// CHECK: [[R:%.*]] = call i32 @llvm.arm.mrc2(i32 15, i32 0, i32 13, i32 0, i32 3)
// CHECK-NEXT: ret i32 [[R]]
return __builtin_arm_mrc2(15, 0, 13, 0, 3);
}
@ -124,40 +123,40 @@ void mcrr2(unsigned a, unsigned b) {
}
unsigned rsr() {
// CHECK: [[V0:[%A-Za-z0-9.]+]] = {{.*}} call i32 @llvm.read_register.i32(metadata !7)
// CHECK: [[V0:[%A-Za-z0-9.]+]] = call i32 @llvm.read_register.i32(metadata ![[M0:.*]])
// CHECK-NEXT: ret i32 [[V0]]
return __builtin_arm_rsr("cp1:2:c3:c4:5");
}
unsigned long long rsr64() {
// CHECK: [[V0:[%A-Za-z0-9.]+]] = {{.*}} call i64 @llvm.read_register.i64(metadata !8)
// CHECK: [[V0:[%A-Za-z0-9.]+]] = call i64 @llvm.read_register.i64(metadata ![[M1:.*]])
// CHECK-NEXT: ret i64 [[V0]]
return __builtin_arm_rsr64("cp1:2:c3");
}
void *rsrp() {
// CHECK: [[V0:[%A-Za-z0-9.]+]] = {{.*}} call i32 @llvm.read_register.i32(metadata !9)
// CHECK: [[V0:[%A-Za-z0-9.]+]] = call i32 @llvm.read_register.i32(metadata ![[M2:.*]])
// CHECK-NEXT: [[V1:[%A-Za-z0-9.]+]] = inttoptr i32 [[V0]] to i8*
// CHECK-NEXT: ret i8* [[V1]]
return __builtin_arm_rsrp("sysreg");
}
void wsr(unsigned v) {
// CHECK: call void @llvm.write_register.i32(metadata !7, i32 %v)
// CHECK: call void @llvm.write_register.i32(metadata ![[M0]], i32 %v)
__builtin_arm_wsr("cp1:2:c3:c4:5", v);
}
void wsr64(unsigned long long v) {
// CHECK: call void @llvm.write_register.i64(metadata !8, i64 %v)
// CHECK: call void @llvm.write_register.i64(metadata ![[M1]], i64 %v)
__builtin_arm_wsr64("cp1:2:c3", v);
}
void wsrp(void *v) {
// CHECK: [[V0:[%A-Za-z0-9.]+]] = ptrtoint i8* %v to i32
// CHECK-NEXT: call void @llvm.write_register.i32(metadata !9, i32 [[V0]])
// CHECK-NEXT: call void @llvm.write_register.i32(metadata ![[M2]], i32 [[V0]])
__builtin_arm_wsrp("sysreg", v);
}
// CHECK: !7 = !{!"cp1:2:c3:c4:5"}
// CHECK: !8 = !{!"cp1:2:c3"}
// CHECK: !9 = !{!"sysreg"}
// CHECK: ![[M0]] = !{!"cp1:2:c3:c4:5"}
// CHECK: ![[M1]] = !{!"cp1:2:c3"}
// CHECK: ![[M2]] = !{!"sysreg"}
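The specifier string passed to these builtins is forwarded verbatim as metadata, which is what the ![[M0]]..![[M2]] captures pin down; a minimal usage sketch reusing the same specifier from the test (illustrative):

// Illustrative: round-trip a value through the coprocessor register named above.
unsigned long long save_cp(void) { return __builtin_arm_rsr64("cp1:2:c3"); }
void restore_cp(unsigned long long v) { __builtin_arm_wsr64("cp1:2:c3", v); }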


@ -1,4 +1,4 @@
// RUN: %clang_cc1 -triple arm64-apple-ios -O3 -emit-llvm -o - %s | FileCheck %s
// RUN: %clang_cc1 -triple arm64-apple-ios -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
void f0(void *a, void *b) {
__clear_cache(a,b);
@ -50,7 +50,7 @@ void prefetch() {
}
unsigned rsr() {
// CHECK: [[V0:[%A-Za-z0-9.]+]] = {{.*}} call i64 @llvm.read_register.i64(metadata ![[M0:[0-9]]])
// CHECK: [[V0:[%A-Za-z0-9.]+]] = call i64 @llvm.read_register.i64(metadata ![[M0:[0-9]]])
// CHECK-NEXT: trunc i64 [[V0]] to i32
return __builtin_arm_rsr("1:2:3:4:5");
}
@ -61,7 +61,7 @@ unsigned long rsr64() {
}
void *rsrp() {
// CHECK: [[V0:[%A-Za-z0-9.]+]] = {{.*}} call i64 @llvm.read_register.i64(metadata ![[M0:[0-9]]])
// CHECK: [[V0:[%A-Za-z0-9.]+]] = call i64 @llvm.read_register.i64(metadata ![[M0:[0-9]]])
// CHECK-NEXT: inttoptr i64 [[V0]] to i8*
return __builtin_arm_rsrp("1:2:3:4:5");
}


@ -1,9 +1,9 @@
// RUN: %clang -target aarch64-linux-gnuabi %s -O3 -S -emit-llvm -o - | FileCheck %s
// RUN: %clang -target aarch64-linux-gnuabi %s -S -emit-llvm -o - | FileCheck %s
_Complex long double a, b, c, d;
void test_fp128_compound_assign(void) {
// CHECK: tail call { fp128, fp128 } @__multc3
// CHECK: call { fp128, fp128 } @__multc3
a *= b;
// CHECK: tail call { fp128, fp128 } @__divtc3
// CHECK: call { fp128, fp128 } @__divtc3
c /= d;
}
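On AArch64, long double is IEEE fp128, so a full complex multiply or divide lowers to the __multc3/__divtc3 libcalls checked above; the same holds outside compound assignment, as in this illustrative sketch:

// Illustrative: an ordinary complex multiply also becomes a call to __multc3.
_Complex long double cmul(_Complex long double x, _Complex long double y) {
  return x * y;
}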


@ -1,9 +1,9 @@
// RUN: %clang_cc1 -triple armv7s-linux-gnu -emit-llvm -O1 -o - %s \
// RUN: %clang_cc1 -triple armv7s-linux-gnu -emit-llvm -o - %s \
// RUN: -target-feature +neon -target-cpu cortex-a8 \
// RUN: -fsanitize=signed-integer-overflow \
// RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=ARMV7
// RUN: %clang_cc1 -triple aarch64-unknown-unknown -emit-llvm -O1 -o - %s \
// RUN: %clang_cc1 -triple aarch64-unknown-unknown -emit-llvm -o - %s \
// RUN: -target-feature +neon -target-cpu cortex-a53 \
// RUN: -fsanitize=signed-integer-overflow \
// RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=AARCH64