Commit 916e37ba by Eric Holk

ARM32 vector lowering: fabs, scalarize remaining arithmetic operations.

parent d6cf6b38
...@@ -446,10 +446,7 @@ check-xtest: $(OBJDIR)/pnacl-sz make_symlink runtime ...@@ -446,10 +446,7 @@ check-xtest: $(OBJDIR)/pnacl-sz make_symlink runtime
-i x8664,native,sse2 \ -i x8664,native,sse2 \
-i x8664,native,sse4.1,test_vector_ops \ -i x8664,native,sse4.1,test_vector_ops \
-i x8664,sandbox,sse4.1,Om1 \ -i x8664,sandbox,sse4.1,Om1 \
-i arm32,neon \ -i arm32,neon
-e arm32,neon,test_vector_ops \
-e arm32,nonsfi \
-e arm32,neon,test_vector_ops
PNACL_BIN_PATH=$(PNACL_BIN_PATH) \ PNACL_BIN_PATH=$(PNACL_BIN_PATH) \
$(LLVM_SRC_PATH)/utils/lit/lit.py -sv $(CHECK_XTEST_TESTS) $(LLVM_SRC_PATH)/utils/lit/lit.py -sv $(CHECK_XTEST_TESTS)
endif endif
......
...@@ -29,13 +29,9 @@ def main(): ...@@ -29,13 +29,9 @@ def main():
'arm32': targets.ARM32Target } 'arm32': targets.ARM32Target }
arch_sz_flags = { 'x8632': [], arch_sz_flags = { 'x8632': [],
'x8664': [], 'x8664': [],
# TODO(jvoung): remove skip-unimplemented when # For ARM, test a large stack offset as well. +/- 4095 is
# implemented. # the limit, so test somewhere near that boundary.
# For ARM, test a large stack offset as well, until we 'arm32': ['--test-stack-extra', '4084']
# are more confident. +/- 4095 is the limit, so test
# somewhere near that boundary.
'arm32': ['--skip-unimplemented',
'--test-stack-extra', '4084']
} }
arch_llc_flags_extra = { arch_llc_flags_extra = {
# Use sse2 instructions regardless of input -mattr # Use sse2 instructions regardless of input -mattr
......
...@@ -447,9 +447,15 @@ void TargetARM32::genTargetHelperCallFor(Inst *Instr) { ...@@ -447,9 +447,15 @@ void TargetARM32::genTargetHelperCallFor(Inst *Instr) {
switch (Op) { switch (Op) {
default: default:
break; break;
case InstArithmetic::Ashr:
case InstArithmetic::Fdiv: case InstArithmetic::Fdiv:
case InstArithmetic::Udiv: case InstArithmetic::Frem:
case InstArithmetic::Lshr:
case InstArithmetic::Sdiv: case InstArithmetic::Sdiv:
case InstArithmetic::Shl:
case InstArithmetic::Srem:
case InstArithmetic::Udiv:
case InstArithmetic::Urem:
scalarizeArithmetic(Op, Dest, Instr->getSrc(0), Instr->getSrc(1)); scalarizeArithmetic(Op, Dest, Instr->getSrc(0), Instr->getSrc(1));
Instr->setDeleted(); Instr->setDeleted();
return; return;
......
; Show that we know how to translate asr ; Show that we know how to translate asr.
; NOTE: We use -O2 to get rid of memory stores. ; NOTE: We use -O2 to get rid of memory stores.
...@@ -61,3 +61,70 @@ entry: ...@@ -61,3 +61,70 @@ entry:
ret i32 %v ret i32 %v
} }
; Vector arithmetic-shift-right on <4 x i32>. The ARM32 backend scalarizes
; this: one scalar asr per lane, four in total, each verified below both as
; assembly text and as the fixed encoding e1a00150 in the disassembly.
define internal <4 x i32> @AshrVeci32(<4 x i32> %a, <4 x i32> %b) {
; ASM-LABEL:AshrVeci32:
; DIS-LABEL:00000020 <AshrVeci32>:
; IASM-LABEL:AshrVeci32:
entry:
%v = ashr <4 x i32> %a, %b
; ASM: asr r0, r0, r1
; ASM: asr r0, r0, r1
; ASM: asr r0, r0, r1
; ASM: asr r0, r0, r1
; DIS: 28: e1a00150
; DIS: 38: e1a00150
; DIS: 48: e1a00150
; DIS: 58: e1a00150
ret <4 x i32> %v
}
; Same scalarization for <8 x i16>: eight per-lane scalar asr instructions,
; one for each of the eight i16 elements.
define internal <8 x i16> @AshrVeci16(<8 x i16> %a, <8 x i16> %b) {
; ASM-LABEL:AshrVeci16:
entry:
%v = ashr <8 x i16> %a, %b
; ASM: asr r0, r0, r1
; ASM: asr r0, r0, r1
; ASM: asr r0, r0, r1
; ASM: asr r0, r0, r1
; ASM: asr r0, r0, r1
; ASM: asr r0, r0, r1
; ASM: asr r0, r0, r1
; ASM: asr r0, r0, r1
ret <8 x i16> %v
}
; Same scalarization for <16 x i8>: sixteen per-lane scalar asr instructions,
; one for each of the sixteen i8 elements.
define internal <16 x i8> @AshrVeci8(<16 x i8> %a, <16 x i8> %b) {
; ASM-LABEL:AshrVeci8:
entry:
%v = ashr <16 x i8> %a, %b
; ASM: asr r0, r0, r1
; ASM: asr r0, r0, r1
; ASM: asr r0, r0, r1
; ASM: asr r0, r0, r1
; ASM: asr r0, r0, r1
; ASM: asr r0, r0, r1
; ASM: asr r0, r0, r1
; ASM: asr r0, r0, r1
; ASM: asr r0, r0, r1
; ASM: asr r0, r0, r1
; ASM: asr r0, r0, r1
; ASM: asr r0, r0, r1
; ASM: asr r0, r0, r1
; ASM: asr r0, r0, r1
; ASM: asr r0, r0, r1
; ASM: asr r0, r0, r1
ret <16 x i8> %v
}
...@@ -248,3 +248,120 @@ entry: ...@@ -248,3 +248,120 @@ entry:
ret <16 x i8> %res ret <16 x i8> %res
} }
; Vector signed division on <4 x i32>, scalarized to four scalar sdiv
; instructions. The IASM-NOT lines assert the textual mnemonic does not
; appear in the integrated-assembler output.
define internal <4 x i32> @testSdiv4i32(<4 x i32> %v1, <4 x i32> %v2) {
; ASM-LABEL: testSdiv4i32:
; IASM-LABEL: testSdiv4i32:
entry:
%res = sdiv <4 x i32> %v1, %v2
; ASM: sdiv r0, r0, r1
; ASM: sdiv r0, r0, r1
; ASM: sdiv r0, r0, r1
; ASM: sdiv r0, r0, r1
; IASM-NOT: sdiv
ret <4 x i32> %res
}
; Vector signed division on <8 x i16>. Each lane is sign-extended from
; halfword to word (sxth on both operands) before the 32-bit scalar sdiv,
; repeated for all eight lanes.
define internal <8 x i16> @testSdiv8i16(<8 x i16> %v1, <8 x i16> %v2) {
; ASM-LABEL: testSdiv8i16:
; IASM-LABEL: testSdiv8i16:
entry:
%res = sdiv <8 x i16> %v1, %v2
; ASM: sxth r0, r0
; ASM: sxth r1, r1
; ASM: sdiv r0, r0, r1
; ASM: sxth r0, r0
; ASM: sxth r1, r1
; ASM: sdiv r0, r0, r1
; ASM: sxth r0, r0
; ASM: sxth r1, r1
; ASM: sdiv r0, r0, r1
; ASM: sxth r0, r0
; ASM: sxth r1, r1
; ASM: sdiv r0, r0, r1
; ASM: sxth r0, r0
; ASM: sxth r1, r1
; ASM: sdiv r0, r0, r1
; ASM: sxth r0, r0
; ASM: sxth r1, r1
; ASM: sdiv r0, r0, r1
; ASM: sxth r0, r0
; ASM: sxth r1, r1
; ASM: sdiv r0, r0, r1
; ASM: sxth r0, r0
; ASM: sxth r1, r1
; ASM: sdiv r0, r0, r1
; IASM-NOT: sxth
; IASM-NOT: sdiv
ret <8 x i16> %res
}
; Vector signed division on <16 x i8>. Each lane is sign-extended from
; byte to word (sxtb on both operands) before the 32-bit scalar sdiv,
; repeated for all sixteen lanes.
define internal <16 x i8> @testSdiv16i8(<16 x i8> %v1, <16 x i8> %v2) {
; ASM-LABEL: testSdiv16i8:
; IASM-LABEL: testSdiv16i8:
entry:
%res = sdiv <16 x i8> %v1, %v2
; ASM: sxtb r0, r0
; ASM: sxtb r1, r1
; ASM: sdiv r0, r0, r1
; ASM: sxtb r0, r0
; ASM: sxtb r1, r1
; ASM: sdiv r0, r0, r1
; ASM: sxtb r0, r0
; ASM: sxtb r1, r1
; ASM: sdiv r0, r0, r1
; ASM: sxtb r0, r0
; ASM: sxtb r1, r1
; ASM: sdiv r0, r0, r1
; ASM: sxtb r0, r0
; ASM: sxtb r1, r1
; ASM: sdiv r0, r0, r1
; ASM: sxtb r0, r0
; ASM: sxtb r1, r1
; ASM: sdiv r0, r0, r1
; ASM: sxtb r0, r0
; ASM: sxtb r1, r1
; ASM: sdiv r0, r0, r1
; ASM: sxtb r0, r0
; ASM: sxtb r1, r1
; ASM: sdiv r0, r0, r1
; ASM: sxtb r0, r0
; ASM: sxtb r1, r1
; ASM: sdiv r0, r0, r1
; ASM: sxtb r0, r0
; ASM: sxtb r1, r1
; ASM: sdiv r0, r0, r1
; ASM: sxtb r0, r0
; ASM: sxtb r1, r1
; ASM: sdiv r0, r0, r1
; ASM: sxtb r0, r0
; ASM: sxtb r1, r1
; ASM: sdiv r0, r0, r1
; ASM: sxtb r0, r0
; ASM: sxtb r1, r1
; ASM: sdiv r0, r0, r1
; ASM: sxtb r0, r0
; ASM: sxtb r1, r1
; ASM: sdiv r0, r0, r1
; ASM: sxtb r0, r0
; ASM: sxtb r1, r1
; ASM: sdiv r0, r0, r1
; ASM: sxtb r0, r0
; ASM: sxtb r1, r1
; ASM: sdiv r0, r0, r1
; IASM-NOT: sxtb
; IASM-NOT: sdiv
ret <16 x i8> %res
}
...@@ -61,3 +61,69 @@ entry: ...@@ -61,3 +61,69 @@ entry:
ret i32 %shl ret i32 %shl
} }
; Vector shift-left on <4 x i32>, scalarized to four scalar lsl
; instructions; the disassembly check pins the first lane's encoding.
define internal <4 x i32> @ShlVec(<4 x i32> %a, <4 x i32> %b) {
; ASM-LABEL:ShlVec:
; DIS-LABEL:00000020 <ShlVec>:
; IASM-LABEL:ShlVec:
entry:
; ASM-NEXT:.LShlVec$entry:
; IASM-NEXT:.LShlVec$entry:
%shl = shl <4 x i32> %a, %b
; ASM: lsl r0, r0, r1
; ASM: lsl r0, r0, r1
; ASM: lsl r0, r0, r1
; ASM: lsl r0, r0, r1
; DIS: 28: e1a00110
ret <4 x i32> %shl
}
; Vector shift-left on <8 x i16>: eight per-lane scalar lsl instructions.
define internal <8 x i16> @ShlVeci16(<8 x i16> %a, <8 x i16> %b) {
; ASM-LABEL:ShlVeci16:
entry:
%v = shl <8 x i16> %a, %b
; ASM: lsl r0, r0, r1
; ASM: lsl r0, r0, r1
; ASM: lsl r0, r0, r1
; ASM: lsl r0, r0, r1
; ASM: lsl r0, r0, r1
; ASM: lsl r0, r0, r1
; ASM: lsl r0, r0, r1
; ASM: lsl r0, r0, r1
ret <8 x i16> %v
}
; Vector shift-left on <16 x i8>: sixteen per-lane scalar lsl instructions.
define internal <16 x i8> @ShlVeci8(<16 x i8> %a, <16 x i8> %b) {
; ASM-LABEL:ShlVeci8:
entry:
%v = shl <16 x i8> %a, %b
; ASM: lsl r0, r0, r1
; ASM: lsl r0, r0, r1
; ASM: lsl r0, r0, r1
; ASM: lsl r0, r0, r1
; ASM: lsl r0, r0, r1
; ASM: lsl r0, r0, r1
; ASM: lsl r0, r0, r1
; ASM: lsl r0, r0, r1
; ASM: lsl r0, r0, r1
; ASM: lsl r0, r0, r1
; ASM: lsl r0, r0, r1
; ASM: lsl r0, r0, r1
; ASM: lsl r0, r0, r1
; ASM: lsl r0, r0, r1
; ASM: lsl r0, r0, r1
; ASM: lsl r0, r0, r1
ret <16 x i8> %v
}
...@@ -61,3 +61,69 @@ entry: ...@@ -61,3 +61,69 @@ entry:
ret i32 %v ret i32 %v
} }
; Vector logical-shift-right on <4 x i32>, scalarized to four scalar lsr
; instructions; the disassembly check pins the first lane's encoding.
define internal <4 x i32> @LshrVec(<4 x i32> %a, <4 x i32> %b) {
; ASM-LABEL:LshrVec:
; DIS-LABEL:00000020 <LshrVec>:
; IASM-LABEL:LshrVec:
entry:
; ASM-NEXT:.LLshrVec$entry:
; IASM-NEXT:.LLshrVec$entry:
%v = lshr <4 x i32> %a, %b
; ASM: lsr r0, r0, r1
; ASM: lsr r0, r0, r1
; ASM: lsr r0, r0, r1
; ASM: lsr r0, r0, r1
; DIS: 28: e1a00130
ret <4 x i32> %v
}
; Vector logical-shift-right on <8 x i16>: eight per-lane scalar lsr
; instructions.
define internal <8 x i16> @LshrVeci16(<8 x i16> %a, <8 x i16> %b) {
; ASM-LABEL:LshrVeci16:
entry:
%v = lshr <8 x i16> %a, %b
; ASM: lsr r0, r0, r1
; ASM: lsr r0, r0, r1
; ASM: lsr r0, r0, r1
; ASM: lsr r0, r0, r1
; ASM: lsr r0, r0, r1
; ASM: lsr r0, r0, r1
; ASM: lsr r0, r0, r1
; ASM: lsr r0, r0, r1
ret <8 x i16> %v
}
; Vector logical-shift-right on <16 x i8>: sixteen per-lane scalar lsr
; instructions.
define internal <16 x i8> @LshrVeci8(<16 x i8> %a, <16 x i8> %b) {
; ASM-LABEL:LshrVeci8:
entry:
%v = lshr <16 x i8> %a, %b
; ASM: lsr r0, r0, r1
; ASM: lsr r0, r0, r1
; ASM: lsr r0, r0, r1
; ASM: lsr r0, r0, r1
; ASM: lsr r0, r0, r1
; ASM: lsr r0, r0, r1
; ASM: lsr r0, r0, r1
; ASM: lsr r0, r0, r1
; ASM: lsr r0, r0, r1
; ASM: lsr r0, r0, r1
; ASM: lsr r0, r0, r1
; ASM: lsr r0, r0, r1
; ASM: lsr r0, r0, r1
; ASM: lsr r0, r0, r1
; ASM: lsr r0, r0, r1
; ASM: lsr r0, r0, r1
ret <16 x i8> %v
}
; Show that we know how to translate vector urem, srem and frem.
; NOTE: We use -O2 to get rid of memory stores.
; REQUIRES: allow_dump
; Compile using standalone assembler.
; RUN: %p2i --filetype=asm -i %s --target=arm32 --args -O2 -mattr=hwdiv-arm \
; RUN: | FileCheck %s --check-prefix=ASM
; Vector unsigned remainder on <4 x i32>, scalarized per lane. Each lane
; computes quotient with udiv, then remainder a - q*b with mls, and inserts
; the result back into the vector with vmov.32 (lanes of d4/d5).
define internal <4 x i32> @Urem4i32(<4 x i32> %a, <4 x i32> %b) {
; ASM-LABEL:Urem4i32:
entry:
; ASM-NEXT:.LUrem4i32$entry:
%v = urem <4 x i32> %a, %b
; ASM-LABEL:.LUrem4i32$local$__0:
; ASM-NEXT: udiv r2, r0, r1
; ASM-NEXT: mls r2, r2, r1, r0
; ASM-NEXT: vmov.32 d4[0], r2
; ASM-LABEL:.LUrem4i32$local$__1:
; ASM-NEXT: udiv r2, r0, r1
; ASM-NEXT: mls r2, r2, r1, r0
; ASM-NEXT: vmov.32 d4[1], r2
; ASM-LABEL:.LUrem4i32$local$__2:
; ASM-NEXT: udiv r2, r0, r1
; ASM-NEXT: mls r2, r2, r1, r0
; ASM-NEXT: vmov.32 d5[0], r2
; ASM-LABEL:.LUrem4i32$local$__3:
; ASM-NEXT: udiv r2, r0, r1
; ASM-NEXT: mls r2, r2, r1, r0
; ASM-NEXT: vmov.32 d5[1], r2
ret <4 x i32> %v
}
; Vector signed remainder on <4 x i32>: identical per-lane pattern to the
; unsigned case above except the quotient comes from sdiv.
define internal <4 x i32> @Srem4i32(<4 x i32> %a, <4 x i32> %b) {
; ASM-LABEL:Srem4i32:
entry:
; ASM-NEXT:.LSrem4i32$entry:
%v = srem <4 x i32> %a, %b
; ASM-LABEL:.LSrem4i32$local$__0:
; ASM-NEXT: sdiv r2, r0, r1
; ASM-NEXT: mls r2, r2, r1, r0
; ASM-NEXT: vmov.32 d4[0], r2
; ASM-LABEL:.LSrem4i32$local$__1:
; ASM-NEXT: sdiv r2, r0, r1
; ASM-NEXT: mls r2, r2, r1, r0
; ASM-NEXT: vmov.32 d4[1], r2
; ASM-LABEL:.LSrem4i32$local$__2:
; ASM-NEXT: sdiv r2, r0, r1
; ASM-NEXT: mls r2, r2, r1, r0
; ASM-NEXT: vmov.32 d5[0], r2
; ASM-LABEL:.LSrem4i32$local$__3:
; ASM-NEXT: sdiv r2, r0, r1
; ASM-NEXT: mls r2, r2, r1, r0
; ASM-NEXT: vmov.32 d5[1], r2
ret <4 x i32> %v
}
; Vector floating-point remainder on <4 x float>. There is no hardware frem,
; so each lane is lowered to a libcall: operands are moved into s0/s1, the
; address of fmodf is materialized with movw/movt, and called via blx.
; NOTE(review): lanes are staged through s16-s23 (callee-saved d8-d11),
; presumably so the vector survives across the calls - confirm in lowering.
define internal <4 x float> @Frem4float(<4 x float> %a, <4 x float> %b) {
; ASM-LABEL:Frem4float:
entry:
; ASM-NEXT:.LFrem4float$entry:
%v = frem <4 x float> %a, %b
; ASM: vmov.f32 s0, s16
; ASM-NEXT: vmov.f32 s1, s20
; ASM-NEXT: movw r0, #:lower16:fmodf
; ASM-NEXT: movt r0, #:upper16:fmodf
; ASM-NEXT: blx r0
; ASM: vmov.f32 s0, s17
; ASM-NEXT: vmov.f32 s1, s21
; ASM-NEXT: movw r0, #:lower16:fmodf
; ASM-NEXT: movt r0, #:upper16:fmodf
; ASM-NEXT: blx r0
; ASM: vmov.f32 s0, s18
; ASM-NEXT: vmov.f32 s1, s22
; ASM-NEXT: movw r0, #:lower16:fmodf
; ASM-NEXT: movt r0, #:upper16:fmodf
; ASM-NEXT: blx r0
; ASM: vmov.f32 s16, s19
; ASM-NEXT: vmov.f32 s20, s23
; ASM-NEXT: movw r0, #:lower16:fmodf
; ASM-NEXT: movt r0, #:upper16:fmodf
; ASM: blx r0
ret <4 x float> %v
}
; Show that we translate intrinsics for fabs on float and double. ; Show that we translate intrinsics for fabs on float, double and float vectors.
; REQUIRES: allow_dump ; REQUIRES: allow_dump
...@@ -26,6 +26,7 @@ ...@@ -26,6 +26,7 @@
declare float @llvm.fabs.f32(float) declare float @llvm.fabs.f32(float)
declare double @llvm.fabs.f64(double) declare double @llvm.fabs.f64(double)
declare <4 x float> @llvm.fabs.v4f32(<4 x float>)
define internal float @test_fabs_float(float %x) { define internal float @test_fabs_float(float %x) {
; ASM-LABEL: test_fabs_float: ; ASM-LABEL: test_fabs_float:
...@@ -56,3 +57,18 @@ entry: ...@@ -56,3 +57,18 @@ entry:
ret double %r ret double %r
} }
; fabs on <4 x float> is NOT scalarized: it maps to a single NEON vabs.f32
; on a q register, with the encoding f3b90740 pinned in the disassembly.
define internal <4 x float> @test_fabs_4float(<4 x float> %x) {
; ASM-LABEL: test_fabs_4float:
; DIS-LABEL: 00000050 <test_fabs_4float>:
; IASM-LABEL: test_fabs_4float:
entry:
%r = call <4 x float> @llvm.fabs.v4f32(<4 x float> %x)
; ASM: vabs.f32 q0, q0
; DIS: 60: f3b90740
; IASM-NOT: vabs.f32
ret <4 x float> %r
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment