Subzero: Use scalar arithmetic when no vector instruction exists.

Implement scalarizeArithmetic(), which extracts the components of the input vectors, performs the operation with scalar instructions, and builds the output vector component by component.

Fix the lowering of sdiv and srem. Previously these emitted the wrong sign-extension instruction (cdq) for i8 and i16 inputs, which need cbw and cwd respectively.

In the test_arith crosstest, mask the shift-amount inputs to vector shift operations to ensure that the shifts are in range. Otherwise the Subzero output is not identical to the llc output in some (undefined) cases.

BUG=none
R=stichnot@chromium.org
Review URL: https://codereview.chromium.org/443203003

Subzero: Use scalar arithmetic when no vector instruction exists.
afeaee41 · Matt Wala · 206833c6 · afeaee41 · afeaee41 · afeaee41
Commit afeaee41 authored Aug 07, 2014 by Matt Wala
10 changed files
--- a/crosstest/test_arith.cpp
+++ b/crosstest/test_arith.cpp
@@ -18,7 +18,7 @@

 #include "test_arith.h"

-#define X(inst, op, isdiv)                                                     \
+#define X(inst, op, isdiv, isshift)                                            \
  bool test##inst(bool a, bool b) { return a op b; }                           \
  uint8_t test##inst(uint8_t a, uint8_t b) { return a op b; }                  \
  uint16_t test##inst(uint16_t a, uint16_t b) { return a op b; }               \
@@ -30,7 +30,7 @@
 UINTOP_TABLE
 #undef X

-#define X(inst, op, isdiv)                                                     \
+#define X(inst, op, isdiv, isshift)                                            \
  bool test##inst(bool a, bool b) { return a op b; }                           \
  myint8_t test##inst(myint8_t a, myint8_t b) { return a op b; }               \
  int16_t test##inst(int16_t a, int16_t b) { return a op b; }                  \

--- a/crosstest/test_arith.def
+++ b/crosstest/test_arith.def
@@ -18,25 +18,25 @@
 #define STR(s) #s

 #define UINTOP_TABLE                 \
-  /* inst, operator, div */ \
-  X(Add,   +,        0 )    \
-  X(Sub,   -,        0 )    \
-  X(Mul,   *,        0 )    \
-  X(Udiv,  /,        1 )    \
-  X(Urem,  %,        1 )    \
-  X(Shl,   <<,       0)     \
-  X(Lshr,  >>,       0)     \
-  X(And,   &,        0 )    \
-  X(Or,    |,        0 )    \
-  X(Xor,   ^,        0 )    \
-//#define X(inst, op, isdiv)
+  /* inst, operator, div, shift */   \
+  X(Add,   +,        0,   0)         \
+  X(Sub,   -,        0,   0)         \
+  X(Mul,   *,        0,   0)         \
+  X(Udiv,  /,        1,   0)         \
+  X(Urem,  %,        1,   0)         \
+  X(Shl,   <<,       0,   1)         \
+  X(Lshr,  >>,       0,   1)         \
+  X(And,   &,        0,   0)         \
+  X(Or,    |,        0,   0)         \
+  X(Xor,   ^,        0,   0)         \
+//#define X(inst, op, isdiv, isshift)

 #define SINTOP_TABLE                 \
-  /* inst, operator, div */ \
-  X(Sdiv,  /,        1)     \
-  X(Srem,  %,        1)     \
-  X(Ashr,  >>,       0)     \
-//#define X(inst, op, isdiv)
+  /* inst, operator, div, shift */   \
+  X(Sdiv,  /,        1,   0)         \
+  X(Srem,  %,        1,   0)         \
+  X(Ashr,  >>,       0,   1)         \
+//#define X(inst, op, isdiv, isshift)

 #define COMMA ,
 #define FPOP_TABLE           \

--- a/crosstest/test_arith.h
+++ b/crosstest/test_arith.h
@@ -17,7 +17,7 @@

 #include "vectors.h"

-#define X(inst, op, isdiv)                                                     \
+#define X(inst, op, isdiv, isshift)                                            \
  bool test##inst(bool a, bool b);                                             \
  uint8_t test##inst(uint8_t a, uint8_t b);                                    \
  uint16_t test##inst(uint16_t a, uint16_t b);                                 \
@@ -29,7 +29,7 @@
 UINTOP_TABLE
 #undef X

-#define X(inst, op, isdiv)                                                     \
+#define X(inst, op, isdiv, isshift)                                            \
  bool test##inst(bool a, bool b);                                             \
  myint8_t test##inst(myint8_t a, myint8_t b);                                 \
  int16_t test##inst(int16_t a, int16_t b);                                    \

--- a/crosstest/test_arith_main.cpp
+++ b/crosstest/test_arith_main.cpp
@@ -61,12 +61,12 @@ void testsInt(size_t &TotalTests, size_t &Passes, size_t &Failures) {
    FuncTypeSigned FuncSzSigned;
    bool ExcludeDivExceptions; // for divide related tests
  } Funcs[] = {
-#define X(inst, op, isdiv)                                                     \
+#define X(inst, op, isdiv, isshift)                                            \
  { STR(inst), test##inst, Subzero_::test##inst, NULL, NULL, isdiv }           \
  ,
      UINTOP_TABLE
 #undef X
-#define X(inst, op, isdiv)                                                     \
+#define X(inst, op, isdiv, isshift)                                            \
  { STR(inst), NULL, NULL, test##inst, Subzero_::test##inst, isdiv }           \
  ,
      SINTOP_TABLE
@@ -172,17 +172,18 @@ void testsVecInt(size_t &TotalTests, size_t &Passes, size_t &Failures) {
    FuncTypeSigned FuncLlcSigned;
    FuncTypeSigned FuncSzSigned;
    bool ExcludeDivExceptions; // for divide related tests
+    bool MaskShiftOperations;  // for shift related tests
  } Funcs[] = {
-#define X(inst, op, isdiv)                                                     \
+#define X(inst, op, isdiv, isshift)                                            \
  {                                                                            \
-    STR(inst), test##inst, Subzero_::test##inst, NULL, NULL, isdiv             \
+    STR(inst), test##inst, Subzero_::test##inst, NULL, NULL, isdiv, isshift    \
  }                                                                            \
  ,
        UINTOP_TABLE
 #undef X
-#define X(inst, op, isdiv)                                                     \
+#define X(inst, op, isdiv, isshift)                                            \
  {                                                                            \
-    STR(inst), NULL, NULL, test##inst, Subzero_::test##inst, isdiv             \
+    STR(inst), NULL, NULL, test##inst, Subzero_::test##inst, isdiv, isshift    \
  }                                                                            \
  ,
        SINTOP_TABLE
@@ -201,6 +202,8 @@ void testsVecInt(size_t &TotalTests, size_t &Passes, size_t &Failures) {
        if (Funcs[f].ExcludeDivExceptions &&
            inputsMayTriggerException<ElementTypeSigned>(Element1, Element2))
          continue;
+        if (Funcs[f].MaskShiftOperations)
+          Element2 &= CHAR_BIT * sizeof(ElementTypeUnsigned) - 1;
        Value1[j] = Element1;
        Value2[j] = Element2;
        ++j;
@@ -360,37 +363,3 @@ int main(int argc, char **argv) {
  return Failures;
 }

-extern "C" {
-// Subzero helpers
-  v4si32 Sz_shl_v4i32(v4si32 a, v4si32 b) { return a << b; }
-  v4si32 Sz_ashr_v4i32(v4si32 a, v4si32 b) { return a >> b; }
-  v4ui32 Sz_lshr_v4i32(v4ui32 a, v4ui32 b) { return a >> b; }
-  v4si32 Sz_sdiv_v4i32(v4si32 a, v4si32 b) { return a / b; }
-  v4ui32 Sz_udiv_v4i32(v4ui32 a, v4ui32 b) { return a / b; }
-  v4si32 Sz_srem_v4i32(v4si32 a, v4si32 b) { return a % b; }
-  v4ui32 Sz_urem_v4i32(v4ui32 a, v4ui32 b) { return a % b; }
-
-  v8si16 Sz_shl_v8i16(v8si16 a, v8si16 b) { return a << b; }
-  v8si16 Sz_ashr_v8i16(v8si16 a, v8si16 b) { return a >> b; }
-  v8ui16 Sz_lshr_v8i16(v8ui16 a, v8ui16 b) { return a >> b; }
-  v8si16 Sz_sdiv_v8i16(v8si16 a, v8si16 b) { return a / b; }
-  v8ui16 Sz_udiv_v8i16(v8ui16 a, v8ui16 b) { return a / b; }
-  v8si16 Sz_srem_v8i16(v8si16 a, v8si16 b) { return a % b; }
-  v8ui16 Sz_urem_v8i16(v8ui16 a, v8ui16 b) { return a % b; }
-
-  v16ui8 Sz_mul_v16i8(v16ui8 a, v16ui8 b) { return a * b; }
-  v16si8 Sz_shl_v16i8(v16si8 a, v16si8 b) { return a << b; }
-  v16si8 Sz_ashr_v16i8(v16si8 a, v16si8 b) { return a >> b; }
-  v16ui8 Sz_lshr_v16i8(v16ui8 a, v16ui8 b) { return a >> b; }
-  v16si8 Sz_sdiv_v16i8(v16si8 a, v16si8 b) { return a / b; }
-  v16ui8 Sz_udiv_v16i8(v16ui8 a, v16ui8 b) { return a / b; }
-  v16si8 Sz_srem_v16i8(v16si8 a, v16si8 b) { return a % b; }
-  v16ui8 Sz_urem_v16i8(v16ui8 a, v16ui8 b) { return a % b; }
-
-  v4f32 Sz_frem_v4f32(v4f32 a, v4f32 b) {
-    v4f32 Result;
-    for (int i = 0; i < 4; ++i)
-      Result[i] = fmodf(a[i], b[i]);
-    return Result;
-  }
-}
--- a/src/IceInstX8632.cpp
+++ b/src/IceInstX8632.cpp
@@ -136,11 +136,8 @@ InstX8632Call::InstX8632Call(Cfg *Func, Variable *Dest, Operand *CallTarget)
  addSource(CallTarget);
 }

-InstX8632Cdq::InstX8632Cdq(Cfg *Func, Variable *Dest, Operand *Source)
-    : InstX8632(Func, InstX8632::Cdq, 1, Dest) {
-  assert(Dest->getRegNum() == TargetX8632::Reg_edx);
-  assert(llvm::isa<Variable>(Source));
-  assert(llvm::dyn_cast<Variable>(Source)->getRegNum() == TargetX8632::Reg_eax);
+InstX8632Cbwdq::InstX8632Cbwdq(Cfg *Func, Variable *Dest, Operand *Source)
+    : InstX8632(Func, InstX8632::Cbwdq, 1, Dest) {
  addSource(Source);
 }

@@ -721,16 +718,35 @@ void InstX8632Shrd::dump(const Cfg *Func) const {
  dumpSources(Func);
 }

-void InstX8632Cdq::emit(const Cfg *Func) const {
+void InstX8632Cbwdq::emit(const Cfg *Func) const {
  Ostream &Str = Func->getContext()->getStrEmit();
  assert(getSrcSize() == 1);
+  Operand *Src0 = getSrc(0);
+  assert(llvm::isa<Variable>(Src0));
+  assert(llvm::cast<Variable>(Src0)->getRegNum() == TargetX8632::Reg_eax);
+  switch (Src0->getType()) {
+  default:
+    llvm_unreachable("unexpected source type!");
+    break;
+  case IceType_i8:
+    assert(getDest()->getRegNum() == TargetX8632::Reg_eax);
+    Str << "\tcbw\n";
+    break;
+  case IceType_i16:
+    assert(getDest()->getRegNum() == TargetX8632::Reg_edx);
+    Str << "\tcwd\n";
+    break;
+  case IceType_i32:
+    assert(getDest()->getRegNum() == TargetX8632::Reg_edx);
    Str << "\tcdq\n";
+    break;
+  }
 }

-void InstX8632Cdq::dump(const Cfg *Func) const {
+void InstX8632Cbwdq::dump(const Cfg *Func) const {
  Ostream &Str = Func->getContext()->getStrDump();
  dumpDest(Func);
-  Str << " = cdq." << getSrc(0)->getType() << " ";
+  Str << " = cbw/cwd/cdq." << getSrc(0)->getType() << " ";
  dumpSources(Func);
 }


--- a/src/IceInstX8632.h
+++ b/src/IceInstX8632.h
@@ -144,7 +144,7 @@ public:
    Bsr,
    Bswap,
    Call,
-    Cdq,
+    Cbwdq,
    Cmov,
    Cmpps,
    Cmpxchg,
@@ -689,22 +689,22 @@ private:
  virtual ~InstX8632Shrd() {}
 };

-// Cdq instruction - sign-extend eax into edx
-class InstX8632Cdq : public InstX8632 {
+// Cbwdq instruction - wrapper for cbw, cwd, or cdq
+class InstX8632Cbwdq : public InstX8632 {
 public:
-  static InstX8632Cdq *create(Cfg *Func, Variable *Dest, Operand *Source) {
-    return new (Func->allocate<InstX8632Cdq>())
-        InstX8632Cdq(Func, Dest, Source);
+  static InstX8632Cbwdq *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX8632Cbwdq>())
+        InstX8632Cbwdq(Func, Dest, Source);
  }
  virtual void emit(const Cfg *Func) const;
  virtual void dump(const Cfg *Func) const;
-  static bool classof(const Inst *Inst) { return isClassof(Inst, Cdq); }
+  static bool classof(const Inst *Inst) { return isClassof(Inst, Cbwdq); }

 private:
-  InstX8632Cdq(Cfg *Func, Variable *Dest, Operand *Source);
-  InstX8632Cdq(const InstX8632Cdq &) LLVM_DELETED_FUNCTION;
-  InstX8632Cdq &operator=(const InstX8632Cdq &) LLVM_DELETED_FUNCTION;
-  virtual ~InstX8632Cdq() {}
+  InstX8632Cbwdq(Cfg *Func, Variable *Dest, Operand *Source);
+  InstX8632Cbwdq(const InstX8632Cbwdq &) LLVM_DELETED_FUNCTION;
+  InstX8632Cbwdq &operator=(const InstX8632Cbwdq &) LLVM_DELETED_FUNCTION;
+  virtual ~InstX8632Cbwdq() {}
 };

 // Conditional move instruction.

--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -1296,78 +1296,18 @@ void TargetX8632::lowerArithmetic(const InstArithmetic *Inst) {
        _movp(Dest, T4);
      } else {
        assert(Dest->getType() == IceType_v16i8);
-        // Sz_mul_v16i8
-        const IceString Helper = "Sz_mul_v16i8";
-        const SizeT MaxSrcs = 2;
-        InstCall *Call = makeHelperCall(Helper, Dest, MaxSrcs);
-        Call->addArg(Src0);
-        Call->addArg(Src1);
-        lowerCall(Call);
+        scalarizeArithmetic(Inst->getOp(), Dest, Src0, Src1);
      }
    } break;
-    case InstArithmetic::Shl: {
-      // Sz_shl_v4i32, Sz_shl_v8i16, Sz_shl_v16i8
-      const IceString Helper = "Sz_shl_" + typeIdentString(Dest->getType());
-      const SizeT MaxSrcs = 2;
-      InstCall *Call = makeHelperCall(Helper, Dest, MaxSrcs);
-      Call->addArg(Src0);
-      Call->addArg(Src1);
-      lowerCall(Call);
-    } break;
-    case InstArithmetic::Lshr: {
-      // Sz_lshr_v4i32, Sz_lshr_v8i16, Sz_lshr_v16i8
-      const IceString Helper = "Sz_lshr_" + typeIdentString(Dest->getType());
-      const SizeT MaxSrcs = 2;
-      InstCall *Call = makeHelperCall(Helper, Dest, MaxSrcs);
-      Call->addArg(Src0);
-      Call->addArg(Src1);
-      lowerCall(Call);
-    } break;
-    case InstArithmetic::Ashr: {
-      // Sz_ashr_v4i32, Sz_ashr_v8i16, Sz_ashr_v16i8
-      const IceString Helper = "Sz_ashr_" + typeIdentString(Dest->getType());
-      const SizeT MaxSrcs = 2;
-      InstCall *Call = makeHelperCall(Helper, Dest, MaxSrcs);
-      Call->addArg(Src0);
-      Call->addArg(Src1);
-      lowerCall(Call);
-    } break;
-    case InstArithmetic::Udiv: {
-      // Sz_udiv_v4i32, Sz_udiv_v8i16, Sz_udiv_v16i8
-      const IceString Helper = "Sz_udiv_" + typeIdentString(Dest->getType());
-      const SizeT MaxSrcs = 2;
-      InstCall *Call = makeHelperCall(Helper, Dest, MaxSrcs);
-      Call->addArg(Src0);
-      Call->addArg(Src1);
-      lowerCall(Call);
-    } break;
-    case InstArithmetic::Sdiv: {
-      // Sz_sdiv_v4i32, Sz_sdiv_v8i16, Sz_sdiv_v16i8
-      const IceString Helper = "Sz_sdiv_" + typeIdentString(Dest->getType());
-      const SizeT MaxSrcs = 2;
-      InstCall *Call = makeHelperCall(Helper, Dest, MaxSrcs);
-      Call->addArg(Src0);
-      Call->addArg(Src1);
-      lowerCall(Call);
-    } break;
-    case InstArithmetic::Urem: {
-      // Sz_urem_v4i32, Sz_urem_v8i16, Sz_urem_v16i8
-      const IceString Helper = "Sz_urem_" + typeIdentString(Dest->getType());
-      const SizeT MaxSrcs = 2;
-      InstCall *Call = makeHelperCall(Helper, Dest, MaxSrcs);
-      Call->addArg(Src0);
-      Call->addArg(Src1);
-      lowerCall(Call);
-    } break;
-    case InstArithmetic::Srem: {
-      // Sz_srem_v4i32, Sz_srem_v8i16, Sz_srem_v16i8
-      const IceString Helper = "Sz_srem_" + typeIdentString(Dest->getType());
-      const SizeT MaxSrcs = 2;
-      InstCall *Call = makeHelperCall(Helper, Dest, MaxSrcs);
-      Call->addArg(Src0);
-      Call->addArg(Src1);
-      lowerCall(Call);
-    } break;
+    case InstArithmetic::Shl:
+    case InstArithmetic::Lshr:
+    case InstArithmetic::Ashr:
+    case InstArithmetic::Udiv:
+    case InstArithmetic::Urem:
+    case InstArithmetic::Sdiv:
+    case InstArithmetic::Srem:
+      scalarizeArithmetic(Inst->getOp(), Dest, Src0, Src1);
+      break;
    case InstArithmetic::Fadd: {
      Variable *T = makeReg(Dest->getType());
      _movp(T, Src0);
@@ -1392,13 +1332,9 @@ void TargetX8632::lowerArithmetic(const InstArithmetic *Inst) {
      _divps(T, LEGAL_HACK(Src1));
      _movp(Dest, T);
    } break;
-    case InstArithmetic::Frem: {
-      const SizeT MaxSrcs = 2;
-      InstCall *Call = makeHelperCall("Sz_frem_v4f32", Dest, MaxSrcs);
-      Call->addArg(Src0);
-      Call->addArg(Src1);
-      lowerCall(Call);
-    } break;
+    case InstArithmetic::Frem:
+      scalarizeArithmetic(Inst->getOp(), Dest, Src0, Src1);
+      break;
    }
 #undef LEGAL_HACK
  } else { // Dest->getType() is non-i64 scalar
@@ -1490,11 +1426,18 @@ void TargetX8632::lowerArithmetic(const InstArithmetic *Inst) {
      break;
    case InstArithmetic::Sdiv:
      Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
+      if (Dest->getType() == IceType_i8) {
+        _mov(T, Src0, Reg_eax);
+        _cbwdq(T, T);
+        _idiv(T, Src1, T);
+        _mov(Dest, T);
+      } else {
        T_edx = makeReg(IceType_i32, Reg_edx);
        _mov(T, Src0, Reg_eax);
-      _cdq(T_edx, T);
+        _cbwdq(T_edx, T);
        _idiv(T, Src1, T_edx);
        _mov(Dest, T);
+      }
      break;
    case InstArithmetic::Urem:
      Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
@@ -1515,11 +1458,20 @@ void TargetX8632::lowerArithmetic(const InstArithmetic *Inst) {
      break;
    case InstArithmetic::Srem:
      Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
+      if (Dest->getType() == IceType_i8) {
+        Variable *T_ah = makeReg(IceType_i8, Reg_ah);
+        _mov(T, Src0, Reg_eax);
+        _cbwdq(T, T);
+        Context.insert(InstFakeDef::create(Func, T_ah));
+        _idiv(T_ah, Src1, T);
+        _mov(Dest, T_ah);
+      } else {
        T_edx = makeReg(IceType_i32, Reg_edx);
        _mov(T, Src0, Reg_eax);
-      _cdq(T_edx, T);
+        _cbwdq(T_edx, T);
        _idiv(T_edx, Src1, T);
        _mov(Dest, T_edx);
+      }
      break;
    case InstArithmetic::Fadd:
      _mov(T, Src0);
@@ -3744,6 +3696,39 @@ void TargetX8632::lowerSwitch(const InstSwitch *Inst) {
  _br(Inst->getLabelDefault());
 }

+void TargetX8632::scalarizeArithmetic(InstArithmetic::OpKind Kind,
+                                      Variable *Dest, Operand *Src0,
+                                      Operand *Src1) {
+  assert(isVectorType(Dest->getType()));
+  Type Ty = Dest->getType();
+  Type ElementTy = typeElementType(Ty);
+  SizeT NumElements = typeNumElements(Ty);
+
+  Operand *T = Ctx->getConstantUndef(Ty);
+  for (SizeT I = 0; I < NumElements; ++I) {
+    Constant *Index = Ctx->getConstantInt(IceType_i32, I);
+
+    // Extract the next two inputs.
+    Variable *Op0 = Func->makeVariable(ElementTy, Context.getNode());
+    lowerExtractElement(InstExtractElement::create(Func, Op0, Src0, Index));
+    Variable *Op1 = Func->makeVariable(ElementTy, Context.getNode());
+    lowerExtractElement(InstExtractElement::create(Func, Op1, Src1, Index));
+
+    // Perform the arithmetic as a scalar operation.
+    Variable *Res = Func->makeVariable(ElementTy, Context.getNode());
+    lowerArithmetic(InstArithmetic::create(Func, Kind, Res, Op0, Op1));
+
+    // Insert the result into position.
+    Variable *DestT = Func->makeVariable(Ty, Context.getNode());
+    lowerInsertElement(InstInsertElement::create(Func, DestT, T, Res, Index));
+    T = DestT;
+    // TODO(stichnot): Use postLower() in -Om1 mode to avoid buildup of
+    // infinite weight temporaries.
+  }
+
+  lowerAssign(InstAssign::create(Func, Dest, T));
+}
+
 // The following pattern occurs often in lowered C and C++ code:
 //
 //   %cmp     = fcmp/icmp pred <n x ty> %src0, %src1

--- a/src/IceTargetLoweringX8632.h
+++ b/src/IceTargetLoweringX8632.h
@@ -122,6 +122,9 @@ protected:

  void eliminateNextVectorSextInstruction(Variable *SignExtendedResult);

+  void scalarizeArithmetic(InstArithmetic::OpKind K, Variable *Dest,
+                           Operand *Src0, Operand *Src1);
+
  // Operand legalization helpers.  To deal with address mode
  // constraints, the helpers will create a new Operand and emit
  // instructions that guarantee that the Operand kind is one of those
@@ -220,8 +223,8 @@ protected:
  void _bswap(Variable *SrcDest) {
    Context.insert(InstX8632Bswap::create(Func, SrcDest));
  }
-  void _cdq(Variable *Dest, Operand *Src0) {
-    Context.insert(InstX8632Cdq::create(Func, Dest, Src0));
+  void _cbwdq(Variable *Dest, Operand *Src0) {
+    Context.insert(InstX8632Cbwdq::create(Func, Dest, Src0));
  }
  void _cmov(Variable *Dest, Operand *Src0, InstX8632::BrCond Condition) {
    Context.insert(InstX8632Cmov::create(Func, Dest, Src0, Condition));

--- a/tests_lit/llvm2ice_tests/sdiv.ll
+++ b/tests_lit/llvm2ice_tests/sdiv.ll
+; This checks the correctness of the lowering code for the small
+; integer variants of sdiv and srem.
+
+; RUN: %llvm2ice --verbose none %s | FileCheck  %s
+; RUN: %llvm2ice -O2 --verbose none %s | FileCheck  %s
+; RUN: %llvm2ice -O2 --verbose none %s \
+; RUN:               | llvm-mc -arch=x86 -x86-asm-syntax=intel -filetype=obj
+; RUN: %llvm2ice -Om1 --verbose none %s \
+; RUN:               | llvm-mc -arch=x86 -x86-asm-syntax=intel -filetype=obj
+; RUN: %llvm2ice --verbose none %s | FileCheck --check-prefix=ERRORS %s
+; RUN: %llvm2iceinsts %s | %szdiff %s | FileCheck --check-prefix=DUMP %s
+; RUN: %llvm2iceinsts --pnacl %s | %szdiff %s \
+; RUN:                           | FileCheck --check-prefix=DUMP %s
+
+define i32 @sdiv_i8(i32 %a.i32, i32 %b.i32) {
+entry:
+  %a = trunc i32 %a.i32 to i8
+  %b = trunc i32 %b.i32 to i8
+  %res = sdiv i8 %a, %b
+  %res.i32 = zext i8 %res to i32
+  ret i32 %res.i32
+; CHECK-LABEL: sdiv_i8:
+; CHECK: cbw
+; CHECK: idiv
+}
+
+define i32 @sdiv_i16(i32 %a.i32, i32 %b.i32) {
+entry:
+  %a = trunc i32 %a.i32 to i16
+  %b = trunc i32 %b.i32 to i16
+  %res = sdiv i16 %a, %b
+  %res.i32 = zext i16 %res to i32
+  ret i32 %res.i32
+; CHECK-LABEL: sdiv_i16:
+; CHECK: cwd
+; CHECK: idiv
+}
+
+define i32 @sdiv_i32(i32 %a, i32 %b) {
+entry:
+  %res = sdiv i32 %a, %b
+  ret i32 %res
+; CHECK-LABEL: sdiv_i32:
+; CHECK: cdq
+; CHECK: idiv
+}
+
+define i32 @srem_i8(i32 %a.i32, i32 %b.i32) {
+entry:
+  %a = trunc i32 %a.i32 to i8
+  %b = trunc i32 %b.i32 to i8
+  %res = srem i8 %a, %b
+  %res.i32 = zext i8 %res to i32
+  ret i32 %res.i32
+; CHECK-LABEL: srem_i8:
+; CHECK: cbw
+; CHECK: idiv
+}
+
+define i32 @srem_i16(i32 %a.i32, i32 %b.i32) {
+entry:
+  %a = trunc i32 %a.i32 to i16
+  %b = trunc i32 %b.i32 to i16
+  %res = srem i16 %a, %b
+  %res.i32 = zext i16 %res to i32
+  ret i32 %res.i32
+; CHECK-LABEL: srem_i16:
+; CHECK: cwd
+; CHECK: idiv
+}
+
+define i32 @srem_i32(i32 %a, i32 %b) {
+entry:
+  %res = srem i32 %a, %b
+  ret i32 %res
+; CHECK-LABEL: srem_i32:
+; CHECK: cdq
+; CHECK: idiv
+}
+
+; ERRORS-NOT: ICE translation error
+; DUMP-NOT: SZ
--- a/tests_lit/llvm2ice_tests/vector-arith.ll
+++ b/tests_lit/llvm2ice_tests/vector-arith.ll
@@ -56,7 +56,10 @@ entry:
  %res = frem <4 x float> %arg0, %arg1
  ret <4 x float> %res
 ; CHECK-LABEL: test_frem:
-; CHECK: Sz_frem_v4f32
+; CHECK: fmodf
+; CHECK: fmodf
+; CHECK: fmodf
+; CHECK: fmodf
 }

 define <16 x i8> @test_add_v16i8(<16 x i8> %arg0, <16 x i8> %arg1) {
@@ -104,7 +107,22 @@ entry:
  %res = mul <16 x i8> %arg0, %arg1
  ret <16 x i8> %res
 ; CHECK-LABEL: test_mul_v16i8:
-; CHECK: Sz_mul_v16i8
+; CHECK: imul
+; CHECK: imul
+; CHECK: imul
+; CHECK: imul
+; CHECK: imul
+; CHECK: imul
+; CHECK: imul
+; CHECK: imul
+; CHECK: imul
+; CHECK: imul
+; CHECK: imul
+; CHECK: imul
+; CHECK: imul
+; CHECK: imul
+; CHECK: imul
+; CHECK: imul
 }

 define <16 x i8> @test_shl_v16i8(<16 x i8> %arg0, <16 x i8> %arg1) {
@@ -112,7 +130,22 @@ entry:
  %res = shl <16 x i8> %arg0, %arg1
  ret <16 x i8> %res
 ; CHECK-LABEL: test_shl_v16i8:
-; CHECK: Sz_shl_v16i8
+; CHECK: shl
+; CHECK: shl
+; CHECK: shl
+; CHECK: shl
+; CHECK: shl
+; CHECK: shl
+; CHECK: shl
+; CHECK: shl
+; CHECK: shl
+; CHECK: shl
+; CHECK: shl
+; CHECK: shl
+; CHECK: shl
+; CHECK: shl
+; CHECK: shl
+; CHECK: shl
 }

 define <16 x i8> @test_lshr_v16i8(<16 x i8> %arg0, <16 x i8> %arg1) {
@@ -120,7 +153,22 @@ entry:
  %res = lshr <16 x i8> %arg0, %arg1
  ret <16 x i8> %res
 ; CHECK-LABEL: test_lshr_v16i8:
-; CHECK: Sz_lshr_v16i8
+; CHECK: shr
+; CHECK: shr
+; CHECK: shr
+; CHECK: shr
+; CHECK: shr
+; CHECK: shr
+; CHECK: shr
+; CHECK: shr
+; CHECK: shr
+; CHECK: shr
+; CHECK: shr
+; CHECK: shr
+; CHECK: shr
+; CHECK: shr
+; CHECK: shr
+; CHECK: shr
 }

 define <16 x i8> @test_ashr_v16i8(<16 x i8> %arg0, <16 x i8> %arg1) {
@@ -128,7 +176,22 @@ entry:
  %res = ashr <16 x i8> %arg0, %arg1
  ret <16 x i8> %res
 ; CHECK-LABEL: test_ashr_v16i8:
-; CHECK: Sz_ashr_v16i8
+; CHECK: sar
+; CHECK: sar
+; CHECK: sar
+; CHECK: sar
+; CHECK: sar
+; CHECK: sar
+; CHECK: sar
+; CHECK: sar
+; CHECK: sar
+; CHECK: sar
+; CHECK: sar
+; CHECK: sar
+; CHECK: sar
+; CHECK: sar
+; CHECK: sar
+; CHECK: sar
 }

 define <16 x i8> @test_udiv_v16i8(<16 x i8> %arg0, <16 x i8> %arg1) {
@@ -136,7 +199,22 @@ entry:
  %res = udiv <16 x i8> %arg0, %arg1
  ret <16 x i8> %res
 ; CHECK-LABEL: test_udiv_v16i8:
-; CHECK: Sz_udiv_v16i8
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
 }

 define <16 x i8> @test_sdiv_v16i8(<16 x i8> %arg0, <16 x i8> %arg1) {
@@ -144,7 +222,22 @@ entry:
  %res = sdiv <16 x i8> %arg0, %arg1
  ret <16 x i8> %res
 ; CHECK-LABEL: test_sdiv_v16i8:
-; CHECK: Sz_sdiv_v16i8
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
 }

 define <16 x i8> @test_urem_v16i8(<16 x i8> %arg0, <16 x i8> %arg1) {
@@ -152,7 +245,22 @@ entry:
  %res = urem <16 x i8> %arg0, %arg1
  ret <16 x i8> %res
 ; CHECK-LABEL: test_urem_v16i8:
-; CHECK: Sz_urem_v16i8
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
 }

 define <16 x i8> @test_srem_v16i8(<16 x i8> %arg0, <16 x i8> %arg1) {
@@ -160,7 +268,22 @@ entry:
  %res = srem <16 x i8> %arg0, %arg1
  ret <16 x i8> %res
 ; CHECK-LABEL: test_srem_v16i8:
-; CHECK: Sz_srem_v16i8
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
 }

 define <8 x i16> @test_add_v8i16(<8 x i16> %arg0, <8 x i16> %arg1) {
@@ -216,7 +339,14 @@ entry:
  %res = shl <8 x i16> %arg0, %arg1
  ret <8 x i16> %res
 ; CHECK-LABEL: test_shl_v8i16:
-; CHECK: Sz_shl_v8i16
+; CHECK: shl
+; CHECK: shl
+; CHECK: shl
+; CHECK: shl
+; CHECK: shl
+; CHECK: shl
+; CHECK: shl
+; CHECK: shl
 }

 define <8 x i16> @test_lshr_v8i16(<8 x i16> %arg0, <8 x i16> %arg1) {
@@ -224,7 +354,14 @@ entry:
  %res = lshr <8 x i16> %arg0, %arg1
  ret <8 x i16> %res
 ; CHECK-LABEL: test_lshr_v8i16:
-; CHECK: Sz_lshr_v8i16
+; CHECK: shr
+; CHECK: shr
+; CHECK: shr
+; CHECK: shr
+; CHECK: shr
+; CHECK: shr
+; CHECK: shr
+; CHECK: shr
 }

 define <8 x i16> @test_ashr_v8i16(<8 x i16> %arg0, <8 x i16> %arg1) {
@@ -232,7 +369,14 @@ entry:
  %res = ashr <8 x i16> %arg0, %arg1
  ret <8 x i16> %res
 ; CHECK-LABEL: test_ashr_v8i16:
-; CHECK: Sz_ashr_v8i16
+; CHECK: sar
+; CHECK: sar
+; CHECK: sar
+; CHECK: sar
+; CHECK: sar
+; CHECK: sar
+; CHECK: sar
+; CHECK: sar
 }

 define <8 x i16> @test_udiv_v8i16(<8 x i16> %arg0, <8 x i16> %arg1) {
@@ -240,7 +384,14 @@ entry:
  %res = udiv <8 x i16> %arg0, %arg1
  ret <8 x i16> %res
 ; CHECK-LABEL: test_udiv_v8i16:
-; CHECK: Sz_udiv_v8i16
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
 }

 define <8 x i16> @test_sdiv_v8i16(<8 x i16> %arg0, <8 x i16> %arg1) {
@@ -248,7 +399,14 @@ entry:
  %res = sdiv <8 x i16> %arg0, %arg1
  ret <8 x i16> %res
 ; CHECK-LABEL: test_sdiv_v8i16:
-; CHECK: Sz_sdiv_v8i16
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
 }

 define <8 x i16> @test_urem_v8i16(<8 x i16> %arg0, <8 x i16> %arg1) {
@@ -256,7 +414,14 @@ entry:
  %res = urem <8 x i16> %arg0, %arg1
  ret <8 x i16> %res
 ; CHECK-LABEL: test_urem_v8i16:
-; CHECK: Sz_urem_v8i16
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
 }

 define <8 x i16> @test_srem_v8i16(<8 x i16> %arg0, <8 x i16> %arg1) {
@@ -264,7 +429,14 @@ entry:
  %res = srem <8 x i16> %arg0, %arg1
  ret <8 x i16> %res
 ; CHECK-LABEL: test_srem_v8i16:
-; CHECK: Sz_srem_v8i16
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
 }

 define <4 x i32> @test_add_v4i32(<4 x i32> %arg0, <4 x i32> %arg1) {
@@ -324,7 +496,10 @@ entry:
  %res = shl <4 x i32> %arg0, %arg1
  ret <4 x i32> %res
 ; CHECK-LABEL: test_shl_v4i32:
-; CHECK: Sz_shl_v4i32
+; CHECK: shl
+; CHECK: shl
+; CHECK: shl
+; CHECK: shl

 ; This line is to ensure that pmulld is generated in test_mul_v4i32 above.
 ; SSE41-LABEL: test_shl_v4i32:
@@ -335,7 +510,10 @@ entry:
  %res = lshr <4 x i32> %arg0, %arg1
  ret <4 x i32> %res
 ; CHECK-LABEL: test_lshr_v4i32:
-; CHECK: Sz_lshr_v4i32
+; CHECK: shr
+; CHECK: shr
+; CHECK: shr
+; CHECK: shr
 }

 define <4 x i32> @test_ashr_v4i32(<4 x i32> %arg0, <4 x i32> %arg1) {
@@ -343,7 +521,10 @@ entry:
  %res = ashr <4 x i32> %arg0, %arg1
  ret <4 x i32> %res
 ; CHECK-LABEL: test_ashr_v4i32:
-; CHECK: Sz_ashr_v4i32
+; CHECK: sar
+; CHECK: sar
+; CHECK: sar
+; CHECK: sar
 }

 define <4 x i32> @test_udiv_v4i32(<4 x i32> %arg0, <4 x i32> %arg1) {
@@ -351,7 +532,10 @@ entry:
  %res = udiv <4 x i32> %arg0, %arg1
  ret <4 x i32> %res
 ; CHECK-LABEL: test_udiv_v4i32:
-; CHECK: Sz_udiv_v4i32
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
 }

 define <4 x i32> @test_sdiv_v4i32(<4 x i32> %arg0, <4 x i32> %arg1) {
@@ -359,7 +543,10 @@ entry:
  %res = sdiv <4 x i32> %arg0, %arg1
  ret <4 x i32> %res
 ; CHECK-LABEL: test_sdiv_v4i32:
-; CHECK: Sz_sdiv_v4i32
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
 }

 define <4 x i32> @test_urem_v4i32(<4 x i32> %arg0, <4 x i32> %arg1) {
@@ -367,7 +554,10 @@ entry:
  %res = urem <4 x i32> %arg0, %arg1
  ret <4 x i32> %res
 ; CHECK-LABEL: test_urem_v4i32:
-; CHECK: Sz_urem_v4i32
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
 }

 define <4 x i32> @test_srem_v4i32(<4 x i32> %arg0, <4 x i32> %arg1) {
@@ -375,7 +565,10 @@ entry:
  %res = srem <4 x i32> %arg0, %arg1
  ret <4 x i32> %res
 ; CHECK-LABEL: test_srem_v4i32:
-; CHECK: Sz_srem_v4i32
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
 }

 ; ERRORS-NOT: ICE translation error