Use three-address form of imul

Previously we did not take advantage of the three-address versions of the imul instruction. With this change we are able to avoid some copies before imuls.

BUG=
R=stichnot@chromium.org

Review URL: https://codereview.chromium.org/1365433004 .

Use three-address form of imul
e11f878a · David Sehr · 578f1161 · e11f878a · e11f878a · e11f878a
Commit e11f878a authored Oct 06, 2015 by David Sehr
7 changed files
--- a/src/IceAssemblerX86Base.h
+++ b/src/IceAssemblerX86Base.h
@@ -751,6 +751,11 @@ public:
  void imul(Type Ty, typename Traits::GPRRegister reg);
  void imul(Type Ty, const typename Traits::Address &address);

+  void imul(Type Ty, typename Traits::GPRRegister dst,
+            typename Traits::GPRRegister src, const Immediate &imm); // three-operand imul: register source, immediate multiplier
+  void imul(Type Ty, typename Traits::GPRRegister dst,
+            const typename Traits::Address &address, const Immediate &imm); // three-operand imul: memory source, immediate multiplier
+
  void mul(Type Ty, typename Traits::GPRRegister reg);
  void mul(Type Ty, const typename Traits::Address &address);


--- a/src/IceAssemblerX86BaseImpl.h
+++ b/src/IceAssemblerX86BaseImpl.h
@@ -2581,6 +2581,46 @@ void AssemblerX86Base<Machine>::imul(Type Ty,
 }

 template <class Machine>
+void AssemblerX86Base<Machine>::imul(Type Ty, typename Traits::GPRRegister dst,
+                                     typename Traits::GPRRegister src,
+                                     const Immediate &imm) { // Encodes the three-operand imul with a register source and an immediate.
+  AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  assert(Ty == IceType_i16 || Ty == IceType_i32); // only 16- and 32-bit operand types are accepted here
+  if (Ty == IceType_i16)
+    emitOperandSizeOverride(); // emitted ahead of the opcode for 16-bit operands only
+  emitRexRB(Ty, dst, src);
+  if (imm.is_int8()) { // short form: opcode 0x6B followed by a one-byte immediate
+    emitUint8(0x6B);
+    emitRegisterOperand(gprEncoding(dst), gprEncoding(src));
+    emitUint8(imm.value() & 0xFF); // low byte of the immediate value
+  } else { // long form: opcode 0x69 followed by an immediate emitted according to Ty
+    emitUint8(0x69);
+    emitRegisterOperand(gprEncoding(dst), gprEncoding(src));
+    emitImmediate(Ty, imm);
+  }
+}
+
+template <class Machine>
+void AssemblerX86Base<Machine>::imul(Type Ty, typename Traits::GPRRegister dst,
+                                     const typename Traits::Address &address,
+                                     const Immediate &imm) { // Encodes the three-operand imul with a memory source and an immediate.
+  AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  assert(Ty == IceType_i16 || Ty == IceType_i32); // only 16- and 32-bit operand types are accepted here
+  if (Ty == IceType_i16)
+    emitOperandSizeOverride(); // emitted ahead of the opcode for 16-bit operands only
+  emitRex(Ty, address, dst);
+  if (imm.is_int8()) { // short form: opcode 0x6B followed by a one-byte immediate
+    emitUint8(0x6B);
+    emitOperand(gprEncoding(dst), address);
+    emitUint8(imm.value() & 0xFF); // low byte of the immediate value
+  } else { // long form: opcode 0x69 followed by an immediate emitted according to Ty
+    emitUint8(0x69);
+    emitOperand(gprEncoding(dst), address);
+    emitImmediate(Ty, imm);
+  }
+}
+
+template <class Machine>
 void AssemblerX86Base<Machine>::mul(Type Ty, typename Traits::GPRRegister reg) {
  AssemblerBuffer::EnsureCapacity ensured(&Buffer);
  if (Ty == IceType_i16)

--- a/src/IceInstX86Base.h
+++ b/src/IceInstX86Base.h
@@ -68,6 +68,7 @@ public:
    Icmp,
    Idiv,
    Imul,
+    ImulImm,
    Insertps,
    Jmp,
    Label,
@@ -1622,6 +1623,25 @@ private:
 };

 template <class Machine>
+class InstX86ImulImm // Instruction node for imul with a destination and two sources, tagged with the ImulImm kind.
+    : public InstX86BaseThreeAddressop<Machine, InstX86Base<Machine>::ImulImm> {
+public:
+  static InstX86ImulImm *create(Cfg *Func, Variable *Dest, Operand *Source0,
+                                Operand *Source1) { // factory: the only public way to build an instance
+    return new (Func->allocate<InstX86ImulImm>()) // placement-new into storage obtained from Func->allocate
+        InstX86ImulImm(Func, Dest, Source0, Source1);
+  }
+
+  void emit(const Cfg *Func) const override; // textual assembly output
+  void emitIAS(const Cfg *Func) const override; // output through the Assembler's imul overloads
+
+private:
+  InstX86ImulImm(Cfg *Func, Variable *Dest, Operand *Source0, Operand *Source1) // private: construction goes through create()
+      : InstX86BaseThreeAddressop<Machine, InstX86Base<Machine>::ImulImm>(
+            Func, Dest, Source0, Source1) {}
+};
+
+template <class Machine>
 class InstX86Mulps
    : public InstX86BaseBinopXmm<Machine, InstX86Base<Machine>::Mulps, true> {
 public:
@@ -2790,6 +2810,7 @@ template <class Machine> struct Insts {
  using XorRMW = InstX86XorRMW<Machine>;
  using Pxor = InstX86Pxor<Machine>;
  using Imul = InstX86Imul<Machine>;
+  using ImulImm = InstX86ImulImm<Machine>;
  using Mulps = InstX86Mulps<Machine>;
  using Mulss = InstX86Mulss<Machine>;
  using Pmull = InstX86Pmull<Machine>;
@@ -2897,6 +2918,7 @@ template <class Machine> struct Insts {
  template <> const char *InstX86XorRMW<Machine>::Base::Opcode = "xor";        \
  template <> const char *InstX86Pxor<Machine>::Base::Opcode = "pxor";         \
  template <> const char *InstX86Imul<Machine>::Base::Opcode = "imul";         \
+  template <> const char *InstX86ImulImm<Machine>::Base::Opcode = "imul";      \
  template <> const char *InstX86Mulps<Machine>::Base::Opcode = "mulps";       \
  template <> const char *InstX86Mulss<Machine>::Base::Opcode = "mulss";       \
  template <> const char *InstX86Pmull<Machine>::Base::Opcode = "pmull";       \

--- a/src/IceInstX86BaseImpl.h
+++ b/src/IceInstX86BaseImpl.h
@@ -1330,8 +1330,8 @@ void InstX86Imul<Machine>::emitIAS(const Cfg *Func) const {
        &InstX86Base<Machine>::Traits::Assembler::imul};
    emitIASOpTyGPR<Machine>(Func, Ty, this->getSrc(1), Emitter);
  } else {
-    // We only use imul as a two-address instruction even though there is a 3
-    // operand version when one of the operands is a constant.
+    // The two-address version is used when multiplying by a non-constant
+    // or doing an 8-bit multiply.
    assert(Var == this->getSrc(0));
    static const typename InstX86Base<
        Machine>::Traits::Assembler::GPREmitterRegOp Emitter = {
@@ -1343,6 +1343,43 @@ void InstX86Imul<Machine>::emitIAS(const Cfg *Func) const {
 }

 template <class Machine>
+void InstX86ImulImm<Machine>::emit(const Cfg *Func) const { // Writes the instruction as text to the context's emit stream.
+  if (!BuildDefs::dump()) // nothing is written when dump support is disabled
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(this->getSrcSize() == 2);
+  Variable *Dest = this->getDest();
+  assert(Dest->getType() == IceType_i16 || Dest->getType() == IceType_i32); // only 16- and 32-bit destinations are accepted
+  assert(llvm::isa<Constant>(this->getSrc(1))); // the second source must be a constant
+  Str << "\timul" << this->getWidthString(Dest->getType()) << "\t"; // mnemonic plus a width string derived from the destination type
+  this->getSrc(1)->emit(Func); // operands are printed constant first, ...
+  Str << ", ";
+  this->getSrc(0)->emit(Func); // ... then the other source, ...
+  Str << ", ";
+  Dest->emit(Func); // ... and the destination last
+}
+
+template <class Machine>
+void InstX86ImulImm<Machine>::emitIAS(const Cfg *Func) const { // Emits the instruction via the Assembler's three-operand imul overloads.
+  assert(this->getSrcSize() == 2);
+  const Variable *Dest = this->getDest();
+  Type Ty = Dest->getType(); // the operand type is taken from the destination
+  assert(llvm::isa<Constant>(this->getSrc(1))); // the second source must be a constant
+  static const typename InstX86Base<Machine>::Traits::Assembler::
+      template ThreeOpImmEmitter<
+          typename InstX86Base<Machine>::Traits::RegisterSet::GPRRegister,
+          typename InstX86Base<Machine>::Traits::RegisterSet::GPRRegister>
+          Emitter = {&InstX86Base<Machine>::Traits::Assembler::imul, // NOTE(review): presumably the register-source overload — confirm against ThreeOpImmEmitter
+                     &InstX86Base<Machine>::Traits::Assembler::imul}; // NOTE(review): presumably the memory-source overload — confirm
+  emitIASThreeOpImmOps<
+      Machine, typename InstX86Base<Machine>::Traits::RegisterSet::GPRRegister,
+      typename InstX86Base<Machine>::Traits::RegisterSet::GPRRegister,
+      InstX86Base<Machine>::Traits::RegisterSet::getEncodedGPR,
+      InstX86Base<Machine>::Traits::RegisterSet::getEncodedGPR>(
+      Func, Ty, Dest, this->getSrc(0), this->getSrc(1), Emitter); // Src0 is the non-constant source, Src1 the constant
+}
+
+template <class Machine>
 void InstX86Insertps<Machine>::emitIAS(const Cfg *Func) const {
  assert(this->getSrcSize() == 3);
  assert(static_cast<typename InstX86Base<Machine>::Traits::TargetLowering *>(

--- a/src/IceTargetLoweringX86Base.h
+++ b/src/IceTargetLoweringX86Base.h
@@ -428,6 +428,9 @@ protected:
  void _imul(Variable *Dest, Operand *Src0) {
    Context.insert(Traits::Insts::Imul::create(Func, Dest, Src0));
  }
+  void _imul_imm(Variable *Dest, Operand *Src0, Constant *Imm) { // Lowering helper: creates an ImulImm instruction and inserts it at the current context position.
+    Context.insert(Traits::Insts::ImulImm::create(Func, Dest, Src0, Imm));
+  }
  void _insertps(Variable *Dest, Operand *Src0, Operand *Src1) {
    Context.insert(Traits::Insts::Insertps::create(Func, Dest, Src0, Src1));
  }

--- a/src/IceTargetLoweringX86BaseImpl.h
+++ b/src/IceTargetLoweringX86BaseImpl.h
@@ -1618,11 +1618,17 @@ void TargetX86Base<Machine>::lowerArithmetic(const InstArithmetic *Inst) {
    if (isByteSizedArithType(Dest->getType())) {
      _mov(T, Src0, Traits::RegisterSet::Reg_eax);
      Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
+      _imul(T, Src0 == Src1 ? T : Src1);
+      _mov(Dest, T);
+    } else if (auto *ImmConst = llvm::dyn_cast<ConstantInteger32>(Src1)) {
+      T = makeReg(Dest->getType());
+      _imul_imm(T, Src0, ImmConst);
+      _mov(Dest, T);
    } else {
      _mov(T, Src0);
+      _imul(T, Src0 == Src1 ? T : Src1);
+      _mov(Dest, T);
    }
-    _imul(T, Src0 == Src1 ? T : Src1);
-    _mov(Dest, T);
    break;
  case InstArithmetic::Shl:
    _mov(T, Src0);

--- a/tests_lit/assembler/x86/immediate_encodings.ll
+++ b/tests_lit/assembler/x86/immediate_encodings.ll
@@ -250,6 +250,88 @@ entry:
 ; CHECK-LABEL: testMul32Imm16Neg
 ; CHECK: 69 c0 01 ff ff ff  imul eax,eax,0xffffff01

+define i32 @testMul32Imm32ThreeAddress(i32 %a) {
+entry:
+  %mul = mul i32 232, %a ; 232 does not fit in a signed byte, so the long (opcode 69) immediate form is expected
+  %add = add i32 %mul, %a ; %a is used again after the multiply; presumably this is what forces a destination register distinct from the source
+  ret i32 %add
+}
+; CHECK-LABEL: testMul32Imm32ThreeAddress
+; CHECK: 69 c8 e8 00 00 00  imul ecx,eax,0xe8
+
+define i32 @testMul32Mem32Imm32ThreeAddress(i32 %addr_arg) {
+entry:
+  %__1 = inttoptr i32 %addr_arg to i32*
+  %a = load i32, i32* %__1, align 1 ; the loaded value feeds only the multiply; the expected encoding uses a memory operand for it
+  %mul = mul i32 232, %a ; 232 does not fit in a signed byte, so the long (opcode 69) immediate form is expected
+  ret i32 %mul
+}
+}
+; CHECK-LABEL: testMul32Mem32Imm32ThreeAddress
+; CHECK: 69 00 e8 00 00 00  imul eax,DWORD PTR [eax],0xe8
+
+define i32 @testMul32Imm8ThreeAddress(i32 %a) {
+entry:
+  %mul = mul i32 127, %a ; 127 fits in a signed byte, so the short (opcode 6b) immediate form is expected
+  %add = add i32 %mul, %a ; %a is used again after the multiply; presumably this is what forces a destination register distinct from the source
+  ret i32 %add
+}
+; CHECK-LABEL: testMul32Imm8ThreeAddress
+; CHECK: 6b c8 7f imul ecx,eax,0x7f
+
+define i32 @testMul32Mem32Imm8ThreeAddress(i32 %addr_arg) {
+entry:
+  %__1 = inttoptr i32 %addr_arg to i32*
+  %a = load i32, i32* %__1, align 1 ; the loaded value feeds only the multiply; the expected encoding uses a memory operand for it
+  %mul = mul i32 127, %a ; 127 fits in a signed byte, so the short (opcode 6b) immediate form is expected
+  ret i32 %mul
+}
+}
+; CHECK-LABEL: testMul32Mem32Imm8ThreeAddress
+; CHECK: 6b 00 7f imul eax,DWORD PTR [eax],0x7f
+
+define i32 @testMul16Imm16ThreeAddress(i32 %a) {
+entry:
+  %arg_i16 = trunc i32 %a to i16 ; narrow the argument so the multiply is a 16-bit operation
+  %mul = mul i16 232, %arg_i16 ; 16-bit multiply by 232: expected as the 66-prefixed opcode 69 form with a two-byte immediate
+  %add = add i16 %mul, %arg_i16 ; the source is used again after the multiply; presumably this forces a distinct destination register
+  %result = zext i16 %add to i32
+  ret i32 %result
+}
+; CHECK-LABEL: testMul16Imm16ThreeAddress
+; CHECK: 66 69 c8 e8 00 imul cx,ax,0xe8
+
+define i32 @testMul16Mem16Imm16ThreeAddress(i32 %addr_arg) {
+entry:
+  %__1 = inttoptr i32 %addr_arg to i16*
+  %a = load i16, i16* %__1, align 1 ; the loaded value feeds only the multiply; the expected encoding uses a memory operand for it
+  %mul = mul i16 232, %a ; 16-bit multiply by 232: expected as the 66-prefixed opcode 69 form with a two-byte immediate
+  %result = zext i16 %mul to i32
+  ret i32 %result
+}
+; CHECK-LABEL: testMul16Mem16Imm16ThreeAddress
+; CHECK: 66 69 00 e8 00 imul ax,WORD PTR [eax],0xe8
+
+define i32 @testMul16Imm8ThreeAddress(i32 %a) {
+entry:
+  %arg_i16 = trunc i32 %a to i16 ; narrow the argument so the multiply is a 16-bit operation
+  %mul = mul i16 127, %arg_i16 ; 16-bit multiply by 127: expected as the 66-prefixed opcode 6b form with a one-byte immediate
+  %add = add i16 %mul, %arg_i16 ; the source is used again after the multiply; presumably this forces a distinct destination register
+  %result = zext i16 %add to i32
+  ret i32 %result
+}
+; CHECK-LABEL: testMul16Imm8ThreeAddress
+; CHECK: 66 6b c8 7f imul cx,ax,0x7f
+
+define i32 @testMul16Mem16Imm8ThreeAddress(i32 %addr_arg) {
+entry:
+  %__1 = inttoptr i32 %addr_arg to i16*
+  %a = load i16, i16* %__1, align 1 ; the loaded value feeds only the multiply; the expected encoding uses a memory operand for it
+  %mul = mul i16 127, %a ; 16-bit multiply by 127: expected as the 66-prefixed opcode 6b form with a one-byte immediate
+  %result = zext i16 %mul to i32
+  ret i32 %result
+}
+; CHECK-LABEL: testMul16Mem16Imm8ThreeAddress
+; CHECK: 66 6b 00 7f imul ax,WORD PTR [eax],0x7f
+
 ; The GPR shift instructions either allow an 8-bit immediate or
 ; have a special encoding for "1".
 define internal i32 @testShl16Imm8(i32 %arg) {