Commit 537b5ba0 by Jim Stichnoth

Subzero: Use cmov to improve lowering for the select instruction.

This is instead of explicit control flow which may interfere with branch prediction. However, explicit control flow is still needed for types other than i16 and i32, due to cmov limitations. The assembler for cmov is extended to allow the non-dest operand to be a memory operand. The select lowering is getting large enough that it was in our best interest to combine the default lowering with the bool-folding optimization.

BUG= https://code.google.com/p/nativeclient/issues/detail?id=4095
R=jvoung@chromium.org
Review URL: https://codereview.chromium.org/1125323004
parent b3401d27
......@@ -84,6 +84,10 @@ const char *InstX8632::getFldString(Type Ty) {
return TypeX8632Attributes[Ty].FldString;
}
// Returns the logical inverse of the given branch condition (e.g.
// equal <-> not-equal), looked up from the per-condition attribute table.
CondX86::BrCond InstX8632::getOppositeCondition(CondX86::BrCond Cond) {
return InstX8632BrAttributes[Cond].Opposite;
}
OperandX8632Mem::OperandX8632Mem(Cfg *Func, Type Ty, Variable *Base,
Constant *Offset, Variable *Index,
uint16_t Shift, SegmentRegisters SegmentReg)
......@@ -181,7 +185,7 @@ bool InstX8632Br::optimizeBranch(const CfgNode *NextNode) {
// condition, swap the targets, and set new fallthrough to nullptr.
if (getTargetTrue() == NextNode) {
assert(Condition != CondX86::Br_None);
Condition = InstX8632BrAttributes[Condition].Opposite;
Condition = getOppositeCondition(Condition);
TargetTrue = getTargetFalse();
TargetFalse = nullptr;
return true;
......@@ -1558,13 +1562,28 @@ void InstX8632Cmov::emitIAS(const Cfg *Func) const {
assert(Condition != CondX86::Br_None);
assert(getDest()->hasReg());
assert(getSrcSize() == 2);
// Only need the reg/reg form now.
const auto SrcVar = llvm::cast<Variable>(getSrc(1));
assert(SrcVar->hasReg());
assert(SrcVar->getType() == IceType_i32);
Operand *Src = getSrc(1);
Type SrcTy = Src->getType();
assert(SrcTy == IceType_i16 || SrcTy == IceType_i32);
X8632::AssemblerX8632 *Asm = Func->getAssembler<X8632::AssemblerX8632>();
Asm->cmov(Condition, RegX8632::getEncodedGPR(getDest()->getRegNum()),
RegX8632::getEncodedGPR(SrcVar->getRegNum()));
if (const auto *SrcVar = llvm::dyn_cast<Variable>(Src)) {
if (SrcVar->hasReg()) {
Asm->cmov(SrcTy, Condition,
RegX8632::getEncodedGPR(getDest()->getRegNum()),
RegX8632::getEncodedGPR(SrcVar->getRegNum()));
} else {
Asm->cmov(SrcTy, Condition,
RegX8632::getEncodedGPR(getDest()->getRegNum()),
static_cast<TargetX8632 *>(Func->getTarget())
->stackVarToAsmOperand(SrcVar));
}
} else if (const auto Mem = llvm::dyn_cast<OperandX8632Mem>(Src)) {
assert(Mem->getSegmentRegister() == OperandX8632Mem::DefaultSegment);
Asm->cmov(SrcTy, Condition, RegX8632::getEncodedGPR(getDest()->getRegNum()),
Mem->toAsmAddress(Asm));
} else {
llvm_unreachable("Unexpected operand type");
}
}
void InstX8632Cmov::dump(const Cfg *Func) const {
......
......@@ -265,6 +265,7 @@ public:
static const char *getWidthString(Type Ty);
static const char *getFldString(Type Ty);
static CondX86::BrCond getOppositeCondition(CondX86::BrCond Cond);
void dump(const Cfg *Func) const override;
protected:
......
......@@ -4076,11 +4076,12 @@ void TargetX8632::lowerRet(const InstRet *Inst) {
void TargetX8632::lowerSelect(const InstSelect *Inst) {
Variable *Dest = Inst->getDest();
Type DestTy = Dest->getType();
Operand *SrcT = Inst->getTrueOperand();
Operand *SrcF = Inst->getFalseOperand();
Operand *Condition = Inst->getCondition();
if (isVectorType(Dest->getType())) {
if (isVectorType(DestTy)) {
Type SrcTy = SrcT->getType();
Variable *T = makeReg(SrcTy);
Operand *SrcTRM = legalize(SrcT, Legal_Reg | Legal_Mem);
......@@ -4138,6 +4139,9 @@ void TargetX8632::lowerSelect(const InstSelect *Inst) {
return;
}
CondX86::BrCond Cond = CondX86::Br_ne;
Operand *CmpOpnd0 = nullptr;
Operand *CmpOpnd1 = nullptr;
// Handle folding opportunities.
if (const class Inst *Producer = FoldingInfo.getProducerFor(Condition)) {
assert(Producer->isDeleted());
......@@ -4145,73 +4149,69 @@ void TargetX8632::lowerSelect(const InstSelect *Inst) {
default:
break;
case BoolFolding::PK_Icmp32: {
// d=cmp e,f; a=d?b:c ==> cmp e,f; a=b; jne L1; a=c; L1:
auto *Cmp = llvm::dyn_cast<InstIcmp>(Producer);
InstX8632Label *Label = InstX8632Label::create(Func, this);
Operand *Src0 = Producer->getSrc(0);
Operand *Src1 = legalize(Producer->getSrc(1));
Operand *Src0RM = legalizeSrc0ForCmp(Src0, Src1);
_cmp(Src0RM, Src1);
// This is the same code as below (for both i64 and non-i64),
// except without the _cmp instruction and with a different
// branch condition. TODO(stichnot): refactor.
if (Dest->getType() == IceType_i64) {
Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
Operand *SrcLoRI = legalize(loOperand(SrcT), Legal_Reg | Legal_Imm);
Operand *SrcHiRI = legalize(hiOperand(SrcT), Legal_Reg | Legal_Imm);
_mov(DestLo, SrcLoRI);
_mov(DestHi, SrcHiRI);
_br(getIcmp32Mapping(Cmp->getCondition()), Label);
Operand *SrcFLo = loOperand(SrcF);
Operand *SrcFHi = hiOperand(SrcF);
SrcLoRI = legalize(SrcFLo, Legal_Reg | Legal_Imm);
SrcHiRI = legalize(SrcFHi, Legal_Reg | Legal_Imm);
_mov_nonkillable(DestLo, SrcLoRI);
_mov_nonkillable(DestHi, SrcHiRI);
} else {
SrcT = legalize(SrcT, Legal_Reg | Legal_Imm);
_mov(Dest, SrcT);
_br(getIcmp32Mapping(Cmp->getCondition()), Label);
SrcF = legalize(SrcF, Legal_Reg | Legal_Imm);
_mov_nonkillable(Dest, SrcF);
}
Context.insert(Label);
return;
}
Cond = getIcmp32Mapping(Cmp->getCondition());
CmpOpnd1 = legalize(Producer->getSrc(1));
CmpOpnd0 = legalizeSrc0ForCmp(Producer->getSrc(0), CmpOpnd1);
} break;
}
}
if (CmpOpnd0 == nullptr) {
CmpOpnd0 = legalize(Condition, Legal_Reg | Legal_Mem);
CmpOpnd1 = Ctx->getConstantZero(IceType_i32);
}
assert(CmpOpnd0);
assert(CmpOpnd1);
// a=d?b:c ==> cmp d,0; a=b; jne L1; a=c; L1:
Operand *ConditionRM = legalize(Condition, Legal_Reg | Legal_Mem);
Constant *Zero = Ctx->getConstantZero(IceType_i32);
InstX8632Label *Label = InstX8632Label::create(Func, this);
if (Dest->getType() == IceType_i64) {
Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
Operand *SrcLoRI = legalize(loOperand(SrcT), Legal_Reg | Legal_Imm);
Operand *SrcHiRI = legalize(hiOperand(SrcT), Legal_Reg | Legal_Imm);
_cmp(ConditionRM, Zero);
_mov(DestLo, SrcLoRI);
_mov(DestHi, SrcHiRI);
_br(CondX86::Br_ne, Label);
Operand *SrcFLo = loOperand(SrcF);
Operand *SrcFHi = hiOperand(SrcF);
SrcLoRI = legalize(SrcFLo, Legal_Reg | Legal_Imm);
SrcHiRI = legalize(SrcFHi, Legal_Reg | Legal_Imm);
_mov_nonkillable(DestLo, SrcLoRI);
_mov_nonkillable(DestHi, SrcHiRI);
} else {
_cmp(ConditionRM, Zero);
_cmp(CmpOpnd0, CmpOpnd1);
if (typeWidthInBytes(DestTy) == 1 || isFloatingType(DestTy)) {
// The cmov instruction doesn't allow 8-bit or FP operands, so
// we need explicit control flow.
// d=cmp e,f; a=d?b:c ==> cmp e,f; a=b; jne L1; a=c; L1:
InstX8632Label *Label = InstX8632Label::create(Func, this);
SrcT = legalize(SrcT, Legal_Reg | Legal_Imm);
_mov(Dest, SrcT);
_br(CondX86::Br_ne, Label);
_br(Cond, Label);
SrcF = legalize(SrcF, Legal_Reg | Legal_Imm);
_mov_nonkillable(Dest, SrcF);
Context.insert(Label);
return;
}
// mov t, SrcF; cmov_cond t, SrcT; mov dest, t
// But if SrcT is immediate, we might be able to do better, as
// the cmov instruction doesn't allow an immediate operand:
// mov t, SrcT; cmov_!cond t, SrcF; mov dest, t
if (llvm::isa<Constant>(SrcT) && !llvm::isa<Constant>(SrcF)) {
std::swap(SrcT, SrcF);
Cond = InstX8632::getOppositeCondition(Cond);
}
if (DestTy == IceType_i64) {
// Set the low portion.
Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
Variable *TLo = nullptr;
Operand *SrcFLo = legalize(loOperand(SrcF));
_mov(TLo, SrcFLo);
Operand *SrcTLo = legalize(loOperand(SrcT), Legal_Reg | Legal_Mem);
_cmov(TLo, SrcTLo, Cond);
_mov(DestLo, TLo);
// Set the high portion.
Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
Variable *THi = nullptr;
Operand *SrcFHi = legalize(hiOperand(SrcF));
_mov(THi, SrcFHi);
Operand *SrcTHi = legalize(hiOperand(SrcT), Legal_Reg | Legal_Mem);
_cmov(THi, SrcTHi, Cond);
_mov(DestHi, THi);
return;
}
Context.insert(Label);
assert(DestTy == IceType_i16 || DestTy == IceType_i32);
Variable *T = nullptr;
SrcF = legalize(SrcF);
_mov(T, SrcF);
SrcT = legalize(SrcT, Legal_Reg | Legal_Mem);
_cmov(T, SrcT, Cond);
_mov(Dest, T);
}
void TargetX8632::lowerStore(const InstStore *Inst) {
......
......@@ -272,14 +272,30 @@ void AssemblerX8632::lea(Type Ty, GPRRegister dst, const Address &src) {
EmitOperand(dst, src);
}
void AssemblerX8632::cmov(CondX86::BrCond cond, GPRRegister dst,
void AssemblerX8632::cmov(Type Ty, CondX86::BrCond cond, GPRRegister dst,
GPRRegister src) {
AssemblerBuffer::EnsureCapacity ensured(&buffer_);
if (Ty == IceType_i16)
EmitOperandSizeOverride();
else
assert(Ty == IceType_i32);
EmitUint8(0x0F);
EmitUint8(0x40 + cond);
EmitRegisterOperand(dst, src);
}
// Emits cmov<cc> dst, src (register/memory form): 0F 40+cc /r with a
// ModRM-encoded memory operand. Only i16 and i32 are encodable.
void AssemblerX8632::cmov(Type Ty, CondX86::BrCond cond, GPRRegister dst,
const Address &src) {
AssemblerBuffer::EnsureCapacity ensured(&buffer_);
// 16-bit cmov needs the 0x66 operand-size override prefix.
if (Ty == IceType_i16)
EmitOperandSizeOverride();
else
assert(Ty == IceType_i32);
EmitUint8(0x0F);
// cmov opcode is 0F 40 plus the condition code.
EmitUint8(0x40 + cond);
EmitOperand(dst, src);
}
void AssemblerX8632::rep_movsb() {
AssemblerBuffer::EnsureCapacity ensured(&buffer_);
EmitUint8(0xF3);
......
......@@ -510,7 +510,8 @@ public:
void lea(Type Ty, GPRRegister dst, const Address &src);
void cmov(CondX86::BrCond cond, GPRRegister dst, GPRRegister src);
void cmov(Type Ty, CondX86::BrCond cond, GPRRegister dst, GPRRegister src);
void cmov(Type Ty, CondX86::BrCond cond, GPRRegister dst, const Address &src);
void rep_movsb();
......
......@@ -9,6 +9,7 @@
declare void @call_target()
@global_byte = internal global [1 x i8] zeroinitializer
@global_short = internal global [2 x i8] zeroinitializer
@global_int = internal global [4 x i8] zeroinitializer
; A direct call sequence uses the right mask and register-call sequence.
define void @test_direct_call() {
......@@ -94,26 +95,31 @@ entry:
; A zero-byte instruction (e.g. local label definition) at a bundle
; boundary should not trigger nop padding.
define void @label_at_boundary(i32 %arg) {
define void @label_at_boundary(i32 %arg, float %farg1, float %farg2) {
entry:
%cmp = icmp eq i32 %arg, 0
%argi8 = trunc i32 %arg to i8
call void @call_target()
; bundle boundary
%addr_short = bitcast [2 x i8]* @global_short to i16*
store i16 0, i16* %addr_short, align 1 ; 9-byte instruction
%blah = select i1 %cmp, i32 3, i32 5 ; 23-byte lowering sequence
%addr_int = bitcast [4 x i8]* @global_int to i32*
store i32 0, i32* %addr_int, align 1 ; 10-byte instruction
%blah = select i1 true, i8 %argi8, i8 %argi8 ; 22-byte lowering sequence
; label is here
store i16 0, i16* %addr_short, align 1 ; 9-byte instruction
store i16 0, i16* %addr_short, align 1 ; 9-byte instruction
ret void
}
; CHECK-LABEL: label_at_boundary
; CHECK: call
; We rely on the hideous 4-instruction 23-byte Om1 lowering sequence for select.
; CHECK-NEXT: 20: {{.*}} mov WORD PTR
; CHECK-NEXT: 29: {{.*}} cmp BYTE PTR
; CHECK-NEXT: 2e: {{.*}} mov DWORD PTR
; We rely on a particular 7-instruction 22-byte Om1 lowering sequence
; for select.
; CHECK-NEXT: 20: {{.*}} mov DWORD PTR
; CHECK-NEXT: 2a: {{.*}} mov {{.*}},0x1
; CHECK-NEXT: 2c: {{.*}} cmp {{.*}},0x0
; CHECK-NEXT: 2e: {{.*}} mov {{.*}},BYTE PTR
; CHECK-NEXT: 32: {{.*}} mov BYTE PTR
; CHECK-NEXT: 36: {{.*}} jne 40
; CHECK-NEXT: 38: {{.*}} mov DWORD PTR
; CHECK-NEXT: 38: {{.*}} mov {{.*}},BYTE PTR
; CHECK-NEXT: 3c: {{.*}} mov BYTE PTR
; CHECK-NEXT: 40: {{.*}} mov WORD PTR
; Bundle lock without padding.
......
......@@ -1190,7 +1190,7 @@ entry:
; CHECK: cmp
; CHECK: jb
; CHECK: cmp
; CHECK: jne
; CHECK: cmovne
;
; OPTM1-LABEL: select64VarVar
; OPTM1: cmp
......@@ -1199,7 +1199,7 @@ entry:
; OPTM1: cmp
; OPTM1: jb
; OPTM1: cmp
; OPTM1: jne
; OPTM1: cmovne
define internal i64 @select64VarConst(i64 %a, i64 %b) {
entry:
......@@ -1214,7 +1214,7 @@ entry:
; CHECK: cmp
; CHECK: jb
; CHECK: cmp
; CHECK: jne
; CHECK: cmovne
;
; OPTM1-LABEL: select64VarConst
; OPTM1: cmp
......@@ -1223,7 +1223,7 @@ entry:
; OPTM1: cmp
; OPTM1: jb
; OPTM1: cmp
; OPTM1: jne
; OPTM1: cmovne
define internal i64 @select64ConstVar(i64 %a, i64 %b) {
entry:
......@@ -1238,7 +1238,7 @@ entry:
; CHECK: cmp
; CHECK: jb
; CHECK: cmp
; CHECK: jne
; CHECK: cmove
;
; OPTM1-LABEL: select64ConstVar
; OPTM1: cmp
......@@ -1247,7 +1247,7 @@ entry:
; OPTM1: cmp
; OPTM1: jb
; OPTM1: cmp
; OPTM1: jne
; OPTM1: cmove
define internal void @icmpEq64Imm() {
entry:
......
......@@ -91,8 +91,7 @@ entry:
; CHECK-LABEL: fold_cmp_select
; CHECK: cmp
; CHECK: jl
; CHECK: mov
; CHECK: cmovl
; 64-bit cmp/select folding.
......@@ -107,9 +106,8 @@ entry:
; CHECK-LABEL: fold_cmp_select_64
; CHECK: cmp
; CHECK: jl
; CHECK: mov
; CHECK: mov
; CHECK: cmovl
; CHECK: cmovl
; Cmp/select folding with intervening instructions.
......@@ -125,8 +123,7 @@ entry:
; CHECK-NOT: cmp
; CHECK: call
; CHECK: cmp
; CHECK: jl
; CHECK: mov
; CHECK: cmovl
; Cmp/multi-select folding.
......@@ -143,11 +140,11 @@ entry:
; CHECK-LABEL: fold_cmp_select_multi
; CHECK: cmp
; CHECK: jl
; CHECK: cmovl
; CHECK: cmp
; CHECK: jl
; CHECK: cmovl
; CHECK: cmp
; CHECK: jl
; CHECK: cmovge
; CHECK: add
; CHECK: add
......@@ -169,11 +166,11 @@ next:
; CHECK-LABEL: no_fold_cmp_select_multi_liveout
; CHECK: set
; CHECK: cmp
; CHECK: jne
; CHECK: cmovne
; CHECK: cmp
; CHECK: jne
; CHECK: cmovne
; CHECK: cmp
; CHECK: jne
; CHECK: cmove
; CHECK: add
; CHECK: add
......@@ -195,11 +192,11 @@ entry:
; CHECK-LABEL: no_fold_cmp_select_multi_non_whitelist
; CHECK: set
; CHECK: cmp
; CHECK: jne
; CHECK: cmovne
; CHECK: cmp
; CHECK: jne
; CHECK: cmovne
; CHECK: cmp
; CHECK: jne
; CHECK: cmove
; CHECK: movzx
; CHECK: add
; CHECK: add
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment