Subzero: Fold the load instruction into the next cast instruction.

This is similar to the way a load instruction may be folded into the next arithmetic instruction. Usually the effect is to improve a sequence like: mov ax, WORD PTR [mem] movsx eax, ax into this: movsx eax, WORD PTR [mem] without actually improving register allocation, though other kinds of casts may have different improvements. Existing tests needed to be fixed when they "inadvertently" did a cast to i32 return type and triggered the optimization when it wasn't wanted. These were fixed by inserting a "dummy" instruction between the load and the cast. BUG= https://code.google.com/p/nativeclient/issues/detail?id=4095 R=jvoung@chromium.org Review URL: https://codereview.chromium.org/1152783006

Subzero: Fold the load instruction into the next cast instruction.
c77f817f · Jim Stichnoth · c207d51e · c77f817f · c77f817f · c77f817f
Commit c77f817f authored May 31, 2015 by Jim Stichnoth
4 changed files
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -883,7 +883,7 @@ void TargetX8632::addProlog(CfgNode *Node) {
    // that stack slot.
    if (SpillVariable *SpillVar = llvm::dyn_cast<SpillVariable>(Var)) {
      assert(Var->getWeight().isZero());
-      if (!SpillVar->getLinkedTo()->hasReg()) {
+      if (SpillVar->getLinkedTo() && !SpillVar->getLinkedTo()->hasReg()) {
        VariablesLinkedToSpillSlots.push_back(Var);
        continue;
      }
@@ -1160,8 +1160,9 @@ void TargetX8632::split64(Variable *Var) {
 }

 Operand *TargetX8632::loOperand(Operand *Operand) {
-  assert(Operand->getType() == IceType_i64);
-  if (Operand->getType() != IceType_i64)
+  assert(Operand->getType() == IceType_i64 ||
+         Operand->getType() == IceType_f64);
+  if (Operand->getType() != IceType_i64 && Operand->getType() != IceType_f64)
    return Operand;
  if (Variable *Var = llvm::dyn_cast<Variable>(Operand)) {
    split64(Var);
@@ -1180,8 +1181,9 @@ Operand *TargetX8632::loOperand(Operand *Operand) {
 }

 Operand *TargetX8632::hiOperand(Operand *Operand) {
-  assert(Operand->getType() == IceType_i64);
-  if (Operand->getType() != IceType_i64)
+  assert(Operand->getType() == IceType_i64 ||
+         Operand->getType() == IceType_f64);
+  if (Operand->getType() != IceType_i64 && Operand->getType() != IceType_f64)
    return Operand;
  if (Variable *Var = llvm::dyn_cast<Variable>(Operand)) {
    split64(Var);
@@ -2463,20 +2465,25 @@ void TargetX8632::lowerCast(const InstCast *Inst) {
      //   a_lo.i32 = t_lo.i32
      //   t_hi.i32 = hi(s.f64)
      //   a_hi.i32 = t_hi.i32
-      SpillVariable *SpillVar = Func->makeVariable<SpillVariable>(IceType_f64);
-      SpillVar->setLinkedTo(llvm::dyn_cast<Variable>(Src0RM));
-      Variable *Spill = SpillVar;
-      Spill->setWeight(RegWeight::Zero);
-      _movq(Spill, Src0RM);
+      Operand *SpillLo, *SpillHi;
+      if (auto *Src0Var = llvm::dyn_cast<Variable>(Src0RM)) {
+        SpillVariable *SpillVar =
+            Func->makeVariable<SpillVariable>(IceType_f64);
+        SpillVar->setLinkedTo(Src0Var);
+        Variable *Spill = SpillVar;
+        Spill->setWeight(RegWeight::Zero);
+        _movq(Spill, Src0RM);
+        SpillLo = VariableSplit::create(Func, Spill, VariableSplit::Low);
+        SpillHi = VariableSplit::create(Func, Spill, VariableSplit::High);
+      } else {
+        SpillLo = loOperand(Src0RM);
+        SpillHi = hiOperand(Src0RM);
+      }

      Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
      Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
      Variable *T_Lo = makeReg(IceType_i32);
      Variable *T_Hi = makeReg(IceType_i32);
-      VariableSplit *SpillLo =
-          VariableSplit::create(Func, Spill, VariableSplit::Low);
-      VariableSplit *SpillHi =
-          VariableSplit::create(Func, Spill, VariableSplit::High);

      _mov(T_Lo, SpillLo);
      _mov(DestLo, T_Lo);
@@ -2486,6 +2493,12 @@ void TargetX8632::lowerCast(const InstCast *Inst) {
    case IceType_f64: {
      Src0 = legalize(Src0);
      assert(Src0->getType() == IceType_i64);
+      if (llvm::isa<OperandX8632Mem>(Src0)) {
+        Variable *T = Func->makeVariable(Dest->getType());
+        _movq(T, Src0);
+        _movq(Dest, T);
+        break;
+      }
      // a.f64 = bitcast b.i64 ==>
      //   t_lo.i32 = b_lo.i32
      //   FakeDef(s.f64)
@@ -3955,20 +3968,23 @@ void computeAddressOpt(Cfg *Func, const Inst *Instr, Variable *&Base,

 } // anonymous namespace

-void TargetX8632::lowerLoad(const InstLoad *Inst) {
+void TargetX8632::lowerLoad(const InstLoad *Load) {
  // A Load instruction can be treated the same as an Assign
  // instruction, after the source operand is transformed into an
  // OperandX8632Mem operand.  Note that the address mode
  // optimization already creates an OperandX8632Mem operand, so it
  // doesn't need another level of transformation.
-  Type Ty = Inst->getDest()->getType();
-  Operand *Src0 = FormMemoryOperand(Inst->getSourceAddress(), Ty);
+  Type Ty = Load->getDest()->getType();
+  Operand *Src0 = FormMemoryOperand(Load->getSourceAddress(), Ty);

  // Fuse this load with a subsequent Arithmetic instruction in the
  // following situations:
  //   a=[mem]; c=b+a ==> c=b+[mem] if last use of a and a not in b
  //   a=[mem]; c=a+b ==> c=b+[mem] if commutative and above is true
  //
+  // Fuse this load with a subsequent Cast instruction:
+  //   a=[mem]; b=cast(a) ==> b=cast([mem]) if last use of a
+  //
  // TODO: Clean up and test thoroughly.
  // (E.g., if there is an mfence-all make sure the load ends up on the
  // same side of the fence).
@@ -3979,30 +3995,46 @@ void TargetX8632::lowerLoad(const InstLoad *Inst) {
  // load instruction's dest variable, and that instruction ends that
  // variable's live range, then make the substitution.  Deal with
  // commutativity optimization in the arithmetic instruction lowering.
-  InstArithmetic *NewArith = nullptr;
-  if (InstArithmetic *Arith =
-          llvm::dyn_cast_or_null<InstArithmetic>(Context.getNextInst())) {
-    Variable *DestLoad = Inst->getDest();
-    Variable *Src0Arith = llvm::dyn_cast<Variable>(Arith->getSrc(0));
-    Variable *Src1Arith = llvm::dyn_cast<Variable>(Arith->getSrc(1));
-    if (Src1Arith == DestLoad && Arith->isLastUse(Src1Arith) &&
-        DestLoad != Src0Arith) {
-      NewArith = InstArithmetic::create(Func, Arith->getOp(), Arith->getDest(),
-                                        Arith->getSrc(0), Src0);
-    } else if (Src0Arith == DestLoad && Arith->isCommutative() &&
-               Arith->isLastUse(Src0Arith) && DestLoad != Src1Arith) {
-      NewArith = InstArithmetic::create(Func, Arith->getOp(), Arith->getDest(),
-                                        Arith->getSrc(1), Src0);
-    }
-    if (NewArith) {
-      Arith->setDeleted();
-      Context.advanceNext();
-      lowerArithmetic(NewArith);
-      return;
+  //
+  // TODO(stichnot): Do load fusing as a separate pass.  Run it before
+  // the bool folding pass.  Modify Ice::Inst to allow src operands to
+  // be replaced, including updating Inst::LiveRangesEnded, to avoid
+  // having to manually mostly clone each instruction type.
+  Inst *NextInst = Context.getNextInst();
+  Variable *DestLoad = Load->getDest();
+  if (NextInst && NextInst->isLastUse(DestLoad)) {
+    if (auto *Arith = llvm::dyn_cast<InstArithmetic>(NextInst)) {
+      InstArithmetic *NewArith = nullptr;
+      Variable *Src0Arith = llvm::dyn_cast<Variable>(Arith->getSrc(0));
+      Variable *Src1Arith = llvm::dyn_cast<Variable>(Arith->getSrc(1));
+      if (Src1Arith == DestLoad && DestLoad != Src0Arith) {
+        NewArith = InstArithmetic::create(
+            Func, Arith->getOp(), Arith->getDest(), Arith->getSrc(0), Src0);
+      } else if (Src0Arith == DestLoad && Arith->isCommutative() &&
+                 DestLoad != Src1Arith) {
+        NewArith = InstArithmetic::create(
+            Func, Arith->getOp(), Arith->getDest(), Arith->getSrc(1), Src0);
+      }
+      if (NewArith) {
+        Arith->setDeleted();
+        Context.advanceNext();
+        lowerArithmetic(NewArith);
+        return;
+      }
+    } else if (auto *Cast = llvm::dyn_cast<InstCast>(NextInst)) {
+      Variable *Src0Cast = llvm::dyn_cast<Variable>(Cast->getSrc(0));
+      if (Src0Cast == DestLoad) {
+        InstCast *NewCast =
+            InstCast::create(Func, Cast->getCastKind(), Cast->getDest(), Src0);
+        Cast->setDeleted();
+        Context.advanceNext();
+        lowerCast(NewCast);
+        return;
+      }
    }
  }

-  InstAssign *Assign = InstAssign::create(Func, Inst->getDest(), Src0);
+  InstAssign *Assign = InstAssign::create(Func, DestLoad, Src0);
  lowerAssign(Assign);
 }


--- a/tests_lit/llvm2ice_tests/8bit.pnacl.ll
+++ b/tests_lit/llvm2ice_tests/8bit.pnacl.ll
@@ -335,7 +335,8 @@ define i32 @load_i8(i32 %addr_arg) {
 entry:
  %addr = inttoptr i32 %addr_arg to i8*
  %ret = load i8* %addr, align 1
-  %ret_ext = zext i8 %ret to i32
+  %ret2 = sub i8 %ret, 0
+  %ret_ext = zext i8 %ret2 to i32
  ret i32 %ret_ext
 }
 ; CHECK-LABEL: load_i8
@@ -345,7 +346,8 @@ define i32 @load_i8_global(i32 %addr_arg) {
 entry:
  %addr = bitcast [1 x i8]* @global8 to i8*
  %ret = load i8* %addr, align 1
-  %ret_ext = zext i8 %ret to i32
+  %ret2 = sub i8 %ret, 0
+  %ret_ext = zext i8 %ret2 to i32
  ret i32 %ret_ext
 }
 ; CHECK-LABEL: load_i8_global

--- a/tests_lit/llvm2ice_tests/load_cast.ll
+++ b/tests_lit/llvm2ice_tests/load_cast.ll
+; Tests desired and undesired folding of load instructions into cast
+; instructions.  The folding is only done when liveness analysis is performed,
+; so only O2 is tested.
+
+; RUN: %p2i --filetype=obj --disassemble -i %s --args -O2 | FileCheck %s
+
+; Not testing trunc, or 32-bit bitcast, because the lowered code uses pretty
+; much the same mov instructions regardless of whether folding is done.
+
+define internal i32 @zext_fold(i32 %arg) {
+entry:
+  %ptr = add i32 %arg, 200
+  %addr = inttoptr i32 %ptr to i8*
+  %load = load i8* %addr, align 1
+  %result = zext i8 %load to i32
+  ret i32 %result
+}
+; CHECK-LABEL: zext_fold
+; CHECK: movzx {{.*}},BYTE PTR [{{.*}}+0xc8]
+
+define internal i32 @zext_nofold(i32 %arg) {
+entry:
+  %ptr = add i32 %arg, 200
+  %addr = inttoptr i32 %ptr to i8*
+  %load = load i8* %addr, align 1
+  %tmp1 = zext i8 %load to i32
+  %tmp2 = zext i8 %load to i32
+  %result = add i32 %tmp1, %tmp2
+  ret i32 %result
+}
+; Test that load folding does not happen.
+; CHECK-LABEL: zext_nofold
+; CHECK-NOT: movzx {{.*}},BYTE PTR [{{.*}}+0xc8]
+
+define internal i32 @sext_fold(i32 %arg) {
+entry:
+  %ptr = add i32 %arg, 200
+  %addr = inttoptr i32 %ptr to i8*
+  %load = load i8* %addr, align 1
+  %result = sext i8 %load to i32
+  ret i32 %result
+}
+; CHECK-LABEL: sext_fold
+; CHECK: movsx {{.*}},BYTE PTR [{{.*}}+0xc8]
+
+define internal i32 @sext_nofold(i32 %arg) {
+entry:
+  %ptr = add i32 %arg, 200
+  %addr = inttoptr i32 %ptr to i8*
+  %load = load i8* %addr, align 1
+  %tmp1 = sext i8 %load to i32
+  %tmp2 = sext i8 %load to i32
+  %result = add i32 %tmp1, %tmp2
+  ret i32 %result
+}
+; Test that load folding does not happen.
+; CHECK-LABEL: sext_nofold
+; CHECK-NOT: movsx {{.*}},BYTE PTR [{{.*}}+0xc8]
+
+define internal float @fptrunc_fold(i32 %arg) {
+entry:
+  %ptr = add i32 %arg, 200
+  %addr = inttoptr i32 %ptr to double*
+  %load = load double* %addr, align 8
+  %result = fptrunc double %load to float
+  ret float %result
+}
+; CHECK-LABEL: fptrunc_fold
+; CHECK: cvtsd2ss {{.*}},QWORD PTR [{{.*}}+0xc8]
+
+define internal float @fptrunc_nofold(i32 %arg) {
+entry:
+  %ptr = add i32 %arg, 200
+  %addr = inttoptr i32 %ptr to double*
+  %load = load double* %addr, align 8
+  %tmp1 = fptrunc double %load to float
+  %tmp2 = fptrunc double %load to float
+  %result = fadd float %tmp1, %tmp2
+  ret float %result
+}
+; Test that load folding does not happen.
+; CHECK-LABEL: fptrunc_nofold
+; CHECK-NOT: cvtsd2ss {{.*}},QWORD PTR [{{.*}}+0xc8]
+
+define internal double @fpext_fold(i32 %arg) {
+entry:
+  %ptr = add i32 %arg, 200
+  %addr = inttoptr i32 %ptr to float*
+  %load = load float* %addr, align 4
+  %result = fpext float %load to double
+  ret double %result
+}
+; CHECK-LABEL: fpext_fold
+; CHECK: cvtss2sd {{.*}},DWORD PTR [{{.*}}+0xc8]
+
+define internal double @fpext_nofold(i32 %arg) {
+entry:
+  %ptr = add i32 %arg, 200
+  %addr = inttoptr i32 %ptr to float*
+  %load = load float* %addr, align 4
+  %tmp1 = fpext float %load to double
+  %tmp2 = fpext float %load to double
+  %result = fadd double %tmp1, %tmp2
+  ret double %result
+}
+; Test that load folding does not happen.
+; CHECK-LABEL: fpext_nofold
+; CHECK-NOT: cvtss2sd {{.*}},DWORD PTR [{{.*}}+0xc8]
+
+define internal i32 @fptoui_fold(i32 %arg) {
+entry:
+  %ptr = add i32 %arg, 200
+  %addr = inttoptr i32 %ptr to double*
+  %load = load double* %addr, align 8
+  %result = fptoui double %load to i16
+  %result2 = zext i16 %result to i32
+  ret i32 %result2
+}
+; CHECK-LABEL: fptoui_fold
+; CHECK: cvttsd2si {{.*}},QWORD PTR [{{.*}}+0xc8]
+
+define internal i32 @fptoui_nofold(i32 %arg) {
+entry:
+  %ptr = add i32 %arg, 200
+  %addr = inttoptr i32 %ptr to double*
+  %load = load double* %addr, align 8
+  %tmp1 = fptoui double %load to i16
+  %tmp2 = fptoui double %load to i16
+  %result = add i16 %tmp1, %tmp2
+  %result2 = zext i16 %result to i32
+  ret i32 %result2
+}
+; Test that load folding does not happen.
+; CHECK-LABEL: fptoui_nofold
+; CHECK-NOT: cvttsd2si {{.*}},QWORD PTR [{{.*}}+0xc8]
+
+define internal i32 @fptosi_fold(i32 %arg) {
+entry:
+  %ptr = add i32 %arg, 200
+  %addr = inttoptr i32 %ptr to double*
+  %load = load double* %addr, align 8
+  %result = fptosi double %load to i16
+  %result2 = zext i16 %result to i32
+  ret i32 %result2
+}
+; CHECK-LABEL: fptosi_fold
+; CHECK: cvttsd2si {{.*}},QWORD PTR [{{.*}}+0xc8]
+
+define internal i32 @fptosi_nofold(i32 %arg) {
+entry:
+  %ptr = add i32 %arg, 200
+  %addr = inttoptr i32 %ptr to double*
+  %load = load double* %addr, align 8
+  %tmp1 = fptosi double %load to i16
+  %tmp2 = fptosi double %load to i16
+  %result = add i16 %tmp1, %tmp2
+  %result2 = zext i16 %result to i32
+  ret i32 %result2
+}
+; Test that load folding does not happen.
+; CHECK-LABEL: fptosi_nofold
+; CHECK-NOT: cvttsd2si {{.*}},QWORD PTR [{{.*}}+0xc8]
+
+define internal double @uitofp_fold(i32 %arg) {
+entry:
+  %ptr = add i32 %arg, 200
+  %addr = inttoptr i32 %ptr to i16*
+  %load = load i16* %addr, align 1
+  %result = uitofp i16 %load to double
+  ret double %result
+}
+; CHECK-LABEL: uitofp_fold
+; CHECK: movzx {{.*}},WORD PTR [{{.*}}+0xc8]
+
+define internal double @uitofp_nofold(i32 %arg) {
+entry:
+  %ptr = add i32 %arg, 200
+  %addr = inttoptr i32 %ptr to i16*
+  %load = load i16* %addr, align 1
+  %tmp1 = uitofp i16 %load to double
+  %tmp2 = uitofp i16 %load to double
+  %result = fadd double %tmp1, %tmp2
+  ret double %result
+}
+; Test that load folding does not happen.
+; CHECK-LABEL: uitofp_nofold
+; CHECK-NOT: movzx {{.*}},WORD PTR [{{.*}}+0xc8]
+
+define internal double @sitofp_fold(i32 %arg) {
+entry:
+  %ptr = add i32 %arg, 200
+  %addr = inttoptr i32 %ptr to i16*
+  %load = load i16* %addr, align 1
+  %result = sitofp i16 %load to double
+  ret double %result
+}
+; CHECK-LABEL: sitofp_fold
+; CHECK: movsx {{.*}},WORD PTR [{{.*}}+0xc8]
+
+define internal double @sitofp_nofold(i32 %arg) {
+entry:
+  %ptr = add i32 %arg, 200
+  %addr = inttoptr i32 %ptr to i16*
+  %load = load i16* %addr, align 1
+  %tmp1 = sitofp i16 %load to double
+  %tmp2 = sitofp i16 %load to double
+  %result = fadd double %tmp1, %tmp2
+  ret double %result
+}
+; Test that load folding does not happen.
+; CHECK-LABEL: sitofp_nofold
+; CHECK-NOT: movsx {{.*}},WORD PTR [{{.*}}+0xc8]
+
+define internal double @bitcast_i64_fold(i32 %arg) {
+entry:
+  %ptr = add i32 %arg, 200
+  %addr = inttoptr i32 %ptr to i64*
+  %load = load i64* %addr, align 1
+  %result = bitcast i64 %load to double
+  ret double %result
+}
+; CHECK-LABEL: bitcast_i64_fold
+; CHECK: movq {{.*}},QWORD PTR [{{.*}}+0xc8]
+
+define internal double @bitcast_i64_nofold(i32 %arg) {
+entry:
+  %ptr = add i32 %arg, 200
+  %addr = inttoptr i32 %ptr to i64*
+  %load = load i64* %addr, align 1
+  %tmp1 = bitcast i64 %load to double
+  %tmp2 = bitcast i64 %load to double
+  %result = fadd double %tmp1, %tmp2
+  ret double %result
+}
+; Test that load folding does not happen.
+; CHECK-LABEL: bitcast_i64_nofold
+; CHECK-NOT: movq {{.*}},QWORD PTR [{{.*}}+0xc8]
+
+define internal i64 @bitcast_double_fold(i32 %arg) {
+entry:
+  %ptr = add i32 %arg, 200
+  %addr = inttoptr i32 %ptr to double*
+  %load = load double* %addr, align 8
+  %result = bitcast double %load to i64
+  ret i64 %result
+}
+; CHECK-LABEL: bitcast_double_fold
+; CHECK-NOT: QWORD PTR
+; CHECK: mov {{.*}},DWORD PTR [{{.*}}+0xc8]
+; CHECK: mov {{.*}},DWORD PTR [{{.*}}+0xcc]
+; CHECK-NOT: QWORD PTR
+
+define internal i64 @bitcast_double_nofold(i32 %arg) {
+entry:
+  %ptr = add i32 %arg, 200
+  %addr = inttoptr i32 %ptr to double*
+  %load = load double* %addr, align 8
+  %tmp1 = bitcast double %load to i64
+  %tmp2 = bitcast double %load to i64
+  %result = add i64 %tmp1, %tmp2
+  ret i64 %result
+}
+; Test that load folding does not happen.
+; CHECK-LABEL: bitcast_double_nofold
+; CHECK: QWORD PTR
+; CHECK: QWORD PTR
--- a/tests_lit/llvm2ice_tests/nacl-atomic-intrinsics.ll
+++ b/tests_lit/llvm2ice_tests/nacl-atomic-intrinsics.ll
@@ -48,7 +48,8 @@ entry:
  %ptr = inttoptr i32 %iptr to i8*
  ; parameter value "6" is for the sequential consistency memory order.
  %i = call i8 @llvm.nacl.atomic.load.i8(i8* %ptr, i32 6)
-  %r = zext i8 %i to i32
+  %i2 = sub i8 %i, 0
+  %r = zext i8 %i2 to i32
  ret i32 %r
 }
 ; CHECK-LABEL: test_atomic_load_8
@@ -59,7 +60,8 @@ define i32 @test_atomic_load_16(i32 %iptr) {
 entry:
  %ptr = inttoptr i32 %iptr to i16*
  %i = call i16 @llvm.nacl.atomic.load.i16(i16* %ptr, i32 6)
-  %r = zext i16 %i to i32
+  %i2 = sub i16 %i, 0
+  %r = zext i16 %i2 to i32
  ret i32 %r
 }
 ; CHECK-LABEL: test_atomic_load_16