Inline memcpy for small constant sizes.

Combined with memset inlining, this has shown an improvement of over 11% on the eon benchmark. This the only C++ program in spec2k and it seems C++ programs have a significantly larger number of memset/memcpy calls. Other benchmarks also showed improvement of ~5% (perlbmk, parser) while most had a 1-2% improvement. This commit also includes a refactoring of lowerMemset which is much more readable and also removed the fake use of the destination pointer register. BUG= R=jvoung@chromium.org, stichnot@chromium.org Review URL: https://codereview.chromium.org/1279833005.

Inline memcpy for small constant sizes.
9df4a379 · Andrew Scull · f9df4523 · 9df4a379 · 9df4a379 · 9df4a379
Commit 9df4a379 authored Aug 07, 2015 by Andrew Scull
3 changed files
--- a/src/IceTargetLoweringX86Base.h
+++ b/src/IceTargetLoweringX86Base.h
@@ -153,7 +153,9 @@ protected:
                      Operand *Val);
  void lowerCountZeros(bool Cttz, Type Ty, Variable *Dest, Operand *FirstVal,
                       Operand *SecondVal);
-  /// Replace a function call with inline instructions.
+  /// Replace a call to memcpy with inline instructions.
+  void lowerMemcpy(Operand *Dest, Operand *Src, Operand *Count);
+  /// Replace a call to memset with inline instructions.
  void lowerMemset(Operand *Dest, Operand *Val, Operand *Count);
  /// Lower an indirect jump adding sandboxing when needed.

--- a/src/IceTargetLoweringX86BaseImpl.h
+++ b/src/IceTargetLoweringX86BaseImpl.h
@@ -3030,13 +3030,7 @@ void TargetX86Base<Machine>::lowerIntrinsicCall(
    return;
  }
  case Intrinsics::Memcpy: {
-    // In the future, we could potentially emit an inline memcpy/memset, etc.
+    lowerMemcpy(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2));
-    // for intrinsic calls w/ a known length.
-    InstCall *Call = makeHelperCall(H_call_memcpy, nullptr, 3);
-    Call->addArg(Instr->getArg(0));
-    Call->addArg(Instr->getArg(1));
-    Call->addArg(Instr->getArg(2));
-    lowerCall(Call);
    return;
  }
  case Intrinsics::Memmove: {
@@ -3487,9 +3481,123 @@ void TargetX86Base<Machine>::lowerCountZeros(bool Cttz, Type Ty, Variable *Dest,
 }
 template <class Machine>
+void TargetX86Base<Machine>::lowerMemcpy(Operand *Dest, Operand *Src,
+                                         Operand *Count) {
+  // There is a load and store for each chunk in the unroll
+  constexpr uint32_t UNROLL_LIMIT = 8;
+  constexpr uint32_t BytesPerStorep = 16;
+  constexpr uint32_t BytesPerStoreq = 8;
+  constexpr uint32_t BytesPerStorei32 = 4;
+  constexpr uint32_t BytesPerStorei16 = 2;
+  constexpr uint32_t BytesPerStorei8 = 1;
+  // Check if the operands are constants
+  const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count);
+  const bool IsCountConst = CountConst != nullptr;
+  const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0;
+  if (IsCountConst && CountValue <= BytesPerStorep * UNROLL_LIMIT) {
+    // Unlikely, but nothing to do if it does happen
+    if (CountValue == 0)
+      return;
+    Variable *SrcBase = legalizeToReg(Src);
+    Variable *DestBase = legalizeToReg(Dest);
+    auto lowerCopy = [this, DestBase, SrcBase](Type Ty, uint32_t OffsetAmt) {
+      Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr;
+      // TODO(ascull): this or add nullptr test to _movp, _movq
+      Variable *Data = makeReg(Ty);
+      // TODO(ascull): is 64-bit better with vector or scalar movq?
+      auto *SrcMem = Traits::X86OperandMem::create(Func, Ty, SrcBase, Offset);
+      if (isVectorType(Ty))
+        _movp(Data, SrcMem);
+      else if (Ty == IceType_f64)
+        _movq(Data, SrcMem);
+      else
+        _mov(Data, SrcMem);
+      auto *DestMem = Traits::X86OperandMem::create(Func, Ty, DestBase, Offset);
+      if (isVectorType(Ty))
+        _storep(Data, DestMem);
+      else if (Ty == IceType_f64)
+        _storeq(Data, DestMem);
+      else
+        _store(Data, DestMem);
+    };
+    // Lowers the assignment to the remaining bytes. Assumes the original size
+    // was large enough to allow for overlaps.
+    auto lowerLeftOvers = [this, lowerCopy, CountValue](uint32_t Size) {
+      if (Size > BytesPerStoreq) {
+        lowerCopy(IceType_v16i8, CountValue - BytesPerStorep);
+      } else if (Size > BytesPerStorei32) {
+        lowerCopy(IceType_f64, CountValue - BytesPerStoreq);
+      } else if (Size > BytesPerStorei16) {
+        lowerCopy(IceType_i32, CountValue - BytesPerStorei32);
+      } else if (Size > BytesPerStorei8) {
+        lowerCopy(IceType_i16, CountValue - BytesPerStorei16);
+      } else if (Size == BytesPerStorei8) {
+        lowerCopy(IceType_i8, CountValue - BytesPerStorei8);
+      }
+    };
+    if (CountValue >= BytesPerStorep) {
+      // Use large vector operations
+      for (uint32_t N = CountValue & 0xFFFFFFF0; N != 0;) {
+        N -= BytesPerStorep;
+        lowerCopy(IceType_v16i8, N);
+      }
+      lowerLeftOvers(CountValue & 0xF);
+      return;
+    }
+    // Too small to use large vector operations so use small ones instead
+    if (CountValue >= BytesPerStoreq) {
+      lowerCopy(IceType_f64, 0);
+      lowerLeftOvers(CountValue - BytesPerStoreq);
+      return;
+    }
+    // Too small for vector operations so use scalar ones
+    if (CountValue >= BytesPerStorei32) {
+      lowerCopy(IceType_i32, 0);
+      lowerLeftOvers(CountValue - BytesPerStorei32);
+      return;
+    }
+    // 3 is the awkward size as it is too small for the vector or 32-bit
+    // operations and will not work with lowerLeftOvers as there is no valid
+    // overlap.
+    if (CountValue == 3) {
+      lowerCopy(IceType_i16, 0);
+      lowerCopy(IceType_i8, 2);
+      return;
+    }
+    // 1 or 2 can be done in a single scalar copy
+    lowerLeftOvers(CountValue);
+    return;
+  }
+  // Fall back on a function call
+  InstCall *Call = makeHelperCall(H_call_memcpy, nullptr, 3);
+  Call->addArg(Dest);
+  Call->addArg(Src);
+  Call->addArg(Count);
+  lowerCall(Call);
+}
+template <class Machine>
 void TargetX86Base<Machine>::lowerMemset(Operand *Dest, Operand *Val,
                                         Operand *Count) {
  constexpr uint32_t UNROLL_LIMIT = 16;
+  constexpr uint32_t BytesPerStorep = 16;
+  constexpr uint32_t BytesPerStoreq = 8;
+  constexpr uint32_t BytesPerStorei32 = 4;
+  constexpr uint32_t BytesPerStorei16 = 2;
+  constexpr uint32_t BytesPerStorei8 = 1;
  assert(Val->getType() == IceType_i8);
  // Check if the operands are constants
@@ -3508,109 +3616,86 @@ void TargetX86Base<Machine>::lowerMemset(Operand *Dest, Operand *Val,
  // to inline by spreading the value across 4 bytes and accessing subregs e.g.
  // eax, ax and al.
  if (IsCountConst && IsValConst) {
-    Variable *Base = legalizeToReg(Dest);
+    Variable *Base = nullptr;
-    // Add a FakeUse in case Base is ultimately not used, e.g. it falls back to
+    const uint32_t SpreadValue =
-    // calling memset().  Otherwise Om1 register allocation fails because this
+        (ValValue << 24) | (ValValue << 16) | (ValValue << 8) | ValValue;
-    // infinite-weight variable has a definition but no uses.
+    Variable *VecReg = nullptr;
-    Context.insert(InstFakeUse::create(Func, Base));
-    // 3 is the awkward size as it is too small for the vector or 32-bit
-    // operations and will not work with lowerLeftOvers as there is no valid
-    // overlap.
-    if (CountValue == 3) {
-      Constant *Offset = nullptr;
-      auto *Mem =
-          Traits::X86OperandMem::create(Func, IceType_i16, Base, Offset);
-      _store(Ctx->getConstantInt16((ValValue << 8) | ValValue), Mem);
-      Offset = Ctx->getConstantInt8(2);
+    auto lowerSet = [this, &Base, SpreadValue, &VecReg](Type Ty,
-      Mem = Traits::X86OperandMem::create(Func, IceType_i8, Base, Offset);
+                                                       uint32_t OffsetAmt) {
-      _store(Ctx->getConstantInt8(ValValue), Mem);
+      assert(Base != nullptr);
-      return;
+      Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr;
-    }
-    // Lowers the assignment to the remaining bytes. Assumes the original size
+      // TODO(ascull): is 64-bit better with vector or scalar movq?
-    // was large enough to allow for overlaps.
-    auto lowerLeftOvers = [this, Base, CountValue](
-        uint32_t SpreadValue, uint32_t Size, Variable *VecReg) {
-      auto lowerStoreSpreadValue =
-          [this, Base, CountValue, SpreadValue](Type Ty) {
-            Constant *Offset =
-                Ctx->getConstantInt32(CountValue - typeWidthInBytes(Ty));
      auto *Mem = Traits::X86OperandMem::create(Func, Ty, Base, Offset);
-            _store(Ctx->getConstantInt(Ty, SpreadValue), Mem);
+      if (isVectorType(Ty)) {
-          };
-      if (Size > 8) {
        assert(VecReg != nullptr);
-        Constant *Offset = Ctx->getConstantInt32(CountValue - 16);
-        auto *Mem = Traits::X86OperandMem::create(Func, VecReg->getType(), Base,
-                                                  Offset);
        _storep(VecReg, Mem);
-      } else if (Size > 4) {
+      } else if (Ty == IceType_i64) {
        assert(VecReg != nullptr);
-        Constant *Offset = Ctx->getConstantInt32(CountValue - 8);
-        auto *Mem =
-            Traits::X86OperandMem::create(Func, IceType_i64, Base, Offset);
        _storeq(VecReg, Mem);
-      } else if (Size > 2) {
+      } else {
-        lowerStoreSpreadValue(IceType_i32);
+        _store(Ctx->getConstantInt(Ty, SpreadValue), Mem);
-      } else if (Size > 1) {
-        lowerStoreSpreadValue(IceType_i16);
-      } else if (Size == 1) {
-        lowerStoreSpreadValue(IceType_i8);
      }
    };
-    // When the value is zero it can be loaded into a register cheaply using
+    // Lowers the assignment to the remaining bytes. Assumes the original size
-    // the xor trick.
+    // was large enough to allow for overlaps.
-    constexpr uint32_t BytesPerStorep = 16;
+    auto lowerLeftOvers = [this, lowerSet, CountValue](uint32_t Size) {
-    if (ValValue == 0 && CountValue >= 8 &&
+      if (Size > BytesPerStoreq) {
+        lowerSet(IceType_v16i8, CountValue - BytesPerStorep);
+      } else if (Size > BytesPerStorei32) {
+        lowerSet(IceType_i64, CountValue - BytesPerStoreq);
+      } else if (Size > BytesPerStorei16) {
+        lowerSet(IceType_i32, CountValue - BytesPerStorei32);
+      } else if (Size > BytesPerStorei8) {
+        lowerSet(IceType_i16, CountValue - BytesPerStorei16);
+      } else if (Size == BytesPerStorei8) {
+        lowerSet(IceType_i8, CountValue - BytesPerStorei8);
+      }
+    };
+    // When the value is zero it can be loaded into a vector register cheaply
+    // using the xor trick.
+    if (ValValue == 0 && CountValue >= BytesPerStoreq &&
        CountValue <= BytesPerStorep * UNROLL_LIMIT) {
-      Variable *Zero = makeVectorOfZeros(IceType_v16i8);
+      Base = legalizeToReg(Dest);
+      VecReg = makeVectorOfZeros(IceType_v16i8);
      // Too small to use large vector operations so use small ones instead
-      if (CountValue < 16) {
+      if (CountValue < BytesPerStorep) {
-        Constant *Offset = nullptr;
+        lowerSet(IceType_i64, 0);
-        auto *Mem =
+        lowerLeftOvers(CountValue - BytesPerStoreq);
-            Traits::X86OperandMem::create(Func, IceType_i64, Base, Offset);
-        _storeq(Zero, Mem);
-        lowerLeftOvers(0, CountValue - 8, Zero);
        return;
      }
-      assert(CountValue >= 16);
      // Use large vector operations
      for (uint32_t N = CountValue & 0xFFFFFFF0; N != 0;) {
        N -= 16;
-        Constant *Offset = Ctx->getConstantInt32(N);
+        lowerSet(IceType_v16i8, N);
-        auto *Mem =
-            Traits::X86OperandMem::create(Func, Zero->getType(), Base, Offset);
-        _storep(Zero, Mem);
      }
-      uint32_t LeftOver = CountValue & 0xF;
+      lowerLeftOvers(CountValue & 0xF);
-      lowerLeftOvers(0, LeftOver, Zero);
      return;
    }
    // TODO(ascull): load val into reg and select subregs e.g. eax, ax, al?
-    constexpr uint32_t BytesPerStore = 4;
+    if (CountValue <= BytesPerStorei32 * UNROLL_LIMIT) {
-    if (CountValue <= BytesPerStore * UNROLL_LIMIT) {
+      Base = legalizeToReg(Dest);
+      // 3 is the awkward size as it is too small for the vector or 32-bit
+      // operations and will not work with lowerLeftOvers as there is no valid
+      // overlap.
+      if (CountValue == 3) {
+        lowerSet(IceType_i16, 0);
+        lowerSet(IceType_i8, 2);
+        return;
+      }
      // TODO(ascull); 64-bit can do better with 64-bit mov
-      uint32_t SpreadValue =
-          (ValValue << 24) | (ValValue << 16) | (ValValue << 8) | ValValue;
-      if (CountValue >= 4) {
-        Constant *ValueConst = Ctx->getConstantInt32(SpreadValue);
      for (uint32_t N = CountValue & 0xFFFFFFFC; N != 0;) {
        N -= 4;
-          Constant *Offset = Ctx->getConstantInt32(N);
+        lowerSet(IceType_i32, N);
-          auto *Mem =
-              Traits::X86OperandMem::create(Func, IceType_i32, Base, Offset);
-          _store(ValueConst, Mem);
-        }
      }
-      uint32_t LeftOver = CountValue & 0x3;
+      lowerLeftOvers(CountValue & 0x3);
-      lowerLeftOvers(SpreadValue, LeftOver, nullptr);
      return;
    }
  }

--- a/tests_lit/llvm2ice_tests/nacl-mem-intrinsics.ll
+++ b/tests_lit/llvm2ice_tests/nacl-mem-intrinsics.ll
@@ -30,21 +30,123 @@ entry:
 ; ARM32-LABEL: test_memcpy
 ; ARM32: bl {{.*}} memcpy
-; TODO(jvoung) -- if we want to be clever, we can do this and the memmove,
+define void @test_memcpy_long_const_len(i32 %iptr_dst, i32 %iptr_src) {
-; memset without a function call.
-define void @test_memcpy_const_len_align(i32 %iptr_dst, i32 %iptr_src) {
 entry:
  %dst = inttoptr i32 %iptr_dst to i8*
  %src = inttoptr i32 %iptr_src to i8*
  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %src,
-                                       i32 32, i32 1, i1 false)
+                                       i32 4876, i32 1, i1 false)
  ret void
 }
-; CHECK-LABEL: test_memcpy_const_len_align
+; CHECK-LABEL: test_memcpy_long_const_len
 ; CHECK: call {{.*}} R_{{.*}} memcpy
-; ARM32-LABEL: test_memcpy_const_len_align
+; ARM32-LABEL: test_memcpy_long_const_len
+; ARM32: bl {{.*}} memcpy
+define void @test_memcpy_very_small_const_len(i32 %iptr_dst, i32 %iptr_src) {
+entry:
+  %dst = inttoptr i32 %iptr_dst to i8*
+  %src = inttoptr i32 %iptr_src to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %src,
+                                       i32 2, i32 1, i1 false)
+  ret void
+}
+; CHECK-LABEL: test_memcpy_very_small_const_len
+; CHECK: mov [[REG:[^,]*]],WORD PTR [{{.*}}]
+; CHECK-NEXT: mov WORD PTR [{{.*}}],[[REG]]
+; CHECK-NOT: mov
+; ARM32-LABEL: test_memcpy_very_small_const_len
+; ARM32: bl {{.*}} memcpy
+define void @test_memcpy_const_len_3(i32 %iptr_dst, i32 %iptr_src) {
+entry:
+  %dst = inttoptr i32 %iptr_dst to i8*
+  %src = inttoptr i32 %iptr_src to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %src,
+                                       i32 3, i32 1, i1 false)
+  ret void
+}
+; CHECK-LABEL: test_memcpy_const_len_3
+; CHECK: mov [[REG:[^,]*]],WORD PTR [{{.*}}]
+; CHECK-NEXT: mov WORD PTR [{{.*}}],[[REG]]
+; CHECK-NEXT: mov [[REG:[^,]*]],BYTE PTR [{{.*}}+0x2]
+; CHECK-NEXT: mov BYTE PTR [{{.*}}+0x2],[[REG]]
+; CHECK-NOT: mov
+; ARM32-LABEL: test_memcpy_const_len_3
+; ARM32: bl {{.*}} memcpy
+define void @test_memcpy_mid_const_len(i32 %iptr_dst, i32 %iptr_src) {
+entry:
+  %dst = inttoptr i32 %iptr_dst to i8*
+  %src = inttoptr i32 %iptr_src to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %src,
+                                       i32 9, i32 1, i1 false)
+  ret void
+}
+; CHECK-LABEL: test_memcpy_mid_const_len
+; CHECK: movq [[REG:xmm[0-9]+]],QWORD PTR [{{.*}}]
+; CHECK-NEXT: movq QWORD PTR [{{.*}}],[[REG]]
+; CHECK-NEXT: mov [[REG:[^,]*]],BYTE PTR [{{.*}}+0x8]
+; CHECK-NEXT: mov BYTE PTR [{{.*}}+0x8],[[REG]]
+; CHECK-NOT: mov
+; ARM32-LABEL: test_memcpy_mid_const_len
+; ARM32: bl {{.*}} memcpy
+define void @test_memcpy_mid_const_len_overlap(i32 %iptr_dst, i32 %iptr_src) {
+entry:
+  %dst = inttoptr i32 %iptr_dst to i8*
+  %src = inttoptr i32 %iptr_src to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %src,
+                                       i32 15, i32 1, i1 false)
+  ret void
+}
+; CHECK-LABEL: test_memcpy_mid_const_len_overlap
+; CHECK: movq [[REG:xmm[0-9]+]],QWORD PTR [{{.*}}]
+; CHECK-NEXT: movq QWORD PTR [{{.*}}],[[REG]]
+; CHECK-NEXT: movq [[REG:xmm[0-9]+]],QWORD PTR [{{.*}}+0x7]
+; CHECK-NEXT: movq QWORD PTR [{{.*}}+0x7],[[REG]]
+; CHECK-NOT: mov
+; ARM32-LABEL: test_memcpy_mid_const_len_overlap
+; ARM32: bl {{.*}} memcpy
+define void @test_memcpy_large_const_len_overlap(i32 %iptr_dst, i32 %iptr_src) {
+entry:
+  %dst = inttoptr i32 %iptr_dst to i8*
+  %src = inttoptr i32 %iptr_src to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %src,
+                                       i32 30, i32 1, i1 false)
+  ret void
+}
+; CHECK-LABEL: test_memcpy_large_const_len_overlap
+; CHECK: movups [[REG:xmm[0-9]+]],XMMWORD PTR [{{.*}}]
+; CHECK-NEXT: movups XMMWORD PTR [{{.*}}],[[REG]]
+; CHECK-NEXT: movups [[REG:xmm[0-9]+]],XMMWORD PTR [{{.*}}+0xe]
+; CHECK-NEXT: movups XMMWORD PTR [{{.*}}+0xe],[[REG]]
+; CHECK-NOT: mov
+; ARM32-LABEL: test_memcpy_large_const_len_overlap
+; ARM32: bl {{.*}} memcpy
+define void @test_memcpy_large_const_len(i32 %iptr_dst, i32 %iptr_src) {
+entry:
+  %dst = inttoptr i32 %iptr_dst to i8*
+  %src = inttoptr i32 %iptr_src to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %src,
+                                       i32 33, i32 1, i1 false)
+  ret void
+}
+; CHECK-LABEL: test_memcpy_large_const_len
+; CHECK: movups [[REG:xmm[0-9]+]],XMMWORD PTR [{{.*}}+0x10]
+; CHECK-NEXT: movups XMMWORD PTR [{{.*}}+0x10],[[REG]]
+; CHECK-NEXT: movups [[REG:xmm[0-9]+]],XMMWORD PTR [{{.*}}]
+; CHECK-NEXT: movups XMMWORD PTR [{{.*}}],[[REG]]
+; CHECK-NEXT: mov [[REG:[^,]*]],BYTE PTR [{{.*}}+0x20]
+; CHECK-NEXT: mov BYTE PTR [{{.*}}+0x20],[[REG]]
+; CHECK-NOT: mov
+; ARM32-LABEL: test_memcpy_large_const_len
 ; ARM32: bl {{.*}} memcpy
+; TODO(jvoung) -- if we want to be clever, we can do memset without a function
+; call similar to memcpy.
 define void @test_memmove(i32 %iptr_dst, i32 %iptr_src, i32 %len) {
 entry:
  %dst = inttoptr i32 %iptr_dst to i8*