package: glibc: Add optimization memory copy with aligned version

Port the Linux kernel memcpy function to optimize the 128-byte aligned case; this improves the performance of large-block memcpy. This combines the glibc and Linux kernel memcpy implementations. Signed-off-by: Mason Huo <mason.huo@starfivetech.com>
2023-04-17 14:53:22 +08:00
parent 8d66d14452
commit 013b3a25e8
1 changed files with 299 additions and 172 deletions
@@ -1,27 +1,49 @@
-From 15850d406a9807d70e752aacfbb456946a01f6ac Mon Sep 17 00:00:00 2001
+From 666e593642136fad34be8eef1fcf1e872830013c Mon Sep 17 00:00:00 2001
 From: Mason Huo <mason.huo@starfivetech.com>
-Date: Tue, 21 Mar 2023 14:36:21 +0800
+Date: Mon, 17 Apr 2023 13:41:06 +0800
 Subject: [PATCH] Optimize memory operations for RISCV

-Port the memcpy & memset implementation from
-Linux kerenl to glibc to improve performance of
-memory operations in user space.
---
- sysdeps/riscv/memcpy.S | 120 +++++++++++++++++++++++++++++++++++++++++
- sysdeps/riscv/memset.S | 120 +++++++++++++++++++++++++++++++++++++++++
- 2 files changed, 240 insertions(+)
- create mode 100644 sysdeps/riscv/memcpy.S
- create mode 100644 sysdeps/riscv/memset.S
+Port the Linux kernel memcpy function to optimize
+the 128-byte aligned case; this improves the
+performance of large-block memcpy.

-diff --git a/sysdeps/riscv/memcpy.S b/sysdeps/riscv/memcpy.S
+Here we combine the memcpy of glibc and kernel.
+
+Signed-off-by: Mason Huo <mason.huo@starfivetech.com>
+---
+ sysdeps/riscv/Makefile         |   4 +
+ sysdeps/riscv/memcpy.c         | 265 +++++++++++++++++++++++++++++++++
+ sysdeps/riscv/memcpy_aligned.S |  82 ++++++++++
+ 3 files changed, 351 insertions(+)
+ create mode 100644 sysdeps/riscv/memcpy.c
+ create mode 100644 sysdeps/riscv/memcpy_aligned.S
+
+diff --git a/sysdeps/riscv/Makefile b/sysdeps/riscv/Makefile
+index 20a99681..5c3c3244 100644
+--- a/sysdeps/riscv/Makefile
+++ b/sysdeps/riscv/Makefile
+@@ -2,6 +2,10 @@ ifeq ($(subdir),misc)
+ sysdep_headers += sys/asm.h
+ endif
+ 
+ifeq ($(subdir),string)
+sysdep_routines += memcpy_aligned
+endif
+
+ # RISC-V's assembler also needs to know about PIC as it changes the definition
+ # of some assembler macros.
+ ASFLAGS-.os += $(pic-ccflag)
+diff --git a/sysdeps/riscv/memcpy.c b/sysdeps/riscv/memcpy.c
 new file mode 100644
-index 00000000..aa0eaee9
+index 00000000..1de6141e
 --- /dev/null
-+++ b/sysdeps/riscv/memcpy.S
-@@ -0,0 +1,120 @@
-+/* memcpy for RISC-V.
-+   Copyright (C) 1996-2020 Free Software Foundation, Inc.
+++ b/sysdeps/riscv/memcpy.c
+@@ -0,0 +1,265 @@
+/* Copy memory to memory until the specified number of bytes
+   has been copied.  Overlap is NOT handled correctly.
+   Copyright (C) 1991-2020 Free Software Foundation, Inc.
 +   This file is part of the GNU C Library.
+   Contributed by Torbjorn Granlund (tege@sics.se).
 +
 +   The GNU C Library is free software; you can redistribute it and/or
 +   modify it under the terms of the GNU Lesser General Public
@@ -34,40 +56,272 @@ index 00000000..aa0eaee9
 +   Lesser General Public License for more details.
 +
 +   You should have received a copy of the GNU Lesser General Public
-+   License along with the GNU C Library.  If not, see
+   License along with the GNU C Library; if not, see
 +   <https://www.gnu.org/licenses/>.  */
 +
+#include <stddef.h>
+#include <string.h>
+
+#define MERGE(w0, sh_1, w1, sh_2) (((w0) >> (sh_1)) | ((w1) << (sh_2)))
+#define OP_T_THRES      16
+#define op_t    unsigned long
+#define OPSIZ   (sizeof(op_t))
+#define OPSIZ_MASK   (sizeof(op_t) - 1)
+#define FAST_COPY_THRES  (128)
+#define byte    unsigned char
+
+static void _wordcopy_fwd_aligned(long dstp, long srcp, size_t len)	/* Copy LEN op_t words (not bytes) forward from SRCP to DSTP.  */
+{	/* Both addresses are dereferenced as op_t *; WORD_COPY_FWD only calls this when SRCP is a multiple of OPSIZ.  */
+	op_t a0, a1;	/* Two-slot pipeline: one word is loaded while the previously loaded one is stored.  */
+
+	switch (len % 8) {	/* Pick the entry point into the 8-step unrolled loop from the remainder.  */
+	case 2:
+		a0 = ((op_t *) srcp)[0];	/* Preload the first source word.  */
+		srcp -= 6 * OPSIZ;	/* Bias the pointers so the fixed indices at the entry label hit the first words.  */
+		dstp -= 7 * OPSIZ;
+		len += 6;	/* Round the word count up to a multiple of 8 for the loop test.  */
+		goto do1;
+	case 3:
+		a1 = ((op_t *) srcp)[0];
+		srcp -= 5 * OPSIZ;
+		dstp -= 6 * OPSIZ;
+		len += 5;
+		goto do2;
+	case 4:
+		a0 = ((op_t *) srcp)[0];
+		srcp -= 4 * OPSIZ;
+		dstp -= 5 * OPSIZ;
+		len += 4;
+		goto do3;
+	case 5:
+		a1 = ((op_t *) srcp)[0];
+		srcp -= 3 * OPSIZ;
+		dstp -= 4 * OPSIZ;
+		len += 3;
+		goto do4;
+	case 6:
+		a0 = ((op_t *) srcp)[0];
+		srcp -= 2 * OPSIZ;
+		dstp -= 3 * OPSIZ;
+		len += 2;
+		goto do5;
+	case 7:
+		a1 = ((op_t *) srcp)[0];
+		srcp -= 1 * OPSIZ;
+		dstp -= 2 * OPSIZ;
+		len += 1;
+		goto do6;
+
+	case 0:
+		if (OP_T_THRES <= 3 * OPSIZ && len == 0)
+			return;	/* Zero words: nothing to copy.  */
+		a0 = ((op_t *) srcp)[0];
+		srcp -= 0 * OPSIZ;
+		dstp -= 1 * OPSIZ;	/* The store index always trails the load index by one word.  */
+		goto do7;
+	case 1:
+		a1 = ((op_t *) srcp)[0];
+		srcp -= -1 * OPSIZ;	/* Subtracting a negative multiple moves SRCP forward by one word.  */
+		dstp -= 0 * OPSIZ;
+		len -= 1;
+		if (OP_T_THRES <= 3 * OPSIZ && len == 0)
+			goto do0;	/* Exactly one word: skip the loop and store it.  */
+		goto do8;                 /* No-op: do8 is the first label in the loop below.  */
+	}
+
+	do {
+do8:
+		a0 = ((op_t *) srcp)[0];	/* Each step loads the next word ...  */
+		((op_t *) dstp)[0] = a1;	/* ... and stores the word loaded by the previous step.  */
+do7:
+		a1 = ((op_t *) srcp)[1];
+		((op_t *) dstp)[1] = a0;
+do6:
+		a0 = ((op_t *) srcp)[2];
+		((op_t *) dstp)[2] = a1;
+do5:
+		a1 = ((op_t *) srcp)[3];
+		((op_t *) dstp)[3] = a0;
+do4:
+		a0 = ((op_t *) srcp)[4];
+		((op_t *) dstp)[4] = a1;
+do3:
+		a1 = ((op_t *) srcp)[5];
+		((op_t *) dstp)[5] = a0;
+do2:
+		a0 = ((op_t *) srcp)[6];
+		((op_t *) dstp)[6] = a1;
+do1:
+		a1 = ((op_t *) srcp)[7];
+		((op_t *) dstp)[7] = a0;
+
+		srcp += 8 * OPSIZ;
+		dstp += 8 * OPSIZ;
+		len -= 8;
+	} while (len != 0);	/* LEN was rounded to a multiple of 8 on entry, so it reaches exactly 0.  */
+
+	/* do0 sits after the loop on purpose: it stores the last word, which
+	 * is still pending in a1.  Do not move it into the loop.
+	 */
+do0:
+	((op_t *) dstp)[0] = a1;
+}
+
+static void _wordcopy_fwd_dest_aligned(long dstp, long srcp, size_t len)	/* Copy LEN op_t words forward to DSTP from a SRCP that is not a multiple of OPSIZ.  */
+{	/* DSTP is dereferenced as op_t *; SRCP is rounded down and the words read are shifted and merged.  */
+	op_t a0, a1, a2, a3;	/* Rotating window of aligned source words; each output merges two neighbours.  */
+	int sh_1, sh_2;	/* Bit shifts applied to the first and second word given to MERGE.  */
+
+	/* Compute the bit shifts that realign words read from the
+	 * rounded-down SRCP so they can be stored at DSTP.
+	 */
+
+	sh_1 = 8 * (srcp % OPSIZ);	/* Misalignment of SRCP, in bits.  */
+	sh_2 = 8 * OPSIZ - sh_1;	/* Rest of the word; would be the full word width (an undefined shift) if SRCP were aligned.  */
+
+	/* Align SRCP by rounding it down to the start of the `op_t'
+	 * that it points into the middle of.
+	 */
+	srcp &= -OPSIZ;
+
+	switch (len % 4) {	/* Pick the entry point into the 4-step unrolled loop from the remainder.  */
+	case 2:
+		a1 = ((op_t *) srcp)[0];	/* Preload the first two aligned source words.  */
+		a2 = ((op_t *) srcp)[1];
+		srcp -= 1 * OPSIZ;	/* Bias the pointers so the fixed indices at the entry label hit the right words.  */
+		dstp -= 3 * OPSIZ;
+		len += 2;	/* Round the word count up to a multiple of 4 for the loop test.  */
+		goto do1;
+	case 3:
+		a0 = ((op_t *) srcp)[0];
+		a1 = ((op_t *) srcp)[1];
+		srcp -= 0 * OPSIZ;
+		dstp -= 2 * OPSIZ;
+		len += 1;
+		goto do2;
+	case 0:
+		if (OP_T_THRES <= 3 * OPSIZ && len == 0)
+			return;	/* Zero words: nothing to copy.  */
+		a3 = ((op_t *) srcp)[0];
+		a0 = ((op_t *) srcp)[1];
+		srcp -= -1 * OPSIZ;	/* Subtracting a negative multiple moves SRCP forward.  */
+		dstp -= 1 * OPSIZ;
+		len += 0;
+		goto do3;
+	case 1:
+		a2 = ((op_t *) srcp)[0];
+		a3 = ((op_t *) srcp)[1];
+		srcp -= -2 * OPSIZ;
+		dstp -= 0 * OPSIZ;
+		len -= 1;
+		if (OP_T_THRES <= 3 * OPSIZ && len == 0)
+			goto do0;	/* Exactly one word: skip the loop and store the merge of a2 and a3.  */
+		goto do4;                 /* No-op: do4 is the first label in the loop below.  */
+	}
+
+	do {
+do4:
+		a0 = ((op_t *) srcp)[0];	/* Each step loads the next aligned source word ...  */
+		((op_t *) dstp)[0] = MERGE(a2, sh_1, a3, sh_2);	/* ... and stores a word built from the two loaded before it.  */
+do3:
+		a1 = ((op_t *) srcp)[1];
+		((op_t *) dstp)[1] = MERGE(a3, sh_1, a0, sh_2);
+do2:
+		a2 = ((op_t *) srcp)[2];
+		((op_t *) dstp)[2] = MERGE(a0, sh_1, a1, sh_2);
+do1:
+		a3 = ((op_t *) srcp)[3];
+		((op_t *) dstp)[3] = MERGE(a1, sh_1, a2, sh_2);
+
+		srcp += 4 * OPSIZ;
+		dstp += 4 * OPSIZ;
+		len -= 4;
+	} while (len != 0);	/* LEN was rounded to a multiple of 4 on entry, so it reaches exactly 0.  */
+
+	/* do0 sits after the loop on purpose: it stores the last word, merged
+	 * from a2 and a3, which are still pending.  Do not move it into the loop.
+	 */
+do0:
+	((op_t *) dstp)[0] = MERGE(a2, sh_1, a3, sh_2);
+}
+
+#define BYTE_COPY_FWD(dst_bp, src_bp, nbytes)	/* Copy NBYTES bytes one at a time, advancing DST_BP and SRC_BP in place.  */		\
+do {							\
+	size_t __nbytes = (nbytes);	/* Evaluate NBYTES once, before DST_BP/SRC_BP change.  */			\
+	while (__nbytes > 0) {						\
+		byte __x = ((byte *) src_bp)[0];	/* Load the byte before the pointers move.  */		\
+		src_bp += 1;				\
+		__nbytes -= 1;				\
+		((byte *) dst_bp)[0] = __x;		\
+		dst_bp += 1;				\
+	}						\
+} while (0)
+
+#define WORD_COPY_FWD(dst_bp, src_bp, nbytes_left, nbytes)	/* Copy the whole op_t words contained in NBYTES; DST_BP is used as an op_t address.  */			\
+do {										\
+	if (src_bp % OPSIZ == 0)	/* Source is a multiple of OPSIZ too: plain word copy.  */						\
+		_wordcopy_fwd_aligned(dst_bp, src_bp, (nbytes) / OPSIZ);	\
+	else	/* Source is misaligned: shift-and-merge word copy.  */									\
+		_wordcopy_fwd_dest_aligned(dst_bp, src_bp, (nbytes) / OPSIZ);	\
+	src_bp += (nbytes) & -OPSIZ;	/* Step both pointers past the bytes the word copy handled.  */						\
+	dst_bp += (nbytes) & -OPSIZ;						\
+	(nbytes_left) = (nbytes) % OPSIZ;	/* Must stay last: NBYTES_LEFT may name the same variable as NBYTES.  */					\
+} while (0)
+
+extern void *__memcpy_aligned(void *dest, const void *src, size_t len);	/* Defined outside this file; used below for large copies with aligned pointers.  */
+void *__memcpy(void *dest, const void *src, size_t len)	/* Copy LEN bytes from SRC to DEST and return DEST; overlapping regions are not handled.  */
+{
+	unsigned long dstp = (long) dest;	/* Addresses are kept as integers so alignment can be tested with masks.  */
+	unsigned long srcp = (long) src;
+
+	/* If there are enough bytes to make it worthwhile, use word copy.  */
+	if (len >= OP_T_THRES) {
+		if ((len >= FAST_COPY_THRES) && ((dstp & OPSIZ_MASK) == 0) &&	/* Fast path: at least FAST_COPY_THRES bytes ...  */
+			((srcp & OPSIZ_MASK) == 0)) {	/* ... and both pointers are multiples of OPSIZ.  */
+			__memcpy_aligned(dest, src, len);	/* Its return value is ignored; DEST is returned on the next line.  */
+			return dest;
+		}
+		/* Copy just a few bytes to make DSTP aligned.  */
+		len -= (-dstp) % OPSIZ;	/* Adjust LEN first: the macro below advances DSTP, which changes this expression.  */
+		BYTE_COPY_FWD(dstp, srcp, (-dstp) % OPSIZ);
+
+		/* Copy from SRCP to DSTP taking advantage of the known alignment of
+		 * DSTP.  The number of bytes left over (fewer than OPSIZ) is stored
+		 * through the third argument, i.e. back into LEN.
+		 */
+		WORD_COPY_FWD(dstp, srcp, len, len);
+	/* Fall out and copy the tail.  */
+	}
+
+	/* There are just a few bytes to copy.  Use byte memory operations.  */
+	BYTE_COPY_FWD(dstp, srcp, len);
+
+	return dest;
+}
+
+#ifdef weak_alias
+weak_alias (__memcpy, memcpy)
+#endif
+
+libc_hidden_builtin_def (memcpy)
+diff --git a/sysdeps/riscv/memcpy_aligned.S b/sysdeps/riscv/memcpy_aligned.S
+new file mode 100644
+index 00000000..b9b01e35
+--- /dev/null
+++ b/sysdeps/riscv/memcpy_aligned.S
+@@ -0,0 +1,82 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2013 Regents of the University of California
+ */
+
 +#include <sysdep.h>
 +#include <sys/asm.h>
 +
-+ENTRY(memcpy)
+/* void *__memcpy_aligned(void *, const void *, size_t) */
+ENTRY(__memcpy_aligned)
 +	move t6, a0  /* Preserve return value */
 +
-+	/* Defer to byte-oriented copy for small sizes */
-+	sltiu a3, a2, 128
-+	bnez a3, 4f
-+	/* Use word-oriented copy only if low-order bits match */
-+	andi a3, t6, SZREG-1
-+	andi a4, a1, SZREG-1
-+	bne a3, a4, 4f
-+
-+	beqz a3, 2f  /* Skip if already aligned */
-+	/*
-+	 * Round to nearest double word-aligned address
-+	 * greater than or equal to start address
-+	 */
-+	andi a3, a1, ~(SZREG-1)
-+	addi a3, a3, SZREG
-+	/* Handle initial misalignment */
-+	sub a4, a3, a1
-+1:
-+	lb a5, 0(a1)
-+	addi a1, a1, 1
-+	sb a5, 0(t6)
-+	addi t6, t6, 1
-+	bltu a1, a3, 1b
-+	sub a2, a2, a4  /* Update count */
-+
 +2:
 +	andi a4, a2, ~((16*SZREG)-1)
 +	beqz a4, 4f
@@ -137,134 +391,7 @@ index 00000000..aa0eaee9
 +	bltu a1, a3, 5b
 +6:
 +	ret
-+END(memcpy)
-+libc_hidden_builtin_def (memcpy)
-diff --git a/sysdeps/riscv/memset.S b/sysdeps/riscv/memset.S
-new file mode 100644
-index 00000000..431a19f5
--- /dev/null
-+++ b/sysdeps/riscv/memset.S
-@@ -0,0 +1,120 @@
-+/* memset for RISC-V.
-+   Copyright (C) 1996-2020 Free Software Foundation, Inc.
-+   This file is part of the GNU C Library.
-+
-+   The GNU C Library is free software; you can redistribute it and/or
-+   modify it under the terms of the GNU Lesser General Public
-+   License as published by the Free Software Foundation; either
-+   version 2.1 of the License, or (at your option) any later version.
-+
-+   The GNU C Library is distributed in the hope that it will be useful,
-+   but WITHOUT ANY WARRANTY; without even the implied warranty of
-+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-+   Lesser General Public License for more details.
-+
-+   You should have received a copy of the GNU Lesser General Public
-+   License along with the GNU C Library.  If not, see
-+   <https://www.gnu.org/licenses/>.  */
-+
-+#include <sysdep.h>
-+#include <sys/asm.h>
-+
-+ENTRY(memset)
-+	move t0, a0  /* Preserve return value */
-+
-+	/* Defer to byte-oriented fill for small sizes */
-+	sltiu a3, a2, 16
-+	bnez a3, 4f
-+
-+	/*
-+	 * Round to nearest XLEN-aligned address
-+	 * greater than or equal to start address
-+	 */
-+	addi a3, t0, SZREG-1
-+	andi a3, a3, ~(SZREG-1)
-+	beq a3, t0, 2f  /* Skip if already aligned */
-+	/* Handle initial misalignment */
-+	sub a4, a3, t0
-+1:
-+	sb a1, 0(t0)
-+	addi t0, t0, 1
-+	bltu t0, a3, 1b
-+	sub a2, a2, a4  /* Update count */
-+
-+2: /* Duff's device with 32 XLEN stores per iteration */
-+	/* Broadcast value into all bytes */
-+	andi a1, a1, 0xff
-+	slli a3, a1, 8
-+	or a1, a3, a1
-+	slli a3, a1, 16
-+	or a1, a3, a1
-+	slli a3, a1, 32
-+	or a1, a3, a1
-+
-+	/* Calculate end address */
-+	andi a4, a2, ~(SZREG-1)
-+	add a3, t0, a4
-+
-+	andi a4, a4, 31*SZREG  /* Calculate remainder */
-+	beqz a4, 3f            /* Shortcut if no remainder */
-+	neg a4, a4
-+	addi a4, a4, 32*SZREG  /* Calculate initial offset */
-+
-+	/* Adjust start address with offset */
-+	sub t0, t0, a4
-+
-+	/* Jump into loop body */
-+	/* Assumes 32-bit instruction lengths */
-+	la a5, 3f
-+	srli a4, a4, 1
-+	add a5, a5, a4
-+	jr a5
-+3:
-+	REG_S a1,        0(t0)
-+	REG_S a1,    SZREG(t0)
-+	REG_S a1,  2*SZREG(t0)
-+	REG_S a1,  3*SZREG(t0)
-+	REG_S a1,  4*SZREG(t0)
-+	REG_S a1,  5*SZREG(t0)
-+	REG_S a1,  6*SZREG(t0)
-+	REG_S a1,  7*SZREG(t0)
-+	REG_S a1,  8*SZREG(t0)
-+	REG_S a1,  9*SZREG(t0)
-+	REG_S a1, 10*SZREG(t0)
-+	REG_S a1, 11*SZREG(t0)
-+	REG_S a1, 12*SZREG(t0)
-+	REG_S a1, 13*SZREG(t0)
-+	REG_S a1, 14*SZREG(t0)
-+	REG_S a1, 15*SZREG(t0)
-+	REG_S a1, 16*SZREG(t0)
-+	REG_S a1, 17*SZREG(t0)
-+	REG_S a1, 18*SZREG(t0)
-+	REG_S a1, 19*SZREG(t0)
-+	REG_S a1, 20*SZREG(t0)
-+	REG_S a1, 21*SZREG(t0)
-+	REG_S a1, 22*SZREG(t0)
-+	REG_S a1, 23*SZREG(t0)
-+	REG_S a1, 24*SZREG(t0)
-+	REG_S a1, 25*SZREG(t0)
-+	REG_S a1, 26*SZREG(t0)
-+	REG_S a1, 27*SZREG(t0)
-+	REG_S a1, 28*SZREG(t0)
-+	REG_S a1, 29*SZREG(t0)
-+	REG_S a1, 30*SZREG(t0)
-+	REG_S a1, 31*SZREG(t0)
-+	addi t0, t0, 32*SZREG
-+	bltu t0, a3, 3b
-+	andi a2, a2, SZREG-1  /* Update count */
-+
-+4:
-+	/* Handle trailing misalignment */
-+	beqz a2, 6f
-+	add a3, t0, a2
-+5:
-+	sb a1, 0(t0)
-+	addi t0, t0, 1
-+	bltu t0, a3, 5b
-+6:
-+	ret
-+END(memset)
-+libc_hidden_builtin_def (memset)
+END(__memcpy_aligned)
 -- 
 2.39.2