fml13v01-buildroot/package/glibc/0004-Optimize-memory-operations-for-RISCV.patch

From 666e593642136fad34be8eef1fcf1e872830013c Mon Sep 17 00:00:00 2001
From: Mason Huo <mason.huo@starfivetech.com>
Date: Mon, 17 Apr 2023 13:41:06 +0800
Subject: [PATCH] Optimize memory operations for RISCV

Port the linux kernel memcpy function for optimizing
the 128 byte align case, this will improve the
performance of large block memcpy.

Here we combine the memcpy of glibc and kernel.

Signed-off-by: Mason Huo <mason.huo@starfivetech.com>
---
 sysdeps/riscv/Makefile         |   4 +
 sysdeps/riscv/memcpy.c         | 265 +++++++++++++++++++++++++++++++++
 sysdeps/riscv/memcpy_aligned.S |  82 ++++++++++
 3 files changed, 351 insertions(+)
 create mode 100644 sysdeps/riscv/memcpy.c
 create mode 100644 sysdeps/riscv/memcpy_aligned.S

diff --git a/sysdeps/riscv/Makefile b/sysdeps/riscv/Makefile
index 20a99681..5c3c3244 100644
--- a/sysdeps/riscv/Makefile
+++ b/sysdeps/riscv/Makefile
@@ -2,6 +2,10 @@ ifeq ($(subdir),misc)
 sysdep_headers += sys/asm.h
 endif

+ifeq ($(subdir),string)
+sysdep_routines += memcpy_aligned
+endif
+
 # RISC-V's assembler also needs to know about PIC as it changes the definition
 # of some assembler macros.
 ASFLAGS-.os += $(pic-ccflag)
diff --git a/sysdeps/riscv/memcpy.c b/sysdeps/riscv/memcpy.c
new file mode 100644
index 00000000..1de6141e
--- /dev/null
+++ b/sysdeps/riscv/memcpy.c
@@ -0,0 +1,265 @@
+/* Copy memory to memory until the specified number of bytes
+   has been copied.  Overlap is NOT handled correctly.
+   Copyright (C) 1991-2020 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Torbjorn Granlund (tege@sics.se).
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <stddef.h>
+#include <string.h>
+
+#define MERGE(w0, sh_1, w1, sh_2) (((w0) >> (sh_1)) | ((w1) << (sh_2)))
+#define OP_T_THRES      16
+#define op_t    unsigned long
+#define OPSIZ   (sizeof(op_t))
+#define OPSIZ_MASK   (sizeof(op_t) - 1)
+#define FAST_COPY_THRES  (128)
+#define byte    unsigned char
+
+static void _wordcopy_fwd_aligned(long dstp, long srcp, size_t len)
+{
+	op_t a0, a1;
+
+	switch (len % 8) {
+	case 2:
+		a0 = ((op_t *) srcp)[0];
+		srcp -= 6 * OPSIZ;
+		dstp -= 7 * OPSIZ;
+		len += 6;
+		goto do1;
+	case 3:
+		a1 = ((op_t *) srcp)[0];
+		srcp -= 5 * OPSIZ;
+		dstp -= 6 * OPSIZ;
+		len += 5;
+		goto do2;
+	case 4:
+		a0 = ((op_t *) srcp)[0];
+		srcp -= 4 * OPSIZ;
+		dstp -= 5 * OPSIZ;
+		len += 4;
+		goto do3;
+	case 5:
+		a1 = ((op_t *) srcp)[0];
+		srcp -= 3 * OPSIZ;
+		dstp -= 4 * OPSIZ;
+		len += 3;
+		goto do4;
+	case 6:
+		a0 = ((op_t *) srcp)[0];
+		srcp -= 2 * OPSIZ;
+		dstp -= 3 * OPSIZ;
+		len += 2;
+		goto do5;
+	case 7:
+		a1 = ((op_t *) srcp)[0];
+		srcp -= 1 * OPSIZ;
+		dstp -= 2 * OPSIZ;
+		len += 1;
+		goto do6;
+
+	case 0:
+		if (OP_T_THRES <= 3 * OPSIZ && len == 0)
+			return;
+		a0 = ((op_t *) srcp)[0];
+		srcp -= 0 * OPSIZ;
+		dstp -= 1 * OPSIZ;
+		goto do7;
+	case 1:
+		a1 = ((op_t *) srcp)[0];
+		srcp -= -1 * OPSIZ;
+		dstp -= 0 * OPSIZ;
+		len -= 1;
+		if (OP_T_THRES <= 3 * OPSIZ && len == 0)
+			goto do0;
+		goto do8;                 /* No-op.  */
+	}
+
+	do {
+do8:
+		a0 = ((op_t *) srcp)[0];
+		((op_t *) dstp)[0] = a1;
+do7:
+		a1 = ((op_t *) srcp)[1];
+		((op_t *) dstp)[1] = a0;
+do6:
+		a0 = ((op_t *) srcp)[2];
+		((op_t *) dstp)[2] = a1;
+do5:
+		a1 = ((op_t *) srcp)[3];
+		((op_t *) dstp)[3] = a0;
+do4:
+		a0 = ((op_t *) srcp)[4];
+		((op_t *) dstp)[4] = a1;
+do3:
+		a1 = ((op_t *) srcp)[5];
+		((op_t *) dstp)[5] = a0;
+do2:
+		a0 = ((op_t *) srcp)[6];
+		((op_t *) dstp)[6] = a1;
+do1:
+		a1 = ((op_t *) srcp)[7];
+		((op_t *) dstp)[7] = a0;
+
+		srcp += 8 * OPSIZ;
+		dstp += 8 * OPSIZ;
+		len -= 8;
+	} while (len != 0);
+
+	/* This is the right position for do0.  Please don't move
+	 * it into the loop.
+	 */
+do0:
+	((op_t *) dstp)[0] = a1;
+}
+
+static void _wordcopy_fwd_dest_aligned(long dstp, long srcp, size_t len)
+{
+	op_t a0, a1, a2, a3;
+	int sh_1, sh_2;
+
+	/* Calculate how to shift a word read at the memory operation
+	 * aligned srcp to make it aligned for copy.
+	 */
+
+	sh_1 = 8 * (srcp % OPSIZ);
+	sh_2 = 8 * OPSIZ - sh_1;
+
+	/* Make SRCP aligned by rounding it down to the beginning of the `op_t'
+	 * it points in the middle of.
+	 */
+	srcp &= -OPSIZ;
+
+	switch (len % 4) {
+	case 2:
+		a1 = ((op_t *) srcp)[0];
+		a2 = ((op_t *) srcp)[1];
+		srcp -= 1 * OPSIZ;
+		dstp -= 3 * OPSIZ;
+		len += 2;
+		goto do1;
+	case 3:
+		a0 = ((op_t *) srcp)[0];
+		a1 = ((op_t *) srcp)[1];
+		srcp -= 0 * OPSIZ;
+		dstp -= 2 * OPSIZ;
+		len += 1;
+		goto do2;
+	case 0:
+		if (OP_T_THRES <= 3 * OPSIZ && len == 0)
+			return;
+		a3 = ((op_t *) srcp)[0];
+		a0 = ((op_t *) srcp)[1];
+		srcp -= -1 * OPSIZ;
+		dstp -= 1 * OPSIZ;
+		len += 0;
+		goto do3;
+	case 1:
+		a2 = ((op_t *) srcp)[0];
+		a3 = ((op_t *) srcp)[1];
+		srcp -= -2 * OPSIZ;
+		dstp -= 0 * OPSIZ;
+		len -= 1;
+		if (OP_T_THRES <= 3 * OPSIZ && len == 0)
+			goto do0;
+		goto do4;                 /* No-op.  */
+	}
+
+	do {
+do4:
+		a0 = ((op_t *) srcp)[0];
+		((op_t *) dstp)[0] = MERGE(a2, sh_1, a3, sh_2);
+do3:
+		a1 = ((op_t *) srcp)[1];
+		((op_t *) dstp)[1] = MERGE(a3, sh_1, a0, sh_2);
+do2:
+		a2 = ((op_t *) srcp)[2];
+		((op_t *) dstp)[2] = MERGE(a0, sh_1, a1, sh_2);
+do1:
+		a3 = ((op_t *) srcp)[3];
+		((op_t *) dstp)[3] = MERGE(a1, sh_1, a2, sh_2);
+
+		srcp += 4 * OPSIZ;
+		dstp += 4 * OPSIZ;
+		len -= 4;
+	} while (len != 0);
+
+	/* This is the right position for do0.  Please don't move
+	 * it into the loop.
+	 */
+do0:
+	((op_t *) dstp)[0] = MERGE(a2, sh_1, a3, sh_2);
+}
+
+#define BYTE_COPY_FWD(dst_bp, src_bp, nbytes)		\
+do {							\
+	size_t __nbytes = (nbytes);			\
+	while (__nbytes > 0) {						\
+		byte __x = ((byte *) src_bp)[0];		\
+		src_bp += 1;				\
+		__nbytes -= 1;				\
+		((byte *) dst_bp)[0] = __x;		\
+		dst_bp += 1;				\
+	}						\
+} while (0)
+
+#define WORD_COPY_FWD(dst_bp, src_bp, nbytes_left, nbytes)			\
+do {										\
+	if (src_bp % OPSIZ == 0)						\
+		_wordcopy_fwd_aligned(dst_bp, src_bp, (nbytes) / OPSIZ);	\
+	else									\
+		_wordcopy_fwd_dest_aligned(dst_bp, src_bp, (nbytes) / OPSIZ);	\
+	src_bp += (nbytes) & -OPSIZ;						\
+	dst_bp += (nbytes) & -OPSIZ;						\
+	(nbytes_left) = (nbytes) % OPSIZ;					\
+} while (0)
+
+extern void *__memcpy_aligned(void *dest, const void *src, size_t len);
+void *__memcpy(void *dest, const void *src, size_t len)
+{
+	unsigned long dstp = (long) dest;
+	unsigned long srcp = (long) src;
+
+	/* If there not too few bytes to copy, use word copy.  */
+	if (len >= OP_T_THRES) {
+		if ((len >= FAST_COPY_THRES) && ((dstp & OPSIZ_MASK) == 0) &&
+			((srcp & OPSIZ_MASK) == 0)) {
+			__memcpy_aligned(dest, src, len);
+			return dest;
+		}
+		/* Copy just a few bytes to make DSTP aligned.  */
+		len -= (-dstp) % OPSIZ;
+		BYTE_COPY_FWD(dstp, srcp, (-dstp) % OPSIZ);
+
+		/* Copy from SRCP to DSTP taking advantage of the known alignment of
+		 * DSTP.  Number of bytes remaining is put in the third argument,
+		 * i.e. in LEN.  This number may vary from machine to machine.
+		 */
+		WORD_COPY_FWD(dstp, srcp, len, len);
+	/* Fall out and copy the tail.  */
+	}
+
+	/* There are just a few bytes to copy.  Use byte memory operations.  */
+	BYTE_COPY_FWD(dstp, srcp, len);
+
+	return dest;
+}
+
+#ifdef weak_alias
+weak_alias (__memcpy, memcpy)
+#endif
+
+libc_hidden_builtin_def (memcpy)
diff --git a/sysdeps/riscv/memcpy_aligned.S b/sysdeps/riscv/memcpy_aligned.S
new file mode 100644
index 00000000..b9b01e35
--- /dev/null
+++ b/sysdeps/riscv/memcpy_aligned.S
@@ -0,0 +1,82 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2013 Regents of the University of California
+ */
+
+#include <sysdep.h>
+#include <sys/asm.h>
+
+/* void *__memcpy_aligned(void *, const void *, size_t) */
+ENTRY(__memcpy_aligned)
+	move t6, a0  /* Preserve return value */
+
+2:
+	andi a4, a2, ~((16*SZREG)-1)
+	beqz a4, 4f
+	add a3, a1, a4
+3:
+	REG_L a4,       0(a1)
+	REG_L a5,   SZREG(a1)
+	REG_L a6, 2*SZREG(a1)
+	REG_L a7, 3*SZREG(a1)
+	REG_L t0, 4*SZREG(a1)
+	REG_L t1, 5*SZREG(a1)
+	REG_L t2, 6*SZREG(a1)
+	REG_L t3, 7*SZREG(a1)
+	REG_L t4, 8*SZREG(a1)
+	REG_L t5, 9*SZREG(a1)
+	REG_S a4,       0(t6)
+	REG_S a5,   SZREG(t6)
+	REG_S a6, 2*SZREG(t6)
+	REG_S a7, 3*SZREG(t6)
+	REG_S t0, 4*SZREG(t6)
+	REG_S t1, 5*SZREG(t6)
+	REG_S t2, 6*SZREG(t6)
+	REG_S t3, 7*SZREG(t6)
+	REG_S t4, 8*SZREG(t6)
+	REG_S t5, 9*SZREG(t6)
+	REG_L a4, 10*SZREG(a1)
+	REG_L a5, 11*SZREG(a1)
+	REG_L a6, 12*SZREG(a1)
+	REG_L a7, 13*SZREG(a1)
+	REG_L t0, 14*SZREG(a1)
+	REG_L t1, 15*SZREG(a1)
+	addi a1, a1, 16*SZREG
+	REG_S a4, 10*SZREG(t6)
+	REG_S a5, 11*SZREG(t6)
+	REG_S a6, 12*SZREG(t6)
+	REG_S a7, 13*SZREG(t6)
+	REG_S t0, 14*SZREG(t6)
+	REG_S t1, 15*SZREG(t6)
+	addi t6, t6, 16*SZREG
+	bltu a1, a3, 3b
+	andi a2, a2, (16*SZREG)-1  /* Update count */
+
+4:
+	/* Handle trailing misalignment */
+	beqz a2, 6f
+	add a3, a1, a2
+
+	/* Use word-oriented copy if co-aligned to word boundary */
+	or a5, a1, t6
+	or a5, a5, a3
+	andi a5, a5, 3
+	bnez a5, 5f
+7:
+	lw a4, 0(a1)
+	addi a1, a1, 4
+	sw a4, 0(t6)
+	addi t6, t6, 4
+	bltu a1, a3, 7b
+
+	ret
+
+5:
+	lb a4, 0(a1)
+	addi a1, a1, 1
+	sb a4, 0(t6)
+	addi t6, t6, 1
+	bltu a1, a3, 5b
+6:
+	ret
+END(__memcpy_aligned)
--
2.39.2