From 3b6505746a766ab62a56424e4d2546feead88053 Mon Sep 17 00:00:00 2001
From: Andy Hu <andy.hu@starfivetech.com>
Date: Fri, 19 May 2023 00:03:37 +0800
Subject: [PATCH] package: glibc: Add optimized memory operations for RISCV

Port the memcpy & memset implementation from
Linux kernel to glibc to improve performance of
memory operations in user space.

Signed-off-by: Mason Huo <mason.huo@starfivetech.com>
---
 ...Optimize-memory-operations-for-RISCV.patch | 397 ++++++++++++++++++
 1 file changed, 397 insertions(+)
 create mode 100644 package/glibc/0004-Optimize-memory-operations-for-RISCV.patch

diff --git a/package/glibc/0004-Optimize-memory-operations-for-RISCV.patch b/package/glibc/0004-Optimize-memory-operations-for-RISCV.patch
new file mode 100644
index 00000000..d3ab989f
--- /dev/null
+++ b/package/glibc/0004-Optimize-memory-operations-for-RISCV.patch
@@ -0,0 +1,397 @@
+From 666e593642136fad34be8eef1fcf1e872830013c Mon Sep 17 00:00:00 2001
+From: Mason Huo <mason.huo@starfivetech.com>
+Date: Mon, 17 Apr 2023 13:41:06 +0800
+Subject: [PATCH] Optimize memory operations for RISCV
+
+Port the Linux kernel memcpy function to optimize
+word-aligned copies of 128 bytes or more; this
+improves the performance of large block memcpy.
+
+Here we combine the memcpy implementations of glibc and the kernel.
+
+Signed-off-by: Mason Huo <mason.huo@starfivetech.com>
+---
+ sysdeps/riscv/Makefile         |   4 +
+ sysdeps/riscv/memcpy.c         | 265 +++++++++++++++++++++++++++++++++
+ sysdeps/riscv/memcpy_aligned.S |  82 ++++++++++
+ 3 files changed, 351 insertions(+)
+ create mode 100644 sysdeps/riscv/memcpy.c
+ create mode 100644 sysdeps/riscv/memcpy_aligned.S
+
+diff --git a/sysdeps/riscv/Makefile b/sysdeps/riscv/Makefile
+index 20a99681..5c3c3244 100644
+--- a/sysdeps/riscv/Makefile
++++ b/sysdeps/riscv/Makefile
+@@ -2,6 +2,10 @@ ifeq ($(subdir),misc)
+ sysdep_headers += sys/asm.h
+ endif
+ 
++ifeq ($(subdir),string)
++sysdep_routines += memcpy_aligned
++endif
++
+ # RISC-V's assembler also needs to know about PIC as it changes the definition
+ # of some assembler macros.
+ ASFLAGS-.os += $(pic-ccflag)
+diff --git a/sysdeps/riscv/memcpy.c b/sysdeps/riscv/memcpy.c
+new file mode 100644
+index 00000000..1de6141e
+--- /dev/null
++++ b/sysdeps/riscv/memcpy.c
+@@ -0,0 +1,265 @@
++/* Copy memory to memory until the specified number of bytes
++   has been copied.  Overlap is NOT handled correctly.
++   Copyright (C) 1991-2020 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++   Contributed by Torbjorn Granlund (tege@sics.se).
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#include <stddef.h>
++#include <string.h>
++
++#define MERGE(w0, sh_1, w1, sh_2) (((w0) >> (sh_1)) | ((w1) << (sh_2)))
++#define OP_T_THRES      16	/* Min LEN in bytes for the word-copy path.  */
++#define op_t    unsigned long	/* Unit moved by the word-copy helpers.  */
++#define OPSIZ   (sizeof(op_t))
++#define OPSIZ_MASK   (sizeof(op_t) - 1)	/* addr & mask == 0: op_t-aligned.  */
++#define FAST_COPY_THRES  (128)	/* Min LEN in bytes for __memcpy_aligned.  */
++#define byte    unsigned char
++
++static void _wordcopy_fwd_aligned(long dstp, long srcp, size_t len)
++{	/* Copy LEN op_t words (not bytes) from SRCP to DSTP.  */
++	op_t a0, a1;
++
++	switch (len % 8) {	/* Pick the entry label into the 8-way unrolled loop.  */
++	case 2:	/* Bias srcp/dstp so the indices at do1 hit the first words.  */
++		a0 = ((op_t *) srcp)[0];
++		srcp -= 6 * OPSIZ;
++		dstp -= 7 * OPSIZ;
++		len += 6;	/* Round len up to a multiple of 8 for the loop test.  */
++		goto do1;
++	case 3:
++		a1 = ((op_t *) srcp)[0];
++		srcp -= 5 * OPSIZ;
++		dstp -= 6 * OPSIZ;
++		len += 5;
++		goto do2;
++	case 4:
++		a0 = ((op_t *) srcp)[0];
++		srcp -= 4 * OPSIZ;
++		dstp -= 5 * OPSIZ;
++		len += 4;
++		goto do3;
++	case 5:
++		a1 = ((op_t *) srcp)[0];
++		srcp -= 3 * OPSIZ;
++		dstp -= 4 * OPSIZ;
++		len += 3;
++		goto do4;
++	case 6:
++		a0 = ((op_t *) srcp)[0];
++		srcp -= 2 * OPSIZ;
++		dstp -= 3 * OPSIZ;
++		len += 2;
++		goto do5;
++	case 7:
++		a1 = ((op_t *) srcp)[0];
++		srcp -= 1 * OPSIZ;
++		dstp -= 2 * OPSIZ;
++		len += 1;
++		goto do6;
++
++	case 0:
++		if (OP_T_THRES <= 3 * OPSIZ && len == 0)
++			return;
++		a0 = ((op_t *) srcp)[0];
++		srcp -= 0 * OPSIZ;
++		dstp -= 1 * OPSIZ;
++		goto do7;
++	case 1:
++		a1 = ((op_t *) srcp)[0];
++		srcp -= -1 * OPSIZ;
++		dstp -= 0 * OPSIZ;
++		len -= 1;
++		if (OP_T_THRES <= 3 * OPSIZ && len == 0)
++			goto do0;
++		goto do8;                 /* No-op.  */
++	}
++
++	do {	/* Each step loads the next word, then stores the previous one.  */
++do8:
++		a0 = ((op_t *) srcp)[0];
++		((op_t *) dstp)[0] = a1;
++do7:
++		a1 = ((op_t *) srcp)[1];
++		((op_t *) dstp)[1] = a0;
++do6:
++		a0 = ((op_t *) srcp)[2];
++		((op_t *) dstp)[2] = a1;
++do5:
++		a1 = ((op_t *) srcp)[3];
++		((op_t *) dstp)[3] = a0;
++do4:
++		a0 = ((op_t *) srcp)[4];
++		((op_t *) dstp)[4] = a1;
++do3:
++		a1 = ((op_t *) srcp)[5];
++		((op_t *) dstp)[5] = a0;
++do2:
++		a0 = ((op_t *) srcp)[6];
++		((op_t *) dstp)[6] = a1;
++do1:
++		a1 = ((op_t *) srcp)[7];
++		((op_t *) dstp)[7] = a0;
++
++		srcp += 8 * OPSIZ;
++		dstp += 8 * OPSIZ;
++		len -= 8;
++	} while (len != 0);
++
++	/* This is the right position for do0.  Please don't move
++	 * it into the loop.  It stores the last word, still held in a1.
++	 */
++do0:
++	((op_t *) dstp)[0] = a1;
++}
++
++static void _wordcopy_fwd_dest_aligned(long dstp, long srcp, size_t len)
++{	/* Copy LEN op_t words to DSTP from a SRCP that is not op_t-aligned.  */
++	op_t a0, a1, a2, a3;
++	int sh_1, sh_2;
++
++	/* Calculate how to shift a word read at the memory operation
++	 * aligned srcp to make it aligned for copy.
++	 */
++
++	sh_1 = 8 * (srcp % OPSIZ);	/* Bit offset of SRCP within its word.  */
++	sh_2 = 8 * OPSIZ - sh_1;	/* sh_1 == 0 would make this a full-width shift.  */
++
++	/* Make SRCP aligned by rounding it down to the beginning of the `op_t'
++	 * it points in the middle of.
++	 */
++	srcp &= -OPSIZ;
++
++	switch (len % 4) {	/* Pick the entry label into the 4-way unrolled loop.  */
++	case 2:
++		a1 = ((op_t *) srcp)[0];
++		a2 = ((op_t *) srcp)[1];
++		srcp -= 1 * OPSIZ;
++		dstp -= 3 * OPSIZ;
++		len += 2;	/* Round len up to a multiple of 4 for the loop test.  */
++		goto do1;
++	case 3:
++		a0 = ((op_t *) srcp)[0];
++		a1 = ((op_t *) srcp)[1];
++		srcp -= 0 * OPSIZ;
++		dstp -= 2 * OPSIZ;
++		len += 1;
++		goto do2;
++	case 0:
++		if (OP_T_THRES <= 3 * OPSIZ && len == 0)
++			return;
++		a3 = ((op_t *) srcp)[0];
++		a0 = ((op_t *) srcp)[1];
++		srcp -= -1 * OPSIZ;
++		dstp -= 1 * OPSIZ;
++		len += 0;
++		goto do3;
++	case 1:
++		a2 = ((op_t *) srcp)[0];
++		a3 = ((op_t *) srcp)[1];
++		srcp -= -2 * OPSIZ;
++		dstp -= 0 * OPSIZ;
++		len -= 1;
++		if (OP_T_THRES <= 3 * OPSIZ && len == 0)
++			goto do0;
++		goto do4;                 /* No-op.  */
++	}
++
++	do {	/* Each store merges two adjacent aligned source words.  */
++do4:
++		a0 = ((op_t *) srcp)[0];
++		((op_t *) dstp)[0] = MERGE(a2, sh_1, a3, sh_2);
++do3:
++		a1 = ((op_t *) srcp)[1];
++		((op_t *) dstp)[1] = MERGE(a3, sh_1, a0, sh_2);
++do2:
++		a2 = ((op_t *) srcp)[2];
++		((op_t *) dstp)[2] = MERGE(a0, sh_1, a1, sh_2);
++do1:
++		a3 = ((op_t *) srcp)[3];
++		((op_t *) dstp)[3] = MERGE(a1, sh_1, a2, sh_2);
++
++		srcp += 4 * OPSIZ;
++		dstp += 4 * OPSIZ;
++		len -= 4;
++	} while (len != 0);
++
++	/* This is the right position for do0.  Please don't move
++	 * it into the loop.  It stores the last word, merged from a2 and a3.
++	 */
++do0:
++	((op_t *) dstp)[0] = MERGE(a2, sh_1, a3, sh_2);
++}
++
++#define BYTE_COPY_FWD(dst_bp, src_bp, nbytes)		\
++do {	/* Copy nbytes bytes; dst_bp and src_bp are advanced.  */	\
++	size_t __nbytes = (nbytes);			\
++	while (__nbytes > 0) {						\
++		byte __x = ((byte *) src_bp)[0];		\
++		src_bp += 1;				\
++		__nbytes -= 1;				\
++		((byte *) dst_bp)[0] = __x;		\
++		dst_bp += 1;				\
++	}						\
++} while (0)
++
++#define WORD_COPY_FWD(dst_bp, src_bp, nbytes_left, nbytes)			\
++do {	/* Copy nbytes / OPSIZ words; nbytes_left gets the remainder.  */	\
++	if (src_bp % OPSIZ == 0)						\
++		_wordcopy_fwd_aligned(dst_bp, src_bp, (nbytes) / OPSIZ);	\
++	else									\
++		_wordcopy_fwd_dest_aligned(dst_bp, src_bp, (nbytes) / OPSIZ);	\
++	src_bp += (nbytes) & -OPSIZ;						\
++	dst_bp += (nbytes) & -OPSIZ;						\
++	(nbytes_left) = (nbytes) % OPSIZ;					\
++} while (0)
++
++extern void *__memcpy_aligned(void *dest, const void *src, size_t len);
++void *__memcpy(void *dest, const void *src, size_t len)
++{	/* Copy LEN bytes from SRC to DEST (overlap not handled); return DEST.  */
++	unsigned long dstp = (long) dest;
++	unsigned long srcp = (long) src;
++
++	/* If there are not too few bytes to copy, use word copy.  */
++	if (len >= OP_T_THRES) {
++		if ((len >= FAST_COPY_THRES) && ((dstp & OPSIZ_MASK) == 0) &&
++			((srcp & OPSIZ_MASK) == 0)) {
++			__memcpy_aligned(dest, src, len);	/* Both op_t-aligned.  */
++			return dest;
++		}
++		/* Copy just a few bytes to make DSTP aligned.  */
++		len -= (-dstp) % OPSIZ;
++		BYTE_COPY_FWD(dstp, srcp, (-dstp) % OPSIZ);
++
++		/* Copy from SRCP to DSTP taking advantage of the known alignment of
++		 * DSTP.  Number of bytes remaining is put in the third argument,
++		 * i.e. in LEN.  This number may vary from machine to machine.
++		 */
++		WORD_COPY_FWD(dstp, srcp, len, len);
++	/* Fall out and copy the tail.  */
++	}
++
++	/* There are just a few bytes to copy.  Use byte memory operations.  */
++	BYTE_COPY_FWD(dstp, srcp, len);
++
++	return dest;
++}
++
++#ifdef weak_alias
++weak_alias (__memcpy, memcpy)
++#endif
++
++libc_hidden_builtin_def (memcpy)
+diff --git a/sysdeps/riscv/memcpy_aligned.S b/sysdeps/riscv/memcpy_aligned.S
+new file mode 100644
+index 00000000..b9b01e35
+--- /dev/null
++++ b/sysdeps/riscv/memcpy_aligned.S
+@@ -0,0 +1,82 @@
++/* SPDX-License-Identifier: GPL-2.0-only */
++/*
++ * Copyright (C) 2013 Regents of the University of California
++ */
++
++#include <sysdep.h>
++#include <sys/asm.h>
++
++/* void *__memcpy_aligned(void *, const void *, size_t) */
++ENTRY(__memcpy_aligned)
++	move t6, a0  /* t6 = running dst pointer; a0 is left intact as the return value */
++
++2:
++	andi a4, a2, ~((16*SZREG)-1)	/* a4 = len rounded down to a multiple of 16*SZREG */
++	beqz a4, 4f	/* No full 16-register chunk: go straight to the tail */
++	add a3, a1, a4	/* a3 = src address where the unrolled loop stops */
++3:	/* Copy 16*SZREG bytes per iteration */
++	REG_L a4,       0(a1)
++	REG_L a5,   SZREG(a1)
++	REG_L a6, 2*SZREG(a1)
++	REG_L a7, 3*SZREG(a1)
++	REG_L t0, 4*SZREG(a1)
++	REG_L t1, 5*SZREG(a1)
++	REG_L t2, 6*SZREG(a1)
++	REG_L t3, 7*SZREG(a1)
++	REG_L t4, 8*SZREG(a1)
++	REG_L t5, 9*SZREG(a1)
++	REG_S a4,       0(t6)
++	REG_S a5,   SZREG(t6)
++	REG_S a6, 2*SZREG(t6)
++	REG_S a7, 3*SZREG(t6)
++	REG_S t0, 4*SZREG(t6)
++	REG_S t1, 5*SZREG(t6)
++	REG_S t2, 6*SZREG(t6)
++	REG_S t3, 7*SZREG(t6)
++	REG_S t4, 8*SZREG(t6)
++	REG_S t5, 9*SZREG(t6)
++	REG_L a4, 10*SZREG(a1)
++	REG_L a5, 11*SZREG(a1)
++	REG_L a6, 12*SZREG(a1)
++	REG_L a7, 13*SZREG(a1)
++	REG_L t0, 14*SZREG(a1)
++	REG_L t1, 15*SZREG(a1)
++	addi a1, a1, 16*SZREG
++	REG_S a4, 10*SZREG(t6)
++	REG_S a5, 11*SZREG(t6)
++	REG_S a6, 12*SZREG(t6)
++	REG_S a7, 13*SZREG(t6)
++	REG_S t0, 14*SZREG(t6)
++	REG_S t1, 15*SZREG(t6)
++	addi t6, t6, 16*SZREG
++	bltu a1, a3, 3b
++	andi a2, a2, (16*SZREG)-1  /* Update count */
++
++4:
++	/* Copy the remaining tail bytes, if any */
++	beqz a2, 6f
++	add a3, a1, a2	/* a3 = src end address */
++
++	/* Use word-oriented copy if co-aligned to word boundary */
++	or a5, a1, t6
++	or a5, a5, a3
++	andi a5, a5, 3	/* Nonzero if src, dst or end is not 4-byte aligned */
++	bnez a5, 5f
++7:	/* 4 bytes per iteration */
++	lw a4, 0(a1)
++	addi a1, a1, 4
++	sw a4, 0(t6)
++	addi t6, t6, 4
++	bltu a1, a3, 7b
++
++	ret
++
++5:	/* 1 byte per iteration */
++	lb a4, 0(a1)
++	addi a1, a1, 1
++	sb a4, 0(t6)
++	addi t6, t6, 1
++	bltu a1, a3, 5b
++6:
++	ret
++END(__memcpy_aligned)
+-- 
+2.39.2
+