1 Star 0 Fork 151

hwzjyggsddu/glibc

forked from src-openEuler/glibc 
加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
backport-Kunpeng-patches.patch 30.83 KB
一键复制 编辑 原始数据 按行查看 历史

From 0dfa5db2106d75db595e83f064352fb89d92986e Mon Sep 17 00:00:00 2001
From: wangbin224 <[email protected]>
Date: Sat, 28 Mar 2020 19:14:41 +0800
Subject: [PATCH] glibc: backport Kunpeng patches
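Backport the Kunpeng changes for AArch64:
- remove the Kunpeng-specific memcpy/memmove (memcpy_kunpeng.S); Kunpeng 920
  now selects __memcpy_falkor for memcpy and is no longer special-cased for
  memmove;
- simplify memset_kunpeng.S (the 64..127-byte path and the tail of the
  set_long loop);
- document the Kunpeng CPU value for the glibc.tune.cpu tunable;
- adjust file headers and whitespace in the touched aarch64 files.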
Signed-off-by: wangbin224 <[email protected]>
---
manual/tunables.texi | 2 +-
sysdeps/aarch64/memcmp.S | 4 +-
sysdeps/aarch64/memrchr.S | 15 +-
sysdeps/aarch64/multiarch/Makefile | 2 +-
sysdeps/aarch64/multiarch/ifunc-impl-list.c | 54 +-
sysdeps/aarch64/multiarch/memcpy.c | 9 +-
sysdeps/aarch64/multiarch/memcpy_kunpeng.S | 576 ------------------
sysdeps/aarch64/multiarch/memmove.c | 11 +-
sysdeps/aarch64/multiarch/memset.c | 14 +-
sysdeps/aarch64/multiarch/memset_kunpeng.S | 58 +-
sysdeps/aarch64/strcpy.S | 6 +-
sysdeps/aarch64/strnlen.S | 4 +-
.../unix/sysv/linux/aarch64/cpu-features.c | 4 +-
.../unix/sysv/linux/aarch64/cpu-features.h | 7 +-
14 files changed, 86 insertions(+), 680 deletions(-)
delete mode 100755 sysdeps/aarch64/multiarch/memcpy_kunpeng.S
diff --git a/manual/tunables.texi b/manual/tunables.texi
index bb4819bd..124b39b6 100644
--- a/manual/tunables.texi
+++ b/manual/tunables.texi
@@ -333,7 +333,7 @@ This tunable is specific to powerpc, powerpc64 and powerpc64le.
The @code{glibc.tune.cpu=xxx} tunable allows the user to tell @theglibc{} to
assume that the CPU is @code{xxx} where xxx may have one of these values:
@code{generic}, @code{falkor}, @code{thunderxt88}, @code{thunderx2t99},
-@code{thunderx2t99p1}.
+@code{thunderx2t99p1}, @code{kunpeng920}.
This tunable is specific to aarch64.
@end deftp
diff --git a/sysdeps/aarch64/memcmp.S b/sysdeps/aarch64/memcmp.S
index 04129d83..a2138616 100644
--- a/sysdeps/aarch64/memcmp.S
+++ b/sysdeps/aarch64/memcmp.S
@@ -1,6 +1,6 @@
/* memcmp - compare memory
- Copyright (C) 2013-2019 Free Software Foundation, Inc.
+ Copyright (C) 2013-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -16,7 +16,7 @@
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library. If not, see
- <https://www.gnu.org/licenses/>. */
+ <http://www.gnu.org/licenses/>. */
#include <sysdep.h>
diff --git a/sysdeps/aarch64/memrchr.S b/sysdeps/aarch64/memrchr.S
index 9095304b..0565168a 100644
--- a/sysdeps/aarch64/memrchr.S
+++ b/sysdeps/aarch64/memrchr.S
@@ -16,8 +16,8 @@
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library. If not, see
- <http://www.gnu.org/licenses/>. */
-
+ <https://www.gnu.org/licenses/>. */
+
#include <sysdep.h>
/* Assumptions:
@@ -61,7 +61,7 @@
* things occur in the original string, counting trailing zeros allows to
* identify exactly which byte has matched.
*/
-
+
ENTRY (__memrchr)
/* Do not dereference srcin if no bytes to compare. */
cbz cntin, L(zero_length)
@@ -101,7 +101,7 @@ ENTRY (__memrchr)
addp vend.16b, vend.16b, vend.16b /* 128->64 */
mov synd, vend.2d[0]
/* Clear the (32-soff)*2 upper bits */
- lsl tmp, soff, #1
+ lsl tmp, soff, #1
lsl synd, synd, tmp
lsr synd, synd, tmp
/* The first block can also be the last */
@@ -135,16 +135,16 @@ L(end):
b.hi L(tail)
L(masklast):
- /* Clear the (32 - ((cntrem + (32-soff)) % 32)) * 2 lower bits */
+ /* Clear the (32 - ((cntrem + (32-soff)) % 32)) * 2 lower bits */
add tmp, cntrem, soff
and tmp, tmp, #31
sub tmp, tmp, #32
- neg tmp, tmp, lsl #1
+ neg tmp, tmp, lsl #1
lsr synd, synd, tmp
lsl synd, synd, tmp
L(tail):
- /* Compensate the last post-increment*/
+ /* Compensate the last post-increment*/
add seek_dst, seek_dst, #32
/* Check that we have found a character */
cmp synd, #0
@@ -163,4 +163,3 @@ L(zero_length):
END (__memrchr)
weak_alias (__memrchr, memrchr)
libc_hidden_builtin_def (memrchr)
-
diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
index 90529d40..722ed824 100644
--- a/sysdeps/aarch64/multiarch/Makefile
+++ b/sysdeps/aarch64/multiarch/Makefile
@@ -1,4 +1,4 @@
ifeq ($(subdir),string)
-sysdep_routines += memcpy_kunpeng memcpy_generic memcpy_thunderx memcpy_thunderx2 \
+sysdep_routines += memcpy_generic memcpy_thunderx memcpy_thunderx2 \
memcpy_falkor memmove_falkor memset_generic memset_falkor memset_kunpeng
endif
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
index bef9b06d..0026dbba 100644
--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -1,5 +1,5 @@
/* Enumerate available IFUNC implementations of a function. AARCH64 version.
- Copyright (C) 2017-2019 Free Software Foundation, Inc.
+ Copyright (C) 2017-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -25,36 +25,34 @@
#include <stdio.h>
/* Maximum number of IFUNC implementations. */
-#define MAX_IFUNC 5
+#define MAX_IFUNC 4
size_t
__libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
size_t max)
{
- assert(max >= MAX_IFUNC);
-
- size_t i = 0;
-
- INIT_ARCH();
-
- /* Support sysdeps/aarch64/multiarch/memcpy.c and memmove.c. */
- IFUNC_IMPL(i, name, memcpy,
- IFUNC_IMPL_ADD(array, i, memcpy, 1, __memcpy_thunderx)
- IFUNC_IMPL_ADD(array, i, memcpy, 1, __memcpy_thunderx2)
- IFUNC_IMPL_ADD(array, i, memcpy, 1, __memcpy_falkor)
- IFUNC_IMPL_ADD(array, i, memcpy, 1, __memcpy_kunpeng)
- IFUNC_IMPL_ADD(array, i, memcpy, 1, __memcpy_generic))
- IFUNC_IMPL(i, name, memmove,
- IFUNC_IMPL_ADD(array, i, memmove, 1, __memmove_thunderx)
- IFUNC_IMPL_ADD(array, i, memmove, 1, __memmove_falkor)
- IFUNC_IMPL_ADD(array, i, memmove, 1, __memmove_kunpeng)
- IFUNC_IMPL_ADD(array, i, memmove, 1, __memmove_generic))
- IFUNC_IMPL(i, name, memset,
- /* Enable this on non-falkor processors too so that other cores
- can do a comparative analysis with __memset_generic. */
- IFUNC_IMPL_ADD(array, i, memset, (zva_size == 64), __memset_falkor)
- IFUNC_IMPL_ADD(array, i, memset, 1, __memset_generic)
- IFUNC_IMPL_ADD(array, i, memset, 1, __memset_kunpeng))
-
- return i;
+ assert (max >= MAX_IFUNC);
+
+ size_t i = 0;
+
+ INIT_ARCH ();
+
+ /* Support sysdeps/aarch64/multiarch/memcpy.c and memmove.c. */
+ IFUNC_IMPL (i, name, memcpy,
+ IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx)
+ IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx2)
+ IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_falkor)
+ IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_generic))
+ IFUNC_IMPL (i, name, memmove,
+ IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx)
+ IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_falkor)
+ IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic))
+ IFUNC_IMPL (i, name, memset,
+ /* Enable this on non-falkor processors too so that other cores
+ can do a comparative analysis with __memset_generic. */
+ IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_falkor)
+ IFUNC_IMPL_ADD (array, i, memset, 1, __memset_kunpeng)
+ IFUNC_IMPL_ADD (array, i, memset, 1, __memset_generic))
+
+ return i;
}
diff --git a/sysdeps/aarch64/multiarch/memcpy.c b/sysdeps/aarch64/multiarch/memcpy.c
index 150e1ca9..2d358a83 100644
--- a/sysdeps/aarch64/multiarch/memcpy.c
+++ b/sysdeps/aarch64/multiarch/memcpy.c
@@ -1,5 +1,5 @@
/* Multiple versions of memcpy. AARCH64 version.
- Copyright (C) 2017-2019 Free Software Foundation, Inc.
+ Copyright (C) 2017-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -32,14 +32,11 @@ extern __typeof (__redirect_memcpy) __memcpy_generic attribute_hidden;
extern __typeof (__redirect_memcpy) __memcpy_thunderx attribute_hidden;
extern __typeof (__redirect_memcpy) __memcpy_thunderx2 attribute_hidden;
extern __typeof (__redirect_memcpy) __memcpy_falkor attribute_hidden;
-extern __typeof (__redirect_memcpy) __memcpy_kunpeng attribute_hidden;
libc_ifunc (__libc_memcpy,
- IS_KUNPENG920(midr)
- ?__memcpy_kunpeng
- : (IS_THUNDERX (midr)
+ (IS_THUNDERX (midr)
? __memcpy_thunderx
- : (IS_FALKOR (midr) || IS_PHECDA (midr)
+ : (IS_FALKOR (midr) || IS_PHECDA (midr) || IS_KUNPENG920 (midr)
? __memcpy_falkor
: (IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr)
? __memcpy_thunderx2
diff --git a/sysdeps/aarch64/multiarch/memcpy_kunpeng.S b/sysdeps/aarch64/multiarch/memcpy_kunpeng.S
deleted file mode 100755
index 2102478a..00000000
--- a/sysdeps/aarch64/multiarch/memcpy_kunpeng.S
+++ /dev/null
@@ -1,576 +0,0 @@
-/* A Kunpeng Optimized memcpy implementation for AARCH64.
- Copyright (C) 2018-2019 Free Software Foundation, Inc.
-
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-/* Assumptions:
- *
- * ARMv8-a, AArch64, unaligned accesses.
- *
- */
-
-#define dstin x0
-#define src x1
-#define count x2
-#define dst x3
-#define srcend x4
-#define dstend x5
-#define tmp2 x6
-#define tmp3 x7
-#define tmp3w w7
-#define A_l x6
-#define A_lw w6
-#define A_h x7
-#define A_hw w7
-#define B_l x8
-#define B_lw w8
-#define B_h x9
-#define C_l x10
-#define C_h x11
-#define D_l x12
-#define D_h x13
-#define E_l src
-#define E_h count
-#define F_l srcend
-#define F_h dst
-#define G_l count
-#define G_h dst
-#define tmp1 x14
-
-#define A_q q0
-#define B_q q1
-#define C_q q2
-#define D_q q3
-#define E_q q4
-#define F_q q5
-#define G_q q6
-#define H_q q7
-#define I_q q16
-#define J_q q17
-
-#define A_v v0
-#define B_v v1
-#define C_v v2
-#define D_v v3
-#define E_v v4
-#define F_v v5
-#define G_v v6
-#define H_v v7
-#define I_v v16
-#define J_v v17
-
-#ifndef MEMMOVE
-# define MEMMOVE memmove
-#endif
-#ifndef MEMCPY
-# define MEMCPY memcpy
-#endif
-
-#if IS_IN (libc)
-
-#undef MEMCPY
-#define MEMCPY __memcpy_kunpeng
-#undef MEMMOVE
-#define MEMMOVE __memmove_kunpeng
-
-
-/* Overlapping large forward memmoves use a loop that copies backwards.
- Otherwise memcpy is used. Small moves branch to memcopy16 directly.
- The longer memcpy cases fall through to the memcpy head.
-*/
-
-ENTRY_ALIGN (MEMMOVE, 6)
-
- DELOUSE (0)
- DELOUSE (1)
- DELOUSE (2)
-
- sub tmp1, dstin, src
- cmp count, 512
- ccmp tmp1, count, 2, hi
- b.lo L(move_long)
- cmp count, 96
- ccmp tmp1, count, 2, hi
- b.lo L(move_middle)
-
-END (MEMMOVE)
-libc_hidden_builtin_def (MEMMOVE)
-
-
-/* Copies are split into 3 main cases: small copies of up to 16 bytes,
- medium copies of 17..96 bytes which are fully unrolled. Large copies
- of more than 96 bytes align the destination and use load-and-merge
- approach in the case src and dst addresses are unaligned not evenly,
- so that, actual loads and stores are always aligned.
- Large copies use the loops processing 64 bytes per iteration for
- unaligned case and 128 bytes per iteration for aligned ones.
-*/
-
-#define MEMCPY_PREFETCH_LDR 640
-
- .p2align 4
-ENTRY (MEMCPY)
-
- DELOUSE (0)
- DELOUSE (1)
- DELOUSE (2)
-
- add srcend, src, count
- cmp count, 16
- b.ls L(memcopy16)
- add dstend, dstin, count
- cmp count, 96
- b.hi L(memcopy_long)
-
- /* Medium copies: 17..96 bytes. */
- ldr A_q, [src], #16
- and tmp1, src, 15
- ldr E_q, [srcend, -16]
- cmp count, 64
- b.gt L(memcpy_copy96)
- cmp count, 48
- b.le L(bytes_17_to_48)
- /* 49..64 bytes */
- ldp B_q, C_q, [src]
- str E_q, [dstend, -16]
- stp A_q, B_q, [dstin]
- str C_q, [dstin, 32]
- ret
-
-L(bytes_17_to_48):
- /* 17..48 bytes*/
- cmp count, 32
- b.gt L(bytes_32_to_48)
- /* 17..32 bytes*/
- str A_q, [dstin]
- str E_q, [dstend, -16]
- ret
-
-L(bytes_32_to_48):
- /* 32..48 */
- ldr B_q, [src]
- str A_q, [dstin]
- str E_q, [dstend, -16]
- str B_q, [dstin, 16]
- ret
-
- .p2align 4
- /* Small copies: 0..16 bytes. */
-L(memcopy16):
- cmp count, 8
- b.lo L(bytes_0_to_8)
- ldr A_l, [src]
- ldr A_h, [srcend, -8]
- add dstend, dstin, count
- str A_l, [dstin]
- str A_h, [dstend, -8]
- ret
- .p2align 4
-
-L(bytes_0_to_8):
- tbz count, 2, L(bytes_0_to_3)
- ldr A_lw, [src]
- ldr A_hw, [srcend, -4]
- add dstend, dstin, count
- str A_lw, [dstin]
- str A_hw, [dstend, -4]
- ret
-
- /* Copy 0..3 bytes. Use a branchless sequence that copies the same
- byte 3 times if count==1, or the 2nd byte twice if count==2. */
-L(bytes_0_to_3):
- cbz count, 1f
- lsr tmp1, count, 1
- ldrb A_lw, [src]
- ldrb A_hw, [srcend, -1]
- add dstend, dstin, count
- ldrb B_lw, [src, tmp1]
- strb B_lw, [dstin, tmp1]
- strb A_hw, [dstend, -1]
- strb A_lw, [dstin]
-1:
- ret
-
- .p2align 4
-
-L(memcpy_copy96):
- /* Copying 65..96 bytes. A_q (first 16 bytes) and
- E_q(last 16 bytes) are already loaded. The size
- is large enough to benefit from aligned loads */
- bic src, src, 15
- ldp B_q, C_q, [src]
- /* Loaded 64 bytes, second 16-bytes chunk can be
- overlapping with the first chunk by tmp1 bytes.
- Stored 16 bytes. */
- sub dst, dstin, tmp1
- add count, count, tmp1
- /* The range of count being [65..96] becomes [65..111]
- after tmp [0..15] gets added to it,
- count now is <bytes-left-to-load>+48 */
- cmp count, 80
- b.gt L(copy96_medium)
- ldr D_q, [src, 32]
- stp B_q, C_q, [dst, 16]
- str D_q, [dst, 48]
- str A_q, [dstin]
- str E_q, [dstend, -16]
- ret
-
- .p2align 4
-L(copy96_medium):
- ldp D_q, G_q, [src, 32]
- cmp count, 96
- b.gt L(copy96_large)
- stp B_q, C_q, [dst, 16]
- stp D_q, G_q, [dst, 48]
- str A_q, [dstin]
- str E_q, [dstend, -16]
- ret
-
-L(copy96_large):
- ldr F_q, [src, 64]
- str B_q, [dst, 16]
- stp C_q, D_q, [dst, 32]
- stp G_q, F_q, [dst, 64]
- str A_q, [dstin]
- str E_q, [dstend, -16]
- ret
-
- .p2align 4
-L(memcopy_long):
- cmp count, 2048
- b.ls L(copy2048_large)
- ldr A_q, [src], #16
- and tmp1, src, 15
- bic src, src, 15
- ldp B_q, C_q, [src], #32
- sub dst, dstin, tmp1
- add count, count, tmp1
- add dst, dst, 16
- and tmp1, dst, 15
- ldp D_q, E_q, [src], #32
- str A_q, [dstin]
-
- /* Already loaded 64+16 bytes. Check if at
- least 64 more bytes left */
- subs count, count, 64+64+16
- b.lt L(loop128_exit0)
- cmp count, MEMCPY_PREFETCH_LDR + 64 + 32
- b.lt L(loop128)
- cbnz tmp1, L(dst_unaligned)
- sub count, count, MEMCPY_PREFETCH_LDR + 64 + 32
-
- .p2align 4
-
-L(loop128_prefetch):
- prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR]
- ldp F_q, G_q, [src], #32
- stp B_q, C_q, [dst], #32
- ldp H_q, I_q, [src], #32
- prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR]
- ldp B_q, C_q, [src], #32
- stp D_q, E_q, [dst], #32
- ldp D_q, E_q, [src], #32
- stp F_q, G_q, [dst], #32
- stp H_q, I_q, [dst], #32
- subs count, count, 128
- b.ge L(loop128_prefetch)
-
- add count, count, MEMCPY_PREFETCH_LDR + 64 + 32
- .p2align 4
-L(loop128):
- ldp F_q, G_q, [src], #32
- ldp H_q, I_q, [src], #32
- stp B_q, C_q, [dst], #32
- stp D_q, E_q, [dst], #32
- subs count, count, 64
- b.lt L(loop128_exit1)
- ldp B_q, C_q, [src], #32
- ldp D_q, E_q, [src], #32
- stp F_q, G_q, [dst], #32
- stp H_q, I_q, [dst], #32
- subs count, count, 64
- b.ge L(loop128)
-L(loop128_exit0):
- ldp F_q, G_q, [srcend, -64]
- ldp H_q, I_q, [srcend, -32]
- stp B_q, C_q, [dst], #32
- stp D_q, E_q, [dst]
- stp F_q, G_q, [dstend, -64]
- stp H_q, I_q, [dstend, -32]
- ret
-L(loop128_exit1):
- ldp B_q, C_q, [srcend, -64]
- ldp D_q, E_q, [srcend, -32]
- stp F_q, G_q, [dst], #32
- stp H_q, I_q, [dst]
- stp B_q, C_q, [dstend, -64]
- stp D_q, E_q, [dstend, -32]
- ret
-
-L(copy2048_large):
- and tmp1, dstin, 15
- bic dst, dstin, 15
- ldp D_l, D_h, [src]
- sub src, src, tmp1
- add count, count, tmp1 /* Count is now 16 too large. */
- ldp A_l, A_h, [src, 16]
- stp D_l, D_h, [dstin]
- ldp B_l, B_h, [src, 32]
- ldp C_l, C_h, [src, 48]
- ldp D_l, D_h, [src, 64]!
- subs count, count, 128 + 16 /* Test and readjust count. */
- b.ls L(last64)
-
-L(loop64):
- stp A_l, A_h, [dst, 16]
- ldp A_l, A_h, [src, 16]
- stp B_l, B_h, [dst, 32]
- ldp B_l, B_h, [src, 32]
- stp C_l, C_h, [dst, 48]
- ldp C_l, C_h, [src, 48]
- stp D_l, D_h, [dst, 64]
- ldp D_l, D_h, [src, 64]
- add dst, dst, 64
- add src, src, 64
- subs count, count, 64
- b.hi L(loop64)
-
- /* Write the last full set of 64 bytes. The remainder is at most 64
- bytes, so it is safe to always copy 64 bytes from the end even if
- there is just 1 byte left. */
-L(last64):
- ldp E_l, E_h, [srcend, -64]
- stp A_l, A_h, [dst, 16]
- ldp A_l, A_h, [srcend, -48]
- stp B_l, B_h, [dst, 32]
- ldp B_l, B_h, [srcend, -32]
- stp C_l, C_h, [dst, 48]
- ldp C_l, C_h, [srcend, -16]
- stp D_l, D_h, [dst, 64]
- stp E_l, E_h, [dstend, -64]
- stp A_l, A_h, [dstend, -48]
- stp B_l, B_h, [dstend, -32]
- stp C_l, C_h, [dstend, -16]
- ret
-
-
-L(dst_unaligned_tail):
- ldp C_q, D_q, [srcend, -64]
- ldp E_q, F_q, [srcend, -32]
- stp A_q, B_q, [dst], #32
- stp H_q, I_q, [dst], #16
- str G_q, [dst, tmp1]
- stp C_q, D_q, [dstend, -64]
- stp E_q, F_q, [dstend, -32]
- ret
-
-L(dst_unaligned):
- /* For the unaligned store case the code loads two
- aligned chunks and then merges them using ext
- instruction. This can be up to 30% faster than
- the the simple unaligned store access.
-
- Current state: tmp1 = dst % 16; C_q, D_q, E_q
- contains data yet to be stored. src and dst points
- to next-to-be-processed data. A_q, B_q contains
- data already stored before, count = bytes left to
- be load decremented by 64.
-
- The control is passed here if at least 64 bytes left
- to be loaded. The code does two aligned loads and then
- extracts (16-tmp1) bytes from the first register and
- tmp1 bytes from the next register forming the value
- for the aligned store.
-
- As ext instruction can only have it's index encoded
- as immediate. 15 code chunks process each possible
- index value. Computed goto is used to reach the
- required code. */
-
- /* Store the 16 bytes to dst and align dst for further
- operations, several bytes will be stored at this
- address once more */
-
- ldp F_q, G_q, [src], #32
- stp B_q, C_q, [dst], #32
- bic dst, dst, 15
- sub count, count, 32
- adrp tmp2, L(ext_table)
- add tmp2, tmp2, :lo12:L(ext_table)
- add tmp2, tmp2, tmp1, LSL #2
- ldr tmp3w, [tmp2]
- add tmp2, tmp2, tmp3w, SXTW
- br tmp2
-
-.p2align 4
- /* to make the loop in each chunk 16-bytes aligned */
- nop
-#define EXT_CHUNK(shft) \
-L(ext_size_ ## shft):;\
- ext A_v.16b, C_v.16b, D_v.16b, 16-shft;\
- ext B_v.16b, D_v.16b, E_v.16b, 16-shft;\
- ext H_v.16b, E_v.16b, F_v.16b, 16-shft;\
-1:;\
- stp A_q, B_q, [dst], #32;\
- prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR];\
- ldp C_q, D_q, [src], #32;\
- ext I_v.16b, F_v.16b, G_v.16b, 16-shft;\
- stp H_q, I_q, [dst], #32;\
- ext A_v.16b, G_v.16b, C_v.16b, 16-shft;\
- ext B_v.16b, C_v.16b, D_v.16b, 16-shft;\
- ldp F_q, G_q, [src], #32;\
- ext H_v.16b, D_v.16b, F_v.16b, 16-shft;\
- subs count, count, 64;\
- b.ge 1b;\
-2:;\
- ext I_v.16b, F_v.16b, G_v.16b, 16-shft;\
- b L(dst_unaligned_tail);
-
-EXT_CHUNK(1)
-EXT_CHUNK(2)
-EXT_CHUNK(3)
-EXT_CHUNK(4)
-EXT_CHUNK(5)
-EXT_CHUNK(6)
-EXT_CHUNK(7)
-EXT_CHUNK(8)
-EXT_CHUNK(9)
-EXT_CHUNK(10)
-EXT_CHUNK(11)
-EXT_CHUNK(12)
-EXT_CHUNK(13)
-EXT_CHUNK(14)
-EXT_CHUNK(15)
-
-.p2align 4
-L(move_long):
-1:
- add srcend, src, count
- add dstend, dstin, count
-
- and tmp1, dstend, 15
- ldr D_q, [srcend, -16]
- sub srcend, srcend, tmp1
- sub count, count, tmp1
- ldp A_q, B_q, [srcend, -32]
- str D_q, [dstend, -16]
- ldp C_q, D_q, [srcend, -64]!
- sub dstend, dstend, tmp1
- subs count, count, 128
- b.ls 2f
-
-.p2align 4
-1:
- subs count, count, 64
- stp A_q, B_q, [dstend, -32]
- ldp A_q, B_q, [srcend, -32]
- stp C_q, D_q, [dstend, -64]!
- ldp C_q, D_q, [srcend, -64]!
- b.hi 1b
-
- /* Write the last full set of 64 bytes. The remainder is at most 64
- bytes, so it is safe to always copy 64 bytes from the start even if
- there is just 1 byte left. */
-2:
- ldp E_q, F_q, [src, 32]
- ldp G_q, H_q, [src]
- stp A_q, B_q, [dstend, -32]
- stp C_q, D_q, [dstend, -64]
- stp E_q, F_q, [dstin, 32]
- stp G_q, H_q, [dstin]
-3: ret
-
-
-.p2align 4
-L(move_middle):
- cbz tmp1, 3f
- add srcend, src, count
- prfm PLDL1STRM, [srcend, -64]
- add dstend, dstin, count
- and tmp1, dstend, 15
- ldr D_q, [srcend, -16]
- sub srcend, srcend, tmp1
- sub count, count, tmp1
- ldr A_q, [srcend, -16]
- str D_q, [dstend, -16]
- ldr B_q, [srcend, -32]
- ldr C_q, [srcend, -48]
- ldr D_q, [srcend, -64]!
- sub dstend, dstend, tmp1
- subs count, count, 128
- b.ls 2f
-
-1:
- str A_q, [dstend, -16]
- ldr A_q, [srcend, -16]
- str B_q, [dstend, -32]
- ldr B_q, [srcend, -32]
- str C_q, [dstend, -48]
- ldr C_q, [srcend, -48]
- str D_q, [dstend, -64]!
- ldr D_q, [srcend, -64]!
- subs count, count, 64
- b.hi 1b
-
- /* Write the last full set of 64 bytes. The remainder is at most 64
- bytes, so it is safe to always copy 64 bytes from the start even if
- there is just 1 byte left. */
-2:
- ldr G_q, [src, 48]
- str A_q, [dstend, -16]
- ldr A_q, [src, 32]
- str B_q, [dstend, -32]
- ldr B_q, [src, 16]
- str C_q, [dstend, -48]
- ldr C_q, [src]
- str D_q, [dstend, -64]
- str G_q, [dstin, 48]
- str A_q, [dstin, 32]
- str B_q, [dstin, 16]
- str C_q, [dstin]
-3: ret
-
-
-END (MEMCPY)
- .section .rodata
- .p2align 4
-
-L(ext_table):
- /* The first entry is for the alignment of 0 and is never
- actually used (could be any value). */
- .word 0
- .word L(ext_size_1) -.
- .word L(ext_size_2) -.
- .word L(ext_size_3) -.
- .word L(ext_size_4) -.
- .word L(ext_size_5) -.
- .word L(ext_size_6) -.
- .word L(ext_size_7) -.
- .word L(ext_size_8) -.
- .word L(ext_size_9) -.
- .word L(ext_size_10) -.
- .word L(ext_size_11) -.
- .word L(ext_size_12) -.
- .word L(ext_size_13) -.
- .word L(ext_size_14) -.
- .word L(ext_size_15) -.
-
-libc_hidden_builtin_def (MEMCPY)
-#endif
diff --git a/sysdeps/aarch64/multiarch/memmove.c b/sysdeps/aarch64/multiarch/memmove.c
index 0d8c85b4..e69d8162 100644
--- a/sysdeps/aarch64/multiarch/memmove.c
+++ b/sysdeps/aarch64/multiarch/memmove.c
@@ -1,5 +1,5 @@
/* Multiple versions of memmove. AARCH64 version.
- Copyright (C) 2017-2019 Free Software Foundation, Inc.
+ Copyright (C) 2017-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -31,16 +31,13 @@ extern __typeof (__redirect_memmove) __libc_memmove;
extern __typeof (__redirect_memmove) __memmove_generic attribute_hidden;
extern __typeof (__redirect_memmove) __memmove_thunderx attribute_hidden;
extern __typeof (__redirect_memmove) __memmove_falkor attribute_hidden;
-extern __typeof (__redirect_memmove) __memmove_kunpeng attribute_hidden;
-
+
libc_ifunc (__libc_memmove,
- (IS_KUNPENG920(midr)
- ?__memmove_kunpeng
- :(IS_THUNDERX (midr)
+ (IS_THUNDERX (midr)
? __memmove_thunderx
: (IS_FALKOR (midr) || IS_PHECDA (midr)
? __memmove_falkor
- : __memmove_generic))));
+ : __memmove_generic)));
# undef memmove
strong_alias (__libc_memmove, memmove);
diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c
index 0f7ad0c8..f7ae291e 100644
--- a/sysdeps/aarch64/multiarch/memset.c
+++ b/sysdeps/aarch64/multiarch/memset.c
@@ -1,5 +1,5 @@
/* Multiple versions of memset. AARCH64 version.
- Copyright (C) 2017-2019 Free Software Foundation, Inc.
+ Copyright (C) 2017-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -29,15 +29,15 @@
extern __typeof (__redirect_memset) __libc_memset;
extern __typeof (__redirect_memset) __memset_falkor attribute_hidden;
-extern __typeof (__redirect_memset) __memset_generic attribute_hidden;
extern __typeof (__redirect_memset) __memset_kunpeng attribute_hidden;
+extern __typeof (__redirect_memset) __memset_generic attribute_hidden;
libc_ifunc (__libc_memset,
- IS_KUNPENG920(midr)
- ?__memset_kunpeng
- :((IS_FALKOR (midr) || IS_PHECDA (midr)) && zva_size == 64
- ?__memset_falkor
- :__memset_generic));
+ IS_KUNPENG920 (midr)
+ ?__memset_kunpeng
+ : ((IS_FALKOR (midr) || IS_PHECDA (midr)) && zva_size == 64
+ ? __memset_falkor
+ : __memset_generic));
# undef memset
strong_alias (__libc_memset, memset);
diff --git a/sysdeps/aarch64/multiarch/memset_kunpeng.S b/sysdeps/aarch64/multiarch/memset_kunpeng.S
index 22a3d4a7..a03441ae 100644
--- a/sysdeps/aarch64/multiarch/memset_kunpeng.S
+++ b/sysdeps/aarch64/multiarch/memset_kunpeng.S
@@ -1,4 +1,5 @@
-/* Copyright (C) 2012-2019 Free Software Foundation, Inc.
+/* Optimized memset for Huawei Kunpeng processor.
+ Copyright (C) 2012-2019 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -14,7 +15,7 @@
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library. If not, see
- <http://www.gnu.org/licenses/>. */
+ <https://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include <sysdeps/aarch64/memset-reg.h>
@@ -35,7 +36,7 @@ ENTRY_ALIGN (MEMSET, 6)
dup v0.16B, valw
add dstend, dstin, count
-
+
cmp count, 128
b.hs L(set_long)
@@ -44,7 +45,7 @@ ENTRY_ALIGN (MEMSET, 6)
/* Set 16..127 bytes. */
str q0, [dstin]
- tbnz count, 6, L(set112)
+ tbnz count, 6, L(set127)
str q0, [dstend, -16]
tbz count, 5, 1f
str q0, [dstin, 16]
@@ -53,26 +54,14 @@ ENTRY_ALIGN (MEMSET, 6)
.p2align 4
/* Set 64..127 bytes. Write 64 bytes from the start and
- 32 bytes from the end. */
-L(set112):
- ands tmp1, dstin, 15
- bne 2f
- str q0, [dstin, 16]
- stp q0, q0, [dstin, 32]//finish 64
- tbz count, 5, 1f
- stp q0, q0, [dstin, 64] //大于96, finish 96
-1: stp q0, q0, [dstend, -32]
+ 64 bytes from the end. */
+L(set127):
+ stp q0, q0, [dstin, 16]
+ str q0, [dstin, 48]
+ stp q0, q0, [dstend, -64]
+ stp q0, q0, [dstend, -32]
ret
- .p2align 4
-2: bic dst, dstin, 15//回退到16对齐
- stp q0,q0, [dst, 16]
- str q0, [dst, 48]
- tbz count, 5, 3f //大于96
- stp q0, q0, [dst, 64]
-3: stp q0, q0, [dstend, -48]//finish 64~80
- str q0, [dstend, -16]//finish 96
- ret
-
+
.p2align 4
/* Set 0..15 bytes. */
L(less16):
@@ -90,10 +79,9 @@ L(less8):
tbz count, 1, 3f
str h0, [dstend, -2]
3: ret
-
+
.p2align 4
-L(set_long):
- and valw, valw, 255
+L(set_long):
bic dst, dstin, 15
str q0, [dstin]
sub count, dstend, dst /* Count is 16 too large. */
@@ -103,19 +91,21 @@ L(set_long):
stp q0, q0, [dst, 64]!
subs count, count, 64
b.lo 1f
- stp q0, q0, [dst, 32]
+ stp q0, q0, [dst, 32]
stp q0, q0, [dst, 64]!
subs count, count, 64
b.lo 1f
- stp q0, q0, [dst, 32]
+ stp q0, q0, [dst, 32]
stp q0, q0, [dst, 64]!
subs count, count, 64
- b.hs 1b
-
-1: tbz count, 5, 2f
- str q0, [dst, 32]
- str q0, [dst, 48]
-2: stp q0, q0, [dstend, -32]
+ b.lo 1f
+ stp q0, q0, [dst, 32]
+ stp q0, q0, [dst, 64]!
+ subs count, count, 64
+ b.hs 1b
+
+1: stp q0, q0, [dstend, -64]
+ stp q0, q0, [dstend, -32]
ret
END (MEMSET)
diff --git a/sysdeps/aarch64/strcpy.S b/sysdeps/aarch64/strcpy.S
index 290bcf8d..a64c5980 100644
--- a/sysdeps/aarch64/strcpy.S
+++ b/sysdeps/aarch64/strcpy.S
@@ -1,5 +1,5 @@
/* strcpy/stpcpy - copy a string returning pointer to start/end.
- Copyright (C) 2013-2019 Free Software Foundation, Inc.
+ Copyright (C) 2013-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -14,7 +14,7 @@
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
+ <http://www.gnu.org/licenses/>. */
/* To build as stpcpy, define BUILD_STPCPY before compiling this file.
@@ -232,7 +232,7 @@ L(entry_no_page_cross):
#ifdef __AARCH64EB__
rev64 datav.16b, datav.16b
#endif
- /* loc */
+ /* calculate the loc value */
cmeq datav.16b, datav.16b, #0
mov data1, datav.d[0]
mov data2, datav.d[1]
diff --git a/sysdeps/aarch64/strnlen.S b/sysdeps/aarch64/strnlen.S
index a57753b0..0a42f404 100644
--- a/sysdeps/aarch64/strnlen.S
+++ b/sysdeps/aarch64/strnlen.S
@@ -1,6 +1,6 @@
/* strnlen - calculate the length of a string with limit.
- Copyright (C) 2013-2019 Free Software Foundation, Inc.
+ Copyright (C) 2013-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -16,7 +16,7 @@
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library. If not, see
- <https://www.gnu.org/licenses/>. */
+ <http://www.gnu.org/licenses/>. */
#include <sysdep.h>
diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
index b152c4e3..e60485b0 100644
--- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
+++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
@@ -1,6 +1,6 @@
/* Initialize CPU feature data. AArch64 version.
This file is part of the GNU C Library.
- Copyright (C) 2017-2019 Free Software Foundation, Inc.
+ Copyright (C) 2017-2018 Free Software Foundation, Inc.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
@@ -36,7 +36,7 @@ static struct cpu_list cpu_list[] = {
{"thunderx2t99", 0x431F0AF0},
{"thunderx2t99p1", 0x420F5160},
{"phecda", 0x680F0000},
- {"kunpeng920", 0x481FD010},
+ {"kunpeng920", 0x481FD010},
{"generic", 0x0}
};
diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
index 4faeed7a..ed77cde7 100644
--- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
+++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
@@ -1,6 +1,6 @@
/* Initialize CPU feature data. AArch64 version.
This file is part of the GNU C Library.
- Copyright (C) 2017-2019 Free Software Foundation, Inc.
+ Copyright (C) 2017-2018 Free Software Foundation, Inc.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
@@ -51,8 +51,9 @@
#define IS_PHECDA(midr) (MIDR_IMPLEMENTOR(midr) == 'h' \
&& MIDR_PARTNUM(midr) == 0x000)
-#define IS_KUNPENG920(midr) (MIDR_IMPLEMENTOR(midr) == 'H' \
- && MIDR_PARTNUM(midr) == 0xd01)
+
+#define IS_KUNPENG920(midr) (MIDR_IMPLEMENTOR(midr) == 'H' \
+ && MIDR_PARTNUM(midr) == 0xd01)
struct cpu_features
{
--
2.19.1
Loading...
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
1
https://gitee.com/hwzjyggsddu/glibc.git
[email protected]:hwzjyggsddu/glibc.git
hwzjyggsddu
glibc
glibc
openEuler-20.03-LTS

搜索帮助