diff --git a/LICENSE b/LICENSE
index 20a4b7717cf5e46e2def2ecd47756baf3061d2bd..2543b82ed92d0bdc5f3fdfa5047144db3c7e9014 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,11 +1,6 @@
-MIT OR Apache-2.0 WITH LLVM-exception
-=====================================
-
-
MIT License
------------
-Copyright (c) 1999-2022, Arm Limited.
+Copyright (c) 1999-2019, Arm Limited.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
@@ -24,226 +19,3 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
-
-
-Apache-2.0 WITH LLVM-exception
-------------------------------
-
- Apache License
- Version 2.0, January 2004
- http://www.apache.org/licenses/
-
- TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
- 1. Definitions.
-
- "License" shall mean the terms and conditions for use, reproduction,
- and distribution as defined by Sections 1 through 9 of this document.
-
- "Licensor" shall mean the copyright owner or entity authorized by
- the copyright owner that is granting the License.
-
- "Legal Entity" shall mean the union of the acting entity and all
- other entities that control, are controlled by, or are under common
- control with that entity. For the purposes of this definition,
- "control" means (i) the power, direct or indirect, to cause the
- direction or management of such entity, whether by contract or
- otherwise, or (ii) ownership of fifty percent (50%) or more of the
- outstanding shares, or (iii) beneficial ownership of such entity.
-
- "You" (or "Your") shall mean an individual or Legal Entity
- exercising permissions granted by this License.
-
- "Source" form shall mean the preferred form for making modifications,
- including but not limited to software source code, documentation
- source, and configuration files.
-
- "Object" form shall mean any form resulting from mechanical
- transformation or translation of a Source form, including but
- not limited to compiled object code, generated documentation,
- and conversions to other media types.
-
- "Work" shall mean the work of authorship, whether in Source or
- Object form, made available under the License, as indicated by a
- copyright notice that is included in or attached to the work
- (an example is provided in the Appendix below).
-
- "Derivative Works" shall mean any work, whether in Source or Object
- form, that is based on (or derived from) the Work and for which the
- editorial revisions, annotations, elaborations, or other modifications
- represent, as a whole, an original work of authorship. For the purposes
- of this License, Derivative Works shall not include works that remain
- separable from, or merely link (or bind by name) to the interfaces of,
- the Work and Derivative Works thereof.
-
- "Contribution" shall mean any work of authorship, including
- the original version of the Work and any modifications or additions
- to that Work or Derivative Works thereof, that is intentionally
- submitted to Licensor for inclusion in the Work by the copyright owner
- or by an individual or Legal Entity authorized to submit on behalf of
- the copyright owner. For the purposes of this definition, "submitted"
- means any form of electronic, verbal, or written communication sent
- to the Licensor or its representatives, including but not limited to
- communication on electronic mailing lists, source code control systems,
- and issue tracking systems that are managed by, or on behalf of, the
- Licensor for the purpose of discussing and improving the Work, but
- excluding communication that is conspicuously marked or otherwise
- designated in writing by the copyright owner as "Not a Contribution."
-
- "Contributor" shall mean Licensor and any individual or Legal Entity
- on behalf of whom a Contribution has been received by Licensor and
- subsequently incorporated within the Work.
-
- 2. Grant of Copyright License. Subject to the terms and conditions of
- this License, each Contributor hereby grants to You a perpetual,
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
- copyright license to reproduce, prepare Derivative Works of,
- publicly display, publicly perform, sublicense, and distribute the
- Work and such Derivative Works in Source or Object form.
-
- 3. Grant of Patent License. Subject to the terms and conditions of
- this License, each Contributor hereby grants to You a perpetual,
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
- (except as stated in this section) patent license to make, have made,
- use, offer to sell, sell, import, and otherwise transfer the Work,
- where such license applies only to those patent claims licensable
- by such Contributor that are necessarily infringed by their
- Contribution(s) alone or by combination of their Contribution(s)
- with the Work to which such Contribution(s) was submitted. If You
- institute patent litigation against any entity (including a
- cross-claim or counterclaim in a lawsuit) alleging that the Work
- or a Contribution incorporated within the Work constitutes direct
- or contributory patent infringement, then any patent licenses
- granted to You under this License for that Work shall terminate
- as of the date such litigation is filed.
-
- 4. Redistribution. You may reproduce and distribute copies of the
- Work or Derivative Works thereof in any medium, with or without
- modifications, and in Source or Object form, provided that You
- meet the following conditions:
-
- (a) You must give any other recipients of the Work or
- Derivative Works a copy of this License; and
-
- (b) You must cause any modified files to carry prominent notices
- stating that You changed the files; and
-
- (c) You must retain, in the Source form of any Derivative Works
- that You distribute, all copyright, patent, trademark, and
- attribution notices from the Source form of the Work,
- excluding those notices that do not pertain to any part of
- the Derivative Works; and
-
- (d) If the Work includes a "NOTICE" text file as part of its
- distribution, then any Derivative Works that You distribute must
- include a readable copy of the attribution notices contained
- within such NOTICE file, excluding those notices that do not
- pertain to any part of the Derivative Works, in at least one
- of the following places: within a NOTICE text file distributed
- as part of the Derivative Works; within the Source form or
- documentation, if provided along with the Derivative Works; or,
- within a display generated by the Derivative Works, if and
- wherever such third-party notices normally appear. The contents
- of the NOTICE file are for informational purposes only and
- do not modify the License. You may add Your own attribution
- notices within Derivative Works that You distribute, alongside
- or as an addendum to the NOTICE text from the Work, provided
- that such additional attribution notices cannot be construed
- as modifying the License.
-
- You may add Your own copyright statement to Your modifications and
- may provide additional or different license terms and conditions
- for use, reproduction, or distribution of Your modifications, or
- for any such Derivative Works as a whole, provided Your use,
- reproduction, and distribution of the Work otherwise complies with
- the conditions stated in this License.
-
- 5. Submission of Contributions. Unless You explicitly state otherwise,
- any Contribution intentionally submitted for inclusion in the Work
- by You to the Licensor shall be under the terms and conditions of
- this License, without any additional terms or conditions.
- Notwithstanding the above, nothing herein shall supersede or modify
- the terms of any separate license agreement you may have executed
- with Licensor regarding such Contributions.
-
- 6. Trademarks. This License does not grant permission to use the trade
- names, trademarks, service marks, or product names of the Licensor,
- except as required for reasonable and customary use in describing the
- origin of the Work and reproducing the content of the NOTICE file.
-
- 7. Disclaimer of Warranty. Unless required by applicable law or
- agreed to in writing, Licensor provides the Work (and each
- Contributor provides its Contributions) on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
- implied, including, without limitation, any warranties or conditions
- of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
- PARTICULAR PURPOSE. You are solely responsible for determining the
- appropriateness of using or redistributing the Work and assume any
- risks associated with Your exercise of permissions under this License.
-
- 8. Limitation of Liability. In no event and under no legal theory,
- whether in tort (including negligence), contract, or otherwise,
- unless required by applicable law (such as deliberate and grossly
- negligent acts) or agreed to in writing, shall any Contributor be
- liable to You for damages, including any direct, indirect, special,
- incidental, or consequential damages of any character arising as a
- result of this License or out of the use or inability to use the
- Work (including but not limited to damages for loss of goodwill,
- work stoppage, computer failure or malfunction, or any and all
- other commercial damages or losses), even if such Contributor
- has been advised of the possibility of such damages.
-
- 9. Accepting Warranty or Additional Liability. While redistributing
- the Work or Derivative Works thereof, You may choose to offer,
- and charge a fee for, acceptance of support, warranty, indemnity,
- or other liability obligations and/or rights consistent with this
- License. However, in accepting such obligations, You may act only
- on Your own behalf and on Your sole responsibility, not on behalf
- of any other Contributor, and only if You agree to indemnify,
- defend, and hold each Contributor harmless for any liability
- incurred by, or claims asserted against, such Contributor by reason
- of your accepting any such warranty or additional liability.
-
- END OF TERMS AND CONDITIONS
-
- APPENDIX: How to apply the Apache License to your work.
-
- To apply the Apache License to your work, attach the following
- boilerplate notice, with the fields enclosed by brackets "[]"
- replaced with your own identifying information. (Don't include
- the brackets!) The text should be enclosed in the appropriate
- comment syntax for the file format. We also recommend that a
- file or class name and description of purpose be included on the
- same "printed page" as the copyright notice for easier
- identification within third-party archives.
-
- Copyright [yyyy] [name of copyright owner]
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-
-
---- LLVM Exceptions to the Apache 2.0 License ----
-
-As an exception, if, as a result of your compiling your source code, portions
-of this Software are embedded into an Object form of such source code, you
-may redistribute such embedded portions in such Object form without complying
-with the conditions of Sections 4(a), 4(b) and 4(d) of the License.
-
-In addition, if you combine or link compiled forms of this Software with
-software that is licensed under the GPLv2 ("Combined Software") and if a
-court of competent jurisdiction determines that the patent provision (Section
-3), the indemnity provision (Section 9) or other Section of the License
-conflicts with the conditions of the GPLv2, you may retroactively and
-prospectively choose to deem waived or otherwise exclude such Section(s) of
-the License, but only in their entirety and only with respect to the Combined
-Software.
diff --git a/Makefile b/Makefile
index c487896728c2cd3c877dad0f52256ddd1e5ebbe8..169f89e2c9d6be3f53a91780447652ee7917b28e 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
# Makefile - requires GNU make
#
-# Copyright (c) 2018-2022, Arm Limited.
-# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+# Copyright (c) 2018-2020, Arm Limited.
+# SPDX-License-Identifier: MIT
srcdir = .
prefix = /usr
@@ -11,7 +11,6 @@ includedir = $(prefix)/include
# Configure these in config.mk, do not make changes in this file.
SUBS = math string networking
-PLSUBS = math
HOST_CC = cc
HOST_CFLAGS = -std=c99 -O2
HOST_LDFLAGS =
@@ -21,7 +20,6 @@ CPPFLAGS =
CFLAGS = -std=c99 -O2
CFLAGS_SHARED = -fPIC
CFLAGS_ALL = -Ibuild/include $(CPPFLAGS) $(CFLAGS)
-CFLAGS_PL = -Ibuild/pl/include $(CPPFLAGS) $(CFLAGS) -DPL
LDFLAGS =
LDLIBS =
AR = $(CROSS_COMPILE)ar
@@ -53,7 +51,6 @@ $(DIRS):
mkdir -p $@
$(filter %.os,$(ALL_FILES)): CFLAGS_ALL += $(CFLAGS_SHARED)
-$(filter %.os,$(ALL_FILES)): CFLAGS_PL += $(CFLAGS_SHARED)
build/%.o: $(srcdir)/%.S
$(CC) $(CFLAGS_ALL) -c -o $@ $<
diff --git a/OAT.xml b/OAT.xml
index ab48a784237e62c8f8595b1b124e3251991afade..71acb93c33930961bd73a8c0eed2ddee84da6bd7 100644
--- a/OAT.xml
+++ b/OAT.xml
@@ -19,7 +19,7 @@
policylist:
1. policy: If the OAT-Default.xml policies do not meet your requirements, please add policies here.
2. policyitem: The fields type, name, path, desc are required, and the fields rule, group, filefilter are optional; the default value is:
-
+
3. policyitem type:
"compatibility" is used to check license compatibility in the specified path;
"license" is used to check source license header in the specified path;
@@ -49,43 +49,10 @@ All configurations in this file will be merged to OAT-Default.xml, if you have a
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
-
-
-
+
diff --git a/README b/README
index a2143a28488abe9cbdb629698a3f22d353489b9a..9e1a34fdc65d9acd27964255a42211af5ef06efa 100644
--- a/README
+++ b/README
@@ -2,17 +2,14 @@ Arm Optimized Routines
----------------------
This repository contains implementations of library functions
-provided by Arm. The outbound license is available under a dual
-license, at the user’s election, as reflected in the LICENSE file.
-Contributions to this project are accepted, but Contributors have
-to sign an Assignment Agreement, please follow the instructions in
+provided by Arm under the MIT License (see LICENSE). Contributions
+to this project are accepted, but Contributors have to sign an
+Assignment Agreement; please follow the instructions in
contributor-agreement.pdf. This is needed so upstreaming code
-to projects that require copyright assignment is possible. Further
-contribution requirements are documented in README.contributors of
-the appropriate subdirectory.
+to projects that require copyright assignment is possible.
Regular quarterly releases are tagged as vYY.MM; the latest
-release is v23.01.
+release is v21.02.
Source code layout:
@@ -27,7 +24,6 @@ networking/test/ - networking test and benchmark related sources.
string/ - string routines subproject sources.
string/include/ - string library public headers.
string/test/ - string test and benchmark related sources.
-pl/... - separately maintained performance library code.
The steps to build the target libraries and run the tests:
diff --git a/config.mk.dist b/config.mk.dist
index c4a6dba4b463f669c8a27bac66029c508ed2c875..177e1ac4f53a3e14772a7560f7f79eba86ffe5e7 100644
--- a/config.mk.dist
+++ b/config.mk.dist
@@ -1,14 +1,11 @@
# Example config.mk
#
-# Copyright (c) 2018-2022, Arm Limited.
-# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+# Copyright (c) 2018-2020, Arm Limited.
+# SPDX-License-Identifier: MIT
# Subprojects to build
SUBS = math string networking
-# Subsubprojects to build if subproject pl is built
-PLSUBS = math
-
# Target architecture: aarch64, arm or x86_64
ARCH = aarch64
@@ -59,22 +56,8 @@ math-cflags += -ffp-contract=fast -fno-math-errno
# Use with clang.
#math-cflags += -ffp-contract=fast
-# Disable/enable SVE vector math code and tests
-WANT_SVE_MATH = 0
-ifeq ($(WANT_SVE_MATH), 1)
- math-cflags += -march=armv8.2-a+sve
-endif
-math-cflags += -DWANT_SVE_MATH=$(WANT_SVE_MATH)
-
-# If defined to 1, set errno in math functions according to ISO C. Many math
-# libraries do not set errno, so this is 0 by default. It may need to be
-# set to 1 if math.h has (math_errhandling & MATH_ERRNO) != 0.
-WANT_ERRNO = 0
-math-cflags += -DWANT_ERRNO=$(WANT_ERRNO)
-
-# If set to 1, set fenv in vector math routines.
-WANT_SIMD_EXCEPT = 0
-math-cflags += -DWANT_SIMD_EXCEPT=$(WANT_SIMD_EXCEPT)
+# Disable vector math code
+#math-cflags += -DWANT_VMATH=0
# Disable fenv checks
#math-ulpflags = -q -f
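
A note on the WANT_ERRNO block removed above: whether a libm reports domain
and range errors through errno is exactly what ISO C's math_errhandling macro
advertises, so a build with WANT_ERRNO=0 pairs with a math.h where
(math_errhandling & MATH_ERRNO) == 0. A minimal stand-alone check, plain C99
with nothing repository-specific assumed:

#include <errno.h>
#include <math.h>
#include <stdio.h>

int
main (void)
{
  volatile double neg = -1.0; /* volatile blocks constant folding */
  errno = 0;
  double y = log (neg);       /* domain error; y is a NaN */
  if (math_errhandling & MATH_ERRNO)
    printf ("libm sets errno: errno == EDOM is %d\n", errno == EDOM);
  else
    printf ("libm does not promise errno (y = %f, errno = %d)\n", y, errno);
  return 0;
}
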
diff --git a/math/Dir.mk b/math/Dir.mk
index d6385d2bf5173daa6ea0b68d358749c5e7c45154..3b841ab71955cc69efff77a1e1fee21938422371 100644
--- a/math/Dir.mk
+++ b/math/Dir.mk
@@ -1,14 +1,12 @@
# Makefile fragment - requires GNU make
#
-# Copyright (c) 2019-2022, Arm Limited.
-# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+# Copyright (c) 2019, Arm Limited.
+# SPDX-License-Identifier: MIT
S := $(srcdir)/math
B := build/math
math-lib-srcs := $(wildcard $(S)/*.[cS])
-math-lib-srcs += $(wildcard $(S)/$(ARCH)/*.[cS])
-
math-test-srcs := \
$(S)/test/mathtest.c \
$(S)/test/mathbench.c \
@@ -17,7 +15,6 @@ math-test-srcs := \
math-test-host-srcs := $(wildcard $(S)/test/rtest/*.[cS])
math-includes := $(patsubst $(S)/%,build/%,$(wildcard $(S)/include/*.h))
-math-test-includes := $(patsubst $(S)/%,build/include/%,$(wildcard $(S)/test/*.h))
math-libs := \
build/lib/libmathlib.so \
@@ -45,11 +42,10 @@ math-files := \
$(math-tools) \
$(math-host-tools) \
$(math-includes) \
- $(math-test-includes) \
-all-math: $(math-libs) $(math-tools) $(math-includes) $(math-test-includes)
+all-math: $(math-libs) $(math-tools) $(math-includes)
-$(math-objs): $(math-includes) $(math-test-includes)
+$(math-objs): $(math-includes)
$(math-objs): CFLAGS_ALL += $(math-cflags)
$(B)/test/mathtest.o: CFLAGS_ALL += -fmath-errno
$(math-host-objs): CC = $(HOST_CC)
@@ -87,9 +83,6 @@ build/bin/ulp: $(B)/test/ulp.o build/lib/libmathlib.a
build/include/%.h: $(S)/include/%.h
cp $< $@
-build/include/test/%.h: $(S)/test/%.h
- cp $< $@
-
build/bin/%.sh: $(S)/test/%.sh
cp $< $@
@@ -103,7 +96,7 @@ check-math-rtest: $(math-host-tools) $(math-tools)
cat $(math-rtests) | build/bin/rtest | $(EMULATOR) build/bin/mathtest $(math-testflags)
check-math-ulp: $(math-tools)
- ULPFLAGS="$(math-ulpflags)" WANT_SIMD_EXCEPT="$(WANT_SIMD_EXCEPT)" build/bin/runulp.sh $(EMULATOR)
+ ULPFLAGS="$(math-ulpflags)" build/bin/runulp.sh $(EMULATOR)
check-math: check-math-test check-math-rtest check-math-ulp
diff --git a/math/README.contributors b/math/README.contributors
deleted file mode 100644
index 33e7ba376e419301eaf8e51fc7abe4ad10a31350..0000000000000000000000000000000000000000
--- a/math/README.contributors
+++ /dev/null
@@ -1,78 +0,0 @@
-STYLE REQUIREMENTS
-==================
-
-1. Most code in this sub-directory is expected to be upstreamed into glibc so
- the GNU Coding Standard and glibc specific conventions should be followed
- to ease upstreaming.
-
-2. ABI and symbols: the code should be written so it is suitable for inclusion
- into a libc with minimal changes. This e.g. means that internal symbols
- should be hidden and in the implementation reserved namespace according to
- ISO C and POSIX rules. If possible the built shared libraries and static
- library archives should be usable to override libc symbols at link time (or
- at runtime via LD_PRELOAD). This requires the symbols to follow the glibc ABI
-   (other than symbol versioning); this cannot be done reliably for static
-   linking, so this is a best-effort requirement.
-
-3. API: include headers should be suitable for benchmarking and testing code
- and should not conflict with libc headers.
-
-
-CONTRIBUTION GUIDELINES FOR math SUB-DIRECTORY
-==============================================
-
-1. Math functions have quality and performance requirements.
-
-2. Quality:
- - Worst-case ULP error should be small in the entire input domain (for most
- common double precision scalar functions the target is < 0.66 ULP error,
-     and < 1 ULP for single precision; even a performance-optimized function
-     variant should not have > 5 ULP error if the goal is to be a drop-in
-     replacement for a standard math function). This should be tested
-     statistically (or on all inputs if possible in a reasonable amount of time).
- The ulp tool is for this and runulp.sh should be updated for new functions.
-
- - All standard rounding modes need to be supported but in non-default rounding
- modes the quality requirement can be relaxed. (Non-nearest rounded
- computation can be slow and inaccurate but has to be correct for conformance
- reasons.)
-
- - Special cases and error handling need to follow ISO C Annex F requirements,
-     POSIX requirements, IEEE 754-2008 requirements and Glibc requirements:
- https://www.gnu.org/software/libc/manual/html_mono/libc.html#Errors-in-Math-Functions
- this should be tested by direct tests (glibc test system may be used for it).
-
- - Error handling code should be decoupled from the approximation code as much
- as possible. (There are helper functions, these take care of errno as well
- as exception raising.)
-
- - Vector math code does not need to work in non-nearest rounding mode and error
- handling side effects need not happen (fenv exceptions and errno), but the
- result should be correct (within quality requirements, which are lower for
- vector code than for scalar code).
-
- - Error bounds of the approximation should be clearly documented.
-
- - The code should build and pass tests on arm, aarch64 and x86_64 GNU linux
- systems. (Routines and features can be disabled on specific targets, but
- the build must complete). On aarch64, both little- and big-endian targets
- are supported as well as valid combinations of architecture extensions.
- The configurations that should be tested depend on the contribution.
-
-3. Performance:
- - Common math code should be benchmarked on modern aarch64 microarchitectures
- over typical inputs.
-
- - Performance improvements should be documented (relative numbers can be
- published; it is enough to use the mathbench microbenchmark tool which should
- be updated for new functions).
-
- - Attention should be paid to the compilation flags: for aarch64 fma
- contraction should be on and math errno turned off so some builtins can be
- inlined.
-
- - The code should be reasonably performant on x86_64 too, e.g. some rounding
-     instructions and fma may not be available on x86_64, so such builtins turn
-     into libc calls with slow code. Such a slowdown is not acceptable; a faster
- should be present: glibc and bionic use the same code on all targets. (This
- does not apply to vector math code).
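
The ULP targets in the guidelines deleted above are easier to read with a
concrete definition at hand. A hedged sketch of what "worst-case ULP error"
means for one input (plain C99 plus -lm; this is not the repository's ulp
tool, which sweeps whole input ranges statistically):

#include <math.h>
#include <stdio.h>

/* Size of one ULP at a finite, non-zero float y. */
static float
ulp_at (float y)
{
  float a = fabsf (y);
  return nextafterf (a, INFINITY) - a;
}

int
main (void)
{
  float x = 0x1.921fb6p+0f;       /* the float nearest pi/2 */
  double want = sin ((double) x); /* higher-precision reference */
  float got = sinf (x);
  double err = fabs ((double) got - want) / (double) ulp_at ((float) want);
  printf ("sinf error at %a: %.3f ULP\n", x, err);
  return 0;
}

The < 0.66 ULP double-precision target means this quantity, maximized over
the whole input domain, stays below 0.66.
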
diff --git a/math/aarch64/v_cos.c b/math/aarch64/v_cos.c
deleted file mode 100644
index 9a73575bce896a9cc54930bb5cd7586b316aa5c0..0000000000000000000000000000000000000000
--- a/math/aarch64/v_cos.c
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * Double-precision vector cos function.
- *
- * Copyright (c) 2019-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "mathlib.h"
-#include "v_math.h"
-
-static const struct data
-{
- float64x2_t poly[7];
- float64x2_t range_val, shift, inv_pi, half_pi, pi_1, pi_2, pi_3;
-} data = {
- /* Worst-case error is 3.3 ulp in [-pi/2, pi/2]. */
- .poly = { V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7),
- V2 (-0x1.a01a019936f27p-13), V2 (0x1.71de37a97d93ep-19),
- V2 (-0x1.ae633919987c6p-26), V2 (0x1.60e277ae07cecp-33),
- V2 (-0x1.9e9540300a1p-41) },
- .inv_pi = V2 (0x1.45f306dc9c883p-2),
- .half_pi = V2 (0x1.921fb54442d18p+0),
- .pi_1 = V2 (0x1.921fb54442d18p+1),
- .pi_2 = V2 (0x1.1a62633145c06p-53),
- .pi_3 = V2 (0x1.c1cd129024e09p-106),
- .shift = V2 (0x1.8p52),
- .range_val = V2 (0x1p23)
-};
-
-#define C(i) d->poly[i]
-
-static float64x2_t VPCS_ATTR NOINLINE
-special_case (float64x2_t x, float64x2_t y, uint64x2_t odd, uint64x2_t cmp)
-{
- y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
- return v_call_f64 (cos, x, y, cmp);
-}
-
-float64x2_t VPCS_ATTR V_NAME_D1 (cos) (float64x2_t x)
-{
- const struct data *d = ptr_barrier (&data);
- float64x2_t n, r, r2, r3, r4, t1, t2, t3, y;
- uint64x2_t odd, cmp;
-
-#if WANT_SIMD_EXCEPT
- r = vabsq_f64 (x);
- cmp = vcgeq_u64 (vreinterpretq_u64_f64 (r),
- vreinterpretq_u64_f64 (d->range_val));
- if (unlikely (v_any_u64 (cmp)))
- /* If fenv exceptions are to be triggered correctly, set any special lanes
- to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by
-       the special-case handler later. */
- r = vbslq_f64 (cmp, v_f64 (1.0), r);
-#else
- cmp = vcageq_f64 (x, d->range_val);
- r = x;
-#endif
-
- /* n = rint((|x|+pi/2)/pi) - 0.5. */
- n = vfmaq_f64 (d->shift, d->inv_pi, vaddq_f64 (r, d->half_pi));
- odd = vshlq_n_u64 (vreinterpretq_u64_f64 (n), 63);
- n = vsubq_f64 (n, d->shift);
- n = vsubq_f64 (n, v_f64 (0.5));
-
- /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
- r = vfmsq_f64 (r, d->pi_1, n);
- r = vfmsq_f64 (r, d->pi_2, n);
- r = vfmsq_f64 (r, d->pi_3, n);
-
- /* sin(r) poly approx. */
- r2 = vmulq_f64 (r, r);
- r3 = vmulq_f64 (r2, r);
- r4 = vmulq_f64 (r2, r2);
-
- t1 = vfmaq_f64 (C (4), C (5), r2);
- t2 = vfmaq_f64 (C (2), C (3), r2);
- t3 = vfmaq_f64 (C (0), C (1), r2);
-
- y = vfmaq_f64 (t1, C (6), r4);
- y = vfmaq_f64 (t2, y, r4);
- y = vfmaq_f64 (t3, y, r4);
- y = vfmaq_f64 (r, y, r3);
-
- if (unlikely (v_any_u64 (cmp)))
- return special_case (x, y, odd, cmp);
- return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
-}
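
The reduction described by the comments in the deleted v_cos.c reads
naturally in scalar form. A hedged model, not the vector code: libm sin
stands in for the degree-15 odd polynomial, pi is applied in one piece
instead of the pi_1/pi_2/pi_3 split, M_PI/M_PI_2 are assumed from POSIX
<math.h>, and there is no range check, so it is illustrative only for |x|
well below range_val:

#include <math.h>

static double
cos_model (double x)
{
  double ax = fabs (x); /* cos is even */
  /* m = rint((|x| + pi/2)/pi); the vector code forms the same integer via
     the 0x1.8p52 shift trick and reads its parity from the low bit. */
  double m = rint ((ax + M_PI_2) / M_PI);
  double n = m - 0.5;
  double r = ax - n * M_PI; /* range reduction into [-pi/2, pi/2] */
  double s = sin (r);
  /* cos(x) = (-1)^m * sin(r); the vshlq_n_u64/veorq_u64 pair flips the
     sign bit for odd m. */
  return ((long long) m & 1) ? -s : s;
}

Splitting pi into three parts in the real routine keeps r accurate when n*pi
cancels most of |x|.
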
diff --git a/math/aarch64/v_cosf.c b/math/aarch64/v_cosf.c
deleted file mode 100644
index b9890b2998ad3c260a6849d980cf3f69b4453ec4..0000000000000000000000000000000000000000
--- a/math/aarch64/v_cosf.c
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Single-precision vector cos function.
- *
- * Copyright (c) 2019-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "mathlib.h"
-#include "v_math.h"
-
-static const struct data
-{
- float32x4_t poly[4];
- float32x4_t range_val, inv_pi, half_pi, shift, pi_1, pi_2, pi_3;
-} data = {
- /* 1.886 ulp error. */
- .poly = { V4 (-0x1.555548p-3f), V4 (0x1.110df4p-7f), V4 (-0x1.9f42eap-13f),
- V4 (0x1.5b2e76p-19f) },
-
- .pi_1 = V4 (0x1.921fb6p+1f),
- .pi_2 = V4 (-0x1.777a5cp-24f),
- .pi_3 = V4 (-0x1.ee59dap-49f),
-
- .inv_pi = V4 (0x1.45f306p-2f),
- .shift = V4 (0x1.8p+23f),
- .half_pi = V4 (0x1.921fb6p0f),
- .range_val = V4 (0x1p20f)
-};
-
-#define C(i) d->poly[i]
-
-static float32x4_t VPCS_ATTR NOINLINE
-special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp)
-{
- /* Fall back to scalar code. */
- y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
- return v_call_f32 (cosf, x, y, cmp);
-}
-
-float32x4_t VPCS_ATTR V_NAME_F1 (cos) (float32x4_t x)
-{
- const struct data *d = ptr_barrier (&data);
- float32x4_t n, r, r2, r3, y;
- uint32x4_t odd, cmp;
-
-#if WANT_SIMD_EXCEPT
- r = vabsq_f32 (x);
- cmp = vcgeq_u32 (vreinterpretq_u32_f32 (r),
- vreinterpretq_u32_f32 (d->range_val));
- if (unlikely (v_any_u32 (cmp)))
- /* If fenv exceptions are to be triggered correctly, set any special lanes
- to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by
-       the special-case handler later. */
- r = vbslq_f32 (cmp, v_f32 (1.0f), r);
-#else
- cmp = vcageq_f32 (x, d->range_val);
- r = x;
-#endif
-
- /* n = rint((|x|+pi/2)/pi) - 0.5. */
- n = vfmaq_f32 (d->shift, d->inv_pi, vaddq_f32 (r, d->half_pi));
- odd = vshlq_n_u32 (vreinterpretq_u32_f32 (n), 31);
- n = vsubq_f32 (n, d->shift);
- n = vsubq_f32 (n, v_f32 (0.5f));
-
- /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
- r = vfmsq_f32 (r, d->pi_1, n);
- r = vfmsq_f32 (r, d->pi_2, n);
- r = vfmsq_f32 (r, d->pi_3, n);
-
- /* y = sin(r). */
- r2 = vmulq_f32 (r, r);
- r3 = vmulq_f32 (r2, r);
- y = vfmaq_f32 (C (2), C (3), r2);
- y = vfmaq_f32 (C (1), y, r2);
- y = vfmaq_f32 (C (0), y, r2);
- y = vfmaq_f32 (r, y, r3);
-
- if (unlikely (v_any_u32 (cmp)))
- return special_case (x, y, odd, cmp);
- return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
-}
diff --git a/math/aarch64/v_exp.c b/math/aarch64/v_exp.c
deleted file mode 100644
index bc5609faf4fc3597a5ec3a1080a12e843417bcc7..0000000000000000000000000000000000000000
--- a/math/aarch64/v_exp.c
+++ /dev/null
@@ -1,125 +0,0 @@
-/*
- * Double-precision vector e^x function.
- *
- * Copyright (c) 2019-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "mathlib.h"
-#include "v_math.h"
-
-#define N (1 << V_EXP_TABLE_BITS)
-#define IndexMask (N - 1)
-
-const static volatile struct
-{
- float64x2_t poly[3];
- float64x2_t inv_ln2, ln2_hi, ln2_lo, shift;
-#if !WANT_SIMD_EXCEPT
- float64x2_t special_bound, scale_thresh;
-#endif
-} data = {
- /* maxerr: 1.88 +0.5 ulp
- rel error: 1.4337*2^-53
- abs error: 1.4299*2^-53 in [ -ln2/256, ln2/256 ]. */
- .poly = { V2 (0x1.ffffffffffd43p-2), V2 (0x1.55555c75adbb2p-3),
- V2 (0x1.55555da646206p-5) },
-#if !WANT_SIMD_EXCEPT
- .scale_thresh = V2 (163840.0), /* 1280.0 * N. */
- .special_bound = V2 (704.0),
-#endif
- .inv_ln2 = V2 (0x1.71547652b82fep7), /* N/ln2. */
- .ln2_hi = V2 (0x1.62e42fefa39efp-8), /* ln2/N. */
- .ln2_lo = V2 (0x1.abc9e3b39803f3p-63),
- .shift = V2 (0x1.8p+52)
-};
-
-#define C(i) data.poly[i]
-#define Tab __v_exp_data
-
-#if WANT_SIMD_EXCEPT
-
-# define TinyBound v_u64 (0x2000000000000000) /* asuint64 (0x1p-511). */
-# define BigBound v_u64 (0x4080000000000000) /* asuint64 (0x1p9). */
-# define SpecialBound v_u64 (0x2080000000000000) /* BigBound - TinyBound. */
-
-static float64x2_t VPCS_ATTR NOINLINE
-special_case (float64x2_t x, float64x2_t y, uint64x2_t cmp)
-{
- /* If fenv exceptions are to be triggered correctly, fall back to the scalar
-     routine for special lanes. */
- return v_call_f64 (exp, x, y, cmp);
-}
-
-#else
-
-# define SpecialOffset v_u64 (0x6000000000000000) /* 0x1p513. */
-/* SpecialBias1 - SpecialBias2 = asuint(1.0). */
-# define SpecialBias1 v_u64 (0x7000000000000000) /* 0x1p769. */
-# define SpecialBias2 v_u64 (0x3010000000000000) /* 0x1p-254. */
-
-static inline float64x2_t VPCS_ATTR
-special_case (float64x2_t s, float64x2_t y, float64x2_t n)
-{
- /* 2^(n/N) may overflow, break it up into s1*s2. */
- uint64x2_t b = vandq_u64 (vcltzq_f64 (n), SpecialOffset);
- float64x2_t s1 = vreinterpretq_f64_u64 (vsubq_u64 (SpecialBias1, b));
- float64x2_t s2 = vreinterpretq_f64_u64 (
- vaddq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (s), SpecialBias2), b));
- uint64x2_t cmp = vcagtq_f64 (n, data.scale_thresh);
- float64x2_t r1 = vmulq_f64 (s1, s1);
- float64x2_t r0 = vmulq_f64 (vfmaq_f64 (s2, y, s2), s1);
- return vbslq_f64 (cmp, r1, r0);
-}
-
-#endif
-
-float64x2_t VPCS_ATTR V_NAME_D1 (exp) (float64x2_t x)
-{
- float64x2_t n, r, r2, s, y, z;
- uint64x2_t cmp, u, e;
-
-#if WANT_SIMD_EXCEPT
- /* If any lanes are special, mask them with 1 and retain a copy of x to allow
- special_case to fix special lanes later. This is only necessary if fenv
- exceptions are to be triggered correctly. */
- float64x2_t xm = x;
- uint64x2_t iax = vreinterpretq_u64_f64 (vabsq_f64 (x));
- cmp = vcgeq_u64 (vsubq_u64 (iax, TinyBound), SpecialBound);
- if (unlikely (v_any_u64 (cmp)))
- x = vbslq_f64 (cmp, v_f64 (1), x);
-#else
- cmp = vcagtq_f64 (x, data.special_bound);
-#endif
-
- /* n = round(x/(ln2/N)). */
- z = vfmaq_f64 (data.shift, x, data.inv_ln2);
- u = vreinterpretq_u64_f64 (z);
- n = vsubq_f64 (z, data.shift);
-
- /* r = x - n*ln2/N. */
- r = x;
- r = vfmsq_f64 (r, data.ln2_hi, n);
- r = vfmsq_f64 (r, data.ln2_lo, n);
-
- e = vshlq_n_u64 (u, 52 - V_EXP_TABLE_BITS);
-
- /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4. */
- r2 = vmulq_f64 (r, r);
- y = vfmaq_f64 (C (0), C (1), r);
- y = vfmaq_f64 (y, C (2), r2);
- y = vfmaq_f64 (r, y, r2);
-
- /* s = 2^(n/N). */
- u = (uint64x2_t){ Tab[u[0] & IndexMask], Tab[u[1] & IndexMask] };
- s = vreinterpretq_f64_u64 (vaddq_u64 (u, e));
-
- if (unlikely (v_any_u64 (cmp)))
-#if WANT_SIMD_EXCEPT
- return special_case (xm, vfmaq_f64 (s, y, s), cmp);
-#else
- return special_case (s, y, n);
-#endif
-
- return vfmaq_f64 (s, y, s);
-}
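
The table-driven scheme in the deleted v_exp.c can be modeled one lane at a
time. A hedged scalar sketch, assuming the N == 128 layout of __v_exp_data,
with no special-case handling, plain multiplies where the vector code uses
fma, and a short Taylor series standing in for the minimax polynomial:

#include <stdint.h>
#include <string.h>

#define N 128 /* 1 << V_EXP_TABLE_BITS */
extern const uint64_t __v_exp_data[]; /* defined in v_exp_data.c */

static double
exp_lane (double x)
{
  double shift = 0x1.8p52;
  /* n = round(x/(ln2/N)); after the add, z's low mantissa bits hold n. */
  double z = shift + x * 0x1.71547652b82fep7; /* x * N/ln2 */
  uint64_t u;
  memcpy (&u, &z, sizeof u);
  double n = z - shift;
  /* r = x - n*ln2/N, with ln2/N split in two for extra precision. */
  double r = x - n * 0x1.62e42fefa39efp-8 - n * 0x1.abc9e3b39803f3p-63;
  /* s = 2^(n/N): the low bits of u index the table and u << 45 adds the
     integer part of n/N into the exponent field; the table entries are
     pre-biased so the index bits cancel in this add. */
  uint64_t sb = __v_exp_data[u & (N - 1)] + (u << (52 - 7));
  double s;
  memcpy (&s, &sb, sizeof s);
  double y = r + r * r * (0.5 + r * (1.0 / 6.0)); /* ~ exp(r) - 1 */
  return s + s * y;
}

The pre-biasing of the table entries is spelled out in the note after
v_exp_data.c below.
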
diff --git a/math/aarch64/v_exp2f.c b/math/aarch64/v_exp2f.c
deleted file mode 100644
index e402205e98e6bea310877d6d8b9b5f014e16c47a..0000000000000000000000000000000000000000
--- a/math/aarch64/v_exp2f.c
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
- * Single-precision vector 2^x function.
- *
- * Copyright (c) 2019-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "mathlib.h"
-#include "v_math.h"
-
-static const struct data
-{
- float32x4_t poly[5];
- uint32x4_t exponent_bias;
-#if !WANT_SIMD_EXCEPT
- float32x4_t special_bound, scale_thresh;
-#endif
-} data = {
- /* maxerr: 1.962 ulp. */
- .poly = { V4 (0x1.59977ap-10f), V4 (0x1.3ce9e4p-7f), V4 (0x1.c6bd32p-5f),
- V4 (0x1.ebf9bcp-3f), V4 (0x1.62e422p-1f) },
- .exponent_bias = V4 (0x3f800000),
-#if !WANT_SIMD_EXCEPT
- .special_bound = V4 (126.0f),
- .scale_thresh = V4 (192.0f),
-#endif
-};
-
-#define C(i) d->poly[i]
-
-#if WANT_SIMD_EXCEPT
-
-# define TinyBound v_u32 (0x20000000) /* asuint (0x1p-63). */
-# define BigBound v_u32 (0x42800000) /* asuint (0x1p6). */
-# define SpecialBound v_u32 (0x22800000) /* BigBound - TinyBound. */
-
-static float32x4_t VPCS_ATTR NOINLINE
-special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)
-{
- /* If fenv exceptions are to be triggered correctly, fall back to the scalar
- routine for special lanes. */
- return v_call_f32 (exp2f, x, y, cmp);
-}
-
-#else
-
-# define SpecialOffset v_u32 (0x82000000)
-# define SpecialBias v_u32 (0x7f000000)
-
-static float32x4_t VPCS_ATTR NOINLINE
-special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
- float32x4_t scale, const struct data *d)
-{
- /* 2^n may overflow, break it up into s1*s2. */
- uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset);
- float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias));
- float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b));
- uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh);
- float32x4_t r2 = vmulq_f32 (s1, s1);
- float32x4_t r1 = vmulq_f32 (vfmaq_f32 (s2, poly, s2), s1);
- /* Similar to r1 but avoids double rounding in the subnormal range. */
- float32x4_t r0 = vfmaq_f32 (scale, poly, scale);
- float32x4_t r = vbslq_f32 (cmp1, r1, r0);
- return vbslq_f32 (cmp2, r2, r);
-}
-
-#endif
-
-float32x4_t VPCS_ATTR V_NAME_F1 (exp2) (float32x4_t x)
-{
- const struct data *d = ptr_barrier (&data);
- float32x4_t n, r, r2, scale, p, q, poly;
- uint32x4_t cmp, e;
-
-#if WANT_SIMD_EXCEPT
- /* asuint(|x|) - TinyBound >= BigBound - TinyBound. */
- uint32x4_t ia = vreinterpretq_u32_f32 (vabsq_f32 (x));
- cmp = vcgeq_u32 (vsubq_u32 (ia, TinyBound), SpecialBound);
- float32x4_t xm = x;
- /* If any lanes are special, mask them with 1 and retain a copy of x to allow
- special_case to fix special lanes later. This is only necessary if fenv
- exceptions are to be triggered correctly. */
- if (unlikely (v_any_u32 (cmp)))
- x = vbslq_f32 (cmp, v_f32 (1), x);
-#endif
-
- /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
- x = n + r, with r in [-1/2, 1/2]. */
- n = vrndaq_f32 (x);
- r = vsubq_f32 (x, n);
- e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)), 23);
- scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
-
-#if !WANT_SIMD_EXCEPT
- cmp = vcagtq_f32 (n, d->special_bound);
-#endif
-
- r2 = vmulq_f32 (r, r);
- p = vfmaq_f32 (C (1), C (0), r);
- q = vfmaq_f32 (C (3), C (2), r);
- q = vfmaq_f32 (q, p, r2);
- p = vmulq_f32 (C (4), r);
- poly = vfmaq_f32 (p, q, r2);
-
- if (unlikely (v_any_u32 (cmp)))
-#if WANT_SIMD_EXCEPT
- return special_case (xm, vfmaq_f32 (scale, poly, scale), cmp);
-#else
- return special_case (poly, n, e, cmp, scale, d);
-#endif
-
- return vfmaq_f32 (scale, poly, scale);
-}
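
The s1*s2 trick in the special-case handlers above is worth stating plainly:
near the overflow and underflow thresholds 2^n itself is not representable,
so the scale is applied as two representable factors. A hedged model using
ldexpf in place of the SpecialOffset/SpecialBias bit manipulation, and
ignoring the double-rounding refinement the real code applies for subnormal
results:

#include <math.h>

/* result ~ 2^n * (1 + poly), valid even where 2^n alone would overflow. */
static float
scale2_model (float poly, int n)
{
  int half = n / 2;
  float s1 = ldexpf (1.0f, n - half); /* each factor stays representable */
  float s2 = ldexpf (1.0f, half);
  return (poly * s2 + s2) * s1;
}
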
diff --git a/math/aarch64/v_exp_data.c b/math/aarch64/v_exp_data.c
deleted file mode 100644
index 45f0848cac5b5bcf00b768c7f107e0400a8fab7a..0000000000000000000000000000000000000000
--- a/math/aarch64/v_exp_data.c
+++ /dev/null
@@ -1,146 +0,0 @@
-/*
- * Lookup table for double-precision e^x vector function.
- *
- * Copyright (c) 2019-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "v_math.h"
-
-# define N (1 << V_EXP_TABLE_BITS)
-
-/* 2^(j/N), j=0..N-1. */
-const uint64_t __v_exp_data[] = {
-# if N == 128
- 0x3ff0000000000000, 0x3feff63da9fb3335, 0x3fefec9a3e778061,
- 0x3fefe315e86e7f85, 0x3fefd9b0d3158574, 0x3fefd06b29ddf6de,
- 0x3fefc74518759bc8, 0x3fefbe3ecac6f383, 0x3fefb5586cf9890f,
- 0x3fefac922b7247f7, 0x3fefa3ec32d3d1a2, 0x3fef9b66affed31b,
- 0x3fef9301d0125b51, 0x3fef8abdc06c31cc, 0x3fef829aaea92de0,
- 0x3fef7a98c8a58e51, 0x3fef72b83c7d517b, 0x3fef6af9388c8dea,
- 0x3fef635beb6fcb75, 0x3fef5be084045cd4, 0x3fef54873168b9aa,
- 0x3fef4d5022fcd91d, 0x3fef463b88628cd6, 0x3fef3f49917ddc96,
- 0x3fef387a6e756238, 0x3fef31ce4fb2a63f, 0x3fef2b4565e27cdd,
- 0x3fef24dfe1f56381, 0x3fef1e9df51fdee1, 0x3fef187fd0dad990,
- 0x3fef1285a6e4030b, 0x3fef0cafa93e2f56, 0x3fef06fe0a31b715,
- 0x3fef0170fc4cd831, 0x3feefc08b26416ff, 0x3feef6c55f929ff1,
- 0x3feef1a7373aa9cb, 0x3feeecae6d05d866, 0x3feee7db34e59ff7,
- 0x3feee32dc313a8e5, 0x3feedea64c123422, 0x3feeda4504ac801c,
- 0x3feed60a21f72e2a, 0x3feed1f5d950a897, 0x3feece086061892d,
- 0x3feeca41ed1d0057, 0x3feec6a2b5c13cd0, 0x3feec32af0d7d3de,
- 0x3feebfdad5362a27, 0x3feebcb299fddd0d, 0x3feeb9b2769d2ca7,
- 0x3feeb6daa2cf6642, 0x3feeb42b569d4f82, 0x3feeb1a4ca5d920f,
- 0x3feeaf4736b527da, 0x3feead12d497c7fd, 0x3feeab07dd485429,
- 0x3feea9268a5946b7, 0x3feea76f15ad2148, 0x3feea5e1b976dc09,
- 0x3feea47eb03a5585, 0x3feea34634ccc320, 0x3feea23882552225,
- 0x3feea155d44ca973, 0x3feea09e667f3bcd, 0x3feea012750bdabf,
- 0x3fee9fb23c651a2f, 0x3fee9f7df9519484, 0x3fee9f75e8ec5f74,
- 0x3fee9f9a48a58174, 0x3fee9feb564267c9, 0x3feea0694fde5d3f,
- 0x3feea11473eb0187, 0x3feea1ed0130c132, 0x3feea2f336cf4e62,
- 0x3feea427543e1a12, 0x3feea589994cce13, 0x3feea71a4623c7ad,
- 0x3feea8d99b4492ed, 0x3feeaac7d98a6699, 0x3feeace5422aa0db,
- 0x3feeaf3216b5448c, 0x3feeb1ae99157736, 0x3feeb45b0b91ffc6,
- 0x3feeb737b0cdc5e5, 0x3feeba44cbc8520f, 0x3feebd829fde4e50,
- 0x3feec0f170ca07ba, 0x3feec49182a3f090, 0x3feec86319e32323,
- 0x3feecc667b5de565, 0x3feed09bec4a2d33, 0x3feed503b23e255d,
- 0x3feed99e1330b358, 0x3feede6b5579fdbf, 0x3feee36bbfd3f37a,
- 0x3feee89f995ad3ad, 0x3feeee07298db666, 0x3feef3a2b84f15fb,
- 0x3feef9728de5593a, 0x3feeff76f2fb5e47, 0x3fef05b030a1064a,
- 0x3fef0c1e904bc1d2, 0x3fef12c25bd71e09, 0x3fef199bdd85529c,
- 0x3fef20ab5fffd07a, 0x3fef27f12e57d14b, 0x3fef2f6d9406e7b5,
- 0x3fef3720dcef9069, 0x3fef3f0b555dc3fa, 0x3fef472d4a07897c,
- 0x3fef4f87080d89f2, 0x3fef5818dcfba487, 0x3fef60e316c98398,
- 0x3fef69e603db3285, 0x3fef7321f301b460, 0x3fef7c97337b9b5f,
- 0x3fef864614f5a129, 0x3fef902ee78b3ff6, 0x3fef9a51fbc74c83,
- 0x3fefa4afa2a490da, 0x3fefaf482d8e67f1, 0x3fefba1bee615a27,
- 0x3fefc52b376bba97, 0x3fefd0765b6e4540, 0x3fefdbfdad9cbe14,
- 0x3fefe7c1819e90d8, 0x3feff3c22b8f71f1,
-# elif N == 256
- 0x3ff0000000000000, 0x3feffb1afa5abcbf, 0x3feff63da9fb3335,
- 0x3feff168143b0281, 0x3fefec9a3e778061, 0x3fefe7d42e11bbcc,
- 0x3fefe315e86e7f85, 0x3fefde5f72f654b1, 0x3fefd9b0d3158574,
- 0x3fefd50a0e3c1f89, 0x3fefd06b29ddf6de, 0x3fefcbd42b72a836,
- 0x3fefc74518759bc8, 0x3fefc2bdf66607e0, 0x3fefbe3ecac6f383,
- 0x3fefb9c79b1f3919, 0x3fefb5586cf9890f, 0x3fefb0f145e46c85,
- 0x3fefac922b7247f7, 0x3fefa83b23395dec, 0x3fefa3ec32d3d1a2,
- 0x3fef9fa55fdfa9c5, 0x3fef9b66affed31b, 0x3fef973028d7233e,
- 0x3fef9301d0125b51, 0x3fef8edbab5e2ab6, 0x3fef8abdc06c31cc,
- 0x3fef86a814f204ab, 0x3fef829aaea92de0, 0x3fef7e95934f312e,
- 0x3fef7a98c8a58e51, 0x3fef76a45471c3c2, 0x3fef72b83c7d517b,
- 0x3fef6ed48695bbc0, 0x3fef6af9388c8dea, 0x3fef672658375d2f,
- 0x3fef635beb6fcb75, 0x3fef5f99f8138a1c, 0x3fef5be084045cd4,
- 0x3fef582f95281c6b, 0x3fef54873168b9aa, 0x3fef50e75eb44027,
- 0x3fef4d5022fcd91d, 0x3fef49c18438ce4d, 0x3fef463b88628cd6,
- 0x3fef42be3578a819, 0x3fef3f49917ddc96, 0x3fef3bdda27912d1,
- 0x3fef387a6e756238, 0x3fef351ffb82140a, 0x3fef31ce4fb2a63f,
- 0x3fef2e85711ece75, 0x3fef2b4565e27cdd, 0x3fef280e341ddf29,
- 0x3fef24dfe1f56381, 0x3fef21ba7591bb70, 0x3fef1e9df51fdee1,
- 0x3fef1b8a66d10f13, 0x3fef187fd0dad990, 0x3fef157e39771b2f,
- 0x3fef1285a6e4030b, 0x3fef0f961f641589, 0x3fef0cafa93e2f56,
- 0x3fef09d24abd886b, 0x3fef06fe0a31b715, 0x3fef0432edeeb2fd,
- 0x3fef0170fc4cd831, 0x3feefeb83ba8ea32, 0x3feefc08b26416ff,
- 0x3feef96266e3fa2d, 0x3feef6c55f929ff1, 0x3feef431a2de883b,
- 0x3feef1a7373aa9cb, 0x3feeef26231e754a, 0x3feeecae6d05d866,
- 0x3feeea401b7140ef, 0x3feee7db34e59ff7, 0x3feee57fbfec6cf4,
- 0x3feee32dc313a8e5, 0x3feee0e544ede173, 0x3feedea64c123422,
- 0x3feedc70df1c5175, 0x3feeda4504ac801c, 0x3feed822c367a024,
- 0x3feed60a21f72e2a, 0x3feed3fb2709468a, 0x3feed1f5d950a897,
- 0x3feecffa3f84b9d4, 0x3feece086061892d, 0x3feecc2042a7d232,
- 0x3feeca41ed1d0057, 0x3feec86d668b3237, 0x3feec6a2b5c13cd0,
- 0x3feec4e1e192aed2, 0x3feec32af0d7d3de, 0x3feec17dea6db7d7,
- 0x3feebfdad5362a27, 0x3feebe41b817c114, 0x3feebcb299fddd0d,
- 0x3feebb2d81d8abff, 0x3feeb9b2769d2ca7, 0x3feeb8417f4531ee,
- 0x3feeb6daa2cf6642, 0x3feeb57de83f4eef, 0x3feeb42b569d4f82,
- 0x3feeb2e2f4f6ad27, 0x3feeb1a4ca5d920f, 0x3feeb070dde910d2,
- 0x3feeaf4736b527da, 0x3feeae27dbe2c4cf, 0x3feead12d497c7fd,
- 0x3feeac0827ff07cc, 0x3feeab07dd485429, 0x3feeaa11fba87a03,
- 0x3feea9268a5946b7, 0x3feea84590998b93, 0x3feea76f15ad2148,
- 0x3feea6a320dceb71, 0x3feea5e1b976dc09, 0x3feea52ae6cdf6f4,
- 0x3feea47eb03a5585, 0x3feea3dd1d1929fd, 0x3feea34634ccc320,
- 0x3feea2b9febc8fb7, 0x3feea23882552225, 0x3feea1c1c70833f6,
- 0x3feea155d44ca973, 0x3feea0f4b19e9538, 0x3feea09e667f3bcd,
- 0x3feea052fa75173e, 0x3feea012750bdabf, 0x3fee9fdcddd47645,
- 0x3fee9fb23c651a2f, 0x3fee9f9298593ae5, 0x3fee9f7df9519484,
- 0x3fee9f7466f42e87, 0x3fee9f75e8ec5f74, 0x3fee9f8286ead08a,
- 0x3fee9f9a48a58174, 0x3fee9fbd35d7cbfd, 0x3fee9feb564267c9,
- 0x3feea024b1ab6e09, 0x3feea0694fde5d3f, 0x3feea0b938ac1cf6,
- 0x3feea11473eb0187, 0x3feea17b0976cfdb, 0x3feea1ed0130c132,
- 0x3feea26a62ff86f0, 0x3feea2f336cf4e62, 0x3feea3878491c491,
- 0x3feea427543e1a12, 0x3feea4d2add106d9, 0x3feea589994cce13,
- 0x3feea64c1eb941f7, 0x3feea71a4623c7ad, 0x3feea7f4179f5b21,
- 0x3feea8d99b4492ed, 0x3feea9cad931a436, 0x3feeaac7d98a6699,
- 0x3feeabd0a478580f, 0x3feeace5422aa0db, 0x3feeae05bad61778,
- 0x3feeaf3216b5448c, 0x3feeb06a5e0866d9, 0x3feeb1ae99157736,
- 0x3feeb2fed0282c8a, 0x3feeb45b0b91ffc6, 0x3feeb5c353aa2fe2,
- 0x3feeb737b0cdc5e5, 0x3feeb8b82b5f98e5, 0x3feeba44cbc8520f,
- 0x3feebbdd9a7670b3, 0x3feebd829fde4e50, 0x3feebf33e47a22a2,
- 0x3feec0f170ca07ba, 0x3feec2bb4d53fe0d, 0x3feec49182a3f090,
- 0x3feec674194bb8d5, 0x3feec86319e32323, 0x3feeca5e8d07f29e,
- 0x3feecc667b5de565, 0x3feece7aed8eb8bb, 0x3feed09bec4a2d33,
- 0x3feed2c980460ad8, 0x3feed503b23e255d, 0x3feed74a8af46052,
- 0x3feed99e1330b358, 0x3feedbfe53c12e59, 0x3feede6b5579fdbf,
- 0x3feee0e521356eba, 0x3feee36bbfd3f37a, 0x3feee5ff3a3c2774,
- 0x3feee89f995ad3ad, 0x3feeeb4ce622f2ff, 0x3feeee07298db666,
- 0x3feef0ce6c9a8952, 0x3feef3a2b84f15fb, 0x3feef68415b749b1,
- 0x3feef9728de5593a, 0x3feefc6e29f1c52a, 0x3feeff76f2fb5e47,
- 0x3fef028cf22749e4, 0x3fef05b030a1064a, 0x3fef08e0b79a6f1f,
- 0x3fef0c1e904bc1d2, 0x3fef0f69c3f3a207, 0x3fef12c25bd71e09,
- 0x3fef16286141b33d, 0x3fef199bdd85529c, 0x3fef1d1cd9fa652c,
- 0x3fef20ab5fffd07a, 0x3fef244778fafb22, 0x3fef27f12e57d14b,
- 0x3fef2ba88988c933, 0x3fef2f6d9406e7b5, 0x3fef33405751c4db,
- 0x3fef3720dcef9069, 0x3fef3b0f2e6d1675, 0x3fef3f0b555dc3fa,
- 0x3fef43155b5bab74, 0x3fef472d4a07897c, 0x3fef4b532b08c968,
- 0x3fef4f87080d89f2, 0x3fef53c8eacaa1d6, 0x3fef5818dcfba487,
- 0x3fef5c76e862e6d3, 0x3fef60e316c98398, 0x3fef655d71ff6075,
- 0x3fef69e603db3285, 0x3fef6e7cd63a8315, 0x3fef7321f301b460,
- 0x3fef77d5641c0658, 0x3fef7c97337b9b5f, 0x3fef81676b197d17,
- 0x3fef864614f5a129, 0x3fef8b333b16ee12, 0x3fef902ee78b3ff6,
- 0x3fef953924676d76, 0x3fef9a51fbc74c83, 0x3fef9f7977cdb740,
- 0x3fefa4afa2a490da, 0x3fefa9f4867cca6e, 0x3fefaf482d8e67f1,
- 0x3fefb4aaa2188510, 0x3fefba1bee615a27, 0x3fefbf9c1cb6412a,
- 0x3fefc52b376bba97, 0x3fefcac948dd7274, 0x3fefd0765b6e4540,
- 0x3fefd632798844f8, 0x3fefdbfdad9cbe14, 0x3fefe1d802243c89,
- 0x3fefe7c1819e90d8, 0x3fefedba3692d514, 0x3feff3c22b8f71f1,
- 0x3feff9d96b2a23d9,
-# endif
-};
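
Despite the 2^(j/N) comment, the entries above are not the raw bit patterns
of 2^(j/N): entry 1 is 0x3feff63da9fb3335, while asuint64(2^(1/128)) is
0x3ff0163da9fb3335. Each entry appears to be stored minus
(j << (52 - V_EXP_TABLE_BITS)), so the single vaddq_u64 of u << 45 in
v_exp.c re-inserts the index bits and adds the exponent in one operation.
A hedged self-check, assuming the N == 128 build and a host exp2 that is
correctly rounded at these points:

#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

extern const uint64_t __v_exp_data[];

int
main (void)
{
  enum { TABLE_BITS = 7, N = 1 << TABLE_BITS };
  for (int j = 0; j < N; j++)
    {
      double d = exp2 (j / (double) N);
      uint64_t bits;
      memcpy (&bits, &d, sizeof bits);
      uint64_t expect = bits - ((uint64_t) j << (52 - TABLE_BITS));
      if (__v_exp_data[j] != expect)
        printf ("mismatch at j = %d\n", j);
    }
  return 0;
}
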
diff --git a/math/aarch64/v_expf.c b/math/aarch64/v_expf.c
deleted file mode 100644
index 34e8b6081bcd947effb06be781b8fba6bd95bbba..0000000000000000000000000000000000000000
--- a/math/aarch64/v_expf.c
+++ /dev/null
@@ -1,122 +0,0 @@
-/*
- * Single-precision vector e^x function.
- *
- * Copyright (c) 2019-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "mathlib.h"
-#include "v_math.h"
-
-static const struct data
-{
- float32x4_t poly[5];
- float32x4_t shift, inv_ln2, ln2_hi, ln2_lo;
- uint32x4_t exponent_bias;
-#if !WANT_SIMD_EXCEPT
- float32x4_t special_bound, scale_thresh;
-#endif
-} data = {
- /* maxerr: 1.45358 +0.5 ulp. */
- .poly = { V4 (0x1.0e4020p-7f), V4 (0x1.573e2ep-5f), V4 (0x1.555e66p-3f),
- V4 (0x1.fffdb6p-2f), V4 (0x1.ffffecp-1f) },
- .shift = V4 (0x1.8p23f),
- .inv_ln2 = V4 (0x1.715476p+0f),
- .ln2_hi = V4 (0x1.62e4p-1f),
- .ln2_lo = V4 (0x1.7f7d1cp-20f),
- .exponent_bias = V4 (0x3f800000),
-#if !WANT_SIMD_EXCEPT
- .special_bound = V4 (126.0f),
- .scale_thresh = V4 (192.0f),
-#endif
-};
-
-#define C(i) d->poly[i]
-
-#if WANT_SIMD_EXCEPT
-
-# define TinyBound v_u32 (0x20000000) /* asuint (0x1p-63). */
-# define BigBound v_u32 (0x42800000) /* asuint (0x1p6). */
-# define SpecialBound v_u32 (0x22800000) /* BigBound - TinyBound. */
-
-static float32x4_t VPCS_ATTR NOINLINE
-special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)
-{
- /* If fenv exceptions are to be triggered correctly, fall back to the scalar
-     routine for special lanes. */
- return v_call_f32 (expf, x, y, cmp);
-}
-
-#else
-
-# define SpecialOffset v_u32 (0x82000000)
-# define SpecialBias v_u32 (0x7f000000)
-
-static float32x4_t VPCS_ATTR NOINLINE
-special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
- float32x4_t scale, const struct data *d)
-{
- /* 2^n may overflow, break it up into s1*s2. */
- uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset);
- float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias));
- float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b));
- uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh);
- float32x4_t r2 = vmulq_f32 (s1, s1);
- float32x4_t r1 = vmulq_f32 (vfmaq_f32 (s2, poly, s2), s1);
- /* Similar to r1 but avoids double rounding in the subnormal range. */
- float32x4_t r0 = vfmaq_f32 (scale, poly, scale);
- float32x4_t r = vbslq_f32 (cmp1, r1, r0);
- return vbslq_f32 (cmp2, r2, r);
-}
-
-#endif
-
-float32x4_t VPCS_ATTR V_NAME_F1 (exp) (float32x4_t x)
-{
- const struct data *d = ptr_barrier (&data);
- float32x4_t n, r, r2, scale, p, q, poly, z;
- uint32x4_t cmp, e;
-
-#if WANT_SIMD_EXCEPT
- /* asuint(x) - TinyBound >= BigBound - TinyBound. */
- cmp = vcgeq_u32 (
- vsubq_u32 (vandq_u32 (vreinterpretq_u32_f32 (x), v_u32 (0x7fffffff)),
- TinyBound),
- SpecialBound);
- float32x4_t xm = x;
- /* If any lanes are special, mask them with 1 and retain a copy of x to allow
-     the special-case handler to fix special lanes later. This is only necessary if
- fenv exceptions are to be triggered correctly. */
- if (unlikely (v_any_u32 (cmp)))
- x = vbslq_f32 (cmp, v_f32 (1), x);
-#endif
-
- /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
- x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
- z = vfmaq_f32 (d->shift, x, d->inv_ln2);
- n = vsubq_f32 (z, d->shift);
- r = vfmsq_f32 (x, n, d->ln2_hi);
- r = vfmsq_f32 (r, n, d->ln2_lo);
- e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23);
- scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
-
-#if !WANT_SIMD_EXCEPT
- cmp = vcagtq_f32 (n, d->special_bound);
-#endif
-
- r2 = vmulq_f32 (r, r);
- p = vfmaq_f32 (C (1), C (0), r);
- q = vfmaq_f32 (C (3), C (2), r);
- q = vfmaq_f32 (q, p, r2);
- p = vmulq_f32 (C (4), r);
- poly = vfmaq_f32 (p, q, r2);
-
- if (unlikely (v_any_u32 (cmp)))
-#if WANT_SIMD_EXCEPT
- return special_case (xm, vfmaq_f32 (scale, poly, scale), cmp);
-#else
- return special_case (poly, n, e, cmp, scale, d);
-#endif
-
- return vfmaq_f32 (scale, poly, scale);
-}
diff --git a/math/aarch64/v_log.c b/math/aarch64/v_log.c
deleted file mode 100644
index 1d1c1fa62c0423da2c6c402113da471af2df7540..0000000000000000000000000000000000000000
--- a/math/aarch64/v_log.c
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * Double-precision vector log(x) function.
- *
- * Copyright (c) 2019-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "mathlib.h"
-#include "v_math.h"
-
-static const struct data
-{
- uint64x2_t min_norm;
- uint32x4_t special_bound;
- float64x2_t poly[5];
- float64x2_t ln2;
- uint64x2_t sign_exp_mask;
-} data = {
- /* Worst-case error: 1.17 + 0.5 ulp.
- Rel error: 0x1.6272e588p-56 in [ -0x1.fc1p-9 0x1.009p-8 ]. */
- .poly = { V2 (-0x1.ffffffffffff7p-2), V2 (0x1.55555555170d4p-2),
- V2 (-0x1.0000000399c27p-2), V2 (0x1.999b2e90e94cap-3),
- V2 (-0x1.554e550bd501ep-3) },
- .ln2 = V2 (0x1.62e42fefa39efp-1),
- .min_norm = V2 (0x0010000000000000),
- .special_bound = V4 (0x7fe00000), /* asuint64(inf) - min_norm. */
- .sign_exp_mask = V2 (0xfff0000000000000)
-};
-
-#define A(i) d->poly[i]
-#define N (1 << V_LOG_TABLE_BITS)
-#define IndexMask (N - 1)
-#define Off v_u64 (0x3fe6900900000000)
-
-struct entry
-{
- float64x2_t invc;
- float64x2_t logc;
-};
-
-static inline struct entry
-lookup (uint64x2_t i)
-{
- /* Since N is a power of 2, n % N = n & (N - 1). */
- struct entry e;
- uint64_t i0 = (i[0] >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
- uint64_t i1 = (i[1] >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
- float64x2_t e0 = vld1q_f64 (&__v_log_data.table[i0].invc);
- float64x2_t e1 = vld1q_f64 (&__v_log_data.table[i1].invc);
- e.invc = vuzp1q_f64 (e0, e1);
- e.logc = vuzp2q_f64 (e0, e1);
- return e;
-}
-
-static float64x2_t VPCS_ATTR NOINLINE
-special_case (float64x2_t x, float64x2_t y, float64x2_t hi, float64x2_t r2,
- uint32x2_t cmp)
-{
- return v_call_f64 (log, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (cmp));
-}
-
-float64x2_t VPCS_ATTR V_NAME_D1 (log) (float64x2_t x)
-{
- const struct data *d = ptr_barrier (&data);
- float64x2_t z, r, r2, p, y, kd, hi;
- uint64x2_t ix, iz, tmp;
- uint32x2_t cmp;
- int64x2_t k;
- struct entry e;
-
- ix = vreinterpretq_u64_f64 (x);
- cmp = vcge_u32 (vsubhn_u64 (ix, d->min_norm),
- vget_low_u32 (d->special_bound));
-
- /* x = 2^k z; where z is in range [Off,2*Off) and exact.
- The range is split into N subintervals.
- The ith subinterval contains z and c is near its center. */
- tmp = vsubq_u64 (ix, Off);
- k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); /* arithmetic shift. */
- iz = vsubq_u64 (ix, vandq_u64 (tmp, d->sign_exp_mask));
- z = vreinterpretq_f64_u64 (iz);
- e = lookup (tmp);
-
- /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */
- r = vfmaq_f64 (v_f64 (-1.0), z, e.invc);
- kd = vcvtq_f64_s64 (k);
-
- /* hi = r + log(c) + k*Ln2. */
- hi = vfmaq_f64 (vaddq_f64 (e.logc, r), kd, d->ln2);
- /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */
- r2 = vmulq_f64 (r, r);
- y = vfmaq_f64 (A (2), A (3), r);
- p = vfmaq_f64 (A (0), A (1), r);
- y = vfmaq_f64 (y, A (4), r2);
- y = vfmaq_f64 (p, y, r2);
-
- if (unlikely (v_any_u32h (cmp)))
- return special_case (x, y, hi, r2, cmp);
- return vfmaq_f64 (hi, y, r2);
-}
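
As with exp, the deleted v_log.c reads naturally one lane at a time. A hedged
scalar model, assuming V_LOG_TABLE_BITS == 7 and the table layout declared in
v_math.h, with no special-case path, plain multiplies where the vector code
uses fma, and a short Taylor tail standing in for the degree-5 polynomial:

#include <stdint.h>
#include <string.h>

#define TABLE_BITS 7 /* assumed value of V_LOG_TABLE_BITS */

/* Layout assumed to match struct v_log_data in v_math.h. */
extern const struct
{
  struct { double invc, logc; } table[1 << TABLE_BITS];
} __v_log_data;

static double
log_lane (double x)
{
  uint64_t ix;
  memcpy (&ix, &x, sizeof ix);
  /* x = 2^k z with z in [Off, 2*Off): the exponent field of tmp holds k,
     and iz keeps x's mantissa with a fixed exponent. */
  uint64_t tmp = ix - 0x3fe6900900000000; /* Off */
  int64_t k = (int64_t) tmp >> 52;        /* arithmetic shift */
  uint64_t iz = ix - (tmp & 0xfff0000000000000);
  double z;
  memcpy (&z, &iz, sizeof z);
  int i = (tmp >> (52 - TABLE_BITS)) & ((1 << TABLE_BITS) - 1);
  double invc = __v_log_data.table[i].invc;
  double logc = __v_log_data.table[i].logc;
  double r = z * invc - 1.0; /* z/c - 1, small by table construction */
  double hi = logc + r + (double) k * 0x1.62e42fefa39efp-1; /* + k*ln2 */
  return hi + r * r * (-0.5 + r * (1.0 / 3.0)); /* log1p(r) tail */
}
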
diff --git a/math/aarch64/v_log_data.c b/math/aarch64/v_log_data.c
deleted file mode 100644
index 82351bb14766f2fbf6095cbf2e214e99b45f217d..0000000000000000000000000000000000000000
--- a/math/aarch64/v_log_data.c
+++ /dev/null
@@ -1,156 +0,0 @@
-/*
- * Lookup table for double-precision log(x) vector function.
- *
- * Copyright (c) 2019-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "v_math.h"
-
-#define N (1 << V_LOG_TABLE_BITS)
-
-const struct v_log_data __v_log_data = {
- /* Algorithm:
-
- x = 2^k z
- log(x) = k ln2 + log(c) + poly(z/c - 1)
-
- where z is in [a;2a) which is split into N subintervals (a=0x1.69009p-1,
- N=128) and log(c) and 1/c for the ith subinterval comes from lookup tables:
-
- table[i].invc = 1/c
- table[i].logc = (double)log(c)
-
-   where c is near the center of the subinterval and is chosen by trying several
-   floating point invc candidates around 1/center and selecting one for which
-   the error in (double)log(c) is minimized (< 0x1p-74); the subinterval that
-   contains 1 and the one before it are tweaked to avoid cancellation. */
- .table = { { 0x1.6a133d0dec120p+0, -0x1.62fe995eb963ap-2 },
- { 0x1.6815f2f3e42edp+0, -0x1.5d5a48dad6b67p-2 },
- { 0x1.661e39be1ac9ep+0, -0x1.57bde257d2769p-2 },
- { 0x1.642bfa30ac371p+0, -0x1.52294fbf2af55p-2 },
- { 0x1.623f1d916f323p+0, -0x1.4c9c7b598aa38p-2 },
- { 0x1.60578da220f65p+0, -0x1.47174fc5ff560p-2 },
- { 0x1.5e75349dea571p+0, -0x1.4199b7fa7b5cap-2 },
- { 0x1.5c97fd387a75ap+0, -0x1.3c239f48cfb99p-2 },
- { 0x1.5abfd2981f200p+0, -0x1.36b4f154d2aebp-2 },
- { 0x1.58eca051dc99cp+0, -0x1.314d9a0ff32fbp-2 },
- { 0x1.571e526d9df12p+0, -0x1.2bed85cca3cffp-2 },
- { 0x1.5554d555b3fcbp+0, -0x1.2694a11421af9p-2 },
- { 0x1.539015e2a20cdp+0, -0x1.2142d8d014fb2p-2 },
- { 0x1.51d0014ee0164p+0, -0x1.1bf81a2c77776p-2 },
- { 0x1.50148538cd9eep+0, -0x1.16b452a39c6a4p-2 },
- { 0x1.4e5d8f9f698a1p+0, -0x1.11776ffa6c67ep-2 },
- { 0x1.4cab0edca66bep+0, -0x1.0c416035020e0p-2 },
- { 0x1.4afcf1a9db874p+0, -0x1.071211aa10fdap-2 },
- { 0x1.495327136e16fp+0, -0x1.01e972e293b1bp-2 },
- { 0x1.47ad9e84af28fp+0, -0x1.f98ee587fd434p-3 },
- { 0x1.460c47b39ae15p+0, -0x1.ef5800ad716fbp-3 },
- { 0x1.446f12b278001p+0, -0x1.e52e160484698p-3 },
- { 0x1.42d5efdd720ecp+0, -0x1.db1104b19352ep-3 },
- { 0x1.4140cfe001a0fp+0, -0x1.d100ac59e0bd6p-3 },
- { 0x1.3fafa3b421f69p+0, -0x1.c6fced287c3bdp-3 },
- { 0x1.3e225c9c8ece5p+0, -0x1.bd05a7b317c29p-3 },
- { 0x1.3c98ec29a211ap+0, -0x1.b31abd229164fp-3 },
- { 0x1.3b13442a413fep+0, -0x1.a93c0edadb0a3p-3 },
- { 0x1.399156baa3c54p+0, -0x1.9f697ee30d7ddp-3 },
- { 0x1.38131639b4cdbp+0, -0x1.95a2efa9aa40ap-3 },
- { 0x1.36987540fbf53p+0, -0x1.8be843d796044p-3 },
- { 0x1.352166b648f61p+0, -0x1.82395ecc477edp-3 },
- { 0x1.33adddb3eb575p+0, -0x1.7896240966422p-3 },
- { 0x1.323dcd99fc1d3p+0, -0x1.6efe77aca8c55p-3 },
- { 0x1.30d129fefc7d2p+0, -0x1.65723e117ec5cp-3 },
- { 0x1.2f67e6b72fe7dp+0, -0x1.5bf15c0955706p-3 },
- { 0x1.2e01f7cf8b187p+0, -0x1.527bb6c111da1p-3 },
- { 0x1.2c9f518ddc86ep+0, -0x1.491133c939f8fp-3 },
- { 0x1.2b3fe86e5f413p+0, -0x1.3fb1b90c7fc58p-3 },
- { 0x1.29e3b1211b25cp+0, -0x1.365d2cc485f8dp-3 },
- { 0x1.288aa08b373cfp+0, -0x1.2d13758970de7p-3 },
- { 0x1.2734abcaa8467p+0, -0x1.23d47a721fd47p-3 },
- { 0x1.25e1c82459b81p+0, -0x1.1aa0229f25ec2p-3 },
- { 0x1.2491eb1ad59c5p+0, -0x1.117655ddebc3bp-3 },
- { 0x1.23450a54048b5p+0, -0x1.0856fbf83ab6bp-3 },
- { 0x1.21fb1bb09e578p+0, -0x1.fe83fabbaa106p-4 },
- { 0x1.20b415346d8f7p+0, -0x1.ec6e8507a56cdp-4 },
- { 0x1.1f6fed179a1acp+0, -0x1.da6d68c7cc2eap-4 },
- { 0x1.1e2e99b93c7b3p+0, -0x1.c88078462be0cp-4 },
- { 0x1.1cf011a7a882ap+0, -0x1.b6a786a423565p-4 },
- { 0x1.1bb44b97dba5ap+0, -0x1.a4e2676ac7f85p-4 },
- { 0x1.1a7b3e66cdd4fp+0, -0x1.9330eea777e76p-4 },
- { 0x1.1944e11dc56cdp+0, -0x1.8192f134d5ad9p-4 },
- { 0x1.18112aebb1a6ep+0, -0x1.70084464f0538p-4 },
- { 0x1.16e013231b7e9p+0, -0x1.5e90bdec5cb1fp-4 },
- { 0x1.15b1913f156cfp+0, -0x1.4d2c3433c5536p-4 },
- { 0x1.14859cdedde13p+0, -0x1.3bda7e219879ap-4 },
- { 0x1.135c2dc68cfa4p+0, -0x1.2a9b732d27194p-4 },
- { 0x1.12353bdb01684p+0, -0x1.196eeb2b10807p-4 },
- { 0x1.1110bf25b85b4p+0, -0x1.0854be8ef8a7ep-4 },
- { 0x1.0feeafd2f8577p+0, -0x1.ee998cb277432p-5 },
- { 0x1.0ecf062c51c3bp+0, -0x1.ccadb79919fb9p-5 },
- { 0x1.0db1baa076c8bp+0, -0x1.aae5b1d8618b0p-5 },
- { 0x1.0c96c5bb3048ep+0, -0x1.89413015d7442p-5 },
- { 0x1.0b7e20263e070p+0, -0x1.67bfe7bf158dep-5 },
- { 0x1.0a67c2acd0ce3p+0, -0x1.46618f83941bep-5 },
- { 0x1.0953a6391e982p+0, -0x1.2525df1b0618ap-5 },
- { 0x1.0841c3caea380p+0, -0x1.040c8e2f77c6ap-5 },
- { 0x1.07321489b13eap+0, -0x1.c62aad39f738ap-6 },
- { 0x1.062491aee9904p+0, -0x1.847fe3bdead9cp-6 },
- { 0x1.05193497a7cc5p+0, -0x1.43183683400acp-6 },
- { 0x1.040ff6b5f5e9fp+0, -0x1.01f31c4e1d544p-6 },
- { 0x1.0308d19aa6127p+0, -0x1.82201d1e6b69ap-7 },
- { 0x1.0203beedb0c67p+0, -0x1.00dd0f3e1bfd6p-7 },
- { 0x1.010037d38bcc2p+0, -0x1.ff6fe1feb4e53p-9 },
- { 1.0, 0.0 },
- { 0x1.fc06d493cca10p-1, 0x1.fe91885ec8e20p-8 },
- { 0x1.f81e6ac3b918fp-1, 0x1.fc516f716296dp-7 },
- { 0x1.f44546ef18996p-1, 0x1.7bb4dd70a015bp-6 },
- { 0x1.f07b10382c84bp-1, 0x1.f84c99b34b674p-6 },
- { 0x1.ecbf7070e59d4p-1, 0x1.39f9ce4fb2d71p-5 },
- { 0x1.e91213f715939p-1, 0x1.7756c0fd22e78p-5 },
- { 0x1.e572a9a75f7b7p-1, 0x1.b43ee82db8f3ap-5 },
- { 0x1.e1e0e2c530207p-1, 0x1.f0b3fced60034p-5 },
- { 0x1.de5c72d8a8be3p-1, 0x1.165bd78d4878ep-4 },
- { 0x1.dae50fa5658ccp-1, 0x1.3425d2715ebe6p-4 },
- { 0x1.d77a71145a2dap-1, 0x1.51b8bd91b7915p-4 },
- { 0x1.d41c51166623ep-1, 0x1.6f15632c76a47p-4 },
- { 0x1.d0ca6ba0bb29fp-1, 0x1.8c3c88ecbe503p-4 },
- { 0x1.cd847e8e59681p-1, 0x1.a92ef077625dap-4 },
- { 0x1.ca4a499693e00p-1, 0x1.c5ed5745fa006p-4 },
- { 0x1.c71b8e399e821p-1, 0x1.e27876de1c993p-4 },
- { 0x1.c3f80faf19077p-1, 0x1.fed104fce4cdcp-4 },
- { 0x1.c0df92dc2b0ecp-1, 0x1.0d7bd9c17d78bp-3 },
- { 0x1.bdd1de3cbb542p-1, 0x1.1b76986cef97bp-3 },
- { 0x1.baceb9e1007a3p-1, 0x1.295913d24f750p-3 },
- { 0x1.b7d5ef543e55ep-1, 0x1.37239fa295d17p-3 },
- { 0x1.b4e749977d953p-1, 0x1.44d68dd78714bp-3 },
- { 0x1.b20295155478ep-1, 0x1.52722ebe5d780p-3 },
- { 0x1.af279f8e82be2p-1, 0x1.5ff6d12671f98p-3 },
- { 0x1.ac5638197fdf3p-1, 0x1.6d64c2389484bp-3 },
- { 0x1.a98e2f102e087p-1, 0x1.7abc4da40fddap-3 },
- { 0x1.a6cf5606d05c1p-1, 0x1.87fdbda1e8452p-3 },
- { 0x1.a4197fc04d746p-1, 0x1.95295b06a5f37p-3 },
- { 0x1.a16c80293dc01p-1, 0x1.a23f6d34abbc5p-3 },
- { 0x1.9ec82c4dc5bc9p-1, 0x1.af403a28e04f2p-3 },
- { 0x1.9c2c5a491f534p-1, 0x1.bc2c06a85721ap-3 },
- { 0x1.9998e1480b618p-1, 0x1.c903161240163p-3 },
- { 0x1.970d9977c6c2dp-1, 0x1.d5c5aa93287ebp-3 },
- { 0x1.948a5c023d212p-1, 0x1.e274051823fa9p-3 },
- { 0x1.920f0303d6809p-1, 0x1.ef0e656300c16p-3 },
- { 0x1.8f9b698a98b45p-1, 0x1.fb9509f05aa2ap-3 },
- { 0x1.8d2f6b81726f6p-1, 0x1.04041821f37afp-2 },
- { 0x1.8acae5bb55badp-1, 0x1.0a340a49b3029p-2 },
- { 0x1.886db5d9275b8p-1, 0x1.105a7918a126dp-2 },
- { 0x1.8617ba567c13cp-1, 0x1.1677819812b84p-2 },
- { 0x1.83c8d27487800p-1, 0x1.1c8b405b40c0ep-2 },
- { 0x1.8180de3c5dbe7p-1, 0x1.2295d16cfa6b1p-2 },
- { 0x1.7f3fbe71cdb71p-1, 0x1.28975066318a2p-2 },
- { 0x1.7d055498071c1p-1, 0x1.2e8fd855d86fcp-2 },
- { 0x1.7ad182e54f65ap-1, 0x1.347f83d605e59p-2 },
- { 0x1.78a42c3c90125p-1, 0x1.3a666d1244588p-2 },
- { 0x1.767d342f76944p-1, 0x1.4044adb6f8ec4p-2 },
- { 0x1.745c7ef26b00ap-1, 0x1.461a5f077558cp-2 },
- { 0x1.7241f15769d0fp-1, 0x1.4be799e20b9c8p-2 },
- { 0x1.702d70d396e41p-1, 0x1.51ac76a6b79dfp-2 },
- { 0x1.6e1ee3700cd11p-1, 0x1.57690d5744a45p-2 },
- { 0x1.6c162fc9cbe02p-1, 0x1.5d1d758e45217p-2 } }
-};
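The (invc, logc) pairs deleted above drive a table-based log: the top mantissa bits of the input select an entry with invc close to 1/c and logc = log(c), so the hard part of log(z) collapses to a tiny residual. A minimal sketch of that step, assuming the caller has already picked the entry (log1p stands in for the short polynomial the real code uses):

#include <math.h>

struct log_entry { double invc, logc; };

static double
log_with_table (double z, const struct log_entry *e)
{
  /* r is small because invc was chosen near 1/z's leading bits. */
  double r = z * e->invc - 1.0;
  /* log(z) = log(c) + log(z/c) = logc + log1p(r). */
  return e->logc + log1p (r);
}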
diff --git a/math/aarch64/v_logf.c b/math/aarch64/v_logf.c
deleted file mode 100644
index 66ebbbcd2b5a840b8a194cb18139ee585f67208a..0000000000000000000000000000000000000000
--- a/math/aarch64/v_logf.c
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * Single-precision vector log function.
- *
- * Copyright (c) 2019-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "mathlib.h"
-#include "v_math.h"
-
-static const struct data
-{
- uint32x4_t min_norm;
- uint16x8_t special_bound;
- float32x4_t poly[7];
- float32x4_t ln2, tiny_bound;
- uint32x4_t off, mantissa_mask;
-} data = {
- /* 3.34 ulp error. */
- .poly = { V4 (-0x1.3e737cp-3f), V4 (0x1.5a9aa2p-3f), V4 (-0x1.4f9934p-3f),
- V4 (0x1.961348p-3f), V4 (-0x1.00187cp-2f), V4 (0x1.555d7cp-2f),
- V4 (-0x1.ffffc8p-2f) },
- .ln2 = V4 (0x1.62e43p-1f),
- .tiny_bound = V4 (0x1p-126),
- .min_norm = V4 (0x00800000),
- .special_bound = V8 (0x7f00), /* asuint32(inf) - min_norm. */
- .off = V4 (0x3f2aaaab), /* 0.666667. */
- .mantissa_mask = V4 (0x007fffff)
-};
-
-#define P(i) d->poly[7 - i]
-
-static float32x4_t VPCS_ATTR NOINLINE
-special_case (float32x4_t x, float32x4_t y, float32x4_t r2, float32x4_t p,
- uint16x4_t cmp)
-{
- /* Fall back to scalar code. */
- return v_call_f32 (logf, x, vfmaq_f32 (p, y, r2), vmovl_u16 (cmp));
-}
-
-float32x4_t VPCS_ATTR V_NAME_F1 (log) (float32x4_t x)
-{
- const struct data *d = ptr_barrier (&data);
- float32x4_t n, p, q, r, r2, y;
- uint32x4_t u;
- uint16x4_t cmp;
-
- u = vreinterpretq_u32_f32 (x);
- cmp = vcge_u16 (vsubhn_u32 (u, d->min_norm),
- vget_low_u16 (d->special_bound));
-
- /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
- u = vsubq_u32 (u, d->off);
- n = vcvtq_f32_s32 (
- vshrq_n_s32 (vreinterpretq_s32_u32 (u), 23)); /* signextend. */
- u = vandq_u32 (u, d->mantissa_mask);
- u = vaddq_u32 (u, d->off);
- r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f));
-
- /* y = log(1+r) + n*ln2. */
- r2 = vmulq_f32 (r, r);
- /* n*ln2 + r + r2*(P1 + r*P2 + r2*(P3 + r*P4 + r2*(P5 + r*P6 + r2*P7))). */
- p = vfmaq_f32 (P (5), P (6), r);
- q = vfmaq_f32 (P (3), P (4), r);
- y = vfmaq_f32 (P (1), P (2), r);
- p = vfmaq_f32 (p, P (7), r2);
- q = vfmaq_f32 (q, p, r2);
- y = vfmaq_f32 (y, q, r2);
- p = vfmaq_f32 (r, d->ln2, n);
-
- if (unlikely (v_any_u16h (cmp)))
- return special_case (x, y, r2, p, cmp);
- return vfmaq_f32 (p, y, r2);
-}
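For reference, the exponent/mantissa split done above with NEON integer ops looks like this in scalar form; a sketch only, assuming IEEE-754 binary32, a positive normal input, and the same 0x3f2aaaab offset and ln2 constant as the vector code (log1pf stands in for the degree-7 polynomial):

#include <math.h>
#include <stdint.h>
#include <string.h>

static float
logf_reduce_sketch (float x)
{
  uint32_t ix;
  memcpy (&ix, &x, sizeof ix);          /* bit pattern of x. */
  uint32_t tmp = ix - 0x3f2aaaab;       /* bias so 2/3 <= 1+r < 4/3. */
  int32_t n = (int32_t) tmp >> 23;      /* arithmetic shift: exponent n. */
  uint32_t iz = (tmp & 0x007fffff) + 0x3f2aaaab;
  float z;
  memcpy (&z, &iz, sizeof z);           /* z = 1 + r. */
  /* log(x) = n*ln2 + log(1+r). */
  return (float) n * 0x1.62e43p-1f + log1pf (z - 1.0f);
}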
diff --git a/math/aarch64/v_math.h b/math/aarch64/v_math.h
deleted file mode 100644
index 1dc9916c6fb076fd0c3d5074f5d156d2d952b4f2..0000000000000000000000000000000000000000
--- a/math/aarch64/v_math.h
+++ /dev/null
@@ -1,135 +0,0 @@
-/*
- * Vector math abstractions.
- *
- * Copyright (c) 2019-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#ifndef _V_MATH_H
-#define _V_MATH_H
-
-#if !__aarch64__
-# error "Cannot build without AArch64"
-#endif
-
-#define VPCS_ATTR __attribute__ ((aarch64_vector_pcs))
-
-#define V_NAME_F1(fun) _ZGVnN4v_##fun##f
-#define V_NAME_D1(fun) _ZGVnN2v_##fun
-#define V_NAME_F2(fun) _ZGVnN4vv_##fun##f
-#define V_NAME_D2(fun) _ZGVnN2vv_##fun
-
-#include <stdint.h>
-#include "../math_config.h"
-#include <arm_neon.h>
-
-/* Shorthand helpers for declaring constants. */
-# define V2(X) { X, X }
-# define V4(X) { X, X, X, X }
-# define V8(X) { X, X, X, X, X, X, X, X }
-
-static inline int
-v_any_u16h (uint16x4_t x)
-{
- return vget_lane_u64 (vreinterpret_u64_u16 (x), 0) != 0;
-}
-
-static inline int
-v_lanes32 (void)
-{
- return 4;
-}
-
-static inline float32x4_t
-v_f32 (float x)
-{
- return (float32x4_t) V4 (x);
-}
-static inline uint32x4_t
-v_u32 (uint32_t x)
-{
- return (uint32x4_t) V4 (x);
-}
-/* true if any elements of a v_cond result is non-zero. */
-static inline int
-v_any_u32 (uint32x4_t x)
-{
- /* assume elements in x are either 0 or -1u. */
- return vpaddd_u64 (vreinterpretq_u64_u32 (x)) != 0;
-}
-static inline int
-v_any_u32h (uint32x2_t x)
-{
- return vget_lane_u64 (vreinterpret_u64_u32 (x), 0) != 0;
-}
-static inline float32x4_t
-v_lookup_f32 (const float *tab, uint32x4_t idx)
-{
- return (float32x4_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]};
-}
-static inline uint32x4_t
-v_lookup_u32 (const uint32_t *tab, uint32x4_t idx)
-{
- return (uint32x4_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]};
-}
-static inline float32x4_t
-v_call_f32 (float (*f) (float), float32x4_t x, float32x4_t y, uint32x4_t p)
-{
- return (float32x4_t){p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1],
- p[2] ? f (x[2]) : y[2], p[3] ? f (x[3]) : y[3]};
-}
-static inline float32x4_t
-v_call2_f32 (float (*f) (float, float), float32x4_t x1, float32x4_t x2,
- float32x4_t y, uint32x4_t p)
-{
- return (float32x4_t){p[0] ? f (x1[0], x2[0]) : y[0],
- p[1] ? f (x1[1], x2[1]) : y[1],
- p[2] ? f (x1[2], x2[2]) : y[2],
- p[3] ? f (x1[3], x2[3]) : y[3]};
-}
-
-static inline int
-v_lanes64 (void)
-{
- return 2;
-}
-static inline float64x2_t
-v_f64 (double x)
-{
- return (float64x2_t) V2 (x);
-}
-static inline uint64x2_t
-v_u64 (uint64_t x)
-{
- return (uint64x2_t) V2 (x);
-}
-/* true if any elements of a v_cond result is non-zero. */
-static inline int
-v_any_u64 (uint64x2_t x)
-{
- /* assume elements in x are either 0 or -1u. */
- return vpaddd_u64 (x) != 0;
-}
-static inline float64x2_t
-v_lookup_f64 (const double *tab, uint64x2_t idx)
-{
- return (float64x2_t){tab[idx[0]], tab[idx[1]]};
-}
-static inline uint64x2_t
-v_lookup_u64 (const uint64_t *tab, uint64x2_t idx)
-{
- return (uint64x2_t){tab[idx[0]], tab[idx[1]]};
-}
-static inline float64x2_t
-v_call_f64 (double (*f) (double), float64x2_t x, float64x2_t y, uint64x2_t p)
-{
- double p1 = p[1];
- double x1 = x[1];
- if (likely (p[0]))
- y[0] = f (x[0]);
- if (likely (p1))
- y[1] = f (x1);
- return y;
-}
-
-#endif
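The v_call_f32/v_call_f64 helpers above are the special-case escape hatch: lanes flagged by the comparison mask are recomputed with the scalar routine while the rest keep the fast vector result. A hypothetical caller, using the same GCC vector-subscript idiom as the header (sinf is just an example scalar fallback):

#include <arm_neon.h>
#include <math.h>

/* `special' holds 0 or all-ones per lane, as produced by vcgeq_*. */
static float32x4_t
fixup_lanes (float32x4_t x, float32x4_t fast, uint32x4_t special)
{
  return (float32x4_t){ special[0] ? sinf (x[0]) : fast[0],
                        special[1] ? sinf (x[1]) : fast[1],
                        special[2] ? sinf (x[2]) : fast[2],
                        special[3] ? sinf (x[3]) : fast[3] };
}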
diff --git a/math/aarch64/v_powf.c b/math/aarch64/v_powf.c
deleted file mode 100644
index 3a4163ab05582b387e87245bd4de77e9b93f9ac1..0000000000000000000000000000000000000000
--- a/math/aarch64/v_powf.c
+++ /dev/null
@@ -1,148 +0,0 @@
-/*
- * Single-precision vector powf function.
- *
- * Copyright (c) 2019-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "v_math.h"
-
-#define Min v_u32 (0x00800000)
-#define Max v_u32 (0x7f800000)
-#define Thresh v_u32 (0x7f000000) /* Max - Min. */
-#define MantissaMask v_u32 (0x007fffff)
-
-#define A data.log2_poly
-#define C data.exp2f_poly
-
-/* 2.6 ulp ~ 0.5 + 2^24 (128*Ln2*relerr_log2 + relerr_exp2). */
-#define Off v_u32 (0x3f35d000)
-
-#define V_POWF_LOG2_TABLE_BITS 5
-#define V_EXP2F_TABLE_BITS 5
-#define Log2IdxMask v_u32 ((1 << V_POWF_LOG2_TABLE_BITS) - 1)
-#define Scale ((double) (1 << V_EXP2F_TABLE_BITS))
-
-static const struct
-{
- struct
- {
- double invc, logc;
- } log2_tab[1 << V_POWF_LOG2_TABLE_BITS];
- double log2_poly[4];
- uint64_t exp2f_tab[1 << V_EXP2F_TABLE_BITS];
- double exp2f_poly[3];
-} data = {
- .log2_tab = {{0x1.6489890582816p+0, -0x1.e960f97b22702p-2 * Scale},
- {0x1.5cf19b35e3472p+0, -0x1.c993406cd4db6p-2 * Scale},
- {0x1.55aac0e956d65p+0, -0x1.aa711d9a7d0f3p-2 * Scale},
- {0x1.4eb0022977e01p+0, -0x1.8bf37bacdce9bp-2 * Scale},
- {0x1.47fcccda1dd1fp+0, -0x1.6e13b3519946ep-2 * Scale},
- {0x1.418ceabab68c1p+0, -0x1.50cb8281e4089p-2 * Scale},
- {0x1.3b5c788f1edb3p+0, -0x1.341504a237e2bp-2 * Scale},
- {0x1.3567de48e9c9ap+0, -0x1.17eaab624ffbbp-2 * Scale},
- {0x1.2fabc80fd19bap+0, -0x1.f88e708f8c853p-3 * Scale},
- {0x1.2a25200ce536bp+0, -0x1.c24b6da113914p-3 * Scale},
- {0x1.24d108e0152e3p+0, -0x1.8d02ee397cb1dp-3 * Scale},
- {0x1.1facd8ab2fbe1p+0, -0x1.58ac1223408b3p-3 * Scale},
- {0x1.1ab614a03efdfp+0, -0x1.253e6fd190e89p-3 * Scale},
- {0x1.15ea6d03af9ffp+0, -0x1.e5641882c12ffp-4 * Scale},
- {0x1.1147b994bb776p+0, -0x1.81fea712926f7p-4 * Scale},
- {0x1.0ccbf650593aap+0, -0x1.203e240de64a3p-4 * Scale},
- {0x1.0875408477302p+0, -0x1.8029b86a78281p-5 * Scale},
- {0x1.0441d42a93328p+0, -0x1.85d713190fb9p-6 * Scale},
- {0x1p+0, 0x0p+0 * Scale},
- {0x1.f1d006c855e86p-1, 0x1.4c1cc07312997p-5 * Scale},
- {0x1.e28c3341aa301p-1, 0x1.5e1848ccec948p-4 * Scale},
- {0x1.d4bdf9aa64747p-1, 0x1.04cfcb7f1196fp-3 * Scale},
- {0x1.c7b45a24e5803p-1, 0x1.582813d463c21p-3 * Scale},
- {0x1.bb5f5eb2ed60ap-1, 0x1.a936fa68760ccp-3 * Scale},
- {0x1.afb0bff8fe6b4p-1, 0x1.f81bc31d6cc4ep-3 * Scale},
- {0x1.a49badf7ab1f5p-1, 0x1.2279a09fae6b1p-2 * Scale},
- {0x1.9a14a111fc4c9p-1, 0x1.47ec0b6df5526p-2 * Scale},
- {0x1.901131f5b2fdcp-1, 0x1.6c71762280f1p-2 * Scale},
- {0x1.8687f73f6d865p-1, 0x1.90155070798dap-2 * Scale},
- {0x1.7d7067eb77986p-1, 0x1.b2e23b1d3068cp-2 * Scale},
- {0x1.74c2c1cf97b65p-1, 0x1.d4e21b0daa86ap-2 * Scale},
- {0x1.6c77f37cff2a1p-1, 0x1.f61e2a2f67f3fp-2 * Scale},},
- .log2_poly = { /* rel err: 1.5 * 2^-30. */
- -0x1.6ff5daa3b3d7cp-2 * Scale, 0x1.ec81d03c01aebp-2 * Scale,
- -0x1.71547bb43f101p-1 * Scale, 0x1.7154764a815cbp0 * Scale,},
- .exp2f_tab = {0x3ff0000000000000, 0x3fefd9b0d3158574, 0x3fefb5586cf9890f,
- 0x3fef9301d0125b51, 0x3fef72b83c7d517b, 0x3fef54873168b9aa,
- 0x3fef387a6e756238, 0x3fef1e9df51fdee1, 0x3fef06fe0a31b715,
- 0x3feef1a7373aa9cb, 0x3feedea64c123422, 0x3feece086061892d,
- 0x3feebfdad5362a27, 0x3feeb42b569d4f82, 0x3feeab07dd485429,
- 0x3feea47eb03a5585, 0x3feea09e667f3bcd, 0x3fee9f75e8ec5f74,
- 0x3feea11473eb0187, 0x3feea589994cce13, 0x3feeace5422aa0db,
- 0x3feeb737b0cdc5e5, 0x3feec49182a3f090, 0x3feed503b23e255d,
- 0x3feee89f995ad3ad, 0x3feeff76f2fb5e47, 0x3fef199bdd85529c,
- 0x3fef3720dcef9069, 0x3fef5818dcfba487, 0x3fef7c97337b9b5f,
- 0x3fefa4afa2a490da, 0x3fefd0765b6e4540,},
- .exp2f_poly = { /* rel err: 1.69 * 2^-34. */
- 0x1.c6af84b912394p-5 / Scale / Scale / Scale,
- 0x1.ebfce50fac4f3p-3 / Scale / Scale,
- 0x1.62e42ff0c52d6p-1 / Scale}};
-
-static float32x4_t VPCS_ATTR NOINLINE
-special_case (float32x4_t x, float32x4_t y, float32x4_t ret, uint32x4_t cmp)
-{
- return v_call2_f32 (powf, x, y, ret, cmp);
-}
-
-float32x4_t VPCS_ATTR V_NAME_F2 (pow) (float32x4_t x, float32x4_t y)
-{
- uint32x4_t u = vreinterpretq_u32_f32 (x);
- uint32x4_t cmp = vcgeq_u32 (vsubq_u32 (u, Min), Thresh);
- uint32x4_t tmp = vsubq_u32 (u, Off);
- uint32x4_t i = vandq_u32 (vshrq_n_u32 (tmp, (23 - V_POWF_LOG2_TABLE_BITS)),
- Log2IdxMask);
- uint32x4_t top = vbicq_u32 (tmp, MantissaMask);
- uint32x4_t iz = vsubq_u32 (u, top);
- int32x4_t k = vshrq_n_s32 (vreinterpretq_s32_u32 (top),
- 23 - V_EXP2F_TABLE_BITS); /* arithmetic shift. */
-
- float32x4_t ret;
- for (int lane = 0; lane < 4; lane++)
- {
- /* Use double precision for each lane. */
- double invc = data.log2_tab[i[lane]].invc;
- double logc = data.log2_tab[i[lane]].logc;
- double z = (double) asfloat (iz[lane]);
-
- /* log2(x) = log1p(z/c-1)/ln2 + log2(c) + k. */
- double r = __builtin_fma (z, invc, -1.0);
- double y0 = logc + (double) k[lane];
-
- /* Polynomial to approximate log1p(r)/ln2. */
- double logx = A[0];
- logx = r * logx + A[1];
- logx = r * logx + A[2];
- logx = r * logx + A[3];
- logx = r * logx + y0;
- double ylogx = y[lane] * logx;
- cmp[lane] = (asuint64 (ylogx) >> 47 & 0xffff)
- >= asuint64 (126.0 * (1 << V_EXP2F_TABLE_BITS)) >> 47
- ? 1
- : cmp[lane];
-
- /* N*x = k + r with r in [-1/2, 1/2]. */
- double kd = round (ylogx);
- uint64_t ki = lround (ylogx);
- r = ylogx - kd;
-
- /* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1). */
- uint64_t t = data.exp2f_tab[ki % (1 << V_EXP2F_TABLE_BITS)];
- t += ki << (52 - V_EXP2F_TABLE_BITS);
- double s = asdouble (t);
- double p = C[0];
- p = __builtin_fma (p, r, C[1]);
- p = __builtin_fma (p, r, C[2]);
- p = __builtin_fma (p, s * r, s);
-
- ret[lane] = p;
- }
- if (unlikely (v_any_u32 (cmp)))
- return special_case (x, y, ret, cmp);
- return ret;
-}
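The routine above is built on pow(x, y) = 2^(y*log2(x)), with log2 read from the (invc, logc) table and 2^r from exp2f_tab, both evaluated per lane in double precision. Stripped of tables and special cases, the identity alone is (a sketch, valid only for x > 0 and in-range results):

#include <math.h>

static float
powf_identity_sketch (float x, float y)
{
  /* Double precision keeps y*log2(x) accurate enough for a float result. */
  return (float) exp2 ((double) y * log2 ((double) x));
}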
diff --git a/math/aarch64/v_sin.c b/math/aarch64/v_sin.c
deleted file mode 100644
index 04129c31133d62dcecedf832b4e410b5217a51a2..0000000000000000000000000000000000000000
--- a/math/aarch64/v_sin.c
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- * Double-precision vector sin function.
- *
- * Copyright (c) 2019-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "mathlib.h"
-#include "v_math.h"
-
-static const struct data
-{
- float64x2_t poly[7];
- float64x2_t range_val, inv_pi, shift, pi_1, pi_2, pi_3;
-} data = {
- .poly = { V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7),
- V2 (-0x1.a01a019936f27p-13), V2 (0x1.71de37a97d93ep-19),
- V2 (-0x1.ae633919987c6p-26), V2 (0x1.60e277ae07cecp-33),
- V2 (-0x1.9e9540300a1p-41) },
-
- .range_val = V2 (0x1p23),
- .inv_pi = V2 (0x1.45f306dc9c883p-2),
- .pi_1 = V2 (0x1.921fb54442d18p+1),
- .pi_2 = V2 (0x1.1a62633145c06p-53),
- .pi_3 = V2 (0x1.c1cd129024e09p-106),
- .shift = V2 (0x1.8p52),
-};
-
-#if WANT_SIMD_EXCEPT
-# define TinyBound v_u64 (0x3000000000000000) /* asuint64 (0x1p-255). */
-# define Thresh v_u64 (0x1160000000000000) /* RangeVal - TinyBound. */
-#endif
-
-#define C(i) d->poly[i]
-
-static float64x2_t VPCS_ATTR NOINLINE
-special_case (float64x2_t x, float64x2_t y, uint64x2_t odd, uint64x2_t cmp)
-{
- y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
- return v_call_f64 (sin, x, y, cmp);
-}
-
-/* Vector (AdvSIMD) sin approximation.
- Maximum observed error in [-pi/2, pi/2], where argument is not reduced,
- is 2.87 ULP:
- _ZGVnN2v_sin (0x1.921d5c6a07142p+0) got 0x1.fffffffa7dc02p-1
- want 0x1.fffffffa7dc05p-1
- Maximum observed error in the entire non-special domain ([-2^23, 2^23])
- is 3.22 ULP:
- _ZGVnN2v_sin (0x1.5702447b6f17bp+22) got 0x1.ffdcd125c84fbp-3
- want 0x1.ffdcd125c84f8p-3. */
-float64x2_t VPCS_ATTR V_NAME_D1 (sin) (float64x2_t x)
-{
- const struct data *d = ptr_barrier (&data);
- float64x2_t n, r, r2, r3, r4, y, t1, t2, t3;
- uint64x2_t odd, cmp;
-
-#if WANT_SIMD_EXCEPT
- /* Detect |x| <= TinyBound or |x| >= RangeVal. If fenv exceptions are to be
- triggered correctly, set any special lanes to 1 (which is neutral w.r.t.
- fenv). These lanes will be fixed by special-case handler later. */
- uint64x2_t ir = vreinterpretq_u64_f64 (vabsq_f64 (x));
- cmp = vcgeq_u64 (vsubq_u64 (ir, TinyBound), Thresh);
- r = vbslq_f64 (cmp, vreinterpretq_f64_u64 (cmp), x);
-#else
- r = x;
- cmp = vcageq_f64 (x, d->range_val);
-#endif
-
- /* n = rint(|x|/pi). */
- n = vfmaq_f64 (d->shift, d->inv_pi, r);
- odd = vshlq_n_u64 (vreinterpretq_u64_f64 (n), 63);
- n = vsubq_f64 (n, d->shift);
-
- /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
- r = vfmsq_f64 (r, d->pi_1, n);
- r = vfmsq_f64 (r, d->pi_2, n);
- r = vfmsq_f64 (r, d->pi_3, n);
-
- /* sin(r) poly approx. */
- r2 = vmulq_f64 (r, r);
- r3 = vmulq_f64 (r2, r);
- r4 = vmulq_f64 (r2, r2);
-
- t1 = vfmaq_f64 (C (4), C (5), r2);
- t2 = vfmaq_f64 (C (2), C (3), r2);
- t3 = vfmaq_f64 (C (0), C (1), r2);
-
- y = vfmaq_f64 (t1, C (6), r4);
- y = vfmaq_f64 (t2, y, r4);
- y = vfmaq_f64 (t3, y, r4);
- y = vfmaq_f64 (r, y, r3);
-
- if (unlikely (v_any_u64 (cmp)))
- return special_case (x, y, odd, cmp);
- return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
-}
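The n = rint(x/pi) step above uses the classic shift trick: adding 0x1.8p52 pushes the value into a binade where the ULP is 1, so the FP add itself rounds to an integer, and the low stored bit of the sum doubles as the parity flag `odd`. A scalar sketch, assuming round-to-nearest and |z| < 2^51:

static double
rint_via_shift (double z)
{
  volatile double shift = 0x1.8p52;  /* volatile: keep the add/sub pair. */
  double n = z + shift;              /* low mantissa bit = parity of n. */
  return n - shift;                  /* rint (z). */
}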
diff --git a/math/aarch64/v_sinf.c b/math/aarch64/v_sinf.c
deleted file mode 100644
index 336879844459f70accf8f2532407db6fc6810e69..0000000000000000000000000000000000000000
--- a/math/aarch64/v_sinf.c
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Single-precision vector sin function.
- *
- * Copyright (c) 2019-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "mathlib.h"
-#include "v_math.h"
-
-static const struct data
-{
- float32x4_t poly[4];
- float32x4_t range_val, inv_pi, shift, pi_1, pi_2, pi_3;
-} data = {
- /* 1.886 ulp error. */
- .poly = { V4 (-0x1.555548p-3f), V4 (0x1.110df4p-7f), V4 (-0x1.9f42eap-13f),
- V4 (0x1.5b2e76p-19f) },
-
- .pi_1 = V4 (0x1.921fb6p+1f),
- .pi_2 = V4 (-0x1.777a5cp-24f),
- .pi_3 = V4 (-0x1.ee59dap-49f),
-
- .inv_pi = V4 (0x1.45f306p-2f),
- .shift = V4 (0x1.8p+23f),
- .range_val = V4 (0x1p20f)
-};
-
-#if WANT_SIMD_EXCEPT
-# define TinyBound v_u32 (0x21000000) /* asuint32(0x1p-61f). */
-# define Thresh v_u32 (0x28800000) /* RangeVal - TinyBound. */
-#endif
-
-#define C(i) d->poly[i]
-
-static float32x4_t VPCS_ATTR NOINLINE
-special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp)
-{
- /* Fall back to scalar code. */
- y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
- return v_call_f32 (sinf, x, y, cmp);
-}
-
-float32x4_t VPCS_ATTR V_NAME_F1 (sin) (float32x4_t x)
-{
- const struct data *d = ptr_barrier (&data);
- float32x4_t n, r, r2, y;
- uint32x4_t odd, cmp;
-
-#if WANT_SIMD_EXCEPT
- uint32x4_t ir = vreinterpretq_u32_f32 (vabsq_f32 (x));
- cmp = vcgeq_u32 (vsubq_u32 (ir, TinyBound), Thresh);
- /* If fenv exceptions are to be triggered correctly, set any special lanes
- to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by
- special-case handler later. */
- r = vbslq_f32 (cmp, vreinterpretq_f32_u32 (cmp), x);
-#else
- r = x;
- cmp = vcageq_f32 (x, d->range_val);
-#endif
-
- /* n = rint(|x|/pi) */
- n = vfmaq_f32 (d->shift, d->inv_pi, r);
- odd = vshlq_n_u32 (vreinterpretq_u32_f32 (n), 31);
- n = vsubq_f32 (n, d->shift);
-
- /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2) */
- r = vfmsq_f32 (r, d->pi_1, n);
- r = vfmsq_f32 (r, d->pi_2, n);
- r = vfmsq_f32 (r, d->pi_3, n);
-
- /* y = sin(r) */
- r2 = vmulq_f32 (r, r);
- y = vfmaq_f32 (C (2), C (3), r2);
- y = vfmaq_f32 (C (1), y, r2);
- y = vfmaq_f32 (C (0), y, r2);
- y = vfmaq_f32 (r, vmulq_f32 (y, r2), r);
-
- if (unlikely (v_any_u32 (cmp)))
- return special_case (x, y, odd, cmp);
- return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
-}
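After reduction, sin(x) = (-1)^n * sin(r); the vector code applies the (-1)^n with a single XOR by shifting the parity bit of n into the float sign bit. The same move in scalar form (a sketch, assuming binary32):

#include <stdint.h>
#include <string.h>

static float
flip_sign_if_odd (float y, uint32_t n)
{
  uint32_t iy, odd = n << 31;   /* bit 0 of n -> sign-bit position. */
  memcpy (&iy, &y, sizeof iy);
  iy ^= odd;                    /* negate y iff n is odd. */
  memcpy (&y, &iy, sizeof y);
  return y;
}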
diff --git a/math/cosf.c b/math/cosf.c
index 6293ce8f1b7d6bc0d0a515bb07339b1e364a0c27..f29f19474e230327f439da21eb0661e53bfaa1fe 100644
--- a/math/cosf.c
+++ b/math/cosf.c
@@ -1,8 +1,8 @@
/*
* Single-precision cos function.
*
- * Copyright (c) 2018-2021, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * Copyright (c) 2018-2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
*/
#include <math.h>
@@ -22,7 +22,7 @@ cosf (float y)
int n;
const sincos_t *p = &__sincosf_table[0];
- if (abstop12 (y) < abstop12 (pio4f))
+ if (abstop12 (y) < abstop12 (pio4))
{
double x2 = x * x;
diff --git a/math/erf.c b/math/erf.c
index 5f9f40dda26434e314e4d141d84868b2d3b9c1f6..12d7e5160df702ab10ff1ae5da5604c927e54372 100644
--- a/math/erf.c
+++ b/math/erf.c
@@ -2,7 +2,7 @@
* Double-precision erf(x) function.
*
* Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * SPDX-License-Identifier: MIT
*/
#include "math_config.h"
diff --git a/math/erf_data.c b/math/erf_data.c
index 10cf1fae93e078c2636409318f91078931c443bf..807875bdd7f5db86ad3557c9c36c7afd93c07ca0 100644
--- a/math/erf_data.c
+++ b/math/erf_data.c
@@ -2,7 +2,7 @@
* Shared data between erf and erfc.
*
* Copyright (c) 2019-2020, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * SPDX-License-Identifier: MIT
*/
#include "math_config.h"
diff --git a/math/erff.c b/math/erff.c
index 9fa476dbbab2d72299486163eaeb7f5676a7b040..a58e82565dc34745500197c469d7f2ea9ec1f71b 100644
--- a/math/erff.c
+++ b/math/erff.c
@@ -2,7 +2,7 @@
* Single-precision erf(x) function.
*
* Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * SPDX-License-Identifier: MIT
*/
#include <math.h>
diff --git a/math/erff_data.c b/math/erff_data.c
index f822788d0dd8068b17dc84ac3204349e21b4f34d..fa6b1ef4dedbfe7bafe493aa7c0dc007174fe704 100644
--- a/math/erff_data.c
+++ b/math/erff_data.c
@@ -2,7 +2,7 @@
* Data for approximation of erff.
*
* Copyright (c) 2019-2020, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * SPDX-License-Identifier: MIT
*/
#include "math_config.h"
diff --git a/math/exp.c b/math/exp.c
index 1de500c31f3ed08468b4e712fd3f7ea28e8a137e..7f5024cd8792144fe2681f1a60e297d405b9ea06 100644
--- a/math/exp.c
+++ b/math/exp.c
@@ -2,7 +2,7 @@
* Double-precision e^x function.
*
* Copyright (c) 2018-2019, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * SPDX-License-Identifier: MIT
*/
#include <math.h>
diff --git a/math/exp10.c b/math/exp10.c
deleted file mode 100644
index 0fbec4c694ca831797d96968fc881a87aaf93644..0000000000000000000000000000000000000000
--- a/math/exp10.c
+++ /dev/null
@@ -1,129 +0,0 @@
-/*
- * Double-precision 10^x function.
- *
- * Copyright (c) 2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "math_config.h"
-
-#define N (1 << EXP_TABLE_BITS)
-#define IndexMask (N - 1)
-#define OFlowBound 0x1.34413509f79ffp8 /* log10(DBL_MAX). */
-#define UFlowBound -0x1.5ep+8 /* -350. */
-#define SmallTop 0x3c6 /* top12(0x1p-57). */
-#define BigTop 0x407 /* top12(0x1p8). */
-#define Thresh 0x41 /* BigTop - SmallTop. */
-#define Shift __exp_data.shift
-#define C(i) __exp_data.exp10_poly[i]
-
-static double
-special_case (uint64_t sbits, double_t tmp, uint64_t ki)
-{
- double_t scale, y;
-
- if (ki - (1ull << 16) < 0x80000000)
- {
- /* The exponent of scale might have overflowed by 1. */
- sbits -= 1ull << 52;
- scale = asdouble (sbits);
- y = 2 * (scale + scale * tmp);
- return check_oflow (eval_as_double (y));
- }
-
- /* n < 0, need special care in the subnormal range. */
- sbits += 1022ull << 52;
- scale = asdouble (sbits);
- y = scale + scale * tmp;
-
- if (y < 1.0)
- {
- /* Round y to the right precision before scaling it into the subnormal
- range to avoid double rounding that can cause 0.5+E/2 ulp error where
- E is the worst-case ulp error outside the subnormal range. So this
- is only useful if the goal is better than 1 ulp worst-case error. */
- double_t lo = scale - y + scale * tmp;
- double_t hi = 1.0 + y;
- lo = 1.0 - hi + y + lo;
- y = eval_as_double (hi + lo) - 1.0;
- /* Avoid -0.0 with downward rounding. */
- if (WANT_ROUNDING && y == 0.0)
- y = 0.0;
- /* The underflow exception needs to be signaled explicitly. */
- force_eval_double (opt_barrier_double (0x1p-1022) * 0x1p-1022);
- }
- y = 0x1p-1022 * y;
-
- return check_uflow (y);
-}
-
-/* Double-precision 10^x approximation. Largest observed error is ~0.513 ULP. */
-double
-exp10 (double x)
-{
- uint64_t ix = asuint64 (x);
- uint32_t abstop = (ix >> 52) & 0x7ff;
-
- if (unlikely (abstop - SmallTop >= Thresh))
- {
- if (abstop - SmallTop >= 0x80000000)
- /* Avoid spurious underflow for tiny x.
- Note: 0 is common input. */
- return x + 1;
- if (abstop == 0x7ff)
- return ix == asuint64 (-INFINITY) ? 0.0 : x + 1.0;
- if (x >= OFlowBound)
- return __math_oflow (0);
- if (x < UFlowBound)
- return __math_uflow (0);
-
- /* Large x is special-cased below. */
- abstop = 0;
- }
-
- /* Reduce x: z = x * N / log10(2), k = round(z). */
- double_t z = __exp_data.invlog10_2N * x;
- double_t kd;
- int64_t ki;
-#if TOINT_INTRINSICS
- kd = roundtoint (z);
- ki = converttoint (z);
-#else
- kd = eval_as_double (z + Shift);
- kd -= Shift;
- ki = kd;
-#endif
-
- /* r = x - k * log10(2), r in [-0.5, 0.5]. */
- double_t r = x;
- r = __exp_data.neglog10_2hiN * kd + r;
- r = __exp_data.neglog10_2loN * kd + r;
-
- /* exp10(x) = 2^(k/N) * 2^(r/N).
- Approximate the two components separately. */
-
- /* s = 2^(k/N), using lookup table. */
- uint64_t e = ki << (52 - EXP_TABLE_BITS);
- uint64_t i = (ki & IndexMask) * 2;
- uint64_t u = __exp_data.tab[i + 1];
- uint64_t sbits = u + e;
-
- double_t tail = asdouble (__exp_data.tab[i]);
-
- /* 2^(r/N) ~= 1 + r * Poly(r). */
- double_t r2 = r * r;
- double_t p = C (0) + r * C (1);
- double_t y = C (2) + r * C (3);
- y = y + r2 * C (4);
- y = p + r2 * y;
- y = tail + y * r;
-
- if (unlikely (abstop == 0))
- return special_case (sbits, y, ki);
-
- /* Assemble components:
- y = 2^(r/N) * 2^(k/N)
- ~= (y + 1) * s. */
- double_t s = asdouble (sbits);
- return eval_as_double (s * y + s);
-}
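The reduction above splits 10^x as 2^(k/N) * 10^r with k = round(x*N/log10(2)) and r = x - k*log10(2)/N, so 2^(k/N) comes from the shared exp table and only the tiny 10^r needs a polynomial. A table-free sketch of the same decomposition, assuming round-to-nearest and no overflow (a Taylor series in place of the tuned Remez polynomial):

#include <math.h>

static double
exp10_sketch (double x)
{
  const int N = 128;                       /* 1 << EXP_TABLE_BITS. */
  const double log10_2 = 0.30102999566398120;
  double k = round (x * (N / log10_2));
  double r = x - k * (log10_2 / N);        /* |r| <= log10(2)/(2N). */
  double t = r * 2.302585092994045684;     /* r * ln(10). */
  /* 10^r = e^t ~= 1 + t + t^2/2 + t^3/6 + t^4/24 for tiny t. */
  double p = 1 + t * (1 + t * (0.5 + t * (1.0 / 6 + t / 24)));
  return exp2 (k / N) * p;
}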
diff --git a/math/exp2.c b/math/exp2.c
index a1eee44f1f4828b7fb9f133227e8b0e808f83788..35ab39f22ed5fcb0442c2fb84eea80ff95540fe2 100644
--- a/math/exp2.c
+++ b/math/exp2.c
@@ -2,7 +2,7 @@
* Double-precision 2^x function.
*
* Copyright (c) 2018-2019, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * SPDX-License-Identifier: MIT
*/
#include <math.h>
diff --git a/math/exp2f.c b/math/exp2f.c
index 776c3ddf76636a75b24de080ac9fde62eed642d8..94b32538aa0de9c7e47ea3df9a5c60b7851bbeed 100644
--- a/math/exp2f.c
+++ b/math/exp2f.c
@@ -2,7 +2,7 @@
* Single-precision 2^x function.
*
* Copyright (c) 2017-2018, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * SPDX-License-Identifier: MIT
*/
#include <math.h>
diff --git a/math/exp2f_data.c b/math/exp2f_data.c
index f0cb7fccacd158e0a771e3c3cb7ea4847896c149..3fb0ad11b15a4e387b91778ea2dc31faa1903bfa 100644
--- a/math/exp2f_data.c
+++ b/math/exp2f_data.c
@@ -2,7 +2,7 @@
* Shared data between expf, exp2f and powf.
*
* Copyright (c) 2017-2018, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * SPDX-License-Identifier: MIT
*/
#include "math_config.h"
diff --git a/math/exp_data.c b/math/exp_data.c
index 9df4e0b1a2bb9ccbb2c21deb23787323bcfce88d..cba76832566f04cc100bd153da745a6a57d30faf 100644
--- a/math/exp_data.c
+++ b/math/exp_data.c
@@ -2,7 +2,7 @@
* Shared data between exp, exp2 and pow.
*
* Copyright (c) 2018, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * SPDX-License-Identifier: MIT
*/
#include "math_config.h"
@@ -12,7 +12,6 @@
const struct exp_data __exp_data = {
// N/ln2
.invln2N = 0x1.71547652b82fep0 * N,
-.invlog10_2N = 0x1.a934f0979a371p1 * N,
// -ln2/N
#if N == 64
.negln2hiN = -0x1.62e42fefa0000p-7,
@@ -27,8 +26,6 @@ const struct exp_data __exp_data = {
.negln2hiN = -0x1.62e42fef80000p-10,
.negln2loN = -0x1.1cf79abc9e3b4p-45,
#endif
-.neglog10_2hiN = -0x1.3441350ap-2 / N,
-.neglog10_2loN = 0x1.0c0219dc1da99p-39 / N,
// Used for rounding when !TOINT_INTRINSICS
#if EXP_USE_TOINT_NARROW
.shift = 0x1800000000.8p0,
@@ -150,24 +147,6 @@ const struct exp_data __exp_data = {
0x1.3b2ab786ee1dap-7,
#endif
},
-.exp10_poly = {
-#if EXP10_POLY_WIDE
-/* Range is wider if using shift-based reduction: coeffs generated
- using Remez in [-log10(2)/128, log10(2)/128 ]. */
-0x1.26bb1bbb55515p1,
-0x1.53524c73cd32bp1,
-0x1.0470591e1a108p1,
-0x1.2bd77b12fe9a8p0,
-0x1.14289fef24b78p-1
-#else
-/* Coeffs generated using Remez in [-log10(2)/256, log10(2)/256 ]. */
-0x1.26bb1bbb55516p1,
-0x1.53524c73ce9fep1,
-0x1.0470591ce4b26p1,
-0x1.2bd76577fe684p0,
-0x1.1446eeccd0efbp-1
-#endif
-},
// 2^(k/N) ~= H[k]*(1 + T[k]) for int k in [0,N)
// tab[2*k] = asuint64(T[k])
// tab[2*k+1] = asuint64(H[k]) - (k << 52)/N
diff --git a/math/expf.c b/math/expf.c
index 08a20d59e49145ab8ae0099c1bda89ab6cad0752..9b2f0c3d8c56c98d8e9d37d45143b713cb92e570 100644
--- a/math/expf.c
+++ b/math/expf.c
@@ -2,7 +2,7 @@
* Single-precision e^x function.
*
* Copyright (c) 2017-2019, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * SPDX-License-Identifier: MIT
*/
#include <math.h>
diff --git a/math/include/mathlib.h b/math/include/mathlib.h
index 64cbb9c1f8506eca4fc7bf0ccf9c2991b4663b06..279d829d8ea15acae38ae51ada3fa74f3920f7f5 100644
--- a/math/include/mathlib.h
+++ b/math/include/mathlib.h
@@ -1,8 +1,8 @@
/*
* Public API.
*
- * Copyright (c) 2015-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * Copyright (c) 2015-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
*/
#ifndef _MATHLIB_H
@@ -18,33 +18,74 @@ float cosf (float);
void sincosf (float, float*, float*);
double exp (double);
-double exp10 (double);
double exp2 (double);
double log (double);
double log2 (double);
double pow (double, double);
+/* Scalar functions using the vector algorithm with identical result. */
+float __s_sinf (float);
+float __s_cosf (float);
+float __s_expf (float);
+float __s_expf_1u (float);
+float __s_exp2f (float);
+float __s_exp2f_1u (float);
+float __s_logf (float);
+float __s_powf (float, float);
+double __s_sin (double);
+double __s_cos (double);
+double __s_exp (double);
+double __s_log (double);
+double __s_pow (double, double);
+
#if __aarch64__
-# if __GNUC__ >= 5
+#if __GNUC__ >= 5
typedef __Float32x4_t __f32x4_t;
typedef __Float64x2_t __f64x2_t;
-# elif __clang_major__*100+__clang_minor__ >= 305
+#elif __clang_major__*100+__clang_minor__ >= 305
typedef __attribute__((__neon_vector_type__(4))) float __f32x4_t;
typedef __attribute__((__neon_vector_type__(2))) double __f64x2_t;
-# else
-# error Unsupported compiler
-# endif
+#else
+#error Unsupported compiler
+#endif
+
+/* Vector functions following the base PCS. */
+__f32x4_t __v_sinf (__f32x4_t);
+__f32x4_t __v_cosf (__f32x4_t);
+__f32x4_t __v_expf (__f32x4_t);
+__f32x4_t __v_expf_1u (__f32x4_t);
+__f32x4_t __v_exp2f (__f32x4_t);
+__f32x4_t __v_exp2f_1u (__f32x4_t);
+__f32x4_t __v_logf (__f32x4_t);
+__f32x4_t __v_powf (__f32x4_t, __f32x4_t);
+__f64x2_t __v_sin (__f64x2_t);
+__f64x2_t __v_cos (__f64x2_t);
+__f64x2_t __v_exp (__f64x2_t);
+__f64x2_t __v_log (__f64x2_t);
+__f64x2_t __v_pow (__f64x2_t, __f64x2_t);
-# if __GNUC__ >= 9 || __clang_major__ >= 8
-# undef __vpcs
-# define __vpcs __attribute__((__aarch64_vector_pcs__))
+#if __GNUC__ >= 9 || __clang_major__ >= 8
+#define __vpcs __attribute__((__aarch64_vector_pcs__))
+
+/* Vector functions following the vector PCS. */
+__vpcs __f32x4_t __vn_sinf (__f32x4_t);
+__vpcs __f32x4_t __vn_cosf (__f32x4_t);
+__vpcs __f32x4_t __vn_expf (__f32x4_t);
+__vpcs __f32x4_t __vn_expf_1u (__f32x4_t);
+__vpcs __f32x4_t __vn_exp2f (__f32x4_t);
+__vpcs __f32x4_t __vn_exp2f_1u (__f32x4_t);
+__vpcs __f32x4_t __vn_logf (__f32x4_t);
+__vpcs __f32x4_t __vn_powf (__f32x4_t, __f32x4_t);
+__vpcs __f64x2_t __vn_sin (__f64x2_t);
+__vpcs __f64x2_t __vn_cos (__f64x2_t);
+__vpcs __f64x2_t __vn_exp (__f64x2_t);
+__vpcs __f64x2_t __vn_log (__f64x2_t);
+__vpcs __f64x2_t __vn_pow (__f64x2_t, __f64x2_t);
/* Vector functions following the vector PCS using ABI names. */
__vpcs __f32x4_t _ZGVnN4v_sinf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_cosf (__f32x4_t);
-__vpcs __f32x4_t _ZGVnN4v_expf_1u (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_expf (__f32x4_t);
-__vpcs __f32x4_t _ZGVnN4v_exp2f_1u (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_exp2f (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_logf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4vv_powf (__f32x4_t, __f32x4_t);
@@ -53,7 +94,7 @@ __vpcs __f64x2_t _ZGVnN2v_cos (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_exp (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_log (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2vv_pow (__f64x2_t, __f64x2_t);
-# endif
+#endif
#endif
#endif
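The _ZGVnN4v_/_ZGVnN2v_ names retained above follow the AArch64 vector-function ABI mangling: 'n' for AdvSIMD, 'N' for unmasked, 4 or 2 for the lane count, and one 'v' per vector argument, which is what lets compilers redirect auto-vectorized scalar calls to these entry points. Calling one directly looks like this (assumes arm_neon.h; the declaration matches the header above):

#include <arm_neon.h>

__attribute__ ((__aarch64_vector_pcs__))
float32x4_t _ZGVnN4v_sinf (float32x4_t);

static float32x4_t
sin4 (float32x4_t x)
{
  return _ZGVnN4v_sinf (x);   /* four sines in one call. */
}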
diff --git a/math/log.c b/math/log.c
index 43dfc2a744f060f8ebe9a4b25fb8da0367070d5e..d3b7bc60747c2ace661ed1885669b1ab763e4dd2 100644
--- a/math/log.c
+++ b/math/log.c
@@ -2,7 +2,7 @@
* Double-precision log(x) function.
*
* Copyright (c) 2018-2019, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * SPDX-License-Identifier: MIT
*/
#include <math.h>
diff --git a/math/log2.c b/math/log2.c
index 3f9c21b0396263dd8274b252ffb4b1669e03ef18..55102b7729696324f1f2afb4cf4cd89fbd06c034 100644
--- a/math/log2.c
+++ b/math/log2.c
@@ -2,7 +2,7 @@
* Double-precision log2(x) function.
*
* Copyright (c) 2018-2019, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * SPDX-License-Identifier: MIT
*/
#include <math.h>
diff --git a/math/log2_data.c b/math/log2_data.c
index 293bd7df4118b08a69b4d0ff3bcc65917c628a73..3fc9b47c1f03868c950cac77bcc28e552fbf411a 100644
--- a/math/log2_data.c
+++ b/math/log2_data.c
@@ -2,7 +2,7 @@
* Data for log2.
*
* Copyright (c) 2018, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * SPDX-License-Identifier: MIT
*/
#include "math_config.h"
diff --git a/math/log2f.c b/math/log2f.c
index 0a44fa2024f60639c34a1ce06a7b5d4eb77b09c6..acb629e6846cf3b94f665bca351d93098fb543a3 100644
--- a/math/log2f.c
+++ b/math/log2f.c
@@ -2,7 +2,7 @@
* Single-precision log2 function.
*
* Copyright (c) 2017-2018, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * SPDX-License-Identifier: MIT
*/
#include <math.h>
diff --git a/math/log2f_data.c b/math/log2f_data.c
index 4866ef7f8171e67f36f16e41abfc858173326ab6..f3546d730abab682f5b6e81adeb2064ef9357ba4 100644
--- a/math/log2f_data.c
+++ b/math/log2f_data.c
@@ -2,7 +2,7 @@
* Data definition for log2f.
*
* Copyright (c) 2017-2018, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * SPDX-License-Identifier: MIT
*/
#include "math_config.h"
diff --git a/math/log_data.c b/math/log_data.c
index 3ecc1f40a8228d5e13438a51ac0b615392e268fb..96a098d42c160e9d8713e565e35bf8901183528d 100644
--- a/math/log_data.c
+++ b/math/log_data.c
@@ -2,7 +2,7 @@
* Data for log.
*
* Copyright (c) 2018, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * SPDX-License-Identifier: MIT
*/
#include "math_config.h"
diff --git a/math/logf.c b/math/logf.c
index 820f74c3e66a7078f78d39a326aade89251894ee..cfbaee12df108750f6de0ca9f8dd30be7a17ff2b 100644
--- a/math/logf.c
+++ b/math/logf.c
@@ -1,8 +1,8 @@
/*
* Single-precision log function.
*
- * Copyright (c) 2017-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * Copyright (c) 2017-2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
*/
#include <math.h>
@@ -57,7 +57,7 @@ logf (float x)
tmp = ix - OFF;
i = (tmp >> (23 - LOGF_TABLE_BITS)) % N;
k = (int32_t) tmp >> 23; /* arithmetic shift */
- iz = ix - (tmp & 0xff800000);
+ iz = ix - (tmp & 0x1ff << 23);
invc = T[i].invc;
logc = T[i].logc;
z = (double_t) asfloat (iz);
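Both spellings of the mask in this hunk are the same value: 0x1ff << 23 and 0xff800000 each select the sign and exponent bits of a binary32 word (and << binds tighter than &, so the new form needs no extra parentheses). A compile-time check:

#include <assert.h>

static_assert ((0x1ffu << 23) == 0xff800000u,
               "sign+exponent mask of a binary32 value");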
diff --git a/math/logf_data.c b/math/logf_data.c
index 04247684755fdf65d4a834920f30dbb4fe72d89b..e8973ce4fedcbffc2d587bf73fd2afa3917331ca 100644
--- a/math/logf_data.c
+++ b/math/logf_data.c
@@ -2,7 +2,7 @@
* Data definition for logf.
*
* Copyright (c) 2017-2019, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * SPDX-License-Identifier: MIT
*/
#include "math_config.h"
diff --git a/math/math_config.h b/math/math_config.h
index 394aaebc48ac8a94e4ab15b23326a2b3de4e337d..e85104337048abdfb1f51302fe7b3d33ead2b06a 100644
--- a/math/math_config.h
+++ b/math/math_config.h
@@ -1,8 +1,8 @@
/*
* Configuration for math routines.
*
- * Copyright (c) 2017-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * Copyright (c) 2017-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
*/
#ifndef _MATH_CONFIG_H
@@ -92,17 +92,6 @@
# define unlikely(x) (x)
#endif
-/* Return ptr but hide its value from the compiler so accesses through it
- cannot be optimized based on the contents. */
-#define ptr_barrier(ptr) \
- ({ \
- __typeof (ptr) __ptr = (ptr); \
- __asm("" : "+r"(__ptr)); \
- __ptr; \
- })
-
-/* Symbol renames to avoid libc conflicts. */
-
#if HAVE_FAST_ROUND
/* When set, the roundtoint and converttoint functions are provided with
the semantics documented below. */
@@ -392,22 +381,15 @@ extern const struct powf_log2_data
#define EXP_USE_TOINT_NARROW 0
#define EXP2_POLY_ORDER 5
#define EXP2_POLY_WIDE 0
-/* Wider exp10 polynomial necessary for good precision in non-nearest rounding
- and !TOINT_INTRINSICS. */
-#define EXP10_POLY_WIDE 0
extern const struct exp_data
{
double invln2N;
- double invlog10_2N;
double shift;
double negln2hiN;
double negln2loN;
- double neglog10_2hiN;
- double neglog10_2loN;
double poly[4]; /* Last four coefficients. */
double exp2_shift;
double exp2_poly[EXP2_POLY_ORDER];
- double exp10_poly[5];
uint64_t tab[2*(1 << EXP_TABLE_BITS)];
} __exp_data HIDDEN;
@@ -477,16 +459,4 @@ extern const struct erf_data
double erfc_poly_F[ERFC_POLY_F_NCOEFFS];
} __erf_data HIDDEN;
-#define V_EXP_TABLE_BITS 7
-extern const uint64_t __v_exp_data[1 << V_EXP_TABLE_BITS] HIDDEN;
-
-#define V_LOG_TABLE_BITS 7
-extern const struct v_log_data
-{
- struct
- {
- double invc, logc;
- } table[1 << V_LOG_TABLE_BITS];
-} __v_log_data HIDDEN;
-
#endif
diff --git a/math/math_err.c b/math/math_err.c
index cfe072809cf43c2dcd700798469af446e576affa..1bf9538a1ab1d43ee26a20b8a57d2c129685fcd7 100644
--- a/math/math_err.c
+++ b/math/math_err.c
@@ -2,7 +2,7 @@
* Double-precision math error handling.
*
* Copyright (c) 2018, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * SPDX-License-Identifier: MIT
*/
#include "math_config.h"
diff --git a/math/math_errf.c b/math/math_errf.c
index 4233918b1eaeef1e597d82e3242ce302d34e572c..d5350b819ab1aa4c37f61616e2d54b77027520fd 100644
--- a/math/math_errf.c
+++ b/math/math_errf.c
@@ -2,7 +2,7 @@
* Single-precision math error handling.
*
* Copyright (c) 2017-2020, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * SPDX-License-Identifier: MIT
*/
#include "math_config.h"
diff --git a/math/pow.c b/math/pow.c
index af719fe5ab105861f410eaaa0350692bfaa49346..86842c6abacd962b4df3f536229c977b9d167775 100644
--- a/math/pow.c
+++ b/math/pow.c
@@ -2,7 +2,7 @@
* Double-precision x^y function.
*
* Copyright (c) 2018-2020, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * SPDX-License-Identifier: MIT
*/
#include <math.h>
diff --git a/math/pow_log_data.c b/math/pow_log_data.c
index 2a4c250d85c3b7715e84c513ed1e6c9daa628ce2..45569c5cc0645171b2e88db7dacc186540f8614b 100644
--- a/math/pow_log_data.c
+++ b/math/pow_log_data.c
@@ -2,7 +2,7 @@
* Data for the log part of pow.
*
* Copyright (c) 2018, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * SPDX-License-Identifier: MIT
*/
#include "math_config.h"
diff --git a/math/powf.c b/math/powf.c
index 05c80bb2eb670e032ec6ce4bc504ef0577d45ea8..6ba45d3852a50b1ae3decb93e291a6d285692e5e 100644
--- a/math/powf.c
+++ b/math/powf.c
@@ -2,7 +2,7 @@
* Single-precision pow function.
*
* Copyright (c) 2017-2019, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * SPDX-License-Identifier: MIT
*/
#include <math.h>
diff --git a/math/powf_log2_data.c b/math/powf_log2_data.c
index 243836a549fdb7d8daf14488b2403eea074f4594..97e0d98cdbab6ffa9358a9670acd5c1255c02799 100644
--- a/math/powf_log2_data.c
+++ b/math/powf_log2_data.c
@@ -2,7 +2,7 @@
* Data definition for powf.
*
* Copyright (c) 2017-2019, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * SPDX-License-Identifier: MIT
*/
#include "math_config.h"
diff --git a/math/s_cos.c b/math/s_cos.c
new file mode 100644
index 0000000000000000000000000000000000000000..53a95b0adfde452cdfd9adb3fd315f314d080118
--- /dev/null
+++ b/math/s_cos.c
@@ -0,0 +1,6 @@
+/*
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+#define SCALAR 1
+#include "v_cos.c"
diff --git a/math/s_cosf.c b/math/s_cosf.c
new file mode 100644
index 0000000000000000000000000000000000000000..914c02eba6516e924785351f166161a608520c30
--- /dev/null
+++ b/math/s_cosf.c
@@ -0,0 +1,6 @@
+/*
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+#define SCALAR 1
+#include "v_cosf.c"
diff --git a/math/s_exp.c b/math/s_exp.c
new file mode 100644
index 0000000000000000000000000000000000000000..ac7246b2c100d474250533eae917b79e179ae13c
--- /dev/null
+++ b/math/s_exp.c
@@ -0,0 +1,6 @@
+/*
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+#define SCALAR 1
+#include "v_exp.c"
diff --git a/math/s_exp2f.c b/math/s_exp2f.c
new file mode 100644
index 0000000000000000000000000000000000000000..df7dfd680ff40d4a15c8fdff9dec438a9978ddd1
--- /dev/null
+++ b/math/s_exp2f.c
@@ -0,0 +1,6 @@
+/*
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+#define SCALAR 1
+#include "v_exp2f.c"
diff --git a/math/s_exp2f_1u.c b/math/s_exp2f_1u.c
new file mode 100644
index 0000000000000000000000000000000000000000..5e3852b41d83710fe91f1dffcd97662a5bfe6d01
--- /dev/null
+++ b/math/s_exp2f_1u.c
@@ -0,0 +1,6 @@
+/*
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+#define SCALAR 1
+#include "v_exp2f_1u.c"
diff --git a/math/s_expf.c b/math/s_expf.c
new file mode 100644
index 0000000000000000000000000000000000000000..3492c460733d7a128deb55b4bb6db4eaa7092db4
--- /dev/null
+++ b/math/s_expf.c
@@ -0,0 +1,6 @@
+/*
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+#define SCALAR 1
+#include "v_expf.c"
diff --git a/math/s_expf_1u.c b/math/s_expf_1u.c
new file mode 100644
index 0000000000000000000000000000000000000000..eb7bbcba5566a177dad04d70407cbeb4c99b3aee
--- /dev/null
+++ b/math/s_expf_1u.c
@@ -0,0 +1,6 @@
+/*
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+#define SCALAR 1
+#include "v_expf_1u.c"
diff --git a/math/s_log.c b/math/s_log.c
new file mode 100644
index 0000000000000000000000000000000000000000..23289cf948ecd9503653a7719bfcec1daf238289
--- /dev/null
+++ b/math/s_log.c
@@ -0,0 +1,6 @@
+/*
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+#define SCALAR 1
+#include "v_log.c"
diff --git a/math/s_logf.c b/math/s_logf.c
new file mode 100644
index 0000000000000000000000000000000000000000..9399350fc1ee501f7e855ef1bf4ad53fd1c2d374
--- /dev/null
+++ b/math/s_logf.c
@@ -0,0 +1,6 @@
+/*
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+#define SCALAR 1
+#include "v_logf.c"
diff --git a/math/s_pow.c b/math/s_pow.c
new file mode 100644
index 0000000000000000000000000000000000000000..2e34c9f896d6d920937d12befad1fc9e4e0a1596
--- /dev/null
+++ b/math/s_pow.c
@@ -0,0 +1,6 @@
+/*
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+#define SCALAR 1
+#include "v_pow.c"
diff --git a/math/s_powf.c b/math/s_powf.c
new file mode 100644
index 0000000000000000000000000000000000000000..6d91a4a72b3733ba435d3605ec4d5a880f33ce90
--- /dev/null
+++ b/math/s_powf.c
@@ -0,0 +1,6 @@
+/*
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+#define SCALAR 1
+#include "v_powf.c"
diff --git a/math/s_sin.c b/math/s_sin.c
new file mode 100644
index 0000000000000000000000000000000000000000..06982c2018c675c1b8eac362d7c17b07553da760
--- /dev/null
+++ b/math/s_sin.c
@@ -0,0 +1,6 @@
+/*
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+#define SCALAR 1
+#include "v_sin.c"
diff --git a/math/s_sinf.c b/math/s_sinf.c
new file mode 100644
index 0000000000000000000000000000000000000000..68ca90853736f260b4b7f345928c1b7ee893f24c
--- /dev/null
+++ b/math/s_sinf.c
@@ -0,0 +1,6 @@
+/*
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+#define SCALAR 1
+#include "v_sinf.c"
diff --git a/math/sincosf.c b/math/sincosf.c
index 446f21d60faf3a5b3203ac6abf4df89a77907ed6..9746f1c22e6c2b30a2003e649fcfd40ebd8bcc7c 100644
--- a/math/sincosf.c
+++ b/math/sincosf.c
@@ -1,8 +1,8 @@
/*
* Single-precision sin/cos function.
*
- * Copyright (c) 2018-2021, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * Copyright (c) 2018-2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
*/
#include <math.h>
@@ -22,7 +22,7 @@ sincosf (float y, float *sinp, float *cosp)
int n;
const sincos_t *p = &__sincosf_table[0];
- if (abstop12 (y) < abstop12 (pio4f))
+ if (abstop12 (y) < abstop12 (pio4))
{
double x2 = x * x;
diff --git a/math/sincosf.h b/math/sincosf.h
index ec23ed7aeb2615e97ca26860c12452d548179ba4..1e80fc9ba8e19cab265fc98ec325c9b3f17a998d 100644
--- a/math/sincosf.h
+++ b/math/sincosf.h
@@ -1,8 +1,8 @@
/*
* Header for sinf, cosf and sincosf.
*
- * Copyright (c) 2018-2021, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * Copyright (c) 2018, Arm Limited.
+ * SPDX-License-Identifier: MIT
*/
#include <stdint.h>
@@ -12,7 +12,7 @@
/* 2PI * 2^-64. */
static const double pi63 = 0x1.921FB54442D18p-62;
/* PI / 4. */
-static const float pio4f = 0x1.921FB6p-1f;
+static const double pio4 = 0x1.921FB54442D18p-1;
/* The constants and polynomials for sine and cosine. */
typedef struct
diff --git a/math/sincosf_data.c b/math/sincosf_data.c
index 22525290ab087a0f27e60f0c36731a5227da4baa..ab4ac4710feff2468cf9e55b04d4ad22dbc75233 100644
--- a/math/sincosf_data.c
+++ b/math/sincosf_data.c
@@ -2,7 +2,7 @@
* Data definition for sinf, cosf and sincosf.
*
* Copyright (c) 2018-2019, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * SPDX-License-Identifier: MIT
*/
#include <stdint.h>
diff --git a/math/sinf.c b/math/sinf.c
index 8dd8ae458794c51cd24a4e7623d16ccf49b0bcd9..ddbc1daf74a9df1d90dad824f3c30d0460aafcc2 100644
--- a/math/sinf.c
+++ b/math/sinf.c
@@ -1,8 +1,8 @@
/*
* Single-precision sin function.
*
- * Copyright (c) 2018-2021, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * Copyright (c) 2018-2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
*/
#include <math.h>
@@ -21,7 +21,7 @@ sinf (float y)
int n;
const sincos_t *p = &__sincosf_table[0];
- if (abstop12 (y) < abstop12 (pio4f))
+ if (abstop12 (y) < abstop12 (pio4))
{
s = x * x;
diff --git a/math/test/mathbench.c b/math/test/mathbench.c
index b2711e5a763ab4c4b13dfe23ce6abebd8d18d4da..0c17826e52961b3abd86b1e53ab3ec4a74d7ed8e 100644
--- a/math/test/mathbench.c
+++ b/math/test/mathbench.c
@@ -1,8 +1,8 @@
/*
* Microbenchmark for math functions.
*
- * Copyright (c) 2018-2022, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * Copyright (c) 2018-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
*/
#undef _GNU_SOURCE
@@ -15,6 +15,11 @@
#include <math.h>
#include "mathlib.h"
+#ifndef WANT_VMATH
+/* Enable the build of vector math code. */
+# define WANT_VMATH 1
+#endif
+
/* Number of measurements, best result is reported. */
#define MEASURE 60
/* Array size. */
@@ -29,9 +34,8 @@ static float Af[N];
static long measurecount = MEASURE;
static long itercount = ITER;
-#ifdef __vpcs
-#include <arm_neon.h>
-typedef float64x2_t v_double;
+#if __aarch64__ && WANT_VMATH
+typedef __f64x2_t v_double;
#define v_double_len() 2
@@ -47,7 +51,7 @@ v_double_dup (double x)
return (v_double){x, x};
}
-typedef float32x4_t v_float;
+typedef __f32x4_t v_float;
#define v_float_len() 4
@@ -72,91 +76,141 @@ typedef float v_float;
#define v_float_len(x) 1
#define v_float_load(x) (x)[0]
#define v_float_dup(x) (x)
-
#endif
-#if WANT_SVE_MATH
-#include <arm_sve.h>
-typedef svbool_t sv_bool;
-typedef svfloat64_t sv_double;
+static double
+dummy (double x)
+{
+ return x;
+}
-#define sv_double_len() svcntd()
+static float
+dummyf (float x)
+{
+ return x;
+}
-static inline sv_double
-sv_double_load (const double *p)
+#if WANT_VMATH
+#if __aarch64__
+static v_double
+__v_dummy (v_double x)
{
- svbool_t pg = svptrue_b64();
- return svld1(pg, p);
+ return x;
}
-static inline sv_double
-sv_double_dup (double x)
+static v_float
+__v_dummyf (v_float x)
{
- return svdup_n_f64(x);
+ return x;
}
-typedef svfloat32_t sv_float;
+#ifdef __vpcs
+__vpcs static v_double
+__vn_dummy (v_double x)
+{
+ return x;
+}
-#define sv_float_len() svcntw()
+__vpcs static v_float
+__vn_dummyf (v_float x)
+{
+ return x;
+}
-static inline sv_float
-sv_float_load (const float *p)
+__vpcs static v_float
+xy__vn_powf (v_float x)
{
- svbool_t pg = svptrue_b32();
- return svld1(pg, p);
+ return __vn_powf (x, x);
}
-static inline sv_float
-sv_float_dup (float x)
+__vpcs static v_float
+xy_Z_powf (v_float x)
{
- return svdup_n_f32(x);
+ return _ZGVnN4vv_powf (x, x);
+}
+
+__vpcs static v_double
+xy__vn_pow (v_double x)
+{
+ return __vn_pow (x, x);
+}
+
+__vpcs static v_double
+xy_Z_pow (v_double x)
+{
+ return _ZGVnN2vv_pow (x, x);
}
-#else
-/* dummy definitions to make things compile. */
-#define sv_double_len(x) 1
-#define sv_float_len(x) 1
#endif
-static double
-dummy (double x)
+static v_float
+xy__v_powf (v_float x)
{
- return x;
+ return __v_powf (x, x);
}
-static float
-dummyf (float x)
+static v_double
+xy__v_pow (v_double x)
{
- return x;
+ return __v_pow (x, x);
}
-#ifdef __vpcs
-__vpcs static v_double
-__vn_dummy (v_double x)
+#endif
+
+static float
+xy__s_powf (float x)
{
- return x;
+ return __s_powf (x, x);
}
-__vpcs static v_float
-__vn_dummyf (v_float x)
+static double
+xy__s_pow (double x)
{
- return x;
+ return __s_pow (x, x);
}
#endif
-#if WANT_SVE_MATH
-static sv_double
-__sv_dummy (sv_double x, sv_bool pg)
+
+static double
+xypow (double x)
{
- return x;
+ return pow (x, x);
}
-static sv_float
-__sv_dummyf (sv_float x, sv_bool pg)
+static float
+xypowf (float x)
{
- return x;
+ return powf (x, x);
}
-#endif
+static double
+xpow (double x)
+{
+ return pow (x, 23.4);
+}
+
+static float
+xpowf (float x)
+{
+ return powf (x, 23.4f);
+}
+
+static double
+ypow (double x)
+{
+ return pow (2.34, x);
+}
+
+static float
+ypowf (float x)
+{
+ return powf (2.34f, x);
+}
-#include "test/mathbench_wrappers.h"
+static float
+sincosf_wrap (float x)
+{
+ float s, c;
+ sincosf (x, &s, &c);
+ return s + c;
+}
static const struct fun
{
@@ -169,40 +223,127 @@ static const struct fun
{
double (*d) (double);
float (*f) (float);
+ v_double (*vd) (v_double);
+ v_float (*vf) (v_float);
#ifdef __vpcs
__vpcs v_double (*vnd) (v_double);
__vpcs v_float (*vnf) (v_float);
-#endif
-#if WANT_SVE_MATH
- sv_double (*svd) (sv_double, sv_bool);
- sv_float (*svf) (sv_float, sv_bool);
#endif
} fun;
} funtab[] = {
#define D(func, lo, hi) {#func, 'd', 0, lo, hi, {.d = func}},
#define F(func, lo, hi) {#func, 'f', 0, lo, hi, {.f = func}},
+#define VD(func, lo, hi) {#func, 'd', 'v', lo, hi, {.vd = func}},
+#define VF(func, lo, hi) {#func, 'f', 'v', lo, hi, {.vf = func}},
#define VND(func, lo, hi) {#func, 'd', 'n', lo, hi, {.vnd = func}},
#define VNF(func, lo, hi) {#func, 'f', 'n', lo, hi, {.vnf = func}},
-#define SVD(func, lo, hi) {#func, 'd', 's', lo, hi, {.svd = func}},
-#define SVF(func, lo, hi) {#func, 'f', 's', lo, hi, {.svf = func}},
D (dummy, 1.0, 2.0)
+D (exp, -9.9, 9.9)
+D (exp, 0.5, 1.0)
+D (exp2, -9.9, 9.9)
+D (log, 0.01, 11.1)
+D (log, 0.999, 1.001)
+D (log2, 0.01, 11.1)
+D (log2, 0.999, 1.001)
+{"pow", 'd', 0, 0.01, 11.1, {.d = xypow}},
+D (xpow, 0.01, 11.1)
+D (ypow, -9.9, 9.9)
+D (erf, -6.0, 6.0)
+
F (dummyf, 1.0, 2.0)
+F (expf, -9.9, 9.9)
+F (exp2f, -9.9, 9.9)
+F (logf, 0.01, 11.1)
+F (log2f, 0.01, 11.1)
+{"powf", 'f', 0, 0.01, 11.1, {.f = xypowf}},
+F (xpowf, 0.01, 11.1)
+F (ypowf, -9.9, 9.9)
+{"sincosf", 'f', 0, 0.1, 0.7, {.f = sincosf_wrap}},
+{"sincosf", 'f', 0, 0.8, 3.1, {.f = sincosf_wrap}},
+{"sincosf", 'f', 0, -3.1, 3.1, {.f = sincosf_wrap}},
+{"sincosf", 'f', 0, 3.3, 33.3, {.f = sincosf_wrap}},
+{"sincosf", 'f', 0, 100, 1000, {.f = sincosf_wrap}},
+{"sincosf", 'f', 0, 1e6, 1e32, {.f = sincosf_wrap}},
+F (sinf, 0.1, 0.7)
+F (sinf, 0.8, 3.1)
+F (sinf, -3.1, 3.1)
+F (sinf, 3.3, 33.3)
+F (sinf, 100, 1000)
+F (sinf, 1e6, 1e32)
+F (cosf, 0.1, 0.7)
+F (cosf, 0.8, 3.1)
+F (cosf, -3.1, 3.1)
+F (cosf, 3.3, 33.3)
+F (cosf, 100, 1000)
+F (cosf, 1e6, 1e32)
+F (erff, -4.0, 4.0)
+#if WANT_VMATH
+D (__s_sin, -3.1, 3.1)
+D (__s_cos, -3.1, 3.1)
+D (__s_exp, -9.9, 9.9)
+D (__s_log, 0.01, 11.1)
+{"__s_pow", 'd', 0, 0.01, 11.1, {.d = xy__s_pow}},
+F (__s_expf, -9.9, 9.9)
+F (__s_expf_1u, -9.9, 9.9)
+F (__s_exp2f, -9.9, 9.9)
+F (__s_exp2f_1u, -9.9, 9.9)
+F (__s_logf, 0.01, 11.1)
+{"__s_powf", 'f', 0, 0.01, 11.1, {.f = xy__s_powf}},
+F (__s_sinf, -3.1, 3.1)
+F (__s_cosf, -3.1, 3.1)
+#if __aarch64__
+VD (__v_dummy, 1.0, 2.0)
+VD (__v_sin, -3.1, 3.1)
+VD (__v_cos, -3.1, 3.1)
+VD (__v_exp, -9.9, 9.9)
+VD (__v_log, 0.01, 11.1)
+{"__v_pow", 'd', 'v', 0.01, 11.1, {.vd = xy__v_pow}},
+VF (__v_dummyf, 1.0, 2.0)
+VF (__v_expf, -9.9, 9.9)
+VF (__v_expf_1u, -9.9, 9.9)
+VF (__v_exp2f, -9.9, 9.9)
+VF (__v_exp2f_1u, -9.9, 9.9)
+VF (__v_logf, 0.01, 11.1)
+{"__v_powf", 'f', 'v', 0.01, 11.1, {.vf = xy__v_powf}},
+VF (__v_sinf, -3.1, 3.1)
+VF (__v_cosf, -3.1, 3.1)
#ifdef __vpcs
VND (__vn_dummy, 1.0, 2.0)
+VND (__vn_exp, -9.9, 9.9)
+VND (_ZGVnN2v_exp, -9.9, 9.9)
+VND (__vn_log, 0.01, 11.1)
+VND (_ZGVnN2v_log, 0.01, 11.1)
+{"__vn_pow", 'd', 'n', 0.01, 11.1, {.vnd = xy__vn_pow}},
+{"_ZGVnN2vv_pow", 'd', 'n', 0.01, 11.1, {.vnd = xy_Z_pow}},
+VND (__vn_sin, -3.1, 3.1)
+VND (_ZGVnN2v_sin, -3.1, 3.1)
+VND (__vn_cos, -3.1, 3.1)
+VND (_ZGVnN2v_cos, -3.1, 3.1)
VNF (__vn_dummyf, 1.0, 2.0)
+VNF (__vn_expf, -9.9, 9.9)
+VNF (_ZGVnN4v_expf, -9.9, 9.9)
+VNF (__vn_expf_1u, -9.9, 9.9)
+VNF (__vn_exp2f, -9.9, 9.9)
+VNF (_ZGVnN4v_exp2f, -9.9, 9.9)
+VNF (__vn_exp2f_1u, -9.9, 9.9)
+VNF (__vn_logf, 0.01, 11.1)
+VNF (_ZGVnN4v_logf, 0.01, 11.1)
+{"__vn_powf", 'f', 'n', 0.01, 11.1, {.vnf = xy__vn_powf}},
+{"_ZGVnN4vv_powf", 'f', 'n', 0.01, 11.1, {.vnf = xy_Z_powf}},
+VNF (__vn_sinf, -3.1, 3.1)
+VNF (_ZGVnN4v_sinf, -3.1, 3.1)
+VNF (__vn_cosf, -3.1, 3.1)
+VNF (_ZGVnN4v_cosf, -3.1, 3.1)
+#endif
#endif
-#if WANT_SVE_MATH
-SVD (__sv_dummy, 1.0, 2.0)
-SVF (__sv_dummyf, 1.0, 2.0)
#endif
-#include "test/mathbench_funcs.h"
{0},
#undef F
#undef D
+#undef VF
+#undef VD
#undef VNF
#undef VND
-#undef SVF
-#undef SVD
};
static void
@@ -301,75 +442,69 @@ runf_latency (float f (float))
prev = f (Af[i] + prev * z);
}
-#ifdef __vpcs
static void
-run_vn_thruput (__vpcs v_double f (v_double))
+run_v_thruput (v_double f (v_double))
{
for (int i = 0; i < N; i += v_double_len ())
f (v_double_load (A+i));
}
static void
-runf_vn_thruput (__vpcs v_float f (v_float))
+runf_v_thruput (v_float f (v_float))
{
for (int i = 0; i < N; i += v_float_len ())
f (v_float_load (Af+i));
}
static void
-run_vn_latency (__vpcs v_double f (v_double))
+run_v_latency (v_double f (v_double))
{
- volatile uint64x2_t vsel = (uint64x2_t) { 0, 0 };
- uint64x2_t sel = vsel;
- v_double prev = v_double_dup (0);
+ v_double z = v_double_dup (zero);
+ v_double prev = z;
for (int i = 0; i < N; i += v_double_len ())
- prev = f (vbslq_f64 (sel, prev, v_double_load (A+i)));
+ prev = f (v_double_load (A+i) + prev * z);
}
static void
-runf_vn_latency (__vpcs v_float f (v_float))
+runf_v_latency (v_float f (v_float))
{
- volatile uint32x4_t vsel = (uint32x4_t) { 0, 0, 0, 0 };
- uint32x4_t sel = vsel;
- v_float prev = v_float_dup (0);
+ v_float z = v_float_dup (zero);
+ v_float prev = z;
for (int i = 0; i < N; i += v_float_len ())
- prev = f (vbslq_f32 (sel, prev, v_float_load (Af+i)));
+ prev = f (v_float_load (Af+i) + prev * z);
}
-#endif
-#if WANT_SVE_MATH
+#ifdef __vpcs
static void
-run_sv_thruput (sv_double f (sv_double, sv_bool))
+run_vn_thruput (__vpcs v_double f (v_double))
{
- for (int i = 0; i < N; i += sv_double_len ())
- f (sv_double_load (A+i), svptrue_b64 ());
+ for (int i = 0; i < N; i += v_double_len ())
+ f (v_double_load (A+i));
}
static void
-runf_sv_thruput (sv_float f (sv_float, sv_bool))
+runf_vn_thruput (__vpcs v_float f (v_float))
{
- for (int i = 0; i < N; i += sv_float_len ())
- f (sv_float_load (Af+i), svptrue_b32 ());
+ for (int i = 0; i < N; i += v_float_len ())
+ f (v_float_load (Af+i));
}
static void
-run_sv_latency (sv_double f (sv_double, sv_bool))
+run_vn_latency (__vpcs v_double f (v_double))
{
- volatile sv_bool vsel = svptrue_b64 ();
- sv_bool sel = vsel;
- sv_double prev = sv_double_dup (0);
- for (int i = 0; i < N; i += sv_double_len ())
- prev = f (svsel_f64 (sel, sv_double_load (A+i), prev), svptrue_b64 ());
+ v_double z = v_double_dup (zero);
+ v_double prev = z;
+ for (int i = 0; i < N; i += v_double_len ())
+ prev = f (v_double_load (A+i) + prev * z);
}
static void
-runf_sv_latency (sv_float f (sv_float, sv_bool))
+runf_vn_latency (__vpcs v_float f (v_float))
{
- volatile sv_bool vsel = svptrue_b32 ();
- sv_bool sel = vsel;
- sv_float prev = sv_float_dup (0);
- for (int i = 0; i < N; i += sv_float_len ())
- prev = f (svsel_f32 (sel, sv_float_load (Af+i), prev), svptrue_b32 ());
+ v_float z = v_float_dup (zero);
+ v_float prev = z;
+ for (int i = 0; i < N; i += v_float_len ())
+ prev = f (v_float_load (Af+i) + prev * z);
}
#endif
@@ -404,10 +539,10 @@ bench1 (const struct fun *f, int type, double lo, double hi)
const char *s = type == 't' ? "rthruput" : "latency";
int vlen = 1;
- if (f->vec == 'n')
- vlen = f->prec == 'd' ? v_double_len() : v_float_len();
- else if (f->vec == 's')
- vlen = f->prec == 'd' ? sv_double_len() : sv_float_len();
+ if (f->vec && f->prec == 'd')
+ vlen = v_double_len();
+ else if (f->vec && f->prec == 'f')
+ vlen = v_float_len();
if (f->prec == 'd' && type == 't' && f->vec == 0)
TIMEIT (run_thruput, f->fun.d);
@@ -417,6 +552,14 @@ bench1 (const struct fun *f, int type, double lo, double hi)
TIMEIT (runf_thruput, f->fun.f);
else if (f->prec == 'f' && type == 'l' && f->vec == 0)
TIMEIT (runf_latency, f->fun.f);
+ else if (f->prec == 'd' && type == 't' && f->vec == 'v')
+ TIMEIT (run_v_thruput, f->fun.vd);
+ else if (f->prec == 'd' && type == 'l' && f->vec == 'v')
+ TIMEIT (run_v_latency, f->fun.vd);
+ else if (f->prec == 'f' && type == 't' && f->vec == 'v')
+ TIMEIT (runf_v_thruput, f->fun.vf);
+ else if (f->prec == 'f' && type == 'l' && f->vec == 'v')
+ TIMEIT (runf_v_latency, f->fun.vf);
#ifdef __vpcs
else if (f->prec == 'd' && type == 't' && f->vec == 'n')
TIMEIT (run_vn_thruput, f->fun.vnd);
@@ -427,32 +570,20 @@ bench1 (const struct fun *f, int type, double lo, double hi)
else if (f->prec == 'f' && type == 'l' && f->vec == 'n')
TIMEIT (runf_vn_latency, f->fun.vnf);
#endif
-#if WANT_SVE_MATH
- else if (f->prec == 'd' && type == 't' && f->vec == 's')
- TIMEIT (run_sv_thruput, f->fun.svd);
- else if (f->prec == 'd' && type == 'l' && f->vec == 's')
- TIMEIT (run_sv_latency, f->fun.svd);
- else if (f->prec == 'f' && type == 't' && f->vec == 's')
- TIMEIT (runf_sv_thruput, f->fun.svf);
- else if (f->prec == 'f' && type == 'l' && f->vec == 's')
- TIMEIT (runf_sv_latency, f->fun.svf);
-#endif
if (type == 't')
{
ns100 = (100 * dt + itercount * N / 2) / (itercount * N);
- printf ("%9s %8s: %4u.%02u ns/elem %10llu ns in [%g %g] vlen %d\n",
- f->name, s,
+ printf ("%9s %8s: %4u.%02u ns/elem %10llu ns in [%g %g]\n", f->name, s,
(unsigned) (ns100 / 100), (unsigned) (ns100 % 100),
- (unsigned long long) dt, lo, hi, vlen);
+ (unsigned long long) dt, lo, hi);
}
else if (type == 'l')
{
ns100 = (100 * dt + itercount * N / vlen / 2) / (itercount * N / vlen);
- printf ("%9s %8s: %4u.%02u ns/call %10llu ns in [%g %g] vlen %d\n",
- f->name, s,
+ printf ("%9s %8s: %4u.%02u ns/call %10llu ns in [%g %g]\n", f->name, s,
(unsigned) (ns100 / 100), (unsigned) (ns100 % 100),
- (unsigned long long) dt, lo, hi, vlen);
+ (unsigned long long) dt, lo, hi);
}
fflush (stdout);
}
diff --git a/math/test/mathbench_funcs.h b/math/test/mathbench_funcs.h
deleted file mode 100644
index 84c4e68650acbb1ded2e43dee5410b7c3e7224c4..0000000000000000000000000000000000000000
--- a/math/test/mathbench_funcs.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Function entries for mathbench.
- *
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-/* clang-format off */
-D (exp, -9.9, 9.9)
-D (exp, 0.5, 1.0)
-D (exp10, -9.9, 9.9)
-D (exp2, -9.9, 9.9)
-D (log, 0.01, 11.1)
-D (log, 0.999, 1.001)
-D (log2, 0.01, 11.1)
-D (log2, 0.999, 1.001)
-{"pow", 'd', 0, 0.01, 11.1, {.d = xypow}},
-D (xpow, 0.01, 11.1)
-D (ypow, -9.9, 9.9)
-D (erf, -6.0, 6.0)
-
-F (expf, -9.9, 9.9)
-F (exp2f, -9.9, 9.9)
-F (logf, 0.01, 11.1)
-F (log2f, 0.01, 11.1)
-{"powf", 'f', 0, 0.01, 11.1, {.f = xypowf}},
-F (xpowf, 0.01, 11.1)
-F (ypowf, -9.9, 9.9)
-{"sincosf", 'f', 0, 0.1, 0.7, {.f = sincosf_wrap}},
-{"sincosf", 'f', 0, 0.8, 3.1, {.f = sincosf_wrap}},
-{"sincosf", 'f', 0, -3.1, 3.1, {.f = sincosf_wrap}},
-{"sincosf", 'f', 0, 3.3, 33.3, {.f = sincosf_wrap}},
-{"sincosf", 'f', 0, 100, 1000, {.f = sincosf_wrap}},
-{"sincosf", 'f', 0, 1e6, 1e32, {.f = sincosf_wrap}},
-F (sinf, 0.1, 0.7)
-F (sinf, 0.8, 3.1)
-F (sinf, -3.1, 3.1)
-F (sinf, 3.3, 33.3)
-F (sinf, 100, 1000)
-F (sinf, 1e6, 1e32)
-F (cosf, 0.1, 0.7)
-F (cosf, 0.8, 3.1)
-F (cosf, -3.1, 3.1)
-F (cosf, 3.3, 33.3)
-F (cosf, 100, 1000)
-F (cosf, 1e6, 1e32)
-F (erff, -4.0, 4.0)
-#ifdef __vpcs
-VND (_ZGVnN2v_exp, -9.9, 9.9)
-VND (_ZGVnN2v_log, 0.01, 11.1)
-{"_ZGVnN2vv_pow", 'd', 'n', 0.01, 11.1, {.vnd = xy_Z_pow}},
-VND (_ZGVnN2v_sin, -3.1, 3.1)
-VND (_ZGVnN2v_cos, -3.1, 3.1)
-VNF (_ZGVnN4v_expf, -9.9, 9.9)
-VNF (_ZGVnN4v_expf_1u, -9.9, 9.9)
-VNF (_ZGVnN4v_exp2f, -9.9, 9.9)
-VNF (_ZGVnN4v_exp2f_1u, -9.9, 9.9)
-VNF (_ZGVnN4v_logf, 0.01, 11.1)
-{"_ZGVnN4vv_powf", 'f', 'n', 0.01, 11.1, {.vnf = xy_Z_powf}},
-VNF (_ZGVnN4v_sinf, -3.1, 3.1)
-VNF (_ZGVnN4v_cosf, -3.1, 3.1)
-#endif
- /* clang-format on */
diff --git a/math/test/mathbench_wrappers.h b/math/test/mathbench_wrappers.h
deleted file mode 100644
index 062b9db56de51a741a698e13a547184461b2ca2b..0000000000000000000000000000000000000000
--- a/math/test/mathbench_wrappers.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Function wrappers for mathbench.
- *
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#ifdef __vpcs
-
-__vpcs static v_float
-xy_Z_powf (v_float x)
-{
- return _ZGVnN4vv_powf (x, x);
-}
-
-__vpcs static v_double
-xy_Z_pow (v_double x)
-{
- return _ZGVnN2vv_pow (x, x);
-}
-
-#endif
-
-static double
-xypow (double x)
-{
- return pow (x, x);
-}
-
-static float
-xypowf (float x)
-{
- return powf (x, x);
-}
-
-static double
-xpow (double x)
-{
- return pow (x, 23.4);
-}
-
-static float
-xpowf (float x)
-{
- return powf (x, 23.4f);
-}
-
-static double
-ypow (double x)
-{
- return pow (2.34, x);
-}
-
-static float
-ypowf (float x)
-{
- return powf (2.34f, x);
-}
-
-static float
-sincosf_wrap (float x)
-{
- float s, c;
- sincosf (x, &s, &c);
- return s + c;
-}
diff --git a/math/test/mathtest.c b/math/test/mathtest.c
index cedccfd39455930bf51ffdbb638b1be9935b4d80..310896738e478481a9f91ff878957a1f86accc2e 100644
--- a/math/test/mathtest.c
+++ b/math/test/mathtest.c
@@ -1,8 +1,8 @@
/*
* mathtest.c - test rig for mathlib
*
- * Copyright (c) 1998-2022, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * Copyright (c) 1998-2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
*/
#include
@@ -196,11 +196,9 @@ int is_complex_rettype(int rettype) {
#define TFUNCARM(arg,ret,name,tolerance) { t_func, arg, ret, (void*)& ARM_PREFIX(name), m_none, tolerance, #name }
#define MFUNC(arg,ret,name,tolerance) { t_macro, arg, ret, NULL, m_##name, tolerance, #name }
-#ifndef PL
/* sincosf wrappers for easier testing. */
static float sincosf_sinf(float x) { float s,c; sincosf(x, &s, &c); return s; }
static float sincosf_cosf(float x) { float s,c; sincosf(x, &s, &c); return c; }
-#endif
test_func tfuncs[] = {
/* trigonometric */
@@ -220,10 +218,9 @@ test_func tfuncs[] = {
TFUNCARM(at_s,rt_s, tanf, 4*ULPUNIT),
TFUNCARM(at_s,rt_s, sinf, 3*ULPUNIT/4),
TFUNCARM(at_s,rt_s, cosf, 3*ULPUNIT/4),
-#ifndef PL
TFUNCARM(at_s,rt_s, sincosf_sinf, 3*ULPUNIT/4),
TFUNCARM(at_s,rt_s, sincosf_cosf, 3*ULPUNIT/4),
-#endif
+
/* hyperbolic */
TFUNC(at_d, rt_d, atanh, 4*ULPUNIT),
TFUNC(at_d, rt_d, asinh, 4*ULPUNIT),
@@ -254,7 +251,6 @@ test_func tfuncs[] = {
TFUNCARM(at_s,rt_s, expf, 3*ULPUNIT/4),
TFUNCARM(at_s,rt_s, exp2f, 3*ULPUNIT/4),
TFUNC(at_s,rt_s, expm1f, ULPUNIT),
- TFUNC(at_d,rt_d, exp10, ULPUNIT),
/* power */
TFUNC(at_d2,rt_d, pow, 3*ULPUNIT/4),
@@ -1022,7 +1018,6 @@ int runtest(testdetail t) {
DO_DOP(d_arg1,op1r);
DO_DOP(d_arg2,op2r);
s_arg1.i = t.op1r[0]; s_arg2.i = t.op2r[0];
- s_res.i = 0;
/*
* Detect NaNs, infinities and denormals on input, and set a
@@ -1157,25 +1152,22 @@ int runtest(testdetail t) {
tresultr[0] = t.resultr[0];
tresultr[1] = t.resultr[1];
resultr[0] = d_res.i[dmsd]; resultr[1] = d_res.i[dlsd];
- resulti[0] = resulti[1] = 0;
wres = 2;
break;
case rt_i:
tresultr[0] = t.resultr[0];
resultr[0] = intres;
- resulti[0] = 0;
wres = 1;
break;
case rt_s:
case rt_s2:
tresultr[0] = t.resultr[0];
resultr[0] = s_res.i;
- resulti[0] = 0;
wres = 1;
break;
default:
puts("unhandled rettype in runtest");
- abort ();
+ wres = 0;
}
if(t.resultc != rc_none) {
int err = 0;
diff --git a/math/test/rtest/dotest.c b/math/test/rtest/dotest.c
index 5b3e9b4f18e467c536d989292b91706af1ff4f67..6be79e1df0d1acef5a5c3861f1ab73058e10b836 100644
--- a/math/test/rtest/dotest.c
+++ b/math/test/rtest/dotest.c
@@ -2,7 +2,7 @@
* dotest.c - actually generate mathlib test cases
*
* Copyright (c) 1999-2019, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * SPDX-License-Identifier: MIT
*/
#include
diff --git a/math/test/rtest/intern.h b/math/test/rtest/intern.h
index 3ebd7ddaf85d7b37c163ea8994250ac20408b396..12a9c749e18e1127eb27922ad30d13ac3cbd4d1c 100644
--- a/math/test/rtest/intern.h
+++ b/math/test/rtest/intern.h
@@ -2,7 +2,7 @@
* intern.h
*
* Copyright (c) 1999-2019, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * SPDX-License-Identifier: MIT
*/
#ifndef mathtest_intern_h
diff --git a/math/test/rtest/main.c b/math/test/rtest/main.c
index 3d533c946f79be126fc0b427a9578effed68179a..0d8ead891320a5c5afb3d72d7b7fbd82c5f6e540 100644
--- a/math/test/rtest/main.c
+++ b/math/test/rtest/main.c
@@ -2,7 +2,7 @@
* main.c
*
* Copyright (c) 1999-2019, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * SPDX-License-Identifier: MIT
*/
#include
diff --git a/math/test/rtest/random.c b/math/test/rtest/random.c
index 1de32580b733d347d9a62cfe2521a8aca3622b87..56123966b8c48f8acbeb1501d1e56d5b1d5e3e2c 100644
--- a/math/test/rtest/random.c
+++ b/math/test/rtest/random.c
@@ -2,7 +2,7 @@
* random.c - random number generator for producing mathlib test cases
*
* Copyright (c) 1998-2019, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * SPDX-License-Identifier: MIT
*/
#include "types.h"
diff --git a/math/test/rtest/random.h b/math/test/rtest/random.h
index 0b477d72b2346b2adbcd7cc7fec20dd9ade42395..b4b22df82a3d768bdb8227f6731b0bce5d6ce843 100644
--- a/math/test/rtest/random.h
+++ b/math/test/rtest/random.h
@@ -2,7 +2,7 @@
* random.h - header for random.c
*
* Copyright (c) 2009-2019, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * SPDX-License-Identifier: MIT
*/
#include "types.h"
diff --git a/math/test/rtest/semi.c b/math/test/rtest/semi.c
index 70a7844a48d613d1726b529e1347ccaec6439c1a..c9f0daf76508194f5443bfe9fccbbfb89fe8964d 100644
--- a/math/test/rtest/semi.c
+++ b/math/test/rtest/semi.c
@@ -2,7 +2,7 @@
* semi.c: test implementations of mathlib seminumerical functions
*
* Copyright (c) 1999-2019, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * SPDX-License-Identifier: MIT
*/
#include
diff --git a/math/test/rtest/semi.h b/math/test/rtest/semi.h
index 7a1444e55d288c93f665769a4a6fea37d42d7d33..17dc4158fb51e87e465c76ca5c9192cfb0dee71b 100644
--- a/math/test/rtest/semi.h
+++ b/math/test/rtest/semi.h
@@ -2,7 +2,7 @@
* semi.h: header for semi.c
*
* Copyright (c) 1999-2019, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * SPDX-License-Identifier: MIT
*/
#ifndef test_semi_h
diff --git a/math/test/rtest/types.h b/math/test/rtest/types.h
index e15b4e06a0d4aac3a0595edf69bfe830f56d624c..53cd557fa4cf448d6d4f49dbd85cf8c514905d47 100644
--- a/math/test/rtest/types.h
+++ b/math/test/rtest/types.h
@@ -2,7 +2,7 @@
* types.h
*
* Copyright (c) 2005-2019, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * SPDX-License-Identifier: MIT
*/
#ifndef mathtest_types_h
diff --git a/math/test/rtest/wrappers.c b/math/test/rtest/wrappers.c
index 441017192ab48b8332415c666cbd6d29c87c1e08..de45ac5768d0f750c1ad48c15902a02df9a8336d 100644
--- a/math/test/rtest/wrappers.c
+++ b/math/test/rtest/wrappers.c
@@ -2,7 +2,7 @@
* wrappers.c - wrappers to modify output of MPFR/MPC test functions
*
* Copyright (c) 2014-2019, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * SPDX-License-Identifier: MIT
*/
#include
diff --git a/math/test/rtest/wrappers.h b/math/test/rtest/wrappers.h
index 0a8a58777d8aed7ae1c2e1f489b822919aa53967..7b09c85a59f114af56f6ec7dd9e0e7c00bd43721 100644
--- a/math/test/rtest/wrappers.h
+++ b/math/test/rtest/wrappers.h
@@ -2,7 +2,7 @@
* wrappers.h - wrappers to modify output of MPFR/MPC test functions
*
* Copyright (c) 2014-2019, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * SPDX-License-Identifier: MIT
*/
typedef struct {
diff --git a/math/test/runulp.sh b/math/test/runulp.sh
index e2e03e3ae76196e8f94f2cdadd3147b4f4fafdac..0190d9ab27fb104de780d9101507a85ee9ff7a2e 100755
--- a/math/test/runulp.sh
+++ b/math/test/runulp.sh
@@ -2,8 +2,8 @@
# ULP error check script.
#
-# Copyright (c) 2019-2023, Arm Limited.
-# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+# Copyright (c) 2019-2020, Arm Limited.
+# SPDX-License-Identifier: MIT
#set -x
set -eu
@@ -72,16 +72,6 @@ t pow 0x1.ffffffffffff0p-1 0x1.0000000000008p0 x 0x1p60 0x1p68 50000
t pow 0x1.ffffffffff000p-1 0x1p0 x 0x1p50 0x1p52 50000
t pow -0x1.ffffffffff000p-1 -0x1p0 x 0x1p50 0x1p52 50000
-L=0.02
-t exp10 0 0x1p-47 5000
-t exp10 -0 -0x1p-47 5000
-t exp10 0x1p-47 1 50000
-t exp10 -0x1p-47 -1 50000
-t exp10 1 0x1.34413509f79ffp8 50000
-t exp10 -1 -0x1.434e6420f4374p8 50000
-t exp10 0x1.34413509f79ffp8 inf 5000
-t exp10 -0x1.434e6420f4374p8 -inf 5000
-
L=1.0
Ldir=0.9
t erf 0 0xffff000000000000 10000
@@ -153,10 +143,15 @@ Ldir=0.5
done
# vector functions
-
Ldir=0.5
r='n'
-flags="${ULPFLAGS:--q}"
+flags="${ULPFLAGS:--q} -f"
+runs=
+check __s_exp 1 && runs=1
+runv=
+check __v_exp 1 && runv=1
+runvn=
+check __vn_exp 1 && runvn=1
range_exp='
0 0xffff000000000000 10000
@@ -182,10 +177,9 @@ range_pow='
'
range_sin='
- 0 0x1p23 500000
- -0 -0x1p23 500000
- 0x1p23 inf 10000
- -0x1p23 -inf 10000
+ 0 0xffff000000000000 10000
+ 0x1p-4 0x1p4 400000
+ -0x1p-23 0x1p23 400000
'
range_cos="$range_sin"
@@ -205,10 +199,9 @@ range_logf='
'
range_sinf='
- 0 0x1p20 500000
- -0 -0x1p20 500000
- 0x1p20 inf 10000
- -0x1p20 -inf 10000
+ 0 0xffff0000 10000
+ 0x1p-4 0x1p4 300000
+-0x1p-9 -0x1p9 300000
'
range_cosf="$range_sinf"
@@ -236,8 +229,9 @@ L_sinf=1.4
L_cosf=1.4
L_powf=2.1
-while read G F D
+while read G F R
do
+ [ "$R" = 1 ] || continue
case "$G" in \#*) continue ;; esac
eval range="\${range_$G}"
eval L="\${L_$G}"
@@ -245,35 +239,74 @@ do
do
[ -n "$X" ] || continue
case "$X" in \#*) continue ;; esac
- disable_fenv=""
- if [ -z "$WANT_SIMD_EXCEPT" ] || [ $WANT_SIMD_EXCEPT -eq 0 ]; then
- # If library was built with SIMD exceptions
- # disabled, disable fenv checking in ulp
- # tool. Otherwise, fenv checking may still be
- # disabled by adding -f to the end of the run
- # line.
- disable_fenv="-f"
- fi
- t $D $disable_fenv $F $X
+ t $F $X
done << EOF
$range
-
EOF
done << EOF
# group symbol run
-exp _ZGVnN2v_exp
-log _ZGVnN2v_log
-pow _ZGVnN2vv_pow -f
-sin _ZGVnN2v_sin -z
-cos _ZGVnN2v_cos
-expf _ZGVnN4v_expf
-expf_1u _ZGVnN4v_expf_1u -f
-exp2f _ZGVnN4v_exp2f
-exp2f_1u _ZGVnN4v_exp2f_1u -f
-logf _ZGVnN4v_logf
-sinf _ZGVnN4v_sinf -z
-cosf _ZGVnN4v_cosf
-powf _ZGVnN4vv_powf -f
+exp __s_exp $runs
+exp __v_exp $runv
+exp __vn_exp $runvn
+exp _ZGVnN2v_exp $runvn
+
+log __s_log $runs
+log __v_log $runv
+log __vn_log $runvn
+log _ZGVnN2v_log $runvn
+
+pow __s_pow $runs
+pow __v_pow $runv
+pow __vn_pow $runvn
+pow _ZGVnN2vv_pow $runvn
+
+sin __s_sin $runs
+sin __v_sin $runv
+sin __vn_sin $runvn
+sin _ZGVnN2v_sin $runvn
+
+cos __s_cos $runs
+cos __v_cos $runv
+cos __vn_cos $runvn
+cos _ZGVnN2v_cos $runvn
+
+expf __s_expf $runs
+expf __v_expf $runv
+expf __vn_expf $runvn
+expf _ZGVnN4v_expf $runvn
+
+expf_1u __s_expf_1u $runs
+expf_1u __v_expf_1u $runv
+expf_1u __vn_expf_1u $runvn
+
+exp2f __s_exp2f $runs
+exp2f __v_exp2f $runv
+exp2f __vn_exp2f $runvn
+exp2f _ZGVnN4v_exp2f $runvn
+
+exp2f_1u __s_exp2f_1u $runs
+exp2f_1u __v_exp2f_1u $runv
+exp2f_1u __vn_exp2f_1u $runvn
+
+logf __s_logf $runs
+logf __v_logf $runv
+logf __vn_logf $runvn
+logf _ZGVnN4v_logf $runvn
+
+sinf __s_sinf $runs
+sinf __v_sinf $runv
+sinf __vn_sinf $runvn
+sinf _ZGVnN4v_sinf $runvn
+
+cosf __s_cosf $runs
+cosf __v_cosf $runv
+cosf __vn_cosf $runvn
+cosf _ZGVnN4v_cosf $runvn
+
+powf __s_powf $runs
+powf __v_powf $runv
+powf __vn_powf $runvn
+powf _ZGVnN4vv_powf $runvn
EOF
[ 0 -eq $FAIL ] || {
diff --git a/math/test/testcases/directed/cosf.tst b/math/test/testcases/directed/cosf.tst
index 7ea0d45795a3647c73ea1de9f6cbf956a1ef7bb9..79160443f0990058f70bc0d03be6be545f3fd6f7 100644
--- a/math/test/testcases/directed/cosf.tst
+++ b/math/test/testcases/directed/cosf.tst
@@ -1,7 +1,7 @@
; cosf.tst - Directed test cases for SP cosine
;
; Copyright (c) 2007-2019, Arm Limited.
-; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+; SPDX-License-Identifier: MIT
func=cosf op1=7fc00001 result=7fc00001 errno=0
func=cosf op1=ffc00001 result=7fc00001 errno=0
diff --git a/math/test/testcases/directed/erf.tst b/math/test/testcases/directed/erf.tst
index 12384cef0dd98e24ac842c45ed89012d9c1b0ef6..7fa4d1868c0eb1a27920eda1485ee7be4dbe0f01 100644
--- a/math/test/testcases/directed/erf.tst
+++ b/math/test/testcases/directed/erf.tst
@@ -1,7 +1,7 @@
; erf.tst - Directed test cases for erf
;
; Copyright (c) 2007-2020, Arm Limited.
-; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+; SPDX-License-Identifier: MIT
func=erf op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
func=erf op1=fff80000.00000001 result=7ff80000.00000001 errno=0
diff --git a/math/test/testcases/directed/erff.tst b/math/test/testcases/directed/erff.tst
index 28f8fa37f5aa7db743d399ab2a6af3070b009fe8..d05b7b1119c46c21ce7d22d2d3f2cbebff6eae44 100644
--- a/math/test/testcases/directed/erff.tst
+++ b/math/test/testcases/directed/erff.tst
@@ -1,7 +1,7 @@
; erff.tst
;
; Copyright (c) 2007-2020, Arm Limited.
-; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+; SPDX-License-Identifier: MIT
func=erff op1=7fc00001 result=7fc00001 errno=0
func=erff op1=ffc00001 result=7fc00001 errno=0
diff --git a/math/test/testcases/directed/exp.tst b/math/test/testcases/directed/exp.tst
index 0bb2ef4579cc1c5313494ead5e0dc80f4a02153e..85d556cd1e00f75c3273e67d420adce2ea7849df 100644
--- a/math/test/testcases/directed/exp.tst
+++ b/math/test/testcases/directed/exp.tst
@@ -1,7 +1,7 @@
; Directed test cases for exp
;
; Copyright (c) 2018-2019, Arm Limited.
-; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+; SPDX-License-Identifier: MIT
func=exp op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
func=exp op1=fff80000.00000001 result=7ff80000.00000001 errno=0
diff --git a/math/test/testcases/directed/exp10.tst b/math/test/testcases/directed/exp10.tst
deleted file mode 100644
index 2cf4273bd1d718ef0332e78553e4c2c3c1cb5c2d..0000000000000000000000000000000000000000
--- a/math/test/testcases/directed/exp10.tst
+++ /dev/null
@@ -1,15 +0,0 @@
-; Directed test cases for exp10
-;
-; Copyright (c) 2023, Arm Limited.
-; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
-
-func=exp10 op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
-func=exp10 op1=fff80000.00000001 result=7ff80000.00000001 errno=0
-func=exp10 op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i
-func=exp10 op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i
-func=exp10 op1=7ff00000.00000000 result=7ff00000.00000000 errno=0
-func=exp10 op1=7fefffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=ox
-func=exp10 op1=fff00000.00000000 result=00000000.00000000 errno=0
-func=exp10 op1=ffefffff.ffffffff result=00000000.00000000 errno=ERANGE status=ux
-func=exp10 op1=00000000.00000000 result=3ff00000.00000000 errno=0
-func=exp10 op1=80000000.00000000 result=3ff00000.00000000 errno=0
diff --git a/math/test/testcases/directed/exp2.tst b/math/test/testcases/directed/exp2.tst
index 7069f9010c8ccf6e5407aadf2a581b12ff736fe4..fa56c9f8be4b91598121f7f376e68968d806001d 100644
--- a/math/test/testcases/directed/exp2.tst
+++ b/math/test/testcases/directed/exp2.tst
@@ -1,7 +1,7 @@
; Directed test cases for exp2
;
; Copyright (c) 2018-2019, Arm Limited.
-; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+; SPDX-License-Identifier: MIT
func=exp2 op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
func=exp2 op1=fff80000.00000001 result=7ff80000.00000001 errno=0
diff --git a/math/test/testcases/directed/exp2f.tst b/math/test/testcases/directed/exp2f.tst
index 6ca2eeab4e121e165703644bee54b5d855225886..38cfc3f78ac61dae04c0d0372110d3351e669848 100644
--- a/math/test/testcases/directed/exp2f.tst
+++ b/math/test/testcases/directed/exp2f.tst
@@ -1,7 +1,7 @@
; exp2f.tst - Directed test cases for exp2f
;
; Copyright (c) 2017-2019, Arm Limited.
-; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+; SPDX-License-Identifier: MIT
func=exp2f op1=7fc00001 result=7fc00001 errno=0
func=exp2f op1=ffc00001 result=7fc00001 errno=0
diff --git a/math/test/testcases/directed/expf.tst b/math/test/testcases/directed/expf.tst
index 89ae8fe78e6c17cd5295230c4185a8d11b98849d..ff0f671c2656a94b17f96d8f9a683a6b8436f674 100644
--- a/math/test/testcases/directed/expf.tst
+++ b/math/test/testcases/directed/expf.tst
@@ -1,7 +1,7 @@
; expf.tst - Directed test cases for expf
;
; Copyright (c) 2007-2019, Arm Limited.
-; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+; SPDX-License-Identifier: MIT
func=expf op1=7fc00001 result=7fc00001 errno=0
func=expf op1=ffc00001 result=7fc00001 errno=0
diff --git a/math/test/testcases/directed/log.tst b/math/test/testcases/directed/log.tst
index 686ea835645b9c7af857f5052efbb29c71493a20..a0aa398cbf734396be64c61612f463524d283d15 100644
--- a/math/test/testcases/directed/log.tst
+++ b/math/test/testcases/directed/log.tst
@@ -1,7 +1,7 @@
; Directed test cases for log
;
; Copyright (c) 2018-2019, Arm Limited.
-; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+; SPDX-License-Identifier: MIT
func=log op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
func=log op1=fff80000.00000001 result=7ff80000.00000001 errno=0
diff --git a/math/test/testcases/directed/log2.tst b/math/test/testcases/directed/log2.tst
index 361bddec374bb16da87dd6fa5dc10d0a2f1ff366..ff1286cbd53e8ebfba5db81b9d244a598eb9a6ac 100644
--- a/math/test/testcases/directed/log2.tst
+++ b/math/test/testcases/directed/log2.tst
@@ -1,7 +1,7 @@
; Directed test cases for log2
;
; Copyright (c) 2018-2019, Arm Limited.
-; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+; SPDX-License-Identifier: MIT
func=log2 op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
func=log2 op1=fff80000.00000001 result=7ff80000.00000001 errno=0
diff --git a/math/test/testcases/directed/log2f.tst b/math/test/testcases/directed/log2f.tst
index 5fce051cddba75e19eff4fd577a456a452554159..5832c4f08f1ecb6acf9fdbbb06f0ce75bac82f6d 100644
--- a/math/test/testcases/directed/log2f.tst
+++ b/math/test/testcases/directed/log2f.tst
@@ -1,7 +1,7 @@
; log2f.tst - Directed test cases for log2f
;
; Copyright (c) 2017-2019, Arm Limited.
-; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+; SPDX-License-Identifier: MIT
func=log2f op1=7fc00001 result=7fc00001 errno=0
func=log2f op1=ffc00001 result=7fc00001 errno=0
diff --git a/math/test/testcases/directed/logf.tst b/math/test/testcases/directed/logf.tst
index a6d1b9d5c51fa1b9b7e8c5eef465f2e4f5cfb6f5..6e68a36e0f6a29f8d0646f450ed3abf0aafca260 100644
--- a/math/test/testcases/directed/logf.tst
+++ b/math/test/testcases/directed/logf.tst
@@ -1,7 +1,7 @@
; logf.tst - Directed test cases for logf
;
; Copyright (c) 2007-2019, Arm Limited.
-; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+; SPDX-License-Identifier: MIT
func=logf op1=7fc00001 result=7fc00001 errno=0
func=logf op1=ffc00001 result=7fc00001 errno=0
diff --git a/math/test/testcases/directed/pow.tst b/math/test/testcases/directed/pow.tst
index 879d12864afe5d2c3e98e1c07095d7f58fe68b3b..19665817153d03ef84dc85fe3be375bd63d2dad5 100644
--- a/math/test/testcases/directed/pow.tst
+++ b/math/test/testcases/directed/pow.tst
@@ -1,7 +1,7 @@
; Directed test cases for pow
;
; Copyright (c) 2018-2019, Arm Limited.
-; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+; SPDX-License-Identifier: MIT
func=pow op1=00000000.00000000 op2=00000000.00000000 result=3ff00000.00000000 errno=0
func=pow op1=00000000.00000000 op2=00000000.00000001 result=00000000.00000000 errno=0
diff --git a/math/test/testcases/directed/powf.tst b/math/test/testcases/directed/powf.tst
index 46d5224008710127eb93863f5b19196cdc89693d..3fa8b110f8bcb97196dca92030271ceb376644a8 100644
--- a/math/test/testcases/directed/powf.tst
+++ b/math/test/testcases/directed/powf.tst
@@ -1,7 +1,7 @@
; powf.tst - Directed test cases for powf
;
; Copyright (c) 2007-2019, Arm Limited.
-; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+; SPDX-License-Identifier: MIT
func=powf op1=7f800001 op2=7f800001 result=7fc00001 errno=0 status=i
func=powf op1=7f800001 op2=ff800001 result=7fc00001 errno=0 status=i
diff --git a/math/test/testcases/directed/sincosf.tst b/math/test/testcases/directed/sincosf.tst
index cddb346558ea3c16ad3ca47ff7c2e3789aa31816..4b33d2291c660c034fed47522966599203bb8b6c 100644
--- a/math/test/testcases/directed/sincosf.tst
+++ b/math/test/testcases/directed/sincosf.tst
@@ -1,7 +1,7 @@
; Directed test cases for SP sincos
;
; Copyright (c) 2007-2019, Arm Limited.
-; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+; SPDX-License-Identifier: MIT
func=sincosf_sinf op1=7fc00001 result=7fc00001 errno=0
diff --git a/math/test/testcases/directed/sinf.tst b/math/test/testcases/directed/sinf.tst
index 041b13d5d6cbc5e56fb570126159a73b70d0d1ab..ded80b1598c6a3904ed8eb6baab351f493592bcc 100644
--- a/math/test/testcases/directed/sinf.tst
+++ b/math/test/testcases/directed/sinf.tst
@@ -1,7 +1,7 @@
; sinf.tst - Directed test cases for SP sine
;
; Copyright (c) 2007-2019, Arm Limited.
-; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+; SPDX-License-Identifier: MIT
func=sinf op1=7fc00001 result=7fc00001 errno=0
diff --git a/math/test/testcases/random/double.tst b/math/test/testcases/random/double.tst
index 8e885d61722a0b5e871e12af0a0cc0f4558f6d5b..c24ff80d5d95eccc799de5bd3dd0876b19ae8fb9 100644
--- a/math/test/testcases/random/double.tst
+++ b/math/test/testcases/random/double.tst
@@ -1,7 +1,7 @@
!! double.tst - Random test case specification for DP functions
!!
!! Copyright (c) 1999-2019, Arm Limited.
-!! SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+!! SPDX-License-Identifier: MIT
test exp 10000
test exp2 10000
diff --git a/math/test/testcases/random/float.tst b/math/test/testcases/random/float.tst
index ea4a5a01521484b53a8413a7edfe673c110b2bae..d02a22750abe07b9b64b63a0caf9afcb3c83d50c 100644
--- a/math/test/testcases/random/float.tst
+++ b/math/test/testcases/random/float.tst
@@ -1,7 +1,7 @@
!! single.tst - Random test case specification for SP functions
!!
!! Copyright (c) 1999-2019, Arm Limited.
-!! SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+!! SPDX-License-Identifier: MIT
test sinf 10000
test cosf 10000
diff --git a/math/test/ulp.c b/math/test/ulp.c
index 5ff29972e50ee01026e8f15af0e7c73008909bda..51479b87a0fde860e1584536fd13b8471cfca9a2 100644
--- a/math/test/ulp.c
+++ b/math/test/ulp.c
@@ -1,11 +1,10 @@
/*
* ULP error checking tool for math functions.
*
- * Copyright (c) 2019-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * Copyright (c) 2019-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
*/
-#define _GNU_SOURCE
#include <ctype.h>
#include <fenv.h>
#include <float.h>
@@ -24,6 +23,11 @@
# include <mpfr.h>
#endif
+#ifndef WANT_VMATH
+/* Enable the build of vector math code. */
+# define WANT_VMATH 1
+#endif
+
static inline uint64_t
asuint64 (double f)
{
@@ -208,61 +212,73 @@ struct conf
unsigned long long n;
double softlim;
double errlim;
- int ignore_zero_sign;
};
+/* Wrappers for sincos. */
+static float sincosf_sinf(float x) {(void)cosf(x); return sinf(x);}
+static float sincosf_cosf(float x) {(void)sinf(x); return cosf(x);}
+static double sincos_sin(double x) {(void)cos(x); return sin(x);}
+static double sincos_cos(double x) {(void)sin(x); return cos(x);}
+#if USE_MPFR
+static int sincos_mpfr_sin(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_cos(y,x,r); return mpfr_sin(y,x,r); }
+static int sincos_mpfr_cos(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_sin(y,x,r); return mpfr_cos(y,x,r); }
+#endif
+
/* A bit of a hack: call vector functions twice with the same
input in lane 0 but a different value in other lanes: once
with an in-range value and then with a special case value. */
static int secondcall;
/* Wrappers for vector functions. */
-#ifdef __vpcs
+#if __aarch64__ && WANT_VMATH
typedef __f32x4_t v_float;
typedef __f64x2_t v_double;
-/* First element of fv and dv may be changed by -c argument. */
-static float fv[2] = {1.0f, -INFINITY};
-static double dv[2] = {1.0, -INFINITY};
+static const float fv[2] = {1.0f, -INFINITY};
+static const double dv[2] = {1.0, -INFINITY};
static inline v_float argf(float x) { return (v_float){x,x,x,fv[secondcall]}; }
static inline v_double argd(double x) { return (v_double){x,dv[secondcall]}; }
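
Concretely, each vector routine is driven through a scalar-looking wrapper (defined in the ulp_wrappers.h deletion further below) that pads the unused lanes from fv/dv and returns lane 0; the harness evaluates it once with secondcall = 0 (in-range padding, 1.0) and once with secondcall = 1 (special-case padding, -INFINITY), so the lane under test is checked next to both kinds of neighbour:

    /* Wrapper shape, as in the deleted ulp_wrappers.h: only lane 0 of
       the result is compared, the other lanes carry the padding.  */
    static float Z_expf (float x) { return _ZGVnN4v_expf (argf (x))[0]; }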
-#if WANT_SVE_MATH
-#include
-typedef __SVFloat32_t sv_float;
-typedef __SVFloat64_t sv_double;
-
-static inline sv_float svargf(float x) {
- int n = svcntw();
- float base[n];
- for (int i=0; i<n; i++)
[...]
 for (const struct fun *f = funtab; f->name; f++)
printf ("\t%s\n", f->name);
@@ -719,7 +768,6 @@ main (int argc, char *argv[])
conf.fenv = 1;
conf.softlim = 0;
conf.errlim = INFINITY;
- conf.ignore_zero_sign = 0;
for (;;)
{
argc--;
@@ -759,22 +807,11 @@ main (int argc, char *argv[])
{
argc--;
argv++;
- if (argc < 1 || argv[0][1] != '\0')
+ if (argc < 1)
usage ();
conf.rc = argv[0][0];
}
break;
- case 'z':
- conf.ignore_zero_sign = 1;
- break;
-#ifdef __vpcs
- case 'c':
- argc--;
- argv++;
- fv[0] = strtof(argv[0], 0);
- dv[0] = strtod(argv[0], 0);
- break;
-#endif
default:
usage ();
}
@@ -800,19 +837,7 @@ main (int argc, char *argv[])
if (strcmp (argv[0], f->name) == 0)
break;
if (!f->name)
- {
-#ifndef __vpcs
- /* Ignore vector math functions if vector math is not supported. */
- if (strncmp (argv[0], "_ZGVnN", 6) == 0)
- exit (0);
-#endif
-#if !WANT_SVE_MATH
- if (strncmp (argv[0], "_ZGVsMxv", 8) == 0)
- exit (0);
-#endif
- printf ("math function %s not supported\n", argv[0]);
- exit (1);
- }
+ usage ();
if (!f->singleprec && LDBL_MANT_DIG == DBL_MANT_DIG)
conf.mpfr = 1; /* Use mpfr if long double has no extra precision. */
if (!USE_MPFR && conf.mpfr)
diff --git a/math/test/ulp.h b/math/test/ulp.h
index b0bc59aeef8ddbd712d731e3e8d6635254fa7e88..a0c301664321067789322ba64932130fffa37000 100644
--- a/math/test/ulp.h
+++ b/math/test/ulp.h
@@ -1,8 +1,8 @@
/*
* Generic functions for ULP error estimation.
*
- * Copyright (c) 2019-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
*/
/* For each different math function type,
@@ -37,8 +37,7 @@ static int RT(ulpscale_mpfr) (mpfr_t x, int t)
/* Difference between exact result and closest real number that
gets rounded to got, i.e. error before rounding, for a correctly
rounded result the difference is 0. */
-static double RT (ulperr) (RT (float) got, const struct RT (ret) * p, int r,
- int ignore_zero_sign)
+static double RT(ulperr) (RT(float) got, const struct RT(ret) * p, int r)
{
RT(float) want = p->y;
RT(float) d;
@@ -46,18 +45,10 @@ static double RT (ulperr) (RT (float) got, const struct RT (ret) * p, int r,
if (RT(asuint) (got) == RT(asuint) (want))
return 0.0;
- if (isnan (got) && isnan (want))
- /* Ignore sign of NaN. */
- return RT (issignaling) (got) == RT (issignaling) (want) ? 0 : INFINITY;
if (signbit (got) != signbit (want))
- {
- /* Fall through to ULP calculation if ignoring sign of zero and at
- exactly one of want and got is non-zero. */
- if (ignore_zero_sign && want == got)
- return 0.0;
- if (!ignore_zero_sign || (want != 0 && got != 0))
- return INFINITY;
- }
+ /* May have false positives with NaN. */
+ //return isnan(got) && isnan(want) ? 0 : INFINITY;
+ return INFINITY;
if (!isfinite (want) || !isfinite (got))
{
if (isnan (got) != isnan (want))
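
For orientation, ulperr reports the error in units in the last place of the wanted result: with ulpexp the exponent of one ULP at want, and tail the before-rounding error recorded in struct ret (per the comment above the function), the finite path outside this hunk computes, in effect,

    \[ \mathrm{err} \;=\; \frac{\mathrm{got}-\mathrm{want}}{2^{\mathrm{ulpexp}}} \;-\; \mathrm{tail}, \]

so a correctly rounded result scores at most 0.5 ulp. This is a paraphrase from the surrounding definitions, not a quote of the omitted lines.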
@@ -123,12 +114,8 @@ static inline void T(call_fenv) (const struct fun *f, struct T(args) a, int r,
static inline void T(call_nofenv) (const struct fun *f, struct T(args) a,
int r, RT(float) * y, int *ex)
{
- if (r != FE_TONEAREST)
- fesetround (r);
*y = T(call) (f, a);
*ex = 0;
- if (r != FE_TONEAREST)
- fesetround (FE_TONEAREST);
}
static inline int T(call_long_fenv) (const struct fun *f, struct T(args) a,
@@ -168,12 +155,8 @@ static inline int T(call_long_nofenv) (const struct fun *f, struct T(args) a,
int r, struct RT(ret) * p,
RT(float) ygot, int exgot)
{
- if (r != FE_TONEAREST)
- fesetround (r);
RT(double) yl = T(call_long) (f, a);
p->y = (RT(float)) yl;
- if (r != FE_TONEAREST)
- fesetround (FE_TONEAREST);
if (RT(isok_nofenv) (ygot, p->y))
return 1;
p->ulpexp = RT(ulpscale) (p->y);
@@ -305,7 +288,7 @@ static int T(cmp) (const struct fun *f, struct gen *gen,
if (!ok)
{
int print = 0;
- double err = RT (ulperr) (ygot, &want, r, conf->ignore_zero_sign);
+ double err = RT(ulperr) (ygot, &want, r);
double abserr = fabs (err);
// TODO: count errors below accuracy limit.
if (abserr > 0)
diff --git a/math/test/ulp_funcs.h b/math/test/ulp_funcs.h
deleted file mode 100644
index 84f7927d393548617c480517b6709b875b0de70b..0000000000000000000000000000000000000000
--- a/math/test/ulp_funcs.h
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Function entries for ulp.
- *
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-/* clang-format off */
- F1 (sin)
- F1 (cos)
- F (sincosf_sinf, sincosf_sinf, sincos_sin, sincos_mpfr_sin, 1, 1, f1, 0)
- F (sincosf_cosf, sincosf_cosf, sincos_cos, sincos_mpfr_cos, 1, 1, f1, 0)
- F1 (exp)
- F1 (exp2)
- F1 (log)
- F1 (log2)
- F2 (pow)
- F1 (erf)
- D1 (exp)
- D1 (exp10)
- D1 (exp2)
- D1 (log)
- D1 (log2)
- D2 (pow)
- D1 (erf)
-#ifdef __vpcs
- F (_ZGVnN4v_sinf, Z_sinf, sin, mpfr_sin, 1, 1, f1, 1)
- F (_ZGVnN4v_cosf, Z_cosf, cos, mpfr_cos, 1, 1, f1, 1)
- F (_ZGVnN4v_expf_1u, Z_expf_1u, exp, mpfr_exp, 1, 1, f1, 1)
- F (_ZGVnN4v_expf, Z_expf, exp, mpfr_exp, 1, 1, f1, 1)
- F (_ZGVnN4v_exp2f_1u, Z_exp2f_1u, exp2, mpfr_exp2, 1, 1, f1, 1)
- F (_ZGVnN4v_exp2f, Z_exp2f, exp2, mpfr_exp2, 1, 1, f1, 1)
- F (_ZGVnN4v_logf, Z_logf, log, mpfr_log, 1, 1, f1, 1)
- F (_ZGVnN4vv_powf, Z_powf, pow, mpfr_pow, 2, 1, f2, 1)
- F (_ZGVnN2v_sin, Z_sin, sinl, mpfr_sin, 1, 0, d1, 1)
- F (_ZGVnN2v_cos, Z_cos, cosl, mpfr_cos, 1, 0, d1, 1)
- F (_ZGVnN2v_exp, Z_exp, expl, mpfr_exp, 1, 0, d1, 1)
- F (_ZGVnN2v_log, Z_log, logl, mpfr_log, 1, 0, d1, 1)
- F (_ZGVnN2vv_pow, Z_pow, powl, mpfr_pow, 2, 0, d2, 1)
-#endif
-/* clang-format on */
diff --git a/math/test/ulp_wrappers.h b/math/test/ulp_wrappers.h
deleted file mode 100644
index 60dc3d6dd652875043118f36f38e64ebedf5ab4a..0000000000000000000000000000000000000000
--- a/math/test/ulp_wrappers.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Function wrappers for ulp.
- *
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-/* clang-format off */
-
-/* Wrappers for sincos. */
-static float sincosf_sinf(float x) {(void)cosf(x); return sinf(x);}
-static float sincosf_cosf(float x) {(void)sinf(x); return cosf(x);}
-static double sincos_sin(double x) {(void)cos(x); return sin(x);}
-static double sincos_cos(double x) {(void)sin(x); return cos(x);}
-#if USE_MPFR
-static int sincos_mpfr_sin(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_cos(y,x,r); return mpfr_sin(y,x,r); }
-static int sincos_mpfr_cos(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_sin(y,x,r); return mpfr_cos(y,x,r); }
-#endif
-
-/* Wrappers for vector functions. */
-#ifdef __vpcs
-static float Z_sinf(float x) { return _ZGVnN4v_sinf(argf(x))[0]; }
-static float Z_cosf(float x) { return _ZGVnN4v_cosf(argf(x))[0]; }
-static float Z_expf_1u(float x) { return _ZGVnN4v_expf_1u(argf(x))[0]; }
-static float Z_expf(float x) { return _ZGVnN4v_expf(argf(x))[0]; }
-static float Z_exp2f_1u(float x) { return _ZGVnN4v_exp2f_1u(argf(x))[0]; }
-static float Z_exp2f(float x) { return _ZGVnN4v_exp2f(argf(x))[0]; }
-static float Z_logf(float x) { return _ZGVnN4v_logf(argf(x))[0]; }
-static float Z_powf(float x, float y) { return _ZGVnN4vv_powf(argf(x),argf(y))[0]; }
-static double Z_sin(double x) { return _ZGVnN2v_sin(argd(x))[0]; }
-static double Z_cos(double x) { return _ZGVnN2v_cos(argd(x))[0]; }
-static double Z_exp(double x) { return _ZGVnN2v_exp(argd(x))[0]; }
-static double Z_log(double x) { return _ZGVnN2v_log(argd(x))[0]; }
-static double Z_pow(double x, double y) { return _ZGVnN2vv_pow(argd(x),argd(y))[0]; }
-#endif
-
-/* clang-format on */
diff --git a/math/tgamma128.c b/math/tgamma128.c
deleted file mode 100644
index dda0da7e8adb4a7fa3b78826316f8fd8a4fae12a..0000000000000000000000000000000000000000
--- a/math/tgamma128.c
+++ /dev/null
@@ -1,351 +0,0 @@
-/*
- * Implementation of the true gamma function (as opposed to lgamma)
- * for 128-bit long double.
- *
- * Copyright (c) 2006,2009,2023 Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-/*
- * This module implements the float128 gamma function under the name
- * tgamma128. It's expected to be suitable for integration into system
- * maths libraries under the standard name tgammal, if long double is
- * 128-bit. Such a library will probably want to check the error
- * handling and optimize the initial process of extracting the
- * exponent, which is done here by simple and portable (but
- * potentially slower) methods.
- */
-
-#include <float.h>
-#include <math.h>
-#include <stdbool.h>
-#include <stddef.h>
-
-#include "tgamma128.h"
-
-#define lenof(x) (sizeof(x)/sizeof(*(x)))
-
-/*
- * Helper routine to evaluate a polynomial via Horner's rule
- */
-static long double poly(const long double *coeffs, size_t n, long double x)
-{
- long double result = coeffs[--n];
-
- while (n > 0)
- result = (result * x) + coeffs[--n];
-
- return result;
-}
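
poly is plain Horner evaluation, highest-degree coefficient first, one multiply-add per step:

    \[ p(x) \;=\; c_0 + x\bigl(c_1 + x(c_2 + \cdots + x\,c_{n-1})\bigr), \]

so, for example, poly applied to three coefficients {c0, c1, c2} returns c0 + x*(c1 + x*c2).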
-
-/*
- * Compute sin(pi*x) / pi, for use in the reflection formula that
- * relates gamma(-x) and gamma(x).
- */
-static long double sin_pi_x_over_pi(long double x)
-{
- int quo;
- long double fracpart = remquol(x, 0.5L, &quo);
-
- long double sign = 1.0L;
- if (quo & 2)
- sign = -sign;
- quo &= 1;
-
- if (quo == 0 && fabsl(fracpart) < 0x1.p-58L) {
- /* For numbers this size, sin(pi*x) is so close to pi*x that
- * sin(pi*x)/pi is indistinguishable from x in float128 */
- return sign * fracpart;
- }
-
- if (quo == 0) {
- return sign * sinl(pi*fracpart) / pi;
- } else {
- return sign * cosl(pi*fracpart) / pi;
- }
-}
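
The remquol call leaves x = q/2 + f with q = quo and |f| <= 1/4, so the standard quadrant identities drive the branches above: bit 1 of quo supplies the sign and bit 0 selects sine or cosine,

    \[ \frac{\sin(\pi x)}{\pi} \;=\; (-1)^{\lfloor q/2\rfloor}
         \begin{cases} \sin(\pi f)/\pi, & q \text{ even},\\
                       \cos(\pi f)/\pi, & q \text{ odd}. \end{cases} \]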
-
-/* Return tgamma(x) on the assumption that x >= 8. */
-static long double tgamma_large(long double x,
- bool negative, long double negadjust)
-{
- /*
- * In this range we compute gamma(x) as x^(x-1/2) * e^-x * K,
- * where K is a correction factor computed as a polynomial in 1/x.
- *
- * (Vaguely inspired by the form of the Lanczos approximation, but
- * I tried the Lanczos approximation itself and it suffers badly
- * from big cancellation leading to loss of significance.)
- */
- long double t = 1/x;
- long double p = poly(coeffs_large, lenof(coeffs_large), t);
-
- /*
- * To avoid overflow in cases where x^(x-0.5) does overflow
- * but gamma(x) does not, we split x^(x-0.5) in half and
- * multiply back up _after_ multiplying the shrinking factor
- * of exp(-(x-0.5)).
- *
- * Note that computing x-0.5 and (x-0.5)/2 is exact for the
- * relevant range of x, so the only sources of error are pow
- * and exp themselves, plus the multiplications.
- */
- long double powhalf = powl(x, (x-0.5L)/2.0L);
- long double expret = expl(-(x-0.5L));
-
- if (!negative) {
- return (expret * powhalf) * powhalf * p;
- } else {
- /*
- * Apply the reflection formula as commented below, but
- * carefully: negadjust has magnitude less than 1, so it can
- * turn a case where gamma(+x) would overflow into a case
- * where gamma(-x) doesn't underflow. Not only that, but the
- * FP format has greater range in the tiny domain due to
- * denormals. For both reasons, it's not good enough to
- * compute the positive result and then adjust it.
- */
- long double ret = 1 / ((expret * powhalf) * (x * negadjust) * p);
- return ret / powhalf;
- }
-}
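
In formula form, this path evaluates

    \[ \Gamma(x) \;\approx\; x^{(x-1/2)/2}\cdot\Bigl(e^{-(x-1/2)}\,x^{(x-1/2)/2}\Bigr)\cdot p(1/x), \]

where p absorbs the constant factors relative to the textbook x^(x-1/2) e^(-x) K form (a leftover e^(-1/2), in particular). Multiplying by the second powhalf copy only after the decaying exponential keeps every partial product in range whenever the final Gamma(x) is itself representable.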
-
-/* Return tgamma(x) on the assumption that 0 <= x < 1/32. */
-static long double tgamma_tiny(long double x,
- bool negative, long double negadjust)
-{
- /*
- * For x near zero, we use a polynomial approximation to
- * g = 1/(x*gamma(x)), and then return 1/(g*x).
- */
- long double g = poly(coeffs_tiny, lenof(coeffs_tiny), x);
- if (!negative)
- return 1.0L / (g*x);
- else
- return g / negadjust;
-}
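
The reciprocal trick works because x*gamma(x) = gamma(x+1), which is analytic and equals 1 at x = 0, so g has a well-behaved power series:

    \[ g(x) \;=\; \frac{1}{x\,\Gamma(x)} \;=\; \frac{1}{\Gamma(x+1)}
         \;=\; 1 + \gamma x + \Bigl(\tfrac{\gamma^2}{2}-\tfrac{\pi^2}{12}\Bigr)x^2 + O(x^3), \]

with gamma the Euler-Mascheroni constant 0.5772156649...; this matches coeffs_tiny[1] in the deleted tgamma128.h below (0x1.2788cfc6fb618f...p-1), and the x^2 coefficient -0.6558780... matches coeffs_tiny[2].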
-
-/* Return tgamma(x) on the assumption that 0 <= x < 2^-113. */
-static long double tgamma_ultratiny(long double x, bool negative,
- long double negadjust)
-{
- /* On this interval, gamma can't even be distinguished from 1/x,
- * so we skip the polynomial evaluation in tgamma_tiny, partly to
- * save time and partly to avoid the tiny intermediate values
- * setting the underflow exception flag. */
- if (!negative)
- return 1.0L / x;
- else
- return 1.0L / negadjust;
-}
-
-/* Return tgamma(x) on the assumption that 1 <= x <= 2. */
-static long double tgamma_central(long double x)
-{
- /*
- * In this central interval, our strategy is to find the
- * difference between x and the point where gamma has a minimum,
- * and approximate based on that.
- */
-
- /* The difference between the input x and the minimum x. The first
- * subtraction is expected to be exact, since x and min_x_hi have
- * the same exponent (unless x=2, in which case it will still be
- * exact). */
- long double t = (x - min_x_hi) - min_x_lo;
-
- /*
- * Now use two different polynomials for the intervals [1,m] and
- * [m,2].
- */
- long double p;
- if (t < 0)
- p = poly(coeffs_central_neg, lenof(coeffs_central_neg), -t);
- else
- p = poly(coeffs_central_pos, lenof(coeffs_central_pos), t);
-
- return (min_y_lo + p * (t*t)) + min_y_hi;
-}
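
Two details worth spelling out. The minimum's location and value are each stored as a hi + lo pair of 128-bit constants (min_x_hi/min_x_lo and min_y_hi/min_y_lo in the deleted tgamma128.h below), roughly doubling the effective precision of those anchors. And since Gamma'(x_min) = 0, the expansion about the minimum has no linear term,

    \[ \Gamma(x_{\min}+t) \;=\; y_{\min} + \tfrac12\,\Gamma''(x_{\min})\,t^2 + O(t^3), \]

which is why p multiplies t*t in the return value, and why the result is assembled small-parts-first as (min_y_lo + p*(t*t)) + min_y_hi.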
-
-long double tgamma128(long double x)
-{
- /*
- * Start by extracting the number's sign and exponent, and ruling
- * out cases of non-normalized numbers.
- *
- * For an implementation integrated into a system libm, it would
- * almost certainly be quicker to do this by direct bitwise access
- * to the input float128 value, using whatever is the local idiom
- * for knowing its endianness.
- *
- * Integration into a system libc may also need to worry about
- * setting errno, if that's the locally preferred way to report
- * math.h errors.
- */
- int sign = signbit(x);
- int exponent;
- switch (fpclassify(x)) {
- case FP_NAN:
- return x+x; /* propagate QNaN, make SNaN throw an exception */
- case FP_ZERO:
- return 1/x; /* divide by zero on purpose to indicate a pole */
- case FP_INFINITE:
- if (sign) {
- return x-x; /* gamma(-inf) has indeterminate sign, so provoke an
- * IEEE invalid operation exception to indicate that */
- }
- return x; /* but gamma(+inf) is just +inf with no error */
- case FP_SUBNORMAL:
- exponent = -16384;
- break;
- default:
- frexpl(x, &exponent);
- exponent--;
- break;
- }
-
- bool negative = false;
- long double negadjust = 0.0L;
-
- if (sign) {
- /*
- * Euler's reflection formula is
- *
- * gamma(1-x) gamma(x) = pi/sin(pi*x)
- *
- * pi
- * => gamma(x) = --------------------
- * gamma(1-x) sin(pi*x)
- *
- * But computing 1-x is going to lose a lot of accuracy when x
- * is very small, so instead we transform using the recurrence
- * gamma(t+1)=t gamma(t). Setting t=-x, this gives us
- * gamma(1-x) = -x gamma(-x), so we now have
- *
- * pi
- * gamma(x) = ----------------------
- * -x gamma(-x) sin(pi*x)
- *
- * which relates gamma(x) to gamma(-x), which is much nicer,
- * since x can be turned into -x without rounding.
- */
- negadjust = sin_pi_x_over_pi(x);
- negative = true;
- x = -x;
-
- /*
- * Now the ultimate answer we want is
- *
- * 1 / (gamma(x) * x * negadjust)
- *
- * where x is the positive value we've just turned it into.
- *
- * For some of the cases below, we'll compute gamma(x)
- * normally and then compute this adjusted value afterwards.
- * But for others, we can implement the reciprocal operation
- * in this formula by _avoiding_ an inversion that the
- * sub-case was going to do anyway.
- */
-
- if (negadjust == 0) {
- /*
- * Special case for negative integers. Applying the
- * reflection formula would cause division by zero, but
- * standards would prefer we treat this error case as an
- * invalid operation and return NaN instead. (Possibly
- * because otherwise you'd have to decide which sign of
- * infinity to return, and unlike the x=0 case, there's no
- * sign of zero available to disambiguate.)
- */
- return negadjust / negadjust;
- }
- }
-
- /*
- * Split the positive domain into various cases. For cases where
- * we do the negative-number adjustment the usual way, we'll leave
- * the answer in 'g' and drop out of the if statement.
- */
- long double g;
-
- if (exponent >= 11) {
- /*
- * gamma of any positive value this large overflows, and gamma
- * of any negative value underflows.
- */
- if (!negative) {
- long double huge = 0x1p+12288L;
- return huge * huge; /* provoke an overflow */
- } else {
- long double tiny = 0x1p-12288L;
- return tiny * tiny * negadjust; /* underflow, of the right sign */
- }
- } else if (exponent >= 3) {
- /* Negative-number adjustment happens inside here */
- return tgamma_large(x, negative, negadjust);
- } else if (exponent < -113) {
- /* Negative-number adjustment happens inside here */
- return tgamma_ultratiny(x, negative, negadjust);
- } else if (exponent < -5) {
- /* Negative-number adjustment happens inside here */
- return tgamma_tiny(x, negative, negadjust);
- } else if (exponent == 0) {
- g = tgamma_central(x);
- } else if (exponent < 0) {
- /*
- * For x in [1/32,1) we range-reduce upwards to the interval
- * [1,2), using the inverse of the normal recurrence formula:
- * gamma(x) = gamma(x+1)/x.
- */
- g = tgamma_central(1+x) / x;
- } else {
- /*
- * For x in [2,8) we range-reduce downwards to the interval
- * [1,2) by repeated application of the recurrence formula.
- *
- * Actually multiplying (x-1) by (x-2) by (x-3) and so on
- * would introduce multiple ULPs of rounding error. We can get
- * better accuracy by writing x = (k+1/2) + t, where k is an
- * integer and |t|<1/2, and expanding out the obvious factor
- * (x-1)(x-2)...(x-k+1) as a polynomial in t.
- */
- long double mult;
- int i = x;
- if (i == 2) { /* x in [2,3) */
- mult = (x-1);
- } else {
- long double t = x - (i + 0.5L);
- switch (i) {
- /* E.g. for x=3.5+t, we want
- * (x-1)(x-2) = (2.5+t)(1.5+t) = 3.75 + 4t + t^2 */
- case 3:
- mult = 3.75L+t*(4.0L+t);
- break;
- case 4:
- mult = 13.125L+t*(17.75L+t*(7.5L+t));
- break;
- case 5:
- mult = 59.0625L+t*(93.0L+t*(51.50L+t*(12.0L+t)));
- break;
- case 6:
- mult = 324.84375L+t*(570.5625L+t*(376.250L+t*(
- 117.5L+t*(17.5L+t))));
- break;
- case 7:
- mult = 2111.484375L+t*(4033.5L+t*(3016.1875L+t*(
- 1140.0L+t*(231.25L+t*(24.0L+t)))));
- break;
- }
- }
-
- g = tgamma_central(x - (i-1)) * mult;
- }
-
- if (!negative) {
- /* Positive domain: return g unmodified */
- return g;
- } else {
- /* Negative domain: apply the reflection formula as commented above */
- return 1.0L / (g * x * negadjust);
- }
-}
diff --git a/math/tgamma128.h b/math/tgamma128.h
deleted file mode 100644
index ced10c3cc34ca26bcb3d6d8b31899ef9c3f35b15..0000000000000000000000000000000000000000
--- a/math/tgamma128.h
+++ /dev/null
@@ -1,141 +0,0 @@
-/*
- * Polynomial coefficients and other constants for tgamma128.c.
- *
- * Copyright (c) 2006,2009,2023 Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-/* The largest positive value for which 128-bit tgamma does not overflow. */
-static const long double max_x = 0x1.b6e3180cd66a5c4206f128ba77f4p+10L;
-
-/* Coefficients of the polynomial used in the tgamma_large() subroutine */
-static const long double coeffs_large[] = {
- 0x1.8535745aa79569579b9eec0f3bbcp+0L,
- 0x1.0378f83c6fb8f0e51269f2b4a973p-3L,
- 0x1.59f6a05094f69686c3380f4e2783p-8L,
- -0x1.0b291dee952a82764a4859b081a6p-8L,
- -0x1.6dd301b2205bf936b5a3eaad0dbbp-12L,
- 0x1.387a8b5f38dd77e7f139b1021e86p-10L,
- 0x1.bca46637f65b13750c728cc29e40p-14L,
- -0x1.d80401c00aef998c9e303151a51cp-11L,
- -0x1.49cb6bb09f935a2053ccc2cf3711p-14L,
- 0x1.4e950204437dcaf2be77f73a6f45p-10L,
- 0x1.cb711a2d65f188bf60110934d6bep-14L,
- -0x1.7d7ff4bc95dc7faefc5e767f70f1p-9L,
- -0x1.0305ab9760cddb0d833e73766836p-12L,
- 0x1.3ef6c84bf1cd5c3f65ac2693bb5bp-7L,
- 0x1.bb4144740ad9290123fdcea684aap-11L,
- -0x1.72ab4e88272a229bfafd192450f0p-5L,
- 0x1.80c70ac6eb3b7a698983d25a62b8p-12L,
- 0x1.e222791c6743ce3e3cae220fb236p-3L,
- 0x1.1a2dca1c82a9326c52b465f7cb7ap-2L,
- -0x1.9d204fa235a42cd901b123d2ad47p+1L,
- 0x1.55b56d1158f77ddb1c95fc44ab02p+0L,
- 0x1.37f900a11dbd892abd7dde533e2dp+5L,
- -0x1.2da49f4188dd89cb958369ef2401p+7L,
- 0x1.fdae5ec3ec6eb7dffc09edbe6c14p+7L,
- -0x1.61433cebe649098c9611c4c7774ap+7L,
-};
-
-/* Coefficients of the polynomial used in the tgamma_tiny() subroutine */
-static const long double coeffs_tiny[] = {
- 0x1.0000000000000000000000000000p+0L,
- 0x1.2788cfc6fb618f49a37c7f0201fep-1L,
- -0x1.4fcf4026afa2dceb8490ade22796p-1L,
- -0x1.5815e8fa27047c8f42b5d9217244p-5L,
- 0x1.5512320b43fbe5dfa771333518f7p-3L,
- -0x1.59af103c340927bffdd44f954bfcp-5L,
- -0x1.3b4af28483e210479657e5543366p-7L,
- 0x1.d919c527f6070bfce9b29c2ace9cp-8L,
- -0x1.317112ce35337def3556a18aa178p-10L,
- -0x1.c364fe77a6f27677b985b1fa2e1dp-13L,
- 0x1.0c8a7a19a3fd40fe1f7e867efe7bp-13L,
- -0x1.51cf9f090b5dc398ba86305e3634p-16L,
- -0x1.4e80f64c04a339740de06ca9fa4ap-20L,
- 0x1.241ddc2aef2ec20e58b08f2fda17p-20L,
-};
-
-/* The location within the interval [1,2] where gamma has a minimum.
- * Specified as the sum of two 128-bit values, for extra precision. */
-static const long double min_x_hi = 0x1.762d86356be3f6e1a9c8865e0a4fp+0L;
-static const long double min_x_lo = 0x1.ac54d7d218de21303a7c60f08840p-118L;
-
-/* The actual minimum value that gamma takes at that location.
- * Again specified as the sum of two 128-bit values. */
-static const long double min_y_hi = 0x1.c56dc82a74aee8d8851566d40f32p-1L;
-static const long double min_y_lo = 0x1.8ed98685742c353ce55e5794686fp-114L;
-
-/* Coefficients of the polynomial used in the tgamma_central() subroutine
- * for computing gamma on the interval [1,min_x] */
-static const long double coeffs_central_neg[] = {
- 0x1.b6c53f7377b83839c8a292e43b69p-2L,
- 0x1.0bae9f40c7d09ed76e732045850ap-3L,
- 0x1.4981175e14d04c3530e51d01c5fep-3L,
- 0x1.79f77aaf032c948af3a9edbd2061p-4L,
- 0x1.1e97bd10821095a5b79fbfdfa1a3p-4L,
- 0x1.8071ce0935e4dcf0b33b0fbec7c1p-5L,
- 0x1.0b44c2f92982f887b55ec36dfdb0p-5L,
- 0x1.6df1de1e178ef72ca7bd63d40870p-6L,
- 0x1.f63f502bde27e81c0f5e13479b43p-7L,
- 0x1.57fd67d901f40ea011353ad89a0ap-7L,
- 0x1.d7151376eed187eb753e2273cafcp-8L,
- 0x1.427162b5c6ff1d904c71ef53e37cp-8L,
- 0x1.b954b8c3a56cf93e49ef6538928ap-9L,
- 0x1.2dff2ec26a3ae5cd3aaccae7a09ep-9L,
- 0x1.9d35250d9b9378d9b59df734537ap-10L,
- 0x1.1b2c0c48b9855a28f6dbd6fdff3cp-10L,
- 0x1.7e0db39bb99cdb52b028d9359380p-11L,
- 0x1.2164b5e1d364a0b5eaf97c436aa7p-11L,
- 0x1.27521cf5fd24dcdf43524e6add11p-13L,
- 0x1.06461d62243bf9a826b42349672fp-10L,
- -0x1.2b852abead28209b4e0c756dc46ep-9L,
- 0x1.be673c11a72c826115ec6d286c14p-8L,
- -0x1.fd9ce330c215c31fcd3cb53c42ebp-7L,
- 0x1.fa362bd2dc68f41abef2d8600acdp-6L,
- -0x1.a21585b2f52f8b23855de8e452edp-5L,
- 0x1.1f234431ed032052fc92e64e0493p-4L,
- -0x1.40d332476ca0199c60cdae3f9132p-4L,
- 0x1.1d45dc665d86012eba2eea199cefp-4L,
- -0x1.8491016cdd08dc9be7ade9b5fef3p-5L,
- 0x1.7e7e2fbc6d49ad484300d6add324p-6L,
- -0x1.e63fe3f874a37276a8d7d8b705ecp-8L,
- 0x1.30a2a73944f8c84998314d69c23fp-10L,
-};
-
-/* Coefficients of the polynomial used in the tgamma_central() subroutine
- * for computing gamma on the interval [min_x,2] */
-static const long double coeffs_central_pos[] = {
- 0x1.b6c53f7377b83839c8a292e22aa2p-2L,
- -0x1.0bae9f40c7d09ed76e72e1c955dep-3L,
- 0x1.4981175e14d04c3530ee5e1ecebcp-3L,
- -0x1.79f77aaf032c948ac983d77f3e07p-4L,
- 0x1.1e97bd10821095ab7dc94936cc11p-4L,
- -0x1.8071ce0935e4d7edef8cbf2a1cf1p-5L,
- 0x1.0b44c2f929837fafef7b5d9e80f1p-5L,
- -0x1.6df1de1e175fe2a51faa25cddbb4p-6L,
- 0x1.f63f502be57d11aed2cfe90843ffp-7L,
- -0x1.57fd67d852f230015b9f64770273p-7L,
- 0x1.d715138adc07e5fce81077070357p-8L,
- -0x1.4271618e9fda8992a667adb15f4fp-8L,
- 0x1.b954d15d9eb772e80fdd760672d7p-9L,
- -0x1.2dfe391241d3cb79c8c15182843dp-9L,
- 0x1.9d44396fcd48451c3ba924cee814p-10L,
- -0x1.1ac195fb99739e341589e39803e6p-10L,
- 0x1.82e46127b68f002770826e25f146p-11L,
- -0x1.089dacd90d9f41493119ac178359p-11L,
- 0x1.6993c007b20394a057d21f3d37f8p-12L,
- -0x1.ec43a709f4446560c099dec8e31bp-13L,
- 0x1.4ba36322f4074e9add9450f003cap-13L,
- -0x1.b3f83a977965ca1b7937bf5b34cap-14L,
- 0x1.10af346abc09cb25a6d9fe810b6ep-14L,
- -0x1.38d8ea1188f242f50203edc395bdp-15L,
- 0x1.39add987a948ec56f62b721a4475p-16L,
- -0x1.02a4e141f286c8a967e2df9bc9adp-17L,
- 0x1.433b50af22425f546e87113062d7p-19L,
- -0x1.0c7b73cb0013f00aafc103e8e382p-21L,
- 0x1.b852de313ec38da2297f6deaa6b4p-25L,
-};
-
-/* 128-bit float value of pi, used by the sin_pi_x_over_pi subroutine
- */
-static const long double pi = 0x1.921fb54442d18469898cc51701b8p+1L;
diff --git a/math/tools/cos.sollya b/math/tools/cos.sollya
index 6690adfcbb9b8e57cfb5e11ca73fa52594a8443c..bd72d6b7482089d27bce02848b85b074e4b737b3 100644
--- a/math/tools/cos.sollya
+++ b/math/tools/cos.sollya
@@ -1,7 +1,7 @@
// polynomial for approximating cos(x)
//
// Copyright (c) 2019, Arm Limited.
-// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+// SPDX-License-Identifier: MIT
deg = 8; // polynomial degree
a = -pi/4; // interval
diff --git a/math/tools/exp.sollya b/math/tools/exp.sollya
index 0668bdb5b3d30a088e09b38f099824e91368a237..b7a462cda5a4f8efb571c3ce3c296d42bb7d7e98 100644
--- a/math/tools/exp.sollya
+++ b/math/tools/exp.sollya
@@ -1,7 +1,7 @@
// polynomial for approximating e^x
//
// Copyright (c) 2019, Arm Limited.
-// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+// SPDX-License-Identifier: MIT
deg = 5; // poly degree
N = 128; // table entries
diff --git a/math/tools/exp2.sollya b/math/tools/exp2.sollya
index bd0a42d6bbcbc0c66157c423d19a2a26970eecd5..e760769601d40009575d6b121e969e7c09749acb 100644
--- a/math/tools/exp2.sollya
+++ b/math/tools/exp2.sollya
@@ -1,7 +1,7 @@
// polynomial for approximating 2^x
//
// Copyright (c) 2019, Arm Limited.
-// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+// SPDX-License-Identifier: MIT
// exp2f parameters
deg = 3; // poly degree
diff --git a/math/tools/log.sollya b/math/tools/log.sollya
index 5288f557292570e5f54ef2e80083407e51c82c41..6df4db44b6f30133e38fa46a0824ea1356313fb1 100644
--- a/math/tools/log.sollya
+++ b/math/tools/log.sollya
@@ -1,7 +1,7 @@
// polynomial for approximating log(1+x)
//
// Copyright (c) 2019, Arm Limited.
-// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+// SPDX-License-Identifier: MIT
deg = 12; // poly degree
// |log(1+x)| > 0x1p-4 outside the interval
diff --git a/math/tools/log2.sollya b/math/tools/log2.sollya
index 85811be5d90c9bb5acdee32f5dcfe6d3a2989514..4a364c0f111ff6acebfb0782b472b1500218187e 100644
--- a/math/tools/log2.sollya
+++ b/math/tools/log2.sollya
@@ -1,7 +1,7 @@
// polynomial for approximating log2(1+x)
//
// Copyright (c) 2019, Arm Limited.
-// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+// SPDX-License-Identifier: MIT
deg = 11; // poly degree
// |log2(1+x)| > 0x1p-4 outside the interval
diff --git a/math/tools/log2_abs.sollya b/math/tools/log2_abs.sollya
index d018ba0145d24d0d095b4393ff9263bcff89cdb0..82c4dac26fa128d98f0905b12166efee28f0f180 100644
--- a/math/tools/log2_abs.sollya
+++ b/math/tools/log2_abs.sollya
@@ -1,7 +1,7 @@
// polynomial for approximating log2(1+x)
//
// Copyright (c) 2019, Arm Limited.
-// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+// SPDX-License-Identifier: MIT
deg = 7; // poly degree
// interval ~= 1/(2*N), where N is the table entries
diff --git a/math/tools/log_abs.sollya b/math/tools/log_abs.sollya
index 5f9bfe41a6830f5a4ae4028bba223c0186d9c9ee..a2ac190fc49702e362decc43aafa5240d15730f5 100644
--- a/math/tools/log_abs.sollya
+++ b/math/tools/log_abs.sollya
@@ -1,7 +1,7 @@
// polynomial for approximating log(1+x)
//
// Copyright (c) 2019, Arm Limited.
-// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+// SPDX-License-Identifier: MIT
deg = 6; // poly degree
// interval ~= 1/(2*N), where N is the table entries
diff --git a/math/tools/plot.py b/math/tools/plot.py
index a0fa023225606e0b02afadede19c028d64d85d15..6c8b89ff284b5a6e220d940fa06d0d56549a21c6 100755
--- a/math/tools/plot.py
+++ b/math/tools/plot.py
@@ -3,7 +3,7 @@
# ULP error plot tool.
#
# Copyright (c) 2019, Arm Limited.
-# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+# SPDX-License-Identifier: MIT
import numpy as np
import matplotlib.pyplot as plt
diff --git a/math/tools/remez.jl b/math/tools/remez.jl
index 1deab67d0660a946fac4e38d6394bae0aaeb7c98..2ff436f5287ff2d426413f6817a966ac82990439 100755
--- a/math/tools/remez.jl
+++ b/math/tools/remez.jl
@@ -4,7 +4,7 @@
# remez.jl - implementation of the Remez algorithm for polynomial approximation
#
# Copyright (c) 2015-2019, Arm Limited.
-# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+# SPDX-License-Identifier: MIT
import Base.\
diff --git a/math/tools/sin.sollya b/math/tools/sin.sollya
index a19300019867873928cb384f28d7ede5a46155dc..a6e851145c119e9a425e6af308d01b4022be44f5 100644
--- a/math/tools/sin.sollya
+++ b/math/tools/sin.sollya
@@ -1,7 +1,7 @@
// polynomial for approximating sin(x)
//
// Copyright (c) 2019, Arm Limited.
-// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+// SPDX-License-Identifier: MIT
deg = 7; // polynomial degree
a = -pi/4; // interval
diff --git a/math/tools/tgamma128_gen.jl b/math/tools/tgamma128_gen.jl
deleted file mode 100644
index da76e8b9b84ba8f5e0290db2e38e551b26c7c332..0000000000000000000000000000000000000000
--- a/math/tools/tgamma128_gen.jl
+++ /dev/null
@@ -1,212 +0,0 @@
-# -*- julia -*-
-#
-# Generate tgamma128.h, containing polynomials and constants used by
-# tgamma128.c.
-#
-# Copyright (c) 2006,2009,2023 Arm Limited.
-# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
-
-# This Julia program depends on the 'Remez' and 'SpecialFunctions'
-# library packages. To install them, run this at the interactive Julia
-# prompt:
-#
-# import Pkg; Pkg.add(["Remez", "SpecialFunctions"])
-#
-# Tested on Julia 1.4.1 (Ubuntu 20.04) and 1.9.0 (22.04).
-
-import Printf
-import Remez
-import SpecialFunctions
-
-# Round a BigFloat to 128-bit long double and format it as a C99 hex
-# float literal.
-function quadhex(x)
- sign = " "
- if x < 0
- sign = "-"
- x = -x
- end
-
- exponent = BigInt(floor(log2(x)))
- exponent = max(exponent, -16382)
- @assert(exponent <= 16383) # else overflow
-
- x /= BigFloat(2)^exponent
- @assert(1 <= x < 2)
- x *= BigFloat(2)^112
- mantissa = BigInt(round(x))
-
- mantstr = string(mantissa, base=16, pad=29)
- return Printf.@sprintf("%s0x%s.%sp%+dL", sign, mantstr[1], mantstr[2:end],
- exponent)
-end
-
-# Round a BigFloat to 128-bit long double and return it still as a
-# BigFloat.
-function quadval(x, round=0)
- sign = +1
- if x.sign < 0
- sign = -1
- x = -x
- end
-
- exponent = BigInt(floor(log2(x)))
- exponent = max(exponent, -16382)
- @assert(exponent <= 16383) # else overflow
-
- x /= BigFloat(2)^exponent
- @assert(1 <= x < 2)
- x *= BigFloat(2)^112
- if round < 0
- mantissa = floor(x)
- elseif round > 0
- mantissa = ceil(x)
- else
- mantissa = Base.round(x)  # the 'round' parameter shadows Base.round
- end
-
- return sign * mantissa * BigFloat(2)^(exponent - 112)
-end
-
-# Output an array of BigFloats as a C array declaration.
-function dumparray(a, name)
- println("static const long double ", name, "[] = {")
- for x in a
- println(" ", quadhex(x), ",")
- end
- println("};")
-end
-
-print("/*
- * Polynomial coefficients and other constants for tgamma128.c.
- *
- * Copyright (c) 2006,2009,2023 Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-")
-
-Base.MPFR.setprecision(512)
-
-e = exp(BigFloat(1))
-
-print("
-/* The largest positive value for which 128-bit tgamma does not overflow. */
-")
-lo = BigFloat("1000")
-hi = BigFloat("2000")
-while true
- global lo
- global hi
- global max_x
-
- mid = (lo + hi) / 2
- if mid == lo || mid == hi
- max_x = mid
- break
- end
- if SpecialFunctions.logabsgamma(mid)[1] < 16384 * log(BigFloat(2))
- lo = mid
- else
- hi = mid
- end
-end
-max_x = quadval(max_x, -1)
-println("static const long double max_x = ", quadhex(max_x), ";")
-
-print("
-/* Coefficients of the polynomial used in the tgamma_large() subroutine */
-")
-N, D, E, X = Remez.ratfn_minimax(
- x -> x==0 ? sqrt(BigFloat(2)*pi/e) :
- exp(SpecialFunctions.logabsgamma(1/x)[1] +
- (1/x-0.5)*(1+log(x))),
- (0, 1/BigFloat(8)),
- 24, 0,
- (x, y) -> 1/y
-)
-dumparray(N, "coeffs_large")
-
-print("
-/* Coefficients of the polynomial used in the tgamma_tiny() subroutine */
-")
-N, D, E, X = Remez.ratfn_minimax(
- x -> x==0 ? 1 : 1/(x*SpecialFunctions.gamma(x)),
- (0, 1/BigFloat(32)),
- 13, 0,
-)
-dumparray(N, "coeffs_tiny")
-
-print("
-/* The location within the interval [1,2] where gamma has a minimum.
- * Specified as the sum of two 128-bit values, for extra precision. */
-")
-lo = BigFloat("1.4")
-hi = BigFloat("1.5")
-while true
- global lo
- global hi
- global min_x
-
- mid = (lo + hi) / 2
- if mid == lo || mid == hi
- min_x = mid
- break
- end
- if SpecialFunctions.digamma(mid) < 0
- lo = mid
- else
- hi = mid
- end
-end
-min_x_hi = quadval(min_x, -1)
-println("static const long double min_x_hi = ", quadhex(min_x_hi), ";")
-println("static const long double min_x_lo = ", quadhex(min_x - min_x_hi), ";")
-
-print("
-/* The actual minimum value that gamma takes at that location.
- * Again specified as the sum of two 128-bit values. */
-")
-min_y = SpecialFunctions.gamma(min_x)
-min_y_hi = quadval(min_y, -1)
-println("static const long double min_y_hi = ", quadhex(min_y_hi), ";")
-println("static const long double min_y_lo = ", quadhex(min_y - min_y_hi), ";")
-
-function taylor_bodge(x)
- # Taylor series generated by Wolfram Alpha for (gamma(min_x+x)-min_y)/x^2.
- # Used in the Remez calls below for x values very near the origin, to avoid
- # significance loss problems when trying to compute it directly via that
- # formula (even in MPFR's extra precision).
- return BigFloat("0.428486815855585429730209907810650582960483696962660010556335457558784421896667728014324097132413696263704801646004585959298743677879606168187061990204432200")+x*(-BigFloat("0.130704158939785761928008749242671025181542078105370084716141350308119418619652583986015464395882363802104154017741656168641240436089858504560718773026275797")+x*(BigFloat("0.160890753325112844190519489594363387594505844658437718135952967735294789599989664428071656484587979507034160383271974554122934842441540146372016567834062876")+x*(-BigFloat("0.092277030213334350126864106458600575084335085690780082222880945224248438672595248111704471182201673989215223667543694847795410779036800385804729955729659506"))))
-end
-
-print("
-/* Coefficients of the polynomial used in the tgamma_central() subroutine
- * for computing gamma on the interval [1,min_x] */
-")
-N, D, E, X = Remez.ratfn_minimax(
- x -> x < BigFloat(0x1p-64) ? taylor_bodge(-x) :
- (SpecialFunctions.gamma(min_x - x) - min_y) / (x*x),
- (0, min_x - 1),
- 31, 0,
- (x, y) -> x^2,
-)
-dumparray(N, "coeffs_central_neg")
-
-print("
-/* Coefficients of the polynomial used in the tgamma_central() subroutine
- * for computing gamma on the interval [min_x,2] */
-")
-N, D, E, X = Remez.ratfn_minimax(
- x -> x < BigFloat(0x1p-64) ? taylor_bodge(x) :
- (SpecialFunctions.gamma(min_x + x) - min_y) / (x*x),
- (0, 2 - min_x),
- 28, 0,
- (x, y) -> x^2,
-)
-dumparray(N, "coeffs_central_pos")
-
-print("
-/* 128-bit float value of pi, used by the sin_pi_x_over_pi subroutine
- */
-")
-println("static const long double pi = ", quadhex(BigFloat(pi)), ";")
diff --git a/math/tools/v_exp.sollya b/math/tools/v_exp.sollya
index 5fa7de7435a9863d3b9511cdff140977165a8333..c0abb63fb642a58ca023eb242a010b1a418e15fe 100644
--- a/math/tools/v_exp.sollya
+++ b/math/tools/v_exp.sollya
@@ -1,7 +1,7 @@
// polynomial for approximating e^x
//
// Copyright (c) 2019, Arm Limited.
-// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+// SPDX-License-Identifier: MIT
deg = 4; // poly degree
N = 128; // table entries
diff --git a/math/tools/v_log.sollya b/math/tools/v_log.sollya
index d982524eb920f0e581fd2b9221d493364a26bd37..cc3d2c4ae72a1b860313625a9771e2fc1e19e93b 100644
--- a/math/tools/v_log.sollya
+++ b/math/tools/v_log.sollya
@@ -1,7 +1,7 @@
// polynomial used for __v_log(x)
//
// Copyright (c) 2019, Arm Limited.
-// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+// SPDX-License-Identifier: MIT
deg = 6; // poly degree
a = -0x1.fc1p-9;
diff --git a/math/tools/v_sin.sollya b/math/tools/v_sin.sollya
index 63b9d65a1ac35a14b98a8dcab6a00637d35db4fb..65cc9957c624a6fd09a32762d7f4d7296e0b8319 100644
--- a/math/tools/v_sin.sollya
+++ b/math/tools/v_sin.sollya
@@ -1,7 +1,7 @@
// polynomial for approximating sin(x)
//
// Copyright (c) 2019, Arm Limited.
-// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+// SPDX-License-Identifier: MIT
deg = 15; // polynomial degree
a = -pi/2; // interval
diff --git a/math/v_cos.c b/math/v_cos.c
new file mode 100644
index 0000000000000000000000000000000000000000..20ba6bd0d0d9a4a5e98f56f0e374f22a88df2f7a
--- /dev/null
+++ b/math/v_cos.c
@@ -0,0 +1,87 @@
+/*
+ * Double-precision vector cos function.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+#if V_SUPPORTED
+
+static const double Poly[] = {
+/* worst-case error is 3.5 ulp.
+ abs error: 0x1.be222a58p-53 in [-pi/2, pi/2]. */
+-0x1.9f4a9c8b21dc9p-41,
+ 0x1.60e88a10163f2p-33,
+-0x1.ae6361b7254e7p-26,
+ 0x1.71de382e8d62bp-19,
+-0x1.a01a019aeb4ffp-13,
+ 0x1.111111110b25ep-7,
+-0x1.55555555554c3p-3,
+};
+
+#define C7 v_f64 (Poly[0])
+#define C6 v_f64 (Poly[1])
+#define C5 v_f64 (Poly[2])
+#define C4 v_f64 (Poly[3])
+#define C3 v_f64 (Poly[4])
+#define C2 v_f64 (Poly[5])
+#define C1 v_f64 (Poly[6])
+
+#define InvPi v_f64 (0x1.45f306dc9c883p-2)
+#define HalfPi v_f64 (0x1.921fb54442d18p+0)
+#define Pi1 v_f64 (0x1.921fb54442d18p+1)
+#define Pi2 v_f64 (0x1.1a62633145c06p-53)
+#define Pi3 v_f64 (0x1.c1cd129024e09p-106)
+#define Shift v_f64 (0x1.8p52)
+#define RangeVal v_f64 (0x1p23)
+#define AbsMask v_u64 (0x7fffffffffffffff)
+
+VPCS_ATTR
+__attribute__ ((noinline)) static v_f64_t
+specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp)
+{
+ return v_call_f64 (cos, x, y, cmp);
+}
+
+VPCS_ATTR
+v_f64_t
+V_NAME(cos) (v_f64_t x)
+{
+ v_f64_t n, r, r2, y;
+ v_u64_t odd, cmp;
+
+ r = v_as_f64_u64 (v_as_u64_f64 (x) & AbsMask);
+ cmp = v_cond_u64 (v_as_u64_f64 (r) >= v_as_u64_f64 (RangeVal));
+
+ /* n = rint((|x|+pi/2)/pi) - 0.5. */
+ n = v_fma_f64 (InvPi, r + HalfPi, Shift);
+ odd = v_as_u64_f64 (n) << 63;
+ n -= Shift;
+ n -= v_f64 (0.5);
+
+ /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
+ r = v_fma_f64 (-Pi1, n, r);
+ r = v_fma_f64 (-Pi2, n, r);
+ r = v_fma_f64 (-Pi3, n, r);
+
+ /* sin(r) poly approx. */
+ r2 = r * r;
+ y = v_fma_f64 (C7, r2, C6);
+ y = v_fma_f64 (y, r2, C5);
+ y = v_fma_f64 (y, r2, C4);
+ y = v_fma_f64 (y, r2, C3);
+ y = v_fma_f64 (y, r2, C2);
+ y = v_fma_f64 (y, r2, C1);
+ y = v_fma_f64 (y * r2, r, r);
+
+ /* sign. */
+ y = v_as_f64_u64 (v_as_u64_f64 (y) ^ odd);
+
+ if (unlikely (v_any_u64 (cmp)))
+ return specialcase (x, y, cmp);
+ return y;
+}
+VPCS_ALIAS
+#endif
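
The quadrant logic above leans on the 0x1.8p52 shift trick: adding Shift
to a value of magnitude below 2^51 forces round-to-nearest to produce an
integer whose low mantissa bits can be read out directly, so the nearest
integer and its parity (the sign quadrant) fall out of one FMA. A scalar
sketch under those assumptions (the helper name is made up):

    #include <stdint.h>
    #include <string.h>

    /* Round x to the nearest integer and expose its low bit as a
       floating-point sign mask, as v_cos does with n and odd.  */
    static double
    round_and_parity (double x, uint64_t *odd)
    {
      const double Shift = 0x1.8p52;
      double z = x + Shift;       /* low bits of z now hold rint(x) */
      uint64_t u;
      memcpy (&u, &z, sizeof u);  /* bit pattern of z */
      *odd = u << 63;             /* lowest integer bit -> sign bit */
      return z - Shift;           /* rint(x), exactly */
    }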
diff --git a/math/v_cosf.c b/math/v_cosf.c
new file mode 100644
index 0000000000000000000000000000000000000000..150294b8845e735c06423bdbc9e78fe6bb567b06
--- /dev/null
+++ b/math/v_cosf.c
@@ -0,0 +1,76 @@
+/*
+ * Single-precision vector cos function.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+#if V_SUPPORTED
+
+static const float Poly[] = {
+ /* 1.886 ulp error */
+ 0x1.5b2e76p-19f,
+ -0x1.9f42eap-13f,
+ 0x1.110df4p-7f,
+ -0x1.555548p-3f,
+};
+#define Pi1 v_f32 (0x1.921fb6p+1f)
+#define Pi2 v_f32 (-0x1.777a5cp-24f)
+#define Pi3 v_f32 (-0x1.ee59dap-49f)
+#define A3 v_f32 (Poly[3])
+#define A5 v_f32 (Poly[2])
+#define A7 v_f32 (Poly[1])
+#define A9 v_f32 (Poly[0])
+#define RangeVal v_f32 (0x1p20f)
+#define InvPi v_f32 (0x1.45f306p-2f)
+#define Shift v_f32 (0x1.8p+23f)
+#define AbsMask v_u32 (0x7fffffff)
+#define HalfPi v_f32 (0x1.921fb6p0f)
+
+VPCS_ATTR
+static v_f32_t
+specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp)
+{
+ /* Fall back to scalar code. */
+ return v_call_f32 (cosf, x, y, cmp);
+}
+
+VPCS_ATTR
+v_f32_t
+V_NAME(cosf) (v_f32_t x)
+{
+ v_f32_t n, r, r2, y;
+ v_u32_t odd, cmp;
+
+ r = v_as_f32_u32 (v_as_u32_f32 (x) & AbsMask);
+ cmp = v_cond_u32 (v_as_u32_f32 (r) >= v_as_u32_f32 (RangeVal));
+
+ /* n = rint((|x|+pi/2)/pi) - 0.5 */
+ n = v_fma_f32 (InvPi, r + HalfPi, Shift);
+ odd = v_as_u32_f32 (n) << 31;
+ n -= Shift;
+ n -= v_f32 (0.5f);
+
+ /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2) */
+ r = v_fma_f32 (-Pi1, n, r);
+ r = v_fma_f32 (-Pi2, n, r);
+ r = v_fma_f32 (-Pi3, n, r);
+
+ /* y = sin(r) */
+ r2 = r * r;
+ y = v_fma_f32 (A9, r2, A7);
+ y = v_fma_f32 (y, r2, A5);
+ y = v_fma_f32 (y, r2, A3);
+ y = v_fma_f32 (y * r2, r, r);
+
+ /* sign fix */
+ y = v_as_f32_u32 (v_as_u32_f32 (y) ^ odd);
+
+ if (unlikely (v_any_u32 (cmp)))
+ return specialcase (x, y, cmp);
+ return y;
+}
+VPCS_ALIAS
+#endif
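
Both cos variants subtract n*pi with pi split across three
floating-point words, so the reduced argument keeps nearly full
precision for every quadrant count below the RangeVal cutoff. A minimal
scalar sketch of the same three FMA steps, with the constants copied
from v_cosf.c above (the helper name is illustrative):

    #include <math.h>

    static float
    reduce_pi (float ax, float n)   /* ax = |x|, n = quadrant count */
    {
      const float Pi1 = 0x1.921fb6p+1f;
      const float Pi2 = -0x1.777a5cp-24f;
      const float Pi3 = -0x1.ee59dap-49f;
      float r = ax;                 /* r ~ ax - n*pi at the end */
      r = fmaf (-Pi1, n, r);
      r = fmaf (-Pi2, n, r);
      r = fmaf (-Pi3, n, r);
      return r;
    }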
diff --git a/math/v_exp.c b/math/v_exp.c
new file mode 100644
index 0000000000000000000000000000000000000000..e459d53fddd2509f6f8ddb69328615e8cc80b2e8
--- /dev/null
+++ b/math/v_exp.c
@@ -0,0 +1,94 @@
+/*
+ * Double-precision vector e^x function.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+#if V_SUPPORTED
+#include "v_exp.h"
+
+#if V_EXP_TABLE_BITS == 7
+/* maxerr: 1.88 +0.5 ulp
+ rel error: 1.4337*2^-53
+ abs error: 1.4299*2^-53 in [ -ln2/256, ln2/256 ]. */
+#define C1 v_f64 (0x1.ffffffffffd43p-2)
+#define C2 v_f64 (0x1.55555c75adbb2p-3)
+#define C3 v_f64 (0x1.55555da646206p-5)
+#define InvLn2 v_f64 (0x1.71547652b82fep7) /* N/ln2. */
+#define Ln2hi v_f64 (0x1.62e42fefa39efp-8) /* ln2/N. */
+#define Ln2lo v_f64 (0x1.abc9e3b39803f3p-63)
+#elif V_EXP_TABLE_BITS == 8
+/* maxerr: 0.54 +0.5 ulp
+ rel error: 1.4318*2^-58
+ abs error: 1.4299*2^-58 in [ -ln2/512, ln2/512 ]. */
+#define C1 v_f64 (0x1.fffffffffffd4p-2)
+#define C2 v_f64 (0x1.5555571d6b68cp-3)
+#define C3 v_f64 (0x1.5555576a59599p-5)
+#define InvLn2 v_f64 (0x1.71547652b82fep8)
+#define Ln2hi v_f64 (0x1.62e42fefa39efp-9)
+#define Ln2lo v_f64 (0x1.abc9e3b39803f3p-64)
+#endif
+
+#define N (1 << V_EXP_TABLE_BITS)
+#define Tab __v_exp_data
+#define IndexMask v_u64 (N - 1)
+#define Shift v_f64 (0x1.8p+52)
+#define Thres v_f64 (704.0)
+
+VPCS_ATTR
+static v_f64_t
+specialcase (v_f64_t s, v_f64_t y, v_f64_t n)
+{
+ v_f64_t absn = v_abs_f64 (n);
+
+ /* 2^(n/N) may overflow, break it up into s1*s2. */
+ v_u64_t b = v_cond_u64 (n <= v_f64 (0.0)) & v_u64 (0x6000000000000000);
+ v_f64_t s1 = v_as_f64_u64 (v_u64 (0x7000000000000000) - b);
+ v_f64_t s2 = v_as_f64_u64 (v_as_u64_f64 (s) - v_u64 (0x3010000000000000) + b);
+ v_u64_t cmp = v_cond_u64 (absn > v_f64 (1280.0 * N));
+ v_f64_t r1 = s1 * s1;
+ v_f64_t r0 = v_fma_f64 (y, s2, s2) * s1;
+ return v_as_f64_u64 ((cmp & v_as_u64_f64 (r1)) | (~cmp & v_as_u64_f64 (r0)));
+}
+
+VPCS_ATTR
+v_f64_t
+V_NAME(exp) (v_f64_t x)
+{
+ v_f64_t n, r, r2, s, y, z;
+ v_u64_t cmp, u, e, i;
+
+ cmp = v_cond_u64 (v_abs_f64 (x) > Thres);
+
+ /* n = round(x/(ln2/N)). */
+ z = v_fma_f64 (x, InvLn2, Shift);
+ u = v_as_u64_f64 (z);
+ n = z - Shift;
+
+ /* r = x - n*ln2/N. */
+ r = x;
+ r = v_fma_f64 (-Ln2hi, n, r);
+ r = v_fma_f64 (-Ln2lo, n, r);
+
+ e = u << (52 - V_EXP_TABLE_BITS);
+ i = u & IndexMask;
+
+ /* y = exp(r) - 1 ~= r + C1 r^2 + C2 r^3 + C3 r^4. */
+ r2 = r * r;
+ y = v_fma_f64 (C2, r, C1);
+ y = v_fma_f64 (C3, r2, y);
+ y = v_fma_f64 (y, r2, r);
+
+ /* s = 2^(n/N). */
+ u = v_lookup_u64 (Tab, i);
+ s = v_as_f64_u64 (u + e);
+
+ if (unlikely (v_any_u64 (cmp)))
+ return specialcase (s, y, n);
+ return v_fma_f64 (y, s, s);
+}
+VPCS_ALIAS
+#endif
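
The reconstruction of s = 2^(n/N) above works because the rounded value
u still carries n in its low bits: u & IndexMask selects the fractional
index, and u << (52 - V_EXP_TABLE_BITS) places the integer part in the
double's exponent field (the table entries appear to be biased by
-(j << 45) so that the index bits folded into e cancel out). A
simplified scalar sketch with an unbiased table, which makes the
exponent addition explicit (the names and the unbiased table are
assumptions for illustration):

    #include <stdint.h>
    #include <string.h>

    #define TBITS 7
    #define TSIZE (1 << TBITS)

    /* tab_plain[j] = asuint64 (2^(j/TSIZE)); m = round (x*TSIZE/ln2).
       Assumes an arithmetic right shift and no over/underflow.  */
    static double
    pow2_by_table (const uint64_t *tab_plain, int64_t m)
    {
      uint64_t i = (uint64_t) m % TSIZE;   /* fractional index */
      uint64_t bits = tab_plain[i] + ((uint64_t) (m >> TBITS) << 52);
      double s;                            /* exponent += m/TSIZE */
      memcpy (&s, &bits, sizeof s);
      return s;                            /* s = 2^(m/TSIZE) */
    }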
diff --git a/math/v_exp.h b/math/v_exp.h
new file mode 100644
index 0000000000000000000000000000000000000000..305da19c0a53924f18007df499b0af2747b1cfa2
--- /dev/null
+++ b/math/v_exp.h
@@ -0,0 +1,14 @@
+/*
+ * Declarations for double-precision e^x vector function.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "v_math.h"
+#if WANT_VMATH
+
+#define V_EXP_TABLE_BITS 7
+
+extern const u64_t __v_exp_data[1 << V_EXP_TABLE_BITS] HIDDEN;
+#endif
diff --git a/math/v_exp2f.c b/math/v_exp2f.c
new file mode 100644
index 0000000000000000000000000000000000000000..e3ea5af3414dc848da0a12659444ff2626f6cfcc
--- /dev/null
+++ b/math/v_exp2f.c
@@ -0,0 +1,78 @@
+/*
+ * Single-precision vector 2^x function.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+#if V_SUPPORTED
+
+static const float Poly[] = {
+ /* maxerr: 1.962 ulp. */
+ 0x1.59977ap-10f,
+ 0x1.3ce9e4p-7f,
+ 0x1.c6bd32p-5f,
+ 0x1.ebf9bcp-3f,
+ 0x1.62e422p-1f,
+};
+#define C0 v_f32 (Poly[0])
+#define C1 v_f32 (Poly[1])
+#define C2 v_f32 (Poly[2])
+#define C3 v_f32 (Poly[3])
+#define C4 v_f32 (Poly[4])
+
+#define Shift v_f32 (0x1.8p23f)
+
+VPCS_ATTR
+static v_f32_t
+specialcase (v_f32_t poly, v_f32_t n, v_u32_t e, v_f32_t absn, v_u32_t cmp1, v_f32_t scale)
+{
+ /* 2^n may overflow, break it up into s1*s2. */
+ v_u32_t b = v_cond_u32 (n <= v_f32 (0.0f)) & v_u32 (0x82000000);
+ v_f32_t s1 = v_as_f32_u32 (v_u32 (0x7f000000) + b);
+ v_f32_t s2 = v_as_f32_u32 (e - b);
+ v_u32_t cmp2 = v_cond_u32 (absn > v_f32 (192.0f));
+ v_u32_t r2 = v_as_u32_f32 (s1 * s1);
+ v_u32_t r1 = v_as_u32_f32 (v_fma_f32 (poly, s2, s2) * s1);
+ /* Similar to r1 but avoids double rounding in the subnormal range. */
+ v_u32_t r0 = v_as_u32_f32 (v_fma_f32 (poly, scale, scale));
+ return v_as_f32_u32 ((cmp2 & r2) | (~cmp2 & cmp1 & r1) | (~cmp1 & r0));
+}
+
+VPCS_ATTR
+v_f32_t
+V_NAME(exp2f) (v_f32_t x)
+{
+ v_f32_t n, r, r2, scale, p, q, poly, absn;
+ v_u32_t cmp, e;
+
+ /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
+ x = n + r, with r in [-1/2, 1/2]. */
+#if 0
+ v_f32_t z;
+ z = x + Shift;
+ n = z - Shift;
+ r = x - n;
+ e = v_as_u32_f32 (z) << 23;
+#else
+ n = v_round_f32 (x);
+ r = x - n;
+ e = v_as_u32_s32 (v_round_s32 (x)) << 23;
+#endif
+ scale = v_as_f32_u32 (e + v_u32 (0x3f800000));
+ absn = v_abs_f32 (n);
+ cmp = v_cond_u32 (absn > v_f32 (126.0f));
+ r2 = r * r;
+ p = v_fma_f32 (C0, r, C1);
+ q = v_fma_f32 (C2, r, C3);
+ q = v_fma_f32 (p, r2, q);
+ p = C4 * r;
+ poly = v_fma_f32 (q, r2, p);
+ if (unlikely (v_any_u32 (cmp)))
+ return specialcase (poly, n, e, absn, cmp, scale);
+ return v_fma_f32 (poly, scale, scale);
+}
+VPCS_ALIAS
+#endif
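
An aside on the #if 0 block above: both branches compute the same n, r
and e, one with the 0x1.8p23 shift trick and one with the rounding
intrinsics. Which form is faster is target-dependent, which is
presumably why the alternative is kept behind the preprocessor switch.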
diff --git a/math/aarch64/v_exp2f_1u.c b/math/v_exp2f_1u.c
similarity index 43%
rename from math/aarch64/v_exp2f_1u.c
rename to math/v_exp2f_1u.c
index ba6b02fbb4bcbd9c215d8326dd74f2e4bbadc18b..1caa14d9bffffbb2d0cc47ac6470b12701732f67 100644
--- a/math/aarch64/v_exp2f_1u.c
+++ b/math/v_exp2f_1u.c
@@ -1,12 +1,13 @@
/*
* Single-precision vector 2^x function.
*
- * Copyright (c) 2019-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
*/
#include "mathlib.h"
#include "v_math.h"
+#if V_SUPPORTED
static const float Poly[] = {
/* maxerr: 0.878 ulp. */
@@ -24,49 +25,51 @@ static const float Poly[] = {
#define Ln2hi v_f32 (0x1.62e4p-1f)
#define Ln2lo v_f32 (0x1.7f7d1cp-20f)
-static float32x4_t VPCS_ATTR NOINLINE
-specialcase (float32x4_t poly, float32x4_t n, uint32x4_t e, float32x4_t absn)
+VPCS_ATTR
+static v_f32_t
+specialcase (v_f32_t poly, v_f32_t n, v_u32_t e, v_f32_t absn)
{
/* 2^n may overflow, break it up into s1*s2. */
- uint32x4_t b = (n <= v_f32 (0.0f)) & v_u32 (0x83000000);
- float32x4_t s1 = vreinterpretq_f32_u32 (v_u32 (0x7f000000) + b);
- float32x4_t s2 = vreinterpretq_f32_u32 (e - b);
- uint32x4_t cmp = absn > v_f32 (192.0f);
- float32x4_t r1 = s1 * s1;
- float32x4_t r0 = poly * s1 * s2;
- return vreinterpretq_f32_u32 ((cmp & vreinterpretq_u32_f32 (r1))
- | (~cmp & vreinterpretq_u32_f32 (r0)));
+ v_u32_t b = v_cond_u32 (n <= v_f32 (0.0f)) & v_u32 (0x83000000);
+ v_f32_t s1 = v_as_f32_u32 (v_u32 (0x7f000000) + b);
+ v_f32_t s2 = v_as_f32_u32 (e - b);
+ v_u32_t cmp = v_cond_u32 (absn > v_f32 (192.0f));
+ v_f32_t r1 = s1 * s1;
+ v_f32_t r0 = poly * s1 * s2;
+ return v_as_f32_u32 ((cmp & v_as_u32_f32 (r1)) | (~cmp & v_as_u32_f32 (r0)));
}
-float32x4_t VPCS_ATTR
-_ZGVnN4v_exp2f_1u (float32x4_t x)
+VPCS_ATTR
+v_f32_t
+V_NAME(exp2f_1u) (v_f32_t x)
{
- float32x4_t n, r, scale, poly, absn;
- uint32x4_t cmp, e;
+ v_f32_t n, r, scale, poly, absn;
+ v_u32_t cmp, e;
/* exp2(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)]
x = n + r, with r in [-1/2, 1/2]. */
#if 0
- float32x4_t z;
+ v_f32_t z;
z = x + Shift;
n = z - Shift;
r = x - n;
- e = vreinterpretq_u32_f32 (z) << 23;
+ e = v_as_u32_f32 (z) << 23;
#else
- n = vrndaq_f32 (x);
+ n = v_round_f32 (x);
r = x - n;
- e = vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)) << 23;
+ e = v_as_u32_s32 (v_round_s32 (x)) << 23;
#endif
- scale = vreinterpretq_f32_u32 (e + v_u32 (0x3f800000));
- absn = vabsq_f32 (n);
- cmp = absn > v_f32 (126.0f);
- poly = vfmaq_f32 (C1, C0, r);
- poly = vfmaq_f32 (C2, poly, r);
- poly = vfmaq_f32 (C3, poly, r);
- poly = vfmaq_f32 (C4, poly, r);
- poly = vfmaq_f32 (C5, poly, r);
- poly = vfmaq_f32 (v_f32 (1.0f), poly, r);
+ scale = v_as_f32_u32 (e + v_u32 (0x3f800000));
+ absn = v_abs_f32 (n);
+ cmp = v_cond_u32 (absn > v_f32 (126.0f));
+ poly = v_fma_f32 (C0, r, C1);
+ poly = v_fma_f32 (poly, r, C2);
+ poly = v_fma_f32 (poly, r, C3);
+ poly = v_fma_f32 (poly, r, C4);
+ poly = v_fma_f32 (poly, r, C5);
+ poly = v_fma_f32 (poly, r, v_f32 (1.0f));
if (unlikely (v_any_u32 (cmp)))
return specialcase (poly, n, e, absn);
return scale * poly;
}
+#endif
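
One detail of the rewrite above deserves a note: the NEON intrinsic
takes the addend first, while the v_math.h wrapper takes it last, so the
old and new spellings compute the same thing in different argument
orders (the wrapper definition appears in v_math.h later in this patch):

    /* v_fma_f32 (x, y, z) is defined as vfmaq_f32 (z, x, y).  */
    poly = vfmaq_f32 (C1, C0, r);    /* old form: C1 + C0*r */
    poly = v_fma_f32 (C0, r, C1);    /* new form: C0*r + C1 */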
diff --git a/math/v_exp_data.c b/math/v_exp_data.c
new file mode 100644
index 0000000000000000000000000000000000000000..365355497e95026692d683d656b8b286e3594446
--- /dev/null
+++ b/math/v_exp_data.c
@@ -0,0 +1,403 @@
+/*
+ * Lookup table for double-precision e^x vector function.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "v_exp.h"
+#if WANT_VMATH
+
+#define N (1 << V_EXP_TABLE_BITS)
+
+/* 2^(j/N), j=0..N-1. */
+const u64_t __v_exp_data[] = {
+#if N == 128
+0x3ff0000000000000,
+0x3feff63da9fb3335,
+0x3fefec9a3e778061,
+0x3fefe315e86e7f85,
+0x3fefd9b0d3158574,
+0x3fefd06b29ddf6de,
+0x3fefc74518759bc8,
+0x3fefbe3ecac6f383,
+0x3fefb5586cf9890f,
+0x3fefac922b7247f7,
+0x3fefa3ec32d3d1a2,
+0x3fef9b66affed31b,
+0x3fef9301d0125b51,
+0x3fef8abdc06c31cc,
+0x3fef829aaea92de0,
+0x3fef7a98c8a58e51,
+0x3fef72b83c7d517b,
+0x3fef6af9388c8dea,
+0x3fef635beb6fcb75,
+0x3fef5be084045cd4,
+0x3fef54873168b9aa,
+0x3fef4d5022fcd91d,
+0x3fef463b88628cd6,
+0x3fef3f49917ddc96,
+0x3fef387a6e756238,
+0x3fef31ce4fb2a63f,
+0x3fef2b4565e27cdd,
+0x3fef24dfe1f56381,
+0x3fef1e9df51fdee1,
+0x3fef187fd0dad990,
+0x3fef1285a6e4030b,
+0x3fef0cafa93e2f56,
+0x3fef06fe0a31b715,
+0x3fef0170fc4cd831,
+0x3feefc08b26416ff,
+0x3feef6c55f929ff1,
+0x3feef1a7373aa9cb,
+0x3feeecae6d05d866,
+0x3feee7db34e59ff7,
+0x3feee32dc313a8e5,
+0x3feedea64c123422,
+0x3feeda4504ac801c,
+0x3feed60a21f72e2a,
+0x3feed1f5d950a897,
+0x3feece086061892d,
+0x3feeca41ed1d0057,
+0x3feec6a2b5c13cd0,
+0x3feec32af0d7d3de,
+0x3feebfdad5362a27,
+0x3feebcb299fddd0d,
+0x3feeb9b2769d2ca7,
+0x3feeb6daa2cf6642,
+0x3feeb42b569d4f82,
+0x3feeb1a4ca5d920f,
+0x3feeaf4736b527da,
+0x3feead12d497c7fd,
+0x3feeab07dd485429,
+0x3feea9268a5946b7,
+0x3feea76f15ad2148,
+0x3feea5e1b976dc09,
+0x3feea47eb03a5585,
+0x3feea34634ccc320,
+0x3feea23882552225,
+0x3feea155d44ca973,
+0x3feea09e667f3bcd,
+0x3feea012750bdabf,
+0x3fee9fb23c651a2f,
+0x3fee9f7df9519484,
+0x3fee9f75e8ec5f74,
+0x3fee9f9a48a58174,
+0x3fee9feb564267c9,
+0x3feea0694fde5d3f,
+0x3feea11473eb0187,
+0x3feea1ed0130c132,
+0x3feea2f336cf4e62,
+0x3feea427543e1a12,
+0x3feea589994cce13,
+0x3feea71a4623c7ad,
+0x3feea8d99b4492ed,
+0x3feeaac7d98a6699,
+0x3feeace5422aa0db,
+0x3feeaf3216b5448c,
+0x3feeb1ae99157736,
+0x3feeb45b0b91ffc6,
+0x3feeb737b0cdc5e5,
+0x3feeba44cbc8520f,
+0x3feebd829fde4e50,
+0x3feec0f170ca07ba,
+0x3feec49182a3f090,
+0x3feec86319e32323,
+0x3feecc667b5de565,
+0x3feed09bec4a2d33,
+0x3feed503b23e255d,
+0x3feed99e1330b358,
+0x3feede6b5579fdbf,
+0x3feee36bbfd3f37a,
+0x3feee89f995ad3ad,
+0x3feeee07298db666,
+0x3feef3a2b84f15fb,
+0x3feef9728de5593a,
+0x3feeff76f2fb5e47,
+0x3fef05b030a1064a,
+0x3fef0c1e904bc1d2,
+0x3fef12c25bd71e09,
+0x3fef199bdd85529c,
+0x3fef20ab5fffd07a,
+0x3fef27f12e57d14b,
+0x3fef2f6d9406e7b5,
+0x3fef3720dcef9069,
+0x3fef3f0b555dc3fa,
+0x3fef472d4a07897c,
+0x3fef4f87080d89f2,
+0x3fef5818dcfba487,
+0x3fef60e316c98398,
+0x3fef69e603db3285,
+0x3fef7321f301b460,
+0x3fef7c97337b9b5f,
+0x3fef864614f5a129,
+0x3fef902ee78b3ff6,
+0x3fef9a51fbc74c83,
+0x3fefa4afa2a490da,
+0x3fefaf482d8e67f1,
+0x3fefba1bee615a27,
+0x3fefc52b376bba97,
+0x3fefd0765b6e4540,
+0x3fefdbfdad9cbe14,
+0x3fefe7c1819e90d8,
+0x3feff3c22b8f71f1,
+#elif N == 256
+0x3ff0000000000000,
+0x3feffb1afa5abcbf,
+0x3feff63da9fb3335,
+0x3feff168143b0281,
+0x3fefec9a3e778061,
+0x3fefe7d42e11bbcc,
+0x3fefe315e86e7f85,
+0x3fefde5f72f654b1,
+0x3fefd9b0d3158574,
+0x3fefd50a0e3c1f89,
+0x3fefd06b29ddf6de,
+0x3fefcbd42b72a836,
+0x3fefc74518759bc8,
+0x3fefc2bdf66607e0,
+0x3fefbe3ecac6f383,
+0x3fefb9c79b1f3919,
+0x3fefb5586cf9890f,
+0x3fefb0f145e46c85,
+0x3fefac922b7247f7,
+0x3fefa83b23395dec,
+0x3fefa3ec32d3d1a2,
+0x3fef9fa55fdfa9c5,
+0x3fef9b66affed31b,
+0x3fef973028d7233e,
+0x3fef9301d0125b51,
+0x3fef8edbab5e2ab6,
+0x3fef8abdc06c31cc,
+0x3fef86a814f204ab,
+0x3fef829aaea92de0,
+0x3fef7e95934f312e,
+0x3fef7a98c8a58e51,
+0x3fef76a45471c3c2,
+0x3fef72b83c7d517b,
+0x3fef6ed48695bbc0,
+0x3fef6af9388c8dea,
+0x3fef672658375d2f,
+0x3fef635beb6fcb75,
+0x3fef5f99f8138a1c,
+0x3fef5be084045cd4,
+0x3fef582f95281c6b,
+0x3fef54873168b9aa,
+0x3fef50e75eb44027,
+0x3fef4d5022fcd91d,
+0x3fef49c18438ce4d,
+0x3fef463b88628cd6,
+0x3fef42be3578a819,
+0x3fef3f49917ddc96,
+0x3fef3bdda27912d1,
+0x3fef387a6e756238,
+0x3fef351ffb82140a,
+0x3fef31ce4fb2a63f,
+0x3fef2e85711ece75,
+0x3fef2b4565e27cdd,
+0x3fef280e341ddf29,
+0x3fef24dfe1f56381,
+0x3fef21ba7591bb70,
+0x3fef1e9df51fdee1,
+0x3fef1b8a66d10f13,
+0x3fef187fd0dad990,
+0x3fef157e39771b2f,
+0x3fef1285a6e4030b,
+0x3fef0f961f641589,
+0x3fef0cafa93e2f56,
+0x3fef09d24abd886b,
+0x3fef06fe0a31b715,
+0x3fef0432edeeb2fd,
+0x3fef0170fc4cd831,
+0x3feefeb83ba8ea32,
+0x3feefc08b26416ff,
+0x3feef96266e3fa2d,
+0x3feef6c55f929ff1,
+0x3feef431a2de883b,
+0x3feef1a7373aa9cb,
+0x3feeef26231e754a,
+0x3feeecae6d05d866,
+0x3feeea401b7140ef,
+0x3feee7db34e59ff7,
+0x3feee57fbfec6cf4,
+0x3feee32dc313a8e5,
+0x3feee0e544ede173,
+0x3feedea64c123422,
+0x3feedc70df1c5175,
+0x3feeda4504ac801c,
+0x3feed822c367a024,
+0x3feed60a21f72e2a,
+0x3feed3fb2709468a,
+0x3feed1f5d950a897,
+0x3feecffa3f84b9d4,
+0x3feece086061892d,
+0x3feecc2042a7d232,
+0x3feeca41ed1d0057,
+0x3feec86d668b3237,
+0x3feec6a2b5c13cd0,
+0x3feec4e1e192aed2,
+0x3feec32af0d7d3de,
+0x3feec17dea6db7d7,
+0x3feebfdad5362a27,
+0x3feebe41b817c114,
+0x3feebcb299fddd0d,
+0x3feebb2d81d8abff,
+0x3feeb9b2769d2ca7,
+0x3feeb8417f4531ee,
+0x3feeb6daa2cf6642,
+0x3feeb57de83f4eef,
+0x3feeb42b569d4f82,
+0x3feeb2e2f4f6ad27,
+0x3feeb1a4ca5d920f,
+0x3feeb070dde910d2,
+0x3feeaf4736b527da,
+0x3feeae27dbe2c4cf,
+0x3feead12d497c7fd,
+0x3feeac0827ff07cc,
+0x3feeab07dd485429,
+0x3feeaa11fba87a03,
+0x3feea9268a5946b7,
+0x3feea84590998b93,
+0x3feea76f15ad2148,
+0x3feea6a320dceb71,
+0x3feea5e1b976dc09,
+0x3feea52ae6cdf6f4,
+0x3feea47eb03a5585,
+0x3feea3dd1d1929fd,
+0x3feea34634ccc320,
+0x3feea2b9febc8fb7,
+0x3feea23882552225,
+0x3feea1c1c70833f6,
+0x3feea155d44ca973,
+0x3feea0f4b19e9538,
+0x3feea09e667f3bcd,
+0x3feea052fa75173e,
+0x3feea012750bdabf,
+0x3fee9fdcddd47645,
+0x3fee9fb23c651a2f,
+0x3fee9f9298593ae5,
+0x3fee9f7df9519484,
+0x3fee9f7466f42e87,
+0x3fee9f75e8ec5f74,
+0x3fee9f8286ead08a,
+0x3fee9f9a48a58174,
+0x3fee9fbd35d7cbfd,
+0x3fee9feb564267c9,
+0x3feea024b1ab6e09,
+0x3feea0694fde5d3f,
+0x3feea0b938ac1cf6,
+0x3feea11473eb0187,
+0x3feea17b0976cfdb,
+0x3feea1ed0130c132,
+0x3feea26a62ff86f0,
+0x3feea2f336cf4e62,
+0x3feea3878491c491,
+0x3feea427543e1a12,
+0x3feea4d2add106d9,
+0x3feea589994cce13,
+0x3feea64c1eb941f7,
+0x3feea71a4623c7ad,
+0x3feea7f4179f5b21,
+0x3feea8d99b4492ed,
+0x3feea9cad931a436,
+0x3feeaac7d98a6699,
+0x3feeabd0a478580f,
+0x3feeace5422aa0db,
+0x3feeae05bad61778,
+0x3feeaf3216b5448c,
+0x3feeb06a5e0866d9,
+0x3feeb1ae99157736,
+0x3feeb2fed0282c8a,
+0x3feeb45b0b91ffc6,
+0x3feeb5c353aa2fe2,
+0x3feeb737b0cdc5e5,
+0x3feeb8b82b5f98e5,
+0x3feeba44cbc8520f,
+0x3feebbdd9a7670b3,
+0x3feebd829fde4e50,
+0x3feebf33e47a22a2,
+0x3feec0f170ca07ba,
+0x3feec2bb4d53fe0d,
+0x3feec49182a3f090,
+0x3feec674194bb8d5,
+0x3feec86319e32323,
+0x3feeca5e8d07f29e,
+0x3feecc667b5de565,
+0x3feece7aed8eb8bb,
+0x3feed09bec4a2d33,
+0x3feed2c980460ad8,
+0x3feed503b23e255d,
+0x3feed74a8af46052,
+0x3feed99e1330b358,
+0x3feedbfe53c12e59,
+0x3feede6b5579fdbf,
+0x3feee0e521356eba,
+0x3feee36bbfd3f37a,
+0x3feee5ff3a3c2774,
+0x3feee89f995ad3ad,
+0x3feeeb4ce622f2ff,
+0x3feeee07298db666,
+0x3feef0ce6c9a8952,
+0x3feef3a2b84f15fb,
+0x3feef68415b749b1,
+0x3feef9728de5593a,
+0x3feefc6e29f1c52a,
+0x3feeff76f2fb5e47,
+0x3fef028cf22749e4,
+0x3fef05b030a1064a,
+0x3fef08e0b79a6f1f,
+0x3fef0c1e904bc1d2,
+0x3fef0f69c3f3a207,
+0x3fef12c25bd71e09,
+0x3fef16286141b33d,
+0x3fef199bdd85529c,
+0x3fef1d1cd9fa652c,
+0x3fef20ab5fffd07a,
+0x3fef244778fafb22,
+0x3fef27f12e57d14b,
+0x3fef2ba88988c933,
+0x3fef2f6d9406e7b5,
+0x3fef33405751c4db,
+0x3fef3720dcef9069,
+0x3fef3b0f2e6d1675,
+0x3fef3f0b555dc3fa,
+0x3fef43155b5bab74,
+0x3fef472d4a07897c,
+0x3fef4b532b08c968,
+0x3fef4f87080d89f2,
+0x3fef53c8eacaa1d6,
+0x3fef5818dcfba487,
+0x3fef5c76e862e6d3,
+0x3fef60e316c98398,
+0x3fef655d71ff6075,
+0x3fef69e603db3285,
+0x3fef6e7cd63a8315,
+0x3fef7321f301b460,
+0x3fef77d5641c0658,
+0x3fef7c97337b9b5f,
+0x3fef81676b197d17,
+0x3fef864614f5a129,
+0x3fef8b333b16ee12,
+0x3fef902ee78b3ff6,
+0x3fef953924676d76,
+0x3fef9a51fbc74c83,
+0x3fef9f7977cdb740,
+0x3fefa4afa2a490da,
+0x3fefa9f4867cca6e,
+0x3fefaf482d8e67f1,
+0x3fefb4aaa2188510,
+0x3fefba1bee615a27,
+0x3fefbf9c1cb6412a,
+0x3fefc52b376bba97,
+0x3fefcac948dd7274,
+0x3fefd0765b6e4540,
+0x3fefd632798844f8,
+0x3fefdbfdad9cbe14,
+0x3fefe1d802243c89,
+0x3fefe7c1819e90d8,
+0x3fefedba3692d514,
+0x3feff3c22b8f71f1,
+0x3feff9d96b2a23d9,
+#endif
+};
+#endif
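
Judging by the first entries (entry 0 is asuint64(1.0), yet entry 1 is
below 1.0 while 2^(1/128) is above it), the values are not raw bit
patterns of 2^(j/N): each entry seems to have j << (52 - V_EXP_TABLE_BITS)
pre-subtracted so that v_exp.c can add the shifted rounded value in one
step. A quick check sketch under that assumption (illustrative only;
link against this file built with N == 128):

    #include <math.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    extern const uint64_t __v_exp_data[];

    int
    main (void)
    {
      for (int j = 0; j < 128; j++)
        {
          double v = exp2 (j / 128.0);
          uint64_t u;
          memcpy (&u, &v, sizeof u);
          /* Allow a few ulps for the generator's error tweaking.  */
          int64_t d = (int64_t) (__v_exp_data[j]
                                 - (u - ((uint64_t) j << 45)));
          if (d < -4 || d > 4)
            printf ("entry %d differs by %lld\n", j, (long long) d);
        }
      return 0;
    }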
diff --git a/math/v_expf.c b/math/v_expf.c
new file mode 100644
index 0000000000000000000000000000000000000000..d403e00534f068d81edefca1f48b6800cf7ab363
--- /dev/null
+++ b/math/v_expf.c
@@ -0,0 +1,83 @@
+/*
+ * Single-precision vector e^x function.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+#if V_SUPPORTED
+
+static const float Poly[] = {
+ /* maxerr: 1.45358 +0.5 ulp. */
+ 0x1.0e4020p-7f,
+ 0x1.573e2ep-5f,
+ 0x1.555e66p-3f,
+ 0x1.fffdb6p-2f,
+ 0x1.ffffecp-1f,
+};
+#define C0 v_f32 (Poly[0])
+#define C1 v_f32 (Poly[1])
+#define C2 v_f32 (Poly[2])
+#define C3 v_f32 (Poly[3])
+#define C4 v_f32 (Poly[4])
+
+#define Shift v_f32 (0x1.8p23f)
+#define InvLn2 v_f32 (0x1.715476p+0f)
+#define Ln2hi v_f32 (0x1.62e4p-1f)
+#define Ln2lo v_f32 (0x1.7f7d1cp-20f)
+
+VPCS_ATTR
+static v_f32_t
+specialcase (v_f32_t poly, v_f32_t n, v_u32_t e, v_f32_t absn, v_u32_t cmp1, v_f32_t scale)
+{
+ /* 2^n may overflow, break it up into s1*s2. */
+ v_u32_t b = v_cond_u32 (n <= v_f32 (0.0f)) & v_u32 (0x82000000);
+ v_f32_t s1 = v_as_f32_u32 (v_u32 (0x7f000000) + b);
+ v_f32_t s2 = v_as_f32_u32 (e - b);
+ v_u32_t cmp2 = v_cond_u32 (absn > v_f32 (192.0f));
+ v_u32_t r2 = v_as_u32_f32 (s1 * s1);
+ v_u32_t r1 = v_as_u32_f32 (v_fma_f32 (poly, s2, s2) * s1);
+ /* Similar to r1 but avoids double rounding in the subnormal range. */
+ v_u32_t r0 = v_as_u32_f32 (v_fma_f32 (poly, scale, scale));
+ return v_as_f32_u32 ((cmp2 & r2) | (~cmp2 & cmp1 & r1) | (~cmp1 & r0));
+}
+
+VPCS_ATTR
+v_f32_t
+V_NAME(expf) (v_f32_t x)
+{
+ v_f32_t n, r, r2, scale, p, q, poly, absn, z;
+ v_u32_t cmp, e;
+
+ /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
+ x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
+#if 1
+ z = v_fma_f32 (x, InvLn2, Shift);
+ n = z - Shift;
+ r = v_fma_f32 (n, -Ln2hi, x);
+ r = v_fma_f32 (n, -Ln2lo, r);
+ e = v_as_u32_f32 (z) << 23;
+#else
+ z = x * InvLn2;
+ n = v_round_f32 (z);
+ r = v_fma_f32 (n, -Ln2hi, x);
+ r = v_fma_f32 (n, -Ln2lo, r);
+ e = v_as_u32_s32 (v_round_s32 (z)) << 23;
+#endif
+ scale = v_as_f32_u32 (e + v_u32 (0x3f800000));
+ absn = v_abs_f32 (n);
+ cmp = v_cond_u32 (absn > v_f32 (126.0f));
+ r2 = r * r;
+ p = v_fma_f32 (C0, r, C1);
+ q = v_fma_f32 (C2, r, C3);
+ q = v_fma_f32 (p, r2, q);
+ p = C4 * r;
+ poly = v_fma_f32 (q, r2, p);
+ if (unlikely (v_any_u32 (cmp)))
+ return specialcase (poly, n, e, absn, cmp, scale);
+ return v_fma_f32 (poly, scale, scale);
+}
+VPCS_ALIAS
+#endif
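
The specialcase above never forms the out-of-range scale 2^n directly:
it splits it into two representable factors and applies them in an
order that cannot overflow prematurely. A scalar sketch of the general
branch (r1) with round-number constants, using ldexpf for clarity
instead of the exponent-bit arithmetic (assumes n stays within the
range the split is designed to handle):

    #include <math.h>

    /* ~ 2^n * (1 + poly) for n too large for a single scale step.  */
    static float
    scaled_special (float poly, int n)
    {
      float s1 = n > 0 ? 0x1p+96f : 0x1p-96f;
      float s2 = ldexpf (1.0f, n > 0 ? n - 96 : n + 96);
      /* Scale by s2 first so the intermediate stays representable.  */
      return fmaf (poly, s2, s2) * s1;
    }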
diff --git a/math/aarch64/v_expf_1u.c b/math/v_expf_1u.c
similarity index 39%
rename from math/aarch64/v_expf_1u.c
rename to math/v_expf_1u.c
index 43d03fa34efab42e2ac666dd6c784c02b8fdf6ed..023bd248c9ac9c89e88a9979d0d1a24197550f79 100644
--- a/math/aarch64/v_expf_1u.c
+++ b/math/v_expf_1u.c
@@ -1,12 +1,13 @@
/*
* Single-precision vector e^x function.
*
- * Copyright (c) 2019-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
*/
#include "mathlib.h"
#include "v_math.h"
+#if V_SUPPORTED
static const float Poly[] = {
/* maxerr: 0.36565 +0.5 ulp. */
@@ -27,51 +28,53 @@ static const float Poly[] = {
#define Ln2hi v_f32 (0x1.62e4p-1f)
#define Ln2lo v_f32 (0x1.7f7d1cp-20f)
-static float32x4_t VPCS_ATTR NOINLINE
-specialcase (float32x4_t poly, float32x4_t n, uint32x4_t e, float32x4_t absn)
+VPCS_ATTR
+static v_f32_t
+specialcase (v_f32_t poly, v_f32_t n, v_u32_t e, v_f32_t absn)
{
/* 2^n may overflow, break it up into s1*s2. */
- uint32x4_t b = (n <= v_f32 (0.0f)) & v_u32 (0x83000000);
- float32x4_t s1 = vreinterpretq_f32_u32 (v_u32 (0x7f000000) + b);
- float32x4_t s2 = vreinterpretq_f32_u32 (e - b);
- uint32x4_t cmp = absn > v_f32 (192.0f);
- float32x4_t r1 = s1 * s1;
- float32x4_t r0 = poly * s1 * s2;
- return vreinterpretq_f32_u32 ((cmp & vreinterpretq_u32_f32 (r1))
- | (~cmp & vreinterpretq_u32_f32 (r0)));
+ v_u32_t b = v_cond_u32 (n <= v_f32 (0.0f)) & v_u32 (0x83000000);
+ v_f32_t s1 = v_as_f32_u32 (v_u32 (0x7f000000) + b);
+ v_f32_t s2 = v_as_f32_u32 (e - b);
+ v_u32_t cmp = v_cond_u32 (absn > v_f32 (192.0f));
+ v_f32_t r1 = s1 * s1;
+ v_f32_t r0 = poly * s1 * s2;
+ return v_as_f32_u32 ((cmp & v_as_u32_f32 (r1)) | (~cmp & v_as_u32_f32 (r0)));
}
-float32x4_t VPCS_ATTR
-_ZGVnN4v_expf_1u (float32x4_t x)
+VPCS_ATTR
+v_f32_t
+V_NAME(expf_1u) (v_f32_t x)
{
- float32x4_t n, r, scale, poly, absn, z;
- uint32x4_t cmp, e;
+ v_f32_t n, r, scale, poly, absn, z;
+ v_u32_t cmp, e;
/* exp(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)]
x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
#if 1
- z = vfmaq_f32 (Shift, x, InvLn2);
+ z = v_fma_f32 (x, InvLn2, Shift);
n = z - Shift;
- r = vfmaq_f32 (x, n, -Ln2hi);
- r = vfmaq_f32 (r, n, -Ln2lo);
- e = vreinterpretq_u32_f32 (z) << 23;
+ r = v_fma_f32 (n, -Ln2hi, x);
+ r = v_fma_f32 (n, -Ln2lo, r);
+ e = v_as_u32_f32 (z) << 23;
#else
z = x * InvLn2;
- n = vrndaq_f32 (z);
- r = vfmaq_f32 (x, n, -Ln2hi);
- r = vfmaq_f32 (r, n, -Ln2lo);
- e = vreinterpretq_u32_s32 (vcvtaq_s32_f32 (z)) << 23;
+ n = v_round_f32 (z);
+ r = v_fma_f32 (n, -Ln2hi, x);
+ r = v_fma_f32 (n, -Ln2lo, r);
+ e = v_as_u32_s32 (v_round_s32 (z)) << 23;
#endif
- scale = vreinterpretq_f32_u32 (e + v_u32 (0x3f800000));
- absn = vabsq_f32 (n);
- cmp = absn > v_f32 (126.0f);
- poly = vfmaq_f32 (C1, C0, r);
- poly = vfmaq_f32 (C2, poly, r);
- poly = vfmaq_f32 (C3, poly, r);
- poly = vfmaq_f32 (C4, poly, r);
- poly = vfmaq_f32 (v_f32 (1.0f), poly, r);
- poly = vfmaq_f32 (v_f32 (1.0f), poly, r);
+ scale = v_as_f32_u32 (e + v_u32 (0x3f800000));
+ absn = v_abs_f32 (n);
+ cmp = v_cond_u32 (absn > v_f32 (126.0f));
+ poly = v_fma_f32 (C0, r, C1);
+ poly = v_fma_f32 (poly, r, C2);
+ poly = v_fma_f32 (poly, r, C3);
+ poly = v_fma_f32 (poly, r, C4);
+ poly = v_fma_f32 (poly, r, v_f32 (1.0f));
+ poly = v_fma_f32 (poly, r, v_f32 (1.0f));
if (unlikely (v_any_u32 (cmp)))
return specialcase (poly, n, e, absn);
return scale * poly;
}
+#endif
diff --git a/math/v_log.c b/math/v_log.c
new file mode 100644
index 0000000000000000000000000000000000000000..d84c740d2b6b519a5572b41a2e4e91aba27b0477
--- /dev/null
+++ b/math/v_log.c
@@ -0,0 +1,104 @@
+/*
+ * Double-precision vector log(x) function.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+#include "v_log.h"
+#if V_SUPPORTED
+
+/* Worst-case error: 1.17 + 0.5 ulp. */
+
+static const f64_t Poly[] = {
+ /* rel error: 0x1.6272e588p-56 in [ -0x1.fc1p-9 0x1.009p-8 ]. */
+ -0x1.ffffffffffff7p-2,
+ 0x1.55555555170d4p-2,
+ -0x1.0000000399c27p-2,
+ 0x1.999b2e90e94cap-3,
+ -0x1.554e550bd501ep-3,
+};
+
+#define A0 v_f64 (Poly[0])
+#define A1 v_f64 (Poly[1])
+#define A2 v_f64 (Poly[2])
+#define A3 v_f64 (Poly[3])
+#define A4 v_f64 (Poly[4])
+#define Ln2 v_f64 (0x1.62e42fefa39efp-1)
+#define N (1 << V_LOG_TABLE_BITS)
+#define OFF v_u64 (0x3fe6900900000000)
+
+struct entry
+{
+ v_f64_t invc;
+ v_f64_t logc;
+};
+
+static inline struct entry
+lookup (v_u64_t i)
+{
+ struct entry e;
+#ifdef SCALAR
+ e.invc = __v_log_data[i].invc;
+ e.logc = __v_log_data[i].logc;
+#else
+ e.invc[0] = __v_log_data[i[0]].invc;
+ e.logc[0] = __v_log_data[i[0]].logc;
+ e.invc[1] = __v_log_data[i[1]].invc;
+ e.logc[1] = __v_log_data[i[1]].logc;
+#endif
+ return e;
+}
+
+VPCS_ATTR
+__attribute__ ((noinline)) static v_f64_t
+specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp)
+{
+ return v_call_f64 (log, x, y, cmp);
+}
+
+VPCS_ATTR
+v_f64_t
+V_NAME(log) (v_f64_t x)
+{
+ v_f64_t z, r, r2, p, y, kd, hi;
+ v_u64_t ix, iz, tmp, top, i, cmp;
+ v_s64_t k;
+ struct entry e;
+
+ ix = v_as_u64_f64 (x);
+ top = ix >> 48;
+ cmp = v_cond_u64 (top - v_u64 (0x0010) >= v_u64 (0x7ff0 - 0x0010));
+
+ /* x = 2^k z; where z is in range [OFF,2*OFF) and exact.
+ The range is split into N subintervals.
+ The ith subinterval contains z and c is near its center. */
+ tmp = ix - OFF;
+ i = (tmp >> (52 - V_LOG_TABLE_BITS)) % N;
+ k = v_as_s64_u64 (tmp) >> 52; /* arithmetic shift */
+ iz = ix - (tmp & v_u64 (0xfffULL << 52));
+ z = v_as_f64_u64 (iz);
+ e = lookup (i);
+
+ /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */
+ r = v_fma_f64 (z, e.invc, v_f64 (-1.0));
+ kd = v_to_f64_s64 (k);
+
+ /* hi = r + log(c) + k*Ln2. */
+ hi = v_fma_f64 (kd, Ln2, e.logc + r);
+ /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */
+ r2 = r * r;
+ y = v_fma_f64 (A3, r, A2);
+ p = v_fma_f64 (A1, r, A0);
+ y = v_fma_f64 (A4, r2, y);
+ y = v_fma_f64 (y, r2, p);
+ y = v_fma_f64 (y, r2, hi);
+
+ if (unlikely (v_any_u64 (cmp)))
+ return specialcase (x, y, cmp);
+ return y;
+}
+VPCS_ALIAS
+#endif
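
The exponent/significand split at the top of V_NAME(log) is compact
enough to restate as scalar code: subtracting OFF re-centres the binade
so that z lands in [0x1.69009p-1, 0x1.69009p+0), and the arithmetic
shift recovers k for the k*ln2 term (the helper name is made up, the
constants are the ones above):

    #include <stdint.h>
    #include <string.h>

    /* Returns z with x == 2^k * z and z in [OFF, 2*OFF) as a double.  */
    static double
    split_log_arg (double x, int64_t *k)
    {
      const uint64_t OFF = 0x3fe6900900000000;
      uint64_t ix, tmp, iz;
      double z;
      memcpy (&ix, &x, sizeof ix);
      tmp = ix - OFF;
      *k = (int64_t) tmp >> 52;            /* arithmetic shift */
      iz = ix - (tmp & (0xfffULL << 52));  /* clear k from the exponent */
      memcpy (&z, &iz, sizeof z);
      return z;
    }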
diff --git a/math/v_log.h b/math/v_log.h
new file mode 100644
index 0000000000000000000000000000000000000000..bcc2fa6fa9305a936ae6b6e25997a27c2c4ab4e5
--- /dev/null
+++ b/math/v_log.h
@@ -0,0 +1,18 @@
+/*
+ * Declarations for double-precision log(x) vector function.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "v_math.h"
+#if WANT_VMATH
+
+#define V_LOG_TABLE_BITS 7
+
+extern const struct v_log_data
+{
+ f64_t invc;
+ f64_t logc;
+} __v_log_data[1 << V_LOG_TABLE_BITS] HIDDEN;
+#endif
diff --git a/math/v_log_data.c b/math/v_log_data.c
new file mode 100644
index 0000000000000000000000000000000000000000..97ee5b09c6a9c2b6b100f444fa16e7dd801e5c5b
--- /dev/null
+++ b/math/v_log_data.c
@@ -0,0 +1,158 @@
+/*
+ * Lookup table for double-precision log(x) vector function.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "v_log.h"
+#if WANT_VMATH
+
+#define N (1 << V_LOG_TABLE_BITS)
+
+/* Algorithm:
+
+ x = 2^k z
+ log(x) = k ln2 + log(c) + poly(z/c - 1)
+
+where z is in [a;2a) which is split into N subintervals (a=0x1.69009p-1,N=128)
+and log(c) and 1/c for the ith subinterval comes from a lookup table:
+
+ tab[i].invc = 1/c
+ tab[i].logc = (double)log(c)
+
+where c is near the center of the subinterval and is chosen by trying several
+floating point invc candidates around 1/center and selecting one for which
+the error in (double)log(c) is minimized (< 0x1p-74); the subinterval
+containing 1 and the one before it are instead tweaked to avoid cancellation. */
+const struct v_log_data __v_log_data[N] = {
+{0x1.6a133d0dec120p+0, -0x1.62fe995eb963ap-2},
+{0x1.6815f2f3e42edp+0, -0x1.5d5a48dad6b67p-2},
+{0x1.661e39be1ac9ep+0, -0x1.57bde257d2769p-2},
+{0x1.642bfa30ac371p+0, -0x1.52294fbf2af55p-2},
+{0x1.623f1d916f323p+0, -0x1.4c9c7b598aa38p-2},
+{0x1.60578da220f65p+0, -0x1.47174fc5ff560p-2},
+{0x1.5e75349dea571p+0, -0x1.4199b7fa7b5cap-2},
+{0x1.5c97fd387a75ap+0, -0x1.3c239f48cfb99p-2},
+{0x1.5abfd2981f200p+0, -0x1.36b4f154d2aebp-2},
+{0x1.58eca051dc99cp+0, -0x1.314d9a0ff32fbp-2},
+{0x1.571e526d9df12p+0, -0x1.2bed85cca3cffp-2},
+{0x1.5554d555b3fcbp+0, -0x1.2694a11421af9p-2},
+{0x1.539015e2a20cdp+0, -0x1.2142d8d014fb2p-2},
+{0x1.51d0014ee0164p+0, -0x1.1bf81a2c77776p-2},
+{0x1.50148538cd9eep+0, -0x1.16b452a39c6a4p-2},
+{0x1.4e5d8f9f698a1p+0, -0x1.11776ffa6c67ep-2},
+{0x1.4cab0edca66bep+0, -0x1.0c416035020e0p-2},
+{0x1.4afcf1a9db874p+0, -0x1.071211aa10fdap-2},
+{0x1.495327136e16fp+0, -0x1.01e972e293b1bp-2},
+{0x1.47ad9e84af28fp+0, -0x1.f98ee587fd434p-3},
+{0x1.460c47b39ae15p+0, -0x1.ef5800ad716fbp-3},
+{0x1.446f12b278001p+0, -0x1.e52e160484698p-3},
+{0x1.42d5efdd720ecp+0, -0x1.db1104b19352ep-3},
+{0x1.4140cfe001a0fp+0, -0x1.d100ac59e0bd6p-3},
+{0x1.3fafa3b421f69p+0, -0x1.c6fced287c3bdp-3},
+{0x1.3e225c9c8ece5p+0, -0x1.bd05a7b317c29p-3},
+{0x1.3c98ec29a211ap+0, -0x1.b31abd229164fp-3},
+{0x1.3b13442a413fep+0, -0x1.a93c0edadb0a3p-3},
+{0x1.399156baa3c54p+0, -0x1.9f697ee30d7ddp-3},
+{0x1.38131639b4cdbp+0, -0x1.95a2efa9aa40ap-3},
+{0x1.36987540fbf53p+0, -0x1.8be843d796044p-3},
+{0x1.352166b648f61p+0, -0x1.82395ecc477edp-3},
+{0x1.33adddb3eb575p+0, -0x1.7896240966422p-3},
+{0x1.323dcd99fc1d3p+0, -0x1.6efe77aca8c55p-3},
+{0x1.30d129fefc7d2p+0, -0x1.65723e117ec5cp-3},
+{0x1.2f67e6b72fe7dp+0, -0x1.5bf15c0955706p-3},
+{0x1.2e01f7cf8b187p+0, -0x1.527bb6c111da1p-3},
+{0x1.2c9f518ddc86ep+0, -0x1.491133c939f8fp-3},
+{0x1.2b3fe86e5f413p+0, -0x1.3fb1b90c7fc58p-3},
+{0x1.29e3b1211b25cp+0, -0x1.365d2cc485f8dp-3},
+{0x1.288aa08b373cfp+0, -0x1.2d13758970de7p-3},
+{0x1.2734abcaa8467p+0, -0x1.23d47a721fd47p-3},
+{0x1.25e1c82459b81p+0, -0x1.1aa0229f25ec2p-3},
+{0x1.2491eb1ad59c5p+0, -0x1.117655ddebc3bp-3},
+{0x1.23450a54048b5p+0, -0x1.0856fbf83ab6bp-3},
+{0x1.21fb1bb09e578p+0, -0x1.fe83fabbaa106p-4},
+{0x1.20b415346d8f7p+0, -0x1.ec6e8507a56cdp-4},
+{0x1.1f6fed179a1acp+0, -0x1.da6d68c7cc2eap-4},
+{0x1.1e2e99b93c7b3p+0, -0x1.c88078462be0cp-4},
+{0x1.1cf011a7a882ap+0, -0x1.b6a786a423565p-4},
+{0x1.1bb44b97dba5ap+0, -0x1.a4e2676ac7f85p-4},
+{0x1.1a7b3e66cdd4fp+0, -0x1.9330eea777e76p-4},
+{0x1.1944e11dc56cdp+0, -0x1.8192f134d5ad9p-4},
+{0x1.18112aebb1a6ep+0, -0x1.70084464f0538p-4},
+{0x1.16e013231b7e9p+0, -0x1.5e90bdec5cb1fp-4},
+{0x1.15b1913f156cfp+0, -0x1.4d2c3433c5536p-4},
+{0x1.14859cdedde13p+0, -0x1.3bda7e219879ap-4},
+{0x1.135c2dc68cfa4p+0, -0x1.2a9b732d27194p-4},
+{0x1.12353bdb01684p+0, -0x1.196eeb2b10807p-4},
+{0x1.1110bf25b85b4p+0, -0x1.0854be8ef8a7ep-4},
+{0x1.0feeafd2f8577p+0, -0x1.ee998cb277432p-5},
+{0x1.0ecf062c51c3bp+0, -0x1.ccadb79919fb9p-5},
+{0x1.0db1baa076c8bp+0, -0x1.aae5b1d8618b0p-5},
+{0x1.0c96c5bb3048ep+0, -0x1.89413015d7442p-5},
+{0x1.0b7e20263e070p+0, -0x1.67bfe7bf158dep-5},
+{0x1.0a67c2acd0ce3p+0, -0x1.46618f83941bep-5},
+{0x1.0953a6391e982p+0, -0x1.2525df1b0618ap-5},
+{0x1.0841c3caea380p+0, -0x1.040c8e2f77c6ap-5},
+{0x1.07321489b13eap+0, -0x1.c62aad39f738ap-6},
+{0x1.062491aee9904p+0, -0x1.847fe3bdead9cp-6},
+{0x1.05193497a7cc5p+0, -0x1.43183683400acp-6},
+{0x1.040ff6b5f5e9fp+0, -0x1.01f31c4e1d544p-6},
+{0x1.0308d19aa6127p+0, -0x1.82201d1e6b69ap-7},
+{0x1.0203beedb0c67p+0, -0x1.00dd0f3e1bfd6p-7},
+{0x1.010037d38bcc2p+0, -0x1.ff6fe1feb4e53p-9},
+{1.0, 0.0},
+{0x1.fc06d493cca10p-1, 0x1.fe91885ec8e20p-8},
+{0x1.f81e6ac3b918fp-1, 0x1.fc516f716296dp-7},
+{0x1.f44546ef18996p-1, 0x1.7bb4dd70a015bp-6},
+{0x1.f07b10382c84bp-1, 0x1.f84c99b34b674p-6},
+{0x1.ecbf7070e59d4p-1, 0x1.39f9ce4fb2d71p-5},
+{0x1.e91213f715939p-1, 0x1.7756c0fd22e78p-5},
+{0x1.e572a9a75f7b7p-1, 0x1.b43ee82db8f3ap-5},
+{0x1.e1e0e2c530207p-1, 0x1.f0b3fced60034p-5},
+{0x1.de5c72d8a8be3p-1, 0x1.165bd78d4878ep-4},
+{0x1.dae50fa5658ccp-1, 0x1.3425d2715ebe6p-4},
+{0x1.d77a71145a2dap-1, 0x1.51b8bd91b7915p-4},
+{0x1.d41c51166623ep-1, 0x1.6f15632c76a47p-4},
+{0x1.d0ca6ba0bb29fp-1, 0x1.8c3c88ecbe503p-4},
+{0x1.cd847e8e59681p-1, 0x1.a92ef077625dap-4},
+{0x1.ca4a499693e00p-1, 0x1.c5ed5745fa006p-4},
+{0x1.c71b8e399e821p-1, 0x1.e27876de1c993p-4},
+{0x1.c3f80faf19077p-1, 0x1.fed104fce4cdcp-4},
+{0x1.c0df92dc2b0ecp-1, 0x1.0d7bd9c17d78bp-3},
+{0x1.bdd1de3cbb542p-1, 0x1.1b76986cef97bp-3},
+{0x1.baceb9e1007a3p-1, 0x1.295913d24f750p-3},
+{0x1.b7d5ef543e55ep-1, 0x1.37239fa295d17p-3},
+{0x1.b4e749977d953p-1, 0x1.44d68dd78714bp-3},
+{0x1.b20295155478ep-1, 0x1.52722ebe5d780p-3},
+{0x1.af279f8e82be2p-1, 0x1.5ff6d12671f98p-3},
+{0x1.ac5638197fdf3p-1, 0x1.6d64c2389484bp-3},
+{0x1.a98e2f102e087p-1, 0x1.7abc4da40fddap-3},
+{0x1.a6cf5606d05c1p-1, 0x1.87fdbda1e8452p-3},
+{0x1.a4197fc04d746p-1, 0x1.95295b06a5f37p-3},
+{0x1.a16c80293dc01p-1, 0x1.a23f6d34abbc5p-3},
+{0x1.9ec82c4dc5bc9p-1, 0x1.af403a28e04f2p-3},
+{0x1.9c2c5a491f534p-1, 0x1.bc2c06a85721ap-3},
+{0x1.9998e1480b618p-1, 0x1.c903161240163p-3},
+{0x1.970d9977c6c2dp-1, 0x1.d5c5aa93287ebp-3},
+{0x1.948a5c023d212p-1, 0x1.e274051823fa9p-3},
+{0x1.920f0303d6809p-1, 0x1.ef0e656300c16p-3},
+{0x1.8f9b698a98b45p-1, 0x1.fb9509f05aa2ap-3},
+{0x1.8d2f6b81726f6p-1, 0x1.04041821f37afp-2},
+{0x1.8acae5bb55badp-1, 0x1.0a340a49b3029p-2},
+{0x1.886db5d9275b8p-1, 0x1.105a7918a126dp-2},
+{0x1.8617ba567c13cp-1, 0x1.1677819812b84p-2},
+{0x1.83c8d27487800p-1, 0x1.1c8b405b40c0ep-2},
+{0x1.8180de3c5dbe7p-1, 0x1.2295d16cfa6b1p-2},
+{0x1.7f3fbe71cdb71p-1, 0x1.28975066318a2p-2},
+{0x1.7d055498071c1p-1, 0x1.2e8fd855d86fcp-2},
+{0x1.7ad182e54f65ap-1, 0x1.347f83d605e59p-2},
+{0x1.78a42c3c90125p-1, 0x1.3a666d1244588p-2},
+{0x1.767d342f76944p-1, 0x1.4044adb6f8ec4p-2},
+{0x1.745c7ef26b00ap-1, 0x1.461a5f077558cp-2},
+{0x1.7241f15769d0fp-1, 0x1.4be799e20b9c8p-2},
+{0x1.702d70d396e41p-1, 0x1.51ac76a6b79dfp-2},
+{0x1.6e1ee3700cd11p-1, 0x1.57690d5744a45p-2},
+{0x1.6c162fc9cbe02p-1, 0x1.5d1d758e45217p-2},
+};
+#endif
diff --git a/math/v_logf.c b/math/v_logf.c
new file mode 100644
index 0000000000000000000000000000000000000000..7373192f03fae52c113eabcb69067019e6e2a70c
--- /dev/null
+++ b/math/v_logf.c
@@ -0,0 +1,73 @@
+/*
+ * Single-precision vector log function.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+#if V_SUPPORTED
+
+static const float Poly[] = {
+ /* 3.34 ulp error */
+ -0x1.3e737cp-3f, 0x1.5a9aa2p-3f, -0x1.4f9934p-3f, 0x1.961348p-3f,
+ -0x1.00187cp-2f, 0x1.555d7cp-2f, -0x1.ffffc8p-2f,
+};
+#define P7 v_f32 (Poly[0])
+#define P6 v_f32 (Poly[1])
+#define P5 v_f32 (Poly[2])
+#define P4 v_f32 (Poly[3])
+#define P3 v_f32 (Poly[4])
+#define P2 v_f32 (Poly[5])
+#define P1 v_f32 (Poly[6])
+
+#define Ln2 v_f32 (0x1.62e43p-1f) /* 0x3f317218 */
+#define Min v_u32 (0x00800000)
+#define Max v_u32 (0x7f800000)
+#define Mask v_u32 (0x007fffff)
+#define Off v_u32 (0x3f2aaaab) /* 0.666667 */
+
+VPCS_ATTR
+__attribute__ ((noinline)) static v_f32_t
+specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp)
+{
+ /* Fall back to scalar code. */
+ return v_call_f32 (logf, x, y, cmp);
+}
+
+VPCS_ATTR
+v_f32_t
+V_NAME(logf) (v_f32_t x)
+{
+ v_f32_t n, p, q, r, r2, y;
+ v_u32_t u, cmp;
+
+ u = v_as_u32_f32 (x);
+ cmp = v_cond_u32 (u - Min >= Max - Min);
+
+ /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3 */
+ u -= Off;
+ n = v_to_f32_s32 (v_as_s32_u32 (u) >> 23); /* sign extend */
+ u &= Mask;
+ u += Off;
+ r = v_as_f32_u32 (u) - v_f32 (1.0f);
+
+ /* y = log(1+r) + n*ln2. */
+ r2 = r * r;
+ /* n*ln2 + r + r2*(P1 + r*P2 + r2*(P3 + r*P4 + r2*(P5 + r*P6 + r2*P7))). */
+ p = v_fma_f32 (P6, r, P5);
+ q = v_fma_f32 (P4, r, P3);
+ y = v_fma_f32 (P2, r, P1);
+ p = v_fma_f32 (P7, r2, p);
+ q = v_fma_f32 (p, r2, q);
+ y = v_fma_f32 (q, r2, y);
+ p = v_fma_f32 (Ln2, n, r);
+ y = v_fma_f32 (y, r2, p);
+
+ if (unlikely (v_any_u32 (cmp)))
+ return specialcase (x, y, cmp);
+ return y;
+}
+VPCS_ALIAS
+#endif
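
The single-precision variant plays the same trick with
Off = asuint (0x1.555556p-1), so that 1+r straddles 1 on the interval
(2/3, 4/3) and |r| stays small on both sides. Restated as a scalar
sketch (hypothetical helper name, constants from above):

    #include <stdint.h>
    #include <string.h>

    /* Returns r with x == 2^n * (1+r), 1+r in roughly (2/3, 4/3).  */
    static float
    split_logf_arg (float x, float *n)
    {
      const uint32_t Off = 0x3f2aaaab;
      uint32_t u;
      float hold;
      memcpy (&u, &x, sizeof u);
      u -= Off;
      *n = (float) ((int32_t) u >> 23);  /* sign-extended exponent */
      u = (u & 0x007fffff) + Off;
      memcpy (&hold, &u, sizeof hold);
      return hold - 1.0f;
    }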
diff --git a/math/v_math.h b/math/v_math.h
new file mode 100644
index 0000000000000000000000000000000000000000..f2cc4670bb9b8524c0318952b3e0a417a73746b1
--- /dev/null
+++ b/math/v_math.h
@@ -0,0 +1,641 @@
+/*
+ * Vector math abstractions.
+ *
+ * Copyright (c) 2019-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#ifndef _V_MATH_H
+#define _V_MATH_H
+
+#ifndef WANT_VMATH
+/* Enable the build of vector math code. */
+# define WANT_VMATH 1
+#endif
+#if WANT_VMATH
+
+/* The goal of this header is to allow building the same
+   algorithm for both vector and scalar targets.  The provided
+   intrinsic wrappers are vector-length agnostic, so they can
+   also be implemented for SVE (or other SIMD architectures),
+   and then the code should work on those targets too. */
+
+#if SCALAR
+#define V_NAME(x) __s_##x
+#elif VPCS && __aarch64__
+#define V_NAME(x) __vn_##x
+#define VPCS_ATTR __attribute__ ((aarch64_vector_pcs))
+#else
+#define V_NAME(x) __v_##x
+#endif
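/* For illustration (a hypothetical expansion, not part of the build):
   compiling v_logf.c under each configuration yields
     SCALAR=1:             float __s_logf (float);
     VPCS && __aarch64__:  float32x4_t __vn_logf (float32x4_t);
     otherwise:            v_f32_t __v_logf (v_f32_t);
   so one source file provides scalar, vector and vector-PCS entry
   points.  */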
+
+#ifndef VPCS_ATTR
+#define VPCS_ATTR
+#endif
+#ifndef VPCS_ALIAS
+#define VPCS_ALIAS
+#endif
+
+#include <stdint.h>
+#include "math_config.h"
+
+typedef float f32_t;
+typedef uint32_t u32_t;
+typedef int32_t s32_t;
+typedef double f64_t;
+typedef uint64_t u64_t;
+typedef int64_t s64_t;
+
+/* reinterpret as type1 from type2. */
+static inline u32_t
+as_u32_f32 (f32_t x)
+{
+ union { f32_t f; u32_t u; } r = {x};
+ return r.u;
+}
+static inline f32_t
+as_f32_u32 (u32_t x)
+{
+ union { u32_t u; f32_t f; } r = {x};
+ return r.f;
+}
+static inline s32_t
+as_s32_u32 (u32_t x)
+{
+ union { u32_t u; s32_t i; } r = {x};
+ return r.i;
+}
+static inline u32_t
+as_u32_s32 (s32_t x)
+{
+ union { s32_t i; u32_t u; } r = {x};
+ return r.u;
+}
+static inline u64_t
+as_u64_f64 (f64_t x)
+{
+ union { f64_t f; u64_t u; } r = {x};
+ return r.u;
+}
+static inline f64_t
+as_f64_u64 (u64_t x)
+{
+ union { u64_t u; f64_t f; } r = {x};
+ return r.f;
+}
+static inline s64_t
+as_s64_u64 (u64_t x)
+{
+ union { u64_t u; s64_t i; } r = {x};
+ return r.i;
+}
+static inline u64_t
+as_u64_s64 (s64_t x)
+{
+ union { s64_t i; u64_t u; } r = {x};
+ return r.u;
+}
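/* Usage sketch (illustrative): the union round-trips above are the
   strict-aliasing-safe way to manipulate float bit patterns, e.g. a
   branch-free fabs:

     static inline f64_t
     fabs_bits (f64_t x)
     {
       return as_f64_u64 (as_u64_f64 (x) & 0x7fffffffffffffffULL);
     }
*/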
+
+#if SCALAR
+#define V_SUPPORTED 1
+typedef f32_t v_f32_t;
+typedef u32_t v_u32_t;
+typedef s32_t v_s32_t;
+typedef f64_t v_f64_t;
+typedef u64_t v_u64_t;
+typedef s64_t v_s64_t;
+
+static inline int
+v_lanes32 (void)
+{
+ return 1;
+}
+
+static inline v_f32_t
+v_f32 (f32_t x)
+{
+ return x;
+}
+static inline v_u32_t
+v_u32 (u32_t x)
+{
+ return x;
+}
+static inline v_s32_t
+v_s32 (s32_t x)
+{
+ return x;
+}
+
+static inline f32_t
+v_get_f32 (v_f32_t x, int i)
+{
+ return x;
+}
+static inline u32_t
+v_get_u32 (v_u32_t x, int i)
+{
+ return x;
+}
+static inline s32_t
+v_get_s32 (v_s32_t x, int i)
+{
+ return x;
+}
+
+static inline void
+v_set_f32 (v_f32_t *x, int i, f32_t v)
+{
+ *x = v;
+}
+static inline void
+v_set_u32 (v_u32_t *x, int i, u32_t v)
+{
+ *x = v;
+}
+static inline void
+v_set_s32 (v_s32_t *x, int i, s32_t v)
+{
+ *x = v;
+}
+
+/* true if any element of a v_cond result is non-zero. */
+static inline int
+v_any_u32 (v_u32_t x)
+{
+ return x != 0;
+}
+/* to wrap the result of relational operators. */
+static inline v_u32_t
+v_cond_u32 (v_u32_t x)
+{
+ return x ? -1 : 0;
+}
+static inline v_f32_t
+v_abs_f32 (v_f32_t x)
+{
+ return __builtin_fabsf (x);
+}
+static inline v_f32_t
+v_fma_f32 (v_f32_t x, v_f32_t y, v_f32_t z)
+{
+ return __builtin_fmaf (x, y, z);
+}
+static inline v_f32_t
+v_round_f32 (v_f32_t x)
+{
+ return __builtin_roundf (x);
+}
+static inline v_s32_t
+v_round_s32 (v_f32_t x)
+{
+ return __builtin_lroundf (x); /* relies on -fno-math-errno. */
+}
+/* convert to type1 from type2. */
+static inline v_f32_t
+v_to_f32_s32 (v_s32_t x)
+{
+ return x;
+}
+static inline v_f32_t
+v_to_f32_u32 (v_u32_t x)
+{
+ return x;
+}
+/* reinterpret as type1 from type2. */
+static inline v_u32_t
+v_as_u32_f32 (v_f32_t x)
+{
+ union { v_f32_t f; v_u32_t u; } r = {x};
+ return r.u;
+}
+static inline v_f32_t
+v_as_f32_u32 (v_u32_t x)
+{
+ union { v_u32_t u; v_f32_t f; } r = {x};
+ return r.f;
+}
+static inline v_s32_t
+v_as_s32_u32 (v_u32_t x)
+{
+ union { v_u32_t u; v_s32_t i; } r = {x};
+ return r.i;
+}
+static inline v_u32_t
+v_as_u32_s32 (v_s32_t x)
+{
+ union { v_s32_t i; v_u32_t u; } r = {x};
+ return r.u;
+}
+static inline v_f32_t
+v_lookup_f32 (const f32_t *tab, v_u32_t idx)
+{
+ return tab[idx];
+}
+static inline v_u32_t
+v_lookup_u32 (const u32_t *tab, v_u32_t idx)
+{
+ return tab[idx];
+}
+static inline v_f32_t
+v_call_f32 (f32_t (*f) (f32_t), v_f32_t x, v_f32_t y, v_u32_t p)
+{
+ return f (x);
+}
+static inline v_f32_t
+v_call2_f32 (f32_t (*f) (f32_t, f32_t), v_f32_t x1, v_f32_t x2, v_f32_t y,
+ v_u32_t p)
+{
+ return f (x1, x2);
+}
+
+static inline int
+v_lanes64 (void)
+{
+ return 1;
+}
+static inline v_f64_t
+v_f64 (f64_t x)
+{
+ return x;
+}
+static inline v_u64_t
+v_u64 (u64_t x)
+{
+ return x;
+}
+static inline v_s64_t
+v_s64 (s64_t x)
+{
+ return x;
+}
+static inline f64_t
+v_get_f64 (v_f64_t x, int i)
+{
+ return x;
+}
+static inline void
+v_set_f64 (v_f64_t *x, int i, f64_t v)
+{
+ *x = v;
+}
+/* true if any element of a v_cond result is non-zero. */
+static inline int
+v_any_u64 (v_u64_t x)
+{
+ return x != 0;
+}
+/* to wrap the result of relational operators. */
+static inline v_u64_t
+v_cond_u64 (v_u64_t x)
+{
+ return x ? -1 : 0;
+}
+static inline v_f64_t
+v_abs_f64 (v_f64_t x)
+{
+ return __builtin_fabs (x);
+}
+static inline v_f64_t
+v_fma_f64 (v_f64_t x, v_f64_t y, v_f64_t z)
+{
+ return __builtin_fma (x, y, z);
+}
+static inline v_f64_t
+v_round_f64 (v_f64_t x)
+{
+ return __builtin_round (x);
+}
+static inline v_s64_t
+v_round_s64 (v_f64_t x)
+{
+ return __builtin_lround (x); /* relies on -fno-math-errno. */
+}
+/* convert to type1 from type2. */
+static inline v_f64_t
+v_to_f64_s64 (v_s64_t x)
+{
+ return x;
+}
+static inline v_f64_t
+v_to_f64_u64 (v_u64_t x)
+{
+ return x;
+}
+/* reinterpret as type1 from type2. */
+static inline v_u64_t
+v_as_u64_f64 (v_f64_t x)
+{
+ union { v_f64_t f; v_u64_t u; } r = {x};
+ return r.u;
+}
+static inline v_f64_t
+v_as_f64_u64 (v_u64_t x)
+{
+ union { v_u64_t u; v_f64_t f; } r = {x};
+ return r.f;
+}
+static inline v_s64_t
+v_as_s64_u64 (v_u64_t x)
+{
+ union { v_u64_t u; v_s64_t i; } r = {x};
+ return r.i;
+}
+static inline v_u64_t
+v_as_u64_s64 (v_s64_t x)
+{
+ union { v_s64_t i; v_u64_t u; } r = {x};
+ return r.u;
+}
+static inline v_f64_t
+v_lookup_f64 (const f64_t *tab, v_u64_t idx)
+{
+ return tab[idx];
+}
+static inline v_u64_t
+v_lookup_u64 (const u64_t *tab, v_u64_t idx)
+{
+ return tab[idx];
+}
+static inline v_f64_t
+v_call_f64 (f64_t (*f) (f64_t), v_f64_t x, v_f64_t y, v_u64_t p)
+{
+ return f (x);
+}
+
+#elif __aarch64__
+#define V_SUPPORTED 1
+#include <arm_neon.h>
+typedef float32x4_t v_f32_t;
+typedef uint32x4_t v_u32_t;
+typedef int32x4_t v_s32_t;
+typedef float64x2_t v_f64_t;
+typedef uint64x2_t v_u64_t;
+typedef int64x2_t v_s64_t;
+
+static inline int
+v_lanes32 (void)
+{
+ return 4;
+}
+
+static inline v_f32_t
+v_f32 (f32_t x)
+{
+ return (v_f32_t){x, x, x, x};
+}
+static inline v_u32_t
+v_u32 (u32_t x)
+{
+ return (v_u32_t){x, x, x, x};
+}
+static inline v_s32_t
+v_s32 (s32_t x)
+{
+ return (v_s32_t){x, x, x, x};
+}
+
+static inline f32_t
+v_get_f32 (v_f32_t x, int i)
+{
+ return x[i];
+}
+static inline u32_t
+v_get_u32 (v_u32_t x, int i)
+{
+ return x[i];
+}
+static inline s32_t
+v_get_s32 (v_s32_t x, int i)
+{
+ return x[i];
+}
+
+static inline void
+v_set_f32 (v_f32_t *x, int i, f32_t v)
+{
+ (*x)[i] = v;
+}
+static inline void
+v_set_u32 (v_u32_t *x, int i, u32_t v)
+{
+ (*x)[i] = v;
+}
+static inline void
+v_set_s32 (v_s32_t *x, int i, s32_t v)
+{
+ (*x)[i] = v;
+}
+
+/* true if any element of a v_cond result is non-zero.  */
+static inline int
+v_any_u32 (v_u32_t x)
+{
+ /* assume elements in x are either 0 or -1u. */
+ return vpaddd_u64 (vreinterpretq_u64_u32 (x)) != 0;
+}
+/* to wrap the result of relational operators. */
+static inline v_u32_t
+v_cond_u32 (v_u32_t x)
+{
+ return x;
+}
+static inline v_f32_t
+v_abs_f32 (v_f32_t x)
+{
+ return vabsq_f32 (x);
+}
+static inline v_f32_t
+v_fma_f32 (v_f32_t x, v_f32_t y, v_f32_t z)
+{
+ return vfmaq_f32 (z, x, y);
+}
+static inline v_f32_t
+v_round_f32 (v_f32_t x)
+{
+ return vrndaq_f32 (x);
+}
+static inline v_s32_t
+v_round_s32 (v_f32_t x)
+{
+ return vcvtaq_s32_f32 (x);
+}
+/* convert to type1 from type2. */
+static inline v_f32_t
+v_to_f32_s32 (v_s32_t x)
+{
+ return (v_f32_t){x[0], x[1], x[2], x[3]};
+}
+static inline v_f32_t
+v_to_f32_u32 (v_u32_t x)
+{
+ return (v_f32_t){x[0], x[1], x[2], x[3]};
+}
+/* reinterpret as type1 from type2. */
+static inline v_u32_t
+v_as_u32_f32 (v_f32_t x)
+{
+ union { v_f32_t f; v_u32_t u; } r = {x};
+ return r.u;
+}
+static inline v_f32_t
+v_as_f32_u32 (v_u32_t x)
+{
+ union { v_u32_t u; v_f32_t f; } r = {x};
+ return r.f;
+}
+static inline v_s32_t
+v_as_s32_u32 (v_u32_t x)
+{
+ union { v_u32_t u; v_s32_t i; } r = {x};
+ return r.i;
+}
+static inline v_u32_t
+v_as_u32_s32 (v_s32_t x)
+{
+ union { v_s32_t i; v_u32_t u; } r = {x};
+ return r.u;
+}
+static inline v_f32_t
+v_lookup_f32 (const f32_t *tab, v_u32_t idx)
+{
+ return (v_f32_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]};
+}
+static inline v_u32_t
+v_lookup_u32 (const u32_t *tab, v_u32_t idx)
+{
+ return (v_u32_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]};
+}
+static inline v_f32_t
+v_call_f32 (f32_t (*f) (f32_t), v_f32_t x, v_f32_t y, v_u32_t p)
+{
+ return (v_f32_t){p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1],
+ p[2] ? f (x[2]) : y[2], p[3] ? f (x[3]) : y[3]};
+}
+static inline v_f32_t
+v_call2_f32 (f32_t (*f) (f32_t, f32_t), v_f32_t x1, v_f32_t x2, v_f32_t y,
+ v_u32_t p)
+{
+ return (
+ v_f32_t){p[0] ? f (x1[0], x2[0]) : y[0], p[1] ? f (x1[1], x2[1]) : y[1],
+ p[2] ? f (x1[2], x2[2]) : y[2], p[3] ? f (x1[3], x2[3]) : y[3]};
+}
+
+static inline int
+v_lanes64 (void)
+{
+ return 2;
+}
+static inline v_f64_t
+v_f64 (f64_t x)
+{
+ return (v_f64_t){x, x};
+}
+static inline v_u64_t
+v_u64 (u64_t x)
+{
+ return (v_u64_t){x, x};
+}
+static inline v_s64_t
+v_s64 (s64_t x)
+{
+ return (v_s64_t){x, x};
+}
+static inline f64_t
+v_get_f64 (v_f64_t x, int i)
+{
+ return x[i];
+}
+static inline void
+v_set_f64 (v_f64_t *x, int i, f64_t v)
+{
+ (*x)[i] = v;
+}
+/* true if any element of a v_cond result is non-zero.  */
+static inline int
+v_any_u64 (v_u64_t x)
+{
+ /* assume elements in x are either 0 or -1u. */
+ return vpaddd_u64 (x) != 0;
+}
+/* to wrap the result of relational operators. */
+static inline v_u64_t
+v_cond_u64 (v_u64_t x)
+{
+ return x;
+}
+static inline v_f64_t
+v_abs_f64 (v_f64_t x)
+{
+ return vabsq_f64 (x);
+}
+static inline v_f64_t
+v_fma_f64 (v_f64_t x, v_f64_t y, v_f64_t z)
+{
+ return vfmaq_f64 (z, x, y);
+}
+static inline v_f64_t
+v_round_f64 (v_f64_t x)
+{
+ return vrndaq_f64 (x);
+}
+static inline v_s64_t
+v_round_s64 (v_f64_t x)
+{
+ return vcvtaq_s64_f64 (x);
+}
+/* convert to type1 from type2. */
+static inline v_f64_t
+v_to_f64_s64 (v_s64_t x)
+{
+ return (v_f64_t){x[0], x[1]};
+}
+static inline v_f64_t
+v_to_f64_u64 (v_u64_t x)
+{
+ return (v_f64_t){x[0], x[1]};
+}
+/* reinterpret as type1 from type2. */
+static inline v_u64_t
+v_as_u64_f64 (v_f64_t x)
+{
+ union { v_f64_t f; v_u64_t u; } r = {x};
+ return r.u;
+}
+static inline v_f64_t
+v_as_f64_u64 (v_u64_t x)
+{
+ union { v_u64_t u; v_f64_t f; } r = {x};
+ return r.f;
+}
+static inline v_s64_t
+v_as_s64_u64 (v_u64_t x)
+{
+ union { v_u64_t u; v_s64_t i; } r = {x};
+ return r.i;
+}
+static inline v_u64_t
+v_as_u64_s64 (v_s64_t x)
+{
+ union { v_s64_t i; v_u64_t u; } r = {x};
+ return r.u;
+}
+static inline v_f64_t
+v_lookup_f64 (const f64_t *tab, v_u64_t idx)
+{
+ return (v_f64_t){tab[idx[0]], tab[idx[1]]};
+}
+static inline v_u64_t
+v_lookup_u64 (const u64_t *tab, v_u64_t idx)
+{
+ return (v_u64_t){tab[idx[0]], tab[idx[1]]};
+}
+static inline v_f64_t
+v_call_f64 (f64_t (*f) (f64_t), v_f64_t x, v_f64_t y, v_u64_t p)
+{
+ return (v_f64_t){p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1]};
+}
+#endif
+
+#endif
+#endif
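
The two backends above implement one portable helper API: the same source compiles against the scalar fallback or the AdvSIMD definitions. As a minimal sketch (not part of the patch, assuming mathlib.h and v_math.h are on the include path and V_SUPPORTED is set), a lane-wise Horner evaluation looks like this:

/* Evaluate p(r) = 1 + r + r^2/2 with fused multiply-adds; builds
   unchanged against both the scalar and the NEON backend.  */
#include "mathlib.h"
#include "v_math.h"
#if V_SUPPORTED
static inline v_f64_t
poly_eval (v_f64_t r)
{
  v_f64_t p = v_f64 (0.5);
  p = v_fma_f64 (p, r, v_f64 (1.0)); /* 0.5*r + 1 */
  p = v_fma_f64 (p, r, v_f64 (1.0)); /* (0.5*r + 1)*r + 1 */
  return p;
}
#endif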
diff --git a/math/aarch64/v_pow.c b/math/v_pow.c
similarity index 35%
rename from math/aarch64/v_pow.c
rename to math/v_pow.c
index 734f1663a283d4ce068efc2526d0dd989ba5433b..a209d57f41cee70ac78bc4f418c385f481636025 100644
--- a/math/aarch64/v_pow.c
+++ b/math/v_pow.c
@@ -1,22 +1,27 @@
/*
* Double-precision vector pow function.
*
- * Copyright (c) 2020-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
*/
#include "mathlib.h"
#include "v_math.h"
+#if V_SUPPORTED
-float64x2_t VPCS_ATTR V_NAME_D2 (pow) (float64x2_t x, float64x2_t y)
+VPCS_ATTR
+v_f64_t
+V_NAME(pow) (v_f64_t x, v_f64_t y)
{
- float64x2_t z;
+ v_f64_t z;
for (int lane = 0; lane < v_lanes64 (); lane++)
{
- double sx = x[lane];
- double sy = y[lane];
- double sz = pow (sx, sy);
- z[lane] = sz;
+ f64_t sx = v_get_f64 (x, lane);
+ f64_t sy = v_get_f64 (y, lane);
+ f64_t sz = pow (sx, sy);
+ v_set_f64 (&z, lane, sz);
}
return z;
}
+VPCS_ALIAS
+#endif
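
v_pow.c only loops over lanes and defers to scalar pow, so the interesting part is the naming: V_NAME(pow) resolves to a __v_ or __vn_ symbol depending on whether the vector PCS is in use. The macro is presumably defined in v_math.h; an illustrative expansion (an assumption, not quoted from the patch) would be:

#ifdef VPCS
# define V_NAME(f) __vn_##f  /* AdvSIMD vector PCS entry point */
#else
# define V_NAME(f) __v_##f   /* default AAPCS entry point */
#endif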
diff --git a/math/v_powf.c b/math/v_powf.c
new file mode 100644
index 0000000000000000000000000000000000000000..fb80fa6f184688ee7396a12121604b12d9b1db1a
--- /dev/null
+++ b/math/v_powf.c
@@ -0,0 +1,235 @@
+/*
+ * Single-precision vector powf function.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+#if V_SUPPORTED
+
+#define Min v_u32 (0x00800000)
+#define Max v_u32 (0x7f800000)
+#define SBITS 5
+#define Tlog v__powf_log2_data.tab
+#define Texp v__exp2f_data.tab
+#define A v__powf_log2_data.poly
+#define C v__exp2f_data.poly
+#define LOGDEG 4
+
+#if LOGDEG == 5
+/* 1.01 ulp */
+#define OFF v_u32 (0x3f330000)
+#define TBITS 4
+#elif LOGDEG == 4
+/* 2.6 ulp ~ 0.5 + 2^24 (128*Ln2*relerr_log2 + relerr_exp2) */
+#define OFF v_u32 (0x3f35d000)
+#define TBITS 5
+#endif
+
+#define V_EXP2F_TABLE_BITS SBITS
+#define V_EXP2F_POLY_ORDER 3
+struct v_exp2f_data
+{
+ uint64_t tab[1 << V_EXP2F_TABLE_BITS];
+ double poly[V_EXP2F_POLY_ORDER];
+};
+
+#define V_POWF_LOG2_TABLE_BITS TBITS
+#define V_POWF_LOG2_POLY_ORDER LOGDEG
+#define SCALE ((double) (1 << SBITS))
+struct v_powf_log2_data
+{
+ struct
+ {
+ double invc, logc;
+ } tab[1 << V_POWF_LOG2_TABLE_BITS];
+ double poly[V_POWF_LOG2_POLY_ORDER];
+};
+
+static const struct v_powf_log2_data v__powf_log2_data = {
+#if LOGDEG == 5
+ .tab = {
+{ 0x1.661ec79f8f3bep+0, -0x1.efec65b963019p-2 * SCALE },
+{ 0x1.571ed4aaf883dp+0, -0x1.b0b6832d4fca4p-2 * SCALE },
+{ 0x1.49539f0f010bp+0, -0x1.7418b0a1fb77bp-2 * SCALE },
+{ 0x1.3c995b0b80385p+0, -0x1.39de91a6dcf7bp-2 * SCALE },
+{ 0x1.30d190c8864a5p+0, -0x1.01d9bf3f2b631p-2 * SCALE },
+{ 0x1.25e227b0b8eap+0, -0x1.97c1d1b3b7afp-3 * SCALE },
+{ 0x1.1bb4a4a1a343fp+0, -0x1.2f9e393af3c9fp-3 * SCALE },
+{ 0x1.12358f08ae5bap+0, -0x1.960cbbf788d5cp-4 * SCALE },
+{ 0x1.0953f419900a7p+0, -0x1.a6f9db6475fcep-5 * SCALE },
+{ 0x1p+0, 0x0p+0 * SCALE },
+{ 0x1.e608cfd9a47acp-1, 0x1.338ca9f24f53dp-4 * SCALE },
+{ 0x1.ca4b31f026aap-1, 0x1.476a9543891bap-3 * SCALE },
+{ 0x1.b2036576afce6p-1, 0x1.e840b4ac4e4d2p-3 * SCALE },
+{ 0x1.9c2d163a1aa2dp-1, 0x1.40645f0c6651cp-2 * SCALE },
+{ 0x1.886e6037841edp-1, 0x1.88e9c2c1b9ff8p-2 * SCALE },
+{ 0x1.767dcf5534862p-1, 0x1.ce0a44eb17bccp-2 * SCALE },
+ },
+/* rel err: 1.46 * 2^-32 */
+ .poly = {
+0x1.27616c9496e0bp-2 * SCALE, -0x1.71969a075c67ap-2 * SCALE,
+0x1.ec70a6ca7baddp-2 * SCALE, -0x1.7154748bef6c8p-1 * SCALE,
+0x1.71547652ab82bp0 * SCALE,
+ }
+#elif LOGDEG == 4
+ .tab = {
+{0x1.6489890582816p+0, -0x1.e960f97b22702p-2 * SCALE},
+{0x1.5cf19b35e3472p+0, -0x1.c993406cd4db6p-2 * SCALE},
+{0x1.55aac0e956d65p+0, -0x1.aa711d9a7d0f3p-2 * SCALE},
+{0x1.4eb0022977e01p+0, -0x1.8bf37bacdce9bp-2 * SCALE},
+{0x1.47fcccda1dd1fp+0, -0x1.6e13b3519946ep-2 * SCALE},
+{0x1.418ceabab68c1p+0, -0x1.50cb8281e4089p-2 * SCALE},
+{0x1.3b5c788f1edb3p+0, -0x1.341504a237e2bp-2 * SCALE},
+{0x1.3567de48e9c9ap+0, -0x1.17eaab624ffbbp-2 * SCALE},
+{0x1.2fabc80fd19bap+0, -0x1.f88e708f8c853p-3 * SCALE},
+{0x1.2a25200ce536bp+0, -0x1.c24b6da113914p-3 * SCALE},
+{0x1.24d108e0152e3p+0, -0x1.8d02ee397cb1dp-3 * SCALE},
+{0x1.1facd8ab2fbe1p+0, -0x1.58ac1223408b3p-3 * SCALE},
+{0x1.1ab614a03efdfp+0, -0x1.253e6fd190e89p-3 * SCALE},
+{0x1.15ea6d03af9ffp+0, -0x1.e5641882c12ffp-4 * SCALE},
+{0x1.1147b994bb776p+0, -0x1.81fea712926f7p-4 * SCALE},
+{0x1.0ccbf650593aap+0, -0x1.203e240de64a3p-4 * SCALE},
+{0x1.0875408477302p+0, -0x1.8029b86a78281p-5 * SCALE},
+{0x1.0441d42a93328p+0, -0x1.85d713190fb9p-6 * SCALE},
+{0x1p+0, 0x0p+0 * SCALE},
+{0x1.f1d006c855e86p-1, 0x1.4c1cc07312997p-5 * SCALE},
+{0x1.e28c3341aa301p-1, 0x1.5e1848ccec948p-4 * SCALE},
+{0x1.d4bdf9aa64747p-1, 0x1.04cfcb7f1196fp-3 * SCALE},
+{0x1.c7b45a24e5803p-1, 0x1.582813d463c21p-3 * SCALE},
+{0x1.bb5f5eb2ed60ap-1, 0x1.a936fa68760ccp-3 * SCALE},
+{0x1.afb0bff8fe6b4p-1, 0x1.f81bc31d6cc4ep-3 * SCALE},
+{0x1.a49badf7ab1f5p-1, 0x1.2279a09fae6b1p-2 * SCALE},
+{0x1.9a14a111fc4c9p-1, 0x1.47ec0b6df5526p-2 * SCALE},
+{0x1.901131f5b2fdcp-1, 0x1.6c71762280f1p-2 * SCALE},
+{0x1.8687f73f6d865p-1, 0x1.90155070798dap-2 * SCALE},
+{0x1.7d7067eb77986p-1, 0x1.b2e23b1d3068cp-2 * SCALE},
+{0x1.74c2c1cf97b65p-1, 0x1.d4e21b0daa86ap-2 * SCALE},
+{0x1.6c77f37cff2a1p-1, 0x1.f61e2a2f67f3fp-2 * SCALE},
+ },
+/* rel err: 1.5 * 2^-30 */
+ .poly = {
+ -0x1.6ff5daa3b3d7cp-2 * SCALE,
+ 0x1.ec81d03c01aebp-2 * SCALE,
+ -0x1.71547bb43f101p-1 * SCALE,
+ 0x1.7154764a815cbp0 * SCALE,
+ }
+#endif
+};
+
+static const struct v_exp2f_data v__exp2f_data = {
+ .tab = {
+0x3ff0000000000000, 0x3fefd9b0d3158574, 0x3fefb5586cf9890f, 0x3fef9301d0125b51,
+0x3fef72b83c7d517b, 0x3fef54873168b9aa, 0x3fef387a6e756238, 0x3fef1e9df51fdee1,
+0x3fef06fe0a31b715, 0x3feef1a7373aa9cb, 0x3feedea64c123422, 0x3feece086061892d,
+0x3feebfdad5362a27, 0x3feeb42b569d4f82, 0x3feeab07dd485429, 0x3feea47eb03a5585,
+0x3feea09e667f3bcd, 0x3fee9f75e8ec5f74, 0x3feea11473eb0187, 0x3feea589994cce13,
+0x3feeace5422aa0db, 0x3feeb737b0cdc5e5, 0x3feec49182a3f090, 0x3feed503b23e255d,
+0x3feee89f995ad3ad, 0x3feeff76f2fb5e47, 0x3fef199bdd85529c, 0x3fef3720dcef9069,
+0x3fef5818dcfba487, 0x3fef7c97337b9b5f, 0x3fefa4afa2a490da, 0x3fefd0765b6e4540,
+ },
+/* rel err: 1.69 * 2^-34 */
+ .poly = {
+0x1.c6af84b912394p-5/SCALE/SCALE/SCALE, 0x1.ebfce50fac4f3p-3/SCALE/SCALE, 0x1.62e42ff0c52d6p-1/SCALE
+ },
+};
+
+VPCS_ATTR
+__attribute__ ((noinline)) static v_f32_t
+specialcase (v_f32_t x, v_f32_t y, v_f32_t ret, v_u32_t cmp)
+{
+ return v_call2_f32 (powf, x, y, ret, cmp);
+}
+
+VPCS_ATTR
+v_f32_t
+V_NAME(powf) (v_f32_t x, v_f32_t y)
+{
+ v_u32_t u, tmp, cmp, i, top, iz;
+ v_s32_t k;
+ v_f32_t ret;
+
+ u = v_as_u32_f32 (x);
+ cmp = v_cond_u32 (u - Min >= Max - Min);
+ tmp = u - OFF;
+ i = (tmp >> (23 - TBITS)) % (1 << TBITS);
+ top = tmp & 0xff800000;
+ iz = u - top;
+ k = v_as_s32_u32 (top) >> (23 - SBITS); /* arithmetic shift */
+
+ for (int lane = 0; lane < v_lanes32 (); lane++)
+ {
+ uint32_t si, siz;
+ int32_t sk;
+ float sy;
+
+ /* Use double precision for each lane. */
+ double invc, logc, z, r, p, y0, logx, ylogx, kd, s;
+ uint64_t ki, t;
+
+ si = v_get_u32 (i, lane);
+ siz = v_get_u32 (iz, lane);
+ sk = v_get_s32 (k, lane);
+ sy = v_get_f32 (y, lane);
+
+ invc = Tlog[si].invc;
+ logc = Tlog[si].logc;
+ z = (double) as_f32_u32 (siz);
+
+ /* log2(x) = log1p(z/c-1)/ln2 + log2(c) + k */
+ r = __builtin_fma (z, invc, -1.0);
+ y0 = logc + (double) sk;
+
+ /* Polynomial to approximate log1p(r)/ln2. */
+#if LOGDEG == 5
+ logx = A[0];
+ logx = r * logx + A[1];
+ logx = r * logx + A[2];
+ logx = r * logx + A[3];
+ logx = r * logx + A[4];
+ logx = r * logx + y0;
+#elif LOGDEG == 4
+ logx = A[0];
+ logx = r * logx + A[1];
+ logx = r * logx + A[2];
+ logx = r * logx + A[3];
+ logx = r * logx + y0;
+#endif
+ ylogx = sy * logx;
+ v_set_u32 (&cmp, lane,
+ (as_u64_f64 (ylogx) >> 47 & 0xffff)
+ >= as_u64_f64 (126.0 * (1 << SBITS)) >> 47
+ ? 1
+ : v_get_u32 (cmp, lane));
+
+ /* N*x = k + r with r in [-1/2, 1/2] */
+#if TOINT_INTRINSICS
+ kd = roundtoint (ylogx); /* k */
+ ki = converttoint (ylogx);
+#else
+# define SHIFT 0x1.8p52
+ kd = eval_as_double (ylogx + SHIFT);
+ ki = asuint64 (kd);
+ kd -= SHIFT;
+#endif
+ r = ylogx - kd;
+
+ /* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1) */
+ t = Texp[ki % (1 << SBITS)];
+ t += ki << (52 - SBITS);
+ s = as_f64_u64 (t);
+ p = C[0];
+ p = __builtin_fma (p, r, C[1]);
+ p = __builtin_fma (p, r, C[2]);
+ p = __builtin_fma (p, s * r, s);
+
+ v_set_f32 (&ret, lane, p);
+ }
+ if (unlikely (v_any_u32 (cmp)))
+ return specialcase (x, y, ret, cmp);
+ return ret;
+}
+VPCS_ALIAS
+#endif
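
The non-TOINT_INTRINSICS path above rounds ylogx with the 0x1.8p52 shift trick. A scalar sketch of the idea (an illustration only, assuming the default round-to-nearest mode and an argument well below 2^51 in magnitude):

#include <stdint.h>
#include <string.h>

/* Adding 1.5*2^52 forces rounding at the integer boundary, so the FP add
   itself computes nearbyint (x); the rounded integer sits in the low
   mantissa bits of the sum (the caller above only uses it modulo a small
   power of two), and subtracting the shift recovers it as a double.  */
static double
round_and_convert (double x, uint64_t *ki)
{
  const double SHIFT = 0x1.8p52;
  double kd = x + SHIFT;
  memcpy (ki, &kd, sizeof *ki); /* same role as asuint64 (kd) */
  return kd - SHIFT;            /* == nearbyint (x) */
}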
diff --git a/math/v_sin.c b/math/v_sin.c
new file mode 100644
index 0000000000000000000000000000000000000000..2b9ed059189ca0402c8ec93f915fa6d3ed11be88
--- /dev/null
+++ b/math/v_sin.c
@@ -0,0 +1,86 @@
+/*
+ * Double-precision vector sin function.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+#if V_SUPPORTED
+
+static const double Poly[] = {
+/* worst-case error is 3.5 ulp.
+ abs error: 0x1.be222a58p-53 in [-pi/2, pi/2]. */
+-0x1.9f4a9c8b21dc9p-41,
+ 0x1.60e88a10163f2p-33,
+-0x1.ae6361b7254e7p-26,
+ 0x1.71de382e8d62bp-19,
+-0x1.a01a019aeb4ffp-13,
+ 0x1.111111110b25ep-7,
+-0x1.55555555554c3p-3,
+};
+
+#define C7 v_f64 (Poly[0])
+#define C6 v_f64 (Poly[1])
+#define C5 v_f64 (Poly[2])
+#define C4 v_f64 (Poly[3])
+#define C3 v_f64 (Poly[4])
+#define C2 v_f64 (Poly[5])
+#define C1 v_f64 (Poly[6])
+
+#define InvPi v_f64 (0x1.45f306dc9c883p-2)
+#define Pi1 v_f64 (0x1.921fb54442d18p+1)
+#define Pi2 v_f64 (0x1.1a62633145c06p-53)
+#define Pi3 v_f64 (0x1.c1cd129024e09p-106)
+#define Shift v_f64 (0x1.8p52)
+#define RangeVal v_f64 (0x1p23)
+#define AbsMask v_u64 (0x7fffffffffffffff)
+
+VPCS_ATTR
+__attribute__ ((noinline)) static v_f64_t
+specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp)
+{
+ return v_call_f64 (sin, x, y, cmp);
+}
+
+VPCS_ATTR
+v_f64_t
+V_NAME(sin) (v_f64_t x)
+{
+ v_f64_t n, r, r2, y;
+ v_u64_t sign, odd, cmp;
+
+ r = v_as_f64_u64 (v_as_u64_f64 (x) & AbsMask);
+ sign = v_as_u64_f64 (x) & ~AbsMask;
+ cmp = v_cond_u64 (v_as_u64_f64 (r) >= v_as_u64_f64 (RangeVal));
+
+ /* n = rint(|x|/pi). */
+ n = v_fma_f64 (InvPi, r, Shift);
+ odd = v_as_u64_f64 (n) << 63;
+ n -= Shift;
+
+ /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
+ r = v_fma_f64 (-Pi1, n, r);
+ r = v_fma_f64 (-Pi2, n, r);
+ r = v_fma_f64 (-Pi3, n, r);
+
+ /* sin(r) poly approx. */
+ r2 = r * r;
+ y = v_fma_f64 (C7, r2, C6);
+ y = v_fma_f64 (y, r2, C5);
+ y = v_fma_f64 (y, r2, C4);
+ y = v_fma_f64 (y, r2, C3);
+ y = v_fma_f64 (y, r2, C2);
+ y = v_fma_f64 (y, r2, C1);
+ y = v_fma_f64 (y * r2, r, r);
+
+ /* sign. */
+ y = v_as_f64_u64 (v_as_u64_f64 (y) ^ sign ^ odd);
+
+ if (unlikely (v_any_u64 (cmp)))
+ return specialcase (x, y, cmp);
+ return y;
+}
+VPCS_ALIAS
+#endif
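
The odd = v_as_u64_f64 (n) << 63 line above relies on n still carrying the Shift bias: its low mantissa bit is then n & 1, and sin(|x| - n*pi) = (-1)^n sin(r), so moving that bit to position 63 yields the word XORed into the result. A scalar model (illustrative, not the shipped code):

#include <stdint.h>
#include <string.h>

static double
apply_odd_sign (double n_with_shift, double y)
{
  uint64_t n_bits, y_bits;
  memcpy (&n_bits, &n_with_shift, sizeof n_bits);
  memcpy (&y_bits, &y, sizeof y_bits);
  y_bits ^= n_bits << 63;       /* flip the sign when n is odd */
  memcpy (&y, &y_bits, sizeof y);
  return y;
}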
diff --git a/math/v_sinf.c b/math/v_sinf.c
new file mode 100644
index 0000000000000000000000000000000000000000..e66bfce6d8aa4888cfe610d2c7250a144366091b
--- /dev/null
+++ b/math/v_sinf.c
@@ -0,0 +1,75 @@
+/*
+ * Single-precision vector sin function.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+#if V_SUPPORTED
+
+static const float Poly[] = {
+ /* 1.886 ulp error */
+ 0x1.5b2e76p-19f,
+ -0x1.9f42eap-13f,
+ 0x1.110df4p-7f,
+ -0x1.555548p-3f,
+};
+#define Pi1 v_f32 (0x1.921fb6p+1f)
+#define Pi2 v_f32 (-0x1.777a5cp-24f)
+#define Pi3 v_f32 (-0x1.ee59dap-49f)
+#define A3 v_f32 (Poly[3])
+#define A5 v_f32 (Poly[2])
+#define A7 v_f32 (Poly[1])
+#define A9 v_f32 (Poly[0])
+#define RangeVal v_f32 (0x1p20f)
+#define InvPi v_f32 (0x1.45f306p-2f)
+#define Shift v_f32 (0x1.8p+23f)
+#define AbsMask v_u32 (0x7fffffff)
+
+VPCS_ATTR
+static v_f32_t
+specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp)
+{
+ /* Fall back to scalar code. */
+ return v_call_f32 (sinf, x, y, cmp);
+}
+
+VPCS_ATTR
+v_f32_t
+V_NAME(sinf) (v_f32_t x)
+{
+ v_f32_t n, r, r2, y;
+ v_u32_t sign, odd, cmp;
+
+ r = v_as_f32_u32 (v_as_u32_f32 (x) & AbsMask);
+ sign = v_as_u32_f32 (x) & ~AbsMask;
+ cmp = v_cond_u32 (v_as_u32_f32 (r) >= v_as_u32_f32 (RangeVal));
+
+ /* n = rint(|x|/pi) */
+ n = v_fma_f32 (InvPi, r, Shift);
+ odd = v_as_u32_f32 (n) << 31;
+ n -= Shift;
+
+ /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2) */
+ r = v_fma_f32 (-Pi1, n, r);
+ r = v_fma_f32 (-Pi2, n, r);
+ r = v_fma_f32 (-Pi3, n, r);
+
+ /* y = sin(r) */
+ r2 = r * r;
+ y = v_fma_f32 (A9, r2, A7);
+ y = v_fma_f32 (y, r2, A5);
+ y = v_fma_f32 (y, r2, A3);
+ y = v_fma_f32 (y * r2, r, r);
+
+ /* sign fix */
+ y = v_as_f32_u32 (v_as_u32_f32 (y) ^ sign ^ odd);
+
+ if (unlikely (v_any_u32 (cmp)))
+ return specialcase (x, y, cmp);
+ return y;
+}
+VPCS_ALIAS
+#endif
diff --git a/math/vn_cos.c b/math/vn_cos.c
new file mode 100644
index 0000000000000000000000000000000000000000..b57a549eba68b3c9dba8a4f06a68fb80c73352c1
--- /dev/null
+++ b/math/vn_cos.c
@@ -0,0 +1,12 @@
+/*
+ * AdvSIMD vector PCS variant of __v_cos.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+#include "mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#define VPCS_ALIAS strong_alias (__vn_cos, _ZGVnN2v_cos)
+#include "v_cos.c"
+#endif
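
Each vn_*.c wrapper only sets VPCS, defines the _ZGVnN alias, and re-includes the generic source. strong_alias itself is not shown in this patch; a typical definition (an assumption to check against the library's internal headers, not quoted from them) is:

/* Hypothetical: emit aliasname as an alias of name with the same type.  */
#define strong_alias(name, aliasname) \
  extern __typeof (name) aliasname __attribute__ ((alias (#name)));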
diff --git a/math/vn_cosf.c b/math/vn_cosf.c
new file mode 100644
index 0000000000000000000000000000000000000000..6321d4620fa700ece0d12e0ccd2445fbd4a299ec
--- /dev/null
+++ b/math/vn_cosf.c
@@ -0,0 +1,12 @@
+/*
+ * AdvSIMD vector PCS variant of __v_cosf.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+#include "mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#define VPCS_ALIAS strong_alias (__vn_cosf, _ZGVnN4v_cosf)
+#include "v_cosf.c"
+#endif
diff --git a/math/vn_exp.c b/math/vn_exp.c
new file mode 100644
index 0000000000000000000000000000000000000000..06e269d41766bbc7040fdd92cde5782142db0d57
--- /dev/null
+++ b/math/vn_exp.c
@@ -0,0 +1,12 @@
+/*
+ * AdvSIMD vector PCS variant of __v_exp.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+#include "mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#define VPCS_ALIAS strong_alias (__vn_exp, _ZGVnN2v_exp)
+#include "v_exp.c"
+#endif
diff --git a/math/vn_exp2f.c b/math/vn_exp2f.c
new file mode 100644
index 0000000000000000000000000000000000000000..db9707e86f16f94ce8d05149a58efd6fa518de14
--- /dev/null
+++ b/math/vn_exp2f.c
@@ -0,0 +1,12 @@
+/*
+ * AdvSIMD vector PCS variant of __v_exp2f.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+#include "mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#define VPCS_ALIAS strong_alias (__vn_exp2f, _ZGVnN4v_exp2f)
+#include "v_exp2f.c"
+#endif
diff --git a/math/vn_exp2f_1u.c b/math/vn_exp2f_1u.c
new file mode 100644
index 0000000000000000000000000000000000000000..17bd0abd7a60450f157462def7fb66b450044a75
--- /dev/null
+++ b/math/vn_exp2f_1u.c
@@ -0,0 +1,11 @@
+/*
+ * AdvSIMD vector PCS variant of __v_exp2f_1u.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+#include "mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#include "v_exp2f_1u.c"
+#endif
diff --git a/math/vn_expf.c b/math/vn_expf.c
new file mode 100644
index 0000000000000000000000000000000000000000..0652907225d94898aa9034b86bb2b361e0ea3586
--- /dev/null
+++ b/math/vn_expf.c
@@ -0,0 +1,12 @@
+/*
+ * AdvSIMD vector PCS variant of __v_expf.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+#include "mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#define VPCS_ALIAS strong_alias (__vn_expf, _ZGVnN4v_expf)
+#include "v_expf.c"
+#endif
diff --git a/math/vn_expf_1u.c b/math/vn_expf_1u.c
new file mode 100644
index 0000000000000000000000000000000000000000..3be7768148225aa7756bd5f19a2dd026ab2d35f5
--- /dev/null
+++ b/math/vn_expf_1u.c
@@ -0,0 +1,11 @@
+/*
+ * AdvSIMD vector PCS variant of __v_expf_1u.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+#include "mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#include "v_expf_1u.c"
+#endif
diff --git a/math/vn_log.c b/math/vn_log.c
new file mode 100644
index 0000000000000000000000000000000000000000..b58fe8ff820a7bb49aafb18d0d287c45d35f6aff
--- /dev/null
+++ b/math/vn_log.c
@@ -0,0 +1,12 @@
+/*
+ * AdvSIMD vector PCS variant of __v_log.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+#include "mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#define VPCS_ALIAS strong_alias (__vn_log, _ZGVnN2v_log)
+#include "v_log.c"
+#endif
diff --git a/math/vn_logf.c b/math/vn_logf.c
new file mode 100644
index 0000000000000000000000000000000000000000..cc5b8ae3ed55fec377883dd1dfabb4c678e3c48e
--- /dev/null
+++ b/math/vn_logf.c
@@ -0,0 +1,12 @@
+/*
+ * AdvSIMD vector PCS variant of __v_logf.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+#include "mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#define VPCS_ALIAS strong_alias (__vn_logf, _ZGVnN4v_logf)
+#include "v_logf.c"
+#endif
diff --git a/math/vn_pow.c b/math/vn_pow.c
new file mode 100644
index 0000000000000000000000000000000000000000..260950113b04016a2b8425b6a6333be1830248c1
--- /dev/null
+++ b/math/vn_pow.c
@@ -0,0 +1,12 @@
+/*
+ * AdvSIMD vector PCS variant of __v_pow.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+#include "mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#define VPCS_ALIAS strong_alias (__vn_pow, _ZGVnN2vv_pow)
+#include "v_pow.c"
+#endif
diff --git a/math/vn_powf.c b/math/vn_powf.c
new file mode 100644
index 0000000000000000000000000000000000000000..095d07e337ad27d26699a4159be158a756e2d79a
--- /dev/null
+++ b/math/vn_powf.c
@@ -0,0 +1,12 @@
+/*
+ * AdvSIMD vector PCS variant of __v_powf.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+#include "mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#define VPCS_ALIAS strong_alias (__vn_powf, _ZGVnN4vv_powf)
+#include "v_powf.c"
+#endif
diff --git a/math/vn_sin.c b/math/vn_sin.c
new file mode 100644
index 0000000000000000000000000000000000000000..905c7962335029212e84676883f9e275b06c56a4
--- /dev/null
+++ b/math/vn_sin.c
@@ -0,0 +1,12 @@
+/*
+ * AdvSIMD vector PCS variant of __v_sin.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+#include "mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#define VPCS_ALIAS strong_alias (__vn_sin, _ZGVnN2v_sin)
+#include "v_sin.c"
+#endif
diff --git a/math/vn_sinf.c b/math/vn_sinf.c
new file mode 100644
index 0000000000000000000000000000000000000000..1214e1a556385b12e1e90bf74ed3e5828f8182d5
--- /dev/null
+++ b/math/vn_sinf.c
@@ -0,0 +1,12 @@
+/*
+ * AdvSIMD vector PCS variant of __v_sinf.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+#include "mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#define VPCS_ALIAS strong_alias (__vn_sinf, _ZGVnN4v_sinf)
+#include "v_sinf.c"
+#endif
diff --git a/networking/Dir.mk b/networking/Dir.mk
index 2589e0a1f91c47b76a50bf78e1c7aa01d3ec495f..b49610341171f43700b2af195fe7b4c7f2402af7 100644
--- a/networking/Dir.mk
+++ b/networking/Dir.mk
@@ -1,7 +1,7 @@
# Makefile fragment - requires GNU make
#
# Copyright (c) 2019-2020, Arm Limited.
-# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+# SPDX-License-Identifier: MIT
S := $(srcdir)/networking
B := build/networking
diff --git a/networking/aarch64/chksum_simd.c b/networking/aarch64/chksum_simd.c
index 90c00eb7cabe5a0f3e28b6e8f94c17e9f5750334..6d5be58b1f32d1d49482129a62d7c40e715f9d4f 100644
--- a/networking/aarch64/chksum_simd.c
+++ b/networking/aarch64/chksum_simd.c
@@ -2,7 +2,7 @@
* AArch64-specific checksum implementation using NEON
*
* Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * SPDX-License-Identifier: MIT
*/
#include "networking.h"
diff --git a/networking/arm/chksum_simd.c b/networking/arm/chksum_simd.c
index ae08fe5dd0566632cfffdcf245c4d3915884cbd3..7f69adfc963c375221bf1d661f2b6f37e5fc56c9 100644
--- a/networking/arm/chksum_simd.c
+++ b/networking/arm/chksum_simd.c
@@ -2,7 +2,7 @@
* Armv7-A specific checksum implementation using NEON
*
* Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * SPDX-License-Identifier: MIT
*/
#include "networking.h"
diff --git a/networking/chksum.c b/networking/chksum.c
index 329482ffdcee963b4deed851ce56af0f0748b6b8..95ce5baa94e43e9008e2b0750713cf0efb77e7ed 100644
--- a/networking/chksum.c
+++ b/networking/chksum.c
@@ -3,7 +3,7 @@
* This sum is often used as a simple checksum in networking.
*
* Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * SPDX-License-Identifier: MIT
*/
#include "networking.h"
diff --git a/networking/chksum_common.h b/networking/chksum_common.h
index 16f0f6c11df7015ed0a87e0032685a69c74c154f..958c8cc0742e7fb2b58e2bda236f836f69715ee9 100644
--- a/networking/chksum_common.h
+++ b/networking/chksum_common.h
@@ -2,7 +2,7 @@
* Common code for checksum implementations
*
* Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * SPDX-License-Identifier: MIT
*/
#ifndef CHKSUM_COMMON_H
diff --git a/networking/include/networking.h b/networking/include/networking.h
index 297dd4bfab0234ceabf663f5e39552b1e08f63ac..a88feff883394ef5c4d7bb840813d5af7f584e90 100644
--- a/networking/include/networking.h
+++ b/networking/include/networking.h
@@ -2,7 +2,7 @@
* Public API.
*
* Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * SPDX-License-Identifier: MIT
*/
unsigned short __chksum (const void *, unsigned int);
diff --git a/networking/test/chksum.c b/networking/test/chksum.c
index 239b5b88777be2a4870b4fd65fc29ddadc5ba11a..41b98120f2758b54b8d13122caffb00224cc3139 100644
--- a/networking/test/chksum.c
+++ b/networking/test/chksum.c
@@ -2,7 +2,7 @@
* Ones' complement checksum test & benchmark
*
* Copyright (c) 2016-2020, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * SPDX-License-Identifier: MIT
*/
#define _GNU_SOURCE
diff --git a/string/Dir.mk b/string/Dir.mk
index 40ff5acc093e9d042afdcb6748aa540da6970816..cf3453f7580d381464b4ebb5eacfe1306a427822 100644
--- a/string/Dir.mk
+++ b/string/Dir.mk
@@ -1,7 +1,7 @@
# Makefile fragment - requires GNU make
#
# Copyright (c) 2019-2021, Arm Limited.
-# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+# SPDX-License-Identifier: MIT
S := $(srcdir)/string
B := build/string
diff --git a/string/README.contributors b/string/README.contributors
deleted file mode 100644
index 0b4a51b563669a48e24d35135eb4ef50293ef2af..0000000000000000000000000000000000000000
--- a/string/README.contributors
+++ /dev/null
@@ -1,30 +0,0 @@
-STYLE REQUIREMENTS
-==================
-
-1. Most code in this sub-directory is expected to be upstreamed into glibc so
- the GNU Coding Standard and glibc specific conventions should be followed
- to ease upstreaming.
-
-2. ABI and symbols: the code should be written so it is suitable for inclusion
- into a libc with minimal changes. This e.g. means that internal symbols
- should be hidden and in the implementation reserved namespace according to
- ISO C and POSIX rules. If possible the built shared libraries and static
- library archives should be usable to override libc symbols at link time (or
- at runtime via LD_PRELOAD). This requires the symbols to follow the glibc ABI
- (other than symbol versioning), this cannot be done reliably for static
- linking so this is a best effort requirement.
-
-3. API: include headers should be suitable for benchmarking and testing code
- and should not conflict with libc headers.
-
-
-CONTRIBUTION GUIDELINES FOR string SUB-DIRECTORY
-================================================
-1. Code:
- - The assumptions of the code must be clearly documented.
-
- - Assembly style should be consistent across different implementations.
-
-
-2. Performance:
- - Benchmarking is needed on several microarchitectures.
diff --git a/string/aarch64/__mtag_tag_region.S b/string/aarch64/__mtag_tag_region.S
index 207e22950c6d3c4e42c20460cf4a3b1d7fde9eec..84339f73cf23770b991c15e62eaba4b186a3201e 100644
--- a/string/aarch64/__mtag_tag_region.S
+++ b/string/aarch64/__mtag_tag_region.S
@@ -1,8 +1,8 @@
/*
* __mtag_tag_region - tag memory
*
- * Copyright (c) 2021-2022, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * Copyright (c) 2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
*/
/* Assumptions:
@@ -15,7 +15,7 @@
* The memory region may remain untagged if tagging is not enabled.
*/
-#include "asmdefs.h"
+#include "../asmdefs.h"
#if __ARM_FEATURE_MEMORY_TAGGING
diff --git a/string/aarch64/__mtag_tag_zero_region.S b/string/aarch64/__mtag_tag_zero_region.S
index 44b8e0114f4265d1ba02acb0a8622ca27a9a6973..f58364ca6fcb8c11b548b4288efdd21c716d5866 100644
--- a/string/aarch64/__mtag_tag_zero_region.S
+++ b/string/aarch64/__mtag_tag_zero_region.S
@@ -1,8 +1,8 @@
/*
* __mtag_tag_zero_region - tag memory and fill it with zero bytes
*
- * Copyright (c) 2021-2022, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * Copyright (c) 2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
*/
/* Assumptions:
@@ -15,7 +15,7 @@
* The memory region may remain untagged if tagging is not enabled.
*/
-#include "asmdefs.h"
+#include "../asmdefs.h"
#if __ARM_FEATURE_MEMORY_TAGGING
diff --git a/string/aarch64/asmdefs.h b/string/aarch64/asmdefs.h
deleted file mode 100644
index 131b95e1fea98f789a678ce075846425cf0a24e6..0000000000000000000000000000000000000000
--- a/string/aarch64/asmdefs.h
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * Macros for asm code. AArch64 version.
- *
- * Copyright (c) 2019-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#ifndef _ASMDEFS_H
-#define _ASMDEFS_H
-
-/* Branch Target Identitication support. */
-#define BTI_C hint 34
-#define BTI_J hint 36
-/* Return address signing support (pac-ret). */
-#define PACIASP hint 25; .cfi_window_save
-#define AUTIASP hint 29; .cfi_window_save
-
-/* GNU_PROPERTY_AARCH64_* macros from elf.h. */
-#define FEATURE_1_AND 0xc0000000
-#define FEATURE_1_BTI 1
-#define FEATURE_1_PAC 2
-
-/* Add a NT_GNU_PROPERTY_TYPE_0 note. */
-#ifdef __ILP32__
-#define GNU_PROPERTY(type, value) \
- .section .note.gnu.property, "a"; \
- .p2align 2; \
- .word 4; \
- .word 12; \
- .word 5; \
- .asciz "GNU"; \
- .word type; \
- .word 4; \
- .word value; \
- .text
-#else
-#define GNU_PROPERTY(type, value) \
- .section .note.gnu.property, "a"; \
- .p2align 3; \
- .word 4; \
- .word 16; \
- .word 5; \
- .asciz "GNU"; \
- .word type; \
- .word 4; \
- .word value; \
- .word 0; \
- .text
-#endif
-
-/* If set then the GNU Property Note section will be added to
- mark objects to support BTI and PAC-RET. */
-#ifndef WANT_GNU_PROPERTY
-#define WANT_GNU_PROPERTY 1
-#endif
-
-#if WANT_GNU_PROPERTY
-/* Add property note with supported features to all asm files. */
-GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC)
-#endif
-
-#define ENTRY_ALIGN(name, alignment) \
- .global name; \
- .type name,%function; \
- .align alignment; \
- name: \
- .cfi_startproc; \
- BTI_C;
-
-#define ENTRY(name) ENTRY_ALIGN(name, 6)
-
-#define ENTRY_ALIAS(name) \
- .global name; \
- .type name,%function; \
- name:
-
-#define END(name) \
- .cfi_endproc; \
- .size name, .-name;
-
-#define L(l) .L ## l
-
-#ifdef __ILP32__
- /* Sanitize padding bits of pointer arguments as per aapcs64 */
-#define PTR_ARG(n) mov w##n, w##n
-#else
-#define PTR_ARG(n)
-#endif
-
-#ifdef __ILP32__
- /* Sanitize padding bits of size arguments as per aapcs64 */
-#define SIZE_ARG(n) mov w##n, w##n
-#else
-#define SIZE_ARG(n)
-#endif
-
-/* Compiler supports SVE instructions */
-#ifndef HAVE_SVE
-# if __aarch64__ && (__GNUC__ >= 8 || __clang_major__ >= 5)
-# define HAVE_SVE 1
-# else
-# define HAVE_SVE 0
-# endif
-#endif
-
-#endif
diff --git a/string/aarch64/check-arch.S b/string/aarch64/check-arch.S
index 131b7fa36ec2dda1154e4435f53c5f45d6af1baf..5a54242d7de62303fe852f099f7025f32eac9f63 100644
--- a/string/aarch64/check-arch.S
+++ b/string/aarch64/check-arch.S
@@ -1,8 +1,8 @@
/*
* check ARCH setting.
*
- * Copyright (c) 2020-2022, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
*/
#if !__aarch64__
@@ -10,4 +10,4 @@
#endif
/* Include for GNU property notes. */
-#include "asmdefs.h"
+#include "../asmdefs.h"
diff --git a/string/aarch64/memchr-mte.S b/string/aarch64/memchr-mte.S
index 948c3cbc7dd43a773d035c9fcf364d994fe3b5a8..c2e967d1004e06e372725f5cc8ddb95aeb629aa2 100644
--- a/string/aarch64/memchr-mte.S
+++ b/string/aarch64/memchr-mte.S
@@ -1,8 +1,8 @@
/*
* memchr - find a character in a memory zone
*
- * Copyright (c) 2020-2022, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
*/
/* Assumptions:
@@ -11,7 +11,7 @@
* MTE compatible.
*/
-#include "asmdefs.h"
+#include "../asmdefs.h"
#define srcin x0
#define chrin w1
@@ -23,21 +23,25 @@
#define synd x5
#define shift x6
#define tmp x7
+#define wtmp w7
#define vrepchr v0
#define qdata q1
#define vdata v1
#define vhas_chr v2
-#define vend v3
-#define dend d3
+#define vrepmask v3
+#define vend v4
+#define dend d4
/*
Core algorithm:
- For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
- per byte. We take 4 bits of every comparison byte with shift right and narrow
- by 4 instruction. Since the bits in the nibble mask reflect the order in
- which things occur in the original string, counting leading zeros identifies
- exactly which byte matched. */
+
+ For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
+ per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
+ requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
+ set likewise for odd bytes so that adjacent bytes can be merged. Since the
+ bits in the syndrome reflect the order in which things occur in the original
+ string, counting trailing zeros identifies exactly which byte matched. */
ENTRY (__memchr_aarch64_mte)
PTR_ARG (0)
@@ -46,53 +50,55 @@ ENTRY (__memchr_aarch64_mte)
cbz cntin, L(nomatch)
ld1 {vdata.16b}, [src]
dup vrepchr.16b, chrin
+ mov wtmp, 0xf00f
+ dup vrepmask.8h, wtmp
cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
lsl shift, srcin, 2
- shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
+ and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
+ addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
fmov synd, dend
lsr synd, synd, shift
cbz synd, L(start_loop)
rbit synd, synd
clz synd, synd
- cmp cntin, synd, lsr 2
add result, srcin, synd, lsr 2
+ cmp cntin, synd, lsr 2
csel result, result, xzr, hi
ret
- .p2align 3
L(start_loop):
sub tmp, src, srcin
- add tmp, tmp, 17
+ add tmp, tmp, 16
subs cntrem, cntin, tmp
- b.lo L(nomatch)
+ b.ls L(nomatch)
/* Make sure that it won't overread by a 16-byte chunk */
- tbz cntrem, 4, L(loop32_2)
- sub src, src, 16
+ add tmp, cntrem, 15
+ tbnz tmp, 4, L(loop32_2)
+
.p2align 4
L(loop32):
- ldr qdata, [src, 32]!
+ ldr qdata, [src, 16]!
cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
fmov synd, dend
cbnz synd, L(end)
L(loop32_2):
- ldr qdata, [src, 16]
- cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ ldr qdata, [src, 16]!
subs cntrem, cntrem, 32
- b.lo L(end_2)
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ b.ls L(end)
umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
fmov synd, dend
cbz synd, L(loop32)
-L(end_2):
- add src, src, 16
L(end):
- shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
- sub cntrem, src, srcin
+ and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
+ addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
fmov synd, dend
- sub cntrem, cntin, cntrem
+ add tmp, srcin, cntin
+ sub cntrem, tmp, src
#ifndef __AARCH64EB__
rbit synd, synd
#endif
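
The syndrome scheme the new comment describes can be modelled in plain C. This sketch is illustrative only (the shipped code builds the value with cmeq, a 0xf00f mask and addp), but it shows why counting trailing zeros locates the first match:

#include <stdint.h>

/* 4 bits per byte: set a nibble wherever the byte matches, then
   ctz / 4 is the index of the first matching byte.  */
static int
first_match (const unsigned char chunk[16], unsigned char c)
{
  uint64_t synd = 0;
  for (int i = 0; i < 16; i++)
    if (chunk[i] == c)
      synd |= (uint64_t) 0xf << (4 * i);
  return synd ? __builtin_ctzll (synd) / 4 : -1;
}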
diff --git a/string/aarch64/memchr-sve.S b/string/aarch64/memchr-sve.S
index b851cf31f2383e874c96b24ba82006d82e52f060..c22e6596f19bdde2e6ced26a3ca11e99c0c5b7f5 100644
--- a/string/aarch64/memchr-sve.S
+++ b/string/aarch64/memchr-sve.S
@@ -1,11 +1,11 @@
/*
* memchr - find a character in a memory zone
*
- * Copyright (c) 2018-2022, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * Copyright (c) 2018-2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
*/
-#include "asmdefs.h"
+#include "../asmdefs.h"
#if __ARM_FEATURE_SVE
/* Assumptions:
diff --git a/string/aarch64/memchr.S b/string/aarch64/memchr.S
index fe6cfe2bc0e28d56100536ec25186f0543b03897..353f0d1eac53098f8b8e921d12af1404ec2cf96c 100644
--- a/string/aarch64/memchr.S
+++ b/string/aarch64/memchr.S
@@ -1,8 +1,8 @@
/*
* memchr - find a character in a memory zone
*
- * Copyright (c) 2014-2022, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * Copyright (c) 2014-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
*/
/* Assumptions:
@@ -11,7 +11,7 @@
* Neon Available.
*/
-#include "asmdefs.h"
+#include "../asmdefs.h"
/* Arguments and results. */
#define srcin x0
diff --git a/string/aarch64/memcmp-sve.S b/string/aarch64/memcmp-sve.S
index d52ce4555344e5b2fcca1ebcbc8b99651c0097fb..78c5ecaa4cdcba0b826d62369d40f18afa8313d9 100644
--- a/string/aarch64/memcmp-sve.S
+++ b/string/aarch64/memcmp-sve.S
@@ -1,11 +1,11 @@
/*
* memcmp - compare memory
*
- * Copyright (c) 2018-2022, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * Copyright (c) 2018-2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
*/
-#include "asmdefs.h"
+#include "../asmdefs.h"
#if __ARM_FEATURE_SVE
/* Assumptions:
diff --git a/string/aarch64/memcmp.S b/string/aarch64/memcmp.S
index 35135e72cc8e5324ade0a2443dc17fa1098142d6..3b1026642eee805ca31d7f88b13eac082ce4b726 100644
--- a/string/aarch64/memcmp.S
+++ b/string/aarch64/memcmp.S
@@ -1,84 +1,103 @@
/* memcmp - compare memory
*
- * Copyright (c) 2013-2022, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * Copyright (c) 2013-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
*/
/* Assumptions:
*
- * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
+ * ARMv8-a, AArch64, unaligned accesses.
*/
-#include "asmdefs.h"
+#include "../asmdefs.h"
-#define src1 x0
-#define src2 x1
-#define limit x2
-#define result w0
-
-#define data1 x3
-#define data1w w3
-#define data2 x4
-#define data2w w4
-#define data3 x5
-#define data3w w5
-#define data4 x6
-#define data4w w6
-#define tmp x6
-#define src1end x7
-#define src2end x8
+/* Parameters and result. */
+#define src1 x0
+#define src2 x1
+#define limit x2
+#define result w0
+/* Internal variables. */
+#define data1 x3
+#define data1w w3
+#define data1h x4
+#define data2 x5
+#define data2w w5
+#define data2h x6
+#define tmp1 x7
+#define tmp2 x8
ENTRY (__memcmp_aarch64)
PTR_ARG (0)
PTR_ARG (1)
SIZE_ARG (2)
+ subs limit, limit, 8
+ b.lo L(less8)
- cmp limit, 16
- b.lo L(less16)
- ldp data1, data3, [src1]
- ldp data2, data4, [src2]
- ccmp data1, data2, 0, ne
- ccmp data3, data4, 0, eq
- b.ne L(return2)
-
- add src1end, src1, limit
- add src2end, src2, limit
- cmp limit, 32
- b.ls L(last_bytes)
- cmp limit, 160
- b.hs L(loop_align)
- sub limit, limit, 32
+ ldr data1, [src1], 8
+ ldr data2, [src2], 8
+ cmp data1, data2
+ b.ne L(return)
- .p2align 4
-L(loop32):
- ldp data1, data3, [src1, 16]
- ldp data2, data4, [src2, 16]
+ subs limit, limit, 8
+ b.gt L(more16)
+
+ ldr data1, [src1, limit]
+ ldr data2, [src2, limit]
+ b L(return)
+
+L(more16):
+ ldr data1, [src1], 8
+ ldr data2, [src2], 8
cmp data1, data2
- ccmp data3, data4, 0, eq
- b.ne L(return2)
- cmp limit, 16
+ bne L(return)
+
+ /* Jump directly to comparing the last 16 bytes for 32 byte (or less)
+ strings. */
+ subs limit, limit, 16
b.ls L(last_bytes)
- ldp data1, data3, [src1, 32]
- ldp data2, data4, [src2, 32]
+ /* We overlap loads between 0-32 bytes at either side of SRC1 when we
+ try to align, so limit it only to strings larger than 128 bytes. */
+ cmp limit, 96
+ b.ls L(loop16)
+
+ /* Align src1 and adjust src2 with bytes not yet done. */
+ and tmp1, src1, 15
+ add limit, limit, tmp1
+ sub src1, src1, tmp1
+ sub src2, src2, tmp1
+
+ /* Loop performing 16 bytes per iteration using aligned src1.
+ Limit is pre-decremented by 16 and must be larger than zero.
+ Exit if <= 16 bytes left to do or if the data is not equal. */
+ .p2align 4
+L(loop16):
+ ldp data1, data1h, [src1], 16
+ ldp data2, data2h, [src2], 16
+ subs limit, limit, 16
+ ccmp data1, data2, 0, hi
+ ccmp data1h, data2h, 0, eq
+ b.eq L(loop16)
+
+ cmp data1, data2
+ bne L(return)
+ mov data1, data1h
+ mov data2, data2h
cmp data1, data2
- ccmp data3, data4, 0, eq
- b.ne L(return2)
- add src1, src1, 32
- add src2, src2, 32
-L(last64):
- subs limit, limit, 32
- b.hi L(loop32)
+ bne L(return)
/* Compare last 1-16 bytes using unaligned access. */
L(last_bytes):
- ldp data1, data3, [src1end, -16]
- ldp data2, data4, [src2end, -16]
-L(return2):
+ add src1, src1, limit
+ add src2, src2, limit
+ ldp data1, data1h, [src1]
+ ldp data2, data2h, [src2]
+ cmp data1, data2
+ bne L(return)
+ mov data1, data1h
+ mov data2, data2h
cmp data1, data2
- csel data1, data1, data3, ne
- csel data2, data2, data4, ne
/* Compare data bytes and set return value to 0, -1 or 1. */
L(return):
@@ -86,105 +105,33 @@ L(return):
rev data1, data1
rev data2, data2
#endif
- cmp data1, data2
+ cmp data1, data2
+L(ret_eq):
cset result, ne
cneg result, result, lo
ret
.p2align 4
-L(less16):
- add src1end, src1, limit
- add src2end, src2, limit
- tbz limit, 3, L(less8)
- ldr data1, [src1]
- ldr data2, [src2]
- ldr data3, [src1end, -8]
- ldr data4, [src2end, -8]
- b L(return2)
-
- .p2align 4
+ /* Compare up to 8 bytes. Limit is [-8..-1]. */
L(less8):
- tbz limit, 2, L(less4)
- ldr data1w, [src1]
- ldr data2w, [src2]
- ldr data3w, [src1end, -4]
- ldr data4w, [src2end, -4]
- b L(return2)
-
-L(less4):
- tbz limit, 1, L(less2)
- ldrh data1w, [src1]
- ldrh data2w, [src2]
+ adds limit, limit, 4
+ b.lo L(less4)
+ ldr data1w, [src1], 4
+ ldr data2w, [src2], 4
cmp data1w, data2w
b.ne L(return)
-L(less2):
- mov result, 0
- tbz limit, 0, L(return_zero)
- ldrb data1w, [src1end, -1]
- ldrb data2w, [src2end, -1]
+ sub limit, limit, 4
+L(less4):
+ adds limit, limit, 4
+ beq L(ret_eq)
+L(byte_loop):
+ ldrb data1w, [src1], 1
+ ldrb data2w, [src2], 1
+ subs limit, limit, 1
+ ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */
+ b.eq L(byte_loop)
sub result, data1w, data2w
-L(return_zero):
- ret
-
-L(loop_align):
- ldp data1, data3, [src1, 16]
- ldp data2, data4, [src2, 16]
- cmp data1, data2
- ccmp data3, data4, 0, eq
- b.ne L(return2)
-
- /* Align src2 and adjust src1, src2 and limit. */
- and tmp, src2, 15
- sub tmp, tmp, 16
- sub src2, src2, tmp
- add limit, limit, tmp
- sub src1, src1, tmp
- sub limit, limit, 64 + 16
-
- .p2align 4
-L(loop64):
- ldr q0, [src1, 16]
- ldr q1, [src2, 16]
- subs limit, limit, 64
- ldr q2, [src1, 32]
- ldr q3, [src2, 32]
- eor v0.16b, v0.16b, v1.16b
- eor v1.16b, v2.16b, v3.16b
- ldr q2, [src1, 48]
- ldr q3, [src2, 48]
- umaxp v0.16b, v0.16b, v1.16b
- ldr q4, [src1, 64]!
- ldr q5, [src2, 64]!
- eor v1.16b, v2.16b, v3.16b
- eor v2.16b, v4.16b, v5.16b
- umaxp v1.16b, v1.16b, v2.16b
- umaxp v0.16b, v0.16b, v1.16b
- umaxp v0.16b, v0.16b, v0.16b
- fmov tmp, d0
- ccmp tmp, 0, 0, hi
- b.eq L(loop64)
-
- /* If equal, process last 1-64 bytes using scalar loop. */
- add limit, limit, 64 + 16
- cbz tmp, L(last64)
-
- /* Determine the 8-byte aligned offset of the first difference. */
-#ifdef __AARCH64EB__
- rev16 tmp, tmp
-#endif
- rev tmp, tmp
- clz tmp, tmp
- bic tmp, tmp, 7
- sub tmp, tmp, 48
- ldr data1, [src1, tmp]
- ldr data2, [src2, tmp]
-#ifndef __AARCH64EB__
- rev data1, data1
- rev data2, data2
-#endif
- mov result, 1
- cmp data1, data2
- cneg result, result, lo
ret
END (__memcmp_aarch64)
+
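
The shared L(return) tail byte-reverses both words on little-endian before comparing, so the lowest-addressed differing byte decides the ordering, as memcmp requires. A C model of the rev/cmp/cset/cneg sequence (illustrative only):

#include <stdint.h>

static int
cmp_word (uint64_t a, uint64_t b)
{
#ifndef __AARCH64EB__
  a = __builtin_bswap64 (a);   /* the rev instructions */
  b = __builtin_bswap64 (b);
#endif
  return (a > b) - (a < b);    /* cset/cneg: 1, -1 or 0 */
}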
diff --git a/string/aarch64/memcpy-advsimd.S b/string/aarch64/memcpy-advsimd.S
index e6527d0dac2c48c3b313f25a5d61314df87871e0..f97f2c3047b96e489ff97395173f2069469144e0 100644
--- a/string/aarch64/memcpy-advsimd.S
+++ b/string/aarch64/memcpy-advsimd.S
@@ -1,8 +1,8 @@
/*
* memcpy - copy memory area
*
- * Copyright (c) 2019-2022, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * Copyright (c) 2019-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
*/
/* Assumptions:
@@ -11,7 +11,7 @@
*
*/
-#include "asmdefs.h"
+#include "../asmdefs.h"
#define dstin x0
#define src x1
diff --git a/string/aarch64/memcpy-mops.S b/string/aarch64/memcpy-mops.S
deleted file mode 100644
index b45c31418717cd1e5cc7f29dd42aceab31d784c8..0000000000000000000000000000000000000000
--- a/string/aarch64/memcpy-mops.S
+++ /dev/null
@@ -1,21 +0,0 @@
-/*
- * memcpy using MOPS extension.
- *
- * Copyright (c) 2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "asmdefs.h"
-
-ENTRY (__memcpy_aarch64_mops)
- PTR_ARG (0)
- PTR_ARG (1)
- SIZE_ARG (2)
-
- mov x3, x0
- .inst 0x19010443 /* cpyfp [x3]!, [x1]!, x2! */
- .inst 0x19410443 /* cpyfm [x3]!, [x1]!, x2! */
- .inst 0x19810443 /* cpyfe [x3]!, [x1]!, x2! */
- ret
-
-END (__memcpy_aarch64_mops)
diff --git a/string/aarch64/memcpy-sve.S b/string/aarch64/memcpy-sve.S
deleted file mode 100644
index e8a946d7db37f44fa8b819be5cf81fe0ee5f719d..0000000000000000000000000000000000000000
--- a/string/aarch64/memcpy-sve.S
+++ /dev/null
@@ -1,177 +0,0 @@
-/*
- * memcpy - copy memory area
- *
- * Copyright (c) 2019-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-/* Assumptions:
- *
- * ARMv8-a, AArch64, Advanced SIMD, SVE, unaligned accesses.
- *
- */
-
-#include "asmdefs.h"
-
-#ifdef HAVE_SVE
-
-.arch armv8-a+sve
-
-#define dstin x0
-#define src x1
-#define count x2
-#define dst x3
-#define srcend x4
-#define dstend x5
-#define tmp1 x6
-#define vlen x6
-
-#define A_q q0
-#define B_q q1
-#define C_q q2
-#define D_q q3
-#define E_q q4
-#define F_q q5
-#define G_q q6
-#define H_q q7
-
-/* This implementation handles overlaps and supports both memcpy and memmove
- from a single entry point. It uses unaligned accesses and branchless
- sequences to keep the code small, simple and improve performance.
- SVE vectors are used to speedup small copies.
-
- Copies are split into 3 main cases: small copies of up to 32 bytes, medium
- copies of up to 128 bytes, and large copies. The overhead of the overlap
- check is negligible since it is only required for large copies.
-
- Large copies use a software pipelined loop processing 64 bytes per iteration.
- The source pointer is 16-byte aligned to minimize unaligned accesses.
- The loop tail is handled by always copying 64 bytes from the end.
-*/
-
-ENTRY_ALIAS (__memmove_aarch64_sve)
-ENTRY (__memcpy_aarch64_sve)
- PTR_ARG (0)
- PTR_ARG (1)
- SIZE_ARG (2)
-
- cmp count, 128
- b.hi L(copy_long)
- cntb vlen
- cmp count, vlen, lsl 1
- b.hi L(copy32_128)
-
- whilelo p0.b, xzr, count
- whilelo p1.b, vlen, count
- ld1b z0.b, p0/z, [src, 0, mul vl]
- ld1b z1.b, p1/z, [src, 1, mul vl]
- st1b z0.b, p0, [dstin, 0, mul vl]
- st1b z1.b, p1, [dstin, 1, mul vl]
- ret
-
- /* Medium copies: 33..128 bytes. */
-L(copy32_128):
- add srcend, src, count
- add dstend, dstin, count
- ldp A_q, B_q, [src]
- ldp C_q, D_q, [srcend, -32]
- cmp count, 64
- b.hi L(copy128)
- stp A_q, B_q, [dstin]
- stp C_q, D_q, [dstend, -32]
- ret
-
- /* Copy 65..128 bytes. */
-L(copy128):
- ldp E_q, F_q, [src, 32]
- cmp count, 96
- b.ls L(copy96)
- ldp G_q, H_q, [srcend, -64]
- stp G_q, H_q, [dstend, -64]
-L(copy96):
- stp A_q, B_q, [dstin]
- stp E_q, F_q, [dstin, 32]
- stp C_q, D_q, [dstend, -32]
- ret
-
- /* Copy more than 128 bytes. */
-L(copy_long):
- add srcend, src, count
- add dstend, dstin, count
-
- /* Use backwards copy if there is an overlap. */
- sub tmp1, dstin, src
- cmp tmp1, count
- b.lo L(copy_long_backwards)
-
- /* Copy 16 bytes and then align src to 16-byte alignment. */
- ldr D_q, [src]
- and tmp1, src, 15
- bic src, src, 15
- sub dst, dstin, tmp1
- add count, count, tmp1 /* Count is now 16 too large. */
- ldp A_q, B_q, [src, 16]
- str D_q, [dstin]
- ldp C_q, D_q, [src, 48]
- subs count, count, 128 + 16 /* Test and readjust count. */
- b.ls L(copy64_from_end)
-L(loop64):
- stp A_q, B_q, [dst, 16]
- ldp A_q, B_q, [src, 80]
- stp C_q, D_q, [dst, 48]
- ldp C_q, D_q, [src, 112]
- add src, src, 64
- add dst, dst, 64
- subs count, count, 64
- b.hi L(loop64)
-
- /* Write the last iteration and copy 64 bytes from the end. */
-L(copy64_from_end):
- ldp E_q, F_q, [srcend, -64]
- stp A_q, B_q, [dst, 16]
- ldp A_q, B_q, [srcend, -32]
- stp C_q, D_q, [dst, 48]
- stp E_q, F_q, [dstend, -64]
- stp A_q, B_q, [dstend, -32]
- ret
-
- /* Large backwards copy for overlapping copies.
- Copy 16 bytes and then align srcend to 16-byte alignment. */
-L(copy_long_backwards):
- cbz tmp1, L(return)
- ldr D_q, [srcend, -16]
- and tmp1, srcend, 15
- bic srcend, srcend, 15
- sub count, count, tmp1
- ldp A_q, B_q, [srcend, -32]
- str D_q, [dstend, -16]
- ldp C_q, D_q, [srcend, -64]
- sub dstend, dstend, tmp1
- subs count, count, 128
- b.ls L(copy64_from_start)
-
-L(loop64_backwards):
- str B_q, [dstend, -16]
- str A_q, [dstend, -32]
- ldp A_q, B_q, [srcend, -96]
- str D_q, [dstend, -48]
- str C_q, [dstend, -64]!
- ldp C_q, D_q, [srcend, -128]
- sub srcend, srcend, 64
- subs count, count, 64
- b.hi L(loop64_backwards)
-
- /* Write the last iteration and copy 64 bytes from the start. */
-L(copy64_from_start):
- ldp E_q, F_q, [src, 32]
- stp A_q, B_q, [dstend, -32]
- ldp A_q, B_q, [src]
- stp C_q, D_q, [dstend, -64]
- stp E_q, F_q, [dstin, 32]
- stp A_q, B_q, [dstin]
-L(return):
- ret
-
-END (__memcpy_aarch64_sve)
-
-#endif
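
The deleted header comment notes that one entry point serves both memcpy and memmove because the overlap check is a single unsigned compare. A C sketch of that test (illustrative; relies on the usual flat-address-space assumption):

#include <stdint.h>
#include <stddef.h>

/* If dst lies inside [src, src + count) the unsigned difference is
   below count, so the copy must run backwards; otherwise a forwards
   copy is safe even for other overlaps.  */
static int
needs_backwards_copy (const char *dst, const char *src, size_t count)
{
  return (uintptr_t) (dst - src) < (uintptr_t) count;
}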
diff --git a/string/aarch64/memcpy.S b/string/aarch64/memcpy.S
index 2b1a592feb39b5c831a908bda3f42bf3f9fc44ab..8a967cdf4d2b5c014ce0737c19e4884297cd18b7 100644
--- a/string/aarch64/memcpy.S
+++ b/string/aarch64/memcpy.S
@@ -1,8 +1,8 @@
/*
* memcpy - copy memory area
*
- * Copyright (c) 2012-2022, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * Copyright (c) 2012-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
*/
/* Assumptions:
@@ -11,7 +11,7 @@
*
*/
-#include "asmdefs.h"
+#include "../asmdefs.h"
#define dstin x0
#define src x1
diff --git a/string/aarch64/memmove-mops.S b/string/aarch64/memmove-mops.S
deleted file mode 100644
index 6c73017bb16f00ded1eaaaa5bf61fe9e68de5e9c..0000000000000000000000000000000000000000
--- a/string/aarch64/memmove-mops.S
+++ /dev/null
@@ -1,21 +0,0 @@
-/*
- * memmove using MOPS extension.
- *
- * Copyright (c) 2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "asmdefs.h"
-
-ENTRY (__memmove_aarch64_mops)
- PTR_ARG (0)
- PTR_ARG (1)
- SIZE_ARG (2)
-
- mov x3, x0
- .inst 0x1d010443 /* cpyp [x3]!, [x1]!, x2! */
- .inst 0x1d410443 /* cpym [x3]!, [x1]!, x2! */
- .inst 0x1d810443 /* cpye [x3]!, [x1]!, x2! */
- ret
-
-END (__memmove_aarch64_mops)
diff --git a/string/aarch64/memrchr.S b/string/aarch64/memrchr.S
index 6418bdf56f414880540632cd8c8257ed3d95d6d2..7b4be847cecbf93820be6ca931cf6b4569bf382f 100644
--- a/string/aarch64/memrchr.S
+++ b/string/aarch64/memrchr.S
@@ -1,8 +1,8 @@
/*
* memrchr - find last character in a memory zone.
*
- * Copyright (c) 2020-2022, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
*/
/* Assumptions:
@@ -11,7 +11,7 @@
* MTE compatible.
*/
-#include "asmdefs.h"
+#include "../asmdefs.h"
#define srcin x0
#define chrin w1
@@ -23,6 +23,7 @@
#define synd x5
#define shift x6
#define tmp x7
+#define wtmp w7
#define end x8
#define endm1 x9
@@ -30,16 +31,19 @@
#define qdata q1
#define vdata v1
#define vhas_chr v2
-#define vend v3
-#define dend d3
+#define vrepmask v3
+#define vend v4
+#define dend d4
/*
Core algorithm:
- For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
- per byte. We take 4 bits of every comparison byte with shift right and narrow
- by 4 instruction. Since the bits in the nibble mask reflect the order in
- which things occur in the original string, counting leading zeros identifies
- exactly which byte matched. */
+
+ For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
+ per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
+ requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
+ set likewise for odd bytes so that adjacent bytes can be merged. Since the
+ bits in the syndrome reflect the order in which things occur in the original
+ string, counting trailing zeros identifies exactly which byte matched. */
ENTRY (__memrchr_aarch64)
PTR_ARG (0)
@@ -49,9 +53,12 @@ ENTRY (__memrchr_aarch64)
cbz cntin, L(nomatch)
ld1 {vdata.16b}, [src]
dup vrepchr.16b, chrin
+ mov wtmp, 0xf00f
+ dup vrepmask.8h, wtmp
cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
neg shift, end, lsl 2
- shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
+ and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
+ addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
fmov synd, dend
lsl synd, synd, shift
cbz synd, L(start_loop)
@@ -62,36 +69,34 @@ ENTRY (__memrchr_aarch64)
csel result, result, xzr, hi
ret
- nop
L(start_loop):
- subs cntrem, src, srcin
+ sub tmp, end, src
+ subs cntrem, cntin, tmp
b.ls L(nomatch)
/* Make sure that it won't overread by a 16-byte chunk */
- sub cntrem, cntrem, 1
- tbz cntrem, 4, L(loop32_2)
- add src, src, 16
+ add tmp, cntrem, 15
+ tbnz tmp, 4, L(loop32_2)
- .p2align 5
+ .p2align 4
L(loop32):
- ldr qdata, [src, -32]!
+ ldr qdata, [src, -16]!
cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
fmov synd, dend
cbnz synd, L(end)
L(loop32_2):
- ldr qdata, [src, -16]
+ ldr qdata, [src, -16]!
subs cntrem, cntrem, 32
cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
- b.lo L(end_2)
+ b.ls L(end)
umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
fmov synd, dend
cbz synd, L(loop32)
-L(end_2):
- sub src, src, 16
L(end):
- shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
+ and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
+ addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
fmov synd, dend
add tmp, src, 15
diff --git a/string/aarch64/memset-mops.S b/string/aarch64/memset-mops.S
deleted file mode 100644
index ec791493bae9c019b92374f0920edbec00b10507..0000000000000000000000000000000000000000
--- a/string/aarch64/memset-mops.S
+++ /dev/null
@@ -1,20 +0,0 @@
-/*
- * memset using MOPS extension.
- *
- * Copyright (c) 2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "asmdefs.h"
-
-ENTRY (__memset_aarch64_mops)
- PTR_ARG (0)
- SIZE_ARG (2)
-
- mov x3, x0
- .inst 0x19c10443 /* setp [x3]!, x2!, x1 */
- .inst 0x19c14443 /* setm [x3]!, x2!, x1 */
- .inst 0x19c18443 /* sete [x3]!, x2!, x1 */
- ret
-
-END (__memset_aarch64_mops)
diff --git a/string/aarch64/memset.S b/string/aarch64/memset.S
index 553b0fcaefea5e5ae60c4ef583b80dc81f165ae6..9fcd97579913b025028f6728098ebd570992cb7d 100644
--- a/string/aarch64/memset.S
+++ b/string/aarch64/memset.S
@@ -1,8 +1,8 @@
/*
* memset - fill memory with a constant byte
*
- * Copyright (c) 2012-2022, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * Copyright (c) 2012-2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
*/
/* Assumptions:
@@ -11,7 +11,7 @@
*
*/
-#include "asmdefs.h"
+#include "../asmdefs.h"
#define dstin x0
#define val x1
diff --git a/string/aarch64/stpcpy-mte.S b/string/aarch64/stpcpy-mte.S
new file mode 100644
index 0000000000000000000000000000000000000000..f1c7119065152def69dabaa5edfd92ada06685f1
--- /dev/null
+++ b/string/aarch64/stpcpy-mte.S
@@ -0,0 +1,10 @@
+/*
+ * stpcpy - copy a string returning pointer to end.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#define BUILD_STPCPY 1
+
+#include "strcpy-mte.S"
diff --git a/string/aarch64/stpcpy-sve.S b/string/aarch64/stpcpy-sve.S
index 5d3f14b86026882d092f567f598ed46fdbe9447f..82dd9717b0a0af44d7a14bee1ff9de16df7a6535 100644
--- a/string/aarch64/stpcpy-sve.S
+++ b/string/aarch64/stpcpy-sve.S
@@ -2,7 +2,7 @@
* stpcpy - copy a string returning pointer to end.
*
* Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * SPDX-License-Identifier: MIT
*/
#define BUILD_STPCPY 1
diff --git a/string/aarch64/stpcpy.S b/string/aarch64/stpcpy.S
index 155c68d75a7b23a7c4f1be9cf864a7c1f1287ccd..4f62aa46238987bbbd634b3fb794433d7bd74965 100644
--- a/string/aarch64/stpcpy.S
+++ b/string/aarch64/stpcpy.S
@@ -2,7 +2,7 @@
* stpcpy - copy a string returning pointer to end.
*
* Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * SPDX-License-Identifier: MIT
*/
#define BUILD_STPCPY 1
diff --git a/string/aarch64/strchr-mte.S b/string/aarch64/strchr-mte.S
index 6ec08f7acc766b652cee0c340541f74ac01cebd7..dcb0e46258709760e7ef1c7d81e47a86457a2846 100644
--- a/string/aarch64/strchr-mte.S
+++ b/string/aarch64/strchr-mte.S
@@ -1,8 +1,8 @@
/*
* strchr - find a character in a string
*
- * Copyright (c) 2020-2022, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
*/
/* Assumptions:
@@ -11,7 +11,7 @@
* MTE compatible.
*/
-#include "asmdefs.h"
+#include "../asmdefs.h"
#define srcin x0
#define chrin w1
@@ -19,7 +19,8 @@
#define src x2
#define tmp1 x1
-#define tmp2 x3
+#define wtmp2 w3
+#define tmp3 x3
#define vrepchr v0
#define vdata v1
@@ -27,30 +28,39 @@
#define vhas_nul v2
#define vhas_chr v3
#define vrepmask v4
-#define vend v5
-#define dend d5
+#define vrepmask2 v5
+#define vend v6
+#define dend d6
/* Core algorithm.
For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
- per byte. Bits 0-1 are set if the relevant byte matched the requested
- character, bits 2-3 are set if the byte is NUL or matched. Count trailing
- zeroes gives the position of the matching byte if it is a multiple of 4.
- If it is not a multiple of 4, there was no match. */
+ per byte. For even bytes, bits 0-1 are set if the relevant byte matched the
+ requested character, bits 2-3 are set if the byte is NUL (or matched), and
+ bits 4-7 are not used and must be zero if none of bits 0-3 are set. Odd
+ bytes set bits 4-7 so that adjacent bytes can be merged. Since the bits
+ in the syndrome reflect the order in which things occur in the original
+ string, counting trailing zeros identifies exactly which byte matched. */
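A sketch of how that encoding is consumed, in C rather than the patch's assembly (names are mine; the real code reaches the same result with rbit+clz and a `tst tmp1, 2`): bits 0-1 of each four-bit group flag a character match and bits 2-3 flag the terminator, so the bit offset within the group tells the two events apart.

```c
#include <stdint.h>
#include <stddef.h>

/* Hypothetical decode: synd must be non-zero, i.e. a match or a NUL
   was seen somewhere in the 16-byte chunk starting at "chunk".  */
static const char *strchr_from_syndrome(const char *chunk, uint64_t synd)
{
    int pos = __builtin_ctzll(synd);
    if (pos & 2)                  /* bits 2-3 first: NUL came first */
        return NULL;
    return chunk + (pos >> 2);    /* bits 0-1 first: character match */
}
```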
ENTRY (__strchr_aarch64_mte)
PTR_ARG (0)
bic src, srcin, 15
dup vrepchr.16b, chrin
ld1 {vdata.16b}, [src]
- movi vrepmask.16b, 0x33
+ mov wtmp2, 0x3003
+ dup vrepmask.8h, wtmp2
cmeq vhas_nul.16b, vdata.16b, 0
cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ mov wtmp2, 0xf00f
+ dup vrepmask2.8h, wtmp2
+
bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b
- lsl tmp2, srcin, 2
- shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
+ and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
+ lsl tmp3, srcin, 2
+ addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
+
fmov tmp1, dend
- lsr tmp1, tmp1, tmp2
+ lsr tmp1, tmp1, tmp3
cbz tmp1, L(loop)
rbit tmp1, tmp1
@@ -64,34 +74,28 @@ ENTRY (__strchr_aarch64_mte)
.p2align 4
L(loop):
- ldr qdata, [src, 16]
- cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
- cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b
- umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
- fmov tmp1, dend
- cbnz tmp1, L(end)
- ldr qdata, [src, 32]!
+ ldr qdata, [src, 16]!
cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b
umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
fmov tmp1, dend
cbz tmp1, L(loop)
- sub src, src, 16
-L(end):
#ifdef __AARCH64EB__
bif vhas_nul.16b, vhas_chr.16b, vrepmask.16b
- shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
+ and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
+ addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
fmov tmp1, dend
#else
bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b
- shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
+ and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
+ addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
fmov tmp1, dend
rbit tmp1, tmp1
#endif
- add src, src, 16
clz tmp1, tmp1
- /* Tmp1 is a multiple of 4 if the target character was found. */
+ /* Tmp1 is a multiple of 4 if the target character was
+ found first. Otherwise we've found the end of the string. */
tst tmp1, 2
add result, src, tmp1, lsr 2
csel result, result, xzr, eq
diff --git a/string/aarch64/strchr-sve.S b/string/aarch64/strchr-sve.S
index ff075167bfefb7dcf66869626c28c7d58163ab7f..13ba9f44f9c5a3dd716252b0459955cfe12c3b18 100644
--- a/string/aarch64/strchr-sve.S
+++ b/string/aarch64/strchr-sve.S
@@ -1,11 +1,11 @@
/*
* strchr/strchrnul - find a character in a string
*
- * Copyright (c) 2018-2022, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * Copyright (c) 2018-2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
*/
-#include "asmdefs.h"
+#include "../asmdefs.h"
#if __ARM_FEATURE_SVE
/* Assumptions:
diff --git a/string/aarch64/strchr.S b/string/aarch64/strchr.S
index 37193bd947a73dbf7167e3b10d5ddb8e2510dd31..1063cbfd77aa817ed1502e0b2c39643fb102c16b 100644
--- a/string/aarch64/strchr.S
+++ b/string/aarch64/strchr.S
@@ -1,8 +1,8 @@
/*
* strchr - find a character in a string
*
- * Copyright (c) 2014-2022, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * Copyright (c) 2014-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
*/
/* Assumptions:
@@ -11,7 +11,7 @@
* Neon Available.
*/
-#include "asmdefs.h"
+#include "../asmdefs.h"
/* Arguments and results. */
#define srcin x0
diff --git a/string/aarch64/strchrnul-mte.S b/string/aarch64/strchrnul-mte.S
index 543ee88bb285852eb6a7cf22cf25480f6403c98d..1b0d0a63094c6567c3ee3654b416635f28a8acfd 100644
--- a/string/aarch64/strchrnul-mte.S
+++ b/string/aarch64/strchrnul-mte.S
@@ -1,8 +1,8 @@
/*
* strchrnul - find a character or nul in a string
*
- * Copyright (c) 2020-2022, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
*/
/* Assumptions:
@@ -11,7 +11,7 @@
* MTE compatible.
*/
-#include "asmdefs.h"
+#include "../asmdefs.h"
#define srcin x0
#define chrin w1
@@ -20,32 +20,38 @@
#define src x2
#define tmp1 x1
#define tmp2 x3
+#define tmp2w w3
#define vrepchr v0
#define vdata v1
#define qdata q1
#define vhas_nul v2
#define vhas_chr v3
-#define vend v4
-#define dend d4
+#define vrepmask v4
+#define vend v5
+#define dend d5
-/*
- Core algorithm:
- For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
- per byte. We take 4 bits of every comparison byte with shift right and narrow
- by 4 instruction. Since the bits in the nibble mask reflect the order in
- which things occur in the original string, counting leading zeros identifies
- exactly which byte matched. */
+/* Core algorithm:
+
+ For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
+ per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
+ requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
+ set likewise for odd bytes so that adjacent bytes can be merged. Since the
+ bits in the syndrome reflect the order in which things occur in the original
+ string, counting trailing zeros identifies exactly which byte matched. */
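One detail worth spelling out is the entry path's `lsl tmp2, srcin, 2` / `lsr tmp1, tmp1, tmp2` pair below: the first load is aligned down to 16 bytes, so the syndrome bits belonging to bytes before srcin have to be discarded. A small C sketch of that step (my names, little-endian view):

```c
#include <stdint.h>

/* Each byte owns 4 syndrome bits, so dropping the bytes that precede
   srcin means shifting right by 4 * (srcin % 16).  The asm computes
   the shift as "srcin << 2" and relies on the shifter using only the
   low six bits of the amount.  */
static inline uint64_t drop_padding(uint64_t synd, uintptr_t srcin)
{
    return synd >> ((srcin & 15) * 4);
}
```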
ENTRY (__strchrnul_aarch64_mte)
PTR_ARG (0)
bic src, srcin, 15
dup vrepchr.16b, chrin
ld1 {vdata.16b}, [src]
+ mov tmp2w, 0xf00f
+ dup vrepmask.8h, tmp2w
cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
cmhs vhas_chr.16b, vhas_chr.16b, vdata.16b
lsl tmp2, srcin, 2
- shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
+ and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
+ addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
fmov tmp1, dend
lsr tmp1, tmp1, tmp2 /* Mask padding bits. */
cbz tmp1, L(loop)
@@ -57,22 +63,15 @@ ENTRY (__strchrnul_aarch64_mte)
.p2align 4
L(loop):
- ldr qdata, [src, 16]
- cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
- cmhs vhas_chr.16b, vhas_chr.16b, vdata.16b
- umaxp vend.16b, vhas_chr.16b, vhas_chr.16b
- fmov tmp1, dend
- cbnz tmp1, L(end)
- ldr qdata, [src, 32]!
+ ldr qdata, [src, 16]!
cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
cmhs vhas_chr.16b, vhas_chr.16b, vdata.16b
umaxp vend.16b, vhas_chr.16b, vhas_chr.16b
fmov tmp1, dend
cbz tmp1, L(loop)
- sub src, src, 16
-L(end):
- shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
- add src, src, 16
+
+ and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
+ addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
fmov tmp1, dend
#ifndef __AARCH64EB__
rbit tmp1, tmp1
diff --git a/string/aarch64/strchrnul-sve.S b/string/aarch64/strchrnul-sve.S
index 0005f9177514082544bc0f5f5a245ef5632430a7..428ff1a3d008325778eccc4e9fe1ec99bfc70bb5 100644
--- a/string/aarch64/strchrnul-sve.S
+++ b/string/aarch64/strchrnul-sve.S
@@ -2,7 +2,7 @@
* strchrnul - find a character or nul in a string
*
* Copyright (c) 2018-2019, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * SPDX-License-Identifier: MIT
*/
#define BUILD_STRCHRNUL
diff --git a/string/aarch64/strchrnul.S b/string/aarch64/strchrnul.S
index 666e8d0304c16d4f9ebb8fa443670a40673b934b..a4230d919b478d3001d412a7b3574f7ec94d2fb1 100644
--- a/string/aarch64/strchrnul.S
+++ b/string/aarch64/strchrnul.S
@@ -1,8 +1,8 @@
/*
* strchrnul - find a character or nul in a string
*
- * Copyright (c) 2014-2022, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * Copyright (c) 2014-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
*/
/* Assumptions:
@@ -11,7 +11,7 @@
* Neon Available.
*/
-#include "asmdefs.h"
+#include "../asmdefs.h"
/* Arguments and results. */
#define srcin x0
diff --git a/string/aarch64/strcmp-mte.S b/string/aarch64/strcmp-mte.S
new file mode 100644
index 0000000000000000000000000000000000000000..12d1a6b51dd3442ca89ba7994569ce9e54b0e351
--- /dev/null
+++ b/string/aarch64/strcmp-mte.S
@@ -0,0 +1,189 @@
+/*
+ * strcmp - compare two strings
+ *
+ * Copyright (c) 2012-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64.
+ * MTE compatible.
+ */
+
+#include "../asmdefs.h"
+
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+
+#define src1 x0
+#define src2 x1
+#define result x0
+
+#define data1 x2
+#define data1w w2
+#define data2 x3
+#define data2w w3
+#define has_nul x4
+#define diff x5
+#define off1 x5
+#define syndrome x6
+#define tmp x6
+#define data3 x7
+#define zeroones x8
+#define shift x9
+#define off2 x10
+
+/* On big-endian early bytes are at MSB and on little-endian LSB.
+ LS_FW means shifting towards early bytes. */
+#ifdef __AARCH64EB__
+# define LS_FW lsl
+#else
+# define LS_FW lsr
+#endif
+
+/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+ (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+ can be done in parallel across the entire word.
+ Since carry propagation makes 0x1 bytes before a NUL byte appear
+ NUL too in big-endian, byte-reverse the data before the NUL check. */
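The quoted bit trick, and the carry caveat that motivates the byte-reverse, can be checked with a few lines of C (a standalone sketch; the constants match the REP8_* defines above):

```c
#include <stdint.h>
#include <stdio.h>

#define REP8_01 0x0101010101010101ULL
#define REP8_7f 0x7f7f7f7f7f7f7f7fULL

/* Non-zero iff some byte of x is zero: (X - 1) & ~(X | 0x7f) per byte. */
static uint64_t has_zero_byte(uint64_t x)
{
    return (x - REP8_01) & ~(x | REP8_7f);
}

int main(void)
{
    /* Bytes, least significant first: 00 01 41 41 41 41 41 41.
       The borrow out of the 0x00 byte also flags the adjacent 0x01
       byte (result 0x8080 instead of 0x0080) -- harmless when the
       first set bit is taken from the low end, wrong when scanning
       from the top, hence the rev on big-endian.  */
    printf("%016llx\n",
           (unsigned long long) has_zero_byte(0x4141414141410100ULL));
    return 0;
}
```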
+
+
+ENTRY (__strcmp_aarch64_mte)
+ PTR_ARG (0)
+ PTR_ARG (1)
+ sub off2, src2, src1
+ mov zeroones, REP8_01
+ and tmp, src1, 7
+ tst off2, 7
+ b.ne L(misaligned8)
+ cbnz tmp, L(mutual_align)
+
+ .p2align 4
+
+L(loop_aligned):
+ ldr data2, [src1, off2]
+ ldr data1, [src1], 8
+L(start_realigned):
+#ifdef __AARCH64EB__
+ rev tmp, data1
+ sub has_nul, tmp, zeroones
+ orr tmp, tmp, REP8_7f
+#else
+ sub has_nul, data1, zeroones
+ orr tmp, data1, REP8_7f
+#endif
+ bics has_nul, has_nul, tmp /* Non-zero if NUL terminator. */
+ ccmp data1, data2, 0, eq
+ b.eq L(loop_aligned)
+#ifdef __AARCH64EB__
+ rev has_nul, has_nul
+#endif
+ eor diff, data1, data2
+ orr syndrome, diff, has_nul
+L(end):
+#ifndef __AARCH64EB__
+ rev syndrome, syndrome
+ rev data1, data1
+ rev data2, data2
+#endif
+ clz shift, syndrome
+ /* The most-significant-non-zero bit of the syndrome marks either the
+ first bit that is different, or the top bit of the first zero byte.
+ Shifting left now will bring the critical information into the
+ top bits. */
+ lsl data1, data1, shift
+ lsl data2, data2, shift
+ /* But we need to zero-extend (char is unsigned) the value and then
+ perform a signed 32-bit subtraction. */
+ lsr data1, data1, 56
+ sub result, data1, data2, lsr 56
+ ret
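In C, the return-value computation following the clz amounts to this (a sketch, assuming both words have already been byte-reversed where needed so that earlier string bytes occupy higher bits, as the little-endian path arranges with rev):

```c
#include <stdint.h>

/* syndrome must be non-zero; its most significant set bit marks the
   first difference or the top bit of the first zero byte.  */
static int strcmp_result(uint64_t data1, uint64_t data2, uint64_t syndrome)
{
    int shift = __builtin_clzll(syndrome);
    data1 <<= shift;              /* bring the decisive byte to the top */
    data2 <<= shift;
    /* Zero-extend the top bytes, then subtract.  */
    return (int)(data1 >> 56) - (int)(data2 >> 56);
}
```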
+
+ .p2align 4
+
+L(mutual_align):
+ /* Sources are mutually aligned, but are not currently at an
+ alignment boundary. Round down the addresses and then mask off
+ the bytes that precede the start point. */
+ bic src1, src1, 7
+ ldr data2, [src1, off2]
+ ldr data1, [src1], 8
+ neg shift, src2, lsl 3 /* Bits to alignment -64. */
+ mov tmp, -1
+ LS_FW tmp, tmp, shift
+ orr data1, data1, tmp
+ orr data2, data2, tmp
+ b L(start_realigned)
+
+L(misaligned8):
+ /* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always
+ checking to make sure that we don't access beyond the end of SRC2. */
+ cbz tmp, L(src1_aligned)
+L(do_misaligned):
+ ldrb data1w, [src1], 1
+ ldrb data2w, [src2], 1
+ cmp data1w, 0
+ ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */
+ b.ne L(done)
+ tst src1, 7
+ b.ne L(do_misaligned)
+
+L(src1_aligned):
+ neg shift, src2, lsl 3
+ bic src2, src2, 7
+ ldr data3, [src2], 8
+#ifdef __AARCH64EB__
+ rev data3, data3
+#endif
+ lsr tmp, zeroones, shift
+ orr data3, data3, tmp
+ sub has_nul, data3, zeroones
+ orr tmp, data3, REP8_7f
+ bics has_nul, has_nul, tmp
+ b.ne L(tail)
+
+ sub off1, src2, src1
+
+ .p2align 4
+
+L(loop_unaligned):
+ ldr data3, [src1, off1]
+ ldr data2, [src1, off2]
+#ifdef __AARCH64EB__
+ rev data3, data3
+#endif
+ sub has_nul, data3, zeroones
+ orr tmp, data3, REP8_7f
+ ldr data1, [src1], 8
+ bics has_nul, has_nul, tmp
+ ccmp data1, data2, 0, eq
+ b.eq L(loop_unaligned)
+
+ lsl tmp, has_nul, shift
+#ifdef __AARCH64EB__
+ rev tmp, tmp
+#endif
+ eor diff, data1, data2
+ orr syndrome, diff, tmp
+ cbnz syndrome, L(end)
+L(tail):
+ ldr data1, [src1]
+ neg shift, shift
+ lsr data2, data3, shift
+ lsr has_nul, has_nul, shift
+#ifdef __AARCH64EB__
+ rev data2, data2
+ rev has_nul, has_nul
+#endif
+ eor diff, data1, data2
+ orr syndrome, diff, has_nul
+ b L(end)
+
+L(done):
+ sub result, data1, data2
+ ret
+
+END (__strcmp_aarch64_mte)
+
diff --git a/string/aarch64/strcmp-sve.S b/string/aarch64/strcmp-sve.S
index eaf909a378f1f52dcf180e1f10a82dba071d94c9..e6d2da5411cac58a14b62d4767022a0c22b87ecc 100644
--- a/string/aarch64/strcmp-sve.S
+++ b/string/aarch64/strcmp-sve.S
@@ -1,11 +1,11 @@
/*
* __strcmp_aarch64_sve - compare two strings
*
- * Copyright (c) 2018-2022, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * Copyright (c) 2018-2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
*/
-#include "asmdefs.h"
+#include "../asmdefs.h"
#if __ARM_FEATURE_SVE
/* Assumptions:
diff --git a/string/aarch64/strcmp.S b/string/aarch64/strcmp.S
index 137a9aa06681a3c6d00062c88cddf8b9a227c220..7714ebf5577d84a279f911914f5f7f28d41f3e8c 100644
--- a/string/aarch64/strcmp.S
+++ b/string/aarch64/strcmp.S
@@ -1,184 +1,168 @@
/*
* strcmp - compare two strings
*
- * Copyright (c) 2012-2022, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * Copyright (c) 2012-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
*/
-
/* Assumptions:
*
- * ARMv8-a, AArch64.
- * MTE compatible.
+ * ARMv8-a, AArch64
*/
-#include "asmdefs.h"
+#include "../asmdefs.h"
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
+#define REP8_80 0x8080808080808080
+/* Parameters and result. */
#define src1 x0
#define src2 x1
#define result x0
+/* Internal variables. */
#define data1 x2
#define data1w w2
#define data2 x3
#define data2w w3
#define has_nul x4
#define diff x5
-#define off1 x5
#define syndrome x6
-#define tmp x6
-#define data3 x7
-#define zeroones x8
-#define shift x9
-#define off2 x10
-
-/* On big-endian early bytes are at MSB and on little-endian LSB.
- LS_FW means shifting towards early bytes. */
-#ifdef __AARCH64EB__
-# define LS_FW lsl
-#else
-# define LS_FW lsr
-#endif
-
-/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
- (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
- can be done in parallel across the entire word.
- Since carry propagation makes 0x1 bytes before a NUL byte appear
- NUL too in big-endian, byte-reverse the data before the NUL check. */
-
+#define tmp1 x7
+#define tmp2 x8
+#define tmp3 x9
+#define zeroones x10
+#define pos x11
+ /* Start of performance-critical section -- one 64B cache line. */
ENTRY (__strcmp_aarch64)
PTR_ARG (0)
PTR_ARG (1)
- sub off2, src2, src1
- mov zeroones, REP8_01
- and tmp, src1, 7
- tst off2, 7
+ eor tmp1, src1, src2
+ mov zeroones, #REP8_01
+ tst tmp1, #7
b.ne L(misaligned8)
- cbnz tmp, L(mutual_align)
-
- .p2align 4
-
+ ands tmp1, src1, #7
+ b.ne L(mutual_align)
+ /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+ (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+ can be done in parallel across the entire word. */
L(loop_aligned):
- ldr data2, [src1, off2]
- ldr data1, [src1], 8
+ ldr data1, [src1], #8
+ ldr data2, [src2], #8
L(start_realigned):
-#ifdef __AARCH64EB__
- rev tmp, data1
- sub has_nul, tmp, zeroones
- orr tmp, tmp, REP8_7f
-#else
- sub has_nul, data1, zeroones
- orr tmp, data1, REP8_7f
-#endif
- bics has_nul, has_nul, tmp /* Non-zero if NUL terminator. */
- ccmp data1, data2, 0, eq
- b.eq L(loop_aligned)
-#ifdef __AARCH64EB__
- rev has_nul, has_nul
-#endif
- eor diff, data1, data2
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ eor diff, data1, data2 /* Non-zero if differences found. */
+ bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
orr syndrome, diff, has_nul
+ cbz syndrome, L(loop_aligned)
+ /* End of performance-critical section -- one 64B cache line. */
+
L(end):
-#ifndef __AARCH64EB__
+#ifndef __AARCH64EB__
rev syndrome, syndrome
rev data1, data1
+ /* The MS-non-zero bit of the syndrome marks either the first bit
+ that is different, or the top bit of the first zero byte.
+ Shifting left now will bring the critical information into the
+ top bits. */
+ clz pos, syndrome
rev data2, data2
-#endif
- clz shift, syndrome
- /* The most-significant-non-zero bit of the syndrome marks either the
- first bit that is different, or the top bit of the first zero byte.
+ lsl data1, data1, pos
+ lsl data2, data2, pos
+ /* But we need to zero-extend (char is unsigned) the value and then
+ perform a signed 32-bit subtraction. */
+ lsr data1, data1, #56
+ sub result, data1, data2, lsr #56
+ ret
+#else
+ /* For big-endian we cannot use the trick with the syndrome value
+ as carry-propagation can corrupt the upper bits if the trailing
+ bytes in the string contain 0x01. */
+ /* However, if there is no NUL byte in the dword, we can generate
+ the result directly. We can't just subtract the bytes as the
+ MSB might be significant. */
+ cbnz has_nul, 1f
+ cmp data1, data2
+ cset result, ne
+ cneg result, result, lo
+ ret
+1:
+ /* Re-compute the NUL-byte detection, using a byte-reversed value. */
+ rev tmp3, data1
+ sub tmp1, tmp3, zeroones
+ orr tmp2, tmp3, #REP8_7f
+ bic has_nul, tmp1, tmp2
+ rev has_nul, has_nul
+ orr syndrome, diff, has_nul
+ clz pos, syndrome
+ /* The MS-non-zero bit of the syndrome marks either the first bit
+ that is different, or the top bit of the first zero byte.
Shifting left now will bring the critical information into the
top bits. */
- lsl data1, data1, shift
- lsl data2, data2, shift
+ lsl data1, data1, pos
+ lsl data2, data2, pos
/* But we need to zero-extend (char is unsigned) the value and then
perform a signed 32-bit subtraction. */
- lsr data1, data1, 56
- sub result, data1, data2, lsr 56
+ lsr data1, data1, #56
+ sub result, data1, data2, lsr #56
ret
-
- .p2align 4
+#endif
L(mutual_align):
/* Sources are mutually aligned, but are not currently at an
alignment boundary. Round down the addresses and then mask off
- the bytes that precede the start point. */
- bic src1, src1, 7
- ldr data2, [src1, off2]
- ldr data1, [src1], 8
- neg shift, src2, lsl 3 /* Bits to alignment -64. */
- mov tmp, -1
- LS_FW tmp, tmp, shift
- orr data1, data1, tmp
- orr data2, data2, tmp
+ the bytes that precede the start point. */
+ bic src1, src1, #7
+ bic src2, src2, #7
+ lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
+ ldr data1, [src1], #8
+ neg tmp1, tmp1 /* Bits to alignment -64. */
+ ldr data2, [src2], #8
+ mov tmp2, #~0
+#ifdef __AARCH64EB__
+ /* Big-endian. Early bytes are at MSB. */
+ lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
+#else
+ /* Little-endian. Early bytes are at LSB. */
+ lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
+#endif
+ orr data1, data1, tmp2
+ orr data2, data2, tmp2
b L(start_realigned)
L(misaligned8):
/* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always
- checking to make sure that we don't access beyond the end of SRC2. */
- cbz tmp, L(src1_aligned)
+ checking to make sure that we don't access beyond page boundary in
+ SRC2. */
+ tst src1, #7
+ b.eq L(loop_misaligned)
L(do_misaligned):
- ldrb data1w, [src1], 1
- ldrb data2w, [src2], 1
- cmp data1w, 0
- ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */
+ ldrb data1w, [src1], #1
+ ldrb data2w, [src2], #1
+ cmp data1w, #1
+ ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
b.ne L(done)
- tst src1, 7
+ tst src1, #7
b.ne L(do_misaligned)
-L(src1_aligned):
- neg shift, src2, lsl 3
- bic src2, src2, 7
- ldr data3, [src2], 8
-#ifdef __AARCH64EB__
- rev data3, data3
-#endif
- lsr tmp, zeroones, shift
- orr data3, data3, tmp
- sub has_nul, data3, zeroones
- orr tmp, data3, REP8_7f
- bics has_nul, has_nul, tmp
- b.ne L(tail)
-
- sub off1, src2, src1
-
- .p2align 4
-
-L(loop_unaligned):
- ldr data3, [src1, off1]
- ldr data2, [src1, off2]
-#ifdef __AARCH64EB__
- rev data3, data3
-#endif
- sub has_nul, data3, zeroones
- orr tmp, data3, REP8_7f
- ldr data1, [src1], 8
- bics has_nul, has_nul, tmp
- ccmp data1, data2, 0, eq
- b.eq L(loop_unaligned)
-
- lsl tmp, has_nul, shift
-#ifdef __AARCH64EB__
- rev tmp, tmp
-#endif
- eor diff, data1, data2
- orr syndrome, diff, tmp
- cbnz syndrome, L(end)
-L(tail):
- ldr data1, [src1]
- neg shift, shift
- lsr data2, data3, shift
- lsr has_nul, has_nul, shift
-#ifdef __AARCH64EB__
- rev data2, data2
- rev has_nul, has_nul
-#endif
- eor diff, data1, data2
+L(loop_misaligned):
+ /* Test if we are within the last dword of the end of a 4K page. If
+ yes then jump back to the misaligned loop to copy a byte at a time. */
+ and tmp1, src2, #0xff8
+ eor tmp1, tmp1, #0xff8
+ cbz tmp1, L(do_misaligned)
+ ldr data1, [src1], #8
+ ldr data2, [src2], #8
+
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ eor diff, data1, data2 /* Non-zero if differences found. */
+ bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
orr syndrome, diff, has_nul
+ cbz syndrome, L(loop_misaligned)
b L(end)
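The page guard at the top of L(loop_misaligned) has a compact C equivalent (predicate name mine): `src2 & 0xff8` picks out the dword index within a 4K page, and comparing it against 0xff8 asks whether src2 sits in the last dword that fits.

```c
#include <stdint.h>
#include <stdbool.h>

/* True when src2 points into the last dword of its 4K page, i.e. an
   unaligned 8-byte load from src2 could touch the next, possibly
   unmapped, page -- in which case the code falls back to bytewise.  */
static inline bool near_page_end(uintptr_t src2)
{
    return (src2 & 0xff8) == 0xff8;
}
```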
L(done):
diff --git a/string/aarch64/strcpy-mte.S b/string/aarch64/strcpy-mte.S
new file mode 100644
index 0000000000000000000000000000000000000000..88c222d61e53ad6841b10ef2b874852df203d800
--- /dev/null
+++ b/string/aarch64/strcpy-mte.S
@@ -0,0 +1,161 @@
+/*
+ * strcpy/stpcpy - copy a string returning pointer to start/end.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD.
+ * MTE compatible.
+ */
+
+#include "../asmdefs.h"
+
+#define dstin x0
+#define srcin x1
+#define result x0
+
+#define src x2
+#define dst x3
+#define len x4
+#define synd x4
+#define tmp x5
+#define wtmp w5
+#define shift x5
+#define data1 x6
+#define dataw1 w6
+#define data2 x7
+#define dataw2 w7
+
+#define dataq q0
+#define vdata v0
+#define vhas_nul v1
+#define vrepmask v2
+#define vend v3
+#define dend d3
+#define dataq2 q1
+
+#ifdef BUILD_STPCPY
+# define STRCPY __stpcpy_aarch64_mte
+# define IFSTPCPY(X,...) X,__VA_ARGS__
+#else
+# define STRCPY __strcpy_aarch64_mte
+# define IFSTPCPY(X,...)
+#endif
+
+/* Core algorithm:
+
+ For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
+ per byte. For even bytes, bits 0-3 are set if the relevant byte is NUL.
+ Bits 4-7 must be zero. Bits 4-7 are
+ set likewise for odd bytes so that adjacent bytes can be merged. Since the
+ bits in the syndrome reflect the order in which things occur in the original
+ string, counting trailing zeros identifies exactly which byte matched. */
+
+ENTRY (STRCPY)
+ PTR_ARG (0)
+ PTR_ARG (1)
+ bic src, srcin, 15
+ mov wtmp, 0xf00f
+ ld1 {vdata.16b}, [src]
+ dup vrepmask.8h, wtmp
+ cmeq vhas_nul.16b, vdata.16b, 0
+ lsl shift, srcin, 2
+ and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
+ addp vend.16b, vhas_nul.16b, vhas_nul.16b
+ fmov synd, dend
+ lsr synd, synd, shift
+ cbnz synd, L(tail)
+
+ ldr dataq, [src, 16]!
+ cmeq vhas_nul.16b, vdata.16b, 0
+ and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
+ addp vend.16b, vhas_nul.16b, vhas_nul.16b
+ fmov synd, dend
+ cbz synd, L(start_loop)
+
+#ifndef __AARCH64EB__
+ rbit synd, synd
+#endif
+ sub tmp, src, srcin
+ clz len, synd
+ add len, tmp, len, lsr 2
+ tbz len, 4, L(less16)
+ sub tmp, len, 15
+ ldr dataq, [srcin]
+ ldr dataq2, [srcin, tmp]
+ str dataq, [dstin]
+ str dataq2, [dstin, tmp]
+ IFSTPCPY (add result, dstin, len)
+ ret
+
+ .p2align 4,,8
+L(tail):
+ rbit synd, synd
+ clz len, synd
+ lsr len, len, 2
+
+ .p2align 4
+L(less16):
+ tbz len, 3, L(less8)
+ sub tmp, len, 7
+ ldr data1, [srcin]
+ ldr data2, [srcin, tmp]
+ str data1, [dstin]
+ str data2, [dstin, tmp]
+ IFSTPCPY (add result, dstin, len)
+ ret
+
+ .p2align 4
+L(less8):
+ subs tmp, len, 3
+ b.lo L(less4)
+ ldr dataw1, [srcin]
+ ldr dataw2, [srcin, tmp]
+ str dataw1, [dstin]
+ str dataw2, [dstin, tmp]
+ IFSTPCPY (add result, dstin, len)
+ ret
+
+L(less4):
+ cbz len, L(zerobyte)
+ ldrh dataw1, [srcin]
+ strh dataw1, [dstin]
+L(zerobyte):
+ strb wzr, [dstin, len]
+ IFSTPCPY (add result, dstin, len)
+ ret
+
+ .p2align 4
+L(start_loop):
+ sub len, src, srcin
+ ldr dataq2, [srcin]
+ add dst, dstin, len
+ str dataq2, [dstin]
+
+ .p2align 5
+L(loop):
+ str dataq, [dst], 16
+ ldr dataq, [src, 16]!
+ cmeq vhas_nul.16b, vdata.16b, 0
+ umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
+ fmov synd, dend
+ cbz synd, L(loop)
+
+ and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
+ addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
+ fmov synd, dend
+#ifndef __AARCH64EB__
+ rbit synd, synd
+#endif
+ clz len, synd
+ lsr len, len, 2
+ sub tmp, len, 15
+ ldr dataq, [src, tmp]
+ str dataq, [dst, tmp]
+ IFSTPCPY (add result, dst, len)
+ ret
+
+END (STRCPY)
diff --git a/string/aarch64/strcpy-sve.S b/string/aarch64/strcpy-sve.S
index 00e72dce4451b3ead0e83c2c90832088ab79fb50..f515462e09ae768dbc921ba2928150dd5a98c6e7 100644
--- a/string/aarch64/strcpy-sve.S
+++ b/string/aarch64/strcpy-sve.S
@@ -1,11 +1,11 @@
/*
* strcpy/stpcpy - copy a string returning pointer to start/end.
*
- * Copyright (c) 2018-2022, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * Copyright (c) 2018-2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
*/
-#include "asmdefs.h"
+#include "../asmdefs.h"
#if __ARM_FEATURE_SVE
/* Assumptions:
diff --git a/string/aarch64/strcpy.S b/string/aarch64/strcpy.S
index 97ae37ea422973e3eeea510bf63c3f314ff574d3..6e9ed424b693919e95f7fbe8569fc9024633715a 100644
--- a/string/aarch64/strcpy.S
+++ b/string/aarch64/strcpy.S
@@ -1,156 +1,311 @@
/*
* strcpy/stpcpy - copy a string returning pointer to start/end.
*
- * Copyright (c) 2020-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * Copyright (c) 2013-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
*/
/* Assumptions:
*
- * ARMv8-a, AArch64, Advanced SIMD.
- * MTE compatible.
+ * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
*/
-#include "asmdefs.h"
+#include "../asmdefs.h"
+/* To build as stpcpy, define BUILD_STPCPY before compiling this file.
+
+ To test the page crossing code path more thoroughly, compile with
+ -DSTRCPY_TEST_PAGE_CROSS - this will force all copies through the slower
+ entry path. This option is not intended for production use. */
+
+/* Arguments and results. */
#define dstin x0
#define srcin x1
-#define result x0
+/* Locals and temporaries. */
#define src x2
#define dst x3
-#define len x4
-#define synd x4
-#define tmp x5
-#define shift x5
-#define data1 x6
-#define dataw1 w6
-#define data2 x7
-#define dataw2 w7
-
-#define dataq q0
-#define vdata v0
-#define vhas_nul v1
-#define vend v2
-#define dend d2
-#define dataq2 q1
+#define data1 x4
+#define data1w w4
+#define data2 x5
+#define data2w w5
+#define has_nul1 x6
+#define has_nul2 x7
+#define tmp1 x8
+#define tmp2 x9
+#define tmp3 x10
+#define tmp4 x11
+#define zeroones x12
+#define data1a x13
+#define data2a x14
+#define pos x15
+#define len x16
+#define to_align x17
#ifdef BUILD_STPCPY
-# define STRCPY __stpcpy_aarch64
-# define IFSTPCPY(X,...) X,__VA_ARGS__
+#define STRCPY __stpcpy_aarch64
#else
-# define STRCPY __strcpy_aarch64
-# define IFSTPCPY(X,...)
+#define STRCPY __strcpy_aarch64
#endif
-/*
- Core algorithm:
- For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
- per byte. We take 4 bits of every comparison byte with shift right and narrow
- by 4 instruction. Since the bits in the nibble mask reflect the order in
- which things occur in the original string, counting leading zeros identifies
- exactly which byte matched. */
+ /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+ (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+ can be done in parallel across the entire word. */
+
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+#define REP8_80 0x8080808080808080
+
+ /* AArch64 systems have a minimum page size of 4k. We can do a quick
+ page size check for crossing this boundary on entry and if we
+ do not, then we can short-circuit much of the entry code. We
+ expect early page-crossing strings to be rare (probability of
+ 16/MIN_PAGE_SIZE ~= 0.4%), so the branch should be quite
+ predictable, even with random strings.
+
+ We don't bother checking for larger page sizes; the cost of setting
+ up the correct page size is just not worth the extra gain from
+ a small reduction in the cases taking the slow path. Note that
+ we only care about whether the first fetch, which may be
+ misaligned, crosses a page boundary - after that we move to aligned
+ fetches for the remainder of the string. */
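The check itself is tiny; in C it reads roughly as follows (standalone sketch, with MIN_PAGE_SIZE as defined further down):

```c
#include <stdint.h>
#include <stdbool.h>

#define MIN_PAGE_SIZE 4096   /* matches MIN_PAGE_P2 = 12 below */

/* True when fewer than 16 bytes remain before the page boundary, so
   the first (possibly misaligned) 16-byte fetch could straddle it.  */
static inline bool may_cross_page(uintptr_t srcin)
{
    return (srcin & (MIN_PAGE_SIZE - 1)) > MIN_PAGE_SIZE - 16;
}
```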
+
+#ifdef STRCPY_TEST_PAGE_CROSS
+ /* Make everything that isn't Qword aligned look like a page cross. */
+#define MIN_PAGE_P2 4
+#else
+#define MIN_PAGE_P2 12
+#endif
+
+#define MIN_PAGE_SIZE (1 << MIN_PAGE_P2)
ENTRY (STRCPY)
PTR_ARG (0)
PTR_ARG (1)
- bic src, srcin, 15
- ld1 {vdata.16b}, [src]
- cmeq vhas_nul.16b, vdata.16b, 0
- lsl shift, srcin, 2
- shrn vend.8b, vhas_nul.8h, 4
- fmov synd, dend
- lsr synd, synd, shift
- cbnz synd, L(tail)
-
- ldr dataq, [src, 16]!
- cmeq vhas_nul.16b, vdata.16b, 0
- shrn vend.8b, vhas_nul.8h, 4
- fmov synd, dend
- cbz synd, L(start_loop)
-
-#ifndef __AARCH64EB__
- rbit synd, synd
+ /* For moderately short strings, the fastest way to do the copy is to
+ calculate the length of the string in the same way as strlen, then
+ essentially do a memcpy of the result. This avoids the need for
+ multiple byte copies and further means that by the time we
+ reach the bulk copy loop we know we can always use DWord
+ accesses. We expect __strcpy_aarch64 to rarely be called repeatedly
+ with the same source string, so branch prediction is likely to
+ always be difficult - we mitigate against this by preferring
+ conditional select operations over branches whenever this is
+ feasible. */
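Functionally, the short-string path amounts to the following sketch (not the patch's register-level flow, just the strategy the comment describes; the assembly keeps the length and data in registers throughout):

```c
#include <stddef.h>
#include <string.h>

/* Length-first copy: once the length is known, the copy is a small
   memcpy that includes the terminator.  */
static char *short_strcpy(char *dst, const char *src)
{
    size_t len = strlen(src);   /* the asm computes this in-register */
    memcpy(dst, src, len + 1);  /* terminator copied along */
    return dst;                 /* stpcpy would return dst + len */
}
```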
+ and tmp2, srcin, #(MIN_PAGE_SIZE - 1)
+ mov zeroones, #REP8_01
+ and to_align, srcin, #15
+ cmp tmp2, #(MIN_PAGE_SIZE - 16)
+ neg tmp1, to_align
+ /* The first fetch will straddle a (possible) page boundary iff
+ srcin + 15 causes bit[MIN_PAGE_P2] to change value. A 16-byte
+ aligned string will never fail the page align check, so will
+ always take the fast path. */
+ b.gt L(page_cross)
+
+L(page_cross_ok):
+ ldp data1, data2, [srcin]
+#ifdef __AARCH64EB__
+ /* Because we expect the end to be found within 16 characters
+ (profiling shows this is the most common case), it's worth
+ swapping the bytes now to save having to recalculate the
+ termination syndrome later. We preserve data1 and data2
+ so that we can re-use the values later on. */
+ rev tmp2, data1
+ sub tmp1, tmp2, zeroones
+ orr tmp2, tmp2, #REP8_7f
+ bics has_nul1, tmp1, tmp2
+ b.ne L(fp_le8)
+ rev tmp4, data2
+ sub tmp3, tmp4, zeroones
+ orr tmp4, tmp4, #REP8_7f
+#else
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ bics has_nul1, tmp1, tmp2
+ b.ne L(fp_le8)
+ sub tmp3, data2, zeroones
+ orr tmp4, data2, #REP8_7f
#endif
- sub tmp, src, srcin
- clz len, synd
- add len, tmp, len, lsr 2
- tbz len, 4, L(less16)
- sub tmp, len, 15
- ldr dataq, [srcin]
- ldr dataq2, [srcin, tmp]
- str dataq, [dstin]
- str dataq2, [dstin, tmp]
- IFSTPCPY (add result, dstin, len)
- ret
+ bics has_nul2, tmp3, tmp4
+ b.eq L(bulk_entry)
-L(tail):
- rbit synd, synd
- clz len, synd
- lsr len, len, 2
-L(less16):
- tbz len, 3, L(less8)
- sub tmp, len, 7
- ldr data1, [srcin]
- ldr data2, [srcin, tmp]
+ /* The string is short (<=16 bytes). We don't know exactly how
+ short though, yet. Work out the exact length so that we can
+ quickly select the optimal copy strategy. */
+L(fp_gt8):
+ rev has_nul2, has_nul2
+ clz pos, has_nul2
+ mov tmp2, #56
+ add dst, dstin, pos, lsr #3 /* Bits to bytes. */
+ sub pos, tmp2, pos
+#ifdef __AARCH64EB__
+ lsr data2, data2, pos
+#else
+ lsl data2, data2, pos
+#endif
+ str data2, [dst, #1]
str data1, [dstin]
- str data2, [dstin, tmp]
- IFSTPCPY (add result, dstin, len)
+#ifdef BUILD_STPCPY
+ add dstin, dst, #8
+#endif
ret
- .p2align 4
-L(less8):
- subs tmp, len, 3
- b.lo L(less4)
- ldr dataw1, [srcin]
- ldr dataw2, [srcin, tmp]
- str dataw1, [dstin]
- str dataw2, [dstin, tmp]
- IFSTPCPY (add result, dstin, len)
+L(fp_le8):
+ rev has_nul1, has_nul1
+ clz pos, has_nul1
+ add dst, dstin, pos, lsr #3 /* Bits to bytes. */
+ subs tmp2, pos, #24 /* Pos in bits. */
+ b.lt L(fp_lt4)
+#ifdef __AARCH64EB__
+ mov tmp2, #56
+ sub pos, tmp2, pos
+ lsr data2, data1, pos
+ lsr data1, data1, #32
+#else
+ lsr data2, data1, tmp2
+#endif
+ /* 4->7 bytes to copy. */
+ str data2w, [dst, #-3]
+ str data1w, [dstin]
+#ifdef BUILD_STPCPY
+ mov dstin, dst
+#endif
ret
-
-L(less4):
- cbz len, L(zerobyte)
- ldrh dataw1, [srcin]
- strh dataw1, [dstin]
-L(zerobyte):
- strb wzr, [dstin, len]
- IFSTPCPY (add result, dstin, len)
+L(fp_lt4):
+ cbz pos, L(fp_lt2)
+ /* 2->3 bytes to copy. */
+#ifdef __AARCH64EB__
+ lsr data1, data1, #48
+#endif
+ strh data1w, [dstin]
+ /* Fall-through, one byte (max) to go. */
+L(fp_lt2):
+ /* Null-terminated string. Last character must be zero! */
+ strb wzr, [dst]
+#ifdef BUILD_STPCPY
+ mov dstin, dst
+#endif
ret
- .p2align 4
-L(start_loop):
- sub tmp, srcin, dstin
- ldr dataq2, [srcin]
- sub dst, src, tmp
- str dataq2, [dstin]
-L(loop):
- str dataq, [dst], 32
- ldr dataq, [src, 16]
- cmeq vhas_nul.16b, vdata.16b, 0
- umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
- fmov synd, dend
- cbnz synd, L(loopend)
- str dataq, [dst, -16]
- ldr dataq, [src, 32]!
- cmeq vhas_nul.16b, vdata.16b, 0
- umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
- fmov synd, dend
- cbz synd, L(loop)
- add dst, dst, 16
-L(loopend):
- shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
- fmov synd, dend
- sub dst, dst, 31
-#ifndef __AARCH64EB__
- rbit synd, synd
+ .p2align 6
+ /* Aligning here ensures that the entry code and main loop all lies
+ within one 64-byte cache line. */
+L(bulk_entry):
+ sub to_align, to_align, #16
+ stp data1, data2, [dstin]
+ sub src, srcin, to_align
+ sub dst, dstin, to_align
+ b L(entry_no_page_cross)
+
+ /* The inner loop deals with two Dwords at a time. This has a
+ slightly higher start-up cost, but we should win quite quickly,
+ especially on cores with a high number of issue slots per
+ cycle, as we get much better parallelism out of the operations. */
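A C rendering of the loop body makes that structure clear (names mine; the assembly folds the two tests into bic/bics plus ccmp so that both dwords are checked with a single branch):

```c
#include <stdint.h>
#include <string.h>

#define REP8_01 0x0101010101010101ULL
#define REP8_7f 0x7f7f7f7f7f7f7f7fULL

/* Copy 16 bytes per iteration until a dword contains a NUL; the two
   NUL tests are independent computations, which is where the extra
   instruction-level parallelism comes from.  */
static void bulk_copy(char *dst, const char *src)
{
    for (;;) {
        uint64_t d1, d2;
        memcpy(&d1, src, 8);
        memcpy(&d2, src + 8, 8);
        uint64_t n1 = (d1 - REP8_01) & ~(d1 | REP8_7f);
        uint64_t n2 = (d2 - REP8_01) & ~(d2 | REP8_7f);
        if (n1 | n2)
            break;
        memcpy(dst, &d1, 8);
        memcpy(dst + 8, &d2, 8);
        src += 16;
        dst += 16;
    }
    /* Tail: locate the NUL in the final 16 bytes and (re)copy up to
       it, as the assembly does with rev/clz on has_nul1/has_nul2.  */
}
```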
+L(main_loop):
+ stp data1, data2, [dst], #16
+L(entry_no_page_cross):
+ ldp data1, data2, [src], #16
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ sub tmp3, data2, zeroones
+ orr tmp4, data2, #REP8_7f
+ bic has_nul1, tmp1, tmp2
+ bics has_nul2, tmp3, tmp4
+ ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */
+ b.eq L(main_loop)
+
+ /* Since we know we are copying at least 16 bytes, the fastest way
+ to deal with the tail is to determine the location of the
+ trailing NUL, then (re)copy the 16 bytes leading up to that. */
+ cmp has_nul1, #0
+#ifdef __AARCH64EB__
+ /* For big-endian, carry propagation (if the final byte in the
+ string is 0x01) means we cannot use has_nul directly. The
+ easiest way to get the correct byte is to byte-swap the data
+ and calculate the syndrome a second time. */
+ csel data1, data1, data2, ne
+ rev data1, data1
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ bic has_nul1, tmp1, tmp2
+#else
+ csel has_nul1, has_nul1, has_nul2, ne
+#endif
+ rev has_nul1, has_nul1
+ clz pos, has_nul1
+ add tmp1, pos, #72
+ add pos, pos, #8
+ csel pos, pos, tmp1, ne
+ add src, src, pos, lsr #3
+ add dst, dst, pos, lsr #3
+ ldp data1, data2, [src, #-32]
+ stp data1, data2, [dst, #-16]
+#ifdef BUILD_STPCPY
+ sub dstin, dst, #1
#endif
- clz len, synd
- lsr len, len, 2
- add dst, dst, len
- ldr dataq, [dst, tmp]
- str dataq, [dst]
- IFSTPCPY (add result, dst, 15)
ret
+L(page_cross):
+ bic src, srcin, #15
+ /* Start by loading two words at [srcin & ~15], then forcing the
+ bytes that precede srcin to 0xff. This means they never look
+ like termination bytes. */
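In C, that poisoning step looks like this (little-endian view, names mine; the big-endian path builds the mask by shifting the other way):

```c
#include <stdint.h>

/* "misalign" is srcin % 8, assumed non-zero here; the low misalign
   bytes of the aligned load precede the string and are forced to
   0xff so they can never look like NUL terminators.  */
static inline uint64_t poison_low_bytes(uint64_t data, unsigned misalign)
{
    uint64_t junk = ~(~0ULL << (8 * misalign)); /* low misalign bytes */
    return data | junk;
}
```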
+ ldp data1, data2, [src]
+ lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
+ tst to_align, #7
+ csetm tmp2, ne
+#ifdef __AARCH64EB__
+ lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
+#else
+ lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
+#endif
+ orr data1, data1, tmp2
+ orr data2a, data2, tmp2
+ cmp to_align, #8
+ csinv data1, data1, xzr, lt
+ csel data2, data2, data2a, lt
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ sub tmp3, data2, zeroones
+ orr tmp4, data2, #REP8_7f
+ bic has_nul1, tmp1, tmp2
+ bics has_nul2, tmp3, tmp4
+ ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */
+ b.eq L(page_cross_ok)
+ /* We now need to make data1 and data2 look like they've been
+ loaded directly from srcin. Do a rotate on the 128-bit value. */
+ lsl tmp1, to_align, #3 /* Bytes->bits. */
+ neg tmp2, to_align, lsl #3
+#ifdef __AARCH64EB__
+ lsl data1a, data1, tmp1
+ lsr tmp4, data2, tmp2
+ lsl data2, data2, tmp1
+ orr tmp4, tmp4, data1a
+ cmp to_align, #8
+ csel data1, tmp4, data2, lt
+ rev tmp2, data1
+ rev tmp4, data2
+ sub tmp1, tmp2, zeroones
+ orr tmp2, tmp2, #REP8_7f
+ sub tmp3, tmp4, zeroones
+ orr tmp4, tmp4, #REP8_7f
+#else
+ lsr data1a, data1, tmp1
+ lsl tmp4, data2, tmp2
+ lsr data2, data2, tmp1
+ orr tmp4, tmp4, data1a
+ cmp to_align, #8
+ csel data1, tmp4, data2, lt
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ sub tmp3, data2, zeroones
+ orr tmp4, data2, #REP8_7f
+#endif
+ bic has_nul1, tmp1, tmp2
+ cbnz has_nul1, L(fp_le8)
+ bic has_nul2, tmp3, tmp4
+ b L(fp_gt8)
+
END (STRCPY)
+
diff --git a/string/aarch64/strlen-mte.S b/string/aarch64/strlen-mte.S
index 77235797f7c54fe5af374120f76362148b11ce0f..7cf41d5c1eac995332ae42bbaf962116eb32457d 100644
--- a/string/aarch64/strlen-mte.S
+++ b/string/aarch64/strlen-mte.S
@@ -1,8 +1,8 @@
/*
* strlen - calculate the length of a string.
*
- * Copyright (c) 2020-2022, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
*/
/* Assumptions:
@@ -11,7 +11,7 @@
* MTE compatible.
*/
-#include "asmdefs.h"
+#include "../asmdefs.h"
#define srcin x0
#define result x0
@@ -19,26 +19,35 @@
#define src x1
#define synd x2
#define tmp x3
+#define wtmp w3
#define shift x4
#define data q0
#define vdata v0
#define vhas_nul v1
-#define vend v2
-#define dend d2
+#define vrepmask v2
+#define vend v3
+#define dend d3
/* Core algorithm:
- Process the string in 16-byte aligned chunks. Compute a 64-bit mask with
- four bits per byte using the shrn instruction. A count trailing zeros then
- identifies the first zero byte. */
+
+ For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
+ per byte. For even bytes, bits 0-3 are set if the relevant byte is NUL.
+ Bits 4-7 must be zero. Bits 4-7 are
+ set likewise for odd bytes so that adjacent bytes can be merged. Since the
+ bits in the syndrome reflect the order in which things occur in the original
+ string, counting trailing zeros identifies exactly which byte matched. */
ENTRY (__strlen_aarch64_mte)
PTR_ARG (0)
bic src, srcin, 15
+ mov wtmp, 0xf00f
ld1 {vdata.16b}, [src]
+ dup vrepmask.8h, wtmp
cmeq vhas_nul.16b, vdata.16b, 0
lsl shift, srcin, 2
- shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
+ and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
+ addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
fmov synd, dend
lsr synd, synd, shift
cbz synd, L(loop)
@@ -50,25 +59,19 @@ ENTRY (__strlen_aarch64_mte)
.p2align 5
L(loop):
- ldr data, [src, 16]
- cmeq vhas_nul.16b, vdata.16b, 0
- umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
- fmov synd, dend
- cbnz synd, L(loop_end)
- ldr data, [src, 32]!
+ ldr data, [src, 16]!
cmeq vhas_nul.16b, vdata.16b, 0
umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
fmov synd, dend
cbz synd, L(loop)
- sub src, src, 16
-L(loop_end):
- shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
+
+ and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
+ addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
sub result, src, srcin
fmov synd, dend
#ifndef __AARCH64EB__
rbit synd, synd
#endif
- add result, result, 16
clz tmp, synd
add result, result, tmp, lsr 2
ret
diff --git a/string/aarch64/strlen-sve.S b/string/aarch64/strlen-sve.S
index 12ebbdba5c93ae99a195dd8abef828c2b7804982..2392493f1a3c4c79b67f790bfa064766253e55e7 100644
--- a/string/aarch64/strlen-sve.S
+++ b/string/aarch64/strlen-sve.S
@@ -1,11 +1,11 @@
/*
* __strlen_aarch64_sve - compute the length of a string
*
- * Copyright (c) 2018-2022, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * Copyright (c) 2018-2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
*/
-#include "asmdefs.h"
+#include "../asmdefs.h"
#if __ARM_FEATURE_SVE
/* Assumptions:
diff --git a/string/aarch64/strlen.S b/string/aarch64/strlen.S
index 6f6f08f636b248abc9c9b2e847545588efca281b..a1b164a49238243419c89a365dd6757f9e9be7cd 100644
--- a/string/aarch64/strlen.S
+++ b/string/aarch64/strlen.S
@@ -1,8 +1,8 @@
/*
* strlen - calculate the length of a string.
*
- * Copyright (c) 2020-2022, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
*/
/* Assumptions:
@@ -11,7 +11,7 @@
* Not MTE compatible.
*/
-#include "asmdefs.h"
+#include "../asmdefs.h"
#define srcin x0
#define len x0
@@ -36,7 +36,6 @@
#define tmp x2
#define tmpw w2
#define synd x3
-#define syndw w3
#define shift x4
/* For the first 32 bytes, NUL detection works on the principle that
@@ -111,6 +110,7 @@ ENTRY (__strlen_aarch64)
add len, len, tmp1, lsr 3
ret
+ .p2align 3
/* Look for a NUL byte at offset 16..31 in the string. */
L(bytes16_31):
ldp data1, data2, [srcin, 16]
@@ -138,7 +138,6 @@ L(bytes16_31):
add len, len, tmp1, lsr 3
ret
- nop
L(loop_entry):
bic src, srcin, 31
@@ -154,12 +153,18 @@ L(loop):
/* Low 32 bits of synd are non-zero if a NUL was found in datav1. */
cmeq maskv.16b, datav1.16b, 0
sub len, src, srcin
- cbnz syndw, 1f
+ tst synd, 0xffffffff
+ b.ne 1f
cmeq maskv.16b, datav2.16b, 0
add len, len, 16
1:
/* Generate a bitmask and compute correct byte offset. */
- shrn maskv.8b, maskv.8h, 4
+#ifdef __AARCH64EB__
+ bic maskv.8h, 0xf0
+#else
+ bic maskv.8h, 0x0f, lsl 8
+#endif
+ umaxp maskv.16b, maskv.16b, maskv.16b
fmov synd, maskd
#ifndef __AARCH64EB__
rbit synd, synd
@@ -168,6 +173,8 @@ L(loop):
add len, len, tmp, lsr 2
ret
+ .p2align 4
+
L(page_cross):
bic src, srcin, 31
mov tmpw, 0x0c03
diff --git a/string/aarch64/strncmp-mte.S b/string/aarch64/strncmp-mte.S
new file mode 100644
index 0000000000000000000000000000000000000000..c9d6fc8a158beca38419a6ccf82cd8573394f7b6
--- /dev/null
+++ b/string/aarch64/strncmp-mte.S
@@ -0,0 +1,307 @@
+/*
+ * strncmp - compare two strings
+ *
+ * Copyright (c) 2013-2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ */
+
+#include "../asmdefs.h"
+
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+
+/* Parameters and result. */
+#define src1 x0
+#define src2 x1
+#define limit x2
+#define result x0
+
+/* Internal variables. */
+#define data1 x3
+#define data1w w3
+#define data2 x4
+#define data2w w4
+#define has_nul x5
+#define diff x6
+#define syndrome x7
+#define tmp1 x8
+#define tmp2 x9
+#define tmp3 x10
+#define zeroones x11
+#define pos x12
+#define mask x13
+#define endloop x14
+#define count mask
+#define offset pos
+#define neg_offset x15
+
+/* Define endian dependent shift operations.
+ On big-endian early bytes are at MSB and on little-endian LSB.
+ LS_FW means shifting towards early bytes.
+ LS_BK means shifting towards later bytes.
+ */
+#ifdef __AARCH64EB__
+#define LS_FW lsl
+#define LS_BK lsr
+#else
+#define LS_FW lsr
+#define LS_BK lsl
+#endif
+
+ENTRY (__strncmp_aarch64_mte)
+ PTR_ARG (0)
+ PTR_ARG (1)
+ SIZE_ARG (2)
+ cbz limit, L(ret0)
+ eor tmp1, src1, src2
+ mov zeroones, #REP8_01
+ tst tmp1, #7
+ and count, src1, #7
+ b.ne L(misaligned8)
+ cbnz count, L(mutual_align)
+
+ /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+ (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+ can be done in parallel across the entire word. */
+ .p2align 4
+L(loop_aligned):
+ ldr data1, [src1], #8
+ ldr data2, [src2], #8
+L(start_realigned):
+ subs limit, limit, #8
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ eor diff, data1, data2 /* Non-zero if differences found. */
+ csinv endloop, diff, xzr, hi /* Last Dword or differences. */
+ bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
+ ccmp endloop, #0, #0, eq
+ b.eq L(loop_aligned)
+ /* End of main loop */
+
+L(full_check):
+#ifndef __AARCH64EB__
+ orr syndrome, diff, has_nul
+ add limit, limit, 8 /* Rewind limit to before last subs. */
+L(syndrome_check):
+ /* Limit was reached. Check if the NUL byte or the difference
+ is before the limit. */
+ rev syndrome, syndrome
+ rev data1, data1
+ clz pos, syndrome
+ rev data2, data2
+ lsl data1, data1, pos
+ cmp limit, pos, lsr #3
+ lsl data2, data2, pos
+ /* But we need to zero-extend (char is unsigned) the value and then
+ perform a signed 32-bit subtraction. */
+ lsr data1, data1, #56
+ sub result, data1, data2, lsr #56
+ csel result, result, xzr, hi
+ ret
+#else
+ /* Not reached the limit, must have found the end or a diff. */
+ tbz limit, #63, L(not_limit)
+ add tmp1, limit, 8
+ cbz tmp1, L(not_limit)
+
+ lsl limit, tmp1, #3 /* Bytes -> bits. */
+ mov mask, #~0
+ lsr mask, mask, limit
+ bic data1, data1, mask
+ bic data2, data2, mask
+
+ /* Make sure that the NUL byte is marked in the syndrome. */
+ orr has_nul, has_nul, mask
+
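Conceptually the clamp above does this (C sketch in little-endian orientation, names mine; the big-endian code builds the mask with lsr instead):

```c
#include <stdint.h>

/* "valid" is the number of in-limit bytes left in the dword, 1..7.
   Bytes past the limit are cleared from the data and flagged in
   has_nul, so the ordinary end-of-string exit also honours the
   length limit.  */
static inline void clamp_to_limit(uint64_t *data, uint64_t *has_nul,
                                  unsigned valid)
{
    uint64_t keep = ~(~0ULL << (8 * valid));
    *data &= keep;
    *has_nul |= ~keep;
}
```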
+L(not_limit):
+ /* For big-endian we cannot use the trick with the syndrome value
+ as carry-propagation can corrupt the upper bits if the trailing
+ bytes in the string contain 0x01. */
+ /* However, if there is no NUL byte in the dword, we can generate
+ the result directly. We can't just subtract the bytes as the
+ MSB might be significant. */
+ cbnz has_nul, 1f
+ cmp data1, data2
+ cset result, ne
+ cneg result, result, lo
+ ret
+1:
+ /* Re-compute the NUL-byte detection, using a byte-reversed value. */
+ rev tmp3, data1
+ sub tmp1, tmp3, zeroones
+ orr tmp2, tmp3, #REP8_7f
+ bic has_nul, tmp1, tmp2
+ rev has_nul, has_nul
+ orr syndrome, diff, has_nul
+ clz pos, syndrome
+ /* The most-significant-non-zero bit of the syndrome marks either the
+ first bit that is different, or the top bit of the first zero byte.
+ Shifting left now will bring the critical information into the
+ top bits. */
+L(end_quick):
+ lsl data1, data1, pos
+ lsl data2, data2, pos
+ /* But we need to zero-extend (char is unsigned) the value and then
+ perform a signed 32-bit subtraction. */
+ lsr data1, data1, #56
+ sub result, data1, data2, lsr #56
+ ret
+#endif
+
+L(mutual_align):
+ /* Sources are mutually aligned, but are not currently at an
+ alignment boundary. Round down the addresses and then mask off
+ the bytes that precede the start point.
+ We also need to adjust the limit calculations, but without
+ overflowing if the limit is near ULONG_MAX. */
+ bic src1, src1, #7
+ bic src2, src2, #7
+ ldr data1, [src1], #8
+ neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */
+ ldr data2, [src2], #8
+ mov tmp2, #~0
+ LS_FW tmp2, tmp2, tmp3 /* Shift (count & 63). */
+ /* Adjust the limit and ensure it doesn't overflow. */
+ adds limit, limit, count
+ csinv limit, limit, xzr, lo
+ orr data1, data1, tmp2
+ orr data2, data2, tmp2
+ b L(start_realigned)
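The overflow-safe limit adjustment mentioned in the comment above is a saturating add; the adds/csinv pair is equivalent to:

```c
#include <stdint.h>

/* limit + count, saturating at UINT64_MAX instead of wrapping --
   needed because the prologue widens a caller limit that may already
   be near ULONG_MAX by up to 7 bytes.  */
static inline uint64_t sat_add(uint64_t limit, uint64_t count)
{
    uint64_t sum = limit + count;
    return sum < limit ? UINT64_MAX : sum;  /* csinv picks ~0 on carry */
}
```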
+
+ .p2align 4
+ /* Don't bother with dwords for up to 16 bytes. */
+L(misaligned8):
+ cmp limit, #16
+ b.hs L(try_misaligned_words)
+
+L(byte_loop):
+ /* Perhaps we can do better than this. */
+ ldrb data1w, [src1], #1
+ ldrb data2w, [src2], #1
+ subs limit, limit, #1
+ ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */
+ ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
+ b.eq L(byte_loop)
+L(done):
+ sub result, data1, data2
+ ret
+ /* Align the SRC1 to a dword by doing a bytewise compare and then do
+ the dword loop. */
+L(try_misaligned_words):
+ cbz count, L(src1_aligned)
+
+ neg count, count
+ and count, count, #7
+ sub limit, limit, count
+
+L(page_end_loop):
+ ldrb data1w, [src1], #1
+ ldrb data2w, [src2], #1
+ cmp data1w, #1
+ ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
+ b.ne L(done)
+ subs count, count, #1
+ b.hi L(page_end_loop)
+
+ /* The following diagram explains the comparison of misaligned strings.
+ The bytes are shown in natural order. For little-endian, it is
+ reversed in the registers. The "x" bytes are before the string.
+ The "|" separates data that is loaded at one time.
+ src1 | a a a a a a a a | b b b c c c c c | . . .
+ src2 | x x x x x a a a a a a a a b b b | c c c c c . . .
+
+ After shifting in each step, the data looks like this:
+ STEP_A STEP_B STEP_C
+ data1 a a a a a a a a b b b c c c c c b b b c c c c c
+ data2 a a a a a a a a b b b 0 0 0 0 0 0 0 0 c c c c c
+
+ The bytes with "0" are eliminated from the syndrome via mask.
+
+ Align SRC2 down to 16 bytes. This way we can read 16 bytes at a
+ time from SRC2. The comparison happens in 3 steps. After each step
+ the loop can exit, or read from SRC1 or SRC2. */
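The shifting in each step combines two aligned loads into the 8 bytes that line up with data1. For the little-endian case, the STEP_A combination is, in C (names mine; off is the misalignment in bits, assumed in 8..56 so neither shift count is 0 or 64):

```c
#include <stdint.h>

/* Little-endian STEP_A: take the high part of the earlier load and
   the low part of the later one.  On big-endian the shift directions
   swap, which is what the LS_FW/LS_BK macros express.  */
static inline uint64_t window(uint64_t tmp1, uint64_t tmp2, unsigned off)
{
    return (tmp1 >> off) | (tmp2 << (64 - off));
}
```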
+L(src1_aligned):
+ /* Calculate offset from 8-byte alignment to string start in bits. No
+ need to mask offset since shifts ignore the upper bits. */
+ lsl offset, src2, #3
+ bic src2, src2, #0xf
+ mov mask, -1
+ neg neg_offset, offset
+ ldr data1, [src1], #8
+ ldp tmp1, tmp2, [src2], #16
+ LS_BK mask, mask, neg_offset
+ and neg_offset, neg_offset, #63 /* Need actual value for cmp later. */
+ /* Skip the first compare if data in tmp1 is irrelevant. */
+ tbnz offset, 6, L(misaligned_mid_loop)
+
+L(loop_misaligned):
+ /* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/
+ LS_FW data2, tmp1, offset
+ LS_BK tmp1, tmp2, neg_offset
+ subs limit, limit, #8
+ orr data2, data2, tmp1 /* 8 bytes from SRC2 combined from two regs.*/
+ sub has_nul, data1, zeroones
+ eor diff, data1, data2 /* Non-zero if differences found. */
+ orr tmp3, data1, #REP8_7f
+ csinv endloop, diff, xzr, hi /* If limit, set to all ones. */
+ bic has_nul, has_nul, tmp3 /* Non-zero if NUL byte found in SRC1. */
+ orr tmp3, endloop, has_nul
+ cbnz tmp3, L(full_check)
+
+ ldr data1, [src1], #8
+L(misaligned_mid_loop):
+ /* STEP_B: Compare first part of data1 to second part of tmp2. */
+ LS_FW data2, tmp2, offset
+#ifdef __AARCH64EB__
+ /* For big-endian we do a byte reverse to avoid carry-propagation
+ problem described above. This way we can reuse the has_nul in the
+ next step and also use syndrome value trick at the end. */
+ rev tmp3, data1
+ #define data1_fixed tmp3
+#else
+ #define data1_fixed data1
+#endif
+ sub has_nul, data1_fixed, zeroones
+ orr tmp3, data1_fixed, #REP8_7f
+ eor diff, data2, data1 /* Non-zero if differences found. */
+ bic has_nul, has_nul, tmp3 /* Non-zero if NUL terminator. */
+#ifdef __AARCH64EB__
+ rev has_nul, has_nul
+#endif
+ cmp limit, neg_offset, lsr #3
+ orr syndrome, diff, has_nul
+ bic syndrome, syndrome, mask /* Ignore later bytes. */
+ csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
+ cbnz tmp3, L(syndrome_check)
+
+ /* STEP_C: Compare second part of data1 to first part of tmp1. */
+ ldp tmp1, tmp2, [src2], #16
+ cmp limit, #8
+ LS_BK data2, tmp1, neg_offset
+ eor diff, data2, data1 /* Non-zero if differences found. */
+ orr syndrome, diff, has_nul
+ and syndrome, syndrome, mask /* Ignore earlier bytes. */
+ csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
+ cbnz tmp3, L(syndrome_check)
+
+ ldr data1, [src1], #8
+ sub limit, limit, #8
+ b L(loop_misaligned)
+
+#ifdef __AARCH64EB__
+L(syndrome_check):
+ clz pos, syndrome
+ cmp pos, limit, lsl #3
+ b.lo L(end_quick)
+#endif
+
+L(ret0):
+ mov result, #0
+ ret
+END (__strncmp_aarch64_mte)
+
diff --git a/string/aarch64/strncmp-sve.S b/string/aarch64/strncmp-sve.S
index 6a9e9f7b6437fdab851d5a4a4651b3f4922bf06b..234190e245b0ba30f6257fad70b9fcbc4ce767cd 100644
--- a/string/aarch64/strncmp-sve.S
+++ b/string/aarch64/strncmp-sve.S
@@ -1,11 +1,11 @@
/*
* strncmp - compare two strings with limit
*
- * Copyright (c) 2018-2022, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * Copyright (c) 2018-2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
*/
-#include "asmdefs.h"
+#include "../asmdefs.h"
#if __ARM_FEATURE_SVE
/* Assumptions:
diff --git a/string/aarch64/strncmp.S b/string/aarch64/strncmp.S
index 128a10c52bb175436312c6326030c4d34cc4190f..738b6539cab647129d801a21bb7b88876b37c070 100644
--- a/string/aarch64/strncmp.S
+++ b/string/aarch64/strncmp.S
@@ -1,20 +1,20 @@
/*
* strncmp - compare two strings
*
- * Copyright (c) 2013-2022, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * Copyright (c) 2013-2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
*/
/* Assumptions:
*
- * ARMv8-a, AArch64.
- * MTE compatible.
+ * ARMv8-a, AArch64
*/
-#include "asmdefs.h"
+#include "../asmdefs.h"
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
+#define REP8_80 0x8080808080808080
/* Parameters and result. */
#define src1 x0
@@ -35,24 +35,10 @@
#define tmp3 x10
#define zeroones x11
#define pos x12
-#define mask x13
-#define endloop x14
+#define limit_wd x13
+#define mask x14
+#define endloop x15
#define count mask
-#define offset pos
-#define neg_offset x15
-
-/* Define endian dependent shift operations.
- On big-endian early bytes are at MSB and on little-endian LSB.
- LS_FW means shifting towards early bytes.
- LS_BK means shifting towards later bytes.
- */
-#ifdef __AARCH64EB__
-#define LS_FW lsl
-#define LS_BK lsr
-#else
-#define LS_FW lsr
-#define LS_BK lsl
-#endif
ENTRY (__strncmp_aarch64)
PTR_ARG (0)
@@ -65,6 +51,9 @@ ENTRY (__strncmp_aarch64)
and count, src1, #7
b.ne L(misaligned8)
cbnz count, L(mutual_align)
+ /* Calculate the number of full and partial words -1. */
+ sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
+ lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */
/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
(=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
@@ -74,52 +63,56 @@ L(loop_aligned):
ldr data1, [src1], #8
ldr data2, [src2], #8
L(start_realigned):
- subs limit, limit, #8
+ subs limit_wd, limit_wd, #1
sub tmp1, data1, zeroones
orr tmp2, data1, #REP8_7f
eor diff, data1, data2 /* Non-zero if differences found. */
- csinv endloop, diff, xzr, hi /* Last Dword or differences. */
+ csinv endloop, diff, xzr, pl /* Last Dword or differences. */
bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
ccmp endloop, #0, #0, eq
b.eq L(loop_aligned)
/* End of main loop */
-L(full_check):
-#ifndef __AARCH64EB__
+ /* Not reached the limit, must have found the end or a diff. */
+ tbz limit_wd, #63, L(not_limit)
+
+ /* Limit % 8 == 0 => all bytes significant. */
+ ands limit, limit, #7
+ b.eq L(not_limit)
+
+ lsl limit, limit, #3 /* Bytes -> bits. */
+ mov mask, #~0
+#ifdef __AARCH64EB__
+ lsr mask, mask, limit
+#else
+ lsl mask, mask, limit
+#endif
+ bic data1, data1, mask
+ bic data2, data2, mask
+
+ /* Make sure that the NUL byte is marked in the syndrome. */
+ orr has_nul, has_nul, mask
+
+L(not_limit):
orr syndrome, diff, has_nul
- add limit, limit, 8 /* Rewind limit to before last subs. */
-L(syndrome_check):
- /* Limit was reached. Check if the NUL byte or the difference
- is before the limit. */
+
+#ifndef __AARCH64EB__
rev syndrome, syndrome
rev data1, data1
+ /* The MS-non-zero bit of the syndrome marks either the first bit
+ that is different, or the top bit of the first zero byte.
+ Shifting left now will bring the critical information into the
+ top bits. */
clz pos, syndrome
rev data2, data2
lsl data1, data1, pos
- cmp limit, pos, lsr #3
lsl data2, data2, pos
/* But we need to zero-extend (char is unsigned) the value and then
perform a signed 32-bit subtraction. */
lsr data1, data1, #56
sub result, data1, data2, lsr #56
- csel result, result, xzr, hi
ret
#else
- /* Not reached the limit, must have found the end or a diff. */
- tbz limit, #63, L(not_limit)
- add tmp1, limit, 8
- cbz limit, L(not_limit)
-
- lsl limit, tmp1, #3 /* Bits -> bytes. */
- mov mask, #~0
- lsr mask, mask, limit
- bic data1, data1, mask
- bic data2, data2, mask
-
- /* Make sure that the NUL byte is marked in the syndrome. */
- orr has_nul, has_nul, mask
-
-L(not_limit):
/* For big-endian we cannot use the trick with the syndrome value
as carry-propagation can corrupt the upper bits if the trailing
bytes in the string contain 0x01. */
@@ -140,11 +133,10 @@ L(not_limit):
rev has_nul, has_nul
orr syndrome, diff, has_nul
clz pos, syndrome
- /* The most-significant-non-zero bit of the syndrome marks either the
- first bit that is different, or the top bit of the first zero byte.
+ /* The MS-non-zero bit of the syndrome marks either the first bit
+ that is different, or the top bit of the first zero byte.
Shifting left now will bring the critical information into the
top bits. */
-L(end_quick):
lsl data1, data1, pos
lsl data2, data2, pos
/* But we need to zero-extend (char is unsigned) the value and then
@@ -166,12 +158,22 @@ L(mutual_align):
neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */
ldr data2, [src2], #8
mov tmp2, #~0
- LS_FW tmp2, tmp2, tmp3 /* Shift (count & 63). */
- /* Adjust the limit and ensure it doesn't overflow. */
- adds limit, limit, count
- csinv limit, limit, xzr, lo
+ sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
+#ifdef __AARCH64EB__
+ /* Big-endian. Early bytes are at MSB. */
+ lsl tmp2, tmp2, tmp3 /* Shift (count & 63). */
+#else
+ /* Little-endian. Early bytes are at LSB. */
+ lsr tmp2, tmp2, tmp3 /* Shift (count & 63). */
+#endif
+ and tmp3, limit_wd, #7
+ lsr limit_wd, limit_wd, #3
+ /* Adjust the limit. Only low 3 bits used, so overflow irrelevant. */
+ add limit, limit, count
+ add tmp3, tmp3, count
orr data1, data1, tmp2
orr data2, data2, tmp2
+ add limit_wd, limit_wd, tmp3, lsr #3
b L(start_realigned)
.p2align 4
@@ -194,11 +196,13 @@ L(done):
/* Align the SRC1 to a dword by doing a bytewise compare and then do
the dword loop. */
L(try_misaligned_words):
- cbz count, L(src1_aligned)
+ lsr limit_wd, limit, #3
+ cbz count, L(do_misaligned)
neg count, count
and count, count, #7
sub limit, limit, count
+ lsr limit_wd, limit, #3
L(page_end_loop):
ldrb data1w, [src1], #1
@@ -209,100 +213,48 @@ L(page_end_loop):
subs count, count, #1
b.hi L(page_end_loop)
- /* The following diagram explains the comparison of misaligned strings.
- The bytes are shown in natural order. For little-endian, it is
- reversed in the registers. The "x" bytes are before the string.
- The "|" separates data that is loaded at one time.
- src1 | a a a a a a a a | b b b c c c c c | . . .
- src2 | x x x x x a a a a a a a a b b b | c c c c c . . .
-
- After shifting in each step, the data looks like this:
- STEP_A STEP_B STEP_C
- data1 a a a a a a a a b b b c c c c c b b b c c c c c
- data2 a a a a a a a a b b b 0 0 0 0 0 0 0 0 c c c c c
-
- The bytes with "0" are eliminated from the syndrome via mask.
-
- Align SRC2 down to 16 bytes. This way we can read 16 bytes at a
- time from SRC2. The comparison happens in 3 steps. After each step
- the loop can exit, or read from SRC1 or SRC2. */
-L(src1_aligned):
- /* Calculate offset from 8 byte alignment to string start in bits. No
- need to mask offset since shifts are ignoring upper bits. */
- lsl offset, src2, #3
- bic src2, src2, #0xf
- mov mask, -1
- neg neg_offset, offset
- ldr data1, [src1], #8
- ldp tmp1, tmp2, [src2], #16
- LS_BK mask, mask, neg_offset
- and neg_offset, neg_offset, #63 /* Need actual value for cmp later. */
- /* Skip the first compare if data in tmp1 is irrelevant. */
- tbnz offset, 6, L(misaligned_mid_loop)
-
+L(do_misaligned):
+ /* Prepare ourselves for the next page crossing. Unlike the aligned
+ loop, we fetch 1 less dword because we risk crossing bounds on
+ SRC2. */
+ mov count, #8
+ subs limit_wd, limit_wd, #1
+ b.lo L(done_loop)
L(loop_misaligned):
- /* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/
- LS_FW data2, tmp1, offset
- LS_BK tmp1, tmp2, neg_offset
- subs limit, limit, #8
- orr data2, data2, tmp1 /* 8 bytes from SRC2 combined from two regs.*/
- sub has_nul, data1, zeroones
- eor diff, data1, data2 /* Non-zero if differences found. */
- orr tmp3, data1, #REP8_7f
- csinv endloop, diff, xzr, hi /* If limit, set to all ones. */
- bic has_nul, has_nul, tmp3 /* Non-zero if NUL byte found in SRC1. */
- orr tmp3, endloop, has_nul
- cbnz tmp3, L(full_check)
-
- ldr data1, [src1], #8
-L(misaligned_mid_loop):
- /* STEP_B: Compare first part of data1 to second part of tmp2. */
- LS_FW data2, tmp2, offset
-#ifdef __AARCH64EB__
- /* For big-endian we do a byte reverse to avoid carry-propagation
- problem described above. This way we can reuse the has_nul in the
- next step and also use syndrome value trick at the end. */
- rev tmp3, data1
- #define data1_fixed tmp3
-#else
- #define data1_fixed data1
-#endif
- sub has_nul, data1_fixed, zeroones
- orr tmp3, data1_fixed, #REP8_7f
- eor diff, data2, data1 /* Non-zero if differences found. */
- bic has_nul, has_nul, tmp3 /* Non-zero if NUL terminator. */
-#ifdef __AARCH64EB__
- rev has_nul, has_nul
-#endif
- cmp limit, neg_offset, lsr #3
- orr syndrome, diff, has_nul
- bic syndrome, syndrome, mask /* Ignore later bytes. */
- csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
- cbnz tmp3, L(syndrome_check)
-
- /* STEP_C: Compare second part of data1 to first part of tmp1. */
- ldp tmp1, tmp2, [src2], #16
- cmp limit, #8
- LS_BK data2, tmp1, neg_offset
- eor diff, data2, data1 /* Non-zero if differences found. */
- orr syndrome, diff, has_nul
- and syndrome, syndrome, mask /* Ignore earlier bytes. */
- csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
- cbnz tmp3, L(syndrome_check)
+ and tmp2, src2, #0xff8
+ eor tmp2, tmp2, #0xff8
+ cbz tmp2, L(page_end_loop)
ldr data1, [src1], #8
- sub limit, limit, #8
- b L(loop_misaligned)
+ ldr data2, [src2], #8
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ eor diff, data1, data2 /* Non-zero if differences found. */
+ bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
+ ccmp diff, #0, #0, eq
+ b.ne L(not_limit)
+ subs limit_wd, limit_wd, #1
+ b.pl L(loop_misaligned)
-#ifdef __AARCH64EB__
-L(syndrome_check):
- clz pos, syndrome
- cmp pos, limit, lsl #3
- b.lo L(end_quick)
-#endif
+L(done_loop):
+ /* No difference or NUL was found in the full words; check any
+    remaining bytes in the final partial word.  */
+ and limit, limit, #7
+ cbz limit, L(not_limit)
+ /* Read the last word. */
+ sub src1, src1, 8
+ sub src2, src2, 8
+ ldr data1, [src1, limit]
+ ldr data2, [src2, limit]
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ eor diff, data1, data2 /* Non-zero if differences found. */
+ bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
+ ccmp diff, #0, #0, eq
+ b.ne L(not_limit)
L(ret0):
mov result, #0
ret
-END(__strncmp_aarch64)
+
+END (__strncmp_aarch64)
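
/* A small standalone C demonstration (not part of the patch) of the
   NUL-detection identity that strncmp.S relies on: for a 64-bit word X,
   (X - REP8_01) & ~(X | REP8_7f) is non-zero iff some byte of X is zero,
   and it always sets the top bit of the lowest zero byte.  */
#include <assert.h>
#include <stdint.h>
#include <string.h>

#define REP8_01 0x0101010101010101ULL
#define REP8_7f 0x7f7f7f7f7f7f7f7fULL

static uint64_t
has_nul_byte (uint64_t x)
{
  /* Mirrors "sub tmp1, data1, zeroones; orr tmp2, data1, #REP8_7f;
     bics has_nul, tmp1, tmp2" in the assembly above.  */
  return (x - REP8_01) & ~(x | REP8_7f);
}

int
main (void)
{
  uint64_t w;
  memcpy (&w, "abcdefgh", 8);   /* No NUL among these 8 bytes.  */
  assert (has_nul_byte (w) == 0);
  memcpy (&w, "abc\0efgh", 8);  /* NUL in byte 3.  */
  assert (has_nul_byte (w) != 0);
  return 0;
}
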
diff --git a/string/aarch64/strnlen-sve.S b/string/aarch64/strnlen-sve.S
index 6c43dc427da7a9279ed400a6186bbd162cc10148..5b9ebf7763bc2491011641702eac4dbc32f45482 100644
--- a/string/aarch64/strnlen-sve.S
+++ b/string/aarch64/strnlen-sve.S
@@ -1,11 +1,11 @@
/*
* strnlen - calculate the length of a string with limit.
*
- * Copyright (c) 2019-2022, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * Copyright (c) 2019-2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
*/
-#include "asmdefs.h"
+#include "../asmdefs.h"
#if __ARM_FEATURE_SVE
/* Assumptions:
diff --git a/string/aarch64/strnlen.S b/string/aarch64/strnlen.S
index f2090a7485a5646dec85bfb0b4fce421471adb13..48d2495d2082be8318c88148eb21d00ee6f0b421 100644
--- a/string/aarch64/strnlen.S
+++ b/string/aarch64/strnlen.S
@@ -1,8 +1,8 @@
/*
* strnlen - calculate the length of a string with limit.
*
- * Copyright (c) 2020-2022, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
*/
/* Assumptions:
@@ -11,7 +11,7 @@
* MTE compatible.
*/
-#include "asmdefs.h"
+#include "../asmdefs.h"
#define srcin x0
#define cntin x1
@@ -20,30 +20,39 @@
#define src x2
#define synd x3
#define shift x4
+#define wtmp w4
#define tmp x4
#define cntrem x5
#define qdata q0
#define vdata v0
#define vhas_chr v1
-#define vend v2
-#define dend d2
+#define vrepmask v2
+#define vend v3
+#define dend d3
/*
Core algorithm:
- Process the string in 16-byte aligned chunks. Compute a 64-bit mask with
- four bits per byte using the shrn instruction. A count trailing zeros then
- identifies the first zero byte. */
+
+ For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
+ per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
+ requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
+ set likewise for odd bytes so that adjacent bytes can be merged. Since the
+ bits in the syndrome reflect the order in which things occur in the original
+ string, counting trailing zeros identifies exactly which byte matched. */
ENTRY (__strnlen_aarch64)
PTR_ARG (0)
SIZE_ARG (1)
bic src, srcin, 15
+ mov wtmp, 0xf00f
cbz cntin, L(nomatch)
- ld1 {vdata.16b}, [src]
+ ld1 {vdata.16b}, [src], 16
+ dup vrepmask.8h, wtmp
cmeq vhas_chr.16b, vdata.16b, 0
lsl shift, srcin, 2
- shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
+ and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
+ addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
fmov synd, dend
lsr synd, synd, shift
cbz synd, L(start_loop)
@@ -55,40 +64,37 @@ L(finish):
csel result, cntin, result, ls
ret
-L(nomatch):
- mov result, cntin
- ret
-
L(start_loop):
sub tmp, src, srcin
- add tmp, tmp, 17
subs cntrem, cntin, tmp
- b.lo L(nomatch)
+ b.ls L(nomatch)
/* Make sure that it won't overread by a 16-byte chunk */
- tbz cntrem, 4, L(loop32_2)
- sub src, src, 16
+ add tmp, cntrem, 15
+ tbnz tmp, 4, L(loop32_2)
+
.p2align 5
L(loop32):
- ldr qdata, [src, 32]!
+ ldr qdata, [src], 16
cmeq vhas_chr.16b, vdata.16b, 0
umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
fmov synd, dend
cbnz synd, L(end)
L(loop32_2):
- ldr qdata, [src, 16]
+ ldr qdata, [src], 16
subs cntrem, cntrem, 32
cmeq vhas_chr.16b, vdata.16b, 0
- b.lo L(end_2)
+ b.ls L(end)
umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
fmov synd, dend
cbz synd, L(loop32)
-L(end_2):
- add src, src, 16
+
L(end):
- shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
+ and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
+ addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ sub src, src, 16
+ mov synd, vend.d[0]
sub result, src, srcin
- fmov synd, dend
#ifndef __AARCH64EB__
rbit synd, synd
#endif
@@ -98,5 +104,9 @@ L(end):
csel result, cntin, result, ls
ret
+L(nomatch):
+ mov result, cntin
+ ret
+
END (__strnlen_aarch64)
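
/* A scalar C sketch (not part of the patch) of the syndrome scheme the
   strnlen comment above describes, simplified to a flat four bits per
   byte.  The property that matters is preserved: syndrome bits appear in
   string order, so counting trailing zeros and dividing by four recovers
   the index of the first NUL.  The NEON code builds the same kind of
   value with cmeq/and/addp.  */
#include <stdint.h>

static int
first_nul_index (const unsigned char *chunk)  /* 16 bytes.  */
{
  uint64_t synd = 0;
  for (int i = 0; i < 16; i++)
    if (chunk[i] == 0)
      synd |= 0xfULL << (4 * i);        /* Four syndrome bits per byte.  */
  if (synd == 0)
    return -1;                          /* No NUL in this chunk.  */
  return __builtin_ctzll (synd) / 4;    /* First match, in string order.  */
}
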
diff --git a/string/aarch64/strrchr-mte.S b/string/aarch64/strrchr-mte.S
index bb61ab9ad4e7c5d5966daa950d7ef2c2dec4726d..1e4fb1a68f7e8bc21a65f5925194f5d188d01e7c 100644
--- a/string/aarch64/strrchr-mte.S
+++ b/string/aarch64/strrchr-mte.S
@@ -1,8 +1,8 @@
/*
* strrchr - find last position of a character in a string.
*
- * Copyright (c) 2020-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
*/
/* Assumptions:
@@ -11,7 +11,7 @@
* MTE compatible.
*/
-#include "asmdefs.h"
+#include "../asmdefs.h"
#define srcin x0
#define chrin w1
@@ -19,6 +19,7 @@
#define src x2
#define tmp x3
+#define wtmp w3
#define synd x3
#define shift x4
#define src_match x4
@@ -30,6 +31,7 @@
#define vhas_nul v2
#define vhas_chr v3
#define vrepmask v4
+#define vrepmask2 v5
#define vend v5
#define dend d5
@@ -45,67 +47,55 @@ ENTRY (__strrchr_aarch64_mte)
PTR_ARG (0)
bic src, srcin, 15
dup vrepchr.16b, chrin
- movi vrepmask.16b, 0x33
- ld1 {vdata.16b}, [src]
+ mov wtmp, 0x3003
+ dup vrepmask.8h, wtmp
+ tst srcin, 15
+ beq L(loop1)
+
+ ld1 {vdata.16b}, [src], 16
cmeq vhas_nul.16b, vdata.16b, 0
cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ mov wtmp, 0xf00f
+ dup vrepmask2.8h, wtmp
bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b
- shrn vend.8b, vhas_nul.8h, 4
+ and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
+ addp vend.16b, vhas_nul.16b, vhas_nul.16b
lsl shift, srcin, 2
fmov synd, dend
lsr synd, synd, shift
lsl synd, synd, shift
ands nul_match, synd, 0xcccccccccccccccc
bne L(tail)
- cbnz synd, L(loop2_start)
+ cbnz synd, L(loop2)
- .p2align 4
+ .p2align 5
L(loop1):
- ldr q1, [src, 16]
- cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
- cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b
- umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
- fmov synd, dend
- cbnz synd, L(loop1_end)
- ldr q1, [src, 32]!
+ ld1 {vdata.16b}, [src], 16
cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b
umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
fmov synd, dend
cbz synd, L(loop1)
- sub src, src, 16
-L(loop1_end):
- add src, src, 16
+
cmeq vhas_nul.16b, vdata.16b, 0
-#ifdef __AARCH64EB__
- bif vhas_nul.16b, vhas_chr.16b, vrepmask.16b
- shrn vend.8b, vhas_nul.8h, 4
- fmov synd, dend
- rbit synd, synd
-#else
bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b
- shrn vend.8b, vhas_nul.8h, 4
+ bic vhas_nul.8h, 0x0f, lsl 8
+ addp vend.16b, vhas_nul.16b, vhas_nul.16b
fmov synd, dend
-#endif
ands nul_match, synd, 0xcccccccccccccccc
- beq L(loop2_start)
+ beq L(loop2)
+
L(tail):
sub nul_match, nul_match, 1
and chr_match, synd, 0x3333333333333333
ands chr_match, chr_match, nul_match
- add result, src, 15
+ sub result, src, 1
clz tmp, chr_match
sub result, result, tmp, lsr 2
csel result, result, xzr, ne
ret
.p2align 4
- nop
- nop
-L(loop2_start):
- add src, src, 16
- bic vrepmask.8h, 0xf0
-
L(loop2):
cmp synd, 0
csel src_match, src, src_match, ne
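
/* A C sketch (not part of the patch) of the L(tail) logic above.  One
   syndrome interleaves both events, four bits per byte: the bits kept by
   0x3333... mark character matches and the bits kept by 0xcccc... mark
   NUL bytes.  Subtracting 1 from the NUL mask yields a mask of everything
   below the first NUL bit, so "chr & (nul - 1)" keeps only matches that
   precede the terminator; the highest surviving bit is the last match.  */
#include <stdint.h>

static int
last_match_before_nul (uint64_t synd)
{
  uint64_t nul = synd & 0xccccccccccccccccULL;  /* NUL positions.  */
  uint64_t chr = synd & 0x3333333333333333ULL;  /* Match positions.  */
  if (nul != 0)
    chr &= nul - 1;            /* Drop matches at or after the NUL.  */
  if (chr == 0)
    return -1;                 /* No match before the terminator.  */
  /* clz finds the most significant, i.e. last, surviving match; each
     byte owns four syndrome bits, hence the division by four.  */
  return (63 - __builtin_clzll (chr)) / 4;
}
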
diff --git a/string/aarch64/strrchr-sve.S b/string/aarch64/strrchr-sve.S
index 825a7384cfc11831455e5544408e9d5faf8ce57f..d36d69af37fd71a23f656ae0c5bc87f719bd3073 100644
--- a/string/aarch64/strrchr-sve.S
+++ b/string/aarch64/strrchr-sve.S
@@ -1,11 +1,11 @@
/*
 * strrchr - find the last position of a character in a string
*
- * Copyright (c) 2019-2022, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * Copyright (c) 2019-2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
*/
-#include "asmdefs.h"
+#include "../asmdefs.h"
#if __ARM_FEATURE_SVE
/* Assumptions:
diff --git a/string/aarch64/strrchr.S b/string/aarch64/strrchr.S
index bf9cb297b6cb3f4bc539594edb7e4a6cddc96f20..56185ff534e3915d3ada2c025b2943489b9b2d7b 100644
--- a/string/aarch64/strrchr.S
+++ b/string/aarch64/strrchr.S
@@ -1,8 +1,8 @@
/*
* strrchr - find last position of a character in a string.
*
- * Copyright (c) 2014-2022, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * Copyright (c) 2014-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
*/
/* Assumptions:
@@ -11,7 +11,7 @@
* Neon Available.
*/
-#include "asmdefs.h"
+#include "../asmdefs.h"
/* Arguments and results. */
#define srcin x0
diff --git a/string/bench/memcpy.c b/string/bench/memcpy.c
index e070be586b528dc57d40f709e93ad2e10c34f053..d5d4ea7e0309a0a9e00dca54048cbb8dc7bb4c00 100644
--- a/string/bench/memcpy.c
+++ b/string/bench/memcpy.c
@@ -1,8 +1,8 @@
/*
* memcpy benchmark.
*
- * Copyright (c) 2020-2022, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
*/
#define _GNU_SOURCE
@@ -13,15 +13,14 @@
#include "stringlib.h"
#include "benchlib.h"
-#define ITERS 5000
+#define ITERS 5000
#define ITERS2 20000000
-#define ITERS3 200000
-#define NUM_TESTS 16384
-#define MIN_SIZE 32768
-#define MAX_SIZE (1024 * 1024)
+#define ITERS3 500000
+#define MAX_COPIES 8192
+#define SIZE (256*1024)
-static uint8_t a[MAX_SIZE + 4096 + 64] __attribute__((__aligned__(64)));
-static uint8_t b[MAX_SIZE + 4096 + 64] __attribute__((__aligned__(64)));
+static uint8_t a[SIZE + 4096] __attribute__((__aligned__(64)));
+static uint8_t b[SIZE + 4096] __attribute__((__aligned__(64)));
#define F(x) {#x, x},
@@ -31,21 +30,15 @@ static const struct fun
void *(*fun)(void *, const void *, size_t);
} funtab[] =
{
+ F(memcpy)
#if __aarch64__
F(__memcpy_aarch64)
# if __ARM_NEON
F(__memcpy_aarch64_simd)
# endif
-# if __ARM_FEATURE_SVE
- F(__memcpy_aarch64_sve)
-# endif
-# if WANT_MOPS
- F(__memcpy_aarch64_mops)
-# endif
#elif __arm__
F(__memcpy_arm)
#endif
- F(memcpy)
#undef F
{0, 0}
};
@@ -116,7 +109,7 @@ typedef struct
uint64_t len : 16;
} copy_t;
-static copy_t test_arr[NUM_TESTS];
+static copy_t copy[MAX_COPIES];
typedef char *(*proto_t) (char *, const char *, size_t);
@@ -147,14 +140,14 @@ init_copies (size_t max_size)
size_t total = 0;
/* Create a random set of copies with the given size and alignment
distributions. */
- for (int i = 0; i < NUM_TESTS; i++)
+ for (int i = 0; i < MAX_COPIES; i++)
{
- test_arr[i].dst = (rand32 (0) & (max_size - 1));
- test_arr[i].dst &= ~dst_align_arr[rand32 (0) & ALIGN_MASK];
- test_arr[i].src = (rand32 (0) & (max_size - 1));
- test_arr[i].src &= ~src_align_arr[rand32 (0) & ALIGN_MASK];
- test_arr[i].len = size_arr[rand32 (0) & SIZE_MASK];
- total += test_arr[i].len;
+ copy[i].dst = (rand32 (0) & (max_size - 1));
+ copy[i].dst &= ~dst_align_arr[rand32 (0) & ALIGN_MASK];
+ copy[i].src = (rand32 (0) & (max_size - 1));
+ copy[i].src &= ~src_align_arr[rand32 (0) & ALIGN_MASK];
+ copy[i].len = size_arr[rand32 (0) & SIZE_MASK];
+ total += copy[i].len;
}
return total;
@@ -167,27 +160,25 @@ int main (void)
memset (a, 1, sizeof (a));
memset (b, 2, sizeof (b));
- printf("Random memcpy (bytes/ns):\n");
+ printf("Random memcpy:\n");
for (int f = 0; funtab[f].name != 0; f++)
{
size_t total = 0;
uint64_t tsum = 0;
- printf ("%22s ", funtab[f].name);
+ printf ("%22s (B/ns) ", funtab[f].name);
rand32 (0x12345678);
- for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2)
+ for (int size = 16384; size <= SIZE; size *= 2)
{
size_t copy_size = init_copies (size) * ITERS;
- for (int c = 0; c < NUM_TESTS; c++)
- funtab[f].fun (b + test_arr[c].dst, a + test_arr[c].src,
- test_arr[c].len);
+ for (int c = 0; c < MAX_COPIES; c++)
+ funtab[f].fun (b + copy[c].dst, a + copy[c].src, copy[c].len);
uint64_t t = clock_get_ns ();
for (int i = 0; i < ITERS; i++)
- for (int c = 0; c < NUM_TESTS; c++)
- funtab[f].fun (b + test_arr[c].dst, a + test_arr[c].src,
- test_arr[c].len);
+ for (int c = 0; c < MAX_COPIES; c++)
+ funtab[f].fun (b + copy[c].dst, a + copy[c].src, copy[c].len);
t = clock_get_ns () - t;
total += copy_size;
tsum += t;
@@ -196,147 +187,74 @@ int main (void)
printf( "avg %.2f\n", (double)total / tsum);
}
- size_t total = 0;
- uint64_t tsum = 0;
- printf ("%22s ", "memcpy_call");
- rand32 (0x12345678);
-
- for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2)
- {
- size_t copy_size = init_copies (size) * ITERS;
-
- for (int c = 0; c < NUM_TESTS; c++)
- memcpy (b + test_arr[c].dst, a + test_arr[c].src, test_arr[c].len);
-
- uint64_t t = clock_get_ns ();
- for (int i = 0; i < ITERS; i++)
- for (int c = 0; c < NUM_TESTS; c++)
- memcpy (b + test_arr[c].dst, a + test_arr[c].src, test_arr[c].len);
- t = clock_get_ns () - t;
- total += copy_size;
- tsum += t;
- printf ("%dK: %.2f ", size / 1024, (double)copy_size / t);
- }
- printf( "avg %.2f\n", (double)total / tsum);
-
-
- printf ("\nAligned medium memcpy (bytes/ns):\n");
+ printf ("\nMedium memcpy:\n");
for (int f = 0; funtab[f].name != 0; f++)
{
- printf ("%22s ", funtab[f].name);
+ printf ("%22s (B/ns) ", funtab[f].name);
- for (int size = 8; size <= 512; size *= 2)
+ for (int size = 16; size <= 512; size *= 2)
{
uint64_t t = clock_get_ns ();
for (int i = 0; i < ITERS2; i++)
funtab[f].fun (b, a, size);
t = clock_get_ns () - t;
- printf ("%dB: %.2f ", size, (double)size * ITERS2 / t);
+ printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
+ size < 1024 ? 'B' : 'K', (double)size * ITERS2 / t);
}
printf ("\n");
}
- printf ("%22s ", "memcpy_call");
- for (int size = 8; size <= 512; size *= 2)
- {
- uint64_t t = clock_get_ns ();
- for (int i = 0; i < ITERS2; i++)
- memcpy (b, a, size);
- t = clock_get_ns () - t;
- printf ("%dB: %.2f ", size, (double)size * ITERS2 / t);
- }
- printf ("\n");
-
-
- printf ("\nUnaligned medium memcpy (bytes/ns):\n");
+ printf ("\nLarge memcpy:\n");
for (int f = 0; funtab[f].name != 0; f++)
{
- printf ("%22s ", funtab[f].name);
+ printf ("%22s (B/ns) ", funtab[f].name);
- for (int size = 8; size <= 512; size *= 2)
- {
- uint64_t t = clock_get_ns ();
- for (int i = 0; i < ITERS2; i++)
- funtab[f].fun (b + 3, a + 1, size);
- t = clock_get_ns () - t;
- printf ("%dB: %.2f ", size, (double)size * ITERS2 / t);
- }
- printf ("\n");
- }
-
- printf ("%22s ", "memcpy_call");
- for (int size = 8; size <= 512; size *= 2)
- {
- uint64_t t = clock_get_ns ();
- for (int i = 0; i < ITERS2; i++)
- memcpy (b + 3, a + 1, size);
- t = clock_get_ns () - t;
- printf ("%dB: %.2f ", size, (double)size * ITERS2 / t);
- }
- printf ("\n");
-
-
- printf ("\nLarge memcpy (bytes/ns):\n");
- for (int f = 0; funtab[f].name != 0; f++)
- {
- printf ("%22s ", funtab[f].name);
-
- for (int size = 1024; size <= 65536; size *= 2)
+ for (int size = 1024; size <= 32768; size *= 2)
{
uint64_t t = clock_get_ns ();
for (int i = 0; i < ITERS3; i++)
funtab[f].fun (b, a, size);
t = clock_get_ns () - t;
- printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t);
+ printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
+ size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t);
}
printf ("\n");
}
- printf ("%22s ", "memcpy_call");
- for (int size = 1024; size <= 65536; size *= 2)
- {
- uint64_t t = clock_get_ns ();
- for (int i = 0; i < ITERS3; i++)
- memcpy (b, a, size);
- t = clock_get_ns () - t;
- printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t);
- }
- printf ("\n");
-
-
- printf ("\nUnaligned forwards memmove (bytes/ns):\n");
+ printf ("\nUnaligned forwards memmove:\n");
for (int f = 0; funtab[f].name != 0; f++)
{
- printf ("%22s ", funtab[f].name);
+ printf ("%22s (B/ns) ", funtab[f].name);
- for (int size = 1024; size <= 65536; size *= 2)
+ for (int size = 1024; size <= 32768; size *= 2)
{
uint64_t t = clock_get_ns ();
for (int i = 0; i < ITERS3; i++)
funtab[f].fun (a, a + 256 + (i & 31), size);
t = clock_get_ns () - t;
- printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t);
+ printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
+ size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t);
}
printf ("\n");
}
- printf ("\nUnaligned backwards memmove (bytes/ns):\n");
+ printf ("\nUnaligned backwards memmove:\n");
for (int f = 0; funtab[f].name != 0; f++)
{
- printf ("%22s ", funtab[f].name);
+ printf ("%22s (B/ns) ", funtab[f].name);
- for (int size = 1024; size <= 65536; size *= 2)
+ for (int size = 1024; size <= 32768; size *= 2)
{
uint64_t t = clock_get_ns ();
for (int i = 0; i < ITERS3; i++)
funtab[f].fun (a + 256 + (i & 31), a, size);
t = clock_get_ns () - t;
- printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t);
+ printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
+ size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t);
}
printf ("\n");
}
- printf ("\n");
return 0;
}
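
/* A condensed C illustration (not part of the patch) of how the
   benchmark above randomizes one copy: pick an offset inside the working
   set, clear low bits to impose a sampled power-of-two alignment (the
   real code masks with dst_align_arr[], which stores align-1), and take
   a length drawn from the size distribution.  random_copy and its
   parameters are illustrative stand-ins.  */
#include <stdint.h>
#include <stdlib.h>

typedef struct { uint64_t dst : 24, src : 24, len : 16; } copy_t;

static copy_t
random_copy (size_t max_size, unsigned align, uint16_t len)
{
  copy_t c;
  c.dst = ((unsigned) rand () & (max_size - 1)) & ~(uint64_t) (align - 1);
  c.src = ((unsigned) rand () & (max_size - 1)) & ~(uint64_t) (align - 1);
  c.len = len;
  return c;
}
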
diff --git a/string/bench/memset.c b/string/bench/memset.c
deleted file mode 100644
index 990e23ba9a368bb28960d4211b1d3e3f4d96dee4..0000000000000000000000000000000000000000
--- a/string/bench/memset.c
+++ /dev/null
@@ -1,243 +0,0 @@
-/*
- * memset benchmark.
- *
- * Copyright (c) 2021, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#define _GNU_SOURCE
-#include <stdint.h>
-#include <stdio.h>
-#include <string.h>
-#include <assert.h>
-#include "stringlib.h"
-#include "benchlib.h"
-
-#define ITERS 5000
-#define ITERS2 20000000
-#define ITERS3 1000000
-#define NUM_TESTS 16384
-#define MIN_SIZE 32768
-#define MAX_SIZE (1024 * 1024)
-
-static uint8_t a[MAX_SIZE + 4096] __attribute__((__aligned__(64)));
-
-#define F(x) {#x, x},
-
-static const struct fun
-{
- const char *name;
- void *(*fun)(void *, int, size_t);
-} funtab[] =
-{
-#if __aarch64__
- F(__memset_aarch64)
-#elif __arm__
- F(__memset_arm)
-#endif
- F(memset)
-#undef F
- {0, 0}
-};
-
-typedef struct { uint32_t offset : 20, len : 12; } memset_test_t;
-static memset_test_t test_arr[NUM_TESTS];
-
-typedef struct { uint16_t size; uint16_t freq; } freq_data_t;
-typedef struct { uint8_t align; uint16_t freq; } align_data_t;
-
-#define SIZE_NUM 65536
-#define SIZE_MASK (SIZE_NUM-1)
-static uint8_t len_arr[SIZE_NUM];
-
-/* Frequency data for memset sizes up to 4096 based on SPEC2017. */
-static freq_data_t memset_len_freq[] =
-{
-{40,28817}, {32,15336}, { 16,3823}, {296,3545}, { 24,3454}, { 8,1412},
-{292,1202}, { 48, 927}, { 12, 613}, { 11, 539}, {284, 493}, {108, 414},
-{ 88, 380}, { 20, 295}, {312, 271}, { 72, 233}, { 2, 200}, { 4, 192},
-{ 15, 180}, { 14, 174}, { 13, 160}, { 56, 151}, { 36, 144}, { 64, 140},
-{4095,133}, { 10, 130}, { 9, 124}, { 3, 124}, { 28, 120}, { 0, 118},
-{288, 110}, {1152, 96}, {104, 90}, { 1, 86}, {832, 76}, {248, 74},
-{1024, 69}, {120, 64}, {512, 63}, {384, 60}, { 6, 59}, { 80, 54},
-{ 17, 50}, { 7, 49}, {520, 47}, {2048, 39}, {256, 37}, {864, 33},
-{1440, 28}, { 22, 27}, {2056, 24}, {260, 23}, { 68, 23}, { 5, 22},
-{ 18, 21}, {200, 18}, {2120, 18}, { 60, 17}, { 52, 16}, {336, 15},
-{ 44, 13}, {192, 13}, {160, 12}, {2064, 12}, {128, 12}, { 76, 11},
-{164, 11}, {152, 10}, {136, 9}, {488, 7}, { 96, 6}, {560, 6},
-{1016, 6}, {112, 5}, {232, 5}, {168, 5}, {952, 5}, {184, 5},
-{144, 4}, {252, 4}, { 84, 3}, {960, 3}, {3808, 3}, {244, 3},
-{280, 3}, {224, 3}, {156, 3}, {1088, 3}, {440, 3}, {216, 2},
-{304, 2}, { 23, 2}, { 25, 2}, { 26, 2}, {264, 2}, {328, 2},
-{1096, 2}, {240, 2}, {1104, 2}, {704, 2}, {1664, 2}, {360, 2},
-{808, 1}, {544, 1}, {236, 1}, {720, 1}, {368, 1}, {424, 1},
-{640, 1}, {1112, 1}, {552, 1}, {272, 1}, {776, 1}, {376, 1},
-{ 92, 1}, {536, 1}, {824, 1}, {496, 1}, {760, 1}, {792, 1},
-{504, 1}, {344, 1}, {1816, 1}, {880, 1}, {176, 1}, {320, 1},
-{352, 1}, {2008, 1}, {208, 1}, {408, 1}, {228, 1}, {2072, 1},
-{568, 1}, {220, 1}, {616, 1}, {600, 1}, {392, 1}, {696, 1},
-{2144, 1}, {1280, 1}, {2136, 1}, {632, 1}, {584, 1}, {456, 1},
-{472, 1}, {3440, 1}, {2088, 1}, {680, 1}, {2928, 1}, {212, 1},
-{648, 1}, {1752, 1}, {664, 1}, {3512, 1}, {1032, 1}, {528, 1},
-{4072, 1}, {204, 1}, {2880, 1}, {3392, 1}, {712, 1}, { 59, 1},
-{736, 1}, {592, 1}, {2520, 1}, {744, 1}, {196, 1}, {172, 1},
-{728, 1}, {2040, 1}, {1192, 1}, {3600, 1}, {0, 0}
-};
-
-#define ALIGN_NUM 1024
-#define ALIGN_MASK (ALIGN_NUM-1)
-static uint8_t align_arr[ALIGN_NUM];
-
-/* Alignment data for memset based on SPEC2017. */
-static align_data_t memset_align_freq[] =
-{
- {16, 338}, {8, 307}, {32, 148}, {64, 131}, {4, 72}, {1, 23}, {2, 5}, {0, 0}
-};
-
-static void
-init_memset_distribution (void)
-{
- int i, j, freq, size, n;
-
- for (n = i = 0; (freq = memset_len_freq[i].freq) != 0; i++)
- for (j = 0, size = memset_len_freq[i].size; j < freq; j++)
- len_arr[n++] = size;
- assert (n == SIZE_NUM);
-
- for (n = i = 0; (freq = memset_align_freq[i].freq) != 0; i++)
- for (j = 0, size = memset_align_freq[i].align; j < freq; j++)
- align_arr[n++] = size - 1;
- assert (n == ALIGN_NUM);
-}
-
-static size_t
-init_memset (size_t max_size)
-{
- size_t total = 0;
- /* Create a random set of memsets with the given size and alignment
- distributions. */
- for (int i = 0; i < NUM_TESTS; i++)
- {
- test_arr[i].offset = (rand32 (0) & (max_size - 1));
- test_arr[i].offset &= ~align_arr[rand32 (0) & ALIGN_MASK];
- test_arr[i].len = len_arr[rand32 (0) & SIZE_MASK];
- total += test_arr[i].len;
- }
-
- return total;
-}
-
-
-int main (void)
-{
- init_memset_distribution ();
-
- memset (a, 1, sizeof (a));
-
- printf("Random memset (bytes/ns):\n");
- for (int f = 0; funtab[f].name != 0; f++)
- {
- size_t total_size = 0;
- uint64_t tsum = 0;
- printf ("%22s ", funtab[f].name);
- rand32 (0x12345678);
-
- for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2)
- {
- size_t memset_size = init_memset (size) * ITERS;
-
- for (int c = 0; c < NUM_TESTS; c++)
- funtab[f].fun (a + test_arr[c].offset, 0, test_arr[c].len);
-
- uint64_t t = clock_get_ns ();
- for (int i = 0; i < ITERS; i++)
- for (int c = 0; c < NUM_TESTS; c++)
- funtab[f].fun (a + test_arr[c].offset, 0, test_arr[c].len);
- t = clock_get_ns () - t;
- total_size += memset_size;
- tsum += t;
- printf ("%dK: %.2f ", size / 1024, (double)memset_size / t);
- }
- printf( "avg %.2f\n", (double)total_size / tsum);
- }
-
- size_t total_size = 0;
- uint64_t tsum = 0;
- printf ("%22s ", "memset_call");
- rand32 (0x12345678);
-
- for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2)
- {
- size_t memset_size = init_memset (size) * ITERS;
-
- for (int c = 0; c < NUM_TESTS; c++)
- memset (a + test_arr[c].offset, 0, test_arr[c].len);
-
- uint64_t t = clock_get_ns ();
- for (int i = 0; i < ITERS; i++)
- for (int c = 0; c < NUM_TESTS; c++)
- memset (a + test_arr[c].offset, 0, test_arr[c].len);
- t = clock_get_ns () - t;
- total_size += memset_size;
- tsum += t;
- printf ("%dK: %.2f ", size / 1024, (double)memset_size / t);
- }
- printf( "avg %.2f\n", (double)total_size / tsum);
-
-
- printf ("\nMedium memset (bytes/ns):\n");
- for (int f = 0; funtab[f].name != 0; f++)
- {
- printf ("%22s ", funtab[f].name);
-
- for (int size = 8; size <= 512; size *= 2)
- {
- uint64_t t = clock_get_ns ();
- for (int i = 0; i < ITERS2; i++)
- funtab[f].fun (a, 0, size);
- t = clock_get_ns () - t;
- printf ("%dB: %.2f ", size, (double)size * ITERS2 / t);
- }
- printf ("\n");
- }
-
- printf ("%22s ", "memset_call");
- for (int size = 8; size <= 512; size *= 2)
- {
- uint64_t t = clock_get_ns ();
- for (int i = 0; i < ITERS2; i++)
- memset (a, 0, size);
- t = clock_get_ns () - t;
- printf ("%dB: %.2f ", size, (double)size * ITERS2 / t);
- }
-
-
- printf ("\nLarge memset (bytes/ns):\n");
- for (int f = 0; funtab[f].name != 0; f++)
- {
- printf ("%22s ", funtab[f].name);
-
- for (int size = 1024; size <= 65536; size *= 2)
- {
- uint64_t t = clock_get_ns ();
- for (int i = 0; i < ITERS3; i++)
- funtab[f].fun (a, 0, size);
- t = clock_get_ns () - t;
- printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t);
- }
- printf ("\n");
- }
-
- printf ("%22s ", "memset_call");
- for (int size = 1024; size <= 65536; size *= 2)
- {
- uint64_t t = clock_get_ns ();
- for (int i = 0; i < ITERS3; i++)
- memset (a, 0, size);
- t = clock_get_ns () - t;
- printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t);
- }
- printf ("\n\n");
-
- return 0;
-}
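
/* A standalone C sketch (not part of the patch) of the frequency-table
   expansion that the deleted init_memset_distribution performed: each
   {size, freq} pair is written freq times into a flat array, so indexing
   that array with a uniform random value samples sizes according to the
   SPEC2017-derived distribution.  The 3-entry table below is
   illustrative only.  */
#include <stdint.h>

typedef struct { uint16_t size; uint16_t freq; } freq_data_t;

static int
expand_freq_table (const freq_data_t *tab, uint16_t *out, int max)
{
  int n = 0;
  for (int i = 0; tab[i].freq != 0; i++)
    for (int j = 0; j < tab[i].freq && n < max; j++)
      out[n++] = tab[i].size;
  return n;
}

int
main (void)
{
  static const freq_data_t tab[] = { { 40, 5 }, { 32, 3 }, { 16, 1 }, { 0, 0 } };
  uint16_t slots[16];
  int n = expand_freq_table (tab, slots, 16);
  /* slots[rand () % n] would now return 40 with probability 5/9.  */
  return n == 9 ? 0 : 1;
}
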
diff --git a/string/bench/strlen.c b/string/bench/strlen.c
index f05d0d5b89e6f1c689d38ea45d2feefb99bf5f82..cc0f04bee5471a4c623e047f773bde10f0e8aac7 100644
--- a/string/bench/strlen.c
+++ b/string/bench/strlen.c
@@ -1,8 +1,8 @@
/*
* strlen benchmark.
*
- * Copyright (c) 2020-2021, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
*/
#define _GNU_SOURCE
@@ -13,10 +13,10 @@
#include "stringlib.h"
#include "benchlib.h"
-#define ITERS 5000
+#define ITERS 2000
#define ITERS2 20000000
#define ITERS3 2000000
-#define NUM_TESTS 16384
+#define NUM_STRLEN 16384
#define MAX_ALIGN 32
#define MAX_STRLEN 256
@@ -49,7 +49,7 @@ static const struct fun
};
#undef F
-static uint16_t strlen_tests[NUM_TESTS];
+static uint16_t strlen_tests[NUM_STRLEN];
typedef struct { uint16_t size; uint16_t freq; } freq_data_t;
typedef struct { uint8_t align; uint16_t freq; } align_data_t;
@@ -117,7 +117,7 @@ init_strlen_tests (void)
/* Create a random set of strlen input strings using the string length
and alignment distributions. */
- for (int n = 0; n < NUM_TESTS; n++)
+ for (int n = 0; n < NUM_STRLEN; n++)
{
int align = strlen_align_arr[rand32 (0) & ALIGN_MASK];
int exp_len = strlen_len_arr[rand32 (0) & SIZE_MASK];
@@ -141,14 +141,14 @@ int main (void)
size_t res = 0, strlen_size = 0, mask = maskv;
printf ("%22s ", funtab[f].name);
- for (int c = 0; c < NUM_TESTS; c++)
+ for (int c = 0; c < NUM_STRLEN; c++)
strlen_size += funtab[f].fun (a + strlen_tests[c]);
strlen_size *= ITERS;
/* Measure latency of strlen result with (res & mask). */
uint64_t t = clock_get_ns ();
for (int i = 0; i < ITERS; i++)
- for (int c = 0; c < NUM_TESTS; c++)
+ for (int c = 0; c < NUM_STRLEN; c++)
res = funtab[f].fun (a + strlen_tests[c] + (res & mask));
t = clock_get_ns () - t;
printf ("%.2f\n", (double)strlen_size / t);
diff --git a/string/include/benchlib.h b/string/include/benchlib.h
index f1bbea388cd217981dbf6513a1c0a1fadbc894bc..0f2ce2eb6bce2685432d4207f987f3896c4b8363 100644
--- a/string/include/benchlib.h
+++ b/string/include/benchlib.h
@@ -2,7 +2,7 @@
* Benchmark support functions.
*
* Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * SPDX-License-Identifier: MIT
*/
#include <stdint.h>
diff --git a/string/include/stringlib.h b/string/include/stringlib.h
index 650c52cbda786613bbd5daf64a827903b54bb3ba..378c3cd2d64590c05aa1cb80f6ba2559be017d2d 100644
--- a/string/include/stringlib.h
+++ b/string/include/stringlib.h
@@ -1,8 +1,8 @@
/*
* Public API.
*
- * Copyright (c) 2019-2022, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * Copyright (c) 2019-2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
*/
#include <stddef.h>
@@ -29,17 +29,19 @@ size_t __strlen_aarch64 (const char *);
size_t __strnlen_aarch64 (const char *, size_t);
int __strncmp_aarch64 (const char *, const char *, size_t);
void * __memchr_aarch64_mte (const void *, int, size_t);
+char *__strcpy_aarch64_mte (char *__restrict, const char *__restrict);
+char *__stpcpy_aarch64_mte (char *__restrict, const char *__restrict);
char *__strchr_aarch64_mte (const char *, int);
char *__strchrnul_aarch64_mte (const char *, int);
size_t __strlen_aarch64_mte (const char *);
char *__strrchr_aarch64_mte (const char *, int);
+int __strcmp_aarch64_mte (const char *, const char *);
+int __strncmp_aarch64_mte (const char *, const char *, size_t);
#if __ARM_NEON
void *__memcpy_aarch64_simd (void *__restrict, const void *__restrict, size_t);
void *__memmove_aarch64_simd (void *, const void *, size_t);
#endif
# if __ARM_FEATURE_SVE
-void *__memcpy_aarch64_sve (void *__restrict, const void *__restrict, size_t);
-void *__memmove_aarch64_sve (void *__restrict, const void *__restrict, size_t);
void *__memchr_aarch64_sve (const void *, int, size_t);
int __memcmp_aarch64_sve (const void *, const void *, size_t);
char *__strchr_aarch64_sve (const char *, int);
@@ -52,11 +54,6 @@ size_t __strlen_aarch64_sve (const char *);
size_t __strnlen_aarch64_sve (const char *, size_t);
int __strncmp_aarch64_sve (const char *, const char *, size_t);
# endif
-# if WANT_MOPS
-void *__memcpy_aarch64_mops (void *__restrict, const void *__restrict, size_t);
-void *__memmove_aarch64_mops (void *__restrict, const void *__restrict, size_t);
-void *__memset_aarch64_mops (void *, int, size_t);
-# endif
# if __ARM_FEATURE_MEMORY_TAGGING
void *__mtag_tag_region (void *, size_t);
void *__mtag_tag_zero_region (void *, size_t);
diff --git a/string/test/__mtag_tag_region.c b/string/test/__mtag_tag_region.c
index c45fa6662a77bbdab77fe6998ffb3830952016fa..d8c02d92d626a6e754b756cdcb17945e6a6a14ad 100644
--- a/string/test/__mtag_tag_region.c
+++ b/string/test/__mtag_tag_region.c
@@ -2,7 +2,7 @@
* __mtag_tag_region test.
*
* Copyright (c) 2021, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * SPDX-License-Identifier: MIT
*/
#if __ARM_FEATURE_MEMORY_TAGGING && WANT_MTE_TEST
diff --git a/string/test/__mtag_tag_zero_region.c b/string/test/__mtag_tag_zero_region.c
index a4a7861620d1f4db8eedc438cae77aa8145040d7..221c223a2f3105ab02c7b21b9560a81bddf4355d 100644
--- a/string/test/__mtag_tag_zero_region.c
+++ b/string/test/__mtag_tag_zero_region.c
@@ -2,7 +2,7 @@
* __mtag_tag_zero_region test.
*
* Copyright (c) 2021, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * SPDX-License-Identifier: MIT
*/
#if __ARM_FEATURE_MEMORY_TAGGING && WANT_MTE_TEST
diff --git a/string/test/memchr.c b/string/test/memchr.c
index c6a94481c0adbaeaf27b81c0d18643a25236f623..0ff77f5710bf2d413b5e1f9a4c5243e0fe945c2c 100644
--- a/string/test/memchr.c
+++ b/string/test/memchr.c
@@ -2,7 +2,7 @@
* memchr test.
*
* Copyright (c) 2019-2020, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * SPDX-License-Identifier: MIT
*/
#include
diff --git a/string/test/memcmp.c b/string/test/memcmp.c
index f9236b83a60d446315cbc5ddb27f03458d50b538..7a7cf9cff35af2c22248dfd21609b7e83af68976 100644
--- a/string/test/memcmp.c
+++ b/string/test/memcmp.c
@@ -2,7 +2,7 @@
* memcmp test.
*
* Copyright (c) 2019-2020, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * SPDX-License-Identifier: MIT
*/
#include
diff --git a/string/test/memcpy.c b/string/test/memcpy.c
index 0c2c75a29e2d45c13a6d900a4a8e21984266b2d8..ce0ceeef5ee844e5feadaf2cb18020436e1e9b12 100644
--- a/string/test/memcpy.c
+++ b/string/test/memcpy.c
@@ -1,8 +1,8 @@
/*
* memcpy test.
*
- * Copyright (c) 2019-2022, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * Copyright (c) 2019-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
*/
#include
@@ -28,12 +28,6 @@ static const struct fun
# if __ARM_NEON
F(__memcpy_aarch64_simd, 1)
# endif
-# if __ARM_FEATURE_SVE
- F(__memcpy_aarch64_sve, 1)
-# endif
-# if WANT_MOPS
- F(__memcpy_aarch64_mops, 1)
-# endif
#elif __arm__
F(__memcpy_arm, 0)
#endif
diff --git a/string/test/memmove.c b/string/test/memmove.c
index a5149d74465dad744ec85bee844f053b8727739c..689b68c98af264c8d5e485e7134a0f216fce555c 100644
--- a/string/test/memmove.c
+++ b/string/test/memmove.c
@@ -1,8 +1,8 @@
/*
* memmove test.
*
- * Copyright (c) 2019-2022, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * Copyright (c) 2019-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
*/
#include
@@ -28,12 +28,6 @@ static const struct fun
# if __ARM_NEON
F(__memmove_aarch64_simd, 1)
# endif
-# if __ARM_FEATURE_SVE
- F(__memmove_aarch64_sve, 1)
-# endif
-# if WANT_MOPS
- F(__memmove_aarch64_mops, 1)
-# endif
#endif
{0, 0, 0}
// clang-format on
diff --git a/string/test/memrchr.c b/string/test/memrchr.c
index 4171a56daefd6596cc453d075292960db6225d0f..adf96f049cc938ee48cf51c1a1fea94ac73af60a 100644
--- a/string/test/memrchr.c
+++ b/string/test/memrchr.c
@@ -2,7 +2,7 @@
* memchr test.
*
* Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * SPDX-License-Identifier: MIT
*/
#ifndef _GNU_SOURCE
diff --git a/string/test/memset.c b/string/test/memset.c
index 3489e2986a71c18e40d1a08d069664b0149de415..f1721442dbaf83f682859526632655c7ad65cd75 100644
--- a/string/test/memset.c
+++ b/string/test/memset.c
@@ -2,7 +2,7 @@
* memset test.
*
* Copyright (c) 2019-2020, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * SPDX-License-Identifier: MIT
*/
#include
@@ -25,9 +25,6 @@ static const struct fun
F(memset, 0)
#if __aarch64__
F(__memset_aarch64, 1)
-# if WANT_MOPS
- F(__memset_aarch64_mops, 1)
-# endif
#elif __arm__
F(__memset_arm, 0)
#endif
diff --git a/string/test/mte.h b/string/test/mte.h
index 40b0ecf6c194df67a51a14bbe6d3a262dc441590..e67cbd9d2d400ac1b6bbb4ce815073f483fdb20b 100644
--- a/string/test/mte.h
+++ b/string/test/mte.h
@@ -2,7 +2,7 @@
* Memory tagging testing code.
*
* Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * SPDX-License-Identifier: MIT
*/
#ifndef __TEST_MTE_H
diff --git a/string/test/stpcpy.c b/string/test/stpcpy.c
index 0300892a1f3ccaf0dc35ea0b3e85bca861ce99cc..1827e68c9a30e75b75e467968c75cca7e4f54dc8 100644
--- a/string/test/stpcpy.c
+++ b/string/test/stpcpy.c
@@ -1,8 +1,8 @@
/*
* stpcpy test.
*
- * Copyright (c) 2019-2022, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * Copyright (c) 2019-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
*/
#ifndef _GNU_SOURCE
@@ -28,7 +28,8 @@ static const struct fun
// clang-format off
F(stpcpy, 0)
#if __aarch64__
- F(__stpcpy_aarch64, 1)
+ F(__stpcpy_aarch64, 0)
+ F(__stpcpy_aarch64_mte, 1)
# if __ARM_FEATURE_SVE
F(__stpcpy_aarch64_sve, 1)
# endif
diff --git a/string/test/strchr.c b/string/test/strchr.c
index 66180acfb57c6b824bcd39b8e23bada7ab3904a7..f3ae982ef0adf0850986741f84e9f63d131d9cfe 100644
--- a/string/test/strchr.c
+++ b/string/test/strchr.c
@@ -2,7 +2,7 @@
* strchr test.
*
* Copyright (c) 2019-2020, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * SPDX-License-Identifier: MIT
*/
#include
diff --git a/string/test/strchrnul.c b/string/test/strchrnul.c
index aad0bf59da664e02495e82ab014b19fb81b3576b..6c30ab2123f16aac57b59896740e859018fb3bf0 100644
--- a/string/test/strchrnul.c
+++ b/string/test/strchrnul.c
@@ -2,7 +2,7 @@
* strchrnul test.
*
* Copyright (c) 2019-2020, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * SPDX-License-Identifier: MIT
*/
#ifndef _GNU_SOURCE
diff --git a/string/test/strcmp.c b/string/test/strcmp.c
index 4aa95f4f2f1dd6e00fc97082abf8994f5fce2643..d57b54ed50a8a5e8b742805444510ec98a62851d 100644
--- a/string/test/strcmp.c
+++ b/string/test/strcmp.c
@@ -1,8 +1,8 @@
/*
* strcmp test.
*
- * Copyright (c) 2019-2022, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * Copyright (c) 2019-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
*/
#include
@@ -24,7 +24,8 @@ static const struct fun
// clang-format off
F(strcmp, 0)
#if __aarch64__
- F(__strcmp_aarch64, 1)
+ F(__strcmp_aarch64, 0)
+ F(__strcmp_aarch64_mte, 1)
# if __ARM_FEATURE_SVE
F(__strcmp_aarch64_sve, 1)
# endif
diff --git a/string/test/strcpy.c b/string/test/strcpy.c
index af297f90396a95d6b88cbf0357aa1860d862f62c..e84cace9c8c610e6f03892be2eb8fc3c92d537ea 100644
--- a/string/test/strcpy.c
+++ b/string/test/strcpy.c
@@ -1,8 +1,8 @@
/*
* strcpy test.
*
- * Copyright (c) 2019-2022, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * Copyright (c) 2019-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
*/
#include
@@ -24,7 +24,8 @@ static const struct fun
// clang-format off
F(strcpy, 0)
#if __aarch64__
- F(__strcpy_aarch64, 1)
+ F(__strcpy_aarch64, 0)
+ F(__strcpy_aarch64_mte, 1)
# if __ARM_FEATURE_SVE
F(__strcpy_aarch64_sve, 1)
# endif
diff --git a/string/test/stringtest.h b/string/test/stringtest.h
index 6bb7e1fdfeca2d291cfba0d254d564ed3c51d57b..fe855fc217369099ab10af634f392517edf89f66 100644
--- a/string/test/stringtest.h
+++ b/string/test/stringtest.h
@@ -2,7 +2,7 @@
* Common string test code.
*
* Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * SPDX-License-Identifier: MIT
*/
#include
diff --git a/string/test/strlen.c b/string/test/strlen.c
index 47ef3dcf0ef0c94adf16d07d77c341038a125389..6278380f26df71b5742944cca66d4a7568957ea6 100644
--- a/string/test/strlen.c
+++ b/string/test/strlen.c
@@ -1,14 +1,15 @@
/*
* strlen test.
*
- * Copyright (c) 2019-2022, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * Copyright (c) 2019-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
*/
#include
#include
#include
#include
+#include
#include
#include "mte.h"
#include "stringlib.h"
diff --git a/string/test/strncmp.c b/string/test/strncmp.c
index 4bbab6f934509708d760b7cf99d8fbf8c57b21e7..018a8a431ab8ca55110b814e0e089fde6f199772 100644
--- a/string/test/strncmp.c
+++ b/string/test/strncmp.c
@@ -1,8 +1,8 @@
/*
* strncmp test.
*
- * Copyright (c) 2019-2022, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * Copyright (c) 2019-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
*/
#include
@@ -24,7 +24,8 @@ static const struct fun
// clang-format off
F(strncmp, 0)
#if __aarch64__
- F(__strncmp_aarch64, 1)
+ F(__strncmp_aarch64, 0)
+ F(__strncmp_aarch64_mte, 1)
# if __ARM_FEATURE_SVE
F(__strncmp_aarch64_sve, 1)
# endif
diff --git a/string/test/strnlen.c b/string/test/strnlen.c
index a800fd1993cdc21a9023fb5eabfb50781a2b9d70..0dea00eaf8e3dc41bc465aa201a312e3a85bf230 100644
--- a/string/test/strnlen.c
+++ b/string/test/strnlen.c
@@ -2,7 +2,7 @@
* strnlen test.
*
* Copyright (c) 2019-2020, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * SPDX-License-Identifier: MIT
*/
#ifndef _GNU_SOURCE
diff --git a/string/test/strrchr.c b/string/test/strrchr.c
index 580ca497f8a46b1ae92d1e3288b29d2d13178ccf..fedbdc52fcc1151ffbbd168ef3bd1cb42c700ff0 100644
--- a/string/test/strrchr.c
+++ b/string/test/strrchr.c
@@ -2,7 +2,7 @@
* strrchr test.
*
* Copyright (c) 2019-2021, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * SPDX-License-Identifier: MIT
*/
#include
diff --git a/string/x86_64/check-arch.S b/string/x86_64/check-arch.S
index 5afcf7b7ee548aa275f105f72714d390da4d076a..26ade0a0c7db635acdbb3bd9592fee3ce9ec540d 100644
--- a/string/x86_64/check-arch.S
+++ b/string/x86_64/check-arch.S
@@ -2,7 +2,7 @@
* check ARCH setting.
*
* Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ * SPDX-License-Identifier: MIT
*/
#if !__x86_64__