1 Star 0 Fork 140

lixing.loongson.cn/gcc

forked from src-openEuler/gcc 
加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
0049-Add-more-flexible-check-for-pointer-aliasing-during-.patch 9.89 KB
一键复制 编辑 原始数据 按行查看 历史
郑晨卉 提交于 2024-04-11 10:45 . [Sync] Sync patch from openeuler/gcc
From b5865aef36ebaac87ae30d51f08bfe081795ed67 Mon Sep 17 00:00:00 2001
From: Chernonog Viacheslav <[email protected]>
Date: Tue, 12 Mar 2024 23:30:56 +0800
Subject: [PATCH 17/18] Add more flexible check for pointer aliasing during
vectorization It takes minimum between number of iteration and segment length
it helps to speed up loops with small number of iterations when only tail can
be vectorized
---
gcc/params.opt | 5 ++
.../sve/var_stride_flexible_segment_len_1.c | 23 +++++++
gcc/tree-data-ref.cc | 67 +++++++++++++------
gcc/tree-data-ref.h | 11 ++-
gcc/tree-vect-data-refs.cc | 14 +++-
5 files changed, 95 insertions(+), 25 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/var_stride_flexible_segment_len_1.c
diff --git a/gcc/params.opt b/gcc/params.opt
index 6176d4790..7e5c119cf 100644
--- a/gcc/params.opt
+++ b/gcc/params.opt
@@ -1180,6 +1180,11 @@ Maximum number of loop peels to enhance alignment of data references in a loop.
Common Joined UInteger Var(param_vect_max_version_for_alias_checks) Init(10) Param Optimization
Bound on number of runtime checks inserted by the vectorizer's loop versioning for alias check.
+-param=vect-alias-flexible-segment-len=
+Common Joined UInteger Var(param_flexible_seg_len) Init(0) IntegerRange(0, 1) Param Optimization
+Use a minimum length of different segments. Currenlty the minimum between
+iteration number and vectorization length is chosen by this param.
+
-param=vect-max-version-for-alignment-checks=
Common Joined UInteger Var(param_vect_max_version_for_alignment_checks) Init(6) Param Optimization
Bound on number of runtime checks inserted by the vectorizer's loop versioning for alignment check.
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_flexible_segment_len_1.c b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_flexible_segment_len_1.c
new file mode 100644
index 000000000..894f075f3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_flexible_segment_len_1.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize --param=vect-alias-flexible-segment-len=1" } */
+
+#define TYPE int
+#define SIZE 257
+
+void __attribute__ ((weak))
+f (TYPE *x, TYPE *y, unsigned short n, long m __attribute__((unused)))
+{
+ for (int i = 0; i < SIZE; ++i)
+ x[i * n] += y[i * n];
+}
+
+/* { dg-final { scan-assembler {\tld1w\tz[0-9]+} } } */
+/* { dg-final { scan-assembler {\tst1w\tz[0-9]+} } } */
+/* { dg-final { scan-assembler {\tldr\tw[0-9]+} } } */
+/* { dg-final { scan-assembler {\tstr\tw[0-9]+} } } */
+/* Should use a WAR check that multiplies by (VF-2)*4 rather than
+ an overlap check that multiplies by (257-1)*4. */
+/* { dg-final { scan-assembler {\tcntb\t(x[0-9]+)\n.*\tsub\tx[0-9]+, \1, #8\n.*\tmul\tx[0-9]+,[^\n]*\1} } } */
+/* One range check and a check for n being zero. */
+/* { dg-final { scan-assembler-times {\t(?:cmp|tst)\t} 2 } } */
+/* { dg-final { scan-assembler-times {\tccmp\t} 1 } } */
diff --git a/gcc/tree-data-ref.cc b/gcc/tree-data-ref.cc
index 397792c35..e6ae9e847 100644
--- a/gcc/tree-data-ref.cc
+++ b/gcc/tree-data-ref.cc
@@ -2329,31 +2329,15 @@ create_intersect_range_checks_index (class loop *loop, tree *cond_expr,
same arguments. Try to optimize cases in which the second access
is a write and in which some overlap is valid. */
-static bool
-create_waw_or_war_checks (tree *cond_expr,
+static void
+create_waw_or_war_checks2 (tree *cond_expr, tree seg_len_a,
const dr_with_seg_len_pair_t &alias_pair)
{
const dr_with_seg_len& dr_a = alias_pair.first;
const dr_with_seg_len& dr_b = alias_pair.second;
- /* Check for cases in which:
-
- (a) DR_B is always a write;
- (b) the accesses are well-ordered in both the original and new code
- (see the comment above the DR_ALIAS_* flags for details); and
- (c) the DR_STEPs describe all access pairs covered by ALIAS_PAIR. */
- if (alias_pair.flags & ~(DR_ALIAS_WAR | DR_ALIAS_WAW))
- return false;
-
- /* Check for equal (but possibly variable) steps. */
tree step = DR_STEP (dr_a.dr);
- if (!operand_equal_p (step, DR_STEP (dr_b.dr)))
- return false;
-
- /* Make sure that we can operate on sizetype without loss of precision. */
tree addr_type = TREE_TYPE (DR_BASE_ADDRESS (dr_a.dr));
- if (TYPE_PRECISION (addr_type) != TYPE_PRECISION (sizetype))
- return false;
/* All addresses involved are known to have a common alignment ALIGN.
We can therefore subtract ALIGN from an exclusive endpoint to get
@@ -2370,9 +2354,6 @@ create_waw_or_war_checks (tree *cond_expr,
fold_convert (ssizetype, indicator),
ssize_int (0));
- /* Get lengths in sizetype. */
- tree seg_len_a
- = fold_convert (sizetype, rewrite_to_non_trapping_overflow (dr_a.seg_len));
step = fold_convert (sizetype, rewrite_to_non_trapping_overflow (step));
/* Each access has the following pattern:
@@ -2479,6 +2460,50 @@ create_waw_or_war_checks (tree *cond_expr,
*cond_expr = fold_build2 (GT_EXPR, boolean_type_node, subject, limit);
if (dump_enabled_p ())
dump_printf (MSG_NOTE, "using an address-based WAR/WAW test\n");
+}
+
+/* This is a wrapper function for create_waw_or_war_checks2. */
+static bool
+create_waw_or_war_checks (tree *cond_expr,
+ const dr_with_seg_len_pair_t &alias_pair)
+{
+ const dr_with_seg_len& dr_a = alias_pair.first;
+ const dr_with_seg_len& dr_b = alias_pair.second;
+
+ /* Check for cases in which:
+
+ (a) DR_B is always a write;
+ (b) the accesses are well-ordered in both the original and new code
+ (see the comment above the DR_ALIAS_* flags for details); and
+ (c) the DR_STEPs describe all access pairs covered by ALIAS_PAIR. */
+ if (alias_pair.flags & ~(DR_ALIAS_WAR | DR_ALIAS_WAW))
+ return false;
+
+ /* Check for equal (but possibly variable) steps. */
+ tree step = DR_STEP (dr_a.dr);
+ if (!operand_equal_p (step, DR_STEP (dr_b.dr)))
+ return false;
+
+ /* Make sure that we can operate on sizetype without loss of precision. */
+ tree addr_type = TREE_TYPE (DR_BASE_ADDRESS (dr_a.dr));
+ if (TYPE_PRECISION (addr_type) != TYPE_PRECISION (sizetype))
+ return false;
+
+ /* Get lengths in sizetype. */
+ tree seg_len_a
+ = fold_convert (sizetype,
+ rewrite_to_non_trapping_overflow (dr_a.seg_len));
+ create_waw_or_war_checks2 (cond_expr, seg_len_a, alias_pair);
+ if (param_flexible_seg_len && dr_a.seg_len != dr_a.seg_len2)
+ {
+ tree seg_len2_a
+ = fold_convert (sizetype,
+ rewrite_to_non_trapping_overflow (dr_a.seg_len2));
+ tree cond_expr2;
+ create_waw_or_war_checks2 (&cond_expr2, seg_len2_a, alias_pair);
+ *cond_expr = fold_build2 (TRUTH_OR_EXPR, boolean_type_node,
+ *cond_expr, cond_expr2);
+ }
return true;
}
diff --git a/gcc/tree-data-ref.h b/gcc/tree-data-ref.h
index f643a95b2..9bc5f16ee 100644
--- a/gcc/tree-data-ref.h
+++ b/gcc/tree-data-ref.h
@@ -213,12 +213,19 @@ class dr_with_seg_len
public:
dr_with_seg_len (data_reference_p d, tree len, unsigned HOST_WIDE_INT size,
unsigned int a)
- : dr (d), seg_len (len), access_size (size), align (a) {}
-
+ : dr (d), seg_len (len), seg_len2 (len), access_size (size), align (a)
+ {}
+ dr_with_seg_len (data_reference_p d, tree len, tree len2,
+ unsigned HOST_WIDE_INT size, unsigned int a)
+ : dr (d), seg_len (len), seg_len2 (len2), access_size (size), align (a)
+ {}
data_reference_p dr;
/* The offset of the last access that needs to be checked minus
the offset of the first. */
tree seg_len;
+ /* The second version of segment length. Currently this is used to
+ soften checks for a small number of iterations. */
+ tree seg_len2;
/* A value that, when added to abs (SEG_LEN), gives the total number of
bytes in the segment. */
poly_uint64 access_size;
diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
index 4e615b80b..04e68f621 100644
--- a/gcc/tree-vect-data-refs.cc
+++ b/gcc/tree-vect-data-refs.cc
@@ -3646,6 +3646,7 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
{
poly_uint64 lower_bound;
tree segment_length_a, segment_length_b;
+ tree segment_length2_a, segment_length2_b;
unsigned HOST_WIDE_INT access_size_a, access_size_b;
unsigned int align_a, align_b;
@@ -3751,6 +3752,8 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
{
segment_length_a = size_zero_node;
segment_length_b = size_zero_node;
+ segment_length2_a = size_zero_node;
+ segment_length2_b = size_zero_node;
}
else
{
@@ -3759,8 +3762,15 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
length_factor = scalar_loop_iters;
else
length_factor = size_int (vect_factor);
+ /* In any case we should rememeber scalar_loop_iters
+ this helps to create flexible aliasing check
+ for small number of iterations. */
segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
+ segment_length2_a
+ = vect_vfa_segment_size (dr_info_a, scalar_loop_iters);
+ segment_length2_b
+ = vect_vfa_segment_size (dr_info_b, scalar_loop_iters);
}
access_size_a = vect_vfa_access_size (loop_vinfo, dr_info_a);
access_size_b = vect_vfa_access_size (loop_vinfo, dr_info_b);
@@ -3805,9 +3815,9 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
}
dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
- access_size_a, align_a);
+ segment_length2_a, access_size_a, align_a);
dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
- access_size_b, align_b);
+ segment_length2_b, access_size_b, align_b);
/* Canonicalize the order to be the one that's needed for accurate
RAW, WAR and WAW flags, in cases where the data references are
well-ordered. The order doesn't really matter otherwise,
--
2.33.0
Loading...
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
1
https://gitee.com/lixing-loongson-cn/gcc.git
[email protected]:lixing-loongson-cn/gcc.git
lixing-loongson-cn
gcc
gcc
master

搜索帮助