1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183
|
/*
* Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package compiler.loopopts.superword;
/*
* @test
* @bug 8342498
* @summary Test SuperWord, when it aligns to field-store, and the corresponding allocation is eliminated.
* @run driver compiler.loopopts.superword.TestEliminateAllocationWithCastP2XUse
* @run main/othervm -Xbatch
* -XX:-SplitIfBlocks -XX:LoopMaxUnroll=8
* -XX:+UnlockDiagnosticVMOptions -XX:DominatorSearchLimit=45
* compiler.loopopts.superword.TestEliminateAllocationWithCastP2XUse
*/
/**
* Regression test whose code shape is deliberately constructed so that SuperWord picks a
* field store into a non-escaping {@code B} allocation as its alignment reference, while that
* same allocation later becomes removable. See the "Summary" comment below for the intended
* sequence of compiler events.
*
* NOTE(review): the exact statement shapes and counts in this class (nine allocations, the
* chain of never-taken guards, nine field stores per loop iteration, the padding fields in
* {@code B}) appear to be tuned against the VM flags in the test header. Treat them as part of
* the reproducer and do not "simplify" them without re-checking that the test still triggers
* the intended compiler path.
*/
public class TestEliminateAllocationWithCastP2XUse {
/**
* Test entry point: allocates one 10,000-element byte array and calls {@link #test(byte[])}
* on it 10,000 times. The return value of {@code test} is ignored.
*
* NOTE(review): the repetition count is presumably there so that {@code test} gets
* JIT-compiled; this file does not assert anything about the results.
*
* @param args unused
*/
public static void main(String args[]) {
byte[] a = new byte[10_000];
for (int i = 0; i < 10000; i++) {
test(a);
}
}
// Summary:
// - Some B allocations are detected as NoEscape, but cannot be removed because of a field load.
// - The field loads cannot be LoadNode::split_through_phi because DominatorSearchLimit is too low
// for the dominates query to look through some IfNode / IfProj path.
// - We go into loop-opts.
// - In theory, the Stores of B::offset would be moved out of the loop. But we disable
// PhaseIdealLoop::try_move_store_after_loop by setting -XX:-SplitIfBlocks.
// - The field loads are folded away because of some MaxUnroll trick, where the val constant folds to 1.
// - SuperWord eventually kicks in, and vectorizes the array stores.
// - Since some vectorization has happened, SuperWord wants to align the main loop with a memory reference
// in the loop. The code here is not very smart, and just picks the memory reference that occurs the
// most often. But the B::offset stores occur more often than the array stores, and so we align to
// one of the B::offset stores. This inserts a CastP2X under the CheckCastPP of the B allocation.
// - Once loop opts is over, we eventually go into macro expansion.
// - During macro expansion, we now discover that the Allocations were marked NoEscape, and that by now
// there are no field loads any more: yay, we can remove the allocation!
// - ... except that there is the CastP2X from SuperWord alignment ...
// - The Allocation removal code wants to pattern match the CastP2X as part of a GC barrier, but then
// the pattern does not conform to the expectation - it is after all from SuperWord. This leads to
// an assert, and SIGSEGV in product, at least with G1GC.
/**
* The method under test. Functionally it increments every element of {@code a} by one and
* returns 0: {@code val} always ends up as 1, so none of the guards below throw and the
* final {@code val == 42} branch is never taken. Everything else (the allocations, the
* guards, the field stores) exists only to shape the compiler's intermediate representation.
*
* @param a array whose elements are each incremented once per call; must not be null
* @return always 0, because {@code val} is 1 when the final branch is evaluated
*/
public static long test(byte[] a) {
// Delay val == 1 until loop-opts, with MaxUnroll trick.
// (The loop assigns val = 1 for i == 0 and i == 2, so val is 1 afterwards.)
int val = 0;
for (int i = 0; i < 4; i++) {
if ((i % 2) == 0) {
val = 1;
}
}
// during loop opts, we learn val == 1
// But we don't know that during EscapeAnalysis (EA) yet.
// 9 Allocations, discovered as NoEscape during EA.
B b1 = new B();
B b2 = new B();
B b3 = new B();
B b4 = new B();
B b5 = new B();
B b6 = new B();
B b7 = new B();
B b8 = new B();
B b9 = new B();
// Some path of IfNode / IfProj.
// Only folds away once we know val == 1
// This delays the LoadNode::split_through_phi, because it needs a dominates call
// to succeed, but it cannot look through this path because we set -XX:DominatorSearchLimit=45
// i.e. just a little too low to be able to look through.
// Without the LoadNode::split_through_phi before the end of EA, the Allocation cannot yet be
// removed, due to a "Field load", i.e. that Load for B::offset.
// But later, this path can actually fold away, when we know that val == 1. At that point,
// also the Load from B::offset folds away because LoadNode::split_through_phi succeeds.
// At that point the B allocations have no Loads any more, and can be removed... but this only
// happens at macro expansion, after all loop opts.
//
// There are 32 guards below; none can fire at runtime since val is 1 here.
// NOTE(review): the number of guards is presumably matched to DominatorSearchLimit=45;
// adding or removing guards may stop the test from reproducing the issue.
if (val == 1010) { throw new RuntimeException("never"); }
if (val == 1020) { throw new RuntimeException("never"); }
if (val == 1030) { throw new RuntimeException("never"); }
if (val == 1040) { throw new RuntimeException("never"); }
if (val == 1060) { throw new RuntimeException("never"); }
if (val == 1070) { throw new RuntimeException("never"); }
if (val == 1080) { throw new RuntimeException("never"); }
if (val == 1090) { throw new RuntimeException("never"); }
if (val == 2010) { throw new RuntimeException("never"); }
if (val == 2020) { throw new RuntimeException("never"); }
if (val == 2030) { throw new RuntimeException("never"); }
if (val == 2040) { throw new RuntimeException("never"); }
if (val == 2060) { throw new RuntimeException("never"); }
if (val == 2070) { throw new RuntimeException("never"); }
if (val == 2080) { throw new RuntimeException("never"); }
if (val == 2090) { throw new RuntimeException("never"); }
if (val == 3010) { throw new RuntimeException("never"); }
if (val == 3020) { throw new RuntimeException("never"); }
if (val == 3030) { throw new RuntimeException("never"); }
if (val == 3040) { throw new RuntimeException("never"); }
if (val == 3060) { throw new RuntimeException("never"); }
if (val == 3070) { throw new RuntimeException("never"); }
if (val == 3080) { throw new RuntimeException("never"); }
if (val == 3090) { throw new RuntimeException("never"); }
if (val == 4010) { throw new RuntimeException("never"); }
if (val == 4020) { throw new RuntimeException("never"); }
if (val == 4030) { throw new RuntimeException("never"); }
if (val == 4040) { throw new RuntimeException("never"); }
if (val == 4060) { throw new RuntimeException("never"); }
if (val == 4070) { throw new RuntimeException("never"); }
if (val == 4080) { throw new RuntimeException("never"); }
if (val == 4090) { throw new RuntimeException("never"); }
// mulVal is multiplied by 3 every iteration and stored into each B::offset; its value is
// never observed by the caller (long overflow simply wraps).
long mulVal = 1;
for (int i = 0; i < a.length; i++) {
mulVal *= 3;
// We do some vector store, so that SuperWord succeeds, and creates the
// alignment code, which emits the CastP2X.
a[i]++;
// But we also have 9 Stores for the B::offset.
// SuperWord now sees more of these stores than of the array stores, and picks
// one of the B::offset stores as the alignment reference... creating a CastP2X
// for the CheckCastPP of the B allocation.
b1.offset = mulVal;
b2.offset = mulVal;
b3.offset = mulVal;
b4.offset = mulVal;
b5.offset = mulVal;
b6.offset = mulVal;
b7.offset = mulVal;
b8.offset = mulVal;
b9.offset = mulVal;
}
// This folds the loads away, once we know val == 1
// That happens during loop-opts, so after EA, but before macro expansion.
// (At runtime val is 1, so this branch is never taken and ret stays 0.)
long ret = 0;
if (val == 42) {
ret = b1.offset +
b2.offset +
b3.offset +
b4.offset +
b5.offset +
b6.offset +
b7.offset +
b8.offset +
b9.offset;
}
return ret;
}
/**
* Helper object that {@link #test(byte[])} allocates nine times. Only {@code offset} is ever
* read or written by this file; the six {@code pad} fields are declared before it purely to
* push {@code offset} further into the object.
*
* NOTE(review): the byte positions in the trailing comments are the original author's;
* actual field layout is decided by the JVM and cannot be confirmed from this file.
*/
static class B {
// Add padding so that the old SuperWord::can_create_pairs accepts the field store to B.offset
long pad1 = 0; // at 16
long pad2 = 0; // at 24
long pad3 = 0; // at 32
long pad4 = 0; // at 40
long pad5 = 0; // at 48
long pad6 = 0; // at 56
long offset = 0; // offset at 64 bytes
}
}
|