| 1 | /* Vectorizer Specific Loop Manipulations |
| 2 | Copyright (C) 2003-2026 Free Software Foundation, Inc. |
| 3 | Contributed by Dorit Naishlos <dorit@il.ibm.com> |
| 4 | and Ira Rosen <irar@il.ibm.com> |
| 5 | |
| 6 | This file is part of GCC. |
| 7 | |
| 8 | GCC is free software; you can redistribute it and/or modify it under |
| 9 | the terms of the GNU General Public License as published by the Free |
| 10 | Software Foundation; either version 3, or (at your option) any later |
| 11 | version. |
| 12 | |
| 13 | GCC is distributed in the hope that it will be useful, but WITHOUT ANY |
| 14 | WARRANTY; without even the implied warranty of MERCHANTABILITY or |
| 15 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
| 16 | for more details. |
| 17 | |
| 18 | You should have received a copy of the GNU General Public License |
| 19 | along with GCC; see the file COPYING3. If not see |
| 20 | <http://www.gnu.org/licenses/>. */ |
| 21 | |
| 22 | #include "config.h" |
| 23 | #include "system.h" |
| 24 | #include "coretypes.h" |
| 25 | #include "backend.h" |
| 26 | #include "tree.h" |
| 27 | #include "gimple.h" |
| 28 | #include "cfghooks.h" |
| 29 | #include "tree-pass.h" |
| 30 | #include "ssa.h" |
| 31 | #include "fold-const.h" |
| 32 | #include "cfganal.h" |
| 33 | #include "gimplify.h" |
| 34 | #include "gimple-iterator.h" |
| 35 | #include "gimplify-me.h" |
| 36 | #include "tree-cfg.h" |
| 37 | #include "tree-ssa-loop-manip.h" |
| 38 | #include "tree-into-ssa.h" |
| 39 | #include "tree-ssa.h" |
| 40 | #include "cfgloop.h" |
| 41 | #include "tree-scalar-evolution.h" |
| 42 | #include "tree-vectorizer.h" |
| 43 | #include "tree-ssa-loop-ivopts.h" |
| 44 | #include "gimple-fold.h" |
| 45 | #include "tree-ssa-loop-niter.h" |
| 46 | #include "internal-fn.h" |
| 47 | #include "stor-layout.h" |
| 48 | #include "optabs-query.h" |
| 49 | #include "vec-perm-indices.h" |
| 50 | #include "insn-config.h" |
| 51 | #include "rtl.h" |
| 52 | #include "recog.h" |
| 53 | #include "langhooks.h" |
| 54 | #include "tree-vector-builder.h" |
| 55 | #include "optabs-tree.h" |
| 56 | #include "hierarchical_discriminator.h" |
| 57 | |
| 58 | |
| 59 | /************************************************************************* |
| 60 | Simple Loop Peeling Utilities |
| 61 | |
| 62 | Utilities to support loop peeling for vectorization purposes. |
| 63 | *************************************************************************/ |
| 64 | |
| 65 | |
| 66 | /* Renames the use *OP_P. */ |
| 67 | |
| 68 | static void |
| 69 | rename_use_op (use_operand_p op_p) |
| 70 | { |
| 71 | tree new_name; |
| 72 | |
| 73 | if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME) |
| 74 | return; |
| 75 | |
| 76 | new_name = get_current_def (USE_FROM_PTR (op_p)); |
| 77 | |
| 78 | /* Something defined outside of the loop. */ |
| 79 | if (!new_name) |
| 80 | return; |
| 81 | |
| 82 | /* An ordinary ssa name defined in the loop. */ |
| 83 | |
| 84 | SET_USE (op_p, new_name); |
| 85 | } |
| 86 | |
| 87 | |
| 88 | /* Renames the variables in basic block BB. Allow renaming of PHI arguments |
| 89 | on edges incoming from outer-block header if RENAME_FROM_OUTER_LOOP is |
| 90 | true. */ |
| 91 | |
| 92 | static void |
| 93 | rename_variables_in_bb (basic_block bb, bool rename_from_outer_loop) |
| 94 | { |
| 95 | gimple *stmt; |
| 96 | use_operand_p use_p; |
| 97 | ssa_op_iter iter; |
| 98 | edge e; |
| 99 | edge_iterator ei; |
| 100 | class loop *loop = bb->loop_father; |
| 101 | class loop *outer_loop = NULL; |
| 102 | |
| 103 | if (rename_from_outer_loop) |
| 104 | { |
| 105 | gcc_assert (loop); |
| 106 | outer_loop = loop_outer (loop); |
| 107 | } |
| 108 | |
| 109 | for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (i: gsi); |
| 110 | gsi_next (i: &gsi)) |
| 111 | { |
| 112 | stmt = gsi_stmt (i: gsi); |
| 113 | FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES) |
| 114 | rename_use_op (op_p: use_p); |
| 115 | } |
| 116 | |
| 117 | FOR_EACH_EDGE (e, ei, bb->preds) |
| 118 | { |
| 119 | if (!flow_bb_inside_loop_p (loop, e->src)) |
| 120 | { |
| 121 | if (!rename_from_outer_loop) |
| 122 | continue; |
| 123 | if (e->src != outer_loop->header) |
| 124 | { |
| 125 | if (outer_loop->inner->next) |
| 126 | { |
| 127 | /* If outer_loop has 2 inner loops, allow there to |
| 128 | be an extra basic block which decides which of the |
| 129 | two loops to use using LOOP_VECTORIZED. */ |
| 130 | if (!single_pred_p (bb: e->src) |
| 131 | || single_pred (bb: e->src) != outer_loop->header) |
| 132 | continue; |
| 133 | } |
| 134 | } |
| 135 | } |
| 136 | for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (i: gsi); |
| 137 | gsi_next (i: &gsi)) |
| 138 | rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e)); |
| 139 | } |
| 140 | } |
| 141 | |
| 142 | |
/* A scheduled replacement of debug-stmt references: uses of FROM that are
   dominated by BB (and not by FROM's definition) should be rewritten to TO.
   Processed by adjust_debug_stmts_now.  */
struct adjust_info
{
  tree from, to;     /* SSA name to replace and its loop-closed substitute.  */
  basic_block bb;    /* Block whose dominated debug uses are rewritten.  */
};
| 148 | |
| 149 | /* A stack of values to be adjusted in debug stmts. We have to |
| 150 | process them LIFO, so that the closest substitution applies. If we |
| 151 | processed them FIFO, without the stack, we might substitute uses |
| 152 | with a PHI DEF that would soon become non-dominant, and when we got |
| 153 | to the suitable one, it wouldn't have anything to substitute any |
| 154 | more. */ |
| 155 | static vec<adjust_info, va_heap> adjust_vec; |
| 156 | |
| 157 | /* Adjust any debug stmts that referenced AI->from values to use the |
| 158 | loop-closed AI->to, if the references are dominated by AI->bb and |
| 159 | not by the definition of AI->from. */ |
| 160 | |
| 161 | static void |
| 162 | adjust_debug_stmts_now (adjust_info *ai) |
| 163 | { |
| 164 | basic_block bbphi = ai->bb; |
| 165 | tree orig_def = ai->from; |
| 166 | tree new_def = ai->to; |
| 167 | imm_use_iterator imm_iter; |
| 168 | gimple *stmt; |
| 169 | basic_block bbdef = gimple_bb (SSA_NAME_DEF_STMT (orig_def)); |
| 170 | |
| 171 | gcc_assert (dom_info_available_p (CDI_DOMINATORS)); |
| 172 | |
| 173 | /* Adjust any debug stmts that held onto non-loop-closed |
| 174 | references. */ |
| 175 | FOR_EACH_IMM_USE_STMT (stmt, imm_iter, orig_def) |
| 176 | { |
| 177 | use_operand_p use_p; |
| 178 | basic_block bbuse; |
| 179 | |
| 180 | if (!is_gimple_debug (gs: stmt)) |
| 181 | continue; |
| 182 | |
| 183 | gcc_assert (gimple_debug_bind_p (stmt)); |
| 184 | |
| 185 | bbuse = gimple_bb (g: stmt); |
| 186 | |
| 187 | if ((bbuse == bbphi |
| 188 | || dominated_by_p (CDI_DOMINATORS, bbuse, bbphi)) |
| 189 | && !(bbuse == bbdef |
| 190 | || dominated_by_p (CDI_DOMINATORS, bbuse, bbdef))) |
| 191 | { |
| 192 | if (new_def) |
| 193 | FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter) |
| 194 | SET_USE (use_p, new_def); |
| 195 | else |
| 196 | { |
| 197 | gimple_debug_bind_reset_value (dbg: stmt); |
| 198 | update_stmt (s: stmt); |
| 199 | } |
| 200 | } |
| 201 | } |
| 202 | } |
| 203 | |
| 204 | /* Adjust debug stmts as scheduled before. */ |
| 205 | |
| 206 | static void |
| 207 | adjust_vec_debug_stmts (void) |
| 208 | { |
| 209 | if (!MAY_HAVE_DEBUG_BIND_STMTS) |
| 210 | return; |
| 211 | |
| 212 | gcc_assert (adjust_vec.exists ()); |
| 213 | |
| 214 | while (!adjust_vec.is_empty ()) |
| 215 | { |
| 216 | adjust_debug_stmts_now (ai: &adjust_vec.last ()); |
| 217 | adjust_vec.pop (); |
| 218 | } |
| 219 | } |
| 220 | |
| 221 | /* Adjust any debug stmts that referenced FROM values to use the |
| 222 | loop-closed TO, if the references are dominated by BB and not by |
| 223 | the definition of FROM. If adjust_vec is non-NULL, adjustments |
| 224 | will be postponed until adjust_vec_debug_stmts is called. */ |
| 225 | |
| 226 | static void |
| 227 | adjust_debug_stmts (tree from, tree to, basic_block bb) |
| 228 | { |
| 229 | adjust_info ai; |
| 230 | |
| 231 | if (MAY_HAVE_DEBUG_BIND_STMTS |
| 232 | && TREE_CODE (from) == SSA_NAME |
| 233 | && ! SSA_NAME_IS_DEFAULT_DEF (from) |
| 234 | && ! virtual_operand_p (op: from)) |
| 235 | { |
| 236 | ai.from = from; |
| 237 | ai.to = to; |
| 238 | ai.bb = bb; |
| 239 | |
| 240 | if (adjust_vec.exists ()) |
| 241 | adjust_vec.safe_push (obj: ai); |
| 242 | else |
| 243 | adjust_debug_stmts_now (ai: &ai); |
| 244 | } |
| 245 | } |
| 246 | |
| 247 | /* Change E's phi arg in UPDATE_PHI to NEW_DEF, and record information |
| 248 | to adjust any debug stmts that referenced the old phi arg, |
| 249 | presumably non-loop-closed references left over from other |
| 250 | transformations. */ |
| 251 | |
| 252 | static void |
| 253 | adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def) |
| 254 | { |
| 255 | tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e); |
| 256 | |
| 257 | gcc_assert (TREE_CODE (orig_def) != SSA_NAME |
| 258 | || orig_def != new_def); |
| 259 | |
| 260 | SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def); |
| 261 | |
| 262 | if (MAY_HAVE_DEBUG_BIND_STMTS) |
| 263 | adjust_debug_stmts (from: orig_def, PHI_RESULT (update_phi), |
| 264 | bb: gimple_bb (g: update_phi)); |
| 265 | } |
| 266 | |
| 267 | /* Define one loop rgroup control CTRL from loop LOOP. INIT_CTRL is the value |
| 268 | that the control should have during the first iteration and NEXT_CTRL is the |
| 269 | value that it should have on subsequent iterations. */ |
| 270 | |
| 271 | static void |
| 272 | vect_set_loop_control (class loop *loop, tree ctrl, tree init_ctrl, |
| 273 | tree next_ctrl) |
| 274 | { |
| 275 | gphi *phi = create_phi_node (ctrl, loop->header); |
| 276 | add_phi_arg (phi, init_ctrl, loop_preheader_edge (loop), UNKNOWN_LOCATION); |
| 277 | add_phi_arg (phi, next_ctrl, loop_latch_edge (loop), UNKNOWN_LOCATION); |
| 278 | } |
| 279 | |
| 280 | /* Add SEQ to the end of LOOP's preheader block. */ |
| 281 | |
| 282 | static void |
| 283 | (class loop *loop, gimple_seq seq) |
| 284 | { |
| 285 | if (seq) |
| 286 | { |
| 287 | edge pe = loop_preheader_edge (loop); |
| 288 | basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq); |
| 289 | gcc_assert (!new_bb); |
| 290 | } |
| 291 | } |
| 292 | |
| 293 | /* Add SEQ to the beginning of LOOP's header block. */ |
| 294 | |
| 295 | static void |
| 296 | (class loop *loop, gimple_seq seq) |
| 297 | { |
| 298 | if (seq) |
| 299 | { |
| 300 | gimple_stmt_iterator gsi = gsi_after_labels (bb: loop->header); |
| 301 | gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT); |
| 302 | } |
| 303 | } |
| 304 | |
| 305 | /* Return true if the target can interleave elements of two vectors. |
| 306 | OFFSET is 0 if the first half of the vectors should be interleaved |
| 307 | or 1 if the second half should. When returning true, store the |
| 308 | associated permutation in INDICES. */ |
| 309 | |
| 310 | static bool |
| 311 | interleave_supported_p (vec_perm_indices *indices, tree vectype, |
| 312 | unsigned int offset) |
| 313 | { |
| 314 | poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (node: vectype); |
| 315 | poly_uint64 base = exact_div (a: nelts, b: 2) * offset; |
| 316 | vec_perm_builder sel (nelts, 2, 3); |
| 317 | for (unsigned int i = 0; i < 3; ++i) |
| 318 | { |
| 319 | sel.quick_push (obj: base + i); |
| 320 | sel.quick_push (obj: base + i + nelts); |
| 321 | } |
| 322 | indices->new_vector (sel, 2, nelts); |
| 323 | return can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype), |
| 324 | *indices); |
| 325 | } |
| 326 | |
| 327 | /* Try to use permutes to define the masks in DEST_RGM using the masks |
| 328 | in SRC_RGM, given that the former has twice as many masks as the |
| 329 | latter. Return true on success, adding any new statements to SEQ. */ |
| 330 | |
| 331 | static bool |
| 332 | vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm, |
| 333 | rgroup_controls *src_rgm) |
| 334 | { |
| 335 | tree src_masktype = src_rgm->type; |
| 336 | tree dest_masktype = dest_rgm->type; |
| 337 | machine_mode src_mode = TYPE_MODE (src_masktype); |
| 338 | insn_code icode1, icode2; |
| 339 | if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter |
| 340 | && (icode1 = optab_handler (op: vec_unpacku_hi_optab, |
| 341 | mode: src_mode)) != CODE_FOR_nothing |
| 342 | && (icode2 = optab_handler (op: vec_unpacku_lo_optab, |
| 343 | mode: src_mode)) != CODE_FOR_nothing) |
| 344 | { |
| 345 | /* Unpacking the source masks gives at least as many mask bits as |
| 346 | we need. We can then VIEW_CONVERT any excess bits away. */ |
| 347 | machine_mode dest_mode = insn_data[icode1].operand[0].mode; |
| 348 | gcc_assert (dest_mode == insn_data[icode2].operand[0].mode); |
| 349 | tree unpack_masktype = vect_halve_mask_nunits (src_masktype, dest_mode); |
| 350 | for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i) |
| 351 | { |
| 352 | tree src = src_rgm->controls[i / 2]; |
| 353 | tree dest = dest_rgm->controls[i]; |
| 354 | tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1) |
| 355 | ? VEC_UNPACK_HI_EXPR |
| 356 | : VEC_UNPACK_LO_EXPR); |
| 357 | gassign *stmt; |
| 358 | if (dest_masktype == unpack_masktype) |
| 359 | stmt = gimple_build_assign (dest, code, src); |
| 360 | else |
| 361 | { |
| 362 | tree temp = make_ssa_name (var: unpack_masktype); |
| 363 | stmt = gimple_build_assign (temp, code, src); |
| 364 | gimple_seq_add_stmt (seq, stmt); |
| 365 | stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR, |
| 366 | build1 (VIEW_CONVERT_EXPR, |
| 367 | dest_masktype, temp)); |
| 368 | } |
| 369 | gimple_seq_add_stmt (seq, stmt); |
| 370 | } |
| 371 | return true; |
| 372 | } |
| 373 | vec_perm_indices indices[2]; |
| 374 | if (dest_masktype == src_masktype |
| 375 | && interleave_supported_p (indices: &indices[0], vectype: src_masktype, offset: 0) |
| 376 | && interleave_supported_p (indices: &indices[1], vectype: src_masktype, offset: 1)) |
| 377 | { |
| 378 | /* The destination requires twice as many mask bits as the source, so |
| 379 | we can use interleaving permutes to double up the number of bits. */ |
| 380 | tree masks[2]; |
| 381 | for (unsigned int i = 0; i < 2; ++i) |
| 382 | masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]); |
| 383 | for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i) |
| 384 | { |
| 385 | tree src = src_rgm->controls[i / 2]; |
| 386 | tree dest = dest_rgm->controls[i]; |
| 387 | gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR, |
| 388 | src, src, masks[i & 1]); |
| 389 | gimple_seq_add_stmt (seq, stmt); |
| 390 | } |
| 391 | return true; |
| 392 | } |
| 393 | return false; |
| 394 | } |
| 395 | |
| 396 | /* Populate DEST_RGM->controls, given that they should add up to STEP. |
| 397 | |
| 398 | STEP = MIN_EXPR <ivtmp_34, VF>; |
| 399 | |
| 400 | First length (MIN (X, VF/N)): |
| 401 | loop_len_15 = MIN_EXPR <STEP, VF/N>; |
| 402 | |
| 403 | Second length: |
| 404 | tmp = STEP - loop_len_15; |
| 405 | loop_len_16 = MIN (tmp, VF/N); |
| 406 | |
| 407 | Third length: |
| 408 | tmp2 = tmp - loop_len_16; |
| 409 | loop_len_17 = MIN (tmp2, VF/N); |
| 410 | |
| 411 | Last length: |
| 412 | loop_len_18 = tmp2 - loop_len_17; |
| 413 | */ |
| 414 | |
| 415 | static void |
| 416 | vect_adjust_loop_lens_control (tree iv_type, gimple_seq *seq, |
| 417 | rgroup_controls *dest_rgm, tree step) |
| 418 | { |
| 419 | tree ctrl_type = dest_rgm->type; |
| 420 | poly_uint64 nitems_per_ctrl |
| 421 | = TYPE_VECTOR_SUBPARTS (node: ctrl_type) * dest_rgm->factor; |
| 422 | tree length_limit = build_int_cst (iv_type, nitems_per_ctrl); |
| 423 | |
| 424 | for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i) |
| 425 | { |
| 426 | tree ctrl = dest_rgm->controls[i]; |
| 427 | if (i == 0) |
| 428 | { |
| 429 | /* First iteration: MIN (X, VF/N) capped to the range [0, VF/N]. */ |
| 430 | gassign *assign |
| 431 | = gimple_build_assign (ctrl, MIN_EXPR, step, length_limit); |
| 432 | gimple_seq_add_stmt (seq, assign); |
| 433 | } |
| 434 | else if (i == dest_rgm->controls.length () - 1) |
| 435 | { |
| 436 | /* Last iteration: Remain capped to the range [0, VF/N]. */ |
| 437 | gassign *assign = gimple_build_assign (ctrl, MINUS_EXPR, step, |
| 438 | dest_rgm->controls[i - 1]); |
| 439 | gimple_seq_add_stmt (seq, assign); |
| 440 | } |
| 441 | else |
| 442 | { |
| 443 | /* (MIN (remain, VF*I/N)) capped to the range [0, VF/N]. */ |
| 444 | step = gimple_build (seq, code: MINUS_EXPR, type: iv_type, ops: step, |
| 445 | ops: dest_rgm->controls[i - 1]); |
| 446 | gassign *assign |
| 447 | = gimple_build_assign (ctrl, MIN_EXPR, step, length_limit); |
| 448 | gimple_seq_add_stmt (seq, assign); |
| 449 | } |
| 450 | } |
| 451 | } |
| 452 | |
| 453 | /* Stores the standard position for induction variable increment in belonging to |
| 454 | LOOP_EXIT (just before the exit condition of the given exit to BSI. |
| 455 | INSERT_AFTER is set to true if the increment should be inserted after |
| 456 | *BSI. */ |
| 457 | |
| 458 | void |
| 459 | vect_iv_increment_position (edge loop_exit, gimple_stmt_iterator *bsi, |
| 460 | bool *insert_after) |
| 461 | { |
| 462 | basic_block bb = loop_exit->src; |
| 463 | *bsi = gsi_last_bb (bb); |
| 464 | *insert_after = false; |
| 465 | } |
| 466 | |
| 467 | /* Helper for vect_set_loop_condition_partial_vectors. Generate definitions |
| 468 | for all the rgroup controls in RGC and return a control that is nonzero |
| 469 | when the loop needs to iterate. Add any new preheader statements to |
| 470 | PREHEADER_SEQ. Use LOOP_COND_GSI to insert code before the exit gcond. |
| 471 | |
| 472 | RGC belongs to loop LOOP. The loop originally iterated NITERS |
| 473 | times and has been vectorized according to LOOP_VINFO. |
| 474 | |
| 475 | If NITERS_SKIP is nonnull, the first iteration of the vectorized loop |
| 476 | starts with NITERS_SKIP dummy iterations of the scalar loop before |
| 477 | the real work starts. The mask elements for these dummy iterations |
| 478 | must be 0, to ensure that the extra iterations do not have an effect. |
| 479 | |
| 480 | It is known that: |
| 481 | |
| 482 | NITERS * RGC->max_nscalars_per_iter * RGC->factor |
| 483 | |
| 484 | does not overflow. However, MIGHT_WRAP_P says whether an induction |
| 485 | variable that starts at 0 and has step: |
| 486 | |
| 487 | VF * RGC->max_nscalars_per_iter * RGC->factor |
| 488 | |
| 489 | might overflow before hitting a value above: |
| 490 | |
| 491 | (NITERS + NITERS_SKIP) * RGC->max_nscalars_per_iter * RGC->factor |
| 492 | |
| 493 | This means that we cannot guarantee that such an induction variable |
| 494 | would ever hit a value that produces a set of all-false masks or zero |
| 495 | lengths for RGC. |
| 496 | |
| 497 | Note: the cost of the code generated by this function is modeled |
| 498 | by vect_estimate_min_profitable_iters, so changes here may need |
| 499 | corresponding changes there. */ |
| 500 | |
| 501 | static tree |
| 502 | vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo, |
| 503 | gimple_seq *, |
| 504 | gimple_seq *, |
| 505 | gimple_stmt_iterator loop_cond_gsi, |
| 506 | rgroup_controls *rgc, tree niters, |
| 507 | tree niters_skip, bool might_wrap_p, |
| 508 | tree *iv_step, tree *compare_step) |
| 509 | { |
| 510 | tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo); |
| 511 | tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo); |
| 512 | bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo); |
| 513 | |
| 514 | tree ctrl_type = rgc->type; |
| 515 | unsigned int nitems_per_iter = rgc->max_nscalars_per_iter * rgc->factor; |
| 516 | poly_uint64 nitems_per_ctrl = TYPE_VECTOR_SUBPARTS (node: ctrl_type) * rgc->factor; |
| 517 | poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); |
| 518 | tree length_limit = NULL_TREE; |
| 519 | /* For length, we need length_limit to ensure length in range. */ |
| 520 | if (!use_masks_p) |
| 521 | length_limit = build_int_cst (compare_type, nitems_per_ctrl); |
| 522 | |
| 523 | /* Calculate the maximum number of item values that the rgroup |
| 524 | handles in total, the number that it handles for each iteration |
| 525 | of the vector loop, and the number that it should skip during the |
| 526 | first iteration of the vector loop. */ |
| 527 | tree nitems_total = niters; |
| 528 | tree nitems_step = build_int_cst (iv_type, vf); |
| 529 | tree nitems_skip = niters_skip; |
| 530 | if (nitems_per_iter != 1) |
| 531 | { |
| 532 | /* We checked before setting LOOP_VINFO_USING_PARTIAL_VECTORS_P that |
| 533 | these multiplications don't overflow. */ |
| 534 | tree compare_factor = build_int_cst (compare_type, nitems_per_iter); |
| 535 | tree iv_factor = build_int_cst (iv_type, nitems_per_iter); |
| 536 | nitems_total = gimple_build (seq: preheader_seq, code: MULT_EXPR, type: compare_type, |
| 537 | ops: nitems_total, ops: compare_factor); |
| 538 | nitems_step = gimple_build (seq: preheader_seq, code: MULT_EXPR, type: iv_type, |
| 539 | ops: nitems_step, ops: iv_factor); |
| 540 | if (nitems_skip) |
| 541 | nitems_skip = gimple_build (seq: preheader_seq, code: MULT_EXPR, type: compare_type, |
| 542 | ops: nitems_skip, ops: compare_factor); |
| 543 | } |
| 544 | |
| 545 | /* Create an induction variable that counts the number of items |
| 546 | processed. */ |
| 547 | tree index_before_incr, index_after_incr; |
| 548 | gimple_stmt_iterator incr_gsi; |
| 549 | bool insert_after; |
| 550 | edge exit_e = LOOP_VINFO_MAIN_EXIT (loop_vinfo); |
| 551 | vect_iv_increment_position (loop_exit: exit_e, bsi: &incr_gsi, insert_after: &insert_after); |
| 552 | if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)) |
| 553 | { |
| 554 | /* Create an IV that counts down from niters_total and whose step |
| 555 | is the (variable) amount processed in the current iteration: |
| 556 | ... |
| 557 | _10 = (unsigned long) count_12(D); |
| 558 | ... |
| 559 | # ivtmp_9 = PHI <ivtmp_35(6), _10(5)> |
| 560 | _36 = (MIN_EXPR | SELECT_VL) <ivtmp_9, POLY_INT_CST [4, 4]>; |
| 561 | ... |
| 562 | vect__4.8_28 = .LEN_LOAD (_17, 32B, _36, 0); |
| 563 | ... |
| 564 | ivtmp_35 = ivtmp_9 - POLY_INT_CST [4, 4]; |
| 565 | ... |
| 566 | if (ivtmp_9 > POLY_INT_CST [4, 4]) |
| 567 | goto <bb 4>; [83.33%] |
| 568 | else |
| 569 | goto <bb 5>; [16.67%] |
| 570 | */ |
| 571 | nitems_total = gimple_convert (seq: preheader_seq, type: iv_type, op: nitems_total); |
| 572 | tree step = rgc->controls.length () == 1 ? rgc->controls[0] |
| 573 | : make_ssa_name (var: iv_type); |
| 574 | /* Create decrement IV. */ |
| 575 | if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo)) |
| 576 | { |
| 577 | create_iv (nitems_total, MINUS_EXPR, step, NULL_TREE, loop, &incr_gsi, |
| 578 | insert_after, &index_before_incr, &index_after_incr); |
| 579 | tree vectype = build_zero_cst (rgc->type); |
| 580 | tree len = gimple_build (seq: header_seq, code: IFN_SELECT_VL, type: iv_type, |
| 581 | ops: index_before_incr, ops: nitems_step, |
| 582 | ops: vectype); |
| 583 | gimple_seq_add_stmt (header_seq, gimple_build_assign (step, len)); |
| 584 | } |
| 585 | else |
| 586 | { |
| 587 | create_iv (nitems_total, MINUS_EXPR, nitems_step, NULL_TREE, loop, |
| 588 | &incr_gsi, insert_after, &index_before_incr, |
| 589 | &index_after_incr); |
| 590 | gimple_seq_add_stmt (header_seq, |
| 591 | gimple_build_assign (step, MIN_EXPR, |
| 592 | index_before_incr, |
| 593 | nitems_step)); |
| 594 | } |
| 595 | *iv_step = step; |
| 596 | *compare_step = nitems_step; |
| 597 | return LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) ? index_after_incr |
| 598 | : index_before_incr; |
| 599 | } |
| 600 | |
| 601 | /* Create increment IV. */ |
| 602 | create_iv (build_int_cst (iv_type, 0), PLUS_EXPR, nitems_step, NULL_TREE, |
| 603 | loop, &incr_gsi, insert_after, &index_before_incr, |
| 604 | &index_after_incr); |
| 605 | |
| 606 | tree zero_index = build_int_cst (compare_type, 0); |
| 607 | tree test_index, test_limit, first_limit; |
| 608 | gimple_stmt_iterator *test_gsi; |
| 609 | if (might_wrap_p) |
| 610 | { |
| 611 | /* In principle the loop should stop iterating once the incremented |
| 612 | IV reaches a value greater than or equal to: |
| 613 | |
| 614 | NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP |
| 615 | |
| 616 | However, there's no guarantee that this addition doesn't overflow |
| 617 | the comparison type, or that the IV hits a value above it before |
| 618 | wrapping around. We therefore adjust the limit down by one |
| 619 | IV step: |
| 620 | |
| 621 | (NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP) |
| 622 | -[infinite-prec] NITEMS_STEP |
| 623 | |
| 624 | and compare the IV against this limit _before_ incrementing it. |
| 625 | Since the comparison type is unsigned, we actually want the |
| 626 | subtraction to saturate at zero: |
| 627 | |
| 628 | (NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP) |
| 629 | -[sat] NITEMS_STEP |
| 630 | |
| 631 | And since NITEMS_SKIP < NITEMS_STEP, we can reassociate this as: |
| 632 | |
| 633 | NITEMS_TOTAL -[sat] (NITEMS_STEP - NITEMS_SKIP) |
| 634 | |
| 635 | where the rightmost subtraction can be done directly in |
| 636 | COMPARE_TYPE. */ |
| 637 | test_index = index_before_incr; |
| 638 | tree adjust = gimple_convert (seq: preheader_seq, type: compare_type, |
| 639 | op: nitems_step); |
| 640 | if (nitems_skip) |
| 641 | adjust = gimple_build (seq: preheader_seq, code: MINUS_EXPR, type: compare_type, |
| 642 | ops: adjust, ops: nitems_skip); |
| 643 | test_limit = gimple_build (seq: preheader_seq, code: MAX_EXPR, type: compare_type, |
| 644 | ops: nitems_total, ops: adjust); |
| 645 | test_limit = gimple_build (seq: preheader_seq, code: MINUS_EXPR, type: compare_type, |
| 646 | ops: test_limit, ops: adjust); |
| 647 | test_gsi = &incr_gsi; |
| 648 | |
| 649 | /* Get a safe limit for the first iteration. */ |
| 650 | if (nitems_skip) |
| 651 | { |
| 652 | /* The first vector iteration can handle at most NITEMS_STEP |
| 653 | items. NITEMS_STEP <= CONST_LIMIT, and adding |
| 654 | NITEMS_SKIP to that cannot overflow. */ |
| 655 | tree const_limit = build_int_cst (compare_type, |
| 656 | LOOP_VINFO_VECT_FACTOR (loop_vinfo) |
| 657 | * nitems_per_iter); |
| 658 | first_limit = gimple_build (seq: preheader_seq, code: MIN_EXPR, type: compare_type, |
| 659 | ops: nitems_total, ops: const_limit); |
| 660 | first_limit = gimple_build (seq: preheader_seq, code: PLUS_EXPR, type: compare_type, |
| 661 | ops: first_limit, ops: nitems_skip); |
| 662 | } |
| 663 | else |
| 664 | /* For the first iteration it doesn't matter whether the IV hits |
| 665 | a value above NITEMS_TOTAL. That only matters for the latch |
| 666 | condition. */ |
| 667 | first_limit = nitems_total; |
| 668 | } |
| 669 | else |
| 670 | { |
| 671 | /* Test the incremented IV, which will always hit a value above |
| 672 | the bound before wrapping. */ |
| 673 | test_index = index_after_incr; |
| 674 | test_limit = nitems_total; |
| 675 | if (nitems_skip) |
| 676 | test_limit = gimple_build (seq: preheader_seq, code: PLUS_EXPR, type: compare_type, |
| 677 | ops: test_limit, ops: nitems_skip); |
| 678 | test_gsi = &loop_cond_gsi; |
| 679 | |
| 680 | first_limit = test_limit; |
| 681 | } |
| 682 | |
| 683 | /* Convert the IV value to the comparison type (either a no-op or |
| 684 | a demotion). */ |
| 685 | gimple_seq test_seq = NULL; |
| 686 | test_index = gimple_convert (seq: &test_seq, type: compare_type, op: test_index); |
| 687 | gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT); |
| 688 | |
| 689 | /* Provide a definition of each control in the group. */ |
| 690 | tree next_ctrl = NULL_TREE; |
| 691 | tree ctrl; |
| 692 | unsigned int i; |
| 693 | FOR_EACH_VEC_ELT_REVERSE (rgc->controls, i, ctrl) |
| 694 | { |
| 695 | /* Previous controls will cover BIAS items. This control covers the |
| 696 | next batch. */ |
| 697 | poly_uint64 bias = nitems_per_ctrl * i; |
| 698 | tree bias_tree = build_int_cst (compare_type, bias); |
| 699 | |
| 700 | /* See whether the first iteration of the vector loop is known |
| 701 | to have a full control. */ |
| 702 | poly_uint64 const_limit; |
| 703 | bool first_iteration_full |
| 704 | = (poly_int_tree_p (t: first_limit, value: &const_limit) |
| 705 | && known_ge (const_limit, (i + 1) * nitems_per_ctrl)); |
| 706 | |
| 707 | /* Rather than have a new IV that starts at BIAS and goes up to |
| 708 | TEST_LIMIT, prefer to use the same 0-based IV for each control |
| 709 | and adjust the bound down by BIAS. */ |
| 710 | tree this_test_limit = test_limit; |
| 711 | if (i != 0) |
| 712 | { |
| 713 | this_test_limit = gimple_build (seq: preheader_seq, code: MAX_EXPR, |
| 714 | type: compare_type, ops: this_test_limit, |
| 715 | ops: bias_tree); |
| 716 | this_test_limit = gimple_build (seq: preheader_seq, code: MINUS_EXPR, |
| 717 | type: compare_type, ops: this_test_limit, |
| 718 | ops: bias_tree); |
| 719 | } |
| 720 | |
| 721 | /* Create the initial control. First include all items that |
| 722 | are within the loop limit. */ |
| 723 | tree init_ctrl = NULL_TREE; |
| 724 | if (!first_iteration_full) |
| 725 | { |
| 726 | tree start, end; |
| 727 | if (first_limit == test_limit) |
| 728 | { |
| 729 | /* Use a natural test between zero (the initial IV value) |
| 730 | and the loop limit. The "else" block would be valid too, |
| 731 | but this choice can avoid the need to load BIAS_TREE into |
| 732 | a register. */ |
| 733 | start = zero_index; |
| 734 | end = this_test_limit; |
| 735 | } |
| 736 | else |
| 737 | { |
| 738 | /* FIRST_LIMIT is the maximum number of items handled by the |
| 739 | first iteration of the vector loop. Test the portion |
| 740 | associated with this control. */ |
| 741 | start = bias_tree; |
| 742 | end = first_limit; |
| 743 | } |
| 744 | |
| 745 | if (use_masks_p) |
| 746 | init_ctrl = vect_gen_while (preheader_seq, ctrl_type, |
| 747 | start, end, "max_mask" ); |
| 748 | else |
| 749 | { |
| 750 | init_ctrl = make_temp_ssa_name (type: compare_type, NULL, name: "max_len" ); |
| 751 | gimple_seq seq = vect_gen_len (init_ctrl, start, |
| 752 | end, length_limit); |
| 753 | gimple_seq_add_seq (preheader_seq, seq); |
| 754 | } |
| 755 | } |
| 756 | |
| 757 | /* Now AND out the bits that are within the number of skipped |
| 758 | items. */ |
| 759 | poly_uint64 const_skip; |
| 760 | if (nitems_skip |
| 761 | && !(poly_int_tree_p (t: nitems_skip, value: &const_skip) |
| 762 | && known_le (const_skip, bias))) |
| 763 | { |
| 764 | gcc_assert (use_masks_p); |
| 765 | tree unskipped_mask = vect_gen_while_not (preheader_seq, ctrl_type, |
| 766 | bias_tree, nitems_skip); |
| 767 | if (init_ctrl) |
| 768 | init_ctrl = gimple_build (seq: preheader_seq, code: BIT_AND_EXPR, type: ctrl_type, |
| 769 | ops: init_ctrl, ops: unskipped_mask); |
| 770 | else |
| 771 | init_ctrl = unskipped_mask; |
| 772 | } |
| 773 | |
| 774 | if (!init_ctrl) |
| 775 | { |
| 776 | /* First iteration is full. */ |
| 777 | if (use_masks_p) |
| 778 | init_ctrl = build_minus_one_cst (ctrl_type); |
| 779 | else |
| 780 | init_ctrl = length_limit; |
| 781 | } |
| 782 | |
| 783 | /* Get the control value for the next iteration of the loop. */ |
| 784 | if (use_masks_p) |
| 785 | { |
| 786 | gimple_seq stmts = NULL; |
| 787 | next_ctrl = vect_gen_while (&stmts, ctrl_type, test_index, |
| 788 | this_test_limit, "next_mask" ); |
| 789 | gsi_insert_seq_before (test_gsi, stmts, GSI_SAME_STMT); |
| 790 | } |
| 791 | else |
| 792 | { |
| 793 | next_ctrl = make_temp_ssa_name (type: compare_type, NULL, name: "next_len" ); |
| 794 | gimple_seq seq = vect_gen_len (next_ctrl, test_index, this_test_limit, |
| 795 | length_limit); |
| 796 | gsi_insert_seq_before (test_gsi, seq, GSI_SAME_STMT); |
| 797 | } |
| 798 | |
| 799 | vect_set_loop_control (loop, ctrl, init_ctrl, next_ctrl); |
| 800 | } |
| 801 | |
| 802 | int partial_load_bias = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo); |
| 803 | if (partial_load_bias != 0) |
| 804 | { |
| 805 | tree adjusted_len = rgc->bias_adjusted_ctrl; |
| 806 | gassign *minus = gimple_build_assign (adjusted_len, PLUS_EXPR, |
| 807 | rgc->controls[0], |
| 808 | build_int_cst |
| 809 | (TREE_TYPE (rgc->controls[0]), |
| 810 | partial_load_bias)); |
| 811 | gimple_seq_add_stmt (header_seq, minus); |
| 812 | } |
| 813 | |
| 814 | return next_ctrl; |
| 815 | } |
| 816 | |
| 817 | /* Set up the iteration condition and rgroup controls for LOOP, given |
| 818 | that LOOP_VINFO_USING_PARTIAL_VECTORS_P is true for the vectorized |
| 819 | loop. LOOP_VINFO describes the vectorization of LOOP. NITERS is |
| 820 | the number of iterations of the original scalar loop that should be |
| 821 | handled by the vector loop. NITERS_MAYBE_ZERO and FINAL_IV are as |
| 822 | for vect_set_loop_condition. |
| 823 | |
| 824 | Insert the branch-back condition before LOOP_COND_GSI and return the |
| 825 | final gcond. */ |
| 826 | |
| 827 | static gcond * |
| 828 | vect_set_loop_condition_partial_vectors (class loop *loop, edge exit_edge, |
| 829 | loop_vec_info loop_vinfo, tree niters, |
| 830 | tree final_iv, bool niters_maybe_zero, |
| 831 | gimple_stmt_iterator loop_cond_gsi) |
| 832 | { |
| 833 | gimple_seq = NULL; |
| 834 | gimple_seq = NULL; |
| 835 | |
| 836 | bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo); |
| 837 | tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo); |
| 838 | unsigned int compare_precision = TYPE_PRECISION (compare_type); |
| 839 | tree orig_niters = niters; |
| 840 | |
| 841 | /* Type of the initial value of NITERS. */ |
| 842 | tree ni_actual_type = TREE_TYPE (niters); |
| 843 | unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type); |
| 844 | tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo); |
| 845 | if (niters_skip) |
| 846 | niters_skip = gimple_convert (seq: &preheader_seq, type: compare_type, op: niters_skip); |
| 847 | |
| 848 | /* Convert NITERS to the same size as the compare. */ |
| 849 | if (compare_precision > ni_actual_precision |
| 850 | && niters_maybe_zero) |
| 851 | { |
| 852 | /* We know that there is always at least one iteration, so if the |
| 853 | count is zero then it must have wrapped. Cope with this by |
| 854 | subtracting 1 before the conversion and adding 1 to the result. */ |
| 855 | gcc_assert (TYPE_UNSIGNED (ni_actual_type)); |
| 856 | niters = gimple_build (seq: &preheader_seq, code: PLUS_EXPR, type: ni_actual_type, |
| 857 | ops: niters, ops: build_minus_one_cst (ni_actual_type)); |
| 858 | niters = gimple_convert (seq: &preheader_seq, type: compare_type, op: niters); |
| 859 | niters = gimple_build (seq: &preheader_seq, code: PLUS_EXPR, type: compare_type, |
| 860 | ops: niters, ops: build_one_cst (compare_type)); |
| 861 | } |
| 862 | else |
| 863 | niters = gimple_convert (seq: &preheader_seq, type: compare_type, op: niters); |
| 864 | |
| 865 | /* Iterate over all the rgroups and fill in their controls. We could use |
| 866 | the first control from any rgroup for the loop condition; here we |
| 867 | arbitrarily pick the last. */ |
| 868 | tree test_ctrl = NULL_TREE; |
| 869 | tree iv_step = NULL_TREE; |
| 870 | tree compare_step = NULL_TREE; |
| 871 | rgroup_controls *rgc; |
| 872 | rgroup_controls *iv_rgc = nullptr; |
| 873 | unsigned int i; |
| 874 | auto_vec<rgroup_controls> *controls = use_masks_p |
| 875 | ? &LOOP_VINFO_MASKS (loop_vinfo).rgc_vec |
| 876 | : &LOOP_VINFO_LENS (loop_vinfo); |
| 877 | FOR_EACH_VEC_ELT (*controls, i, rgc) |
| 878 | if (!rgc->controls.is_empty ()) |
| 879 | { |
| 880 | /* First try using permutes. This adds a single vector |
| 881 | instruction to the loop for each mask, but needs no extra |
| 882 | loop invariants or IVs. */ |
| 883 | unsigned int nmasks = i + 1; |
| 884 | if (use_masks_p && (nmasks & 1) == 0) |
| 885 | { |
| 886 | rgroup_controls *half_rgc = &(*controls)[nmasks / 2 - 1]; |
| 887 | if (!half_rgc->controls.is_empty () |
| 888 | && vect_maybe_permute_loop_masks (seq: &header_seq, dest_rgm: rgc, src_rgm: half_rgc)) |
| 889 | continue; |
| 890 | } |
| 891 | |
| 892 | if (!LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) |
| 893 | || !iv_rgc |
| 894 | || (iv_rgc->max_nscalars_per_iter * iv_rgc->factor |
| 895 | != rgc->max_nscalars_per_iter * rgc->factor)) |
| 896 | { |
| 897 | /* See whether zero-based IV would ever generate all-false masks |
| 898 | or zero length before wrapping around. */ |
| 899 | bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc); |
| 900 | |
| 901 | /* Set up all controls for this group. */ |
| 902 | test_ctrl |
| 903 | = vect_set_loop_controls_directly (loop, loop_vinfo, |
| 904 | preheader_seq: &preheader_seq, header_seq: &header_seq, |
| 905 | loop_cond_gsi, rgc, niters, |
| 906 | niters_skip, might_wrap_p, |
| 907 | iv_step: &iv_step, compare_step: &compare_step); |
| 908 | |
| 909 | iv_rgc = rgc; |
| 910 | } |
| 911 | |
| 912 | if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) |
| 913 | && rgc->controls.length () > 1) |
| 914 | { |
| 915 | /* vect_set_loop_controls_directly creates an IV whose step |
| 916 | is equal to the expected sum of RGC->controls. Use that |
| 917 | information to populate RGC->controls. */ |
| 918 | tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo); |
| 919 | gcc_assert (iv_step); |
| 920 | vect_adjust_loop_lens_control (iv_type, seq: &header_seq, dest_rgm: rgc, step: iv_step); |
| 921 | } |
| 922 | } |
| 923 | |
| 924 | /* Emit all accumulated statements. */ |
| 925 | add_preheader_seq (loop, seq: preheader_seq); |
| 926 | add_header_seq (loop, seq: header_seq); |
| 927 | |
| 928 | /* Get a boolean result that tells us whether to iterate. */ |
| 929 | gcond *cond_stmt; |
| 930 | if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) |
| 931 | && !LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo)) |
| 932 | { |
| 933 | gcc_assert (compare_step); |
| 934 | tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? LE_EXPR : GT_EXPR; |
| 935 | cond_stmt = gimple_build_cond (code, test_ctrl, compare_step, NULL_TREE, |
| 936 | NULL_TREE); |
| 937 | } |
| 938 | else |
| 939 | { |
| 940 | tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR; |
| 941 | tree zero_ctrl = build_zero_cst (TREE_TYPE (test_ctrl)); |
| 942 | cond_stmt |
| 943 | = gimple_build_cond (code, test_ctrl, zero_ctrl, NULL_TREE, NULL_TREE); |
| 944 | } |
| 945 | gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT); |
| 946 | |
| 947 | /* The loop iterates (NITERS - 1) / VF + 1 times. |
| 948 | Subtract one from this to get the latch count. */ |
| 949 | tree step = build_int_cst (compare_type, |
| 950 | LOOP_VINFO_VECT_FACTOR (loop_vinfo)); |
| 951 | tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters, |
| 952 | build_minus_one_cst (compare_type)); |
| 953 | loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type, |
| 954 | niters_minus_one, step); |
| 955 | |
| 956 | if (final_iv) |
| 957 | { |
| 958 | gassign *assign; |
| 959 | /* If vectorizing an inverted early break loop we have to restart the |
| 960 | scalar loop at niters - vf. This matches what we do in |
| 961 | vect_gen_vector_loop_niters_mult_vf for non-masked loops. */ |
| 962 | if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo)) |
| 963 | { |
| 964 | tree ftype = TREE_TYPE (orig_niters); |
| 965 | tree vf = build_int_cst (ftype, LOOP_VINFO_VECT_FACTOR (loop_vinfo)); |
| 966 | assign = gimple_build_assign (final_iv, MINUS_EXPR, orig_niters, vf); |
| 967 | } |
| 968 | else |
| 969 | assign = gimple_build_assign (final_iv, orig_niters); |
| 970 | gsi_insert_on_edge_immediate (exit_edge, assign); |
| 971 | } |
| 972 | |
| 973 | return cond_stmt; |
| 974 | } |
| 975 | |
| 976 | /* Set up the iteration condition and rgroup controls for LOOP in AVX512 |
| 977 | style, given that LOOP_VINFO_USING_PARTIAL_VECTORS_P is true for the |
| 978 | vectorized loop. LOOP_VINFO describes the vectorization of LOOP. NITERS is |
| 979 | the number of iterations of the original scalar loop that should be |
| 980 | handled by the vector loop. NITERS_MAYBE_ZERO and FINAL_IV are as |
| 981 | for vect_set_loop_condition. |
| 982 | |
| 983 | Insert the branch-back condition before LOOP_COND_GSI and return the |
| 984 | final gcond. */ |
| 985 | |
| 986 | static gcond * |
| 987 | vect_set_loop_condition_partial_vectors_avx512 (class loop *loop, |
| 988 | edge exit_edge, |
| 989 | loop_vec_info loop_vinfo, tree niters, |
| 990 | tree final_iv, |
| 991 | bool niters_maybe_zero, |
| 992 | gimple_stmt_iterator loop_cond_gsi) |
| 993 | { |
| 994 | tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo); |
| 995 | tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo); |
| 996 | poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); |
| 997 | tree orig_niters = niters; |
| 998 | gimple_seq = NULL; |
| 999 | |
| 1000 | /* Create an IV that counts down from niters and whose step |
| 1001 | is the number of iterations processed in the current iteration. |
| 1002 | Produce the controls with compares like the following. |
| 1003 | |
| 1004 | # iv_2 = PHI <niters, iv_3> |
| 1005 | rem_4 = MIN <iv_2, VF>; |
| 1006 | remv_6 = { rem_4, rem_4, rem_4, ... } |
| 1007 | mask_5 = { 0, 0, 1, 1, 2, 2, ... } < remv6; |
| 1008 | iv_3 = iv_2 - VF; |
| 1009 | if (iv_2 > VF) |
| 1010 | continue; |
| 1011 | |
| 1012 | Where the constant is built with elements at most VF - 1 and |
| 1013 | repetitions according to max_nscalars_per_iter which is guarnateed |
| 1014 | to be the same within a group. */ |
| 1015 | |
| 1016 | /* Convert NITERS to the determined IV type. */ |
| 1017 | if (TYPE_PRECISION (iv_type) > TYPE_PRECISION (TREE_TYPE (niters)) |
| 1018 | && niters_maybe_zero) |
| 1019 | { |
| 1020 | /* We know that there is always at least one iteration, so if the |
| 1021 | count is zero then it must have wrapped. Cope with this by |
| 1022 | subtracting 1 before the conversion and adding 1 to the result. */ |
| 1023 | gcc_assert (TYPE_UNSIGNED (TREE_TYPE (niters))); |
| 1024 | niters = gimple_build (seq: &preheader_seq, code: PLUS_EXPR, TREE_TYPE (niters), |
| 1025 | ops: niters, ops: build_minus_one_cst (TREE_TYPE (niters))); |
| 1026 | niters = gimple_convert (seq: &preheader_seq, type: iv_type, op: niters); |
| 1027 | niters = gimple_build (seq: &preheader_seq, code: PLUS_EXPR, type: iv_type, |
| 1028 | ops: niters, ops: build_one_cst (iv_type)); |
| 1029 | } |
| 1030 | else |
| 1031 | niters = gimple_convert (seq: &preheader_seq, type: iv_type, op: niters); |
| 1032 | |
| 1033 | /* Bias the initial value of the IV in case we need to skip iterations |
| 1034 | at the beginning. */ |
| 1035 | tree niters_adj = niters; |
| 1036 | if (niters_skip) |
| 1037 | { |
| 1038 | tree skip = gimple_convert (seq: &preheader_seq, type: iv_type, op: niters_skip); |
| 1039 | niters_adj = gimple_build (seq: &preheader_seq, code: PLUS_EXPR, |
| 1040 | type: iv_type, ops: niters, ops: skip); |
| 1041 | } |
| 1042 | |
| 1043 | /* The iteration step is the vectorization factor. */ |
| 1044 | tree iv_step = build_int_cst (iv_type, vf); |
| 1045 | |
| 1046 | /* Create the decrement IV. */ |
| 1047 | tree index_before_incr, index_after_incr; |
| 1048 | gimple_stmt_iterator incr_gsi; |
| 1049 | bool insert_after; |
| 1050 | vect_iv_increment_position (loop_exit: exit_edge, bsi: &incr_gsi, insert_after: &insert_after); |
| 1051 | create_iv (niters_adj, MINUS_EXPR, iv_step, NULL_TREE, loop, |
| 1052 | &incr_gsi, insert_after, &index_before_incr, |
| 1053 | &index_after_incr); |
| 1054 | |
| 1055 | /* Iterate over all the rgroups and fill in their controls. */ |
| 1056 | for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec) |
| 1057 | { |
| 1058 | if (rgc.controls.is_empty ()) |
| 1059 | continue; |
| 1060 | |
| 1061 | tree ctrl_type = rgc.type; |
| 1062 | poly_uint64 nitems_per_ctrl = TYPE_VECTOR_SUBPARTS (node: ctrl_type); |
| 1063 | |
| 1064 | tree vectype = rgc.compare_type; |
| 1065 | |
| 1066 | /* index_after_incr is the IV specifying the remaining iterations in |
| 1067 | the next iteration. */ |
| 1068 | tree rem = index_after_incr; |
| 1069 | /* When the data type for the compare to produce the mask is |
| 1070 | smaller than the IV type we need to saturate. Saturate to |
| 1071 | the smallest possible value (IV_TYPE) so we only have to |
| 1072 | saturate once (CSE will catch redundant ones we add). */ |
| 1073 | if (TYPE_PRECISION (TREE_TYPE (vectype)) < TYPE_PRECISION (iv_type)) |
| 1074 | rem = gimple_build (&incr_gsi, false, GSI_CONTINUE_LINKING, |
| 1075 | UNKNOWN_LOCATION, |
| 1076 | MIN_EXPR, TREE_TYPE (rem), rem, iv_step); |
| 1077 | rem = gimple_convert (&incr_gsi, false, GSI_CONTINUE_LINKING, |
| 1078 | UNKNOWN_LOCATION, TREE_TYPE (vectype), rem); |
| 1079 | |
| 1080 | /* Build a data vector composed of the remaining iterations. */ |
| 1081 | rem = gimple_build_vector_from_val (&incr_gsi, false, GSI_CONTINUE_LINKING, |
| 1082 | UNKNOWN_LOCATION, vectype, rem); |
| 1083 | |
| 1084 | /* Provide a definition of each vector in the control group. */ |
| 1085 | tree next_ctrl = NULL_TREE; |
| 1086 | tree first_rem = NULL_TREE; |
| 1087 | tree ctrl; |
| 1088 | unsigned int i; |
| 1089 | FOR_EACH_VEC_ELT_REVERSE (rgc.controls, i, ctrl) |
| 1090 | { |
| 1091 | /* Previous controls will cover BIAS items. This control covers the |
| 1092 | next batch. */ |
| 1093 | poly_uint64 bias = nitems_per_ctrl * i; |
| 1094 | |
| 1095 | /* Build the constant to compare the remaining iters against, |
| 1096 | this is sth like { 0, 0, 1, 1, 2, 2, 3, 3, ... } appropriately |
| 1097 | split into pieces. */ |
| 1098 | unsigned n = TYPE_VECTOR_SUBPARTS (node: ctrl_type).to_constant (); |
| 1099 | tree_vector_builder builder (vectype, n, 1); |
| 1100 | for (unsigned i = 0; i < n; ++i) |
| 1101 | { |
| 1102 | unsigned HOST_WIDE_INT val |
| 1103 | = (i + bias.to_constant ()) / rgc.max_nscalars_per_iter; |
| 1104 | gcc_assert (val < vf.to_constant ()); |
| 1105 | builder.quick_push (obj: build_int_cst (TREE_TYPE (vectype), val)); |
| 1106 | } |
| 1107 | tree cmp_series = builder.build (); |
| 1108 | |
| 1109 | /* Create the initial control. First include all items that |
| 1110 | are within the loop limit. */ |
| 1111 | tree init_ctrl = NULL_TREE; |
| 1112 | poly_uint64 const_limit; |
| 1113 | /* See whether the first iteration of the vector loop is known |
| 1114 | to have a full control. */ |
| 1115 | if (poly_int_tree_p (t: niters, value: &const_limit) |
| 1116 | && known_ge (const_limit, (i + 1) * nitems_per_ctrl)) |
| 1117 | init_ctrl = build_minus_one_cst (ctrl_type); |
| 1118 | else |
| 1119 | { |
| 1120 | /* The remaining work items initially are niters. Saturate, |
| 1121 | splat and compare. */ |
| 1122 | if (!first_rem) |
| 1123 | { |
| 1124 | first_rem = niters; |
| 1125 | if (TYPE_PRECISION (TREE_TYPE (vectype)) |
| 1126 | < TYPE_PRECISION (iv_type)) |
| 1127 | first_rem = gimple_build (seq: &preheader_seq, |
| 1128 | code: MIN_EXPR, TREE_TYPE (first_rem), |
| 1129 | ops: first_rem, ops: iv_step); |
| 1130 | first_rem = gimple_convert (seq: &preheader_seq, TREE_TYPE (vectype), |
| 1131 | op: first_rem); |
| 1132 | first_rem = gimple_build_vector_from_val (seq: &preheader_seq, |
| 1133 | type: vectype, op: first_rem); |
| 1134 | } |
| 1135 | init_ctrl = gimple_build (seq: &preheader_seq, code: LT_EXPR, type: ctrl_type, |
| 1136 | ops: cmp_series, ops: first_rem); |
| 1137 | } |
| 1138 | |
| 1139 | /* Now AND out the bits that are within the number of skipped |
| 1140 | items. */ |
| 1141 | poly_uint64 const_skip; |
| 1142 | if (niters_skip |
| 1143 | && !(poly_int_tree_p (t: niters_skip, value: &const_skip) |
| 1144 | && known_le (const_skip, bias))) |
| 1145 | { |
| 1146 | /* For integer mode masks it's cheaper to shift out the bits |
| 1147 | since that avoids loading a constant. */ |
| 1148 | gcc_assert (GET_MODE_CLASS (TYPE_MODE (ctrl_type)) == MODE_INT); |
| 1149 | init_ctrl = gimple_build (seq: &preheader_seq, code: VIEW_CONVERT_EXPR, |
| 1150 | type: lang_hooks.types.type_for_mode |
| 1151 | (TYPE_MODE (ctrl_type), 1), |
| 1152 | ops: init_ctrl); |
| 1153 | /* ??? But when the shift amount isn't constant this requires |
| 1154 | a round-trip to GRPs. We could apply the bias to either |
| 1155 | side of the compare instead. */ |
| 1156 | tree shift = gimple_build (seq: &preheader_seq, code: MINUS_EXPR, |
| 1157 | TREE_TYPE (niters_skip), ops: niters_skip, |
| 1158 | ops: build_int_cst (TREE_TYPE (niters_skip), |
| 1159 | bias)); |
| 1160 | shift = gimple_build (seq: &preheader_seq, code: MULT_EXPR, |
| 1161 | TREE_TYPE (niters_skip), ops: shift, |
| 1162 | ops: build_int_cst (TREE_TYPE (niters_skip), |
| 1163 | rgc.max_nscalars_per_iter)); |
| 1164 | init_ctrl = gimple_build (seq: &preheader_seq, code: LSHIFT_EXPR, |
| 1165 | TREE_TYPE (init_ctrl), |
| 1166 | ops: init_ctrl, ops: shift); |
| 1167 | init_ctrl = gimple_build (seq: &preheader_seq, code: VIEW_CONVERT_EXPR, |
| 1168 | type: ctrl_type, ops: init_ctrl); |
| 1169 | } |
| 1170 | |
| 1171 | /* Get the control value for the next iteration of the loop. */ |
| 1172 | next_ctrl = gimple_build (&incr_gsi, false, GSI_CONTINUE_LINKING, |
| 1173 | UNKNOWN_LOCATION, |
| 1174 | LT_EXPR, ctrl_type, cmp_series, rem); |
| 1175 | |
| 1176 | vect_set_loop_control (loop, ctrl, init_ctrl, next_ctrl); |
| 1177 | } |
| 1178 | } |
| 1179 | |
| 1180 | /* Emit all accumulated statements. */ |
| 1181 | add_preheader_seq (loop, seq: preheader_seq); |
| 1182 | |
| 1183 | /* Adjust the exit test using the decrementing IV. */ |
| 1184 | tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? LE_EXPR : GT_EXPR; |
| 1185 | /* When we peel for alignment with niter_skip != 0 this can |
| 1186 | cause niter + niter_skip to wrap and since we are comparing the |
| 1187 | value before the decrement here we get a false early exit. |
| 1188 | We can't compare the value after decrement either because that |
| 1189 | decrement could wrap as well as we're not doing a saturating |
| 1190 | decrement. To avoid this situation we force a larger |
| 1191 | iv_type. */ |
| 1192 | gcond *cond_stmt = gimple_build_cond (code, index_before_incr, iv_step, |
| 1193 | NULL_TREE, NULL_TREE); |
| 1194 | gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT); |
| 1195 | |
| 1196 | /* The loop iterates (NITERS - 1 + NITERS_SKIP) / VF + 1 times. |
| 1197 | Subtract one from this to get the latch count. */ |
| 1198 | tree niters_minus_one |
| 1199 | = fold_build2 (PLUS_EXPR, TREE_TYPE (orig_niters), orig_niters, |
| 1200 | build_minus_one_cst (TREE_TYPE (orig_niters))); |
| 1201 | tree niters_adj2 = fold_convert (iv_type, niters_minus_one); |
| 1202 | if (niters_skip) |
| 1203 | niters_adj2 = fold_build2 (PLUS_EXPR, iv_type, niters_minus_one, |
| 1204 | fold_convert (iv_type, niters_skip)); |
| 1205 | loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, iv_type, |
| 1206 | niters_adj2, iv_step); |
| 1207 | |
| 1208 | if (final_iv) |
| 1209 | { |
| 1210 | gassign *assign; |
| 1211 | /* If vectorizing an inverted early break loop we have to restart the |
| 1212 | scalar loop at niters - vf. This matches what we do in |
| 1213 | vect_gen_vector_loop_niters_mult_vf for non-masked loops. */ |
| 1214 | if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo)) |
| 1215 | { |
| 1216 | tree ftype = TREE_TYPE (orig_niters); |
| 1217 | tree vf = build_int_cst (ftype, LOOP_VINFO_VECT_FACTOR (loop_vinfo)); |
| 1218 | assign = gimple_build_assign (final_iv, MINUS_EXPR, orig_niters, vf); |
| 1219 | } |
| 1220 | else |
| 1221 | assign = gimple_build_assign (final_iv, orig_niters); |
| 1222 | gsi_insert_on_edge_immediate (exit_edge, assign); |
| 1223 | } |
| 1224 | |
| 1225 | return cond_stmt; |
| 1226 | } |
| 1227 | |
| 1228 | |
| 1229 | /* Like vect_set_loop_condition, but handle the case in which the vector |
| 1230 | loop handles exactly VF scalars per iteration. */ |
| 1231 | |
| 1232 | static gcond * |
| 1233 | vect_set_loop_condition_normal (loop_vec_info /* loop_vinfo */, edge exit_edge, |
| 1234 | class loop *loop, tree niters, tree step, |
| 1235 | tree final_iv, bool niters_maybe_zero, |
| 1236 | gimple_stmt_iterator loop_cond_gsi) |
| 1237 | { |
| 1238 | tree indx_before_incr, indx_after_incr; |
| 1239 | gcond *cond_stmt; |
| 1240 | gcond *orig_cond; |
| 1241 | edge pe = loop_preheader_edge (loop); |
| 1242 | gimple_stmt_iterator incr_gsi; |
| 1243 | bool insert_after; |
| 1244 | enum tree_code code; |
| 1245 | tree niters_type = TREE_TYPE (niters); |
| 1246 | |
| 1247 | orig_cond = get_loop_exit_condition (exit_edge); |
| 1248 | gcc_assert (orig_cond); |
| 1249 | loop_cond_gsi = gsi_for_stmt (orig_cond); |
| 1250 | |
| 1251 | tree init, limit; |
| 1252 | if (!niters_maybe_zero && integer_onep (step)) |
| 1253 | { |
| 1254 | /* In this case we can use a simple 0-based IV: |
| 1255 | |
| 1256 | A: |
| 1257 | x = 0; |
| 1258 | do |
| 1259 | { |
| 1260 | ... |
| 1261 | x += 1; |
| 1262 | } |
| 1263 | while (x < NITERS); */ |
| 1264 | code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR; |
| 1265 | init = build_zero_cst (niters_type); |
| 1266 | limit = niters; |
| 1267 | } |
| 1268 | else |
| 1269 | { |
| 1270 | /* The following works for all values of NITERS except 0: |
| 1271 | |
| 1272 | B: |
| 1273 | x = 0; |
| 1274 | do |
| 1275 | { |
| 1276 | ... |
| 1277 | x += STEP; |
| 1278 | } |
| 1279 | while (x <= NITERS - STEP); |
| 1280 | |
| 1281 | so that the loop continues to iterate if x + STEP - 1 < NITERS |
| 1282 | but stops if x + STEP - 1 >= NITERS. |
| 1283 | |
| 1284 | However, if NITERS is zero, x never hits a value above NITERS - STEP |
| 1285 | before wrapping around. There are two obvious ways of dealing with |
| 1286 | this: |
| 1287 | |
| 1288 | - start at STEP - 1 and compare x before incrementing it |
| 1289 | - start at -1 and compare x after incrementing it |
| 1290 | |
| 1291 | The latter is simpler and is what we use. The loop in this case |
| 1292 | looks like: |
| 1293 | |
| 1294 | C: |
| 1295 | x = -1; |
| 1296 | do |
| 1297 | { |
| 1298 | ... |
| 1299 | x += STEP; |
| 1300 | } |
| 1301 | while (x < NITERS - STEP); |
| 1302 | |
| 1303 | In both cases the loop limit is NITERS - STEP. */ |
| 1304 | gimple_seq seq = NULL; |
| 1305 | limit = force_gimple_operand (niters, &seq, true, NULL_TREE); |
| 1306 | limit = gimple_build (seq: &seq, code: MINUS_EXPR, TREE_TYPE (limit), ops: limit, ops: step); |
| 1307 | if (seq) |
| 1308 | { |
| 1309 | basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq); |
| 1310 | gcc_assert (!new_bb); |
| 1311 | } |
| 1312 | if (niters_maybe_zero) |
| 1313 | { |
| 1314 | /* Case C. */ |
| 1315 | code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR; |
| 1316 | init = build_all_ones_cst (niters_type); |
| 1317 | } |
| 1318 | else |
| 1319 | { |
| 1320 | /* Case B. */ |
| 1321 | code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR; |
| 1322 | init = build_zero_cst (niters_type); |
| 1323 | } |
| 1324 | } |
| 1325 | |
| 1326 | vect_iv_increment_position (loop_exit: exit_edge, bsi: &incr_gsi, insert_after: &insert_after); |
| 1327 | create_iv (init, PLUS_EXPR, step, NULL_TREE, loop, |
| 1328 | &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr); |
| 1329 | indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr, |
| 1330 | true, NULL_TREE, true, |
| 1331 | GSI_SAME_STMT); |
| 1332 | limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE, |
| 1333 | true, GSI_SAME_STMT); |
| 1334 | |
| 1335 | cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE, |
| 1336 | NULL_TREE); |
| 1337 | |
| 1338 | gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT); |
| 1339 | |
| 1340 | /* Record the number of latch iterations. */ |
| 1341 | if (limit == niters) |
| 1342 | /* Case A: the loop iterates NITERS times. Subtract one to get the |
| 1343 | latch count. */ |
| 1344 | loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters, |
| 1345 | build_int_cst (niters_type, 1)); |
| 1346 | else |
| 1347 | /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times. |
| 1348 | Subtract one from this to get the latch count. */ |
| 1349 | loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type, |
| 1350 | limit, step); |
| 1351 | |
| 1352 | if (final_iv) |
| 1353 | { |
| 1354 | gassign *assign; |
| 1355 | gcc_assert (single_pred_p (exit_edge->dest)); |
| 1356 | tree phi_dest |
| 1357 | = integer_zerop (init) ? final_iv : copy_ssa_name (var: indx_after_incr); |
| 1358 | /* Make sure to maintain LC SSA form here and elide the subtraction |
| 1359 | if the value is zero. */ |
| 1360 | gphi *phi = create_phi_node (phi_dest, exit_edge->dest); |
| 1361 | add_phi_arg (phi, indx_after_incr, exit_edge, UNKNOWN_LOCATION); |
| 1362 | if (!integer_zerop (init)) |
| 1363 | { |
| 1364 | assign = gimple_build_assign (final_iv, MINUS_EXPR, |
| 1365 | phi_dest, init); |
| 1366 | gimple_stmt_iterator gsi = gsi_after_labels (bb: exit_edge->dest); |
| 1367 | gsi_insert_before (&gsi, assign, GSI_SAME_STMT); |
| 1368 | } |
| 1369 | } |
| 1370 | |
| 1371 | return cond_stmt; |
| 1372 | } |
| 1373 | |
| 1374 | /* If we're using fully-masked loops, make LOOP iterate: |
| 1375 | |
| 1376 | N == (NITERS - 1) / STEP + 1 |
| 1377 | |
| 1378 | times. When NITERS is zero, this is equivalent to making the loop |
| 1379 | execute (1 << M) / STEP times, where M is the precision of NITERS. |
| 1380 | NITERS_MAYBE_ZERO is true if this last case might occur. |
| 1381 | |
| 1382 | If we're not using fully-masked loops, make LOOP iterate: |
| 1383 | |
| 1384 | N == (NITERS - STEP) / STEP + 1 |
| 1385 | |
| 1386 | times, where NITERS is known to be outside the range [1, STEP - 1]. |
| 1387 | This is equivalent to making the loop execute NITERS / STEP times |
| 1388 | when NITERS is nonzero and (1 << M) / STEP times otherwise. |
| 1389 | NITERS_MAYBE_ZERO again indicates whether this last case might occur. |
| 1390 | |
| 1391 | If FINAL_IV is nonnull, it is an SSA name that should be set to |
| 1392 | N * STEP on exit from the loop. |
| 1393 | |
| 1394 | Assumption: the exit-condition of LOOP is the last stmt in the loop. */ |
| 1395 | |
| 1396 | void |
| 1397 | vect_set_loop_condition (class loop *loop, edge loop_e, loop_vec_info loop_vinfo, |
| 1398 | tree niters, tree step, tree final_iv, |
| 1399 | bool niters_maybe_zero) |
| 1400 | { |
| 1401 | gcond *cond_stmt; |
| 1402 | gcond *orig_cond = get_loop_exit_condition (loop_e); |
| 1403 | gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond); |
| 1404 | |
| 1405 | if (loop_vinfo && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)) |
| 1406 | { |
| 1407 | if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_avx512) |
| 1408 | cond_stmt = vect_set_loop_condition_partial_vectors_avx512 (loop, exit_edge: loop_e, |
| 1409 | loop_vinfo, |
| 1410 | niters, final_iv, |
| 1411 | niters_maybe_zero, |
| 1412 | loop_cond_gsi); |
| 1413 | else |
| 1414 | cond_stmt = vect_set_loop_condition_partial_vectors (loop, exit_edge: loop_e, |
| 1415 | loop_vinfo, |
| 1416 | niters, final_iv, |
| 1417 | niters_maybe_zero, |
| 1418 | loop_cond_gsi); |
| 1419 | } |
| 1420 | else |
| 1421 | cond_stmt = vect_set_loop_condition_normal (loop_vinfo, exit_edge: loop_e, loop, |
| 1422 | niters, |
| 1423 | step, final_iv, |
| 1424 | niters_maybe_zero, |
| 1425 | loop_cond_gsi); |
| 1426 | |
| 1427 | /* Remove old loop exit test. */ |
| 1428 | stmt_vec_info orig_cond_info; |
| 1429 | if (loop_vinfo |
| 1430 | && (orig_cond_info = loop_vinfo->lookup_stmt (orig_cond))) |
| 1431 | loop_vinfo->remove_stmt (orig_cond_info); |
| 1432 | else |
| 1433 | gsi_remove (&loop_cond_gsi, true); |
| 1434 | |
| 1435 | if (dump_enabled_p ()) |
| 1436 | dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: %G" , |
| 1437 | (gimple *) cond_stmt); |
| 1438 | } |
| 1439 | |
| 1440 | /* Get the virtual operand live on E. The precondition on this is valid |
| 1441 | immediate dominators and an actual virtual definition dominating E. */ |
| 1442 | /* ??? Costly band-aid. For the use in question we can populate a |
| 1443 | live-on-exit/end-of-BB virtual operand when copying stmts. */ |
| 1444 | |
| 1445 | static tree |
| 1446 | get_live_virtual_operand_on_edge (edge e) |
| 1447 | { |
| 1448 | basic_block bb = e->src; |
| 1449 | do |
| 1450 | { |
| 1451 | for (auto gsi = gsi_last_bb (bb); !gsi_end_p (i: gsi); gsi_prev (i: &gsi)) |
| 1452 | { |
| 1453 | gimple *stmt = gsi_stmt (i: gsi); |
| 1454 | if (gimple_vdef (g: stmt)) |
| 1455 | return gimple_vdef (g: stmt); |
| 1456 | if (gimple_vuse (g: stmt)) |
| 1457 | return gimple_vuse (g: stmt); |
| 1458 | } |
| 1459 | if (gphi *vphi = get_virtual_phi (bb)) |
| 1460 | return gimple_phi_result (gs: vphi); |
| 1461 | bb = get_immediate_dominator (CDI_DOMINATORS, bb); |
| 1462 | } |
| 1463 | while (1); |
| 1464 | } |
| 1465 | |
| 1466 | /* Given LOOP this function generates a new copy of it and puts it |
| 1467 | on E which is either the entry or exit of LOOP. If SCALAR_LOOP is |
| 1468 | non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the |
| 1469 | basic blocks from SCALAR_LOOP instead of LOOP, but to either the |
| 1470 | entry or exit of LOOP. If FLOW_LOOPS then connect LOOP to SCALAR_LOOP as a |
| 1471 | continuation. This is correct for cases where one loop continues from the |
| 1472 | other like in the vectorizer, but not true for uses in e.g. loop distribution |
| 1473 | where the contents of the loop body are split but the iteration space of both |
| 1474 | copies remains the same. |
| 1475 | |
| 1476 | If UPDATED_DOMS is not NULL it is update with the list of basic blocks whoms |
| 1477 | dominators were updated during the peeling. When doing early break vectorization |
| 1478 | then LOOP_VINFO needs to be provided and is used to keep track of any newly created |
| 1479 | memory references that need to be updated should we decide to vectorize. */ |
| 1480 | |
| 1481 | class loop * |
| 1482 | slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit, |
| 1483 | class loop *scalar_loop, |
| 1484 | edge scalar_exit, edge e, edge *new_e, |
| 1485 | bool flow_loops, |
| 1486 | vec<basic_block> *updated_doms, |
| 1487 | bool uncounted_p, bool create_main_e) |
| 1488 | { |
| 1489 | class loop *new_loop; |
| 1490 | basic_block *new_bbs, *bbs, *pbbs; |
| 1491 | bool at_exit; |
| 1492 | bool was_imm_dom; |
| 1493 | basic_block exit_dest; |
| 1494 | edge exit, new_exit; |
| 1495 | bool duplicate_outer_loop = false; |
| 1496 | |
| 1497 | exit = loop_exit; |
| 1498 | at_exit = (e == exit); |
| 1499 | if (!at_exit && e != loop_preheader_edge (loop)) |
| 1500 | return NULL; |
| 1501 | |
| 1502 | if (scalar_loop == NULL) |
| 1503 | { |
| 1504 | scalar_loop = loop; |
| 1505 | scalar_exit = loop_exit; |
| 1506 | } |
| 1507 | else if (scalar_loop == loop) |
| 1508 | scalar_exit = loop_exit; |
| 1509 | else |
| 1510 | { |
| 1511 | /* Loop has been version, match exits up using the aux index. */ |
| 1512 | for (edge exit : get_loop_exit_edges (scalar_loop)) |
| 1513 | if (exit->aux == loop_exit->aux) |
| 1514 | { |
| 1515 | scalar_exit = exit; |
| 1516 | break; |
| 1517 | } |
| 1518 | |
| 1519 | gcc_assert (scalar_exit); |
| 1520 | } |
| 1521 | |
| 1522 | bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1); |
| 1523 | pbbs = bbs + 1; |
| 1524 | get_loop_body_with_size (scalar_loop, pbbs, scalar_loop->num_nodes); |
| 1525 | /* Allow duplication of outer loops. */ |
| 1526 | if (scalar_loop->inner) |
| 1527 | duplicate_outer_loop = true; |
| 1528 | |
| 1529 | /* Generate new loop structure. */ |
| 1530 | new_loop = duplicate_loop (scalar_loop, loop_outer (loop: scalar_loop)); |
| 1531 | duplicate_subloops (scalar_loop, new_loop); |
| 1532 | |
| 1533 | exit_dest = exit->dest; |
| 1534 | was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS, |
| 1535 | exit_dest) == exit->src ? |
| 1536 | true : false); |
| 1537 | |
| 1538 | /* Also copy the pre-header, this avoids jumping through hoops to |
| 1539 | duplicate the loop entry PHI arguments. Create an empty |
| 1540 | pre-header unconditionally for this. */ |
| 1541 | basic_block = split_edge (loop_preheader_edge (scalar_loop)); |
| 1542 | edge entry_e = single_pred_edge (bb: preheader); |
| 1543 | bbs[0] = preheader; |
| 1544 | new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1); |
| 1545 | |
| 1546 | copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs, |
| 1547 | &scalar_exit, 1, &new_exit, NULL, |
| 1548 | at_exit ? loop->latch : e->src, true); |
| 1549 | exit = loop_exit; |
| 1550 | basic_block = new_bbs[0]; |
| 1551 | |
| 1552 | gcc_assert (new_exit); |
| 1553 | |
| 1554 | /* Record the new loop exit information. new_loop doesn't have SCEV data and |
| 1555 | so we must initialize the exit information. */ |
| 1556 | if (new_e) |
| 1557 | *new_e = new_exit; |
| 1558 | |
| 1559 | /* Before installing PHI arguments make sure that the edges |
| 1560 | into them match that of the scalar loop we analyzed. This |
| 1561 | makes sure the SLP tree matches up between the main vectorized |
| 1562 | loop and the epilogue vectorized copies. */ |
| 1563 | if (single_succ_edge (bb: preheader)->dest_idx |
| 1564 | != single_succ_edge (bb: new_bbs[0])->dest_idx) |
| 1565 | { |
| 1566 | basic_block swap_bb = new_bbs[1]; |
| 1567 | gcc_assert (EDGE_COUNT (swap_bb->preds) == 2); |
| 1568 | std::swap (EDGE_PRED (swap_bb, 0), EDGE_PRED (swap_bb, 1)); |
| 1569 | EDGE_PRED (swap_bb, 0)->dest_idx = 0; |
| 1570 | EDGE_PRED (swap_bb, 1)->dest_idx = 1; |
| 1571 | } |
| 1572 | if (duplicate_outer_loop) |
| 1573 | { |
| 1574 | class loop *new_inner_loop = get_loop_copy (scalar_loop->inner); |
| 1575 | if (loop_preheader_edge (scalar_loop)->dest_idx |
| 1576 | != loop_preheader_edge (new_inner_loop)->dest_idx) |
| 1577 | { |
| 1578 | basic_block swap_bb = new_inner_loop->header; |
| 1579 | gcc_assert (EDGE_COUNT (swap_bb->preds) == 2); |
| 1580 | std::swap (EDGE_PRED (swap_bb, 0), EDGE_PRED (swap_bb, 1)); |
| 1581 | EDGE_PRED (swap_bb, 0)->dest_idx = 0; |
| 1582 | EDGE_PRED (swap_bb, 1)->dest_idx = 1; |
| 1583 | } |
| 1584 | } |
| 1585 | |
| 1586 | add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL); |
| 1587 | |
| 1588 | /* Skip new preheader since it's deleted if copy loop is added at entry. */ |
| 1589 | for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++) |
| 1590 | rename_variables_in_bb (bb: new_bbs[i], rename_from_outer_loop: duplicate_outer_loop); |
| 1591 | |
| 1592 | /* Rename the exit uses. */ |
| 1593 | for (edge exit : get_loop_exit_edges (new_loop)) |
| 1594 | for (auto gsi = gsi_start_phis (exit->dest); |
| 1595 | !gsi_end_p (i: gsi); gsi_next (i: &gsi)) |
| 1596 | { |
| 1597 | tree orig_def = PHI_ARG_DEF_FROM_EDGE (gsi.phi (), exit); |
| 1598 | rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), exit)); |
| 1599 | if (MAY_HAVE_DEBUG_BIND_STMTS) |
| 1600 | adjust_debug_stmts (from: orig_def, PHI_RESULT (gsi.phi ()), bb: exit->dest); |
| 1601 | } |
| 1602 | |
| 1603 | auto loop_exits = get_loop_exit_edges (loop); |
| 1604 | bool multiple_exits_p = loop_exits.length () > 1; |
| 1605 | auto_vec<basic_block> doms; |
| 1606 | |
| 1607 | if (at_exit) /* Add the loop copy at exit. */ |
| 1608 | { |
| 1609 | if (scalar_loop != loop && new_exit->dest != exit_dest) |
| 1610 | { |
| 1611 | new_exit = redirect_edge_and_branch (new_exit, exit_dest); |
| 1612 | flush_pending_stmts (new_exit); |
| 1613 | } |
| 1614 | |
| 1615 | bool need_virtual_phi = get_virtual_phi (loop->header); |
| 1616 | |
| 1617 | /* For the main loop exit preserve the LC PHI nodes. For vectorization |
| 1618 | we need them to continue or finalize reductions. Since we do not |
| 1619 | copy the loop exit blocks we have to materialize PHIs at the |
| 1620 | new destination before redirecting edges. */ |
| 1621 | for (auto gsi_from = gsi_start_phis (loop_exit->dest); |
| 1622 | !gsi_end_p (i: gsi_from); gsi_next (i: &gsi_from)) |
| 1623 | { |
| 1624 | tree res = gimple_phi_result (gs: *gsi_from); |
| 1625 | create_phi_node (copy_ssa_name (var: res), new_preheader); |
| 1626 | } |
| 1627 | edge e = redirect_edge_and_branch (loop_exit, new_preheader); |
| 1628 | gcc_assert (e == loop_exit); |
| 1629 | flush_pending_stmts (loop_exit); |
| 1630 | set_immediate_dominator (CDI_DOMINATORS, new_preheader, loop_exit->src); |
| 1631 | |
| 1632 | /* If we ended up choosing an exit leading to a path not using memory |
| 1633 | we can end up without a virtual LC PHI. Create it when it is |
| 1634 | needed because of the epilog loop continuation. */ |
| 1635 | if (need_virtual_phi && !get_virtual_phi (loop_exit->dest)) |
| 1636 | { |
| 1637 | tree = gimple_phi_result (gs: get_virtual_phi (loop->header)); |
| 1638 | gphi *vphi = create_phi_node (copy_ssa_name (var: header_def), |
| 1639 | new_preheader); |
| 1640 | add_phi_arg (vphi, get_live_virtual_operand_on_edge (e: loop_exit), |
| 1641 | loop_exit, UNKNOWN_LOCATION); |
| 1642 | } |
| 1643 | |
| 1644 | bool multiple_exits_p = loop_exits.length () > 1; |
| 1645 | basic_block main_loop_exit_block = new_preheader; |
| 1646 | basic_block alt_loop_exit_block = NULL; |
| 1647 | /* Create the CFG for multiple exits. |
| 1648 | | loop_exit | alt1 | altN |
| 1649 | v v ... v |
| 1650 | main_loop_exit_block: alt_loop_exit_block: |
| 1651 | | / |
| 1652 | v v |
| 1653 | new_preheader: |
| 1654 | where in the new preheader we need merge PHIs for |
| 1655 | the continuation values into the epilogue header. |
| 1656 | Do not bother with exit PHIs for the early exits but |
| 1657 | their live virtual operand. We'll fix up things below. */ |
| 1658 | if (multiple_exits_p || uncounted_p) |
| 1659 | { |
| 1660 | edge loop_e = single_succ_edge (bb: new_preheader); |
| 1661 | new_preheader = split_edge (loop_e); |
| 1662 | |
| 1663 | gphi *vphi = NULL; |
| 1664 | alt_loop_exit_block = new_preheader; |
| 1665 | for (auto exit : loop_exits) |
| 1666 | if (exit != loop_exit) |
| 1667 | { |
| 1668 | tree vphi_def = NULL_TREE; |
| 1669 | if (gphi *evphi = get_virtual_phi (exit->dest)) |
| 1670 | vphi_def = gimple_phi_arg_def_from_edge (gs: evphi, e: exit); |
| 1671 | edge res = redirect_edge_and_branch (exit, alt_loop_exit_block); |
| 1672 | gcc_assert (res == exit); |
| 1673 | redirect_edge_var_map_clear (exit); |
| 1674 | if (alt_loop_exit_block == new_preheader) |
| 1675 | alt_loop_exit_block = split_edge (exit); |
| 1676 | if (!need_virtual_phi) |
| 1677 | continue; |
| 1678 | /* When the edge has no virtual LC PHI get at the live |
| 1679 | virtual operand by other means. */ |
| 1680 | if (!vphi_def) |
| 1681 | vphi_def = get_live_virtual_operand_on_edge (e: exit); |
| 1682 | if (!vphi) |
| 1683 | vphi = create_phi_node (copy_ssa_name (var: vphi_def), |
| 1684 | alt_loop_exit_block); |
| 1685 | else |
| 1686 | /* Edge redirection might re-allocate the PHI node |
| 1687 | so we have to rediscover it. */ |
| 1688 | vphi = get_virtual_phi (alt_loop_exit_block); |
| 1689 | add_phi_arg (vphi, vphi_def, exit, UNKNOWN_LOCATION); |
| 1690 | } |
| 1691 | |
| 1692 | set_immediate_dominator (CDI_DOMINATORS, new_preheader, |
| 1693 | loop->header); |
| 1694 | |
| 1695 | /* Fix up the profile counts of the new exit blocks. |
| 1696 | main_loop_exit_block was created by duplicating the |
| 1697 | preheader, so needs its count scaling according to the main |
| 1698 | exit edge's probability. The remaining count from the |
| 1699 | preheader goes to the alt_loop_exit_block, since all |
| 1700 | alternative exits have been redirected there. */ |
| 1701 | main_loop_exit_block->count = loop_exit->count (); |
| 1702 | alt_loop_exit_block->count |
| 1703 | = preheader->count - main_loop_exit_block->count; |
| 1704 | } |
| 1705 | |
| 1706 | /* Adjust the epilog loop PHI entry values to continue iteration. |
| 1707 | This adds remaining necessary LC PHI nodes to the main exit |
| 1708 | and creates merge PHIs when we have multiple exits with |
| 1709 | their appropriate continuation. */ |
| 1710 | if (flow_loops) |
| 1711 | { |
| 1712 | edge loop_entry = single_succ_edge (bb: new_preheader); |
| 1713 | bool peeled_iters = (uncounted_p |
| 1714 | || single_pred (bb: loop->latch) != loop_exit->src); |
| 1715 | |
| 1716 | /* Record the new SSA names in the cache so that we can skip |
| 1717 | materializing them again when we fill in the rest of the LC SSA |
| 1718 | variables. */ |
| 1719 | hash_map <tree, tree> new_phi_args; |
| 1720 | for (auto psi = gsi_start_phis (main_loop_exit_block); |
| 1721 | !gsi_end_p (i: psi); gsi_next (i: &psi)) |
| 1722 | { |
| 1723 | gphi *phi = *psi; |
| 1724 | tree new_arg = gimple_phi_arg_def_from_edge (gs: phi, e: loop_exit); |
| 1725 | if (TREE_CODE (new_arg) != SSA_NAME) |
| 1726 | continue; |
| 1727 | |
| 1728 | /* If the loop doesn't have a virtual def then only possibly keep |
| 1729 | the epilog LC PHI for it and avoid creating new defs. */ |
| 1730 | if (virtual_operand_p (op: new_arg) && !need_virtual_phi) |
| 1731 | { |
| 1732 | auto gsi = gsi_for_stmt (phi); |
| 1733 | remove_phi_node (&gsi, true); |
| 1734 | continue; |
| 1735 | } |
| 1736 | |
| 1737 | /* If we decided not to remove the PHI node we should also not |
| 1738 | rematerialize it later on. */ |
| 1739 | new_phi_args.put (k: new_arg, v: gimple_phi_result (gs: phi)); |
| 1740 | } |
| 1741 | |
| 1742 | /* Create the merge PHI nodes in new_preheader and populate the |
| 1743 | arguments for the exits. */ |
| 1744 | if (multiple_exits_p || uncounted_p) |
| 1745 | { |
| 1746 | for (auto gsi_from = gsi_start_phis (loop->header), |
| 1747 | gsi_to = gsi_start_phis (new_loop->header); |
| 1748 | !gsi_end_p (i: gsi_from) && !gsi_end_p (i: gsi_to); |
| 1749 | gsi_next (i: &gsi_from), gsi_next (i: &gsi_to)) |
| 1750 | { |
| 1751 | gimple *from_phi = gsi_stmt (i: gsi_from); |
| 1752 | gimple *to_phi = gsi_stmt (i: gsi_to); |
| 1753 | |
| 1754 | /* When the vector loop is peeled then we need to use the |
| 1755 | value at start of the loop, otherwise the main loop exit |
| 1756 | should use the final iter value. */ |
| 1757 | tree new_arg; |
| 1758 | if (peeled_iters) |
| 1759 | new_arg = gimple_phi_result (gs: from_phi); |
| 1760 | else |
| 1761 | new_arg = PHI_ARG_DEF_FROM_EDGE (from_phi, |
| 1762 | loop_latch_edge (loop)); |
| 1763 | |
| 1764 | /* Check if we've already created a new phi node during edge |
| 1765 | redirection and re-use it if so. Otherwise create a |
| 1766 | LC PHI node to feed the merge PHI. */ |
| 1767 | tree *res; |
| 1768 | if (virtual_operand_p (op: new_arg)) |
| 1769 | { |
| 1770 | /* Use the existing virtual LC SSA from exit block. */ |
| 1771 | gphi *vphi = get_virtual_phi (main_loop_exit_block); |
| 1772 | new_arg = gimple_phi_result (gs: vphi); |
| 1773 | } |
| 1774 | else if ((res = new_phi_args.get (k: new_arg))) |
| 1775 | new_arg = *res; |
| 1776 | else |
| 1777 | { |
| 1778 | /* Create the LC PHI node for the exit. */ |
| 1779 | tree new_def = copy_ssa_name (var: new_arg); |
| 1780 | gphi *lc_phi |
| 1781 | = create_phi_node (new_def, main_loop_exit_block); |
| 1782 | SET_PHI_ARG_DEF (lc_phi, 0, new_arg); |
| 1783 | new_arg = new_def; |
| 1784 | } |
| 1785 | |
| 1786 | /* Create the PHI node in the merge block merging the |
| 1787 | main and early exit values. */ |
| 1788 | tree new_res = copy_ssa_name (var: gimple_phi_result (gs: from_phi)); |
| 1789 | gphi *lcssa_phi = create_phi_node (new_res, new_preheader); |
| 1790 | edge main_e = single_succ_edge (bb: main_loop_exit_block); |
| 1791 | SET_PHI_ARG_DEF_ON_EDGE (lcssa_phi, main_e, new_arg); |
| 1792 | |
| 1793 | /* And adjust the epilog entry value. */ |
| 1794 | adjust_phi_and_debug_stmts (update_phi: to_phi, e: loop_entry, new_def: new_res); |
| 1795 | } |
| 1796 | } |
| 1797 | |
| 1798 | if (multiple_exits_p) |
| 1799 | { |
| 1800 | /* After creating the merge PHIs handle the early exits those |
| 1801 | should use the values at the start of the loop. */ |
| 1802 | for (auto gsi_from = gsi_start_phis (loop->header), |
| 1803 | gsi_to = gsi_start_phis (new_preheader); |
| 1804 | !gsi_end_p (i: gsi_from) && !gsi_end_p (i: gsi_to); |
| 1805 | gsi_next (i: &gsi_from), gsi_next (i: &gsi_to)) |
| 1806 | { |
| 1807 | gimple *from_phi = gsi_stmt (i: gsi_from); |
| 1808 | gimple *to_phi = gsi_stmt (i: gsi_to); |
| 1809 | |
| 1810 | /* Now update the virtual PHI nodes with the right value. */ |
| 1811 | tree alt_arg = gimple_phi_result (gs: from_phi); |
| 1812 | if (virtual_operand_p (op: alt_arg)) |
| 1813 | { |
| 1814 | gphi *vphi = get_virtual_phi (alt_loop_exit_block); |
| 1815 | alt_arg = gimple_phi_result (gs: vphi); |
| 1816 | } |
| 1817 | /* For other live args we didn't create LC PHI nodes. |
| 1818 | Do so here. */ |
| 1819 | else |
| 1820 | { |
| 1821 | tree alt_def = copy_ssa_name (var: alt_arg); |
| 1822 | gphi *lc_phi |
| 1823 | = create_phi_node (alt_def, alt_loop_exit_block); |
| 1824 | for (unsigned i = 0; i < gimple_phi_num_args (gs: lc_phi); |
| 1825 | ++i) |
| 1826 | SET_PHI_ARG_DEF (lc_phi, i, alt_arg); |
| 1827 | alt_arg = alt_def; |
| 1828 | } |
| 1829 | edge alt_e = single_succ_edge (bb: alt_loop_exit_block); |
| 1830 | SET_PHI_ARG_DEF_ON_EDGE (to_phi, alt_e, alt_arg); |
| 1831 | } |
| 1832 | } |
| 1833 | /* For the single exit case only create the missing LC PHI nodes |
| 1834 | for the continuation of the loop IVs that are not also already |
| 1835 | reductions and thus had LC PHI nodes on the exit already. */ |
| 1836 | if (!multiple_exits_p && !uncounted_p) |
| 1837 | { |
| 1838 | for (auto gsi_from = gsi_start_phis (loop->header), |
| 1839 | gsi_to = gsi_start_phis (new_loop->header); |
| 1840 | !gsi_end_p (i: gsi_from) && !gsi_end_p (i: gsi_to); |
| 1841 | gsi_next (i: &gsi_from), gsi_next (i: &gsi_to)) |
| 1842 | { |
| 1843 | gimple *from_phi = gsi_stmt (i: gsi_from); |
| 1844 | gimple *to_phi = gsi_stmt (i: gsi_to); |
| 1845 | tree new_arg = PHI_ARG_DEF_FROM_EDGE (from_phi, |
| 1846 | loop_latch_edge (loop)); |
| 1847 | |
| 1848 | /* Check if we've already created a new phi node during edge |
| 1849 | redirection. If we have, only propagate the value |
| 1850 | downwards. */ |
| 1851 | if (tree *res = new_phi_args.get (k: new_arg)) |
| 1852 | { |
| 1853 | adjust_phi_and_debug_stmts (update_phi: to_phi, e: loop_entry, new_def: *res); |
| 1854 | continue; |
| 1855 | } |
| 1856 | |
| 1857 | tree new_res = copy_ssa_name (var: gimple_phi_result (gs: from_phi)); |
| 1858 | gphi *lcssa_phi = create_phi_node (new_res, new_preheader); |
| 1859 | SET_PHI_ARG_DEF_ON_EDGE (lcssa_phi, loop_exit, new_arg); |
| 1860 | adjust_phi_and_debug_stmts (update_phi: to_phi, e: loop_entry, new_def: new_res); |
| 1861 | } |
| 1862 | } |
| 1863 | } |
| 1864 | |
| 1865 | if (was_imm_dom || duplicate_outer_loop) |
| 1866 | set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src); |
| 1867 | |
| 1868 | /* And remove the non-necessary forwarder again. Keep the other |
| 1869 | one so we have a proper pre-header for the loop at the exit edge. */ |
| 1870 | redirect_edge_pred (single_succ_edge (bb: preheader), |
| 1871 | single_pred (bb: preheader)); |
| 1872 | delete_basic_block (preheader); |
| 1873 | set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header, |
| 1874 | loop_preheader_edge (scalar_loop)->src); |
| 1875 | |
| 1876 | /* Finally after wiring the new epilogue we need to update its main exit |
| 1877 | to the original function exit we recorded. Other exits are already |
| 1878 | correct. */ |
| 1879 | if (multiple_exits_p || uncounted_p) |
| 1880 | { |
| 1881 | class loop *update_loop = new_loop; |
| 1882 | doms = get_all_dominated_blocks (CDI_DOMINATORS, loop->header); |
| 1883 | for (unsigned i = 0; i < doms.length (); ++i) |
| 1884 | if (flow_bb_inside_loop_p (loop, doms[i])) |
| 1885 | doms.unordered_remove (ix: i); |
| 1886 | |
| 1887 | for (edge e : get_loop_exit_edges (update_loop)) |
| 1888 | { |
| 1889 | edge ex; |
| 1890 | edge_iterator ei; |
| 1891 | FOR_EACH_EDGE (ex, ei, e->dest->succs) |
| 1892 | { |
| 1893 | /* Find the first non-fallthrough block as fall-throughs can't |
| 1894 | dominate other blocks. */ |
| 1895 | if (single_succ_p (bb: ex->dest)) |
| 1896 | { |
| 1897 | doms.safe_push (obj: ex->dest); |
| 1898 | ex = single_succ_edge (bb: ex->dest); |
| 1899 | } |
| 1900 | doms.safe_push (obj: ex->dest); |
| 1901 | } |
| 1902 | doms.safe_push (obj: e->dest); |
| 1903 | } |
| 1904 | |
| 1905 | iterate_fix_dominators (CDI_DOMINATORS, doms, false); |
| 1906 | if (updated_doms) |
| 1907 | updated_doms->safe_splice (src: doms); |
| 1908 | } |
| 1909 | } |
| 1910 | else /* Add the copy at entry. */ |
| 1911 | { |
| 1912 | /* Copy the current loop LC PHI nodes between the original loop exit |
| 1913 | block and the new loop header. This allows us to later split the |
| 1914 | preheader block and still find the right LC nodes. */ |
| 1915 | if (flow_loops) |
| 1916 | for (auto gsi_from = gsi_start_phis (new_loop->header), |
| 1917 | gsi_to = gsi_start_phis (loop->header); |
| 1918 | !gsi_end_p (i: gsi_from) && !gsi_end_p (i: gsi_to); |
| 1919 | gsi_next (i: &gsi_from), gsi_next (i: &gsi_to)) |
| 1920 | { |
| 1921 | gimple *from_phi = gsi_stmt (i: gsi_from); |
| 1922 | gimple *to_phi = gsi_stmt (i: gsi_to); |
| 1923 | tree new_arg = PHI_ARG_DEF_FROM_EDGE (from_phi, |
| 1924 | loop_latch_edge (new_loop)); |
| 1925 | adjust_phi_and_debug_stmts (update_phi: to_phi, e: loop_preheader_edge (loop), |
| 1926 | new_def: new_arg); |
| 1927 | } |
| 1928 | |
| 1929 | if (scalar_loop != loop) |
| 1930 | { |
| 1931 | /* Remove the non-necessary forwarder of scalar_loop again. */ |
| 1932 | redirect_edge_pred (single_succ_edge (bb: preheader), |
| 1933 | single_pred (bb: preheader)); |
| 1934 | delete_basic_block (preheader); |
| 1935 | set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header, |
| 1936 | loop_preheader_edge (scalar_loop)->src); |
| 1937 | preheader = split_edge (loop_preheader_edge (loop)); |
| 1938 | entry_e = single_pred_edge (bb: preheader); |
| 1939 | } |
| 1940 | |
| 1941 | redirect_edge_and_branch_force (entry_e, new_preheader); |
| 1942 | flush_pending_stmts (entry_e); |
| 1943 | set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src); |
| 1944 | |
| 1945 | |
| 1946 | /* `vect_set_loop_condition' replaces the condition in the main exit of |
| 1947 | loop. For counted loops, this is the IV counting exit, so in the case |
| 1948 | of the prolog loop, we are replacing the old IV counting exit limit of |
| 1949 | total loop niters for the new limit of the prolog niters, as desired. |
| 1950 | For uncounted loops, we don't have an IV-counting exit to replace, so |
| 1951 | we add a dummy exit to be consumed by `vect_set_loop_condition' later |
| 1952 | on. */ |
| 1953 | if (create_main_e) |
| 1954 | { |
| 1955 | edge to_latch_e = single_pred_edge (bb: new_loop->latch); |
| 1956 | bool latch_is_false = to_latch_e->flags & EDGE_FALSE_VALUE ? true |
| 1957 | : false; |
| 1958 | |
| 1959 | /* Add new bb for duplicate exit. */ |
| 1960 | basic_block bbcond = split_edge (to_latch_e); |
| 1961 | gimple_stmt_iterator a = gsi_last_bb (bb: bbcond); |
| 1962 | |
| 1963 | /* Fix flags for the edge leading to the latch. */ |
| 1964 | to_latch_e = find_edge (bbcond, new_loop->latch); |
| 1965 | to_latch_e->flags &= ~EDGE_FALLTHRU; |
| 1966 | to_latch_e->flags |= latch_is_false ? EDGE_FALSE_VALUE |
| 1967 | : EDGE_TRUE_VALUE; |
| 1968 | |
| 1969 | /* Build the condition. */ |
| 1970 | tree cone = build_int_cst (sizetype, 1); |
| 1971 | tree czero = build_int_cst (sizetype, 0); |
| 1972 | gcond *cond_copy = gimple_build_cond (NE_EXPR, cone, czero, NULL_TREE, |
| 1973 | NULL_TREE); |
| 1974 | |
| 1975 | gsi_insert_after (&a, cond_copy, GSI_NEW_STMT); |
| 1976 | |
| 1977 | /* Add edge for exiting the loop via new condition. */ |
| 1978 | edge dup_exit = make_edge (bbcond, new_exit->dest, latch_is_false |
| 1979 | ? EDGE_TRUE_VALUE : EDGE_FALSE_VALUE); |
| 1980 | |
| 1981 | profile_probability probability = profile_probability::even (); |
| 1982 | to_latch_e->probability = dup_exit->probability = probability; |
| 1983 | |
| 1984 | set_immediate_dominator (CDI_DOMINATORS, dup_exit->src, |
| 1985 | new_exit->src); |
| 1986 | new_exit = dup_exit; |
| 1987 | *new_e = new_exit; |
| 1988 | } |
| 1989 | |
| 1990 | redirect_edge_and_branch_force (new_exit, preheader); |
| 1991 | flush_pending_stmts (new_exit); |
| 1992 | set_immediate_dominator (CDI_DOMINATORS, preheader, new_exit->src); |
| 1993 | |
| 1994 | /* And remove the non-necessary forwarder again. Keep the other |
| 1995 | one so we have a proper pre-header for the loop at the exit edge. */ |
| 1996 | redirect_edge_pred (single_succ_edge (bb: new_preheader), |
| 1997 | single_pred (bb: new_preheader)); |
| 1998 | delete_basic_block (new_preheader); |
| 1999 | set_immediate_dominator (CDI_DOMINATORS, new_loop->header, |
| 2000 | loop_preheader_edge (new_loop)->src); |
| 2001 | |
| 2002 | /* Update dominators for multiple exits. */ |
| 2003 | if (multiple_exits_p || create_main_e) |
| 2004 | { |
| 2005 | for (edge alt_e : loop_exits) |
| 2006 | { |
| 2007 | if ((alt_e == loop_exit) && !create_main_e) |
| 2008 | continue; |
| 2009 | basic_block old_dom |
| 2010 | = get_immediate_dominator (CDI_DOMINATORS, alt_e->dest); |
| 2011 | if (flow_bb_inside_loop_p (loop, old_dom)) |
| 2012 | { |
| 2013 | auto_vec<basic_block, 8> queue; |
| 2014 | for (auto son = first_dom_son (CDI_DOMINATORS, old_dom); |
| 2015 | son; son = next_dom_son (CDI_DOMINATORS, son)) |
| 2016 | if (!flow_bb_inside_loop_p (loop, son)) |
| 2017 | queue.safe_push (obj: son); |
| 2018 | for (auto son : queue) |
| 2019 | set_immediate_dominator (CDI_DOMINATORS, |
| 2020 | son, get_bb_copy (old_dom)); |
| 2021 | } |
| 2022 | } |
| 2023 | } |
| 2024 | |
| 2025 | /* When loop_exit != scalar_exit due to if-conversion loop versioning, |
| 2026 | the `scalar_exit' now has two incoming edges, one from the if-converted |
| 2027 | and one from the peeled prolog loop. It is therefore dominated by a |
| 2028 | common block between these. Update its dominator accordingly. */ |
| 2029 | if (create_main_e && loop_exit != scalar_exit) |
| 2030 | set_immediate_dominator (CDI_DOMINATORS, scalar_exit->dest, |
| 2031 | recompute_dominator (CDI_DOMINATORS, |
| 2032 | scalar_exit->dest)); |
| 2033 | } |
| 2034 | |
| 2035 | free (ptr: new_bbs); |
| 2036 | free (ptr: bbs); |
| 2037 | |
| 2038 | checking_verify_dominators (dir: CDI_DOMINATORS); |
| 2039 | |
| 2040 | return new_loop; |
| 2041 | } |
| 2042 | |
| 2043 | |
| 2044 | /* Given the condition expression COND, put it as the last statement of |
| 2045 | GUARD_BB; set both edges' probability; set dominator of GUARD_TO to |
| 2046 | DOM_BB; return the skip edge. GUARD_TO is the target basic block to |
| 2047 | skip the loop. PROBABILITY is the skip edge's probability. Mark the |
| 2048 | new edge as irreducible if IRREDUCIBLE_P is true. */ |
| 2049 | |
static edge
slpeel_add_loop_guard (basic_block guard_bb, tree cond,
		       basic_block guard_to, basic_block dom_bb,
		       profile_probability probability, bool irreducible_p)
{
  gimple_stmt_iterator gsi;
  edge new_e, enter_e;
  gcond *cond_stmt;
  gimple_seq gimplify_stmt_list = NULL;

  /* The existing fallthru successor of GUARD_BB becomes the "enter the
     loop" edge, taken when COND is false.  Clear EDGE_FALLTHRU since the
     block will end in a conditional jump after this.  */
  enter_e = EDGE_SUCC (guard_bb, 0);
  enter_e->flags &= ~EDGE_FALLTHRU;
  enter_e->flags |= EDGE_FALSE_VALUE;
  gsi = gsi_last_bb (bb: guard_bb);

  /* COND may not yet be a valid GIMPLE condition; gimplify it and append
     any statements that produces to the end of GUARD_BB.  */
  cond = force_gimple_operand_1 (cond, &gimplify_stmt_list,
				 is_gimple_condexpr_for_cond, NULL_TREE);
  if (gimplify_stmt_list)
    gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);

  cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
  /* Re-fetch the iterator; the insertions above may have changed the last
     statement of the block.  */
  gsi = gsi_last_bb (bb: guard_bb);
  gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);

  /* Add new edge to connect guard block to the merge/loop-exit block.
     This is the "skip the loop" edge, taken when COND is true.  */
  new_e = make_edge (guard_bb, guard_to, EDGE_TRUE_VALUE);

  new_e->probability = probability;
  if (irreducible_p)
    new_e->flags |= EDGE_IRREDUCIBLE_LOOP;

  /* The enter edge takes the complement of the skip probability.  */
  enter_e->probability = probability.invert ();
  set_immediate_dominator (CDI_DOMINATORS, guard_to, dom_bb);

  /* Split enter_e to preserve LOOPS_HAVE_PREHEADERS: if it now points
     straight at a loop header, insert a forwarder block as preheader.  */
  if (enter_e->dest->loop_father->header == enter_e->dest)
    split_edge (enter_e);

  return new_e;
}
| 2090 | |
| 2091 | |
| 2092 | /* This function verifies that the following restrictions apply to LOOP: |
| 2093 | (1) it consists of exactly 2 basic blocks - header, and an empty latch |
| 2094 | for innermost loop and 5 basic blocks for outer-loop. |
| 2095 | (2) it is single entry, single exit |
| 2096 | (3) its exit condition is the last stmt in the header |
| 2097 | (4) E is the entry/exit edge of LOOP. |
| 2098 | */ |
| 2099 | |
| 2100 | bool |
| 2101 | slpeel_can_duplicate_loop_p (const class loop *loop, const_edge exit_e, |
| 2102 | const_edge e) |
| 2103 | { |
| 2104 | edge entry_e = loop_preheader_edge (loop); |
| 2105 | gcond *orig_cond = get_loop_exit_condition (exit_e); |
| 2106 | gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (bb: exit_e->src); |
| 2107 | |
| 2108 | /* All loops have an outer scope; the only case loop->outer is NULL is for |
| 2109 | the function itself. */ |
| 2110 | if (!loop_outer (loop) |
| 2111 | || !empty_block_p (loop->latch) |
| 2112 | || !exit_e |
| 2113 | /* Verify that new loop exit condition can be trivially modified. */ |
| 2114 | || (!orig_cond || orig_cond != gsi_stmt (i: loop_exit_gsi)) |
| 2115 | || (e != exit_e && e != entry_e)) |
| 2116 | return false; |
| 2117 | |
| 2118 | basic_block *bbs = XNEWVEC (basic_block, loop->num_nodes); |
| 2119 | get_loop_body_with_size (loop, bbs, loop->num_nodes); |
| 2120 | bool ret = can_copy_bbs_p (bbs, loop->num_nodes); |
| 2121 | free (ptr: bbs); |
| 2122 | return ret; |
| 2123 | } |
| 2124 | |
| 2125 | /* Function find_loop_location. |
| 2126 | |
| 2127 | Extract the location of the loop in the source code. |
| 2128 | If the loop is not well formed for vectorization, an estimated |
| 2129 | location is calculated. |
| 2130 | Return the loop location if succeed and NULL if not. */ |
| 2131 | |
| 2132 | dump_user_location_t |
| 2133 | find_loop_location (class loop *loop) |
| 2134 | { |
| 2135 | gimple *stmt = NULL; |
| 2136 | basic_block bb; |
| 2137 | gimple_stmt_iterator si; |
| 2138 | |
| 2139 | if (!loop) |
| 2140 | return dump_user_location_t (); |
| 2141 | |
| 2142 | /* For the root of the loop tree return the function location. */ |
| 2143 | if (!loop_outer (loop)) |
| 2144 | return dump_user_location_t::from_function_decl (cfun->decl); |
| 2145 | |
| 2146 | if (loops_state_satisfies_p (flags: LOOPS_HAVE_RECORDED_EXITS)) |
| 2147 | { |
| 2148 | /* We only care about the loop location, so use any exit with location |
| 2149 | information. */ |
| 2150 | for (edge e : get_loop_exit_edges (loop)) |
| 2151 | { |
| 2152 | stmt = get_loop_exit_condition (e); |
| 2153 | |
| 2154 | if (stmt |
| 2155 | && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION) |
| 2156 | return stmt; |
| 2157 | } |
| 2158 | } |
| 2159 | |
| 2160 | /* If we got here the loop is probably not "well formed", |
| 2161 | try to estimate the loop location */ |
| 2162 | |
| 2163 | if (!loop->header) |
| 2164 | return dump_user_location_t (); |
| 2165 | |
| 2166 | bb = loop->header; |
| 2167 | |
| 2168 | for (si = gsi_start_bb (bb); !gsi_end_p (i: si); gsi_next (i: &si)) |
| 2169 | { |
| 2170 | stmt = gsi_stmt (i: si); |
| 2171 | if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION) |
| 2172 | return stmt; |
| 2173 | } |
| 2174 | |
| 2175 | return dump_user_location_t (); |
| 2176 | } |
| 2177 | |
| 2178 | /* Return true if the phi described by STMT_INFO defines an IV of the |
| 2179 | loop to be vectorized. */ |
| 2180 | |
| 2181 | static bool |
| 2182 | iv_phi_p (stmt_vec_info stmt_info) |
| 2183 | { |
| 2184 | gphi *phi = as_a <gphi *> (p: stmt_info->stmt); |
| 2185 | if (virtual_operand_p (PHI_RESULT (phi))) |
| 2186 | return false; |
| 2187 | |
| 2188 | if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def |
| 2189 | || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def) |
| 2190 | return false; |
| 2191 | |
| 2192 | return true; |
| 2193 | } |
| 2194 | |
| 2195 | /* Return true if vectorizer can peel for nonlinear iv. */ |
| 2196 | static bool |
| 2197 | vect_can_peel_nonlinear_iv_p (loop_vec_info loop_vinfo, |
| 2198 | stmt_vec_info stmt_info) |
| 2199 | { |
| 2200 | enum vect_induction_op_type induction_type |
| 2201 | = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info); |
| 2202 | tree niters_skip; |
| 2203 | /* Init_expr will be update by vect_update_ivs_after_vectorizer, |
| 2204 | if niters or vf is unkown: |
| 2205 | For shift, when shift mount >= precision, there would be UD. |
| 2206 | For mult, don't known how to generate |
| 2207 | init_expr * pow (step, niters) for variable niters. |
| 2208 | For neg unknown niters are ok, since niters of vectorized main loop |
| 2209 | will always be multiple of 2. |
| 2210 | See also PR113163, PR114196 and PR114485. */ |
| 2211 | if (!LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () |
| 2212 | || LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) |
| 2213 | || (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) |
| 2214 | && induction_type != vect_step_op_neg)) |
| 2215 | { |
| 2216 | if (dump_enabled_p ()) |
| 2217 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
| 2218 | "Peeling for epilogue is not supported" |
| 2219 | " for this nonlinear induction" |
| 2220 | " when iteration count is unknown or" |
| 2221 | " when using partial vectorization.\n" ); |
| 2222 | return false; |
| 2223 | } |
| 2224 | |
| 2225 | if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo) |
| 2226 | && induction_type == vect_step_op_mul) |
| 2227 | { |
| 2228 | if (dump_enabled_p ()) |
| 2229 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
| 2230 | "Peeling for is not supported for nonlinear mult" |
| 2231 | " induction using partial vectorization.\n" ); |
| 2232 | return false; |
| 2233 | } |
| 2234 | |
| 2235 | /* Avoid compile time hog on vect_peel_nonlinear_iv_init. */ |
| 2236 | if (induction_type == vect_step_op_mul) |
| 2237 | { |
| 2238 | tree step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info); |
| 2239 | tree type = TREE_TYPE (step_expr); |
| 2240 | |
| 2241 | if (wi::exact_log2 (wi::to_wide (t: step_expr)) == -1 |
| 2242 | && LOOP_VINFO_INT_NITERS(loop_vinfo) >= TYPE_PRECISION (type)) |
| 2243 | { |
| 2244 | if (dump_enabled_p ()) |
| 2245 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
| 2246 | "Avoid compile time hog on" |
| 2247 | " vect_peel_nonlinear_iv_init" |
| 2248 | " for nonlinear induction vec_step_op_mul" |
| 2249 | " when iteration count is too big.\n" ); |
| 2250 | return false; |
| 2251 | } |
| 2252 | } |
| 2253 | |
| 2254 | /* Also doens't support peel for neg when niter is variable. |
| 2255 | ??? generate something like niter_expr & 1 ? init_expr : -init_expr? */ |
| 2256 | niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo); |
| 2257 | if ((niters_skip != NULL_TREE |
| 2258 | && (TREE_CODE (niters_skip) != INTEGER_CST |
| 2259 | || (HOST_WIDE_INT) TREE_INT_CST_LOW (niters_skip) < 0)) |
| 2260 | || (!vect_use_loop_mask_for_alignment_p (loop_vinfo) |
| 2261 | && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)) |
| 2262 | { |
| 2263 | if (dump_enabled_p ()) |
| 2264 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
| 2265 | "Peeling for alignement is not supported" |
| 2266 | " for nonlinear induction when niters_skip" |
| 2267 | " is not constant.\n" ); |
| 2268 | return false; |
| 2269 | } |
| 2270 | |
| 2271 | return true; |
| 2272 | } |
| 2273 | |
| 2274 | /* Function vect_can_advance_ivs_p |
| 2275 | |
| 2276 | In case the number of iterations that LOOP iterates is unknown at compile |
| 2277 | time, an epilog loop will be generated, and the loop induction variables |
| 2278 | (IVs) will be "advanced" to the value they are supposed to take just before |
| 2279 | the epilog loop. Here we check that the access function of the loop IVs |
| 2280 | and the expression that represents the loop bound are simple enough. |
| 2281 | These restrictions will be relaxed in the future. */ |
| 2282 | |
bool
vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block bb = loop->header;
  gphi_iterator gsi;

  /* Analyze phi functions of the loop header.  */

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "vect_can_advance_ivs_p:\n" );
  for (gsi = gsi_start_phis (bb); !gsi_end_p (i: gsi); gsi_next (i: &gsi))
    {
      tree evolution_part;
      enum vect_induction_op_type induction_type;

      gphi *phi = gsi.phi ();
      stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G" ,
			 phi_info->stmt);

      /* Skip virtual phi's.  The data dependences that are associated with
	 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.

	 Skip reduction phis.  */
      if (!iv_phi_p (stmt_info: phi_info))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "reduc or virtual phi. skip.\n" );
	  continue;
	}

      /* Nonlinear inductions (anything other than a simple add step) have
	 their own peeling constraints; defer to the dedicated check and
	 skip the linear-IV tests below.  */
      induction_type = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (phi_info);
      if (induction_type != vect_step_op_add)
	{
	  if (!vect_can_peel_nonlinear_iv_p (loop_vinfo, stmt_info: phi_info))
	    return false;

	  continue;
	}

      /* Analyze the evolution function.  */

      evolution_part = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
      if (evolution_part == NULL_TREE)
	{
	  if (dump_enabled_p ())
	    dump_printf (MSG_MISSED_OPTIMIZATION,
			 "No access function or evolution.\n" );
	  return false;
	}

      /* FORNOW: We do not transform initial conditions of IVs
	 which evolution functions are not invariants in the loop.  */

      if (!expr_invariant_in_loop_p (loop, evolution_part))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "evolution not invariant in loop.\n" );
	  return false;
	}

      /* FORNOW: We do not transform initial conditions of IVs
	 which evolution functions are a polynomial of degree >= 2.  */

      if (tree_is_chrec (expr: evolution_part))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "evolution is chrec.\n" );
	  return false;
	}
    }

  return true;
}
| 2362 | |
| 2363 | |
| 2364 | /* Function vect_update_ivs_after_vectorizer. |
| 2365 | |
| 2366 | "Advance" the induction variables of LOOP to the value they should take |
| 2367 | after the execution of LOOP. This is currently necessary because the |
| 2368 | vectorizer does not handle induction variables that are used after the |
| 2369 | loop. Such a situation occurs when the last iterations of LOOP are |
| 2370 | peeled, because: |
| 2371 | 1. We introduced new uses after LOOP for IVs that were not originally used |
| 2372 | after LOOP: the IVs of LOOP are now used by an epilog loop. |
| 2373 | 2. LOOP is going to be vectorized; this means that it will iterate N/VF |
| 2374 | times, whereas the loop IVs should be bumped N times. |
| 2375 | |
| 2376 | Input: |
| 2377 | - LOOP - a loop that is going to be vectorized. The last few iterations |
| 2378 | of LOOP were peeled. |
| 2379 | - NITERS - the number of iterations that LOOP executes (before it is |
| 2380 | vectorized). i.e, the number of times the ivs should be bumped. |
| 2381 | - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path |
| 2382 | coming out from LOOP on which there are uses of the LOOP ivs |
| 2383 | (this is the path from LOOP->exit to epilog_loop->preheader). |
| 2384 | |
| 2385 | The new definitions of the ivs are placed in LOOP->exit. |
| 2386 | The phi args associated with the edge UPDATE_E in the bb |
| 2387 | UPDATE_E->dest are updated accordingly. |
| 2388 | |
| 2389 | - EARLY_EXIT_P - Indicates whether the exit is an early exit rather than |
| 2390 | the main latch exit. |
| 2391 | |
| 2392 | Assumption 1: Like the rest of the vectorizer, this function assumes |
| 2393 | a single loop exit that has a single predecessor. |
| 2394 | |
| 2395 | Assumption 2: The phi nodes in the LOOP header and in update_bb are |
| 2396 | organized in the same order. |
| 2397 | |
| 2398 | Assumption 3: The access function of the ivs is simple enough (see |
| 2399 | vect_can_advance_ivs_p). This assumption will be relaxed in the future. |
| 2400 | |
| 2401 | Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path |
| 2402 | coming out of LOOP on which the ivs of LOOP are used (this is the path |
| 2403 | that leads to the epilog loop; other paths skip the epilog loop). This |
| 2404 | path starts with the edge UPDATE_E, and its destination (denoted update_bb) |
| 2405 | needs to have its phis updated. |
| 2406 | */ |
| 2407 | |
static void
vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
				  tree niters, edge update_e,
				  bool early_exit_p)
{
  gphi_iterator gsi, gsi1;
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block update_bb = update_e->dest;
  basic_block exit_bb = update_e->src;
  /* Check to see if this is an empty loop pre-header block.  If it exists
     we need to use the edge from that block -> loop header for updates but
     must use the original exit_bb to add any new adjustment because there
     can be a skip_epilog edge bypassing the epilog and so the loop pre-header
     too.  */
  if (empty_block_p (update_bb) && single_succ_p (bb: update_bb))
    {
      update_e = single_succ_edge (bb: update_bb);
      update_bb = update_e->dest;
    }
  gimple_stmt_iterator last_gsi = gsi_last_bb (bb: exit_bb);

  /* Walk the loop-header PHIs and UPDATE_BB's PHIs in lockstep; per
     assumption 2 in the header comment they are in the same order.  */
  for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
       !gsi_end_p (i: gsi) && !gsi_end_p (i: gsi1);
       gsi_next (i: &gsi), gsi_next (i: &gsi1))
    {
      tree init_expr;
      tree step_expr, off;
      tree type;
      tree var, ni, ni_name;

      gphi *phi = gsi.phi ();
      gphi *phi1 = gsi1.phi ();
      stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "vect_update_ivs_after_vectorizer: phi: %G" ,
			 (gimple *) phi);

      /* Skip reduction and virtual phis.  */
      if (!iv_phi_p (stmt_info: phi_info))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "reduc or virtual phi. skip.\n" );
	  continue;
	}

      type = TREE_TYPE (gimple_phi_result (phi));
      /* Unshare so folding below cannot corrupt the recorded evolution.  */
      step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
      step_expr = unshare_expr (step_expr);

      /* FORNOW: We do not support IVs whose evolution function is a
	 polynomial of degree >= 2 or exponential.  */
      gcc_assert (!tree_is_chrec (step_expr));

      init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
      gimple_seq stmts = NULL;
      enum vect_induction_op_type induction_type
	= STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (phi_info);

      if (induction_type == vect_step_op_add)
	{
	  /* Linear IV: the value after NITERS iterations is
	     INIT + NITERS * STEP, computed in the step's type and
	     converted back (via pointer arithmetic for pointer IVs).  */
	  tree stype = TREE_TYPE (step_expr);
	  off = fold_build2 (MULT_EXPR, stype,
			     fold_convert (stype, niters), step_expr);

	  if (POINTER_TYPE_P (type))
	    ni = fold_build_pointer_plus (init_expr, off);
	  else
	    ni = fold_convert (type,
			       fold_build2 (PLUS_EXPR, stype,
					    fold_convert (stype, init_expr),
					    off));
	}
      /* Don't bother calling vect_peel_nonlinear_iv_init.  */
      else if (induction_type == vect_step_op_neg)
	ni = init_expr;
      else
	ni = vect_peel_nonlinear_iv_init (&stmts, init_expr,
					  niters, step_expr,
					  induction_type, early_exit_p);

      var = create_tmp_var (type, "tmp" );

      gimple_seq new_stmts = NULL;
      ni_name = force_gimple_operand (ni, &new_stmts, false, var);

      /* Exit_bb shouldn't be empty, but we also can't insert after a
	 control statement.  */
      if (!gsi_end_p (i: last_gsi) && !is_ctrl_stmt (gsi_stmt (i: last_gsi)))
	{
	  gsi_insert_seq_after (&last_gsi, stmts, GSI_SAME_STMT);
	  gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
	}
      else
	{
	  gsi_insert_seq_before (&last_gsi, stmts, GSI_SAME_STMT);
	  gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
	}

      /* Update the PHI argument on the requested edge.  */
      adjust_phi_and_debug_stmts (update_phi: phi1, e: update_e, new_def: ni_name);
    }
}
| 2512 | |
| 2513 | /* Return a gimple value containing the misalignment (measured in vector |
| 2514 | elements) for the loop described by LOOP_VINFO, i.e. how many elements |
| 2515 | it is away from a perfectly aligned address. Add any new statements |
| 2516 | to SEQ. */ |
| 2517 | |
static tree
get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
{
  dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
  stmt_vec_info stmt_info = dr_info->stmt;
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);

  poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
  unsigned HOST_WIDE_INT target_align_c;
  tree target_align_minus_1;

  /* For a negative-step access, base the computation on the address
     (1 - nunits) elements from the recorded base instead of the base
     itself.  */
  bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
					size_zero_node) < 0;
  tree offset = (negative
		 ? size_int ((-TYPE_VECTOR_SUBPARTS (vectype) + 1)
			     * TREE_INT_CST_LOW
				 (TYPE_SIZE_UNIT (TREE_TYPE (vectype))))
		 : size_zero_node);
  tree start_addr = vect_create_addr_base_for_vector_ref (loop_vinfo,
							  stmt_info, seq,
							  offset);
  tree type = unsigned_type_for (TREE_TYPE (start_addr));
  /* Build TARGET_ALIGN - 1; when the alignment is not a compile-time
     constant the subtraction must be built as a tree expression.  */
  if (target_align.is_constant (const_value: &target_align_c))
    target_align_minus_1 = build_int_cst (type, target_align_c - 1);
  else
    {
      tree vla = build_int_cst (type, target_align);
      target_align_minus_1 = fold_build2 (MINUS_EXPR, type, vla,
					  build_int_cst (type, 1));
    }

  HOST_WIDE_INT elem_size
    = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
  /* The division below is done as a right shift, so the element size
     must be a power of two here.  */
  tree elem_size_log = build_int_cst (type, exact_log2 (x: elem_size));

  /* Create:  misalign_in_bytes = addr & (target_align - 1).  */
  tree int_start_addr = fold_convert (type, start_addr);
  tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr,
					target_align_minus_1);

  /* Create:  misalign_in_elems = misalign_in_bytes / element_size.  */
  tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes,
					elem_size_log);

  return misalign_in_elems;
}
| 2564 | |
| 2565 | /* Function vect_gen_prolog_loop_niters |
| 2566 | |
| 2567 | Generate the number of iterations which should be peeled as prolog for the |
| 2568 | loop represented by LOOP_VINFO. It is calculated as the misalignment of |
| 2569 | DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO). |
| 2570 | As a result, after the execution of this loop, the data reference DR will |
| 2571 | refer to an aligned location. The following computation is generated: |
| 2572 | |
| 2573 | If the misalignment of DR is known at compile time: |
| 2574 | addr_mis = int mis = DR_MISALIGNMENT (dr); |
| 2575 | Else, compute address misalignment in bytes: |
| 2576 | addr_mis = addr & (target_align - 1) |
| 2577 | |
| 2578 | prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step |
| 2579 | |
| 2580 | (elem_size = element type size; an element is the scalar element whose type |
| 2581 | is the inner type of the vectype) |
| 2582 | |
| 2583 | The computations will be emitted at the end of BB. We also compute and |
| 2584 | store upper bound (included) of the result in BOUND. |
| 2585 | |
| 2586 | When the step of the data-ref in the loop is not 1 (as in interleaved data |
| 2587 | and SLP), the number of iterations of the prolog must be divided by the step |
| 2588 | (which is equal to the size of interleaved group). |
| 2589 | |
| 2590 | The above formulas assume that VF == number of elements in the vector. This |
| 2591 | may not hold when there are multiple-types in the loop. |
| 2592 | In this case, for some data-references in the loop the VF does not represent |
| 2593 | the number of elements that fit in the vector. Therefore, instead of VF we |
| 2594 | use TYPE_VECTOR_SUBPARTS. */ |
| 2595 | |
static tree
vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
			     basic_block bb, poly_int64 *bound)
{
  dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
  tree var;
  tree niters_type
    = LOOP_VINFO_NITERS_UNCOUNTED_P (loop_vinfo) ? sizetype
						 : TREE_TYPE (LOOP_VINFO_NITERS
							      (loop_vinfo));
  gimple_seq stmts = NULL, new_stmts = NULL;
  tree iters, iters_name;
  stmt_vec_info stmt_info = dr_info->stmt;
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);

  /* If the peeling amount was fixed at analysis time, just use the
     recorded constant; no runtime computation is needed.  */
  if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
    {
      int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "known peeling = %d.\n" , npeel);

      iters = build_int_cst (niters_type, npeel);
      *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
    }
  else
    {
      /* Runtime case: derive the prolog count from the misalignment of
	 the unaligned data reference (see function header comment).  */
      tree misalign_in_elems = get_misalign_in_elems (seq: &stmts, loop_vinfo);
      tree type = TREE_TYPE (misalign_in_elems);
      HOST_WIDE_INT elem_size
	= int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
      /* We only do prolog peeling if the target alignment is known at compile
	 time.  */
      poly_uint64 align_in_elems =
	exact_div (a: target_align, b: elem_size);
      tree align_in_elems_minus_1 =
	build_int_cst (type, align_in_elems - 1);
      tree align_in_elems_tree = build_int_cst (type, align_in_elems);

      /* Create:  (niters_type) ((align_in_elems - misalign_in_elems)
				 & (align_in_elems - 1)).
	 For a negative step the roles are swapped since the access walks
	 downwards through memory.  */
      bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
					    size_zero_node) < 0;
      if (negative)
	iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
			     align_in_elems_tree);
      else
	iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree,
			     misalign_in_elems);
      iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1);
      iters = fold_convert (niters_type, iters);
      *bound = align_in_elems;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "niters for prolog loop: %T\n" , iters);

  var = create_tmp_var (niters_type, "prolog_loop_niters" );
  iters_name = force_gimple_operand (iters, &new_stmts, false, var);

  /* Emit all generated statements at the end of BB.  */
  if (new_stmts)
    gimple_seq_add_seq (&stmts, new_stmts);
  if (stmts)
    {
      gcc_assert (single_succ_p (bb));
      gimple_stmt_iterator gsi = gsi_last_bb (bb);
      if (gsi_end_p (i: gsi))
	gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
      else
	gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
    }
  return iters_name;
}
| 2672 | |
| 2673 | |
| 2674 | /* Function vect_update_init_of_dr |
| 2675 | |
| 2676 | If CODE is PLUS, the vector loop starts NITERS iterations after the |
| 2677 | scalar one, otherwise CODE is MINUS and the vector loop starts NITERS |
| 2678 | iterations before the scalar one (using masking to skip inactive |
| 2679 | elements). This function updates the information recorded in DR to |
| 2680 | account for the difference. Specifically, it updates the OFFSET |
| 2681 | field of DR_INFO. */ |
| 2682 | |
| 2683 | static void |
| 2684 | vect_update_init_of_dr (dr_vec_info *dr_info, tree niters, tree_code code) |
| 2685 | { |
| 2686 | struct data_reference *dr = dr_info->dr; |
| 2687 | tree offset = dr_info->offset; |
| 2688 | if (!offset) |
| 2689 | offset = build_zero_cst (sizetype); |
| 2690 | |
| 2691 | niters = fold_build2 (MULT_EXPR, sizetype, |
| 2692 | fold_convert (sizetype, niters), |
| 2693 | fold_convert (sizetype, DR_STEP (dr))); |
| 2694 | offset = fold_build2 (code, sizetype, |
| 2695 | fold_convert (sizetype, offset), niters); |
| 2696 | dr_info->offset = offset; |
| 2697 | } |
| 2698 | |
| 2699 | |
| 2700 | /* Function vect_update_inits_of_drs |
| 2701 | |
| 2702 | Apply vect_update_inits_of_dr to all accesses in LOOP_VINFO. |
| 2703 | CODE and NITERS are as for vect_update_inits_of_dr. */ |
| 2704 | |
| 2705 | void |
| 2706 | vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters, |
| 2707 | tree_code code) |
| 2708 | { |
| 2709 | unsigned int i; |
| 2710 | vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo); |
| 2711 | struct data_reference *dr; |
| 2712 | |
| 2713 | DUMP_VECT_SCOPE ("vect_update_inits_of_dr" ); |
| 2714 | |
| 2715 | /* Adjust niters to sizetype. We used to insert the stmts on loop preheader |
| 2716 | here, but since we might use these niters to update the epilogues niters |
| 2717 | and data references we can't insert them here as this definition might not |
| 2718 | always dominate its uses. */ |
| 2719 | if (!types_compatible_p (sizetype, TREE_TYPE (niters))) |
| 2720 | niters = fold_convert (sizetype, niters); |
| 2721 | |
| 2722 | FOR_EACH_VEC_ELT (datarefs, i, dr) |
| 2723 | { |
| 2724 | dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr); |
| 2725 | if (!STMT_VINFO_GATHER_SCATTER_P (dr_info->stmt) |
| 2726 | && !STMT_VINFO_SIMD_LANE_ACCESS_P (dr_info->stmt)) |
| 2727 | vect_update_init_of_dr (dr_info, niters, code); |
| 2728 | } |
| 2729 | } |
| 2730 | |
| 2731 | /* For the information recorded in LOOP_VINFO prepare the loop for peeling |
| 2732 | by masking. This involves calculating the number of iterations to |
| 2733 | be peeled and then aligning all memory references appropriately. */ |
| 2734 | |
void
vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
{
  tree misalign_in_elems;
  tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));

  gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));

  /* From the information recorded in LOOP_VINFO get the number of iterations
     that need to be skipped via masking.  */
  if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
    {
      /* Known peeling amount: mask off VF - npeel leading elements.  */
      poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
			     - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
      misalign_in_elems = build_int_cst (type, misalign);
    }
  else
    {
      /* Otherwise compute the misalignment at runtime, emitting the
	 computation on the loop preheader edge.  */
      gimple_seq seq1 = NULL, seq2 = NULL;
      misalign_in_elems = get_misalign_in_elems (seq: &seq1, loop_vinfo);
      misalign_in_elems = fold_convert (type, misalign_in_elems);
      misalign_in_elems = force_gimple_operand (misalign_in_elems,
						&seq2, true, NULL_TREE);
      gimple_seq_add_seq (&seq1, seq2);
      if (seq1)
	{
	  edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
	  basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1);
	  /* The preheader edge must not need splitting here.  */
	  gcc_assert (!new_bb);
	}
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "misalignment for fully-masked loop: %T\n" ,
		     misalign_in_elems);

  LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems;

  /* The vector loop starts MISALIGN_IN_ELEMS iterations before the scalar
     one (with the leading elements masked off), so shift the recorded
     data-reference offsets back by that amount.  */
  vect_update_inits_of_drs (loop_vinfo, niters: misalign_in_elems, code: MINUS_EXPR);
}
| 2776 | |
| 2777 | /* This function builds ni_name = number of iterations. Statements |
| 2778 | are emitted on the loop preheader edge. If NEW_VAR_P is not NULL, set |
| 2779 | it to TRUE if new ssa_var is generated. */ |
| 2780 | |
| 2781 | tree |
| 2782 | vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p) |
| 2783 | { |
| 2784 | if (LOOP_VINFO_NITERS_UNCOUNTED_P (loop_vinfo)) |
| 2785 | return NULL_TREE; |
| 2786 | tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo)); |
| 2787 | if (TREE_CODE (ni) == INTEGER_CST) |
| 2788 | return ni; |
| 2789 | else |
| 2790 | { |
| 2791 | tree ni_name, var; |
| 2792 | gimple_seq stmts = NULL; |
| 2793 | edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo)); |
| 2794 | |
| 2795 | var = create_tmp_var (TREE_TYPE (ni), "niters" ); |
| 2796 | ni_name = force_gimple_operand (ni, &stmts, false, var); |
| 2797 | if (stmts) |
| 2798 | { |
| 2799 | gsi_insert_seq_on_edge_immediate (pe, stmts); |
| 2800 | if (new_var_p != NULL) |
| 2801 | *new_var_p = true; |
| 2802 | } |
| 2803 | |
| 2804 | return ni_name; |
| 2805 | } |
| 2806 | } |
| 2807 | |
| 2808 | /* Calculate the number of iterations above which vectorized loop will be |
| 2809 | preferred than scalar loop. NITERS_PROLOG is the number of iterations |
| 2810 | of prolog loop. If it's integer const, the integer number is also passed |
| 2811 | in INT_NITERS_PROLOG. BOUND_PROLOG is the upper bound (inclusive) of the |
| 2812 | number of iterations of the prolog loop. BOUND_EPILOG is the corresponding |
| 2813 | value for the epilog loop. If CHECK_PROFITABILITY is true, TH is the |
| 2814 | threshold below which the scalar (rather than vectorized) loop will be |
| 2815 | executed. This function stores the upper bound (inclusive) of the result |
| 2816 | in BOUND_SCALAR. */ |
| 2817 | |
static tree
vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
			     poly_int64 bound_prolog, poly_int64 bound_epilog,
			     int th, poly_uint64 *bound_scalar,
			     bool check_profitability)
{
  tree type = TREE_TYPE (niters_prolog);
  /* Baseline: the scalar loop runs the prolog plus epilog iterations.  */
  tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
			     build_int_cst (type, bound_epilog));

  *bound_scalar = bound_prolog + bound_epilog;
  if (check_profitability)
    {
      /* TH indicates the minimum niters of vectorized loop, while we
	 compute the maximum niters of scalar loop.  */
      th--;
      /* Peeling for constant times.  */
      if (int_niters_prolog >= 0)
	{
	  *bound_scalar = upper_bound (a: int_niters_prolog + bound_epilog, b: th);
	  return build_int_cst (type, *bound_scalar);
	}
      /* Peeling an unknown number of times.  Note that both BOUND_PROLOG
	 and BOUND_EPILOG are inclusive upper bounds.  */
      if (known_ge (th, bound_prolog + bound_epilog))
	{
	  /* The threshold dominates any possible peeling, so it is the
	     scalar bound.  */
	  *bound_scalar = th;
	  return build_int_cst (type, th);
	}
      /* Need to do runtime comparison.  */
      else if (maybe_gt (th, bound_epilog))
	{
	  *bound_scalar = upper_bound (a: *bound_scalar, b: th);
	  return fold_build2 (MAX_EXPR, type,
			      build_int_cst (type, th), niters);
	}
    }
  return niters;
}
| 2857 | |
| 2858 | /* NITERS is the number of times that the original scalar loop executes |
| 2859 | after peeling. Work out the maximum number of iterations N that can |
| 2860 | be handled by the vectorized form of the loop and then either: |
| 2861 | |
| 2862 | a) set *STEP_VECTOR_PTR to the vectorization factor and generate: |
| 2863 | |
| 2864 | niters_vector = N |
| 2865 | |
| 2866 | b) set *STEP_VECTOR_PTR to one and generate: |
| 2867 | |
| 2868 | niters_vector = N / vf |
| 2869 | |
| 2870 | In both cases, store niters_vector in *NITERS_VECTOR_PTR and add |
| 2871 | any new statements on the loop preheader edge. NITERS_NO_OVERFLOW |
| 2872 | is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero). |
| 2873 | |
| 2874 | Case (a) is used for LOOP_VINFO_USING_PARTIAL_VECTORS_P or if VF is |
| 2875 | variable. As stated above, NITERS_VECTOR then equals the number |
| 2876 | of scalar iterations and vect_set_loop_condition will handle the |
| 2877 | step. As opposed to (b) we don't know anything about NITER_VECTOR's |
| 2878 | range here. |
| 2879 | */ |
| 2880 | |
void
vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
			     tree *niters_vector_ptr, tree *step_vector_ptr,
			     bool niters_no_overflow)
{
  tree ni_minus_gap, var;
  tree niters_vector, step_vector;
  tree type = niters ? TREE_TYPE (niters) : sizetype;
  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));

  /* If epilogue loop is required because of data accesses with gaps, we
     subtract one iteration from the total number of iterations here for
     correct calculation of RATIO.  */
  if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
    {
      ni_minus_gap = fold_build2 (MINUS_EXPR, type, niters,
				  build_one_cst (type));
      if (!is_gimple_val (ni_minus_gap))
	{
	  var = create_tmp_var (type, "ni_gap" );
	  gimple *stmts = NULL;
	  ni_minus_gap = force_gimple_operand (ni_minus_gap, &stmts,
					       true, var);
	  gsi_insert_seq_on_edge_immediate (pe, stmts);
	}
    }
  else
    ni_minus_gap = niters;

  /* To silence some unexpected warnings, simply initialize to 0.  */
  unsigned HOST_WIDE_INT const_vf = 0;
  if (vf.is_constant (const_value: &const_vf)
      && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
    {
      /* Case (b) from the header comment: constant VF, full vectors.  */
      /* Create: niters / vf, which is equivalent to niters >> log2(vf) when
	 vf is a power of two, and when not we approximate using a
	 truncating division.  */
      /* If it's known that niters == number of latch executions + 1 doesn't
	 overflow, we can generate niters / vf; otherwise we generate
	 (niters - vf) / vf + 1 by using the fact that we know ratio
	 will be at least one.  */
      tree var_vf = build_int_cst (type, const_vf);
      if (niters_no_overflow)
	niters_vector = fold_build2 (TRUNC_DIV_EXPR, type, ni_minus_gap,
				     var_vf);
      else
	niters_vector
	  = fold_build2 (PLUS_EXPR, type,
			 fold_build2 (TRUNC_DIV_EXPR, type,
				      fold_build2 (MINUS_EXPR, type,
						   ni_minus_gap,
						   var_vf),
				      var_vf),
			 build_int_cst (type, 1));
      step_vector = build_one_cst (type);
    }
  else
    {
      /* Case (a): the count stays in scalar iterations and the step is
	 VF; vect_set_loop_condition handles the stepping.  */
      niters_vector = ni_minus_gap;
      step_vector = build_int_cst (type, vf);
    }

  if (!is_gimple_val (niters_vector))
    {
      var = create_tmp_var (type, "bnd" );
      gimple_seq stmts = NULL;
      niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
      gsi_insert_seq_on_edge_immediate (pe, stmts);
      /* Peeling algorithm guarantees that vector loop bound is at least ONE,
	 we set range information to make niters analyzer's life easier.
	 Note the number of latch iteration value can be TYPE_MAX_VALUE so
	 we have to represent the vector niter TYPE_MAX_VALUE + 1 / vf.  */
      if (stmts != NULL
	  && integer_onep (step_vector))
	{
	  if (niters_no_overflow)
	    {
	      /* Range [1, TYPE_MAX / vf].  */
	      int_range<1> vr (type,
			       wi::one (TYPE_PRECISION (type)),
			       wi::div_trunc (x: wi::max_value
						(TYPE_PRECISION (type),
						 TYPE_SIGN (type)),
					      y: const_vf,
					      TYPE_SIGN (type)));
	      set_range_info (niters_vector, vr);
	    }
	  /* For VF == 1 the vector IV might also overflow so we cannot
	     assert a minimum value of 1.  */
	  else if (const_vf > 1)
	    {
	      /* Range [1, ((TYPE_MAX - (vf - 1)) >> log2 (vf)) + 1],
		 matching the (niters - vf) / vf + 1 form above.  */
	      int_range<1> vr (type,
			       wi::one (TYPE_PRECISION (type)),
			       wi::rshift (x: wi::max_value (TYPE_PRECISION (type),
							   TYPE_SIGN (type))
					   - (const_vf - 1),
					   y: exact_log2 (x: const_vf),
					   TYPE_SIGN (type))
			       + 1);
	      set_range_info (niters_vector, vr);
	    }
	}
    }
  *niters_vector_ptr = niters_vector;
  *step_vector_ptr = step_vector;

  return;
}
| 2988 | |
| 2989 | /* Given NITERS_VECTOR which is the number of iterations for vectorized |
| 2990 | loop specified by LOOP_VINFO after vectorization, compute the number |
| 2991 | of iterations before vectorization (niters_vector * vf) and store it |
| 2992 | to NITERS_VECTOR_MULT_VF_PTR. */ |
| 2993 | |
static void
vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
				     tree niters_vector,
				     tree *niters_vector_mult_vf_ptr)
{
  /* We should be using a step_vector of VF if VF is variable.  */
  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
  tree type = TREE_TYPE (niters_vector);
  tree tree_vf = build_int_cst (type, vf);
  basic_block exit_bb = LOOP_VINFO_MAIN_EXIT (loop_vinfo)->dest;

  gcc_assert (niters_vector_mult_vf_ptr != NULL);
  tree niters_vector_mult_vf = fold_build2 (MULT_EXPR, type,
					    niters_vector, tree_vf);

  /* If we've peeled a vector iteration then subtract one full vector
     iteration.  */
  if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
    niters_vector_mult_vf = fold_build2 (MINUS_EXPR, type,
					 niters_vector_mult_vf, tree_vf);

  /* Gimplify the product if needed, emitting any new statements at the
     start of the main exit block.  */
  if (!is_gimple_val (niters_vector_mult_vf))
    {
      tree var = create_tmp_var (type, "niters_vector_mult_vf" );
      gimple_seq stmts = NULL;
      niters_vector_mult_vf = force_gimple_operand (niters_vector_mult_vf,
						    &stmts, true, var);
      gimple_stmt_iterator gsi = gsi_start_bb (bb: exit_bb);
      gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
    }
  *niters_vector_mult_vf_ptr = niters_vector_mult_vf;
}
| 3026 | |
| 3027 | /* Function slpeel_add_loop_guard adds guard skipping from the beginning |
| 3028 | of SKIP_LOOP to the beginning of UPDATE_LOOP. GUARD_EDGE and MERGE_EDGE |
| 3029 | are two pred edges of the merge point before UPDATE_LOOP. The two loops |
| 3030 | appear like below: |
| 3031 | |
| 3032 | guard_bb: |
| 3033 | if (cond) |
| 3034 | goto merge_bb; |
| 3035 | else |
| 3036 | goto skip_loop; |
| 3037 | |
| 3038 | skip_loop: |
| 3039 | header_a: |
| 3040 | i_1 = PHI<i_0, i_2>; |
| 3041 | ... |
| 3042 | i_2 = i_1 + 1; |
| 3043 | if (cond_a) |
| 3044 | goto latch_a; |
| 3045 | else |
| 3046 | goto exit_a; |
| 3047 | latch_a: |
| 3048 | goto header_a; |
| 3049 | |
| 3050 | exit_a: |
| 3051 | i_5 = PHI<i_2>; |
| 3052 | |
| 3053 | merge_bb: |
| 3054 | ;; PHI (i_x = PHI<i_0, i_5>) to be created at merge point. |
| 3055 | |
| 3056 | update_loop: |
| 3057 | header_b: |
| 3058 | i_3 = PHI<i_5, i_4>; ;; Use of i_5 to be replaced with i_x. |
| 3059 | ... |
| 3060 | i_4 = i_3 + 1; |
| 3061 | if (cond_b) |
| 3062 | goto latch_b; |
| 3063 | else |
| 3064 | goto exit_bb; |
| 3065 | latch_b: |
| 3066 | goto header_b; |
| 3067 | |
| 3068 | exit_bb: |
| 3069 | |
| 3070 | This function creates PHI nodes at merge_bb and replaces the use of i_5 |
| 3071 | in the update_loop's PHI node with the result of new PHI result. */ |
| 3072 | |
| 3073 | static void |
| 3074 | slpeel_update_phi_nodes_for_guard1 (class loop *skip_loop, |
| 3075 | class loop *update_loop, |
| 3076 | edge guard_edge, edge merge_edge) |
| 3077 | { |
| 3078 | location_t merge_loc, guard_loc; |
| 3079 | edge orig_e = loop_preheader_edge (skip_loop); |
| 3080 | edge update_e = loop_preheader_edge (update_loop); |
| 3081 | gphi_iterator gsi_orig, gsi_update; |
| 3082 | |
| 3083 | for ((gsi_orig = gsi_start_phis (skip_loop->header), |
| 3084 | gsi_update = gsi_start_phis (update_loop->header)); |
| 3085 | !gsi_end_p (i: gsi_orig) && !gsi_end_p (i: gsi_update); |
| 3086 | gsi_next (i: &gsi_orig), gsi_next (i: &gsi_update)) |
| 3087 | { |
| 3088 | gphi *orig_phi = gsi_orig.phi (); |
| 3089 | gphi *update_phi = gsi_update.phi (); |
| 3090 | |
| 3091 | /* Generate new phi node at merge bb of the guard. */ |
| 3092 | tree new_res = copy_ssa_name (PHI_RESULT (orig_phi)); |
| 3093 | gphi *new_phi = create_phi_node (new_res, guard_edge->dest); |
| 3094 | |
| 3095 | /* Merge bb has two incoming edges: GUARD_EDGE and MERGE_EDGE. Set the |
| 3096 | args in NEW_PHI for these edges. */ |
| 3097 | tree merge_arg = PHI_ARG_DEF_FROM_EDGE (update_phi, update_e); |
| 3098 | tree guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e); |
| 3099 | merge_loc = gimple_phi_arg_location_from_edge (phi: update_phi, e: update_e); |
| 3100 | guard_loc = gimple_phi_arg_location_from_edge (phi: orig_phi, e: orig_e); |
| 3101 | add_phi_arg (new_phi, merge_arg, merge_edge, merge_loc); |
| 3102 | add_phi_arg (new_phi, guard_arg, guard_edge, guard_loc); |
| 3103 | |
| 3104 | /* Update phi in UPDATE_PHI. */ |
| 3105 | adjust_phi_and_debug_stmts (update_phi, e: update_e, new_def: new_res); |
| 3106 | } |
| 3107 | } |
| 3108 | |
| 3109 | /* LOOP_VINFO is an epilogue loop whose corresponding main loop can be skipped. |
| 3110 | Return a value that equals: |
| 3111 | |
| 3112 | - MAIN_LOOP_VALUE when LOOP_VINFO is entered from the main loop and |
| 3113 | - SKIP_VALUE when the main loop is skipped. */ |
| 3114 | |
| 3115 | tree |
| 3116 | vect_get_main_loop_result (loop_vec_info loop_vinfo, tree main_loop_value, |
| 3117 | tree skip_value) |
| 3118 | { |
| 3119 | gcc_assert (loop_vinfo->main_loop_edge); |
| 3120 | |
| 3121 | tree phi_result = make_ssa_name (TREE_TYPE (main_loop_value)); |
| 3122 | basic_block bb = loop_vinfo->main_loop_edge->dest; |
| 3123 | gphi *new_phi = create_phi_node (phi_result, bb); |
| 3124 | add_phi_arg (new_phi, main_loop_value, loop_vinfo->main_loop_edge, |
| 3125 | UNKNOWN_LOCATION); |
| 3126 | add_phi_arg (new_phi, skip_value, |
| 3127 | loop_vinfo->skip_main_loop_edge, UNKNOWN_LOCATION); |
| 3128 | return phi_result; |
| 3129 | } |
| 3130 | |
| 3131 | /* Function vect_do_peeling. |
| 3132 | |
| 3133 | Input: |
| 3134 | - LOOP_VINFO: Represent a loop to be vectorized, which looks like: |
| 3135 | |
| 3136 | preheader: |
| 3137 | LOOP: |
| 3138 | header_bb: |
| 3139 | loop_body |
| 3140 | if (exit_loop_cond) goto exit_bb |
| 3141 | else goto header_bb |
| 3142 | exit_bb: |
| 3143 | |
| 3144 | - NITERS: The number of iterations of the loop. |
| 3145 | - NITERSM1: The number of iterations of the loop's latch. |
| 3146 | - NITERS_NO_OVERFLOW: No overflow in computing NITERS. |
| 3147 | - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if |
| 3148 | CHECK_PROFITABILITY is true. |
| 3149 | Output: |
| 3150 | - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should |
| 3151 | iterate after vectorization; see vect_set_loop_condition for details. |
| 3152 | - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that |
| 3153 | should be set to the number of scalar iterations handled by the |
| 3154 | vector loop. The SSA name is only used on exit from the loop. |
| 3155 | |
| 3156 | This function peels prolog and epilog from the loop, adds guards skipping |
| 3157 | PROLOG and EPILOG for various conditions. As a result, the changed CFG |
| 3158 | would look like: |
| 3159 | |
| 3160 | guard_bb_1: |
| 3161 | if (prefer_scalar_loop) goto merge_bb_1 |
| 3162 | else goto guard_bb_2 |
| 3163 | |
| 3164 | guard_bb_2: |
| 3165 | if (skip_prolog) goto merge_bb_2 |
| 3166 | else goto prolog_preheader |
| 3167 | |
| 3168 | prolog_preheader: |
| 3169 | PROLOG: |
| 3170 | prolog_header_bb: |
| 3171 | prolog_body |
| 3172 | if (exit_prolog_cond) goto prolog_exit_bb |
| 3173 | else goto prolog_header_bb |
| 3174 | prolog_exit_bb: |
| 3175 | |
| 3176 | merge_bb_2: |
| 3177 | |
| 3178 | vector_preheader: |
| 3179 | VECTOR LOOP: |
| 3180 | vector_header_bb: |
| 3181 | vector_body |
| 3182 | if (exit_vector_cond) goto vector_exit_bb |
| 3183 | else goto vector_header_bb |
| 3184 | vector_exit_bb: |
| 3185 | |
| 3186 | guard_bb_3: |
| 3187 | if (skip_epilog) goto merge_bb_3 |
| 3188 | else goto epilog_preheader |
| 3189 | |
| 3190 | merge_bb_1: |
| 3191 | |
| 3192 | epilog_preheader: |
| 3193 | EPILOG: |
| 3194 | epilog_header_bb: |
| 3195 | epilog_body |
| 3196 | if (exit_epilog_cond) goto merge_bb_3 |
| 3197 | else goto epilog_header_bb |
| 3198 | |
| 3199 | merge_bb_3: |
| 3200 | |
| 3201 | Note this function peels prolog and epilog only if it's necessary, |
| 3202 | as well as guards. |
| 3203 | This function returns the epilogue loop if a decision was made to vectorize |
| 3204 | it, otherwise NULL. |
| 3205 | |
| 3206 | The analysis resulting in this epilogue loop's loop_vec_info was performed |
| 3207 | in the same vect_analyze_loop call as the main loop's. At that time |
| 3208 | vect_analyze_loop constructs a list of accepted loop_vec_info's for lower |
| 3209 | vectorization factors than the main loop. This list is chained in the |
| 3210 | loop's loop_vec_info in the 'epilogue_vinfo' member. When we decide to |
| 3211 | vectorize the epilogue loop for a lower vectorization factor, the |
| 3212 | loop_vec_info in epilogue_vinfo is updated and linked to the epilogue loop. |
| 3213 | This is later used to vectorize the epilogue. |
| 3214 | The reason the loop_vec_info needs updating is that it was |
| 3215 | constructed based on the original main loop, and the epilogue loop is a |
| 3216 | copy of this loop, so all links pointing to statements in the original loop |
| 3217 | need updating. Furthermore, these loop_vec_infos share the |
| 3218 | data_reference's records, which will also need to be updated. |
| 3219 | |
| 3220 | TODO: Guard for prefer_scalar_loop should be emitted along with |
| 3221 | versioning conditions if loop versioning is needed. */ |
| 3222 | |
| 3223 | |
| 3224 | class loop * |
| 3225 | vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, |
| 3226 | tree *niters_vector, tree *step_vector, |
| 3227 | tree *niters_vector_mult_vf_var, int th, |
| 3228 | bool check_profitability, bool niters_no_overflow, |
| 3229 | tree *advance) |
| 3230 | { |
| 3231 | edge e, guard_e; |
| 3232 | tree type = niters ? TREE_TYPE (niters) : sizetype; |
| 3233 | tree guard_cond; |
| 3234 | basic_block guard_bb, guard_to; |
| 3235 | profile_probability prob_prolog, prob_vector, prob_epilog; |
| 3236 | int estimated_vf; |
| 3237 | int prolog_peeling = 0; |
| 3238 | bool vect_epilogues = loop_vinfo->epilogue_vinfo != NULL; |
| 3239 | bool uncounted_p = LOOP_VINFO_NITERS_UNCOUNTED_P (loop_vinfo); |
| 3240 | |
| 3241 | if (!vect_use_loop_mask_for_alignment_p (loop_vinfo)) |
| 3242 | prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo); |
| 3243 | |
| 3244 | poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); |
| 3245 | poly_uint64 bound_epilog = 0; |
| 3246 | if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) |
| 3247 | && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)) |
| 3248 | bound_epilog += vf - 1; |
| 3249 | if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)) |
| 3250 | bound_epilog += 1; |
| 3251 | |
| 3252 | /* For early breaks the scalar loop needs to execute at most VF times |
| 3253 | to find the element that caused the break. */ |
| 3254 | if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)) |
| 3255 | bound_epilog = vf; |
| 3256 | |
| 3257 | bool epilog_peeling = maybe_ne (a: bound_epilog, b: 0U); |
| 3258 | poly_uint64 bound_scalar = bound_epilog; |
| 3259 | |
| 3260 | if (!prolog_peeling && !epilog_peeling) |
| 3261 | return NULL; |
| 3262 | |
| 3263 | /* Before doing any peeling make sure to reset debug binds outside of |
| 3264 | the loop referring to defs not in LC SSA. */ |
| 3265 | class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
| 3266 | for (unsigned i = 0; i < loop->num_nodes; ++i) |
| 3267 | { |
| 3268 | basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i]; |
| 3269 | imm_use_iterator ui; |
| 3270 | gimple *use_stmt; |
| 3271 | for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (i: gsi); |
| 3272 | gsi_next (i: &gsi)) |
| 3273 | { |
| 3274 | FOR_EACH_IMM_USE_STMT (use_stmt, ui, gimple_phi_result (gsi.phi ())) |
| 3275 | if (gimple_debug_bind_p (s: use_stmt) |
| 3276 | && loop != gimple_bb (g: use_stmt)->loop_father |
| 3277 | && !flow_loop_nested_p (loop, |
| 3278 | gimple_bb (g: use_stmt)->loop_father)) |
| 3279 | { |
| 3280 | gimple_debug_bind_reset_value (dbg: use_stmt); |
| 3281 | update_stmt (s: use_stmt); |
| 3282 | } |
| 3283 | } |
| 3284 | for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (i: gsi); |
| 3285 | gsi_next (i: &gsi)) |
| 3286 | { |
| 3287 | ssa_op_iter op_iter; |
| 3288 | def_operand_p def_p; |
| 3289 | FOR_EACH_SSA_DEF_OPERAND (def_p, gsi_stmt (gsi), op_iter, SSA_OP_DEF) |
| 3290 | FOR_EACH_IMM_USE_STMT (use_stmt, ui, DEF_FROM_PTR (def_p)) |
| 3291 | if (gimple_debug_bind_p (s: use_stmt) |
| 3292 | && loop != gimple_bb (g: use_stmt)->loop_father |
| 3293 | && !flow_loop_nested_p (loop, |
| 3294 | gimple_bb (g: use_stmt)->loop_father)) |
| 3295 | { |
| 3296 | gimple_debug_bind_reset_value (dbg: use_stmt); |
| 3297 | update_stmt (s: use_stmt); |
| 3298 | } |
| 3299 | } |
| 3300 | } |
| 3301 | |
| 3302 | prob_vector = profile_probability::guessed_always ().apply_scale (num: 9, den: 10); |
| 3303 | estimated_vf = vect_vf_for_cost (loop_vinfo); |
| 3304 | if (estimated_vf == 2) |
| 3305 | estimated_vf = 3; |
| 3306 | prob_prolog = prob_epilog = profile_probability::guessed_always () |
| 3307 | .apply_scale (num: estimated_vf - 1, den: estimated_vf); |
| 3308 | |
| 3309 | class loop *prolog = NULL, *epilog = NULL; |
| 3310 | class loop *first_loop = loop; |
| 3311 | bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP; |
| 3312 | |
| 3313 | /* SSA form needs to be up-to-date since we are going to manually |
| 3314 | update SSA form in slpeel_tree_duplicate_loop_to_edge_cfg and delete all |
| 3315 | update SSA state after that, so we have to make sure to not lose any |
| 3316 | pending update needs. */ |
| 3317 | gcc_assert (!need_ssa_update_p (cfun)); |
| 3318 | |
| 3319 | /* If we're vectorizing an epilogue loop, we have ensured that the |
| 3320 | virtual operand is in SSA form throughout the vectorized main loop. |
| 3321 | Normally it is possible to trace the updated |
| 3322 | vector-stmt vdefs back to scalar-stmt vdefs and vector-stmt vuses |
| 3323 | back to scalar-stmt vuses, meaning that the effect of the SSA update |
| 3324 | remains local to the main loop. However, there are rare cases in |
| 3325 | which the vectorized loop should have vdefs even when the original scalar |
| 3326 | loop didn't. For example, vectorizing a load with IFN_LOAD_LANES |
| 3327 | introduces clobbers of the temporary vector array, which in turn |
| 3328 | needs new vdefs. If the scalar loop doesn't write to memory, these |
| 3329 | new vdefs will be the only ones in the vector loop. |
     We currently defer the virtual SSA form update and the creation of
     a virtual PHI for this case, so we do not have to make sure the
     newly introduced virtual def is in LCSSA form.  */
| 3333 | |
| 3334 | if (MAY_HAVE_DEBUG_BIND_STMTS) |
| 3335 | { |
| 3336 | gcc_assert (!adjust_vec.exists ()); |
| 3337 | adjust_vec.create (nelems: 32); |
| 3338 | } |
| 3339 | initialize_original_copy_tables (); |
| 3340 | |
| 3341 | /* Record the anchor bb at which the guard should be placed if the scalar |
| 3342 | loop might be preferred. */ |
| 3343 | basic_block anchor = loop_preheader_edge (loop)->src; |
| 3344 | |
| 3345 | /* Generate the number of iterations for the prolog loop. We do this here |
| 3346 | so that we can also get the upper bound on the number of iterations. */ |
| 3347 | tree niters_prolog; |
| 3348 | poly_int64 bound_prolog = 0; |
| 3349 | if (prolog_peeling) |
| 3350 | { |
| 3351 | niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, bb: anchor, |
| 3352 | bound: &bound_prolog); |
      /* If alignment peeling is known, we will always execute prolog.  */
| 3354 | if (TREE_CODE (niters_prolog) == INTEGER_CST) |
| 3355 | prob_prolog = profile_probability::always (); |
| 3356 | } |
| 3357 | else |
| 3358 | niters_prolog = build_int_cst (type, 0); |
| 3359 | |
| 3360 | loop_vec_info epilogue_vinfo = loop_vinfo->epilogue_vinfo; |
| 3361 | tree niters_vector_mult_vf = NULL_TREE; |
| 3362 | /* Saving NITERs before the loop, as this may be changed by prologue. */ |
| 3363 | tree before_loop_niters = LOOP_VINFO_NITERS (loop_vinfo); |
| 3364 | edge update_e = NULL, skip_e = NULL; |
| 3365 | unsigned int lowest_vf = constant_lower_bound (a: vf); |
| 3366 | /* Prolog loop may be skipped. */ |
| 3367 | bool skip_prolog = (prolog_peeling != 0); |
| 3368 | /* Skip this loop to epilog when there are not enough iterations to enter this |
| 3369 | vectorized loop. If true we should perform runtime checks on the NITERS |
| 3370 | to check whether we should skip the current vectorized loop. If we know |
| 3371 | the number of scalar iterations we may choose to add a runtime check if |
| 3372 | this number "maybe" smaller than the number of iterations required |
| 3373 | when we know the number of scalar iterations may potentially |
| 3374 | be smaller than the number of iterations required to enter this loop, for |
| 3375 | this we use the upper bounds on the prolog and epilog peeling. When we |
| 3376 | don't know the number of iterations and don't require versioning it is |
| 3377 | because we have asserted that there are enough scalar iterations to enter |
| 3378 | the main loop, so this skip is not necessary. When we are versioning then |
| 3379 | we only add such a skip if we have chosen to vectorize the epilogue. */ |
| 3380 | bool skip_vector = false; |
| 3381 | if (!uncounted_p) |
| 3382 | skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) |
| 3383 | ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo), |
| 3384 | b: bound_prolog + bound_epilog) |
| 3385 | : (!LOOP_VINFO_USE_VERSIONING_WITHOUT_PEELING (loop_vinfo) |
| 3386 | || vect_epilogues)); |
| 3387 | |
| 3388 | /* Epilog loop must be executed if the number of iterations for epilog |
| 3389 | loop is known at compile time, otherwise we need to add a check at |
| 3390 | the end of vector loop and skip to the end of epilog loop. */ |
| 3391 | bool skip_epilog = (prolog_peeling < 0 |
| 3392 | || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) |
| 3393 | || !vf.is_constant ()); |
| 3394 | /* PEELING_FOR_GAPS and peeling for early breaks are special because epilog |
| 3395 | loop must be executed. */ |
| 3396 | if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) |
| 3397 | || LOOP_VINFO_EARLY_BREAKS (loop_vinfo)) |
| 3398 | skip_epilog = false; |
| 3399 | |
| 3400 | class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo); |
| 3401 | auto_vec<profile_count> original_counts; |
| 3402 | basic_block *original_bbs = NULL; |
| 3403 | |
| 3404 | if (skip_vector) |
| 3405 | { |
| 3406 | split_edge (loop_preheader_edge (loop)); |
| 3407 | |
| 3408 | if (epilog_peeling && (vect_epilogues || scalar_loop == NULL)) |
| 3409 | { |
| 3410 | original_bbs = get_loop_body (loop); |
| 3411 | for (unsigned int i = 0; i < loop->num_nodes; i++) |
| 3412 | original_counts.safe_push(obj: original_bbs[i]->count); |
| 3413 | } |
| 3414 | |
| 3415 | /* Due to the order in which we peel prolog and epilog, we first |
| 3416 | propagate probability to the whole loop. The purpose is to |
| 3417 | avoid adjusting probabilities of both prolog and vector loops |
| 3418 | separately. Note in this case, the probability of epilog loop |
| 3419 | needs to be scaled back later. */ |
| 3420 | basic_block bb_before_loop = loop_preheader_edge (loop)->src; |
| 3421 | if (prob_vector.initialized_p ()) |
| 3422 | { |
| 3423 | scale_bbs_frequencies (&bb_before_loop, 1, prob_vector); |
| 3424 | scale_loop_profile (loop, prob_vector, -1); |
| 3425 | } |
| 3426 | } |
| 3427 | |
| 3428 | if (vect_epilogues) |
| 3429 | { |
| 3430 | /* Make sure to set the epilogue's epilogue scalar loop, such that we can |
| 3431 | use the original scalar loop as remaining epilogue if necessary. */ |
| 3432 | LOOP_VINFO_SCALAR_LOOP (epilogue_vinfo) |
| 3433 | = LOOP_VINFO_SCALAR_LOOP (loop_vinfo); |
| 3434 | LOOP_VINFO_SCALAR_MAIN_EXIT (epilogue_vinfo) |
| 3435 | = LOOP_VINFO_SCALAR_MAIN_EXIT (loop_vinfo); |
| 3436 | } |
| 3437 | |
| 3438 | if (prolog_peeling) |
| 3439 | { |
| 3440 | e = loop_preheader_edge (loop); |
| 3441 | edge exit_e = LOOP_VINFO_MAIN_EXIT (loop_vinfo); |
| 3442 | gcc_checking_assert (slpeel_can_duplicate_loop_p (loop, exit_e, e) |
| 3443 | && (!LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo) |
| 3444 | || uncounted_p)); |
| 3445 | |
| 3446 | /* Peel prolog and put it on preheader edge of loop. */ |
| 3447 | edge scalar_e = LOOP_VINFO_SCALAR_MAIN_EXIT (loop_vinfo); |
| 3448 | edge prolog_e = NULL; |
| 3449 | prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, loop_exit: exit_e, |
| 3450 | scalar_loop, scalar_exit: scalar_e, |
| 3451 | e, new_e: &prolog_e, flow_loops: true, NULL, |
| 3452 | uncounted_p: false, create_main_e: uncounted_p); |
| 3453 | |
| 3454 | gcc_assert (prolog); |
| 3455 | prolog->force_vectorize = false; |
| 3456 | |
| 3457 | /* Assign hierarchical discriminators to distinguish prolog loop. */ |
| 3458 | gimple *prolog_last = last_nondebug_stmt (prolog->header); |
| 3459 | location_t prolog_loc |
| 3460 | = prolog_last ? gimple_location (g: prolog_last) : UNKNOWN_LOCATION; |
| 3461 | if (prolog_loc != UNKNOWN_LOCATION) |
| 3462 | { |
| 3463 | unsigned int prolog_copyid = allocate_copyid_base (loc: prolog_loc, stride: 1); |
| 3464 | assign_discriminators_to_loop (loop: prolog, multiplicity_factor: 0, copyid: prolog_copyid); |
| 3465 | } |
| 3466 | first_loop = prolog; |
| 3467 | reset_original_copy_tables (); |
| 3468 | |
| 3469 | /* Update the number of iterations for prolog loop. */ |
| 3470 | tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog)); |
| 3471 | vect_set_loop_condition (loop: prolog, loop_e: prolog_e, NULL, niters: niters_prolog, |
| 3472 | step: step_prolog, NULL_TREE, niters_maybe_zero: false); |
| 3473 | |
| 3474 | /* Skip the prolog loop. */ |
| 3475 | if (skip_prolog) |
| 3476 | { |
| 3477 | guard_cond = fold_build2 (EQ_EXPR, boolean_type_node, |
| 3478 | niters_prolog, build_int_cst (type, 0)); |
| 3479 | guard_bb = loop_preheader_edge (prolog)->src; |
| 3480 | basic_block bb_after_prolog = loop_preheader_edge (loop)->src; |
| 3481 | guard_to = split_edge (loop_preheader_edge (loop)); |
| 3482 | guard_e = slpeel_add_loop_guard (guard_bb, cond: guard_cond, |
| 3483 | guard_to, dom_bb: guard_bb, |
| 3484 | probability: prob_prolog.invert (), |
| 3485 | irreducible_p: irred_flag); |
| 3486 | for (edge alt_e : get_loop_exit_edges (prolog)) |
| 3487 | { |
| 3488 | if (alt_e == prolog_e) |
| 3489 | continue; |
| 3490 | basic_block old_dom |
| 3491 | = get_immediate_dominator (CDI_DOMINATORS, alt_e->dest); |
| 3492 | if (flow_bb_inside_loop_p (prolog, old_dom)) |
| 3493 | { |
| 3494 | auto_vec<basic_block, 8> queue; |
| 3495 | for (auto son = first_dom_son (CDI_DOMINATORS, old_dom); |
| 3496 | son; son = next_dom_son (CDI_DOMINATORS, son)) |
| 3497 | if (!flow_bb_inside_loop_p (prolog, son)) |
| 3498 | queue.safe_push (obj: son); |
| 3499 | for (auto son : queue) |
| 3500 | set_immediate_dominator (CDI_DOMINATORS, son, guard_bb); |
| 3501 | } |
| 3502 | } |
| 3503 | |
| 3504 | e = EDGE_PRED (guard_to, 0); |
| 3505 | e = (e != guard_e ? e : EDGE_PRED (guard_to, 1)); |
| 3506 | slpeel_update_phi_nodes_for_guard1 (skip_loop: prolog, update_loop: loop, guard_edge: guard_e, merge_edge: e); |
| 3507 | |
| 3508 | scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog); |
| 3509 | scale_loop_profile (prolog, prob_prolog, |
| 3510 | estimated_poly_value (x: bound_prolog) - 1); |
| 3511 | } |
| 3512 | |
| 3513 | /* Update init address of DRs. */ |
| 3514 | vect_update_inits_of_drs (loop_vinfo, niters: niters_prolog, code: PLUS_EXPR); |
| 3515 | if (!uncounted_p) |
| 3516 | { |
| 3517 | /* Update niters for vector loop. */ |
| 3518 | LOOP_VINFO_NITERS (loop_vinfo) |
| 3519 | = fold_build2 (MINUS_EXPR, type, niters, niters_prolog); |
| 3520 | LOOP_VINFO_NITERSM1 (loop_vinfo) |
| 3521 | = fold_build2 (MINUS_EXPR, type, |
| 3522 | LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog); |
| 3523 | } |
| 3524 | bool new_var_p = false; |
| 3525 | niters = vect_build_loop_niters (loop_vinfo, new_var_p: &new_var_p); |
| 3526 | /* It's guaranteed that vector loop bound before vectorization is at |
| 3527 | least VF, so set range information for newly generated var. */ |
| 3528 | if (new_var_p) |
| 3529 | { |
| 3530 | int_range<1> vr (type, |
| 3531 | wi::to_wide (t: build_int_cst (type, lowest_vf)), |
| 3532 | wi::to_wide (TYPE_MAX_VALUE (type))); |
| 3533 | set_range_info (niters, vr); |
| 3534 | } |
| 3535 | |
| 3536 | /* Prolog iterates at most bound_prolog times, latch iterates at |
| 3537 | most bound_prolog - 1 times. */ |
| 3538 | if (bound_prolog.is_constant ()) |
| 3539 | record_niter_bound (prolog, bound_prolog.to_constant () - 1, false, |
| 3540 | true); |
| 3541 | delete_update_ssa (); |
| 3542 | adjust_vec_debug_stmts (); |
| 3543 | scev_reset (); |
| 3544 | } |
| 3545 | basic_block bb_before_epilog = NULL; |
| 3546 | |
| 3547 | if (epilog_peeling) |
| 3548 | { |
| 3549 | e = LOOP_VINFO_MAIN_EXIT (loop_vinfo); |
| 3550 | gcc_checking_assert (slpeel_can_duplicate_loop_p (loop, e, e)); |
| 3551 | |
| 3552 | /* Peel epilog and put it on exit edge of loop. If we are vectorizing |
| 3553 | said epilog then we should use a copy of the main loop as a starting |
| 3554 | point. This loop may have already had some preliminary transformations |
| 3555 | to allow for more optimal vectorization, for example if-conversion. |
| 3556 | If we are not vectorizing the epilog then we should use the scalar loop |
| 3557 | as the transformations mentioned above make less or no sense when not |
| 3558 | vectorizing. */ |
| 3559 | edge scalar_e = LOOP_VINFO_SCALAR_MAIN_EXIT (loop_vinfo); |
| 3560 | epilog = vect_epilogues ? get_loop_copy (loop) : scalar_loop; |
| 3561 | edge epilog_e = vect_epilogues ? e : scalar_e; |
| 3562 | edge new_epilog_e = NULL; |
| 3563 | auto_vec<basic_block> doms; |
| 3564 | epilog |
| 3565 | = slpeel_tree_duplicate_loop_to_edge_cfg (loop, loop_exit: e, scalar_loop: epilog, scalar_exit: epilog_e, e, |
| 3566 | new_e: &new_epilog_e, flow_loops: true, updated_doms: &doms, |
| 3567 | uncounted_p); |
| 3568 | |
| 3569 | LOOP_VINFO_EPILOGUE_MAIN_EXIT (loop_vinfo) = new_epilog_e; |
| 3570 | gcc_assert (epilog); |
| 3571 | gcc_assert (new_epilog_e); |
| 3572 | epilog->force_vectorize = false; |
| 3573 | bb_before_epilog = loop_preheader_edge (epilog)->src; |
| 3574 | |
| 3575 | /* Assign hierarchical discriminators to distinguish epilog loop. |
| 3576 | Only assign if it's a scalar epilog. If it will be vectorized |
| 3577 | (vect_epilogues), discriminators will be assigned. |
| 3578 | Use dynamic copy_id allocation instead of hardcoded constants. */ |
| 3579 | if (!vect_epilogues) |
| 3580 | { |
| 3581 | gimple *epilog_last = last_nondebug_stmt (epilog->header); |
| 3582 | location_t epilog_loc |
| 3583 | = epilog_last ? gimple_location (g: epilog_last) : UNKNOWN_LOCATION; |
| 3584 | if (epilog_loc != UNKNOWN_LOCATION) |
| 3585 | { |
| 3586 | unsigned int epilog_copyid = allocate_copyid_base (loc: epilog_loc, stride: 1); |
| 3587 | assign_discriminators_to_loop (loop: epilog, multiplicity_factor: 0, copyid: epilog_copyid); |
| 3588 | } |
| 3589 | } |
| 3590 | |
| 3591 | /* Scalar version loop may be preferred. In this case, add guard |
| 3592 | and skip to epilog. Note this only happens when the number of |
| 3593 | iterations of loop is unknown at compile time, otherwise this |
| 3594 | won't be vectorized. */ |
| 3595 | if (skip_vector) |
| 3596 | { |
| 3597 | /* Additional epilogue iteration is peeled if gap exists. */ |
| 3598 | tree t = vect_gen_scalar_loop_niters (niters_prolog, int_niters_prolog: prolog_peeling, |
| 3599 | bound_prolog, bound_epilog, |
| 3600 | th, bound_scalar: &bound_scalar, |
| 3601 | check_profitability); |
| 3602 | /* Build guard against NITERSM1 since NITERS may overflow. */ |
| 3603 | guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t); |
| 3604 | guard_bb = anchor; |
| 3605 | guard_to = split_edge (loop_preheader_edge (epilog)); |
| 3606 | guard_e = slpeel_add_loop_guard (guard_bb, cond: guard_cond, |
| 3607 | guard_to, dom_bb: guard_bb, |
| 3608 | probability: prob_vector.invert (), |
| 3609 | irreducible_p: irred_flag); |
| 3610 | skip_e = guard_e; |
| 3611 | e = EDGE_PRED (guard_to, 0); |
| 3612 | e = (e != guard_e ? e : EDGE_PRED (guard_to, 1)); |
| 3613 | |
| 3614 | /* Handle any remaining dominator updates needed after |
| 3615 | inserting the loop skip edge above. */ |
| 3616 | if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo) |
| 3617 | && prolog_peeling) |
| 3618 | { |
| 3619 | /* Adding a skip edge to skip a loop with multiple exits |
| 3620 | means the dominator of the join blocks for all exits shifts |
| 3621 | from the prolog skip guard to the loop skip guard. */ |
| 3622 | auto prolog_skip_bb |
| 3623 | = single_pred (bb: loop_preheader_edge (prolog)->src); |
| 3624 | auto needs_update |
| 3625 | = get_dominated_by (CDI_DOMINATORS, prolog_skip_bb); |
| 3626 | |
| 3627 | /* Update everything except for the immediate children of |
| 3628 | the prolog skip block (the prolog and vector preheaders). |
| 3629 | Those should remain dominated by the prolog skip block itself, |
| 3630 | since the loop guard edge goes to the epilogue. */ |
| 3631 | for (auto bb : needs_update) |
| 3632 | if (bb != EDGE_SUCC (prolog_skip_bb, 0)->dest |
| 3633 | && bb != EDGE_SUCC (prolog_skip_bb, 1)->dest) |
| 3634 | set_immediate_dominator (CDI_DOMINATORS, bb, guard_bb); |
| 3635 | } |
| 3636 | |
| 3637 | slpeel_update_phi_nodes_for_guard1 (skip_loop: first_loop, update_loop: epilog, guard_edge: guard_e, merge_edge: e); |
| 3638 | |
| 3639 | /* Simply propagate profile info from guard_bb to guard_to which is |
| 3640 | a merge point of control flow. */ |
| 3641 | profile_count old_count = guard_to->count; |
| 3642 | guard_to->count = guard_bb->count; |
| 3643 | |
| 3644 | /* Restore the counts of the epilog loop if we didn't use the scalar loop. */ |
| 3645 | if (vect_epilogues || scalar_loop == NULL) |
| 3646 | { |
| 3647 | gcc_assert(epilog->num_nodes == loop->num_nodes); |
| 3648 | basic_block *bbs = get_loop_body (epilog); |
| 3649 | for (unsigned int i = 0; i < epilog->num_nodes; i++) |
| 3650 | { |
| 3651 | gcc_assert(get_bb_original (bbs[i]) == original_bbs[i]); |
| 3652 | bbs[i]->count = original_counts[i]; |
| 3653 | } |
| 3654 | free (ptr: bbs); |
| 3655 | free (ptr: original_bbs); |
| 3656 | } |
| 3657 | else if (old_count.nonzero_p ()) |
| 3658 | scale_loop_profile (epilog, guard_to->count.probability_in (overall: old_count), -1); |
| 3659 | |
| 3660 | /* Only need to handle basic block before epilog loop if it's not |
| 3661 | the guard_bb, which is the case when skip_vector is true. */ |
| 3662 | if (guard_bb != bb_before_epilog && single_pred_p (bb: bb_before_epilog)) |
| 3663 | bb_before_epilog->count = single_pred_edge (bb: bb_before_epilog)->count (); |
| 3664 | bb_before_epilog = loop_preheader_edge (epilog)->src; |
| 3665 | } |
| 3666 | |
| 3667 | if (!uncounted_p) |
| 3668 | { |
| 3669 | /* If loop is peeled for non-zero constant times, now niters refers to |
| 3670 | orig_niters - prolog_peeling, it won't overflow even the |
| 3671 | orig_niters overflows. */ |
| 3672 | niters_no_overflow |= (prolog_peeling > 0); |
| 3673 | vect_gen_vector_loop_niters (loop_vinfo, niters, |
| 3674 | niters_vector_ptr: niters_vector, step_vector_ptr: step_vector, |
| 3675 | niters_no_overflow); |
| 3676 | if (!integer_onep (*step_vector)) |
| 3677 | { |
	      /* On exit from the loop we will have an easy way of calculating
| 3679 | NITERS_VECTOR / STEP * STEP. Install a dummy definition |
| 3680 | until then. */ |
| 3681 | niters_vector_mult_vf |
| 3682 | = make_ssa_name (TREE_TYPE (*niters_vector)); |
| 3683 | SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop (); |
| 3684 | *niters_vector_mult_vf_var = niters_vector_mult_vf; |
| 3685 | } |
| 3686 | else |
| 3687 | vect_gen_vector_loop_niters_mult_vf (loop_vinfo, niters_vector: *niters_vector, |
| 3688 | niters_vector_mult_vf_ptr: &niters_vector_mult_vf); |
| 3689 | /* Update IVs of original loop as if they were advanced by |
| 3690 | niters_vector_mult_vf steps. */ |
| 3691 | gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo)); |
| 3692 | update_e = skip_vector ? e : loop_preheader_edge (epilog); |
| 3693 | } |
| 3694 | if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)) |
| 3695 | update_e = single_succ_edge (LOOP_VINFO_MAIN_EXIT (loop_vinfo)->dest); |
| 3696 | |
| 3697 | /* If we have a peeled vector iteration we will never skip the epilog loop |
| 3698 | and we can simplify the cfg a lot by not doing the edge split. */ |
| 3699 | if (skip_epilog |
| 3700 | || (LOOP_VINFO_EARLY_BREAKS (loop_vinfo) |
| 3701 | && !LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))) |
| 3702 | { |
| 3703 | guard_cond = fold_build2 (EQ_EXPR, boolean_type_node, |
| 3704 | niters, niters_vector_mult_vf); |
| 3705 | |
| 3706 | guard_bb = LOOP_VINFO_MAIN_EXIT (loop_vinfo)->dest; |
| 3707 | edge epilog_e = LOOP_VINFO_EPILOGUE_MAIN_EXIT (loop_vinfo); |
| 3708 | guard_to = epilog_e->dest; |
| 3709 | guard_e = slpeel_add_loop_guard (guard_bb, cond: guard_cond, guard_to, |
| 3710 | dom_bb: skip_vector ? anchor : guard_bb, |
| 3711 | probability: prob_epilog.invert (), |
| 3712 | irreducible_p: irred_flag); |
| 3713 | |
| 3714 | doms.safe_push (obj: guard_to); |
| 3715 | if (vect_epilogues) |
| 3716 | epilogue_vinfo->skip_this_loop_edge = guard_e; |
| 3717 | edge main_iv = LOOP_VINFO_MAIN_EXIT (loop_vinfo); |
| 3718 | gphi_iterator gsi2 = gsi_start_phis (main_iv->dest); |
| 3719 | for (gphi_iterator gsi = gsi_start_phis (guard_to); |
| 3720 | !gsi_end_p (i: gsi); gsi_next (i: &gsi)) |
| 3721 | { |
| 3722 | /* We are expecting all of the PHIs we have on epilog_e |
| 3723 | to be also on the main loop exit. But sometimes |
| 3724 | a stray virtual definition can appear at epilog_e |
| 3725 | which we can then take as the same on all exits, |
| 3726 | we've removed the LC SSA PHI on the main exit before |
| 3727 | so we wouldn't need to create a loop PHI for it. */ |
| 3728 | if (virtual_operand_p (op: gimple_phi_result (gs: *gsi)) |
| 3729 | && (gsi_end_p (i: gsi2) |
| 3730 | || !virtual_operand_p (op: gimple_phi_result (gs: *gsi2)))) |
| 3731 | add_phi_arg (*gsi, |
| 3732 | gimple_phi_arg_def_from_edge (gs: *gsi, e: epilog_e), |
| 3733 | guard_e, UNKNOWN_LOCATION); |
| 3734 | else |
| 3735 | { |
| 3736 | add_phi_arg (*gsi, gimple_phi_result (gs: *gsi2), guard_e, |
| 3737 | UNKNOWN_LOCATION); |
| 3738 | gsi_next (i: &gsi2); |
| 3739 | } |
| 3740 | } |
| 3741 | |
| 3742 | /* Only need to handle basic block before epilog loop if it's not |
| 3743 | the guard_bb, which is the case when skip_vector is true. */ |
| 3744 | if (guard_bb != bb_before_epilog) |
| 3745 | { |
| 3746 | prob_epilog = prob_vector * prob_epilog + prob_vector.invert (); |
| 3747 | |
| 3748 | scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog); |
| 3749 | } |
| 3750 | scale_loop_profile (epilog, prob_epilog, -1); |
| 3751 | } |
| 3752 | |
| 3753 | /* If we have a peeled vector iteration, all exits are the same, leave it |
| 3754 | and so the main exit needs to be treated the same as the alternative |
| 3755 | exits in that we leave their updates to vectorizable_live_operations. |
| 3756 | */ |
| 3757 | tree vector_iters_vf = niters_vector_mult_vf; |
| 3758 | if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)) |
| 3759 | { |
| 3760 | tree tmp_niters_vf |
| 3761 | = make_ssa_name (LOOP_VINFO_EARLY_BRK_IV_TYPE (loop_vinfo)); |
| 3762 | |
| 3763 | if (!(LOOP_VINFO_NITERS_UNCOUNTED_P (loop_vinfo) |
| 3764 | && get_loop_exit_edges (loop).length () == 1)) |
| 3765 | { |
| 3766 | basic_block exit_bb = NULL; |
| 3767 | edge update_e = NULL; |
| 3768 | |
| 3769 | /* Identify the early exit merge block. I wish we had stored |
| 3770 | this. */ |
| 3771 | for (auto e : get_loop_exit_edges (loop)) |
| 3772 | if (e != LOOP_VINFO_MAIN_EXIT (loop_vinfo)) |
| 3773 | { |
| 3774 | exit_bb = e->dest; |
| 3775 | update_e = single_succ_edge (bb: exit_bb); |
| 3776 | break; |
| 3777 | } |
| 3778 | vect_update_ivs_after_vectorizer (loop_vinfo, niters: tmp_niters_vf, |
| 3779 | update_e, early_exit_p: true); |
| 3780 | } |
| 3781 | if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo)) |
| 3782 | vector_iters_vf = tmp_niters_vf; |
| 3783 | |
| 3784 | LOOP_VINFO_EARLY_BRK_NITERS_VAR (loop_vinfo) = tmp_niters_vf; |
| 3785 | } |
| 3786 | |
| 3787 | bool recalculate_peel_niters_init |
| 3788 | = LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo); |
| 3789 | vect_update_ivs_after_vectorizer (loop_vinfo, niters: vector_iters_vf, |
| 3790 | update_e, |
| 3791 | early_exit_p: recalculate_peel_niters_init); |
| 3792 | |
| 3793 | /* Recalculate the dominators after adding the guard edge. */ |
| 3794 | if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)) |
| 3795 | iterate_fix_dominators (CDI_DOMINATORS, doms, false); |
| 3796 | |
| 3797 | /* When we do not have a loop-around edge to the epilog we know |
| 3798 | the vector loop covered at least VF scalar iterations unless |
| 3799 | we have early breaks. |
| 3800 | Update any known upper bound with this knowledge. */ |
| 3801 | if (! skip_vector |
| 3802 | && ! LOOP_VINFO_EARLY_BREAKS (loop_vinfo)) |
| 3803 | { |
| 3804 | if (epilog->any_upper_bound) |
| 3805 | epilog->nb_iterations_upper_bound -= lowest_vf; |
| 3806 | if (epilog->any_likely_upper_bound) |
| 3807 | epilog->nb_iterations_likely_upper_bound -= lowest_vf; |
| 3808 | if (epilog->any_estimate) |
| 3809 | epilog->nb_iterations_estimate -= lowest_vf; |
| 3810 | } |
| 3811 | |
| 3812 | unsigned HOST_WIDE_INT bound; |
| 3813 | if (bound_scalar.is_constant (const_value: &bound)) |
| 3814 | { |
| 3815 | gcc_assert (bound != 0); |
| 3816 | /* Adjust the upper bound by the extra peeled vector iteration if we |
| 3817 | are an epilogue of an peeled vect loop and not VLA. For VLA the |
| 3818 | loop bounds are unknown. */ |
| 3819 | if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo) |
| 3820 | && vf.is_constant ()) |
| 3821 | bound += vf.to_constant (); |
| 3822 | /* -1 to convert loop iterations to latch iterations. */ |
| 3823 | record_niter_bound (epilog, bound - 1, false, true); |
| 3824 | scale_loop_profile (epilog, profile_probability::always (), |
| 3825 | bound - 1); |
| 3826 | } |
| 3827 | |
| 3828 | delete_update_ssa (); |
| 3829 | adjust_vec_debug_stmts (); |
| 3830 | scev_reset (); |
| 3831 | } |
| 3832 | |
| 3833 | if (vect_epilogues) |
| 3834 | { |
| 3835 | epilog->aux = epilogue_vinfo; |
| 3836 | LOOP_VINFO_LOOP (epilogue_vinfo) = epilog; |
| 3837 | LOOP_VINFO_MAIN_EXIT (epilogue_vinfo) |
| 3838 | = LOOP_VINFO_EPILOGUE_MAIN_EXIT (loop_vinfo); |
| 3839 | |
| 3840 | loop_constraint_clear (loop: epilog, LOOP_C_INFINITE); |
| 3841 | |
| 3842 | /* We now must calculate the number of NITERS performed by the previous |
| 3843 | loop and EPILOGUE_NITERS to be performed by the epilogue. */ |
| 3844 | tree niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters_vector_mult_vf), |
| 3845 | niters_prolog, niters_vector_mult_vf); |
| 3846 | |
| 3847 | /* If skip_vector we may skip the previous loop, we insert a phi-node to |
| 3848 | determine whether we are coming from the previous vectorized loop |
| 3849 | using the update_e edge or the skip_vector basic block using the |
| 3850 | skip_e edge. */ |
| 3851 | if (skip_vector) |
| 3852 | { |
| 3853 | gcc_assert (update_e != NULL && skip_e != NULL); |
| 3854 | gphi *new_phi = create_phi_node (make_ssa_name (TREE_TYPE (niters)), |
| 3855 | update_e->dest); |
| 3856 | tree new_ssa = make_ssa_name (TREE_TYPE (niters)); |
| 3857 | gimple *stmt = gimple_build_assign (new_ssa, niters); |
| 3858 | gimple_stmt_iterator gsi; |
| 3859 | if (TREE_CODE (niters_vector_mult_vf) == SSA_NAME |
| 3860 | && SSA_NAME_DEF_STMT (niters_vector_mult_vf)->bb != NULL) |
| 3861 | { |
| 3862 | gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (niters_vector_mult_vf)); |
| 3863 | gsi_insert_after (&gsi, stmt, GSI_NEW_STMT); |
| 3864 | } |
| 3865 | else |
| 3866 | { |
| 3867 | gsi = gsi_last_bb (bb: update_e->src); |
| 3868 | gsi_insert_before (&gsi, stmt, GSI_NEW_STMT); |
| 3869 | } |
| 3870 | |
| 3871 | niters = new_ssa; |
| 3872 | add_phi_arg (new_phi, niters, update_e, UNKNOWN_LOCATION); |
| 3873 | add_phi_arg (new_phi, build_zero_cst (TREE_TYPE (niters)), skip_e, |
| 3874 | UNKNOWN_LOCATION); |
| 3875 | niters = PHI_RESULT (new_phi); |
| 3876 | epilogue_vinfo->main_loop_edge = update_e; |
| 3877 | epilogue_vinfo->skip_main_loop_edge = skip_e; |
| 3878 | } |
| 3879 | |
| 3880 | /* Set ADVANCE to the number of iterations performed by the previous |
| 3881 | loop and its prologue. */ |
| 3882 | *advance = niters; |
| 3883 | |
| 3884 | /* Subtract the number of iterations performed by the vectorized loop |
| 3885 | from the number of total iterations. */ |
| 3886 | tree epilogue_niters = fold_build2 (MINUS_EXPR, TREE_TYPE (niters), |
| 3887 | before_loop_niters, |
| 3888 | niters); |
| 3889 | |
| 3890 | LOOP_VINFO_NITERS (epilogue_vinfo) = epilogue_niters; |
| 3891 | LOOP_VINFO_NITERSM1 (epilogue_vinfo) |
| 3892 | = fold_build2 (MINUS_EXPR, TREE_TYPE (epilogue_niters), |
| 3893 | epilogue_niters, |
| 3894 | build_one_cst (TREE_TYPE (epilogue_niters))); |
| 3895 | } |
| 3896 | |
| 3897 | adjust_vec.release (); |
| 3898 | free_original_copy_tables (); |
| 3899 | |
| 3900 | return vect_epilogues ? epilog : NULL; |
| 3901 | } |
| 3902 | |
| 3903 | /* Function vect_create_cond_for_niters_checks. |
| 3904 | |
| 3905 | Create a conditional expression that represents the run-time checks for |
| 3906 | loop's niter. The loop is guaranteed to terminate if the run-time |
| 3907 | checks hold. |
| 3908 | |
| 3909 | Input: |
| 3910 | COND_EXPR - input conditional expression. New conditions will be chained |
| 3911 | with logical AND operation. If it is NULL, then the function |
| 3912 | is used to return the number of alias checks. |
| 3913 | LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs |
| 3914 | to be checked. |
| 3915 | |
| 3916 | Output: |
| 3917 | COND_EXPR - conditional expression. |
| 3918 | |
| 3919 | The returned COND_EXPR is the conditional expression to be used in the |
| 3920 | if statement that controls which version of the loop gets executed at |
| 3921 | runtime. */ |
| 3922 | |
| 3923 | static void |
| 3924 | vect_create_cond_for_niters_checks (loop_vec_info loop_vinfo, tree *cond_expr) |
| 3925 | { |
| 3926 | tree part_cond_expr = LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo); |
| 3927 | |
| 3928 | if (*cond_expr) |
| 3929 | *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node, |
| 3930 | *cond_expr, part_cond_expr); |
| 3931 | else |
| 3932 | *cond_expr = part_cond_expr; |
| 3933 | } |
| 3934 | |
| 3935 | /* Set *COND_EXPR to a tree that is true when both the original *COND_EXPR |
| 3936 | and PART_COND_EXPR are true. Treat a null *COND_EXPR as "true". */ |
| 3937 | |
| 3938 | static void |
| 3939 | chain_cond_expr (tree *cond_expr, tree part_cond_expr) |
| 3940 | { |
| 3941 | if (*cond_expr) |
| 3942 | *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node, |
| 3943 | *cond_expr, part_cond_expr); |
| 3944 | else |
| 3945 | *cond_expr = part_cond_expr; |
| 3946 | } |
| 3947 | |
/* Function vect_create_cond_for_align_checks.

   Create a conditional expression that represents the alignment checks for
   all of data references (array element references) whose alignment must be
   checked at runtime.

   Input:
   COND_EXPR - input conditional expression.  New conditions will be chained
	       with logical AND operation.
   LOOP_VINFO - three fields of the loop information are used.
	       LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
	       LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
	       LOOP_VINFO_ALLOW_MUTUAL_ALIGNMENT indicates which check applies.

   Output:
   COND_EXPR_STMT_LIST - statements needed to construct the conditional
			 expression.
   The returned value is the conditional expression to be used in the if
   statement that controls which version of the loop gets executed at runtime.

   Based on the boolean value of LOOP_VINFO_ALLOW_MUTUAL_ALIGNMENT, we decide
   which type of check should be applied and create two different expressions
   accordingly.
   1) When LOOP_VINFO_ALLOW_MUTUAL_ALIGNMENT is false, we see if all data refs
      to be checked are already aligned to an alignment boundary.  We create
      an expression of "(a_1 | a_2 | a_3 | ... | a_n) & mask", where "a_i" is
      the address of i'th data reference.
   2) When LOOP_VINFO_ALLOW_MUTUAL_ALIGNMENT is true, we see if all data refs
      can be aligned to a boundary after a certain amount of peeling, in other
      words, their addresses have the same bottom bits according to the mask.
      We create "((a_1 ^ a_2) | (a_2 ^ a_3) | ... | (a_n-1 ^ a_n)) & mask",
      where "a_i" is the address of i'th data reference.

   Both algorithms make two assumptions:
   1) The number of bytes "n" in a vector is a power of 2.
   2) An address "a" is aligned if a%n is zero and that this
      test can be done as a&(n-1) == 0.  For example, for 16
      byte vectors the test is a&0xf == 0.  */

static void
vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
                                   tree *cond_expr,
				   gimple_seq *cond_expr_stmt_list)
{
  const vec<stmt_vec_info> &may_misalign_stmts
    = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
  stmt_vec_info stmt_info;
  poly_uint64 mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
  tree mask_cst;
  unsigned int i;
  tree int_ptrsize_type;
  char tmp_name[30];
  /* Running OR of all terms built so far (addresses or XORs).  */
  tree or_tmp_name = NULL_TREE;
  /* Address of the previous data ref, used to form pairwise XORs in the
     mutual-alignment variant.  */
  tree prev_addr_tmp_name = NULL_TREE;
  tree and_tmp_name;
  gimple *and_stmt;
  tree ptrsize_zero;
  tree part_cond_expr;

  /* A zero mask would make the check trivially true; it must never be
     requested.  */
  gcc_assert (known_ne (mask, 0U));

  /* Addresses are manipulated as pointer-sized signed integers.  */
  int_ptrsize_type = signed_type_for (ptr_type_node);

  /* If LOOP_VINFO_ALLOW_MUTUAL_ALIGNMENT is true, we should have at least two
     datarefs to check the mutual alignment.  */
  gcc_assert (may_misalign_stmts.length () > 1
	      || !LOOP_VINFO_ALLOW_MUTUAL_ALIGNMENT (loop_vinfo));

  FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
    {
      gimple_seq new_stmt_list = NULL;
      tree addr_base;
      tree addr_tmp_name;
      tree xor_tmp_name;
      tree new_or_tmp_name;
      gimple *addr_stmt, *or_stmt, *xor_stmt;
      tree vectype = STMT_VINFO_VECTYPE (stmt_info);
      bool negative = tree_int_cst_compare
	(DR_STEP (STMT_VINFO_DATA_REF (stmt_info)), size_zero_node) < 0;
      /* For a negative step the first vector access starts at the far end
	 of the object, so offset the base back by (subparts - 1) elements
	 to get the address actually loaded first.  */
      tree offset = negative
	? size_int ((-TYPE_VECTOR_SUBPARTS (vectype) + 1)
		    * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))))
	: size_zero_node;

      /* create: addr_tmp = (int)(address_of_first_vector) */
      addr_base =
	vect_create_addr_base_for_vector_ref (loop_vinfo,
					      stmt_info, &new_stmt_list,
					      offset);
      if (new_stmt_list != NULL)
	gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);

      sprintf (s: tmp_name, format: "addr2int%d" , i);
      addr_tmp_name = make_temp_ssa_name (type: int_ptrsize_type, NULL, name: tmp_name);
      addr_stmt = gimple_build_assign (addr_tmp_name, NOP_EXPR, addr_base);
      gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);

      if (LOOP_VINFO_ALLOW_MUTUAL_ALIGNMENT (loop_vinfo))
	{
	  /* Create "((a_1 ^ a_2) | (a_2 ^ a_3) | ... | (a_n-1 ^ a_n)) & mask"
	     to check mutual alignment.  */
	  if (prev_addr_tmp_name != NULL_TREE)
	    {
	      /* XOR the current address with the previous one; low bits of
		 the result are zero iff the two share bottom bits.  */
	      sprintf (s: tmp_name, format: "xorptrs%d_%d" , i - 1, i);
	      xor_tmp_name = make_temp_ssa_name (type: int_ptrsize_type, NULL,
						 name: tmp_name);
	      xor_stmt = gimple_build_assign (xor_tmp_name, BIT_XOR_EXPR,
					      prev_addr_tmp_name,
					      addr_tmp_name);
	      gimple_seq_add_stmt (cond_expr_stmt_list, xor_stmt);
	      if (or_tmp_name == NULL_TREE)
		{
		  /* Create the 1st XOR when the 2nd data ref is seen.  */
		  or_tmp_name = xor_tmp_name;
		}
	      else
		{
		  /* Create: or_tmp = or_tmp | new_xor_tmp.  */
		  sprintf (s: tmp_name, format: "orxors%d" , i - 1);
		  new_or_tmp_name = make_temp_ssa_name (type: int_ptrsize_type, NULL,
							name: tmp_name);
		  or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
						 or_tmp_name, xor_tmp_name);
		  gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
		  or_tmp_name = new_or_tmp_name;
		}
	    }
	  prev_addr_tmp_name = addr_tmp_name;
	}
      else
	{
	  /* Create: "(a_1 | a_2 | a_3 | ... | a_n) & mask" to check if all
	     addresses are already aligned.  */
	  if (or_tmp_name != NULL_TREE)
	    {
	      /* Create: or_tmp = or_tmp | addr_tmp.  */
	      sprintf (s: tmp_name, format: "orptrs%d" , i);
	      new_or_tmp_name = make_temp_ssa_name (type: int_ptrsize_type, NULL,
						    name: tmp_name);
	      or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
					     or_tmp_name, addr_tmp_name);
	      gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
	      or_tmp_name = new_or_tmp_name;
	    }
	  else
	    /* First data ref seeds the accumulator.  */
	    or_tmp_name = addr_tmp_name;
	}

    } /* end for i */

  mask_cst = build_int_cst (int_ptrsize_type, mask);

  /* create: and_tmp = or_tmp & mask  */
  and_tmp_name = make_temp_ssa_name (type: int_ptrsize_type, NULL, name: "andmask" );

  and_stmt = gimple_build_assign (and_tmp_name, BIT_AND_EXPR,
				  or_tmp_name, mask_cst);
  gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);

  /* Make and_tmp the left operand of the conditional test against zero.
     if and_tmp has a nonzero bit then some address is unaligned.  */
  ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
  part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
				and_tmp_name, ptrsize_zero);
  chain_cond_expr (cond_expr, part_cond_expr);
}
| 4114 | |
| 4115 | /* Function vect_create_cond_for_vla_spec_read. |
| 4116 | |
| 4117 | Create a conditional expression that represents the run-time checks with |
| 4118 | max speculative read amount in VLA modes. We check two things: |
| 4119 | 1) if the max speculative read amount exceeds the min page size |
| 4120 | 2) if the VF is power-of-2 - done by checking the max read amount instead |
| 4121 | |
| 4122 | Input: |
| 4123 | COND_EXPR - input conditional expression. New conditions will be chained |
| 4124 | with logical AND operation. |
| 4125 | LOOP_VINFO - field LOOP_VINFO_MAX_SPEC_READ_AMOUNT contains the max |
| 4126 | possible speculative read amount in VLA modes. |
| 4127 | |
| 4128 | Output: |
| 4129 | COND_EXPR - conditional expression. |
| 4130 | |
| 4131 | The returned COND_EXPR is the conditional expression to be used in the |
| 4132 | if statement that controls which version of the loop gets executed at |
| 4133 | runtime. */ |
| 4134 | |
| 4135 | static void |
| 4136 | vect_create_cond_for_vla_spec_read (loop_vec_info loop_vinfo, tree *cond_expr) |
| 4137 | { |
| 4138 | poly_uint64 read_amount_poly = LOOP_VINFO_MAX_SPEC_READ_AMOUNT (loop_vinfo); |
| 4139 | tree amount = build_int_cst (long_unsigned_type_node, read_amount_poly); |
| 4140 | |
| 4141 | /* Both the read amount and the VF must be variants, and the read amount must |
| 4142 | be a constant power-of-2 multiple of the VF. */ |
| 4143 | unsigned HOST_WIDE_INT multiple; |
| 4144 | gcc_assert (!read_amount_poly.is_constant () |
| 4145 | && !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () |
| 4146 | && constant_multiple_p (read_amount_poly, |
| 4147 | LOOP_VINFO_VECT_FACTOR (loop_vinfo), |
| 4148 | &multiple) |
| 4149 | && pow2p_hwi (multiple)); |
| 4150 | |
| 4151 | tree cst_ul_zero = build_int_cstu (long_unsigned_type_node, 0U); |
| 4152 | tree cst_ul_one = build_int_cstu (long_unsigned_type_node, 1U); |
| 4153 | tree cst_ul_pagesize = build_int_cstu (long_unsigned_type_node, |
| 4154 | (unsigned long) param_min_pagesize); |
| 4155 | |
| 4156 | /* Create an expression of "amount & (amount - 1) == 0". */ |
| 4157 | tree amount_m1 = fold_build2 (MINUS_EXPR, long_unsigned_type_node, |
| 4158 | amount, cst_ul_one); |
| 4159 | tree amount_and_expr = fold_build2 (BIT_AND_EXPR, long_unsigned_type_node, |
| 4160 | amount, amount_m1); |
| 4161 | tree powof2_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node, |
| 4162 | amount_and_expr, cst_ul_zero); |
| 4163 | chain_cond_expr (cond_expr, part_cond_expr: powof2_cond_expr); |
| 4164 | |
| 4165 | /* Create an expression of "amount <= cst_ul_pagesize". */ |
| 4166 | tree pagesize_cond_expr = fold_build2 (LE_EXPR, boolean_type_node, |
| 4167 | amount, cst_ul_pagesize); |
| 4168 | chain_cond_expr (cond_expr, part_cond_expr: pagesize_cond_expr); |
| 4169 | } |
| 4170 | |
| 4171 | /* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>, |
| 4172 | create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn). |
| 4173 | Set *COND_EXPR to a tree that is true when both the original *COND_EXPR |
| 4174 | and this new condition are true. Treat a null *COND_EXPR as "true". */ |
| 4175 | |
| 4176 | static void |
| 4177 | vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr) |
| 4178 | { |
| 4179 | const vec<vec_object_pair> &pairs |
| 4180 | = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo); |
| 4181 | unsigned int i; |
| 4182 | vec_object_pair *pair; |
| 4183 | FOR_EACH_VEC_ELT (pairs, i, pair) |
| 4184 | { |
| 4185 | tree addr1 = build_fold_addr_expr (pair->first); |
| 4186 | tree addr2 = build_fold_addr_expr (pair->second); |
| 4187 | tree part_cond_expr = fold_build2 (NE_EXPR, boolean_type_node, |
| 4188 | addr1, addr2); |
| 4189 | chain_cond_expr (cond_expr, part_cond_expr); |
| 4190 | } |
| 4191 | } |
| 4192 | |
| 4193 | /* Create an expression that is true when all lower-bound conditions for |
| 4194 | the vectorized loop are met. Chain this condition with *COND_EXPR. */ |
| 4195 | |
| 4196 | static void |
| 4197 | vect_create_cond_for_lower_bounds (loop_vec_info loop_vinfo, tree *cond_expr) |
| 4198 | { |
| 4199 | const vec<vec_lower_bound> &lower_bounds |
| 4200 | = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo); |
| 4201 | for (unsigned int i = 0; i < lower_bounds.length (); ++i) |
| 4202 | { |
| 4203 | tree expr = lower_bounds[i].expr; |
| 4204 | tree type = unsigned_type_for (TREE_TYPE (expr)); |
| 4205 | expr = fold_convert (type, expr); |
| 4206 | poly_uint64 bound = lower_bounds[i].min_value; |
| 4207 | if (!lower_bounds[i].unsigned_p) |
| 4208 | { |
| 4209 | expr = fold_build2 (PLUS_EXPR, type, expr, |
| 4210 | build_int_cstu (type, bound - 1)); |
| 4211 | bound += bound - 1; |
| 4212 | } |
| 4213 | tree part_cond_expr = fold_build2 (GE_EXPR, boolean_type_node, expr, |
| 4214 | build_int_cstu (type, bound)); |
| 4215 | chain_cond_expr (cond_expr, part_cond_expr); |
| 4216 | } |
| 4217 | } |
| 4218 | |
| 4219 | /* Function vect_create_cond_for_alias_checks. |
| 4220 | |
| 4221 | Create a conditional expression that represents the run-time checks for |
| 4222 | overlapping of address ranges represented by a list of data references |
| 4223 | relations passed as input. |
| 4224 | |
| 4225 | Input: |
| 4226 | COND_EXPR - input conditional expression. New conditions will be chained |
| 4227 | with logical AND operation. If it is NULL, then the function |
| 4228 | is used to return the number of alias checks. |
| 4229 | LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs |
| 4230 | to be checked. |
| 4231 | |
| 4232 | Output: |
| 4233 | COND_EXPR - conditional expression. |
| 4234 | |
| 4235 | The returned COND_EXPR is the conditional expression to be used in the if |
| 4236 | statement that controls which version of the loop gets executed at runtime. |
| 4237 | */ |
| 4238 | |
| 4239 | void |
| 4240 | vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr) |
| 4241 | { |
| 4242 | const vec<dr_with_seg_len_pair_t> &comp_alias_ddrs = |
| 4243 | LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo); |
| 4244 | |
| 4245 | if (comp_alias_ddrs.is_empty ()) |
| 4246 | return; |
| 4247 | |
| 4248 | create_runtime_alias_checks (LOOP_VINFO_LOOP (loop_vinfo), |
| 4249 | &comp_alias_ddrs, cond_expr); |
| 4250 | if (dump_enabled_p ()) |
| 4251 | dump_printf_loc (MSG_NOTE, vect_location, |
| 4252 | "created %u versioning for alias checks.\n" , |
| 4253 | comp_alias_ddrs.length ()); |
| 4254 | } |
| 4255 | |
| 4256 | |
| 4257 | /* Function vect_loop_versioning. |
| 4258 | |
| 4259 | If the loop has data references that may or may not be aligned or/and |
| 4260 | has data reference relations whose independence was not proven then |
| 4261 | two versions of the loop need to be generated, one which is vectorized |
| 4262 | and one which isn't. A test is then generated to control which of the |
| 4263 | loops is executed. The test checks for the alignment of all of the |
| 4264 | data references that may or may not be aligned. An additional |
| 4265 | sequence of runtime tests is generated for each pairs of DDRs whose |
| 4266 | independence was not proven. The vectorized version of loop is |
| 4267 | executed only if both alias and alignment tests are passed. |
| 4268 | |
| 4269 | The test generated to check which version of loop is executed |
| 4270 | is modified to also check for profitability as indicated by the |
| 4271 | cost model threshold TH. |
| 4272 | |
| 4273 | The versioning precondition(s) are placed in *COND_EXPR and |
| 4274 | *COND_EXPR_STMT_LIST. */ |
| 4275 | |
| 4276 | class loop * |
| 4277 | vect_loop_versioning (loop_vec_info loop_vinfo, |
| 4278 | gimple *loop_vectorized_call) |
| 4279 | { |
| 4280 | class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop; |
| 4281 | class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo); |
| 4282 | basic_block condition_bb; |
| 4283 | gphi_iterator gsi; |
| 4284 | gimple_stmt_iterator cond_exp_gsi; |
| 4285 | basic_block merge_bb; |
| 4286 | basic_block new_exit_bb; |
| 4287 | edge new_exit_e, e; |
| 4288 | gphi *orig_phi, *new_phi; |
| 4289 | tree cond_expr = NULL_TREE; |
| 4290 | gimple_seq cond_expr_stmt_list = NULL; |
| 4291 | tree arg; |
| 4292 | profile_probability prob = profile_probability::likely (); |
| 4293 | gimple_seq gimplify_stmt_list = NULL; |
| 4294 | tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo); |
| 4295 | bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo); |
| 4296 | bool version_spec_read = LOOP_REQUIRES_VERSIONING_FOR_SPEC_READ (loop_vinfo); |
| 4297 | bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo); |
| 4298 | bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo); |
| 4299 | poly_uint64 versioning_threshold |
| 4300 | = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo); |
| 4301 | tree version_simd_if_cond |
| 4302 | = LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (loop_vinfo); |
| 4303 | unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo); |
| 4304 | bool uncounted_p = LOOP_VINFO_NITERS_UNCOUNTED_P (loop_vinfo); |
| 4305 | |
| 4306 | if (!uncounted_p && vect_apply_runtime_profitability_check_p (loop_vinfo) |
| 4307 | && !ordered_p (a: th, b: versioning_threshold)) |
| 4308 | cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters, |
| 4309 | build_int_cst (TREE_TYPE (scalar_loop_iters), |
| 4310 | th - 1)); |
| 4311 | if (!uncounted_p && maybe_ne (a: versioning_threshold, b: 0U)) |
| 4312 | { |
| 4313 | tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters, |
| 4314 | build_int_cst (TREE_TYPE (scalar_loop_iters), |
| 4315 | versioning_threshold - 1)); |
| 4316 | if (cond_expr) |
| 4317 | cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node, |
| 4318 | expr, cond_expr); |
| 4319 | else |
| 4320 | cond_expr = expr; |
| 4321 | } |
| 4322 | |
| 4323 | tree cost_name = NULL_TREE; |
| 4324 | profile_probability prob2 = profile_probability::always (); |
| 4325 | if (cond_expr |
| 4326 | && EXPR_P (cond_expr) |
| 4327 | && (version_niter |
| 4328 | || version_align |
| 4329 | || version_alias |
| 4330 | || version_simd_if_cond)) |
| 4331 | { |
| 4332 | cost_name = cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr), |
| 4333 | &cond_expr_stmt_list, |
| 4334 | is_gimple_val, NULL_TREE); |
| 4335 | /* Split prob () into two so that the overall probability of passing |
| 4336 | both the cost-model and versioning checks is the orig prob. */ |
| 4337 | prob2 = prob = prob.sqrt (); |
| 4338 | } |
| 4339 | |
| 4340 | if (version_niter) |
| 4341 | vect_create_cond_for_niters_checks (loop_vinfo, cond_expr: &cond_expr); |
| 4342 | |
| 4343 | if (cond_expr) |
| 4344 | { |
| 4345 | gimple_seq tem = NULL; |
| 4346 | cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr), |
| 4347 | &tem, is_gimple_condexpr_for_cond, |
| 4348 | NULL_TREE); |
| 4349 | gimple_seq_add_seq (&cond_expr_stmt_list, tem); |
| 4350 | } |
| 4351 | |
| 4352 | if (version_align) |
| 4353 | vect_create_cond_for_align_checks (loop_vinfo, cond_expr: &cond_expr, |
| 4354 | cond_expr_stmt_list: &cond_expr_stmt_list); |
| 4355 | |
| 4356 | if (version_spec_read) |
| 4357 | vect_create_cond_for_vla_spec_read (loop_vinfo, cond_expr: &cond_expr); |
| 4358 | |
| 4359 | if (version_alias) |
| 4360 | { |
| 4361 | vect_create_cond_for_unequal_addrs (loop_vinfo, cond_expr: &cond_expr); |
| 4362 | vect_create_cond_for_lower_bounds (loop_vinfo, cond_expr: &cond_expr); |
| 4363 | vect_create_cond_for_alias_checks (loop_vinfo, cond_expr: &cond_expr); |
| 4364 | } |
| 4365 | |
| 4366 | if (version_simd_if_cond) |
| 4367 | { |
| 4368 | gcc_assert (dom_info_available_p (CDI_DOMINATORS)); |
| 4369 | if (flag_checking) |
| 4370 | if (basic_block bb |
| 4371 | = gimple_bb (SSA_NAME_DEF_STMT (version_simd_if_cond))) |
| 4372 | gcc_assert (bb != loop->header |
| 4373 | && dominated_by_p (CDI_DOMINATORS, loop->header, bb) |
| 4374 | && (scalar_loop == NULL |
| 4375 | || (bb != scalar_loop->header |
| 4376 | && dominated_by_p (CDI_DOMINATORS, |
| 4377 | scalar_loop->header, bb)))); |
| 4378 | tree zero = build_zero_cst (TREE_TYPE (version_simd_if_cond)); |
| 4379 | tree c = fold_build2 (NE_EXPR, boolean_type_node, |
| 4380 | version_simd_if_cond, zero); |
| 4381 | if (cond_expr) |
| 4382 | cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node, |
| 4383 | c, cond_expr); |
| 4384 | else |
| 4385 | cond_expr = c; |
| 4386 | if (dump_enabled_p ()) |
| 4387 | dump_printf_loc (MSG_NOTE, vect_location, |
| 4388 | "created versioning for simd if condition check.\n" ); |
| 4389 | } |
| 4390 | |
| 4391 | cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr), |
| 4392 | &gimplify_stmt_list, |
| 4393 | is_gimple_condexpr_for_cond, NULL_TREE); |
| 4394 | gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list); |
| 4395 | |
| 4396 | /* Compute the outermost loop cond_expr and cond_expr_stmt_list are |
| 4397 | invariant in. */ |
| 4398 | class loop *outermost = outermost_invariant_loop_for_expr (loop, cond_expr); |
| 4399 | for (gimple_stmt_iterator gsi = gsi_start (seq&: cond_expr_stmt_list); |
| 4400 | !gsi_end_p (i: gsi); gsi_next (i: &gsi)) |
| 4401 | { |
| 4402 | gimple *stmt = gsi_stmt (i: gsi); |
| 4403 | update_stmt (s: stmt); |
| 4404 | ssa_op_iter iter; |
| 4405 | use_operand_p use_p; |
| 4406 | basic_block def_bb; |
| 4407 | FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_USE) |
| 4408 | if ((def_bb = gimple_bb (SSA_NAME_DEF_STMT (USE_FROM_PTR (use_p)))) |
| 4409 | && flow_bb_inside_loop_p (outermost, def_bb)) |
| 4410 | outermost = superloop_at_depth (loop, bb_loop_depth (def_bb) + 1); |
| 4411 | } |
| 4412 | |
| 4413 | /* Search for the outermost loop we can version. Avoid versioning of |
| 4414 | non-perfect nests but allow if-conversion versioned loops inside. */ |
| 4415 | class loop *loop_to_version = loop; |
| 4416 | if (flow_loop_nested_p (outermost, loop)) |
| 4417 | { |
| 4418 | if (dump_enabled_p ()) |
| 4419 | dump_printf_loc (MSG_NOTE, vect_location, |
| 4420 | "trying to apply versioning to outer loop %d\n" , |
| 4421 | outermost->num); |
| 4422 | if (outermost->num == 0) |
| 4423 | outermost = superloop_at_depth (loop, 1); |
| 4424 | /* And avoid applying versioning on non-perfect nests. */ |
| 4425 | while (loop_to_version != outermost |
| 4426 | && (e = single_exit (loop_outer (loop: loop_to_version))) |
| 4427 | && !(e->flags & EDGE_COMPLEX) |
| 4428 | && (!loop_outer (loop: loop_to_version)->inner->next |
| 4429 | || vect_loop_vectorized_call (loop_to_version)) |
| 4430 | && (!loop_outer (loop: loop_to_version)->inner->next |
| 4431 | || !loop_outer (loop: loop_to_version)->inner->next->next)) |
| 4432 | loop_to_version = loop_outer (loop: loop_to_version); |
| 4433 | } |
| 4434 | |
| 4435 | /* Apply versioning. If there is already a scalar version created by |
| 4436 | if-conversion re-use that. Note we cannot re-use the copy of |
| 4437 | an if-converted outer-loop when vectorizing the inner loop only. */ |
| 4438 | gcond *cond; |
| 4439 | if ((!loop_to_version->inner || loop == loop_to_version) |
| 4440 | && loop_vectorized_call) |
| 4441 | { |
| 4442 | gcc_assert (scalar_loop); |
| 4443 | condition_bb = gimple_bb (g: loop_vectorized_call); |
| 4444 | cond = as_a <gcond *> (p: *gsi_last_bb (bb: condition_bb)); |
| 4445 | gimple_cond_set_condition_from_tree (cond, cond_expr); |
| 4446 | update_stmt (s: cond); |
| 4447 | |
| 4448 | if (cond_expr_stmt_list) |
| 4449 | { |
| 4450 | cond_exp_gsi = gsi_for_stmt (loop_vectorized_call); |
| 4451 | gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list, |
| 4452 | GSI_SAME_STMT); |
| 4453 | } |
| 4454 | |
| 4455 | /* if-conversion uses profile_probability::always () for both paths, |
| 4456 | reset the paths probabilities appropriately. */ |
| 4457 | edge te, fe; |
| 4458 | extract_true_false_edges_from_block (condition_bb, &te, &fe); |
| 4459 | te->probability = prob; |
| 4460 | fe->probability = prob.invert (); |
| 4461 | /* We can scale loops counts immediately but have to postpone |
| 4462 | scaling the scalar loop because we re-use it during peeling. |
| 4463 | |
| 4464 | Ifcvt duplicates loop preheader, loop body and produces an basic |
| 4465 | block after loop exit. We need to scale all that. */ |
| 4466 | basic_block = loop_preheader_edge (loop_to_version)->src; |
| 4467 | preheader->count = preheader->count.apply_probability (prob: prob * prob2); |
| 4468 | scale_loop_frequencies (loop_to_version, prob * prob2); |
| 4469 | /* When the loop has multiple exits then we can only version itself. |
| 4470 | This is denoted by loop_to_version == loop. In this case we can |
| 4471 | do the versioning by selecting the exit edge the vectorizer is |
| 4472 | currently using. */ |
| 4473 | edge exit_edge; |
| 4474 | if (loop_to_version == loop) |
| 4475 | exit_edge = LOOP_VINFO_MAIN_EXIT (loop_vinfo); |
| 4476 | else |
| 4477 | exit_edge = single_exit (loop_to_version); |
| 4478 | exit_edge->dest->count = preheader->count; |
| 4479 | LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo) = (prob * prob2).invert (); |
| 4480 | |
| 4481 | nloop = scalar_loop; |
| 4482 | if (dump_enabled_p ()) |
| 4483 | dump_printf_loc (MSG_NOTE, vect_location, |
| 4484 | "reusing %sloop version created by if conversion\n" , |
| 4485 | loop_to_version != loop ? "outer " : "" ); |
| 4486 | } |
| 4487 | else |
| 4488 | { |
| 4489 | if (loop_to_version != loop |
| 4490 | && dump_enabled_p ()) |
| 4491 | dump_printf_loc (MSG_NOTE, vect_location, |
| 4492 | "applying loop versioning to outer loop %d\n" , |
| 4493 | loop_to_version->num); |
| 4494 | |
| 4495 | unsigned orig_pe_idx = loop_preheader_edge (loop)->dest_idx; |
| 4496 | |
| 4497 | initialize_original_copy_tables (); |
| 4498 | nloop = loop_version (loop_to_version, cond_expr, &condition_bb, |
| 4499 | prob * prob2, (prob * prob2).invert (), |
| 4500 | prob * prob2, (prob * prob2).invert (), |
| 4501 | true); |
| 4502 | |
| 4503 | /* If the PHI nodes in the loop header were reallocated, we need to fix up |
| 4504 | our internally stashed copies of those. */ |
| 4505 | if (loop_to_version == loop) |
| 4506 | for (auto gsi = gsi_start_phis (loop->header); |
| 4507 | !gsi_end_p (i: gsi); gsi_next (i: &gsi)) |
| 4508 | loop_vinfo->resync_stmt_addr (gsi.phi ()); |
| 4509 | |
| 4510 | /* We will later insert second conditional so overall outcome of |
| 4511 | both is prob * prob2. */ |
| 4512 | edge true_e, false_e; |
| 4513 | extract_true_false_edges_from_block (condition_bb, &true_e, &false_e); |
| 4514 | true_e->probability = prob; |
| 4515 | false_e->probability = prob.invert (); |
| 4516 | gcc_assert (nloop); |
| 4517 | nloop = get_loop_copy (loop); |
| 4518 | |
| 4519 | /* Assign hierarchical discriminators to distinguish loop versions. |
| 4520 | Only assign to the scalar version here; the vectorized version will |
| 4521 | get discriminators later during transformation/peeling. |
| 4522 | Use dynamic copy_id allocation instead of hardcoded constants. */ |
| 4523 | gimple *nloop_last = last_nondebug_stmt (nloop->header); |
| 4524 | location_t nloop_loc |
| 4525 | = nloop_last ? gimple_location (g: nloop_last) : UNKNOWN_LOCATION; |
| 4526 | if (nloop_loc != UNKNOWN_LOCATION) |
| 4527 | { |
| 4528 | unsigned int nloop_copyid = allocate_copyid_base (loc: nloop_loc, stride: 1); |
| 4529 | assign_discriminators_to_loop (loop: nloop, multiplicity_factor: 0, copyid: nloop_copyid); |
| 4530 | } |
| 4531 | /* For cycle vectorization with SLP we rely on the PHI arguments |
| 4532 | appearing in the same order as the SLP node operands which for the |
| 4533 | loop PHI nodes means the preheader edge dest index needs to remain |
| 4534 | the same for the analyzed loop which also becomes the vectorized one. |
| 4535 | Make it so in case the state after versioning differs by redirecting |
| 4536 | the first edge into the header to the same destination which moves |
| 4537 | it last. */ |
| 4538 | if (loop_preheader_edge (loop)->dest_idx != orig_pe_idx) |
| 4539 | { |
| 4540 | edge e = EDGE_PRED (loop->header, 0); |
| 4541 | ssa_redirect_edge (e, e->dest); |
| 4542 | flush_pending_stmts (e); |
| 4543 | } |
| 4544 | gcc_assert (loop_preheader_edge (loop)->dest_idx == orig_pe_idx); |
| 4545 | |
| 4546 | /* Kill off IFN_LOOP_VECTORIZED_CALL in the copy, nobody will |
| 4547 | reap those otherwise; they also refer to the original |
| 4548 | loops. */ |
| 4549 | class loop *l = loop; |
| 4550 | while (gimple *call = vect_loop_vectorized_call (l)) |
| 4551 | { |
| 4552 | call = SSA_NAME_DEF_STMT (get_current_def (gimple_call_lhs (call))); |
| 4553 | fold_loop_internal_call (call, boolean_false_node); |
| 4554 | l = loop_outer (loop: l); |
| 4555 | } |
| 4556 | free_original_copy_tables (); |
| 4557 | |
| 4558 | if (cond_expr_stmt_list) |
| 4559 | { |
| 4560 | cond_exp_gsi = gsi_last_bb (bb: condition_bb); |
| 4561 | gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list, |
| 4562 | GSI_SAME_STMT); |
| 4563 | } |
| 4564 | |
| 4565 | /* Loop versioning violates an assumption we try to maintain during |
| 4566 | vectorization - that the loop exit block has a single predecessor. |
| 4567 | After versioning, the exit block of both loop versions is the same |
| 4568 | basic block (i.e. it has two predecessors). Just in order to simplify |
| 4569 | following transformations in the vectorizer, we fix this situation |
| 4570 | here by adding a new (empty) block on the exit-edge of the loop, |
| 4571 | with the proper loop-exit phis to maintain loop-closed-form. |
| 4572 | If loop versioning wasn't done from loop, but scalar_loop instead, |
| 4573 | merge_bb will have already just a single successor. */ |
| 4574 | |
| 4575 | /* When the loop has multiple exits then we can only version itself. |
| 4576 | This is denoted by loop_to_version == loop. In this case we can |
| 4577 | do the versioning by selecting the exit edge the vectorizer is |
| 4578 | currently using. */ |
| 4579 | edge exit_edge; |
| 4580 | if (loop_to_version == loop) |
| 4581 | exit_edge = LOOP_VINFO_MAIN_EXIT (loop_vinfo); |
| 4582 | else |
| 4583 | exit_edge = single_exit (loop_to_version); |
| 4584 | |
| 4585 | gcc_assert (exit_edge); |
| 4586 | merge_bb = exit_edge->dest; |
| 4587 | if (EDGE_COUNT (merge_bb->preds) >= 2) |
| 4588 | { |
| 4589 | gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2); |
| 4590 | new_exit_bb = split_edge (exit_edge); |
| 4591 | new_exit_e = exit_edge; |
| 4592 | e = EDGE_SUCC (new_exit_bb, 0); |
| 4593 | |
| 4594 | for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (i: gsi); |
| 4595 | gsi_next (i: &gsi)) |
| 4596 | { |
| 4597 | tree new_res; |
| 4598 | orig_phi = gsi.phi (); |
| 4599 | new_res = copy_ssa_name (PHI_RESULT (orig_phi)); |
| 4600 | new_phi = create_phi_node (new_res, new_exit_bb); |
| 4601 | arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e); |
| 4602 | add_phi_arg (new_phi, arg, new_exit_e, |
| 4603 | gimple_phi_arg_location_from_edge (phi: orig_phi, e)); |
| 4604 | adjust_phi_and_debug_stmts (update_phi: orig_phi, e, PHI_RESULT (new_phi)); |
| 4605 | } |
| 4606 | } |
| 4607 | |
| 4608 | update_ssa (TODO_update_ssa_no_phi); |
| 4609 | } |
| 4610 | |
| 4611 | /* Split the cost model check off to a separate BB. Costing assumes |
| 4612 | this is the only thing we perform when we enter the scalar loop |
| 4613 | from a failed cost decision. */ |
| 4614 | if (cost_name && TREE_CODE (cost_name) == SSA_NAME) |
| 4615 | { |
| 4616 | gimple *def = SSA_NAME_DEF_STMT (cost_name); |
| 4617 | gcc_assert (gimple_bb (def) == condition_bb); |
| 4618 | /* All uses of the cost check are 'true' after the check we |
| 4619 | are going to insert. */ |
| 4620 | replace_uses_by (cost_name, boolean_true_node); |
| 4621 | /* And we're going to build the new single use of it. */ |
| 4622 | gcond *cond = gimple_build_cond (NE_EXPR, cost_name, boolean_false_node, |
| 4623 | NULL_TREE, NULL_TREE); |
| 4624 | edge e = split_block (gimple_bb (g: def), def); |
| 4625 | gimple_stmt_iterator gsi = gsi_for_stmt (def); |
| 4626 | gsi_insert_after (&gsi, cond, GSI_NEW_STMT); |
| 4627 | edge true_e, false_e; |
| 4628 | extract_true_false_edges_from_block (e->dest, &true_e, &false_e); |
| 4629 | e->flags &= ~EDGE_FALLTHRU; |
| 4630 | e->flags |= EDGE_TRUE_VALUE; |
| 4631 | edge e2 = make_edge (e->src, false_e->dest, EDGE_FALSE_VALUE); |
| 4632 | e->probability = prob2; |
| 4633 | e2->probability = prob2.invert (); |
| 4634 | e->dest->count = e->count (); |
| 4635 | set_immediate_dominator (CDI_DOMINATORS, false_e->dest, e->src); |
| 4636 | auto_vec<basic_block, 3> adj; |
| 4637 | for (basic_block son = first_dom_son (CDI_DOMINATORS, e->dest); |
| 4638 | son; |
| 4639 | son = next_dom_son (CDI_DOMINATORS, son)) |
| 4640 | if (EDGE_COUNT (son->preds) > 1) |
| 4641 | adj.safe_push (obj: son); |
| 4642 | for (auto son : adj) |
| 4643 | set_immediate_dominator (CDI_DOMINATORS, son, e->src); |
| 4644 | //debug_bb (condition_bb); |
| 4645 | //debug_bb (e->src); |
| 4646 | } |
| 4647 | |
| 4648 | if (version_niter) |
| 4649 | { |
| 4650 | /* The versioned loop could be infinite, we need to clear existing |
| 4651 | niter information which is copied from the original loop. */ |
| 4652 | gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE)); |
| 4653 | vect_free_loop_info_assumptions (nloop); |
| 4654 | } |
| 4655 | |
| 4656 | if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION |
| 4657 | && dump_enabled_p ()) |
| 4658 | { |
| 4659 | if (version_alias) |
| 4660 | dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING, |
| 4661 | vect_location, |
| 4662 | "loop versioned for vectorization because of " |
| 4663 | "possible aliasing\n" ); |
| 4664 | if (version_align) |
| 4665 | dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING, |
| 4666 | vect_location, |
| 4667 | "loop versioned for vectorization to enhance " |
| 4668 | "alignment\n" ); |
| 4669 | |
| 4670 | } |
| 4671 | |
| 4672 | return nloop; |
| 4673 | } |
| 4674 | |