1/* Vectorizer Specific Loop Manipulations
2 Copyright (C) 2003-2026 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6This file is part of GCC.
7
8GCC is free software; you can redistribute it and/or modify it under
9the terms of the GNU General Public License as published by the Free
10Software Foundation; either version 3, or (at your option) any later
11version.
12
13GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14WARRANTY; without even the implied warranty of MERCHANTABILITY or
15FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16for more details.
17
18You should have received a copy of the GNU General Public License
19along with GCC; see the file COPYING3. If not see
20<http://www.gnu.org/licenses/>. */
21
22#include "config.h"
23#include "system.h"
24#include "coretypes.h"
25#include "backend.h"
26#include "tree.h"
27#include "gimple.h"
28#include "cfghooks.h"
29#include "tree-pass.h"
30#include "ssa.h"
31#include "fold-const.h"
32#include "cfganal.h"
33#include "gimplify.h"
34#include "gimple-iterator.h"
35#include "gimplify-me.h"
36#include "tree-cfg.h"
37#include "tree-ssa-loop-manip.h"
38#include "tree-into-ssa.h"
39#include "tree-ssa.h"
40#include "cfgloop.h"
41#include "tree-scalar-evolution.h"
42#include "tree-vectorizer.h"
43#include "tree-ssa-loop-ivopts.h"
44#include "gimple-fold.h"
45#include "tree-ssa-loop-niter.h"
46#include "internal-fn.h"
47#include "stor-layout.h"
48#include "optabs-query.h"
49#include "vec-perm-indices.h"
50#include "insn-config.h"
51#include "rtl.h"
52#include "recog.h"
53#include "langhooks.h"
54#include "tree-vector-builder.h"
55#include "optabs-tree.h"
56#include "hierarchical_discriminator.h"
57
58
59/*************************************************************************
60 Simple Loop Peeling Utilities
61
62 Utilities to support loop peeling for vectorization purposes.
63 *************************************************************************/
64
65
66/* Renames the use *OP_P. */
67
68static void
69rename_use_op (use_operand_p op_p)
70{
71 tree new_name;
72
73 if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
74 return;
75
76 new_name = get_current_def (USE_FROM_PTR (op_p));
77
78 /* Something defined outside of the loop. */
79 if (!new_name)
80 return;
81
82 /* An ordinary ssa name defined in the loop. */
83
84 SET_USE (op_p, new_name);
85}
86
87
88/* Renames the variables in basic block BB. Allow renaming of PHI arguments
89 on edges incoming from outer-block header if RENAME_FROM_OUTER_LOOP is
90 true. */
91
static void
rename_variables_in_bb (basic_block bb, bool rename_from_outer_loop)
{
  gimple *stmt;
  use_operand_p use_p;
  ssa_op_iter iter;
  edge e;
  edge_iterator ei;
  class loop *loop = bb->loop_father;
  class loop *outer_loop = NULL;

  if (rename_from_outer_loop)
    {
      gcc_assert (loop);
      outer_loop = loop_outer (loop);
    }

  /* Rename every SSA use in the statements of BB.  */
  for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (i: gsi);
       gsi_next (i: &gsi))
    {
      stmt = gsi_stmt (i: gsi);
      FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
	rename_use_op (op_p: use_p);
    }

  /* Rename the PHI arguments of BB.  Arguments on edges coming from
     outside LOOP are only renamed when RENAME_FROM_OUTER_LOOP is set and
     the edge comes (possibly through one dispatch block) from the outer
     loop's header.  */
  FOR_EACH_EDGE (e, ei, bb->preds)
    {
      if (!flow_bb_inside_loop_p (loop, e->src))
	{
	  if (!rename_from_outer_loop)
	    continue;
	  if (e->src != outer_loop->header)
	    {
	      if (outer_loop->inner->next)
		{
		  /* If outer_loop has 2 inner loops, allow there to
		     be an extra basic block which decides which of the
		     two loops to use using LOOP_VECTORIZED.  */
		  if (!single_pred_p (bb: e->src)
		      || single_pred (bb: e->src) != outer_loop->header)
		    continue;
		}
	    }
	}
      for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (i: gsi);
	   gsi_next (i: &gsi))
	rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
    }
}
141
142
/* One deferred debug-stmt substitution: replace references to FROM with
   TO in debug binds dominated by BB (see adjust_debug_stmts_now).  */

struct adjust_info
{
  /* The old SSA value and its loop-closed replacement.  */
  tree from, to;
  /* The block (typically containing the loop-closed PHI) that must
     dominate a debug use for the substitution to apply.  */
  basic_block bb;
};

/* A stack of values to be adjusted in debug stmts.  We have to
   process them LIFO, so that the closest substitution applies.  If we
   processed them FIFO, without the stack, we might substitute uses
   with a PHI DEF that would soon become non-dominant, and when we got
   to the suitable one, it wouldn't have anything to substitute any
   more.  */
static vec<adjust_info, va_heap> adjust_vec;
156
157/* Adjust any debug stmts that referenced AI->from values to use the
158 loop-closed AI->to, if the references are dominated by AI->bb and
159 not by the definition of AI->from. */
160
static void
adjust_debug_stmts_now (adjust_info *ai)
{
  basic_block bbphi = ai->bb;
  tree orig_def = ai->from;
  tree new_def = ai->to;
  imm_use_iterator imm_iter;
  gimple *stmt;
  basic_block bbdef = gimple_bb (SSA_NAME_DEF_STMT (orig_def));

  /* Dominance information is required to decide which uses to touch.  */
  gcc_assert (dom_info_available_p (CDI_DOMINATORS));

  /* Adjust any debug stmts that held onto non-loop-closed
     references.  */
  FOR_EACH_IMM_USE_STMT (stmt, imm_iter, orig_def)
    {
      use_operand_p use_p;
      basic_block bbuse;

      if (!is_gimple_debug (gs: stmt))
	continue;

      gcc_assert (gimple_debug_bind_p (stmt));

      bbuse = gimple_bb (g: stmt);

      /* Only touch uses dominated by AI->bb and NOT dominated by the
	 original definition; uses dominated by the definition are still
	 valid as-is.  */
      if ((bbuse == bbphi
	   || dominated_by_p (CDI_DOMINATORS, bbuse, bbphi))
	  && !(bbuse == bbdef
	       || dominated_by_p (CDI_DOMINATORS, bbuse, bbdef)))
	{
	  if (new_def)
	    FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
	      SET_USE (use_p, new_def);
	  else
	    {
	      /* No replacement available: reset the bind so the bound
		 value becomes unknown rather than wrong.  */
	      gimple_debug_bind_reset_value (dbg: stmt);
	      update_stmt (s: stmt);
	    }
	}
    }
}
203
204/* Adjust debug stmts as scheduled before. */
205
206static void
207adjust_vec_debug_stmts (void)
208{
209 if (!MAY_HAVE_DEBUG_BIND_STMTS)
210 return;
211
212 gcc_assert (adjust_vec.exists ());
213
214 while (!adjust_vec.is_empty ())
215 {
216 adjust_debug_stmts_now (ai: &adjust_vec.last ());
217 adjust_vec.pop ();
218 }
219}
220
221/* Adjust any debug stmts that referenced FROM values to use the
222 loop-closed TO, if the references are dominated by BB and not by
223 the definition of FROM. If adjust_vec is non-NULL, adjustments
224 will be postponed until adjust_vec_debug_stmts is called. */
225
226static void
227adjust_debug_stmts (tree from, tree to, basic_block bb)
228{
229 adjust_info ai;
230
231 if (MAY_HAVE_DEBUG_BIND_STMTS
232 && TREE_CODE (from) == SSA_NAME
233 && ! SSA_NAME_IS_DEFAULT_DEF (from)
234 && ! virtual_operand_p (op: from))
235 {
236 ai.from = from;
237 ai.to = to;
238 ai.bb = bb;
239
240 if (adjust_vec.exists ())
241 adjust_vec.safe_push (obj: ai);
242 else
243 adjust_debug_stmts_now (ai: &ai);
244 }
245}
246
247/* Change E's phi arg in UPDATE_PHI to NEW_DEF, and record information
248 to adjust any debug stmts that referenced the old phi arg,
249 presumably non-loop-closed references left over from other
250 transformations. */
251
252static void
253adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
254{
255 tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e);
256
257 gcc_assert (TREE_CODE (orig_def) != SSA_NAME
258 || orig_def != new_def);
259
260 SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);
261
262 if (MAY_HAVE_DEBUG_BIND_STMTS)
263 adjust_debug_stmts (from: orig_def, PHI_RESULT (update_phi),
264 bb: gimple_bb (g: update_phi));
265}
266
267/* Define one loop rgroup control CTRL from loop LOOP. INIT_CTRL is the value
268 that the control should have during the first iteration and NEXT_CTRL is the
269 value that it should have on subsequent iterations. */
270
271static void
272vect_set_loop_control (class loop *loop, tree ctrl, tree init_ctrl,
273 tree next_ctrl)
274{
275 gphi *phi = create_phi_node (ctrl, loop->header);
276 add_phi_arg (phi, init_ctrl, loop_preheader_edge (loop), UNKNOWN_LOCATION);
277 add_phi_arg (phi, next_ctrl, loop_latch_edge (loop), UNKNOWN_LOCATION);
278}
279
280/* Add SEQ to the end of LOOP's preheader block. */
281
282static void
283add_preheader_seq (class loop *loop, gimple_seq seq)
284{
285 if (seq)
286 {
287 edge pe = loop_preheader_edge (loop);
288 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
289 gcc_assert (!new_bb);
290 }
291}
292
293/* Add SEQ to the beginning of LOOP's header block. */
294
295static void
296add_header_seq (class loop *loop, gimple_seq seq)
297{
298 if (seq)
299 {
300 gimple_stmt_iterator gsi = gsi_after_labels (bb: loop->header);
301 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
302 }
303}
304
305/* Return true if the target can interleave elements of two vectors.
306 OFFSET is 0 if the first half of the vectors should be interleaved
307 or 1 if the second half should. When returning true, store the
308 associated permutation in INDICES. */
309
static bool
interleave_supported_p (vec_perm_indices *indices, tree vectype,
			unsigned int offset)
{
  poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (node: vectype);
  /* BASE selects the half being interleaved: the first half for
     OFFSET == 0 and the second half for OFFSET == 1.  */
  poly_uint64 base = exact_div (a: nelts, b: 2) * offset;
  /* Encode the selector as 2 interleaved patterns of 3 elements each:
     { base, base + nelts, base + 1, base + 1 + nelts, ... }.  */
  vec_perm_builder sel (nelts, 2, 3);
  for (unsigned int i = 0; i < 3; ++i)
    {
      sel.quick_push (obj: base + i);
      sel.quick_push (obj: base + i + nelts);
    }
  indices->new_vector (sel, 2, nelts);
  /* Usable only if the target supports this constant permutation on
     VECTYPE's mode.  */
  return can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
			       *indices);
}
326
327/* Try to use permutes to define the masks in DEST_RGM using the masks
328 in SRC_RGM, given that the former has twice as many masks as the
329 latter. Return true on success, adding any new statements to SEQ. */
330
static bool
vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm,
			       rgroup_controls *src_rgm)
{
  tree src_masktype = src_rgm->type;
  tree dest_masktype = dest_rgm->type;
  machine_mode src_mode = TYPE_MODE (src_masktype);
  insn_code icode1, icode2;
  /* First choice: widen the source masks with unsigned unpacks, if the
     target supports both the hi and lo variant.  */
  if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
      && (icode1 = optab_handler (op: vec_unpacku_hi_optab,
				  mode: src_mode)) != CODE_FOR_nothing
      && (icode2 = optab_handler (op: vec_unpacku_lo_optab,
				  mode: src_mode)) != CODE_FOR_nothing)
    {
      /* Unpacking the source masks gives at least as many mask bits as
	 we need.  We can then VIEW_CONVERT any excess bits away.  */
      machine_mode dest_mode = insn_data[icode1].operand[0].mode;
      gcc_assert (dest_mode == insn_data[icode2].operand[0].mode);
      tree unpack_masktype = vect_halve_mask_nunits (src_masktype, dest_mode);
      for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
	{
	  /* Each source mask feeds two destination masks, alternating
	     between the lo and hi unpack; which comes first depends on
	     endianness.  */
	  tree src = src_rgm->controls[i / 2];
	  tree dest = dest_rgm->controls[i];
	  tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1)
			    ? VEC_UNPACK_HI_EXPR
			    : VEC_UNPACK_LO_EXPR);
	  gassign *stmt;
	  if (dest_masktype == unpack_masktype)
	    stmt = gimple_build_assign (dest, code, src);
	  else
	    {
	      /* Unpack into a temporary first, then view-convert to the
		 destination mask type.  */
	      tree temp = make_ssa_name (var: unpack_masktype);
	      stmt = gimple_build_assign (temp, code, src);
	      gimple_seq_add_stmt (seq, stmt);
	      stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
					  build1 (VIEW_CONVERT_EXPR,
						  dest_masktype, temp));
	    }
	  gimple_seq_add_stmt (seq, stmt);
	}
      return true;
    }
  /* Second choice: same-type interleaving permutes.  */
  vec_perm_indices indices[2];
  if (dest_masktype == src_masktype
      && interleave_supported_p (indices: &indices[0], vectype: src_masktype, offset: 0)
      && interleave_supported_p (indices: &indices[1], vectype: src_masktype, offset: 1))
    {
      /* The destination requires twice as many mask bits as the source, so
	 we can use interleaving permutes to double up the number of bits.  */
      tree masks[2];
      for (unsigned int i = 0; i < 2; ++i)
	masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
      for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
	{
	  tree src = src_rgm->controls[i / 2];
	  tree dest = dest_rgm->controls[i];
	  gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
					      src, src, masks[i & 1]);
	  gimple_seq_add_stmt (seq, stmt);
	}
      return true;
    }
  return false;
}
395
396/* Populate DEST_RGM->controls, given that they should add up to STEP.
397
398 STEP = MIN_EXPR <ivtmp_34, VF>;
399
400 First length (MIN (X, VF/N)):
401 loop_len_15 = MIN_EXPR <STEP, VF/N>;
402
403 Second length:
404 tmp = STEP - loop_len_15;
405 loop_len_16 = MIN (tmp, VF/N);
406
407 Third length:
408 tmp2 = tmp - loop_len_16;
409 loop_len_17 = MIN (tmp2, VF/N);
410
411 Last length:
412 loop_len_18 = tmp2 - loop_len_17;
413*/
414
static void
vect_adjust_loop_lens_control (tree iv_type, gimple_seq *seq,
			       rgroup_controls *dest_rgm, tree step)
{
  tree ctrl_type = dest_rgm->type;
  /* Maximum number of scalar items one control can cover.  */
  poly_uint64 nitems_per_ctrl
    = TYPE_VECTOR_SUBPARTS (node: ctrl_type) * dest_rgm->factor;
  tree length_limit = build_int_cst (iv_type, nitems_per_ctrl);

  /* STEP is progressively reduced by the lengths already consumed by
     earlier controls; the final control takes whatever remains.  */
  for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
    {
      tree ctrl = dest_rgm->controls[i];
      if (i == 0)
	{
	  /* First iteration: MIN (X, VF/N) capped to the range [0, VF/N].  */
	  gassign *assign
	    = gimple_build_assign (ctrl, MIN_EXPR, step, length_limit);
	  gimple_seq_add_stmt (seq, assign);
	}
      else if (i == dest_rgm->controls.length () - 1)
	{
	  /* Last iteration: Remain capped to the range [0, VF/N].  */
	  gassign *assign = gimple_build_assign (ctrl, MINUS_EXPR, step,
						 dest_rgm->controls[i - 1]);
	  gimple_seq_add_stmt (seq, assign);
	}
      else
	{
	  /* (MIN (remain, VF*I/N)) capped to the range [0, VF/N].  */
	  step = gimple_build (seq, code: MINUS_EXPR, type: iv_type, ops: step,
			       ops: dest_rgm->controls[i - 1]);
	  gassign *assign
	    = gimple_build_assign (ctrl, MIN_EXPR, step, length_limit);
	  gimple_seq_add_stmt (seq, assign);
	}
    }
}
452
/* Store in *BSI the standard position for an induction variable increment
   belonging to LOOP_EXIT: just before the exit condition of the given
   exit.  *INSERT_AFTER is set to true if the increment should be inserted
   after *BSI; this implementation always requests insertion before.  */

void
vect_iv_increment_position (edge loop_exit, gimple_stmt_iterator *bsi,
			    bool *insert_after)
{
  basic_block bb = loop_exit->src;
  /* The exit condition is the last statement of the exit's source block.  */
  *bsi = gsi_last_bb (bb);
  *insert_after = false;
}
466
467/* Helper for vect_set_loop_condition_partial_vectors. Generate definitions
468 for all the rgroup controls in RGC and return a control that is nonzero
469 when the loop needs to iterate. Add any new preheader statements to
470 PREHEADER_SEQ. Use LOOP_COND_GSI to insert code before the exit gcond.
471
472 RGC belongs to loop LOOP. The loop originally iterated NITERS
473 times and has been vectorized according to LOOP_VINFO.
474
475 If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
476 starts with NITERS_SKIP dummy iterations of the scalar loop before
477 the real work starts. The mask elements for these dummy iterations
478 must be 0, to ensure that the extra iterations do not have an effect.
479
480 It is known that:
481
482 NITERS * RGC->max_nscalars_per_iter * RGC->factor
483
484 does not overflow. However, MIGHT_WRAP_P says whether an induction
485 variable that starts at 0 and has step:
486
487 VF * RGC->max_nscalars_per_iter * RGC->factor
488
489 might overflow before hitting a value above:
490
491 (NITERS + NITERS_SKIP) * RGC->max_nscalars_per_iter * RGC->factor
492
493 This means that we cannot guarantee that such an induction variable
494 would ever hit a value that produces a set of all-false masks or zero
495 lengths for RGC.
496
497 Note: the cost of the code generated by this function is modeled
498 by vect_estimate_min_profitable_iters, so changes here may need
499 corresponding changes there. */
500
static tree
vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
				 gimple_seq *preheader_seq,
				 gimple_seq *header_seq,
				 gimple_stmt_iterator loop_cond_gsi,
				 rgroup_controls *rgc, tree niters,
				 tree niters_skip, bool might_wrap_p,
				 tree *iv_step, tree *compare_step)
{
  tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
  bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);

  tree ctrl_type = rgc->type;
  unsigned int nitems_per_iter = rgc->max_nscalars_per_iter * rgc->factor;
  poly_uint64 nitems_per_ctrl = TYPE_VECTOR_SUBPARTS (node: ctrl_type) * rgc->factor;
  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  tree length_limit = NULL_TREE;
  /* For length, we need length_limit to ensure length in range.  */
  if (!use_masks_p)
    length_limit = build_int_cst (compare_type, nitems_per_ctrl);

  /* Calculate the maximum number of item values that the rgroup
     handles in total, the number that it handles for each iteration
     of the vector loop, and the number that it should skip during the
     first iteration of the vector loop.  */
  tree nitems_total = niters;
  tree nitems_step = build_int_cst (iv_type, vf);
  tree nitems_skip = niters_skip;
  if (nitems_per_iter != 1)
    {
      /* We checked before setting LOOP_VINFO_USING_PARTIAL_VECTORS_P that
	 these multiplications don't overflow.  */
      tree compare_factor = build_int_cst (compare_type, nitems_per_iter);
      tree iv_factor = build_int_cst (iv_type, nitems_per_iter);
      nitems_total = gimple_build (seq: preheader_seq, code: MULT_EXPR, type: compare_type,
				   ops: nitems_total, ops: compare_factor);
      nitems_step = gimple_build (seq: preheader_seq, code: MULT_EXPR, type: iv_type,
				  ops: nitems_step, ops: iv_factor);
      if (nitems_skip)
	nitems_skip = gimple_build (seq: preheader_seq, code: MULT_EXPR, type: compare_type,
				    ops: nitems_skip, ops: compare_factor);
    }

  /* Create an induction variable that counts the number of items
     processed.  */
  tree index_before_incr, index_after_incr;
  gimple_stmt_iterator incr_gsi;
  bool insert_after;
  edge exit_e = LOOP_VINFO_MAIN_EXIT (loop_vinfo);
  vect_iv_increment_position (loop_exit: exit_e, bsi: &incr_gsi, insert_after: &insert_after);
  if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
    {
      /* Create an IV that counts down from niters_total and whose step
	 is the (variable) amount processed in the current iteration:
	 ...
	 _10 = (unsigned long) count_12(D);
	 ...
	 # ivtmp_9 = PHI <ivtmp_35(6), _10(5)>
	 _36 = (MIN_EXPR | SELECT_VL) <ivtmp_9, POLY_INT_CST [4, 4]>;
	 ...
	 vect__4.8_28 = .LEN_LOAD (_17, 32B, _36, 0);
	 ...
	 ivtmp_35 = ivtmp_9 - POLY_INT_CST [4, 4];
	 ...
	 if (ivtmp_9 > POLY_INT_CST [4, 4])
	   goto <bb 4>; [83.33%]
	 else
	   goto <bb 5>; [16.67%]
	 */
      nitems_total = gimple_convert (seq: preheader_seq, type: iv_type, op: nitems_total);
      /* When there is a single control, STEP is the control itself;
	 otherwise a fresh SSA name that the controls are derived from.  */
      tree step = rgc->controls.length () == 1 ? rgc->controls[0]
					       : make_ssa_name (var: iv_type);
      /* Create decrement IV.  */
      if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
	{
	  /* STEP = .SELECT_VL (remaining, VF): the number of items the
	     target chooses to process this iteration.  */
	  create_iv (nitems_total, MINUS_EXPR, step, NULL_TREE, loop, &incr_gsi,
		     insert_after, &index_before_incr, &index_after_incr);
	  tree vectype = build_zero_cst (rgc->type);
	  tree len = gimple_build (seq: header_seq, code: IFN_SELECT_VL, type: iv_type,
				   ops: index_before_incr, ops: nitems_step,
				   ops: vectype);
	  gimple_seq_add_stmt (header_seq, gimple_build_assign (step, len));
	}
      else
	{
	  /* STEP = MIN (remaining, VF).  */
	  create_iv (nitems_total, MINUS_EXPR, nitems_step, NULL_TREE, loop,
		     &incr_gsi, insert_after, &index_before_incr,
		     &index_after_incr);
	  gimple_seq_add_stmt (header_seq,
			       gimple_build_assign (step, MIN_EXPR,
						    index_before_incr,
						    nitems_step));
	}
      *iv_step = step;
      *compare_step = nitems_step;
      return LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) ? index_after_incr
						       : index_before_incr;
    }

  /* Create increment IV.  */
  create_iv (build_int_cst (iv_type, 0), PLUS_EXPR, nitems_step, NULL_TREE,
	     loop, &incr_gsi, insert_after, &index_before_incr,
	     &index_after_incr);

  tree zero_index = build_int_cst (compare_type, 0);
  tree test_index, test_limit, first_limit;
  gimple_stmt_iterator *test_gsi;
  if (might_wrap_p)
    {
      /* In principle the loop should stop iterating once the incremented
	 IV reaches a value greater than or equal to:

	   NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP

	 However, there's no guarantee that this addition doesn't overflow
	 the comparison type, or that the IV hits a value above it before
	 wrapping around.  We therefore adjust the limit down by one
	 IV step:

	   (NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP)
	   -[infinite-prec] NITEMS_STEP

	 and compare the IV against this limit _before_ incrementing it.
	 Since the comparison type is unsigned, we actually want the
	 subtraction to saturate at zero:

	   (NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP)
	   -[sat] NITEMS_STEP

	 And since NITEMS_SKIP < NITEMS_STEP, we can reassociate this as:

	   NITEMS_TOTAL -[sat] (NITEMS_STEP - NITEMS_SKIP)

	 where the rightmost subtraction can be done directly in
	 COMPARE_TYPE.  */
      test_index = index_before_incr;
      tree adjust = gimple_convert (seq: preheader_seq, type: compare_type,
				    op: nitems_step);
      if (nitems_skip)
	adjust = gimple_build (seq: preheader_seq, code: MINUS_EXPR, type: compare_type,
			       ops: adjust, ops: nitems_skip);
      /* The MAX_EXPR implements the saturating subtraction.  */
      test_limit = gimple_build (seq: preheader_seq, code: MAX_EXPR, type: compare_type,
				 ops: nitems_total, ops: adjust);
      test_limit = gimple_build (seq: preheader_seq, code: MINUS_EXPR, type: compare_type,
				 ops: test_limit, ops: adjust);
      test_gsi = &incr_gsi;

      /* Get a safe limit for the first iteration.  */
      if (nitems_skip)
	{
	  /* The first vector iteration can handle at most NITEMS_STEP
	     items.  NITEMS_STEP <= CONST_LIMIT, and adding
	     NITEMS_SKIP to that cannot overflow.  */
	  tree const_limit = build_int_cst (compare_type,
					    LOOP_VINFO_VECT_FACTOR (loop_vinfo)
					    * nitems_per_iter);
	  first_limit = gimple_build (seq: preheader_seq, code: MIN_EXPR, type: compare_type,
				      ops: nitems_total, ops: const_limit);
	  first_limit = gimple_build (seq: preheader_seq, code: PLUS_EXPR, type: compare_type,
				      ops: first_limit, ops: nitems_skip);
	}
      else
	/* For the first iteration it doesn't matter whether the IV hits
	   a value above NITEMS_TOTAL.  That only matters for the latch
	   condition.  */
	first_limit = nitems_total;
    }
  else
    {
      /* Test the incremented IV, which will always hit a value above
	 the bound before wrapping.  */
      test_index = index_after_incr;
      test_limit = nitems_total;
      if (nitems_skip)
	test_limit = gimple_build (seq: preheader_seq, code: PLUS_EXPR, type: compare_type,
				   ops: test_limit, ops: nitems_skip);
      test_gsi = &loop_cond_gsi;

      first_limit = test_limit;
    }

  /* Convert the IV value to the comparison type (either a no-op or
     a demotion).  */
  gimple_seq test_seq = NULL;
  test_index = gimple_convert (seq: &test_seq, type: compare_type, op: test_index);
  gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT);

  /* Provide a definition of each control in the group.  */
  tree next_ctrl = NULL_TREE;
  tree ctrl;
  unsigned int i;
  FOR_EACH_VEC_ELT_REVERSE (rgc->controls, i, ctrl)
    {
      /* Previous controls will cover BIAS items.  This control covers the
	 next batch.  */
      poly_uint64 bias = nitems_per_ctrl * i;
      tree bias_tree = build_int_cst (compare_type, bias);

      /* See whether the first iteration of the vector loop is known
	 to have a full control.  */
      poly_uint64 const_limit;
      bool first_iteration_full
	= (poly_int_tree_p (t: first_limit, value: &const_limit)
	   && known_ge (const_limit, (i + 1) * nitems_per_ctrl));

      /* Rather than have a new IV that starts at BIAS and goes up to
	 TEST_LIMIT, prefer to use the same 0-based IV for each control
	 and adjust the bound down by BIAS.  */
      tree this_test_limit = test_limit;
      if (i != 0)
	{
	  /* MAX then MINUS implements a subtraction saturating at zero.  */
	  this_test_limit = gimple_build (seq: preheader_seq, code: MAX_EXPR,
					  type: compare_type, ops: this_test_limit,
					  ops: bias_tree);
	  this_test_limit = gimple_build (seq: preheader_seq, code: MINUS_EXPR,
					  type: compare_type, ops: this_test_limit,
					  ops: bias_tree);
	}

      /* Create the initial control.  First include all items that
	 are within the loop limit.  */
      tree init_ctrl = NULL_TREE;
      if (!first_iteration_full)
	{
	  tree start, end;
	  if (first_limit == test_limit)
	    {
	      /* Use a natural test between zero (the initial IV value)
		 and the loop limit.  The "else" block would be valid too,
		 but this choice can avoid the need to load BIAS_TREE into
		 a register.  */
	      start = zero_index;
	      end = this_test_limit;
	    }
	  else
	    {
	      /* FIRST_LIMIT is the maximum number of items handled by the
		 first iteration of the vector loop.  Test the portion
		 associated with this control.  */
	      start = bias_tree;
	      end = first_limit;
	    }

	  if (use_masks_p)
	    init_ctrl = vect_gen_while (preheader_seq, ctrl_type,
					start, end, "max_mask");
	  else
	    {
	      init_ctrl = make_temp_ssa_name (type: compare_type, NULL, name: "max_len");
	      gimple_seq seq = vect_gen_len (init_ctrl, start,
					     end, length_limit);
	      gimple_seq_add_seq (preheader_seq, seq);
	    }
	}

      /* Now AND out the bits that are within the number of skipped
	 items.  */
      poly_uint64 const_skip;
      if (nitems_skip
	  && !(poly_int_tree_p (t: nitems_skip, value: &const_skip)
	       && known_le (const_skip, bias)))
	{
	  gcc_assert (use_masks_p);
	  tree unskipped_mask = vect_gen_while_not (preheader_seq, ctrl_type,
						    bias_tree, nitems_skip);
	  if (init_ctrl)
	    init_ctrl = gimple_build (seq: preheader_seq, code: BIT_AND_EXPR, type: ctrl_type,
				      ops: init_ctrl, ops: unskipped_mask);
	  else
	    init_ctrl = unskipped_mask;
	}

      if (!init_ctrl)
	{
	  /* First iteration is full.  */
	  if (use_masks_p)
	    init_ctrl = build_minus_one_cst (ctrl_type);
	  else
	    init_ctrl = length_limit;
	}

      /* Get the control value for the next iteration of the loop.  */
      if (use_masks_p)
	{
	  gimple_seq stmts = NULL;
	  next_ctrl = vect_gen_while (&stmts, ctrl_type, test_index,
				      this_test_limit, "next_mask");
	  gsi_insert_seq_before (test_gsi, stmts, GSI_SAME_STMT);
	}
      else
	{
	  next_ctrl = make_temp_ssa_name (type: compare_type, NULL, name: "next_len");
	  gimple_seq seq = vect_gen_len (next_ctrl, test_index, this_test_limit,
					 length_limit);
	  gsi_insert_seq_before (test_gsi, seq, GSI_SAME_STMT);
	}

      vect_set_loop_control (loop, ctrl, init_ctrl, next_ctrl);
    }

  /* If the target wants a bias applied to partial load/store lengths,
     emit the biased copy of the first control into the header.  */
  int partial_load_bias = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
  if (partial_load_bias != 0)
    {
      tree adjusted_len = rgc->bias_adjusted_ctrl;
      gassign *minus = gimple_build_assign (adjusted_len, PLUS_EXPR,
					    rgc->controls[0],
					    build_int_cst
					      (TREE_TYPE (rgc->controls[0]),
					       partial_load_bias));
      gimple_seq_add_stmt (header_seq, minus);
    }

  return next_ctrl;
}
816
817/* Set up the iteration condition and rgroup controls for LOOP, given
818 that LOOP_VINFO_USING_PARTIAL_VECTORS_P is true for the vectorized
819 loop. LOOP_VINFO describes the vectorization of LOOP. NITERS is
820 the number of iterations of the original scalar loop that should be
821 handled by the vector loop. NITERS_MAYBE_ZERO and FINAL_IV are as
822 for vect_set_loop_condition.
823
824 Insert the branch-back condition before LOOP_COND_GSI and return the
825 final gcond. */
826
static gcond *
vect_set_loop_condition_partial_vectors (class loop *loop, edge exit_edge,
					 loop_vec_info loop_vinfo, tree niters,
					 tree final_iv, bool niters_maybe_zero,
					 gimple_stmt_iterator loop_cond_gsi)
{
  gimple_seq preheader_seq = NULL;
  gimple_seq header_seq = NULL;

  bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
  tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
  unsigned int compare_precision = TYPE_PRECISION (compare_type);
  tree orig_niters = niters;

  /* Type of the initial value of NITERS.  */
  tree ni_actual_type = TREE_TYPE (niters);
  unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
  tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
  if (niters_skip)
    niters_skip = gimple_convert (seq: &preheader_seq, type: compare_type, op: niters_skip);

  /* Convert NITERS to the same size as the compare.  */
  if (compare_precision > ni_actual_precision
      && niters_maybe_zero)
    {
      /* We know that there is always at least one iteration, so if the
	 count is zero then it must have wrapped.  Cope with this by
	 subtracting 1 before the conversion and adding 1 to the result.  */
      gcc_assert (TYPE_UNSIGNED (ni_actual_type));
      niters = gimple_build (seq: &preheader_seq, code: PLUS_EXPR, type: ni_actual_type,
			     ops: niters, ops: build_minus_one_cst (ni_actual_type));
      niters = gimple_convert (seq: &preheader_seq, type: compare_type, op: niters);
      niters = gimple_build (seq: &preheader_seq, code: PLUS_EXPR, type: compare_type,
			     ops: niters, ops: build_one_cst (compare_type));
    }
  else
    /* A plain conversion suffices.  */
    niters = gimple_convert (seq: &preheader_seq, type: compare_type, op: niters);

  /* Iterate over all the rgroups and fill in their controls.  We could use
     the first control from any rgroup for the loop condition; here we
     arbitrarily pick the last.  */
  tree test_ctrl = NULL_TREE;
  tree iv_step = NULL_TREE;
  tree compare_step = NULL_TREE;
  rgroup_controls *rgc;
  rgroup_controls *iv_rgc = nullptr;
  unsigned int i;
  auto_vec<rgroup_controls> *controls = use_masks_p
					  ? &LOOP_VINFO_MASKS (loop_vinfo).rgc_vec
					  : &LOOP_VINFO_LENS (loop_vinfo);
  FOR_EACH_VEC_ELT (*controls, i, rgc)
    if (!rgc->controls.is_empty ())
      {
	/* First try using permutes.  This adds a single vector
	   instruction to the loop for each mask, but needs no extra
	   loop invariants or IVs.  */
	unsigned int nmasks = i + 1;
	if (use_masks_p && (nmasks & 1) == 0)
	  {
	    rgroup_controls *half_rgc = &(*controls)[nmasks / 2 - 1];
	    if (!half_rgc->controls.is_empty ()
		&& vect_maybe_permute_loop_masks (seq: &header_seq, dest_rgm: rgc, src_rgm: half_rgc))
	      continue;
	  }

	/* Reuse the IV of a previous rgroup that covers the same number
	   of items per iteration, where possible.  */
	if (!LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
	    || !iv_rgc
	    || (iv_rgc->max_nscalars_per_iter * iv_rgc->factor
		!= rgc->max_nscalars_per_iter * rgc->factor))
	  {
	    /* See whether zero-based IV would ever generate all-false masks
	       or zero length before wrapping around.  */
	    bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);

	    /* Set up all controls for this group.  */
	    test_ctrl
	      = vect_set_loop_controls_directly (loop, loop_vinfo,
						 preheader_seq: &preheader_seq, header_seq: &header_seq,
						 loop_cond_gsi, rgc, niters,
						 niters_skip, might_wrap_p,
						 iv_step: &iv_step, compare_step: &compare_step);

	    iv_rgc = rgc;
	  }

	if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
	    && rgc->controls.length () > 1)
	  {
	    /* vect_set_loop_controls_directly creates an IV whose step
	       is equal to the expected sum of RGC->controls.  Use that
	       information to populate RGC->controls.  */
	    tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
	    gcc_assert (iv_step);
	    vect_adjust_loop_lens_control (iv_type, seq: &header_seq, dest_rgm: rgc, step: iv_step);
	  }
      }

  /* Emit all accumulated statements.  */
  add_preheader_seq (loop, seq: preheader_seq);
  add_header_seq (loop, seq: header_seq);

  /* Get a boolean result that tells us whether to iterate.  */
  gcond *cond_stmt;
  if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
      && !LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
    {
      /* Decrementing IV without SELECT_VL: keep looping while the
	 remaining count exceeds one full step.  */
      gcc_assert (compare_step);
      tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? LE_EXPR : GT_EXPR;
      cond_stmt = gimple_build_cond (code, test_ctrl, compare_step, NULL_TREE,
				     NULL_TREE);
    }
  else
    {
      /* Otherwise loop while the controlling mask/length is nonzero.  */
      tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR;
      tree zero_ctrl = build_zero_cst (TREE_TYPE (test_ctrl));
      cond_stmt
	= gimple_build_cond (code, test_ctrl, zero_ctrl, NULL_TREE, NULL_TREE);
    }
  gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);

  /* The loop iterates (NITERS - 1) / VF + 1 times.
     Subtract one from this to get the latch count.  */
  tree step = build_int_cst (compare_type,
			     LOOP_VINFO_VECT_FACTOR (loop_vinfo));
  tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters,
				       build_minus_one_cst (compare_type));
  loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type,
				     niters_minus_one, step);

  if (final_iv)
    {
      gassign *assign;
      /* If vectorizing an inverted early break loop we have to restart the
	 scalar loop at niters - vf.  This matches what we do in
	 vect_gen_vector_loop_niters_mult_vf for non-masked loops.  */
      if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
	{
	  tree ftype = TREE_TYPE (orig_niters);
	  tree vf = build_int_cst (ftype, LOOP_VINFO_VECT_FACTOR (loop_vinfo));
	  assign = gimple_build_assign (final_iv, MINUS_EXPR, orig_niters, vf);
	}
      else
	assign = gimple_build_assign (final_iv, orig_niters);
      gsi_insert_on_edge_immediate (exit_edge, assign);
    }

  return cond_stmt;
}
975
976/* Set up the iteration condition and rgroup controls for LOOP in AVX512
977 style, given that LOOP_VINFO_USING_PARTIAL_VECTORS_P is true for the
978 vectorized loop. LOOP_VINFO describes the vectorization of LOOP. NITERS is
979 the number of iterations of the original scalar loop that should be
980 handled by the vector loop. NITERS_MAYBE_ZERO and FINAL_IV are as
981 for vect_set_loop_condition.
982
983 Insert the branch-back condition before LOOP_COND_GSI and return the
984 final gcond. */
985
static gcond *
vect_set_loop_condition_partial_vectors_avx512 (class loop *loop,
						edge exit_edge,
						loop_vec_info loop_vinfo, tree niters,
						tree final_iv,
						bool niters_maybe_zero,
						gimple_stmt_iterator loop_cond_gsi)
{
  tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  tree orig_niters = niters;
  gimple_seq preheader_seq = NULL;

  /* Create an IV that counts down from niters and whose step
     is the number of iterations processed in the current iteration.
     Produce the controls with compares like the following.

       # iv_2 = PHI <niters, iv_3>
       rem_4 = MIN <iv_2, VF>;
       remv_6 = { rem_4, rem_4, rem_4, ... }
       mask_5 = { 0, 0, 1, 1, 2, 2, ... } < remv_6;
       iv_3 = iv_2 - VF;
       if (iv_2 > VF)
	 continue;

     Where the constant is built with elements at most VF - 1 and
     repetitions according to max_nscalars_per_iter which is guaranteed
     to be the same within a group.  */

  /* Convert NITERS to the determined IV type.  */
  if (TYPE_PRECISION (iv_type) > TYPE_PRECISION (TREE_TYPE (niters))
      && niters_maybe_zero)
    {
      /* We know that there is always at least one iteration, so if the
	 count is zero then it must have wrapped.  Cope with this by
	 subtracting 1 before the conversion and adding 1 to the result.  */
      gcc_assert (TYPE_UNSIGNED (TREE_TYPE (niters)));
      niters = gimple_build (seq: &preheader_seq, code: PLUS_EXPR, TREE_TYPE (niters),
			     ops: niters, ops: build_minus_one_cst (TREE_TYPE (niters)));
      niters = gimple_convert (seq: &preheader_seq, type: iv_type, op: niters);
      niters = gimple_build (seq: &preheader_seq, code: PLUS_EXPR, type: iv_type,
			     ops: niters, ops: build_one_cst (iv_type));
    }
  else
    niters = gimple_convert (seq: &preheader_seq, type: iv_type, op: niters);

  /* Bias the initial value of the IV in case we need to skip iterations
     at the beginning.  */
  tree niters_adj = niters;
  if (niters_skip)
    {
      tree skip = gimple_convert (seq: &preheader_seq, type: iv_type, op: niters_skip);
      niters_adj = gimple_build (seq: &preheader_seq, code: PLUS_EXPR,
				 type: iv_type, ops: niters, ops: skip);
    }

  /* The iteration step is the vectorization factor.  */
  tree iv_step = build_int_cst (iv_type, vf);

  /* Create the decrement IV.  */
  tree index_before_incr, index_after_incr;
  gimple_stmt_iterator incr_gsi;
  bool insert_after;
  vect_iv_increment_position (loop_exit: exit_edge, bsi: &incr_gsi, insert_after: &insert_after);
  create_iv (niters_adj, MINUS_EXPR, iv_step, NULL_TREE, loop,
	     &incr_gsi, insert_after, &index_before_incr,
	     &index_after_incr);

  /* Iterate over all the rgroups and fill in their controls.  */
  for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
    {
      /* Skip rgroups for which no control was ever requested.  */
      if (rgc.controls.is_empty ())
	continue;

      tree ctrl_type = rgc.type;
      poly_uint64 nitems_per_ctrl = TYPE_VECTOR_SUBPARTS (node: ctrl_type);

      tree vectype = rgc.compare_type;

      /* index_after_incr is the IV specifying the remaining iterations in
	 the next iteration.  */
      tree rem = index_after_incr;
      /* When the data type for the compare to produce the mask is
	 smaller than the IV type we need to saturate.  Saturate to
	 the smallest possible value (IV_TYPE) so we only have to
	 saturate once (CSE will catch redundant ones we add).  */
      if (TYPE_PRECISION (TREE_TYPE (vectype)) < TYPE_PRECISION (iv_type))
	rem = gimple_build (&incr_gsi, false, GSI_CONTINUE_LINKING,
			    UNKNOWN_LOCATION,
			    MIN_EXPR, TREE_TYPE (rem), rem, iv_step);
      rem = gimple_convert (&incr_gsi, false, GSI_CONTINUE_LINKING,
			    UNKNOWN_LOCATION, TREE_TYPE (vectype), rem);

      /* Build a data vector composed of the remaining iterations.  */
      rem = gimple_build_vector_from_val (&incr_gsi, false, GSI_CONTINUE_LINKING,
					  UNKNOWN_LOCATION, vectype, rem);

      /* Provide a definition of each vector in the control group.
	 Walk the controls in reverse so the last (highest-index) control
	 is processed first.  */
      tree next_ctrl = NULL_TREE;
      tree first_rem = NULL_TREE;
      tree ctrl;
      unsigned int i;
      FOR_EACH_VEC_ELT_REVERSE (rgc.controls, i, ctrl)
	{
	  /* Previous controls will cover BIAS items.  This control covers the
	     next batch.  */
	  poly_uint64 bias = nitems_per_ctrl * i;

	  /* Build the constant to compare the remaining iters against,
	     this is sth like { 0, 0, 1, 1, 2, 2, 3, 3, ... } appropriately
	     split into pieces.  */
	  unsigned n = TYPE_VECTOR_SUBPARTS (node: ctrl_type).to_constant ();
	  tree_vector_builder builder (vectype, n, 1);
	  for (unsigned i = 0; i < n; ++i)
	    {
	      /* Each scalar iteration contributes max_nscalars_per_iter
		 consecutive lanes, hence the division.  */
	      unsigned HOST_WIDE_INT val
		= (i + bias.to_constant ()) / rgc.max_nscalars_per_iter;
	      gcc_assert (val < vf.to_constant ());
	      builder.quick_push (obj: build_int_cst (TREE_TYPE (vectype), val));
	    }
	  tree cmp_series = builder.build ();

	  /* Create the initial control.  First include all items that
	     are within the loop limit.  */
	  tree init_ctrl = NULL_TREE;
	  poly_uint64 const_limit;
	  /* See whether the first iteration of the vector loop is known
	     to have a full control.  */
	  if (poly_int_tree_p (t: niters, value: &const_limit)
	      && known_ge (const_limit, (i + 1) * nitems_per_ctrl))
	    init_ctrl = build_minus_one_cst (ctrl_type);
	  else
	    {
	      /* The remaining work items initially are niters.  Saturate,
		 splat and compare.  The splat/saturate is computed once
		 and cached in FIRST_REM for reuse by lower controls.  */
	      if (!first_rem)
		{
		  first_rem = niters;
		  if (TYPE_PRECISION (TREE_TYPE (vectype))
		      < TYPE_PRECISION (iv_type))
		    first_rem = gimple_build (seq: &preheader_seq,
					      code: MIN_EXPR, TREE_TYPE (first_rem),
					      ops: first_rem, ops: iv_step);
		  first_rem = gimple_convert (seq: &preheader_seq, TREE_TYPE (vectype),
					      op: first_rem);
		  first_rem = gimple_build_vector_from_val (seq: &preheader_seq,
							    type: vectype, op: first_rem);
		}
	      init_ctrl = gimple_build (seq: &preheader_seq, code: LT_EXPR, type: ctrl_type,
					ops: cmp_series, ops: first_rem);
	    }

	  /* Now AND out the bits that are within the number of skipped
	     items.  Only needed when the skip amount is not known to be
	     fully covered by earlier controls.  */
	  poly_uint64 const_skip;
	  if (niters_skip
	      && !(poly_int_tree_p (t: niters_skip, value: &const_skip)
		   && known_le (const_skip, bias)))
	    {
	      /* For integer mode masks it's cheaper to shift out the bits
		 since that avoids loading a constant.  */
	      gcc_assert (GET_MODE_CLASS (TYPE_MODE (ctrl_type)) == MODE_INT);
	      init_ctrl = gimple_build (seq: &preheader_seq, code: VIEW_CONVERT_EXPR,
					type: lang_hooks.types.type_for_mode
						(TYPE_MODE (ctrl_type), 1),
					ops: init_ctrl);
	      /* ??? But when the shift amount isn't constant this requires
		 a round-trip to GRPs.  We could apply the bias to either
		 side of the compare instead.  */
	      tree shift = gimple_build (seq: &preheader_seq, code: MINUS_EXPR,
					 TREE_TYPE (niters_skip), ops: niters_skip,
					 ops: build_int_cst (TREE_TYPE (niters_skip),
							     bias));
	      shift = gimple_build (seq: &preheader_seq, code: MULT_EXPR,
				    TREE_TYPE (niters_skip), ops: shift,
				    ops: build_int_cst (TREE_TYPE (niters_skip),
							rgc.max_nscalars_per_iter));
	      init_ctrl = gimple_build (seq: &preheader_seq, code: LSHIFT_EXPR,
					TREE_TYPE (init_ctrl),
					ops: init_ctrl, ops: shift);
	      init_ctrl = gimple_build (seq: &preheader_seq, code: VIEW_CONVERT_EXPR,
					type: ctrl_type, ops: init_ctrl);
	    }

	  /* Get the control value for the next iteration of the loop.  */
	  next_ctrl = gimple_build (&incr_gsi, false, GSI_CONTINUE_LINKING,
				    UNKNOWN_LOCATION,
				    LT_EXPR, ctrl_type, cmp_series, rem);

	  vect_set_loop_control (loop, ctrl, init_ctrl, next_ctrl);
	}
    }

  /* Emit all accumulated statements.  */
  add_preheader_seq (loop, seq: preheader_seq);

  /* Adjust the exit test using the decrementing IV.  */
  tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? LE_EXPR : GT_EXPR;
  /* When we peel for alignment with niter_skip != 0 this can
     cause niter + niter_skip to wrap and since we are comparing the
     value before the decrement here we get a false early exit.
     We can't compare the value after decrement either because that
     decrement could wrap as well as we're not doing a saturating
     decrement.  To avoid this situation we force a larger
     iv_type.  */
  gcond *cond_stmt = gimple_build_cond (code, index_before_incr, iv_step,
					NULL_TREE, NULL_TREE);
  gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);

  /* The loop iterates (NITERS - 1 + NITERS_SKIP) / VF + 1 times.
     Subtract one from this to get the latch count.  */
  tree niters_minus_one
    = fold_build2 (PLUS_EXPR, TREE_TYPE (orig_niters), orig_niters,
		   build_minus_one_cst (TREE_TYPE (orig_niters)));
  tree niters_adj2 = fold_convert (iv_type, niters_minus_one);
  /* NOTE(review): the PLUS_EXPR below is built in IV_TYPE but its first
     operand NITERS_MINUS_ONE still has the original niters type — presumably
     fold copes with this; verify if the types can ever differ here.  */
  if (niters_skip)
    niters_adj2 = fold_build2 (PLUS_EXPR, iv_type, niters_minus_one,
			       fold_convert (iv_type, niters_skip));
  loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, iv_type,
				     niters_adj2, iv_step);

  if (final_iv)
    {
      gassign *assign;
      /* If vectorizing an inverted early break loop we have to restart the
	 scalar loop at niters - vf.  This matches what we do in
	 vect_gen_vector_loop_niters_mult_vf for non-masked loops.  */
      if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
	{
	  tree ftype = TREE_TYPE (orig_niters);
	  tree vf = build_int_cst (ftype, LOOP_VINFO_VECT_FACTOR (loop_vinfo));
	  assign = gimple_build_assign (final_iv, MINUS_EXPR, orig_niters, vf);
	}
      else
	assign = gimple_build_assign (final_iv, orig_niters);
      gsi_insert_on_edge_immediate (exit_edge, assign);
    }

  return cond_stmt;
}
1227
1228
1229/* Like vect_set_loop_condition, but handle the case in which the vector
1230 loop handles exactly VF scalars per iteration. */
1231
static gcond *
vect_set_loop_condition_normal (loop_vec_info /* loop_vinfo */, edge exit_edge,
				class loop *loop, tree niters, tree step,
				tree final_iv, bool niters_maybe_zero,
				gimple_stmt_iterator loop_cond_gsi)
{
  tree indx_before_incr, indx_after_incr;
  gcond *cond_stmt;
  gcond *orig_cond;
  edge pe = loop_preheader_edge (loop);
  gimple_stmt_iterator incr_gsi;
  bool insert_after;
  enum tree_code code;
  tree niters_type = TREE_TYPE (niters);

  /* Position the iterator on the existing exit condition; the new
     condition is inserted right before it.  */
  orig_cond = get_loop_exit_condition (exit_edge);
  gcc_assert (orig_cond);
  loop_cond_gsi = gsi_for_stmt (orig_cond);

  /* INIT is the IV start value and LIMIT the bound it is compared
     against; which case applies depends on STEP and whether NITERS
     can be zero.  */
  tree init, limit;
  if (!niters_maybe_zero && integer_onep (step))
    {
      /* In this case we can use a simple 0-based IV:

	 A:
	   x = 0;
	   do
	     {
	       ...
	       x += 1;
	     }
	   while (x < NITERS); */
      code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
      init = build_zero_cst (niters_type);
      limit = niters;
    }
  else
    {
      /* The following works for all values of NITERS except 0:

	 B:
	   x = 0;
	   do
	     {
	       ...
	       x += STEP;
	     }
	   while (x <= NITERS - STEP);

	 so that the loop continues to iterate if x + STEP - 1 < NITERS
	 but stops if x + STEP - 1 >= NITERS.

	 However, if NITERS is zero, x never hits a value above NITERS - STEP
	 before wrapping around.  There are two obvious ways of dealing with
	 this:

	 - start at STEP - 1 and compare x before incrementing it
	 - start at -1 and compare x after incrementing it

	 The latter is simpler and is what we use.  The loop in this case
	 looks like:

	 C:
	   x = -1;
	   do
	     {
	       ...
	       x += STEP;
	     }
	   while (x < NITERS - STEP);

	 In both cases the loop limit is NITERS - STEP.  */
      gimple_seq seq = NULL;
      limit = force_gimple_operand (niters, &seq, true, NULL_TREE);
      limit = gimple_build (seq: &seq, code: MINUS_EXPR, TREE_TYPE (limit), ops: limit, ops: step);
      if (seq)
	{
	  /* Materialize the limit computation on the preheader edge;
	     this must not create a new basic block.  */
	  basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
	  gcc_assert (!new_bb);
	}
      if (niters_maybe_zero)
	{
	  /* Case C.  */
	  code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
	  init = build_all_ones_cst (niters_type);
	}
      else
	{
	  /* Case B.  */
	  code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR;
	  init = build_zero_cst (niters_type);
	}
    }

  /* Create the counting IV and gimplify the operands of the new
     exit compare in place.  */
  vect_iv_increment_position (loop_exit: exit_edge, bsi: &incr_gsi, insert_after: &insert_after);
  create_iv (init, PLUS_EXPR, step, NULL_TREE, loop,
	     &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
  indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
					      true, NULL_TREE, true,
					      GSI_SAME_STMT);
  limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE,
				    true, GSI_SAME_STMT);

  cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE,
				 NULL_TREE);

  gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);

  /* Record the number of latch iterations.  */
  if (limit == niters)
    /* Case A: the loop iterates NITERS times.  Subtract one to get the
       latch count.  */
    loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
				       build_int_cst (niters_type, 1));
  else
    /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times.
       Subtract one from this to get the latch count.  */
    loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
				       limit, step);

  if (final_iv)
    {
      gassign *assign;
      gcc_assert (single_pred_p (exit_edge->dest));
      /* When INIT is zero the IV's exit value is already N * STEP, so
	 the LC PHI can define FINAL_IV directly; otherwise a fresh name
	 is used and the bias subtracted afterwards.  */
      tree phi_dest
	= integer_zerop (init) ? final_iv : copy_ssa_name (var: indx_after_incr);
      /* Make sure to maintain LC SSA form here and elide the subtraction
	 if the value is zero.  */
      gphi *phi = create_phi_node (phi_dest, exit_edge->dest);
      add_phi_arg (phi, indx_after_incr, exit_edge, UNKNOWN_LOCATION);
      if (!integer_zerop (init))
	{
	  assign = gimple_build_assign (final_iv, MINUS_EXPR,
					phi_dest, init);
	  gimple_stmt_iterator gsi = gsi_after_labels (bb: exit_edge->dest);
	  gsi_insert_before (&gsi, assign, GSI_SAME_STMT);
	}
    }

  return cond_stmt;
}
1373
1374/* If we're using fully-masked loops, make LOOP iterate:
1375
1376 N == (NITERS - 1) / STEP + 1
1377
1378 times. When NITERS is zero, this is equivalent to making the loop
1379 execute (1 << M) / STEP times, where M is the precision of NITERS.
1380 NITERS_MAYBE_ZERO is true if this last case might occur.
1381
1382 If we're not using fully-masked loops, make LOOP iterate:
1383
1384 N == (NITERS - STEP) / STEP + 1
1385
1386 times, where NITERS is known to be outside the range [1, STEP - 1].
1387 This is equivalent to making the loop execute NITERS / STEP times
1388 when NITERS is nonzero and (1 << M) / STEP times otherwise.
1389 NITERS_MAYBE_ZERO again indicates whether this last case might occur.
1390
1391 If FINAL_IV is nonnull, it is an SSA name that should be set to
1392 N * STEP on exit from the loop.
1393
1394 Assumption: the exit-condition of LOOP is the last stmt in the loop. */
1395
1396void
1397vect_set_loop_condition (class loop *loop, edge loop_e, loop_vec_info loop_vinfo,
1398 tree niters, tree step, tree final_iv,
1399 bool niters_maybe_zero)
1400{
1401 gcond *cond_stmt;
1402 gcond *orig_cond = get_loop_exit_condition (loop_e);
1403 gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);
1404
1405 if (loop_vinfo && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1406 {
1407 if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_avx512)
1408 cond_stmt = vect_set_loop_condition_partial_vectors_avx512 (loop, exit_edge: loop_e,
1409 loop_vinfo,
1410 niters, final_iv,
1411 niters_maybe_zero,
1412 loop_cond_gsi);
1413 else
1414 cond_stmt = vect_set_loop_condition_partial_vectors (loop, exit_edge: loop_e,
1415 loop_vinfo,
1416 niters, final_iv,
1417 niters_maybe_zero,
1418 loop_cond_gsi);
1419 }
1420 else
1421 cond_stmt = vect_set_loop_condition_normal (loop_vinfo, exit_edge: loop_e, loop,
1422 niters,
1423 step, final_iv,
1424 niters_maybe_zero,
1425 loop_cond_gsi);
1426
1427 /* Remove old loop exit test. */
1428 stmt_vec_info orig_cond_info;
1429 if (loop_vinfo
1430 && (orig_cond_info = loop_vinfo->lookup_stmt (orig_cond)))
1431 loop_vinfo->remove_stmt (orig_cond_info);
1432 else
1433 gsi_remove (&loop_cond_gsi, true);
1434
1435 if (dump_enabled_p ())
1436 dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: %G",
1437 (gimple *) cond_stmt);
1438}
1439
1440/* Get the virtual operand live on E. The precondition on this is valid
1441 immediate dominators and an actual virtual definition dominating E. */
1442/* ??? Costly band-aid. For the use in question we can populate a
1443 live-on-exit/end-of-BB virtual operand when copying stmts. */
1444
1445static tree
1446get_live_virtual_operand_on_edge (edge e)
1447{
1448 basic_block bb = e->src;
1449 do
1450 {
1451 for (auto gsi = gsi_last_bb (bb); !gsi_end_p (i: gsi); gsi_prev (i: &gsi))
1452 {
1453 gimple *stmt = gsi_stmt (i: gsi);
1454 if (gimple_vdef (g: stmt))
1455 return gimple_vdef (g: stmt);
1456 if (gimple_vuse (g: stmt))
1457 return gimple_vuse (g: stmt);
1458 }
1459 if (gphi *vphi = get_virtual_phi (bb))
1460 return gimple_phi_result (gs: vphi);
1461 bb = get_immediate_dominator (CDI_DOMINATORS, bb);
1462 }
1463 while (1);
1464}
1465
1466/* Given LOOP this function generates a new copy of it and puts it
1467 on E which is either the entry or exit of LOOP. If SCALAR_LOOP is
1468 non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
1469 basic blocks from SCALAR_LOOP instead of LOOP, but to either the
1470 entry or exit of LOOP. If FLOW_LOOPS then connect LOOP to SCALAR_LOOP as a
1471 continuation. This is correct for cases where one loop continues from the
1472 other like in the vectorizer, but not true for uses in e.g. loop distribution
1473 where the contents of the loop body are split but the iteration space of both
1474 copies remains the same.
1475
   If UPDATED_DOMS is not NULL it is updated with the list of basic blocks whose
   dominators were updated during the peeling.  When doing early break vectorization
   then LOOP_VINFO needs to be provided and is used to keep track of any newly created
   memory references that need to be updated should we decide to vectorize.  */
1480
1481class loop *
1482slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
1483 class loop *scalar_loop,
1484 edge scalar_exit, edge e, edge *new_e,
1485 bool flow_loops,
1486 vec<basic_block> *updated_doms,
1487 bool uncounted_p, bool create_main_e)
1488{
1489 class loop *new_loop;
1490 basic_block *new_bbs, *bbs, *pbbs;
1491 bool at_exit;
1492 bool was_imm_dom;
1493 basic_block exit_dest;
1494 edge exit, new_exit;
1495 bool duplicate_outer_loop = false;
1496
1497 exit = loop_exit;
1498 at_exit = (e == exit);
1499 if (!at_exit && e != loop_preheader_edge (loop))
1500 return NULL;
1501
1502 if (scalar_loop == NULL)
1503 {
1504 scalar_loop = loop;
1505 scalar_exit = loop_exit;
1506 }
1507 else if (scalar_loop == loop)
1508 scalar_exit = loop_exit;
1509 else
1510 {
      /* Loop has been versioned, match exits up using the aux index.  */
1512 for (edge exit : get_loop_exit_edges (scalar_loop))
1513 if (exit->aux == loop_exit->aux)
1514 {
1515 scalar_exit = exit;
1516 break;
1517 }
1518
1519 gcc_assert (scalar_exit);
1520 }
1521
1522 bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1523 pbbs = bbs + 1;
1524 get_loop_body_with_size (scalar_loop, pbbs, scalar_loop->num_nodes);
1525 /* Allow duplication of outer loops. */
1526 if (scalar_loop->inner)
1527 duplicate_outer_loop = true;
1528
1529 /* Generate new loop structure. */
1530 new_loop = duplicate_loop (scalar_loop, loop_outer (loop: scalar_loop));
1531 duplicate_subloops (scalar_loop, new_loop);
1532
1533 exit_dest = exit->dest;
1534 was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
1535 exit_dest) == exit->src ?
1536 true : false);
1537
1538 /* Also copy the pre-header, this avoids jumping through hoops to
1539 duplicate the loop entry PHI arguments. Create an empty
1540 pre-header unconditionally for this. */
1541 basic_block preheader = split_edge (loop_preheader_edge (scalar_loop));
1542 edge entry_e = single_pred_edge (bb: preheader);
1543 bbs[0] = preheader;
1544 new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1545
1546 copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
1547 &scalar_exit, 1, &new_exit, NULL,
1548 at_exit ? loop->latch : e->src, true);
1549 exit = loop_exit;
1550 basic_block new_preheader = new_bbs[0];
1551
1552 gcc_assert (new_exit);
1553
1554 /* Record the new loop exit information. new_loop doesn't have SCEV data and
1555 so we must initialize the exit information. */
1556 if (new_e)
1557 *new_e = new_exit;
1558
1559 /* Before installing PHI arguments make sure that the edges
1560 into them match that of the scalar loop we analyzed. This
1561 makes sure the SLP tree matches up between the main vectorized
1562 loop and the epilogue vectorized copies. */
1563 if (single_succ_edge (bb: preheader)->dest_idx
1564 != single_succ_edge (bb: new_bbs[0])->dest_idx)
1565 {
1566 basic_block swap_bb = new_bbs[1];
1567 gcc_assert (EDGE_COUNT (swap_bb->preds) == 2);
1568 std::swap (EDGE_PRED (swap_bb, 0), EDGE_PRED (swap_bb, 1));
1569 EDGE_PRED (swap_bb, 0)->dest_idx = 0;
1570 EDGE_PRED (swap_bb, 1)->dest_idx = 1;
1571 }
1572 if (duplicate_outer_loop)
1573 {
1574 class loop *new_inner_loop = get_loop_copy (scalar_loop->inner);
1575 if (loop_preheader_edge (scalar_loop)->dest_idx
1576 != loop_preheader_edge (new_inner_loop)->dest_idx)
1577 {
1578 basic_block swap_bb = new_inner_loop->header;
1579 gcc_assert (EDGE_COUNT (swap_bb->preds) == 2);
1580 std::swap (EDGE_PRED (swap_bb, 0), EDGE_PRED (swap_bb, 1));
1581 EDGE_PRED (swap_bb, 0)->dest_idx = 0;
1582 EDGE_PRED (swap_bb, 1)->dest_idx = 1;
1583 }
1584 }
1585
1586 add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL);
1587
1588 /* Skip new preheader since it's deleted if copy loop is added at entry. */
1589 for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
1590 rename_variables_in_bb (bb: new_bbs[i], rename_from_outer_loop: duplicate_outer_loop);
1591
1592 /* Rename the exit uses. */
1593 for (edge exit : get_loop_exit_edges (new_loop))
1594 for (auto gsi = gsi_start_phis (exit->dest);
1595 !gsi_end_p (i: gsi); gsi_next (i: &gsi))
1596 {
1597 tree orig_def = PHI_ARG_DEF_FROM_EDGE (gsi.phi (), exit);
1598 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), exit));
1599 if (MAY_HAVE_DEBUG_BIND_STMTS)
1600 adjust_debug_stmts (from: orig_def, PHI_RESULT (gsi.phi ()), bb: exit->dest);
1601 }
1602
1603 auto loop_exits = get_loop_exit_edges (loop);
1604 bool multiple_exits_p = loop_exits.length () > 1;
1605 auto_vec<basic_block> doms;
1606
1607 if (at_exit) /* Add the loop copy at exit. */
1608 {
1609 if (scalar_loop != loop && new_exit->dest != exit_dest)
1610 {
1611 new_exit = redirect_edge_and_branch (new_exit, exit_dest);
1612 flush_pending_stmts (new_exit);
1613 }
1614
1615 bool need_virtual_phi = get_virtual_phi (loop->header);
1616
1617 /* For the main loop exit preserve the LC PHI nodes. For vectorization
1618 we need them to continue or finalize reductions. Since we do not
1619 copy the loop exit blocks we have to materialize PHIs at the
1620 new destination before redirecting edges. */
1621 for (auto gsi_from = gsi_start_phis (loop_exit->dest);
1622 !gsi_end_p (i: gsi_from); gsi_next (i: &gsi_from))
1623 {
1624 tree res = gimple_phi_result (gs: *gsi_from);
1625 create_phi_node (copy_ssa_name (var: res), new_preheader);
1626 }
1627 edge e = redirect_edge_and_branch (loop_exit, new_preheader);
1628 gcc_assert (e == loop_exit);
1629 flush_pending_stmts (loop_exit);
1630 set_immediate_dominator (CDI_DOMINATORS, new_preheader, loop_exit->src);
1631
1632 /* If we ended up choosing an exit leading to a path not using memory
1633 we can end up without a virtual LC PHI. Create it when it is
1634 needed because of the epilog loop continuation. */
1635 if (need_virtual_phi && !get_virtual_phi (loop_exit->dest))
1636 {
1637 tree header_def = gimple_phi_result (gs: get_virtual_phi (loop->header));
1638 gphi *vphi = create_phi_node (copy_ssa_name (var: header_def),
1639 new_preheader);
1640 add_phi_arg (vphi, get_live_virtual_operand_on_edge (e: loop_exit),
1641 loop_exit, UNKNOWN_LOCATION);
1642 }
1643
1644 bool multiple_exits_p = loop_exits.length () > 1;
1645 basic_block main_loop_exit_block = new_preheader;
1646 basic_block alt_loop_exit_block = NULL;
1647 /* Create the CFG for multiple exits.
1648 | loop_exit | alt1 | altN
1649 v v ... v
1650 main_loop_exit_block: alt_loop_exit_block:
1651 | /
1652 v v
1653 new_preheader:
1654 where in the new preheader we need merge PHIs for
1655 the continuation values into the epilogue header.
1656 Do not bother with exit PHIs for the early exits but
1657 their live virtual operand. We'll fix up things below. */
1658 if (multiple_exits_p || uncounted_p)
1659 {
1660 edge loop_e = single_succ_edge (bb: new_preheader);
1661 new_preheader = split_edge (loop_e);
1662
1663 gphi *vphi = NULL;
1664 alt_loop_exit_block = new_preheader;
1665 for (auto exit : loop_exits)
1666 if (exit != loop_exit)
1667 {
1668 tree vphi_def = NULL_TREE;
1669 if (gphi *evphi = get_virtual_phi (exit->dest))
1670 vphi_def = gimple_phi_arg_def_from_edge (gs: evphi, e: exit);
1671 edge res = redirect_edge_and_branch (exit, alt_loop_exit_block);
1672 gcc_assert (res == exit);
1673 redirect_edge_var_map_clear (exit);
1674 if (alt_loop_exit_block == new_preheader)
1675 alt_loop_exit_block = split_edge (exit);
1676 if (!need_virtual_phi)
1677 continue;
1678 /* When the edge has no virtual LC PHI get at the live
1679 virtual operand by other means. */
1680 if (!vphi_def)
1681 vphi_def = get_live_virtual_operand_on_edge (e: exit);
1682 if (!vphi)
1683 vphi = create_phi_node (copy_ssa_name (var: vphi_def),
1684 alt_loop_exit_block);
1685 else
1686 /* Edge redirection might re-allocate the PHI node
1687 so we have to rediscover it. */
1688 vphi = get_virtual_phi (alt_loop_exit_block);
1689 add_phi_arg (vphi, vphi_def, exit, UNKNOWN_LOCATION);
1690 }
1691
1692 set_immediate_dominator (CDI_DOMINATORS, new_preheader,
1693 loop->header);
1694
1695 /* Fix up the profile counts of the new exit blocks.
1696 main_loop_exit_block was created by duplicating the
1697 preheader, so needs its count scaling according to the main
1698 exit edge's probability. The remaining count from the
1699 preheader goes to the alt_loop_exit_block, since all
1700 alternative exits have been redirected there. */
1701 main_loop_exit_block->count = loop_exit->count ();
1702 alt_loop_exit_block->count
1703 = preheader->count - main_loop_exit_block->count;
1704 }
1705
1706 /* Adjust the epilog loop PHI entry values to continue iteration.
1707 This adds remaining necessary LC PHI nodes to the main exit
1708 and creates merge PHIs when we have multiple exits with
1709 their appropriate continuation. */
1710 if (flow_loops)
1711 {
1712 edge loop_entry = single_succ_edge (bb: new_preheader);
1713 bool peeled_iters = (uncounted_p
1714 || single_pred (bb: loop->latch) != loop_exit->src);
1715
1716 /* Record the new SSA names in the cache so that we can skip
1717 materializing them again when we fill in the rest of the LC SSA
1718 variables. */
1719 hash_map <tree, tree> new_phi_args;
1720 for (auto psi = gsi_start_phis (main_loop_exit_block);
1721 !gsi_end_p (i: psi); gsi_next (i: &psi))
1722 {
1723 gphi *phi = *psi;
1724 tree new_arg = gimple_phi_arg_def_from_edge (gs: phi, e: loop_exit);
1725 if (TREE_CODE (new_arg) != SSA_NAME)
1726 continue;
1727
1728 /* If the loop doesn't have a virtual def then only possibly keep
1729 the epilog LC PHI for it and avoid creating new defs. */
1730 if (virtual_operand_p (op: new_arg) && !need_virtual_phi)
1731 {
1732 auto gsi = gsi_for_stmt (phi);
1733 remove_phi_node (&gsi, true);
1734 continue;
1735 }
1736
1737 /* If we decided not to remove the PHI node we should also not
1738 rematerialize it later on. */
1739 new_phi_args.put (k: new_arg, v: gimple_phi_result (gs: phi));
1740 }
1741
1742 /* Create the merge PHI nodes in new_preheader and populate the
1743 arguments for the exits. */
1744 if (multiple_exits_p || uncounted_p)
1745 {
1746 for (auto gsi_from = gsi_start_phis (loop->header),
1747 gsi_to = gsi_start_phis (new_loop->header);
1748 !gsi_end_p (i: gsi_from) && !gsi_end_p (i: gsi_to);
1749 gsi_next (i: &gsi_from), gsi_next (i: &gsi_to))
1750 {
1751 gimple *from_phi = gsi_stmt (i: gsi_from);
1752 gimple *to_phi = gsi_stmt (i: gsi_to);
1753
1754 /* When the vector loop is peeled then we need to use the
1755 value at start of the loop, otherwise the main loop exit
1756 should use the final iter value. */
1757 tree new_arg;
1758 if (peeled_iters)
1759 new_arg = gimple_phi_result (gs: from_phi);
1760 else
1761 new_arg = PHI_ARG_DEF_FROM_EDGE (from_phi,
1762 loop_latch_edge (loop));
1763
1764 /* Check if we've already created a new phi node during edge
1765 redirection and re-use it if so. Otherwise create a
1766 LC PHI node to feed the merge PHI. */
1767 tree *res;
1768 if (virtual_operand_p (op: new_arg))
1769 {
1770 /* Use the existing virtual LC SSA from exit block. */
1771 gphi *vphi = get_virtual_phi (main_loop_exit_block);
1772 new_arg = gimple_phi_result (gs: vphi);
1773 }
1774 else if ((res = new_phi_args.get (k: new_arg)))
1775 new_arg = *res;
1776 else
1777 {
1778 /* Create the LC PHI node for the exit. */
1779 tree new_def = copy_ssa_name (var: new_arg);
1780 gphi *lc_phi
1781 = create_phi_node (new_def, main_loop_exit_block);
1782 SET_PHI_ARG_DEF (lc_phi, 0, new_arg);
1783 new_arg = new_def;
1784 }
1785
1786 /* Create the PHI node in the merge block merging the
1787 main and early exit values. */
1788 tree new_res = copy_ssa_name (var: gimple_phi_result (gs: from_phi));
1789 gphi *lcssa_phi = create_phi_node (new_res, new_preheader);
1790 edge main_e = single_succ_edge (bb: main_loop_exit_block);
1791 SET_PHI_ARG_DEF_ON_EDGE (lcssa_phi, main_e, new_arg);
1792
1793 /* And adjust the epilog entry value. */
1794 adjust_phi_and_debug_stmts (update_phi: to_phi, e: loop_entry, new_def: new_res);
1795 }
1796 }
1797
1798 if (multiple_exits_p)
1799 {
1800 /* After creating the merge PHIs handle the early exits those
1801 should use the values at the start of the loop. */
1802 for (auto gsi_from = gsi_start_phis (loop->header),
1803 gsi_to = gsi_start_phis (new_preheader);
1804 !gsi_end_p (i: gsi_from) && !gsi_end_p (i: gsi_to);
1805 gsi_next (i: &gsi_from), gsi_next (i: &gsi_to))
1806 {
1807 gimple *from_phi = gsi_stmt (i: gsi_from);
1808 gimple *to_phi = gsi_stmt (i: gsi_to);
1809
1810 /* Now update the virtual PHI nodes with the right value. */
1811 tree alt_arg = gimple_phi_result (gs: from_phi);
1812 if (virtual_operand_p (op: alt_arg))
1813 {
1814 gphi *vphi = get_virtual_phi (alt_loop_exit_block);
1815 alt_arg = gimple_phi_result (gs: vphi);
1816 }
1817 /* For other live args we didn't create LC PHI nodes.
1818 Do so here. */
1819 else
1820 {
1821 tree alt_def = copy_ssa_name (var: alt_arg);
1822 gphi *lc_phi
1823 = create_phi_node (alt_def, alt_loop_exit_block);
1824 for (unsigned i = 0; i < gimple_phi_num_args (gs: lc_phi);
1825 ++i)
1826 SET_PHI_ARG_DEF (lc_phi, i, alt_arg);
1827 alt_arg = alt_def;
1828 }
1829 edge alt_e = single_succ_edge (bb: alt_loop_exit_block);
1830 SET_PHI_ARG_DEF_ON_EDGE (to_phi, alt_e, alt_arg);
1831 }
1832 }
1833 /* For the single exit case only create the missing LC PHI nodes
1834 for the continuation of the loop IVs that are not also already
1835 reductions and thus had LC PHI nodes on the exit already. */
1836 if (!multiple_exits_p && !uncounted_p)
1837 {
1838 for (auto gsi_from = gsi_start_phis (loop->header),
1839 gsi_to = gsi_start_phis (new_loop->header);
1840 !gsi_end_p (i: gsi_from) && !gsi_end_p (i: gsi_to);
1841 gsi_next (i: &gsi_from), gsi_next (i: &gsi_to))
1842 {
1843 gimple *from_phi = gsi_stmt (i: gsi_from);
1844 gimple *to_phi = gsi_stmt (i: gsi_to);
1845 tree new_arg = PHI_ARG_DEF_FROM_EDGE (from_phi,
1846 loop_latch_edge (loop));
1847
1848 /* Check if we've already created a new phi node during edge
1849 redirection. If we have, only propagate the value
1850 downwards. */
1851 if (tree *res = new_phi_args.get (k: new_arg))
1852 {
1853 adjust_phi_and_debug_stmts (update_phi: to_phi, e: loop_entry, new_def: *res);
1854 continue;
1855 }
1856
1857 tree new_res = copy_ssa_name (var: gimple_phi_result (gs: from_phi));
1858 gphi *lcssa_phi = create_phi_node (new_res, new_preheader);
1859 SET_PHI_ARG_DEF_ON_EDGE (lcssa_phi, loop_exit, new_arg);
1860 adjust_phi_and_debug_stmts (update_phi: to_phi, e: loop_entry, new_def: new_res);
1861 }
1862 }
1863 }
1864
1865 if (was_imm_dom || duplicate_outer_loop)
1866 set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);
1867
1868 /* And remove the non-necessary forwarder again. Keep the other
1869 one so we have a proper pre-header for the loop at the exit edge. */
1870 redirect_edge_pred (single_succ_edge (bb: preheader),
1871 single_pred (bb: preheader));
1872 delete_basic_block (preheader);
1873 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1874 loop_preheader_edge (scalar_loop)->src);
1875
1876 /* Finally after wiring the new epilogue we need to update its main exit
1877 to the original function exit we recorded. Other exits are already
1878 correct. */
1879 if (multiple_exits_p || uncounted_p)
1880 {
1881 class loop *update_loop = new_loop;
1882 doms = get_all_dominated_blocks (CDI_DOMINATORS, loop->header);
1883 for (unsigned i = 0; i < doms.length (); ++i)
1884 if (flow_bb_inside_loop_p (loop, doms[i]))
1885 doms.unordered_remove (ix: i);
1886
1887 for (edge e : get_loop_exit_edges (update_loop))
1888 {
1889 edge ex;
1890 edge_iterator ei;
1891 FOR_EACH_EDGE (ex, ei, e->dest->succs)
1892 {
1893 /* Find the first non-fallthrough block as fall-throughs can't
1894 dominate other blocks. */
1895 if (single_succ_p (bb: ex->dest))
1896 {
1897 doms.safe_push (obj: ex->dest);
1898 ex = single_succ_edge (bb: ex->dest);
1899 }
1900 doms.safe_push (obj: ex->dest);
1901 }
1902 doms.safe_push (obj: e->dest);
1903 }
1904
1905 iterate_fix_dominators (CDI_DOMINATORS, doms, false);
1906 if (updated_doms)
1907 updated_doms->safe_splice (src: doms);
1908 }
1909 }
1910 else /* Add the copy at entry. */
1911 {
1912 /* Copy the current loop LC PHI nodes between the original loop exit
1913 block and the new loop header. This allows us to later split the
1914 preheader block and still find the right LC nodes. */
1915 if (flow_loops)
1916 for (auto gsi_from = gsi_start_phis (new_loop->header),
1917 gsi_to = gsi_start_phis (loop->header);
1918 !gsi_end_p (i: gsi_from) && !gsi_end_p (i: gsi_to);
1919 gsi_next (i: &gsi_from), gsi_next (i: &gsi_to))
1920 {
1921 gimple *from_phi = gsi_stmt (i: gsi_from);
1922 gimple *to_phi = gsi_stmt (i: gsi_to);
1923 tree new_arg = PHI_ARG_DEF_FROM_EDGE (from_phi,
1924 loop_latch_edge (new_loop));
1925 adjust_phi_and_debug_stmts (update_phi: to_phi, e: loop_preheader_edge (loop),
1926 new_def: new_arg);
1927 }
1928
1929 if (scalar_loop != loop)
1930 {
1931 /* Remove the non-necessary forwarder of scalar_loop again. */
1932 redirect_edge_pred (single_succ_edge (bb: preheader),
1933 single_pred (bb: preheader));
1934 delete_basic_block (preheader);
1935 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1936 loop_preheader_edge (scalar_loop)->src);
1937 preheader = split_edge (loop_preheader_edge (loop));
1938 entry_e = single_pred_edge (bb: preheader);
1939 }
1940
1941 redirect_edge_and_branch_force (entry_e, new_preheader);
1942 flush_pending_stmts (entry_e);
1943 set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src);
1944
1945
1946 /* `vect_set_loop_condition' replaces the condition in the main exit of
1947 loop. For counted loops, this is the IV counting exit, so in the case
1948 of the prolog loop, we are replacing the old IV counting exit limit of
1949 total loop niters for the new limit of the prolog niters, as desired.
1950 For uncounted loops, we don't have an IV-counting exit to replace, so
1951 we add a dummy exit to be consumed by `vect_set_loop_condition' later
1952 on. */
1953 if (create_main_e)
1954 {
1955 edge to_latch_e = single_pred_edge (bb: new_loop->latch);
1956 bool latch_is_false = to_latch_e->flags & EDGE_FALSE_VALUE ? true
1957 : false;
1958
1959 /* Add new bb for duplicate exit. */
1960 basic_block bbcond = split_edge (to_latch_e);
1961 gimple_stmt_iterator a = gsi_last_bb (bb: bbcond);
1962
1963 /* Fix flags for the edge leading to the latch. */
1964 to_latch_e = find_edge (bbcond, new_loop->latch);
1965 to_latch_e->flags &= ~EDGE_FALLTHRU;
1966 to_latch_e->flags |= latch_is_false ? EDGE_FALSE_VALUE
1967 : EDGE_TRUE_VALUE;
1968
1969 /* Build the condition. */
1970 tree cone = build_int_cst (sizetype, 1);
1971 tree czero = build_int_cst (sizetype, 0);
1972 gcond *cond_copy = gimple_build_cond (NE_EXPR, cone, czero, NULL_TREE,
1973 NULL_TREE);
1974
1975 gsi_insert_after (&a, cond_copy, GSI_NEW_STMT);
1976
1977 /* Add edge for exiting the loop via new condition. */
1978 edge dup_exit = make_edge (bbcond, new_exit->dest, latch_is_false
1979 ? EDGE_TRUE_VALUE : EDGE_FALSE_VALUE);
1980
1981 profile_probability probability = profile_probability::even ();
1982 to_latch_e->probability = dup_exit->probability = probability;
1983
1984 set_immediate_dominator (CDI_DOMINATORS, dup_exit->src,
1985 new_exit->src);
1986 new_exit = dup_exit;
1987 *new_e = new_exit;
1988 }
1989
1990 redirect_edge_and_branch_force (new_exit, preheader);
1991 flush_pending_stmts (new_exit);
1992 set_immediate_dominator (CDI_DOMINATORS, preheader, new_exit->src);
1993
1994 /* And remove the non-necessary forwarder again. Keep the other
1995 one so we have a proper pre-header for the loop at the exit edge. */
1996 redirect_edge_pred (single_succ_edge (bb: new_preheader),
1997 single_pred (bb: new_preheader));
1998 delete_basic_block (new_preheader);
1999 set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
2000 loop_preheader_edge (new_loop)->src);
2001
2002 /* Update dominators for multiple exits. */
2003 if (multiple_exits_p || create_main_e)
2004 {
2005 for (edge alt_e : loop_exits)
2006 {
2007 if ((alt_e == loop_exit) && !create_main_e)
2008 continue;
2009 basic_block old_dom
2010 = get_immediate_dominator (CDI_DOMINATORS, alt_e->dest);
2011 if (flow_bb_inside_loop_p (loop, old_dom))
2012 {
2013 auto_vec<basic_block, 8> queue;
2014 for (auto son = first_dom_son (CDI_DOMINATORS, old_dom);
2015 son; son = next_dom_son (CDI_DOMINATORS, son))
2016 if (!flow_bb_inside_loop_p (loop, son))
2017 queue.safe_push (obj: son);
2018 for (auto son : queue)
2019 set_immediate_dominator (CDI_DOMINATORS,
2020 son, get_bb_copy (old_dom));
2021 }
2022 }
2023 }
2024
2025 /* When loop_exit != scalar_exit due to if-conversion loop versioning,
2026 the `scalar_exit' now has two incoming edges, one from the if-converted
2027 and one from the peeled prolog loop. It is therefore dominated by a
2028 common block between these. Update its dominator accordingly. */
2029 if (create_main_e && loop_exit != scalar_exit)
2030 set_immediate_dominator (CDI_DOMINATORS, scalar_exit->dest,
2031 recompute_dominator (CDI_DOMINATORS,
2032 scalar_exit->dest));
2033 }
2034
2035 free (ptr: new_bbs);
2036 free (ptr: bbs);
2037
2038 checking_verify_dominators (dir: CDI_DOMINATORS);
2039
2040 return new_loop;
2041}
2042
2043
2044/* Given the condition expression COND, put it as the last statement of
2045 GUARD_BB; set both edges' probability; set dominator of GUARD_TO to
2046 DOM_BB; return the skip edge. GUARD_TO is the target basic block to
2047 skip the loop. PROBABILITY is the skip edge's probability. Mark the
2048 new edge as irreducible if IRREDUCIBLE_P is true. */
2049
static edge
slpeel_add_loop_guard (basic_block guard_bb, tree cond,
		       basic_block guard_to, basic_block dom_bb,
		       profile_probability probability, bool irreducible_p)
{
  gimple_stmt_iterator gsi;
  edge new_e, enter_e;
  gcond *cond_stmt;
  gimple_seq gimplify_stmt_list = NULL;

  /* GUARD_BB currently falls through into the loop; that edge becomes
     the "enter the loop" (false) arm of the new condition.  */
  enter_e = EDGE_SUCC (guard_bb, 0);
  enter_e->flags &= ~EDGE_FALLTHRU;
  enter_e->flags |= EDGE_FALSE_VALUE;
  gsi = gsi_last_bb (bb: guard_bb);

  /* COND may not be a valid GIMPLE condition yet; gimplify it and emit
     any statements it needs at the end of GUARD_BB.  */
  cond = force_gimple_operand_1 (cond, &gimplify_stmt_list,
				 is_gimple_condexpr_for_cond, NULL_TREE);
  if (gimplify_stmt_list)
    gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);

  cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
  /* Re-fetch the iterator: the insertion above may have changed the
     last statement of GUARD_BB.  */
  gsi = gsi_last_bb (bb: guard_bb);
  gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);

  /* Add new edge to connect guard block to the merge/loop-exit block.  */
  new_e = make_edge (guard_bb, guard_to, EDGE_TRUE_VALUE);

  new_e->probability = probability;
  if (irreducible_p)
    new_e->flags |= EDGE_IRREDUCIBLE_LOOP;

  /* The two outgoing edges' probabilities must sum to one.  */
  enter_e->probability = probability.invert ();
  set_immediate_dominator (CDI_DOMINATORS, guard_to, dom_bb);

  /* Split enter_e to preserve LOOPS_HAVE_PREHEADERS.  */
  if (enter_e->dest->loop_father->header == enter_e->dest)
    split_edge (enter_e);

  return new_e;
}
2090
2091
2092/* This function verifies that the following restrictions apply to LOOP:
2093 (1) it consists of exactly 2 basic blocks - header, and an empty latch
2094 for innermost loop and 5 basic blocks for outer-loop.
2095 (2) it is single entry, single exit
2096 (3) its exit condition is the last stmt in the header
2097 (4) E is the entry/exit edge of LOOP.
2098 */
2099
2100bool
2101slpeel_can_duplicate_loop_p (const class loop *loop, const_edge exit_e,
2102 const_edge e)
2103{
2104 edge entry_e = loop_preheader_edge (loop);
2105 gcond *orig_cond = get_loop_exit_condition (exit_e);
2106 gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (bb: exit_e->src);
2107
2108 /* All loops have an outer scope; the only case loop->outer is NULL is for
2109 the function itself. */
2110 if (!loop_outer (loop)
2111 || !empty_block_p (loop->latch)
2112 || !exit_e
2113 /* Verify that new loop exit condition can be trivially modified. */
2114 || (!orig_cond || orig_cond != gsi_stmt (i: loop_exit_gsi))
2115 || (e != exit_e && e != entry_e))
2116 return false;
2117
2118 basic_block *bbs = XNEWVEC (basic_block, loop->num_nodes);
2119 get_loop_body_with_size (loop, bbs, loop->num_nodes);
2120 bool ret = can_copy_bbs_p (bbs, loop->num_nodes);
2121 free (ptr: bbs);
2122 return ret;
2123}
2124
2125/* Function find_loop_location.
2126
2127 Extract the location of the loop in the source code.
2128 If the loop is not well formed for vectorization, an estimated
2129 location is calculated.
2130 Return the loop location if succeed and NULL if not. */
2131
2132dump_user_location_t
2133find_loop_location (class loop *loop)
2134{
2135 gimple *stmt = NULL;
2136 basic_block bb;
2137 gimple_stmt_iterator si;
2138
2139 if (!loop)
2140 return dump_user_location_t ();
2141
2142 /* For the root of the loop tree return the function location. */
2143 if (!loop_outer (loop))
2144 return dump_user_location_t::from_function_decl (cfun->decl);
2145
2146 if (loops_state_satisfies_p (flags: LOOPS_HAVE_RECORDED_EXITS))
2147 {
2148 /* We only care about the loop location, so use any exit with location
2149 information. */
2150 for (edge e : get_loop_exit_edges (loop))
2151 {
2152 stmt = get_loop_exit_condition (e);
2153
2154 if (stmt
2155 && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
2156 return stmt;
2157 }
2158 }
2159
2160 /* If we got here the loop is probably not "well formed",
2161 try to estimate the loop location */
2162
2163 if (!loop->header)
2164 return dump_user_location_t ();
2165
2166 bb = loop->header;
2167
2168 for (si = gsi_start_bb (bb); !gsi_end_p (i: si); gsi_next (i: &si))
2169 {
2170 stmt = gsi_stmt (i: si);
2171 if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
2172 return stmt;
2173 }
2174
2175 return dump_user_location_t ();
2176}
2177
2178/* Return true if the phi described by STMT_INFO defines an IV of the
2179 loop to be vectorized. */
2180
2181static bool
2182iv_phi_p (stmt_vec_info stmt_info)
2183{
2184 gphi *phi = as_a <gphi *> (p: stmt_info->stmt);
2185 if (virtual_operand_p (PHI_RESULT (phi)))
2186 return false;
2187
2188 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2189 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2190 return false;
2191
2192 return true;
2193}
2194
2195/* Return true if vectorizer can peel for nonlinear iv. */
2196static bool
2197vect_can_peel_nonlinear_iv_p (loop_vec_info loop_vinfo,
2198 stmt_vec_info stmt_info)
2199{
2200 enum vect_induction_op_type induction_type
2201 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
2202 tree niters_skip;
2203 /* Init_expr will be update by vect_update_ivs_after_vectorizer,
2204 if niters or vf is unkown:
2205 For shift, when shift mount >= precision, there would be UD.
2206 For mult, don't known how to generate
2207 init_expr * pow (step, niters) for variable niters.
2208 For neg unknown niters are ok, since niters of vectorized main loop
2209 will always be multiple of 2.
2210 See also PR113163, PR114196 and PR114485. */
2211 if (!LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()
2212 || LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2213 || (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2214 && induction_type != vect_step_op_neg))
2215 {
2216 if (dump_enabled_p ())
2217 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2218 "Peeling for epilogue is not supported"
2219 " for this nonlinear induction"
2220 " when iteration count is unknown or"
2221 " when using partial vectorization.\n");
2222 return false;
2223 }
2224
2225 if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
2226 && induction_type == vect_step_op_mul)
2227 {
2228 if (dump_enabled_p ())
2229 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2230 "Peeling for is not supported for nonlinear mult"
2231 " induction using partial vectorization.\n");
2232 return false;
2233 }
2234
2235 /* Avoid compile time hog on vect_peel_nonlinear_iv_init. */
2236 if (induction_type == vect_step_op_mul)
2237 {
2238 tree step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
2239 tree type = TREE_TYPE (step_expr);
2240
2241 if (wi::exact_log2 (wi::to_wide (t: step_expr)) == -1
2242 && LOOP_VINFO_INT_NITERS(loop_vinfo) >= TYPE_PRECISION (type))
2243 {
2244 if (dump_enabled_p ())
2245 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2246 "Avoid compile time hog on"
2247 " vect_peel_nonlinear_iv_init"
2248 " for nonlinear induction vec_step_op_mul"
2249 " when iteration count is too big.\n");
2250 return false;
2251 }
2252 }
2253
2254 /* Also doens't support peel for neg when niter is variable.
2255 ??? generate something like niter_expr & 1 ? init_expr : -init_expr? */
2256 niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
2257 if ((niters_skip != NULL_TREE
2258 && (TREE_CODE (niters_skip) != INTEGER_CST
2259 || (HOST_WIDE_INT) TREE_INT_CST_LOW (niters_skip) < 0))
2260 || (!vect_use_loop_mask_for_alignment_p (loop_vinfo)
2261 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0))
2262 {
2263 if (dump_enabled_p ())
2264 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2265 "Peeling for alignement is not supported"
2266 " for nonlinear induction when niters_skip"
2267 " is not constant.\n");
2268 return false;
2269 }
2270
2271 return true;
2272}
2273
2274/* Function vect_can_advance_ivs_p
2275
2276 In case the number of iterations that LOOP iterates is unknown at compile
2277 time, an epilog loop will be generated, and the loop induction variables
2278 (IVs) will be "advanced" to the value they are supposed to take just before
2279 the epilog loop. Here we check that the access function of the loop IVs
2280 and the expression that represents the loop bound are simple enough.
2281 These restrictions will be relaxed in the future. */
2282
2283bool
2284vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
2285{
2286 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2287 basic_block bb = loop->header;
2288 gphi_iterator gsi;
2289
2290 /* Analyze phi functions of the loop header. */
2291
2292 if (dump_enabled_p ())
2293 dump_printf_loc (MSG_NOTE, vect_location, "vect_can_advance_ivs_p:\n");
2294 for (gsi = gsi_start_phis (bb); !gsi_end_p (i: gsi); gsi_next (i: &gsi))
2295 {
2296 tree evolution_part;
2297 enum vect_induction_op_type induction_type;
2298
2299 gphi *phi = gsi.phi ();
2300 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
2301 if (dump_enabled_p ())
2302 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
2303 phi_info->stmt);
2304
2305 /* Skip virtual phi's. The data dependences that are associated with
2306 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.
2307
2308 Skip reduction phis. */
2309 if (!iv_phi_p (stmt_info: phi_info))
2310 {
2311 if (dump_enabled_p ())
2312 dump_printf_loc (MSG_NOTE, vect_location,
2313 "reduc or virtual phi. skip.\n");
2314 continue;
2315 }
2316
2317 induction_type = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (phi_info);
2318 if (induction_type != vect_step_op_add)
2319 {
2320 if (!vect_can_peel_nonlinear_iv_p (loop_vinfo, stmt_info: phi_info))
2321 return false;
2322
2323 continue;
2324 }
2325
2326 /* Analyze the evolution function. */
2327
2328 evolution_part = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
2329 if (evolution_part == NULL_TREE)
2330 {
2331 if (dump_enabled_p ())
2332 dump_printf (MSG_MISSED_OPTIMIZATION,
2333 "No access function or evolution.\n");
2334 return false;
2335 }
2336
2337 /* FORNOW: We do not transform initial conditions of IVs
2338 which evolution functions are not invariants in the loop. */
2339
2340 if (!expr_invariant_in_loop_p (loop, evolution_part))
2341 {
2342 if (dump_enabled_p ())
2343 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2344 "evolution not invariant in loop.\n");
2345 return false;
2346 }
2347
2348 /* FORNOW: We do not transform initial conditions of IVs
2349 which evolution functions are a polynomial of degree >= 2. */
2350
2351 if (tree_is_chrec (expr: evolution_part))
2352 {
2353 if (dump_enabled_p ())
2354 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2355 "evolution is chrec.\n");
2356 return false;
2357 }
2358 }
2359
2360 return true;
2361}
2362
2363
2364/* Function vect_update_ivs_after_vectorizer.
2365
2366 "Advance" the induction variables of LOOP to the value they should take
2367 after the execution of LOOP. This is currently necessary because the
2368 vectorizer does not handle induction variables that are used after the
2369 loop. Such a situation occurs when the last iterations of LOOP are
2370 peeled, because:
2371 1. We introduced new uses after LOOP for IVs that were not originally used
2372 after LOOP: the IVs of LOOP are now used by an epilog loop.
2373 2. LOOP is going to be vectorized; this means that it will iterate N/VF
2374 times, whereas the loop IVs should be bumped N times.
2375
2376 Input:
2377 - LOOP - a loop that is going to be vectorized. The last few iterations
2378 of LOOP were peeled.
2379 - NITERS - the number of iterations that LOOP executes (before it is
2380 vectorized). i.e, the number of times the ivs should be bumped.
2381 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
2382 coming out from LOOP on which there are uses of the LOOP ivs
2383 (this is the path from LOOP->exit to epilog_loop->preheader).
2384
2385 The new definitions of the ivs are placed in LOOP->exit.
2386 The phi args associated with the edge UPDATE_E in the bb
2387 UPDATE_E->dest are updated accordingly.
2388
2389 - EARLY_EXIT_P - Indicates whether the exit is an early exit rather than
2390 the main latch exit.
2391
2392 Assumption 1: Like the rest of the vectorizer, this function assumes
2393 a single loop exit that has a single predecessor.
2394
2395 Assumption 2: The phi nodes in the LOOP header and in update_bb are
2396 organized in the same order.
2397
2398 Assumption 3: The access function of the ivs is simple enough (see
2399 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
2400
2401 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
2402 coming out of LOOP on which the ivs of LOOP are used (this is the path
2403 that leads to the epilog loop; other paths skip the epilog loop). This
2404 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
2405 needs to have its phis updated.
2406 */
2407
static void
vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
				  tree niters, edge update_e,
				  bool early_exit_p)
{
  gphi_iterator gsi, gsi1;
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block update_bb = update_e->dest;
  basic_block exit_bb = update_e->src;
  /* Check to see if this is an empty loop pre-header block.  If it exists
     we need to use the edge from that block -> loop header for updates but
     must use the original exit_bb to add any new adjustment because there
     can be a skip_epilog edge bypassing the epilog and so the loop pre-header
     too.  */
  if (empty_block_p (update_bb) && single_succ_p (bb: update_bb))
    {
      update_e = single_succ_edge (bb: update_bb);
      update_bb = update_e->dest;
    }
  gimple_stmt_iterator last_gsi = gsi_last_bb (bb: exit_bb);

  /* Walk the loop-header PHIs and the PHIs of UPDATE_BB in lockstep;
     assumption 2 above guarantees they are in the same order.  */
  for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
       !gsi_end_p (i: gsi) && !gsi_end_p (i: gsi1);
       gsi_next (i: &gsi), gsi_next (i: &gsi1))
    {
      tree init_expr;
      tree step_expr, off;
      tree type;
      tree var, ni, ni_name;

      gphi *phi = gsi.phi ();
      gphi *phi1 = gsi1.phi ();
      stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "vect_update_ivs_after_vectorizer: phi: %G",
			 (gimple *) phi);

      /* Skip reduction and virtual phis.  */
      if (!iv_phi_p (stmt_info: phi_info))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "reduc or virtual phi. skip.\n");
	  continue;
	}

      type = TREE_TYPE (gimple_phi_result (phi));
      step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
      /* Unshare: the expression will be folded into a new tree below.  */
      step_expr = unshare_expr (step_expr);

      /* FORNOW: We do not support IVs whose evolution function is a polynomial
	 of degree >= 2 or exponential.  */
      gcc_assert (!tree_is_chrec (step_expr));

      init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
      gimple_seq stmts = NULL;
      enum vect_induction_op_type induction_type
	= STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (phi_info);

      if (induction_type == vect_step_op_add)
	{
	  /* Additive IV: the advanced value is init + niters * step,
	     using pointer arithmetic when the IV is a pointer.  */
	  tree stype = TREE_TYPE (step_expr);
	  off = fold_build2 (MULT_EXPR, stype,
			     fold_convert (stype, niters), step_expr);

	  if (POINTER_TYPE_P (type))
	    ni = fold_build_pointer_plus (init_expr, off);
	  else
	    ni = fold_convert (type,
			       fold_build2 (PLUS_EXPR, stype,
					    fold_convert (stype, init_expr),
					    off));
	}
      /* Don't bother calling vect_peel_nonlinear_iv_init: for neg the
	 start value is unchanged (niters of the main loop is even).  */
      else if (induction_type == vect_step_op_neg)
	ni = init_expr;
      else
	ni = vect_peel_nonlinear_iv_init (&stmts, init_expr,
					  niters, step_expr,
					  induction_type, early_exit_p);

      var = create_tmp_var (type, "tmp");

      gimple_seq new_stmts = NULL;
      ni_name = force_gimple_operand (ni, &new_stmts, false, var);

      /* Exit_bb shouldn't be empty, but we also can't insert after a ctrl
	 statement.  */
      if (!gsi_end_p (i: last_gsi) && !is_ctrl_stmt (gsi_stmt (i: last_gsi)))
	{
	  gsi_insert_seq_after (&last_gsi, stmts, GSI_SAME_STMT);
	  gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
	}
      else
	{
	  gsi_insert_seq_before (&last_gsi, stmts, GSI_SAME_STMT);
	  gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
	}

      /* Update the PHI argument on the requested edge.  */
      adjust_phi_and_debug_stmts (update_phi: phi1, e: update_e, new_def: ni_name);
    }
}
2512
2513/* Return a gimple value containing the misalignment (measured in vector
2514 elements) for the loop described by LOOP_VINFO, i.e. how many elements
2515 it is away from a perfectly aligned address. Add any new statements
2516 to SEQ. */
2517
static tree
get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
{
  dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
  stmt_vec_info stmt_info = dr_info->stmt;
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);

  poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
  unsigned HOST_WIDE_INT target_align_c;
  tree target_align_minus_1;

  /* For a negative-step access compensate the start address by the size
     of a vector's worth of elements minus one element, so the alignment
     of the last touched element is measured.  */
  bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
					size_zero_node) < 0;
  tree offset = (negative
		 ? size_int ((-TYPE_VECTOR_SUBPARTS (vectype) + 1)
			     * TREE_INT_CST_LOW
			       (TYPE_SIZE_UNIT (TREE_TYPE (vectype))))
		 : size_zero_node);
  tree start_addr = vect_create_addr_base_for_vector_ref (loop_vinfo,
							  stmt_info, seq,
							  offset);
  tree type = unsigned_type_for (TREE_TYPE (start_addr));
  /* Build TARGET_ALIGN - 1 as the misalignment mask; when the alignment
     is not a compile-time constant build the subtraction as a tree.  */
  if (target_align.is_constant (const_value: &target_align_c))
    target_align_minus_1 = build_int_cst (type, target_align_c - 1);
  else
    {
      tree vla = build_int_cst (type, target_align);
      target_align_minus_1 = fold_build2 (MINUS_EXPR, type, vla,
					  build_int_cst (type, 1));
    }

  HOST_WIDE_INT elem_size
    = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
  /* Element sizes are powers of two, so divide by shifting.  */
  tree elem_size_log = build_int_cst (type, exact_log2 (x: elem_size));

  /* Create:  misalign_in_bytes = addr & (target_align - 1).  */
  tree int_start_addr = fold_convert (type, start_addr);
  tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr,
					target_align_minus_1);

  /* Create:  misalign_in_elems = misalign_in_bytes / element_size.  */
  tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes,
					elem_size_log);

  return misalign_in_elems;
}
2564
2565/* Function vect_gen_prolog_loop_niters
2566
2567 Generate the number of iterations which should be peeled as prolog for the
2568 loop represented by LOOP_VINFO. It is calculated as the misalignment of
2569 DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
2570 As a result, after the execution of this loop, the data reference DR will
2571 refer to an aligned location. The following computation is generated:
2572
2573 If the misalignment of DR is known at compile time:
2574 addr_mis = int mis = DR_MISALIGNMENT (dr);
2575 Else, compute address misalignment in bytes:
2576 addr_mis = addr & (target_align - 1)
2577
2578 prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step
2579
2580 (elem_size = element type size; an element is the scalar element whose type
2581 is the inner type of the vectype)
2582
2583 The computations will be emitted at the end of BB. We also compute and
2584 store upper bound (included) of the result in BOUND.
2585
2586 When the step of the data-ref in the loop is not 1 (as in interleaved data
2587 and SLP), the number of iterations of the prolog must be divided by the step
2588 (which is equal to the size of interleaved group).
2589
2590 The above formulas assume that VF == number of elements in the vector. This
2591 may not hold when there are multiple-types in the loop.
2592 In this case, for some data-references in the loop the VF does not represent
2593 the number of elements that fit in the vector. Therefore, instead of VF we
2594 use TYPE_VECTOR_SUBPARTS. */
2595
static tree
vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
			     basic_block bb, poly_int64 *bound)
{
  dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
  tree var;
  /* Uncounted loops have no NITERS expression to take the type from;
     fall back to sizetype.  */
  tree niters_type
    = LOOP_VINFO_NITERS_UNCOUNTED_P (loop_vinfo) ? sizetype
						 : TREE_TYPE (LOOP_VINFO_NITERS
								(loop_vinfo));
  gimple_seq stmts = NULL, new_stmts = NULL;
  tree iters, iters_name;
  stmt_vec_info stmt_info = dr_info->stmt;
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);

  if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
    {
      /* The misalignment is known at compile time, so the peel count is
	 a simple constant.  */
      int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "known peeling = %d.\n", npeel);

      iters = build_int_cst (niters_type, npeel);
      *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
    }
  else
    {
      /* Compute the misalignment at runtime and derive from it the
	 number of scalar iterations to peel.  */
      tree misalign_in_elems = get_misalign_in_elems (seq: &stmts, loop_vinfo);
      tree type = TREE_TYPE (misalign_in_elems);
      HOST_WIDE_INT elem_size
	= int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
      /* We only do prolog peeling if the target alignment is known at compile
	 time.  */
      poly_uint64 align_in_elems =
	exact_div (a: target_align, b: elem_size);
      tree align_in_elems_minus_1 =
	build_int_cst (type, align_in_elems - 1);
      tree align_in_elems_tree = build_int_cst (type, align_in_elems);

      /* Create:  (niters_type) ((align_in_elems - misalign_in_elems)
				 & (align_in_elems - 1)).
	 For negative-step accesses the peel count grows with the
	 misalignment instead, hence the swapped subtraction.  */
      bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
					    size_zero_node) < 0;
      if (negative)
	iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
			     align_in_elems_tree);
      else
	iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree,
			     misalign_in_elems);
      iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1);
      iters = fold_convert (niters_type, iters);
      /* Record the inclusive upper bound of the result.  */
      *bound = align_in_elems;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "niters for prolog loop: %T\n", iters);

  var = create_tmp_var (niters_type, "prolog_loop_niters");
  iters_name = force_gimple_operand (iters, &new_stmts, false, var);

  if (new_stmts)
    gimple_seq_add_seq (&stmts, new_stmts);
  if (stmts)
    {
      gcc_assert (single_succ_p (bb));
      /* Emit the computation at the end of BB (or at its start when BB
	 is still empty).  */
      gimple_stmt_iterator gsi = gsi_last_bb (bb);
      if (gsi_end_p (i: gsi))
	gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
      else
	gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
    }
  return iters_name;
}
2672
2673
2674/* Function vect_update_init_of_dr
2675
2676 If CODE is PLUS, the vector loop starts NITERS iterations after the
2677 scalar one, otherwise CODE is MINUS and the vector loop starts NITERS
2678 iterations before the scalar one (using masking to skip inactive
2679 elements). This function updates the information recorded in DR to
2680 account for the difference. Specifically, it updates the OFFSET
2681 field of DR_INFO. */
2682
2683static void
2684vect_update_init_of_dr (dr_vec_info *dr_info, tree niters, tree_code code)
2685{
2686 struct data_reference *dr = dr_info->dr;
2687 tree offset = dr_info->offset;
2688 if (!offset)
2689 offset = build_zero_cst (sizetype);
2690
2691 niters = fold_build2 (MULT_EXPR, sizetype,
2692 fold_convert (sizetype, niters),
2693 fold_convert (sizetype, DR_STEP (dr)));
2694 offset = fold_build2 (code, sizetype,
2695 fold_convert (sizetype, offset), niters);
2696 dr_info->offset = offset;
2697}
2698
2699
2700/* Function vect_update_inits_of_drs
2701
2702 Apply vect_update_inits_of_dr to all accesses in LOOP_VINFO.
2703 CODE and NITERS are as for vect_update_inits_of_dr. */
2704
2705void
2706vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
2707 tree_code code)
2708{
2709 unsigned int i;
2710 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2711 struct data_reference *dr;
2712
2713 DUMP_VECT_SCOPE ("vect_update_inits_of_dr");
2714
2715 /* Adjust niters to sizetype. We used to insert the stmts on loop preheader
2716 here, but since we might use these niters to update the epilogues niters
2717 and data references we can't insert them here as this definition might not
2718 always dominate its uses. */
2719 if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
2720 niters = fold_convert (sizetype, niters);
2721
2722 FOR_EACH_VEC_ELT (datarefs, i, dr)
2723 {
2724 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2725 if (!STMT_VINFO_GATHER_SCATTER_P (dr_info->stmt)
2726 && !STMT_VINFO_SIMD_LANE_ACCESS_P (dr_info->stmt))
2727 vect_update_init_of_dr (dr_info, niters, code);
2728 }
2729}
2730
2731/* For the information recorded in LOOP_VINFO prepare the loop for peeling
2732 by masking. This involves calculating the number of iterations to
2733 be peeled and then aligning all memory references appropriately. */
2734
2735void
2736vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
2737{
2738 tree misalign_in_elems;
2739 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
2740
2741 gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));
2742
2743 /* From the information recorded in LOOP_VINFO get the number of iterations
2744 that need to be skipped via masking. */
2745 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2746 {
2747 poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2748 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
2749 misalign_in_elems = build_int_cst (type, misalign);
2750 }
2751 else
2752 {
2753 gimple_seq seq1 = NULL, seq2 = NULL;
2754 misalign_in_elems = get_misalign_in_elems (seq: &seq1, loop_vinfo);
2755 misalign_in_elems = fold_convert (type, misalign_in_elems);
2756 misalign_in_elems = force_gimple_operand (misalign_in_elems,
2757 &seq2, true, NULL_TREE);
2758 gimple_seq_add_seq (&seq1, seq2);
2759 if (seq1)
2760 {
2761 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
2762 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1);
2763 gcc_assert (!new_bb);
2764 }
2765 }
2766
2767 if (dump_enabled_p ())
2768 dump_printf_loc (MSG_NOTE, vect_location,
2769 "misalignment for fully-masked loop: %T\n",
2770 misalign_in_elems);
2771
2772 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems;
2773
2774 vect_update_inits_of_drs (loop_vinfo, niters: misalign_in_elems, code: MINUS_EXPR);
2775}
2776
2777/* This function builds ni_name = number of iterations. Statements
2778 are emitted on the loop preheader edge. If NEW_VAR_P is not NULL, set
2779 it to TRUE if new ssa_var is generated. */
2780
2781tree
2782vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
2783{
2784 if (LOOP_VINFO_NITERS_UNCOUNTED_P (loop_vinfo))
2785 return NULL_TREE;
2786 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
2787 if (TREE_CODE (ni) == INTEGER_CST)
2788 return ni;
2789 else
2790 {
2791 tree ni_name, var;
2792 gimple_seq stmts = NULL;
2793 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
2794
2795 var = create_tmp_var (TREE_TYPE (ni), "niters");
2796 ni_name = force_gimple_operand (ni, &stmts, false, var);
2797 if (stmts)
2798 {
2799 gsi_insert_seq_on_edge_immediate (pe, stmts);
2800 if (new_var_p != NULL)
2801 *new_var_p = true;
2802 }
2803
2804 return ni_name;
2805 }
2806}
2807
2808/* Calculate the number of iterations above which vectorized loop will be
2809 preferred than scalar loop. NITERS_PROLOG is the number of iterations
2810 of prolog loop. If it's integer const, the integer number is also passed
2811 in INT_NITERS_PROLOG. BOUND_PROLOG is the upper bound (inclusive) of the
2812 number of iterations of the prolog loop. BOUND_EPILOG is the corresponding
2813 value for the epilog loop. If CHECK_PROFITABILITY is true, TH is the
2814 threshold below which the scalar (rather than vectorized) loop will be
2815 executed. This function stores the upper bound (inclusive) of the result
2816 in BOUND_SCALAR. */
2817
2818static tree
2819vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
2820 poly_int64 bound_prolog, poly_int64 bound_epilog,
2821 int th, poly_uint64 *bound_scalar,
2822 bool check_profitability)
2823{
2824 tree type = TREE_TYPE (niters_prolog);
2825 tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
2826 build_int_cst (type, bound_epilog));
2827
2828 *bound_scalar = bound_prolog + bound_epilog;
2829 if (check_profitability)
2830 {
2831 /* TH indicates the minimum niters of vectorized loop, while we
2832 compute the maximum niters of scalar loop. */
2833 th--;
2834 /* Peeling for constant times. */
2835 if (int_niters_prolog >= 0)
2836 {
2837 *bound_scalar = upper_bound (a: int_niters_prolog + bound_epilog, b: th);
2838 return build_int_cst (type, *bound_scalar);
2839 }
2840 /* Peeling an unknown number of times. Note that both BOUND_PROLOG
2841 and BOUND_EPILOG are inclusive upper bounds. */
2842 if (known_ge (th, bound_prolog + bound_epilog))
2843 {
2844 *bound_scalar = th;
2845 return build_int_cst (type, th);
2846 }
2847 /* Need to do runtime comparison. */
2848 else if (maybe_gt (th, bound_epilog))
2849 {
2850 *bound_scalar = upper_bound (a: *bound_scalar, b: th);
2851 return fold_build2 (MAX_EXPR, type,
2852 build_int_cst (type, th), niters);
2853 }
2854 }
2855 return niters;
2856}
2857
/* NITERS is the number of times that the original scalar loop executes
   after peeling.  Work out the maximum number of iterations N that can
   be handled by the vectorized form of the loop and then either:

   a) set *STEP_VECTOR_PTR to the vectorization factor and generate:

	niters_vector = N

   b) set *STEP_VECTOR_PTR to one and generate:

	niters_vector = N / vf

   In both cases, store niters_vector in *NITERS_VECTOR_PTR and add
   any new statements on the loop preheader edge.  NITERS_NO_OVERFLOW
   is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero).

   Case (a) is used for LOOP_VINFO_USING_PARTIAL_VECTORS_P or if VF is
   variable.  As stated above, NITERS_VECTOR then equals the number
   of scalar iterations and vect_set_loop_condition will handle the
   step.  As opposed to (b) we don't know anything about NITER_VECTOR's
   range here.
*/

void
vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
			     tree *niters_vector_ptr, tree *step_vector_ptr,
			     bool niters_no_overflow)
{
  tree ni_minus_gap, var;
  tree niters_vector, step_vector;
  tree type = niters ? TREE_TYPE (niters) : sizetype;
  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));

  /* If epilogue loop is required because of data accesses with gaps, we
     subtract one iteration from the total number of iterations here for
     correct calculation of RATIO.  */
  if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
    {
      ni_minus_gap = fold_build2 (MINUS_EXPR, type, niters,
				  build_one_cst (type));
      if (!is_gimple_val (ni_minus_gap))
	{
	  /* Materialize the subtraction on the preheader edge.  */
	  var = create_tmp_var (type, "ni_gap");
	  gimple *stmts = NULL;
	  ni_minus_gap = force_gimple_operand (ni_minus_gap, &stmts,
					       true, var);
	  gsi_insert_seq_on_edge_immediate (pe, stmts);
	}
    }
  else
    ni_minus_gap = niters;

  /* To silence some unexpected warnings, simply initialize to 0.  */
  unsigned HOST_WIDE_INT const_vf = 0;
  if (vf.is_constant (&const_vf)
      && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
    {
      /* Case (b): constant VF, no partial vectors.  */
      /* Create: niters / vf, which is equivalent to niters >> log2(vf) when
	 vf is a power of two, and when not we approximate using a
	 truncating division.  */
      /* If it's known that niters == number of latch executions + 1 doesn't
	 overflow, we can generate niters / vf; otherwise we generate
	 (niters - vf) / vf + 1 by using the fact that we know ratio
	 will be at least one.  */
      tree var_vf = build_int_cst (type, const_vf);
      if (niters_no_overflow)
	niters_vector = fold_build2 (TRUNC_DIV_EXPR, type, ni_minus_gap,
				     var_vf);
      else
	niters_vector
	  = fold_build2 (PLUS_EXPR, type,
			 fold_build2 (TRUNC_DIV_EXPR, type,
				      fold_build2 (MINUS_EXPR, type,
						   ni_minus_gap,
						   var_vf),
				      var_vf),
			 build_int_cst (type, 1));
      step_vector = build_one_cst (type);
    }
  else
    {
      /* Case (a): pass the scalar count through and let the step carry
	 the VF.  */
      niters_vector = ni_minus_gap;
      step_vector = build_int_cst (type, vf);
    }

  if (!is_gimple_val (niters_vector))
    {
      var = create_tmp_var (type, "bnd");
      gimple_seq stmts = NULL;
      niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
      gsi_insert_seq_on_edge_immediate (pe, stmts);
      /* Peeling algorithm guarantees that vector loop bound is at least ONE,
	 we set range information to make niters analyzer's life easier.
	 Note the number of latch iteration value can be TYPE_MAX_VALUE so
	 we have to represent the vector niter TYPE_MAX_VALUE + 1 / vf.  */
      if (stmts != NULL
	  && integer_onep (step_vector))
	{
	  if (niters_no_overflow)
	    {
	      /* Range is [1, TYPE_MAX / vf].  */
	      int_range<1> vr (type,
			       wi::one (TYPE_PRECISION (type)),
			       wi::div_trunc (wi::max_value
						(TYPE_PRECISION (type),
						 TYPE_SIGN (type)),
					      const_vf,
					      TYPE_SIGN (type)));
	      set_range_info (niters_vector, vr);
	    }
	  /* For VF == 1 the vector IV might also overflow so we cannot
	     assert a minimum value of 1.  */
	  else if (const_vf > 1)
	    {
	      /* Upper bound accounts for the (niters - vf) / vf + 1 form
		 used when NITERS may overflow.  */
	      int_range<1> vr (type,
			       wi::one (TYPE_PRECISION (type)),
			       wi::rshift (wi::max_value (TYPE_PRECISION (type),
							  TYPE_SIGN (type))
					   - (const_vf - 1),
					   exact_log2 (const_vf),
					   TYPE_SIGN (type))
			       + 1);
	      set_range_info (niters_vector, vr);
	    }
	}
    }
  *niters_vector_ptr = niters_vector;
  *step_vector_ptr = step_vector;

  return;
}
2988
2989/* Given NITERS_VECTOR which is the number of iterations for vectorized
2990 loop specified by LOOP_VINFO after vectorization, compute the number
2991 of iterations before vectorization (niters_vector * vf) and store it
2992 to NITERS_VECTOR_MULT_VF_PTR. */
2993
2994static void
2995vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
2996 tree niters_vector,
2997 tree *niters_vector_mult_vf_ptr)
2998{
2999 /* We should be using a step_vector of VF if VF is variable. */
3000 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
3001 tree type = TREE_TYPE (niters_vector);
3002 tree tree_vf = build_int_cst (type, vf);
3003 basic_block exit_bb = LOOP_VINFO_MAIN_EXIT (loop_vinfo)->dest;
3004
3005 gcc_assert (niters_vector_mult_vf_ptr != NULL);
3006 tree niters_vector_mult_vf = fold_build2 (MULT_EXPR, type,
3007 niters_vector, tree_vf);
3008
3009 /* If we've peeled a vector iteration then subtract one full vector
3010 iteration. */
3011 if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
3012 niters_vector_mult_vf = fold_build2 (MINUS_EXPR, type,
3013 niters_vector_mult_vf, tree_vf);
3014
3015 if (!is_gimple_val (niters_vector_mult_vf))
3016 {
3017 tree var = create_tmp_var (type, "niters_vector_mult_vf");
3018 gimple_seq stmts = NULL;
3019 niters_vector_mult_vf = force_gimple_operand (niters_vector_mult_vf,
3020 &stmts, true, var);
3021 gimple_stmt_iterator gsi = gsi_start_bb (bb: exit_bb);
3022 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
3023 }
3024 *niters_vector_mult_vf_ptr = niters_vector_mult_vf;
3025}
3026
3027/* Function slpeel_add_loop_guard adds guard skipping from the beginning
3028 of SKIP_LOOP to the beginning of UPDATE_LOOP. GUARD_EDGE and MERGE_EDGE
3029 are two pred edges of the merge point before UPDATE_LOOP. The two loops
3030 appear like below:
3031
3032 guard_bb:
3033 if (cond)
3034 goto merge_bb;
3035 else
3036 goto skip_loop;
3037
3038 skip_loop:
3039 header_a:
3040 i_1 = PHI<i_0, i_2>;
3041 ...
3042 i_2 = i_1 + 1;
3043 if (cond_a)
3044 goto latch_a;
3045 else
3046 goto exit_a;
3047 latch_a:
3048 goto header_a;
3049
3050 exit_a:
3051 i_5 = PHI<i_2>;
3052
3053 merge_bb:
3054 ;; PHI (i_x = PHI<i_0, i_5>) to be created at merge point.
3055
3056 update_loop:
3057 header_b:
3058 i_3 = PHI<i_5, i_4>; ;; Use of i_5 to be replaced with i_x.
3059 ...
3060 i_4 = i_3 + 1;
3061 if (cond_b)
3062 goto latch_b;
3063 else
3064 goto exit_bb;
3065 latch_b:
3066 goto header_b;
3067
3068 exit_bb:
3069
3070 This function creates PHI nodes at merge_bb and replaces the use of i_5
3071 in the update_loop's PHI node with the result of new PHI result. */
3072
3073static void
3074slpeel_update_phi_nodes_for_guard1 (class loop *skip_loop,
3075 class loop *update_loop,
3076 edge guard_edge, edge merge_edge)
3077{
3078 location_t merge_loc, guard_loc;
3079 edge orig_e = loop_preheader_edge (skip_loop);
3080 edge update_e = loop_preheader_edge (update_loop);
3081 gphi_iterator gsi_orig, gsi_update;
3082
3083 for ((gsi_orig = gsi_start_phis (skip_loop->header),
3084 gsi_update = gsi_start_phis (update_loop->header));
3085 !gsi_end_p (i: gsi_orig) && !gsi_end_p (i: gsi_update);
3086 gsi_next (i: &gsi_orig), gsi_next (i: &gsi_update))
3087 {
3088 gphi *orig_phi = gsi_orig.phi ();
3089 gphi *update_phi = gsi_update.phi ();
3090
3091 /* Generate new phi node at merge bb of the guard. */
3092 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
3093 gphi *new_phi = create_phi_node (new_res, guard_edge->dest);
3094
3095 /* Merge bb has two incoming edges: GUARD_EDGE and MERGE_EDGE. Set the
3096 args in NEW_PHI for these edges. */
3097 tree merge_arg = PHI_ARG_DEF_FROM_EDGE (update_phi, update_e);
3098 tree guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
3099 merge_loc = gimple_phi_arg_location_from_edge (phi: update_phi, e: update_e);
3100 guard_loc = gimple_phi_arg_location_from_edge (phi: orig_phi, e: orig_e);
3101 add_phi_arg (new_phi, merge_arg, merge_edge, merge_loc);
3102 add_phi_arg (new_phi, guard_arg, guard_edge, guard_loc);
3103
3104 /* Update phi in UPDATE_PHI. */
3105 adjust_phi_and_debug_stmts (update_phi, e: update_e, new_def: new_res);
3106 }
3107}
3108
3109/* LOOP_VINFO is an epilogue loop whose corresponding main loop can be skipped.
3110 Return a value that equals:
3111
3112 - MAIN_LOOP_VALUE when LOOP_VINFO is entered from the main loop and
3113 - SKIP_VALUE when the main loop is skipped. */
3114
3115tree
3116vect_get_main_loop_result (loop_vec_info loop_vinfo, tree main_loop_value,
3117 tree skip_value)
3118{
3119 gcc_assert (loop_vinfo->main_loop_edge);
3120
3121 tree phi_result = make_ssa_name (TREE_TYPE (main_loop_value));
3122 basic_block bb = loop_vinfo->main_loop_edge->dest;
3123 gphi *new_phi = create_phi_node (phi_result, bb);
3124 add_phi_arg (new_phi, main_loop_value, loop_vinfo->main_loop_edge,
3125 UNKNOWN_LOCATION);
3126 add_phi_arg (new_phi, skip_value,
3127 loop_vinfo->skip_main_loop_edge, UNKNOWN_LOCATION);
3128 return phi_result;
3129}
3130
3131/* Function vect_do_peeling.
3132
3133 Input:
3134 - LOOP_VINFO: Represent a loop to be vectorized, which looks like:
3135
3136 preheader:
3137 LOOP:
3138 header_bb:
3139 loop_body
3140 if (exit_loop_cond) goto exit_bb
3141 else goto header_bb
3142 exit_bb:
3143
3144 - NITERS: The number of iterations of the loop.
3145 - NITERSM1: The number of iterations of the loop's latch.
3146 - NITERS_NO_OVERFLOW: No overflow in computing NITERS.
3147 - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
3148 CHECK_PROFITABILITY is true.
3149 Output:
3150 - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should
3151 iterate after vectorization; see vect_set_loop_condition for details.
3152 - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that
3153 should be set to the number of scalar iterations handled by the
3154 vector loop. The SSA name is only used on exit from the loop.
3155
3156 This function peels prolog and epilog from the loop, adds guards skipping
3157 PROLOG and EPILOG for various conditions. As a result, the changed CFG
3158 would look like:
3159
3160 guard_bb_1:
3161 if (prefer_scalar_loop) goto merge_bb_1
3162 else goto guard_bb_2
3163
3164 guard_bb_2:
3165 if (skip_prolog) goto merge_bb_2
3166 else goto prolog_preheader
3167
3168 prolog_preheader:
3169 PROLOG:
3170 prolog_header_bb:
3171 prolog_body
3172 if (exit_prolog_cond) goto prolog_exit_bb
3173 else goto prolog_header_bb
3174 prolog_exit_bb:
3175
3176 merge_bb_2:
3177
3178 vector_preheader:
3179 VECTOR LOOP:
3180 vector_header_bb:
3181 vector_body
3182 if (exit_vector_cond) goto vector_exit_bb
3183 else goto vector_header_bb
3184 vector_exit_bb:
3185
3186 guard_bb_3:
3187 if (skip_epilog) goto merge_bb_3
3188 else goto epilog_preheader
3189
3190 merge_bb_1:
3191
3192 epilog_preheader:
3193 EPILOG:
3194 epilog_header_bb:
3195 epilog_body
3196 if (exit_epilog_cond) goto merge_bb_3
3197 else goto epilog_header_bb
3198
3199 merge_bb_3:
3200
3201 Note this function peels prolog and epilog only if it's necessary,
3202 as well as guards.
3203 This function returns the epilogue loop if a decision was made to vectorize
3204 it, otherwise NULL.
3205
3206 The analysis resulting in this epilogue loop's loop_vec_info was performed
3207 in the same vect_analyze_loop call as the main loop's. At that time
3208 vect_analyze_loop constructs a list of accepted loop_vec_info's for lower
3209 vectorization factors than the main loop. This list is chained in the
3210 loop's loop_vec_info in the 'epilogue_vinfo' member. When we decide to
3211 vectorize the epilogue loop for a lower vectorization factor, the
3212 loop_vec_info in epilogue_vinfo is updated and linked to the epilogue loop.
3213 This is later used to vectorize the epilogue.
3214 The reason the loop_vec_info needs updating is that it was
3215 constructed based on the original main loop, and the epilogue loop is a
3216 copy of this loop, so all links pointing to statements in the original loop
3217 need updating. Furthermore, these loop_vec_infos share the
3218 data_reference's records, which will also need to be updated.
3219
3220 TODO: Guard for prefer_scalar_loop should be emitted along with
3221 versioning conditions if loop versioning is needed. */
3222
3223
3224class loop *
3225vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
3226 tree *niters_vector, tree *step_vector,
3227 tree *niters_vector_mult_vf_var, int th,
3228 bool check_profitability, bool niters_no_overflow,
3229 tree *advance)
3230{
3231 edge e, guard_e;
3232 tree type = niters ? TREE_TYPE (niters) : sizetype;
3233 tree guard_cond;
3234 basic_block guard_bb, guard_to;
3235 profile_probability prob_prolog, prob_vector, prob_epilog;
3236 int estimated_vf;
3237 int prolog_peeling = 0;
3238 bool vect_epilogues = loop_vinfo->epilogue_vinfo != NULL;
3239 bool uncounted_p = LOOP_VINFO_NITERS_UNCOUNTED_P (loop_vinfo);
3240
3241 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
3242 prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3243
3244 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3245 poly_uint64 bound_epilog = 0;
3246 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
3247 && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
3248 bound_epilog += vf - 1;
3249 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3250 bound_epilog += 1;
3251
3252 /* For early breaks the scalar loop needs to execute at most VF times
3253 to find the element that caused the break. */
3254 if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3255 bound_epilog = vf;
3256
3257 bool epilog_peeling = maybe_ne (a: bound_epilog, b: 0U);
3258 poly_uint64 bound_scalar = bound_epilog;
3259
3260 if (!prolog_peeling && !epilog_peeling)
3261 return NULL;
3262
3263 /* Before doing any peeling make sure to reset debug binds outside of
3264 the loop referring to defs not in LC SSA. */
3265 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3266 for (unsigned i = 0; i < loop->num_nodes; ++i)
3267 {
3268 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3269 imm_use_iterator ui;
3270 gimple *use_stmt;
3271 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (i: gsi);
3272 gsi_next (i: &gsi))
3273 {
3274 FOR_EACH_IMM_USE_STMT (use_stmt, ui, gimple_phi_result (gsi.phi ()))
3275 if (gimple_debug_bind_p (s: use_stmt)
3276 && loop != gimple_bb (g: use_stmt)->loop_father
3277 && !flow_loop_nested_p (loop,
3278 gimple_bb (g: use_stmt)->loop_father))
3279 {
3280 gimple_debug_bind_reset_value (dbg: use_stmt);
3281 update_stmt (s: use_stmt);
3282 }
3283 }
3284 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (i: gsi);
3285 gsi_next (i: &gsi))
3286 {
3287 ssa_op_iter op_iter;
3288 def_operand_p def_p;
3289 FOR_EACH_SSA_DEF_OPERAND (def_p, gsi_stmt (gsi), op_iter, SSA_OP_DEF)
3290 FOR_EACH_IMM_USE_STMT (use_stmt, ui, DEF_FROM_PTR (def_p))
3291 if (gimple_debug_bind_p (s: use_stmt)
3292 && loop != gimple_bb (g: use_stmt)->loop_father
3293 && !flow_loop_nested_p (loop,
3294 gimple_bb (g: use_stmt)->loop_father))
3295 {
3296 gimple_debug_bind_reset_value (dbg: use_stmt);
3297 update_stmt (s: use_stmt);
3298 }
3299 }
3300 }
3301
3302 prob_vector = profile_probability::guessed_always ().apply_scale (num: 9, den: 10);
3303 estimated_vf = vect_vf_for_cost (loop_vinfo);
3304 if (estimated_vf == 2)
3305 estimated_vf = 3;
3306 prob_prolog = prob_epilog = profile_probability::guessed_always ()
3307 .apply_scale (num: estimated_vf - 1, den: estimated_vf);
3308
3309 class loop *prolog = NULL, *epilog = NULL;
3310 class loop *first_loop = loop;
3311 bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
3312
3313 /* SSA form needs to be up-to-date since we are going to manually
3314 update SSA form in slpeel_tree_duplicate_loop_to_edge_cfg and delete all
3315 update SSA state after that, so we have to make sure to not lose any
3316 pending update needs. */
3317 gcc_assert (!need_ssa_update_p (cfun));
3318
3319 /* If we're vectorizing an epilogue loop, we have ensured that the
3320 virtual operand is in SSA form throughout the vectorized main loop.
3321 Normally it is possible to trace the updated
3322 vector-stmt vdefs back to scalar-stmt vdefs and vector-stmt vuses
3323 back to scalar-stmt vuses, meaning that the effect of the SSA update
3324 remains local to the main loop. However, there are rare cases in
3325 which the vectorized loop should have vdefs even when the original scalar
3326 loop didn't. For example, vectorizing a load with IFN_LOAD_LANES
3327 introduces clobbers of the temporary vector array, which in turn
3328 needs new vdefs. If the scalar loop doesn't write to memory, these
3329 new vdefs will be the only ones in the vector loop.
3330 We are currently defering updating virtual SSA form and creating
3331 of a virtual PHI for this case so we do not have to make sure the
3332 newly introduced virtual def is in LCSSA form. */
3333
3334 if (MAY_HAVE_DEBUG_BIND_STMTS)
3335 {
3336 gcc_assert (!adjust_vec.exists ());
3337 adjust_vec.create (nelems: 32);
3338 }
3339 initialize_original_copy_tables ();
3340
3341 /* Record the anchor bb at which the guard should be placed if the scalar
3342 loop might be preferred. */
3343 basic_block anchor = loop_preheader_edge (loop)->src;
3344
3345 /* Generate the number of iterations for the prolog loop. We do this here
3346 so that we can also get the upper bound on the number of iterations. */
3347 tree niters_prolog;
3348 poly_int64 bound_prolog = 0;
3349 if (prolog_peeling)
3350 {
3351 niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, bb: anchor,
3352 bound: &bound_prolog);
      /* If alignment peeling is known, we will always execute prolog.  */
3354 if (TREE_CODE (niters_prolog) == INTEGER_CST)
3355 prob_prolog = profile_probability::always ();
3356 }
3357 else
3358 niters_prolog = build_int_cst (type, 0);
3359
3360 loop_vec_info epilogue_vinfo = loop_vinfo->epilogue_vinfo;
3361 tree niters_vector_mult_vf = NULL_TREE;
3362 /* Saving NITERs before the loop, as this may be changed by prologue. */
3363 tree before_loop_niters = LOOP_VINFO_NITERS (loop_vinfo);
3364 edge update_e = NULL, skip_e = NULL;
3365 unsigned int lowest_vf = constant_lower_bound (a: vf);
3366 /* Prolog loop may be skipped. */
3367 bool skip_prolog = (prolog_peeling != 0);
3368 /* Skip this loop to epilog when there are not enough iterations to enter this
3369 vectorized loop. If true we should perform runtime checks on the NITERS
3370 to check whether we should skip the current vectorized loop. If we know
3371 the number of scalar iterations we may choose to add a runtime check if
3372 this number "maybe" smaller than the number of iterations required
3373 when we know the number of scalar iterations may potentially
3374 be smaller than the number of iterations required to enter this loop, for
3375 this we use the upper bounds on the prolog and epilog peeling. When we
3376 don't know the number of iterations and don't require versioning it is
3377 because we have asserted that there are enough scalar iterations to enter
3378 the main loop, so this skip is not necessary. When we are versioning then
3379 we only add such a skip if we have chosen to vectorize the epilogue. */
3380 bool skip_vector = false;
3381 if (!uncounted_p)
3382 skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3383 ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
3384 b: bound_prolog + bound_epilog)
3385 : (!LOOP_VINFO_USE_VERSIONING_WITHOUT_PEELING (loop_vinfo)
3386 || vect_epilogues));
3387
3388 /* Epilog loop must be executed if the number of iterations for epilog
3389 loop is known at compile time, otherwise we need to add a check at
3390 the end of vector loop and skip to the end of epilog loop. */
3391 bool skip_epilog = (prolog_peeling < 0
3392 || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3393 || !vf.is_constant ());
3394 /* PEELING_FOR_GAPS and peeling for early breaks are special because epilog
3395 loop must be executed. */
3396 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
3397 || LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3398 skip_epilog = false;
3399
3400 class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
3401 auto_vec<profile_count> original_counts;
3402 basic_block *original_bbs = NULL;
3403
3404 if (skip_vector)
3405 {
3406 split_edge (loop_preheader_edge (loop));
3407
3408 if (epilog_peeling && (vect_epilogues || scalar_loop == NULL))
3409 {
3410 original_bbs = get_loop_body (loop);
3411 for (unsigned int i = 0; i < loop->num_nodes; i++)
3412 original_counts.safe_push(obj: original_bbs[i]->count);
3413 }
3414
3415 /* Due to the order in which we peel prolog and epilog, we first
3416 propagate probability to the whole loop. The purpose is to
3417 avoid adjusting probabilities of both prolog and vector loops
3418 separately. Note in this case, the probability of epilog loop
3419 needs to be scaled back later. */
3420 basic_block bb_before_loop = loop_preheader_edge (loop)->src;
3421 if (prob_vector.initialized_p ())
3422 {
3423 scale_bbs_frequencies (&bb_before_loop, 1, prob_vector);
3424 scale_loop_profile (loop, prob_vector, -1);
3425 }
3426 }
3427
3428 if (vect_epilogues)
3429 {
3430 /* Make sure to set the epilogue's epilogue scalar loop, such that we can
3431 use the original scalar loop as remaining epilogue if necessary. */
3432 LOOP_VINFO_SCALAR_LOOP (epilogue_vinfo)
3433 = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
3434 LOOP_VINFO_SCALAR_MAIN_EXIT (epilogue_vinfo)
3435 = LOOP_VINFO_SCALAR_MAIN_EXIT (loop_vinfo);
3436 }
3437
3438 if (prolog_peeling)
3439 {
3440 e = loop_preheader_edge (loop);
3441 edge exit_e = LOOP_VINFO_MAIN_EXIT (loop_vinfo);
3442 gcc_checking_assert (slpeel_can_duplicate_loop_p (loop, exit_e, e)
3443 && (!LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo)
3444 || uncounted_p));
3445
3446 /* Peel prolog and put it on preheader edge of loop. */
3447 edge scalar_e = LOOP_VINFO_SCALAR_MAIN_EXIT (loop_vinfo);
3448 edge prolog_e = NULL;
3449 prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, loop_exit: exit_e,
3450 scalar_loop, scalar_exit: scalar_e,
3451 e, new_e: &prolog_e, flow_loops: true, NULL,
3452 uncounted_p: false, create_main_e: uncounted_p);
3453
3454 gcc_assert (prolog);
3455 prolog->force_vectorize = false;
3456
3457 /* Assign hierarchical discriminators to distinguish prolog loop. */
3458 gimple *prolog_last = last_nondebug_stmt (prolog->header);
3459 location_t prolog_loc
3460 = prolog_last ? gimple_location (g: prolog_last) : UNKNOWN_LOCATION;
3461 if (prolog_loc != UNKNOWN_LOCATION)
3462 {
3463 unsigned int prolog_copyid = allocate_copyid_base (loc: prolog_loc, stride: 1);
3464 assign_discriminators_to_loop (loop: prolog, multiplicity_factor: 0, copyid: prolog_copyid);
3465 }
3466 first_loop = prolog;
3467 reset_original_copy_tables ();
3468
3469 /* Update the number of iterations for prolog loop. */
3470 tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog));
3471 vect_set_loop_condition (loop: prolog, loop_e: prolog_e, NULL, niters: niters_prolog,
3472 step: step_prolog, NULL_TREE, niters_maybe_zero: false);
3473
3474 /* Skip the prolog loop. */
3475 if (skip_prolog)
3476 {
3477 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
3478 niters_prolog, build_int_cst (type, 0));
3479 guard_bb = loop_preheader_edge (prolog)->src;
3480 basic_block bb_after_prolog = loop_preheader_edge (loop)->src;
3481 guard_to = split_edge (loop_preheader_edge (loop));
3482 guard_e = slpeel_add_loop_guard (guard_bb, cond: guard_cond,
3483 guard_to, dom_bb: guard_bb,
3484 probability: prob_prolog.invert (),
3485 irreducible_p: irred_flag);
3486 for (edge alt_e : get_loop_exit_edges (prolog))
3487 {
3488 if (alt_e == prolog_e)
3489 continue;
3490 basic_block old_dom
3491 = get_immediate_dominator (CDI_DOMINATORS, alt_e->dest);
3492 if (flow_bb_inside_loop_p (prolog, old_dom))
3493 {
3494 auto_vec<basic_block, 8> queue;
3495 for (auto son = first_dom_son (CDI_DOMINATORS, old_dom);
3496 son; son = next_dom_son (CDI_DOMINATORS, son))
3497 if (!flow_bb_inside_loop_p (prolog, son))
3498 queue.safe_push (obj: son);
3499 for (auto son : queue)
3500 set_immediate_dominator (CDI_DOMINATORS, son, guard_bb);
3501 }
3502 }
3503
3504 e = EDGE_PRED (guard_to, 0);
3505 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
3506 slpeel_update_phi_nodes_for_guard1 (skip_loop: prolog, update_loop: loop, guard_edge: guard_e, merge_edge: e);
3507
3508 scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
3509 scale_loop_profile (prolog, prob_prolog,
3510 estimated_poly_value (x: bound_prolog) - 1);
3511 }
3512
3513 /* Update init address of DRs. */
3514 vect_update_inits_of_drs (loop_vinfo, niters: niters_prolog, code: PLUS_EXPR);
3515 if (!uncounted_p)
3516 {
3517 /* Update niters for vector loop. */
3518 LOOP_VINFO_NITERS (loop_vinfo)
3519 = fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
3520 LOOP_VINFO_NITERSM1 (loop_vinfo)
3521 = fold_build2 (MINUS_EXPR, type,
3522 LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog);
3523 }
3524 bool new_var_p = false;
3525 niters = vect_build_loop_niters (loop_vinfo, new_var_p: &new_var_p);
3526 /* It's guaranteed that vector loop bound before vectorization is at
3527 least VF, so set range information for newly generated var. */
3528 if (new_var_p)
3529 {
3530 int_range<1> vr (type,
3531 wi::to_wide (t: build_int_cst (type, lowest_vf)),
3532 wi::to_wide (TYPE_MAX_VALUE (type)));
3533 set_range_info (niters, vr);
3534 }
3535
3536 /* Prolog iterates at most bound_prolog times, latch iterates at
3537 most bound_prolog - 1 times. */
3538 if (bound_prolog.is_constant ())
3539 record_niter_bound (prolog, bound_prolog.to_constant () - 1, false,
3540 true);
3541 delete_update_ssa ();
3542 adjust_vec_debug_stmts ();
3543 scev_reset ();
3544 }
3545 basic_block bb_before_epilog = NULL;
3546
3547 if (epilog_peeling)
3548 {
3549 e = LOOP_VINFO_MAIN_EXIT (loop_vinfo);
3550 gcc_checking_assert (slpeel_can_duplicate_loop_p (loop, e, e));
3551
3552 /* Peel epilog and put it on exit edge of loop. If we are vectorizing
3553 said epilog then we should use a copy of the main loop as a starting
3554 point. This loop may have already had some preliminary transformations
3555 to allow for more optimal vectorization, for example if-conversion.
3556 If we are not vectorizing the epilog then we should use the scalar loop
3557 as the transformations mentioned above make less or no sense when not
3558 vectorizing. */
3559 edge scalar_e = LOOP_VINFO_SCALAR_MAIN_EXIT (loop_vinfo);
3560 epilog = vect_epilogues ? get_loop_copy (loop) : scalar_loop;
3561 edge epilog_e = vect_epilogues ? e : scalar_e;
3562 edge new_epilog_e = NULL;
3563 auto_vec<basic_block> doms;
3564 epilog
3565 = slpeel_tree_duplicate_loop_to_edge_cfg (loop, loop_exit: e, scalar_loop: epilog, scalar_exit: epilog_e, e,
3566 new_e: &new_epilog_e, flow_loops: true, updated_doms: &doms,
3567 uncounted_p);
3568
3569 LOOP_VINFO_EPILOGUE_MAIN_EXIT (loop_vinfo) = new_epilog_e;
3570 gcc_assert (epilog);
3571 gcc_assert (new_epilog_e);
3572 epilog->force_vectorize = false;
3573 bb_before_epilog = loop_preheader_edge (epilog)->src;
3574
3575 /* Assign hierarchical discriminators to distinguish epilog loop.
3576 Only assign if it's a scalar epilog. If it will be vectorized
3577 (vect_epilogues), discriminators will be assigned.
3578 Use dynamic copy_id allocation instead of hardcoded constants. */
3579 if (!vect_epilogues)
3580 {
3581 gimple *epilog_last = last_nondebug_stmt (epilog->header);
3582 location_t epilog_loc
3583 = epilog_last ? gimple_location (g: epilog_last) : UNKNOWN_LOCATION;
3584 if (epilog_loc != UNKNOWN_LOCATION)
3585 {
3586 unsigned int epilog_copyid = allocate_copyid_base (loc: epilog_loc, stride: 1);
3587 assign_discriminators_to_loop (loop: epilog, multiplicity_factor: 0, copyid: epilog_copyid);
3588 }
3589 }
3590
3591 /* Scalar version loop may be preferred. In this case, add guard
3592 and skip to epilog. Note this only happens when the number of
3593 iterations of loop is unknown at compile time, otherwise this
3594 won't be vectorized. */
3595 if (skip_vector)
3596 {
3597 /* Additional epilogue iteration is peeled if gap exists. */
3598 tree t = vect_gen_scalar_loop_niters (niters_prolog, int_niters_prolog: prolog_peeling,
3599 bound_prolog, bound_epilog,
3600 th, bound_scalar: &bound_scalar,
3601 check_profitability);
3602 /* Build guard against NITERSM1 since NITERS may overflow. */
3603 guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
3604 guard_bb = anchor;
3605 guard_to = split_edge (loop_preheader_edge (epilog));
3606 guard_e = slpeel_add_loop_guard (guard_bb, cond: guard_cond,
3607 guard_to, dom_bb: guard_bb,
3608 probability: prob_vector.invert (),
3609 irreducible_p: irred_flag);
3610 skip_e = guard_e;
3611 e = EDGE_PRED (guard_to, 0);
3612 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
3613
3614 /* Handle any remaining dominator updates needed after
3615 inserting the loop skip edge above. */
3616 if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
3617 && prolog_peeling)
3618 {
3619 /* Adding a skip edge to skip a loop with multiple exits
3620 means the dominator of the join blocks for all exits shifts
3621 from the prolog skip guard to the loop skip guard. */
3622 auto prolog_skip_bb
3623 = single_pred (bb: loop_preheader_edge (prolog)->src);
3624 auto needs_update
3625 = get_dominated_by (CDI_DOMINATORS, prolog_skip_bb);
3626
3627 /* Update everything except for the immediate children of
3628 the prolog skip block (the prolog and vector preheaders).
3629 Those should remain dominated by the prolog skip block itself,
3630 since the loop guard edge goes to the epilogue. */
3631 for (auto bb : needs_update)
3632 if (bb != EDGE_SUCC (prolog_skip_bb, 0)->dest
3633 && bb != EDGE_SUCC (prolog_skip_bb, 1)->dest)
3634 set_immediate_dominator (CDI_DOMINATORS, bb, guard_bb);
3635 }
3636
3637 slpeel_update_phi_nodes_for_guard1 (skip_loop: first_loop, update_loop: epilog, guard_edge: guard_e, merge_edge: e);
3638
3639 /* Simply propagate profile info from guard_bb to guard_to which is
3640 a merge point of control flow. */
3641 profile_count old_count = guard_to->count;
3642 guard_to->count = guard_bb->count;
3643
3644 /* Restore the counts of the epilog loop if we didn't use the scalar loop. */
3645 if (vect_epilogues || scalar_loop == NULL)
3646 {
3647 gcc_assert(epilog->num_nodes == loop->num_nodes);
3648 basic_block *bbs = get_loop_body (epilog);
3649 for (unsigned int i = 0; i < epilog->num_nodes; i++)
3650 {
3651 gcc_assert(get_bb_original (bbs[i]) == original_bbs[i]);
3652 bbs[i]->count = original_counts[i];
3653 }
3654 free (ptr: bbs);
3655 free (ptr: original_bbs);
3656 }
3657 else if (old_count.nonzero_p ())
3658 scale_loop_profile (epilog, guard_to->count.probability_in (overall: old_count), -1);
3659
3660 /* Only need to handle basic block before epilog loop if it's not
3661 the guard_bb, which is the case when skip_vector is true. */
3662 if (guard_bb != bb_before_epilog && single_pred_p (bb: bb_before_epilog))
3663 bb_before_epilog->count = single_pred_edge (bb: bb_before_epilog)->count ();
3664 bb_before_epilog = loop_preheader_edge (epilog)->src;
3665 }
3666
3667 if (!uncounted_p)
3668 {
3669 /* If loop is peeled for non-zero constant times, now niters refers to
3670 orig_niters - prolog_peeling, it won't overflow even the
3671 orig_niters overflows. */
3672 niters_no_overflow |= (prolog_peeling > 0);
3673 vect_gen_vector_loop_niters (loop_vinfo, niters,
3674 niters_vector_ptr: niters_vector, step_vector_ptr: step_vector,
3675 niters_no_overflow);
3676 if (!integer_onep (*step_vector))
3677 {
	      /* On exit from the loop we will have an easy way of calculating
3679 NITERS_VECTOR / STEP * STEP. Install a dummy definition
3680 until then. */
3681 niters_vector_mult_vf
3682 = make_ssa_name (TREE_TYPE (*niters_vector));
3683 SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop ();
3684 *niters_vector_mult_vf_var = niters_vector_mult_vf;
3685 }
3686 else
3687 vect_gen_vector_loop_niters_mult_vf (loop_vinfo, niters_vector: *niters_vector,
3688 niters_vector_mult_vf_ptr: &niters_vector_mult_vf);
3689 /* Update IVs of original loop as if they were advanced by
3690 niters_vector_mult_vf steps. */
3691 gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
3692 update_e = skip_vector ? e : loop_preheader_edge (epilog);
3693 }
3694 if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3695 update_e = single_succ_edge (LOOP_VINFO_MAIN_EXIT (loop_vinfo)->dest);
3696
3697 /* If we have a peeled vector iteration we will never skip the epilog loop
3698 and we can simplify the cfg a lot by not doing the edge split. */
3699 if (skip_epilog
3700 || (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
3701 && !LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo)))
3702 {
3703 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
3704 niters, niters_vector_mult_vf);
3705
3706 guard_bb = LOOP_VINFO_MAIN_EXIT (loop_vinfo)->dest;
3707 edge epilog_e = LOOP_VINFO_EPILOGUE_MAIN_EXIT (loop_vinfo);
3708 guard_to = epilog_e->dest;
3709 guard_e = slpeel_add_loop_guard (guard_bb, cond: guard_cond, guard_to,
3710 dom_bb: skip_vector ? anchor : guard_bb,
3711 probability: prob_epilog.invert (),
3712 irreducible_p: irred_flag);
3713
3714 doms.safe_push (obj: guard_to);
3715 if (vect_epilogues)
3716 epilogue_vinfo->skip_this_loop_edge = guard_e;
3717 edge main_iv = LOOP_VINFO_MAIN_EXIT (loop_vinfo);
3718 gphi_iterator gsi2 = gsi_start_phis (main_iv->dest);
3719 for (gphi_iterator gsi = gsi_start_phis (guard_to);
3720 !gsi_end_p (i: gsi); gsi_next (i: &gsi))
3721 {
3722 /* We are expecting all of the PHIs we have on epilog_e
3723 to be also on the main loop exit. But sometimes
3724 a stray virtual definition can appear at epilog_e
3725 which we can then take as the same on all exits,
3726 we've removed the LC SSA PHI on the main exit before
3727 so we wouldn't need to create a loop PHI for it. */
3728 if (virtual_operand_p (op: gimple_phi_result (gs: *gsi))
3729 && (gsi_end_p (i: gsi2)
3730 || !virtual_operand_p (op: gimple_phi_result (gs: *gsi2))))
3731 add_phi_arg (*gsi,
3732 gimple_phi_arg_def_from_edge (gs: *gsi, e: epilog_e),
3733 guard_e, UNKNOWN_LOCATION);
3734 else
3735 {
3736 add_phi_arg (*gsi, gimple_phi_result (gs: *gsi2), guard_e,
3737 UNKNOWN_LOCATION);
3738 gsi_next (i: &gsi2);
3739 }
3740 }
3741
3742 /* Only need to handle basic block before epilog loop if it's not
3743 the guard_bb, which is the case when skip_vector is true. */
3744 if (guard_bb != bb_before_epilog)
3745 {
3746 prob_epilog = prob_vector * prob_epilog + prob_vector.invert ();
3747
3748 scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog);
3749 }
3750 scale_loop_profile (epilog, prob_epilog, -1);
3751 }
3752
3753 /* If we have a peeled vector iteration, all exits are the same, leave it
3754 and so the main exit needs to be treated the same as the alternative
3755 exits in that we leave their updates to vectorizable_live_operations.
3756 */
3757 tree vector_iters_vf = niters_vector_mult_vf;
3758 if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3759 {
3760 tree tmp_niters_vf
3761 = make_ssa_name (LOOP_VINFO_EARLY_BRK_IV_TYPE (loop_vinfo));
3762
3763 if (!(LOOP_VINFO_NITERS_UNCOUNTED_P (loop_vinfo)
3764 && get_loop_exit_edges (loop).length () == 1))
3765 {
3766 basic_block exit_bb = NULL;
3767 edge update_e = NULL;
3768
3769 /* Identify the early exit merge block. I wish we had stored
3770 this. */
3771 for (auto e : get_loop_exit_edges (loop))
3772 if (e != LOOP_VINFO_MAIN_EXIT (loop_vinfo))
3773 {
3774 exit_bb = e->dest;
3775 update_e = single_succ_edge (bb: exit_bb);
3776 break;
3777 }
3778 vect_update_ivs_after_vectorizer (loop_vinfo, niters: tmp_niters_vf,
3779 update_e, early_exit_p: true);
3780 }
3781 if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
3782 vector_iters_vf = tmp_niters_vf;
3783
3784 LOOP_VINFO_EARLY_BRK_NITERS_VAR (loop_vinfo) = tmp_niters_vf;
3785 }
3786
3787 bool recalculate_peel_niters_init
3788 = LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo);
3789 vect_update_ivs_after_vectorizer (loop_vinfo, niters: vector_iters_vf,
3790 update_e,
3791 early_exit_p: recalculate_peel_niters_init);
3792
3793 /* Recalculate the dominators after adding the guard edge. */
3794 if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3795 iterate_fix_dominators (CDI_DOMINATORS, doms, false);
3796
3797 /* When we do not have a loop-around edge to the epilog we know
3798 the vector loop covered at least VF scalar iterations unless
3799 we have early breaks.
3800 Update any known upper bound with this knowledge. */
3801 if (! skip_vector
3802 && ! LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3803 {
3804 if (epilog->any_upper_bound)
3805 epilog->nb_iterations_upper_bound -= lowest_vf;
3806 if (epilog->any_likely_upper_bound)
3807 epilog->nb_iterations_likely_upper_bound -= lowest_vf;
3808 if (epilog->any_estimate)
3809 epilog->nb_iterations_estimate -= lowest_vf;
3810 }
3811
3812 unsigned HOST_WIDE_INT bound;
3813 if (bound_scalar.is_constant (const_value: &bound))
3814 {
3815 gcc_assert (bound != 0);
      /* Adjust the upper bound by the extra peeled vector iteration if we
	 are an epilogue of a peeled vect loop and not VLA.  For VLA the
	 loop bounds are unknown.  */
3819 if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo)
3820 && vf.is_constant ())
3821 bound += vf.to_constant ();
3822 /* -1 to convert loop iterations to latch iterations. */
3823 record_niter_bound (epilog, bound - 1, false, true);
3824 scale_loop_profile (epilog, profile_probability::always (),
3825 bound - 1);
3826 }
3827
3828 delete_update_ssa ();
3829 adjust_vec_debug_stmts ();
3830 scev_reset ();
3831 }
3832
3833 if (vect_epilogues)
3834 {
3835 epilog->aux = epilogue_vinfo;
3836 LOOP_VINFO_LOOP (epilogue_vinfo) = epilog;
3837 LOOP_VINFO_MAIN_EXIT (epilogue_vinfo)
3838 = LOOP_VINFO_EPILOGUE_MAIN_EXIT (loop_vinfo);
3839
3840 loop_constraint_clear (loop: epilog, LOOP_C_INFINITE);
3841
3842 /* We now must calculate the number of NITERS performed by the previous
3843 loop and EPILOGUE_NITERS to be performed by the epilogue. */
3844 tree niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters_vector_mult_vf),
3845 niters_prolog, niters_vector_mult_vf);
3846
3847 /* If skip_vector we may skip the previous loop, we insert a phi-node to
3848 determine whether we are coming from the previous vectorized loop
3849 using the update_e edge or the skip_vector basic block using the
3850 skip_e edge. */
3851 if (skip_vector)
3852 {
3853 gcc_assert (update_e != NULL && skip_e != NULL);
3854 gphi *new_phi = create_phi_node (make_ssa_name (TREE_TYPE (niters)),
3855 update_e->dest);
3856 tree new_ssa = make_ssa_name (TREE_TYPE (niters));
3857 gimple *stmt = gimple_build_assign (new_ssa, niters);
3858 gimple_stmt_iterator gsi;
3859 if (TREE_CODE (niters_vector_mult_vf) == SSA_NAME
3860 && SSA_NAME_DEF_STMT (niters_vector_mult_vf)->bb != NULL)
3861 {
3862 gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (niters_vector_mult_vf));
3863 gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
3864 }
3865 else
3866 {
3867 gsi = gsi_last_bb (bb: update_e->src);
3868 gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
3869 }
3870
3871 niters = new_ssa;
3872 add_phi_arg (new_phi, niters, update_e, UNKNOWN_LOCATION);
3873 add_phi_arg (new_phi, build_zero_cst (TREE_TYPE (niters)), skip_e,
3874 UNKNOWN_LOCATION);
3875 niters = PHI_RESULT (new_phi);
3876 epilogue_vinfo->main_loop_edge = update_e;
3877 epilogue_vinfo->skip_main_loop_edge = skip_e;
3878 }
3879
3880 /* Set ADVANCE to the number of iterations performed by the previous
3881 loop and its prologue. */
3882 *advance = niters;
3883
3884 /* Subtract the number of iterations performed by the vectorized loop
3885 from the number of total iterations. */
3886 tree epilogue_niters = fold_build2 (MINUS_EXPR, TREE_TYPE (niters),
3887 before_loop_niters,
3888 niters);
3889
3890 LOOP_VINFO_NITERS (epilogue_vinfo) = epilogue_niters;
3891 LOOP_VINFO_NITERSM1 (epilogue_vinfo)
3892 = fold_build2 (MINUS_EXPR, TREE_TYPE (epilogue_niters),
3893 epilogue_niters,
3894 build_one_cst (TREE_TYPE (epilogue_niters)));
3895 }
3896
3897 adjust_vec.release ();
3898 free_original_copy_tables ();
3899
3900 return vect_epilogues ? epilog : NULL;
3901}
3902
3903/* Function vect_create_cond_for_niters_checks.
3904
3905 Create a conditional expression that represents the run-time checks for
3906 loop's niter. The loop is guaranteed to terminate if the run-time
3907 checks hold.
3908
   Input:
   COND_EXPR  - input conditional expression.  New conditions will be chained
		with logical AND operation.  A null COND_EXPR is treated as
		"true".
   LOOP_VINFO - field LOOP_VINFO_NITERS_ASSUMPTIONS contains the assumptions
		that must hold for the niter information to be valid.
3915
3916 Output:
3917 COND_EXPR - conditional expression.
3918
3919 The returned COND_EXPR is the conditional expression to be used in the
3920 if statement that controls which version of the loop gets executed at
3921 runtime. */
3922
3923static void
3924vect_create_cond_for_niters_checks (loop_vec_info loop_vinfo, tree *cond_expr)
3925{
3926 tree part_cond_expr = LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo);
3927
3928 if (*cond_expr)
3929 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3930 *cond_expr, part_cond_expr);
3931 else
3932 *cond_expr = part_cond_expr;
3933}
3934
3935/* Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3936 and PART_COND_EXPR are true. Treat a null *COND_EXPR as "true". */
3937
3938static void
3939chain_cond_expr (tree *cond_expr, tree part_cond_expr)
3940{
3941 if (*cond_expr)
3942 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3943 *cond_expr, part_cond_expr);
3944 else
3945 *cond_expr = part_cond_expr;
3946}
3947
3948/* Function vect_create_cond_for_align_checks.
3949
3950 Create a conditional expression that represents the alignment checks for
3951 all of data references (array element references) whose alignment must be
3952 checked at runtime.
3953
3954 Input:
3955 COND_EXPR - input conditional expression. New conditions will be chained
3956 with logical AND operation.
3957 LOOP_VINFO - three fields of the loop information are used.
3958 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
3959 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
3960 LOOP_VINFO_ALLOW_MUTUAL_ALIGNMENT indicates which check applies.
3961
3962 Output:
3963 COND_EXPR_STMT_LIST - statements needed to construct the conditional
3964 expression.
3965 The returned value is the conditional expression to be used in the if
3966 statement that controls which version of the loop gets executed at runtime.
3967
3968 Based on the boolean value of LOOP_VINFO_ALLOW_MUTUAL_ALIGNMENT, we decide
3969 which type of check should be applied and create two different expressions
3970 accordingly.
3971 1) When LOOP_VINFO_ALLOW_MUTUAL_ALIGNMENT is false, we see if all data refs
3972 to be checked are already aligned to an alignment boundary. We create
3973 an expression of "(a_1 | a_2 | a_3 | ... | a_n) & mask", where "a_i" is
3974 the address of i'th data reference.
3975 2) When LOOP_VINFO_ALLOW_MUTUAL_ALIGNMENT is true, we see if all data refs
3976 can be aligned to a boundary after a certain amount of peeling, in other
3977 words, their addresses have the same bottom bits according to the mask.
3978 We create "((a_1 ^ a_2) | (a_2 ^ a_3) | ... | (a_n-1 ^ a_n)) & mask",
3979 where "a_i" is the address of i'th data reference.
3980
3981 Both algorithms make two assumptions:
3982 1) The number of bytes "n" in a vector is a power of 2.
3983 2) An address "a" is aligned if a%n is zero and that this
3984 test can be done as a&(n-1) == 0. For example, for 16
3985 byte vectors the test is a&0xf == 0. */
3986
static void
vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
                                   tree *cond_expr,
                                   gimple_seq *cond_expr_stmt_list)
{
  const vec<stmt_vec_info> &may_misalign_stmts
    = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
  stmt_vec_info stmt_info;
  /* Mask of the low address bits that must be zero for the check to pass;
     i.e. the vector alignment minus one.  */
  poly_uint64 mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
  tree mask_cst;
  unsigned int i;
  tree int_ptrsize_type;
  char tmp_name[30];
  /* Running OR of either the addresses themselves or of the pairwise
     XORs, depending on which check we are building.  */
  tree or_tmp_name = NULL_TREE;
  /* Address of the previous data ref, used only for the mutual-alignment
     (XOR-based) variant.  */
  tree prev_addr_tmp_name = NULL_TREE;
  tree and_tmp_name;
  gimple *and_stmt;
  tree ptrsize_zero;
  tree part_cond_expr;

  gcc_assert (known_ne (mask, 0U));

  /* Pointer-sized signed integer type: addresses are converted to it so
     the bitwise checks can be done with integer arithmetic.  */
  int_ptrsize_type = signed_type_for (ptr_type_node);

  /* If LOOP_VINFO_ALLOW_MUTUAL_ALIGNMENT is true, we should have at least two
     datarefs to check the mutual alignment.  */
  gcc_assert (may_misalign_stmts.length () > 1
	      || !LOOP_VINFO_ALLOW_MUTUAL_ALIGNMENT (loop_vinfo));

  FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
    {
      gimple_seq new_stmt_list = NULL;
      tree addr_base;
      tree addr_tmp_name;
      tree xor_tmp_name;
      tree new_or_tmp_name;
      gimple *addr_stmt, *or_stmt, *xor_stmt;
      tree vectype = STMT_VINFO_VECTYPE (stmt_info);
      bool negative = tree_int_cst_compare
	(DR_STEP (STMT_VINFO_DATA_REF (stmt_info)), size_zero_node) < 0;
      /* For a negative-step access the first vector starts at the far end,
	 so bias the address by -(nunits-1) elements to test the lowest
	 address actually touched.  */
      tree offset = negative
	? size_int ((-TYPE_VECTOR_SUBPARTS (vectype) + 1)
		    * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))))
	: size_zero_node;

      /* create: addr_tmp = (int)(address_of_first_vector) */
      addr_base =
	vect_create_addr_base_for_vector_ref (loop_vinfo,
					      stmt_info, &new_stmt_list,
					      offset);
      if (new_stmt_list != NULL)
	gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);

      sprintf (s: tmp_name, format: "addr2int%d", i);
      addr_tmp_name = make_temp_ssa_name (type: int_ptrsize_type, NULL, name: tmp_name);
      addr_stmt = gimple_build_assign (addr_tmp_name, NOP_EXPR, addr_base);
      gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);

      if (LOOP_VINFO_ALLOW_MUTUAL_ALIGNMENT (loop_vinfo))
	{
	  /* Create "((a_1 ^ a_2) | (a_2 ^ a_3) | ... | (a_n-1 ^ a_n)) & mask"
	     to check mutual alignment.  */
	  if (prev_addr_tmp_name != NULL_TREE)
	    {
	      sprintf (s: tmp_name, format: "xorptrs%d_%d", i - 1, i);
	      xor_tmp_name = make_temp_ssa_name (type: int_ptrsize_type, NULL,
						 name: tmp_name);
	      xor_stmt = gimple_build_assign (xor_tmp_name, BIT_XOR_EXPR,
					      prev_addr_tmp_name,
					      addr_tmp_name);
	      gimple_seq_add_stmt (cond_expr_stmt_list, xor_stmt);
	      if (or_tmp_name == NULL_TREE)
		{
		  /* Create the 1st XOR when the 2nd data ref is seen.  */
		  or_tmp_name = xor_tmp_name;
		}
	      else
		{
		  /* Create: or_tmp = or_tmp | new_xor_tmp.  */
		  sprintf (s: tmp_name, format: "orxors%d", i - 1);
		  new_or_tmp_name = make_temp_ssa_name (type: int_ptrsize_type, NULL,
							name: tmp_name);
		  or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
						 or_tmp_name, xor_tmp_name);
		  gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
		  or_tmp_name = new_or_tmp_name;
		}
	    }
	  prev_addr_tmp_name = addr_tmp_name;
	}
      else
	{
	  /* Create: "(a_1 | a_2 | a_3 | ... | a_n) & mask" to check if all
	     addresses are already aligned.  */
	  if (or_tmp_name != NULL_TREE)
	    {
	      /* Create: or_tmp = or_tmp | addr_tmp.  */
	      sprintf (s: tmp_name, format: "orptrs%d", i);
	      new_or_tmp_name = make_temp_ssa_name (type: int_ptrsize_type, NULL,
						    name: tmp_name);
	      or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
					     or_tmp_name, addr_tmp_name);
	      gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
	      or_tmp_name = new_or_tmp_name;
	    }
	  else
	    or_tmp_name = addr_tmp_name;
	}

    } /* end for i */

  mask_cst = build_int_cst (int_ptrsize_type, mask);

  /* create: and_tmp = or_tmp & mask */
  and_tmp_name = make_temp_ssa_name (type: int_ptrsize_type, NULL, name: "andmask");

  and_stmt = gimple_build_assign (and_tmp_name, BIT_AND_EXPR,
				  or_tmp_name, mask_cst);
  gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);

  /* Make and_tmp the left operand of the conditional test against zero.
     if and_tmp has a nonzero bit then some address is unaligned.  */
  ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
  part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
				and_tmp_name, ptrsize_zero);
  chain_cond_expr (cond_expr, part_cond_expr);
}
4114
4115/* Function vect_create_cond_for_vla_spec_read.
4116
4117 Create a conditional expression that represents the run-time checks with
4118 max speculative read amount in VLA modes. We check two things:
4119 1) if the max speculative read amount exceeds the min page size
4120 2) if the VF is power-of-2 - done by checking the max read amount instead
4121
4122 Input:
4123 COND_EXPR - input conditional expression. New conditions will be chained
4124 with logical AND operation.
4125 LOOP_VINFO - field LOOP_VINFO_MAX_SPEC_READ_AMOUNT contains the max
4126 possible speculative read amount in VLA modes.
4127
4128 Output:
4129 COND_EXPR - conditional expression.
4130
4131 The returned COND_EXPR is the conditional expression to be used in the
4132 if statement that controls which version of the loop gets executed at
4133 runtime. */
4134
4135static void
4136vect_create_cond_for_vla_spec_read (loop_vec_info loop_vinfo, tree *cond_expr)
4137{
4138 poly_uint64 read_amount_poly = LOOP_VINFO_MAX_SPEC_READ_AMOUNT (loop_vinfo);
4139 tree amount = build_int_cst (long_unsigned_type_node, read_amount_poly);
4140
4141 /* Both the read amount and the VF must be variants, and the read amount must
4142 be a constant power-of-2 multiple of the VF. */
4143 unsigned HOST_WIDE_INT multiple;
4144 gcc_assert (!read_amount_poly.is_constant ()
4145 && !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()
4146 && constant_multiple_p (read_amount_poly,
4147 LOOP_VINFO_VECT_FACTOR (loop_vinfo),
4148 &multiple)
4149 && pow2p_hwi (multiple));
4150
4151 tree cst_ul_zero = build_int_cstu (long_unsigned_type_node, 0U);
4152 tree cst_ul_one = build_int_cstu (long_unsigned_type_node, 1U);
4153 tree cst_ul_pagesize = build_int_cstu (long_unsigned_type_node,
4154 (unsigned long) param_min_pagesize);
4155
4156 /* Create an expression of "amount & (amount - 1) == 0". */
4157 tree amount_m1 = fold_build2 (MINUS_EXPR, long_unsigned_type_node,
4158 amount, cst_ul_one);
4159 tree amount_and_expr = fold_build2 (BIT_AND_EXPR, long_unsigned_type_node,
4160 amount, amount_m1);
4161 tree powof2_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
4162 amount_and_expr, cst_ul_zero);
4163 chain_cond_expr (cond_expr, part_cond_expr: powof2_cond_expr);
4164
4165 /* Create an expression of "amount <= cst_ul_pagesize". */
4166 tree pagesize_cond_expr = fold_build2 (LE_EXPR, boolean_type_node,
4167 amount, cst_ul_pagesize);
4168 chain_cond_expr (cond_expr, part_cond_expr: pagesize_cond_expr);
4169}
4170
4171/* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
4172 create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
4173 Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
4174 and this new condition are true. Treat a null *COND_EXPR as "true". */
4175
4176static void
4177vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr)
4178{
4179 const vec<vec_object_pair> &pairs
4180 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
4181 unsigned int i;
4182 vec_object_pair *pair;
4183 FOR_EACH_VEC_ELT (pairs, i, pair)
4184 {
4185 tree addr1 = build_fold_addr_expr (pair->first);
4186 tree addr2 = build_fold_addr_expr (pair->second);
4187 tree part_cond_expr = fold_build2 (NE_EXPR, boolean_type_node,
4188 addr1, addr2);
4189 chain_cond_expr (cond_expr, part_cond_expr);
4190 }
4191}
4192
4193/* Create an expression that is true when all lower-bound conditions for
4194 the vectorized loop are met. Chain this condition with *COND_EXPR. */
4195
4196static void
4197vect_create_cond_for_lower_bounds (loop_vec_info loop_vinfo, tree *cond_expr)
4198{
4199 const vec<vec_lower_bound> &lower_bounds
4200 = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
4201 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
4202 {
4203 tree expr = lower_bounds[i].expr;
4204 tree type = unsigned_type_for (TREE_TYPE (expr));
4205 expr = fold_convert (type, expr);
4206 poly_uint64 bound = lower_bounds[i].min_value;
4207 if (!lower_bounds[i].unsigned_p)
4208 {
4209 expr = fold_build2 (PLUS_EXPR, type, expr,
4210 build_int_cstu (type, bound - 1));
4211 bound += bound - 1;
4212 }
4213 tree part_cond_expr = fold_build2 (GE_EXPR, boolean_type_node, expr,
4214 build_int_cstu (type, bound));
4215 chain_cond_expr (cond_expr, part_cond_expr);
4216 }
4217}
4218
4219/* Function vect_create_cond_for_alias_checks.
4220
4221 Create a conditional expression that represents the run-time checks for
4222 overlapping of address ranges represented by a list of data references
4223 relations passed as input.
4224
4225 Input:
4226 COND_EXPR - input conditional expression. New conditions will be chained
4227 with logical AND operation. If it is NULL, then the function
4228 is used to return the number of alias checks.
   LOOP_VINFO - field LOOP_VINFO_COMP_ALIAS_DDRS contains the list of ddrs
		to be checked.
4231
4232 Output:
4233 COND_EXPR - conditional expression.
4234
4235 The returned COND_EXPR is the conditional expression to be used in the if
4236 statement that controls which version of the loop gets executed at runtime.
4237*/
4238
4239void
4240vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
4241{
4242 const vec<dr_with_seg_len_pair_t> &comp_alias_ddrs =
4243 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
4244
4245 if (comp_alias_ddrs.is_empty ())
4246 return;
4247
4248 create_runtime_alias_checks (LOOP_VINFO_LOOP (loop_vinfo),
4249 &comp_alias_ddrs, cond_expr);
4250 if (dump_enabled_p ())
4251 dump_printf_loc (MSG_NOTE, vect_location,
4252 "created %u versioning for alias checks.\n",
4253 comp_alias_ddrs.length ());
4254}
4255
4256
4257/* Function vect_loop_versioning.
4258
4259 If the loop has data references that may or may not be aligned or/and
4260 has data reference relations whose independence was not proven then
4261 two versions of the loop need to be generated, one which is vectorized
4262 and one which isn't. A test is then generated to control which of the
4263 loops is executed. The test checks for the alignment of all of the
4264 data references that may or may not be aligned. An additional
4265 sequence of runtime tests is generated for each pairs of DDRs whose
4266 independence was not proven. The vectorized version of loop is
4267 executed only if both alias and alignment tests are passed.
4268
4269 The test generated to check which version of loop is executed
4270 is modified to also check for profitability as indicated by the
4271 cost model threshold TH.
4272
4273 The versioning precondition(s) are placed in *COND_EXPR and
4274 *COND_EXPR_STMT_LIST. */
4275
class loop *
vect_loop_versioning (loop_vec_info loop_vinfo,
		      gimple *loop_vectorized_call)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
  class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
  basic_block condition_bb;
  gphi_iterator gsi;
  gimple_stmt_iterator cond_exp_gsi;
  basic_block merge_bb;
  basic_block new_exit_bb;
  edge new_exit_e, e;
  gphi *orig_phi, *new_phi;
  tree cond_expr = NULL_TREE;
  gimple_seq cond_expr_stmt_list = NULL;
  tree arg;
  profile_probability prob = profile_probability::likely ();
  gimple_seq gimplify_stmt_list = NULL;
  tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
  bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
  bool version_spec_read = LOOP_REQUIRES_VERSIONING_FOR_SPEC_READ (loop_vinfo);
  bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
  bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
  poly_uint64 versioning_threshold
    = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
  tree version_simd_if_cond
    = LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (loop_vinfo);
  unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
  bool uncounted_p = LOOP_VINFO_NITERS_UNCOUNTED_P (loop_vinfo);

  /* If the cost-model threshold TH cannot be ordered against the
     versioning threshold, build a separate runtime check
     NITERSM1 >= TH - 1 for it.  */
  if (!uncounted_p && vect_apply_runtime_profitability_check_p (loop_vinfo)
      && !ordered_p (a: th, b: versioning_threshold))
    cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
			     build_int_cst (TREE_TYPE (scalar_loop_iters),
					    th - 1));
  /* Add the versioning threshold check and combine it with the
     cost-model check built above, if any.  */
  if (!uncounted_p && maybe_ne (a: versioning_threshold, b: 0U))
    {
      tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
			       build_int_cst (TREE_TYPE (scalar_loop_iters),
					      versioning_threshold - 1));
      if (cond_expr)
	cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
				 expr, cond_expr);
      else
	cond_expr = expr;
    }

  /* When we also version for any of the correctness reasons below,
     force the cost/threshold condition into an SSA name (COST_NAME) so
     the cost check can be split off to its own basic block at the end
     of this function.  */
  tree cost_name = NULL_TREE;
  profile_probability prob2 = profile_probability::always ();
  if (cond_expr
      && EXPR_P (cond_expr)
      && (version_niter
	  || version_align
	  || version_alias
	  || version_simd_if_cond))
    {
      cost_name = cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
						      &cond_expr_stmt_list,
						      is_gimple_val, NULL_TREE);
      /* Split prob () into two so that the overall probability of passing
	 both the cost-model and versioning checks is the orig prob. */
      prob2 = prob = prob.sqrt ();
    }

  /* Add the runtime checks for the loop's niters assumptions.  */
  if (version_niter)
    vect_create_cond_for_niters_checks (loop_vinfo, cond_expr: &cond_expr);

  /* Gimplify the condition built so far into a form valid for a
     GIMPLE_COND before appending further checks.  */
  if (cond_expr)
    {
      gimple_seq tem = NULL;
      cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
					  &tem, is_gimple_condexpr_for_cond,
					  NULL_TREE);
      gimple_seq_add_seq (&cond_expr_stmt_list, tem);
    }

  /* Add the runtime alignment checks.  */
  if (version_align)
    vect_create_cond_for_align_checks (loop_vinfo, cond_expr: &cond_expr,
				       cond_expr_stmt_list: &cond_expr_stmt_list);

  /* Add the runtime check guarding speculative reads.  */
  if (version_spec_read)
    vect_create_cond_for_vla_spec_read (loop_vinfo, cond_expr: &cond_expr);

  /* Add the runtime alias checks.  */
  if (version_alias)
    {
      vect_create_cond_for_unequal_addrs (loop_vinfo, cond_expr: &cond_expr);
      vect_create_cond_for_lower_bounds (loop_vinfo, cond_expr: &cond_expr);
      vect_create_cond_for_alias_checks (loop_vinfo, cond_expr: &cond_expr);
    }

  /* Add the check for the simd if(...) condition; its definition must
     dominate both loop headers so the test can be placed in front.  */
  if (version_simd_if_cond)
    {
      gcc_assert (dom_info_available_p (CDI_DOMINATORS));
      if (flag_checking)
	if (basic_block bb
	    = gimple_bb (SSA_NAME_DEF_STMT (version_simd_if_cond)))
	  gcc_assert (bb != loop->header
		      && dominated_by_p (CDI_DOMINATORS, loop->header, bb)
		      && (scalar_loop == NULL
			  || (bb != scalar_loop->header
			      && dominated_by_p (CDI_DOMINATORS,
						 scalar_loop->header, bb))));
      tree zero = build_zero_cst (TREE_TYPE (version_simd_if_cond));
      tree c = fold_build2 (NE_EXPR, boolean_type_node,
			    version_simd_if_cond, zero);
      if (cond_expr)
	cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
				 c, cond_expr);
      else
	cond_expr = c;
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "created versioning for simd if condition check.\n");
    }

  /* Gimplify the final combined condition.  */
  cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
				      &gimplify_stmt_list,
				      is_gimple_condexpr_for_cond, NULL_TREE);
  gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);

  /* Compute the outermost loop cond_expr and cond_expr_stmt_list are
     invariant in.  */
  class loop *outermost = outermost_invariant_loop_for_expr (loop, cond_expr);
  for (gimple_stmt_iterator gsi = gsi_start (seq&: cond_expr_stmt_list);
       !gsi_end_p (i: gsi); gsi_next (i: &gsi))
    {
      gimple *stmt = gsi_stmt (i: gsi);
      update_stmt (s: stmt);
      ssa_op_iter iter;
      use_operand_p use_p;
      basic_block def_bb;
      /* Any SSA use defined inside OUTERMOST restricts how far out we
	 can hoist the versioning condition.  */
      FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_USE)
	if ((def_bb = gimple_bb (SSA_NAME_DEF_STMT (USE_FROM_PTR (use_p))))
	    && flow_bb_inside_loop_p (outermost, def_bb))
	  outermost = superloop_at_depth (loop, bb_loop_depth (def_bb) + 1);
    }

  /* Search for the outermost loop we can version.  Avoid versioning of
     non-perfect nests but allow if-conversion versioned loops inside. */
  class loop *loop_to_version = loop;
  if (flow_loop_nested_p (outermost, loop))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "trying to apply versioning to outer loop %d\n",
			 outermost->num);
      if (outermost->num == 0)
	outermost = superloop_at_depth (loop, 1);
      /* And avoid applying versioning on non-perfect nests. */
      while (loop_to_version != outermost
	     && (e = single_exit (loop_outer (loop: loop_to_version)))
	     && !(e->flags & EDGE_COMPLEX)
	     && (!loop_outer (loop: loop_to_version)->inner->next
		 || vect_loop_vectorized_call (loop_to_version))
	     && (!loop_outer (loop: loop_to_version)->inner->next
		 || !loop_outer (loop: loop_to_version)->inner->next->next))
	loop_to_version = loop_outer (loop: loop_to_version);
    }

  /* Apply versioning.  If there is already a scalar version created by
     if-conversion re-use that.  Note we cannot re-use the copy of
     an if-converted outer-loop when vectorizing the inner loop only. */
  gcond *cond;
  if ((!loop_to_version->inner || loop == loop_to_version)
      && loop_vectorized_call)
    {
      /* Re-use the scalar copy made by if-conversion: replace the
	 condition of the IFN_LOOP_VECTORIZED guard with our checks.  */
      gcc_assert (scalar_loop);
      condition_bb = gimple_bb (g: loop_vectorized_call);
      cond = as_a <gcond *> (p: *gsi_last_bb (bb: condition_bb));
      gimple_cond_set_condition_from_tree (cond, cond_expr);
      update_stmt (s: cond);

      if (cond_expr_stmt_list)
	{
	  cond_exp_gsi = gsi_for_stmt (loop_vectorized_call);
	  gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
				 GSI_SAME_STMT);
	}

      /* if-conversion uses profile_probability::always () for both paths,
	 reset the paths probabilities appropriately. */
      edge te, fe;
      extract_true_false_edges_from_block (condition_bb, &te, &fe);
      te->probability = prob;
      fe->probability = prob.invert ();
      /* We can scale loops counts immediately but have to postpone
	 scaling the scalar loop because we re-use it during peeling.

	 Ifcvt duplicates loop preheader, loop body and produces an basic
	 block after loop exit.  We need to scale all that. */
      basic_block preheader = loop_preheader_edge (loop_to_version)->src;
      preheader->count = preheader->count.apply_probability (prob: prob * prob2);
      scale_loop_frequencies (loop_to_version, prob * prob2);
      /* When the loop has multiple exits then we can only version itself.
	 This is denoted by loop_to_version == loop.  In this case we can
	 do the versioning by selecting the exit edge the vectorizer is
	 currently using. */
      edge exit_edge;
      if (loop_to_version == loop)
	exit_edge = LOOP_VINFO_MAIN_EXIT (loop_vinfo);
      else
	exit_edge = single_exit (loop_to_version);
      exit_edge->dest->count = preheader->count;
      LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo) = (prob * prob2).invert ();

      nloop = scalar_loop;
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "reusing %sloop version created by if conversion\n",
			 loop_to_version != loop ? "outer " : "");
    }
  else
    {
      /* No if-conversion copy to re-use: duplicate LOOP_TO_VERSION and
	 guard the two versions with the condition built above.  */
      if (loop_to_version != loop
	  && dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "applying loop versioning to outer loop %d\n",
			 loop_to_version->num);

      unsigned orig_pe_idx = loop_preheader_edge (loop)->dest_idx;

      initialize_original_copy_tables ();
      nloop = loop_version (loop_to_version, cond_expr, &condition_bb,
			    prob * prob2, (prob * prob2).invert (),
			    prob * prob2, (prob * prob2).invert (),
			    true);

      /* If the PHI nodes in the loop header were reallocated, we need to fix up
	 our internally stashed copies of those. */
      if (loop_to_version == loop)
	for (auto gsi = gsi_start_phis (loop->header);
	     !gsi_end_p (i: gsi); gsi_next (i: &gsi))
	  loop_vinfo->resync_stmt_addr (gsi.phi ());

      /* We will later insert second conditional so overall outcome of
	 both is prob * prob2. */
      edge true_e, false_e;
      extract_true_false_edges_from_block (condition_bb, &true_e, &false_e);
      true_e->probability = prob;
      false_e->probability = prob.invert ();
      gcc_assert (nloop);
      nloop = get_loop_copy (loop);

      /* Assign hierarchical discriminators to distinguish loop versions.
	 Only assign to the scalar version here; the vectorized version will
	 get discriminators later during transformation/peeling.
	 Use dynamic copy_id allocation instead of hardcoded constants. */
      gimple *nloop_last = last_nondebug_stmt (nloop->header);
      location_t nloop_loc
	= nloop_last ? gimple_location (g: nloop_last) : UNKNOWN_LOCATION;
      if (nloop_loc != UNKNOWN_LOCATION)
	{
	  unsigned int nloop_copyid = allocate_copyid_base (loc: nloop_loc, stride: 1);
	  assign_discriminators_to_loop (loop: nloop, multiplicity_factor: 0, copyid: nloop_copyid);
	}
      /* For cycle vectorization with SLP we rely on the PHI arguments
	 appearing in the same order as the SLP node operands which for the
	 loop PHI nodes means the preheader edge dest index needs to remain
	 the same for the analyzed loop which also becomes the vectorized one.
	 Make it so in case the state after versioning differs by redirecting
	 the first edge into the header to the same destination which moves
	 it last. */
      if (loop_preheader_edge (loop)->dest_idx != orig_pe_idx)
	{
	  edge e = EDGE_PRED (loop->header, 0);
	  ssa_redirect_edge (e, e->dest);
	  flush_pending_stmts (e);
	}
      gcc_assert (loop_preheader_edge (loop)->dest_idx == orig_pe_idx);

      /* Kill off IFN_LOOP_VECTORIZED_CALL in the copy, nobody will
	 reap those otherwise;  they also refer to the original
	 loops. */
      class loop *l = loop;
      while (gimple *call = vect_loop_vectorized_call (l))
	{
	  call = SSA_NAME_DEF_STMT (get_current_def (gimple_call_lhs (call)));
	  fold_loop_internal_call (call, boolean_false_node);
	  l = loop_outer (loop: l);
	}
      free_original_copy_tables ();

      if (cond_expr_stmt_list)
	{
	  cond_exp_gsi = gsi_last_bb (bb: condition_bb);
	  gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
				 GSI_SAME_STMT);
	}

      /* Loop versioning violates an assumption we try to maintain during
	 vectorization - that the loop exit block has a single predecessor.
	 After versioning, the exit block of both loop versions is the same
	 basic block (i.e. it has two predecessors). Just in order to simplify
	 following transformations in the vectorizer, we fix this situation
	 here by adding a new (empty) block on the exit-edge of the loop,
	 with the proper loop-exit phis to maintain loop-closed-form.
	 If loop versioning wasn't done from loop, but scalar_loop instead,
	 merge_bb will have already just a single successor. */

      /* When the loop has multiple exits then we can only version itself.
	 This is denoted by loop_to_version == loop.  In this case we can
	 do the versioning by selecting the exit edge the vectorizer is
	 currently using. */
      edge exit_edge;
      if (loop_to_version == loop)
	exit_edge = LOOP_VINFO_MAIN_EXIT (loop_vinfo);
      else
	exit_edge = single_exit (loop_to_version);

      gcc_assert (exit_edge);
      merge_bb = exit_edge->dest;
      if (EDGE_COUNT (merge_bb->preds) >= 2)
	{
	  /* NOTE(review): this assert merely restates the guarding
	     condition above and is redundant.  */
	  gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2);
	  new_exit_bb = split_edge (exit_edge);
	  new_exit_e = exit_edge;
	  e = EDGE_SUCC (new_exit_bb, 0);

	  /* Re-create loop-closed PHIs in the new exit block and make
	     the merge-block PHIs use them instead.  */
	  for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (i: gsi);
	       gsi_next (i: &gsi))
	    {
	      tree new_res;
	      orig_phi = gsi.phi ();
	      new_res = copy_ssa_name (PHI_RESULT (orig_phi));
	      new_phi = create_phi_node (new_res, new_exit_bb);
	      arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
	      add_phi_arg (new_phi, arg, new_exit_e,
			   gimple_phi_arg_location_from_edge (phi: orig_phi, e));
	      adjust_phi_and_debug_stmts (update_phi: orig_phi, e, PHI_RESULT (new_phi));
	    }
	}

      update_ssa (TODO_update_ssa_no_phi);
    }

  /* Split the cost model check off to a separate BB.  Costing assumes
     this is the only thing we perform when we enter the scalar loop
     from a failed cost decision. */
  if (cost_name && TREE_CODE (cost_name) == SSA_NAME)
    {
      gimple *def = SSA_NAME_DEF_STMT (cost_name);
      gcc_assert (gimple_bb (def) == condition_bb);
      /* All uses of the cost check are 'true' after the check we
	 are going to insert. */
      replace_uses_by (cost_name, boolean_true_node);
      /* And we're going to build the new single use of it. */
      gcond *cond = gimple_build_cond (NE_EXPR, cost_name, boolean_false_node,
				       NULL_TREE, NULL_TREE);
      edge e = split_block (gimple_bb (g: def), def);
      gimple_stmt_iterator gsi = gsi_for_stmt (def);
      gsi_insert_after (&gsi, cond, GSI_NEW_STMT);
      edge true_e, false_e;
      extract_true_false_edges_from_block (e->dest, &true_e, &false_e);
      e->flags &= ~EDGE_FALLTHRU;
      e->flags |= EDGE_TRUE_VALUE;
      edge e2 = make_edge (e->src, false_e->dest, EDGE_FALSE_VALUE);
      e->probability = prob2;
      e2->probability = prob2.invert ();
      e->dest->count = e->count ();
      set_immediate_dominator (CDI_DOMINATORS, false_e->dest, e->src);
      /* Dominator sons of the old condition block reachable through
	 multiple predecessors may now be dominated by the new split
	 block; collect them first since we iterate the dom tree.  */
      auto_vec<basic_block, 3> adj;
      for (basic_block son = first_dom_son (CDI_DOMINATORS, e->dest);
	   son;
	   son = next_dom_son (CDI_DOMINATORS, son))
	if (EDGE_COUNT (son->preds) > 1)
	  adj.safe_push (obj: son);
      for (auto son : adj)
	set_immediate_dominator (CDI_DOMINATORS, son, e->src);
      /* NOTE(review): leftover debugging aids below; consider removing.  */
      //debug_bb (condition_bb);
      //debug_bb (e->src);
    }

  if (version_niter)
    {
      /* The versioned loop could be infinite, we need to clear existing
	 niter information which is copied from the original loop. */
      gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE));
      vect_free_loop_info_assumptions (nloop);
    }

  if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION
      && dump_enabled_p ())
    {
      if (version_alias)
	dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
			 vect_location,
			 "loop versioned for vectorization because of "
			 "possible aliasing\n");
      if (version_align)
	dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
			 vect_location,
			 "loop versioned for vectorization to enhance "
			 "alignment\n");

    }

  return nloop;
}
4674

source code of gcc/tree-vect-loop-manip.cc