Line data Source code
1 : /* Code locality based function cloning.
2 : Copyright The GNU Toolchain Authors
3 :
4 : This file is part of GCC.
5 :
6 : GCC is free software; you can redistribute it and/or modify it under
7 : the terms of the GNU General Public License as published by the Free
8 : Software Foundation; either version 3, or (at your option) any later
9 : version.
10 :
11 : GCC is distributed in the hope that it will be useful, but WITHOUT ANY
12 : WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 : FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 : for more details.
15 :
16 : You should have received a copy of the GNU General Public License
17 : along with GCC; see the file COPYING3. If not see
18 : <http://www.gnu.org/licenses/>. */
19 :
20 : /* This file implements cloning required to improve partitioning of the
21 : callgraph for locality considerations.
22 :
23 : Partitioning for improving code locality.
24 : This pass aims to place frequently executed callchains closer together in
25 : memory to improve performance through improved locality. If any frequent
26 : callchains cannot be placed together because they are already placed
27 : elsewhere, local function clones are created and all callers near to the
28 : clones are redirected to use this copy.
29 :
30 : Locality code placement is done in 2 parts.
31 : 1. IPA pass to be executed after ipa-inline and before ipa-pure-const.
32 : Execute stage prepares the plan to place all nodes into partitions.
33 : 2. WPA Partition stage actually implements the plan.
34 :
35 : Brief overview of the IPA pass:
36 : 1. Create and sort callchains. If PGO is available, use real profile
37 : counts. Otherwise, use a set of heuristics to sort the callchains.
38 : 2. Create a partition plan for the callchains, processing them in the sorted
39 : order.
40 : 1. If a function is unpartitioned, place it in the current partition.
41 : 2. If a function is already placed in a partition away from current
42 : partition as part of another callchain:
43 : Create a local clone in the current partition, if the cloning
44 : criteria are satisfied.
45 : 3. Redirect any new caller to a local clone if one exists.
46 :
47 : An alternative heuristic can be used in the absence of profile information. This
48 : approach uses C++ template instantiation types to group functions together.
49 : Entry functions are sorted in the beginning by most used template types, and
50 : callees are sorted as part of partition_callchain (), again by template
51 : types.
52 :
53 : For both schemes, partition size is param controlled to fine tune per
54 : program behavior. */
55 :
56 : #include "config.h"
57 : #define INCLUDE_ALGORITHM
58 : #include "system.h"
59 : #include "coretypes.h"
60 : #include "target.h"
61 : #include "function.h"
62 : #include "tree.h"
63 : #include "alloc-pool.h"
64 : #include "tree-pass.h"
65 : #include "cgraph.h"
66 : #include "symbol-summary.h"
67 : #include "tree-vrp.h"
68 : #include "symtab-thunks.h"
69 : #include "sreal.h"
70 : #include "ipa-cp.h"
71 : #include "ipa-prop.h"
72 : #include "ipa-fnsummary.h"
73 : #include "ipa-modref-tree.h"
74 : #include "ipa-modref.h"
75 : #include "symtab-clones.h"
76 : #include "demangle.h"
77 : #include "ipa-locality-cloning.h"
78 :
79 : /* Locality partitions, assigns nodes to partitions. These are used later in
80 : WPA partitioning. */
81 : vec<locality_partition> locality_partitions;
82 :
83 : using loc_map_hash = int_hash<int, 0, -1>;
84 :
85 : /* Map from original node to its latest clone. Gets overwritten whenever a new
86 : clone is created from the same node. */
87 : static std::unique_ptr<hash_map<loc_map_hash, cgraph_node *>> node_to_clone;
88 : /* Map from clone to its original node. */
89 : static std::unique_ptr<hash_map<loc_map_hash, cgraph_node *>> clone_to_node;
90 :
91 : /* Data structure to hold static heuristics and orders for cgraph_nodes. */
92 : struct locality_order
93 : {
94 : cgraph_node *node;
95 : sreal order;
96 0 : locality_order (cgraph_node *node, sreal order) : node (node), order (order)
97 : {}
98 : };
99 :
100 : /* Data structure to hold information about unique template instantiations. */
101 : struct templ_info
102 : {
103 : hashval_t templ_hash;
104 : int count;
105 : int order;
106 : };
107 :
108 : /* Data structure to hold accumulated edge frequencies for unique callees.
109 : Frequencies of aliased callees are accumulated with the ultimate target
110 : alias represented by ultimate_callee.
111 : ultimate_caller is the ultimate non-inlined caller of the unique callee.
112 : Edge is the first edge encountered for the callee. */
113 0 : struct locality_callee_info
114 : {
115 : cgraph_node *ultimate_caller;
116 : cgraph_node *ultimate_callee;
117 : cgraph_edge *edge;
118 : sreal freq;
119 : };
120 :
121 : /* Data structure to hold precomputed callchain information. */
122 : struct locality_info
123 : {
124 : cgraph_node *node;
125 :
126 : /* Consolidated callees, including callees of inlined nodes. */
127 : vec<locality_callee_info *> all_callees;
128 :
129 : /* Accumulated caller->node edge frequencies for unique callers. */
130 : hash_map<loc_map_hash, sreal> caller_freq;
131 : /* Accumulated node->callee edge frequencies for unique callees. */
132 : hash_map<loc_map_hash, locality_callee_info> callee_info;
133 :
134 : /* Template instantiation info. */
135 : hashval_t templ_hash;
136 :
137 0 : locality_info ()
138 0 : {
139 0 : all_callees.create (1);
140 0 : templ_hash = 0;
141 0 : }
142 : ~locality_info ()
143 : {
144 : all_callees.release ();
145 : }
146 : };
147 :
148 : /* Template instantiation hash_map. */
149 : typedef hash_map<int_hash<hashval_t, 0>, templ_info> hash_freq_map_t;
150 : static std::unique_ptr<hash_freq_map_t> templ_hash_map;
151 :
152 : /* Pool allocation for locality_info. */
153 : static object_allocator<locality_info> loc_infos ("IPA locality callchain");
154 : static
155 : std::unique_ptr<hash_map<loc_map_hash, locality_info *>> node_to_ch_info;
156 :
157 : /* Return locality_info for NODE if present, otherwise return NULL. */
158 : static inline locality_info *
159 0 : get_locality_info (cgraph_node *node)
160 : {
161 0 : locality_info **ninfo = node_to_ch_info->get (node->get_uid ());
162 0 : if (ninfo)
163 0 : return *ninfo;
164 : return NULL;
165 : }
166 :
167 : /* Forward declaration. */
168 : static int compare_node_uids (cgraph_node *n1, cgraph_node *n2);
169 :
170 : /* Helper function to qsort all_callees in locality_info by edge hotness. */
171 : static int
172 0 : callee_default_cmp (const void *pa, const void *pb)
173 : {
174 0 : const locality_callee_info *a = *(static_cast<const locality_callee_info
175 : *const *> (pa));
176 0 : const locality_callee_info *b = *(static_cast<const locality_callee_info
177 : *const *> (pb));
178 0 : if (a->freq < b->freq)
179 : return 1;
180 0 : else if (a->freq > b->freq)
181 : return -1;
182 0 : return compare_node_uids (a->ultimate_callee, b->ultimate_callee);
183 : }
184 :
185 : /* Sort all_callees of NODE by template hashes and edge hotness.
186 :
187 : all_callees is already sorted by call frequency. If templ_hash of the
188 : caller is same as a callee's templ_hash then that callee is ordered first.
189 : Callees having same templ_hash are sorted by call frequency, callees with
190 : different templ_hash are sorted by the template order.
191 :
192 : if (Both PA and PB have templates && PA's template == PB'template) ||
193 : (Both A and B don't have templates): sort by frequency
194 :
195 : if (PA doesn't have template && PB has) || (Both have different templates &&
196 : caller's template == PB's template): PB before PA
197 :
198 : if (PA has template && PB doesn't) || (Both have different templates &&
199 : caller's template == PA's template): PA before PB
200 :
201 : if (Both have different templates && both are different than caller's
202 : template: sort by template order. */
203 : static int
204 0 : callee_templ_cmp (const void *pa, const void *pb)
205 : {
206 0 : const locality_callee_info *a = *static_cast<const locality_callee_info
207 : *const *> (pa);
208 0 : const locality_callee_info *b = *static_cast<const locality_callee_info
209 : *const *> (pb);
210 :
211 0 : locality_info *caller_info = get_locality_info (a->ultimate_caller);
212 0 : hashval_t caller_hash = 0;
213 0 : if (caller_info)
214 0 : caller_hash = caller_info->templ_hash;
215 :
216 0 : locality_info *ainfo = get_locality_info (a->edge->callee);
217 0 : locality_info *binfo = get_locality_info (b->edge->callee);
218 0 : templ_info *atinfo = NULL, *btinfo = NULL;
219 0 : hashval_t a_hash = 0, b_hash = 0;
220 0 : if (ainfo)
221 : {
222 0 : a_hash = ainfo->templ_hash;
223 0 : atinfo = templ_hash_map->get (a_hash);
224 : }
225 0 : if (binfo)
226 : {
227 0 : b_hash = binfo->templ_hash;
228 0 : btinfo = templ_hash_map->get (b_hash);
229 : }
230 :
231 0 : if (a_hash == b_hash)
232 : {
233 : /* We get here if both have same template type or both have 0 as hash. */
234 0 : return callee_default_cmp (pa, pb);
235 : }
236 0 : else if (a_hash && (!b_hash || a_hash == caller_hash))
237 : return -1;
238 0 : else if (b_hash && (!a_hash || b_hash == caller_hash))
239 : return 1;
240 : else
241 : {
242 0 : gcc_assert (atinfo && btinfo);
243 0 : if (atinfo->order < btinfo->order)
244 : return -1;
245 : /* Order being same is already handled above. */
246 0 : return 1;
247 : }
248 : return callee_default_cmp (pa, pb);
249 : }
250 :
251 : /* Sort all_callees in INFO by edge hotness. */
252 : static void
253 0 : sort_all_callees_default (locality_info *info)
254 : {
255 0 : hash_map<loc_map_hash, locality_callee_info>::iterator iter
256 0 : = info->callee_info.begin ();
257 0 : for (; iter != info->callee_info.end (); ++iter)
258 0 : info->all_callees.safe_push (&((*iter).second));
259 0 : info->all_callees.qsort (callee_default_cmp);
260 0 : }
261 :
262 : /* Populate locality_info for NODE from its direct callees and callees via
263 : inlined nodes. N is used to iterate callees of NODE and callees of inlined
264 : callees of NODE. */
265 : static void
266 0 : populate_callee_locality_info (cgraph_node *node, cgraph_node *n,
267 : locality_info *info)
268 : {
269 0 : for (cgraph_edge *e = n->callees; e; e = e->next_callee)
270 : {
271 0 : cgraph_node *c = e->callee->ultimate_alias_target ();
272 0 : if (c->inlined_to == node)
273 0 : populate_callee_locality_info (node, c, info);
274 : else
275 : {
276 0 : bool existed;
277 0 : locality_callee_info &cinfo = info->callee_info.get_or_insert
278 0 : (c->get_uid (), &existed);
279 0 : if (!existed)
280 : {
281 0 : cinfo.ultimate_callee = c;
282 0 : cinfo.ultimate_caller = node;
283 0 : cinfo.edge = e;
284 0 : cinfo.freq = e->sreal_frequency ();
285 : }
286 : else
287 0 : cinfo.freq = cinfo.freq + e->sreal_frequency ();
288 : }
289 : }
290 0 : }
291 :
292 : /* Forward declaration. */
293 : static int static_profile_cmp (const void *pa, const void *pb);
294 :
295 : /* Helper function for qsort; sort nodes in descending order of the frequency
296 : count of their template instantiation types. */
297 : static int
298 0 : static_profile_templ_cmp (const void *pa, const void *pb)
299 : {
300 0 : const locality_order *a = *static_cast<const locality_order *const *> (pa);
301 0 : const locality_order *b = *static_cast<const locality_order *const *> (pb);
302 0 : locality_info *ainfo = get_locality_info (a->node);
303 0 : templ_info *atinfo = templ_hash_map->get (ainfo->templ_hash);
304 0 : locality_info *binfo = get_locality_info (b->node);
305 0 : templ_info *btinfo = templ_hash_map->get (binfo->templ_hash);
306 0 : if ((atinfo && !btinfo)
307 0 : || (atinfo && btinfo && atinfo->order < btinfo->order))
308 : return -1;
309 0 : else if ((btinfo && !atinfo)
310 0 : || (atinfo && btinfo && atinfo->order > btinfo->order))
311 : return 1;
312 0 : return static_profile_cmp (pa, pb);
313 : }
314 :
315 : /* Helper function for qsort; sort template instantiation types in descending
316 : order of their frequency count. */
317 : static int
318 0 : sort_templ_hashes_cmp (const void *pa, const void *pb)
319 : {
320 0 : const std::pair<hashval_t, int> *a = static_cast<const std::pair<hashval_t,
321 : int> *> (pa);
322 0 : const std::pair<hashval_t, int> *b = static_cast<const std::pair<hashval_t,
323 : int> *> (pb);
324 0 : if (a->second < b->second)
325 : return 1;
326 0 : else if (a->second > b->second)
327 : return -1;
328 : else
329 : {
330 0 : long long int res = (long long int)a->first - (long long int)b->first;
331 : /* Hashes from templ_hash_map cannot be equal. */
332 0 : gcc_assert (res != 0);
333 0 : return res > 0 ? 1 : -1;
334 : }
335 : }
336 :
337 : /* Sort template instantiation types and record the sorted order. */
338 : static void
339 0 : order_templ_hashes ()
340 : {
341 0 : auto_vec<std::pair<hashval_t, int>> templ_hashes;
342 0 : hash_freq_map_t::iterator iter = templ_hash_map->begin ();
343 0 : for (; iter != templ_hash_map->end (); ++iter)
344 0 : templ_hashes.safe_push (std::make_pair ((*iter).first,
345 0 : (*iter).second.count));
346 0 : templ_hashes.qsort (sort_templ_hashes_cmp);
347 0 : for (unsigned i = 0; i < templ_hashes.length (); i++)
348 : {
349 0 : templ_info *tinfo = templ_hash_map->get (templ_hashes[i].first);
350 0 : gcc_assert (tinfo);
351 0 : tinfo->order = i;
352 : }
353 0 : }
354 :
355 : /* Populate locality_info for NODE from its direct callers. */
356 : static void
357 0 : populate_caller_locality_info (cgraph_node *node, locality_info *info)
358 : {
359 0 : struct cgraph_edge *e;
360 0 : for (e = node->callers; e; e = e->next_caller)
361 : {
362 : /* Make a local decision about all edges for EDGE->caller but not the
363 : other nodes already in the partition. Their edges will be visited
364 : later or may have been visited before and not fit the
365 : cut-off criteria. */
366 0 : if (auto cfreq = info->caller_freq.get (e->caller->get_uid ()))
367 0 : (*cfreq) = (*cfreq) + e->sreal_frequency ();
368 : else
369 0 : info->caller_freq.put (e->caller->get_uid (), e->sreal_frequency ());
370 : }
371 0 : }
372 :
373 : /* Return true if a template component is found in the demangled function name in
374 : DC. */
375 : static bool
376 0 : locality_dc_template_p (struct demangle_component *dc)
377 : {
378 0 : switch (dc->type)
379 : {
380 0 : case DEMANGLE_COMPONENT_QUAL_NAME:
381 0 : return locality_dc_template_p (dc->u.s_binary.left);
382 : case DEMANGLE_COMPONENT_TEMPLATE:
383 : return true;
384 0 : default:
385 0 : return false;
386 : }
387 : return false;
388 : }
389 :
390 : /* Generate hash for the template type from NODE's name if NODE represents a
391 : templated function. If present, record it in INFO. */
392 :
393 : static void
394 0 : populate_templ_info (cgraph_node *node, locality_info *info)
395 : {
396 0 : if (!info)
397 0 : return;
398 0 : void *alloc = NULL;
399 0 : const char *asm_name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME
400 : (node->decl));
401 0 : const char *demangled_name = cplus_demangle_v3 (asm_name, 0);
402 0 : struct demangle_component *dc =
403 0 : cplus_demangle_v3_components (asm_name, DMGL_NO_OPTS, &alloc);
404 :
405 0 : if (demangled_name && dc && locality_dc_template_p (dc))
406 : {
407 0 : const char *templ = strchr (demangled_name, '<');
408 0 : hashval_t t_hash = htab_hash_string (templ);
409 0 : info->templ_hash = t_hash;
410 :
411 0 : bool existed;
412 0 : templ_info &tinfo = templ_hash_map->get_or_insert (t_hash, &existed);
413 0 : if (existed)
414 0 : tinfo.count = tinfo.count + 1;
415 : else
416 0 : tinfo.count = 1;
417 : }
418 0 : free (alloc);
419 : }
420 :
421 : /* Initialize locality_info for node V. If CLONE_P is true, V is a locality
422 : clone; populate only callee information for locality clones, because caller
423 : info is needed only for cloning decisions and clones are not cloned again.
424 : Populate both caller and callee info for non-clone nodes. */
425 :
426 : static inline void
427 0 : create_locality_info (cgraph_node *v, bool templ_p = false,
428 : bool clone_p = false)
429 : {
430 0 : locality_info **info = node_to_ch_info->get (v->get_uid ());
431 0 : gcc_assert (!info);
432 :
433 0 : locality_info *vinfo = loc_infos.allocate ();
434 0 : vinfo->node = v;
435 0 : node_to_ch_info->put (v->get_uid (), vinfo);
436 :
437 : /* Locality clones are not cloned again. */
438 0 : if (!clone_p)
439 0 : populate_caller_locality_info (v, vinfo);
440 0 : populate_callee_locality_info (v, v, vinfo);
441 0 : sort_all_callees_default (vinfo);
442 0 : if (templ_p)
443 0 : populate_templ_info (v, vinfo);
444 0 : }
445 :
446 : /* Return true if NODE is already in some partition. */
447 : static inline bool
448 0 : node_partitioned_p (cgraph_node *node)
449 : {
450 0 : return node->aux;
451 : }
452 :
453 : /* Add symbol NODE to partition PART. */
454 : static void
455 0 : add_node_to_partition (locality_partition part, cgraph_node *node)
456 : {
457 0 : struct cgraph_edge *e;
458 0 : if (node_partitioned_p (node))
459 0 : return;
460 :
461 0 : part->nodes.safe_push (node);
462 0 : node->aux = (void *) (uintptr_t) (part->part_id);
463 :
464 0 : if (!node->alias && node->get_partitioning_class () == SYMBOL_PARTITION)
465 0 : part->insns += ipa_size_summaries->get (node)->size;
466 :
467 : /* Add all inline clones and callees that are duplicated. */
468 0 : for (e = node->callees; e; e = e->next_callee)
469 0 : if (!e->inline_failed)
470 0 : add_node_to_partition (part, e->callee);
471 : /* omp declare_variant_alt or transparent_alias with definition or linker
472 : discardable (non-local comdat but not forced and not
473 : used by non-LTO). */
474 0 : else if (e->callee->get_partitioning_class () == SYMBOL_DUPLICATE)
475 0 : add_node_to_partition (part, e->callee);
476 :
477 : /* Add all thunks associated with the function. */
478 0 : for (e = node->callers; e; e = e->next_caller)
479 0 : if (e->caller->thunk && !e->caller->inlined_to)
480 0 : add_node_to_partition (part, e->caller);
481 :
482 : /* Add all aliases associated with the symbol. */
483 : struct ipa_ref *ref;
484 0 : FOR_EACH_ALIAS (node, ref)
485 0 : if (!ref->referring->transparent_alias)
486 : {
487 0 : cgraph_node *referring = dyn_cast<cgraph_node *> (ref->referring);
488 : /* Only add function aliases.
489 : Varpool refs are added later in LTO partitioning pass. */
490 0 : if (referring)
491 0 : add_node_to_partition (part, referring);
492 : }
493 : else
494 : {
495 : struct ipa_ref *ref2;
496 : /* We do not need to add transparent aliases if they are not used.
497 : However we must add aliases of transparent aliases if they exist. */
498 0 : FOR_EACH_ALIAS (ref->referring, ref2)
499 : {
500 : /* Nested transparent aliases are not permitted. */
501 0 : gcc_checking_assert (!ref2->referring->transparent_alias);
502 0 : cgraph_node *referring = dyn_cast<cgraph_node *> (ref2->referring);
503 0 : if (referring)
504 0 : add_node_to_partition (part, referring);
505 : }
506 : }
507 : }
508 :
509 : /* Return TRUE if NODE is in PARTITION. */
510 : static bool
511 0 : node_in_partition_p (locality_partition partition, cgraph_node *node)
512 : {
513 0 : return ((uintptr_t) (partition->part_id) == (uintptr_t) (node->aux));
514 : }
515 :
516 : /* Helper function for qsort; to break ties. */
517 : static int
518 0 : compare_node_uids (cgraph_node *n1, cgraph_node *n2)
519 : {
520 0 : int res = n1->get_uid () - n2->get_uid ();
521 0 : gcc_assert (res != 0);
522 0 : return res > 0 ? 1 : -1;
523 : }
524 :
525 : /* Helper function for qsort; sort nodes by order. */
526 : static int
527 0 : static_profile_cmp (const void *pa, const void *pb)
528 : {
529 0 : const locality_order *a = *static_cast<const locality_order *const *> (pa);
530 0 : const locality_order *b = *static_cast<const locality_order *const *> (pb);
531 : /* Ascending order. */
532 0 : if (b->order < a->order)
533 : return 1;
534 0 : if (b->order > a->order)
535 : return -1;
536 0 : return compare_node_uids (a->node, b->node);
537 : }
538 :
539 : /* Helper function for qsort; sort nodes by profile count. */
540 : static int
541 0 : compare_edge_profile_counts (const void *pa, const void *pb)
542 : {
543 0 : const locality_order *a = *static_cast<const locality_order *const *> (pa);
544 0 : const locality_order *b = *static_cast<const locality_order *const *> (pb);
545 :
546 0 : profile_count cnt1 = a->node->count.ipa ();
547 0 : profile_count cnt2 = b->node->count.ipa ();
548 0 : if (!cnt1.compatible_p (cnt2))
549 0 : return static_profile_cmp (pa, pb);
550 :
551 0 : if (cnt1 < cnt2)
552 : return 1;
553 0 : if (cnt1 > cnt2)
554 : return -1;
555 0 : return static_profile_cmp (pa, pb);
556 : }
557 :
558 : /* Create and return a new partition and increment NPARTITIONS. */
559 :
560 : static locality_partition
561 0 : create_partition (int &npartitions)
562 : {
563 0 : locality_partition part = XCNEW (struct locality_partition_def);
564 0 : npartitions++;
565 0 : part->part_id = npartitions;
566 0 : part->nodes.create (1);
567 0 : part->insns = 0;
568 0 : locality_partitions.safe_push (part);
569 0 : return part;
570 : }
571 :
572 : /* Structure for holding profile count information of callers of a node. */
573 : struct profile_stats
574 : {
575 : /* Sum of non-recursive call counts. */
576 : profile_count nonrec_count;
577 :
578 : /* Sum of recursive call counts. */
579 : profile_count rec_count;
580 :
581 : /* If non-NULL, this node is the target of alias or thunk and calls from this
582 : should be counted in rec_count. */
583 : cgraph_node *target;
584 : };
585 :
586 : /* Initialize fields of STATS. */
587 : static inline void
588 0 : init_profile_stats (profile_stats *stats, cgraph_node *target = NULL)
589 : {
590 0 : stats->nonrec_count = profile_count::zero ();
591 0 : stats->rec_count = profile_count::zero ();
592 0 : stats->target = target;
593 0 : }
594 :
595 : /* Helper function to accumulate call counts. */
596 : static bool
597 0 : accumulate_profile_counts_after_cloning (cgraph_node *node, void *data)
598 : {
599 0 : struct profile_stats *stats = (struct profile_stats *) data;
600 0 : for (cgraph_edge *e = node->callers; e; e = e->next_caller)
601 : {
602 0 : if (!e->count.initialized_p ())
603 0 : continue;
604 :
605 0 : if (e->caller == stats->target)
606 0 : stats->rec_count += e->count.ipa ();
607 : else
608 0 : stats->nonrec_count += e->count.ipa ();
609 : }
610 0 : return false;
611 : }
612 :
613 : /* NEW_NODE is a previously created clone of ORIG_NODE already present in
614 : current partition. EDGES contains newly redirected edges to NEW_NODE.
615 : Adjust profile information for both nodes and the edge. */
616 :
617 : static void
618 0 : adjust_profile_info_for_non_self_rec_edges (auto_vec<cgraph_edge *> &edges,
619 : cgraph_node *new_node,
620 : cgraph_node *orig_node)
621 : {
622 0 : profile_count orig_node_count = orig_node->count.ipa ();
623 0 : profile_count edge_count = profile_count::zero ();
624 0 : profile_count final_new_count = profile_count::zero ();
625 0 : profile_count final_orig_count = profile_count::zero ();
626 :
627 0 : for (unsigned i = 0; i < edges.length (); ++i)
628 0 : if (edges[i]->count.initialized_p ())
629 0 : edge_count += edges[i]->count.ipa ();
630 :
631 0 : final_orig_count = orig_node_count - edge_count;
632 :
633 : /* NEW_NODE->count was adjusted for other callers when the clone was
634 : first created. Just add the new edge count. */
635 0 : final_new_count = new_node->count + edge_count;
636 :
637 0 : final_new_count = orig_node_count.combine_with_ipa_count (final_new_count);
638 0 : orig_node->count = final_orig_count;
639 0 : new_node->count = final_new_count;
640 :
641 0 : if (dump_file)
642 : {
643 0 : fprintf (dump_file, "Adjusting profile information for %s\n",
644 : new_node->dump_asm_name ());
645 0 : fprintf (dump_file, "\tOriginal node %s\n", orig_node->dump_asm_name ());
646 0 : fprintf (dump_file, "\tOriginal count: ");
647 0 : orig_node_count.dump (dump_file);
648 0 : fprintf (dump_file, "\n\tAdjusted original count to: ");
649 0 : final_orig_count.dump (dump_file);
650 0 : fprintf (dump_file, "\n\tAdjusted clone count to: ");
651 0 : final_new_count.dump (dump_file);
652 0 : fprintf (dump_file, "\n");
653 : }
654 :
655 : /* Scale all callee edges according to adjusted counts. */
656 0 : profile_count orig_node_count_copy = orig_node_count;
657 0 : profile_count::adjust_for_ipa_scaling (&final_new_count,
658 : &orig_node_count_copy);
659 0 : for (cgraph_edge *cs = new_node->callees; cs; cs = cs->next_callee)
660 0 : cs->count = cs->count.apply_scale (final_new_count, orig_node_count_copy);
661 0 : for (cgraph_edge *cs = new_node->indirect_calls; cs; cs = cs->next_callee)
662 0 : cs->count = cs->count.apply_scale (final_new_count, orig_node_count_copy);
663 :
664 0 : profile_count::adjust_for_ipa_scaling (&final_orig_count, &orig_node_count);
665 0 : for (cgraph_edge *cs = orig_node->callees; cs; cs = cs->next_callee)
666 0 : cs->count = cs->count.apply_scale (final_orig_count, orig_node_count);
667 0 : for (cgraph_edge *cs = orig_node->indirect_calls; cs; cs = cs->next_callee)
668 0 : cs->count = cs->count.apply_scale (final_orig_count, orig_node_count);
669 0 : }
670 :
671 : /* Adjust profile counts of NEW_NODE and ORIG_NODE, where NEW_NODE is a clone
672 : of OLD_NODE.
673 : Assumes that all eligible edges from current partition so far are redirected
674 : to NEW_NODE and recursive edges are adjusted. */
675 :
676 : static void
677 0 : adjust_profile_info (cgraph_node *new_node, cgraph_node *orig_node)
678 : {
679 : /* If all calls to NEW_NODE are non-recursive, subtract corresponding count
680 : from ORIG_NODE and assign to NEW_NODE, any unexpected remainder stays with
681 : ORIG_NODE.
682 : Recursive calls if present, likely contribute to majority of count;
683 : scale according to redirected callers' count. */
684 :
685 0 : profile_count orig_node_count = orig_node->count.ipa ();
686 0 : profile_stats new_stats, orig_stats;
687 :
688 0 : init_profile_stats (&new_stats);
689 0 : init_profile_stats (&orig_stats);
690 :
691 0 : new_node->call_for_symbol_thunks_and_aliases
692 0 : (accumulate_profile_counts_after_cloning, &new_stats, false);
693 0 : orig_node->call_for_symbol_thunks_and_aliases
694 0 : (accumulate_profile_counts_after_cloning, &orig_stats, false);
695 :
696 0 : profile_count orig_nonrec_count = orig_stats.nonrec_count;
697 0 : profile_count orig_rec_count = orig_stats.rec_count;
698 0 : profile_count new_nonrec_count = new_stats.nonrec_count;
699 0 : profile_count new_rec_count = new_stats.rec_count;
700 :
701 0 : profile_count final_new_count = new_nonrec_count;
702 0 : profile_count final_orig_count = profile_count::zero ();
703 :
704 : /* All calls to NEW_NODE are non-recursive or recursive calls have
705 : zero count. */
706 0 : if (!new_rec_count.nonzero_p ())
707 0 : final_orig_count = orig_node_count - new_nonrec_count;
708 : else
709 : {
710 : /* If ORIG_NODE is externally visible, indirect calls or calls from
711 : another part of the code may contribute to the count.
712 : update_profiling_info () from ipa-cp.cc pretends to have an extra
713 : caller to represent the extra counts. */
714 0 : if (!orig_node->local)
715 : {
716 0 : profile_count pretend_count = (orig_node_count - new_nonrec_count -
717 0 : orig_nonrec_count - orig_rec_count);
718 0 : orig_nonrec_count += pretend_count;
719 : }
720 :
721 : /* Remaining rec_count is assigned in proportion to clone's non-recursive
722 : count. */
723 0 : profile_count rec_count = orig_node_count - new_nonrec_count
724 0 : - orig_nonrec_count;
725 0 : profile_count new_rec_scaled
726 0 : = rec_count.apply_scale (new_nonrec_count,
727 : new_nonrec_count + orig_nonrec_count);
728 0 : final_new_count += new_rec_scaled;
729 0 : final_orig_count = orig_node_count - final_new_count;
730 : }
731 :
732 0 : final_new_count = orig_node_count.combine_with_ipa_count (final_new_count);
733 0 : new_node->count = final_new_count;
734 0 : orig_node->count = final_orig_count;
735 :
736 0 : if (dump_file)
737 : {
738 0 : fprintf (dump_file, "Adjusting profile information for %s\n",
739 : new_node->dump_asm_name ());
740 0 : fprintf (dump_file, "\tOriginal node %s\n", orig_node->dump_asm_name ());
741 0 : fprintf (dump_file, "\tOriginal count: ");
742 0 : orig_node_count.dump (dump_file);
743 0 : fprintf (dump_file, "\n\tAdjusted original count to: ");
744 0 : final_orig_count.dump (dump_file);
745 0 : fprintf (dump_file, "\n\tAdjusted clone count to: ");
746 0 : final_new_count.dump (dump_file);
747 0 : fprintf (dump_file, "\n");
748 : }
749 :
750 : /* Scale all callee edges according to adjusted counts. */
751 0 : profile_count orig_node_count_copy = orig_node_count;
752 0 : profile_count::adjust_for_ipa_scaling (&final_new_count,
753 : &orig_node_count_copy);
754 0 : for (cgraph_edge *cs = new_node->callees; cs; cs = cs->next_callee)
755 0 : cs->count = cs->count.apply_scale (final_new_count, orig_node_count_copy);
756 0 : for (cgraph_edge *cs = new_node->indirect_calls; cs; cs = cs->next_callee)
757 0 : cs->count = cs->count.apply_scale (final_new_count, orig_node_count_copy);
758 :
759 0 : profile_count::adjust_for_ipa_scaling (&final_orig_count, &orig_node_count);
760 0 : for (cgraph_edge *cs = orig_node->callees; cs; cs = cs->next_callee)
761 0 : cs->count = cs->count.apply_scale (final_orig_count, orig_node_count);
762 0 : for (cgraph_edge *cs = orig_node->indirect_calls; cs; cs = cs->next_callee)
763 0 : cs->count = cs->count.apply_scale (final_orig_count, orig_node_count);
764 0 : }
765 :
766 : /* Return true if EDGE can be safely redirected to another callee. */
767 : static inline bool
768 0 : edge_redirectable_p (cgraph_edge *edge, lto_locality_cloning_model cm)
769 : {
770 0 : if (cm == LTO_LOCALITY_NON_INTERPOSABLE_CLONING)
771 : {
772 : /* Interposability may change on edge basis. */
773 0 : enum availability avail;
774 0 : avail = edge->callee->get_availability (edge->caller);
775 0 : if (avail <= AVAIL_INTERPOSABLE)
776 : return false;
777 : }
778 : return true;
779 : }
780 :
781 : /* Create a locality clone of CNODE and redirect all callers present in
782 : PARTITION.
783 : Create a clone depending on whether CNODE itself is a clone or not. */
784 :
785 : static cgraph_node *
786 0 : create_locality_clone (cgraph_node *cnode,
787 : locality_partition partition, int &cl_num,
788 : lto_locality_cloning_model cm)
789 : {
790 0 : cgraph_node *cl_node = NULL;
791 0 : vec<cgraph_edge *> redirect_callers = vNULL;
792 : /* All callers of cnode in current partition are redirected. */
793 0 : struct cgraph_edge *edge;
794 0 : for (edge = cnode->callers; edge; edge = edge->next_caller)
795 : {
796 0 : struct cgraph_node *caller = edge->caller;
797 0 : if (node_in_partition_p (partition, caller) && caller->definition
798 0 : && caller != cnode && edge_redirectable_p (edge, cm))
799 0 : redirect_callers.safe_push (edge);
800 : }
801 :
802 0 : const char *suffix = "locality_clone";
803 :
804 0 : tree old_decl = cnode->decl;
805 0 : tree new_decl = copy_node (old_decl);
806 :
807 : /* Generate a new name for the new version. */
808 0 : const char *name = IDENTIFIER_POINTER (DECL_NAME (old_decl));
809 0 : DECL_NAME (new_decl) = clone_function_name (name, suffix, cl_num);
810 0 : SET_DECL_ASSEMBLER_NAME (new_decl,
811 : clone_function_name (old_decl, suffix, cl_num));
812 0 : cl_num++;
813 0 : if (dump_file)
814 0 : fprintf (dump_file, "\tNew name %s\n",
815 0 : IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (new_decl)));
816 :
817 0 : cl_node = cnode->create_clone (new_decl, cnode->count /*profile_count*/,
818 : false /*update_original*/, redirect_callers,
819 : false /*call_duplication_hook*/,
820 : NULL /*new_inlined_to*/,
821 : NULL /*param_adjustments*/, suffix);
822 :
823 0 : set_new_clone_decl_and_node_flags (cl_node);
824 :
825 0 : if (cnode->ipa_transforms_to_apply.exists ())
826 0 : cl_node->ipa_transforms_to_apply
827 0 : = cnode->ipa_transforms_to_apply.copy ();
828 :
829 0 : if (dump_file)
830 : {
831 0 : fprintf (dump_file, "Cloned Node: %s %s\n", cnode->dump_asm_name (),
832 : cl_node->dump_asm_name ());
833 :
834 0 : for (edge = cl_node->callers; edge; edge = edge->next_caller)
835 0 : fprintf (dump_file, "Redirected callers: %s\n",
836 0 : edge->caller->dump_asm_name ());
837 :
838 0 : for (edge = cl_node->callees; edge; edge = edge->next_callee)
839 0 : fprintf (dump_file, "Callees of clone: %s %d\n",
840 0 : edge->callee->dump_asm_name (), edge->frequency ());
841 : }
842 0 : return cl_node;
843 : }
844 :
/* Redirect recursive edges of CLONE to correctly point to CLONE.  As part of
   the cloning process, all callee edges of a node are just duplicated but not
   redirected.  Therefore, these edges still call to the original of CLONE.

   For non-inlined CLONEs, NEW_CALLEE == CLONE and ORIG_CALLEE is CLONE's
   original node.

   For an inlined node, self recursion to CLONE's original is the same as in
   the non-inlined case; additionally, calls to CLONE->inlined_to are also
   recursive:
   NEW_CALLEE == CLONE->inlined_into and
   ORIG_CALLEE == original node of CLONE->inlined_into.  */

static void
adjust_recursive_callees (cgraph_node *clone, cgraph_node *new_callee,
			  cgraph_node *orig_callee)
{
  cgraph_node *alias = NULL;
  for (cgraph_edge *e = clone->callees; e; e = e->next_callee)
    {
      /* Skip inlined edges; those are handled by inline cloning.  */
      if (!e->inline_failed)
	continue;

      /* Only self-cycle or local alias are handled.  */
      cgraph_node *callee = e->callee;
      if (callee == orig_callee)
	{
	  /* Direct self-recursion: redirect the edge to the clone recorded
	     for ORIG_CALLEE, which must be NEW_CALLEE.  */
	  cgraph_node **cl = node_to_clone->get (orig_callee->get_uid ());
	  gcc_assert (cl && *cl == new_callee);
	  e->redirect_callee_duplicating_thunks (new_callee);
	  if (dump_file)
	    fprintf (dump_file, "recursive call from %s to %s orig %s\n",
		     e->caller->dump_asm_name (), e->callee->dump_asm_name (),
		     callee->dump_asm_name ());
	}
      else if (callee->alias
	       && e->callee->ultimate_alias_target () == orig_callee)
	{
	  /* Recursion through an alias: lazily create one non-interposable
	     alias of NEW_CALLEE and redirect all such edges to it.  */
	  if (!alias)
	    {
	      alias = dyn_cast<cgraph_node *> (
		new_callee->noninterposable_alias ());
	    }
	  e->redirect_callee_duplicating_thunks (alias);
	  if (dump_file)
	    fprintf (dump_file, "recursive call from %s to %s orig %s\n",
		     e->caller->dump_asm_name (), e->callee->dump_asm_name (),
		     callee->dump_asm_name ());
	}
    }
  /* Materialize any artificial thunks created by the redirections above.  */
  new_callee->expand_all_artificial_thunks ();
  if (alias)
    alias->expand_all_artificial_thunks ();
}
898 :
/* Create clones for CALLER's inlined callees; ORIG_INLINED_TO is the original
   node from clone_as_needed () such that new_inlined_to is a clone of it.  */

static void
inline_clones (cgraph_node *caller, cgraph_node *orig_inlined_to)
{
  struct cgraph_edge *edge;
  for (edge = caller->callees; edge; edge = edge->next_callee)
    {
      struct cgraph_node *callee = edge->callee;
      /* Out-of-line edges need no 1-1 inline copies; skip them.  */
      if (edge->inline_failed)
	continue;

      /* Only clone callees inlined into the callchain being cloned.  */
      if (callee->inlined_to != orig_inlined_to)
	continue;

      /* The clone's inlined_to is CALLER's inlined_to, or CALLER itself
	 when CALLER is the out-of-line root of the inline tree.  */
      struct cgraph_node *new_inlined_to, *cl;
      if (caller->inlined_to)
	new_inlined_to = caller->inlined_to;
      else
	new_inlined_to = caller;

      cl = callee->create_clone (callee->decl,
				 edge->count /*profile_count*/,
				 true /*update_original*/,
				 vNULL /*redirect_callers*/,
				 false /*call_duplication_hook*/,
				 new_inlined_to /*new_inlined_to*/,
				 NULL /*param_adjustments*/,
				 "locality_clone" /*suffix*/);
      edge->redirect_callee (cl);

      /* Record the mapping both ways for later clone lookups.  */
      node_to_clone->put (callee->get_uid (), cl);
      clone_to_node->put (cl->get_uid (), callee);

      /* Copy thunk info so the clone stays a valid thunk.  */
      if (callee->thunk)
	{
	  thunk_info *info = thunk_info::get (callee);
	  *thunk_info::get_create (cl) = *info;
	}

      adjust_recursive_callees (cl, new_inlined_to, orig_inlined_to);
      adjust_recursive_callees (cl, cl, callee);
      if (dump_file)
	{
	  fprintf (dump_file, "Inline cloned\n");
	  cl->dump (dump_file);
	}

      /* Recursively inline till end of this callchain.  */
      inline_clones (cl, orig_inlined_to);
    }
}
952 :
/* Clone EDGE->CALLEE if it or a clone of it is not already in PARTITION.
   Redirect all callers of EDGE->CALLEE that are in PARTITION, not just the
   EDGE.  If a clone is already present in PARTITION, redirect all edges from
   EDGE->CALLER to EDGE->CALLEE.  This is because we only visit one edge per
   caller to callee and redirect for all others from there.

   If cloning, also recursively clone inlined functions till the end of the
   callchain because inlined clones have 1-1 exclusive copy and edge from
   caller to inlined node.

   There are 2 flows possible:
   1. Only redirect
      1.1. cnode is already in current partition - cnode mustn't be a
      locality_clone -> nothing to do
      1.2. A clone of cnode is in current partition - find out if it's the
      correct clone for edge - must be a locality_clone but the exact same
      kind as callee i.e. orig or cp/sra clone, if yes, redirect, else go to #2
      1.3. Cnode/a clone of cnode is in current partition but caller is inlined
   2. Clone and redirect
      2.1. cnode is original node
      2.2. cnode itself is a clone
      Clone inlines
   Flavors of edges:
   1. Normal -> orig nodes, locality clones or cp/sra clones
   2. Recursive -> direct recursion
   3. Alias -> recursion via aliasing or as a result of IPA code duplication
   4. Inline -> shouldn't be included in callchain.  */

static cgraph_node *
clone_node_as_needed (cgraph_edge *edge, locality_partition partition,
		      int &cl_num, lto_locality_cloning_model cm)
{
  /* suitable_for_locality_cloning_p () currently prohibits cloning aliases due
     to potential versioning and materialization issues.  Could be enabled in
     the future.  suitable_for_locality_cloning_p () also checks for
     interposability for CNODE but not for edge redirection.  */
  struct cgraph_node *cnode = edge->callee;
  struct cgraph_node *caller = edge->caller;

  /* If clone of cnode is already in the partition
     Get latest clone of cnode.  If current partition has cloned cnode, that
     clone should be returned.  Otherwise, clone from previous partition is
     returned
     Original node and its clone shouldn't co-exist in current partition

     This is required if callee is partitioned via another edge before caller
     was, and we are now visiting caller->callee edge

     1) a -> b ==> a -> bc1; b was cloned say via d -> bc1, a is orig
     2) ac1 -> b ==> ac1 -> bc1; b was cloned and a was just cloned
     3) a -> bc1 and bc2 present, mustn't happen, b was cloned and a was
     redirected without being partitioned first.
     Why will we do this again - multiple edges and something's wrong in
     partition_callchain ()
     4) ac1 -> bc1 ==> ac1 -> bc2; a was cloned and we already got (1) in some
     other partition
     5) ac1 -> bc1 but no clone present in this PARTITION.  Create from b, not
     from bc1?
     6) a -> b; a -> bc0; create new clone, no clone present
     7) ac0 -> b; ac0 -> bc0 same as (6)
     8) a -> bc0 and no clone present, mustn't happen, same as (3)

     Redirect when bc1 is present and:
     a -> b or ac -> b or ac -> bc0 */

  /* If CNODE is itself a locality clone, look up via its original node.  */
  cgraph_node *orig_cnode = cnode;
  cgraph_node **o_cnode = clone_to_node->get (cnode->get_uid ());
  if (o_cnode)
    orig_cnode = *o_cnode;

  cgraph_node **cnode_cl = node_to_clone->get (orig_cnode->get_uid ());

  if (cnode_cl && node_in_partition_p (partition, *cnode_cl))
    {
      if (node_in_partition_p (partition, caller))
	{
	  /* A suitable clone already lives in PARTITION: redirect every
	     CALLER->CNODE edge to it instead of creating a new clone.  */
	  bool clone_p = false;
	  auto_vec<cgraph_edge *> redirected_edges;
	  for (cgraph_edge *ec = caller->callees; ec; ec = ec->next_callee)
	    if (ec->callee == cnode && edge_redirectable_p (ec, cm))
	      {
		ec->redirect_callee_duplicating_thunks (*cnode_cl);
		clone_p = true;
		redirected_edges.safe_push (ec);
		if (dump_file)
		  {
		    fprintf (dump_file, "clone present %s %s redirecting %s\n",
			     cnode->dump_asm_name (),
			     (*cnode_cl)->dump_asm_name (),
			     caller->dump_asm_name ());
		  }
	      }
	  if (clone_p)
	    {
	      (*cnode_cl)->expand_all_artificial_thunks ();
	      adjust_profile_info_for_non_self_rec_edges (redirected_edges,
							  *cnode_cl, cnode);
	      return NULL;
	    }
	}
    }

  /* Create a new clone for a -> b, ac -> b.
     For ac -> bc, should be done on bc or b?
     bc could be from b_cp/b_sra or b.  */

  if (orig_cnode != cnode)
    {
      /* CNODE is itself a locality clone - do not clone a clone again.  */
      if (dump_file)
	fprintf (dump_file, "Clone of clone %s %s\n", cnode->dump_asm_name (),
		 orig_cnode->dump_asm_name ());
      return NULL;
    }

  struct cgraph_node *cloned_node
    = create_locality_clone (cnode, partition, cl_num, cm);

  gcc_assert (cloned_node);
  if (!cloned_node)
    return NULL;

  /* Record the mapping both ways for later lookups.  */
  node_to_clone->put (cnode->get_uid (), cloned_node);
  clone_to_node->put (cloned_node->get_uid (), cnode);

  adjust_recursive_callees (cloned_node, cloned_node, cnode);
  symtab->call_cgraph_duplication_hooks (cnode, cloned_node);

  adjust_profile_info (cloned_node, cnode);
  /* Inline clones are created iff their inlined_to == CNODE.  */
  inline_clones (cloned_node, cnode);

  return cloned_node;
}
1086 :
1087 : /* Determine if EDGE->CALLEE is suitable for cloning. It is assummed that the
1088 : callee is not an inlined node. */
1089 :
1090 : static bool
1091 0 : suitable_for_locality_cloning_p (cgraph_edge *edge,
1092 : lto_locality_cloning_model cm)
1093 : {
1094 0 : cgraph_node *node = edge->callee;
1095 0 : if (!node->versionable)
1096 : return false;
1097 :
1098 : /* Out-of-line locality clones of ipcp or sra clones will be created in this
1099 : pass after IPA inline is run. A locality clone has the same function
1100 : body and the same updated signature as the ipcp/sra clone.
1101 : This fails or asserts based on how the clone is created:
1102 : 1. If param_adjustments and tree_map are not recorded for locality clone:
1103 : clone materialization (tree_function_versioning ()) fails when
1104 : updating signature and remapping calls because clone_of (ipcp/sra
1105 : clone) and locality clone differ in param information.
1106 : 2. If param_adjustments and tree_map are provided: asserts are triggered
1107 : in fnsummary duplication because IPA inline resets some summaries.
1108 :
1109 : One inelegant solution is to provide param_adjustments and tree_map, and
1110 : then set clone_of to ipcp/sra clone's clone_of. However, this sometimes
1111 : results in segmentation fault when the compiled program is run.
1112 : Disabling clone of clones altogether for now with an aim to resolve this
1113 : is future. */
1114 0 : if (node->clone_of)
1115 : return false;
1116 :
1117 0 : if (node->alias)
1118 : return false;
1119 :
1120 0 : if (edge->recursive_p ())
1121 : return false;
1122 :
1123 0 : if (!node->definition)
1124 : return false;
1125 :
1126 : /* Don't clone NODE if IPA count of NODE or EDGE is zero. */
1127 0 : if (!node->count.ipa ().nonzero_p () || !edge->count.ipa ().nonzero_p ())
1128 0 : return false;
1129 :
1130 0 : if (cm == LTO_LOCALITY_NON_INTERPOSABLE_CLONING)
1131 : {
1132 : /* Interposability may change on edge basis. */
1133 0 : enum availability avail;
1134 0 : edge->callee->ultimate_alias_target (&avail, edge->caller);
1135 0 : if (avail <= AVAIL_INTERPOSABLE)
1136 0 : return false;
1137 : }
1138 :
1139 : return true;
1140 : }
1141 :
1142 : /* Return true if edge->callee->ultimate_alias_target can be cloned. */
1143 : static bool
1144 0 : clone_node_p (cgraph_edge *edge, lto_locality_cloning_model cloning_model,
1145 : double freq_cutoff, int size)
1146 : {
1147 0 : cgraph_node *node = edge->callee->ultimate_alias_target ();
1148 :
1149 0 : if (!suitable_for_locality_cloning_p (edge, cloning_model))
1150 : return false;
1151 :
1152 0 : if (!node->alias)
1153 0 : if (ipa_size_summaries->get (node)->size >= size)
1154 : return false;
1155 :
1156 0 : if (freq_cutoff != 0.0)
1157 : {
1158 0 : locality_info *info = get_locality_info (node);
1159 0 : gcc_assert (info);
1160 0 : if (auto cfreq = info->caller_freq.get (edge->caller->get_uid ()))
1161 : {
1162 0 : if ((*cfreq).to_double () < freq_cutoff)
1163 : return false;
1164 : }
1165 0 : else if (edge->sreal_frequency ().to_double () < freq_cutoff)
1166 : return false;
1167 : }
1168 :
1169 : return true;
1170 : }
1171 :
/* Partition NODE's callees into PARTITION or clone if already partitioned and
   satisfies cloning criteria such as CLONING_MODEL, REAL_FREQ and SIZE
   cut-offs.  */

/* The callgraph can have multiple caller to callee edges for multiple
   callsites.  For the first such edge, we make decisions about cutoffs and
   cloning because we redirect ALL callsites to the cloned callee, not just
   one of them.  */

static void
partition_callchain (cgraph_node *node, locality_partition &partition,
		     lto_locality_cloning_model cloning_model,
		     double freq_cutoff, int size, int &cl_num,
		     int &npartitions, int64_t partition_size,
		     lto_locality_heuristics scheme)
{
  /* Aliases are added in the same partition as their targets.
     Aliases are not cloned and their callees are not processed separately.  */
  cgraph_node *cl_node = NULL;
  /* Open a fresh partition once the current one exceeds its size budget.  */
  if (partition->insns > partition_size)
    partition = create_partition (npartitions);

  /* Iterate over all unique callees of NODE, direct callees and callees via
     inlined nodes.  This avoids calling partition_callchain () separately for
     inlined nodes which themselves are already partitioned along with their
     inlined_to nodes.  */
  locality_info *info = get_locality_info (node);
  if (scheme == LTO_LOCALITY_CPP_TEMPLATE)
    info->all_callees.qsort (callee_templ_cmp);
  for (unsigned i = 0; i < info->all_callees.length (); i++)
    {
      cgraph_edge *e = info->all_callees[i]->edge;
      cgraph_node *n = e->callee->ultimate_alias_target ();
      if (n->get_partitioning_class () == SYMBOL_PARTITION)
	{
	  if (!node_partitioned_p (n))
	    {
	      /* First visit of N: place it here, then recurse down its own
		 callchain.  */
	      add_node_to_partition (partition, n);
	      if (dump_file)
		fprintf (dump_file, "Partitioned node: %s\n",
			 n->dump_asm_name ());
	      partition_callchain (n, partition, cloning_model, freq_cutoff,
				   size, cl_num, npartitions, partition_size,
				   scheme);
	    }
	  else if (cloning_model >= LTO_LOCALITY_NON_INTERPOSABLE_CLONING
		   && (!e->callee->alias)
		   && node_in_partition_p (partition, e->caller)
		   && (!node_in_partition_p (partition, n)))

	    {
	      /* 3 possible scenarios if N is already partitioned but not
		 present in PARTITION:
		 1. There's a clone of N present in PARTITION, redirect to that
		    clone, no need to check for suitability.
		 2. N itself is a locality clone, cloned as part of another
		    callchain.  If a clone of N's original node is present in
		    PARTITION, redirect to it without checking for suitability.
		    Cloned node itself is not cloned again.
		    Example: suppose N = B_clone ().
		    In partition X, edge A->B was transformed to A->B_clone0.
		    In current partition, A was cloned to A_clone0 and now
		    B_clone0 is visited via edge A_clone0->B_clone0.  If a
		    B_clonei is present, redirect A_clone0 to it, otherwise do
		    nothing.
		 3. N is not a locality clone and no clone of N is present in
		    PARTITION, check for suitability and clone.  */
	      cgraph_node *orig_cnode = n;
	      cgraph_node **o_cnode = clone_to_node->get (n->get_uid ());
	      if (o_cnode)
		orig_cnode = *o_cnode;

	      cgraph_node **cnode_cl = node_to_clone->get (orig_cnode->get_uid
							   ());

	      if ((cnode_cl && node_in_partition_p (partition, *cnode_cl))
		  || (orig_cnode == n
		      && clone_node_p (e, cloning_model, freq_cutoff, size)))
		{
		  cl_node = clone_node_as_needed (e, partition, cl_num,
						  cloning_model);
		  if (cl_node)
		    {
		      /* A fresh clone was created: give it locality info,
			 place it, and continue down its callchain.  */
		      create_locality_info (cl_node,
					    scheme == LTO_LOCALITY_CPP_TEMPLATE,
					    true);
		      add_node_to_partition (partition, cl_node);
		      partition_callchain (cl_node, partition, cloning_model,
					   freq_cutoff, size, cl_num,
					   npartitions, partition_size, scheme);
		    }
		}
	    }
	}
    }
}
1267 :
1268 : /* Determine whether NODE is an entrypoint to a callchain. */
1269 :
1270 : static bool
1271 0 : is_entry_node_p (cgraph_node *node)
1272 : {
1273 : /* node->inlined_to is returned as SYMBOL_DUPLICATE. */
1274 0 : if (node->get_partitioning_class () != SYMBOL_PARTITION)
1275 : return false;
1276 :
1277 : /* NODE and all its aliases or thunk are local. */
1278 0 : if (node->local_p ())
1279 : return false;
1280 :
1281 0 : if (!node->callers)
1282 : return true;
1283 :
1284 0 : for (cgraph_edge *e = node->callers; e; e = e->next_caller)
1285 : {
1286 0 : if (! e->recursive_p ())
1287 : return false;
1288 : }
1289 :
1290 : return true;
1291 : }
1292 :
/* Determine order of all external nodes if PGO profile is available.
   Store the order in ORDER.  Return false if ordering must be abandoned
   because some candidate node is marked no_reorder.  */

static bool
locality_determine_ipa_order (auto_vec<locality_order *> *order)
{
  struct cgraph_node *node;
  auto_vec<locality_order *> non_comparable_nodes;
  FOR_EACH_DEFINED_FUNCTION (node)
    if (node->get_partitioning_class () == SYMBOL_PARTITION)
      {
	create_locality_info (node);
	if (node->no_reorder)
	  {
	    if (dump_file)
	      fprintf (dump_file, "no reorder %s\n", node->dump_asm_name ());
	    /* NOTE(review): locality_order objects already pushed to ORDER or
	       NON_COMPARABLE_NODES are not freed on this early return - the
	       caller frees ORDER only on success.  Confirm this one-shot leak
	       is acceptable.  */
	    return false;
	  }
	else if (is_entry_node_p (node))
	  {
	    /* Without a usable IPA count, queue the node separately so it
	       ends up after all comparable nodes.  */
	    profile_count pcnt = node->count.ipa ();
	    if (!pcnt.initialized_p () || !pcnt.ipa_p ())
	      {
		sreal cnt = 0;
		locality_order *lo = new locality_order (node, cnt);
		non_comparable_nodes.safe_push (lo);
		continue;
	      }
	    /* Weight the node by the sum of its outgoing edge frequencies.  */
	    sreal count = 0;
	    struct cgraph_edge *edge;
	    for (edge = node->callees; edge; edge = edge->next_callee)
	      {
		/* For PGO, frequency is not used in
		   compare_edge_profile_counts (), it's used only as part of
		   static profile order.  */
		sreal freq = edge->sreal_frequency ();
		count += freq;
	      }
	    locality_order *cl = new locality_order (node, count);
	    order->safe_push (cl);
	  }
      }
  order->qsort (compare_edge_profile_counts);
  /* Nodes with no comparable count go last, in discovery order.  */
  for (auto el : non_comparable_nodes)
    order->safe_push (el);
  return true;
}
1340 :
/* Determine order of all external nodes if only static profile is available.
   Store the order in ORDER.  Return false if ordering must be abandoned
   because some candidate node is marked no_reorder.  */

static bool
locality_determine_static_order (auto_vec<locality_order *> *order,
				 lto_locality_heuristics scheme)
{
  struct cgraph_node *node;
  FOR_EACH_DEFINED_FUNCTION (node)
    if (node->get_partitioning_class () == SYMBOL_PARTITION)
      {
	create_locality_info (node, scheme == LTO_LOCALITY_CPP_TEMPLATE);
	if (node->no_reorder)
	  {
	    if (dump_file)
	      fprintf (dump_file, "no reorder %s\n", node->dump_asm_name ());
	    return false;
	  }
	else if (is_entry_node_p (node))
	  {
	    /* Weight each entry node by the sum of its outgoing edge
	       frequencies from the static profile.  */
	    sreal count = 0;
	    struct cgraph_edge *edge;
	    for (edge = node->callees; edge; edge = edge->next_callee)
	      {
		sreal freq = edge->sreal_frequency ();
		count += freq;
	      }
	    locality_order *cl = new locality_order (node, count);
	    order->safe_push (cl);
	  }
      }

  /* The C++-template heuristic groups template instantiations together
     before comparing static profiles.  */
  if (scheme == LTO_LOCALITY_CPP_TEMPLATE)
    {
      order_templ_hashes ();
      order->qsort (static_profile_templ_cmp);
      return true;
    }

  order->qsort (static_profile_cmp);
  return true;
}
1383 :
1384 : /* Partitioning for code locality.
1385 : 1. Create and sort callchains. If PGO is available, use real profile
1386 : counts. Otherwise, use a set of heuristics to sort the callchains.
1387 : 2. Partition the external nodes and their callchains in the determined order
1388 : 2.1. If !partition, partition, else try and clone if it satisfies cloning
1389 : criteria.
1390 : 3. Partition all other unpartitioned nodes. */
1391 :
1392 : static void
1393 0 : locality_partition_and_clone (int max_locality_partition_size,
1394 : lto_locality_cloning_model cloning_model,
1395 : lto_locality_heuristics scheme,
1396 : int freq_denominator, int size)
1397 : {
1398 0 : locality_partition partition;
1399 0 : int npartitions = 0;
1400 :
1401 0 : auto_vec<locality_order *> order;
1402 0 : auto_vec<varpool_node *> varpool_order;
1403 0 : struct cgraph_node *node;
1404 0 : bool order_p;
1405 :
1406 0 : int cl_num = 0;
1407 :
1408 0 : double real_freq = 0.0;
1409 0 : if (freq_denominator > 0)
1410 0 : real_freq = 1.0 / (double) freq_denominator;
1411 :
1412 0 : cgraph_node *n = symtab->first_defined_function ();
1413 0 : if (n && n->count.ipa_p ())
1414 0 : order_p = locality_determine_ipa_order (&order);
1415 : else
1416 0 : order_p = locality_determine_static_order (&order, scheme);
1417 0 : if (!order_p)
1418 : {
1419 0 : if (dump_file)
1420 : {
1421 0 : fprintf (dump_file, "Locality partition: falling back to balanced"
1422 : "model\n");
1423 : }
1424 :
1425 0 : return;
1426 : }
1427 :
1428 0 : int64_t partition_size
1429 : = max_locality_partition_size
1430 0 : ? max_locality_partition_size : param_max_partition_size;
1431 0 : partition = create_partition (npartitions);
1432 :
1433 0 : for (unsigned i = 0; i < order.length (); i++)
1434 : {
1435 0 : node = order[i]->node;
1436 0 : if (node_partitioned_p (node))
1437 0 : continue;
1438 :
1439 0 : if (partition->insns > partition_size)
1440 0 : partition = create_partition (npartitions);
1441 0 : if (dump_file)
1442 0 : fprintf (dump_file, "Partition id: %d\n", partition->part_id);
1443 :
1444 0 : add_node_to_partition (partition, node);
1445 0 : if (dump_file)
1446 0 : fprintf (dump_file, "Ordered Node: %s\n", node->dump_asm_name ());
1447 :
1448 0 : partition_callchain (node, partition, cloning_model, real_freq, size,
1449 : cl_num, npartitions, partition_size, scheme);
1450 : }
1451 :
1452 0 : for (unsigned i = 0; i < order.length (); i++)
1453 0 : delete order[i];
1454 0 : order = vNULL;
1455 :
1456 0 : loc_infos.release ();
1457 0 : }
1458 :
/* Entry point to locality-clone pass.  Builds the pass-global maps, runs
   partitioning-and-cloning with the user-selected parameters, and clears
   symbol aux fields before and after.  */
static int
lc_execute (void)
{
  symtab_node *node;
  /* Clear aux fields possibly left over from earlier passes.  */
  FOR_EACH_SYMBOL (node)
    node->aux = NULL;

  /* Allocate the maps used throughout the pass to track locality info and
     the node<->clone correspondence.  */
  node_to_ch_info = std::make_unique<hash_map<loc_map_hash, locality_info *>>
    ();
  node_to_clone = std::make_unique<hash_map<loc_map_hash, cgraph_node *>> ();
  clone_to_node = std::make_unique<hash_map<loc_map_hash, cgraph_node *>> ();
  templ_hash_map = std::make_unique<hash_freq_map_t> ();

  locality_partition_and_clone (param_max_locality_partition_size,
				flag_lto_locality_cloning,
				flag_lto_locality_heuristics,
				param_lto_locality_frequency,
				param_lto_locality_size);

  /* Leave aux clean for whatever pass runs next.  */
  FOR_EACH_SYMBOL (node)
    node->aux = NULL;
  return 0;
}
1483 :
namespace {

/* Pass descriptor for the IPA locality cloning pass.  */
const pass_data pass_data_ipa_locality_clone = {
  IPA_PASS, /* type */
  "locality-clone", /* name */
  OPTGROUP_NONE, /* optinfo_flags */
  TV_IPA_LC, /* tv_id */
  0, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  (TODO_dump_symtab | TODO_remove_functions), /* todo_flags_finish */
};

/* IPA pass wrapper; all work happens in lc_execute ().  */
class pass_ipa_locality_cloning : public ipa_opt_pass_d
{
public:
  pass_ipa_locality_cloning (gcc::context *ctxt)
    : ipa_opt_pass_d (pass_data_ipa_locality_clone, ctxt,
		      NULL, /* generate_summary */
		      NULL, /* write_summary */
		      NULL, /* read_summary */
		      NULL, /* write_optimization_summary */
		      NULL, /* read_optimization_summary */
		      NULL, /* stmt_fixup */
		      0, /* function_transform_todo_flags_start */
		      NULL, /* function_transform */
		      NULL) /* variable_transform */
  {}

  /* opt_pass methods: */
  /* Run only during WPA and when locality reordering is requested.  */
  virtual bool gate (function *)
  {
    return (flag_wpa && flag_ipa_reorder_for_locality);
  }

  virtual unsigned int execute (function *) { return lc_execute (); }

}; // class pass_ipa_locality_cloning

} // namespace
1525 :
/* Create an instance of the IPA locality cloning pass for CTXT.  */

ipa_opt_pass_d *
make_pass_ipa_locality_cloning (gcc::context *ctxt)
{
  return new pass_ipa_locality_cloning (ctxt);
}
|