/* Copyright (C) 2005-2020 Free Software Foundation, Inc.
   Contributed by Richard Henderson <rth@redhat.com>.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* This file contains routines for managing work-share iteration, both
   for loops and sections.  */

#include "libgomp.h"
#include <stdlib.h>

typedef unsigned long long gomp_ull;

/* This function implements the STATIC scheduling method.  The caller should
   iterate *pstart <= x < *pend.  Return zero if there are more iterations
   to perform; nonzero if not.  Return less than 0 if this thread had
   received the absolutely last iteration.  */

int
gomp_iter_ull_static_next (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned long nthreads = team ? team->nthreads : 1;

  if (thr->ts.static_trip == -1)
    return -1;

  /* Quick test for degenerate teams and orphaned constructs.  */
  if (nthreads == 1)
    {
      *pstart = ws->next_ull;
      *pend = ws->end_ull;
      thr->ts.static_trip = -1;
      return ws->next_ull == ws->end_ull;
    }

  /* We interpret chunk_size zero as "unspecified", which means that we
     should break up the iterations such that each thread makes only one
     trip through the outer loop.  */
  if (ws->chunk_size_ull == 0)
    {
      gomp_ull n, q, i, t, s0, e0, s, e;

      if (thr->ts.static_trip > 0)
        return 1;

      /* Compute the total number of iterations.  */
      if (__builtin_expect (ws->mode, 0) == 0)
        n = (ws->end_ull - ws->next_ull + ws->incr_ull - 1) / ws->incr_ull;
      else
        n = (ws->next_ull - ws->end_ull - ws->incr_ull - 1) / -ws->incr_ull;
      i = thr->ts.team_id;

      /* Compute the "zero-based" start and end points.  That is, as
         if the loop began at zero and incremented by one.  */
      q = n / nthreads;
      t = n % nthreads;
      if (i < t)
        {
          t = 0;
          q++;
        }
      s0 = q * i + t;
      e0 = s0 + q;
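
      /* For example: with n = 10 iterations and nthreads = 4, q = 2 and
         t = 2, so threads 0 and 1 each take three iterations ([0,3) and
         [3,6)) while threads 2 and 3 each take two ([6,8) and [8,10)).  */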

      /* Notice when no iterations allocated for this thread.  */
      if (s0 >= e0)
        {
          thr->ts.static_trip = 1;
          return 1;
        }

      /* Transform these to the actual start and end numbers.  */
      s = s0 * ws->incr_ull + ws->next_ull;
      e = e0 * ws->incr_ull + ws->next_ull;

      *pstart = s;
      *pend = e;
      thr->ts.static_trip = (e0 == n ? -1 : 1);
      return 0;
    }
  else
    {
      gomp_ull n, s0, e0, i, c, s, e;

      /* Otherwise, each thread gets exactly chunk_size iterations
         (if available) each time through the loop.  */

      if (__builtin_expect (ws->mode, 0) == 0)
        n = (ws->end_ull - ws->next_ull + ws->incr_ull - 1) / ws->incr_ull;
      else
        n = (ws->next_ull - ws->end_ull - ws->incr_ull - 1) / -ws->incr_ull;
      i = thr->ts.team_id;
      c = ws->chunk_size_ull;

      /* Initial guess is a C sized chunk positioned nthreads iterations
         in, offset by our thread number.  */
      s0 = (thr->ts.static_trip * (gomp_ull) nthreads + i) * c;
      e0 = s0 + c;
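
      /* For example: with c = 2, nthreads = 4 and team_id i = 1, this
         thread claims [2,4) on trip 0 and [10,12) on trip 1, each trip
         stepping over the chunks owned by the other three threads.  */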

      /* Detect overflow.  */
      if (s0 >= n)
        return 1;
      if (e0 > n)
        e0 = n;

      /* Transform these to the actual start and end numbers.  */
      s = s0 * ws->incr_ull + ws->next_ull;
      e = e0 * ws->incr_ull + ws->next_ull;

      *pstart = s;
      *pend = e;

      if (e0 == n)
        thr->ts.static_trip = -1;
      else
        thr->ts.static_trip++;
      return 0;
    }
}


/* This function implements the DYNAMIC scheduling method.  Arguments are
   as for gomp_iter_ull_static_next.  This function must be called with
   ws->lock held.  */

bool
gomp_iter_ull_dynamic_next_locked (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  gomp_ull start, end, chunk, left;

  start = ws->next_ull;
  if (start == ws->end_ull)
    return false;

  chunk = ws->chunk_size_ull;
  left = ws->end_ull - start;
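
  /* Bit 1 of ws->mode marks a loop that counts downward; there both chunk
     and left encode negative distances as unsigned values, so the clamp
     below compares in the opposite direction from the upward case.  */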
  if (__builtin_expect (ws->mode & 2, 0))
    {
      if (chunk < left)
        chunk = left;
    }
  else
    {
      if (chunk > left)
        chunk = left;
    }
  end = start + chunk;

  ws->next_ull = end;
  *pstart = start;
  *pend = end;
  return true;
}


#if defined HAVE_SYNC_BUILTINS && defined __LP64__
/* Similar, but doesn't require the lock held, and uses compare-and-swap
   instead.  Note that the only memory value that changes is ws->next_ull.  */

bool
gomp_iter_ull_dynamic_next (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  gomp_ull start, end, nend, chunk;

  end = ws->end_ull;
  chunk = ws->chunk_size_ull;

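  /* Bit 0 of ws->mode is set when the work share was initialized with the
     guarantee that ws->next_ull cannot overflow even if every thread adds
     its chunk unconditionally, which permits a bare fetch-and-add here
     instead of a compare-and-swap loop.  */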
  if (__builtin_expect (ws->mode & 1, 1))
    {
      gomp_ull tmp = __sync_fetch_and_add (&ws->next_ull, chunk);
      if (__builtin_expect (ws->mode & 2, 0) == 0)
        {
          if (tmp >= end)
            return false;
          nend = tmp + chunk;
          if (nend > end)
            nend = end;
          *pstart = tmp;
          *pend = nend;
          return true;
        }
      else
        {
          if (tmp <= end)
            return false;
          nend = tmp + chunk;
          if (nend < end)
            nend = end;
          *pstart = tmp;
          *pend = nend;
          return true;
        }
    }

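  /* Otherwise fall back to a compare-and-swap loop, re-reading
     ws->next_ull until this thread successfully claims a chunk.  */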
  start = __atomic_load_n (&ws->next_ull, MEMMODEL_RELAXED);
  while (1)
    {
      gomp_ull left = end - start;
      gomp_ull tmp;

      if (start == end)
        return false;

      if (__builtin_expect (ws->mode & 2, 0))
        {
          if (chunk < left)
            chunk = left;
        }
      else
        {
          if (chunk > left)
            chunk = left;
        }
      nend = start + chunk;

      tmp = __sync_val_compare_and_swap (&ws->next_ull, start, nend);
      if (__builtin_expect (tmp == start, 1))
        break;

      start = tmp;
    }

  *pstart = start;
  *pend = nend;
  return true;
}
#endif /* HAVE_SYNC_BUILTINS */


/* This function implements the GUIDED scheduling method.  Arguments are
   as for gomp_iter_ull_static_next.  This function must be called with the
   work share lock held.  */

bool
gomp_iter_ull_guided_next_locked (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_team *team = thr->ts.team;
  gomp_ull nthreads = team ? team->nthreads : 1;
  gomp_ull n, q;
  gomp_ull start, end;

  if (ws->next_ull == ws->end_ull)
    return false;

  start = ws->next_ull;
  if (__builtin_expect (ws->mode, 0) == 0)
    n = (ws->end_ull - start) / ws->incr_ull;
  else
    n = (start - ws->end_ull) / -ws->incr_ull;
  q = (n + nthreads - 1) / nthreads;
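  /* Each grab takes roughly 1/nthreads of the iterations still remaining,
     so chunks shrink as the loop drains: e.g. n = 100 and nthreads = 4
     yields q = 25, then 19 of the 75 left, and so on, never dropping
     below chunk_size.  */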

  if (q < ws->chunk_size_ull)
    q = ws->chunk_size_ull;
  if (q <= n)
    end = start + q * ws->incr_ull;
  else
    end = ws->end_ull;

  ws->next_ull = end;
  *pstart = start;
  *pend = end;
  return true;
}

#if defined HAVE_SYNC_BUILTINS && defined __LP64__
/* Similar, but doesn't require the lock held, and uses compare-and-swap
   instead.  Note that the only memory value that changes is ws->next_ull.  */

bool
gomp_iter_ull_guided_next (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_team *team = thr->ts.team;
  gomp_ull nthreads = team ? team->nthreads : 1;
  gomp_ull start, end, nend, incr;
  gomp_ull chunk_size;

  start = __atomic_load_n (&ws->next_ull, MEMMODEL_RELAXED);
  end = ws->end_ull;
  incr = ws->incr_ull;
  chunk_size = ws->chunk_size_ull;
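  /* end, incr and chunk_size are invariant while the work share is active,
     so only ws->next_ull needs to be re-read on each retry below.  */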

  while (1)
    {
      gomp_ull n, q;
      gomp_ull tmp;

      if (start == end)
        return false;

      if (__builtin_expect (ws->mode, 0) == 0)
        n = (end - start) / incr;
      else
        n = (start - end) / -incr;
      q = (n + nthreads - 1) / nthreads;

      if (q < chunk_size)
        q = chunk_size;
      if (__builtin_expect (q <= n, 1))
        nend = start + q * incr;
      else
        nend = end;

      tmp = __sync_val_compare_and_swap (&ws->next_ull, start, nend);
      if (__builtin_expect (tmp == start, 1))
        break;

      start = tmp;
    }

  *pstart = start;
  *pend = nend;
  return true;
}
#endif /* HAVE_SYNC_BUILTINS */