111
|
1 /* Implementation of the MATMUL intrinsic
|
145
|
2 Copyright (C) 2002-2020 Free Software Foundation, Inc.
|
111
|
3 Contributed by Thomas Koenig <tkoenig@gcc.gnu.org>.
|
|
4
|
|
5 This file is part of the GNU Fortran runtime library (libgfortran).
|
|
6
|
|
7 Libgfortran is free software; you can redistribute it and/or
|
|
8 modify it under the terms of the GNU General Public
|
|
9 License as published by the Free Software Foundation; either
|
|
10 version 3 of the License, or (at your option) any later version.
|
|
11
|
|
12 Libgfortran is distributed in the hope that it will be useful,
|
|
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
15 GNU General Public License for more details.
|
|
16
|
|
17 Under Section 7 of GPL version 3, you are granted additional
|
|
18 permissions described in the GCC Runtime Library Exception, version
|
|
19 3.1, as published by the Free Software Foundation.
|
|
20
|
|
21 You should have received a copy of the GNU General Public License and
|
|
22 a copy of the GCC Runtime Library Exception along with this program;
|
|
23 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
|
|
24 <http://www.gnu.org/licenses/>. */
|
|
25
|
|
26 #include "libgfortran.h"
|
|
27 #include <string.h>
|
|
28 #include <assert.h>
|
|
29
|
|
30
|
|
31 /* These are the specific versions of matmul with -mprefer-avx128. */
|
|
32
|
|
33 #if defined (HAVE_GFC_COMPLEX_8)
|
|
34
|
|
35 /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
|
|
36 passed to us by the front-end, in which case we call it for large
|
|
37 matrices. */
|
|
38
|
|
39 typedef void (*blas_call)(const char *, const char *, const int *, const int *,
|
|
40 const int *, const GFC_COMPLEX_8 *, const GFC_COMPLEX_8 *,
|
|
41 const int *, const GFC_COMPLEX_8 *, const int *,
|
|
42 const GFC_COMPLEX_8 *, GFC_COMPLEX_8 *, const int *,
|
|
43 int, int);
|
|
44
|
|
45 #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
|
|
46 void
|
|
47 matmul_c8_avx128_fma3 (gfc_array_c8 * const restrict retarray,
|
|
48 gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
|
|
49 int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
|
|
50 internal_proto(matmul_c8_avx128_fma3);
|
|
51 void
|
|
52 matmul_c8_avx128_fma3 (gfc_array_c8 * const restrict retarray,
|
|
53 gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
|
|
54 int blas_limit, blas_call gemm)
|
|
55 {
|
|
56 const GFC_COMPLEX_8 * restrict abase;
|
|
57 const GFC_COMPLEX_8 * restrict bbase;
|
|
58 GFC_COMPLEX_8 * restrict dest;
|
|
59
|
|
60 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
|
|
61 index_type x, y, n, count, xcount, ycount;
|
|
62
|
|
63 assert (GFC_DESCRIPTOR_RANK (a) == 2
|
|
64 || GFC_DESCRIPTOR_RANK (b) == 2);
|
|
65
|
|
66 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
|
|
67
|
|
68 Either A or B (but not both) can be rank 1:
|
|
69
|
|
70 o One-dimensional argument A is implicitly treated as a row matrix
|
|
71 dimensioned [1,count], so xcount=1.
|
|
72
|
|
73 o One-dimensional argument B is implicitly treated as a column matrix
|
|
74 dimensioned [count, 1], so ycount=1.
|
|
75 */
|
|
76
|
|
77 if (retarray->base_addr == NULL)
|
|
78 {
|
|
79 if (GFC_DESCRIPTOR_RANK (a) == 1)
|
|
80 {
|
|
81 GFC_DIMENSION_SET(retarray->dim[0], 0,
|
|
82 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
|
|
83 }
|
|
84 else if (GFC_DESCRIPTOR_RANK (b) == 1)
|
|
85 {
|
|
86 GFC_DIMENSION_SET(retarray->dim[0], 0,
|
|
87 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
|
|
88 }
|
|
89 else
|
|
90 {
|
|
91 GFC_DIMENSION_SET(retarray->dim[0], 0,
|
|
92 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
|
|
93
|
|
94 GFC_DIMENSION_SET(retarray->dim[1], 0,
|
|
95 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
|
|
96 GFC_DESCRIPTOR_EXTENT(retarray,0));
|
|
97 }
|
|
98
|
|
99 retarray->base_addr
|
|
100 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_8));
|
|
101 retarray->offset = 0;
|
|
102 }
|
|
103 else if (unlikely (compile_options.bounds_check))
|
|
104 {
|
|
105 index_type ret_extent, arg_extent;
|
|
106
|
|
107 if (GFC_DESCRIPTOR_RANK (a) == 1)
|
|
108 {
|
|
109 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
|
|
110 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
|
|
111 if (arg_extent != ret_extent)
|
131
|
112 runtime_error ("Array bound mismatch for dimension 1 of "
|
|
113 "array (%ld/%ld) ",
|
111
|
114 (long int) ret_extent, (long int) arg_extent);
|
|
115 }
|
|
116 else if (GFC_DESCRIPTOR_RANK (b) == 1)
|
|
117 {
|
|
118 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
|
|
119 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
|
|
120 if (arg_extent != ret_extent)
|
131
|
121 runtime_error ("Array bound mismatch for dimension 1 of "
|
|
122 "array (%ld/%ld) ",
|
111
|
123 (long int) ret_extent, (long int) arg_extent);
|
|
124 }
|
|
125 else
|
|
126 {
|
|
127 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
|
|
128 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
|
|
129 if (arg_extent != ret_extent)
|
131
|
130 runtime_error ("Array bound mismatch for dimension 1 of "
|
|
131 "array (%ld/%ld) ",
|
111
|
132 (long int) ret_extent, (long int) arg_extent);
|
|
133
|
|
134 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
|
|
135 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
|
|
136 if (arg_extent != ret_extent)
|
131
|
137 runtime_error ("Array bound mismatch for dimension 2 of "
|
|
138 "array (%ld/%ld) ",
|
111
|
139 (long int) ret_extent, (long int) arg_extent);
|
|
140 }
|
|
141 }
|
|
142
|
|
143
|
|
144 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
|
|
145 {
|
|
146 /* One-dimensional result may be addressed in the code below
|
|
147 either as a row or a column matrix. We want both cases to
|
|
148 work. */
|
|
149 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
|
|
150 }
|
|
151 else
|
|
152 {
|
|
153 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
|
|
154 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
|
|
155 }
|
|
156
|
|
157
|
|
158 if (GFC_DESCRIPTOR_RANK (a) == 1)
|
|
159 {
|
|
160 /* Treat it as a a row matrix A[1,count]. */
|
|
161 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
|
|
162 aystride = 1;
|
|
163
|
|
164 xcount = 1;
|
|
165 count = GFC_DESCRIPTOR_EXTENT(a,0);
|
|
166 }
|
|
167 else
|
|
168 {
|
|
169 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
|
|
170 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
|
|
171
|
|
172 count = GFC_DESCRIPTOR_EXTENT(a,1);
|
|
173 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
|
|
174 }
|
|
175
|
|
176 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
|
|
177 {
|
|
178 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
|
131
|
179 runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
|
|
180 "in dimension 1: is %ld, should be %ld",
|
|
181 (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
|
111
|
182 }
|
|
183
|
|
184 if (GFC_DESCRIPTOR_RANK (b) == 1)
|
|
185 {
|
|
186 /* Treat it as a column matrix B[count,1] */
|
|
187 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
|
|
188
|
|
189 /* bystride should never be used for 1-dimensional b.
|
|
190 The value is only used for calculation of the
|
|
191 memory by the buffer. */
|
|
192 bystride = 256;
|
|
193 ycount = 1;
|
|
194 }
|
|
195 else
|
|
196 {
|
|
197 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
|
|
198 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
|
|
199 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
|
|
200 }
|
|
201
|
|
202 abase = a->base_addr;
|
|
203 bbase = b->base_addr;
|
|
204 dest = retarray->base_addr;
|
|
205
|
|
206 /* Now that everything is set up, we perform the multiplication
|
|
207 itself. */
|
|
208
|
|
209 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
|
|
210 #define min(a,b) ((a) <= (b) ? (a) : (b))
|
|
211 #define max(a,b) ((a) >= (b) ? (a) : (b))
|
|
212
|
|
213 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
|
|
214 && (bxstride == 1 || bystride == 1)
|
|
215 && (((float) xcount) * ((float) ycount) * ((float) count)
|
|
216 > POW3(blas_limit)))
|
|
217 {
|
|
218 const int m = xcount, n = ycount, k = count, ldc = rystride;
|
|
219 const GFC_COMPLEX_8 one = 1, zero = 0;
|
|
220 const int lda = (axstride == 1) ? aystride : axstride,
|
|
221 ldb = (bxstride == 1) ? bystride : bxstride;
|
|
222
|
|
223 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
|
|
224 {
|
|
225 assert (gemm != NULL);
|
131
|
226 const char *transa, *transb;
|
|
227 if (try_blas & 2)
|
|
228 transa = "C";
|
|
229 else
|
|
230 transa = axstride == 1 ? "N" : "T";
|
|
231
|
|
232 if (try_blas & 4)
|
|
233 transb = "C";
|
|
234 else
|
|
235 transb = bxstride == 1 ? "N" : "T";
|
|
236
|
|
237 gemm (transa, transb , &m,
|
111
|
238 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
|
|
239 &ldc, 1, 1);
|
|
240 return;
|
|
241 }
|
|
242 }
|
|
243
|
|
244 if (rxstride == 1 && axstride == 1 && bxstride == 1)
|
|
245 {
|
|
246 /* This block of code implements a tuned matmul, derived from
|
|
247 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
|
|
248
|
|
249 Bo Kagstrom and Per Ling
|
|
250 Department of Computing Science
|
|
251 Umea University
|
|
252 S-901 87 Umea, Sweden
|
|
253
|
|
254 from netlib.org, translated to C, and modified for matmul.m4. */
|
|
255
|
|
256 const GFC_COMPLEX_8 *a, *b;
|
|
257 GFC_COMPLEX_8 *c;
|
|
258 const index_type m = xcount, n = ycount, k = count;
|
|
259
|
|
260 /* System generated locals */
|
|
261 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
|
|
262 i1, i2, i3, i4, i5, i6;
|
|
263
|
|
264 /* Local variables */
|
|
265 GFC_COMPLEX_8 f11, f12, f21, f22, f31, f32, f41, f42,
|
|
266 f13, f14, f23, f24, f33, f34, f43, f44;
|
|
267 index_type i, j, l, ii, jj, ll;
|
|
268 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
|
|
269 GFC_COMPLEX_8 *t1;
|
|
270
|
|
271 a = abase;
|
|
272 b = bbase;
|
|
273 c = retarray->base_addr;
|
|
274
|
|
275 /* Parameter adjustments */
|
|
276 c_dim1 = rystride;
|
|
277 c_offset = 1 + c_dim1;
|
|
278 c -= c_offset;
|
|
279 a_dim1 = aystride;
|
|
280 a_offset = 1 + a_dim1;
|
|
281 a -= a_offset;
|
|
282 b_dim1 = bystride;
|
|
283 b_offset = 1 + b_dim1;
|
|
284 b -= b_offset;
|
|
285
|
|
286 /* Empty c first. */
|
|
287 for (j=1; j<=n; j++)
|
|
288 for (i=1; i<=m; i++)
|
|
289 c[i + j * c_dim1] = (GFC_COMPLEX_8)0;
|
|
290
|
|
291 /* Early exit if possible */
|
|
292 if (m == 0 || n == 0 || k == 0)
|
|
293 return;
|
|
294
|
|
295 /* Adjust size of t1 to what is needed. */
|
131
|
296 index_type t1_dim, a_sz;
|
|
297 if (aystride == 1)
|
|
298 a_sz = rystride;
|
|
299 else
|
|
300 a_sz = a_dim1;
|
|
301
|
|
302 t1_dim = a_sz * 256 + b_dim1;
|
111
|
303 if (t1_dim > 65536)
|
|
304 t1_dim = 65536;
|
|
305
|
|
306 t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_8));
|
|
307
|
|
308 /* Start turning the crank. */
|
|
309 i1 = n;
|
|
310 for (jj = 1; jj <= i1; jj += 512)
|
|
311 {
|
|
312 /* Computing MIN */
|
|
313 i2 = 512;
|
|
314 i3 = n - jj + 1;
|
|
315 jsec = min(i2,i3);
|
|
316 ujsec = jsec - jsec % 4;
|
|
317 i2 = k;
|
|
318 for (ll = 1; ll <= i2; ll += 256)
|
|
319 {
|
|
320 /* Computing MIN */
|
|
321 i3 = 256;
|
|
322 i4 = k - ll + 1;
|
|
323 lsec = min(i3,i4);
|
|
324 ulsec = lsec - lsec % 2;
|
|
325
|
|
326 i3 = m;
|
|
327 for (ii = 1; ii <= i3; ii += 256)
|
|
328 {
|
|
329 /* Computing MIN */
|
|
330 i4 = 256;
|
|
331 i5 = m - ii + 1;
|
|
332 isec = min(i4,i5);
|
|
333 uisec = isec - isec % 2;
|
|
334 i4 = ll + ulsec - 1;
|
|
335 for (l = ll; l <= i4; l += 2)
|
|
336 {
|
|
337 i5 = ii + uisec - 1;
|
|
338 for (i = ii; i <= i5; i += 2)
|
|
339 {
|
|
340 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
|
|
341 a[i + l * a_dim1];
|
|
342 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
|
|
343 a[i + (l + 1) * a_dim1];
|
|
344 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
|
|
345 a[i + 1 + l * a_dim1];
|
|
346 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
|
|
347 a[i + 1 + (l + 1) * a_dim1];
|
|
348 }
|
|
349 if (uisec < isec)
|
|
350 {
|
|
351 t1[l - ll + 1 + (isec << 8) - 257] =
|
|
352 a[ii + isec - 1 + l * a_dim1];
|
|
353 t1[l - ll + 2 + (isec << 8) - 257] =
|
|
354 a[ii + isec - 1 + (l + 1) * a_dim1];
|
|
355 }
|
|
356 }
|
|
357 if (ulsec < lsec)
|
|
358 {
|
|
359 i4 = ii + isec - 1;
|
|
360 for (i = ii; i<= i4; ++i)
|
|
361 {
|
|
362 t1[lsec + ((i - ii + 1) << 8) - 257] =
|
|
363 a[i + (ll + lsec - 1) * a_dim1];
|
|
364 }
|
|
365 }
|
|
366
|
|
367 uisec = isec - isec % 4;
|
|
368 i4 = jj + ujsec - 1;
|
|
369 for (j = jj; j <= i4; j += 4)
|
|
370 {
|
|
371 i5 = ii + uisec - 1;
|
|
372 for (i = ii; i <= i5; i += 4)
|
|
373 {
|
|
374 f11 = c[i + j * c_dim1];
|
|
375 f21 = c[i + 1 + j * c_dim1];
|
|
376 f12 = c[i + (j + 1) * c_dim1];
|
|
377 f22 = c[i + 1 + (j + 1) * c_dim1];
|
|
378 f13 = c[i + (j + 2) * c_dim1];
|
|
379 f23 = c[i + 1 + (j + 2) * c_dim1];
|
|
380 f14 = c[i + (j + 3) * c_dim1];
|
|
381 f24 = c[i + 1 + (j + 3) * c_dim1];
|
|
382 f31 = c[i + 2 + j * c_dim1];
|
|
383 f41 = c[i + 3 + j * c_dim1];
|
|
384 f32 = c[i + 2 + (j + 1) * c_dim1];
|
|
385 f42 = c[i + 3 + (j + 1) * c_dim1];
|
|
386 f33 = c[i + 2 + (j + 2) * c_dim1];
|
|
387 f43 = c[i + 3 + (j + 2) * c_dim1];
|
|
388 f34 = c[i + 2 + (j + 3) * c_dim1];
|
|
389 f44 = c[i + 3 + (j + 3) * c_dim1];
|
|
390 i6 = ll + lsec - 1;
|
|
391 for (l = ll; l <= i6; ++l)
|
|
392 {
|
|
393 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
|
|
394 * b[l + j * b_dim1];
|
|
395 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
|
|
396 * b[l + j * b_dim1];
|
|
397 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
|
|
398 * b[l + (j + 1) * b_dim1];
|
|
399 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
|
|
400 * b[l + (j + 1) * b_dim1];
|
|
401 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
|
|
402 * b[l + (j + 2) * b_dim1];
|
|
403 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
|
|
404 * b[l + (j + 2) * b_dim1];
|
|
405 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
|
|
406 * b[l + (j + 3) * b_dim1];
|
|
407 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
|
|
408 * b[l + (j + 3) * b_dim1];
|
|
409 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
|
|
410 * b[l + j * b_dim1];
|
|
411 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
|
|
412 * b[l + j * b_dim1];
|
|
413 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
|
|
414 * b[l + (j + 1) * b_dim1];
|
|
415 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
|
|
416 * b[l + (j + 1) * b_dim1];
|
|
417 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
|
|
418 * b[l + (j + 2) * b_dim1];
|
|
419 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
|
|
420 * b[l + (j + 2) * b_dim1];
|
|
421 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
|
|
422 * b[l + (j + 3) * b_dim1];
|
|
423 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
|
|
424 * b[l + (j + 3) * b_dim1];
|
|
425 }
|
|
426 c[i + j * c_dim1] = f11;
|
|
427 c[i + 1 + j * c_dim1] = f21;
|
|
428 c[i + (j + 1) * c_dim1] = f12;
|
|
429 c[i + 1 + (j + 1) * c_dim1] = f22;
|
|
430 c[i + (j + 2) * c_dim1] = f13;
|
|
431 c[i + 1 + (j + 2) * c_dim1] = f23;
|
|
432 c[i + (j + 3) * c_dim1] = f14;
|
|
433 c[i + 1 + (j + 3) * c_dim1] = f24;
|
|
434 c[i + 2 + j * c_dim1] = f31;
|
|
435 c[i + 3 + j * c_dim1] = f41;
|
|
436 c[i + 2 + (j + 1) * c_dim1] = f32;
|
|
437 c[i + 3 + (j + 1) * c_dim1] = f42;
|
|
438 c[i + 2 + (j + 2) * c_dim1] = f33;
|
|
439 c[i + 3 + (j + 2) * c_dim1] = f43;
|
|
440 c[i + 2 + (j + 3) * c_dim1] = f34;
|
|
441 c[i + 3 + (j + 3) * c_dim1] = f44;
|
|
442 }
|
|
443 if (uisec < isec)
|
|
444 {
|
|
445 i5 = ii + isec - 1;
|
|
446 for (i = ii + uisec; i <= i5; ++i)
|
|
447 {
|
|
448 f11 = c[i + j * c_dim1];
|
|
449 f12 = c[i + (j + 1) * c_dim1];
|
|
450 f13 = c[i + (j + 2) * c_dim1];
|
|
451 f14 = c[i + (j + 3) * c_dim1];
|
|
452 i6 = ll + lsec - 1;
|
|
453 for (l = ll; l <= i6; ++l)
|
|
454 {
|
|
455 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
|
|
456 257] * b[l + j * b_dim1];
|
|
457 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
|
|
458 257] * b[l + (j + 1) * b_dim1];
|
|
459 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
|
|
460 257] * b[l + (j + 2) * b_dim1];
|
|
461 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
|
|
462 257] * b[l + (j + 3) * b_dim1];
|
|
463 }
|
|
464 c[i + j * c_dim1] = f11;
|
|
465 c[i + (j + 1) * c_dim1] = f12;
|
|
466 c[i + (j + 2) * c_dim1] = f13;
|
|
467 c[i + (j + 3) * c_dim1] = f14;
|
|
468 }
|
|
469 }
|
|
470 }
|
|
471 if (ujsec < jsec)
|
|
472 {
|
|
473 i4 = jj + jsec - 1;
|
|
474 for (j = jj + ujsec; j <= i4; ++j)
|
|
475 {
|
|
476 i5 = ii + uisec - 1;
|
|
477 for (i = ii; i <= i5; i += 4)
|
|
478 {
|
|
479 f11 = c[i + j * c_dim1];
|
|
480 f21 = c[i + 1 + j * c_dim1];
|
|
481 f31 = c[i + 2 + j * c_dim1];
|
|
482 f41 = c[i + 3 + j * c_dim1];
|
|
483 i6 = ll + lsec - 1;
|
|
484 for (l = ll; l <= i6; ++l)
|
|
485 {
|
|
486 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
|
|
487 257] * b[l + j * b_dim1];
|
|
488 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
|
|
489 257] * b[l + j * b_dim1];
|
|
490 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
|
|
491 257] * b[l + j * b_dim1];
|
|
492 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
|
|
493 257] * b[l + j * b_dim1];
|
|
494 }
|
|
495 c[i + j * c_dim1] = f11;
|
|
496 c[i + 1 + j * c_dim1] = f21;
|
|
497 c[i + 2 + j * c_dim1] = f31;
|
|
498 c[i + 3 + j * c_dim1] = f41;
|
|
499 }
|
|
500 i5 = ii + isec - 1;
|
|
501 for (i = ii + uisec; i <= i5; ++i)
|
|
502 {
|
|
503 f11 = c[i + j * c_dim1];
|
|
504 i6 = ll + lsec - 1;
|
|
505 for (l = ll; l <= i6; ++l)
|
|
506 {
|
|
507 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
|
|
508 257] * b[l + j * b_dim1];
|
|
509 }
|
|
510 c[i + j * c_dim1] = f11;
|
|
511 }
|
|
512 }
|
|
513 }
|
|
514 }
|
|
515 }
|
|
516 }
|
|
517 free(t1);
|
|
518 return;
|
|
519 }
|
|
520 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
|
|
521 {
|
|
522 if (GFC_DESCRIPTOR_RANK (a) != 1)
|
|
523 {
|
|
524 const GFC_COMPLEX_8 *restrict abase_x;
|
|
525 const GFC_COMPLEX_8 *restrict bbase_y;
|
|
526 GFC_COMPLEX_8 *restrict dest_y;
|
|
527 GFC_COMPLEX_8 s;
|
|
528
|
|
529 for (y = 0; y < ycount; y++)
|
|
530 {
|
|
531 bbase_y = &bbase[y*bystride];
|
|
532 dest_y = &dest[y*rystride];
|
|
533 for (x = 0; x < xcount; x++)
|
|
534 {
|
|
535 abase_x = &abase[x*axstride];
|
|
536 s = (GFC_COMPLEX_8) 0;
|
|
537 for (n = 0; n < count; n++)
|
|
538 s += abase_x[n] * bbase_y[n];
|
|
539 dest_y[x] = s;
|
|
540 }
|
|
541 }
|
|
542 }
|
|
543 else
|
|
544 {
|
|
545 const GFC_COMPLEX_8 *restrict bbase_y;
|
|
546 GFC_COMPLEX_8 s;
|
|
547
|
|
548 for (y = 0; y < ycount; y++)
|
|
549 {
|
|
550 bbase_y = &bbase[y*bystride];
|
|
551 s = (GFC_COMPLEX_8) 0;
|
|
552 for (n = 0; n < count; n++)
|
|
553 s += abase[n*axstride] * bbase_y[n];
|
|
554 dest[y*rystride] = s;
|
|
555 }
|
|
556 }
|
|
557 }
|
|
558 else if (axstride < aystride)
|
|
559 {
|
|
560 for (y = 0; y < ycount; y++)
|
|
561 for (x = 0; x < xcount; x++)
|
|
562 dest[x*rxstride + y*rystride] = (GFC_COMPLEX_8)0;
|
|
563
|
|
564 for (y = 0; y < ycount; y++)
|
|
565 for (n = 0; n < count; n++)
|
|
566 for (x = 0; x < xcount; x++)
|
|
567 /* dest[x,y] += a[x,n] * b[n,y] */
|
|
568 dest[x*rxstride + y*rystride] +=
|
|
569 abase[x*axstride + n*aystride] *
|
|
570 bbase[n*bxstride + y*bystride];
|
|
571 }
|
|
572 else if (GFC_DESCRIPTOR_RANK (a) == 1)
|
|
573 {
|
|
574 const GFC_COMPLEX_8 *restrict bbase_y;
|
|
575 GFC_COMPLEX_8 s;
|
|
576
|
|
577 for (y = 0; y < ycount; y++)
|
|
578 {
|
|
579 bbase_y = &bbase[y*bystride];
|
|
580 s = (GFC_COMPLEX_8) 0;
|
|
581 for (n = 0; n < count; n++)
|
|
582 s += abase[n*axstride] * bbase_y[n*bxstride];
|
|
583 dest[y*rxstride] = s;
|
|
584 }
|
|
585 }
|
|
586 else
|
|
587 {
|
|
588 const GFC_COMPLEX_8 *restrict abase_x;
|
|
589 const GFC_COMPLEX_8 *restrict bbase_y;
|
|
590 GFC_COMPLEX_8 *restrict dest_y;
|
|
591 GFC_COMPLEX_8 s;
|
|
592
|
|
593 for (y = 0; y < ycount; y++)
|
|
594 {
|
|
595 bbase_y = &bbase[y*bystride];
|
|
596 dest_y = &dest[y*rystride];
|
|
597 for (x = 0; x < xcount; x++)
|
|
598 {
|
|
599 abase_x = &abase[x*axstride];
|
|
600 s = (GFC_COMPLEX_8) 0;
|
|
601 for (n = 0; n < count; n++)
|
|
602 s += abase_x[n*aystride] * bbase_y[n*bxstride];
|
|
603 dest_y[x*rxstride] = s;
|
|
604 }
|
|
605 }
|
|
606 }
|
|
607 }
|
|
608 #undef POW3
|
|
609 #undef min
|
|
610 #undef max
|
|
611
|
|
612 #endif
|
|
613
|
|
614 #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
|
|
615 void
|
|
616 matmul_c8_avx128_fma4 (gfc_array_c8 * const restrict retarray,
|
|
617 gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
|
|
618 int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
|
|
619 internal_proto(matmul_c8_avx128_fma4);
|
|
620 void
|
|
621 matmul_c8_avx128_fma4 (gfc_array_c8 * const restrict retarray,
|
|
622 gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
|
|
623 int blas_limit, blas_call gemm)
|
|
624 {
|
|
625 const GFC_COMPLEX_8 * restrict abase;
|
|
626 const GFC_COMPLEX_8 * restrict bbase;
|
|
627 GFC_COMPLEX_8 * restrict dest;
|
|
628
|
|
629 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
|
|
630 index_type x, y, n, count, xcount, ycount;
|
|
631
|
|
632 assert (GFC_DESCRIPTOR_RANK (a) == 2
|
|
633 || GFC_DESCRIPTOR_RANK (b) == 2);
|
|
634
|
|
635 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
|
|
636
|
|
637 Either A or B (but not both) can be rank 1:
|
|
638
|
|
639 o One-dimensional argument A is implicitly treated as a row matrix
|
|
640 dimensioned [1,count], so xcount=1.
|
|
641
|
|
642 o One-dimensional argument B is implicitly treated as a column matrix
|
|
643 dimensioned [count, 1], so ycount=1.
|
|
644 */
|
|
645
|
|
646 if (retarray->base_addr == NULL)
|
|
647 {
|
|
648 if (GFC_DESCRIPTOR_RANK (a) == 1)
|
|
649 {
|
|
650 GFC_DIMENSION_SET(retarray->dim[0], 0,
|
|
651 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
|
|
652 }
|
|
653 else if (GFC_DESCRIPTOR_RANK (b) == 1)
|
|
654 {
|
|
655 GFC_DIMENSION_SET(retarray->dim[0], 0,
|
|
656 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
|
|
657 }
|
|
658 else
|
|
659 {
|
|
660 GFC_DIMENSION_SET(retarray->dim[0], 0,
|
|
661 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
|
|
662
|
|
663 GFC_DIMENSION_SET(retarray->dim[1], 0,
|
|
664 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
|
|
665 GFC_DESCRIPTOR_EXTENT(retarray,0));
|
|
666 }
|
|
667
|
|
668 retarray->base_addr
|
|
669 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_8));
|
|
670 retarray->offset = 0;
|
|
671 }
|
|
672 else if (unlikely (compile_options.bounds_check))
|
|
673 {
|
|
674 index_type ret_extent, arg_extent;
|
|
675
|
|
676 if (GFC_DESCRIPTOR_RANK (a) == 1)
|
|
677 {
|
|
678 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
|
|
679 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
|
|
680 if (arg_extent != ret_extent)
|
131
|
681 runtime_error ("Array bound mismatch for dimension 1 of "
|
|
682 "array (%ld/%ld) ",
|
111
|
683 (long int) ret_extent, (long int) arg_extent);
|
|
684 }
|
|
685 else if (GFC_DESCRIPTOR_RANK (b) == 1)
|
|
686 {
|
|
687 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
|
|
688 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
|
|
689 if (arg_extent != ret_extent)
|
131
|
690 runtime_error ("Array bound mismatch for dimension 1 of "
|
|
691 "array (%ld/%ld) ",
|
111
|
692 (long int) ret_extent, (long int) arg_extent);
|
|
693 }
|
|
694 else
|
|
695 {
|
|
696 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
|
|
697 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
|
|
698 if (arg_extent != ret_extent)
|
131
|
699 runtime_error ("Array bound mismatch for dimension 1 of "
|
|
700 "array (%ld/%ld) ",
|
111
|
701 (long int) ret_extent, (long int) arg_extent);
|
|
702
|
|
703 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
|
|
704 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
|
|
705 if (arg_extent != ret_extent)
|
131
|
706 runtime_error ("Array bound mismatch for dimension 2 of "
|
|
707 "array (%ld/%ld) ",
|
111
|
708 (long int) ret_extent, (long int) arg_extent);
|
|
709 }
|
|
710 }
|
|
711
|
|
712
|
|
713 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
|
|
714 {
|
|
715 /* One-dimensional result may be addressed in the code below
|
|
716 either as a row or a column matrix. We want both cases to
|
|
717 work. */
|
|
718 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
|
|
719 }
|
|
720 else
|
|
721 {
|
|
722 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
|
|
723 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
|
|
724 }
|
|
725
|
|
726
|
|
727 if (GFC_DESCRIPTOR_RANK (a) == 1)
|
|
728 {
|
|
729 /* Treat it as a a row matrix A[1,count]. */
|
|
730 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
|
|
731 aystride = 1;
|
|
732
|
|
733 xcount = 1;
|
|
734 count = GFC_DESCRIPTOR_EXTENT(a,0);
|
|
735 }
|
|
736 else
|
|
737 {
|
|
738 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
|
|
739 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
|
|
740
|
|
741 count = GFC_DESCRIPTOR_EXTENT(a,1);
|
|
742 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
|
|
743 }
|
|
744
|
|
745 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
|
|
746 {
|
|
747 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
|
131
|
748 runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
|
|
749 "in dimension 1: is %ld, should be %ld",
|
|
750 (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
|
111
|
751 }
|
|
752
|
|
753 if (GFC_DESCRIPTOR_RANK (b) == 1)
|
|
754 {
|
|
755 /* Treat it as a column matrix B[count,1] */
|
|
756 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
|
|
757
|
|
758 /* bystride should never be used for 1-dimensional b.
|
|
759 The value is only used for calculation of the
|
|
760 memory by the buffer. */
|
|
761 bystride = 256;
|
|
762 ycount = 1;
|
|
763 }
|
|
764 else
|
|
765 {
|
|
766 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
|
|
767 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
|
|
768 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
|
|
769 }
|
|
770
|
|
771 abase = a->base_addr;
|
|
772 bbase = b->base_addr;
|
|
773 dest = retarray->base_addr;
|
|
774
|
|
775 /* Now that everything is set up, we perform the multiplication
|
|
776 itself. */
|
|
777
|
|
778 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
|
|
779 #define min(a,b) ((a) <= (b) ? (a) : (b))
|
|
780 #define max(a,b) ((a) >= (b) ? (a) : (b))
|
|
781
|
|
782 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
|
|
783 && (bxstride == 1 || bystride == 1)
|
|
784 && (((float) xcount) * ((float) ycount) * ((float) count)
|
|
785 > POW3(blas_limit)))
|
|
786 {
|
|
787 const int m = xcount, n = ycount, k = count, ldc = rystride;
|
|
788 const GFC_COMPLEX_8 one = 1, zero = 0;
|
|
789 const int lda = (axstride == 1) ? aystride : axstride,
|
|
790 ldb = (bxstride == 1) ? bystride : bxstride;
|
|
791
|
|
792 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
|
|
793 {
|
|
794 assert (gemm != NULL);
|
131
|
795 const char *transa, *transb;
|
|
796 if (try_blas & 2)
|
|
797 transa = "C";
|
|
798 else
|
|
799 transa = axstride == 1 ? "N" : "T";
|
|
800
|
|
801 if (try_blas & 4)
|
|
802 transb = "C";
|
|
803 else
|
|
804 transb = bxstride == 1 ? "N" : "T";
|
|
805
|
|
806 gemm (transa, transb , &m,
|
111
|
807 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
|
|
808 &ldc, 1, 1);
|
|
809 return;
|
|
810 }
|
|
811 }
|
|
812
|
|
813 if (rxstride == 1 && axstride == 1 && bxstride == 1)
|
|
814 {
|
|
815 /* This block of code implements a tuned matmul, derived from
|
|
816 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
|
|
817
|
|
818 Bo Kagstrom and Per Ling
|
|
819 Department of Computing Science
|
|
820 Umea University
|
|
821 S-901 87 Umea, Sweden
|
|
822
|
|
823 from netlib.org, translated to C, and modified for matmul.m4. */
|
|
824
|
|
825 const GFC_COMPLEX_8 *a, *b;
|
|
826 GFC_COMPLEX_8 *c;
|
|
827 const index_type m = xcount, n = ycount, k = count;
|
|
828
|
|
829 /* System generated locals */
|
|
830 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
|
|
831 i1, i2, i3, i4, i5, i6;
|
|
832
|
|
833 /* Local variables */
|
|
834 GFC_COMPLEX_8 f11, f12, f21, f22, f31, f32, f41, f42,
|
|
835 f13, f14, f23, f24, f33, f34, f43, f44;
|
|
836 index_type i, j, l, ii, jj, ll;
|
|
837 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
|
|
838 GFC_COMPLEX_8 *t1;
|
|
839
|
|
840 a = abase;
|
|
841 b = bbase;
|
|
842 c = retarray->base_addr;
|
|
843
|
|
844 /* Parameter adjustments */
|
|
845 c_dim1 = rystride;
|
|
846 c_offset = 1 + c_dim1;
|
|
847 c -= c_offset;
|
|
848 a_dim1 = aystride;
|
|
849 a_offset = 1 + a_dim1;
|
|
850 a -= a_offset;
|
|
851 b_dim1 = bystride;
|
|
852 b_offset = 1 + b_dim1;
|
|
853 b -= b_offset;
|
|
854
|
|
855 /* Empty c first. */
|
|
856 for (j=1; j<=n; j++)
|
|
857 for (i=1; i<=m; i++)
|
|
858 c[i + j * c_dim1] = (GFC_COMPLEX_8)0;
|
|
859
|
|
860 /* Early exit if possible */
|
|
861 if (m == 0 || n == 0 || k == 0)
|
|
862 return;
|
|
863
|
|
864 /* Adjust size of t1 to what is needed. */
|
131
|
865 index_type t1_dim, a_sz;
|
|
866 if (aystride == 1)
|
|
867 a_sz = rystride;
|
|
868 else
|
|
869 a_sz = a_dim1;
|
|
870
|
|
871 t1_dim = a_sz * 256 + b_dim1;
|
111
|
872 if (t1_dim > 65536)
|
|
873 t1_dim = 65536;
|
|
874
|
|
875 t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_8));
|
|
876
|
|
877 /* Start turning the crank. */
|
|
878 i1 = n;
|
|
879 for (jj = 1; jj <= i1; jj += 512)
|
|
880 {
|
|
881 /* Computing MIN */
|
|
882 i2 = 512;
|
|
883 i3 = n - jj + 1;
|
|
884 jsec = min(i2,i3);
|
|
885 ujsec = jsec - jsec % 4;
|
|
886 i2 = k;
|
|
887 for (ll = 1; ll <= i2; ll += 256)
|
|
888 {
|
|
889 /* Computing MIN */
|
|
890 i3 = 256;
|
|
891 i4 = k - ll + 1;
|
|
892 lsec = min(i3,i4);
|
|
893 ulsec = lsec - lsec % 2;
|
|
894
|
|
895 i3 = m;
|
|
896 for (ii = 1; ii <= i3; ii += 256)
|
|
897 {
|
|
898 /* Computing MIN */
|
|
899 i4 = 256;
|
|
900 i5 = m - ii + 1;
|
|
901 isec = min(i4,i5);
|
|
902 uisec = isec - isec % 2;
|
|
903 i4 = ll + ulsec - 1;
|
|
904 for (l = ll; l <= i4; l += 2)
|
|
905 {
|
|
906 i5 = ii + uisec - 1;
|
|
907 for (i = ii; i <= i5; i += 2)
|
|
908 {
|
|
909 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
|
|
910 a[i + l * a_dim1];
|
|
911 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
|
|
912 a[i + (l + 1) * a_dim1];
|
|
913 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
|
|
914 a[i + 1 + l * a_dim1];
|
|
915 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
|
|
916 a[i + 1 + (l + 1) * a_dim1];
|
|
917 }
|
|
918 if (uisec < isec)
|
|
919 {
|
|
920 t1[l - ll + 1 + (isec << 8) - 257] =
|
|
921 a[ii + isec - 1 + l * a_dim1];
|
|
922 t1[l - ll + 2 + (isec << 8) - 257] =
|
|
923 a[ii + isec - 1 + (l + 1) * a_dim1];
|
|
924 }
|
|
925 }
|
|
926 if (ulsec < lsec)
|
|
927 {
|
|
928 i4 = ii + isec - 1;
|
|
929 for (i = ii; i<= i4; ++i)
|
|
930 {
|
|
931 t1[lsec + ((i - ii + 1) << 8) - 257] =
|
|
932 a[i + (ll + lsec - 1) * a_dim1];
|
|
933 }
|
|
934 }
|
|
935
|
|
936 uisec = isec - isec % 4;
|
|
937 i4 = jj + ujsec - 1;
|
|
938 for (j = jj; j <= i4; j += 4)
|
|
939 {
|
|
940 i5 = ii + uisec - 1;
|
|
941 for (i = ii; i <= i5; i += 4)
|
|
942 {
|
|
943 f11 = c[i + j * c_dim1];
|
|
944 f21 = c[i + 1 + j * c_dim1];
|
|
945 f12 = c[i + (j + 1) * c_dim1];
|
|
946 f22 = c[i + 1 + (j + 1) * c_dim1];
|
|
947 f13 = c[i + (j + 2) * c_dim1];
|
|
948 f23 = c[i + 1 + (j + 2) * c_dim1];
|
|
949 f14 = c[i + (j + 3) * c_dim1];
|
|
950 f24 = c[i + 1 + (j + 3) * c_dim1];
|
|
951 f31 = c[i + 2 + j * c_dim1];
|
|
952 f41 = c[i + 3 + j * c_dim1];
|
|
953 f32 = c[i + 2 + (j + 1) * c_dim1];
|
|
954 f42 = c[i + 3 + (j + 1) * c_dim1];
|
|
955 f33 = c[i + 2 + (j + 2) * c_dim1];
|
|
956 f43 = c[i + 3 + (j + 2) * c_dim1];
|
|
957 f34 = c[i + 2 + (j + 3) * c_dim1];
|
|
958 f44 = c[i + 3 + (j + 3) * c_dim1];
|
|
959 i6 = ll + lsec - 1;
|
|
960 for (l = ll; l <= i6; ++l)
|
|
961 {
|
|
962 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
|
|
963 * b[l + j * b_dim1];
|
|
964 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
|
|
965 * b[l + j * b_dim1];
|
|
966 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
|
|
967 * b[l + (j + 1) * b_dim1];
|
|
968 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
|
|
969 * b[l + (j + 1) * b_dim1];
|
|
970 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
|
|
971 * b[l + (j + 2) * b_dim1];
|
|
972 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
|
|
973 * b[l + (j + 2) * b_dim1];
|
|
974 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
|
|
975 * b[l + (j + 3) * b_dim1];
|
|
976 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
|
|
977 * b[l + (j + 3) * b_dim1];
|
|
978 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
|
|
979 * b[l + j * b_dim1];
|
|
980 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
|
|
981 * b[l + j * b_dim1];
|
|
982 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
|
|
983 * b[l + (j + 1) * b_dim1];
|
|
984 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
|
|
985 * b[l + (j + 1) * b_dim1];
|
|
986 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
|
|
987 * b[l + (j + 2) * b_dim1];
|
|
988 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
|
|
989 * b[l + (j + 2) * b_dim1];
|
|
990 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
|
|
991 * b[l + (j + 3) * b_dim1];
|
|
992 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
|
|
993 * b[l + (j + 3) * b_dim1];
|
|
994 }
|
|
995 c[i + j * c_dim1] = f11;
|
|
996 c[i + 1 + j * c_dim1] = f21;
|
|
997 c[i + (j + 1) * c_dim1] = f12;
|
|
998 c[i + 1 + (j + 1) * c_dim1] = f22;
|
|
999 c[i + (j + 2) * c_dim1] = f13;
|
|
1000 c[i + 1 + (j + 2) * c_dim1] = f23;
|
|
1001 c[i + (j + 3) * c_dim1] = f14;
|
|
1002 c[i + 1 + (j + 3) * c_dim1] = f24;
|
|
1003 c[i + 2 + j * c_dim1] = f31;
|
|
1004 c[i + 3 + j * c_dim1] = f41;
|
|
1005 c[i + 2 + (j + 1) * c_dim1] = f32;
|
|
1006 c[i + 3 + (j + 1) * c_dim1] = f42;
|
|
1007 c[i + 2 + (j + 2) * c_dim1] = f33;
|
|
1008 c[i + 3 + (j + 2) * c_dim1] = f43;
|
|
1009 c[i + 2 + (j + 3) * c_dim1] = f34;
|
|
1010 c[i + 3 + (j + 3) * c_dim1] = f44;
|
|
1011 }
|
|
1012 if (uisec < isec)
|
|
1013 {
|
|
1014 i5 = ii + isec - 1;
|
|
1015 for (i = ii + uisec; i <= i5; ++i)
|
|
1016 {
|
|
1017 f11 = c[i + j * c_dim1];
|
|
1018 f12 = c[i + (j + 1) * c_dim1];
|
|
1019 f13 = c[i + (j + 2) * c_dim1];
|
|
1020 f14 = c[i + (j + 3) * c_dim1];
|
|
1021 i6 = ll + lsec - 1;
|
|
1022 for (l = ll; l <= i6; ++l)
|
|
1023 {
|
|
1024 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
|
|
1025 257] * b[l + j * b_dim1];
|
|
1026 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
|
|
1027 257] * b[l + (j + 1) * b_dim1];
|
|
1028 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
|
|
1029 257] * b[l + (j + 2) * b_dim1];
|
|
1030 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
|
|
1031 257] * b[l + (j + 3) * b_dim1];
|
|
1032 }
|
|
1033 c[i + j * c_dim1] = f11;
|
|
1034 c[i + (j + 1) * c_dim1] = f12;
|
|
1035 c[i + (j + 2) * c_dim1] = f13;
|
|
1036 c[i + (j + 3) * c_dim1] = f14;
|
|
1037 }
|
|
1038 }
|
|
1039 }
|
|
1040 if (ujsec < jsec)
|
|
1041 {
|
|
1042 i4 = jj + jsec - 1;
|
|
1043 for (j = jj + ujsec; j <= i4; ++j)
|
|
1044 {
|
|
1045 i5 = ii + uisec - 1;
|
|
1046 for (i = ii; i <= i5; i += 4)
|
|
1047 {
|
|
1048 f11 = c[i + j * c_dim1];
|
|
1049 f21 = c[i + 1 + j * c_dim1];
|
|
1050 f31 = c[i + 2 + j * c_dim1];
|
|
1051 f41 = c[i + 3 + j * c_dim1];
|
|
1052 i6 = ll + lsec - 1;
|
|
1053 for (l = ll; l <= i6; ++l)
|
|
1054 {
|
|
1055 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
|
|
1056 257] * b[l + j * b_dim1];
|
|
1057 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
|
|
1058 257] * b[l + j * b_dim1];
|
|
1059 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
|
|
1060 257] * b[l + j * b_dim1];
|
|
1061 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
|
|
1062 257] * b[l + j * b_dim1];
|
|
1063 }
|
|
1064 c[i + j * c_dim1] = f11;
|
|
1065 c[i + 1 + j * c_dim1] = f21;
|
|
1066 c[i + 2 + j * c_dim1] = f31;
|
|
1067 c[i + 3 + j * c_dim1] = f41;
|
|
1068 }
|
|
1069 i5 = ii + isec - 1;
|
|
1070 for (i = ii + uisec; i <= i5; ++i)
|
|
1071 {
|
|
1072 f11 = c[i + j * c_dim1];
|
|
1073 i6 = ll + lsec - 1;
|
|
1074 for (l = ll; l <= i6; ++l)
|
|
1075 {
|
|
1076 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
|
|
1077 257] * b[l + j * b_dim1];
|
|
1078 }
|
|
1079 c[i + j * c_dim1] = f11;
|
|
1080 }
|
|
1081 }
|
|
1082 }
|
|
1083 }
|
|
1084 }
|
|
1085 }
|
|
1086 free(t1);
|
|
1087 return;
|
|
1088 }
|
|
1089 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
|
|
1090 {
|
|
1091 if (GFC_DESCRIPTOR_RANK (a) != 1)
|
|
1092 {
|
|
1093 const GFC_COMPLEX_8 *restrict abase_x;
|
|
1094 const GFC_COMPLEX_8 *restrict bbase_y;
|
|
1095 GFC_COMPLEX_8 *restrict dest_y;
|
|
1096 GFC_COMPLEX_8 s;
|
|
1097
|
|
1098 for (y = 0; y < ycount; y++)
|
|
1099 {
|
|
1100 bbase_y = &bbase[y*bystride];
|
|
1101 dest_y = &dest[y*rystride];
|
|
1102 for (x = 0; x < xcount; x++)
|
|
1103 {
|
|
1104 abase_x = &abase[x*axstride];
|
|
1105 s = (GFC_COMPLEX_8) 0;
|
|
1106 for (n = 0; n < count; n++)
|
|
1107 s += abase_x[n] * bbase_y[n];
|
|
1108 dest_y[x] = s;
|
|
1109 }
|
|
1110 }
|
|
1111 }
|
|
1112 else
|
|
1113 {
|
|
1114 const GFC_COMPLEX_8 *restrict bbase_y;
|
|
1115 GFC_COMPLEX_8 s;
|
|
1116
|
|
1117 for (y = 0; y < ycount; y++)
|
|
1118 {
|
|
1119 bbase_y = &bbase[y*bystride];
|
|
1120 s = (GFC_COMPLEX_8) 0;
|
|
1121 for (n = 0; n < count; n++)
|
|
1122 s += abase[n*axstride] * bbase_y[n];
|
|
1123 dest[y*rystride] = s;
|
|
1124 }
|
|
1125 }
|
|
1126 }
|
|
1127 else if (axstride < aystride)
|
|
1128 {
|
|
1129 for (y = 0; y < ycount; y++)
|
|
1130 for (x = 0; x < xcount; x++)
|
|
1131 dest[x*rxstride + y*rystride] = (GFC_COMPLEX_8)0;
|
|
1132
|
|
1133 for (y = 0; y < ycount; y++)
|
|
1134 for (n = 0; n < count; n++)
|
|
1135 for (x = 0; x < xcount; x++)
|
|
1136 /* dest[x,y] += a[x,n] * b[n,y] */
|
|
1137 dest[x*rxstride + y*rystride] +=
|
|
1138 abase[x*axstride + n*aystride] *
|
|
1139 bbase[n*bxstride + y*bystride];
|
|
1140 }
|
|
1141 else if (GFC_DESCRIPTOR_RANK (a) == 1)
|
|
1142 {
|
|
1143 const GFC_COMPLEX_8 *restrict bbase_y;
|
|
1144 GFC_COMPLEX_8 s;
|
|
1145
|
|
1146 for (y = 0; y < ycount; y++)
|
|
1147 {
|
|
1148 bbase_y = &bbase[y*bystride];
|
|
1149 s = (GFC_COMPLEX_8) 0;
|
|
1150 for (n = 0; n < count; n++)
|
|
1151 s += abase[n*axstride] * bbase_y[n*bxstride];
|
|
1152 dest[y*rxstride] = s;
|
|
1153 }
|
|
1154 }
|
|
1155 else
|
|
1156 {
|
|
1157 const GFC_COMPLEX_8 *restrict abase_x;
|
|
1158 const GFC_COMPLEX_8 *restrict bbase_y;
|
|
1159 GFC_COMPLEX_8 *restrict dest_y;
|
|
1160 GFC_COMPLEX_8 s;
|
|
1161
|
|
1162 for (y = 0; y < ycount; y++)
|
|
1163 {
|
|
1164 bbase_y = &bbase[y*bystride];
|
|
1165 dest_y = &dest[y*rystride];
|
|
1166 for (x = 0; x < xcount; x++)
|
|
1167 {
|
|
1168 abase_x = &abase[x*axstride];
|
|
1169 s = (GFC_COMPLEX_8) 0;
|
|
1170 for (n = 0; n < count; n++)
|
|
1171 s += abase_x[n*aystride] * bbase_y[n*bxstride];
|
|
1172 dest_y[x*rxstride] = s;
|
|
1173 }
|
|
1174 }
|
|
1175 }
|
|
1176 }
|
|
1177 #undef POW3
|
|
1178 #undef min
|
|
1179 #undef max
|
|
1180
|
|
1181 #endif
|
|
1182
|
|
1183 #endif
|
|
1184
|