111
|
1 /* Implementation of the MATMUL intrinsic
|
145
|
2 Copyright (C) 2002-2020 Free Software Foundation, Inc.
|
111
|
3 Contributed by Paul Brook <paul@nowt.org>
|
|
4
|
|
5 This file is part of the GNU Fortran runtime library (libgfortran).
|
|
6
|
|
7 Libgfortran is free software; you can redistribute it and/or
|
|
8 modify it under the terms of the GNU General Public
|
|
9 License as published by the Free Software Foundation; either
|
|
10 version 3 of the License, or (at your option) any later version.
|
|
11
|
|
12 Libgfortran is distributed in the hope that it will be useful,
|
|
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
15 GNU General Public License for more details.
|
|
16
|
|
17 Under Section 7 of GPL version 3, you are granted additional
|
|
18 permissions described in the GCC Runtime Library Exception, version
|
|
19 3.1, as published by the Free Software Foundation.
|
|
20
|
|
21 You should have received a copy of the GNU General Public License and
|
|
22 a copy of the GCC Runtime Library Exception along with this program;
|
|
23 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
|
|
24 <http://www.gnu.org/licenses/>. */
|
|
25
|
|
26 #include "libgfortran.h"
|
|
27 #include <string.h>
|
|
28 #include <assert.h>
|
|
29
|
|
30
|
|
31 #if defined (HAVE_GFC_INTEGER_8)
|
|
32
|
|
33 /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
|
|
34 passed to us by the front-end, in which case we call it for large
|
|
35 matrices. */
|
|
36
|
|
37 typedef void (*blas_call)(const char *, const char *, const int *, const int *,
|
|
38 const int *, const GFC_INTEGER_8 *, const GFC_INTEGER_8 *,
|
|
39 const int *, const GFC_INTEGER_8 *, const int *,
|
|
40 const GFC_INTEGER_8 *, GFC_INTEGER_8 *, const int *,
|
|
41 int, int);
|
|
42
|
|
43 /* The order of loops is different in the case of plain matrix
|
|
44 multiplication C=MATMUL(A,B), and in the frequent special case where
|
|
45 the argument A is the temporary result of a TRANSPOSE intrinsic:
|
|
46 C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by
|
|
47 looking at their strides.
|
|
48
|
|
49 The equivalent Fortran pseudo-code is:
|
|
50
|
|
51 DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
|
|
52 IF (.NOT.IS_TRANSPOSED(A)) THEN
|
|
53 C = 0
|
|
54 DO J=1,N
|
|
55 DO K=1,COUNT
|
|
56 DO I=1,M
|
|
57 C(I,J) = C(I,J)+A(I,K)*B(K,J)
|
|
58 ELSE
|
|
59 DO J=1,N
|
|
60 DO I=1,M
|
|
61 S = 0
|
|
62 DO K=1,COUNT
|
|
63 S = S+A(I,K)*B(K,J)
|
|
64 C(I,J) = S
|
|
65 ENDIF
|
|
66 */
|
|
67
|
|
68 /* If try_blas is set to a nonzero value, then the matmul function will
|
|
69 see if there is a way to perform the matrix multiplication by a call
|
|
70 to the BLAS gemm function. */
|
|
71
|
|
72 extern void matmul_i8 (gfc_array_i8 * const restrict retarray,
|
|
73 gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
|
|
74 int blas_limit, blas_call gemm);
|
|
75 export_proto(matmul_i8);
|
|
76
|
|
77 /* Put exhaustive list of possible architectures here here, ORed together. */
|
|
78
|
|
79 #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
|
|
80
|
|
81 #ifdef HAVE_AVX
|
|
82 static void
|
|
83 matmul_i8_avx (gfc_array_i8 * const restrict retarray,
|
|
84 gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
|
|
85 int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
|
|
86 static void
|
|
87 matmul_i8_avx (gfc_array_i8 * const restrict retarray,
|
|
88 gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
|
|
89 int blas_limit, blas_call gemm)
|
|
90 {
|
|
91 const GFC_INTEGER_8 * restrict abase;
|
|
92 const GFC_INTEGER_8 * restrict bbase;
|
|
93 GFC_INTEGER_8 * restrict dest;
|
|
94
|
|
95 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
|
|
96 index_type x, y, n, count, xcount, ycount;
|
|
97
|
|
98 assert (GFC_DESCRIPTOR_RANK (a) == 2
|
|
99 || GFC_DESCRIPTOR_RANK (b) == 2);
|
|
100
|
|
101 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
|
|
102
|
|
103 Either A or B (but not both) can be rank 1:
|
|
104
|
|
105 o One-dimensional argument A is implicitly treated as a row matrix
|
|
106 dimensioned [1,count], so xcount=1.
|
|
107
|
|
108 o One-dimensional argument B is implicitly treated as a column matrix
|
|
109 dimensioned [count, 1], so ycount=1.
|
|
110 */
|
|
111
|
|
112 if (retarray->base_addr == NULL)
|
|
113 {
|
|
114 if (GFC_DESCRIPTOR_RANK (a) == 1)
|
|
115 {
|
|
116 GFC_DIMENSION_SET(retarray->dim[0], 0,
|
|
117 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
|
|
118 }
|
|
119 else if (GFC_DESCRIPTOR_RANK (b) == 1)
|
|
120 {
|
|
121 GFC_DIMENSION_SET(retarray->dim[0], 0,
|
|
122 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
|
|
123 }
|
|
124 else
|
|
125 {
|
|
126 GFC_DIMENSION_SET(retarray->dim[0], 0,
|
|
127 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
|
|
128
|
|
129 GFC_DIMENSION_SET(retarray->dim[1], 0,
|
|
130 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
|
|
131 GFC_DESCRIPTOR_EXTENT(retarray,0));
|
|
132 }
|
|
133
|
|
134 retarray->base_addr
|
|
135 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_8));
|
|
136 retarray->offset = 0;
|
|
137 }
|
|
138 else if (unlikely (compile_options.bounds_check))
|
|
139 {
|
|
140 index_type ret_extent, arg_extent;
|
|
141
|
|
142 if (GFC_DESCRIPTOR_RANK (a) == 1)
|
|
143 {
|
|
144 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
|
|
145 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
|
|
146 if (arg_extent != ret_extent)
|
131
|
147 runtime_error ("Array bound mismatch for dimension 1 of "
|
|
148 "array (%ld/%ld) ",
|
111
|
149 (long int) ret_extent, (long int) arg_extent);
|
|
150 }
|
|
151 else if (GFC_DESCRIPTOR_RANK (b) == 1)
|
|
152 {
|
|
153 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
|
|
154 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
|
|
155 if (arg_extent != ret_extent)
|
131
|
156 runtime_error ("Array bound mismatch for dimension 1 of "
|
|
157 "array (%ld/%ld) ",
|
111
|
158 (long int) ret_extent, (long int) arg_extent);
|
|
159 }
|
|
160 else
|
|
161 {
|
|
162 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
|
|
163 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
|
|
164 if (arg_extent != ret_extent)
|
131
|
165 runtime_error ("Array bound mismatch for dimension 1 of "
|
|
166 "array (%ld/%ld) ",
|
111
|
167 (long int) ret_extent, (long int) arg_extent);
|
|
168
|
|
169 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
|
|
170 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
|
|
171 if (arg_extent != ret_extent)
|
131
|
172 runtime_error ("Array bound mismatch for dimension 2 of "
|
|
173 "array (%ld/%ld) ",
|
111
|
174 (long int) ret_extent, (long int) arg_extent);
|
|
175 }
|
|
176 }
|
|
177
|
|
178
|
|
179 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
|
|
180 {
|
|
181 /* One-dimensional result may be addressed in the code below
|
|
182 either as a row or a column matrix. We want both cases to
|
|
183 work. */
|
|
184 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
|
|
185 }
|
|
186 else
|
|
187 {
|
|
188 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
|
|
189 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
|
|
190 }
|
|
191
|
|
192
|
|
193 if (GFC_DESCRIPTOR_RANK (a) == 1)
|
|
194 {
|
|
195 /* Treat it as a a row matrix A[1,count]. */
|
|
196 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
|
|
197 aystride = 1;
|
|
198
|
|
199 xcount = 1;
|
|
200 count = GFC_DESCRIPTOR_EXTENT(a,0);
|
|
201 }
|
|
202 else
|
|
203 {
|
|
204 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
|
|
205 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
|
|
206
|
|
207 count = GFC_DESCRIPTOR_EXTENT(a,1);
|
|
208 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
|
|
209 }
|
|
210
|
|
211 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
|
|
212 {
|
|
213 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
|
131
|
214 runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
|
|
215 "in dimension 1: is %ld, should be %ld",
|
|
216 (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
|
111
|
217 }
|
|
218
|
|
219 if (GFC_DESCRIPTOR_RANK (b) == 1)
|
|
220 {
|
|
221 /* Treat it as a column matrix B[count,1] */
|
|
222 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
|
|
223
|
|
224 /* bystride should never be used for 1-dimensional b.
|
|
225 The value is only used for calculation of the
|
|
226 memory by the buffer. */
|
|
227 bystride = 256;
|
|
228 ycount = 1;
|
|
229 }
|
|
230 else
|
|
231 {
|
|
232 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
|
|
233 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
|
|
234 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
|
|
235 }
|
|
236
|
|
237 abase = a->base_addr;
|
|
238 bbase = b->base_addr;
|
|
239 dest = retarray->base_addr;
|
|
240
|
|
241 /* Now that everything is set up, we perform the multiplication
|
|
242 itself. */
|
|
243
|
|
244 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
|
|
245 #define min(a,b) ((a) <= (b) ? (a) : (b))
|
|
246 #define max(a,b) ((a) >= (b) ? (a) : (b))
|
|
247
|
|
248 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
|
|
249 && (bxstride == 1 || bystride == 1)
|
|
250 && (((float) xcount) * ((float) ycount) * ((float) count)
|
|
251 > POW3(blas_limit)))
|
|
252 {
|
|
253 const int m = xcount, n = ycount, k = count, ldc = rystride;
|
|
254 const GFC_INTEGER_8 one = 1, zero = 0;
|
|
255 const int lda = (axstride == 1) ? aystride : axstride,
|
|
256 ldb = (bxstride == 1) ? bystride : bxstride;
|
|
257
|
|
258 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
|
|
259 {
|
|
260 assert (gemm != NULL);
|
131
|
261 const char *transa, *transb;
|
|
262 if (try_blas & 2)
|
|
263 transa = "C";
|
|
264 else
|
|
265 transa = axstride == 1 ? "N" : "T";
|
|
266
|
|
267 if (try_blas & 4)
|
|
268 transb = "C";
|
|
269 else
|
|
270 transb = bxstride == 1 ? "N" : "T";
|
|
271
|
|
272 gemm (transa, transb , &m,
|
111
|
273 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
|
|
274 &ldc, 1, 1);
|
|
275 return;
|
|
276 }
|
|
277 }
|
|
278
|
|
279 if (rxstride == 1 && axstride == 1 && bxstride == 1)
|
|
280 {
|
|
281 /* This block of code implements a tuned matmul, derived from
|
|
282 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
|
|
283
|
|
284 Bo Kagstrom and Per Ling
|
|
285 Department of Computing Science
|
|
286 Umea University
|
|
287 S-901 87 Umea, Sweden
|
|
288
|
|
289 from netlib.org, translated to C, and modified for matmul.m4. */
|
|
290
|
|
291 const GFC_INTEGER_8 *a, *b;
|
|
292 GFC_INTEGER_8 *c;
|
|
293 const index_type m = xcount, n = ycount, k = count;
|
|
294
|
|
295 /* System generated locals */
|
|
296 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
|
|
297 i1, i2, i3, i4, i5, i6;
|
|
298
|
|
299 /* Local variables */
|
|
300 GFC_INTEGER_8 f11, f12, f21, f22, f31, f32, f41, f42,
|
|
301 f13, f14, f23, f24, f33, f34, f43, f44;
|
|
302 index_type i, j, l, ii, jj, ll;
|
|
303 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
|
|
304 GFC_INTEGER_8 *t1;
|
|
305
|
|
306 a = abase;
|
|
307 b = bbase;
|
|
308 c = retarray->base_addr;
|
|
309
|
|
310 /* Parameter adjustments */
|
|
311 c_dim1 = rystride;
|
|
312 c_offset = 1 + c_dim1;
|
|
313 c -= c_offset;
|
|
314 a_dim1 = aystride;
|
|
315 a_offset = 1 + a_dim1;
|
|
316 a -= a_offset;
|
|
317 b_dim1 = bystride;
|
|
318 b_offset = 1 + b_dim1;
|
|
319 b -= b_offset;
|
|
320
|
|
321 /* Empty c first. */
|
|
322 for (j=1; j<=n; j++)
|
|
323 for (i=1; i<=m; i++)
|
|
324 c[i + j * c_dim1] = (GFC_INTEGER_8)0;
|
|
325
|
|
326 /* Early exit if possible */
|
|
327 if (m == 0 || n == 0 || k == 0)
|
|
328 return;
|
|
329
|
|
330 /* Adjust size of t1 to what is needed. */
|
131
|
331 index_type t1_dim, a_sz;
|
|
332 if (aystride == 1)
|
|
333 a_sz = rystride;
|
|
334 else
|
|
335 a_sz = a_dim1;
|
|
336
|
|
337 t1_dim = a_sz * 256 + b_dim1;
|
111
|
338 if (t1_dim > 65536)
|
|
339 t1_dim = 65536;
|
|
340
|
|
341 t1 = malloc (t1_dim * sizeof(GFC_INTEGER_8));
|
|
342
|
|
343 /* Start turning the crank. */
|
|
344 i1 = n;
|
|
345 for (jj = 1; jj <= i1; jj += 512)
|
|
346 {
|
|
347 /* Computing MIN */
|
|
348 i2 = 512;
|
|
349 i3 = n - jj + 1;
|
|
350 jsec = min(i2,i3);
|
|
351 ujsec = jsec - jsec % 4;
|
|
352 i2 = k;
|
|
353 for (ll = 1; ll <= i2; ll += 256)
|
|
354 {
|
|
355 /* Computing MIN */
|
|
356 i3 = 256;
|
|
357 i4 = k - ll + 1;
|
|
358 lsec = min(i3,i4);
|
|
359 ulsec = lsec - lsec % 2;
|
|
360
|
|
361 i3 = m;
|
|
362 for (ii = 1; ii <= i3; ii += 256)
|
|
363 {
|
|
364 /* Computing MIN */
|
|
365 i4 = 256;
|
|
366 i5 = m - ii + 1;
|
|
367 isec = min(i4,i5);
|
|
368 uisec = isec - isec % 2;
|
|
369 i4 = ll + ulsec - 1;
|
|
370 for (l = ll; l <= i4; l += 2)
|
|
371 {
|
|
372 i5 = ii + uisec - 1;
|
|
373 for (i = ii; i <= i5; i += 2)
|
|
374 {
|
|
375 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
|
|
376 a[i + l * a_dim1];
|
|
377 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
|
|
378 a[i + (l + 1) * a_dim1];
|
|
379 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
|
|
380 a[i + 1 + l * a_dim1];
|
|
381 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
|
|
382 a[i + 1 + (l + 1) * a_dim1];
|
|
383 }
|
|
384 if (uisec < isec)
|
|
385 {
|
|
386 t1[l - ll + 1 + (isec << 8) - 257] =
|
|
387 a[ii + isec - 1 + l * a_dim1];
|
|
388 t1[l - ll + 2 + (isec << 8) - 257] =
|
|
389 a[ii + isec - 1 + (l + 1) * a_dim1];
|
|
390 }
|
|
391 }
|
|
392 if (ulsec < lsec)
|
|
393 {
|
|
394 i4 = ii + isec - 1;
|
|
395 for (i = ii; i<= i4; ++i)
|
|
396 {
|
|
397 t1[lsec + ((i - ii + 1) << 8) - 257] =
|
|
398 a[i + (ll + lsec - 1) * a_dim1];
|
|
399 }
|
|
400 }
|
|
401
|
|
402 uisec = isec - isec % 4;
|
|
403 i4 = jj + ujsec - 1;
|
|
404 for (j = jj; j <= i4; j += 4)
|
|
405 {
|
|
406 i5 = ii + uisec - 1;
|
|
407 for (i = ii; i <= i5; i += 4)
|
|
408 {
|
|
409 f11 = c[i + j * c_dim1];
|
|
410 f21 = c[i + 1 + j * c_dim1];
|
|
411 f12 = c[i + (j + 1) * c_dim1];
|
|
412 f22 = c[i + 1 + (j + 1) * c_dim1];
|
|
413 f13 = c[i + (j + 2) * c_dim1];
|
|
414 f23 = c[i + 1 + (j + 2) * c_dim1];
|
|
415 f14 = c[i + (j + 3) * c_dim1];
|
|
416 f24 = c[i + 1 + (j + 3) * c_dim1];
|
|
417 f31 = c[i + 2 + j * c_dim1];
|
|
418 f41 = c[i + 3 + j * c_dim1];
|
|
419 f32 = c[i + 2 + (j + 1) * c_dim1];
|
|
420 f42 = c[i + 3 + (j + 1) * c_dim1];
|
|
421 f33 = c[i + 2 + (j + 2) * c_dim1];
|
|
422 f43 = c[i + 3 + (j + 2) * c_dim1];
|
|
423 f34 = c[i + 2 + (j + 3) * c_dim1];
|
|
424 f44 = c[i + 3 + (j + 3) * c_dim1];
|
|
425 i6 = ll + lsec - 1;
|
|
426 for (l = ll; l <= i6; ++l)
|
|
427 {
|
|
428 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
|
|
429 * b[l + j * b_dim1];
|
|
430 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
|
|
431 * b[l + j * b_dim1];
|
|
432 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
|
|
433 * b[l + (j + 1) * b_dim1];
|
|
434 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
|
|
435 * b[l + (j + 1) * b_dim1];
|
|
436 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
|
|
437 * b[l + (j + 2) * b_dim1];
|
|
438 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
|
|
439 * b[l + (j + 2) * b_dim1];
|
|
440 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
|
|
441 * b[l + (j + 3) * b_dim1];
|
|
442 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
|
|
443 * b[l + (j + 3) * b_dim1];
|
|
444 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
|
|
445 * b[l + j * b_dim1];
|
|
446 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
|
|
447 * b[l + j * b_dim1];
|
|
448 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
|
|
449 * b[l + (j + 1) * b_dim1];
|
|
450 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
|
|
451 * b[l + (j + 1) * b_dim1];
|
|
452 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
|
|
453 * b[l + (j + 2) * b_dim1];
|
|
454 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
|
|
455 * b[l + (j + 2) * b_dim1];
|
|
456 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
|
|
457 * b[l + (j + 3) * b_dim1];
|
|
458 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
|
|
459 * b[l + (j + 3) * b_dim1];
|
|
460 }
|
|
461 c[i + j * c_dim1] = f11;
|
|
462 c[i + 1 + j * c_dim1] = f21;
|
|
463 c[i + (j + 1) * c_dim1] = f12;
|
|
464 c[i + 1 + (j + 1) * c_dim1] = f22;
|
|
465 c[i + (j + 2) * c_dim1] = f13;
|
|
466 c[i + 1 + (j + 2) * c_dim1] = f23;
|
|
467 c[i + (j + 3) * c_dim1] = f14;
|
|
468 c[i + 1 + (j + 3) * c_dim1] = f24;
|
|
469 c[i + 2 + j * c_dim1] = f31;
|
|
470 c[i + 3 + j * c_dim1] = f41;
|
|
471 c[i + 2 + (j + 1) * c_dim1] = f32;
|
|
472 c[i + 3 + (j + 1) * c_dim1] = f42;
|
|
473 c[i + 2 + (j + 2) * c_dim1] = f33;
|
|
474 c[i + 3 + (j + 2) * c_dim1] = f43;
|
|
475 c[i + 2 + (j + 3) * c_dim1] = f34;
|
|
476 c[i + 3 + (j + 3) * c_dim1] = f44;
|
|
477 }
|
|
478 if (uisec < isec)
|
|
479 {
|
|
480 i5 = ii + isec - 1;
|
|
481 for (i = ii + uisec; i <= i5; ++i)
|
|
482 {
|
|
483 f11 = c[i + j * c_dim1];
|
|
484 f12 = c[i + (j + 1) * c_dim1];
|
|
485 f13 = c[i + (j + 2) * c_dim1];
|
|
486 f14 = c[i + (j + 3) * c_dim1];
|
|
487 i6 = ll + lsec - 1;
|
|
488 for (l = ll; l <= i6; ++l)
|
|
489 {
|
|
490 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
|
|
491 257] * b[l + j * b_dim1];
|
|
492 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
|
|
493 257] * b[l + (j + 1) * b_dim1];
|
|
494 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
|
|
495 257] * b[l + (j + 2) * b_dim1];
|
|
496 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
|
|
497 257] * b[l + (j + 3) * b_dim1];
|
|
498 }
|
|
499 c[i + j * c_dim1] = f11;
|
|
500 c[i + (j + 1) * c_dim1] = f12;
|
|
501 c[i + (j + 2) * c_dim1] = f13;
|
|
502 c[i + (j + 3) * c_dim1] = f14;
|
|
503 }
|
|
504 }
|
|
505 }
|
|
506 if (ujsec < jsec)
|
|
507 {
|
|
508 i4 = jj + jsec - 1;
|
|
509 for (j = jj + ujsec; j <= i4; ++j)
|
|
510 {
|
|
511 i5 = ii + uisec - 1;
|
|
512 for (i = ii; i <= i5; i += 4)
|
|
513 {
|
|
514 f11 = c[i + j * c_dim1];
|
|
515 f21 = c[i + 1 + j * c_dim1];
|
|
516 f31 = c[i + 2 + j * c_dim1];
|
|
517 f41 = c[i + 3 + j * c_dim1];
|
|
518 i6 = ll + lsec - 1;
|
|
519 for (l = ll; l <= i6; ++l)
|
|
520 {
|
|
521 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
|
|
522 257] * b[l + j * b_dim1];
|
|
523 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
|
|
524 257] * b[l + j * b_dim1];
|
|
525 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
|
|
526 257] * b[l + j * b_dim1];
|
|
527 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
|
|
528 257] * b[l + j * b_dim1];
|
|
529 }
|
|
530 c[i + j * c_dim1] = f11;
|
|
531 c[i + 1 + j * c_dim1] = f21;
|
|
532 c[i + 2 + j * c_dim1] = f31;
|
|
533 c[i + 3 + j * c_dim1] = f41;
|
|
534 }
|
|
535 i5 = ii + isec - 1;
|
|
536 for (i = ii + uisec; i <= i5; ++i)
|
|
537 {
|
|
538 f11 = c[i + j * c_dim1];
|
|
539 i6 = ll + lsec - 1;
|
|
540 for (l = ll; l <= i6; ++l)
|
|
541 {
|
|
542 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
|
|
543 257] * b[l + j * b_dim1];
|
|
544 }
|
|
545 c[i + j * c_dim1] = f11;
|
|
546 }
|
|
547 }
|
|
548 }
|
|
549 }
|
|
550 }
|
|
551 }
|
|
552 free(t1);
|
|
553 return;
|
|
554 }
|
|
555 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
|
|
556 {
|
|
557 if (GFC_DESCRIPTOR_RANK (a) != 1)
|
|
558 {
|
|
559 const GFC_INTEGER_8 *restrict abase_x;
|
|
560 const GFC_INTEGER_8 *restrict bbase_y;
|
|
561 GFC_INTEGER_8 *restrict dest_y;
|
|
562 GFC_INTEGER_8 s;
|
|
563
|
|
564 for (y = 0; y < ycount; y++)
|
|
565 {
|
|
566 bbase_y = &bbase[y*bystride];
|
|
567 dest_y = &dest[y*rystride];
|
|
568 for (x = 0; x < xcount; x++)
|
|
569 {
|
|
570 abase_x = &abase[x*axstride];
|
|
571 s = (GFC_INTEGER_8) 0;
|
|
572 for (n = 0; n < count; n++)
|
|
573 s += abase_x[n] * bbase_y[n];
|
|
574 dest_y[x] = s;
|
|
575 }
|
|
576 }
|
|
577 }
|
|
578 else
|
|
579 {
|
|
580 const GFC_INTEGER_8 *restrict bbase_y;
|
|
581 GFC_INTEGER_8 s;
|
|
582
|
|
583 for (y = 0; y < ycount; y++)
|
|
584 {
|
|
585 bbase_y = &bbase[y*bystride];
|
|
586 s = (GFC_INTEGER_8) 0;
|
|
587 for (n = 0; n < count; n++)
|
|
588 s += abase[n*axstride] * bbase_y[n];
|
|
589 dest[y*rystride] = s;
|
|
590 }
|
|
591 }
|
|
592 }
|
|
593 else if (axstride < aystride)
|
|
594 {
|
|
595 for (y = 0; y < ycount; y++)
|
|
596 for (x = 0; x < xcount; x++)
|
|
597 dest[x*rxstride + y*rystride] = (GFC_INTEGER_8)0;
|
|
598
|
|
599 for (y = 0; y < ycount; y++)
|
|
600 for (n = 0; n < count; n++)
|
|
601 for (x = 0; x < xcount; x++)
|
|
602 /* dest[x,y] += a[x,n] * b[n,y] */
|
|
603 dest[x*rxstride + y*rystride] +=
|
|
604 abase[x*axstride + n*aystride] *
|
|
605 bbase[n*bxstride + y*bystride];
|
|
606 }
|
|
607 else if (GFC_DESCRIPTOR_RANK (a) == 1)
|
|
608 {
|
|
609 const GFC_INTEGER_8 *restrict bbase_y;
|
|
610 GFC_INTEGER_8 s;
|
|
611
|
|
612 for (y = 0; y < ycount; y++)
|
|
613 {
|
|
614 bbase_y = &bbase[y*bystride];
|
|
615 s = (GFC_INTEGER_8) 0;
|
|
616 for (n = 0; n < count; n++)
|
|
617 s += abase[n*axstride] * bbase_y[n*bxstride];
|
|
618 dest[y*rxstride] = s;
|
|
619 }
|
|
620 }
|
|
621 else
|
|
622 {
|
|
623 const GFC_INTEGER_8 *restrict abase_x;
|
|
624 const GFC_INTEGER_8 *restrict bbase_y;
|
|
625 GFC_INTEGER_8 *restrict dest_y;
|
|
626 GFC_INTEGER_8 s;
|
|
627
|
|
628 for (y = 0; y < ycount; y++)
|
|
629 {
|
|
630 bbase_y = &bbase[y*bystride];
|
|
631 dest_y = &dest[y*rystride];
|
|
632 for (x = 0; x < xcount; x++)
|
|
633 {
|
|
634 abase_x = &abase[x*axstride];
|
|
635 s = (GFC_INTEGER_8) 0;
|
|
636 for (n = 0; n < count; n++)
|
|
637 s += abase_x[n*aystride] * bbase_y[n*bxstride];
|
|
638 dest_y[x*rxstride] = s;
|
|
639 }
|
|
640 }
|
|
641 }
|
|
642 }
|
|
643 #undef POW3
|
|
644 #undef min
|
|
645 #undef max
|
|
646
|
|
647 #endif /* HAVE_AVX */
|
|
648
|
|
649 #ifdef HAVE_AVX2
|
|
650 static void
|
|
651 matmul_i8_avx2 (gfc_array_i8 * const restrict retarray,
|
|
652 gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
|
|
653 int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
|
|
654 static void
|
|
655 matmul_i8_avx2 (gfc_array_i8 * const restrict retarray,
|
|
656 gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
|
|
657 int blas_limit, blas_call gemm)
|
|
658 {
|
|
659 const GFC_INTEGER_8 * restrict abase;
|
|
660 const GFC_INTEGER_8 * restrict bbase;
|
|
661 GFC_INTEGER_8 * restrict dest;
|
|
662
|
|
663 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
|
|
664 index_type x, y, n, count, xcount, ycount;
|
|
665
|
|
666 assert (GFC_DESCRIPTOR_RANK (a) == 2
|
|
667 || GFC_DESCRIPTOR_RANK (b) == 2);
|
|
668
|
|
669 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
|
|
670
|
|
671 Either A or B (but not both) can be rank 1:
|
|
672
|
|
673 o One-dimensional argument A is implicitly treated as a row matrix
|
|
674 dimensioned [1,count], so xcount=1.
|
|
675
|
|
676 o One-dimensional argument B is implicitly treated as a column matrix
|
|
677 dimensioned [count, 1], so ycount=1.
|
|
678 */
|
|
679
|
|
680 if (retarray->base_addr == NULL)
|
|
681 {
|
|
682 if (GFC_DESCRIPTOR_RANK (a) == 1)
|
|
683 {
|
|
684 GFC_DIMENSION_SET(retarray->dim[0], 0,
|
|
685 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
|
|
686 }
|
|
687 else if (GFC_DESCRIPTOR_RANK (b) == 1)
|
|
688 {
|
|
689 GFC_DIMENSION_SET(retarray->dim[0], 0,
|
|
690 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
|
|
691 }
|
|
692 else
|
|
693 {
|
|
694 GFC_DIMENSION_SET(retarray->dim[0], 0,
|
|
695 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
|
|
696
|
|
697 GFC_DIMENSION_SET(retarray->dim[1], 0,
|
|
698 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
|
|
699 GFC_DESCRIPTOR_EXTENT(retarray,0));
|
|
700 }
|
|
701
|
|
702 retarray->base_addr
|
|
703 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_8));
|
|
704 retarray->offset = 0;
|
|
705 }
|
|
706 else if (unlikely (compile_options.bounds_check))
|
|
707 {
|
|
708 index_type ret_extent, arg_extent;
|
|
709
|
|
710 if (GFC_DESCRIPTOR_RANK (a) == 1)
|
|
711 {
|
|
712 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
|
|
713 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
|
|
714 if (arg_extent != ret_extent)
|
131
|
715 runtime_error ("Array bound mismatch for dimension 1 of "
|
|
716 "array (%ld/%ld) ",
|
111
|
717 (long int) ret_extent, (long int) arg_extent);
|
|
718 }
|
|
719 else if (GFC_DESCRIPTOR_RANK (b) == 1)
|
|
720 {
|
|
721 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
|
|
722 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
|
|
723 if (arg_extent != ret_extent)
|
131
|
724 runtime_error ("Array bound mismatch for dimension 1 of "
|
|
725 "array (%ld/%ld) ",
|
111
|
726 (long int) ret_extent, (long int) arg_extent);
|
|
727 }
|
|
728 else
|
|
729 {
|
|
730 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
|
|
731 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
|
|
732 if (arg_extent != ret_extent)
|
131
|
733 runtime_error ("Array bound mismatch for dimension 1 of "
|
|
734 "array (%ld/%ld) ",
|
111
|
735 (long int) ret_extent, (long int) arg_extent);
|
|
736
|
|
737 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
|
|
738 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
|
|
739 if (arg_extent != ret_extent)
|
131
|
740 runtime_error ("Array bound mismatch for dimension 2 of "
|
|
741 "array (%ld/%ld) ",
|
111
|
742 (long int) ret_extent, (long int) arg_extent);
|
|
743 }
|
|
744 }
|
|
745
|
|
746
|
|
747 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
|
|
748 {
|
|
749 /* One-dimensional result may be addressed in the code below
|
|
750 either as a row or a column matrix. We want both cases to
|
|
751 work. */
|
|
752 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
|
|
753 }
|
|
754 else
|
|
755 {
|
|
756 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
|
|
757 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
|
|
758 }
|
|
759
|
|
760
|
|
761 if (GFC_DESCRIPTOR_RANK (a) == 1)
|
|
762 {
|
|
763 /* Treat it as a a row matrix A[1,count]. */
|
|
764 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
|
|
765 aystride = 1;
|
|
766
|
|
767 xcount = 1;
|
|
768 count = GFC_DESCRIPTOR_EXTENT(a,0);
|
|
769 }
|
|
770 else
|
|
771 {
|
|
772 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
|
|
773 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
|
|
774
|
|
775 count = GFC_DESCRIPTOR_EXTENT(a,1);
|
|
776 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
|
|
777 }
|
|
778
|
|
779 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
|
|
780 {
|
|
781 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
|
131
|
782 runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
|
|
783 "in dimension 1: is %ld, should be %ld",
|
|
784 (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
|
111
|
785 }
|
|
786
|
|
787 if (GFC_DESCRIPTOR_RANK (b) == 1)
|
|
788 {
|
|
789 /* Treat it as a column matrix B[count,1] */
|
|
790 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
|
|
791
|
|
792 /* bystride should never be used for 1-dimensional b.
|
|
793 The value is only used for calculation of the
|
|
794 memory by the buffer. */
|
|
795 bystride = 256;
|
|
796 ycount = 1;
|
|
797 }
|
|
798 else
|
|
799 {
|
|
800 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
|
|
801 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
|
|
802 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
|
|
803 }
|
|
804
|
|
805 abase = a->base_addr;
|
|
806 bbase = b->base_addr;
|
|
807 dest = retarray->base_addr;
|
|
808
|
|
809 /* Now that everything is set up, we perform the multiplication
|
|
810 itself. */
|
|
811
|
|
812 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
|
|
813 #define min(a,b) ((a) <= (b) ? (a) : (b))
|
|
814 #define max(a,b) ((a) >= (b) ? (a) : (b))
|
|
815
|
|
816 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
|
|
817 && (bxstride == 1 || bystride == 1)
|
|
818 && (((float) xcount) * ((float) ycount) * ((float) count)
|
|
819 > POW3(blas_limit)))
|
|
820 {
|
|
821 const int m = xcount, n = ycount, k = count, ldc = rystride;
|
|
822 const GFC_INTEGER_8 one = 1, zero = 0;
|
|
823 const int lda = (axstride == 1) ? aystride : axstride,
|
|
824 ldb = (bxstride == 1) ? bystride : bxstride;
|
|
825
|
|
826 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
|
|
827 {
|
|
828 assert (gemm != NULL);
|
131
|
829 const char *transa, *transb;
|
|
830 if (try_blas & 2)
|
|
831 transa = "C";
|
|
832 else
|
|
833 transa = axstride == 1 ? "N" : "T";
|
|
834
|
|
835 if (try_blas & 4)
|
|
836 transb = "C";
|
|
837 else
|
|
838 transb = bxstride == 1 ? "N" : "T";
|
|
839
|
|
840 gemm (transa, transb , &m,
|
111
|
841 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
|
|
842 &ldc, 1, 1);
|
|
843 return;
|
|
844 }
|
|
845 }
|
|
846
|
|
847 if (rxstride == 1 && axstride == 1 && bxstride == 1)
|
|
848 {
|
|
849 /* This block of code implements a tuned matmul, derived from
|
|
850 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
|
|
851
|
|
852 Bo Kagstrom and Per Ling
|
|
853 Department of Computing Science
|
|
854 Umea University
|
|
855 S-901 87 Umea, Sweden
|
|
856
|
|
857 from netlib.org, translated to C, and modified for matmul.m4. */
|
|
858
|
|
859 const GFC_INTEGER_8 *a, *b;
|
|
860 GFC_INTEGER_8 *c;
|
|
861 const index_type m = xcount, n = ycount, k = count;
|
|
862
|
|
863 /* System generated locals */
|
|
864 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
|
|
865 i1, i2, i3, i4, i5, i6;
|
|
866
|
|
867 /* Local variables */
|
|
868 GFC_INTEGER_8 f11, f12, f21, f22, f31, f32, f41, f42,
|
|
869 f13, f14, f23, f24, f33, f34, f43, f44;
|
|
870 index_type i, j, l, ii, jj, ll;
|
|
871 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
|
|
872 GFC_INTEGER_8 *t1;
|
|
873
|
|
874 a = abase;
|
|
875 b = bbase;
|
|
876 c = retarray->base_addr;
|
|
877
|
|
878 /* Parameter adjustments */
|
|
879 c_dim1 = rystride;
|
|
880 c_offset = 1 + c_dim1;
|
|
881 c -= c_offset;
|
|
882 a_dim1 = aystride;
|
|
883 a_offset = 1 + a_dim1;
|
|
884 a -= a_offset;
|
|
885 b_dim1 = bystride;
|
|
886 b_offset = 1 + b_dim1;
|
|
887 b -= b_offset;
|
|
888
|
|
889 /* Empty c first. */
|
|
890 for (j=1; j<=n; j++)
|
|
891 for (i=1; i<=m; i++)
|
|
892 c[i + j * c_dim1] = (GFC_INTEGER_8)0;
|
|
893
|
|
894 /* Early exit if possible */
|
|
895 if (m == 0 || n == 0 || k == 0)
|
|
896 return;
|
|
897
|
|
898 /* Adjust size of t1 to what is needed. */
|
131
|
899 index_type t1_dim, a_sz;
|
|
900 if (aystride == 1)
|
|
901 a_sz = rystride;
|
|
902 else
|
|
903 a_sz = a_dim1;
|
|
904
|
|
905 t1_dim = a_sz * 256 + b_dim1;
|
111
|
906 if (t1_dim > 65536)
|
|
907 t1_dim = 65536;
|
|
908
|
|
909 t1 = malloc (t1_dim * sizeof(GFC_INTEGER_8));
|
|
910
|
|
911 /* Start turning the crank. */
|
|
912 i1 = n;
|
|
913 for (jj = 1; jj <= i1; jj += 512)
|
|
914 {
|
|
915 /* Computing MIN */
|
|
916 i2 = 512;
|
|
917 i3 = n - jj + 1;
|
|
918 jsec = min(i2,i3);
|
|
919 ujsec = jsec - jsec % 4;
|
|
920 i2 = k;
|
|
921 for (ll = 1; ll <= i2; ll += 256)
|
|
922 {
|
|
923 /* Computing MIN */
|
|
924 i3 = 256;
|
|
925 i4 = k - ll + 1;
|
|
926 lsec = min(i3,i4);
|
|
927 ulsec = lsec - lsec % 2;
|
|
928
|
|
929 i3 = m;
|
|
930 for (ii = 1; ii <= i3; ii += 256)
|
|
931 {
|
|
932 /* Computing MIN */
|
|
933 i4 = 256;
|
|
934 i5 = m - ii + 1;
|
|
935 isec = min(i4,i5);
|
|
936 uisec = isec - isec % 2;
|
|
937 i4 = ll + ulsec - 1;
|
|
938 for (l = ll; l <= i4; l += 2)
|
|
939 {
|
|
940 i5 = ii + uisec - 1;
|
|
941 for (i = ii; i <= i5; i += 2)
|
|
942 {
|
|
943 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
|
|
944 a[i + l * a_dim1];
|
|
945 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
|
|
946 a[i + (l + 1) * a_dim1];
|
|
947 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
|
|
948 a[i + 1 + l * a_dim1];
|
|
949 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
|
|
950 a[i + 1 + (l + 1) * a_dim1];
|
|
951 }
|
|
952 if (uisec < isec)
|
|
953 {
|
|
954 t1[l - ll + 1 + (isec << 8) - 257] =
|
|
955 a[ii + isec - 1 + l * a_dim1];
|
|
956 t1[l - ll + 2 + (isec << 8) - 257] =
|
|
957 a[ii + isec - 1 + (l + 1) * a_dim1];
|
|
958 }
|
|
959 }
|
|
960 if (ulsec < lsec)
|
|
961 {
|
|
962 i4 = ii + isec - 1;
|
|
963 for (i = ii; i<= i4; ++i)
|
|
964 {
|
|
965 t1[lsec + ((i - ii + 1) << 8) - 257] =
|
|
966 a[i + (ll + lsec - 1) * a_dim1];
|
|
967 }
|
|
968 }
|
|
969
|
|
970 uisec = isec - isec % 4;
|
|
971 i4 = jj + ujsec - 1;
|
|
972 for (j = jj; j <= i4; j += 4)
|
|
973 {
|
|
974 i5 = ii + uisec - 1;
|
|
975 for (i = ii; i <= i5; i += 4)
|
|
976 {
|
|
977 f11 = c[i + j * c_dim1];
|
|
978 f21 = c[i + 1 + j * c_dim1];
|
|
979 f12 = c[i + (j + 1) * c_dim1];
|
|
980 f22 = c[i + 1 + (j + 1) * c_dim1];
|
|
981 f13 = c[i + (j + 2) * c_dim1];
|
|
982 f23 = c[i + 1 + (j + 2) * c_dim1];
|
|
983 f14 = c[i + (j + 3) * c_dim1];
|
|
984 f24 = c[i + 1 + (j + 3) * c_dim1];
|
|
985 f31 = c[i + 2 + j * c_dim1];
|
|
986 f41 = c[i + 3 + j * c_dim1];
|
|
987 f32 = c[i + 2 + (j + 1) * c_dim1];
|
|
988 f42 = c[i + 3 + (j + 1) * c_dim1];
|
|
989 f33 = c[i + 2 + (j + 2) * c_dim1];
|
|
990 f43 = c[i + 3 + (j + 2) * c_dim1];
|
|
991 f34 = c[i + 2 + (j + 3) * c_dim1];
|
|
992 f44 = c[i + 3 + (j + 3) * c_dim1];
|
|
993 i6 = ll + lsec - 1;
|
|
994 for (l = ll; l <= i6; ++l)
|
|
995 {
|
|
996 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
|
|
997 * b[l + j * b_dim1];
|
|
998 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
|
|
999 * b[l + j * b_dim1];
|
|
1000 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
|
|
1001 * b[l + (j + 1) * b_dim1];
|
|
1002 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
|
|
1003 * b[l + (j + 1) * b_dim1];
|
|
1004 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
|
|
1005 * b[l + (j + 2) * b_dim1];
|
|
1006 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
|
|
1007 * b[l + (j + 2) * b_dim1];
|
|
1008 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
|
|
1009 * b[l + (j + 3) * b_dim1];
|
|
1010 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
|
|
1011 * b[l + (j + 3) * b_dim1];
|
|
1012 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
|
|
1013 * b[l + j * b_dim1];
|
|
1014 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
|
|
1015 * b[l + j * b_dim1];
|
|
1016 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
|
|
1017 * b[l + (j + 1) * b_dim1];
|
|
1018 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
|
|
1019 * b[l + (j + 1) * b_dim1];
|
|
1020 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
|
|
1021 * b[l + (j + 2) * b_dim1];
|
|
1022 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
|
|
1023 * b[l + (j + 2) * b_dim1];
|
|
1024 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
|
|
1025 * b[l + (j + 3) * b_dim1];
|
|
1026 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
|
|
1027 * b[l + (j + 3) * b_dim1];
|
|
1028 }
|
|
1029 c[i + j * c_dim1] = f11;
|
|
1030 c[i + 1 + j * c_dim1] = f21;
|
|
1031 c[i + (j + 1) * c_dim1] = f12;
|
|
1032 c[i + 1 + (j + 1) * c_dim1] = f22;
|
|
1033 c[i + (j + 2) * c_dim1] = f13;
|
|
1034 c[i + 1 + (j + 2) * c_dim1] = f23;
|
|
1035 c[i + (j + 3) * c_dim1] = f14;
|
|
1036 c[i + 1 + (j + 3) * c_dim1] = f24;
|
|
1037 c[i + 2 + j * c_dim1] = f31;
|
|
1038 c[i + 3 + j * c_dim1] = f41;
|
|
1039 c[i + 2 + (j + 1) * c_dim1] = f32;
|
|
1040 c[i + 3 + (j + 1) * c_dim1] = f42;
|
|
1041 c[i + 2 + (j + 2) * c_dim1] = f33;
|
|
1042 c[i + 3 + (j + 2) * c_dim1] = f43;
|
|
1043 c[i + 2 + (j + 3) * c_dim1] = f34;
|
|
1044 c[i + 3 + (j + 3) * c_dim1] = f44;
|
|
1045 }
|
|
1046 if (uisec < isec)
|
|
1047 {
|
|
1048 i5 = ii + isec - 1;
|
|
1049 for (i = ii + uisec; i <= i5; ++i)
|
|
1050 {
|
|
1051 f11 = c[i + j * c_dim1];
|
|
1052 f12 = c[i + (j + 1) * c_dim1];
|
|
1053 f13 = c[i + (j + 2) * c_dim1];
|
|
1054 f14 = c[i + (j + 3) * c_dim1];
|
|
1055 i6 = ll + lsec - 1;
|
|
1056 for (l = ll; l <= i6; ++l)
|
|
1057 {
|
|
1058 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
|
|
1059 257] * b[l + j * b_dim1];
|
|
1060 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
|
|
1061 257] * b[l + (j + 1) * b_dim1];
|
|
1062 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
|
|
1063 257] * b[l + (j + 2) * b_dim1];
|
|
1064 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
|
|
1065 257] * b[l + (j + 3) * b_dim1];
|
|
1066 }
|
|
1067 c[i + j * c_dim1] = f11;
|
|
1068 c[i + (j + 1) * c_dim1] = f12;
|
|
1069 c[i + (j + 2) * c_dim1] = f13;
|
|
1070 c[i + (j + 3) * c_dim1] = f14;
|
|
1071 }
|
|
1072 }
|
|
1073 }
|
|
1074 if (ujsec < jsec)
|
|
1075 {
|
|
1076 i4 = jj + jsec - 1;
|
|
1077 for (j = jj + ujsec; j <= i4; ++j)
|
|
1078 {
|
|
1079 i5 = ii + uisec - 1;
|
|
1080 for (i = ii; i <= i5; i += 4)
|
|
1081 {
|
|
1082 f11 = c[i + j * c_dim1];
|
|
1083 f21 = c[i + 1 + j * c_dim1];
|
|
1084 f31 = c[i + 2 + j * c_dim1];
|
|
1085 f41 = c[i + 3 + j * c_dim1];
|
|
1086 i6 = ll + lsec - 1;
|
|
1087 for (l = ll; l <= i6; ++l)
|
|
1088 {
|
|
1089 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
|
|
1090 257] * b[l + j * b_dim1];
|
|
1091 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
|
|
1092 257] * b[l + j * b_dim1];
|
|
1093 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
|
|
1094 257] * b[l + j * b_dim1];
|
|
1095 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
|
|
1096 257] * b[l + j * b_dim1];
|
|
1097 }
|
|
1098 c[i + j * c_dim1] = f11;
|
|
1099 c[i + 1 + j * c_dim1] = f21;
|
|
1100 c[i + 2 + j * c_dim1] = f31;
|
|
1101 c[i + 3 + j * c_dim1] = f41;
|
|
1102 }
|
|
1103 i5 = ii + isec - 1;
|
|
1104 for (i = ii + uisec; i <= i5; ++i)
|
|
1105 {
|
|
1106 f11 = c[i + j * c_dim1];
|
|
1107 i6 = ll + lsec - 1;
|
|
1108 for (l = ll; l <= i6; ++l)
|
|
1109 {
|
|
1110 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
|
|
1111 257] * b[l + j * b_dim1];
|
|
1112 }
|
|
1113 c[i + j * c_dim1] = f11;
|
|
1114 }
|
|
1115 }
|
|
1116 }
|
|
1117 }
|
|
1118 }
|
|
1119 }
|
|
1120 free(t1);
|
|
1121 return;
|
|
1122 }
|
|
1123 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
|
|
1124 {
|
|
1125 if (GFC_DESCRIPTOR_RANK (a) != 1)
|
|
1126 {
|
|
1127 const GFC_INTEGER_8 *restrict abase_x;
|
|
1128 const GFC_INTEGER_8 *restrict bbase_y;
|
|
1129 GFC_INTEGER_8 *restrict dest_y;
|
|
1130 GFC_INTEGER_8 s;
|
|
1131
|
|
1132 for (y = 0; y < ycount; y++)
|
|
1133 {
|
|
1134 bbase_y = &bbase[y*bystride];
|
|
1135 dest_y = &dest[y*rystride];
|
|
1136 for (x = 0; x < xcount; x++)
|
|
1137 {
|
|
1138 abase_x = &abase[x*axstride];
|
|
1139 s = (GFC_INTEGER_8) 0;
|
|
1140 for (n = 0; n < count; n++)
|
|
1141 s += abase_x[n] * bbase_y[n];
|
|
1142 dest_y[x] = s;
|
|
1143 }
|
|
1144 }
|
|
1145 }
|
|
1146 else
|
|
1147 {
|
|
1148 const GFC_INTEGER_8 *restrict bbase_y;
|
|
1149 GFC_INTEGER_8 s;
|
|
1150
|
|
1151 for (y = 0; y < ycount; y++)
|
|
1152 {
|
|
1153 bbase_y = &bbase[y*bystride];
|
|
1154 s = (GFC_INTEGER_8) 0;
|
|
1155 for (n = 0; n < count; n++)
|
|
1156 s += abase[n*axstride] * bbase_y[n];
|
|
1157 dest[y*rystride] = s;
|
|
1158 }
|
|
1159 }
|
|
1160 }
|
|
1161 else if (axstride < aystride)
|
|
1162 {
|
|
1163 for (y = 0; y < ycount; y++)
|
|
1164 for (x = 0; x < xcount; x++)
|
|
1165 dest[x*rxstride + y*rystride] = (GFC_INTEGER_8)0;
|
|
1166
|
|
1167 for (y = 0; y < ycount; y++)
|
|
1168 for (n = 0; n < count; n++)
|
|
1169 for (x = 0; x < xcount; x++)
|
|
1170 /* dest[x,y] += a[x,n] * b[n,y] */
|
|
1171 dest[x*rxstride + y*rystride] +=
|
|
1172 abase[x*axstride + n*aystride] *
|
|
1173 bbase[n*bxstride + y*bystride];
|
|
1174 }
|
|
1175 else if (GFC_DESCRIPTOR_RANK (a) == 1)
|
|
1176 {
|
|
1177 const GFC_INTEGER_8 *restrict bbase_y;
|
|
1178 GFC_INTEGER_8 s;
|
|
1179
|
|
1180 for (y = 0; y < ycount; y++)
|
|
1181 {
|
|
1182 bbase_y = &bbase[y*bystride];
|
|
1183 s = (GFC_INTEGER_8) 0;
|
|
1184 for (n = 0; n < count; n++)
|
|
1185 s += abase[n*axstride] * bbase_y[n*bxstride];
|
|
1186 dest[y*rxstride] = s;
|
|
1187 }
|
|
1188 }
|
|
1189 else
|
|
1190 {
|
|
1191 const GFC_INTEGER_8 *restrict abase_x;
|
|
1192 const GFC_INTEGER_8 *restrict bbase_y;
|
|
1193 GFC_INTEGER_8 *restrict dest_y;
|
|
1194 GFC_INTEGER_8 s;
|
|
1195
|
|
1196 for (y = 0; y < ycount; y++)
|
|
1197 {
|
|
1198 bbase_y = &bbase[y*bystride];
|
|
1199 dest_y = &dest[y*rystride];
|
|
1200 for (x = 0; x < xcount; x++)
|
|
1201 {
|
|
1202 abase_x = &abase[x*axstride];
|
|
1203 s = (GFC_INTEGER_8) 0;
|
|
1204 for (n = 0; n < count; n++)
|
|
1205 s += abase_x[n*aystride] * bbase_y[n*bxstride];
|
|
1206 dest_y[x*rxstride] = s;
|
|
1207 }
|
|
1208 }
|
|
1209 }
|
|
1210 }
|
|
1211 #undef POW3
|
|
1212 #undef min
|
|
1213 #undef max
|
|
1214
|
|
1215 #endif /* HAVE_AVX2 */
|
|
1216
|
|
1217 #ifdef HAVE_AVX512F
|
|
1218 static void
|
|
1219 matmul_i8_avx512f (gfc_array_i8 * const restrict retarray,
|
|
1220 gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
|
|
1221 int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
|
|
1222 static void
|
|
1223 matmul_i8_avx512f (gfc_array_i8 * const restrict retarray,
|
|
1224 gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
|
|
1225 int blas_limit, blas_call gemm)
|
|
1226 {
|
|
1227 const GFC_INTEGER_8 * restrict abase;
|
|
1228 const GFC_INTEGER_8 * restrict bbase;
|
|
1229 GFC_INTEGER_8 * restrict dest;
|
|
1230
|
|
1231 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
|
|
1232 index_type x, y, n, count, xcount, ycount;
|
|
1233
|
|
1234 assert (GFC_DESCRIPTOR_RANK (a) == 2
|
|
1235 || GFC_DESCRIPTOR_RANK (b) == 2);
|
|
1236
|
|
1237 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
|
|
1238
|
|
1239 Either A or B (but not both) can be rank 1:
|
|
1240
|
|
1241 o One-dimensional argument A is implicitly treated as a row matrix
|
|
1242 dimensioned [1,count], so xcount=1.
|
|
1243
|
|
1244 o One-dimensional argument B is implicitly treated as a column matrix
|
|
1245 dimensioned [count, 1], so ycount=1.
|
|
1246 */
|
|
1247
|
|
1248 if (retarray->base_addr == NULL)
|
|
1249 {
|
|
1250 if (GFC_DESCRIPTOR_RANK (a) == 1)
|
|
1251 {
|
|
1252 GFC_DIMENSION_SET(retarray->dim[0], 0,
|
|
1253 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
|
|
1254 }
|
|
1255 else if (GFC_DESCRIPTOR_RANK (b) == 1)
|
|
1256 {
|
|
1257 GFC_DIMENSION_SET(retarray->dim[0], 0,
|
|
1258 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
|
|
1259 }
|
|
1260 else
|
|
1261 {
|
|
1262 GFC_DIMENSION_SET(retarray->dim[0], 0,
|
|
1263 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
|
|
1264
|
|
1265 GFC_DIMENSION_SET(retarray->dim[1], 0,
|
|
1266 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
|
|
1267 GFC_DESCRIPTOR_EXTENT(retarray,0));
|
|
1268 }
|
|
1269
|
|
1270 retarray->base_addr
|
|
1271 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_8));
|
|
1272 retarray->offset = 0;
|
|
1273 }
|
|
1274 else if (unlikely (compile_options.bounds_check))
|
|
1275 {
|
|
1276 index_type ret_extent, arg_extent;
|
|
1277
|
|
1278 if (GFC_DESCRIPTOR_RANK (a) == 1)
|
|
1279 {
|
|
1280 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
|
|
1281 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
|
|
1282 if (arg_extent != ret_extent)
|
131
|
1283 runtime_error ("Array bound mismatch for dimension 1 of "
|
|
1284 "array (%ld/%ld) ",
|
111
|
1285 (long int) ret_extent, (long int) arg_extent);
|
|
1286 }
|
|
1287 else if (GFC_DESCRIPTOR_RANK (b) == 1)
|
|
1288 {
|
|
1289 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
|
|
1290 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
|
|
1291 if (arg_extent != ret_extent)
|
131
|
1292 runtime_error ("Array bound mismatch for dimension 1 of "
|
|
1293 "array (%ld/%ld) ",
|
111
|
1294 (long int) ret_extent, (long int) arg_extent);
|
|
1295 }
|
|
1296 else
|
|
1297 {
|
|
1298 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
|
|
1299 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
|
|
1300 if (arg_extent != ret_extent)
|
131
|
1301 runtime_error ("Array bound mismatch for dimension 1 of "
|
|
1302 "array (%ld/%ld) ",
|
111
|
1303 (long int) ret_extent, (long int) arg_extent);
|
|
1304
|
|
1305 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
|
|
1306 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
|
|
1307 if (arg_extent != ret_extent)
|
131
|
1308 runtime_error ("Array bound mismatch for dimension 2 of "
|
|
1309 "array (%ld/%ld) ",
|
111
|
1310 (long int) ret_extent, (long int) arg_extent);
|
|
1311 }
|
|
1312 }
|
|
1313
|
|
1314
|
|
1315 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
|
|
1316 {
|
|
1317 /* One-dimensional result may be addressed in the code below
|
|
1318 either as a row or a column matrix. We want both cases to
|
|
1319 work. */
|
|
1320 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
|
|
1321 }
|
|
1322 else
|
|
1323 {
|
|
1324 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
|
|
1325 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
|
|
1326 }
|
|
1327
|
|
1328
|
|
1329 if (GFC_DESCRIPTOR_RANK (a) == 1)
|
|
1330 {
|
|
1331 /* Treat it as a a row matrix A[1,count]. */
|
|
1332 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
|
|
1333 aystride = 1;
|
|
1334
|
|
1335 xcount = 1;
|
|
1336 count = GFC_DESCRIPTOR_EXTENT(a,0);
|
|
1337 }
|
|
1338 else
|
|
1339 {
|
|
1340 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
|
|
1341 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
|
|
1342
|
|
1343 count = GFC_DESCRIPTOR_EXTENT(a,1);
|
|
1344 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
|
|
1345 }
|
|
1346
|
|
1347 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
|
|
1348 {
|
|
1349 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
|
131
|
1350 runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
|
|
1351 "in dimension 1: is %ld, should be %ld",
|
|
1352 (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
|
111
|
1353 }
|
|
1354
|
|
1355 if (GFC_DESCRIPTOR_RANK (b) == 1)
|
|
1356 {
|
|
1357 /* Treat it as a column matrix B[count,1] */
|
|
1358 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
|
|
1359
|
|
1360 /* bystride should never be used for 1-dimensional b.
|
|
1361 The value is only used for calculation of the
|
|
1362 memory by the buffer. */
|
|
1363 bystride = 256;
|
|
1364 ycount = 1;
|
|
1365 }
|
|
1366 else
|
|
1367 {
|
|
1368 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
|
|
1369 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
|
|
1370 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
|
|
1371 }
|
|
1372
|
|
1373 abase = a->base_addr;
|
|
1374 bbase = b->base_addr;
|
|
1375 dest = retarray->base_addr;
|
|
1376
|
|
1377 /* Now that everything is set up, we perform the multiplication
|
|
1378 itself. */
|
|
1379
|
|
1380 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
|
|
1381 #define min(a,b) ((a) <= (b) ? (a) : (b))
|
|
1382 #define max(a,b) ((a) >= (b) ? (a) : (b))
|
|
1383
|
|
1384 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
|
|
1385 && (bxstride == 1 || bystride == 1)
|
|
1386 && (((float) xcount) * ((float) ycount) * ((float) count)
|
|
1387 > POW3(blas_limit)))
|
|
1388 {
|
|
1389 const int m = xcount, n = ycount, k = count, ldc = rystride;
|
|
1390 const GFC_INTEGER_8 one = 1, zero = 0;
|
|
1391 const int lda = (axstride == 1) ? aystride : axstride,
|
|
1392 ldb = (bxstride == 1) ? bystride : bxstride;
|
|
1393
|
|
1394 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
|
|
1395 {
|
|
1396 assert (gemm != NULL);
|
131
|
1397 const char *transa, *transb;
|
|
1398 if (try_blas & 2)
|
|
1399 transa = "C";
|
|
1400 else
|
|
1401 transa = axstride == 1 ? "N" : "T";
|
|
1402
|
|
1403 if (try_blas & 4)
|
|
1404 transb = "C";
|
|
1405 else
|
|
1406 transb = bxstride == 1 ? "N" : "T";
|
|
1407
|
|
1408 gemm (transa, transb , &m,
|
111
|
1409 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
|
|
1410 &ldc, 1, 1);
|
|
1411 return;
|
|
1412 }
|
|
1413 }
|
|
1414
|
|
1415 if (rxstride == 1 && axstride == 1 && bxstride == 1)
|
|
1416 {
|
|
1417 /* This block of code implements a tuned matmul, derived from
|
|
1418 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
|
|
1419
|
|
1420 Bo Kagstrom and Per Ling
|
|
1421 Department of Computing Science
|
|
1422 Umea University
|
|
1423 S-901 87 Umea, Sweden
|
|
1424
|
|
1425 from netlib.org, translated to C, and modified for matmul.m4. */
|
|
1426
|
|
1427 const GFC_INTEGER_8 *a, *b;
|
|
1428 GFC_INTEGER_8 *c;
|
|
1429 const index_type m = xcount, n = ycount, k = count;
|
|
1430
|
|
1431 /* System generated locals */
|
|
1432 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
|
|
1433 i1, i2, i3, i4, i5, i6;
|
|
1434
|
|
1435 /* Local variables */
|
|
1436 GFC_INTEGER_8 f11, f12, f21, f22, f31, f32, f41, f42,
|
|
1437 f13, f14, f23, f24, f33, f34, f43, f44;
|
|
1438 index_type i, j, l, ii, jj, ll;
|
|
1439 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
|
|
1440 GFC_INTEGER_8 *t1;
|
|
1441
|
|
1442 a = abase;
|
|
1443 b = bbase;
|
|
1444 c = retarray->base_addr;
|
|
1445
|
|
1446 /* Parameter adjustments */
|
|
1447 c_dim1 = rystride;
|
|
1448 c_offset = 1 + c_dim1;
|
|
1449 c -= c_offset;
|
|
1450 a_dim1 = aystride;
|
|
1451 a_offset = 1 + a_dim1;
|
|
1452 a -= a_offset;
|
|
1453 b_dim1 = bystride;
|
|
1454 b_offset = 1 + b_dim1;
|
|
1455 b -= b_offset;
|
|
1456
|
|
1457 /* Empty c first. */
|
|
1458 for (j=1; j<=n; j++)
|
|
1459 for (i=1; i<=m; i++)
|
|
1460 c[i + j * c_dim1] = (GFC_INTEGER_8)0;
|
|
1461
|
|
1462 /* Early exit if possible */
|
|
1463 if (m == 0 || n == 0 || k == 0)
|
|
1464 return;
|
|
1465
|
|
1466 /* Adjust size of t1 to what is needed. */
|
131
|
1467 index_type t1_dim, a_sz;
|
|
1468 if (aystride == 1)
|
|
1469 a_sz = rystride;
|
|
1470 else
|
|
1471 a_sz = a_dim1;
|
|
1472
|
|
1473 t1_dim = a_sz * 256 + b_dim1;
|
111
|
1474 if (t1_dim > 65536)
|
|
1475 t1_dim = 65536;
|
|
1476
|
|
1477 t1 = malloc (t1_dim * sizeof(GFC_INTEGER_8));
|
|
1478
|
|
1479 /* Start turning the crank. */
|
|
1480 i1 = n;
|
|
1481 for (jj = 1; jj <= i1; jj += 512)
|
|
1482 {
|
|
1483 /* Computing MIN */
|
|
1484 i2 = 512;
|
|
1485 i3 = n - jj + 1;
|
|
1486 jsec = min(i2,i3);
|
|
1487 ujsec = jsec - jsec % 4;
|
|
1488 i2 = k;
|
|
1489 for (ll = 1; ll <= i2; ll += 256)
|
|
1490 {
|
|
1491 /* Computing MIN */
|
|
1492 i3 = 256;
|
|
1493 i4 = k - ll + 1;
|
|
1494 lsec = min(i3,i4);
|
|
1495 ulsec = lsec - lsec % 2;
|
|
1496
|
|
1497 i3 = m;
|
|
1498 for (ii = 1; ii <= i3; ii += 256)
|
|
1499 {
|
|
1500 /* Computing MIN */
|
|
1501 i4 = 256;
|
|
1502 i5 = m - ii + 1;
|
|
1503 isec = min(i4,i5);
|
|
1504 uisec = isec - isec % 2;
|
|
1505 i4 = ll + ulsec - 1;
|
|
1506 for (l = ll; l <= i4; l += 2)
|
|
1507 {
|
|
1508 i5 = ii + uisec - 1;
|
|
1509 for (i = ii; i <= i5; i += 2)
|
|
1510 {
|
|
1511 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
|
|
1512 a[i + l * a_dim1];
|
|
1513 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
|
|
1514 a[i + (l + 1) * a_dim1];
|
|
1515 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
|
|
1516 a[i + 1 + l * a_dim1];
|
|
1517 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
|
|
1518 a[i + 1 + (l + 1) * a_dim1];
|
|
1519 }
|
|
1520 if (uisec < isec)
|
|
1521 {
|
|
1522 t1[l - ll + 1 + (isec << 8) - 257] =
|
|
1523 a[ii + isec - 1 + l * a_dim1];
|
|
1524 t1[l - ll + 2 + (isec << 8) - 257] =
|
|
1525 a[ii + isec - 1 + (l + 1) * a_dim1];
|
|
1526 }
|
|
1527 }
|
|
1528 if (ulsec < lsec)
|
|
1529 {
|
|
1530 i4 = ii + isec - 1;
|
|
1531 for (i = ii; i<= i4; ++i)
|
|
1532 {
|
|
1533 t1[lsec + ((i - ii + 1) << 8) - 257] =
|
|
1534 a[i + (ll + lsec - 1) * a_dim1];
|
|
1535 }
|
|
1536 }
|
|
1537
|
|
1538 uisec = isec - isec % 4;
|
|
1539 i4 = jj + ujsec - 1;
|
|
1540 for (j = jj; j <= i4; j += 4)
|
|
1541 {
|
|
1542 i5 = ii + uisec - 1;
|
|
1543 for (i = ii; i <= i5; i += 4)
|
|
1544 {
|
|
1545 f11 = c[i + j * c_dim1];
|
|
1546 f21 = c[i + 1 + j * c_dim1];
|
|
1547 f12 = c[i + (j + 1) * c_dim1];
|
|
1548 f22 = c[i + 1 + (j + 1) * c_dim1];
|
|
1549 f13 = c[i + (j + 2) * c_dim1];
|
|
1550 f23 = c[i + 1 + (j + 2) * c_dim1];
|
|
1551 f14 = c[i + (j + 3) * c_dim1];
|
|
1552 f24 = c[i + 1 + (j + 3) * c_dim1];
|
|
1553 f31 = c[i + 2 + j * c_dim1];
|
|
1554 f41 = c[i + 3 + j * c_dim1];
|
|
1555 f32 = c[i + 2 + (j + 1) * c_dim1];
|
|
1556 f42 = c[i + 3 + (j + 1) * c_dim1];
|
|
1557 f33 = c[i + 2 + (j + 2) * c_dim1];
|
|
1558 f43 = c[i + 3 + (j + 2) * c_dim1];
|
|
1559 f34 = c[i + 2 + (j + 3) * c_dim1];
|
|
1560 f44 = c[i + 3 + (j + 3) * c_dim1];
|
|
1561 i6 = ll + lsec - 1;
|
|
1562 for (l = ll; l <= i6; ++l)
|
|
1563 {
|
|
1564 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
|
|
1565 * b[l + j * b_dim1];
|
|
1566 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
|
|
1567 * b[l + j * b_dim1];
|
|
1568 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
|
|
1569 * b[l + (j + 1) * b_dim1];
|
|
1570 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
|
|
1571 * b[l + (j + 1) * b_dim1];
|
|
1572 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
|
|
1573 * b[l + (j + 2) * b_dim1];
|
|
1574 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
|
|
1575 * b[l + (j + 2) * b_dim1];
|
|
1576 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
|
|
1577 * b[l + (j + 3) * b_dim1];
|
|
1578 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
|
|
1579 * b[l + (j + 3) * b_dim1];
|
|
1580 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
|
|
1581 * b[l + j * b_dim1];
|
|
1582 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
|
|
1583 * b[l + j * b_dim1];
|
|
1584 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
|
|
1585 * b[l + (j + 1) * b_dim1];
|
|
1586 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
|
|
1587 * b[l + (j + 1) * b_dim1];
|
|
1588 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
|
|
1589 * b[l + (j + 2) * b_dim1];
|
|
1590 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
|
|
1591 * b[l + (j + 2) * b_dim1];
|
|
1592 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
|
|
1593 * b[l + (j + 3) * b_dim1];
|
|
1594 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
|
|
1595 * b[l + (j + 3) * b_dim1];
|
|
1596 }
|
|
1597 c[i + j * c_dim1] = f11;
|
|
1598 c[i + 1 + j * c_dim1] = f21;
|
|
1599 c[i + (j + 1) * c_dim1] = f12;
|
|
1600 c[i + 1 + (j + 1) * c_dim1] = f22;
|
|
1601 c[i + (j + 2) * c_dim1] = f13;
|
|
1602 c[i + 1 + (j + 2) * c_dim1] = f23;
|
|
1603 c[i + (j + 3) * c_dim1] = f14;
|
|
1604 c[i + 1 + (j + 3) * c_dim1] = f24;
|
|
1605 c[i + 2 + j * c_dim1] = f31;
|
|
1606 c[i + 3 + j * c_dim1] = f41;
|
|
1607 c[i + 2 + (j + 1) * c_dim1] = f32;
|
|
1608 c[i + 3 + (j + 1) * c_dim1] = f42;
|
|
1609 c[i + 2 + (j + 2) * c_dim1] = f33;
|
|
1610 c[i + 3 + (j + 2) * c_dim1] = f43;
|
|
1611 c[i + 2 + (j + 3) * c_dim1] = f34;
|
|
1612 c[i + 3 + (j + 3) * c_dim1] = f44;
|
|
1613 }
|
|
1614 if (uisec < isec)
|
|
1615 {
|
|
1616 i5 = ii + isec - 1;
|
|
1617 for (i = ii + uisec; i <= i5; ++i)
|
|
1618 {
|
|
1619 f11 = c[i + j * c_dim1];
|
|
1620 f12 = c[i + (j + 1) * c_dim1];
|
|
1621 f13 = c[i + (j + 2) * c_dim1];
|
|
1622 f14 = c[i + (j + 3) * c_dim1];
|
|
1623 i6 = ll + lsec - 1;
|
|
1624 for (l = ll; l <= i6; ++l)
|
|
1625 {
|
|
1626 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
|
|
1627 257] * b[l + j * b_dim1];
|
|
1628 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
|
|
1629 257] * b[l + (j + 1) * b_dim1];
|
|
1630 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
|
|
1631 257] * b[l + (j + 2) * b_dim1];
|
|
1632 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
|
|
1633 257] * b[l + (j + 3) * b_dim1];
|
|
1634 }
|
|
1635 c[i + j * c_dim1] = f11;
|
|
1636 c[i + (j + 1) * c_dim1] = f12;
|
|
1637 c[i + (j + 2) * c_dim1] = f13;
|
|
1638 c[i + (j + 3) * c_dim1] = f14;
|
|
1639 }
|
|
1640 }
|
|
1641 }
|
|
1642 if (ujsec < jsec)
|
|
1643 {
|
|
1644 i4 = jj + jsec - 1;
|
|
1645 for (j = jj + ujsec; j <= i4; ++j)
|
|
1646 {
|
|
1647 i5 = ii + uisec - 1;
|
|
1648 for (i = ii; i <= i5; i += 4)
|
|
1649 {
|
|
1650 f11 = c[i + j * c_dim1];
|
|
1651 f21 = c[i + 1 + j * c_dim1];
|
|
1652 f31 = c[i + 2 + j * c_dim1];
|
|
1653 f41 = c[i + 3 + j * c_dim1];
|
|
1654 i6 = ll + lsec - 1;
|
|
1655 for (l = ll; l <= i6; ++l)
|
|
1656 {
|
|
1657 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
|
|
1658 257] * b[l + j * b_dim1];
|
|
1659 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
|
|
1660 257] * b[l + j * b_dim1];
|
|
1661 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
|
|
1662 257] * b[l + j * b_dim1];
|
|
1663 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
|
|
1664 257] * b[l + j * b_dim1];
|
|
1665 }
|
|
1666 c[i + j * c_dim1] = f11;
|
|
1667 c[i + 1 + j * c_dim1] = f21;
|
|
1668 c[i + 2 + j * c_dim1] = f31;
|
|
1669 c[i + 3 + j * c_dim1] = f41;
|
|
1670 }
|
|
1671 i5 = ii + isec - 1;
|
|
1672 for (i = ii + uisec; i <= i5; ++i)
|
|
1673 {
|
|
1674 f11 = c[i + j * c_dim1];
|
|
1675 i6 = ll + lsec - 1;
|
|
1676 for (l = ll; l <= i6; ++l)
|
|
1677 {
|
|
1678 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
|
|
1679 257] * b[l + j * b_dim1];
|
|
1680 }
|
|
1681 c[i + j * c_dim1] = f11;
|
|
1682 }
|
|
1683 }
|
|
1684 }
|
|
1685 }
|
|
1686 }
|
|
1687 }
|
|
1688 free(t1);
|
|
1689 return;
|
|
1690 }
|
|
1691 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
|
|
1692 {
|
|
1693 if (GFC_DESCRIPTOR_RANK (a) != 1)
|
|
1694 {
|
|
1695 const GFC_INTEGER_8 *restrict abase_x;
|
|
1696 const GFC_INTEGER_8 *restrict bbase_y;
|
|
1697 GFC_INTEGER_8 *restrict dest_y;
|
|
1698 GFC_INTEGER_8 s;
|
|
1699
|
|
1700 for (y = 0; y < ycount; y++)
|
|
1701 {
|
|
1702 bbase_y = &bbase[y*bystride];
|
|
1703 dest_y = &dest[y*rystride];
|
|
1704 for (x = 0; x < xcount; x++)
|
|
1705 {
|
|
1706 abase_x = &abase[x*axstride];
|
|
1707 s = (GFC_INTEGER_8) 0;
|
|
1708 for (n = 0; n < count; n++)
|
|
1709 s += abase_x[n] * bbase_y[n];
|
|
1710 dest_y[x] = s;
|
|
1711 }
|
|
1712 }
|
|
1713 }
|
|
1714 else
|
|
1715 {
|
|
1716 const GFC_INTEGER_8 *restrict bbase_y;
|
|
1717 GFC_INTEGER_8 s;
|
|
1718
|
|
1719 for (y = 0; y < ycount; y++)
|
|
1720 {
|
|
1721 bbase_y = &bbase[y*bystride];
|
|
1722 s = (GFC_INTEGER_8) 0;
|
|
1723 for (n = 0; n < count; n++)
|
|
1724 s += abase[n*axstride] * bbase_y[n];
|
|
1725 dest[y*rystride] = s;
|
|
1726 }
|
|
1727 }
|
|
1728 }
|
|
1729 else if (axstride < aystride)
|
|
1730 {
|
|
1731 for (y = 0; y < ycount; y++)
|
|
1732 for (x = 0; x < xcount; x++)
|
|
1733 dest[x*rxstride + y*rystride] = (GFC_INTEGER_8)0;
|
|
1734
|
|
1735 for (y = 0; y < ycount; y++)
|
|
1736 for (n = 0; n < count; n++)
|
|
1737 for (x = 0; x < xcount; x++)
|
|
1738 /* dest[x,y] += a[x,n] * b[n,y] */
|
|
1739 dest[x*rxstride + y*rystride] +=
|
|
1740 abase[x*axstride + n*aystride] *
|
|
1741 bbase[n*bxstride + y*bystride];
|
|
1742 }
|
|
1743 else if (GFC_DESCRIPTOR_RANK (a) == 1)
|
|
1744 {
|
|
1745 const GFC_INTEGER_8 *restrict bbase_y;
|
|
1746 GFC_INTEGER_8 s;
|
|
1747
|
|
1748 for (y = 0; y < ycount; y++)
|
|
1749 {
|
|
1750 bbase_y = &bbase[y*bystride];
|
|
1751 s = (GFC_INTEGER_8) 0;
|
|
1752 for (n = 0; n < count; n++)
|
|
1753 s += abase[n*axstride] * bbase_y[n*bxstride];
|
|
1754 dest[y*rxstride] = s;
|
|
1755 }
|
|
1756 }
|
|
1757 else
|
|
1758 {
|
|
1759 const GFC_INTEGER_8 *restrict abase_x;
|
|
1760 const GFC_INTEGER_8 *restrict bbase_y;
|
|
1761 GFC_INTEGER_8 *restrict dest_y;
|
|
1762 GFC_INTEGER_8 s;
|
|
1763
|
|
1764 for (y = 0; y < ycount; y++)
|
|
1765 {
|
|
1766 bbase_y = &bbase[y*bystride];
|
|
1767 dest_y = &dest[y*rystride];
|
|
1768 for (x = 0; x < xcount; x++)
|
|
1769 {
|
|
1770 abase_x = &abase[x*axstride];
|
|
1771 s = (GFC_INTEGER_8) 0;
|
|
1772 for (n = 0; n < count; n++)
|
|
1773 s += abase_x[n*aystride] * bbase_y[n*bxstride];
|
|
1774 dest_y[x*rxstride] = s;
|
|
1775 }
|
|
1776 }
|
|
1777 }
|
|
1778 }
|
|
1779 #undef POW3
|
|
1780 #undef min
|
|
1781 #undef max
|
|
1782
|
|
1783 #endif /* HAVE_AVX512F */
|
|
1784
|
|
1785 /* AMD-specifix funtions with AVX128 and FMA3/FMA4. */
|
|
1786
|
|
1787 #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
|
|
1788 void
|
|
1789 matmul_i8_avx128_fma3 (gfc_array_i8 * const restrict retarray,
|
|
1790 gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
|
|
1791 int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
|
|
1792 internal_proto(matmul_i8_avx128_fma3);
|
|
1793 #endif
|
|
1794
|
|
1795 #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
|
|
1796 void
|
|
1797 matmul_i8_avx128_fma4 (gfc_array_i8 * const restrict retarray,
|
|
1798 gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
|
|
1799 int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
|
|
1800 internal_proto(matmul_i8_avx128_fma4);
|
|
1801 #endif
|
|
1802
|
|
1803 /* Function to fall back to if there is no special processor-specific version. */
|
|
1804 static void
|
|
1805 matmul_i8_vanilla (gfc_array_i8 * const restrict retarray,
|
|
1806 gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
|
|
1807 int blas_limit, blas_call gemm)
|
|
1808 {
|
|
1809 const GFC_INTEGER_8 * restrict abase;
|
|
1810 const GFC_INTEGER_8 * restrict bbase;
|
|
1811 GFC_INTEGER_8 * restrict dest;
|
|
1812
|
|
1813 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
|
|
1814 index_type x, y, n, count, xcount, ycount;
|
|
1815
|
|
1816 assert (GFC_DESCRIPTOR_RANK (a) == 2
|
|
1817 || GFC_DESCRIPTOR_RANK (b) == 2);
|
|
1818
|
|
1819 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
|
|
1820
|
|
1821 Either A or B (but not both) can be rank 1:
|
|
1822
|
|
1823 o One-dimensional argument A is implicitly treated as a row matrix
|
|
1824 dimensioned [1,count], so xcount=1.
|
|
1825
|
|
1826 o One-dimensional argument B is implicitly treated as a column matrix
|
|
1827 dimensioned [count, 1], so ycount=1.
|
|
1828 */
|
|
1829
|
|
1830 if (retarray->base_addr == NULL)
|
|
1831 {
|
|
1832 if (GFC_DESCRIPTOR_RANK (a) == 1)
|
|
1833 {
|
|
1834 GFC_DIMENSION_SET(retarray->dim[0], 0,
|
|
1835 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
|
|
1836 }
|
|
1837 else if (GFC_DESCRIPTOR_RANK (b) == 1)
|
|
1838 {
|
|
1839 GFC_DIMENSION_SET(retarray->dim[0], 0,
|
|
1840 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
|
|
1841 }
|
|
1842 else
|
|
1843 {
|
|
1844 GFC_DIMENSION_SET(retarray->dim[0], 0,
|
|
1845 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
|
|
1846
|
|
1847 GFC_DIMENSION_SET(retarray->dim[1], 0,
|
|
1848 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
|
|
1849 GFC_DESCRIPTOR_EXTENT(retarray,0));
|
|
1850 }
|
|
1851
|
|
1852 retarray->base_addr
|
|
1853 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_8));
|
|
1854 retarray->offset = 0;
|
|
1855 }
|
|
1856 else if (unlikely (compile_options.bounds_check))
|
|
1857 {
|
|
1858 index_type ret_extent, arg_extent;
|
|
1859
|
|
1860 if (GFC_DESCRIPTOR_RANK (a) == 1)
|
|
1861 {
|
|
1862 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
|
|
1863 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
|
|
1864 if (arg_extent != ret_extent)
|
131
|
1865 runtime_error ("Array bound mismatch for dimension 1 of "
|
|
1866 "array (%ld/%ld) ",
|
111
|
1867 (long int) ret_extent, (long int) arg_extent);
|
|
1868 }
|
|
1869 else if (GFC_DESCRIPTOR_RANK (b) == 1)
|
|
1870 {
|
|
1871 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
|
|
1872 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
|
|
1873 if (arg_extent != ret_extent)
|
131
|
1874 runtime_error ("Array bound mismatch for dimension 1 of "
|
|
1875 "array (%ld/%ld) ",
|
111
|
1876 (long int) ret_extent, (long int) arg_extent);
|
|
1877 }
|
|
1878 else
|
|
1879 {
|
|
1880 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
|
|
1881 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
|
|
1882 if (arg_extent != ret_extent)
|
131
|
1883 runtime_error ("Array bound mismatch for dimension 1 of "
|
|
1884 "array (%ld/%ld) ",
|
111
|
1885 (long int) ret_extent, (long int) arg_extent);
|
|
1886
|
|
1887 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
|
|
1888 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
|
|
1889 if (arg_extent != ret_extent)
|
131
|
1890 runtime_error ("Array bound mismatch for dimension 2 of "
|
|
1891 "array (%ld/%ld) ",
|
111
|
1892 (long int) ret_extent, (long int) arg_extent);
|
|
1893 }
|
|
1894 }
|
|
1895
|
|
1896
|
|
1897 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
|
|
1898 {
|
|
1899 /* One-dimensional result may be addressed in the code below
|
|
1900 either as a row or a column matrix. We want both cases to
|
|
1901 work. */
|
|
1902 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
|
|
1903 }
|
|
1904 else
|
|
1905 {
|
|
1906 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
|
|
1907 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
|
|
1908 }
|
|
1909
|
|
1910
|
|
1911 if (GFC_DESCRIPTOR_RANK (a) == 1)
|
|
1912 {
|
|
1913 /* Treat it as a a row matrix A[1,count]. */
|
|
1914 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
|
|
1915 aystride = 1;
|
|
1916
|
|
1917 xcount = 1;
|
|
1918 count = GFC_DESCRIPTOR_EXTENT(a,0);
|
|
1919 }
|
|
1920 else
|
|
1921 {
|
|
1922 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
|
|
1923 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
|
|
1924
|
|
1925 count = GFC_DESCRIPTOR_EXTENT(a,1);
|
|
1926 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
|
|
1927 }
|
|
1928
|
|
1929 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
|
|
1930 {
|
|
1931 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
|
131
|
1932 runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
|
|
1933 "in dimension 1: is %ld, should be %ld",
|
|
1934 (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
|
111
|
1935 }
|
|
1936
|
|
1937 if (GFC_DESCRIPTOR_RANK (b) == 1)
|
|
1938 {
|
|
1939 /* Treat it as a column matrix B[count,1] */
|
|
1940 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
|
|
1941
|
|
1942 /* bystride should never be used for 1-dimensional b.
|
|
1943 The value is only used for calculation of the
|
|
1944 memory by the buffer. */
|
|
1945 bystride = 256;
|
|
1946 ycount = 1;
|
|
1947 }
|
|
1948 else
|
|
1949 {
|
|
1950 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
|
|
1951 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
|
|
1952 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
|
|
1953 }
|
|
1954
|
|
1955 abase = a->base_addr;
|
|
1956 bbase = b->base_addr;
|
|
1957 dest = retarray->base_addr;
|
|
1958
|
|
1959 /* Now that everything is set up, we perform the multiplication
|
|
1960 itself. */
|
|
1961
|
|
1962 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
|
|
1963 #define min(a,b) ((a) <= (b) ? (a) : (b))
|
|
1964 #define max(a,b) ((a) >= (b) ? (a) : (b))
|
|
1965
|
|
1966 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
|
|
1967 && (bxstride == 1 || bystride == 1)
|
|
1968 && (((float) xcount) * ((float) ycount) * ((float) count)
|
|
1969 > POW3(blas_limit)))
|
|
1970 {
|
|
1971 const int m = xcount, n = ycount, k = count, ldc = rystride;
|
|
1972 const GFC_INTEGER_8 one = 1, zero = 0;
|
|
1973 const int lda = (axstride == 1) ? aystride : axstride,
|
|
1974 ldb = (bxstride == 1) ? bystride : bxstride;
|
|
1975
|
|
1976 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
|
|
1977 {
|
|
1978 assert (gemm != NULL);
|
131
|
1979 const char *transa, *transb;
|
|
1980 if (try_blas & 2)
|
|
1981 transa = "C";
|
|
1982 else
|
|
1983 transa = axstride == 1 ? "N" : "T";
|
|
1984
|
|
1985 if (try_blas & 4)
|
|
1986 transb = "C";
|
|
1987 else
|
|
1988 transb = bxstride == 1 ? "N" : "T";
|
|
1989
|
|
1990 gemm (transa, transb , &m,
|
111
|
1991 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
|
|
1992 &ldc, 1, 1);
|
|
1993 return;
|
|
1994 }
|
|
1995 }
|
|
1996
|
|
1997 if (rxstride == 1 && axstride == 1 && bxstride == 1)
|
|
1998 {
|
|
1999 /* This block of code implements a tuned matmul, derived from
|
|
2000 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
|
|
2001
|
|
2002 Bo Kagstrom and Per Ling
|
|
2003 Department of Computing Science
|
|
2004 Umea University
|
|
2005 S-901 87 Umea, Sweden
|
|
2006
|
|
2007 from netlib.org, translated to C, and modified for matmul.m4. */
|
|
2008
|
|
2009 const GFC_INTEGER_8 *a, *b;
|
|
2010 GFC_INTEGER_8 *c;
|
|
2011 const index_type m = xcount, n = ycount, k = count;
|
|
2012
|
|
2013 /* System generated locals */
|
|
2014 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
|
|
2015 i1, i2, i3, i4, i5, i6;
|
|
2016
|
|
2017 /* Local variables */
|
|
2018 GFC_INTEGER_8 f11, f12, f21, f22, f31, f32, f41, f42,
|
|
2019 f13, f14, f23, f24, f33, f34, f43, f44;
|
|
2020 index_type i, j, l, ii, jj, ll;
|
|
2021 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
|
|
2022 GFC_INTEGER_8 *t1;
|
|
2023
|
|
2024 a = abase;
|
|
2025 b = bbase;
|
|
2026 c = retarray->base_addr;
|
|
2027
|
|
2028 /* Parameter adjustments */
|
|
2029 c_dim1 = rystride;
|
|
2030 c_offset = 1 + c_dim1;
|
|
2031 c -= c_offset;
|
|
2032 a_dim1 = aystride;
|
|
2033 a_offset = 1 + a_dim1;
|
|
2034 a -= a_offset;
|
|
2035 b_dim1 = bystride;
|
|
2036 b_offset = 1 + b_dim1;
|
|
2037 b -= b_offset;
|
|
2038
|
|
2039 /* Empty c first. */
|
|
2040 for (j=1; j<=n; j++)
|
|
2041 for (i=1; i<=m; i++)
|
|
2042 c[i + j * c_dim1] = (GFC_INTEGER_8)0;
|
|
2043
|
|
2044 /* Early exit if possible */
|
|
2045 if (m == 0 || n == 0 || k == 0)
|
|
2046 return;
|
|
2047
|
|
2048 /* Adjust size of t1 to what is needed. */
|
131
|
2049 index_type t1_dim, a_sz;
|
|
2050 if (aystride == 1)
|
|
2051 a_sz = rystride;
|
|
2052 else
|
|
2053 a_sz = a_dim1;
|
|
2054
|
|
2055 t1_dim = a_sz * 256 + b_dim1;
|
111
|
2056 if (t1_dim > 65536)
|
|
2057 t1_dim = 65536;
|
|
2058
|
|
2059 t1 = malloc (t1_dim * sizeof(GFC_INTEGER_8));
|
|
2060
|
|
2061 /* Start turning the crank. */
|
|
2062 i1 = n;
|
|
2063 for (jj = 1; jj <= i1; jj += 512)
|
|
2064 {
|
|
2065 /* Computing MIN */
|
|
2066 i2 = 512;
|
|
2067 i3 = n - jj + 1;
|
|
2068 jsec = min(i2,i3);
|
|
2069 ujsec = jsec - jsec % 4;
|
|
2070 i2 = k;
|
|
2071 for (ll = 1; ll <= i2; ll += 256)
|
|
2072 {
|
|
2073 /* Computing MIN */
|
|
2074 i3 = 256;
|
|
2075 i4 = k - ll + 1;
|
|
2076 lsec = min(i3,i4);
|
|
2077 ulsec = lsec - lsec % 2;
|
|
2078
|
|
2079 i3 = m;
|
|
2080 for (ii = 1; ii <= i3; ii += 256)
|
|
2081 {
|
|
2082 /* Computing MIN */
|
|
2083 i4 = 256;
|
|
2084 i5 = m - ii + 1;
|
|
2085 isec = min(i4,i5);
|
|
2086 uisec = isec - isec % 2;
|
|
2087 i4 = ll + ulsec - 1;
|
|
2088 for (l = ll; l <= i4; l += 2)
|
|
2089 {
|
|
2090 i5 = ii + uisec - 1;
|
|
2091 for (i = ii; i <= i5; i += 2)
|
|
2092 {
|
|
2093 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
|
|
2094 a[i + l * a_dim1];
|
|
2095 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
|
|
2096 a[i + (l + 1) * a_dim1];
|
|
2097 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
|
|
2098 a[i + 1 + l * a_dim1];
|
|
2099 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
|
|
2100 a[i + 1 + (l + 1) * a_dim1];
|
|
2101 }
|
|
2102 if (uisec < isec)
|
|
2103 {
|
|
2104 t1[l - ll + 1 + (isec << 8) - 257] =
|
|
2105 a[ii + isec - 1 + l * a_dim1];
|
|
2106 t1[l - ll + 2 + (isec << 8) - 257] =
|
|
2107 a[ii + isec - 1 + (l + 1) * a_dim1];
|
|
2108 }
|
|
2109 }
|
|
2110 if (ulsec < lsec)
|
|
2111 {
|
|
2112 i4 = ii + isec - 1;
|
|
2113 for (i = ii; i<= i4; ++i)
|
|
2114 {
|
|
2115 t1[lsec + ((i - ii + 1) << 8) - 257] =
|
|
2116 a[i + (ll + lsec - 1) * a_dim1];
|
|
2117 }
|
|
2118 }
|
|
2119
|
|
2120 uisec = isec - isec % 4;
|
|
2121 i4 = jj + ujsec - 1;
|
|
2122 for (j = jj; j <= i4; j += 4)
|
|
2123 {
|
|
2124 i5 = ii + uisec - 1;
|
|
2125 for (i = ii; i <= i5; i += 4)
|
|
2126 {
|
|
2127 f11 = c[i + j * c_dim1];
|
|
2128 f21 = c[i + 1 + j * c_dim1];
|
|
2129 f12 = c[i + (j + 1) * c_dim1];
|
|
2130 f22 = c[i + 1 + (j + 1) * c_dim1];
|
|
2131 f13 = c[i + (j + 2) * c_dim1];
|
|
2132 f23 = c[i + 1 + (j + 2) * c_dim1];
|
|
2133 f14 = c[i + (j + 3) * c_dim1];
|
|
2134 f24 = c[i + 1 + (j + 3) * c_dim1];
|
|
2135 f31 = c[i + 2 + j * c_dim1];
|
|
2136 f41 = c[i + 3 + j * c_dim1];
|
|
2137 f32 = c[i + 2 + (j + 1) * c_dim1];
|
|
2138 f42 = c[i + 3 + (j + 1) * c_dim1];
|
|
2139 f33 = c[i + 2 + (j + 2) * c_dim1];
|
|
2140 f43 = c[i + 3 + (j + 2) * c_dim1];
|
|
2141 f34 = c[i + 2 + (j + 3) * c_dim1];
|
|
2142 f44 = c[i + 3 + (j + 3) * c_dim1];
|
|
2143 i6 = ll + lsec - 1;
|
|
2144 for (l = ll; l <= i6; ++l)
|
|
2145 {
|
|
2146 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
|
|
2147 * b[l + j * b_dim1];
|
|
2148 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
|
|
2149 * b[l + j * b_dim1];
|
|
2150 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
|
|
2151 * b[l + (j + 1) * b_dim1];
|
|
2152 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
|
|
2153 * b[l + (j + 1) * b_dim1];
|
|
2154 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
|
|
2155 * b[l + (j + 2) * b_dim1];
|
|
2156 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
|
|
2157 * b[l + (j + 2) * b_dim1];
|
|
2158 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
|
|
2159 * b[l + (j + 3) * b_dim1];
|
|
2160 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
|
|
2161 * b[l + (j + 3) * b_dim1];
|
|
2162 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
|
|
2163 * b[l + j * b_dim1];
|
|
2164 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
|
|
2165 * b[l + j * b_dim1];
|
|
2166 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
|
|
2167 * b[l + (j + 1) * b_dim1];
|
|
2168 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
|
|
2169 * b[l + (j + 1) * b_dim1];
|
|
2170 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
|
|
2171 * b[l + (j + 2) * b_dim1];
|
|
2172 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
|
|
2173 * b[l + (j + 2) * b_dim1];
|
|
2174 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
|
|
2175 * b[l + (j + 3) * b_dim1];
|
|
2176 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
|
|
2177 * b[l + (j + 3) * b_dim1];
|
|
2178 }
|
|
2179 c[i + j * c_dim1] = f11;
|
|
2180 c[i + 1 + j * c_dim1] = f21;
|
|
2181 c[i + (j + 1) * c_dim1] = f12;
|
|
2182 c[i + 1 + (j + 1) * c_dim1] = f22;
|
|
2183 c[i + (j + 2) * c_dim1] = f13;
|
|
2184 c[i + 1 + (j + 2) * c_dim1] = f23;
|
|
2185 c[i + (j + 3) * c_dim1] = f14;
|
|
2186 c[i + 1 + (j + 3) * c_dim1] = f24;
|
|
2187 c[i + 2 + j * c_dim1] = f31;
|
|
2188 c[i + 3 + j * c_dim1] = f41;
|
|
2189 c[i + 2 + (j + 1) * c_dim1] = f32;
|
|
2190 c[i + 3 + (j + 1) * c_dim1] = f42;
|
|
2191 c[i + 2 + (j + 2) * c_dim1] = f33;
|
|
2192 c[i + 3 + (j + 2) * c_dim1] = f43;
|
|
2193 c[i + 2 + (j + 3) * c_dim1] = f34;
|
|
2194 c[i + 3 + (j + 3) * c_dim1] = f44;
|
|
2195 }
|
|
2196 if (uisec < isec)
|
|
2197 {
|
|
2198 i5 = ii + isec - 1;
|
|
2199 for (i = ii + uisec; i <= i5; ++i)
|
|
2200 {
|
|
2201 f11 = c[i + j * c_dim1];
|
|
2202 f12 = c[i + (j + 1) * c_dim1];
|
|
2203 f13 = c[i + (j + 2) * c_dim1];
|
|
2204 f14 = c[i + (j + 3) * c_dim1];
|
|
2205 i6 = ll + lsec - 1;
|
|
2206 for (l = ll; l <= i6; ++l)
|
|
2207 {
|
|
2208 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
|
|
2209 257] * b[l + j * b_dim1];
|
|
2210 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
|
|
2211 257] * b[l + (j + 1) * b_dim1];
|
|
2212 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
|
|
2213 257] * b[l + (j + 2) * b_dim1];
|
|
2214 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
|
|
2215 257] * b[l + (j + 3) * b_dim1];
|
|
2216 }
|
|
2217 c[i + j * c_dim1] = f11;
|
|
2218 c[i + (j + 1) * c_dim1] = f12;
|
|
2219 c[i + (j + 2) * c_dim1] = f13;
|
|
2220 c[i + (j + 3) * c_dim1] = f14;
|
|
2221 }
|
|
2222 }
|
|
2223 }
|
|
2224 if (ujsec < jsec)
|
|
2225 {
|
|
2226 i4 = jj + jsec - 1;
|
|
2227 for (j = jj + ujsec; j <= i4; ++j)
|
|
2228 {
|
|
2229 i5 = ii + uisec - 1;
|
|
2230 for (i = ii; i <= i5; i += 4)
|
|
2231 {
|
|
2232 f11 = c[i + j * c_dim1];
|
|
2233 f21 = c[i + 1 + j * c_dim1];
|
|
2234 f31 = c[i + 2 + j * c_dim1];
|
|
2235 f41 = c[i + 3 + j * c_dim1];
|
|
2236 i6 = ll + lsec - 1;
|
|
2237 for (l = ll; l <= i6; ++l)
|
|
2238 {
|
|
2239 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
|
|
2240 257] * b[l + j * b_dim1];
|
|
2241 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
|
|
2242 257] * b[l + j * b_dim1];
|
|
2243 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
|
|
2244 257] * b[l + j * b_dim1];
|
|
2245 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
|
|
2246 257] * b[l + j * b_dim1];
|
|
2247 }
|
|
2248 c[i + j * c_dim1] = f11;
|
|
2249 c[i + 1 + j * c_dim1] = f21;
|
|
2250 c[i + 2 + j * c_dim1] = f31;
|
|
2251 c[i + 3 + j * c_dim1] = f41;
|
|
2252 }
|
|
2253 i5 = ii + isec - 1;
|
|
2254 for (i = ii + uisec; i <= i5; ++i)
|
|
2255 {
|
|
2256 f11 = c[i + j * c_dim1];
|
|
2257 i6 = ll + lsec - 1;
|
|
2258 for (l = ll; l <= i6; ++l)
|
|
2259 {
|
|
2260 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
|
|
2261 257] * b[l + j * b_dim1];
|
|
2262 }
|
|
2263 c[i + j * c_dim1] = f11;
|
|
2264 }
|
|
2265 }
|
|
2266 }
|
|
2267 }
|
|
2268 }
|
|
2269 }
|
|
2270 free(t1);
|
|
2271 return;
|
|
2272 }
|
|
2273 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
|
|
2274 {
|
|
2275 if (GFC_DESCRIPTOR_RANK (a) != 1)
|
|
2276 {
|
|
2277 const GFC_INTEGER_8 *restrict abase_x;
|
|
2278 const GFC_INTEGER_8 *restrict bbase_y;
|
|
2279 GFC_INTEGER_8 *restrict dest_y;
|
|
2280 GFC_INTEGER_8 s;
|
|
2281
|
|
2282 for (y = 0; y < ycount; y++)
|
|
2283 {
|
|
2284 bbase_y = &bbase[y*bystride];
|
|
2285 dest_y = &dest[y*rystride];
|
|
2286 for (x = 0; x < xcount; x++)
|
|
2287 {
|
|
2288 abase_x = &abase[x*axstride];
|
|
2289 s = (GFC_INTEGER_8) 0;
|
|
2290 for (n = 0; n < count; n++)
|
|
2291 s += abase_x[n] * bbase_y[n];
|
|
2292 dest_y[x] = s;
|
|
2293 }
|
|
2294 }
|
|
2295 }
|
|
2296 else
|
|
2297 {
|
|
2298 const GFC_INTEGER_8 *restrict bbase_y;
|
|
2299 GFC_INTEGER_8 s;
|
|
2300
|
|
2301 for (y = 0; y < ycount; y++)
|
|
2302 {
|
|
2303 bbase_y = &bbase[y*bystride];
|
|
2304 s = (GFC_INTEGER_8) 0;
|
|
2305 for (n = 0; n < count; n++)
|
|
2306 s += abase[n*axstride] * bbase_y[n];
|
|
2307 dest[y*rystride] = s;
|
|
2308 }
|
|
2309 }
|
|
2310 }
|
|
2311 else if (axstride < aystride)
|
|
2312 {
|
|
2313 for (y = 0; y < ycount; y++)
|
|
2314 for (x = 0; x < xcount; x++)
|
|
2315 dest[x*rxstride + y*rystride] = (GFC_INTEGER_8)0;
|
|
2316
|
|
2317 for (y = 0; y < ycount; y++)
|
|
2318 for (n = 0; n < count; n++)
|
|
2319 for (x = 0; x < xcount; x++)
|
|
2320 /* dest[x,y] += a[x,n] * b[n,y] */
|
|
2321 dest[x*rxstride + y*rystride] +=
|
|
2322 abase[x*axstride + n*aystride] *
|
|
2323 bbase[n*bxstride + y*bystride];
|
|
2324 }
|
|
2325 else if (GFC_DESCRIPTOR_RANK (a) == 1)
|
|
2326 {
|
|
2327 const GFC_INTEGER_8 *restrict bbase_y;
|
|
2328 GFC_INTEGER_8 s;
|
|
2329
|
|
2330 for (y = 0; y < ycount; y++)
|
|
2331 {
|
|
2332 bbase_y = &bbase[y*bystride];
|
|
2333 s = (GFC_INTEGER_8) 0;
|
|
2334 for (n = 0; n < count; n++)
|
|
2335 s += abase[n*axstride] * bbase_y[n*bxstride];
|
|
2336 dest[y*rxstride] = s;
|
|
2337 }
|
|
2338 }
|
|
2339 else
|
|
2340 {
|
|
2341 const GFC_INTEGER_8 *restrict abase_x;
|
|
2342 const GFC_INTEGER_8 *restrict bbase_y;
|
|
2343 GFC_INTEGER_8 *restrict dest_y;
|
|
2344 GFC_INTEGER_8 s;
|
|
2345
|
|
2346 for (y = 0; y < ycount; y++)
|
|
2347 {
|
|
2348 bbase_y = &bbase[y*bystride];
|
|
2349 dest_y = &dest[y*rystride];
|
|
2350 for (x = 0; x < xcount; x++)
|
|
2351 {
|
|
2352 abase_x = &abase[x*axstride];
|
|
2353 s = (GFC_INTEGER_8) 0;
|
|
2354 for (n = 0; n < count; n++)
|
|
2355 s += abase_x[n*aystride] * bbase_y[n*bxstride];
|
|
2356 dest_y[x*rxstride] = s;
|
|
2357 }
|
|
2358 }
|
|
2359 }
|
|
2360 }
|
|
2361 #undef POW3
|
|
2362 #undef min
|
|
2363 #undef max
|
|
2364
|
|
2365
|
|
2366 /* Compiling main function, with selection code for the processor. */
|
|
2367
|
|
2368 /* Currently, this is i386 only. Adjust for other architectures. */
|
|
2369
|
|
2370 #include <config/i386/cpuinfo.h>
|
|
2371 void matmul_i8 (gfc_array_i8 * const restrict retarray,
|
|
2372 gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
|
|
2373 int blas_limit, blas_call gemm)
|
|
2374 {
|
|
2375 static void (*matmul_p) (gfc_array_i8 * const restrict retarray,
|
|
2376 gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
|
|
2377 int blas_limit, blas_call gemm);
|
|
2378
|
|
2379 void (*matmul_fn) (gfc_array_i8 * const restrict retarray,
|
|
2380 gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
|
|
2381 int blas_limit, blas_call gemm);
|
|
2382
|
|
2383 matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED);
|
|
2384 if (matmul_fn == NULL)
|
|
2385 {
|
|
2386 matmul_fn = matmul_i8_vanilla;
|
|
2387 if (__cpu_model.__cpu_vendor == VENDOR_INTEL)
|
|
2388 {
|
|
2389 /* Run down the available processors in order of preference. */
|
|
2390 #ifdef HAVE_AVX512F
|
|
2391 if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F))
|
|
2392 {
|
|
2393 matmul_fn = matmul_i8_avx512f;
|
|
2394 goto store;
|
|
2395 }
|
|
2396
|
|
2397 #endif /* HAVE_AVX512F */
|
|
2398
|
|
2399 #ifdef HAVE_AVX2
|
|
2400 if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
|
|
2401 && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
|
|
2402 {
|
|
2403 matmul_fn = matmul_i8_avx2;
|
|
2404 goto store;
|
|
2405 }
|
|
2406
|
|
2407 #endif
|
|
2408
|
|
2409 #ifdef HAVE_AVX
|
|
2410 if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
|
|
2411 {
|
|
2412 matmul_fn = matmul_i8_avx;
|
|
2413 goto store;
|
|
2414 }
|
|
2415 #endif /* HAVE_AVX */
|
|
2416 }
|
|
2417 else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
|
|
2418 {
|
|
2419 #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
|
|
2420 if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
|
|
2421 && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
|
|
2422 {
|
|
2423 matmul_fn = matmul_i8_avx128_fma3;
|
|
2424 goto store;
|
|
2425 }
|
|
2426 #endif
|
|
2427 #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
|
|
2428 if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
|
|
2429 && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
|
|
2430 {
|
|
2431 matmul_fn = matmul_i8_avx128_fma4;
|
|
2432 goto store;
|
|
2433 }
|
|
2434 #endif
|
|
2435
|
|
2436 }
|
|
2437 store:
|
|
2438 __atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
|
|
2439 }
|
|
2440
|
|
2441 (*matmul_fn) (retarray, a, b, try_blas, blas_limit, gemm);
|
|
2442 }
|
|
2443
|
|
2444 #else /* Just the vanilla function. */
|
|
2445
|
|
2446 void
|
|
2447 matmul_i8 (gfc_array_i8 * const restrict retarray,
|
|
2448 gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
|
|
2449 int blas_limit, blas_call gemm)
|
|
2450 {
|
|
2451 const GFC_INTEGER_8 * restrict abase;
|
|
2452 const GFC_INTEGER_8 * restrict bbase;
|
|
2453 GFC_INTEGER_8 * restrict dest;
|
|
2454
|
|
2455 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
|
|
2456 index_type x, y, n, count, xcount, ycount;
|
|
2457
|
|
2458 assert (GFC_DESCRIPTOR_RANK (a) == 2
|
|
2459 || GFC_DESCRIPTOR_RANK (b) == 2);
|
|
2460
|
|
2461 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
|
|
2462
|
|
2463 Either A or B (but not both) can be rank 1:
|
|
2464
|
|
2465 o One-dimensional argument A is implicitly treated as a row matrix
|
|
2466 dimensioned [1,count], so xcount=1.
|
|
2467
|
|
2468 o One-dimensional argument B is implicitly treated as a column matrix
|
|
2469 dimensioned [count, 1], so ycount=1.
|
|
2470 */
|
|
2471
|
|
2472 if (retarray->base_addr == NULL)
|
|
2473 {
|
|
2474 if (GFC_DESCRIPTOR_RANK (a) == 1)
|
|
2475 {
|
|
2476 GFC_DIMENSION_SET(retarray->dim[0], 0,
|
|
2477 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
|
|
2478 }
|
|
2479 else if (GFC_DESCRIPTOR_RANK (b) == 1)
|
|
2480 {
|
|
2481 GFC_DIMENSION_SET(retarray->dim[0], 0,
|
|
2482 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
|
|
2483 }
|
|
2484 else
|
|
2485 {
|
|
2486 GFC_DIMENSION_SET(retarray->dim[0], 0,
|
|
2487 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
|
|
2488
|
|
2489 GFC_DIMENSION_SET(retarray->dim[1], 0,
|
|
2490 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
|
|
2491 GFC_DESCRIPTOR_EXTENT(retarray,0));
|
|
2492 }
|
|
2493
|
|
2494 retarray->base_addr
|
|
2495 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_8));
|
|
2496 retarray->offset = 0;
|
|
2497 }
|
|
2498 else if (unlikely (compile_options.bounds_check))
|
|
2499 {
|
|
2500 index_type ret_extent, arg_extent;
|
|
2501
|
|
2502 if (GFC_DESCRIPTOR_RANK (a) == 1)
|
|
2503 {
|
|
2504 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
|
|
2505 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
|
|
2506 if (arg_extent != ret_extent)
|
131
|
2507 runtime_error ("Array bound mismatch for dimension 1 of "
|
|
2508 "array (%ld/%ld) ",
|
111
|
2509 (long int) ret_extent, (long int) arg_extent);
|
|
2510 }
|
|
2511 else if (GFC_DESCRIPTOR_RANK (b) == 1)
|
|
2512 {
|
|
2513 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
|
|
2514 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
|
|
2515 if (arg_extent != ret_extent)
|
131
|
2516 runtime_error ("Array bound mismatch for dimension 1 of "
|
|
2517 "array (%ld/%ld) ",
|
111
|
2518 (long int) ret_extent, (long int) arg_extent);
|
|
2519 }
|
|
2520 else
|
|
2521 {
|
|
2522 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
|
|
2523 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
|
|
2524 if (arg_extent != ret_extent)
|
131
|
2525 runtime_error ("Array bound mismatch for dimension 1 of "
|
|
2526 "array (%ld/%ld) ",
|
111
|
2527 (long int) ret_extent, (long int) arg_extent);
|
|
2528
|
|
2529 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
|
|
2530 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
|
|
2531 if (arg_extent != ret_extent)
|
131
|
2532 runtime_error ("Array bound mismatch for dimension 2 of "
|
|
2533 "array (%ld/%ld) ",
|
111
|
2534 (long int) ret_extent, (long int) arg_extent);
|
|
2535 }
|
|
2536 }
|
|
2537
|
|
2538
|
|
2539 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
|
|
2540 {
|
|
2541 /* One-dimensional result may be addressed in the code below
|
|
2542 either as a row or a column matrix. We want both cases to
|
|
2543 work. */
|
|
2544 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
|
|
2545 }
|
|
2546 else
|
|
2547 {
|
|
2548 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
|
|
2549 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
|
|
2550 }
|
|
2551
|
|
2552
|
|
2553 if (GFC_DESCRIPTOR_RANK (a) == 1)
|
|
2554 {
|
|
2555 /* Treat it as a a row matrix A[1,count]. */
|
|
2556 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
|
|
2557 aystride = 1;
|
|
2558
|
|
2559 xcount = 1;
|
|
2560 count = GFC_DESCRIPTOR_EXTENT(a,0);
|
|
2561 }
|
|
2562 else
|
|
2563 {
|
|
2564 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
|
|
2565 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
|
|
2566
|
|
2567 count = GFC_DESCRIPTOR_EXTENT(a,1);
|
|
2568 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
|
|
2569 }
|
|
2570
|
|
2571 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
|
|
2572 {
|
|
2573 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
|
131
|
2574 runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
|
|
2575 "in dimension 1: is %ld, should be %ld",
|
|
2576 (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
|
111
|
2577 }
|
|
2578
|
|
2579 if (GFC_DESCRIPTOR_RANK (b) == 1)
|
|
2580 {
|
|
2581 /* Treat it as a column matrix B[count,1] */
|
|
2582 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
|
|
2583
|
|
2584 /* bystride should never be used for 1-dimensional b.
|
|
2585 The value is only used for calculation of the
|
|
2586 memory by the buffer. */
|
|
2587 bystride = 256;
|
|
2588 ycount = 1;
|
|
2589 }
|
|
2590 else
|
|
2591 {
|
|
2592 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
|
|
2593 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
|
|
2594 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
|
|
2595 }
|
|
2596
|
|
2597 abase = a->base_addr;
|
|
2598 bbase = b->base_addr;
|
|
2599 dest = retarray->base_addr;
|
|
2600
|
|
2601 /* Now that everything is set up, we perform the multiplication
|
|
2602 itself. */
|
|
2603
|
|
2604 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
|
|
2605 #define min(a,b) ((a) <= (b) ? (a) : (b))
|
|
2606 #define max(a,b) ((a) >= (b) ? (a) : (b))
|
|
2607
|
|
2608 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
|
|
2609 && (bxstride == 1 || bystride == 1)
|
|
2610 && (((float) xcount) * ((float) ycount) * ((float) count)
|
|
2611 > POW3(blas_limit)))
|
|
2612 {
|
|
2613 const int m = xcount, n = ycount, k = count, ldc = rystride;
|
|
2614 const GFC_INTEGER_8 one = 1, zero = 0;
|
|
2615 const int lda = (axstride == 1) ? aystride : axstride,
|
|
2616 ldb = (bxstride == 1) ? bystride : bxstride;
|
|
2617
|
|
2618 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
|
|
2619 {
|
|
2620 assert (gemm != NULL);
|
131
|
2621 const char *transa, *transb;
|
|
2622 if (try_blas & 2)
|
|
2623 transa = "C";
|
|
2624 else
|
|
2625 transa = axstride == 1 ? "N" : "T";
|
|
2626
|
|
2627 if (try_blas & 4)
|
|
2628 transb = "C";
|
|
2629 else
|
|
2630 transb = bxstride == 1 ? "N" : "T";
|
|
2631
|
|
2632 gemm (transa, transb , &m,
|
111
|
2633 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
|
|
2634 &ldc, 1, 1);
|
|
2635 return;
|
|
2636 }
|
|
2637 }
|
|
2638
|
|
2639 if (rxstride == 1 && axstride == 1 && bxstride == 1)
|
|
2640 {
|
|
2641 /* This block of code implements a tuned matmul, derived from
|
|
2642 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
|
|
2643
|
|
2644 Bo Kagstrom and Per Ling
|
|
2645 Department of Computing Science
|
|
2646 Umea University
|
|
2647 S-901 87 Umea, Sweden
|
|
2648
|
|
2649 from netlib.org, translated to C, and modified for matmul.m4. */
|
|
2650
|
|
2651 const GFC_INTEGER_8 *a, *b;
|
|
2652 GFC_INTEGER_8 *c;
|
|
2653 const index_type m = xcount, n = ycount, k = count;
|
|
2654
|
|
2655 /* System generated locals */
|
|
2656 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
|
|
2657 i1, i2, i3, i4, i5, i6;
|
|
2658
|
|
2659 /* Local variables */
|
|
2660 GFC_INTEGER_8 f11, f12, f21, f22, f31, f32, f41, f42,
|
|
2661 f13, f14, f23, f24, f33, f34, f43, f44;
|
|
2662 index_type i, j, l, ii, jj, ll;
|
|
2663 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
|
|
2664 GFC_INTEGER_8 *t1;
|
|
2665
|
|
2666 a = abase;
|
|
2667 b = bbase;
|
|
2668 c = retarray->base_addr;
|
|
2669
|
|
2670 /* Parameter adjustments */
|
|
2671 c_dim1 = rystride;
|
|
2672 c_offset = 1 + c_dim1;
|
|
2673 c -= c_offset;
|
|
2674 a_dim1 = aystride;
|
|
2675 a_offset = 1 + a_dim1;
|
|
2676 a -= a_offset;
|
|
2677 b_dim1 = bystride;
|
|
2678 b_offset = 1 + b_dim1;
|
|
2679 b -= b_offset;
|
|
2680
|
|
2681 /* Empty c first. */
|
|
2682 for (j=1; j<=n; j++)
|
|
2683 for (i=1; i<=m; i++)
|
|
2684 c[i + j * c_dim1] = (GFC_INTEGER_8)0;
|
|
2685
|
|
2686 /* Early exit if possible */
|
|
2687 if (m == 0 || n == 0 || k == 0)
|
|
2688 return;
|
|
2689
|
|
2690 /* Adjust size of t1 to what is needed. */
|
131
|
2691 index_type t1_dim, a_sz;
|
|
2692 if (aystride == 1)
|
|
2693 a_sz = rystride;
|
|
2694 else
|
|
2695 a_sz = a_dim1;
|
|
2696
|
|
2697 t1_dim = a_sz * 256 + b_dim1;
|
111
|
2698 if (t1_dim > 65536)
|
|
2699 t1_dim = 65536;
|
|
2700
|
|
2701 t1 = malloc (t1_dim * sizeof(GFC_INTEGER_8));
|
|
2702
|
|
2703 /* Start turning the crank. */
|
|
2704 i1 = n;
|
|
2705 for (jj = 1; jj <= i1; jj += 512)
|
|
2706 {
|
|
2707 /* Computing MIN */
|
|
2708 i2 = 512;
|
|
2709 i3 = n - jj + 1;
|
|
2710 jsec = min(i2,i3);
|
|
2711 ujsec = jsec - jsec % 4;
|
|
2712 i2 = k;
|
|
2713 for (ll = 1; ll <= i2; ll += 256)
|
|
2714 {
|
|
2715 /* Computing MIN */
|
|
2716 i3 = 256;
|
|
2717 i4 = k - ll + 1;
|
|
2718 lsec = min(i3,i4);
|
|
2719 ulsec = lsec - lsec % 2;
|
|
2720
|
|
2721 i3 = m;
|
|
2722 for (ii = 1; ii <= i3; ii += 256)
|
|
2723 {
|
|
2724 /* Computing MIN */
|
|
2725 i4 = 256;
|
|
2726 i5 = m - ii + 1;
|
|
2727 isec = min(i4,i5);
|
|
2728 uisec = isec - isec % 2;
|
|
2729 i4 = ll + ulsec - 1;
|
|
2730 for (l = ll; l <= i4; l += 2)
|
|
2731 {
|
|
2732 i5 = ii + uisec - 1;
|
|
2733 for (i = ii; i <= i5; i += 2)
|
|
2734 {
|
|
2735 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
|
|
2736 a[i + l * a_dim1];
|
|
2737 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
|
|
2738 a[i + (l + 1) * a_dim1];
|
|
2739 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
|
|
2740 a[i + 1 + l * a_dim1];
|
|
2741 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
|
|
2742 a[i + 1 + (l + 1) * a_dim1];
|
|
2743 }
|
|
2744 if (uisec < isec)
|
|
2745 {
|
|
2746 t1[l - ll + 1 + (isec << 8) - 257] =
|
|
2747 a[ii + isec - 1 + l * a_dim1];
|
|
2748 t1[l - ll + 2 + (isec << 8) - 257] =
|
|
2749 a[ii + isec - 1 + (l + 1) * a_dim1];
|
|
2750 }
|
|
2751 }
|
|
2752 if (ulsec < lsec)
|
|
2753 {
|
|
2754 i4 = ii + isec - 1;
|
|
2755 for (i = ii; i<= i4; ++i)
|
|
2756 {
|
|
2757 t1[lsec + ((i - ii + 1) << 8) - 257] =
|
|
2758 a[i + (ll + lsec - 1) * a_dim1];
|
|
2759 }
|
|
2760 }
|
|
2761
|
|
2762 uisec = isec - isec % 4;
|
|
2763 i4 = jj + ujsec - 1;
|
|
2764 for (j = jj; j <= i4; j += 4)
|
|
2765 {
|
|
2766 i5 = ii + uisec - 1;
|
|
2767 for (i = ii; i <= i5; i += 4)
|
|
2768 {
|
|
2769 f11 = c[i + j * c_dim1];
|
|
2770 f21 = c[i + 1 + j * c_dim1];
|
|
2771 f12 = c[i + (j + 1) * c_dim1];
|
|
2772 f22 = c[i + 1 + (j + 1) * c_dim1];
|
|
2773 f13 = c[i + (j + 2) * c_dim1];
|
|
2774 f23 = c[i + 1 + (j + 2) * c_dim1];
|
|
2775 f14 = c[i + (j + 3) * c_dim1];
|
|
2776 f24 = c[i + 1 + (j + 3) * c_dim1];
|
|
2777 f31 = c[i + 2 + j * c_dim1];
|
|
2778 f41 = c[i + 3 + j * c_dim1];
|
|
2779 f32 = c[i + 2 + (j + 1) * c_dim1];
|
|
2780 f42 = c[i + 3 + (j + 1) * c_dim1];
|
|
2781 f33 = c[i + 2 + (j + 2) * c_dim1];
|
|
2782 f43 = c[i + 3 + (j + 2) * c_dim1];
|
|
2783 f34 = c[i + 2 + (j + 3) * c_dim1];
|
|
2784 f44 = c[i + 3 + (j + 3) * c_dim1];
|
|
2785 i6 = ll + lsec - 1;
|
|
2786 for (l = ll; l <= i6; ++l)
|
|
2787 {
|
|
2788 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
|
|
2789 * b[l + j * b_dim1];
|
|
2790 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
|
|
2791 * b[l + j * b_dim1];
|
|
2792 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
|
|
2793 * b[l + (j + 1) * b_dim1];
|
|
2794 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
|
|
2795 * b[l + (j + 1) * b_dim1];
|
|
2796 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
|
|
2797 * b[l + (j + 2) * b_dim1];
|
|
2798 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
|
|
2799 * b[l + (j + 2) * b_dim1];
|
|
2800 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
|
|
2801 * b[l + (j + 3) * b_dim1];
|
|
2802 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
|
|
2803 * b[l + (j + 3) * b_dim1];
|
|
2804 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
|
|
2805 * b[l + j * b_dim1];
|
|
2806 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
|
|
2807 * b[l + j * b_dim1];
|
|
2808 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
|
|
2809 * b[l + (j + 1) * b_dim1];
|
|
2810 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
|
|
2811 * b[l + (j + 1) * b_dim1];
|
|
2812 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
|
|
2813 * b[l + (j + 2) * b_dim1];
|
|
2814 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
|
|
2815 * b[l + (j + 2) * b_dim1];
|
|
2816 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
|
|
2817 * b[l + (j + 3) * b_dim1];
|
|
2818 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
|
|
2819 * b[l + (j + 3) * b_dim1];
|
|
2820 }
|
|
2821 c[i + j * c_dim1] = f11;
|
|
2822 c[i + 1 + j * c_dim1] = f21;
|
|
2823 c[i + (j + 1) * c_dim1] = f12;
|
|
2824 c[i + 1 + (j + 1) * c_dim1] = f22;
|
|
2825 c[i + (j + 2) * c_dim1] = f13;
|
|
2826 c[i + 1 + (j + 2) * c_dim1] = f23;
|
|
2827 c[i + (j + 3) * c_dim1] = f14;
|
|
2828 c[i + 1 + (j + 3) * c_dim1] = f24;
|
|
2829 c[i + 2 + j * c_dim1] = f31;
|
|
2830 c[i + 3 + j * c_dim1] = f41;
|
|
2831 c[i + 2 + (j + 1) * c_dim1] = f32;
|
|
2832 c[i + 3 + (j + 1) * c_dim1] = f42;
|
|
2833 c[i + 2 + (j + 2) * c_dim1] = f33;
|
|
2834 c[i + 3 + (j + 2) * c_dim1] = f43;
|
|
2835 c[i + 2 + (j + 3) * c_dim1] = f34;
|
|
2836 c[i + 3 + (j + 3) * c_dim1] = f44;
|
|
2837 }
|
|
2838 if (uisec < isec)
|
|
2839 {
|
|
2840 i5 = ii + isec - 1;
|
|
2841 for (i = ii + uisec; i <= i5; ++i)
|
|
2842 {
|
|
2843 f11 = c[i + j * c_dim1];
|
|
2844 f12 = c[i + (j + 1) * c_dim1];
|
|
2845 f13 = c[i + (j + 2) * c_dim1];
|
|
2846 f14 = c[i + (j + 3) * c_dim1];
|
|
2847 i6 = ll + lsec - 1;
|
|
2848 for (l = ll; l <= i6; ++l)
|
|
2849 {
|
|
2850 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
|
|
2851 257] * b[l + j * b_dim1];
|
|
2852 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
|
|
2853 257] * b[l + (j + 1) * b_dim1];
|
|
2854 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
|
|
2855 257] * b[l + (j + 2) * b_dim1];
|
|
2856 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
|
|
2857 257] * b[l + (j + 3) * b_dim1];
|
|
2858 }
|
|
2859 c[i + j * c_dim1] = f11;
|
|
2860 c[i + (j + 1) * c_dim1] = f12;
|
|
2861 c[i + (j + 2) * c_dim1] = f13;
|
|
2862 c[i + (j + 3) * c_dim1] = f14;
|
|
2863 }
|
|
2864 }
|
|
2865 }
|
|
2866 if (ujsec < jsec)
|
|
2867 {
|
|
2868 i4 = jj + jsec - 1;
|
|
2869 for (j = jj + ujsec; j <= i4; ++j)
|
|
2870 {
|
|
2871 i5 = ii + uisec - 1;
|
|
2872 for (i = ii; i <= i5; i += 4)
|
|
2873 {
|
|
2874 f11 = c[i + j * c_dim1];
|
|
2875 f21 = c[i + 1 + j * c_dim1];
|
|
2876 f31 = c[i + 2 + j * c_dim1];
|
|
2877 f41 = c[i + 3 + j * c_dim1];
|
|
2878 i6 = ll + lsec - 1;
|
|
2879 for (l = ll; l <= i6; ++l)
|
|
2880 {
|
|
2881 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
|
|
2882 257] * b[l + j * b_dim1];
|
|
2883 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
|
|
2884 257] * b[l + j * b_dim1];
|
|
2885 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
|
|
2886 257] * b[l + j * b_dim1];
|
|
2887 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
|
|
2888 257] * b[l + j * b_dim1];
|
|
2889 }
|
|
2890 c[i + j * c_dim1] = f11;
|
|
2891 c[i + 1 + j * c_dim1] = f21;
|
|
2892 c[i + 2 + j * c_dim1] = f31;
|
|
2893 c[i + 3 + j * c_dim1] = f41;
|
|
2894 }
|
|
2895 i5 = ii + isec - 1;
|
|
2896 for (i = ii + uisec; i <= i5; ++i)
|
|
2897 {
|
|
2898 f11 = c[i + j * c_dim1];
|
|
2899 i6 = ll + lsec - 1;
|
|
2900 for (l = ll; l <= i6; ++l)
|
|
2901 {
|
|
2902 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
|
|
2903 257] * b[l + j * b_dim1];
|
|
2904 }
|
|
2905 c[i + j * c_dim1] = f11;
|
|
2906 }
|
|
2907 }
|
|
2908 }
|
|
2909 }
|
|
2910 }
|
|
2911 }
|
|
2912 free(t1);
|
|
2913 return;
|
|
2914 }
|
|
2915 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
|
|
2916 {
|
|
2917 if (GFC_DESCRIPTOR_RANK (a) != 1)
|
|
2918 {
|
|
2919 const GFC_INTEGER_8 *restrict abase_x;
|
|
2920 const GFC_INTEGER_8 *restrict bbase_y;
|
|
2921 GFC_INTEGER_8 *restrict dest_y;
|
|
2922 GFC_INTEGER_8 s;
|
|
2923
|
|
2924 for (y = 0; y < ycount; y++)
|
|
2925 {
|
|
2926 bbase_y = &bbase[y*bystride];
|
|
2927 dest_y = &dest[y*rystride];
|
|
2928 for (x = 0; x < xcount; x++)
|
|
2929 {
|
|
2930 abase_x = &abase[x*axstride];
|
|
2931 s = (GFC_INTEGER_8) 0;
|
|
2932 for (n = 0; n < count; n++)
|
|
2933 s += abase_x[n] * bbase_y[n];
|
|
2934 dest_y[x] = s;
|
|
2935 }
|
|
2936 }
|
|
2937 }
|
|
2938 else
|
|
2939 {
|
|
2940 const GFC_INTEGER_8 *restrict bbase_y;
|
|
2941 GFC_INTEGER_8 s;
|
|
2942
|
|
2943 for (y = 0; y < ycount; y++)
|
|
2944 {
|
|
2945 bbase_y = &bbase[y*bystride];
|
|
2946 s = (GFC_INTEGER_8) 0;
|
|
2947 for (n = 0; n < count; n++)
|
|
2948 s += abase[n*axstride] * bbase_y[n];
|
|
2949 dest[y*rystride] = s;
|
|
2950 }
|
|
2951 }
|
|
2952 }
|
|
2953 else if (axstride < aystride)
|
|
2954 {
|
|
2955 for (y = 0; y < ycount; y++)
|
|
2956 for (x = 0; x < xcount; x++)
|
|
2957 dest[x*rxstride + y*rystride] = (GFC_INTEGER_8)0;
|
|
2958
|
|
2959 for (y = 0; y < ycount; y++)
|
|
2960 for (n = 0; n < count; n++)
|
|
2961 for (x = 0; x < xcount; x++)
|
|
2962 /* dest[x,y] += a[x,n] * b[n,y] */
|
|
2963 dest[x*rxstride + y*rystride] +=
|
|
2964 abase[x*axstride + n*aystride] *
|
|
2965 bbase[n*bxstride + y*bystride];
|
|
2966 }
|
|
2967 else if (GFC_DESCRIPTOR_RANK (a) == 1)
|
|
2968 {
|
|
2969 const GFC_INTEGER_8 *restrict bbase_y;
|
|
2970 GFC_INTEGER_8 s;
|
|
2971
|
|
2972 for (y = 0; y < ycount; y++)
|
|
2973 {
|
|
2974 bbase_y = &bbase[y*bystride];
|
|
2975 s = (GFC_INTEGER_8) 0;
|
|
2976 for (n = 0; n < count; n++)
|
|
2977 s += abase[n*axstride] * bbase_y[n*bxstride];
|
|
2978 dest[y*rxstride] = s;
|
|
2979 }
|
|
2980 }
|
|
2981 else
|
|
2982 {
|
|
2983 const GFC_INTEGER_8 *restrict abase_x;
|
|
2984 const GFC_INTEGER_8 *restrict bbase_y;
|
|
2985 GFC_INTEGER_8 *restrict dest_y;
|
|
2986 GFC_INTEGER_8 s;
|
|
2987
|
|
2988 for (y = 0; y < ycount; y++)
|
|
2989 {
|
|
2990 bbase_y = &bbase[y*bystride];
|
|
2991 dest_y = &dest[y*rystride];
|
|
2992 for (x = 0; x < xcount; x++)
|
|
2993 {
|
|
2994 abase_x = &abase[x*axstride];
|
|
2995 s = (GFC_INTEGER_8) 0;
|
|
2996 for (n = 0; n < count; n++)
|
|
2997 s += abase_x[n*aystride] * bbase_y[n*bxstride];
|
|
2998 dest_y[x*rxstride] = s;
|
|
2999 }
|
|
3000 }
|
|
3001 }
|
|
3002 }
|
|
3003 #undef POW3
|
|
3004 #undef min
|
|
3005 #undef max
|
|
3006
|
|
3007 #endif
|
|
3008 #endif
|
|
3009
|