annotate libgfortran/generated/matmul_i16.c @ 158:494b0b89df80 default tip

...
author Shinji KONO <kono@ie.u-ryukyu.ac.jp>
date Mon, 25 May 2020 18:13:55 +0900
parents 1830386684a0
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
111
kono
parents:
diff changeset
1 /* Implementation of the MATMUL intrinsic
145
1830386684a0 gcc-9.2.0
anatofuz
parents: 131
diff changeset
2 Copyright (C) 2002-2020 Free Software Foundation, Inc.
111
kono
parents:
diff changeset
3 Contributed by Paul Brook <paul@nowt.org>
kono
parents:
diff changeset
4
kono
parents:
diff changeset
5 This file is part of the GNU Fortran runtime library (libgfortran).
kono
parents:
diff changeset
6
kono
parents:
diff changeset
7 Libgfortran is free software; you can redistribute it and/or
kono
parents:
diff changeset
8 modify it under the terms of the GNU General Public
kono
parents:
diff changeset
9 License as published by the Free Software Foundation; either
kono
parents:
diff changeset
10 version 3 of the License, or (at your option) any later version.
kono
parents:
diff changeset
11
kono
parents:
diff changeset
12 Libgfortran is distributed in the hope that it will be useful,
kono
parents:
diff changeset
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
kono
parents:
diff changeset
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
kono
parents:
diff changeset
15 GNU General Public License for more details.
kono
parents:
diff changeset
16
kono
parents:
diff changeset
17 Under Section 7 of GPL version 3, you are granted additional
kono
parents:
diff changeset
18 permissions described in the GCC Runtime Library Exception, version
kono
parents:
diff changeset
19 3.1, as published by the Free Software Foundation.
kono
parents:
diff changeset
20
kono
parents:
diff changeset
21 You should have received a copy of the GNU General Public License and
kono
parents:
diff changeset
22 a copy of the GCC Runtime Library Exception along with this program;
kono
parents:
diff changeset
23 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
kono
parents:
diff changeset
24 <http://www.gnu.org/licenses/>. */
kono
parents:
diff changeset
25
kono
parents:
diff changeset
26 #include "libgfortran.h"
kono
parents:
diff changeset
27 #include <string.h>
kono
parents:
diff changeset
28 #include <assert.h>
kono
parents:
diff changeset
29
kono
parents:
diff changeset
30
kono
parents:
diff changeset
31 #if defined (HAVE_GFC_INTEGER_16)
kono
parents:
diff changeset
32
kono
parents:
diff changeset
33 /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
kono
parents:
diff changeset
34 passed to us by the front-end, in which case we call it for large
kono
parents:
diff changeset
35 matrices. */
kono
parents:
diff changeset
36
kono
parents:
diff changeset
37 typedef void (*blas_call)(const char *, const char *, const int *, const int *,
kono
parents:
diff changeset
38 const int *, const GFC_INTEGER_16 *, const GFC_INTEGER_16 *,
kono
parents:
diff changeset
39 const int *, const GFC_INTEGER_16 *, const int *,
kono
parents:
diff changeset
40 const GFC_INTEGER_16 *, GFC_INTEGER_16 *, const int *,
kono
parents:
diff changeset
41 int, int);
kono
parents:
diff changeset
42
kono
parents:
diff changeset
43 /* The order of loops is different in the case of plain matrix
kono
parents:
diff changeset
44 multiplication C=MATMUL(A,B), and in the frequent special case where
kono
parents:
diff changeset
45 the argument A is the temporary result of a TRANSPOSE intrinsic:
kono
parents:
diff changeset
46 C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by
kono
parents:
diff changeset
47 looking at their strides.
kono
parents:
diff changeset
48
kono
parents:
diff changeset
49 The equivalent Fortran pseudo-code is:
kono
parents:
diff changeset
50
kono
parents:
diff changeset
51 DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
kono
parents:
diff changeset
52 IF (.NOT.IS_TRANSPOSED(A)) THEN
kono
parents:
diff changeset
53 C = 0
kono
parents:
diff changeset
54 DO J=1,N
kono
parents:
diff changeset
55 DO K=1,COUNT
kono
parents:
diff changeset
56 DO I=1,M
kono
parents:
diff changeset
57 C(I,J) = C(I,J)+A(I,K)*B(K,J)
kono
parents:
diff changeset
58 ELSE
kono
parents:
diff changeset
59 DO J=1,N
kono
parents:
diff changeset
60 DO I=1,M
kono
parents:
diff changeset
61 S = 0
kono
parents:
diff changeset
62 DO K=1,COUNT
kono
parents:
diff changeset
63 S = S+A(I,K)*B(K,J)
kono
parents:
diff changeset
64 C(I,J) = S
kono
parents:
diff changeset
65 ENDIF
kono
parents:
diff changeset
66 */
kono
parents:
diff changeset
67
kono
parents:
diff changeset
68 /* If try_blas is set to a nonzero value, then the matmul function will
kono
parents:
diff changeset
69 see if there is a way to perform the matrix multiplication by a call
kono
parents:
diff changeset
70 to the BLAS gemm function. */
kono
parents:
diff changeset
71
kono
parents:
diff changeset
72 extern void matmul_i16 (gfc_array_i16 * const restrict retarray,
kono
parents:
diff changeset
73 gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
kono
parents:
diff changeset
74 int blas_limit, blas_call gemm);
kono
parents:
diff changeset
75 export_proto(matmul_i16);
kono
parents:
diff changeset
76
kono
parents:
diff changeset
77 /* Put exhaustive list of possible architectures here here, ORed together. */
kono
parents:
diff changeset
78
kono
parents:
diff changeset
79 #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
kono
parents:
diff changeset
80
kono
parents:
diff changeset
81 #ifdef HAVE_AVX
kono
parents:
diff changeset
82 static void
kono
parents:
diff changeset
83 matmul_i16_avx (gfc_array_i16 * const restrict retarray,
kono
parents:
diff changeset
84 gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
kono
parents:
diff changeset
85 int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
kono
parents:
diff changeset
86 static void
kono
parents:
diff changeset
87 matmul_i16_avx (gfc_array_i16 * const restrict retarray,
kono
parents:
diff changeset
88 gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
kono
parents:
diff changeset
89 int blas_limit, blas_call gemm)
kono
parents:
diff changeset
90 {
kono
parents:
diff changeset
91 const GFC_INTEGER_16 * restrict abase;
kono
parents:
diff changeset
92 const GFC_INTEGER_16 * restrict bbase;
kono
parents:
diff changeset
93 GFC_INTEGER_16 * restrict dest;
kono
parents:
diff changeset
94
kono
parents:
diff changeset
95 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
kono
parents:
diff changeset
96 index_type x, y, n, count, xcount, ycount;
kono
parents:
diff changeset
97
kono
parents:
diff changeset
98 assert (GFC_DESCRIPTOR_RANK (a) == 2
kono
parents:
diff changeset
99 || GFC_DESCRIPTOR_RANK (b) == 2);
kono
parents:
diff changeset
100
kono
parents:
diff changeset
101 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
kono
parents:
diff changeset
102
kono
parents:
diff changeset
103 Either A or B (but not both) can be rank 1:
kono
parents:
diff changeset
104
kono
parents:
diff changeset
105 o One-dimensional argument A is implicitly treated as a row matrix
kono
parents:
diff changeset
106 dimensioned [1,count], so xcount=1.
kono
parents:
diff changeset
107
kono
parents:
diff changeset
108 o One-dimensional argument B is implicitly treated as a column matrix
kono
parents:
diff changeset
109 dimensioned [count, 1], so ycount=1.
kono
parents:
diff changeset
110 */
kono
parents:
diff changeset
111
kono
parents:
diff changeset
112 if (retarray->base_addr == NULL)
kono
parents:
diff changeset
113 {
kono
parents:
diff changeset
114 if (GFC_DESCRIPTOR_RANK (a) == 1)
kono
parents:
diff changeset
115 {
kono
parents:
diff changeset
116 GFC_DIMENSION_SET(retarray->dim[0], 0,
kono
parents:
diff changeset
117 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
kono
parents:
diff changeset
118 }
kono
parents:
diff changeset
119 else if (GFC_DESCRIPTOR_RANK (b) == 1)
kono
parents:
diff changeset
120 {
kono
parents:
diff changeset
121 GFC_DIMENSION_SET(retarray->dim[0], 0,
kono
parents:
diff changeset
122 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
kono
parents:
diff changeset
123 }
kono
parents:
diff changeset
124 else
kono
parents:
diff changeset
125 {
kono
parents:
diff changeset
126 GFC_DIMENSION_SET(retarray->dim[0], 0,
kono
parents:
diff changeset
127 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
kono
parents:
diff changeset
128
kono
parents:
diff changeset
129 GFC_DIMENSION_SET(retarray->dim[1], 0,
kono
parents:
diff changeset
130 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
kono
parents:
diff changeset
131 GFC_DESCRIPTOR_EXTENT(retarray,0));
kono
parents:
diff changeset
132 }
kono
parents:
diff changeset
133
kono
parents:
diff changeset
134 retarray->base_addr
kono
parents:
diff changeset
135 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_16));
kono
parents:
diff changeset
136 retarray->offset = 0;
kono
parents:
diff changeset
137 }
kono
parents:
diff changeset
138 else if (unlikely (compile_options.bounds_check))
kono
parents:
diff changeset
139 {
kono
parents:
diff changeset
140 index_type ret_extent, arg_extent;
kono
parents:
diff changeset
141
kono
parents:
diff changeset
142 if (GFC_DESCRIPTOR_RANK (a) == 1)
kono
parents:
diff changeset
143 {
kono
parents:
diff changeset
144 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
kono
parents:
diff changeset
145 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
kono
parents:
diff changeset
146 if (arg_extent != ret_extent)
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
147 runtime_error ("Array bound mismatch for dimension 1 of "
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
148 "array (%ld/%ld) ",
111
kono
parents:
diff changeset
149 (long int) ret_extent, (long int) arg_extent);
kono
parents:
diff changeset
150 }
kono
parents:
diff changeset
151 else if (GFC_DESCRIPTOR_RANK (b) == 1)
kono
parents:
diff changeset
152 {
kono
parents:
diff changeset
153 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
kono
parents:
diff changeset
154 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
kono
parents:
diff changeset
155 if (arg_extent != ret_extent)
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
156 runtime_error ("Array bound mismatch for dimension 1 of "
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
157 "array (%ld/%ld) ",
111
kono
parents:
diff changeset
158 (long int) ret_extent, (long int) arg_extent);
kono
parents:
diff changeset
159 }
kono
parents:
diff changeset
160 else
kono
parents:
diff changeset
161 {
kono
parents:
diff changeset
162 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
kono
parents:
diff changeset
163 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
kono
parents:
diff changeset
164 if (arg_extent != ret_extent)
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
165 runtime_error ("Array bound mismatch for dimension 1 of "
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
166 "array (%ld/%ld) ",
111
kono
parents:
diff changeset
167 (long int) ret_extent, (long int) arg_extent);
kono
parents:
diff changeset
168
kono
parents:
diff changeset
169 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
kono
parents:
diff changeset
170 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
kono
parents:
diff changeset
171 if (arg_extent != ret_extent)
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
172 runtime_error ("Array bound mismatch for dimension 2 of "
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
173 "array (%ld/%ld) ",
111
kono
parents:
diff changeset
174 (long int) ret_extent, (long int) arg_extent);
kono
parents:
diff changeset
175 }
kono
parents:
diff changeset
176 }
kono
parents:
diff changeset
177
kono
parents:
diff changeset
178
kono
parents:
diff changeset
179 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
kono
parents:
diff changeset
180 {
kono
parents:
diff changeset
181 /* One-dimensional result may be addressed in the code below
kono
parents:
diff changeset
182 either as a row or a column matrix. We want both cases to
kono
parents:
diff changeset
183 work. */
kono
parents:
diff changeset
184 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
kono
parents:
diff changeset
185 }
kono
parents:
diff changeset
186 else
kono
parents:
diff changeset
187 {
kono
parents:
diff changeset
188 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
kono
parents:
diff changeset
189 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
kono
parents:
diff changeset
190 }
kono
parents:
diff changeset
191
kono
parents:
diff changeset
192
kono
parents:
diff changeset
193 if (GFC_DESCRIPTOR_RANK (a) == 1)
kono
parents:
diff changeset
194 {
kono
parents:
diff changeset
195 /* Treat it as a a row matrix A[1,count]. */
kono
parents:
diff changeset
196 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
kono
parents:
diff changeset
197 aystride = 1;
kono
parents:
diff changeset
198
kono
parents:
diff changeset
199 xcount = 1;
kono
parents:
diff changeset
200 count = GFC_DESCRIPTOR_EXTENT(a,0);
kono
parents:
diff changeset
201 }
kono
parents:
diff changeset
202 else
kono
parents:
diff changeset
203 {
kono
parents:
diff changeset
204 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
kono
parents:
diff changeset
205 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
kono
parents:
diff changeset
206
kono
parents:
diff changeset
207 count = GFC_DESCRIPTOR_EXTENT(a,1);
kono
parents:
diff changeset
208 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
kono
parents:
diff changeset
209 }
kono
parents:
diff changeset
210
kono
parents:
diff changeset
211 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
kono
parents:
diff changeset
212 {
kono
parents:
diff changeset
213 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
214 runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
215 "in dimension 1: is %ld, should be %ld",
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
216 (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
111
kono
parents:
diff changeset
217 }
kono
parents:
diff changeset
218
kono
parents:
diff changeset
219 if (GFC_DESCRIPTOR_RANK (b) == 1)
kono
parents:
diff changeset
220 {
kono
parents:
diff changeset
221 /* Treat it as a column matrix B[count,1] */
kono
parents:
diff changeset
222 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
kono
parents:
diff changeset
223
kono
parents:
diff changeset
224 /* bystride should never be used for 1-dimensional b.
kono
parents:
diff changeset
225 The value is only used for calculation of the
kono
parents:
diff changeset
226 memory by the buffer. */
kono
parents:
diff changeset
227 bystride = 256;
kono
parents:
diff changeset
228 ycount = 1;
kono
parents:
diff changeset
229 }
kono
parents:
diff changeset
230 else
kono
parents:
diff changeset
231 {
kono
parents:
diff changeset
232 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
kono
parents:
diff changeset
233 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
kono
parents:
diff changeset
234 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
kono
parents:
diff changeset
235 }
kono
parents:
diff changeset
236
kono
parents:
diff changeset
237 abase = a->base_addr;
kono
parents:
diff changeset
238 bbase = b->base_addr;
kono
parents:
diff changeset
239 dest = retarray->base_addr;
kono
parents:
diff changeset
240
kono
parents:
diff changeset
241 /* Now that everything is set up, we perform the multiplication
kono
parents:
diff changeset
242 itself. */
kono
parents:
diff changeset
243
kono
parents:
diff changeset
244 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
kono
parents:
diff changeset
245 #define min(a,b) ((a) <= (b) ? (a) : (b))
kono
parents:
diff changeset
246 #define max(a,b) ((a) >= (b) ? (a) : (b))
kono
parents:
diff changeset
247
kono
parents:
diff changeset
248 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
kono
parents:
diff changeset
249 && (bxstride == 1 || bystride == 1)
kono
parents:
diff changeset
250 && (((float) xcount) * ((float) ycount) * ((float) count)
kono
parents:
diff changeset
251 > POW3(blas_limit)))
kono
parents:
diff changeset
252 {
kono
parents:
diff changeset
253 const int m = xcount, n = ycount, k = count, ldc = rystride;
kono
parents:
diff changeset
254 const GFC_INTEGER_16 one = 1, zero = 0;
kono
parents:
diff changeset
255 const int lda = (axstride == 1) ? aystride : axstride,
kono
parents:
diff changeset
256 ldb = (bxstride == 1) ? bystride : bxstride;
kono
parents:
diff changeset
257
kono
parents:
diff changeset
258 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
kono
parents:
diff changeset
259 {
kono
parents:
diff changeset
260 assert (gemm != NULL);
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
261 const char *transa, *transb;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
262 if (try_blas & 2)
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
263 transa = "C";
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
264 else
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
265 transa = axstride == 1 ? "N" : "T";
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
266
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
267 if (try_blas & 4)
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
268 transb = "C";
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
269 else
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
270 transb = bxstride == 1 ? "N" : "T";
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
271
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
272 gemm (transa, transb , &m,
111
kono
parents:
diff changeset
273 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
kono
parents:
diff changeset
274 &ldc, 1, 1);
kono
parents:
diff changeset
275 return;
kono
parents:
diff changeset
276 }
kono
parents:
diff changeset
277 }
kono
parents:
diff changeset
278
kono
parents:
diff changeset
279 if (rxstride == 1 && axstride == 1 && bxstride == 1)
kono
parents:
diff changeset
280 {
kono
parents:
diff changeset
281 /* This block of code implements a tuned matmul, derived from
kono
parents:
diff changeset
282 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
kono
parents:
diff changeset
283
kono
parents:
diff changeset
284 Bo Kagstrom and Per Ling
kono
parents:
diff changeset
285 Department of Computing Science
kono
parents:
diff changeset
286 Umea University
kono
parents:
diff changeset
287 S-901 87 Umea, Sweden
kono
parents:
diff changeset
288
kono
parents:
diff changeset
289 from netlib.org, translated to C, and modified for matmul.m4. */
kono
parents:
diff changeset
290
kono
parents:
diff changeset
291 const GFC_INTEGER_16 *a, *b;
kono
parents:
diff changeset
292 GFC_INTEGER_16 *c;
kono
parents:
diff changeset
293 const index_type m = xcount, n = ycount, k = count;
kono
parents:
diff changeset
294
kono
parents:
diff changeset
295 /* System generated locals */
kono
parents:
diff changeset
296 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
kono
parents:
diff changeset
297 i1, i2, i3, i4, i5, i6;
kono
parents:
diff changeset
298
kono
parents:
diff changeset
299 /* Local variables */
kono
parents:
diff changeset
300 GFC_INTEGER_16 f11, f12, f21, f22, f31, f32, f41, f42,
kono
parents:
diff changeset
301 f13, f14, f23, f24, f33, f34, f43, f44;
kono
parents:
diff changeset
302 index_type i, j, l, ii, jj, ll;
kono
parents:
diff changeset
303 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
kono
parents:
diff changeset
304 GFC_INTEGER_16 *t1;
kono
parents:
diff changeset
305
kono
parents:
diff changeset
306 a = abase;
kono
parents:
diff changeset
307 b = bbase;
kono
parents:
diff changeset
308 c = retarray->base_addr;
kono
parents:
diff changeset
309
kono
parents:
diff changeset
310 /* Parameter adjustments */
kono
parents:
diff changeset
311 c_dim1 = rystride;
kono
parents:
diff changeset
312 c_offset = 1 + c_dim1;
kono
parents:
diff changeset
313 c -= c_offset;
kono
parents:
diff changeset
314 a_dim1 = aystride;
kono
parents:
diff changeset
315 a_offset = 1 + a_dim1;
kono
parents:
diff changeset
316 a -= a_offset;
kono
parents:
diff changeset
317 b_dim1 = bystride;
kono
parents:
diff changeset
318 b_offset = 1 + b_dim1;
kono
parents:
diff changeset
319 b -= b_offset;
kono
parents:
diff changeset
320
kono
parents:
diff changeset
321 /* Empty c first. */
kono
parents:
diff changeset
322 for (j=1; j<=n; j++)
kono
parents:
diff changeset
323 for (i=1; i<=m; i++)
kono
parents:
diff changeset
324 c[i + j * c_dim1] = (GFC_INTEGER_16)0;
kono
parents:
diff changeset
325
kono
parents:
diff changeset
326 /* Early exit if possible */
kono
parents:
diff changeset
327 if (m == 0 || n == 0 || k == 0)
kono
parents:
diff changeset
328 return;
kono
parents:
diff changeset
329
kono
parents:
diff changeset
330 /* Adjust size of t1 to what is needed. */
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
331 index_type t1_dim, a_sz;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
332 if (aystride == 1)
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
333 a_sz = rystride;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
334 else
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
335 a_sz = a_dim1;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
336
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
337 t1_dim = a_sz * 256 + b_dim1;
111
kono
parents:
diff changeset
338 if (t1_dim > 65536)
kono
parents:
diff changeset
339 t1_dim = 65536;
kono
parents:
diff changeset
340
kono
parents:
diff changeset
341 t1 = malloc (t1_dim * sizeof(GFC_INTEGER_16));
kono
parents:
diff changeset
342
kono
parents:
diff changeset
343 /* Start turning the crank. */
kono
parents:
diff changeset
344 i1 = n;
kono
parents:
diff changeset
345 for (jj = 1; jj <= i1; jj += 512)
kono
parents:
diff changeset
346 {
kono
parents:
diff changeset
347 /* Computing MIN */
kono
parents:
diff changeset
348 i2 = 512;
kono
parents:
diff changeset
349 i3 = n - jj + 1;
kono
parents:
diff changeset
350 jsec = min(i2,i3);
kono
parents:
diff changeset
351 ujsec = jsec - jsec % 4;
kono
parents:
diff changeset
352 i2 = k;
kono
parents:
diff changeset
353 for (ll = 1; ll <= i2; ll += 256)
kono
parents:
diff changeset
354 {
kono
parents:
diff changeset
355 /* Computing MIN */
kono
parents:
diff changeset
356 i3 = 256;
kono
parents:
diff changeset
357 i4 = k - ll + 1;
kono
parents:
diff changeset
358 lsec = min(i3,i4);
kono
parents:
diff changeset
359 ulsec = lsec - lsec % 2;
kono
parents:
diff changeset
360
kono
parents:
diff changeset
361 i3 = m;
kono
parents:
diff changeset
362 for (ii = 1; ii <= i3; ii += 256)
kono
parents:
diff changeset
363 {
kono
parents:
diff changeset
364 /* Computing MIN */
kono
parents:
diff changeset
365 i4 = 256;
kono
parents:
diff changeset
366 i5 = m - ii + 1;
kono
parents:
diff changeset
367 isec = min(i4,i5);
kono
parents:
diff changeset
368 uisec = isec - isec % 2;
kono
parents:
diff changeset
369 i4 = ll + ulsec - 1;
kono
parents:
diff changeset
370 for (l = ll; l <= i4; l += 2)
kono
parents:
diff changeset
371 {
kono
parents:
diff changeset
372 i5 = ii + uisec - 1;
kono
parents:
diff changeset
373 for (i = ii; i <= i5; i += 2)
kono
parents:
diff changeset
374 {
kono
parents:
diff changeset
375 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
kono
parents:
diff changeset
376 a[i + l * a_dim1];
kono
parents:
diff changeset
377 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
kono
parents:
diff changeset
378 a[i + (l + 1) * a_dim1];
kono
parents:
diff changeset
379 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
kono
parents:
diff changeset
380 a[i + 1 + l * a_dim1];
kono
parents:
diff changeset
381 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
kono
parents:
diff changeset
382 a[i + 1 + (l + 1) * a_dim1];
kono
parents:
diff changeset
383 }
kono
parents:
diff changeset
384 if (uisec < isec)
kono
parents:
diff changeset
385 {
kono
parents:
diff changeset
386 t1[l - ll + 1 + (isec << 8) - 257] =
kono
parents:
diff changeset
387 a[ii + isec - 1 + l * a_dim1];
kono
parents:
diff changeset
388 t1[l - ll + 2 + (isec << 8) - 257] =
kono
parents:
diff changeset
389 a[ii + isec - 1 + (l + 1) * a_dim1];
kono
parents:
diff changeset
390 }
kono
parents:
diff changeset
391 }
kono
parents:
diff changeset
392 if (ulsec < lsec)
kono
parents:
diff changeset
393 {
kono
parents:
diff changeset
394 i4 = ii + isec - 1;
kono
parents:
diff changeset
395 for (i = ii; i<= i4; ++i)
kono
parents:
diff changeset
396 {
kono
parents:
diff changeset
397 t1[lsec + ((i - ii + 1) << 8) - 257] =
kono
parents:
diff changeset
398 a[i + (ll + lsec - 1) * a_dim1];
kono
parents:
diff changeset
399 }
kono
parents:
diff changeset
400 }
kono
parents:
diff changeset
401
kono
parents:
diff changeset
402 uisec = isec - isec % 4;
kono
parents:
diff changeset
403 i4 = jj + ujsec - 1;
kono
parents:
diff changeset
404 for (j = jj; j <= i4; j += 4)
kono
parents:
diff changeset
405 {
kono
parents:
diff changeset
406 i5 = ii + uisec - 1;
kono
parents:
diff changeset
407 for (i = ii; i <= i5; i += 4)
kono
parents:
diff changeset
408 {
kono
parents:
diff changeset
409 f11 = c[i + j * c_dim1];
kono
parents:
diff changeset
410 f21 = c[i + 1 + j * c_dim1];
kono
parents:
diff changeset
411 f12 = c[i + (j + 1) * c_dim1];
kono
parents:
diff changeset
412 f22 = c[i + 1 + (j + 1) * c_dim1];
kono
parents:
diff changeset
413 f13 = c[i + (j + 2) * c_dim1];
kono
parents:
diff changeset
414 f23 = c[i + 1 + (j + 2) * c_dim1];
kono
parents:
diff changeset
415 f14 = c[i + (j + 3) * c_dim1];
kono
parents:
diff changeset
416 f24 = c[i + 1 + (j + 3) * c_dim1];
kono
parents:
diff changeset
417 f31 = c[i + 2 + j * c_dim1];
kono
parents:
diff changeset
418 f41 = c[i + 3 + j * c_dim1];
kono
parents:
diff changeset
419 f32 = c[i + 2 + (j + 1) * c_dim1];
kono
parents:
diff changeset
420 f42 = c[i + 3 + (j + 1) * c_dim1];
kono
parents:
diff changeset
421 f33 = c[i + 2 + (j + 2) * c_dim1];
kono
parents:
diff changeset
422 f43 = c[i + 3 + (j + 2) * c_dim1];
kono
parents:
diff changeset
423 f34 = c[i + 2 + (j + 3) * c_dim1];
kono
parents:
diff changeset
424 f44 = c[i + 3 + (j + 3) * c_dim1];
kono
parents:
diff changeset
425 i6 = ll + lsec - 1;
kono
parents:
diff changeset
426 for (l = ll; l <= i6; ++l)
kono
parents:
diff changeset
427 {
kono
parents:
diff changeset
428 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
kono
parents:
diff changeset
429 * b[l + j * b_dim1];
kono
parents:
diff changeset
430 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
kono
parents:
diff changeset
431 * b[l + j * b_dim1];
kono
parents:
diff changeset
432 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
kono
parents:
diff changeset
433 * b[l + (j + 1) * b_dim1];
kono
parents:
diff changeset
434 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
kono
parents:
diff changeset
435 * b[l + (j + 1) * b_dim1];
kono
parents:
diff changeset
436 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
kono
parents:
diff changeset
437 * b[l + (j + 2) * b_dim1];
kono
parents:
diff changeset
438 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
kono
parents:
diff changeset
439 * b[l + (j + 2) * b_dim1];
kono
parents:
diff changeset
440 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
kono
parents:
diff changeset
441 * b[l + (j + 3) * b_dim1];
kono
parents:
diff changeset
442 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
kono
parents:
diff changeset
443 * b[l + (j + 3) * b_dim1];
kono
parents:
diff changeset
444 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
kono
parents:
diff changeset
445 * b[l + j * b_dim1];
kono
parents:
diff changeset
446 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
kono
parents:
diff changeset
447 * b[l + j * b_dim1];
kono
parents:
diff changeset
448 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
kono
parents:
diff changeset
449 * b[l + (j + 1) * b_dim1];
kono
parents:
diff changeset
450 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
kono
parents:
diff changeset
451 * b[l + (j + 1) * b_dim1];
kono
parents:
diff changeset
452 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
kono
parents:
diff changeset
453 * b[l + (j + 2) * b_dim1];
kono
parents:
diff changeset
454 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
kono
parents:
diff changeset
455 * b[l + (j + 2) * b_dim1];
kono
parents:
diff changeset
456 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
kono
parents:
diff changeset
457 * b[l + (j + 3) * b_dim1];
kono
parents:
diff changeset
458 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
kono
parents:
diff changeset
459 * b[l + (j + 3) * b_dim1];
kono
parents:
diff changeset
460 }
kono
parents:
diff changeset
461 c[i + j * c_dim1] = f11;
kono
parents:
diff changeset
462 c[i + 1 + j * c_dim1] = f21;
kono
parents:
diff changeset
463 c[i + (j + 1) * c_dim1] = f12;
kono
parents:
diff changeset
464 c[i + 1 + (j + 1) * c_dim1] = f22;
kono
parents:
diff changeset
465 c[i + (j + 2) * c_dim1] = f13;
kono
parents:
diff changeset
466 c[i + 1 + (j + 2) * c_dim1] = f23;
kono
parents:
diff changeset
467 c[i + (j + 3) * c_dim1] = f14;
kono
parents:
diff changeset
468 c[i + 1 + (j + 3) * c_dim1] = f24;
kono
parents:
diff changeset
469 c[i + 2 + j * c_dim1] = f31;
kono
parents:
diff changeset
470 c[i + 3 + j * c_dim1] = f41;
kono
parents:
diff changeset
471 c[i + 2 + (j + 1) * c_dim1] = f32;
kono
parents:
diff changeset
472 c[i + 3 + (j + 1) * c_dim1] = f42;
kono
parents:
diff changeset
473 c[i + 2 + (j + 2) * c_dim1] = f33;
kono
parents:
diff changeset
474 c[i + 3 + (j + 2) * c_dim1] = f43;
kono
parents:
diff changeset
475 c[i + 2 + (j + 3) * c_dim1] = f34;
kono
parents:
diff changeset
476 c[i + 3 + (j + 3) * c_dim1] = f44;
kono
parents:
diff changeset
477 }
kono
parents:
diff changeset
478 if (uisec < isec)
kono
parents:
diff changeset
479 {
kono
parents:
diff changeset
480 i5 = ii + isec - 1;
kono
parents:
diff changeset
481 for (i = ii + uisec; i <= i5; ++i)
kono
parents:
diff changeset
482 {
kono
parents:
diff changeset
483 f11 = c[i + j * c_dim1];
kono
parents:
diff changeset
484 f12 = c[i + (j + 1) * c_dim1];
kono
parents:
diff changeset
485 f13 = c[i + (j + 2) * c_dim1];
kono
parents:
diff changeset
486 f14 = c[i + (j + 3) * c_dim1];
kono
parents:
diff changeset
487 i6 = ll + lsec - 1;
kono
parents:
diff changeset
488 for (l = ll; l <= i6; ++l)
kono
parents:
diff changeset
489 {
kono
parents:
diff changeset
490 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
491 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
492 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
493 257] * b[l + (j + 1) * b_dim1];
kono
parents:
diff changeset
494 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
495 257] * b[l + (j + 2) * b_dim1];
kono
parents:
diff changeset
496 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
497 257] * b[l + (j + 3) * b_dim1];
kono
parents:
diff changeset
498 }
kono
parents:
diff changeset
499 c[i + j * c_dim1] = f11;
kono
parents:
diff changeset
500 c[i + (j + 1) * c_dim1] = f12;
kono
parents:
diff changeset
501 c[i + (j + 2) * c_dim1] = f13;
kono
parents:
diff changeset
502 c[i + (j + 3) * c_dim1] = f14;
kono
parents:
diff changeset
503 }
kono
parents:
diff changeset
504 }
kono
parents:
diff changeset
505 }
kono
parents:
diff changeset
506 if (ujsec < jsec)
kono
parents:
diff changeset
507 {
kono
parents:
diff changeset
508 i4 = jj + jsec - 1;
kono
parents:
diff changeset
509 for (j = jj + ujsec; j <= i4; ++j)
kono
parents:
diff changeset
510 {
kono
parents:
diff changeset
511 i5 = ii + uisec - 1;
kono
parents:
diff changeset
512 for (i = ii; i <= i5; i += 4)
kono
parents:
diff changeset
513 {
kono
parents:
diff changeset
514 f11 = c[i + j * c_dim1];
kono
parents:
diff changeset
515 f21 = c[i + 1 + j * c_dim1];
kono
parents:
diff changeset
516 f31 = c[i + 2 + j * c_dim1];
kono
parents:
diff changeset
517 f41 = c[i + 3 + j * c_dim1];
kono
parents:
diff changeset
518 i6 = ll + lsec - 1;
kono
parents:
diff changeset
519 for (l = ll; l <= i6; ++l)
kono
parents:
diff changeset
520 {
kono
parents:
diff changeset
521 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
522 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
523 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
kono
parents:
diff changeset
524 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
525 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
kono
parents:
diff changeset
526 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
527 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
kono
parents:
diff changeset
528 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
529 }
kono
parents:
diff changeset
530 c[i + j * c_dim1] = f11;
kono
parents:
diff changeset
531 c[i + 1 + j * c_dim1] = f21;
kono
parents:
diff changeset
532 c[i + 2 + j * c_dim1] = f31;
kono
parents:
diff changeset
533 c[i + 3 + j * c_dim1] = f41;
kono
parents:
diff changeset
534 }
kono
parents:
diff changeset
535 i5 = ii + isec - 1;
kono
parents:
diff changeset
536 for (i = ii + uisec; i <= i5; ++i)
kono
parents:
diff changeset
537 {
kono
parents:
diff changeset
538 f11 = c[i + j * c_dim1];
kono
parents:
diff changeset
539 i6 = ll + lsec - 1;
kono
parents:
diff changeset
540 for (l = ll; l <= i6; ++l)
kono
parents:
diff changeset
541 {
kono
parents:
diff changeset
542 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
543 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
544 }
kono
parents:
diff changeset
545 c[i + j * c_dim1] = f11;
kono
parents:
diff changeset
546 }
kono
parents:
diff changeset
547 }
kono
parents:
diff changeset
548 }
kono
parents:
diff changeset
549 }
kono
parents:
diff changeset
550 }
kono
parents:
diff changeset
551 }
kono
parents:
diff changeset
552 free(t1);
kono
parents:
diff changeset
553 return;
kono
parents:
diff changeset
554 }
kono
parents:
diff changeset
555 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
kono
parents:
diff changeset
556 {
kono
parents:
diff changeset
557 if (GFC_DESCRIPTOR_RANK (a) != 1)
kono
parents:
diff changeset
558 {
kono
parents:
diff changeset
559 const GFC_INTEGER_16 *restrict abase_x;
kono
parents:
diff changeset
560 const GFC_INTEGER_16 *restrict bbase_y;
kono
parents:
diff changeset
561 GFC_INTEGER_16 *restrict dest_y;
kono
parents:
diff changeset
562 GFC_INTEGER_16 s;
kono
parents:
diff changeset
563
kono
parents:
diff changeset
564 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
565 {
kono
parents:
diff changeset
566 bbase_y = &bbase[y*bystride];
kono
parents:
diff changeset
567 dest_y = &dest[y*rystride];
kono
parents:
diff changeset
568 for (x = 0; x < xcount; x++)
kono
parents:
diff changeset
569 {
kono
parents:
diff changeset
570 abase_x = &abase[x*axstride];
kono
parents:
diff changeset
571 s = (GFC_INTEGER_16) 0;
kono
parents:
diff changeset
572 for (n = 0; n < count; n++)
kono
parents:
diff changeset
573 s += abase_x[n] * bbase_y[n];
kono
parents:
diff changeset
574 dest_y[x] = s;
kono
parents:
diff changeset
575 }
kono
parents:
diff changeset
576 }
kono
parents:
diff changeset
577 }
kono
parents:
diff changeset
578 else
kono
parents:
diff changeset
579 {
kono
parents:
diff changeset
580 const GFC_INTEGER_16 *restrict bbase_y;
kono
parents:
diff changeset
581 GFC_INTEGER_16 s;
kono
parents:
diff changeset
582
kono
parents:
diff changeset
583 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
584 {
kono
parents:
diff changeset
585 bbase_y = &bbase[y*bystride];
kono
parents:
diff changeset
586 s = (GFC_INTEGER_16) 0;
kono
parents:
diff changeset
587 for (n = 0; n < count; n++)
kono
parents:
diff changeset
588 s += abase[n*axstride] * bbase_y[n];
kono
parents:
diff changeset
589 dest[y*rystride] = s;
kono
parents:
diff changeset
590 }
kono
parents:
diff changeset
591 }
kono
parents:
diff changeset
592 }
kono
parents:
diff changeset
593 else if (axstride < aystride)
kono
parents:
diff changeset
594 {
kono
parents:
diff changeset
595 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
596 for (x = 0; x < xcount; x++)
kono
parents:
diff changeset
597 dest[x*rxstride + y*rystride] = (GFC_INTEGER_16)0;
kono
parents:
diff changeset
598
kono
parents:
diff changeset
599 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
600 for (n = 0; n < count; n++)
kono
parents:
diff changeset
601 for (x = 0; x < xcount; x++)
kono
parents:
diff changeset
602 /* dest[x,y] += a[x,n] * b[n,y] */
kono
parents:
diff changeset
603 dest[x*rxstride + y*rystride] +=
kono
parents:
diff changeset
604 abase[x*axstride + n*aystride] *
kono
parents:
diff changeset
605 bbase[n*bxstride + y*bystride];
kono
parents:
diff changeset
606 }
kono
parents:
diff changeset
607 else if (GFC_DESCRIPTOR_RANK (a) == 1)
kono
parents:
diff changeset
608 {
kono
parents:
diff changeset
609 const GFC_INTEGER_16 *restrict bbase_y;
kono
parents:
diff changeset
610 GFC_INTEGER_16 s;
kono
parents:
diff changeset
611
kono
parents:
diff changeset
612 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
613 {
kono
parents:
diff changeset
614 bbase_y = &bbase[y*bystride];
kono
parents:
diff changeset
615 s = (GFC_INTEGER_16) 0;
kono
parents:
diff changeset
616 for (n = 0; n < count; n++)
kono
parents:
diff changeset
617 s += abase[n*axstride] * bbase_y[n*bxstride];
kono
parents:
diff changeset
618 dest[y*rxstride] = s;
kono
parents:
diff changeset
619 }
kono
parents:
diff changeset
620 }
kono
parents:
diff changeset
621 else
kono
parents:
diff changeset
622 {
kono
parents:
diff changeset
623 const GFC_INTEGER_16 *restrict abase_x;
kono
parents:
diff changeset
624 const GFC_INTEGER_16 *restrict bbase_y;
kono
parents:
diff changeset
625 GFC_INTEGER_16 *restrict dest_y;
kono
parents:
diff changeset
626 GFC_INTEGER_16 s;
kono
parents:
diff changeset
627
kono
parents:
diff changeset
628 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
629 {
kono
parents:
diff changeset
630 bbase_y = &bbase[y*bystride];
kono
parents:
diff changeset
631 dest_y = &dest[y*rystride];
kono
parents:
diff changeset
632 for (x = 0; x < xcount; x++)
kono
parents:
diff changeset
633 {
kono
parents:
diff changeset
634 abase_x = &abase[x*axstride];
kono
parents:
diff changeset
635 s = (GFC_INTEGER_16) 0;
kono
parents:
diff changeset
636 for (n = 0; n < count; n++)
kono
parents:
diff changeset
637 s += abase_x[n*aystride] * bbase_y[n*bxstride];
kono
parents:
diff changeset
638 dest_y[x*rxstride] = s;
kono
parents:
diff changeset
639 }
kono
parents:
diff changeset
640 }
kono
parents:
diff changeset
641 }
kono
parents:
diff changeset
642 }
kono
parents:
diff changeset
643 #undef POW3
kono
parents:
diff changeset
644 #undef min
kono
parents:
diff changeset
645 #undef max
kono
parents:
diff changeset
646
kono
parents:
diff changeset
647 #endif /* HAVE_AVX */
kono
parents:
diff changeset
648
kono
parents:
diff changeset
649 #ifdef HAVE_AVX2
kono
parents:
diff changeset
650 static void
kono
parents:
diff changeset
651 matmul_i16_avx2 (gfc_array_i16 * const restrict retarray,
kono
parents:
diff changeset
652 gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
kono
parents:
diff changeset
653 int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
kono
parents:
diff changeset
654 static void
kono
parents:
diff changeset
655 matmul_i16_avx2 (gfc_array_i16 * const restrict retarray,
kono
parents:
diff changeset
656 gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
kono
parents:
diff changeset
657 int blas_limit, blas_call gemm)
kono
parents:
diff changeset
658 {
kono
parents:
diff changeset
659 const GFC_INTEGER_16 * restrict abase;
kono
parents:
diff changeset
660 const GFC_INTEGER_16 * restrict bbase;
kono
parents:
diff changeset
661 GFC_INTEGER_16 * restrict dest;
kono
parents:
diff changeset
662
kono
parents:
diff changeset
663 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
kono
parents:
diff changeset
664 index_type x, y, n, count, xcount, ycount;
kono
parents:
diff changeset
665
kono
parents:
diff changeset
666 assert (GFC_DESCRIPTOR_RANK (a) == 2
kono
parents:
diff changeset
667 || GFC_DESCRIPTOR_RANK (b) == 2);
kono
parents:
diff changeset
668
kono
parents:
diff changeset
669 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
kono
parents:
diff changeset
670
kono
parents:
diff changeset
671 Either A or B (but not both) can be rank 1:
kono
parents:
diff changeset
672
kono
parents:
diff changeset
673 o One-dimensional argument A is implicitly treated as a row matrix
kono
parents:
diff changeset
674 dimensioned [1,count], so xcount=1.
kono
parents:
diff changeset
675
kono
parents:
diff changeset
676 o One-dimensional argument B is implicitly treated as a column matrix
kono
parents:
diff changeset
677 dimensioned [count, 1], so ycount=1.
kono
parents:
diff changeset
678 */
kono
parents:
diff changeset
679
kono
parents:
diff changeset
680 if (retarray->base_addr == NULL)
kono
parents:
diff changeset
681 {
kono
parents:
diff changeset
682 if (GFC_DESCRIPTOR_RANK (a) == 1)
kono
parents:
diff changeset
683 {
kono
parents:
diff changeset
684 GFC_DIMENSION_SET(retarray->dim[0], 0,
kono
parents:
diff changeset
685 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
kono
parents:
diff changeset
686 }
kono
parents:
diff changeset
687 else if (GFC_DESCRIPTOR_RANK (b) == 1)
kono
parents:
diff changeset
688 {
kono
parents:
diff changeset
689 GFC_DIMENSION_SET(retarray->dim[0], 0,
kono
parents:
diff changeset
690 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
kono
parents:
diff changeset
691 }
kono
parents:
diff changeset
692 else
kono
parents:
diff changeset
693 {
kono
parents:
diff changeset
694 GFC_DIMENSION_SET(retarray->dim[0], 0,
kono
parents:
diff changeset
695 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
kono
parents:
diff changeset
696
kono
parents:
diff changeset
697 GFC_DIMENSION_SET(retarray->dim[1], 0,
kono
parents:
diff changeset
698 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
kono
parents:
diff changeset
699 GFC_DESCRIPTOR_EXTENT(retarray,0));
kono
parents:
diff changeset
700 }
kono
parents:
diff changeset
701
kono
parents:
diff changeset
702 retarray->base_addr
kono
parents:
diff changeset
703 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_16));
kono
parents:
diff changeset
704 retarray->offset = 0;
kono
parents:
diff changeset
705 }
kono
parents:
diff changeset
706 else if (unlikely (compile_options.bounds_check))
kono
parents:
diff changeset
707 {
kono
parents:
diff changeset
708 index_type ret_extent, arg_extent;
kono
parents:
diff changeset
709
kono
parents:
diff changeset
710 if (GFC_DESCRIPTOR_RANK (a) == 1)
kono
parents:
diff changeset
711 {
kono
parents:
diff changeset
712 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
kono
parents:
diff changeset
713 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
kono
parents:
diff changeset
714 if (arg_extent != ret_extent)
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
715 runtime_error ("Array bound mismatch for dimension 1 of "
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
716 "array (%ld/%ld) ",
111
kono
parents:
diff changeset
717 (long int) ret_extent, (long int) arg_extent);
kono
parents:
diff changeset
718 }
kono
parents:
diff changeset
719 else if (GFC_DESCRIPTOR_RANK (b) == 1)
kono
parents:
diff changeset
720 {
kono
parents:
diff changeset
721 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
kono
parents:
diff changeset
722 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
kono
parents:
diff changeset
723 if (arg_extent != ret_extent)
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
724 runtime_error ("Array bound mismatch for dimension 1 of "
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
725 "array (%ld/%ld) ",
111
kono
parents:
diff changeset
726 (long int) ret_extent, (long int) arg_extent);
kono
parents:
diff changeset
727 }
kono
parents:
diff changeset
728 else
kono
parents:
diff changeset
729 {
kono
parents:
diff changeset
730 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
kono
parents:
diff changeset
731 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
kono
parents:
diff changeset
732 if (arg_extent != ret_extent)
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
733 runtime_error ("Array bound mismatch for dimension 1 of "
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
734 "array (%ld/%ld) ",
111
kono
parents:
diff changeset
735 (long int) ret_extent, (long int) arg_extent);
kono
parents:
diff changeset
736
kono
parents:
diff changeset
737 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
kono
parents:
diff changeset
738 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
kono
parents:
diff changeset
739 if (arg_extent != ret_extent)
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
740 runtime_error ("Array bound mismatch for dimension 2 of "
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
741 "array (%ld/%ld) ",
111
kono
parents:
diff changeset
742 (long int) ret_extent, (long int) arg_extent);
kono
parents:
diff changeset
743 }
kono
parents:
diff changeset
744 }
kono
parents:
diff changeset
745
kono
parents:
diff changeset
746
kono
parents:
diff changeset
747 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
kono
parents:
diff changeset
748 {
kono
parents:
diff changeset
749 /* One-dimensional result may be addressed in the code below
kono
parents:
diff changeset
750 either as a row or a column matrix. We want both cases to
kono
parents:
diff changeset
751 work. */
kono
parents:
diff changeset
752 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
kono
parents:
diff changeset
753 }
kono
parents:
diff changeset
754 else
kono
parents:
diff changeset
755 {
kono
parents:
diff changeset
756 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
kono
parents:
diff changeset
757 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
kono
parents:
diff changeset
758 }
kono
parents:
diff changeset
759
kono
parents:
diff changeset
760
kono
parents:
diff changeset
761 if (GFC_DESCRIPTOR_RANK (a) == 1)
kono
parents:
diff changeset
762 {
kono
parents:
diff changeset
763 /* Treat it as a a row matrix A[1,count]. */
kono
parents:
diff changeset
764 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
kono
parents:
diff changeset
765 aystride = 1;
kono
parents:
diff changeset
766
kono
parents:
diff changeset
767 xcount = 1;
kono
parents:
diff changeset
768 count = GFC_DESCRIPTOR_EXTENT(a,0);
kono
parents:
diff changeset
769 }
kono
parents:
diff changeset
770 else
kono
parents:
diff changeset
771 {
kono
parents:
diff changeset
772 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
kono
parents:
diff changeset
773 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
kono
parents:
diff changeset
774
kono
parents:
diff changeset
775 count = GFC_DESCRIPTOR_EXTENT(a,1);
kono
parents:
diff changeset
776 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
kono
parents:
diff changeset
777 }
kono
parents:
diff changeset
778
kono
parents:
diff changeset
779 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
kono
parents:
diff changeset
780 {
kono
parents:
diff changeset
781 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
782 runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
783 "in dimension 1: is %ld, should be %ld",
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
784 (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
111
kono
parents:
diff changeset
785 }
kono
parents:
diff changeset
786
kono
parents:
diff changeset
787 if (GFC_DESCRIPTOR_RANK (b) == 1)
kono
parents:
diff changeset
788 {
kono
parents:
diff changeset
789 /* Treat it as a column matrix B[count,1] */
kono
parents:
diff changeset
790 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
kono
parents:
diff changeset
791
kono
parents:
diff changeset
792 /* bystride should never be used for 1-dimensional b.
kono
parents:
diff changeset
793 The value is only used for calculation of the
kono
parents:
diff changeset
794 memory by the buffer. */
kono
parents:
diff changeset
795 bystride = 256;
kono
parents:
diff changeset
796 ycount = 1;
kono
parents:
diff changeset
797 }
kono
parents:
diff changeset
798 else
kono
parents:
diff changeset
799 {
kono
parents:
diff changeset
800 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
kono
parents:
diff changeset
801 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
kono
parents:
diff changeset
802 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
kono
parents:
diff changeset
803 }
kono
parents:
diff changeset
804
kono
parents:
diff changeset
805 abase = a->base_addr;
kono
parents:
diff changeset
806 bbase = b->base_addr;
kono
parents:
diff changeset
807 dest = retarray->base_addr;
kono
parents:
diff changeset
808
kono
parents:
diff changeset
809 /* Now that everything is set up, we perform the multiplication
kono
parents:
diff changeset
810 itself. */
kono
parents:
diff changeset
811
kono
parents:
diff changeset
812 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
kono
parents:
diff changeset
813 #define min(a,b) ((a) <= (b) ? (a) : (b))
kono
parents:
diff changeset
814 #define max(a,b) ((a) >= (b) ? (a) : (b))
kono
parents:
diff changeset
815
kono
parents:
diff changeset
816 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
kono
parents:
diff changeset
817 && (bxstride == 1 || bystride == 1)
kono
parents:
diff changeset
818 && (((float) xcount) * ((float) ycount) * ((float) count)
kono
parents:
diff changeset
819 > POW3(blas_limit)))
kono
parents:
diff changeset
820 {
kono
parents:
diff changeset
821 const int m = xcount, n = ycount, k = count, ldc = rystride;
kono
parents:
diff changeset
822 const GFC_INTEGER_16 one = 1, zero = 0;
kono
parents:
diff changeset
823 const int lda = (axstride == 1) ? aystride : axstride,
kono
parents:
diff changeset
824 ldb = (bxstride == 1) ? bystride : bxstride;
kono
parents:
diff changeset
825
kono
parents:
diff changeset
826 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
kono
parents:
diff changeset
827 {
kono
parents:
diff changeset
828 assert (gemm != NULL);
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
829 const char *transa, *transb;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
830 if (try_blas & 2)
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
831 transa = "C";
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
832 else
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
833 transa = axstride == 1 ? "N" : "T";
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
834
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
835 if (try_blas & 4)
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
836 transb = "C";
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
837 else
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
838 transb = bxstride == 1 ? "N" : "T";
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
839
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
840 gemm (transa, transb , &m,
111
kono
parents:
diff changeset
841 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
kono
parents:
diff changeset
842 &ldc, 1, 1);
kono
parents:
diff changeset
843 return;
kono
parents:
diff changeset
844 }
kono
parents:
diff changeset
845 }
kono
parents:
diff changeset
846
kono
parents:
diff changeset
847 if (rxstride == 1 && axstride == 1 && bxstride == 1)
kono
parents:
diff changeset
848 {
kono
parents:
diff changeset
849 /* This block of code implements a tuned matmul, derived from
kono
parents:
diff changeset
850 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
kono
parents:
diff changeset
851
kono
parents:
diff changeset
852 Bo Kagstrom and Per Ling
kono
parents:
diff changeset
853 Department of Computing Science
kono
parents:
diff changeset
854 Umea University
kono
parents:
diff changeset
855 S-901 87 Umea, Sweden
kono
parents:
diff changeset
856
kono
parents:
diff changeset
857 from netlib.org, translated to C, and modified for matmul.m4. */
kono
parents:
diff changeset
858
kono
parents:
diff changeset
859 const GFC_INTEGER_16 *a, *b;
kono
parents:
diff changeset
860 GFC_INTEGER_16 *c;
kono
parents:
diff changeset
861 const index_type m = xcount, n = ycount, k = count;
kono
parents:
diff changeset
862
kono
parents:
diff changeset
863 /* System generated locals */
kono
parents:
diff changeset
864 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
kono
parents:
diff changeset
865 i1, i2, i3, i4, i5, i6;
kono
parents:
diff changeset
866
kono
parents:
diff changeset
867 /* Local variables */
kono
parents:
diff changeset
868 GFC_INTEGER_16 f11, f12, f21, f22, f31, f32, f41, f42,
kono
parents:
diff changeset
869 f13, f14, f23, f24, f33, f34, f43, f44;
kono
parents:
diff changeset
870 index_type i, j, l, ii, jj, ll;
kono
parents:
diff changeset
871 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
kono
parents:
diff changeset
872 GFC_INTEGER_16 *t1;
kono
parents:
diff changeset
873
kono
parents:
diff changeset
874 a = abase;
kono
parents:
diff changeset
875 b = bbase;
kono
parents:
diff changeset
876 c = retarray->base_addr;
kono
parents:
diff changeset
877
kono
parents:
diff changeset
878 /* Parameter adjustments */
kono
parents:
diff changeset
879 c_dim1 = rystride;
kono
parents:
diff changeset
880 c_offset = 1 + c_dim1;
kono
parents:
diff changeset
881 c -= c_offset;
kono
parents:
diff changeset
882 a_dim1 = aystride;
kono
parents:
diff changeset
883 a_offset = 1 + a_dim1;
kono
parents:
diff changeset
884 a -= a_offset;
kono
parents:
diff changeset
885 b_dim1 = bystride;
kono
parents:
diff changeset
886 b_offset = 1 + b_dim1;
kono
parents:
diff changeset
887 b -= b_offset;
kono
parents:
diff changeset
888
kono
parents:
diff changeset
889 /* Empty c first. */
kono
parents:
diff changeset
890 for (j=1; j<=n; j++)
kono
parents:
diff changeset
891 for (i=1; i<=m; i++)
kono
parents:
diff changeset
892 c[i + j * c_dim1] = (GFC_INTEGER_16)0;
kono
parents:
diff changeset
893
kono
parents:
diff changeset
894 /* Early exit if possible */
kono
parents:
diff changeset
895 if (m == 0 || n == 0 || k == 0)
kono
parents:
diff changeset
896 return;
kono
parents:
diff changeset
897
kono
parents:
diff changeset
898 /* Adjust size of t1 to what is needed. */
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
899 index_type t1_dim, a_sz;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
900 if (aystride == 1)
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
901 a_sz = rystride;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
902 else
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
903 a_sz = a_dim1;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
904
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
905 t1_dim = a_sz * 256 + b_dim1;
111
kono
parents:
diff changeset
906 if (t1_dim > 65536)
kono
parents:
diff changeset
907 t1_dim = 65536;
kono
parents:
diff changeset
908
kono
parents:
diff changeset
909 t1 = malloc (t1_dim * sizeof(GFC_INTEGER_16));
kono
parents:
diff changeset
910
kono
parents:
diff changeset
911 /* Start turning the crank. */
kono
parents:
diff changeset
912 i1 = n;
kono
parents:
diff changeset
913 for (jj = 1; jj <= i1; jj += 512)
kono
parents:
diff changeset
914 {
kono
parents:
diff changeset
915 /* Computing MIN */
kono
parents:
diff changeset
916 i2 = 512;
kono
parents:
diff changeset
917 i3 = n - jj + 1;
kono
parents:
diff changeset
918 jsec = min(i2,i3);
kono
parents:
diff changeset
919 ujsec = jsec - jsec % 4;
kono
parents:
diff changeset
920 i2 = k;
kono
parents:
diff changeset
921 for (ll = 1; ll <= i2; ll += 256)
kono
parents:
diff changeset
922 {
kono
parents:
diff changeset
923 /* Computing MIN */
kono
parents:
diff changeset
924 i3 = 256;
kono
parents:
diff changeset
925 i4 = k - ll + 1;
kono
parents:
diff changeset
926 lsec = min(i3,i4);
kono
parents:
diff changeset
927 ulsec = lsec - lsec % 2;
kono
parents:
diff changeset
928
kono
parents:
diff changeset
929 i3 = m;
kono
parents:
diff changeset
930 for (ii = 1; ii <= i3; ii += 256)
kono
parents:
diff changeset
931 {
kono
parents:
diff changeset
932 /* Computing MIN */
kono
parents:
diff changeset
933 i4 = 256;
kono
parents:
diff changeset
934 i5 = m - ii + 1;
kono
parents:
diff changeset
935 isec = min(i4,i5);
kono
parents:
diff changeset
936 uisec = isec - isec % 2;
kono
parents:
diff changeset
937 i4 = ll + ulsec - 1;
kono
parents:
diff changeset
938 for (l = ll; l <= i4; l += 2)
kono
parents:
diff changeset
939 {
kono
parents:
diff changeset
940 i5 = ii + uisec - 1;
kono
parents:
diff changeset
941 for (i = ii; i <= i5; i += 2)
kono
parents:
diff changeset
942 {
kono
parents:
diff changeset
943 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
kono
parents:
diff changeset
944 a[i + l * a_dim1];
kono
parents:
diff changeset
945 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
kono
parents:
diff changeset
946 a[i + (l + 1) * a_dim1];
kono
parents:
diff changeset
947 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
kono
parents:
diff changeset
948 a[i + 1 + l * a_dim1];
kono
parents:
diff changeset
949 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
kono
parents:
diff changeset
950 a[i + 1 + (l + 1) * a_dim1];
kono
parents:
diff changeset
951 }
kono
parents:
diff changeset
952 if (uisec < isec)
kono
parents:
diff changeset
953 {
kono
parents:
diff changeset
954 t1[l - ll + 1 + (isec << 8) - 257] =
kono
parents:
diff changeset
955 a[ii + isec - 1 + l * a_dim1];
kono
parents:
diff changeset
956 t1[l - ll + 2 + (isec << 8) - 257] =
kono
parents:
diff changeset
957 a[ii + isec - 1 + (l + 1) * a_dim1];
kono
parents:
diff changeset
958 }
kono
parents:
diff changeset
959 }
kono
parents:
diff changeset
960 if (ulsec < lsec)
kono
parents:
diff changeset
961 {
kono
parents:
diff changeset
962 i4 = ii + isec - 1;
kono
parents:
diff changeset
963 for (i = ii; i<= i4; ++i)
kono
parents:
diff changeset
964 {
kono
parents:
diff changeset
965 t1[lsec + ((i - ii + 1) << 8) - 257] =
kono
parents:
diff changeset
966 a[i + (ll + lsec - 1) * a_dim1];
kono
parents:
diff changeset
967 }
kono
parents:
diff changeset
968 }
kono
parents:
diff changeset
969
kono
parents:
diff changeset
970 uisec = isec - isec % 4;
kono
parents:
diff changeset
971 i4 = jj + ujsec - 1;
kono
parents:
diff changeset
972 for (j = jj; j <= i4; j += 4)
kono
parents:
diff changeset
973 {
kono
parents:
diff changeset
974 i5 = ii + uisec - 1;
kono
parents:
diff changeset
975 for (i = ii; i <= i5; i += 4)
kono
parents:
diff changeset
976 {
kono
parents:
diff changeset
977 f11 = c[i + j * c_dim1];
kono
parents:
diff changeset
978 f21 = c[i + 1 + j * c_dim1];
kono
parents:
diff changeset
979 f12 = c[i + (j + 1) * c_dim1];
kono
parents:
diff changeset
980 f22 = c[i + 1 + (j + 1) * c_dim1];
kono
parents:
diff changeset
981 f13 = c[i + (j + 2) * c_dim1];
kono
parents:
diff changeset
982 f23 = c[i + 1 + (j + 2) * c_dim1];
kono
parents:
diff changeset
983 f14 = c[i + (j + 3) * c_dim1];
kono
parents:
diff changeset
984 f24 = c[i + 1 + (j + 3) * c_dim1];
kono
parents:
diff changeset
985 f31 = c[i + 2 + j * c_dim1];
kono
parents:
diff changeset
986 f41 = c[i + 3 + j * c_dim1];
kono
parents:
diff changeset
987 f32 = c[i + 2 + (j + 1) * c_dim1];
kono
parents:
diff changeset
988 f42 = c[i + 3 + (j + 1) * c_dim1];
kono
parents:
diff changeset
989 f33 = c[i + 2 + (j + 2) * c_dim1];
kono
parents:
diff changeset
990 f43 = c[i + 3 + (j + 2) * c_dim1];
kono
parents:
diff changeset
991 f34 = c[i + 2 + (j + 3) * c_dim1];
kono
parents:
diff changeset
992 f44 = c[i + 3 + (j + 3) * c_dim1];
kono
parents:
diff changeset
993 i6 = ll + lsec - 1;
kono
parents:
diff changeset
994 for (l = ll; l <= i6; ++l)
kono
parents:
diff changeset
995 {
kono
parents:
diff changeset
996 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
kono
parents:
diff changeset
997 * b[l + j * b_dim1];
kono
parents:
diff changeset
998 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
kono
parents:
diff changeset
999 * b[l + j * b_dim1];
kono
parents:
diff changeset
1000 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
kono
parents:
diff changeset
1001 * b[l + (j + 1) * b_dim1];
kono
parents:
diff changeset
1002 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
kono
parents:
diff changeset
1003 * b[l + (j + 1) * b_dim1];
kono
parents:
diff changeset
1004 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
kono
parents:
diff changeset
1005 * b[l + (j + 2) * b_dim1];
kono
parents:
diff changeset
1006 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
kono
parents:
diff changeset
1007 * b[l + (j + 2) * b_dim1];
kono
parents:
diff changeset
1008 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
kono
parents:
diff changeset
1009 * b[l + (j + 3) * b_dim1];
kono
parents:
diff changeset
1010 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
kono
parents:
diff changeset
1011 * b[l + (j + 3) * b_dim1];
kono
parents:
diff changeset
1012 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
kono
parents:
diff changeset
1013 * b[l + j * b_dim1];
kono
parents:
diff changeset
1014 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
kono
parents:
diff changeset
1015 * b[l + j * b_dim1];
kono
parents:
diff changeset
1016 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
kono
parents:
diff changeset
1017 * b[l + (j + 1) * b_dim1];
kono
parents:
diff changeset
1018 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
kono
parents:
diff changeset
1019 * b[l + (j + 1) * b_dim1];
kono
parents:
diff changeset
1020 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
kono
parents:
diff changeset
1021 * b[l + (j + 2) * b_dim1];
kono
parents:
diff changeset
1022 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
kono
parents:
diff changeset
1023 * b[l + (j + 2) * b_dim1];
kono
parents:
diff changeset
1024 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
kono
parents:
diff changeset
1025 * b[l + (j + 3) * b_dim1];
kono
parents:
diff changeset
1026 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
kono
parents:
diff changeset
1027 * b[l + (j + 3) * b_dim1];
kono
parents:
diff changeset
1028 }
kono
parents:
diff changeset
1029 c[i + j * c_dim1] = f11;
kono
parents:
diff changeset
1030 c[i + 1 + j * c_dim1] = f21;
kono
parents:
diff changeset
1031 c[i + (j + 1) * c_dim1] = f12;
kono
parents:
diff changeset
1032 c[i + 1 + (j + 1) * c_dim1] = f22;
kono
parents:
diff changeset
1033 c[i + (j + 2) * c_dim1] = f13;
kono
parents:
diff changeset
1034 c[i + 1 + (j + 2) * c_dim1] = f23;
kono
parents:
diff changeset
1035 c[i + (j + 3) * c_dim1] = f14;
kono
parents:
diff changeset
1036 c[i + 1 + (j + 3) * c_dim1] = f24;
kono
parents:
diff changeset
1037 c[i + 2 + j * c_dim1] = f31;
kono
parents:
diff changeset
1038 c[i + 3 + j * c_dim1] = f41;
kono
parents:
diff changeset
1039 c[i + 2 + (j + 1) * c_dim1] = f32;
kono
parents:
diff changeset
1040 c[i + 3 + (j + 1) * c_dim1] = f42;
kono
parents:
diff changeset
1041 c[i + 2 + (j + 2) * c_dim1] = f33;
kono
parents:
diff changeset
1042 c[i + 3 + (j + 2) * c_dim1] = f43;
kono
parents:
diff changeset
1043 c[i + 2 + (j + 3) * c_dim1] = f34;
kono
parents:
diff changeset
1044 c[i + 3 + (j + 3) * c_dim1] = f44;
kono
parents:
diff changeset
1045 }
kono
parents:
diff changeset
1046 if (uisec < isec)
kono
parents:
diff changeset
1047 {
kono
parents:
diff changeset
1048 i5 = ii + isec - 1;
kono
parents:
diff changeset
1049 for (i = ii + uisec; i <= i5; ++i)
kono
parents:
diff changeset
1050 {
kono
parents:
diff changeset
1051 f11 = c[i + j * c_dim1];
kono
parents:
diff changeset
1052 f12 = c[i + (j + 1) * c_dim1];
kono
parents:
diff changeset
1053 f13 = c[i + (j + 2) * c_dim1];
kono
parents:
diff changeset
1054 f14 = c[i + (j + 3) * c_dim1];
kono
parents:
diff changeset
1055 i6 = ll + lsec - 1;
kono
parents:
diff changeset
1056 for (l = ll; l <= i6; ++l)
kono
parents:
diff changeset
1057 {
kono
parents:
diff changeset
1058 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
1059 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
1060 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
1061 257] * b[l + (j + 1) * b_dim1];
kono
parents:
diff changeset
1062 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
1063 257] * b[l + (j + 2) * b_dim1];
kono
parents:
diff changeset
1064 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
1065 257] * b[l + (j + 3) * b_dim1];
kono
parents:
diff changeset
1066 }
kono
parents:
diff changeset
1067 c[i + j * c_dim1] = f11;
kono
parents:
diff changeset
1068 c[i + (j + 1) * c_dim1] = f12;
kono
parents:
diff changeset
1069 c[i + (j + 2) * c_dim1] = f13;
kono
parents:
diff changeset
1070 c[i + (j + 3) * c_dim1] = f14;
kono
parents:
diff changeset
1071 }
kono
parents:
diff changeset
1072 }
kono
parents:
diff changeset
1073 }
kono
parents:
diff changeset
1074 if (ujsec < jsec)
kono
parents:
diff changeset
1075 {
kono
parents:
diff changeset
1076 i4 = jj + jsec - 1;
kono
parents:
diff changeset
1077 for (j = jj + ujsec; j <= i4; ++j)
kono
parents:
diff changeset
1078 {
kono
parents:
diff changeset
1079 i5 = ii + uisec - 1;
kono
parents:
diff changeset
1080 for (i = ii; i <= i5; i += 4)
kono
parents:
diff changeset
1081 {
kono
parents:
diff changeset
1082 f11 = c[i + j * c_dim1];
kono
parents:
diff changeset
1083 f21 = c[i + 1 + j * c_dim1];
kono
parents:
diff changeset
1084 f31 = c[i + 2 + j * c_dim1];
kono
parents:
diff changeset
1085 f41 = c[i + 3 + j * c_dim1];
kono
parents:
diff changeset
1086 i6 = ll + lsec - 1;
kono
parents:
diff changeset
1087 for (l = ll; l <= i6; ++l)
kono
parents:
diff changeset
1088 {
kono
parents:
diff changeset
1089 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
1090 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
1091 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
kono
parents:
diff changeset
1092 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
1093 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
kono
parents:
diff changeset
1094 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
1095 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
kono
parents:
diff changeset
1096 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
1097 }
kono
parents:
diff changeset
1098 c[i + j * c_dim1] = f11;
kono
parents:
diff changeset
1099 c[i + 1 + j * c_dim1] = f21;
kono
parents:
diff changeset
1100 c[i + 2 + j * c_dim1] = f31;
kono
parents:
diff changeset
1101 c[i + 3 + j * c_dim1] = f41;
kono
parents:
diff changeset
1102 }
kono
parents:
diff changeset
1103 i5 = ii + isec - 1;
kono
parents:
diff changeset
1104 for (i = ii + uisec; i <= i5; ++i)
kono
parents:
diff changeset
1105 {
kono
parents:
diff changeset
1106 f11 = c[i + j * c_dim1];
kono
parents:
diff changeset
1107 i6 = ll + lsec - 1;
kono
parents:
diff changeset
1108 for (l = ll; l <= i6; ++l)
kono
parents:
diff changeset
1109 {
kono
parents:
diff changeset
1110 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
1111 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
1112 }
kono
parents:
diff changeset
1113 c[i + j * c_dim1] = f11;
kono
parents:
diff changeset
1114 }
kono
parents:
diff changeset
1115 }
kono
parents:
diff changeset
1116 }
kono
parents:
diff changeset
1117 }
kono
parents:
diff changeset
1118 }
kono
parents:
diff changeset
1119 }
kono
parents:
diff changeset
1120 free(t1);
kono
parents:
diff changeset
1121 return;
kono
parents:
diff changeset
1122 }
kono
parents:
diff changeset
1123 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
kono
parents:
diff changeset
1124 {
kono
parents:
diff changeset
1125 if (GFC_DESCRIPTOR_RANK (a) != 1)
kono
parents:
diff changeset
1126 {
kono
parents:
diff changeset
1127 const GFC_INTEGER_16 *restrict abase_x;
kono
parents:
diff changeset
1128 const GFC_INTEGER_16 *restrict bbase_y;
kono
parents:
diff changeset
1129 GFC_INTEGER_16 *restrict dest_y;
kono
parents:
diff changeset
1130 GFC_INTEGER_16 s;
kono
parents:
diff changeset
1131
kono
parents:
diff changeset
1132 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
1133 {
kono
parents:
diff changeset
1134 bbase_y = &bbase[y*bystride];
kono
parents:
diff changeset
1135 dest_y = &dest[y*rystride];
kono
parents:
diff changeset
1136 for (x = 0; x < xcount; x++)
kono
parents:
diff changeset
1137 {
kono
parents:
diff changeset
1138 abase_x = &abase[x*axstride];
kono
parents:
diff changeset
1139 s = (GFC_INTEGER_16) 0;
kono
parents:
diff changeset
1140 for (n = 0; n < count; n++)
kono
parents:
diff changeset
1141 s += abase_x[n] * bbase_y[n];
kono
parents:
diff changeset
1142 dest_y[x] = s;
kono
parents:
diff changeset
1143 }
kono
parents:
diff changeset
1144 }
kono
parents:
diff changeset
1145 }
kono
parents:
diff changeset
1146 else
kono
parents:
diff changeset
1147 {
kono
parents:
diff changeset
1148 const GFC_INTEGER_16 *restrict bbase_y;
kono
parents:
diff changeset
1149 GFC_INTEGER_16 s;
kono
parents:
diff changeset
1150
kono
parents:
diff changeset
1151 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
1152 {
kono
parents:
diff changeset
1153 bbase_y = &bbase[y*bystride];
kono
parents:
diff changeset
1154 s = (GFC_INTEGER_16) 0;
kono
parents:
diff changeset
1155 for (n = 0; n < count; n++)
kono
parents:
diff changeset
1156 s += abase[n*axstride] * bbase_y[n];
kono
parents:
diff changeset
1157 dest[y*rystride] = s;
kono
parents:
diff changeset
1158 }
kono
parents:
diff changeset
1159 }
kono
parents:
diff changeset
1160 }
kono
parents:
diff changeset
1161 else if (axstride < aystride)
kono
parents:
diff changeset
1162 {
kono
parents:
diff changeset
1163 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
1164 for (x = 0; x < xcount; x++)
kono
parents:
diff changeset
1165 dest[x*rxstride + y*rystride] = (GFC_INTEGER_16)0;
kono
parents:
diff changeset
1166
kono
parents:
diff changeset
1167 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
1168 for (n = 0; n < count; n++)
kono
parents:
diff changeset
1169 for (x = 0; x < xcount; x++)
kono
parents:
diff changeset
1170 /* dest[x,y] += a[x,n] * b[n,y] */
kono
parents:
diff changeset
1171 dest[x*rxstride + y*rystride] +=
kono
parents:
diff changeset
1172 abase[x*axstride + n*aystride] *
kono
parents:
diff changeset
1173 bbase[n*bxstride + y*bystride];
kono
parents:
diff changeset
1174 }
kono
parents:
diff changeset
1175 else if (GFC_DESCRIPTOR_RANK (a) == 1)
kono
parents:
diff changeset
1176 {
kono
parents:
diff changeset
1177 const GFC_INTEGER_16 *restrict bbase_y;
kono
parents:
diff changeset
1178 GFC_INTEGER_16 s;
kono
parents:
diff changeset
1179
kono
parents:
diff changeset
1180 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
1181 {
kono
parents:
diff changeset
1182 bbase_y = &bbase[y*bystride];
kono
parents:
diff changeset
1183 s = (GFC_INTEGER_16) 0;
kono
parents:
diff changeset
1184 for (n = 0; n < count; n++)
kono
parents:
diff changeset
1185 s += abase[n*axstride] * bbase_y[n*bxstride];
kono
parents:
diff changeset
1186 dest[y*rxstride] = s;
kono
parents:
diff changeset
1187 }
kono
parents:
diff changeset
1188 }
kono
parents:
diff changeset
1189 else
kono
parents:
diff changeset
1190 {
kono
parents:
diff changeset
1191 const GFC_INTEGER_16 *restrict abase_x;
kono
parents:
diff changeset
1192 const GFC_INTEGER_16 *restrict bbase_y;
kono
parents:
diff changeset
1193 GFC_INTEGER_16 *restrict dest_y;
kono
parents:
diff changeset
1194 GFC_INTEGER_16 s;
kono
parents:
diff changeset
1195
kono
parents:
diff changeset
1196 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
1197 {
kono
parents:
diff changeset
1198 bbase_y = &bbase[y*bystride];
kono
parents:
diff changeset
1199 dest_y = &dest[y*rystride];
kono
parents:
diff changeset
1200 for (x = 0; x < xcount; x++)
kono
parents:
diff changeset
1201 {
kono
parents:
diff changeset
1202 abase_x = &abase[x*axstride];
kono
parents:
diff changeset
1203 s = (GFC_INTEGER_16) 0;
kono
parents:
diff changeset
1204 for (n = 0; n < count; n++)
kono
parents:
diff changeset
1205 s += abase_x[n*aystride] * bbase_y[n*bxstride];
kono
parents:
diff changeset
1206 dest_y[x*rxstride] = s;
kono
parents:
diff changeset
1207 }
kono
parents:
diff changeset
1208 }
kono
parents:
diff changeset
1209 }
kono
parents:
diff changeset
1210 }
kono
parents:
diff changeset
1211 #undef POW3
kono
parents:
diff changeset
1212 #undef min
kono
parents:
diff changeset
1213 #undef max
kono
parents:
diff changeset
1214
kono
parents:
diff changeset
1215 #endif /* HAVE_AVX2 */
kono
parents:
diff changeset
1216
kono
parents:
diff changeset
1217 #ifdef HAVE_AVX512F
kono
parents:
diff changeset
1218 static void
kono
parents:
diff changeset
1219 matmul_i16_avx512f (gfc_array_i16 * const restrict retarray,
kono
parents:
diff changeset
1220 gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
kono
parents:
diff changeset
1221 int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
kono
parents:
diff changeset
1222 static void
kono
parents:
diff changeset
1223 matmul_i16_avx512f (gfc_array_i16 * const restrict retarray,
kono
parents:
diff changeset
1224 gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
kono
parents:
diff changeset
1225 int blas_limit, blas_call gemm)
kono
parents:
diff changeset
1226 {
kono
parents:
diff changeset
1227 const GFC_INTEGER_16 * restrict abase;
kono
parents:
diff changeset
1228 const GFC_INTEGER_16 * restrict bbase;
kono
parents:
diff changeset
1229 GFC_INTEGER_16 * restrict dest;
kono
parents:
diff changeset
1230
kono
parents:
diff changeset
1231 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
kono
parents:
diff changeset
1232 index_type x, y, n, count, xcount, ycount;
kono
parents:
diff changeset
1233
kono
parents:
diff changeset
1234 assert (GFC_DESCRIPTOR_RANK (a) == 2
kono
parents:
diff changeset
1235 || GFC_DESCRIPTOR_RANK (b) == 2);
kono
parents:
diff changeset
1236
kono
parents:
diff changeset
1237 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
kono
parents:
diff changeset
1238
kono
parents:
diff changeset
1239 Either A or B (but not both) can be rank 1:
kono
parents:
diff changeset
1240
kono
parents:
diff changeset
1241 o One-dimensional argument A is implicitly treated as a row matrix
kono
parents:
diff changeset
1242 dimensioned [1,count], so xcount=1.
kono
parents:
diff changeset
1243
kono
parents:
diff changeset
1244 o One-dimensional argument B is implicitly treated as a column matrix
kono
parents:
diff changeset
1245 dimensioned [count, 1], so ycount=1.
kono
parents:
diff changeset
1246 */
kono
parents:
diff changeset
1247
kono
parents:
diff changeset
1248 if (retarray->base_addr == NULL)
kono
parents:
diff changeset
1249 {
kono
parents:
diff changeset
1250 if (GFC_DESCRIPTOR_RANK (a) == 1)
kono
parents:
diff changeset
1251 {
kono
parents:
diff changeset
1252 GFC_DIMENSION_SET(retarray->dim[0], 0,
kono
parents:
diff changeset
1253 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
kono
parents:
diff changeset
1254 }
kono
parents:
diff changeset
1255 else if (GFC_DESCRIPTOR_RANK (b) == 1)
kono
parents:
diff changeset
1256 {
kono
parents:
diff changeset
1257 GFC_DIMENSION_SET(retarray->dim[0], 0,
kono
parents:
diff changeset
1258 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
kono
parents:
diff changeset
1259 }
kono
parents:
diff changeset
1260 else
kono
parents:
diff changeset
1261 {
kono
parents:
diff changeset
1262 GFC_DIMENSION_SET(retarray->dim[0], 0,
kono
parents:
diff changeset
1263 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
kono
parents:
diff changeset
1264
kono
parents:
diff changeset
1265 GFC_DIMENSION_SET(retarray->dim[1], 0,
kono
parents:
diff changeset
1266 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
kono
parents:
diff changeset
1267 GFC_DESCRIPTOR_EXTENT(retarray,0));
kono
parents:
diff changeset
1268 }
kono
parents:
diff changeset
1269
kono
parents:
diff changeset
1270 retarray->base_addr
kono
parents:
diff changeset
1271 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_16));
kono
parents:
diff changeset
1272 retarray->offset = 0;
kono
parents:
diff changeset
1273 }
kono
parents:
diff changeset
1274 else if (unlikely (compile_options.bounds_check))
kono
parents:
diff changeset
1275 {
kono
parents:
diff changeset
1276 index_type ret_extent, arg_extent;
kono
parents:
diff changeset
1277
kono
parents:
diff changeset
1278 if (GFC_DESCRIPTOR_RANK (a) == 1)
kono
parents:
diff changeset
1279 {
kono
parents:
diff changeset
1280 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
kono
parents:
diff changeset
1281 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
kono
parents:
diff changeset
1282 if (arg_extent != ret_extent)
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1283 runtime_error ("Array bound mismatch for dimension 1 of "
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1284 "array (%ld/%ld) ",
111
kono
parents:
diff changeset
1285 (long int) ret_extent, (long int) arg_extent);
kono
parents:
diff changeset
1286 }
kono
parents:
diff changeset
1287 else if (GFC_DESCRIPTOR_RANK (b) == 1)
kono
parents:
diff changeset
1288 {
kono
parents:
diff changeset
1289 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
kono
parents:
diff changeset
1290 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
kono
parents:
diff changeset
1291 if (arg_extent != ret_extent)
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1292 runtime_error ("Array bound mismatch for dimension 1 of "
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1293 "array (%ld/%ld) ",
111
kono
parents:
diff changeset
1294 (long int) ret_extent, (long int) arg_extent);
kono
parents:
diff changeset
1295 }
kono
parents:
diff changeset
1296 else
kono
parents:
diff changeset
1297 {
kono
parents:
diff changeset
1298 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
kono
parents:
diff changeset
1299 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
kono
parents:
diff changeset
1300 if (arg_extent != ret_extent)
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1301 runtime_error ("Array bound mismatch for dimension 1 of "
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1302 "array (%ld/%ld) ",
111
kono
parents:
diff changeset
1303 (long int) ret_extent, (long int) arg_extent);
kono
parents:
diff changeset
1304
kono
parents:
diff changeset
1305 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
kono
parents:
diff changeset
1306 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
kono
parents:
diff changeset
1307 if (arg_extent != ret_extent)
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1308 runtime_error ("Array bound mismatch for dimension 2 of "
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1309 "array (%ld/%ld) ",
111
kono
parents:
diff changeset
1310 (long int) ret_extent, (long int) arg_extent);
kono
parents:
diff changeset
1311 }
kono
parents:
diff changeset
1312 }
kono
parents:
diff changeset
1313
kono
parents:
diff changeset
1314
kono
parents:
diff changeset
1315 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
kono
parents:
diff changeset
1316 {
kono
parents:
diff changeset
1317 /* One-dimensional result may be addressed in the code below
kono
parents:
diff changeset
1318 either as a row or a column matrix. We want both cases to
kono
parents:
diff changeset
1319 work. */
kono
parents:
diff changeset
1320 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
kono
parents:
diff changeset
1321 }
kono
parents:
diff changeset
1322 else
kono
parents:
diff changeset
1323 {
kono
parents:
diff changeset
1324 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
kono
parents:
diff changeset
1325 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
kono
parents:
diff changeset
1326 }
kono
parents:
diff changeset
1327
kono
parents:
diff changeset
1328
kono
parents:
diff changeset
1329 if (GFC_DESCRIPTOR_RANK (a) == 1)
kono
parents:
diff changeset
1330 {
kono
parents:
diff changeset
1331 /* Treat it as a a row matrix A[1,count]. */
kono
parents:
diff changeset
1332 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
kono
parents:
diff changeset
1333 aystride = 1;
kono
parents:
diff changeset
1334
kono
parents:
diff changeset
1335 xcount = 1;
kono
parents:
diff changeset
1336 count = GFC_DESCRIPTOR_EXTENT(a,0);
kono
parents:
diff changeset
1337 }
kono
parents:
diff changeset
1338 else
kono
parents:
diff changeset
1339 {
kono
parents:
diff changeset
1340 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
kono
parents:
diff changeset
1341 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
kono
parents:
diff changeset
1342
kono
parents:
diff changeset
1343 count = GFC_DESCRIPTOR_EXTENT(a,1);
kono
parents:
diff changeset
1344 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
kono
parents:
diff changeset
1345 }
kono
parents:
diff changeset
1346
kono
parents:
diff changeset
1347 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
kono
parents:
diff changeset
1348 {
kono
parents:
diff changeset
1349 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1350 runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1351 "in dimension 1: is %ld, should be %ld",
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1352 (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
111
kono
parents:
diff changeset
1353 }
kono
parents:
diff changeset
1354
kono
parents:
diff changeset
1355 if (GFC_DESCRIPTOR_RANK (b) == 1)
kono
parents:
diff changeset
1356 {
kono
parents:
diff changeset
1357 /* Treat it as a column matrix B[count,1] */
kono
parents:
diff changeset
1358 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
kono
parents:
diff changeset
1359
kono
parents:
diff changeset
1360 /* bystride should never be used for 1-dimensional b.
kono
parents:
diff changeset
1361 The value is only used for calculation of the
kono
parents:
diff changeset
1362 memory by the buffer. */
kono
parents:
diff changeset
1363 bystride = 256;
kono
parents:
diff changeset
1364 ycount = 1;
kono
parents:
diff changeset
1365 }
kono
parents:
diff changeset
1366 else
kono
parents:
diff changeset
1367 {
kono
parents:
diff changeset
1368 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
kono
parents:
diff changeset
1369 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
kono
parents:
diff changeset
1370 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
kono
parents:
diff changeset
1371 }
kono
parents:
diff changeset
1372
kono
parents:
diff changeset
1373 abase = a->base_addr;
kono
parents:
diff changeset
1374 bbase = b->base_addr;
kono
parents:
diff changeset
1375 dest = retarray->base_addr;
kono
parents:
diff changeset
1376
kono
parents:
diff changeset
1377 /* Now that everything is set up, we perform the multiplication
kono
parents:
diff changeset
1378 itself. */
kono
parents:
diff changeset
1379
kono
parents:
diff changeset
1380 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
kono
parents:
diff changeset
1381 #define min(a,b) ((a) <= (b) ? (a) : (b))
kono
parents:
diff changeset
1382 #define max(a,b) ((a) >= (b) ? (a) : (b))
kono
parents:
diff changeset
1383
kono
parents:
diff changeset
1384 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
kono
parents:
diff changeset
1385 && (bxstride == 1 || bystride == 1)
kono
parents:
diff changeset
1386 && (((float) xcount) * ((float) ycount) * ((float) count)
kono
parents:
diff changeset
1387 > POW3(blas_limit)))
kono
parents:
diff changeset
1388 {
kono
parents:
diff changeset
1389 const int m = xcount, n = ycount, k = count, ldc = rystride;
kono
parents:
diff changeset
1390 const GFC_INTEGER_16 one = 1, zero = 0;
kono
parents:
diff changeset
1391 const int lda = (axstride == 1) ? aystride : axstride,
kono
parents:
diff changeset
1392 ldb = (bxstride == 1) ? bystride : bxstride;
kono
parents:
diff changeset
1393
kono
parents:
diff changeset
1394 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
kono
parents:
diff changeset
1395 {
kono
parents:
diff changeset
1396 assert (gemm != NULL);
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1397 const char *transa, *transb;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1398 if (try_blas & 2)
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1399 transa = "C";
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1400 else
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1401 transa = axstride == 1 ? "N" : "T";
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1402
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1403 if (try_blas & 4)
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1404 transb = "C";
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1405 else
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1406 transb = bxstride == 1 ? "N" : "T";
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1407
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1408 gemm (transa, transb , &m,
111
kono
parents:
diff changeset
1409 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
kono
parents:
diff changeset
1410 &ldc, 1, 1);
kono
parents:
diff changeset
1411 return;
kono
parents:
diff changeset
1412 }
kono
parents:
diff changeset
1413 }
kono
parents:
diff changeset
1414
kono
parents:
diff changeset
1415 if (rxstride == 1 && axstride == 1 && bxstride == 1)
kono
parents:
diff changeset
1416 {
kono
parents:
diff changeset
1417 /* This block of code implements a tuned matmul, derived from
kono
parents:
diff changeset
1418 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
kono
parents:
diff changeset
1419
kono
parents:
diff changeset
1420 Bo Kagstrom and Per Ling
kono
parents:
diff changeset
1421 Department of Computing Science
kono
parents:
diff changeset
1422 Umea University
kono
parents:
diff changeset
1423 S-901 87 Umea, Sweden
kono
parents:
diff changeset
1424
kono
parents:
diff changeset
1425 from netlib.org, translated to C, and modified for matmul.m4. */
kono
parents:
diff changeset
1426
kono
parents:
diff changeset
1427 const GFC_INTEGER_16 *a, *b;
kono
parents:
diff changeset
1428 GFC_INTEGER_16 *c;
kono
parents:
diff changeset
1429 const index_type m = xcount, n = ycount, k = count;
kono
parents:
diff changeset
1430
kono
parents:
diff changeset
1431 /* System generated locals */
kono
parents:
diff changeset
1432 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
kono
parents:
diff changeset
1433 i1, i2, i3, i4, i5, i6;
kono
parents:
diff changeset
1434
kono
parents:
diff changeset
1435 /* Local variables */
kono
parents:
diff changeset
1436 GFC_INTEGER_16 f11, f12, f21, f22, f31, f32, f41, f42,
kono
parents:
diff changeset
1437 f13, f14, f23, f24, f33, f34, f43, f44;
kono
parents:
diff changeset
1438 index_type i, j, l, ii, jj, ll;
kono
parents:
diff changeset
1439 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
kono
parents:
diff changeset
1440 GFC_INTEGER_16 *t1;
kono
parents:
diff changeset
1441
kono
parents:
diff changeset
1442 a = abase;
kono
parents:
diff changeset
1443 b = bbase;
kono
parents:
diff changeset
1444 c = retarray->base_addr;
kono
parents:
diff changeset
1445
kono
parents:
diff changeset
1446 /* Parameter adjustments */
kono
parents:
diff changeset
1447 c_dim1 = rystride;
kono
parents:
diff changeset
1448 c_offset = 1 + c_dim1;
kono
parents:
diff changeset
1449 c -= c_offset;
kono
parents:
diff changeset
1450 a_dim1 = aystride;
kono
parents:
diff changeset
1451 a_offset = 1 + a_dim1;
kono
parents:
diff changeset
1452 a -= a_offset;
kono
parents:
diff changeset
1453 b_dim1 = bystride;
kono
parents:
diff changeset
1454 b_offset = 1 + b_dim1;
kono
parents:
diff changeset
1455 b -= b_offset;
kono
parents:
diff changeset
1456
kono
parents:
diff changeset
1457 /* Empty c first. */
kono
parents:
diff changeset
1458 for (j=1; j<=n; j++)
kono
parents:
diff changeset
1459 for (i=1; i<=m; i++)
kono
parents:
diff changeset
1460 c[i + j * c_dim1] = (GFC_INTEGER_16)0;
kono
parents:
diff changeset
1461
kono
parents:
diff changeset
1462 /* Early exit if possible */
kono
parents:
diff changeset
1463 if (m == 0 || n == 0 || k == 0)
kono
parents:
diff changeset
1464 return;
kono
parents:
diff changeset
1465
kono
parents:
diff changeset
1466 /* Adjust size of t1 to what is needed. */
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1467 index_type t1_dim, a_sz;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1468 if (aystride == 1)
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1469 a_sz = rystride;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1470 else
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1471 a_sz = a_dim1;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1472
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1473 t1_dim = a_sz * 256 + b_dim1;
111
kono
parents:
diff changeset
1474 if (t1_dim > 65536)
kono
parents:
diff changeset
1475 t1_dim = 65536;
kono
parents:
diff changeset
1476
kono
parents:
diff changeset
1477 t1 = malloc (t1_dim * sizeof(GFC_INTEGER_16));
kono
parents:
diff changeset
1478
kono
parents:
diff changeset
1479 /* Start turning the crank. */
kono
parents:
diff changeset
1480 i1 = n;
kono
parents:
diff changeset
1481 for (jj = 1; jj <= i1; jj += 512)
kono
parents:
diff changeset
1482 {
kono
parents:
diff changeset
1483 /* Computing MIN */
kono
parents:
diff changeset
1484 i2 = 512;
kono
parents:
diff changeset
1485 i3 = n - jj + 1;
kono
parents:
diff changeset
1486 jsec = min(i2,i3);
kono
parents:
diff changeset
1487 ujsec = jsec - jsec % 4;
kono
parents:
diff changeset
1488 i2 = k;
kono
parents:
diff changeset
1489 for (ll = 1; ll <= i2; ll += 256)
kono
parents:
diff changeset
1490 {
kono
parents:
diff changeset
1491 /* Computing MIN */
kono
parents:
diff changeset
1492 i3 = 256;
kono
parents:
diff changeset
1493 i4 = k - ll + 1;
kono
parents:
diff changeset
1494 lsec = min(i3,i4);
kono
parents:
diff changeset
1495 ulsec = lsec - lsec % 2;
kono
parents:
diff changeset
1496
kono
parents:
diff changeset
1497 i3 = m;
kono
parents:
diff changeset
1498 for (ii = 1; ii <= i3; ii += 256)
kono
parents:
diff changeset
1499 {
kono
parents:
diff changeset
1500 /* Computing MIN */
kono
parents:
diff changeset
1501 i4 = 256;
kono
parents:
diff changeset
1502 i5 = m - ii + 1;
kono
parents:
diff changeset
1503 isec = min(i4,i5);
kono
parents:
diff changeset
1504 uisec = isec - isec % 2;
kono
parents:
diff changeset
1505 i4 = ll + ulsec - 1;
kono
parents:
diff changeset
1506 for (l = ll; l <= i4; l += 2)
kono
parents:
diff changeset
1507 {
kono
parents:
diff changeset
1508 i5 = ii + uisec - 1;
kono
parents:
diff changeset
1509 for (i = ii; i <= i5; i += 2)
kono
parents:
diff changeset
1510 {
kono
parents:
diff changeset
1511 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
kono
parents:
diff changeset
1512 a[i + l * a_dim1];
kono
parents:
diff changeset
1513 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
kono
parents:
diff changeset
1514 a[i + (l + 1) * a_dim1];
kono
parents:
diff changeset
1515 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
kono
parents:
diff changeset
1516 a[i + 1 + l * a_dim1];
kono
parents:
diff changeset
1517 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
kono
parents:
diff changeset
1518 a[i + 1 + (l + 1) * a_dim1];
kono
parents:
diff changeset
1519 }
kono
parents:
diff changeset
1520 if (uisec < isec)
kono
parents:
diff changeset
1521 {
kono
parents:
diff changeset
1522 t1[l - ll + 1 + (isec << 8) - 257] =
kono
parents:
diff changeset
1523 a[ii + isec - 1 + l * a_dim1];
kono
parents:
diff changeset
1524 t1[l - ll + 2 + (isec << 8) - 257] =
kono
parents:
diff changeset
1525 a[ii + isec - 1 + (l + 1) * a_dim1];
kono
parents:
diff changeset
1526 }
kono
parents:
diff changeset
1527 }
kono
parents:
diff changeset
1528 if (ulsec < lsec)
kono
parents:
diff changeset
1529 {
kono
parents:
diff changeset
1530 i4 = ii + isec - 1;
kono
parents:
diff changeset
1531 for (i = ii; i<= i4; ++i)
kono
parents:
diff changeset
1532 {
kono
parents:
diff changeset
1533 t1[lsec + ((i - ii + 1) << 8) - 257] =
kono
parents:
diff changeset
1534 a[i + (ll + lsec - 1) * a_dim1];
kono
parents:
diff changeset
1535 }
kono
parents:
diff changeset
1536 }
kono
parents:
diff changeset
1537
kono
parents:
diff changeset
1538 uisec = isec - isec % 4;
kono
parents:
diff changeset
1539 i4 = jj + ujsec - 1;
kono
parents:
diff changeset
1540 for (j = jj; j <= i4; j += 4)
kono
parents:
diff changeset
1541 {
kono
parents:
diff changeset
1542 i5 = ii + uisec - 1;
kono
parents:
diff changeset
1543 for (i = ii; i <= i5; i += 4)
kono
parents:
diff changeset
1544 {
kono
parents:
diff changeset
1545 f11 = c[i + j * c_dim1];
kono
parents:
diff changeset
1546 f21 = c[i + 1 + j * c_dim1];
kono
parents:
diff changeset
1547 f12 = c[i + (j + 1) * c_dim1];
kono
parents:
diff changeset
1548 f22 = c[i + 1 + (j + 1) * c_dim1];
kono
parents:
diff changeset
1549 f13 = c[i + (j + 2) * c_dim1];
kono
parents:
diff changeset
1550 f23 = c[i + 1 + (j + 2) * c_dim1];
kono
parents:
diff changeset
1551 f14 = c[i + (j + 3) * c_dim1];
kono
parents:
diff changeset
1552 f24 = c[i + 1 + (j + 3) * c_dim1];
kono
parents:
diff changeset
1553 f31 = c[i + 2 + j * c_dim1];
kono
parents:
diff changeset
1554 f41 = c[i + 3 + j * c_dim1];
kono
parents:
diff changeset
1555 f32 = c[i + 2 + (j + 1) * c_dim1];
kono
parents:
diff changeset
1556 f42 = c[i + 3 + (j + 1) * c_dim1];
kono
parents:
diff changeset
1557 f33 = c[i + 2 + (j + 2) * c_dim1];
kono
parents:
diff changeset
1558 f43 = c[i + 3 + (j + 2) * c_dim1];
kono
parents:
diff changeset
1559 f34 = c[i + 2 + (j + 3) * c_dim1];
kono
parents:
diff changeset
1560 f44 = c[i + 3 + (j + 3) * c_dim1];
kono
parents:
diff changeset
1561 i6 = ll + lsec - 1;
kono
parents:
diff changeset
1562 for (l = ll; l <= i6; ++l)
kono
parents:
diff changeset
1563 {
kono
parents:
diff changeset
1564 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
kono
parents:
diff changeset
1565 * b[l + j * b_dim1];
kono
parents:
diff changeset
1566 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
kono
parents:
diff changeset
1567 * b[l + j * b_dim1];
kono
parents:
diff changeset
1568 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
kono
parents:
diff changeset
1569 * b[l + (j + 1) * b_dim1];
kono
parents:
diff changeset
1570 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
kono
parents:
diff changeset
1571 * b[l + (j + 1) * b_dim1];
kono
parents:
diff changeset
1572 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
kono
parents:
diff changeset
1573 * b[l + (j + 2) * b_dim1];
kono
parents:
diff changeset
1574 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
kono
parents:
diff changeset
1575 * b[l + (j + 2) * b_dim1];
kono
parents:
diff changeset
1576 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
kono
parents:
diff changeset
1577 * b[l + (j + 3) * b_dim1];
kono
parents:
diff changeset
1578 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
kono
parents:
diff changeset
1579 * b[l + (j + 3) * b_dim1];
kono
parents:
diff changeset
1580 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
kono
parents:
diff changeset
1581 * b[l + j * b_dim1];
kono
parents:
diff changeset
1582 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
kono
parents:
diff changeset
1583 * b[l + j * b_dim1];
kono
parents:
diff changeset
1584 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
kono
parents:
diff changeset
1585 * b[l + (j + 1) * b_dim1];
kono
parents:
diff changeset
1586 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
kono
parents:
diff changeset
1587 * b[l + (j + 1) * b_dim1];
kono
parents:
diff changeset
1588 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
kono
parents:
diff changeset
1589 * b[l + (j + 2) * b_dim1];
kono
parents:
diff changeset
1590 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
kono
parents:
diff changeset
1591 * b[l + (j + 2) * b_dim1];
kono
parents:
diff changeset
1592 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
kono
parents:
diff changeset
1593 * b[l + (j + 3) * b_dim1];
kono
parents:
diff changeset
1594 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
kono
parents:
diff changeset
1595 * b[l + (j + 3) * b_dim1];
kono
parents:
diff changeset
1596 }
kono
parents:
diff changeset
1597 c[i + j * c_dim1] = f11;
kono
parents:
diff changeset
1598 c[i + 1 + j * c_dim1] = f21;
kono
parents:
diff changeset
1599 c[i + (j + 1) * c_dim1] = f12;
kono
parents:
diff changeset
1600 c[i + 1 + (j + 1) * c_dim1] = f22;
kono
parents:
diff changeset
1601 c[i + (j + 2) * c_dim1] = f13;
kono
parents:
diff changeset
1602 c[i + 1 + (j + 2) * c_dim1] = f23;
kono
parents:
diff changeset
1603 c[i + (j + 3) * c_dim1] = f14;
kono
parents:
diff changeset
1604 c[i + 1 + (j + 3) * c_dim1] = f24;
kono
parents:
diff changeset
1605 c[i + 2 + j * c_dim1] = f31;
kono
parents:
diff changeset
1606 c[i + 3 + j * c_dim1] = f41;
kono
parents:
diff changeset
1607 c[i + 2 + (j + 1) * c_dim1] = f32;
kono
parents:
diff changeset
1608 c[i + 3 + (j + 1) * c_dim1] = f42;
kono
parents:
diff changeset
1609 c[i + 2 + (j + 2) * c_dim1] = f33;
kono
parents:
diff changeset
1610 c[i + 3 + (j + 2) * c_dim1] = f43;
kono
parents:
diff changeset
1611 c[i + 2 + (j + 3) * c_dim1] = f34;
kono
parents:
diff changeset
1612 c[i + 3 + (j + 3) * c_dim1] = f44;
kono
parents:
diff changeset
1613 }
kono
parents:
diff changeset
1614 if (uisec < isec)
kono
parents:
diff changeset
1615 {
kono
parents:
diff changeset
1616 i5 = ii + isec - 1;
kono
parents:
diff changeset
1617 for (i = ii + uisec; i <= i5; ++i)
kono
parents:
diff changeset
1618 {
kono
parents:
diff changeset
1619 f11 = c[i + j * c_dim1];
kono
parents:
diff changeset
1620 f12 = c[i + (j + 1) * c_dim1];
kono
parents:
diff changeset
1621 f13 = c[i + (j + 2) * c_dim1];
kono
parents:
diff changeset
1622 f14 = c[i + (j + 3) * c_dim1];
kono
parents:
diff changeset
1623 i6 = ll + lsec - 1;
kono
parents:
diff changeset
1624 for (l = ll; l <= i6; ++l)
kono
parents:
diff changeset
1625 {
kono
parents:
diff changeset
1626 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
1627 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
1628 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
1629 257] * b[l + (j + 1) * b_dim1];
kono
parents:
diff changeset
1630 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
1631 257] * b[l + (j + 2) * b_dim1];
kono
parents:
diff changeset
1632 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
1633 257] * b[l + (j + 3) * b_dim1];
kono
parents:
diff changeset
1634 }
kono
parents:
diff changeset
1635 c[i + j * c_dim1] = f11;
kono
parents:
diff changeset
1636 c[i + (j + 1) * c_dim1] = f12;
kono
parents:
diff changeset
1637 c[i + (j + 2) * c_dim1] = f13;
kono
parents:
diff changeset
1638 c[i + (j + 3) * c_dim1] = f14;
kono
parents:
diff changeset
1639 }
kono
parents:
diff changeset
1640 }
kono
parents:
diff changeset
1641 }
kono
parents:
diff changeset
1642 if (ujsec < jsec)
kono
parents:
diff changeset
1643 {
kono
parents:
diff changeset
1644 i4 = jj + jsec - 1;
kono
parents:
diff changeset
1645 for (j = jj + ujsec; j <= i4; ++j)
kono
parents:
diff changeset
1646 {
kono
parents:
diff changeset
1647 i5 = ii + uisec - 1;
kono
parents:
diff changeset
1648 for (i = ii; i <= i5; i += 4)
kono
parents:
diff changeset
1649 {
kono
parents:
diff changeset
1650 f11 = c[i + j * c_dim1];
kono
parents:
diff changeset
1651 f21 = c[i + 1 + j * c_dim1];
kono
parents:
diff changeset
1652 f31 = c[i + 2 + j * c_dim1];
kono
parents:
diff changeset
1653 f41 = c[i + 3 + j * c_dim1];
kono
parents:
diff changeset
1654 i6 = ll + lsec - 1;
kono
parents:
diff changeset
1655 for (l = ll; l <= i6; ++l)
kono
parents:
diff changeset
1656 {
kono
parents:
diff changeset
1657 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
1658 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
1659 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
kono
parents:
diff changeset
1660 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
1661 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
kono
parents:
diff changeset
1662 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
1663 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
kono
parents:
diff changeset
1664 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
1665 }
kono
parents:
diff changeset
1666 c[i + j * c_dim1] = f11;
kono
parents:
diff changeset
1667 c[i + 1 + j * c_dim1] = f21;
kono
parents:
diff changeset
1668 c[i + 2 + j * c_dim1] = f31;
kono
parents:
diff changeset
1669 c[i + 3 + j * c_dim1] = f41;
kono
parents:
diff changeset
1670 }
kono
parents:
diff changeset
1671 i5 = ii + isec - 1;
kono
parents:
diff changeset
1672 for (i = ii + uisec; i <= i5; ++i)
kono
parents:
diff changeset
1673 {
kono
parents:
diff changeset
1674 f11 = c[i + j * c_dim1];
kono
parents:
diff changeset
1675 i6 = ll + lsec - 1;
kono
parents:
diff changeset
1676 for (l = ll; l <= i6; ++l)
kono
parents:
diff changeset
1677 {
kono
parents:
diff changeset
1678 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
1679 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
1680 }
kono
parents:
diff changeset
1681 c[i + j * c_dim1] = f11;
kono
parents:
diff changeset
1682 }
kono
parents:
diff changeset
1683 }
kono
parents:
diff changeset
1684 }
kono
parents:
diff changeset
1685 }
kono
parents:
diff changeset
1686 }
kono
parents:
diff changeset
1687 }
kono
parents:
diff changeset
1688 free(t1);
kono
parents:
diff changeset
1689 return;
kono
parents:
diff changeset
1690 }
kono
parents:
diff changeset
1691 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
kono
parents:
diff changeset
1692 {
kono
parents:
diff changeset
1693 if (GFC_DESCRIPTOR_RANK (a) != 1)
kono
parents:
diff changeset
1694 {
kono
parents:
diff changeset
1695 const GFC_INTEGER_16 *restrict abase_x;
kono
parents:
diff changeset
1696 const GFC_INTEGER_16 *restrict bbase_y;
kono
parents:
diff changeset
1697 GFC_INTEGER_16 *restrict dest_y;
kono
parents:
diff changeset
1698 GFC_INTEGER_16 s;
kono
parents:
diff changeset
1699
kono
parents:
diff changeset
1700 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
1701 {
kono
parents:
diff changeset
1702 bbase_y = &bbase[y*bystride];
kono
parents:
diff changeset
1703 dest_y = &dest[y*rystride];
kono
parents:
diff changeset
1704 for (x = 0; x < xcount; x++)
kono
parents:
diff changeset
1705 {
kono
parents:
diff changeset
1706 abase_x = &abase[x*axstride];
kono
parents:
diff changeset
1707 s = (GFC_INTEGER_16) 0;
kono
parents:
diff changeset
1708 for (n = 0; n < count; n++)
kono
parents:
diff changeset
1709 s += abase_x[n] * bbase_y[n];
kono
parents:
diff changeset
1710 dest_y[x] = s;
kono
parents:
diff changeset
1711 }
kono
parents:
diff changeset
1712 }
kono
parents:
diff changeset
1713 }
kono
parents:
diff changeset
1714 else
kono
parents:
diff changeset
1715 {
kono
parents:
diff changeset
1716 const GFC_INTEGER_16 *restrict bbase_y;
kono
parents:
diff changeset
1717 GFC_INTEGER_16 s;
kono
parents:
diff changeset
1718
kono
parents:
diff changeset
1719 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
1720 {
kono
parents:
diff changeset
1721 bbase_y = &bbase[y*bystride];
kono
parents:
diff changeset
1722 s = (GFC_INTEGER_16) 0;
kono
parents:
diff changeset
1723 for (n = 0; n < count; n++)
kono
parents:
diff changeset
1724 s += abase[n*axstride] * bbase_y[n];
kono
parents:
diff changeset
1725 dest[y*rystride] = s;
kono
parents:
diff changeset
1726 }
kono
parents:
diff changeset
1727 }
kono
parents:
diff changeset
1728 }
kono
parents:
diff changeset
1729 else if (axstride < aystride)
kono
parents:
diff changeset
1730 {
kono
parents:
diff changeset
1731 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
1732 for (x = 0; x < xcount; x++)
kono
parents:
diff changeset
1733 dest[x*rxstride + y*rystride] = (GFC_INTEGER_16)0;
kono
parents:
diff changeset
1734
kono
parents:
diff changeset
1735 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
1736 for (n = 0; n < count; n++)
kono
parents:
diff changeset
1737 for (x = 0; x < xcount; x++)
kono
parents:
diff changeset
1738 /* dest[x,y] += a[x,n] * b[n,y] */
kono
parents:
diff changeset
1739 dest[x*rxstride + y*rystride] +=
kono
parents:
diff changeset
1740 abase[x*axstride + n*aystride] *
kono
parents:
diff changeset
1741 bbase[n*bxstride + y*bystride];
kono
parents:
diff changeset
1742 }
kono
parents:
diff changeset
1743 else if (GFC_DESCRIPTOR_RANK (a) == 1)
kono
parents:
diff changeset
1744 {
kono
parents:
diff changeset
1745 const GFC_INTEGER_16 *restrict bbase_y;
kono
parents:
diff changeset
1746 GFC_INTEGER_16 s;
kono
parents:
diff changeset
1747
kono
parents:
diff changeset
1748 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
1749 {
kono
parents:
diff changeset
1750 bbase_y = &bbase[y*bystride];
kono
parents:
diff changeset
1751 s = (GFC_INTEGER_16) 0;
kono
parents:
diff changeset
1752 for (n = 0; n < count; n++)
kono
parents:
diff changeset
1753 s += abase[n*axstride] * bbase_y[n*bxstride];
kono
parents:
diff changeset
1754 dest[y*rxstride] = s;
kono
parents:
diff changeset
1755 }
kono
parents:
diff changeset
1756 }
kono
parents:
diff changeset
1757 else
kono
parents:
diff changeset
1758 {
kono
parents:
diff changeset
1759 const GFC_INTEGER_16 *restrict abase_x;
kono
parents:
diff changeset
1760 const GFC_INTEGER_16 *restrict bbase_y;
kono
parents:
diff changeset
1761 GFC_INTEGER_16 *restrict dest_y;
kono
parents:
diff changeset
1762 GFC_INTEGER_16 s;
kono
parents:
diff changeset
1763
kono
parents:
diff changeset
1764 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
1765 {
kono
parents:
diff changeset
1766 bbase_y = &bbase[y*bystride];
kono
parents:
diff changeset
1767 dest_y = &dest[y*rystride];
kono
parents:
diff changeset
1768 for (x = 0; x < xcount; x++)
kono
parents:
diff changeset
1769 {
kono
parents:
diff changeset
1770 abase_x = &abase[x*axstride];
kono
parents:
diff changeset
1771 s = (GFC_INTEGER_16) 0;
kono
parents:
diff changeset
1772 for (n = 0; n < count; n++)
kono
parents:
diff changeset
1773 s += abase_x[n*aystride] * bbase_y[n*bxstride];
kono
parents:
diff changeset
1774 dest_y[x*rxstride] = s;
kono
parents:
diff changeset
1775 }
kono
parents:
diff changeset
1776 }
kono
parents:
diff changeset
1777 }
kono
parents:
diff changeset
1778 }
kono
parents:
diff changeset
1779 #undef POW3
kono
parents:
diff changeset
1780 #undef min
kono
parents:
diff changeset
1781 #undef max
kono
parents:
diff changeset
1782
kono
parents:
diff changeset
1783 #endif /* HAVE_AVX512F */
kono
parents:
diff changeset
1784
kono
parents:
diff changeset
1785 /* AMD-specifix funtions with AVX128 and FMA3/FMA4. */
kono
parents:
diff changeset
1786
kono
parents:
diff changeset
1787 #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
kono
parents:
diff changeset
1788 void
kono
parents:
diff changeset
1789 matmul_i16_avx128_fma3 (gfc_array_i16 * const restrict retarray,
kono
parents:
diff changeset
1790 gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
kono
parents:
diff changeset
1791 int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
kono
parents:
diff changeset
1792 internal_proto(matmul_i16_avx128_fma3);
kono
parents:
diff changeset
1793 #endif
kono
parents:
diff changeset
1794
kono
parents:
diff changeset
1795 #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
kono
parents:
diff changeset
1796 void
kono
parents:
diff changeset
1797 matmul_i16_avx128_fma4 (gfc_array_i16 * const restrict retarray,
kono
parents:
diff changeset
1798 gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
kono
parents:
diff changeset
1799 int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
kono
parents:
diff changeset
1800 internal_proto(matmul_i16_avx128_fma4);
kono
parents:
diff changeset
1801 #endif
kono
parents:
diff changeset
1802
kono
parents:
diff changeset
1803 /* Function to fall back to if there is no special processor-specific version. */
kono
parents:
diff changeset
1804 static void
kono
parents:
diff changeset
1805 matmul_i16_vanilla (gfc_array_i16 * const restrict retarray,
kono
parents:
diff changeset
1806 gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
kono
parents:
diff changeset
1807 int blas_limit, blas_call gemm)
kono
parents:
diff changeset
1808 {
kono
parents:
diff changeset
1809 const GFC_INTEGER_16 * restrict abase;
kono
parents:
diff changeset
1810 const GFC_INTEGER_16 * restrict bbase;
kono
parents:
diff changeset
1811 GFC_INTEGER_16 * restrict dest;
kono
parents:
diff changeset
1812
kono
parents:
diff changeset
1813 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
kono
parents:
diff changeset
1814 index_type x, y, n, count, xcount, ycount;
kono
parents:
diff changeset
1815
kono
parents:
diff changeset
1816 assert (GFC_DESCRIPTOR_RANK (a) == 2
kono
parents:
diff changeset
1817 || GFC_DESCRIPTOR_RANK (b) == 2);
kono
parents:
diff changeset
1818
kono
parents:
diff changeset
1819 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
kono
parents:
diff changeset
1820
kono
parents:
diff changeset
1821 Either A or B (but not both) can be rank 1:
kono
parents:
diff changeset
1822
kono
parents:
diff changeset
1823 o One-dimensional argument A is implicitly treated as a row matrix
kono
parents:
diff changeset
1824 dimensioned [1,count], so xcount=1.
kono
parents:
diff changeset
1825
kono
parents:
diff changeset
1826 o One-dimensional argument B is implicitly treated as a column matrix
kono
parents:
diff changeset
1827 dimensioned [count, 1], so ycount=1.
kono
parents:
diff changeset
1828 */
kono
parents:
diff changeset
1829
kono
parents:
diff changeset
1830 if (retarray->base_addr == NULL)
kono
parents:
diff changeset
1831 {
kono
parents:
diff changeset
1832 if (GFC_DESCRIPTOR_RANK (a) == 1)
kono
parents:
diff changeset
1833 {
kono
parents:
diff changeset
1834 GFC_DIMENSION_SET(retarray->dim[0], 0,
kono
parents:
diff changeset
1835 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
kono
parents:
diff changeset
1836 }
kono
parents:
diff changeset
1837 else if (GFC_DESCRIPTOR_RANK (b) == 1)
kono
parents:
diff changeset
1838 {
kono
parents:
diff changeset
1839 GFC_DIMENSION_SET(retarray->dim[0], 0,
kono
parents:
diff changeset
1840 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
kono
parents:
diff changeset
1841 }
kono
parents:
diff changeset
1842 else
kono
parents:
diff changeset
1843 {
kono
parents:
diff changeset
1844 GFC_DIMENSION_SET(retarray->dim[0], 0,
kono
parents:
diff changeset
1845 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
kono
parents:
diff changeset
1846
kono
parents:
diff changeset
1847 GFC_DIMENSION_SET(retarray->dim[1], 0,
kono
parents:
diff changeset
1848 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
kono
parents:
diff changeset
1849 GFC_DESCRIPTOR_EXTENT(retarray,0));
kono
parents:
diff changeset
1850 }
kono
parents:
diff changeset
1851
kono
parents:
diff changeset
1852 retarray->base_addr
kono
parents:
diff changeset
1853 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_16));
kono
parents:
diff changeset
1854 retarray->offset = 0;
kono
parents:
diff changeset
1855 }
kono
parents:
diff changeset
1856 else if (unlikely (compile_options.bounds_check))
kono
parents:
diff changeset
1857 {
kono
parents:
diff changeset
1858 index_type ret_extent, arg_extent;
kono
parents:
diff changeset
1859
kono
parents:
diff changeset
1860 if (GFC_DESCRIPTOR_RANK (a) == 1)
kono
parents:
diff changeset
1861 {
kono
parents:
diff changeset
1862 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
kono
parents:
diff changeset
1863 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
kono
parents:
diff changeset
1864 if (arg_extent != ret_extent)
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1865 runtime_error ("Array bound mismatch for dimension 1 of "
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1866 "array (%ld/%ld) ",
111
kono
parents:
diff changeset
1867 (long int) ret_extent, (long int) arg_extent);
kono
parents:
diff changeset
1868 }
kono
parents:
diff changeset
1869 else if (GFC_DESCRIPTOR_RANK (b) == 1)
kono
parents:
diff changeset
1870 {
kono
parents:
diff changeset
1871 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
kono
parents:
diff changeset
1872 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
kono
parents:
diff changeset
1873 if (arg_extent != ret_extent)
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1874 runtime_error ("Array bound mismatch for dimension 1 of "
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1875 "array (%ld/%ld) ",
111
kono
parents:
diff changeset
1876 (long int) ret_extent, (long int) arg_extent);
kono
parents:
diff changeset
1877 }
kono
parents:
diff changeset
1878 else
kono
parents:
diff changeset
1879 {
kono
parents:
diff changeset
1880 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
kono
parents:
diff changeset
1881 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
kono
parents:
diff changeset
1882 if (arg_extent != ret_extent)
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1883 runtime_error ("Array bound mismatch for dimension 1 of "
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1884 "array (%ld/%ld) ",
111
kono
parents:
diff changeset
1885 (long int) ret_extent, (long int) arg_extent);
kono
parents:
diff changeset
1886
kono
parents:
diff changeset
1887 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
kono
parents:
diff changeset
1888 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
kono
parents:
diff changeset
1889 if (arg_extent != ret_extent)
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1890 runtime_error ("Array bound mismatch for dimension 2 of "
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1891 "array (%ld/%ld) ",
111
kono
parents:
diff changeset
1892 (long int) ret_extent, (long int) arg_extent);
kono
parents:
diff changeset
1893 }
kono
parents:
diff changeset
1894 }
kono
parents:
diff changeset
1895
kono
parents:
diff changeset
1896
kono
parents:
diff changeset
1897 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
kono
parents:
diff changeset
1898 {
kono
parents:
diff changeset
1899 /* One-dimensional result may be addressed in the code below
kono
parents:
diff changeset
1900 either as a row or a column matrix. We want both cases to
kono
parents:
diff changeset
1901 work. */
kono
parents:
diff changeset
1902 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
kono
parents:
diff changeset
1903 }
kono
parents:
diff changeset
1904 else
kono
parents:
diff changeset
1905 {
kono
parents:
diff changeset
1906 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
kono
parents:
diff changeset
1907 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
kono
parents:
diff changeset
1908 }
kono
parents:
diff changeset
1909
kono
parents:
diff changeset
1910
kono
parents:
diff changeset
1911 if (GFC_DESCRIPTOR_RANK (a) == 1)
kono
parents:
diff changeset
1912 {
kono
parents:
diff changeset
1913 /* Treat it as a a row matrix A[1,count]. */
kono
parents:
diff changeset
1914 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
kono
parents:
diff changeset
1915 aystride = 1;
kono
parents:
diff changeset
1916
kono
parents:
diff changeset
1917 xcount = 1;
kono
parents:
diff changeset
1918 count = GFC_DESCRIPTOR_EXTENT(a,0);
kono
parents:
diff changeset
1919 }
kono
parents:
diff changeset
1920 else
kono
parents:
diff changeset
1921 {
kono
parents:
diff changeset
1922 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
kono
parents:
diff changeset
1923 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
kono
parents:
diff changeset
1924
kono
parents:
diff changeset
1925 count = GFC_DESCRIPTOR_EXTENT(a,1);
kono
parents:
diff changeset
1926 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
kono
parents:
diff changeset
1927 }
kono
parents:
diff changeset
1928
kono
parents:
diff changeset
1929 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
kono
parents:
diff changeset
1930 {
kono
parents:
diff changeset
1931 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1932 runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1933 "in dimension 1: is %ld, should be %ld",
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1934 (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
111
kono
parents:
diff changeset
1935 }
kono
parents:
diff changeset
1936
kono
parents:
diff changeset
1937 if (GFC_DESCRIPTOR_RANK (b) == 1)
kono
parents:
diff changeset
1938 {
kono
parents:
diff changeset
1939 /* Treat it as a column matrix B[count,1] */
kono
parents:
diff changeset
1940 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
kono
parents:
diff changeset
1941
kono
parents:
diff changeset
1942 /* bystride should never be used for 1-dimensional b.
kono
parents:
diff changeset
1943 The value is only used for calculation of the
kono
parents:
diff changeset
1944 memory by the buffer. */
kono
parents:
diff changeset
1945 bystride = 256;
kono
parents:
diff changeset
1946 ycount = 1;
kono
parents:
diff changeset
1947 }
kono
parents:
diff changeset
1948 else
kono
parents:
diff changeset
1949 {
kono
parents:
diff changeset
1950 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
kono
parents:
diff changeset
1951 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
kono
parents:
diff changeset
1952 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
kono
parents:
diff changeset
1953 }
kono
parents:
diff changeset
1954
kono
parents:
diff changeset
1955 abase = a->base_addr;
kono
parents:
diff changeset
1956 bbase = b->base_addr;
kono
parents:
diff changeset
1957 dest = retarray->base_addr;
kono
parents:
diff changeset
1958
kono
parents:
diff changeset
1959 /* Now that everything is set up, we perform the multiplication
kono
parents:
diff changeset
1960 itself. */
kono
parents:
diff changeset
1961
kono
parents:
diff changeset
1962 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
kono
parents:
diff changeset
1963 #define min(a,b) ((a) <= (b) ? (a) : (b))
kono
parents:
diff changeset
1964 #define max(a,b) ((a) >= (b) ? (a) : (b))
kono
parents:
diff changeset
1965
kono
parents:
diff changeset
1966 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
kono
parents:
diff changeset
1967 && (bxstride == 1 || bystride == 1)
kono
parents:
diff changeset
1968 && (((float) xcount) * ((float) ycount) * ((float) count)
kono
parents:
diff changeset
1969 > POW3(blas_limit)))
kono
parents:
diff changeset
1970 {
kono
parents:
diff changeset
1971 const int m = xcount, n = ycount, k = count, ldc = rystride;
kono
parents:
diff changeset
1972 const GFC_INTEGER_16 one = 1, zero = 0;
kono
parents:
diff changeset
1973 const int lda = (axstride == 1) ? aystride : axstride,
kono
parents:
diff changeset
1974 ldb = (bxstride == 1) ? bystride : bxstride;
kono
parents:
diff changeset
1975
kono
parents:
diff changeset
1976 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
kono
parents:
diff changeset
1977 {
kono
parents:
diff changeset
1978 assert (gemm != NULL);
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1979 const char *transa, *transb;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1980 if (try_blas & 2)
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1981 transa = "C";
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1982 else
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1983 transa = axstride == 1 ? "N" : "T";
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1984
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1985 if (try_blas & 4)
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1986 transb = "C";
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1987 else
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1988 transb = bxstride == 1 ? "N" : "T";
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1989
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1990 gemm (transa, transb , &m,
111
kono
parents:
diff changeset
1991 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
kono
parents:
diff changeset
1992 &ldc, 1, 1);
kono
parents:
diff changeset
1993 return;
kono
parents:
diff changeset
1994 }
kono
parents:
diff changeset
1995 }
kono
parents:
diff changeset
1996
kono
parents:
diff changeset
1997 if (rxstride == 1 && axstride == 1 && bxstride == 1)
kono
parents:
diff changeset
1998 {
kono
parents:
diff changeset
1999 /* This block of code implements a tuned matmul, derived from
kono
parents:
diff changeset
2000 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
kono
parents:
diff changeset
2001
kono
parents:
diff changeset
2002 Bo Kagstrom and Per Ling
kono
parents:
diff changeset
2003 Department of Computing Science
kono
parents:
diff changeset
2004 Umea University
kono
parents:
diff changeset
2005 S-901 87 Umea, Sweden
kono
parents:
diff changeset
2006
kono
parents:
diff changeset
2007 from netlib.org, translated to C, and modified for matmul.m4. */
kono
parents:
diff changeset
2008
kono
parents:
diff changeset
2009 const GFC_INTEGER_16 *a, *b;
kono
parents:
diff changeset
2010 GFC_INTEGER_16 *c;
kono
parents:
diff changeset
2011 const index_type m = xcount, n = ycount, k = count;
kono
parents:
diff changeset
2012
kono
parents:
diff changeset
2013 /* System generated locals */
kono
parents:
diff changeset
2014 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
kono
parents:
diff changeset
2015 i1, i2, i3, i4, i5, i6;
kono
parents:
diff changeset
2016
kono
parents:
diff changeset
2017 /* Local variables */
kono
parents:
diff changeset
2018 GFC_INTEGER_16 f11, f12, f21, f22, f31, f32, f41, f42,
kono
parents:
diff changeset
2019 f13, f14, f23, f24, f33, f34, f43, f44;
kono
parents:
diff changeset
2020 index_type i, j, l, ii, jj, ll;
kono
parents:
diff changeset
2021 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
kono
parents:
diff changeset
2022 GFC_INTEGER_16 *t1;
kono
parents:
diff changeset
2023
kono
parents:
diff changeset
2024 a = abase;
kono
parents:
diff changeset
2025 b = bbase;
kono
parents:
diff changeset
2026 c = retarray->base_addr;
kono
parents:
diff changeset
2027
kono
parents:
diff changeset
2028 /* Parameter adjustments */
kono
parents:
diff changeset
2029 c_dim1 = rystride;
kono
parents:
diff changeset
2030 c_offset = 1 + c_dim1;
kono
parents:
diff changeset
2031 c -= c_offset;
kono
parents:
diff changeset
2032 a_dim1 = aystride;
kono
parents:
diff changeset
2033 a_offset = 1 + a_dim1;
kono
parents:
diff changeset
2034 a -= a_offset;
kono
parents:
diff changeset
2035 b_dim1 = bystride;
kono
parents:
diff changeset
2036 b_offset = 1 + b_dim1;
kono
parents:
diff changeset
2037 b -= b_offset;
kono
parents:
diff changeset
2038
kono
parents:
diff changeset
2039 /* Empty c first. */
kono
parents:
diff changeset
2040 for (j=1; j<=n; j++)
kono
parents:
diff changeset
2041 for (i=1; i<=m; i++)
kono
parents:
diff changeset
2042 c[i + j * c_dim1] = (GFC_INTEGER_16)0;
kono
parents:
diff changeset
2043
kono
parents:
diff changeset
2044 /* Early exit if possible */
kono
parents:
diff changeset
2045 if (m == 0 || n == 0 || k == 0)
kono
parents:
diff changeset
2046 return;
kono
parents:
diff changeset
2047
kono
parents:
diff changeset
2048 /* Adjust size of t1 to what is needed. */
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
2049 index_type t1_dim, a_sz;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
2050 if (aystride == 1)
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
2051 a_sz = rystride;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
2052 else
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
2053 a_sz = a_dim1;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
2054
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
2055 t1_dim = a_sz * 256 + b_dim1;
111
kono
parents:
diff changeset
2056 if (t1_dim > 65536)
kono
parents:
diff changeset
2057 t1_dim = 65536;
kono
parents:
diff changeset
2058
kono
parents:
diff changeset
2059 t1 = malloc (t1_dim * sizeof(GFC_INTEGER_16));
kono
parents:
diff changeset
2060
kono
parents:
diff changeset
2061 /* Start turning the crank. */
kono
parents:
diff changeset
2062 i1 = n;
kono
parents:
diff changeset
2063 for (jj = 1; jj <= i1; jj += 512)
kono
parents:
diff changeset
2064 {
kono
parents:
diff changeset
2065 /* Computing MIN */
kono
parents:
diff changeset
2066 i2 = 512;
kono
parents:
diff changeset
2067 i3 = n - jj + 1;
kono
parents:
diff changeset
2068 jsec = min(i2,i3);
kono
parents:
diff changeset
2069 ujsec = jsec - jsec % 4;
kono
parents:
diff changeset
2070 i2 = k;
kono
parents:
diff changeset
2071 for (ll = 1; ll <= i2; ll += 256)
kono
parents:
diff changeset
2072 {
kono
parents:
diff changeset
2073 /* Computing MIN */
kono
parents:
diff changeset
2074 i3 = 256;
kono
parents:
diff changeset
2075 i4 = k - ll + 1;
kono
parents:
diff changeset
2076 lsec = min(i3,i4);
kono
parents:
diff changeset
2077 ulsec = lsec - lsec % 2;
kono
parents:
diff changeset
2078
kono
parents:
diff changeset
2079 i3 = m;
kono
parents:
diff changeset
2080 for (ii = 1; ii <= i3; ii += 256)
kono
parents:
diff changeset
2081 {
kono
parents:
diff changeset
2082 /* Computing MIN */
kono
parents:
diff changeset
2083 i4 = 256;
kono
parents:
diff changeset
2084 i5 = m - ii + 1;
kono
parents:
diff changeset
2085 isec = min(i4,i5);
kono
parents:
diff changeset
2086 uisec = isec - isec % 2;
kono
parents:
diff changeset
2087 i4 = ll + ulsec - 1;
kono
parents:
diff changeset
2088 for (l = ll; l <= i4; l += 2)
kono
parents:
diff changeset
2089 {
kono
parents:
diff changeset
2090 i5 = ii + uisec - 1;
kono
parents:
diff changeset
2091 for (i = ii; i <= i5; i += 2)
kono
parents:
diff changeset
2092 {
kono
parents:
diff changeset
2093 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
kono
parents:
diff changeset
2094 a[i + l * a_dim1];
kono
parents:
diff changeset
2095 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
kono
parents:
diff changeset
2096 a[i + (l + 1) * a_dim1];
kono
parents:
diff changeset
2097 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
kono
parents:
diff changeset
2098 a[i + 1 + l * a_dim1];
kono
parents:
diff changeset
2099 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
kono
parents:
diff changeset
2100 a[i + 1 + (l + 1) * a_dim1];
kono
parents:
diff changeset
2101 }
kono
parents:
diff changeset
2102 if (uisec < isec)
kono
parents:
diff changeset
2103 {
kono
parents:
diff changeset
2104 t1[l - ll + 1 + (isec << 8) - 257] =
kono
parents:
diff changeset
2105 a[ii + isec - 1 + l * a_dim1];
kono
parents:
diff changeset
2106 t1[l - ll + 2 + (isec << 8) - 257] =
kono
parents:
diff changeset
2107 a[ii + isec - 1 + (l + 1) * a_dim1];
kono
parents:
diff changeset
2108 }
kono
parents:
diff changeset
2109 }
kono
parents:
diff changeset
2110 if (ulsec < lsec)
kono
parents:
diff changeset
2111 {
kono
parents:
diff changeset
2112 i4 = ii + isec - 1;
kono
parents:
diff changeset
2113 for (i = ii; i<= i4; ++i)
kono
parents:
diff changeset
2114 {
kono
parents:
diff changeset
2115 t1[lsec + ((i - ii + 1) << 8) - 257] =
kono
parents:
diff changeset
2116 a[i + (ll + lsec - 1) * a_dim1];
kono
parents:
diff changeset
2117 }
kono
parents:
diff changeset
2118 }
kono
parents:
diff changeset
2119
kono
parents:
diff changeset
2120 uisec = isec - isec % 4;
kono
parents:
diff changeset
2121 i4 = jj + ujsec - 1;
kono
parents:
diff changeset
2122 for (j = jj; j <= i4; j += 4)
kono
parents:
diff changeset
2123 {
kono
parents:
diff changeset
2124 i5 = ii + uisec - 1;
kono
parents:
diff changeset
2125 for (i = ii; i <= i5; i += 4)
kono
parents:
diff changeset
2126 {
kono
parents:
diff changeset
2127 f11 = c[i + j * c_dim1];
kono
parents:
diff changeset
2128 f21 = c[i + 1 + j * c_dim1];
kono
parents:
diff changeset
2129 f12 = c[i + (j + 1) * c_dim1];
kono
parents:
diff changeset
2130 f22 = c[i + 1 + (j + 1) * c_dim1];
kono
parents:
diff changeset
2131 f13 = c[i + (j + 2) * c_dim1];
kono
parents:
diff changeset
2132 f23 = c[i + 1 + (j + 2) * c_dim1];
kono
parents:
diff changeset
2133 f14 = c[i + (j + 3) * c_dim1];
kono
parents:
diff changeset
2134 f24 = c[i + 1 + (j + 3) * c_dim1];
kono
parents:
diff changeset
2135 f31 = c[i + 2 + j * c_dim1];
kono
parents:
diff changeset
2136 f41 = c[i + 3 + j * c_dim1];
kono
parents:
diff changeset
2137 f32 = c[i + 2 + (j + 1) * c_dim1];
kono
parents:
diff changeset
2138 f42 = c[i + 3 + (j + 1) * c_dim1];
kono
parents:
diff changeset
2139 f33 = c[i + 2 + (j + 2) * c_dim1];
kono
parents:
diff changeset
2140 f43 = c[i + 3 + (j + 2) * c_dim1];
kono
parents:
diff changeset
2141 f34 = c[i + 2 + (j + 3) * c_dim1];
kono
parents:
diff changeset
2142 f44 = c[i + 3 + (j + 3) * c_dim1];
kono
parents:
diff changeset
2143 i6 = ll + lsec - 1;
kono
parents:
diff changeset
2144 for (l = ll; l <= i6; ++l)
kono
parents:
diff changeset
2145 {
kono
parents:
diff changeset
2146 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
kono
parents:
diff changeset
2147 * b[l + j * b_dim1];
kono
parents:
diff changeset
2148 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
kono
parents:
diff changeset
2149 * b[l + j * b_dim1];
kono
parents:
diff changeset
2150 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
kono
parents:
diff changeset
2151 * b[l + (j + 1) * b_dim1];
kono
parents:
diff changeset
2152 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
kono
parents:
diff changeset
2153 * b[l + (j + 1) * b_dim1];
kono
parents:
diff changeset
2154 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
kono
parents:
diff changeset
2155 * b[l + (j + 2) * b_dim1];
kono
parents:
diff changeset
2156 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
kono
parents:
diff changeset
2157 * b[l + (j + 2) * b_dim1];
kono
parents:
diff changeset
2158 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
kono
parents:
diff changeset
2159 * b[l + (j + 3) * b_dim1];
kono
parents:
diff changeset
2160 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
kono
parents:
diff changeset
2161 * b[l + (j + 3) * b_dim1];
kono
parents:
diff changeset
2162 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
kono
parents:
diff changeset
2163 * b[l + j * b_dim1];
kono
parents:
diff changeset
2164 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
kono
parents:
diff changeset
2165 * b[l + j * b_dim1];
kono
parents:
diff changeset
2166 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
kono
parents:
diff changeset
2167 * b[l + (j + 1) * b_dim1];
kono
parents:
diff changeset
2168 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
kono
parents:
diff changeset
2169 * b[l + (j + 1) * b_dim1];
kono
parents:
diff changeset
2170 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
kono
parents:
diff changeset
2171 * b[l + (j + 2) * b_dim1];
kono
parents:
diff changeset
2172 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
kono
parents:
diff changeset
2173 * b[l + (j + 2) * b_dim1];
kono
parents:
diff changeset
2174 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
kono
parents:
diff changeset
2175 * b[l + (j + 3) * b_dim1];
kono
parents:
diff changeset
2176 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
kono
parents:
diff changeset
2177 * b[l + (j + 3) * b_dim1];
kono
parents:
diff changeset
2178 }
kono
parents:
diff changeset
2179 c[i + j * c_dim1] = f11;
kono
parents:
diff changeset
2180 c[i + 1 + j * c_dim1] = f21;
kono
parents:
diff changeset
2181 c[i + (j + 1) * c_dim1] = f12;
kono
parents:
diff changeset
2182 c[i + 1 + (j + 1) * c_dim1] = f22;
kono
parents:
diff changeset
2183 c[i + (j + 2) * c_dim1] = f13;
kono
parents:
diff changeset
2184 c[i + 1 + (j + 2) * c_dim1] = f23;
kono
parents:
diff changeset
2185 c[i + (j + 3) * c_dim1] = f14;
kono
parents:
diff changeset
2186 c[i + 1 + (j + 3) * c_dim1] = f24;
kono
parents:
diff changeset
2187 c[i + 2 + j * c_dim1] = f31;
kono
parents:
diff changeset
2188 c[i + 3 + j * c_dim1] = f41;
kono
parents:
diff changeset
2189 c[i + 2 + (j + 1) * c_dim1] = f32;
kono
parents:
diff changeset
2190 c[i + 3 + (j + 1) * c_dim1] = f42;
kono
parents:
diff changeset
2191 c[i + 2 + (j + 2) * c_dim1] = f33;
kono
parents:
diff changeset
2192 c[i + 3 + (j + 2) * c_dim1] = f43;
kono
parents:
diff changeset
2193 c[i + 2 + (j + 3) * c_dim1] = f34;
kono
parents:
diff changeset
2194 c[i + 3 + (j + 3) * c_dim1] = f44;
kono
parents:
diff changeset
2195 }
kono
parents:
diff changeset
2196 if (uisec < isec)
kono
parents:
diff changeset
2197 {
kono
parents:
diff changeset
2198 i5 = ii + isec - 1;
kono
parents:
diff changeset
2199 for (i = ii + uisec; i <= i5; ++i)
kono
parents:
diff changeset
2200 {
kono
parents:
diff changeset
2201 f11 = c[i + j * c_dim1];
kono
parents:
diff changeset
2202 f12 = c[i + (j + 1) * c_dim1];
kono
parents:
diff changeset
2203 f13 = c[i + (j + 2) * c_dim1];
kono
parents:
diff changeset
2204 f14 = c[i + (j + 3) * c_dim1];
kono
parents:
diff changeset
2205 i6 = ll + lsec - 1;
kono
parents:
diff changeset
2206 for (l = ll; l <= i6; ++l)
kono
parents:
diff changeset
2207 {
kono
parents:
diff changeset
2208 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
2209 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
2210 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
2211 257] * b[l + (j + 1) * b_dim1];
kono
parents:
diff changeset
2212 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
2213 257] * b[l + (j + 2) * b_dim1];
kono
parents:
diff changeset
2214 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
2215 257] * b[l + (j + 3) * b_dim1];
kono
parents:
diff changeset
2216 }
kono
parents:
diff changeset
2217 c[i + j * c_dim1] = f11;
kono
parents:
diff changeset
2218 c[i + (j + 1) * c_dim1] = f12;
kono
parents:
diff changeset
2219 c[i + (j + 2) * c_dim1] = f13;
kono
parents:
diff changeset
2220 c[i + (j + 3) * c_dim1] = f14;
kono
parents:
diff changeset
2221 }
kono
parents:
diff changeset
2222 }
kono
parents:
diff changeset
2223 }
kono
parents:
diff changeset
2224 if (ujsec < jsec)
kono
parents:
diff changeset
2225 {
kono
parents:
diff changeset
2226 i4 = jj + jsec - 1;
kono
parents:
diff changeset
2227 for (j = jj + ujsec; j <= i4; ++j)
kono
parents:
diff changeset
2228 {
kono
parents:
diff changeset
2229 i5 = ii + uisec - 1;
kono
parents:
diff changeset
2230 for (i = ii; i <= i5; i += 4)
kono
parents:
diff changeset
2231 {
kono
parents:
diff changeset
2232 f11 = c[i + j * c_dim1];
kono
parents:
diff changeset
2233 f21 = c[i + 1 + j * c_dim1];
kono
parents:
diff changeset
2234 f31 = c[i + 2 + j * c_dim1];
kono
parents:
diff changeset
2235 f41 = c[i + 3 + j * c_dim1];
kono
parents:
diff changeset
2236 i6 = ll + lsec - 1;
kono
parents:
diff changeset
2237 for (l = ll; l <= i6; ++l)
kono
parents:
diff changeset
2238 {
kono
parents:
diff changeset
2239 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
2240 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
2241 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
kono
parents:
diff changeset
2242 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
2243 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
kono
parents:
diff changeset
2244 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
2245 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
kono
parents:
diff changeset
2246 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
2247 }
kono
parents:
diff changeset
2248 c[i + j * c_dim1] = f11;
kono
parents:
diff changeset
2249 c[i + 1 + j * c_dim1] = f21;
kono
parents:
diff changeset
2250 c[i + 2 + j * c_dim1] = f31;
kono
parents:
diff changeset
2251 c[i + 3 + j * c_dim1] = f41;
kono
parents:
diff changeset
2252 }
kono
parents:
diff changeset
2253 i5 = ii + isec - 1;
kono
parents:
diff changeset
2254 for (i = ii + uisec; i <= i5; ++i)
kono
parents:
diff changeset
2255 {
kono
parents:
diff changeset
2256 f11 = c[i + j * c_dim1];
kono
parents:
diff changeset
2257 i6 = ll + lsec - 1;
kono
parents:
diff changeset
2258 for (l = ll; l <= i6; ++l)
kono
parents:
diff changeset
2259 {
kono
parents:
diff changeset
2260 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
2261 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
2262 }
kono
parents:
diff changeset
2263 c[i + j * c_dim1] = f11;
kono
parents:
diff changeset
2264 }
kono
parents:
diff changeset
2265 }
kono
parents:
diff changeset
2266 }
kono
parents:
diff changeset
2267 }
kono
parents:
diff changeset
2268 }
kono
parents:
diff changeset
2269 }
kono
parents:
diff changeset
2270 free(t1);
kono
parents:
diff changeset
2271 return;
kono
parents:
diff changeset
2272 }
kono
parents:
diff changeset
2273 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
kono
parents:
diff changeset
2274 {
kono
parents:
diff changeset
2275 if (GFC_DESCRIPTOR_RANK (a) != 1)
kono
parents:
diff changeset
2276 {
kono
parents:
diff changeset
2277 const GFC_INTEGER_16 *restrict abase_x;
kono
parents:
diff changeset
2278 const GFC_INTEGER_16 *restrict bbase_y;
kono
parents:
diff changeset
2279 GFC_INTEGER_16 *restrict dest_y;
kono
parents:
diff changeset
2280 GFC_INTEGER_16 s;
kono
parents:
diff changeset
2281
kono
parents:
diff changeset
2282 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
2283 {
kono
parents:
diff changeset
2284 bbase_y = &bbase[y*bystride];
kono
parents:
diff changeset
2285 dest_y = &dest[y*rystride];
kono
parents:
diff changeset
2286 for (x = 0; x < xcount; x++)
kono
parents:
diff changeset
2287 {
kono
parents:
diff changeset
2288 abase_x = &abase[x*axstride];
kono
parents:
diff changeset
2289 s = (GFC_INTEGER_16) 0;
kono
parents:
diff changeset
2290 for (n = 0; n < count; n++)
kono
parents:
diff changeset
2291 s += abase_x[n] * bbase_y[n];
kono
parents:
diff changeset
2292 dest_y[x] = s;
kono
parents:
diff changeset
2293 }
kono
parents:
diff changeset
2294 }
kono
parents:
diff changeset
2295 }
kono
parents:
diff changeset
2296 else
kono
parents:
diff changeset
2297 {
kono
parents:
diff changeset
2298 const GFC_INTEGER_16 *restrict bbase_y;
kono
parents:
diff changeset
2299 GFC_INTEGER_16 s;
kono
parents:
diff changeset
2300
kono
parents:
diff changeset
2301 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
2302 {
kono
parents:
diff changeset
2303 bbase_y = &bbase[y*bystride];
kono
parents:
diff changeset
2304 s = (GFC_INTEGER_16) 0;
kono
parents:
diff changeset
2305 for (n = 0; n < count; n++)
kono
parents:
diff changeset
2306 s += abase[n*axstride] * bbase_y[n];
kono
parents:
diff changeset
2307 dest[y*rystride] = s;
kono
parents:
diff changeset
2308 }
kono
parents:
diff changeset
2309 }
kono
parents:
diff changeset
2310 }
kono
parents:
diff changeset
2311 else if (axstride < aystride)
kono
parents:
diff changeset
2312 {
kono
parents:
diff changeset
2313 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
2314 for (x = 0; x < xcount; x++)
kono
parents:
diff changeset
2315 dest[x*rxstride + y*rystride] = (GFC_INTEGER_16)0;
kono
parents:
diff changeset
2316
kono
parents:
diff changeset
2317 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
2318 for (n = 0; n < count; n++)
kono
parents:
diff changeset
2319 for (x = 0; x < xcount; x++)
kono
parents:
diff changeset
2320 /* dest[x,y] += a[x,n] * b[n,y] */
kono
parents:
diff changeset
2321 dest[x*rxstride + y*rystride] +=
kono
parents:
diff changeset
2322 abase[x*axstride + n*aystride] *
kono
parents:
diff changeset
2323 bbase[n*bxstride + y*bystride];
kono
parents:
diff changeset
2324 }
kono
parents:
diff changeset
2325 else if (GFC_DESCRIPTOR_RANK (a) == 1)
kono
parents:
diff changeset
2326 {
kono
parents:
diff changeset
2327 const GFC_INTEGER_16 *restrict bbase_y;
kono
parents:
diff changeset
2328 GFC_INTEGER_16 s;
kono
parents:
diff changeset
2329
kono
parents:
diff changeset
2330 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
2331 {
kono
parents:
diff changeset
2332 bbase_y = &bbase[y*bystride];
kono
parents:
diff changeset
2333 s = (GFC_INTEGER_16) 0;
kono
parents:
diff changeset
2334 for (n = 0; n < count; n++)
kono
parents:
diff changeset
2335 s += abase[n*axstride] * bbase_y[n*bxstride];
kono
parents:
diff changeset
2336 dest[y*rxstride] = s;
kono
parents:
diff changeset
2337 }
kono
parents:
diff changeset
2338 }
kono
parents:
diff changeset
2339 else
kono
parents:
diff changeset
2340 {
kono
parents:
diff changeset
2341 const GFC_INTEGER_16 *restrict abase_x;
kono
parents:
diff changeset
2342 const GFC_INTEGER_16 *restrict bbase_y;
kono
parents:
diff changeset
2343 GFC_INTEGER_16 *restrict dest_y;
kono
parents:
diff changeset
2344 GFC_INTEGER_16 s;
kono
parents:
diff changeset
2345
kono
parents:
diff changeset
2346 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
2347 {
kono
parents:
diff changeset
2348 bbase_y = &bbase[y*bystride];
kono
parents:
diff changeset
2349 dest_y = &dest[y*rystride];
kono
parents:
diff changeset
2350 for (x = 0; x < xcount; x++)
kono
parents:
diff changeset
2351 {
kono
parents:
diff changeset
2352 abase_x = &abase[x*axstride];
kono
parents:
diff changeset
2353 s = (GFC_INTEGER_16) 0;
kono
parents:
diff changeset
2354 for (n = 0; n < count; n++)
kono
parents:
diff changeset
2355 s += abase_x[n*aystride] * bbase_y[n*bxstride];
kono
parents:
diff changeset
2356 dest_y[x*rxstride] = s;
kono
parents:
diff changeset
2357 }
kono
parents:
diff changeset
2358 }
kono
parents:
diff changeset
2359 }
kono
parents:
diff changeset
2360 }
kono
parents:
diff changeset
2361 #undef POW3
kono
parents:
diff changeset
2362 #undef min
kono
parents:
diff changeset
2363 #undef max
kono
parents:
diff changeset
2364
kono
parents:
diff changeset
2365
kono
parents:
diff changeset
2366 /* Compiling main function, with selection code for the processor. */
kono
parents:
diff changeset
2367
kono
parents:
diff changeset
2368 /* Currently, this is i386 only. Adjust for other architectures. */
kono
parents:
diff changeset
2369
kono
parents:
diff changeset
2370 #include <config/i386/cpuinfo.h>
kono
parents:
diff changeset
2371 void matmul_i16 (gfc_array_i16 * const restrict retarray,
kono
parents:
diff changeset
2372 gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
kono
parents:
diff changeset
2373 int blas_limit, blas_call gemm)
kono
parents:
diff changeset
2374 {
kono
parents:
diff changeset
2375 static void (*matmul_p) (gfc_array_i16 * const restrict retarray,
kono
parents:
diff changeset
2376 gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
kono
parents:
diff changeset
2377 int blas_limit, blas_call gemm);
kono
parents:
diff changeset
2378
kono
parents:
diff changeset
2379 void (*matmul_fn) (gfc_array_i16 * const restrict retarray,
kono
parents:
diff changeset
2380 gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
kono
parents:
diff changeset
2381 int blas_limit, blas_call gemm);
kono
parents:
diff changeset
2382
kono
parents:
diff changeset
2383 matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED);
kono
parents:
diff changeset
2384 if (matmul_fn == NULL)
kono
parents:
diff changeset
2385 {
kono
parents:
diff changeset
2386 matmul_fn = matmul_i16_vanilla;
kono
parents:
diff changeset
2387 if (__cpu_model.__cpu_vendor == VENDOR_INTEL)
kono
parents:
diff changeset
2388 {
kono
parents:
diff changeset
2389 /* Run down the available processors in order of preference. */
kono
parents:
diff changeset
2390 #ifdef HAVE_AVX512F
kono
parents:
diff changeset
2391 if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F))
kono
parents:
diff changeset
2392 {
kono
parents:
diff changeset
2393 matmul_fn = matmul_i16_avx512f;
kono
parents:
diff changeset
2394 goto store;
kono
parents:
diff changeset
2395 }
kono
parents:
diff changeset
2396
kono
parents:
diff changeset
2397 #endif /* HAVE_AVX512F */
kono
parents:
diff changeset
2398
kono
parents:
diff changeset
2399 #ifdef HAVE_AVX2
kono
parents:
diff changeset
2400 if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
kono
parents:
diff changeset
2401 && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
kono
parents:
diff changeset
2402 {
kono
parents:
diff changeset
2403 matmul_fn = matmul_i16_avx2;
kono
parents:
diff changeset
2404 goto store;
kono
parents:
diff changeset
2405 }
kono
parents:
diff changeset
2406
kono
parents:
diff changeset
2407 #endif
kono
parents:
diff changeset
2408
kono
parents:
diff changeset
2409 #ifdef HAVE_AVX
kono
parents:
diff changeset
2410 if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
kono
parents:
diff changeset
2411 {
kono
parents:
diff changeset
2412 matmul_fn = matmul_i16_avx;
kono
parents:
diff changeset
2413 goto store;
kono
parents:
diff changeset
2414 }
kono
parents:
diff changeset
2415 #endif /* HAVE_AVX */
kono
parents:
diff changeset
2416 }
kono
parents:
diff changeset
2417 else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
kono
parents:
diff changeset
2418 {
kono
parents:
diff changeset
2419 #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
kono
parents:
diff changeset
2420 if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
kono
parents:
diff changeset
2421 && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
kono
parents:
diff changeset
2422 {
kono
parents:
diff changeset
2423 matmul_fn = matmul_i16_avx128_fma3;
kono
parents:
diff changeset
2424 goto store;
kono
parents:
diff changeset
2425 }
kono
parents:
diff changeset
2426 #endif
kono
parents:
diff changeset
2427 #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
kono
parents:
diff changeset
2428 if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
kono
parents:
diff changeset
2429 && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
kono
parents:
diff changeset
2430 {
kono
parents:
diff changeset
2431 matmul_fn = matmul_i16_avx128_fma4;
kono
parents:
diff changeset
2432 goto store;
kono
parents:
diff changeset
2433 }
kono
parents:
diff changeset
2434 #endif
kono
parents:
diff changeset
2435
kono
parents:
diff changeset
2436 }
kono
parents:
diff changeset
2437 store:
kono
parents:
diff changeset
2438 __atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
kono
parents:
diff changeset
2439 }
kono
parents:
diff changeset
2440
kono
parents:
diff changeset
2441 (*matmul_fn) (retarray, a, b, try_blas, blas_limit, gemm);
kono
parents:
diff changeset
2442 }
kono
parents:
diff changeset
2443
kono
parents:
diff changeset
2444 #else /* Just the vanilla function. */
kono
parents:
diff changeset
2445
kono
parents:
diff changeset
2446 void
kono
parents:
diff changeset
2447 matmul_i16 (gfc_array_i16 * const restrict retarray,
kono
parents:
diff changeset
2448 gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
kono
parents:
diff changeset
2449 int blas_limit, blas_call gemm)
kono
parents:
diff changeset
2450 {
kono
parents:
diff changeset
2451 const GFC_INTEGER_16 * restrict abase;
kono
parents:
diff changeset
2452 const GFC_INTEGER_16 * restrict bbase;
kono
parents:
diff changeset
2453 GFC_INTEGER_16 * restrict dest;
kono
parents:
diff changeset
2454
kono
parents:
diff changeset
2455 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
kono
parents:
diff changeset
2456 index_type x, y, n, count, xcount, ycount;
kono
parents:
diff changeset
2457
kono
parents:
diff changeset
2458 assert (GFC_DESCRIPTOR_RANK (a) == 2
kono
parents:
diff changeset
2459 || GFC_DESCRIPTOR_RANK (b) == 2);
kono
parents:
diff changeset
2460
kono
parents:
diff changeset
2461 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
kono
parents:
diff changeset
2462
kono
parents:
diff changeset
2463 Either A or B (but not both) can be rank 1:
kono
parents:
diff changeset
2464
kono
parents:
diff changeset
2465 o One-dimensional argument A is implicitly treated as a row matrix
kono
parents:
diff changeset
2466 dimensioned [1,count], so xcount=1.
kono
parents:
diff changeset
2467
kono
parents:
diff changeset
2468 o One-dimensional argument B is implicitly treated as a column matrix
kono
parents:
diff changeset
2469 dimensioned [count, 1], so ycount=1.
kono
parents:
diff changeset
2470 */
kono
parents:
diff changeset
2471
kono
parents:
diff changeset
2472 if (retarray->base_addr == NULL)
kono
parents:
diff changeset
2473 {
kono
parents:
diff changeset
2474 if (GFC_DESCRIPTOR_RANK (a) == 1)
kono
parents:
diff changeset
2475 {
kono
parents:
diff changeset
2476 GFC_DIMENSION_SET(retarray->dim[0], 0,
kono
parents:
diff changeset
2477 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
kono
parents:
diff changeset
2478 }
kono
parents:
diff changeset
2479 else if (GFC_DESCRIPTOR_RANK (b) == 1)
kono
parents:
diff changeset
2480 {
kono
parents:
diff changeset
2481 GFC_DIMENSION_SET(retarray->dim[0], 0,
kono
parents:
diff changeset
2482 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
kono
parents:
diff changeset
2483 }
kono
parents:
diff changeset
2484 else
kono
parents:
diff changeset
2485 {
kono
parents:
diff changeset
2486 GFC_DIMENSION_SET(retarray->dim[0], 0,
kono
parents:
diff changeset
2487 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
kono
parents:
diff changeset
2488
kono
parents:
diff changeset
2489 GFC_DIMENSION_SET(retarray->dim[1], 0,
kono
parents:
diff changeset
2490 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
kono
parents:
diff changeset
2491 GFC_DESCRIPTOR_EXTENT(retarray,0));
kono
parents:
diff changeset
2492 }
kono
parents:
diff changeset
2493
kono
parents:
diff changeset
2494 retarray->base_addr
kono
parents:
diff changeset
2495 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_16));
kono
parents:
diff changeset
2496 retarray->offset = 0;
kono
parents:
diff changeset
2497 }
kono
parents:
diff changeset
2498 else if (unlikely (compile_options.bounds_check))
kono
parents:
diff changeset
2499 {
kono
parents:
diff changeset
2500 index_type ret_extent, arg_extent;
kono
parents:
diff changeset
2501
kono
parents:
diff changeset
2502 if (GFC_DESCRIPTOR_RANK (a) == 1)
kono
parents:
diff changeset
2503 {
kono
parents:
diff changeset
2504 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
kono
parents:
diff changeset
2505 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
kono
parents:
diff changeset
2506 if (arg_extent != ret_extent)
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
2507 runtime_error ("Array bound mismatch for dimension 1 of "
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
2508 "array (%ld/%ld) ",
111
kono
parents:
diff changeset
2509 (long int) ret_extent, (long int) arg_extent);
kono
parents:
diff changeset
2510 }
kono
parents:
diff changeset
2511 else if (GFC_DESCRIPTOR_RANK (b) == 1)
kono
parents:
diff changeset
2512 {
kono
parents:
diff changeset
2513 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
kono
parents:
diff changeset
2514 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
kono
parents:
diff changeset
2515 if (arg_extent != ret_extent)
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
2516 runtime_error ("Array bound mismatch for dimension 1 of "
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
2517 "array (%ld/%ld) ",
111
kono
parents:
diff changeset
2518 (long int) ret_extent, (long int) arg_extent);
kono
parents:
diff changeset
2519 }
kono
parents:
diff changeset
2520 else
kono
parents:
diff changeset
2521 {
kono
parents:
diff changeset
2522 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
kono
parents:
diff changeset
2523 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
kono
parents:
diff changeset
2524 if (arg_extent != ret_extent)
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
2525 runtime_error ("Array bound mismatch for dimension 1 of "
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
2526 "array (%ld/%ld) ",
111
kono
parents:
diff changeset
2527 (long int) ret_extent, (long int) arg_extent);
kono
parents:
diff changeset
2528
kono
parents:
diff changeset
2529 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
kono
parents:
diff changeset
2530 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
kono
parents:
diff changeset
2531 if (arg_extent != ret_extent)
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
2532 runtime_error ("Array bound mismatch for dimension 2 of "
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
2533 "array (%ld/%ld) ",
111
kono
parents:
diff changeset
2534 (long int) ret_extent, (long int) arg_extent);
kono
parents:
diff changeset
2535 }
kono
parents:
diff changeset
2536 }
kono
parents:
diff changeset
2537
kono
parents:
diff changeset
2538
kono
parents:
diff changeset
2539 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
kono
parents:
diff changeset
2540 {
kono
parents:
diff changeset
2541 /* One-dimensional result may be addressed in the code below
kono
parents:
diff changeset
2542 either as a row or a column matrix. We want both cases to
kono
parents:
diff changeset
2543 work. */
kono
parents:
diff changeset
2544 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
kono
parents:
diff changeset
2545 }
kono
parents:
diff changeset
2546 else
kono
parents:
diff changeset
2547 {
kono
parents:
diff changeset
2548 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
kono
parents:
diff changeset
2549 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
kono
parents:
diff changeset
2550 }
kono
parents:
diff changeset
2551
kono
parents:
diff changeset
2552
kono
parents:
diff changeset
2553 if (GFC_DESCRIPTOR_RANK (a) == 1)
kono
parents:
diff changeset
2554 {
kono
parents:
diff changeset
2555 /* Treat it as a a row matrix A[1,count]. */
kono
parents:
diff changeset
2556 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
kono
parents:
diff changeset
2557 aystride = 1;
kono
parents:
diff changeset
2558
kono
parents:
diff changeset
2559 xcount = 1;
kono
parents:
diff changeset
2560 count = GFC_DESCRIPTOR_EXTENT(a,0);
kono
parents:
diff changeset
2561 }
kono
parents:
diff changeset
2562 else
kono
parents:
diff changeset
2563 {
kono
parents:
diff changeset
2564 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
kono
parents:
diff changeset
2565 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
kono
parents:
diff changeset
2566
kono
parents:
diff changeset
2567 count = GFC_DESCRIPTOR_EXTENT(a,1);
kono
parents:
diff changeset
2568 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
kono
parents:
diff changeset
2569 }
kono
parents:
diff changeset
2570
kono
parents:
diff changeset
2571 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
kono
parents:
diff changeset
2572 {
kono
parents:
diff changeset
2573 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
2574 runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
2575 "in dimension 1: is %ld, should be %ld",
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
2576 (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
111
kono
parents:
diff changeset
2577 }
kono
parents:
diff changeset
2578
kono
parents:
diff changeset
2579 if (GFC_DESCRIPTOR_RANK (b) == 1)
kono
parents:
diff changeset
2580 {
kono
parents:
diff changeset
2581 /* Treat it as a column matrix B[count,1] */
kono
parents:
diff changeset
2582 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
kono
parents:
diff changeset
2583
kono
parents:
diff changeset
2584 /* bystride should never be used for 1-dimensional b.
kono
parents:
diff changeset
2585 The value is only used for calculation of the
kono
parents:
diff changeset
2586 memory by the buffer. */
kono
parents:
diff changeset
2587 bystride = 256;
kono
parents:
diff changeset
2588 ycount = 1;
kono
parents:
diff changeset
2589 }
kono
parents:
diff changeset
2590 else
kono
parents:
diff changeset
2591 {
kono
parents:
diff changeset
2592 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
kono
parents:
diff changeset
2593 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
kono
parents:
diff changeset
2594 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
kono
parents:
diff changeset
2595 }
kono
parents:
diff changeset
2596
kono
parents:
diff changeset
2597 abase = a->base_addr;
kono
parents:
diff changeset
2598 bbase = b->base_addr;
kono
parents:
diff changeset
2599 dest = retarray->base_addr;
kono
parents:
diff changeset
2600
kono
parents:
diff changeset
2601 /* Now that everything is set up, we perform the multiplication
kono
parents:
diff changeset
2602 itself. */
kono
parents:
diff changeset
2603
kono
parents:
diff changeset
2604 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
kono
parents:
diff changeset
2605 #define min(a,b) ((a) <= (b) ? (a) : (b))
kono
parents:
diff changeset
2606 #define max(a,b) ((a) >= (b) ? (a) : (b))
kono
parents:
diff changeset
2607
kono
parents:
diff changeset
2608 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
kono
parents:
diff changeset
2609 && (bxstride == 1 || bystride == 1)
kono
parents:
diff changeset
2610 && (((float) xcount) * ((float) ycount) * ((float) count)
kono
parents:
diff changeset
2611 > POW3(blas_limit)))
kono
parents:
diff changeset
2612 {
kono
parents:
diff changeset
2613 const int m = xcount, n = ycount, k = count, ldc = rystride;
kono
parents:
diff changeset
2614 const GFC_INTEGER_16 one = 1, zero = 0;
kono
parents:
diff changeset
2615 const int lda = (axstride == 1) ? aystride : axstride,
kono
parents:
diff changeset
2616 ldb = (bxstride == 1) ? bystride : bxstride;
kono
parents:
diff changeset
2617
kono
parents:
diff changeset
2618 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
kono
parents:
diff changeset
2619 {
kono
parents:
diff changeset
2620 assert (gemm != NULL);
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
2621 const char *transa, *transb;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
2622 if (try_blas & 2)
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
2623 transa = "C";
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
2624 else
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
2625 transa = axstride == 1 ? "N" : "T";
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
2626
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
2627 if (try_blas & 4)
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
2628 transb = "C";
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
2629 else
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
2630 transb = bxstride == 1 ? "N" : "T";
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
2631
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
2632 gemm (transa, transb , &m,
111
kono
parents:
diff changeset
2633 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
kono
parents:
diff changeset
2634 &ldc, 1, 1);
kono
parents:
diff changeset
2635 return;
kono
parents:
diff changeset
2636 }
kono
parents:
diff changeset
2637 }
kono
parents:
diff changeset
2638
kono
parents:
diff changeset
2639 if (rxstride == 1 && axstride == 1 && bxstride == 1)
kono
parents:
diff changeset
2640 {
kono
parents:
diff changeset
2641 /* This block of code implements a tuned matmul, derived from
kono
parents:
diff changeset
2642 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
kono
parents:
diff changeset
2643
kono
parents:
diff changeset
2644 Bo Kagstrom and Per Ling
kono
parents:
diff changeset
2645 Department of Computing Science
kono
parents:
diff changeset
2646 Umea University
kono
parents:
diff changeset
2647 S-901 87 Umea, Sweden
kono
parents:
diff changeset
2648
kono
parents:
diff changeset
2649 from netlib.org, translated to C, and modified for matmul.m4. */
kono
parents:
diff changeset
2650
kono
parents:
diff changeset
2651 const GFC_INTEGER_16 *a, *b;
kono
parents:
diff changeset
2652 GFC_INTEGER_16 *c;
kono
parents:
diff changeset
2653 const index_type m = xcount, n = ycount, k = count;
kono
parents:
diff changeset
2654
kono
parents:
diff changeset
2655 /* System generated locals */
kono
parents:
diff changeset
2656 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
kono
parents:
diff changeset
2657 i1, i2, i3, i4, i5, i6;
kono
parents:
diff changeset
2658
kono
parents:
diff changeset
2659 /* Local variables */
kono
parents:
diff changeset
2660 GFC_INTEGER_16 f11, f12, f21, f22, f31, f32, f41, f42,
kono
parents:
diff changeset
2661 f13, f14, f23, f24, f33, f34, f43, f44;
kono
parents:
diff changeset
2662 index_type i, j, l, ii, jj, ll;
kono
parents:
diff changeset
2663 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
kono
parents:
diff changeset
2664 GFC_INTEGER_16 *t1;
kono
parents:
diff changeset
2665
kono
parents:
diff changeset
2666 a = abase;
kono
parents:
diff changeset
2667 b = bbase;
kono
parents:
diff changeset
2668 c = retarray->base_addr;
kono
parents:
diff changeset
2669
kono
parents:
diff changeset
2670 /* Parameter adjustments */
kono
parents:
diff changeset
2671 c_dim1 = rystride;
kono
parents:
diff changeset
2672 c_offset = 1 + c_dim1;
kono
parents:
diff changeset
2673 c -= c_offset;
kono
parents:
diff changeset
2674 a_dim1 = aystride;
kono
parents:
diff changeset
2675 a_offset = 1 + a_dim1;
kono
parents:
diff changeset
2676 a -= a_offset;
kono
parents:
diff changeset
2677 b_dim1 = bystride;
kono
parents:
diff changeset
2678 b_offset = 1 + b_dim1;
kono
parents:
diff changeset
2679 b -= b_offset;
kono
parents:
diff changeset
2680
kono
parents:
diff changeset
2681 /* Empty c first. */
kono
parents:
diff changeset
2682 for (j=1; j<=n; j++)
kono
parents:
diff changeset
2683 for (i=1; i<=m; i++)
kono
parents:
diff changeset
2684 c[i + j * c_dim1] = (GFC_INTEGER_16)0;
kono
parents:
diff changeset
2685
kono
parents:
diff changeset
2686 /* Early exit if possible */
kono
parents:
diff changeset
2687 if (m == 0 || n == 0 || k == 0)
kono
parents:
diff changeset
2688 return;
kono
parents:
diff changeset
2689
kono
parents:
diff changeset
2690 /* Adjust size of t1 to what is needed. */
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
2691 index_type t1_dim, a_sz;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
2692 if (aystride == 1)
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
2693 a_sz = rystride;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
2694 else
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
2695 a_sz = a_dim1;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
2696
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
2697 t1_dim = a_sz * 256 + b_dim1;
111
kono
parents:
diff changeset
2698 if (t1_dim > 65536)
kono
parents:
diff changeset
2699 t1_dim = 65536;
kono
parents:
diff changeset
2700
kono
parents:
diff changeset
2701 t1 = malloc (t1_dim * sizeof(GFC_INTEGER_16));
kono
parents:
diff changeset
2702
kono
parents:
diff changeset
2703 /* Start turning the crank. */
kono
parents:
diff changeset
2704 i1 = n;
kono
parents:
diff changeset
2705 for (jj = 1; jj <= i1; jj += 512)
kono
parents:
diff changeset
2706 {
kono
parents:
diff changeset
2707 /* Computing MIN */
kono
parents:
diff changeset
2708 i2 = 512;
kono
parents:
diff changeset
2709 i3 = n - jj + 1;
kono
parents:
diff changeset
2710 jsec = min(i2,i3);
kono
parents:
diff changeset
2711 ujsec = jsec - jsec % 4;
kono
parents:
diff changeset
2712 i2 = k;
kono
parents:
diff changeset
2713 for (ll = 1; ll <= i2; ll += 256)
kono
parents:
diff changeset
2714 {
kono
parents:
diff changeset
2715 /* Computing MIN */
kono
parents:
diff changeset
2716 i3 = 256;
kono
parents:
diff changeset
2717 i4 = k - ll + 1;
kono
parents:
diff changeset
2718 lsec = min(i3,i4);
kono
parents:
diff changeset
2719 ulsec = lsec - lsec % 2;
kono
parents:
diff changeset
2720
kono
parents:
diff changeset
2721 i3 = m;
kono
parents:
diff changeset
2722 for (ii = 1; ii <= i3; ii += 256)
kono
parents:
diff changeset
2723 {
kono
parents:
diff changeset
2724 /* Computing MIN */
kono
parents:
diff changeset
2725 i4 = 256;
kono
parents:
diff changeset
2726 i5 = m - ii + 1;
kono
parents:
diff changeset
2727 isec = min(i4,i5);
kono
parents:
diff changeset
2728 uisec = isec - isec % 2;
kono
parents:
diff changeset
2729 i4 = ll + ulsec - 1;
kono
parents:
diff changeset
2730 for (l = ll; l <= i4; l += 2)
kono
parents:
diff changeset
2731 {
kono
parents:
diff changeset
2732 i5 = ii + uisec - 1;
kono
parents:
diff changeset
2733 for (i = ii; i <= i5; i += 2)
kono
parents:
diff changeset
2734 {
kono
parents:
diff changeset
2735 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
kono
parents:
diff changeset
2736 a[i + l * a_dim1];
kono
parents:
diff changeset
2737 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
kono
parents:
diff changeset
2738 a[i + (l + 1) * a_dim1];
kono
parents:
diff changeset
2739 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
kono
parents:
diff changeset
2740 a[i + 1 + l * a_dim1];
kono
parents:
diff changeset
2741 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
kono
parents:
diff changeset
2742 a[i + 1 + (l + 1) * a_dim1];
kono
parents:
diff changeset
2743 }
kono
parents:
diff changeset
2744 if (uisec < isec)
kono
parents:
diff changeset
2745 {
kono
parents:
diff changeset
2746 t1[l - ll + 1 + (isec << 8) - 257] =
kono
parents:
diff changeset
2747 a[ii + isec - 1 + l * a_dim1];
kono
parents:
diff changeset
2748 t1[l - ll + 2 + (isec << 8) - 257] =
kono
parents:
diff changeset
2749 a[ii + isec - 1 + (l + 1) * a_dim1];
kono
parents:
diff changeset
2750 }
kono
parents:
diff changeset
2751 }
kono
parents:
diff changeset
2752 if (ulsec < lsec)
kono
parents:
diff changeset
2753 {
kono
parents:
diff changeset
2754 i4 = ii + isec - 1;
kono
parents:
diff changeset
2755 for (i = ii; i<= i4; ++i)
kono
parents:
diff changeset
2756 {
kono
parents:
diff changeset
2757 t1[lsec + ((i - ii + 1) << 8) - 257] =
kono
parents:
diff changeset
2758 a[i + (ll + lsec - 1) * a_dim1];
kono
parents:
diff changeset
2759 }
kono
parents:
diff changeset
2760 }
kono
parents:
diff changeset
2761
kono
parents:
diff changeset
2762 uisec = isec - isec % 4;
kono
parents:
diff changeset
2763 i4 = jj + ujsec - 1;
kono
parents:
diff changeset
2764 for (j = jj; j <= i4; j += 4)
kono
parents:
diff changeset
2765 {
kono
parents:
diff changeset
2766 i5 = ii + uisec - 1;
kono
parents:
diff changeset
2767 for (i = ii; i <= i5; i += 4)
kono
parents:
diff changeset
2768 {
kono
parents:
diff changeset
2769 f11 = c[i + j * c_dim1];
kono
parents:
diff changeset
2770 f21 = c[i + 1 + j * c_dim1];
kono
parents:
diff changeset
2771 f12 = c[i + (j + 1) * c_dim1];
kono
parents:
diff changeset
2772 f22 = c[i + 1 + (j + 1) * c_dim1];
kono
parents:
diff changeset
2773 f13 = c[i + (j + 2) * c_dim1];
kono
parents:
diff changeset
2774 f23 = c[i + 1 + (j + 2) * c_dim1];
kono
parents:
diff changeset
2775 f14 = c[i + (j + 3) * c_dim1];
kono
parents:
diff changeset
2776 f24 = c[i + 1 + (j + 3) * c_dim1];
kono
parents:
diff changeset
2777 f31 = c[i + 2 + j * c_dim1];
kono
parents:
diff changeset
2778 f41 = c[i + 3 + j * c_dim1];
kono
parents:
diff changeset
2779 f32 = c[i + 2 + (j + 1) * c_dim1];
kono
parents:
diff changeset
2780 f42 = c[i + 3 + (j + 1) * c_dim1];
kono
parents:
diff changeset
2781 f33 = c[i + 2 + (j + 2) * c_dim1];
kono
parents:
diff changeset
2782 f43 = c[i + 3 + (j + 2) * c_dim1];
kono
parents:
diff changeset
2783 f34 = c[i + 2 + (j + 3) * c_dim1];
kono
parents:
diff changeset
2784 f44 = c[i + 3 + (j + 3) * c_dim1];
kono
parents:
diff changeset
2785 i6 = ll + lsec - 1;
kono
parents:
diff changeset
2786 for (l = ll; l <= i6; ++l)
kono
parents:
diff changeset
2787 {
kono
parents:
diff changeset
2788 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
kono
parents:
diff changeset
2789 * b[l + j * b_dim1];
kono
parents:
diff changeset
2790 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
kono
parents:
diff changeset
2791 * b[l + j * b_dim1];
kono
parents:
diff changeset
2792 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
kono
parents:
diff changeset
2793 * b[l + (j + 1) * b_dim1];
kono
parents:
diff changeset
2794 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
kono
parents:
diff changeset
2795 * b[l + (j + 1) * b_dim1];
kono
parents:
diff changeset
2796 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
kono
parents:
diff changeset
2797 * b[l + (j + 2) * b_dim1];
kono
parents:
diff changeset
2798 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
kono
parents:
diff changeset
2799 * b[l + (j + 2) * b_dim1];
kono
parents:
diff changeset
2800 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
kono
parents:
diff changeset
2801 * b[l + (j + 3) * b_dim1];
kono
parents:
diff changeset
2802 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
kono
parents:
diff changeset
2803 * b[l + (j + 3) * b_dim1];
kono
parents:
diff changeset
2804 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
kono
parents:
diff changeset
2805 * b[l + j * b_dim1];
kono
parents:
diff changeset
2806 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
kono
parents:
diff changeset
2807 * b[l + j * b_dim1];
kono
parents:
diff changeset
2808 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
kono
parents:
diff changeset
2809 * b[l + (j + 1) * b_dim1];
kono
parents:
diff changeset
2810 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
kono
parents:
diff changeset
2811 * b[l + (j + 1) * b_dim1];
kono
parents:
diff changeset
2812 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
kono
parents:
diff changeset
2813 * b[l + (j + 2) * b_dim1];
kono
parents:
diff changeset
2814 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
kono
parents:
diff changeset
2815 * b[l + (j + 2) * b_dim1];
kono
parents:
diff changeset
2816 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
kono
parents:
diff changeset
2817 * b[l + (j + 3) * b_dim1];
kono
parents:
diff changeset
2818 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
kono
parents:
diff changeset
2819 * b[l + (j + 3) * b_dim1];
kono
parents:
diff changeset
2820 }
kono
parents:
diff changeset
2821 c[i + j * c_dim1] = f11;
kono
parents:
diff changeset
2822 c[i + 1 + j * c_dim1] = f21;
kono
parents:
diff changeset
2823 c[i + (j + 1) * c_dim1] = f12;
kono
parents:
diff changeset
2824 c[i + 1 + (j + 1) * c_dim1] = f22;
kono
parents:
diff changeset
2825 c[i + (j + 2) * c_dim1] = f13;
kono
parents:
diff changeset
2826 c[i + 1 + (j + 2) * c_dim1] = f23;
kono
parents:
diff changeset
2827 c[i + (j + 3) * c_dim1] = f14;
kono
parents:
diff changeset
2828 c[i + 1 + (j + 3) * c_dim1] = f24;
kono
parents:
diff changeset
2829 c[i + 2 + j * c_dim1] = f31;
kono
parents:
diff changeset
2830 c[i + 3 + j * c_dim1] = f41;
kono
parents:
diff changeset
2831 c[i + 2 + (j + 1) * c_dim1] = f32;
kono
parents:
diff changeset
2832 c[i + 3 + (j + 1) * c_dim1] = f42;
kono
parents:
diff changeset
2833 c[i + 2 + (j + 2) * c_dim1] = f33;
kono
parents:
diff changeset
2834 c[i + 3 + (j + 2) * c_dim1] = f43;
kono
parents:
diff changeset
2835 c[i + 2 + (j + 3) * c_dim1] = f34;
kono
parents:
diff changeset
2836 c[i + 3 + (j + 3) * c_dim1] = f44;
kono
parents:
diff changeset
2837 }
kono
parents:
diff changeset
2838 if (uisec < isec)
kono
parents:
diff changeset
2839 {
kono
parents:
diff changeset
2840 i5 = ii + isec - 1;
kono
parents:
diff changeset
2841 for (i = ii + uisec; i <= i5; ++i)
kono
parents:
diff changeset
2842 {
kono
parents:
diff changeset
2843 f11 = c[i + j * c_dim1];
kono
parents:
diff changeset
2844 f12 = c[i + (j + 1) * c_dim1];
kono
parents:
diff changeset
2845 f13 = c[i + (j + 2) * c_dim1];
kono
parents:
diff changeset
2846 f14 = c[i + (j + 3) * c_dim1];
kono
parents:
diff changeset
2847 i6 = ll + lsec - 1;
kono
parents:
diff changeset
2848 for (l = ll; l <= i6; ++l)
kono
parents:
diff changeset
2849 {
kono
parents:
diff changeset
2850 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
2851 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
2852 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
2853 257] * b[l + (j + 1) * b_dim1];
kono
parents:
diff changeset
2854 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
2855 257] * b[l + (j + 2) * b_dim1];
kono
parents:
diff changeset
2856 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
2857 257] * b[l + (j + 3) * b_dim1];
kono
parents:
diff changeset
2858 }
kono
parents:
diff changeset
2859 c[i + j * c_dim1] = f11;
kono
parents:
diff changeset
2860 c[i + (j + 1) * c_dim1] = f12;
kono
parents:
diff changeset
2861 c[i + (j + 2) * c_dim1] = f13;
kono
parents:
diff changeset
2862 c[i + (j + 3) * c_dim1] = f14;
kono
parents:
diff changeset
2863 }
kono
parents:
diff changeset
2864 }
kono
parents:
diff changeset
2865 }
kono
parents:
diff changeset
2866 if (ujsec < jsec)
kono
parents:
diff changeset
2867 {
kono
parents:
diff changeset
2868 i4 = jj + jsec - 1;
kono
parents:
diff changeset
2869 for (j = jj + ujsec; j <= i4; ++j)
kono
parents:
diff changeset
2870 {
kono
parents:
diff changeset
2871 i5 = ii + uisec - 1;
kono
parents:
diff changeset
2872 for (i = ii; i <= i5; i += 4)
kono
parents:
diff changeset
2873 {
kono
parents:
diff changeset
2874 f11 = c[i + j * c_dim1];
kono
parents:
diff changeset
2875 f21 = c[i + 1 + j * c_dim1];
kono
parents:
diff changeset
2876 f31 = c[i + 2 + j * c_dim1];
kono
parents:
diff changeset
2877 f41 = c[i + 3 + j * c_dim1];
kono
parents:
diff changeset
2878 i6 = ll + lsec - 1;
kono
parents:
diff changeset
2879 for (l = ll; l <= i6; ++l)
kono
parents:
diff changeset
2880 {
kono
parents:
diff changeset
2881 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
2882 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
2883 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
kono
parents:
diff changeset
2884 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
2885 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
kono
parents:
diff changeset
2886 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
2887 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
kono
parents:
diff changeset
2888 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
2889 }
kono
parents:
diff changeset
2890 c[i + j * c_dim1] = f11;
kono
parents:
diff changeset
2891 c[i + 1 + j * c_dim1] = f21;
kono
parents:
diff changeset
2892 c[i + 2 + j * c_dim1] = f31;
kono
parents:
diff changeset
2893 c[i + 3 + j * c_dim1] = f41;
kono
parents:
diff changeset
2894 }
kono
parents:
diff changeset
2895 i5 = ii + isec - 1;
kono
parents:
diff changeset
2896 for (i = ii + uisec; i <= i5; ++i)
kono
parents:
diff changeset
2897 {
kono
parents:
diff changeset
2898 f11 = c[i + j * c_dim1];
kono
parents:
diff changeset
2899 i6 = ll + lsec - 1;
kono
parents:
diff changeset
2900 for (l = ll; l <= i6; ++l)
kono
parents:
diff changeset
2901 {
kono
parents:
diff changeset
2902 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
2903 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
2904 }
kono
parents:
diff changeset
2905 c[i + j * c_dim1] = f11;
kono
parents:
diff changeset
2906 }
kono
parents:
diff changeset
2907 }
kono
parents:
diff changeset
2908 }
kono
parents:
diff changeset
2909 }
kono
parents:
diff changeset
2910 }
kono
parents:
diff changeset
2911 }
kono
parents:
diff changeset
2912 free(t1);
kono
parents:
diff changeset
2913 return;
kono
parents:
diff changeset
2914 }
kono
parents:
diff changeset
2915 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
kono
parents:
diff changeset
2916 {
kono
parents:
diff changeset
2917 if (GFC_DESCRIPTOR_RANK (a) != 1)
kono
parents:
diff changeset
2918 {
kono
parents:
diff changeset
2919 const GFC_INTEGER_16 *restrict abase_x;
kono
parents:
diff changeset
2920 const GFC_INTEGER_16 *restrict bbase_y;
kono
parents:
diff changeset
2921 GFC_INTEGER_16 *restrict dest_y;
kono
parents:
diff changeset
2922 GFC_INTEGER_16 s;
kono
parents:
diff changeset
2923
kono
parents:
diff changeset
2924 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
2925 {
kono
parents:
diff changeset
2926 bbase_y = &bbase[y*bystride];
kono
parents:
diff changeset
2927 dest_y = &dest[y*rystride];
kono
parents:
diff changeset
2928 for (x = 0; x < xcount; x++)
kono
parents:
diff changeset
2929 {
kono
parents:
diff changeset
2930 abase_x = &abase[x*axstride];
kono
parents:
diff changeset
2931 s = (GFC_INTEGER_16) 0;
kono
parents:
diff changeset
2932 for (n = 0; n < count; n++)
kono
parents:
diff changeset
2933 s += abase_x[n] * bbase_y[n];
kono
parents:
diff changeset
2934 dest_y[x] = s;
kono
parents:
diff changeset
2935 }
kono
parents:
diff changeset
2936 }
kono
parents:
diff changeset
2937 }
kono
parents:
diff changeset
2938 else
kono
parents:
diff changeset
2939 {
kono
parents:
diff changeset
2940 const GFC_INTEGER_16 *restrict bbase_y;
kono
parents:
diff changeset
2941 GFC_INTEGER_16 s;
kono
parents:
diff changeset
2942
kono
parents:
diff changeset
2943 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
2944 {
kono
parents:
diff changeset
2945 bbase_y = &bbase[y*bystride];
kono
parents:
diff changeset
2946 s = (GFC_INTEGER_16) 0;
kono
parents:
diff changeset
2947 for (n = 0; n < count; n++)
kono
parents:
diff changeset
2948 s += abase[n*axstride] * bbase_y[n];
kono
parents:
diff changeset
2949 dest[y*rystride] = s;
kono
parents:
diff changeset
2950 }
kono
parents:
diff changeset
2951 }
kono
parents:
diff changeset
2952 }
kono
parents:
diff changeset
2953 else if (axstride < aystride)
kono
parents:
diff changeset
2954 {
kono
parents:
diff changeset
2955 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
2956 for (x = 0; x < xcount; x++)
kono
parents:
diff changeset
2957 dest[x*rxstride + y*rystride] = (GFC_INTEGER_16)0;
kono
parents:
diff changeset
2958
kono
parents:
diff changeset
2959 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
2960 for (n = 0; n < count; n++)
kono
parents:
diff changeset
2961 for (x = 0; x < xcount; x++)
kono
parents:
diff changeset
2962 /* dest[x,y] += a[x,n] * b[n,y] */
kono
parents:
diff changeset
2963 dest[x*rxstride + y*rystride] +=
kono
parents:
diff changeset
2964 abase[x*axstride + n*aystride] *
kono
parents:
diff changeset
2965 bbase[n*bxstride + y*bystride];
kono
parents:
diff changeset
2966 }
kono
parents:
diff changeset
2967 else if (GFC_DESCRIPTOR_RANK (a) == 1)
kono
parents:
diff changeset
2968 {
kono
parents:
diff changeset
2969 const GFC_INTEGER_16 *restrict bbase_y;
kono
parents:
diff changeset
2970 GFC_INTEGER_16 s;
kono
parents:
diff changeset
2971
kono
parents:
diff changeset
2972 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
2973 {
kono
parents:
diff changeset
2974 bbase_y = &bbase[y*bystride];
kono
parents:
diff changeset
2975 s = (GFC_INTEGER_16) 0;
kono
parents:
diff changeset
2976 for (n = 0; n < count; n++)
kono
parents:
diff changeset
2977 s += abase[n*axstride] * bbase_y[n*bxstride];
kono
parents:
diff changeset
2978 dest[y*rxstride] = s;
kono
parents:
diff changeset
2979 }
kono
parents:
diff changeset
2980 }
kono
parents:
diff changeset
2981 else
kono
parents:
diff changeset
2982 {
kono
parents:
diff changeset
2983 const GFC_INTEGER_16 *restrict abase_x;
kono
parents:
diff changeset
2984 const GFC_INTEGER_16 *restrict bbase_y;
kono
parents:
diff changeset
2985 GFC_INTEGER_16 *restrict dest_y;
kono
parents:
diff changeset
2986 GFC_INTEGER_16 s;
kono
parents:
diff changeset
2987
kono
parents:
diff changeset
2988 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
2989 {
kono
parents:
diff changeset
2990 bbase_y = &bbase[y*bystride];
kono
parents:
diff changeset
2991 dest_y = &dest[y*rystride];
kono
parents:
diff changeset
2992 for (x = 0; x < xcount; x++)
kono
parents:
diff changeset
2993 {
kono
parents:
diff changeset
2994 abase_x = &abase[x*axstride];
kono
parents:
diff changeset
2995 s = (GFC_INTEGER_16) 0;
kono
parents:
diff changeset
2996 for (n = 0; n < count; n++)
kono
parents:
diff changeset
2997 s += abase_x[n*aystride] * bbase_y[n*bxstride];
kono
parents:
diff changeset
2998 dest_y[x*rxstride] = s;
kono
parents:
diff changeset
2999 }
kono
parents:
diff changeset
3000 }
kono
parents:
diff changeset
3001 }
kono
parents:
diff changeset
3002 }
kono
parents:
diff changeset
3003 #undef POW3
kono
parents:
diff changeset
3004 #undef min
kono
parents:
diff changeset
3005 #undef max
kono
parents:
diff changeset
3006
kono
parents:
diff changeset
3007 #endif
kono
parents:
diff changeset
3008 #endif
kono
parents:
diff changeset
3009