annotate libgfortran/generated/matmul_i1.c @ 120:f93fa5091070

fix conv1.c
author mir3636
date Thu, 08 Mar 2018 14:53:42 +0900
parents 04ced10e8804
children 84e7813d76e9
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
111
kono
parents:
diff changeset
1 /* Implementation of the MATMUL intrinsic
kono
parents:
diff changeset
2 Copyright (C) 2002-2017 Free Software Foundation, Inc.
kono
parents:
diff changeset
3 Contributed by Paul Brook <paul@nowt.org>
kono
parents:
diff changeset
4
kono
parents:
diff changeset
5 This file is part of the GNU Fortran runtime library (libgfortran).
kono
parents:
diff changeset
6
kono
parents:
diff changeset
7 Libgfortran is free software; you can redistribute it and/or
kono
parents:
diff changeset
8 modify it under the terms of the GNU General Public
kono
parents:
diff changeset
9 License as published by the Free Software Foundation; either
kono
parents:
diff changeset
10 version 3 of the License, or (at your option) any later version.
kono
parents:
diff changeset
11
kono
parents:
diff changeset
12 Libgfortran is distributed in the hope that it will be useful,
kono
parents:
diff changeset
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
kono
parents:
diff changeset
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
kono
parents:
diff changeset
15 GNU General Public License for more details.
kono
parents:
diff changeset
16
kono
parents:
diff changeset
17 Under Section 7 of GPL version 3, you are granted additional
kono
parents:
diff changeset
18 permissions described in the GCC Runtime Library Exception, version
kono
parents:
diff changeset
19 3.1, as published by the Free Software Foundation.
kono
parents:
diff changeset
20
kono
parents:
diff changeset
21 You should have received a copy of the GNU General Public License and
kono
parents:
diff changeset
22 a copy of the GCC Runtime Library Exception along with this program;
kono
parents:
diff changeset
23 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
kono
parents:
diff changeset
24 <http://www.gnu.org/licenses/>. */
kono
parents:
diff changeset
25
kono
parents:
diff changeset
26 #include "libgfortran.h"
kono
parents:
diff changeset
27 #include <string.h>
kono
parents:
diff changeset
28 #include <assert.h>
kono
parents:
diff changeset
29
kono
parents:
diff changeset
30
kono
parents:
diff changeset
31 #if defined (HAVE_GFC_INTEGER_1)
kono
parents:
diff changeset
32
kono
parents:
diff changeset
33 /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
kono
parents:
diff changeset
34 passed to us by the front-end, in which case we call it for large
kono
parents:
diff changeset
35 matrices. */
kono
parents:
diff changeset
36
kono
parents:
diff changeset
37 typedef void (*blas_call)(const char *, const char *, const int *, const int *,
kono
parents:
diff changeset
38 const int *, const GFC_INTEGER_1 *, const GFC_INTEGER_1 *,
kono
parents:
diff changeset
39 const int *, const GFC_INTEGER_1 *, const int *,
kono
parents:
diff changeset
40 const GFC_INTEGER_1 *, GFC_INTEGER_1 *, const int *,
kono
parents:
diff changeset
41 int, int);
kono
parents:
diff changeset
42
kono
parents:
diff changeset
43 /* The order of loops is different in the case of plain matrix
kono
parents:
diff changeset
44 multiplication C=MATMUL(A,B), and in the frequent special case where
kono
parents:
diff changeset
45 the argument A is the temporary result of a TRANSPOSE intrinsic:
kono
parents:
diff changeset
46 C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by
kono
parents:
diff changeset
47 looking at their strides.
kono
parents:
diff changeset
48
kono
parents:
diff changeset
49 The equivalent Fortran pseudo-code is:
kono
parents:
diff changeset
50
kono
parents:
diff changeset
51 DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
kono
parents:
diff changeset
52 IF (.NOT.IS_TRANSPOSED(A)) THEN
kono
parents:
diff changeset
53 C = 0
kono
parents:
diff changeset
54 DO J=1,N
kono
parents:
diff changeset
55 DO K=1,COUNT
kono
parents:
diff changeset
56 DO I=1,M
kono
parents:
diff changeset
57 C(I,J) = C(I,J)+A(I,K)*B(K,J)
kono
parents:
diff changeset
58 ELSE
kono
parents:
diff changeset
59 DO J=1,N
kono
parents:
diff changeset
60 DO I=1,M
kono
parents:
diff changeset
61 S = 0
kono
parents:
diff changeset
62 DO K=1,COUNT
kono
parents:
diff changeset
63 S = S+A(I,K)*B(K,J)
kono
parents:
diff changeset
64 C(I,J) = S
kono
parents:
diff changeset
65 ENDIF
kono
parents:
diff changeset
66 */
kono
parents:
diff changeset
67
kono
parents:
diff changeset
68 /* If try_blas is set to a nonzero value, then the matmul function will
kono
parents:
diff changeset
69 see if there is a way to perform the matrix multiplication by a call
kono
parents:
diff changeset
70 to the BLAS gemm function. */
kono
parents:
diff changeset
71
kono
parents:
diff changeset
72 extern void matmul_i1 (gfc_array_i1 * const restrict retarray,
kono
parents:
diff changeset
73 gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
kono
parents:
diff changeset
74 int blas_limit, blas_call gemm);
kono
parents:
diff changeset
75 export_proto(matmul_i1);
kono
parents:
diff changeset
76
kono
parents:
diff changeset
77 /* Put exhaustive list of possible architectures here here, ORed together. */
kono
parents:
diff changeset
78
kono
parents:
diff changeset
79 #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
kono
parents:
diff changeset
80
kono
parents:
diff changeset
81 #ifdef HAVE_AVX
kono
parents:
diff changeset
82 static void
kono
parents:
diff changeset
83 matmul_i1_avx (gfc_array_i1 * const restrict retarray,
kono
parents:
diff changeset
84 gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
kono
parents:
diff changeset
85 int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
kono
parents:
diff changeset
86 static void
kono
parents:
diff changeset
87 matmul_i1_avx (gfc_array_i1 * const restrict retarray,
kono
parents:
diff changeset
88 gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
kono
parents:
diff changeset
89 int blas_limit, blas_call gemm)
kono
parents:
diff changeset
90 {
kono
parents:
diff changeset
91 const GFC_INTEGER_1 * restrict abase;
kono
parents:
diff changeset
92 const GFC_INTEGER_1 * restrict bbase;
kono
parents:
diff changeset
93 GFC_INTEGER_1 * restrict dest;
kono
parents:
diff changeset
94
kono
parents:
diff changeset
95 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
kono
parents:
diff changeset
96 index_type x, y, n, count, xcount, ycount;
kono
parents:
diff changeset
97
kono
parents:
diff changeset
98 assert (GFC_DESCRIPTOR_RANK (a) == 2
kono
parents:
diff changeset
99 || GFC_DESCRIPTOR_RANK (b) == 2);
kono
parents:
diff changeset
100
kono
parents:
diff changeset
101 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
kono
parents:
diff changeset
102
kono
parents:
diff changeset
103 Either A or B (but not both) can be rank 1:
kono
parents:
diff changeset
104
kono
parents:
diff changeset
105 o One-dimensional argument A is implicitly treated as a row matrix
kono
parents:
diff changeset
106 dimensioned [1,count], so xcount=1.
kono
parents:
diff changeset
107
kono
parents:
diff changeset
108 o One-dimensional argument B is implicitly treated as a column matrix
kono
parents:
diff changeset
109 dimensioned [count, 1], so ycount=1.
kono
parents:
diff changeset
110 */
kono
parents:
diff changeset
111
kono
parents:
diff changeset
112 if (retarray->base_addr == NULL)
kono
parents:
diff changeset
113 {
kono
parents:
diff changeset
114 if (GFC_DESCRIPTOR_RANK (a) == 1)
kono
parents:
diff changeset
115 {
kono
parents:
diff changeset
116 GFC_DIMENSION_SET(retarray->dim[0], 0,
kono
parents:
diff changeset
117 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
kono
parents:
diff changeset
118 }
kono
parents:
diff changeset
119 else if (GFC_DESCRIPTOR_RANK (b) == 1)
kono
parents:
diff changeset
120 {
kono
parents:
diff changeset
121 GFC_DIMENSION_SET(retarray->dim[0], 0,
kono
parents:
diff changeset
122 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
kono
parents:
diff changeset
123 }
kono
parents:
diff changeset
124 else
kono
parents:
diff changeset
125 {
kono
parents:
diff changeset
126 GFC_DIMENSION_SET(retarray->dim[0], 0,
kono
parents:
diff changeset
127 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
kono
parents:
diff changeset
128
kono
parents:
diff changeset
129 GFC_DIMENSION_SET(retarray->dim[1], 0,
kono
parents:
diff changeset
130 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
kono
parents:
diff changeset
131 GFC_DESCRIPTOR_EXTENT(retarray,0));
kono
parents:
diff changeset
132 }
kono
parents:
diff changeset
133
kono
parents:
diff changeset
134 retarray->base_addr
kono
parents:
diff changeset
135 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_1));
kono
parents:
diff changeset
136 retarray->offset = 0;
kono
parents:
diff changeset
137 }
kono
parents:
diff changeset
138 else if (unlikely (compile_options.bounds_check))
kono
parents:
diff changeset
139 {
kono
parents:
diff changeset
140 index_type ret_extent, arg_extent;
kono
parents:
diff changeset
141
kono
parents:
diff changeset
142 if (GFC_DESCRIPTOR_RANK (a) == 1)
kono
parents:
diff changeset
143 {
kono
parents:
diff changeset
144 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
kono
parents:
diff changeset
145 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
kono
parents:
diff changeset
146 if (arg_extent != ret_extent)
kono
parents:
diff changeset
147 runtime_error ("Incorrect extent in return array in"
kono
parents:
diff changeset
148 " MATMUL intrinsic: is %ld, should be %ld",
kono
parents:
diff changeset
149 (long int) ret_extent, (long int) arg_extent);
kono
parents:
diff changeset
150 }
kono
parents:
diff changeset
151 else if (GFC_DESCRIPTOR_RANK (b) == 1)
kono
parents:
diff changeset
152 {
kono
parents:
diff changeset
153 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
kono
parents:
diff changeset
154 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
kono
parents:
diff changeset
155 if (arg_extent != ret_extent)
kono
parents:
diff changeset
156 runtime_error ("Incorrect extent in return array in"
kono
parents:
diff changeset
157 " MATMUL intrinsic: is %ld, should be %ld",
kono
parents:
diff changeset
158 (long int) ret_extent, (long int) arg_extent);
kono
parents:
diff changeset
159 }
kono
parents:
diff changeset
160 else
kono
parents:
diff changeset
161 {
kono
parents:
diff changeset
162 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
kono
parents:
diff changeset
163 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
kono
parents:
diff changeset
164 if (arg_extent != ret_extent)
kono
parents:
diff changeset
165 runtime_error ("Incorrect extent in return array in"
kono
parents:
diff changeset
166 " MATMUL intrinsic for dimension 1:"
kono
parents:
diff changeset
167 " is %ld, should be %ld",
kono
parents:
diff changeset
168 (long int) ret_extent, (long int) arg_extent);
kono
parents:
diff changeset
169
kono
parents:
diff changeset
170 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
kono
parents:
diff changeset
171 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
kono
parents:
diff changeset
172 if (arg_extent != ret_extent)
kono
parents:
diff changeset
173 runtime_error ("Incorrect extent in return array in"
kono
parents:
diff changeset
174 " MATMUL intrinsic for dimension 2:"
kono
parents:
diff changeset
175 " is %ld, should be %ld",
kono
parents:
diff changeset
176 (long int) ret_extent, (long int) arg_extent);
kono
parents:
diff changeset
177 }
kono
parents:
diff changeset
178 }
kono
parents:
diff changeset
179
kono
parents:
diff changeset
180
kono
parents:
diff changeset
181 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
kono
parents:
diff changeset
182 {
kono
parents:
diff changeset
183 /* One-dimensional result may be addressed in the code below
kono
parents:
diff changeset
184 either as a row or a column matrix. We want both cases to
kono
parents:
diff changeset
185 work. */
kono
parents:
diff changeset
186 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
kono
parents:
diff changeset
187 }
kono
parents:
diff changeset
188 else
kono
parents:
diff changeset
189 {
kono
parents:
diff changeset
190 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
kono
parents:
diff changeset
191 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
kono
parents:
diff changeset
192 }
kono
parents:
diff changeset
193
kono
parents:
diff changeset
194
kono
parents:
diff changeset
195 if (GFC_DESCRIPTOR_RANK (a) == 1)
kono
parents:
diff changeset
196 {
kono
parents:
diff changeset
197 /* Treat it as a a row matrix A[1,count]. */
kono
parents:
diff changeset
198 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
kono
parents:
diff changeset
199 aystride = 1;
kono
parents:
diff changeset
200
kono
parents:
diff changeset
201 xcount = 1;
kono
parents:
diff changeset
202 count = GFC_DESCRIPTOR_EXTENT(a,0);
kono
parents:
diff changeset
203 }
kono
parents:
diff changeset
204 else
kono
parents:
diff changeset
205 {
kono
parents:
diff changeset
206 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
kono
parents:
diff changeset
207 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
kono
parents:
diff changeset
208
kono
parents:
diff changeset
209 count = GFC_DESCRIPTOR_EXTENT(a,1);
kono
parents:
diff changeset
210 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
kono
parents:
diff changeset
211 }
kono
parents:
diff changeset
212
kono
parents:
diff changeset
213 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
kono
parents:
diff changeset
214 {
kono
parents:
diff changeset
215 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
kono
parents:
diff changeset
216 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
kono
parents:
diff changeset
217 }
kono
parents:
diff changeset
218
kono
parents:
diff changeset
219 if (GFC_DESCRIPTOR_RANK (b) == 1)
kono
parents:
diff changeset
220 {
kono
parents:
diff changeset
221 /* Treat it as a column matrix B[count,1] */
kono
parents:
diff changeset
222 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
kono
parents:
diff changeset
223
kono
parents:
diff changeset
224 /* bystride should never be used for 1-dimensional b.
kono
parents:
diff changeset
225 The value is only used for calculation of the
kono
parents:
diff changeset
226 memory by the buffer. */
kono
parents:
diff changeset
227 bystride = 256;
kono
parents:
diff changeset
228 ycount = 1;
kono
parents:
diff changeset
229 }
kono
parents:
diff changeset
230 else
kono
parents:
diff changeset
231 {
kono
parents:
diff changeset
232 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
kono
parents:
diff changeset
233 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
kono
parents:
diff changeset
234 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
kono
parents:
diff changeset
235 }
kono
parents:
diff changeset
236
kono
parents:
diff changeset
237 abase = a->base_addr;
kono
parents:
diff changeset
238 bbase = b->base_addr;
kono
parents:
diff changeset
239 dest = retarray->base_addr;
kono
parents:
diff changeset
240
kono
parents:
diff changeset
241 /* Now that everything is set up, we perform the multiplication
kono
parents:
diff changeset
242 itself. */
kono
parents:
diff changeset
243
kono
parents:
diff changeset
244 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
kono
parents:
diff changeset
245 #define min(a,b) ((a) <= (b) ? (a) : (b))
kono
parents:
diff changeset
246 #define max(a,b) ((a) >= (b) ? (a) : (b))
kono
parents:
diff changeset
247
kono
parents:
diff changeset
248 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
kono
parents:
diff changeset
249 && (bxstride == 1 || bystride == 1)
kono
parents:
diff changeset
250 && (((float) xcount) * ((float) ycount) * ((float) count)
kono
parents:
diff changeset
251 > POW3(blas_limit)))
kono
parents:
diff changeset
252 {
kono
parents:
diff changeset
253 const int m = xcount, n = ycount, k = count, ldc = rystride;
kono
parents:
diff changeset
254 const GFC_INTEGER_1 one = 1, zero = 0;
kono
parents:
diff changeset
255 const int lda = (axstride == 1) ? aystride : axstride,
kono
parents:
diff changeset
256 ldb = (bxstride == 1) ? bystride : bxstride;
kono
parents:
diff changeset
257
kono
parents:
diff changeset
258 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
kono
parents:
diff changeset
259 {
kono
parents:
diff changeset
260 assert (gemm != NULL);
kono
parents:
diff changeset
261 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
kono
parents:
diff changeset
262 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
kono
parents:
diff changeset
263 &ldc, 1, 1);
kono
parents:
diff changeset
264 return;
kono
parents:
diff changeset
265 }
kono
parents:
diff changeset
266 }
kono
parents:
diff changeset
267
kono
parents:
diff changeset
268 if (rxstride == 1 && axstride == 1 && bxstride == 1)
kono
parents:
diff changeset
269 {
kono
parents:
diff changeset
270 /* This block of code implements a tuned matmul, derived from
kono
parents:
diff changeset
271 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
kono
parents:
diff changeset
272
kono
parents:
diff changeset
273 Bo Kagstrom and Per Ling
kono
parents:
diff changeset
274 Department of Computing Science
kono
parents:
diff changeset
275 Umea University
kono
parents:
diff changeset
276 S-901 87 Umea, Sweden
kono
parents:
diff changeset
277
kono
parents:
diff changeset
278 from netlib.org, translated to C, and modified for matmul.m4. */
kono
parents:
diff changeset
279
kono
parents:
diff changeset
280 const GFC_INTEGER_1 *a, *b;
kono
parents:
diff changeset
281 GFC_INTEGER_1 *c;
kono
parents:
diff changeset
282 const index_type m = xcount, n = ycount, k = count;
kono
parents:
diff changeset
283
kono
parents:
diff changeset
284 /* System generated locals */
kono
parents:
diff changeset
285 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
kono
parents:
diff changeset
286 i1, i2, i3, i4, i5, i6;
kono
parents:
diff changeset
287
kono
parents:
diff changeset
288 /* Local variables */
kono
parents:
diff changeset
289 GFC_INTEGER_1 f11, f12, f21, f22, f31, f32, f41, f42,
kono
parents:
diff changeset
290 f13, f14, f23, f24, f33, f34, f43, f44;
kono
parents:
diff changeset
291 index_type i, j, l, ii, jj, ll;
kono
parents:
diff changeset
292 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
kono
parents:
diff changeset
293 GFC_INTEGER_1 *t1;
kono
parents:
diff changeset
294
kono
parents:
diff changeset
295 a = abase;
kono
parents:
diff changeset
296 b = bbase;
kono
parents:
diff changeset
297 c = retarray->base_addr;
kono
parents:
diff changeset
298
kono
parents:
diff changeset
299 /* Parameter adjustments */
kono
parents:
diff changeset
300 c_dim1 = rystride;
kono
parents:
diff changeset
301 c_offset = 1 + c_dim1;
kono
parents:
diff changeset
302 c -= c_offset;
kono
parents:
diff changeset
303 a_dim1 = aystride;
kono
parents:
diff changeset
304 a_offset = 1 + a_dim1;
kono
parents:
diff changeset
305 a -= a_offset;
kono
parents:
diff changeset
306 b_dim1 = bystride;
kono
parents:
diff changeset
307 b_offset = 1 + b_dim1;
kono
parents:
diff changeset
308 b -= b_offset;
kono
parents:
diff changeset
309
kono
parents:
diff changeset
310 /* Empty c first. */
kono
parents:
diff changeset
311 for (j=1; j<=n; j++)
kono
parents:
diff changeset
312 for (i=1; i<=m; i++)
kono
parents:
diff changeset
313 c[i + j * c_dim1] = (GFC_INTEGER_1)0;
kono
parents:
diff changeset
314
kono
parents:
diff changeset
315 /* Early exit if possible */
kono
parents:
diff changeset
316 if (m == 0 || n == 0 || k == 0)
kono
parents:
diff changeset
317 return;
kono
parents:
diff changeset
318
kono
parents:
diff changeset
319 /* Adjust size of t1 to what is needed. */
kono
parents:
diff changeset
320 index_type t1_dim;
kono
parents:
diff changeset
321 t1_dim = (a_dim1-1) * 256 + b_dim1;
kono
parents:
diff changeset
322 if (t1_dim > 65536)
kono
parents:
diff changeset
323 t1_dim = 65536;
kono
parents:
diff changeset
324
kono
parents:
diff changeset
325 t1 = malloc (t1_dim * sizeof(GFC_INTEGER_1));
kono
parents:
diff changeset
326
kono
parents:
diff changeset
327 /* Start turning the crank. */
kono
parents:
diff changeset
328 i1 = n;
kono
parents:
diff changeset
329 for (jj = 1; jj <= i1; jj += 512)
kono
parents:
diff changeset
330 {
kono
parents:
diff changeset
331 /* Computing MIN */
kono
parents:
diff changeset
332 i2 = 512;
kono
parents:
diff changeset
333 i3 = n - jj + 1;
kono
parents:
diff changeset
334 jsec = min(i2,i3);
kono
parents:
diff changeset
335 ujsec = jsec - jsec % 4;
kono
parents:
diff changeset
336 i2 = k;
kono
parents:
diff changeset
337 for (ll = 1; ll <= i2; ll += 256)
kono
parents:
diff changeset
338 {
kono
parents:
diff changeset
339 /* Computing MIN */
kono
parents:
diff changeset
340 i3 = 256;
kono
parents:
diff changeset
341 i4 = k - ll + 1;
kono
parents:
diff changeset
342 lsec = min(i3,i4);
kono
parents:
diff changeset
343 ulsec = lsec - lsec % 2;
kono
parents:
diff changeset
344
kono
parents:
diff changeset
345 i3 = m;
kono
parents:
diff changeset
346 for (ii = 1; ii <= i3; ii += 256)
kono
parents:
diff changeset
347 {
kono
parents:
diff changeset
348 /* Computing MIN */
kono
parents:
diff changeset
349 i4 = 256;
kono
parents:
diff changeset
350 i5 = m - ii + 1;
kono
parents:
diff changeset
351 isec = min(i4,i5);
kono
parents:
diff changeset
352 uisec = isec - isec % 2;
kono
parents:
diff changeset
353 i4 = ll + ulsec - 1;
kono
parents:
diff changeset
354 for (l = ll; l <= i4; l += 2)
kono
parents:
diff changeset
355 {
kono
parents:
diff changeset
356 i5 = ii + uisec - 1;
kono
parents:
diff changeset
357 for (i = ii; i <= i5; i += 2)
kono
parents:
diff changeset
358 {
kono
parents:
diff changeset
359 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
kono
parents:
diff changeset
360 a[i + l * a_dim1];
kono
parents:
diff changeset
361 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
kono
parents:
diff changeset
362 a[i + (l + 1) * a_dim1];
kono
parents:
diff changeset
363 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
kono
parents:
diff changeset
364 a[i + 1 + l * a_dim1];
kono
parents:
diff changeset
365 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
kono
parents:
diff changeset
366 a[i + 1 + (l + 1) * a_dim1];
kono
parents:
diff changeset
367 }
kono
parents:
diff changeset
368 if (uisec < isec)
kono
parents:
diff changeset
369 {
kono
parents:
diff changeset
370 t1[l - ll + 1 + (isec << 8) - 257] =
kono
parents:
diff changeset
371 a[ii + isec - 1 + l * a_dim1];
kono
parents:
diff changeset
372 t1[l - ll + 2 + (isec << 8) - 257] =
kono
parents:
diff changeset
373 a[ii + isec - 1 + (l + 1) * a_dim1];
kono
parents:
diff changeset
374 }
kono
parents:
diff changeset
375 }
kono
parents:
diff changeset
376 if (ulsec < lsec)
kono
parents:
diff changeset
377 {
kono
parents:
diff changeset
378 i4 = ii + isec - 1;
kono
parents:
diff changeset
379 for (i = ii; i<= i4; ++i)
kono
parents:
diff changeset
380 {
kono
parents:
diff changeset
381 t1[lsec + ((i - ii + 1) << 8) - 257] =
kono
parents:
diff changeset
382 a[i + (ll + lsec - 1) * a_dim1];
kono
parents:
diff changeset
383 }
kono
parents:
diff changeset
384 }
kono
parents:
diff changeset
385
kono
parents:
diff changeset
386 uisec = isec - isec % 4;
kono
parents:
diff changeset
387 i4 = jj + ujsec - 1;
kono
parents:
diff changeset
388 for (j = jj; j <= i4; j += 4)
kono
parents:
diff changeset
389 {
kono
parents:
diff changeset
390 i5 = ii + uisec - 1;
kono
parents:
diff changeset
391 for (i = ii; i <= i5; i += 4)
kono
parents:
diff changeset
392 {
kono
parents:
diff changeset
393 f11 = c[i + j * c_dim1];
kono
parents:
diff changeset
394 f21 = c[i + 1 + j * c_dim1];
kono
parents:
diff changeset
395 f12 = c[i + (j + 1) * c_dim1];
kono
parents:
diff changeset
396 f22 = c[i + 1 + (j + 1) * c_dim1];
kono
parents:
diff changeset
397 f13 = c[i + (j + 2) * c_dim1];
kono
parents:
diff changeset
398 f23 = c[i + 1 + (j + 2) * c_dim1];
kono
parents:
diff changeset
399 f14 = c[i + (j + 3) * c_dim1];
kono
parents:
diff changeset
400 f24 = c[i + 1 + (j + 3) * c_dim1];
kono
parents:
diff changeset
401 f31 = c[i + 2 + j * c_dim1];
kono
parents:
diff changeset
402 f41 = c[i + 3 + j * c_dim1];
kono
parents:
diff changeset
403 f32 = c[i + 2 + (j + 1) * c_dim1];
kono
parents:
diff changeset
404 f42 = c[i + 3 + (j + 1) * c_dim1];
kono
parents:
diff changeset
405 f33 = c[i + 2 + (j + 2) * c_dim1];
kono
parents:
diff changeset
406 f43 = c[i + 3 + (j + 2) * c_dim1];
kono
parents:
diff changeset
407 f34 = c[i + 2 + (j + 3) * c_dim1];
kono
parents:
diff changeset
408 f44 = c[i + 3 + (j + 3) * c_dim1];
kono
parents:
diff changeset
409 i6 = ll + lsec - 1;
kono
parents:
diff changeset
410 for (l = ll; l <= i6; ++l)
kono
parents:
diff changeset
411 {
kono
parents:
diff changeset
412 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
kono
parents:
diff changeset
413 * b[l + j * b_dim1];
kono
parents:
diff changeset
414 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
kono
parents:
diff changeset
415 * b[l + j * b_dim1];
kono
parents:
diff changeset
416 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
kono
parents:
diff changeset
417 * b[l + (j + 1) * b_dim1];
kono
parents:
diff changeset
418 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
kono
parents:
diff changeset
419 * b[l + (j + 1) * b_dim1];
kono
parents:
diff changeset
420 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
kono
parents:
diff changeset
421 * b[l + (j + 2) * b_dim1];
kono
parents:
diff changeset
422 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
kono
parents:
diff changeset
423 * b[l + (j + 2) * b_dim1];
kono
parents:
diff changeset
424 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
kono
parents:
diff changeset
425 * b[l + (j + 3) * b_dim1];
kono
parents:
diff changeset
426 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
kono
parents:
diff changeset
427 * b[l + (j + 3) * b_dim1];
kono
parents:
diff changeset
428 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
kono
parents:
diff changeset
429 * b[l + j * b_dim1];
kono
parents:
diff changeset
430 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
kono
parents:
diff changeset
431 * b[l + j * b_dim1];
kono
parents:
diff changeset
432 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
kono
parents:
diff changeset
433 * b[l + (j + 1) * b_dim1];
kono
parents:
diff changeset
434 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
kono
parents:
diff changeset
435 * b[l + (j + 1) * b_dim1];
kono
parents:
diff changeset
436 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
kono
parents:
diff changeset
437 * b[l + (j + 2) * b_dim1];
kono
parents:
diff changeset
438 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
kono
parents:
diff changeset
439 * b[l + (j + 2) * b_dim1];
kono
parents:
diff changeset
440 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
kono
parents:
diff changeset
441 * b[l + (j + 3) * b_dim1];
kono
parents:
diff changeset
442 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
kono
parents:
diff changeset
443 * b[l + (j + 3) * b_dim1];
kono
parents:
diff changeset
444 }
kono
parents:
diff changeset
445 c[i + j * c_dim1] = f11;
kono
parents:
diff changeset
446 c[i + 1 + j * c_dim1] = f21;
kono
parents:
diff changeset
447 c[i + (j + 1) * c_dim1] = f12;
kono
parents:
diff changeset
448 c[i + 1 + (j + 1) * c_dim1] = f22;
kono
parents:
diff changeset
449 c[i + (j + 2) * c_dim1] = f13;
kono
parents:
diff changeset
450 c[i + 1 + (j + 2) * c_dim1] = f23;
kono
parents:
diff changeset
451 c[i + (j + 3) * c_dim1] = f14;
kono
parents:
diff changeset
452 c[i + 1 + (j + 3) * c_dim1] = f24;
kono
parents:
diff changeset
453 c[i + 2 + j * c_dim1] = f31;
kono
parents:
diff changeset
454 c[i + 3 + j * c_dim1] = f41;
kono
parents:
diff changeset
455 c[i + 2 + (j + 1) * c_dim1] = f32;
kono
parents:
diff changeset
456 c[i + 3 + (j + 1) * c_dim1] = f42;
kono
parents:
diff changeset
457 c[i + 2 + (j + 2) * c_dim1] = f33;
kono
parents:
diff changeset
458 c[i + 3 + (j + 2) * c_dim1] = f43;
kono
parents:
diff changeset
459 c[i + 2 + (j + 3) * c_dim1] = f34;
kono
parents:
diff changeset
460 c[i + 3 + (j + 3) * c_dim1] = f44;
kono
parents:
diff changeset
461 }
kono
parents:
diff changeset
462 if (uisec < isec)
kono
parents:
diff changeset
463 {
kono
parents:
diff changeset
464 i5 = ii + isec - 1;
kono
parents:
diff changeset
465 for (i = ii + uisec; i <= i5; ++i)
kono
parents:
diff changeset
466 {
kono
parents:
diff changeset
467 f11 = c[i + j * c_dim1];
kono
parents:
diff changeset
468 f12 = c[i + (j + 1) * c_dim1];
kono
parents:
diff changeset
469 f13 = c[i + (j + 2) * c_dim1];
kono
parents:
diff changeset
470 f14 = c[i + (j + 3) * c_dim1];
kono
parents:
diff changeset
471 i6 = ll + lsec - 1;
kono
parents:
diff changeset
472 for (l = ll; l <= i6; ++l)
kono
parents:
diff changeset
473 {
kono
parents:
diff changeset
474 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
475 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
476 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
477 257] * b[l + (j + 1) * b_dim1];
kono
parents:
diff changeset
478 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
479 257] * b[l + (j + 2) * b_dim1];
kono
parents:
diff changeset
480 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
481 257] * b[l + (j + 3) * b_dim1];
kono
parents:
diff changeset
482 }
kono
parents:
diff changeset
483 c[i + j * c_dim1] = f11;
kono
parents:
diff changeset
484 c[i + (j + 1) * c_dim1] = f12;
kono
parents:
diff changeset
485 c[i + (j + 2) * c_dim1] = f13;
kono
parents:
diff changeset
486 c[i + (j + 3) * c_dim1] = f14;
kono
parents:
diff changeset
487 }
kono
parents:
diff changeset
488 }
kono
parents:
diff changeset
489 }
kono
parents:
diff changeset
490 if (ujsec < jsec)
kono
parents:
diff changeset
491 {
kono
parents:
diff changeset
492 i4 = jj + jsec - 1;
kono
parents:
diff changeset
493 for (j = jj + ujsec; j <= i4; ++j)
kono
parents:
diff changeset
494 {
kono
parents:
diff changeset
495 i5 = ii + uisec - 1;
kono
parents:
diff changeset
496 for (i = ii; i <= i5; i += 4)
kono
parents:
diff changeset
497 {
kono
parents:
diff changeset
498 f11 = c[i + j * c_dim1];
kono
parents:
diff changeset
499 f21 = c[i + 1 + j * c_dim1];
kono
parents:
diff changeset
500 f31 = c[i + 2 + j * c_dim1];
kono
parents:
diff changeset
501 f41 = c[i + 3 + j * c_dim1];
kono
parents:
diff changeset
502 i6 = ll + lsec - 1;
kono
parents:
diff changeset
503 for (l = ll; l <= i6; ++l)
kono
parents:
diff changeset
504 {
kono
parents:
diff changeset
505 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
506 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
507 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
kono
parents:
diff changeset
508 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
509 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
kono
parents:
diff changeset
510 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
511 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
kono
parents:
diff changeset
512 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
513 }
kono
parents:
diff changeset
514 c[i + j * c_dim1] = f11;
kono
parents:
diff changeset
515 c[i + 1 + j * c_dim1] = f21;
kono
parents:
diff changeset
516 c[i + 2 + j * c_dim1] = f31;
kono
parents:
diff changeset
517 c[i + 3 + j * c_dim1] = f41;
kono
parents:
diff changeset
518 }
kono
parents:
diff changeset
519 i5 = ii + isec - 1;
kono
parents:
diff changeset
520 for (i = ii + uisec; i <= i5; ++i)
kono
parents:
diff changeset
521 {
kono
parents:
diff changeset
522 f11 = c[i + j * c_dim1];
kono
parents:
diff changeset
523 i6 = ll + lsec - 1;
kono
parents:
diff changeset
524 for (l = ll; l <= i6; ++l)
kono
parents:
diff changeset
525 {
kono
parents:
diff changeset
526 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
527 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
528 }
kono
parents:
diff changeset
529 c[i + j * c_dim1] = f11;
kono
parents:
diff changeset
530 }
kono
parents:
diff changeset
531 }
kono
parents:
diff changeset
532 }
kono
parents:
diff changeset
533 }
kono
parents:
diff changeset
534 }
kono
parents:
diff changeset
535 }
kono
parents:
diff changeset
536 free(t1);
kono
parents:
diff changeset
537 return;
kono
parents:
diff changeset
538 }
kono
parents:
diff changeset
539 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
kono
parents:
diff changeset
540 {
kono
parents:
diff changeset
541 if (GFC_DESCRIPTOR_RANK (a) != 1)
kono
parents:
diff changeset
542 {
kono
parents:
diff changeset
543 const GFC_INTEGER_1 *restrict abase_x;
kono
parents:
diff changeset
544 const GFC_INTEGER_1 *restrict bbase_y;
kono
parents:
diff changeset
545 GFC_INTEGER_1 *restrict dest_y;
kono
parents:
diff changeset
546 GFC_INTEGER_1 s;
kono
parents:
diff changeset
547
kono
parents:
diff changeset
548 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
549 {
kono
parents:
diff changeset
550 bbase_y = &bbase[y*bystride];
kono
parents:
diff changeset
551 dest_y = &dest[y*rystride];
kono
parents:
diff changeset
552 for (x = 0; x < xcount; x++)
kono
parents:
diff changeset
553 {
kono
parents:
diff changeset
554 abase_x = &abase[x*axstride];
kono
parents:
diff changeset
555 s = (GFC_INTEGER_1) 0;
kono
parents:
diff changeset
556 for (n = 0; n < count; n++)
kono
parents:
diff changeset
557 s += abase_x[n] * bbase_y[n];
kono
parents:
diff changeset
558 dest_y[x] = s;
kono
parents:
diff changeset
559 }
kono
parents:
diff changeset
560 }
kono
parents:
diff changeset
561 }
kono
parents:
diff changeset
562 else
kono
parents:
diff changeset
563 {
kono
parents:
diff changeset
564 const GFC_INTEGER_1 *restrict bbase_y;
kono
parents:
diff changeset
565 GFC_INTEGER_1 s;
kono
parents:
diff changeset
566
kono
parents:
diff changeset
567 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
568 {
kono
parents:
diff changeset
569 bbase_y = &bbase[y*bystride];
kono
parents:
diff changeset
570 s = (GFC_INTEGER_1) 0;
kono
parents:
diff changeset
571 for (n = 0; n < count; n++)
kono
parents:
diff changeset
572 s += abase[n*axstride] * bbase_y[n];
kono
parents:
diff changeset
573 dest[y*rystride] = s;
kono
parents:
diff changeset
574 }
kono
parents:
diff changeset
575 }
kono
parents:
diff changeset
576 }
kono
parents:
diff changeset
577 else if (axstride < aystride)
kono
parents:
diff changeset
578 {
kono
parents:
diff changeset
579 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
580 for (x = 0; x < xcount; x++)
kono
parents:
diff changeset
581 dest[x*rxstride + y*rystride] = (GFC_INTEGER_1)0;
kono
parents:
diff changeset
582
kono
parents:
diff changeset
583 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
584 for (n = 0; n < count; n++)
kono
parents:
diff changeset
585 for (x = 0; x < xcount; x++)
kono
parents:
diff changeset
586 /* dest[x,y] += a[x,n] * b[n,y] */
kono
parents:
diff changeset
587 dest[x*rxstride + y*rystride] +=
kono
parents:
diff changeset
588 abase[x*axstride + n*aystride] *
kono
parents:
diff changeset
589 bbase[n*bxstride + y*bystride];
kono
parents:
diff changeset
590 }
kono
parents:
diff changeset
591 else if (GFC_DESCRIPTOR_RANK (a) == 1)
kono
parents:
diff changeset
592 {
kono
parents:
diff changeset
593 const GFC_INTEGER_1 *restrict bbase_y;
kono
parents:
diff changeset
594 GFC_INTEGER_1 s;
kono
parents:
diff changeset
595
kono
parents:
diff changeset
596 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
597 {
kono
parents:
diff changeset
598 bbase_y = &bbase[y*bystride];
kono
parents:
diff changeset
599 s = (GFC_INTEGER_1) 0;
kono
parents:
diff changeset
600 for (n = 0; n < count; n++)
kono
parents:
diff changeset
601 s += abase[n*axstride] * bbase_y[n*bxstride];
kono
parents:
diff changeset
602 dest[y*rxstride] = s;
kono
parents:
diff changeset
603 }
kono
parents:
diff changeset
604 }
kono
parents:
diff changeset
605 else
kono
parents:
diff changeset
606 {
kono
parents:
diff changeset
607 const GFC_INTEGER_1 *restrict abase_x;
kono
parents:
diff changeset
608 const GFC_INTEGER_1 *restrict bbase_y;
kono
parents:
diff changeset
609 GFC_INTEGER_1 *restrict dest_y;
kono
parents:
diff changeset
610 GFC_INTEGER_1 s;
kono
parents:
diff changeset
611
kono
parents:
diff changeset
612 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
613 {
kono
parents:
diff changeset
614 bbase_y = &bbase[y*bystride];
kono
parents:
diff changeset
615 dest_y = &dest[y*rystride];
kono
parents:
diff changeset
616 for (x = 0; x < xcount; x++)
kono
parents:
diff changeset
617 {
kono
parents:
diff changeset
618 abase_x = &abase[x*axstride];
kono
parents:
diff changeset
619 s = (GFC_INTEGER_1) 0;
kono
parents:
diff changeset
620 for (n = 0; n < count; n++)
kono
parents:
diff changeset
621 s += abase_x[n*aystride] * bbase_y[n*bxstride];
kono
parents:
diff changeset
622 dest_y[x*rxstride] = s;
kono
parents:
diff changeset
623 }
kono
parents:
diff changeset
624 }
kono
parents:
diff changeset
625 }
kono
parents:
diff changeset
626 }
kono
parents:
diff changeset
627 #undef POW3
kono
parents:
diff changeset
628 #undef min
kono
parents:
diff changeset
629 #undef max
kono
parents:
diff changeset
630
kono
parents:
diff changeset
631 #endif /* HAVE_AVX */
kono
parents:
diff changeset
632
kono
parents:
diff changeset
633 #ifdef HAVE_AVX2
kono
parents:
diff changeset
634 static void
kono
parents:
diff changeset
635 matmul_i1_avx2 (gfc_array_i1 * const restrict retarray,
kono
parents:
diff changeset
636 gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
kono
parents:
diff changeset
637 int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
kono
parents:
diff changeset
638 static void
kono
parents:
diff changeset
639 matmul_i1_avx2 (gfc_array_i1 * const restrict retarray,
kono
parents:
diff changeset
640 gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
kono
parents:
diff changeset
641 int blas_limit, blas_call gemm)
kono
parents:
diff changeset
642 {
kono
parents:
diff changeset
643 const GFC_INTEGER_1 * restrict abase;
kono
parents:
diff changeset
644 const GFC_INTEGER_1 * restrict bbase;
kono
parents:
diff changeset
645 GFC_INTEGER_1 * restrict dest;
kono
parents:
diff changeset
646
kono
parents:
diff changeset
647 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
kono
parents:
diff changeset
648 index_type x, y, n, count, xcount, ycount;
kono
parents:
diff changeset
649
kono
parents:
diff changeset
650 assert (GFC_DESCRIPTOR_RANK (a) == 2
kono
parents:
diff changeset
651 || GFC_DESCRIPTOR_RANK (b) == 2);
kono
parents:
diff changeset
652
kono
parents:
diff changeset
653 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
kono
parents:
diff changeset
654
kono
parents:
diff changeset
655 Either A or B (but not both) can be rank 1:
kono
parents:
diff changeset
656
kono
parents:
diff changeset
657 o One-dimensional argument A is implicitly treated as a row matrix
kono
parents:
diff changeset
658 dimensioned [1,count], so xcount=1.
kono
parents:
diff changeset
659
kono
parents:
diff changeset
660 o One-dimensional argument B is implicitly treated as a column matrix
kono
parents:
diff changeset
661 dimensioned [count, 1], so ycount=1.
kono
parents:
diff changeset
662 */
kono
parents:
diff changeset
663
kono
parents:
diff changeset
664 if (retarray->base_addr == NULL)
kono
parents:
diff changeset
665 {
kono
parents:
diff changeset
666 if (GFC_DESCRIPTOR_RANK (a) == 1)
kono
parents:
diff changeset
667 {
kono
parents:
diff changeset
668 GFC_DIMENSION_SET(retarray->dim[0], 0,
kono
parents:
diff changeset
669 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
kono
parents:
diff changeset
670 }
kono
parents:
diff changeset
671 else if (GFC_DESCRIPTOR_RANK (b) == 1)
kono
parents:
diff changeset
672 {
kono
parents:
diff changeset
673 GFC_DIMENSION_SET(retarray->dim[0], 0,
kono
parents:
diff changeset
674 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
kono
parents:
diff changeset
675 }
kono
parents:
diff changeset
676 else
kono
parents:
diff changeset
677 {
kono
parents:
diff changeset
678 GFC_DIMENSION_SET(retarray->dim[0], 0,
kono
parents:
diff changeset
679 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
kono
parents:
diff changeset
680
kono
parents:
diff changeset
681 GFC_DIMENSION_SET(retarray->dim[1], 0,
kono
parents:
diff changeset
682 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
kono
parents:
diff changeset
683 GFC_DESCRIPTOR_EXTENT(retarray,0));
kono
parents:
diff changeset
684 }
kono
parents:
diff changeset
685
kono
parents:
diff changeset
686 retarray->base_addr
kono
parents:
diff changeset
687 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_1));
kono
parents:
diff changeset
688 retarray->offset = 0;
kono
parents:
diff changeset
689 }
kono
parents:
diff changeset
690 else if (unlikely (compile_options.bounds_check))
kono
parents:
diff changeset
691 {
kono
parents:
diff changeset
692 index_type ret_extent, arg_extent;
kono
parents:
diff changeset
693
kono
parents:
diff changeset
694 if (GFC_DESCRIPTOR_RANK (a) == 1)
kono
parents:
diff changeset
695 {
kono
parents:
diff changeset
696 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
kono
parents:
diff changeset
697 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
kono
parents:
diff changeset
698 if (arg_extent != ret_extent)
kono
parents:
diff changeset
699 runtime_error ("Incorrect extent in return array in"
kono
parents:
diff changeset
700 " MATMUL intrinsic: is %ld, should be %ld",
kono
parents:
diff changeset
701 (long int) ret_extent, (long int) arg_extent);
kono
parents:
diff changeset
702 }
kono
parents:
diff changeset
703 else if (GFC_DESCRIPTOR_RANK (b) == 1)
kono
parents:
diff changeset
704 {
kono
parents:
diff changeset
705 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
kono
parents:
diff changeset
706 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
kono
parents:
diff changeset
707 if (arg_extent != ret_extent)
kono
parents:
diff changeset
708 runtime_error ("Incorrect extent in return array in"
kono
parents:
diff changeset
709 " MATMUL intrinsic: is %ld, should be %ld",
kono
parents:
diff changeset
710 (long int) ret_extent, (long int) arg_extent);
kono
parents:
diff changeset
711 }
kono
parents:
diff changeset
712 else
kono
parents:
diff changeset
713 {
kono
parents:
diff changeset
714 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
kono
parents:
diff changeset
715 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
kono
parents:
diff changeset
716 if (arg_extent != ret_extent)
kono
parents:
diff changeset
717 runtime_error ("Incorrect extent in return array in"
kono
parents:
diff changeset
718 " MATMUL intrinsic for dimension 1:"
kono
parents:
diff changeset
719 " is %ld, should be %ld",
kono
parents:
diff changeset
720 (long int) ret_extent, (long int) arg_extent);
kono
parents:
diff changeset
721
kono
parents:
diff changeset
722 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
kono
parents:
diff changeset
723 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
kono
parents:
diff changeset
724 if (arg_extent != ret_extent)
kono
parents:
diff changeset
725 runtime_error ("Incorrect extent in return array in"
kono
parents:
diff changeset
726 " MATMUL intrinsic for dimension 2:"
kono
parents:
diff changeset
727 " is %ld, should be %ld",
kono
parents:
diff changeset
728 (long int) ret_extent, (long int) arg_extent);
kono
parents:
diff changeset
729 }
kono
parents:
diff changeset
730 }
kono
parents:
diff changeset
731
kono
parents:
diff changeset
732
kono
parents:
diff changeset
733 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
kono
parents:
diff changeset
734 {
kono
parents:
diff changeset
735 /* One-dimensional result may be addressed in the code below
kono
parents:
diff changeset
736 either as a row or a column matrix. We want both cases to
kono
parents:
diff changeset
737 work. */
kono
parents:
diff changeset
738 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
kono
parents:
diff changeset
739 }
kono
parents:
diff changeset
740 else
kono
parents:
diff changeset
741 {
kono
parents:
diff changeset
742 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
kono
parents:
diff changeset
743 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
kono
parents:
diff changeset
744 }
kono
parents:
diff changeset
745
kono
parents:
diff changeset
746
kono
parents:
diff changeset
747 if (GFC_DESCRIPTOR_RANK (a) == 1)
kono
parents:
diff changeset
748 {
kono
parents:
diff changeset
749 /* Treat it as a a row matrix A[1,count]. */
kono
parents:
diff changeset
750 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
kono
parents:
diff changeset
751 aystride = 1;
kono
parents:
diff changeset
752
kono
parents:
diff changeset
753 xcount = 1;
kono
parents:
diff changeset
754 count = GFC_DESCRIPTOR_EXTENT(a,0);
kono
parents:
diff changeset
755 }
kono
parents:
diff changeset
756 else
kono
parents:
diff changeset
757 {
kono
parents:
diff changeset
758 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
kono
parents:
diff changeset
759 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
kono
parents:
diff changeset
760
kono
parents:
diff changeset
761 count = GFC_DESCRIPTOR_EXTENT(a,1);
kono
parents:
diff changeset
762 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
kono
parents:
diff changeset
763 }
kono
parents:
diff changeset
764
kono
parents:
diff changeset
765 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
kono
parents:
diff changeset
766 {
kono
parents:
diff changeset
767 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
kono
parents:
diff changeset
768 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
kono
parents:
diff changeset
769 }
kono
parents:
diff changeset
770
kono
parents:
diff changeset
771 if (GFC_DESCRIPTOR_RANK (b) == 1)
kono
parents:
diff changeset
772 {
kono
parents:
diff changeset
773 /* Treat it as a column matrix B[count,1] */
kono
parents:
diff changeset
774 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
kono
parents:
diff changeset
775
kono
parents:
diff changeset
776 /* bystride should never be used for 1-dimensional b.
kono
parents:
diff changeset
777 The value is only used for calculation of the
kono
parents:
diff changeset
778 memory by the buffer. */
kono
parents:
diff changeset
779 bystride = 256;
kono
parents:
diff changeset
780 ycount = 1;
kono
parents:
diff changeset
781 }
kono
parents:
diff changeset
782 else
kono
parents:
diff changeset
783 {
kono
parents:
diff changeset
784 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
kono
parents:
diff changeset
785 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
kono
parents:
diff changeset
786 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
kono
parents:
diff changeset
787 }
kono
parents:
diff changeset
788
kono
parents:
diff changeset
789 abase = a->base_addr;
kono
parents:
diff changeset
790 bbase = b->base_addr;
kono
parents:
diff changeset
791 dest = retarray->base_addr;
kono
parents:
diff changeset
792
kono
parents:
diff changeset
793 /* Now that everything is set up, we perform the multiplication
kono
parents:
diff changeset
794 itself. */
kono
parents:
diff changeset
795
kono
parents:
diff changeset
796 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
kono
parents:
diff changeset
797 #define min(a,b) ((a) <= (b) ? (a) : (b))
kono
parents:
diff changeset
798 #define max(a,b) ((a) >= (b) ? (a) : (b))
kono
parents:
diff changeset
799
kono
parents:
diff changeset
800 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
kono
parents:
diff changeset
801 && (bxstride == 1 || bystride == 1)
kono
parents:
diff changeset
802 && (((float) xcount) * ((float) ycount) * ((float) count)
kono
parents:
diff changeset
803 > POW3(blas_limit)))
kono
parents:
diff changeset
804 {
kono
parents:
diff changeset
805 const int m = xcount, n = ycount, k = count, ldc = rystride;
kono
parents:
diff changeset
806 const GFC_INTEGER_1 one = 1, zero = 0;
kono
parents:
diff changeset
807 const int lda = (axstride == 1) ? aystride : axstride,
kono
parents:
diff changeset
808 ldb = (bxstride == 1) ? bystride : bxstride;
kono
parents:
diff changeset
809
kono
parents:
diff changeset
810 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
kono
parents:
diff changeset
811 {
kono
parents:
diff changeset
812 assert (gemm != NULL);
kono
parents:
diff changeset
813 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
kono
parents:
diff changeset
814 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
kono
parents:
diff changeset
815 &ldc, 1, 1);
kono
parents:
diff changeset
816 return;
kono
parents:
diff changeset
817 }
kono
parents:
diff changeset
818 }
kono
parents:
diff changeset
819
kono
parents:
diff changeset
820 if (rxstride == 1 && axstride == 1 && bxstride == 1)
kono
parents:
diff changeset
821 {
kono
parents:
diff changeset
822 /* This block of code implements a tuned matmul, derived from
kono
parents:
diff changeset
823 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
kono
parents:
diff changeset
824
kono
parents:
diff changeset
825 Bo Kagstrom and Per Ling
kono
parents:
diff changeset
826 Department of Computing Science
kono
parents:
diff changeset
827 Umea University
kono
parents:
diff changeset
828 S-901 87 Umea, Sweden
kono
parents:
diff changeset
829
kono
parents:
diff changeset
830 from netlib.org, translated to C, and modified for matmul.m4. */
kono
parents:
diff changeset
831
kono
parents:
diff changeset
832 const GFC_INTEGER_1 *a, *b;
kono
parents:
diff changeset
833 GFC_INTEGER_1 *c;
kono
parents:
diff changeset
834 const index_type m = xcount, n = ycount, k = count;
kono
parents:
diff changeset
835
kono
parents:
diff changeset
836 /* System generated locals */
kono
parents:
diff changeset
837 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
kono
parents:
diff changeset
838 i1, i2, i3, i4, i5, i6;
kono
parents:
diff changeset
839
kono
parents:
diff changeset
840 /* Local variables */
kono
parents:
diff changeset
841 GFC_INTEGER_1 f11, f12, f21, f22, f31, f32, f41, f42,
kono
parents:
diff changeset
842 f13, f14, f23, f24, f33, f34, f43, f44;
kono
parents:
diff changeset
843 index_type i, j, l, ii, jj, ll;
kono
parents:
diff changeset
844 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
kono
parents:
diff changeset
845 GFC_INTEGER_1 *t1;
kono
parents:
diff changeset
846
kono
parents:
diff changeset
847 a = abase;
kono
parents:
diff changeset
848 b = bbase;
kono
parents:
diff changeset
849 c = retarray->base_addr;
kono
parents:
diff changeset
850
kono
parents:
diff changeset
851 /* Parameter adjustments */
kono
parents:
diff changeset
852 c_dim1 = rystride;
kono
parents:
diff changeset
853 c_offset = 1 + c_dim1;
kono
parents:
diff changeset
854 c -= c_offset;
kono
parents:
diff changeset
855 a_dim1 = aystride;
kono
parents:
diff changeset
856 a_offset = 1 + a_dim1;
kono
parents:
diff changeset
857 a -= a_offset;
kono
parents:
diff changeset
858 b_dim1 = bystride;
kono
parents:
diff changeset
859 b_offset = 1 + b_dim1;
kono
parents:
diff changeset
860 b -= b_offset;
kono
parents:
diff changeset
861
kono
parents:
diff changeset
862 /* Empty c first. */
kono
parents:
diff changeset
863 for (j=1; j<=n; j++)
kono
parents:
diff changeset
864 for (i=1; i<=m; i++)
kono
parents:
diff changeset
865 c[i + j * c_dim1] = (GFC_INTEGER_1)0;
kono
parents:
diff changeset
866
kono
parents:
diff changeset
867 /* Early exit if possible */
kono
parents:
diff changeset
868 if (m == 0 || n == 0 || k == 0)
kono
parents:
diff changeset
869 return;
kono
parents:
diff changeset
870
kono
parents:
diff changeset
871 /* Adjust size of t1 to what is needed. */
kono
parents:
diff changeset
872 index_type t1_dim;
kono
parents:
diff changeset
873 t1_dim = (a_dim1-1) * 256 + b_dim1;
kono
parents:
diff changeset
874 if (t1_dim > 65536)
kono
parents:
diff changeset
875 t1_dim = 65536;
kono
parents:
diff changeset
876
kono
parents:
diff changeset
877 t1 = malloc (t1_dim * sizeof(GFC_INTEGER_1));
kono
parents:
diff changeset
878
kono
parents:
diff changeset
879 /* Start turning the crank. */
kono
parents:
diff changeset
880 i1 = n;
kono
parents:
diff changeset
881 for (jj = 1; jj <= i1; jj += 512)
kono
parents:
diff changeset
882 {
kono
parents:
diff changeset
883 /* Computing MIN */
kono
parents:
diff changeset
884 i2 = 512;
kono
parents:
diff changeset
885 i3 = n - jj + 1;
kono
parents:
diff changeset
886 jsec = min(i2,i3);
kono
parents:
diff changeset
887 ujsec = jsec - jsec % 4;
kono
parents:
diff changeset
888 i2 = k;
kono
parents:
diff changeset
889 for (ll = 1; ll <= i2; ll += 256)
kono
parents:
diff changeset
890 {
kono
parents:
diff changeset
891 /* Computing MIN */
kono
parents:
diff changeset
892 i3 = 256;
kono
parents:
diff changeset
893 i4 = k - ll + 1;
kono
parents:
diff changeset
894 lsec = min(i3,i4);
kono
parents:
diff changeset
895 ulsec = lsec - lsec % 2;
kono
parents:
diff changeset
896
kono
parents:
diff changeset
897 i3 = m;
kono
parents:
diff changeset
898 for (ii = 1; ii <= i3; ii += 256)
kono
parents:
diff changeset
899 {
kono
parents:
diff changeset
900 /* Computing MIN */
kono
parents:
diff changeset
901 i4 = 256;
kono
parents:
diff changeset
902 i5 = m - ii + 1;
kono
parents:
diff changeset
903 isec = min(i4,i5);
kono
parents:
diff changeset
904 uisec = isec - isec % 2;
kono
parents:
diff changeset
905 i4 = ll + ulsec - 1;
kono
parents:
diff changeset
906 for (l = ll; l <= i4; l += 2)
kono
parents:
diff changeset
907 {
kono
parents:
diff changeset
908 i5 = ii + uisec - 1;
kono
parents:
diff changeset
909 for (i = ii; i <= i5; i += 2)
kono
parents:
diff changeset
910 {
kono
parents:
diff changeset
911 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
kono
parents:
diff changeset
912 a[i + l * a_dim1];
kono
parents:
diff changeset
913 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
kono
parents:
diff changeset
914 a[i + (l + 1) * a_dim1];
kono
parents:
diff changeset
915 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
kono
parents:
diff changeset
916 a[i + 1 + l * a_dim1];
kono
parents:
diff changeset
917 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
kono
parents:
diff changeset
918 a[i + 1 + (l + 1) * a_dim1];
kono
parents:
diff changeset
919 }
kono
parents:
diff changeset
920 if (uisec < isec)
kono
parents:
diff changeset
921 {
kono
parents:
diff changeset
922 t1[l - ll + 1 + (isec << 8) - 257] =
kono
parents:
diff changeset
923 a[ii + isec - 1 + l * a_dim1];
kono
parents:
diff changeset
924 t1[l - ll + 2 + (isec << 8) - 257] =
kono
parents:
diff changeset
925 a[ii + isec - 1 + (l + 1) * a_dim1];
kono
parents:
diff changeset
926 }
kono
parents:
diff changeset
927 }
kono
parents:
diff changeset
928 if (ulsec < lsec)
kono
parents:
diff changeset
929 {
kono
parents:
diff changeset
930 i4 = ii + isec - 1;
kono
parents:
diff changeset
931 for (i = ii; i<= i4; ++i)
kono
parents:
diff changeset
932 {
kono
parents:
diff changeset
933 t1[lsec + ((i - ii + 1) << 8) - 257] =
kono
parents:
diff changeset
934 a[i + (ll + lsec - 1) * a_dim1];
kono
parents:
diff changeset
935 }
kono
parents:
diff changeset
936 }
kono
parents:
diff changeset
937
kono
parents:
diff changeset
938 uisec = isec - isec % 4;
kono
parents:
diff changeset
939 i4 = jj + ujsec - 1;
kono
parents:
diff changeset
940 for (j = jj; j <= i4; j += 4)
kono
parents:
diff changeset
941 {
kono
parents:
diff changeset
942 i5 = ii + uisec - 1;
kono
parents:
diff changeset
943 for (i = ii; i <= i5; i += 4)
kono
parents:
diff changeset
944 {
kono
parents:
diff changeset
945 f11 = c[i + j * c_dim1];
kono
parents:
diff changeset
946 f21 = c[i + 1 + j * c_dim1];
kono
parents:
diff changeset
947 f12 = c[i + (j + 1) * c_dim1];
kono
parents:
diff changeset
948 f22 = c[i + 1 + (j + 1) * c_dim1];
kono
parents:
diff changeset
949 f13 = c[i + (j + 2) * c_dim1];
kono
parents:
diff changeset
950 f23 = c[i + 1 + (j + 2) * c_dim1];
kono
parents:
diff changeset
951 f14 = c[i + (j + 3) * c_dim1];
kono
parents:
diff changeset
952 f24 = c[i + 1 + (j + 3) * c_dim1];
kono
parents:
diff changeset
953 f31 = c[i + 2 + j * c_dim1];
kono
parents:
diff changeset
954 f41 = c[i + 3 + j * c_dim1];
kono
parents:
diff changeset
955 f32 = c[i + 2 + (j + 1) * c_dim1];
kono
parents:
diff changeset
956 f42 = c[i + 3 + (j + 1) * c_dim1];
kono
parents:
diff changeset
957 f33 = c[i + 2 + (j + 2) * c_dim1];
kono
parents:
diff changeset
958 f43 = c[i + 3 + (j + 2) * c_dim1];
kono
parents:
diff changeset
959 f34 = c[i + 2 + (j + 3) * c_dim1];
kono
parents:
diff changeset
960 f44 = c[i + 3 + (j + 3) * c_dim1];
kono
parents:
diff changeset
961 i6 = ll + lsec - 1;
kono
parents:
diff changeset
962 for (l = ll; l <= i6; ++l)
kono
parents:
diff changeset
963 {
kono
parents:
diff changeset
964 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
kono
parents:
diff changeset
965 * b[l + j * b_dim1];
kono
parents:
diff changeset
966 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
kono
parents:
diff changeset
967 * b[l + j * b_dim1];
kono
parents:
diff changeset
968 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
kono
parents:
diff changeset
969 * b[l + (j + 1) * b_dim1];
kono
parents:
diff changeset
970 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
kono
parents:
diff changeset
971 * b[l + (j + 1) * b_dim1];
kono
parents:
diff changeset
972 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
kono
parents:
diff changeset
973 * b[l + (j + 2) * b_dim1];
kono
parents:
diff changeset
974 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
kono
parents:
diff changeset
975 * b[l + (j + 2) * b_dim1];
kono
parents:
diff changeset
976 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
kono
parents:
diff changeset
977 * b[l + (j + 3) * b_dim1];
kono
parents:
diff changeset
978 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
kono
parents:
diff changeset
979 * b[l + (j + 3) * b_dim1];
kono
parents:
diff changeset
980 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
kono
parents:
diff changeset
981 * b[l + j * b_dim1];
kono
parents:
diff changeset
982 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
kono
parents:
diff changeset
983 * b[l + j * b_dim1];
kono
parents:
diff changeset
984 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
kono
parents:
diff changeset
985 * b[l + (j + 1) * b_dim1];
kono
parents:
diff changeset
986 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
kono
parents:
diff changeset
987 * b[l + (j + 1) * b_dim1];
kono
parents:
diff changeset
988 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
kono
parents:
diff changeset
989 * b[l + (j + 2) * b_dim1];
kono
parents:
diff changeset
990 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
kono
parents:
diff changeset
991 * b[l + (j + 2) * b_dim1];
kono
parents:
diff changeset
992 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
kono
parents:
diff changeset
993 * b[l + (j + 3) * b_dim1];
kono
parents:
diff changeset
994 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
kono
parents:
diff changeset
995 * b[l + (j + 3) * b_dim1];
kono
parents:
diff changeset
996 }
kono
parents:
diff changeset
997 c[i + j * c_dim1] = f11;
kono
parents:
diff changeset
998 c[i + 1 + j * c_dim1] = f21;
kono
parents:
diff changeset
999 c[i + (j + 1) * c_dim1] = f12;
kono
parents:
diff changeset
1000 c[i + 1 + (j + 1) * c_dim1] = f22;
kono
parents:
diff changeset
1001 c[i + (j + 2) * c_dim1] = f13;
kono
parents:
diff changeset
1002 c[i + 1 + (j + 2) * c_dim1] = f23;
kono
parents:
diff changeset
1003 c[i + (j + 3) * c_dim1] = f14;
kono
parents:
diff changeset
1004 c[i + 1 + (j + 3) * c_dim1] = f24;
kono
parents:
diff changeset
1005 c[i + 2 + j * c_dim1] = f31;
kono
parents:
diff changeset
1006 c[i + 3 + j * c_dim1] = f41;
kono
parents:
diff changeset
1007 c[i + 2 + (j + 1) * c_dim1] = f32;
kono
parents:
diff changeset
1008 c[i + 3 + (j + 1) * c_dim1] = f42;
kono
parents:
diff changeset
1009 c[i + 2 + (j + 2) * c_dim1] = f33;
kono
parents:
diff changeset
1010 c[i + 3 + (j + 2) * c_dim1] = f43;
kono
parents:
diff changeset
1011 c[i + 2 + (j + 3) * c_dim1] = f34;
kono
parents:
diff changeset
1012 c[i + 3 + (j + 3) * c_dim1] = f44;
kono
parents:
diff changeset
1013 }
kono
parents:
diff changeset
1014 if (uisec < isec)
kono
parents:
diff changeset
1015 {
kono
parents:
diff changeset
1016 i5 = ii + isec - 1;
kono
parents:
diff changeset
1017 for (i = ii + uisec; i <= i5; ++i)
kono
parents:
diff changeset
1018 {
kono
parents:
diff changeset
1019 f11 = c[i + j * c_dim1];
kono
parents:
diff changeset
1020 f12 = c[i + (j + 1) * c_dim1];
kono
parents:
diff changeset
1021 f13 = c[i + (j + 2) * c_dim1];
kono
parents:
diff changeset
1022 f14 = c[i + (j + 3) * c_dim1];
kono
parents:
diff changeset
1023 i6 = ll + lsec - 1;
kono
parents:
diff changeset
1024 for (l = ll; l <= i6; ++l)
kono
parents:
diff changeset
1025 {
kono
parents:
diff changeset
1026 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
1027 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
1028 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
1029 257] * b[l + (j + 1) * b_dim1];
kono
parents:
diff changeset
1030 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
1031 257] * b[l + (j + 2) * b_dim1];
kono
parents:
diff changeset
1032 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
1033 257] * b[l + (j + 3) * b_dim1];
kono
parents:
diff changeset
1034 }
kono
parents:
diff changeset
1035 c[i + j * c_dim1] = f11;
kono
parents:
diff changeset
1036 c[i + (j + 1) * c_dim1] = f12;
kono
parents:
diff changeset
1037 c[i + (j + 2) * c_dim1] = f13;
kono
parents:
diff changeset
1038 c[i + (j + 3) * c_dim1] = f14;
kono
parents:
diff changeset
1039 }
kono
parents:
diff changeset
1040 }
kono
parents:
diff changeset
1041 }
kono
parents:
diff changeset
1042 if (ujsec < jsec)
kono
parents:
diff changeset
1043 {
kono
parents:
diff changeset
1044 i4 = jj + jsec - 1;
kono
parents:
diff changeset
1045 for (j = jj + ujsec; j <= i4; ++j)
kono
parents:
diff changeset
1046 {
kono
parents:
diff changeset
1047 i5 = ii + uisec - 1;
kono
parents:
diff changeset
1048 for (i = ii; i <= i5; i += 4)
kono
parents:
diff changeset
1049 {
kono
parents:
diff changeset
1050 f11 = c[i + j * c_dim1];
kono
parents:
diff changeset
1051 f21 = c[i + 1 + j * c_dim1];
kono
parents:
diff changeset
1052 f31 = c[i + 2 + j * c_dim1];
kono
parents:
diff changeset
1053 f41 = c[i + 3 + j * c_dim1];
kono
parents:
diff changeset
1054 i6 = ll + lsec - 1;
kono
parents:
diff changeset
1055 for (l = ll; l <= i6; ++l)
kono
parents:
diff changeset
1056 {
kono
parents:
diff changeset
1057 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
1058 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
1059 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
kono
parents:
diff changeset
1060 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
1061 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
kono
parents:
diff changeset
1062 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
1063 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
kono
parents:
diff changeset
1064 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
1065 }
kono
parents:
diff changeset
1066 c[i + j * c_dim1] = f11;
kono
parents:
diff changeset
1067 c[i + 1 + j * c_dim1] = f21;
kono
parents:
diff changeset
1068 c[i + 2 + j * c_dim1] = f31;
kono
parents:
diff changeset
1069 c[i + 3 + j * c_dim1] = f41;
kono
parents:
diff changeset
1070 }
kono
parents:
diff changeset
1071 i5 = ii + isec - 1;
kono
parents:
diff changeset
1072 for (i = ii + uisec; i <= i5; ++i)
kono
parents:
diff changeset
1073 {
kono
parents:
diff changeset
1074 f11 = c[i + j * c_dim1];
kono
parents:
diff changeset
1075 i6 = ll + lsec - 1;
kono
parents:
diff changeset
1076 for (l = ll; l <= i6; ++l)
kono
parents:
diff changeset
1077 {
kono
parents:
diff changeset
1078 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
1079 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
1080 }
kono
parents:
diff changeset
1081 c[i + j * c_dim1] = f11;
kono
parents:
diff changeset
1082 }
kono
parents:
diff changeset
1083 }
kono
parents:
diff changeset
1084 }
kono
parents:
diff changeset
1085 }
kono
parents:
diff changeset
1086 }
kono
parents:
diff changeset
1087 }
kono
parents:
diff changeset
1088 free(t1);
kono
parents:
diff changeset
1089 return;
kono
parents:
diff changeset
1090 }
kono
parents:
diff changeset
1091 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
kono
parents:
diff changeset
1092 {
kono
parents:
diff changeset
1093 if (GFC_DESCRIPTOR_RANK (a) != 1)
kono
parents:
diff changeset
1094 {
kono
parents:
diff changeset
1095 const GFC_INTEGER_1 *restrict abase_x;
kono
parents:
diff changeset
1096 const GFC_INTEGER_1 *restrict bbase_y;
kono
parents:
diff changeset
1097 GFC_INTEGER_1 *restrict dest_y;
kono
parents:
diff changeset
1098 GFC_INTEGER_1 s;
kono
parents:
diff changeset
1099
kono
parents:
diff changeset
1100 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
1101 {
kono
parents:
diff changeset
1102 bbase_y = &bbase[y*bystride];
kono
parents:
diff changeset
1103 dest_y = &dest[y*rystride];
kono
parents:
diff changeset
1104 for (x = 0; x < xcount; x++)
kono
parents:
diff changeset
1105 {
kono
parents:
diff changeset
1106 abase_x = &abase[x*axstride];
kono
parents:
diff changeset
1107 s = (GFC_INTEGER_1) 0;
kono
parents:
diff changeset
1108 for (n = 0; n < count; n++)
kono
parents:
diff changeset
1109 s += abase_x[n] * bbase_y[n];
kono
parents:
diff changeset
1110 dest_y[x] = s;
kono
parents:
diff changeset
1111 }
kono
parents:
diff changeset
1112 }
kono
parents:
diff changeset
1113 }
kono
parents:
diff changeset
1114 else
kono
parents:
diff changeset
1115 {
kono
parents:
diff changeset
1116 const GFC_INTEGER_1 *restrict bbase_y;
kono
parents:
diff changeset
1117 GFC_INTEGER_1 s;
kono
parents:
diff changeset
1118
kono
parents:
diff changeset
1119 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
1120 {
kono
parents:
diff changeset
1121 bbase_y = &bbase[y*bystride];
kono
parents:
diff changeset
1122 s = (GFC_INTEGER_1) 0;
kono
parents:
diff changeset
1123 for (n = 0; n < count; n++)
kono
parents:
diff changeset
1124 s += abase[n*axstride] * bbase_y[n];
kono
parents:
diff changeset
1125 dest[y*rystride] = s;
kono
parents:
diff changeset
1126 }
kono
parents:
diff changeset
1127 }
kono
parents:
diff changeset
1128 }
kono
parents:
diff changeset
1129 else if (axstride < aystride)
kono
parents:
diff changeset
1130 {
kono
parents:
diff changeset
1131 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
1132 for (x = 0; x < xcount; x++)
kono
parents:
diff changeset
1133 dest[x*rxstride + y*rystride] = (GFC_INTEGER_1)0;
kono
parents:
diff changeset
1134
kono
parents:
diff changeset
1135 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
1136 for (n = 0; n < count; n++)
kono
parents:
diff changeset
1137 for (x = 0; x < xcount; x++)
kono
parents:
diff changeset
1138 /* dest[x,y] += a[x,n] * b[n,y] */
kono
parents:
diff changeset
1139 dest[x*rxstride + y*rystride] +=
kono
parents:
diff changeset
1140 abase[x*axstride + n*aystride] *
kono
parents:
diff changeset
1141 bbase[n*bxstride + y*bystride];
kono
parents:
diff changeset
1142 }
kono
parents:
diff changeset
1143 else if (GFC_DESCRIPTOR_RANK (a) == 1)
kono
parents:
diff changeset
1144 {
kono
parents:
diff changeset
1145 const GFC_INTEGER_1 *restrict bbase_y;
kono
parents:
diff changeset
1146 GFC_INTEGER_1 s;
kono
parents:
diff changeset
1147
kono
parents:
diff changeset
1148 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
1149 {
kono
parents:
diff changeset
1150 bbase_y = &bbase[y*bystride];
kono
parents:
diff changeset
1151 s = (GFC_INTEGER_1) 0;
kono
parents:
diff changeset
1152 for (n = 0; n < count; n++)
kono
parents:
diff changeset
1153 s += abase[n*axstride] * bbase_y[n*bxstride];
kono
parents:
diff changeset
1154 dest[y*rxstride] = s;
kono
parents:
diff changeset
1155 }
kono
parents:
diff changeset
1156 }
kono
parents:
diff changeset
1157 else
kono
parents:
diff changeset
1158 {
kono
parents:
diff changeset
1159 const GFC_INTEGER_1 *restrict abase_x;
kono
parents:
diff changeset
1160 const GFC_INTEGER_1 *restrict bbase_y;
kono
parents:
diff changeset
1161 GFC_INTEGER_1 *restrict dest_y;
kono
parents:
diff changeset
1162 GFC_INTEGER_1 s;
kono
parents:
diff changeset
1163
kono
parents:
diff changeset
1164 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
1165 {
kono
parents:
diff changeset
1166 bbase_y = &bbase[y*bystride];
kono
parents:
diff changeset
1167 dest_y = &dest[y*rystride];
kono
parents:
diff changeset
1168 for (x = 0; x < xcount; x++)
kono
parents:
diff changeset
1169 {
kono
parents:
diff changeset
1170 abase_x = &abase[x*axstride];
kono
parents:
diff changeset
1171 s = (GFC_INTEGER_1) 0;
kono
parents:
diff changeset
1172 for (n = 0; n < count; n++)
kono
parents:
diff changeset
1173 s += abase_x[n*aystride] * bbase_y[n*bxstride];
kono
parents:
diff changeset
1174 dest_y[x*rxstride] = s;
kono
parents:
diff changeset
1175 }
kono
parents:
diff changeset
1176 }
kono
parents:
diff changeset
1177 }
kono
parents:
diff changeset
1178 }
kono
parents:
diff changeset
1179 #undef POW3
kono
parents:
diff changeset
1180 #undef min
kono
parents:
diff changeset
1181 #undef max
kono
parents:
diff changeset
1182
kono
parents:
diff changeset
1183 #endif /* HAVE_AVX2 */
kono
parents:
diff changeset
1184
kono
parents:
diff changeset
1185 #ifdef HAVE_AVX512F
kono
parents:
diff changeset
1186 static void
kono
parents:
diff changeset
1187 matmul_i1_avx512f (gfc_array_i1 * const restrict retarray,
kono
parents:
diff changeset
1188 gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
kono
parents:
diff changeset
1189 int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
kono
parents:
diff changeset
1190 static void
kono
parents:
diff changeset
1191 matmul_i1_avx512f (gfc_array_i1 * const restrict retarray,
kono
parents:
diff changeset
1192 gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
kono
parents:
diff changeset
1193 int blas_limit, blas_call gemm)
kono
parents:
diff changeset
1194 {
kono
parents:
diff changeset
1195 const GFC_INTEGER_1 * restrict abase;
kono
parents:
diff changeset
1196 const GFC_INTEGER_1 * restrict bbase;
kono
parents:
diff changeset
1197 GFC_INTEGER_1 * restrict dest;
kono
parents:
diff changeset
1198
kono
parents:
diff changeset
1199 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
kono
parents:
diff changeset
1200 index_type x, y, n, count, xcount, ycount;
kono
parents:
diff changeset
1201
kono
parents:
diff changeset
1202 assert (GFC_DESCRIPTOR_RANK (a) == 2
kono
parents:
diff changeset
1203 || GFC_DESCRIPTOR_RANK (b) == 2);
kono
parents:
diff changeset
1204
kono
parents:
diff changeset
1205 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
kono
parents:
diff changeset
1206
kono
parents:
diff changeset
1207 Either A or B (but not both) can be rank 1:
kono
parents:
diff changeset
1208
kono
parents:
diff changeset
1209 o One-dimensional argument A is implicitly treated as a row matrix
kono
parents:
diff changeset
1210 dimensioned [1,count], so xcount=1.
kono
parents:
diff changeset
1211
kono
parents:
diff changeset
1212 o One-dimensional argument B is implicitly treated as a column matrix
kono
parents:
diff changeset
1213 dimensioned [count, 1], so ycount=1.
kono
parents:
diff changeset
1214 */
kono
parents:
diff changeset
1215
kono
parents:
diff changeset
1216 if (retarray->base_addr == NULL)
kono
parents:
diff changeset
1217 {
kono
parents:
diff changeset
1218 if (GFC_DESCRIPTOR_RANK (a) == 1)
kono
parents:
diff changeset
1219 {
kono
parents:
diff changeset
1220 GFC_DIMENSION_SET(retarray->dim[0], 0,
kono
parents:
diff changeset
1221 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
kono
parents:
diff changeset
1222 }
kono
parents:
diff changeset
1223 else if (GFC_DESCRIPTOR_RANK (b) == 1)
kono
parents:
diff changeset
1224 {
kono
parents:
diff changeset
1225 GFC_DIMENSION_SET(retarray->dim[0], 0,
kono
parents:
diff changeset
1226 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
kono
parents:
diff changeset
1227 }
kono
parents:
diff changeset
1228 else
kono
parents:
diff changeset
1229 {
kono
parents:
diff changeset
1230 GFC_DIMENSION_SET(retarray->dim[0], 0,
kono
parents:
diff changeset
1231 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
kono
parents:
diff changeset
1232
kono
parents:
diff changeset
1233 GFC_DIMENSION_SET(retarray->dim[1], 0,
kono
parents:
diff changeset
1234 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
kono
parents:
diff changeset
1235 GFC_DESCRIPTOR_EXTENT(retarray,0));
kono
parents:
diff changeset
1236 }
kono
parents:
diff changeset
1237
kono
parents:
diff changeset
1238 retarray->base_addr
kono
parents:
diff changeset
1239 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_1));
kono
parents:
diff changeset
1240 retarray->offset = 0;
kono
parents:
diff changeset
1241 }
kono
parents:
diff changeset
1242 else if (unlikely (compile_options.bounds_check))
kono
parents:
diff changeset
1243 {
kono
parents:
diff changeset
1244 index_type ret_extent, arg_extent;
kono
parents:
diff changeset
1245
kono
parents:
diff changeset
1246 if (GFC_DESCRIPTOR_RANK (a) == 1)
kono
parents:
diff changeset
1247 {
kono
parents:
diff changeset
1248 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
kono
parents:
diff changeset
1249 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
kono
parents:
diff changeset
1250 if (arg_extent != ret_extent)
kono
parents:
diff changeset
1251 runtime_error ("Incorrect extent in return array in"
kono
parents:
diff changeset
1252 " MATMUL intrinsic: is %ld, should be %ld",
kono
parents:
diff changeset
1253 (long int) ret_extent, (long int) arg_extent);
kono
parents:
diff changeset
1254 }
kono
parents:
diff changeset
1255 else if (GFC_DESCRIPTOR_RANK (b) == 1)
kono
parents:
diff changeset
1256 {
kono
parents:
diff changeset
1257 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
kono
parents:
diff changeset
1258 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
kono
parents:
diff changeset
1259 if (arg_extent != ret_extent)
kono
parents:
diff changeset
1260 runtime_error ("Incorrect extent in return array in"
kono
parents:
diff changeset
1261 " MATMUL intrinsic: is %ld, should be %ld",
kono
parents:
diff changeset
1262 (long int) ret_extent, (long int) arg_extent);
kono
parents:
diff changeset
1263 }
kono
parents:
diff changeset
1264 else
kono
parents:
diff changeset
1265 {
kono
parents:
diff changeset
1266 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
kono
parents:
diff changeset
1267 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
kono
parents:
diff changeset
1268 if (arg_extent != ret_extent)
kono
parents:
diff changeset
1269 runtime_error ("Incorrect extent in return array in"
kono
parents:
diff changeset
1270 " MATMUL intrinsic for dimension 1:"
kono
parents:
diff changeset
1271 " is %ld, should be %ld",
kono
parents:
diff changeset
1272 (long int) ret_extent, (long int) arg_extent);
kono
parents:
diff changeset
1273
kono
parents:
diff changeset
1274 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
kono
parents:
diff changeset
1275 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
kono
parents:
diff changeset
1276 if (arg_extent != ret_extent)
kono
parents:
diff changeset
1277 runtime_error ("Incorrect extent in return array in"
kono
parents:
diff changeset
1278 " MATMUL intrinsic for dimension 2:"
kono
parents:
diff changeset
1279 " is %ld, should be %ld",
kono
parents:
diff changeset
1280 (long int) ret_extent, (long int) arg_extent);
kono
parents:
diff changeset
1281 }
kono
parents:
diff changeset
1282 }
kono
parents:
diff changeset
1283
kono
parents:
diff changeset
1284
kono
parents:
diff changeset
1285 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
kono
parents:
diff changeset
1286 {
kono
parents:
diff changeset
1287 /* One-dimensional result may be addressed in the code below
kono
parents:
diff changeset
1288 either as a row or a column matrix. We want both cases to
kono
parents:
diff changeset
1289 work. */
kono
parents:
diff changeset
1290 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
kono
parents:
diff changeset
1291 }
kono
parents:
diff changeset
1292 else
kono
parents:
diff changeset
1293 {
kono
parents:
diff changeset
1294 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
kono
parents:
diff changeset
1295 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
kono
parents:
diff changeset
1296 }
kono
parents:
diff changeset
1297
kono
parents:
diff changeset
1298
kono
parents:
diff changeset
1299 if (GFC_DESCRIPTOR_RANK (a) == 1)
kono
parents:
diff changeset
1300 {
kono
parents:
diff changeset
1301 /* Treat it as a a row matrix A[1,count]. */
kono
parents:
diff changeset
1302 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
kono
parents:
diff changeset
1303 aystride = 1;
kono
parents:
diff changeset
1304
kono
parents:
diff changeset
1305 xcount = 1;
kono
parents:
diff changeset
1306 count = GFC_DESCRIPTOR_EXTENT(a,0);
kono
parents:
diff changeset
1307 }
kono
parents:
diff changeset
1308 else
kono
parents:
diff changeset
1309 {
kono
parents:
diff changeset
1310 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
kono
parents:
diff changeset
1311 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
kono
parents:
diff changeset
1312
kono
parents:
diff changeset
1313 count = GFC_DESCRIPTOR_EXTENT(a,1);
kono
parents:
diff changeset
1314 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
kono
parents:
diff changeset
1315 }
kono
parents:
diff changeset
1316
kono
parents:
diff changeset
1317 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
kono
parents:
diff changeset
1318 {
kono
parents:
diff changeset
1319 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
kono
parents:
diff changeset
1320 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
kono
parents:
diff changeset
1321 }
kono
parents:
diff changeset
1322
kono
parents:
diff changeset
1323 if (GFC_DESCRIPTOR_RANK (b) == 1)
kono
parents:
diff changeset
1324 {
kono
parents:
diff changeset
1325 /* Treat it as a column matrix B[count,1] */
kono
parents:
diff changeset
1326 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
kono
parents:
diff changeset
1327
kono
parents:
diff changeset
1328 /* bystride should never be used for 1-dimensional b.
kono
parents:
diff changeset
1329 The value is only used for calculation of the
kono
parents:
diff changeset
1330 memory by the buffer. */
kono
parents:
diff changeset
1331 bystride = 256;
kono
parents:
diff changeset
1332 ycount = 1;
kono
parents:
diff changeset
1333 }
kono
parents:
diff changeset
1334 else
kono
parents:
diff changeset
1335 {
kono
parents:
diff changeset
1336 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
kono
parents:
diff changeset
1337 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
kono
parents:
diff changeset
1338 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
kono
parents:
diff changeset
1339 }
kono
parents:
diff changeset
1340
kono
parents:
diff changeset
1341 abase = a->base_addr;
kono
parents:
diff changeset
1342 bbase = b->base_addr;
kono
parents:
diff changeset
1343 dest = retarray->base_addr;
kono
parents:
diff changeset
1344
kono
parents:
diff changeset
1345 /* Now that everything is set up, we perform the multiplication
kono
parents:
diff changeset
1346 itself. */
kono
parents:
diff changeset
1347
kono
parents:
diff changeset
1348 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
kono
parents:
diff changeset
1349 #define min(a,b) ((a) <= (b) ? (a) : (b))
kono
parents:
diff changeset
1350 #define max(a,b) ((a) >= (b) ? (a) : (b))
kono
parents:
diff changeset
1351
kono
parents:
diff changeset
1352 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
kono
parents:
diff changeset
1353 && (bxstride == 1 || bystride == 1)
kono
parents:
diff changeset
1354 && (((float) xcount) * ((float) ycount) * ((float) count)
kono
parents:
diff changeset
1355 > POW3(blas_limit)))
kono
parents:
diff changeset
1356 {
kono
parents:
diff changeset
1357 const int m = xcount, n = ycount, k = count, ldc = rystride;
kono
parents:
diff changeset
1358 const GFC_INTEGER_1 one = 1, zero = 0;
kono
parents:
diff changeset
1359 const int lda = (axstride == 1) ? aystride : axstride,
kono
parents:
diff changeset
1360 ldb = (bxstride == 1) ? bystride : bxstride;
kono
parents:
diff changeset
1361
kono
parents:
diff changeset
1362 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
kono
parents:
diff changeset
1363 {
kono
parents:
diff changeset
1364 assert (gemm != NULL);
kono
parents:
diff changeset
1365 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
kono
parents:
diff changeset
1366 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
kono
parents:
diff changeset
1367 &ldc, 1, 1);
kono
parents:
diff changeset
1368 return;
kono
parents:
diff changeset
1369 }
kono
parents:
diff changeset
1370 }
kono
parents:
diff changeset
1371
kono
parents:
diff changeset
1372 if (rxstride == 1 && axstride == 1 && bxstride == 1)
kono
parents:
diff changeset
1373 {
kono
parents:
diff changeset
1374 /* This block of code implements a tuned matmul, derived from
kono
parents:
diff changeset
1375 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
kono
parents:
diff changeset
1376
kono
parents:
diff changeset
1377 Bo Kagstrom and Per Ling
kono
parents:
diff changeset
1378 Department of Computing Science
kono
parents:
diff changeset
1379 Umea University
kono
parents:
diff changeset
1380 S-901 87 Umea, Sweden
kono
parents:
diff changeset
1381
kono
parents:
diff changeset
1382 from netlib.org, translated to C, and modified for matmul.m4. */
kono
parents:
diff changeset
1383
kono
parents:
diff changeset
1384 const GFC_INTEGER_1 *a, *b;
kono
parents:
diff changeset
1385 GFC_INTEGER_1 *c;
kono
parents:
diff changeset
1386 const index_type m = xcount, n = ycount, k = count;
kono
parents:
diff changeset
1387
kono
parents:
diff changeset
1388 /* System generated locals */
kono
parents:
diff changeset
1389 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
kono
parents:
diff changeset
1390 i1, i2, i3, i4, i5, i6;
kono
parents:
diff changeset
1391
kono
parents:
diff changeset
1392 /* Local variables */
kono
parents:
diff changeset
1393 GFC_INTEGER_1 f11, f12, f21, f22, f31, f32, f41, f42,
kono
parents:
diff changeset
1394 f13, f14, f23, f24, f33, f34, f43, f44;
kono
parents:
diff changeset
1395 index_type i, j, l, ii, jj, ll;
kono
parents:
diff changeset
1396 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
kono
parents:
diff changeset
1397 GFC_INTEGER_1 *t1;
kono
parents:
diff changeset
1398
kono
parents:
diff changeset
1399 a = abase;
kono
parents:
diff changeset
1400 b = bbase;
kono
parents:
diff changeset
1401 c = retarray->base_addr;
kono
parents:
diff changeset
1402
kono
parents:
diff changeset
1403 /* Parameter adjustments */
kono
parents:
diff changeset
1404 c_dim1 = rystride;
kono
parents:
diff changeset
1405 c_offset = 1 + c_dim1;
kono
parents:
diff changeset
1406 c -= c_offset;
kono
parents:
diff changeset
1407 a_dim1 = aystride;
kono
parents:
diff changeset
1408 a_offset = 1 + a_dim1;
kono
parents:
diff changeset
1409 a -= a_offset;
kono
parents:
diff changeset
1410 b_dim1 = bystride;
kono
parents:
diff changeset
1411 b_offset = 1 + b_dim1;
kono
parents:
diff changeset
1412 b -= b_offset;
kono
parents:
diff changeset
1413
kono
parents:
diff changeset
1414 /* Empty c first. */
kono
parents:
diff changeset
1415 for (j=1; j<=n; j++)
kono
parents:
diff changeset
1416 for (i=1; i<=m; i++)
kono
parents:
diff changeset
1417 c[i + j * c_dim1] = (GFC_INTEGER_1)0;
kono
parents:
diff changeset
1418
kono
parents:
diff changeset
1419 /* Early exit if possible */
kono
parents:
diff changeset
1420 if (m == 0 || n == 0 || k == 0)
kono
parents:
diff changeset
1421 return;
kono
parents:
diff changeset
1422
kono
parents:
diff changeset
1423 /* Adjust size of t1 to what is needed. */
kono
parents:
diff changeset
1424 index_type t1_dim;
kono
parents:
diff changeset
1425 t1_dim = (a_dim1-1) * 256 + b_dim1;
kono
parents:
diff changeset
1426 if (t1_dim > 65536)
kono
parents:
diff changeset
1427 t1_dim = 65536;
kono
parents:
diff changeset
1428
kono
parents:
diff changeset
1429 t1 = malloc (t1_dim * sizeof(GFC_INTEGER_1));
kono
parents:
diff changeset
1430
kono
parents:
diff changeset
1431 /* Start turning the crank. */
kono
parents:
diff changeset
1432 i1 = n;
kono
parents:
diff changeset
1433 for (jj = 1; jj <= i1; jj += 512)
kono
parents:
diff changeset
1434 {
kono
parents:
diff changeset
1435 /* Computing MIN */
kono
parents:
diff changeset
1436 i2 = 512;
kono
parents:
diff changeset
1437 i3 = n - jj + 1;
kono
parents:
diff changeset
1438 jsec = min(i2,i3);
kono
parents:
diff changeset
1439 ujsec = jsec - jsec % 4;
kono
parents:
diff changeset
1440 i2 = k;
kono
parents:
diff changeset
1441 for (ll = 1; ll <= i2; ll += 256)
kono
parents:
diff changeset
1442 {
kono
parents:
diff changeset
1443 /* Computing MIN */
kono
parents:
diff changeset
1444 i3 = 256;
kono
parents:
diff changeset
1445 i4 = k - ll + 1;
kono
parents:
diff changeset
1446 lsec = min(i3,i4);
kono
parents:
diff changeset
1447 ulsec = lsec - lsec % 2;
kono
parents:
diff changeset
1448
kono
parents:
diff changeset
1449 i3 = m;
kono
parents:
diff changeset
1450 for (ii = 1; ii <= i3; ii += 256)
kono
parents:
diff changeset
1451 {
kono
parents:
diff changeset
1452 /* Computing MIN */
kono
parents:
diff changeset
1453 i4 = 256;
kono
parents:
diff changeset
1454 i5 = m - ii + 1;
kono
parents:
diff changeset
1455 isec = min(i4,i5);
kono
parents:
diff changeset
1456 uisec = isec - isec % 2;
kono
parents:
diff changeset
1457 i4 = ll + ulsec - 1;
kono
parents:
diff changeset
1458 for (l = ll; l <= i4; l += 2)
kono
parents:
diff changeset
1459 {
kono
parents:
diff changeset
1460 i5 = ii + uisec - 1;
kono
parents:
diff changeset
1461 for (i = ii; i <= i5; i += 2)
kono
parents:
diff changeset
1462 {
kono
parents:
diff changeset
1463 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
kono
parents:
diff changeset
1464 a[i + l * a_dim1];
kono
parents:
diff changeset
1465 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
kono
parents:
diff changeset
1466 a[i + (l + 1) * a_dim1];
kono
parents:
diff changeset
1467 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
kono
parents:
diff changeset
1468 a[i + 1 + l * a_dim1];
kono
parents:
diff changeset
1469 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
kono
parents:
diff changeset
1470 a[i + 1 + (l + 1) * a_dim1];
kono
parents:
diff changeset
1471 }
kono
parents:
diff changeset
1472 if (uisec < isec)
kono
parents:
diff changeset
1473 {
kono
parents:
diff changeset
1474 t1[l - ll + 1 + (isec << 8) - 257] =
kono
parents:
diff changeset
1475 a[ii + isec - 1 + l * a_dim1];
kono
parents:
diff changeset
1476 t1[l - ll + 2 + (isec << 8) - 257] =
kono
parents:
diff changeset
1477 a[ii + isec - 1 + (l + 1) * a_dim1];
kono
parents:
diff changeset
1478 }
kono
parents:
diff changeset
1479 }
kono
parents:
diff changeset
1480 if (ulsec < lsec)
kono
parents:
diff changeset
1481 {
kono
parents:
diff changeset
1482 i4 = ii + isec - 1;
kono
parents:
diff changeset
1483 for (i = ii; i<= i4; ++i)
kono
parents:
diff changeset
1484 {
kono
parents:
diff changeset
1485 t1[lsec + ((i - ii + 1) << 8) - 257] =
kono
parents:
diff changeset
1486 a[i + (ll + lsec - 1) * a_dim1];
kono
parents:
diff changeset
1487 }
kono
parents:
diff changeset
1488 }
kono
parents:
diff changeset
1489
kono
parents:
diff changeset
1490 uisec = isec - isec % 4;
kono
parents:
diff changeset
1491 i4 = jj + ujsec - 1;
kono
parents:
diff changeset
1492 for (j = jj; j <= i4; j += 4)
kono
parents:
diff changeset
1493 {
kono
parents:
diff changeset
1494 i5 = ii + uisec - 1;
kono
parents:
diff changeset
1495 for (i = ii; i <= i5; i += 4)
kono
parents:
diff changeset
1496 {
kono
parents:
diff changeset
1497 f11 = c[i + j * c_dim1];
kono
parents:
diff changeset
1498 f21 = c[i + 1 + j * c_dim1];
kono
parents:
diff changeset
1499 f12 = c[i + (j + 1) * c_dim1];
kono
parents:
diff changeset
1500 f22 = c[i + 1 + (j + 1) * c_dim1];
kono
parents:
diff changeset
1501 f13 = c[i + (j + 2) * c_dim1];
kono
parents:
diff changeset
1502 f23 = c[i + 1 + (j + 2) * c_dim1];
kono
parents:
diff changeset
1503 f14 = c[i + (j + 3) * c_dim1];
kono
parents:
diff changeset
1504 f24 = c[i + 1 + (j + 3) * c_dim1];
kono
parents:
diff changeset
1505 f31 = c[i + 2 + j * c_dim1];
kono
parents:
diff changeset
1506 f41 = c[i + 3 + j * c_dim1];
kono
parents:
diff changeset
1507 f32 = c[i + 2 + (j + 1) * c_dim1];
kono
parents:
diff changeset
1508 f42 = c[i + 3 + (j + 1) * c_dim1];
kono
parents:
diff changeset
1509 f33 = c[i + 2 + (j + 2) * c_dim1];
kono
parents:
diff changeset
1510 f43 = c[i + 3 + (j + 2) * c_dim1];
kono
parents:
diff changeset
1511 f34 = c[i + 2 + (j + 3) * c_dim1];
kono
parents:
diff changeset
1512 f44 = c[i + 3 + (j + 3) * c_dim1];
kono
parents:
diff changeset
1513 i6 = ll + lsec - 1;
kono
parents:
diff changeset
1514 for (l = ll; l <= i6; ++l)
kono
parents:
diff changeset
1515 {
kono
parents:
diff changeset
1516 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
kono
parents:
diff changeset
1517 * b[l + j * b_dim1];
kono
parents:
diff changeset
1518 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
kono
parents:
diff changeset
1519 * b[l + j * b_dim1];
kono
parents:
diff changeset
1520 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
kono
parents:
diff changeset
1521 * b[l + (j + 1) * b_dim1];
kono
parents:
diff changeset
1522 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
kono
parents:
diff changeset
1523 * b[l + (j + 1) * b_dim1];
kono
parents:
diff changeset
1524 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
kono
parents:
diff changeset
1525 * b[l + (j + 2) * b_dim1];
kono
parents:
diff changeset
1526 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
kono
parents:
diff changeset
1527 * b[l + (j + 2) * b_dim1];
kono
parents:
diff changeset
1528 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
kono
parents:
diff changeset
1529 * b[l + (j + 3) * b_dim1];
kono
parents:
diff changeset
1530 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
kono
parents:
diff changeset
1531 * b[l + (j + 3) * b_dim1];
kono
parents:
diff changeset
1532 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
kono
parents:
diff changeset
1533 * b[l + j * b_dim1];
kono
parents:
diff changeset
1534 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
kono
parents:
diff changeset
1535 * b[l + j * b_dim1];
kono
parents:
diff changeset
1536 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
kono
parents:
diff changeset
1537 * b[l + (j + 1) * b_dim1];
kono
parents:
diff changeset
1538 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
kono
parents:
diff changeset
1539 * b[l + (j + 1) * b_dim1];
kono
parents:
diff changeset
1540 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
kono
parents:
diff changeset
1541 * b[l + (j + 2) * b_dim1];
kono
parents:
diff changeset
1542 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
kono
parents:
diff changeset
1543 * b[l + (j + 2) * b_dim1];
kono
parents:
diff changeset
1544 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
kono
parents:
diff changeset
1545 * b[l + (j + 3) * b_dim1];
kono
parents:
diff changeset
1546 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
kono
parents:
diff changeset
1547 * b[l + (j + 3) * b_dim1];
kono
parents:
diff changeset
1548 }
kono
parents:
diff changeset
1549 c[i + j * c_dim1] = f11;
kono
parents:
diff changeset
1550 c[i + 1 + j * c_dim1] = f21;
kono
parents:
diff changeset
1551 c[i + (j + 1) * c_dim1] = f12;
kono
parents:
diff changeset
1552 c[i + 1 + (j + 1) * c_dim1] = f22;
kono
parents:
diff changeset
1553 c[i + (j + 2) * c_dim1] = f13;
kono
parents:
diff changeset
1554 c[i + 1 + (j + 2) * c_dim1] = f23;
kono
parents:
diff changeset
1555 c[i + (j + 3) * c_dim1] = f14;
kono
parents:
diff changeset
1556 c[i + 1 + (j + 3) * c_dim1] = f24;
kono
parents:
diff changeset
1557 c[i + 2 + j * c_dim1] = f31;
kono
parents:
diff changeset
1558 c[i + 3 + j * c_dim1] = f41;
kono
parents:
diff changeset
1559 c[i + 2 + (j + 1) * c_dim1] = f32;
kono
parents:
diff changeset
1560 c[i + 3 + (j + 1) * c_dim1] = f42;
kono
parents:
diff changeset
1561 c[i + 2 + (j + 2) * c_dim1] = f33;
kono
parents:
diff changeset
1562 c[i + 3 + (j + 2) * c_dim1] = f43;
kono
parents:
diff changeset
1563 c[i + 2 + (j + 3) * c_dim1] = f34;
kono
parents:
diff changeset
1564 c[i + 3 + (j + 3) * c_dim1] = f44;
kono
parents:
diff changeset
1565 }
kono
parents:
diff changeset
1566 if (uisec < isec)
kono
parents:
diff changeset
1567 {
kono
parents:
diff changeset
1568 i5 = ii + isec - 1;
kono
parents:
diff changeset
1569 for (i = ii + uisec; i <= i5; ++i)
kono
parents:
diff changeset
1570 {
kono
parents:
diff changeset
1571 f11 = c[i + j * c_dim1];
kono
parents:
diff changeset
1572 f12 = c[i + (j + 1) * c_dim1];
kono
parents:
diff changeset
1573 f13 = c[i + (j + 2) * c_dim1];
kono
parents:
diff changeset
1574 f14 = c[i + (j + 3) * c_dim1];
kono
parents:
diff changeset
1575 i6 = ll + lsec - 1;
kono
parents:
diff changeset
1576 for (l = ll; l <= i6; ++l)
kono
parents:
diff changeset
1577 {
kono
parents:
diff changeset
1578 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
1579 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
1580 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
1581 257] * b[l + (j + 1) * b_dim1];
kono
parents:
diff changeset
1582 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
1583 257] * b[l + (j + 2) * b_dim1];
kono
parents:
diff changeset
1584 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
1585 257] * b[l + (j + 3) * b_dim1];
kono
parents:
diff changeset
1586 }
kono
parents:
diff changeset
1587 c[i + j * c_dim1] = f11;
kono
parents:
diff changeset
1588 c[i + (j + 1) * c_dim1] = f12;
kono
parents:
diff changeset
1589 c[i + (j + 2) * c_dim1] = f13;
kono
parents:
diff changeset
1590 c[i + (j + 3) * c_dim1] = f14;
kono
parents:
diff changeset
1591 }
kono
parents:
diff changeset
1592 }
kono
parents:
diff changeset
1593 }
kono
parents:
diff changeset
1594 if (ujsec < jsec)
kono
parents:
diff changeset
1595 {
kono
parents:
diff changeset
1596 i4 = jj + jsec - 1;
kono
parents:
diff changeset
1597 for (j = jj + ujsec; j <= i4; ++j)
kono
parents:
diff changeset
1598 {
kono
parents:
diff changeset
1599 i5 = ii + uisec - 1;
kono
parents:
diff changeset
1600 for (i = ii; i <= i5; i += 4)
kono
parents:
diff changeset
1601 {
kono
parents:
diff changeset
1602 f11 = c[i + j * c_dim1];
kono
parents:
diff changeset
1603 f21 = c[i + 1 + j * c_dim1];
kono
parents:
diff changeset
1604 f31 = c[i + 2 + j * c_dim1];
kono
parents:
diff changeset
1605 f41 = c[i + 3 + j * c_dim1];
kono
parents:
diff changeset
1606 i6 = ll + lsec - 1;
kono
parents:
diff changeset
1607 for (l = ll; l <= i6; ++l)
kono
parents:
diff changeset
1608 {
kono
parents:
diff changeset
1609 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
1610 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
1611 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
kono
parents:
diff changeset
1612 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
1613 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
kono
parents:
diff changeset
1614 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
1615 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
kono
parents:
diff changeset
1616 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
1617 }
kono
parents:
diff changeset
1618 c[i + j * c_dim1] = f11;
kono
parents:
diff changeset
1619 c[i + 1 + j * c_dim1] = f21;
kono
parents:
diff changeset
1620 c[i + 2 + j * c_dim1] = f31;
kono
parents:
diff changeset
1621 c[i + 3 + j * c_dim1] = f41;
kono
parents:
diff changeset
1622 }
kono
parents:
diff changeset
1623 i5 = ii + isec - 1;
kono
parents:
diff changeset
1624 for (i = ii + uisec; i <= i5; ++i)
kono
parents:
diff changeset
1625 {
kono
parents:
diff changeset
1626 f11 = c[i + j * c_dim1];
kono
parents:
diff changeset
1627 i6 = ll + lsec - 1;
kono
parents:
diff changeset
1628 for (l = ll; l <= i6; ++l)
kono
parents:
diff changeset
1629 {
kono
parents:
diff changeset
1630 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
1631 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
1632 }
kono
parents:
diff changeset
1633 c[i + j * c_dim1] = f11;
kono
parents:
diff changeset
1634 }
kono
parents:
diff changeset
1635 }
kono
parents:
diff changeset
1636 }
kono
parents:
diff changeset
1637 }
kono
parents:
diff changeset
1638 }
kono
parents:
diff changeset
1639 }
kono
parents:
diff changeset
1640 free(t1);
kono
parents:
diff changeset
1641 return;
kono
parents:
diff changeset
1642 }
kono
parents:
diff changeset
1643 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
kono
parents:
diff changeset
1644 {
kono
parents:
diff changeset
1645 if (GFC_DESCRIPTOR_RANK (a) != 1)
kono
parents:
diff changeset
1646 {
kono
parents:
diff changeset
1647 const GFC_INTEGER_1 *restrict abase_x;
kono
parents:
diff changeset
1648 const GFC_INTEGER_1 *restrict bbase_y;
kono
parents:
diff changeset
1649 GFC_INTEGER_1 *restrict dest_y;
kono
parents:
diff changeset
1650 GFC_INTEGER_1 s;
kono
parents:
diff changeset
1651
kono
parents:
diff changeset
1652 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
1653 {
kono
parents:
diff changeset
1654 bbase_y = &bbase[y*bystride];
kono
parents:
diff changeset
1655 dest_y = &dest[y*rystride];
kono
parents:
diff changeset
1656 for (x = 0; x < xcount; x++)
kono
parents:
diff changeset
1657 {
kono
parents:
diff changeset
1658 abase_x = &abase[x*axstride];
kono
parents:
diff changeset
1659 s = (GFC_INTEGER_1) 0;
kono
parents:
diff changeset
1660 for (n = 0; n < count; n++)
kono
parents:
diff changeset
1661 s += abase_x[n] * bbase_y[n];
kono
parents:
diff changeset
1662 dest_y[x] = s;
kono
parents:
diff changeset
1663 }
kono
parents:
diff changeset
1664 }
kono
parents:
diff changeset
1665 }
kono
parents:
diff changeset
1666 else
kono
parents:
diff changeset
1667 {
kono
parents:
diff changeset
1668 const GFC_INTEGER_1 *restrict bbase_y;
kono
parents:
diff changeset
1669 GFC_INTEGER_1 s;
kono
parents:
diff changeset
1670
kono
parents:
diff changeset
1671 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
1672 {
kono
parents:
diff changeset
1673 bbase_y = &bbase[y*bystride];
kono
parents:
diff changeset
1674 s = (GFC_INTEGER_1) 0;
kono
parents:
diff changeset
1675 for (n = 0; n < count; n++)
kono
parents:
diff changeset
1676 s += abase[n*axstride] * bbase_y[n];
kono
parents:
diff changeset
1677 dest[y*rystride] = s;
kono
parents:
diff changeset
1678 }
kono
parents:
diff changeset
1679 }
kono
parents:
diff changeset
1680 }
kono
parents:
diff changeset
1681 else if (axstride < aystride)
kono
parents:
diff changeset
1682 {
kono
parents:
diff changeset
1683 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
1684 for (x = 0; x < xcount; x++)
kono
parents:
diff changeset
1685 dest[x*rxstride + y*rystride] = (GFC_INTEGER_1)0;
kono
parents:
diff changeset
1686
kono
parents:
diff changeset
1687 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
1688 for (n = 0; n < count; n++)
kono
parents:
diff changeset
1689 for (x = 0; x < xcount; x++)
kono
parents:
diff changeset
1690 /* dest[x,y] += a[x,n] * b[n,y] */
kono
parents:
diff changeset
1691 dest[x*rxstride + y*rystride] +=
kono
parents:
diff changeset
1692 abase[x*axstride + n*aystride] *
kono
parents:
diff changeset
1693 bbase[n*bxstride + y*bystride];
kono
parents:
diff changeset
1694 }
kono
parents:
diff changeset
1695 else if (GFC_DESCRIPTOR_RANK (a) == 1)
kono
parents:
diff changeset
1696 {
kono
parents:
diff changeset
1697 const GFC_INTEGER_1 *restrict bbase_y;
kono
parents:
diff changeset
1698 GFC_INTEGER_1 s;
kono
parents:
diff changeset
1699
kono
parents:
diff changeset
1700 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
1701 {
kono
parents:
diff changeset
1702 bbase_y = &bbase[y*bystride];
kono
parents:
diff changeset
1703 s = (GFC_INTEGER_1) 0;
kono
parents:
diff changeset
1704 for (n = 0; n < count; n++)
kono
parents:
diff changeset
1705 s += abase[n*axstride] * bbase_y[n*bxstride];
kono
parents:
diff changeset
1706 dest[y*rxstride] = s;
kono
parents:
diff changeset
1707 }
kono
parents:
diff changeset
1708 }
kono
parents:
diff changeset
1709 else
kono
parents:
diff changeset
1710 {
kono
parents:
diff changeset
1711 const GFC_INTEGER_1 *restrict abase_x;
kono
parents:
diff changeset
1712 const GFC_INTEGER_1 *restrict bbase_y;
kono
parents:
diff changeset
1713 GFC_INTEGER_1 *restrict dest_y;
kono
parents:
diff changeset
1714 GFC_INTEGER_1 s;
kono
parents:
diff changeset
1715
kono
parents:
diff changeset
1716 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
1717 {
kono
parents:
diff changeset
1718 bbase_y = &bbase[y*bystride];
kono
parents:
diff changeset
1719 dest_y = &dest[y*rystride];
kono
parents:
diff changeset
1720 for (x = 0; x < xcount; x++)
kono
parents:
diff changeset
1721 {
kono
parents:
diff changeset
1722 abase_x = &abase[x*axstride];
kono
parents:
diff changeset
1723 s = (GFC_INTEGER_1) 0;
kono
parents:
diff changeset
1724 for (n = 0; n < count; n++)
kono
parents:
diff changeset
1725 s += abase_x[n*aystride] * bbase_y[n*bxstride];
kono
parents:
diff changeset
1726 dest_y[x*rxstride] = s;
kono
parents:
diff changeset
1727 }
kono
parents:
diff changeset
1728 }
kono
parents:
diff changeset
1729 }
kono
parents:
diff changeset
1730 }
kono
parents:
diff changeset
1731 #undef POW3
kono
parents:
diff changeset
1732 #undef min
kono
parents:
diff changeset
1733 #undef max
kono
parents:
diff changeset
1734
kono
parents:
diff changeset
1735 #endif /* HAVE_AVX512F */
kono
parents:
diff changeset
1736
kono
parents:
diff changeset
1737 /* AMD-specifix funtions with AVX128 and FMA3/FMA4. */
kono
parents:
diff changeset
1738
kono
parents:
diff changeset
1739 #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
kono
parents:
diff changeset
1740 void
kono
parents:
diff changeset
1741 matmul_i1_avx128_fma3 (gfc_array_i1 * const restrict retarray,
kono
parents:
diff changeset
1742 gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
kono
parents:
diff changeset
1743 int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
kono
parents:
diff changeset
1744 internal_proto(matmul_i1_avx128_fma3);
kono
parents:
diff changeset
1745 #endif
kono
parents:
diff changeset
1746
kono
parents:
diff changeset
1747 #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
kono
parents:
diff changeset
1748 void
kono
parents:
diff changeset
1749 matmul_i1_avx128_fma4 (gfc_array_i1 * const restrict retarray,
kono
parents:
diff changeset
1750 gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
kono
parents:
diff changeset
1751 int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
kono
parents:
diff changeset
1752 internal_proto(matmul_i1_avx128_fma4);
kono
parents:
diff changeset
1753 #endif
kono
parents:
diff changeset
1754
kono
parents:
diff changeset
1755 /* Function to fall back to if there is no special processor-specific version. */
kono
parents:
diff changeset
1756 static void
kono
parents:
diff changeset
1757 matmul_i1_vanilla (gfc_array_i1 * const restrict retarray,
kono
parents:
diff changeset
1758 gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
kono
parents:
diff changeset
1759 int blas_limit, blas_call gemm)
kono
parents:
diff changeset
1760 {
kono
parents:
diff changeset
1761 const GFC_INTEGER_1 * restrict abase;
kono
parents:
diff changeset
1762 const GFC_INTEGER_1 * restrict bbase;
kono
parents:
diff changeset
1763 GFC_INTEGER_1 * restrict dest;
kono
parents:
diff changeset
1764
kono
parents:
diff changeset
1765 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
kono
parents:
diff changeset
1766 index_type x, y, n, count, xcount, ycount;
kono
parents:
diff changeset
1767
kono
parents:
diff changeset
1768 assert (GFC_DESCRIPTOR_RANK (a) == 2
kono
parents:
diff changeset
1769 || GFC_DESCRIPTOR_RANK (b) == 2);
kono
parents:
diff changeset
1770
kono
parents:
diff changeset
1771 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
kono
parents:
diff changeset
1772
kono
parents:
diff changeset
1773 Either A or B (but not both) can be rank 1:
kono
parents:
diff changeset
1774
kono
parents:
diff changeset
1775 o One-dimensional argument A is implicitly treated as a row matrix
kono
parents:
diff changeset
1776 dimensioned [1,count], so xcount=1.
kono
parents:
diff changeset
1777
kono
parents:
diff changeset
1778 o One-dimensional argument B is implicitly treated as a column matrix
kono
parents:
diff changeset
1779 dimensioned [count, 1], so ycount=1.
kono
parents:
diff changeset
1780 */
kono
parents:
diff changeset
1781
kono
parents:
diff changeset
1782 if (retarray->base_addr == NULL)
kono
parents:
diff changeset
1783 {
kono
parents:
diff changeset
1784 if (GFC_DESCRIPTOR_RANK (a) == 1)
kono
parents:
diff changeset
1785 {
kono
parents:
diff changeset
1786 GFC_DIMENSION_SET(retarray->dim[0], 0,
kono
parents:
diff changeset
1787 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
kono
parents:
diff changeset
1788 }
kono
parents:
diff changeset
1789 else if (GFC_DESCRIPTOR_RANK (b) == 1)
kono
parents:
diff changeset
1790 {
kono
parents:
diff changeset
1791 GFC_DIMENSION_SET(retarray->dim[0], 0,
kono
parents:
diff changeset
1792 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
kono
parents:
diff changeset
1793 }
kono
parents:
diff changeset
1794 else
kono
parents:
diff changeset
1795 {
kono
parents:
diff changeset
1796 GFC_DIMENSION_SET(retarray->dim[0], 0,
kono
parents:
diff changeset
1797 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
kono
parents:
diff changeset
1798
kono
parents:
diff changeset
1799 GFC_DIMENSION_SET(retarray->dim[1], 0,
kono
parents:
diff changeset
1800 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
kono
parents:
diff changeset
1801 GFC_DESCRIPTOR_EXTENT(retarray,0));
kono
parents:
diff changeset
1802 }
kono
parents:
diff changeset
1803
kono
parents:
diff changeset
1804 retarray->base_addr
kono
parents:
diff changeset
1805 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_1));
kono
parents:
diff changeset
1806 retarray->offset = 0;
kono
parents:
diff changeset
1807 }
kono
parents:
diff changeset
1808 else if (unlikely (compile_options.bounds_check))
kono
parents:
diff changeset
1809 {
kono
parents:
diff changeset
1810 index_type ret_extent, arg_extent;
kono
parents:
diff changeset
1811
kono
parents:
diff changeset
1812 if (GFC_DESCRIPTOR_RANK (a) == 1)
kono
parents:
diff changeset
1813 {
kono
parents:
diff changeset
1814 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
kono
parents:
diff changeset
1815 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
kono
parents:
diff changeset
1816 if (arg_extent != ret_extent)
kono
parents:
diff changeset
1817 runtime_error ("Incorrect extent in return array in"
kono
parents:
diff changeset
1818 " MATMUL intrinsic: is %ld, should be %ld",
kono
parents:
diff changeset
1819 (long int) ret_extent, (long int) arg_extent);
kono
parents:
diff changeset
1820 }
kono
parents:
diff changeset
1821 else if (GFC_DESCRIPTOR_RANK (b) == 1)
kono
parents:
diff changeset
1822 {
kono
parents:
diff changeset
1823 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
kono
parents:
diff changeset
1824 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
kono
parents:
diff changeset
1825 if (arg_extent != ret_extent)
kono
parents:
diff changeset
1826 runtime_error ("Incorrect extent in return array in"
kono
parents:
diff changeset
1827 " MATMUL intrinsic: is %ld, should be %ld",
kono
parents:
diff changeset
1828 (long int) ret_extent, (long int) arg_extent);
kono
parents:
diff changeset
1829 }
kono
parents:
diff changeset
1830 else
kono
parents:
diff changeset
1831 {
kono
parents:
diff changeset
1832 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
kono
parents:
diff changeset
1833 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
kono
parents:
diff changeset
1834 if (arg_extent != ret_extent)
kono
parents:
diff changeset
1835 runtime_error ("Incorrect extent in return array in"
kono
parents:
diff changeset
1836 " MATMUL intrinsic for dimension 1:"
kono
parents:
diff changeset
1837 " is %ld, should be %ld",
kono
parents:
diff changeset
1838 (long int) ret_extent, (long int) arg_extent);
kono
parents:
diff changeset
1839
kono
parents:
diff changeset
1840 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
kono
parents:
diff changeset
1841 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
kono
parents:
diff changeset
1842 if (arg_extent != ret_extent)
kono
parents:
diff changeset
1843 runtime_error ("Incorrect extent in return array in"
kono
parents:
diff changeset
1844 " MATMUL intrinsic for dimension 2:"
kono
parents:
diff changeset
1845 " is %ld, should be %ld",
kono
parents:
diff changeset
1846 (long int) ret_extent, (long int) arg_extent);
kono
parents:
diff changeset
1847 }
kono
parents:
diff changeset
1848 }
kono
parents:
diff changeset
1849
kono
parents:
diff changeset
1850
kono
parents:
diff changeset
1851 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
kono
parents:
diff changeset
1852 {
kono
parents:
diff changeset
1853 /* One-dimensional result may be addressed in the code below
kono
parents:
diff changeset
1854 either as a row or a column matrix. We want both cases to
kono
parents:
diff changeset
1855 work. */
kono
parents:
diff changeset
1856 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
kono
parents:
diff changeset
1857 }
kono
parents:
diff changeset
1858 else
kono
parents:
diff changeset
1859 {
kono
parents:
diff changeset
1860 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
kono
parents:
diff changeset
1861 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
kono
parents:
diff changeset
1862 }
kono
parents:
diff changeset
1863
kono
parents:
diff changeset
1864
kono
parents:
diff changeset
1865 if (GFC_DESCRIPTOR_RANK (a) == 1)
kono
parents:
diff changeset
1866 {
kono
parents:
diff changeset
1867 /* Treat it as a a row matrix A[1,count]. */
kono
parents:
diff changeset
1868 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
kono
parents:
diff changeset
1869 aystride = 1;
kono
parents:
diff changeset
1870
kono
parents:
diff changeset
1871 xcount = 1;
kono
parents:
diff changeset
1872 count = GFC_DESCRIPTOR_EXTENT(a,0);
kono
parents:
diff changeset
1873 }
kono
parents:
diff changeset
1874 else
kono
parents:
diff changeset
1875 {
kono
parents:
diff changeset
1876 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
kono
parents:
diff changeset
1877 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
kono
parents:
diff changeset
1878
kono
parents:
diff changeset
1879 count = GFC_DESCRIPTOR_EXTENT(a,1);
kono
parents:
diff changeset
1880 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
kono
parents:
diff changeset
1881 }
kono
parents:
diff changeset
1882
kono
parents:
diff changeset
1883 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
kono
parents:
diff changeset
1884 {
kono
parents:
diff changeset
1885 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
kono
parents:
diff changeset
1886 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
kono
parents:
diff changeset
1887 }
kono
parents:
diff changeset
1888
kono
parents:
diff changeset
1889 if (GFC_DESCRIPTOR_RANK (b) == 1)
kono
parents:
diff changeset
1890 {
kono
parents:
diff changeset
1891 /* Treat it as a column matrix B[count,1] */
kono
parents:
diff changeset
1892 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
kono
parents:
diff changeset
1893
kono
parents:
diff changeset
1894 /* bystride should never be used for 1-dimensional b.
kono
parents:
diff changeset
1895 The value is only used for calculation of the
kono
parents:
diff changeset
1896 memory by the buffer. */
kono
parents:
diff changeset
1897 bystride = 256;
kono
parents:
diff changeset
1898 ycount = 1;
kono
parents:
diff changeset
1899 }
kono
parents:
diff changeset
1900 else
kono
parents:
diff changeset
1901 {
kono
parents:
diff changeset
1902 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
kono
parents:
diff changeset
1903 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
kono
parents:
diff changeset
1904 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
kono
parents:
diff changeset
1905 }
kono
parents:
diff changeset
1906
kono
parents:
diff changeset
1907 abase = a->base_addr;
kono
parents:
diff changeset
1908 bbase = b->base_addr;
kono
parents:
diff changeset
1909 dest = retarray->base_addr;
kono
parents:
diff changeset
1910
kono
parents:
diff changeset
1911 /* Now that everything is set up, we perform the multiplication
kono
parents:
diff changeset
1912 itself. */
kono
parents:
diff changeset
1913
kono
parents:
diff changeset
1914 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
kono
parents:
diff changeset
1915 #define min(a,b) ((a) <= (b) ? (a) : (b))
kono
parents:
diff changeset
1916 #define max(a,b) ((a) >= (b) ? (a) : (b))
kono
parents:
diff changeset
1917
kono
parents:
diff changeset
1918 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
kono
parents:
diff changeset
1919 && (bxstride == 1 || bystride == 1)
kono
parents:
diff changeset
1920 && (((float) xcount) * ((float) ycount) * ((float) count)
kono
parents:
diff changeset
1921 > POW3(blas_limit)))
kono
parents:
diff changeset
1922 {
kono
parents:
diff changeset
1923 const int m = xcount, n = ycount, k = count, ldc = rystride;
kono
parents:
diff changeset
1924 const GFC_INTEGER_1 one = 1, zero = 0;
kono
parents:
diff changeset
1925 const int lda = (axstride == 1) ? aystride : axstride,
kono
parents:
diff changeset
1926 ldb = (bxstride == 1) ? bystride : bxstride;
kono
parents:
diff changeset
1927
kono
parents:
diff changeset
1928 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
kono
parents:
diff changeset
1929 {
kono
parents:
diff changeset
1930 assert (gemm != NULL);
kono
parents:
diff changeset
1931 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
kono
parents:
diff changeset
1932 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
kono
parents:
diff changeset
1933 &ldc, 1, 1);
kono
parents:
diff changeset
1934 return;
kono
parents:
diff changeset
1935 }
kono
parents:
diff changeset
1936 }
kono
parents:
diff changeset
1937
kono
parents:
diff changeset
1938 if (rxstride == 1 && axstride == 1 && bxstride == 1)
kono
parents:
diff changeset
1939 {
kono
parents:
diff changeset
1940 /* This block of code implements a tuned matmul, derived from
kono
parents:
diff changeset
1941 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
kono
parents:
diff changeset
1942
kono
parents:
diff changeset
1943 Bo Kagstrom and Per Ling
kono
parents:
diff changeset
1944 Department of Computing Science
kono
parents:
diff changeset
1945 Umea University
kono
parents:
diff changeset
1946 S-901 87 Umea, Sweden
kono
parents:
diff changeset
1947
kono
parents:
diff changeset
1948 from netlib.org, translated to C, and modified for matmul.m4. */
kono
parents:
diff changeset
1949
kono
parents:
diff changeset
1950 const GFC_INTEGER_1 *a, *b;
kono
parents:
diff changeset
1951 GFC_INTEGER_1 *c;
kono
parents:
diff changeset
1952 const index_type m = xcount, n = ycount, k = count;
kono
parents:
diff changeset
1953
kono
parents:
diff changeset
1954 /* System generated locals */
kono
parents:
diff changeset
1955 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
kono
parents:
diff changeset
1956 i1, i2, i3, i4, i5, i6;
kono
parents:
diff changeset
1957
kono
parents:
diff changeset
1958 /* Local variables */
kono
parents:
diff changeset
1959 GFC_INTEGER_1 f11, f12, f21, f22, f31, f32, f41, f42,
kono
parents:
diff changeset
1960 f13, f14, f23, f24, f33, f34, f43, f44;
kono
parents:
diff changeset
1961 index_type i, j, l, ii, jj, ll;
kono
parents:
diff changeset
1962 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
kono
parents:
diff changeset
1963 GFC_INTEGER_1 *t1;
kono
parents:
diff changeset
1964
kono
parents:
diff changeset
1965 a = abase;
kono
parents:
diff changeset
1966 b = bbase;
kono
parents:
diff changeset
1967 c = retarray->base_addr;
kono
parents:
diff changeset
1968
kono
parents:
diff changeset
1969 /* Parameter adjustments */
kono
parents:
diff changeset
1970 c_dim1 = rystride;
kono
parents:
diff changeset
1971 c_offset = 1 + c_dim1;
kono
parents:
diff changeset
1972 c -= c_offset;
kono
parents:
diff changeset
1973 a_dim1 = aystride;
kono
parents:
diff changeset
1974 a_offset = 1 + a_dim1;
kono
parents:
diff changeset
1975 a -= a_offset;
kono
parents:
diff changeset
1976 b_dim1 = bystride;
kono
parents:
diff changeset
1977 b_offset = 1 + b_dim1;
kono
parents:
diff changeset
1978 b -= b_offset;
kono
parents:
diff changeset
1979
kono
parents:
diff changeset
1980 /* Empty c first. */
kono
parents:
diff changeset
1981 for (j=1; j<=n; j++)
kono
parents:
diff changeset
1982 for (i=1; i<=m; i++)
kono
parents:
diff changeset
1983 c[i + j * c_dim1] = (GFC_INTEGER_1)0;
kono
parents:
diff changeset
1984
kono
parents:
diff changeset
1985 /* Early exit if possible */
kono
parents:
diff changeset
1986 if (m == 0 || n == 0 || k == 0)
kono
parents:
diff changeset
1987 return;
kono
parents:
diff changeset
1988
kono
parents:
diff changeset
1989 /* Adjust size of t1 to what is needed. */
kono
parents:
diff changeset
1990 index_type t1_dim;
kono
parents:
diff changeset
1991 t1_dim = (a_dim1-1) * 256 + b_dim1;
kono
parents:
diff changeset
1992 if (t1_dim > 65536)
kono
parents:
diff changeset
1993 t1_dim = 65536;
kono
parents:
diff changeset
1994
kono
parents:
diff changeset
1995 t1 = malloc (t1_dim * sizeof(GFC_INTEGER_1));
kono
parents:
diff changeset
1996
kono
parents:
diff changeset
1997 /* Start turning the crank. */
kono
parents:
diff changeset
1998 i1 = n;
kono
parents:
diff changeset
1999 for (jj = 1; jj <= i1; jj += 512)
kono
parents:
diff changeset
2000 {
kono
parents:
diff changeset
2001 /* Computing MIN */
kono
parents:
diff changeset
2002 i2 = 512;
kono
parents:
diff changeset
2003 i3 = n - jj + 1;
kono
parents:
diff changeset
2004 jsec = min(i2,i3);
kono
parents:
diff changeset
2005 ujsec = jsec - jsec % 4;
kono
parents:
diff changeset
2006 i2 = k;
kono
parents:
diff changeset
2007 for (ll = 1; ll <= i2; ll += 256)
kono
parents:
diff changeset
2008 {
kono
parents:
diff changeset
2009 /* Computing MIN */
kono
parents:
diff changeset
2010 i3 = 256;
kono
parents:
diff changeset
2011 i4 = k - ll + 1;
kono
parents:
diff changeset
2012 lsec = min(i3,i4);
kono
parents:
diff changeset
2013 ulsec = lsec - lsec % 2;
kono
parents:
diff changeset
2014
kono
parents:
diff changeset
2015 i3 = m;
kono
parents:
diff changeset
2016 for (ii = 1; ii <= i3; ii += 256)
kono
parents:
diff changeset
2017 {
kono
parents:
diff changeset
2018 /* Computing MIN */
kono
parents:
diff changeset
2019 i4 = 256;
kono
parents:
diff changeset
2020 i5 = m - ii + 1;
kono
parents:
diff changeset
2021 isec = min(i4,i5);
kono
parents:
diff changeset
2022 uisec = isec - isec % 2;
kono
parents:
diff changeset
2023 i4 = ll + ulsec - 1;
kono
parents:
diff changeset
2024 for (l = ll; l <= i4; l += 2)
kono
parents:
diff changeset
2025 {
kono
parents:
diff changeset
2026 i5 = ii + uisec - 1;
kono
parents:
diff changeset
2027 for (i = ii; i <= i5; i += 2)
kono
parents:
diff changeset
2028 {
kono
parents:
diff changeset
2029 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
kono
parents:
diff changeset
2030 a[i + l * a_dim1];
kono
parents:
diff changeset
2031 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
kono
parents:
diff changeset
2032 a[i + (l + 1) * a_dim1];
kono
parents:
diff changeset
2033 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
kono
parents:
diff changeset
2034 a[i + 1 + l * a_dim1];
kono
parents:
diff changeset
2035 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
kono
parents:
diff changeset
2036 a[i + 1 + (l + 1) * a_dim1];
kono
parents:
diff changeset
2037 }
kono
parents:
diff changeset
2038 if (uisec < isec)
kono
parents:
diff changeset
2039 {
kono
parents:
diff changeset
2040 t1[l - ll + 1 + (isec << 8) - 257] =
kono
parents:
diff changeset
2041 a[ii + isec - 1 + l * a_dim1];
kono
parents:
diff changeset
2042 t1[l - ll + 2 + (isec << 8) - 257] =
kono
parents:
diff changeset
2043 a[ii + isec - 1 + (l + 1) * a_dim1];
kono
parents:
diff changeset
2044 }
kono
parents:
diff changeset
2045 }
kono
parents:
diff changeset
2046 if (ulsec < lsec)
kono
parents:
diff changeset
2047 {
kono
parents:
diff changeset
2048 i4 = ii + isec - 1;
kono
parents:
diff changeset
2049 for (i = ii; i<= i4; ++i)
kono
parents:
diff changeset
2050 {
kono
parents:
diff changeset
2051 t1[lsec + ((i - ii + 1) << 8) - 257] =
kono
parents:
diff changeset
2052 a[i + (ll + lsec - 1) * a_dim1];
kono
parents:
diff changeset
2053 }
kono
parents:
diff changeset
2054 }
kono
parents:
diff changeset
2055
kono
parents:
diff changeset
2056 uisec = isec - isec % 4;
kono
parents:
diff changeset
2057 i4 = jj + ujsec - 1;
kono
parents:
diff changeset
2058 for (j = jj; j <= i4; j += 4)
kono
parents:
diff changeset
2059 {
kono
parents:
diff changeset
2060 i5 = ii + uisec - 1;
kono
parents:
diff changeset
2061 for (i = ii; i <= i5; i += 4)
kono
parents:
diff changeset
2062 {
kono
parents:
diff changeset
2063 f11 = c[i + j * c_dim1];
kono
parents:
diff changeset
2064 f21 = c[i + 1 + j * c_dim1];
kono
parents:
diff changeset
2065 f12 = c[i + (j + 1) * c_dim1];
kono
parents:
diff changeset
2066 f22 = c[i + 1 + (j + 1) * c_dim1];
kono
parents:
diff changeset
2067 f13 = c[i + (j + 2) * c_dim1];
kono
parents:
diff changeset
2068 f23 = c[i + 1 + (j + 2) * c_dim1];
kono
parents:
diff changeset
2069 f14 = c[i + (j + 3) * c_dim1];
kono
parents:
diff changeset
2070 f24 = c[i + 1 + (j + 3) * c_dim1];
kono
parents:
diff changeset
2071 f31 = c[i + 2 + j * c_dim1];
kono
parents:
diff changeset
2072 f41 = c[i + 3 + j * c_dim1];
kono
parents:
diff changeset
2073 f32 = c[i + 2 + (j + 1) * c_dim1];
kono
parents:
diff changeset
2074 f42 = c[i + 3 + (j + 1) * c_dim1];
kono
parents:
diff changeset
2075 f33 = c[i + 2 + (j + 2) * c_dim1];
kono
parents:
diff changeset
2076 f43 = c[i + 3 + (j + 2) * c_dim1];
kono
parents:
diff changeset
2077 f34 = c[i + 2 + (j + 3) * c_dim1];
kono
parents:
diff changeset
2078 f44 = c[i + 3 + (j + 3) * c_dim1];
kono
parents:
diff changeset
2079 i6 = ll + lsec - 1;
kono
parents:
diff changeset
2080 for (l = ll; l <= i6; ++l)
kono
parents:
diff changeset
2081 {
kono
parents:
diff changeset
2082 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
kono
parents:
diff changeset
2083 * b[l + j * b_dim1];
kono
parents:
diff changeset
2084 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
kono
parents:
diff changeset
2085 * b[l + j * b_dim1];
kono
parents:
diff changeset
2086 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
kono
parents:
diff changeset
2087 * b[l + (j + 1) * b_dim1];
kono
parents:
diff changeset
2088 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
kono
parents:
diff changeset
2089 * b[l + (j + 1) * b_dim1];
kono
parents:
diff changeset
2090 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
kono
parents:
diff changeset
2091 * b[l + (j + 2) * b_dim1];
kono
parents:
diff changeset
2092 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
kono
parents:
diff changeset
2093 * b[l + (j + 2) * b_dim1];
kono
parents:
diff changeset
2094 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
kono
parents:
diff changeset
2095 * b[l + (j + 3) * b_dim1];
kono
parents:
diff changeset
2096 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
kono
parents:
diff changeset
2097 * b[l + (j + 3) * b_dim1];
kono
parents:
diff changeset
2098 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
kono
parents:
diff changeset
2099 * b[l + j * b_dim1];
kono
parents:
diff changeset
2100 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
kono
parents:
diff changeset
2101 * b[l + j * b_dim1];
kono
parents:
diff changeset
2102 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
kono
parents:
diff changeset
2103 * b[l + (j + 1) * b_dim1];
kono
parents:
diff changeset
2104 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
kono
parents:
diff changeset
2105 * b[l + (j + 1) * b_dim1];
kono
parents:
diff changeset
2106 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
kono
parents:
diff changeset
2107 * b[l + (j + 2) * b_dim1];
kono
parents:
diff changeset
2108 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
kono
parents:
diff changeset
2109 * b[l + (j + 2) * b_dim1];
kono
parents:
diff changeset
2110 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
kono
parents:
diff changeset
2111 * b[l + (j + 3) * b_dim1];
kono
parents:
diff changeset
2112 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
kono
parents:
diff changeset
2113 * b[l + (j + 3) * b_dim1];
kono
parents:
diff changeset
2114 }
kono
parents:
diff changeset
2115 c[i + j * c_dim1] = f11;
kono
parents:
diff changeset
2116 c[i + 1 + j * c_dim1] = f21;
kono
parents:
diff changeset
2117 c[i + (j + 1) * c_dim1] = f12;
kono
parents:
diff changeset
2118 c[i + 1 + (j + 1) * c_dim1] = f22;
kono
parents:
diff changeset
2119 c[i + (j + 2) * c_dim1] = f13;
kono
parents:
diff changeset
2120 c[i + 1 + (j + 2) * c_dim1] = f23;
kono
parents:
diff changeset
2121 c[i + (j + 3) * c_dim1] = f14;
kono
parents:
diff changeset
2122 c[i + 1 + (j + 3) * c_dim1] = f24;
kono
parents:
diff changeset
2123 c[i + 2 + j * c_dim1] = f31;
kono
parents:
diff changeset
2124 c[i + 3 + j * c_dim1] = f41;
kono
parents:
diff changeset
2125 c[i + 2 + (j + 1) * c_dim1] = f32;
kono
parents:
diff changeset
2126 c[i + 3 + (j + 1) * c_dim1] = f42;
kono
parents:
diff changeset
2127 c[i + 2 + (j + 2) * c_dim1] = f33;
kono
parents:
diff changeset
2128 c[i + 3 + (j + 2) * c_dim1] = f43;
kono
parents:
diff changeset
2129 c[i + 2 + (j + 3) * c_dim1] = f34;
kono
parents:
diff changeset
2130 c[i + 3 + (j + 3) * c_dim1] = f44;
kono
parents:
diff changeset
2131 }
kono
parents:
diff changeset
2132 if (uisec < isec)
kono
parents:
diff changeset
2133 {
kono
parents:
diff changeset
2134 i5 = ii + isec - 1;
kono
parents:
diff changeset
2135 for (i = ii + uisec; i <= i5; ++i)
kono
parents:
diff changeset
2136 {
kono
parents:
diff changeset
2137 f11 = c[i + j * c_dim1];
kono
parents:
diff changeset
2138 f12 = c[i + (j + 1) * c_dim1];
kono
parents:
diff changeset
2139 f13 = c[i + (j + 2) * c_dim1];
kono
parents:
diff changeset
2140 f14 = c[i + (j + 3) * c_dim1];
kono
parents:
diff changeset
2141 i6 = ll + lsec - 1;
kono
parents:
diff changeset
2142 for (l = ll; l <= i6; ++l)
kono
parents:
diff changeset
2143 {
kono
parents:
diff changeset
2144 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
2145 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
2146 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
2147 257] * b[l + (j + 1) * b_dim1];
kono
parents:
diff changeset
2148 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
2149 257] * b[l + (j + 2) * b_dim1];
kono
parents:
diff changeset
2150 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
2151 257] * b[l + (j + 3) * b_dim1];
kono
parents:
diff changeset
2152 }
kono
parents:
diff changeset
2153 c[i + j * c_dim1] = f11;
kono
parents:
diff changeset
2154 c[i + (j + 1) * c_dim1] = f12;
kono
parents:
diff changeset
2155 c[i + (j + 2) * c_dim1] = f13;
kono
parents:
diff changeset
2156 c[i + (j + 3) * c_dim1] = f14;
kono
parents:
diff changeset
2157 }
kono
parents:
diff changeset
2158 }
kono
parents:
diff changeset
2159 }
kono
parents:
diff changeset
2160 if (ujsec < jsec)
kono
parents:
diff changeset
2161 {
kono
parents:
diff changeset
2162 i4 = jj + jsec - 1;
kono
parents:
diff changeset
2163 for (j = jj + ujsec; j <= i4; ++j)
kono
parents:
diff changeset
2164 {
kono
parents:
diff changeset
2165 i5 = ii + uisec - 1;
kono
parents:
diff changeset
2166 for (i = ii; i <= i5; i += 4)
kono
parents:
diff changeset
2167 {
kono
parents:
diff changeset
2168 f11 = c[i + j * c_dim1];
kono
parents:
diff changeset
2169 f21 = c[i + 1 + j * c_dim1];
kono
parents:
diff changeset
2170 f31 = c[i + 2 + j * c_dim1];
kono
parents:
diff changeset
2171 f41 = c[i + 3 + j * c_dim1];
kono
parents:
diff changeset
2172 i6 = ll + lsec - 1;
kono
parents:
diff changeset
2173 for (l = ll; l <= i6; ++l)
kono
parents:
diff changeset
2174 {
kono
parents:
diff changeset
2175 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
2176 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
2177 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
kono
parents:
diff changeset
2178 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
2179 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
kono
parents:
diff changeset
2180 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
2181 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
kono
parents:
diff changeset
2182 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
2183 }
kono
parents:
diff changeset
2184 c[i + j * c_dim1] = f11;
kono
parents:
diff changeset
2185 c[i + 1 + j * c_dim1] = f21;
kono
parents:
diff changeset
2186 c[i + 2 + j * c_dim1] = f31;
kono
parents:
diff changeset
2187 c[i + 3 + j * c_dim1] = f41;
kono
parents:
diff changeset
2188 }
kono
parents:
diff changeset
2189 i5 = ii + isec - 1;
kono
parents:
diff changeset
2190 for (i = ii + uisec; i <= i5; ++i)
kono
parents:
diff changeset
2191 {
kono
parents:
diff changeset
2192 f11 = c[i + j * c_dim1];
kono
parents:
diff changeset
2193 i6 = ll + lsec - 1;
kono
parents:
diff changeset
2194 for (l = ll; l <= i6; ++l)
kono
parents:
diff changeset
2195 {
kono
parents:
diff changeset
2196 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
2197 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
2198 }
kono
parents:
diff changeset
2199 c[i + j * c_dim1] = f11;
kono
parents:
diff changeset
2200 }
kono
parents:
diff changeset
2201 }
kono
parents:
diff changeset
2202 }
kono
parents:
diff changeset
2203 }
kono
parents:
diff changeset
2204 }
kono
parents:
diff changeset
2205 }
kono
parents:
diff changeset
2206 free(t1);
kono
parents:
diff changeset
2207 return;
kono
parents:
diff changeset
2208 }
kono
parents:
diff changeset
2209 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
kono
parents:
diff changeset
2210 {
kono
parents:
diff changeset
2211 if (GFC_DESCRIPTOR_RANK (a) != 1)
kono
parents:
diff changeset
2212 {
kono
parents:
diff changeset
2213 const GFC_INTEGER_1 *restrict abase_x;
kono
parents:
diff changeset
2214 const GFC_INTEGER_1 *restrict bbase_y;
kono
parents:
diff changeset
2215 GFC_INTEGER_1 *restrict dest_y;
kono
parents:
diff changeset
2216 GFC_INTEGER_1 s;
kono
parents:
diff changeset
2217
kono
parents:
diff changeset
2218 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
2219 {
kono
parents:
diff changeset
2220 bbase_y = &bbase[y*bystride];
kono
parents:
diff changeset
2221 dest_y = &dest[y*rystride];
kono
parents:
diff changeset
2222 for (x = 0; x < xcount; x++)
kono
parents:
diff changeset
2223 {
kono
parents:
diff changeset
2224 abase_x = &abase[x*axstride];
kono
parents:
diff changeset
2225 s = (GFC_INTEGER_1) 0;
kono
parents:
diff changeset
2226 for (n = 0; n < count; n++)
kono
parents:
diff changeset
2227 s += abase_x[n] * bbase_y[n];
kono
parents:
diff changeset
2228 dest_y[x] = s;
kono
parents:
diff changeset
2229 }
kono
parents:
diff changeset
2230 }
kono
parents:
diff changeset
2231 }
kono
parents:
diff changeset
2232 else
kono
parents:
diff changeset
2233 {
kono
parents:
diff changeset
2234 const GFC_INTEGER_1 *restrict bbase_y;
kono
parents:
diff changeset
2235 GFC_INTEGER_1 s;
kono
parents:
diff changeset
2236
kono
parents:
diff changeset
2237 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
2238 {
kono
parents:
diff changeset
2239 bbase_y = &bbase[y*bystride];
kono
parents:
diff changeset
2240 s = (GFC_INTEGER_1) 0;
kono
parents:
diff changeset
2241 for (n = 0; n < count; n++)
kono
parents:
diff changeset
2242 s += abase[n*axstride] * bbase_y[n];
kono
parents:
diff changeset
2243 dest[y*rystride] = s;
kono
parents:
diff changeset
2244 }
kono
parents:
diff changeset
2245 }
kono
parents:
diff changeset
2246 }
kono
parents:
diff changeset
2247 else if (axstride < aystride)
kono
parents:
diff changeset
2248 {
kono
parents:
diff changeset
2249 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
2250 for (x = 0; x < xcount; x++)
kono
parents:
diff changeset
2251 dest[x*rxstride + y*rystride] = (GFC_INTEGER_1)0;
kono
parents:
diff changeset
2252
kono
parents:
diff changeset
2253 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
2254 for (n = 0; n < count; n++)
kono
parents:
diff changeset
2255 for (x = 0; x < xcount; x++)
kono
parents:
diff changeset
2256 /* dest[x,y] += a[x,n] * b[n,y] */
kono
parents:
diff changeset
2257 dest[x*rxstride + y*rystride] +=
kono
parents:
diff changeset
2258 abase[x*axstride + n*aystride] *
kono
parents:
diff changeset
2259 bbase[n*bxstride + y*bystride];
kono
parents:
diff changeset
2260 }
kono
parents:
diff changeset
2261 else if (GFC_DESCRIPTOR_RANK (a) == 1)
kono
parents:
diff changeset
2262 {
kono
parents:
diff changeset
2263 const GFC_INTEGER_1 *restrict bbase_y;
kono
parents:
diff changeset
2264 GFC_INTEGER_1 s;
kono
parents:
diff changeset
2265
kono
parents:
diff changeset
2266 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
2267 {
kono
parents:
diff changeset
2268 bbase_y = &bbase[y*bystride];
kono
parents:
diff changeset
2269 s = (GFC_INTEGER_1) 0;
kono
parents:
diff changeset
2270 for (n = 0; n < count; n++)
kono
parents:
diff changeset
2271 s += abase[n*axstride] * bbase_y[n*bxstride];
kono
parents:
diff changeset
2272 dest[y*rxstride] = s;
kono
parents:
diff changeset
2273 }
kono
parents:
diff changeset
2274 }
kono
parents:
diff changeset
2275 else
kono
parents:
diff changeset
2276 {
kono
parents:
diff changeset
2277 const GFC_INTEGER_1 *restrict abase_x;
kono
parents:
diff changeset
2278 const GFC_INTEGER_1 *restrict bbase_y;
kono
parents:
diff changeset
2279 GFC_INTEGER_1 *restrict dest_y;
kono
parents:
diff changeset
2280 GFC_INTEGER_1 s;
kono
parents:
diff changeset
2281
kono
parents:
diff changeset
2282 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
2283 {
kono
parents:
diff changeset
2284 bbase_y = &bbase[y*bystride];
kono
parents:
diff changeset
2285 dest_y = &dest[y*rystride];
kono
parents:
diff changeset
2286 for (x = 0; x < xcount; x++)
kono
parents:
diff changeset
2287 {
kono
parents:
diff changeset
2288 abase_x = &abase[x*axstride];
kono
parents:
diff changeset
2289 s = (GFC_INTEGER_1) 0;
kono
parents:
diff changeset
2290 for (n = 0; n < count; n++)
kono
parents:
diff changeset
2291 s += abase_x[n*aystride] * bbase_y[n*bxstride];
kono
parents:
diff changeset
2292 dest_y[x*rxstride] = s;
kono
parents:
diff changeset
2293 }
kono
parents:
diff changeset
2294 }
kono
parents:
diff changeset
2295 }
kono
parents:
diff changeset
2296 }
kono
parents:
diff changeset
2297 #undef POW3
kono
parents:
diff changeset
2298 #undef min
kono
parents:
diff changeset
2299 #undef max
kono
parents:
diff changeset
2300
kono
parents:
diff changeset
2301
kono
parents:
diff changeset
2302 /* Compiling main function, with selection code for the processor. */
kono
parents:
diff changeset
2303
kono
parents:
diff changeset
2304 /* Currently, this is i386 only. Adjust for other architectures. */
kono
parents:
diff changeset
2305
kono
parents:
diff changeset
2306 #include <config/i386/cpuinfo.h>
kono
parents:
diff changeset
2307 void matmul_i1 (gfc_array_i1 * const restrict retarray,
kono
parents:
diff changeset
2308 gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
kono
parents:
diff changeset
2309 int blas_limit, blas_call gemm)
kono
parents:
diff changeset
2310 {
kono
parents:
diff changeset
2311 static void (*matmul_p) (gfc_array_i1 * const restrict retarray,
kono
parents:
diff changeset
2312 gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
kono
parents:
diff changeset
2313 int blas_limit, blas_call gemm);
kono
parents:
diff changeset
2314
kono
parents:
diff changeset
2315 void (*matmul_fn) (gfc_array_i1 * const restrict retarray,
kono
parents:
diff changeset
2316 gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
kono
parents:
diff changeset
2317 int blas_limit, blas_call gemm);
kono
parents:
diff changeset
2318
kono
parents:
diff changeset
2319 matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED);
kono
parents:
diff changeset
2320 if (matmul_fn == NULL)
kono
parents:
diff changeset
2321 {
kono
parents:
diff changeset
2322 matmul_fn = matmul_i1_vanilla;
kono
parents:
diff changeset
2323 if (__cpu_model.__cpu_vendor == VENDOR_INTEL)
kono
parents:
diff changeset
2324 {
kono
parents:
diff changeset
2325 /* Run down the available processors in order of preference. */
kono
parents:
diff changeset
2326 #ifdef HAVE_AVX512F
kono
parents:
diff changeset
2327 if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F))
kono
parents:
diff changeset
2328 {
kono
parents:
diff changeset
2329 matmul_fn = matmul_i1_avx512f;
kono
parents:
diff changeset
2330 goto store;
kono
parents:
diff changeset
2331 }
kono
parents:
diff changeset
2332
kono
parents:
diff changeset
2333 #endif /* HAVE_AVX512F */
kono
parents:
diff changeset
2334
kono
parents:
diff changeset
2335 #ifdef HAVE_AVX2
kono
parents:
diff changeset
2336 if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
kono
parents:
diff changeset
2337 && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
kono
parents:
diff changeset
2338 {
kono
parents:
diff changeset
2339 matmul_fn = matmul_i1_avx2;
kono
parents:
diff changeset
2340 goto store;
kono
parents:
diff changeset
2341 }
kono
parents:
diff changeset
2342
kono
parents:
diff changeset
2343 #endif
kono
parents:
diff changeset
2344
kono
parents:
diff changeset
2345 #ifdef HAVE_AVX
kono
parents:
diff changeset
2346 if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
kono
parents:
diff changeset
2347 {
kono
parents:
diff changeset
2348 matmul_fn = matmul_i1_avx;
kono
parents:
diff changeset
2349 goto store;
kono
parents:
diff changeset
2350 }
kono
parents:
diff changeset
2351 #endif /* HAVE_AVX */
kono
parents:
diff changeset
2352 }
kono
parents:
diff changeset
2353 else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
kono
parents:
diff changeset
2354 {
kono
parents:
diff changeset
2355 #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
kono
parents:
diff changeset
2356 if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
kono
parents:
diff changeset
2357 && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
kono
parents:
diff changeset
2358 {
kono
parents:
diff changeset
2359 matmul_fn = matmul_i1_avx128_fma3;
kono
parents:
diff changeset
2360 goto store;
kono
parents:
diff changeset
2361 }
kono
parents:
diff changeset
2362 #endif
kono
parents:
diff changeset
2363 #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
kono
parents:
diff changeset
2364 if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
kono
parents:
diff changeset
2365 && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
kono
parents:
diff changeset
2366 {
kono
parents:
diff changeset
2367 matmul_fn = matmul_i1_avx128_fma4;
kono
parents:
diff changeset
2368 goto store;
kono
parents:
diff changeset
2369 }
kono
parents:
diff changeset
2370 #endif
kono
parents:
diff changeset
2371
kono
parents:
diff changeset
2372 }
kono
parents:
diff changeset
2373 store:
kono
parents:
diff changeset
2374 __atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
kono
parents:
diff changeset
2375 }
kono
parents:
diff changeset
2376
kono
parents:
diff changeset
2377 (*matmul_fn) (retarray, a, b, try_blas, blas_limit, gemm);
kono
parents:
diff changeset
2378 }
kono
parents:
diff changeset
2379
kono
parents:
diff changeset
2380 #else /* Just the vanilla function. */
kono
parents:
diff changeset
2381
kono
parents:
diff changeset
2382 void
kono
parents:
diff changeset
2383 matmul_i1 (gfc_array_i1 * const restrict retarray,
kono
parents:
diff changeset
2384 gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
kono
parents:
diff changeset
2385 int blas_limit, blas_call gemm)
kono
parents:
diff changeset
2386 {
kono
parents:
diff changeset
2387 const GFC_INTEGER_1 * restrict abase;
kono
parents:
diff changeset
2388 const GFC_INTEGER_1 * restrict bbase;
kono
parents:
diff changeset
2389 GFC_INTEGER_1 * restrict dest;
kono
parents:
diff changeset
2390
kono
parents:
diff changeset
2391 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
kono
parents:
diff changeset
2392 index_type x, y, n, count, xcount, ycount;
kono
parents:
diff changeset
2393
kono
parents:
diff changeset
2394 assert (GFC_DESCRIPTOR_RANK (a) == 2
kono
parents:
diff changeset
2395 || GFC_DESCRIPTOR_RANK (b) == 2);
kono
parents:
diff changeset
2396
kono
parents:
diff changeset
2397 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
kono
parents:
diff changeset
2398
kono
parents:
diff changeset
2399 Either A or B (but not both) can be rank 1:
kono
parents:
diff changeset
2400
kono
parents:
diff changeset
2401 o One-dimensional argument A is implicitly treated as a row matrix
kono
parents:
diff changeset
2402 dimensioned [1,count], so xcount=1.
kono
parents:
diff changeset
2403
kono
parents:
diff changeset
2404 o One-dimensional argument B is implicitly treated as a column matrix
kono
parents:
diff changeset
2405 dimensioned [count, 1], so ycount=1.
kono
parents:
diff changeset
2406 */
kono
parents:
diff changeset
2407
kono
parents:
diff changeset
2408 if (retarray->base_addr == NULL)
kono
parents:
diff changeset
2409 {
kono
parents:
diff changeset
2410 if (GFC_DESCRIPTOR_RANK (a) == 1)
kono
parents:
diff changeset
2411 {
kono
parents:
diff changeset
2412 GFC_DIMENSION_SET(retarray->dim[0], 0,
kono
parents:
diff changeset
2413 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
kono
parents:
diff changeset
2414 }
kono
parents:
diff changeset
2415 else if (GFC_DESCRIPTOR_RANK (b) == 1)
kono
parents:
diff changeset
2416 {
kono
parents:
diff changeset
2417 GFC_DIMENSION_SET(retarray->dim[0], 0,
kono
parents:
diff changeset
2418 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
kono
parents:
diff changeset
2419 }
kono
parents:
diff changeset
2420 else
kono
parents:
diff changeset
2421 {
kono
parents:
diff changeset
2422 GFC_DIMENSION_SET(retarray->dim[0], 0,
kono
parents:
diff changeset
2423 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
kono
parents:
diff changeset
2424
kono
parents:
diff changeset
2425 GFC_DIMENSION_SET(retarray->dim[1], 0,
kono
parents:
diff changeset
2426 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
kono
parents:
diff changeset
2427 GFC_DESCRIPTOR_EXTENT(retarray,0));
kono
parents:
diff changeset
2428 }
kono
parents:
diff changeset
2429
kono
parents:
diff changeset
2430 retarray->base_addr
kono
parents:
diff changeset
2431 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_1));
kono
parents:
diff changeset
2432 retarray->offset = 0;
kono
parents:
diff changeset
2433 }
kono
parents:
diff changeset
2434 else if (unlikely (compile_options.bounds_check))
kono
parents:
diff changeset
2435 {
kono
parents:
diff changeset
2436 index_type ret_extent, arg_extent;
kono
parents:
diff changeset
2437
kono
parents:
diff changeset
2438 if (GFC_DESCRIPTOR_RANK (a) == 1)
kono
parents:
diff changeset
2439 {
kono
parents:
diff changeset
2440 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
kono
parents:
diff changeset
2441 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
kono
parents:
diff changeset
2442 if (arg_extent != ret_extent)
kono
parents:
diff changeset
2443 runtime_error ("Incorrect extent in return array in"
kono
parents:
diff changeset
2444 " MATMUL intrinsic: is %ld, should be %ld",
kono
parents:
diff changeset
2445 (long int) ret_extent, (long int) arg_extent);
kono
parents:
diff changeset
2446 }
kono
parents:
diff changeset
2447 else if (GFC_DESCRIPTOR_RANK (b) == 1)
kono
parents:
diff changeset
2448 {
kono
parents:
diff changeset
2449 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
kono
parents:
diff changeset
2450 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
kono
parents:
diff changeset
2451 if (arg_extent != ret_extent)
kono
parents:
diff changeset
2452 runtime_error ("Incorrect extent in return array in"
kono
parents:
diff changeset
2453 " MATMUL intrinsic: is %ld, should be %ld",
kono
parents:
diff changeset
2454 (long int) ret_extent, (long int) arg_extent);
kono
parents:
diff changeset
2455 }
kono
parents:
diff changeset
2456 else
kono
parents:
diff changeset
2457 {
kono
parents:
diff changeset
2458 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
kono
parents:
diff changeset
2459 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
kono
parents:
diff changeset
2460 if (arg_extent != ret_extent)
kono
parents:
diff changeset
2461 runtime_error ("Incorrect extent in return array in"
kono
parents:
diff changeset
2462 " MATMUL intrinsic for dimension 1:"
kono
parents:
diff changeset
2463 " is %ld, should be %ld",
kono
parents:
diff changeset
2464 (long int) ret_extent, (long int) arg_extent);
kono
parents:
diff changeset
2465
kono
parents:
diff changeset
2466 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
kono
parents:
diff changeset
2467 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
kono
parents:
diff changeset
2468 if (arg_extent != ret_extent)
kono
parents:
diff changeset
2469 runtime_error ("Incorrect extent in return array in"
kono
parents:
diff changeset
2470 " MATMUL intrinsic for dimension 2:"
kono
parents:
diff changeset
2471 " is %ld, should be %ld",
kono
parents:
diff changeset
2472 (long int) ret_extent, (long int) arg_extent);
kono
parents:
diff changeset
2473 }
kono
parents:
diff changeset
2474 }
kono
parents:
diff changeset
2475
kono
parents:
diff changeset
2476
kono
parents:
diff changeset
2477 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
kono
parents:
diff changeset
2478 {
kono
parents:
diff changeset
2479 /* One-dimensional result may be addressed in the code below
kono
parents:
diff changeset
2480 either as a row or a column matrix. We want both cases to
kono
parents:
diff changeset
2481 work. */
kono
parents:
diff changeset
2482 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
kono
parents:
diff changeset
2483 }
kono
parents:
diff changeset
2484 else
kono
parents:
diff changeset
2485 {
kono
parents:
diff changeset
2486 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
kono
parents:
diff changeset
2487 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
kono
parents:
diff changeset
2488 }
kono
parents:
diff changeset
2489
kono
parents:
diff changeset
2490
kono
parents:
diff changeset
2491 if (GFC_DESCRIPTOR_RANK (a) == 1)
kono
parents:
diff changeset
2492 {
kono
parents:
diff changeset
2493 /* Treat it as a a row matrix A[1,count]. */
kono
parents:
diff changeset
2494 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
kono
parents:
diff changeset
2495 aystride = 1;
kono
parents:
diff changeset
2496
kono
parents:
diff changeset
2497 xcount = 1;
kono
parents:
diff changeset
2498 count = GFC_DESCRIPTOR_EXTENT(a,0);
kono
parents:
diff changeset
2499 }
kono
parents:
diff changeset
2500 else
kono
parents:
diff changeset
2501 {
kono
parents:
diff changeset
2502 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
kono
parents:
diff changeset
2503 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
kono
parents:
diff changeset
2504
kono
parents:
diff changeset
2505 count = GFC_DESCRIPTOR_EXTENT(a,1);
kono
parents:
diff changeset
2506 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
kono
parents:
diff changeset
2507 }
kono
parents:
diff changeset
2508
kono
parents:
diff changeset
2509 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
kono
parents:
diff changeset
2510 {
kono
parents:
diff changeset
2511 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
kono
parents:
diff changeset
2512 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
kono
parents:
diff changeset
2513 }
kono
parents:
diff changeset
2514
kono
parents:
diff changeset
2515 if (GFC_DESCRIPTOR_RANK (b) == 1)
kono
parents:
diff changeset
2516 {
kono
parents:
diff changeset
2517 /* Treat it as a column matrix B[count,1] */
kono
parents:
diff changeset
2518 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
kono
parents:
diff changeset
2519
kono
parents:
diff changeset
2520 /* bystride should never be used for 1-dimensional b.
kono
parents:
diff changeset
2521 The value is only used for calculation of the
kono
parents:
diff changeset
2522 memory by the buffer. */
kono
parents:
diff changeset
2523 bystride = 256;
kono
parents:
diff changeset
2524 ycount = 1;
kono
parents:
diff changeset
2525 }
kono
parents:
diff changeset
2526 else
kono
parents:
diff changeset
2527 {
kono
parents:
diff changeset
2528 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
kono
parents:
diff changeset
2529 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
kono
parents:
diff changeset
2530 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
kono
parents:
diff changeset
2531 }
kono
parents:
diff changeset
2532
kono
parents:
diff changeset
2533 abase = a->base_addr;
kono
parents:
diff changeset
2534 bbase = b->base_addr;
kono
parents:
diff changeset
2535 dest = retarray->base_addr;
kono
parents:
diff changeset
2536
kono
parents:
diff changeset
2537 /* Now that everything is set up, we perform the multiplication
kono
parents:
diff changeset
2538 itself. */
kono
parents:
diff changeset
2539
kono
parents:
diff changeset
2540 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
kono
parents:
diff changeset
2541 #define min(a,b) ((a) <= (b) ? (a) : (b))
kono
parents:
diff changeset
2542 #define max(a,b) ((a) >= (b) ? (a) : (b))
kono
parents:
diff changeset
2543
kono
parents:
diff changeset
2544 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
kono
parents:
diff changeset
2545 && (bxstride == 1 || bystride == 1)
kono
parents:
diff changeset
2546 && (((float) xcount) * ((float) ycount) * ((float) count)
kono
parents:
diff changeset
2547 > POW3(blas_limit)))
kono
parents:
diff changeset
2548 {
kono
parents:
diff changeset
2549 const int m = xcount, n = ycount, k = count, ldc = rystride;
kono
parents:
diff changeset
2550 const GFC_INTEGER_1 one = 1, zero = 0;
kono
parents:
diff changeset
2551 const int lda = (axstride == 1) ? aystride : axstride,
kono
parents:
diff changeset
2552 ldb = (bxstride == 1) ? bystride : bxstride;
kono
parents:
diff changeset
2553
kono
parents:
diff changeset
2554 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
kono
parents:
diff changeset
2555 {
kono
parents:
diff changeset
2556 assert (gemm != NULL);
kono
parents:
diff changeset
2557 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
kono
parents:
diff changeset
2558 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
kono
parents:
diff changeset
2559 &ldc, 1, 1);
kono
parents:
diff changeset
2560 return;
kono
parents:
diff changeset
2561 }
kono
parents:
diff changeset
2562 }
kono
parents:
diff changeset
2563
kono
parents:
diff changeset
2564 if (rxstride == 1 && axstride == 1 && bxstride == 1)
kono
parents:
diff changeset
2565 {
kono
parents:
diff changeset
2566 /* This block of code implements a tuned matmul, derived from
kono
parents:
diff changeset
2567 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
kono
parents:
diff changeset
2568
kono
parents:
diff changeset
2569 Bo Kagstrom and Per Ling
kono
parents:
diff changeset
2570 Department of Computing Science
kono
parents:
diff changeset
2571 Umea University
kono
parents:
diff changeset
2572 S-901 87 Umea, Sweden
kono
parents:
diff changeset
2573
kono
parents:
diff changeset
2574 from netlib.org, translated to C, and modified for matmul.m4. */
kono
parents:
diff changeset
2575
kono
parents:
diff changeset
2576 const GFC_INTEGER_1 *a, *b;
kono
parents:
diff changeset
2577 GFC_INTEGER_1 *c;
kono
parents:
diff changeset
2578 const index_type m = xcount, n = ycount, k = count;
kono
parents:
diff changeset
2579
kono
parents:
diff changeset
2580 /* System generated locals */
kono
parents:
diff changeset
2581 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
kono
parents:
diff changeset
2582 i1, i2, i3, i4, i5, i6;
kono
parents:
diff changeset
2583
kono
parents:
diff changeset
2584 /* Local variables */
kono
parents:
diff changeset
2585 GFC_INTEGER_1 f11, f12, f21, f22, f31, f32, f41, f42,
kono
parents:
diff changeset
2586 f13, f14, f23, f24, f33, f34, f43, f44;
kono
parents:
diff changeset
2587 index_type i, j, l, ii, jj, ll;
kono
parents:
diff changeset
2588 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
kono
parents:
diff changeset
2589 GFC_INTEGER_1 *t1;
kono
parents:
diff changeset
2590
kono
parents:
diff changeset
2591 a = abase;
kono
parents:
diff changeset
2592 b = bbase;
kono
parents:
diff changeset
2593 c = retarray->base_addr;
kono
parents:
diff changeset
2594
kono
parents:
diff changeset
2595 /* Parameter adjustments */
kono
parents:
diff changeset
2596 c_dim1 = rystride;
kono
parents:
diff changeset
2597 c_offset = 1 + c_dim1;
kono
parents:
diff changeset
2598 c -= c_offset;
kono
parents:
diff changeset
2599 a_dim1 = aystride;
kono
parents:
diff changeset
2600 a_offset = 1 + a_dim1;
kono
parents:
diff changeset
2601 a -= a_offset;
kono
parents:
diff changeset
2602 b_dim1 = bystride;
kono
parents:
diff changeset
2603 b_offset = 1 + b_dim1;
kono
parents:
diff changeset
2604 b -= b_offset;
kono
parents:
diff changeset
2605
kono
parents:
diff changeset
2606 /* Empty c first. */
kono
parents:
diff changeset
2607 for (j=1; j<=n; j++)
kono
parents:
diff changeset
2608 for (i=1; i<=m; i++)
kono
parents:
diff changeset
2609 c[i + j * c_dim1] = (GFC_INTEGER_1)0;
kono
parents:
diff changeset
2610
kono
parents:
diff changeset
2611 /* Early exit if possible */
kono
parents:
diff changeset
2612 if (m == 0 || n == 0 || k == 0)
kono
parents:
diff changeset
2613 return;
kono
parents:
diff changeset
2614
kono
parents:
diff changeset
2615 /* Adjust size of t1 to what is needed. */
kono
parents:
diff changeset
2616 index_type t1_dim;
kono
parents:
diff changeset
2617 t1_dim = (a_dim1-1) * 256 + b_dim1;
kono
parents:
diff changeset
2618 if (t1_dim > 65536)
kono
parents:
diff changeset
2619 t1_dim = 65536;
kono
parents:
diff changeset
2620
kono
parents:
diff changeset
2621 t1 = malloc (t1_dim * sizeof(GFC_INTEGER_1));
kono
parents:
diff changeset
2622
kono
parents:
diff changeset
2623 /* Start turning the crank. */
kono
parents:
diff changeset
2624 i1 = n;
kono
parents:
diff changeset
2625 for (jj = 1; jj <= i1; jj += 512)
kono
parents:
diff changeset
2626 {
kono
parents:
diff changeset
2627 /* Computing MIN */
kono
parents:
diff changeset
2628 i2 = 512;
kono
parents:
diff changeset
2629 i3 = n - jj + 1;
kono
parents:
diff changeset
2630 jsec = min(i2,i3);
kono
parents:
diff changeset
2631 ujsec = jsec - jsec % 4;
kono
parents:
diff changeset
2632 i2 = k;
kono
parents:
diff changeset
2633 for (ll = 1; ll <= i2; ll += 256)
kono
parents:
diff changeset
2634 {
kono
parents:
diff changeset
2635 /* Computing MIN */
kono
parents:
diff changeset
2636 i3 = 256;
kono
parents:
diff changeset
2637 i4 = k - ll + 1;
kono
parents:
diff changeset
2638 lsec = min(i3,i4);
kono
parents:
diff changeset
2639 ulsec = lsec - lsec % 2;
kono
parents:
diff changeset
2640
kono
parents:
diff changeset
2641 i3 = m;
kono
parents:
diff changeset
2642 for (ii = 1; ii <= i3; ii += 256)
kono
parents:
diff changeset
2643 {
kono
parents:
diff changeset
2644 /* Computing MIN */
kono
parents:
diff changeset
2645 i4 = 256;
kono
parents:
diff changeset
2646 i5 = m - ii + 1;
kono
parents:
diff changeset
2647 isec = min(i4,i5);
kono
parents:
diff changeset
2648 uisec = isec - isec % 2;
kono
parents:
diff changeset
2649 i4 = ll + ulsec - 1;
kono
parents:
diff changeset
2650 for (l = ll; l <= i4; l += 2)
kono
parents:
diff changeset
2651 {
kono
parents:
diff changeset
2652 i5 = ii + uisec - 1;
kono
parents:
diff changeset
2653 for (i = ii; i <= i5; i += 2)
kono
parents:
diff changeset
2654 {
kono
parents:
diff changeset
2655 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
kono
parents:
diff changeset
2656 a[i + l * a_dim1];
kono
parents:
diff changeset
2657 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
kono
parents:
diff changeset
2658 a[i + (l + 1) * a_dim1];
kono
parents:
diff changeset
2659 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
kono
parents:
diff changeset
2660 a[i + 1 + l * a_dim1];
kono
parents:
diff changeset
2661 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
kono
parents:
diff changeset
2662 a[i + 1 + (l + 1) * a_dim1];
kono
parents:
diff changeset
2663 }
kono
parents:
diff changeset
2664 if (uisec < isec)
kono
parents:
diff changeset
2665 {
kono
parents:
diff changeset
2666 t1[l - ll + 1 + (isec << 8) - 257] =
kono
parents:
diff changeset
2667 a[ii + isec - 1 + l * a_dim1];
kono
parents:
diff changeset
2668 t1[l - ll + 2 + (isec << 8) - 257] =
kono
parents:
diff changeset
2669 a[ii + isec - 1 + (l + 1) * a_dim1];
kono
parents:
diff changeset
2670 }
kono
parents:
diff changeset
2671 }
kono
parents:
diff changeset
2672 if (ulsec < lsec)
kono
parents:
diff changeset
2673 {
kono
parents:
diff changeset
2674 i4 = ii + isec - 1;
kono
parents:
diff changeset
2675 for (i = ii; i<= i4; ++i)
kono
parents:
diff changeset
2676 {
kono
parents:
diff changeset
2677 t1[lsec + ((i - ii + 1) << 8) - 257] =
kono
parents:
diff changeset
2678 a[i + (ll + lsec - 1) * a_dim1];
kono
parents:
diff changeset
2679 }
kono
parents:
diff changeset
2680 }
kono
parents:
diff changeset
2681
kono
parents:
diff changeset
2682 uisec = isec - isec % 4;
kono
parents:
diff changeset
2683 i4 = jj + ujsec - 1;
kono
parents:
diff changeset
2684 for (j = jj; j <= i4; j += 4)
kono
parents:
diff changeset
2685 {
kono
parents:
diff changeset
2686 i5 = ii + uisec - 1;
kono
parents:
diff changeset
2687 for (i = ii; i <= i5; i += 4)
kono
parents:
diff changeset
2688 {
kono
parents:
diff changeset
2689 f11 = c[i + j * c_dim1];
kono
parents:
diff changeset
2690 f21 = c[i + 1 + j * c_dim1];
kono
parents:
diff changeset
2691 f12 = c[i + (j + 1) * c_dim1];
kono
parents:
diff changeset
2692 f22 = c[i + 1 + (j + 1) * c_dim1];
kono
parents:
diff changeset
2693 f13 = c[i + (j + 2) * c_dim1];
kono
parents:
diff changeset
2694 f23 = c[i + 1 + (j + 2) * c_dim1];
kono
parents:
diff changeset
2695 f14 = c[i + (j + 3) * c_dim1];
kono
parents:
diff changeset
2696 f24 = c[i + 1 + (j + 3) * c_dim1];
kono
parents:
diff changeset
2697 f31 = c[i + 2 + j * c_dim1];
kono
parents:
diff changeset
2698 f41 = c[i + 3 + j * c_dim1];
kono
parents:
diff changeset
2699 f32 = c[i + 2 + (j + 1) * c_dim1];
kono
parents:
diff changeset
2700 f42 = c[i + 3 + (j + 1) * c_dim1];
kono
parents:
diff changeset
2701 f33 = c[i + 2 + (j + 2) * c_dim1];
kono
parents:
diff changeset
2702 f43 = c[i + 3 + (j + 2) * c_dim1];
kono
parents:
diff changeset
2703 f34 = c[i + 2 + (j + 3) * c_dim1];
kono
parents:
diff changeset
2704 f44 = c[i + 3 + (j + 3) * c_dim1];
kono
parents:
diff changeset
2705 i6 = ll + lsec - 1;
kono
parents:
diff changeset
2706 for (l = ll; l <= i6; ++l)
kono
parents:
diff changeset
2707 {
kono
parents:
diff changeset
2708 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
kono
parents:
diff changeset
2709 * b[l + j * b_dim1];
kono
parents:
diff changeset
2710 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
kono
parents:
diff changeset
2711 * b[l + j * b_dim1];
kono
parents:
diff changeset
2712 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
kono
parents:
diff changeset
2713 * b[l + (j + 1) * b_dim1];
kono
parents:
diff changeset
2714 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
kono
parents:
diff changeset
2715 * b[l + (j + 1) * b_dim1];
kono
parents:
diff changeset
2716 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
kono
parents:
diff changeset
2717 * b[l + (j + 2) * b_dim1];
kono
parents:
diff changeset
2718 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
kono
parents:
diff changeset
2719 * b[l + (j + 2) * b_dim1];
kono
parents:
diff changeset
2720 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
kono
parents:
diff changeset
2721 * b[l + (j + 3) * b_dim1];
kono
parents:
diff changeset
2722 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
kono
parents:
diff changeset
2723 * b[l + (j + 3) * b_dim1];
kono
parents:
diff changeset
2724 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
kono
parents:
diff changeset
2725 * b[l + j * b_dim1];
kono
parents:
diff changeset
2726 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
kono
parents:
diff changeset
2727 * b[l + j * b_dim1];
kono
parents:
diff changeset
2728 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
kono
parents:
diff changeset
2729 * b[l + (j + 1) * b_dim1];
kono
parents:
diff changeset
2730 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
kono
parents:
diff changeset
2731 * b[l + (j + 1) * b_dim1];
kono
parents:
diff changeset
2732 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
kono
parents:
diff changeset
2733 * b[l + (j + 2) * b_dim1];
kono
parents:
diff changeset
2734 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
kono
parents:
diff changeset
2735 * b[l + (j + 2) * b_dim1];
kono
parents:
diff changeset
2736 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
kono
parents:
diff changeset
2737 * b[l + (j + 3) * b_dim1];
kono
parents:
diff changeset
2738 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
kono
parents:
diff changeset
2739 * b[l + (j + 3) * b_dim1];
kono
parents:
diff changeset
2740 }
kono
parents:
diff changeset
2741 c[i + j * c_dim1] = f11;
kono
parents:
diff changeset
2742 c[i + 1 + j * c_dim1] = f21;
kono
parents:
diff changeset
2743 c[i + (j + 1) * c_dim1] = f12;
kono
parents:
diff changeset
2744 c[i + 1 + (j + 1) * c_dim1] = f22;
kono
parents:
diff changeset
2745 c[i + (j + 2) * c_dim1] = f13;
kono
parents:
diff changeset
2746 c[i + 1 + (j + 2) * c_dim1] = f23;
kono
parents:
diff changeset
2747 c[i + (j + 3) * c_dim1] = f14;
kono
parents:
diff changeset
2748 c[i + 1 + (j + 3) * c_dim1] = f24;
kono
parents:
diff changeset
2749 c[i + 2 + j * c_dim1] = f31;
kono
parents:
diff changeset
2750 c[i + 3 + j * c_dim1] = f41;
kono
parents:
diff changeset
2751 c[i + 2 + (j + 1) * c_dim1] = f32;
kono
parents:
diff changeset
2752 c[i + 3 + (j + 1) * c_dim1] = f42;
kono
parents:
diff changeset
2753 c[i + 2 + (j + 2) * c_dim1] = f33;
kono
parents:
diff changeset
2754 c[i + 3 + (j + 2) * c_dim1] = f43;
kono
parents:
diff changeset
2755 c[i + 2 + (j + 3) * c_dim1] = f34;
kono
parents:
diff changeset
2756 c[i + 3 + (j + 3) * c_dim1] = f44;
kono
parents:
diff changeset
2757 }
kono
parents:
diff changeset
2758 if (uisec < isec)
kono
parents:
diff changeset
2759 {
kono
parents:
diff changeset
2760 i5 = ii + isec - 1;
kono
parents:
diff changeset
2761 for (i = ii + uisec; i <= i5; ++i)
kono
parents:
diff changeset
2762 {
kono
parents:
diff changeset
2763 f11 = c[i + j * c_dim1];
kono
parents:
diff changeset
2764 f12 = c[i + (j + 1) * c_dim1];
kono
parents:
diff changeset
2765 f13 = c[i + (j + 2) * c_dim1];
kono
parents:
diff changeset
2766 f14 = c[i + (j + 3) * c_dim1];
kono
parents:
diff changeset
2767 i6 = ll + lsec - 1;
kono
parents:
diff changeset
2768 for (l = ll; l <= i6; ++l)
kono
parents:
diff changeset
2769 {
kono
parents:
diff changeset
2770 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
2771 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
2772 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
2773 257] * b[l + (j + 1) * b_dim1];
kono
parents:
diff changeset
2774 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
2775 257] * b[l + (j + 2) * b_dim1];
kono
parents:
diff changeset
2776 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
2777 257] * b[l + (j + 3) * b_dim1];
kono
parents:
diff changeset
2778 }
kono
parents:
diff changeset
2779 c[i + j * c_dim1] = f11;
kono
parents:
diff changeset
2780 c[i + (j + 1) * c_dim1] = f12;
kono
parents:
diff changeset
2781 c[i + (j + 2) * c_dim1] = f13;
kono
parents:
diff changeset
2782 c[i + (j + 3) * c_dim1] = f14;
kono
parents:
diff changeset
2783 }
kono
parents:
diff changeset
2784 }
kono
parents:
diff changeset
2785 }
kono
parents:
diff changeset
2786 if (ujsec < jsec)
kono
parents:
diff changeset
2787 {
kono
parents:
diff changeset
2788 i4 = jj + jsec - 1;
kono
parents:
diff changeset
2789 for (j = jj + ujsec; j <= i4; ++j)
kono
parents:
diff changeset
2790 {
kono
parents:
diff changeset
2791 i5 = ii + uisec - 1;
kono
parents:
diff changeset
2792 for (i = ii; i <= i5; i += 4)
kono
parents:
diff changeset
2793 {
kono
parents:
diff changeset
2794 f11 = c[i + j * c_dim1];
kono
parents:
diff changeset
2795 f21 = c[i + 1 + j * c_dim1];
kono
parents:
diff changeset
2796 f31 = c[i + 2 + j * c_dim1];
kono
parents:
diff changeset
2797 f41 = c[i + 3 + j * c_dim1];
kono
parents:
diff changeset
2798 i6 = ll + lsec - 1;
kono
parents:
diff changeset
2799 for (l = ll; l <= i6; ++l)
kono
parents:
diff changeset
2800 {
kono
parents:
diff changeset
2801 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
2802 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
2803 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
kono
parents:
diff changeset
2804 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
2805 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
kono
parents:
diff changeset
2806 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
2807 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
kono
parents:
diff changeset
2808 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
2809 }
kono
parents:
diff changeset
2810 c[i + j * c_dim1] = f11;
kono
parents:
diff changeset
2811 c[i + 1 + j * c_dim1] = f21;
kono
parents:
diff changeset
2812 c[i + 2 + j * c_dim1] = f31;
kono
parents:
diff changeset
2813 c[i + 3 + j * c_dim1] = f41;
kono
parents:
diff changeset
2814 }
kono
parents:
diff changeset
2815 i5 = ii + isec - 1;
kono
parents:
diff changeset
2816 for (i = ii + uisec; i <= i5; ++i)
kono
parents:
diff changeset
2817 {
kono
parents:
diff changeset
2818 f11 = c[i + j * c_dim1];
kono
parents:
diff changeset
2819 i6 = ll + lsec - 1;
kono
parents:
diff changeset
2820 for (l = ll; l <= i6; ++l)
kono
parents:
diff changeset
2821 {
kono
parents:
diff changeset
2822 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
2823 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
2824 }
kono
parents:
diff changeset
2825 c[i + j * c_dim1] = f11;
kono
parents:
diff changeset
2826 }
kono
parents:
diff changeset
2827 }
kono
parents:
diff changeset
2828 }
kono
parents:
diff changeset
2829 }
kono
parents:
diff changeset
2830 }
kono
parents:
diff changeset
2831 }
kono
parents:
diff changeset
2832 free(t1);
kono
parents:
diff changeset
2833 return;
kono
parents:
diff changeset
2834 }
kono
parents:
diff changeset
2835 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
kono
parents:
diff changeset
2836 {
kono
parents:
diff changeset
2837 if (GFC_DESCRIPTOR_RANK (a) != 1)
kono
parents:
diff changeset
2838 {
kono
parents:
diff changeset
2839 const GFC_INTEGER_1 *restrict abase_x;
kono
parents:
diff changeset
2840 const GFC_INTEGER_1 *restrict bbase_y;
kono
parents:
diff changeset
2841 GFC_INTEGER_1 *restrict dest_y;
kono
parents:
diff changeset
2842 GFC_INTEGER_1 s;
kono
parents:
diff changeset
2843
kono
parents:
diff changeset
2844 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
2845 {
kono
parents:
diff changeset
2846 bbase_y = &bbase[y*bystride];
kono
parents:
diff changeset
2847 dest_y = &dest[y*rystride];
kono
parents:
diff changeset
2848 for (x = 0; x < xcount; x++)
kono
parents:
diff changeset
2849 {
kono
parents:
diff changeset
2850 abase_x = &abase[x*axstride];
kono
parents:
diff changeset
2851 s = (GFC_INTEGER_1) 0;
kono
parents:
diff changeset
2852 for (n = 0; n < count; n++)
kono
parents:
diff changeset
2853 s += abase_x[n] * bbase_y[n];
kono
parents:
diff changeset
2854 dest_y[x] = s;
kono
parents:
diff changeset
2855 }
kono
parents:
diff changeset
2856 }
kono
parents:
diff changeset
2857 }
kono
parents:
diff changeset
2858 else
kono
parents:
diff changeset
2859 {
kono
parents:
diff changeset
2860 const GFC_INTEGER_1 *restrict bbase_y;
kono
parents:
diff changeset
2861 GFC_INTEGER_1 s;
kono
parents:
diff changeset
2862
kono
parents:
diff changeset
2863 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
2864 {
kono
parents:
diff changeset
2865 bbase_y = &bbase[y*bystride];
kono
parents:
diff changeset
2866 s = (GFC_INTEGER_1) 0;
kono
parents:
diff changeset
2867 for (n = 0; n < count; n++)
kono
parents:
diff changeset
2868 s += abase[n*axstride] * bbase_y[n];
kono
parents:
diff changeset
2869 dest[y*rystride] = s;
kono
parents:
diff changeset
2870 }
kono
parents:
diff changeset
2871 }
kono
parents:
diff changeset
2872 }
kono
parents:
diff changeset
2873 else if (axstride < aystride)
kono
parents:
diff changeset
2874 {
kono
parents:
diff changeset
2875 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
2876 for (x = 0; x < xcount; x++)
kono
parents:
diff changeset
2877 dest[x*rxstride + y*rystride] = (GFC_INTEGER_1)0;
kono
parents:
diff changeset
2878
kono
parents:
diff changeset
2879 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
2880 for (n = 0; n < count; n++)
kono
parents:
diff changeset
2881 for (x = 0; x < xcount; x++)
kono
parents:
diff changeset
2882 /* dest[x,y] += a[x,n] * b[n,y] */
kono
parents:
diff changeset
2883 dest[x*rxstride + y*rystride] +=
kono
parents:
diff changeset
2884 abase[x*axstride + n*aystride] *
kono
parents:
diff changeset
2885 bbase[n*bxstride + y*bystride];
kono
parents:
diff changeset
2886 }
kono
parents:
diff changeset
2887 else if (GFC_DESCRIPTOR_RANK (a) == 1)
kono
parents:
diff changeset
2888 {
kono
parents:
diff changeset
2889 const GFC_INTEGER_1 *restrict bbase_y;
kono
parents:
diff changeset
2890 GFC_INTEGER_1 s;
kono
parents:
diff changeset
2891
kono
parents:
diff changeset
2892 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
2893 {
kono
parents:
diff changeset
2894 bbase_y = &bbase[y*bystride];
kono
parents:
diff changeset
2895 s = (GFC_INTEGER_1) 0;
kono
parents:
diff changeset
2896 for (n = 0; n < count; n++)
kono
parents:
diff changeset
2897 s += abase[n*axstride] * bbase_y[n*bxstride];
kono
parents:
diff changeset
2898 dest[y*rxstride] = s;
kono
parents:
diff changeset
2899 }
kono
parents:
diff changeset
2900 }
kono
parents:
diff changeset
2901 else
kono
parents:
diff changeset
2902 {
kono
parents:
diff changeset
2903 const GFC_INTEGER_1 *restrict abase_x;
kono
parents:
diff changeset
2904 const GFC_INTEGER_1 *restrict bbase_y;
kono
parents:
diff changeset
2905 GFC_INTEGER_1 *restrict dest_y;
kono
parents:
diff changeset
2906 GFC_INTEGER_1 s;
kono
parents:
diff changeset
2907
kono
parents:
diff changeset
2908 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
2909 {
kono
parents:
diff changeset
2910 bbase_y = &bbase[y*bystride];
kono
parents:
diff changeset
2911 dest_y = &dest[y*rystride];
kono
parents:
diff changeset
2912 for (x = 0; x < xcount; x++)
kono
parents:
diff changeset
2913 {
kono
parents:
diff changeset
2914 abase_x = &abase[x*axstride];
kono
parents:
diff changeset
2915 s = (GFC_INTEGER_1) 0;
kono
parents:
diff changeset
2916 for (n = 0; n < count; n++)
kono
parents:
diff changeset
2917 s += abase_x[n*aystride] * bbase_y[n*bxstride];
kono
parents:
diff changeset
2918 dest_y[x*rxstride] = s;
kono
parents:
diff changeset
2919 }
kono
parents:
diff changeset
2920 }
kono
parents:
diff changeset
2921 }
kono
parents:
diff changeset
2922 }
kono
parents:
diff changeset
2923 #undef POW3
kono
parents:
diff changeset
2924 #undef min
kono
parents:
diff changeset
2925 #undef max
kono
parents:
diff changeset
2926
kono
parents:
diff changeset
2927 #endif
kono
parents:
diff changeset
2928 #endif
kono
parents:
diff changeset
2929