annotate libgfortran/generated/matmulavx128_c4.c @ 158:494b0b89df80 default tip

...
author Shinji KONO <kono@ie.u-ryukyu.ac.jp>
date Mon, 25 May 2020 18:13:55 +0900
parents 1830386684a0
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
111
kono
parents:
diff changeset
1 /* Implementation of the MATMUL intrinsic
145
1830386684a0 gcc-9.2.0
anatofuz
parents: 131
diff changeset
2 Copyright (C) 2002-2020 Free Software Foundation, Inc.
111
kono
parents:
diff changeset
3 Contributed by Thomas Koenig <tkoenig@gcc.gnu.org>.
kono
parents:
diff changeset
4
kono
parents:
diff changeset
5 This file is part of the GNU Fortran runtime library (libgfortran).
kono
parents:
diff changeset
6
kono
parents:
diff changeset
7 Libgfortran is free software; you can redistribute it and/or
kono
parents:
diff changeset
8 modify it under the terms of the GNU General Public
kono
parents:
diff changeset
9 License as published by the Free Software Foundation; either
kono
parents:
diff changeset
10 version 3 of the License, or (at your option) any later version.
kono
parents:
diff changeset
11
kono
parents:
diff changeset
12 Libgfortran is distributed in the hope that it will be useful,
kono
parents:
diff changeset
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
kono
parents:
diff changeset
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
kono
parents:
diff changeset
15 GNU General Public License for more details.
kono
parents:
diff changeset
16
kono
parents:
diff changeset
17 Under Section 7 of GPL version 3, you are granted additional
kono
parents:
diff changeset
18 permissions described in the GCC Runtime Library Exception, version
kono
parents:
diff changeset
19 3.1, as published by the Free Software Foundation.
kono
parents:
diff changeset
20
kono
parents:
diff changeset
21 You should have received a copy of the GNU General Public License and
kono
parents:
diff changeset
22 a copy of the GCC Runtime Library Exception along with this program;
kono
parents:
diff changeset
23 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
kono
parents:
diff changeset
24 <http://www.gnu.org/licenses/>. */
kono
parents:
diff changeset
25
kono
parents:
diff changeset
26 #include "libgfortran.h"
kono
parents:
diff changeset
27 #include <string.h>
kono
parents:
diff changeset
28 #include <assert.h>
kono
parents:
diff changeset
29
kono
parents:
diff changeset
30
kono
parents:
diff changeset
31 /* These are the specific versions of matmul with -mprefer-avx128. */
kono
parents:
diff changeset
32
kono
parents:
diff changeset
33 #if defined (HAVE_GFC_COMPLEX_4)
kono
parents:
diff changeset
34
kono
parents:
diff changeset
35 /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
kono
parents:
diff changeset
36 passed to us by the front-end, in which case we call it for large
kono
parents:
diff changeset
37 matrices. */
kono
parents:
diff changeset
38
kono
parents:
diff changeset
39 typedef void (*blas_call)(const char *, const char *, const int *, const int *,
kono
parents:
diff changeset
40 const int *, const GFC_COMPLEX_4 *, const GFC_COMPLEX_4 *,
kono
parents:
diff changeset
41 const int *, const GFC_COMPLEX_4 *, const int *,
kono
parents:
diff changeset
42 const GFC_COMPLEX_4 *, GFC_COMPLEX_4 *, const int *,
kono
parents:
diff changeset
43 int, int);
kono
parents:
diff changeset
44
kono
parents:
diff changeset
45 #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
kono
parents:
diff changeset
46 void
kono
parents:
diff changeset
47 matmul_c4_avx128_fma3 (gfc_array_c4 * const restrict retarray,
kono
parents:
diff changeset
48 gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
kono
parents:
diff changeset
49 int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
kono
parents:
diff changeset
50 internal_proto(matmul_c4_avx128_fma3);
kono
parents:
diff changeset
51 void
kono
parents:
diff changeset
52 matmul_c4_avx128_fma3 (gfc_array_c4 * const restrict retarray,
kono
parents:
diff changeset
53 gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
kono
parents:
diff changeset
54 int blas_limit, blas_call gemm)
kono
parents:
diff changeset
55 {
kono
parents:
diff changeset
56 const GFC_COMPLEX_4 * restrict abase;
kono
parents:
diff changeset
57 const GFC_COMPLEX_4 * restrict bbase;
kono
parents:
diff changeset
58 GFC_COMPLEX_4 * restrict dest;
kono
parents:
diff changeset
59
kono
parents:
diff changeset
60 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
kono
parents:
diff changeset
61 index_type x, y, n, count, xcount, ycount;
kono
parents:
diff changeset
62
kono
parents:
diff changeset
63 assert (GFC_DESCRIPTOR_RANK (a) == 2
kono
parents:
diff changeset
64 || GFC_DESCRIPTOR_RANK (b) == 2);
kono
parents:
diff changeset
65
kono
parents:
diff changeset
66 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
kono
parents:
diff changeset
67
kono
parents:
diff changeset
68 Either A or B (but not both) can be rank 1:
kono
parents:
diff changeset
69
kono
parents:
diff changeset
70 o One-dimensional argument A is implicitly treated as a row matrix
kono
parents:
diff changeset
71 dimensioned [1,count], so xcount=1.
kono
parents:
diff changeset
72
kono
parents:
diff changeset
73 o One-dimensional argument B is implicitly treated as a column matrix
kono
parents:
diff changeset
74 dimensioned [count, 1], so ycount=1.
kono
parents:
diff changeset
75 */
kono
parents:
diff changeset
76
kono
parents:
diff changeset
77 if (retarray->base_addr == NULL)
kono
parents:
diff changeset
78 {
kono
parents:
diff changeset
79 if (GFC_DESCRIPTOR_RANK (a) == 1)
kono
parents:
diff changeset
80 {
kono
parents:
diff changeset
81 GFC_DIMENSION_SET(retarray->dim[0], 0,
kono
parents:
diff changeset
82 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
kono
parents:
diff changeset
83 }
kono
parents:
diff changeset
84 else if (GFC_DESCRIPTOR_RANK (b) == 1)
kono
parents:
diff changeset
85 {
kono
parents:
diff changeset
86 GFC_DIMENSION_SET(retarray->dim[0], 0,
kono
parents:
diff changeset
87 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
kono
parents:
diff changeset
88 }
kono
parents:
diff changeset
89 else
kono
parents:
diff changeset
90 {
kono
parents:
diff changeset
91 GFC_DIMENSION_SET(retarray->dim[0], 0,
kono
parents:
diff changeset
92 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
kono
parents:
diff changeset
93
kono
parents:
diff changeset
94 GFC_DIMENSION_SET(retarray->dim[1], 0,
kono
parents:
diff changeset
95 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
kono
parents:
diff changeset
96 GFC_DESCRIPTOR_EXTENT(retarray,0));
kono
parents:
diff changeset
97 }
kono
parents:
diff changeset
98
kono
parents:
diff changeset
99 retarray->base_addr
kono
parents:
diff changeset
100 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_4));
kono
parents:
diff changeset
101 retarray->offset = 0;
kono
parents:
diff changeset
102 }
kono
parents:
diff changeset
103 else if (unlikely (compile_options.bounds_check))
kono
parents:
diff changeset
104 {
kono
parents:
diff changeset
105 index_type ret_extent, arg_extent;
kono
parents:
diff changeset
106
kono
parents:
diff changeset
107 if (GFC_DESCRIPTOR_RANK (a) == 1)
kono
parents:
diff changeset
108 {
kono
parents:
diff changeset
109 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
kono
parents:
diff changeset
110 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
kono
parents:
diff changeset
111 if (arg_extent != ret_extent)
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
112 runtime_error ("Array bound mismatch for dimension 1 of "
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
113 "array (%ld/%ld) ",
111
kono
parents:
diff changeset
114 (long int) ret_extent, (long int) arg_extent);
kono
parents:
diff changeset
115 }
kono
parents:
diff changeset
116 else if (GFC_DESCRIPTOR_RANK (b) == 1)
kono
parents:
diff changeset
117 {
kono
parents:
diff changeset
118 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
kono
parents:
diff changeset
119 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
kono
parents:
diff changeset
120 if (arg_extent != ret_extent)
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
121 runtime_error ("Array bound mismatch for dimension 1 of "
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
122 "array (%ld/%ld) ",
111
kono
parents:
diff changeset
123 (long int) ret_extent, (long int) arg_extent);
kono
parents:
diff changeset
124 }
kono
parents:
diff changeset
125 else
kono
parents:
diff changeset
126 {
kono
parents:
diff changeset
127 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
kono
parents:
diff changeset
128 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
kono
parents:
diff changeset
129 if (arg_extent != ret_extent)
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
130 runtime_error ("Array bound mismatch for dimension 1 of "
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
131 "array (%ld/%ld) ",
111
kono
parents:
diff changeset
132 (long int) ret_extent, (long int) arg_extent);
kono
parents:
diff changeset
133
kono
parents:
diff changeset
134 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
kono
parents:
diff changeset
135 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
kono
parents:
diff changeset
136 if (arg_extent != ret_extent)
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
137 runtime_error ("Array bound mismatch for dimension 2 of "
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
138 "array (%ld/%ld) ",
111
kono
parents:
diff changeset
139 (long int) ret_extent, (long int) arg_extent);
kono
parents:
diff changeset
140 }
kono
parents:
diff changeset
141 }
kono
parents:
diff changeset
142
kono
parents:
diff changeset
143
kono
parents:
diff changeset
144 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
kono
parents:
diff changeset
145 {
kono
parents:
diff changeset
146 /* One-dimensional result may be addressed in the code below
kono
parents:
diff changeset
147 either as a row or a column matrix. We want both cases to
kono
parents:
diff changeset
148 work. */
kono
parents:
diff changeset
149 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
kono
parents:
diff changeset
150 }
kono
parents:
diff changeset
151 else
kono
parents:
diff changeset
152 {
kono
parents:
diff changeset
153 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
kono
parents:
diff changeset
154 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
kono
parents:
diff changeset
155 }
kono
parents:
diff changeset
156
kono
parents:
diff changeset
157
kono
parents:
diff changeset
158 if (GFC_DESCRIPTOR_RANK (a) == 1)
kono
parents:
diff changeset
159 {
kono
parents:
diff changeset
160 /* Treat it as a a row matrix A[1,count]. */
kono
parents:
diff changeset
161 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
kono
parents:
diff changeset
162 aystride = 1;
kono
parents:
diff changeset
163
kono
parents:
diff changeset
164 xcount = 1;
kono
parents:
diff changeset
165 count = GFC_DESCRIPTOR_EXTENT(a,0);
kono
parents:
diff changeset
166 }
kono
parents:
diff changeset
167 else
kono
parents:
diff changeset
168 {
kono
parents:
diff changeset
169 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
kono
parents:
diff changeset
170 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
kono
parents:
diff changeset
171
kono
parents:
diff changeset
172 count = GFC_DESCRIPTOR_EXTENT(a,1);
kono
parents:
diff changeset
173 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
kono
parents:
diff changeset
174 }
kono
parents:
diff changeset
175
kono
parents:
diff changeset
176 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
kono
parents:
diff changeset
177 {
kono
parents:
diff changeset
178 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
179 runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
180 "in dimension 1: is %ld, should be %ld",
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
181 (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
111
kono
parents:
diff changeset
182 }
kono
parents:
diff changeset
183
kono
parents:
diff changeset
184 if (GFC_DESCRIPTOR_RANK (b) == 1)
kono
parents:
diff changeset
185 {
kono
parents:
diff changeset
186 /* Treat it as a column matrix B[count,1] */
kono
parents:
diff changeset
187 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
kono
parents:
diff changeset
188
kono
parents:
diff changeset
189 /* bystride should never be used for 1-dimensional b.
kono
parents:
diff changeset
190 The value is only used for calculation of the
kono
parents:
diff changeset
191 memory by the buffer. */
kono
parents:
diff changeset
192 bystride = 256;
kono
parents:
diff changeset
193 ycount = 1;
kono
parents:
diff changeset
194 }
kono
parents:
diff changeset
195 else
kono
parents:
diff changeset
196 {
kono
parents:
diff changeset
197 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
kono
parents:
diff changeset
198 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
kono
parents:
diff changeset
199 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
kono
parents:
diff changeset
200 }
kono
parents:
diff changeset
201
kono
parents:
diff changeset
202 abase = a->base_addr;
kono
parents:
diff changeset
203 bbase = b->base_addr;
kono
parents:
diff changeset
204 dest = retarray->base_addr;
kono
parents:
diff changeset
205
kono
parents:
diff changeset
206 /* Now that everything is set up, we perform the multiplication
kono
parents:
diff changeset
207 itself. */
kono
parents:
diff changeset
208
kono
parents:
diff changeset
209 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
kono
parents:
diff changeset
210 #define min(a,b) ((a) <= (b) ? (a) : (b))
kono
parents:
diff changeset
211 #define max(a,b) ((a) >= (b) ? (a) : (b))
kono
parents:
diff changeset
212
kono
parents:
diff changeset
213 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
kono
parents:
diff changeset
214 && (bxstride == 1 || bystride == 1)
kono
parents:
diff changeset
215 && (((float) xcount) * ((float) ycount) * ((float) count)
kono
parents:
diff changeset
216 > POW3(blas_limit)))
kono
parents:
diff changeset
217 {
kono
parents:
diff changeset
218 const int m = xcount, n = ycount, k = count, ldc = rystride;
kono
parents:
diff changeset
219 const GFC_COMPLEX_4 one = 1, zero = 0;
kono
parents:
diff changeset
220 const int lda = (axstride == 1) ? aystride : axstride,
kono
parents:
diff changeset
221 ldb = (bxstride == 1) ? bystride : bxstride;
kono
parents:
diff changeset
222
kono
parents:
diff changeset
223 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
kono
parents:
diff changeset
224 {
kono
parents:
diff changeset
225 assert (gemm != NULL);
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
226 const char *transa, *transb;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
227 if (try_blas & 2)
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
228 transa = "C";
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
229 else
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
230 transa = axstride == 1 ? "N" : "T";
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
231
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
232 if (try_blas & 4)
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
233 transb = "C";
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
234 else
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
235 transb = bxstride == 1 ? "N" : "T";
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
236
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
237 gemm (transa, transb , &m,
111
kono
parents:
diff changeset
238 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
kono
parents:
diff changeset
239 &ldc, 1, 1);
kono
parents:
diff changeset
240 return;
kono
parents:
diff changeset
241 }
kono
parents:
diff changeset
242 }
kono
parents:
diff changeset
243
kono
parents:
diff changeset
244 if (rxstride == 1 && axstride == 1 && bxstride == 1)
kono
parents:
diff changeset
245 {
kono
parents:
diff changeset
246 /* This block of code implements a tuned matmul, derived from
kono
parents:
diff changeset
247 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
kono
parents:
diff changeset
248
kono
parents:
diff changeset
249 Bo Kagstrom and Per Ling
kono
parents:
diff changeset
250 Department of Computing Science
kono
parents:
diff changeset
251 Umea University
kono
parents:
diff changeset
252 S-901 87 Umea, Sweden
kono
parents:
diff changeset
253
kono
parents:
diff changeset
254 from netlib.org, translated to C, and modified for matmul.m4. */
kono
parents:
diff changeset
255
kono
parents:
diff changeset
256 const GFC_COMPLEX_4 *a, *b;
kono
parents:
diff changeset
257 GFC_COMPLEX_4 *c;
kono
parents:
diff changeset
258 const index_type m = xcount, n = ycount, k = count;
kono
parents:
diff changeset
259
kono
parents:
diff changeset
260 /* System generated locals */
kono
parents:
diff changeset
261 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
kono
parents:
diff changeset
262 i1, i2, i3, i4, i5, i6;
kono
parents:
diff changeset
263
kono
parents:
diff changeset
264 /* Local variables */
kono
parents:
diff changeset
265 GFC_COMPLEX_4 f11, f12, f21, f22, f31, f32, f41, f42,
kono
parents:
diff changeset
266 f13, f14, f23, f24, f33, f34, f43, f44;
kono
parents:
diff changeset
267 index_type i, j, l, ii, jj, ll;
kono
parents:
diff changeset
268 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
kono
parents:
diff changeset
269 GFC_COMPLEX_4 *t1;
kono
parents:
diff changeset
270
kono
parents:
diff changeset
271 a = abase;
kono
parents:
diff changeset
272 b = bbase;
kono
parents:
diff changeset
273 c = retarray->base_addr;
kono
parents:
diff changeset
274
kono
parents:
diff changeset
275 /* Parameter adjustments */
kono
parents:
diff changeset
276 c_dim1 = rystride;
kono
parents:
diff changeset
277 c_offset = 1 + c_dim1;
kono
parents:
diff changeset
278 c -= c_offset;
kono
parents:
diff changeset
279 a_dim1 = aystride;
kono
parents:
diff changeset
280 a_offset = 1 + a_dim1;
kono
parents:
diff changeset
281 a -= a_offset;
kono
parents:
diff changeset
282 b_dim1 = bystride;
kono
parents:
diff changeset
283 b_offset = 1 + b_dim1;
kono
parents:
diff changeset
284 b -= b_offset;
kono
parents:
diff changeset
285
kono
parents:
diff changeset
286 /* Empty c first. */
kono
parents:
diff changeset
287 for (j=1; j<=n; j++)
kono
parents:
diff changeset
288 for (i=1; i<=m; i++)
kono
parents:
diff changeset
289 c[i + j * c_dim1] = (GFC_COMPLEX_4)0;
kono
parents:
diff changeset
290
kono
parents:
diff changeset
291 /* Early exit if possible */
kono
parents:
diff changeset
292 if (m == 0 || n == 0 || k == 0)
kono
parents:
diff changeset
293 return;
kono
parents:
diff changeset
294
kono
parents:
diff changeset
295 /* Adjust size of t1 to what is needed. */
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
296 index_type t1_dim, a_sz;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
297 if (aystride == 1)
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
298 a_sz = rystride;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
299 else
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
300 a_sz = a_dim1;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
301
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
302 t1_dim = a_sz * 256 + b_dim1;
111
kono
parents:
diff changeset
303 if (t1_dim > 65536)
kono
parents:
diff changeset
304 t1_dim = 65536;
kono
parents:
diff changeset
305
kono
parents:
diff changeset
306 t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_4));
kono
parents:
diff changeset
307
kono
parents:
diff changeset
308 /* Start turning the crank. */
kono
parents:
diff changeset
309 i1 = n;
kono
parents:
diff changeset
310 for (jj = 1; jj <= i1; jj += 512)
kono
parents:
diff changeset
311 {
kono
parents:
diff changeset
312 /* Computing MIN */
kono
parents:
diff changeset
313 i2 = 512;
kono
parents:
diff changeset
314 i3 = n - jj + 1;
kono
parents:
diff changeset
315 jsec = min(i2,i3);
kono
parents:
diff changeset
316 ujsec = jsec - jsec % 4;
kono
parents:
diff changeset
317 i2 = k;
kono
parents:
diff changeset
318 for (ll = 1; ll <= i2; ll += 256)
kono
parents:
diff changeset
319 {
kono
parents:
diff changeset
320 /* Computing MIN */
kono
parents:
diff changeset
321 i3 = 256;
kono
parents:
diff changeset
322 i4 = k - ll + 1;
kono
parents:
diff changeset
323 lsec = min(i3,i4);
kono
parents:
diff changeset
324 ulsec = lsec - lsec % 2;
kono
parents:
diff changeset
325
kono
parents:
diff changeset
326 i3 = m;
kono
parents:
diff changeset
327 for (ii = 1; ii <= i3; ii += 256)
kono
parents:
diff changeset
328 {
kono
parents:
diff changeset
329 /* Computing MIN */
kono
parents:
diff changeset
330 i4 = 256;
kono
parents:
diff changeset
331 i5 = m - ii + 1;
kono
parents:
diff changeset
332 isec = min(i4,i5);
kono
parents:
diff changeset
333 uisec = isec - isec % 2;
kono
parents:
diff changeset
334 i4 = ll + ulsec - 1;
kono
parents:
diff changeset
335 for (l = ll; l <= i4; l += 2)
kono
parents:
diff changeset
336 {
kono
parents:
diff changeset
337 i5 = ii + uisec - 1;
kono
parents:
diff changeset
338 for (i = ii; i <= i5; i += 2)
kono
parents:
diff changeset
339 {
kono
parents:
diff changeset
340 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
kono
parents:
diff changeset
341 a[i + l * a_dim1];
kono
parents:
diff changeset
342 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
kono
parents:
diff changeset
343 a[i + (l + 1) * a_dim1];
kono
parents:
diff changeset
344 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
kono
parents:
diff changeset
345 a[i + 1 + l * a_dim1];
kono
parents:
diff changeset
346 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
kono
parents:
diff changeset
347 a[i + 1 + (l + 1) * a_dim1];
kono
parents:
diff changeset
348 }
kono
parents:
diff changeset
349 if (uisec < isec)
kono
parents:
diff changeset
350 {
kono
parents:
diff changeset
351 t1[l - ll + 1 + (isec << 8) - 257] =
kono
parents:
diff changeset
352 a[ii + isec - 1 + l * a_dim1];
kono
parents:
diff changeset
353 t1[l - ll + 2 + (isec << 8) - 257] =
kono
parents:
diff changeset
354 a[ii + isec - 1 + (l + 1) * a_dim1];
kono
parents:
diff changeset
355 }
kono
parents:
diff changeset
356 }
kono
parents:
diff changeset
357 if (ulsec < lsec)
kono
parents:
diff changeset
358 {
kono
parents:
diff changeset
359 i4 = ii + isec - 1;
kono
parents:
diff changeset
360 for (i = ii; i<= i4; ++i)
kono
parents:
diff changeset
361 {
kono
parents:
diff changeset
362 t1[lsec + ((i - ii + 1) << 8) - 257] =
kono
parents:
diff changeset
363 a[i + (ll + lsec - 1) * a_dim1];
kono
parents:
diff changeset
364 }
kono
parents:
diff changeset
365 }
kono
parents:
diff changeset
366
kono
parents:
diff changeset
367 uisec = isec - isec % 4;
kono
parents:
diff changeset
368 i4 = jj + ujsec - 1;
kono
parents:
diff changeset
369 for (j = jj; j <= i4; j += 4)
kono
parents:
diff changeset
370 {
kono
parents:
diff changeset
371 i5 = ii + uisec - 1;
kono
parents:
diff changeset
372 for (i = ii; i <= i5; i += 4)
kono
parents:
diff changeset
373 {
kono
parents:
diff changeset
374 f11 = c[i + j * c_dim1];
kono
parents:
diff changeset
375 f21 = c[i + 1 + j * c_dim1];
kono
parents:
diff changeset
376 f12 = c[i + (j + 1) * c_dim1];
kono
parents:
diff changeset
377 f22 = c[i + 1 + (j + 1) * c_dim1];
kono
parents:
diff changeset
378 f13 = c[i + (j + 2) * c_dim1];
kono
parents:
diff changeset
379 f23 = c[i + 1 + (j + 2) * c_dim1];
kono
parents:
diff changeset
380 f14 = c[i + (j + 3) * c_dim1];
kono
parents:
diff changeset
381 f24 = c[i + 1 + (j + 3) * c_dim1];
kono
parents:
diff changeset
382 f31 = c[i + 2 + j * c_dim1];
kono
parents:
diff changeset
383 f41 = c[i + 3 + j * c_dim1];
kono
parents:
diff changeset
384 f32 = c[i + 2 + (j + 1) * c_dim1];
kono
parents:
diff changeset
385 f42 = c[i + 3 + (j + 1) * c_dim1];
kono
parents:
diff changeset
386 f33 = c[i + 2 + (j + 2) * c_dim1];
kono
parents:
diff changeset
387 f43 = c[i + 3 + (j + 2) * c_dim1];
kono
parents:
diff changeset
388 f34 = c[i + 2 + (j + 3) * c_dim1];
kono
parents:
diff changeset
389 f44 = c[i + 3 + (j + 3) * c_dim1];
kono
parents:
diff changeset
390 i6 = ll + lsec - 1;
kono
parents:
diff changeset
391 for (l = ll; l <= i6; ++l)
kono
parents:
diff changeset
392 {
kono
parents:
diff changeset
393 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
kono
parents:
diff changeset
394 * b[l + j * b_dim1];
kono
parents:
diff changeset
395 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
kono
parents:
diff changeset
396 * b[l + j * b_dim1];
kono
parents:
diff changeset
397 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
kono
parents:
diff changeset
398 * b[l + (j + 1) * b_dim1];
kono
parents:
diff changeset
399 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
kono
parents:
diff changeset
400 * b[l + (j + 1) * b_dim1];
kono
parents:
diff changeset
401 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
kono
parents:
diff changeset
402 * b[l + (j + 2) * b_dim1];
kono
parents:
diff changeset
403 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
kono
parents:
diff changeset
404 * b[l + (j + 2) * b_dim1];
kono
parents:
diff changeset
405 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
kono
parents:
diff changeset
406 * b[l + (j + 3) * b_dim1];
kono
parents:
diff changeset
407 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
kono
parents:
diff changeset
408 * b[l + (j + 3) * b_dim1];
kono
parents:
diff changeset
409 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
kono
parents:
diff changeset
410 * b[l + j * b_dim1];
kono
parents:
diff changeset
411 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
kono
parents:
diff changeset
412 * b[l + j * b_dim1];
kono
parents:
diff changeset
413 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
kono
parents:
diff changeset
414 * b[l + (j + 1) * b_dim1];
kono
parents:
diff changeset
415 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
kono
parents:
diff changeset
416 * b[l + (j + 1) * b_dim1];
kono
parents:
diff changeset
417 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
kono
parents:
diff changeset
418 * b[l + (j + 2) * b_dim1];
kono
parents:
diff changeset
419 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
kono
parents:
diff changeset
420 * b[l + (j + 2) * b_dim1];
kono
parents:
diff changeset
421 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
kono
parents:
diff changeset
422 * b[l + (j + 3) * b_dim1];
kono
parents:
diff changeset
423 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
kono
parents:
diff changeset
424 * b[l + (j + 3) * b_dim1];
kono
parents:
diff changeset
425 }
kono
parents:
diff changeset
426 c[i + j * c_dim1] = f11;
kono
parents:
diff changeset
427 c[i + 1 + j * c_dim1] = f21;
kono
parents:
diff changeset
428 c[i + (j + 1) * c_dim1] = f12;
kono
parents:
diff changeset
429 c[i + 1 + (j + 1) * c_dim1] = f22;
kono
parents:
diff changeset
430 c[i + (j + 2) * c_dim1] = f13;
kono
parents:
diff changeset
431 c[i + 1 + (j + 2) * c_dim1] = f23;
kono
parents:
diff changeset
432 c[i + (j + 3) * c_dim1] = f14;
kono
parents:
diff changeset
433 c[i + 1 + (j + 3) * c_dim1] = f24;
kono
parents:
diff changeset
434 c[i + 2 + j * c_dim1] = f31;
kono
parents:
diff changeset
435 c[i + 3 + j * c_dim1] = f41;
kono
parents:
diff changeset
436 c[i + 2 + (j + 1) * c_dim1] = f32;
kono
parents:
diff changeset
437 c[i + 3 + (j + 1) * c_dim1] = f42;
kono
parents:
diff changeset
438 c[i + 2 + (j + 2) * c_dim1] = f33;
kono
parents:
diff changeset
439 c[i + 3 + (j + 2) * c_dim1] = f43;
kono
parents:
diff changeset
440 c[i + 2 + (j + 3) * c_dim1] = f34;
kono
parents:
diff changeset
441 c[i + 3 + (j + 3) * c_dim1] = f44;
kono
parents:
diff changeset
442 }
kono
parents:
diff changeset
443 if (uisec < isec)
kono
parents:
diff changeset
444 {
kono
parents:
diff changeset
445 i5 = ii + isec - 1;
kono
parents:
diff changeset
446 for (i = ii + uisec; i <= i5; ++i)
kono
parents:
diff changeset
447 {
kono
parents:
diff changeset
448 f11 = c[i + j * c_dim1];
kono
parents:
diff changeset
449 f12 = c[i + (j + 1) * c_dim1];
kono
parents:
diff changeset
450 f13 = c[i + (j + 2) * c_dim1];
kono
parents:
diff changeset
451 f14 = c[i + (j + 3) * c_dim1];
kono
parents:
diff changeset
452 i6 = ll + lsec - 1;
kono
parents:
diff changeset
453 for (l = ll; l <= i6; ++l)
kono
parents:
diff changeset
454 {
kono
parents:
diff changeset
455 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
456 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
457 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
458 257] * b[l + (j + 1) * b_dim1];
kono
parents:
diff changeset
459 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
460 257] * b[l + (j + 2) * b_dim1];
kono
parents:
diff changeset
461 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
462 257] * b[l + (j + 3) * b_dim1];
kono
parents:
diff changeset
463 }
kono
parents:
diff changeset
464 c[i + j * c_dim1] = f11;
kono
parents:
diff changeset
465 c[i + (j + 1) * c_dim1] = f12;
kono
parents:
diff changeset
466 c[i + (j + 2) * c_dim1] = f13;
kono
parents:
diff changeset
467 c[i + (j + 3) * c_dim1] = f14;
kono
parents:
diff changeset
468 }
kono
parents:
diff changeset
469 }
kono
parents:
diff changeset
470 }
kono
parents:
diff changeset
471 if (ujsec < jsec)
kono
parents:
diff changeset
472 {
kono
parents:
diff changeset
473 i4 = jj + jsec - 1;
kono
parents:
diff changeset
474 for (j = jj + ujsec; j <= i4; ++j)
kono
parents:
diff changeset
475 {
kono
parents:
diff changeset
476 i5 = ii + uisec - 1;
kono
parents:
diff changeset
477 for (i = ii; i <= i5; i += 4)
kono
parents:
diff changeset
478 {
kono
parents:
diff changeset
479 f11 = c[i + j * c_dim1];
kono
parents:
diff changeset
480 f21 = c[i + 1 + j * c_dim1];
kono
parents:
diff changeset
481 f31 = c[i + 2 + j * c_dim1];
kono
parents:
diff changeset
482 f41 = c[i + 3 + j * c_dim1];
kono
parents:
diff changeset
483 i6 = ll + lsec - 1;
kono
parents:
diff changeset
484 for (l = ll; l <= i6; ++l)
kono
parents:
diff changeset
485 {
kono
parents:
diff changeset
486 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
487 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
488 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
kono
parents:
diff changeset
489 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
490 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
kono
parents:
diff changeset
491 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
492 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
kono
parents:
diff changeset
493 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
494 }
kono
parents:
diff changeset
495 c[i + j * c_dim1] = f11;
kono
parents:
diff changeset
496 c[i + 1 + j * c_dim1] = f21;
kono
parents:
diff changeset
497 c[i + 2 + j * c_dim1] = f31;
kono
parents:
diff changeset
498 c[i + 3 + j * c_dim1] = f41;
kono
parents:
diff changeset
499 }
kono
parents:
diff changeset
500 i5 = ii + isec - 1;
kono
parents:
diff changeset
501 for (i = ii + uisec; i <= i5; ++i)
kono
parents:
diff changeset
502 {
kono
parents:
diff changeset
503 f11 = c[i + j * c_dim1];
kono
parents:
diff changeset
504 i6 = ll + lsec - 1;
kono
parents:
diff changeset
505 for (l = ll; l <= i6; ++l)
kono
parents:
diff changeset
506 {
kono
parents:
diff changeset
507 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
508 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
509 }
kono
parents:
diff changeset
510 c[i + j * c_dim1] = f11;
kono
parents:
diff changeset
511 }
kono
parents:
diff changeset
512 }
kono
parents:
diff changeset
513 }
kono
parents:
diff changeset
514 }
kono
parents:
diff changeset
515 }
kono
parents:
diff changeset
516 }
kono
parents:
diff changeset
517 free(t1);
kono
parents:
diff changeset
518 return;
kono
parents:
diff changeset
519 }
kono
parents:
diff changeset
520 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
kono
parents:
diff changeset
521 {
kono
parents:
diff changeset
522 if (GFC_DESCRIPTOR_RANK (a) != 1)
kono
parents:
diff changeset
523 {
kono
parents:
diff changeset
524 const GFC_COMPLEX_4 *restrict abase_x;
kono
parents:
diff changeset
525 const GFC_COMPLEX_4 *restrict bbase_y;
kono
parents:
diff changeset
526 GFC_COMPLEX_4 *restrict dest_y;
kono
parents:
diff changeset
527 GFC_COMPLEX_4 s;
kono
parents:
diff changeset
528
kono
parents:
diff changeset
529 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
530 {
kono
parents:
diff changeset
531 bbase_y = &bbase[y*bystride];
kono
parents:
diff changeset
532 dest_y = &dest[y*rystride];
kono
parents:
diff changeset
533 for (x = 0; x < xcount; x++)
kono
parents:
diff changeset
534 {
kono
parents:
diff changeset
535 abase_x = &abase[x*axstride];
kono
parents:
diff changeset
536 s = (GFC_COMPLEX_4) 0;
kono
parents:
diff changeset
537 for (n = 0; n < count; n++)
kono
parents:
diff changeset
538 s += abase_x[n] * bbase_y[n];
kono
parents:
diff changeset
539 dest_y[x] = s;
kono
parents:
diff changeset
540 }
kono
parents:
diff changeset
541 }
kono
parents:
diff changeset
542 }
kono
parents:
diff changeset
543 else
kono
parents:
diff changeset
544 {
kono
parents:
diff changeset
545 const GFC_COMPLEX_4 *restrict bbase_y;
kono
parents:
diff changeset
546 GFC_COMPLEX_4 s;
kono
parents:
diff changeset
547
kono
parents:
diff changeset
548 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
549 {
kono
parents:
diff changeset
550 bbase_y = &bbase[y*bystride];
kono
parents:
diff changeset
551 s = (GFC_COMPLEX_4) 0;
kono
parents:
diff changeset
552 for (n = 0; n < count; n++)
kono
parents:
diff changeset
553 s += abase[n*axstride] * bbase_y[n];
kono
parents:
diff changeset
554 dest[y*rystride] = s;
kono
parents:
diff changeset
555 }
kono
parents:
diff changeset
556 }
kono
parents:
diff changeset
557 }
kono
parents:
diff changeset
558 else if (axstride < aystride)
kono
parents:
diff changeset
559 {
kono
parents:
diff changeset
560 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
561 for (x = 0; x < xcount; x++)
kono
parents:
diff changeset
562 dest[x*rxstride + y*rystride] = (GFC_COMPLEX_4)0;
kono
parents:
diff changeset
563
kono
parents:
diff changeset
564 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
565 for (n = 0; n < count; n++)
kono
parents:
diff changeset
566 for (x = 0; x < xcount; x++)
kono
parents:
diff changeset
567 /* dest[x,y] += a[x,n] * b[n,y] */
kono
parents:
diff changeset
568 dest[x*rxstride + y*rystride] +=
kono
parents:
diff changeset
569 abase[x*axstride + n*aystride] *
kono
parents:
diff changeset
570 bbase[n*bxstride + y*bystride];
kono
parents:
diff changeset
571 }
kono
parents:
diff changeset
572 else if (GFC_DESCRIPTOR_RANK (a) == 1)
kono
parents:
diff changeset
573 {
kono
parents:
diff changeset
574 const GFC_COMPLEX_4 *restrict bbase_y;
kono
parents:
diff changeset
575 GFC_COMPLEX_4 s;
kono
parents:
diff changeset
576
kono
parents:
diff changeset
577 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
578 {
kono
parents:
diff changeset
579 bbase_y = &bbase[y*bystride];
kono
parents:
diff changeset
580 s = (GFC_COMPLEX_4) 0;
kono
parents:
diff changeset
581 for (n = 0; n < count; n++)
kono
parents:
diff changeset
582 s += abase[n*axstride] * bbase_y[n*bxstride];
kono
parents:
diff changeset
583 dest[y*rxstride] = s;
kono
parents:
diff changeset
584 }
kono
parents:
diff changeset
585 }
kono
parents:
diff changeset
586 else
kono
parents:
diff changeset
587 {
kono
parents:
diff changeset
588 const GFC_COMPLEX_4 *restrict abase_x;
kono
parents:
diff changeset
589 const GFC_COMPLEX_4 *restrict bbase_y;
kono
parents:
diff changeset
590 GFC_COMPLEX_4 *restrict dest_y;
kono
parents:
diff changeset
591 GFC_COMPLEX_4 s;
kono
parents:
diff changeset
592
kono
parents:
diff changeset
593 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
594 {
kono
parents:
diff changeset
595 bbase_y = &bbase[y*bystride];
kono
parents:
diff changeset
596 dest_y = &dest[y*rystride];
kono
parents:
diff changeset
597 for (x = 0; x < xcount; x++)
kono
parents:
diff changeset
598 {
kono
parents:
diff changeset
599 abase_x = &abase[x*axstride];
kono
parents:
diff changeset
600 s = (GFC_COMPLEX_4) 0;
kono
parents:
diff changeset
601 for (n = 0; n < count; n++)
kono
parents:
diff changeset
602 s += abase_x[n*aystride] * bbase_y[n*bxstride];
kono
parents:
diff changeset
603 dest_y[x*rxstride] = s;
kono
parents:
diff changeset
604 }
kono
parents:
diff changeset
605 }
kono
parents:
diff changeset
606 }
kono
parents:
diff changeset
607 }
kono
parents:
diff changeset
608 #undef POW3
kono
parents:
diff changeset
609 #undef min
kono
parents:
diff changeset
610 #undef max
kono
parents:
diff changeset
611
kono
parents:
diff changeset
612 #endif
kono
parents:
diff changeset
613
kono
parents:
diff changeset
614 #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
kono
parents:
diff changeset
615 void
kono
parents:
diff changeset
616 matmul_c4_avx128_fma4 (gfc_array_c4 * const restrict retarray,
kono
parents:
diff changeset
617 gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
kono
parents:
diff changeset
618 int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
kono
parents:
diff changeset
619 internal_proto(matmul_c4_avx128_fma4);
kono
parents:
diff changeset
620 void
kono
parents:
diff changeset
621 matmul_c4_avx128_fma4 (gfc_array_c4 * const restrict retarray,
kono
parents:
diff changeset
622 gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
kono
parents:
diff changeset
623 int blas_limit, blas_call gemm)
kono
parents:
diff changeset
624 {
kono
parents:
diff changeset
625 const GFC_COMPLEX_4 * restrict abase;
kono
parents:
diff changeset
626 const GFC_COMPLEX_4 * restrict bbase;
kono
parents:
diff changeset
627 GFC_COMPLEX_4 * restrict dest;
kono
parents:
diff changeset
628
kono
parents:
diff changeset
629 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
kono
parents:
diff changeset
630 index_type x, y, n, count, xcount, ycount;
kono
parents:
diff changeset
631
kono
parents:
diff changeset
632 assert (GFC_DESCRIPTOR_RANK (a) == 2
kono
parents:
diff changeset
633 || GFC_DESCRIPTOR_RANK (b) == 2);
kono
parents:
diff changeset
634
kono
parents:
diff changeset
635 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
kono
parents:
diff changeset
636
kono
parents:
diff changeset
637 Either A or B (but not both) can be rank 1:
kono
parents:
diff changeset
638
kono
parents:
diff changeset
639 o One-dimensional argument A is implicitly treated as a row matrix
kono
parents:
diff changeset
640 dimensioned [1,count], so xcount=1.
kono
parents:
diff changeset
641
kono
parents:
diff changeset
642 o One-dimensional argument B is implicitly treated as a column matrix
kono
parents:
diff changeset
643 dimensioned [count, 1], so ycount=1.
kono
parents:
diff changeset
644 */
kono
parents:
diff changeset
645
kono
parents:
diff changeset
646 if (retarray->base_addr == NULL)
kono
parents:
diff changeset
647 {
kono
parents:
diff changeset
648 if (GFC_DESCRIPTOR_RANK (a) == 1)
kono
parents:
diff changeset
649 {
kono
parents:
diff changeset
650 GFC_DIMENSION_SET(retarray->dim[0], 0,
kono
parents:
diff changeset
651 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
kono
parents:
diff changeset
652 }
kono
parents:
diff changeset
653 else if (GFC_DESCRIPTOR_RANK (b) == 1)
kono
parents:
diff changeset
654 {
kono
parents:
diff changeset
655 GFC_DIMENSION_SET(retarray->dim[0], 0,
kono
parents:
diff changeset
656 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
kono
parents:
diff changeset
657 }
kono
parents:
diff changeset
658 else
kono
parents:
diff changeset
659 {
kono
parents:
diff changeset
660 GFC_DIMENSION_SET(retarray->dim[0], 0,
kono
parents:
diff changeset
661 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
kono
parents:
diff changeset
662
kono
parents:
diff changeset
663 GFC_DIMENSION_SET(retarray->dim[1], 0,
kono
parents:
diff changeset
664 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
kono
parents:
diff changeset
665 GFC_DESCRIPTOR_EXTENT(retarray,0));
kono
parents:
diff changeset
666 }
kono
parents:
diff changeset
667
kono
parents:
diff changeset
668 retarray->base_addr
kono
parents:
diff changeset
669 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_4));
kono
parents:
diff changeset
670 retarray->offset = 0;
kono
parents:
diff changeset
671 }
kono
parents:
diff changeset
672 else if (unlikely (compile_options.bounds_check))
kono
parents:
diff changeset
673 {
kono
parents:
diff changeset
674 index_type ret_extent, arg_extent;
kono
parents:
diff changeset
675
kono
parents:
diff changeset
676 if (GFC_DESCRIPTOR_RANK (a) == 1)
kono
parents:
diff changeset
677 {
kono
parents:
diff changeset
678 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
kono
parents:
diff changeset
679 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
kono
parents:
diff changeset
680 if (arg_extent != ret_extent)
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
681 runtime_error ("Array bound mismatch for dimension 1 of "
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
682 "array (%ld/%ld) ",
111
kono
parents:
diff changeset
683 (long int) ret_extent, (long int) arg_extent);
kono
parents:
diff changeset
684 }
kono
parents:
diff changeset
685 else if (GFC_DESCRIPTOR_RANK (b) == 1)
kono
parents:
diff changeset
686 {
kono
parents:
diff changeset
687 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
kono
parents:
diff changeset
688 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
kono
parents:
diff changeset
689 if (arg_extent != ret_extent)
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
690 runtime_error ("Array bound mismatch for dimension 1 of "
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
691 "array (%ld/%ld) ",
111
kono
parents:
diff changeset
692 (long int) ret_extent, (long int) arg_extent);
kono
parents:
diff changeset
693 }
kono
parents:
diff changeset
694 else
kono
parents:
diff changeset
695 {
kono
parents:
diff changeset
696 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
kono
parents:
diff changeset
697 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
kono
parents:
diff changeset
698 if (arg_extent != ret_extent)
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
699 runtime_error ("Array bound mismatch for dimension 1 of "
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
700 "array (%ld/%ld) ",
111
kono
parents:
diff changeset
701 (long int) ret_extent, (long int) arg_extent);
kono
parents:
diff changeset
702
kono
parents:
diff changeset
703 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
kono
parents:
diff changeset
704 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
kono
parents:
diff changeset
705 if (arg_extent != ret_extent)
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
706 runtime_error ("Array bound mismatch for dimension 2 of "
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
707 "array (%ld/%ld) ",
111
kono
parents:
diff changeset
708 (long int) ret_extent, (long int) arg_extent);
kono
parents:
diff changeset
709 }
kono
parents:
diff changeset
710 }
kono
parents:
diff changeset
711
kono
parents:
diff changeset
712
kono
parents:
diff changeset
713 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
kono
parents:
diff changeset
714 {
kono
parents:
diff changeset
715 /* One-dimensional result may be addressed in the code below
kono
parents:
diff changeset
716 either as a row or a column matrix. We want both cases to
kono
parents:
diff changeset
717 work. */
kono
parents:
diff changeset
718 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
kono
parents:
diff changeset
719 }
kono
parents:
diff changeset
720 else
kono
parents:
diff changeset
721 {
kono
parents:
diff changeset
722 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
kono
parents:
diff changeset
723 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
kono
parents:
diff changeset
724 }
kono
parents:
diff changeset
725
kono
parents:
diff changeset
726
kono
parents:
diff changeset
727 if (GFC_DESCRIPTOR_RANK (a) == 1)
kono
parents:
diff changeset
728 {
kono
parents:
diff changeset
729 /* Treat it as a a row matrix A[1,count]. */
kono
parents:
diff changeset
730 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
kono
parents:
diff changeset
731 aystride = 1;
kono
parents:
diff changeset
732
kono
parents:
diff changeset
733 xcount = 1;
kono
parents:
diff changeset
734 count = GFC_DESCRIPTOR_EXTENT(a,0);
kono
parents:
diff changeset
735 }
kono
parents:
diff changeset
736 else
kono
parents:
diff changeset
737 {
kono
parents:
diff changeset
738 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
kono
parents:
diff changeset
739 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
kono
parents:
diff changeset
740
kono
parents:
diff changeset
741 count = GFC_DESCRIPTOR_EXTENT(a,1);
kono
parents:
diff changeset
742 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
kono
parents:
diff changeset
743 }
kono
parents:
diff changeset
744
kono
parents:
diff changeset
745 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
kono
parents:
diff changeset
746 {
kono
parents:
diff changeset
747 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
748 runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
749 "in dimension 1: is %ld, should be %ld",
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
750 (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
111
kono
parents:
diff changeset
751 }
kono
parents:
diff changeset
752
kono
parents:
diff changeset
753 if (GFC_DESCRIPTOR_RANK (b) == 1)
kono
parents:
diff changeset
754 {
kono
parents:
diff changeset
755 /* Treat it as a column matrix B[count,1] */
kono
parents:
diff changeset
756 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
kono
parents:
diff changeset
757
kono
parents:
diff changeset
758 /* bystride should never be used for 1-dimensional b.
kono
parents:
diff changeset
759 The value is only used for calculation of the
kono
parents:
diff changeset
760 memory by the buffer. */
kono
parents:
diff changeset
761 bystride = 256;
kono
parents:
diff changeset
762 ycount = 1;
kono
parents:
diff changeset
763 }
kono
parents:
diff changeset
764 else
kono
parents:
diff changeset
765 {
kono
parents:
diff changeset
766 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
kono
parents:
diff changeset
767 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
kono
parents:
diff changeset
768 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
kono
parents:
diff changeset
769 }
kono
parents:
diff changeset
770
kono
parents:
diff changeset
771 abase = a->base_addr;
kono
parents:
diff changeset
772 bbase = b->base_addr;
kono
parents:
diff changeset
773 dest = retarray->base_addr;
kono
parents:
diff changeset
774
kono
parents:
diff changeset
775 /* Now that everything is set up, we perform the multiplication
kono
parents:
diff changeset
776 itself. */
kono
parents:
diff changeset
777
kono
parents:
diff changeset
778 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
kono
parents:
diff changeset
779 #define min(a,b) ((a) <= (b) ? (a) : (b))
kono
parents:
diff changeset
780 #define max(a,b) ((a) >= (b) ? (a) : (b))
kono
parents:
diff changeset
781
kono
parents:
diff changeset
782 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
kono
parents:
diff changeset
783 && (bxstride == 1 || bystride == 1)
kono
parents:
diff changeset
784 && (((float) xcount) * ((float) ycount) * ((float) count)
kono
parents:
diff changeset
785 > POW3(blas_limit)))
kono
parents:
diff changeset
786 {
kono
parents:
diff changeset
787 const int m = xcount, n = ycount, k = count, ldc = rystride;
kono
parents:
diff changeset
788 const GFC_COMPLEX_4 one = 1, zero = 0;
kono
parents:
diff changeset
789 const int lda = (axstride == 1) ? aystride : axstride,
kono
parents:
diff changeset
790 ldb = (bxstride == 1) ? bystride : bxstride;
kono
parents:
diff changeset
791
kono
parents:
diff changeset
792 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
kono
parents:
diff changeset
793 {
kono
parents:
diff changeset
794 assert (gemm != NULL);
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
795 const char *transa, *transb;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
796 if (try_blas & 2)
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
797 transa = "C";
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
798 else
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
799 transa = axstride == 1 ? "N" : "T";
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
800
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
801 if (try_blas & 4)
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
802 transb = "C";
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
803 else
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
804 transb = bxstride == 1 ? "N" : "T";
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
805
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
806 gemm (transa, transb , &m,
111
kono
parents:
diff changeset
807 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
kono
parents:
diff changeset
808 &ldc, 1, 1);
kono
parents:
diff changeset
809 return;
kono
parents:
diff changeset
810 }
kono
parents:
diff changeset
811 }
kono
parents:
diff changeset
812
kono
parents:
diff changeset
813 if (rxstride == 1 && axstride == 1 && bxstride == 1)
kono
parents:
diff changeset
814 {
kono
parents:
diff changeset
815 /* This block of code implements a tuned matmul, derived from
kono
parents:
diff changeset
816 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
kono
parents:
diff changeset
817
kono
parents:
diff changeset
818 Bo Kagstrom and Per Ling
kono
parents:
diff changeset
819 Department of Computing Science
kono
parents:
diff changeset
820 Umea University
kono
parents:
diff changeset
821 S-901 87 Umea, Sweden
kono
parents:
diff changeset
822
kono
parents:
diff changeset
823 from netlib.org, translated to C, and modified for matmul.m4. */
kono
parents:
diff changeset
824
kono
parents:
diff changeset
825 const GFC_COMPLEX_4 *a, *b;
kono
parents:
diff changeset
826 GFC_COMPLEX_4 *c;
kono
parents:
diff changeset
827 const index_type m = xcount, n = ycount, k = count;
kono
parents:
diff changeset
828
kono
parents:
diff changeset
829 /* System generated locals */
kono
parents:
diff changeset
830 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
kono
parents:
diff changeset
831 i1, i2, i3, i4, i5, i6;
kono
parents:
diff changeset
832
kono
parents:
diff changeset
833 /* Local variables */
kono
parents:
diff changeset
834 GFC_COMPLEX_4 f11, f12, f21, f22, f31, f32, f41, f42,
kono
parents:
diff changeset
835 f13, f14, f23, f24, f33, f34, f43, f44;
kono
parents:
diff changeset
836 index_type i, j, l, ii, jj, ll;
kono
parents:
diff changeset
837 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
kono
parents:
diff changeset
838 GFC_COMPLEX_4 *t1;
kono
parents:
diff changeset
839
kono
parents:
diff changeset
840 a = abase;
kono
parents:
diff changeset
841 b = bbase;
kono
parents:
diff changeset
842 c = retarray->base_addr;
kono
parents:
diff changeset
843
kono
parents:
diff changeset
844 /* Parameter adjustments */
kono
parents:
diff changeset
845 c_dim1 = rystride;
kono
parents:
diff changeset
846 c_offset = 1 + c_dim1;
kono
parents:
diff changeset
847 c -= c_offset;
kono
parents:
diff changeset
848 a_dim1 = aystride;
kono
parents:
diff changeset
849 a_offset = 1 + a_dim1;
kono
parents:
diff changeset
850 a -= a_offset;
kono
parents:
diff changeset
851 b_dim1 = bystride;
kono
parents:
diff changeset
852 b_offset = 1 + b_dim1;
kono
parents:
diff changeset
853 b -= b_offset;
kono
parents:
diff changeset
854
kono
parents:
diff changeset
855 /* Empty c first. */
kono
parents:
diff changeset
856 for (j=1; j<=n; j++)
kono
parents:
diff changeset
857 for (i=1; i<=m; i++)
kono
parents:
diff changeset
858 c[i + j * c_dim1] = (GFC_COMPLEX_4)0;
kono
parents:
diff changeset
859
kono
parents:
diff changeset
860 /* Early exit if possible */
kono
parents:
diff changeset
861 if (m == 0 || n == 0 || k == 0)
kono
parents:
diff changeset
862 return;
kono
parents:
diff changeset
863
kono
parents:
diff changeset
864 /* Adjust size of t1 to what is needed. */
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
865 index_type t1_dim, a_sz;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
866 if (aystride == 1)
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
867 a_sz = rystride;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
868 else
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
869 a_sz = a_dim1;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
870
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
871 t1_dim = a_sz * 256 + b_dim1;
111
kono
parents:
diff changeset
872 if (t1_dim > 65536)
kono
parents:
diff changeset
873 t1_dim = 65536;
kono
parents:
diff changeset
874
kono
parents:
diff changeset
875 t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_4));
kono
parents:
diff changeset
876
kono
parents:
diff changeset
877 /* Start turning the crank. */
kono
parents:
diff changeset
878 i1 = n;
kono
parents:
diff changeset
879 for (jj = 1; jj <= i1; jj += 512)
kono
parents:
diff changeset
880 {
kono
parents:
diff changeset
881 /* Computing MIN */
kono
parents:
diff changeset
882 i2 = 512;
kono
parents:
diff changeset
883 i3 = n - jj + 1;
kono
parents:
diff changeset
884 jsec = min(i2,i3);
kono
parents:
diff changeset
885 ujsec = jsec - jsec % 4;
kono
parents:
diff changeset
886 i2 = k;
kono
parents:
diff changeset
887 for (ll = 1; ll <= i2; ll += 256)
kono
parents:
diff changeset
888 {
kono
parents:
diff changeset
889 /* Computing MIN */
kono
parents:
diff changeset
890 i3 = 256;
kono
parents:
diff changeset
891 i4 = k - ll + 1;
kono
parents:
diff changeset
892 lsec = min(i3,i4);
kono
parents:
diff changeset
893 ulsec = lsec - lsec % 2;
kono
parents:
diff changeset
894
kono
parents:
diff changeset
895 i3 = m;
kono
parents:
diff changeset
896 for (ii = 1; ii <= i3; ii += 256)
kono
parents:
diff changeset
897 {
kono
parents:
diff changeset
898 /* Computing MIN */
kono
parents:
diff changeset
899 i4 = 256;
kono
parents:
diff changeset
900 i5 = m - ii + 1;
kono
parents:
diff changeset
901 isec = min(i4,i5);
kono
parents:
diff changeset
902 uisec = isec - isec % 2;
kono
parents:
diff changeset
903 i4 = ll + ulsec - 1;
kono
parents:
diff changeset
904 for (l = ll; l <= i4; l += 2)
kono
parents:
diff changeset
905 {
kono
parents:
diff changeset
906 i5 = ii + uisec - 1;
kono
parents:
diff changeset
907 for (i = ii; i <= i5; i += 2)
kono
parents:
diff changeset
908 {
kono
parents:
diff changeset
909 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
kono
parents:
diff changeset
910 a[i + l * a_dim1];
kono
parents:
diff changeset
911 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
kono
parents:
diff changeset
912 a[i + (l + 1) * a_dim1];
kono
parents:
diff changeset
913 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
kono
parents:
diff changeset
914 a[i + 1 + l * a_dim1];
kono
parents:
diff changeset
915 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
kono
parents:
diff changeset
916 a[i + 1 + (l + 1) * a_dim1];
kono
parents:
diff changeset
917 }
kono
parents:
diff changeset
918 if (uisec < isec)
kono
parents:
diff changeset
919 {
kono
parents:
diff changeset
920 t1[l - ll + 1 + (isec << 8) - 257] =
kono
parents:
diff changeset
921 a[ii + isec - 1 + l * a_dim1];
kono
parents:
diff changeset
922 t1[l - ll + 2 + (isec << 8) - 257] =
kono
parents:
diff changeset
923 a[ii + isec - 1 + (l + 1) * a_dim1];
kono
parents:
diff changeset
924 }
kono
parents:
diff changeset
925 }
kono
parents:
diff changeset
926 if (ulsec < lsec)
kono
parents:
diff changeset
927 {
kono
parents:
diff changeset
928 i4 = ii + isec - 1;
kono
parents:
diff changeset
929 for (i = ii; i<= i4; ++i)
kono
parents:
diff changeset
930 {
kono
parents:
diff changeset
931 t1[lsec + ((i - ii + 1) << 8) - 257] =
kono
parents:
diff changeset
932 a[i + (ll + lsec - 1) * a_dim1];
kono
parents:
diff changeset
933 }
kono
parents:
diff changeset
934 }
kono
parents:
diff changeset
935
kono
parents:
diff changeset
936 uisec = isec - isec % 4;
kono
parents:
diff changeset
937 i4 = jj + ujsec - 1;
kono
parents:
diff changeset
938 for (j = jj; j <= i4; j += 4)
kono
parents:
diff changeset
939 {
kono
parents:
diff changeset
940 i5 = ii + uisec - 1;
kono
parents:
diff changeset
941 for (i = ii; i <= i5; i += 4)
kono
parents:
diff changeset
942 {
kono
parents:
diff changeset
943 f11 = c[i + j * c_dim1];
kono
parents:
diff changeset
944 f21 = c[i + 1 + j * c_dim1];
kono
parents:
diff changeset
945 f12 = c[i + (j + 1) * c_dim1];
kono
parents:
diff changeset
946 f22 = c[i + 1 + (j + 1) * c_dim1];
kono
parents:
diff changeset
947 f13 = c[i + (j + 2) * c_dim1];
kono
parents:
diff changeset
948 f23 = c[i + 1 + (j + 2) * c_dim1];
kono
parents:
diff changeset
949 f14 = c[i + (j + 3) * c_dim1];
kono
parents:
diff changeset
950 f24 = c[i + 1 + (j + 3) * c_dim1];
kono
parents:
diff changeset
951 f31 = c[i + 2 + j * c_dim1];
kono
parents:
diff changeset
952 f41 = c[i + 3 + j * c_dim1];
kono
parents:
diff changeset
953 f32 = c[i + 2 + (j + 1) * c_dim1];
kono
parents:
diff changeset
954 f42 = c[i + 3 + (j + 1) * c_dim1];
kono
parents:
diff changeset
955 f33 = c[i + 2 + (j + 2) * c_dim1];
kono
parents:
diff changeset
956 f43 = c[i + 3 + (j + 2) * c_dim1];
kono
parents:
diff changeset
957 f34 = c[i + 2 + (j + 3) * c_dim1];
kono
parents:
diff changeset
958 f44 = c[i + 3 + (j + 3) * c_dim1];
kono
parents:
diff changeset
959 i6 = ll + lsec - 1;
kono
parents:
diff changeset
960 for (l = ll; l <= i6; ++l)
kono
parents:
diff changeset
961 {
kono
parents:
diff changeset
962 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
kono
parents:
diff changeset
963 * b[l + j * b_dim1];
kono
parents:
diff changeset
964 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
kono
parents:
diff changeset
965 * b[l + j * b_dim1];
kono
parents:
diff changeset
966 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
kono
parents:
diff changeset
967 * b[l + (j + 1) * b_dim1];
kono
parents:
diff changeset
968 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
kono
parents:
diff changeset
969 * b[l + (j + 1) * b_dim1];
kono
parents:
diff changeset
970 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
kono
parents:
diff changeset
971 * b[l + (j + 2) * b_dim1];
kono
parents:
diff changeset
972 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
kono
parents:
diff changeset
973 * b[l + (j + 2) * b_dim1];
kono
parents:
diff changeset
974 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
kono
parents:
diff changeset
975 * b[l + (j + 3) * b_dim1];
kono
parents:
diff changeset
976 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
kono
parents:
diff changeset
977 * b[l + (j + 3) * b_dim1];
kono
parents:
diff changeset
978 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
kono
parents:
diff changeset
979 * b[l + j * b_dim1];
kono
parents:
diff changeset
980 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
kono
parents:
diff changeset
981 * b[l + j * b_dim1];
kono
parents:
diff changeset
982 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
kono
parents:
diff changeset
983 * b[l + (j + 1) * b_dim1];
kono
parents:
diff changeset
984 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
kono
parents:
diff changeset
985 * b[l + (j + 1) * b_dim1];
kono
parents:
diff changeset
986 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
kono
parents:
diff changeset
987 * b[l + (j + 2) * b_dim1];
kono
parents:
diff changeset
988 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
kono
parents:
diff changeset
989 * b[l + (j + 2) * b_dim1];
kono
parents:
diff changeset
990 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
kono
parents:
diff changeset
991 * b[l + (j + 3) * b_dim1];
kono
parents:
diff changeset
992 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
kono
parents:
diff changeset
993 * b[l + (j + 3) * b_dim1];
kono
parents:
diff changeset
994 }
kono
parents:
diff changeset
995 c[i + j * c_dim1] = f11;
kono
parents:
diff changeset
996 c[i + 1 + j * c_dim1] = f21;
kono
parents:
diff changeset
997 c[i + (j + 1) * c_dim1] = f12;
kono
parents:
diff changeset
998 c[i + 1 + (j + 1) * c_dim1] = f22;
kono
parents:
diff changeset
999 c[i + (j + 2) * c_dim1] = f13;
kono
parents:
diff changeset
1000 c[i + 1 + (j + 2) * c_dim1] = f23;
kono
parents:
diff changeset
1001 c[i + (j + 3) * c_dim1] = f14;
kono
parents:
diff changeset
1002 c[i + 1 + (j + 3) * c_dim1] = f24;
kono
parents:
diff changeset
1003 c[i + 2 + j * c_dim1] = f31;
kono
parents:
diff changeset
1004 c[i + 3 + j * c_dim1] = f41;
kono
parents:
diff changeset
1005 c[i + 2 + (j + 1) * c_dim1] = f32;
kono
parents:
diff changeset
1006 c[i + 3 + (j + 1) * c_dim1] = f42;
kono
parents:
diff changeset
1007 c[i + 2 + (j + 2) * c_dim1] = f33;
kono
parents:
diff changeset
1008 c[i + 3 + (j + 2) * c_dim1] = f43;
kono
parents:
diff changeset
1009 c[i + 2 + (j + 3) * c_dim1] = f34;
kono
parents:
diff changeset
1010 c[i + 3 + (j + 3) * c_dim1] = f44;
kono
parents:
diff changeset
1011 }
kono
parents:
diff changeset
1012 if (uisec < isec)
kono
parents:
diff changeset
1013 {
kono
parents:
diff changeset
1014 i5 = ii + isec - 1;
kono
parents:
diff changeset
1015 for (i = ii + uisec; i <= i5; ++i)
kono
parents:
diff changeset
1016 {
kono
parents:
diff changeset
1017 f11 = c[i + j * c_dim1];
kono
parents:
diff changeset
1018 f12 = c[i + (j + 1) * c_dim1];
kono
parents:
diff changeset
1019 f13 = c[i + (j + 2) * c_dim1];
kono
parents:
diff changeset
1020 f14 = c[i + (j + 3) * c_dim1];
kono
parents:
diff changeset
1021 i6 = ll + lsec - 1;
kono
parents:
diff changeset
1022 for (l = ll; l <= i6; ++l)
kono
parents:
diff changeset
1023 {
kono
parents:
diff changeset
1024 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
1025 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
1026 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
1027 257] * b[l + (j + 1) * b_dim1];
kono
parents:
diff changeset
1028 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
1029 257] * b[l + (j + 2) * b_dim1];
kono
parents:
diff changeset
1030 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
1031 257] * b[l + (j + 3) * b_dim1];
kono
parents:
diff changeset
1032 }
kono
parents:
diff changeset
1033 c[i + j * c_dim1] = f11;
kono
parents:
diff changeset
1034 c[i + (j + 1) * c_dim1] = f12;
kono
parents:
diff changeset
1035 c[i + (j + 2) * c_dim1] = f13;
kono
parents:
diff changeset
1036 c[i + (j + 3) * c_dim1] = f14;
kono
parents:
diff changeset
1037 }
kono
parents:
diff changeset
1038 }
kono
parents:
diff changeset
1039 }
kono
parents:
diff changeset
1040 if (ujsec < jsec)
kono
parents:
diff changeset
1041 {
kono
parents:
diff changeset
1042 i4 = jj + jsec - 1;
kono
parents:
diff changeset
1043 for (j = jj + ujsec; j <= i4; ++j)
kono
parents:
diff changeset
1044 {
kono
parents:
diff changeset
1045 i5 = ii + uisec - 1;
kono
parents:
diff changeset
1046 for (i = ii; i <= i5; i += 4)
kono
parents:
diff changeset
1047 {
kono
parents:
diff changeset
1048 f11 = c[i + j * c_dim1];
kono
parents:
diff changeset
1049 f21 = c[i + 1 + j * c_dim1];
kono
parents:
diff changeset
1050 f31 = c[i + 2 + j * c_dim1];
kono
parents:
diff changeset
1051 f41 = c[i + 3 + j * c_dim1];
kono
parents:
diff changeset
1052 i6 = ll + lsec - 1;
kono
parents:
diff changeset
1053 for (l = ll; l <= i6; ++l)
kono
parents:
diff changeset
1054 {
kono
parents:
diff changeset
1055 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
1056 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
1057 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
kono
parents:
diff changeset
1058 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
1059 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
kono
parents:
diff changeset
1060 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
1061 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
kono
parents:
diff changeset
1062 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
1063 }
kono
parents:
diff changeset
1064 c[i + j * c_dim1] = f11;
kono
parents:
diff changeset
1065 c[i + 1 + j * c_dim1] = f21;
kono
parents:
diff changeset
1066 c[i + 2 + j * c_dim1] = f31;
kono
parents:
diff changeset
1067 c[i + 3 + j * c_dim1] = f41;
kono
parents:
diff changeset
1068 }
kono
parents:
diff changeset
1069 i5 = ii + isec - 1;
kono
parents:
diff changeset
1070 for (i = ii + uisec; i <= i5; ++i)
kono
parents:
diff changeset
1071 {
kono
parents:
diff changeset
1072 f11 = c[i + j * c_dim1];
kono
parents:
diff changeset
1073 i6 = ll + lsec - 1;
kono
parents:
diff changeset
1074 for (l = ll; l <= i6; ++l)
kono
parents:
diff changeset
1075 {
kono
parents:
diff changeset
1076 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
1077 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
1078 }
kono
parents:
diff changeset
1079 c[i + j * c_dim1] = f11;
kono
parents:
diff changeset
1080 }
kono
parents:
diff changeset
1081 }
kono
parents:
diff changeset
1082 }
kono
parents:
diff changeset
1083 }
kono
parents:
diff changeset
1084 }
kono
parents:
diff changeset
1085 }
kono
parents:
diff changeset
1086 free(t1);
kono
parents:
diff changeset
1087 return;
kono
parents:
diff changeset
1088 }
kono
parents:
diff changeset
1089 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
kono
parents:
diff changeset
1090 {
kono
parents:
diff changeset
1091 if (GFC_DESCRIPTOR_RANK (a) != 1)
kono
parents:
diff changeset
1092 {
kono
parents:
diff changeset
1093 const GFC_COMPLEX_4 *restrict abase_x;
kono
parents:
diff changeset
1094 const GFC_COMPLEX_4 *restrict bbase_y;
kono
parents:
diff changeset
1095 GFC_COMPLEX_4 *restrict dest_y;
kono
parents:
diff changeset
1096 GFC_COMPLEX_4 s;
kono
parents:
diff changeset
1097
kono
parents:
diff changeset
1098 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
1099 {
kono
parents:
diff changeset
1100 bbase_y = &bbase[y*bystride];
kono
parents:
diff changeset
1101 dest_y = &dest[y*rystride];
kono
parents:
diff changeset
1102 for (x = 0; x < xcount; x++)
kono
parents:
diff changeset
1103 {
kono
parents:
diff changeset
1104 abase_x = &abase[x*axstride];
kono
parents:
diff changeset
1105 s = (GFC_COMPLEX_4) 0;
kono
parents:
diff changeset
1106 for (n = 0; n < count; n++)
kono
parents:
diff changeset
1107 s += abase_x[n] * bbase_y[n];
kono
parents:
diff changeset
1108 dest_y[x] = s;
kono
parents:
diff changeset
1109 }
kono
parents:
diff changeset
1110 }
kono
parents:
diff changeset
1111 }
kono
parents:
diff changeset
1112 else
kono
parents:
diff changeset
1113 {
kono
parents:
diff changeset
1114 const GFC_COMPLEX_4 *restrict bbase_y;
kono
parents:
diff changeset
1115 GFC_COMPLEX_4 s;
kono
parents:
diff changeset
1116
kono
parents:
diff changeset
1117 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
1118 {
kono
parents:
diff changeset
1119 bbase_y = &bbase[y*bystride];
kono
parents:
diff changeset
1120 s = (GFC_COMPLEX_4) 0;
kono
parents:
diff changeset
1121 for (n = 0; n < count; n++)
kono
parents:
diff changeset
1122 s += abase[n*axstride] * bbase_y[n];
kono
parents:
diff changeset
1123 dest[y*rystride] = s;
kono
parents:
diff changeset
1124 }
kono
parents:
diff changeset
1125 }
kono
parents:
diff changeset
1126 }
kono
parents:
diff changeset
1127 else if (axstride < aystride)
kono
parents:
diff changeset
1128 {
kono
parents:
diff changeset
1129 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
1130 for (x = 0; x < xcount; x++)
kono
parents:
diff changeset
1131 dest[x*rxstride + y*rystride] = (GFC_COMPLEX_4)0;
kono
parents:
diff changeset
1132
kono
parents:
diff changeset
1133 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
1134 for (n = 0; n < count; n++)
kono
parents:
diff changeset
1135 for (x = 0; x < xcount; x++)
kono
parents:
diff changeset
1136 /* dest[x,y] += a[x,n] * b[n,y] */
kono
parents:
diff changeset
1137 dest[x*rxstride + y*rystride] +=
kono
parents:
diff changeset
1138 abase[x*axstride + n*aystride] *
kono
parents:
diff changeset
1139 bbase[n*bxstride + y*bystride];
kono
parents:
diff changeset
1140 }
kono
parents:
diff changeset
1141 else if (GFC_DESCRIPTOR_RANK (a) == 1)
kono
parents:
diff changeset
1142 {
kono
parents:
diff changeset
1143 const GFC_COMPLEX_4 *restrict bbase_y;
kono
parents:
diff changeset
1144 GFC_COMPLEX_4 s;
kono
parents:
diff changeset
1145
kono
parents:
diff changeset
1146 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
1147 {
kono
parents:
diff changeset
1148 bbase_y = &bbase[y*bystride];
kono
parents:
diff changeset
1149 s = (GFC_COMPLEX_4) 0;
kono
parents:
diff changeset
1150 for (n = 0; n < count; n++)
kono
parents:
diff changeset
1151 s += abase[n*axstride] * bbase_y[n*bxstride];
kono
parents:
diff changeset
1152 dest[y*rxstride] = s;
kono
parents:
diff changeset
1153 }
kono
parents:
diff changeset
1154 }
kono
parents:
diff changeset
1155 else
kono
parents:
diff changeset
1156 {
kono
parents:
diff changeset
1157 const GFC_COMPLEX_4 *restrict abase_x;
kono
parents:
diff changeset
1158 const GFC_COMPLEX_4 *restrict bbase_y;
kono
parents:
diff changeset
1159 GFC_COMPLEX_4 *restrict dest_y;
kono
parents:
diff changeset
1160 GFC_COMPLEX_4 s;
kono
parents:
diff changeset
1161
kono
parents:
diff changeset
1162 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
1163 {
kono
parents:
diff changeset
1164 bbase_y = &bbase[y*bystride];
kono
parents:
diff changeset
1165 dest_y = &dest[y*rystride];
kono
parents:
diff changeset
1166 for (x = 0; x < xcount; x++)
kono
parents:
diff changeset
1167 {
kono
parents:
diff changeset
1168 abase_x = &abase[x*axstride];
kono
parents:
diff changeset
1169 s = (GFC_COMPLEX_4) 0;
kono
parents:
diff changeset
1170 for (n = 0; n < count; n++)
kono
parents:
diff changeset
1171 s += abase_x[n*aystride] * bbase_y[n*bxstride];
kono
parents:
diff changeset
1172 dest_y[x*rxstride] = s;
kono
parents:
diff changeset
1173 }
kono
parents:
diff changeset
1174 }
kono
parents:
diff changeset
1175 }
kono
parents:
diff changeset
1176 }
kono
parents:
diff changeset
1177 #undef POW3
kono
parents:
diff changeset
1178 #undef min
kono
parents:
diff changeset
1179 #undef max
kono
parents:
diff changeset
1180
kono
parents:
diff changeset
1181 #endif
kono
parents:
diff changeset
1182
kono
parents:
diff changeset
1183 #endif
kono
parents:
diff changeset
1184