annotate libgfortran/generated/matmulavx128_c4.c @ 111:04ced10e8804

gcc 7
author kono
date Fri, 27 Oct 2017 22:46:09 +0900
parents
children 84e7813d76e9
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
111
kono
parents:
diff changeset
1 /* Implementation of the MATMUL intrinsic
kono
parents:
diff changeset
2 Copyright (C) 2002-2017 Free Software Foundation, Inc.
kono
parents:
diff changeset
3 Contributed by Thomas Koenig <tkoenig@gcc.gnu.org>.
kono
parents:
diff changeset
4
kono
parents:
diff changeset
5 This file is part of the GNU Fortran runtime library (libgfortran).
kono
parents:
diff changeset
6
kono
parents:
diff changeset
7 Libgfortran is free software; you can redistribute it and/or
kono
parents:
diff changeset
8 modify it under the terms of the GNU General Public
kono
parents:
diff changeset
9 License as published by the Free Software Foundation; either
kono
parents:
diff changeset
10 version 3 of the License, or (at your option) any later version.
kono
parents:
diff changeset
11
kono
parents:
diff changeset
12 Libgfortran is distributed in the hope that it will be useful,
kono
parents:
diff changeset
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
kono
parents:
diff changeset
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
kono
parents:
diff changeset
15 GNU General Public License for more details.
kono
parents:
diff changeset
16
kono
parents:
diff changeset
17 Under Section 7 of GPL version 3, you are granted additional
kono
parents:
diff changeset
18 permissions described in the GCC Runtime Library Exception, version
kono
parents:
diff changeset
19 3.1, as published by the Free Software Foundation.
kono
parents:
diff changeset
20
kono
parents:
diff changeset
21 You should have received a copy of the GNU General Public License and
kono
parents:
diff changeset
22 a copy of the GCC Runtime Library Exception along with this program;
kono
parents:
diff changeset
23 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
kono
parents:
diff changeset
24 <http://www.gnu.org/licenses/>. */
kono
parents:
diff changeset
25
kono
parents:
diff changeset
26 #include "libgfortran.h"
kono
parents:
diff changeset
27 #include <string.h>
kono
parents:
diff changeset
28 #include <assert.h>
kono
parents:
diff changeset
29
kono
parents:
diff changeset
30
kono
parents:
diff changeset
31 /* These are the specific versions of matmul with -mprefer-avx128. */
kono
parents:
diff changeset
32
kono
parents:
diff changeset
33 #if defined (HAVE_GFC_COMPLEX_4)
kono
parents:
diff changeset
34
kono
parents:
diff changeset
35 /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
kono
parents:
diff changeset
36 passed to us by the front-end, in which case we call it for large
kono
parents:
diff changeset
37 matrices. */
kono
parents:
diff changeset
38
kono
parents:
diff changeset
39 typedef void (*blas_call)(const char *, const char *, const int *, const int *,
kono
parents:
diff changeset
40 const int *, const GFC_COMPLEX_4 *, const GFC_COMPLEX_4 *,
kono
parents:
diff changeset
41 const int *, const GFC_COMPLEX_4 *, const int *,
kono
parents:
diff changeset
42 const GFC_COMPLEX_4 *, GFC_COMPLEX_4 *, const int *,
kono
parents:
diff changeset
43 int, int);
kono
parents:
diff changeset
44
kono
parents:
diff changeset
45 #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
kono
parents:
diff changeset
46 void
kono
parents:
diff changeset
47 matmul_c4_avx128_fma3 (gfc_array_c4 * const restrict retarray,
kono
parents:
diff changeset
48 gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
kono
parents:
diff changeset
49 int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
kono
parents:
diff changeset
50 internal_proto(matmul_c4_avx128_fma3);
kono
parents:
diff changeset
51 void
kono
parents:
diff changeset
52 matmul_c4_avx128_fma3 (gfc_array_c4 * const restrict retarray,
kono
parents:
diff changeset
53 gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
kono
parents:
diff changeset
54 int blas_limit, blas_call gemm)
kono
parents:
diff changeset
55 {
kono
parents:
diff changeset
56 const GFC_COMPLEX_4 * restrict abase;
kono
parents:
diff changeset
57 const GFC_COMPLEX_4 * restrict bbase;
kono
parents:
diff changeset
58 GFC_COMPLEX_4 * restrict dest;
kono
parents:
diff changeset
59
kono
parents:
diff changeset
60 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
kono
parents:
diff changeset
61 index_type x, y, n, count, xcount, ycount;
kono
parents:
diff changeset
62
kono
parents:
diff changeset
63 assert (GFC_DESCRIPTOR_RANK (a) == 2
kono
parents:
diff changeset
64 || GFC_DESCRIPTOR_RANK (b) == 2);
kono
parents:
diff changeset
65
kono
parents:
diff changeset
66 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
kono
parents:
diff changeset
67
kono
parents:
diff changeset
68 Either A or B (but not both) can be rank 1:
kono
parents:
diff changeset
69
kono
parents:
diff changeset
70 o One-dimensional argument A is implicitly treated as a row matrix
kono
parents:
diff changeset
71 dimensioned [1,count], so xcount=1.
kono
parents:
diff changeset
72
kono
parents:
diff changeset
73 o One-dimensional argument B is implicitly treated as a column matrix
kono
parents:
diff changeset
74 dimensioned [count, 1], so ycount=1.
kono
parents:
diff changeset
75 */
kono
parents:
diff changeset
76
kono
parents:
diff changeset
77 if (retarray->base_addr == NULL)
kono
parents:
diff changeset
78 {
kono
parents:
diff changeset
79 if (GFC_DESCRIPTOR_RANK (a) == 1)
kono
parents:
diff changeset
80 {
kono
parents:
diff changeset
81 GFC_DIMENSION_SET(retarray->dim[0], 0,
kono
parents:
diff changeset
82 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
kono
parents:
diff changeset
83 }
kono
parents:
diff changeset
84 else if (GFC_DESCRIPTOR_RANK (b) == 1)
kono
parents:
diff changeset
85 {
kono
parents:
diff changeset
86 GFC_DIMENSION_SET(retarray->dim[0], 0,
kono
parents:
diff changeset
87 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
kono
parents:
diff changeset
88 }
kono
parents:
diff changeset
89 else
kono
parents:
diff changeset
90 {
kono
parents:
diff changeset
91 GFC_DIMENSION_SET(retarray->dim[0], 0,
kono
parents:
diff changeset
92 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
kono
parents:
diff changeset
93
kono
parents:
diff changeset
94 GFC_DIMENSION_SET(retarray->dim[1], 0,
kono
parents:
diff changeset
95 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
kono
parents:
diff changeset
96 GFC_DESCRIPTOR_EXTENT(retarray,0));
kono
parents:
diff changeset
97 }
kono
parents:
diff changeset
98
kono
parents:
diff changeset
99 retarray->base_addr
kono
parents:
diff changeset
100 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_4));
kono
parents:
diff changeset
101 retarray->offset = 0;
kono
parents:
diff changeset
102 }
kono
parents:
diff changeset
103 else if (unlikely (compile_options.bounds_check))
kono
parents:
diff changeset
104 {
kono
parents:
diff changeset
105 index_type ret_extent, arg_extent;
kono
parents:
diff changeset
106
kono
parents:
diff changeset
107 if (GFC_DESCRIPTOR_RANK (a) == 1)
kono
parents:
diff changeset
108 {
kono
parents:
diff changeset
109 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
kono
parents:
diff changeset
110 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
kono
parents:
diff changeset
111 if (arg_extent != ret_extent)
kono
parents:
diff changeset
112 runtime_error ("Incorrect extent in return array in"
kono
parents:
diff changeset
113 " MATMUL intrinsic: is %ld, should be %ld",
kono
parents:
diff changeset
114 (long int) ret_extent, (long int) arg_extent);
kono
parents:
diff changeset
115 }
kono
parents:
diff changeset
116 else if (GFC_DESCRIPTOR_RANK (b) == 1)
kono
parents:
diff changeset
117 {
kono
parents:
diff changeset
118 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
kono
parents:
diff changeset
119 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
kono
parents:
diff changeset
120 if (arg_extent != ret_extent)
kono
parents:
diff changeset
121 runtime_error ("Incorrect extent in return array in"
kono
parents:
diff changeset
122 " MATMUL intrinsic: is %ld, should be %ld",
kono
parents:
diff changeset
123 (long int) ret_extent, (long int) arg_extent);
kono
parents:
diff changeset
124 }
kono
parents:
diff changeset
125 else
kono
parents:
diff changeset
126 {
kono
parents:
diff changeset
127 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
kono
parents:
diff changeset
128 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
kono
parents:
diff changeset
129 if (arg_extent != ret_extent)
kono
parents:
diff changeset
130 runtime_error ("Incorrect extent in return array in"
kono
parents:
diff changeset
131 " MATMUL intrinsic for dimension 1:"
kono
parents:
diff changeset
132 " is %ld, should be %ld",
kono
parents:
diff changeset
133 (long int) ret_extent, (long int) arg_extent);
kono
parents:
diff changeset
134
kono
parents:
diff changeset
135 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
kono
parents:
diff changeset
136 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
kono
parents:
diff changeset
137 if (arg_extent != ret_extent)
kono
parents:
diff changeset
138 runtime_error ("Incorrect extent in return array in"
kono
parents:
diff changeset
139 " MATMUL intrinsic for dimension 2:"
kono
parents:
diff changeset
140 " is %ld, should be %ld",
kono
parents:
diff changeset
141 (long int) ret_extent, (long int) arg_extent);
kono
parents:
diff changeset
142 }
kono
parents:
diff changeset
143 }
kono
parents:
diff changeset
144
kono
parents:
diff changeset
145
kono
parents:
diff changeset
146 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
kono
parents:
diff changeset
147 {
kono
parents:
diff changeset
148 /* One-dimensional result may be addressed in the code below
kono
parents:
diff changeset
149 either as a row or a column matrix. We want both cases to
kono
parents:
diff changeset
150 work. */
kono
parents:
diff changeset
151 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
kono
parents:
diff changeset
152 }
kono
parents:
diff changeset
153 else
kono
parents:
diff changeset
154 {
kono
parents:
diff changeset
155 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
kono
parents:
diff changeset
156 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
kono
parents:
diff changeset
157 }
kono
parents:
diff changeset
158
kono
parents:
diff changeset
159
kono
parents:
diff changeset
160 if (GFC_DESCRIPTOR_RANK (a) == 1)
kono
parents:
diff changeset
161 {
kono
parents:
diff changeset
162 /* Treat it as a a row matrix A[1,count]. */
kono
parents:
diff changeset
163 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
kono
parents:
diff changeset
164 aystride = 1;
kono
parents:
diff changeset
165
kono
parents:
diff changeset
166 xcount = 1;
kono
parents:
diff changeset
167 count = GFC_DESCRIPTOR_EXTENT(a,0);
kono
parents:
diff changeset
168 }
kono
parents:
diff changeset
169 else
kono
parents:
diff changeset
170 {
kono
parents:
diff changeset
171 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
kono
parents:
diff changeset
172 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
kono
parents:
diff changeset
173
kono
parents:
diff changeset
174 count = GFC_DESCRIPTOR_EXTENT(a,1);
kono
parents:
diff changeset
175 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
kono
parents:
diff changeset
176 }
kono
parents:
diff changeset
177
kono
parents:
diff changeset
178 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
kono
parents:
diff changeset
179 {
kono
parents:
diff changeset
180 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
kono
parents:
diff changeset
181 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
kono
parents:
diff changeset
182 }
kono
parents:
diff changeset
183
kono
parents:
diff changeset
184 if (GFC_DESCRIPTOR_RANK (b) == 1)
kono
parents:
diff changeset
185 {
kono
parents:
diff changeset
186 /* Treat it as a column matrix B[count,1] */
kono
parents:
diff changeset
187 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
kono
parents:
diff changeset
188
kono
parents:
diff changeset
189 /* bystride should never be used for 1-dimensional b.
kono
parents:
diff changeset
190 The value is only used for calculation of the
kono
parents:
diff changeset
191 memory by the buffer. */
kono
parents:
diff changeset
192 bystride = 256;
kono
parents:
diff changeset
193 ycount = 1;
kono
parents:
diff changeset
194 }
kono
parents:
diff changeset
195 else
kono
parents:
diff changeset
196 {
kono
parents:
diff changeset
197 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
kono
parents:
diff changeset
198 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
kono
parents:
diff changeset
199 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
kono
parents:
diff changeset
200 }
kono
parents:
diff changeset
201
kono
parents:
diff changeset
202 abase = a->base_addr;
kono
parents:
diff changeset
203 bbase = b->base_addr;
kono
parents:
diff changeset
204 dest = retarray->base_addr;
kono
parents:
diff changeset
205
kono
parents:
diff changeset
206 /* Now that everything is set up, we perform the multiplication
kono
parents:
diff changeset
207 itself. */
kono
parents:
diff changeset
208
kono
parents:
diff changeset
209 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
kono
parents:
diff changeset
210 #define min(a,b) ((a) <= (b) ? (a) : (b))
kono
parents:
diff changeset
211 #define max(a,b) ((a) >= (b) ? (a) : (b))
kono
parents:
diff changeset
212
kono
parents:
diff changeset
213 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
kono
parents:
diff changeset
214 && (bxstride == 1 || bystride == 1)
kono
parents:
diff changeset
215 && (((float) xcount) * ((float) ycount) * ((float) count)
kono
parents:
diff changeset
216 > POW3(blas_limit)))
kono
parents:
diff changeset
217 {
kono
parents:
diff changeset
218 const int m = xcount, n = ycount, k = count, ldc = rystride;
kono
parents:
diff changeset
219 const GFC_COMPLEX_4 one = 1, zero = 0;
kono
parents:
diff changeset
220 const int lda = (axstride == 1) ? aystride : axstride,
kono
parents:
diff changeset
221 ldb = (bxstride == 1) ? bystride : bxstride;
kono
parents:
diff changeset
222
kono
parents:
diff changeset
223 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
kono
parents:
diff changeset
224 {
kono
parents:
diff changeset
225 assert (gemm != NULL);
kono
parents:
diff changeset
226 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
kono
parents:
diff changeset
227 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
kono
parents:
diff changeset
228 &ldc, 1, 1);
kono
parents:
diff changeset
229 return;
kono
parents:
diff changeset
230 }
kono
parents:
diff changeset
231 }
kono
parents:
diff changeset
232
kono
parents:
diff changeset
233 if (rxstride == 1 && axstride == 1 && bxstride == 1)
kono
parents:
diff changeset
234 {
kono
parents:
diff changeset
235 /* This block of code implements a tuned matmul, derived from
kono
parents:
diff changeset
236 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
kono
parents:
diff changeset
237
kono
parents:
diff changeset
238 Bo Kagstrom and Per Ling
kono
parents:
diff changeset
239 Department of Computing Science
kono
parents:
diff changeset
240 Umea University
kono
parents:
diff changeset
241 S-901 87 Umea, Sweden
kono
parents:
diff changeset
242
kono
parents:
diff changeset
243 from netlib.org, translated to C, and modified for matmul.m4. */
kono
parents:
diff changeset
244
kono
parents:
diff changeset
245 const GFC_COMPLEX_4 *a, *b;
kono
parents:
diff changeset
246 GFC_COMPLEX_4 *c;
kono
parents:
diff changeset
247 const index_type m = xcount, n = ycount, k = count;
kono
parents:
diff changeset
248
kono
parents:
diff changeset
249 /* System generated locals */
kono
parents:
diff changeset
250 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
kono
parents:
diff changeset
251 i1, i2, i3, i4, i5, i6;
kono
parents:
diff changeset
252
kono
parents:
diff changeset
253 /* Local variables */
kono
parents:
diff changeset
254 GFC_COMPLEX_4 f11, f12, f21, f22, f31, f32, f41, f42,
kono
parents:
diff changeset
255 f13, f14, f23, f24, f33, f34, f43, f44;
kono
parents:
diff changeset
256 index_type i, j, l, ii, jj, ll;
kono
parents:
diff changeset
257 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
kono
parents:
diff changeset
258 GFC_COMPLEX_4 *t1;
kono
parents:
diff changeset
259
kono
parents:
diff changeset
260 a = abase;
kono
parents:
diff changeset
261 b = bbase;
kono
parents:
diff changeset
262 c = retarray->base_addr;
kono
parents:
diff changeset
263
kono
parents:
diff changeset
264 /* Parameter adjustments */
kono
parents:
diff changeset
265 c_dim1 = rystride;
kono
parents:
diff changeset
266 c_offset = 1 + c_dim1;
kono
parents:
diff changeset
267 c -= c_offset;
kono
parents:
diff changeset
268 a_dim1 = aystride;
kono
parents:
diff changeset
269 a_offset = 1 + a_dim1;
kono
parents:
diff changeset
270 a -= a_offset;
kono
parents:
diff changeset
271 b_dim1 = bystride;
kono
parents:
diff changeset
272 b_offset = 1 + b_dim1;
kono
parents:
diff changeset
273 b -= b_offset;
kono
parents:
diff changeset
274
kono
parents:
diff changeset
275 /* Empty c first. */
kono
parents:
diff changeset
276 for (j=1; j<=n; j++)
kono
parents:
diff changeset
277 for (i=1; i<=m; i++)
kono
parents:
diff changeset
278 c[i + j * c_dim1] = (GFC_COMPLEX_4)0;
kono
parents:
diff changeset
279
kono
parents:
diff changeset
280 /* Early exit if possible */
kono
parents:
diff changeset
281 if (m == 0 || n == 0 || k == 0)
kono
parents:
diff changeset
282 return;
kono
parents:
diff changeset
283
kono
parents:
diff changeset
284 /* Adjust size of t1 to what is needed. */
kono
parents:
diff changeset
285 index_type t1_dim;
kono
parents:
diff changeset
286 t1_dim = (a_dim1-1) * 256 + b_dim1;
kono
parents:
diff changeset
287 if (t1_dim > 65536)
kono
parents:
diff changeset
288 t1_dim = 65536;
kono
parents:
diff changeset
289
kono
parents:
diff changeset
290 t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_4));
kono
parents:
diff changeset
291
kono
parents:
diff changeset
292 /* Start turning the crank. */
kono
parents:
diff changeset
293 i1 = n;
kono
parents:
diff changeset
294 for (jj = 1; jj <= i1; jj += 512)
kono
parents:
diff changeset
295 {
kono
parents:
diff changeset
296 /* Computing MIN */
kono
parents:
diff changeset
297 i2 = 512;
kono
parents:
diff changeset
298 i3 = n - jj + 1;
kono
parents:
diff changeset
299 jsec = min(i2,i3);
kono
parents:
diff changeset
300 ujsec = jsec - jsec % 4;
kono
parents:
diff changeset
301 i2 = k;
kono
parents:
diff changeset
302 for (ll = 1; ll <= i2; ll += 256)
kono
parents:
diff changeset
303 {
kono
parents:
diff changeset
304 /* Computing MIN */
kono
parents:
diff changeset
305 i3 = 256;
kono
parents:
diff changeset
306 i4 = k - ll + 1;
kono
parents:
diff changeset
307 lsec = min(i3,i4);
kono
parents:
diff changeset
308 ulsec = lsec - lsec % 2;
kono
parents:
diff changeset
309
kono
parents:
diff changeset
310 i3 = m;
kono
parents:
diff changeset
311 for (ii = 1; ii <= i3; ii += 256)
kono
parents:
diff changeset
312 {
kono
parents:
diff changeset
313 /* Computing MIN */
kono
parents:
diff changeset
314 i4 = 256;
kono
parents:
diff changeset
315 i5 = m - ii + 1;
kono
parents:
diff changeset
316 isec = min(i4,i5);
kono
parents:
diff changeset
317 uisec = isec - isec % 2;
kono
parents:
diff changeset
318 i4 = ll + ulsec - 1;
kono
parents:
diff changeset
319 for (l = ll; l <= i4; l += 2)
kono
parents:
diff changeset
320 {
kono
parents:
diff changeset
321 i5 = ii + uisec - 1;
kono
parents:
diff changeset
322 for (i = ii; i <= i5; i += 2)
kono
parents:
diff changeset
323 {
kono
parents:
diff changeset
324 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
kono
parents:
diff changeset
325 a[i + l * a_dim1];
kono
parents:
diff changeset
326 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
kono
parents:
diff changeset
327 a[i + (l + 1) * a_dim1];
kono
parents:
diff changeset
328 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
kono
parents:
diff changeset
329 a[i + 1 + l * a_dim1];
kono
parents:
diff changeset
330 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
kono
parents:
diff changeset
331 a[i + 1 + (l + 1) * a_dim1];
kono
parents:
diff changeset
332 }
kono
parents:
diff changeset
333 if (uisec < isec)
kono
parents:
diff changeset
334 {
kono
parents:
diff changeset
335 t1[l - ll + 1 + (isec << 8) - 257] =
kono
parents:
diff changeset
336 a[ii + isec - 1 + l * a_dim1];
kono
parents:
diff changeset
337 t1[l - ll + 2 + (isec << 8) - 257] =
kono
parents:
diff changeset
338 a[ii + isec - 1 + (l + 1) * a_dim1];
kono
parents:
diff changeset
339 }
kono
parents:
diff changeset
340 }
kono
parents:
diff changeset
341 if (ulsec < lsec)
kono
parents:
diff changeset
342 {
kono
parents:
diff changeset
343 i4 = ii + isec - 1;
kono
parents:
diff changeset
344 for (i = ii; i<= i4; ++i)
kono
parents:
diff changeset
345 {
kono
parents:
diff changeset
346 t1[lsec + ((i - ii + 1) << 8) - 257] =
kono
parents:
diff changeset
347 a[i + (ll + lsec - 1) * a_dim1];
kono
parents:
diff changeset
348 }
kono
parents:
diff changeset
349 }
kono
parents:
diff changeset
350
kono
parents:
diff changeset
351 uisec = isec - isec % 4;
kono
parents:
diff changeset
352 i4 = jj + ujsec - 1;
kono
parents:
diff changeset
353 for (j = jj; j <= i4; j += 4)
kono
parents:
diff changeset
354 {
kono
parents:
diff changeset
355 i5 = ii + uisec - 1;
kono
parents:
diff changeset
356 for (i = ii; i <= i5; i += 4)
kono
parents:
diff changeset
357 {
kono
parents:
diff changeset
358 f11 = c[i + j * c_dim1];
kono
parents:
diff changeset
359 f21 = c[i + 1 + j * c_dim1];
kono
parents:
diff changeset
360 f12 = c[i + (j + 1) * c_dim1];
kono
parents:
diff changeset
361 f22 = c[i + 1 + (j + 1) * c_dim1];
kono
parents:
diff changeset
362 f13 = c[i + (j + 2) * c_dim1];
kono
parents:
diff changeset
363 f23 = c[i + 1 + (j + 2) * c_dim1];
kono
parents:
diff changeset
364 f14 = c[i + (j + 3) * c_dim1];
kono
parents:
diff changeset
365 f24 = c[i + 1 + (j + 3) * c_dim1];
kono
parents:
diff changeset
366 f31 = c[i + 2 + j * c_dim1];
kono
parents:
diff changeset
367 f41 = c[i + 3 + j * c_dim1];
kono
parents:
diff changeset
368 f32 = c[i + 2 + (j + 1) * c_dim1];
kono
parents:
diff changeset
369 f42 = c[i + 3 + (j + 1) * c_dim1];
kono
parents:
diff changeset
370 f33 = c[i + 2 + (j + 2) * c_dim1];
kono
parents:
diff changeset
371 f43 = c[i + 3 + (j + 2) * c_dim1];
kono
parents:
diff changeset
372 f34 = c[i + 2 + (j + 3) * c_dim1];
kono
parents:
diff changeset
373 f44 = c[i + 3 + (j + 3) * c_dim1];
kono
parents:
diff changeset
374 i6 = ll + lsec - 1;
kono
parents:
diff changeset
375 for (l = ll; l <= i6; ++l)
kono
parents:
diff changeset
376 {
kono
parents:
diff changeset
377 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
kono
parents:
diff changeset
378 * b[l + j * b_dim1];
kono
parents:
diff changeset
379 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
kono
parents:
diff changeset
380 * b[l + j * b_dim1];
kono
parents:
diff changeset
381 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
kono
parents:
diff changeset
382 * b[l + (j + 1) * b_dim1];
kono
parents:
diff changeset
383 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
kono
parents:
diff changeset
384 * b[l + (j + 1) * b_dim1];
kono
parents:
diff changeset
385 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
kono
parents:
diff changeset
386 * b[l + (j + 2) * b_dim1];
kono
parents:
diff changeset
387 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
kono
parents:
diff changeset
388 * b[l + (j + 2) * b_dim1];
kono
parents:
diff changeset
389 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
kono
parents:
diff changeset
390 * b[l + (j + 3) * b_dim1];
kono
parents:
diff changeset
391 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
kono
parents:
diff changeset
392 * b[l + (j + 3) * b_dim1];
kono
parents:
diff changeset
393 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
kono
parents:
diff changeset
394 * b[l + j * b_dim1];
kono
parents:
diff changeset
395 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
kono
parents:
diff changeset
396 * b[l + j * b_dim1];
kono
parents:
diff changeset
397 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
kono
parents:
diff changeset
398 * b[l + (j + 1) * b_dim1];
kono
parents:
diff changeset
399 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
kono
parents:
diff changeset
400 * b[l + (j + 1) * b_dim1];
kono
parents:
diff changeset
401 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
kono
parents:
diff changeset
402 * b[l + (j + 2) * b_dim1];
kono
parents:
diff changeset
403 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
kono
parents:
diff changeset
404 * b[l + (j + 2) * b_dim1];
kono
parents:
diff changeset
405 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
kono
parents:
diff changeset
406 * b[l + (j + 3) * b_dim1];
kono
parents:
diff changeset
407 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
kono
parents:
diff changeset
408 * b[l + (j + 3) * b_dim1];
kono
parents:
diff changeset
409 }
kono
parents:
diff changeset
410 c[i + j * c_dim1] = f11;
kono
parents:
diff changeset
411 c[i + 1 + j * c_dim1] = f21;
kono
parents:
diff changeset
412 c[i + (j + 1) * c_dim1] = f12;
kono
parents:
diff changeset
413 c[i + 1 + (j + 1) * c_dim1] = f22;
kono
parents:
diff changeset
414 c[i + (j + 2) * c_dim1] = f13;
kono
parents:
diff changeset
415 c[i + 1 + (j + 2) * c_dim1] = f23;
kono
parents:
diff changeset
416 c[i + (j + 3) * c_dim1] = f14;
kono
parents:
diff changeset
417 c[i + 1 + (j + 3) * c_dim1] = f24;
kono
parents:
diff changeset
418 c[i + 2 + j * c_dim1] = f31;
kono
parents:
diff changeset
419 c[i + 3 + j * c_dim1] = f41;
kono
parents:
diff changeset
420 c[i + 2 + (j + 1) * c_dim1] = f32;
kono
parents:
diff changeset
421 c[i + 3 + (j + 1) * c_dim1] = f42;
kono
parents:
diff changeset
422 c[i + 2 + (j + 2) * c_dim1] = f33;
kono
parents:
diff changeset
423 c[i + 3 + (j + 2) * c_dim1] = f43;
kono
parents:
diff changeset
424 c[i + 2 + (j + 3) * c_dim1] = f34;
kono
parents:
diff changeset
425 c[i + 3 + (j + 3) * c_dim1] = f44;
kono
parents:
diff changeset
426 }
kono
parents:
diff changeset
427 if (uisec < isec)
kono
parents:
diff changeset
428 {
kono
parents:
diff changeset
429 i5 = ii + isec - 1;
kono
parents:
diff changeset
430 for (i = ii + uisec; i <= i5; ++i)
kono
parents:
diff changeset
431 {
kono
parents:
diff changeset
432 f11 = c[i + j * c_dim1];
kono
parents:
diff changeset
433 f12 = c[i + (j + 1) * c_dim1];
kono
parents:
diff changeset
434 f13 = c[i + (j + 2) * c_dim1];
kono
parents:
diff changeset
435 f14 = c[i + (j + 3) * c_dim1];
kono
parents:
diff changeset
436 i6 = ll + lsec - 1;
kono
parents:
diff changeset
437 for (l = ll; l <= i6; ++l)
kono
parents:
diff changeset
438 {
kono
parents:
diff changeset
439 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
440 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
441 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
442 257] * b[l + (j + 1) * b_dim1];
kono
parents:
diff changeset
443 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
444 257] * b[l + (j + 2) * b_dim1];
kono
parents:
diff changeset
445 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
446 257] * b[l + (j + 3) * b_dim1];
kono
parents:
diff changeset
447 }
kono
parents:
diff changeset
448 c[i + j * c_dim1] = f11;
kono
parents:
diff changeset
449 c[i + (j + 1) * c_dim1] = f12;
kono
parents:
diff changeset
450 c[i + (j + 2) * c_dim1] = f13;
kono
parents:
diff changeset
451 c[i + (j + 3) * c_dim1] = f14;
kono
parents:
diff changeset
452 }
kono
parents:
diff changeset
453 }
kono
parents:
diff changeset
454 }
kono
parents:
diff changeset
455 if (ujsec < jsec)
kono
parents:
diff changeset
456 {
kono
parents:
diff changeset
457 i4 = jj + jsec - 1;
kono
parents:
diff changeset
458 for (j = jj + ujsec; j <= i4; ++j)
kono
parents:
diff changeset
459 {
kono
parents:
diff changeset
460 i5 = ii + uisec - 1;
kono
parents:
diff changeset
461 for (i = ii; i <= i5; i += 4)
kono
parents:
diff changeset
462 {
kono
parents:
diff changeset
463 f11 = c[i + j * c_dim1];
kono
parents:
diff changeset
464 f21 = c[i + 1 + j * c_dim1];
kono
parents:
diff changeset
465 f31 = c[i + 2 + j * c_dim1];
kono
parents:
diff changeset
466 f41 = c[i + 3 + j * c_dim1];
kono
parents:
diff changeset
467 i6 = ll + lsec - 1;
kono
parents:
diff changeset
468 for (l = ll; l <= i6; ++l)
kono
parents:
diff changeset
469 {
kono
parents:
diff changeset
470 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
471 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
472 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
kono
parents:
diff changeset
473 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
474 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
kono
parents:
diff changeset
475 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
476 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
kono
parents:
diff changeset
477 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
478 }
kono
parents:
diff changeset
479 c[i + j * c_dim1] = f11;
kono
parents:
diff changeset
480 c[i + 1 + j * c_dim1] = f21;
kono
parents:
diff changeset
481 c[i + 2 + j * c_dim1] = f31;
kono
parents:
diff changeset
482 c[i + 3 + j * c_dim1] = f41;
kono
parents:
diff changeset
483 }
kono
parents:
diff changeset
484 i5 = ii + isec - 1;
kono
parents:
diff changeset
485 for (i = ii + uisec; i <= i5; ++i)
kono
parents:
diff changeset
486 {
kono
parents:
diff changeset
487 f11 = c[i + j * c_dim1];
kono
parents:
diff changeset
488 i6 = ll + lsec - 1;
kono
parents:
diff changeset
489 for (l = ll; l <= i6; ++l)
kono
parents:
diff changeset
490 {
kono
parents:
diff changeset
491 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
492 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
493 }
kono
parents:
diff changeset
494 c[i + j * c_dim1] = f11;
kono
parents:
diff changeset
495 }
kono
parents:
diff changeset
496 }
kono
parents:
diff changeset
497 }
kono
parents:
diff changeset
498 }
kono
parents:
diff changeset
499 }
kono
parents:
diff changeset
500 }
kono
parents:
diff changeset
501 free(t1);
kono
parents:
diff changeset
502 return;
kono
parents:
diff changeset
503 }
kono
parents:
diff changeset
504 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
kono
parents:
diff changeset
505 {
kono
parents:
diff changeset
506 if (GFC_DESCRIPTOR_RANK (a) != 1)
kono
parents:
diff changeset
507 {
kono
parents:
diff changeset
508 const GFC_COMPLEX_4 *restrict abase_x;
kono
parents:
diff changeset
509 const GFC_COMPLEX_4 *restrict bbase_y;
kono
parents:
diff changeset
510 GFC_COMPLEX_4 *restrict dest_y;
kono
parents:
diff changeset
511 GFC_COMPLEX_4 s;
kono
parents:
diff changeset
512
kono
parents:
diff changeset
513 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
514 {
kono
parents:
diff changeset
515 bbase_y = &bbase[y*bystride];
kono
parents:
diff changeset
516 dest_y = &dest[y*rystride];
kono
parents:
diff changeset
517 for (x = 0; x < xcount; x++)
kono
parents:
diff changeset
518 {
kono
parents:
diff changeset
519 abase_x = &abase[x*axstride];
kono
parents:
diff changeset
520 s = (GFC_COMPLEX_4) 0;
kono
parents:
diff changeset
521 for (n = 0; n < count; n++)
kono
parents:
diff changeset
522 s += abase_x[n] * bbase_y[n];
kono
parents:
diff changeset
523 dest_y[x] = s;
kono
parents:
diff changeset
524 }
kono
parents:
diff changeset
525 }
kono
parents:
diff changeset
526 }
kono
parents:
diff changeset
527 else
kono
parents:
diff changeset
528 {
kono
parents:
diff changeset
529 const GFC_COMPLEX_4 *restrict bbase_y;
kono
parents:
diff changeset
530 GFC_COMPLEX_4 s;
kono
parents:
diff changeset
531
kono
parents:
diff changeset
532 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
533 {
kono
parents:
diff changeset
534 bbase_y = &bbase[y*bystride];
kono
parents:
diff changeset
535 s = (GFC_COMPLEX_4) 0;
kono
parents:
diff changeset
536 for (n = 0; n < count; n++)
kono
parents:
diff changeset
537 s += abase[n*axstride] * bbase_y[n];
kono
parents:
diff changeset
538 dest[y*rystride] = s;
kono
parents:
diff changeset
539 }
kono
parents:
diff changeset
540 }
kono
parents:
diff changeset
541 }
kono
parents:
diff changeset
542 else if (axstride < aystride)
kono
parents:
diff changeset
543 {
kono
parents:
diff changeset
544 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
545 for (x = 0; x < xcount; x++)
kono
parents:
diff changeset
546 dest[x*rxstride + y*rystride] = (GFC_COMPLEX_4)0;
kono
parents:
diff changeset
547
kono
parents:
diff changeset
548 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
549 for (n = 0; n < count; n++)
kono
parents:
diff changeset
550 for (x = 0; x < xcount; x++)
kono
parents:
diff changeset
551 /* dest[x,y] += a[x,n] * b[n,y] */
kono
parents:
diff changeset
552 dest[x*rxstride + y*rystride] +=
kono
parents:
diff changeset
553 abase[x*axstride + n*aystride] *
kono
parents:
diff changeset
554 bbase[n*bxstride + y*bystride];
kono
parents:
diff changeset
555 }
kono
parents:
diff changeset
556 else if (GFC_DESCRIPTOR_RANK (a) == 1)
kono
parents:
diff changeset
557 {
kono
parents:
diff changeset
558 const GFC_COMPLEX_4 *restrict bbase_y;
kono
parents:
diff changeset
559 GFC_COMPLEX_4 s;
kono
parents:
diff changeset
560
kono
parents:
diff changeset
561 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
562 {
kono
parents:
diff changeset
563 bbase_y = &bbase[y*bystride];
kono
parents:
diff changeset
564 s = (GFC_COMPLEX_4) 0;
kono
parents:
diff changeset
565 for (n = 0; n < count; n++)
kono
parents:
diff changeset
566 s += abase[n*axstride] * bbase_y[n*bxstride];
kono
parents:
diff changeset
567 dest[y*rxstride] = s;
kono
parents:
diff changeset
568 }
kono
parents:
diff changeset
569 }
kono
parents:
diff changeset
570 else
kono
parents:
diff changeset
571 {
kono
parents:
diff changeset
572 const GFC_COMPLEX_4 *restrict abase_x;
kono
parents:
diff changeset
573 const GFC_COMPLEX_4 *restrict bbase_y;
kono
parents:
diff changeset
574 GFC_COMPLEX_4 *restrict dest_y;
kono
parents:
diff changeset
575 GFC_COMPLEX_4 s;
kono
parents:
diff changeset
576
kono
parents:
diff changeset
577 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
578 {
kono
parents:
diff changeset
579 bbase_y = &bbase[y*bystride];
kono
parents:
diff changeset
580 dest_y = &dest[y*rystride];
kono
parents:
diff changeset
581 for (x = 0; x < xcount; x++)
kono
parents:
diff changeset
582 {
kono
parents:
diff changeset
583 abase_x = &abase[x*axstride];
kono
parents:
diff changeset
584 s = (GFC_COMPLEX_4) 0;
kono
parents:
diff changeset
585 for (n = 0; n < count; n++)
kono
parents:
diff changeset
586 s += abase_x[n*aystride] * bbase_y[n*bxstride];
kono
parents:
diff changeset
587 dest_y[x*rxstride] = s;
kono
parents:
diff changeset
588 }
kono
parents:
diff changeset
589 }
kono
parents:
diff changeset
590 }
kono
parents:
diff changeset
591 }
kono
parents:
diff changeset
592 #undef POW3
kono
parents:
diff changeset
593 #undef min
kono
parents:
diff changeset
594 #undef max
kono
parents:
diff changeset
595
kono
parents:
diff changeset
596 #endif
kono
parents:
diff changeset
597
kono
parents:
diff changeset
598 #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
kono
parents:
diff changeset
599 void
kono
parents:
diff changeset
600 matmul_c4_avx128_fma4 (gfc_array_c4 * const restrict retarray,
kono
parents:
diff changeset
601 gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
kono
parents:
diff changeset
602 int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
kono
parents:
diff changeset
603 internal_proto(matmul_c4_avx128_fma4);
kono
parents:
diff changeset
604 void
kono
parents:
diff changeset
605 matmul_c4_avx128_fma4 (gfc_array_c4 * const restrict retarray,
kono
parents:
diff changeset
606 gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
kono
parents:
diff changeset
607 int blas_limit, blas_call gemm)
kono
parents:
diff changeset
608 {
kono
parents:
diff changeset
609 const GFC_COMPLEX_4 * restrict abase;
kono
parents:
diff changeset
610 const GFC_COMPLEX_4 * restrict bbase;
kono
parents:
diff changeset
611 GFC_COMPLEX_4 * restrict dest;
kono
parents:
diff changeset
612
kono
parents:
diff changeset
613 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
kono
parents:
diff changeset
614 index_type x, y, n, count, xcount, ycount;
kono
parents:
diff changeset
615
kono
parents:
diff changeset
616 assert (GFC_DESCRIPTOR_RANK (a) == 2
kono
parents:
diff changeset
617 || GFC_DESCRIPTOR_RANK (b) == 2);
kono
parents:
diff changeset
618
kono
parents:
diff changeset
619 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
kono
parents:
diff changeset
620
kono
parents:
diff changeset
621 Either A or B (but not both) can be rank 1:
kono
parents:
diff changeset
622
kono
parents:
diff changeset
623 o One-dimensional argument A is implicitly treated as a row matrix
kono
parents:
diff changeset
624 dimensioned [1,count], so xcount=1.
kono
parents:
diff changeset
625
kono
parents:
diff changeset
626 o One-dimensional argument B is implicitly treated as a column matrix
kono
parents:
diff changeset
627 dimensioned [count, 1], so ycount=1.
kono
parents:
diff changeset
628 */
kono
parents:
diff changeset
629
kono
parents:
diff changeset
630 if (retarray->base_addr == NULL)
kono
parents:
diff changeset
631 {
kono
parents:
diff changeset
632 if (GFC_DESCRIPTOR_RANK (a) == 1)
kono
parents:
diff changeset
633 {
kono
parents:
diff changeset
634 GFC_DIMENSION_SET(retarray->dim[0], 0,
kono
parents:
diff changeset
635 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
kono
parents:
diff changeset
636 }
kono
parents:
diff changeset
637 else if (GFC_DESCRIPTOR_RANK (b) == 1)
kono
parents:
diff changeset
638 {
kono
parents:
diff changeset
639 GFC_DIMENSION_SET(retarray->dim[0], 0,
kono
parents:
diff changeset
640 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
kono
parents:
diff changeset
641 }
kono
parents:
diff changeset
642 else
kono
parents:
diff changeset
643 {
kono
parents:
diff changeset
644 GFC_DIMENSION_SET(retarray->dim[0], 0,
kono
parents:
diff changeset
645 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
kono
parents:
diff changeset
646
kono
parents:
diff changeset
647 GFC_DIMENSION_SET(retarray->dim[1], 0,
kono
parents:
diff changeset
648 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
kono
parents:
diff changeset
649 GFC_DESCRIPTOR_EXTENT(retarray,0));
kono
parents:
diff changeset
650 }
kono
parents:
diff changeset
651
kono
parents:
diff changeset
652 retarray->base_addr
kono
parents:
diff changeset
653 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_4));
kono
parents:
diff changeset
654 retarray->offset = 0;
kono
parents:
diff changeset
655 }
kono
parents:
diff changeset
656 else if (unlikely (compile_options.bounds_check))
kono
parents:
diff changeset
657 {
kono
parents:
diff changeset
658 index_type ret_extent, arg_extent;
kono
parents:
diff changeset
659
kono
parents:
diff changeset
660 if (GFC_DESCRIPTOR_RANK (a) == 1)
kono
parents:
diff changeset
661 {
kono
parents:
diff changeset
662 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
kono
parents:
diff changeset
663 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
kono
parents:
diff changeset
664 if (arg_extent != ret_extent)
kono
parents:
diff changeset
665 runtime_error ("Incorrect extent in return array in"
kono
parents:
diff changeset
666 " MATMUL intrinsic: is %ld, should be %ld",
kono
parents:
diff changeset
667 (long int) ret_extent, (long int) arg_extent);
kono
parents:
diff changeset
668 }
kono
parents:
diff changeset
669 else if (GFC_DESCRIPTOR_RANK (b) == 1)
kono
parents:
diff changeset
670 {
kono
parents:
diff changeset
671 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
kono
parents:
diff changeset
672 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
kono
parents:
diff changeset
673 if (arg_extent != ret_extent)
kono
parents:
diff changeset
674 runtime_error ("Incorrect extent in return array in"
kono
parents:
diff changeset
675 " MATMUL intrinsic: is %ld, should be %ld",
kono
parents:
diff changeset
676 (long int) ret_extent, (long int) arg_extent);
kono
parents:
diff changeset
677 }
kono
parents:
diff changeset
678 else
kono
parents:
diff changeset
679 {
kono
parents:
diff changeset
680 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
kono
parents:
diff changeset
681 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
kono
parents:
diff changeset
682 if (arg_extent != ret_extent)
kono
parents:
diff changeset
683 runtime_error ("Incorrect extent in return array in"
kono
parents:
diff changeset
684 " MATMUL intrinsic for dimension 1:"
kono
parents:
diff changeset
685 " is %ld, should be %ld",
kono
parents:
diff changeset
686 (long int) ret_extent, (long int) arg_extent);
kono
parents:
diff changeset
687
kono
parents:
diff changeset
688 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
kono
parents:
diff changeset
689 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
kono
parents:
diff changeset
690 if (arg_extent != ret_extent)
kono
parents:
diff changeset
691 runtime_error ("Incorrect extent in return array in"
kono
parents:
diff changeset
692 " MATMUL intrinsic for dimension 2:"
kono
parents:
diff changeset
693 " is %ld, should be %ld",
kono
parents:
diff changeset
694 (long int) ret_extent, (long int) arg_extent);
kono
parents:
diff changeset
695 }
kono
parents:
diff changeset
696 }
kono
parents:
diff changeset
697
kono
parents:
diff changeset
698
kono
parents:
diff changeset
699 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
kono
parents:
diff changeset
700 {
kono
parents:
diff changeset
701 /* One-dimensional result may be addressed in the code below
kono
parents:
diff changeset
702 either as a row or a column matrix. We want both cases to
kono
parents:
diff changeset
703 work. */
kono
parents:
diff changeset
704 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
kono
parents:
diff changeset
705 }
kono
parents:
diff changeset
706 else
kono
parents:
diff changeset
707 {
kono
parents:
diff changeset
708 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
kono
parents:
diff changeset
709 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
kono
parents:
diff changeset
710 }
kono
parents:
diff changeset
711
kono
parents:
diff changeset
712
kono
parents:
diff changeset
713 if (GFC_DESCRIPTOR_RANK (a) == 1)
kono
parents:
diff changeset
714 {
kono
parents:
diff changeset
715 /* Treat it as a a row matrix A[1,count]. */
kono
parents:
diff changeset
716 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
kono
parents:
diff changeset
717 aystride = 1;
kono
parents:
diff changeset
718
kono
parents:
diff changeset
719 xcount = 1;
kono
parents:
diff changeset
720 count = GFC_DESCRIPTOR_EXTENT(a,0);
kono
parents:
diff changeset
721 }
kono
parents:
diff changeset
722 else
kono
parents:
diff changeset
723 {
kono
parents:
diff changeset
724 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
kono
parents:
diff changeset
725 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
kono
parents:
diff changeset
726
kono
parents:
diff changeset
727 count = GFC_DESCRIPTOR_EXTENT(a,1);
kono
parents:
diff changeset
728 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
kono
parents:
diff changeset
729 }
kono
parents:
diff changeset
730
kono
parents:
diff changeset
731 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
kono
parents:
diff changeset
732 {
kono
parents:
diff changeset
733 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
kono
parents:
diff changeset
734 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
kono
parents:
diff changeset
735 }
kono
parents:
diff changeset
736
kono
parents:
diff changeset
737 if (GFC_DESCRIPTOR_RANK (b) == 1)
kono
parents:
diff changeset
738 {
kono
parents:
diff changeset
739 /* Treat it as a column matrix B[count,1] */
kono
parents:
diff changeset
740 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
kono
parents:
diff changeset
741
kono
parents:
diff changeset
742 /* bystride should never be used for 1-dimensional b.
kono
parents:
diff changeset
743 The value is only used for calculation of the
kono
parents:
diff changeset
744 memory by the buffer. */
kono
parents:
diff changeset
745 bystride = 256;
kono
parents:
diff changeset
746 ycount = 1;
kono
parents:
diff changeset
747 }
kono
parents:
diff changeset
748 else
kono
parents:
diff changeset
749 {
kono
parents:
diff changeset
750 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
kono
parents:
diff changeset
751 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
kono
parents:
diff changeset
752 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
kono
parents:
diff changeset
753 }
kono
parents:
diff changeset
754
kono
parents:
diff changeset
755 abase = a->base_addr;
kono
parents:
diff changeset
756 bbase = b->base_addr;
kono
parents:
diff changeset
757 dest = retarray->base_addr;
kono
parents:
diff changeset
758
kono
parents:
diff changeset
759 /* Now that everything is set up, we perform the multiplication
kono
parents:
diff changeset
760 itself. */
kono
parents:
diff changeset
761
kono
parents:
diff changeset
762 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
kono
parents:
diff changeset
763 #define min(a,b) ((a) <= (b) ? (a) : (b))
kono
parents:
diff changeset
764 #define max(a,b) ((a) >= (b) ? (a) : (b))
kono
parents:
diff changeset
765
kono
parents:
diff changeset
766 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
kono
parents:
diff changeset
767 && (bxstride == 1 || bystride == 1)
kono
parents:
diff changeset
768 && (((float) xcount) * ((float) ycount) * ((float) count)
kono
parents:
diff changeset
769 > POW3(blas_limit)))
kono
parents:
diff changeset
770 {
kono
parents:
diff changeset
771 const int m = xcount, n = ycount, k = count, ldc = rystride;
kono
parents:
diff changeset
772 const GFC_COMPLEX_4 one = 1, zero = 0;
kono
parents:
diff changeset
773 const int lda = (axstride == 1) ? aystride : axstride,
kono
parents:
diff changeset
774 ldb = (bxstride == 1) ? bystride : bxstride;
kono
parents:
diff changeset
775
kono
parents:
diff changeset
776 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
kono
parents:
diff changeset
777 {
kono
parents:
diff changeset
778 assert (gemm != NULL);
kono
parents:
diff changeset
779 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
kono
parents:
diff changeset
780 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
kono
parents:
diff changeset
781 &ldc, 1, 1);
kono
parents:
diff changeset
782 return;
kono
parents:
diff changeset
783 }
kono
parents:
diff changeset
784 }
kono
parents:
diff changeset
785
kono
parents:
diff changeset
786 if (rxstride == 1 && axstride == 1 && bxstride == 1)
kono
parents:
diff changeset
787 {
kono
parents:
diff changeset
788 /* This block of code implements a tuned matmul, derived from
kono
parents:
diff changeset
789 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
kono
parents:
diff changeset
790
kono
parents:
diff changeset
791 Bo Kagstrom and Per Ling
kono
parents:
diff changeset
792 Department of Computing Science
kono
parents:
diff changeset
793 Umea University
kono
parents:
diff changeset
794 S-901 87 Umea, Sweden
kono
parents:
diff changeset
795
kono
parents:
diff changeset
796 from netlib.org, translated to C, and modified for matmul.m4. */
kono
parents:
diff changeset
797
kono
parents:
diff changeset
798 const GFC_COMPLEX_4 *a, *b;
kono
parents:
diff changeset
799 GFC_COMPLEX_4 *c;
kono
parents:
diff changeset
800 const index_type m = xcount, n = ycount, k = count;
kono
parents:
diff changeset
801
kono
parents:
diff changeset
802 /* System generated locals */
kono
parents:
diff changeset
803 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
kono
parents:
diff changeset
804 i1, i2, i3, i4, i5, i6;
kono
parents:
diff changeset
805
kono
parents:
diff changeset
806 /* Local variables */
kono
parents:
diff changeset
807 GFC_COMPLEX_4 f11, f12, f21, f22, f31, f32, f41, f42,
kono
parents:
diff changeset
808 f13, f14, f23, f24, f33, f34, f43, f44;
kono
parents:
diff changeset
809 index_type i, j, l, ii, jj, ll;
kono
parents:
diff changeset
810 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
kono
parents:
diff changeset
811 GFC_COMPLEX_4 *t1;
kono
parents:
diff changeset
812
kono
parents:
diff changeset
813 a = abase;
kono
parents:
diff changeset
814 b = bbase;
kono
parents:
diff changeset
815 c = retarray->base_addr;
kono
parents:
diff changeset
816
kono
parents:
diff changeset
817 /* Parameter adjustments */
kono
parents:
diff changeset
818 c_dim1 = rystride;
kono
parents:
diff changeset
819 c_offset = 1 + c_dim1;
kono
parents:
diff changeset
820 c -= c_offset;
kono
parents:
diff changeset
821 a_dim1 = aystride;
kono
parents:
diff changeset
822 a_offset = 1 + a_dim1;
kono
parents:
diff changeset
823 a -= a_offset;
kono
parents:
diff changeset
824 b_dim1 = bystride;
kono
parents:
diff changeset
825 b_offset = 1 + b_dim1;
kono
parents:
diff changeset
826 b -= b_offset;
kono
parents:
diff changeset
827
kono
parents:
diff changeset
828 /* Empty c first. */
kono
parents:
diff changeset
829 for (j=1; j<=n; j++)
kono
parents:
diff changeset
830 for (i=1; i<=m; i++)
kono
parents:
diff changeset
831 c[i + j * c_dim1] = (GFC_COMPLEX_4)0;
kono
parents:
diff changeset
832
kono
parents:
diff changeset
833 /* Early exit if possible */
kono
parents:
diff changeset
834 if (m == 0 || n == 0 || k == 0)
kono
parents:
diff changeset
835 return;
kono
parents:
diff changeset
836
kono
parents:
diff changeset
837 /* Adjust size of t1 to what is needed. */
kono
parents:
diff changeset
838 index_type t1_dim;
kono
parents:
diff changeset
839 t1_dim = (a_dim1-1) * 256 + b_dim1;
kono
parents:
diff changeset
840 if (t1_dim > 65536)
kono
parents:
diff changeset
841 t1_dim = 65536;
kono
parents:
diff changeset
842
kono
parents:
diff changeset
843 t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_4));
kono
parents:
diff changeset
844
kono
parents:
diff changeset
845 /* Start turning the crank. */
kono
parents:
diff changeset
846 i1 = n;
kono
parents:
diff changeset
847 for (jj = 1; jj <= i1; jj += 512)
kono
parents:
diff changeset
848 {
kono
parents:
diff changeset
849 /* Computing MIN */
kono
parents:
diff changeset
850 i2 = 512;
kono
parents:
diff changeset
851 i3 = n - jj + 1;
kono
parents:
diff changeset
852 jsec = min(i2,i3);
kono
parents:
diff changeset
853 ujsec = jsec - jsec % 4;
kono
parents:
diff changeset
854 i2 = k;
kono
parents:
diff changeset
855 for (ll = 1; ll <= i2; ll += 256)
kono
parents:
diff changeset
856 {
kono
parents:
diff changeset
857 /* Computing MIN */
kono
parents:
diff changeset
858 i3 = 256;
kono
parents:
diff changeset
859 i4 = k - ll + 1;
kono
parents:
diff changeset
860 lsec = min(i3,i4);
kono
parents:
diff changeset
861 ulsec = lsec - lsec % 2;
kono
parents:
diff changeset
862
kono
parents:
diff changeset
863 i3 = m;
kono
parents:
diff changeset
864 for (ii = 1; ii <= i3; ii += 256)
kono
parents:
diff changeset
865 {
kono
parents:
diff changeset
866 /* Computing MIN */
kono
parents:
diff changeset
867 i4 = 256;
kono
parents:
diff changeset
868 i5 = m - ii + 1;
kono
parents:
diff changeset
869 isec = min(i4,i5);
kono
parents:
diff changeset
870 uisec = isec - isec % 2;
kono
parents:
diff changeset
871 i4 = ll + ulsec - 1;
kono
parents:
diff changeset
872 for (l = ll; l <= i4; l += 2)
kono
parents:
diff changeset
873 {
kono
parents:
diff changeset
874 i5 = ii + uisec - 1;
kono
parents:
diff changeset
875 for (i = ii; i <= i5; i += 2)
kono
parents:
diff changeset
876 {
kono
parents:
diff changeset
877 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
kono
parents:
diff changeset
878 a[i + l * a_dim1];
kono
parents:
diff changeset
879 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
kono
parents:
diff changeset
880 a[i + (l + 1) * a_dim1];
kono
parents:
diff changeset
881 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
kono
parents:
diff changeset
882 a[i + 1 + l * a_dim1];
kono
parents:
diff changeset
883 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
kono
parents:
diff changeset
884 a[i + 1 + (l + 1) * a_dim1];
kono
parents:
diff changeset
885 }
kono
parents:
diff changeset
886 if (uisec < isec)
kono
parents:
diff changeset
887 {
kono
parents:
diff changeset
888 t1[l - ll + 1 + (isec << 8) - 257] =
kono
parents:
diff changeset
889 a[ii + isec - 1 + l * a_dim1];
kono
parents:
diff changeset
890 t1[l - ll + 2 + (isec << 8) - 257] =
kono
parents:
diff changeset
891 a[ii + isec - 1 + (l + 1) * a_dim1];
kono
parents:
diff changeset
892 }
kono
parents:
diff changeset
893 }
kono
parents:
diff changeset
894 if (ulsec < lsec)
kono
parents:
diff changeset
895 {
kono
parents:
diff changeset
896 i4 = ii + isec - 1;
kono
parents:
diff changeset
897 for (i = ii; i<= i4; ++i)
kono
parents:
diff changeset
898 {
kono
parents:
diff changeset
899 t1[lsec + ((i - ii + 1) << 8) - 257] =
kono
parents:
diff changeset
900 a[i + (ll + lsec - 1) * a_dim1];
kono
parents:
diff changeset
901 }
kono
parents:
diff changeset
902 }
kono
parents:
diff changeset
903
kono
parents:
diff changeset
904 uisec = isec - isec % 4;
kono
parents:
diff changeset
905 i4 = jj + ujsec - 1;
kono
parents:
diff changeset
906 for (j = jj; j <= i4; j += 4)
kono
parents:
diff changeset
907 {
kono
parents:
diff changeset
908 i5 = ii + uisec - 1;
kono
parents:
diff changeset
909 for (i = ii; i <= i5; i += 4)
kono
parents:
diff changeset
910 {
kono
parents:
diff changeset
911 f11 = c[i + j * c_dim1];
kono
parents:
diff changeset
912 f21 = c[i + 1 + j * c_dim1];
kono
parents:
diff changeset
913 f12 = c[i + (j + 1) * c_dim1];
kono
parents:
diff changeset
914 f22 = c[i + 1 + (j + 1) * c_dim1];
kono
parents:
diff changeset
915 f13 = c[i + (j + 2) * c_dim1];
kono
parents:
diff changeset
916 f23 = c[i + 1 + (j + 2) * c_dim1];
kono
parents:
diff changeset
917 f14 = c[i + (j + 3) * c_dim1];
kono
parents:
diff changeset
918 f24 = c[i + 1 + (j + 3) * c_dim1];
kono
parents:
diff changeset
919 f31 = c[i + 2 + j * c_dim1];
kono
parents:
diff changeset
920 f41 = c[i + 3 + j * c_dim1];
kono
parents:
diff changeset
921 f32 = c[i + 2 + (j + 1) * c_dim1];
kono
parents:
diff changeset
922 f42 = c[i + 3 + (j + 1) * c_dim1];
kono
parents:
diff changeset
923 f33 = c[i + 2 + (j + 2) * c_dim1];
kono
parents:
diff changeset
924 f43 = c[i + 3 + (j + 2) * c_dim1];
kono
parents:
diff changeset
925 f34 = c[i + 2 + (j + 3) * c_dim1];
kono
parents:
diff changeset
926 f44 = c[i + 3 + (j + 3) * c_dim1];
kono
parents:
diff changeset
927 i6 = ll + lsec - 1;
kono
parents:
diff changeset
928 for (l = ll; l <= i6; ++l)
kono
parents:
diff changeset
929 {
kono
parents:
diff changeset
930 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
kono
parents:
diff changeset
931 * b[l + j * b_dim1];
kono
parents:
diff changeset
932 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
kono
parents:
diff changeset
933 * b[l + j * b_dim1];
kono
parents:
diff changeset
934 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
kono
parents:
diff changeset
935 * b[l + (j + 1) * b_dim1];
kono
parents:
diff changeset
936 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
kono
parents:
diff changeset
937 * b[l + (j + 1) * b_dim1];
kono
parents:
diff changeset
938 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
kono
parents:
diff changeset
939 * b[l + (j + 2) * b_dim1];
kono
parents:
diff changeset
940 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
kono
parents:
diff changeset
941 * b[l + (j + 2) * b_dim1];
kono
parents:
diff changeset
942 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
kono
parents:
diff changeset
943 * b[l + (j + 3) * b_dim1];
kono
parents:
diff changeset
944 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
kono
parents:
diff changeset
945 * b[l + (j + 3) * b_dim1];
kono
parents:
diff changeset
946 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
kono
parents:
diff changeset
947 * b[l + j * b_dim1];
kono
parents:
diff changeset
948 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
kono
parents:
diff changeset
949 * b[l + j * b_dim1];
kono
parents:
diff changeset
950 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
kono
parents:
diff changeset
951 * b[l + (j + 1) * b_dim1];
kono
parents:
diff changeset
952 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
kono
parents:
diff changeset
953 * b[l + (j + 1) * b_dim1];
kono
parents:
diff changeset
954 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
kono
parents:
diff changeset
955 * b[l + (j + 2) * b_dim1];
kono
parents:
diff changeset
956 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
kono
parents:
diff changeset
957 * b[l + (j + 2) * b_dim1];
kono
parents:
diff changeset
958 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
kono
parents:
diff changeset
959 * b[l + (j + 3) * b_dim1];
kono
parents:
diff changeset
960 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
kono
parents:
diff changeset
961 * b[l + (j + 3) * b_dim1];
kono
parents:
diff changeset
962 }
kono
parents:
diff changeset
963 c[i + j * c_dim1] = f11;
kono
parents:
diff changeset
964 c[i + 1 + j * c_dim1] = f21;
kono
parents:
diff changeset
965 c[i + (j + 1) * c_dim1] = f12;
kono
parents:
diff changeset
966 c[i + 1 + (j + 1) * c_dim1] = f22;
kono
parents:
diff changeset
967 c[i + (j + 2) * c_dim1] = f13;
kono
parents:
diff changeset
968 c[i + 1 + (j + 2) * c_dim1] = f23;
kono
parents:
diff changeset
969 c[i + (j + 3) * c_dim1] = f14;
kono
parents:
diff changeset
970 c[i + 1 + (j + 3) * c_dim1] = f24;
kono
parents:
diff changeset
971 c[i + 2 + j * c_dim1] = f31;
kono
parents:
diff changeset
972 c[i + 3 + j * c_dim1] = f41;
kono
parents:
diff changeset
973 c[i + 2 + (j + 1) * c_dim1] = f32;
kono
parents:
diff changeset
974 c[i + 3 + (j + 1) * c_dim1] = f42;
kono
parents:
diff changeset
975 c[i + 2 + (j + 2) * c_dim1] = f33;
kono
parents:
diff changeset
976 c[i + 3 + (j + 2) * c_dim1] = f43;
kono
parents:
diff changeset
977 c[i + 2 + (j + 3) * c_dim1] = f34;
kono
parents:
diff changeset
978 c[i + 3 + (j + 3) * c_dim1] = f44;
kono
parents:
diff changeset
979 }
kono
parents:
diff changeset
980 if (uisec < isec)
kono
parents:
diff changeset
981 {
kono
parents:
diff changeset
982 i5 = ii + isec - 1;
kono
parents:
diff changeset
983 for (i = ii + uisec; i <= i5; ++i)
kono
parents:
diff changeset
984 {
kono
parents:
diff changeset
985 f11 = c[i + j * c_dim1];
kono
parents:
diff changeset
986 f12 = c[i + (j + 1) * c_dim1];
kono
parents:
diff changeset
987 f13 = c[i + (j + 2) * c_dim1];
kono
parents:
diff changeset
988 f14 = c[i + (j + 3) * c_dim1];
kono
parents:
diff changeset
989 i6 = ll + lsec - 1;
kono
parents:
diff changeset
990 for (l = ll; l <= i6; ++l)
kono
parents:
diff changeset
991 {
kono
parents:
diff changeset
992 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
993 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
994 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
995 257] * b[l + (j + 1) * b_dim1];
kono
parents:
diff changeset
996 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
997 257] * b[l + (j + 2) * b_dim1];
kono
parents:
diff changeset
998 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
999 257] * b[l + (j + 3) * b_dim1];
kono
parents:
diff changeset
1000 }
kono
parents:
diff changeset
1001 c[i + j * c_dim1] = f11;
kono
parents:
diff changeset
1002 c[i + (j + 1) * c_dim1] = f12;
kono
parents:
diff changeset
1003 c[i + (j + 2) * c_dim1] = f13;
kono
parents:
diff changeset
1004 c[i + (j + 3) * c_dim1] = f14;
kono
parents:
diff changeset
1005 }
kono
parents:
diff changeset
1006 }
kono
parents:
diff changeset
1007 }
kono
parents:
diff changeset
1008 if (ujsec < jsec)
kono
parents:
diff changeset
1009 {
kono
parents:
diff changeset
1010 i4 = jj + jsec - 1;
kono
parents:
diff changeset
1011 for (j = jj + ujsec; j <= i4; ++j)
kono
parents:
diff changeset
1012 {
kono
parents:
diff changeset
1013 i5 = ii + uisec - 1;
kono
parents:
diff changeset
1014 for (i = ii; i <= i5; i += 4)
kono
parents:
diff changeset
1015 {
kono
parents:
diff changeset
1016 f11 = c[i + j * c_dim1];
kono
parents:
diff changeset
1017 f21 = c[i + 1 + j * c_dim1];
kono
parents:
diff changeset
1018 f31 = c[i + 2 + j * c_dim1];
kono
parents:
diff changeset
1019 f41 = c[i + 3 + j * c_dim1];
kono
parents:
diff changeset
1020 i6 = ll + lsec - 1;
kono
parents:
diff changeset
1021 for (l = ll; l <= i6; ++l)
kono
parents:
diff changeset
1022 {
kono
parents:
diff changeset
1023 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
1024 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
1025 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
kono
parents:
diff changeset
1026 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
1027 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
kono
parents:
diff changeset
1028 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
1029 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
kono
parents:
diff changeset
1030 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
1031 }
kono
parents:
diff changeset
1032 c[i + j * c_dim1] = f11;
kono
parents:
diff changeset
1033 c[i + 1 + j * c_dim1] = f21;
kono
parents:
diff changeset
1034 c[i + 2 + j * c_dim1] = f31;
kono
parents:
diff changeset
1035 c[i + 3 + j * c_dim1] = f41;
kono
parents:
diff changeset
1036 }
kono
parents:
diff changeset
1037 i5 = ii + isec - 1;
kono
parents:
diff changeset
1038 for (i = ii + uisec; i <= i5; ++i)
kono
parents:
diff changeset
1039 {
kono
parents:
diff changeset
1040 f11 = c[i + j * c_dim1];
kono
parents:
diff changeset
1041 i6 = ll + lsec - 1;
kono
parents:
diff changeset
1042 for (l = ll; l <= i6; ++l)
kono
parents:
diff changeset
1043 {
kono
parents:
diff changeset
1044 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
kono
parents:
diff changeset
1045 257] * b[l + j * b_dim1];
kono
parents:
diff changeset
1046 }
kono
parents:
diff changeset
1047 c[i + j * c_dim1] = f11;
kono
parents:
diff changeset
1048 }
kono
parents:
diff changeset
1049 }
kono
parents:
diff changeset
1050 }
kono
parents:
diff changeset
1051 }
kono
parents:
diff changeset
1052 }
kono
parents:
diff changeset
1053 }
kono
parents:
diff changeset
1054 free(t1);
kono
parents:
diff changeset
1055 return;
kono
parents:
diff changeset
1056 }
kono
parents:
diff changeset
1057 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
kono
parents:
diff changeset
1058 {
kono
parents:
diff changeset
1059 if (GFC_DESCRIPTOR_RANK (a) != 1)
kono
parents:
diff changeset
1060 {
kono
parents:
diff changeset
1061 const GFC_COMPLEX_4 *restrict abase_x;
kono
parents:
diff changeset
1062 const GFC_COMPLEX_4 *restrict bbase_y;
kono
parents:
diff changeset
1063 GFC_COMPLEX_4 *restrict dest_y;
kono
parents:
diff changeset
1064 GFC_COMPLEX_4 s;
kono
parents:
diff changeset
1065
kono
parents:
diff changeset
1066 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
1067 {
kono
parents:
diff changeset
1068 bbase_y = &bbase[y*bystride];
kono
parents:
diff changeset
1069 dest_y = &dest[y*rystride];
kono
parents:
diff changeset
1070 for (x = 0; x < xcount; x++)
kono
parents:
diff changeset
1071 {
kono
parents:
diff changeset
1072 abase_x = &abase[x*axstride];
kono
parents:
diff changeset
1073 s = (GFC_COMPLEX_4) 0;
kono
parents:
diff changeset
1074 for (n = 0; n < count; n++)
kono
parents:
diff changeset
1075 s += abase_x[n] * bbase_y[n];
kono
parents:
diff changeset
1076 dest_y[x] = s;
kono
parents:
diff changeset
1077 }
kono
parents:
diff changeset
1078 }
kono
parents:
diff changeset
1079 }
kono
parents:
diff changeset
1080 else
kono
parents:
diff changeset
1081 {
kono
parents:
diff changeset
1082 const GFC_COMPLEX_4 *restrict bbase_y;
kono
parents:
diff changeset
1083 GFC_COMPLEX_4 s;
kono
parents:
diff changeset
1084
kono
parents:
diff changeset
1085 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
1086 {
kono
parents:
diff changeset
1087 bbase_y = &bbase[y*bystride];
kono
parents:
diff changeset
1088 s = (GFC_COMPLEX_4) 0;
kono
parents:
diff changeset
1089 for (n = 0; n < count; n++)
kono
parents:
diff changeset
1090 s += abase[n*axstride] * bbase_y[n];
kono
parents:
diff changeset
1091 dest[y*rystride] = s;
kono
parents:
diff changeset
1092 }
kono
parents:
diff changeset
1093 }
kono
parents:
diff changeset
1094 }
kono
parents:
diff changeset
1095 else if (axstride < aystride)
kono
parents:
diff changeset
1096 {
kono
parents:
diff changeset
1097 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
1098 for (x = 0; x < xcount; x++)
kono
parents:
diff changeset
1099 dest[x*rxstride + y*rystride] = (GFC_COMPLEX_4)0;
kono
parents:
diff changeset
1100
kono
parents:
diff changeset
1101 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
1102 for (n = 0; n < count; n++)
kono
parents:
diff changeset
1103 for (x = 0; x < xcount; x++)
kono
parents:
diff changeset
1104 /* dest[x,y] += a[x,n] * b[n,y] */
kono
parents:
diff changeset
1105 dest[x*rxstride + y*rystride] +=
kono
parents:
diff changeset
1106 abase[x*axstride + n*aystride] *
kono
parents:
diff changeset
1107 bbase[n*bxstride + y*bystride];
kono
parents:
diff changeset
1108 }
kono
parents:
diff changeset
1109 else if (GFC_DESCRIPTOR_RANK (a) == 1)
kono
parents:
diff changeset
1110 {
kono
parents:
diff changeset
1111 const GFC_COMPLEX_4 *restrict bbase_y;
kono
parents:
diff changeset
1112 GFC_COMPLEX_4 s;
kono
parents:
diff changeset
1113
kono
parents:
diff changeset
1114 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
1115 {
kono
parents:
diff changeset
1116 bbase_y = &bbase[y*bystride];
kono
parents:
diff changeset
1117 s = (GFC_COMPLEX_4) 0;
kono
parents:
diff changeset
1118 for (n = 0; n < count; n++)
kono
parents:
diff changeset
1119 s += abase[n*axstride] * bbase_y[n*bxstride];
kono
parents:
diff changeset
1120 dest[y*rxstride] = s;
kono
parents:
diff changeset
1121 }
kono
parents:
diff changeset
1122 }
kono
parents:
diff changeset
1123 else
kono
parents:
diff changeset
1124 {
kono
parents:
diff changeset
1125 const GFC_COMPLEX_4 *restrict abase_x;
kono
parents:
diff changeset
1126 const GFC_COMPLEX_4 *restrict bbase_y;
kono
parents:
diff changeset
1127 GFC_COMPLEX_4 *restrict dest_y;
kono
parents:
diff changeset
1128 GFC_COMPLEX_4 s;
kono
parents:
diff changeset
1129
kono
parents:
diff changeset
1130 for (y = 0; y < ycount; y++)
kono
parents:
diff changeset
1131 {
kono
parents:
diff changeset
1132 bbase_y = &bbase[y*bystride];
kono
parents:
diff changeset
1133 dest_y = &dest[y*rystride];
kono
parents:
diff changeset
1134 for (x = 0; x < xcount; x++)
kono
parents:
diff changeset
1135 {
kono
parents:
diff changeset
1136 abase_x = &abase[x*axstride];
kono
parents:
diff changeset
1137 s = (GFC_COMPLEX_4) 0;
kono
parents:
diff changeset
1138 for (n = 0; n < count; n++)
kono
parents:
diff changeset
1139 s += abase_x[n*aystride] * bbase_y[n*bxstride];
kono
parents:
diff changeset
1140 dest_y[x*rxstride] = s;
kono
parents:
diff changeset
1141 }
kono
parents:
diff changeset
1142 }
kono
parents:
diff changeset
1143 }
kono
parents:
diff changeset
1144 }
kono
parents:
diff changeset
1145 #undef POW3
kono
parents:
diff changeset
1146 #undef min
kono
parents:
diff changeset
1147 #undef max
kono
parents:
diff changeset
1148
kono
parents:
diff changeset
1149 #endif
kono
parents:
diff changeset
1150
kono
parents:
diff changeset
1151 #endif
kono
parents:
diff changeset
1152