/*! \file conv_acc_sse_impl.h
 * Accelerated Viterbi decoder implementation:
 * Actual definitions which are being included
 * from both conv_acc_sse.c and conv_acc_sse_avx.c. */
/*
 * Copyright (C) 2013, 2014 Thomas Tsou <tom@tsou.cc>
 *
 * All Rights Reserved
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */

extern int sse41_supported;

/* Octo-Viterbi butterfly
 * Compute 8-wide butterfly generating 16 path decisions and 16 accumulated
 * sums. Inputs are all packed 16-bit integers in three 128-bit XMM registers.
 * Two intermediate registers are used and results are set in the upper 4
 * registers.
 *
 * Input:
 * M0 - Path metrics 0 (packed 16-bit integers)
 * M1 - Path metrics 1 (packed 16-bit integers)
 * M2 - Branch metrics (packed 16-bit integers)
 *
 * Output:
 * M2 - Selected and accumulated path metrics 0
 * M4 - Selected and accumulated path metrics 1
 * M3 - Path selections 0
 * M1 - Path selections 1
 */
#define SSE_BUTTERFLY(M0, M1, M2, M3, M4) \
{ \
	M3 = _mm_adds_epi16(M0, M2); \
	M4 = _mm_subs_epi16(M1, M2); \
	M0 = _mm_subs_epi16(M0, M2); \
	M1 = _mm_adds_epi16(M1, M2); \
	M2 = _mm_max_epi16(M3, M4); \
	M3 = _mm_or_si128(_mm_cmpgt_epi16(M3, M4), _mm_cmpeq_epi16(M3, M4)); \
	M4 = _mm_max_epi16(M0, M1); \
	M1 = _mm_or_si128(_mm_cmpgt_epi16(M0, M1), _mm_cmpeq_epi16(M0, M1)); \
}
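
/* Illustrative scalar reference for a single lane of SSE_BUTTERFLY (an
 * add-compare-select step). This helper is an added sketch and is not used
 * by the decoder; the SIMD macro uses saturating adds/subs, plain arithmetic
 * is shown here for readability. */
static inline void scalar_butterfly_sketch(int16_t pm0, int16_t pm1, int16_t bm,
					   int16_t *sum0, int16_t *sel0,
					   int16_t *sum1, int16_t *sel1)
{
	int16_t a = pm0 + bm;	/* M3 = M0 + M2 */
	int16_t b = pm1 - bm;	/* M4 = M1 - M2 */
	int16_t c = pm0 - bm;	/* M0 = M0 - M2 */
	int16_t d = pm1 + bm;	/* M1 = M1 + M2 */

	*sum0 = a > b ? a : b;		/* M2 = max(M3, M4) */
	*sel0 = a >= b ? -1 : 0;	/* M3 = (cmpgt | cmpeq) mask */
	*sum1 = c > d ? c : d;		/* M4 = max(M0, M1) */
	*sel1 = c >= d ? -1 : 0;	/* M1 = (cmpgt | cmpeq) mask */
}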

/* Two lane deinterleaving K = 5:
 * Take 16 interleaved 16-bit integers and deinterleave to 2 packed 128-bit
 * registers. The operation is summarized below. Four registers are used with
 * the lower 2 as input and upper 2 as output.
 *
 * In  - 10101010 10101010 10101010 10101010
 * Out - 00000000 11111111 00000000 11111111
 *
 * Input:
 * M0:1 - Packed 16-bit integers
 *
 * Output:
 * M2:3 - Deinterleaved packed 16-bit integers
 */
#define _I8_SHUFFLE_MASK 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0

#define SSE_DEINTERLEAVE_K5(M0, M1, M2, M3) \
{ \
	M2 = _mm_set_epi8(_I8_SHUFFLE_MASK); \
	M0 = _mm_shuffle_epi8(M0, M2); \
	M1 = _mm_shuffle_epi8(M1, M2); \
	M2 = _mm_unpacklo_epi64(M0, M1); \
	M3 = _mm_unpackhi_epi64(M0, M1); \
}
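
/* Illustrative scalar reference (added sketch, not used by the decoder):
 * the shuffle and unpack sequence above simply splits 16 interleaved
 * metrics into their even-indexed and odd-indexed lanes. */
static inline void scalar_deinterleave_sketch(const int16_t in[16],
					      int16_t even[8], int16_t odd[8])
{
	int i;

	for (i = 0; i < 8; i++) {
		even[i] = in[2 * i];
		odd[i] = in[2 * i + 1];
	}
}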

/* Two lane deinterleaving K = 7:
 * Take 64 interleaved 16-bit integers and deinterleave to 8 packed 128-bit
 * registers. The operation is summarized below. 16 registers are used with
 * the lower 8 as input and upper 8 as output.
 *
 * In  - 10101010 10101010 10101010 10101010 ...
 * Out - 00000000 11111111 00000000 11111111 ...
 *
 * Input:
 * M0:7 - Packed 16-bit integers
 *
 * Output:
 * M8:15 - Deinterleaved packed 16-bit integers
 */
#define SSE_DEINTERLEAVE_K7(M0, M1, M2, M3, M4, M5, M6, M7, \
	M8, M9, M10, M11, M12, M13, M14, M15) \
{ \
	M8 = _mm_set_epi8(_I8_SHUFFLE_MASK); \
	M0 = _mm_shuffle_epi8(M0, M8); \
	M1 = _mm_shuffle_epi8(M1, M8); \
	M2 = _mm_shuffle_epi8(M2, M8); \
	M3 = _mm_shuffle_epi8(M3, M8); \
	M4 = _mm_shuffle_epi8(M4, M8); \
	M5 = _mm_shuffle_epi8(M5, M8); \
	M6 = _mm_shuffle_epi8(M6, M8); \
	M7 = _mm_shuffle_epi8(M7, M8); \
	M8 = _mm_unpacklo_epi64(M0, M1); \
	M9 = _mm_unpackhi_epi64(M0, M1); \
	M10 = _mm_unpacklo_epi64(M2, M3); \
	M11 = _mm_unpackhi_epi64(M2, M3); \
	M12 = _mm_unpacklo_epi64(M4, M5); \
	M13 = _mm_unpackhi_epi64(M4, M5); \
	M14 = _mm_unpacklo_epi64(M6, M7); \
	M15 = _mm_unpackhi_epi64(M6, M7); \
}

/* Generate branch metrics N = 2:
 * Compute 16 branch metrics from trellis outputs and input values.
 *
 * Input:
 * M0:3 - 16 x 2 packed 16-bit trellis outputs
 * M4 - Expanded and packed 16-bit input value
 *
 * Output:
 * M6:7 - 16 computed 16-bit branch metrics
 */
#define SSE_BRANCH_METRIC_N2(M0, M1, M2, M3, M4, M6, M7) \
{ \
	M0 = _mm_sign_epi16(M4, M0); \
	M1 = _mm_sign_epi16(M4, M1); \
	M2 = _mm_sign_epi16(M4, M2); \
	M3 = _mm_sign_epi16(M4, M3); \
	M6 = _mm_hadds_epi16(M0, M1); \
	M7 = _mm_hadds_epi16(M2, M3); \
}
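
/* Illustrative scalar reference (added sketch, not used by the decoder):
 * a branch metric is the correlation of the expected trellis output with
 * the received soft values. With the expected outputs stored as -1/0/+1,
 * the multiply below mirrors _mm_sign_epi16() followed by the horizontal
 * add (the SIMD code uses saturating adds). */
static inline int16_t scalar_branch_metric_sketch(const int16_t *expected,
						  const int16_t *received,
						  int n)
{
	int16_t metric = 0;
	int i;

	for (i = 0; i < n; i++)
		metric += expected[i] * received[i];

	return metric;
}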

/* Generate branch metrics N = 4:
 * Compute 8 branch metrics from trellis outputs and input values. This
 * macro is reused for N less than 4 where the extra soft input bits are
 * padded.
 *
 * Input:
 * M0:3 - 8 x 4 packed 16-bit trellis outputs
 * M4 - Expanded and packed 16-bit input value
 *
 * Output:
 * M5 - 8 computed 16-bit branch metrics
 */
#define SSE_BRANCH_METRIC_N4(M0, M1, M2, M3, M4, M5) \
{ \
	M0 = _mm_sign_epi16(M4, M0); \
	M1 = _mm_sign_epi16(M4, M1); \
	M2 = _mm_sign_epi16(M4, M2); \
	M3 = _mm_sign_epi16(M4, M3); \
	M0 = _mm_hadds_epi16(M0, M1); \
	M1 = _mm_hadds_epi16(M2, M3); \
	M5 = _mm_hadds_epi16(M0, M1); \
}

/* Horizontal minimum
 * Compute horizontal minimum of packed unsigned 16-bit integers and place
 * result in the low 16-bit element of the source register. Only SSE 4.1
 * has a dedicated minpos instruction. One intermediate register is used
 * if SSE 4.1 is not available. This is a destructive operation and the
 * source register is overwritten.
 *
 * Input:
 * M0 - Packed unsigned 16-bit integers
 *
 * Output:
 * M0 - Minimum value placed in low 16-bit element
 */
#if defined(HAVE_SSE4_1) || defined(HAVE_SSE41)
#define SSE_MINPOS(M0, M1) \
{ \
	if (sse41_supported) { \
		M0 = _mm_minpos_epu16(M0); \
	} else { \
		M1 = _mm_shuffle_epi32(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
		M0 = _mm_min_epi16(M0, M1); \
		M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
		M0 = _mm_min_epi16(M0, M1); \
		M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 0, 1)); \
		M0 = _mm_min_epi16(M0, M1); \
	} \
}
#else
#define SSE_MINPOS(M0, M1) \
{ \
	M1 = _mm_shuffle_epi32(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
	M0 = _mm_min_epi16(M0, M1); \
	M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
	M0 = _mm_min_epi16(M0, M1); \
	M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 0, 1)); \
	M0 = _mm_min_epi16(M0, M1); \
}
#endif
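
/* Illustrative scalar reference (added sketch, not used by the decoder):
 * the non-SSE4.1 fallback above performs the same 8-lane minimum reduction
 * in three shuffle/min steps. */
static inline int16_t scalar_hmin8_sketch(const int16_t *m)
{
	int16_t min = m[0];
	int i;

	for (i = 1; i < 8; i++) {
		if (m[i] < min)
			min = m[i];
	}

	return min;
}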

/* Normalize state metrics K = 5:
 * Compute 16-wide normalization by subtracting the smallest value from
 * all values. Inputs are 16 packed 16-bit integers across 2 XMM registers.
 * Two intermediate registers are used and normalized results are placed
 * in the originating locations.
 *
 * Input:
 * M0:1 - Path metrics 0:1 (packed 16-bit integers)
 *
 * Output:
 * M0:1 - Normalized path metrics 0:1
 */
#define SSE_NORMALIZE_K5(M0, M1, M2, M3) \
{ \
	M2 = _mm_min_epi16(M0, M1); \
	SSE_MINPOS(M2, M3) \
	SSE_BROADCAST(M2) \
	M0 = _mm_subs_epi16(M0, M2); \
	M1 = _mm_subs_epi16(M1, M2); \
}
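
/* Illustrative scalar reference (added sketch, not used by the decoder):
 * normalization subtracts the current minimum (cf. the horizontal minimum
 * above) from every metric, so that the saturating 16-bit accumulation
 * does not clip over long code blocks. */
static inline void scalar_normalize_sketch(int16_t *sums, int num)
{
	int16_t min = sums[0];
	int i;

	for (i = 1; i < num; i++) {
		if (sums[i] < min)
			min = sums[i];
	}

	for (i = 0; i < num; i++)
		sums[i] -= min;
}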

/* Normalize state metrics K = 7:
 * Compute 64-wide normalization by subtracting the smallest value from
 * all values. Inputs are 8 registers of accumulated sums and 4 temporary
 * registers. Normalized results are returned in the originating locations.
 *
 * Input:
 * M0:7 - Path metrics 0:7 (packed 16-bit integers)
 *
 * Output:
 * M0:7 - Normalized path metrics 0:7
 */
#define SSE_NORMALIZE_K7(M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, M10, M11) \
{ \
	M8 = _mm_min_epi16(M0, M1); \
	M9 = _mm_min_epi16(M2, M3); \
	M10 = _mm_min_epi16(M4, M5); \
	M11 = _mm_min_epi16(M6, M7); \
	M8 = _mm_min_epi16(M8, M9); \
	M10 = _mm_min_epi16(M10, M11); \
	M8 = _mm_min_epi16(M8, M10); \
	SSE_MINPOS(M8, M9) \
	SSE_BROADCAST(M8) \
	M0 = _mm_subs_epi16(M0, M8); \
	M1 = _mm_subs_epi16(M1, M8); \
	M2 = _mm_subs_epi16(M2, M8); \
	M3 = _mm_subs_epi16(M3, M8); \
	M4 = _mm_subs_epi16(M4, M8); \
	M5 = _mm_subs_epi16(M5, M8); \
	M6 = _mm_subs_epi16(M6, M8); \
	M7 = _mm_subs_epi16(M7, M8); \
}

/* Combined BMU/PMU (K=5, N=2)
 * Compute branch metrics followed by path metrics for the half rate 16-state
 * trellis. 8 butterflies are computed. Accumulated path sums are not
 * preserved; they are read from and written back to the same memory
 * location. Normalize sums if required.
 */
__always_inline static void _sse_metrics_k5_n2(const int16_t *val,
	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
	__m128i m0, m1, m2, m3, m4, m5, m6;

	/* (BMU) Load input sequence */
	m2 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));

	/* (BMU) Load trellis outputs */
	m0 = _mm_load_si128((__m128i *) &out[0]);
	m1 = _mm_load_si128((__m128i *) &out[8]);

	/* (BMU) Compute branch metrics */
	m0 = _mm_sign_epi16(m2, m0);
	m1 = _mm_sign_epi16(m2, m1);
	m2 = _mm_hadds_epi16(m0, m1);

	/* (PMU) Load accumulated path metrics */
	m0 = _mm_load_si128((__m128i *) &sums[0]);
	m1 = _mm_load_si128((__m128i *) &sums[8]);

	SSE_DEINTERLEAVE_K5(m0, m1, m3, m4)

	/* (PMU) Butterflies: 0-7 */
	SSE_BUTTERFLY(m3, m4, m2, m5, m6)

	if (norm)
		SSE_NORMALIZE_K5(m2, m6, m0, m1)

	_mm_store_si128((__m128i *) &sums[0], m2);
	_mm_store_si128((__m128i *) &sums[8], m6);
	_mm_store_si128((__m128i *) &paths[0], m5);
	_mm_store_si128((__m128i *) &paths[8], m4);
}
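
/* Usage sketch (illustrative only; buffer names are hypothetical and sizes
 * are inferred from the loads and stores above): a K=5 trellis has 16 states,
 * so one step touches 16 path sums, 16 path decisions and 16 trellis output
 * values. The out, sums and paths buffers must be 16-byte aligned, since
 * _mm_load_si128()/_mm_store_si128() require aligned access, while val is
 * read with a 64-bit load (four 16-bit values), e.g.:
 *
 *	int16_t sums[16] __attribute__((aligned(16)));
 *	int16_t paths[16] __attribute__((aligned(16)));
 *	int16_t outputs[16] __attribute__((aligned(16)));
 *	int16_t symbols[4];
 *
 *	_sse_metrics_k5_n2(symbols, outputs, sums, paths, 1);
 *
 * The K=7 variants below follow the same pattern with 64 states per step. */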

/* Combined BMU/PMU (K=5, N=3 and N=4)
 * Compute branch metrics followed by path metrics for the 16-state trellis
 * at rates up to 1/4. 8 butterflies are computed. The input sequence is read
 * four 16-bit values at a time, and extra values should be set to zero for
 * rates other than 1/4. Normally only rates 1/3 and 1/4 are used, as there
 * is a dedicated implementation for rate 1/2.
 */
__always_inline static void _sse_metrics_k5_n4(const int16_t *val,
	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
	__m128i m0, m1, m2, m3, m4, m5, m6;

	/* (BMU) Load input sequence */
	m4 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));

	/* (BMU) Load trellis outputs */
	m0 = _mm_load_si128((__m128i *) &out[0]);
	m1 = _mm_load_si128((__m128i *) &out[8]);
	m2 = _mm_load_si128((__m128i *) &out[16]);
	m3 = _mm_load_si128((__m128i *) &out[24]);

	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m4, m2)

	/* (PMU) Load accumulated path metrics */
	m0 = _mm_load_si128((__m128i *) &sums[0]);
	m1 = _mm_load_si128((__m128i *) &sums[8]);

	SSE_DEINTERLEAVE_K5(m0, m1, m3, m4)

	/* (PMU) Butterflies: 0-7 */
	SSE_BUTTERFLY(m3, m4, m2, m5, m6)

	if (norm)
		SSE_NORMALIZE_K5(m2, m6, m0, m1)

	_mm_store_si128((__m128i *) &sums[0], m2);
	_mm_store_si128((__m128i *) &sums[8], m6);
	_mm_store_si128((__m128i *) &paths[0], m5);
	_mm_store_si128((__m128i *) &paths[8], m4);
}

/* Combined BMU/PMU (K=7, N=2)
 * Compute branch metrics followed by path metrics for the half rate 64-state
 * trellis. 32 butterfly operations are computed. Deinterleaving the path
 * metrics requires the full SSE register file, so the sums are separated
 * before the branch metrics are computed to avoid register spilling.
 */
__always_inline static void _sse_metrics_k7_n2(const int16_t *val,
	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
	__m128i m0, m1, m2, m3, m4, m5, m6, m7, m8,
		m9, m10, m11, m12, m13, m14, m15;

	/* (PMU) Load accumulated path metrics */
	m0 = _mm_load_si128((__m128i *) &sums[0]);
	m1 = _mm_load_si128((__m128i *) &sums[8]);
	m2 = _mm_load_si128((__m128i *) &sums[16]);
	m3 = _mm_load_si128((__m128i *) &sums[24]);
	m4 = _mm_load_si128((__m128i *) &sums[32]);
	m5 = _mm_load_si128((__m128i *) &sums[40]);
	m6 = _mm_load_si128((__m128i *) &sums[48]);
	m7 = _mm_load_si128((__m128i *) &sums[56]);

	/* (PMU) Deinterleave to even-odd registers */
	SSE_DEINTERLEAVE_K7(m0, m1, m2, m3, m4, m5, m6, m7,
			    m8, m9, m10, m11, m12, m13, m14, m15)

	/* (BMU) Load input symbols */
	m7 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));

	/* (BMU) Load trellis outputs */
	m0 = _mm_load_si128((__m128i *) &out[0]);
	m1 = _mm_load_si128((__m128i *) &out[8]);
	m2 = _mm_load_si128((__m128i *) &out[16]);
	m3 = _mm_load_si128((__m128i *) &out[24]);

	SSE_BRANCH_METRIC_N2(m0, m1, m2, m3, m7, m4, m5)

	m0 = _mm_load_si128((__m128i *) &out[32]);
	m1 = _mm_load_si128((__m128i *) &out[40]);
	m2 = _mm_load_si128((__m128i *) &out[48]);
	m3 = _mm_load_si128((__m128i *) &out[56]);

	SSE_BRANCH_METRIC_N2(m0, m1, m2, m3, m7, m6, m7)

	/* (PMU) Butterflies: 0-15 */
	SSE_BUTTERFLY(m8, m9, m4, m0, m1)
	SSE_BUTTERFLY(m10, m11, m5, m2, m3)

	_mm_store_si128((__m128i *) &paths[0], m0);
	_mm_store_si128((__m128i *) &paths[8], m2);
	_mm_store_si128((__m128i *) &paths[32], m9);
	_mm_store_si128((__m128i *) &paths[40], m11);

	/* (PMU) Butterflies: 16-31 */
	SSE_BUTTERFLY(m12, m13, m6, m0, m2)
	SSE_BUTTERFLY(m14, m15, m7, m9, m11)

	_mm_store_si128((__m128i *) &paths[16], m0);
	_mm_store_si128((__m128i *) &paths[24], m9);
	_mm_store_si128((__m128i *) &paths[48], m13);
	_mm_store_si128((__m128i *) &paths[56], m15);

	if (norm)
		SSE_NORMALIZE_K7(m4, m1, m5, m3, m6, m2,
				 m7, m11, m0, m8, m9, m10)

	_mm_store_si128((__m128i *) &sums[0], m4);
	_mm_store_si128((__m128i *) &sums[8], m5);
	_mm_store_si128((__m128i *) &sums[16], m6);
	_mm_store_si128((__m128i *) &sums[24], m7);
	_mm_store_si128((__m128i *) &sums[32], m1);
	_mm_store_si128((__m128i *) &sums[40], m3);
	_mm_store_si128((__m128i *) &sums[48], m2);
	_mm_store_si128((__m128i *) &sums[56], m11);
}

/* Combined BMU/PMU (K=7, N=3 and N=4)
 * Compute branch metrics followed by path metrics for the 64-state trellis
 * at rates up to 1/4. 32 butterfly operations are computed. Deinterleave the
 * path metrics before computing the branch metrics, as in the half rate case.
 */
__always_inline static void _sse_metrics_k7_n4(const int16_t *val,
	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
	__m128i m0, m1, m2, m3, m4, m5, m6, m7;
	__m128i m8, m9, m10, m11, m12, m13, m14, m15;

	/* (PMU) Load accumulated path metrics */
	m0 = _mm_load_si128((__m128i *) &sums[0]);
	m1 = _mm_load_si128((__m128i *) &sums[8]);
	m2 = _mm_load_si128((__m128i *) &sums[16]);
	m3 = _mm_load_si128((__m128i *) &sums[24]);
	m4 = _mm_load_si128((__m128i *) &sums[32]);
	m5 = _mm_load_si128((__m128i *) &sums[40]);
	m6 = _mm_load_si128((__m128i *) &sums[48]);
	m7 = _mm_load_si128((__m128i *) &sums[56]);

	/* (PMU) Deinterleave into even and odd packed registers */
	SSE_DEINTERLEAVE_K7(m0, m1, m2, m3, m4, m5, m6, m7,
			    m8, m9, m10, m11, m12, m13, m14, m15)

	/* (BMU) Load input sequence */
	m7 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));

	/* (BMU) Load and compute branch metrics */
	m0 = _mm_load_si128((__m128i *) &out[0]);
	m1 = _mm_load_si128((__m128i *) &out[8]);
	m2 = _mm_load_si128((__m128i *) &out[16]);
	m3 = _mm_load_si128((__m128i *) &out[24]);

	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m4)

	m0 = _mm_load_si128((__m128i *) &out[32]);
	m1 = _mm_load_si128((__m128i *) &out[40]);
	m2 = _mm_load_si128((__m128i *) &out[48]);
	m3 = _mm_load_si128((__m128i *) &out[56]);

	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m5)

	m0 = _mm_load_si128((__m128i *) &out[64]);
	m1 = _mm_load_si128((__m128i *) &out[72]);
	m2 = _mm_load_si128((__m128i *) &out[80]);
	m3 = _mm_load_si128((__m128i *) &out[88]);

	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m6)

	m0 = _mm_load_si128((__m128i *) &out[96]);
	m1 = _mm_load_si128((__m128i *) &out[104]);
	m2 = _mm_load_si128((__m128i *) &out[112]);
	m3 = _mm_load_si128((__m128i *) &out[120]);

	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m7)

	/* (PMU) Butterflies: 0-15 */
	SSE_BUTTERFLY(m8, m9, m4, m0, m1)
	SSE_BUTTERFLY(m10, m11, m5, m2, m3)

	_mm_store_si128((__m128i *) &paths[0], m0);
	_mm_store_si128((__m128i *) &paths[8], m2);
	_mm_store_si128((__m128i *) &paths[32], m9);
	_mm_store_si128((__m128i *) &paths[40], m11);

	/* (PMU) Butterflies: 16-31 */
	SSE_BUTTERFLY(m12, m13, m6, m0, m2)
	SSE_BUTTERFLY(m14, m15, m7, m9, m11)

	_mm_store_si128((__m128i *) &paths[16], m0);
	_mm_store_si128((__m128i *) &paths[24], m9);
	_mm_store_si128((__m128i *) &paths[48], m13);
	_mm_store_si128((__m128i *) &paths[56], m15);

	if (norm)
		SSE_NORMALIZE_K7(m4, m1, m5, m3, m6, m2,
				 m7, m11, m0, m8, m9, m10)

	_mm_store_si128((__m128i *) &sums[0], m4);
	_mm_store_si128((__m128i *) &sums[8], m5);
	_mm_store_si128((__m128i *) &sums[16], m6);
	_mm_store_si128((__m128i *) &sums[24], m7);
	_mm_store_si128((__m128i *) &sums[32], m1);
	_mm_store_si128((__m128i *) &sums[40], m3);
	_mm_store_si128((__m128i *) &sums[48], m2);
	_mm_store_si128((__m128i *) &sums[56], m11);
}