<T>LAPACK 0.1.1
C++ Template Linear Algebra PACKage
Loading...
Searching...
No Matches
rot_sequence3.hpp
Go to the documentation of this file.
1
3//
4// Copyright (c) 2021-2023, University of Colorado Denver. All rights reserved.
5//
6// This file is part of <T>LAPACK.
7// <T>LAPACK is free software: you can redistribute it and/or modify it under
8// the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
9
10#ifndef TLAPACK_ROT_SEQUENCE3_HH
11#define TLAPACK_ROT_SEQUENCE3_HH
12
14#include "tlapack/blas/rot.hpp"
15
16namespace tlapack {
17
87template <TLAPACK_SIDE side_t,
88 TLAPACK_DIRECTION direction_t,
93 side_t side, direction_t direction, const C_t& C, const S_t& S, A_t& A)
94{
95 using T = type_t<A_t>;
96 using idx_t = size_type<A_t>;
97
98 // constants
99 const idx_t m = nrows(A);
100 const idx_t n = ncols(A);
101 const idx_t k = (side == Side::Left) ? m - 1 : n - 1;
102 const idx_t l = ncols(C);
103
104 // Blocking parameter
105 const idx_t nb = 256;
106
107 // Check dimensions
108 tlapack_check((idx_t)ncols(S) == l);
109 tlapack_check((idx_t)nrows(C) == k);
110 tlapack_check((idx_t)nrows(S) == k);
111 // tlapack_check(k >= l);
112
113 // quick return
114 if (k < 1 or l < 1) return;
115
116 // If the dimensions don't allow for effective pipelining,
117 // then apply the rotations using rot_sequence
118 if (l == 1 or k < l) {
119 for (idx_t j = 0; j < l; ++j) {
120 auto c = col(C, j);
121 auto s = col(S, j);
122 rot_sequence(side, direction, c, s, A);
123 }
124 return;
125 }
126
127 // Apply rotations
128 if constexpr (layout<A_t> == Layout::ColMajor) {
129 if (side == Side::Left) {
130 if (direction == Direction::Forward) {
131 // Left side, forward direction
132#pragma omp parallel for
133 for (idx_t ib = 0; ib < n; ib += nb) {
134 idx_t ib2 = std::min(ib + nb, n);
135 // Startup phase
136 for (idx_t i1 = 0; i1 < n; ++i1) {
137 for (idx_t j = 0; j < l - 1; ++j) {
138 for (idx_t i = 0, g2 = j; i < j + 1; ++i, --g2) {
139 idx_t g = m - 2 - g2;
140 T temp =
141 C(g, i) * A(g, i1) + S(g, i) * A(g + 1, i1);
142 A(g + 1, i1) = -conj(S(g, i)) * A(g, i1) +
143 C(g, i) * A(g + 1, i1);
144 A(g, i1) = temp;
145 }
146 }
147 }
148 // Pipeline phase
149 for (idx_t i1 = ib; i1 < ib2; ++i1) {
150 for (idx_t j = l - 1; j + 1 < m - 1; j += 2) {
151 for (idx_t i = 0, g2 = j; i + 1 < l;
152 i += 2, g2 -= 2) {
153 idx_t g = m - 2 - g2;
154 //
155 // Apply first rotation
156 //
157
158 // A(g,i1) after first rotation
159 T temp1 =
160 C(g, i) * A(g, i1) + S(g, i) * A(g + 1, i1);
161 // A(g+1,i1) after first rotation
162 T temp2 = -conj(S(g, i)) * A(g, i1) +
163 C(g, i) * A(g + 1, i1);
164
165 //
166 // Apply second rotation
167 //
168
169 // A(g,i1) after second rotation
170 T temp3 = -conj(S(g - 1, i)) * A(g - 1, i1) +
171 C(g - 1, i) * temp1;
172 A(g - 1, i1) = C(g - 1, i) * A(g - 1, i1) +
173 S(g - 1, i) * temp1;
174
175 //
176 // Apply third rotation
177 //
178
179 // A(g+1,i1) after third rotation
180 T temp4 = C(g + 1, i + 1) * temp2 +
181 S(g + 1, i + 1) * A(g + 2, i1);
182 A(g + 2, i1) = -conj(S(g + 1, i + 1)) * temp2 +
183 C(g + 1, i + 1) * A(g + 2, i1);
184
185 // Apply fourth rotation
186 A(g, i1) =
187 C(g, i + 1) * temp3 + S(g, i + 1) * temp4;
188 A(g + 1, i1) = -conj(S(g, i + 1)) * temp3 +
189 C(g, i + 1) * temp4;
190 }
191
192 if (l % 2 == 1) {
193 // Apply two more rotations that could not be
194 // fused
195 idx_t i = l - 1;
196 idx_t g2 = j - (l - 1);
197 idx_t g = m - 2 - g2;
198
199 // Apply first rotation
200 T temp =
201 C(g, i) * A(g, i1) + S(g, i) * A(g + 1, i1);
202 A(g + 1, i1) = -conj(S(g, i)) * A(g, i1) +
203 C(g, i) * A(g + 1, i1);
204 A(g, i1) = temp;
205
206 // Apply second rotation
207 T temp2 = C(g - 1, i) * A(g - 1, i1) +
208 S(g - 1, i) * A(g, i1);
209 A(g, i1) = -conj(S(g - 1, i)) * A(g - 1, i1) +
210 C(g - 1, i) * A(g, i1);
211 A(g - 1, i1) = temp2;
212 }
213 }
214 }
215 // Shutdown phase
216 for (idx_t i1 = ib; i1 < ib2; ++i1) {
217 for (idx_t j = ((m - l + 1) % 2); j < l; ++j) {
218 for (idx_t i = j, g2 = m - 2; i < l; ++i, --g2) {
219 idx_t g = m - 2 - g2;
220 T temp =
221 C(g, i) * A(g, i1) + S(g, i) * A(g + 1, i1);
222 A(g + 1, i1) = -conj(S(g, i)) * A(g, i1) +
223 C(g, i) * A(g + 1, i1);
224 A(g, i1) = temp;
225 }
226 }
227 }
228 }
229 }
230 else {
231 // Left side, backward direction
232#pragma omp parallel for
233 for (idx_t ib = 0; ib < n; ib += nb) {
234 idx_t ib2 = std::min(ib + nb, n);
235 // Startup phase
236 for (idx_t i1 = ib; i1 < ib2; ++i1) {
237 for (idx_t j = 0; j < l - 1; ++j) {
238 for (idx_t i = 0, g = j; i < j + 1; ++i, --g) {
239 T temp =
240 C(g, i) * A(g, i1) + S(g, i) * A(g + 1, i1);
241 A(g + 1, i1) = -conj(S(g, i)) * A(g, i1) +
242 C(g, i) * A(g + 1, i1);
243 A(g, i1) = temp;
244 }
245 }
246 }
247 // Pipeline phase
248 for (idx_t i1 = ib; i1 < ib2; ++i1) {
249 for (idx_t j = l - 1; j + 1 < m - 1; j += 2) {
250 for (idx_t i = 0, g = j; i + 1 < l;
251 i += 2, g -= 2) {
252 //
253 // Apply first rotation
254 //
255
256 // A(g,i1) after first rotation
257 T temp1 =
258 C(g, i) * A(g, i1) + S(g, i) * A(g + 1, i1);
259 // A(g+1,i1) after first rotation
260 T temp2 = -conj(S(g, i)) * A(g, i1) +
261 C(g, i) * A(g + 1, i1);
262
263 //
264 // Apply second rotation
265 //
266
267 // A(g+1,i1) after second rotation
268 T temp3 = C(g + 1, i) * temp2 +
269 S(g + 1, i) * A(g + 2, i1);
270 A(g + 2, i1) = -conj(S(g + 1, i)) * temp2 +
271 C(g + 1, i) * A(g + 2, i1);
272
273 //
274 // Apply third rotation
275 //
276
277 // A(g,i1) after third rotation
278 T temp4 =
279 -conj(S(g - 1, i + 1)) * A(g - 1, i1) +
280 C(g - 1, i + 1) * temp1;
281 A(g - 1, i1) = C(g - 1, i + 1) * A(g - 1, i1) +
282 S(g - 1, i + 1) * temp1;
283
284 // Apply fourth rotation
285 A(g, i1) =
286 C(g, i + 1) * temp4 + S(g, i + 1) * temp3;
287 A(g + 1, i1) = -conj(S(g, i + 1)) * temp4 +
288 C(g, i + 1) * temp3;
289 }
290
291 if (l % 2 == 1) {
292 // Apply two more rotations that could not be
293 // fused
294 idx_t i = l - 1;
295 idx_t g = j - (l - 1);
296
297 // Apply first rotation
298 T temp =
299 C(g, i) * A(g, i1) + S(g, i) * A(g + 1, i1);
300 A(g + 1, i1) = -conj(S(g, i)) * A(g, i1) +
301 C(g, i) * A(g + 1, i1);
302 A(g, i1) = temp;
303
304 // Apply second rotation
305 T temp2 = C(g + 1, i) * A(g + 1, i1) +
306 S(g + 1, i) * A(g + 2, i1);
307 A(g + 2, i1) =
308 -conj(S(g + 1, i)) * A(g + 1, i1) +
309 C(g + 1, i) * A(g + 2, i1);
310 A(g + 1, i1) = temp2;
311 }
312 }
313 }
314 // Shutdown phase
315 for (idx_t i1 = ib; i1 < ib2; ++i1) {
316 for (idx_t j = ((m - l + 1) % 2); j < l; ++j) {
317 for (idx_t i = j, g = m - 2; i < l; ++i, --g) {
318 T temp =
319 C(g, i) * A(g, i1) + S(g, i) * A(g + 1, i1);
320 A(g + 1, i1) = -conj(S(g, i)) * A(g, i1) +
321 C(g, i) * A(g + 1, i1);
322 A(g, i1) = temp;
323 }
324 }
325 }
326 }
327 }
328 }
329 else {
330 if (direction == Direction::Forward) {
331 // Right side, forward direction
332#pragma omp parallel for
333 for (idx_t ib = 0; ib < m; ib += nb) {
334 idx_t ib2 = std::min(ib + nb, m);
335 // Startup phase
336 for (idx_t j = 0; j < l - 1; ++j) {
337 for (idx_t i = 0, g2 = j; i < j + 1; ++i, --g2) {
338 idx_t g = n - 2 - g2;
339 for (idx_t i1 = ib; i1 < ib2; ++i1) {
340 T temp = C(g, i) * A(i1, g) +
341 conj(S(g, i)) * A(i1, g + 1);
342 A(i1, g + 1) = -S(g, i) * A(i1, g) +
343 C(g, i) * A(i1, g + 1);
344 A(i1, g) = temp;
345 }
346 }
347 }
348 // Pipeline phase
349 for (idx_t j = l - 1; j + 1 < n - 1; j += 2) {
350 for (idx_t i = 0, g2 = j; i + 1 < l; i += 2, g2 -= 2) {
351 idx_t g = n - 2 - g2;
352 for (idx_t i1 = ib; i1 < ib2; ++i1) {
353 //
354 // Apply first rotation
355 //
356
357 // A(i1,g) after first rotation
358 T temp1 = C(g, i) * A(i1, g) +
359 conj(S(g, i)) * A(i1, g + 1);
360 // A(i1,g+1) after first rotation
361 T temp2 = -S(g, i) * A(i1, g) +
362 C(g, i) * A(i1, g + 1);
363
364 //
365 // Apply second rotation
366 //
367
368 // A(i1,g) after second rotation
369 T temp3 = -S(g - 1, i) * A(i1, g - 1) +
370 C(g - 1, i) * temp1;
371 A(i1, g - 1) = C(g - 1, i) * A(i1, g - 1) +
372 conj(S(g - 1, i)) * temp1;
373 //
374 // Apply third rotation
375 //
376
377 // A(i1,g+1) after third rotation
378 T temp4 = C(g + 1, i + 1) * temp2 +
379 conj(S(g + 1, i + 1)) * A(i1, g + 2);
380 A(i1, g + 2) = -S(g + 1, i + 1) * temp2 +
381 C(g + 1, i + 1) * A(i1, g + 2);
382
383 //
384 // Apply fourth rotation
385 //
386
387 A(i1, g) = C(g, i + 1) * temp3 +
388 conj(S(g, i + 1)) * temp4;
389 A(i1, g + 1) =
390 -S(g, i + 1) * temp3 + C(g, i + 1) * temp4;
391 }
392 }
393 if (l % 2 == 1) {
394 // Apply two more rotations that could not be fused
395 idx_t i = l - 1;
396 idx_t g2 = j - (l - 1);
397 idx_t g = n - 2 - g2;
398
399 for (idx_t i1 = ib; i1 < ib2; ++i1) {
400 // Apply first rotation
401 T temp = C(g, i) * A(i1, g) +
402 conj(S(g, i)) * A(i1, g + 1);
403 A(i1, g + 1) = -S(g, i) * A(i1, g) +
404 C(g, i) * A(i1, g + 1);
405 A(i1, g) = temp;
406
407 // Apply second rotation
408 T temp2 = C(g - 1, i) * A(i1, g - 1) +
409 conj(S(g - 1, i)) * A(i1, g);
410 A(i1, g) = -S(g - 1, i) * A(i1, g - 1) +
411 C(g - 1, i) * A(i1, g);
412 A(i1, g - 1) = temp2;
413 }
414 }
415 }
416 // Shutdown phase
417 for (idx_t j = ((n - l + 1) % 2); j < l; ++j) {
418 for (idx_t i = j, g2 = n - 2; i < l; ++i, --g2) {
419 idx_t g = n - 2 - g2;
420 for (idx_t i1 = ib; i1 < ib2; ++i1) {
421 T temp = C(g, i) * A(i1, g) +
422 conj(S(g, i)) * A(i1, g + 1);
423 A(i1, g + 1) = -S(g, i) * A(i1, g) +
424 C(g, i) * A(i1, g + 1);
425 A(i1, g) = temp;
426 }
427 }
428 }
429 }
430 }
431 else {
432 // Right side, backward direction
433#pragma omp parallel for
434 for (idx_t ib = 0; ib < m; ib += nb) {
435 idx_t ib2 = std::min(ib + nb, m);
436 // Startup phase
437 for (idx_t j = 0; j < l - 1; ++j) {
438 for (idx_t i = 0, g = j; i < j + 1; ++i, --g) {
439 for (idx_t i1 = ib; i1 < ib2; ++i1) {
440 T temp = C(g, i) * A(i1, g) +
441 conj(S(g, i)) * A(i1, g + 1);
442 A(i1, g + 1) = -S(g, i) * A(i1, g) +
443 C(g, i) * A(i1, g + 1);
444 A(i1, g) = temp;
445 }
446 }
447 }
448 // Pipeline phase
449 for (idx_t j = l - 1; j + 1 < n - 1; j += 2) {
450 for (idx_t i = 0, g = j; i + 1 < l; i += 2, g -= 2) {
451 for (idx_t i1 = ib; i1 < ib2; ++i1) {
452 //
453 // Apply first rotation
454 //
455
456 // A(i1,g) after first rotation
457 T temp1 = C(g, i) * A(i1, g) +
458 conj(S(g, i)) * A(i1, g + 1);
459 // A(i1,g+1) after first rotation
460 T temp2 = -S(g, i) * A(i1, g) +
461 C(g, i) * A(i1, g + 1);
462
463 //
464 // Apply second rotation
465 //
466
467 // A(i1,g+1) after second rotation
468 T temp3 = C(g + 1, i) * temp2 +
469 conj(S(g + 1, i)) * A(i1, g + 2);
470 A(i1, g + 2) = -S(g + 1, i) * temp2 +
471 C(g + 1, i) * A(i1, g + 2);
472
473 //
474 // Apply third rotation
475 //
476
477 // A(i1,g) after third rotation
478 T temp4 = -S(g - 1, i + 1) * A(i1, g - 1) +
479 C(g - 1, i + 1) * temp1;
480 A(i1, g - 1) = C(g - 1, i + 1) * A(i1, g - 1) +
481 conj(S(g - 1, i + 1)) * temp1;
482
483 // Apply fourth rotation
484 A(i1, g) = C(g, i + 1) * temp4 +
485 conj(S(g, i + 1)) * temp3;
486 A(i1, g + 1) =
487 -S(g, i + 1) * temp4 + C(g, i + 1) * temp3;
488 }
489 }
490 if (l % 2 == 1) {
491 // Apply two more rotations that could not be fused
492 idx_t i = l - 1;
493 idx_t g = j - (l - 1);
494
495 for (idx_t i1 = ib; i1 < ib2; ++i1) {
496 // Apply first rotation
497 T temp = C(g, i) * A(i1, g) +
498 conj(S(g, i)) * A(i1, g + 1);
499 A(i1, g + 1) = -S(g, i) * A(i1, g) +
500 C(g, i) * A(i1, g + 1);
501 A(i1, g) = temp;
502
503 // Apply second rotation
504 T temp2 = C(g + 1, i) * A(i1, g + 1) +
505 conj(S(g + 1, i)) * A(i1, g + 2);
506 A(i1, g + 2) = -S(g + 1, i) * A(i1, g + 1) +
507 C(g + 1, i) * A(i1, g + 2);
508 A(i1, g + 1) = temp2;
509 }
510 }
511 }
512 // Shutdown phase
513 for (idx_t j = ((n - l + 1) % 2); j < l; ++j) {
514 for (idx_t i = j, g = n - 2; i < l; ++i, --g) {
515 for (idx_t i1 = ib; i1 < ib2; ++i1) {
516 T temp = C(g, i) * A(i1, g) +
517 conj(S(g, i)) * A(i1, g + 1);
518 A(i1, g + 1) = -S(g, i) * A(i1, g) +
519 C(g, i) * A(i1, g + 1);
520 A(i1, g) = temp;
521 }
522 }
523 }
524 }
525 }
526 }
527 }
528 else {
529 if (side == Side::Left) {
530 if (direction == Direction::Forward) {
531 // Left side, forward direction
532#pragma omp parallel for
533 for (idx_t ib = 0; ib < n; ib += nb) {
534 idx_t ib2 = std::min(ib + nb, n);
535 // Startup phase
536 for (idx_t j = 0; j < l - 1; ++j) {
537 for (idx_t i = 0, g2 = j; i < j + 1; ++i, --g2) {
538 idx_t g = m - 2 - g2;
539 for (idx_t i1 = 0; i1 < n; ++i1) {
540 T temp =
541 C(g, i) * A(g, i1) + S(g, i) * A(g + 1, i1);
542 A(g + 1, i1) = -conj(S(g, i)) * A(g, i1) +
543 C(g, i) * A(g + 1, i1);
544 A(g, i1) = temp;
545 }
546 }
547 }
548 // Pipeline phase
549 for (idx_t j = l - 1; j + 1 < m - 1; j += 2) {
550 for (idx_t i = 0, g2 = j; i + 1 < l; i += 2, g2 -= 2) {
551 idx_t g = m - 2 - g2;
552 for (idx_t i1 = ib; i1 < ib2; ++i1) {
553 //
554 // Apply first rotation
555 //
556
557 // A(g,i1) after first rotation
558 T temp1 =
559 C(g, i) * A(g, i1) + S(g, i) * A(g + 1, i1);
560 // A(g+1,i1) after first rotation
561 T temp2 = -conj(S(g, i)) * A(g, i1) +
562 C(g, i) * A(g + 1, i1);
563
564 //
565 // Apply second rotation
566 //
567
568 // A(g,i1) after second rotation
569 T temp3 = -conj(S(g - 1, i)) * A(g - 1, i1) +
570 C(g - 1, i) * temp1;
571 A(g - 1, i1) = C(g - 1, i) * A(g - 1, i1) +
572 S(g - 1, i) * temp1;
573
574 //
575 // Apply third rotation
576 //
577
578 // A(g+1,i1) after third rotation
579 T temp4 = C(g + 1, i + 1) * temp2 +
580 S(g + 1, i + 1) * A(g + 2, i1);
581 A(g + 2, i1) = -conj(S(g + 1, i + 1)) * temp2 +
582 C(g + 1, i + 1) * A(g + 2, i1);
583
584 // Apply fourth rotation
585 A(g, i1) =
586 C(g, i + 1) * temp3 + S(g, i + 1) * temp4;
587 A(g + 1, i1) = -conj(S(g, i + 1)) * temp3 +
588 C(g, i + 1) * temp4;
589 }
590 }
591
592 if (l % 2 == 1) {
593 // Apply two more rotations that could not be
594 // fused
595 idx_t i = l - 1;
596 idx_t g2 = j - (l - 1);
597 idx_t g = m - 2 - g2;
598
599 for (idx_t i1 = ib; i1 < ib2; ++i1) {
600 // Apply first rotation
601 T temp =
602 C(g, i) * A(g, i1) + S(g, i) * A(g + 1, i1);
603 A(g + 1, i1) = -conj(S(g, i)) * A(g, i1) +
604 C(g, i) * A(g + 1, i1);
605 A(g, i1) = temp;
606
607 // Apply second rotation
608 T temp2 = C(g - 1, i) * A(g - 1, i1) +
609 S(g - 1, i) * A(g, i1);
610 A(g, i1) = -conj(S(g - 1, i)) * A(g - 1, i1) +
611 C(g - 1, i) * A(g, i1);
612 A(g - 1, i1) = temp2;
613 }
614 }
615 }
616 // Shutdown phase
617 for (idx_t j = ((m - l + 1) % 2); j < l; ++j) {
618 for (idx_t i = j, g2 = m - 2; i < l; ++i, --g2) {
619 idx_t g = m - 2 - g2;
620 for (idx_t i1 = ib; i1 < ib2; ++i1) {
621 T temp =
622 C(g, i) * A(g, i1) + S(g, i) * A(g + 1, i1);
623 A(g + 1, i1) = -conj(S(g, i)) * A(g, i1) +
624 C(g, i) * A(g + 1, i1);
625 A(g, i1) = temp;
626 }
627 }
628 }
629 }
630 }
631 else {
632 // Left side, backward direction
633#pragma omp parallel for
634 for (idx_t ib = 0; ib < n; ib += nb) {
635 idx_t ib2 = std::min(ib + nb, n);
636 // Startup phase
637 for (idx_t j = 0; j < l - 1; ++j) {
638 for (idx_t i = 0, g = j; i < j + 1; ++i, --g) {
639 for (idx_t i1 = ib; i1 < ib2; ++i1) {
640 T temp =
641 C(g, i) * A(g, i1) + S(g, i) * A(g + 1, i1);
642 A(g + 1, i1) = -conj(S(g, i)) * A(g, i1) +
643 C(g, i) * A(g + 1, i1);
644 A(g, i1) = temp;
645 }
646 }
647 }
648 // Pipeline phase
649 for (idx_t j = l - 1; j + 1 < m - 1; j += 2) {
650 for (idx_t i = 0, g = j; i + 1 < l; i += 2, g -= 2) {
651 for (idx_t i1 = ib; i1 < ib2; ++i1) {
652 //
653 // Apply first rotation
654 //
655
656 // A(g,i1) after first rotation
657 T temp1 =
658 C(g, i) * A(g, i1) + S(g, i) * A(g + 1, i1);
659 // A(g+1,i1) after first rotation
660 T temp2 = -conj(S(g, i)) * A(g, i1) +
661 C(g, i) * A(g + 1, i1);
662
663 //
664 // Apply second rotation
665 //
666
667 // A(g+1,i1) after second rotation
668 T temp3 = C(g + 1, i) * temp2 +
669 S(g + 1, i) * A(g + 2, i1);
670 A(g + 2, i1) = -conj(S(g + 1, i)) * temp2 +
671 C(g + 1, i) * A(g + 2, i1);
672
673 //
674 // Apply third rotation
675 //
676
677 // A(g,i1) after third rotation
678 T temp4 =
679 -conj(S(g - 1, i + 1)) * A(g - 1, i1) +
680 C(g - 1, i + 1) * temp1;
681 A(g - 1, i1) = C(g - 1, i + 1) * A(g - 1, i1) +
682 S(g - 1, i + 1) * temp1;
683
684 // Apply fourth rotation
685 A(g, i1) =
686 C(g, i + 1) * temp4 + S(g, i + 1) * temp3;
687 A(g + 1, i1) = -conj(S(g, i + 1)) * temp4 +
688 C(g, i + 1) * temp3;
689 }
690 }
691
692 if (l % 2 == 1) {
693 // Apply two more rotations that could not be
694 // fused
695 idx_t i = l - 1;
696 idx_t g = j - (l - 1);
697
698 for (idx_t i1 = ib; i1 < ib2; ++i1) {
699 // Apply first rotation
700 T temp =
701 C(g, i) * A(g, i1) + S(g, i) * A(g + 1, i1);
702 A(g + 1, i1) = -conj(S(g, i)) * A(g, i1) +
703 C(g, i) * A(g + 1, i1);
704 A(g, i1) = temp;
705
706 // Apply second rotation
707 T temp2 = C(g + 1, i) * A(g + 1, i1) +
708 S(g + 1, i) * A(g + 2, i1);
709 A(g + 2, i1) =
710 -conj(S(g + 1, i)) * A(g + 1, i1) +
711 C(g + 1, i) * A(g + 2, i1);
712 A(g + 1, i1) = temp2;
713 }
714 }
715 }
716 // Shutdown phase
717 for (idx_t j = ((m - l + 1) % 2); j < l; ++j) {
718 for (idx_t i = j, g = m - 2; i < l; ++i, --g) {
719 for (idx_t i1 = ib; i1 < ib2; ++i1) {
720 T temp =
721 C(g, i) * A(g, i1) + S(g, i) * A(g + 1, i1);
722 A(g + 1, i1) = -conj(S(g, i)) * A(g, i1) +
723 C(g, i) * A(g + 1, i1);
724 A(g, i1) = temp;
725 }
726 }
727 }
728 }
729 }
730 }
731 else {
732 if (direction == Direction::Forward) {
733 // Right side, forward direction
734#pragma omp parallel for
735 for (idx_t ib = 0; ib < m; ib += nb) {
736 idx_t ib2 = std::min(ib + nb, m);
737 // Startup phase
738 for (idx_t i1 = ib; i1 < ib2; ++i1) {
739 for (idx_t j = 0; j < l - 1; ++j) {
740 for (idx_t i = 0, g2 = j; i < j + 1; ++i, --g2) {
741 idx_t g = n - 2 - g2;
742 T temp = C(g, i) * A(i1, g) +
743 conj(S(g, i)) * A(i1, g + 1);
744 A(i1, g + 1) = -S(g, i) * A(i1, g) +
745 C(g, i) * A(i1, g + 1);
746 A(i1, g) = temp;
747 }
748 }
749 }
750 // Pipeline phase
751 for (idx_t i1 = ib; i1 < ib2; ++i1) {
752 for (idx_t j = l - 1; j + 1 < n - 1; j += 2) {
753 for (idx_t i = 0, g2 = j; i + 1 < l;
754 i += 2, g2 -= 2) {
755 idx_t g = n - 2 - g2;
756 //
757 // Apply first rotation
758 //
759
760 // A(i1,g) after first rotation
761 T temp1 = C(g, i) * A(i1, g) +
762 conj(S(g, i)) * A(i1, g + 1);
763 // A(i1,g+1) after first rotation
764 T temp2 = -S(g, i) * A(i1, g) +
765 C(g, i) * A(i1, g + 1);
766
767 //
768 // Apply second rotation
769 //
770
771 // A(i1,g) after second rotation
772 T temp3 = -S(g - 1, i) * A(i1, g - 1) +
773 C(g - 1, i) * temp1;
774 A(i1, g - 1) = C(g - 1, i) * A(i1, g - 1) +
775 conj(S(g - 1, i)) * temp1;
776 //
777 // Apply third rotation
778 //
779
780 // A(i1,g+1) after third rotation
781 T temp4 = C(g + 1, i + 1) * temp2 +
782 conj(S(g + 1, i + 1)) * A(i1, g + 2);
783 A(i1, g + 2) = -S(g + 1, i + 1) * temp2 +
784 C(g + 1, i + 1) * A(i1, g + 2);
785
786 //
787 // Apply fourth rotation
788 //
789
790 A(i1, g) = C(g, i + 1) * temp3 +
791 conj(S(g, i + 1)) * temp4;
792 A(i1, g + 1) =
793 -S(g, i + 1) * temp3 + C(g, i + 1) * temp4;
794 }
795 if (l % 2 == 1) {
796 // Apply two more rotations that could not be
797 // fused
798 idx_t i = l - 1;
799 idx_t g2 = j - (l - 1);
800 idx_t g = n - 2 - g2;
801
802 // Apply first rotation
803 T temp = C(g, i) * A(i1, g) +
804 conj(S(g, i)) * A(i1, g + 1);
805 A(i1, g + 1) = -S(g, i) * A(i1, g) +
806 C(g, i) * A(i1, g + 1);
807 A(i1, g) = temp;
808
809 // Apply second rotation
810 T temp2 = C(g - 1, i) * A(i1, g - 1) +
811 conj(S(g - 1, i)) * A(i1, g);
812 A(i1, g) = -S(g - 1, i) * A(i1, g - 1) +
813 C(g - 1, i) * A(i1, g);
814 A(i1, g - 1) = temp2;
815 }
816 }
817 }
818 // Shutdown phase
819 for (idx_t i1 = ib; i1 < ib2; ++i1) {
820 for (idx_t j = ((n - l + 1) % 2); j < l; ++j) {
821 for (idx_t i = j, g2 = n - 2; i < l; ++i, --g2) {
822 idx_t g = n - 2 - g2;
823 T temp = C(g, i) * A(i1, g) +
824 conj(S(g, i)) * A(i1, g + 1);
825 A(i1, g + 1) = -S(g, i) * A(i1, g) +
826 C(g, i) * A(i1, g + 1);
827 A(i1, g) = temp;
828 }
829 }
830 }
831 }
832 }
833 else {
834 // Right side, backward direction
835#pragma omp parallel for
836 for (idx_t ib = 0; ib < m; ib += nb) {
837 idx_t ib2 = std::min(ib + nb, m);
838 // Startup phase
839 for (idx_t i1 = ib; i1 < ib2; ++i1) {
840 for (idx_t j = 0; j < l - 1; ++j) {
841 for (idx_t i = 0, g = j; i < j + 1; ++i, --g) {
842 T temp = C(g, i) * A(i1, g) +
843 conj(S(g, i)) * A(i1, g + 1);
844 A(i1, g + 1) = -S(g, i) * A(i1, g) +
845 C(g, i) * A(i1, g + 1);
846 A(i1, g) = temp;
847 }
848 }
849 }
850 // Pipeline phase
851 for (idx_t i1 = ib; i1 < ib2; ++i1) {
852 for (idx_t j = l - 1; j + 1 < n - 1; j += 2) {
853 for (idx_t i = 0, g = j; i + 1 < l;
854 i += 2, g -= 2) {
855 //
856 // Apply first rotation
857 //
858
859 // A(i1,g) after first rotation
860 T temp1 = C(g, i) * A(i1, g) +
861 conj(S(g, i)) * A(i1, g + 1);
862 // A(i1,g+1) after first rotation
863 T temp2 = -S(g, i) * A(i1, g) +
864 C(g, i) * A(i1, g + 1);
865
866 //
867 // Apply second rotation
868 //
869
870 // A(i1,g+1) after second rotation
871 T temp3 = C(g + 1, i) * temp2 +
872 conj(S(g + 1, i)) * A(i1, g + 2);
873 A(i1, g + 2) = -S(g + 1, i) * temp2 +
874 C(g + 1, i) * A(i1, g + 2);
875
876 //
877 // Apply third rotation
878 //
879
880 // A(i1,g) after third rotation
881 T temp4 = -S(g - 1, i + 1) * A(i1, g - 1) +
882 C(g - 1, i + 1) * temp1;
883 A(i1, g - 1) = C(g - 1, i + 1) * A(i1, g - 1) +
884 conj(S(g - 1, i + 1)) * temp1;
885
886 // Apply fourth rotation
887 A(i1, g) = C(g, i + 1) * temp4 +
888 conj(S(g, i + 1)) * temp3;
889 A(i1, g + 1) =
890 -S(g, i + 1) * temp4 + C(g, i + 1) * temp3;
891 }
892 if (l % 2 == 1) {
893 // Apply two more rotations that could not be
894 // fused
895 idx_t i = l - 1;
896 idx_t g = j - (l - 1);
897
898 // Apply first rotation
899 T temp = C(g, i) * A(i1, g) +
900 conj(S(g, i)) * A(i1, g + 1);
901 A(i1, g + 1) = -S(g, i) * A(i1, g) +
902 C(g, i) * A(i1, g + 1);
903 A(i1, g) = temp;
904
905 // Apply second rotation
906 T temp2 = C(g + 1, i) * A(i1, g + 1) +
907 conj(S(g + 1, i)) * A(i1, g + 2);
908 A(i1, g + 2) = -S(g + 1, i) * A(i1, g + 1) +
909 C(g + 1, i) * A(i1, g + 2);
910 A(i1, g + 1) = temp2;
911 }
912 }
913 }
914 // Shutdown phase
915 for (idx_t i1 = ib; i1 < ib2; ++i1) {
916 for (idx_t j = ((n - l + 1) % 2); j < l; ++j) {
917 for (idx_t i = j, g = n - 2; i < l; ++i, --g) {
918 T temp = C(g, i) * A(i1, g) +
919 conj(S(g, i)) * A(i1, g + 1);
920 A(i1, g + 1) = -S(g, i) * A(i1, g) +
921 C(g, i) * A(i1, g + 1);
922 A(i1, g) = temp;
923 }
924 }
925 }
926 }
927 }
928 }
929 }
930} // namespace tlapack
931
932} // namespace tlapack
933
934#endif // TLAPACK_ROT_SEQUENCE3_HH
constexpr T conj(const T &x) noexcept
Extends std::conj() to real datatypes.
Definition utils.hpp:100
#define TLAPACK_SIDE
Macro for tlapack::concepts::Side compatible with C++17.
Definition concepts.hpp:927
#define TLAPACK_SMATRIX
Macro for tlapack::concepts::SliceableMatrix compatible with C++17.
Definition concepts.hpp:899
#define TLAPACK_DIRECTION
Macro for tlapack::concepts::Direction compatible with C++17.
Definition concepts.hpp:930
int rot_sequence(side_t side, direction_t direction, const C_t &c, const S_t &s, A_t &A)
Applies a sequence of plane rotations to an (m-by-n) matrix.
Definition rot_sequence.hpp:81
void rot_sequence3(side_t side, direction_t direction, const C_t &C, const S_t &S, A_t &A)
Applies a sequence of plane rotations to an (m-by-n) matrix.
Definition rot_sequence3.hpp:92
#define tlapack_check(cond)
Throw an error if cond is false.
Definition exceptionHandling.hpp:98
typename traits::real_type_traits< Types..., int >::type real_type
The common real type of the list of types.
Definition scalar_type_traits.hpp:113