<T>LAPACK 0.1.1
C++ Template Linear Algebra PACKage
Loading...
Searching...
No Matches
multishift_qr_sweep.hpp
Go to the documentation of this file.
1
3//
4// Copyright (c) 2021-2023, University of Colorado Denver. All rights reserved.
5//
6// This file is part of <T>LAPACK.
7// <T>LAPACK is free software: you can redistribute it and/or modify it under
8// the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
9
10#ifndef TLAPACK_QR_SWEEP_HH
11#define TLAPACK_QR_SWEEP_HH
12
14#include "tlapack/blas/gemm.hpp"
18
19namespace tlapack {
47template <class T,
48 TLAPACK_SMATRIX matrix_t,
49 TLAPACK_VECTOR vector_t,
50 enable_if_t<is_complex<type_t<vector_t>>, bool> = true>
52 bool want_z,
55 const matrix_t& A,
56 const vector_t& s,
57 const matrix_t& Z)
58{
59 if constexpr (is_same_v<T, type_t<matrix_t>>)
60 return WorkInfo(3, size(s) / 2);
61 else
62 return WorkInfo(0);
63}
64
73template <TLAPACK_SMATRIX matrix_t,
74 TLAPACK_VECTOR vector_t,
75 TLAPACK_WORKSPACE work_t,
76 enable_if_t<is_complex<type_t<vector_t>>, bool> = true>
78 bool want_z,
81 matrix_t& A,
82 const vector_t& s,
83 matrix_t& Z,
84 work_t& work)
85{
86 using TA = type_t<matrix_t>;
87 using real_t = real_type<TA>;
88 using idx_t = size_type<matrix_t>;
90
91 const real_t one(1);
92 const real_t zero(0);
93 const idx_t n = ncols(A);
94 const real_t eps = ulp<real_t>();
95 const real_t small_num = safe_min<real_t>() * ((real_t)n / eps);
96
97 // check arguments
98 tlapack_check(n >= 12);
99 tlapack_check(nrows(A) == n);
100 if (want_z) {
101 tlapack_check(ncols(Z) == n);
102 tlapack_check(nrows(Z) == n);
103 }
104
105 // Matrix V
106 auto [V, work1] = reshape(work, 3, size(s) / 2);
107
108 const idx_t n_block_max = (n - 3) / 3;
109 const idx_t n_shifts_max =
110 min(ihi - ilo - 1, std::max<idx_t>(2, 3 * (n_block_max / 4)));
111
112 idx_t n_shifts = std::min<idx_t>(size(s), n_shifts_max);
113 if (n_shifts % 2 == 1) n_shifts = n_shifts - 1;
114 idx_t n_bulges = n_shifts / 2;
115
116 const idx_t n_block_desired = std::min<idx_t>(2 * n_shifts, n_block_max);
117
118 // Define workspace matrices
119 // We use the lower triangular part of A as workspace
120
121 // U stores the orthogonal transformations
122 auto U = slice(A, range{n - n_block_desired, n}, range{0, n_block_desired});
123
124 // Workspace for horizontal multiplications
125 auto WH = slice(A, range{n - n_block_desired, n},
127
128 // Workspace for vertical multiplications
129 auto WV = slice(A, range{n_block_desired + 3, n - n_block_desired},
131
132 // i_pos_block points to the start of the block of bulges
133 idx_t i_pos_block;
134
135 //
136 // The following code block introduces the bulges into the matrix
137 //
138 {
139 // Near-the-diagonal bulge introduction
140 // The calculations are initially limited to the window:
141 // A(ilo:ilo+n_block,ilo:ilo+n_block) The rest is updated later via
142 // level 3 BLAS
143 idx_t n_block = min(n_block_desired, ihi - ilo);
144 idx_t istart_m = ilo;
145 idx_t istop_m = ilo + n_block;
146 auto U2 = slice(U, range{0, n_block}, range{0, n_block});
147 laset(GENERAL, zero, one, U2);
148
149 for (idx_t i_pos_last = ilo; i_pos_last < ilo + n_block - 2;
150 ++i_pos_last) {
151 // The number of bulges that are in the pencil
152 idx_t n_active_bulges = min(n_bulges, ((i_pos_last - ilo) / 2) + 1);
153 for (idx_t i_bulge = 0; i_bulge < n_active_bulges; ++i_bulge) {
154 idx_t i_pos = i_pos_last - 2 * i_bulge;
155 auto v = col(V, i_bulge);
156 if (i_pos == ilo) {
157 // Introduce bulge
158 TA tau;
159 auto H = slice(A, range{ilo, ilo + 3}, range{ilo, ilo + 3});
160 lahqr_shiftcolumn(H, v, s[size(s) - 1 - 2 * i_bulge],
161 s[size(s) - 1 - 2 * i_bulge - 1]);
163 v[0] = tau;
164 }
165 else {
166 // Chase bulge down
167 auto H = slice(A, range{i_pos - 1, i_pos + 3},
168 range{i_pos - 1, i_pos + 3});
169 move_bulge(H, v, s[size(s) - 1 - 2 * i_bulge],
170 s[size(s) - 1 - 2 * i_bulge - 1]);
171 }
172
173 // Apply the reflector we just calculated from the right
174 // We leave the last row for later (it interferes with the
175 // optimally packed bulges)
176 for (idx_t j = istart_m; j < i_pos + 3; ++j) {
177 const TA sum = A(j, i_pos) + v[1] * A(j, i_pos + 1) +
178 v[2] * A(j, i_pos + 2);
179 A(j, i_pos) = A(j, i_pos) - sum * v[0];
180 A(j, i_pos + 1) = A(j, i_pos + 1) - sum * v[0] * conj(v[1]);
181 A(j, i_pos + 2) = A(j, i_pos + 2) - sum * v[0] * conj(v[2]);
182 }
183
184 // Apply the reflector we just calculated from the left
185 // We only update a single column, the rest is updated later
186 const TA sum = A(i_pos, i_pos) +
187 conj(v[1]) * A(i_pos + 1, i_pos) +
188 conj(v[2]) * A(i_pos + 2, i_pos);
189 A(i_pos, i_pos) = A(i_pos, i_pos) - sum * conj(v[0]);
190 A(i_pos + 1, i_pos) =
191 A(i_pos + 1, i_pos) - sum * conj(v[0]) * v[1];
192 A(i_pos + 2, i_pos) =
193 A(i_pos + 2, i_pos) - sum * conj(v[0]) * v[2];
194
195 // Test for deflation.
196 if (i_pos > ilo) {
197 if (A(i_pos, i_pos - 1) != zero) {
198 real_t tst1 = abs1(A(i_pos - 1, i_pos - 1)) +
199 abs1(A(i_pos, i_pos));
200 if (tst1 == zero) {
201 if (i_pos > ilo + 1)
202 tst1 += abs1(A(i_pos - 1, i_pos - 2));
203 if (i_pos > ilo + 2)
204 tst1 += abs1(A(i_pos - 1, i_pos - 3));
205 if (i_pos > ilo + 3)
206 tst1 += abs1(A(i_pos - 1, i_pos - 4));
207 if (i_pos < ihi - 1)
208 tst1 += abs1(A(i_pos + 1, i_pos));
209 if (i_pos < ihi - 2)
210 tst1 += abs1(A(i_pos + 2, i_pos));
211 if (i_pos < ihi - 3)
212 tst1 += abs1(A(i_pos + 3, i_pos));
213 }
214 if (abs1(A(i_pos, i_pos - 1)) <
215 max(small_num, eps * tst1)) {
216 const real_t aij = abs1(A(i_pos, i_pos - 1));
217 const real_t aji = abs1(A(i_pos - 1, i_pos));
218 const real_t ab =
219 (aij > aji) ? aij
220 : aji; // Propagates NaNs in aji
221 const real_t ba =
222 (aij < aji) ? aij
223 : aji; // Propagates NaNs in aji
224 const real_t aa =
225 max(abs1(A(i_pos, i_pos)),
226 abs1(A(i_pos, i_pos) -
227 A(i_pos - 1, i_pos - 1)));
228 const real_t bb =
229 min(abs1(A(i_pos, i_pos)),
230 abs1(A(i_pos, i_pos) -
231 A(i_pos - 1, i_pos - 1)));
232 const real_t s = aa + ab;
233 if (ba * (ab / s) <=
234 max(small_num, eps * (bb * (aa / s)))) {
235 A(i_pos, i_pos - 1) = zero;
236 }
237 }
238 }
239 }
240 }
241
242 // The following code performs the delayed update from the left
243 // it is optimized for column oriented matrices, but the increased
244 // complexity likely causes slower code for (idx_t j = ilo; j <
245 // istop_m; ++j)
246 // {
247 // idx_t i_bulge_start = (i_pos_last + 2 > j) ? (i_pos_last + 2
248 // - j) / 2 : 0; for (idx_t i_bulge = i_bulge_start; i_bulge <
249 // n_active_bulges; ++i_bulge)
250 // {
251 // idx_t i_pos = i_pos_last - 2 * i_bulge;
252 // auto v = col(V, i_bulge);
253 // auto sum = A(i_pos, j) + conj(v[1]) * A(i_pos + 1, j) +
254 // conj(v[2]) * A(i_pos + 2, j); A(i_pos, j) = A(i_pos, j) -
255 // sum * conj(v[0]); A(i_pos + 1, j) = A(i_pos + 1, j) - sum
256 // * conj(v[0]) * v[1]; A(i_pos + 2, j) = A(i_pos + 2, j) -
257 // sum * conj(v[0]) * v[2];
258 // }
259 // }
260
261 // Delayed update from the left
262 for (idx_t i_bulge = 0; i_bulge < n_active_bulges; ++i_bulge) {
263 idx_t i_pos = i_pos_last - 2 * i_bulge;
264 auto v = col(V, i_bulge);
265 for (idx_t j = i_pos + 1; j < istop_m; ++j) {
266 const TA sum = A(i_pos, j) + conj(v[1]) * A(i_pos + 1, j) +
267 conj(v[2]) * A(i_pos + 2, j);
268 A(i_pos, j) = A(i_pos, j) - sum * conj(v[0]);
269 A(i_pos + 1, j) = A(i_pos + 1, j) - sum * conj(v[0]) * v[1];
270 A(i_pos + 2, j) = A(i_pos + 2, j) - sum * conj(v[0]) * v[2];
271 }
272 }
273
274 // Accumulate the reflectors into U
275 for (idx_t i_bulge = 0; i_bulge < n_active_bulges; ++i_bulge) {
276 idx_t i_pos = i_pos_last - 2 * i_bulge;
277 auto v = col(V, i_bulge);
278 idx_t i1 = 0;
279 idx_t i2 =
280 min(nrows(U2), (i_pos_last - ilo) + (i_pos_last - ilo) + 3);
281 for (idx_t j = i1; j < i2; ++j) {
282 const TA sum = U2(j, i_pos - ilo) +
283 v[1] * U2(j, i_pos - ilo + 1) +
284 v[2] * U2(j, i_pos - ilo + 2);
285 U2(j, i_pos - ilo) = U2(j, i_pos - ilo) - sum * v[0];
286 U2(j, i_pos - ilo + 1) =
287 U2(j, i_pos - ilo + 1) - sum * v[0] * conj(v[1]);
288 U2(j, i_pos - ilo + 2) =
289 U2(j, i_pos - ilo + 2) - sum * v[0] * conj(v[2]);
290 }
291 }
292 }
293 // Update rest of the matrix
294 if (want_t) {
295 istart_m = 0;
296 istop_m = n;
297 }
298 else {
299 istart_m = ilo;
300 istop_m = ihi;
301 }
302 // Horizontal multiply
303 if (ilo + n_shifts + 1 < istop_m) {
304 idx_t i = ilo + n_block;
305 while (i < istop_m) {
306 idx_t iblock = std::min<idx_t>(istop_m - i, ncols(WH));
307 auto A_slice =
308 slice(A, range{ilo, ilo + n_block}, range{i, i + iblock});
309 auto WH_slice = slice(WH, range{0, nrows(A_slice)},
310 range{0, ncols(A_slice)});
313 i = i + iblock;
314 }
315 }
316 // Vertical multiply
317 if (istart_m < ilo) {
318 idx_t i = istart_m;
319 while (i < ilo) {
320 idx_t iblock = std::min<idx_t>(ilo - i, nrows(WV));
321 auto A_slice =
322 slice(A, range{i, i + iblock}, range{ilo, ilo + n_block});
323 auto WV_slice = slice(WV, range{0, nrows(A_slice)},
324 range{0, ncols(A_slice)});
327 i = i + iblock;
328 }
329 }
330 // Update Z (also a vertical multiplication)
331 if (want_z) {
332 idx_t i = 0;
333 while (i < n) {
334 idx_t iblock = std::min<idx_t>(n - i, nrows(WV));
335 auto Z_slice =
336 slice(Z, range{i, i + iblock}, range{ilo, ilo + n_block});
337 auto WV_slice = slice(WV, range{0, nrows(Z_slice)},
338 range{0, ncols(Z_slice)});
341 i = i + iblock;
342 }
343 }
344
346 }
347
348 //
349 // The following code block moves the bulges down untill they are low enough
350 // to be removed
351 //
352 while (i_pos_block + n_block_desired < ihi) {
353 // Number of positions each bulge will be moved down
354 idx_t n_pos = std::min<idx_t>(n_block_desired - n_shifts,
355 ihi - n_shifts - 1 - i_pos_block);
356 // Actual blocksize
357 idx_t n_block = n_shifts + n_pos;
358
359 auto U2 = slice(U, range{0, n_block}, range{0, n_block});
360 laset(GENERAL, zero, one, U2);
361
362 // Near-the-diagonal bulge chase
363 // The calculations are initially limited to the window:
364 // A(i_pos_block-1:i_pos_block+n_block,i_pos_block:i_pos_block+n_block)
365 // The rest is updated later via level 3 BLAS
366
367 idx_t istart_m = i_pos_block;
368 idx_t istop_m = i_pos_block + n_block;
369 for (idx_t i_pos_last = i_pos_block + n_shifts - 2;
371 for (idx_t i_bulge = 0; i_bulge < n_bulges; ++i_bulge) {
372 idx_t i_pos = i_pos_last - 2 * i_bulge;
373 auto v = col(V, i_bulge);
374 auto H = slice(A, range{i_pos - 1, i_pos + 3},
375 range{i_pos - 1, i_pos + 3});
376 move_bulge(H, v, s[size(s) - 1 - 2 * i_bulge],
377 s[size(s) - 1 - 2 * i_bulge - 1]);
378
379 // Apply the reflector we just calculated from the right
380 // We leave the last row for later (it interferes with the
381 // optimally packed bulges)
382 for (idx_t j = istart_m; j < i_pos + 3; ++j) {
383 const TA sum = A(j, i_pos) + v[1] * A(j, i_pos + 1) +
384 v[2] * A(j, i_pos + 2);
385 A(j, i_pos) = A(j, i_pos) - sum * v[0];
386 A(j, i_pos + 1) = A(j, i_pos + 1) - sum * v[0] * conj(v[1]);
387 A(j, i_pos + 2) = A(j, i_pos + 2) - sum * v[0] * conj(v[2]);
388 }
389
390 // Apply the reflector we just calculated from the left
391 // We only update a single column, the rest is updated later
392 const TA sum = A(i_pos, i_pos) +
393 conj(v[1]) * A(i_pos + 1, i_pos) +
394 conj(v[2]) * A(i_pos + 2, i_pos);
395 A(i_pos, i_pos) = A(i_pos, i_pos) - sum * conj(v[0]);
396 A(i_pos + 1, i_pos) =
397 A(i_pos + 1, i_pos) - sum * conj(v[0]) * v[1];
398 A(i_pos + 2, i_pos) =
399 A(i_pos + 2, i_pos) - sum * conj(v[0]) * v[2];
400
401 // Test for deflation.
402 if (i_pos > ilo) {
403 if (A(i_pos, i_pos - 1) != zero) {
404 real_t tst1 = abs1(A(i_pos - 1, i_pos - 1)) +
405 abs1(A(i_pos, i_pos));
406 if (tst1 == zero) {
407 if (i_pos > ilo + 1)
408 tst1 += abs1(A(i_pos - 1, i_pos - 2));
409 if (i_pos > ilo + 2)
410 tst1 += abs1(A(i_pos - 1, i_pos - 3));
411 if (i_pos > ilo + 3)
412 tst1 += abs1(A(i_pos - 1, i_pos - 4));
413 if (i_pos < ihi - 1)
414 tst1 += abs1(A(i_pos + 1, i_pos));
415 if (i_pos < ihi - 2)
416 tst1 += abs1(A(i_pos + 2, i_pos));
417 if (i_pos < ihi - 3)
418 tst1 += abs1(A(i_pos + 3, i_pos));
419 }
420 if (abs1(A(i_pos, i_pos - 1)) <
421 max(small_num, eps * tst1)) {
422 const real_t aij = abs1(A(i_pos, i_pos - 1));
423 const real_t aji = abs1(A(i_pos - 1, i_pos));
424 const real_t ab =
425 (aij > aji) ? aij
426 : aji; // Propagates NaNs in aji
427 const real_t ba =
428 (aij < aji) ? aij
429 : aji; // Propagates NaNs in aji
430 const real_t aa =
431 max(abs1(A(i_pos, i_pos)),
432 abs1(A(i_pos, i_pos) -
433 A(i_pos - 1, i_pos - 1)));
434 const real_t bb =
435 min(abs1(A(i_pos, i_pos)),
436 abs1(A(i_pos, i_pos) -
437 A(i_pos - 1, i_pos - 1)));
438 const real_t s = aa + ab;
439 if (ba * (ab / s) <=
440 max(small_num, eps * (bb * (aa / s)))) {
441 A(i_pos, i_pos - 1) = zero;
442 }
443 }
444 }
445 }
446 }
447
448 // The following code performs the delayed update from the left
449 // it is optimized for column oriented matrices, but the increased
450 // complexity likely causes slower code for (idx_t j = i_pos_block;
451 // j < istop_m; ++j)
452 // {
453 // idx_t i_bulge_start = (i_pos_last + 2 > j) ? (i_pos_last + 2
454 // - j) / 2 : 0; for (idx_t i_bulge = i_bulge_start; i_bulge <
455 // n_bulges; ++i_bulge)
456 // {
457 // idx_t i_pos = i_pos_last - 2 * i_bulge;
458 // auto v = col(V, i_bulge);
459 // auto sum = A(i_pos, j) + conj(v[1]) * A(i_pos + 1, j) +
460 // conj(v[2]) * A(i_pos + 2, j); A(i_pos, j) = A(i_pos, j) -
461 // sum * conj(v[0]); A(i_pos + 1, j) = A(i_pos + 1, j) - sum
462 // * conj(v[0]) * v[1]; A(i_pos + 2, j) = A(i_pos + 2, j) -
463 // sum * conj(v[0]) * v[2];
464 // }
465 // }
466
467 // Delayed update from the left
468 for (idx_t i_bulge = 0; i_bulge < n_bulges; ++i_bulge) {
469 idx_t i_pos = i_pos_last - 2 * i_bulge;
470 auto v = col(V, i_bulge);
471 for (idx_t j = i_pos + 1; j < istop_m; ++j) {
472 const TA sum = A(i_pos, j) + conj(v[1]) * A(i_pos + 1, j) +
473 conj(v[2]) * A(i_pos + 2, j);
474 A(i_pos, j) = A(i_pos, j) - sum * conj(v[0]);
475 A(i_pos + 1, j) = A(i_pos + 1, j) - sum * conj(v[0]) * v[1];
476 A(i_pos + 2, j) = A(i_pos + 2, j) - sum * conj(v[0]) * v[2];
477 }
478 }
479
480 // Accumulate the reflectors into U
481 for (idx_t i_bulge = 0; i_bulge < n_bulges; ++i_bulge) {
482 idx_t i_pos = i_pos_last - 2 * i_bulge;
483 auto v = col(V, i_bulge);
484 idx_t i1 = (i_pos - i_pos_block) -
486 idx_t i2 =
487 min(nrows(U2),
489 (i_pos_last - i_pos_block - n_shifts + 2) + 3);
490 for (idx_t j = i1; j < i2; ++j) {
491 const TA sum = U2(j, i_pos - i_pos_block) +
492 v[1] * U2(j, i_pos - i_pos_block + 1) +
493 v[2] * U2(j, i_pos - i_pos_block + 2);
494 U2(j, i_pos - i_pos_block) =
495 U2(j, i_pos - i_pos_block) - sum * v[0];
496 U2(j, i_pos - i_pos_block + 1) =
497 U2(j, i_pos - i_pos_block + 1) -
498 sum * v[0] * conj(v[1]);
499 U2(j, i_pos - i_pos_block + 2) =
500 U2(j, i_pos - i_pos_block + 2) -
501 sum * v[0] * conj(v[2]);
502 }
503 }
504 }
505 // Update rest of the matrix
506 if (want_t) {
507 istart_m = 0;
508 istop_m = n;
509 }
510 else {
511 istart_m = ilo;
512 istop_m = ihi;
513 }
514 // Horizontal multiply
515 if (i_pos_block + n_block < istop_m) {
516 idx_t i = i_pos_block + n_block;
517 while (i < istop_m) {
518 idx_t iblock = std::min<idx_t>(istop_m - i, ncols(WH));
519 auto A_slice =
521 range{i, i + iblock});
522 auto WH_slice = slice(WH, range{0, nrows(A_slice)},
523 range{0, ncols(A_slice)});
526 i = i + iblock;
527 }
528 }
529 // Vertical multiply
530 if (istart_m < i_pos_block) {
531 idx_t i = istart_m;
532 while (i < i_pos_block) {
533 idx_t iblock = std::min<idx_t>(i_pos_block - i, nrows(WV));
534 auto A_slice = slice(A, range{i, i + iblock},
536 auto WV_slice = slice(WV, range{0, nrows(A_slice)},
537 range{0, ncols(A_slice)});
540 i = i + iblock;
541 }
542 }
543 // Update Z (also a vertical multiplication)
544 if (want_z) {
545 idx_t i = 0;
546 while (i < n) {
547 idx_t iblock = std::min<idx_t>(n - i, nrows(WV));
548 auto Z_slice = slice(Z, range{i, i + iblock},
550 auto WV_slice = slice(WV, range{0, nrows(Z_slice)},
551 range{0, ncols(Z_slice)});
554 i = i + iblock;
555 }
556 }
557
559 }
560
561 //
562 // The following code removes the bulges from the matrix
563 //
564 {
565 idx_t n_block = ihi - i_pos_block;
566
567 auto U2 = slice(U, range{0, n_block}, range{0, n_block});
568 laset(GENERAL, zero, one, U2);
569
570 // Near-the-diagonal bulge chase
571 // The calculations are initially limited to the window:
572 // A(i_pos_block-1:ihi,i_pos_block:ihi) The rest is updated later via
573 // level 3 BLAS
574
575 idx_t istart_m = i_pos_block;
576 idx_t istop_m = ihi;
577
578 for (idx_t i_pos_last = i_pos_block + n_shifts - 2;
579 i_pos_last < ihi + n_shifts - 1; ++i_pos_last) {
580 idx_t i_bulge_start =
581 (i_pos_last + 3 > ihi) ? (i_pos_last + 3 - ihi) / 2 : 0;
582 for (idx_t i_bulge = i_bulge_start; i_bulge < n_bulges; ++i_bulge) {
583 idx_t i_pos = i_pos_last - 2 * i_bulge;
584 if (i_pos == ihi - 2) {
585 // Special case, the bulge is at the bottom, needs a smaller
586 // reflector (order 2)
587 auto v = slice(V, range{0, 2}, i_bulge);
588 auto h = slice(A, range{i_pos, i_pos + 2}, i_pos - 1);
590 v[1] = h[1];
591 h[1] = zero;
592
593 const TA t1 = conj(v[0]);
594 const TA v2 = v[1];
595 const TA t2 = t1 * v2;
596 // Apply the reflector we just calculated from the right
597 for (idx_t j = istart_m; j < i_pos + 2; ++j) {
598 const TA sum = A(j, i_pos) + v2 * A(j, i_pos + 1);
599 A(j, i_pos) = A(j, i_pos) - sum * conj(t1);
600 A(j, i_pos + 1) = A(j, i_pos + 1) - sum * conj(t2);
601 }
602 // Apply the reflector we just calculated from the left
603 for (idx_t j = i_pos; j < istop_m; ++j) {
604 const TA sum = A(i_pos, j) + conj(v2) * A(i_pos + 1, j);
605 A(i_pos, j) = A(i_pos, j) - sum * t1;
606 A(i_pos + 1, j) = A(i_pos + 1, j) - sum * t2;
607 }
608 // Accumulate the reflector into U
609 // The loop bounds should be changed to reflect the fact
610 // that U2 starts off as diagonal
611 for (idx_t j = 0; j < nrows(U2); ++j) {
612 const TA sum = U2(j, i_pos - i_pos_block) +
613 v2 * U2(j, i_pos - i_pos_block + 1);
614 U2(j, i_pos - i_pos_block) =
615 U2(j, i_pos - i_pos_block) - sum * conj(t1);
616 U2(j, i_pos - i_pos_block + 1) =
617 U2(j, i_pos - i_pos_block + 1) - sum * conj(t2);
618 }
619 }
620 else {
621 auto v = col(V, i_bulge);
622 auto H = slice(A, range{i_pos - 1, i_pos + 3},
623 range{i_pos - 1, i_pos + 3});
624 move_bulge(H, v, s[size(s) - 1 - 2 * i_bulge],
625 s[size(s) - 1 - 2 * i_bulge - 1]);
626
627 const TA t1 = conj(v[0]);
628 const TA v2 = v[1];
629 const TA t2 = t1 * v2;
630 const TA v3 = v[2];
631 const TA t3 = t1 * v3;
632 // Apply the reflector we just calculated from the right
633 // (but leave the last row for later)
634 for (idx_t j = istart_m; j < i_pos + 3; ++j) {
635 const TA sum = A(j, i_pos) + v2 * A(j, i_pos + 1) +
636 v3 * A(j, i_pos + 2);
637 A(j, i_pos) = A(j, i_pos) - sum * conj(t1);
638 A(j, i_pos + 1) = A(j, i_pos + 1) - sum * conj(t2);
639 A(j, i_pos + 2) = A(j, i_pos + 2) - sum * conj(t3);
640 }
641
642 // Apply the reflector we just calculated from the left
643 // We only update a single column, the rest is updated later
644 const TA sum = A(i_pos, i_pos) +
645 conj(v[1]) * A(i_pos + 1, i_pos) +
646 conj(v[2]) * A(i_pos + 2, i_pos);
647 A(i_pos, i_pos) = A(i_pos, i_pos) - sum * conj(v[0]);
648 A(i_pos + 1, i_pos) =
649 A(i_pos + 1, i_pos) - sum * conj(v[0]) * v[1];
650 A(i_pos + 2, i_pos) =
651 A(i_pos + 2, i_pos) - sum * conj(v[0]) * v[2];
652
653 // Test for deflation.
654 if (i_pos > ilo) {
655 if (A(i_pos, i_pos - 1) != zero) {
656 real_t tst1 = abs1(A(i_pos - 1, i_pos - 1)) +
657 abs1(A(i_pos, i_pos));
658 if (tst1 == zero) {
659 if (i_pos > ilo + 1)
660 tst1 += abs1(A(i_pos - 1, i_pos - 2));
661 if (i_pos > ilo + 2)
662 tst1 += abs1(A(i_pos - 1, i_pos - 3));
663 if (i_pos > ilo + 3)
664 tst1 += abs1(A(i_pos - 1, i_pos - 4));
665 if (i_pos < ihi - 1)
666 tst1 += abs1(A(i_pos + 1, i_pos));
667 if (i_pos < ihi - 2)
668 tst1 += abs1(A(i_pos + 2, i_pos));
669 if (i_pos < ihi - 3)
670 tst1 += abs1(A(i_pos + 3, i_pos));
671 }
672 if (abs1(A(i_pos, i_pos - 1)) <
673 max(small_num, eps * tst1)) {
674 const real_t aij = abs1(A(i_pos, i_pos - 1));
675 const real_t aji = abs1(A(i_pos - 1, i_pos));
676 const real_t ab =
677 (aij > aji)
678 ? aij
679 : aji; // Propagates NaNs in aji
680 const real_t ba =
681 (aij < aji)
682 ? aij
683 : aji; // Propagates NaNs in aji
684 const real_t aa =
685 max(abs1(A(i_pos, i_pos)),
686 abs1(A(i_pos, i_pos) -
687 A(i_pos - 1, i_pos - 1)));
688 const real_t bb =
689 min(abs1(A(i_pos, i_pos)),
690 abs1(A(i_pos, i_pos) -
691 A(i_pos - 1, i_pos - 1)));
692 const real_t s = aa + ab;
693 if (ba * (ab / s) <=
694 max(small_num, eps * (bb * (aa / s)))) {
695 A(i_pos, i_pos - 1) = zero;
696 }
697 }
698 }
699 }
700 }
701 }
702
704 (i_pos_last + 4 > ihi) ? (i_pos_last + 4 - ihi) / 2 : 0;
705
706 // The following code performs the delayed update from the left
707 // it is optimized for column oriented matrices, but the increased
708 // complexity likely causes slower code for (idx_t j = i_pos_block;
709 // j < istop_m; ++j)
710 // {
711 // idx_t i_bulge_start2 = (i_pos_last + 2 > j) ? (i_pos_last + 2
712 // - j) / 2 : 0; i_bulge_start2 =
713 // max(i_bulge_start,i_bulge_start2); for (idx_t i_bulge =
714 // i_bulge_start2; i_bulge < n_bulges; ++i_bulge)
715 // {
716 // idx_t i_pos = i_pos_last - 2 * i_bulge;
717 // auto v = col(V, i_bulge);
718 // auto sum = A(i_pos, j) + conj(v[1]) * A(i_pos + 1, j) +
719 // conj(v[2]) * A(i_pos + 2, j); A(i_pos, j) = A(i_pos, j) -
720 // sum * conj(v[0]); A(i_pos + 1, j) = A(i_pos + 1, j) - sum
721 // * conj(v[0]) * v[1]; A(i_pos + 2, j) = A(i_pos + 2, j) -
722 // sum * conj(v[0]) * v[2];
723 // }
724 // }
725
726 // Delayed update from the left
727 for (idx_t i_bulge = i_bulge_start; i_bulge < n_bulges; ++i_bulge) {
728 idx_t i_pos = i_pos_last - 2 * i_bulge;
729 auto v = col(V, i_bulge);
730 for (idx_t j = i_pos + 1; j < istop_m; ++j) {
731 const TA sum = A(i_pos, j) + conj(v[1]) * A(i_pos + 1, j) +
732 conj(v[2]) * A(i_pos + 2, j);
733 A(i_pos, j) = A(i_pos, j) - sum * conj(v[0]);
734 A(i_pos + 1, j) = A(i_pos + 1, j) - sum * conj(v[0]) * v[1];
735 A(i_pos + 2, j) = A(i_pos + 2, j) - sum * conj(v[0]) * v[2];
736 }
737 }
738
739 // Accumulate the reflectors into U
740 for (idx_t i_bulge = i_bulge_start; i_bulge < n_bulges; ++i_bulge) {
741 idx_t i_pos = i_pos_last - 2 * i_bulge;
742 auto v = col(V, i_bulge);
743 idx_t i1 = (i_pos - i_pos_block) -
745 idx_t i2 =
746 min(nrows(U2),
748 (i_pos_last - i_pos_block - n_shifts + 2) + 3);
749 for (idx_t j = i1; j < i2; ++j) {
750 const TA sum = U2(j, i_pos - i_pos_block) +
751 v[1] * U2(j, i_pos - i_pos_block + 1) +
752 v[2] * U2(j, i_pos - i_pos_block + 2);
753 U2(j, i_pos - i_pos_block) =
754 U2(j, i_pos - i_pos_block) - sum * v[0];
755 U2(j, i_pos - i_pos_block + 1) =
756 U2(j, i_pos - i_pos_block + 1) -
757 sum * v[0] * conj(v[1]);
758 U2(j, i_pos - i_pos_block + 2) =
759 U2(j, i_pos - i_pos_block + 2) -
760 sum * v[0] * conj(v[2]);
761 }
762 }
763 }
764
765 // Update rest of the matrix
766 if (want_t) {
767 istart_m = 0;
768 istop_m = n;
769 }
770 else {
771 istart_m = ilo;
772 istop_m = ihi;
773 }
774 // Horizontal multiply
775 if (ihi < istop_m) {
776 idx_t i = ihi;
777 while (i < istop_m) {
778 idx_t iblock = std::min<idx_t>(istop_m - i, ncols(WH));
779 auto A_slice =
780 slice(A, range{i_pos_block, ihi}, range{i, i + iblock});
781 auto WH_slice = slice(WH, range{0, nrows(A_slice)},
782 range{0, ncols(A_slice)});
785 i = i + iblock;
786 }
787 }
788 // Vertical multiply
789 if (istart_m < i_pos_block) {
790 idx_t i = istart_m;
791 while (i < i_pos_block) {
792 idx_t iblock = std::min<idx_t>(i_pos_block - i, nrows(WV));
793 auto A_slice =
794 slice(A, range{i, i + iblock}, range{i_pos_block, ihi});
795 auto WV_slice = slice(WV, range{0, nrows(A_slice)},
796 range{0, ncols(A_slice)});
799 i = i + iblock;
800 }
801 }
802 // Update Z (also a vertical multiplication)
803 if (want_z) {
804 idx_t i = 0;
805 while (i < n) {
806 idx_t iblock = std::min<idx_t>(n - i, nrows(WV));
807 auto Z_slice =
808 slice(Z, range{i, i + iblock}, range{i_pos_block, ihi});
809 auto WV_slice = slice(WV, range{0, nrows(Z_slice)},
810 range{0, ncols(Z_slice)});
813 i = i + iblock;
814 }
815 }
816 }
817}
818
846template <TLAPACK_SMATRIX matrix_t,
847 TLAPACK_VECTOR vector_t,
848 enable_if_t<is_complex<type_t<vector_t>>, bool> = true>
850 bool want_z,
853 matrix_t& A,
854 const vector_t& s,
855 matrix_t& Z)
856{
857 using TA = type_t<matrix_t>;
858
859 // Functor
861
862 // Allocates workspace
865 std::vector<TA> work_;
866 auto work = new_matrix(work_, workinfo.m, workinfo.n);
867
869}
870
871} // namespace tlapack
872
873#endif // TLAPACK_QR_SWEEP_HH
constexpr internal::Forward FORWARD
Forward direction.
Definition types.hpp:376
constexpr internal::GeneralAccess GENERAL
General access.
Definition types.hpp:175
constexpr internal::ConjTranspose CONJ_TRANS
conjugate transpose
Definition types.hpp:259
constexpr internal::ColumnwiseStorage COLUMNWISE_STORAGE
Columnwise storage.
Definition types.hpp:409
constexpr internal::NoTranspose NO_TRANS
no transpose
Definition types.hpp:255
constexpr T conj(const T &x) noexcept
Extends std::conj() to real datatypes.
Definition utils.hpp:100
constexpr real_type< T > abs1(const T &x)
1-norm absolute value, |Re(x)| + |Im(x)|
Definition utils.hpp:133
#define TLAPACK_SMATRIX
Macro for tlapack::concepts::SliceableMatrix compatible with C++17.
Definition concepts.hpp:899
#define TLAPACK_WORKSPACE
Macro for tlapack::concepts::Workspace compatible with C++17.
Definition concepts.hpp:912
#define TLAPACK_VECTOR
Macro for tlapack::concepts::Vector compatible with C++17.
Definition concepts.hpp:906
void multishift_QR_sweep(bool want_t, bool want_z, size_type< matrix_t > ilo, size_type< matrix_t > ihi, matrix_t &A, const vector_t &s, matrix_t &Z)
multishift_QR_sweep performs a single small-bulge multi-shift QR sweep.
Definition multishift_qr_sweep.hpp:849
void move_bulge(matrix_t &H, vector_t &v, complex_type< type_t< matrix_t > > s1, complex_type< type_t< matrix_t > > s2)
Given a 4-by-3 matrix H and small order reflector v, move_bulge applies the delayed right update to t...
Definition move_bulge.hpp:37
void laset(uplo_t uplo, const type_t< matrix_t > &alpha, const type_t< matrix_t > &beta, matrix_t &A)
Initializes a matrix to diagonal and off-diagonal values.
Definition laset.hpp:38
void larfg(storage_t storeMode, type_t< vector_t > &alpha, vector_t &x, type_t< vector_t > &tau)
Generates a elementary Householder reflection.
Definition larfg.hpp:73
void lacpy(uplo_t uplo, const matrixA_t &A, matrixB_t &B)
Copies a matrix from A to B.
Definition lacpy.hpp:38
int lahqr_shiftcolumn(const matrix_t &H, vector_t &v, complex_type< type_t< matrix_t > > s1, complex_type< type_t< matrix_t > > s2)
Given a 2-by-2 or 3-by-3 matrix H, lahqr_shiftcolumn calculates a multiple of the product: (H - s1*I)...
Definition lahqr_shiftcolumn.hpp:41
void gemm(Op transA, Op transB, const alpha_t &alpha, const matrixA_t &A, const matrixB_t &B, const beta_t &beta, matrixC_t &C)
General matrix-matrix multiply:
Definition gemm.hpp:61
void multishift_QR_sweep_work(bool want_t, bool want_z, size_type< matrix_t > ilo, size_type< matrix_t > ihi, matrix_t &A, const vector_t &s, matrix_t &Z, work_t &work)
multishift_QR_sweep performs a single small-bulge multi-shift QR sweep. Workspace is provided as an...
Definition multishift_qr_sweep.hpp:77
#define tlapack_check(cond)
Throw an error if cond is false.
Definition exceptionHandling.hpp:98
constexpr WorkInfo multishift_QR_sweep_worksize(bool want_t, bool want_z, size_type< matrix_t > ilo, size_type< matrix_t > ihi, const matrix_t &A, const vector_t &s, const matrix_t &Z)
Workspace query of multishift_QR_sweep()
Definition multishift_qr_sweep.hpp:51
typename traits::real_type_traits< Types..., int >::type real_type
The common real type of the list of types.
Definition scalar_type_traits.hpp:113
Output information in the workspace query.
Definition workspace.hpp:16