62 using idx_t = size_type<A_t>;
63 using range = pair<idx_t, idx_t>;
64 using T = type_t<A_t>;
65 using real_t = real_type<T>;
66 using r_matrix_t = real_type<A_t>;
68 Create<A_t> new_matrix;
69 Create<r_matrix_t> new_real_matrix;
72 const idx_t n = ncols(A);
73 const idx_t nb = opts.nb;
74 const idx_t nh = ihi - ilo - 1;
88 for (idx_t j = 0; j < n; ++j)
89 for (idx_t i = j + 1; i < n; ++i)
93 if (nh <= 1)
return 0;
96 std::vector<real_t> Cl_;
97 auto Cl = new_real_matrix(Cl_, nh - 1, nb);
99 auto Sl = new_matrix(Sl_, nh - 1, nb);
100 std::vector<real_t> Cr_;
101 auto Cr = new_real_matrix(Cr_, nh - 1, nb);
103 auto Sr = new_matrix(Sr_, nh - 1, nb);
106 auto Qt = new_matrix(Qt_, 2 * nb, 2 * nb);
108 auto C = new_matrix(C_, 2 * nb, n);
109 auto D = new_matrix(C_, n, 2 * nb);
111 for (idx_t j = ilo; j + 2 < ihi; j = j + nb) {
113 idx_t nnb = std::min<idx_t>(nb, ihi - 2 - j);
115 idx_t n2nb = (ihi - j - 2) / nnb - 1;
117 idx_t nblst = ihi - j - 1 - n2nb * nnb;
122 for (idx_t jb = 0; jb < nnb; ++jb) {
124 for (idx_t jbb = 0; jbb < jb; ++jbb) {
125 for (idx_t i = ihi - 1; i > j + jbb + 1; --i) {
126 real_t c = Cl(i - ilo - 2, jbb);
127 T s = Sl(i - ilo - 2, jbb);
128 T temp = c * A(i - 1, j + jb) + s * A(i, j + jb);
130 -conj(s) * A(i - 1, j + jb) + c * A(i, j + jb);
131 A(i - 1, j + jb) = temp;
135 for (idx_t i = ihi - 1; i > j + jb + 1; --i) {
136 rotg(A(i - 1, j + jb), A(i, j + jb), Cl(i - ilo - 2, jb),
137 Sl(i - ilo - 2, jb));
142 auto B2 = slice(B, range(j + jb + 1, ihi), range(j + jb + 1, ihi));
143 auto clv = slice(Cl, range(j - ilo + jb, ihi - ilo - 2), jb);
144 auto slv = slice(Sl, range(j - ilo + jb, ihi - ilo - 2), jb);
145 auto crv = slice(Cr, range(j - ilo + jb, ihi - ilo - 2), jb);
146 auto srv = slice(Sr, range(j - ilo + jb, ihi - ilo - 2), jb);
148 auto B3 = slice(B, range(j, j + jb + 1), range(j + jb + 1, ihi));
151 auto A2 = slice(A, range(j, ihi), range(j + jb + 1, ihi));
163 auto Qt2 = slice(Qt, range(0, nblst), range(0, nblst));
164 laset(GENERAL, (T)0, (T)1, Qt2);
166 for (idx_t jb = 0; jb < nnb; ++jb) {
167 for (idx_t i = nblst - 1; i > jb; --i) {
168 auto q1 = slice(Qt2, range(i - 1 - jb, nblst), i - 1);
169 auto q2 = slice(Qt2, range(i - 1 - jb, nblst), i);
170 rot(q1, q2, Cl(j - ilo + nnb * n2nb + i - 1, jb),
171 conj(Sl(j - ilo + nnb * n2nb + i - 1, jb)));
175 auto A2 = slice(A, range(ihi - nblst, ihi), range(j + nnb, n));
176 auto C2 = slice(C, range(0, nblst), range(j + nnb, n));
177 gemm(CONJ_TRANS, NO_TRANS, (T)1, Qt2, A2, C2);
178 lacpy(GENERAL, C2, A2);
181 auto B2 = slice(B, range(ihi - nblst, ihi), range(ihi, n));
182 auto C3 = slice(C, range(0, nblst), range(ihi, n));
183 gemm(CONJ_TRANS, NO_TRANS, (T)1, Qt2, B2, C3);
184 lacpy(GENERAL, C3, B2);
187 auto Q2 = cols(Q, range(ihi - nblst, ihi));
188 auto D2 = cols(D, range(0, nblst));
189 gemm(NO_TRANS, NO_TRANS, (T)1, Q2, Qt2, D2);
190 lacpy(GENERAL, D2, Q2);
192 for (idx_t ib = n2nb - 1; ib != (idx_t)-1; ib--) {
193 auto Qt2 = slice(Qt, range(0, 2 * nnb), range(0, 2 * nnb));
194 laset(GENERAL, (T)0, (T)1, Qt2);
195 for (idx_t jb = 0; jb < nnb; ++jb) {
196 for (idx_t i = nnb + jb; i > jb; --i) {
198 slice(Qt2, range(i - 1 - jb, nnb + jb + 1), i - 1);
199 auto q2 = slice(Qt2, range(i - 1 - jb, nnb + jb + 1), i);
200 rot(q1, q2, Cl(j - ilo + ib * nnb + i - 1, jb),
201 conj(Sl(j - ilo + ib * nnb + i - 1, jb)));
206 slice(A, range(j + 1 + nnb * ib, j + 1 + nnb * ib + 2 * nnb),
208 auto C2 = slice(C, range(0, 2 * nnb), range(j + nnb, n));
209 gemm(CONJ_TRANS, NO_TRANS, (T)1, Qt2, A2, C2);
210 lacpy(GENERAL, C2, A2);
214 B, range(j + 1 + nnb * ib, j + 1 + nnb * ib + 2 * nnb),
216 auto C3 = slice(C, range(0, 2 * nnb), range(ihi, n));
217 gemm(CONJ_TRANS, NO_TRANS, (T)1, Qt2, B2, C3);
218 lacpy(GENERAL, C3, B2);
222 cols(Q, range(j + 1 + nnb * ib, j + 1 + nnb * ib + 2 * nnb));
223 auto D2 = cols(D, range(0, 2 * nnb));
224 gemm(NO_TRANS, NO_TRANS, (T)1, Q2, Qt2, D2);
225 lacpy(GENERAL, D2, Q2);
234 auto Qt2 = slice(Qt, range(0, nblst), range(0, nblst));
235 laset(GENERAL, (T)0, (T)1, Qt2);
237 for (idx_t jb = 0; jb < nnb; ++jb) {
238 for (idx_t i = nblst - 1; i > jb; --i) {
239 auto q1 = slice(Qt2, range(i - 1 - jb, nblst), i - 1);
240 auto q2 = slice(Qt2, range(i - 1 - jb, nblst), i);
241 rot(q1, q2, Cr(j - ilo + nnb * n2nb + i - 1, jb),
242 conj(Sr(j - ilo + nnb * n2nb + i - 1, jb)));
247 auto A2 = slice(A, range(0, j), range(ihi - nblst, ihi));
248 auto D2 = slice(D, range(0, j), range(0, nblst));
249 gemm(NO_TRANS, NO_TRANS, (T)1, A2, Qt2, D2);
250 lacpy(GENERAL, D2, A2);
252 auto B2 = slice(B, range(0, j), range(ihi - nblst, ihi));
253 gemm(NO_TRANS, NO_TRANS, (T)1, B2, Qt2, D2);
254 lacpy(GENERAL, D2, B2);
257 auto Z2 = cols(Z, range(ihi - nblst, ihi));
258 auto D2 = cols(D, range(0, nblst));
259 gemm(NO_TRANS, NO_TRANS, (T)1, Z2, Qt2, D2);
260 lacpy(GENERAL, D2, Z2);
262 for (idx_t ib = n2nb - 1; ib != (idx_t)-1; ib--) {
263 auto Qt2 = slice(Qt, range(0, 2 * nnb), range(0, 2 * nnb));
264 laset(GENERAL, (T)0, (T)1, Qt2);
265 for (idx_t jb = 0; jb < nnb; ++jb) {
266 for (idx_t i = nnb + jb; i > jb; --i) {
268 slice(Qt2, range(i - 1 - jb, nnb + jb + 1), i - 1);
269 auto q2 = slice(Qt2, range(i - 1 - jb, nnb + jb + 1), i);
270 rot(q1, q2, Cr(j - ilo + ib * nnb + i - 1, jb),
271 conj(Sr(j - ilo + ib * nnb + i - 1, jb)));
277 slice(A, range(0, j),
278 range(j + 1 + nnb * ib, j + 1 + nnb * ib + 2 * nnb));
279 auto D2 = slice(D, range(0, j), range(0, 2 * nnb));
280 gemm(NO_TRANS, NO_TRANS, (T)1, A2, Qt2, D2);
281 lacpy(GENERAL, D2, A2);
284 slice(B, range(0, j),
285 range(j + 1 + nnb * ib, j + 1 + nnb * ib + 2 * nnb));
286 gemm(NO_TRANS, NO_TRANS, (T)1, B2, Qt2, D2);
287 lacpy(GENERAL, D2, B2);
291 cols(Z, range(j + 1 + nnb * ib, j + 1 + nnb * ib + 2 * nnb));
292 auto D2 = cols(D, range(0, 2 * nnb));
293 gemm(NO_TRANS, NO_TRANS, (T)1, Z2, Qt2, D2);
294 lacpy(GENERAL, D2, Z2);