65 using idx_t = size_type<matrix_t>;
66 using work_t = matrix_type<matrix_t, vector_t>;
67 using range = pair<idx_t, idx_t>;
70 const idx_t m = nrows(A);
71 const idx_t n = ncols(A);
72 const idx_t k = size(tau);
73 const idx_t nb = min((idx_t)opts.nb, k);
76 auto&& V = (storeMode == StoreV::Columnwise)
77 ? slice(A, range{0, m}, range{0, nb})
78 : slice(A, range{0, nb}, range{0, n});
80 if (storeMode == StoreV::Columnwise) {
84 auto&& matrixT = slice(A, range{0, nb}, range{0, nb});
85 auto&& C = slice(A, range{0, m},
86 (direction == Direction::Forward)
88 : range{0, (n - k) + ((k - 1) / nb) * nb});
91 workinfo = larfb_worksize<T>(LEFT_SIDE, NO_TRANS, direction,
92 COLUMNWISE_STORAGE, V, matrixT, C);
95 if (is_same_v<T, type_t<work_t>>) workinfo += WorkInfo(nb, nb);
102 auto&& matrixT = slice(A, range{0, nb}, range{0, nb});
104 (direction == Direction::Forward)
106 : range{0, (m - k) + ((k - 1) / nb) * nb},
110 workinfo = larfb_worksize<T>(RIGHT_SIDE, CONJ_TRANS, direction,
111 ROWWISE_STORAGE, V, matrixT, C);
114 if (is_same_v<T, type_t<work_t>>) workinfo += WorkInfo(nb, nb);
119 auto&& taui = slice(tau, range{0, nb});
120 workinfo.minMax(ungq_level2_worksize<T>(direction, storeMode, V, taui));
145 using T = type_t<matrix_t>;
146 using real_t = real_type<T>;
147 using idx_t = size_type<matrix_t>;
148 using range = pair<idx_t, idx_t>;
151 const real_t zero(0);
153 const idx_t m = nrows(A);
154 const idx_t n = ncols(A);
155 const idx_t k = size(tau);
156 const idx_t nb = min((idx_t)opts.nb, k);
160 direction != Direction::Forward);
161 tlapack_check((storeMode == StoreV::Columnwise) ? (m >= n && n >= k)
162 : (n >= m && m >= k));
165 if (m <= 0 || n <= 0)
return 0;
168 auto [matrixT, work1] =
169 (min(m, n) > nb) ? reshape(work, nb, nb) : reshape(work, 0, 0);
171 if (storeMode == StoreV::Columnwise) {
172 if (direction == Direction::Forward) {
174 auto A0 = slice(A, range{0, k}, range{k, n});
175 auto A1 = slice(A, range{k, m}, range{k, n});
176 laset(GENERAL, zero, zero, A0);
177 laset(GENERAL, zero, one, A1);
179 for (idx_t j = ((k - 1) / nb) * nb; j != -nb; j -= nb) {
180 const idx_t ib = min(nb, k - j);
181 const auto tauj = slice(tau, range{j, j + ib});
183 auto V = slice(A, range{j, m}, range{j, j + ib});
187 auto matrixTj = slice(matrixT, range{0, ib}, range{0, ib});
188 auto C = slice(A, range{j, m}, range{j + ib, n});
190 larft(FORWARD, COLUMNWISE_STORAGE, V, tauj, matrixTj);
191 larfb_work(LEFT_SIDE, NO_TRANS, FORWARD, COLUMNWISE_STORAGE,
192 V, matrixTj, C, work1);
197 auto A0 = slice(A, range{0, j}, range{j, j + ib});
198 laset(GENERAL, zero, zero, A0);
204 auto A0 = slice(A, range{0, m - n}, range{0, n - k});
205 auto A1 = slice(A, range{m - n, m - k}, range{0, n - k});
206 auto A2 = slice(A, range{m - k, m}, range{0, n - k});
207 laset(GENERAL, zero, zero, A0);
208 laset(GENERAL, zero, one, A1);
209 laset(GENERAL, zero, zero, A2);
211 for (idx_t j = 0; j < k; j += nb) {
212 const idx_t ib = min(nb, k - j);
213 const idx_t sizev = m - k + j + ib;
214 const idx_t jj = n - k + j;
215 const auto tauj = slice(tau, range{j, j + ib});
217 auto V = slice(A, range{0, sizev}, range{jj, jj + ib});
222 auto matrixTj = slice(matrixT, range{0, ib}, range{0, ib});
223 auto C = slice(A, range{0, sizev}, range{0, jj});
225 larft(BACKWARD, COLUMNWISE_STORAGE, V, tauj, matrixTj);
227 COLUMNWISE_STORAGE, V, matrixTj, C, work1);
232 auto A0 = slice(A, range{sizev, m}, range{jj, jj + ib});
234 laset(GENERAL, zero, zero, A0);
239 if (direction == Direction::Forward) {
241 auto A0 = slice(A, range{k, m}, range{0, k});
242 auto A1 = slice(A, range{k, m}, range{k, n});
243 laset(GENERAL, zero, zero, A0);
244 laset(GENERAL, zero, one, A1);
246 for (idx_t i = ((k - 1) / nb) * nb; i != -nb; i -= nb) {
247 const idx_t ib = min(nb, k - i);
248 const auto taui = slice(tau, range{i, i + ib});
250 auto V = slice(A, range{i, i + ib}, range{i, n});
254 auto matrixTi = slice(matrixT, range{0, ib}, range{0, ib});
255 auto C = slice(A, range{i + ib, m}, range{i, n});
257 larft(FORWARD, ROWWISE_STORAGE, V, taui, matrixTi);
258 larfb_work(RIGHT_SIDE, CONJ_TRANS, FORWARD, ROWWISE_STORAGE,
259 V, matrixTi, C, work1);
264 auto A0 = slice(A, range{i, i + ib}, range{0, i});
265 laset(GENERAL, zero, zero, A0);
271 auto A0 = slice(A, range{0, m - k}, range{0, n - m});
272 auto A1 = slice(A, range{0, m - k}, range{n - m, n - k});
273 auto A2 = slice(A, range{0, m - k}, range{n - k, n});
274 laset(GENERAL, zero, zero, A0);
275 laset(GENERAL, zero, one, A1);
276 laset(GENERAL, zero, zero, A2);
278 for (idx_t i = 0; i < k; i += nb) {
279 const idx_t ib = min(nb, k - i);
280 const idx_t sizev = n - k + i + ib;
281 const idx_t ii = m - k + i;
282 const auto taui = slice(tau, range{i, i + ib});
284 auto V = slice(A, range{ii, ii + ib}, range{0, sizev});
289 auto matrixTi = slice(matrixT, range{0, ib}, range{0, ib});
290 auto C = slice(A, range{0, ii}, range{0, sizev});
292 larft(BACKWARD, ROWWISE_STORAGE, V, taui, matrixTi);
294 ROWWISE_STORAGE, V, matrixTi, C, work1);
299 auto A0 = slice(A, range{ii, ii + ib}, range{sizev, n});
301 laset(GENERAL, zero, zero, A0);