// Xerus - A General Purpose Tensor Library
// Copyright (C) 2014-2019 Benjamin Huber and Sebastian Wolf.
// 
// Xerus is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published
// by the Free Software Foundation, either version 3 of the License,
// or (at your option) any later version.
// 
// Xerus is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
// 
// You should have received a copy of the GNU Affero General Public License
// along with Xerus. If not, see <http://www.gnu.org/licenses/>.
//
// For further information on Xerus visit https://libXerus.org 
// or contact us at contact@libXerus.org.

/**
 * @file
 * @brief Implementation of the BLAS and LAPACK wrapper functions.
 */


#include <complex.h>
// fix for the non-standard-conforming complex implementation
#undef I

// workaround for broken Lapack
#define lapack_complex_float    float _Complex
#define lapack_complex_double   double _Complex
extern "C"
{
	#include <cblas.h>
}

#ifdef __has_include
	#if __has_include(<lapacke.h>)
		#include <lapacke.h>
	#elif __has_include(<lapacke/lapacke.h>)
		#include <lapacke/lapacke.h>
	#else
		#error no lapacke found
	#endif
#else
	#include <lapacke.h>
#endif


#include <memory>
#include <xerus/misc/standard.h>
#include <xerus/misc/performanceAnalysis.h>
#include <xerus/misc/check.h>

#include <xerus/misc/stringUtilities.h>
#include <xerus/basic.h>

#include <xerus/blasLapackWrapper.h>
#include <xerus/misc/basicArraySupport.h>
#include <xerus/misc/math.h>
#include <xerus/misc/internal.h>



namespace xerus {
	namespace blasWrapper {
		/// @brief stores in @a _out the transpose of the @a _m x @a _n matrix @a _in
		void low_level_transpose(double * _out, const double * _in, size_t _m, size_t _n) {
			for (size_t i=0; i<_m; ++i) {
				for (size_t j=0; j<_n; ++j) {
					_out[j*_m + i] = _in[i*_n+j];
				}
			}
		}
		
		/// @brief checks whether the given double array contains NaNs
		bool contains_nan(const double * _in, size_t _n) {
			for (size_t i=0; i<_n; ++i) {
				if (std::isnan(_in[i])) return true;
			}
			return false;
		}
		
		//----------------------------------------------- LEVEL I BLAS ----------------------------------------------------------
		
		double one_norm(const double* const _x, const size_t _n) {
			REQUIRE(_n <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension too large for BLAS/Lapack");
			
			XERUS_PA_START;
			
			const double result = cblas_dasum(static_cast<int>(_n), _x, 1);
			
			XERUS_PA_END("Dense BLAS", "One Norm", misc::to_string(_n));
			
			return result;
		}
		
		double two_norm(const double* const _x, const size_t _n) {
			REQUIRE(_n <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension too large for BLAS/Lapack");
			
			XERUS_PA_START;
			
			const double result = cblas_dnrm2(static_cast<int>(_n), _x, 1);
			
			XERUS_PA_END("Dense BLAS", "Two Norm", misc::to_string(_n));
			
			return result;
		}
		
		double dot_product(const double* const _x, const size_t _n, const double* const _y) {
			REQUIRE(_n <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension too large for BLAS/Lapack");
			
			XERUS_PA_START;
			
			const double result = cblas_ddot(static_cast<int>(_n), _x, 1, _y, 1);
			
			XERUS_PA_END("Dense BLAS", "Dot Product", misc::to_string(_n)+"*"+misc::to_string(_n));
			
			return result;
		}
		
		
		//----------------------------------------------- LEVEL II BLAS ---------------------------------------------------------
		
		void matrix_vector_product(double* const _x, const size_t _m, const double _alpha, const double* const _A, const size_t _n, const bool _transposed, const double* const _y) {
			// Delegate call if appropriate
			if(_m == 1) { *_x = _alpha*dot_product(_A, _n, _y); return;}
			
			REQUIRE(_m <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension too large for BLAS/Lapack");
			REQUIRE(_n <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension too large for BLAS/Lapack");

			XERUS_PA_START;
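			// Note on the transposed case below: _A is then expected in (_n x _m) row-major storage
			// (lda = _m); cblas applies op(A) = A^T, i.e. the desired _m x _n operator, to _y.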
			if(!_transposed) {
				cblas_dgemv(CblasRowMajor, CblasNoTrans, static_cast<int>(_m), static_cast<int>(_n), _alpha, _A, static_cast<int>(_n), _y, 1, 0.0, _x, 1);
			} else {
				cblas_dgemv(CblasRowMajor, CblasTrans, static_cast<int>(_n), static_cast<int>(_m), _alpha, _A, static_cast<int>(_m) , _y, 1, 0.0, _x, 1);
			}
			XERUS_PA_END("Dense BLAS", "Matrix Vector Product", misc::to_string(_m)+"x"+misc::to_string(_n)+" * "+misc::to_string(_n));
		}
		
		void dyadic_vector_product(double* _A, const size_t _m, const size_t _n, const double _alpha, const double*const  _x, const double* const _y) {
			REQUIRE(_m <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension too large for BLAS/Lapack");
			REQUIRE(_n <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension too large for BLAS/Lapack");
			
			XERUS_PA_START;
			
			// BLAS adds the product to the existing content of A, which we do not want here.
			misc::set_zero(_A, _m*_n);
			
			cblas_dger(CblasRowMajor, static_cast<int>(_m), static_cast<int>(_n), _alpha, _x, 1, _y, 1, _A, static_cast<int>(_n));
			
			XERUS_PA_END("Dense BLAS", "Dyadic Vector Product", misc::to_string(_m)+" o "+misc::to_string(_n));
		}
		
		
		//----------------------------------------------- LEVEL III BLAS --------------------------------------------------------
		/// Performs the Matrix-Matrix product c = a * b
		void matrix_matrix_product( double* const _C,
									const size_t _leftDim,
									const size_t _rightDim,
									const double _alpha,
									const double* const _A,
									const size_t _lda,
									const bool _transposeA,
									const size_t _middleDim,
									const double* const _B,
									const size_t _ldb,
									const bool _transposeB) {
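			// Degenerate shapes are delegated to level 1/2 BLAS below: a single row or column reduces the
			// product to a (transposed) matrix-vector product, and _middleDim == 1 to a dyadic product.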
			//Delegate call if appropriate
			if(_leftDim == 1) {
				matrix_vector_product(_C, _rightDim, _alpha, _B, _middleDim, !_transposeB, _A);
			} else if(_rightDim == 1) {
				matrix_vector_product(_C, _leftDim, _alpha, _A, _middleDim, _transposeA, _B);
			} else if(_middleDim == 1) { 
				dyadic_vector_product(_C, _leftDim, _rightDim, _alpha, _A, _B);
			} else {
			
				REQUIRE(_leftDim <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension too large for BLAS/Lapack");
				REQUIRE(_middleDim <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension too large for BLAS/Lapack");
				REQUIRE(_rightDim <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension too large for BLAS/Lapack");
				REQUIRE(_lda <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension too large for BLAS/Lapack");
				REQUIRE(_ldb <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension too large for BLAS/Lapack");
				
				XERUS_PA_START;
				
				cblas_dgemm( CblasRowMajor,                             // Array storage format
						_transposeA ? CblasTrans : CblasNoTrans,        // LHS transposed?
						_transposeB ? CblasTrans : CblasNoTrans,        // RHS transposed?
						static_cast<int>(_leftDim),                     // Left dimension
						static_cast<int>(_rightDim),                    // Right dimension
						static_cast<int>(_middleDim),                   // Middle dimension
						_alpha,                                         // Factor to the Product
						_A,                                             // Pointer to LHS
						static_cast<int>(_lda),                         // LDA
						_B,                                             // Pointer to RHS
						static_cast<int>(_ldb),                         // LDB
						0.0,                                            // Factor of C (Zero if only the product is required)
						_C,                                             // Pointer to result
						static_cast<int>(_rightDim)                     // LDC
				);
				
				XERUS_PA_END("Dense BLAS", "Matrix-Matrix-Multiplication", misc::to_string(_leftDim)+"x"+misc::to_string(_middleDim)+" * "+misc::to_string(_middleDim)+"x"+misc::to_string(_rightDim));
			}
		}
		
		
		
		//----------------------------------------------- LAPACK ----------------------------------------------------------------
		
		void svd( double* const _U, double* const _S, double* const _Vt, const double* const _A, const size_t _m, const size_t _n) {
			//Create copy of A
			const std::unique_ptr<double[]> tmpA(new double[_m*_n]);
			misc::copy(tmpA.get(), _A, _m*_n);
			
			svd_destructive(_U, _S, _Vt, tmpA.get(), _m, _n);
		}
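		// Usage sketch for svd() above (hypothetical caller, row-major input): _U must provide
		// _m*min(_m,_n) doubles, _S min(_m,_n) doubles and _Vt min(_m,_n)*_n doubles, e.g.
		//     double A[6] = {1,2,3, 4,5,6};   // 2x3 matrix
		//     double U[4], S[2], Vt[6];
		//     xerus::blasWrapper::svd(U, S, Vt, A, 2, 3);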
		
		
		lapack_int dgesdd_get_workarray_size(lapack_int m, lapack_int n) {
			lapack_int info = 0;
			char job = 'S';
			double work = 0;
			lapack_int lwork = -1;
			lapack_int min = std::min(m,n);
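			// Standard LAPACK workspace query: lwork = -1 makes dgesdd report the optimal
			// workspace size in work instead of computing anything.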
			dgesdd_( &job, &n, &m, nullptr, &n, nullptr, nullptr, &n, nullptr, &min, &work, &lwork, nullptr, &info );
			REQUIRE(info == 0, "work array size query of dgesdd returned " << info);
			return lapack_int(work);
		}
		
		
		void dgesdd_work(lapack_int m, lapack_int n, double* a, double* s, double* u, double* vt, double* work, lapack_int lwork, lapack_int* iwork ) {
			REQUIRE(lwork > 0, "dgesdd work array size must be positive");
			lapack_int info = 0;
			char job = 'S';
			lapack_int min = std::min(m,n);
			
			// if A = U*S*V^T, then A^T = V^T*S*U^T, so instead of transposing all input and output matrices we can simply exchange the order of U and Vt
			dgesdd_( &job, &n, &m, a, &n, s, vt, &n, u, &min, work, &lwork, iwork, &info );
			REQUIRE(info == 0, "dgesdd failed with info " << info);
		}
		
		void svd_destructive( double* const _U, double* const _S, double* const _Vt, double* const _A, const size_t _m, const size_t _n) {
			REQUIRE(_m <= static_cast<size_t>(std::numeric_limits<lapack_int>::max()), "Dimension too large for BLAS/Lapack");
			REQUIRE(_n <= static_cast<size_t>(std::numeric_limits<lapack_int>::max()), "Dimension too large for BLAS/Lapack");
			REQUIRE(_m*_n <= static_cast<size_t>(std::numeric_limits<lapack_int>::max()), "Dimension too large for BLAS/Lapack");
			
			REQUIRE(!contains_nan(_A, _m*_n), "Input matrix to SVD may not contain NaN");
			
			XERUS_PA_START;
			lapack_int m = lapack_int(_m);
			lapack_int n = lapack_int(_n);
			std::unique_ptr<lapack_int[]> iwork(new lapack_int[std::max(1ul,8u*std::min(_m,_n))]);
			lapack_int lwork = dgesdd_get_workarray_size(m, n);
			std::unique_ptr<double[]> work(new double[size_t(lwork)]);
			
			dgesdd_work( m, n, _A, _S, _U, _Vt, work.get(), lwork, iwork.get());
			
			XERUS_PA_END("Dense LAPACK", "Singular Value Decomposition", misc::to_string(_m)+"x"+misc::to_string(_n));
		}
		
		
		std::tuple<std::unique_ptr<double[]>, std::unique_ptr<double[]>, size_t> qc(const double* const _A, const size_t _m, const size_t _n) {
			REQUIRE(_m <= static_cast<size_t>(std::numeric_limits<lapack_int>::max()), "Dimension too large for BLAS/Lapack");
			REQUIRE(_n <= static_cast<size_t>(std::numeric_limits<lapack_int>::max()), "Dimension too large for BLAS/Lapack");
			
			REQUIRE(_n > 0, "Dimension n must be larger than zero");
			REQUIRE(_m > 0, "Dimension m must be larger than zero");
			
			REQUIRE(!contains_nan(_A, _m*_n), "Input matrix to QC factorization may not contain NaN");
			
			XERUS_PA_START;
			
			const size_t maxRank = std::min(_m, _n);
			
			// Factors for Householder reflections
			const std::unique_ptr<double[]> tau(new double[maxRank]);
			
			const std::unique_ptr<lapack_int[]> permutation(new lapack_int[_n]);
			misc::set_zero(permutation.get(), _n); // Lapack requires the entries to be zero.
			
			// transpose input
			const std::unique_ptr<double[]> tA(new double[_m*_n]);
			low_level_transpose(tA.get(), _A, _m, _n);
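			// tA now holds _A in column-major order, as expected by the raw Fortran routines used below.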
			
			// Calculate QR factorisation with column pivoting
			lapack_int m = static_cast<lapack_int>(_m);
			lapack_int n = static_cast<lapack_int>(_n);
			lapack_int info = 0;
			lapack_int lwork = -1;
			double work_query = 0;
			// query work array size
			dgeqp3_(&m, &n, nullptr, &m, nullptr, nullptr, &work_query, &lwork, &info);
			REQUIRE(info == 0, "dgeqp3 (QC) work size query failed. info = " << info);
			lwork = lapack_int(work_query);
			std::unique_ptr<double[]> work(new double[size_t(lwork)]);
			
			// perform factorization
			dgeqp3_(&m, &n, tA.get(), &m, permutation.get(), tau.get(), work.get(), &lwork, &info);
			REQUIRE(info == 0, "dgeqp3 (QC) failed. info = " << info);
			
			
			// Determine the actual rank
			size_t rank;
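			// Column pivoting in dgeqp3 sorts the diagonal of R by decreasing magnitude, so the first
			// diagonal entry falling below the cutoff (relative to |R(0,0)|) determines the numerical rank.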
			auto cutoff = 16*std::numeric_limits<double>::epsilon()*std::abs(tA[0]);
			for (rank = 1; rank < maxRank; ++rank) {
				if (std::abs(tA[rank*(_m+1)]) < cutoff) {
					break;
				}
			}
			
			
			// Create the matrix C
			std::unique_ptr<double[]> C(new double[rank*_n]);
			misc::set_zero(C.get(), rank*_n); 
			
			// Copy the upper triangular Matrix C (rank x _n) into position
			for (size_t col = 0; col < _n; ++col) {
				const size_t targetCol = static_cast<size_t>(permutation[col]-1); // Lapack indices start at 1 (instead of 0).
				for(size_t row = 0; row < rank && row <= col; ++row) {
					C[row*_n + targetCol] = tA[row + col*_m];
				}
			}
			
			
			// Create orthogonal matrix Q
			lapack_int lwork2 = -1;
			lapack_int min = std::min(m,n);
			dorgqr_(&m, &min, &min, nullptr, &m, nullptr, &work_query, &lwork2, &info);
			REQUIRE(info == 0, "dorgqr_ (QC) getting work array size failed. info = " << info);
			lwork2 = lapack_int(work_query);
			if (lwork2 > lwork) {
				lwork = lwork2;
				work.reset(new double[size_t(lwork)]);
			}
			dorgqr_(&m, &min, &min, tA.get(), &m, tau.get(), work.get(), &lwork, &info);
			REQUIRE(info == 0, "dorgqr_ (QC) failed. info = " << info);
			
			// Copy the newly created Q into position
			std::unique_ptr<double[]> Q(new double[_m*rank]);
			if(rank == _n) {
				low_level_transpose(Q.get(), tA.get(), rank, _m);
			} else {
				for(size_t row = 0; row < _m; ++row) {
					for (size_t col = 0; col < rank; ++col) {
						Q[row*rank + col] = tA[row + col*_m];
					}
				}
			}
			
			XERUS_PA_END("Dense LAPACK", "QRP Factorisation", misc::to_string(_m)+"x"+misc::to_string(rank)+" * "+misc::to_string(rank)+"x"+misc::to_string(_n));
			
			return std::make_tuple(std::move(Q), std::move(C), rank);
		}
		
		
		std::tuple<std::unique_ptr<double[]>, std::unique_ptr<double[]>, size_t> qc_destructive(double* const _A, const size_t _m, const size_t _n) {
			return qc(_A, _m, _n);
		}
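		// Note: qc_destructive() above does not actually work in place; it forwards to qc(), which
		// operates on its own transposed copy of the input.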
		
		
		std::tuple<std::unique_ptr<double[]>, std::unique_ptr<double[]>, size_t> cq(const double* const _A, const size_t _m, const size_t _n) {
			const std::unique_ptr<double[]> tmpA(new double[_m*_n]);
			misc::copy(tmpA.get(), _A, _m*_n);
			
			return cq_destructive(tmpA.get(), _m, _n);
		}
		
		
		// We use that in col-major we get At = Qt * Ct => A = C * Q, i.e. doing the calculation in col-major and switching Q and C gives the desired result.
		std::tuple<std::unique_ptr<double[]>, std::unique_ptr<double[]>, size_t> cq_destructive(double* const _A, const size_t _m, const size_t _n) {
			REQUIRE(_n <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension too large for BLAS/Lapack");
			REQUIRE(_m <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension too large for BLAS/Lapack");
			
			REQUIRE(_m > 0, "Dimension m must be larger than zero");
			REQUIRE(_n > 0, "Dimension n must be larger than zero");
			
			XERUS_PA_START;
			
			// Maximal rank is used by Lapacke
			const size_t maxRank = std::min(_n, _m);
			
			// Tmp Array for Lapacke
			const std::unique_ptr<double[]> tau(new double[maxRank]);
			
			const std::unique_ptr<int[]> permutation(new int[_m]());
			misc::set_zero(permutation.get(), _m); // Lapack requires the entries to be zero.
			
			// Calculate QR factorisations with column pivoting
			IF_CHECK(int lapackAnswer = ) LAPACKE_dgeqp3(LAPACK_COL_MAJOR, static_cast<int>(_n), static_cast<int>(_m), _A, static_cast<int>(_n), permutation.get(), tau.get());
			REQUIRE(lapackAnswer == 0, "Unable to perform QC factorisation (dgeqp3). Lapacke says: " << lapackAnswer );
			
			
			// Determine the actual rank
			size_t rank;
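			// As in qc(): thanks to column pivoting the diagonal of R decreases in magnitude, so the first
			// entry below the relative cutoff yields the numerical rank.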
			for (rank = 1; rank <= maxRank; ++rank) {
				if (rank == maxRank || std::abs(_A[rank+rank*_n]) < 16*std::numeric_limits<double>::epsilon()*_A[0]) {
					break;
				}
			}
			
			
			// Create the matrix C
			std::unique_ptr<double[]> C(new double[rank*_m]);
			misc::set_zero(C.get(), rank*_m); 
			
			// Copy the upper triangular Matrix C (rank x _m) into position
			for (size_t col = 0; col < _m; ++col) {
				const size_t targetCol = static_cast<size_t>(permutation[col]-1); // Lapack indices start at 1 (instead of 0).
				misc::copy(C.get()+targetCol*rank, _A+col*_n, std::min(rank, col+1));
			}
			
			
			// Create orthogonal matrix Q
			IF_CHECK(lapackAnswer = ) LAPACKE_dorgqr(LAPACK_COL_MAJOR, static_cast<int>(_n), static_cast<int>(maxRank), static_cast<int>(maxRank), _A, static_cast<int>(_n), tau.get());
			CHECK(lapackAnswer == 0, error, "Unable to reconstruct Q from the QC factorisation. Lapacke says: " << lapackAnswer);
			
			// Copy the newly created Q into position
			std::unique_ptr<double[]> Q(new double[_n*rank]);
			misc::copy(Q.get(), _A, _n*rank);
			
			XERUS_PA_END("Dense LAPACK", "QRP Factorisation", misc::to_string(_n)+"x"+misc::to_string(rank)+" * "+misc::to_string(rank)+"x"+misc::to_string(_m));
			
			return std::make_tuple(std::move(C), std::move(Q), rank);
		}
		
		
		void qr(double* const _Q, double* const _R, const double* const _A, const size_t _m, const size_t _n) {
			// Create tmp copy of A since Lapack wants to destroy it
			const std::unique_ptr<double[]> tmpA(new double[_m*_n]);
			misc::copy(tmpA.get(), _A, _m*_n);
			
			qr_destructive(_Q, _R, tmpA.get(), _m, _n);
		}
		
		
		void inplace_qr(double* const _AtoQ, double* const _R, const size_t _m, const size_t _n) {
			qr_destructive(_AtoQ, _R, _AtoQ, _m, _n);
		}
		
		
		void qr_destructive( double* const _Q, double* const _R, double* const _A, const size_t _m, const size_t _n) {
			REQUIRE(_m <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension too large for BLAS/Lapack");
			REQUIRE(_n <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension too large for BLAS/Lapack");
			
			REQUIRE(_n > 0, "Dimension n must be larger than zero");
			REQUIRE(_m > 0, "Dimension m must be larger than zero");
			
			REQUIRE(_Q && _R && _A, "QR decomposition must not be called with null pointers: Q:" << _Q << " R: " << _R << " A: " << _A);
			REQUIRE(_A != _R, "_A and _R must be different, otherwise qr call will fail.");
			
			XERUS_PA_START;
			
			// Maximal rank is used by Lapacke
			const size_t rank = std::min(_m, _n); 
			
			// Tmp Array for Lapacke
			const std::unique_ptr<double[]> tau(new double[rank]);
			
			// Calculate QR factorisation
			IF_CHECK( int lapackAnswer = ) LAPACKE_dgeqrf(LAPACK_ROW_MAJOR, static_cast<int>(_m), static_cast<int>(_n), _A, static_cast<int>(_n), tau.get());
			CHECK(lapackAnswer == 0, error, "Unable to perform QR factorisation. Lapacke says: " << lapackAnswer );
			
			// Copy the upper triangular Matrix R (rank x _n) into position
			for(size_t row =0; row < rank; ++row) {
				misc::set_zero(_R+row*_n, row); // Set starting zeros
				misc::copy(_R+row*_n+row, _A+row*_n+row, _n-row); // Copy upper triangular part from lapack result.
			}
			
			// Create orthogonal matrix Q (in tmpA)
			IF_CHECK( lapackAnswer = ) LAPACKE_dorgqr(LAPACK_ROW_MAJOR, static_cast<int>(_m), static_cast<int>(rank), static_cast<int>(rank), _A, static_cast<int>(_n), tau.get());
			CHECK(lapackAnswer == 0, error, "Unable to reconstruct Q from the QR factorisation. Lapacke says: " << lapackAnswer);
			
			// Copy Q (_m x rank) into position
			if(_A != _Q) {
				if(_n == rank) {
					misc::copy(_Q, _A, _m*_n);
				} else {
					for(size_t row = 0; row < _m; ++row) {
						misc::copy(_Q+row*rank, _A+row*_n, rank);
					}
				}
			} else if(_n != rank) { // Note: extra treatment needed to avoid memcpy overlap
				for(size_t row = 1; row < _m; ++row) {
					misc::copy_inplace(_Q+row*rank, _A+row*_n, rank);
				}
			}
			
			XERUS_PA_END("Dense LAPACK", "QR Factorisation", misc::to_string(_m)+"x"+misc::to_string(_n));
		}
		
		
		void rq( double* const _R, double* const _Q, const double* const _A, const size_t _m, const size_t _n) {
			// Create tmp copy of A since Lapack wants to destroy it
			const std::unique_ptr<double[]> tmpA(new double[_m*_n]);
			misc::copy(tmpA.get(), _A, _m*_n);
			
			rq_destructive(_R, _Q, tmpA.get(), _m, _n);
		}
		
		
		void inplace_rq( double* const _R, double* const _AtoQ, const size_t _m, const size_t _n) {
			rq_destructive(_R, _AtoQ, _AtoQ, _m, _n);
		}
		
		
		void rq_destructive( double* const _R, double* const _Q, double* const _A, const size_t _m, const size_t _n) {
			REQUIRE(_m <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension too large for BLAS/Lapack");
			REQUIRE(_n <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension too large for BLAS/Lapack");
			
			REQUIRE(_n > 0, "Dimension n must be larger than zero");
			REQUIRE(_m > 0, "Dimension m must be larger than zero");
			
			REQUIRE(_Q && _R && _A, "QR decomposition must not be called with null pointers: R " << _R << " Q: " << _Q << " A: " << _A);
			REQUIRE(_A != _R, "_A and _R must be different, otherwise qr call will fail.");
			
			XERUS_PA_START;
			
			// Maximal rank is used by Lapacke
			const size_t rank = std::min(_m, _n); 
			
			// Tmp Array for Lapacke
			const std::unique_ptr<double[]> tau(new double[rank]);
			
			IF_CHECK( int lapackAnswer = ) LAPACKE_dgerqf(LAPACK_ROW_MAJOR, static_cast<int>(_m), static_cast<int>(_n), _A, static_cast<int>(_n), tau.get());
			CHECK(lapackAnswer == 0, error, "Unable to perform RQ factorisation. Lapacke says: " << lapackAnswer << ". Call was: LAPACKE_dgerqf(LAPACK_ROW_MAJOR, "<<static_cast<int>(_m)<<", "<<static_cast<int>(_n)<<", "<<_A<<", "<<static_cast<int>(_n)<<", "<<tau.get()<<");" );
			
			
			// Copy the upper triangular Matrix R (_m x rank) into position.
			size_t row = 0;
			for( ; row < _m - rank; ++row) {
				misc::copy(_R+row*rank, _A+row*_n+_n-rank, rank);
			}
			for(size_t skip = 0; row < _m; ++row, ++skip) {
				misc::set_zero(_R+row*rank, skip); // Set starting zeros
				misc::copy(_R+row*rank+skip, _A+row*_n+_n-rank+skip, rank-skip); // Copy upper triangular part from lapack result.
			}
			
			// Create orthogonal matrix Q (in _A). Lapacke expects to get the last rank rows of A...
			IF_CHECK( lapackAnswer = ) LAPACKE_dorgrq(LAPACK_ROW_MAJOR, static_cast<int>(rank), static_cast<int>(_n), static_cast<int>(rank), _A+(_m-rank)*_n, static_cast<int>(_n), tau.get()); 
			CHECK(lapackAnswer == 0, error, "Unable to reconstruct Q from the RQ factorisation. Lapacke says: " << lapackAnswer << ". Call was: LAPACKE_dorgrq(LAPACK_ROW_MAJOR, "<<static_cast<int>(rank)<<", "<<static_cast<int>(_n)<<", "<<static_cast<int>(rank)<<", "<<_A+(_m-rank)*_n<<", "<<static_cast<int>(_n)<<", "<<tau.get()<<");");

			
			//Copy Q (rank x _n) into position
			if(_A != _Q) {
				misc::copy(_Q, _A+(_m-rank)*_n, rank*_n);
			}
			
			XERUS_PA_END("Dense LAPACK", "RQ Factorisation", misc::to_string(_m)+"x"+misc::to_string(_n));
		}
		
		
		static bool is_symmetric(const double* const _A, const size_t _n) {
			double max = 0;
			for (size_t i=0; i<_n*_n; ++i) {
				max = std::max(max, _A[i]);
			}
			
			for (size_t i=0; i<_n; ++i) {
				for (size_t j=i+1; j<_n; ++j) {
					if (std::abs(_A[i*_n + j] - _A[i + j*_n]) >= 4 * max * std::numeric_limits<double>::epsilon()) {
// 						LOG(aslkdjj, std::abs(_A[i*_n + j] - _A[i + j*_n]) << " / " << _A[i*_n + j] << " " << max);
						return false;
					}
				}
			}
			return true;
		}
		
		/// @brief checks whether the diagonal of @a _A is all positive or all negative. returns false otherwise
		static bool pos_neg_definite_diagonal(const double* const _A, const size_t _n) {
			bool positive = (_A[0] > 0);
			const size_t steps=_n+1;
			if (positive) {
				for (size_t i=1; i<_n; ++i) {
					if (_A[i*steps] < std::numeric_limits<double>::epsilon()) {
						return false;
					}
				}
				return true;
			} else {
				for (size_t i=1; i<_n; ++i) {
					if (_A[i*steps] > -std::numeric_limits<double>::epsilon()) {
						return false;
					}
				}
				return true;
			}
			
		}
		
		/// Solves Ax = b for x
		/// order of checks and solvers inspired by MATLAB's mldivide https://de.mathworks.com/help/matlab/ref/mldivide.html
		void solve(double* const _x, const double* const _A, const size_t _m, const size_t _n, const double* const _b, const size_t _nrhs) {
			REQUIRE(_m <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension too large for BLAS/Lapack");
			REQUIRE(_n <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension too large for BLAS/Lapack");
			REQUIRE(_nrhs <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension too large for BLAS/Lapack");
			
			const std::unique_ptr<double[]> tmpA(new double[_m*_n]);
			misc::copy(tmpA.get(), _A, _m*_n);
			
			LOG(debug, "solving with...");
			
			// not square -> fall back to least-squares solver (QR or SVD decomposition)
			if (_m != _n) {
				LOG(debug, "SVD");
				const std::unique_ptr<double[]> tmpB(new double[_m*_nrhs]);
				misc::copy(tmpB.get(), _b, _m*_nrhs);
				solve_least_squares_destructive(_x, tmpA.get(), _m, _n, tmpB.get(), _nrhs);
				return;
			}
			
			// not symmetric -> LU solver
			if (!is_symmetric(_A, _n)) {
				LOG(debug, "LU");
				XERUS_PA_START;
				
				std::unique_ptr<int[]> pivot(new int[_n]);
				
				misc::copy(_x, _b, _n*_nrhs);
				
				IF_CHECK( int lapackAnswer = ) LAPACKE_dgesv(
					LAPACK_ROW_MAJOR,
					static_cast<int>(_n),		// Dimensions of A (nxn)
					static_cast<int>(_nrhs), 	// Number of rhs b
					tmpA.get(),					// input: A, output: L and U
					static_cast<int>(_n),		// LDA
					pivot.get(),				// output: permutation P
					_x,							// input: rhs b, output: solution x
					static_cast<int>(_nrhs)		// ldb
				);
				CHECK(lapackAnswer == 0, error, "Unable to solve Ax = b (PLU solver). Lapacke says: " << lapackAnswer);
				
				XERUS_PA_END("Dense LAPACK", "Solve (PLU)", misc::to_string(_n)+"x"+misc::to_string(_n)+"x"+misc::to_string(_nrhs));
				return;
			}
			
			// positive or negative diagonal -> try cholesky
			if (pos_neg_definite_diagonal(_A, _n)) {
				LOG(debug, "trying cholesky");
				int lapackAnswer = 0;
				{
					XERUS_PA_START;
					
					lapackAnswer = LAPACKE_dpotrf2(
						LAPACK_ROW_MAJOR,
						'U', 				// Upper triangle of A is read
						static_cast<int>(_n),		// dimensions of A
						tmpA.get(),			// input: A, output: cholesky factorisation
						static_cast<int>(_n)		// LDA
					);
					
					XERUS_PA_END("Dense LAPACK", "Cholesky decomposition", misc::to_string(_n)+"x"+misc::to_string(_n));
				}
				
				if (lapackAnswer == 0) {
					LOG(debug, "cholesky");
					XERUS_PA_START;
					
					misc::copy(_x, _b, _n*_nrhs);
					
					lapackAnswer = LAPACKE_dpotrs(
						LAPACK_ROW_MAJOR,
						'U',				// upper triangle of cholesky decomp is stored in tmpA
						static_cast<int>(_n),		// dimensions of A
						static_cast<int>(_nrhs),	// number of rhs
						tmpA.get(),			// input: cholesky decomp
						static_cast<int>(_n), 		// lda
						_x,				// input: rhs b, output: solution x
						static_cast<int>(_nrhs)		// ldb
					);
					CHECK(lapackAnswer == 0, error, "Unable to solve Ax = b (cholesky solver). Lapacke says: " << lapackAnswer);
					
					XERUS_PA_END("Dense LAPACK", "Solve (Cholesky)", misc::to_string(_n)+"x"+misc::to_string(_n)+"x"+misc::to_string(_nrhs));
					
					return;
				} else {
					// restore tmpA
					misc::copy(tmpA.get(), _A, _m*_n);
				}
			}
			
			LOG(debug, "LDL");
			// non-definite diagonal or Cholesky failed -> fallback to LDL^T decomposition
			XERUS_PA_START;
			
			misc::copy(_x, _b, _n*_nrhs);
			std::unique_ptr<int[]> pivot(new int[_n]);
			
			LAPACKE_dsysv(
				LAPACK_ROW_MAJOR,
				'U',					// upper triangular part of _A is accessed
				static_cast<int>(_n),	// dimension of A
				static_cast<int>(_nrhs),// number of rhs
				tmpA.get(), 			// input: A, output: part of the LDL decomposition
				static_cast<int>(_n),	// lda
				pivot.get(), 			// output: details of blockstructure of D
				_x,						// input: rhs b, output: solution x
				static_cast<int>(_nrhs)	// ldb
			);
			
			XERUS_PA_END("Dense LAPACK", "Solve (LDL)", misc::to_string(_n)+"x"+misc::to_string(_n)+"x"+misc::to_string(_nrhs));
		}
		
		/// Solves Ax = x*lambda for x and lambda
		void solve_ev(double* const _x, double* const _re, double* const _im, const double* const _A, const size_t _n) {
			REQUIRE(_n <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension to large for BLAS/Lapack");

			const std::unique_ptr<double[]> tmpA(new double[_n*_n]);
			misc::copy(tmpA.get(), _A, _n * _n);

			LOG(debug, "solving with...");

			//so far only non symmetric -> dgeev
			LOG(debug, "DGEEV");
			XERUS_PA_START;

			std::unique_ptr<double[]> leftev(new double[1]);

			IF_CHECK( int lapackAnswer = ) LAPACKE_dgeev(
				LAPACK_ROW_MAJOR,
				'N', 										// No left eigenvalues are computed
				'V', 										// Right eigenvalues are computed
				static_cast<int>(_n),		// Dimensions of A (nxn)
				tmpA.get(),							// input: A, output: L and U
				static_cast<int>(_n),		// LDA
				_re, 										// real part of the eigenvalues
				_im, 										// imaginary part of the eigenvalues
				leftev.get(),						// output: left eigenvectors, here dummy
				static_cast<int>(_n), 	// LDVL
				_x,											// right eigenvectors
				static_cast<int>(_n) 		// LDVR TODO check size of _x
			);
			CHECK(lapackAnswer == 0, error, "Unable to solve Ax = lambda*x (DGEEV solver). Lapacke says: " << lapackAnswer);
			XERUS_PA_END("Dense LAPACK", "Solve (DGEEV)", misc::to_string(_n)+"x"+misc::to_string(_n));

			return;
		}
	
		void solve_least_squares( double* const _x, const double* const _A, const size_t _m, const size_t _n, const double* const _b, const size_t _p){
			const std::unique_ptr<double[]> tmpA(new double[_m*_n]);
			misc::copy(tmpA.get(), _A, _m*_n);
			
			const std::unique_ptr<double[]> tmpB(new double[_m*_p]);
			misc::copy(tmpB.get(), _b, _m*_p);
			
			solve_least_squares_destructive(_x, tmpA.get(), _m, _n, tmpB.get(), _p);
		}
		
		
		void solve_least_squares_destructive( double* const _x, double* const _A, const size_t _m, const size_t _n, double* const _b, const size_t _p){
			REQUIRE(_m <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension too large for BLAS/Lapack");
			REQUIRE(_n <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension too large for BLAS/Lapack");
			REQUIRE(_p <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension too large for BLAS/Lapack");
			
			XERUS_PA_START;
			
			std::unique_ptr<int[]> pivot(new int[_n]);
			misc::set_zero(pivot.get(), _n);
			
			std::unique_ptr<double[]> signulars(new double[std::min(_n, _m)]);
			
			int rank;
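			// dgelsd overwrites its right hand side with the solution and needs room for max(_m,_n) rows:
			// for _m >= _n the destructible _b is large enough and the first _n*_p entries are copied to _x
			// afterwards; for _m < _n the solution buffer _x is used directly and padded with zeros.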
			
			double* bOrX;
			if(_m >= _n) {
				bOrX = _b;
			} else {
				bOrX = _x;
				misc::copy(bOrX, _b, _m*_p);
				misc::set_zero(bOrX+_m*_p, (_n-_m)*_p); // Lapacke is unhappy if the array contains NANs...
			}
			
// 			IF_CHECK( int lapackAnswer = ) LAPACKE_dgelsy(
// 				LAPACK_ROW_MAJOR, 
// 				static_cast<int>(_m),   // Left dimension of A
// 				static_cast<int>(_n),   // Right dimension of A
// 				static_cast<int>(_p),	// Number of rhss
// 				_A,         			// Matrix A
// 				static_cast<int>(_n),   // LDA
// 				bOrX,       			// On input b, on output x
// 				static_cast<int>(_p),          			// LDB
// 				pivot.get(),			// Pivot, entries must be zero to allow pivoting
// 				xerus::EPSILON,      	// Used to determine the accuracy of the Lapacke call. Basically all singular values smaller than RCOND*s[0] are ignored. (s[0] is the largest signular value)
// 				&rank);     			// Outputs the rank of A
			
			IF_CHECK( int lapackAnswer = ) LAPACKE_dgelsd(
				LAPACK_ROW_MAJOR, 
				static_cast<int>(_m),   // Left dimension of A
				static_cast<int>(_n),   // Right dimension of A
				static_cast<int>(_p),	// Number of rhss
				_A,         			// Matrix A
				static_cast<int>(_n),   // LDA
				bOrX,       			// On input b, on output x
				static_cast<int>(_p),	// LDB
				signulars.get(),		// Output: the singular values of A in decreasing order
				xerus::EPSILON,      	// Used to determine the accuracy of the Lapacke call. Basically all singular values smaller than RCOND*s[0] are ignored. (s[0] is the largest singular value)
				&rank);     			// Outputs the rank of A
			
			CHECK(lapackAnswer == 0, error, "Unable to solve min ||Ax - b||_2 for x. Lapacke says: " << lapackAnswer << " sizes are " << _m << " x " << _n << " * " << _p);
			
			if(_m >= _n) { // I.e. bOrX is _b
				misc::copy(_x, bOrX, _n*_p);
			}
			
			XERUS_PA_END("Dense LAPACK", "Solve Least Squares", misc::to_string(_m)+"x"+misc::to_string(_n)+" * "+misc::to_string(_p));
		}
		
	} // namespace blasWrapper

} // namespace xerus