blasLapackWrapper.cpp 29.5 KB
Newer Older
Baum's avatar
Baum committed
1
// Xerus - A General Purpose Tensor Library
Sebastian Wolf's avatar
Sebastian Wolf committed
2
// Copyright (C) 2014-2016 Benjamin Huber and Sebastian Wolf. 
Baum's avatar
Baum committed
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
// 
// Xerus is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published
// by the Free Software Foundation, either version 3 of the License,
// or (at your option) any later version.
// 
// Xerus is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
// 
// You should have received a copy of the GNU Affero General Public License
// along with Xerus. If not, see <http://www.gnu.org/licenses/>.
//
// For further information on Xerus visit https://libXerus.org 
// or contact us at contact@libXerus.org.

20
/**
21
22
23
* @file
* @brief Implementation of the blas and lapack wrapper functions.
*/
24

Baum's avatar
Baum committed
25

26
#include <complex.h>
27
// fix for non standard-conform complex implementation
28
29
#undef I

30
// workaround for broken Lapack
31
32
33
34
#define lapack_complex_float    float _Complex
#define lapack_complex_double   double _Complex
extern "C"
{
35
	#include <cblas.h> 
36
}
Sebastian Wolf's avatar
Sebastian Wolf committed
37

38
#ifdef __has_include
39
40
41
42
	#if __has_include(<lapacke.h>)
		#include <lapacke.h>
	#elif __has_include(<lapacke/lapacke.h>)
		#include <lapacke/lapacke.h>
43
44
	#else
		#pragma error no lapacke found
45
	#endif
46
#else
47
	#include <lapacke.h>
48
49
#endif

50
51
52

#include <memory>
#include <xerus/misc/standard.h>
53
#include <xerus/misc/performanceAnalysis.h>
54
#include <xerus/misc/check.h>
55

Sebastian Wolf's avatar
Sebastian Wolf committed
56
#include <xerus/misc/stringUtilities.h>
57
#include <xerus/basic.h>
58

Sebastian Wolf's avatar
Sebastian Wolf committed
59
60
61
#include <xerus/blasLapackWrapper.h>
#include <xerus/misc/basicArraySupport.h>
#include <xerus/misc/math.h>
62
#include <xerus/misc/internal.h>
Sebastian Wolf's avatar
Sebastian Wolf committed
63
64


65

66
namespace xerus {
67
	namespace blasWrapper {
68
69
		// the following routines use a work array as temporary storage
		// to avoid the overhead of repeated reallocation for many small calls, every thread pre-allocates a small scratch-space
Sebastian Wolf's avatar
Sebastian Wolf committed
70
71
// 		const size_t DEFAULT_WORKSPACE_SIZE = 1024*1024;
// 		thread_local value_t defaultWorkspace[DEFAULT_WORKSPACE_SIZE]; // NOTE recheck compatibility with eigen (dolfin) when reinserting this!
72
		
73
74
75
76
77
78
		
		//----------------------------------------------- LEVEL I BLAS ----------------------------------------------------------
		
		double two_norm(const double* const _x, const size_t _n) {
			REQUIRE(_n <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension to large for BLAS/Lapack");
			
79
			XERUS_PA_START;
80
81
82
			
			const double result = cblas_dnrm2(static_cast<int>(_n), _x, 1);
			
83
			XERUS_PA_END("Dense BLAS", "Two Norm", misc::to_string(_n));
84
85
86
87
88
89
90
			
			return result;
		}
		
		double dot_product(const double* const _x, const size_t _n, const double* const _y) {
			REQUIRE(_n <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension to large for BLAS/Lapack");
			
91
			XERUS_PA_START;
92
93
94
			
			const double result = cblas_ddot(static_cast<int>(_n), _x, 1, _y, 1);
			
95
			XERUS_PA_END("Dense BLAS", "Dot Product", misc::to_string(_n)+"*"+misc::to_string(_n));
96
97
98
99
100
101
102
103
104
105
106
107
108
109
			
			return result;
		}
		
		
		//----------------------------------------------- LEVEL II BLAS ---------------------------------------------------------
		
		void matrix_vector_product(double* const _x, const size_t _m, const double _alpha, const double* const _A, const size_t _n, const bool _transposed, const double* const _y) {
			// Delegate call if appropriate
			if(_m == 1) { *_x = _alpha*dot_product(_A, _n, _y); return;}
			
			REQUIRE(_m <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension to large for BLAS/Lapack");
			REQUIRE(_n <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension to large for BLAS/Lapack");
			
110
			XERUS_PA_START;
111
112
113
114
115
116
			if(!_transposed) {
				cblas_dgemv(CblasRowMajor, CblasNoTrans, static_cast<int>(_m), static_cast<int>(_n), _alpha, _A, static_cast<int>(_n), _y, 1, 0.0, _x, 1);
			} else {
				cblas_dgemv(CblasRowMajor, CblasTrans, static_cast<int>(_n), static_cast<int>(_m), _alpha, _A, static_cast<int>(_m) , _y, 1, 0.0, _x, 1);
			}
			
117
			XERUS_PA_END("Dense BLAS", "Matrix Vector Product", misc::to_string(_m)+"x"+misc::to_string(_n)+" * "+misc::to_string(_n));
118
119
120
121
122
123
		}
		
		void dyadic_vector_product(double* _A, const size_t _m, const size_t _n, const double _alpha, const double*const  _x, const double* const _y) {
			REQUIRE(_m <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension to large for BLAS/Lapack");
			REQUIRE(_n <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension to large for BLAS/Lapack");
			
124
			XERUS_PA_START;
125
126
127
128
129
130
			
			//Blas wants to add the product to A, but we don't.
			misc::set_zero(_A, _m*_n);
			
			cblas_dger(CblasRowMajor, static_cast<int>(_m), static_cast<int>(_n), _alpha, _x, 1, _y, 1, _A, static_cast<int>(_n));
			
131
			XERUS_PA_END("Dense BLAS", "Dyadic Vector Product", misc::to_string(_m)+" o "+misc::to_string(_n));
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
		}
		
		
		//----------------------------------------------- LEVEL III BLAS --------------------------------------------------------
		/// Performs the Matrix-Matrix product c = a * b
		void matrix_matrix_product( double* const _C,
									const size_t _leftDim,
									const size_t _rightDim,
									const double _alpha,
									const double* const _A,
									const size_t _lda,
									const bool _transposeA,
									const size_t _middleDim,
									const double* const _B,
									const size_t _ldb,
									const bool _transposeB) {
			//Delegate call if appropriate
			if(_leftDim == 1) {
				matrix_vector_product(_C, _rightDim, _alpha, _B, _middleDim, !_transposeB, _A);
			} else if(_rightDim == 1) {
				matrix_vector_product(_C, _leftDim, _alpha, _A, _middleDim, _transposeA, _B);
			} else if(_middleDim == 1) { 
				dyadic_vector_product(_C, _leftDim, _rightDim, _alpha, _A, _B);
			} else {
			
				REQUIRE(_leftDim <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension to large for BLAS/Lapack");
				REQUIRE(_middleDim <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension to large for BLAS/Lapack");
				REQUIRE(_rightDim <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension to large for BLAS/Lapack");
				REQUIRE(_lda <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension to large for BLAS/Lapack");
				REQUIRE(_ldb <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension to large for BLAS/Lapack");
				
163
				XERUS_PA_START;
164
165
				
				cblas_dgemm( CblasRowMajor,                             // Array storage format
Sebastian Wolf's avatar
Sebastian Wolf committed
166
167
						_transposeA ? CblasTrans : CblasNoTrans,        // LHS transposed?
						_transposeB ? CblasTrans : CblasNoTrans,        // RHS transposed?
168
169
170
						static_cast<int>(_leftDim),                     // Left dimension
						static_cast<int>(_rightDim),                    // Right dimension
						static_cast<int>(_middleDim),                   // Middle dimension
Sebastian Wolf's avatar
Sebastian Wolf committed
171
172
						_alpha,                                         // Factor to the Product
						_A,                                             // Pointer to LHS
173
						static_cast<int>(_lda),                         // LDA
Sebastian Wolf's avatar
Sebastian Wolf committed
174
						_B,                                             // Pointer to RHS
175
						static_cast<int>(_ldb),                         // LDB
Sebastian Wolf's avatar
Sebastian Wolf committed
176
177
						0.0,                                            // Factor of C (Zero if only the product is required)
						_C,                                             // Pointer to result
178
						static_cast<int>(_rightDim)                     // LDC
Sebastian Wolf's avatar
Sebastian Wolf committed
179
				);
180
				
181
				XERUS_PA_END("Dense BLAS", "Matrix-Matrix-Multiplication", misc::to_string(_leftDim)+"x"+misc::to_string(_middleDim)+" * "+misc::to_string(_middleDim)+"x"+misc::to_string(_rightDim));
182
183
184
185
186
187
188
189
190
191
			}
		}
		
		
		
		//----------------------------------------------- LAPACK ----------------------------------------------------------------
		
		void svd( double* const _U, double* const _S, double* const _Vt, const double* const _A, const size_t _m, const size_t _n) {
			//Create copy of A
			const std::unique_ptr<double[]> tmpA(new double[_m*_n]);
192
			misc::copy(tmpA.get(), _A, _m*_n);
193
			
194
195
196
197
198
199
200
201
			svd_destructive(_U, _S, _Vt, tmpA.get(), _m, _n);
		}
		
		
		void svd_destructive( double* const _U, double* const _S, double* const _Vt, double* const _A, const size_t _m, const size_t _n) {
			REQUIRE(_m <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension to large for BLAS/Lapack");
			REQUIRE(_n <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension to large for BLAS/Lapack");
			
202
			XERUS_PA_START;
203
204
205
206
			std::unique_ptr<double[]> tmpA(new double[_m*_n]);
			misc::copy(tmpA.get(), _A, _m*_n);
			
			int lapackAnswer = LAPACKE_dgesdd(LAPACK_ROW_MAJOR, 'S', static_cast<int>(_m), static_cast<int>(_n), _A, static_cast<int>(_n), _S, _U, static_cast<int>(std::min(_m, _n)), _Vt, static_cast<int>(_n));
207
208
			CHECK(lapackAnswer == 0, warning, "Lapack failed to compute SVD. Answer is: " << lapackAnswer);
			CHECK(lapackAnswer == 0, warning, "Call was: LAPACKE_dgesdd(LAPACK_ROW_MAJOR, 'S', " << static_cast<int>(_m) << ", " << static_cast<int>(_n) << ", " << _A << ", " << static_cast<int>(_n) <<", " 
209
			<< _S <<", " << _U << ", " << static_cast<int>(std::min(_m, _n)) << ", " << _Vt << ", " << static_cast<int>(_n) << ");");
210
			if(lapackAnswer != 0) {
211
212
213
214
215
216
				LOG(warning, "SVD failed ");
// 				for(size_t i=0; i < _m; ++i) {
// 					for(size_t j=0; j < _n; ++j) {
// 						LOG(warning, tmpA[i*_n+j]);
// 					}
// 				}
217
			}
218
			
219
			XERUS_PA_END("Dense LAPACK", "Singular Value Decomposition", misc::to_string(_m)+"x"+misc::to_string(_n));
220
221
222
223
224
225
226
227
228
229
230
		}
		
		
		std::tuple<std::unique_ptr<double[]>, std::unique_ptr<double[]>, size_t> qc(const double* const _A, const size_t _m, const size_t _n) {
			const std::unique_ptr<double[]> tmpA(new double[_m*_n]);
			misc::copy(tmpA.get(), _A, _m*_n);
			
			return qc_destructive(tmpA.get(), _m, _n);
		}
		
		
231
		std::tuple<std::unique_ptr<double[]>, std::unique_ptr<double[]>, size_t> qc_destructive(double* const _A, const size_t _m, const size_t _n) {
232
233
			REQUIRE(_m <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension to large for BLAS/Lapack");
			REQUIRE(_n <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension to large for BLAS/Lapack");
234
235
236
237
			
			REQUIRE(_n > 0, "Dimension n must be larger than zero");
			REQUIRE(_m > 0, "Dimension m must be larger than zero");
			
238
			XERUS_PA_START;
239
240
			
			// Maximal rank is used by Lapacke
Sebastian Wolf's avatar
Sebastian Wolf committed
241
			const size_t maxRank = std::min(_m, _n);
242
243
244
245
			
			// Tmp Array for Lapacke
			const std::unique_ptr<double[]> tau(new double[maxRank]);
			
246
247
			const std::unique_ptr<int[]> permutation(new int[_n]());
			misc::set_zero(permutation.get(), _n); // Lapack requires the entries to be zero.
248
249
			
			// Calculate QR factorisations with column pivoting
250
			IF_CHECK(int lapackAnswer = ) LAPACKE_dgeqp3(LAPACK_ROW_MAJOR, static_cast<int>(_m), static_cast<int>(_n), _A, static_cast<int>(_n), permutation.get(), tau.get());
251
			REQUIRE(lapackAnswer == 0, "Unable to perform QC factorisaton (dgeqp3). Lapacke says: " << lapackAnswer );
252
			
253
254
255
256
257
258
			
			// Determine the actual rank
			size_t rank;
			for (rank = 1; rank <= maxRank; ++rank) {
				if (rank == maxRank || std::abs(_A[rank+rank*_n]) < 16*std::numeric_limits<double>::epsilon()*_A[0]) {
					break;
259
				}
260
261
			}
			
262
263
264
265
			
			// Create the matrix C
			std::unique_ptr<double[]> C(new double[rank*_n]);
			misc::set_zero(C.get(), rank*_n); 
266
			
Sebastian Wolf's avatar
Sebastian Wolf committed
267
			// Copy the upper triangular Matrix C (rank x _n) into position
268
			for (size_t col = 0; col < _n; ++col) {
269
270
271
				const size_t targetCol = static_cast<size_t>(permutation[col]-1); // For Lapack numbers start at 1 (instead of 0).
				for(size_t row = 0; row < rank && row < col+1; ++row) {
					C[row*_n + targetCol] = _A[row*_n + col];
272
273
274
				}
			}
			
275
			
276
			// Create orthogonal matrix Q
277
278
			IF_CHECK(lapackAnswer = ) LAPACKE_dorgqr(LAPACK_ROW_MAJOR, static_cast<int>(_m), static_cast<int>(maxRank), static_cast<int>(maxRank), _A, static_cast<int>(_n), tau.get());
			CHECK(lapackAnswer == 0, error, "Unable to reconstruct Q from the QC factorisation. Lapacke says: " << lapackAnswer);
279
			
280
281
			// Copy the newly created Q into position
			std::unique_ptr<double[]> Q(new double[_m*rank]);
282
			if(rank == _n) {
283
				misc::copy(Q.get(), _A, _m*rank);
Benjamin Huber's avatar
Benjamin Huber committed
284
285
			} else {
				for(size_t row = 0; row < _m; ++row) {
286
					misc::copy(Q.get()+row*rank, _A+row*_n, rank);
Benjamin Huber's avatar
Benjamin Huber committed
287
288
				}
			}
289
			
290
			XERUS_PA_END("Dense LAPACK", "QRP Factorisation", misc::to_string(_m)+"x"+misc::to_string(rank)+" * "+misc::to_string(rank)+"x"+misc::to_string(_n));
291
292
			
			return std::make_tuple(std::move(Q), std::move(C), rank);
293
		}
294
295
		
		
296
		std::tuple<std::unique_ptr<double[]>, std::unique_ptr<double[]>, size_t> cq(const double* const _A, const size_t _m, const size_t _n) {
297
			const std::unique_ptr<double[]> tmpA(new double[_m*_n]);
298
			misc::copy(tmpA.get(), _A, _m*_n);
299
			
300
			return cq_destructive(tmpA.get(), _m, _n);
301
302
		}
		
303
		
304
305
306
307
308
309
310
311
		// We use that in col-major we get At = Qt * Ct => A = C * Q, i.e. doing the calculation in col-major and switching Q and C give the desired result.
		std::tuple<std::unique_ptr<double[]>, std::unique_ptr<double[]>, size_t> cq_destructive(double* const _A, const size_t _m, const size_t _n) {
			REQUIRE(_n <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension to large for BLAS/Lapack");
			REQUIRE(_m <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension to large for BLAS/Lapack");
			
			REQUIRE(_m > 0, "Dimension n must be larger than zero");
			REQUIRE(_n > 0, "Dimension m must be larger than zero");
			
312
			XERUS_PA_START;
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
			
			// Maximal rank is used by Lapacke
			const size_t maxRank = std::min(_n, _m);
			
			// Tmp Array for Lapacke
			const std::unique_ptr<double[]> tau(new double[maxRank]);
			
			const std::unique_ptr<int[]> permutation(new int[_m]());
			misc::set_zero(permutation.get(), _m); // Lapack requires the entries to be zero.
			
			// Calculate QR factorisations with column pivoting
			IF_CHECK(int lapackAnswer = ) LAPACKE_dgeqp3(LAPACK_COL_MAJOR, static_cast<int>(_n), static_cast<int>(_m), _A, static_cast<int>(_n), permutation.get(), tau.get());
			REQUIRE(lapackAnswer == 0, "Unable to perform QC factorisaton (dgeqp3). Lapacke says: " << lapackAnswer );
			
			
			// Determine the actual rank
			size_t rank;
			for (rank = 1; rank <= maxRank; ++rank) {
				if (rank == maxRank || std::abs(_A[rank+rank*_n]) < 16*std::numeric_limits<double>::epsilon()*_A[0]) {
					break;
				}
			}
			
			
			// Create the matrix C
			std::unique_ptr<double[]> C(new double[rank*_m]);
			misc::set_zero(C.get(), rank*_m); 
			
			// Copy the upper triangular Matrix C (rank x _m) into position
			for (size_t col = 0; col < _m; ++col) {
				const size_t targetCol = static_cast<size_t>(permutation[col]-1); // For Lapack numbers start at 1 (instead of 0).
344
				misc::copy(C.get()+targetCol*rank, _A+col*_n, std::min(rank, col+1));
345
346
347
348
349
350
351
352
353
354
355
			}
			
			
			// Create orthogonal matrix Q
			IF_CHECK(lapackAnswer = ) LAPACKE_dorgqr(LAPACK_COL_MAJOR, static_cast<int>(_n), static_cast<int>(maxRank), static_cast<int>(maxRank), _A, static_cast<int>(_n), tau.get());
			CHECK(lapackAnswer == 0, error, "Unable to reconstruct Q from the QC factorisation. Lapacke says: " << lapackAnswer);
			
			// Copy the newly created Q into position
			std::unique_ptr<double[]> Q(new double[_n*rank]);
			misc::copy(Q.get(), _A, _n*rank);
			
356
			XERUS_PA_END("Dense LAPACK", "QRP Factorisation", misc::to_string(_n)+"x"+misc::to_string(rank)+" * "+misc::to_string(rank)+"x"+misc::to_string(_m));
357
358
359
360
			
			return std::make_tuple(std::move(C), std::move(Q), rank);
		}
		
361
362
363
		
		void qr(double* const _Q, double* const _R, const double* const _A, const size_t _m, const size_t _n) {
			// Create tmp copy of A since Lapack wants to destroy it
364
			const std::unique_ptr<double[]> tmpA(new double[_m*_n]);
365
			misc::copy(tmpA.get(), _A, _m*_n);
366
			
367
			qr_destructive(_Q, _R, tmpA.get(), _m, _n);
368
369
370
		}
		
		
371
372
373
		void inplace_qr(double* const _AtoQ, double* const _R, const size_t _m, const size_t _n) {
			qr_destructive(_AtoQ, _R, _AtoQ, _m, _n);
		}
374
375
		
		
376
377
378
379
380
381
382
383
384
385
		void qr_destructive( double* const _Q, double* const _R, double* const _A, const size_t _m, const size_t _n) {
			REQUIRE(_m <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension to large for BLAS/Lapack");
			REQUIRE(_n <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension to large for BLAS/Lapack");
			
			REQUIRE(_n > 0, "Dimension n must be larger than zero");
			REQUIRE(_m > 0, "Dimension m must be larger than zero");
			
			REQUIRE(_Q && _R && _A, "QR decomposition must not be called with null pointers: Q:" << _Q << " R: " << _R << " A: " << _A);
			REQUIRE(_A != _R, "_A and _R must be different, otherwise qr call will fail.");
			
386
			XERUS_PA_START;
387
388
389
390
391
392
393
394
			
			// Maximal rank is used by Lapacke
			const size_t rank = std::min(_m, _n); 
			
			// Tmp Array for Lapacke
			const std::unique_ptr<double[]> tau(new double[rank]);
			
			// Calculate QR factorisations
395
//             LOG(Lapacke, "Call to dorgqr with parameters: " << LAPACK_ROW_MAJOR << ", " << static_cast<int>(_m)  << ", " << static_cast<int>(_n)  << ", " << _A << ", " << static_cast<int>(_n)  << ", " << tau.get());
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
			IF_CHECK( int lapackAnswer = ) LAPACKE_dgeqrf(LAPACK_ROW_MAJOR, static_cast<int>(_m), static_cast<int>(_n), _A, static_cast<int>(_n), tau.get());
			CHECK(lapackAnswer == 0, error, "Unable to perform QR factorisaton. Lapacke says: " << lapackAnswer );
			
			// Copy the upper triangular Matrix R (rank x _n) into position
			for(size_t row =0; row < rank; ++row) {
				misc::set_zero(_R+row*_n, row); // Set starting zeros
				misc::copy(_R+row*_n+row, _A+row*_n+row, _n-row); // Copy upper triangular part from lapack result.
			}
			
			// Create orthogonal matrix Q (in tmpA)
	//         LOG(Lapacke, "Call to dorgqr with parameters: " << LAPACK_ROW_MAJOR << ", " << static_cast<int>(_m)  << ", " << static_cast<int>(rank)  << ", " << static_cast<int>(rank) << ", " << _A << ", " << static_cast<int>(_n)  << ", " << tau.get());
			IF_CHECK( lapackAnswer = ) LAPACKE_dorgqr(LAPACK_ROW_MAJOR, static_cast<int>(_m), static_cast<int>(rank), static_cast<int>(rank), _A, static_cast<int>(_n), tau.get());
			CHECK(lapackAnswer == 0, error, "Unable to reconstruct Q from the QR factorisation. Lapacke says: " << lapackAnswer);
			
			// Copy Q (_m x rank) into position
			if(_A != _Q) {
412
				if(_n == rank) {
413
					misc::copy(_Q, _A, _m*_n);
414
				} else {
415
					for(size_t row = 0; row < _m; ++row) {
416
						misc::copy(_Q+row*rank, _A+row*_n, rank);
417
418
					}
				}
419
			} else if(_n != rank) { // Note extra treatmeant to avoid memcpy overlap
420
				for(size_t row = 1; row < _m; ++row) {
421
					misc::copy_inplace(_Q+row*rank, _A+row*_n, rank);
422
423
				}
			}
424
			
425
			XERUS_PA_END("Dense LAPACK", "QR Factorisation", misc::to_string(_m)+"x"+misc::to_string(_n));
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
		}
		
		
		void rq( double* const _R, double* const _Q, const double* const _A, const size_t _m, const size_t _n) {
			// Create tmp copy of A since Lapack wants to destroy it
			const std::unique_ptr<double[]> tmpA(new double[_m*_n]);
			misc::copy(tmpA.get(), _A, _m*_n);
			
			rq_destructive(_R, _Q, tmpA.get(), _m, _n);
		}
		
		
		void inplace_rq( double* const _R, double* const _AtoQ, const size_t _m, const size_t _n) {
			rq_destructive(_R, _AtoQ, _AtoQ, _m, _n);
		}
		
		
		void rq_destructive( double* const _R, double* const _Q, double* const _A, const size_t _m, const size_t _n) {
			REQUIRE(_m <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension to large for BLAS/Lapack");
			REQUIRE(_n <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension to large for BLAS/Lapack");
			
			REQUIRE(_n > 0, "Dimension n must be larger than zero");
			REQUIRE(_m > 0, "Dimension m must be larger than zero");
			
			REQUIRE(_Q && _R && _A, "QR decomposition must not be called with null pointers: R " << _R << " Q: " << _Q << " A: " << _A);
			REQUIRE(_A != _R, "_A and _R must be different, otherwise qr call will fail.");
			
453
			XERUS_PA_START;
454
455
456
457
458
459
460
461
			
			// Maximal rank is used by Lapacke
			const size_t rank = std::min(_m, _n); 
			
			// Tmp Array for Lapacke
			const std::unique_ptr<double[]> tau(new double[rank]);
			
			IF_CHECK( int lapackAnswer = ) LAPACKE_dgerqf(LAPACK_ROW_MAJOR, static_cast<int>(_m), static_cast<int>(_n), _A, static_cast<int>(_n), tau.get());
Sebastian Wolf's avatar
Sebastian Wolf committed
462
			CHECK(lapackAnswer == 0, error, "Unable to perform QR factorisaton. Lapacke says: " << lapackAnswer << ". Call was: LAPACKE_dgerqf(LAPACK_ROW_MAJOR, "<<static_cast<int>(_m)<<", "<<static_cast<int>(_n)<<", "<<_A<<", "<<static_cast<int>(_n)<<", "<<tau.get()<<");" );
463
464
465
466
467
468
469
470
471
472
473
474
475
476
			
			
			// Copy the upper triangular Matrix R (_m x rank) into position.
			size_t row = 0;
			for( ; row < _m - rank; ++row) {
				misc::copy(_R+row*rank, _A+row*_n+_n-rank, rank);
			}
			for(size_t skip = 0; row < _m; ++row, ++skip) {
				misc::set_zero(_R+row*rank, skip); // Set starting zeros
				misc::copy(_R+row*rank+skip, _A+row*_n+_n-rank+skip, rank-skip); // Copy upper triangular part from lapack result.
			}
			
			// Create orthogonal matrix Q (in _A). Lapacke expects to get the last rank rows of A...
			IF_CHECK( lapackAnswer = ) LAPACKE_dorgrq(LAPACK_ROW_MAJOR, static_cast<int>(rank), static_cast<int>(_n), static_cast<int>(rank), _A+(_m-rank)*_n, static_cast<int>(_n), tau.get()); 
Sebastian Wolf's avatar
Sebastian Wolf committed
477
			CHECK(lapackAnswer == 0, error, "Unable to reconstruct Q from the RQ factorisation. Lapacke says: " << lapackAnswer << ". Call was: LAPACKE_dorgrq(LAPACK_ROW_MAJOR, "<<static_cast<int>(rank)<<", "<<static_cast<int>(_n)<<", "<<static_cast<int>(rank)<<", "<<_A+(_m-rank)*_n<<", "<<static_cast<int>(_n)<<", "<<tau.get()<<");");
478

479
480
481
482
483
484
			
			//Copy Q (rank x _n) into position
			if(_A != _Q) {
				misc::copy(_Q, _A+(_m-rank)*_n, rank*_n);
			}
			
485
			XERUS_PA_END("Dense LAPACK", "RQ Factorisation", misc::to_string(_m)+"x"+misc::to_string(_n));
486
487
488
		}
		
		
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
		static bool is_symmetric(const double* const _A, const size_t _n) {
			double max = 0;
			for (size_t i=0; i<_n*_n; ++i) {
				max = std::max(max, _A[i]);
			}
			
			for (size_t i=0; i<_n; ++i) {
				for (size_t j=i+1; j<_n; ++j) {
					if (std::abs(_A[i*_n + j] - _A[i + j*_n]) >= 4 * max * std::numeric_limits<double>::epsilon()) {
// 						LOG(aslkdjj, std::abs(_A[i*_n + j] - _A[i + j*_n]) << " / " << _A[i*_n + j] << " " << max);
						return false;
					}
				}
			}
			return true;
		}
		
		/// @brief checks whether the diagonal of @a _A is all positive or all negative. returns false otherwise
		static bool pos_neg_definite_diagonal(const double* const _A, const size_t _n) {
			bool positive = (_A[0] > 0);
			const size_t steps=_n+1;
			if (positive) {
				for (size_t i=1; i<_n; ++i) {
					if (_A[i*steps] < std::numeric_limits<double>::epsilon()) {
						return false;
					}
				}
				return true;
			} else {
				for (size_t i=1; i<_n; ++i) {
					if (_A[i*steps] > -std::numeric_limits<double>::epsilon()) {
						return false;
					}
				}
				return true;
			}
525
526
527
528
			
		}
		
		/// Solves Ax = b for x
529
530
		void solve(double* const _x, const double* const _A, const size_t _m, const size_t _n, const double* const _b, const size_t _nrhs) {
			REQUIRE(_m <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension to large for BLAS/Lapack");
531
			REQUIRE(_n <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension to large for BLAS/Lapack");
532
533
534
535
536
537
			REQUIRE(_nrhs <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension to large for BLAS/Lapack");
			
			const std::unique_ptr<double[]> tmpA(new double[_m*_n]);
			misc::copy(tmpA.get(), _A, _m*_n);
			
			LOG(debug, "solving with...");
538
			
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
			// not rectangular -> fallback to least-squares (QR or SVD decomposition to solve)
			if (_m != _n) {
				LOG(debug, "SVD");
				const std::unique_ptr<double[]> tmpB(new double[_m*_nrhs]);
				misc::copy(tmpB.get(), _b, _m*_nrhs);
				solve_least_squares_destructive(_x, tmpA.get(), _m, _n, tmpB.get(), _nrhs);
				return;
			}
			
			// not symmetric -> LU solver
			if (!is_symmetric(_A, _n)) {
				LOG(debug, "LU");
				XERUS_PA_START;
				
				std::unique_ptr<int[]> pivot(new int[_n]);
				
				misc::copy(_x, _b, _n);
				
				IF_CHECK( int lapackAnswer = ) LAPACKE_dgesv(
					LAPACK_ROW_MAJOR,
					static_cast<int>(_n),		// Dimensions of A (nxn)
					static_cast<int>(_nrhs), 	// Number of rhs b
					tmpA.get(),					// input: A, output: L and U
					static_cast<int>(_n),		// LDA
					pivot.get(),				// output: permutation P
					_x,							// input: rhs b, output: solution x
					static_cast<int>(_nrhs)		// ldb
				);
				CHECK(lapackAnswer == 0, error, "Unable to solve Ax = b (PLU solver). Lapacke says: " << lapackAnswer);
				
				XERUS_PA_END("Dense LAPACK", "Solve (PLU)", misc::to_string(_n)+"x"+misc::to_string(_n)+"x"+misc::to_string(_nrhs));
				return;
			}
572
			
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
			// positive or negative diagonal -> try cholesky
			if (pos_neg_definite_diagonal(_A, _n)) {
				LOG(debug, "trying cholesky");
				int lapackAnswer = 0;
				{
					XERUS_PA_START;
					
					lapackAnswer = LAPACKE_dpotrf2(
						LAPACK_ROW_MAJOR,
						'U', 					// Upper triangle of A is read
						static_cast<int>(_n),	// dimensions of A
						tmpA.get(),				// input: A, output: cholesky factorisation
						static_cast<int>(_n)	// LDA
					);
					
					XERUS_PA_END("Dense LAPACK", "Cholesky decomposition", misc::to_string(_n)+"x"+misc::to_string(_n));
				}
				
				if (lapackAnswer == 0) {
					LOG(debug, "cholesky");
					XERUS_PA_START;
					
					misc::copy(_x, _b, _n);
					
					lapackAnswer = LAPACKE_dpotrs(
						LAPACK_ROW_MAJOR,
						'U',					// upper triangle of cholesky decomp is stored in tmpA
						static_cast<int>(_n),	// dimensions of A
						static_cast<int>(_nrhs),// number of rhs
						tmpA.get(),				// input: cholesky decomp
						static_cast<int>(_n), 	// lda
						_x,						// input: rhs b, output: solution x
						static_cast<int>(_nrhs)	// ldb
					);
					CHECK(lapackAnswer == 0, error, "Unable to solve Ax = b (cholesky solver). Lapacke says: " << lapackAnswer);
					
					XERUS_PA_END("Dense LAPACK", "Solve (Cholesky)", misc::to_string(_n)+"x"+misc::to_string(_n)+"x"+misc::to_string(_nrhs));
					
					return;
				} else {
					// restore tmpA
					misc::copy(tmpA.get(), _A, _m*_n);
				}
			}
			
			LOG(debug, "LDL");
			// non-definite diagonal or choleksy failed -> fallback to LDL^T decomposition
			XERUS_PA_START;
			
			misc::copy(_x, _b, _n);
623
624
			std::unique_ptr<int[]> pivot(new int[_n]);
			
625
			LAPACKE_dsysv(
626
				LAPACK_ROW_MAJOR,
627
628
629
630
631
632
633
634
635
636
637
638
				'U',					// upper triangular part of _A is accessed
				static_cast<int>(_n),	// dimension of A
				static_cast<int>(_nrhs),// number of rhs
				tmpA.get(), 			// input: A, output: part of the LDL decomposition
				static_cast<int>(_n),	// lda
				pivot.get(), 			// output: details of blockstructure of D
				_x,						// input: rhs b, output: solution x
				static_cast<int>(_nrhs)	// ldb
			);
			
			XERUS_PA_END("Dense LAPACK", "Solve (LDL)", misc::to_string(_n)+"x"+misc::to_string(_n)+"x"+misc::to_string(_nrhs));
		}
639
640
		
	
641
		void solve_least_squares( double* const _x, const double* const _A, const size_t _m, const size_t _n, const double* const _b, const size_t _p){
642
643
644
			const std::unique_ptr<double[]> tmpA(new double[_m*_n]);
			misc::copy(tmpA.get(), _A, _m*_n);
			
645
646
			const std::unique_ptr<double[]> tmpB(new double[_m*_p]);
			misc::copy(tmpB.get(), _b, _m*_p);
647
			
648
			solve_least_squares_destructive(_x, tmpA.get(), _m, _n, tmpB.get(), _p);
649
650
651
		}
		
		
652
		void solve_least_squares_destructive( double* const _x, double* const _A, const size_t _m, const size_t _n, double* const _b, const size_t _p){
653
654
			REQUIRE(_m <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension to large for BLAS/Lapack");
			REQUIRE(_n <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension to large for BLAS/Lapack");
655
			REQUIRE(_p <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension to large for BLAS/Lapack");
656
			
657
			XERUS_PA_START;
658
659
660
			
			std::unique_ptr<int[]> pivot(new int[_n]);
			misc::set_zero(pivot.get(), _n);
661
			
Sebastian Wolf's avatar
Sebastian Wolf committed
662
			std::unique_ptr<double[]> signulars(new double[std::min(_n, _m)]);
663
			
664
665
666
667
668
			int rank;
			
			double* bOrX;
			if(_m >= _n) {
				bOrX = _b;
669
670
671
672
			} else {
				bOrX = _x;
				misc::copy(bOrX, _b, _m*_p);
				misc::set_zero(bOrX+_m*_p, (_n-_m)*_p); // Lapacke is unhappy if the array contains NANs...
673
674
			}
			
Sebastian Wolf's avatar
Sebastian Wolf committed
675
// 			IF_CHECK( int lapackAnswer = ) LAPACKE_dgelsy(
676
677
678
679
680
681
682
// 				LAPACK_ROW_MAJOR, 
// 				static_cast<int>(_m),   // Left dimension of A
// 				static_cast<int>(_n),   // Right dimension of A
// 				static_cast<int>(_p),	// Number of rhss
// 				_A,         			// Matrix A
// 				static_cast<int>(_n),   // LDA
// 				bOrX,       			// On input b, on output x
Sebastian Wolf's avatar
Sebastian Wolf committed
683
684
// 				static_cast<int>(_p),          			// LDB
// 				pivot.get(),			// Pivot, entries must be zero to allow pivoting
685
686
687
// 				xerus::EPSILON,      	// Used to determine the accuracy of the Lapacke call. Basically all singular values smaller than RCOND*s[0] are ignored. (s[0] is the largest signular value)
// 				&rank);     			// Outputs the rank of A
			
Sebastian Wolf's avatar
Sebastian Wolf committed
688
689
690
691
692
693
694
695
696
697
698
699
700
			IF_CHECK( int lapackAnswer = ) LAPACKE_dgelsd(
				LAPACK_ROW_MAJOR, 
				static_cast<int>(_m),   // Left dimension of A
				static_cast<int>(_n),   // Right dimension of A
				static_cast<int>(_p),	// Number of rhss
				_A,         			// Matrix A
				static_cast<int>(_n),   // LDA
				bOrX,       			// On input b, on output x
				static_cast<int>(_p),	// LDB
				signulars.get(),		// Pivot, entries must be zero to allow pivoting
				xerus::EPSILON,      	// Used to determine the accuracy of the Lapacke call. Basically all singular values smaller than RCOND*s[0] are ignored. (s[0] is the largest signular value)
				&rank);     			// Outputs the rank of A
			
701
702
703
704
			CHECK(lapackAnswer == 0, error, "Unable to solves min ||Ax - b||_2 for x. Lapacke says: " << lapackAnswer << " sizes are " << _m << " x " << _n << " * " << _p);
			
			if(_m >= _n) { // I.e. bOrX is _b
				misc::copy(_x, bOrX, _n*_p);
705
706
			}
			
707
			XERUS_PA_END("Dense LAPACK", "Solve Least Squares", misc::to_string(_m)+"x"+misc::to_string(_n)+" * "+misc::to_string(_p));
708
709
		}
		
710
	} // namespace blasWrapper
Baum's avatar
Baum committed
711

712
} // namespace xerus
713