PEXSI
pselinv_impl.hpp
1 /*
2  Copyright (c) 2012 The Regents of the University of California,
3  through Lawrence Berkeley National Laboratory.
4 
5 Authors: Lin Lin and Mathias Jacquelin
6 
7 This file is part of PEXSI. All rights reserved.
8 
9 Redistribution and use in source and binary forms, with or without
10 modification, are permitted provided that the following conditions are met:
11 
12 (1) Redistributions of source code must retain the above copyright notice, this
13 list of conditions and the following disclaimer.
14 (2) Redistributions in binary form must reproduce the above copyright notice,
15 this list of conditions and the following disclaimer in the documentation
16 and/or other materials provided with the distribution.
17 (3) Neither the name of the University of California, Lawrence Berkeley
18 National Laboratory, U.S. Dept. of Energy nor the names of its contributors may
19 be used to endorse or promote products derived from this software without
20 specific prior written permission.
21 
22 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
23 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
24 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
25 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
26 ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
27 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
29 ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 
33 You are under no obligation whatsoever to provide any bug fixes, patches, or
34 upgrades to the features, functionality or performance of the source code
35 ("Enhancements") to anyone; however, if you choose to make your Enhancements
36 available either publicly, or directly to Lawrence Berkeley National
37 Laboratory, without imposing a separate written license agreement for such
38 Enhancements, then you hereby grant the following license: a non-exclusive,
39 royalty-free perpetual license to install, use, modify, prepare derivative
40 works, incorporate into other computer software, distribute, and sublicense
41 such enhancements or derivative works thereof, in binary and source code form.
42  */
46 #ifndef _PEXSI_PSELINV_IMPL_HPP_
47 #define _PEXSI_PSELINV_IMPL_HPP_
48 
49 #include <list>
50 
51 #include "pexsi/timer.h"
53 
54 
55 #define MPI_MAX_COMM (1024)
56 #define BCAST_THRESHOLD 16
57 
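// MOD(a,b) returns a nonnegative remainder even for negative a,
// e.g. MOD(-1,4) == 3, whereas the built-in % may yield a negative result.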
58 #define MOD(a,b) \
59  ( ((a)%(b)+(b))%(b))
60 
61 
62 
63  namespace PEXSI{
64  inline GridType::GridType ( MPI_Comm Bcomm, int nprow, int npcol )
65  {
66 #ifndef _RELEASE_
67  PushCallStack("GridType::GridType");
68 #endif
69  Int info;
70  MPI_Initialized( &info );
71  if( !info ){
72 #ifdef USE_ABORT
73  abort();
74 #endif
75  throw std::logic_error( "MPI has not been initialized." );
76  }
77  MPI_Group comm_group;
78  MPI_Comm_group( Bcomm, &comm_group );
79  MPI_Comm_create( Bcomm, comm_group, &comm );
80  // comm = Bcomm;
81 
82  MPI_Comm_rank( comm, &mpirank );
83  MPI_Comm_size( comm, &mpisize );
84  if( mpisize != nprow * npcol ){
85 #ifdef USE_ABORT
86  abort();
87 #endif
88  throw std::logic_error( "mpisize != nprow * npcol." );
89  }
90 
91  numProcRow = nprow;
92  numProcCol = npcol;
93 
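 // Map the 1D MPI rank to 2D process grid coordinates using a row-major
 // ordering: consecutive ranks fill one process row before the next.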
94  Int myrow = mpirank / npcol;
95  Int mycol = mpirank % npcol;
96 
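 // rowComm groups the processes sharing the same process row (same myrow);
 // colComm groups the processes sharing the same process column (same mycol).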
97  MPI_Comm_split( comm, myrow, mycol, &rowComm );
98  MPI_Comm_split( comm, mycol, myrow, &colComm );
99 
100  MPI_Group_free( &comm_group );
101 
102 #ifndef _RELEASE_
103  PopCallStack();
104 #endif
105 
106  return ;
107  } // ----- end of method GridType::GridType -----
108 
109 
110  inline GridType::~GridType ( )
111  {
112 #ifndef _RELEASE_
113  PushCallStack("GridType::~GridType");
114 #endif
115  // Do not free the original communicator Bcomm, which is not created by GridType().
116 
117  MPI_Comm_free( &rowComm );
118  MPI_Comm_free( &colComm );
119  MPI_Comm_free( &comm );
120 
121 #ifndef _RELEASE_
122  PopCallStack();
123 #endif
124  return ;
125  } // ----- end of method GridType::~GridType -----
126  }
127 
128 
129 namespace PEXSI{
130  template<typename T>
131  void PMatrix<T>::deallocate(){
132 
133  grid_ = NULL;
134  super_ = NULL;
135  options_ = NULL;
136  optionsLU_ = NULL;
137 
138  ColBlockIdx_.clear();
139  RowBlockIdx_.clear();
140  U_.clear();
141  L_.clear();
142  workingSet_.clear();
143 
144  // Communication variables
145  isSendToBelow_.Clear();
146  isSendToRight_.Clear();
147  isSendToDiagonal_.Clear();
148  isSendToCrossDiagonal_.Clear();
149 
150  isRecvFromBelow_.Clear();
151  isRecvFromAbove_.Clear();
152  isRecvFromLeft_.Clear();
153  isRecvFromCrossDiagonal_.Clear();
154 
155 
156  //Cleanup tree information
157  for(int i =0;i<fwdToBelowTree_.size();++i){
158  if(fwdToBelowTree_[i]!=NULL){
159  delete fwdToBelowTree_[i];
160  }
161  }
162 
163  for(int i =0;i<fwdToRightTree_.size();++i){
164  if(fwdToRightTree_[i]!=NULL){
165  delete fwdToRightTree_[i];
166  }
167  }
168 
169  for(int i =0;i<redToLeftTree_.size();++i){
170  if(redToLeftTree_[i]!=NULL){
171  delete redToLeftTree_[i];
172  }
173  }
174  for(int i =0;i<redToAboveTree_.size();++i){
175  if(redToAboveTree_[i]!=NULL){
176  delete redToAboveTree_[i];
177  }
178  }
179  }
180 
181  template<typename T>
182  PMatrix<T> & PMatrix<T>::operator = ( const PMatrix<T> & C){
183  if(&C!=this){
184  //If we have some memory allocated, delete it
185  deallocate();
186 
187  grid_ = C.grid_;
188  super_ = C.super_;
189  options_ = C.options_;
190  optionsLU_ = C.optionsLU_;
191 
192 
193  ColBlockIdx_ = C.ColBlockIdx_;
194  RowBlockIdx_ = C.RowBlockIdx_;
195  L_ = C.L_;
196  U_ = C.U_;
197 
198  workingSet_ = C.workingSet_;
199 
200 
201  // Communication variables
202  isSendToBelow_ = C.isSendToBelow_;
203  isSendToRight_ = C.isSendToRight_;
204  isSendToDiagonal_ = C.isSendToDiagonal_;
205  isSendToCrossDiagonal_ = C.isSendToCrossDiagonal_;
206 
207  isRecvFromBelow_ = C.isRecvFromBelow_;
208  isRecvFromAbove_ = C.isRecvFromAbove_;
209  isRecvFromLeft_ = C.isRecvFromLeft_;
210  isRecvFromCrossDiagonal_ = C.isRecvFromCrossDiagonal_;
211 
212  fwdToBelowTree_.resize(C.fwdToBelowTree_.size());
213  for(int i = 0 ; i< C.fwdToBelowTree_.size();++i){
214  if(C.fwdToBelowTree_[i]!=NULL){
215  fwdToBelowTree_[i] = C.fwdToBelowTree_[i]->clone();
216  }
217  }
218 
219  fwdToRightTree_.resize(C.fwdToRightTree_.size());
220  for(int i = 0 ; i< C.fwdToRightTree_.size();++i){
221  if(C.fwdToRightTree_[i]!=NULL){
222  fwdToRightTree_[i] = C.fwdToRightTree_[i]->clone();
223  }
224  }
225 
226  redToLeftTree_.resize(C.redToLeftTree_.size());
227  for(int i = 0 ; i< C.redToLeftTree_.size();++i){
228  if(C.redToLeftTree_[i]!=NULL){
229  redToLeftTree_[i] = C.redToLeftTree_[i]->clone();
230  }
231  }
232  redToAboveTree_.resize(C.redToAboveTree_.size());
233  for(int i = 0 ; i< C.redToAboveTree_.size();++i){
234  if(C.redToAboveTree_[i]!=NULL){
235  redToAboveTree_[i] = C.redToAboveTree_[i]->clone();
236  }
237  }
238 
239  }
240 
241  return *this;
242  }
243 
244  template<typename T>
245  PMatrix<T>::PMatrix( const PMatrix<T> & C){
246  //If we have some memory allocated, delete it
247  deallocate();
248 
249  grid_ = C.grid_;
250  super_ = C.super_;
251  options_ = C.options_;
252  optionsLU_ = C.optionsLU_;
253 
254 
255  ColBlockIdx_ = C.ColBlockIdx_;
256  RowBlockIdx_ = C.RowBlockIdx_;
257  L_ = C.L_;
258  U_ = C.U_;
259 
260  workingSet_ = C.workingSet_;
261 
262 
263  // Communication variables
264  isSendToBelow_ = C.isSendToBelow_;
265  isSendToRight_ = C.isSendToRight_;
266  isSendToDiagonal_ = C.isSendToDiagonal_;
267  isSendToCrossDiagonal_ = C.isSendToCrossDiagonal_;
268 
269  isRecvFromBelow_ = C.isRecvFromBelow_;
270  isRecvFromAbove_ = C.isRecvFromAbove_;
271  isRecvFromLeft_ = C.isRecvFromLeft_;
272  isRecvFromCrossDiagonal_ = C.isRecvFromCrossDiagonal_;
273 
274 
275  fwdToBelowTree_.resize(C.fwdToBelowTree_.size());
276  for(int i = 0 ; i< C.fwdToBelowTree_.size();++i){
277  if(C.fwdToBelowTree_[i]!=NULL){
278  fwdToBelowTree_[i] = C.fwdToBelowTree_[i]->clone();
279  }
280  }
281 
282  fwdToRightTree_.resize(C.fwdToRightTree_.size());
283  for(int i = 0 ; i< C.fwdToRightTree_.size();++i){
284  if(C.fwdToRightTree_[i]!=NULL){
285  fwdToRightTree_[i] = C.fwdToRightTree_[i]->clone();
286  }
287  }
288 
289  redToLeftTree_.resize(C.redToLeftTree_.size());
290  for(int i = 0 ; i< C.redToLeftTree_.size();++i){
291  if(C.redToLeftTree_[i]!=NULL){
292  redToLeftTree_[i] = C.redToLeftTree_[i]->clone();
293  }
294  }
295  redToAboveTree_.resize(C.redToAboveTree_.size());
296  for(int i = 0 ; i< C.redToAboveTree_.size();++i){
297  if(C.redToAboveTree_[i]!=NULL){
298  redToAboveTree_[i] = C.redToAboveTree_[i]->clone();
299  }
300  }
301 
302  }
303 
304  template<typename T>
305  PMatrix<T>::PMatrix (
306  const GridType* g,
307  const SuperNodeType* s,
308  const PEXSI::PSelInvOptions * o,
309  const PEXSI::SuperLUOptions * oLU
310  )
311  {
312 #ifndef _RELEASE_
313  PushCallStack("PMatrix::PMatrix");
314 #endif
315 
316  this->Setup( g, s, o, oLU );
317 
318 #ifndef _RELEASE_
319  PopCallStack();
320 #endif
321  return ;
322  } // ----- end of method PMatrix::PMatrix -----
323 
324  template<typename T>
325  PMatrix<T>::~PMatrix ( )
326  {
327 
328 #ifndef _RELEASE_
329  PushCallStack("PMatrix::~PMatrix");
330 #endif
331 
332  deallocate();
333 
334 #ifndef _RELEASE_
335  PopCallStack();
336 #endif
337  return ;
338  } // ----- end of method PMatrix::~PMatrix -----
339 
340  template<typename T>
341  void PMatrix<T>::Setup(
342  const GridType* g,
343  const SuperNodeType* s,
344  const PEXSI::PSelInvOptions * o,
345  const PEXSI::SuperLUOptions * oLU
346  )
347  {
348 #ifndef _RELEASE_
349  PushCallStack("PMatrix::Setup");
350 #endif
351 
352  grid_ = g;
353  super_ = s;
354  options_ = o;
355  optionsLU_ = oLU;
356 
357  // if( grid_->numProcRow != grid_->numProcCol ){
358  // #ifdef USE_ABORT
359  //abort();
360  //#endif
361  //throw std::runtime_error( "The current version of SelInv only works for square processor grids." ); }
362 
363  L_.clear();
364  U_.clear();
365  ColBlockIdx_.clear();
366  RowBlockIdx_.clear();
367 
368  L_.resize( this->NumLocalBlockCol() );
369  U_.resize( this->NumLocalBlockRow() );
370 
371  ColBlockIdx_.resize( this->NumLocalBlockCol() );
372  RowBlockIdx_.resize( this->NumLocalBlockRow() );
373 
374  //workingSet_.resize(this->NumSuper());
375 #if ( _DEBUGlevel_ >= 1 )
376  statusOFS << std::endl << "PMatrix is constructed. The grid information: " << std::endl;
377  statusOFS << "mpirank = " << MYPROC(grid_) << std::endl;
378  statusOFS << "myrow = " << MYROW(grid_) << std::endl;
379  statusOFS << "mycol = " << MYCOL(grid_) << std::endl;
380 #endif
381 
382 #ifndef _RELEASE_
383  PopCallStack();
384 #endif
385  return ;
386  } // ----- end of method PMatrix::Setup -----
387 
388 
390  template<typename T>
391  inline void PMatrix<T>::SelInv_lookup_indexes(
392  SuperNodeBufferType & snode,
393  std::vector<LBlock<T> > & LcolRecv,
394  std::vector<UBlock<T> > & UrowRecv,
395  NumMat<T> & AinvBuf,
396  NumMat<T> & UBuf )
397  {
398  TIMER_START(Compute_Sinv_LT_Lookup_Indexes);
399 
400  TIMER_START(Build_colptr_rowptr);
401  // rowPtr[ib] gives the row index in snode.LUpdateBuf for the first
402  // nonzero row in LcolRecv[ib]. The total number of rows in
403  // snode.LUpdateBuf is given by the last entry of rowPtr.
404  std::vector<Int> rowPtr(LcolRecv.size() + 1);
405  // colPtr[jb] gives the column index in UBuf for the first
406  // nonzero column in UrowRecv[jb]. The total number of columns in
407  // UBuf is given by the last entry of colPtr.
408  std::vector<Int> colPtr(UrowRecv.size() + 1);
409 
410  rowPtr[0] = 0;
411  for( Int ib = 0; ib < LcolRecv.size(); ib++ ){
412  rowPtr[ib+1] = rowPtr[ib] + LcolRecv[ib].numRow;
413  }
414  colPtr[0] = 0;
415  for( Int jb = 0; jb < UrowRecv.size(); jb++ ){
416  colPtr[jb+1] = colPtr[jb] + UrowRecv[jb].numCol;
417  }
418 
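 // Example: if the blocks in LcolRecv have 3 and 2 rows, then rowPtr = {0, 3, 5};
 // block ib starts at row rowPtr[ib] of AinvBuf, and AinvBuf has 5 rows in total.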
419  Int numRowAinvBuf = *rowPtr.rbegin();
420  Int numColAinvBuf = *colPtr.rbegin();
421  TIMER_STOP(Build_colptr_rowptr);
422 
423  TIMER_START(Allocate_lookup);
424  // Allocate for the computational storage
425  AinvBuf.Resize( numRowAinvBuf, numColAinvBuf );
426  UBuf.Resize( SuperSize( snode.Index, super_ ), numColAinvBuf );
427  // TIMER_START(SetValue_lookup);
428  // SetValue( AinvBuf, ZERO<T>() );
429  //SetValue( snode.LUpdateBuf, ZERO<T>() );
430  // SetValue( UBuf, ZERO<T>() );
431  // TIMER_STOP(SetValue_lookup);
432  TIMER_STOP(Allocate_lookup);
433 
434  TIMER_START(Fill_UBuf);
435  // Fill UBuf first. Make the transpose later in the Gemm phase.
436  for( Int jb = 0; jb < UrowRecv.size(); jb++ ){
437  UBlock<T>& UB = UrowRecv[jb];
438  if( UB.numRow != SuperSize(snode.Index, super_) ){
439  throw std::logic_error( "The size of UB is not right. Something is seriously wrong." );
440  }
441  lapack::Lacpy( 'A', UB.numRow, UB.numCol, UB.nzval.Data(),
442  UB.numRow, UBuf.VecData( colPtr[jb] ), SuperSize( snode.Index, super_ ) );
443  }
444  TIMER_STOP(Fill_UBuf);
445 
446  // Calculate the relative indices for (isup, jsup)
447  // Fill AinvBuf with the information in L or U block.
448  TIMER_START(JB_Loop);
449 
450 #ifdef STDFIND
451  // for( Int jb = 0; jb < UrowRecv.size(); jb++ ){
452  //
453  // UBlock& UB = UrowRecv[jb];
454  // Int jsup = UB.blockIdx;
455  // Int SinvColsSta = FirstBlockCol( jsup, super_ );
456  //
457  // // Column relative indices
458  // std::vector<Int> relCols( UB.numCol );
459  // for( Int j = 0; j < UB.numCol; j++ ){
460  // relCols[j] = UB.cols[j] - SinvColsSta;
461  // }
462  //
463  //
464  //
465  //
466  // for( Int ib = 0; ib < LcolRecv.size(); ib++ ){
467  // LBlock& LB = LcolRecv[ib];
468  // Int isup = LB.blockIdx;
469  // Int SinvRowsSta = FirstBlockCol( isup, super_ );
470  // Scalar* nzvalAinv = &AinvBuf( rowPtr[ib], colPtr[jb] );
471  // Int ldAinv = numRowAinvBuf;
472  //
473  // // Pin down the corresponding block in the part of Sinv.
474  // if( isup >= jsup ){
475  // std::vector<LBlock>& LcolSinv = this->L( LBj(jsup, grid_ ) );
476  // bool isBlockFound = false;
477  // TIMER_START(PARSING_ROW_BLOCKIDX);
478  // for( Int ibSinv = 0; ibSinv < LcolSinv.size(); ibSinv++ ){
479  // // Found the (isup, jsup) block in Sinv
480  // if( LcolSinv[ibSinv].blockIdx == isup ){
481  // LBlock& SinvB = LcolSinv[ibSinv];
482  //
483  // // Row relative indices
484  // std::vector<Int> relRows( LB.numRow );
485  // Int* rowsLBPtr = LB.rows.Data();
486  // Int* rowsSinvBPtr = SinvB.rows.Data();
487  //
488  // TIMER_START(STDFIND_ROW);
489  // Int * pos =&rowsSinvBPtr[0];
490  // Int * last =&rowsSinvBPtr[SinvB.numRow];
491  // for( Int i = 0; i < LB.numRow; i++ ){
492  // // pos = std::find(pos, &rowsSinvBPtr[SinvB.numRow-1], rowsLBPtr[i]);
493  // pos = std::find(rowsSinvBPtr, last, rowsLBPtr[i]);
494  // if(pos != last){
495  // relRows[i] = (Int)(pos - rowsSinvBPtr);
496  // }
497  // else{
498  // std::ostringstream msg;
499  // msg << "Row " << rowsLBPtr[i] <<
500  // " in LB cannot find the corresponding row in SinvB" << std::endl
501  // << "LB.rows = " << LB.rows << std::endl
502  // << "SinvB.rows = " << SinvB.rows << std::endl;
503  // throw std::runtime_error( msg.str().c_str() );
504  // }
505  // }
506  // TIMER_STOP(STDFIND_ROW);
507  //
508  // TIMER_START(Copy_Sinv_to_Ainv);
509  // // Transfer the values from Sinv to AinvBlock
510  // Scalar* nzvalSinv = SinvB.nzval.Data();
511  // Int ldSinv = SinvB.numRow;
512  // for( Int j = 0; j < UB.numCol; j++ ){
513  // for( Int i = 0; i < LB.numRow; i++ ){
514  // nzvalAinv[i+j*ldAinv] =
515  // nzvalSinv[relRows[i] + relCols[j] * ldSinv];
516  // }
517  // }
518  // TIMER_STOP(Copy_Sinv_to_Ainv);
519  //
520  // isBlockFound = true;
521  // break;
522  // }
523  // } // for (ibSinv )
524  // TIMER_STOP(PARSING_ROW_BLOCKIDX);
525  // if( isBlockFound == false ){
526  // std::ostringstream msg;
527  // msg << "Block(" << isup << ", " << jsup
528  // << ") did not find a matching block in Sinv." << std::endl;
529  // throw std::runtime_error( msg.str().c_str() );
530  // }
531  // } // if (isup, jsup) is in L
532  // else{
533  // // Row relative indices
534  // std::vector<Int> relRows( LB.numRow );
535  // Int SinvRowsSta = FirstBlockCol( isup, super_ );
536  // for( Int i = 0; i < LB.numRow; i++ ){
537  // relRows[i] = LB.rows[i] - SinvRowsSta;
538  // }
539  // std::vector<UBlock>& UrowSinv = this->U( LBi( isup, grid_ ) );
540  // bool isBlockFound = false;
541  // TIMER_START(PARSING_COL_BLOCKIDX);
542  // for( Int jbSinv = 0; jbSinv < UrowSinv.size(); jbSinv++ ){
543  // // Found the (isup, jsup) block in Sinv
544  // if( UrowSinv[jbSinv].blockIdx == jsup ){
545  // UBlock& SinvB = UrowSinv[jbSinv];
546  //
547  //
548  //
549  // // Column relative indices
550  // std::vector<Int> relCols( UB.numCol );
551  // Int* colsUBPtr = UB.cols.Data();
552  // Int* colsSinvBPtr = SinvB.cols.Data();
553  // TIMER_START(STDFIND_COL);
554  // Int * pos =&colsSinvBPtr[0];
555  // Int * last =&colsSinvBPtr[SinvB.numCol];
556  // for( Int j = 0; j < UB.numCol; j++ ){
557  // //colsUB is sorted
558  // pos = std::find(colsSinvBPtr, last, colsUBPtr[j]);
559  // if(pos !=last){
560  // relCols[j] = (Int)(pos - colsSinvBPtr);
561  // }
562  // else{
563  // std::ostringstream msg;
564  // msg << "Col " << colsUBPtr[j] <<
565  // " in UB cannot find the corresponding row in SinvB" << std::endl
566  // << "UB.cols = " << UB.cols << std::endl
567  // << "UinvB.cols = " << SinvB.cols << std::endl;
568  // throw std::runtime_error( msg.str().c_str() );
569  // }
570  // }
571  // TIMER_STOP(STDFIND_COL);
572  //
573  //
574  // TIMER_START(Copy_Sinv_to_Ainv);
575  // // Transfer the values from Sinv to AinvBlock
576  // Scalar* nzvalSinv = SinvB.nzval.Data();
577  // Int ldSinv = SinvB.numRow;
578  // for( Int j = 0; j < UB.numCol; j++ ){
579  // for( Int i = 0; i < LB.numRow; i++ ){
580  // nzvalAinv[i+j*ldAinv] =
581  // nzvalSinv[relRows[i] + relCols[j] * ldSinv];
582  // }
583  // }
584  // TIMER_STOP(Copy_Sinv_to_Ainv);
585  //
586  // isBlockFound = true;
587  // break;
588  // }
589  // } // for (jbSinv)
590  // TIMER_STOP(PARSING_COL_BLOCKIDX);
591  // if( isBlockFound == false ){
592  // std::ostringstream msg;
593  // msg << "Block(" << isup << ", " << jsup
594  // << ") did not find a matching block in Sinv." << std::endl;
595  // throw std::runtime_error( msg.str().c_str() );
596  // }
597  // } // if (isup, jsup) is in U
598  //
599  // } // for( ib )
600  // } // for ( jb )
601 #else
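 // For every (LB, UB) pair, locate the corresponding block of Sinv, which is
 // stored in L when isup >= jsup and in U otherwise, and copy it into AinvBuf.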
602  for( Int jb = 0; jb < UrowRecv.size(); jb++ ){
603  for( Int ib = 0; ib < LcolRecv.size(); ib++ ){
604  LBlock<T>& LB = LcolRecv[ib];
605  UBlock<T>& UB = UrowRecv[jb];
606  Int isup = LB.blockIdx;
607  Int jsup = UB.blockIdx;
608  T* nzvalAinv = &AinvBuf( rowPtr[ib], colPtr[jb] );
609  Int ldAinv = AinvBuf.m();
610 
611  // Pin down the corresponding block in the part of Sinv.
612  if( isup >= jsup ){
613  std::vector<LBlock<T> >& LcolSinv = this->L( LBj(jsup, grid_ ) );
614  bool isBlockFound = false;
615  TIMER_START(PARSING_ROW_BLOCKIDX);
616  for( Int ibSinv = 0; ibSinv < LcolSinv.size(); ibSinv++ ){
617  // Found the (isup, jsup) block in Sinv
618  if( LcolSinv[ibSinv].blockIdx == isup ){
619  LBlock<T> & SinvB = LcolSinv[ibSinv];
620 
621  // Row relative indices
622  std::vector<Int> relRows( LB.numRow );
623  Int* rowsLBPtr = LB.rows.Data();
624  Int* rowsSinvBPtr = SinvB.rows.Data();
625  for( Int i = 0; i < LB.numRow; i++ ){
626  bool isRowFound = false;
627  for( Int i1 = 0; i1 < SinvB.numRow; i1++ ){
628  if( rowsLBPtr[i] == rowsSinvBPtr[i1] ){
629  isRowFound = true;
630  relRows[i] = i1;
631  break;
632  }
633  }
634  if( isRowFound == false ){
635  std::ostringstream msg;
636  msg << "Row " << rowsLBPtr[i] <<
637  " in LB cannot find the corresponding row in SinvB" << std::endl
638  << "LB.rows = " << LB.rows << std::endl
639  << "SinvB.rows = " << SinvB.rows << std::endl;
640  throw std::runtime_error( msg.str().c_str() );
641  }
642  }
643 
644  // Column relative indices
645  std::vector<Int> relCols( UB.numCol );
646  Int SinvColsSta = FirstBlockCol( jsup, super_ );
647  for( Int j = 0; j < UB.numCol; j++ ){
648  relCols[j] = UB.cols[j] - SinvColsSta;
649  }
650 
651  // Transfer the values from Sinv to AinvBlock
652  T* nzvalSinv = SinvB.nzval.Data();
653  Int ldSinv = SinvB.numRow;
654  for( Int j = 0; j < UB.numCol; j++ ){
655  for( Int i = 0; i < LB.numRow; i++ ){
656  nzvalAinv[i+j*ldAinv] =
657  nzvalSinv[relRows[i] + relCols[j] * ldSinv];
658  }
659  }
660 
661  isBlockFound = true;
662  break;
663  }
664  } // for (ibSinv )
665  TIMER_STOP(PARSING_ROW_BLOCKIDX);
666  if( isBlockFound == false ){
667  std::ostringstream msg;
668  msg << "Block(" << isup << ", " << jsup
669  << ") did not find a matching block in Sinv." << std::endl;
670  throw std::runtime_error( msg.str().c_str() );
671  }
672  } // if (isup, jsup) is in L
673  else{
674  std::vector<UBlock<T> >& UrowSinv = this->U( LBi( isup, grid_ ) );
675  bool isBlockFound = false;
676  TIMER_START(PARSING_COL_BLOCKIDX);
677  for( Int jbSinv = 0; jbSinv < UrowSinv.size(); jbSinv++ ){
678  // Found the (isup, jsup) block in Sinv
679  if( UrowSinv[jbSinv].blockIdx == jsup ){
680  UBlock<T> & SinvB = UrowSinv[jbSinv];
681 
682  // Row relative indices
683  std::vector<Int> relRows( LB.numRow );
684  Int SinvRowsSta = FirstBlockCol( isup, super_ );
685  for( Int i = 0; i < LB.numRow; i++ ){
686  relRows[i] = LB.rows[i] - SinvRowsSta;
687  }
688 
689  // Column relative indices
690  std::vector<Int> relCols( UB.numCol );
691  Int* colsUBPtr = UB.cols.Data();
692  Int* colsSinvBPtr = SinvB.cols.Data();
693  for( Int j = 0; j < UB.numCol; j++ ){
694  bool isColFound = false;
695  for( Int j1 = 0; j1 < SinvB.numCol; j1++ ){
696  if( colsUBPtr[j] == colsSinvBPtr[j1] ){
697  isColFound = true;
698  relCols[j] = j1;
699  break;
700  }
701  }
702  if( isColFound == false ){
703  std::ostringstream msg;
704  msg << "Col " << colsUBPtr[j] <<
705  " in UB cannot find the corresponding row in SinvB" << std::endl
706  << "UB.cols = " << UB.cols << std::endl
707  << "UinvB.cols = " << SinvB.cols << std::endl;
708  throw std::runtime_error( msg.str().c_str() );
709  }
710  }
711 
712 
713  // Transfer the values from Sinv to AinvBlock
714  T* nzvalSinv = SinvB.nzval.Data();
715  Int ldSinv = SinvB.numRow;
716  for( Int j = 0; j < UB.numCol; j++ ){
717  for( Int i = 0; i < LB.numRow; i++ ){
718  nzvalAinv[i+j*ldAinv] =
719  nzvalSinv[relRows[i] + relCols[j] * ldSinv];
720  }
721  }
722 
723  isBlockFound = true;
724  break;
725  }
726  } // for (jbSinv)
727  TIMER_STOP(PARSING_COL_BLOCKIDX);
728  if( isBlockFound == false ){
729  std::ostringstream msg;
730  msg << "Block(" << isup << ", " << jsup
731  << ") did not find a matching block in Sinv." << std::endl;
732  throw std::runtime_error( msg.str().c_str() );
733  }
734  } // if (isup, jsup) is in U
735 
736  } // for( ib )
737  } // for ( jb )
738 
739 
740 #endif
741  TIMER_STOP(JB_Loop);
742 
743 
744  TIMER_STOP(Compute_Sinv_LT_Lookup_Indexes);
745  }
746 
747  template<typename T>
748  inline void PMatrix<T>::SendRecvCD_UpdateU(
749  std::vector<SuperNodeBufferType > & arrSuperNodes,
750  Int stepSuper)
751  {
752 
753  TIMER_START(Send_CD_Update_U);
754  //compute the number of requests
755  Int sendCount = 0;
756  Int recvCount = 0;
757  Int sendOffset[stepSuper];
758  Int recvOffset[stepSuper];
759  Int recvIdx=0;
760  for (Int supidx=0; supidx<stepSuper; supidx++){
761  SuperNodeBufferType & snode = arrSuperNodes[supidx];
762  sendOffset[supidx]=sendCount;
763  recvOffset[supidx]=recvCount;
764  sendCount+= CountSendToCrossDiagonal(snode.Index);
765  recvCount+= CountRecvFromCrossDiagonal(snode.Index);
766  }
767 
768 
769  std::vector<MPI_Request > arrMpiReqsSendCD(sendCount, MPI_REQUEST_NULL );
770  std::vector<MPI_Request > arrMpiReqsSizeSendCD(sendCount, MPI_REQUEST_NULL );
771 
772  std::vector<MPI_Request > arrMpiReqsRecvCD(recvCount, MPI_REQUEST_NULL );
773  std::vector<MPI_Request > arrMpiReqsSizeRecvCD(recvCount, MPI_REQUEST_NULL );
774  std::vector<std::vector<char> > arrSstrLcolSendCD(sendCount);
775  std::vector<int > arrSstrLcolSizeSendCD(sendCount);
776  std::vector<std::vector<char> > arrSstrLcolRecvCD(recvCount);
777  std::vector<int > arrSstrLcolSizeRecvCD(recvCount);
778 
779  for (Int supidx=0; supidx<stepSuper; supidx++){
780  SuperNodeBufferType & snode = arrSuperNodes[supidx];
781 
782  // Send LUpdateBufReduced to the cross-diagonal blocks.
783  // NOTE: This assumes a square processor grid.
784 
785  TIMER_START(Send_L_CrossDiag);
786 
787  if( MYCOL( grid_ ) == PCOL( snode.Index, grid_ ) && isSendToCrossDiagonal_(grid_->numProcCol, snode.Index ) ){
788 
789  Int sendIdx = 0;
790  for(Int dstCol = 0; dstCol<grid_->numProcCol; dstCol++){
791  if(isSendToCrossDiagonal_(dstCol,snode.Index) ){
792  Int dest = PNUM(PROW(snode.Index,grid_),dstCol,grid_);
793 
794  if( MYPROC( grid_ ) != dest ){
795  MPI_Request & mpiReqSizeSend = arrMpiReqsSizeSendCD[sendOffset[supidx]+sendIdx];
796  MPI_Request & mpiReqSend = arrMpiReqsSendCD[sendOffset[supidx]+sendIdx];
797 
798 
799  std::stringstream sstm;
800  std::vector<char> & sstrLcolSend = arrSstrLcolSendCD[sendOffset[supidx]+sendIdx];
801  Int & sstrSize = arrSstrLcolSizeSendCD[sendOffset[supidx]+sendIdx];
802 
803  serialize( snode.RowLocalPtr, sstm, NO_MASK );
804  serialize( snode.BlockIdxLocal, sstm, NO_MASK );
805  serialize( snode.LUpdateBuf, sstm, NO_MASK );
806 
807  sstrLcolSend.resize( Size(sstm) );
808  sstm.read( &sstrLcolSend[0], sstrLcolSend.size() );
809  sstrSize = sstrLcolSend.size();
810 
811 
812  MPI_Isend( &sstrSize, sizeof(sstrSize), MPI_BYTE, dest, IDX_TO_TAG(snode.Index,SELINV_TAG_L_SIZE_CD), grid_->comm, &mpiReqSizeSend );
813  MPI_Isend( (void*)&sstrLcolSend[0], sstrSize, MPI_BYTE, dest, IDX_TO_TAG(snode.Index,SELINV_TAG_L_CONTENT_CD), grid_->comm, &mpiReqSend );
814 
815  PROFILE_COMM(MYPROC(this->grid_),dest,IDX_TO_TAG(snode.Index,SELINV_TAG_L_SIZE_CD),sizeof(sstrSize));
816  PROFILE_COMM(MYPROC(this->grid_),dest,IDX_TO_TAG(snode.Index,SELINV_TAG_L_CONTENT_CD),sstrSize);
817 
818 
819  sendIdx++;
820  }
821  }
822  }
823 
824 
825  } // sender
826  TIMER_STOP(Send_L_CrossDiag);
827  }
828 
829 
830  //Do Irecv for sizes
831  for (Int supidx=0; supidx<stepSuper; supidx++){
832  SuperNodeBufferType & snode = arrSuperNodes[supidx];
833  //If I'm a receiver
834  if( MYROW( grid_ ) == PROW( snode.Index, grid_ ) && isRecvFromCrossDiagonal_(grid_->numProcRow, snode.Index ) ){
835  Int recvIdx=0;
836  for(Int srcRow = 0; srcRow<grid_->numProcRow; srcRow++){
837  if(isRecvFromCrossDiagonal_(srcRow,snode.Index) ){
838  Int src = PNUM(srcRow,PCOL(snode.Index,grid_),grid_);
839  if( MYPROC( grid_ ) != src ){
840  Int & sstrSize = arrSstrLcolSizeRecvCD[recvOffset[supidx]+recvIdx];
841  MPI_Request & mpiReqSizeRecv = arrMpiReqsSizeRecvCD[recvOffset[supidx]+recvIdx];
842  MPI_Irecv( &sstrSize, 1, MPI_INT, src, IDX_TO_TAG(snode.Index,SELINV_TAG_L_SIZE_CD), grid_->comm, &mpiReqSizeRecv );
843  recvIdx++;
844  }
845  }
846  }
847  }//end if I'm a receiver
848  }
849 
850  //waitall sizes
851  mpi::Waitall(arrMpiReqsSizeRecvCD);
852  //Allocate content and do Irecv
853  for (Int supidx=0; supidx<stepSuper; supidx++){
854  SuperNodeBufferType & snode = arrSuperNodes[supidx];
855  //If I'm a receiver
856  if( MYROW( grid_ ) == PROW( snode.Index, grid_ ) && isRecvFromCrossDiagonal_(grid_->numProcRow, snode.Index ) ){
857  Int recvIdx=0;
858  for(Int srcRow = 0; srcRow<grid_->numProcRow; srcRow++){
859  if(isRecvFromCrossDiagonal_(srcRow,snode.Index) ){
860  Int src = PNUM(srcRow,PCOL(snode.Index,grid_),grid_);
861  if( MYPROC( grid_ ) != src ){
862  Int & sstrSize = arrSstrLcolSizeRecvCD[recvOffset[supidx]+recvIdx];
863  std::vector<char> & sstrLcolRecv = arrSstrLcolRecvCD[recvOffset[supidx]+recvIdx];
864  MPI_Request & mpiReqRecv = arrMpiReqsRecvCD[recvOffset[supidx]+recvIdx];
865  sstrLcolRecv.resize( sstrSize);
866  MPI_Irecv( (void*)&sstrLcolRecv[0], sstrSize, MPI_BYTE, src, IDX_TO_TAG(snode.Index,SELINV_TAG_L_CONTENT_CD), grid_->comm, &mpiReqRecv );
867  //statusOFS<<"P"<<MYPROC(grid_)<<" received "<<sstrSize<<" bytes of L/U from CD P"<<src<<std::endl;
868  recvIdx++;
869  }
870  }
871  }
872  }//end if I'm a receiver
873  }
874 
875  //waitall content
876  mpi::Waitall(arrMpiReqsRecvCD);
877  //Do the work
878  for (Int supidx=0; supidx<stepSuper; supidx++){
879  SuperNodeBufferType & snode = arrSuperNodes[supidx];
880 
881  // Receive LUpdateBufReduced from the cross-diagonal blocks and update U.
882  // NOTE: This assumes a square processor grid.
883  if( MYROW( grid_ ) == PROW( snode.Index, grid_ ) && isRecvFromCrossDiagonal_(grid_->numProcRow, snode.Index ) ){
884 
885 #if ( _DEBUGlevel_ >= 1 )
886  statusOFS << std::endl << " ["<<snode.Index<<"] "<< "Update the upper triangular block" << std::endl << std::endl;
887  statusOFS << std::endl << " ["<<snode.Index<<"] "<< "blockIdxLocal:" << snode.BlockIdxLocal << std::endl << std::endl;
888  statusOFS << std::endl << " ["<<snode.Index<<"] "<< "rowLocalPtr:" << snode.RowLocalPtr << std::endl << std::endl;
889 #endif
890 
891  std::vector<UBlock<T> >& Urow = this->U( LBi( snode.Index, grid_ ) );
892  std::vector<bool> isBlockFound(Urow.size(),false);
893 
894  recvIdx=0;
895  for(Int srcRow = 0; srcRow<grid_->numProcRow; srcRow++){
896  if(isRecvFromCrossDiagonal_(srcRow,snode.Index) ){
897  Int src = PNUM(srcRow,PCOL(snode.Index,grid_),grid_);
898  TIMER_START(Recv_L_CrossDiag);
899 
900  std::vector<Int> rowLocalPtrRecv;
901  std::vector<Int> blockIdxLocalRecv;
902  NumMat<T> UUpdateBuf;
903 
904  if( MYPROC( grid_ ) != src ){
905  std::stringstream sstm;
906  Int & sstrSize = arrSstrLcolSizeRecvCD[recvOffset[supidx]+recvIdx];
907  std::vector<char> & sstrLcolRecv = arrSstrLcolRecvCD[recvOffset[supidx]+recvIdx];
908  sstm.write( &sstrLcolRecv[0], sstrSize );
909 
910  deserialize( rowLocalPtrRecv, sstm, NO_MASK );
911  deserialize( blockIdxLocalRecv, sstm, NO_MASK );
912  deserialize( UUpdateBuf, sstm, NO_MASK );
913 
914  recvIdx++;
915 
916  } // sender is not the same as receiver
917  else{
918  rowLocalPtrRecv = snode.RowLocalPtr;
919  blockIdxLocalRecv = snode.BlockIdxLocal;
920  UUpdateBuf = snode.LUpdateBuf;
921  } // sender is the same as receiver
922 
923 
924 
925  TIMER_STOP(Recv_L_CrossDiag);
926 
927 #if ( _DEBUGlevel_ >= 1 )
928  statusOFS<<" ["<<snode.Index<<"] P"<<MYPROC(grid_)<<" ("<<MYROW(grid_)<<","<<MYCOL(grid_)<<") <--- LBj("<<snode.Index<<") <--- P"<<src<<std::endl;
929  statusOFS << std::endl << " ["<<snode.Index<<"] "<< "rowLocalPtrRecv:" << rowLocalPtrRecv << std::endl << std::endl;
930  statusOFS << std::endl << " ["<<snode.Index<<"] "<< "blockIdxLocalRecv:" << blockIdxLocalRecv << std::endl << std::endl;
931 #endif
932 
933 
934  // Update U
935  for( Int ib = 0; ib < blockIdxLocalRecv.size(); ib++ ){
936  for( Int jb = 0; jb < Urow.size(); jb++ ){
937  UBlock<T>& UB = Urow[jb];
938  if( UB.blockIdx == blockIdxLocalRecv[ib] ){
939  NumMat<T> Ltmp ( UB.numCol, UB.numRow );
940  lapack::Lacpy( 'A', Ltmp.m(), Ltmp.n(),
941  &UUpdateBuf( rowLocalPtrRecv[ib], 0 ),
942  UUpdateBuf.m(), Ltmp.Data(), Ltmp.m() );
943  isBlockFound[jb] = true;
944  Transpose( Ltmp, UB.nzval );
945  break;
946  }
947  }
948  }
949  }
950  }
951 
952  for( Int jb = 0; jb < Urow.size(); jb++ ){
953  UBlock<T>& UB = Urow[jb];
954  if( !isBlockFound[jb] ){
955 #ifdef USE_ABORT
956  abort();
957 #endif
958  throw std::logic_error( "UBlock cannot find its update. Something is seriously wrong." );
959  }
960  }
961  } // receiver
962  }
963 
964  TIMER_STOP(Send_CD_Update_U);
965 
966  mpi::Waitall(arrMpiReqsSizeSendCD);
967  mpi::Waitall(arrMpiReqsSendCD);
968  };
969 
970  template<typename T>
971  inline void PMatrix<T>::UnpackData(
972  SuperNodeBufferType & snode,
973  std::vector<LBlock<T> > & LcolRecv,
974  std::vector<UBlock<T> > & UrowRecv )
975  {
976 #if ( _DEBUGlevel_ >= 1 )
977  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Unpack the received data for the processors that participate in the Gemm. " << std::endl << std::endl;
978 #endif
979  // U part
980  if( MYROW( grid_ ) != PROW( snode.Index, grid_ ) ){
981  std::stringstream sstm;
982  sstm.write( &snode.SstrUrowRecv[0], snode.SstrUrowRecv.size() );
983  std::vector<Int> mask( UBlockMask::TOTAL_NUMBER, 1 );
984  Int numUBlock;
985  deserialize( numUBlock, sstm, NO_MASK );
986  UrowRecv.resize( numUBlock );
987  for( Int jb = 0; jb < numUBlock; jb++ ){
988  deserialize( UrowRecv[jb], sstm, mask );
989  }
990  } // sender is not the same as receiver
991  else{
992  // U is obtained locally, just make a copy. Include everything
993  // (there is no diagonal block)
994  // This makes a copy; it could be replaced by something
995  // more efficient, especially when mpisize == 1.
996  UrowRecv.resize(this->U( LBi( snode.Index, grid_ ) ).size());
997  std::copy(this->U( LBi( snode.Index, grid_ ) ).begin(),this->U( LBi( snode.Index, grid_ )).end(),UrowRecv.begin());
998  } // sender is the same as receiver
999 
1000 
1001  //L part
1002  if( MYCOL( grid_ ) != PCOL( snode.Index, grid_ ) ){
1003  std::stringstream sstm;
1004  sstm.write( &snode.SstrLcolRecv[0], snode.SstrLcolRecv.size() );
1005  std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
1006  mask[LBlockMask::NZVAL] = 0; // nzval is excluded
1007  Int numLBlock;
1008  deserialize( numLBlock, sstm, NO_MASK );
1009  LcolRecv.resize( numLBlock );
1010  for( Int ib = 0; ib < numLBlock; ib++ ){
1011  deserialize( LcolRecv[ib], sstm, mask );
1012  }
1013  } // sender is not the same as receiver
1014  else{
1015  // L is obtained locally, just make a copy.
1016  // Do not include the diagonal block
1017  std::vector<LBlock<T> >& Lcol = this->L( LBj( snode.Index, grid_ ) );
1018  Int startIdx = ( MYROW( grid_ ) == PROW( snode.Index, grid_ ) )?1:0;
1019  LcolRecv.resize( Lcol.size() - startIdx );
1020  std::copy(Lcol.begin()+startIdx,Lcol.end(),LcolRecv.begin());
1021  } // sender is the same as receiver
1022  }
1023 
1024  template<typename T>
1025  inline void PMatrix<T>::ComputeDiagUpdate(SuperNodeBufferType & snode)
1026  {
1027 
1028  //--------- Compute the diagonal block; all processors in the column participate in all pipelined supernodes
1029  if( MYCOL( grid_ ) == PCOL( snode.Index, grid_ ) ){
1030 #if ( _DEBUGlevel_ >= 2 )
1031  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Updating the diagonal block" << std::endl << std::endl;
1032 #endif
1033  std::vector<LBlock<T> >& Lcol = this->L( LBj( snode.Index, grid_ ) );
1034 
1035  //Allocate DiagBuf even if Lcol.size() == 0
1036  snode.DiagBuf.Resize(SuperSize( snode.Index, super_ ), SuperSize( snode.Index, super_ ));
1037  SetValue(snode.DiagBuf, ZERO<T>());
1038 
1039  // Do I own the diagonal block ?
1040  Int startIb = (MYROW( grid_ ) == PROW( snode.Index, grid_ ))?1:0;
1041  for( Int ib = startIb; ib < Lcol.size(); ib++ ){
1042 
1043 #ifdef GEMM_PROFILE
1044  gemm_stat.push_back(snode.DiagBuf.m());
1045  gemm_stat.push_back(snode.DiagBuf.n());
1046  gemm_stat.push_back(Lcol[ib].numRow);
1047 #endif
1048 
1049  LBlock<T> & LB = Lcol[ib];
1050 
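 // DiagBuf -= LUpdateBuf(ib)^T * L(ib): accumulate this block row's
 // contribution to the diagonal block of the selected inverse.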
1051  blas::Gemm( 'T', 'N', snode.DiagBuf.m(), snode.DiagBuf.n(), LB.numRow,
1052  MINUS_ONE<T>(), &snode.LUpdateBuf( snode.RowLocalPtr[ib-startIb], 0 ), snode.LUpdateBuf.m(),
1053  LB.nzval.Data(), LB.nzval.m(), ONE<T>(), snode.DiagBuf.Data(), snode.DiagBuf.m() );
1054  }
1055 
1056 #if ( _DEBUGlevel_ >= 1 )
1057  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Updated the diagonal block" << std::endl << std::endl;
1058 #endif
1059  }
1060  }
1061 
1062 
1063 
1064  template<typename T>
1065  void PMatrix<T>::GetEtree(std::vector<Int> & etree_supno )
1066  {
1067 
1068 #ifndef _RELEASE_
1069  PushCallStack("PMatrix::GetEtree");
1070  double begin = MPI_Wtime( );
1071 #endif
1072  Int nsupers = this->NumSuper();
1073 
1074  if( optionsLU_->ColPerm != "PARMETIS" ) {
1075  /* Use the etree computed from the serial symbolic factorization,
1076  and turn it into a supernodal tree. */
1077  const SuperNodeType * superNode = this->SuperNode();
1078 
1079 
1080  // Translate the column elimination tree into a supernodal etree using superIdx
1081  etree_supno.resize(this->NumSuper());
1082  for(Int i = 0; i < superNode->etree.m(); ++i){
1083  Int curSnode = superNode->superIdx[i];
1084  Int parentSnode = (superNode->etree[i]>= superNode->etree.m()) ?this->NumSuper():superNode->superIdx[superNode->etree[i]];
1085  if( curSnode != parentSnode){
1086  etree_supno[curSnode] = parentSnode;
1087  }
1088  }
1089 
1090  } else { /* ParSymbFACT==YES and SymPattern==YES and RowPerm == NOROWPERM */
1091  /* Compute an "etree" based on struct(L),
1092  assuming struct(U) = struct(L'). */
1093 
1094  /* find the first block in each supernodal-column of local L-factor */
1095  std::vector<Int> etree_supno_l( nsupers, nsupers );
1096  for( Int ksup = 0; ksup < nsupers; ksup++ ){
1097  if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
1098  // L part
1099  std::vector<LBlock<T> >& Lcol = this->L( LBj(ksup, grid_) );
1100  if(Lcol.size()>0){
1101  Int firstBlk = 0;
1102  if( MYROW( grid_ ) == PROW( ksup, grid_ ) )
1103  firstBlk=1;
1104 
1105  for( Int ib = firstBlk; ib < Lcol.size(); ib++ ){
1106  etree_supno_l[ksup] = std::min(etree_supno_l[ksup] , Lcol[ib].blockIdx);
1107  }
1108  }
1109  }
1110  }
1111 
1112 
1113 #if ( _DEBUGlevel_ >= 1 )
1114  statusOFS << std::endl << " Local supernodal elimination tree is " << etree_supno_l <<std::endl<<std::endl;
1115 
1116 #endif
1117  /* form global e-tree */
1118  etree_supno.resize( nsupers );
1119  mpi::Allreduce( (Int*) &etree_supno_l[0],(Int *) &etree_supno[0], nsupers, MPI_MIN, grid_->comm );
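 // The elementwise MIN over all processes gives, for each supernode, the
 // row index of its first off-diagonal block, which is taken as its parent
 // in the supernodal etree.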
1120  etree_supno[nsupers-1]=nsupers;
1121  }
1122 
1123 #ifndef _RELEASE_
1124  double end = MPI_Wtime( );
1125  statusOFS<<"Building the list took "<<end-begin<<"s"<<std::endl;
1126 #endif
1127 #ifndef _RELEASE_
1128  PopCallStack();
1129 #endif
1130  } // ----- end of method PMatrix::GetEtree -----
1131 
1132 
1133  template<typename T>
1134  inline void PMatrix<T>::GetWorkSet(
1135  std::vector<Int> & snodeEtree,
1136  std::vector<std::vector<Int> > & WSet)
1137  {
1138  TIMER_START(Compute_WorkSet);
1139  Int numSuper = this->NumSuper();
1140 
1141 
1142  if (options_->maxPipelineDepth!=1){
1143  //find roots in the supernode etree (it must be postordered)
1144  //initialize the parent we are looking at
1145  //Int rootParent = snodeEtree[numSuper-2];
1146  Int rootParent = numSuper;
1147 
1148  //compute the level of each supernode and the total number of levels
1149  //IntNumVec level(numSuper);
1150  //level(rootParent)=0;
1151  IntNumVec level(numSuper+1);
1152  level(rootParent)=-1;
1153  Int numLevel = 0;
1154  for(Int i=rootParent-1; i>=0; i-- ){ level(i) = level(snodeEtree[i])+1; numLevel = std::max(numLevel, level(i)); }
1155  numLevel++;
1156 
1157  //Compute the number of supernodes at each level
1158  IntNumVec levelSize(numLevel);
1159  SetValue(levelSize,I_ZERO);
1160  //for(Int i=rootParent-1; i>=0; i-- ){ levelSize(level(i))++; }
1161  for(Int i=rootParent-1; i>=0; i-- ){ if(level[i]>=0){ levelSize(level(i))++; } }
1162 
1163  //Allocate memory
1164  WSet.resize(numLevel,std::vector<Int>());
1165  for(Int i=0; i<numLevel; i++ ){WSet[i].reserve(levelSize(i));}
1166 
1167  //Fill the worklist based on the level of each supernode
1168  for(Int i=rootParent-1; i>=0; i-- ){
1169  WSet[level(i)].push_back(i);
1170  }
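 // Example: for three supernodes with etree 0 -> 1 -> 2 (postordered, 2 at the
 // top), level = {2, 1, 0} and WSet becomes {{2}, {1}, {0}}: supernodes closer
 // to the root of the etree are placed in earlier lists.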
1171 
1172  //Constrain the size of each list to be min(MPI_MAX_COMM,options_->maxPipelineDepth)
1173  Int limit = (options_->maxPipelineDepth>0)?std::min(MPI_MAX_COMM,options_->maxPipelineDepth):MPI_MAX_COMM;
1174  for (Int lidx=0; lidx<WSet.size() ; lidx++){
1175  if(WSet[lidx].size()>limit)
1176  {
1177  std::vector<std::vector<Int> >::iterator pos = WSet.begin()+lidx+1;
1178  WSet.insert(pos,std::vector<Int>());
1179  WSet[lidx+1].insert(WSet[lidx+1].begin(),WSet[lidx].begin() + limit ,WSet[lidx].end());
1180  WSet[lidx].erase(WSet[lidx].begin()+limit,WSet[lidx].end());
1181  }
1182  }
1183 
1184 
1185 
1186  }
1187  else{
1188  for( Int ksup = numSuper - 2; ksup >= 0; ksup-- ){
1189  WSet.push_back(std::vector<Int>());
1190  WSet.back().push_back(ksup);
1191  }
1192 
1193  }
1194 
1195 
1196 
1197 
1198  TIMER_STOP(Compute_WorkSet);
1199 #if ( _DEBUGlevel_ >= 1 )
1200  for (Int lidx=0; lidx<WSet.size() ; lidx++){
1201  statusOFS << std::endl << "L"<< lidx << " is: {";
1202  for (Int supidx=0; supidx<WSet[lidx].size() ; supidx++){
1203  statusOFS << WSet[lidx][supidx] << " ["<<snodeEtree[WSet[lidx][supidx]]<<"] ";
1204  }
1205  statusOFS << " }"<< std::endl;
1206  }
1207 #endif
1208  }
1209 
1210  template<typename T>
1211  inline void PMatrix<T>::SelInvIntra_P2p(Int lidx)
1212  {
1213 
1214 #if defined (PROFILE) || defined(PMPI) || defined(USE_TAU)
1215  Real begin_SendULWaitContentFirst, end_SendULWaitContentFirst, time_SendULWaitContentFirst = 0;
1216 #endif
1217  Int numSuper = this->NumSuper();
1218  std::vector<std::vector<Int> > & superList = this->WorkingSet();
1219  Int numSteps = superList.size();
1220  Int stepSuper = superList[lidx].size();
1221 
1222  TIMER_START(AllocateBuffer);
1223 
1224  //This is required to send the size and content of U/L
1225  std::vector<std::vector<MPI_Request> > arrMpireqsSendToBelow;
1226  arrMpireqsSendToBelow.resize( stepSuper, std::vector<MPI_Request>( 2 * grid_->numProcRow, MPI_REQUEST_NULL ));
1227  std::vector<std::vector<MPI_Request> > arrMpireqsSendToRight;
1228  arrMpireqsSendToRight.resize(stepSuper, std::vector<MPI_Request>( 2 * grid_->numProcCol, MPI_REQUEST_NULL ));
1229 
1230  //This is required to reduce L
1231  std::vector<MPI_Request> arrMpireqsSendToLeft;
1232  arrMpireqsSendToLeft.resize(stepSuper, MPI_REQUEST_NULL );
1233  //This is required to reduce D
1234  std::vector<MPI_Request> arrMpireqsSendToAbove;
1235  arrMpireqsSendToAbove.resize(stepSuper, MPI_REQUEST_NULL );
1236 
1237  //This is required to receive the size and content of U/L
1238  std::vector<MPI_Request> arrMpireqsRecvSizeFromAny;
1239  arrMpireqsRecvSizeFromAny.resize(stepSuper*2 , MPI_REQUEST_NULL);
1240  std::vector<MPI_Request> arrMpireqsRecvContentFromAny;
1241  arrMpireqsRecvContentFromAny.resize(stepSuper*2 , MPI_REQUEST_NULL);
1242 
1243 
1244  // Allocate the buffers for the supernodes processed in this step
1245  std::vector<SuperNodeBufferType> arrSuperNodes(stepSuper);
1246  for (Int supidx=0; supidx<stepSuper; supidx++){
1247  arrSuperNodes[supidx].Index = superList[lidx][supidx];
1248  }
1249 
1250 
1251 
1252  int numSentToLeft = 0;
1253  std::vector<int> reqSentToLeft;
1254 
1255 
1256  NumMat<T> AinvBuf, UBuf;
1257 
1258  TIMER_STOP(AllocateBuffer);
1259 
1260 #ifndef _RELEASE_
1261  PushCallStack("PMatrix::SelInv_P2p::UpdateL");
1262 #endif
1263 #if ( _DEBUGlevel_ >= 1 )
1264  statusOFS << std::endl << "Communication to the Schur complement." << std::endl << std::endl;
1265 #endif
1266 
1267  {
1268  // Senders
1269  //Receivers have to resize their buffers
1270  TIMER_START(IRecv_Content_UL);
1271  // Receivers (Content)
1272  for (Int supidx=0; supidx<stepSuper ; supidx++){
1273  SuperNodeBufferType & snode = arrSuperNodes[supidx];
1274  MPI_Request * mpireqsRecvFromAbove = &arrMpireqsRecvContentFromAny[supidx*2];
1275  MPI_Request * mpireqsRecvFromLeft = &arrMpireqsRecvContentFromAny[supidx*2+1];
1276 
1277  if( isRecvFromAbove_( snode.Index ) &&
1278  MYROW( grid_ ) != PROW( snode.Index, grid_ ) ){
1279 
1280  TreeBcast * bcastUTree = fwdToBelowTree_[snode.Index];
1281  if(bcastUTree!=NULL){
1282  Int myRoot = bcastUTree->GetRoot();
1283  snode.SizeSstrUrowRecv = bcastUTree->GetMsgSize();
1284  snode.SstrUrowRecv.resize( snode.SizeSstrUrowRecv);
1285  MPI_Irecv( &snode.SstrUrowRecv[0], snode.SizeSstrUrowRecv, MPI_BYTE,
1286  myRoot, IDX_TO_TAG(snode.Index,SELINV_TAG_U_CONTENT),
1287  grid_->colComm, mpireqsRecvFromAbove );
1288 #if ( _DEBUGlevel_ >= 1 )
1289  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Receiving U " << snode.SizeSstrUrowRecv << " BYTES from "<<myRoot<<" on tag "<<IDX_TO_TAG(snode.Index,SELINV_TAG_U_CONTENT)<< std::endl << std::endl;
1290 #endif
1291  }
1292  } // if I need to receive from up
1293 
1294  if( isRecvFromLeft_( snode.Index ) &&
1295  MYCOL( grid_ ) != PCOL( snode.Index, grid_ ) ){
1296  TreeBcast * bcastLTree = fwdToRightTree_[snode.Index];
1297  if(bcastLTree!=NULL){
1298  Int myRoot = bcastLTree->GetRoot();
1299  snode.SizeSstrLcolRecv = bcastLTree->GetMsgSize();
1300  snode.SstrLcolRecv.resize(snode.SizeSstrLcolRecv);
1301  MPI_Irecv( &snode.SstrLcolRecv[0], snode.SizeSstrLcolRecv, MPI_BYTE,
1302  myRoot, IDX_TO_TAG(snode.Index,SELINV_TAG_L_CONTENT),
1303  grid_->rowComm, mpireqsRecvFromLeft );
1304 #if ( _DEBUGlevel_ >= 1 )
1305  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Receiving L " << snode.SizeSstrLcolRecv << " BYTES from "<<myRoot<<" on tag "<<IDX_TO_TAG(snode.Index,SELINV_TAG_L_CONTENT)<< std::endl << std::endl;
1306 #endif
1307  }
1308  } // if I need to receive from left
1309  }
1310  TIMER_STOP(IRecv_Content_UL);
1311 
1312  // Senders
1313  TIMER_START(ISend_Content_UL);
1314  for (Int supidx=0; supidx<stepSuper; supidx++){
1315  SuperNodeBufferType & snode = arrSuperNodes[supidx];
1316  std::vector<MPI_Request> & mpireqsSendToBelow = arrMpireqsSendToBelow[supidx];
1317  std::vector<MPI_Request> & mpireqsSendToRight = arrMpireqsSendToRight[supidx];
1318 
1319 #if ( _DEBUGlevel_ >= 1 )
1320  statusOFS << std::endl << "["<<snode.Index<<"] "
1321  << "Communication for the U part." << std::endl << std::endl;
1322 #endif
1323  // Communication for the U part.
1324  if( MYROW( grid_ ) == PROW( snode.Index, grid_ ) ){
1325  std::vector<UBlock<T> >& Urow = this->U( LBi(snode.Index, grid_) );
1326  // Pack the data in U
1327  TIMER_START(Serialize_UL);
1328  std::stringstream sstm;
1329 
1330  std::vector<Int> mask( UBlockMask::TOTAL_NUMBER, 1 );
1331  // All blocks are to be sent down.
1332  serialize( (Int)Urow.size(), sstm, NO_MASK );
1333  for( Int jb = 0; jb < Urow.size(); jb++ ){
1334  serialize( Urow[jb], sstm, mask );
1335  }
1336  snode.SstrUrowSend.resize( Size( sstm ) );
1337  sstm.read( &snode.SstrUrowSend[0], snode.SstrUrowSend.size() );
1338  snode.SizeSstrUrowSend = snode.SstrUrowSend.size();
1339  TIMER_STOP(Serialize_UL);
1340  TreeBcast * bcastUTree = fwdToBelowTree_[snode.Index];
1341  if(bcastUTree!=NULL){
1342  bcastUTree->ForwardMessage((char*)&snode.SstrUrowSend[0], snode.SizeSstrUrowSend,
1343  IDX_TO_TAG(snode.Index,SELINV_TAG_U_CONTENT), &mpireqsSendToBelow[0] );
1344  for( Int idxRecv = 0; idxRecv < bcastUTree->GetDestCount(); ++idxRecv ){
1345  Int iProcRow = bcastUTree->GetDest(idxRecv);
1346 #if ( _DEBUGlevel_ >= 1 )
1347  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Sending U " << snode.SizeSstrUrowSend << " BYTES on tag "<<IDX_TO_TAG(snode.Index,SELINV_TAG_U_CONTENT) << std::endl << std::endl;
1348 #endif
1349  }
1350  }
1351  } // if I am the sender
1352 
1353 
1354 #if ( _DEBUGlevel_ >= 1 )
1355  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Communication for the L part." << std::endl << std::endl;
1356 #endif
1357 
1358 
1359 
1360  // Communication for the L part.
1361  if( MYCOL( grid_ ) == PCOL( snode.Index, grid_ ) ){
1362  std::vector<LBlock<T> >& Lcol = this->L( LBj(snode.Index, grid_) );
1363  TIMER_START(Serialize_UL);
1364  // Pack the data in L
1365  std::stringstream sstm;
1366  std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
1367  mask[LBlockMask::NZVAL] = 0; // nzval is excluded
1368 
1369  // All blocks except for the diagonal block are to be sent right
1370 
1371  if( MYROW( grid_ ) == PROW( snode.Index, grid_ ) )
1372  serialize( (Int)Lcol.size() - 1, sstm, NO_MASK );
1373  else
1374  serialize( (Int)Lcol.size(), sstm, NO_MASK );
1375 
1376  for( Int ib = 0; ib < Lcol.size(); ib++ ){
1377  if( Lcol[ib].blockIdx > snode.Index ){
1378 #if ( _DEBUGlevel_ >= 2 )
1379  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Serializing Block index " << Lcol[ib].blockIdx << std::endl;
1380 #endif
1381  serialize( Lcol[ib], sstm, mask );
1382  }
1383  }
1384  snode.SstrLcolSend.resize( Size( sstm ) );
1385  sstm.read( &snode.SstrLcolSend[0], snode.SstrLcolSend.size() );
1386  snode.SizeSstrLcolSend = snode.SstrLcolSend.size();
1387  TIMER_STOP(Serialize_UL);
1388 
1389 
1390  TreeBcast * bcastLTree = fwdToRightTree_[snode.Index];
1391  if(bcastLTree!=NULL){
1392  bcastLTree->ForwardMessage((char*)&snode.SstrLcolSend[0], snode.SizeSstrLcolSend,
1393  IDX_TO_TAG(snode.Index,SELINV_TAG_L_CONTENT), &mpireqsSendToRight[0] );
1394 
1395  for( Int idxRecv = 0; idxRecv < bcastLTree->GetDestCount(); ++idxRecv ){
1396  Int iProcCol = bcastLTree->GetDest(idxRecv);
1397 #if ( _DEBUGlevel_ >= 1 )
1398  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Sending L " << snode.SizeSstrLcolSend << " BYTES on tag "<<IDX_TO_TAG(snode.Index,SELINV_TAG_L_CONTENT) << std::endl << std::endl;
1399 #endif
1400  }
1401  }
1402  } // if I am the sender
1403  } //Senders
1404  TIMER_STOP(ISend_Content_UL);
1405  }
1406 
1407  vector<char> redLdone(stepSuper,0);
1408  for (Int supidx=0; supidx<stepSuper; supidx++){
1409  SuperNodeBufferType & snode = arrSuperNodes[supidx];
1410  TreeReduce<T> * redLTree = redToLeftTree_[snode.Index];
1411 
1412  if(redLTree != NULL){
1413  redLTree->SetTag(IDX_TO_TAG(snode.Index,SELINV_TAG_L_REDUCE));
1414  //Initialize the tree
1415  redLTree->AllocRecvBuffers();
1416  //Post All Recv requests;
1417  redLTree->PostFirstRecv();
1418  }
1419  }
1420 
1421 
1422  TIMER_START(Compute_Sinv_LT);
1423  {
1424  Int msgForwarded = 0;
1425  Int msgToFwd = 0;
1426  Int gemmProcessed = 0;
1427  Int gemmToDo = 0;
1428  // Int toRecvGemm = 0;
1429  //copy the list of supernodes we need to process
1430  std::list<Int> readySupidx;
1431  //find local things to do
1432  for(Int supidx = 0;supidx<stepSuper;supidx++){
1433  SuperNodeBufferType & snode = arrSuperNodes[supidx];
1434 
1435 
1436  if( isRecvFromAbove_( snode.Index ) && isRecvFromLeft_( snode.Index )){
1437  gemmToDo++;
1438  if( MYCOL( grid_ ) == PCOL( snode.Index, grid_ ) ){
1439  snode.isReady++;
1440  }
1441 
1442  if( MYROW( grid_ ) == PROW( snode.Index, grid_ ) ){
1443  snode.isReady++;
1444  }
1445 
1446  if(snode.isReady==2){
1447  readySupidx.push_back(supidx);
1448 #if ( _DEBUGlevel_ >= 1 )
1449  statusOFS<<std::endl<<"Locally processing ["<<snode.Index<<"]"<<std::endl;
1450 #endif
1451  }
1452  }
1453  else if( (isRecvFromLeft_( snode.Index ) ) && MYCOL( grid_ ) != PCOL( snode.Index, grid_ ) )
1454  {
1455  //Get the reduction tree
1456  TreeReduce<T> * redLTree = redToLeftTree_[snode.Index];
1457 
1458  if(redLTree != NULL){
1459  TIMER_START(Reduce_Sinv_LT_Isend);
1460  // Set the local buffer to NULL so this rank contributes a 0-byte message to the reduction
1461  redLTree->SetLocalBuffer(NULL);
1462  redLTree->SetDataReady(true);
1463  bool done = redLTree->Progress();
1464  TIMER_STOP(Reduce_Sinv_LT_Isend);
1465  }
1466  }// if( isRecvFromAbove_( snode.Index ) && isRecvFromLeft_( snode.Index ))
1467 
1468  if(MYROW(grid_)!=PROW(snode.Index,grid_)){
1469  TreeBcast * bcastUTree = fwdToBelowTree_[snode.Index];
1470  if(bcastUTree != NULL){
1471  if(bcastUTree->GetDestCount()>0){
1472  msgToFwd++;
1473  }
1474  }
1475  }
1476 
1477  if(MYCOL(grid_)!=PCOL(snode.Index,grid_)){
1478  TreeBcast * bcastLTree = fwdToRightTree_[snode.Index];
1479  if(bcastLTree != NULL){
1480  if(bcastLTree->GetDestCount()>0){
1481  msgToFwd++;
1482  }
1483  }
1484  }
1485  }
1486 
1487 #if ( _DEBUGlevel_ >= 1 )
1488  statusOFS<<std::endl<<"gemmToDo ="<<gemmToDo<<std::endl;
1489  statusOFS<<std::endl<<"msgToFwd ="<<msgToFwd<<std::endl;
1490 #endif
1491 
1492 
1493 #if defined (PROFILE)
1494  end_SendULWaitContentFirst=0;
1495  begin_SendULWaitContentFirst=0;
1496 #endif
1497 
1498  while(gemmProcessed<gemmToDo || msgForwarded < msgToFwd)
1499  {
1500  Int reqidx = MPI_UNDEFINED;
1501  Int supidx = -1;
1502 
1503 
1504  //while I don't have anything to do, wait for data to arrive
1505  do{
1506 
1507 
1508  int reqIndices[arrMpireqsRecvContentFromAny.size()];
1509  int numRecv = 0;
1510 
1512  // then process the remote ones
1512 
1513  TIMER_START(WaitContent_UL);
1514 #if defined(PROFILE)
1515  if(begin_SendULWaitContentFirst==0){
1516  begin_SendULWaitContentFirst=1;
1517  TIMER_START(WaitContent_UL_First);
1518  }
1519 #endif
1520 
1521  numRecv = 0;
1522  MPI_Waitsome(2*stepSuper, &arrMpireqsRecvContentFromAny[0], &numRecv, reqIndices, MPI_STATUSES_IGNORE);
1523 
1524  for(int i =0;i<numRecv;i++){
1525  reqidx = reqIndices[i];
1526  //I've received something
1527  if(reqidx!=MPI_UNDEFINED)
1528  {
1529  //this stays true
1530  supidx = reqidx/2;
1531  SuperNodeBufferType & snode = arrSuperNodes[supidx];
1532 
1533  //If it's a U block
1534  if(reqidx%2==0){
1535  TreeBcast * bcastUTree = fwdToBelowTree_[snode.Index];
1536  if(bcastUTree != NULL){
1537  if(bcastUTree->GetDestCount()>0){
1538 
1539  std::vector<MPI_Request> & mpireqsSendToBelow = arrMpireqsSendToBelow[supidx];
1540 #if ( _DEBUGlevel_ >= 1 )
1541  for( Int idxRecv = 0; idxRecv < bcastUTree->GetDestCount(); ++idxRecv ){
1542  Int iProcRow = bcastUTree->GetDest(idxRecv);
1543  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Forwarding U " << snode.SizeSstrUrowRecv << " BYTES to "<<iProcRow<<" on tag "<<IDX_TO_TAG(snode.Index,SELINV_TAG_U_CONTENT)<< std::endl << std::endl;
1544  }
1545 #endif
1546 
1547  bcastUTree->ForwardMessage( (char*)&snode.SstrUrowRecv[0], snode.SizeSstrUrowRecv,
1548  IDX_TO_TAG(snode.Index,SELINV_TAG_U_CONTENT), &mpireqsSendToBelow[0] );
1549 #if ( _DEBUGlevel_ >= 1 )
1550  for( Int idxRecv = 0; idxRecv < bcastUTree->GetDestCount(); ++idxRecv ){
1551  Int iProcRow = bcastUTree->GetDest(idxRecv);
1552  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Forwarded U " << snode.SizeSstrUrowRecv << " BYTES to "<<iProcRow<<" on tag "<<IDX_TO_TAG(snode.Index,SELINV_TAG_U_CONTENT)<< std::endl << std::endl;
1553  }
1554 #endif
1555  msgForwarded++;
1556  }
1557  }
1558  }
1559  //If it's a L block
1560  else if(reqidx%2==1){
1561  TreeBcast * bcastLTree = fwdToRightTree_[snode.Index];
1562  if(bcastLTree != NULL){
1563  if(bcastLTree->GetDestCount()>0){
1564 
1565  std::vector<MPI_Request> & mpireqsSendToRight = arrMpireqsSendToRight[supidx];
1566 #if ( _DEBUGlevel_ >= 1 )
1567  for( Int idxRecv = 0; idxRecv < bcastLTree->GetDestCount(); ++idxRecv ){
1568  Int iProcCol = bcastLTree->GetDest(idxRecv);
1569  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Forwarding L " << snode.SizeSstrLcolRecv << " BYTES to "<<iProcCol<<" on tag "<<IDX_TO_TAG(snode.Index,SELINV_TAG_L_CONTENT)<< std::endl << std::endl;
1570  }
1571 #endif
1572 
1573  bcastLTree->ForwardMessage( (char*)&snode.SstrLcolRecv[0], snode.SizeSstrLcolRecv,
1574  IDX_TO_TAG(snode.Index,SELINV_TAG_L_CONTENT), &mpireqsSendToRight[0] );
1575 
1576  // for( Int idxRecv = 0; idxRecv < bcastLTree->GetDestCount(); ++idxRecv ){
1577  // Int iProcCol = bcastLTree->GetDest(idxRecv);
1578  // PROFILE_COMM(MYPROC(this->grid_),PNUM(MYROW(this->grid_),iProcCol,this->grid_),IDX_TO_TAG(snode.Index,SELINV_TAG_L_CONTENT),snode.SizeSstrLcolRecv);
1579  // }
1580 #if ( _DEBUGlevel_ >= 1 )
1581  for( Int idxRecv = 0; idxRecv < bcastLTree->GetDestCount(); ++idxRecv ){
1582  Int iProcCol = bcastLTree->GetDest(idxRecv);
1583  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Forwarded L " << snode.SizeSstrLcolRecv << " BYTES to "<<iProcCol<<" on tag "<<IDX_TO_TAG(snode.Index,SELINV_TAG_L_CONTENT)<< std::endl << std::endl;
1584  }
1585 #endif
1586  msgForwarded++;
1587  }
1588  }
1589  }
1590 
1591 
1592 #if ( _DEBUGlevel_ >= 1 )
1593  statusOFS<<std::endl<<"Received data for ["<<snode.Index<<"] reqidx%2="<<reqidx%2<<" is ready ?"<<snode.isReady<<std::endl;
1594 #endif
1595 
1596  if( isRecvFromAbove_( snode.Index ) && isRecvFromLeft_( snode.Index )){
1597  snode.isReady++;
1598 
1599  //if we received both L and U, the supernode is ready
1600  if(snode.isReady==2){
1601  readySupidx.push_back(supidx);
1602 
1603 #if defined(PROFILE)
1604  if(end_SendULWaitContentFirst==0){
1605  TIMER_STOP(WaitContent_UL_First);
1606  end_SendULWaitContentFirst=1;
1607  }
1608 #endif
1609  }
1610  }
1611  }
1612 
1613  }//end for waitsome
1614 
1615  TIMER_STOP(WaitContent_UL);
1616 
1617  } while( (gemmProcessed<gemmToDo && readySupidx.size()==0) || (gemmProcessed==gemmToDo && msgForwarded<msgToFwd) );
1618 
1619  //If I have some work to do
1620  if(readySupidx.size()>0)
1621  {
1622  supidx = readySupidx.back();
1623  readySupidx.pop_back();
1624  SuperNodeBufferType & snode = arrSuperNodes[supidx];
1625 
1626 
1627  // Only the processors that received information participate in the Gemm
1628  if( isRecvFromAbove_( snode.Index ) && isRecvFromLeft_( snode.Index ) ){
1629 
1630  std::vector<LBlock<T> > LcolRecv;
1631  std::vector<UBlock<T> > UrowRecv;
1632  // Save all the data to be updated for { L( isup, snode.Index ) | isup > snode.Index }.
1633  // The size will be updated in the Gemm phase and the reduce phase
1634 
1635  UnpackData(snode, LcolRecv, UrowRecv);
1636 
1637  //NumMat<T> AinvBuf, UBuf;
1638  SelInv_lookup_indexes(snode,LcolRecv, UrowRecv,AinvBuf,UBuf);
1639 
1640  snode.LUpdateBuf.Resize( AinvBuf.m(), SuperSize( snode.Index, super_ ) );
1641 
1642 #ifdef GEMM_PROFILE
1643  gemm_stat.push_back(AinvBuf.m());
1644  gemm_stat.push_back(UBuf.m());
1645  gemm_stat.push_back(AinvBuf.n());
1646 #endif
1647 
1648  TIMER_START(Compute_Sinv_LT_GEMM);
1649  blas::Gemm( 'N', 'T', AinvBuf.m(), UBuf.m(), AinvBuf.n(), MINUS_ONE<T>(),
1650  AinvBuf.Data(), AinvBuf.m(),
1651  UBuf.Data(), UBuf.m(), ZERO<T>(),
1652  snode.LUpdateBuf.Data(), snode.LUpdateBuf.m() );
1653  TIMER_STOP(Compute_Sinv_LT_GEMM);
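  // The call above forms snode.LUpdateBuf = - AinvBuf * UBuf^T, an
  // (AinvBuf.m() x UBuf.m()) product matching the shape given to LUpdateBuf above;
  // this local contribution is then reduced to the owning column via redToLeftTree_.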
1654 
1655 
1656 #if ( _DEBUGlevel_ >= 2 )
1657  statusOFS << std::endl << "["<<snode.Index<<"] "<< "snode.LUpdateBuf: " << snode.LUpdateBuf << std::endl;
1658 #endif
1659  } // if Gemm is to be done locally
1660 
1661  //Get the reduction tree
1662  TreeReduce<T> * redLTree = redToLeftTree_[snode.Index];
1663  if(redLTree != NULL){
1664  TIMER_START(Reduce_Sinv_LT_Isend);
1665  //send the data
1666  redLTree->SetLocalBuffer(snode.LUpdateBuf.Data());
1667  redLTree->SetDataReady(true);
1668  bool done = redLTree->Progress();
1669  TIMER_STOP(Reduce_Sinv_LT_Isend);
1670  }
1671  gemmProcessed++;
1672 
1673 #if ( _DEBUGlevel_ >= 1 )
1674  statusOFS<<std::endl<<"gemmProcessed ="<<gemmProcessed<<"/"<<gemmToDo<<std::endl;
1675 #endif
1676 
1677  //advance reductions
1678  for (Int supidx=0; supidx<stepSuper; supidx++){
1679  SuperNodeBufferType & snode = arrSuperNodes[supidx];
1680  TreeReduce<T> * redLTree = redToLeftTree_[snode.Index];
1681  if(redLTree != NULL && !redLdone[supidx]){
1682  bool done = redLTree->Progress();
1683  }
1684  }
1685  }
1686  }
1687 
1688  }
1689  TIMER_STOP(Compute_Sinv_LT);
1690 
1691  //Reduce Sinv L^T to the processors in PCOL(ksup,grid_)
1692 
1693 
1694  TIMER_START(Reduce_Sinv_LT);
1695  //blocking wait for the reduction
1696  bool all_done = false;
1697  while(!all_done)
1698  {
1699  all_done = true;
1700 
1701  for (Int supidx=0; supidx<stepSuper; supidx++){
1702  SuperNodeBufferType & snode = arrSuperNodes[supidx];
1703 
1704  TreeReduce<T> * redLTree = redToLeftTree_[snode.Index];
1705 
1706  if(redLTree != NULL && !redLdone[supidx]){
1707  bool done = redLTree->Progress();
1708  if(done){
1709  if( MYCOL( grid_ ) == PCOL( snode.Index, grid_ ) ){
1710  //determine the number of rows in LUpdateBufReduced
1711  Int numRowLUpdateBuf;
1712  std::vector<LBlock<T> >& Lcol = this->L( LBj( snode.Index, grid_ ) );
1713  if( MYROW( grid_ ) != PROW( snode.Index, grid_ ) ){
1714  snode.RowLocalPtr.resize( Lcol.size() + 1 );
1715  snode.BlockIdxLocal.resize( Lcol.size() );
1716  snode.RowLocalPtr[0] = 0;
1717  for( Int ib = 0; ib < Lcol.size(); ib++ ){
1718  snode.RowLocalPtr[ib+1] = snode.RowLocalPtr[ib] + Lcol[ib].numRow;
1719  snode.BlockIdxLocal[ib] = Lcol[ib].blockIdx;
1720  }
1721  } // I do not own the diagonal block
1722  else{
1723  snode.RowLocalPtr.resize( Lcol.size() );
1724  snode.BlockIdxLocal.resize( Lcol.size() - 1 );
1725  snode.RowLocalPtr[0] = 0;
1726  for( Int ib = 1; ib < Lcol.size(); ib++ ){
1727  snode.RowLocalPtr[ib] = snode.RowLocalPtr[ib-1] + Lcol[ib].numRow;
1728  snode.BlockIdxLocal[ib-1] = Lcol[ib].blockIdx;
1729  }
1730  } // I own the diagonal block, skip the diagonal block
1731  numRowLUpdateBuf = *snode.RowLocalPtr.rbegin();
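  // RowLocalPtr holds the prefix sums of the retained (off-diagonal) block row
  // counts, so its entries give each retained block's row offset inside
  // LUpdateBuf and its last entry is the total number of rows to receive.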
1732 
1733  if( numRowLUpdateBuf > 0 ){
1734  if( snode.LUpdateBuf.m() == 0 && snode.LUpdateBuf.n() == 0 ){
1735  snode.LUpdateBuf.Resize( numRowLUpdateBuf,SuperSize( snode.Index, super_ ) );
1736  }
1737  }
1738 
1739  //copy the buffer from the reduce tree
1740  redLTree->SetLocalBuffer(snode.LUpdateBuf.Data());
1741 
1742  }
1743  redLdone[supidx]=1;
1744  redLTree->CleanupBuffers();
1745  }
1746 
1747  all_done = all_done && (done || redLdone[supidx]);
1748  }
1749  }
1750  }
1751  TIMER_STOP(Reduce_Sinv_LT);
1752 
1753 
1754 
1755 #ifndef _RELEASE_
1756  PopCallStack();
1757 #endif
1758  //--------------------- End of reduce of LUpdateBuf-------------------------
1759 #ifndef _RELEASE_
1760  PushCallStack("PMatrix::SelInv_P2p::UpdateD");
1761 #endif
1762 
1763  TIMER_START(Update_Diagonal);
1764 
1765  for (Int supidx=0; supidx<stepSuper; supidx++){
1766  SuperNodeBufferType & snode = arrSuperNodes[supidx];
1767 
1768  ComputeDiagUpdate(snode);
1769 
1770  //Get the reduction tree
1771  TreeReduce<T> * redDTree = redToAboveTree_[snode.Index];
1772 
1773  if(redDTree != NULL){
1774  //send the data
1775  if( MYROW( grid_ ) == PROW( snode.Index, grid_ ) ){
1776  if(snode.DiagBuf.Size()==0){
1777  snode.DiagBuf.Resize( SuperSize( snode.Index, super_ ), SuperSize( snode.Index, super_ ));
1778  SetValue(snode.DiagBuf, ZERO<T>());
1779  }
1780  }
1781 
1782 
1783  redDTree->SetLocalBuffer(snode.DiagBuf.Data());
1784  if(!redDTree->IsAllocated()){
1785  redDTree->SetTag(IDX_TO_TAG(snode.Index,SELINV_TAG_D_REDUCE));
1786  redDTree->AllocRecvBuffers();
1787  //Post All Recv requests;
1788  redDTree->PostFirstRecv();
1789  }
1790 
1791  redDTree->SetDataReady(true);
1792  bool done = redDTree->Progress();
1793  }
1794 
1795  //advance reductions
1796  for (Int supidx=0; supidx<stepSuper; supidx++){
1797  SuperNodeBufferType & snode = arrSuperNodes[supidx];
1798  TreeReduce<T> * redDTree = redToAboveTree_[snode.Index];
1799  if(redDTree != NULL){
1800  if(redDTree->IsAllocated()){
1801  bool done = redDTree->Progress();
1802  }
1803  }
1804  }
1805  }
1806  TIMER_STOP(Update_Diagonal);
1807 
1808  TIMER_START(Reduce_Diagonal);
1809  //blocking wait for the reduction
1810  {
1811  vector<char> is_done(stepSuper,0);
1812  bool all_done = false;
1813  while(!all_done)
1814  {
1815  all_done = true;
1816 
1817  for (Int supidx=0; supidx<stepSuper; supidx++){
1818  SuperNodeBufferType & snode = arrSuperNodes[supidx];
1819  TreeReduce<T> * redDTree = redToAboveTree_[snode.Index];
1820 
1821  if(redDTree != NULL){
1822  bool done = redDTree->Progress();
1823  if(done && !is_done[supidx]){
1824  if( MYCOL( grid_ ) == PCOL( snode.Index, grid_ ) ){
1825  if( MYROW( grid_ ) == PROW( snode.Index, grid_ ) ){
1826  LBlock<T> & LB = this->L( LBj( snode.Index, grid_ ) )[0];
1827  // Symmetrize LB
1828  blas::Axpy( LB.numRow * LB.numCol, ONE<T>(), snode.DiagBuf.Data(), 1, LB.nzval.Data(), 1 );
1829  Symmetrize( LB.nzval );
1830  }
1831  }
1832 
1833  is_done[supidx]=1;
1834  redDTree->CleanupBuffers();
1835  }
1836 
1837  all_done = all_done && (done || is_done[supidx]);
1838  }
1839  }
1840  }
1841  }
1842  TIMER_STOP(Reduce_Diagonal);
1843 
1844 #ifndef _RELEASE_
1845  PopCallStack();
1846 #endif
1847 
1848 
1849 #ifndef _RELEASE_
1850  PushCallStack("PMatrix::SelInv_P2p::UpdateU");
1851 #endif
1852 
1853 
1854 
1855  SendRecvCD_UpdateU(arrSuperNodes, stepSuper);
1856 
1857 #ifndef _RELEASE_
1858  PopCallStack();
1859 #endif
1860 
1861 #ifndef _RELEASE_
1862  PushCallStack("PMatrix::SelInv_P2p::UpdateLFinal");
1863 #endif
1864 
1865  TIMER_START(Update_L);
1866  for (Int supidx=0; supidx<stepSuper; supidx++){
1867  SuperNodeBufferType & snode = arrSuperNodes[supidx];
1868 
1869 #if ( _DEBUGlevel_ >= 1 )
1870  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Finish updating the L part by filling LUpdateBufReduced back to L" << std::endl << std::endl;
1871 #endif
1872 
1873  if( MYCOL( grid_ ) == PCOL( snode.Index, grid_ ) && snode.LUpdateBuf.m() > 0 ){
1874  std::vector<LBlock<T> >& Lcol = this->L( LBj( snode.Index, grid_ ) );
1875  //Need to skip the diagonal block if present
1876  Int startBlock = (MYROW( grid_ ) == PROW( snode.Index, grid_ ))?1:0;
1877  for( Int ib = startBlock; ib < Lcol.size(); ib++ ){
1878  LBlock<T> & LB = Lcol[ib];
1879  lapack::Lacpy( 'A', LB.numRow, LB.numCol, &snode.LUpdateBuf( snode.RowLocalPtr[ib-startBlock], 0 ),
1880  snode.LUpdateBuf.m(), LB.nzval.Data(), LB.numRow );
1881  }
1882  } // Finish updating L
1883  } // for (snode.Index) : Main loop
1884  TIMER_STOP(Update_L);
1885 
1886 #ifndef _RELEASE_
1887  PopCallStack();
1888 #endif
1889 
1890 
1891  TIMER_START(Barrier);
1892 
1893 
1894  for (Int supidx=0; supidx<stepSuper; supidx++){
1895  SuperNodeBufferType & snode = arrSuperNodes[supidx];
1896  TreeReduce<T> * redLTree = redToLeftTree_[snode.Index];
1897 
1898  if(redLTree != NULL){
1899  redLTree->Wait();
1900  redLTree->CleanupBuffers();
1901  }
1902  }
1903 
1904 
1905  for (Int supidx=0; supidx<stepSuper; supidx++){
1906  SuperNodeBufferType & snode = arrSuperNodes[supidx];
1907  TreeReduce<T> * redDTree = redToAboveTree_[snode.Index];
1908 
1909  if(redDTree != NULL){
1910  redDTree->Wait();
1911  redDTree->CleanupBuffers();
1912  }
1913  }
1914 
1915  mpi::Waitall(arrMpireqsRecvContentFromAny);
1916 
1917  for (Int supidx=0; supidx<stepSuper; supidx++){
1918  Int ksup = superList[lidx][supidx];
1919  std::vector<MPI_Request> & mpireqsSendToRight = arrMpireqsSendToRight[supidx];
1920  std::vector<MPI_Request> & mpireqsSendToBelow = arrMpireqsSendToBelow[supidx];
1921 
1922 #if ( _DEBUGlevel_ >= 1 )
1923  statusOFS<<"["<<ksup<<"] mpireqsSendToRight"<<std::endl;
1924 #endif
1925  mpi::Waitall( mpireqsSendToRight );
1926 #if ( _DEBUGlevel_ >= 1 )
1927  statusOFS<<"["<<ksup<<"] mpireqsSendToBelow"<<std::endl;
1928 #endif
1929  mpi::Waitall( mpireqsSendToBelow );
1930 
1931  }
1932 
1933 #if ( _DEBUGlevel_ >= 1 )
1934  statusOFS<<"barrier done"<<std::endl;
1935 #endif
1936  TIMER_STOP(Barrier);
1937 
1938 
1939 #ifdef LIST_BARRIER
1940 #ifndef ALL_BARRIER
1941  if (options_->maxPipelineDepth!=-1)
1942 #endif
1943  {
1944  MPI_Barrier(grid_->comm);
1945  }
1946 #endif
1947 
1948  }
1949 
1950  template<typename T>
1951  void PMatrix<T>::ConstructCommunicationPattern ( )
1952  {
1953  ConstructCommunicationPattern_P2p();
1954  } // ----- end of method PMatrix::ConstructCommunicationPattern -----
1955 
1956 
1957 
1958  template<typename T>
1959  void PMatrix<T>::ConstructCommunicationPattern_P2p ( )
1960  {
1961 #ifndef _RELEASE_
1962  PushCallStack("PMatrix::ConstructCommunicationPattern_P2p");
1963 #endif
1964 
1965 
1966  Int numSuper = this->NumSuper();
1967 
1968  TIMER_START(Allocate);
1969 
1970 #ifndef _RELEASE_
1971  PushCallStack( "Initialize the communication pattern" );
1972 #endif
1973 
1974 
1975  fwdToBelowTree_.resize(numSuper, NULL );
1976  fwdToRightTree_.resize(numSuper, NULL );
1977  redToLeftTree_.resize(numSuper, NULL );
1978  redToAboveTree_.resize(numSuper, NULL );
1979 
1980  isSendToBelow_.Resize(grid_->numProcRow, numSuper);
1981  isSendToRight_.Resize(grid_->numProcCol, numSuper);
1982  isSendToDiagonal_.Resize( numSuper );
1983  SetValue( isSendToBelow_, false );
1984  SetValue( isSendToRight_, false );
1985  SetValue( isSendToDiagonal_, false );
1986 
1987  isSendToCrossDiagonal_.Resize(grid_->numProcCol+1, numSuper );
1988  SetValue( isSendToCrossDiagonal_, false );
1989  isRecvFromCrossDiagonal_.Resize(grid_->numProcRow+1, numSuper );
1990  SetValue( isRecvFromCrossDiagonal_, false );
1991 
1992  isRecvFromAbove_.Resize( numSuper );
1993  isRecvFromLeft_.Resize( numSuper );
1994  isRecvFromBelow_.Resize( grid_->numProcRow, numSuper );
1995  SetValue( isRecvFromAbove_, false );
1996  SetValue( isRecvFromBelow_, false );
1997  SetValue( isRecvFromLeft_, false );
1998 #ifndef _RELEASE_
1999  PopCallStack();
2000 #endif
2001 
2002  TIMER_STOP(Allocate);
2003 
2004  TIMER_START(GetEtree);
2005  std::vector<Int> snodeEtree(this->NumSuper());
2006  GetEtree(snodeEtree);
2007  TIMER_STOP(GetEtree);
2008 
2009 
2010 
2011 #ifndef _RELEASE_
2012  PushCallStack( "Local column communication" );
2013 #endif
2014 #if ( _DEBUGlevel_ >= 1 )
2015  statusOFS << std::endl << "Local column communication" << std::endl;
2016 #endif
2017  // localColBlockRowIdx stores the nonzero block indices for each local block column.
2018  // The nonzero block indices include contributions from both L and U.
2019  // Dimension: numLocalBlockCol x numNonzeroBlock
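  // For example (hypothetical pattern): if the local block column of supernode 0
  // has nonzero blocks in block rows {0, 2, 5} coming from L and {0, 3} coming
  // from U, then localColBlockRowIdx[LBj(0,grid_)] = {0, 2, 3, 5} after the
  // allgather below.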
2020  std::vector<std::set<Int> > localColBlockRowIdx;
2021 
2022  localColBlockRowIdx.resize( this->NumLocalBlockCol() );
2023 
2024 
2025  TIMER_START(Column_communication);
2026 
2027 
2028  for( Int ksup = 0; ksup < numSuper; ksup++ ){
2029  // All block columns perform independently
2030  if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
2031 
2032 
2033  // Communication
2034  std::vector<Int> tAllBlockRowIdx;
2035  std::vector<Int> & colBlockIdx = ColBlockIdx(LBj(ksup, grid_));
2036  TIMER_START(Allgatherv_Column_communication);
2037  if( grid_ -> mpisize != 1 )
2038  mpi::Allgatherv( colBlockIdx, tAllBlockRowIdx, grid_->colComm );
2039  else
2040  tAllBlockRowIdx = colBlockIdx;
2041 
2042  TIMER_STOP(Allgatherv_Column_communication);
2043 
2044  localColBlockRowIdx[LBj( ksup, grid_ )].insert(
2045  tAllBlockRowIdx.begin(), tAllBlockRowIdx.end() );
2046 
2047 #if ( _DEBUGlevel_ >= 1 )
2048  statusOFS
2049  << " Column block " << ksup
2050  << " has the following nonzero block rows" << std::endl;
2051  for( std::set<Int>::iterator si = localColBlockRowIdx[LBj( ksup, grid_ )].begin();
2052  si != localColBlockRowIdx[LBj( ksup, grid_ )].end();
2053  si++ ){
2054  statusOFS << *si << " ";
2055  }
2056  statusOFS << std::endl;
2057 #endif
2058 
2059  } // if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) )
2060  } // for(ksup)
2061 
2062 
2063 #ifndef _RELEASE_
2064  PopCallStack();
2065 #endif
2066 
2067  TIMER_STOP(Column_communication);
2068 
2069  TIMER_START(Row_communication);
2070 #ifndef _RELEASE_
2071  PushCallStack( "Local row communication" );
2072 #endif
2073 #if ( _DEBUGlevel_ >= 1 )
2074  statusOFS << std::endl << "Local row communication" << std::endl;
2075 #endif
2076  // localRowBlockColIdx stores the nonzero block indices for each local block row.
2077  // The nonzero block indices include contributions from both L and U.
2078  // Dimension: numLocalBlockRow x numNonzeroBlock
2079  std::vector<std::set<Int> > localRowBlockColIdx;
2080 
2081  localRowBlockColIdx.resize( this->NumLocalBlockRow() );
2082 
2083  for( Int ksup = 0; ksup < numSuper; ksup++ ){
2084  // All block rows perform independently
2085  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
2086 
2087  // Communication
2088  std::vector<Int> tAllBlockColIdx;
2089  std::vector<Int> & rowBlockIdx = RowBlockIdx(LBi(ksup, grid_));
2090  TIMER_START(Allgatherv_Row_communication);
2091  if( grid_ -> mpisize != 1 )
2092  mpi::Allgatherv( rowBlockIdx, tAllBlockColIdx, grid_->rowComm );
2093  else
2094  tAllBlockColIdx = rowBlockIdx;
2095 
2096  TIMER_STOP(Allgatherv_Row_communication);
2097 
2098  localRowBlockColIdx[LBi( ksup, grid_ )].insert(
2099  tAllBlockColIdx.begin(), tAllBlockColIdx.end() );
2100 
2101 #if ( _DEBUGlevel_ >= 1 )
2102  statusOFS
2103  << " Row block " << ksup
2104  << " has the following nonzero block columns" << std::endl;
2105  for( std::set<Int>::iterator si = localRowBlockColIdx[LBi( ksup, grid_ )].begin();
2106  si != localRowBlockColIdx[LBi( ksup, grid_ )].end();
2107  si++ ){
2108  statusOFS << *si << " ";
2109  }
2110  statusOFS << std::endl;
2111 #endif
2112 
2113  } // if( MYROW( grid_ ) == PROW( ksup, grid_ ) )
2114  } // for(ksup)
2115 
2116 #ifndef _RELEASE_
2117  PopCallStack();
2118 #endif
2119 
2120  TIMER_STOP(Row_communication);
2121 
2122  TIMER_START(STB_RFA);
2123 #ifndef _RELEASE_
2124  PushCallStack("SendToBelow / RecvFromAbove");
2125 #endif
2126  for( Int ksup = 0; ksup < numSuper - 1; ksup++ ){
2127  // Loop over all the supernodes to the right of ksup
2128  Int jsup = snodeEtree[ksup];
2129  while(jsup<numSuper){
2130  Int jsupLocalBlockCol = LBj( jsup, grid_ );
2131  Int jsupProcCol = PCOL( jsup, grid_ );
2132  if( MYCOL( grid_ ) == jsupProcCol ){
2133  // SendToBelow / RecvFromAbove only if (ksup, jsup) is nonzero.
2134  if( localColBlockRowIdx[jsupLocalBlockCol].count( ksup ) > 0 ) {
2135  for( std::set<Int>::iterator si = localColBlockRowIdx[jsupLocalBlockCol].begin();
2136  si != localColBlockRowIdx[jsupLocalBlockCol].end(); si++ ){
2137  Int isup = *si;
2138  Int isupProcRow = PROW( isup, grid_ );
2139  if( isup > ksup ){
2140  if( MYROW( grid_ ) == isupProcRow ){
2141  isRecvFromAbove_(ksup) = true;
2142  }
2143  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
2144  isSendToBelow_( isupProcRow, ksup ) = true;
2145  }
2146  } // if( isup > ksup )
2147  } // for (si)
2148  } // if( localColBlockRowIdx[jsupLocalBlockCol].count( ksup ) > 0 )
2149  } // if( MYCOL( grid_ ) == PCOL( jsup, grid_ ) )
2150  jsup = snodeEtree[jsup];
2151  } // for(jsup)
2152  } // for(ksup)
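  // snodeEtree[k] is the parent of supernode k in the supernodal elimination
  // tree, so the inner while loop above visits exactly the ancestors of ksup,
  // the only supernodes whose block columns can require the data of ksup.
  // A minimal sketch of the same walk (assuming roots are marked by an index
  // >= numSuper, as the while condition suggests):
  //   for( Int a = snodeEtree[ksup]; a < numSuper; a = snodeEtree[a] ){
  //     /* supernode a is an ancestor of ksup */
  //   }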
2153 #ifndef _RELEASE_
2154  PopCallStack();
2155 #endif
2156  TIMER_STOP(STB_RFA);
2157 
2158  TIMER_START(STR_RFL_RFB);
2159 #ifndef _RELEASE_
2160  PushCallStack("SendToRight / RecvFromLeft");
2161 #endif
2162  for( Int ksup = 0; ksup < numSuper - 1; ksup++ ){
2163  // Loop over all the supernodes below ksup
2164  Int isup = snodeEtree[ksup];
2165  while(isup<numSuper){
2166  Int isupLocalBlockRow = LBi( isup, grid_ );
2167  Int isupProcRow = PROW( isup, grid_ );
2168  if( MYROW( grid_ ) == isupProcRow ){
2169  // SendToRight / RecvFromLeft only if (isup, ksup) is nonzero.
2170  if( localRowBlockColIdx[isupLocalBlockRow].count( ksup ) > 0 ){
2171  for( std::set<Int>::iterator si = localRowBlockColIdx[isupLocalBlockRow].begin();
2172  si != localRowBlockColIdx[isupLocalBlockRow].end(); si++ ){
2173  Int jsup = *si;
2174  Int jsupProcCol = PCOL( jsup, grid_ );
2175  if( jsup > ksup ){
2176  if( MYCOL( grid_ ) == jsupProcCol ){
2177  isRecvFromLeft_(ksup) = true;
2178  }
2179  if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
2180  isSendToRight_( jsupProcCol, ksup ) = true;
2181  }
2182  }
2183  } // for (si)
2184  } // if( localRowBlockColIdx[isupLocalBlockRow].count( ksup ) > 0 )
2185  } // if( MYROW( grid_ ) == isupProcRow )
2186 
2187  if( MYCOL( grid_ ) == PCOL(ksup, grid_) ){
2188  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
2189  isRecvFromBelow_(isupProcRow,ksup) = true;
2190  }
2191  else if (MYROW(grid_) == isupProcRow){
2192  isSendToDiagonal_(ksup)=true;
2193  }
2194  } // if( MYCOL( grid_ ) == PCOL(ksup, grid_) )
2195  isup = snodeEtree[isup];
2196  } // for (isup)
2197  } // for (ksup)
2198 
2199 #ifndef _RELEASE_
2200  PopCallStack();
2201 #endif
2202  TIMER_STOP(STR_RFL_RFB);
2203 
2204 
2205  TIMER_START(BUILD_BCAST_TREES);
2206  //Allgather RFL values within column
2207 
2208  vector<double> SeedRFL(numSuper,0.0);
2209  vector<Int> aggRFL(numSuper);
2210  vector<Int> globalAggRFL(numSuper*grid_->numProcCol);
2211  for( Int ksup = 0; ksup < numSuper ; ksup++ ){
2212  if(MYCOL(grid_)==PCOL(ksup,grid_)){
2213  std::vector<LBlock<T> >& Lcol = this->L( LBj(ksup, grid_) );
2214  // All blocks except for the diagonal block are to be sent right
2215 
2216  Int totalSize = 0;
2217  //one integer holding the number of Lblocks
2218  totalSize+=sizeof(Int);
2219  for( Int ib = 0; ib < Lcol.size(); ib++ ){
2220  if( Lcol[ib].blockIdx > ksup ){
2221  //three indices + one IntNumVec
2222  totalSize+=3*sizeof(Int);
2223  totalSize+= sizeof(Int)+Lcol[ib].rows.ByteSize();
2224  }
2225  }
2226 
2227 
2228  aggRFL[ksup]=totalSize;
2229  SeedRFL[ksup]=rand();
2230 
2231  }
2232  else if(isRecvFromLeft_(ksup)){
2233  aggRFL[ksup]=1;
2234  }
2235  else{
2236  aggRFL[ksup]=0;
2237  }
2238  }
2239  // //allgather
2240  MPI_Allgather(&aggRFL[0],numSuper*sizeof(Int),MPI_BYTE,
2241  &globalAggRFL[0],numSuper*sizeof(Int),MPI_BYTE,
2242  grid_->rowComm);
2243  MPI_Allreduce(MPI_IN_PLACE,&SeedRFL[0],numSuper,MPI_DOUBLE,MPI_MAX,grid_->rowComm);
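  // After these two collectives every process in the row knows, for each ksup,
  // which column ranks participate in the broadcast (entries > 0), the message
  // size stored by the owning column PCOL(ksup,grid_), and a common random seed,
  // so that all participants build identical broadcast trees below.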
2244 
2245  vector<double> SeedRFA(numSuper,0.0);
2246  vector<Int> aggRFA(numSuper);
2247  vector<Int> globalAggRFA(numSuper*grid_->numProcRow);
2248  for( Int ksup = 0; ksup < numSuper ; ksup++ ){
2249  if(MYROW(grid_)==PROW(ksup,grid_)){
2250  std::vector<UBlock<T> >& Urow = this->U( LBi(ksup, grid_) );
2251  // All the U blocks of this block row are to be sent down within the process column
2252 
2253  Int totalSize = 0;
2254  //one integer holding the number of Ublocks
2255  totalSize+=sizeof(Int);
2256  for( Int jb = 0; jb < Urow.size(); jb++ ){
2257  if( Urow[jb].blockIdx >= ksup ){
2258  //three indices + one IntNumVec + one NumMat<T>
2259  totalSize+=3*sizeof(Int);
2260  totalSize+= sizeof(Int)+Urow[jb].cols.ByteSize();
2261  totalSize+= 2*sizeof(Int)+Urow[jb].nzval.ByteSize();
2262  }
2263  }
2264  aggRFA[ksup]=totalSize;
2265  SeedRFA[ksup]=rand();
2266  }
2267  else if(isRecvFromAbove_(ksup)){
2268  aggRFA[ksup]=1;
2269  }
2270  else{
2271  aggRFA[ksup]=0;
2272  }
2273  }
2274 
2275 
2276  //allgather
2277  MPI_Allgather(&aggRFA[0],numSuper*sizeof(Int),MPI_BYTE,
2278  &globalAggRFA[0],numSuper*sizeof(Int),MPI_BYTE,
2279  grid_->colComm);
2280  MPI_Allreduce(MPI_IN_PLACE,&SeedRFA[0],numSuper,MPI_DOUBLE,MPI_MAX,grid_->colComm);
2281 
2282  for( Int ksup = 0; ksup < numSuper; ksup++ ){
2283  set<Int> set_ranks;
2284  Int msgSize = 0;
2285  for( Int iProcRow = 0; iProcRow < grid_->numProcRow; iProcRow++ ){
2286  Int isRFA = globalAggRFA[iProcRow*numSuper + ksup];
2287  if(isRFA>0){
2288  if( iProcRow != PROW( ksup, grid_ ) ){
2289  set_ranks.insert(iProcRow);
2290  }
2291  else{
2292  msgSize = isRFA;
2293  }
2294  }
2295  }
2296 
2297  if( isRecvFromAbove_(ksup) || CountSendToBelow(ksup)>0 ){
2298  vector<Int> tree_ranks;
2299  tree_ranks.push_back(PROW(ksup,grid_));
2300  tree_ranks.insert(tree_ranks.end(),set_ranks.begin(),set_ranks.end());
2301  TreeBcast * & BcastUTree = fwdToBelowTree_[ksup];
2302  BcastUTree = TreeBcast::Create(this->grid_->colComm,&tree_ranks[0],tree_ranks.size(),msgSize,SeedRFA[ksup]);
2303 #ifdef COMM_PROFILE
2304  BcastUTree->SetGlobalComm(grid_->comm);
2305 #endif
2306  }
2307 
2308  set_ranks.clear();
2309  for( Int iProcCol = 0; iProcCol < grid_->numProcCol; iProcCol++ ){
2310  Int isRFL = globalAggRFL[iProcCol*numSuper + ksup];
2311  if(isRFL>0){
2312  if( iProcCol != PCOL( ksup, grid_ ) ){
2313  set_ranks.insert(iProcCol);
2314  }
2315  else{
2316  msgSize = isRFL;
2317  }
2318  }
2319  }
2320 
2321  if( isRecvFromLeft_(ksup) || CountSendToRight(ksup)>0 ){
2322  vector<Int> tree_ranks;
2323  tree_ranks.push_back(PCOL(ksup,grid_));
2324  tree_ranks.insert(tree_ranks.end(),set_ranks.begin(),set_ranks.end());
2325 
2326  TreeBcast * & BcastLTree = fwdToRightTree_[ksup];
2327  BcastLTree = TreeBcast::Create(this->grid_->rowComm,&tree_ranks[0],tree_ranks.size(),msgSize,SeedRFL[ksup]);
2328 #ifdef COMM_PROFILE
2329  BcastLTree->SetGlobalComm(grid_->comm);
2330 #endif
2331  }
2332  }
2333 
2334  //do the same for the other arrays
2335  TIMER_STOP(BUILD_BCAST_TREES);
2336  TIMER_START(BUILD_REDUCE_D_TREE);
2337  vector<double> SeedSTD(numSuper,0.0);
2338  vector<Int> aggSTD(numSuper);
2339  vector<Int> globalAggSTD(numSuper*grid_->numProcRow);
2340  for( Int ksup = 0; ksup < numSuper; ksup++ ){
2341  if( MYCOL( grid_ ) == PCOL(ksup, grid_) && MYROW(grid_)==PROW(ksup,grid_)){
2342  Int totalSize = sizeof(T)*SuperSize( ksup, super_ )*SuperSize( ksup, super_ );
2343  aggSTD[ksup]=totalSize;
2344  SeedSTD[ksup]=rand();
2345  }
2346  else if(isSendToDiagonal_(ksup)){
2347  aggSTD[ksup]=1;
2348  }
2349  else{
2350  aggSTD[ksup]=0;
2351  }
2352  }
2353 
2354 
2355  //allgather
2356  MPI_Allgather(&aggSTD[0],numSuper*sizeof(Int),MPI_BYTE,
2357  &globalAggSTD[0],numSuper*sizeof(Int),MPI_BYTE,
2358  grid_->colComm);
2359  MPI_Allreduce(MPI_IN_PLACE,&SeedSTD[0],numSuper,MPI_DOUBLE,MPI_MAX,grid_->colComm);
2360 
2361 
2362  for( Int ksup = 0; ksup < numSuper; ksup++ ){
2363  if( MYCOL( grid_ ) == PCOL(ksup, grid_) ){
2364  set<Int> set_ranks;
2365  Int msgSize = 0;
2366  for( Int iProcRow = 0; iProcRow < grid_->numProcRow; iProcRow++ ){
2367  Int isSTD = globalAggSTD[iProcRow*numSuper + ksup];
2368  if(isSTD>0){
2369  if( iProcRow != PROW( ksup, grid_ ) ){
2370  set_ranks.insert(iProcRow);
2371  }
2372  else{
2373  msgSize = isSTD;
2374  }
2375  }
2376  }
2377 
2378  Int amISTD = globalAggSTD[MYROW(grid_)*numSuper + ksup];
2379 
2380  // if( MYCOL( grid_ ) == PCOL(ksup, grid_) && MYROW(grid_)==PROW(ksup,grid_)){
2381  // assert(amISTD>0);
2382  // }
2383 
2384  if( amISTD ){
2385  vector<Int> tree_ranks;
2386  tree_ranks.push_back(PROW(ksup,grid_));
2387  tree_ranks.insert(tree_ranks.end(),set_ranks.begin(),set_ranks.end());
2388 
2389  //assert(set_ranks.find(MYROW(grid_))!= set_ranks.end() || MYROW(grid_)==tree_ranks[0]);
2390 
2391  TreeReduce<T> * & redDTree = redToAboveTree_[ksup];
2392 
2393 
2394  redDTree = TreeReduce<T>::Create(this->grid_->colComm,&tree_ranks[0],tree_ranks.size(),msgSize,SeedSTD[ksup]);
2395 #ifdef COMM_PROFILE
2396  redDTree->SetGlobalComm(grid_->comm);
2397 #endif
2398  }
2399  }
2400  }
2401 
2402  TIMER_STOP(BUILD_REDUCE_D_TREE);
2403 
2404 
2405 
2406  TIMER_START(BUILD_REDUCE_L_TREE);
2407  vector<double> SeedRTL(numSuper,0.0);
2408  vector<Int> aggRTL(numSuper);
2409  vector<Int> globalAggRTL(numSuper*grid_->numProcCol);
2410  for( Int ksup = 0; ksup < numSuper ; ksup++ ){
2411  if(MYCOL(grid_)==PCOL(ksup,grid_)){
2412  std::vector<LBlock<T> >& Lcol = this->L( LBj(ksup, grid_) );
2413  // Size of the local contribution to the reduction of LUpdateBuf to the left
2414 
2415  Int totalSize = 0;
2416 
2417  //determine the number of rows in LUpdateBufReduced
2418  Int numRowLUpdateBuf = 0;
2419  if( MYROW( grid_ ) != PROW( ksup, grid_ ) ){
2420  for( Int ib = 0; ib < Lcol.size(); ib++ ){
2421  numRowLUpdateBuf += Lcol[ib].numRow;
2422  }
2423  } // I do not own the diagonal block
2424  else{
2425  for( Int ib = 1; ib < Lcol.size(); ib++ ){
2426  numRowLUpdateBuf += Lcol[ib].numRow;
2427  }
2428  } // I own the diagonal block, skip the diagonal block
2429 
2430  //if(ksup==297){gdb_lock();}
2431  totalSize = numRowLUpdateBuf*SuperSize( ksup, super_ )*sizeof(T);
2432 
2433  aggRTL[ksup]=totalSize;
2434 
2435  SeedRTL[ksup]=rand();
2436  }
2437  else if(isRecvFromLeft_(ksup)){
2438  aggRTL[ksup]=1;
2439  }
2440  else{
2441  aggRTL[ksup]=0;
2442  }
2443  }
2444  // //allgather
2445  MPI_Allgather(&aggRTL[0],numSuper*sizeof(Int),MPI_BYTE,
2446  &globalAggRTL[0],numSuper*sizeof(Int),MPI_BYTE,
2447  grid_->rowComm);
2448  MPI_Allreduce(MPI_IN_PLACE,&SeedRTL[0],numSuper,MPI_DOUBLE,MPI_MAX,grid_->rowComm);
2449 
2450 
2451  for( Int ksup = 0; ksup < numSuper ; ksup++ ){
2452  set<Int> set_ranks;
2453  Int msgSize = 0;
2454  for( Int iProcCol = 0; iProcCol < grid_->numProcCol; iProcCol++ ){
2455  Int isRTL = globalAggRTL[iProcCol*numSuper + ksup];
2456  if(isRTL>0){
2457  if( iProcCol != PCOL( ksup, grid_ ) ){
2458  set_ranks.insert(iProcCol);
2459  }
2460  else{
2461  msgSize = isRTL;
2462  }
2463  }
2464  }
2465 
2466  if( isRecvFromLeft_(ksup) || CountSendToRight(ksup)>0 ){
2467  vector<Int> tree_ranks;
2468  tree_ranks.push_back(PCOL(ksup,grid_));
2469  tree_ranks.insert(tree_ranks.end(),set_ranks.begin(),set_ranks.end());
2470 
2471  TreeReduce<T> * & redLTree = redToLeftTree_[ksup];
2472  redLTree = TreeReduce<T>::Create(this->grid_->rowComm,&tree_ranks[0],tree_ranks.size(),msgSize,SeedRTL[ksup]);
2473 #ifdef COMM_PROFILE
2474  redLTree->SetGlobalComm(grid_->comm);
2475 #endif
2476  }
2477  }
2478 
2479  TIMER_STOP(BUILD_REDUCE_L_TREE);
2480 
2481 
2482 
2483 
2484 
2485 
2486 
2487 #if ( _DEBUGlevel_ >= 1 )
2488  statusOFS << std::endl << "isSendToBelow:" << std::endl;
2489  for(int j = 0;j< isSendToBelow_.n();j++){
2490  statusOFS << "["<<j<<"] ";
2491  for(int i =0; i < isSendToBelow_.m();i++){
2492  statusOFS<< isSendToBelow_(i,j) << " ";
2493  }
2494  statusOFS<<std::endl;
2495  }
2496 
2497  statusOFS << std::endl << "isRecvFromAbove:" << std::endl;
2498  for(int j = 0;j< isRecvFromAbove_.m();j++){
2499  statusOFS << "["<<j<<"] "<< isRecvFromAbove_(j)<<std::endl;
2500  }
2501 #endif
2502 #if ( _DEBUGlevel_ >= 1 )
2503  statusOFS << std::endl << "isSendToRight:" << std::endl;
2504  for(int j = 0;j< isSendToRight_.n();j++){
2505  statusOFS << "["<<j<<"] ";
2506  for(int i =0; i < isSendToRight_.m();i++){
2507  statusOFS<< isSendToRight_(i,j) << " ";
2508  }
2509  statusOFS<<std::endl;
2510  }
2511 
2512  statusOFS << std::endl << "isRecvFromLeft:" << std::endl;
2513  for(int j = 0;j< isRecvFromLeft_.m();j++){
2514  statusOFS << "["<<j<<"] "<< isRecvFromLeft_(j)<<std::endl;
2515  }
2516 
2517  statusOFS << std::endl << "isRecvFromBelow:" << std::endl;
2518  for(int j = 0;j< isRecvFromBelow_.n();j++){
2519  statusOFS << "["<<j<<"] ";
2520  for(int i =0; i < isRecvFromBelow_.m();i++){
2521  statusOFS<< isRecvFromBelow_(i,j) << " ";
2522  }
2523  statusOFS<<std::endl;
2524  }
2525 #endif
2526 
2527 
2528 
2529 
2530 
2531 
2532 
2533  TIMER_START(STCD_RFCD);
2534 
2535 
2536 #ifndef _RELEASE_
2537  PushCallStack("SendToCrossDiagonal / RecvFromCrossDiagonal");
2538 #endif
2539  for( Int ksup = 0; ksup < numSuper - 1; ksup++ ){
2540  if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
2541  for( std::set<Int>::iterator si = localColBlockRowIdx[LBj( ksup, grid_ )].begin();
2542  si != localColBlockRowIdx[LBj( ksup, grid_ )].end(); si++ ){
2543  Int isup = *si;
2544  Int isupProcRow = PROW( isup, grid_ );
2545  Int isupProcCol = PCOL( isup, grid_ );
2546  if( isup > ksup && MYROW( grid_ ) == isupProcRow ){
2547  isSendToCrossDiagonal_(grid_->numProcCol, ksup ) = true;
2548  isSendToCrossDiagonal_(isupProcCol, ksup ) = true;
2549  }
2550  } // for (si)
2551  } // if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) )
2552  } // for (ksup)
2553 
2554  for( Int ksup = 0; ksup < numSuper - 1; ksup++ ){
2555  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
2556  for( std::set<Int>::iterator si = localRowBlockColIdx[ LBi(ksup, grid_) ].begin();
2557  si != localRowBlockColIdx[ LBi(ksup, grid_) ].end(); si++ ){
2558  Int jsup = *si;
2559  Int jsupProcCol = PCOL( jsup, grid_ );
2560  Int jsupProcRow = PROW( jsup, grid_ );
2561  if( jsup > ksup && MYCOL(grid_) == jsupProcCol ){
2562  isRecvFromCrossDiagonal_(grid_->numProcRow, ksup ) = true;
2563  isRecvFromCrossDiagonal_(jsupProcRow, ksup ) = true;
2564  }
2565  } // for (si)
2566  } // if( MYROW( grid_ ) == PROW( ksup, grid_ ) )
2567  } // for (ksup)
2568 #if ( _DEBUGlevel_ >= 1 )
2569  statusOFS << std::endl << "isSendToCrossDiagonal:" << std::endl;
2570  for(int j =0; j < isSendToCrossDiagonal_.n();j++){
2571  if(isSendToCrossDiagonal_(grid_->numProcCol,j)){
2572  statusOFS << "["<<j<<"] ";
2573  for(int i =0; i < isSendToCrossDiagonal_.m()-1;i++){
2574  if(isSendToCrossDiagonal_(i,j))
2575  {
2576  statusOFS<< PNUM(PROW(j,grid_),i,grid_)<<" ";
2577  }
2578  }
2579  statusOFS<<std::endl;
2580  }
2581  }
2582 
2583  statusOFS << std::endl << "isRecvFromCrossDiagonal:" << std::endl;
2584  for(int j =0; j < isRecvFromCrossDiagonal_.n();j++){
2585  if(isRecvFromCrossDiagonal_(grid_->numProcRow,j)){
2586  statusOFS << "["<<j<<"] ";
2587  for(int i =0; i < isRecvFromCrossDiagonal_.m()-1;i++){
2588  if(isRecvFromCrossDiagonal_(i,j))
2589  {
2590  statusOFS<< PNUM(i,PCOL(j,grid_),grid_)<<" ";
2591  }
2592  }
2593  statusOFS<<std::endl;
2594  }
2595  }
2596 
2597 
2598 #endif
2599 
2600 #ifndef _RELEASE_
2601  PopCallStack();
2602 #endif
2603 
2604  TIMER_STOP(STCD_RFCD);
2605 
2606 #ifndef _RELEASE_
2607  PopCallStack();
2608 #endif
2609 
2610  //Build the list of supernodes based on the elimination tree from SuperLU
2611  GetWorkSet(snodeEtree,this->WorkingSet());
2612 
2613  return ;
2614  } // ----- end of method PMatrix::ConstructCommunicationPattern_P2p -----
2615 
2616  template<typename T>
2617  void PMatrix<T>::SelInv ( )
2618  {
2619 
2620  SelInv_P2p ( );
2621 
2622 
2623 #ifdef GEMM_PROFILE
2624  statOFS<<"m"<<"\t"<<"n"<<"\t"<<"z"<<std::endl;
2625  for(auto it = gemm_stat.begin(); it!=gemm_stat.end(); it+=3){
2626  statOFS<<*it<<"\t"<<*(it+1)<<"\t"<<*(it+2)<<std::endl;
2627  }
2628 #endif
2629 
2630 #ifdef COMM_PROFILE
2631  //std::cout<<"DUMPING COMM STATS "<<comm_stat.size()<<" "<<std::endl;
2632  commOFS<<HEADER_COMM<<std::endl;
2633  for(auto it = comm_stat.begin(); it!=comm_stat.end(); it+=4){
2634  commOFS<<LINE_COMM(it)<<std::endl;
2635  }
2636 #endif
2637  } // ----- end of method PMatrix::SelInv -----
2638 
2639 
2640 
2641  template<typename T>
2642  void PMatrix<T>::SelInv_P2p ( )
2643  {
2644  TIMER_START(SelInv_P2p);
2645 
2646 #ifndef _RELEASE_
2647  PushCallStack("PMatrix::SelInv_P2p");
2648 #endif
2649 
2650 
2651  Int numSuper = this->NumSuper();
2652 
2653  // Main loop
2654  std::vector<std::vector<Int> > & superList = this->WorkingSet();
2655  Int numSteps = superList.size();
2656 
2657  for (Int lidx=0; lidx<numSteps ; lidx++){
2658  Int stepSuper = superList[lidx].size();
2659 
2660  SelInvIntra_P2p(lidx);
2661 
2662  // if(lidx==1){ return;};
2663  }
2664 
2665 #ifndef _RELEASE_
2666  PopCallStack();
2667 #endif
2668 
2669  TIMER_STOP(SelInv_P2p);
2670 
2671  return ;
2672  } // ----- end of method PMatrix::SelInv_P2p -----
2673 
2674 
2675 
2676  template<typename T>
2677  void PMatrix<T>::PreSelInv ( )
2678  {
2679 #ifndef _RELEASE_
2680  PushCallStack("PMatrix::PreSelInv");
2681 #endif
2682 
2683  Int numSuper = this->NumSuper();
2684 
2685 #ifndef _RELEASE_
2686  PushCallStack("L(i,k) <- L(i,k) * L(k,k)^{-1}");
2687 #endif
2688 #if ( _DEBUGlevel_ >= 1 )
2689  statusOFS << std::endl << "L(i,k) <- L(i,k) * L(k,k)^{-1}" << std::endl << std::endl;
2690 #endif
2691  for( Int ksup = 0; ksup < numSuper; ksup++ ){
2692  if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
2693  // Broadcast the diagonal L block
2694  NumMat<T> nzvalLDiag;
2695  std::vector<LBlock<T> >& Lcol = this->L( LBj( ksup, grid_ ) );
2696  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
2697  nzvalLDiag = Lcol[0].nzval;
2698  if( nzvalLDiag.m() != SuperSize(ksup, super_) ||
2699  nzvalLDiag.n() != SuperSize(ksup, super_) ){
2700 #ifdef USE_ABORT
2701  abort();
2702 #endif
2703  throw std::runtime_error( "The size of the diagonal block of L is wrong." );
2704  }
2705  } // Owns the diagonal block
2706  else
2707  {
2708  nzvalLDiag.Resize( SuperSize(ksup, super_), SuperSize(ksup, super_) );
2709  }
2710  MPI_Bcast( (void*)nzvalLDiag.Data(), nzvalLDiag.ByteSize(),
2711  MPI_BYTE, PROW( ksup, grid_ ), grid_->colComm );
2712 
2713  // Triangular solve
2714  for( Int ib = 0; ib < Lcol.size(); ib++ ){
2715  LBlock<T> & LB = Lcol[ib];
2716  if( LB.blockIdx > ksup ){
2717 #if ( _DEBUGlevel_ >= 2 )
2718  // Check the correctness of the triangular solve for the first local column
2719  if( LBj( ksup, grid_ ) == 0 ){
2720  statusOFS << "Diag L(" << ksup << ", " << ksup << "): " << nzvalLDiag << std::endl;
2721  statusOFS << "Before solve L(" << LB.blockIdx << ", " << ksup << "): " << LB.nzval << std::endl;
2722  }
2723 #endif
2724  blas::Trsm( 'R', 'L', 'N', 'U', LB.numRow, LB.numCol, ONE<T>(),
2725  nzvalLDiag.Data(), LB.numCol, LB.nzval.Data(), LB.numRow );
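  // Right-sided solve with the unit lower-triangular diagonal block:
  // LB.nzval <- LB.nzval * L(ksup,ksup)^{-1}, which is the
  // L(i,k) <- L(i,k) * L(k,k)^{-1} step announced above.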
2726 #if ( _DEBUGlevel_ >= 2 )
2727  // Check the correctness of the triangular solve for the first local column
2728  if( LBj( ksup, grid_ ) == 0 ){
2729  statusOFS << "After solve L(" << LB.blockIdx << ", " << ksup << "): " << LB.nzval << std::endl;
2730  }
2731 #endif
2732  }
2733  }
2734  } // if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) )
2735  } // for (ksup)
2736 
2737 
2738 #ifndef _RELEASE_
2739  PopCallStack();
2740 #endif
2741 
2742 
2743 #ifndef _RELEASE_
2744  PushCallStack("U(k,i) <- L(i,k)");
2745 #endif
2746 #if ( _DEBUGlevel_ >= 1 )
2747  statusOFS << std::endl << "U(k,i) <- L(i,k)" << std::endl << std::endl;
2748 #endif
2749 
2750  for( Int ksup = 0; ksup < numSuper; ksup++ ){
2751  Int ksupProcRow = PROW( ksup, grid_ );
2752  Int ksupProcCol = PCOL( ksup, grid_ );
2753 
2754  Int sendCount = CountSendToCrossDiagonal(ksup);
2755  Int recvCount = CountRecvFromCrossDiagonal(ksup);
2756 
2757  std::vector<MPI_Request > arrMpiReqsSend(sendCount, MPI_REQUEST_NULL );
2758  std::vector<MPI_Request > arrMpiReqsSizeSend(sendCount, MPI_REQUEST_NULL );
2759  std::vector<std::vector<char> > arrSstrLcolSend(sendCount);
2760  std::vector<Int > arrSstrLcolSizeSend(sendCount);
2761 
2762  std::vector<MPI_Request > arrMpiReqsRecv(recvCount, MPI_REQUEST_NULL );
2763  std::vector<MPI_Request > arrMpiReqsSizeRecv(recvCount, MPI_REQUEST_NULL );
2764  std::vector<std::vector<char> > arrSstrLcolRecv(recvCount);
2765  std::vector<Int > arrSstrLcolSizeRecv(recvCount);
2766 
2767 
2768 
2769  // Sender
2770  if( isSendToCrossDiagonal_(grid_->numProcCol,ksup) ){
2771 #if ( _DEBUGlevel_ >= 1 )
2772  statusOFS<<"["<<ksup<<"] P"<<MYPROC(grid_)<<" should send to "<<CountSendToCrossDiagonal(ksup)<<" processors"<<std::endl;
2773 #endif
2774 
2775  Int sendIdx = 0;
2776  for(Int dstCol = 0; dstCol<grid_->numProcCol; dstCol++){
2777  if(isSendToCrossDiagonal_(dstCol,ksup) ){
2778  Int dst = PNUM(PROW(ksup,grid_),dstCol,grid_);
2779  if(MYPROC(grid_)!= dst){
2780  // Pack L data
2781  std::stringstream sstm;
2782  std::vector<char> & sstrLcolSend = arrSstrLcolSend[sendIdx];
2783  Int & sstrSize = arrSstrLcolSizeSend[sendIdx];
2784  MPI_Request & mpiReqSend = arrMpiReqsSend[sendIdx];
2785  MPI_Request & mpiReqSizeSend = arrMpiReqsSizeSend[sendIdx];
2786 
2787  std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
2788  std::vector<LBlock<T> >& Lcol = this->L( LBj(ksup, grid_) );
2789  // The blocks below the diagonal are sent to the cross-diagonal processors
2790  //TODO this is really a scatter operation; can we know the destination in advance?
2791 
2792  Int count = 0;
2793  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
2794  for( Int ib = 1; ib < Lcol.size(); ib++ ){
2795  if( Lcol[ib].blockIdx > ksup && (Lcol[ib].blockIdx % grid_->numProcCol) == dstCol ){
2796  count++;
2797  }
2798  }
2799  }
2800  else{
2801 
2802  for( Int ib = 0; ib < Lcol.size(); ib++ ){
2803  if( Lcol[ib].blockIdx > ksup && (Lcol[ib].blockIdx % grid_->numProcCol) == dstCol ){
2804  count++;
2805  }
2806  }
2807  }
2808 
2809  serialize( (Int)count, sstm, NO_MASK );
2810 
2811  for( Int ib = 0; ib < Lcol.size(); ib++ ){
2812  if( Lcol[ib].blockIdx > ksup && (Lcol[ib].blockIdx % grid_->numProcCol) == dstCol ){
2813 #if ( _DEBUGlevel_ >= 1 )
2814  statusOFS<<"["<<ksup<<"] SEND contains "<<Lcol[ib].blockIdx<< " which corresponds to "<<GBj(ib,grid_)<<std::endl;
2815 #endif
2816  serialize( Lcol[ib], sstm, mask );
2817  }
2818  }
2819 
2820  sstrLcolSend.resize( Size(sstm) );
2821  sstm.read( &sstrLcolSend[0], sstrLcolSend.size() );
2822  sstrSize = sstrLcolSend.size();
2823 
2824 
2825 
2826  // Send/Recv is possible here due to the one-to-one correspondence
2827  // in the case of a square processor grid
2828 
2829 #if ( _DEBUGlevel_ >= 1 )
2830  statusOFS<<"["<<ksup<<"] P"<<MYPROC(grid_)<<" ("<<MYROW(grid_)<<","<<MYCOL(grid_)<<") ---> LBj("<<ksup<<")="<<LBj(ksup,grid_)<<" ---> P"<<dst<<" ("<<ksupProcRow<<","<<dstCol<<")"<<std::endl;
2831 #endif
2832  MPI_Isend( &sstrSize, sizeof(sstrSize), MPI_BYTE, dst, SELINV_TAG_D_SIZE, grid_->comm, &mpiReqSizeSend );
2833  MPI_Isend( (void*)&sstrLcolSend[0], sstrSize, MPI_BYTE, dst, SELINV_TAG_D_CONTENT, grid_->comm, &mpiReqSend );
2834 
2835 
2836  PROFILE_COMM(MYPROC(this->grid_),dst,SELINV_TAG_D_SIZE,sizeof(sstrSize));
2837  PROFILE_COMM(MYPROC(this->grid_),dst,SELINV_TAG_D_CONTENT,sstrSize);
2838 
2839  //mpi::Send( sstm, dst,SELINV_TAG_D_SIZE, SELINV_TAG_D_CONTENT, grid_->comm );
2840 
2841  sendIdx++;
2842  } // if I am a sender
2843  }
2844  }
2845  }
2846 
2847 
2848 
2849 
2850 
2851  // Receiver
2852  if( isRecvFromCrossDiagonal_(grid_->numProcRow,ksup) ){
2853 
2854 
2855 #if ( _DEBUGlevel_ >= 1 )
2856  statusOFS<<"["<<ksup<<"] P"<<MYPROC(grid_)<<" should receive from "<<CountRecvFromCrossDiagonal(ksup)<<" processors"<<std::endl;
2857 #endif
2858 
2859 
2860  std::vector<UBlock<T> >& Urow = this->U( LBi( ksup, grid_ ) );
2861  std::vector<bool> isBlockFound(Urow.size(),false);
2862 
2863 
2864  Int recvIdx = 0;
2865  //receive size first
2866  for(Int srcRow = 0; srcRow<grid_->numProcRow; srcRow++){
2867  if(isRecvFromCrossDiagonal_(srcRow,ksup) ){
2868  std::vector<LBlock<T> > LcolRecv;
2869  Int src = PNUM(srcRow,PCOL(ksup,grid_),grid_);
2870  if(MYPROC(grid_)!= src){
2871  MPI_Request & mpiReqSizeRecv = arrMpiReqsSizeRecv[recvIdx];
2872  Int & sstrSize = arrSstrLcolSizeRecv[recvIdx];
2873 
2874  MPI_Irecv( &sstrSize, 1, MPI_INT, src, SELINV_TAG_D_SIZE, grid_->comm, &mpiReqSizeRecv );
2875 
2876  recvIdx++;
2877  }
2878  }
2879  }
2880 
2881  mpi::Waitall(arrMpiReqsSizeRecv);
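  // The cross-diagonal exchange uses a two-message protocol: the sizes are
  // received first (SELINV_TAG_D_SIZE) so that the buffers can be resized
  // before the serialized L blocks arrive (SELINV_TAG_D_CONTENT).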
2882 
2883 
2884 
2885  //receive content
2886  recvIdx = 0;
2887  for(Int srcRow = 0; srcRow<grid_->numProcRow; srcRow++){
2888  if(isRecvFromCrossDiagonal_(srcRow,ksup) ){
2889  std::vector<LBlock<T> > LcolRecv;
2890  Int src = PNUM(srcRow,PCOL(ksup,grid_),grid_);
2891  if(MYPROC(grid_)!= src){
2892  MPI_Request & mpiReqRecv = arrMpiReqsRecv[recvIdx];
2893  Int & sstrSize = arrSstrLcolSizeRecv[recvIdx];
2894  std::vector<char> & sstrLcolRecv = arrSstrLcolRecv[recvIdx];
2895  sstrLcolRecv.resize(sstrSize);
2896 
2897  MPI_Irecv( &sstrLcolRecv[0], sstrSize, MPI_BYTE, src, SELINV_TAG_D_CONTENT, grid_->comm, &mpiReqRecv );
2898 
2899  recvIdx++;
2900  }
2901  }
2902  }
2903 
2904  mpi::Waitall(arrMpiReqsRecv);
2905 
2906 
2907 
2908  //Process the content
2909  recvIdx = 0;
2910  for(Int srcRow = 0; srcRow<grid_->numProcRow; srcRow++){
2911  if(isRecvFromCrossDiagonal_(srcRow,ksup) ){
2912  std::vector<LBlock<T> > LcolRecv;
2913  Int src = PNUM(srcRow,PCOL(ksup,grid_),grid_);
2914  if(MYPROC(grid_)!= src){
2915 
2916  Int & sstrSize = arrSstrLcolSizeRecv[recvIdx];
2917  std::vector<char> & sstrLcolRecv = arrSstrLcolRecv[recvIdx];
2918  std::stringstream sstm;
2919 
2920 #if ( _DEBUGlevel_ >= 1 )
2921  statusOFS<<"["<<ksup<<"] P"<<MYPROC(grid_)<<" ("<<MYROW(grid_)<<","<<MYCOL(grid_)<<") <--- LBj("<<ksup<<") <--- P"<<src<<" ("<<srcRow<<","<<ksupProcCol<<")"<<std::endl;
2922 #endif
2923 
2924 
2925  sstm.write( &sstrLcolRecv[0], sstrSize );
2926 
2927  // Unpack L data.
2928  Int numLBlock;
2929  std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
2930  deserialize( numLBlock, sstm, NO_MASK );
2931  LcolRecv.resize(numLBlock);
2932  for( Int ib = 0; ib < numLBlock; ib++ ){
2933  deserialize( LcolRecv[ib], sstm, mask );
2934 #if ( _DEBUGlevel_ >= 1 )
2935  statusOFS<<"["<<ksup<<"] RECV contains "<<LcolRecv[ib].blockIdx<< " which corresponds to "<< ib * grid_->numProcRow + srcRow; // <<std::endl;
2936  // statusOFS<<" L is on row "<< srcRow <<" whereas U is on col "<<((ib * grid_->numProcRow + srcRow)/grid_->numProcCol)%grid_->numProcCol <<std::endl;
2937  statusOFS<<" L is on row "<< srcRow <<" whereas U is on col "<< LcolRecv[ib].blockIdx % grid_->numProcCol <<std::endl;
2938 #endif
2939  }
2940 
2941 
2942  recvIdx++;
2943 
2944  } // sender is not the same as receiver
2945  else{
2946  // L is obtained locally, just make a copy. Do not include the diagonal block
2947  std::vector<LBlock<T> >& Lcol = this->L( LBj( ksup, grid_ ) );
2948  if( MYROW( grid_ ) != PROW( ksup, grid_ ) ){
2949  LcolRecv.resize( Lcol.size() );
2950  for( Int ib = 0; ib < Lcol.size(); ib++ ){
2951  LcolRecv[ib] = Lcol[ib];
2952  }
2953  }
2954  else{
2955  LcolRecv.resize( Lcol.size() - 1 );
2956  for( Int ib = 0; ib < Lcol.size() - 1; ib++ ){
2957  LcolRecv[ib] = Lcol[ib+1];
2958  }
2959  }
2960  } // sender is the same as receiver
2961 
2962  // Update U
2963  // Make sure that the size of L and the corresponding U blocks match.
2964  for( Int ib = 0; ib < LcolRecv.size(); ib++ ){
2965  LBlock<T> & LB = LcolRecv[ib];
2966  if( LB.blockIdx <= ksup ){
2967 #ifdef USE_ABORT
2968  abort();
2969 #endif
2970  throw std::logic_error( "LcolRecv contains the wrong blocks." );
2971  }
2972  for( Int jb = 0; jb < Urow.size(); jb++ ){
2973  UBlock<T> & UB = Urow[jb];
2974  if( LB.blockIdx == UB.blockIdx ){
2975  // Compare size
2976  if( LB.numRow != UB.numCol || LB.numCol != UB.numRow ){
2977  std::ostringstream msg;
2978  msg << "LB(" << LB.blockIdx << ", " << ksup << ") and UB("
2979  << ksup << ", " << UB.blockIdx << ") do not share the same size." << std::endl
2980  << "LB: " << LB.numRow << " x " << LB.numCol << std::endl
2981  << "UB: " << UB.numRow << " x " << UB.numCol << std::endl;
2982 #ifdef USE_ABORT
2983  abort();
2984 #endif
2985  throw std::runtime_error( msg.str().c_str() );
2986  }
2987 
2988  // Note that the order of the column indices of the U
2989  // block may not follow the order of the row indices of L,
2990  // so overwrite the index information in U.
2991  UB.cols = LB.rows;
2992  Transpose( LB.nzval, UB.nzval );
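  // The net effect of the cross-diagonal exchange is U(ksup, i) <- L(i, ksup)^T,
  // with UB.cols copied from LB.rows so the (possibly unsorted) index order
  // stays consistent with the transposed values.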
2993 
2994 #if ( _DEBUGlevel_ >= 1 )
2995  statusOFS<<"["<<ksup<<"] USING "<<LB.blockIdx<< std::endl;
2996 #endif
2997  isBlockFound[jb] = true;
2998  break;
2999  } // if( LB.blockIdx == UB.blockIdx )
3000  } // for (jb)
3001  } // for (ib)
3002  }
3003  }
3004 
3005  for( Int jb = 0; jb < Urow.size(); jb++ ){
3006  UBlock<T> & UB = Urow[jb];
3007  if( !isBlockFound[jb] ){
3008 #ifdef USE_ABORT
3009  abort();
3010 #endif
3011  throw std::logic_error( "UBlock cannot find its update. Something is seriously wrong." );
3012  }
3013  }
3014 
3015 
3016 
3017  } // if I am a receiver
3018 
3019 
3020  //Wait until all the outstanding sends have completed
3021  mpi::Waitall(arrMpiReqsSizeSend);
3022  mpi::Waitall(arrMpiReqsSend);
3023 
3024 
3025  } // for (ksup)
3026 
3027 #ifndef _RELEASE_
3028  PopCallStack();
3029 #endif
3030 
3031 #ifndef _RELEASE_
3032  PushCallStack("L(i,i) <- [L(k,k) * U(k,k)]^{-1} ");
3033 #endif
3034 #if ( _DEBUGlevel_ >= 1 )
3035  statusOFS << std::endl << "L(i,i) <- [L(k,k) * U(k,k)]^{-1}" << std::endl << std::endl;
3036 #endif
3037 
3038  for( Int ksup = 0; ksup < numSuper; ksup++ ){
3039  if( MYROW( grid_ ) == PROW( ksup, grid_ ) &&
3040  MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
3041  IntNumVec ipiv( SuperSize( ksup, super_ ) );
3042  // Note that the pivoting vector ipiv should follow the FORTRAN
3043  // notation (1-based), hence the +1
3044  for(Int i = 0; i < SuperSize( ksup, super_ ); i++){
3045  ipiv[i] = i + 1;
3046  }
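  // With ipiv = {1, 2, ..., n} no row interchanges are applied, i.e. Getri
  // inverts the already factorized diagonal block in place.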
3047  LBlock<T> & LB = (this->L( LBj( ksup, grid_ ) ))[0];
3048 #if ( _DEBUGlevel_ >= 2 )
3049  // Check the correctness of the matrix inversion for the first local column
3050  statusOFS << "Factorized A (" << ksup << ", " << ksup << "): " << LB.nzval << std::endl;
3051 #endif
3052  lapack::Getri( SuperSize( ksup, super_ ), LB.nzval.Data(),
3053  SuperSize( ksup, super_ ), ipiv.Data() );
3054 
3055  // Symmetrize the diagonal block
3056  Symmetrize( LB.nzval );
3057 
3058 #if ( _DEBUGlevel_ >= 2 )
3059  // Check the correctness of the matrix inversion for the first local column
3060  statusOFS << "Inverted A (" << ksup << ", " << ksup << "): " << LB.nzval << std::endl;
3061 #endif
3062  } // if I need to inverse the diagonal block
3063  } // for (ksup)
3064 
3065 
3066 #ifndef _RELEASE_
3067  PopCallStack();
3068 #endif
3069 
3070 
3071 
3072 #ifndef _RELEASE_
3073  PopCallStack();
3074 #endif
3075 
3076  return ;
3077  } // ----- end of method PMatrix::PreSelInv -----
3078 
3079  template<typename T>
3080  void PMatrix<T>::GetDiagonal ( NumVec<T>& diag )
3081  {
3082 #ifndef _RELEASE_
3083  PushCallStack("PMatrix::GetDiagonal");
3084 #endif
3085  Int numSuper = this->NumSuper();
3086 
3087  Int numCol = this->NumCol();
3088  const IntNumVec& perm = super_->perm;
3089  const IntNumVec& permInv = super_->permInv;
3090 
3091  const IntNumVec * pPerm_r;
3092  const IntNumVec * pPermInv_r;
3093 
3094  pPerm_r = &super_->perm_r;
3095  pPermInv_r = &super_->permInv_r;
3096 
3097  const IntNumVec& perm_r = *pPerm_r;
3098  const IntNumVec& permInv_r = *pPermInv_r;
3099 
3100 
3101  NumVec<T> diagLocal( numCol );
3102  SetValue( diagLocal, ZERO<T>() );
3103 
3104  diag.Resize( numCol );
3105  SetValue( diag, ZERO<T>() );
3106 
3107  for( Int orow = 0; orow < numCol; orow++){
3108  //row index in the permuted order
3109  Int row = perm[ orow ];
3110  //col index in the permuted order
3111  Int col = perm[ perm_r[ orow] ];
3112 
3113  Int blockColIdx = BlockIdx( col, super_ );
3114  Int blockRowIdx = BlockIdx( row, super_ );
3115 
3116 
3117  // I own the block containing this (row, col) entry
3118  if( MYROW( grid_ ) == PROW( blockRowIdx, grid_ ) &&
3119  MYCOL( grid_ ) == PCOL( blockColIdx, grid_ ) ){
3120  // Search for the nzval
3121  bool isFound = false;
3122 
3123  if( blockColIdx <= blockRowIdx ){
3124  // Data on the L side
3125 
3126  std::vector<LBlock<T> >& Lcol = this->L( LBj( blockColIdx, grid_ ) );
3127 
3128  for( Int ib = 0; ib < Lcol.size(); ib++ ){
3129 #if ( _DEBUGlevel_ >= 1 )
3130  statusOFS << "blockRowIdx = " << blockRowIdx << ", Lcol[ib].blockIdx = " << Lcol[ib].blockIdx << ", blockColIdx = " << blockColIdx << std::endl;
3131 #endif
3132  if( Lcol[ib].blockIdx == blockRowIdx ){
3133  IntNumVec& rows = Lcol[ib].rows;
3134  for( int iloc = 0; iloc < Lcol[ib].numRow; iloc++ ){
3135  if( rows[iloc] == row ){
3136  Int jloc = col - FirstBlockCol( blockColIdx, super_ );
3137 
3138  diagLocal[ orow ] = Lcol[ib].nzval( iloc, jloc );
3139 
3140 
3141  isFound = true;
3142  break;
3143  } // found the corresponding row
3144  }
3145  }
3146  if( isFound == true ) break;
3147  } // for (ib)
3148  }
3149  else{
3150  // Data on the U side
3151 
3152  std::vector<UBlock<T> >& Urow = this->U( LBi( blockRowIdx, grid_ ) );
3153 
3154  for( Int jb = 0; jb < Urow.size(); jb++ ){
3155  if( Urow[jb].blockIdx == blockColIdx ){
3156  IntNumVec& cols = Urow[jb].cols;
3157  for( int jloc = 0; jloc < Urow[jb].numCol; jloc++ ){
3158  if( cols[jloc] == col ){
3159  Int iloc = row - FirstBlockRow( blockRowIdx, super_ );
3160 
3161  diagLocal[ orow ] = Urow[jb].nzval( iloc, jloc );
3162 
3163  isFound = true;
3164  break;
3165  } // found the corresponding col
3166  }
3167  }
3168  if( isFound == true ) break;
3169  } // for (jb)
3170  } // if( blockColIdx <= blockRowIdx )
3171 
3172  // Did not find the corresponding row, set the value to zero.
3173  if( isFound == false ){
3174  statusOFS << "In the permuted order, (" << row << ", " << col <<
3175  ") is not found in PMatrix." << std::endl;
3176  diagLocal[orow] = ZERO<T>();
3177  }
3178  }
3179  }
3180 
3181 // //TODO This doesn't work with row perm
3182 // for( Int ksup = 0; ksup < numSuper; ksup++ ){
3183 // Int numRows = SuperSize(ksup, this->super_);
3184 //
3185 // // I own the diagonal block
3186 // if( MYROW( grid_ ) == PROW( ksup, grid_ ) &&
3187 // MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
3188 // LBlock<T> & LB = this->L( LBj( ksup, grid_ ) )[0];
3189 // for( Int i = 0; i < LB.numRow; i++ ){
3190 // diagLocal( permInv[ LB.rows(i) ] ) = LB.nzval( i, perm[permInv_r[i]] );
3191 // }
3192 // }
3193 // }
3194 
3195  // All processors own diag
3196  mpi::Allreduce( diagLocal.Data(), diag.Data(), numCol, MPI_SUM, grid_->comm );
3197 
3198 #ifndef _RELEASE_
3199  PopCallStack();
3200 #endif
3201 
3202  return ;
3203  } // ----- end of method PMatrix::GetDiagonal -----
3204 
3205 
3206 
3207  template<typename T>
3208  void PMatrix<T>::PMatrixToDistSparseMatrix ( DistSparseMatrix<T>& A )
3209  {
3210 #ifndef _RELEASE_
3211  PushCallStack("PMatrix::PMatrixToDistSparseMatrix");
3212 #endif
3213 #if ( _DEBUGlevel_ >= 1 )
3214  statusOFS << std::endl << "Converting PMatrix to DistSparseMatrix." << std::endl;
3215 #endif
3216  Int mpirank = grid_->mpirank;
3217  Int mpisize = grid_->mpisize;
3218 
3219  std::vector<Int> rowSend( mpisize );
3220  std::vector<Int> colSend( mpisize );
3221  std::vector<T> valSend( mpisize );
3222  std::vector<Int> sizeSend( mpisize, 0 );
3223  std::vector<Int> displsSend( mpisize, 0 );
3224 
3225  std::vector<Int> rowRecv( mpisize );
3226  std::vector<Int> colRecv( mpisize );
3227  std::vector<T> valRecv( mpisize );
3228  std::vector<Int> sizeRecv( mpisize, 0 );
3229  std::vector<Int> displsRecv( mpisize, 0 );
3230 
3231  Int numSuper = this->NumSuper();
3232  const IntNumVec& perm = super_->perm;
3233  const IntNumVec& permInv = super_->permInv;
3234 
3235  const IntNumVec * pPerm_r;
3236  const IntNumVec * pPermInv_r;
3237 
3238  pPerm_r = &super_->perm_r;
3239  pPermInv_r = &super_->permInv_r;
3240 
3241  const IntNumVec& perm_r = *pPerm_r;
3242  const IntNumVec& permInv_r = *pPermInv_r;
3243 
3244  // The number of local columns in DistSparseMatrix format for the
3245  // processor with rank 0. This number is the same for processors
3246  // with rank ranging from 0 to mpisize - 2, and may or may not differ
3247  // from the number of local columns for processor with rank mpisize -
3248  // 1.
3249  Int numColFirst = this->NumCol() / mpisize;
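  // A minimal example (hypothetical sizes): with NumCol() = 10 and mpisize = 4,
  // numColFirst = 2, so ranks 0-2 own 2 columns each and the last rank owns the
  // remaining 10 - 2*3 = 4 columns.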
3250 
3251  // Count the size first.
3252  for( Int ksup = 0; ksup < numSuper; ksup++ ){
3253  // L blocks
3254  if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
3255  std::vector<LBlock<T> >& Lcol = this->L( LBj( ksup, grid_ ) );
3256  for( Int ib = 0; ib < Lcol.size(); ib++ ){
3257  for( Int j = 0; j < Lcol[ib].numCol; j++ ){
3258  Int jcol = permInv[ permInv_r[ j + FirstBlockCol( ksup, super_ ) ] ];
3259  Int dest = std::min( jcol / numColFirst, mpisize - 1 );
3260  sizeSend[dest] += Lcol[ib].numRow;
3261  }
3262  }
3263  } // I own the column of ksup
3264 
3265  // U blocks
3266  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
3267  std::vector<UBlock<T> >& Urow = this->U( LBi( ksup, grid_ ) );
3268  for( Int jb = 0; jb < Urow.size(); jb++ ){
3269  IntNumVec& cols = Urow[jb].cols;
3270  for( Int j = 0; j < cols.m(); j++ ){
3271  Int jcol = permInv[ permInv_r[ cols[j] ] ];
3272  Int dest = std::min( jcol / numColFirst, mpisize - 1 );
3273  sizeSend[dest] += Urow[jb].numRow;
3274  }
3275  }
3276  } // I own the row of ksup
3277  } // for (ksup)
3278 
3279  // All-to-all exchange of size information
3280  MPI_Alltoall(
3281  &sizeSend[0], 1, MPI_INT,
3282  &sizeRecv[0], 1, MPI_INT, grid_->comm );
3283 
3284 
3285 
3286  // Reserve the space
3287  for( Int ip = 0; ip < mpisize; ip++ ){
3288  if( ip == 0 ){
3289  displsSend[ip] = 0;
3290  }
3291  else{
3292  displsSend[ip] = displsSend[ip-1] + sizeSend[ip-1];
3293  }
3294 
3295  if( ip == 0 ){
3296  displsRecv[ip] = 0;
3297  }
3298  else{
3299  displsRecv[ip] = displsRecv[ip-1] + sizeRecv[ip-1];
3300  }
3301  }
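  // displsSend / displsRecv are the exclusive prefix sums of sizeSend / sizeRecv,
  // i.e. the offsets of each rank's segment inside the packed (row, col, val) buffers.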
3302  Int sizeSendTotal = displsSend[mpisize-1] + sizeSend[mpisize-1];
3303  Int sizeRecvTotal = displsRecv[mpisize-1] + sizeRecv[mpisize-1];
3304 
3305  rowSend.resize( sizeSendTotal );
3306  colSend.resize( sizeSendTotal );
3307  valSend.resize( sizeSendTotal );
3308 
3309  rowRecv.resize( sizeRecvTotal );
3310  colRecv.resize( sizeRecvTotal );
3311  valRecv.resize( sizeRecvTotal );
3312 
3313 #if ( _DEBUGlevel_ >= 1 )
3314  statusOFS << "displsSend = " << displsSend << std::endl;
3315  statusOFS << "displsRecv = " << displsRecv << std::endl;
3316 #endif
3317 
3318  // Put (row, col, val) to the sending buffer
3319  std::vector<Int> cntSize( mpisize, 0 );
3320 
3321  for( Int ksup = 0; ksup < numSuper; ksup++ ){
3322  // L blocks
3323  if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
3324  std::vector<LBlock<T> >& Lcol = this->L( LBj( ksup, grid_ ) );
3325  for( Int ib = 0; ib < Lcol.size(); ib++ ){
3326  IntNumVec& rows = Lcol[ib].rows;
3327  NumMat<T>& nzval = Lcol[ib].nzval;
3328  for( Int j = 0; j < Lcol[ib].numCol; j++ ){
3329  Int jcol = permInv[ permInv_r[ j + FirstBlockCol( ksup, super_ ) ] ];
3330  Int dest = std::min( jcol / numColFirst, mpisize - 1 );
3331  for( Int i = 0; i < rows.m(); i++ ){
3332  rowSend[displsSend[dest] + cntSize[dest]] = permInv[ rows[i] ];
3333  colSend[displsSend[dest] + cntSize[dest]] = jcol;
3334  valSend[displsSend[dest] + cntSize[dest]] = nzval( i, j );
3335  cntSize[dest]++;
3336  }
3337  }
3338  }
3339  } // I own the column of ksup
3340 
3341  // U blocks
3342  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
3343  std::vector<UBlock<T> >& Urow = this->U( LBi( ksup, grid_ ) );
3344  for( Int jb = 0; jb < Urow.size(); jb++ ){
3345  IntNumVec& cols = Urow[jb].cols;
3346  NumMat<T>& nzval = Urow[jb].nzval;
3347  for( Int j = 0; j < cols.m(); j++ ){
3348  Int jcol = permInv[ permInv_r[ cols[j] ] ];
3349  Int dest = std::min( jcol / numColFirst, mpisize - 1 );
3350  for( Int i = 0; i < Urow[jb].numRow; i++ ){
3351  rowSend[displsSend[dest] + cntSize[dest]] = permInv[ i + FirstBlockCol( ksup, super_ ) ];
3352  colSend[displsSend[dest] + cntSize[dest]] = jcol;
3353  valSend[displsSend[dest] + cntSize[dest]] = nzval( i, j );
3354  cntSize[dest]++;
3355  }
3356  }
3357  }
3358  } // I own the row of ksup
3359  }
3360 
3361  // Check sizes match
3362  for( Int ip = 0; ip < mpisize; ip++ ){
3363  if( cntSize[ip] != sizeSend[ip] )
3364  {
3365 #ifdef USE_ABORT
3366  abort();
3367 #endif
3368  throw std::runtime_error( "Sizes of the sending information do not match." );
3369  }
3370  }
3371 
3372 
3373  // Alltoallv to exchange information
3374  mpi::Alltoallv(
3375  &rowSend[0], &sizeSend[0], &displsSend[0],
3376  &rowRecv[0], &sizeRecv[0], &displsRecv[0],
3377  grid_->comm );
3378  mpi::Alltoallv(
3379  &colSend[0], &sizeSend[0], &displsSend[0],
3380  &colRecv[0], &sizeRecv[0], &displsRecv[0],
3381  grid_->comm );
3382  mpi::Alltoallv(
3383  &valSend[0], &sizeSend[0], &displsSend[0],
3384  &valRecv[0], &sizeRecv[0], &displsRecv[0],
3385  grid_->comm );
3386 
3387 #if ( _DEBUGlevel_ >= 1 )
3388  statusOFS << "Alltoallv communication finished." << std::endl;
3389 #endif
3390 
3391  //#if ( _DEBUGlevel_ >= 1 )
3392  // for( Int ip = 0; ip < mpisize; ip++ ){
3393  // statusOFS << "rowSend[" << ip << "] = " << rowSend[ip] << std::endl;
3394  // statusOFS << "rowRecv[" << ip << "] = " << rowRecv[ip] << std::endl;
3395  // statusOFS << "colSend[" << ip << "] = " << colSend[ip] << std::endl;
3396  // statusOFS << "colRecv[" << ip << "] = " << colRecv[ip] << std::endl;
3397  // statusOFS << "valSend[" << ip << "] = " << valSend[ip] << std::endl;
3398  // statusOFS << "valRecv[" << ip << "] = " << valRecv[ip] << std::endl;
3399  // }
3400  //#endif
3401 
3402  // Organize the received message.
3403  Int firstCol = mpirank * numColFirst;
3404  Int numColLocal;
3405  if( mpirank == mpisize-1 )
3406  numColLocal = this->NumCol() - numColFirst * (mpisize-1);
3407  else
3408  numColLocal = numColFirst;
3409 
3410  std::vector<std::vector<Int> > rows( numColLocal );
3411  std::vector<std::vector<T> > vals( numColLocal );
3412 
3413  for( Int ip = 0; ip < mpisize; ip++ ){
3414  Int* rowRecvCur = &rowRecv[displsRecv[ip]];
3415  Int* colRecvCur = &colRecv[displsRecv[ip]];
3416  T* valRecvCur = &valRecv[displsRecv[ip]];
3417  for( Int i = 0; i < sizeRecv[ip]; i++ ){
3418  rows[colRecvCur[i]-firstCol].push_back( rowRecvCur[i] );
3419  vals[colRecvCur[i]-firstCol].push_back( valRecvCur[i] );
3420  } // for (i)
3421  } // for (ip)
3422 
3423  // Sort the rows
3424  std::vector<std::vector<Int> > sortIndex( numColLocal );
3425  for( Int j = 0; j < numColLocal; j++ ){
3426  sortIndex[j].resize( rows[j].size() );
3427  for( Int i = 0; i < sortIndex[j].size(); i++ )
3428  sortIndex[j][i] = i;
3429  std::sort( sortIndex[j].begin(), sortIndex[j].end(),
3430  IndexComp<std::vector<Int>& > ( rows[j] ) );
3431  } // for (j)
3432 
3433  // Form DistSparseMatrix according to the received message
3434  // NOTE: for indices, DistSparseMatrix follows the FORTRAN
3435  // convention (1 based) while PMatrix follows the C convention (0
3436  // based)
3437  A.size = this->NumCol();
3438  A.nnzLocal = 0;
3439  A.colptrLocal.Resize( numColLocal + 1 );
3440  // Note that the +1 is important since the indices follow the FORTRAN (1-based) convention
3441  A.colptrLocal(0) = 1;
3442  for( Int j = 0; j < numColLocal; j++ ){
3443  A.nnzLocal += rows[j].size();
3444  A.colptrLocal(j+1) = A.colptrLocal(j) + rows[j].size();
3445  }
3446 
3447  A.comm = grid_->comm;
3448 
3449 #if ( _DEBUGlevel_ >= 1 )
3450  statusOFS << "nnzLocal = " << A.nnzLocal << std::endl;
3451  statusOFS << "nnz = " << A.Nnz() << std::endl;
3452 #endif
3453 
3454 
3455  A.rowindLocal.Resize( A.nnzLocal );
3456  A.nzvalLocal.Resize( A.nnzLocal );
3457 
3458  Int* rowPtr = A.rowindLocal.Data();
3459  T* nzvalPtr = A.nzvalLocal.Data();
3460  for( Int j = 0; j < numColLocal; j++ ){
3461  std::vector<Int>& rowsCur = rows[j];
3462  std::vector<Int>& sortIndexCur = sortIndex[j];
3463  std::vector<T>& valsCur = vals[j];
3464  for( Int i = 0; i < rows[j].size(); i++ ){
3465  // Note that the +1 is important since the indices follow the FORTRAN (1-based) convention
3466  *(rowPtr++) = rowsCur[sortIndexCur[i]] + 1;
3467  *(nzvalPtr++) = valsCur[sortIndexCur[i]];
3468  }
3469  }
3470 
3471 #if ( _DEBUGlevel_ >= 1 )
3472  statusOFS << "A.colptrLocal[end] = " << A.colptrLocal(numColLocal) << std::endl;
3473  statusOFS << "A.rowindLocal.size() = " << A.rowindLocal.m() << std::endl;
3474  statusOFS << "A.rowindLocal[end] = " << A.rowindLocal(A.nnzLocal-1) << std::endl;
3475  statusOFS << "A.nzvalLocal[end] = " << A.nzvalLocal(A.nnzLocal-1) << std::endl;
3476 #endif
3477 
3478 
3479 #ifndef _RELEASE_
3480  PopCallStack();
3481 #endif
3482 
3483  return ;
3484  } // ----- end of method PMatrix::PMatrixToDistSparseMatrix -----
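
The routine above is the standard counts-then-displacements pattern for irregular all-to-all exchange: count how many (row, col, val) triplets go to each destination, exchange the counts with MPI_Alltoall, form the displacement arrays as exclusive prefix sums, and exchange the packed buffers with Alltoallv. A minimal, self-contained sketch of this pattern for a plain int payload is given below; it uses the raw MPI-3 API rather than the PEXSI mpi:: wrappers, and the function name ExchangeBuffer is hypothetical.

    #include <mpi.h>
    #include <vector>

    // Exchange a packed buffer whose per-destination element counts are given
    // in sizeSend; the received buffer is returned packed in rank order.
    std::vector<int> ExchangeBuffer( const std::vector<int>& sendBuf,
                                     const std::vector<int>& sizeSend,
                                     MPI_Comm comm )
    {
      int mpisize;
      MPI_Comm_size( comm, &mpisize );

      // Exchange the per-destination counts.
      std::vector<int> sizeRecv( mpisize, 0 );
      MPI_Alltoall( sizeSend.data(), 1, MPI_INT,
                    sizeRecv.data(), 1, MPI_INT, comm );

      // Displacements are exclusive prefix sums of the counts.
      std::vector<int> displsSend( mpisize, 0 ), displsRecv( mpisize, 0 );
      for( int ip = 1; ip < mpisize; ip++ ){
        displsSend[ip] = displsSend[ip-1] + sizeSend[ip-1];
        displsRecv[ip] = displsRecv[ip-1] + sizeRecv[ip-1];
      }

      // Exchange the payload.
      std::vector<int> recvBuf( displsRecv[mpisize-1] + sizeRecv[mpisize-1] );
      MPI_Alltoallv( sendBuf.data(), sizeSend.data(), displsSend.data(), MPI_INT,
                     recvBuf.data(), sizeRecv.data(), displsRecv.data(), MPI_INT,
                     comm );
      return recvBuf;
    }
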
3485 
3486 
3487 
3488  template<typename T>
3489  void PMatrix<T>::PMatrixToDistSparseMatrix_OLD ( const DistSparseMatrix<T>& A, DistSparseMatrix<T>& B )
3490  {
3491 #ifndef _RELEASE_
3492  PushCallStack("PMatrix::PMatrixToDistSparseMatrix_OLD");
3493 #endif
3494 #if ( _DEBUGlevel_ >= 1 )
3495  statusOFS << std::endl << "Converting PMatrix to DistSparseMatrix (2nd format)." << std::endl;
3496 #endif
3497  Int mpirank = grid_->mpirank;
3498  Int mpisize = grid_->mpisize;
3499 
3500  std::vector<Int> rowSend( mpisize );
3501  std::vector<Int> colSend( mpisize );
3502  std::vector<T> valSend( mpisize );
3503  std::vector<Int> sizeSend( mpisize, 0 );
3504  std::vector<Int> displsSend( mpisize, 0 );
3505 
3506  std::vector<Int> rowRecv( mpisize );
3507  std::vector<Int> colRecv( mpisize );
3508  std::vector<T> valRecv( mpisize );
3509  std::vector<Int> sizeRecv( mpisize, 0 );
3510  std::vector<Int> displsRecv( mpisize, 0 );
3511 
3512  Int numSuper = this->NumSuper();
3513  const IntNumVec& permInv = super_->permInv;
3514 
3515 
3516 
3517  const IntNumVec * pPermInv_r;
3518 
3519  if(optionsLU_->RowPerm=="NOROWPERM"){
3520  pPermInv_r = &super_->permInv;
3521  }
3522  else{
3523  pPermInv_r = &super_->permInv_r;
3524  }
3525 
3526  const IntNumVec& permInv_r = *pPermInv_r;
3527 
3528 
3529 
3530 
3531 
3532 
3533  // The number of local columns in the DistSparseMatrix format owned by
3534  // each of the processors with rank 0 to mpisize - 2. The last processor
3535  // (rank mpisize - 1) owns the remaining columns, and its count may or
3536  // may not be the same.
3537  //
3538  Int numColFirst = this->NumCol() / mpisize;
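  // (Illustrative note, assuming the 1-D column distribution described
  // above.) Processor ip owns the column range
  //   [ ip*numColFirst, (ip+1)*numColFirst )        for ip <  mpisize - 1,
  //   [ (mpisize-1)*numColFirst, NumCol() )         for ip == mpisize - 1.
  // For example, NumCol() = 10 and mpisize = 4 give numColFirst = 2, so
  // ranks 0, 1, 2 own two columns each and rank 3 owns the remaining four.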
3539 
3540  // Count the size first.
3541  for( Int ksup = 0; ksup < numSuper; ksup++ ){
3542  // L blocks
3543  if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
3544  std::vector<LBlock<T> >& Lcol = this->L( LBj( ksup, grid_ ) );
3545  for( Int ib = 0; ib < Lcol.size(); ib++ ){
3546  for( Int j = 0; j < Lcol[ib].numCol; j++ ){
3547  Int jcol = permInv( j + FirstBlockCol( ksup, super_ ) );
3548  Int dest = std::min( jcol / numColFirst, mpisize - 1 );
3549  sizeSend[dest] += Lcol[ib].numRow;
3550  }
3551  }
3552  } // I own the column of ksup
3553 
3554  // U blocks
3555  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
3556  std::vector<UBlock<T> >& Urow = this->U( LBi( ksup, grid_ ) );
3557  for( Int jb = 0; jb < Urow.size(); jb++ ){
3558  IntNumVec& cols = Urow[jb].cols;
3559  for( Int j = 0; j < cols.m(); j++ ){
3560  Int jcol = permInv( cols(j) );
3561  Int dest = std::min( jcol / numColFirst, mpisize - 1 );
3562  sizeSend[dest] += Urow[jb].numRow;
3563  }
3564  }
3565  } // I own the row of ksup
3566  } // for (ksup)
3567 
3568  // All-to-all exchange of size information
3569  MPI_Alltoall(
3570  &sizeSend[0], 1, MPI_INT,
3571  &sizeRecv[0], 1, MPI_INT, grid_->comm );
3572 
3573 
3574 
3575  // Reserve the space
3576  for( Int ip = 0; ip < mpisize; ip++ ){
3577  if( ip == 0 ){
3578  displsSend[ip] = 0;
3579  }
3580  else{
3581  displsSend[ip] = displsSend[ip-1] + sizeSend[ip-1];
3582  }
3583 
3584  if( ip == 0 ){
3585  displsRecv[ip] = 0;
3586  }
3587  else{
3588  displsRecv[ip] = displsRecv[ip-1] + sizeRecv[ip-1];
3589  }
3590  }
3591  Int sizeSendTotal = displsSend[mpisize-1] + sizeSend[mpisize-1];
3592  Int sizeRecvTotal = displsRecv[mpisize-1] + sizeRecv[mpisize-1];
3593 
3594  rowSend.resize( sizeSendTotal );
3595  colSend.resize( sizeSendTotal );
3596  valSend.resize( sizeSendTotal );
3597 
3598  rowRecv.resize( sizeRecvTotal );
3599  colRecv.resize( sizeRecvTotal );
3600  valRecv.resize( sizeRecvTotal );
3601 
3602 #if ( _DEBUGlevel_ >= 1 )
3603  statusOFS << "displsSend = " << displsSend << std::endl;
3604  statusOFS << "displsRecv = " << displsRecv << std::endl;
3605 #endif
3606 
3607  // Put (row, col, val) to the sending buffer
3608  std::vector<Int> cntSize( mpisize, 0 );
3609 
3610 
3611  for( Int ksup = 0; ksup < numSuper; ksup++ ){
3612  // L blocks
3613  if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
3614  std::vector<LBlock<T> >& Lcol = this->L( LBj( ksup, grid_ ) );
3615  for( Int ib = 0; ib < Lcol.size(); ib++ ){
3616  IntNumVec& rows = Lcol[ib].rows;
3617  NumMat<T>& nzval = Lcol[ib].nzval;
3618  for( Int j = 0; j < Lcol[ib].numCol; j++ ){
3619  Int jcol = permInv( j + FirstBlockCol( ksup, super_ ) );
3620  Int dest = std::min( jcol / numColFirst, mpisize - 1 );
3621  for( Int i = 0; i < rows.m(); i++ ){
3622  rowSend[displsSend[dest] + cntSize[dest]] = permInv_r( rows(i) );
3623  colSend[displsSend[dest] + cntSize[dest]] = jcol;
3624  valSend[displsSend[dest] + cntSize[dest]] = nzval( i, j );
3625  cntSize[dest]++;
3626  }
3627  }
3628  }
3629  } // I own the column of ksup
3630 
3631 
3632  // U blocks
3633  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
3634  std::vector<UBlock<T> >& Urow = this->U( LBi( ksup, grid_ ) );
3635  for( Int jb = 0; jb < Urow.size(); jb++ ){
3636  IntNumVec& cols = Urow[jb].cols;
3637  NumMat<T>& nzval = Urow[jb].nzval;
3638  for( Int j = 0; j < cols.m(); j++ ){
3639  Int jcol = permInv( cols(j) );
3640  Int dest = std::min( jcol / numColFirst, mpisize - 1 );
3641  for( Int i = 0; i < Urow[jb].numRow; i++ ){
3642  rowSend[displsSend[dest] + cntSize[dest]] =
3643  permInv_r( i + FirstBlockCol( ksup, super_ ) );
3644  colSend[displsSend[dest] + cntSize[dest]] = jcol;
3645  valSend[displsSend[dest] + cntSize[dest]] = nzval( i, j );
3646  cntSize[dest]++;
3647  }
3648  }
3649  }
3650  } // I own the row of ksup
3651  }
3652 
3653 
3654 
3655  // Check sizes match
3656  for( Int ip = 0; ip < mpisize; ip++ ){
3657  if( cntSize[ip] != sizeSend[ip] ){
3658 #ifdef USE_ABORT
3659  abort();
3660 #endif
3661  throw std::runtime_error( "Sizes of the sending information do not match." );
3662  }
3663  }
3664 
3665  // Alltoallv to exchange information
3666  mpi::Alltoallv(
3667  &rowSend[0], &sizeSend[0], &displsSend[0],
3668  &rowRecv[0], &sizeRecv[0], &displsRecv[0],
3669  grid_->comm );
3670  mpi::Alltoallv(
3671  &colSend[0], &sizeSend[0], &displsSend[0],
3672  &colRecv[0], &sizeRecv[0], &displsRecv[0],
3673  grid_->comm );
3674  mpi::Alltoallv(
3675  &valSend[0], &sizeSend[0], &displsSend[0],
3676  &valRecv[0], &sizeRecv[0], &displsRecv[0],
3677  grid_->comm );
3678 
3679 #if ( _DEBUGlevel_ >= 1 )
3680  statusOFS << "Alltoallv communication finished." << std::endl;
3681 #endif
3682 
3683  //#if ( _DEBUGlevel_ >= 1 )
3684  // for( Int ip = 0; ip < mpisize; ip++ ){
3685  // statusOFS << "rowSend[" << ip << "] = " << rowSend[ip] << std::endl;
3686  // statusOFS << "rowRecv[" << ip << "] = " << rowRecv[ip] << std::endl;
3687  // statusOFS << "colSend[" << ip << "] = " << colSend[ip] << std::endl;
3688  // statusOFS << "colRecv[" << ip << "] = " << colRecv[ip] << std::endl;
3689  // statusOFS << "valSend[" << ip << "] = " << valSend[ip] << std::endl;
3690  // statusOFS << "valRecv[" << ip << "] = " << valRecv[ip] << std::endl;
3691  // }
3692  //#endif
3693 
3694  // Organize the received message.
3695  Int firstCol = mpirank * numColFirst;
3696  Int numColLocal;
3697  if( mpirank == mpisize-1 )
3698  numColLocal = this->NumCol() - numColFirst * (mpisize-1);
3699  else
3700  numColLocal = numColFirst;
3701 
3702  std::vector<std::vector<Int> > rows( numColLocal );
3703  std::vector<std::vector<T> > vals( numColLocal );
3704 
3705  for( Int ip = 0; ip < mpisize; ip++ ){
3706  Int* rowRecvCur = &rowRecv[displsRecv[ip]];
3707  Int* colRecvCur = &colRecv[displsRecv[ip]];
3708  T* valRecvCur = &valRecv[displsRecv[ip]];
3709  for( Int i = 0; i < sizeRecv[ip]; i++ ){
3710  rows[colRecvCur[i]-firstCol].push_back( rowRecvCur[i] );
3711  vals[colRecvCur[i]-firstCol].push_back( valRecvCur[i] );
3712  } // for (i)
3713  } // for (ip)
3714 
3715  // Sort the rows
3716  std::vector<std::vector<Int> > sortIndex( numColLocal );
3717  for( Int j = 0; j < numColLocal; j++ ){
3718  sortIndex[j].resize( rows[j].size() );
3719  for( Int i = 0; i < sortIndex[j].size(); i++ )
3720  sortIndex[j][i] = i;
3721  std::sort( sortIndex[j].begin(), sortIndex[j].end(),
3722  IndexComp<std::vector<Int>& > ( rows[j] ) );
3723  } // for (j)
3724 
3725  // Form DistSparseMatrix according to the received message
3726  // NOTE: for indices, DistSparseMatrix follows the FORTRAN
3727  // convention (1 based) while PMatrix follows the C convention (0
3728  // based)
3729  if( A.size != this->NumCol() ){
3730 #ifdef USE_ABORT
3731  abort();
3732 #endif
3733  throw std::runtime_error( "The DistSparseMatrix providing the pattern has a different size from PMatrix." );
3734  }
3735  if( A.colptrLocal.m() != numColLocal + 1 ){
3736 #ifdef USE_ABORT
3737  abort();
3738 #endif
3739  throw std::runtime_error( "The DistSparseMatrix providing the pattern has a different number of local columns from PMatrix." );
3740  }
3741 
3742  B.size = A.size;
3743  B.nnz = A.nnz;
3744  B.nnzLocal = A.nnzLocal;
3745  B.colptrLocal = A.colptrLocal;
3746  B.rowindLocal = A.rowindLocal;
3747  B.nzvalLocal.Resize( B.nnzLocal );
3748  SetValue( B.nzvalLocal, ZERO<T>() );
3749  // Make sure that the communicator of A and B are the same.
3750  // FIXME Find a better way to compare the communicators
3751  // if( grid_->comm != A.comm ){
3752  // #ifdef USE_ABORT
3753  // abort();
3754  //#endif
3755  //throw std::runtime_error( "The DistSparseMatrix providing the pattern has a different communicator from PMatrix." );
3756  // }
3757  B.comm = grid_->comm;
3758 
3759  Int* rowPtr = B.rowindLocal.Data();
3760  T* nzvalPtr = B.nzvalLocal.Data();
3761  for( Int j = 0; j < numColLocal; j++ ){
3762  std::vector<Int>& rowsCur = rows[j];
3763  std::vector<Int>& sortIndexCur = sortIndex[j];
3764  std::vector<T>& valsCur = vals[j];
3765  std::vector<Int> rowsCurSorted( rowsCur.size() );
3766  // Note that the +1 is important since the indices follow the FORTRAN (1-based) convention
3767  for( Int i = 0; i < rowsCurSorted.size(); i++ ){
3768  rowsCurSorted[i] = rowsCur[sortIndexCur[i]] + 1;
3769  }
3770 
3771  // Search and match the indices
3772  std::vector<Int>::iterator it;
3773  for( Int i = B.colptrLocal(j) - 1;
3774  i < B.colptrLocal(j+1) - 1; i++ ){
3775  it = std::lower_bound( rowsCurSorted.begin(), rowsCurSorted.end(),
3776  *(rowPtr++) );
3777  if( it == rowsCurSorted.end() ){
3778  // Did not find the row, set it to zero
3779  *(nzvalPtr++) = ZERO<T>();
3780  }
3781  else{
3782  // Found the row, set it according to the received value
3783  *(nzvalPtr++) = valsCur[ sortIndexCur[it-rowsCurSorted.begin()] ];
3784  }
3785  } // for (i)
3786  } // for (j)
3787 
3788 #if ( _DEBUGlevel_ >= 1 )
3789  statusOFS << "B.colptrLocal[end] = " << B.colptrLocal(numColLocal) << std::endl;
3790  statusOFS << "B.rowindLocal.size() = " << B.rowindLocal.m() << std::endl;
3791  statusOFS << "B.rowindLocal[end] = " << B.rowindLocal(B.nnzLocal-1) << std::endl;
3792  statusOFS << "B.nzvalLocal[end] = " << B.nzvalLocal(B.nnzLocal-1) << std::endl;
3793 #endif
3794 
3795 
3796 #ifndef _RELEASE_
3797  PopCallStack();
3798 #endif
3799 
3800  return ;
3801  } // ----- end of method PMatrix::PMatrixToDistSparseMatrix_OLD -----
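
PMatrixToDistSparseMatrix_OLD reorders each received column indirectly: it sorts an index permutation (sortIndex, via IndexComp) by row index so that the row and value arrays can be permuted consistently, and then matches the sorted rows against the sparsity pattern of A with std::lower_bound. A minimal standalone sketch of the indirect ("argsort") step is shown below; the name SortColumnByRow and the double payload are illustrative only.

    #include <algorithm>
    #include <numeric>
    #include <vector>

    // Sort one column's (row, value) pairs by row index: sort a permutation
    // of positions and apply it to both arrays, as done per column above.
    void SortColumnByRow( std::vector<int>& rows, std::vector<double>& vals )
    {
      std::vector<int> idx( rows.size() );
      std::iota( idx.begin(), idx.end(), 0 );
      std::sort( idx.begin(), idx.end(),
          [&rows]( int a, int b ){ return rows[a] < rows[b]; } );

      std::vector<int>    sortedRows( rows.size() );
      std::vector<double> sortedVals( vals.size() );
      for( std::size_t i = 0; i < idx.size(); i++ ){
        sortedRows[i] = rows[ idx[i] ];
        sortedVals[i] = vals[ idx[i] ];
      }
      rows.swap( sortedRows );
      vals.swap( sortedVals );
    }
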
3802 
3803 
3804  // A possibly more memory-efficient way of converting the PMatrix to a
3805  // DistSparseMatrix structure.
3806  //
3807  template<typename T>
3808  void PMatrix<T>::PMatrixToDistSparseMatrix ( const DistSparseMatrix<T>& A, DistSparseMatrix<T>& B )
3809  {
3810 
3811 #ifndef _RELEASE_
3812  PushCallStack("PMatrix::PMatrixToDistSparseMatrix");
3813 #endif
3814 #if ( _DEBUGlevel_ >= 1 )
3815  statusOFS << std::endl << "Converting PMatrix to DistSparseMatrix (2nd format)." << std::endl;
3816 #endif
3817  Int mpirank = grid_->mpirank;
3818  Int mpisize = grid_->mpisize;
3819 
3820  std::vector<Int> rowSend( mpisize );
3821  std::vector<Int> colSend( mpisize );
3822  std::vector<T> valSend( mpisize );
3823  std::vector<Int> sizeSend( mpisize, 0 );
3824  std::vector<Int> displsSend( mpisize, 0 );
3825 
3826  std::vector<Int> rowRecv( mpisize );
3827  std::vector<Int> colRecv( mpisize );
3828  std::vector<T> valRecv( mpisize );
3829  std::vector<Int> sizeRecv( mpisize, 0 );
3830  std::vector<Int> displsRecv( mpisize, 0 );
3831 
3832  Int numSuper = this->NumSuper();
3833  const IntNumVec& perm = super_->perm;
3834  const IntNumVec& permInv = super_->permInv;
3835 
3836  const IntNumVec * pPerm_r;
3837  const IntNumVec * pPermInv_r;
3838 
3839  pPerm_r = &super_->perm_r;
3840  pPermInv_r = &super_->permInv_r;
3841 
3842  const IntNumVec& perm_r = *pPerm_r;
3843  const IntNumVec& permInv_r = *pPermInv_r;
3844 
3845 
3846  // Count the sizes from the A matrix first
3847  Int numColFirst = this->NumCol() / mpisize;
3848  Int firstCol = mpirank * numColFirst;
3849  Int numColLocal;
3850  if( mpirank == mpisize-1 )
3851  numColLocal = this->NumCol() - numColFirst * (mpisize-1);
3852  else
3853  numColLocal = numColFirst;
3854 
3855 
3856 
3857 
3858 
3859  Int* rowPtr = A.rowindLocal.Data();
3860  Int* colPtr = A.colptrLocal.Data();
3861 
3862  for( Int j = 0; j < numColLocal; j++ ){
3863  Int ocol = firstCol + j;
3864  Int col = perm[ perm_r[ ocol] ];
3865  Int blockColIdx = BlockIdx( col, super_ );
3866  Int procCol = PCOL( blockColIdx, grid_ );
3867  for( Int i = colPtr[j] - 1; i < colPtr[j+1] - 1; i++ ){
3868  Int orow = rowPtr[i]-1;
3869  Int row = perm[ orow ];
3870  Int blockRowIdx = BlockIdx( row, super_ );
3871  Int procRow = PROW( blockRowIdx, grid_ );
3872  Int dest = PNUM( procRow, procCol, grid_ );
3873 #if ( _DEBUGlevel_ >= 1 )
3874  statusOFS << "("<< orow<<", "<<ocol<<") == "<< "("<< row<<", "<<col<<")"<< std::endl;
3875  statusOFS << "BlockIdx = " << blockRowIdx << ", " <<blockColIdx << std::endl;
3876  statusOFS << procRow << ", " << procCol << ", "
3877  << dest << std::endl;
3878 #endif
3879  sizeSend[dest]++;
3880  } // for (i)
3881  } // for (j)
3882 
3883  // All-to-all exchange of size information
3884  MPI_Alltoall(
3885  &sizeSend[0], 1, MPI_INT,
3886  &sizeRecv[0], 1, MPI_INT, grid_->comm );
3887 
3888 #if ( _DEBUGlevel_ >= 1 )
3889  statusOFS << std::endl << "sizeSend: " << sizeSend << std::endl;
3890  statusOFS << std::endl << "sizeRecv: " << sizeRecv << std::endl;
3891 #endif
3892 
3893 
3894 
3895  // Reserve the space
3896  for( Int ip = 0; ip < mpisize; ip++ ){
3897  if( ip == 0 ){
3898  displsSend[ip] = 0;
3899  }
3900  else{
3901  displsSend[ip] = displsSend[ip-1] + sizeSend[ip-1];
3902  }
3903 
3904  if( ip == 0 ){
3905  displsRecv[ip] = 0;
3906  }
3907  else{
3908  displsRecv[ip] = displsRecv[ip-1] + sizeRecv[ip-1];
3909  }
3910  }
3911 
3912  Int sizeSendTotal = displsSend[mpisize-1] + sizeSend[mpisize-1];
3913  Int sizeRecvTotal = displsRecv[mpisize-1] + sizeRecv[mpisize-1];
3914 
3915  rowSend.resize( sizeSendTotal );
3916  colSend.resize( sizeSendTotal );
3917  valSend.resize( sizeSendTotal );
3918 
3919  rowRecv.resize( sizeRecvTotal );
3920  colRecv.resize( sizeRecvTotal );
3921  valRecv.resize( sizeRecvTotal );
3922 
3923 #if ( _DEBUGlevel_ >= 1 )
3924  statusOFS << "displsSend = " << displsSend << std::endl;
3925  statusOFS << "displsRecv = " << displsRecv << std::endl;
3926 #endif
3927 
3928  // Put (row, col) to the sending buffer
3929  std::vector<Int> cntSize( mpisize, 0 );
3930 
3931  rowPtr = A.rowindLocal.Data();
3932  colPtr = A.colptrLocal.Data();
3933 
3934  for( Int j = 0; j < numColLocal; j++ ){
3935 
3936  Int ocol = firstCol + j;
3937  Int col = perm[ perm_r[ ocol] ];
3938  Int blockColIdx = BlockIdx( col, super_ );
3939  Int procCol = PCOL( blockColIdx, grid_ );
3940  for( Int i = colPtr[j] - 1; i < colPtr[j+1] - 1; i++ ){
3941  Int orow = rowPtr[i]-1;
3942  Int row = perm[ orow ];
3943  Int blockRowIdx = BlockIdx( row, super_ );
3944  Int procRow = PROW( blockRowIdx, grid_ );
3945  Int dest = PNUM( procRow, procCol, grid_ );
3946  rowSend[displsSend[dest] + cntSize[dest]] = row;
3947  colSend[displsSend[dest] + cntSize[dest]] = col;
3948  cntSize[dest]++;
3949  } // for (i)
3950  } // for (j)
3951 
3952 
3953  // Check sizes match
3954  for( Int ip = 0; ip < mpisize; ip++ ){
3955  if( cntSize[ip] != sizeSend[ip] ){
3956 #ifdef USE_ABORT
3957  abort();
3958 #endif
3959  throw std::runtime_error( "Sizes of the sending information do not match." );
3960  }
3961  }
3962 
3963  // Alltoallv to exchange information
3964  mpi::Alltoallv(
3965  &rowSend[0], &sizeSend[0], &displsSend[0],
3966  &rowRecv[0], &sizeRecv[0], &displsRecv[0],
3967  grid_->comm );
3968  mpi::Alltoallv(
3969  &colSend[0], &sizeSend[0], &displsSend[0],
3970  &colRecv[0], &sizeRecv[0], &displsRecv[0],
3971  grid_->comm );
3972 
3973 #if ( _DEBUGlevel_ >= 1 )
3974  statusOFS << "Alltoallv communication of nonzero indices finished." << std::endl;
3975 #endif
3976 
3977 
3978 #if ( _DEBUGlevel_ >= 1 )
3979  for( Int ip = 0; ip < mpisize; ip++ ){
3980  statusOFS << "rowSend[" << ip << "] = " << rowSend[ip] << std::endl;
3981  statusOFS << "rowRecv[" << ip << "] = " << rowRecv[ip] << std::endl;
3982  statusOFS << "colSend[" << ip << "] = " << colSend[ip] << std::endl;
3983  statusOFS << "colRecv[" << ip << "] = " << colRecv[ip] << std::endl;
3984  }
3985 
3986 
3987  DumpLU();
3988 
3989 
3990 
3991 #endif
3992 
3993  // For each (row, col), fill the nonzero values to valRecv locally.
3994  for( Int g = 0; g < sizeRecvTotal; g++ ){
3995  Int row = rowRecv[g];
3996  Int col = colRecv[g];
3997 
3998  Int blockRowIdx = BlockIdx( row, super_ );
3999  Int blockColIdx = BlockIdx( col, super_ );
4000 
4001  // Search for the nzval
4002  bool isFound = false;
4003 
4004  if( blockColIdx <= blockRowIdx ){
4005  // Data on the L side
4006 
4007  std::vector<LBlock<T> >& Lcol = this->L( LBj( blockColIdx, grid_ ) );
4008 
4009  for( Int ib = 0; ib < Lcol.size(); ib++ ){
4010 #if ( _DEBUGlevel_ >= 1 )
4011  statusOFS << "blockRowIdx = " << blockRowIdx << ", Lcol[ib].blockIdx = " << Lcol[ib].blockIdx << ", blockColIdx = " << blockColIdx << std::endl;
4012 #endif
4013  if( Lcol[ib].blockIdx == blockRowIdx ){
4014  IntNumVec& rows = Lcol[ib].rows;
4015  for( int iloc = 0; iloc < Lcol[ib].numRow; iloc++ ){
4016  if( rows[iloc] == row ){
4017  Int jloc = col - FirstBlockCol( blockColIdx, super_ );
4018  valRecv[g] = Lcol[ib].nzval( iloc, jloc );
4019  isFound = true;
4020  break;
4021  } // found the corresponding row
4022  }
4023  }
4024  if( isFound == true ) break;
4025  } // for (ib)
4026  }
4027  else{
4028  // Data on the U side
4029 
4030  std::vector<UBlock<T> >& Urow = this->U( LBi( blockRowIdx, grid_ ) );
4031 
4032  for( Int jb = 0; jb < Urow.size(); jb++ ){
4033  if( Urow[jb].blockIdx == blockColIdx ){
4034  IntNumVec& cols = Urow[jb].cols;
4035  for( int jloc = 0; jloc < Urow[jb].numCol; jloc++ ){
4036  if( cols[jloc] == col ){
4037  Int iloc = row - FirstBlockRow( blockRowIdx, super_ );
4038  valRecv[g] = Urow[jb].nzval( iloc, jloc );
4039  isFound = true;
4040  break;
4041  } // found the corresponding col
4042  }
4043  }
4044  if( isFound == true ) break;
4045  } // for (jb)
4046  } // if( blockColIdx <= blockRowIdx )
4047 
4048  // Did not find the corresponding row, set the value to zero.
4049  if( isFound == false ){
4050  statusOFS << "In the permuted order, (" << row << ", " << col <<
4051  ") is not found in PMatrix." << std::endl;
4052  valRecv[g] = ZERO<T>();
4053  }
4054 
4055  } // for (g)
4056 
4057 
4058  // Feed back valRecv to valSend through Alltoallv. NOTE: for the
4059  // values, the roles of "send" and "recv" are swapped.
4060  mpi::Alltoallv(
4061  &valRecv[0], &sizeRecv[0], &displsRecv[0],
4062  &valSend[0], &sizeSend[0], &displsSend[0],
4063  grid_->comm );
4064 
4065 #if ( _DEBUGlevel_ >= 1 )
4066  statusOFS << "Alltoallv communication of nonzero values finished." << std::endl;
4067 #endif
4068 
4069  // Put the nonzero values from valSend to the matrix B.
4070  B.size = A.size;
4071  B.nnz = A.nnz;
4072  B.nnzLocal = A.nnzLocal;
4073  B.colptrLocal = A.colptrLocal;
4074  B.rowindLocal = A.rowindLocal;
4075  B.nzvalLocal.Resize( B.nnzLocal );
4076  SetValue( B.nzvalLocal, ZERO<T>() );
4077  // Make sure that the communicator of A and B are the same.
4078  // FIXME Find a better way to compare the communicators
4079  // if( grid_->comm != A.comm ){
4080  // #ifdef USE_ABORT
4081  //abort();
4082  //#endif
4083  //throw std::runtime_error( "The DistSparseMatrix providing the pattern has a different communicator from PMatrix." );
4084  // }
4085  B.comm = grid_->comm;
4086 
4087  for( Int i = 0; i < mpisize; i++ )
4088  cntSize[i] = 0;
4089 
4090  rowPtr = B.rowindLocal.Data();
4091  colPtr = B.colptrLocal.Data();
4092  T* valPtr = B.nzvalLocal.Data();
4093 
4094  for( Int j = 0; j < numColLocal; j++ ){
4095  Int ocol = firstCol + j;
4096  Int col = perm[ perm_r[ ocol] ];
4097  Int blockColIdx = BlockIdx( col, super_ );
4098  Int procCol = PCOL( blockColIdx, grid_ );
4099  for( Int i = colPtr[j] - 1; i < colPtr[j+1] - 1; i++ ){
4100  Int orow = rowPtr[i]-1;
4101  Int row = perm[ orow ];
4102  Int blockRowIdx = BlockIdx( row, super_ );
4103  Int procRow = PROW( blockRowIdx, grid_ );
4104  Int dest = PNUM( procRow, procCol, grid_ );
4105  valPtr[i] = valSend[displsSend[dest] + cntSize[dest]];
4106  cntSize[dest]++;
4107  } // for (i)
4108  } // for (j)
4109 
4110  // Check sizes match
4111  for( Int ip = 0; ip < mpisize; ip++ ){
4112  if( cntSize[ip] != sizeSend[ip] ){
4113 #ifdef USE_ABORT
4114  abort();
4115 #endif
4116  throw std::runtime_error( "Sizes of the sending information do not match." );
4117  }
4118  }
4119 
4120 
4121 #ifndef _RELEASE_
4122  PopCallStack();
4123 #endif
4124 
4125  return ;
4126  } // ----- end of method PMatrix::PMatrixToDistSparseMatrix -----
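
The routine above starts from the sparsity pattern of A rather than from the L and U blocks: each nonzero (row, col) of A is permuted, its supernodes are located with BlockIdx, and the owner on the 2-D grid follows from PROW, PCOL and PNUM; the owner fills in the value from its local block, and a second Alltoallv with the send/receive roles swapped ships the values back. The sketch below illustrates only the owner computation, under the simplifying (and generally untrue) assumption of a uniform supernode size supSize; the final rank linearization is one possibility, with the actual mapping defined by PNUM.

    // Map a permuted (row, col) entry to its owning process on an
    // nprow x npcol grid with supernodes distributed cyclically.
    inline int OwnerOfEntry( int row, int col, int supSize, int nprow, int npcol )
    {
      int blockRowIdx = row / supSize;        // supernode containing the row
      int blockColIdx = col / supSize;        // supernode containing the column
      int procRow     = blockRowIdx % nprow;  // cyclic over process rows
      int procCol     = blockColIdx % npcol;  // cyclic over process columns
      return procRow * npcol + procCol;       // one possible rank linearization
    }
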
4127 
4128 
4129 
4130 
4131  template<typename T>
4132  Int PMatrix<T>::NnzLocal ( )
4133  {
4134 #ifndef _RELEASE_
4135  PushCallStack("PMatrix::NnzLocal");
4136 #endif
4137  Int numSuper = this->NumSuper();
4138  Int nnzLocal = 0;
4139  for( Int ksup = 0; ksup < numSuper; ksup++ ){
4140  if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
4141  std::vector<LBlock<T> >& Lcol = this->L( LBj( ksup, grid_ ) );
4142  for( Int ib = 0; ib < Lcol.size(); ib++ ){
4143  nnzLocal += Lcol[ib].numRow * Lcol[ib].numCol;
4144  }
4145  } // if I own the column of ksup
4146  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
4147  std::vector<UBlock<T> >& Urow = this->U( LBi( ksup, grid_ ) );
4148  for( Int jb = 0; jb < Urow.size(); jb++ ){
4149  nnzLocal += Urow[jb].numRow * Urow[jb].numCol;
4150  }
4151  } // if I own the row of ksup
4152  }
4153 
4154 #ifndef _RELEASE_
4155  PopCallStack();
4156 #endif
4157 
4158  return nnzLocal;
4159  } // ----- end of method PMatrix::NnzLocal -----
4160 
4161 
4162  template<typename T>
4163  LongInt PMatrix<T>::Nnz ( )
4164  {
4165 #ifndef _RELEASE_
4166  PushCallStack("PMatrix::Nnz");
4167 #endif
4168  LongInt nnzLocal = LongInt( this->NnzLocal() );
4169  LongInt nnz;
4170 
4171  MPI_Allreduce( &nnzLocal, &nnz, 1, MPI_LONG_LONG, MPI_SUM, grid_->comm );
4172 
4173 #ifndef _RELEASE_
4174  PopCallStack();
4175 #endif
4176 
4177  return nnz;
4178  } // ----- end of method PMatrix::Nnz -----
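
NnzLocal sums numRow * numCol over the locally stored L and U blocks, and Nnz promotes the local count to LongInt before the MPI_Allreduce so that the global total cannot overflow a 32-bit integer on large problems. A minimal standalone equivalent of the reduction with plain MPI (the helper name GlobalNnz is hypothetical):

    #include <mpi.h>

    // Reduce a 64-bit local nonzero count to the global total.
    long long GlobalNnz( long long nnzLocal, MPI_Comm comm )
    {
      long long nnz = 0;
      MPI_Allreduce( &nnzLocal, &nnz, 1, MPI_LONG_LONG, MPI_SUM, comm );
      return nnz;
    }
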
4179 
4180  template< >
4181  inline void PMatrix<Real>::GetNegativeInertia ( Real& inertia )
4182  {
4183 #ifndef _RELEASE_
4184  PushCallStack("PMatrix::GetNegativeInertia");
4185 #endif
4186  Int numSuper = this->NumSuper();
4187 
4188  Real inertiaLocal = 0.0;
4189  inertia = 0.0;
4190 
4191  for( Int ksup = 0; ksup < numSuper; ksup++ ){
4192  // I own the diagonal block
4193  if( MYROW( grid_ ) == PROW( ksup, grid_ ) &&
4194  MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
4195  LBlock<Real> & LB = this->L( LBj( ksup, grid_ ) )[0];
4196  for( Int i = 0; i < LB.numRow; i++ ){
4197  if( LB.nzval(i, i) < 0 )
4198  inertiaLocal++;
4199  }
4200  }
4201  }
4202 
4203  // Sum the local counts over all processors
4204  mpi::Allreduce( &inertiaLocal, &inertia, 1, MPI_SUM, grid_->comm );
4205 
4206 #ifndef _RELEASE_
4207  PopCallStack();
4208 #endif
4209 
4210  return ;
4211  } // ----- end of method PMatrix::GetNegativeInertia -----
4212 
4213 
4214  template< >
4215  inline void PMatrix<Complex>::GetNegativeInertia ( Real& inertia )
4216  {
4217 #ifndef _RELEASE_
4218  PushCallStack("PMatrix::GetNegativeInertia");
4219 #endif
4220  Int numSuper = this->NumSuper();
4221 
4222  Real inertiaLocal = 0.0;
4223  inertia = 0.0;
4224 
4225  for( Int ksup = 0; ksup < numSuper; ksup++ ){
4226  // I own the diagonal block
4227  if( MYROW( grid_ ) == PROW( ksup, grid_ ) &&
4228  MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
4229  LBlock<Complex> & LB = this->L( LBj( ksup, grid_ ) )[0];
4230  for( Int i = 0; i < LB.numRow; i++ ){
4231  if( LB.nzval(i, i).real() < 0 )
4232  inertiaLocal++;
4233  }
4234  }
4235  }
4236 
4237  // Sum the local counts over all processors
4238  mpi::Allreduce( &inertiaLocal, &inertia, 1, MPI_SUM, grid_->comm );
4239 
4240 #ifndef _RELEASE_
4241  PopCallStack();
4242 #endif
4243 
4244  return ;
4245  } // ----- end of method PMatrix::GetNegativeInertia -----
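
Both specializations count the negative diagonal entries of the diagonal supernodal blocks they own (taking the real part in the complex case) and sum the partial counts with an Allreduce; this count of negative pivots is what PEXSI uses to estimate the number of eigenvalues below a given shift. The sketch below shows the same local-count-plus-reduction structure on a plain array of diagonal entries; diagLocal and NegativeInertia are hypothetical names.

    #include <mpi.h>
    #include <vector>

    // Count locally owned negative diagonal entries and sum over all
    // processes; the count is accumulated in a double, as in the code above.
    double NegativeInertia( const std::vector<double>& diagLocal, MPI_Comm comm )
    {
      double inertiaLocal = 0.0;
      for( std::size_t i = 0; i < diagLocal.size(); i++ ){
        if( diagLocal[i] < 0.0 ) inertiaLocal += 1.0;
      }
      double inertia = 0.0;
      MPI_Allreduce( &inertiaLocal, &inertia, 1, MPI_DOUBLE, MPI_SUM, comm );
      return inertia;
    }
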
4246 
4247  template<typename T>
4248  inline PMatrix<T> * PMatrix<T>::Create(const GridType * pGridType, const SuperNodeType * pSuper, const PSelInvOptions * pSelInvOpt , const SuperLUOptions * pLuOpt)
4249  {
4250  PMatrix<T> * pMat = NULL;
4251  if(pLuOpt->Symmetric == 0){
4252  pMat = new PMatrixUnsym<T>( pGridType, pSuper, pSelInvOpt, pLuOpt );
4253  }
4254  else{
4255  pMat = new PMatrix<T>( pGridType, pSuper, pSelInvOpt, pLuOpt );
4256  }
4257 
4258  return pMat;
4259  } // ----- end of factory method PMatrix::Create -----
4260 
4261  template<typename T>
4262  inline PMatrix<T> * PMatrix<T>::Create(const SuperLUOptions * pLuOpt)
4263  {
4264  PMatrix<T> * pMat = NULL;
4265  if(pLuOpt->Symmetric == 0){
4266  pMat = new PMatrixUnsym<T>();
4267  }
4268  else{
4269  pMat = new PMatrix<T>();
4270  }
4271 
4272  return pMat;
4273  } // ----- end of factory method PMatrix::Create -----
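
Both Create overloads dispatch on the Symmetric flag of the SuperLU options: a PMatrixUnsym<T> is returned for unsymmetric problems and a plain PMatrix<T> otherwise, and the caller owns the returned pointer. A usage sketch with hypothetical grid, supernode and option objects:

    // gridPtr, superPtr, selInvOpt and luOpt are assumed to be set up elsewhere.
    luOpt.Symmetric = 0;   // request the unsymmetric implementation
    PMatrix<Complex>* PMloc =
        PMatrix<Complex>::Create( gridPtr, superPtr, &selInvOpt, &luOpt );
    // ... ConstructCommunicationPattern, PreSelInv, SelInv, etc. ...
    delete PMloc;          // the caller owns the object returned by Create
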
4274 } // namespace PEXSI
4275 
4276 
4277 
4278 #endif //_PEXSI_PSELINV_IMPL_HPP_