46 #ifndef _PEXSI_PSELINV_IMPL_HPP_
47 #define _PEXSI_PSELINV_IMPL_HPP_
56 #define MPI_MAX_COMM (1024)
57 #define BCAST_THRESHOLD 16
62 #ifdef COMMUNICATOR_PROFILE
64 class CommunicatorStat{
66 typedef std::map<std::vector<bool> , std::vector<Int> > bitMaskSet;
68 NumVec<Int> countSendToBelow_;
69 bitMaskSet maskSendToBelow_;
71 NumVec<Int> countRecvFromBelow_;
72 bitMaskSet maskRecvFromBelow_;
74 NumVec<Int> countSendToRight_;
75 bitMaskSet maskSendToRight_;
// Constructs a 2D (nprow x npcol) process grid over Bcomm: creates a private
// communicator from Bcomm's group, verifies MPI is initialized and that the
// communicator size matches nprow*npcol, then splits it into per-row and
// per-column sub-communicators.
// NOTE(review): several original lines (braces, `else`, declarations such as
// `info`) are missing from this extracted view; comments only annotate the
// visible statements.
83 inline GridType::GridType ( MPI_Comm Bcomm,
int nprow,
int npcol )
86 PushCallStack(
"GridType::GridType");
89 MPI_Initialized( &info );
// Grid construction requires MPI_Init to have been called already.
94 throw std::logic_error(
"MPI has not been initialized." );
// Duplicate Bcomm into a private communicator `comm` via its group.
97 MPI_Comm_group( Bcomm, &comm_group );
98 MPI_Comm_create( Bcomm, comm_group, &comm );
101 MPI_Comm_rank( comm, &mpirank );
102 MPI_Comm_size( comm, &mpisize );
// The grid must exactly tile the communicator.
103 if( mpisize != nprow * npcol ){
107 throw std::logic_error(
"mpisize != nprow * npcol." );
// Rank -> (row, col) mapping; SWAP_ROWS_COLS flips the orientation at
// compile time.
113 #ifdef SWAP_ROWS_COLS
114 Int myrow = mpirank % npcol;
115 Int mycol = mpirank / npcol;
117 Int myrow = mpirank / npcol;
118 Int mycol = mpirank % npcol;
// rowComm groups ranks sharing myrow (ordered by mycol); colComm groups
// ranks sharing mycol (ordered by myrow).
121 MPI_Comm_split( comm, myrow, mycol, &rowComm );
122 MPI_Comm_split( comm, mycol, myrow, &colComm );
// The group handle is no longer needed once `comm` exists.
124 MPI_Group_free( &comm_group );
// Destructor: releases the row, column, and grid communicators created in
// the constructor (RAII for the MPI resources this object owns).
134 inline GridType::~GridType ( )
137 PushCallStack(
"GridType::~GridType");
141 MPI_Comm_free( &rowComm );
142 MPI_Comm_free( &colComm );
143 MPI_Comm_free( &comm );
// Releases all per-matrix state: block-index arrays, the communication-
// pattern masks, and the six arrays of owning raw tree pointers (broadcast
// and reduction trees), each deleted and reset to NULL.
// NOTE(review): raw owning pointers + manual delete loops; std::unique_ptr
// would remove these, but ownership conventions elsewhere in the file are
// not visible from this chunk.
155 void PMatrix<T>::deallocate(){
162 ColBlockIdx_.clear();
163 RowBlockIdx_.clear();
// Communication masks describing the send/recv pattern per supernode.
169 isSendToBelow_.Clear();
170 isSendToRight_.Clear();
171 isSendToDiagonal_.Clear();
172 isSendToCrossDiagonal_.Clear();
174 isRecvFromBelow_.Clear();
175 isRecvFromAbove_.Clear();
176 isRecvFromLeft_.Clear();
177 isRecvFromCrossDiagonal_.Clear();
// Delete-and-null each non-NULL tree pointer in the six tree arrays.
182 for(
int i =0;i<fwdToBelowTree2_.size();++i){
183 if(fwdToBelowTree2_[i]!=NULL){
184 delete fwdToBelowTree2_[i];
185 fwdToBelowTree2_[i] = NULL;
188 for(
int i =0;i<fwdToRightTree2_.size();++i){
189 if(fwdToRightTree2_[i]!=NULL){
190 delete fwdToRightTree2_[i];
191 fwdToRightTree2_[i] = NULL;
195 for(
int i =0;i<fwdToBelowTree_.size();++i){
196 if(fwdToBelowTree_[i]!=NULL){
197 delete fwdToBelowTree_[i];
198 fwdToBelowTree_[i] = NULL;
202 for(
int i =0;i<fwdToRightTree_.size();++i){
203 if(fwdToRightTree_[i]!=NULL){
204 delete fwdToRightTree_[i];
205 fwdToRightTree_[i] = NULL;
210 for(
int i =0;i<redToLeftTree_.size();++i){
211 if(redToLeftTree_[i]!=NULL){
212 delete redToLeftTree_[i];
213 redToLeftTree_[i] = NULL;
216 for(
int i =0;i<redToAboveTree_.size();++i){
217 if(redToAboveTree_[i]!=NULL){
218 delete redToAboveTree_[i];
219 redToAboveTree_[i] = NULL;
// Close the communication-profiling stream if it was opened in Setup().
224 #if defined(COMM_PROFILE) || defined(COMM_PROFILE_BCAST)
225 if(commOFS.is_open()){
// Copy assignment: copies options, index arrays, the working set, and the
// communication masks, then deep-copies the six tree-pointer arrays via
// clone() on each non-NULL entry.
// NOTE(review): the visible lines do not show deletion of this object's
// pre-existing tree pointers before the resize/clone loops — confirm in the
// full source that reassignment does not leak the old trees.
234 PMatrix<T> & PMatrix<T>::operator = (
const PMatrix<T> & C){
241 options_ = C.options_;
242 optionsLU_ = C.optionsLU_;
246 limIndex_ = C.limIndex_;
248 ColBlockIdx_ = C.ColBlockIdx_;
249 RowBlockIdx_ = C.RowBlockIdx_;
253 workingSet_ = C.workingSet_;
// Communication masks are value-copied.
257 isSendToBelow_ = C.isSendToBelow_;
258 isSendToRight_ = C.isSendToRight_;
259 isSendToDiagonal_ = C.isSendToDiagonal_;
260 isSendToCrossDiagonal_ = C.isSendToCrossDiagonal_;
262 isRecvFromBelow_ = C.isRecvFromBelow_;
263 isRecvFromAbove_ = C.isRecvFromAbove_;
264 isRecvFromLeft_ = C.isRecvFromLeft_;
265 isRecvFromCrossDiagonal_ = C.isRecvFromCrossDiagonal_;
// Deep copy of the owning tree-pointer arrays: clone() only non-NULL slots.
268 fwdToBelowTree2_.resize(C.fwdToBelowTree2_.size());
269 for(
int i = 0 ; i< C.fwdToBelowTree2_.size();++i){
270 if(C.fwdToBelowTree2_[i]!=NULL){
271 fwdToBelowTree2_[i] = C.fwdToBelowTree2_[i]->clone();
274 fwdToRightTree2_.resize(C.fwdToRightTree2_.size());
275 for(
int i = 0 ; i< C.fwdToRightTree2_.size();++i){
276 if(C.fwdToRightTree2_[i]!=NULL){
277 fwdToRightTree2_[i] = C.fwdToRightTree2_[i]->clone();
281 fwdToBelowTree_.resize(C.fwdToBelowTree_.size());
282 for(
int i = 0 ; i< C.fwdToBelowTree_.size();++i){
283 if(C.fwdToBelowTree_[i]!=NULL){
284 fwdToBelowTree_[i] = C.fwdToBelowTree_[i]->clone();
287 fwdToRightTree_.resize(C.fwdToRightTree_.size());
288 for(
int i = 0 ; i< C.fwdToRightTree_.size();++i){
289 if(C.fwdToRightTree_[i]!=NULL){
290 fwdToRightTree_[i] = C.fwdToRightTree_[i]->clone();
295 redToLeftTree_.resize(C.redToLeftTree_.size());
296 for(
int i = 0 ; i< C.redToLeftTree_.size();++i){
297 if(C.redToLeftTree_[i]!=NULL){
298 redToLeftTree_[i] = C.redToLeftTree_[i]->clone();
301 redToAboveTree_.resize(C.redToAboveTree_.size());
302 for(
int i = 0 ; i< C.redToAboveTree_.size();++i){
303 if(C.redToAboveTree_[i]!=NULL){
304 redToAboveTree_[i] = C.redToAboveTree_[i]->clone();
// Copy constructor: delegates to the default constructor, then performs the
// same member-wise copy as operator= — options, index arrays, masks — and
// deep-copies the tree-pointer arrays via clone().
// NOTE(review): this duplicates the operator= body; the full source may or
// may not share a helper — confirm before refactoring.
314 PMatrix<T>::PMatrix(
const PMatrix<T> & C):PMatrix(){
320 options_ = C.options_;
321 optionsLU_ = C.optionsLU_;
324 limIndex_ = C.limIndex_;
326 ColBlockIdx_ = C.ColBlockIdx_;
327 RowBlockIdx_ = C.RowBlockIdx_;
331 workingSet_ = C.workingSet_;
// Communication masks are value-copied.
335 isSendToBelow_ = C.isSendToBelow_;
336 isSendToRight_ = C.isSendToRight_;
337 isSendToDiagonal_ = C.isSendToDiagonal_;
338 isSendToCrossDiagonal_ = C.isSendToCrossDiagonal_;
340 isRecvFromBelow_ = C.isRecvFromBelow_;
341 isRecvFromAbove_ = C.isRecvFromAbove_;
342 isRecvFromLeft_ = C.isRecvFromLeft_;
343 isRecvFromCrossDiagonal_ = C.isRecvFromCrossDiagonal_;
// Deep copy of the owning tree-pointer arrays: clone() only non-NULL slots.
348 fwdToBelowTree2_.resize(C.fwdToBelowTree2_.size());
349 for(
int i = 0 ; i< C.fwdToBelowTree2_.size();++i){
350 if(C.fwdToBelowTree2_[i]!=NULL){
351 fwdToBelowTree2_[i] = C.fwdToBelowTree2_[i]->clone();
354 fwdToRightTree2_.resize(C.fwdToRightTree2_.size());
355 for(
int i = 0 ; i< C.fwdToRightTree2_.size();++i){
356 if(C.fwdToRightTree2_[i]!=NULL){
357 fwdToRightTree2_[i] = C.fwdToRightTree2_[i]->clone();
361 fwdToBelowTree_.resize(C.fwdToBelowTree_.size());
362 for(
int i = 0 ; i< C.fwdToBelowTree_.size();++i){
363 if(C.fwdToBelowTree_[i]!=NULL){
364 fwdToBelowTree_[i] = C.fwdToBelowTree_[i]->clone();
367 fwdToRightTree_.resize(C.fwdToRightTree_.size());
368 for(
int i = 0 ; i< C.fwdToRightTree_.size();++i){
369 if(C.fwdToRightTree_[i]!=NULL){
370 fwdToRightTree_[i] = C.fwdToRightTree_[i]->clone();
375 redToLeftTree_.resize(C.redToLeftTree_.size());
376 for(
int i = 0 ; i< C.redToLeftTree_.size();++i){
377 if(C.redToLeftTree_[i]!=NULL){
378 redToLeftTree_[i] = C.redToLeftTree_[i]->clone();
381 redToAboveTree_.resize(C.redToAboveTree_.size());
382 for(
int i = 0 ; i< C.redToAboveTree_.size();++i){
383 if(C.redToAboveTree_[i]!=NULL){
384 redToAboveTree_[i] = C.redToAboveTree_[i]->clone();
// Default constructor (body not visible in this extracted view).
392 PMatrix<T>::PMatrix ( ){
// Parameterized constructor: forwards grid/supernode/options to Setup().
397 PMatrix<T>::PMatrix (
399 const SuperNodeType* s,
405 PushCallStack(
"PMatrix::PMatrix");
408 this->Setup( g, s, o, oLU );
// Destructor (cleanup body not visible in this extracted view).
418 PMatrix<T>::~PMatrix ( )
422 PushCallStack(
"PMatrix::~PMatrix");
// Initializes the distributed matrix storage: sizes the local L/U block
// arrays and block-index arrays from the grid layout, derives the tag limit
// limIndex_ from the MPI tag upper bound, and (optionally) opens a per-rank
// communication-profiling stream.
434 void PMatrix<T>::Setup(
436 const SuperNodeType* s,
442 PushCallStack(
"PMatrix::Setup");
458 ColBlockIdx_.clear();
459 RowBlockIdx_.clear();
// Local storage: one L entry per local block column, one U entry per local
// block row.
461 L_.resize( this->NumLocalBlockCol() );
462 U_.resize( this->NumLocalBlockRow() );
464 ColBlockIdx_.resize( this->NumLocalBlockCol() );
465 RowBlockIdx_.resize( this->NumLocalBlockRow() );
468 #if ( _DEBUGlevel_ >= 1 )
469 statusOFS << std::endl <<
"PMatrix is constructed. The grid information: " << std::endl;
470 statusOFS <<
"mpirank = " <<
MYPROC(grid_) << std::endl;
471 statusOFS <<
"myrow = " <<
MYROW(grid_) << std::endl;
472 statusOFS <<
"mycol = " <<
MYCOL(grid_) << std::endl;
// Largest supernode index usable before tags computed by IDX_TO_TAG would
// exceed maxTag_ (SELINV_TAG_COUNT tags are consumed per index).
482 limIndex_ = (Int)maxTag_/(Int)SELINV_TAG_COUNT -1;
// Per-rank communication statistics file, opened once.
485 #if defined(COMM_PROFILE) || defined(COMM_PROFILE_BCAST)
486 std::stringstream ss3;
487 ss3 <<
"comm_stat" <<
MYPROC(this->grid_);
488 if(!commOFS.is_open()){
489 commOFS.open( ss3.str().c_str());
// Fragment of the Sinv/L^T lookup-index computation (the enclosing function
// signature is not visible in this extracted view). Builds row/col offset
// tables for the received L-column and U-row blocks, packs U into UBuf,
// then for each (jb, ib) block pair copies the matching entries of the
// locally stored Sinv blocks into AinvBuf using relative row/col indices.
511 TIMER_START(Compute_Sinv_LT_Lookup_Indexes);
// Prefix sums: rowPtr[ib] / colPtr[jb] give the starting offset of each
// received block inside AinvBuf.
513 TIMER_START(Build_colptr_rowptr);
517 std::vector<Int> rowPtr(LcolRecv.size() + 1);
521 std::vector<Int> colPtr(UrowRecv.size() + 1);
524 for( Int ib = 0; ib < LcolRecv.size(); ib++ ){
525 rowPtr[ib+1] = rowPtr[ib] + LcolRecv[ib].numRow;
528 for( Int jb = 0; jb < UrowRecv.size(); jb++ ){
529 colPtr[jb+1] = colPtr[jb] + UrowRecv[jb].numCol;
// Totals are the last prefix-sum entries.
532 Int numRowAinvBuf = *rowPtr.rbegin();
533 Int numColAinvBuf = *colPtr.rbegin();
534 TIMER_STOP(Build_colptr_rowptr);
536 TIMER_START(Allocate_lookup);
538 AinvBuf.Resize( numRowAinvBuf, numColAinvBuf );
539 UBuf.Resize(
SuperSize( snode.Index, super_ ), numColAinvBuf );
545 TIMER_STOP(Allocate_lookup);
// Copy each received U block into its column slice of UBuf.
547 TIMER_START(Fill_UBuf);
549 for( Int jb = 0; jb < UrowRecv.size(); jb++ ){
552 throw std::logic_error(
"The size of UB is not right. Something is seriously wrong." );
555 UB.
numRow, UBuf.VecData( colPtr[jb] ),
SuperSize( snode.Index, super_ ) );
557 TIMER_STOP(Fill_UBuf);
// Main lookup loop over all (U block, L block) pairs.
561 TIMER_START(JB_Loop);
715 for( Int jb = 0; jb < UrowRecv.size(); jb++ ){
716 for( Int ib = 0; ib < LcolRecv.size(); ib++ ){
721 T* nzvalAinv = &AinvBuf( rowPtr[ib], colPtr[jb] );
722 Int ldAinv = AinvBuf.m();
// Case 1 (lower/diag): search the local Sinv L-column for blockIdx == isup,
// match rows one-by-one (linear search) to build relative row indices.
726 std::vector<LBlock<T> >& LcolSinv = this->L(
LBj(jsup, grid_ ) );
727 bool isBlockFound =
false;
728 TIMER_START(PARSING_ROW_BLOCKIDX);
729 for( Int ibSinv = 0; ibSinv < LcolSinv.size(); ibSinv++ ){
731 if( LcolSinv[ibSinv].blockIdx == isup ){
735 std::vector<Int> relRows( LB.
numRow );
736 Int* rowsLBPtr = LB.
rows.Data();
737 Int* rowsSinvBPtr = SinvB.
rows.Data();
738 for( Int i = 0; i < LB.
numRow; i++ ){
739 bool isRowFound =
false;
740 for( Int i1 = 0; i1 < SinvB.
numRow; i1++ ){
741 if( rowsLBPtr[i] == rowsSinvBPtr[i1] ){
747 if( isRowFound ==
false ){
748 std::ostringstream msg;
749 msg <<
"Row " << rowsLBPtr[i] <<
750 " in LB cannot find the corresponding row in SinvB" << std::endl
751 <<
"LB.rows = " << LB.
rows << std::endl
752 <<
"SinvB.rows = " << SinvB.
rows << std::endl;
753 throw std::runtime_error( msg.str().c_str() );
// Columns are relative to the supernode's first column.
758 std::vector<Int> relCols( UB.
numCol );
760 for( Int j = 0; j < UB.
numCol; j++ ){
761 relCols[j] = UB.
cols[j] - SinvColsSta;
// Gather the Sinv entries into the AinvBuf slice.
765 T* nzvalSinv = SinvB.
nzval.Data();
766 Int ldSinv = SinvB.
numRow;
767 for( Int j = 0; j < UB.
numCol; j++ ){
768 for( Int i = 0; i < LB.
numRow; i++ ){
769 nzvalAinv[i+j*ldAinv] =
770 nzvalSinv[relRows[i] + relCols[j] * ldSinv];
778 TIMER_STOP(PARSING_ROW_BLOCKIDX);
779 if( isBlockFound ==
false ){
780 std::ostringstream msg;
781 msg <<
"Block(" << isup <<
", " << jsup
782 <<
") did not find a matching block in Sinv." << std::endl;
783 throw std::runtime_error( msg.str().c_str() );
// Case 2 (upper): search the local Sinv U-row for blockIdx == jsup and
// match columns instead of rows; symmetric to Case 1.
787 std::vector<UBlock<T> >& UrowSinv = this->U(
LBi( isup, grid_ ) );
788 bool isBlockFound =
false;
789 TIMER_START(PARSING_COL_BLOCKIDX);
790 for( Int jbSinv = 0; jbSinv < UrowSinv.size(); jbSinv++ ){
792 if( UrowSinv[jbSinv].blockIdx == jsup ){
796 std::vector<Int> relRows( LB.
numRow );
798 for( Int i = 0; i < LB.
numRow; i++ ){
799 relRows[i] = LB.
rows[i] - SinvRowsSta;
803 std::vector<Int> relCols( UB.
numCol );
804 Int* colsUBPtr = UB.
cols.Data();
805 Int* colsSinvBPtr = SinvB.
cols.Data();
806 for( Int j = 0; j < UB.
numCol; j++ ){
807 bool isColFound =
false;
808 for( Int j1 = 0; j1 < SinvB.
numCol; j1++ ){
809 if( colsUBPtr[j] == colsSinvBPtr[j1] ){
815 if( isColFound ==
false ){
816 std::ostringstream msg;
817 msg <<
"Col " << colsUBPtr[j] <<
818 " in UB cannot find the corresponding row in SinvB" << std::endl
819 <<
"UB.cols = " << UB.
cols << std::endl
820 <<
"UinvB.cols = " << SinvB.
cols << std::endl;
821 throw std::runtime_error( msg.str().c_str() );
827 T* nzvalSinv = SinvB.
nzval.Data();
828 Int ldSinv = SinvB.
numRow;
829 for( Int j = 0; j < UB.
numCol; j++ ){
830 for( Int i = 0; i < LB.
numRow; i++ ){
831 nzvalAinv[i+j*ldAinv] =
832 nzvalSinv[relRows[i] + relCols[j] * ldSinv];
840 TIMER_STOP(PARSING_COL_BLOCKIDX);
841 if( isBlockFound ==
false ){
842 std::ostringstream msg;
843 msg <<
"Block(" << isup <<
", " << jsup
844 <<
") did not find a matching block in Sinv." << std::endl;
845 throw std::runtime_error( msg.str().c_str() );
857 TIMER_STOP(Compute_Sinv_LT_Lookup_Indexes);
// Fragment of the cross-diagonal exchange that updates U from the computed
// L update buffers (the function name/signature line is truncated in this
// extracted view). Phases: (1) serialize and Isend L-update data to
// cross-diagonal destinations; (2) Irecv message sizes; (3) Irecv message
// contents; (4) deserialize, copy into each matching U block (transposed),
// and verify every U block received an update; finally wait on the sends.
862 std::vector<SuperNodeBufferType > & arrSuperNodes,
866 TIMER_START(Send_CD_Update_U);
// Per-supernode offsets into the flat send/recv request and buffer arrays.
// NOTE(review): VLAs (`Int x[stepSuper]`) are a compiler extension, not
// standard C++ — flag for the full-source review.
870 Int sendOffset[stepSuper];
871 Int recvOffset[stepSuper];
873 for (Int supidx=0; supidx<stepSuper; supidx++){
875 sendOffset[supidx]=sendCount;
876 recvOffset[supidx]=recvCount;
877 sendCount+= CountSendToCrossDiagonal(snode.Index);
878 recvCount+= CountRecvFromCrossDiagonal(snode.Index);
882 std::vector<MPI_Request > arrMpiReqsSendCD(sendCount, MPI_REQUEST_NULL );
883 std::vector<MPI_Request > arrMpiReqsSizeSendCD(sendCount, MPI_REQUEST_NULL );
885 std::vector<MPI_Request > arrMpiReqsRecvCD(recvCount, MPI_REQUEST_NULL );
886 std::vector<MPI_Request > arrMpiReqsSizeRecvCD(recvCount, MPI_REQUEST_NULL );
887 std::vector<std::vector<char> > arrSstrLcolSendCD(sendCount);
888 std::vector<int > arrSstrLcolSizeSendCD(sendCount);
889 std::vector<std::vector<char> > arrSstrLcolRecvCD(recvCount);
890 std::vector<int > arrSstrLcolSizeRecvCD(recvCount);
// Phase 1: serialize RowLocalPtr/BlockIdxLocal/LUpdateBuf and post
// non-blocking size + content sends to every cross-diagonal destination.
892 for (Int supidx=0; supidx<stepSuper; supidx++){
898 TIMER_START(Send_L_CrossDiag);
900 if(
MYCOL( grid_ ) ==
PCOL( snode.Index, grid_ ) && isSendToCrossDiagonal_(grid_->numProcCol, snode.Index ) ){
903 for(Int dstCol = 0; dstCol<grid_->numProcCol; dstCol++){
904 if(isSendToCrossDiagonal_(dstCol,snode.Index) ){
905 Int dest =
PNUM(
PROW(snode.Index,grid_),dstCol,grid_);
907 if(
MYPROC( grid_ ) != dest ){
908 MPI_Request & mpiReqSizeSend = arrMpiReqsSizeSendCD[sendOffset[supidx]+sendIdx];
909 MPI_Request & mpiReqSend = arrMpiReqsSendCD[sendOffset[supidx]+sendIdx];
912 std::stringstream sstm;
913 std::vector<char> & sstrLcolSend = arrSstrLcolSendCD[sendOffset[supidx]+sendIdx];
914 Int & sstrSize = arrSstrLcolSizeSendCD[sendOffset[supidx]+sendIdx];
916 serialize( snode.RowLocalPtr, sstm, NO_MASK );
917 serialize( snode.BlockIdxLocal, sstm, NO_MASK );
918 serialize( snode.LUpdateBuf, sstm, NO_MASK );
920 sstrLcolSend.resize( Size(sstm) );
921 sstm.read( &sstrLcolSend[0], sstrLcolSend.size() );
922 sstrSize = sstrLcolSend.size();
924 #if ( _DEBUGlevel_ >= 1 )
925 assert(IDX_TO_TAG(snode.Rank,SELINV_TAG_L_SIZE_CD,limIndex_)<=maxTag_);
926 assert(IDX_TO_TAG(snode.Rank,SELINV_TAG_L_CONTENT_CD,limIndex_)<=maxTag_);
929 MPI_Isend( &sstrSize,
sizeof(sstrSize), MPI_BYTE, dest, IDX_TO_TAG(snode.Rank,SELINV_TAG_L_SIZE_CD,limIndex_), grid_->comm, &mpiReqSizeSend );
930 MPI_Isend( (
void*)&sstrLcolSend[0], sstrSize, MPI_BYTE, dest, IDX_TO_TAG(snode.Rank,SELINV_TAG_L_CONTENT_CD,limIndex_), grid_->comm, &mpiReqSend );
932 PROFILE_COMM(
MYPROC(this->grid_),dest,IDX_TO_TAG(snode.Rank,SELINV_TAG_L_SIZE_CD,limIndex_),
sizeof(sstrSize));
933 PROFILE_COMM(
MYPROC(this->grid_),dest,IDX_TO_TAG(snode.Rank,SELINV_TAG_L_CONTENT_CD,limIndex_),sstrSize);
943 TIMER_STOP(Send_L_CrossDiag);
// Phase 2: post size receives from all cross-diagonal sources.
// NOTE(review): size is sent as sizeof(sstrSize) MPI_BYTE but received as
// 1 MPI_INT — only consistent if Int is int; confirm in the full source.
948 for (Int supidx=0; supidx<stepSuper; supidx++){
951 if(
MYROW( grid_ ) ==
PROW( snode.Index, grid_ ) && isRecvFromCrossDiagonal_(grid_->numProcRow, snode.Index ) ){
953 for(Int srcRow = 0; srcRow<grid_->numProcRow; srcRow++){
954 if(isRecvFromCrossDiagonal_(srcRow,snode.Index) ){
955 Int src =
PNUM(srcRow,
PCOL(snode.Index,grid_),grid_);
956 if(
MYPROC( grid_ ) != src ){
957 Int & sstrSize = arrSstrLcolSizeRecvCD[recvOffset[supidx]+recvIdx];
958 MPI_Request & mpiReqSizeRecv = arrMpiReqsSizeRecvCD[recvOffset[supidx]+recvIdx];
960 #if ( _DEBUGlevel_ >= 1 )
961 assert(IDX_TO_TAG(snode.Rank,SELINV_TAG_L_CONTENT_CD,limIndex_)<=maxTag_);
963 MPI_Irecv( &sstrSize, 1, MPI_INT, src, IDX_TO_TAG(snode.Rank,SELINV_TAG_L_SIZE_CD,limIndex_), grid_->comm, &mpiReqSizeRecv);
972 mpi::Waitall(arrMpiReqsSizeRecvCD);
// Phase 3: sizes known — allocate buffers and post content receives.
974 for (Int supidx=0; supidx<stepSuper; supidx++){
977 if(
MYROW( grid_ ) ==
PROW( snode.Index, grid_ ) && isRecvFromCrossDiagonal_(grid_->numProcRow, snode.Index ) ){
979 for(Int srcRow = 0; srcRow<grid_->numProcRow; srcRow++){
980 if(isRecvFromCrossDiagonal_(srcRow,snode.Index) ){
981 Int src =
PNUM(srcRow,
PCOL(snode.Index,grid_),grid_);
982 if(
MYPROC( grid_ ) != src ){
983 Int & sstrSize = arrSstrLcolSizeRecvCD[recvOffset[supidx]+recvIdx];
984 std::vector<char> & sstrLcolRecv = arrSstrLcolRecvCD[recvOffset[supidx]+recvIdx];
985 MPI_Request & mpiReqRecv = arrMpiReqsRecvCD[recvOffset[supidx]+recvIdx];
986 sstrLcolRecv.resize( sstrSize);
988 #if ( _DEBUGlevel_ >= 1 )
989 assert(IDX_TO_TAG(snode.Rank,SELINV_TAG_L_CONTENT_CD,limIndex_)<=maxTag_);
991 MPI_Irecv( (
void*)&sstrLcolRecv[0], sstrSize, MPI_BYTE, src, IDX_TO_TAG(snode.Rank,SELINV_TAG_L_CONTENT_CD,limIndex_), grid_->comm, &mpiReqRecv );
1001 mpi::Waitall(arrMpiReqsRecvCD);
// Phase 4: deserialize each message (or reuse local buffers when the
// source is this process) and apply the transposed update to U.
1003 for (Int supidx=0; supidx<stepSuper; supidx++){
1008 if(
MYROW( grid_ ) ==
PROW( snode.Index, grid_ ) && isRecvFromCrossDiagonal_(grid_->numProcRow, snode.Index ) ){
1010 #if ( _DEBUGlevel_ >= 1 )
1011 statusOFS << std::endl <<
" ["<<snode.Index<<
"] "<<
"Update the upper triangular block" << std::endl << std::endl;
1012 statusOFS << std::endl <<
" ["<<snode.Index<<
"] "<<
"blockIdxLocal:" << snode.BlockIdxLocal << std::endl << std::endl;
1013 statusOFS << std::endl <<
" ["<<snode.Index<<
"] "<<
"rowLocalPtr:" << snode.RowLocalPtr << std::endl << std::endl;
1016 std::vector<UBlock<T> >& Urow = this->U(
LBi( snode.Index, grid_ ) );
1017 std::vector<bool> isBlockFound(Urow.size(),
false);
1020 for(Int srcRow = 0; srcRow<grid_->numProcRow; srcRow++){
1021 if(isRecvFromCrossDiagonal_(srcRow,snode.Index) ){
1022 Int src =
PNUM(srcRow,
PCOL(snode.Index,grid_),grid_);
1023 TIMER_START(Recv_L_CrossDiag);
1025 std::vector<Int> rowLocalPtrRecv;
1026 std::vector<Int> blockIdxLocalRecv;
// Remote source: deserialize from the received byte buffer; local source:
// copy directly from this supernode's own buffers.
1029 if(
MYPROC( grid_ ) != src ){
1030 std::stringstream sstm;
1031 Int & sstrSize = arrSstrLcolSizeRecvCD[recvOffset[supidx]+recvIdx];
1032 std::vector<char> & sstrLcolRecv = arrSstrLcolRecvCD[recvOffset[supidx]+recvIdx];
1033 sstm.write( &sstrLcolRecv[0], sstrSize );
1035 deserialize( rowLocalPtrRecv, sstm, NO_MASK );
1036 deserialize( blockIdxLocalRecv, sstm, NO_MASK );
1037 deserialize( UUpdateBuf, sstm, NO_MASK );
1043 rowLocalPtrRecv = snode.RowLocalPtr;
1044 blockIdxLocalRecv = snode.BlockIdxLocal;
1045 UUpdateBuf = snode.LUpdateBuf;
1050 TIMER_STOP(Recv_L_CrossDiag);
1052 #if ( _DEBUGlevel_ >= 1 )
1053 statusOFS<<
" ["<<snode.Index<<
"] P"<<
MYPROC(grid_)<<
" ("<<
MYROW(grid_)<<
","<<
MYCOL(grid_)<<
") <--- LBj("<<snode.Index<<
") <--- P"<<src<<std::endl;
1054 statusOFS << std::endl <<
" ["<<snode.Index<<
"] "<<
"rowLocalPtrRecv:" << rowLocalPtrRecv << std::endl << std::endl;
1055 statusOFS << std::endl <<
" ["<<snode.Index<<
"] "<<
"blockIdxLocalRecv:" << blockIdxLocalRecv << std::endl << std::endl;
// Match each received block index against the local U row; copy the slice
// of UUpdateBuf out and store it transposed into UB.nzval.
1060 for( Int ib = 0; ib < blockIdxLocalRecv.size(); ib++ ){
1061 for( Int jb = 0; jb < Urow.size(); jb++ ){
1063 if( UB.
blockIdx == blockIdxLocalRecv[ib] ){
1065 lapack::Lacpy(
'A', Ltmp.m(), Ltmp.n(),
1066 &UUpdateBuf( rowLocalPtrRecv[ib], 0 ),
1067 UUpdateBuf.m(), Ltmp.Data(), Ltmp.m() );
1068 isBlockFound[jb] =
true;
1069 Transpose( Ltmp, UB.
nzval );
// Sanity check: every local U block must have been updated.
1077 for( Int jb = 0; jb < Urow.size(); jb++ ){
1079 if( !isBlockFound[jb] ){
1083 throw std::logic_error(
"UBlock cannot find its update. Something is seriously wrong." );
1089 TIMER_STOP(Send_CD_Update_U);
// Drain the outstanding sends before the buffers go out of scope.
1091 mpi::Waitall(arrMpiReqsSizeSendCD);
1092 mpi::Waitall(arrMpiReqsSendCD);
// Fragment of the data-unpacking routine (signature line missing from this
// extracted view; presumably PMatrix<T>::UnpackData — confirm in full
// source). Non-owner ranks deserialize UrowRecv/LcolRecv from the broadcast
// tree buffers; owner ranks copy directly from the local U row / L column
// (skipping the diagonal L block when this row owns it).
1095 template<
typename T>
1101 #if ( _DEBUGlevel_ >= 1 )
1102 statusOFS << std::endl <<
"["<<snode.Index<<
"] "<<
"Unpack the received data for processors participate in Gemm. " << std::endl << std::endl;
// U part: ranks not on the owning row deserialize from the bcast buffer.
1105 if(
MYROW( grid_ ) !=
PROW( snode.Index, grid_ ) ){
1106 std::stringstream sstm;
1108 sstm.write( &snode.SstrUrowRecv[0], snode.SstrUrowRecv.size() );
1111 TreeBcast2<T> * bcastUTree2 = fwdToBelowTree2_[snode.Index];
1112 sstm.write( (
char*)bcastUTree2->GetLocalBuffer(),bcastUTree2->GetMsgSize());
1114 std::vector<Int> mask( UBlockMask::TOTAL_NUMBER, 1 );
1116 deserialize( numUBlock, sstm, NO_MASK );
1117 UrowRecv.resize( numUBlock );
1118 for( Int jb = 0; jb < numUBlock; jb++ ){
1119 deserialize( UrowRecv[jb], sstm, mask );
// Owning row: no communication, copy the local U blocks.
1127 UrowRecv.resize(this->U(
LBi( snode.Index, grid_ ) ).size());
1128 std::copy(this->U(
LBi( snode.Index, grid_ ) ).begin(),this->U(
LBi( snode.Index, grid_ )).end(),UrowRecv.begin());
// L part: ranks not on the owning column deserialize; NZVAL is masked out
// (structure only).
1133 if(
MYCOL( grid_ ) !=
PCOL( snode.Index, grid_ ) ){
1134 std::stringstream sstm;
1136 sstm.write( &snode.SstrLcolRecv[0], snode.SstrLcolRecv.size() );
1139 TreeBcast2<T> * bcastLTree2 = fwdToRightTree2_[snode.Index];
1140 sstm.write( (
char*)bcastLTree2->GetLocalBuffer(),bcastLTree2->GetMsgSize());
1142 std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
1143 mask[LBlockMask::NZVAL] = 0;
1145 deserialize( numLBlock, sstm, NO_MASK );
1146 LcolRecv.resize( numLBlock );
1147 for( Int ib = 0; ib < numLBlock; ib++ ){
1148 deserialize( LcolRecv[ib], sstm, mask );
// Owning column: copy local L blocks, skipping the first (diagonal) block
// when this rank is also on the owning row.
1154 std::vector<LBlock<T> >& Lcol = this->L(
LBj( snode.Index, grid_ ) );
1155 Int startIdx = (
MYROW( grid_ ) ==
PROW( snode.Index, grid_ ) )?1:0;
1156 LcolRecv.resize( Lcol.size() - startIdx );
1157 std::copy(Lcol.begin()+startIdx,Lcol.end(),LcolRecv.begin());
// Fragment of the diagonal-block update (signature line missing from this
// extracted view; presumably PMatrix<T>::ComputeDiagUpdate — confirm).
// On the owning column, accumulates DiagBuf -= LUpdateBuf^T * L over the
// off-diagonal L blocks via BLAS Gemm.
1161 template<
typename T>
1166 if(
MYCOL( grid_ ) ==
PCOL( snode.Index, grid_ ) ){
1167 #if ( _DEBUGlevel_ >= 2 )
1168 statusOFS << std::endl <<
"["<<snode.Index<<
"] "<<
"Updating the diagonal block" << std::endl << std::endl;
1170 std::vector<LBlock<T> >& Lcol = this->L(
LBj( snode.Index, grid_ ) );
// DiagBuf is square, sized by the supernode, zeroed before accumulation.
1173 snode.DiagBuf.Resize(
SuperSize( snode.Index, super_ ),
SuperSize( snode.Index, super_ ));
1174 SetValue(snode.DiagBuf, ZERO<T>());
// Skip the diagonal L block if this rank is on the owning row.
1177 Int startIb = (
MYROW( grid_ ) ==
PROW( snode.Index, grid_ ))?1:0;
1178 for( Int ib = startIb; ib < Lcol.size(); ib++ ){
1181 gemm_stat.push_back(snode.DiagBuf.m());
1182 gemm_stat.push_back(snode.DiagBuf.n());
1183 gemm_stat.push_back(Lcol[ib].numRow);
// DiagBuf += (-1) * LUpdateBuf(rows of this block)^T * LB.nzval.
1188 blas::Gemm(
'T',
'N', snode.DiagBuf.m(), snode.DiagBuf.n(), LB.
numRow,
1189 MINUS_ONE<T>(), &snode.LUpdateBuf( snode.RowLocalPtr[ib-startIb], 0 ), snode.LUpdateBuf.m(),
1190 LB.
nzval.Data(), LB.
nzval.m(), ONE<T>(), snode.DiagBuf.Data(), snode.DiagBuf.m() );
1193 #if ( _DEBUGlevel_ >= 1 )
1194 statusOFS << std::endl <<
"["<<snode.Index<<
"] "<<
"Updated the diagonal block" << std::endl << std::endl;
// Builds the supernodal elimination tree. Non-PARMETIS orderings map the
// column etree onto supernodes directly; otherwise each rank computes, for
// the supernode columns it owns, the smallest off-diagonal block index in
// the L column, and an MPI_MIN all-reduce merges the per-rank parents.
1201 template<
typename T>
1206 PushCallStack(
"PMatrix::GetEtree");
1207 double begin = MPI_Wtime( );
// Sequential path: derive supernode parents from the column etree.
1211 if( optionsLU_->ColPerm !=
"PARMETIS" ) {
1218 etree_supno.resize(this->
NumSuper());
1219 for(Int i = 0; i < superNode->etree.m(); ++i){
1220 Int curSnode = superNode->superIdx[i];
// A parent index past the etree length denotes a root -> map to NumSuper().
1221 Int parentSnode = (superNode->etree[i]>= superNode->etree.m()) ?this->
NumSuper():superNode->superIdx[superNode->etree[i]];
1222 if( curSnode != parentSnode){
1223 etree_supno[curSnode] = parentSnode;
// Parallel path: local minima initialized to nsupers (meaning "no parent").
1232 std::vector<Int> etree_supno_l( nsupers, nsupers );
1233 for( Int ksup = 0; ksup < nsupers; ksup++ ){
1234 if(
MYCOL( grid_ ) ==
PCOL( ksup, grid_ ) ){
1236 std::vector<LBlock<T> >& Lcol = this->L(
LBj(ksup, grid_) );
// Skip the diagonal block if this rank also owns the supernode's row.
1239 if(
MYROW( grid_ ) ==
PROW( ksup, grid_ ) )
1242 for( Int ib = firstBlk; ib < Lcol.size(); ib++ ){
1243 etree_supno_l[ksup] = std::min(etree_supno_l[ksup] , Lcol[ib].blockIdx);
1250 #if ( _DEBUGlevel_ >= 1 )
1251 statusOFS << std::endl <<
" Local supernodal elimination tree is " << etree_supno_l <<std::endl<<std::endl;
// Merge local minima across ranks; the last supernode is always a root.
1255 etree_supno.resize( nsupers );
1256 mpi::Allreduce( (Int*) &etree_supno_l[0],(Int *) &etree_supno[0], nsupers, MPI_MIN, grid_->comm );
1257 etree_supno[nsupers-1]=nsupers;
1261 double end = MPI_Wtime( );
1262 statusOFS<<
"Building the list took "<<end-begin<<
"s"<<std::endl;
// Builds the pipelined working set WSet: partitions supernodes into levels
// of the elimination tree (when pipelining is enabled), then splits levels
// so that no level exceeds maxPipelineDepth and no level straddles an MPI
// tag-wraparound boundary (multiples of limIndex_). With pipelining off,
// each supernode is its own singleton level.
1270 template<
typename T>
1272 std::vector<Int> & snodeEtree,
1273 std::vector<std::vector<Int> > & WSet)
1275 TIMER_START(Compute_WorkSet);
1279 if (options_->maxPipelineDepth!=1){
// maxDepth == -1 means "unlimited".
1281 Int maxDepth = options_->maxPipelineDepth;
1282 maxDepth=maxDepth==-1?std::numeric_limits<Int>::max():maxDepth;
1283 #if ( _DEBUGlevel_ >= 1 )
1284 statusOFS<<
"MaxDepth is "<<maxDepth<<endl;
// Level of each supernode = distance from the virtual root (parent's
// level + 1); processed bottom-up since parents have larger indices.
1290 Int rootParent = numSuper;
1296 level(rootParent)=-1;
1298 for(Int i=rootParent-1; i>=0; i-- ){
1299 level[i] = level[snodeEtree[i]]+1;
1300 numLevel = std::max(numLevel, level[i]);
// Count members per level so each WSet bucket can be reserved up front.
1308 for(Int i=rootParent-1; i>=0; i-- ){
1310 levelSize[level[i]]++;
1315 WSet.resize(numLevel,std::vector<Int>());
1316 for(Int i=0; i<numLevel; i++ ){
1317 WSet[i].reserve(levelSize(i));
1321 for(Int i=rootParent-1; i>=0; i-- ){
1342 WSet[level[i]].push_back(i);
// Post-process: split any level that is wider than `limit` or that would
// cross a tag-wraparound boundary (ranks in one level must share the same
// rank/limIndex_ quotient).
1349 Int limit = maxDepth;
1351 for (Int lidx=0; lidx<WSet.size() ; lidx++){
1360 Int maxRank = rank + WSet[lidx].size()-1;
1362 #if ( _DEBUGlevel_ >= 1 )
1363 statusOFS<< (Int)(rank/limIndex_) <<
" vs2 "<<(Int)(maxRank/limIndex_)<<std::endl;
1365 if( (Int)(rank/limIndex_) != (Int)(maxRank/limIndex_)){
1367 splitIdx = maxRank - maxRank%limIndex_ - rank;
1370 Int splitPoint = min((Int)WSet[lidx].size()-1,splitIdx>0?min(limit-1,splitIdx):limit-1);
1371 #if ( _DEBUGlevel_ >= 1 )
1373 statusOFS<<
"TEST SPLIT at "<<splitIdx<<
" "<<std::endl;
1376 split = split || (WSet[lidx].size()>limit);
1378 rank += splitPoint+1;
1380 if( split && splitPoint>0)
1382 #if ( _DEBUGlevel_ >= 1 )
1383 statusOFS<<
"SPLITPOINT is "<<splitPoint<<
" "<<std::endl;
1386 #if ( _DEBUGlevel_ >= 1 )
1387 if(splitPoint != limit-1){
1388 statusOFS<<
"-----------------------"<<std::endl;
1389 statusOFS<<lidx<<
": "<<orank<<
" -- "<<maxRank<<std::endl;
1390 statusOFS<<
" is NOW "<<std::endl;
1391 statusOFS<<lidx<<
": "<<orank<<
" -- "<<rank-1<<std::endl;
1392 statusOFS<<lidx+1<<
": "<<rank<<
" -- "<<maxRank<<std::endl;
1393 statusOFS<<
"-----------------------"<<std::endl;
// Move the tail of the current level into a freshly inserted next level.
1396 std::vector<std::vector<Int> >::iterator pos = WSet.begin()+lidx+1;
1397 WSet.insert(pos,std::vector<Int>());
1398 WSet[lidx+1].insert(WSet[lidx+1].begin(),WSet[lidx].begin() + splitPoint+1 ,WSet[lidx].end());
1399 WSet[lidx].erase(WSet[lidx].begin()+splitPoint+1,WSet[lidx].end());
1401 #if ( _DEBUGlevel_ >= 1 )
1402 if(splitPoint != limit-1){
1403 assert((orank+WSet[lidx].size()-1)%limIndex_==0);
// Non-pipelined fallback: one singleton level per supernode, in
// decreasing order.
1488 for( Int ksup = numSuper - 2; ksup >= 0; ksup-- ){
1489 WSet.push_back(std::vector<Int>());
1490 WSet.back().push_back(ksup);
1498 TIMER_STOP(Compute_WorkSet);
1499 #if ( _DEBUGlevel_ >= 1 )
1500 for (Int lidx=0; lidx<WSet.size() ; lidx++){
1501 statusOFS << std::endl <<
"L"<< lidx <<
" is: {";
1502 for (Int supidx=0; supidx<WSet[lidx].size() ; supidx++){
1503 statusOFS << WSet[lidx][supidx] <<
" ["<<snodeEtree[WSet[lidx][supidx]]<<
"] ";
1505 statusOFS <<
" }"<< std::endl;
1510 template<
typename T>
1514 #if defined (PROFILE) || defined(PMPI) || defined(USE_TAU)
1515 Real begin_SendULWaitContentFirst, end_SendULWaitContentFirst, time_SendULWaitContentFirst = 0;
1518 std::vector<std::vector<Int> > & superList = this->WorkingSet();
1519 Int numSteps = superList.size();
1520 Int stepSuper = superList[lidx].size();
1525 TIMER_START(AllocateBuffer);
1528 for (Int supidx=0; supidx<superList[lidx].size(); supidx++){
1529 Int snodeIdx = superList[lidx][supidx];
1531 TreeBcast * bcastLTree = fwdToRightTree_[snodeIdx];
1532 TreeBcast * bcastUTree = fwdToBelowTree_[snodeIdx];
1534 TreeBcast2<T> * bcastLTree = fwdToRightTree2_[snodeIdx];
1535 TreeBcast2<T> * bcastUTree = fwdToBelowTree2_[snodeIdx];
1539 bool participating =
MYROW( grid_ ) ==
PROW( snodeIdx, grid_ )
1540 ||
MYCOL( grid_ ) ==
PCOL( snodeIdx, grid_ )
1541 || CountSendToRight(snodeIdx) > 0
1542 || CountSendToBelow(snodeIdx) > 0
1543 || CountSendToCrossDiagonal(snodeIdx) > 0
1544 || CountRecvFromCrossDiagonal(snodeIdx) >0
1545 || ( isRecvFromLeft_( snodeIdx ) )
1546 || ( isRecvFromAbove_( snodeIdx ) )
1547 || isSendToDiagonal_(snodeIdx)
1548 || (bcastUTree!=NULL)
1549 || (bcastLTree!=NULL)
1551 || (redDTree!=NULL) ;
1561 std::vector<std::vector<MPI_Request> > arrMpireqsSendToBelow;
1562 arrMpireqsSendToBelow.resize( stepSuper, std::vector<MPI_Request>( 2 * grid_->numProcRow, MPI_REQUEST_NULL ));
1563 std::vector<std::vector<MPI_Request> > arrMpireqsSendToRight;
1564 arrMpireqsSendToRight.resize(stepSuper, std::vector<MPI_Request>( 2 * grid_->numProcCol, MPI_REQUEST_NULL ));
1569 std::vector<MPI_Request> arrMpireqsSendToLeft;
1570 arrMpireqsSendToLeft.resize(stepSuper, MPI_REQUEST_NULL );
1572 std::vector<MPI_Request> arrMpireqsSendToAbove;
1573 arrMpireqsSendToAbove.resize(stepSuper, MPI_REQUEST_NULL );
1578 std::vector<MPI_Request> arrMpireqsRecvSizeFromAny;
1579 arrMpireqsRecvSizeFromAny.resize(stepSuper*2 , MPI_REQUEST_NULL);
1580 std::vector<MPI_Request> arrMpireqsRecvContentFromAny;
1581 arrMpireqsRecvContentFromAny.resize(stepSuper*2 , MPI_REQUEST_NULL);
1585 std::vector<SuperNodeBufferType> arrSuperNodes(stepSuper);
1587 for (Int supidx=0; supidx<superList[lidx].size(); supidx++){
1588 Int snodeIdx = superList[lidx][supidx];
1590 TreeBcast * bcastLTree = fwdToRightTree_[snodeIdx];
1591 TreeBcast * bcastUTree = fwdToBelowTree_[snodeIdx];
1593 TreeBcast2<T> * bcastLTree = fwdToRightTree2_[snodeIdx];
1594 TreeBcast2<T> * bcastUTree = fwdToBelowTree2_[snodeIdx];
1598 bool participating =
MYROW( grid_ ) ==
PROW( snodeIdx, grid_ )
1599 ||
MYCOL( grid_ ) ==
PCOL( snodeIdx, grid_ )
1600 || CountSendToRight(snodeIdx) > 0
1601 || CountSendToBelow(snodeIdx) > 0
1602 || CountSendToCrossDiagonal(snodeIdx) > 0
1603 || CountRecvFromCrossDiagonal(snodeIdx) >0
1604 || ( isRecvFromLeft_( snodeIdx ) )
1605 || ( isRecvFromAbove_( snodeIdx ) )
1606 || isSendToDiagonal_(snodeIdx)
1607 || (bcastUTree!=NULL)
1608 || (bcastLTree!=NULL)
1610 || (redDTree!=NULL) ;
1613 arrSuperNodes[pos].Index = superList[lidx][supidx];
1614 arrSuperNodes[pos].Rank = rank;
1626 int numSentToLeft = 0;
1627 std::vector<int> reqSentToLeft;
1632 TIMER_STOP(AllocateBuffer);
1635 PushCallStack(
"PMatrix::SelInv_P2p::UpdateL");
1637 #if ( _DEBUGlevel_ >= 1 )
1638 statusOFS << std::endl <<
"Communication to the Schur complement." << std::endl << std::endl;
1642 vector<char> bcastUready(stepSuper,0);
1644 vector<char> bcastLready(stepSuper,0);
1650 TIMER_START(IRecv_Content_UL);
1655 for (Int supidx=0; supidx<stepSuper ; supidx++){
1658 MPI_Request * mpireqsRecvFromAbove = &arrMpireqsRecvContentFromAny[supidx*2];
1659 MPI_Request * mpireqsRecvFromLeft = &arrMpireqsRecvContentFromAny[supidx*2+1];
1662 if( isRecvFromAbove_( snode.Index ) &&
1663 MYROW( grid_ ) !=
PROW( snode.Index, grid_ ) ){
1666 TreeBcast2<T> * bcastUTree2 = fwdToBelowTree2_[snode.Index];
1668 if(bcastUTree2 != NULL){
1669 bcastUTree2->SetTag(IDX_TO_TAG(snode.Rank,SELINV_TAG_U_CONTENT,limIndex_));
1673 bool done = bcastUTree2->Progress();
1675 #if ( _DEBUGlevel_ >= 1 )
1676 statusOFS<<
"["<<snode.Index<<
"] "<<
" trying to progress bcast U "<<done<<std::endl;
1680 TreeBcast * bcastUTree = fwdToBelowTree_[snode.Index];
1681 if(bcastUTree!=NULL){
1682 Int myRoot = bcastUTree->GetRoot();
1683 snode.SizeSstrUrowRecv = bcastUTree->GetMsgSize();
1684 snode.SstrUrowRecv.resize( snode.SizeSstrUrowRecv);
1685 MPI_Irecv( &snode.SstrUrowRecv[0], snode.SizeSstrUrowRecv, MPI_BYTE,
1686 myRoot, IDX_TO_TAG(snode.Rank,SELINV_TAG_U_CONTENT,limIndex_),
1687 grid_->colComm, mpireqsRecvFromAbove );
1688 #if ( _DEBUGlevel_ >= 1 )
1689 statusOFS << std::endl <<
"["<<snode.Index<<
"] "<<
"Receiving U " << snode.SizeSstrUrowRecv <<
" BYTES from "<<myRoot<<
" on tag "<<IDX_TO_TAG(snode.Rank,SELINV_TAG_U_CONTENT,limIndex_)<< std::endl << std::endl;
1695 if( isRecvFromLeft_( snode.Index ) &&
1696 MYCOL( grid_ ) !=
PCOL( snode.Index, grid_ ) ){
1698 TreeBcast2<T> * bcastLTree2 = fwdToRightTree2_[snode.Index];
1700 if(bcastLTree2 != NULL){
1701 bcastLTree2->SetTag(IDX_TO_TAG(snode.Rank,SELINV_TAG_L_CONTENT,limIndex_));
1705 bool done = bcastLTree2->Progress();
1707 #if ( _DEBUGlevel_ >= 1 )
1708 statusOFS<<
"["<<snode.Index<<
"] "<<
" trying to progress bcast L "<<done<<std::endl;
1713 TreeBcast * bcastLTree = fwdToRightTree_[snode.Index];
1714 if(bcastLTree!=NULL){
1715 Int myRoot = bcastLTree->GetRoot();
1716 snode.SizeSstrLcolRecv = bcastLTree->GetMsgSize();
1717 snode.SstrLcolRecv.resize(snode.SizeSstrLcolRecv);
1718 MPI_Irecv( &snode.SstrLcolRecv[0], snode.SizeSstrLcolRecv, MPI_BYTE,
1719 myRoot, IDX_TO_TAG(snode.Rank,SELINV_TAG_L_CONTENT,limIndex_),
1720 grid_->rowComm, mpireqsRecvFromLeft );
1721 #if ( _DEBUGlevel_ >= 1 )
1722 statusOFS << std::endl <<
"["<<snode.Index<<
"] "<<
"Receiving L " << snode.SizeSstrLcolRecv <<
" BYTES from "<<myRoot<<
" on tag "<<IDX_TO_TAG(snode.Rank,SELINV_TAG_L_CONTENT,limIndex_)<< std::endl << std::endl;
1728 TIMER_STOP(IRecv_Content_UL);
1731 TIMER_START(ISend_Content_UL);
1732 for (Int supidx=0; supidx<stepSuper; supidx++){
1735 std::vector<MPI_Request> & mpireqsSendToBelow = arrMpireqsSendToBelow[supidx];
1736 std::vector<MPI_Request> & mpireqsSendToRight = arrMpireqsSendToRight[supidx];
1739 #if ( _DEBUGlevel_ >= 1 )
1740 statusOFS << std::endl <<
"["<<snode.Index<<
"] "
1741 <<
"Communication for the U part." << std::endl << std::endl;
1745 if(
MYROW( grid_ ) ==
PROW( snode.Index, grid_ ) ){
1746 std::vector<UBlock<T> >& Urow = this->U(
LBi(snode.Index, grid_) );
1748 TIMER_START(Serialize_UL);
1749 std::stringstream sstm;
1751 std::vector<Int> mask( UBlockMask::TOTAL_NUMBER, 1 );
1753 serialize( (Int)Urow.size(), sstm, NO_MASK );
1754 for( Int jb = 0; jb < Urow.size(); jb++ ){
1755 serialize( Urow[jb], sstm, mask );
1757 snode.SstrUrowSend.resize( Size( sstm ) );
1758 sstm.read( &snode.SstrUrowSend[0], snode.SstrUrowSend.size() );
1759 snode.SizeSstrUrowSend = snode.SstrUrowSend.size();
1760 TIMER_STOP(Serialize_UL);
1763 TreeBcast2<T> * bcastUTree2 = fwdToBelowTree2_[snode.Index];
1764 if(bcastUTree2!=NULL){
1765 bcastUTree2->SetTag(IDX_TO_TAG(snode.Rank,SELINV_TAG_U_CONTENT,limIndex_));
1766 bcastUTree2->SetLocalBuffer((T*)&snode.SstrUrowSend[0]);
1767 bcastUTree2->SetDataReady(
true);
1768 bool done = bcastUTree2->Progress();
1769 #if ( _DEBUGlevel_ >= 1 )
1770 statusOFS<<
"["<<snode.Index<<
"] "<<
" trying to progress bcast U "<<done<<std::endl;
1775 #if ( _DEBUGlevel_ >= 1 )
1776 for( Int idxRecv = 0; idxRecv < bcastUTree2->GetDestCount(); ++idxRecv ){
1777 Int iProcRow = bcastUTree2->GetDest(idxRecv);
1778 statusOFS << std::endl <<
"["<<snode.Index<<
"] "<<
"Sending U " << snode.SizeSstrUrowSend <<
" BYTES on tag "<<IDX_TO_TAG(snode.Rank,SELINV_TAG_U_CONTENT,limIndex_) << std::endl << std::endl;
1784 TreeBcast * bcastUTree = fwdToBelowTree_[snode.Index];
1785 if(bcastUTree!=NULL){
1786 bcastUTree->ForwardMessage((
char*)&snode.SstrUrowSend[0], snode.SizeSstrUrowSend,
1787 IDX_TO_TAG(snode.Rank,SELINV_TAG_U_CONTENT,limIndex_), &mpireqsSendToBelow[0]);
1788 for( Int idxRecv = 0; idxRecv < bcastUTree->GetDestCount(); ++idxRecv ){
1789 Int iProcRow = bcastUTree->GetDest(idxRecv);
1790 #if ( _DEBUGlevel_ >= 1 )
1791 statusOFS << std::endl <<
"["<<snode.Index<<
"] "<<
"Sending U " << snode.SizeSstrUrowSend <<
" BYTES on tag "<<IDX_TO_TAG(snode.Rank,SELINV_TAG_U_CONTENT,limIndex_) << std::endl << std::endl;
1800 #if ( _DEBUGlevel_ >= 1 )
1801 statusOFS << std::endl <<
"["<<snode.Index<<
"] "<<
"Communication for the L part." << std::endl << std::endl;
1807 if(
MYCOL( grid_ ) ==
PCOL( snode.Index, grid_ ) ){
1808 std::vector<LBlock<T> >& Lcol = this->L(
LBj(snode.Index, grid_) );
1809 TIMER_START(Serialize_UL);
1811 std::stringstream sstm;
1812 std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
1813 mask[LBlockMask::NZVAL] = 0;
1817 if(
MYROW( grid_ ) ==
PROW( snode.Index, grid_ ) )
1818 serialize( (Int)Lcol.size() - 1, sstm, NO_MASK );
1820 serialize( (Int)Lcol.size(), sstm, NO_MASK );
1822 for( Int ib = 0; ib < Lcol.size(); ib++ ){
1823 if( Lcol[ib].blockIdx > snode.Index ){
1824 #if ( _DEBUGlevel_ >= 2 )
1825 statusOFS << std::endl <<
"["<<snode.Index<<
"] "<<
"Serializing Block index " << Lcol[ib].blockIdx << std::endl;
1827 serialize( Lcol[ib], sstm, mask );
1830 snode.SstrLcolSend.resize( Size( sstm ) );
1831 sstm.read( &snode.SstrLcolSend[0], snode.SstrLcolSend.size() );
1832 snode.SizeSstrLcolSend = snode.SstrLcolSend.size();
1833 TIMER_STOP(Serialize_UL);
1836 TreeBcast2<T> * bcastLTree2 = fwdToRightTree2_[snode.Index];
1837 if(bcastLTree2!=NULL){
1838 bcastLTree2->SetTag(IDX_TO_TAG(snode.Rank,SELINV_TAG_L_CONTENT,limIndex_));
1839 bcastLTree2->SetLocalBuffer((T*)&snode.SstrLcolSend[0]);
1840 bcastLTree2->SetDataReady(
true);
1841 bool done = bcastLTree2->Progress();
1842 #if ( _DEBUGlevel_ >= 1 )
1843 statusOFS<<
"["<<snode.Index<<
"] "<<
" trying to progress bcast L "<<done<<std::endl;
1848 #if ( _DEBUGlevel_ >= 1 )
1849 for( Int idxRecv = 0; idxRecv < bcastLTree2->GetDestCount(); ++idxRecv ){
1850 Int iProcRow = bcastLTree2->GetDest(idxRecv);
1851 statusOFS << std::endl <<
"["<<snode.Index<<
"] "<<
"Sending L " << snode.SizeSstrLcolSend <<
" BYTES on tag "<<IDX_TO_TAG(snode.Rank,SELINV_TAG_L_CONTENT,limIndex_) << std::endl << std::endl;
1858 TreeBcast * bcastLTree = fwdToRightTree_[snode.Index];
1859 if(bcastLTree!=NULL){
1860 bcastLTree->ForwardMessage((
char*)&snode.SstrLcolSend[0], snode.SizeSstrLcolSend,
1861 IDX_TO_TAG(snode.Rank,SELINV_TAG_L_CONTENT,limIndex_), &mpireqsSendToRight[0]);
1863 for( Int idxRecv = 0; idxRecv < bcastLTree->GetDestCount(); ++idxRecv ){
1864 Int iProcCol = bcastLTree->GetDest(idxRecv);
1865 #if ( _DEBUGlevel_ >= 1 )
1866 statusOFS << std::endl <<
"["<<snode.Index<<
"] "<<
"Sending L " << snode.SizeSstrLcolSend <<
" BYTES on tag "<<IDX_TO_TAG(snode.Rank,SELINV_TAG_L_CONTENT,limIndex_) << std::endl << std::endl;
1873 TIMER_STOP(ISend_Content_UL);
1876 vector<char> redLdone(stepSuper,0);
1877 for (Int supidx=0; supidx<stepSuper; supidx++){
1881 if(redLTree != NULL){
1882 redLTree->SetTag(IDX_TO_TAG(snode.Rank,SELINV_TAG_L_REDUCE,limIndex_));
1884 redLTree->AllocRecvBuffers();
1886 redLTree->PostFirstRecv();
1891 TIMER_START(Compute_Sinv_LT);
1893 Int msgForwarded = 0;
1895 Int gemmProcessed = 0;
1899 std::list<Int> readySupidx;
1901 for(Int supidx = 0;supidx<stepSuper;supidx++){
1905 if( isRecvFromAbove_( snode.Index ) && isRecvFromLeft_( snode.Index )){
1907 if(
MYCOL( grid_ ) ==
PCOL( snode.Index, grid_ ) ){
1911 if(
MYROW( grid_ ) ==
PROW( snode.Index, grid_ ) ){
1915 if(snode.isReady==2){
1916 readySupidx.push_back(supidx);
1917 #if ( _DEBUGlevel_ >= 1 )
1918 statusOFS<<std::endl<<
"Locally processing ["<<snode.Index<<
"]"<<std::endl;
1922 else if( (isRecvFromLeft_( snode.Index ) ) &&
MYCOL( grid_ ) !=
PCOL( snode.Index, grid_ ) )
1927 if(redLTree != NULL){
1928 TIMER_START(Reduce_Sinv_LT_Isend);
1930 redLTree->SetLocalBuffer(NULL);
1931 redLTree->SetDataReady(
true);
1933 bool done = redLTree->Progress();
1935 #if ( _DEBUGlevel_ >= 1 )
1936 statusOFS<<
"["<<snode.Index<<
"] "<<
" trying to progress reduce L "<<done<<std::endl;
1938 TIMER_STOP(Reduce_Sinv_LT_Isend);
1942 if(
MYROW(grid_)!=
PROW(snode.Index,grid_)){
1944 TreeBcast2<T> * bcastUTree2 = fwdToBelowTree2_[snode.Index];
1945 if(bcastUTree2 != NULL){
1946 if(bcastUTree2->GetDestCount()>0){
1951 TreeBcast * bcastUTree = fwdToBelowTree_[snode.Index];
1952 if(bcastUTree != NULL){
1953 if(bcastUTree->GetDestCount()>0){
1960 if(
MYCOL(grid_)!=
PCOL(snode.Index,grid_)){
1962 TreeBcast2<T> * bcastLTree2 = fwdToRightTree2_[snode.Index];
1963 if(bcastLTree2 != NULL){
1964 if(bcastLTree2->GetDestCount()>0){
1970 TreeBcast * bcastLTree = fwdToRightTree_[snode.Index];
1971 if(bcastLTree != NULL){
1972 if(bcastLTree->GetDestCount()>0){
1980 #if ( _DEBUGlevel_ >= 1 )
1981 statusOFS<<std::endl<<
"gemmToDo ="<<gemmToDo<<std::endl;
1982 statusOFS<<std::endl<<
"msgToFwd ="<<msgToFwd<<std::endl;
1986 #if defined (PROFILE)
1987 end_SendULWaitContentFirst=0;
1988 begin_SendULWaitContentFirst=0;
1991 while(gemmProcessed<gemmToDo || msgForwarded < msgToFwd)
1993 Int reqidx = MPI_UNDEFINED;
2004 TIMER_START(WaitContent_UL);
2005 #if defined(PROFILE)
2006 if(begin_SendULWaitContentFirst==0){
2007 begin_SendULWaitContentFirst=1;
2008 TIMER_START(WaitContent_UL_First);
2018 for (Int supidx2=0; supidx2<stepSuper; supidx2++){
2021 TreeBcast2<T> * bcastUTree2 = fwdToBelowTree2_[snode.Index];
2023 if(bcastUTree2 != NULL ){
2024 if(!bcastUready[supidx2]){
2025 if(!bcastUTree2->IsRoot()){
2026 bool ready = bcastUTree2->IsDataReceived();
2038 if( isRecvFromAbove_( snode.Index ) && isRecvFromLeft_( snode.Index )){
2041 if(snode.isReady==2){
2042 readySupidx.push_back(supidx2);
2043 #if defined(PROFILE)
2044 if(end_SendULWaitContentFirst==0){
2045 TIMER_STOP(WaitContent_UL_First);
2046 end_SendULWaitContentFirst=1;
2052 bcastUready[supidx2]=1;
2054 for( Int idxRecv = 0; idxRecv < bcastUTree2->GetDestCount(); ++idxRecv ){
2055 Int iProcRow = bcastUTree2->GetDest(idxRecv);
2056 statusOFS << std::endl <<
"MATDEBUG ["<<snode.Index<<
"] "<<
"Forwarded U " << snode.SizeSstrUrowRecv <<
" BYTES to "<<iProcRow<<
" on tag "<<IDX_TO_TAG(snode.Rank,SELINV_TAG_U_CONTENT,limIndex_)<< std::endl << std::endl;
2064 bcastUready[supidx2]=1;
2068 bool done = bcastUTree2->Progress();
2070 #if ( _DEBUGlevel_ >= 1 )
2071 statusOFS<<
"["<<snode.Index<<
"] "<<
" trying to progress bcast U "<<done<<std::endl;
2083 TreeBcast2<T> * bcastLTree2 = fwdToRightTree2_[snode.Index];
2085 if(bcastLTree2 != NULL ){
2086 if(!bcastLready[supidx2]){
2087 if(!bcastLTree2->IsRoot()){
2088 bool ready = bcastLTree2->IsDataReceived();
2098 if( isRecvFromAbove_( snode.Index ) && isRecvFromLeft_( snode.Index )){
2101 if(snode.isReady==2){
2102 readySupidx.push_back(supidx2);
2103 #if defined(PROFILE)
2104 if(end_SendULWaitContentFirst==0){
2105 TIMER_STOP(WaitContent_UL_First);
2106 end_SendULWaitContentFirst=1;
2112 bcastLready[supidx2]=1;
2114 for( Int idxRecv = 0; idxRecv < bcastLTree2->GetDestCount(); ++idxRecv ){
2115 Int iProcRow = bcastLTree2->GetDest(idxRecv);
2116 statusOFS << std::endl <<
"MATDEBUG ["<<snode.Index<<
"] "<<
"Forwarded L " << snode.SizeSstrLcolRecv <<
" BYTES to "<<iProcRow<<
" on tag "<<IDX_TO_TAG(snode.Rank,SELINV_TAG_L_CONTENT,limIndex_)<< std::endl << std::endl;
2124 bcastLready[supidx2]=1;
2128 bool done = bcastLTree2->Progress();
2130 #if ( _DEBUGlevel_ >= 1 )
2131 statusOFS<<
"["<<snode.Index<<
"] "<<
" trying to progress bcast L "<<done<<std::endl;
2151 int reqIndices[arrMpireqsRecvContentFromAny.size()];
2154 int err = MPI_Waitsome(2*stepSuper, &arrMpireqsRecvContentFromAny[0], &numRecv, reqIndices, MPI_STATUSES_IGNORE);
2155 assert(err==MPI_SUCCESS);
2157 for(
int i =0;i<numRecv;i++){
2158 reqidx = reqIndices[i];
2160 if(reqidx!=MPI_UNDEFINED)
2169 TreeBcast * bcastUTree = fwdToBelowTree_[snode.Index];
2170 if(bcastUTree != NULL){
2171 if(bcastUTree->GetDestCount()>0){
2173 std::vector<MPI_Request> & mpireqsSendToBelow = arrMpireqsSendToBelow[supidx];
2174 #if ( _DEBUGlevel_ >= 1 )
2175 for( Int idxRecv = 0; idxRecv < bcastUTree->GetDestCount(); ++idxRecv ){
2176 Int iProcRow = bcastUTree->GetDest(idxRecv);
2177 statusOFS << std::endl <<
"["<<snode.Index<<
"] "<<
"Forwarding U " << snode.SizeSstrUrowRecv <<
" BYTES to "<<iProcRow<<
" on tag "<<IDX_TO_TAG(snode.Rank,SELINV_TAG_U_CONTENT,limIndex_)<< std::endl << std::endl;
2181 bcastUTree->ForwardMessage( (
char*)&snode.SstrUrowRecv[0], snode.SizeSstrUrowRecv,
2182 IDX_TO_TAG(snode.Rank,SELINV_TAG_U_CONTENT,limIndex_), &mpireqsSendToBelow[0] );
2183 #if ( _DEBUGlevel_ >= 1 )
2184 for( Int idxRecv = 0; idxRecv < bcastUTree->GetDestCount(); ++idxRecv ){
2185 Int iProcRow = bcastUTree->GetDest(idxRecv);
2186 statusOFS << std::endl <<
"["<<snode.Index<<
"] "<<
"Forwarded U " << snode.SizeSstrUrowRecv <<
" BYTES to "<<iProcRow<<
" on tag "<<IDX_TO_TAG(snode.Rank,SELINV_TAG_U_CONTENT,limIndex_)<< std::endl << std::endl;
2195 else if(reqidx%2==1){
2196 TreeBcast * bcastLTree = fwdToRightTree_[snode.Index];
2197 if(bcastLTree != NULL){
2198 if(bcastLTree->GetDestCount()>0){
2200 std::vector<MPI_Request> & mpireqsSendToRight = arrMpireqsSendToRight[supidx];
2201 #if ( _DEBUGlevel_ >= 1 )
2202 for( Int idxRecv = 0; idxRecv < bcastLTree->GetDestCount(); ++idxRecv ){
2203 Int iProcCol = bcastLTree->GetDest(idxRecv);
2204 statusOFS << std::endl <<
"["<<snode.Index<<
"] "<<
"Forwarding L " << snode.SizeSstrLcolRecv <<
" BYTES to "<<iProcCol<<
" on tag "<<IDX_TO_TAG(snode.Rank,SELINV_TAG_L_CONTENT,limIndex_)<< std::endl << std::endl;
2208 bcastLTree->ForwardMessage( (
char*)&snode.SstrLcolRecv[0], snode.SizeSstrLcolRecv,
2209 IDX_TO_TAG(snode.Rank,SELINV_TAG_L_CONTENT,limIndex_), &mpireqsSendToRight[0]);
2215 #if ( _DEBUGlevel_ >= 1 )
2216 for( Int idxRecv = 0; idxRecv < bcastLTree->GetDestCount(); ++idxRecv ){
2217 Int iProcCol = bcastLTree->GetDest(idxRecv);
2218 statusOFS << std::endl <<
"["<<snode.Index<<
"] "<<
"Forwarded L " << snode.SizeSstrLcolRecv <<
" BYTES to "<<iProcCol<<
" on tag "<<IDX_TO_TAG(snode.Rank,SELINV_TAG_L_CONTENT,limIndex_)<< std::endl << std::endl;
2227 #if ( _DEBUGlevel_ >= 1 )
2228 statusOFS<<std::endl<<
"Received data for ["<<snode.Index<<
"] reqidx%2="<<reqidx%2<<
" is ready ?"<<snode.isReady<<std::endl;
2231 if( isRecvFromAbove_( snode.Index ) && isRecvFromLeft_( snode.Index )){
2235 if(snode.isReady==2){
2236 readySupidx.push_back(supidx);
2238 #if defined(PROFILE)
2239 if(end_SendULWaitContentFirst==0){
2240 TIMER_STOP(WaitContent_UL_First);
2241 end_SendULWaitContentFirst=1;
2251 TIMER_STOP(WaitContent_UL);
2253 }
while( (gemmProcessed<gemmToDo && readySupidx.size()==0) || (gemmProcessed==gemmToDo && msgForwarded<msgToFwd) );
2256 if(readySupidx.size()>0)
2258 supidx = readySupidx.back();
2259 readySupidx.pop_back();
2264 if( isRecvFromAbove_( snode.Index ) && isRecvFromLeft_( snode.Index ) ){
2266 std::vector<LBlock<T> > LcolRecv;
2267 std::vector<UBlock<T> > UrowRecv;
2271 UnpackData(snode, LcolRecv, UrowRecv);
2274 TreeBcast2<T> * bcastUTree2 = fwdToBelowTree2_[snode.Index];
2275 TreeBcast2<T> * bcastLTree2 = fwdToRightTree2_[snode.Index];
2276 if(bcastUTree2->IsDone()){
2277 bcastUTree2->CleanupBuffers();
2279 if(bcastLTree2->IsDone()){
2280 bcastLTree2->CleanupBuffers();
2285 SelInv_lookup_indexes(snode,LcolRecv, UrowRecv,AinvBuf,UBuf);
2287 snode.LUpdateBuf.Resize( AinvBuf.m(),
SuperSize( snode.Index, super_ ) );
2290 gemm_stat.push_back(AinvBuf.m());
2291 gemm_stat.push_back(UBuf.m());
2292 gemm_stat.push_back(AinvBuf.n());
2295 TIMER_START(Compute_Sinv_LT_GEMM);
2296 blas::Gemm(
'N',
'T', AinvBuf.m(), UBuf.m(), AinvBuf.n(), MINUS_ONE<T>(),
2297 AinvBuf.Data(), AinvBuf.m(),
2298 UBuf.Data(), UBuf.m(), ZERO<T>(),
2299 snode.LUpdateBuf.Data(), snode.LUpdateBuf.m() );
2300 TIMER_STOP(Compute_Sinv_LT_GEMM);
2303 #if ( _DEBUGlevel_ >= 2 )
2304 statusOFS << std::endl <<
"["<<snode.Index<<
"] "<<
"snode.LUpdateBuf: " << snode.LUpdateBuf << std::endl;
2310 if(redLTree != NULL){
2311 assert( snode.LUpdateBuf.m() != 0 && snode.LUpdateBuf.n() != 0 );
2312 TIMER_START(Reduce_Sinv_LT_Isend);
2314 redLTree->SetLocalBuffer(snode.LUpdateBuf.Data());
2315 redLTree->SetDataReady(
true);
2317 bool done = redLTree->Progress();
2318 #if ( _DEBUGlevel_ >= 1 )
2319 statusOFS<<
"["<<snode.Index<<
"] "<<
" trying to progress reduce L "<<done<<std::endl;
2321 TIMER_STOP(Reduce_Sinv_LT_Isend);
2326 #if ( _DEBUGlevel_ >= 1 )
2327 statusOFS<<std::endl<<
"gemmProcessed ="<<gemmProcessed<<
"/"<<gemmToDo<<std::endl;
2331 for (Int supidx=0; supidx<stepSuper; supidx++){
2334 if(redLTree != NULL && !redLdone[supidx]){
2335 bool done = redLTree->Progress();
2337 #if ( _DEBUGlevel_ >= 1 )
2338 statusOFS<<
"["<<snode.Index<<
"] "<<
" trying to progress reduce L "<<done<<std::endl;
2346 TIMER_STOP(Compute_Sinv_LT);
2351 TIMER_START(Reduce_Sinv_LT);
2353 bool all_done =
false;
2358 for (Int supidx=0; supidx<stepSuper; supidx++){
2364 if(redLTree != NULL )
2366 if(redLTree != NULL && !redLdone[supidx])
2370 bool done = redLTree->Progress();
2371 #if ( _DEBUGlevel_ >= 1 )
2372 statusOFS<<
"["<<snode.Index<<
"] "<<
" trying to progress reduce L "<<done<<std::endl;
2374 redLdone[supidx]=done?1:0;
2377 #if ( _DEBUGlevel_ >= 1 )
2378 statusOFS<<
"["<<snode.Index<<
"] "<<
" DONE reduce L"<<std::endl;
2382 if(
MYCOL( grid_ ) ==
PCOL( snode.Index, grid_ ) ){
2384 Int numRowLUpdateBuf;
2385 std::vector<LBlock<T> >& Lcol = this->L(
LBj( snode.Index, grid_ ) );
2386 if(
MYROW( grid_ ) !=
PROW( snode.Index, grid_ ) ){
2387 snode.RowLocalPtr.resize( Lcol.size() + 1 );
2388 snode.BlockIdxLocal.resize( Lcol.size() );
2389 snode.RowLocalPtr[0] = 0;
2390 for( Int ib = 0; ib < Lcol.size(); ib++ ){
2391 snode.RowLocalPtr[ib+1] = snode.RowLocalPtr[ib] + Lcol[ib].numRow;
2392 snode.BlockIdxLocal[ib] = Lcol[ib].blockIdx;
2396 snode.RowLocalPtr.resize( Lcol.size() );
2397 snode.BlockIdxLocal.resize( Lcol.size() - 1 );
2398 snode.RowLocalPtr[0] = 0;
2399 for( Int ib = 1; ib < Lcol.size(); ib++ ){
2400 snode.RowLocalPtr[ib] = snode.RowLocalPtr[ib-1] + Lcol[ib].numRow;
2401 snode.BlockIdxLocal[ib-1] = Lcol[ib].blockIdx;
2404 numRowLUpdateBuf = *snode.RowLocalPtr.rbegin();
2406 if( numRowLUpdateBuf > 0 ){
2407 if( snode.LUpdateBuf.m() == 0 && snode.LUpdateBuf.n() == 0 ){
2408 snode.LUpdateBuf.Resize( numRowLUpdateBuf,
SuperSize( snode.Index, super_ ) );
2409 SetValue(snode.LUpdateBuf, ZERO<T>());
2414 redLTree->SetLocalBuffer(snode.LUpdateBuf.Data());
2419 redLTree->CleanupBuffers();
2422 all_done = all_done && (done || redLdone[supidx]);
2426 TIMER_STOP(Reduce_Sinv_LT);
2435 PushCallStack(
"PMatrix::SelInv_P2p::UpdateD");
2438 TIMER_START(Update_Diagonal);
2440 for (Int supidx=0; supidx<stepSuper; supidx++){
2443 ComputeDiagUpdate(snode);
2448 if(redDTree != NULL){
2450 if(
MYROW( grid_ ) ==
PROW( snode.Index, grid_ ) ){
2451 if(snode.DiagBuf.Size()==0){
2452 snode.DiagBuf.Resize(
SuperSize( snode.Index, super_ ),
SuperSize( snode.Index, super_ ));
2453 SetValue(snode.DiagBuf, ZERO<T>());
2458 redDTree->SetLocalBuffer(snode.DiagBuf.Data());
2459 if(!redDTree->IsAllocated()){
2460 redDTree->SetTag(IDX_TO_TAG(snode.Rank,SELINV_TAG_D_REDUCE,limIndex_));
2461 redDTree->AllocRecvBuffers();
2463 redDTree->PostFirstRecv();
2466 redDTree->SetDataReady(
true);
2467 bool done = redDTree->Progress();
2468 #if ( _DEBUGlevel_ >= 1 )
2469 statusOFS<<
"["<<snode.Index<<
"] "<<
" trying to progress reduce D "<<done<<std::endl;
2474 for (Int supidx=0; supidx<stepSuper; supidx++){
2477 if(redDTree != NULL){
2479 if(redDTree->IsAllocated())
2482 bool done = redDTree->Progress();
2483 #if ( _DEBUGlevel_ >= 1 )
2484 statusOFS<<
"["<<snode.Index<<
"] "<<
" trying to progress reduce D "<<done<<std::endl;
2490 TIMER_STOP(Update_Diagonal);
2492 TIMER_START(Reduce_Diagonal);
2495 vector<char> is_done(stepSuper,0);
2496 bool all_done =
false;
2501 for (Int supidx=0; supidx<stepSuper; supidx++){
2506 if(redDTree != NULL )
2508 if(redDTree != NULL && !is_done[supidx])
2512 bool done = redDTree->Progress();
2513 #if ( _DEBUGlevel_ >= 1 )
2514 statusOFS<<
"["<<snode.Index<<
"] "<<
" trying to progress reduce D "<<done<<std::endl;
2516 is_done[supidx]=done?1:0;
2519 #if ( _DEBUGlevel_ >= 1 )
2520 statusOFS<<
"["<<snode.Index<<
"] "<<
" DONE reduce D"<<std::endl;
2522 if(
MYCOL( grid_ ) ==
PCOL( snode.Index, grid_ ) ){
2523 if(
MYROW( grid_ ) ==
PROW( snode.Index, grid_ ) ){
2524 LBlock<T> & LB = this->L(
LBj( snode.Index, grid_ ) )[0];
2526 blas::Axpy( LB.
numRow * LB.
numCol, ONE<T>(), snode.DiagBuf.Data(), 1, LB.
nzval.Data(), 1 );
2527 Symmetrize( LB.
nzval );
2533 redDTree->CleanupBuffers();
2536 all_done = all_done && (done || is_done[supidx]);
2541 TIMER_STOP(Reduce_Diagonal);
2549 PushCallStack(
"PMatrix::SelInv_P2p::UpdateU");
2554 SendRecvCD_UpdateU(arrSuperNodes, stepSuper);
2561 PushCallStack(
"PMatrix::SelInv_P2p::UpdateLFinal");
2564 TIMER_START(Update_L);
2565 for (Int supidx=0; supidx<stepSuper; supidx++){
2568 #if ( _DEBUGlevel_ >= 1 )
2569 statusOFS << std::endl <<
"["<<snode.Index<<
"] "<<
"Finish updating the L part by filling LUpdateBufReduced back to L" << std::endl << std::endl;
2572 if(
MYCOL( grid_ ) ==
PCOL( snode.Index, grid_ ) && snode.LUpdateBuf.m() > 0 ){
2573 std::vector<LBlock<T> >& Lcol = this->L(
LBj( snode.Index, grid_ ) );
2575 Int startBlock = (
MYROW( grid_ ) ==
PROW( snode.Index, grid_ ))?1:0;
2576 for( Int ib = startBlock; ib < Lcol.size(); ib++ ){
2578 lapack::Lacpy(
'A', LB.
numRow, LB.
numCol, &snode.LUpdateBuf( snode.RowLocalPtr[ib-startBlock], 0 ),
2579 snode.LUpdateBuf.m(), LB.
nzval.Data(), LB.
numRow );
2583 TIMER_STOP(Update_L);
2590 TIMER_START(Barrier);
2594 for (Int supidx=0; supidx<stepSuper; supidx++){
2596 TreeBcast2<T> * &bcastUTree2 = fwdToBelowTree2_[snode.Index];
2598 if(bcastUTree2 != NULL){
2599 bcastUTree2->Wait();
2600 bcastUTree2->CleanupBuffers();
2605 TreeBcast2<T> * &bcastLTree2 = fwdToRightTree2_[snode.Index];
2607 if(bcastLTree2 != NULL){
2608 bcastLTree2->Wait();
2609 bcastLTree2->CleanupBuffers();
2617 for (Int supidx=0; supidx<stepSuper; supidx++){
2622 if(redLTree != NULL){
2624 redLTree->CleanupBuffers();
2630 for (Int supidx=0; supidx<stepSuper; supidx++){
2634 if(redDTree != NULL){
2636 redDTree->CleanupBuffers();
2643 mpi::Waitall(arrMpireqsRecvContentFromAny);
2644 for (Int supidx=0; supidx<stepSuper; supidx++){
2646 Int ksup = snode.Index;
2647 std::vector<MPI_Request> & mpireqsSendToRight = arrMpireqsSendToRight[supidx];
2648 std::vector<MPI_Request> & mpireqsSendToBelow = arrMpireqsSendToBelow[supidx];
2650 #if ( _DEBUGlevel_ >= 1 )
2651 statusOFS<<
"["<<ksup<<
"] mpireqsSendToRight"<<std::endl;
2653 mpi::Waitall( mpireqsSendToRight );
2654 #if ( _DEBUGlevel_ >= 1 )
2655 statusOFS<<
"["<<ksup<<
"] mpireqsSendToBelow"<<std::endl;
2658 mpi::Waitall( mpireqsSendToBelow );
2662 #if ( _DEBUGlevel_ >= 1 )
2663 statusOFS<<
"barrier done"<<std::endl;
2665 TIMER_STOP(Barrier);
2669 template<
typename T>
2672 ConstructCommunicationPattern_P2p();
2677 template<
typename T>
2680 #ifdef COMMUNICATOR_PROFILE
2681 CommunicatorStat stats;
2682 stats.countSendToBelow_.Resize(numSuper);
2683 stats.countSendToRight_.Resize(numSuper);
2684 stats.countRecvFromBelow_.Resize( numSuper );
2685 SetValue( stats.countSendToBelow_, 0 );
2686 SetValue( stats.countSendToRight_, 0 );
2687 SetValue( stats.countRecvFromBelow_, 0 );
2692 PushCallStack(
"PMatrix::ConstructCommunicationPattern_P2p");
2698 TIMER_START(Allocate);
2701 PushCallStack(
"Initialize the communication pattern" );
2706 fwdToBelowTree2_.resize(numSuper, NULL );
2707 fwdToRightTree2_.resize(numSuper, NULL );
2709 fwdToBelowTree_.resize(numSuper, NULL );
2710 fwdToRightTree_.resize(numSuper, NULL );
2712 redToLeftTree_.resize(numSuper, NULL );
2713 redToAboveTree_.resize(numSuper, NULL );
2715 isSendToBelow_.Resize(grid_->numProcRow, numSuper);
2716 isSendToRight_.Resize(grid_->numProcCol, numSuper);
2717 isSendToDiagonal_.Resize( numSuper );
2720 SetValue( isSendToDiagonal_,
false );
2722 isSendToCrossDiagonal_.Resize(grid_->numProcCol+1, numSuper );
2723 SetValue( isSendToCrossDiagonal_,
false );
2724 isRecvFromCrossDiagonal_.Resize(grid_->numProcRow+1, numSuper );
2725 SetValue( isRecvFromCrossDiagonal_,
false );
2727 isRecvFromAbove_.Resize( numSuper );
2728 isRecvFromLeft_.Resize( numSuper );
2729 isRecvFromBelow_.Resize( grid_->numProcRow, numSuper );
2730 SetValue( isRecvFromAbove_,
false );
2731 SetValue( isRecvFromBelow_,
false );
2732 SetValue( isRecvFromLeft_,
false );
2737 TIMER_STOP(Allocate);
2739 TIMER_START(GetEtree);
2740 std::vector<Int> snodeEtree(this->
NumSuper());
2741 GetEtree(snodeEtree);
2742 TIMER_STOP(GetEtree);
2747 PushCallStack(
"Local column communication" );
2749 #if ( _DEBUGlevel_ >= 1 )
2750 statusOFS << std::endl <<
"Local column communication" << std::endl;
2755 std::vector<std::set<Int> > localColBlockRowIdx;
2757 localColBlockRowIdx.resize( this->NumLocalBlockCol() );
2760 TIMER_START(Column_communication);
2763 for( Int ksup = 0; ksup < numSuper; ksup++ ){
2765 if(
MYCOL( grid_ ) ==
PCOL( ksup, grid_ ) ){
2769 std::vector<Int> tAllBlockRowIdx;
2770 std::vector<Int> & colBlockIdx = ColBlockIdx(
LBj(ksup, grid_));
2771 TIMER_START(Allgatherv_Column_communication);
2772 if( grid_ -> mpisize != 1 )
2773 mpi::Allgatherv( colBlockIdx, tAllBlockRowIdx, grid_->colComm );
2775 tAllBlockRowIdx = colBlockIdx;
2777 TIMER_STOP(Allgatherv_Column_communication);
2779 localColBlockRowIdx[
LBj( ksup, grid_ )].insert(
2780 tAllBlockRowIdx.begin(), tAllBlockRowIdx.end() );
2782 #if ( _DEBUGlevel_ >= 1 )
2784 <<
" Column block " << ksup
2785 <<
" has the following nonzero block rows" << std::endl;
2786 for( std::set<Int>::iterator si = localColBlockRowIdx[
LBj( ksup, grid_ )].begin();
2787 si != localColBlockRowIdx[
LBj( ksup, grid_ )].end();
2789 statusOFS << *si <<
" ";
2791 statusOFS << std::endl;
2802 TIMER_STOP(Column_communication);
2804 TIMER_START(Row_communication);
2806 PushCallStack(
"Local row communication" );
2808 #if ( _DEBUGlevel_ >= 1 )
2809 statusOFS << std::endl <<
"Local row communication" << std::endl;
2814 std::vector<std::set<Int> > localRowBlockColIdx;
2816 localRowBlockColIdx.resize( this->NumLocalBlockRow() );
2818 for( Int ksup = 0; ksup < numSuper; ksup++ ){
2820 if(
MYROW( grid_ ) ==
PROW( ksup, grid_ ) ){
2823 std::vector<Int> tAllBlockColIdx;
2824 std::vector<Int> & rowBlockIdx = RowBlockIdx(
LBi(ksup, grid_));
2825 TIMER_START(Allgatherv_Row_communication);
2826 if( grid_ -> mpisize != 1 )
2827 mpi::Allgatherv( rowBlockIdx, tAllBlockColIdx, grid_->rowComm );
2829 tAllBlockColIdx = rowBlockIdx;
2831 TIMER_STOP(Allgatherv_Row_communication);
2833 localRowBlockColIdx[
LBi( ksup, grid_ )].insert(
2834 tAllBlockColIdx.begin(), tAllBlockColIdx.end() );
2836 #if ( _DEBUGlevel_ >= 1 )
2838 <<
" Row block " << ksup
2839 <<
" has the following nonzero block columns" << std::endl;
2840 for( std::set<Int>::iterator si = localRowBlockColIdx[
LBi( ksup, grid_ )].begin();
2841 si != localRowBlockColIdx[
LBi( ksup, grid_ )].end();
2843 statusOFS << *si <<
" ";
2845 statusOFS << std::endl;
2855 TIMER_STOP(Row_communication);
2857 TIMER_START(STB_RFA);
2859 PushCallStack(
"SendToBelow / RecvFromAbove");
2861 for( Int ksup = 0; ksup < numSuper - 1; ksup++ ){
2863 #ifdef COMMUNICATOR_PROFILE
2864 std::vector<bool> sTB(grid_->numProcRow,
false);
2867 Int jsup = snodeEtree[ksup];
2868 while(jsup<numSuper){
2869 Int jsupLocalBlockCol =
LBj( jsup, grid_ );
2870 Int jsupProcCol =
PCOL( jsup, grid_ );
2871 if(
MYCOL( grid_ ) == jsupProcCol ){
2873 if( localColBlockRowIdx[jsupLocalBlockCol].count( ksup ) > 0 ) {
2874 for( std::set<Int>::iterator si = localColBlockRowIdx[jsupLocalBlockCol].begin();
2875 si != localColBlockRowIdx[jsupLocalBlockCol].end(); si++ ){
2877 Int isupProcRow =
PROW( isup, grid_ );
2879 if(
MYROW( grid_ ) == isupProcRow ){
2880 isRecvFromAbove_(ksup) =
true;
2882 if(
MYROW( grid_ ) ==
PROW( ksup, grid_ ) ){
2883 isSendToBelow_( isupProcRow, ksup ) =
true;
2888 #ifdef COMMUNICATOR_PROFILE
2889 sTB[
PROW(ksup,grid_) ] =
true;
2890 if( localColBlockRowIdx[jsupLocalBlockCol].count( ksup ) > 0 ) {
2891 for( std::set<Int>::iterator si = localColBlockRowIdx[jsupLocalBlockCol].begin();
2892 si != localColBlockRowIdx[jsupLocalBlockCol].end(); si++ ){
2894 Int isupProcRow =
PROW( isup, grid_ );
2896 sTB[isupProcRow] =
true;
2905 jsup = snodeEtree[jsup];
2908 #ifdef COMMUNICATOR_PROFILE
2909 Int count= std::count(sTB.begin(), sTB.end(),
true);
2910 Int color = sTB[
MYROW(grid_)];
2912 std::vector<Int> & snodeList = stats.maskSendToBelow_[sTB];
2913 snodeList.push_back(ksup);
2915 stats.countSendToBelow_(ksup) = count * color;
2923 TIMER_STOP(STB_RFA);
2925 TIMER_START(STR_RFL_RFB);
2927 PushCallStack(
"SendToRight / RecvFromLeft");
2929 for( Int ksup = 0; ksup < numSuper - 1; ksup++ ){
2931 #ifdef COMMUNICATOR_PROFILE
2932 std::vector<bool> sTR(grid_->numProcCol,
false);
2933 std::vector<bool> rFB(grid_->numProcRow,
false);
2936 Int isup = snodeEtree[ksup];
2937 while(isup<numSuper){
2938 Int isupLocalBlockRow =
LBi( isup, grid_ );
2939 Int isupProcRow =
PROW( isup, grid_ );
2940 if(
MYROW( grid_ ) == isupProcRow ){
2942 if( localRowBlockColIdx[isupLocalBlockRow].count( ksup ) > 0 ){
2943 for( std::set<Int>::iterator si = localRowBlockColIdx[isupLocalBlockRow].begin();
2944 si != localRowBlockColIdx[isupLocalBlockRow].end(); si++ ){
2946 Int jsupProcCol =
PCOL( jsup, grid_ );
2948 if(
MYCOL( grid_ ) == jsupProcCol ){
2949 isRecvFromLeft_(ksup) =
true;
2951 if(
MYCOL( grid_ ) ==
PCOL( ksup, grid_ ) ){
2952 isSendToRight_( jsupProcCol, ksup ) =
true;
2958 #ifdef COMMUNICATOR_PROFILE
2959 sTR[
PCOL(ksup,grid_) ] =
true;
2960 if( localRowBlockColIdx[isupLocalBlockRow].count( ksup ) > 0 ){
2961 for( std::set<Int>::iterator si = localRowBlockColIdx[isupLocalBlockRow].begin();
2962 si != localRowBlockColIdx[isupLocalBlockRow].end(); si++ ){
2964 Int jsupProcCol =
PCOL( jsup, grid_ );
2966 sTR[ jsupProcCol ] =
true;
2975 if(
MYCOL( grid_ ) ==
PCOL(ksup, grid_) ){
2976 if(
MYROW( grid_ ) ==
PROW( ksup, grid_ ) ){
2977 isRecvFromBelow_(isupProcRow,ksup) =
true;
2979 else if (
MYROW(grid_) == isupProcRow){
2980 isSendToDiagonal_(ksup)=
true;
2984 #ifdef COMMUNICATOR_PROFILE
2985 rFB[
PROW(ksup,grid_) ] =
true;
2986 if(
MYCOL( grid_ ) ==
PCOL(ksup, grid_) ){
2987 rFB[ isupProcRow ] =
true;
2992 isup = snodeEtree[isup];
2995 #ifdef COMMUNICATOR_PROFILE
2996 Int count= std::count(rFB.begin(), rFB.end(),
true);
2997 Int color = rFB[
MYROW(grid_)];
2999 std::vector<Int> & snodeList = stats.maskRecvFromBelow_[rFB];
3000 snodeList.push_back(ksup);
3002 stats.countRecvFromBelow_(ksup) = count * color;
3004 count= std::count(sTR.begin(), sTR.end(),
true);
3005 color = sTR[
MYCOL(grid_)];
3007 std::vector<Int> & snodeList = stats.maskSendToRight_[sTR];
3008 snodeList.push_back(ksup);
3010 stats.countSendToRight_(ksup) = count * color;
3019 TIMER_STOP(STR_RFL_RFB);
3021 #ifdef COMMUNICATOR_PROFILE
3026 stats.maskSendToBelow_.insert(stats.maskRecvFromBelow_.begin(),stats.maskRecvFromBelow_.end());
3027 statusOFS << std::endl <<
"stats.maskSendToBelow_:" << stats.maskSendToBelow_.size() <<std::endl;
3046 statusOFS << std::endl <<
"stats.maskSendToRight_:" << stats.maskSendToRight_.size() <<std::endl;
3051 TIMER_START(BUILD_BCAST_TREES);
3054 vector<double> SeedRFL(numSuper,0.0);
3055 vector<Int> aggRFL(numSuper);
3056 vector<Int> globalAggRFL(numSuper*grid_->numProcCol);
3057 for( Int ksup = 0; ksup < numSuper ; ksup++ ){
3058 #if ( _DEBUGlevel_ >= 1 )
3059 statusOFS<<
"1 "<<ksup<<std::endl;
3062 std::vector<LBlock<T> >& Lcol = this->L(
LBj(ksup, grid_) );
3067 totalSize+=
sizeof(Int);
3068 for( Int ib = 0; ib < Lcol.size(); ib++ ){
3069 if( Lcol[ib].blockIdx > ksup ){
3071 totalSize+=3*
sizeof(Int);
3072 totalSize+=
sizeof(Int)+Lcol[ib].rows.ByteSize();
3077 aggRFL[ksup]=totalSize;
3078 SeedRFL[ksup]=rand();
3081 else if(isRecvFromLeft_(ksup)){
3089 MPI_Allgather(&aggRFL[0],numSuper*
sizeof(Int),MPI_BYTE,
3090 &globalAggRFL[0],numSuper*
sizeof(Int),MPI_BYTE,
3092 MPI_Allreduce(MPI_IN_PLACE,&SeedRFL[0],numSuper,MPI_DOUBLE,MPI_MAX,grid_->rowComm);
3095 for( Int ksup = 0; ksup < numSuper; ksup++ ){
3096 #if ( _DEBUGlevel_ >= 1 )
3097 statusOFS<<
"3 "<<ksup<<std::endl;
3099 if( isRecvFromLeft_(ksup) || CountSendToRight(ksup)>0 ){
3101 vector<Int> tree_ranks;
3103 for( Int iProcCol = 0; iProcCol < grid_->numProcCol; iProcCol++ ){
3104 if( iProcCol !=
PCOL( ksup, grid_ ) ){
3105 Int isRFL = globalAggRFL[iProcCol*numSuper + ksup];
3112 tree_ranks.reserve(countRFL+1);
3113 tree_ranks.push_back(
PCOL(ksup,grid_));
3115 for( Int iProcCol = 0; iProcCol < grid_->numProcCol; iProcCol++ ){
3116 Int isRFL = globalAggRFL[iProcCol*numSuper + ksup];
3118 if( iProcCol !=
PCOL( ksup, grid_ ) ){
3119 tree_ranks.push_back(iProcCol);
3128 TreeBcast2<T> * & BcastLTree2 = fwdToRightTree2_[ksup];
3129 if(BcastLTree2!=NULL){
delete BcastLTree2; BcastLTree2=NULL;}
3130 BcastLTree2 = TreeBcast2<T>::Create(this->grid_->rowComm,&tree_ranks[0],tree_ranks.size(),msgSize,SeedRFL[ksup]);
3132 #ifdef COMM_PROFILE_BCAST
3133 BcastLTree2->SetGlobalComm(grid_->comm);
3136 TreeBcast * & BcastLTree = fwdToRightTree_[ksup];
3137 if(BcastLTree!=NULL){
delete BcastLTree; BcastLTree=NULL;}
3138 BcastLTree = TreeBcast::Create(this->grid_->rowComm,&tree_ranks[0],tree_ranks.size(),msgSize,SeedRFL[ksup]);
3139 #ifdef COMM_PROFILE_BCAST
3140 BcastLTree->SetGlobalComm(grid_->comm);
3153 vector<double> SeedRFA(numSuper,0.0);
3154 vector<Int> aggRFA(numSuper);
3155 vector<Int> globalAggRFA(numSuper*grid_->numProcRow);
3156 for( Int ksup = 0; ksup < numSuper ; ksup++ ){
3157 #if ( _DEBUGlevel_ >= 1 )
3158 statusOFS<<
"2 "<<ksup<<std::endl;
3161 std::vector<UBlock<T> >& Urow = this->U(
LBi(ksup, grid_) );
3166 totalSize+=
sizeof(Int);
3167 for( Int jb = 0; jb < Urow.size(); jb++ ){
3168 if( Urow[jb].blockIdx >= ksup ){
3170 totalSize+=3*
sizeof(Int);
3171 totalSize+=
sizeof(Int)+Urow[jb].cols.ByteSize();
3172 totalSize+= 2*
sizeof(Int)+Urow[jb].nzval.ByteSize();
3175 aggRFA[ksup]=totalSize;
3176 SeedRFA[ksup]=rand();
3178 else if(isRecvFromAbove_(ksup)){
3188 MPI_Allgather(&aggRFA[0],numSuper*
sizeof(Int),MPI_BYTE,
3189 &globalAggRFA[0],numSuper*
sizeof(Int),MPI_BYTE,
3191 MPI_Allreduce(MPI_IN_PLACE,&SeedRFA[0],numSuper,MPI_DOUBLE,MPI_MAX,grid_->colComm);
3193 for( Int ksup = 0; ksup < numSuper; ksup++ ){
3194 #if ( _DEBUGlevel_ >= 1 )
3195 statusOFS<<
"4 "<<ksup<<std::endl;
3197 if( isRecvFromAbove_(ksup) || CountSendToBelow(ksup)>0 ){
3198 vector<Int> tree_ranks;
3201 for( Int iProcRow = 0; iProcRow < grid_->numProcRow; iProcRow++ ){
3202 if( iProcRow !=
PROW( ksup, grid_ ) ){
3203 Int isRFA = globalAggRFA[iProcRow*numSuper + ksup];
3210 tree_ranks.reserve(countRFA+1);
3211 tree_ranks.push_back(
PROW(ksup,grid_));
3213 for( Int iProcRow = 0; iProcRow < grid_->numProcRow; iProcRow++ ){
3214 Int isRFA = globalAggRFA[iProcRow*numSuper + ksup];
3216 if( iProcRow !=
PROW( ksup, grid_ ) ){
3217 tree_ranks.push_back(iProcRow);
3226 TreeBcast2<T> * & BcastUTree2 = fwdToBelowTree2_[ksup];
3227 if(BcastUTree2!=NULL){
delete BcastUTree2; BcastUTree2=NULL;}
3228 BcastUTree2 = TreeBcast2<T>::Create(this->grid_->colComm,&tree_ranks[0],tree_ranks.size(),msgSize,SeedRFA[ksup]);
3230 #ifdef COMM_PROFILE_BCAST
3231 BcastUTree2->SetGlobalComm(grid_->comm);
3234 TreeBcast * & BcastUTree = fwdToBelowTree_[ksup];
3235 if(BcastUTree!=NULL){
delete BcastUTree; BcastUTree=NULL;}
3236 BcastUTree = TreeBcast::Create(this->grid_->colComm,&tree_ranks[0],tree_ranks.size(),msgSize,SeedRFA[ksup]);
3237 #ifdef COMM_PROFILE_BCAST
3238 BcastUTree->SetGlobalComm(grid_->comm);
3246 TIMER_STOP(BUILD_BCAST_TREES);
3247 TIMER_START(BUILD_REDUCE_D_TREE);
3249 vector<double> SeedSTD(numSuper,0.0);
3250 vector<Int> aggSTD(numSuper);
3251 vector<Int> globalAggSTD(numSuper*grid_->numProcRow);
3252 for( Int ksup = 0; ksup < numSuper; ksup++ ){
3255 aggSTD[ksup]=totalSize;
3256 SeedSTD[ksup]=rand();
3258 else if(isSendToDiagonal_(ksup)){
3268 MPI_Allgather(&aggSTD[0],numSuper*
sizeof(Int),MPI_BYTE,
3269 &globalAggSTD[0],numSuper*
sizeof(Int),MPI_BYTE,
3271 MPI_Allreduce(MPI_IN_PLACE,&SeedSTD[0],numSuper,MPI_DOUBLE,MPI_MAX,grid_->colComm);
3274 for( Int ksup = 0; ksup < numSuper; ksup++ ){
3275 #if ( _DEBUGlevel_ >= 1 )
3276 statusOFS<<
"5 "<<ksup<<std::endl;
3278 if(
MYCOL( grid_ ) ==
PCOL(ksup, grid_) ){
3280 Int amISTD = globalAggSTD[
MYROW(grid_)*numSuper + ksup];
3286 vector<Int> tree_ranks;
3290 for( Int iProcRow = 0; iProcRow < grid_->numProcRow; iProcRow++ ){
3291 Int isSTD = globalAggSTD[iProcRow*numSuper + ksup];
3293 if( iProcRow !=
PROW( ksup, grid_ ) ){
3294 set_ranks.insert(iProcRow);
3303 tree_ranks.push_back(
PROW(ksup,grid_));
3304 tree_ranks.insert(tree_ranks.end(),set_ranks.begin(),set_ranks.end());
3307 for( Int iProcRow = 0; iProcRow < grid_->numProcRow; iProcRow++ ){
3308 if( iProcRow !=
PROW( ksup, grid_ ) ){
3309 Int isSTD = globalAggSTD[iProcRow*numSuper + ksup];
3316 tree_ranks.reserve(countSTD+1);
3317 tree_ranks.push_back(
PROW(ksup,grid_));
3319 for( Int iProcRow = 0; iProcRow < grid_->numProcRow; iProcRow++ ){
3320 Int isSTD = globalAggSTD[iProcRow*numSuper + ksup];
3322 if( iProcRow !=
PROW( ksup, grid_ ) ){
3323 tree_ranks.push_back(iProcRow);
3336 if(redDTree!=NULL){
delete redDTree; redDTree=NULL;}
3337 redDTree =
TreeReduce<T>::Create(this->grid_->colComm,&tree_ranks[0],tree_ranks.size(),msgSize,SeedSTD[ksup]);
3339 redDTree->SetGlobalComm(grid_->comm);
3345 TIMER_STOP(BUILD_REDUCE_D_TREE);
3349 TIMER_START(BUILD_REDUCE_L_TREE);
3351 vector<double> SeedRTL(numSuper,0.0);
3352 vector<Int> aggRTL(numSuper);
3353 vector<Int> globalAggRTL(numSuper*grid_->numProcCol);
3354 for( Int ksup = 0; ksup < numSuper ; ksup++ ){
3356 std::vector<LBlock<T> >& Lcol = this->L(
LBj(ksup, grid_) );
3362 Int numRowLUpdateBuf = 0;
3363 if(
MYROW( grid_ ) !=
PROW( ksup, grid_ ) ){
3364 for( Int ib = 0; ib < Lcol.size(); ib++ ){
3365 numRowLUpdateBuf += Lcol[ib].numRow;
3369 for( Int ib = 1; ib < Lcol.size(); ib++ ){
3370 numRowLUpdateBuf += Lcol[ib].numRow;
3375 totalSize = numRowLUpdateBuf*
SuperSize( ksup, super_ )*
sizeof(T);
3377 aggRTL[ksup]=totalSize;
3379 SeedRTL[ksup]=rand();
3381 else if(isRecvFromLeft_(ksup)){
3389 MPI_Allgather(&aggRTL[0],numSuper*
sizeof(Int),MPI_BYTE,
3390 &globalAggRTL[0],numSuper*
sizeof(Int),MPI_BYTE,
3392 MPI_Allreduce(MPI_IN_PLACE,&SeedRTL[0],numSuper,MPI_DOUBLE,MPI_MAX,grid_->rowComm);
3395 for( Int ksup = 0; ksup < numSuper ; ksup++ ){
3396 #if ( _DEBUGlevel_ >= 1 )
3397 statusOFS<<
"6 "<<ksup<<std::endl;
3399 if( isRecvFromLeft_(ksup) || CountSendToRight(ksup)>0 ){
3400 vector<Int> tree_ranks;
3404 for( Int iProcCol = 0; iProcCol < grid_->numProcCol; iProcCol++ ){
3405 Int isRTL = globalAggRTL[iProcCol*numSuper + ksup];
3407 if( iProcCol !=
PCOL( ksup, grid_ ) ){
3408 set_ranks.insert(iProcCol);
3415 tree_ranks.push_back(
PCOL(ksup,grid_));
3416 tree_ranks.insert(tree_ranks.end(),set_ranks.begin(),set_ranks.end());
3419 for( Int iProcCol = 0; iProcCol < grid_->numProcCol; iProcCol++ ){
3420 if( iProcCol !=
PCOL( ksup, grid_ ) ){
3421 Int isRTL = globalAggRTL[iProcCol*numSuper + ksup];
3428 tree_ranks.reserve(countRTL+1);
3429 tree_ranks.push_back(
PCOL(ksup,grid_));
3431 for( Int iProcCol = 0; iProcCol < grid_->numProcCol; iProcCol++ ){
3432 Int isRTL = globalAggRTL[iProcCol*numSuper + ksup];
3434 if( iProcCol !=
PCOL( ksup, grid_ ) ){
3435 tree_ranks.push_back(iProcCol);
3447 if(redLTree!=NULL){
delete redLTree; redLTree=NULL;}
3448 redLTree =
TreeReduce<T>::Create(this->grid_->rowComm,&tree_ranks[0],tree_ranks.size(),msgSize,SeedRTL[ksup]);
3450 redLTree->SetGlobalComm(grid_->comm);
3455 TIMER_STOP(BUILD_REDUCE_L_TREE);
3463 #if ( _DEBUGlevel_ >= 1 )
3464 statusOFS << std::endl <<
"isSendToBelow:" << std::endl;
3465 for(
int j = 0;j< isSendToBelow_.n();j++){
3466 statusOFS <<
"["<<j<<
"] ";
3467 for(
int i =0; i < isSendToBelow_.m();i++){
3468 statusOFS<< isSendToBelow_(i,j) <<
" ";
3470 statusOFS<<std::endl;
3473 statusOFS << std::endl <<
"isRecvFromAbove:" << std::endl;
3474 for(
int j = 0;j< isRecvFromAbove_.m();j++){
3475 statusOFS <<
"["<<j<<
"] "<< isRecvFromAbove_(j)<<std::endl;
3478 #if ( _DEBUGlevel_ >= 1 )
3479 statusOFS << std::endl <<
"isSendToRight:" << std::endl;
3480 for(
int j = 0;j< isSendToRight_.n();j++){
3481 statusOFS <<
"["<<j<<
"] ";
3482 for(
int i =0; i < isSendToRight_.m();i++){
3483 statusOFS<< isSendToRight_(i,j) <<
" ";
3485 statusOFS<<std::endl;
3488 statusOFS << std::endl <<
"isRecvFromLeft:" << std::endl;
3489 for(
int j = 0;j< isRecvFromLeft_.m();j++){
3490 statusOFS <<
"["<<j<<
"] "<< isRecvFromLeft_(j)<<std::endl;
3493 statusOFS << std::endl <<
"isRecvFromBelow:" << std::endl;
3494 for(
int j = 0;j< isRecvFromBelow_.n();j++){
3495 statusOFS <<
"["<<j<<
"] ";
3496 for(
int i =0; i < isRecvFromBelow_.m();i++){
3497 statusOFS<< isRecvFromBelow_(i,j) <<
" ";
3499 statusOFS<<std::endl;
3509 TIMER_START(STCD_RFCD);
3513 PushCallStack(
"SendToCrossDiagonal / RecvFromCrossDiagonal");
3515 for( Int ksup = 0; ksup < numSuper - 1; ksup++ ){
3516 if(
MYCOL( grid_ ) ==
PCOL( ksup, grid_ ) ){
3517 for( std::set<Int>::iterator si = localColBlockRowIdx[
LBj( ksup, grid_ )].begin();
3518 si != localColBlockRowIdx[
LBj( ksup, grid_ )].end(); si++ ){
3520 Int isupProcRow =
PROW( isup, grid_ );
3521 Int isupProcCol =
PCOL( isup, grid_ );
3522 if( isup > ksup &&
MYROW( grid_ ) == isupProcRow ){
3523 isSendToCrossDiagonal_(grid_->numProcCol, ksup ) =
true;
3524 isSendToCrossDiagonal_(isupProcCol, ksup ) =
true;
3530 for( Int ksup = 0; ksup < numSuper - 1; ksup++ ){
3531 if(
MYROW( grid_ ) ==
PROW( ksup, grid_ ) ){
3532 for( std::set<Int>::iterator si = localRowBlockColIdx[
LBi(ksup, grid_) ].begin();
3533 si != localRowBlockColIdx[
LBi(ksup, grid_) ].end(); si++ ){
3535 Int jsupProcCol =
PCOL( jsup, grid_ );
3536 Int jsupProcRow =
PROW( jsup, grid_ );
3537 if( jsup > ksup &&
MYCOL(grid_) == jsupProcCol ){
3538 isRecvFromCrossDiagonal_(grid_->numProcRow, ksup ) =
true;
3539 isRecvFromCrossDiagonal_(jsupProcRow, ksup ) =
true;
3544 #if ( _DEBUGlevel_ >= 1 )
3545 statusOFS << std::endl <<
"isSendToCrossDiagonal:" << std::endl;
3546 for(
int j =0; j < isSendToCrossDiagonal_.n();j++){
3547 if(isSendToCrossDiagonal_(grid_->numProcCol,j)){
3548 statusOFS <<
"["<<j<<
"] ";
3549 for(
int i =0; i < isSendToCrossDiagonal_.m()-1;i++){
3550 if(isSendToCrossDiagonal_(i,j))
3552 statusOFS<<
PNUM(
PROW(j,grid_),i,grid_)<<
" ";
3555 statusOFS<<std::endl;
3559 statusOFS << std::endl <<
"isRecvFromCrossDiagonal:" << std::endl;
3560 for(
int j =0; j < isRecvFromCrossDiagonal_.n();j++){
3561 if(isRecvFromCrossDiagonal_(grid_->numProcRow,j)){
3562 statusOFS <<
"["<<j<<
"] ";
3563 for(
int i =0; i < isRecvFromCrossDiagonal_.m()-1;i++){
3564 if(isRecvFromCrossDiagonal_(i,j))
3566 statusOFS<<
PNUM(i,
PCOL(j,grid_),grid_)<<
" ";
3569 statusOFS<<std::endl;
3580 TIMER_STOP(STCD_RFCD);
3587 GetWorkSet(snodeEtree,this->WorkingSet());
3592 template<
typename T>
3596 if(optionsLU_->Symmetric == 0){
3597 throw std::logic_error(
"The matrix is not symmetric, this routine can't handle it !" );
3603 statOFS<<
"m"<<
"\t"<<
"n"<<
"\t"<<
"z"<<std::endl;
3604 for(std::deque<int >::iterator it = gemm_stat.begin(); it!=gemm_stat.end(); it+=3){
3605 statOFS<<*it<<
"\t"<<*(it+1)<<
"\t"<<*(it+2)<<std::endl;
3609 #if defined(COMM_PROFILE) || defined(COMM_PROFILE_BCAST)
3611 commOFS<<HEADER_COMM<<std::endl;
3612 for(std::deque<int>::iterator it = comm_stat.begin(); it!=comm_stat.end(); it+=4){
3613 commOFS<<LINE_COMM(it)<<std::endl;
3618 for(
int i = 0 ; i< fwdToBelowTree_.size();++i){
3619 if(fwdToBelowTree_[i]!=NULL){
3620 fwdToBelowTree_[i]->Reset();
3623 for(
int i = 0 ; i< fwdToRightTree_.size();++i){
3624 if(fwdToRightTree_[i]!=NULL){
3625 fwdToRightTree_[i]->Reset();
3628 for(
int i = 0 ; i< redToLeftTree_.size();++i){
3629 if(redToLeftTree_[i]!=NULL){
3630 redToLeftTree_[i]->Reset();
3633 for(
int i = 0 ; i< redToAboveTree_.size();++i){
3634 if(redToAboveTree_[i]!=NULL){
3635 redToAboveTree_[i]->Reset();
3645 template<
typename T>
3648 TIMER_START(SelInv_P2p);
3651 PushCallStack(
"PMatrix::SelInv_P2p");
3654 #if ( _DEBUGlevel_ >= 1 )
3655 statusOFS<<
"maxTag value: "<<maxTag_<<std::endl;
3661 std::vector<std::vector<Int> > & superList = this->WorkingSet();
3662 Int numSteps = superList.size();
3665 for (Int lidx=0; lidx<numSteps ; lidx++){
3666 Int stepSuper = superList[lidx].size();
3668 SelInvIntra_P2p(lidx,rank);
3669 #if ( _DEBUGlevel_ >= 1 )
3670 statusOFS<<
"OUT "<<lidx<<
"/"<<numSteps<<
" "<<limIndex_<<std::endl;
3675 if (options_->maxPipelineDepth!=-1)
3678 MPI_Barrier(grid_->comm);
3687 if(lidx>0 && (rank-1)%limIndex_==0){
3691 MPI_Barrier(grid_->comm);
3698 MPI_Barrier(grid_->comm);
3703 TIMER_STOP(SelInv_P2p);
3710 template<
typename T>
3714 PushCallStack(
"PMatrix::PreSelInv");
3720 PushCallStack(
"L(i,k) <- L(i,k) * L(k,k)^{-1}");
3722 #if ( _DEBUGlevel_ >= 1 )
3723 statusOFS << std::endl <<
"L(i,k) <- L(i,k) * L(k,k)^{-1}" << std::endl << std::endl;
3725 for( Int ksup = 0; ksup < numSuper; ksup++ ){
3726 if(
MYCOL( grid_ ) ==
PCOL( ksup, grid_ ) ){
3729 std::vector<LBlock<T> >& Lcol = this->L(
LBj( ksup, grid_ ) );
3730 if(
MYROW( grid_ ) ==
PROW( ksup, grid_ ) ){
3731 nzvalLDiag = Lcol[0].nzval;
3732 if( nzvalLDiag.m() !=
SuperSize(ksup, super_) ||
3733 nzvalLDiag.n() !=
SuperSize(ksup, super_) ){
3737 throw std::runtime_error(
"The size of the diagonal block of L is wrong." );
3744 MPI_Bcast( (
void*)nzvalLDiag.Data(), nzvalLDiag.ByteSize(),
3745 MPI_BYTE,
PROW( ksup, grid_ ), grid_->colComm );
3748 for( Int ib = 0; ib < Lcol.size(); ib++ ){
3751 #if ( _DEBUGlevel_ >= 2 )
3753 if(
LBj( ksup, grid_ ) == 0 ){
3754 statusOFS <<
"Diag L(" << ksup <<
", " << ksup <<
"): " << nzvalLDiag << std::endl;
3755 statusOFS <<
"Before solve L(" << LB.
blockIdx <<
", " << ksup <<
"): " << LB.
nzval << std::endl;
3758 blas::Trsm(
'R',
'L',
'N',
'U', LB.
numRow, LB.
numCol, ONE<T>(),
3760 #if ( _DEBUGlevel_ >= 2 )
3762 if(
LBj( ksup, grid_ ) == 0 ){
3763 statusOFS <<
"After solve L(" << LB.
blockIdx <<
", " << ksup <<
"): " << LB.
nzval << std::endl;
3778 PushCallStack(
"U(k,i) <- L(i,k)");
3780 #if ( _DEBUGlevel_ >= 1 )
3781 statusOFS << std::endl <<
"U(k,i) <- L(i,k)" << std::endl << std::endl;
3784 for( Int ksup = 0; ksup < numSuper; ksup++ ){
3785 Int ksupProcRow =
PROW( ksup, grid_ );
3786 Int ksupProcCol =
PCOL( ksup, grid_ );
3788 Int sendCount = CountSendToCrossDiagonal(ksup);
3789 Int recvCount = CountRecvFromCrossDiagonal(ksup);
3791 std::vector<MPI_Request > arrMpiReqsSend(sendCount, MPI_REQUEST_NULL );
3792 std::vector<MPI_Request > arrMpiReqsSizeSend(sendCount, MPI_REQUEST_NULL );
3793 std::vector<std::vector<char> > arrSstrLcolSend(sendCount);
3794 std::vector<Int > arrSstrLcolSizeSend(sendCount);
3796 std::vector<MPI_Request > arrMpiReqsRecv(recvCount, MPI_REQUEST_NULL );
3797 std::vector<MPI_Request > arrMpiReqsSizeRecv(recvCount, MPI_REQUEST_NULL );
3798 std::vector<std::vector<char> > arrSstrLcolRecv(recvCount);
3799 std::vector<Int > arrSstrLcolSizeRecv(recvCount);
3804 if( isSendToCrossDiagonal_(grid_->numProcCol,ksup) ){
3805 #if ( _DEBUGlevel_ >= 1 )
3806 statusOFS<<
"["<<ksup<<
"] P"<<
MYPROC(grid_)<<
" should send to "<<CountSendToCrossDiagonal(ksup)<<
" processors"<<std::endl;
3810 for(Int dstCol = 0; dstCol<grid_->numProcCol; dstCol++){
3811 if(isSendToCrossDiagonal_(dstCol,ksup) ){
3812 Int dst =
PNUM(
PROW(ksup,grid_),dstCol,grid_);
3815 std::stringstream sstm;
3816 std::vector<char> & sstrLcolSend = arrSstrLcolSend[sendIdx];
3817 Int & sstrSize = arrSstrLcolSizeSend[sendIdx];
3818 MPI_Request & mpiReqSend = arrMpiReqsSend[sendIdx];
3819 MPI_Request & mpiReqSizeSend = arrMpiReqsSizeSend[sendIdx];
3821 std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
3822 std::vector<LBlock<T> >& Lcol = this->L(
LBj(ksup, grid_) );
3827 if(
MYROW( grid_ ) ==
PROW( ksup, grid_ ) ){
3828 for( Int ib = 1; ib < Lcol.size(); ib++ ){
3829 if( Lcol[ib].blockIdx > ksup && (Lcol[ib].blockIdx % grid_->numProcCol) == dstCol ){
3836 for( Int ib = 0; ib < Lcol.size(); ib++ ){
3837 if( Lcol[ib].blockIdx > ksup && (Lcol[ib].blockIdx % grid_->numProcCol) == dstCol ){
3843 serialize( (Int)count, sstm, NO_MASK );
3845 for( Int ib = 0; ib < Lcol.size(); ib++ ){
3846 if( Lcol[ib].blockIdx > ksup && (Lcol[ib].blockIdx % grid_->numProcCol) == dstCol ){
3847 #if ( _DEBUGlevel_ >= 1 )
3848 statusOFS<<
"["<<ksup<<
"] SEND contains "<<Lcol[ib].blockIdx<<
" which corresponds to "<<
GBj(ib,grid_)<<std::endl;
3850 serialize( Lcol[ib], sstm, mask );
3854 sstrLcolSend.resize( Size(sstm) );
3855 sstm.read( &sstrLcolSend[0], sstrLcolSend.size() );
3856 sstrSize = sstrLcolSend.size();
3863 #if ( _DEBUGlevel_ >= 1 )
3864 statusOFS<<
"["<<ksup<<
"] P"<<
MYPROC(grid_)<<
" ("<<
MYROW(grid_)<<
","<<
MYCOL(grid_)<<
") ---> LBj("<<ksup<<
")="<<
LBj(ksup,grid_)<<
" ---> P"<<dst<<
" ("<<ksupProcRow<<
","<<dstCol<<
")"<<std::endl;
3866 MPI_Isend( &sstrSize,
sizeof(sstrSize), MPI_BYTE, dst, SELINV_TAG_L_SIZE_CD, grid_->comm, &mpiReqSizeSend );
3867 MPI_Isend( (
void*)&sstrLcolSend[0], sstrSize, MPI_BYTE, dst, SELINV_TAG_L_CONTENT_CD, grid_->comm, &mpiReqSend );
3870 PROFILE_COMM(
MYPROC(this->grid_),dst,SELINV_TAG_L_SIZE_CD,
sizeof(sstrSize));
3871 PROFILE_COMM(
MYPROC(this->grid_),dst,SELINV_TAG_L_CONTENT_CD,sstrSize);
3886 if( isRecvFromCrossDiagonal_(grid_->numProcRow,ksup) ){
3889 #if ( _DEBUGlevel_ >= 1 )
3890 statusOFS<<
"["<<ksup<<
"] P"<<
MYPROC(grid_)<<
" should receive from "<<CountRecvFromCrossDiagonal(ksup)<<
" processors"<<std::endl;
3894 std::vector<UBlock<T> >& Urow = this->U(
LBi( ksup, grid_ ) );
3895 std::vector<bool> isBlockFound(Urow.size(),
false);
3900 for(Int srcRow = 0; srcRow<grid_->numProcRow; srcRow++){
3901 if(isRecvFromCrossDiagonal_(srcRow,ksup) ){
3902 std::vector<LBlock<T> > LcolRecv;
3903 Int src =
PNUM(srcRow,
PCOL(ksup,grid_),grid_);
3905 MPI_Request & mpiReqSizeRecv = arrMpiReqsSizeRecv[recvIdx];
3906 Int & sstrSize = arrSstrLcolSizeRecv[recvIdx];
3908 MPI_Irecv( &sstrSize, 1, MPI_INT, src, SELINV_TAG_L_SIZE_CD, grid_->comm, &mpiReqSizeRecv );
3915 mpi::Waitall(arrMpiReqsSizeRecv);
3921 for(Int srcRow = 0; srcRow<grid_->numProcRow; srcRow++){
3922 if(isRecvFromCrossDiagonal_(srcRow,ksup) ){
3923 std::vector<LBlock<T> > LcolRecv;
3924 Int src =
PNUM(srcRow,
PCOL(ksup,grid_),grid_);
3926 MPI_Request & mpiReqRecv = arrMpiReqsRecv[recvIdx];
3927 Int & sstrSize = arrSstrLcolSizeRecv[recvIdx];
3928 std::vector<char> & sstrLcolRecv = arrSstrLcolRecv[recvIdx];
3929 sstrLcolRecv.resize(sstrSize);
3931 MPI_Irecv( &sstrLcolRecv[0], sstrSize, MPI_BYTE, src, SELINV_TAG_L_CONTENT_CD, grid_->comm, &mpiReqRecv );
3938 mpi::Waitall(arrMpiReqsRecv);
3944 for(Int srcRow = 0; srcRow<grid_->numProcRow; srcRow++){
3945 if(isRecvFromCrossDiagonal_(srcRow,ksup) ){
3946 std::vector<LBlock<T> > LcolRecv;
3947 Int src =
PNUM(srcRow,
PCOL(ksup,grid_),grid_);
3950 Int & sstrSize = arrSstrLcolSizeRecv[recvIdx];
3951 std::vector<char> & sstrLcolRecv = arrSstrLcolRecv[recvIdx];
3952 std::stringstream sstm;
3954 #if ( _DEBUGlevel_ >= 1 )
3955 statusOFS<<
"["<<ksup<<
"] P"<<
MYPROC(grid_)<<
" ("<<
MYROW(grid_)<<
","<<
MYCOL(grid_)<<
") <--- LBj("<<ksup<<
") <--- P"<<src<<
" ("<<srcRow<<
","<<ksupProcCol<<
")"<<std::endl;
3959 sstm.write( &sstrLcolRecv[0], sstrSize );
3963 std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
3964 deserialize( numLBlock, sstm, NO_MASK );
3965 LcolRecv.resize(numLBlock);
3966 for( Int ib = 0; ib < numLBlock; ib++ ){
3967 deserialize( LcolRecv[ib], sstm, mask );
3968 #if ( _DEBUGlevel_ >= 1 )
3969 statusOFS<<
"["<<ksup<<
"] RECV contains "<<LcolRecv[ib].blockIdx<<
" which corresponds to "<< ib * grid_->numProcRow + srcRow;
3971 statusOFS<<
" L is on row "<< srcRow <<
" whereas U is on col "<< LcolRecv[ib].blockIdx % grid_->numProcCol <<std::endl;
3981 std::vector<LBlock<T> >& Lcol = this->L(
LBj( ksup, grid_ ) );
3982 if(
MYROW( grid_ ) !=
PROW( ksup, grid_ ) ){
3983 LcolRecv.resize( Lcol.size() );
3984 for( Int ib = 0; ib < Lcol.size(); ib++ ){
3985 LcolRecv[ib] = Lcol[ib];
3989 LcolRecv.resize( Lcol.size() - 1 );
3990 for( Int ib = 0; ib < Lcol.size() - 1; ib++ ){
3991 LcolRecv[ib] = Lcol[ib+1];
3998 for( Int ib = 0; ib < LcolRecv.size(); ib++ ){
4004 throw std::logic_error(
"LcolRecv contains the wrong blocks." );
4006 for( Int jb = 0; jb < Urow.size(); jb++ ){
4011 std::ostringstream msg;
4012 msg <<
"LB(" << LB.
blockIdx <<
", " << ksup <<
") and UB("
4013 << ksup <<
", " << UB.
blockIdx <<
") do not share the same size." << std::endl
4014 <<
"LB: " << LB.
numRow <<
" x " << LB.
numCol << std::endl
4015 <<
"UB: " << UB.
numRow <<
" x " << UB.
numCol << std::endl;
4019 throw std::runtime_error( msg.str().c_str() );
4028 #if ( _DEBUGlevel_ >= 1 )
4029 statusOFS<<
"["<<ksup<<
"] USING "<<LB.
blockIdx<< std::endl;
4031 isBlockFound[jb] =
true;
4039 for( Int jb = 0; jb < Urow.size(); jb++ ){
4041 if( !isBlockFound[jb] ){
4045 throw std::logic_error(
"UBlock cannot find its update. Something is seriously wrong." );
4055 mpi::Waitall(arrMpiReqsSizeSend);
4056 mpi::Waitall(arrMpiReqsSend);
4066 PushCallStack(
"L(i,i) <- [L(k,k) * U(k,k)]^{-1} ");
4068 #if ( _DEBUGlevel_ >= 1 )
4069 statusOFS << std::endl <<
"L(i,i) <- [L(k,k) * U(k,k)]^{-1}" << std::endl << std::endl;
4072 for( Int ksup = 0; ksup < numSuper; ksup++ ){
4073 if(
MYROW( grid_ ) ==
PROW( ksup, grid_ ) &&
4074 MYCOL( grid_ ) ==
PCOL( ksup, grid_ ) ){
4078 for(Int i = 0; i <
SuperSize( ksup, super_ ); i++){
4082 #if ( _DEBUGlevel_ >= 2 )
4084 statusOFS <<
"Factorized A (" << ksup <<
", " << ksup <<
"): " << LB.
nzval << std::endl;
4087 SuperSize( ksup, super_ ), ipiv.Data() );
4090 Symmetrize( LB.
nzval );
4092 #if ( _DEBUGlevel_ >= 2 )
4094 statusOFS <<
"Inversed A (" << ksup <<
", " << ksup <<
"): " << LB.
nzval << std::endl;
4113 template<
typename T>
4117 PushCallStack(
"PMatrix::GetDiagonal");
4121 Int numCol = this->
NumCol();
4123 const IntNumVec& permInv = super_->permInv;
4128 pPerm_r = &super_->perm_r;
4129 pPermInv_r = &super_->permInv_r;
4132 const IntNumVec& permInv_r = *pPermInv_r;
4138 diag.Resize( numCol );
4141 for( Int orow = 0; orow < numCol; orow++){
4143 Int row = perm[ orow ];
4145 Int col = perm[ perm_r[ orow] ];
4147 Int blockColIdx =
BlockIdx( col, super_ );
4148 Int blockRowIdx =
BlockIdx( row, super_ );
4152 if(
MYROW( grid_ ) ==
PROW( blockRowIdx, grid_ ) &&
4153 MYCOL( grid_ ) ==
PCOL( blockColIdx, grid_ ) ){
4155 bool isFound =
false;
4157 if( blockColIdx <= blockRowIdx ){
4160 std::vector<LBlock<T> >& Lcol = this->L(
LBj( blockColIdx, grid_ ) );
4162 for( Int ib = 0; ib < Lcol.size(); ib++ ){
4163 #if ( _DEBUGlevel_ >= 1 )
4164 statusOFS <<
"blockRowIdx = " << blockRowIdx <<
", Lcol[ib].blockIdx = " << Lcol[ib].blockIdx <<
", blockColIdx = " << blockColIdx << std::endl;
4166 if( Lcol[ib].blockIdx == blockRowIdx ){
4168 for(
int iloc = 0; iloc < Lcol[ib].numRow; iloc++ ){
4169 if( rows[iloc] == row ){
4172 diagLocal[ orow ] = Lcol[ib].nzval( iloc, jloc );
4180 if( isFound ==
true )
break;
4186 std::vector<UBlock<T> >& Urow = this->U(
LBi( blockRowIdx, grid_ ) );
4188 for( Int jb = 0; jb < Urow.size(); jb++ ){
4189 if( Urow[jb].blockIdx == blockColIdx ){
4191 for(
int jloc = 0; jloc < Urow[jb].numCol; jloc++ ){
4192 if( cols[jloc] == col ){
4195 diagLocal[ orow ] = Urow[jb].nzval( iloc, jloc );
4202 if( isFound ==
true )
break;
4207 if( isFound ==
false ){
4208 statusOFS <<
"In the permutated order, (" << row <<
", " << col <<
4209 ") is not found in PMatrix." << std::endl;
4210 diagLocal[orow] = ZERO<T>();
4230 mpi::Allreduce( diagLocal.Data(), diag.Data(), numCol, MPI_SUM, grid_->comm );
4241 template<
typename T>
4245 PushCallStack(
"PMatrix::PMatrixToDistSparseMatrix");
4247 #if ( _DEBUGlevel_ >= 1 )
4248 statusOFS << std::endl <<
"Converting PMatrix to DistSparseMatrix." << std::endl;
4250 Int mpirank = grid_->mpirank;
4251 Int mpisize = grid_->mpisize;
4253 std::vector<Int> rowSend( mpisize );
4254 std::vector<Int> colSend( mpisize );
4255 std::vector<T> valSend( mpisize );
4256 std::vector<Int> sizeSend( mpisize, 0 );
4257 std::vector<Int> displsSend( mpisize, 0 );
4259 std::vector<Int> rowRecv( mpisize );
4260 std::vector<Int> colRecv( mpisize );
4261 std::vector<T> valRecv( mpisize );
4262 std::vector<Int> sizeRecv( mpisize, 0 );
4263 std::vector<Int> displsRecv( mpisize, 0 );
4267 const IntNumVec& permInv = super_->permInv;
4272 pPerm_r = &super_->perm_r;
4273 pPermInv_r = &super_->permInv_r;
4276 const IntNumVec& permInv_r = *pPermInv_r;
4283 Int numColFirst = this->
NumCol() / mpisize;
4286 for( Int ksup = 0; ksup < numSuper; ksup++ ){
4288 if(
MYCOL( grid_ ) ==
PCOL( ksup, grid_ ) ){
4289 std::vector<LBlock<T> >& Lcol = this->L(
LBj( ksup, grid_ ) );
4290 for( Int ib = 0; ib < Lcol.size(); ib++ ){
4291 for( Int j = 0; j < Lcol[ib].numCol; j++ ){
4292 Int jcol = permInv[ permInv_r[ j +
FirstBlockCol( ksup, super_ ) ] ];
4293 Int dest = std::min( jcol / numColFirst, mpisize - 1 );
4294 sizeSend[dest] += Lcol[ib].numRow;
4300 if(
MYROW( grid_ ) ==
PROW( ksup, grid_ ) ){
4301 std::vector<UBlock<T> >& Urow = this->U(
LBi( ksup, grid_ ) );
4302 for( Int jb = 0; jb < Urow.size(); jb++ ){
4304 for( Int j = 0; j < cols.m(); j++ ){
4305 Int jcol = permInv[ permInv_r[ cols[j] ] ];
4306 Int dest = std::min( jcol / numColFirst, mpisize - 1 );
4307 sizeSend[dest] += Urow[jb].numRow;
4315 &sizeSend[0], 1, MPI_INT,
4316 &sizeRecv[0], 1, MPI_INT, grid_->comm );
4321 for( Int ip = 0; ip < mpisize; ip++ ){
4326 displsSend[ip] = displsSend[ip-1] + sizeSend[ip-1];
4333 displsRecv[ip] = displsRecv[ip-1] + sizeRecv[ip-1];
4336 Int sizeSendTotal = displsSend[mpisize-1] + sizeSend[mpisize-1];
4337 Int sizeRecvTotal = displsRecv[mpisize-1] + sizeRecv[mpisize-1];
4339 rowSend.resize( sizeSendTotal );
4340 colSend.resize( sizeSendTotal );
4341 valSend.resize( sizeSendTotal );
4343 rowRecv.resize( sizeRecvTotal );
4344 colRecv.resize( sizeRecvTotal );
4345 valRecv.resize( sizeRecvTotal );
4347 #if ( _DEBUGlevel_ >= 1 )
4348 statusOFS <<
"displsSend = " << displsSend << std::endl;
4349 statusOFS <<
"displsRecv = " << displsRecv << std::endl;
4353 std::vector<Int> cntSize( mpisize, 0 );
4355 for( Int ksup = 0; ksup < numSuper; ksup++ ){
4357 if(
MYCOL( grid_ ) ==
PCOL( ksup, grid_ ) ){
4358 std::vector<LBlock<T> >& Lcol = this->L(
LBj( ksup, grid_ ) );
4359 for( Int ib = 0; ib < Lcol.size(); ib++ ){
4362 for( Int j = 0; j < Lcol[ib].numCol; j++ ){
4363 Int jcol = permInv[ permInv_r[ j +
FirstBlockCol( ksup, super_ ) ] ];
4364 Int dest = std::min( jcol / numColFirst, mpisize - 1 );
4365 for( Int i = 0; i < rows.m(); i++ ){
4366 rowSend[displsSend[dest] + cntSize[dest]] = permInv[ rows[i] ];
4367 colSend[displsSend[dest] + cntSize[dest]] = jcol;
4368 valSend[displsSend[dest] + cntSize[dest]] = nzval( i, j );
4376 if(
MYROW( grid_ ) ==
PROW( ksup, grid_ ) ){
4377 std::vector<UBlock<T> >& Urow = this->U(
LBi( ksup, grid_ ) );
4378 for( Int jb = 0; jb < Urow.size(); jb++ ){
4381 for( Int j = 0; j < cols.m(); j++ ){
4382 Int jcol = permInv[ permInv_r[ cols[j] ] ];
4383 Int dest = std::min( jcol / numColFirst, mpisize - 1 );
4384 for( Int i = 0; i < Urow[jb].numRow; i++ ){
4385 rowSend[displsSend[dest] + cntSize[dest]] = permInv[ i +
FirstBlockCol( ksup, super_ ) ];
4386 colSend[displsSend[dest] + cntSize[dest]] = jcol;
4387 valSend[displsSend[dest] + cntSize[dest]] = nzval( i, j );
4396 for( Int ip = 0; ip < mpisize; ip++ ){
4397 if( cntSize[ip] != sizeSend[ip] )
4402 throw std::runtime_error(
"Sizes of the sending information do not match." );
4409 &rowSend[0], &sizeSend[0], &displsSend[0],
4410 &rowRecv[0], &sizeRecv[0], &displsRecv[0],
4413 &colSend[0], &sizeSend[0], &displsSend[0],
4414 &colRecv[0], &sizeRecv[0], &displsRecv[0],
4417 &valSend[0], &sizeSend[0], &displsSend[0],
4418 &valRecv[0], &sizeRecv[0], &displsRecv[0],
4421 #if ( _DEBUGlevel_ >= 1 )
4422 statusOFS <<
"Alltoallv communication finished." << std::endl;
4437 Int firstCol = mpirank * numColFirst;
4439 if( mpirank == mpisize-1 )
4440 numColLocal = this->
NumCol() - numColFirst * (mpisize-1);
4442 numColLocal = numColFirst;
4444 std::vector<std::vector<Int> > rows( numColLocal );
4445 std::vector<std::vector<T> > vals( numColLocal );
4447 for( Int ip = 0; ip < mpisize; ip++ ){
4448 Int* rowRecvCur = &rowRecv[displsRecv[ip]];
4449 Int* colRecvCur = &colRecv[displsRecv[ip]];
4450 T* valRecvCur = &valRecv[displsRecv[ip]];
4451 for( Int i = 0; i < sizeRecv[ip]; i++ ){
4452 rows[colRecvCur[i]-firstCol].push_back( rowRecvCur[i] );
4453 vals[colRecvCur[i]-firstCol].push_back( valRecvCur[i] );
4458 std::vector<std::vector<Int> > sortIndex( numColLocal );
4459 for( Int j = 0; j < numColLocal; j++ ){
4460 sortIndex[j].resize( rows[j].size() );
4461 for( Int i = 0; i < sortIndex[j].size(); i++ )
4462 sortIndex[j][i] = i;
4463 std::sort( sortIndex[j].begin(), sortIndex[j].end(),
4464 IndexComp<std::vector<Int>& > ( rows[j] ) );
4476 for( Int j = 0; j < numColLocal; j++ ){
4481 A.
comm = grid_->comm;
4483 #if ( _DEBUGlevel_ >= 1 )
4484 statusOFS <<
"nnzLocal = " << A.
nnzLocal << std::endl;
4485 statusOFS <<
"nnz = " << A.
Nnz() << std::endl;
4494 for( Int j = 0; j < numColLocal; j++ ){
4495 std::vector<Int>& rowsCur = rows[j];
4496 std::vector<Int>& sortIndexCur = sortIndex[j];
4497 std::vector<T>& valsCur = vals[j];
4498 for( Int i = 0; i < rows[j].size(); i++ ){
4500 *(rowPtr++) = rowsCur[sortIndexCur[i]] + 1;
4501 *(nzvalPtr++) = valsCur[sortIndexCur[i]];
4505 #if ( _DEBUGlevel_ >= 1 )
4506 statusOFS <<
"A.colptrLocal[end] = " << A.
colptrLocal(numColLocal) << std::endl;
4507 statusOFS <<
"A.rowindLocal.size() = " << A.
rowindLocal.m() << std::endl;
4522 template<
typename T>
4526 PushCallStack(
"PMatrix::PMatrixToDistSparseMatrix_OLD");
4528 #if ( _DEBUGlevel_ >= 1 )
4529 statusOFS << std::endl <<
"Converting PMatrix to DistSparseMatrix (2nd format)." << std::endl;
4531 Int mpirank = grid_->mpirank;
4532 Int mpisize = grid_->mpisize;
4534 std::vector<Int> rowSend( mpisize );
4535 std::vector<Int> colSend( mpisize );
4536 std::vector<T> valSend( mpisize );
4537 std::vector<Int> sizeSend( mpisize, 0 );
4538 std::vector<Int> displsSend( mpisize, 0 );
4540 std::vector<Int> rowRecv( mpisize );
4541 std::vector<Int> colRecv( mpisize );
4542 std::vector<T> valRecv( mpisize );
4543 std::vector<Int> sizeRecv( mpisize, 0 );
4544 std::vector<Int> displsRecv( mpisize, 0 );
4547 const IntNumVec& permInv = super_->permInv;
4553 if(optionsLU_->RowPerm==
"NOROWPERM"){
4554 pPermInv_r = &super_->permInv;
4557 pPermInv_r = &super_->permInv_r;
4560 const IntNumVec& permInv_r = *pPermInv_r;
4572 Int numColFirst = this->
NumCol() / mpisize;
4575 for( Int ksup = 0; ksup < numSuper; ksup++ ){
4577 if(
MYCOL( grid_ ) ==
PCOL( ksup, grid_ ) ){
4578 std::vector<LBlock<T> >& Lcol = this->L(
LBj( ksup, grid_ ) );
4579 for( Int ib = 0; ib < Lcol.size(); ib++ ){
4580 for( Int j = 0; j < Lcol[ib].numCol; j++ ){
4582 Int dest = std::min( jcol / numColFirst, mpisize - 1 );
4583 sizeSend[dest] += Lcol[ib].numRow;
4589 if(
MYROW( grid_ ) ==
PROW( ksup, grid_ ) ){
4590 std::vector<UBlock<T> >& Urow = this->U(
LBi( ksup, grid_ ) );
4591 for( Int jb = 0; jb < Urow.size(); jb++ ){
4593 for( Int j = 0; j < cols.m(); j++ ){
4594 Int jcol = permInv( cols(j) );
4595 Int dest = std::min( jcol / numColFirst, mpisize - 1 );
4596 sizeSend[dest] += Urow[jb].numRow;
4604 &sizeSend[0], 1, MPI_INT,
4605 &sizeRecv[0], 1, MPI_INT, grid_->comm );
4610 for( Int ip = 0; ip < mpisize; ip++ ){
4615 displsSend[ip] = displsSend[ip-1] + sizeSend[ip-1];
4622 displsRecv[ip] = displsRecv[ip-1] + sizeRecv[ip-1];
4625 Int sizeSendTotal = displsSend[mpisize-1] + sizeSend[mpisize-1];
4626 Int sizeRecvTotal = displsRecv[mpisize-1] + sizeRecv[mpisize-1];
4628 rowSend.resize( sizeSendTotal );
4629 colSend.resize( sizeSendTotal );
4630 valSend.resize( sizeSendTotal );
4632 rowRecv.resize( sizeRecvTotal );
4633 colRecv.resize( sizeRecvTotal );
4634 valRecv.resize( sizeRecvTotal );
4636 #if ( _DEBUGlevel_ >= 1 )
4637 statusOFS <<
"displsSend = " << displsSend << std::endl;
4638 statusOFS <<
"displsRecv = " << displsRecv << std::endl;
4642 std::vector<Int> cntSize( mpisize, 0 );
4645 for( Int ksup = 0; ksup < numSuper; ksup++ ){
4647 if(
MYCOL( grid_ ) ==
PCOL( ksup, grid_ ) ){
4648 std::vector<LBlock<T> >& Lcol = this->L(
LBj( ksup, grid_ ) );
4649 for( Int ib = 0; ib < Lcol.size(); ib++ ){
4652 for( Int j = 0; j < Lcol[ib].numCol; j++ ){
4654 Int dest = std::min( jcol / numColFirst, mpisize - 1 );
4655 for( Int i = 0; i < rows.m(); i++ ){
4656 rowSend[displsSend[dest] + cntSize[dest]] = permInv_r( rows(i) );
4657 colSend[displsSend[dest] + cntSize[dest]] = jcol;
4658 valSend[displsSend[dest] + cntSize[dest]] = nzval( i, j );
4667 if(
MYROW( grid_ ) ==
PROW( ksup, grid_ ) ){
4668 std::vector<UBlock<T> >& Urow = this->U(
LBi( ksup, grid_ ) );
4669 for( Int jb = 0; jb < Urow.size(); jb++ ){
4672 for( Int j = 0; j < cols.m(); j++ ){
4673 Int jcol = permInv( cols(j) );
4674 Int dest = std::min( jcol / numColFirst, mpisize - 1 );
4675 for( Int i = 0; i < Urow[jb].numRow; i++ ){
4676 rowSend[displsSend[dest] + cntSize[dest]] =
4678 colSend[displsSend[dest] + cntSize[dest]] = jcol;
4679 valSend[displsSend[dest] + cntSize[dest]] = nzval( i, j );
4690 for( Int ip = 0; ip < mpisize; ip++ ){
4691 if( cntSize[ip] != sizeSend[ip] ){
4695 throw std::runtime_error(
"Sizes of the sending information do not match." );
4701 &rowSend[0], &sizeSend[0], &displsSend[0],
4702 &rowRecv[0], &sizeRecv[0], &displsRecv[0],
4705 &colSend[0], &sizeSend[0], &displsSend[0],
4706 &colRecv[0], &sizeRecv[0], &displsRecv[0],
4709 &valSend[0], &sizeSend[0], &displsSend[0],
4710 &valRecv[0], &sizeRecv[0], &displsRecv[0],
4713 #if ( _DEBUGlevel_ >= 1 )
4714 statusOFS <<
"Alltoallv communication finished." << std::endl;
4729 Int firstCol = mpirank * numColFirst;
4731 if( mpirank == mpisize-1 )
4732 numColLocal = this->
NumCol() - numColFirst * (mpisize-1);
4734 numColLocal = numColFirst;
4736 std::vector<std::vector<Int> > rows( numColLocal );
4737 std::vector<std::vector<T> > vals( numColLocal );
4739 for( Int ip = 0; ip < mpisize; ip++ ){
4740 Int* rowRecvCur = &rowRecv[displsRecv[ip]];
4741 Int* colRecvCur = &colRecv[displsRecv[ip]];
4742 T* valRecvCur = &valRecv[displsRecv[ip]];
4743 for( Int i = 0; i < sizeRecv[ip]; i++ ){
4744 rows[colRecvCur[i]-firstCol].push_back( rowRecvCur[i] );
4745 vals[colRecvCur[i]-firstCol].push_back( valRecvCur[i] );
4750 std::vector<std::vector<Int> > sortIndex( numColLocal );
4751 for( Int j = 0; j < numColLocal; j++ ){
4752 sortIndex[j].resize( rows[j].size() );
4753 for( Int i = 0; i < sortIndex[j].size(); i++ )
4754 sortIndex[j][i] = i;
4755 std::sort( sortIndex[j].begin(), sortIndex[j].end(),
4756 IndexComp<std::vector<Int>& > ( rows[j] ) );
4763 if( A.
size != this->NumCol() ){
4767 throw std::runtime_error(
"The DistSparseMatrix providing the pattern has a different size from PMatrix." );
4773 throw std::runtime_error(
"The DistSparseMatrix providing the pattern has a different number of local columns from PMatrix." );
4791 B.
comm = grid_->comm;
4795 for( Int j = 0; j < numColLocal; j++ ){
4796 std::vector<Int>& rowsCur = rows[j];
4797 std::vector<Int>& sortIndexCur = sortIndex[j];
4798 std::vector<T>& valsCur = vals[j];
4799 std::vector<Int> rowsCurSorted( rowsCur.size() );
4801 for( Int i = 0; i < rowsCurSorted.size(); i++ ){
4802 rowsCurSorted[i] = rowsCur[sortIndexCur[i]] + 1;
4806 std::vector<Int>::iterator it;
4809 it = std::lower_bound( rowsCurSorted.begin(), rowsCurSorted.end(),
4811 if( it == rowsCurSorted.end() ){
4813 *(nzvalPtr++) = ZERO<T>();
4817 *(nzvalPtr++) = valsCur[ sortIndexCur[it-rowsCurSorted.begin()] ];
4822 #if ( _DEBUGlevel_ >= 1 )
4823 statusOFS <<
"B.colptrLocal[end] = " << B.
colptrLocal(numColLocal) << std::endl;
4824 statusOFS <<
"B.rowindLocal.size() = " << B.
rowindLocal.m() << std::endl;
4841 template<
typename T>
4846 PushCallStack(
"PMatrix::PMatrixToDistSparseMatrix");
4848 #if ( _DEBUGlevel_ >= 1 )
4849 statusOFS << std::endl <<
"Converting PMatrix to DistSparseMatrix (2nd format)." << std::endl;
4851 Int mpirank = grid_->mpirank;
4852 Int mpisize = grid_->mpisize;
4854 std::vector<Int> rowSend( mpisize );
4855 std::vector<Int> colSend( mpisize );
4856 std::vector<T> valSend( mpisize );
4857 std::vector<Int> sizeSend( mpisize, 0 );
4858 std::vector<Int> displsSend( mpisize, 0 );
4860 std::vector<Int> rowRecv( mpisize );
4861 std::vector<Int> colRecv( mpisize );
4862 std::vector<T> valRecv( mpisize );
4863 std::vector<Int> sizeRecv( mpisize, 0 );
4864 std::vector<Int> displsRecv( mpisize, 0 );
4868 const IntNumVec& permInv = super_->permInv;
4873 pPerm_r = &super_->perm_r;
4874 pPermInv_r = &super_->permInv_r;
4877 const IntNumVec& permInv_r = *pPermInv_r;
4881 Int numColFirst = this->
NumCol() / mpisize;
4882 Int firstCol = mpirank * numColFirst;
4884 if( mpirank == mpisize-1 )
4885 numColLocal = this->
NumCol() - numColFirst * (mpisize-1);
4887 numColLocal = numColFirst;
4896 for( Int j = 0; j < numColLocal; j++ ){
4897 Int ocol = firstCol + j;
4898 Int col = perm[ perm_r[ ocol] ];
4899 Int blockColIdx =
BlockIdx( col, super_ );
4900 Int procCol =
PCOL( blockColIdx, grid_ );
4901 for( Int i = colPtr[j] - 1; i < colPtr[j+1] - 1; i++ ){
4902 Int orow = rowPtr[i]-1;
4903 Int row = perm[ orow ];
4904 Int blockRowIdx =
BlockIdx( row, super_ );
4905 Int procRow =
PROW( blockRowIdx, grid_ );
4906 Int dest =
PNUM( procRow, procCol, grid_ );
4907 #if ( _DEBUGlevel_ >= 1 )
4908 statusOFS <<
"("<< orow<<
", "<<ocol<<
") == "<<
"("<< row<<
", "<<col<<
")"<< std::endl;
4909 statusOFS <<
"BlockIdx = " << blockRowIdx <<
", " <<blockColIdx << std::endl;
4910 statusOFS << procRow <<
", " << procCol <<
", "
4911 << dest << std::endl;
4919 &sizeSend[0], 1, MPI_INT,
4920 &sizeRecv[0], 1, MPI_INT, grid_->comm );
4922 #if ( _DEBUGlevel_ >= 1 )
4923 statusOFS << std::endl <<
"sizeSend: " << sizeSend << std::endl;
4924 statusOFS << std::endl <<
"sizeRecv: " << sizeRecv << std::endl;
4930 for( Int ip = 0; ip < mpisize; ip++ ){
4935 displsSend[ip] = displsSend[ip-1] + sizeSend[ip-1];
4942 displsRecv[ip] = displsRecv[ip-1] + sizeRecv[ip-1];
4946 Int sizeSendTotal = displsSend[mpisize-1] + sizeSend[mpisize-1];
4947 Int sizeRecvTotal = displsRecv[mpisize-1] + sizeRecv[mpisize-1];
4949 rowSend.resize( sizeSendTotal );
4950 colSend.resize( sizeSendTotal );
4951 valSend.resize( sizeSendTotal );
4953 rowRecv.resize( sizeRecvTotal );
4954 colRecv.resize( sizeRecvTotal );
4955 valRecv.resize( sizeRecvTotal );
4957 #if ( _DEBUGlevel_ >= 1 )
4958 statusOFS <<
"displsSend = " << displsSend << std::endl;
4959 statusOFS <<
"displsRecv = " << displsRecv << std::endl;
4963 std::vector<Int> cntSize( mpisize, 0 );
4968 for( Int j = 0; j < numColLocal; j++ ){
4970 Int ocol = firstCol + j;
4971 Int col = perm[ perm_r[ ocol] ];
4972 Int blockColIdx =
BlockIdx( col, super_ );
4973 Int procCol =
PCOL( blockColIdx, grid_ );
4974 for( Int i = colPtr[j] - 1; i < colPtr[j+1] - 1; i++ ){
4975 Int orow = rowPtr[i]-1;
4976 Int row = perm[ orow ];
4977 Int blockRowIdx =
BlockIdx( row, super_ );
4978 Int procRow =
PROW( blockRowIdx, grid_ );
4979 Int dest =
PNUM( procRow, procCol, grid_ );
4980 rowSend[displsSend[dest] + cntSize[dest]] = row;
4981 colSend[displsSend[dest] + cntSize[dest]] = col;
4988 for( Int ip = 0; ip < mpisize; ip++ ){
4989 if( cntSize[ip] != sizeSend[ip] ){
4993 throw std::runtime_error(
"Sizes of the sending information do not match." );
4999 &rowSend[0], &sizeSend[0], &displsSend[0],
5000 &rowRecv[0], &sizeRecv[0], &displsRecv[0],
5003 &colSend[0], &sizeSend[0], &displsSend[0],
5004 &colRecv[0], &sizeRecv[0], &displsRecv[0],
5007 #if ( _DEBUGlevel_ >= 1 )
5008 statusOFS <<
"Alltoallv communication of nonzero indices finished." << std::endl;
5012 #if ( _DEBUGlevel_ >= 1 )
5013 for( Int ip = 0; ip < mpisize; ip++ ){
5014 statusOFS <<
"rowSend[" << ip <<
"] = " << rowSend[ip] << std::endl;
5015 statusOFS <<
"rowRecv[" << ip <<
"] = " << rowRecv[ip] << std::endl;
5016 statusOFS <<
"colSend[" << ip <<
"] = " << colSend[ip] << std::endl;
5017 statusOFS <<
"colRecv[" << ip <<
"] = " << colRecv[ip] << std::endl;
5028 for( Int g = 0; g < sizeRecvTotal; g++ ){
5029 Int row = rowRecv[g];
5030 Int col = colRecv[g];
5032 Int blockRowIdx =
BlockIdx( row, super_ );
5033 Int blockColIdx =
BlockIdx( col, super_ );
5036 bool isFound =
false;
5038 if( blockColIdx <= blockRowIdx ){
5041 std::vector<LBlock<T> >& Lcol = this->L(
LBj( blockColIdx, grid_ ) );
5043 for( Int ib = 0; ib < Lcol.size(); ib++ ){
5044 #if ( _DEBUGlevel_ >= 1 )
5045 statusOFS <<
"blockRowIdx = " << blockRowIdx <<
", Lcol[ib].blockIdx = " << Lcol[ib].blockIdx <<
", blockColIdx = " << blockColIdx << std::endl;
5047 if( Lcol[ib].blockIdx == blockRowIdx ){
5049 for(
int iloc = 0; iloc < Lcol[ib].numRow; iloc++ ){
5050 if( rows[iloc] == row ){
5052 valRecv[g] = Lcol[ib].nzval( iloc, jloc );
5058 if( isFound ==
true )
break;
5064 std::vector<UBlock<T> >& Urow = this->U(
LBi( blockRowIdx, grid_ ) );
5066 for( Int jb = 0; jb < Urow.size(); jb++ ){
5067 if( Urow[jb].blockIdx == blockColIdx ){
5069 for(
int jloc = 0; jloc < Urow[jb].numCol; jloc++ ){
5070 if( cols[jloc] == col ){
5072 valRecv[g] = Urow[jb].nzval( iloc, jloc );
5078 if( isFound ==
true )
break;
5083 if( isFound ==
false ){
5084 statusOFS <<
"In the permutated order, (" << row <<
", " << col <<
5085 ") is not found in PMatrix." << std::endl;
5086 valRecv[g] = ZERO<T>();
5095 &valRecv[0], &sizeRecv[0], &displsRecv[0],
5096 &valSend[0], &sizeSend[0], &displsSend[0],
5099 #if ( _DEBUGlevel_ >= 1 )
5100 statusOFS <<
"Alltoallv communication of nonzero values finished." << std::endl;
5119 B.
comm = grid_->comm;
5121 for( Int i = 0; i < mpisize; i++ )
5128 for( Int j = 0; j < numColLocal; j++ ){
5129 Int ocol = firstCol + j;
5130 Int col = perm[ perm_r[ ocol] ];
5131 Int blockColIdx =
BlockIdx( col, super_ );
5132 Int procCol =
PCOL( blockColIdx, grid_ );
5133 for( Int i = colPtr[j] - 1; i < colPtr[j+1] - 1; i++ ){
5134 Int orow = rowPtr[i]-1;
5135 Int row = perm[ orow ];
5136 Int blockRowIdx =
BlockIdx( row, super_ );
5137 Int procRow =
PROW( blockRowIdx, grid_ );
5138 Int dest =
PNUM( procRow, procCol, grid_ );
5139 valPtr[i] = valSend[displsSend[dest] + cntSize[dest]];
5145 for( Int ip = 0; ip < mpisize; ip++ ){
5146 if( cntSize[ip] != sizeSend[ip] ){
5150 throw std::runtime_error(
"Sizes of the sending information do not match." );
5165 template<
typename T>
5169 PushCallStack(
"PMatrix::NnzLocal");
5173 for( Int ksup = 0; ksup < numSuper; ksup++ ){
5174 if(
MYCOL( grid_ ) ==
PCOL( ksup, grid_ ) ){
5175 std::vector<LBlock<T> >& Lcol = this->L(
LBj( ksup, grid_ ) );
5176 for( Int ib = 0; ib < Lcol.size(); ib++ ){
5177 nnzLocal += Lcol[ib].numRow * Lcol[ib].numCol;
5180 if(
MYROW( grid_ ) ==
PROW( ksup, grid_ ) ){
5181 std::vector<UBlock<T> >& Urow = this->U(
LBi( ksup, grid_ ) );
5182 for( Int jb = 0; jb < Urow.size(); jb++ ){
5183 nnzLocal += Urow[jb].numRow * Urow[jb].numCol;
5196 template<
typename T>
5200 PushCallStack(
"PMatrix::Nnz");
5202 LongInt nnzLocal = LongInt( this->NnzLocal() );
5205 MPI_Allreduce( &nnzLocal, &nnz, 1, MPI_LONG_LONG, MPI_SUM, grid_->comm );
5218 PushCallStack(
"PMatrix::GetNegativeInertia");
5222 Real inertiaLocal = 0.0;
5225 for( Int ksup = 0; ksup < numSuper; ksup++ ){
5227 if(
MYROW( grid_ ) ==
PROW( ksup, grid_ ) &&
5228 MYCOL( grid_ ) ==
PCOL( ksup, grid_ ) ){
5230 for( Int i = 0; i < LB.
numRow; i++ ){
5231 if( LB.
nzval(i, i) < 0 )
5238 mpi::Allreduce( &inertiaLocal, &inertia, 1, MPI_SUM, grid_->comm );
5252 PushCallStack(
"PMatrix::GetNegativeInertia");
5256 Real inertiaLocal = 0.0;
5259 for( Int ksup = 0; ksup < numSuper; ksup++ ){
5261 if(
MYROW( grid_ ) ==
PROW( ksup, grid_ ) &&
5262 MYCOL( grid_ ) ==
PCOL( ksup, grid_ ) ){
5263 LBlock<Complex> & LB = this->L(
LBj( ksup, grid_ ) )[0];
5264 for( Int i = 0; i < LB.numRow; i++ ){
5265 if( LB.nzval(i, i).real() < 0 )
5272 mpi::Allreduce( &inertiaLocal, &inertia, 1, MPI_SUM, grid_->comm );
5281 template<
typename T>
5289 pMat =
new PMatrix<T>( pGridType, pSuper, pSelInvOpt, pLuOpt );
5295 template<
typename T>
5303 pMat =
new PMatrix<T>();
5311 template<
typename T>
5312 inline void PMatrix<T>::DumpLU()
5315 const IntNumVec& perm = super_->perm;
5316 const IntNumVec& permInv = super_->permInv;
5318 const IntNumVec * pPerm_r;
5319 const IntNumVec * pPermInv_r;
5321 if(optionsLU_->RowPerm==
"NOROWPERM"){
5322 pPerm_r = &super_->perm;
5323 pPermInv_r = &super_->permInv;
5326 pPerm_r = &super_->perm_r;
5327 pPermInv_r = &super_->permInv_r;
5330 const IntNumVec& perm_r = *pPerm_r;
5331 const IntNumVec& permInv_r = *pPermInv_r;
5333 statusOFS<<
"Col perm is "<<perm<<std::endl;
5334 statusOFS<<
"Row perm is "<<perm_r<<std::endl;
5336 statusOFS<<
"Inv Col perm is "<<permInv<<std::endl;
5337 statusOFS<<
"Inv Row perm is "<<permInv_r<<std::endl;
5342 statusOFS<<
"Content of L"<<std::endl;
5344 for(Int j = 0;j<this->L_.size()-1;++j){
5345 std::vector<LBlock<T> >& Lcol = this->L( j );
5346 Int blockColIdx =
GBj( j, this->grid_ );
5350 for( Int ib = 0; ib < Lcol.size(); ib++ ){
5351 for(Int ir = 0; ir< Lcol[ib].rows.m(); ++ir){
5352 Int row = Lcol[ib].rows[ir];
5353 for(Int col = fc; col<fc+Lcol[ib].numCol;col++){
5354 Int ocol = permInv_r[permInv[col]];
5355 Int orow = permInv[row];
5356 Int jloc = col -
FirstBlockCol( blockColIdx, this->super_ );
5358 T val = Lcol[ib].nzval( iloc, jloc );
5359 statusOFS <<
"("<< orow<<
", "<<ocol<<
") == "<<
"("<< row<<
", "<<col<<
") = "<<val<< std::endl;
5365 statusOFS<<
"Content of U"<<std::endl;
5368 for(Int i = 0;i<this->U_.size()-1;++i){
5369 std::vector<UBlock<T> >& Urow = this->U( i );
5370 Int blockRowIdx =
GBi( i, this->grid_ );
5372 for( Int jb = 0; jb < Urow.size(); jb++ ){
5373 for(Int row = fr; row<fr+Urow[jb].numRow;row++){
5374 for(Int ic = 0; ic< Urow[jb].cols.m(); ++ic){
5375 Int col = Urow[jb].cols[ic];
5376 Int ocol = permInv_r[permInv[col]];
5377 Int orow = permInv[row];
5378 Int iloc = row -
FirstBlockRow( blockRowIdx, this->super_ );
5380 T val = Urow[jb].nzval( iloc, jloc );
5381 statusOFS <<
"("<< orow<<
", "<<ocol<<
") == "<<
"("<< row<<
", "<<col<<
") = "<<val<< std::endl;
5393 template<
typename T>
5394 inline void PMatrix<T>::CopyLU(
const PMatrix<T> & C){
5395 ColBlockIdx_ = C.ColBlockIdx_;
5396 RowBlockIdx_ = C.RowBlockIdx_;
5406 #endif //_PEXSI_PSELINV_IMPL_HPP_
virtual void PreSelInv()
PreSelInv prepares the structure in L_ and U_ so that SelInv only involves matrix-matrix multiplicati...
Definition: pselinv_impl.hpp:3711
Int blockIdx
Block index (supernodal index)
Definition: pselinv.hpp:237
void PMatrixToDistSparseMatrix(DistSparseMatrix< T > &A)
PMatrixToDistSparseMatrix converts the PMatrix into a distributed compressed sparse column matrix for...
Definition: pselinv_impl.hpp:4242
A thin interface for passing parameters to set the SuperLU options.
Definition: superlu_dist_internal.hpp:62
Definition: TreeBcast.hpp:705
SuperNodeType describes mapping between supernode and column, the permutation information, and potentially the elimination tree (not implemented here).
Definition: pselinv.hpp:166
Interface with SuperLU_Dist (version 3.0 and later)
void GetNegativeInertia(Real &inertia)
GetNegativeInertia computes the negative inertia of a PMatrix. This can be used to estimate e...
Int PROW(Int bnum, const GridType *g)
PROW returns the processor row that the bnum-th block (supernode) belongs to.
Definition: pselinv.hpp:293
Int numRow
Number of nonzero rows.
Definition: pselinv.hpp:188
IntNumVec colptrLocal
Dimension numColLocal + 1, storing the pointers to the nonzero row indices and nonzero values in rowp...
Definition: sparse_matrix.hpp:108
void UnpackData(SuperNodeBufferType &snode, std::vector< LBlock< T > > &LcolRecv, std::vector< UBlock< T > > &UrowRecv)
UnpackData.
Definition: pselinv_impl.hpp:1096
Int MYPROC(const GridType *g)
MYPROC returns the current processor rank.
Definition: pselinv.hpp:280
Int nnzLocal
Local number of local nonzeros elements on this processor.
Definition: sparse_matrix.hpp:96
Int PCOL(Int bnum, const GridType *g)
PCOL returns the processor column that the bnum-th block (supernode) belongs to.
Definition: pselinv.hpp:298
Int LBi(Int bnum, const GridType *g)
LBi returns the local block number on the processor at processor row PROW( bnum, g )...
Definition: pselinv.hpp:308
Int Symmetric
Option to specify if matrix is symmetric or not.
Definition: superlu_dist_internal.hpp:108
Profiling and timing using TAU.
Int SuperSize(Int bnum, const SuperNodeType *s)
SuperSize returns the size of the block bnum.
Definition: pselinv.hpp:349
Int LBj(Int bnum, const GridType *g)
LBj returns the local block number on the processor at processor column PCOL( bnum, g ).
Definition: pselinv.hpp:313
Int GBj(Int jLocal, const GridType *g)
GBj returns the global block number from a local block number in the column direction.
Definition: pselinv.hpp:323
void SetValue(NumMat< F > &M, F val)
SetValue sets a numerical matrix to a constant val.
Definition: NumMat_impl.hpp:171
Int numCol
Number of nonzero columns.
Definition: pselinv.hpp:191
Int NumCol(const SuperNodeType *s)
NumCol returns the total number of columns for a supernodal partiiton.
Definition: pselinv.hpp:358
Definition: pselinv.hpp:604
void GetDiagonal(NumVec< T > &diag)
GetDiagonal extracts the diagonal elements of the PMatrix.
Definition: pselinv_impl.hpp:4114
void GetWorkSet(std::vector< Int > &snodeEtree, std::vector< std::vector< Int > > &WSet)
GetWorkSet.
Definition: pselinv_impl.hpp:1271
NumVec< F > nzvalLocal
Dimension nnzLocal, storing the nonzero values.
Definition: sparse_matrix.hpp:116
Int FirstBlockRow(Int bnum, const SuperNodeType *s)
FirstBlockRow returns the first column of a block bnum. Note: the functionality of FirstBlockRow is e...
Definition: pselinv.hpp:344
void GetEtree(std::vector< Int > &etree_supno)
GetEtree computes the supernodal elimination tree to be used later in the pipelined selected inversio...
Definition: pselinv_impl.hpp:1202
LBlock stores a nonzero block in the lower triangular part or the diagonal part in PSelInv...
Definition: pselinv.hpp:182
IntNumVec rowindLocal
Dimension nnzLocal, storing the nonzero row indices. The indices are 1-based (FORTRAN-convention), i.e. the first row index is 1.
Definition: sparse_matrix.hpp:113
Int PNUM(Int i, Int j, const GridType *g)
PNUM returns the processor rank that the bnum-th block (supernode) belongs to.
Definition: pselinv.hpp:303
virtual void ConstructCommunicationPattern()
ConstructCommunicationPattern constructs the communication pattern to be used later in the selected i...
Definition: pselinv_impl.hpp:2670
void ConstructCommunicationPattern_P2p()
ConstructCommunicationPattern_P2p constructs the communication pattern to be used later in the select...
Definition: pselinv_impl.hpp:2678
Int GBi(Int iLocal, const GridType *g)
GBi returns the global block number from a local block number in the row direction.
Definition: pselinv.hpp:318
GridType is the PSelInv way of defining the grid.
Definition: pselinv.hpp:129
void SelInvIntra_P2p(Int lidx, Int &rank)
SelInvIntra_P2p.
Definition: pselinv_impl.hpp:1511
Int MYCOL(const GridType *g)
MYCOL returns my processor column.
Definition: pselinv.hpp:288
Int size
Matrix dimension.
Definition: sparse_matrix.hpp:93
void PMatrixToDistSparseMatrix_OLD(const DistSparseMatrix< T > &A, DistSparseMatrix< T > &B)
PMatrixToDistSparseMatrix_OLD converts the PMatrix into a distributed compressed sparse column matrix...
Definition: pselinv_impl.hpp:4523
IntNumVec rows
Dimension numRow * 1, index (0-based) for the number of nonzero rows.
Definition: pselinv.hpp:194
Int numCol
Number of nonzero columns.
Definition: pselinv.hpp:243
UBlock stores a nonzero block in the upper triangular part in PSelInv.
Definition: pselinv.hpp:234
Int FirstBlockCol(Int bnum, const SuperNodeType *s)
FirstBlockCol returns the first column of a block bnum.
Definition: pselinv.hpp:337
LongInt Nnz()
Compute the total number of nonzeros through MPI_Allreduce.
Definition: sparse_matrix_impl.hpp:107
MPI_Comm comm
MPI communicator.
Definition: sparse_matrix.hpp:119
Int NnzLocal()
NnzLocal computes the number of nonzero elements (L and U) saved locally.
Definition: pselinv_impl.hpp:5166
Int blockIdx
Block index (supernodal index)
Definition: pselinv.hpp:185
PMatrix contains the main data structure and the computational routine for the parallel selected inve...
Definition: ngchol_interf.hpp:57
void SendRecvCD_UpdateU(std::vector< SuperNodeBufferType > &arrSuperNodes, Int stepSuper)
SendRecvCD_UpdateU.
Definition: pselinv_impl.hpp:861
Int BlockIdx(Int i, const SuperNodeType *s)
BlockIdx returns the block index of a column i.
Definition: pselinv.hpp:332
Int numRow
Number of nonzero rows.
Definition: pselinv.hpp:240
NumMat< T > nzval
Dimension numRow * numCol, nonzero elements.
Definition: pselinv.hpp:249
Int nnz
Total number of nonzeros elements.
Definition: sparse_matrix.hpp:101
Int NumSuper(const SuperNodeType *s)
NumSuper returns the total number of supernodes.
Definition: pselinv.hpp:353
void SelInv_P2p()
Point-to-point version of the selected inversion.
Definition: pselinv_impl.hpp:3646
NumMat< T > nzval
Dimension numRow * numCol, nonzero elements.
Definition: pselinv.hpp:198
A thin interface for passing parameters to set the PSelInv options.
Definition: pselinv.hpp:102
Definition: utility.hpp:1481
IntNumVec cols
Dimension numCol * 1, index (0-based) for the number of nonzero rows.
Definition: pselinv.hpp:246
virtual void SelInv()
SelInv is the main function for the selected inversion.
Definition: pselinv_impl.hpp:3593
LongInt Nnz()
Nnz computes the total number of nonzero elements in the PMatrix.
Definition: pselinv_impl.hpp:5197
void SelInv_lookup_indexes(SuperNodeBufferType &snode, std::vector< LBlock< T > > &LcolRecv, std::vector< UBlock< T > > &UrowRecv, NumMat< T > &AinvBuf, NumMat< T > &UBuf)
SelInv_lookup_indexes.
Definition: pselinv_impl.hpp:504
Definition: TreeBcast.hpp:1268
void ComputeDiagUpdate(SuperNodeBufferType &snode)
ComputeDiagUpdate.
Definition: pselinv_impl.hpp:1162
DistSparseMatrix describes a Sparse matrix in the compressed sparse column format (CSC) and distribut...
Definition: sparse_matrix.hpp:91
PMatrixUnsym contains the main data structure and the computational routine for the parallel selected...
Definition: pselinv.hpp:991
Int MYROW(const GridType *g)
MYROW returns my processor row.
Definition: pselinv.hpp:284