46 #ifndef _PEXSI_PSELINV_IMPL_HPP_
47 #define _PEXSI_PSELINV_IMPL_HPP_
56 #define MPI_MAX_COMM (1024)
57 #define BCAST_THRESHOLD 16
62 #ifdef COMMUNICATOR_PROFILE
// Collects communication-pattern statistics when COMMUNICATOR_PROFILE is
// enabled.  For each direction it pairs a per-supernode count vector with a
// map from participation bit-masks to index lists.
// NOTE(review): the end of this class is not visible in this chunk; members
// beyond maskSendToRight_ (if any) are not documented here.
64 class CommunicatorStat{
// Maps a boolean participation mask (one flag per process) to the list of
// supernode indices that use that mask.
66 typedef std::map<std::vector<bool> , std::vector<Int> > bitMaskSet;
// Statistics for messages forwarded down the process column.
68 NumVec<Int> countSendToBelow_;
69 bitMaskSet maskSendToBelow_;
// Statistics for messages received from below.
71 NumVec<Int> countRecvFromBelow_;
72 bitMaskSet maskRecvFromBelow_;
// Statistics for messages forwarded along the process row.
74 NumVec<Int> countSendToRight_;
75 bitMaskSet maskSendToRight_;
// Construct a 2D process grid of nprow x npcol processes over communicator
// Bcomm: duplicates the communicator, checks the process count, and splits
// it into row and column sub-communicators.
// NOTE(review): several interior lines of this constructor are elided in
// this chunk (e.g. the body of the error branches); documentation below is
// limited to the visible statements.
83 inline GridType::GridType ( MPI_Comm Bcomm,
int nprow,
int npcol )
// Fail early if MPI_Init has not been called.
86 MPI_Initialized( &info );
88 ErrorHandling(
"MPI has not been initialized." );
// Create a private communicator (comm) covering the same group as Bcomm so
// that grid traffic does not collide with the caller's communications.
91 MPI_Comm_group( Bcomm, &comm_group );
92 MPI_Comm_create( Bcomm, comm_group, &comm );
95 MPI_Comm_rank( comm, &mpirank );
96 MPI_Comm_size( comm, &mpisize );
// The grid must use every process exactly once.
97 if( mpisize != nprow * npcol ){
98 ErrorHandling(
"mpisize != nprow * npcol." );
// Row-major vs column-major placement of ranks on the grid, selected at
// compile time.
104 #ifdef SWAP_ROWS_COLS
105 Int myrow = mpirank % npcol;
106 Int mycol = mpirank / npcol;
108 Int myrow = mpirank / npcol;
109 Int mycol = mpirank % npcol;
// rowComm groups processes sharing a grid row; colComm groups processes
// sharing a grid column (MPI_Comm_split: color selects the group, key
// orders ranks within it).
112 MPI_Comm_split( comm, myrow, mycol, &rowComm );
113 MPI_Comm_split( comm, mycol, myrow, &colComm );
// The group handle is no longer needed once comm exists.
115 MPI_Group_free( &comm_group );
// Release the three communicators created by the constructor (row, column,
// and the duplicated grid communicator).
122 inline GridType::~GridType ( )
126 MPI_Comm_free( &rowComm );
127 MPI_Comm_free( &colComm );
128 MPI_Comm_free( &comm );
// Release all storage owned by the PMatrix: block index arrays, the
// send/receive pattern masks, and the six arrays of owning raw pointers to
// broadcast/reduction trees.  Each tree pointer is deleted and nulled so
// deallocate() is safe to call more than once.
// NOTE(review): closing braces of the loops are elided in this chunk.
137 void PMatrix<T>::deallocate(){
144 ColBlockIdx_.clear();
145 RowBlockIdx_.clear();
// Clear the communication-pattern masks (NumVec/NumMat-style containers).
151 isSendToBelow_.Clear();
152 isSendToRight_.Clear();
153 isSendToDiagonal_.Clear();
154 isSendToCrossDiagonal_.Clear();
156 isRecvFromBelow_.Clear();
157 isRecvFromAbove_.Clear();
158 isRecvFromLeft_.Clear();
159 isRecvFromCrossDiagonal_.Clear();
// Delete the TreeBcast2-based forwarding trees (the "2" variants).
164 for(
int i =0;i<fwdToBelowTree2_.size();++i){
165 if(fwdToBelowTree2_[i]!=NULL){
166 delete fwdToBelowTree2_[i];
167 fwdToBelowTree2_[i] = NULL;
170 for(
int i =0;i<fwdToRightTree2_.size();++i){
171 if(fwdToRightTree2_[i]!=NULL){
172 delete fwdToRightTree2_[i];
173 fwdToRightTree2_[i] = NULL;
// Delete the legacy TreeBcast forwarding trees.
177 for(
int i =0;i<fwdToBelowTree_.size();++i){
178 if(fwdToBelowTree_[i]!=NULL){
179 delete fwdToBelowTree_[i];
180 fwdToBelowTree_[i] = NULL;
184 for(
int i =0;i<fwdToRightTree_.size();++i){
185 if(fwdToRightTree_[i]!=NULL){
186 delete fwdToRightTree_[i];
187 fwdToRightTree_[i] = NULL;
// Delete the reduction trees.
192 for(
int i =0;i<redToLeftTree_.size();++i){
193 if(redToLeftTree_[i]!=NULL){
194 delete redToLeftTree_[i];
195 redToLeftTree_[i] = NULL;
198 for(
int i =0;i<redToAboveTree_.size();++i){
199 if(redToAboveTree_[i]!=NULL){
200 delete redToAboveTree_[i];
201 redToAboveTree_[i] = NULL;
// Close the communication-profiling stream if it was opened in Setup().
206 #if defined(COMM_PROFILE) || defined(COMM_PROFILE_BCAST)
207 if(commOFS.is_open()){
// Copy-assignment: copies options, block indices, working set, the
// communication-pattern masks, and deep-copies the tree pointers via
// clone() so the two PMatrix objects do not share tree ownership.
// NOTE(review): in the visible lines the destination tree slots are
// overwritten without deleting any previously owned trees, and there is no
// visible self-assignment guard; both may be handled in lines elided from
// this chunk — confirm against the full source.
216 PMatrix<T> & PMatrix<T>::operator = (
const PMatrix<T> & C){
223 options_ = C.options_;
224 optionsFact_ = C.optionsFact_;
228 limIndex_ = C.limIndex_;
230 ColBlockIdx_ = C.ColBlockIdx_;
231 RowBlockIdx_ = C.RowBlockIdx_;
235 workingSet_ = C.workingSet_;
// Copy the send/receive participation masks.
239 isSendToBelow_ = C.isSendToBelow_;
240 isSendToRight_ = C.isSendToRight_;
241 isSendToDiagonal_ = C.isSendToDiagonal_;
242 isSendToCrossDiagonal_ = C.isSendToCrossDiagonal_;
244 isRecvFromBelow_ = C.isRecvFromBelow_;
245 isRecvFromAbove_ = C.isRecvFromAbove_;
246 isRecvFromLeft_ = C.isRecvFromLeft_;
247 isRecvFromCrossDiagonal_ = C.isRecvFromCrossDiagonal_;
// Deep-copy each non-null tree; null entries are presumably left as
// resized default values (the else branches are elided here).
250 fwdToBelowTree2_.resize(C.fwdToBelowTree2_.size());
251 for(
int i = 0 ; i< C.fwdToBelowTree2_.size();++i){
252 if(C.fwdToBelowTree2_[i]!=NULL){
253 fwdToBelowTree2_[i] = C.fwdToBelowTree2_[i]->clone();
256 fwdToRightTree2_.resize(C.fwdToRightTree2_.size());
257 for(
int i = 0 ; i< C.fwdToRightTree2_.size();++i){
258 if(C.fwdToRightTree2_[i]!=NULL){
259 fwdToRightTree2_[i] = C.fwdToRightTree2_[i]->clone();
263 fwdToBelowTree_.resize(C.fwdToBelowTree_.size());
264 for(
int i = 0 ; i< C.fwdToBelowTree_.size();++i){
265 if(C.fwdToBelowTree_[i]!=NULL){
266 fwdToBelowTree_[i] = C.fwdToBelowTree_[i]->clone();
269 fwdToRightTree_.resize(C.fwdToRightTree_.size());
270 for(
int i = 0 ; i< C.fwdToRightTree_.size();++i){
271 if(C.fwdToRightTree_[i]!=NULL){
272 fwdToRightTree_[i] = C.fwdToRightTree_[i]->clone();
277 redToLeftTree_.resize(C.redToLeftTree_.size());
278 for(
int i = 0 ; i< C.redToLeftTree_.size();++i){
279 if(C.redToLeftTree_[i]!=NULL){
280 redToLeftTree_[i] = C.redToLeftTree_[i]->clone();
283 redToAboveTree_.resize(C.redToAboveTree_.size());
284 for(
int i = 0 ; i< C.redToAboveTree_.size();++i){
285 if(C.redToAboveTree_[i]!=NULL){
286 redToAboveTree_[i] = C.redToAboveTree_[i]->clone();
// Copy constructor: delegates to the default constructor, then performs the
// same member-wise copy as operator= (options, indices, masks, and
// clone()-based deep copies of the communication trees).
// NOTE(review): this duplicates the body of operator=; keeping the two in
// sync manually is fragile — the full source may already delegate, but the
// visible lines do not.
296 PMatrix<T>::PMatrix(
const PMatrix<T> & C):PMatrix(){
302 options_ = C.options_;
303 optionsFact_ = C.optionsFact_;
306 limIndex_ = C.limIndex_;
308 ColBlockIdx_ = C.ColBlockIdx_;
309 RowBlockIdx_ = C.RowBlockIdx_;
313 workingSet_ = C.workingSet_;
// Copy the send/receive participation masks.
317 isSendToBelow_ = C.isSendToBelow_;
318 isSendToRight_ = C.isSendToRight_;
319 isSendToDiagonal_ = C.isSendToDiagonal_;
320 isSendToCrossDiagonal_ = C.isSendToCrossDiagonal_;
322 isRecvFromBelow_ = C.isRecvFromBelow_;
323 isRecvFromAbove_ = C.isRecvFromAbove_;
324 isRecvFromLeft_ = C.isRecvFromLeft_;
325 isRecvFromCrossDiagonal_ = C.isRecvFromCrossDiagonal_;
// Deep-copy each non-null communication tree.
330 fwdToBelowTree2_.resize(C.fwdToBelowTree2_.size());
331 for(
int i = 0 ; i< C.fwdToBelowTree2_.size();++i){
332 if(C.fwdToBelowTree2_[i]!=NULL){
333 fwdToBelowTree2_[i] = C.fwdToBelowTree2_[i]->clone();
336 fwdToRightTree2_.resize(C.fwdToRightTree2_.size());
337 for(
int i = 0 ; i< C.fwdToRightTree2_.size();++i){
338 if(C.fwdToRightTree2_[i]!=NULL){
339 fwdToRightTree2_[i] = C.fwdToRightTree2_[i]->clone();
343 fwdToBelowTree_.resize(C.fwdToBelowTree_.size());
344 for(
int i = 0 ; i< C.fwdToBelowTree_.size();++i){
345 if(C.fwdToBelowTree_[i]!=NULL){
346 fwdToBelowTree_[i] = C.fwdToBelowTree_[i]->clone();
349 fwdToRightTree_.resize(C.fwdToRightTree_.size());
350 for(
int i = 0 ; i< C.fwdToRightTree_.size();++i){
351 if(C.fwdToRightTree_[i]!=NULL){
352 fwdToRightTree_[i] = C.fwdToRightTree_[i]->clone();
357 redToLeftTree_.resize(C.redToLeftTree_.size());
358 for(
int i = 0 ; i< C.redToLeftTree_.size();++i){
359 if(C.redToLeftTree_[i]!=NULL){
360 redToLeftTree_[i] = C.redToLeftTree_[i]->clone();
363 redToAboveTree_.resize(C.redToAboveTree_.size());
364 for(
int i = 0 ; i< C.redToAboveTree_.size();++i){
365 if(C.redToAboveTree_[i]!=NULL){
366 redToAboveTree_[i] = C.redToAboveTree_[i]->clone();
// Default constructor (body elided in this chunk).
374 PMatrix<T>::PMatrix ( ){
// Constructing from a grid / supernode partition / options simply forwards
// to Setup(), which does the real initialization.
378 PMatrix<T>::PMatrix (
380 const SuperNodeType* s,
386 this->Setup( g, s, o, oFact );
// Destructor (body elided here; presumably releases resources via
// deallocate() — confirm against the full source).
393 PMatrix<T>::~PMatrix ( )
// Initialize the PMatrix for a given grid, supernode partition, and option
// sets: sizes the local L/U block storage and index arrays, derives the tag
// limit from the communicator's MPI_TAG_UB attribute, and opens the
// communication-profiling stream when profiling is compiled in.
403 void PMatrix<T>::Setup(
405 const SuperNodeType* s,
414 optionsFact_ = oFact;
// Reset then size the per-process block index arrays and the local L
// (column) / U (row) block storage.
421 ColBlockIdx_.clear();
422 RowBlockIdx_.clear();
424 L_.resize( this->NumLocalBlockCol() );
425 U_.resize( this->NumLocalBlockRow() );
427 ColBlockIdx_.resize( this->NumLocalBlockCol() );
428 RowBlockIdx_.resize( this->NumLocalBlockRow() );
431 #if ( _DEBUGlevel_ >= 1 )
432 statusOFS << std::endl <<
"PMatrix is constructed. The grid information: " << std::endl;
433 statusOFS <<
"mpirank = " <<
MYPROC(grid_) << std::endl;
434 statusOFS <<
"myrow = " <<
MYROW(grid_) << std::endl;
435 statusOFS <<
"mycol = " <<
MYCOL(grid_) << std::endl;
// Query the largest legal MPI tag on this communicator; limIndex_ bounds
// how many supernode "ranks" fit before tags must wrap (SELINV_TAG_COUNT
// tags are consumed per index).
445 MPI_Comm_get_attr(grid_->comm, MPI_TAG_UB, &v, &flag);
452 limIndex_ = (Int)maxTag_/(Int)SELINV_TAG_COUNT -1;
// Open one profiling output file per process, named comm_stat<rank>.
455 #if defined(COMM_PROFILE) || defined(COMM_PROFILE_BCAST)
456 std::stringstream ss3;
457 ss3 <<
"comm_stat" <<
MYPROC(this->grid_);
458 if(!commOFS.is_open()){
459 commOFS.open( ss3.str().c_str());
// Fragment of the Sinv * L^T lookup-index computation (the enclosing
// function signature is elided from this chunk).  It builds row/column
// offset tables for the received L and U blocks, assembles UBuf, then for
// every (ib,jb) block pair copies the matching entries of the already
// computed Sinv blocks into AinvBuf, using row/column matching against the
// local L (below-diagonal case) or U (above-diagonal case) of Sinv.
478 TIMER_START(Compute_Sinv_LT_Lookup_Indexes);
// Prefix sums over block row/column counts: rowPtr[ib] is the first row of
// LcolRecv[ib] inside AinvBuf, and similarly colPtr for UrowRecv.
480 TIMER_START(Build_colptr_rowptr);
484 std::vector<Int> rowPtr(LcolRecv.size() + 1);
488 std::vector<Int> colPtr(UrowRecv.size() + 1);
491 for( Int ib = 0; ib < LcolRecv.size(); ib++ ){
492 rowPtr[ib+1] = rowPtr[ib] + LcolRecv[ib].numRow;
495 for( Int jb = 0; jb < UrowRecv.size(); jb++ ){
496 colPtr[jb+1] = colPtr[jb] + UrowRecv[jb].numCol;
// Totals are the last prefix-sum entries.
499 Int numRowAinvBuf = *rowPtr.rbegin();
500 Int numColAinvBuf = *colPtr.rbegin();
501 TIMER_STOP(Build_colptr_rowptr);
503 TIMER_START(Allocate_lookup);
505 AinvBuf.Resize( numRowAinvBuf, numColAinvBuf );
506 UBuf.Resize(
SuperSize( snode.Index, super_ ), numColAinvBuf );
512 TIMER_STOP(Allocate_lookup);
// Pack the received U blocks side by side into UBuf.
514 TIMER_START(Fill_UBuf);
516 for( Int jb = 0; jb < UrowRecv.size(); jb++ ){
519 ErrorHandling(
"The size of UB is not right. Something is seriously wrong." );
522 UB.
numRow, UBuf.VecData( colPtr[jb] ),
SuperSize( snode.Index, super_ ) );
524 TIMER_STOP(Fill_UBuf);
// Main double loop over (U block, L block) pairs.  Lines 529-681 are
// elided from this chunk.
528 TIMER_START(JB_Loop);
682 for( Int jb = 0; jb < UrowRecv.size(); jb++ ){
683 for( Int ib = 0; ib < LcolRecv.size(); ib++ ){
// Destination tile of AinvBuf for this (ib,jb) pair; AinvBuf is stored
// column-major with leading dimension ldAinv.
688 T* nzvalAinv = &AinvBuf( rowPtr[ib], colPtr[jb] );
689 Int ldAinv = AinvBuf.m();
// Case 1 (visible branch): look the block up in column jsup of Sinv's L.
693 std::vector<LBlock<T> >& LcolSinv = this->L(
LBj(jsup, grid_ ) );
694 bool isBlockFound =
false;
695 TIMER_START(PARSING_ROW_BLOCKIDX);
696 for( Int ibSinv = 0; ibSinv < LcolSinv.size(); ibSinv++ ){
698 if( LcolSinv[ibSinv].blockIdx == isup ){
// Map each row of LB onto its position inside the matching Sinv block by
// linear search over the Sinv block's row indices (O(numRow^2) per block).
702 std::vector<Int> relRows( LB.
numRow );
703 Int* rowsLBPtr = LB.
rows.Data();
704 Int* rowsSinvBPtr = SinvB.
rows.Data();
705 for( Int i = 0; i < LB.
numRow; i++ ){
706 bool isRowFound =
false;
707 for( Int i1 = 0; i1 < SinvB.
numRow; i1++ ){
708 if( rowsLBPtr[i] == rowsSinvBPtr[i1] ){
714 if( isRowFound ==
false ){
715 std::ostringstream msg;
716 msg <<
"Row " << rowsLBPtr[i] <<
717 " in LB cannot find the corresponding row in SinvB" << std::endl
718 <<
"LB.rows = " << LB.
rows << std::endl
719 <<
"SinvB.rows = " << SinvB.
rows << std::endl;
720 ErrorHandling( msg.str().c_str() );
// Column offsets are relative to the start of the supernode's columns.
725 std::vector<Int> relCols( UB.
numCol );
727 for( Int j = 0; j < UB.
numCol; j++ ){
728 relCols[j] = UB.
cols[j] - SinvColsSta;
// Gather the selected Sinv entries into the AinvBuf tile.
732 T* nzvalSinv = SinvB.
nzval.Data();
733 Int ldSinv = SinvB.
numRow;
734 for( Int j = 0; j < UB.
numCol; j++ ){
735 for( Int i = 0; i < LB.
numRow; i++ ){
736 nzvalAinv[i+j*ldAinv] =
737 nzvalSinv[relRows[i] + relCols[j] * ldSinv];
745 TIMER_STOP(PARSING_ROW_BLOCKIDX);
746 if( isBlockFound ==
false ){
747 std::ostringstream msg;
748 msg <<
"Block(" << isup <<
", " << jsup
749 <<
") did not find a matching block in Sinv." << std::endl;
750 ErrorHandling( msg.str().c_str() );
// Case 2 (visible branch): look the block up in row isup of Sinv's U.
// Mirror of case 1 with the roles of rows and columns exchanged.
754 std::vector<UBlock<T> >& UrowSinv = this->U(
LBi( isup, grid_ ) );
755 bool isBlockFound =
false;
756 TIMER_START(PARSING_COL_BLOCKIDX);
757 for( Int jbSinv = 0; jbSinv < UrowSinv.size(); jbSinv++ ){
759 if( UrowSinv[jbSinv].blockIdx == jsup ){
// Here rows map directly via the supernode's first row.
763 std::vector<Int> relRows( LB.
numRow );
765 for( Int i = 0; i < LB.
numRow; i++ ){
766 relRows[i] = LB.
rows[i] - SinvRowsSta;
// Columns require a search over the Sinv block's column indices.
770 std::vector<Int> relCols( UB.
numCol );
771 Int* colsUBPtr = UB.
cols.Data();
772 Int* colsSinvBPtr = SinvB.
cols.Data();
773 for( Int j = 0; j < UB.
numCol; j++ ){
774 bool isColFound =
false;
775 for( Int j1 = 0; j1 < SinvB.
numCol; j1++ ){
776 if( colsUBPtr[j] == colsSinvBPtr[j1] ){
782 if( isColFound ==
false ){
783 std::ostringstream msg;
// NOTE(review): this message says "corresponding row" and "UinvB.cols" —
// both look like copy-paste slips from the row-matching branch ("col" /
// "SinvB.cols" intended).  Left untouched here; fix in a code change.
784 msg <<
"Col " << colsUBPtr[j] <<
785 " in UB cannot find the corresponding row in SinvB" << std::endl
786 <<
"UB.cols = " << UB.
cols << std::endl
787 <<
"UinvB.cols = " << SinvB.
cols << std::endl;
788 ErrorHandling( msg.str().c_str() );
// Gather the selected Sinv entries into the AinvBuf tile.
794 T* nzvalSinv = SinvB.
nzval.Data();
795 Int ldSinv = SinvB.
numRow;
796 for( Int j = 0; j < UB.
numCol; j++ ){
797 for( Int i = 0; i < LB.
numRow; i++ ){
798 nzvalAinv[i+j*ldAinv] =
799 nzvalSinv[relRows[i] + relCols[j] * ldSinv];
807 TIMER_STOP(PARSING_COL_BLOCKIDX);
808 if( isBlockFound ==
false ){
809 std::ostringstream msg;
810 msg <<
"Block(" << isup <<
", " << jsup
811 <<
") did not find a matching block in Sinv." << std::endl;
812 ErrorHandling( msg.str().c_str() );
824 TIMER_STOP(Compute_Sinv_LT_Lookup_Indexes);
// Cross-diagonal exchange for a batch of supernodes (function name elided
// in this chunk; parameters include the supernode buffers and stepSuper):
// owners of an L column serialize (RowLocalPtr, BlockIdxLocal, LUpdateBuf)
// and send them to the processes on the cross-diagonal, which receive the
// size then the payload, deserialize, and transpose the data into their
// local U blocks.
829 std::vector<SuperNodeBufferType > & arrSuperNodes,
833 TIMER_START(Send_CD_Update_U);
// Per-supernode offsets into the flat request/buffer arrays below.
// NOTE(review): these are C99 variable-length arrays — non-standard C++;
// std::vector would be the portable choice.
837 Int sendOffset[stepSuper];
838 Int recvOffset[stepSuper];
840 for (Int supidx=0; supidx<stepSuper; supidx++){
842 sendOffset[supidx]=sendCount;
843 recvOffset[supidx]=recvCount;
844 sendCount+= CountSendToCrossDiagonal(snode.Index);
845 recvCount+= CountRecvFromCrossDiagonal(snode.Index);
// One request + one buffer + one size per message, for both directions.
849 std::vector<MPI_Request > arrMpiReqsSendCD(sendCount, MPI_REQUEST_NULL );
850 std::vector<MPI_Request > arrMpiReqsSizeSendCD(sendCount, MPI_REQUEST_NULL );
852 std::vector<MPI_Request > arrMpiReqsRecvCD(recvCount, MPI_REQUEST_NULL );
853 std::vector<MPI_Request > arrMpiReqsSizeRecvCD(recvCount, MPI_REQUEST_NULL );
854 std::vector<std::vector<char> > arrSstrLcolSendCD(sendCount);
855 std::vector<int > arrSstrLcolSizeSendCD(sendCount);
856 std::vector<std::vector<char> > arrSstrLcolRecvCD(recvCount);
857 std::vector<int > arrSstrLcolSizeRecvCD(recvCount);
// Phase 1: post non-blocking sends from L-column owners.
859 for (Int supidx=0; supidx<stepSuper; supidx++){
865 TIMER_START(Send_L_CrossDiag);
867 if(
MYCOL( grid_ ) ==
PCOL( snode.Index, grid_ ) && isSendToCrossDiagonal_(grid_->numProcCol, snode.Index ) ){
870 for(Int dstCol = 0; dstCol<grid_->numProcCol; dstCol++){
871 if(isSendToCrossDiagonal_(dstCol,snode.Index) ){
// Destination is the cross-diagonal process: same grid row as the
// supernode, column dstCol.
872 Int dest =
PNUM(
PROW(snode.Index,grid_),dstCol,grid_);
874 if(
MYPROC( grid_ ) != dest ){
875 MPI_Request & mpiReqSizeSend = arrMpiReqsSizeSendCD[sendOffset[supidx]+sendIdx];
876 MPI_Request & mpiReqSend = arrMpiReqsSendCD[sendOffset[supidx]+sendIdx];
// Serialize the three pieces of the update into one byte buffer.  The
// buffer must outlive the Isend, hence storage in arrSstrLcolSendCD.
879 std::stringstream sstm;
880 std::vector<char> & sstrLcolSend = arrSstrLcolSendCD[sendOffset[supidx]+sendIdx];
881 Int & sstrSize = arrSstrLcolSizeSendCD[sendOffset[supidx]+sendIdx];
883 serialize( snode.RowLocalPtr, sstm, NO_MASK );
884 serialize( snode.BlockIdxLocal, sstm, NO_MASK );
885 serialize( snode.LUpdateBuf, sstm, NO_MASK );
887 sstrLcolSend.resize( Size(sstm) );
888 sstm.read( &sstrLcolSend[0], sstrLcolSend.size() );
889 sstrSize = sstrLcolSend.size();
891 #if ( _DEBUGlevel_ >= 1 )
892 assert(IDX_TO_TAG(snode.Rank,SELINV_TAG_L_SIZE_CD,limIndex_)<=maxTag_);
893 assert(IDX_TO_TAG(snode.Rank,SELINV_TAG_L_CONTENT_CD,limIndex_)<=maxTag_);
// NOTE(review): the size is sent as sizeof(sstrSize) MPI_BYTEs but received
// below (line 930) as 1 MPI_INT — consistent only while Int == int; confirm
// against the project's Int typedef.
896 MPI_Isend( &sstrSize,
sizeof(sstrSize), MPI_BYTE, dest, IDX_TO_TAG(snode.Rank,SELINV_TAG_L_SIZE_CD,limIndex_), grid_->comm, &mpiReqSizeSend );
897 MPI_Isend( (
void*)&sstrLcolSend[0], sstrSize, MPI_BYTE, dest, IDX_TO_TAG(snode.Rank,SELINV_TAG_L_CONTENT_CD,limIndex_), grid_->comm, &mpiReqSend );
899 PROFILE_COMM(
MYPROC(this->grid_),dest,IDX_TO_TAG(snode.Rank,SELINV_TAG_L_SIZE_CD,limIndex_),
sizeof(sstrSize));
900 PROFILE_COMM(
MYPROC(this->grid_),dest,IDX_TO_TAG(snode.Rank,SELINV_TAG_L_CONTENT_CD,limIndex_),sstrSize);
910 TIMER_STOP(Send_L_CrossDiag);
// Phase 2: post size receives on U-row owners.
915 for (Int supidx=0; supidx<stepSuper; supidx++){
918 if(
MYROW( grid_ ) ==
PROW( snode.Index, grid_ ) && isRecvFromCrossDiagonal_(grid_->numProcRow, snode.Index ) ){
920 for(Int srcRow = 0; srcRow<grid_->numProcRow; srcRow++){
921 if(isRecvFromCrossDiagonal_(srcRow,snode.Index) ){
922 Int src =
PNUM(srcRow,
PCOL(snode.Index,grid_),grid_);
923 if(
MYPROC( grid_ ) != src ){
924 Int & sstrSize = arrSstrLcolSizeRecvCD[recvOffset[supidx]+recvIdx];
925 MPI_Request & mpiReqSizeRecv = arrMpiReqsSizeRecvCD[recvOffset[supidx]+recvIdx];
927 #if ( _DEBUGlevel_ >= 1 )
// NOTE(review): this asserts the CONTENT tag although the Irecv below uses
// the SIZE tag — likely a copy-paste slip in the debug check.
928 assert(IDX_TO_TAG(snode.Rank,SELINV_TAG_L_CONTENT_CD,limIndex_)<=maxTag_);
930 MPI_Irecv( &sstrSize, 1, MPI_INT, src, IDX_TO_TAG(snode.Rank,SELINV_TAG_L_SIZE_CD,limIndex_), grid_->comm, &mpiReqSizeRecv);
// All sizes must be known before payload buffers can be sized.
939 mpi::Waitall(arrMpiReqsSizeRecvCD);
// Phase 3: size the payload buffers and post content receives.
941 for (Int supidx=0; supidx<stepSuper; supidx++){
944 if(
MYROW( grid_ ) ==
PROW( snode.Index, grid_ ) && isRecvFromCrossDiagonal_(grid_->numProcRow, snode.Index ) ){
946 for(Int srcRow = 0; srcRow<grid_->numProcRow; srcRow++){
947 if(isRecvFromCrossDiagonal_(srcRow,snode.Index) ){
948 Int src =
PNUM(srcRow,
PCOL(snode.Index,grid_),grid_);
949 if(
MYPROC( grid_ ) != src ){
950 Int & sstrSize = arrSstrLcolSizeRecvCD[recvOffset[supidx]+recvIdx];
951 std::vector<char> & sstrLcolRecv = arrSstrLcolRecvCD[recvOffset[supidx]+recvIdx];
952 MPI_Request & mpiReqRecv = arrMpiReqsRecvCD[recvOffset[supidx]+recvIdx];
953 sstrLcolRecv.resize( sstrSize);
955 #if ( _DEBUGlevel_ >= 1 )
956 assert(IDX_TO_TAG(snode.Rank,SELINV_TAG_L_CONTENT_CD,limIndex_)<=maxTag_);
958 MPI_Irecv( (
void*)&sstrLcolRecv[0], sstrSize, MPI_BYTE, src, IDX_TO_TAG(snode.Rank,SELINV_TAG_L_CONTENT_CD,limIndex_), grid_->comm, &mpiReqRecv );
968 mpi::Waitall(arrMpiReqsRecvCD);
// Phase 4: unpack each received payload and fold it into the local U row.
970 for (Int supidx=0; supidx<stepSuper; supidx++){
975 if(
MYROW( grid_ ) ==
PROW( snode.Index, grid_ ) && isRecvFromCrossDiagonal_(grid_->numProcRow, snode.Index ) ){
977 #if ( _DEBUGlevel_ >= 1 )
978 statusOFS << std::endl <<
" ["<<snode.Index<<
"] "<<
"Update the upper triangular block" << std::endl << std::endl;
979 statusOFS << std::endl <<
" ["<<snode.Index<<
"] "<<
"blockIdxLocal:" << snode.BlockIdxLocal << std::endl << std::endl;
980 statusOFS << std::endl <<
" ["<<snode.Index<<
"] "<<
"rowLocalPtr:" << snode.RowLocalPtr << std::endl << std::endl;
// Each local U block must be matched by exactly one received block.
983 std::vector<UBlock<T> >& Urow = this->U(
LBi( snode.Index, grid_ ) );
984 std::vector<bool> isBlockFound(Urow.size(),
false);
987 for(Int srcRow = 0; srcRow<grid_->numProcRow; srcRow++){
988 if(isRecvFromCrossDiagonal_(srcRow,snode.Index) ){
989 Int src =
PNUM(srcRow,
PCOL(snode.Index,grid_),grid_);
990 TIMER_START(Recv_L_CrossDiag);
992 std::vector<Int> rowLocalPtrRecv;
993 std::vector<Int> blockIdxLocalRecv;
// Remote data is deserialized; local (self) data is copied directly from
// the supernode buffer without any MPI traffic.
996 if(
MYPROC( grid_ ) != src ){
997 std::stringstream sstm;
998 Int & sstrSize = arrSstrLcolSizeRecvCD[recvOffset[supidx]+recvIdx];
999 std::vector<char> & sstrLcolRecv = arrSstrLcolRecvCD[recvOffset[supidx]+recvIdx];
1000 sstm.write( &sstrLcolRecv[0], sstrSize );
1002 deserialize( rowLocalPtrRecv, sstm, NO_MASK );
1003 deserialize( blockIdxLocalRecv, sstm, NO_MASK );
1004 deserialize( UUpdateBuf, sstm, NO_MASK );
1010 rowLocalPtrRecv = snode.RowLocalPtr;
1011 blockIdxLocalRecv = snode.BlockIdxLocal;
1012 UUpdateBuf = snode.LUpdateBuf;
1017 TIMER_STOP(Recv_L_CrossDiag);
1019 #if ( _DEBUGlevel_ >= 1 )
1020 statusOFS<<
" ["<<snode.Index<<
"] P"<<
MYPROC(grid_)<<
" ("<<
MYROW(grid_)<<
","<<
MYCOL(grid_)<<
") <--- LBj("<<snode.Index<<
") <--- P"<<src<<std::endl;
1021 statusOFS << std::endl <<
" ["<<snode.Index<<
"] "<<
"rowLocalPtrRecv:" << rowLocalPtrRecv << std::endl << std::endl;
1022 statusOFS << std::endl <<
" ["<<snode.Index<<
"] "<<
"blockIdxLocalRecv:" << blockIdxLocalRecv << std::endl << std::endl;
// Match each received block to a local U block by block index, copy the
// rows out of UUpdateBuf, then transpose into the U block's nzval (the
// received data is in L orientation).
1027 for( Int ib = 0; ib < blockIdxLocalRecv.size(); ib++ ){
1028 for( Int jb = 0; jb < Urow.size(); jb++ ){
1030 if( UB.
blockIdx == blockIdxLocalRecv[ib] ){
1032 lapack::Lacpy(
'A', Ltmp.m(), Ltmp.n(),
1033 &UUpdateBuf( rowLocalPtrRecv[ib], 0 ),
1034 UUpdateBuf.m(), Ltmp.Data(), Ltmp.m() );
1035 isBlockFound[jb] =
true;
1036 Transpose( Ltmp, UB.
nzval );
// Any unmatched local U block means the communication pattern is broken.
1044 for( Int jb = 0; jb < Urow.size(); jb++ ){
1046 if( !isBlockFound[jb] ){
1047 ErrorHandling(
"UBlock cannot find its update. Something is seriously wrong." );
1053 TIMER_STOP(Send_CD_Update_U);
// Drain the send requests so the send buffers may be destroyed.
1055 mpi::Waitall(arrMpiReqsSizeSendCD);
1056 mpi::Waitall(arrMpiReqsSendCD);
// Unpack the L and U data received for one supernode into UrowRecv /
// LcolRecv (function name elided here).  Non-owner processes deserialize
// from the broadcast-tree buffers; the owner row/column copies its local
// blocks directly.
1059 template<
typename T>
1065 #if ( _DEBUGlevel_ >= 1 )
1066 statusOFS << std::endl <<
"["<<snode.Index<<
"] "<<
"Unpack the received data for processors participate in Gemm. " << std::endl << std::endl;
// U part: processes NOT on the supernode's owner row received serialized U
// blocks and must deserialize them.
1069 if(
MYROW( grid_ ) !=
PROW( snode.Index, grid_ ) ){
1070 std::stringstream sstm;
1072 sstm.write( &snode.SstrUrowRecv[0], snode.SstrUrowRecv.size() );
// Alternative path: pull the payload from the TreeBcast2 local buffer.
1075 TreeBcast2<T> * bcastUTree2 = fwdToBelowTree2_[snode.Index];
1076 sstm.write( (
char*)bcastUTree2->GetLocalBuffer(),bcastUTree2->GetMsgSize());
// All UBlock fields were serialized; deserialize count then each block.
1078 std::vector<Int> mask( UBlockMask::TOTAL_NUMBER, 1 );
1080 deserialize( numUBlock, sstm, NO_MASK );
1081 UrowRecv.resize( numUBlock );
1082 for( Int jb = 0; jb < numUBlock; jb++ ){
1083 deserialize( UrowRecv[jb], sstm, mask );
// Owner row: copy the local U blocks without any deserialization.
1091 UrowRecv.resize(this->U(
LBi( snode.Index, grid_ ) ).size());
1092 std::copy(this->U(
LBi( snode.Index, grid_ ) ).begin(),this->U(
LBi( snode.Index, grid_ )).end(),UrowRecv.begin());
// L part: processes NOT on the supernode's owner column deserialize the
// received L blocks; the NZVAL field is masked out (structure only —
// presumably the numerical values are carried separately; confirm against
// the full source).
1097 if(
MYCOL( grid_ ) !=
PCOL( snode.Index, grid_ ) ){
1098 std::stringstream sstm;
1100 sstm.write( &snode.SstrLcolRecv[0], snode.SstrLcolRecv.size() );
1103 TreeBcast2<T> * bcastLTree2 = fwdToRightTree2_[snode.Index];
1104 sstm.write( (
char*)bcastLTree2->GetLocalBuffer(),bcastLTree2->GetMsgSize());
1106 std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
1107 mask[LBlockMask::NZVAL] = 0;
1109 deserialize( numLBlock, sstm, NO_MASK );
1110 LcolRecv.resize( numLBlock );
1111 for( Int ib = 0; ib < numLBlock; ib++ ){
1112 deserialize( LcolRecv[ib], sstm, mask );
// Owner column: copy the local L blocks, skipping the diagonal block
// (startIdx==1) when this process is also on the owner row.
1118 std::vector<LBlock<T> >& Lcol = this->L(
LBj( snode.Index, grid_ ) );
1119 Int startIdx = (
MYROW( grid_ ) ==
PROW( snode.Index, grid_ ) )?1:0;
1120 LcolRecv.resize( Lcol.size() - startIdx );
1121 std::copy(Lcol.begin()+startIdx,Lcol.end(),LcolRecv.begin());
// Accumulate the diagonal-block update for one supernode (function name
// elided here).  On the supernode's owner column, computes
//   DiagBuf -= sum_ib LUpdateBuf(ib)^T * L(ib)
// over the off-diagonal L blocks via BLAS GEMM ('T','N' with MINUS_ONE
// alpha and ONE beta).
1125 template<
typename T>
1130 if(
MYCOL( grid_ ) ==
PCOL( snode.Index, grid_ ) ){
1131 #if ( _DEBUGlevel_ >= 2 )
1132 statusOFS << std::endl <<
"["<<snode.Index<<
"] "<<
"Updating the diagonal block" << std::endl << std::endl;
1134 std::vector<LBlock<T> >& Lcol = this->L(
LBj( snode.Index, grid_ ) );
// The diagonal buffer is square with the supernode's size; start from 0.
1137 snode.DiagBuf.Resize(
SuperSize( snode.Index, super_ ),
SuperSize( snode.Index, super_ ));
1138 SetValue(snode.DiagBuf, ZERO<T>());
// Skip the diagonal L block itself when this process owns it (it is the
// first entry of Lcol on the owner row).
1141 Int startIb = (
MYROW( grid_ ) ==
PROW( snode.Index, grid_ ))?1:0;
1142 for( Int ib = startIb; ib < Lcol.size(); ib++ ){
// Record GEMM dimensions for the communication/compute profile.
1145 gemm_stat.push_back(snode.DiagBuf.m());
1146 gemm_stat.push_back(snode.DiagBuf.n());
1147 gemm_stat.push_back(Lcol[ib].numRow);
// DiagBuf += (-1) * LUpdateBuf(rows of block ib)^T * LB.nzval.
1152 blas::Gemm(
'T',
'N', snode.DiagBuf.m(), snode.DiagBuf.n(), LB.
numRow,
1153 MINUS_ONE<T>(), &snode.LUpdateBuf( snode.RowLocalPtr[ib-startIb], 0 ), snode.LUpdateBuf.m(),
1154 LB.
nzval.Data(), LB.
nzval.m(), ONE<T>(), snode.DiagBuf.Data(), snode.DiagBuf.m() );
1157 #if ( _DEBUGlevel_ >= 1 )
1158 statusOFS << std::endl <<
"["<<snode.Index<<
"] "<<
"Updated the diagonal block" << std::endl << std::endl;
// Build the supernodal elimination tree etree_supno (enclosing function
// name elided here).  Two visible paths: for non-PARMETIS orderings the
// tree is derived from the column etree and supernode membership; the
// fallback computes, per supernode column, the smallest below-diagonal
// block index locally and then MPI_MIN-reduces across all processes.
1165 template<
typename T>
1170 if( optionsFact_->ColPerm !=
"PARMETIS" ) {
// Path 1: map the column-level etree onto supernodes.  A column whose
// parent falls outside the matrix maps to the virtual root NumSuper().
1177 etree_supno.resize(this->
NumSuper());
1178 for(Int i = 0; i < superNode->etree.m(); ++i){
1179 Int curSnode = superNode->superIdx[i];
1180 Int parentSnode = (superNode->etree[i]>= superNode->etree.m()) ?this->
NumSuper():superNode->superIdx[superNode->etree[i]];
1181 if( curSnode != parentSnode){
1182 etree_supno[curSnode] = parentSnode;
// Path 2: local estimate — initialize every parent to nsupers (no parent),
// then take the minimum block index appearing below the diagonal in each
// locally owned L column.
1191 std::vector<Int> etree_supno_l( nsupers, nsupers );
1192 for( Int ksup = 0; ksup < nsupers; ksup++ ){
1193 if(
MYCOL( grid_ ) ==
PCOL( ksup, grid_ ) ){
1195 std::vector<LBlock<T> >& Lcol = this->L(
LBj(ksup, grid_) );
// On the owner row the first block is the diagonal block; skip it.
1198 if(
MYROW( grid_ ) ==
PROW( ksup, grid_ ) )
1201 for( Int ib = firstBlk; ib < Lcol.size(); ib++ ){
1202 etree_supno_l[ksup] = std::min(etree_supno_l[ksup] , Lcol[ib].blockIdx);
1209 #if ( _DEBUGlevel_ >= 1 )
1210 statusOFS << std::endl <<
" Local supernodal elimination tree is " << etree_supno_l <<std::endl<<std::endl;
// Combine the local estimates: the true parent is the global minimum.
1214 etree_supno.resize( nsupers );
1215 mpi::Allreduce( (Int*) &etree_supno_l[0],(Int *) &etree_supno[0], nsupers, MPI_MIN, grid_->comm );
// The last supernode is the root; give it the out-of-range parent marker.
1216 etree_supno[nsupers-1]=nsupers;
1220 double end = MPI_Wtime( );
1221 statusOFS<<
"Building the list took "<<end-begin<<
"s"<<std::endl;
// Partition the supernodes into pipelined work sets (WSet), one list per
// level of the supernodal elimination tree, then split levels so that
// (a) no level exceeds the pipeline depth limit and (b) no level straddles
// an MPI tag-wrap boundary (multiples of limIndex_).
// Falls back to one supernode per step when pipelining is disabled
// (maxPipelineDepth == 1).
1226 template<
typename T>
1228 std::vector<Int> & snodeEtree,
1229 std::vector<std::vector<Int> > & WSet)
1231 TIMER_START(Compute_WorkSet);
1235 if (options_->maxPipelineDepth!=1){
// -1 means "unlimited depth".
1237 Int maxDepth = options_->maxPipelineDepth;
1238 maxDepth=maxDepth==-1?std::numeric_limits<Int>::max():maxDepth;
1239 #if ( _DEBUGlevel_ >= 1 )
1240 statusOFS<<
"MaxDepth is "<<maxDepth<<endl;
// Compute each supernode's depth below the virtual root.  Children have
// larger indices than parents, so a reverse sweep sees the parent's level
// before the child's.
1246 Int rootParent = numSuper;
1252 level(rootParent)=-1;
1254 for(Int i=rootParent-1; i>=0; i-- ){
1255 level[i] = level[snodeEtree[i]]+1;
1256 numLevel = std::max(numLevel, level[i]);
// Count supernodes per level so each WSet entry can be reserved exactly.
1264 for(Int i=rootParent-1; i>=0; i-- ){
1266 levelSize[level[i]]++;
1271 WSet.resize(numLevel,std::vector<Int>());
1272 for(Int i=0; i<numLevel; i++ ){
1273 WSet[i].reserve(levelSize(i));
// Distribute supernodes into their level's list.
1277 for(Int i=rootParent-1; i>=0; i-- ){
1298 WSet[level[i]].push_back(i);
// Second pass: split levels.  "rank" tracks the running supernode rank
// used for tag generation; a level whose rank range crosses a multiple of
// limIndex_ must be split at the boundary to keep tags valid.
1305 Int limit = maxDepth;
1307 for (Int lidx=0; lidx<WSet.size() ; lidx++){
1316 Int maxRank = rank + WSet[lidx].size()-1;
1318 #if ( _DEBUGlevel_ >= 1 )
1319 statusOFS<< (Int)(rank/limIndex_) <<
" vs2 "<<(Int)(maxRank/limIndex_)<<std::endl;
1321 if( (Int)(rank/limIndex_) != (Int)(maxRank/limIndex_)){
// Split just before the tag-wrap boundary.
1324 splitIdx = limIndex_-(rank%limIndex_)-1;
// The split point honors both the depth limit and the tag boundary.
1327 Int splitPoint = std::min((Int)WSet[lidx].size()-1,splitIdx>0?std::min(limit-1,splitIdx):limit-1);
1328 #if ( _DEBUGlevel_ >= 1 )
1330 statusOFS<<
"TEST SPLIT at "<<splitIdx<<
" "<<std::endl;
1333 split = split || (WSet[lidx].size()>limit);
1335 rank += splitPoint+1;
1337 if( split && splitPoint>0)
// NOTE(review): this statusOFS line sits outside any _DEBUGlevel_ guard in
// the visible code (the guard opens on the next line) — it may log
// unconditionally; confirm against the full source.
1339 statusOFS<<
"SPLITPOINT is "<<splitPoint<<
" "<<std::endl;
1340 #if ( _DEBUGlevel_ >= 1 )
1341 if(splitPoint != limit-1){
1342 statusOFS<<
"-----------------------"<<std::endl;
1343 statusOFS<<lidx<<
": "<<orank<<
" -- "<<maxRank<<std::endl;
1344 statusOFS<<
" is NOW "<<std::endl;
1345 statusOFS<<lidx<<
": "<<orank<<
" -- "<<rank-1<<std::endl;
1346 statusOFS<<lidx+1<<
": "<<rank<<
" -- "<<maxRank<<std::endl;
1347 statusOFS<<
"-----------------------"<<std::endl;
// Move the tail [splitPoint+1, end) of the current level into a freshly
// inserted next level; the loop will then revisit that new level.
1350 std::vector<std::vector<Int> >::iterator pos = WSet.begin()+lidx+1;
1351 WSet.insert(pos,std::vector<Int>());
1352 WSet[lidx+1].insert(WSet[lidx+1].begin(),WSet[lidx].begin() + splitPoint+1 ,WSet[lidx].end());
1353 WSet[lidx].erase(WSet[lidx].begin()+splitPoint+1,WSet[lidx].end());
1355 #if ( _DEBUGlevel_ >= 1 )
// Invariant check: after a tag-boundary split the level must end exactly
// on a multiple of limIndex_.
1356 if(splitPoint != limit-1){
1357 assert((orank+WSet[lidx].size()-1)%limIndex_==0);
// No-pipelining fallback: each supernode is its own step, processed in
// decreasing index order starting from numSuper-2.
1442 for( Int ksup = numSuper - 2; ksup >= 0; ksup-- ){
1443 WSet.push_back(std::vector<Int>());
1444 WSet.back().push_back(ksup);
1452 TIMER_STOP(Compute_WorkSet);
// Debug dump: each level's supernodes with their etree parents.
1453 #if ( _DEBUGlevel_ >= 1 )
1454 for (Int lidx=0; lidx<WSet.size() ; lidx++){
1455 statusOFS << std::endl <<
"L"<< lidx <<
" is: {";
1456 for (Int supidx=0; supidx<WSet[lidx].size() ; supidx++){
1457 statusOFS << WSet[lidx][supidx] <<
" ["<<snodeEtree[WSet[lidx][supidx]]<<
"] ";
1459 statusOFS <<
" }"<< std::endl;
1464 template<
typename T>
1468 #if defined (PROFILE) || defined(PMPI) || defined(USE_TAU)
1469 Real begin_SendULWaitContentFirst, end_SendULWaitContentFirst, time_SendULWaitContentFirst = 0;
1472 std::vector<std::vector<Int> > & superList = this->WorkingSet();
1473 Int numSteps = superList.size();
1474 Int stepSuper = superList[lidx].size();
1479 TIMER_START(AllocateBuffer);
1482 for (Int supidx=0; supidx<superList[lidx].size(); supidx++){
1483 Int snodeIdx = superList[lidx][supidx];
1485 TreeBcast * bcastLTree = fwdToRightTree_[snodeIdx];
1486 TreeBcast * bcastUTree = fwdToBelowTree_[snodeIdx];
1488 TreeBcast2<T> * bcastLTree = fwdToRightTree2_[snodeIdx];
1489 TreeBcast2<T> * bcastUTree = fwdToBelowTree2_[snodeIdx];
1493 bool participating =
MYROW( grid_ ) ==
PROW( snodeIdx, grid_ )
1494 ||
MYCOL( grid_ ) ==
PCOL( snodeIdx, grid_ )
1495 || CountSendToRight(snodeIdx) > 0
1496 || CountSendToBelow(snodeIdx) > 0
1497 || CountSendToCrossDiagonal(snodeIdx) > 0
1498 || CountRecvFromCrossDiagonal(snodeIdx) >0
1499 || ( isRecvFromLeft_( snodeIdx ) )
1500 || ( isRecvFromAbove_( snodeIdx ) )
1501 || isSendToDiagonal_(snodeIdx)
1502 || (bcastUTree!=NULL)
1503 || (bcastLTree!=NULL)
1505 || (redDTree!=NULL) ;
1515 std::vector<std::vector<MPI_Request> > arrMpireqsSendToBelow;
1516 arrMpireqsSendToBelow.resize( stepSuper, std::vector<MPI_Request>( 2 * grid_->numProcRow, MPI_REQUEST_NULL ));
1517 std::vector<std::vector<MPI_Request> > arrMpireqsSendToRight;
1518 arrMpireqsSendToRight.resize(stepSuper, std::vector<MPI_Request>( 2 * grid_->numProcCol, MPI_REQUEST_NULL ));
1523 std::vector<MPI_Request> arrMpireqsSendToLeft;
1524 arrMpireqsSendToLeft.resize(stepSuper, MPI_REQUEST_NULL );
1526 std::vector<MPI_Request> arrMpireqsSendToAbove;
1527 arrMpireqsSendToAbove.resize(stepSuper, MPI_REQUEST_NULL );
1532 std::vector<MPI_Request> arrMpireqsRecvSizeFromAny;
1533 arrMpireqsRecvSizeFromAny.resize(stepSuper*2 , MPI_REQUEST_NULL);
1534 std::vector<MPI_Request> arrMpireqsRecvContentFromAny;
1535 arrMpireqsRecvContentFromAny.resize(stepSuper*2 , MPI_REQUEST_NULL);
1539 std::vector<SuperNodeBufferType> arrSuperNodes(stepSuper);
1541 for (Int supidx=0; supidx<superList[lidx].size(); supidx++){
1542 Int snodeIdx = superList[lidx][supidx];
1544 TreeBcast * bcastLTree = fwdToRightTree_[snodeIdx];
1545 TreeBcast * bcastUTree = fwdToBelowTree_[snodeIdx];
1547 TreeBcast2<T> * bcastLTree = fwdToRightTree2_[snodeIdx];
1548 TreeBcast2<T> * bcastUTree = fwdToBelowTree2_[snodeIdx];
1552 bool participating =
MYROW( grid_ ) ==
PROW( snodeIdx, grid_ )
1553 ||
MYCOL( grid_ ) ==
PCOL( snodeIdx, grid_ )
1554 || CountSendToRight(snodeIdx) > 0
1555 || CountSendToBelow(snodeIdx) > 0
1556 || CountSendToCrossDiagonal(snodeIdx) > 0
1557 || CountRecvFromCrossDiagonal(snodeIdx) >0
1558 || ( isRecvFromLeft_( snodeIdx ) )
1559 || ( isRecvFromAbove_( snodeIdx ) )
1560 || isSendToDiagonal_(snodeIdx)
1561 || (bcastUTree!=NULL)
1562 || (bcastLTree!=NULL)
1564 || (redDTree!=NULL) ;
1567 arrSuperNodes[pos].Index = superList[lidx][supidx];
1568 arrSuperNodes[pos].Rank = rank;
1580 int numSentToLeft = 0;
1581 std::vector<int> reqSentToLeft;
1586 TIMER_STOP(AllocateBuffer);
1588 #if ( _DEBUGlevel_ >= 1 )
1589 statusOFS << std::endl <<
"Communication to the Schur complement." << std::endl << std::endl;
1593 vector<char> bcastUready(stepSuper,0);
1595 vector<char> bcastLready(stepSuper,0);
1601 TIMER_START(IRecv_Content_UL);
1606 for (Int supidx=0; supidx<stepSuper ; supidx++){
1609 MPI_Request * mpireqsRecvFromAbove = &arrMpireqsRecvContentFromAny[supidx*2];
1610 MPI_Request * mpireqsRecvFromLeft = &arrMpireqsRecvContentFromAny[supidx*2+1];
1613 if( isRecvFromAbove_( snode.Index ) &&
1614 MYROW( grid_ ) !=
PROW( snode.Index, grid_ ) ){
1617 TreeBcast2<T> * bcastUTree2 = fwdToBelowTree2_[snode.Index];
1619 if(bcastUTree2 != NULL){
1620 bcastUTree2->SetTag(IDX_TO_TAG(snode.Rank,SELINV_TAG_U_CONTENT,limIndex_));
1624 bool done = bcastUTree2->Progress();
1626 #if ( _DEBUGlevel_ >= 1 )
1627 statusOFS<<
"["<<snode.Index<<
"] "<<
" trying to progress bcast U "<<done<<std::endl;
1631 TreeBcast * bcastUTree = fwdToBelowTree_[snode.Index];
1632 if(bcastUTree!=NULL){
1633 Int myRoot = bcastUTree->GetRoot();
1634 snode.SizeSstrUrowRecv = bcastUTree->GetMsgSize();
1635 snode.SstrUrowRecv.resize( snode.SizeSstrUrowRecv);
1636 MPI_Irecv( &snode.SstrUrowRecv[0], snode.SizeSstrUrowRecv, MPI_BYTE,
1637 myRoot, IDX_TO_TAG(snode.Rank,SELINV_TAG_U_CONTENT,limIndex_),
1638 grid_->colComm, mpireqsRecvFromAbove );
1639 #if ( _DEBUGlevel_ >= 1 )
1640 statusOFS << std::endl <<
"["<<snode.Index<<
"] "<<
"Receiving U " << snode.SizeSstrUrowRecv <<
" BYTES from "<<myRoot<<
" on tag "<<IDX_TO_TAG(snode.Rank,SELINV_TAG_U_CONTENT,limIndex_)<< std::endl << std::endl;
1646 if( isRecvFromLeft_( snode.Index ) &&
1647 MYCOL( grid_ ) !=
PCOL( snode.Index, grid_ ) ){
1649 TreeBcast2<T> * bcastLTree2 = fwdToRightTree2_[snode.Index];
1651 if(bcastLTree2 != NULL){
1652 bcastLTree2->SetTag(IDX_TO_TAG(snode.Rank,SELINV_TAG_L_CONTENT,limIndex_));
1656 bool done = bcastLTree2->Progress();
1658 #if ( _DEBUGlevel_ >= 1 )
1659 statusOFS<<
"["<<snode.Index<<
"] "<<
" trying to progress bcast L "<<done<<std::endl;
1664 TreeBcast * bcastLTree = fwdToRightTree_[snode.Index];
1665 if(bcastLTree!=NULL){
1666 Int myRoot = bcastLTree->GetRoot();
1667 snode.SizeSstrLcolRecv = bcastLTree->GetMsgSize();
1668 snode.SstrLcolRecv.resize(snode.SizeSstrLcolRecv);
1669 MPI_Irecv( &snode.SstrLcolRecv[0], snode.SizeSstrLcolRecv, MPI_BYTE,
1670 myRoot, IDX_TO_TAG(snode.Rank,SELINV_TAG_L_CONTENT,limIndex_),
1671 grid_->rowComm, mpireqsRecvFromLeft );
1672 #if ( _DEBUGlevel_ >= 1 )
1673 statusOFS << std::endl <<
"["<<snode.Index<<
"] "<<
"Receiving L " << snode.SizeSstrLcolRecv <<
" BYTES from "<<myRoot<<
" on tag "<<IDX_TO_TAG(snode.Rank,SELINV_TAG_L_CONTENT,limIndex_)<< std::endl << std::endl;
1679 TIMER_STOP(IRecv_Content_UL);
1682 TIMER_START(ISend_Content_UL);
1683 for (Int supidx=0; supidx<stepSuper; supidx++){
1686 std::vector<MPI_Request> & mpireqsSendToBelow = arrMpireqsSendToBelow[supidx];
1687 std::vector<MPI_Request> & mpireqsSendToRight = arrMpireqsSendToRight[supidx];
1690 #if ( _DEBUGlevel_ >= 1 )
1691 statusOFS << std::endl <<
"["<<snode.Index<<
"] "
1692 <<
"Communication for the U part." << std::endl << std::endl;
1696 if(
MYROW( grid_ ) ==
PROW( snode.Index, grid_ ) ){
1697 std::vector<UBlock<T> >& Urow = this->U(
LBi(snode.Index, grid_) );
1699 TIMER_START(Serialize_UL);
1700 std::stringstream sstm;
1702 std::vector<Int> mask( UBlockMask::TOTAL_NUMBER, 1 );
1704 serialize( (Int)Urow.size(), sstm, NO_MASK );
1705 for( Int jb = 0; jb < Urow.size(); jb++ ){
1706 serialize( Urow[jb], sstm, mask );
1708 snode.SstrUrowSend.resize( Size( sstm ) );
1709 sstm.read( &snode.SstrUrowSend[0], snode.SstrUrowSend.size() );
1710 snode.SizeSstrUrowSend = snode.SstrUrowSend.size();
1711 TIMER_STOP(Serialize_UL);
1714 TreeBcast2<T> * bcastUTree2 = fwdToBelowTree2_[snode.Index];
1715 if(bcastUTree2!=NULL){
1716 bcastUTree2->SetTag(IDX_TO_TAG(snode.Rank,SELINV_TAG_U_CONTENT,limIndex_));
1717 bcastUTree2->SetLocalBuffer((T*)&snode.SstrUrowSend[0]);
1718 bcastUTree2->SetDataReady(
true);
1719 bool done = bcastUTree2->Progress();
1720 #if ( _DEBUGlevel_ >= 1 )
1721 statusOFS<<
"["<<snode.Index<<
"] "<<
" trying to progress bcast U "<<done<<std::endl;
1726 #if ( _DEBUGlevel_ >= 1 )
1727 for( Int idxRecv = 0; idxRecv < bcastUTree2->GetDestCount(); ++idxRecv ){
1728 Int iProcRow = bcastUTree2->GetDest(idxRecv);
1729 statusOFS << std::endl <<
"["<<snode.Index<<
"] "<<
"Sending U " << snode.SizeSstrUrowSend <<
" BYTES on tag "<<IDX_TO_TAG(snode.Rank,SELINV_TAG_U_CONTENT,limIndex_) << std::endl << std::endl;
1735 TreeBcast * bcastUTree = fwdToBelowTree_[snode.Index];
1736 if(bcastUTree!=NULL){
1737 bcastUTree->ForwardMessage((
char*)&snode.SstrUrowSend[0], snode.SizeSstrUrowSend,
1738 IDX_TO_TAG(snode.Rank,SELINV_TAG_U_CONTENT,limIndex_), &mpireqsSendToBelow[0]);
1739 for( Int idxRecv = 0; idxRecv < bcastUTree->GetDestCount(); ++idxRecv ){
1740 Int iProcRow = bcastUTree->GetDest(idxRecv);
1741 #if ( _DEBUGlevel_ >= 1 )
1742 statusOFS << std::endl <<
"["<<snode.Index<<
"] "<<
"Sending U " << snode.SizeSstrUrowSend <<
" BYTES on tag "<<IDX_TO_TAG(snode.Rank,SELINV_TAG_U_CONTENT,limIndex_) << std::endl << std::endl;
1751 #if ( _DEBUGlevel_ >= 1 )
1752 statusOFS << std::endl <<
"["<<snode.Index<<
"] "<<
"Communication for the L part." << std::endl << std::endl;
1758 if(
MYCOL( grid_ ) ==
PCOL( snode.Index, grid_ ) ){
1759 std::vector<LBlock<T> >& Lcol = this->L(
LBj(snode.Index, grid_) );
1760 TIMER_START(Serialize_UL);
1762 std::stringstream sstm;
1763 std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
1764 mask[LBlockMask::NZVAL] = 0;
1768 if(
MYROW( grid_ ) ==
PROW( snode.Index, grid_ ) )
1769 serialize( (Int)Lcol.size() - 1, sstm, NO_MASK );
1771 serialize( (Int)Lcol.size(), sstm, NO_MASK );
1773 for( Int ib = 0; ib < Lcol.size(); ib++ ){
1774 if( Lcol[ib].blockIdx > snode.Index ){
1775 #if ( _DEBUGlevel_ >= 2 )
1776 statusOFS << std::endl <<
"["<<snode.Index<<
"] "<<
"Serializing Block index " << Lcol[ib].blockIdx << std::endl;
1778 serialize( Lcol[ib], sstm, mask );
1781 snode.SstrLcolSend.resize( Size( sstm ) );
1782 sstm.read( &snode.SstrLcolSend[0], snode.SstrLcolSend.size() );
1783 snode.SizeSstrLcolSend = snode.SstrLcolSend.size();
1784 TIMER_STOP(Serialize_UL);
1787 TreeBcast2<T> * bcastLTree2 = fwdToRightTree2_[snode.Index];
1788 if(bcastLTree2!=NULL){
1789 bcastLTree2->SetTag(IDX_TO_TAG(snode.Rank,SELINV_TAG_L_CONTENT,limIndex_));
1790 bcastLTree2->SetLocalBuffer((T*)&snode.SstrLcolSend[0]);
1791 bcastLTree2->SetDataReady(
true);
1792 bool done = bcastLTree2->Progress();
1793 #if ( _DEBUGlevel_ >= 1 )
1794 statusOFS<<
"["<<snode.Index<<
"] "<<
" trying to progress bcast L "<<done<<std::endl;
1799 #if ( _DEBUGlevel_ >= 1 )
1800 for( Int idxRecv = 0; idxRecv < bcastLTree2->GetDestCount(); ++idxRecv ){
1801 Int iProcRow = bcastLTree2->GetDest(idxRecv);
1802 statusOFS << std::endl <<
"["<<snode.Index<<
"] "<<
"Sending L " << snode.SizeSstrLcolSend <<
" BYTES on tag "<<IDX_TO_TAG(snode.Rank,SELINV_TAG_L_CONTENT,limIndex_) << std::endl << std::endl;
1809 TreeBcast * bcastLTree = fwdToRightTree_[snode.Index];
1810 if(bcastLTree!=NULL){
1811 bcastLTree->ForwardMessage((
char*)&snode.SstrLcolSend[0], snode.SizeSstrLcolSend,
1812 IDX_TO_TAG(snode.Rank,SELINV_TAG_L_CONTENT,limIndex_), &mpireqsSendToRight[0]);
1814 for( Int idxRecv = 0; idxRecv < bcastLTree->GetDestCount(); ++idxRecv ){
1815 Int iProcCol = bcastLTree->GetDest(idxRecv);
1816 #if ( _DEBUGlevel_ >= 1 )
1817 statusOFS << std::endl <<
"["<<snode.Index<<
"] "<<
"Sending L " << snode.SizeSstrLcolSend <<
" BYTES on tag "<<IDX_TO_TAG(snode.Rank,SELINV_TAG_L_CONTENT,limIndex_) << std::endl << std::endl;
1824 TIMER_STOP(ISend_Content_UL);
1827 vector<char> redLdone(stepSuper,0);
1828 for (Int supidx=0; supidx<stepSuper; supidx++){
1832 if(redLTree != NULL){
1833 redLTree->SetTag(IDX_TO_TAG(snode.Rank,SELINV_TAG_L_REDUCE,limIndex_));
1835 redLTree->AllocRecvBuffers();
1837 redLTree->PostFirstRecv();
1842 TIMER_START(Compute_Sinv_LT);
1844 Int msgForwarded = 0;
1846 Int gemmProcessed = 0;
1850 std::list<Int> readySupidx;
1852 for(Int supidx = 0;supidx<stepSuper;supidx++){
1856 if( isRecvFromAbove_( snode.Index ) && isRecvFromLeft_( snode.Index )){
1858 if(
MYCOL( grid_ ) ==
PCOL( snode.Index, grid_ ) ){
1862 if(
MYROW( grid_ ) ==
PROW( snode.Index, grid_ ) ){
1866 if(snode.isReady==2){
1867 readySupidx.push_back(supidx);
1868 #if ( _DEBUGlevel_ >= 1 )
1869 statusOFS<<std::endl<<
"Locally processing ["<<snode.Index<<
"]"<<std::endl;
1873 else if( (isRecvFromLeft_( snode.Index ) ) &&
MYCOL( grid_ ) !=
PCOL( snode.Index, grid_ ) )
1878 if(redLTree != NULL){
1879 TIMER_START(Reduce_Sinv_LT_Isend);
1881 redLTree->SetLocalBuffer(NULL);
1882 redLTree->SetDataReady(
true);
1884 bool done = redLTree->Progress();
1886 #if ( _DEBUGlevel_ >= 1 )
1887 statusOFS<<
"["<<snode.Index<<
"] "<<
" trying to progress reduce L "<<done<<std::endl;
1889 TIMER_STOP(Reduce_Sinv_LT_Isend);
1893 if(
MYROW(grid_)!=
PROW(snode.Index,grid_)){
1895 TreeBcast2<T> * bcastUTree2 = fwdToBelowTree2_[snode.Index];
1896 if(bcastUTree2 != NULL){
1897 if(bcastUTree2->GetDestCount()>0){
1902 TreeBcast * bcastUTree = fwdToBelowTree_[snode.Index];
1903 if(bcastUTree != NULL){
1904 if(bcastUTree->GetDestCount()>0){
1911 if(
MYCOL(grid_)!=
PCOL(snode.Index,grid_)){
1913 TreeBcast2<T> * bcastLTree2 = fwdToRightTree2_[snode.Index];
1914 if(bcastLTree2 != NULL){
1915 if(bcastLTree2->GetDestCount()>0){
1921 TreeBcast * bcastLTree = fwdToRightTree_[snode.Index];
1922 if(bcastLTree != NULL){
1923 if(bcastLTree->GetDestCount()>0){
1931 #if ( _DEBUGlevel_ >= 1 )
1932 statusOFS<<std::endl<<
"gemmToDo ="<<gemmToDo<<std::endl;
1933 statusOFS<<std::endl<<
"msgToFwd ="<<msgToFwd<<std::endl;
1937 #if defined (PROFILE)
1938 end_SendULWaitContentFirst=0;
1939 begin_SendULWaitContentFirst=0;
1942 while(gemmProcessed<gemmToDo || msgForwarded < msgToFwd)
1944 Int reqidx = MPI_UNDEFINED;
1955 TIMER_START(WaitContent_UL);
1956 #if defined(PROFILE)
1957 if(begin_SendULWaitContentFirst==0){
1958 begin_SendULWaitContentFirst=1;
1959 TIMER_START(WaitContent_UL_First);
1969 for (Int supidx2=0; supidx2<stepSuper; supidx2++){
1972 TreeBcast2<T> * bcastUTree2 = fwdToBelowTree2_[snode.Index];
1974 if(bcastUTree2 != NULL ){
1975 if(!bcastUready[supidx2]){
1976 if(!bcastUTree2->IsRoot()){
1977 bool ready = bcastUTree2->IsDataReceived();
1989 if( isRecvFromAbove_( snode.Index ) && isRecvFromLeft_( snode.Index )){
1992 if(snode.isReady==2){
1993 readySupidx.push_back(supidx2);
1994 #if defined(PROFILE)
1995 if(end_SendULWaitContentFirst==0){
1996 TIMER_STOP(WaitContent_UL_First);
1997 end_SendULWaitContentFirst=1;
2003 bcastUready[supidx2]=1;
2005 for( Int idxRecv = 0; idxRecv < bcastUTree2->GetDestCount(); ++idxRecv ){
2006 Int iProcRow = bcastUTree2->GetDest(idxRecv);
2007 statusOFS << std::endl <<
"MATDEBUG ["<<snode.Index<<
"] "<<
"Forwarded U " << snode.SizeSstrUrowRecv <<
" BYTES to "<<iProcRow<<
" on tag "<<IDX_TO_TAG(snode.Rank,SELINV_TAG_U_CONTENT,limIndex_)<< std::endl << std::endl;
2015 bcastUready[supidx2]=1;
2019 bool done = bcastUTree2->Progress();
2021 #if ( _DEBUGlevel_ >= 1 )
2022 statusOFS<<
"["<<snode.Index<<
"] "<<
" trying to progress bcast U "<<done<<std::endl;
2034 TreeBcast2<T> * bcastLTree2 = fwdToRightTree2_[snode.Index];
2036 if(bcastLTree2 != NULL ){
2037 if(!bcastLready[supidx2]){
2038 if(!bcastLTree2->IsRoot()){
2039 bool ready = bcastLTree2->IsDataReceived();
2049 if( isRecvFromAbove_( snode.Index ) && isRecvFromLeft_( snode.Index )){
2052 if(snode.isReady==2){
2053 readySupidx.push_back(supidx2);
2054 #if defined(PROFILE)
2055 if(end_SendULWaitContentFirst==0){
2056 TIMER_STOP(WaitContent_UL_First);
2057 end_SendULWaitContentFirst=1;
2063 bcastLready[supidx2]=1;
2065 for( Int idxRecv = 0; idxRecv < bcastLTree2->GetDestCount(); ++idxRecv ){
2066 Int iProcRow = bcastLTree2->GetDest(idxRecv);
2067 statusOFS << std::endl <<
"MATDEBUG ["<<snode.Index<<
"] "<<
"Forwarded L " << snode.SizeSstrLcolRecv <<
" BYTES to "<<iProcRow<<
" on tag "<<IDX_TO_TAG(snode.Rank,SELINV_TAG_L_CONTENT,limIndex_)<< std::endl << std::endl;
2075 bcastLready[supidx2]=1;
2079 bool done = bcastLTree2->Progress();
2081 #if ( _DEBUGlevel_ >= 1 )
2082 statusOFS<<
"["<<snode.Index<<
"] "<<
" trying to progress bcast L "<<done<<std::endl;
2102 int reqIndices[arrMpireqsRecvContentFromAny.size()];
2105 int err = MPI_Waitsome(2*stepSuper, &arrMpireqsRecvContentFromAny[0], &numRecv, reqIndices, MPI_STATUSES_IGNORE);
2106 assert(err==MPI_SUCCESS);
2108 for(
int i =0;i<numRecv;i++){
2109 reqidx = reqIndices[i];
2111 if(reqidx!=MPI_UNDEFINED)
2120 TreeBcast * bcastUTree = fwdToBelowTree_[snode.Index];
2121 if(bcastUTree != NULL){
2122 if(bcastUTree->GetDestCount()>0){
2124 std::vector<MPI_Request> & mpireqsSendToBelow = arrMpireqsSendToBelow[supidx];
2125 #if ( _DEBUGlevel_ >= 1 )
2126 for( Int idxRecv = 0; idxRecv < bcastUTree->GetDestCount(); ++idxRecv ){
2127 Int iProcRow = bcastUTree->GetDest(idxRecv);
2128 statusOFS << std::endl <<
"["<<snode.Index<<
"] "<<
"Forwarding U " << snode.SizeSstrUrowRecv <<
" BYTES to "<<iProcRow<<
" on tag "<<IDX_TO_TAG(snode.Rank,SELINV_TAG_U_CONTENT,limIndex_)<< std::endl << std::endl;
2132 bcastUTree->ForwardMessage( (
char*)&snode.SstrUrowRecv[0], snode.SizeSstrUrowRecv,
2133 IDX_TO_TAG(snode.Rank,SELINV_TAG_U_CONTENT,limIndex_), &mpireqsSendToBelow[0] );
2134 #if ( _DEBUGlevel_ >= 1 )
2135 for( Int idxRecv = 0; idxRecv < bcastUTree->GetDestCount(); ++idxRecv ){
2136 Int iProcRow = bcastUTree->GetDest(idxRecv);
2137 statusOFS << std::endl <<
"["<<snode.Index<<
"] "<<
"Forwarded U " << snode.SizeSstrUrowRecv <<
" BYTES to "<<iProcRow<<
" on tag "<<IDX_TO_TAG(snode.Rank,SELINV_TAG_U_CONTENT,limIndex_)<< std::endl << std::endl;
2146 else if(reqidx%2==1){
2147 TreeBcast * bcastLTree = fwdToRightTree_[snode.Index];
2148 if(bcastLTree != NULL){
2149 if(bcastLTree->GetDestCount()>0){
2151 std::vector<MPI_Request> & mpireqsSendToRight = arrMpireqsSendToRight[supidx];
2152 #if ( _DEBUGlevel_ >= 1 )
2153 for( Int idxRecv = 0; idxRecv < bcastLTree->GetDestCount(); ++idxRecv ){
2154 Int iProcCol = bcastLTree->GetDest(idxRecv);
2155 statusOFS << std::endl <<
"["<<snode.Index<<
"] "<<
"Forwarding L " << snode.SizeSstrLcolRecv <<
" BYTES to "<<iProcCol<<
" on tag "<<IDX_TO_TAG(snode.Rank,SELINV_TAG_L_CONTENT,limIndex_)<< std::endl << std::endl;
2159 bcastLTree->ForwardMessage( (
char*)&snode.SstrLcolRecv[0], snode.SizeSstrLcolRecv,
2160 IDX_TO_TAG(snode.Rank,SELINV_TAG_L_CONTENT,limIndex_), &mpireqsSendToRight[0]);
2166 #if ( _DEBUGlevel_ >= 1 )
2167 for( Int idxRecv = 0; idxRecv < bcastLTree->GetDestCount(); ++idxRecv ){
2168 Int iProcCol = bcastLTree->GetDest(idxRecv);
2169 statusOFS << std::endl <<
"["<<snode.Index<<
"] "<<
"Forwarded L " << snode.SizeSstrLcolRecv <<
" BYTES to "<<iProcCol<<
" on tag "<<IDX_TO_TAG(snode.Rank,SELINV_TAG_L_CONTENT,limIndex_)<< std::endl << std::endl;
2178 #if ( _DEBUGlevel_ >= 1 )
2179 statusOFS<<std::endl<<
"Received data for ["<<snode.Index<<
"] reqidx%2="<<reqidx%2<<
" is ready ?"<<snode.isReady<<std::endl;
2182 if( isRecvFromAbove_( snode.Index ) && isRecvFromLeft_( snode.Index )){
2186 if(snode.isReady==2){
2187 readySupidx.push_back(supidx);
2189 #if defined(PROFILE)
2190 if(end_SendULWaitContentFirst==0){
2191 TIMER_STOP(WaitContent_UL_First);
2192 end_SendULWaitContentFirst=1;
2202 TIMER_STOP(WaitContent_UL);
2204 }
while( (gemmProcessed<gemmToDo && readySupidx.size()==0) || (gemmProcessed==gemmToDo && msgForwarded<msgToFwd) );
2207 if(readySupidx.size()>0)
2209 supidx = readySupidx.back();
2210 readySupidx.pop_back();
2215 if( isRecvFromAbove_( snode.Index ) && isRecvFromLeft_( snode.Index ) ){
2217 std::vector<LBlock<T> > LcolRecv;
2218 std::vector<UBlock<T> > UrowRecv;
2222 UnpackData(snode, LcolRecv, UrowRecv);
2225 TreeBcast2<T> * bcastUTree2 = fwdToBelowTree2_[snode.Index];
2226 TreeBcast2<T> * bcastLTree2 = fwdToRightTree2_[snode.Index];
2227 if(bcastUTree2->IsDone()){
2228 bcastUTree2->CleanupBuffers();
2230 if(bcastLTree2->IsDone()){
2231 bcastLTree2->CleanupBuffers();
2236 SelInv_lookup_indexes(snode,LcolRecv, UrowRecv,AinvBuf,UBuf);
2238 snode.LUpdateBuf.Resize( AinvBuf.m(),
SuperSize( snode.Index, super_ ) );
2241 gemm_stat.push_back(AinvBuf.m());
2242 gemm_stat.push_back(UBuf.m());
2243 gemm_stat.push_back(AinvBuf.n());
2246 TIMER_START(Compute_Sinv_LT_GEMM);
2247 blas::Gemm(
'N',
'T', AinvBuf.m(), UBuf.m(), AinvBuf.n(), MINUS_ONE<T>(),
2248 AinvBuf.Data(), AinvBuf.m(),
2249 UBuf.Data(), UBuf.m(), ZERO<T>(),
2250 snode.LUpdateBuf.Data(), snode.LUpdateBuf.m() );
2251 TIMER_STOP(Compute_Sinv_LT_GEMM);
2254 #if ( _DEBUGlevel_ >= 2 )
2255 statusOFS << std::endl <<
"["<<snode.Index<<
"] "<<
"snode.LUpdateBuf: " << snode.LUpdateBuf << std::endl;
2261 if(redLTree != NULL){
2262 assert( snode.LUpdateBuf.m() != 0 && snode.LUpdateBuf.n() != 0 );
2263 TIMER_START(Reduce_Sinv_LT_Isend);
2265 redLTree->SetLocalBuffer(snode.LUpdateBuf.Data());
2266 redLTree->SetDataReady(
true);
2268 bool done = redLTree->Progress();
2269 #if ( _DEBUGlevel_ >= 1 )
2270 statusOFS<<
"["<<snode.Index<<
"] "<<
" trying to progress reduce L "<<done<<std::endl;
2272 TIMER_STOP(Reduce_Sinv_LT_Isend);
2277 #if ( _DEBUGlevel_ >= 1 )
2278 statusOFS<<std::endl<<
"gemmProcessed ="<<gemmProcessed<<
"/"<<gemmToDo<<std::endl;
2282 for (Int supidx=0; supidx<stepSuper; supidx++){
2285 if(redLTree != NULL && !redLdone[supidx]){
2286 bool done = redLTree->Progress();
2288 #if ( _DEBUGlevel_ >= 1 )
2289 statusOFS<<
"["<<snode.Index<<
"] "<<
" trying to progress reduce L "<<done<<std::endl;
2297 TIMER_STOP(Compute_Sinv_LT);
2302 TIMER_START(Reduce_Sinv_LT);
2304 bool all_done =
false;
2309 for (Int supidx=0; supidx<stepSuper; supidx++){
2315 if(redLTree != NULL )
2317 if(redLTree != NULL && !redLdone[supidx])
2325 #if ( _DEBUGlevel_ >= 1 )
2326 statusOFS<<
"["<<snode.Index<<
"] "<<
" trying to progress reduce L "<<done<<std::endl;
2328 redLdone[supidx]=done?1:0;
2331 #if ( _DEBUGlevel_ >= 1 )
2332 statusOFS<<
"["<<snode.Index<<
"] "<<
" DONE reduce L"<<std::endl;
2336 if(
MYCOL( grid_ ) ==
PCOL( snode.Index, grid_ ) ){
2338 Int numRowLUpdateBuf;
2339 std::vector<LBlock<T> >& Lcol = this->L(
LBj( snode.Index, grid_ ) );
2340 if(
MYROW( grid_ ) !=
PROW( snode.Index, grid_ ) ){
2341 snode.RowLocalPtr.resize( Lcol.size() + 1 );
2342 snode.BlockIdxLocal.resize( Lcol.size() );
2343 snode.RowLocalPtr[0] = 0;
2344 for( Int ib = 0; ib < Lcol.size(); ib++ ){
2345 snode.RowLocalPtr[ib+1] = snode.RowLocalPtr[ib] + Lcol[ib].numRow;
2346 snode.BlockIdxLocal[ib] = Lcol[ib].blockIdx;
2350 snode.RowLocalPtr.resize( Lcol.size() );
2351 snode.BlockIdxLocal.resize( Lcol.size() - 1 );
2352 snode.RowLocalPtr[0] = 0;
2353 for( Int ib = 1; ib < Lcol.size(); ib++ ){
2354 snode.RowLocalPtr[ib] = snode.RowLocalPtr[ib-1] + Lcol[ib].numRow;
2355 snode.BlockIdxLocal[ib-1] = Lcol[ib].blockIdx;
2358 numRowLUpdateBuf = *snode.RowLocalPtr.rbegin();
2360 if( numRowLUpdateBuf > 0 ){
2361 if( snode.LUpdateBuf.m() == 0 && snode.LUpdateBuf.n() == 0 ){
2362 snode.LUpdateBuf.Resize( numRowLUpdateBuf,
SuperSize( snode.Index, super_ ) );
2363 SetValue(snode.LUpdateBuf, ZERO<T>());
2368 redLTree->SetLocalBuffer(snode.LUpdateBuf.Data());
2373 redLTree->CleanupBuffers();
2376 all_done = all_done && (done || redLdone[supidx]);
2380 TIMER_STOP(Reduce_Sinv_LT);
2386 TIMER_START(Update_Diagonal);
2388 for (Int supidx=0; supidx<stepSuper; supidx++){
2391 ComputeDiagUpdate(snode);
2396 if(redDTree != NULL){
2398 if(
MYROW( grid_ ) ==
PROW( snode.Index, grid_ ) ){
2399 if(snode.DiagBuf.Size()==0){
2400 snode.DiagBuf.Resize(
SuperSize( snode.Index, super_ ),
SuperSize( snode.Index, super_ ));
2401 SetValue(snode.DiagBuf, ZERO<T>());
2406 redDTree->SetLocalBuffer(snode.DiagBuf.Data());
2407 if(!redDTree->IsAllocated()){
2408 redDTree->SetTag(IDX_TO_TAG(snode.Rank,SELINV_TAG_D_REDUCE,limIndex_));
2409 redDTree->AllocRecvBuffers();
2411 redDTree->PostFirstRecv();
2414 redDTree->SetDataReady(
true);
2415 bool done = redDTree->Progress();
2416 #if ( _DEBUGlevel_ >= 1 )
2417 statusOFS<<
"["<<snode.Index<<
"] "<<
" trying to progress reduce D "<<done<<std::endl;
2422 for (Int supidx=0; supidx<stepSuper; supidx++){
2425 if(redDTree != NULL){
2427 if(redDTree->IsAllocated())
2430 bool done = redDTree->Progress();
2431 #if ( _DEBUGlevel_ >= 1 )
2432 statusOFS<<
"["<<snode.Index<<
"] "<<
" trying to progress reduce D "<<done<<std::endl;
2438 TIMER_STOP(Update_Diagonal);
2440 TIMER_START(Reduce_Diagonal);
2443 vector<char> is_done(stepSuper,0);
2444 bool all_done =
false;
2449 for (Int supidx=0; supidx<stepSuper; supidx++){
2454 if(redDTree != NULL )
2456 if(redDTree != NULL && !is_done[supidx])
2460 bool done = redDTree->Progress();
2461 #if ( _DEBUGlevel_ >= 1 )
2462 statusOFS<<
"["<<snode.Index<<
"] "<<
" trying to progress reduce D "<<done<<std::endl;
2464 is_done[supidx]=done?1:0;
2467 #if ( _DEBUGlevel_ >= 1 )
2468 statusOFS<<
"["<<snode.Index<<
"] "<<
" DONE reduce D"<<std::endl;
2470 if(
MYCOL( grid_ ) ==
PCOL( snode.Index, grid_ ) ){
2471 if(
MYROW( grid_ ) ==
PROW( snode.Index, grid_ ) ){
2472 LBlock<T> & LB = this->L(
LBj( snode.Index, grid_ ) )[0];
2474 blas::Axpy( LB.
numRow * LB.
numCol, ONE<T>(), snode.DiagBuf.Data(), 1, LB.
nzval.Data(), 1 );
2475 Symmetrize( LB.
nzval );
2481 redDTree->CleanupBuffers();
2484 all_done = all_done && (done || is_done[supidx]);
2489 TIMER_STOP(Reduce_Diagonal);
2496 SendRecvCD_UpdateU(arrSuperNodes, stepSuper);
2500 TIMER_START(Update_L);
2501 for (Int supidx=0; supidx<stepSuper; supidx++){
2504 #if ( _DEBUGlevel_ >= 1 )
2505 statusOFS << std::endl <<
"["<<snode.Index<<
"] "<<
"Finish updating the L part by filling LUpdateBufReduced back to L" << std::endl << std::endl;
2508 if(
MYCOL( grid_ ) ==
PCOL( snode.Index, grid_ ) && snode.LUpdateBuf.m() > 0 ){
2509 std::vector<LBlock<T> >& Lcol = this->L(
LBj( snode.Index, grid_ ) );
2511 Int startBlock = (
MYROW( grid_ ) ==
PROW( snode.Index, grid_ ))?1:0;
2512 for( Int ib = startBlock; ib < Lcol.size(); ib++ ){
2514 lapack::Lacpy(
'A', LB.
numRow, LB.
numCol, &snode.LUpdateBuf( snode.RowLocalPtr[ib-startBlock], 0 ),
2515 snode.LUpdateBuf.m(), LB.
nzval.Data(), LB.
numRow );
2519 TIMER_STOP(Update_L);
2523 TIMER_START(Barrier);
2527 for (Int supidx=0; supidx<stepSuper; supidx++){
2529 TreeBcast2<T> * &bcastUTree2 = fwdToBelowTree2_[snode.Index];
2531 if(bcastUTree2 != NULL){
2532 bcastUTree2->Wait();
2533 bcastUTree2->CleanupBuffers();
2538 TreeBcast2<T> * &bcastLTree2 = fwdToRightTree2_[snode.Index];
2540 if(bcastLTree2 != NULL){
2541 bcastLTree2->Wait();
2542 bcastLTree2->CleanupBuffers();
2550 for (Int supidx=0; supidx<stepSuper; supidx++){
2555 if(redLTree != NULL){
2557 redLTree->CleanupBuffers();
2563 for (Int supidx=0; supidx<stepSuper; supidx++){
2567 if(redDTree != NULL){
2569 redDTree->CleanupBuffers();
2576 mpi::Waitall(arrMpireqsRecvContentFromAny);
2577 for (Int supidx=0; supidx<stepSuper; supidx++){
2579 Int ksup = snode.Index;
2580 std::vector<MPI_Request> & mpireqsSendToRight = arrMpireqsSendToRight[supidx];
2581 std::vector<MPI_Request> & mpireqsSendToBelow = arrMpireqsSendToBelow[supidx];
2583 #if ( _DEBUGlevel_ >= 1 )
2584 statusOFS<<
"["<<ksup<<
"] mpireqsSendToRight"<<std::endl;
2586 mpi::Waitall( mpireqsSendToRight );
2587 #if ( _DEBUGlevel_ >= 1 )
2588 statusOFS<<
"["<<ksup<<
"] mpireqsSendToBelow"<<std::endl;
2591 mpi::Waitall( mpireqsSendToBelow );
2595 #if ( _DEBUGlevel_ >= 1 )
2596 statusOFS<<
"barrier done"<<std::endl;
2598 TIMER_STOP(Barrier);
2602 template<
typename T>
2605 ConstructCommunicationPattern_P2p();
2610 template<
typename T>
2613 #ifdef COMMUNICATOR_PROFILE
2614 CommunicatorStat stats;
2615 stats.countSendToBelow_.Resize(numSuper);
2616 stats.countSendToRight_.Resize(numSuper);
2617 stats.countRecvFromBelow_.Resize( numSuper );
2618 SetValue( stats.countSendToBelow_, 0 );
2619 SetValue( stats.countSendToRight_, 0 );
2620 SetValue( stats.countRecvFromBelow_, 0 );
2628 TIMER_START(Allocate);
2633 fwdToBelowTree2_.resize(numSuper, NULL );
2634 fwdToRightTree2_.resize(numSuper, NULL );
2636 fwdToBelowTree_.resize(numSuper, NULL );
2637 fwdToRightTree_.resize(numSuper, NULL );
2639 redToLeftTree_.resize(numSuper, NULL );
2640 redToAboveTree_.resize(numSuper, NULL );
2642 isSendToBelow_.Resize(grid_->numProcRow, numSuper);
2643 isSendToRight_.Resize(grid_->numProcCol, numSuper);
2644 isSendToDiagonal_.Resize( numSuper );
2647 SetValue( isSendToDiagonal_,
false );
2649 isSendToCrossDiagonal_.Resize(grid_->numProcCol+1, numSuper );
2650 SetValue( isSendToCrossDiagonal_,
false );
2651 isRecvFromCrossDiagonal_.Resize(grid_->numProcRow+1, numSuper );
2652 SetValue( isRecvFromCrossDiagonal_,
false );
2654 isRecvFromAbove_.Resize( numSuper );
2655 isRecvFromLeft_.Resize( numSuper );
2656 isRecvFromBelow_.Resize( grid_->numProcRow, numSuper );
2657 SetValue( isRecvFromAbove_,
false );
2658 SetValue( isRecvFromBelow_,
false );
2659 SetValue( isRecvFromLeft_,
false );
2661 TIMER_STOP(Allocate);
2663 TIMER_START(GetEtree);
2664 std::vector<Int> snodeEtree(this->
NumSuper());
2665 GetEtree(snodeEtree);
2666 TIMER_STOP(GetEtree);
2670 #if ( _DEBUGlevel_ >= 1 )
2671 statusOFS << std::endl <<
"Local column communication" << std::endl;
2676 std::vector<std::set<Int> > localColBlockRowIdx;
2678 localColBlockRowIdx.resize( this->NumLocalBlockCol() );
2681 TIMER_START(Column_communication);
2684 for( Int ksup = 0; ksup < numSuper; ksup++ ){
2686 if(
MYCOL( grid_ ) ==
PCOL( ksup, grid_ ) ){
2690 std::vector<Int> tAllBlockRowIdx;
2691 std::vector<Int> & colBlockIdx = ColBlockIdx(
LBj(ksup, grid_));
2692 TIMER_START(Allgatherv_Column_communication);
2693 if( grid_ -> mpisize != 1 )
2694 mpi::Allgatherv( colBlockIdx, tAllBlockRowIdx, grid_->colComm );
2696 tAllBlockRowIdx = colBlockIdx;
2698 TIMER_STOP(Allgatherv_Column_communication);
2700 localColBlockRowIdx[
LBj( ksup, grid_ )].insert(
2701 tAllBlockRowIdx.begin(), tAllBlockRowIdx.end() );
2703 #if ( _DEBUGlevel_ >= 1 )
2705 <<
" Column block " << ksup
2706 <<
" has the following nonzero block rows" << std::endl;
2707 for( std::set<Int>::iterator si = localColBlockRowIdx[
LBj( ksup, grid_ )].begin();
2708 si != localColBlockRowIdx[
LBj( ksup, grid_ )].end();
2710 statusOFS << *si <<
" ";
2712 statusOFS << std::endl;
2720 TIMER_STOP(Column_communication);
2722 TIMER_START(Row_communication);
2723 #if ( _DEBUGlevel_ >= 1 )
2724 statusOFS << std::endl <<
"Local row communication" << std::endl;
2729 std::vector<std::set<Int> > localRowBlockColIdx;
2731 localRowBlockColIdx.resize( this->NumLocalBlockRow() );
2733 for( Int ksup = 0; ksup < numSuper; ksup++ ){
2735 if(
MYROW( grid_ ) ==
PROW( ksup, grid_ ) ){
2738 std::vector<Int> tAllBlockColIdx;
2739 std::vector<Int> & rowBlockIdx = RowBlockIdx(
LBi(ksup, grid_));
2740 TIMER_START(Allgatherv_Row_communication);
2741 if( grid_ -> mpisize != 1 )
2742 mpi::Allgatherv( rowBlockIdx, tAllBlockColIdx, grid_->rowComm );
2744 tAllBlockColIdx = rowBlockIdx;
2746 TIMER_STOP(Allgatherv_Row_communication);
2748 localRowBlockColIdx[
LBi( ksup, grid_ )].insert(
2749 tAllBlockColIdx.begin(), tAllBlockColIdx.end() );
2751 #if ( _DEBUGlevel_ >= 1 )
2753 <<
" Row block " << ksup
2754 <<
" has the following nonzero block columns" << std::endl;
2755 for( std::set<Int>::iterator si = localRowBlockColIdx[
LBi( ksup, grid_ )].begin();
2756 si != localRowBlockColIdx[
LBi( ksup, grid_ )].end();
2758 statusOFS << *si <<
" ";
2760 statusOFS << std::endl;
2767 TIMER_STOP(Row_communication);
2769 TIMER_START(STB_RFA);
2770 for( Int ksup = 0; ksup < numSuper - 1; ksup++ ){
2772 #ifdef COMMUNICATOR_PROFILE
2773 std::vector<bool> sTB(grid_->numProcRow,
false);
2776 Int jsup = snodeEtree[ksup];
2777 while(jsup<numSuper){
2778 Int jsupLocalBlockCol =
LBj( jsup, grid_ );
2779 Int jsupProcCol =
PCOL( jsup, grid_ );
2780 if(
MYCOL( grid_ ) == jsupProcCol ){
2782 if( localColBlockRowIdx[jsupLocalBlockCol].count( ksup ) > 0 ) {
2783 for( std::set<Int>::iterator si = localColBlockRowIdx[jsupLocalBlockCol].begin();
2784 si != localColBlockRowIdx[jsupLocalBlockCol].end(); si++ ){
2786 Int isupProcRow =
PROW( isup, grid_ );
2788 if(
MYROW( grid_ ) == isupProcRow ){
2789 isRecvFromAbove_(ksup) =
true;
2791 if(
MYROW( grid_ ) ==
PROW( ksup, grid_ ) ){
2792 isSendToBelow_( isupProcRow, ksup ) =
true;
2797 #ifdef COMMUNICATOR_PROFILE
2798 sTB[
PROW(ksup,grid_) ] =
true;
2799 if( localColBlockRowIdx[jsupLocalBlockCol].count( ksup ) > 0 ) {
2800 for( std::set<Int>::iterator si = localColBlockRowIdx[jsupLocalBlockCol].begin();
2801 si != localColBlockRowIdx[jsupLocalBlockCol].end(); si++ ){
2803 Int isupProcRow =
PROW( isup, grid_ );
2805 sTB[isupProcRow] =
true;
2814 jsup = snodeEtree[jsup];
2817 #ifdef COMMUNICATOR_PROFILE
2818 Int count= std::count(sTB.begin(), sTB.end(),
true);
2819 Int color = sTB[
MYROW(grid_)];
2821 std::vector<Int> & snodeList = stats.maskSendToBelow_[sTB];
2822 snodeList.push_back(ksup);
2824 stats.countSendToBelow_(ksup) = count * color;
2829 TIMER_STOP(STB_RFA);
2831 TIMER_START(STR_RFL_RFB);
2832 for( Int ksup = 0; ksup < numSuper - 1; ksup++ ){
2834 #ifdef COMMUNICATOR_PROFILE
2835 std::vector<bool> sTR(grid_->numProcCol,
false);
2836 std::vector<bool> rFB(grid_->numProcRow,
false);
2839 Int isup = snodeEtree[ksup];
2840 while(isup<numSuper){
2841 Int isupLocalBlockRow =
LBi( isup, grid_ );
2842 Int isupProcRow =
PROW( isup, grid_ );
2843 if(
MYROW( grid_ ) == isupProcRow ){
2845 if( localRowBlockColIdx[isupLocalBlockRow].count( ksup ) > 0 ){
2846 for( std::set<Int>::iterator si = localRowBlockColIdx[isupLocalBlockRow].begin();
2847 si != localRowBlockColIdx[isupLocalBlockRow].end(); si++ ){
2849 Int jsupProcCol =
PCOL( jsup, grid_ );
2851 if(
MYCOL( grid_ ) == jsupProcCol ){
2852 isRecvFromLeft_(ksup) =
true;
2854 if(
MYCOL( grid_ ) ==
PCOL( ksup, grid_ ) ){
2855 isSendToRight_( jsupProcCol, ksup ) =
true;
2861 #ifdef COMMUNICATOR_PROFILE
2862 sTR[
PCOL(ksup,grid_) ] =
true;
2863 if( localRowBlockColIdx[isupLocalBlockRow].count( ksup ) > 0 ){
2864 for( std::set<Int>::iterator si = localRowBlockColIdx[isupLocalBlockRow].begin();
2865 si != localRowBlockColIdx[isupLocalBlockRow].end(); si++ ){
2867 Int jsupProcCol =
PCOL( jsup, grid_ );
2869 sTR[ jsupProcCol ] =
true;
2878 if(
MYCOL( grid_ ) ==
PCOL(ksup, grid_) ){
2879 if(
MYROW( grid_ ) ==
PROW( ksup, grid_ ) ){
2880 isRecvFromBelow_(isupProcRow,ksup) =
true;
2882 else if (
MYROW(grid_) == isupProcRow){
2883 isSendToDiagonal_(ksup)=
true;
2887 #ifdef COMMUNICATOR_PROFILE
2888 rFB[
PROW(ksup,grid_) ] =
true;
2889 if(
MYCOL( grid_ ) ==
PCOL(ksup, grid_) ){
2890 rFB[ isupProcRow ] =
true;
2895 isup = snodeEtree[isup];
2898 #ifdef COMMUNICATOR_PROFILE
2899 Int count= std::count(rFB.begin(), rFB.end(),
true);
2900 Int color = rFB[
MYROW(grid_)];
2902 std::vector<Int> & snodeList = stats.maskRecvFromBelow_[rFB];
2903 snodeList.push_back(ksup);
2905 stats.countRecvFromBelow_(ksup) = count * color;
2907 count= std::count(sTR.begin(), sTR.end(),
true);
2908 color = sTR[
MYCOL(grid_)];
2910 std::vector<Int> & snodeList = stats.maskSendToRight_[sTR];
2911 snodeList.push_back(ksup);
2913 stats.countSendToRight_(ksup) = count * color;
2919 TIMER_STOP(STR_RFL_RFB);
2921 #ifdef COMMUNICATOR_PROFILE
2926 stats.maskSendToBelow_.insert(stats.maskRecvFromBelow_.begin(),stats.maskRecvFromBelow_.end());
2927 statusOFS << std::endl <<
"stats.maskSendToBelow_:" << stats.maskSendToBelow_.size() <<std::endl;
2946 statusOFS << std::endl <<
"stats.maskSendToRight_:" << stats.maskSendToRight_.size() <<std::endl;
2951 TIMER_START(BUILD_BCAST_TREES);
2954 vector<double> SeedRFL(numSuper,0.0);
2955 vector<Int> aggRFL(numSuper);
2956 vector<Int> globalAggRFL(numSuper*grid_->numProcCol);
2957 for( Int ksup = 0; ksup < numSuper ; ksup++ ){
2958 #if ( _DEBUGlevel_ >= 1 )
2959 statusOFS<<
"1 "<<ksup<<std::endl;
2962 std::vector<LBlock<T> >& Lcol = this->L(
LBj(ksup, grid_) );
2967 totalSize+=
sizeof(Int);
2968 for( Int ib = 0; ib < Lcol.size(); ib++ ){
2969 if( Lcol[ib].blockIdx > ksup ){
2971 totalSize+=3*
sizeof(Int);
2972 totalSize+=
sizeof(Int)+Lcol[ib].rows.ByteSize();
2977 aggRFL[ksup]=totalSize;
2978 SeedRFL[ksup]=rand();
2981 else if(isRecvFromLeft_(ksup)){
2989 MPI_Allgather(&aggRFL[0],numSuper*
sizeof(Int),MPI_BYTE,
2990 &globalAggRFL[0],numSuper*
sizeof(Int),MPI_BYTE,
2992 MPI_Allreduce(MPI_IN_PLACE,&SeedRFL[0],numSuper,MPI_DOUBLE,MPI_MAX,grid_->rowComm);
2995 for( Int ksup = 0; ksup < numSuper; ksup++ ){
2996 #if ( _DEBUGlevel_ >= 1 )
2997 statusOFS<<
"3 "<<ksup<<std::endl;
2999 if( isRecvFromLeft_(ksup) || CountSendToRight(ksup)>0 ){
3001 vector<Int> tree_ranks;
3003 for( Int iProcCol = 0; iProcCol < grid_->numProcCol; iProcCol++ ){
3004 if( iProcCol !=
PCOL( ksup, grid_ ) ){
3005 Int isRFL = globalAggRFL[iProcCol*numSuper + ksup];
3012 tree_ranks.reserve(countRFL+1);
3013 tree_ranks.push_back(
PCOL(ksup,grid_));
3015 for( Int iProcCol = 0; iProcCol < grid_->numProcCol; iProcCol++ ){
3016 Int isRFL = globalAggRFL[iProcCol*numSuper + ksup];
3018 if( iProcCol !=
PCOL( ksup, grid_ ) ){
3019 tree_ranks.push_back(iProcCol);
3028 TreeBcast2<T> * & BcastLTree2 = fwdToRightTree2_[ksup];
3029 if(BcastLTree2!=NULL){
delete BcastLTree2; BcastLTree2=NULL;}
3030 BcastLTree2 = TreeBcast2<T>::Create(this->grid_->rowComm,&tree_ranks[0],tree_ranks.size(),msgSize,SeedRFL[ksup]);
3032 #ifdef COMM_PROFILE_BCAST
3033 BcastLTree2->SetGlobalComm(grid_->comm);
3036 TreeBcast * & BcastLTree = fwdToRightTree_[ksup];
3037 if(BcastLTree!=NULL){
delete BcastLTree; BcastLTree=NULL;}
3038 BcastLTree = TreeBcast::Create(this->grid_->rowComm,&tree_ranks[0],tree_ranks.size(),msgSize,SeedRFL[ksup]);
3039 #ifdef COMM_PROFILE_BCAST
3040 BcastLTree->SetGlobalComm(grid_->comm);
3053 vector<double> SeedRFA(numSuper,0.0);
3054 vector<Int> aggRFA(numSuper);
3055 vector<Int> globalAggRFA(numSuper*grid_->numProcRow);
3056 for( Int ksup = 0; ksup < numSuper ; ksup++ ){
3057 #if ( _DEBUGlevel_ >= 1 )
3058 statusOFS<<
"2 "<<ksup<<std::endl;
3061 std::vector<UBlock<T> >& Urow = this->U(
LBi(ksup, grid_) );
3066 totalSize+=
sizeof(Int);
3067 for( Int jb = 0; jb < Urow.size(); jb++ ){
3068 if( Urow[jb].blockIdx >= ksup ){
3070 totalSize+=3*
sizeof(Int);
3071 totalSize+=
sizeof(Int)+Urow[jb].cols.ByteSize();
3072 totalSize+= 2*
sizeof(Int)+Urow[jb].nzval.ByteSize();
3075 aggRFA[ksup]=totalSize;
3076 SeedRFA[ksup]=rand();
3078 else if(isRecvFromAbove_(ksup)){
3088 MPI_Allgather(&aggRFA[0],numSuper*
sizeof(Int),MPI_BYTE,
3089 &globalAggRFA[0],numSuper*
sizeof(Int),MPI_BYTE,
3091 MPI_Allreduce(MPI_IN_PLACE,&SeedRFA[0],numSuper,MPI_DOUBLE,MPI_MAX,grid_->colComm);
3093 for( Int ksup = 0; ksup < numSuper; ksup++ ){
3094 #if ( _DEBUGlevel_ >= 1 )
3095 statusOFS<<
"4 "<<ksup<<std::endl;
3097 if( isRecvFromAbove_(ksup) || CountSendToBelow(ksup)>0 ){
3098 vector<Int> tree_ranks;
3101 for( Int iProcRow = 0; iProcRow < grid_->numProcRow; iProcRow++ ){
3102 if( iProcRow !=
PROW( ksup, grid_ ) ){
3103 Int isRFA = globalAggRFA[iProcRow*numSuper + ksup];
3110 tree_ranks.reserve(countRFA+1);
3111 tree_ranks.push_back(
PROW(ksup,grid_));
3113 for( Int iProcRow = 0; iProcRow < grid_->numProcRow; iProcRow++ ){
3114 Int isRFA = globalAggRFA[iProcRow*numSuper + ksup];
3116 if( iProcRow !=
PROW( ksup, grid_ ) ){
3117 tree_ranks.push_back(iProcRow);
3126 TreeBcast2<T> * & BcastUTree2 = fwdToBelowTree2_[ksup];
3127 if(BcastUTree2!=NULL){
delete BcastUTree2; BcastUTree2=NULL;}
3128 BcastUTree2 = TreeBcast2<T>::Create(this->grid_->colComm,&tree_ranks[0],tree_ranks.size(),msgSize,SeedRFA[ksup]);
3130 #ifdef COMM_PROFILE_BCAST
3131 BcastUTree2->SetGlobalComm(grid_->comm);
3134 TreeBcast * & BcastUTree = fwdToBelowTree_[ksup];
3135 if(BcastUTree!=NULL){
delete BcastUTree; BcastUTree=NULL;}
3136 BcastUTree = TreeBcast::Create(this->grid_->colComm,&tree_ranks[0],tree_ranks.size(),msgSize,SeedRFA[ksup]);
3137 #ifdef COMM_PROFILE_BCAST
3138 BcastUTree->SetGlobalComm(grid_->comm);
3146 TIMER_STOP(BUILD_BCAST_TREES);
3147 TIMER_START(BUILD_REDUCE_D_TREE);
3149 vector<double> SeedSTD(numSuper,0.0);
3150 vector<Int> aggSTD(numSuper);
3151 vector<Int> globalAggSTD(numSuper*grid_->numProcRow);
3152 for( Int ksup = 0; ksup < numSuper; ksup++ ){
3155 aggSTD[ksup]=totalSize;
3156 SeedSTD[ksup]=rand();
3158 else if(isSendToDiagonal_(ksup)){
3168 MPI_Allgather(&aggSTD[0],numSuper*
sizeof(Int),MPI_BYTE,
3169 &globalAggSTD[0],numSuper*
sizeof(Int),MPI_BYTE,
3171 MPI_Allreduce(MPI_IN_PLACE,&SeedSTD[0],numSuper,MPI_DOUBLE,MPI_MAX,grid_->colComm);
3174 for( Int ksup = 0; ksup < numSuper; ksup++ ){
3175 #if ( _DEBUGlevel_ >= 1 )
3176 statusOFS<<
"5 "<<ksup<<std::endl;
3178 if(
MYCOL( grid_ ) ==
PCOL(ksup, grid_) ){
3180 Int amISTD = globalAggSTD[
MYROW(grid_)*numSuper + ksup];
3186 vector<Int> tree_ranks;
3190 for( Int iProcRow = 0; iProcRow < grid_->numProcRow; iProcRow++ ){
3191 Int isSTD = globalAggSTD[iProcRow*numSuper + ksup];
3193 if( iProcRow !=
PROW( ksup, grid_ ) ){
3194 set_ranks.insert(iProcRow);
3203 tree_ranks.push_back(
PROW(ksup,grid_));
3204 tree_ranks.insert(tree_ranks.end(),set_ranks.begin(),set_ranks.end());
3207 for( Int iProcRow = 0; iProcRow < grid_->numProcRow; iProcRow++ ){
3208 if( iProcRow !=
PROW( ksup, grid_ ) ){
3209 Int isSTD = globalAggSTD[iProcRow*numSuper + ksup];
3216 tree_ranks.reserve(countSTD+1);
3217 tree_ranks.push_back(
PROW(ksup,grid_));
3219 for( Int iProcRow = 0; iProcRow < grid_->numProcRow; iProcRow++ ){
3220 Int isSTD = globalAggSTD[iProcRow*numSuper + ksup];
3222 if( iProcRow !=
PROW( ksup, grid_ ) ){
3223 tree_ranks.push_back(iProcRow);
3236 if(redDTree!=NULL){
delete redDTree; redDTree=NULL;}
3237 redDTree =
TreeReduce<T>::Create(this->grid_->colComm,&tree_ranks[0],tree_ranks.size(),msgSize,SeedSTD[ksup]);
3239 redDTree->SetGlobalComm(grid_->comm);
3245 TIMER_STOP(BUILD_REDUCE_D_TREE);
3249 TIMER_START(BUILD_REDUCE_L_TREE);
3251 vector<double> SeedRTL(numSuper,0.0);
3252 vector<Int> aggRTL(numSuper);
3253 vector<Int> globalAggRTL(numSuper*grid_->numProcCol);
3254 for( Int ksup = 0; ksup < numSuper ; ksup++ ){
3256 std::vector<LBlock<T> >& Lcol = this->L(
LBj(ksup, grid_) );
3262 Int numRowLUpdateBuf = 0;
3263 if(
MYROW( grid_ ) !=
PROW( ksup, grid_ ) ){
3264 for( Int ib = 0; ib < Lcol.size(); ib++ ){
3265 numRowLUpdateBuf += Lcol[ib].numRow;
3269 for( Int ib = 1; ib < Lcol.size(); ib++ ){
3270 numRowLUpdateBuf += Lcol[ib].numRow;
3275 totalSize = numRowLUpdateBuf*
SuperSize( ksup, super_ )*
sizeof(T);
3277 aggRTL[ksup]=totalSize;
3279 SeedRTL[ksup]=rand();
3281 else if(isRecvFromLeft_(ksup)){
3289 MPI_Allgather(&aggRTL[0],numSuper*
sizeof(Int),MPI_BYTE,
3290 &globalAggRTL[0],numSuper*
sizeof(Int),MPI_BYTE,
3292 MPI_Allreduce(MPI_IN_PLACE,&SeedRTL[0],numSuper,MPI_DOUBLE,MPI_MAX,grid_->rowComm);
3295 for( Int ksup = 0; ksup < numSuper ; ksup++ ){
3296 #if ( _DEBUGlevel_ >= 1 )
3297 statusOFS<<
"6 "<<ksup<<std::endl;
3299 if( isRecvFromLeft_(ksup) || CountSendToRight(ksup)>0 ){
3300 vector<Int> tree_ranks;
3304 for( Int iProcCol = 0; iProcCol < grid_->numProcCol; iProcCol++ ){
3305 Int isRTL = globalAggRTL[iProcCol*numSuper + ksup];
3307 if( iProcCol !=
PCOL( ksup, grid_ ) ){
3308 set_ranks.insert(iProcCol);
3315 tree_ranks.push_back(
PCOL(ksup,grid_));
3316 tree_ranks.insert(tree_ranks.end(),set_ranks.begin(),set_ranks.end());
3319 for( Int iProcCol = 0; iProcCol < grid_->numProcCol; iProcCol++ ){
3320 if( iProcCol !=
PCOL( ksup, grid_ ) ){
3321 Int isRTL = globalAggRTL[iProcCol*numSuper + ksup];
3328 tree_ranks.reserve(countRTL+1);
3329 tree_ranks.push_back(
PCOL(ksup,grid_));
3331 for( Int iProcCol = 0; iProcCol < grid_->numProcCol; iProcCol++ ){
3332 Int isRTL = globalAggRTL[iProcCol*numSuper + ksup];
3334 if( iProcCol !=
PCOL( ksup, grid_ ) ){
3335 tree_ranks.push_back(iProcCol);
3347 if(redLTree!=NULL){
delete redLTree; redLTree=NULL;}
3348 redLTree =
TreeReduce<T>::Create(this->grid_->rowComm,&tree_ranks[0],tree_ranks.size(),msgSize,SeedRTL[ksup]);
3350 redLTree->SetGlobalComm(grid_->comm);
3355 TIMER_STOP(BUILD_REDUCE_L_TREE);
3363 #if ( _DEBUGlevel_ >= 1 )
3364 statusOFS << std::endl <<
"isSendToBelow:" << std::endl;
3365 for(
int j = 0;j< isSendToBelow_.n();j++){
3366 statusOFS <<
"["<<j<<
"] ";
3367 for(
int i =0; i < isSendToBelow_.m();i++){
3368 statusOFS<< isSendToBelow_(i,j) <<
" ";
3370 statusOFS<<std::endl;
3373 statusOFS << std::endl <<
"isRecvFromAbove:" << std::endl;
3374 for(
int j = 0;j< isRecvFromAbove_.m();j++){
3375 statusOFS <<
"["<<j<<
"] "<< isRecvFromAbove_(j)<<std::endl;
3378 #if ( _DEBUGlevel_ >= 1 )
3379 statusOFS << std::endl <<
"isSendToRight:" << std::endl;
3380 for(
int j = 0;j< isSendToRight_.n();j++){
3381 statusOFS <<
"["<<j<<
"] ";
3382 for(
int i =0; i < isSendToRight_.m();i++){
3383 statusOFS<< isSendToRight_(i,j) <<
" ";
3385 statusOFS<<std::endl;
3388 statusOFS << std::endl <<
"isRecvFromLeft:" << std::endl;
3389 for(
int j = 0;j< isRecvFromLeft_.m();j++){
3390 statusOFS <<
"["<<j<<
"] "<< isRecvFromLeft_(j)<<std::endl;
3393 statusOFS << std::endl <<
"isRecvFromBelow:" << std::endl;
3394 for(
int j = 0;j< isRecvFromBelow_.n();j++){
3395 statusOFS <<
"["<<j<<
"] ";
3396 for(
int i =0; i < isRecvFromBelow_.m();i++){
3397 statusOFS<< isRecvFromBelow_(i,j) <<
" ";
3399 statusOFS<<std::endl;
3409 TIMER_START(STCD_RFCD);
3412 for( Int ksup = 0; ksup < numSuper - 1; ksup++ ){
3413 if(
MYCOL( grid_ ) ==
PCOL( ksup, grid_ ) ){
3414 for( std::set<Int>::iterator si = localColBlockRowIdx[
LBj( ksup, grid_ )].begin();
3415 si != localColBlockRowIdx[
LBj( ksup, grid_ )].end(); si++ ){
3417 Int isupProcRow =
PROW( isup, grid_ );
3418 Int isupProcCol =
PCOL( isup, grid_ );
3419 if( isup > ksup &&
MYROW( grid_ ) == isupProcRow ){
3420 isSendToCrossDiagonal_(grid_->numProcCol, ksup ) =
true;
3421 isSendToCrossDiagonal_(isupProcCol, ksup ) =
true;
3427 for( Int ksup = 0; ksup < numSuper - 1; ksup++ ){
3428 if(
MYROW( grid_ ) ==
PROW( ksup, grid_ ) ){
3429 for( std::set<Int>::iterator si = localRowBlockColIdx[
LBi(ksup, grid_) ].begin();
3430 si != localRowBlockColIdx[
LBi(ksup, grid_) ].end(); si++ ){
3432 Int jsupProcCol =
PCOL( jsup, grid_ );
3433 Int jsupProcRow =
PROW( jsup, grid_ );
3434 if( jsup > ksup &&
MYCOL(grid_) == jsupProcCol ){
3435 isRecvFromCrossDiagonal_(grid_->numProcRow, ksup ) =
true;
3436 isRecvFromCrossDiagonal_(jsupProcRow, ksup ) =
true;
3441 #if ( _DEBUGlevel_ >= 1 )
3442 statusOFS << std::endl <<
"isSendToCrossDiagonal:" << std::endl;
3443 for(
int j =0; j < isSendToCrossDiagonal_.n();j++){
3444 if(isSendToCrossDiagonal_(grid_->numProcCol,j)){
3445 statusOFS <<
"["<<j<<
"] ";
3446 for(
int i =0; i < isSendToCrossDiagonal_.m()-1;i++){
3447 if(isSendToCrossDiagonal_(i,j))
3449 statusOFS<<
PNUM(
PROW(j,grid_),i,grid_)<<
" ";
3452 statusOFS<<std::endl;
3456 statusOFS << std::endl <<
"isRecvFromCrossDiagonal:" << std::endl;
3457 for(
int j =0; j < isRecvFromCrossDiagonal_.n();j++){
3458 if(isRecvFromCrossDiagonal_(grid_->numProcRow,j)){
3459 statusOFS <<
"["<<j<<
"] ";
3460 for(
int i =0; i < isRecvFromCrossDiagonal_.m()-1;i++){
3461 if(isRecvFromCrossDiagonal_(i,j))
3463 statusOFS<<
PNUM(i,
PCOL(j,grid_),grid_)<<
" ";
3466 statusOFS<<std::endl;
3474 TIMER_STOP(STCD_RFCD);
3478 GetWorkSet(snodeEtree,this->WorkingSet());
3483 template<
typename T>
3487 if(optionsFact_->Symmetric == 0){
3488 ErrorHandling(
"The matrix is not symmetric, this routine can't handle it !" );
3494 statOFS<<
"m"<<
"\t"<<
"n"<<
"\t"<<
"z"<<std::endl;
3495 for(std::deque<int >::iterator it = gemm_stat.begin(); it!=gemm_stat.end(); it+=3){
3496 statOFS<<*it<<
"\t"<<*(it+1)<<
"\t"<<*(it+2)<<std::endl;
3500 #if defined(COMM_PROFILE) || defined(COMM_PROFILE_BCAST)
3502 commOFS<<HEADER_COMM<<std::endl;
3503 for(std::deque<int>::iterator it = comm_stat.begin(); it!=comm_stat.end(); it+=4){
3504 commOFS<<LINE_COMM(it)<<std::endl;
3509 for(
int i = 0 ; i< fwdToBelowTree_.size();++i){
3510 if(fwdToBelowTree_[i]!=NULL){
3511 fwdToBelowTree_[i]->Reset();
3514 for(
int i = 0 ; i< fwdToRightTree_.size();++i){
3515 if(fwdToRightTree_[i]!=NULL){
3516 fwdToRightTree_[i]->Reset();
3519 for(
int i = 0 ; i< redToLeftTree_.size();++i){
3520 if(redToLeftTree_[i]!=NULL){
3521 redToLeftTree_[i]->Reset();
3524 for(
int i = 0 ; i< redToAboveTree_.size();++i){
3525 if(redToAboveTree_[i]!=NULL){
3526 redToAboveTree_[i]->Reset();
3536 template<
typename T>
3539 TIMER_START(SelInv_P2p);
3542 statusOFS<<
"maxTag value: "<<maxTag_<<std::endl;
3547 std::vector<std::vector<Int> > & superList = this->WorkingSet();
3548 Int numSteps = superList.size();
3551 for (Int lidx=0; lidx<numSteps ; lidx++){
3552 Int stepSuper = superList[lidx].size();
3554 SelInvIntra_P2p(lidx,rank);
3555 #if ( _DEBUGlevel_ >= 1 )
3556 statusOFS<<
"OUT "<<lidx<<
"/"<<numSteps<<
" "<<limIndex_<<std::endl;
3561 if (options_->maxPipelineDepth!=-1)
3564 MPI_Barrier(grid_->comm);
3573 if(lidx>0 && (rank-1)%limIndex_==0){
3577 MPI_Barrier(grid_->comm);
3584 MPI_Barrier(grid_->comm);
3586 TIMER_STOP(SelInv_P2p);
3593 template<
typename T>
3599 #if ( _DEBUGlevel_ >= 1 )
3600 statusOFS << std::endl <<
"L(i,k) <- L(i,k) * L(k,k)^{-1}" << std::endl << std::endl;
3602 for( Int ksup = 0; ksup < numSuper; ksup++ ){
3603 if(
MYCOL( grid_ ) ==
PCOL( ksup, grid_ ) ){
3606 std::vector<LBlock<T> >& Lcol = this->L(
LBj( ksup, grid_ ) );
3607 if(
MYROW( grid_ ) ==
PROW( ksup, grid_ ) ){
3608 nzvalLDiag = Lcol[0].nzval;
3609 if( nzvalLDiag.m() !=
SuperSize(ksup, super_) ||
3610 nzvalLDiag.n() !=
SuperSize(ksup, super_) ){
3611 ErrorHandling(
"The size of the diagonal block of L is wrong." );
3618 MPI_Bcast( (
void*)nzvalLDiag.Data(), nzvalLDiag.ByteSize(),
3619 MPI_BYTE,
PROW( ksup, grid_ ), grid_->colComm );
3622 for( Int ib = 0; ib < Lcol.size(); ib++ ){
3625 #if ( _DEBUGlevel_ >= 2 )
3627 if(
LBj( ksup, grid_ ) == 0 ){
3628 statusOFS <<
"Diag L(" << ksup <<
", " << ksup <<
"): " << nzvalLDiag << std::endl;
3629 statusOFS <<
"Before solve L(" << LB.
blockIdx <<
", " << ksup <<
"): " << LB.
nzval << std::endl;
3632 blas::Trsm(
'R',
'L',
'N',
'U', LB.
numRow, LB.
numCol, ONE<T>(),
3634 #if ( _DEBUGlevel_ >= 2 )
3636 if(
LBj( ksup, grid_ ) == 0 ){
3637 statusOFS <<
"After solve L(" << LB.
blockIdx <<
", " << ksup <<
"): " << LB.
nzval << std::endl;
3648 #if ( _DEBUGlevel_ >= 1 )
3649 statusOFS << std::endl <<
"U(k,i) <- L(i,k)" << std::endl << std::endl;
3652 for( Int ksup = 0; ksup < numSuper; ksup++ ){
3653 Int ksupProcRow =
PROW( ksup, grid_ );
3654 Int ksupProcCol =
PCOL( ksup, grid_ );
3656 Int sendCount = CountSendToCrossDiagonal(ksup);
3657 Int recvCount = CountRecvFromCrossDiagonal(ksup);
3659 std::vector<MPI_Request > arrMpiReqsSend(sendCount, MPI_REQUEST_NULL );
3660 std::vector<MPI_Request > arrMpiReqsSizeSend(sendCount, MPI_REQUEST_NULL );
3661 std::vector<std::vector<char> > arrSstrLcolSend(sendCount);
3662 std::vector<Int > arrSstrLcolSizeSend(sendCount);
3664 std::vector<MPI_Request > arrMpiReqsRecv(recvCount, MPI_REQUEST_NULL );
3665 std::vector<MPI_Request > arrMpiReqsSizeRecv(recvCount, MPI_REQUEST_NULL );
3666 std::vector<std::vector<char> > arrSstrLcolRecv(recvCount);
3667 std::vector<Int > arrSstrLcolSizeRecv(recvCount);
3672 if( isSendToCrossDiagonal_(grid_->numProcCol,ksup) ){
3673 #if ( _DEBUGlevel_ >= 1 )
3674 statusOFS<<
"["<<ksup<<
"] P"<<
MYPROC(grid_)<<
" should send to "<<CountSendToCrossDiagonal(ksup)<<
" processors"<<std::endl;
3678 for(Int dstCol = 0; dstCol<grid_->numProcCol; dstCol++){
3679 if(isSendToCrossDiagonal_(dstCol,ksup) ){
3680 Int dst =
PNUM(
PROW(ksup,grid_),dstCol,grid_);
3683 std::stringstream sstm;
3684 std::vector<char> & sstrLcolSend = arrSstrLcolSend[sendIdx];
3685 Int & sstrSize = arrSstrLcolSizeSend[sendIdx];
3686 MPI_Request & mpiReqSend = arrMpiReqsSend[sendIdx];
3687 MPI_Request & mpiReqSizeSend = arrMpiReqsSizeSend[sendIdx];
3689 std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
3690 std::vector<LBlock<T> >& Lcol = this->L(
LBj(ksup, grid_) );
3695 if(
MYROW( grid_ ) ==
PROW( ksup, grid_ ) ){
3696 for( Int ib = 1; ib < Lcol.size(); ib++ ){
3697 if( Lcol[ib].blockIdx > ksup && (Lcol[ib].blockIdx % grid_->numProcCol) == dstCol ){
3704 for( Int ib = 0; ib < Lcol.size(); ib++ ){
3705 if( Lcol[ib].blockIdx > ksup && (Lcol[ib].blockIdx % grid_->numProcCol) == dstCol ){
3711 serialize( (Int)count, sstm, NO_MASK );
3713 for( Int ib = 0; ib < Lcol.size(); ib++ ){
3714 if( Lcol[ib].blockIdx > ksup && (Lcol[ib].blockIdx % grid_->numProcCol) == dstCol ){
3715 #if ( _DEBUGlevel_ >= 1 )
3716 statusOFS<<
"["<<ksup<<
"] SEND contains "<<Lcol[ib].blockIdx<<
" which corresponds to "<<
GBj(ib,grid_)<<std::endl;
3718 serialize( Lcol[ib], sstm, mask );
3722 sstrLcolSend.resize( Size(sstm) );
3723 sstm.read( &sstrLcolSend[0], sstrLcolSend.size() );
3724 sstrSize = sstrLcolSend.size();
3731 #if ( _DEBUGlevel_ >= 1 )
3732 statusOFS<<
"["<<ksup<<
"] P"<<
MYPROC(grid_)<<
" ("<<
MYROW(grid_)<<
","<<
MYCOL(grid_)<<
") ---> LBj("<<ksup<<
")="<<
LBj(ksup,grid_)<<
" ---> P"<<dst<<
" ("<<ksupProcRow<<
","<<dstCol<<
")"<<std::endl;
3734 MPI_Isend( &sstrSize,
sizeof(sstrSize), MPI_BYTE, dst, SELINV_TAG_L_SIZE_CD, grid_->comm, &mpiReqSizeSend );
3735 MPI_Isend( (
void*)&sstrLcolSend[0], sstrSize, MPI_BYTE, dst, SELINV_TAG_L_CONTENT_CD, grid_->comm, &mpiReqSend );
3738 PROFILE_COMM(
MYPROC(this->grid_),dst,SELINV_TAG_L_SIZE_CD,
sizeof(sstrSize));
3739 PROFILE_COMM(
MYPROC(this->grid_),dst,SELINV_TAG_L_CONTENT_CD,sstrSize);
3754 if( isRecvFromCrossDiagonal_(grid_->numProcRow,ksup) ){
3757 #if ( _DEBUGlevel_ >= 1 )
3758 statusOFS<<
"["<<ksup<<
"] P"<<
MYPROC(grid_)<<
" should receive from "<<CountRecvFromCrossDiagonal(ksup)<<
" processors"<<std::endl;
3762 std::vector<UBlock<T> >& Urow = this->U(
LBi( ksup, grid_ ) );
3763 std::vector<bool> isBlockFound(Urow.size(),
false);
3768 for(Int srcRow = 0; srcRow<grid_->numProcRow; srcRow++){
3769 if(isRecvFromCrossDiagonal_(srcRow,ksup) ){
3770 std::vector<LBlock<T> > LcolRecv;
3771 Int src =
PNUM(srcRow,
PCOL(ksup,grid_),grid_);
3773 MPI_Request & mpiReqSizeRecv = arrMpiReqsSizeRecv[recvIdx];
3774 Int & sstrSize = arrSstrLcolSizeRecv[recvIdx];
3776 MPI_Irecv( &sstrSize, 1, MPI_INT, src, SELINV_TAG_L_SIZE_CD, grid_->comm, &mpiReqSizeRecv );
3783 mpi::Waitall(arrMpiReqsSizeRecv);
3789 for(Int srcRow = 0; srcRow<grid_->numProcRow; srcRow++){
3790 if(isRecvFromCrossDiagonal_(srcRow,ksup) ){
3791 std::vector<LBlock<T> > LcolRecv;
3792 Int src =
PNUM(srcRow,
PCOL(ksup,grid_),grid_);
3794 MPI_Request & mpiReqRecv = arrMpiReqsRecv[recvIdx];
3795 Int & sstrSize = arrSstrLcolSizeRecv[recvIdx];
3796 std::vector<char> & sstrLcolRecv = arrSstrLcolRecv[recvIdx];
3797 sstrLcolRecv.resize(sstrSize);
3799 MPI_Irecv( &sstrLcolRecv[0], sstrSize, MPI_BYTE, src, SELINV_TAG_L_CONTENT_CD, grid_->comm, &mpiReqRecv );
3806 mpi::Waitall(arrMpiReqsRecv);
3812 for(Int srcRow = 0; srcRow<grid_->numProcRow; srcRow++){
3813 if(isRecvFromCrossDiagonal_(srcRow,ksup) ){
3814 std::vector<LBlock<T> > LcolRecv;
3815 Int src =
PNUM(srcRow,
PCOL(ksup,grid_),grid_);
3818 Int & sstrSize = arrSstrLcolSizeRecv[recvIdx];
3819 std::vector<char> & sstrLcolRecv = arrSstrLcolRecv[recvIdx];
3820 std::stringstream sstm;
3822 #if ( _DEBUGlevel_ >= 1 )
3823 statusOFS<<
"["<<ksup<<
"] P"<<
MYPROC(grid_)<<
" ("<<
MYROW(grid_)<<
","<<
MYCOL(grid_)<<
") <--- LBj("<<ksup<<
") <--- P"<<src<<
" ("<<srcRow<<
","<<ksupProcCol<<
")"<<std::endl;
3827 sstm.write( &sstrLcolRecv[0], sstrSize );
3831 std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
3832 deserialize( numLBlock, sstm, NO_MASK );
3833 LcolRecv.resize(numLBlock);
3834 for( Int ib = 0; ib < numLBlock; ib++ ){
3835 deserialize( LcolRecv[ib], sstm, mask );
3836 #if ( _DEBUGlevel_ >= 1 )
3837 statusOFS<<
"["<<ksup<<
"] RECV contains "<<LcolRecv[ib].blockIdx<<
" which corresponds to "<< ib * grid_->numProcRow + srcRow;
3839 statusOFS<<
" L is on row "<< srcRow <<
" whereas U is on col "<< LcolRecv[ib].blockIdx % grid_->numProcCol <<std::endl;
3849 std::vector<LBlock<T> >& Lcol = this->L(
LBj( ksup, grid_ ) );
3850 if(
MYROW( grid_ ) !=
PROW( ksup, grid_ ) ){
3851 LcolRecv.resize( Lcol.size() );
3852 for( Int ib = 0; ib < Lcol.size(); ib++ ){
3853 LcolRecv[ib] = Lcol[ib];
3857 LcolRecv.resize( Lcol.size() - 1 );
3858 for( Int ib = 0; ib < Lcol.size() - 1; ib++ ){
3859 LcolRecv[ib] = Lcol[ib+1];
3866 for( Int ib = 0; ib < LcolRecv.size(); ib++ ){
3869 ErrorHandling(
"LcolRecv contains the wrong blocks." );
3871 for( Int jb = 0; jb < Urow.size(); jb++ ){
3876 std::ostringstream msg;
3877 msg <<
"LB(" << LB.
blockIdx <<
", " << ksup <<
") and UB("
3878 << ksup <<
", " << UB.
blockIdx <<
") do not share the same size." << std::endl
3879 <<
"LB: " << LB.
numRow <<
" x " << LB.
numCol << std::endl
3880 <<
"UB: " << UB.
numRow <<
" x " << UB.
numCol << std::endl;
3881 ErrorHandling( msg.str().c_str() );
3890 #if ( _DEBUGlevel_ >= 1 )
3891 statusOFS<<
"["<<ksup<<
"] USING "<<LB.
blockIdx<< std::endl;
3893 isBlockFound[jb] =
true;
3901 for( Int jb = 0; jb < Urow.size(); jb++ ){
3903 if( !isBlockFound[jb] ){
3904 ErrorHandling(
"UBlock cannot find its update. Something is seriously wrong." );
3914 mpi::Waitall(arrMpiReqsSizeSend);
3915 mpi::Waitall(arrMpiReqsSend);
3921 #if ( _DEBUGlevel_ >= 1 )
3922 statusOFS << std::endl <<
"L(i,i) <- [L(k,k) * U(k,k)]^{-1}" << std::endl << std::endl;
3925 for( Int ksup = 0; ksup < numSuper; ksup++ ){
3926 if(
MYROW( grid_ ) ==
PROW( ksup, grid_ ) &&
3927 MYCOL( grid_ ) ==
PCOL( ksup, grid_ ) ){
3931 for(Int i = 0; i <
SuperSize( ksup, super_ ); i++){
3935 #if ( _DEBUGlevel_ >= 2 )
3937 statusOFS <<
"Factorized A (" << ksup <<
", " << ksup <<
"): " << LB.
nzval << std::endl;
3940 SuperSize( ksup, super_ ), ipiv.Data() );
3943 Symmetrize( LB.
nzval );
3945 #if ( _DEBUGlevel_ >= 2 )
3947 statusOFS <<
"Inversed A (" << ksup <<
", " << ksup <<
"): " << LB.
nzval << std::endl;
3960 template<
typename T>
3965 Int numCol = this->
NumCol();
3967 const IntNumVec& permInv = super_->permInv;
3972 pPerm_r = &super_->perm_r;
3973 pPermInv_r = &super_->permInv_r;
3976 const IntNumVec& permInv_r = *pPermInv_r;
3982 diag.Resize( numCol );
3985 for( Int orow = 0; orow < numCol; orow++){
3987 Int row = perm[ orow ];
3989 Int col = perm[ perm_r[ orow] ];
3991 Int blockColIdx =
BlockIdx( col, super_ );
3992 Int blockRowIdx =
BlockIdx( row, super_ );
3996 if(
MYROW( grid_ ) ==
PROW( blockRowIdx, grid_ ) &&
3997 MYCOL( grid_ ) ==
PCOL( blockColIdx, grid_ ) ){
3999 bool isFound =
false;
4001 if( blockColIdx <= blockRowIdx ){
4004 std::vector<LBlock<T> >& Lcol = this->L(
LBj( blockColIdx, grid_ ) );
4006 for( Int ib = 0; ib < Lcol.size(); ib++ ){
4007 #if ( _DEBUGlevel_ >= 1 )
4008 statusOFS <<
"blockRowIdx = " << blockRowIdx <<
", Lcol[ib].blockIdx = " << Lcol[ib].blockIdx <<
", blockColIdx = " << blockColIdx << std::endl;
4010 if( Lcol[ib].blockIdx == blockRowIdx ){
4012 for(
int iloc = 0; iloc < Lcol[ib].numRow; iloc++ ){
4013 if( rows[iloc] == row ){
4016 diagLocal[ orow ] = Lcol[ib].nzval( iloc, jloc );
4024 if( isFound ==
true )
break;
4030 std::vector<UBlock<T> >& Urow = this->U(
LBi( blockRowIdx, grid_ ) );
4032 for( Int jb = 0; jb < Urow.size(); jb++ ){
4033 if( Urow[jb].blockIdx == blockColIdx ){
4035 for(
int jloc = 0; jloc < Urow[jb].numCol; jloc++ ){
4036 if( cols[jloc] == col ){
4039 diagLocal[ orow ] = Urow[jb].nzval( iloc, jloc );
4046 if( isFound ==
true )
break;
4051 if( isFound ==
false ){
4052 statusOFS <<
"In the permutated order, (" << row <<
", " << col <<
4053 ") is not found in PMatrix." << std::endl;
4054 diagLocal[orow] = ZERO<T>();
4074 mpi::Allreduce( diagLocal.Data(), diag.Data(), numCol, MPI_SUM, grid_->comm );
4082 template<
typename T>
4085 #if ( _DEBUGlevel_ >= 1 )
4086 statusOFS << std::endl <<
"Converting PMatrix to DistSparseMatrix." << std::endl;
4088 Int mpirank = grid_->mpirank;
4089 Int mpisize = grid_->mpisize;
4091 std::vector<Int> rowSend( mpisize );
4092 std::vector<Int> colSend( mpisize );
4093 std::vector<T> valSend( mpisize );
4094 std::vector<Int> sizeSend( mpisize, 0 );
4095 std::vector<Int> displsSend( mpisize, 0 );
4097 std::vector<Int> rowRecv( mpisize );
4098 std::vector<Int> colRecv( mpisize );
4099 std::vector<T> valRecv( mpisize );
4100 std::vector<Int> sizeRecv( mpisize, 0 );
4101 std::vector<Int> displsRecv( mpisize, 0 );
4105 const IntNumVec& permInv = super_->permInv;
4110 pPerm_r = &super_->perm_r;
4111 pPermInv_r = &super_->permInv_r;
4114 const IntNumVec& permInv_r = *pPermInv_r;
4121 Int numColFirst = this->
NumCol() / mpisize;
4124 for( Int ksup = 0; ksup < numSuper; ksup++ ){
4126 if(
MYCOL( grid_ ) ==
PCOL( ksup, grid_ ) ){
4127 std::vector<LBlock<T> >& Lcol = this->L(
LBj( ksup, grid_ ) );
4128 for( Int ib = 0; ib < Lcol.size(); ib++ ){
4129 for( Int j = 0; j < Lcol[ib].numCol; j++ ){
4130 Int jcol = permInv[ permInv_r[ j +
FirstBlockCol( ksup, super_ ) ] ];
4131 Int dest = std::min( jcol / numColFirst, mpisize - 1 );
4132 sizeSend[dest] += Lcol[ib].numRow;
4138 if(
MYROW( grid_ ) ==
PROW( ksup, grid_ ) ){
4139 std::vector<UBlock<T> >& Urow = this->U(
LBi( ksup, grid_ ) );
4140 for( Int jb = 0; jb < Urow.size(); jb++ ){
4142 for( Int j = 0; j < cols.m(); j++ ){
4143 Int jcol = permInv[ permInv_r[ cols[j] ] ];
4144 Int dest = std::min( jcol / numColFirst, mpisize - 1 );
4145 sizeSend[dest] += Urow[jb].numRow;
4153 &sizeSend[0], 1, MPI_INT,
4154 &sizeRecv[0], 1, MPI_INT, grid_->comm );
4159 for( Int ip = 0; ip < mpisize; ip++ ){
4164 displsSend[ip] = displsSend[ip-1] + sizeSend[ip-1];
4171 displsRecv[ip] = displsRecv[ip-1] + sizeRecv[ip-1];
4174 Int sizeSendTotal = displsSend[mpisize-1] + sizeSend[mpisize-1];
4175 Int sizeRecvTotal = displsRecv[mpisize-1] + sizeRecv[mpisize-1];
4177 rowSend.resize( sizeSendTotal );
4178 colSend.resize( sizeSendTotal );
4179 valSend.resize( sizeSendTotal );
4181 rowRecv.resize( sizeRecvTotal );
4182 colRecv.resize( sizeRecvTotal );
4183 valRecv.resize( sizeRecvTotal );
4185 #if ( _DEBUGlevel_ >= 1 )
4186 statusOFS <<
"displsSend = " << displsSend << std::endl;
4187 statusOFS <<
"displsRecv = " << displsRecv << std::endl;
4191 std::vector<Int> cntSize( mpisize, 0 );
4193 for( Int ksup = 0; ksup < numSuper; ksup++ ){
4195 if(
MYCOL( grid_ ) ==
PCOL( ksup, grid_ ) ){
4196 std::vector<LBlock<T> >& Lcol = this->L(
LBj( ksup, grid_ ) );
4197 for( Int ib = 0; ib < Lcol.size(); ib++ ){
4200 for( Int j = 0; j < Lcol[ib].numCol; j++ ){
4201 Int jcol = permInv[ permInv_r[ j +
FirstBlockCol( ksup, super_ ) ] ];
4202 Int dest = std::min( jcol / numColFirst, mpisize - 1 );
4203 for( Int i = 0; i < rows.m(); i++ ){
4204 rowSend[displsSend[dest] + cntSize[dest]] = permInv[ rows[i] ];
4205 colSend[displsSend[dest] + cntSize[dest]] = jcol;
4206 valSend[displsSend[dest] + cntSize[dest]] = nzval( i, j );
4214 if(
MYROW( grid_ ) ==
PROW( ksup, grid_ ) ){
4215 std::vector<UBlock<T> >& Urow = this->U(
LBi( ksup, grid_ ) );
4216 for( Int jb = 0; jb < Urow.size(); jb++ ){
4219 for( Int j = 0; j < cols.m(); j++ ){
4220 Int jcol = permInv[ permInv_r[ cols[j] ] ];
4221 Int dest = std::min( jcol / numColFirst, mpisize - 1 );
4222 for( Int i = 0; i < Urow[jb].numRow; i++ ){
4223 rowSend[displsSend[dest] + cntSize[dest]] = permInv[ i +
FirstBlockCol( ksup, super_ ) ];
4224 colSend[displsSend[dest] + cntSize[dest]] = jcol;
4225 valSend[displsSend[dest] + cntSize[dest]] = nzval( i, j );
4234 for( Int ip = 0; ip < mpisize; ip++ ){
4235 if( cntSize[ip] != sizeSend[ip] )
4237 ErrorHandling(
"Sizes of the sending information do not match." );
4244 &rowSend[0], &sizeSend[0], &displsSend[0],
4245 &rowRecv[0], &sizeRecv[0], &displsRecv[0],
4248 &colSend[0], &sizeSend[0], &displsSend[0],
4249 &colRecv[0], &sizeRecv[0], &displsRecv[0],
4252 &valSend[0], &sizeSend[0], &displsSend[0],
4253 &valRecv[0], &sizeRecv[0], &displsRecv[0],
4256 #if ( _DEBUGlevel_ >= 1 )
4257 statusOFS <<
"Alltoallv communication finished." << std::endl;
4272 Int firstCol = mpirank * numColFirst;
4274 if( mpirank == mpisize-1 )
4275 numColLocal = this->
NumCol() - numColFirst * (mpisize-1);
4277 numColLocal = numColFirst;
4279 std::vector<std::vector<Int> > rows( numColLocal );
4280 std::vector<std::vector<T> > vals( numColLocal );
4282 for( Int ip = 0; ip < mpisize; ip++ ){
4283 Int* rowRecvCur = &rowRecv[displsRecv[ip]];
4284 Int* colRecvCur = &colRecv[displsRecv[ip]];
4285 T* valRecvCur = &valRecv[displsRecv[ip]];
4286 for( Int i = 0; i < sizeRecv[ip]; i++ ){
4287 rows[colRecvCur[i]-firstCol].push_back( rowRecvCur[i] );
4288 vals[colRecvCur[i]-firstCol].push_back( valRecvCur[i] );
4293 std::vector<std::vector<Int> > sortIndex( numColLocal );
4294 for( Int j = 0; j < numColLocal; j++ ){
4295 sortIndex[j].resize( rows[j].size() );
4296 for( Int i = 0; i < sortIndex[j].size(); i++ )
4297 sortIndex[j][i] = i;
4298 std::sort( sortIndex[j].begin(), sortIndex[j].end(),
4299 IndexComp<std::vector<Int>& > ( rows[j] ) );
4311 for( Int j = 0; j < numColLocal; j++ ){
4316 A.
comm = grid_->comm;
4318 #if ( _DEBUGlevel_ >= 1 )
4319 statusOFS <<
"nnzLocal = " << A.
nnzLocal << std::endl;
4320 statusOFS <<
"nnz = " << A.
Nnz() << std::endl;
4329 for( Int j = 0; j < numColLocal; j++ ){
4330 std::vector<Int>& rowsCur = rows[j];
4331 std::vector<Int>& sortIndexCur = sortIndex[j];
4332 std::vector<T>& valsCur = vals[j];
4333 for( Int i = 0; i < rows[j].size(); i++ ){
4335 *(rowPtr++) = rowsCur[sortIndexCur[i]] + 1;
4336 *(nzvalPtr++) = valsCur[sortIndexCur[i]];
4340 #if ( _DEBUGlevel_ >= 1 )
4341 statusOFS <<
"A.colptrLocal[end] = " << A.
colptrLocal(numColLocal) << std::endl;
4342 statusOFS <<
"A.rowindLocal.size() = " << A.
rowindLocal.m() << std::endl;
4354 template<
typename T>
4357 #if ( _DEBUGlevel_ >= 1 )
4358 statusOFS << std::endl <<
"Converting PMatrix to DistSparseMatrix (2nd format)." << std::endl;
4360 Int mpirank = grid_->mpirank;
4361 Int mpisize = grid_->mpisize;
4363 std::vector<Int> rowSend( mpisize );
4364 std::vector<Int> colSend( mpisize );
4365 std::vector<T> valSend( mpisize );
4366 std::vector<Int> sizeSend( mpisize, 0 );
4367 std::vector<Int> displsSend( mpisize, 0 );
4369 std::vector<Int> rowRecv( mpisize );
4370 std::vector<Int> colRecv( mpisize );
4371 std::vector<T> valRecv( mpisize );
4372 std::vector<Int> sizeRecv( mpisize, 0 );
4373 std::vector<Int> displsRecv( mpisize, 0 );
4376 const IntNumVec& permInv = super_->permInv;
4382 if(optionsFact_->RowPerm==
"NOROWPERM"){
4383 pPermInv_r = &super_->permInv;
4386 pPermInv_r = &super_->permInv_r;
4389 const IntNumVec& permInv_r = *pPermInv_r;
4401 Int numColFirst = this->
NumCol() / mpisize;
4404 for( Int ksup = 0; ksup < numSuper; ksup++ ){
4406 if(
MYCOL( grid_ ) ==
PCOL( ksup, grid_ ) ){
4407 std::vector<LBlock<T> >& Lcol = this->L(
LBj( ksup, grid_ ) );
4408 for( Int ib = 0; ib < Lcol.size(); ib++ ){
4409 for( Int j = 0; j < Lcol[ib].numCol; j++ ){
4411 Int dest = std::min( jcol / numColFirst, mpisize - 1 );
4412 sizeSend[dest] += Lcol[ib].numRow;
4418 if(
MYROW( grid_ ) ==
PROW( ksup, grid_ ) ){
4419 std::vector<UBlock<T> >& Urow = this->U(
LBi( ksup, grid_ ) );
4420 for( Int jb = 0; jb < Urow.size(); jb++ ){
4422 for( Int j = 0; j < cols.m(); j++ ){
4423 Int jcol = permInv( cols(j) );
4424 Int dest = std::min( jcol / numColFirst, mpisize - 1 );
4425 sizeSend[dest] += Urow[jb].numRow;
4433 &sizeSend[0], 1, MPI_INT,
4434 &sizeRecv[0], 1, MPI_INT, grid_->comm );
4439 for( Int ip = 0; ip < mpisize; ip++ ){
4444 displsSend[ip] = displsSend[ip-1] + sizeSend[ip-1];
4451 displsRecv[ip] = displsRecv[ip-1] + sizeRecv[ip-1];
4454 Int sizeSendTotal = displsSend[mpisize-1] + sizeSend[mpisize-1];
4455 Int sizeRecvTotal = displsRecv[mpisize-1] + sizeRecv[mpisize-1];
4457 rowSend.resize( sizeSendTotal );
4458 colSend.resize( sizeSendTotal );
4459 valSend.resize( sizeSendTotal );
4461 rowRecv.resize( sizeRecvTotal );
4462 colRecv.resize( sizeRecvTotal );
4463 valRecv.resize( sizeRecvTotal );
4465 #if ( _DEBUGlevel_ >= 1 )
4466 statusOFS <<
"displsSend = " << displsSend << std::endl;
4467 statusOFS <<
"displsRecv = " << displsRecv << std::endl;
4471 std::vector<Int> cntSize( mpisize, 0 );
4474 for( Int ksup = 0; ksup < numSuper; ksup++ ){
4476 if(
MYCOL( grid_ ) ==
PCOL( ksup, grid_ ) ){
4477 std::vector<LBlock<T> >& Lcol = this->L(
LBj( ksup, grid_ ) );
4478 for( Int ib = 0; ib < Lcol.size(); ib++ ){
4481 for( Int j = 0; j < Lcol[ib].numCol; j++ ){
4483 Int dest = std::min( jcol / numColFirst, mpisize - 1 );
4484 for( Int i = 0; i < rows.m(); i++ ){
4485 rowSend[displsSend[dest] + cntSize[dest]] = permInv_r( rows(i) );
4486 colSend[displsSend[dest] + cntSize[dest]] = jcol;
4487 valSend[displsSend[dest] + cntSize[dest]] = nzval( i, j );
4496 if(
MYROW( grid_ ) ==
PROW( ksup, grid_ ) ){
4497 std::vector<UBlock<T> >& Urow = this->U(
LBi( ksup, grid_ ) );
4498 for( Int jb = 0; jb < Urow.size(); jb++ ){
4501 for( Int j = 0; j < cols.m(); j++ ){
4502 Int jcol = permInv( cols(j) );
4503 Int dest = std::min( jcol / numColFirst, mpisize - 1 );
4504 for( Int i = 0; i < Urow[jb].numRow; i++ ){
4505 rowSend[displsSend[dest] + cntSize[dest]] =
4507 colSend[displsSend[dest] + cntSize[dest]] = jcol;
4508 valSend[displsSend[dest] + cntSize[dest]] = nzval( i, j );
4519 for( Int ip = 0; ip < mpisize; ip++ ){
4520 if( cntSize[ip] != sizeSend[ip] ){
4521 ErrorHandling(
"Sizes of the sending information do not match." );
4527 &rowSend[0], &sizeSend[0], &displsSend[0],
4528 &rowRecv[0], &sizeRecv[0], &displsRecv[0],
4531 &colSend[0], &sizeSend[0], &displsSend[0],
4532 &colRecv[0], &sizeRecv[0], &displsRecv[0],
4535 &valSend[0], &sizeSend[0], &displsSend[0],
4536 &valRecv[0], &sizeRecv[0], &displsRecv[0],
4539 #if ( _DEBUGlevel_ >= 1 )
4540 statusOFS <<
"Alltoallv communication finished." << std::endl;
4555 Int firstCol = mpirank * numColFirst;
4557 if( mpirank == mpisize-1 )
4558 numColLocal = this->
NumCol() - numColFirst * (mpisize-1);
4560 numColLocal = numColFirst;
4562 std::vector<std::vector<Int> > rows( numColLocal );
4563 std::vector<std::vector<T> > vals( numColLocal );
4565 for( Int ip = 0; ip < mpisize; ip++ ){
4566 Int* rowRecvCur = &rowRecv[displsRecv[ip]];
4567 Int* colRecvCur = &colRecv[displsRecv[ip]];
4568 T* valRecvCur = &valRecv[displsRecv[ip]];
4569 for( Int i = 0; i < sizeRecv[ip]; i++ ){
4570 rows[colRecvCur[i]-firstCol].push_back( rowRecvCur[i] );
4571 vals[colRecvCur[i]-firstCol].push_back( valRecvCur[i] );
4576 std::vector<std::vector<Int> > sortIndex( numColLocal );
4577 for( Int j = 0; j < numColLocal; j++ ){
4578 sortIndex[j].resize( rows[j].size() );
4579 for( Int i = 0; i < sortIndex[j].size(); i++ )
4580 sortIndex[j][i] = i;
4581 std::sort( sortIndex[j].begin(), sortIndex[j].end(),
4582 IndexComp<std::vector<Int>& > ( rows[j] ) );
4589 if( A.
size != this->NumCol() ){
4590 ErrorHandling(
"The DistSparseMatrix providing the pattern has a different size from PMatrix." );
4593 ErrorHandling(
"The DistSparseMatrix providing the pattern has a different number of local columns from PMatrix." );
4608 B.
comm = grid_->comm;
4612 for( Int j = 0; j < numColLocal; j++ ){
4613 std::vector<Int>& rowsCur = rows[j];
4614 std::vector<Int>& sortIndexCur = sortIndex[j];
4615 std::vector<T>& valsCur = vals[j];
4616 std::vector<Int> rowsCurSorted( rowsCur.size() );
4618 for( Int i = 0; i < rowsCurSorted.size(); i++ ){
4619 rowsCurSorted[i] = rowsCur[sortIndexCur[i]] + 1;
4623 std::vector<Int>::iterator it;
4626 it = std::lower_bound( rowsCurSorted.begin(), rowsCurSorted.end(),
4628 if( it == rowsCurSorted.end() ){
4630 *(nzvalPtr++) = ZERO<T>();
4634 *(nzvalPtr++) = valsCur[ sortIndexCur[it-rowsCurSorted.begin()] ];
4639 #if ( _DEBUGlevel_ >= 1 )
4640 statusOFS <<
"B.colptrLocal[end] = " << B.
colptrLocal(numColLocal) << std::endl;
4641 statusOFS <<
"B.rowindLocal.size() = " << B.
rowindLocal.m() << std::endl;
4655 template<
typename T>
4659 #if ( _DEBUGlevel_ >= 1 )
4660 statusOFS << std::endl <<
"Converting PMatrix to DistSparseMatrix (2nd format)." << std::endl;
4662 Int mpirank = grid_->mpirank;
4663 Int mpisize = grid_->mpisize;
4665 std::vector<Int> rowSend( mpisize );
4666 std::vector<Int> colSend( mpisize );
4667 std::vector<T> valSend( mpisize );
4668 std::vector<Int> sizeSend( mpisize, 0 );
4669 std::vector<Int> displsSend( mpisize, 0 );
4671 std::vector<Int> rowRecv( mpisize );
4672 std::vector<Int> colRecv( mpisize );
4673 std::vector<T> valRecv( mpisize );
4674 std::vector<Int> sizeRecv( mpisize, 0 );
4675 std::vector<Int> displsRecv( mpisize, 0 );
4679 const IntNumVec& permInv = super_->permInv;
4684 pPerm_r = &super_->perm_r;
4685 pPermInv_r = &super_->permInv_r;
4688 const IntNumVec& permInv_r = *pPermInv_r;
4692 Int numColFirst = this->
NumCol() / mpisize;
4693 Int firstCol = mpirank * numColFirst;
4695 if( mpirank == mpisize-1 )
4696 numColLocal = this->
NumCol() - numColFirst * (mpisize-1);
4698 numColLocal = numColFirst;
4707 for( Int j = 0; j < numColLocal; j++ ){
4708 Int ocol = firstCol + j;
4709 Int col = perm[ perm_r[ ocol] ];
4710 Int blockColIdx =
BlockIdx( col, super_ );
4711 Int procCol =
PCOL( blockColIdx, grid_ );
4712 for( Int i = colPtr[j] - 1; i < colPtr[j+1] - 1; i++ ){
4713 Int orow = rowPtr[i]-1;
4714 Int row = perm[ orow ];
4715 Int blockRowIdx =
BlockIdx( row, super_ );
4716 Int procRow =
PROW( blockRowIdx, grid_ );
4717 Int dest =
PNUM( procRow, procCol, grid_ );
4718 #if ( _DEBUGlevel_ >= 1 )
4719 statusOFS <<
"("<< orow<<
", "<<ocol<<
") == "<<
"("<< row<<
", "<<col<<
")"<< std::endl;
4720 statusOFS <<
"BlockIdx = " << blockRowIdx <<
", " <<blockColIdx << std::endl;
4721 statusOFS << procRow <<
", " << procCol <<
", "
4722 << dest << std::endl;
4730 &sizeSend[0], 1, MPI_INT,
4731 &sizeRecv[0], 1, MPI_INT, grid_->comm );
4733 #if ( _DEBUGlevel_ >= 1 )
4734 statusOFS << std::endl <<
"sizeSend: " << sizeSend << std::endl;
4735 statusOFS << std::endl <<
"sizeRecv: " << sizeRecv << std::endl;
4741 for( Int ip = 0; ip < mpisize; ip++ ){
4746 displsSend[ip] = displsSend[ip-1] + sizeSend[ip-1];
4753 displsRecv[ip] = displsRecv[ip-1] + sizeRecv[ip-1];
4757 Int sizeSendTotal = displsSend[mpisize-1] + sizeSend[mpisize-1];
4758 Int sizeRecvTotal = displsRecv[mpisize-1] + sizeRecv[mpisize-1];
4760 rowSend.resize( sizeSendTotal );
4761 colSend.resize( sizeSendTotal );
4762 valSend.resize( sizeSendTotal );
4764 rowRecv.resize( sizeRecvTotal );
4765 colRecv.resize( sizeRecvTotal );
4766 valRecv.resize( sizeRecvTotal );
4768 #if ( _DEBUGlevel_ >= 1 )
4769 statusOFS <<
"displsSend = " << displsSend << std::endl;
4770 statusOFS <<
"displsRecv = " << displsRecv << std::endl;
4774 std::vector<Int> cntSize( mpisize, 0 );
4779 for( Int j = 0; j < numColLocal; j++ ){
4781 Int ocol = firstCol + j;
4782 Int col = perm[ perm_r[ ocol] ];
4783 Int blockColIdx =
BlockIdx( col, super_ );
4784 Int procCol =
PCOL( blockColIdx, grid_ );
4785 for( Int i = colPtr[j] - 1; i < colPtr[j+1] - 1; i++ ){
4786 Int orow = rowPtr[i]-1;
4787 Int row = perm[ orow ];
4788 Int blockRowIdx =
BlockIdx( row, super_ );
4789 Int procRow =
PROW( blockRowIdx, grid_ );
4790 Int dest =
PNUM( procRow, procCol, grid_ );
4791 rowSend[displsSend[dest] + cntSize[dest]] = row;
4792 colSend[displsSend[dest] + cntSize[dest]] = col;
4799 for( Int ip = 0; ip < mpisize; ip++ ){
4800 if( cntSize[ip] != sizeSend[ip] ){
4801 ErrorHandling(
"Sizes of the sending information do not match." );
4807 &rowSend[0], &sizeSend[0], &displsSend[0],
4808 &rowRecv[0], &sizeRecv[0], &displsRecv[0],
4811 &colSend[0], &sizeSend[0], &displsSend[0],
4812 &colRecv[0], &sizeRecv[0], &displsRecv[0],
4815 #if ( _DEBUGlevel_ >= 1 )
4816 statusOFS <<
"Alltoallv communication of nonzero indices finished." << std::endl;
4820 #if ( _DEBUGlevel_ >= 1 )
4821 for( Int ip = 0; ip < mpisize; ip++ ){
4822 statusOFS <<
"rowSend[" << ip <<
"] = " << rowSend[ip] << std::endl;
4823 statusOFS <<
"rowRecv[" << ip <<
"] = " << rowRecv[ip] << std::endl;
4824 statusOFS <<
"colSend[" << ip <<
"] = " << colSend[ip] << std::endl;
4825 statusOFS <<
"colRecv[" << ip <<
"] = " << colRecv[ip] << std::endl;
4836 for( Int g = 0; g < sizeRecvTotal; g++ ){
4837 Int row = rowRecv[g];
4838 Int col = colRecv[g];
4840 Int blockRowIdx =
BlockIdx( row, super_ );
4841 Int blockColIdx =
BlockIdx( col, super_ );
4844 bool isFound =
false;
4846 if( blockColIdx <= blockRowIdx ){
4849 std::vector<LBlock<T> >& Lcol = this->L(
LBj( blockColIdx, grid_ ) );
4851 for( Int ib = 0; ib < Lcol.size(); ib++ ){
4852 #if ( _DEBUGlevel_ >= 1 )
4853 statusOFS <<
"blockRowIdx = " << blockRowIdx <<
", Lcol[ib].blockIdx = " << Lcol[ib].blockIdx <<
", blockColIdx = " << blockColIdx << std::endl;
4855 if( Lcol[ib].blockIdx == blockRowIdx ){
4857 for(
int iloc = 0; iloc < Lcol[ib].numRow; iloc++ ){
4858 if( rows[iloc] == row ){
4860 valRecv[g] = Lcol[ib].nzval( iloc, jloc );
4866 if( isFound ==
true )
break;
4872 std::vector<UBlock<T> >& Urow = this->U(
LBi( blockRowIdx, grid_ ) );
4874 for( Int jb = 0; jb < Urow.size(); jb++ ){
4875 if( Urow[jb].blockIdx == blockColIdx ){
4877 for(
int jloc = 0; jloc < Urow[jb].numCol; jloc++ ){
4878 if( cols[jloc] == col ){
4880 valRecv[g] = Urow[jb].nzval( iloc, jloc );
4886 if( isFound ==
true )
break;
4891 if( isFound ==
false ){
4892 statusOFS <<
"In the permutated order, (" << row <<
", " << col <<
4893 ") is not found in PMatrix." << std::endl;
4894 valRecv[g] = ZERO<T>();
4903 &valRecv[0], &sizeRecv[0], &displsRecv[0],
4904 &valSend[0], &sizeSend[0], &displsSend[0],
4907 #if ( _DEBUGlevel_ >= 1 )
4908 statusOFS <<
"Alltoallv communication of nonzero values finished." << std::endl;
4924 B.
comm = grid_->comm;
4926 for( Int i = 0; i < mpisize; i++ )
4933 for( Int j = 0; j < numColLocal; j++ ){
4934 Int ocol = firstCol + j;
4935 Int col = perm[ perm_r[ ocol] ];
4936 Int blockColIdx =
BlockIdx( col, super_ );
4937 Int procCol =
PCOL( blockColIdx, grid_ );
4938 for( Int i = colPtr[j] - 1; i < colPtr[j+1] - 1; i++ ){
4939 Int orow = rowPtr[i]-1;
4940 Int row = perm[ orow ];
4941 Int blockRowIdx =
BlockIdx( row, super_ );
4942 Int procRow =
PROW( blockRowIdx, grid_ );
4943 Int dest =
PNUM( procRow, procCol, grid_ );
4944 valPtr[i] = valSend[displsSend[dest] + cntSize[dest]];
4950 for( Int ip = 0; ip < mpisize; ip++ ){
4951 if( cntSize[ip] != sizeSend[ip] ){
4952 ErrorHandling(
"Sizes of the sending information do not match." );
4964 template<
typename T>
4969 for( Int ksup = 0; ksup < numSuper; ksup++ ){
4970 if(
MYCOL( grid_ ) ==
PCOL( ksup, grid_ ) ){
4971 std::vector<LBlock<T> >& Lcol = this->L(
LBj( ksup, grid_ ) );
4972 for( Int ib = 0; ib < Lcol.size(); ib++ ){
4973 nnzLocal += Lcol[ib].numRow * Lcol[ib].numCol;
4976 if(
MYROW( grid_ ) ==
PROW( ksup, grid_ ) ){
4977 std::vector<UBlock<T> >& Urow = this->U(
LBi( ksup, grid_ ) );
4978 for( Int jb = 0; jb < Urow.size(); jb++ ){
4979 nnzLocal += Urow[jb].numRow * Urow[jb].numCol;
4989 template<
typename T>
4992 LongInt nnzLocal = LongInt( this->NnzLocal() );
4995 MPI_Allreduce( &nnzLocal, &nnz, 1, MPI_LONG_LONG, MPI_SUM, grid_->comm );
5006 Real inertiaLocal = 0.0;
5009 for( Int ksup = 0; ksup < numSuper; ksup++ ){
5011 if(
MYROW( grid_ ) ==
PROW( ksup, grid_ ) &&
5012 MYCOL( grid_ ) ==
PCOL( ksup, grid_ ) ){
5014 for( Int i = 0; i < LB.
numRow; i++ ){
5015 if( LB.
nzval(i, i) < 0 )
5022 mpi::Allreduce( &inertiaLocal, &inertia, 1, MPI_SUM, grid_->comm );
5034 Real inertiaLocal = 0.0;
5037 for( Int ksup = 0; ksup < numSuper; ksup++ ){
5039 if(
MYROW( grid_ ) ==
PROW( ksup, grid_ ) &&
5040 MYCOL( grid_ ) ==
PCOL( ksup, grid_ ) ){
5041 LBlock<Complex> & LB = this->L(
LBj( ksup, grid_ ) )[0];
5042 for( Int i = 0; i < LB.numRow; i++ ){
5043 if( LB.nzval(i, i).real() < 0 )
5050 mpi::Allreduce( &inertiaLocal, &inertia, 1, MPI_SUM, grid_->comm );
5056 template<
typename T>
5060 if(pFactOpt->Symmetric == 0){
5061 pMat =
new PMatrixUnsym<T>( pGridType, pSuper, pSelInvOpt, pFactOpt );
5064 pMat =
new PMatrix<T>( pGridType, pSuper, pSelInvOpt, pFactOpt );
5070 template<
typename T>
5074 if(pFactOpt->Symmetric == 0){
5078 pMat =
new PMatrix<T>();
5086 template<
typename T>
5087 inline void PMatrix<T>::DumpLU()
5090 const IntNumVec& perm = super_->perm;
5091 const IntNumVec& permInv = super_->permInv;
5093 const IntNumVec * pPerm_r;
5094 const IntNumVec * pPermInv_r;
5096 if(optionsFact_->RowPerm==
"NOROWPERM"){
5097 pPerm_r = &super_->perm;
5098 pPermInv_r = &super_->permInv;
5101 pPerm_r = &super_->perm_r;
5102 pPermInv_r = &super_->permInv_r;
5105 const IntNumVec& perm_r = *pPerm_r;
5106 const IntNumVec& permInv_r = *pPermInv_r;
5108 statusOFS<<
"Col perm is "<<perm<<std::endl;
5109 statusOFS<<
"Row perm is "<<perm_r<<std::endl;
5111 statusOFS<<
"Inv Col perm is "<<permInv<<std::endl;
5112 statusOFS<<
"Inv Row perm is "<<permInv_r<<std::endl;
5117 statusOFS<<
"Content of L"<<std::endl;
5119 for(Int j = 0;j<this->L_.size()-1;++j){
5120 std::vector<LBlock<T> >& Lcol = this->L( j );
5121 Int blockColIdx =
GBj( j, this->grid_ );
5125 for( Int ib = 0; ib < Lcol.size(); ib++ ){
5126 for(Int ir = 0; ir< Lcol[ib].rows.m(); ++ir){
5127 Int row = Lcol[ib].rows[ir];
5128 for(Int col = fc; col<fc+Lcol[ib].numCol;col++){
5129 Int ocol = permInv_r[permInv[col]];
5130 Int orow = permInv[row];
5131 Int jloc = col -
FirstBlockCol( blockColIdx, this->super_ );
5133 T val = Lcol[ib].nzval( iloc, jloc );
5134 statusOFS <<
"("<< orow<<
", "<<ocol<<
") == "<<
"("<< row<<
", "<<col<<
") = "<<val<< std::endl;
5140 statusOFS<<
"Content of U"<<std::endl;
5143 for(Int i = 0;i<this->U_.size()-1;++i){
5144 std::vector<UBlock<T> >& Urow = this->U( i );
5145 Int blockRowIdx =
GBi( i, this->grid_ );
5147 for( Int jb = 0; jb < Urow.size(); jb++ ){
5148 for(Int row = fr; row<fr+Urow[jb].numRow;row++){
5149 for(Int ic = 0; ic< Urow[jb].cols.m(); ++ic){
5150 Int col = Urow[jb].cols[ic];
5151 Int ocol = permInv_r[permInv[col]];
5152 Int orow = permInv[row];
5153 Int iloc = row -
FirstBlockRow( blockRowIdx, this->super_ );
5155 T val = Urow[jb].nzval( iloc, jloc );
5156 statusOFS <<
"("<< orow<<
", "<<ocol<<
") == "<<
"("<< row<<
", "<<col<<
") = "<<val<< std::endl;
5168 template<
typename T>
5169 inline void PMatrix<T>::CopyLU(
const PMatrix<T> & C){
5170 ColBlockIdx_ = C.ColBlockIdx_;
5171 RowBlockIdx_ = C.RowBlockIdx_;
5181 #endif //_PEXSI_PSELINV_IMPL_HPP_
virtual void PreSelInv()
PreSelInv prepares the structure in L_ and U_ so that SelInv only involves matrix-matrix multiplication.
Definition: pselinv_impl.hpp:3594
A thin interface for passing parameters to set the Factorization options.
Definition: pselinv.hpp:115
Int blockIdx
Block index (supernodal index)
Definition: pselinv.hpp:248
void PMatrixToDistSparseMatrix(DistSparseMatrix< T > &A)
PMatrixToDistSparseMatrix converts the PMatrix into a distributed compressed sparse column matrix format.
Definition: pselinv_impl.hpp:4083
Definition: TreeBcast.hpp:705
SuperNodeType describes mapping between supernode and column, the permutation information, and potentially the elimination tree (not implemented here).
Definition: pselinv.hpp:177
Interface with SuperLU_Dist (version 3.0 and later)
void GetNegativeInertia(Real &inertia)
GetNegativeInertia computes the negative inertia of a PMatrix. This can be used to estimate e...
Int PROW(Int bnum, const GridType *g)
PROW returns the processor row that the bnum-th block (supernode) belongs to.
Definition: pselinv.hpp:304
Int numRow
Number of nonzero rows.
Definition: pselinv.hpp:199
IntNumVec colptrLocal
Dimension numColLocal + 1, storing the pointers to the nonzero row indices and nonzero values in rowp...
Definition: sparse_matrix.hpp:108
void UnpackData(SuperNodeBufferType &snode, std::vector< LBlock< T > > &LcolRecv, std::vector< UBlock< T > > &UrowRecv)
UnpackData.
Definition: pselinv_impl.hpp:1060
Int MYPROC(const GridType *g)
MYPROC returns the current processor rank.
Definition: pselinv.hpp:291
Int nnzLocal
Local number of nonzero elements on this processor.
Definition: sparse_matrix.hpp:96
Int PCOL(Int bnum, const GridType *g)
PCOL returns the processor column that the bnum-th block (supernode) belongs to.
Definition: pselinv.hpp:309
Int LBi(Int bnum, const GridType *g)
LBi returns the local block number on the processor at processor row PROW( bnum, g )...
Definition: pselinv.hpp:319
Profiling and timing using TAU.
Int SuperSize(Int bnum, const SuperNodeType *s)
SuperSize returns the size of the block bnum.
Definition: pselinv.hpp:360
Int LBj(Int bnum, const GridType *g)
LBj returns the local block number on the processor at processor column PCOL( bnum, g ).
Definition: pselinv.hpp:324
Int GBj(Int jLocal, const GridType *g)
GBj returns the global block number from a local block number in the column direction.
Definition: pselinv.hpp:334
void SetValue(NumMat< F > &M, F val)
SetValue sets a numerical matrix to a constant val.
Definition: NumMat_impl.hpp:153
Int numCol
Number of nonzero columns.
Definition: pselinv.hpp:202
Int NumCol(const SuperNodeType *s)
NumCol returns the total number of columns for a supernodal partition.
Definition: pselinv.hpp:369
Definition: pselinv.hpp:615
void GetDiagonal(NumVec< T > &diag)
GetDiagonal extracts the diagonal elements of the PMatrix.
Definition: pselinv_impl.hpp:3961
void GetWorkSet(std::vector< Int > &snodeEtree, std::vector< std::vector< Int > > &WSet)
GetWorkSet.
Definition: pselinv_impl.hpp:1227
NumVec< F > nzvalLocal
Dimension nnzLocal, storing the nonzero values.
Definition: sparse_matrix.hpp:116
Int FirstBlockRow(Int bnum, const SuperNodeType *s)
FirstBlockRow returns the first column of a block bnum. Note: the functionality of FirstBlockRow is exactly the same as FirstBlockCol.
Definition: pselinv.hpp:355
void GetEtree(std::vector< Int > &etree_supno)
GetEtree computes the supernodal elimination tree to be used later in the pipelined selected inversion.
Definition: pselinv_impl.hpp:1166
LBlock stores a nonzero block in the lower triangular part or the diagonal part in PSelInv.
Definition: pselinv.hpp:193
IntNumVec rowindLocal
Dimension nnzLocal, storing the nonzero row indices. The indices are 1-based (FORTRAN-convention), i.e. the first row index is 1.
Definition: sparse_matrix.hpp:113
Int PNUM(Int i, Int j, const GridType *g)
PNUM returns the rank of the processor at processor row i and processor column j of the grid.
Definition: pselinv.hpp:314
virtual void ConstructCommunicationPattern()
ConstructCommunicationPattern constructs the communication pattern to be used later in the selected inversion.
Definition: pselinv_impl.hpp:2603
void ConstructCommunicationPattern_P2p()
ConstructCommunicationPattern_P2p constructs the communication pattern to be used later in the selected inversion.
Definition: pselinv_impl.hpp:2611
Int GBi(Int iLocal, const GridType *g)
GBi returns the global block number from a local block number in the row direction.
Definition: pselinv.hpp:329
GridType is the PSelInv way of defining the grid.
Definition: pselinv.hpp:140
void SelInvIntra_P2p(Int lidx, Int &rank)
SelInvIntra_P2p.
Definition: pselinv_impl.hpp:1465
Int MYCOL(const GridType *g)
MYCOL returns my processor column.
Definition: pselinv.hpp:299
Int size
Matrix dimension.
Definition: sparse_matrix.hpp:93
void PMatrixToDistSparseMatrix_OLD(const DistSparseMatrix< T > &A, DistSparseMatrix< T > &B)
PMatrixToDistSparseMatrix_OLD converts the PMatrix into a distributed compressed sparse column matrix format.
Definition: pselinv_impl.hpp:4355
IntNumVec rows
Dimension numRow * 1, 0-based indices of the nonzero rows.
Definition: pselinv.hpp:205
Int numCol
Number of nonzero columns.
Definition: pselinv.hpp:254
UBlock stores a nonzero block in the upper triangular part in PSelInv.
Definition: pselinv.hpp:245
Int FirstBlockCol(Int bnum, const SuperNodeType *s)
FirstBlockCol returns the first column of a block bnum.
Definition: pselinv.hpp:348
LongInt Nnz()
Compute the total number of nonzeros through MPI_Allreduce.
Definition: sparse_matrix_impl.hpp:101
MPI_Comm comm
MPI communicator.
Definition: sparse_matrix.hpp:119
Int NnzLocal()
NnzLocal computes the number of nonzero elements (L and U) saved locally.
Definition: pselinv_impl.hpp:4965
Int blockIdx
Block index (supernodal index)
Definition: pselinv.hpp:196
PMatrix contains the main data structure and the computational routine for the parallel selected inversion.
Definition: pselinv.hpp:542
void SendRecvCD_UpdateU(std::vector< SuperNodeBufferType > &arrSuperNodes, Int stepSuper)
SendRecvCD_UpdateU.
Definition: pselinv_impl.hpp:828
Int BlockIdx(Int i, const SuperNodeType *s)
BlockIdx returns the block index of a column i.
Definition: pselinv.hpp:343
Int numRow
Number of nonzero rows.
Definition: pselinv.hpp:251
NumMat< T > nzval
Dimension numRow * numCol, nonzero elements.
Definition: pselinv.hpp:260
Int nnz
Total number of nonzero elements.
Definition: sparse_matrix.hpp:101
Int NumSuper(const SuperNodeType *s)
NumSuper returns the total number of supernodes.
Definition: pselinv.hpp:364
void SelInv_P2p()
Point-to-point version of the selected inversion.
Definition: pselinv_impl.hpp:3537
NumMat< T > nzval
Dimension numRow * numCol, nonzero elements.
Definition: pselinv.hpp:209
A thin interface for passing parameters to set the PSelInv options.
Definition: pselinv.hpp:102
Definition: utility.hpp:1626
IntNumVec cols
Dimension numCol * 1, 0-based indices of the nonzero columns.
Definition: pselinv.hpp:257
virtual void SelInv()
SelInv is the main function for the selected inversion.
Definition: pselinv_impl.hpp:3484
LongInt Nnz()
Nnz computes the total number of nonzero elements in the PMatrix.
Definition: pselinv_impl.hpp:4990
void SelInv_lookup_indexes(SuperNodeBufferType &snode, std::vector< LBlock< T > > &LcolRecv, std::vector< UBlock< T > > &UrowRecv, NumMat< T > &AinvBuf, NumMat< T > &UBuf)
SelInv_lookup_indexes.
Definition: pselinv_impl.hpp:471
Definition: TreeBcast.hpp:1272
void ComputeDiagUpdate(SuperNodeBufferType &snode)
ComputeDiagUpdate.
Definition: pselinv_impl.hpp:1126
DistSparseMatrix describes a sparse matrix in the compressed sparse column format (CSC), distributed with a column-major partition.
Definition: sparse_matrix.hpp:91
PMatrixUnsym contains the main data structure and the computational routine for the parallel selected...
Definition: pselinv.hpp:1003
Int MYROW(const GridType *g)
MYROW returns my processor row.
Definition: pselinv.hpp:295