#ifndef _PEXSI_PSELINV_IMPL_HPP_
#define _PEXSI_PSELINV_IMPL_HPP_

#define MPI_MAX_COMM (1024)
#define BCAST_THRESHOLD 16
inline GridType::GridType ( MPI_Comm Bcomm, int nprow, int npcol )
{
  PushCallStack("GridType::GridType");
  int info;
  MPI_Initialized( &info );
  if( !info ){
    throw std::logic_error( "MPI has not been initialized." );
  }
  MPI_Group comm_group;
  MPI_Comm_group( Bcomm, &comm_group );
  MPI_Comm_create( Bcomm, comm_group, &comm );

  MPI_Comm_rank( comm, &mpirank );
  MPI_Comm_size( comm, &mpisize );
  if( mpisize != nprow * npcol ){
    throw std::logic_error( "mpisize != nprow * npcol." );
  }

  Int myrow = mpirank / npcol;
  Int mycol = mpirank % npcol;

  MPI_Comm_split( comm, myrow, mycol, &rowComm );
  MPI_Comm_split( comm, mycol, myrow, &colComm );

  MPI_Group_free( &comm_group );
}
inline GridType::~GridType ( )
{
  PushCallStack("GridType::~GridType");
  MPI_Comm_free( &rowComm );
  MPI_Comm_free( &colComm );
  MPI_Comm_free( &comm );
}
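// PMatrix<T>::deallocate releases the block-index arrays, the boolean
// communication masks, and the heap-allocated broadcast/reduction trees
// owned by this PMatrix instance.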
template<typename T>
void PMatrix<T>::deallocate(){

  ColBlockIdx_.clear();
  RowBlockIdx_.clear();

  isSendToBelow_.Clear();
  isSendToRight_.Clear();
  isSendToDiagonal_.Clear();
  isSendToCrossDiagonal_.Clear();

  isRecvFromBelow_.Clear();
  isRecvFromAbove_.Clear();
  isRecvFromLeft_.Clear();
  isRecvFromCrossDiagonal_.Clear();

  // Delete the communication trees, which are allocated with new.
  for(int i = 0; i < fwdToBelowTree_.size(); ++i){
    if(fwdToBelowTree_[i] != NULL){
      delete fwdToBelowTree_[i];
    }
  }
  for(int i = 0; i < fwdToRightTree_.size(); ++i){
    if(fwdToRightTree_[i] != NULL){
      delete fwdToRightTree_[i];
    }
  }
  for(int i = 0; i < redToLeftTree_.size(); ++i){
    if(redToLeftTree_[i] != NULL){
      delete redToLeftTree_[i];
    }
  }
  for(int i = 0; i < redToAboveTree_.size(); ++i){
    if(redToAboveTree_[i] != NULL){
      delete redToAboveTree_[i];
    }
  }
}
template<typename T>
PMatrix<T> & PMatrix<T>::operator = ( const PMatrix<T> & C ){

  options_   = C.options_;
  optionsLU_ = C.optionsLU_;

  ColBlockIdx_ = C.ColBlockIdx_;
  RowBlockIdx_ = C.RowBlockIdx_;

  workingSet_ = C.workingSet_;

  isSendToBelow_         = C.isSendToBelow_;
  isSendToRight_         = C.isSendToRight_;
  isSendToDiagonal_      = C.isSendToDiagonal_;
  isSendToCrossDiagonal_ = C.isSendToCrossDiagonal_;

  isRecvFromBelow_         = C.isRecvFromBelow_;
  isRecvFromAbove_         = C.isRecvFromAbove_;
  isRecvFromLeft_          = C.isRecvFromLeft_;
  isRecvFromCrossDiagonal_ = C.isRecvFromCrossDiagonal_;

  // Deep copy of the communication trees through their clone() interface.
  fwdToBelowTree_.resize(C.fwdToBelowTree_.size());
  for(int i = 0; i < C.fwdToBelowTree_.size(); ++i){
    if(C.fwdToBelowTree_[i] != NULL){
      fwdToBelowTree_[i] = C.fwdToBelowTree_[i]->clone();
    }
  }
  fwdToRightTree_.resize(C.fwdToRightTree_.size());
  for(int i = 0; i < C.fwdToRightTree_.size(); ++i){
    if(C.fwdToRightTree_[i] != NULL){
      fwdToRightTree_[i] = C.fwdToRightTree_[i]->clone();
    }
  }
  redToLeftTree_.resize(C.redToLeftTree_.size());
  for(int i = 0; i < C.redToLeftTree_.size(); ++i){
    if(C.redToLeftTree_[i] != NULL){
      redToLeftTree_[i] = C.redToLeftTree_[i]->clone();
    }
  }
  redToAboveTree_.resize(C.redToAboveTree_.size());
  for(int i = 0; i < C.redToAboveTree_.size(); ++i){
    if(C.redToAboveTree_[i] != NULL){
      redToAboveTree_[i] = C.redToAboveTree_[i]->clone();
    }
  }

  return *this;
}
template<typename T>
PMatrix<T>::PMatrix( const PMatrix<T> & C ){

  options_   = C.options_;
  optionsLU_ = C.optionsLU_;

  ColBlockIdx_ = C.ColBlockIdx_;
  RowBlockIdx_ = C.RowBlockIdx_;

  workingSet_ = C.workingSet_;

  isSendToBelow_         = C.isSendToBelow_;
  isSendToRight_         = C.isSendToRight_;
  isSendToDiagonal_      = C.isSendToDiagonal_;
  isSendToCrossDiagonal_ = C.isSendToCrossDiagonal_;

  isRecvFromBelow_         = C.isRecvFromBelow_;
  isRecvFromAbove_         = C.isRecvFromAbove_;
  isRecvFromLeft_          = C.isRecvFromLeft_;
  isRecvFromCrossDiagonal_ = C.isRecvFromCrossDiagonal_;

  // Deep copy of the communication trees, as in operator=.
  fwdToBelowTree_.resize(C.fwdToBelowTree_.size());
  for(int i = 0; i < C.fwdToBelowTree_.size(); ++i){
    if(C.fwdToBelowTree_[i] != NULL){
      fwdToBelowTree_[i] = C.fwdToBelowTree_[i]->clone();
    }
  }
  fwdToRightTree_.resize(C.fwdToRightTree_.size());
  for(int i = 0; i < C.fwdToRightTree_.size(); ++i){
    if(C.fwdToRightTree_[i] != NULL){
      fwdToRightTree_[i] = C.fwdToRightTree_[i]->clone();
    }
  }
  redToLeftTree_.resize(C.redToLeftTree_.size());
  for(int i = 0; i < C.redToLeftTree_.size(); ++i){
    if(C.redToLeftTree_[i] != NULL){
      redToLeftTree_[i] = C.redToLeftTree_[i]->clone();
    }
  }
  redToAboveTree_.resize(C.redToAboveTree_.size());
  for(int i = 0; i < C.redToAboveTree_.size(); ++i){
    if(C.redToAboveTree_[i] != NULL){
      redToAboveTree_[i] = C.redToAboveTree_[i]->clone();
    }
  }
}
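// The remaining constructors record the call stack and delegate the actual
// allocation of the L/U block storage to Setup().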
template<typename T>
PMatrix<T>::PMatrix (
    const GridType* g,
    const SuperNodeType* s,
    const PSelInvOptions* o,
    const SuperLUOptions* oLU )
{
  PushCallStack("PMatrix::PMatrix");
  this->Setup( g, s, o, oLU );
}

template<typename T>
PMatrix<T>::~PMatrix ( )
{
  PushCallStack("PMatrix::~PMatrix");
}

template<typename T>
void PMatrix<T>::Setup(
    const GridType* g,
    const SuperNodeType* s,
    const PSelInvOptions* o,
    const SuperLUOptions* oLU )
{
  PushCallStack("PMatrix::Setup");
  ColBlockIdx_.clear();
  RowBlockIdx_.clear();

  L_.resize( this->NumLocalBlockCol() );
  U_.resize( this->NumLocalBlockRow() );

  ColBlockIdx_.resize( this->NumLocalBlockCol() );
  RowBlockIdx_.resize( this->NumLocalBlockRow() );

#if ( _DEBUGlevel_ >= 1 )
  statusOFS << std::endl
    << "PMatrix is constructed. The grid information: " << std::endl;
  statusOFS << "mpirank = " << MYPROC(grid_) << std::endl;
  statusOFS << "myrow   = " << MYROW(grid_) << std::endl;
  statusOFS << "mycol   = " << MYCOL(grid_) << std::endl;
#endif
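// SelInv_lookup_indexes assembles the dense work buffers for the update of
// supernode snode.Index: rowPtr/colPtr are prefix sums over the received
// L-column and U-row blocks, UBuf collects the U blocks of the current
// supernode, and AinvBuf gathers the matching entries of the already
// computed part of Sinv.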
  TIMER_START(Compute_Sinv_LT_Lookup_Indexes);

  TIMER_START(Build_colptr_rowptr);
  // Prefix sums over the received blocks give the row/column offset of each
  // block inside AinvBuf.
  std::vector<Int> rowPtr(LcolRecv.size() + 1);
  std::vector<Int> colPtr(UrowRecv.size() + 1);
  rowPtr[0] = 0;
  for( Int ib = 0; ib < LcolRecv.size(); ib++ ){
    rowPtr[ib+1] = rowPtr[ib] + LcolRecv[ib].numRow;
  }
  colPtr[0] = 0;
  for( Int jb = 0; jb < UrowRecv.size(); jb++ ){
    colPtr[jb+1] = colPtr[jb] + UrowRecv[jb].numCol;
  }

  Int numRowAinvBuf = *rowPtr.rbegin();
  Int numColAinvBuf = *colPtr.rbegin();
  TIMER_STOP(Build_colptr_rowptr);

  TIMER_START(Allocate_lookup);
  AinvBuf.Resize( numRowAinvBuf, numColAinvBuf );
  UBuf.Resize( SuperSize( snode.Index, super_ ), numColAinvBuf );
  TIMER_STOP(Allocate_lookup);

  TIMER_START(Fill_UBuf);
  for( Int jb = 0; jb < UrowRecv.size(); jb++ ){
    UBlock<T>& UB = UrowRecv[jb];
    if( UB.numRow != SuperSize(snode.Index, super_) ){
      throw std::logic_error(
          "The size of UB is not right. Something is seriously wrong." );
    }
    lapack::Lacpy( 'A', UB.numRow, UB.numCol, UB.nzval.Data(),
        UB.numRow, UBuf.VecData( colPtr[jb] ),
        SuperSize( snode.Index, super_ ) );
  }
  TIMER_STOP(Fill_UBuf);
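  // For every pair (ib, jb) the needed block of Sinv lives either in the
  // L column of supernode jsup (lower-triangular part) or in the U row of
  // supernode isup (upper-triangular part). Relative row/column index maps
  // are built so that the entries can be scattered into AinvBuf.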
  TIMER_START(JB_Loop);
  for( Int jb = 0; jb < UrowRecv.size(); jb++ ){
    for( Int ib = 0; ib < LcolRecv.size(); ib++ ){
      LBlock<T> & LB = LcolRecv[ib];
      UBlock<T> & UB = UrowRecv[jb];
      Int isup = LB.blockIdx;
      Int jsup = UB.blockIdx;
      T* nzvalAinv = &AinvBuf( rowPtr[ib], colPtr[jb] );
      Int ldAinv = AinvBuf.m();

      if( isup > jsup ){
        // Lower-triangular case: the block of Sinv lives in the L column of
        // supernode jsup.
        std::vector<LBlock<T> >& LcolSinv = this->L( LBj( jsup, grid_ ) );
        bool isBlockFound = false;
        TIMER_START(PARSING_ROW_BLOCKIDX);
        for( Int ibSinv = 0; ibSinv < LcolSinv.size(); ibSinv++ ){
          if( LcolSinv[ibSinv].blockIdx == isup ){
            LBlock<T> & SinvB = LcolSinv[ibSinv];

            // Row relative indices of LB inside SinvB.
            std::vector<Int> relRows( LB.numRow );
            Int* rowsLBPtr    = LB.rows.Data();
            Int* rowsSinvBPtr = SinvB.rows.Data();
            for( Int i = 0; i < LB.numRow; i++ ){
              bool isRowFound = false;
              for( Int i1 = 0; i1 < SinvB.numRow; i1++ ){
                if( rowsLBPtr[i] == rowsSinvBPtr[i1] ){
                  isRowFound = true;
                  relRows[i] = i1;
                  break;
                }
              }
              if( isRowFound == false ){
                std::ostringstream msg;
                msg << "Row " << rowsLBPtr[i]
                  << " in LB cannot find the corresponding row in SinvB" << std::endl
                  << "LB.rows    = " << LB.rows << std::endl
                  << "SinvB.rows = " << SinvB.rows << std::endl;
                throw std::runtime_error( msg.str().c_str() );
              }
            }

            // Column relative indices.
            std::vector<Int> relCols( UB.numCol );
            for( Int j = 0; j < UB.numCol; j++ ){
              relCols[j] = UB.cols[j] - SinvColsSta;
            }

            // Transfer the values from Sinv into AinvBuf.
            T* nzvalSinv = SinvB.nzval.Data();
            Int ldSinv = SinvB.numRow;
            for( Int j = 0; j < UB.numCol; j++ ){
              for( Int i = 0; i < LB.numRow; i++ ){
                nzvalAinv[i+j*ldAinv] =
                  nzvalSinv[relRows[i] + relCols[j] * ldSinv];
              }
            }

            isBlockFound = true;
            break;
          }
        }
        TIMER_STOP(PARSING_ROW_BLOCKIDX);
        if( isBlockFound == false ){
          std::ostringstream msg;
          msg << "Block(" << isup << ", " << jsup
            << ") did not find a matching block in Sinv." << std::endl;
          throw std::runtime_error( msg.str().c_str() );
        }
      } // if( isup > jsup )
      else{
        // Upper-triangular (and diagonal) case: the block of Sinv lives in
        // the U row of supernode isup.
        std::vector<UBlock<T> >& UrowSinv = this->U( LBi( isup, grid_ ) );
        bool isBlockFound = false;
        TIMER_START(PARSING_COL_BLOCKIDX);
        for( Int jbSinv = 0; jbSinv < UrowSinv.size(); jbSinv++ ){
          if( UrowSinv[jbSinv].blockIdx == jsup ){
            UBlock<T> & SinvB = UrowSinv[jbSinv];

            // Row relative indices.
            std::vector<Int> relRows( LB.numRow );
            for( Int i = 0; i < LB.numRow; i++ ){
              relRows[i] = LB.rows[i] - SinvRowsSta;
            }

            // Column relative indices of UB inside SinvB.
            std::vector<Int> relCols( UB.numCol );
            Int* colsUBPtr    = UB.cols.Data();
            Int* colsSinvBPtr = SinvB.cols.Data();
            for( Int j = 0; j < UB.numCol; j++ ){
              bool isColFound = false;
              for( Int j1 = 0; j1 < SinvB.numCol; j1++ ){
                if( colsUBPtr[j] == colsSinvBPtr[j1] ){
                  isColFound = true;
                  relCols[j] = j1;
                  break;
                }
              }
              if( isColFound == false ){
                std::ostringstream msg;
                msg << "Col " << colsUBPtr[j]
                  << " in UB cannot find the corresponding column in SinvB" << std::endl
                  << "UB.cols    = " << UB.cols << std::endl
                  << "SinvB.cols = " << SinvB.cols << std::endl;
                throw std::runtime_error( msg.str().c_str() );
              }
            }

            // Transfer the values from Sinv into AinvBuf.
            T* nzvalSinv = SinvB.nzval.Data();
            Int ldSinv = SinvB.numRow;
            for( Int j = 0; j < UB.numCol; j++ ){
              for( Int i = 0; i < LB.numRow; i++ ){
                nzvalAinv[i+j*ldAinv] =
                  nzvalSinv[relRows[i] + relCols[j] * ldSinv];
              }
            }

            isBlockFound = true;
            break;
          }
        }
        TIMER_STOP(PARSING_COL_BLOCKIDX);
        if( isBlockFound == false ){
          std::ostringstream msg;
          msg << "Block(" << isup << ", " << jsup
            << ") did not find a matching block in Sinv." << std::endl;
          throw std::runtime_error( msg.str().c_str() );
        }
      }
    }
  }
  TIMER_STOP(JB_Loop);

  TIMER_STOP(Compute_Sinv_LT_Lookup_Indexes);
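// SendRecvCD_UpdateU exchanges the LUpdateBuf contributions across the
// cross-diagonal processors and uses them to update the U blocks: each
// column owner of a supernode posts non-blocking sends of the serialized
// (RowLocalPtr, BlockIdxLocal, LUpdateBuf) triple, and the row owners
// receive, transpose, and copy the result into the matching U blocks.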
template<typename T>
void PMatrix<T>::SendRecvCD_UpdateU(
    std::vector<SuperNodeBufferType> & arrSuperNodes,
    Int stepSuper )
{
  TIMER_START(Send_CD_Update_U);
  // Count the cross-diagonal messages to send and receive, and record the
  // per-supernode offsets into the request/buffer arrays.
  Int sendCount = 0;
  Int recvCount = 0;
  Int sendOffset[stepSuper];
  Int recvOffset[stepSuper];
  for (Int supidx=0; supidx<stepSuper; supidx++){
    SuperNodeBufferType & snode = arrSuperNodes[supidx];
    sendOffset[supidx] = sendCount;
    recvOffset[supidx] = recvCount;
    sendCount += CountSendToCrossDiagonal(snode.Index);
    recvCount += CountRecvFromCrossDiagonal(snode.Index);
  }

  std::vector<MPI_Request> arrMpiReqsSendCD    (sendCount, MPI_REQUEST_NULL );
  std::vector<MPI_Request> arrMpiReqsSizeSendCD(sendCount, MPI_REQUEST_NULL );
  std::vector<MPI_Request> arrMpiReqsRecvCD    (recvCount, MPI_REQUEST_NULL );
  std::vector<MPI_Request> arrMpiReqsSizeRecvCD(recvCount, MPI_REQUEST_NULL );
  std::vector<std::vector<char> > arrSstrLcolSendCD(sendCount);
  std::vector<int>                arrSstrLcolSizeSendCD(sendCount);
  std::vector<std::vector<char> > arrSstrLcolRecvCD(recvCount);
  std::vector<int>                arrSstrLcolSizeRecvCD(recvCount);
  // Post the non-blocking sends to the cross-diagonal processors.
  for (Int supidx=0; supidx<stepSuper; supidx++){
    SuperNodeBufferType & snode = arrSuperNodes[supidx];

    TIMER_START(Send_L_CrossDiag);

    if( MYCOL( grid_ ) == PCOL( snode.Index, grid_ ) &&
        isSendToCrossDiagonal_(grid_->numProcCol, snode.Index ) ){

      Int sendIdx = 0;
      for(Int dstCol = 0; dstCol<grid_->numProcCol; dstCol++){
        if(isSendToCrossDiagonal_(dstCol,snode.Index) ){
          Int dest = PNUM( PROW(snode.Index,grid_), dstCol, grid_ );

          if( MYPROC( grid_ ) != dest ){
            MPI_Request & mpiReqSizeSend = arrMpiReqsSizeSendCD[sendOffset[supidx]+sendIdx];
            MPI_Request & mpiReqSend     = arrMpiReqsSendCD[sendOffset[supidx]+sendIdx];

            std::stringstream sstm;
            std::vector<char> & sstrLcolSend = arrSstrLcolSendCD[sendOffset[supidx]+sendIdx];
            Int & sstrSize = arrSstrLcolSizeSendCD[sendOffset[supidx]+sendIdx];

            serialize( snode.RowLocalPtr, sstm, NO_MASK );
            serialize( snode.BlockIdxLocal, sstm, NO_MASK );
            serialize( snode.LUpdateBuf, sstm, NO_MASK );

            sstrLcolSend.resize( Size(sstm) );
            sstm.read( &sstrLcolSend[0], sstrLcolSend.size() );
            sstrSize = sstrLcolSend.size();

            MPI_Isend( &sstrSize, sizeof(sstrSize), MPI_BYTE, dest,
                IDX_TO_TAG(snode.Index,SELINV_TAG_L_SIZE_CD), grid_->comm, &mpiReqSizeSend );
            MPI_Isend( (void*)&sstrLcolSend[0], sstrSize, MPI_BYTE, dest,
                IDX_TO_TAG(snode.Index,SELINV_TAG_L_CONTENT_CD), grid_->comm, &mpiReqSend );

            PROFILE_COMM( MYPROC(this->grid_), dest,
                IDX_TO_TAG(snode.Index,SELINV_TAG_L_SIZE_CD), sizeof(sstrSize) );
            PROFILE_COMM( MYPROC(this->grid_), dest,
                IDX_TO_TAG(snode.Index,SELINV_TAG_L_CONTENT_CD), sstrSize );

            sendIdx++;
          }
        }
      }
    }
    TIMER_STOP(Send_L_CrossDiag);
  }
  // Post the receives for the message sizes coming from the cross-diagonal
  // processors, then wait for them before posting the content receives.
  for (Int supidx=0; supidx<stepSuper; supidx++){
    SuperNodeBufferType & snode = arrSuperNodes[supidx];
    if( MYROW( grid_ ) == PROW( snode.Index, grid_ ) &&
        isRecvFromCrossDiagonal_(grid_->numProcRow, snode.Index ) ){
      Int recvIdx = 0;
      for(Int srcRow = 0; srcRow<grid_->numProcRow; srcRow++){
        if(isRecvFromCrossDiagonal_(srcRow,snode.Index) ){
          Int src = PNUM( srcRow, PCOL(snode.Index,grid_), grid_ );
          if( MYPROC( grid_ ) != src ){
            Int & sstrSize = arrSstrLcolSizeRecvCD[recvOffset[supidx]+recvIdx];
            MPI_Request & mpiReqSizeRecv = arrMpiReqsSizeRecvCD[recvOffset[supidx]+recvIdx];
            MPI_Irecv( &sstrSize, 1, MPI_INT, src,
                IDX_TO_TAG(snode.Index,SELINV_TAG_L_SIZE_CD), grid_->comm, &mpiReqSizeRecv );
            recvIdx++;
          }
        }
      }
    }
  }

  mpi::Waitall(arrMpiReqsSizeRecvCD);

  // Now that the sizes are known, post the receives for the actual content.
  for (Int supidx=0; supidx<stepSuper; supidx++){
    SuperNodeBufferType & snode = arrSuperNodes[supidx];
    if( MYROW( grid_ ) == PROW( snode.Index, grid_ ) &&
        isRecvFromCrossDiagonal_(grid_->numProcRow, snode.Index ) ){
      Int recvIdx = 0;
      for(Int srcRow = 0; srcRow<grid_->numProcRow; srcRow++){
        if(isRecvFromCrossDiagonal_(srcRow,snode.Index) ){
          Int src = PNUM( srcRow, PCOL(snode.Index,grid_), grid_ );
          if( MYPROC( grid_ ) != src ){
            Int & sstrSize = arrSstrLcolSizeRecvCD[recvOffset[supidx]+recvIdx];
            std::vector<char> & sstrLcolRecv = arrSstrLcolRecvCD[recvOffset[supidx]+recvIdx];
            MPI_Request & mpiReqRecv = arrMpiReqsRecvCD[recvOffset[supidx]+recvIdx];
            sstrLcolRecv.resize( sstrSize );
            MPI_Irecv( (void*)&sstrLcolRecv[0], sstrSize, MPI_BYTE, src,
                IDX_TO_TAG(snode.Index,SELINV_TAG_L_CONTENT_CD), grid_->comm, &mpiReqRecv );
            recvIdx++;
          }
        }
      }
    }
  }

  mpi::Waitall(arrMpiReqsRecvCD);
878 for (Int supidx=0; supidx<stepSuper; supidx++){
883 if(
MYROW( grid_ ) ==
PROW( snode.Index, grid_ ) && isRecvFromCrossDiagonal_(grid_->numProcRow, snode.Index ) ){
885 #if ( _DEBUGlevel_ >= 1 )
886 statusOFS << std::endl <<
" ["<<snode.Index<<
"] "<<
"Update the upper triangular block" << std::endl << std::endl;
887 statusOFS << std::endl <<
" ["<<snode.Index<<
"] "<<
"blockIdxLocal:" << snode.BlockIdxLocal << std::endl << std::endl;
888 statusOFS << std::endl <<
" ["<<snode.Index<<
"] "<<
"rowLocalPtr:" << snode.RowLocalPtr << std::endl << std::endl;
891 std::vector<UBlock<T> >& Urow = this->U(
LBi( snode.Index, grid_ ) );
892 std::vector<bool> isBlockFound(Urow.size(),
false);
895 for(Int srcRow = 0; srcRow<grid_->numProcRow; srcRow++){
896 if(isRecvFromCrossDiagonal_(srcRow,snode.Index) ){
897 Int src =
PNUM(srcRow,
PCOL(snode.Index,grid_),grid_);
898 TIMER_START(Recv_L_CrossDiag);
900 std::vector<Int> rowLocalPtrRecv;
901 std::vector<Int> blockIdxLocalRecv;
904 if(
MYPROC( grid_ ) != src ){
905 std::stringstream sstm;
906 Int & sstrSize = arrSstrLcolSizeRecvCD[recvOffset[supidx]+recvIdx];
907 std::vector<char> & sstrLcolRecv = arrSstrLcolRecvCD[recvOffset[supidx]+recvIdx];
908 sstm.write( &sstrLcolRecv[0], sstrSize );
910 deserialize( rowLocalPtrRecv, sstm, NO_MASK );
911 deserialize( blockIdxLocalRecv, sstm, NO_MASK );
912 deserialize( UUpdateBuf, sstm, NO_MASK );
918 rowLocalPtrRecv = snode.RowLocalPtr;
919 blockIdxLocalRecv = snode.BlockIdxLocal;
920 UUpdateBuf = snode.LUpdateBuf;
925 TIMER_STOP(Recv_L_CrossDiag);
927 #if ( _DEBUGlevel_ >= 1 )
928 statusOFS<<
" ["<<snode.Index<<
"] P"<<
MYPROC(grid_)<<
" ("<<
MYROW(grid_)<<
","<<
MYCOL(grid_)<<
") <--- LBj("<<snode.Index<<
") <--- P"<<src<<std::endl;
929 statusOFS << std::endl <<
" ["<<snode.Index<<
"] "<<
"rowLocalPtrRecv:" << rowLocalPtrRecv << std::endl << std::endl;
930 statusOFS << std::endl <<
" ["<<snode.Index<<
"] "<<
"blockIdxLocalRecv:" << blockIdxLocalRecv << std::endl << std::endl;
        // Copy each received row block of the update into a temporary matrix
        // and transpose it into the matching U block.
        for( Int ib = 0; ib < blockIdxLocalRecv.size(); ib++ ){
          for( Int jb = 0; jb < Urow.size(); jb++ ){
            UBlock<T> & UB = Urow[jb];
            if( UB.blockIdx == blockIdxLocalRecv[ib] ){
              NumMat<T> Ltmp( UB.numCol, UB.numRow );
              lapack::Lacpy( 'A', Ltmp.m(), Ltmp.n(),
                  &UUpdateBuf( rowLocalPtrRecv[ib], 0 ),
                  UUpdateBuf.m(), Ltmp.Data(), Ltmp.m() );
              isBlockFound[jb] = true;
              // U(k,i) stores the transpose of the received L update.
              Transpose( Ltmp, UB.nzval );
            }
          }
        }

      // Every U block of this supernode must have received an update.
      for( Int jb = 0; jb < Urow.size(); jb++ ){
        if( !isBlockFound[jb] ){
          throw std::logic_error(
              "UBlock cannot find its update. Something is seriously wrong." );
        }
      }

  TIMER_STOP(Send_CD_Update_U);
966 mpi::Waitall(arrMpiReqsSizeSendCD);
967 mpi::Waitall(arrMpiReqsSendCD);
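// UnpackData reconstructs the L-column and U-row block lists needed for the
// GEMM on this process: off-owner processes deserialize the received byte
// streams, while the owning row/column simply copies its local blocks.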
template<typename T>
void PMatrix<T>::UnpackData(
    SuperNodeBufferType & snode,
    std::vector<LBlock<T> > & LcolRecv,
    std::vector<UBlock<T> > & UrowRecv )
{
#if ( _DEBUGlevel_ >= 1 )
  statusOFS << std::endl << "["<<snode.Index<<"] "
    << "Unpack the received data for processors participating in Gemm. "
    << std::endl << std::endl;
#endif

  // U part: processes outside the owning row deserialize the received
  // U blocks; the owning row copies its local U row directly.
  if( MYROW( grid_ ) != PROW( snode.Index, grid_ ) ){
    std::stringstream sstm;
    sstm.write( &snode.SstrUrowRecv[0], snode.SstrUrowRecv.size() );
    std::vector<Int> mask( UBlockMask::TOTAL_NUMBER, 1 );
    Int numUBlock;
    deserialize( numUBlock, sstm, NO_MASK );
    UrowRecv.resize( numUBlock );
    for( Int jb = 0; jb < numUBlock; jb++ ){
      deserialize( UrowRecv[jb], sstm, mask );
    }
  }
  else{
    UrowRecv.resize( this->U( LBi( snode.Index, grid_ ) ).size() );
    std::copy( this->U( LBi( snode.Index, grid_ ) ).begin(),
        this->U( LBi( snode.Index, grid_ ) ).end(), UrowRecv.begin() );
  }

  // L part: processes outside the owning column deserialize the received
  // L blocks (nzval is not transferred); the owning column copies its local
  // L column, skipping the diagonal block when it owns it.
  if( MYCOL( grid_ ) != PCOL( snode.Index, grid_ ) ){
    std::stringstream sstm;
    sstm.write( &snode.SstrLcolRecv[0], snode.SstrLcolRecv.size() );
    std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
    mask[LBlockMask::NZVAL] = 0;
    Int numLBlock;
    deserialize( numLBlock, sstm, NO_MASK );
    LcolRecv.resize( numLBlock );
    for( Int ib = 0; ib < numLBlock; ib++ ){
      deserialize( LcolRecv[ib], sstm, mask );
    }
  }
  else{
    std::vector<LBlock<T> >& Lcol = this->L( LBj( snode.Index, grid_ ) );
    Int startIdx = ( MYROW( grid_ ) == PROW( snode.Index, grid_ ) ) ? 1 : 0;
    LcolRecv.resize( Lcol.size() - startIdx );
    std::copy( Lcol.begin()+startIdx, Lcol.end(), LcolRecv.begin() );
  }
}
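// ComputeDiagUpdate accumulates the contribution of the local L column to the
// diagonal block of Sinv: DiagBuf is decremented by LUpdateBuf^T * L(i,k),
// summed over the off-diagonal blocks owned by this process column.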
template<typename T>
void PMatrix<T>::ComputeDiagUpdate( SuperNodeBufferType & snode )
{
  if( MYCOL( grid_ ) == PCOL( snode.Index, grid_ ) ){
#if ( _DEBUGlevel_ >= 2 )
    statusOFS << std::endl << "["<<snode.Index<<"] "
      << "Updating the diagonal block" << std::endl << std::endl;
#endif
    std::vector<LBlock<T> >& Lcol = this->L( LBj( snode.Index, grid_ ) );

    snode.DiagBuf.Resize( SuperSize( snode.Index, super_ ),
        SuperSize( snode.Index, super_ ) );
    SetValue(snode.DiagBuf, ZERO<T>());

    // The first block is the diagonal block when this process row owns it.
    Int startIb = ( MYROW( grid_ ) == PROW( snode.Index, grid_ ) ) ? 1 : 0;
    for( Int ib = startIb; ib < Lcol.size(); ib++ ){
      LBlock<T> & LB = Lcol[ib];

      gemm_stat.push_back(snode.DiagBuf.m());
      gemm_stat.push_back(snode.DiagBuf.n());
      gemm_stat.push_back(Lcol[ib].numRow);

      // DiagBuf <- DiagBuf - LUpdateBuf(ib)^T * L(ib,k).
      blas::Gemm( 'T', 'N', snode.DiagBuf.m(), snode.DiagBuf.n(), LB.numRow,
          MINUS_ONE<T>(), &snode.LUpdateBuf( snode.RowLocalPtr[ib-startIb], 0 ),
          snode.LUpdateBuf.m(),
          LB.nzval.Data(), LB.nzval.m(), ONE<T>(),
          snode.DiagBuf.Data(), snode.DiagBuf.m() );
    }

#if ( _DEBUGlevel_ >= 1 )
    statusOFS << std::endl << "["<<snode.Index<<"] "
      << "Updated the diagonal block" << std::endl << std::endl;
#endif
  }
}
template<typename T>
void PMatrix<T>::GetEtree( std::vector<Int> & etree_supno )
{
  PushCallStack("PMatrix::GetEtree");
  double begin = MPI_Wtime( );
  Int nsupers = this->NumSuper();

  if( optionsLU_->ColPerm != "PARMETIS" ) {
    // The supernodal elimination tree is derived from the column elimination
    // tree produced by the symbolic factorization.
    etree_supno.resize(this->NumSuper());
    for(Int i = 0; i < superNode->etree.m(); ++i){
      Int curSnode = superNode->superIdx[i];
      Int parentSnode = (superNode->etree[i] >= superNode->etree.m()) ?
        this->NumSuper() : superNode->superIdx[superNode->etree[i]];
      if( curSnode != parentSnode){
        etree_supno[curSnode] = parentSnode;
      }
    }
  }
  else{
    // For PARMETIS orderings the tree is derived from the structure of L:
    // the parent of a supernode is the smallest block row index below it,
    // reduced over all processes.
    std::vector<Int> etree_supno_l( nsupers, nsupers );
    for( Int ksup = 0; ksup < nsupers; ksup++ ){
      if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
        std::vector<LBlock<T> >& Lcol = this->L( LBj(ksup, grid_) );
        Int firstBlk = 0;
        if( MYROW( grid_ ) == PROW( ksup, grid_ ) )
          firstBlk = 1;
        for( Int ib = firstBlk; ib < Lcol.size(); ib++ ){
          etree_supno_l[ksup] = std::min(etree_supno_l[ksup], Lcol[ib].blockIdx);
        }
      }
    }

#if ( _DEBUGlevel_ >= 1 )
    statusOFS << std::endl << " Local supernodal elimination tree is "
      << etree_supno_l << std::endl << std::endl;
#endif

    etree_supno.resize( nsupers );
    mpi::Allreduce( (Int*) &etree_supno_l[0], (Int*) &etree_supno[0],
        nsupers, MPI_MIN, grid_->comm );
    etree_supno[nsupers-1] = nsupers;
  }

  double end = MPI_Wtime( );
  statusOFS << "Building the list took " << end-begin << "s" << std::endl;
}
template<typename T>
void PMatrix<T>::GetWorkSet(
    std::vector<Int> & snodeEtree,
    std::vector<std::vector<Int> > & WSet)
{
  TIMER_START(Compute_WorkSet);

  if (options_->maxPipelineDepth!=1){
    // Assign a level to each supernode from the elimination tree, then group
    // the supernodes of one level into a common working set.
    Int rootParent = numSuper;
    level(rootParent) = -1;
    for(Int i=rootParent-1; i>=0; i-- ){
      level(i) = level(snodeEtree[i])+1;
      numLevel = std::max(numLevel, level(i));
    }
    for(Int i=rootParent-1; i>=0; i-- ){
      if(level[i]>=0){ levelSize(level(i))++; }
    }

    WSet.resize(numLevel, std::vector<Int>());
    for(Int i=0; i<numLevel; i++ ){ WSet[i].reserve(levelSize(i)); }
    for(Int i=rootParent-1; i>=0; i-- ){
      WSet[level(i)].push_back(i);
    }

    // Split any level that exceeds the maximum allowed pipeline depth.
    Int limit = (options_->maxPipelineDepth>0) ?
      std::min(MPI_MAX_COMM, options_->maxPipelineDepth) : MPI_MAX_COMM;
    for (Int lidx=0; lidx<WSet.size() ; lidx++){
      if(WSet[lidx].size()>limit)
      {
        std::vector<std::vector<Int> >::iterator pos = WSet.begin()+lidx+1;
        WSet.insert(pos, std::vector<Int>());
        WSet[lidx+1].insert(WSet[lidx+1].begin(),
            WSet[lidx].begin()+limit, WSet[lidx].end());
        WSet[lidx].erase(WSet[lidx].begin()+limit, WSet[lidx].end());
      }
    }
  }
  else{
    // No pipelining: one supernode per step, processed in decreasing order.
    for( Int ksup = numSuper - 2; ksup >= 0; ksup-- ){
      WSet.push_back(std::vector<Int>());
      WSet.back().push_back(ksup);
    }
  }

  TIMER_STOP(Compute_WorkSet);

#if ( _DEBUGlevel_ >= 1 )
  for (Int lidx=0; lidx<WSet.size() ; lidx++){
    statusOFS << std::endl << "L" << lidx << " is: {";
    for (Int supidx=0; supidx<WSet[lidx].size() ; supidx++){
      statusOFS << WSet[lidx][supidx] << " ["
        << snodeEtree[WSet[lidx][supidx]] << "] ";
    }
    statusOFS << " }" << std::endl;
  }
#endif
}
1210 template<
typename T>
1214 #if defined (PROFILE) || defined(PMPI) || defined(USE_TAU)
1215 Real begin_SendULWaitContentFirst, end_SendULWaitContentFirst, time_SendULWaitContentFirst = 0;
1218 std::vector<std::vector<Int> > & superList = this->WorkingSet();
1219 Int numSteps = superList.size();
1220 Int stepSuper = superList[lidx].size();
1222 TIMER_START(AllocateBuffer);
1225 std::vector<std::vector<MPI_Request> > arrMpireqsSendToBelow;
1226 arrMpireqsSendToBelow.resize( stepSuper, std::vector<MPI_Request>( 2 * grid_->numProcRow, MPI_REQUEST_NULL ));
1227 std::vector<std::vector<MPI_Request> > arrMpireqsSendToRight;
1228 arrMpireqsSendToRight.resize(stepSuper, std::vector<MPI_Request>( 2 * grid_->numProcCol, MPI_REQUEST_NULL ));
1231 std::vector<MPI_Request> arrMpireqsSendToLeft;
1232 arrMpireqsSendToLeft.resize(stepSuper, MPI_REQUEST_NULL );
1234 std::vector<MPI_Request> arrMpireqsSendToAbove;
1235 arrMpireqsSendToAbove.resize(stepSuper, MPI_REQUEST_NULL );
1238 std::vector<MPI_Request> arrMpireqsRecvSizeFromAny;
1239 arrMpireqsRecvSizeFromAny.resize(stepSuper*2 , MPI_REQUEST_NULL);
1240 std::vector<MPI_Request> arrMpireqsRecvContentFromAny;
1241 arrMpireqsRecvContentFromAny.resize(stepSuper*2 , MPI_REQUEST_NULL);
1245 std::vector<SuperNodeBufferType> arrSuperNodes(stepSuper);
1246 for (Int supidx=0; supidx<stepSuper; supidx++){
1247 arrSuperNodes[supidx].Index = superList[lidx][supidx];
1252 int numSentToLeft = 0;
1253 std::vector<int> reqSentToLeft;
1258 TIMER_STOP(AllocateBuffer);
1261 PushCallStack(
"PMatrix::SelInv_P2p::UpdateL");
1263 #if ( _DEBUGlevel_ >= 1 )
1264 statusOFS << std::endl <<
"Communication to the Schur complement." << std::endl << std::endl;
1270 TIMER_START(IRecv_Content_UL);
1272 for (Int supidx=0; supidx<stepSuper ; supidx++){
1274 MPI_Request * mpireqsRecvFromAbove = &arrMpireqsRecvContentFromAny[supidx*2];
1275 MPI_Request * mpireqsRecvFromLeft = &arrMpireqsRecvContentFromAny[supidx*2+1];
1277 if( isRecvFromAbove_( snode.Index ) &&
1278 MYROW( grid_ ) !=
PROW( snode.Index, grid_ ) ){
1280 TreeBcast * bcastUTree = fwdToBelowTree_[snode.Index];
1281 if(bcastUTree!=NULL){
1282 Int myRoot = bcastUTree->GetRoot();
1283 snode.SizeSstrUrowRecv = bcastUTree->GetMsgSize();
1284 snode.SstrUrowRecv.resize( snode.SizeSstrUrowRecv);
1285 MPI_Irecv( &snode.SstrUrowRecv[0], snode.SizeSstrUrowRecv, MPI_BYTE,
1286 myRoot, IDX_TO_TAG(snode.Index,SELINV_TAG_U_CONTENT),
1287 grid_->colComm, mpireqsRecvFromAbove );
1288 #if ( _DEBUGlevel_ >= 1 )
1289 statusOFS << std::endl <<
"["<<snode.Index<<
"] "<<
"Receiving U " << snode.SizeSstrUrowRecv <<
" BYTES from "<<myRoot<<
" on tag "<<IDX_TO_TAG(snode.Index,SELINV_TAG_U_CONTENT)<< std::endl << std::endl;
1294 if( isRecvFromLeft_( snode.Index ) &&
1295 MYCOL( grid_ ) !=
PCOL( snode.Index, grid_ ) ){
1296 TreeBcast * bcastLTree = fwdToRightTree_[snode.Index];
1297 if(bcastLTree!=NULL){
1298 Int myRoot = bcastLTree->GetRoot();
1299 snode.SizeSstrLcolRecv = bcastLTree->GetMsgSize();
1300 snode.SstrLcolRecv.resize(snode.SizeSstrLcolRecv);
1301 MPI_Irecv( &snode.SstrLcolRecv[0], snode.SizeSstrLcolRecv, MPI_BYTE,
1302 myRoot, IDX_TO_TAG(snode.Index,SELINV_TAG_L_CONTENT),
1303 grid_->rowComm, mpireqsRecvFromLeft );
1304 #if ( _DEBUGlevel_ >= 1 )
1305 statusOFS << std::endl <<
"["<<snode.Index<<
"] "<<
"Receiving L " << snode.SizeSstrLcolRecv <<
" BYTES from "<<myRoot<<
" on tag "<<IDX_TO_TAG(snode.Index,SELINV_TAG_L_CONTENT)<< std::endl << std::endl;
1310 TIMER_STOP(IRecv_Content_UL);
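  // Owners of the current supernode serialize their local U row and L column
  // (L without nzval) and forward the byte streams along the broadcast trees
  // to the processes that participate in the update.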
1313 TIMER_START(ISend_Content_UL);
1314 for (Int supidx=0; supidx<stepSuper; supidx++){
1316 std::vector<MPI_Request> & mpireqsSendToBelow = arrMpireqsSendToBelow[supidx];
1317 std::vector<MPI_Request> & mpireqsSendToRight = arrMpireqsSendToRight[supidx];
1319 #if ( _DEBUGlevel_ >= 1 )
1320 statusOFS << std::endl <<
"["<<snode.Index<<
"] "
1321 <<
"Communication for the U part." << std::endl << std::endl;
1324 if(
MYROW( grid_ ) ==
PROW( snode.Index, grid_ ) ){
1325 std::vector<UBlock<T> >& Urow = this->U(
LBi(snode.Index, grid_) );
1327 TIMER_START(Serialize_UL);
1328 std::stringstream sstm;
1330 std::vector<Int> mask( UBlockMask::TOTAL_NUMBER, 1 );
1332 serialize( (Int)Urow.size(), sstm, NO_MASK );
1333 for( Int jb = 0; jb < Urow.size(); jb++ ){
1334 serialize( Urow[jb], sstm, mask );
1336 snode.SstrUrowSend.resize( Size( sstm ) );
1337 sstm.read( &snode.SstrUrowSend[0], snode.SstrUrowSend.size() );
1338 snode.SizeSstrUrowSend = snode.SstrUrowSend.size();
1339 TIMER_STOP(Serialize_UL);
1340 TreeBcast * bcastUTree = fwdToBelowTree_[snode.Index];
1341 if(bcastUTree!=NULL){
1342 bcastUTree->ForwardMessage((
char*)&snode.SstrUrowSend[0], snode.SizeSstrUrowSend,
1343 IDX_TO_TAG(snode.Index,SELINV_TAG_U_CONTENT), &mpireqsSendToBelow[0] );
1344 for( Int idxRecv = 0; idxRecv < bcastUTree->GetDestCount(); ++idxRecv ){
1345 Int iProcRow = bcastUTree->GetDest(idxRecv);
1346 #if ( _DEBUGlevel_ >= 1 )
1347 statusOFS << std::endl <<
"["<<snode.Index<<
"] "<<
"Sending U " << snode.SizeSstrUrowSend <<
" BYTES on tag "<<IDX_TO_TAG(snode.Index,SELINV_TAG_U_CONTENT) << std::endl << std::endl;
1354 #if ( _DEBUGlevel_ >= 1 )
1355 statusOFS << std::endl <<
"["<<snode.Index<<
"] "<<
"Communication for the L part." << std::endl << std::endl;
1361 if(
MYCOL( grid_ ) ==
PCOL( snode.Index, grid_ ) ){
1362 std::vector<LBlock<T> >& Lcol = this->L(
LBj(snode.Index, grid_) );
1363 TIMER_START(Serialize_UL);
1365 std::stringstream sstm;
1366 std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
1367 mask[LBlockMask::NZVAL] = 0;
1371 if(
MYROW( grid_ ) ==
PROW( snode.Index, grid_ ) )
1372 serialize( (Int)Lcol.size() - 1, sstm, NO_MASK );
1374 serialize( (Int)Lcol.size(), sstm, NO_MASK );
1376 for( Int ib = 0; ib < Lcol.size(); ib++ ){
1377 if( Lcol[ib].blockIdx > snode.Index ){
1378 #if ( _DEBUGlevel_ >= 2 )
1379 statusOFS << std::endl <<
"["<<snode.Index<<
"] "<<
"Serializing Block index " << Lcol[ib].blockIdx << std::endl;
1381 serialize( Lcol[ib], sstm, mask );
1384 snode.SstrLcolSend.resize( Size( sstm ) );
1385 sstm.read( &snode.SstrLcolSend[0], snode.SstrLcolSend.size() );
1386 snode.SizeSstrLcolSend = snode.SstrLcolSend.size();
1387 TIMER_STOP(Serialize_UL);
1390 TreeBcast * bcastLTree = fwdToRightTree_[snode.Index];
1391 if(bcastLTree!=NULL){
1392 bcastLTree->ForwardMessage((
char*)&snode.SstrLcolSend[0], snode.SizeSstrLcolSend,
1393 IDX_TO_TAG(snode.Index,SELINV_TAG_L_CONTENT), &mpireqsSendToRight[0] );
1395 for( Int idxRecv = 0; idxRecv < bcastLTree->GetDestCount(); ++idxRecv ){
1396 Int iProcCol = bcastLTree->GetDest(idxRecv);
1397 #if ( _DEBUGlevel_ >= 1 )
1398 statusOFS << std::endl <<
"["<<snode.Index<<
"] "<<
"Sending L " << snode.SizeSstrLcolSend <<
" BYTES on tag "<<IDX_TO_TAG(snode.Index,SELINV_TAG_L_CONTENT) << std::endl << std::endl;
1404 TIMER_STOP(ISend_Content_UL);
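  // Set up the row-wise reduction trees for LUpdateBuf: tag them, allocate
  // their receive buffers, and post the first receives.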
1407 vector<char> redLdone(stepSuper,0);
1408 for (Int supidx=0; supidx<stepSuper; supidx++){
1412 if(redLTree != NULL){
1413 redLTree->SetTag(IDX_TO_TAG(snode.Index,SELINV_TAG_L_REDUCE));
1415 redLTree->AllocRecvBuffers();
1417 redLTree->PostFirstRecv();
1422 TIMER_START(Compute_Sinv_LT);
1424 Int msgForwarded = 0;
1426 Int gemmProcessed = 0;
1430 std::list<Int> readySupidx;
1432 for(Int supidx = 0;supidx<stepSuper;supidx++){
1436 if( isRecvFromAbove_( snode.Index ) && isRecvFromLeft_( snode.Index )){
1438 if(
MYCOL( grid_ ) ==
PCOL( snode.Index, grid_ ) ){
1442 if(
MYROW( grid_ ) ==
PROW( snode.Index, grid_ ) ){
1446 if(snode.isReady==2){
1447 readySupidx.push_back(supidx);
1448 #if ( _DEBUGlevel_ >= 1 )
1449 statusOFS<<std::endl<<
"Locally processing ["<<snode.Index<<
"]"<<std::endl;
1453 else if( (isRecvFromLeft_( snode.Index ) ) &&
MYCOL( grid_ ) !=
PCOL( snode.Index, grid_ ) )
1458 if(redLTree != NULL){
1459 TIMER_START(Reduce_Sinv_LT_Isend);
1461 redLTree->SetLocalBuffer(NULL);
1462 redLTree->SetDataReady(
true);
1463 bool done = redLTree->Progress();
1464 TIMER_STOP(Reduce_Sinv_LT_Isend);
1468 if(
MYROW(grid_)!=
PROW(snode.Index,grid_)){
1469 TreeBcast * bcastUTree = fwdToBelowTree_[snode.Index];
1470 if(bcastUTree != NULL){
1471 if(bcastUTree->GetDestCount()>0){
1477 if(
MYCOL(grid_)!=
PCOL(snode.Index,grid_)){
1478 TreeBcast * bcastLTree = fwdToRightTree_[snode.Index];
1479 if(bcastLTree != NULL){
1480 if(bcastLTree->GetDestCount()>0){
1487 #if ( _DEBUGlevel_ >= 1 )
1488 statusOFS<<std::endl<<
"gemmToDo ="<<gemmToDo<<std::endl;
1489 statusOFS<<std::endl<<
"msgToFwd ="<<msgToFwd<<std::endl;
1493 #if defined (PROFILE)
1494 end_SendULWaitContentFirst=0;
1495 begin_SendULWaitContentFirst=0;
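      // Main event loop: wait on the outstanding content receives, forward
      // the received L/U packets along the broadcast trees when this process
      // is an interior node, and queue supernodes that have received both
      // their L and U data for the local GEMM.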
1498 while(gemmProcessed<gemmToDo || msgForwarded < msgToFwd)
1500 Int reqidx = MPI_UNDEFINED;
1508 int reqIndices[arrMpireqsRecvContentFromAny.size()];
1513 TIMER_START(WaitContent_UL);
1514 #if defined(PROFILE)
1515 if(begin_SendULWaitContentFirst==0){
1516 begin_SendULWaitContentFirst=1;
1517 TIMER_START(WaitContent_UL_First);
1522 MPI_Waitsome(2*stepSuper, &arrMpireqsRecvContentFromAny[0], &numRecv, reqIndices, MPI_STATUSES_IGNORE);
1524 for(
int i =0;i<numRecv;i++){
1525 reqidx = reqIndices[i];
1527 if(reqidx!=MPI_UNDEFINED)
1535 TreeBcast * bcastUTree = fwdToBelowTree_[snode.Index];
1536 if(bcastUTree != NULL){
1537 if(bcastUTree->GetDestCount()>0){
1539 std::vector<MPI_Request> & mpireqsSendToBelow = arrMpireqsSendToBelow[supidx];
1540 #if ( _DEBUGlevel_ >= 1 )
1541 for( Int idxRecv = 0; idxRecv < bcastUTree->GetDestCount(); ++idxRecv ){
1542 Int iProcRow = bcastUTree->GetDest(idxRecv);
1543 statusOFS << std::endl <<
"["<<snode.Index<<
"] "<<
"Forwarding U " << snode.SizeSstrUrowRecv <<
" BYTES to "<<iProcRow<<
" on tag "<<IDX_TO_TAG(snode.Index,SELINV_TAG_U_CONTENT)<< std::endl << std::endl;
1547 bcastUTree->ForwardMessage( (
char*)&snode.SstrUrowRecv[0], snode.SizeSstrUrowRecv,
1548 IDX_TO_TAG(snode.Index,SELINV_TAG_U_CONTENT), &mpireqsSendToBelow[0] );
1549 #if ( _DEBUGlevel_ >= 1 )
1550 for( Int idxRecv = 0; idxRecv < bcastUTree->GetDestCount(); ++idxRecv ){
1551 Int iProcRow = bcastUTree->GetDest(idxRecv);
1552 statusOFS << std::endl <<
"["<<snode.Index<<
"] "<<
"Forwarded U " << snode.SizeSstrUrowRecv <<
" BYTES to "<<iProcRow<<
" on tag "<<IDX_TO_TAG(snode.Index,SELINV_TAG_U_CONTENT)<< std::endl << std::endl;
1560 else if(reqidx%2==1){
1561 TreeBcast * bcastLTree = fwdToRightTree_[snode.Index];
1562 if(bcastLTree != NULL){
1563 if(bcastLTree->GetDestCount()>0){
1565 std::vector<MPI_Request> & mpireqsSendToRight = arrMpireqsSendToRight[supidx];
1566 #if ( _DEBUGlevel_ >= 1 )
1567 for( Int idxRecv = 0; idxRecv < bcastLTree->GetDestCount(); ++idxRecv ){
1568 Int iProcCol = bcastLTree->GetDest(idxRecv);
1569 statusOFS << std::endl <<
"["<<snode.Index<<
"] "<<
"Forwarding L " << snode.SizeSstrLcolRecv <<
" BYTES to "<<iProcCol<<
" on tag "<<IDX_TO_TAG(snode.Index,SELINV_TAG_L_CONTENT)<< std::endl << std::endl;
1573 bcastLTree->ForwardMessage( (
char*)&snode.SstrLcolRecv[0], snode.SizeSstrLcolRecv,
1574 IDX_TO_TAG(snode.Index,SELINV_TAG_L_CONTENT), &mpireqsSendToRight[0] );
1580 #if ( _DEBUGlevel_ >= 1 )
1581 for( Int idxRecv = 0; idxRecv < bcastLTree->GetDestCount(); ++idxRecv ){
1582 Int iProcCol = bcastLTree->GetDest(idxRecv);
1583 statusOFS << std::endl <<
"["<<snode.Index<<
"] "<<
"Forwarded L " << snode.SizeSstrLcolRecv <<
" BYTES to "<<iProcCol<<
" on tag "<<IDX_TO_TAG(snode.Index,SELINV_TAG_L_CONTENT)<< std::endl << std::endl;
1592 #if ( _DEBUGlevel_ >= 1 )
1593 statusOFS<<std::endl<<
"Received data for ["<<snode.Index<<
"] reqidx%2="<<reqidx%2<<
" is ready ?"<<snode.isReady<<std::endl;
1596 if( isRecvFromAbove_( snode.Index ) && isRecvFromLeft_( snode.Index )){
1600 if(snode.isReady==2){
1601 readySupidx.push_back(supidx);
1603 #if defined(PROFILE)
1604 if(end_SendULWaitContentFirst==0){
1605 TIMER_STOP(WaitContent_UL_First);
1606 end_SendULWaitContentFirst=1;
1615 TIMER_STOP(WaitContent_UL);
1617 }
while( (gemmProcessed<gemmToDo && readySupidx.size()==0) || (gemmProcessed==gemmToDo && msgForwarded<msgToFwd) );
      if(readySupidx.size()>0)
      {
        supidx = readySupidx.back();
        readySupidx.pop_back();
        SuperNodeBufferType & snode = arrSuperNodes[supidx];

        // Assemble the buffers and perform the local GEMM for this supernode.
        if( isRecvFromAbove_( snode.Index ) && isRecvFromLeft_( snode.Index ) ){
          std::vector<LBlock<T> > LcolRecv;
          std::vector<UBlock<T> > UrowRecv;

          // Unpack the received (or local) L and U blocks, then build the
          // dense AinvBuf and UBuf work buffers.
          UnpackData(snode, LcolRecv, UrowRecv);
          SelInv_lookup_indexes(snode, LcolRecv, UrowRecv, AinvBuf, UBuf);

          snode.LUpdateBuf.Resize( AinvBuf.m(), SuperSize( snode.Index, super_ ) );

          gemm_stat.push_back(AinvBuf.m());
          gemm_stat.push_back(UBuf.m());
          gemm_stat.push_back(AinvBuf.n());

          // LUpdateBuf <- - AinvBuf * UBuf^T : the contribution of this
          // supernode to the selected inverse, restricted to the local rows.
          TIMER_START(Compute_Sinv_LT_GEMM);
          blas::Gemm( 'N', 'T', AinvBuf.m(), UBuf.m(), AinvBuf.n(), MINUS_ONE<T>(),
              AinvBuf.Data(), AinvBuf.m(),
              UBuf.Data(), UBuf.m(), ZERO<T>(),
              snode.LUpdateBuf.Data(), snode.LUpdateBuf.m() );
          TIMER_STOP(Compute_Sinv_LT_GEMM);

#if ( _DEBUGlevel_ >= 2 )
          statusOFS << std::endl << "["<<snode.Index<<"] "
            << "snode.LUpdateBuf: " << snode.LUpdateBuf << std::endl;
#endif
        }

        // Hand the local result to the row reduction tree and advance it.
        TreeReduce<T> * redLTree = redToLeftTree_[snode.Index];
        if(redLTree != NULL){
          TIMER_START(Reduce_Sinv_LT_Isend);
          redLTree->SetLocalBuffer(snode.LUpdateBuf.Data());
          redLTree->SetDataReady(true);
          bool done = redLTree->Progress();
          TIMER_STOP(Reduce_Sinv_LT_Isend);
        }
        gemmProcessed++;

#if ( _DEBUGlevel_ >= 1 )
        statusOFS << std::endl << "gemmProcessed = " << gemmProcessed
          << "/" << gemmToDo << std::endl;
#endif
      }

      // Keep the pending row reductions moving while waiting for more work.
      for (Int supidx=0; supidx<stepSuper; supidx++){
        SuperNodeBufferType & snode = arrSuperNodes[supidx];
        TreeReduce<T> * redLTree = redToLeftTree_[snode.Index];
        if(redLTree != NULL && !redLdone[supidx]){
          bool done = redLTree->Progress();
        }
      }
    }
    TIMER_STOP(Compute_Sinv_LT);
1694 TIMER_START(Reduce_Sinv_LT);
1696 bool all_done =
false;
1701 for (Int supidx=0; supidx<stepSuper; supidx++){
1706 if(redLTree != NULL && !redLdone[supidx]){
1707 bool done = redLTree->Progress();
1709 if(
MYCOL( grid_ ) ==
PCOL( snode.Index, grid_ ) ){
1711 Int numRowLUpdateBuf;
1712 std::vector<LBlock<T> >& Lcol = this->L(
LBj( snode.Index, grid_ ) );
1713 if(
MYROW( grid_ ) !=
PROW( snode.Index, grid_ ) ){
1714 snode.RowLocalPtr.resize( Lcol.size() + 1 );
1715 snode.BlockIdxLocal.resize( Lcol.size() );
1716 snode.RowLocalPtr[0] = 0;
1717 for( Int ib = 0; ib < Lcol.size(); ib++ ){
1718 snode.RowLocalPtr[ib+1] = snode.RowLocalPtr[ib] + Lcol[ib].numRow;
1719 snode.BlockIdxLocal[ib] = Lcol[ib].blockIdx;
1723 snode.RowLocalPtr.resize( Lcol.size() );
1724 snode.BlockIdxLocal.resize( Lcol.size() - 1 );
1725 snode.RowLocalPtr[0] = 0;
1726 for( Int ib = 1; ib < Lcol.size(); ib++ ){
1727 snode.RowLocalPtr[ib] = snode.RowLocalPtr[ib-1] + Lcol[ib].numRow;
1728 snode.BlockIdxLocal[ib-1] = Lcol[ib].blockIdx;
1731 numRowLUpdateBuf = *snode.RowLocalPtr.rbegin();
1733 if( numRowLUpdateBuf > 0 ){
1734 if( snode.LUpdateBuf.m() == 0 && snode.LUpdateBuf.n() == 0 ){
1735 snode.LUpdateBuf.Resize( numRowLUpdateBuf,
SuperSize( snode.Index, super_ ) );
1740 redLTree->SetLocalBuffer(snode.LUpdateBuf.Data());
1744 redLTree->CleanupBuffers();
1747 all_done = all_done && (done || redLdone[supidx]);
1751 TIMER_STOP(Reduce_Sinv_LT);
1760 PushCallStack(
"PMatrix::SelInv_P2p::UpdateD");
1763 TIMER_START(Update_Diagonal);
1765 for (Int supidx=0; supidx<stepSuper; supidx++){
1768 ComputeDiagUpdate(snode);
1773 if(redDTree != NULL){
1775 if(
MYROW( grid_ ) ==
PROW( snode.Index, grid_ ) ){
1776 if(snode.DiagBuf.Size()==0){
1777 snode.DiagBuf.Resize(
SuperSize( snode.Index, super_ ),
SuperSize( snode.Index, super_ ));
1778 SetValue(snode.DiagBuf, ZERO<T>());
1783 redDTree->SetLocalBuffer(snode.DiagBuf.Data());
1784 if(!redDTree->IsAllocated()){
1785 redDTree->SetTag(IDX_TO_TAG(snode.Index,SELINV_TAG_D_REDUCE));
1786 redDTree->AllocRecvBuffers();
1788 redDTree->PostFirstRecv();
1791 redDTree->SetDataReady(
true);
1792 bool done = redDTree->Progress();
1796 for (Int supidx=0; supidx<stepSuper; supidx++){
1799 if(redDTree != NULL){
1800 if(redDTree->IsAllocated()){
1801 bool done = redDTree->Progress();
1806 TIMER_STOP(Update_Diagonal);
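  // Drain the diagonal reduction trees: when a reduction completes on the
  // owner of the diagonal block, the reduced DiagBuf is folded into L(k,k).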
    TIMER_START(Reduce_Diagonal);

    vector<char> is_done(stepSuper,0);
    bool all_done = false;
    while( !all_done ){
      all_done = true;
      for (Int supidx=0; supidx<stepSuper; supidx++){
        SuperNodeBufferType & snode = arrSuperNodes[supidx];
        TreeReduce<T> * redDTree = redToAboveTree_[snode.Index];
        if(redDTree != NULL){
          bool done = redDTree->Progress();
          if(done && !is_done[supidx]){
            if( MYCOL( grid_ ) == PCOL( snode.Index, grid_ ) ){
              if( MYROW( grid_ ) == PROW( snode.Index, grid_ ) ){
                LBlock<T> & LB = this->L( LBj( snode.Index, grid_ ) )[0];
                // Add the reduced diagonal update and restore symmetry.
                blas::Axpy( LB.numRow * LB.numCol, ONE<T>(),
                    snode.DiagBuf.Data(), 1, LB.nzval.Data(), 1 );
                Symmetrize( LB.nzval );
              }
            }
            is_done[supidx] = 1;
            redDTree->CleanupBuffers();
          }
          all_done = all_done && (done || is_done[supidx]);
        }
      }
    }

    TIMER_STOP(Reduce_Diagonal);
1850 PushCallStack(
"PMatrix::SelInv_P2p::UpdateU");
1855 SendRecvCD_UpdateU(arrSuperNodes, stepSuper);
1862 PushCallStack(
"PMatrix::SelInv_P2p::UpdateLFinal");
    TIMER_START(Update_L);
    for (Int supidx=0; supidx<stepSuper; supidx++){
      SuperNodeBufferType & snode = arrSuperNodes[supidx];

#if ( _DEBUGlevel_ >= 1 )
      statusOFS << std::endl << "["<<snode.Index<<"] "
        << "Finish updating the L part by filling LUpdateBufReduced back to L"
        << std::endl << std::endl;
#endif

      if( MYCOL( grid_ ) == PCOL( snode.Index, grid_ ) && snode.LUpdateBuf.m() > 0 ){
        std::vector<LBlock<T> >& Lcol = this->L( LBj( snode.Index, grid_ ) );
        // The first block is the diagonal block when this process row owns it.
        Int startBlock = ( MYROW( grid_ ) == PROW( snode.Index, grid_ ) ) ? 1 : 0;
        for( Int ib = startBlock; ib < Lcol.size(); ib++ ){
          LBlock<T> & LB = Lcol[ib];
          lapack::Lacpy( 'A', LB.numRow, LB.numCol,
              &snode.LUpdateBuf( snode.RowLocalPtr[ib-startBlock], 0 ),
              snode.LUpdateBuf.m(), LB.nzval.Data(), LB.numRow );
        }
      }
    }
    TIMER_STOP(Update_L);
1891 TIMER_START(Barrier);
1894 for (Int supidx=0; supidx<stepSuper; supidx++){
1898 if(redLTree != NULL){
1900 redLTree->CleanupBuffers();
1905 for (Int supidx=0; supidx<stepSuper; supidx++){
1909 if(redDTree != NULL){
1911 redDTree->CleanupBuffers();
1915 mpi::Waitall(arrMpireqsRecvContentFromAny);
1917 for (Int supidx=0; supidx<stepSuper; supidx++){
1918 Int ksup = superList[lidx][supidx];
1919 std::vector<MPI_Request> & mpireqsSendToRight = arrMpireqsSendToRight[supidx];
1920 std::vector<MPI_Request> & mpireqsSendToBelow = arrMpireqsSendToBelow[supidx];
1922 #if ( _DEBUGlevel_ >= 1 )
1923 statusOFS<<
"["<<ksup<<
"] mpireqsSendToRight"<<std::endl;
1925 mpi::Waitall( mpireqsSendToRight );
1926 #if ( _DEBUGlevel_ >= 1 )
1927 statusOFS<<
"["<<ksup<<
"] mpireqsSendToBelow"<<std::endl;
1929 mpi::Waitall( mpireqsSendToBelow );
1933 #if ( _DEBUGlevel_ >= 1 )
1934 statusOFS<<
"barrier done"<<std::endl;
1936 TIMER_STOP(Barrier);
1941 if (options_->maxPipelineDepth!=-1)
1944 MPI_Barrier(grid_->comm);
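// ConstructCommunicationPattern_P2p precomputes, for every supernode, which
// processes participate in the broadcasts of L and U, in the reductions of
// the updates, and in the cross-diagonal exchanges. The boolean masks and the
// broadcast/reduction trees built here drive all communication in SelInv_P2p.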
1950 template<
typename T>
1953 ConstructCommunicationPattern_P2p();
1958 template<
typename T>
1962 PushCallStack(
"PMatrix::ConstructCommunicationPattern_P2p");
1968 TIMER_START(Allocate);
1971 PushCallStack(
"Initialize the communication pattern" );
1975 fwdToBelowTree_.resize(numSuper, NULL );
1976 fwdToRightTree_.resize(numSuper, NULL );
1977 redToLeftTree_.resize(numSuper, NULL );
1978 redToAboveTree_.resize(numSuper, NULL );
1980 isSendToBelow_.Resize(grid_->numProcRow, numSuper);
1981 isSendToRight_.Resize(grid_->numProcCol, numSuper);
1982 isSendToDiagonal_.Resize( numSuper );
1985 SetValue( isSendToDiagonal_,
false );
1987 isSendToCrossDiagonal_.Resize(grid_->numProcCol+1, numSuper );
1988 SetValue( isSendToCrossDiagonal_,
false );
1989 isRecvFromCrossDiagonal_.Resize(grid_->numProcRow+1, numSuper );
1990 SetValue( isRecvFromCrossDiagonal_,
false );
1992 isRecvFromAbove_.Resize( numSuper );
1993 isRecvFromLeft_.Resize( numSuper );
1994 isRecvFromBelow_.Resize( grid_->numProcRow, numSuper );
1995 SetValue( isRecvFromAbove_,
false );
1996 SetValue( isRecvFromBelow_,
false );
1997 SetValue( isRecvFromLeft_,
false );
2002 TIMER_STOP(Allocate);
2004 TIMER_START(GetEtree);
2005 std::vector<Int> snodeEtree(this->
NumSuper());
2006 GetEtree(snodeEtree);
2007 TIMER_STOP(GetEtree);
2012 PushCallStack(
"Local column communication" );
2014 #if ( _DEBUGlevel_ >= 1 )
2015 statusOFS << std::endl <<
"Local column communication" << std::endl;
2020 std::vector<std::set<Int> > localColBlockRowIdx;
2022 localColBlockRowIdx.resize( this->NumLocalBlockCol() );
2025 TIMER_START(Column_communication);
2028 for( Int ksup = 0; ksup < numSuper; ksup++ ){
2030 if(
MYCOL( grid_ ) ==
PCOL( ksup, grid_ ) ){
2034 std::vector<Int> tAllBlockRowIdx;
2035 std::vector<Int> & colBlockIdx = ColBlockIdx(
LBj(ksup, grid_));
2036 TIMER_START(Allgatherv_Column_communication);
2037 if( grid_ -> mpisize != 1 )
2038 mpi::Allgatherv( colBlockIdx, tAllBlockRowIdx, grid_->colComm );
2040 tAllBlockRowIdx = colBlockIdx;
2042 TIMER_STOP(Allgatherv_Column_communication);
2044 localColBlockRowIdx[
LBj( ksup, grid_ )].insert(
2045 tAllBlockRowIdx.begin(), tAllBlockRowIdx.end() );
2047 #if ( _DEBUGlevel_ >= 1 )
2049 <<
" Column block " << ksup
2050 <<
" has the following nonzero block rows" << std::endl;
2051 for( std::set<Int>::iterator si = localColBlockRowIdx[
LBj( ksup, grid_ )].begin();
2052 si != localColBlockRowIdx[
LBj( ksup, grid_ )].end();
2054 statusOFS << *si <<
" ";
2056 statusOFS << std::endl;
2067 TIMER_STOP(Column_communication);
2069 TIMER_START(Row_communication);
2071 PushCallStack(
"Local row communication" );
2073 #if ( _DEBUGlevel_ >= 1 )
2074 statusOFS << std::endl <<
"Local row communication" << std::endl;
2079 std::vector<std::set<Int> > localRowBlockColIdx;
2081 localRowBlockColIdx.resize( this->NumLocalBlockRow() );
2083 for( Int ksup = 0; ksup < numSuper; ksup++ ){
2085 if(
MYROW( grid_ ) ==
PROW( ksup, grid_ ) ){
2088 std::vector<Int> tAllBlockColIdx;
2089 std::vector<Int> & rowBlockIdx = RowBlockIdx(
LBi(ksup, grid_));
2090 TIMER_START(Allgatherv_Row_communication);
2091 if( grid_ -> mpisize != 1 )
2092 mpi::Allgatherv( rowBlockIdx, tAllBlockColIdx, grid_->rowComm );
2094 tAllBlockColIdx = rowBlockIdx;
2096 TIMER_STOP(Allgatherv_Row_communication);
2098 localRowBlockColIdx[
LBi( ksup, grid_ )].insert(
2099 tAllBlockColIdx.begin(), tAllBlockColIdx.end() );
2101 #if ( _DEBUGlevel_ >= 1 )
2103 <<
" Row block " << ksup
2104 <<
" has the following nonzero block columns" << std::endl;
2105 for( std::set<Int>::iterator si = localRowBlockColIdx[
LBi( ksup, grid_ )].begin();
2106 si != localRowBlockColIdx[
LBi( ksup, grid_ )].end();
2108 statusOFS << *si <<
" ";
2110 statusOFS << std::endl;
2120 TIMER_STOP(Row_communication);
2122 TIMER_START(STB_RFA);
2124 PushCallStack(
"SendToBelow / RecvFromAbove");
2126 for( Int ksup = 0; ksup < numSuper - 1; ksup++ ){
2128 Int jsup = snodeEtree[ksup];
2129 while(jsup<numSuper){
2130 Int jsupLocalBlockCol =
LBj( jsup, grid_ );
2131 Int jsupProcCol =
PCOL( jsup, grid_ );
2132 if(
MYCOL( grid_ ) == jsupProcCol ){
2134 if( localColBlockRowIdx[jsupLocalBlockCol].count( ksup ) > 0 ) {
2135 for( std::set<Int>::iterator si = localColBlockRowIdx[jsupLocalBlockCol].begin();
2136 si != localColBlockRowIdx[jsupLocalBlockCol].end(); si++ ){
2138 Int isupProcRow =
PROW( isup, grid_ );
2140 if(
MYROW( grid_ ) == isupProcRow ){
2141 isRecvFromAbove_(ksup) =
true;
2143 if(
MYROW( grid_ ) ==
PROW( ksup, grid_ ) ){
2144 isSendToBelow_( isupProcRow, ksup ) =
true;
2150 jsup = snodeEtree[jsup];
2156 TIMER_STOP(STB_RFA);
2158 TIMER_START(STR_RFL_RFB);
2160 PushCallStack(
"SendToRight / RecvFromLeft");
2162 for( Int ksup = 0; ksup < numSuper - 1; ksup++ ){
2164 Int isup = snodeEtree[ksup];
2165 while(isup<numSuper){
2166 Int isupLocalBlockRow =
LBi( isup, grid_ );
2167 Int isupProcRow =
PROW( isup, grid_ );
2168 if(
MYROW( grid_ ) == isupProcRow ){
2170 if( localRowBlockColIdx[isupLocalBlockRow].count( ksup ) > 0 ){
2171 for( std::set<Int>::iterator si = localRowBlockColIdx[isupLocalBlockRow].begin();
2172 si != localRowBlockColIdx[isupLocalBlockRow].end(); si++ ){
2174 Int jsupProcCol =
PCOL( jsup, grid_ );
2176 if(
MYCOL( grid_ ) == jsupProcCol ){
2177 isRecvFromLeft_(ksup) =
true;
2179 if(
MYCOL( grid_ ) ==
PCOL( ksup, grid_ ) ){
2180 isSendToRight_( jsupProcCol, ksup ) =
true;
2187 if(
MYCOL( grid_ ) ==
PCOL(ksup, grid_) ){
2188 if(
MYROW( grid_ ) ==
PROW( ksup, grid_ ) ){
2189 isRecvFromBelow_(isupProcRow,ksup) =
true;
2191 else if (
MYROW(grid_) == isupProcRow){
2192 isSendToDiagonal_(ksup)=
true;
2195 isup = snodeEtree[isup];
2202 TIMER_STOP(STR_RFL_RFB);
2205 TIMER_START(BUILD_BCAST_TREES);
2208 vector<double> SeedRFL(numSuper,0.0);
2209 vector<Int> aggRFL(numSuper);
2210 vector<Int> globalAggRFL(numSuper*grid_->numProcCol);
2211 for( Int ksup = 0; ksup < numSuper ; ksup++ ){
2213 std::vector<LBlock<T> >& Lcol = this->L(
LBj(ksup, grid_) );
2218 totalSize+=
sizeof(Int);
2219 for( Int ib = 0; ib < Lcol.size(); ib++ ){
2220 if( Lcol[ib].blockIdx > ksup ){
2222 totalSize+=3*
sizeof(Int);
2223 totalSize+=
sizeof(Int)+Lcol[ib].rows.ByteSize();
2228 aggRFL[ksup]=totalSize;
2229 SeedRFL[ksup]=rand();
2232 else if(isRecvFromLeft_(ksup)){
2240 MPI_Allgather(&aggRFL[0],numSuper*
sizeof(Int),MPI_BYTE,
2241 &globalAggRFL[0],numSuper*
sizeof(Int),MPI_BYTE,
2243 MPI_Allreduce(MPI_IN_PLACE,&SeedRFL[0],numSuper,MPI_DOUBLE,MPI_MAX,grid_->rowComm);
2245 vector<double> SeedRFA(numSuper,0.0);
2246 vector<Int> aggRFA(numSuper);
2247 vector<Int> globalAggRFA(numSuper*grid_->numProcRow);
2248 for( Int ksup = 0; ksup < numSuper ; ksup++ ){
2250 std::vector<UBlock<T> >& Urow = this->U(
LBi(ksup, grid_) );
2255 totalSize+=
sizeof(Int);
2256 for( Int jb = 0; jb < Urow.size(); jb++ ){
2257 if( Urow[jb].blockIdx >= ksup ){
2259 totalSize+=3*
sizeof(Int);
2260 totalSize+=
sizeof(Int)+Urow[jb].cols.ByteSize();
2261 totalSize+= 2*
sizeof(Int)+Urow[jb].nzval.ByteSize();
2264 aggRFA[ksup]=totalSize;
2265 SeedRFA[ksup]=rand();
2267 else if(isRecvFromAbove_(ksup)){
2277 MPI_Allgather(&aggRFA[0],numSuper*
sizeof(Int),MPI_BYTE,
2278 &globalAggRFA[0],numSuper*
sizeof(Int),MPI_BYTE,
2280 MPI_Allreduce(MPI_IN_PLACE,&SeedRFA[0],numSuper,MPI_DOUBLE,MPI_MAX,grid_->colComm);
2282 for( Int ksup = 0; ksup < numSuper; ksup++ ){
2285 for( Int iProcRow = 0; iProcRow < grid_->numProcRow; iProcRow++ ){
2286 Int isRFA = globalAggRFA[iProcRow*numSuper + ksup];
2288 if( iProcRow !=
PROW( ksup, grid_ ) ){
2289 set_ranks.insert(iProcRow);
2297 if( isRecvFromAbove_(ksup) || CountSendToBelow(ksup)>0 ){
2298 vector<Int> tree_ranks;
2299 tree_ranks.push_back(
PROW(ksup,grid_));
2300 tree_ranks.insert(tree_ranks.end(),set_ranks.begin(),set_ranks.end());
2301 TreeBcast * & BcastUTree = fwdToBelowTree_[ksup];
2302 BcastUTree = TreeBcast::Create(this->grid_->colComm,&tree_ranks[0],tree_ranks.size(),msgSize,SeedRFA[ksup]);
2304 BcastUTree->SetGlobalComm(grid_->comm);
2309 for( Int iProcCol = 0; iProcCol < grid_->numProcCol; iProcCol++ ){
2310 Int isRFL = globalAggRFL[iProcCol*numSuper + ksup];
2312 if( iProcCol !=
PCOL( ksup, grid_ ) ){
2313 set_ranks.insert(iProcCol);
2321 if( isRecvFromLeft_(ksup) || CountSendToRight(ksup)>0 ){
2322 vector<Int> tree_ranks;
2323 tree_ranks.push_back(
PCOL(ksup,grid_));
2324 tree_ranks.insert(tree_ranks.end(),set_ranks.begin(),set_ranks.end());
2326 TreeBcast * & BcastLTree = fwdToRightTree_[ksup];
2327 BcastLTree = TreeBcast::Create(this->grid_->rowComm,&tree_ranks[0],tree_ranks.size(),msgSize,SeedRFL[ksup]);
2329 BcastLTree->SetGlobalComm(grid_->comm);
2335 TIMER_STOP(BUILD_BCAST_TREES);
2336 TIMER_START(BUILD_REDUCE_D_TREE);
2337 vector<double> SeedSTD(numSuper,0.0);
2338 vector<Int> aggSTD(numSuper);
2339 vector<Int> globalAggSTD(numSuper*grid_->numProcRow);
2340 for( Int ksup = 0; ksup < numSuper; ksup++ ){
2343 aggSTD[ksup]=totalSize;
2344 SeedSTD[ksup]=rand();
2346 else if(isSendToDiagonal_(ksup)){
2356 MPI_Allgather(&aggSTD[0],numSuper*
sizeof(Int),MPI_BYTE,
2357 &globalAggSTD[0],numSuper*
sizeof(Int),MPI_BYTE,
2359 MPI_Allreduce(MPI_IN_PLACE,&SeedSTD[0],numSuper,MPI_DOUBLE,MPI_MAX,grid_->colComm);
2362 for( Int ksup = 0; ksup < numSuper; ksup++ ){
2363 if(
MYCOL( grid_ ) ==
PCOL(ksup, grid_) ){
2366 for( Int iProcRow = 0; iProcRow < grid_->numProcRow; iProcRow++ ){
2367 Int isSTD = globalAggSTD[iProcRow*numSuper + ksup];
2369 if( iProcRow !=
PROW( ksup, grid_ ) ){
2370 set_ranks.insert(iProcRow);
2378 Int amISTD = globalAggSTD[
MYROW(grid_)*numSuper + ksup];
2385 vector<Int> tree_ranks;
2386 tree_ranks.push_back(
PROW(ksup,grid_));
2387 tree_ranks.insert(tree_ranks.end(),set_ranks.begin(),set_ranks.end());
2394 redDTree =
TreeReduce<T>::Create(this->grid_->colComm,&tree_ranks[0],tree_ranks.size(),msgSize,SeedSTD[ksup]);
2396 redDTree->SetGlobalComm(grid_->comm);
2402 TIMER_STOP(BUILD_REDUCE_D_TREE);
2406 TIMER_START(BUILD_REDUCE_L_TREE);
2407 vector<double> SeedRTL(numSuper,0.0);
2408 vector<Int> aggRTL(numSuper);
2409 vector<Int> globalAggRTL(numSuper*grid_->numProcCol);
2410 for( Int ksup = 0; ksup < numSuper ; ksup++ ){
2412 std::vector<LBlock<T> >& Lcol = this->L(
LBj(ksup, grid_) );
2418 Int numRowLUpdateBuf = 0;
2419 if(
MYROW( grid_ ) !=
PROW( ksup, grid_ ) ){
2420 for( Int ib = 0; ib < Lcol.size(); ib++ ){
2421 numRowLUpdateBuf += Lcol[ib].numRow;
2425 for( Int ib = 1; ib < Lcol.size(); ib++ ){
2426 numRowLUpdateBuf += Lcol[ib].numRow;
2431 totalSize = numRowLUpdateBuf*
SuperSize( ksup, super_ )*
sizeof(T);
2433 aggRTL[ksup]=totalSize;
2435 SeedRTL[ksup]=rand();
2437 else if(isRecvFromLeft_(ksup)){
2445 MPI_Allgather(&aggRTL[0],numSuper*
sizeof(Int),MPI_BYTE,
2446 &globalAggRTL[0],numSuper*
sizeof(Int),MPI_BYTE,
2448 MPI_Allreduce(MPI_IN_PLACE,&SeedRTL[0],numSuper,MPI_DOUBLE,MPI_MAX,grid_->rowComm);
2451 for( Int ksup = 0; ksup < numSuper ; ksup++ ){
2454 for( Int iProcCol = 0; iProcCol < grid_->numProcCol; iProcCol++ ){
2455 Int isRTL = globalAggRTL[iProcCol*numSuper + ksup];
2457 if( iProcCol !=
PCOL( ksup, grid_ ) ){
2458 set_ranks.insert(iProcCol);
2466 if( isRecvFromLeft_(ksup) || CountSendToRight(ksup)>0 ){
2467 vector<Int> tree_ranks;
2468 tree_ranks.push_back(
PCOL(ksup,grid_));
2469 tree_ranks.insert(tree_ranks.end(),set_ranks.begin(),set_ranks.end());
2472 redLTree =
TreeReduce<T>::Create(this->grid_->rowComm,&tree_ranks[0],tree_ranks.size(),msgSize,SeedRTL[ksup]);
2474 redLTree->SetGlobalComm(grid_->comm);
2479 TIMER_STOP(BUILD_REDUCE_L_TREE);
2487 #if ( _DEBUGlevel_ >= 1 )
2488 statusOFS << std::endl <<
"isSendToBelow:" << std::endl;
2489 for(
int j = 0;j< isSendToBelow_.n();j++){
2490 statusOFS <<
"["<<j<<
"] ";
2491 for(
int i =0; i < isSendToBelow_.m();i++){
2492 statusOFS<< isSendToBelow_(i,j) <<
" ";
2494 statusOFS<<std::endl;
2497 statusOFS << std::endl <<
"isRecvFromAbove:" << std::endl;
2498 for(
int j = 0;j< isRecvFromAbove_.m();j++){
2499 statusOFS <<
"["<<j<<
"] "<< isRecvFromAbove_(j)<<std::endl;
2502 #if ( _DEBUGlevel_ >= 1 )
2503 statusOFS << std::endl <<
"isSendToRight:" << std::endl;
2504 for(
int j = 0;j< isSendToRight_.n();j++){
2505 statusOFS <<
"["<<j<<
"] ";
2506 for(
int i =0; i < isSendToRight_.m();i++){
2507 statusOFS<< isSendToRight_(i,j) <<
" ";
2509 statusOFS<<std::endl;
2512 statusOFS << std::endl <<
"isRecvFromLeft:" << std::endl;
2513 for(
int j = 0;j< isRecvFromLeft_.m();j++){
2514 statusOFS <<
"["<<j<<
"] "<< isRecvFromLeft_(j)<<std::endl;
2517 statusOFS << std::endl <<
"isRecvFromBelow:" << std::endl;
2518 for(
int j = 0;j< isRecvFromBelow_.n();j++){
2519 statusOFS <<
"["<<j<<
"] ";
2520 for(
int i =0; i < isRecvFromBelow_.m();i++){
2521 statusOFS<< isRecvFromBelow_(i,j) <<
" ";
2523 statusOFS<<std::endl;
2533 TIMER_START(STCD_RFCD);
2537 PushCallStack(
"SendToCrossDiagonal / RecvFromCrossDiagonal");
2539 for( Int ksup = 0; ksup < numSuper - 1; ksup++ ){
2540 if(
MYCOL( grid_ ) ==
PCOL( ksup, grid_ ) ){
2541 for( std::set<Int>::iterator si = localColBlockRowIdx[
LBj( ksup, grid_ )].begin();
2542 si != localColBlockRowIdx[
LBj( ksup, grid_ )].end(); si++ ){
2544 Int isupProcRow =
PROW( isup, grid_ );
2545 Int isupProcCol =
PCOL( isup, grid_ );
2546 if( isup > ksup &&
MYROW( grid_ ) == isupProcRow ){
2547 isSendToCrossDiagonal_(grid_->numProcCol, ksup ) =
true;
2548 isSendToCrossDiagonal_(isupProcCol, ksup ) =
true;
2554 for( Int ksup = 0; ksup < numSuper - 1; ksup++ ){
2555 if(
MYROW( grid_ ) ==
PROW( ksup, grid_ ) ){
2556 for( std::set<Int>::iterator si = localRowBlockColIdx[
LBi(ksup, grid_) ].begin();
2557 si != localRowBlockColIdx[
LBi(ksup, grid_) ].end(); si++ ){
2559 Int jsupProcCol =
PCOL( jsup, grid_ );
2560 Int jsupProcRow =
PROW( jsup, grid_ );
2561 if( jsup > ksup &&
MYCOL(grid_) == jsupProcCol ){
2562 isRecvFromCrossDiagonal_(grid_->numProcRow, ksup ) =
true;
2563 isRecvFromCrossDiagonal_(jsupProcRow, ksup ) =
true;
2568 #if ( _DEBUGlevel_ >= 1 )
2569 statusOFS << std::endl <<
"isSendToCrossDiagonal:" << std::endl;
2570 for(
int j =0; j < isSendToCrossDiagonal_.n();j++){
2571 if(isSendToCrossDiagonal_(grid_->numProcCol,j)){
2572 statusOFS <<
"["<<j<<
"] ";
2573 for(
int i =0; i < isSendToCrossDiagonal_.m()-1;i++){
2574 if(isSendToCrossDiagonal_(i,j))
2576 statusOFS<<
PNUM(
PROW(j,grid_),i,grid_)<<
" ";
2579 statusOFS<<std::endl;
2583 statusOFS << std::endl <<
"isRecvFromCrossDiagonal:" << std::endl;
2584 for(
int j =0; j < isRecvFromCrossDiagonal_.n();j++){
2585 if(isRecvFromCrossDiagonal_(grid_->numProcRow,j)){
2586 statusOFS <<
"["<<j<<
"] ";
2587 for(
int i =0; i < isRecvFromCrossDiagonal_.m()-1;i++){
2588 if(isRecvFromCrossDiagonal_(i,j))
2590 statusOFS<<
PNUM(i,
PCOL(j,grid_),grid_)<<
" ";
2593 statusOFS<<std::endl;
2604 TIMER_STOP(STCD_RFCD);
2611 GetWorkSet(snodeEtree,this->WorkingSet());
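// The routine below dumps the recorded GEMM dimensions (m, n, z triples) and
// the per-message communication statistics collected during the run.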
template<typename T>

  statOFS << "m" << "\t" << "n" << "\t" << "z" << std::endl;
  for(auto it = gemm_stat.begin(); it != gemm_stat.end(); it += 3){
    statOFS << *it << "\t" << *(it+1) << "\t" << *(it+2) << std::endl;
  }

  commOFS << HEADER_COMM << std::endl;
  for(auto it = comm_stat.begin(); it != comm_stat.end(); it += 4){
    commOFS << LINE_COMM(it) << std::endl;
  }
template<typename T>
void PMatrix<T>::SelInv_P2p ( )
{
  TIMER_START(SelInv_P2p);

  PushCallStack("PMatrix::SelInv_P2p");

  std::vector<std::vector<Int> > & superList = this->WorkingSet();
  Int numSteps = superList.size();

  // Process the supernodes level by level, following the working set.
  for (Int lidx=0; lidx<numSteps ; lidx++){
    Int stepSuper = superList[lidx].size();
    SelInvIntra_P2p(lidx);
  }

  TIMER_STOP(SelInv_P2p);
}
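// PreSelInv prepares the factors for the selected inversion: each L(i,k) is
// post-multiplied by L(k,k)^{-1} after broadcasting the diagonal block down
// the process column, and the scaled L blocks are then transposed into the
// corresponding U(k,i) blocks across the cross-diagonal.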
template<typename T>
void PMatrix<T>::PreSelInv ( )
{
  PushCallStack("PMatrix::PreSelInv");

  Int numSuper = this->NumSuper();

  PushCallStack("L(i,k) <- L(i,k) * L(k,k)^{-1}");
#if ( _DEBUGlevel_ >= 1 )
  statusOFS << std::endl << "L(i,k) <- L(i,k) * L(k,k)^{-1}" << std::endl << std::endl;
#endif
  for( Int ksup = 0; ksup < numSuper; ksup++ ){
    if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
      // Broadcast the diagonal block L(k,k) down the process column, then
      // solve against it on every off-diagonal block of the column.
      NumMat<T> nzvalLDiag;
      std::vector<LBlock<T> >& Lcol = this->L( LBj( ksup, grid_ ) );
      if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
        nzvalLDiag = Lcol[0].nzval;
        if( nzvalLDiag.m() != SuperSize(ksup, super_) ||
            nzvalLDiag.n() != SuperSize(ksup, super_) ){
          throw std::runtime_error(
              "The size of the diagonal block of L is wrong." );
        }
      }
      else{
        nzvalLDiag.Resize( SuperSize(ksup, super_), SuperSize(ksup, super_) );
      }

      MPI_Bcast( (void*)nzvalLDiag.Data(), nzvalLDiag.ByteSize(),
          MPI_BYTE, PROW( ksup, grid_ ), grid_->colComm );

      // Triangular solve against the unit lower-triangular diagonal block.
      for( Int ib = 0; ib < Lcol.size(); ib++ ){
        LBlock<T> & LB = Lcol[ib];
        if( LB.blockIdx > ksup ){
#if ( _DEBUGlevel_ >= 2 )
          if( LBj( ksup, grid_ ) == 0 ){
            statusOFS << "Diag L(" << ksup << ", " << ksup
              << "): " << nzvalLDiag << std::endl;
            statusOFS << "Before solve L(" << LB.blockIdx << ", " << ksup
              << "): " << LB.nzval << std::endl;
          }
#endif
          blas::Trsm( 'R', 'L', 'N', 'U', LB.numRow, LB.numCol, ONE<T>(),
              nzvalLDiag.Data(), LB.numCol, LB.nzval.Data(), LB.numRow );
#if ( _DEBUGlevel_ >= 2 )
          if( LBj( ksup, grid_ ) == 0 ){
            statusOFS << "After solve L(" << LB.blockIdx << ", " << ksup
              << "): " << LB.nzval << std::endl;
          }
#endif
        }
      }
    }
  }
2744 PushCallStack("U(k,i) <- L(i,k)");
2746 #if ( _DEBUGlevel_ >= 1 )
2747 statusOFS << std::endl << "U(k,i) <- L(i,k)" << std::endl << std::endl;
2750 for( Int ksup = 0; ksup < numSuper; ksup++ ){
2751 Int ksupProcRow = PROW( ksup, grid_ );
2752 Int ksupProcCol = PCOL( ksup, grid_ );
2754 Int sendCount = CountSendToCrossDiagonal(ksup);
2755 Int recvCount = CountRecvFromCrossDiagonal(ksup);
2757 std::vector<MPI_Request> arrMpiReqsSend( sendCount, MPI_REQUEST_NULL );
2758 std::vector<MPI_Request> arrMpiReqsSizeSend( sendCount, MPI_REQUEST_NULL );
2759 std::vector<std::vector<char> > arrSstrLcolSend( sendCount );
2760 std::vector<Int> arrSstrLcolSizeSend( sendCount );
2762 std::vector<MPI_Request> arrMpiReqsRecv( recvCount, MPI_REQUEST_NULL );
2763 std::vector<MPI_Request> arrMpiReqsSizeRecv( recvCount, MPI_REQUEST_NULL );
2764 std::vector<std::vector<char> > arrSstrLcolRecv( recvCount );
2765 std::vector<Int> arrSstrLcolSizeRecv( recvCount );
2770 if( isSendToCrossDiagonal_(grid_->numProcCol, ksup) ){
2771 #if ( _DEBUGlevel_ >= 1 )
2772 statusOFS << "[" << ksup << "] P" << MYPROC(grid_) << " should send to " << CountSendToCrossDiagonal(ksup) << " processors" << std::endl;
2776 for( Int dstCol = 0; dstCol < grid_->numProcCol; dstCol++ ){
2777 if( isSendToCrossDiagonal_(dstCol, ksup) ){
2778 Int dst = PNUM( PROW(ksup,grid_), dstCol, grid_ );
2781 std::stringstream sstm;
2782 std::vector<char> & sstrLcolSend = arrSstrLcolSend[sendIdx];
2783 Int & sstrSize = arrSstrLcolSizeSend[sendIdx];
2784 MPI_Request & mpiReqSend = arrMpiReqsSend[sendIdx];
2785 MPI_Request & mpiReqSizeSend = arrMpiReqsSizeSend[sendIdx];
2787 std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
2788 std::vector<LBlock<T> >& Lcol = this->L( LBj(ksup, grid_) );
2793 if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
2794 for( Int ib = 1; ib < Lcol.size(); ib++ ){
2795 if( Lcol[ib].blockIdx > ksup && (Lcol[ib].blockIdx % grid_->numProcCol) == dstCol ){
2802 for( Int ib = 0; ib < Lcol.size(); ib++ ){
2803 if( Lcol[ib].blockIdx > ksup && (Lcol[ib].blockIdx % grid_->numProcCol) == dstCol ){
2809 serialize( (Int)count, sstm, NO_MASK );
2811 for( Int ib = 0; ib < Lcol.size(); ib++ ){
2812 if( Lcol[ib].blockIdx > ksup && (Lcol[ib].blockIdx % grid_->numProcCol) == dstCol ){
2813 #if ( _DEBUGlevel_ >= 1 )
2814 statusOFS << "[" << ksup << "] SEND contains " << Lcol[ib].blockIdx << " which corresponds to " << GBj(ib,grid_) << std::endl;
2816 serialize( Lcol[ib], sstm, mask );
2820 sstrLcolSend.resize( Size(sstm) );
2821 sstm.read( &sstrLcolSend[0], sstrLcolSend.size() );
2822 sstrSize = sstrLcolSend.size();
2829 #if ( _DEBUGlevel_ >= 1 )
2830 statusOFS << "[" << ksup << "] P" << MYPROC(grid_) << " (" << MYROW(grid_) << "," << MYCOL(grid_) << ") ---> LBj(" << ksup << ")=" << LBj(ksup,grid_) << " ---> P" << dst << " (" << ksupProcRow << "," << dstCol << ")" << std::endl;
2832 MPI_Isend( &sstrSize, sizeof(sstrSize), MPI_BYTE, dst, SELINV_TAG_D_SIZE, grid_->comm, &mpiReqSizeSend );
2833 MPI_Isend( (void*)&sstrLcolSend[0], sstrSize, MPI_BYTE, dst, SELINV_TAG_D_CONTENT, grid_->comm, &mpiReqSend );
2836 PROFILE_COMM( MYPROC(this->grid_), dst, SELINV_TAG_D_SIZE, sizeof(sstrSize) );
2837 PROFILE_COMM( MYPROC(this->grid_), dst, SELINV_TAG_D_CONTENT, sstrSize );
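// Receiving side of the cross-diagonal exchange: post the size receives, wait, post the
// content receives, wait again, and then deserialize the L blocks so they can be copied
// into the matching U(k,i) blocks of this processor row.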
2852 if( isRecvFromCrossDiagonal_(grid_->numProcRow, ksup) ){
2855 #if ( _DEBUGlevel_ >= 1 )
2856 statusOFS << "[" << ksup << "] P" << MYPROC(grid_) << " should receive from " << CountRecvFromCrossDiagonal(ksup) << " processors" << std::endl;
2860 std::vector<UBlock<T> >& Urow = this->U( LBi( ksup, grid_ ) );
2861 std::vector<bool> isBlockFound( Urow.size(), false );
2866 for( Int srcRow = 0; srcRow < grid_->numProcRow; srcRow++ ){
2867 if( isRecvFromCrossDiagonal_(srcRow, ksup) ){
2868 std::vector<LBlock<T> > LcolRecv;
2869 Int src = PNUM( srcRow, PCOL(ksup,grid_), grid_ );
2871 MPI_Request & mpiReqSizeRecv = arrMpiReqsSizeRecv[recvIdx];
2872 Int & sstrSize = arrSstrLcolSizeRecv[recvIdx];
2874 MPI_Irecv( &sstrSize, 1, MPI_INT, src, SELINV_TAG_D_SIZE, grid_->comm, &mpiReqSizeRecv );
2881 mpi::Waitall(arrMpiReqsSizeRecv);
2887 for( Int srcRow = 0; srcRow < grid_->numProcRow; srcRow++ ){
2888 if( isRecvFromCrossDiagonal_(srcRow, ksup) ){
2889 std::vector<LBlock<T> > LcolRecv;
2890 Int src = PNUM( srcRow, PCOL(ksup,grid_), grid_ );
2892 MPI_Request & mpiReqRecv = arrMpiReqsRecv[recvIdx];
2893 Int & sstrSize = arrSstrLcolSizeRecv[recvIdx];
2894 std::vector<char> & sstrLcolRecv = arrSstrLcolRecv[recvIdx];
2895 sstrLcolRecv.resize(sstrSize);
2897 MPI_Irecv( &sstrLcolRecv[0], sstrSize, MPI_BYTE, src, SELINV_TAG_D_CONTENT, grid_->comm, &mpiReqRecv );
2904 mpi::Waitall(arrMpiReqsRecv);
2910 for( Int srcRow = 0; srcRow < grid_->numProcRow; srcRow++ ){
2911 if( isRecvFromCrossDiagonal_(srcRow, ksup) ){
2912 std::vector<LBlock<T> > LcolRecv;
2913 Int src = PNUM( srcRow, PCOL(ksup,grid_), grid_ );
2916 Int & sstrSize = arrSstrLcolSizeRecv[recvIdx];
2917 std::vector<char> & sstrLcolRecv = arrSstrLcolRecv[recvIdx];
2918 std::stringstream sstm;
2920 #if ( _DEBUGlevel_ >= 1 )
2921 statusOFS << "[" << ksup << "] P" << MYPROC(grid_) << " (" << MYROW(grid_) << "," << MYCOL(grid_) << ") <--- LBj(" << ksup << ") <--- P" << src << " (" << srcRow << "," << ksupProcCol << ")" << std::endl;
2925 sstm.write( &sstrLcolRecv[0], sstrSize );
2929 std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
2930 deserialize( numLBlock, sstm, NO_MASK );
2931 LcolRecv.resize(numLBlock);
2932 for( Int ib = 0; ib < numLBlock; ib++ ){
2933 deserialize( LcolRecv[ib], sstm, mask );
2934 #if ( _DEBUGlevel_ >= 1 )
2935 statusOFS << "[" << ksup << "] RECV contains " << LcolRecv[ib].blockIdx << " which corresponds to " << ib * grid_->numProcRow + srcRow;
2937 statusOFS << " L is on row " << srcRow << " whereas U is on col " << LcolRecv[ib].blockIdx % grid_->numProcCol << std::endl;
2947 std::vector<LBlock<T> >& Lcol = this->L( LBj( ksup, grid_ ) );
2948 if( MYROW( grid_ ) != PROW( ksup, grid_ ) ){
2949 LcolRecv.resize( Lcol.size() );
2950 for( Int ib = 0; ib < Lcol.size(); ib++ ){
2951 LcolRecv[ib] = Lcol[ib];
2955 LcolRecv.resize( Lcol.size() - 1 );
2956 for( Int ib = 0; ib < Lcol.size() - 1; ib++ ){
2957 LcolRecv[ib] = Lcol[ib+1];
2964 for( Int ib = 0; ib < LcolRecv.size(); ib++ ){
2970 throw std::logic_error( "LcolRecv contains the wrong blocks." );
2972 for( Int jb = 0; jb < Urow.size(); jb++ ){
2977 std::ostringstream msg;
2978 msg << "LB(" << LB.blockIdx << ", " << ksup << ") and UB("
2979     << ksup << ", " << UB.blockIdx << ") do not share the same size." << std::endl
2980     << "LB: " << LB.numRow << " x " << LB.numCol << std::endl
2981     << "UB: " << UB.numRow << " x " << UB.numCol << std::endl;
2985 throw std::runtime_error( msg.str().c_str() );
2994 #if ( _DEBUGlevel_ >= 1 )
2995 statusOFS << "[" << ksup << "] USING " << LB.blockIdx << std::endl;
2997 isBlockFound[jb] = true;
3005 for( Int jb = 0; jb < Urow.size(); jb++ ){
3007 if( !isBlockFound[jb] ){
3011 throw std::logic_error( "UBlock cannot find its update. Something is seriously wrong." );
3021 mpi::Waitall(arrMpiReqsSizeSend);
3022 mpi::Waitall(arrMpiReqsSend);
3032 PushCallStack("L(i,i) <- [L(k,k) * U(k,k)]^{-1} ");
3034 #if ( _DEBUGlevel_ >= 1 )
3035 statusOFS << std::endl << "L(i,i) <- [L(k,k) * U(k,k)]^{-1}" << std::endl << std::endl;
3038 for( Int ksup = 0; ksup < numSuper; ksup++ ){
3039 if( MYROW( grid_ ) == PROW( ksup, grid_ ) &&
3040     MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
3044 for( Int i = 0; i < SuperSize( ksup, super_ ); i++ ){
3048 #if ( _DEBUGlevel_ >= 2 )
3050 statusOFS << "Factorized A (" << ksup << ", " << ksup << "): " << LB.nzval << std::endl;
3053 SuperSize( ksup, super_ ), ipiv.Data() );
3056 Symmetrize( LB.nzval );
3058 #if ( _DEBUGlevel_ >= 2 )
3060 statusOFS << "Inversed A (" << ksup << ", " << ksup << "): " << LB.nzval << std::endl;
3079 template<typename T>
3083 PushCallStack("PMatrix::GetDiagonal");
3087 Int numCol = this->NumCol();
3089 const IntNumVec& permInv = super_->permInv;
3094 pPerm_r = &super_->perm_r;
3095 pPermInv_r = &super_->permInv_r;
3098 const IntNumVec& permInv_r = *pPermInv_r;
3104 diag.Resize( numCol );
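// For every index in the original ordering, map it through the row and column permutations,
// check whether this process owns the block containing the entry, and copy it from either an
// L block (lower or diagonal part) or a U block (upper part) into diagLocal.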
3107 for( Int orow = 0; orow < numCol; orow++ ){
3109 Int row = perm[ orow ];
3111 Int col = perm[ perm_r[ orow ] ];
3113 Int blockColIdx = BlockIdx( col, super_ );
3114 Int blockRowIdx = BlockIdx( row, super_ );
3118 if( MYROW( grid_ ) == PROW( blockRowIdx, grid_ ) &&
3119     MYCOL( grid_ ) == PCOL( blockColIdx, grid_ ) ){
3121 bool isFound = false;
3123 if( blockColIdx <= blockRowIdx ){
3126 std::vector<LBlock<T> >& Lcol = this->L( LBj( blockColIdx, grid_ ) );
3128 for( Int ib = 0; ib < Lcol.size(); ib++ ){
3129 #if ( _DEBUGlevel_ >= 1 )
3130 statusOFS << "blockRowIdx = " << blockRowIdx << ", Lcol[ib].blockIdx = " << Lcol[ib].blockIdx << ", blockColIdx = " << blockColIdx << std::endl;
3132 if( Lcol[ib].blockIdx == blockRowIdx ){
3134 for( int iloc = 0; iloc < Lcol[ib].numRow; iloc++ ){
3135 if( rows[iloc] == row ){
3138 diagLocal[ orow ] = Lcol[ib].nzval( iloc, jloc );
3146 if( isFound == true ) break;
3152 std::vector<UBlock<T> >& Urow = this->U( LBi( blockRowIdx, grid_ ) );
3154 for( Int jb = 0; jb < Urow.size(); jb++ ){
3155 if( Urow[jb].blockIdx == blockColIdx ){
3157 for( int jloc = 0; jloc < Urow[jb].numCol; jloc++ ){
3158 if( cols[jloc] == col ){
3161 diagLocal[ orow ] = Urow[jb].nzval( iloc, jloc );
3168 if( isFound == true ) break;
3173 if( isFound == false ){
3174 statusOFS << "In the permutated order, (" << row << ", " << col <<
3175     ") is not found in PMatrix." << std::endl;
3176 diagLocal[orow] = ZERO<T>();
3196 mpi::Allreduce( diagLocal.Data(), diag.Data(), numCol, MPI_SUM, grid_->comm );
3207 template<typename T>
3211 PushCallStack("PMatrix::PMatrixToDistSparseMatrix");
3213 #if ( _DEBUGlevel_ >= 1 )
3214 statusOFS << std::endl << "Converting PMatrix to DistSparseMatrix." << std::endl;
3216 Int mpirank = grid_->mpirank;
3217 Int mpisize = grid_->mpisize;
3219 std::vector<Int> rowSend( mpisize );
3220 std::vector<Int> colSend( mpisize );
3221 std::vector<T> valSend( mpisize );
3222 std::vector<Int> sizeSend( mpisize, 0 );
3223 std::vector<Int> displsSend( mpisize, 0 );
3225 std::vector<Int> rowRecv( mpisize );
3226 std::vector<Int> colRecv( mpisize );
3227 std::vector<T> valRecv( mpisize );
3228 std::vector<Int> sizeRecv( mpisize, 0 );
3229 std::vector<Int> displsRecv( mpisize, 0 );
3233 const IntNumVec& permInv = super_->permInv;
3238 pPerm_r = &super_->perm_r;
3239 pPermInv_r = &super_->permInv_r;
3242 const IntNumVec& permInv_r = *pPermInv_r;
3249 Int numColFirst = this->NumCol() / mpisize;
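// First pass: count how many (row, column, value) triplets this process sends to each
// destination. Columns are distributed in contiguous chunks of numColFirst, so global
// column jcol belongs to process min( jcol / numColFirst, mpisize - 1 ).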
3252 for( Int ksup = 0; ksup < numSuper; ksup++ ){
3254 if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
3255 std::vector<LBlock<T> >& Lcol = this->L( LBj( ksup, grid_ ) );
3256 for( Int ib = 0; ib < Lcol.size(); ib++ ){
3257 for( Int j = 0; j < Lcol[ib].numCol; j++ ){
3258 Int jcol = permInv[ permInv_r[ j + FirstBlockCol( ksup, super_ ) ] ];
3259 Int dest = std::min( jcol / numColFirst, mpisize - 1 );
3260 sizeSend[dest] += Lcol[ib].numRow;
3266 if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
3267 std::vector<UBlock<T> >& Urow = this->U( LBi( ksup, grid_ ) );
3268 for( Int jb = 0; jb < Urow.size(); jb++ ){
3270 for( Int j = 0; j < cols.m(); j++ ){
3271 Int jcol = permInv[ permInv_r[ cols[j] ] ];
3272 Int dest = std::min( jcol / numColFirst, mpisize - 1 );
3273 sizeSend[dest] += Urow[jb].numRow;
3281 &sizeSend[0], 1, MPI_INT,
3282 &sizeRecv[0], 1, MPI_INT, grid_->comm );
3287 for( Int ip = 0; ip < mpisize; ip++ ){
3292 displsSend[ip] = displsSend[ip-1] + sizeSend[ip-1];
3299 displsRecv[ip] = displsRecv[ip-1] + sizeRecv[ip-1];
3302 Int sizeSendTotal = displsSend[mpisize-1] + sizeSend[mpisize-1];
3303 Int sizeRecvTotal = displsRecv[mpisize-1] + sizeRecv[mpisize-1];
3305 rowSend.resize( sizeSendTotal );
3306 colSend.resize( sizeSendTotal );
3307 valSend.resize( sizeSendTotal );
3309 rowRecv.resize( sizeRecvTotal );
3310 colRecv.resize( sizeRecvTotal );
3311 valRecv.resize( sizeRecvTotal );
3313 #if ( _DEBUGlevel_ >= 1 )
3314 statusOFS << "displsSend = " << displsSend << std::endl;
3315 statusOFS << "displsRecv = " << displsRecv << std::endl;
3319 std::vector<Int> cntSize( mpisize, 0 );
3321 for( Int ksup = 0; ksup < numSuper; ksup++ ){
3323 if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
3324 std::vector<LBlock<T> >& Lcol = this->L( LBj( ksup, grid_ ) );
3325 for( Int ib = 0; ib < Lcol.size(); ib++ ){
3328 for( Int j = 0; j < Lcol[ib].numCol; j++ ){
3329 Int jcol = permInv[ permInv_r[ j + FirstBlockCol( ksup, super_ ) ] ];
3330 Int dest = std::min( jcol / numColFirst, mpisize - 1 );
3331 for( Int i = 0; i < rows.m(); i++ ){
3332 rowSend[displsSend[dest] + cntSize[dest]] = permInv[ rows[i] ];
3333 colSend[displsSend[dest] + cntSize[dest]] = jcol;
3334 valSend[displsSend[dest] + cntSize[dest]] = nzval( i, j );
3342 if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
3343 std::vector<UBlock<T> >& Urow = this->U( LBi( ksup, grid_ ) );
3344 for( Int jb = 0; jb < Urow.size(); jb++ ){
3347 for( Int j = 0; j < cols.m(); j++ ){
3348 Int jcol = permInv[ permInv_r[ cols[j] ] ];
3349 Int dest = std::min( jcol / numColFirst, mpisize - 1 );
3350 for( Int i = 0; i < Urow[jb].numRow; i++ ){
3351 rowSend[displsSend[dest] + cntSize[dest]] = permInv[ i + FirstBlockCol( ksup, super_ ) ];
3352 colSend[displsSend[dest] + cntSize[dest]] = jcol;
3353 valSend[displsSend[dest] + cntSize[dest]] = nzval( i, j );
3362 for( Int ip = 0; ip < mpisize; ip++ ){
3363 if( cntSize[ip] != sizeSend[ip] )
3368 throw std::runtime_error( "Sizes of the sending information do not match." );
3375 &rowSend[0], &sizeSend[0], &displsSend[0],
3376 &rowRecv[0], &sizeRecv[0], &displsRecv[0],
3379 &colSend[0], &sizeSend[0], &displsSend[0],
3380 &colRecv[0], &sizeRecv[0], &displsRecv[0],
3383 &valSend[0], &sizeSend[0], &displsSend[0],
3384 &valRecv[0], &sizeRecv[0], &displsRecv[0],
3387 #if ( _DEBUGlevel_ >= 1 )
3388 statusOFS << "Alltoallv communication finished." << std::endl;
3403 Int firstCol = mpirank * numColFirst;
3405 if( mpirank == mpisize-1 )
3406 numColLocal = this->NumCol() - numColFirst * (mpisize-1);
3408 numColLocal = numColFirst;
3410 std::vector<std::vector<Int> > rows( numColLocal );
3411 std::vector<std::vector<T> > vals( numColLocal );
3413 for( Int ip = 0; ip < mpisize; ip++ ){
3414 Int* rowRecvCur = &rowRecv[displsRecv[ip]];
3415 Int* colRecvCur = &colRecv[displsRecv[ip]];
3416 T* valRecvCur = &valRecv[displsRecv[ip]];
3417 for( Int i = 0; i < sizeRecv[ip]; i++ ){
3418 rows[colRecvCur[i]-firstCol].push_back( rowRecvCur[i] );
3419 vals[colRecvCur[i]-firstCol].push_back( valRecvCur[i] );
3424 std::vector<std::vector<Int> > sortIndex( numColLocal );
3425 for( Int j = 0; j < numColLocal; j++ ){
3426 sortIndex[j].resize( rows[j].size() );
3427 for( Int i = 0; i < sortIndex[j].size(); i++ )
3428 sortIndex[j][i] = i;
3429 std::sort( sortIndex[j].begin(), sortIndex[j].end(),
3430     IndexComp<std::vector<Int>& > ( rows[j] ) );
3442 for( Int j = 0; j < numColLocal; j++ ){
3447 A.comm = grid_->comm;
3449 #if ( _DEBUGlevel_ >= 1 )
3450 statusOFS << "nnzLocal = " << A.nnzLocal << std::endl;
3451 statusOFS << "nnz = " << A.Nnz() << std::endl;
3460 for( Int j = 0; j < numColLocal; j++ ){
3461 std::vector<Int>& rowsCur = rows[j];
3462 std::vector<Int>& sortIndexCur = sortIndex[j];
3463 std::vector<T>& valsCur = vals[j];
3464 for( Int i = 0; i < rows[j].size(); i++ ){
3466 *(rowPtr++) = rowsCur[sortIndexCur[i]] + 1;
3467 *(nzvalPtr++) = valsCur[sortIndexCur[i]];
3471 #if ( _DEBUGlevel_ >= 1 )
3472 statusOFS << "A.colptrLocal[end] = " << A.colptrLocal(numColLocal) << std::endl;
3473 statusOFS << "A.rowindLocal.size() = " << A.rowindLocal.m() << std::endl;
3488 template<typename T>
3492 PushCallStack("PMatrix::PMatrixToDistSparseMatrix_OLD");
3494 #if ( _DEBUGlevel_ >= 1 )
3495 statusOFS << std::endl << "Converting PMatrix to DistSparseMatrix (2nd format)." << std::endl;
3497 Int mpirank = grid_->mpirank;
3498 Int mpisize = grid_->mpisize;
3500 std::vector<Int> rowSend( mpisize );
3501 std::vector<Int> colSend( mpisize );
3502 std::vector<T> valSend( mpisize );
3503 std::vector<Int> sizeSend( mpisize, 0 );
3504 std::vector<Int> displsSend( mpisize, 0 );
3506 std::vector<Int> rowRecv( mpisize );
3507 std::vector<Int> colRecv( mpisize );
3508 std::vector<T> valRecv( mpisize );
3509 std::vector<Int> sizeRecv( mpisize, 0 );
3510 std::vector<Int> displsRecv( mpisize, 0 );
3513 const IntNumVec& permInv = super_->permInv;
3519 if( optionsLU_->RowPerm == "NOROWPERM" ){
3520 pPermInv_r = &super_->permInv;
3523 pPermInv_r = &super_->permInv_r;
3526 const IntNumVec& permInv_r = *pPermInv_r;
3538 Int numColFirst = this->NumCol() / mpisize;
3541 for( Int ksup = 0; ksup < numSuper; ksup++ ){
3543 if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
3544 std::vector<LBlock<T> >& Lcol = this->L( LBj( ksup, grid_ ) );
3545 for( Int ib = 0; ib < Lcol.size(); ib++ ){
3546 for( Int j = 0; j < Lcol[ib].numCol; j++ ){
3548 Int dest = std::min( jcol / numColFirst, mpisize - 1 );
3549 sizeSend[dest] += Lcol[ib].numRow;
3555 if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
3556 std::vector<UBlock<T> >& Urow = this->U( LBi( ksup, grid_ ) );
3557 for( Int jb = 0; jb < Urow.size(); jb++ ){
3559 for( Int j = 0; j < cols.m(); j++ ){
3560 Int jcol = permInv( cols(j) );
3561 Int dest = std::min( jcol / numColFirst, mpisize - 1 );
3562 sizeSend[dest] += Urow[jb].numRow;
3570 &sizeSend[0], 1, MPI_INT,
3571 &sizeRecv[0], 1, MPI_INT, grid_->comm );
3576 for( Int ip = 0; ip < mpisize; ip++ ){
3581 displsSend[ip] = displsSend[ip-1] + sizeSend[ip-1];
3588 displsRecv[ip] = displsRecv[ip-1] + sizeRecv[ip-1];
3591 Int sizeSendTotal = displsSend[mpisize-1] + sizeSend[mpisize-1];
3592 Int sizeRecvTotal = displsRecv[mpisize-1] + sizeRecv[mpisize-1];
3594 rowSend.resize( sizeSendTotal );
3595 colSend.resize( sizeSendTotal );
3596 valSend.resize( sizeSendTotal );
3598 rowRecv.resize( sizeRecvTotal );
3599 colRecv.resize( sizeRecvTotal );
3600 valRecv.resize( sizeRecvTotal );
3602 #if ( _DEBUGlevel_ >= 1 )
3603 statusOFS << "displsSend = " << displsSend << std::endl;
3604 statusOFS << "displsRecv = " << displsRecv << std::endl;
3608 std::vector<Int> cntSize( mpisize, 0 );
3611 for( Int ksup = 0; ksup < numSuper; ksup++ ){
3613 if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
3614 std::vector<LBlock<T> >& Lcol = this->L( LBj( ksup, grid_ ) );
3615 for( Int ib = 0; ib < Lcol.size(); ib++ ){
3618 for( Int j = 0; j < Lcol[ib].numCol; j++ ){
3620 Int dest = std::min( jcol / numColFirst, mpisize - 1 );
3621 for( Int i = 0; i < rows.m(); i++ ){
3622 rowSend[displsSend[dest] + cntSize[dest]] = permInv_r( rows(i) );
3623 colSend[displsSend[dest] + cntSize[dest]] = jcol;
3624 valSend[displsSend[dest] + cntSize[dest]] = nzval( i, j );
3633 if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
3634 std::vector<UBlock<T> >& Urow = this->U( LBi( ksup, grid_ ) );
3635 for( Int jb = 0; jb < Urow.size(); jb++ ){
3638 for( Int j = 0; j < cols.m(); j++ ){
3639 Int jcol = permInv( cols(j) );
3640 Int dest = std::min( jcol / numColFirst, mpisize - 1 );
3641 for( Int i = 0; i < Urow[jb].numRow; i++ ){
3642 rowSend[displsSend[dest] + cntSize[dest]] =
3644 colSend[displsSend[dest] + cntSize[dest]] = jcol;
3645 valSend[displsSend[dest] + cntSize[dest]] = nzval( i, j );
3656 for( Int ip = 0; ip < mpisize; ip++ ){
3657 if( cntSize[ip] != sizeSend[ip] ){
3661 throw std::runtime_error( "Sizes of the sending information do not match." );
3667 &rowSend[0], &sizeSend[0], &displsSend[0],
3668 &rowRecv[0], &sizeRecv[0], &displsRecv[0],
3671 &colSend[0], &sizeSend[0], &displsSend[0],
3672 &colRecv[0], &sizeRecv[0], &displsRecv[0],
3675 &valSend[0], &sizeSend[0], &displsSend[0],
3676 &valRecv[0], &sizeRecv[0], &displsRecv[0],
3679 #if ( _DEBUGlevel_ >= 1 )
3680 statusOFS << "Alltoallv communication finished." << std::endl;
3695 Int firstCol = mpirank * numColFirst;
3697 if( mpirank == mpisize-1 )
3698 numColLocal = this->NumCol() - numColFirst * (mpisize-1);
3700 numColLocal = numColFirst;
3702 std::vector<std::vector<Int> > rows( numColLocal );
3703 std::vector<std::vector<T> > vals( numColLocal );
3705 for( Int ip = 0; ip < mpisize; ip++ ){
3706 Int* rowRecvCur = &rowRecv[displsRecv[ip]];
3707 Int* colRecvCur = &colRecv[displsRecv[ip]];
3708 T* valRecvCur = &valRecv[displsRecv[ip]];
3709 for( Int i = 0; i < sizeRecv[ip]; i++ ){
3710 rows[colRecvCur[i]-firstCol].push_back( rowRecvCur[i] );
3711 vals[colRecvCur[i]-firstCol].push_back( valRecvCur[i] );
3716 std::vector<std::vector<Int> > sortIndex( numColLocal );
3717 for( Int j = 0; j < numColLocal; j++ ){
3718 sortIndex[j].resize( rows[j].size() );
3719 for( Int i = 0; i < sortIndex[j].size(); i++ )
3720 sortIndex[j][i] = i;
3721 std::sort( sortIndex[j].begin(), sortIndex[j].end(),
3722     IndexComp<std::vector<Int>& > ( rows[j] ) );
3729 if( A.size != this->NumCol() ){
3733 throw std::runtime_error( "The DistSparseMatrix providing the pattern has a different size from PMatrix." );
3739 throw std::runtime_error( "The DistSparseMatrix providing the pattern has a different number of local columns from PMatrix." );
3757 B.comm = grid_->comm;
3761 for( Int j = 0; j < numColLocal; j++ ){
3762 std::vector<Int>& rowsCur = rows[j];
3763 std::vector<Int>& sortIndexCur = sortIndex[j];
3764 std::vector<T>& valsCur = vals[j];
3765 std::vector<Int> rowsCurSorted( rowsCur.size() );
3767 for( Int i = 0; i < rowsCurSorted.size(); i++ ){
3768 rowsCurSorted[i] = rowsCur[sortIndexCur[i]] + 1;
3772 std::vector<Int>::iterator it;
3775 it = std::lower_bound( rowsCurSorted.begin(), rowsCurSorted.end(),
3777 if( it == rowsCurSorted.end() ){
3779 *(nzvalPtr++) = ZERO<T>();
3783 *(nzvalPtr++) = valsCur[ sortIndexCur[it-rowsCurSorted.begin()] ];
3788 #if ( _DEBUGlevel_ >= 1 )
3789 statusOFS << "B.colptrLocal[end] = " << B.colptrLocal(numColLocal) << std::endl;
3790 statusOFS << "B.rowindLocal.size() = " << B.rowindLocal.m() << std::endl;
3807 template<typename T>
3812 PushCallStack("PMatrix::PMatrixToDistSparseMatrix");
3814 #if ( _DEBUGlevel_ >= 1 )
3815 statusOFS << std::endl << "Converting PMatrix to DistSparseMatrix (2nd format)." << std::endl;
3817 Int mpirank = grid_->mpirank;
3818 Int mpisize = grid_->mpisize;
3820 std::vector<Int> rowSend( mpisize );
3821 std::vector<Int> colSend( mpisize );
3822 std::vector<T> valSend( mpisize );
3823 std::vector<Int> sizeSend( mpisize, 0 );
3824 std::vector<Int> displsSend( mpisize, 0 );
3826 std::vector<Int> rowRecv( mpisize );
3827 std::vector<Int> colRecv( mpisize );
3828 std::vector<T> valRecv( mpisize );
3829 std::vector<Int> sizeRecv( mpisize, 0 );
3830 std::vector<Int> displsRecv( mpisize, 0 );
3834 const IntNumVec& permInv = super_->permInv;
3839 pPerm_r = &super_->perm_r;
3840 pPermInv_r = &super_->permInv_r;
3843 const IntNumVec& permInv_r = *pPermInv_r;
3847 Int numColFirst = this->NumCol() / mpisize;
3848 Int firstCol = mpirank * numColFirst;
3850 if( mpirank == mpisize-1 )
3851 numColLocal = this->NumCol() - numColFirst * (mpisize-1);
3853 numColLocal = numColFirst;
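// This variant reuses the sparsity pattern of the input DistSparseMatrix: for every entry
// requested by the local columns, determine which process of the 2D grid owns the block
// containing it, and accumulate the message sizes for the index exchange below.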
3862 for( Int j = 0; j < numColLocal; j++ ){
3863 Int ocol = firstCol + j;
3864 Int col = perm[ perm_r[ ocol ] ];
3865 Int blockColIdx = BlockIdx( col, super_ );
3866 Int procCol = PCOL( blockColIdx, grid_ );
3867 for( Int i = colPtr[j] - 1; i < colPtr[j+1] - 1; i++ ){
3868 Int orow = rowPtr[i]-1;
3869 Int row = perm[ orow ];
3870 Int blockRowIdx = BlockIdx( row, super_ );
3871 Int procRow = PROW( blockRowIdx, grid_ );
3872 Int dest = PNUM( procRow, procCol, grid_ );
3873 #if ( _DEBUGlevel_ >= 1 )
3874 statusOFS << "(" << orow << ", " << ocol << ") == " << "(" << row << ", " << col << ")" << std::endl;
3875 statusOFS << "BlockIdx = " << blockRowIdx << ", " << blockColIdx << std::endl;
3876 statusOFS << procRow << ", " << procCol << ", "
3877     << dest << std::endl;
3885 &sizeSend[0], 1, MPI_INT,
3886 &sizeRecv[0], 1, MPI_INT, grid_->comm );
3888 #if ( _DEBUGlevel_ >= 1 )
3889 statusOFS << std::endl << "sizeSend: " << sizeSend << std::endl;
3890 statusOFS << std::endl << "sizeRecv: " << sizeRecv << std::endl;
3896 for( Int ip = 0; ip < mpisize; ip++ ){
3901 displsSend[ip] = displsSend[ip-1] + sizeSend[ip-1];
3908 displsRecv[ip] = displsRecv[ip-1] + sizeRecv[ip-1];
3912 Int sizeSendTotal = displsSend[mpisize-1] + sizeSend[mpisize-1];
3913 Int sizeRecvTotal = displsRecv[mpisize-1] + sizeRecv[mpisize-1];
3915 rowSend.resize( sizeSendTotal );
3916 colSend.resize( sizeSendTotal );
3917 valSend.resize( sizeSendTotal );
3919 rowRecv.resize( sizeRecvTotal );
3920 colRecv.resize( sizeRecvTotal );
3921 valRecv.resize( sizeRecvTotal );
3923 #if ( _DEBUGlevel_ >= 1 )
3924 statusOFS << "displsSend = " << displsSend << std::endl;
3925 statusOFS << "displsRecv = " << displsRecv << std::endl;
3929 std::vector<Int> cntSize( mpisize, 0 );
3934 for( Int j = 0; j < numColLocal; j++ ){
3936 Int ocol = firstCol + j;
3937 Int col = perm[ perm_r[ ocol ] ];
3938 Int blockColIdx = BlockIdx( col, super_ );
3939 Int procCol = PCOL( blockColIdx, grid_ );
3940 for( Int i = colPtr[j] - 1; i < colPtr[j+1] - 1; i++ ){
3941 Int orow = rowPtr[i]-1;
3942 Int row = perm[ orow ];
3943 Int blockRowIdx = BlockIdx( row, super_ );
3944 Int procRow = PROW( blockRowIdx, grid_ );
3945 Int dest = PNUM( procRow, procCol, grid_ );
3946 rowSend[displsSend[dest] + cntSize[dest]] = row;
3947 colSend[displsSend[dest] + cntSize[dest]] = col;
3954 for( Int ip = 0; ip < mpisize; ip++ ){
3955 if( cntSize[ip] != sizeSend[ip] ){
3959 throw std::runtime_error( "Sizes of the sending information do not match." );
3965 &rowSend[0], &sizeSend[0], &displsSend[0],
3966 &rowRecv[0], &sizeRecv[0], &displsRecv[0],
3969 &colSend[0], &sizeSend[0], &displsSend[0],
3970 &colRecv[0], &sizeRecv[0], &displsRecv[0],
3973 #if ( _DEBUGlevel_ >= 1 )
3974 statusOFS << "Alltoallv communication of nonzero indices finished." << std::endl;
3978 #if ( _DEBUGlevel_ >= 1 )
3979 for( Int ip = 0; ip < mpisize; ip++ ){
3980 statusOFS << "rowSend[" << ip << "] = " << rowSend[ip] << std::endl;
3981 statusOFS << "rowRecv[" << ip << "] = " << rowRecv[ip] << std::endl;
3982 statusOFS << "colSend[" << ip << "] = " << colSend[ip] << std::endl;
3983 statusOFS << "colRecv[" << ip << "] = " << colRecv[ip] << std::endl;
3994 for( Int g = 0; g < sizeRecvTotal; g++ ){
3995 Int row = rowRecv[g];
3996 Int col = colRecv[g];
3998 Int blockRowIdx = BlockIdx( row, super_ );
3999 Int blockColIdx = BlockIdx( col, super_ );
4002 bool isFound = false;
4004 if( blockColIdx <= blockRowIdx ){
4007 std::vector<LBlock<T> >& Lcol = this->L( LBj( blockColIdx, grid_ ) );
4009 for( Int ib = 0; ib < Lcol.size(); ib++ ){
4010 #if ( _DEBUGlevel_ >= 1 )
4011 statusOFS << "blockRowIdx = " << blockRowIdx << ", Lcol[ib].blockIdx = " << Lcol[ib].blockIdx << ", blockColIdx = " << blockColIdx << std::endl;
4013 if( Lcol[ib].blockIdx == blockRowIdx ){
4015 for( int iloc = 0; iloc < Lcol[ib].numRow; iloc++ ){
4016 if( rows[iloc] == row ){
4018 valRecv[g] = Lcol[ib].nzval( iloc, jloc );
4024 if( isFound == true ) break;
4030 std::vector<UBlock<T> >& Urow = this->U( LBi( blockRowIdx, grid_ ) );
4032 for( Int jb = 0; jb < Urow.size(); jb++ ){
4033 if( Urow[jb].blockIdx == blockColIdx ){
4035 for( int jloc = 0; jloc < Urow[jb].numCol; jloc++ ){
4036 if( cols[jloc] == col ){
4038 valRecv[g] = Urow[jb].nzval( iloc, jloc );
4044 if( isFound == true ) break;
4049 if( isFound == false ){
4050 statusOFS << "In the permutated order, (" << row << ", " << col <<
4051     ") is not found in PMatrix." << std::endl;
4052 valRecv[g] = ZERO<T>();
4061 &valRecv[0], &sizeRecv[0], &displsRecv[0],
4062 &valSend[0], &sizeSend[0], &displsSend[0],
4065 #if ( _DEBUGlevel_ >= 1 )
4066 statusOFS << "Alltoallv communication of nonzero values finished." << std::endl;
4085 B.comm = grid_->comm;
4087 for( Int i = 0; i < mpisize; i++ )
4094 for( Int j = 0; j < numColLocal; j++ ){
4095 Int ocol = firstCol + j;
4096 Int col = perm[ perm_r[ ocol ] ];
4097 Int blockColIdx = BlockIdx( col, super_ );
4098 Int procCol = PCOL( blockColIdx, grid_ );
4099 for( Int i = colPtr[j] - 1; i < colPtr[j+1] - 1; i++ ){
4100 Int orow = rowPtr[i]-1;
4101 Int row = perm[ orow ];
4102 Int blockRowIdx = BlockIdx( row, super_ );
4103 Int procRow = PROW( blockRowIdx, grid_ );
4104 Int dest = PNUM( procRow, procCol, grid_ );
4105 valPtr[i] = valSend[displsSend[dest] + cntSize[dest]];
4111 for( Int ip = 0; ip < mpisize; ip++ ){
4112 if( cntSize[ip] != sizeSend[ip] ){
4116 throw std::runtime_error( "Sizes of the sending information do not match." );
4131 template<typename T>
4135 PushCallStack("PMatrix::NnzLocal");
4139 for( Int ksup = 0; ksup < numSuper; ksup++ ){
4140 if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
4141 std::vector<LBlock<T> >& Lcol = this->L( LBj( ksup, grid_ ) );
4142 for( Int ib = 0; ib < Lcol.size(); ib++ ){
4143 nnzLocal += Lcol[ib].numRow * Lcol[ib].numCol;
4146 if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
4147 std::vector<UBlock<T> >& Urow = this->U( LBi( ksup, grid_ ) );
4148 for( Int jb = 0; jb < Urow.size(); jb++ ){
4149 nnzLocal += Urow[jb].numRow * Urow[jb].numCol;
4162 template<typename T>
4166 PushCallStack("PMatrix::Nnz");
4168 LongInt nnzLocal = LongInt( this->NnzLocal() );
4171 MPI_Allreduce( &nnzLocal, &nnz, 1, MPI_LONG_LONG, MPI_SUM, grid_->comm );
4184 PushCallStack("PMatrix::GetNegativeInertia");
4188 Real inertiaLocal = 0.0;
4191 for( Int ksup = 0; ksup < numSuper; ksup++ ){
4193 if( MYROW( grid_ ) == PROW( ksup, grid_ ) &&
4194     MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
4196 for( Int i = 0; i < LB.numRow; i++ ){
4197 if( LB.nzval(i, i) < 0 )
4204 mpi::Allreduce( &inertiaLocal, &inertia, 1, MPI_SUM, grid_->comm );
4218 PushCallStack("PMatrix::GetNegativeInertia");
4222 Real inertiaLocal = 0.0;
4225 for( Int ksup = 0; ksup < numSuper; ksup++ ){
4227 if( MYROW( grid_ ) == PROW( ksup, grid_ ) &&
4228     MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
4229 LBlock<Complex> & LB = this->L( LBj( ksup, grid_ ) )[0];
4230 for( Int i = 0; i < LB.numRow; i++ ){
4231 if( LB.nzval(i, i).real() < 0 )
4238 mpi::Allreduce( &inertiaLocal, &inertia, 1, MPI_SUM, grid_->comm );
4247 template<typename T>
4255 pMat = new PMatrix<T>( pGridType, pSuper, pSelInvOpt, pLuOpt );
4261 template<typename T>
4269 pMat = new PMatrix<T>();
4278 #endif //_PEXSI_PSELINV_IMPL_HPP_
virtual void PreSelInv()
PreSelInv prepares the structure in L_ and U_ so that SelInv only involves matrix-matrix multiplicati...
Definition: pselinv_impl.hpp:2677
Int blockIdx
Block index (supernodal index)
Definition: pselinv.hpp:236
void PMatrixToDistSparseMatrix(DistSparseMatrix< T > &A)
PMatrixToDistSparseMatrix converts the PMatrix into a distributed compressed sparse column matrix for...
Definition: pselinv_impl.hpp:3208
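A minimal usage sketch (the object names PMloc and AinvMat are hypothetical; only the member calls appear on this page):
  DistSparseMatrix<Complex> AinvMat;           // output in distributed CSC format
  PMloc.PMatrixToDistSparseMatrix( AinvMat );  // gather the selected entries held by the process grid
  statusOFS << "nnz of Ainv = " << AinvMat.Nnz() << std::endl;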
void SelInvIntra_P2p(Int lidx)
SelInvIntra_P2p.
Definition: pselinv_impl.hpp:1211
A thin interface for passing parameters to set the SuperLU options.
Definition: superlu_dist_internal.hpp:62
Definition: TreeBcast.hpp:22
SuperNodeType describes mapping between supernode and column, the permutation information, and potentially the elimination tree (not implemented here).
Definition: pselinv.hpp:165
Interface with SuperLU_Dist (version 3.0 and later)
void GetNegativeInertia(Real &inertia)
GetNegativeInertia computes the negative inertia of a PMatrix. This can be used to estimate e...
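A minimal sketch of querying the inertia count (PMloc is a hypothetical PMatrix holding the factored matrix):
  Real inertia = 0.0;
  PMloc.GetNegativeInertia( inertia );  // counts negative diagonal entries locally, then sums over grid_->comm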
Int PROW(Int bnum, const GridType *g)
PROW returns the processor row that the bnum-th block (supernode) belongs to.
Definition: pselinv.hpp:292
Int numRow
Number of nonzero rows.
Definition: pselinv.hpp:187
IntNumVec colptrLocal
Dimension numColLocal + 1, storing the pointers to the nonzero row indices and nonzero values in rowp...
Definition: sparse_matrix.hpp:108
void UnpackData(SuperNodeBufferType &snode, std::vector< LBlock< T > > &LcolRecv, std::vector< UBlock< T > > &UrowRecv)
UnpackData.
Definition: pselinv_impl.hpp:971
Int MYPROC(const GridType *g)
MYPROC returns the current processor rank.
Definition: pselinv.hpp:279
Int nnzLocal
Number of nonzero elements stored locally on this processor.
Definition: sparse_matrix.hpp:96
Int PCOL(Int bnum, const GridType *g)
PCOL returns the processor column that the bnum-th block (supernode) belongs to.
Definition: pselinv.hpp:297
Int LBi(Int bnum, const GridType *g)
LBi returns the local block number on the processor at processor row PROW( bnum, g )...
Definition: pselinv.hpp:307
Int Symmetric
Option to specify if matrix is symmetric or not.
Definition: superlu_dist_internal.hpp:114
Profiling and timing using TAU.
Int SuperSize(Int bnum, const SuperNodeType *s)
SuperSize returns the size of the block bnum.
Definition: pselinv.hpp:348
Int LBj(Int bnum, const GridType *g)
LBj returns the local block number on the processor at processor column PCOL( bnum, g ).
Definition: pselinv.hpp:312
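As an illustration of the cyclic block-to-process mapping implied by tests such as Lcol[ib].blockIdx % grid_->numProcCol in the listing above, a sketch with hypothetical standalone helpers that mirror what PROW/PCOL/LBi/LBj are expected to return:
  int prow( int bnum, int numProcRow ) { return bnum % numProcRow; }  // owning processor row
  int pcol( int bnum, int numProcCol ) { return bnum % numProcCol; }  // owning processor column
  int lbi ( int bnum, int numProcRow ) { return bnum / numProcRow; }  // local row-block index on the owning processor row
  int lbj ( int bnum, int numProcCol ) { return bnum / numProcCol; }  // local column-block index on the owning processor column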
Int GBj(Int jLocal, const GridType *g)
GBj returns the global block number from a local block number in the column direction.
Definition: pselinv.hpp:322
void SetValue(NumMat< F > &M, F val)
SetValue sets a numerical matrix to a constant val.
Definition: NumMat_impl.hpp:171
Int numCol
Number of nonzero columns.
Definition: pselinv.hpp:190
Int NumCol(const SuperNodeType *s)
NumCol returns the total number of columns for a supernodal partition.
Definition: pselinv.hpp:357
Definition: pselinv.hpp:598
void GetDiagonal(NumVec< T > &diag)
GetDiagonal extracts the diagonal elements of the PMatrix.
Definition: pselinv_impl.hpp:3080
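A minimal usage sketch (hypothetical names; T is the scalar type of the PMatrix):
  NumVec<T> diag;
  PMloc.GetDiagonal( diag );  // resized to NumCol(), indexed by the original (unpermuted) order, identical on every process after the Allreduce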
void GetWorkSet(std::vector< Int > &snodeEtree, std::vector< std::vector< Int > > &WSet)
GetWorkSet.
Definition: pselinv_impl.hpp:1134
NumVec< F > nzvalLocal
Dimension nnzLocal, storing the nonzero values.
Definition: sparse_matrix.hpp:116
Int FirstBlockRow(Int bnum, const SuperNodeType *s)
FirstBlockRow returns the first column of a block bnum. Note: the functionality of FirstBlockRow is e...
Definition: pselinv.hpp:343
void GetEtree(std::vector< Int > &etree_supno)
GetEtree computes the supernodal elimination tree to be used later in the pipelined selected inversio...
Definition: pselinv_impl.hpp:1065
LBlock stores a nonzero block in the lower triangular part or the diagonal part in PSelInv...
Definition: pselinv.hpp:181
IntNumVec rowindLocal
Dimension nnzLocal, storing the nonzero row indices. The indices are 1-based (FORTRAN-convention), i.e. the first row index is 1.
Definition: sparse_matrix.hpp:113
Int PNUM(Int i, Int j, const GridType *g)
PNUM returns the rank of the processor at row i and column j of the process grid.
Definition: pselinv.hpp:302
virtual void ConstructCommunicationPattern()
ConstructCommunicationPattern constructs the communication pattern to be used later in the selected i...
Definition: pselinv_impl.hpp:1951
void ConstructCommunicationPattern_P2p()
ConstructCommunicationPattern_P2p constructs the communication pattern to be used later in the select...
Definition: pselinv_impl.hpp:1959
GridType is the PSelInv way of defining the grid.
Definition: pselinv.hpp:128
Int MYCOL(const GridType *g)
MYCOL returns my processor column.
Definition: pselinv.hpp:287
Int size
Matrix dimension.
Definition: sparse_matrix.hpp:93
void PMatrixToDistSparseMatrix_OLD(const DistSparseMatrix< T > &A, DistSparseMatrix< T > &B)
PMatrixToDistSparseMatrix_OLD converts the PMatrix into a distributed compressed sparse column matrix...
Definition: pselinv_impl.hpp:3489
IntNumVec rows
Dimension numRow * 1, storing the 0-based indices of the nonzero rows.
Definition: pselinv.hpp:193
Int numCol
Number of nonzero columns.
Definition: pselinv.hpp:242
UBlock stores a nonzero block in the upper triangular part in PSelInv.
Definition: pselinv.hpp:233
Int FirstBlockCol(Int bnum, const SuperNodeType *s)
FirstBlockCol returns the first column of a block bnum.
Definition: pselinv.hpp:336
LongInt Nnz()
Compute the total number of nonzeros through MPI_Allreduce.
Definition: sparse_matrix_impl.hpp:107
MPI_Comm comm
MPI communicator.
Definition: sparse_matrix.hpp:119
Int NnzLocal()
NnzLocal computes the number of nonzero elements (L and U) saved locally.
Definition: pselinv_impl.hpp:4132
Int blockIdx
Block index (supernodal index)
Definition: pselinv.hpp:184
PMatrix contains the main data structure and the computational routine for the parallel selected inve...
Definition: ngchol_interf.hpp:57
void SendRecvCD_UpdateU(std::vector< SuperNodeBufferType > &arrSuperNodes, Int stepSuper)
SendRecvCD_UpdateU.
Definition: pselinv_impl.hpp:748
Int BlockIdx(Int i, const SuperNodeType *s)
BlockIdx returns the block index of a column i.
Definition: pselinv.hpp:331
Int numRow
Number of nonzero rows.
Definition: pselinv.hpp:239
NumMat< T > nzval
Dimension numRow * numCol, nonzero elements.
Definition: pselinv.hpp:248
Int nnz
Total number of nonzero elements.
Definition: sparse_matrix.hpp:101
Int NumSuper(const SuperNodeType *s)
NumSuper returns the total number of supernodes.
Definition: pselinv.hpp:352
void SelInv_P2p()
Point-to-point version of the selected inversion.
Definition: pselinv_impl.hpp:2642
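A sketch of the typical call sequence around the point-to-point driver (PMloc is a hypothetical PMatrix<T> already loaded from the LU factorization; the order simply follows the routines documented on this page):
  PMloc.ConstructCommunicationPattern();  // build the isSendTo*/isRecvFrom* masks and the broadcast/reduction trees
  PMloc.PreSelInv();                      // L(i,k) <- L(i,k) L(k,k)^{-1}, U(k,i) <- L(i,k), invert the diagonal blocks
  PMloc.SelInv();                         // selected inversion; iterates over the working set, one level at a time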
NumMat< T > nzval
Dimension numRow * numCol, nonzero elements.
Definition: pselinv.hpp:197
A thin interface for passing parameters to set the PSelInv options.
Definition: pselinv.hpp:101
Definition: utility.hpp:1481
IntNumVec cols
Dimension numCol * 1, storing the 0-based indices of the nonzero columns.
Definition: pselinv.hpp:245
virtual void SelInv()
SelInv is the main function for the selected inversion.
Definition: pselinv_impl.hpp:2617
LongInt Nnz()
Nnz computes the total number of nonzero elements in the PMatrix.
Definition: pselinv_impl.hpp:4163
void SelInv_lookup_indexes(SuperNodeBufferType &snode, std::vector< LBlock< T > > &LcolRecv, std::vector< UBlock< T > > &UrowRecv, NumMat< T > &AinvBuf, NumMat< T > &UBuf)
SelInv_lookup_indexes.
Definition: pselinv_impl.hpp:391
Definition: TreeBcast.hpp:555
void ComputeDiagUpdate(SuperNodeBufferType &snode)
ComputeDiagUpdate.
Definition: pselinv_impl.hpp:1025
DistSparseMatrix describes a Sparse matrix in the compressed sparse column format (CSC) and distribut...
Definition: sparse_matrix.hpp:91
PMatrixUnsym contains the main data structure and the computational routine for the parallel selected...
Definition: pselinv.hpp:978
Int MYROW(const GridType *g)
MYROW returns my processor row.
Definition: pselinv.hpp:283