#ifndef _PEXSI_PSELINV_IMPL_HPP_
#define _PEXSI_PSELINV_IMPL_HPP_

#define MPI_MAX_COMM (1024)
#define BCAST_THRESHOLD 16
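// Note: as used later in this file, MPI_MAX_COMM caps how many supernodes are
// grouped into a single pipelined level in GetWorkSet. BCAST_THRESHOLD is not
// referenced in this excerpt; it presumably selects between broadcast
// strategies elsewhere.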
inline GridType::GridType ( MPI_Comm Bcomm, int nprow, int npcol )
{
  PushCallStack("GridType::GridType");

  MPI_Initialized( &info );
  if( !info ){
    throw std::logic_error( "MPI has not been initialized." );
  }

  MPI_Comm_group( Bcomm, &comm_group );
  MPI_Comm_create( Bcomm, comm_group, &comm );

  MPI_Comm_rank( comm, &mpirank );
  MPI_Comm_size( comm, &mpisize );
  if( mpisize != nprow * npcol ){
    throw std::logic_error( "mpisize != nprow * npcol." );
  }

  Int myrow = mpirank / npcol;
  Int mycol = mpirank % npcol;

  MPI_Comm_split( comm, myrow, mycol, &rowComm );
  MPI_Comm_split( comm, mycol, myrow, &colComm );

  MPI_Group_free( &comm_group );
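// The grid is laid out row-major over the nprow x npcol processes: for
// example (illustrative values only), with nprow = 2, npcol = 3 and
// mpirank = 4, myrow = 4 / 3 = 1 and mycol = 4 % 3 = 1. rowComm then groups
// the processes sharing a process row, and colComm those sharing a process
// column.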
inline GridType::~GridType ( )
{
  PushCallStack("GridType::~GridType");

  MPI_Comm_free( &rowComm );
  MPI_Comm_free( &colComm );
  MPI_Comm_free( &comm );
void PMatrix<T>::deallocate(){

  ColBlockIdx_.clear();
  RowBlockIdx_.clear();

  isSendToBelow_.Clear();
  isSendToRight_.Clear();
  isSendToDiagonal_.Clear();
  isSendToCrossDiagonal_.Clear();

  isRecvFromBelow_.Clear();
  isRecvFromAbove_.Clear();
  isRecvFromLeft_.Clear();
  isRecvFromCrossDiagonal_.Clear();

  for(int i = 0; i < fwdToBelowTree_.size(); ++i){
    if(fwdToBelowTree_[i] != NULL){
      delete fwdToBelowTree_[i];
    }
  }
  for(int i = 0; i < fwdToRightTree_.size(); ++i){
    if(fwdToRightTree_[i] != NULL){
      delete fwdToRightTree_[i];
    }
  }
  for(int i = 0; i < redToLeftTree_.size(); ++i){
    if(redToLeftTree_[i] != NULL){
      delete redToLeftTree_[i];
    }
  }
  for(int i = 0; i < redToAboveTree_.size(); ++i){
    if(redToAboveTree_[i] != NULL){
      delete redToAboveTree_[i];
    }
  }
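// The four communication trees are held as raw pointers owned by PMatrix,
// which is why deallocate() deletes them individually and why the copy
// operations below duplicate them with clone() instead of copying pointers.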
PMatrix<T> & PMatrix<T>::operator = ( const PMatrix<T> & C ){

  options_ = C.options_;

  ColBlockIdx_ = C.ColBlockIdx_;
  RowBlockIdx_ = C.RowBlockIdx_;

  workingSet_ = C.workingSet_;

  isSendToBelow_ = C.isSendToBelow_;
  isSendToRight_ = C.isSendToRight_;
  isSendToDiagonal_ = C.isSendToDiagonal_;
  isSendToCrossDiagonal_ = C.isSendToCrossDiagonal_;

  isRecvFromBelow_ = C.isRecvFromBelow_;
  isRecvFromAbove_ = C.isRecvFromAbove_;
  isRecvFromLeft_ = C.isRecvFromLeft_;
  isRecvFromCrossDiagonal_ = C.isRecvFromCrossDiagonal_;

  fwdToBelowTree_.resize(C.fwdToBelowTree_.size());
  for(int i = 0; i < C.fwdToBelowTree_.size(); ++i){
    if(C.fwdToBelowTree_[i] != NULL){
      fwdToBelowTree_[i] = C.fwdToBelowTree_[i]->clone();
    }
  }

  fwdToRightTree_.resize(C.fwdToRightTree_.size());
  for(int i = 0; i < C.fwdToRightTree_.size(); ++i){
    if(C.fwdToRightTree_[i] != NULL){
      fwdToRightTree_[i] = C.fwdToRightTree_[i]->clone();
    }
  }

  redToLeftTree_.resize(C.redToLeftTree_.size());
  for(int i = 0; i < C.redToLeftTree_.size(); ++i){
    if(C.redToLeftTree_[i] != NULL){
      redToLeftTree_[i] = C.redToLeftTree_[i]->clone();
    }
  }

  redToAboveTree_.resize(C.redToAboveTree_.size());
  for(int i = 0; i < C.redToAboveTree_.size(); ++i){
    if(C.redToAboveTree_[i] != NULL){
      redToAboveTree_[i] = C.redToAboveTree_[i]->clone();
    }
  }
PMatrix<T>::PMatrix( const PMatrix<T> & C ){

  options_ = C.options_;

  ColBlockIdx_ = C.ColBlockIdx_;
  RowBlockIdx_ = C.RowBlockIdx_;

  workingSet_ = C.workingSet_;

  isSendToBelow_ = C.isSendToBelow_;
  isSendToRight_ = C.isSendToRight_;
  isSendToDiagonal_ = C.isSendToDiagonal_;
  isSendToCrossDiagonal_ = C.isSendToCrossDiagonal_;

  isRecvFromBelow_ = C.isRecvFromBelow_;
  isRecvFromAbove_ = C.isRecvFromAbove_;
  isRecvFromLeft_ = C.isRecvFromLeft_;
  isRecvFromCrossDiagonal_ = C.isRecvFromCrossDiagonal_;

  fwdToBelowTree_.resize(C.fwdToBelowTree_.size());
  for(int i = 0; i < C.fwdToBelowTree_.size(); ++i){
    if(C.fwdToBelowTree_[i] != NULL){
      fwdToBelowTree_[i] = C.fwdToBelowTree_[i]->clone();
    }
  }

  fwdToRightTree_.resize(C.fwdToRightTree_.size());
  for(int i = 0; i < C.fwdToRightTree_.size(); ++i){
    if(C.fwdToRightTree_[i] != NULL){
      fwdToRightTree_[i] = C.fwdToRightTree_[i]->clone();
    }
  }

  redToLeftTree_.resize(C.redToLeftTree_.size());
  for(int i = 0; i < C.redToLeftTree_.size(); ++i){
    if(C.redToLeftTree_[i] != NULL){
      redToLeftTree_[i] = C.redToLeftTree_[i]->clone();
    }
  }

  redToAboveTree_.resize(C.redToAboveTree_.size());
  for(int i = 0; i < C.redToAboveTree_.size(); ++i){
    if(C.redToAboveTree_[i] != NULL){
      redToAboveTree_[i] = C.redToAboveTree_[i]->clone();
    }
  }
PMatrix<T>::PMatrix (
    const SuperNodeType* s,
    /* remaining arguments elided in this excerpt */ )
{
  PushCallStack("PMatrix::PMatrix");
  this->Setup( g, s, o );
PMatrix<T>::~PMatrix ( )
{
  PushCallStack("PMatrix::~PMatrix");
void PMatrix<T>::Setup(
    const SuperNodeType* s,
    /* remaining arguments elided in this excerpt */ )
{
  PushCallStack("PMatrix::Setup");

  ColBlockIdx_.clear();
  RowBlockIdx_.clear();

  L_.resize( this->NumLocalBlockCol() );
  U_.resize( this->NumLocalBlockRow() );

  ColBlockIdx_.resize( this->NumLocalBlockCol() );
  RowBlockIdx_.resize( this->NumLocalBlockRow() );
#if ( _DEBUGlevel_ >= 1 )
  statusOFS << std::endl
    << "PMatrix is constructed. The grid information: " << std::endl;
  statusOFS << "mpirank = " << MYPROC(grid_) << std::endl;
  statusOFS << "myrow = " << MYROW(grid_) << std::endl;
  statusOFS << "mycol = " << MYCOL(grid_) << std::endl;
#endif
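// SelInv_lookup_indexes assembles the dense buffers used by the Schur
// complement GEMM for one supernode: rowPtr/colPtr below are prefix sums of
// the received L-block row counts and U-block column counts, so block (ib,jb)
// of the selected inverse lands at AinvBuf( rowPtr[ib], colPtr[jb] ).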
  TIMER_START(Compute_Sinv_LT_Lookup_Indexes);

  TIMER_START(Build_colptr_rowptr);
  std::vector<Int> rowPtr(LcolRecv.size() + 1);
  std::vector<Int> colPtr(UrowRecv.size() + 1);

  rowPtr[0] = 0;
  for( Int ib = 0; ib < LcolRecv.size(); ib++ ){
    rowPtr[ib+1] = rowPtr[ib] + LcolRecv[ib].numRow;
  }
  colPtr[0] = 0;
  for( Int jb = 0; jb < UrowRecv.size(); jb++ ){
    colPtr[jb+1] = colPtr[jb] + UrowRecv[jb].numCol;
  }

  Int numRowAinvBuf = *rowPtr.rbegin();
  Int numColAinvBuf = *colPtr.rbegin();
  TIMER_STOP(Build_colptr_rowptr);

  TIMER_START(Allocate_lookup);
  AinvBuf.Resize( numRowAinvBuf, numColAinvBuf );
  UBuf.Resize( SuperSize( snode.Index, super_ ), numColAinvBuf );
  TIMER_STOP(Allocate_lookup);

  TIMER_START(Fill_UBuf);
  for( Int jb = 0; jb < UrowRecv.size(); jb++ ){
    UBlock<T>& UB = UrowRecv[jb];
    if( UB.numRow != SuperSize( snode.Index, super_ ) ){
      throw std::logic_error(
          "The size of UB is not right. Something is seriously wrong." );
    }
    lapack::Lacpy( 'A', UB.numRow, UB.numCol, UB.nzval.Data(),
        UB.numRow, UBuf.VecData( colPtr[jb] ),
        SuperSize( snode.Index, super_ ) );
  }
  TIMER_STOP(Fill_UBuf);
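// The JB_Loop pairs every received L block (ib) with every received U block
// (jb), locates the matching block of the already computed part of Sinv
// (searching the L structure when isup >= jsup and the U structure otherwise),
// builds relative row/column index maps, and copies the values into AinvBuf.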
  TIMER_START(JB_Loop);

  for( Int jb = 0; jb < UrowRecv.size(); jb++ ){
    for( Int ib = 0; ib < LcolRecv.size(); ib++ ){
      LBlock<T>& LB = LcolRecv[ib];
      UBlock<T>& UB = UrowRecv[jb];
      Int isup = LB.blockIdx;
      Int jsup = UB.blockIdx;
      T* nzvalAinv = &AinvBuf( rowPtr[ib], colPtr[jb] );
      Int ldAinv = AinvBuf.m();
      if( isup >= jsup ){
        std::vector<LBlock<T> >& LcolSinv = this->L( LBj( jsup, grid_ ) );
        bool isBlockFound = false;
        TIMER_START(PARSING_ROW_BLOCKIDX);
        for( Int ibSinv = 0; ibSinv < LcolSinv.size(); ibSinv++ ){
          if( LcolSinv[ibSinv].blockIdx == isup ){
            LBlock<T>& SinvB = LcolSinv[ibSinv];

            std::vector<Int> relRows( LB.numRow );
            Int* rowsLBPtr    = LB.rows.Data();
            Int* rowsSinvBPtr = SinvB.rows.Data();
            for( Int i = 0; i < LB.numRow; i++ ){
              bool isRowFound = false;
              for( Int i1 = 0; i1 < SinvB.numRow; i1++ ){
                if( rowsLBPtr[i] == rowsSinvBPtr[i1] ){
                  isRowFound = true;
                  relRows[i] = i1;
                  break;
                }
              }
              if( isRowFound == false ){
                std::ostringstream msg;
                msg << "Row " << rowsLBPtr[i]
                  << " in LB cannot find the corresponding row in SinvB" << std::endl
                  << "LB.rows    = " << LB.rows << std::endl
                  << "SinvB.rows = " << SinvB.rows << std::endl;
                throw std::runtime_error( msg.str().c_str() );
              }
            }

            std::vector<Int> relCols( UB.numCol );
            for( Int j = 0; j < UB.numCol; j++ ){
              relCols[j] = UB.cols[j] - SinvColsSta;
            }

            T* nzvalSinv = SinvB.nzval.Data();
            Int ldSinv   = SinvB.numRow;
            for( Int j = 0; j < UB.numCol; j++ ){
              for( Int i = 0; i < LB.numRow; i++ ){
                nzvalAinv[i+j*ldAinv] =
                  nzvalSinv[relRows[i] + relCols[j] * ldSinv];
              }
            }

            isBlockFound = true;
            break;
          }
        }
        TIMER_STOP(PARSING_ROW_BLOCKIDX);
        if( isBlockFound == false ){
          std::ostringstream msg;
          msg << "Block(" << isup << ", " << jsup
            << ") did not find a matching block in Sinv." << std::endl;
          throw std::runtime_error( msg.str().c_str() );
        }
      }
      else{
        std::vector<UBlock<T> >& UrowSinv = this->U( LBi( isup, grid_ ) );
        bool isBlockFound = false;
        TIMER_START(PARSING_COL_BLOCKIDX);
        for( Int jbSinv = 0; jbSinv < UrowSinv.size(); jbSinv++ ){
          if( UrowSinv[jbSinv].blockIdx == jsup ){
            UBlock<T>& SinvB = UrowSinv[jbSinv];

            std::vector<Int> relRows( LB.numRow );
            for( Int i = 0; i < LB.numRow; i++ ){
              relRows[i] = LB.rows[i] - SinvRowsSta;
            }

            std::vector<Int> relCols( UB.numCol );
            Int* colsUBPtr    = UB.cols.Data();
            Int* colsSinvBPtr = SinvB.cols.Data();
            for( Int j = 0; j < UB.numCol; j++ ){
              bool isColFound = false;
              for( Int j1 = 0; j1 < SinvB.numCol; j1++ ){
                if( colsUBPtr[j] == colsSinvBPtr[j1] ){
                  isColFound = true;
                  relCols[j] = j1;
                  break;
                }
              }
              if( isColFound == false ){
                std::ostringstream msg;
                msg << "Col " << colsUBPtr[j]
                  << " in UB cannot find the corresponding column in SinvB" << std::endl
                  << "UB.cols    = " << UB.cols << std::endl
                  << "SinvB.cols = " << SinvB.cols << std::endl;
                throw std::runtime_error( msg.str().c_str() );
              }
            }

            T* nzvalSinv = SinvB.nzval.Data();
            Int ldSinv   = SinvB.numRow;
            for( Int j = 0; j < UB.numCol; j++ ){
              for( Int i = 0; i < LB.numRow; i++ ){
                nzvalAinv[i+j*ldAinv] =
                  nzvalSinv[relRows[i] + relCols[j] * ldSinv];
              }
            }

            isBlockFound = true;
            break;
          }
        }
        TIMER_STOP(PARSING_COL_BLOCKIDX);
        if( isBlockFound == false ){
          std::ostringstream msg;
          msg << "Block(" << isup << ", " << jsup
            << ") did not find a matching block in Sinv." << std::endl;
          throw std::runtime_error( msg.str().c_str() );
        }
      }
  TIMER_STOP(Compute_Sinv_LT_Lookup_Indexes);
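// SendRecvCD_UpdateU exchanges the reduced L updates with the processors that
// own the cross-diagonal (transposed) blocks: the owning column serializes
// RowLocalPtr, BlockIdxLocal and LUpdateBuf, the owning row receives them, and
// each U block is filled by transposing the matching piece of the buffer.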
template<typename T>
inline void PMatrix<T>::SendRecvCD_UpdateU(
    std::vector<SuperNodeBufferType > & arrSuperNodes,
    Int stepSuper)
{
  TIMER_START(Send_CD_Update_U);

  Int sendCount = 0;
  Int recvCount = 0;
  Int sendOffset[stepSuper];
  Int recvOffset[stepSuper];

  for (Int supidx=0; supidx<stepSuper; supidx++){
    SuperNodeBufferType & snode = arrSuperNodes[supidx];
    sendOffset[supidx]=sendCount;
    recvOffset[supidx]=recvCount;
    sendCount += CountSendToCrossDiagonal(snode.Index);
    recvCount += CountRecvFromCrossDiagonal(snode.Index);
  }

  std::vector<MPI_Request > arrMpiReqsSendCD(sendCount, MPI_REQUEST_NULL );
  std::vector<MPI_Request > arrMpiReqsSizeSendCD(sendCount, MPI_REQUEST_NULL );
  std::vector<MPI_Request > arrMpiReqsRecvCD(recvCount, MPI_REQUEST_NULL );
  std::vector<MPI_Request > arrMpiReqsSizeRecvCD(recvCount, MPI_REQUEST_NULL );
  std::vector<std::vector<char> > arrSstrLcolSendCD(sendCount);
  std::vector<int > arrSstrLcolSizeSendCD(sendCount);
  std::vector<std::vector<char> > arrSstrLcolRecvCD(recvCount);
  std::vector<int > arrSstrLcolSizeRecvCD(recvCount);
  for (Int supidx=0; supidx<stepSuper; supidx++){
    SuperNodeBufferType & snode = arrSuperNodes[supidx];

    TIMER_START(Send_L_CrossDiag);

    if( MYCOL( grid_ ) == PCOL( snode.Index, grid_ ) &&
        isSendToCrossDiagonal_(grid_->numProcCol, snode.Index ) ){

      for(Int dstCol = 0; dstCol<grid_->numProcCol; dstCol++){
        if(isSendToCrossDiagonal_(dstCol,snode.Index) ){
          Int dest = PNUM( PROW(snode.Index,grid_), dstCol, grid_ );

          if( MYPROC( grid_ ) != dest ){
            MPI_Request & mpiReqSizeSend = arrMpiReqsSizeSendCD[sendOffset[supidx]+sendIdx];
            MPI_Request & mpiReqSend = arrMpiReqsSendCD[sendOffset[supidx]+sendIdx];

            std::stringstream sstm;
            std::vector<char> & sstrLcolSend = arrSstrLcolSendCD[sendOffset[supidx]+sendIdx];
            Int & sstrSize = arrSstrLcolSizeSendCD[sendOffset[supidx]+sendIdx];

            serialize( snode.RowLocalPtr, sstm, NO_MASK );
            serialize( snode.BlockIdxLocal, sstm, NO_MASK );
            serialize( snode.LUpdateBuf, sstm, NO_MASK );

            sstrLcolSend.resize( Size(sstm) );
            sstm.read( &sstrLcolSend[0], sstrLcolSend.size() );
            sstrSize = sstrLcolSend.size();

            MPI_Isend( &sstrSize, sizeof(sstrSize), MPI_BYTE, dest,
                IDX_TO_TAG(snode.Index,SELINV_TAG_L_SIZE_CD), grid_->comm, &mpiReqSizeSend );
            MPI_Isend( (void*)&sstrLcolSend[0], sstrSize, MPI_BYTE, dest,
                IDX_TO_TAG(snode.Index,SELINV_TAG_L_CONTENT_CD), grid_->comm, &mpiReqSend );

            PROFILE_COMM(MYPROC(this->grid_),dest,IDX_TO_TAG(snode.Index,SELINV_TAG_L_SIZE_CD),sizeof(sstrSize));
            PROFILE_COMM(MYPROC(this->grid_),dest,IDX_TO_TAG(snode.Index,SELINV_TAG_L_CONTENT_CD),sstrSize);
          }
        }
      }
    }
    TIMER_STOP(Send_L_CrossDiag);
  }
  for (Int supidx=0; supidx<stepSuper; supidx++){
    SuperNodeBufferType & snode = arrSuperNodes[supidx];

    if( MYROW( grid_ ) == PROW( snode.Index, grid_ ) &&
        isRecvFromCrossDiagonal_(grid_->numProcRow, snode.Index ) ){

      for(Int srcRow = 0; srcRow<grid_->numProcRow; srcRow++){
        if(isRecvFromCrossDiagonal_(srcRow,snode.Index) ){
          Int src = PNUM(srcRow, PCOL(snode.Index,grid_), grid_);
          if( MYPROC( grid_ ) != src ){
            Int & sstrSize = arrSstrLcolSizeRecvCD[recvOffset[supidx]+recvIdx];
            MPI_Request & mpiReqSizeRecv = arrMpiReqsSizeRecvCD[recvOffset[supidx]+recvIdx];
            MPI_Irecv( &sstrSize, 1, MPI_INT, src,
                IDX_TO_TAG(snode.Index,SELINV_TAG_L_SIZE_CD), grid_->comm, &mpiReqSizeRecv );
          }
        }
      }
    }
  }

  mpi::Waitall(arrMpiReqsSizeRecvCD);
  for (Int supidx=0; supidx<stepSuper; supidx++){
    SuperNodeBufferType & snode = arrSuperNodes[supidx];

    if( MYROW( grid_ ) == PROW( snode.Index, grid_ ) &&
        isRecvFromCrossDiagonal_(grid_->numProcRow, snode.Index ) ){

      for(Int srcRow = 0; srcRow<grid_->numProcRow; srcRow++){
        if(isRecvFromCrossDiagonal_(srcRow,snode.Index) ){
          Int src = PNUM(srcRow, PCOL(snode.Index,grid_), grid_);
          if( MYPROC( grid_ ) != src ){
            Int & sstrSize = arrSstrLcolSizeRecvCD[recvOffset[supidx]+recvIdx];
            std::vector<char> & sstrLcolRecv = arrSstrLcolRecvCD[recvOffset[supidx]+recvIdx];
            MPI_Request & mpiReqRecv = arrMpiReqsRecvCD[recvOffset[supidx]+recvIdx];
            sstrLcolRecv.resize( sstrSize );
            MPI_Irecv( (void*)&sstrLcolRecv[0], sstrSize, MPI_BYTE, src,
                IDX_TO_TAG(snode.Index,SELINV_TAG_L_CONTENT_CD), grid_->comm, &mpiReqRecv );
          }
        }
      }
    }
  }

  mpi::Waitall(arrMpiReqsRecvCD);
  for (Int supidx=0; supidx<stepSuper; supidx++){
    SuperNodeBufferType & snode = arrSuperNodes[supidx];

    if( MYROW( grid_ ) == PROW( snode.Index, grid_ ) &&
        isRecvFromCrossDiagonal_(grid_->numProcRow, snode.Index ) ){

#if ( _DEBUGlevel_ >= 1 )
      statusOFS << std::endl << " ["<<snode.Index<<"] "
        << "Update the upper triangular block" << std::endl << std::endl;
      statusOFS << std::endl << " ["<<snode.Index<<"] "
        << "blockIdxLocal:" << snode.BlockIdxLocal << std::endl << std::endl;
      statusOFS << std::endl << " ["<<snode.Index<<"] "
        << "rowLocalPtr:" << snode.RowLocalPtr << std::endl << std::endl;
#endif

      std::vector<UBlock<T> >& Urow = this->U( LBi( snode.Index, grid_ ) );
      std::vector<bool> isBlockFound(Urow.size(), false);

      for(Int srcRow = 0; srcRow<grid_->numProcRow; srcRow++){
        if(isRecvFromCrossDiagonal_(srcRow,snode.Index) ){
          Int src = PNUM(srcRow, PCOL(snode.Index,grid_), grid_);
          TIMER_START(Recv_L_CrossDiag);

          std::vector<Int> rowLocalPtrRecv;
          std::vector<Int> blockIdxLocalRecv;
          NumMat<T> UUpdateBuf;

          if( MYPROC( grid_ ) != src ){
            std::stringstream sstm;
            Int & sstrSize = arrSstrLcolSizeRecvCD[recvOffset[supidx]+recvIdx];
            std::vector<char> & sstrLcolRecv = arrSstrLcolRecvCD[recvOffset[supidx]+recvIdx];
            sstm.write( &sstrLcolRecv[0], sstrSize );

            deserialize( rowLocalPtrRecv, sstm, NO_MASK );
            deserialize( blockIdxLocalRecv, sstm, NO_MASK );
            deserialize( UUpdateBuf, sstm, NO_MASK );
          }
          else{
            rowLocalPtrRecv   = snode.RowLocalPtr;
            blockIdxLocalRecv = snode.BlockIdxLocal;
            UUpdateBuf = snode.LUpdateBuf;
          }

          TIMER_STOP(Recv_L_CrossDiag);

#if ( _DEBUGlevel_ >= 1 )
          statusOFS << " ["<<snode.Index<<"] P"<<MYPROC(grid_)<<" ("<<MYROW(grid_)
            << ","<<MYCOL(grid_)<<") <--- LBj("<<snode.Index<<") <--- P"<<src<<std::endl;
          statusOFS << std::endl << " ["<<snode.Index<<"] "
            << "rowLocalPtrRecv:" << rowLocalPtrRecv << std::endl << std::endl;
          statusOFS << std::endl << " ["<<snode.Index<<"] "
            << "blockIdxLocalRecv:" << blockIdxLocalRecv << std::endl << std::endl;
#endif

          for( Int ib = 0; ib < blockIdxLocalRecv.size(); ib++ ){
            for( Int jb = 0; jb < Urow.size(); jb++ ){
              UBlock<T> & UB = Urow[jb];
              if( UB.blockIdx == blockIdxLocalRecv[ib] ){
                NumMat<T> Ltmp( UB.numCol, UB.numRow );
                lapack::Lacpy( 'A', Ltmp.m(), Ltmp.n(),
                    &UUpdateBuf( rowLocalPtrRecv[ib], 0 ),
                    UUpdateBuf.m(), Ltmp.Data(), Ltmp.m() );
                isBlockFound[jb] = true;
                Transpose( Ltmp, UB.nzval );
                break;
              }
            }
          }
        }
      }

      for( Int jb = 0; jb < Urow.size(); jb++ ){
        if( !isBlockFound[jb] ){
          throw std::logic_error(
              "UBlock cannot find its update. Something is seriously wrong." );
        }
      }
    }
  }

  TIMER_STOP(Send_CD_Update_U);

  mpi::Waitall(arrMpiReqsSizeSendCD);
  mpi::Waitall(arrMpiReqsSendCD);
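// UnpackData rebuilds LcolRecv / UrowRecv for one supernode from the received
// serialized panels; processors that already own the row (resp. column) of the
// supernode copy their local U (resp. L) blocks instead of deserializing.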
#if ( _DEBUGlevel_ >= 1 )
  statusOFS << std::endl << "["<<snode.Index<<"] "
    << "Unpack the received data for processors participate in Gemm. " << std::endl << std::endl;
#endif

  if( MYROW( grid_ ) != PROW( snode.Index, grid_ ) ){
    std::stringstream sstm;
    sstm.write( &snode.SstrUrowRecv[0], snode.SstrUrowRecv.size() );
    std::vector<Int> mask( UBlockMask::TOTAL_NUMBER, 1 );
    Int numUBlock;
    deserialize( numUBlock, sstm, NO_MASK );
    UrowRecv.resize( numUBlock );
    for( Int jb = 0; jb < numUBlock; jb++ ){
      deserialize( UrowRecv[jb], sstm, mask );
    }
  }
  else{
    UrowRecv.resize(this->U( LBi( snode.Index, grid_ ) ).size());
    std::copy(this->U( LBi( snode.Index, grid_ ) ).begin(),
        this->U( LBi( snode.Index, grid_ ) ).end(), UrowRecv.begin());
  }

  if( MYCOL( grid_ ) != PCOL( snode.Index, grid_ ) ){
    std::stringstream sstm;
    sstm.write( &snode.SstrLcolRecv[0], snode.SstrLcolRecv.size() );
    std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
    mask[LBlockMask::NZVAL] = 0;
    Int numLBlock;
    deserialize( numLBlock, sstm, NO_MASK );
    LcolRecv.resize( numLBlock );
    for( Int ib = 0; ib < numLBlock; ib++ ){
      deserialize( LcolRecv[ib], sstm, mask );
    }
  }
  else{
    std::vector<LBlock<T> >& Lcol = this->L( LBj( snode.Index, grid_ ) );
    Int startIdx = ( MYROW( grid_ ) == PROW( snode.Index, grid_ ) ) ? 1 : 0;
    LcolRecv.resize( Lcol.size() - startIdx );
    std::copy(Lcol.begin()+startIdx, Lcol.end(), LcolRecv.begin());
  }
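// ComputeDiagUpdate accumulates the diagonal-block contribution on the owning
// column: for each local block below the diagonal it performs
// DiagBuf -= LUpdateBuf(block)^T * L(block) through the 'T','N' GEMM below.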
template<typename T>
inline void PMatrix<T>::ComputeDiagUpdate(SuperNodeBufferType & snode)
{
  if( MYCOL( grid_ ) == PCOL( snode.Index, grid_ ) ){
#if ( _DEBUGlevel_ >= 2 )
    statusOFS << std::endl << "["<<snode.Index<<"] "
      << "Updating the diagonal block" << std::endl << std::endl;
#endif
    std::vector<LBlock<T> >& Lcol = this->L( LBj( snode.Index, grid_ ) );

    snode.DiagBuf.Resize( SuperSize( snode.Index, super_ ),
        SuperSize( snode.Index, super_ ) );
    SetValue(snode.DiagBuf, ZERO<T>());

    Int startIb = ( MYROW( grid_ ) == PROW( snode.Index, grid_ ) ) ? 1 : 0;
    for( Int ib = startIb; ib < Lcol.size(); ib++ ){
      LBlock<T> & LB = Lcol[ib];

      gemm_stat.push_back(snode.DiagBuf.m());
      gemm_stat.push_back(snode.DiagBuf.n());
      gemm_stat.push_back(Lcol[ib].numRow);

      blas::Gemm( 'T', 'N', snode.DiagBuf.m(), snode.DiagBuf.n(), LB.numRow,
          MINUS_ONE<T>(), &snode.LUpdateBuf( snode.RowLocalPtr[ib-startIb], 0 ),
          snode.LUpdateBuf.m(),
          LB.nzval.Data(), LB.nzval.m(), ONE<T>(),
          snode.DiagBuf.Data(), snode.DiagBuf.m() );
    }

#if ( _DEBUGlevel_ >= 1 )
    statusOFS << std::endl << "["<<snode.Index<<"] "
      << "Updated the diagonal block" << std::endl << std::endl;
#endif
  }
template<typename T>
void PMatrix<T>::GetEtree(std::vector<Int> & etree_supno )
{
  PushCallStack("PMatrix::GetEtree");
  double begin = MPI_Wtime( );
  Int nsupers = this->NumSuper();

  if( options_->ColPerm != "PARMETIS" ) {
    etree_supno.resize(this->NumSuper());
    for(Int i = 0; i < superNode->etree.m(); ++i){
      Int curSnode = superNode->superIdx[i];
      Int parentSnode = (superNode->etree[i] >= superNode->etree.m()) ?
        this->NumSuper() : superNode->superIdx[superNode->etree[i]];
      if( curSnode != parentSnode){
        etree_supno[curSnode] = parentSnode;
      }
    }
  }
  else{
    std::vector<Int> etree_supno_l( nsupers, nsupers );
    for( Int ksup = 0; ksup < nsupers; ksup++ ){
      if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
        std::vector<LBlock<T> >& Lcol = this->L( LBj(ksup, grid_) );

        Int firstBlk = 0;
        if( MYROW( grid_ ) == PROW( ksup, grid_ ) )
          firstBlk = 1;

        for( Int ib = firstBlk; ib < Lcol.size(); ib++ ){
          etree_supno_l[ksup] = std::min(etree_supno_l[ksup], Lcol[ib].blockIdx);
        }
      }
    }

#if ( _DEBUGlevel_ >= 1 )
    statusOFS << std::endl << " Local supernodal elimination tree is "
      << etree_supno_l << std::endl << std::endl;
#endif

    etree_supno.resize( nsupers );
    mpi::Allreduce( (Int*) &etree_supno_l[0], (Int*) &etree_supno[0],
        nsupers, MPI_MIN, grid_->comm );
    etree_supno[nsupers-1] = nsupers;
  }

  double end = MPI_Wtime( );
  statusOFS << "Building the list took " << end-begin << "s" << std::endl;
template<typename T>
inline void PMatrix<T>::GetWorkSet(
    std::vector<Int> & snodeEtree,
    std::vector<std::vector<Int> > & WSet)
{
  TIMER_START(Compute_WorkSet);

  if (options_->maxPipelineDepth!=1){
    Int rootParent = numSuper;
    level(rootParent) = -1;
    for(Int i=rootParent-1; i>=0; i-- ){
      level(i) = level(snodeEtree[i])+1;
      numLevel = std::max(numLevel, level(i));
    }

    for(Int i=rootParent-1; i>=0; i-- ){
      if(level[i]>=0){ levelSize(level(i))++; }
    }

    WSet.resize(numLevel, std::vector<Int>());
    for(Int i=0; i<numLevel; i++ ){ WSet[i].reserve(levelSize(i)); }

    for(Int i=rootParent-1; i>=0; i-- ){
      WSet[level(i)].push_back(i);
    }

    Int limit = (options_->maxPipelineDepth>0) ?
      std::min(MPI_MAX_COMM, options_->maxPipelineDepth) : MPI_MAX_COMM;
    for (Int lidx=0; lidx<WSet.size() ; lidx++){
      if(WSet[lidx].size()>limit)
      {
        std::vector<std::vector<Int> >::iterator pos = WSet.begin()+lidx+1;
        WSet.insert(pos, std::vector<Int>());
        WSet[lidx+1].insert(WSet[lidx+1].begin(),
            WSet[lidx].begin() + limit, WSet[lidx].end());
        WSet[lidx].erase(WSet[lidx].begin()+limit, WSet[lidx].end());
      }
    }
  }
  else{
    for( Int ksup = numSuper - 2; ksup >= 0; ksup-- ){
      WSet.push_back(std::vector<Int>());
      WSet.back().push_back(ksup);
    }
  }

  TIMER_STOP(Compute_WorkSet);
#if ( _DEBUGlevel_ >= 1 )
  for (Int lidx=0; lidx<WSet.size() ; lidx++){
    statusOFS << std::endl << "L"<< lidx << " is: {";
    for (Int supidx=0; supidx<WSet[lidx].size() ; supidx++){
      statusOFS << WSet[lidx][supidx] << " ["<<snodeEtree[WSet[lidx][supidx]]<<"] ";
    }
    statusOFS << " }"<< std::endl;
  }
#endif
template<typename T>
inline void PMatrix<T>::SelInvIntra_P2p(Int lidx)
{
#if defined (PROFILE) || defined(PMPI) || defined(USE_TAU)
  Real begin_SendULWaitContentFirst, end_SendULWaitContentFirst, time_SendULWaitContentFirst = 0;
#endif

  std::vector<std::vector<Int> > & superList = this->WorkingSet();
  Int numSteps = superList.size();
  Int stepSuper = superList[lidx].size();

  TIMER_START(AllocateBuffer);

  std::vector<std::vector<MPI_Request> > arrMpireqsSendToBelow;
  arrMpireqsSendToBelow.resize( stepSuper, std::vector<MPI_Request>( 2 * grid_->numProcRow, MPI_REQUEST_NULL ));
  std::vector<std::vector<MPI_Request> > arrMpireqsSendToRight;
  arrMpireqsSendToRight.resize( stepSuper, std::vector<MPI_Request>( 2 * grid_->numProcCol, MPI_REQUEST_NULL ));

  std::vector<MPI_Request> arrMpireqsSendToLeft;
  arrMpireqsSendToLeft.resize(stepSuper, MPI_REQUEST_NULL );
  std::vector<MPI_Request> arrMpireqsSendToAbove;
  arrMpireqsSendToAbove.resize(stepSuper, MPI_REQUEST_NULL );

  std::vector<MPI_Request> arrMpireqsRecvSizeFromAny;
  arrMpireqsRecvSizeFromAny.resize(stepSuper*2, MPI_REQUEST_NULL);
  std::vector<MPI_Request> arrMpireqsRecvContentFromAny;
  arrMpireqsRecvContentFromAny.resize(stepSuper*2, MPI_REQUEST_NULL);

  std::vector<SuperNodeBufferType> arrSuperNodes(stepSuper);
  for (Int supidx=0; supidx<stepSuper; supidx++){
    arrSuperNodes[supidx].Index = superList[lidx][supidx];
  }

  int numSentToLeft = 0;
  std::vector<int> reqSentToLeft;

  TIMER_STOP(AllocateBuffer);
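// The remainder of SelInvIntra_P2p processes the supernodes of superList[lidx]
// (one level of the working set produced by GetWorkSet) in overlapping phases:
// broadcast the L and U panels along the trees, compute the Sinv * L^T GEMMs
// while forwarding messages, reduce the L updates to the owning column and the
// diagonal updates to the diagonal owner, exchange the cross-diagonal data to
// update U, and finally copy the reduced updates back into L.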
  PushCallStack("PMatrix::SelInv_P2p::UpdateL");

#if ( _DEBUGlevel_ >= 1 )
  statusOFS << std::endl << "Communication to the Schur complement." << std::endl << std::endl;
#endif
  TIMER_START(IRecv_Content_UL);

  for (Int supidx=0; supidx<stepSuper ; supidx++){
    SuperNodeBufferType & snode = arrSuperNodes[supidx];
    MPI_Request * mpireqsRecvFromAbove = &arrMpireqsRecvContentFromAny[supidx*2];
    MPI_Request * mpireqsRecvFromLeft  = &arrMpireqsRecvContentFromAny[supidx*2+1];

    if( isRecvFromAbove_( snode.Index ) &&
        MYROW( grid_ ) != PROW( snode.Index, grid_ ) ){

      TreeBcast * bcastUTree = fwdToBelowTree_[snode.Index];
      if(bcastUTree!=NULL){
        Int myRoot = bcastUTree->GetRoot();
        snode.SizeSstrUrowRecv = bcastUTree->GetMsgSize();
        snode.SstrUrowRecv.resize( snode.SizeSstrUrowRecv );
        MPI_Irecv( &snode.SstrUrowRecv[0], snode.SizeSstrUrowRecv, MPI_BYTE,
            myRoot, IDX_TO_TAG(snode.Index,SELINV_TAG_U_CONTENT),
            grid_->colComm, mpireqsRecvFromAbove );
#if ( _DEBUGlevel_ >= 1 )
        statusOFS << std::endl << "["<<snode.Index<<"] "
          << "Receiving U " << snode.SizeSstrUrowRecv << " BYTES from "<<myRoot
          << " on tag "<<IDX_TO_TAG(snode.Index,SELINV_TAG_U_CONTENT)<< std::endl << std::endl;
#endif
      }
    }

    if( isRecvFromLeft_( snode.Index ) &&
        MYCOL( grid_ ) != PCOL( snode.Index, grid_ ) ){
      TreeBcast * bcastLTree = fwdToRightTree_[snode.Index];
      if(bcastLTree!=NULL){
        Int myRoot = bcastLTree->GetRoot();
        snode.SizeSstrLcolRecv = bcastLTree->GetMsgSize();
        snode.SstrLcolRecv.resize(snode.SizeSstrLcolRecv);
        MPI_Irecv( &snode.SstrLcolRecv[0], snode.SizeSstrLcolRecv, MPI_BYTE,
            myRoot, IDX_TO_TAG(snode.Index,SELINV_TAG_L_CONTENT),
            grid_->rowComm, mpireqsRecvFromLeft );
#if ( _DEBUGlevel_ >= 1 )
        statusOFS << std::endl << "["<<snode.Index<<"] "
          << "Receiving L " << snode.SizeSstrLcolRecv << " BYTES from "<<myRoot
          << " on tag "<<IDX_TO_TAG(snode.Index,SELINV_TAG_L_CONTENT)<< std::endl << std::endl;
#endif
      }
    }
  }
  TIMER_STOP(IRecv_Content_UL);
  TIMER_START(ISend_Content_UL);
  for (Int supidx=0; supidx<stepSuper; supidx++){
    SuperNodeBufferType & snode = arrSuperNodes[supidx];
    std::vector<MPI_Request> & mpireqsSendToBelow = arrMpireqsSendToBelow[supidx];
    std::vector<MPI_Request> & mpireqsSendToRight = arrMpireqsSendToRight[supidx];

#if ( _DEBUGlevel_ >= 1 )
    statusOFS << std::endl << "["<<snode.Index<<"] "
      << "Communication for the U part." << std::endl << std::endl;
#endif

    if( MYROW( grid_ ) == PROW( snode.Index, grid_ ) ){
      std::vector<UBlock<T> >& Urow = this->U( LBi(snode.Index, grid_) );

      TIMER_START(Serialize_UL);
      std::stringstream sstm;

      std::vector<Int> mask( UBlockMask::TOTAL_NUMBER, 1 );

      serialize( (Int)Urow.size(), sstm, NO_MASK );
      for( Int jb = 0; jb < Urow.size(); jb++ ){
        serialize( Urow[jb], sstm, mask );
      }
      snode.SstrUrowSend.resize( Size( sstm ) );
      sstm.read( &snode.SstrUrowSend[0], snode.SstrUrowSend.size() );
      snode.SizeSstrUrowSend = snode.SstrUrowSend.size();
      TIMER_STOP(Serialize_UL);

      TreeBcast * bcastUTree = fwdToBelowTree_[snode.Index];
      if(bcastUTree!=NULL){
        bcastUTree->ForwardMessage((char*)&snode.SstrUrowSend[0], snode.SizeSstrUrowSend,
            IDX_TO_TAG(snode.Index,SELINV_TAG_U_CONTENT), &mpireqsSendToBelow[0] );
        for( Int idxRecv = 0; idxRecv < bcastUTree->GetDestCount(); ++idxRecv ){
          Int iProcRow = bcastUTree->GetDest(idxRecv);
#if ( _DEBUGlevel_ >= 1 )
          statusOFS << std::endl << "["<<snode.Index<<"] "
            << "Sending U " << snode.SizeSstrUrowSend << " BYTES on tag "
            << IDX_TO_TAG(snode.Index,SELINV_TAG_U_CONTENT) << std::endl << std::endl;
#endif
        }
      }
    }

#if ( _DEBUGlevel_ >= 1 )
    statusOFS << std::endl << "["<<snode.Index<<"] "
      << "Communication for the L part." << std::endl << std::endl;
#endif

    if( MYCOL( grid_ ) == PCOL( snode.Index, grid_ ) ){
      std::vector<LBlock<T> >& Lcol = this->L( LBj(snode.Index, grid_) );
      TIMER_START(Serialize_UL);

      std::stringstream sstm;
      std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
      mask[LBlockMask::NZVAL] = 0;

      if( MYROW( grid_ ) == PROW( snode.Index, grid_ ) )
        serialize( (Int)Lcol.size() - 1, sstm, NO_MASK );
      else
        serialize( (Int)Lcol.size(), sstm, NO_MASK );

      for( Int ib = 0; ib < Lcol.size(); ib++ ){
        if( Lcol[ib].blockIdx > snode.Index ){
#if ( _DEBUGlevel_ >= 2 )
          statusOFS << std::endl << "["<<snode.Index<<"] "
            << "Serializing Block index " << Lcol[ib].blockIdx << std::endl;
#endif
          serialize( Lcol[ib], sstm, mask );
        }
      }
      snode.SstrLcolSend.resize( Size( sstm ) );
      sstm.read( &snode.SstrLcolSend[0], snode.SstrLcolSend.size() );
      snode.SizeSstrLcolSend = snode.SstrLcolSend.size();
      TIMER_STOP(Serialize_UL);

      TreeBcast * bcastLTree = fwdToRightTree_[snode.Index];
      if(bcastLTree!=NULL){
        bcastLTree->ForwardMessage((char*)&snode.SstrLcolSend[0], snode.SizeSstrLcolSend,
            IDX_TO_TAG(snode.Index,SELINV_TAG_L_CONTENT), &mpireqsSendToRight[0] );

        for( Int idxRecv = 0; idxRecv < bcastLTree->GetDestCount(); ++idxRecv ){
          Int iProcCol = bcastLTree->GetDest(idxRecv);
#if ( _DEBUGlevel_ >= 1 )
          statusOFS << std::endl << "["<<snode.Index<<"] "
            << "Sending L " << snode.SizeSstrLcolSend << " BYTES on tag "
            << IDX_TO_TAG(snode.Index,SELINV_TAG_L_CONTENT) << std::endl << std::endl;
#endif
        }
      }
    }
  }
  TIMER_STOP(ISend_Content_UL);
  vector<char> redLdone(stepSuper,0);
  for (Int supidx=0; supidx<stepSuper; supidx++){
    SuperNodeBufferType & snode = arrSuperNodes[supidx];
    TreeReduce<T> * redLTree = redToLeftTree_[snode.Index];

    if(redLTree != NULL){
      redLTree->SetTag(IDX_TO_TAG(snode.Index,SELINV_TAG_L_REDUCE));
      redLTree->AllocRecvBuffers();
      redLTree->PostFirstRecv();
    }
  }
  TIMER_START(Compute_Sinv_LT);

  Int msgForwarded = 0;
  Int msgToFwd = 0;
  Int gemmProcessed = 0;
  Int gemmToDo = 0;

  std::list<Int> readySupidx;

  for(Int supidx = 0; supidx<stepSuper; supidx++){
    SuperNodeBufferType & snode = arrSuperNodes[supidx];

    if( isRecvFromAbove_( snode.Index ) && isRecvFromLeft_( snode.Index )){
      gemmToDo++;

      if( MYCOL( grid_ ) == PCOL( snode.Index, grid_ ) ){
        snode.isReady++;
      }

      if( MYROW( grid_ ) == PROW( snode.Index, grid_ ) ){
        snode.isReady++;
      }

      if(snode.isReady==2){
        readySupidx.push_back(supidx);
#if ( _DEBUGlevel_ >= 1 )
        statusOFS<<std::endl<<"Locally processing ["<<snode.Index<<"]"<<std::endl;
#endif
      }
    }
    else if( (isRecvFromLeft_( snode.Index ) ) &&
        MYCOL( grid_ ) != PCOL( snode.Index, grid_ ) )
    {
      TreeReduce<T> * redLTree = redToLeftTree_[snode.Index];

      if(redLTree != NULL){
        TIMER_START(Reduce_Sinv_LT_Isend);
        redLTree->SetLocalBuffer(NULL);
        redLTree->SetDataReady(true);
        bool done = redLTree->Progress();
        TIMER_STOP(Reduce_Sinv_LT_Isend);
      }
    }

    if( MYROW(grid_) != PROW(snode.Index,grid_) ){
      TreeBcast * bcastUTree = fwdToBelowTree_[snode.Index];
      if(bcastUTree != NULL){
        if(bcastUTree->GetDestCount()>0){
          msgToFwd++;
        }
      }
    }

    if( MYCOL(grid_) != PCOL(snode.Index,grid_) ){
      TreeBcast * bcastLTree = fwdToRightTree_[snode.Index];
      if(bcastLTree != NULL){
        if(bcastLTree->GetDestCount()>0){
          msgToFwd++;
        }
      }
    }
  }

#if ( _DEBUGlevel_ >= 1 )
  statusOFS<<std::endl<<"gemmToDo ="<<gemmToDo<<std::endl;
  statusOFS<<std::endl<<"msgToFwd ="<<msgToFwd<<std::endl;
#endif

#if defined (PROFILE)
  end_SendULWaitContentFirst=0;
  begin_SendULWaitContentFirst=0;
#endif
  while(gemmProcessed<gemmToDo || msgForwarded < msgToFwd)
  {
    Int reqidx = MPI_UNDEFINED;
    Int supidx = -1;

    int reqIndices[arrMpireqsRecvContentFromAny.size()];
    int numRecv = 0;

    do{
      TIMER_START(WaitContent_UL);
#if defined(PROFILE)
      if(begin_SendULWaitContentFirst==0){
        begin_SendULWaitContentFirst=1;
        TIMER_START(WaitContent_UL_First);
      }
#endif

      numRecv = 0;
      MPI_Waitsome(2*stepSuper, &arrMpireqsRecvContentFromAny[0],
          &numRecv, reqIndices, MPI_STATUSES_IGNORE);

      for(int i = 0; i < numRecv; i++){
        reqidx = reqIndices[i];

        if(reqidx!=MPI_UNDEFINED)
        {
          supidx = reqidx / 2;
          SuperNodeBufferType & snode = arrSuperNodes[supidx];

          if(reqidx%2==0){
            TreeBcast * bcastUTree = fwdToBelowTree_[snode.Index];
            if(bcastUTree != NULL){
              if(bcastUTree->GetDestCount()>0){
                std::vector<MPI_Request> & mpireqsSendToBelow = arrMpireqsSendToBelow[supidx];
#if ( _DEBUGlevel_ >= 1 )
                for( Int idxRecv = 0; idxRecv < bcastUTree->GetDestCount(); ++idxRecv ){
                  Int iProcRow = bcastUTree->GetDest(idxRecv);
                  statusOFS << std::endl << "["<<snode.Index<<"] "
                    << "Forwarding U " << snode.SizeSstrUrowRecv << " BYTES to "<<iProcRow
                    << " on tag "<<IDX_TO_TAG(snode.Index,SELINV_TAG_U_CONTENT)<< std::endl << std::endl;
                }
#endif

                bcastUTree->ForwardMessage( (char*)&snode.SstrUrowRecv[0], snode.SizeSstrUrowRecv,
                    IDX_TO_TAG(snode.Index,SELINV_TAG_U_CONTENT), &mpireqsSendToBelow[0] );
#if ( _DEBUGlevel_ >= 1 )
                for( Int idxRecv = 0; idxRecv < bcastUTree->GetDestCount(); ++idxRecv ){
                  Int iProcRow = bcastUTree->GetDest(idxRecv);
                  statusOFS << std::endl << "["<<snode.Index<<"] "
                    << "Forwarded U " << snode.SizeSstrUrowRecv << " BYTES to "<<iProcRow
                    << " on tag "<<IDX_TO_TAG(snode.Index,SELINV_TAG_U_CONTENT)<< std::endl << std::endl;
                }
#endif
                msgForwarded++;
              }
            }
          }
          else if(reqidx%2==1){
            TreeBcast * bcastLTree = fwdToRightTree_[snode.Index];
            if(bcastLTree != NULL){
              if(bcastLTree->GetDestCount()>0){
                std::vector<MPI_Request> & mpireqsSendToRight = arrMpireqsSendToRight[supidx];
#if ( _DEBUGlevel_ >= 1 )
                for( Int idxRecv = 0; idxRecv < bcastLTree->GetDestCount(); ++idxRecv ){
                  Int iProcCol = bcastLTree->GetDest(idxRecv);
                  statusOFS << std::endl << "["<<snode.Index<<"] "
                    << "Forwarding L " << snode.SizeSstrLcolRecv << " BYTES to "<<iProcCol
                    << " on tag "<<IDX_TO_TAG(snode.Index,SELINV_TAG_L_CONTENT)<< std::endl << std::endl;
                }
#endif

                bcastLTree->ForwardMessage( (char*)&snode.SstrLcolRecv[0], snode.SizeSstrLcolRecv,
                    IDX_TO_TAG(snode.Index,SELINV_TAG_L_CONTENT), &mpireqsSendToRight[0] );
#if ( _DEBUGlevel_ >= 1 )
                for( Int idxRecv = 0; idxRecv < bcastLTree->GetDestCount(); ++idxRecv ){
                  Int iProcCol = bcastLTree->GetDest(idxRecv);
                  statusOFS << std::endl << "["<<snode.Index<<"] "
                    << "Forwarded L " << snode.SizeSstrLcolRecv << " BYTES to "<<iProcCol
                    << " on tag "<<IDX_TO_TAG(snode.Index,SELINV_TAG_L_CONTENT)<< std::endl << std::endl;
                }
#endif
                msgForwarded++;
              }
            }
          }

#if ( _DEBUGlevel_ >= 1 )
          statusOFS<<std::endl<<"Received data for ["<<snode.Index<<"] reqidx%2="
            <<reqidx%2<<" is ready ?"<<snode.isReady<<std::endl;
#endif

          if( isRecvFromAbove_( snode.Index ) && isRecvFromLeft_( snode.Index )){
            snode.isReady++;

            if(snode.isReady==2){
              readySupidx.push_back(supidx);

#if defined(PROFILE)
              if(end_SendULWaitContentFirst==0){
                TIMER_STOP(WaitContent_UL_First);
                end_SendULWaitContentFirst=1;
              }
#endif
            }
          }
        }
      }

      TIMER_STOP(WaitContent_UL);
    } while( (gemmProcessed<gemmToDo && readySupidx.size()==0) ||
        (gemmProcessed==gemmToDo && msgForwarded<msgToFwd) );
    if(readySupidx.size()>0)
    {
      supidx = readySupidx.back();
      readySupidx.pop_back();
      SuperNodeBufferType & snode = arrSuperNodes[supidx];

      if( isRecvFromAbove_( snode.Index ) && isRecvFromLeft_( snode.Index ) ){

        std::vector<LBlock<T> > LcolRecv;
        std::vector<UBlock<T> > UrowRecv;
        NumMat<T> AinvBuf, UBuf;

        UnpackData(snode, LcolRecv, UrowRecv);

        SelInv_lookup_indexes(snode, LcolRecv, UrowRecv, AinvBuf, UBuf);

        snode.LUpdateBuf.Resize( AinvBuf.m(), SuperSize( snode.Index, super_ ) );

        gemm_stat.push_back(AinvBuf.m());
        gemm_stat.push_back(UBuf.m());
        gemm_stat.push_back(AinvBuf.n());

        TIMER_START(Compute_Sinv_LT_GEMM);
        blas::Gemm( 'N', 'T', AinvBuf.m(), UBuf.m(), AinvBuf.n(), MINUS_ONE<T>(),
            AinvBuf.Data(), AinvBuf.m(),
            UBuf.Data(), UBuf.m(), ZERO<T>(),
            snode.LUpdateBuf.Data(), snode.LUpdateBuf.m() );
        TIMER_STOP(Compute_Sinv_LT_GEMM);

#if ( _DEBUGlevel_ >= 2 )
        statusOFS << std::endl << "["<<snode.Index<<"] "
          << "snode.LUpdateBuf: " << snode.LUpdateBuf << std::endl;
#endif
      }

      TreeReduce<T> * redLTree = redToLeftTree_[snode.Index];
      if(redLTree != NULL){
        TIMER_START(Reduce_Sinv_LT_Isend);
        redLTree->SetLocalBuffer(snode.LUpdateBuf.Data());
        redLTree->SetDataReady(true);
        bool done = redLTree->Progress();
        TIMER_STOP(Reduce_Sinv_LT_Isend);
      }
      gemmProcessed++;

#if ( _DEBUGlevel_ >= 1 )
      statusOFS<<std::endl<<"gemmProcessed ="<<gemmProcessed<<"/"<<gemmToDo<<std::endl;
#endif
    }
  }
  for (Int supidx=0; supidx<stepSuper; supidx++){
    SuperNodeBufferType & snode = arrSuperNodes[supidx];
    TreeReduce<T> * redLTree = redToLeftTree_[snode.Index];

    if(redLTree != NULL && !redLdone[supidx]){
      bool done = redLTree->Progress();
    }
  }

  TIMER_STOP(Compute_Sinv_LT);
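// Reduce_Sinv_LT: keep polling the L-reduction trees until every contribution
// has arrived; when a reduction reaches the owning column, the local
// RowLocalPtr / BlockIdxLocal maps are (re)built from Lcol so that the reduced
// LUpdateBuf can later be copied back block by block.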
  TIMER_START(Reduce_Sinv_LT);

  bool all_done = false;
  while(!all_done)
  {
    all_done = true;

    for (Int supidx=0; supidx<stepSuper; supidx++){
      SuperNodeBufferType & snode = arrSuperNodes[supidx];
      TreeReduce<T> * redLTree = redToLeftTree_[snode.Index];

      if(redLTree != NULL && !redLdone[supidx]){
        bool done = redLTree->Progress();

        if( MYCOL( grid_ ) == PCOL( snode.Index, grid_ ) ){

          Int numRowLUpdateBuf;
          std::vector<LBlock<T> >& Lcol = this->L( LBj( snode.Index, grid_ ) );
          if( MYROW( grid_ ) != PROW( snode.Index, grid_ ) ){
            snode.RowLocalPtr.resize( Lcol.size() + 1 );
            snode.BlockIdxLocal.resize( Lcol.size() );
            snode.RowLocalPtr[0] = 0;
            for( Int ib = 0; ib < Lcol.size(); ib++ ){
              snode.RowLocalPtr[ib+1] = snode.RowLocalPtr[ib] + Lcol[ib].numRow;
              snode.BlockIdxLocal[ib] = Lcol[ib].blockIdx;
            }
          }
          else{
            snode.RowLocalPtr.resize( Lcol.size() );
            snode.BlockIdxLocal.resize( Lcol.size() - 1 );
            snode.RowLocalPtr[0] = 0;
            for( Int ib = 1; ib < Lcol.size(); ib++ ){
              snode.RowLocalPtr[ib] = snode.RowLocalPtr[ib-1] + Lcol[ib].numRow;
              snode.BlockIdxLocal[ib-1] = Lcol[ib].blockIdx;
            }
          }
          numRowLUpdateBuf = *snode.RowLocalPtr.rbegin();

          if( numRowLUpdateBuf > 0 ){
            if( snode.LUpdateBuf.m() == 0 && snode.LUpdateBuf.n() == 0 ){
              snode.LUpdateBuf.Resize( numRowLUpdateBuf, SuperSize( snode.Index, super_ ) );
            }
          }

          redLTree->SetLocalBuffer(snode.LUpdateBuf.Data());
          redLTree->CleanupBuffers();
        }

        all_done = all_done && (done || redLdone[supidx]);
      }
    }
  }
  TIMER_STOP(Reduce_Sinv_LT);
  PushCallStack("PMatrix::SelInv_P2p::UpdateD");

  TIMER_START(Update_Diagonal);

  for (Int supidx=0; supidx<stepSuper; supidx++){
    SuperNodeBufferType & snode = arrSuperNodes[supidx];

    ComputeDiagUpdate(snode);

    TreeReduce<T> * redDTree = redToAboveTree_[snode.Index];
    if(redDTree != NULL){

      if( MYROW( grid_ ) == PROW( snode.Index, grid_ ) ){
        if(snode.DiagBuf.Size()==0){
          snode.DiagBuf.Resize( SuperSize( snode.Index, super_ ),
              SuperSize( snode.Index, super_ ));
          SetValue(snode.DiagBuf, ZERO<T>());
        }
      }

      redDTree->SetLocalBuffer(snode.DiagBuf.Data());
      if(!redDTree->IsAllocated()){
        redDTree->SetTag(IDX_TO_TAG(snode.Index,SELINV_TAG_D_REDUCE));
        redDTree->AllocRecvBuffers();
        redDTree->PostFirstRecv();
      }

      redDTree->SetDataReady(true);
      bool done = redDTree->Progress();
    }
  }

  for (Int supidx=0; supidx<stepSuper; supidx++){
    SuperNodeBufferType & snode = arrSuperNodes[supidx];
    TreeReduce<T> * redDTree = redToAboveTree_[snode.Index];

    if(redDTree != NULL){
      if(redDTree->IsAllocated()){
        bool done = redDTree->Progress();
      }
    }
  }
  TIMER_STOP(Update_Diagonal);
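// Reduce_Diagonal: once the diagonal reduction for a supernode completes on
// the owner of the (ksup, ksup) block, the reduced DiagBuf is added into
// L(ksup, ksup) with an Axpy and the block is symmetrized, presumably because
// the update only accumulates one triangle.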
  TIMER_START(Reduce_Diagonal);

  vector<char> is_done(stepSuper,0);
  bool all_done = false;
  while(!all_done)
  {
    all_done = true;

    for (Int supidx=0; supidx<stepSuper; supidx++){
      SuperNodeBufferType & snode = arrSuperNodes[supidx];
      TreeReduce<T> * redDTree = redToAboveTree_[snode.Index];

      if(redDTree != NULL){
        bool done = redDTree->Progress();
        if(done && !is_done[supidx]){
          if( MYCOL( grid_ ) == PCOL( snode.Index, grid_ ) ){
            if( MYROW( grid_ ) == PROW( snode.Index, grid_ ) ){
              LBlock<T> & LB = this->L( LBj( snode.Index, grid_ ) )[0];
              blas::Axpy( LB.numRow * LB.numCol, ONE<T>(),
                  snode.DiagBuf.Data(), 1, LB.nzval.Data(), 1 );
              Symmetrize( LB.nzval );
            }
          }

          is_done[supidx] = 1;
          redDTree->CleanupBuffers();
        }

        all_done = all_done && (done || is_done[supidx]);
      }
    }
  }
  TIMER_STOP(Reduce_Diagonal);
  PushCallStack("PMatrix::SelInv_P2p::UpdateU");

  SendRecvCD_UpdateU(arrSuperNodes, stepSuper);
  PushCallStack("PMatrix::SelInv_P2p::UpdateLFinal");

  TIMER_START(Update_L);
  for (Int supidx=0; supidx<stepSuper; supidx++){
    SuperNodeBufferType & snode = arrSuperNodes[supidx];

#if ( _DEBUGlevel_ >= 1 )
    statusOFS << std::endl << "["<<snode.Index<<"] "
      << "Finish updating the L part by filling LUpdateBufReduced back to L" << std::endl << std::endl;
#endif

    if( MYCOL( grid_ ) == PCOL( snode.Index, grid_ ) && snode.LUpdateBuf.m() > 0 ){
      std::vector<LBlock<T> >& Lcol = this->L( LBj( snode.Index, grid_ ) );

      Int startBlock = ( MYROW( grid_ ) == PROW( snode.Index, grid_ ) ) ? 1 : 0;
      for( Int ib = startBlock; ib < Lcol.size(); ib++ ){
        LBlock<T> & LB = Lcol[ib];
        lapack::Lacpy( 'A', LB.numRow, LB.numCol,
            &snode.LUpdateBuf( snode.RowLocalPtr[ib-startBlock], 0 ),
            snode.LUpdateBuf.m(), LB.nzval.Data(), LB.numRow );
      }
    }
  }
  TIMER_STOP(Update_L);
  TIMER_START(Barrier);

  for (Int supidx=0; supidx<stepSuper; supidx++){
    SuperNodeBufferType & snode = arrSuperNodes[supidx];
    TreeReduce<T> * redLTree = redToLeftTree_[snode.Index];

    if(redLTree != NULL){
      redLTree->CleanupBuffers();
    }
  }

  for (Int supidx=0; supidx<stepSuper; supidx++){
    SuperNodeBufferType & snode = arrSuperNodes[supidx];
    TreeReduce<T> * redDTree = redToAboveTree_[snode.Index];

    if(redDTree != NULL){
      redDTree->CleanupBuffers();
    }
  }

  mpi::Waitall(arrMpireqsRecvContentFromAny);

  for (Int supidx=0; supidx<stepSuper; supidx++){
    Int ksup = superList[lidx][supidx];
    std::vector<MPI_Request> & mpireqsSendToRight = arrMpireqsSendToRight[supidx];
    std::vector<MPI_Request> & mpireqsSendToBelow = arrMpireqsSendToBelow[supidx];

#if ( _DEBUGlevel_ >= 1 )
    statusOFS<<"["<<ksup<<"] mpireqsSendToRight"<<std::endl;
#endif
    mpi::Waitall( mpireqsSendToRight );
#if ( _DEBUGlevel_ >= 1 )
    statusOFS<<"["<<ksup<<"] mpireqsSendToBelow"<<std::endl;
#endif
    mpi::Waitall( mpireqsSendToBelow );
  }

#if ( _DEBUGlevel_ >= 1 )
  statusOFS<<"barrier done"<<std::endl;
#endif
  TIMER_STOP(Barrier);
  if (options_->maxPipelineDepth!=-1)
  {
    MPI_Barrier(grid_->comm);
  }
template<typename T>
void PMatrix<T>::ConstructCommunicationPattern ( )
{
  ConstructCommunicationPattern_P2p();
}
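// ConstructCommunicationPattern_P2p precomputes, for every supernode, which
// processor rows/columns exchange L and U panels (the isSendTo* / isRecvFrom*
// masks), then builds the broadcast trees used to forward panels down and to
// the right as well as the reduce trees used to accumulate the L and diagonal
// updates, plus the cross-diagonal pattern needed to form U from L.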
template<typename T>
void PMatrix<T>::ConstructCommunicationPattern_P2p ( )
{
  PushCallStack("PMatrix::ConstructCommunicationPattern_P2p");

  Int numSuper = this->NumSuper();

  TIMER_START(Allocate);

  PushCallStack( "Initialize the communication pattern" );

  fwdToBelowTree_.resize(numSuper, NULL );
  fwdToRightTree_.resize(numSuper, NULL );
  redToLeftTree_.resize(numSuper, NULL );
  redToAboveTree_.resize(numSuper, NULL );

  isSendToBelow_.Resize(grid_->numProcRow, numSuper);
  isSendToRight_.Resize(grid_->numProcCol, numSuper);
  isSendToDiagonal_.Resize( numSuper );
  SetValue( isSendToDiagonal_, false );

  isSendToCrossDiagonal_.Resize(grid_->numProcCol+1, numSuper );
  SetValue( isSendToCrossDiagonal_, false );
  isRecvFromCrossDiagonal_.Resize(grid_->numProcRow+1, numSuper );
  SetValue( isRecvFromCrossDiagonal_, false );

  isRecvFromAbove_.Resize( numSuper );
  isRecvFromLeft_.Resize( numSuper );
  isRecvFromBelow_.Resize( grid_->numProcRow, numSuper );
  SetValue( isRecvFromAbove_, false );
  SetValue( isRecvFromBelow_, false );
  SetValue( isRecvFromLeft_, false );

  TIMER_STOP(Allocate);
  TIMER_START(GetEtree);
  std::vector<Int> snodeEtree(this->NumSuper());
  GetEtree(snodeEtree);
  TIMER_STOP(GetEtree);
  PushCallStack( "Local column communication" );
#if ( _DEBUGlevel_ >= 1 )
  statusOFS << std::endl << "Local column communication" << std::endl;
#endif

  std::vector<std::set<Int> > localColBlockRowIdx;
  localColBlockRowIdx.resize( this->NumLocalBlockCol() );

  TIMER_START(Column_communication);

  for( Int ksup = 0; ksup < numSuper; ksup++ ){
    if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){

      std::vector<Int> tAllBlockRowIdx;
      std::vector<Int> & colBlockIdx = ColBlockIdx( LBj(ksup, grid_) );
      TIMER_START(Allgatherv_Column_communication);
      if( grid_ -> mpisize != 1 )
        mpi::Allgatherv( colBlockIdx, tAllBlockRowIdx, grid_->colComm );
      else
        tAllBlockRowIdx = colBlockIdx;
      TIMER_STOP(Allgatherv_Column_communication);

      localColBlockRowIdx[ LBj( ksup, grid_ ) ].insert(
          tAllBlockRowIdx.begin(), tAllBlockRowIdx.end() );

#if ( _DEBUGlevel_ >= 1 )
      statusOFS
        << " Column block " << ksup
        << " has the following nonzero block rows" << std::endl;
      for( std::set<Int>::iterator si = localColBlockRowIdx[ LBj( ksup, grid_ ) ].begin();
          si != localColBlockRowIdx[ LBj( ksup, grid_ ) ].end();
          si++ ){
        statusOFS << *si << " ";
      }
      statusOFS << std::endl;
#endif
    }
  }

  TIMER_STOP(Column_communication);
  TIMER_START(Row_communication);
  PushCallStack( "Local row communication" );
#if ( _DEBUGlevel_ >= 1 )
  statusOFS << std::endl << "Local row communication" << std::endl;
#endif

  std::vector<std::set<Int> > localRowBlockColIdx;
  localRowBlockColIdx.resize( this->NumLocalBlockRow() );

  for( Int ksup = 0; ksup < numSuper; ksup++ ){
    if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){

      std::vector<Int> tAllBlockColIdx;
      std::vector<Int> & rowBlockIdx = RowBlockIdx( LBi(ksup, grid_) );
      TIMER_START(Allgatherv_Row_communication);
      if( grid_ -> mpisize != 1 )
        mpi::Allgatherv( rowBlockIdx, tAllBlockColIdx, grid_->rowComm );
      else
        tAllBlockColIdx = rowBlockIdx;
      TIMER_STOP(Allgatherv_Row_communication);

      localRowBlockColIdx[ LBi( ksup, grid_ ) ].insert(
          tAllBlockColIdx.begin(), tAllBlockColIdx.end() );

#if ( _DEBUGlevel_ >= 1 )
      statusOFS
        << " Row block " << ksup
        << " has the following nonzero block columns" << std::endl;
      for( std::set<Int>::iterator si = localRowBlockColIdx[ LBi( ksup, grid_ ) ].begin();
          si != localRowBlockColIdx[ LBi( ksup, grid_ ) ].end();
          si++ ){
        statusOFS << *si << " ";
      }
      statusOFS << std::endl;
#endif
    }
  }

  TIMER_STOP(Row_communication);
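// SendToBelow / RecvFromAbove (and the analogous SendToRight / RecvFromLeft
// masks below) are derived by walking the ancestors of each supernode ksup in
// snodeEtree: whenever an ancestor's column (row) contains a nonzero block in
// position ksup, the processor rows (columns) owning blocks of that ancestor
// are marked as participants for ksup.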
  TIMER_START(STB_RFA);
  PushCallStack("SendToBelow / RecvFromAbove");

  for( Int ksup = 0; ksup < numSuper - 1; ksup++ ){
    Int jsup = snodeEtree[ksup];
    while(jsup<numSuper){
      Int jsupLocalBlockCol = LBj( jsup, grid_ );
      Int jsupProcCol = PCOL( jsup, grid_ );
      if( MYCOL( grid_ ) == jsupProcCol ){
        if( localColBlockRowIdx[jsupLocalBlockCol].count( ksup ) > 0 ) {
          for( std::set<Int>::iterator si = localColBlockRowIdx[jsupLocalBlockCol].begin();
              si != localColBlockRowIdx[jsupLocalBlockCol].end(); si++ ){
            Int isup = *si;
            Int isupProcRow = PROW( isup, grid_ );
            if( isup > ksup ){
              if( MYROW( grid_ ) == isupProcRow ){
                isRecvFromAbove_(ksup) = true;
              }
              if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
                isSendToBelow_( isupProcRow, ksup ) = true;
              }
            }
          }
        }
      }
      jsup = snodeEtree[jsup];
    }
  }

  TIMER_STOP(STB_RFA);
  TIMER_START(STR_RFL_RFB);
  PushCallStack("SendToRight / RecvFromLeft");

  for( Int ksup = 0; ksup < numSuper - 1; ksup++ ){
    Int isup = snodeEtree[ksup];
    while(isup<numSuper){
      Int isupLocalBlockRow = LBi( isup, grid_ );
      Int isupProcRow = PROW( isup, grid_ );
      if( MYROW( grid_ ) == isupProcRow ){
        if( localRowBlockColIdx[isupLocalBlockRow].count( ksup ) > 0 ){
          for( std::set<Int>::iterator si = localRowBlockColIdx[isupLocalBlockRow].begin();
              si != localRowBlockColIdx[isupLocalBlockRow].end(); si++ ){
            Int jsup = *si;
            Int jsupProcCol = PCOL( jsup, grid_ );
            if( jsup > ksup ){
              if( MYCOL( grid_ ) == jsupProcCol ){
                isRecvFromLeft_(ksup) = true;
              }
              if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
                isSendToRight_( jsupProcCol, ksup ) = true;
              }
            }
          }
        }
      }

      if( MYCOL( grid_ ) == PCOL(ksup, grid_) ){
        if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
          isRecvFromBelow_(isupProcRow,ksup) = true;
        }
        else if ( MYROW(grid_) == isupProcRow ){
          isSendToDiagonal_(ksup) = true;
        }
      }
      isup = snodeEtree[isup];
    }
  }

  TIMER_STOP(STR_RFL_RFB);
  TIMER_START(BUILD_BCAST_TREES);

  vector<double> SeedRFL(numSuper,0.0);
  vector<Int> aggRFL(numSuper);
  vector<Int> globalAggRFL(numSuper*grid_->numProcCol);
  for( Int ksup = 0; ksup < numSuper ; ksup++ ){
    if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
      std::vector<LBlock<T> >& Lcol = this->L( LBj(ksup, grid_) );

      Int totalSize = 0;
      totalSize += sizeof(Int);
      for( Int ib = 0; ib < Lcol.size(); ib++ ){
        if( Lcol[ib].blockIdx > ksup ){
          totalSize += 3*sizeof(Int);
          totalSize += sizeof(Int) + Lcol[ib].rows.ByteSize();
        }
      }
      aggRFL[ksup] = totalSize;
      SeedRFL[ksup] = rand();
    }
    else if(isRecvFromLeft_(ksup)){
      aggRFL[ksup] = 1;
    }
    else{
      aggRFL[ksup] = 0;
    }
  }

  MPI_Allgather(&aggRFL[0], numSuper*sizeof(Int), MPI_BYTE,
      &globalAggRFL[0], numSuper*sizeof(Int), MPI_BYTE,
      grid_->rowComm);
  MPI_Allreduce(MPI_IN_PLACE, &SeedRFL[0], numSuper, MPI_DOUBLE, MPI_MAX, grid_->rowComm);
  vector<double> SeedRFA(numSuper,0.0);
  vector<Int> aggRFA(numSuper);
  vector<Int> globalAggRFA(numSuper*grid_->numProcRow);
  for( Int ksup = 0; ksup < numSuper ; ksup++ ){
    if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
      std::vector<UBlock<T> >& Urow = this->U( LBi(ksup, grid_) );

      Int totalSize = 0;
      totalSize += sizeof(Int);
      for( Int jb = 0; jb < Urow.size(); jb++ ){
        if( Urow[jb].blockIdx >= ksup ){
          totalSize += 3*sizeof(Int);
          totalSize += sizeof(Int) + Urow[jb].cols.ByteSize();
          totalSize += 2*sizeof(Int) + Urow[jb].nzval.ByteSize();
        }
      }
      aggRFA[ksup] = totalSize;
      SeedRFA[ksup] = rand();
    }
    else if(isRecvFromAbove_(ksup)){
      aggRFA[ksup] = 1;
    }
    else{
      aggRFA[ksup] = 0;
    }
  }

  MPI_Allgather(&aggRFA[0], numSuper*sizeof(Int), MPI_BYTE,
      &globalAggRFA[0], numSuper*sizeof(Int), MPI_BYTE,
      grid_->colComm);
  MPI_Allreduce(MPI_IN_PLACE, &SeedRFA[0], numSuper, MPI_DOUBLE, MPI_MAX, grid_->colComm);
  for( Int ksup = 0; ksup < numSuper; ksup++ ){

    std::set<Int> set_ranks;
    Int msgSize = 0;
    for( Int iProcRow = 0; iProcRow < grid_->numProcRow; iProcRow++ ){
      Int isRFA = globalAggRFA[iProcRow*numSuper + ksup];
      if(isRFA>0){
        if( iProcRow != PROW( ksup, grid_ ) ){
          set_ranks.insert(iProcRow);
        }
        else{
          msgSize = isRFA;
        }
      }
    }

    if( isRecvFromAbove_(ksup) || CountSendToBelow(ksup)>0 ){
      vector<Int> tree_ranks;
      tree_ranks.push_back(PROW(ksup,grid_));
      tree_ranks.insert(tree_ranks.end(), set_ranks.begin(), set_ranks.end());
      TreeBcast * & BcastUTree = fwdToBelowTree_[ksup];
      BcastUTree = TreeBcast::Create(this->grid_->colComm,&tree_ranks[0],tree_ranks.size(),msgSize,SeedRFA[ksup]);
      BcastUTree->SetGlobalComm(grid_->comm);
    }

    set_ranks.clear();
    for( Int iProcCol = 0; iProcCol < grid_->numProcCol; iProcCol++ ){
      Int isRFL = globalAggRFL[iProcCol*numSuper + ksup];
      if(isRFL>0){
        if( iProcCol != PCOL( ksup, grid_ ) ){
          set_ranks.insert(iProcCol);
        }
        else{
          msgSize = isRFL;
        }
      }
    }

    if( isRecvFromLeft_(ksup) || CountSendToRight(ksup)>0 ){
      vector<Int> tree_ranks;
      tree_ranks.push_back(PCOL(ksup,grid_));
      tree_ranks.insert(tree_ranks.end(), set_ranks.begin(), set_ranks.end());

      TreeBcast * & BcastLTree = fwdToRightTree_[ksup];
      BcastLTree = TreeBcast::Create(this->grid_->rowComm,&tree_ranks[0],tree_ranks.size(),msgSize,SeedRFL[ksup]);
      BcastLTree->SetGlobalComm(grid_->comm);
    }
  }
  TIMER_STOP(BUILD_BCAST_TREES);

  TIMER_START(BUILD_REDUCE_D_TREE);
  vector<double> SeedSTD(numSuper,0.0);
  vector<Int> aggSTD(numSuper);
  vector<Int> globalAggSTD(numSuper*grid_->numProcRow);
  for( Int ksup = 0; ksup < numSuper; ksup++ ){
    if( MYCOL( grid_ ) == PCOL(ksup, grid_) && MYROW( grid_ ) == PROW(ksup, grid_) ){
      Int totalSize = SuperSize( ksup, super_ )*SuperSize( ksup, super_ )*sizeof(T);
      aggSTD[ksup] = totalSize;
      SeedSTD[ksup] = rand();
    }
    else if(isSendToDiagonal_(ksup)){
      aggSTD[ksup] = 1;
    }
    else{
      aggSTD[ksup] = 0;
    }
  }

  MPI_Allgather(&aggSTD[0], numSuper*sizeof(Int), MPI_BYTE,
      &globalAggSTD[0], numSuper*sizeof(Int), MPI_BYTE,
      grid_->colComm);
  MPI_Allreduce(MPI_IN_PLACE, &SeedSTD[0], numSuper, MPI_DOUBLE, MPI_MAX, grid_->colComm);

  for( Int ksup = 0; ksup < numSuper; ksup++ ){
    if( MYCOL( grid_ ) == PCOL(ksup, grid_) ){

      std::set<Int> set_ranks;
      for( Int iProcRow = 0; iProcRow < grid_->numProcRow; iProcRow++ ){
        Int isSTD = globalAggSTD[iProcRow*numSuper + ksup];
        if(isSTD>0){
          if( iProcRow != PROW( ksup, grid_ ) ){
            set_ranks.insert(iProcRow);
          }
        }
      }

      Int amISTD = globalAggSTD[MYROW(grid_)*numSuper + ksup];

      if( amISTD ){
        vector<Int> tree_ranks;
        tree_ranks.push_back(PROW(ksup,grid_));
        tree_ranks.insert(tree_ranks.end(), set_ranks.begin(), set_ranks.end());

        TreeReduce<T> * & redDTree = redToAboveTree_[ksup];
        redDTree =
          TreeReduce<T>::Create(this->grid_->colComm,&tree_ranks[0],tree_ranks.size(),msgSize,SeedSTD[ksup]);
        redDTree->SetGlobalComm(grid_->comm);
      }
    }
  }
  TIMER_STOP(BUILD_REDUCE_D_TREE);
  TIMER_START(BUILD_REDUCE_L_TREE);
  vector<double> SeedRTL(numSuper,0.0);
  vector<Int> aggRTL(numSuper);
  vector<Int> globalAggRTL(numSuper*grid_->numProcCol);
  for( Int ksup = 0; ksup < numSuper ; ksup++ ){
    if( MYCOL( grid_ ) == PCOL(ksup, grid_) ){
      std::vector<LBlock<T> >& Lcol = this->L( LBj(ksup, grid_) );

      Int numRowLUpdateBuf = 0;
      if( MYROW( grid_ ) != PROW( ksup, grid_ ) ){
        for( Int ib = 0; ib < Lcol.size(); ib++ ){
          numRowLUpdateBuf += Lcol[ib].numRow;
        }
      }
      else{
        for( Int ib = 1; ib < Lcol.size(); ib++ ){
          numRowLUpdateBuf += Lcol[ib].numRow;
        }
      }

      Int totalSize = numRowLUpdateBuf*SuperSize( ksup, super_ )*sizeof(T);

      aggRTL[ksup] = totalSize;
      SeedRTL[ksup] = rand();
    }
    else if(isRecvFromLeft_(ksup)){
      aggRTL[ksup] = 1;
    }
    else{
      aggRTL[ksup] = 0;
    }
  }

  MPI_Allgather(&aggRTL[0], numSuper*sizeof(Int), MPI_BYTE,
      &globalAggRTL[0], numSuper*sizeof(Int), MPI_BYTE,
      grid_->rowComm);
  MPI_Allreduce(MPI_IN_PLACE, &SeedRTL[0], numSuper, MPI_DOUBLE, MPI_MAX, grid_->rowComm);

  for( Int ksup = 0; ksup < numSuper ; ksup++ ){

    std::set<Int> set_ranks;
    Int msgSize = 0;
    for( Int iProcCol = 0; iProcCol < grid_->numProcCol; iProcCol++ ){
      Int isRTL = globalAggRTL[iProcCol*numSuper + ksup];
      if(isRTL>0){
        if( iProcCol != PCOL( ksup, grid_ ) ){
          set_ranks.insert(iProcCol);
        }
        else{
          msgSize = isRTL;
        }
      }
    }

    if( isRecvFromLeft_(ksup) || CountSendToRight(ksup)>0 ){
      vector<Int> tree_ranks;
      tree_ranks.push_back(PCOL(ksup,grid_));
      tree_ranks.insert(tree_ranks.end(), set_ranks.begin(), set_ranks.end());

      TreeReduce<T> * & redLTree = redToLeftTree_[ksup];
      redLTree =
        TreeReduce<T>::Create(this->grid_->rowComm,&tree_ranks[0],tree_ranks.size(),msgSize,SeedRTL[ksup]);
      redLTree->SetGlobalComm(grid_->comm);
    }
  }
  TIMER_STOP(BUILD_REDUCE_L_TREE);
#if ( _DEBUGlevel_ >= 1 )
  statusOFS << std::endl << "isSendToBelow:" << std::endl;
  for(int j = 0; j < isSendToBelow_.n(); j++){
    statusOFS << "["<<j<<"] ";
    for(int i = 0; i < isSendToBelow_.m(); i++){
      statusOFS << isSendToBelow_(i,j) << " ";
    }
    statusOFS << std::endl;
  }

  statusOFS << std::endl << "isRecvFromAbove:" << std::endl;
  for(int j = 0; j < isRecvFromAbove_.m(); j++){
    statusOFS << "["<<j<<"] " << isRecvFromAbove_(j) << std::endl;
  }
#endif

#if ( _DEBUGlevel_ >= 1 )
  statusOFS << std::endl << "isSendToRight:" << std::endl;
  for(int j = 0; j < isSendToRight_.n(); j++){
    statusOFS << "["<<j<<"] ";
    for(int i = 0; i < isSendToRight_.m(); i++){
      statusOFS << isSendToRight_(i,j) << " ";
    }
    statusOFS << std::endl;
  }

  statusOFS << std::endl << "isRecvFromLeft:" << std::endl;
  for(int j = 0; j < isRecvFromLeft_.m(); j++){
    statusOFS << "["<<j<<"] " << isRecvFromLeft_(j) << std::endl;
  }

  statusOFS << std::endl << "isRecvFromBelow:" << std::endl;
  for(int j = 0; j < isRecvFromBelow_.n(); j++){
    statusOFS << "["<<j<<"] ";
    for(int i = 0; i < isRecvFromBelow_.m(); i++){
      statusOFS << isRecvFromBelow_(i,j) << " ";
    }
    statusOFS << std::endl;
  }
#endif
  TIMER_START(STCD_RFCD);

  PushCallStack("SendToCrossDiagonal / RecvFromCrossDiagonal");

  for( Int ksup = 0; ksup < numSuper - 1; ksup++ ){
    if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
      for( std::set<Int>::iterator si = localColBlockRowIdx[ LBj( ksup, grid_ ) ].begin();
          si != localColBlockRowIdx[ LBj( ksup, grid_ ) ].end(); si++ ){
        Int isup = *si;
        Int isupProcRow = PROW( isup, grid_ );
        Int isupProcCol = PCOL( isup, grid_ );
        if( isup > ksup && MYROW( grid_ ) == isupProcRow ){
          isSendToCrossDiagonal_(grid_->numProcCol, ksup ) = true;
          isSendToCrossDiagonal_(isupProcCol, ksup ) = true;
        }
      }
    }
  }

  for( Int ksup = 0; ksup < numSuper - 1; ksup++ ){
    if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
      for( std::set<Int>::iterator si = localRowBlockColIdx[ LBi(ksup, grid_) ].begin();
          si != localRowBlockColIdx[ LBi(ksup, grid_) ].end(); si++ ){
        Int jsup = *si;
        Int jsupProcCol = PCOL( jsup, grid_ );
        Int jsupProcRow = PROW( jsup, grid_ );
        if( jsup > ksup && MYCOL(grid_) == jsupProcCol ){
          isRecvFromCrossDiagonal_(grid_->numProcRow, ksup ) = true;
          isRecvFromCrossDiagonal_(jsupProcRow, ksup ) = true;
        }
      }
    }
  }
#if ( _DEBUGlevel_ >= 1 )
  statusOFS << std::endl << "isSendToCrossDiagonal:" << std::endl;
  for(int j = 0; j < isSendToCrossDiagonal_.n(); j++){
    if(isSendToCrossDiagonal_(grid_->numProcCol,j)){
      statusOFS << "["<<j<<"] ";
      for(int i = 0; i < isSendToCrossDiagonal_.m()-1; i++){
        if(isSendToCrossDiagonal_(i,j)){
          statusOFS << PNUM(PROW(j,grid_),i,grid_) << " ";
        }
      }
      statusOFS << std::endl;
    }
  }

  statusOFS << std::endl << "isRecvFromCrossDiagonal:" << std::endl;
  for(int j = 0; j < isRecvFromCrossDiagonal_.n(); j++){
    if(isRecvFromCrossDiagonal_(grid_->numProcRow,j)){
      statusOFS << "["<<j<<"] ";
      for(int i = 0; i < isRecvFromCrossDiagonal_.m()-1; i++){
        if(isRecvFromCrossDiagonal_(i,j)){
          statusOFS << PNUM(i,PCOL(j,grid_),grid_) << " ";
        }
      }
      statusOFS << std::endl;
    }
  }
#endif

  TIMER_STOP(STCD_RFCD);
  GetWorkSet(snodeEtree, this->WorkingSet());
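// The routine below writes the collected GEMM and communication statistics:
// gemm_stat is consumed three entries at a time (the GEMM dimensions printed
// under the m/n/z header) and comm_stat four entries at a time via LINE_COMM.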
template<typename T>

  statOFS << "m" << "\t" << "n" << "\t" << "z" << std::endl;
  for(auto it = gemm_stat.begin(); it != gemm_stat.end(); it += 3){
    statOFS << *it << "\t" << *(it+1) << "\t" << *(it+2) << std::endl;
  }

  commOFS << HEADER_COMM << std::endl;
  for(auto it = comm_stat.begin(); it != comm_stat.end(); it += 4){
    commOFS << LINE_COMM(it) << std::endl;
  }
template<typename T>
void PMatrix<T>::SelInv_P2p ( )
{
  TIMER_START(SelInv_P2p);

  PushCallStack("PMatrix::SelInv_P2p");

  std::vector<std::vector<Int> > & superList = this->WorkingSet();
  Int numSteps = superList.size();

  for (Int lidx=0; lidx<numSteps ; lidx++){
    Int stepSuper = superList[lidx].size();
    SelInvIntra_P2p(lidx);
  }

  TIMER_STOP(SelInv_P2p);
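// PreSelInv prepares the factorization for the selected-inversion sweep:
// each off-diagonal block of L is multiplied on the right by the inverse of
// the unit-lower-triangular diagonal block, L(i,k) <- L(i,k) * L(k,k)^{-1}
// (the 'R','L','N','U' Trsm below), after which the result is mirrored across
// the diagonal to fill U(k,i).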
template<typename T>
void PMatrix<T>::PreSelInv ( )
{
  PushCallStack("PMatrix::PreSelInv");

  Int numSuper = this->NumSuper();

  PushCallStack("L(i,k) <- L(i,k) * L(k,k)^{-1}");
#if ( _DEBUGlevel_ >= 1 )
  statusOFS << std::endl << "L(i,k) <- L(i,k) * L(k,k)^{-1}" << std::endl << std::endl;
#endif

  for( Int ksup = 0; ksup < numSuper; ksup++ ){
    if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){

      NumMat<T> nzvalLDiag;
      std::vector<LBlock<T> >& Lcol = this->L( LBj( ksup, grid_ ) );
      if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
        nzvalLDiag = Lcol[0].nzval;
        if( nzvalLDiag.m() != SuperSize(ksup, super_) ||
            nzvalLDiag.n() != SuperSize(ksup, super_) ){
          throw std::runtime_error(
              "The size of the diagonal block of L is wrong." );
        }
      }
      else{
        nzvalLDiag.Resize( SuperSize(ksup, super_), SuperSize(ksup, super_) );
      }

      MPI_Bcast( (void*)nzvalLDiag.Data(), nzvalLDiag.ByteSize(),
          MPI_BYTE, PROW( ksup, grid_ ), grid_->colComm );

      for( Int ib = 0; ib < Lcol.size(); ib++ ){
        LBlock<T> & LB = Lcol[ib];

#if ( _DEBUGlevel_ >= 2 )
        if( LBj( ksup, grid_ ) == 0 ){
          statusOFS << "Diag L(" << ksup << ", " << ksup << "): " << nzvalLDiag << std::endl;
          statusOFS << "Before solve L(" << LB.blockIdx << ", " << ksup
            << "): " << LB.nzval << std::endl;
        }
#endif

        blas::Trsm( 'R', 'L', 'N', 'U', LB.numRow, LB.numCol, ONE<T>(),
            nzvalLDiag.Data(), LB.numCol, LB.nzval.Data(), LB.numRow );

#if ( _DEBUGlevel_ >= 2 )
        if( LBj( ksup, grid_ ) == 0 ){
          statusOFS << "After solve L(" << LB.blockIdx << ", " << ksup
            << "): " << LB.nzval << std::endl;
        }
#endif
      }
    }
  }
  PushCallStack("U(k,i) <- L(i,k)");
#if ( _DEBUGlevel_ >= 1 )
  statusOFS << std::endl << "U(k,i) <- L(i,k)" << std::endl << std::endl;
#endif

  for( Int ksup = 0; ksup < numSuper; ksup++ ){
    Int ksupProcRow = PROW( ksup, grid_ );
    Int ksupProcCol = PCOL( ksup, grid_ );

    Int sendCount = CountSendToCrossDiagonal(ksup);
    Int recvCount = CountRecvFromCrossDiagonal(ksup);

    std::vector<MPI_Request > arrMpiReqsSend(sendCount, MPI_REQUEST_NULL );
    std::vector<MPI_Request > arrMpiReqsSizeSend(sendCount, MPI_REQUEST_NULL );
    std::vector<std::vector<char> > arrSstrLcolSend(sendCount);
    std::vector<Int > arrSstrLcolSizeSend(sendCount);

    std::vector<MPI_Request > arrMpiReqsRecv(recvCount, MPI_REQUEST_NULL );
    std::vector<MPI_Request > arrMpiReqsSizeRecv(recvCount, MPI_REQUEST_NULL );
    std::vector<std::vector<char> > arrSstrLcolRecv(recvCount);
    std::vector<Int > arrSstrLcolSizeRecv(recvCount);
2761 if( isSendToCrossDiagonal_(grid_->numProcCol,ksup) ){
2762 #if ( _DEBUGlevel_ >= 1 )
2763 statusOFS<<
"["<<ksup<<
"] P"<<
MYPROC(grid_)<<
" should send to "<<CountSendToCrossDiagonal(ksup)<<
" processors"<<std::endl;
2767 for(Int dstCol = 0; dstCol<grid_->numProcCol; dstCol++){
2768 if(isSendToCrossDiagonal_(dstCol,ksup) ){
2769 Int dst =
PNUM(
PROW(ksup,grid_),dstCol,grid_);
2772 std::stringstream sstm;
2773 std::vector<char> & sstrLcolSend = arrSstrLcolSend[sendIdx];
2774 Int & sstrSize = arrSstrLcolSizeSend[sendIdx];
2775 MPI_Request & mpiReqSend = arrMpiReqsSend[sendIdx];
2776 MPI_Request & mpiReqSizeSend = arrMpiReqsSizeSend[sendIdx];
2778 std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
2779 std::vector<LBlock<T> >& Lcol = this->L(
LBj(ksup, grid_) );
2784 if(
MYROW( grid_ ) ==
PROW( ksup, grid_ ) ){
2785 for( Int ib = 1; ib < Lcol.size(); ib++ ){
2786 if( Lcol[ib].blockIdx > ksup && (Lcol[ib].blockIdx % grid_->numProcCol) == dstCol ){
2793 for( Int ib = 0; ib < Lcol.size(); ib++ ){
2794 if( Lcol[ib].blockIdx > ksup && (Lcol[ib].blockIdx % grid_->numProcCol) == dstCol ){
2800 serialize( (Int)count, sstm, NO_MASK );
2802 for( Int ib = 0; ib < Lcol.size(); ib++ ){
2803 if( Lcol[ib].blockIdx > ksup && (Lcol[ib].blockIdx % grid_->numProcCol) == dstCol ){
2804 #if ( _DEBUGlevel_ >= 1 )
2805 statusOFS<<
"["<<ksup<<
"] SEND contains "<<Lcol[ib].blockIdx<<
" which corresponds to "<<
GBj(ib,grid_)<<std::endl;
2807 serialize( Lcol[ib], sstm, mask );
2811 sstrLcolSend.resize( Size(sstm) );
2812 sstm.read( &sstrLcolSend[0], sstrLcolSend.size() );
2813 sstrSize = sstrLcolSend.size();
2820 #if ( _DEBUGlevel_ >= 1 )
2821 statusOFS<<
"["<<ksup<<
"] P"<<
MYPROC(grid_)<<
" ("<<
MYROW(grid_)<<
","<<
MYCOL(grid_)<<
") ---> LBj("<<ksup<<
")="<<
LBj(ksup,grid_)<<
" ---> P"<<dst<<
" ("<<ksupProcRow<<
","<<dstCol<<
")"<<std::endl;
2823 MPI_Isend( &sstrSize,
sizeof(sstrSize), MPI_BYTE, dst, SELINV_TAG_D_SIZE, grid_->comm, &mpiReqSizeSend );
2824 MPI_Isend( (
void*)&sstrLcolSend[0], sstrSize, MPI_BYTE, dst, SELINV_TAG_D_CONTENT, grid_->comm, &mpiReqSend );
2827 PROFILE_COMM(
MYPROC(this->grid_),dst,SELINV_TAG_D_SIZE,
sizeof(sstrSize));
2828 PROFILE_COMM(
MYPROC(this->grid_),dst,SELINV_TAG_D_CONTENT,sstrSize);
2843 if( isRecvFromCrossDiagonal_(grid_->numProcRow,ksup) ){
2846 #if ( _DEBUGlevel_ >= 1 )
2847 statusOFS<<
"["<<ksup<<
"] P"<<
MYPROC(grid_)<<
" should receive from "<<CountRecvFromCrossDiagonal(ksup)<<
" processors"<<std::endl;
2851 std::vector<UBlock<T> >& Urow = this->U(
LBi( ksup, grid_ ) );
2852 std::vector<bool> isBlockFound(Urow.size(),
false);
2857 for(Int srcRow = 0; srcRow<grid_->numProcRow; srcRow++){
2858 if(isRecvFromCrossDiagonal_(srcRow,ksup) ){
2859 std::vector<LBlock<T> > LcolRecv;
2860 Int src =
PNUM(srcRow,
PCOL(ksup,grid_),grid_);
2862 MPI_Request & mpiReqSizeRecv = arrMpiReqsSizeRecv[recvIdx];
2863 Int & sstrSize = arrSstrLcolSizeRecv[recvIdx];
2865 MPI_Irecv( &sstrSize, 1, MPI_INT, src, SELINV_TAG_D_SIZE, grid_->comm, &mpiReqSizeRecv );
2872 mpi::Waitall(arrMpiReqsSizeRecv);
2878 for(Int srcRow = 0; srcRow<grid_->numProcRow; srcRow++){
2879 if(isRecvFromCrossDiagonal_(srcRow,ksup) ){
2880 std::vector<LBlock<T> > LcolRecv;
2881 Int src =
PNUM(srcRow,
PCOL(ksup,grid_),grid_);
2883 MPI_Request & mpiReqRecv = arrMpiReqsRecv[recvIdx];
2884 Int & sstrSize = arrSstrLcolSizeRecv[recvIdx];
2885 std::vector<char> & sstrLcolRecv = arrSstrLcolRecv[recvIdx];
2886 sstrLcolRecv.resize(sstrSize);
2888 MPI_Irecv( &sstrLcolRecv[0], sstrSize, MPI_BYTE, src, SELINV_TAG_D_CONTENT, grid_->comm, &mpiReqRecv );
2895 mpi::Waitall(arrMpiReqsRecv);
2901 for(Int srcRow = 0; srcRow<grid_->numProcRow; srcRow++){
2902 if(isRecvFromCrossDiagonal_(srcRow,ksup) ){
2903 std::vector<LBlock<T> > LcolRecv;
2904 Int src =
PNUM(srcRow,
PCOL(ksup,grid_),grid_);
2907 Int & sstrSize = arrSstrLcolSizeRecv[recvIdx];
2908 std::vector<char> & sstrLcolRecv = arrSstrLcolRecv[recvIdx];
2909 std::stringstream sstm;
2911 #if ( _DEBUGlevel_ >= 1 )
2912 statusOFS<<
"["<<ksup<<
"] P"<<
MYPROC(grid_)<<
" ("<<
MYROW(grid_)<<
","<<
MYCOL(grid_)<<
") <--- LBj("<<ksup<<
") <--- P"<<src<<
" ("<<srcRow<<
","<<ksupProcCol<<
")"<<std::endl;
2916 sstm.write( &sstrLcolRecv[0], sstrSize );
2920 std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
2921 deserialize( numLBlock, sstm, NO_MASK );
2922 LcolRecv.resize(numLBlock);
2923 for( Int ib = 0; ib < numLBlock; ib++ ){
2924 deserialize( LcolRecv[ib], sstm, mask );
2925 #if ( _DEBUGlevel_ >= 1 )
2926 statusOFS<<
"["<<ksup<<
"] RECV contains "<<LcolRecv[ib].blockIdx<<
" which corresponds to "<< ib * grid_->numProcRow + srcRow;
2928 statusOFS<<
" L is on row "<< srcRow <<
" whereas U is on col "<< LcolRecv[ib].blockIdx % grid_->numProcCol <<std::endl;
2938 std::vector<LBlock<T> >& Lcol = this->L(
LBj( ksup, grid_ ) );
2939 if(
MYROW( grid_ ) !=
PROW( ksup, grid_ ) ){
2940 LcolRecv.resize( Lcol.size() );
2941 for( Int ib = 0; ib < Lcol.size(); ib++ ){
2942 LcolRecv[ib] = Lcol[ib];
2946 LcolRecv.resize( Lcol.size() - 1 );
2947 for( Int ib = 0; ib < Lcol.size() - 1; ib++ ){
2948 LcolRecv[ib] = Lcol[ib+1];
2955 for( Int ib = 0; ib < LcolRecv.size(); ib++ ){
2961 throw std::logic_error(
"LcolRecv contains the wrong blocks." );
2963 for( Int jb = 0; jb < Urow.size(); jb++ ){
2968 std::ostringstream msg;
2969 msg <<
"LB(" << LB.
blockIdx <<
", " << ksup <<
") and UB("
2970 << ksup <<
", " << UB.
blockIdx <<
") do not share the same size." << std::endl
2971 <<
"LB: " << LB.
numRow <<
" x " << LB.
numCol << std::endl
2972 <<
"UB: " << UB.
numRow <<
" x " << UB.
numCol << std::endl;
2976 throw std::runtime_error( msg.str().c_str() );
2985 #if ( _DEBUGlevel_ >= 1 )
2986 statusOFS<<
"["<<ksup<<
"] USING "<<LB.
blockIdx<< std::endl;
2988 isBlockFound[jb] =
true;
2996 for( Int jb = 0; jb < Urow.size(); jb++ ){
2998 if( !isBlockFound[jb] ){
3002 throw std::logic_error(
"UBlock cannot find its update. Something is seriously wrong." );
3012 mpi::Waitall(arrMpiReqsSizeSend);
3013 mpi::Waitall(arrMpiReqsSend);
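// --------------------------------------------------------------------------
// Annotation (not part of the original source). The cross-diagonal exchange
// above follows a common two-message pattern: each sender serializes its
// blocks into a byte buffer and posts two nonblocking sends (size first,
// payload second, on distinct tags); each receiver collects all sizes,
// resizes its buffers, posts the payload receives, and waits on everything
// before unpacking. A minimal sketch of that pattern in plain MPI
// (hypothetical tags TAG_SIZE / TAG_DATA) is:
//
//   int sz = (int)sendBuf.size();                     // std::vector<char> sendBuf
//   MPI_Request reqs[2];
//   MPI_Isend( &sz, 1, MPI_INT, dst, TAG_SIZE, comm, &reqs[0] );
//   MPI_Isend( sendBuf.data(), sz, MPI_BYTE, dst, TAG_DATA, comm, &reqs[1] );
//
//   int rsz = 0;
//   MPI_Recv( &rsz, 1, MPI_INT, src, TAG_SIZE, comm, MPI_STATUS_IGNORE );
//   std::vector<char> recvBuf( rsz );
//   MPI_Recv( recvBuf.data(), rsz, MPI_BYTE, src, TAG_DATA, comm, MPI_STATUS_IGNORE );
//   MPI_Waitall( 2, reqs, MPI_STATUSES_IGNORE );
//
// The listing keeps separate request arrays (arrMpiReqsSizeSend/Recv and
// arrMpiReqsSend/Recv) so the three phases can be overlapped across all
// cross-diagonal partners of a supernode.
// --------------------------------------------------------------------------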
3023     PushCallStack("L(i,i) <- [L(k,k) * U(k,k)]^{-1} ");
3025 #if ( _DEBUGlevel_ >= 1 )
3026     statusOFS << std::endl << "L(i,i) <- [L(k,k) * U(k,k)]^{-1}" << std::endl << std::endl;
3029     for( Int ksup = 0; ksup < numSuper; ksup++ ){
3030       if( MYROW( grid_ ) == PROW( ksup, grid_ ) &&
3031           MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
3035         for( Int i = 0; i < SuperSize( ksup, super_ ); i++ ){
3039 #if ( _DEBUGlevel_ >= 2 )
3041         statusOFS << "Factorized A (" << ksup << ", " << ksup << "): " << LB.nzval << std::endl;
3044             SuperSize( ksup, super_ ), ipiv.Data() );
3047         Symmetrize( LB.nzval );
3049 #if ( _DEBUGlevel_ >= 2 )
3051         statusOFS << "Inversed A (" << ksup << ", " << ksup << "): " << LB.nzval << std::endl;
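// --------------------------------------------------------------------------
// Annotation (not part of the original source). The elided lines around
// 3040-3047 invert the diagonal block in place: an LU factorization with the
// pivot array ipiv, a triangular-inverse step, and then an explicit
// symmetrization of the result. Assuming the lapack:: wrappers map onto the
// standard LAPACK routines, the same sequence in LAPACKE (double precision,
// column-major) reads:
//
//   #include <lapacke.h>
//   #include <vector>
//   // Invert the n x n block A in place and symmetrize it.
//   void InvertAndSymmetrize( int n, double* A ) {
//     std::vector<lapack_int> ipiv( n );
//     LAPACKE_dgetrf( LAPACK_COL_MAJOR, n, n, A, n, ipiv.data() );  // A = P*L*U
//     LAPACKE_dgetri( LAPACK_COL_MAJOR, n, A, n, ipiv.data() );     // A = A^{-1}
//     for( int j = 0; j < n; j++ )                                   // A = (A + A^T)/2
//       for( int i = j+1; i < n; i++ ){
//         double v = 0.5 * ( A[i + j*n] + A[j + i*n] );
//         A[i + j*n] = v;  A[j + i*n] = v;
//       }
//   }
//
// The symmetrization loop mirrors the Symmetrize( LB.nzval ) call at line
// 3047, which removes the small asymmetry introduced by round-off.
// --------------------------------------------------------------------------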
3070 template<typename T>
3074     PushCallStack("PMatrix::GetDiagonal");
3078     Int numCol = this->NumCol();
3080     const IntNumVec& permInv = super_->permInv;
3085     if( options_->RowPerm == "NOROWPERM" ){
3086       pPerm_r = &super_->perm;
3087       pPermInv_r = &super_->permInv;
3090       pPerm_r = &super_->perm_r;
3091       pPermInv_r = &super_->permInv_r;
3095     const IntNumVec& permInv_r = *pPermInv_r;
3101     diag.Resize( numCol );
3105     for( Int ksup = 0; ksup < numSuper; ksup++ ){
3107       if( MYROW( grid_ ) == PROW( ksup, grid_ ) &&
3108           MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
3110         for( Int i = 0; i < LB.numRow; i++ ){
3111           diagLocal( permInv_r( LB.rows(i) ) ) = LB.nzval( i, perm(permInv_r(i)) );
3117     mpi::Allreduce( diagLocal.Data(), diag.Data(), numCol, MPI_SUM, grid_->comm );
3128 template<typename T>
3132     PushCallStack("PMatrix::PMatrixToDistSparseMatrix");
3134 #if ( _DEBUGlevel_ >= 1 )
3135     statusOFS << std::endl << "Converting PMatrix to DistSparseMatrix." << std::endl;
3137     Int mpirank = grid_->mpirank;
3138     Int mpisize = grid_->mpisize;
3140     std::vector<Int> rowSend( mpisize );
3141     std::vector<Int> colSend( mpisize );
3142     std::vector<T> valSend( mpisize );
3143     std::vector<Int> sizeSend( mpisize, 0 );
3144     std::vector<Int> displsSend( mpisize, 0 );
3146     std::vector<Int> rowRecv( mpisize );
3147     std::vector<Int> colRecv( mpisize );
3148     std::vector<T> valRecv( mpisize );
3149     std::vector<Int> sizeRecv( mpisize, 0 );
3150     std::vector<Int> displsRecv( mpisize, 0 );
3153     const IntNumVec& permInv = super_->permInv;
3156     if( options_->RowPerm == "NOROWPERM" ){
3157       pPermInv_r = &super_->permInv;
3160       pPermInv_r = &super_->permInv_r;
3163     const IntNumVec& permInv_r = *pPermInv_r;
3172     Int numColFirst = this->NumCol() / mpisize;
3175     for( Int ksup = 0; ksup < numSuper; ksup++ ){
3177       if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
3178         std::vector<LBlock<T> >& Lcol = this->L( LBj( ksup, grid_ ) );
3179         for( Int ib = 0; ib < Lcol.size(); ib++ ){
3180           for( Int j = 0; j < Lcol[ib].numCol; j++ ){
3182             Int dest = std::min( jcol / numColFirst, mpisize - 1 );
3183             sizeSend[dest] += Lcol[ib].numRow;
3189       if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
3190         std::vector<UBlock<T> >& Urow = this->U( LBi( ksup, grid_ ) );
3191         for( Int jb = 0; jb < Urow.size(); jb++ ){
3193           for( Int j = 0; j < cols.m(); j++ ){
3194             Int jcol = permInv( cols(j) );
3195             Int dest = std::min( jcol / numColFirst, mpisize - 1 );
3196             sizeSend[dest] += Urow[jb].numRow;
3204         &sizeSend[0], 1, MPI_INT,
3205         &sizeRecv[0], 1, MPI_INT, grid_->comm );
3210     for( Int ip = 0; ip < mpisize; ip++ ){
3215         displsSend[ip] = displsSend[ip-1] + sizeSend[ip-1];
3222         displsRecv[ip] = displsRecv[ip-1] + sizeRecv[ip-1];
3225     Int sizeSendTotal = displsSend[mpisize-1] + sizeSend[mpisize-1];
3226     Int sizeRecvTotal = displsRecv[mpisize-1] + sizeRecv[mpisize-1];
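// --------------------------------------------------------------------------
// Annotation (not part of the original source). Lines 3210-3226 build the
// send/receive displacements as exclusive prefix sums of the per-rank counts,
// and the (truncated) calls around lines 3298-3308 then exchange the row
// indices, column indices and values with MPI_Alltoallv. A minimal sketch of
// that count/displacement/exchange pattern, with counts stored as int, is:
//
//   std::vector<int> sendCounts( mpisize ), recvCounts( mpisize );   // filled earlier
//   std::vector<int> sdispls( mpisize, 0 ), rdispls( mpisize, 0 );
//   for( int ip = 1; ip < mpisize; ip++ ){
//     sdispls[ip] = sdispls[ip-1] + sendCounts[ip-1];
//     rdispls[ip] = rdispls[ip-1] + recvCounts[ip-1];
//   }
//   std::vector<int> sendBuf( sdispls[mpisize-1] + sendCounts[mpisize-1] );
//   std::vector<int> recvBuf( rdispls[mpisize-1] + recvCounts[mpisize-1] );
//   // ... pack sendBuf at offsets sdispls[dest] + running count per rank ...
//   MPI_Alltoallv( sendBuf.data(), sendCounts.data(), sdispls.data(), MPI_INT,
//                  recvBuf.data(), recvCounts.data(), rdispls.data(), MPI_INT, comm );
//
// The value exchange would use the MPI datatype matching T instead of MPI_INT.
// --------------------------------------------------------------------------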
3228     rowSend.resize( sizeSendTotal );
3229     colSend.resize( sizeSendTotal );
3230     valSend.resize( sizeSendTotal );
3232     rowRecv.resize( sizeRecvTotal );
3233     colRecv.resize( sizeRecvTotal );
3234     valRecv.resize( sizeRecvTotal );
3236 #if ( _DEBUGlevel_ >= 1 )
3237     statusOFS << "displsSend = " << displsSend << std::endl;
3238     statusOFS << "displsRecv = " << displsRecv << std::endl;
3242     std::vector<Int> cntSize( mpisize, 0 );
3244     for( Int ksup = 0; ksup < numSuper; ksup++ ){
3246       if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
3247         std::vector<LBlock<T> >& Lcol = this->L( LBj( ksup, grid_ ) );
3248         for( Int ib = 0; ib < Lcol.size(); ib++ ){
3251           for( Int j = 0; j < Lcol[ib].numCol; j++ ){
3253             Int dest = std::min( jcol / numColFirst, mpisize - 1 );
3254             for( Int i = 0; i < rows.m(); i++ ){
3255               rowSend[displsSend[dest] + cntSize[dest]] = permInv_r( rows(i) );
3256               colSend[displsSend[dest] + cntSize[dest]] = jcol;
3257               valSend[displsSend[dest] + cntSize[dest]] = nzval( i, j );
3265       if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
3266         std::vector<UBlock<T> >& Urow = this->U( LBi( ksup, grid_ ) );
3267         for( Int jb = 0; jb < Urow.size(); jb++ ){
3270           for( Int j = 0; j < cols.m(); j++ ){
3271             Int jcol = permInv( cols(j) );
3272             Int dest = std::min( jcol / numColFirst, mpisize - 1 );
3273             for( Int i = 0; i < Urow[jb].numRow; i++ ){
3274               rowSend[displsSend[dest] + cntSize[dest]] =
3276               colSend[displsSend[dest] + cntSize[dest]] = jcol;
3277               valSend[displsSend[dest] + cntSize[dest]] = nzval( i, j );
3286     for( Int ip = 0; ip < mpisize; ip++ ){
3287       if( cntSize[ip] != sizeSend[ip] )
3292         throw std::runtime_error( "Sizes of the sending information do not match." );
3299         &rowSend[0], &sizeSend[0], &displsSend[0],
3300         &rowRecv[0], &sizeRecv[0], &displsRecv[0],
3303         &colSend[0], &sizeSend[0], &displsSend[0],
3304         &colRecv[0], &sizeRecv[0], &displsRecv[0],
3307         &valSend[0], &sizeSend[0], &displsSend[0],
3308         &valRecv[0], &sizeRecv[0], &displsRecv[0],
3311 #if ( _DEBUGlevel_ >= 1 )
3312     statusOFS << "Alltoallv communication finished." << std::endl;
3327     Int firstCol = mpirank * numColFirst;
3329     if( mpirank == mpisize-1 )
3330       numColLocal = this->NumCol() - numColFirst * (mpisize-1);
3332       numColLocal = numColFirst;
3334     std::vector<std::vector<Int> > rows( numColLocal );
3335     std::vector<std::vector<T> > vals( numColLocal );
3337     for( Int ip = 0; ip < mpisize; ip++ ){
3338       Int* rowRecvCur = &rowRecv[displsRecv[ip]];
3339       Int* colRecvCur = &colRecv[displsRecv[ip]];
3340       T* valRecvCur = &valRecv[displsRecv[ip]];
3341       for( Int i = 0; i < sizeRecv[ip]; i++ ){
3342         rows[colRecvCur[i]-firstCol].push_back( rowRecvCur[i] );
3343         vals[colRecvCur[i]-firstCol].push_back( valRecvCur[i] );
3348     std::vector<std::vector<Int> > sortIndex( numColLocal );
3349     for( Int j = 0; j < numColLocal; j++ ){
3350       sortIndex[j].resize( rows[j].size() );
3351       for( Int i = 0; i < sortIndex[j].size(); i++ )
3352         sortIndex[j][i] = i;
3353       std::sort( sortIndex[j].begin(), sortIndex[j].end(),
3354           IndexComp<std::vector<Int>& > ( rows[j] ) );
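// --------------------------------------------------------------------------
// Annotation (not part of the original source). The loop above performs an
// index sort ("argsort"): sortIndex[j] is initialized to 0,1,2,... and then
// ordered so that rows[j][ sortIndex[j][i] ] is nondecreasing, which lets the
// row indices and values be permuted consistently afterwards. The IndexComp
// functor is assumed to compare two indices through the referenced key
// vector; with C++11 the same effect is obtained with a lambda:
//
//   #include <algorithm>
//   #include <numeric>
//   #include <vector>
//   std::vector<int> ArgSort( const std::vector<int>& key ) {
//     std::vector<int> idx( key.size() );
//     std::iota( idx.begin(), idx.end(), 0 );          // 0, 1, 2, ...
//     std::sort( idx.begin(), idx.end(),
//         [&key]( int a, int b ){ return key[a] < key[b]; } );
//     return idx;
//   }
// --------------------------------------------------------------------------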
3366     for( Int j = 0; j < numColLocal; j++ ){
3371     A.comm = grid_->comm;
3373 #if ( _DEBUGlevel_ >= 1 )
3374     statusOFS << "nnzLocal = " << A.nnzLocal << std::endl;
3375     statusOFS << "nnz = " << A.Nnz() << std::endl;
3384     for( Int j = 0; j < numColLocal; j++ ){
3385       std::vector<Int>& rowsCur = rows[j];
3386       std::vector<Int>& sortIndexCur = sortIndex[j];
3387       std::vector<T>& valsCur = vals[j];
3388       for( Int i = 0; i < rows[j].size(); i++ ){
3390         *(rowPtr++) = rowsCur[sortIndexCur[i]] + 1;
3391         *(nzvalPtr++) = valsCur[sortIndexCur[i]];
3395 #if ( _DEBUGlevel_ >= 1 )
3396     statusOFS << "A.colptrLocal[end] = " << A.colptrLocal(numColLocal) << std::endl;
3397     statusOFS << "A.rowindLocal.size() = " << A.rowindLocal.m() << std::endl;
3412 template<typename T>
3416     PushCallStack("PMatrix::PMatrixToDistSparseMatrix");
3418 #if ( _DEBUGlevel_ >= 1 )
3419     statusOFS << std::endl << "Converting PMatrix to DistSparseMatrix (2nd format)." << std::endl;
3421     Int mpirank = grid_->mpirank;
3422     Int mpisize = grid_->mpisize;
3424     std::vector<Int> rowSend( mpisize );
3425     std::vector<Int> colSend( mpisize );
3426     std::vector<T> valSend( mpisize );
3427     std::vector<Int> sizeSend( mpisize, 0 );
3428     std::vector<Int> displsSend( mpisize, 0 );
3430     std::vector<Int> rowRecv( mpisize );
3431     std::vector<Int> colRecv( mpisize );
3432     std::vector<T> valRecv( mpisize );
3433     std::vector<Int> sizeRecv( mpisize, 0 );
3434     std::vector<Int> displsRecv( mpisize, 0 );
3437     const IntNumVec& permInv = super_->permInv;
3443     if( options_->RowPerm == "NOROWPERM" ){
3444       pPermInv_r = &super_->permInv;
3447       pPermInv_r = &super_->permInv_r;
3450     const IntNumVec& permInv_r = *pPermInv_r;
3462     Int numColFirst = this->NumCol() / mpisize;
3465     for( Int ksup = 0; ksup < numSuper; ksup++ ){
3467       if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
3468         std::vector<LBlock<T> >& Lcol = this->L( LBj( ksup, grid_ ) );
3469         for( Int ib = 0; ib < Lcol.size(); ib++ ){
3470           for( Int j = 0; j < Lcol[ib].numCol; j++ ){
3472             Int dest = std::min( jcol / numColFirst, mpisize - 1 );
3473             sizeSend[dest] += Lcol[ib].numRow;
3479       if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
3480         std::vector<UBlock<T> >& Urow = this->U( LBi( ksup, grid_ ) );
3481         for( Int jb = 0; jb < Urow.size(); jb++ ){
3483           for( Int j = 0; j < cols.m(); j++ ){
3484             Int jcol = permInv( cols(j) );
3485             Int dest = std::min( jcol / numColFirst, mpisize - 1 );
3486             sizeSend[dest] += Urow[jb].numRow;
3494         &sizeSend[0], 1, MPI_INT,
3495         &sizeRecv[0], 1, MPI_INT, grid_->comm );
3500     for( Int ip = 0; ip < mpisize; ip++ ){
3505         displsSend[ip] = displsSend[ip-1] + sizeSend[ip-1];
3512         displsRecv[ip] = displsRecv[ip-1] + sizeRecv[ip-1];
3515     Int sizeSendTotal = displsSend[mpisize-1] + sizeSend[mpisize-1];
3516     Int sizeRecvTotal = displsRecv[mpisize-1] + sizeRecv[mpisize-1];
3518     rowSend.resize( sizeSendTotal );
3519     colSend.resize( sizeSendTotal );
3520     valSend.resize( sizeSendTotal );
3522     rowRecv.resize( sizeRecvTotal );
3523     colRecv.resize( sizeRecvTotal );
3524     valRecv.resize( sizeRecvTotal );
3526 #if ( _DEBUGlevel_ >= 1 )
3527     statusOFS << "displsSend = " << displsSend << std::endl;
3528     statusOFS << "displsRecv = " << displsRecv << std::endl;
3532     std::vector<Int> cntSize( mpisize, 0 );
3535     for( Int ksup = 0; ksup < numSuper; ksup++ ){
3537       if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
3538         std::vector<LBlock<T> >& Lcol = this->L( LBj( ksup, grid_ ) );
3539         for( Int ib = 0; ib < Lcol.size(); ib++ ){
3542           for( Int j = 0; j < Lcol[ib].numCol; j++ ){
3544             Int dest = std::min( jcol / numColFirst, mpisize - 1 );
3545             for( Int i = 0; i < rows.m(); i++ ){
3546               rowSend[displsSend[dest] + cntSize[dest]] = permInv_r( rows(i) );
3547               colSend[displsSend[dest] + cntSize[dest]] = jcol;
3548               valSend[displsSend[dest] + cntSize[dest]] = nzval( i, j );
3557       if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
3558         std::vector<UBlock<T> >& Urow = this->U( LBi( ksup, grid_ ) );
3559         for( Int jb = 0; jb < Urow.size(); jb++ ){
3562           for( Int j = 0; j < cols.m(); j++ ){
3563             Int jcol = permInv( cols(j) );
3564             Int dest = std::min( jcol / numColFirst, mpisize - 1 );
3565             for( Int i = 0; i < Urow[jb].numRow; i++ ){
3566               rowSend[displsSend[dest] + cntSize[dest]] =
3568               colSend[displsSend[dest] + cntSize[dest]] = jcol;
3569               valSend[displsSend[dest] + cntSize[dest]] = nzval( i, j );
3580     for( Int ip = 0; ip < mpisize; ip++ ){
3581       if( cntSize[ip] != sizeSend[ip] ){
3585         throw std::runtime_error( "Sizes of the sending information do not match." );
3591         &rowSend[0], &sizeSend[0], &displsSend[0],
3592         &rowRecv[0], &sizeRecv[0], &displsRecv[0],
3595         &colSend[0], &sizeSend[0], &displsSend[0],
3596         &colRecv[0], &sizeRecv[0], &displsRecv[0],
3599         &valSend[0], &sizeSend[0], &displsSend[0],
3600         &valRecv[0], &sizeRecv[0], &displsRecv[0],
3603 #if ( _DEBUGlevel_ >= 1 )
3604     statusOFS << "Alltoallv communication finished." << std::endl;
3619     Int firstCol = mpirank * numColFirst;
3621     if( mpirank == mpisize-1 )
3622       numColLocal = this->NumCol() - numColFirst * (mpisize-1);
3624       numColLocal = numColFirst;
3626     std::vector<std::vector<Int> > rows( numColLocal );
3627     std::vector<std::vector<T> > vals( numColLocal );
3629     for( Int ip = 0; ip < mpisize; ip++ ){
3630       Int* rowRecvCur = &rowRecv[displsRecv[ip]];
3631       Int* colRecvCur = &colRecv[displsRecv[ip]];
3632       T* valRecvCur = &valRecv[displsRecv[ip]];
3633       for( Int i = 0; i < sizeRecv[ip]; i++ ){
3634         rows[colRecvCur[i]-firstCol].push_back( rowRecvCur[i] );
3635         vals[colRecvCur[i]-firstCol].push_back( valRecvCur[i] );
3640     std::vector<std::vector<Int> > sortIndex( numColLocal );
3641     for( Int j = 0; j < numColLocal; j++ ){
3642       sortIndex[j].resize( rows[j].size() );
3643       for( Int i = 0; i < sortIndex[j].size(); i++ )
3644         sortIndex[j][i] = i;
3645       std::sort( sortIndex[j].begin(), sortIndex[j].end(),
3646           IndexComp<std::vector<Int>& > ( rows[j] ) );
3653     if( A.size != this->NumCol() ){
3657       throw std::runtime_error( "The DistSparseMatrix providing the pattern has a different size from PMatrix." );
3663       throw std::runtime_error( "The DistSparseMatrix providing the pattern has a different number of local columns from PMatrix." );
3681     B.comm = grid_->comm;
3685     for( Int j = 0; j < numColLocal; j++ ){
3686       std::vector<Int>& rowsCur = rows[j];
3687       std::vector<Int>& sortIndexCur = sortIndex[j];
3688       std::vector<T>& valsCur = vals[j];
3689       std::vector<Int> rowsCurSorted( rowsCur.size() );
3691       for( Int i = 0; i < rowsCurSorted.size(); i++ ){
3692         rowsCurSorted[i] = rowsCur[sortIndexCur[i]] + 1;
3696       std::vector<Int>::iterator it;
3699       it = std::lower_bound( rowsCurSorted.begin(), rowsCurSorted.end(),
3701       if( it == rowsCurSorted.end() ){
3703         *(nzvalPtr++) = ZERO<T>();
3707         *(nzvalPtr++) = valsCur[ sortIndexCur[it-rowsCurSorted.begin()] ];
3712 #if ( _DEBUGlevel_ >= 1 )
3713     statusOFS << "B.colptrLocal[end] = " << B.colptrLocal(numColLocal) << std::endl;
3714     statusOFS << "B.rowindLocal.size() = " << B.rowindLocal.m() << std::endl;
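// --------------------------------------------------------------------------
// Annotation (not part of the original source). This variant keeps the
// sparsity pattern of the input matrix A fixed: for every (1-based) row index
// required by the pattern, std::lower_bound searches the locally collected
// and sorted row indices, the matching value is copied into B, and entries
// that the selected inverse does not carry are written as an explicit zero.
// Stripped of the PMatrix bookkeeping, the per-column fill looks like:
//
//   #include <algorithm>
//   #include <vector>
//   // rowsSorted: sorted 1-based row indices available locally, aligned with vals;
//   // pattern:    row indices requested by the fixed sparsity pattern.
//   void FillColumn( const std::vector<int>& rowsSorted,
//                    const std::vector<double>& vals,
//                    const std::vector<int>& pattern,
//                    std::vector<double>& out ) {
//     out.resize( pattern.size() );
//     for( std::size_t k = 0; k < pattern.size(); k++ ){
//       std::vector<int>::const_iterator it =
//           std::lower_bound( rowsSorted.begin(), rowsSorted.end(), pattern[k] );
//       if( it == rowsSorted.end() || *it != pattern[k] )
//         out[k] = 0.0;                                   // absent in PMatrix
//       else
//         out[k] = vals[ it - rowsSorted.begin() ];
//     }
//   }
//
// (The sketch also compares *it against the searched index for robustness.)
// --------------------------------------------------------------------------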
3731 template<typename T>
3735     PushCallStack("PMatrix::PMatrixToDistSparseMatrix2");
3737 #if ( _DEBUGlevel_ >= 1 )
3738     statusOFS << std::endl << "Converting PMatrix to DistSparseMatrix (2nd format)." << std::endl;
3740     Int mpirank = grid_->mpirank;
3741     Int mpisize = grid_->mpisize;
3743     std::vector<Int> rowSend( mpisize );
3744     std::vector<Int> colSend( mpisize );
3745     std::vector<T> valSend( mpisize );
3746     std::vector<Int> sizeSend( mpisize, 0 );
3747     std::vector<Int> displsSend( mpisize, 0 );
3749     std::vector<Int> rowRecv( mpisize );
3750     std::vector<Int> colRecv( mpisize );
3751     std::vector<T> valRecv( mpisize );
3752     std::vector<Int> sizeRecv( mpisize, 0 );
3753     std::vector<Int> displsRecv( mpisize, 0 );
3757     const IntNumVec& permInv = super_->permInv;
3762     if( options_->RowPerm == "NOROWPERM" ){
3763       pPerm_r = &super_->perm;
3764       pPermInv_r = &super_->permInv;
3767       pPerm_r = &super_->perm_r;
3768       pPermInv_r = &super_->permInv_r;
3772     const IntNumVec& permInv_r = *pPermInv_r;
3775     Int numColFirst = this->NumCol() / mpisize;
3776     Int firstCol = mpirank * numColFirst;
3778     if( mpirank == mpisize-1 )
3779       numColLocal = this->NumCol() - numColFirst * (mpisize-1);
3781       numColLocal = numColFirst;
3786     for( Int j = 0; j < numColLocal; j++ ){
3787       Int col = perm( firstCol + j );
3788       Int blockColIdx = BlockIdx( col, super_ );
3789       Int procCol = PCOL( blockColIdx, grid_ );
3790       for( Int i = colPtr[j] - 1; i < colPtr[j+1] - 1; i++ ){
3791         Int row = perm_r( *(rowPtr++) - 1 );
3792         Int blockRowIdx = BlockIdx( row, super_ );
3793         Int procRow = PROW( blockRowIdx, grid_ );
3794         Int dest = PNUM( procRow, procCol, grid_ );
3795 #if ( _DEBUGlevel_ >= 1 )
3796         statusOFS << "BlockIdx = " << blockRowIdx << ", " << blockColIdx << std::endl;
3797         statusOFS << procRow << ", " << procCol << ", "
3798             << dest << std::endl;
3806         &sizeSend[0], 1, MPI_INT,
3807         &sizeRecv[0], 1, MPI_INT, grid_->comm );
3809 #if ( _DEBUGlevel_ >= 1 )
3810     statusOFS << std::endl << "sizeSend: " << sizeSend << std::endl;
3811     statusOFS << std::endl << "sizeRecv: " << sizeRecv << std::endl;
3817     for( Int ip = 0; ip < mpisize; ip++ ){
3822         displsSend[ip] = displsSend[ip-1] + sizeSend[ip-1];
3829         displsRecv[ip] = displsRecv[ip-1] + sizeRecv[ip-1];
3833     Int sizeSendTotal = displsSend[mpisize-1] + sizeSend[mpisize-1];
3834     Int sizeRecvTotal = displsRecv[mpisize-1] + sizeRecv[mpisize-1];
3836     rowSend.resize( sizeSendTotal );
3837     colSend.resize( sizeSendTotal );
3838     valSend.resize( sizeSendTotal );
3840     rowRecv.resize( sizeRecvTotal );
3841     colRecv.resize( sizeRecvTotal );
3842     valRecv.resize( sizeRecvTotal );
3844 #if ( _DEBUGlevel_ >= 1 )
3845     statusOFS << "displsSend = " << displsSend << std::endl;
3846     statusOFS << "displsRecv = " << displsRecv << std::endl;
3850     std::vector<Int> cntSize( mpisize, 0 );
3855     for( Int j = 0; j < numColLocal; j++ ){
3856       Int col = perm( firstCol + j );
3857       Int blockColIdx = BlockIdx( col, super_ );
3858       Int procCol = PCOL( blockColIdx, grid_ );
3859       for( Int i = colPtr[j] - 1; i < colPtr[j+1] - 1; i++ ){
3860         Int row = perm_r( *(rowPtr++) - 1 );
3861         Int blockRowIdx = BlockIdx( row, super_ );
3862         Int procRow = PROW( blockRowIdx, grid_ );
3863         Int dest = PNUM( procRow, procCol, grid_ );
3864         rowSend[displsSend[dest] + cntSize[dest]] = row;
3865         colSend[displsSend[dest] + cntSize[dest]] = col;
3872     for( Int ip = 0; ip < mpisize; ip++ ){
3873       if( cntSize[ip] != sizeSend[ip] ){
3877         throw std::runtime_error( "Sizes of the sending information do not match." );
3883         &rowSend[0], &sizeSend[0], &displsSend[0],
3884         &rowRecv[0], &sizeRecv[0], &displsRecv[0],
3887         &colSend[0], &sizeSend[0], &displsSend[0],
3888         &colRecv[0], &sizeRecv[0], &displsRecv[0],
3891 #if ( _DEBUGlevel_ >= 1 )
3892     statusOFS << "Alltoallv communication of nonzero indices finished." << std::endl;
3896 #if ( _DEBUGlevel_ >= 1 )
3897     for( Int ip = 0; ip < mpisize; ip++ ){
3898       statusOFS << "rowSend[" << ip << "] = " << rowSend[ip] << std::endl;
3899       statusOFS << "rowRecv[" << ip << "] = " << rowRecv[ip] << std::endl;
3900       statusOFS << "colSend[" << ip << "] = " << colSend[ip] << std::endl;
3901       statusOFS << "colRecv[" << ip << "] = " << colRecv[ip] << std::endl;
3906     for( Int g = 0; g < sizeRecvTotal; g++ ){
3907       Int row = rowRecv[g];
3908       Int col = colRecv[g];
3910       Int blockRowIdx = BlockIdx( row, super_ );
3911       Int blockColIdx = BlockIdx( col, super_ );
3914       bool isFound = false;
3916       if( blockColIdx <= blockRowIdx ){
3919         std::vector<LBlock<T> >& Lcol = this->L( LBj( blockColIdx, grid_ ) );
3921         for( Int ib = 0; ib < Lcol.size(); ib++ ){
3922 #if ( _DEBUGlevel_ >= 1 )
3923           statusOFS << "blockRowIdx = " << blockRowIdx << ", Lcol[ib].blockIdx = " << Lcol[ib].blockIdx << ", blockColIdx = " << blockColIdx << std::endl;
3925           if( Lcol[ib].blockIdx == blockRowIdx ){
3927             for( int iloc = 0; iloc < Lcol[ib].numRow; iloc++ ){
3928               if( rows[iloc] == row ){
3930                 valRecv[g] = Lcol[ib].nzval( iloc, jloc );
3936           if( isFound == true ) break;
3942         std::vector<UBlock<T> >& Urow = this->U( LBi( blockRowIdx, grid_ ) );
3944         for( Int jb = 0; jb < Urow.size(); jb++ ){
3945           if( Urow[jb].blockIdx == blockColIdx ){
3947             for( int jloc = 0; jloc < Urow[jb].numCol; jloc++ ){
3948               if( cols[jloc] == col ){
3950                 valRecv[g] = Urow[jb].nzval( iloc, jloc );
3956           if( isFound == true ) break;
3961       if( isFound == false ){
3962         statusOFS << "In the permutated order, (" << row << ", " << col <<
3963             ") is not found in PMatrix." << std::endl;
3964         valRecv[g] = ZERO<T>();
3973         &valRecv[0], &sizeRecv[0], &displsRecv[0],
3974         &valSend[0], &sizeSend[0], &displsSend[0],
3977 #if ( _DEBUGlevel_ >= 1 )
3978     statusOFS << "Alltoallv communication of nonzero values finished." << std::endl;
3997     B.comm = grid_->comm;
3999     for( Int i = 0; i < mpisize; i++ )
4006     for( Int j = 0; j < numColLocal; j++ ){
4007       Int col = perm( firstCol + j );
4008       Int blockColIdx = BlockIdx( col, super_ );
4009       Int procCol = PCOL( blockColIdx, grid_ );
4010       for( Int i = colPtr[j] - 1; i < colPtr[j+1] - 1; i++ ){
4011         Int row = perm_r( *(rowPtr++) - 1 );
4012         Int blockRowIdx = BlockIdx( row, super_ );
4013         Int procRow = PROW( blockRowIdx, grid_ );
4014         Int dest = PNUM( procRow, procCol, grid_ );
4015         *(valPtr++) = valSend[displsSend[dest] + cntSize[dest]];
4021     for( Int ip = 0; ip < mpisize; ip++ ){
4022       if( cntSize[ip] != sizeSend[ip] ){
4026         throw std::runtime_error( "Sizes of the sending information do not match." );
4041 template<typename T>
4045     PushCallStack("PMatrix::NnzLocal");
4049     for( Int ksup = 0; ksup < numSuper; ksup++ ){
4050       if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
4051         std::vector<LBlock<T> >& Lcol = this->L( LBj( ksup, grid_ ) );
4052         for( Int ib = 0; ib < Lcol.size(); ib++ ){
4053           nnzLocal += Lcol[ib].numRow * Lcol[ib].numCol;
4056       if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
4057         std::vector<UBlock<T> >& Urow = this->U( LBi( ksup, grid_ ) );
4058         for( Int jb = 0; jb < Urow.size(); jb++ ){
4059           nnzLocal += Urow[jb].numRow * Urow[jb].numCol;
4072 template<typename T>
4076     PushCallStack("PMatrix::Nnz");
4078     LongInt nnzLocal = LongInt( this->NnzLocal() );
4081     MPI_Allreduce( &nnzLocal, &nnz, 1, MPI_LONG_LONG, MPI_SUM, grid_->comm );
4094     PushCallStack("PMatrix::GetNegativeInertia");
4098     Real inertiaLocal = 0.0;
4101     for( Int ksup = 0; ksup < numSuper; ksup++ ){
4103       if( MYROW( grid_ ) == PROW( ksup, grid_ ) &&
4104           MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
4106         for( Int i = 0; i < LB.numRow; i++ ){
4107           if( LB.nzval(i, i) < 0 )
4114     mpi::Allreduce( &inertiaLocal, &inertia, 1, MPI_SUM, grid_->comm );
4128     PushCallStack("PMatrix::GetNegativeInertia");
4132     Real inertiaLocal = 0.0;
4135     for( Int ksup = 0; ksup < numSuper; ksup++ ){
4137       if( MYROW( grid_ ) == PROW( ksup, grid_ ) &&
4138           MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
4139         LBlock<Complex> & LB = this->L( LBj( ksup, grid_ ) )[0];
4140         for( Int i = 0; i < LB.numRow; i++ ){
4141           if( LB.nzval(i, i).real() < 0 )
4148     mpi::Allreduce( &inertiaLocal, &inertia, 1, MPI_SUM, grid_->comm );
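// --------------------------------------------------------------------------
// Annotation (not part of the original source). Both GetNegativeInertia
// variants count the negative entries on the diagonal of the factored
// diagonal blocks they own (taking the real part in the complex case) and
// then sum the counts over all processes. A plain-MPI sketch of the
// reduction step, with the counter kept as a double as in the listing's
// Real inertiaLocal, is:
//
//   double inertiaLocal = 0.0;
//   // ... for each locally owned diagonal block LB:
//   //       for each i: if( diagonal entry of LB is negative ) inertiaLocal += 1.0;
//   double inertia = 0.0;
//   MPI_Allreduce( &inertiaLocal, &inertia, 1, MPI_DOUBLE, MPI_SUM, comm );
//
// For a symmetric factorization this count, by Sylvester's law of inertia,
// gives the number of negative eigenvalues of the factored matrix, which is
// how the inertia-counting phase of PEXSI is expected to use it.
// --------------------------------------------------------------------------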
4157 template<typename T>
4161     pMat = new PMatrix<T>( pGridType, pSuper, pLuOpt );
4166 template<typename T>
4178 #endif //_PEXSI_PSELINV_IMPL_HPP_