#ifndef _PEXSI_PSELINV_IMPL_HPP_
#define _PEXSI_PSELINV_IMPL_HPP_

#define IDX_TO_TAG(lidx,tag) (SELINV_TAG_COUNT*(lidx)+(tag))
#define TAG_TO_IDX(tag,typetag) (((tag)-(typetag))/SELINV_TAG_COUNT)

#define MPI_MAX_COMM (1024)
#define BCAST_THRESHOLD 20
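
// IDX_TO_TAG maps a (supernode index within the current pipelined step, message
// type) pair to a unique MPI tag, and TAG_TO_IDX recovers the index from a tag,
// so that messages belonging to different supernodes handled in the same step
// do not collide. For example, IDX_TO_TAG(3, SELINV_TAG_L_SIZE) yields
// SELINV_TAG_COUNT*3 + SELINV_TAG_L_SIZE. MPI_MAX_COMM caps the number of
// supernodes processed in one pipelined step (see GetWorkSet below).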
inline GridType::GridType ( MPI_Comm Bcomm, int nprow, int npcol )
{
  PushCallStack("GridType::GridType");

  MPI_Initialized( &info );
  if( !info ){
    throw std::logic_error( "MPI has not been initialized." );
  }

  MPI_Comm_group( Bcomm, &comm_group );
  MPI_Comm_create( Bcomm, comm_group, &comm );

  MPI_Comm_rank( comm, &mpirank );
  MPI_Comm_size( comm, &mpisize );
  if( mpisize != nprow * npcol ){
    throw std::logic_error( "mpisize != nprow * npcol." );
  }

  Int myrow = mpirank / npcol;
  Int mycol = mpirank % npcol;

  MPI_Comm_split( comm, myrow, mycol, &rowComm );
  MPI_Comm_split( comm, mycol, myrow, &colComm );

  MPI_Group_free( &comm_group );

inline GridType::~GridType ( )
{
  PushCallStack("GridType::~GridType");

  MPI_Comm_free( &rowComm );
  MPI_Comm_free( &colComm );
  MPI_Comm_free( &comm );

PMatrix<T>::PMatrix (
    const SuperNodeType* s,
  PushCallStack("PMatrix::PMatrix");
  this->Setup( g, s, o );

void PMatrix<T>::Setup(
    const SuperNodeType* s,
  PushCallStack("PMatrix::Setup");

  ColBlockIdx_.clear();
  RowBlockIdx_.clear();

  L_.resize( this->NumLocalBlockCol() );
  U_.resize( this->NumLocalBlockRow() );

  ColBlockIdx_.resize( this->NumLocalBlockCol() );
  RowBlockIdx_.resize( this->NumLocalBlockRow() );

#if ( _DEBUGlevel_ >= 1 )
  statusOFS << std::endl << "PMatrix is constructed. The grid information: " << std::endl;
  statusOFS << "mpirank = " << MYPROC(grid_) << std::endl;
  statusOFS << "myrow = "   << MYROW(grid_)  << std::endl;
  statusOFS << "mycol = "   << MYCOL(grid_)  << std::endl;
#endif
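
// getMaxCommunicatorSizes: for every pipelined step in the working set, count
// how many processes participate in the send-to-right, send-to-below, and
// recv-from-below patterns of each supernode, and keep the per-step maximum.
// The result is reduced over all processes into maxCommSizes_.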
void PMatrix<T>::getMaxCommunicatorSizes()
{
  IntNumVec maxSizes(this->WorkingSet().size());

  for (Int lidx=0; lidx<this->WorkingSet().size(); lidx++){
    Int stepSuper = this->WorkingSet()[lidx].size();
    for(Int supidx = 0; supidx<stepSuper; supidx++){
      Int ksup = this->WorkingSet()[lidx][supidx];

      Int count = std::count(commSendToRightMask_[ksup].begin(), commSendToRightMask_[ksup].end(), true);
      maxSizes[lidx] = std::max(maxSizes[lidx], count);

      count = std::count(commSendToBelowMask_[ksup].begin(), commSendToBelowMask_[ksup].end(), true);
      maxSizes[lidx] = std::max(maxSizes[lidx], count);

      count = std::count(commRecvFromBelowMask_[ksup].begin(), commRecvFromBelowMask_[ksup].end(), true);
      maxSizes[lidx] = std::max(maxSizes[lidx], count);
    }
  }

  maxCommSizes_.Resize(maxSizes.m());
  MPI_Allreduce( maxSizes.Data(), maxCommSizes_.Data(), maxSizes.m(), MPI_INT, MPI_MAX, grid_->comm );
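
// SelInv_lookup_indexes assembles the dense buffers used by the Schur
// complement GEMM for one supernode: UBuf collects the nonzero U blocks of the
// current block row, and AinvBuf gathers the already computed entries of the
// selected inverse (Sinv) that match the row/column patterns of the received
// L and U blocks.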
inline void PMatrix<T>::SelInv_lookup_indexes(
    SuperNodeBufferType & snode,
    std::vector<LBlock<T> > & LcolRecv,
    std::vector<UBlock<T> > & UrowRecv,
    NumMat<T> & AinvBuf,
    NumMat<T> & UBuf )
{
  TIMER_START(Compute_Sinv_LT_Lookup_Indexes);

  TIMER_START(Build_colptr_rowptr);
  // Prefix sums of the received block sizes give the offsets of every block
  // inside AinvBuf.
  std::vector<Int> rowPtr(LcolRecv.size() + 1);
  std::vector<Int> colPtr(UrowRecv.size() + 1);
  for( Int ib = 0; ib < LcolRecv.size(); ib++ ){
    rowPtr[ib+1] = rowPtr[ib] + LcolRecv[ib].numRow;
  }
  for( Int jb = 0; jb < UrowRecv.size(); jb++ ){
    colPtr[jb+1] = colPtr[jb] + UrowRecv[jb].numCol;
  }
  Int numRowAinvBuf = *rowPtr.rbegin();
  Int numColAinvBuf = *colPtr.rbegin();
  TIMER_STOP(Build_colptr_rowptr);

  TIMER_START(Allocate_lookup);
  AinvBuf.Resize( numRowAinvBuf, numColAinvBuf );
  UBuf.Resize( SuperSize( snode.Index, super_ ), numColAinvBuf );
  TIMER_STOP(Allocate_lookup);

  TIMER_START(Fill_UBuf);
  for( Int jb = 0; jb < UrowRecv.size(); jb++ ){
    UBlock<T>& UB = UrowRecv[jb];
    if( UB.numRow != SuperSize(snode.Index, super_) ){
      throw std::logic_error( "The size of UB is not right. Something is seriously wrong." );
    }
    lapack::Lacpy( 'A', UB.numRow, UB.numCol, UB.nzval.Data(),
        UB.numRow, UBuf.VecData( colPtr[jb] ), SuperSize( snode.Index, super_ ) );
  }
  TIMER_STOP(Fill_UBuf);
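
  // The double loop below fills AinvBuf block by block. For a pair (LB, UB)
  // the needed Sinv entries live either in the stored L part (block column of
  // jsup) or in the stored U part (block row of isup); relRows/relCols map the
  // rows of LB and the columns of UB to their positions inside the matching
  // Sinv block before the entries are copied.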
  TIMER_START(JB_Loop);
  for( Int jb = 0; jb < UrowRecv.size(); jb++ ){
    for( Int ib = 0; ib < LcolRecv.size(); ib++ ){
      LBlock<T>& LB = LcolRecv[ib];
      UBlock<T>& UB = UrowRecv[jb];
      Int isup = LB.blockIdx;
      Int jsup = UB.blockIdx;
      T* nzvalAinv = &AinvBuf( rowPtr[ib], colPtr[jb] );
      Int ldAinv = AinvBuf.m();

      // Case 1: the Sinv block is stored in the block column of L and is
      // looked up through its row indices.
      std::vector<LBlock<T> >& LcolSinv = this->L( LBj(jsup, grid_) );
      bool isBlockFound = false;
      TIMER_START(PARSING_ROW_BLOCKIDX);
      for( Int ibSinv = 0; ibSinv < LcolSinv.size(); ibSinv++ ){
        if( LcolSinv[ibSinv].blockIdx == isup ){
          LBlock<T> & SinvB = LcolSinv[ibSinv];

          // Relative row indices of LB inside SinvB.
          std::vector<Int> relRows( LB.numRow );
          Int* rowsLBPtr    = LB.rows.Data();
          Int* rowsSinvBPtr = SinvB.rows.Data();
          for( Int i = 0; i < LB.numRow; i++ ){
            bool isRowFound = false;
            for( Int i1 = 0; i1 < SinvB.numRow; i1++ ){
              if( rowsLBPtr[i] == rowsSinvBPtr[i1] ){
                isRowFound = true;
                relRows[i] = i1;
                break;
              }
            }
            if( isRowFound == false ){
              std::ostringstream msg;
              msg << "Row " << rowsLBPtr[i]
                  << " in LB cannot find the corresponding row in SinvB" << std::endl
                  << "LB.rows    = " << LB.rows << std::endl
                  << "SinvB.rows = " << SinvB.rows << std::endl;
              throw std::runtime_error( msg.str().c_str() );
            }
          }

          // Relative column indices of UB inside SinvB.
          std::vector<Int> relCols( UB.numCol );
          for( Int j = 0; j < UB.numCol; j++ ){
            relCols[j] = UB.cols[j] - SinvColsSta;
          }

          // Copy the matching entries into AinvBuf.
          T* nzvalSinv = SinvB.nzval.Data();
          Int ldSinv = SinvB.numRow;
          for( Int j = 0; j < UB.numCol; j++ ){
            for( Int i = 0; i < LB.numRow; i++ ){
              nzvalAinv[i+j*ldAinv] = nzvalSinv[relRows[i] + relCols[j] * ldSinv];
            }
          }

          isBlockFound = true;
          break;
        }
      }
      TIMER_STOP(PARSING_ROW_BLOCKIDX);
      if( isBlockFound == false ){
        std::ostringstream msg;
        msg << "Block(" << isup << ", " << jsup
            << ") did not find a matching block in Sinv." << std::endl;
        throw std::runtime_error( msg.str().c_str() );
      }
      // Case 2: the Sinv block is stored in the block row of U and is looked
      // up through its column indices.
      std::vector<UBlock<T> >& UrowSinv = this->U( LBi( isup, grid_ ) );
      bool isBlockFound = false;
      TIMER_START(PARSING_COL_BLOCKIDX);
      for( Int jbSinv = 0; jbSinv < UrowSinv.size(); jbSinv++ ){
        if( UrowSinv[jbSinv].blockIdx == jsup ){
          UBlock<T> & SinvB = UrowSinv[jbSinv];

          // Relative row indices of LB inside SinvB.
          std::vector<Int> relRows( LB.numRow );
          for( Int i = 0; i < LB.numRow; i++ ){
            relRows[i] = LB.rows[i] - SinvRowsSta;
          }

          // Relative column indices of UB inside SinvB.
          std::vector<Int> relCols( UB.numCol );
          Int* colsUBPtr    = UB.cols.Data();
          Int* colsSinvBPtr = SinvB.cols.Data();
          for( Int j = 0; j < UB.numCol; j++ ){
            bool isColFound = false;
            for( Int j1 = 0; j1 < SinvB.numCol; j1++ ){
              if( colsUBPtr[j] == colsSinvBPtr[j1] ){
                isColFound = true;
                relCols[j] = j1;
                break;
              }
            }
            if( isColFound == false ){
              std::ostringstream msg;
              msg << "Col " << colsUBPtr[j]
                  << " in UB cannot find the corresponding column in SinvB" << std::endl
                  << "UB.cols    = " << UB.cols << std::endl
                  << "SinvB.cols = " << SinvB.cols << std::endl;
              throw std::runtime_error( msg.str().c_str() );
            }
          }

          // Copy the matching entries into AinvBuf.
          T* nzvalSinv = SinvB.nzval.Data();
          Int ldSinv = SinvB.numRow;
          for( Int j = 0; j < UB.numCol; j++ ){
            for( Int i = 0; i < LB.numRow; i++ ){
              nzvalAinv[i+j*ldAinv] = nzvalSinv[relRows[i] + relCols[j] * ldSinv];
            }
          }

          isBlockFound = true;
          break;
        }
      }
      TIMER_STOP(PARSING_COL_BLOCKIDX);
      if( isBlockFound == false ){
        std::ostringstream msg;
        msg << "Block(" << isup << ", " << jsup
            << ") did not find a matching block in Sinv." << std::endl;
        throw std::runtime_error( msg.str().c_str() );
      }
    }
  }
  TIMER_STOP(JB_Loop);

  TIMER_STOP(Compute_Sinv_LT_Lookup_Indexes);
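
// SendRecvCD_UpdateU exchanges the L updates with the processes owning the
// cross-diagonal blocks, so that the corresponding U blocks can be filled by
// transposition. Sizes are posted first with SELINV_TAG_L_SIZE, then the
// serialized content with SELINV_TAG_L_CONTENT.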
inline void PMatrix<T>::SendRecvCD_UpdateU(
    std::vector<SuperNodeBufferType > & arrSuperNodes,
  TIMER_START(Send_CD_Update_U);

  Int sendCount = 0;
  Int recvCount = 0;
  Int sendOffset[stepSuper];
  Int recvOffset[stepSuper];
  for (Int supidx=0; supidx<stepSuper; supidx++){
    SuperNodeBufferType & snode = arrSuperNodes[supidx];
    sendOffset[supidx] = sendCount;
    recvOffset[supidx] = recvCount;
    sendCount += CountSendToCrossDiagonal(snode.Index);
    recvCount += CountRecvFromCrossDiagonal(snode.Index);
  }

  std::vector<MPI_Request > arrMpiReqsSendCD(sendCount, MPI_REQUEST_NULL );
  std::vector<MPI_Request > arrMpiReqsSizeSendCD(sendCount, MPI_REQUEST_NULL );
  std::vector<MPI_Request > arrMpiReqsRecvCD(recvCount, MPI_REQUEST_NULL );
  std::vector<MPI_Request > arrMpiReqsSizeRecvCD(recvCount, MPI_REQUEST_NULL );
  std::vector<std::vector<char> > arrSstrLcolSendCD(sendCount);
  std::vector<int > arrSstrLcolSizeSendCD(sendCount);
  std::vector<std::vector<char> > arrSstrLcolRecvCD(recvCount);
  std::vector<int > arrSstrLcolSizeRecvCD(recvCount);

  for (Int supidx=0; supidx<stepSuper; supidx++){
    SuperNodeBufferType & snode = arrSuperNodes[supidx];

    TIMER_START(Send_L_CrossDiag);
    if( MYCOL( grid_ ) == PCOL( snode.Index, grid_ ) && isSendToCrossDiagonal_(grid_->numProcCol, snode.Index ) ){
      for(Int dstCol = 0; dstCol<grid_->numProcCol; dstCol++){
        if(isSendToCrossDiagonal_(dstCol,snode.Index) ){
          Int dest = PNUM( PROW(snode.Index,grid_), dstCol, grid_ );
          if( MYPROC( grid_ ) != dest ){
            MPI_Request & mpiReqSizeSend = arrMpiReqsSizeSendCD[sendOffset[supidx]+sendIdx];
            MPI_Request & mpiReqSend = arrMpiReqsSendCD[sendOffset[supidx]+sendIdx];

            std::stringstream sstm;
            std::vector<char> & sstrLcolSend = arrSstrLcolSendCD[sendOffset[supidx]+sendIdx];
            Int & sstrSize = arrSstrLcolSizeSendCD[sendOffset[supidx]+sendIdx];

            serialize( snode.RowLocalPtr, sstm, NO_MASK );
            serialize( snode.BlockIdxLocal, sstm, NO_MASK );
            serialize( snode.LUpdateBuf, sstm, NO_MASK );

            sstrLcolSend.resize( Size(sstm) );
            sstm.read( &sstrLcolSend[0], sstrLcolSend.size() );
            sstrSize = sstrLcolSend.size();

            MPI_Isend( &sstrSize, 1, MPI_INT, dest, IDX_TO_TAG(supidx,SELINV_TAG_L_SIZE), grid_->comm, &mpiReqSizeSend );
            MPI_Isend( (void*)&sstrLcolSend[0], sstrSize, MPI_BYTE, dest, IDX_TO_TAG(supidx,SELINV_TAG_L_CONTENT), grid_->comm, &mpiReqSend );
          }
        }
      }
    }
    TIMER_STOP(Send_L_CrossDiag);
  }

  for (Int supidx=0; supidx<stepSuper; supidx++){
    SuperNodeBufferType & snode = arrSuperNodes[supidx];
    if( MYROW( grid_ ) == PROW( snode.Index, grid_ ) && isRecvFromCrossDiagonal_(grid_->numProcRow, snode.Index ) ){
      for(Int srcRow = 0; srcRow<grid_->numProcRow; srcRow++){
        if(isRecvFromCrossDiagonal_(srcRow,snode.Index) ){
          Int src = PNUM(srcRow, PCOL(snode.Index,grid_), grid_);
          if( MYPROC( grid_ ) != src ){
            Int & sstrSize = arrSstrLcolSizeRecvCD[recvOffset[supidx]+recvIdx];
            MPI_Request & mpiReqSizeRecv = arrMpiReqsSizeRecvCD[recvOffset[supidx]+recvIdx];
            MPI_Irecv( &sstrSize, 1, MPI_INT, src, IDX_TO_TAG(supidx,SELINV_TAG_L_SIZE), grid_->comm, &mpiReqSizeRecv );
          }
        }
      }
    }
  }

  mpi::Waitall(arrMpiReqsSizeRecvCD);

  for (Int supidx=0; supidx<stepSuper; supidx++){
    SuperNodeBufferType & snode = arrSuperNodes[supidx];
    if( MYROW( grid_ ) == PROW( snode.Index, grid_ ) && isRecvFromCrossDiagonal_(grid_->numProcRow, snode.Index ) ){
      for(Int srcRow = 0; srcRow<grid_->numProcRow; srcRow++){
        if(isRecvFromCrossDiagonal_(srcRow,snode.Index) ){
          Int src = PNUM(srcRow, PCOL(snode.Index,grid_), grid_);
          if( MYPROC( grid_ ) != src ){
            Int & sstrSize = arrSstrLcolSizeRecvCD[recvOffset[supidx]+recvIdx];
            std::vector<char> & sstrLcolRecv = arrSstrLcolRecvCD[recvOffset[supidx]+recvIdx];
            MPI_Request & mpiReqRecv = arrMpiReqsRecvCD[recvOffset[supidx]+recvIdx];
            sstrLcolRecv.resize( sstrSize );
            MPI_Irecv( (void*)&sstrLcolRecv[0], sstrSize, MPI_BYTE, src, IDX_TO_TAG(supidx,SELINV_TAG_L_CONTENT), grid_->comm, &mpiReqRecv );
          }
        }
      }
    }
  }

  mpi::Waitall(arrMpiReqsRecvCD);

  for (Int supidx=0; supidx<stepSuper; supidx++){
    SuperNodeBufferType & snode = arrSuperNodes[supidx];

    if( MYROW( grid_ ) == PROW( snode.Index, grid_ ) && isRecvFromCrossDiagonal_(grid_->numProcRow, snode.Index ) ){
#if ( _DEBUGlevel_ >= 1 )
      statusOFS << std::endl << " ["<<snode.Index<<"] " << "Update the upper triangular block" << std::endl << std::endl;
      statusOFS << std::endl << " ["<<snode.Index<<"] " << "blockIdxLocal:" << snode.BlockIdxLocal << std::endl << std::endl;
      statusOFS << std::endl << " ["<<snode.Index<<"] " << "rowLocalPtr:" << snode.RowLocalPtr << std::endl << std::endl;
#endif

      std::vector<UBlock<T> >& Urow = this->U( LBi( snode.Index, grid_ ) );
      std::vector<bool> isBlockFound(Urow.size(), false);

      for(Int srcRow = 0; srcRow<grid_->numProcRow; srcRow++){
        if(isRecvFromCrossDiagonal_(srcRow,snode.Index) ){
          Int src = PNUM(srcRow, PCOL(snode.Index,grid_), grid_);
          TIMER_START(Recv_L_CrossDiag);

          std::vector<Int> rowLocalPtrRecv;
          std::vector<Int> blockIdxLocalRecv;
          NumMat<T> UUpdateBuf;

          if( MYPROC( grid_ ) != src ){
            std::stringstream sstm;
            Int & sstrSize = arrSstrLcolSizeRecvCD[recvOffset[supidx]+recvIdx];
            std::vector<char> & sstrLcolRecv = arrSstrLcolRecvCD[recvOffset[supidx]+recvIdx];
            sstm.write( &sstrLcolRecv[0], sstrSize );

            deserialize( rowLocalPtrRecv, sstm, NO_MASK );
            deserialize( blockIdxLocalRecv, sstm, NO_MASK );
            deserialize( UUpdateBuf, sstm, NO_MASK );
          }
          else{
            // The data is already local.
            rowLocalPtrRecv   = snode.RowLocalPtr;
            blockIdxLocalRecv = snode.BlockIdxLocal;
            UUpdateBuf        = snode.LUpdateBuf;
          }

          TIMER_STOP(Recv_L_CrossDiag);

#if ( _DEBUGlevel_ >= 1 )
          statusOFS << " ["<<snode.Index<<"] P"<<MYPROC(grid_)<<" ("<<MYROW(grid_)<<","<<MYCOL(grid_)<<") <--- LBj("<<snode.Index<<") <--- P"<<src<<std::endl;
          statusOFS << std::endl << " ["<<snode.Index<<"] " << "rowLocalPtrRecv:" << rowLocalPtrRecv << std::endl << std::endl;
          statusOFS << std::endl << " ["<<snode.Index<<"] " << "blockIdxLocalRecv:" << blockIdxLocalRecv << std::endl << std::endl;
#endif

          // Fill each matching U block with the transpose of the update.
          for( Int ib = 0; ib < blockIdxLocalRecv.size(); ib++ ){
            for( Int jb = 0; jb < Urow.size(); jb++ ){
              UBlock<T>& UB = Urow[jb];
              if( UB.blockIdx == blockIdxLocalRecv[ib] ){
                NumMat<T> Ltmp ( UB.numCol, UB.numRow );
                lapack::Lacpy( 'A', Ltmp.m(), Ltmp.n(),
                    &UUpdateBuf( rowLocalPtrRecv[ib], 0 ),
                    UUpdateBuf.m(), Ltmp.Data(), Ltmp.m() );
                isBlockFound[jb] = true;
                Transpose( Ltmp, UB.nzval );
              }
            }
          }
        }
      }

      for( Int jb = 0; jb < Urow.size(); jb++ ){
        UBlock<T>& UB = Urow[jb];
        if( !isBlockFound[jb] ){
          throw std::logic_error( "UBlock cannot find its update. Something is seriously wrong." );
        }
      }
    }
  }
  TIMER_STOP(Send_CD_Update_U);

  mpi::Waitall(arrMpiReqsSizeSendCD);
  mpi::Waitall(arrMpiReqsSendCD);
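
// UnpackData deserializes the U-row and L-column blocks received from the
// owning processor row/column; processes that already own the data simply copy
// their local blocks into UrowRecv / LcolRecv.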
inline void PMatrix<T>::UnpackData(
    SuperNodeBufferType & snode,
    std::vector<LBlock<T> > & LcolRecv,
    std::vector<UBlock<T> > & UrowRecv )
{
#if ( _DEBUGlevel_ >= 1 )
  statusOFS << std::endl << "["<<snode.Index<<"] " << "Unpack the received data for processors participating in the Gemm. " << std::endl << std::endl;
#endif

  if( MYROW( grid_ ) != PROW( snode.Index, grid_ ) ){
    std::stringstream sstm;
    sstm.write( &snode.SstrUrowRecv[0], snode.SstrUrowRecv.size() );
    std::vector<Int> mask( UBlockMask::TOTAL_NUMBER, 1 );
    Int numUBlock;
    deserialize( numUBlock, sstm, NO_MASK );
    UrowRecv.resize( numUBlock );
    for( Int jb = 0; jb < numUBlock; jb++ ){
      deserialize( UrowRecv[jb], sstm, mask );
    }
  }
  else{
    // The U blocks are already local.
    UrowRecv.resize(this->U( LBi( snode.Index, grid_ ) ).size());
    std::copy(this->U( LBi( snode.Index, grid_ ) ).begin(), this->U( LBi( snode.Index, grid_ )).end(), UrowRecv.begin());
  }

  if( MYCOL( grid_ ) != PCOL( snode.Index, grid_ ) ){
    std::stringstream sstm;
    sstm.write( &snode.SstrLcolRecv[0], snode.SstrLcolRecv.size() );
    std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
    mask[LBlockMask::NZVAL] = 0; // L blocks are sent without numerical values
    Int numLBlock;
    deserialize( numLBlock, sstm, NO_MASK );
    LcolRecv.resize( numLBlock );
    for( Int ib = 0; ib < numLBlock; ib++ ){
      deserialize( LcolRecv[ib], sstm, mask );
    }
  }
  else{
    // The L blocks are already local; skip the diagonal block if it is owned here.
    std::vector<LBlock<T> >& Lcol = this->L( LBj( snode.Index, grid_ ) );
    Int startIdx = ( MYROW( grid_ ) == PROW( snode.Index, grid_ ) ) ? 1 : 0;
    LcolRecv.resize( Lcol.size() - startIdx );
    std::copy(Lcol.begin()+startIdx, Lcol.end(), LcolRecv.begin());
  }
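
// ComputeDiagUpdate accumulates the contributions -LUpdateBuf(i)^T * L(i,k)
// of all off-diagonal blocks owned in this column into snode.DiagBuf, which is
// later reduced onto the diagonal block of the supernode.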
inline void PMatrix<T>::ComputeDiagUpdate(SuperNodeBufferType & snode)
{
  if( MYCOL( grid_ ) == PCOL( snode.Index, grid_ ) ){
#if ( _DEBUGlevel_ >= 1 )
    statusOFS << std::endl << "["<<snode.Index<<"] " << "Updating the diagonal block" << std::endl << std::endl;
#endif
    std::vector<LBlock<T> >& Lcol = this->L( LBj( snode.Index, grid_ ) );

    snode.DiagBuf.Resize( SuperSize( snode.Index, super_ ), SuperSize( snode.Index, super_ ) );

    Int startIb = ( MYROW( grid_ ) == PROW( snode.Index, grid_ ) ) ? 1 : 0;
    for( Int ib = startIb; ib < Lcol.size(); ib++ ){
      blas::Gemm( 'T', 'N', snode.DiagBuf.m(), snode.DiagBuf.n(), Lcol[ib].numRow,
          MINUS_ONE<T>(), &snode.LUpdateBuf( snode.RowLocalPtr[ib-startIb], 0 ), snode.LUpdateBuf.m(),
          Lcol[ib].nzval.Data(), Lcol[ib].nzval.m(), ONE<T>(), snode.DiagBuf.Data(), snode.DiagBuf.m() );
    }

#if ( _DEBUGlevel_ >= 1 )
    statusOFS << std::endl << "["<<snode.Index<<"] " << "Updated the diagonal block" << std::endl << std::endl;
#endif
  }
template<typename T>
void PMatrix<T>::GetEtree( std::vector<Int> & etree_supno )
{
  PushCallStack("PMatrix::GetEtree");
  double begin = MPI_Wtime( );

  if( options_->ColPerm != "PARMETIS" ) {
    // Read the supernodal tree off the column elimination tree.
    etree_supno.resize( this->NumSuper() );
    for(Int i = 0; i < superNode->etree.m(); ++i){
      Int curSnode = superNode->superIdx[i];
      Int parentSnode = (superNode->etree[i] >= superNode->etree.m()) ? this->NumSuper() : superNode->superIdx[superNode->etree[i]];
      if( curSnode != parentSnode){
        etree_supno[curSnode] = parentSnode;
      }
    }
  }
  else{
    // Reconstruct the parent of each supernode from the structure of L: the
    // parent is the smallest block row index below the diagonal block.
    std::vector<Int> etree_supno_l( nsupers, nsupers );
    for( Int ksup = 0; ksup < nsupers; ksup++ ){
      if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
        std::vector<LBlock<T> >& Lcol = this->L( LBj(ksup, grid_) );
        Int firstBlk = 0;
        if( MYROW( grid_ ) == PROW( ksup, grid_ ) )
          firstBlk = 1;
        for( Int ib = firstBlk; ib < Lcol.size(); ib++ ){
          etree_supno_l[ksup] = std::min(etree_supno_l[ksup], Lcol[ib].blockIdx);
        }
      }
    }

#if ( _DEBUGlevel_ >= 1 )
    statusOFS << std::endl << " Local supernodal elimination tree is " << etree_supno_l << std::endl << std::endl;
#endif

    etree_supno.resize( nsupers );
    mpi::Allreduce( (Int*) &etree_supno_l[0], (Int *) &etree_supno[0], nsupers, MPI_MIN, grid_->comm );
    etree_supno[nsupers-1] = nsupers;
  }

  double end = MPI_Wtime( );
  statusOFS << "Building the list took " << end-begin << "s" << std::endl;
template<typename T>
void PMatrix<T>::GetWorkSet(
    std::vector<Int> & snodeEtree,
    std::vector<std::vector<Int> > & WSet)
{
  TIMER_START(Compute_WorkSet);

  if (options_->maxPipelineDepth!=1){
    // Level-based scheduling: supernodes at the same depth of the elimination
    // tree can be processed in the same pipelined step.
    Int rootParent = snodeEtree[numSuper-2];
    for(Int i=rootParent-1; i>=0; i-- ){ level(i) = level(snodeEtree[i])+1; numLevel = std::max(numLevel, level(i)); }

    IntNumVec levelSize(numLevel);
    for(Int i=rootParent-1; i>=0; i-- ){ levelSize(level(i))++; }

    WSet.resize(numLevel, std::vector<Int>());
    for(Int i=0; i<numLevel; i++ ){ WSet[i].reserve(levelSize(i)); }

    for(Int i=rootParent-1; i>=0; i-- ){
      WSet[level(i)].push_back(i);
    }

    // Split steps that exceed the maximum allowed number of supernodes.
    Int limit = (options_->maxPipelineDepth>0) ? std::min(MPI_MAX_COMM, options_->maxPipelineDepth) : MPI_MAX_COMM;
    for (Int lidx=0; lidx<WSet.size(); lidx++){
      if(WSet[lidx].size()>limit)
      {
        std::vector<std::vector<Int> >::iterator pos = WSet.begin()+lidx+1;
        WSet.insert(pos, std::vector<Int>());
        WSet[lidx+1].insert(WSet[lidx+1].begin(), WSet[lidx].begin() + limit, WSet[lidx].end());
        WSet[lidx].erase(WSet[lidx].begin()+limit, WSet[lidx].end());
      }
    }
  }
  else{
    // No pipelining: one supernode per step, processed bottom-up.
    for( Int ksup = numSuper - 2; ksup >= 0; ksup-- ){
      WSet.push_back(std::vector<Int>());
      WSet.back().push_back(ksup);
    }
  }

  TIMER_STOP(Compute_WorkSet);

#if ( _DEBUGlevel_ >= 1 )
  for (Int lidx=0; lidx<WSet.size(); lidx++){
    statusOFS << std::endl << "L"<< lidx << " is: {";
    for (Int supidx=0; supidx<WSet[lidx].size(); supidx++){
      statusOFS << WSet[lidx][supidx] << " ["<<snodeEtree[WSet[lidx][supidx]]<<"] ";
    }
    statusOFS << " }" << std::endl;
  }
#endif
template<typename T>
inline void PMatrix<T>::SelInvIntra_P2p(Int lidx)
{
#if defined (PROFILE) || defined(PMPI) || defined(USE_TAU)
  Real begin_SendULWaitContentFirst, end_SendULWaitContentFirst, time_SendULWaitContentFirst = 0;
#endif

  std::vector<std::vector<Int> > & superList = this->WorkingSet();
  Int numSteps = superList.size();
  Int stepSuper = superList[lidx].size();

  TIMER_START(AllocateBuffer);

  std::vector<std::vector<MPI_Request> > arrMpireqsSendToBelow;
  arrMpireqsSendToBelow.resize( stepSuper, std::vector<MPI_Request>( 2 * grid_->numProcRow, MPI_REQUEST_NULL ));
  std::vector<std::vector<MPI_Request> > arrMpireqsSendToRight;
  arrMpireqsSendToRight.resize(stepSuper, std::vector<MPI_Request>( 2 * grid_->numProcCol, MPI_REQUEST_NULL ));

  std::vector<MPI_Request> arrMpireqsSendToLeft;
  arrMpireqsSendToLeft.resize(stepSuper, MPI_REQUEST_NULL );
  std::vector<MPI_Request> arrMpireqsSendToAbove;
  arrMpireqsSendToAbove.resize(stepSuper, MPI_REQUEST_NULL );

  std::vector<MPI_Request> arrMpireqsRecvSizeFromAny;
  arrMpireqsRecvSizeFromAny.resize(stepSuper*2, MPI_REQUEST_NULL);
  std::vector<MPI_Request> arrMpireqsRecvContentFromAny;
  arrMpireqsRecvContentFromAny.resize(stepSuper*2, MPI_REQUEST_NULL);

  std::vector<SuperNodeBufferType> arrSuperNodes(stepSuper);
  for (Int supidx=0; supidx<stepSuper; supidx++){
    arrSuperNodes[supidx].Index = superList[lidx][supidx];
  }

  NumMat<T> AinvBuf, UBuf;

  TIMER_STOP(AllocateBuffer);

  PushCallStack("PMatrix::SelInv_P2p::UpdateL");

#if ( _DEBUGlevel_ >= 1 )
  statusOFS << std::endl << "Communication to the Schur complement." << std::endl << std::endl;
#endif

  for (Int supidx=0; supidx<stepSuper; supidx++){
    SuperNodeBufferType & snode = arrSuperNodes[supidx];
    std::vector<MPI_Request> & mpireqsSendToBelow = arrMpireqsSendToBelow[supidx];
    std::vector<MPI_Request> & mpireqsSendToRight = arrMpireqsSendToRight[supidx];

#if ( _DEBUGlevel_ >= 1 )
    statusOFS << std::endl << "["<<snode.Index<<"] " << "Communication for the U part." << std::endl << std::endl;
#endif

    // Send the U blocks of the current block row to the processor rows below.
    if( MYROW( grid_ ) == PROW( snode.Index, grid_ ) ){
      TIMER_START(Serialize_UL);
      std::stringstream sstm;
      std::vector<Int> mask( UBlockMask::TOTAL_NUMBER, 1 );
      std::vector<UBlock<T> >& Urow = this->U( LBi(snode.Index, grid_) );
      serialize( (Int)Urow.size(), sstm, NO_MASK );
      for( Int jb = 0; jb < Urow.size(); jb++ ){
        serialize( Urow[jb], sstm, mask );
      }
      snode.SstrUrowSend.resize( Size( sstm ) );
      sstm.read( &snode.SstrUrowSend[0], snode.SstrUrowSend.size() );
      snode.SizeSstrUrowSend = snode.SstrUrowSend.size();
      TIMER_STOP(Serialize_UL);

      for( Int iProcRow = 0; iProcRow < grid_->numProcRow; iProcRow++ ){
        if( MYROW( grid_ ) != iProcRow &&
            isSendToBelow_( iProcRow, snode.Index ) == true ){
          MPI_Isend( &snode.SizeSstrUrowSend, 1, MPI_INT,
              iProcRow, IDX_TO_TAG(supidx,SELINV_TAG_U_SIZE), grid_->colComm, &mpireqsSendToBelow[2*iProcRow] );
          MPI_Isend( (void*)&snode.SstrUrowSend[0], snode.SizeSstrUrowSend, MPI_BYTE,
              iProcRow, IDX_TO_TAG(supidx,SELINV_TAG_U_CONTENT),
              grid_->colComm, &mpireqsSendToBelow[2*iProcRow+1] );
#if ( _DEBUGlevel_ >= 1 )
          statusOFS << std::endl << "["<<snode.Index<<"] " << "Sending U " << snode.SizeSstrUrowSend << " BYTES" << std::endl << std::endl;
#endif
        }
      }
    }

#if ( _DEBUGlevel_ >= 1 )
    statusOFS << std::endl << "["<<snode.Index<<"] " << "Communication for the L part." << std::endl << std::endl;
#endif

    // Send the strictly lower-triangular blocks of the current block column
    // to the processor columns on the right. Only the index structure is
    // shipped; the numerical values are masked out.
    if( MYCOL( grid_ ) == PCOL( snode.Index, grid_ ) ){
      TIMER_START(Serialize_UL);
      std::stringstream sstm;
      std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
      mask[LBlockMask::NZVAL] = 0;

      std::vector<LBlock<T> >& Lcol = this->L( LBj(snode.Index, grid_) );
      // The diagonal block is not sent.
      if( MYROW( grid_ ) == PROW( snode.Index, grid_ ) )
        serialize( (Int)Lcol.size() - 1, sstm, NO_MASK );
      else
        serialize( (Int)Lcol.size(), sstm, NO_MASK );

      for( Int ib = 0; ib < Lcol.size(); ib++ ){
        if( Lcol[ib].blockIdx > snode.Index ){
#if ( _DEBUGlevel_ >= 2 )
          statusOFS << std::endl << "["<<snode.Index<<"] " << "Serializing Block index " << Lcol[ib].blockIdx << std::endl;
#endif
          serialize( Lcol[ib], sstm, mask );
        }
      }
      snode.SstrLcolSend.resize( Size( sstm ) );
      sstm.read( &snode.SstrLcolSend[0], snode.SstrLcolSend.size() );
      snode.SizeSstrLcolSend = snode.SstrLcolSend.size();
      TIMER_STOP(Serialize_UL);

      for( Int iProcCol = 0; iProcCol < grid_->numProcCol; iProcCol++ ){
        if( MYCOL( grid_ ) != iProcCol &&
            isSendToRight_( iProcCol, snode.Index ) == true ){
          MPI_Isend( &snode.SizeSstrLcolSend, 1, MPI_INT,
              iProcCol, IDX_TO_TAG(supidx,SELINV_TAG_L_SIZE),
              grid_->rowComm, &mpireqsSendToRight[2*iProcCol] );
          MPI_Isend( (void*)&snode.SstrLcolSend[0], snode.SizeSstrLcolSend, MPI_BYTE,
              iProcCol, IDX_TO_TAG(supidx,SELINV_TAG_L_CONTENT),
              grid_->rowComm, &mpireqsSendToRight[2*iProcCol+1] );
#if ( _DEBUGlevel_ >= 1 )
          statusOFS << std::endl << "["<<snode.Index<<"] " << "Sending L " << snode.SizeSstrLcolSend << " BYTES" << std::endl << std::endl;
#endif
        }
      }
    }
  }

  // Post the receives for the sizes of the incoming U and L panels.
  for (Int supidx=0; supidx<stepSuper; supidx++){
    SuperNodeBufferType & snode = arrSuperNodes[supidx];
    MPI_Request * mpireqsRecvFromAbove = &arrMpireqsRecvSizeFromAny[supidx*2];
    MPI_Request * mpireqsRecvFromLeft  = &arrMpireqsRecvSizeFromAny[supidx*2+1];

    if( isRecvFromAbove_( snode.Index ) &&
        MYROW( grid_ ) != PROW( snode.Index, grid_ ) ){
      MPI_Irecv( &snode.SizeSstrUrowRecv, 1, MPI_INT, PROW( snode.Index, grid_ ),
          IDX_TO_TAG(supidx,SELINV_TAG_U_SIZE),
          grid_->colComm, mpireqsRecvFromAbove );
#if ( _DEBUGlevel_ >= 1 )
      statusOFS << std::endl << "["<<snode.Index<<"] " << "Receiving U size on tag " << IDX_TO_TAG(supidx,SELINV_TAG_U_SIZE) << std::endl << std::endl;
#endif
    }

    if( isRecvFromLeft_( snode.Index ) &&
        MYCOL( grid_ ) != PCOL( snode.Index, grid_ ) ){
      MPI_Irecv( &snode.SizeSstrLcolRecv, 1, MPI_INT, PCOL( snode.Index, grid_ ),
          IDX_TO_TAG(supidx,SELINV_TAG_L_SIZE),
          grid_->rowComm, mpireqsRecvFromLeft );
#if ( _DEBUGlevel_ >= 1 )
      statusOFS << std::endl << "["<<snode.Index<<"] " << "Receiving L size on tag " << IDX_TO_TAG(supidx,SELINV_TAG_L_SIZE) << std::endl << std::endl;
#endif
    }
  }

  TIMER_START(WaitSize_UL);
  mpi::Waitall(arrMpireqsRecvSizeFromAny);
  TIMER_STOP(WaitSize_UL);

  // Once the sizes are known, post the receives for the panel contents.
  for (Int supidx=0; supidx<stepSuper; supidx++){
    SuperNodeBufferType & snode = arrSuperNodes[supidx];
    MPI_Request * mpireqsRecvFromAbove = &arrMpireqsRecvContentFromAny[supidx*2];
    MPI_Request * mpireqsRecvFromLeft  = &arrMpireqsRecvContentFromAny[supidx*2+1];

    if( isRecvFromAbove_( snode.Index ) &&
        MYROW( grid_ ) != PROW( snode.Index, grid_ ) ){
      snode.SstrUrowRecv.resize( snode.SizeSstrUrowRecv );
      MPI_Irecv( &snode.SstrUrowRecv[0], snode.SizeSstrUrowRecv, MPI_BYTE,
          PROW( snode.Index, grid_ ), IDX_TO_TAG(supidx,SELINV_TAG_U_CONTENT),
          grid_->colComm, mpireqsRecvFromAbove );
#if ( _DEBUGlevel_ >= 1 )
      statusOFS << std::endl << "["<<snode.Index<<"] " << "Receiving U " << snode.SizeSstrUrowRecv << " BYTES" << std::endl << std::endl;
#endif
    }

    if( isRecvFromLeft_( snode.Index ) &&
        MYCOL( grid_ ) != PCOL( snode.Index, grid_ ) ){
      snode.SstrLcolRecv.resize( snode.SizeSstrLcolRecv );
      MPI_Irecv( &snode.SstrLcolRecv[0], snode.SizeSstrLcolRecv, MPI_BYTE,
          PCOL( snode.Index, grid_ ), IDX_TO_TAG(supidx,SELINV_TAG_L_CONTENT),
          grid_->rowComm, mpireqsRecvFromLeft );
#if ( _DEBUGlevel_ >= 1 )
      statusOFS << std::endl << "["<<snode.Index<<"] " << "Receiving L " << snode.SizeSstrLcolRecv << " BYTES" << std::endl << std::endl;
#endif
    }
  }
  TIMER_START(Compute_Sinv_LT);

  Int gemmProcessed = 0;
  Int gemmToDo = 0;
  // Supernodes whose L and U panels are already available.
  std::vector<Int> readySupidx;
  for(Int supidx = 0; supidx<stepSuper; supidx++){
    SuperNodeBufferType & snode = arrSuperNodes[supidx];

    if( isRecvFromAbove_( snode.Index ) && isRecvFromLeft_( snode.Index )){
      gemmToDo++;
      // Panels owned locally do not need to be received.
      if( MYCOL( grid_ ) == PCOL( snode.Index, grid_ ) ){
        snode.isReady++;
      }
      if( MYROW( grid_ ) == PROW( snode.Index, grid_ ) ){
        snode.isReady++;
      }
      if(snode.isReady==2){
        readySupidx.push_back(supidx);
#if ( _DEBUGlevel_ >= 1 )
        statusOFS << std::endl << "Locally processing ["<<snode.Index<<"]" << std::endl;
#endif
      }
    }
    else if( (isRecvFromLeft_( snode.Index ) ) && MYCOL( grid_ ) != PCOL( snode.Index, grid_ ) )
    {
      // This process does not perform the GEMM, but the owner column still
      // expects a (possibly empty) reduction contribution.
      MPI_Request & mpireqsSendToLeft = arrMpireqsSendToLeft[supidx];
      MPI_Isend( NULL, 0, MPI_BYTE, PCOL(snode.Index,grid_), IDX_TO_TAG(supidx,SELINV_TAG_L_REDUCE), grid_->rowComm, &mpireqsSendToLeft );

#if ( _DEBUGlevel_ >= 1 )
      statusOFS << std::endl << "["<<snode.Index<<"] " << " P"<<MYPROC(grid_)<<" has sent "<< 0 << " bytes to " << PNUM(MYROW(grid_), PCOL(snode.Index,grid_), grid_) << std::endl;
#endif
    }
  }

#if ( _DEBUGlevel_ >= 1 )
  statusOFS << std::endl << "gemmToDo =" << gemmToDo << std::endl;
#endif

#if defined (PROFILE)
  end_SendULWaitContentFirst=0;
  begin_SendULWaitContentFirst=0;
#endif

  while(gemmProcessed<gemmToDo)
  {
    Int reqidx = MPI_UNDEFINED;
    Int supidx = 0;

    // Wait until at least one new supernode becomes ready.
    int reqIndices[arrMpireqsRecvContentFromAny.size()];
    int numRecv = 0;
    do{
      TIMER_START(WaitContent_UL);
#if defined(PROFILE)
      if(begin_SendULWaitContentFirst==0){
        begin_SendULWaitContentFirst=1;
        TIMER_START(WaitContent_UL_First);
      }
#endif

      MPI_Waitsome(2*stepSuper, &arrMpireqsRecvContentFromAny[0], &numRecv, reqIndices, MPI_STATUSES_IGNORE);

      for(int i = 0; i<numRecv; i++){
        reqidx = reqIndices[i];
        if(reqidx!=MPI_UNDEFINED)
        {
          // Requests are stored pairwise per supernode (U content, L content).
          supidx = reqidx / 2;
          SuperNodeBufferType & snode = arrSuperNodes[supidx];
          snode.isReady++;

#if ( _DEBUGlevel_ >= 1 )
          statusOFS << std::endl << "Received data for ["<<snode.Index<<"] reqidx%2="<<reqidx%2<<" is ready ?"<<snode.isReady<<std::endl;
#endif
          if(snode.isReady==2){
            readySupidx.push_back(supidx);
#if defined(PROFILE)
            if(end_SendULWaitContentFirst==0){
              TIMER_STOP(WaitContent_UL_First);
              end_SendULWaitContentFirst=1;
            }
#endif
          }
        }
      }
      TIMER_STOP(WaitContent_UL);
    }
    while(readySupidx.size()==0);

    // Process one ready supernode.
    if(readySupidx.size()>0)
    {
      supidx = readySupidx.back();
      readySupidx.pop_back();
      SuperNodeBufferType & snode = arrSuperNodes[supidx];

      if( isRecvFromAbove_( snode.Index ) && isRecvFromLeft_( snode.Index ) ){
        std::vector<LBlock<T> > LcolRecv;
        std::vector<UBlock<T> > UrowRecv;

        UnpackData(snode, LcolRecv, UrowRecv);
        SelInv_lookup_indexes(snode, LcolRecv, UrowRecv, AinvBuf, UBuf);

        snode.LUpdateBuf.Resize( AinvBuf.m(), SuperSize( snode.Index, super_ ) );
        TIMER_START(Compute_Sinv_LT_GEMM);
        // LUpdateBuf <- -AinvBuf * UBuf^T
        blas::Gemm( 'N', 'T', AinvBuf.m(), UBuf.m(), AinvBuf.n(), MINUS_ONE<T>(),
            AinvBuf.Data(), AinvBuf.m(),
            UBuf.Data(), UBuf.m(), ZERO<T>(),
            snode.LUpdateBuf.Data(), snode.LUpdateBuf.m() );
        TIMER_STOP(Compute_Sinv_LT_GEMM);

#if ( _DEBUGlevel_ >= 2 )
        statusOFS << std::endl << "["<<snode.Index<<"] " << "snode.LUpdateBuf: " << snode.LUpdateBuf << std::endl;
#endif
      }

      // Send the update back to the owner column for the reduction.
      if( isRecvFromAbove_( snode.Index ) ){
        if( isRecvFromLeft_( snode.Index ) && MYCOL( grid_ ) != PCOL( snode.Index, grid_ ) )
        {
          MPI_Request & mpireqsSendToLeft = arrMpireqsSendToLeft[supidx];
          MPI_Isend( snode.LUpdateBuf.Data(), snode.LUpdateBuf.ByteSize(), MPI_BYTE,
              PCOL(snode.Index,grid_), IDX_TO_TAG(supidx,SELINV_TAG_L_REDUCE), grid_->rowComm, &mpireqsSendToLeft );

#if ( _DEBUGlevel_ >= 1 )
          statusOFS << std::endl << "["<<snode.Index<<"] " << " P"<<MYCOL(grid_)<<" has sent "<< snode.LUpdateBuf.ByteSize() << " bytes to " << PCOL(snode.Index,grid_) << std::endl;
#endif
        }
      }

      gemmProcessed++;
#if ( _DEBUGlevel_ >= 1 )
      statusOFS << std::endl << "gemmProcessed =" << gemmProcessed << "/" << gemmToDo << std::endl;
#endif
    }
  }
  TIMER_STOP(Compute_Sinv_LT);
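
  // Reduce the LUpdateBuf contributions of all processor columns onto the
  // column that owns the supernode. The owner accumulates one message per
  // contributor (CountSendToRight) with blas::Axpy.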
  TIMER_START(Reduce_Sinv_LT);

  for (Int supidx=0; supidx<stepSuper; supidx++){
    SuperNodeBufferType & snode = arrSuperNodes[supidx];

    if( MYCOL( grid_ ) == PCOL( snode.Index, grid_ ) ){
      // Determine the local row structure of the update buffer.
      Int numRowLUpdateBuf;
      std::vector<LBlock<T> >& Lcol = this->L( LBj( snode.Index, grid_ ) );
      if( MYROW( grid_ ) != PROW( snode.Index, grid_ ) ){
        snode.RowLocalPtr.resize( Lcol.size() + 1 );
        snode.BlockIdxLocal.resize( Lcol.size() );
        snode.RowLocalPtr[0] = 0;
        for( Int ib = 0; ib < Lcol.size(); ib++ ){
          snode.RowLocalPtr[ib+1] = snode.RowLocalPtr[ib] + Lcol[ib].numRow;
          snode.BlockIdxLocal[ib] = Lcol[ib].blockIdx;
        }
      }
      else{
        // The diagonal block (Lcol[0]) is not part of the update buffer.
        snode.RowLocalPtr.resize( Lcol.size() );
        snode.BlockIdxLocal.resize( Lcol.size() - 1 );
        snode.RowLocalPtr[0] = 0;
        for( Int ib = 1; ib < Lcol.size(); ib++ ){
          snode.RowLocalPtr[ib] = snode.RowLocalPtr[ib-1] + Lcol[ib].numRow;
          snode.BlockIdxLocal[ib-1] = Lcol[ib].blockIdx;
        }
      }
      numRowLUpdateBuf = *snode.RowLocalPtr.rbegin();

      if( numRowLUpdateBuf > 0 ){
        if( snode.LUpdateBuf.m() == 0 && snode.LUpdateBuf.n() == 0 ){
          snode.LUpdateBuf.Resize( numRowLUpdateBuf, SuperSize( snode.Index, super_ ) );
          SetValue( snode.LUpdateBuf, ZERO<T>() );
        }
      }

#if ( _DEBUGlevel_ >= 2 )
      statusOFS << std::endl << "["<<snode.Index<<"] " << "LUpdateBuf Before Reduction: " << snode.LUpdateBuf << std::endl << std::endl;
#endif

      Int totCountRecv = 0;
      Int numRecv = CountSendToRight(snode.Index);
      NumMat<T> LUpdateBufRecv(numRowLUpdateBuf, SuperSize( snode.Index, super_ ) );
      for( Int countRecv = 0; countRecv < numRecv; ++countRecv ){
        MPI_Status stat;
        int size = 0;
        TIMER_START(L_RECV);
        MPI_Recv(LUpdateBufRecv.Data(), LUpdateBufRecv.ByteSize(), MPI_BYTE, MPI_ANY_SOURCE, IDX_TO_TAG(supidx,SELINV_TAG_L_REDUCE), grid_->rowComm, &stat);
        MPI_Get_count(&stat, MPI_BYTE, &size);

#if ( _DEBUGlevel_ >= 1 )
        statusOFS << std::endl << "["<<snode.Index<<"] " << " P"<<MYCOL(grid_)<<" has received "<< size << " bytes from " << stat.MPI_SOURCE << std::endl;
#endif
#if ( _DEBUGlevel_ >= 2 )
        statusOFS << std::endl << "["<<snode.Index<<"] " << "LUpdateBufRecv: " << LUpdateBufRecv << std::endl << std::endl;
#endif
        blas::Axpy(snode.LUpdateBuf.Size(), ONE<T>(), LUpdateBufRecv.Data(), 1, snode.LUpdateBuf.Data(), 1 );
      }

#if ( _DEBUGlevel_ >= 2 )
      statusOFS << std::endl << "["<<snode.Index<<"] " << "LUpdateBuf After Reduction: " << snode.LUpdateBuf << std::endl << std::endl;
#endif
    }
  }
  TIMER_STOP(Reduce_Sinv_LT);

  mpi::Waitall( arrMpireqsSendToLeft );

  PushCallStack("PMatrix::SelInv_P2p::UpdateD");

  TIMER_START(Update_Diagonal);
  for (Int supidx=0; supidx<stepSuper; supidx++){
    SuperNodeBufferType & snode = arrSuperNodes[supidx];

    ComputeDiagUpdate(snode);

    if( MYCOL( grid_ ) == PCOL( snode.Index, grid_ ) ){
      if( MYROW( grid_ ) != PROW( snode.Index, grid_ ) ){
        if(isSendToDiagonal_(snode.Index)){
          MPI_Request & mpireqsSendToAbove = arrMpireqsSendToAbove[supidx];
          MPI_Isend( snode.DiagBuf.Data(), snode.DiagBuf.ByteSize(), MPI_BYTE,
              PROW(snode.Index,grid_), IDX_TO_TAG(supidx,SELINV_TAG_D_REDUCE), grid_->colComm, &mpireqsSendToAbove );

#if ( _DEBUGlevel_ >= 1 )
          statusOFS << std::endl << "["<<snode.Index<<"] " << " P"<<MYROW(grid_)<<" has sent "<< snode.DiagBuf.ByteSize() << " bytes of DiagBuf to " << PROW(snode.Index,grid_) << " isSendToDiagonal = " << isSendToDiagonal_(snode.Index) << std::endl;
#endif
        }
      }
    }
  }
  TIMER_STOP(Update_Diagonal);
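
  // The diagonal contributions are reduced onto the process that owns the
  // diagonal block, which then adds the result to L(k,k) and re-symmetrizes it.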
  TIMER_START(Reduce_Diagonal);

  for (Int supidx=0; supidx<stepSuper; supidx++){
    SuperNodeBufferType & snode = arrSuperNodes[supidx];
    if( MYCOL( grid_ ) == PCOL( snode.Index, grid_ ) ){
      if( MYROW( grid_ ) == PROW( snode.Index, grid_ ) ){
        if(snode.DiagBuf.Size()==0){
          snode.DiagBuf.Resize( SuperSize( snode.Index, super_ ), SuperSize( snode.Index, super_ ) );
          SetValue(snode.DiagBuf, ZERO<T>());
        }

        Int totCountRecv = 0;
        Int numRecv = CountRecvFromBelow(snode.Index);
        NumMat<T> DiagBufRecv(snode.DiagBuf.m(), snode.DiagBuf.n());

        for( Int countRecv = 0; countRecv < numRecv; ++countRecv ){
          MPI_Status stat;
          int size = 0;
          TIMER_START(D_RECV);
          MPI_Recv(DiagBufRecv.Data(), DiagBufRecv.ByteSize(), MPI_BYTE, MPI_ANY_SOURCE, IDX_TO_TAG(supidx,SELINV_TAG_D_REDUCE), grid_->colComm, &stat);
          MPI_Get_count(&stat, MPI_BYTE, &size);

          blas::Axpy(snode.DiagBuf.Size(), ONE<T>(), DiagBufRecv.Data(),
              1, snode.DiagBuf.Data(), 1 );
        }

        LBlock<T> & LB = this->L( LBj( snode.Index, grid_ ) )[0];
        blas::Axpy( LB.numRow * LB.numCol, ONE<T>(), snode.DiagBuf.Data(), 1, LB.nzval.Data(), 1 );
        Symmetrize( LB.nzval );
      }
    }
  }
  TIMER_STOP(Reduce_Diagonal);

  PushCallStack("PMatrix::SelInv_P2p::UpdateU");

  SendRecvCD_UpdateU(arrSuperNodes, stepSuper);

  PushCallStack("PMatrix::SelInv_P2p::UpdateLFinal");

  TIMER_START(Update_L);
  for (Int supidx=0; supidx<stepSuper; supidx++){
    SuperNodeBufferType & snode = arrSuperNodes[supidx];

#if ( _DEBUGlevel_ >= 1 )
    statusOFS << std::endl << "["<<snode.Index<<"] " << "Finish updating the L part by filling LUpdateBufReduced back to L" << std::endl << std::endl;
#endif

    if( MYCOL( grid_ ) == PCOL( snode.Index, grid_ ) && snode.LUpdateBuf.m() > 0 ){
      std::vector<LBlock<T> >& Lcol = this->L( LBj( snode.Index, grid_ ) );
      Int startBlock = ( MYROW( grid_ ) == PROW( snode.Index, grid_ ) ) ? 1 : 0;
      for( Int ib = startBlock; ib < Lcol.size(); ib++ ){
        LBlock<T> & LB = Lcol[ib];
        lapack::Lacpy( 'A', LB.numRow, LB.numCol, &snode.LUpdateBuf( snode.RowLocalPtr[ib-startBlock], 0 ),
            snode.LUpdateBuf.m(), LB.nzval.Data(), LB.numRow );
      }
    }
  }
  TIMER_STOP(Update_L);

  TIMER_START(Barrier);
  mpi::Waitall(arrMpireqsRecvContentFromAny);
  mpi::Waitall(arrMpireqsSendToAbove);

  for (Int supidx=0; supidx<stepSuper; supidx++){
    Int ksup = superList[lidx][supidx];
    std::vector<MPI_Request> & mpireqsSendToRight = arrMpireqsSendToRight[supidx];
    std::vector<MPI_Request> & mpireqsSendToBelow = arrMpireqsSendToBelow[supidx];

    if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
      MPI_Barrier(grid_->colComm);
    }

    mpi::Waitall( mpireqsSendToRight );
    mpi::Waitall( mpireqsSendToBelow );
  }
  TIMER_STOP(Barrier);

  if (options_->maxPipelineDepth!=-1)
    MPI_Barrier(grid_->comm);
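
// ConstructCommunicationPattern precomputes, for every supernode, which
// processes must exchange L/U panels (isSendToBelow_ / isSendToRight_), which
// participate in the reductions (isRecvFrom*_), and which pairs communicate
// across the diagonal (is*CrossDiagonal_).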
template<typename T>
void PMatrix<T>::ConstructCommunicationPattern( )
{
  ConstructCommunicationPattern_P2p();
}

template<typename T>
void PMatrix<T>::ConstructCommunicationPattern_P2p( )
{
  PushCallStack("PMatrix::ConstructCommunicationPattern_P2p");

  TIMER_START(Allocate);

  PushCallStack("Initialize the communication pattern");
  isSendToBelow_.Resize(grid_->numProcRow, numSuper);
  isSendToRight_.Resize(grid_->numProcCol, numSuper);
  isSendToDiagonal_.Resize( numSuper );
  SetValue( isSendToDiagonal_, false );

  isSendToCrossDiagonal_.Resize(grid_->numProcCol+1, numSuper );
  SetValue( isSendToCrossDiagonal_, false );
  isRecvFromCrossDiagonal_.Resize(grid_->numProcRow+1, numSuper );
  SetValue( isRecvFromCrossDiagonal_, false );

  isRecvFromAbove_.Resize( numSuper );
  isRecvFromLeft_.Resize( numSuper );
  isRecvFromBelow_.Resize( grid_->numProcRow, numSuper );
  SetValue( isRecvFromAbove_, false );
  SetValue( isRecvFromBelow_, false );
  SetValue( isRecvFromLeft_, false );

  TIMER_STOP(Allocate);

  TIMER_START(GetEtree);
  std::vector<Int> snodeEtree(this->NumSuper());
  GetEtree(snodeEtree);
  TIMER_STOP(GetEtree);

  PushCallStack("Local column communication");
#if ( _DEBUGlevel_ >= 1 )
  statusOFS << std::endl << "Local column communication" << std::endl;
#endif

  // localColBlockRowIdx[lb] records the nonzero block row indices of the
  // lb-th local block column, gathered over the processor column.
  std::vector<std::set<Int> > localColBlockRowIdx;
  localColBlockRowIdx.resize( this->NumLocalBlockCol() );

  TIMER_START(Column_communication);

  for( Int ksup = 0; ksup < numSuper; ksup++ ){
    if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
      std::vector<Int> tAllBlockRowIdx;
      std::vector<Int> & colBlockIdx = ColBlockIdx( LBj(ksup, grid_) );
      TIMER_START(Allgatherv_Column_communication);
      if( grid_->mpisize != 1 )
        mpi::Allgatherv( colBlockIdx, tAllBlockRowIdx, grid_->colComm );
      else
        tAllBlockRowIdx = colBlockIdx;
      TIMER_STOP(Allgatherv_Column_communication);

      localColBlockRowIdx[ LBj( ksup, grid_ ) ].insert(
          tAllBlockRowIdx.begin(), tAllBlockRowIdx.end() );

#if ( _DEBUGlevel_ >= 1 )
      statusOFS << " Column block " << ksup
                << " has the following nonzero block rows" << std::endl;
      for( std::set<Int>::iterator si = localColBlockRowIdx[ LBj( ksup, grid_ ) ].begin();
          si != localColBlockRowIdx[ LBj( ksup, grid_ ) ].end();
          si++ ){
        statusOFS << *si << " ";
      }
      statusOFS << std::endl;
#endif
    }
  }

  TIMER_STOP(Column_communication);

  TIMER_START(Row_communication);
  PushCallStack("Local row communication");
#if ( _DEBUGlevel_ >= 1 )
  statusOFS << std::endl << "Local row communication" << std::endl;
#endif

  // localRowBlockColIdx[lb] records the nonzero block column indices of the
  // lb-th local block row, gathered over the processor row.
  std::vector<std::set<Int> > localRowBlockColIdx;
  localRowBlockColIdx.resize( this->NumLocalBlockRow() );

  for( Int ksup = 0; ksup < numSuper; ksup++ ){
    if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
      std::vector<Int> tAllBlockColIdx;
      std::vector<Int> & rowBlockIdx = RowBlockIdx( LBi(ksup, grid_) );
      TIMER_START(Allgatherv_Row_communication);
      if( grid_->mpisize != 1 )
        mpi::Allgatherv( rowBlockIdx, tAllBlockColIdx, grid_->rowComm );
      else
        tAllBlockColIdx = rowBlockIdx;
      TIMER_STOP(Allgatherv_Row_communication);

      localRowBlockColIdx[ LBi( ksup, grid_ ) ].insert(
          tAllBlockColIdx.begin(), tAllBlockColIdx.end() );

#if ( _DEBUGlevel_ >= 1 )
      statusOFS << " Row block " << ksup
                << " has the following nonzero block columns" << std::endl;
      for( std::set<Int>::iterator si = localRowBlockColIdx[ LBi( ksup, grid_ ) ].begin();
          si != localRowBlockColIdx[ LBi( ksup, grid_ ) ].end();
          si++ ){
        statusOFS << *si << " ";
      }
      statusOFS << std::endl;
#endif
    }
  }

  TIMER_STOP(Row_communication);

  TIMER_START(STB_RFA);
  PushCallStack("SendToBelow / RecvFromAbove");

  for( Int ksup = 0; ksup < numSuper - 1; ksup++ ){
    // Walk up the elimination tree: every ancestor jsup of ksup whose block
    // column contains the block row ksup needs the U panel of ksup.
    Int jsup = snodeEtree[ksup];
    while(jsup<numSuper){
      Int jsupLocalBlockCol = LBj( jsup, grid_ );
      Int jsupProcCol = PCOL( jsup, grid_ );
      if( MYCOL( grid_ ) == jsupProcCol ){
        if( localColBlockRowIdx[jsupLocalBlockCol].count( ksup ) > 0 ) {
          for( std::set<Int>::iterator si = localColBlockRowIdx[jsupLocalBlockCol].begin();
              si != localColBlockRowIdx[jsupLocalBlockCol].end(); si++ ){
            Int isup = *si;
            Int isupProcRow = PROW( isup, grid_ );
            if( MYROW( grid_ ) == isupProcRow ){
              isRecvFromAbove_(ksup) = true;
            }
            if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
              isSendToBelow_( isupProcRow, ksup ) = true;
            }
          }
        }
      }
      jsup = snodeEtree[jsup];
    }
  }

#if ( _DEBUGlevel_ >= 1 )
  statusOFS << std::endl << "isSendToBelow:" << std::endl;
  for(int j = 0; j< isSendToBelow_.n(); j++){
    statusOFS << "["<<j<<"] ";
    for(int i = 0; i < isSendToBelow_.m(); i++){
      statusOFS << isSendToBelow_(i,j) << " ";
    }
    statusOFS << std::endl;
  }

  statusOFS << std::endl << "isRecvFromAbove:" << std::endl;
  for(int j = 0; j< isRecvFromAbove_.m(); j++){
    statusOFS << "["<<j<<"] " << isRecvFromAbove_(j) << std::endl;
  }
#endif

  TIMER_STOP(STB_RFA);

  TIMER_START(STR_RFL_RFB);

  PushCallStack("SendToRight / RecvFromLeft");

  for( Int ksup = 0; ksup < numSuper - 1; ksup++ ){
    // Walk up the elimination tree: every ancestor isup of ksup whose block
    // row contains the block column ksup needs the L panel of ksup and
    // contributes to the diagonal reduction of ksup.
    Int isup = snodeEtree[ksup];
    while(isup<numSuper){
      Int isupLocalBlockRow = LBi( isup, grid_ );
      Int isupProcRow = PROW( isup, grid_ );
      if( MYROW( grid_ ) == isupProcRow ){
        if( localRowBlockColIdx[isupLocalBlockRow].count( ksup ) > 0 ){
          for( std::set<Int>::iterator si = localRowBlockColIdx[isupLocalBlockRow].begin();
              si != localRowBlockColIdx[isupLocalBlockRow].end(); si++ ){
            Int jsup = *si;
            Int jsupProcCol = PCOL( jsup, grid_ );
            if( MYCOL( grid_ ) == jsupProcCol ){
              isRecvFromLeft_(ksup) = true;
            }
            if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
              isSendToRight_( jsupProcCol, ksup ) = true;
            }
          }
        }
      }

      if( MYCOL( grid_ ) == PCOL(ksup, grid_) ){
        if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
          isRecvFromBelow_(isupProcRow,ksup) = true;
        }
        else if ( MYROW(grid_) == isupProcRow){
          isSendToDiagonal_(ksup) = true;
        }
      }
      isup = snodeEtree[isup];
    }
  }

#if ( _DEBUGlevel_ >= 1 )
  statusOFS << std::endl << "isSendToRight:" << std::endl;
  for(int j = 0; j< isSendToRight_.n(); j++){
    statusOFS << "["<<j<<"] ";
    for(int i = 0; i < isSendToRight_.m(); i++){
      statusOFS << isSendToRight_(i,j) << " ";
    }
    statusOFS << std::endl;
  }

  statusOFS << std::endl << "isRecvFromLeft:" << std::endl;
  for(int j = 0; j< isRecvFromLeft_.m(); j++){
    statusOFS << "["<<j<<"] " << isRecvFromLeft_(j) << std::endl;
  }

  statusOFS << std::endl << "isRecvFromBelow:" << std::endl;
  for(int j = 0; j< isRecvFromBelow_.n(); j++){
    statusOFS << "["<<j<<"] ";
    for(int i = 0; i < isRecvFromBelow_.m(); i++){
      statusOFS << isRecvFromBelow_(i,j) << " ";
    }
    statusOFS << std::endl;
  }
#endif

  TIMER_STOP(STR_RFL_RFB);

  TIMER_START(STCD_RFCD);

  PushCallStack("SendToCrossDiagonal / RecvFromCrossDiagonal");

  for( Int ksup = 0; ksup < numSuper - 1; ksup++ ){
    if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
      for( std::set<Int>::iterator si = localColBlockRowIdx[ LBj( ksup, grid_ ) ].begin();
          si != localColBlockRowIdx[ LBj( ksup, grid_ ) ].end(); si++ ){
        Int isup = *si;
        Int isupProcRow = PROW( isup, grid_ );
        Int isupProcCol = PCOL( isup, grid_ );
        if( isup > ksup && MYROW( grid_ ) == isupProcRow ){
          isSendToCrossDiagonal_(grid_->numProcCol, ksup ) = true;
          isSendToCrossDiagonal_(isupProcCol, ksup ) = true;
        }
      }
    }
  }

  for( Int ksup = 0; ksup < numSuper - 1; ksup++ ){
    if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
      for( std::set<Int>::iterator si = localRowBlockColIdx[ LBi(ksup, grid_) ].begin();
          si != localRowBlockColIdx[ LBi(ksup, grid_) ].end(); si++ ){
        Int jsup = *si;
        Int jsupProcCol = PCOL( jsup, grid_ );
        Int jsupProcRow = PROW( jsup, grid_ );
        if( jsup > ksup && MYCOL(grid_) == jsupProcCol ){
          isRecvFromCrossDiagonal_(grid_->numProcRow, ksup ) = true;
          isRecvFromCrossDiagonal_(jsupProcRow, ksup ) = true;
        }
      }
    }
  }

#if ( _DEBUGlevel_ >= 1 )
  statusOFS << std::endl << "isSendToCrossDiagonal:" << std::endl;
  for(int j = 0; j < isSendToCrossDiagonal_.n(); j++){
    if(isSendToCrossDiagonal_(grid_->numProcCol,j)){
      statusOFS << "["<<j<<"] ";
      for(int i = 0; i < isSendToCrossDiagonal_.m()-1; i++){
        if(isSendToCrossDiagonal_(i,j))
        {
          statusOFS << PNUM( PROW(j,grid_), i, grid_) << " ";
        }
      }
      statusOFS << std::endl;
    }
  }

  statusOFS << std::endl << "isRecvFromCrossDiagonal:" << std::endl;
  for(int j = 0; j < isRecvFromCrossDiagonal_.n(); j++){
    if(isRecvFromCrossDiagonal_(grid_->numProcRow,j)){
      statusOFS << "["<<j<<"] ";
      for(int i = 0; i < isRecvFromCrossDiagonal_.m()-1; i++){
        if(isRecvFromCrossDiagonal_(i,j))
        {
          statusOFS << PNUM(i, PCOL(j,grid_), grid_) << " ";
        }
      }
      statusOFS << std::endl;
    }
  }
#endif

  TIMER_STOP(STCD_RFCD);

  GetWorkSet(snodeEtree, this->WorkingSet());
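
// SelInv_P2p drives the selected inversion: the working set built by
// GetWorkSet is traversed step by step and SelInvIntra_P2p is called once per
// pipelined step.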
template<typename T>

template<typename T>
void PMatrix<T>::SelInv_P2p( )
{
  TIMER_START(SelInv_P2p);

  PushCallStack("PMatrix::SelInv_P2p");

  std::vector<std::vector<Int> > & superList = this->WorkingSet();
  Int numSteps = superList.size();

  for (Int lidx=0; lidx<numSteps; lidx++){
    Int stepSuper = superList[lidx].size();
    SelInvIntra_P2p(lidx);
  }

  TIMER_STOP(SelInv_P2p);
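
// PreSelInv prepares the factors before the selected inversion: it scales
// L(i,k) by L(k,k)^{-1}, mirrors the scaled L blocks into U across the
// diagonal, and replaces each diagonal block by [L(k,k)*U(k,k)]^{-1}.
// As a sketch (not prescribed by this file), a typical driver would call
// ConstructCommunicationPattern(), then PreSelInv(), then SelInv(), and
// finally GetDiagonal() or PMatrixToDistSparseMatrix() to retrieve results.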
template<typename T>
void PMatrix<T>::PreSelInv( )
{
  PushCallStack("PMatrix::PreSelInv");

  PushCallStack("L(i,k) <- L(i,k) * L(k,k)^{-1}");
#if ( _DEBUGlevel_ >= 1 )
  statusOFS << std::endl << "L(i,k) <- L(i,k) * L(k,k)^{-1}" << std::endl << std::endl;
#endif
  for( Int ksup = 0; ksup < numSuper; ksup++ ){
    if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
      // Broadcast the diagonal block L(k,k) within the processor column and
      // solve each block of the column against it.
      std::vector<LBlock<T> >& Lcol = this->L( LBj( ksup, grid_ ) );
      if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
        nzvalLDiag = Lcol[0].nzval;
        if( nzvalLDiag.m() != SuperSize(ksup, super_) ||
            nzvalLDiag.n() != SuperSize(ksup, super_) ){
          throw std::runtime_error( "The size of the diagonal block of L is wrong." );
        }
      }

      MPI_Bcast( (void*)nzvalLDiag.Data(), nzvalLDiag.ByteSize(),
          MPI_BYTE, PROW( ksup, grid_ ), grid_->colComm );

      for( Int ib = 0; ib < Lcol.size(); ib++ ){
        LBlock<T> & LB = Lcol[ib];
#if ( _DEBUGlevel_ >= 2 )
        if( LBj( ksup, grid_ ) == 0 ){
          statusOFS << "Diag L(" << ksup << ", " << ksup << "): " << nzvalLDiag << std::endl;
          statusOFS << "Before solve L(" << LB.blockIdx << ", " << ksup << "): " << LB.nzval << std::endl;
        }
#endif
        blas::Trsm( 'R', 'L', 'N', 'U', LB.numRow, LB.numCol, ONE<T>(),
            nzvalLDiag.Data(), SuperSize( ksup, super_ ), LB.nzval.Data(), LB.numRow );
#if ( _DEBUGlevel_ >= 2 )
        if( LBj( ksup, grid_ ) == 0 ){
          statusOFS << "After solve L(" << LB.blockIdx << ", " << ksup << "): " << LB.nzval << std::endl;
        }
#endif
      }
    }
  }

  PushCallStack("U(k,i) <- L(i,k)");
#if ( _DEBUGlevel_ >= 1 )
  statusOFS << std::endl << "U(k,i) <- L(i,k)" << std::endl << std::endl;
#endif

  for( Int ksup = 0; ksup < numSuper; ksup++ ){
    Int ksupProcRow = PROW( ksup, grid_ );
    Int ksupProcCol = PCOL( ksup, grid_ );

    Int sendCount = CountSendToCrossDiagonal(ksup);
    Int recvCount = CountRecvFromCrossDiagonal(ksup);

    std::vector<MPI_Request > arrMpiReqsSend(sendCount, MPI_REQUEST_NULL );
    std::vector<MPI_Request > arrMpiReqsSizeSend(sendCount, MPI_REQUEST_NULL );
    std::vector<std::vector<char> > arrSstrLcolSend(sendCount);
    std::vector<Int > arrSstrLcolSizeSend(sendCount);

    std::vector<MPI_Request > arrMpiReqsRecv(recvCount, MPI_REQUEST_NULL );
    std::vector<MPI_Request > arrMpiReqsSizeRecv(recvCount, MPI_REQUEST_NULL );
    std::vector<std::vector<char> > arrSstrLcolRecv(recvCount);
    std::vector<Int > arrSstrLcolSizeRecv(recvCount);

    // Send the relevant blocks of the block column ksup of L to the
    // processors owning the corresponding blocks of the block row ksup of U.
    if( isSendToCrossDiagonal_(grid_->numProcCol,ksup) ){
#if ( _DEBUGlevel_ >= 1 )
      statusOFS << "["<<ksup<<"] P"<<MYPROC(grid_)<<" should send to "<<CountSendToCrossDiagonal(ksup)<<" processors"<<std::endl;
#endif

      for(Int dstCol = 0; dstCol<grid_->numProcCol; dstCol++){
        if(isSendToCrossDiagonal_(dstCol,ksup) ){
          Int dst = PNUM( PROW(ksup,grid_), dstCol, grid_);

          std::stringstream sstm;
          std::vector<char> & sstrLcolSend = arrSstrLcolSend[sendIdx];
          Int & sstrSize = arrSstrLcolSizeSend[sendIdx];
          MPI_Request & mpiReqSend = arrMpiReqsSend[sendIdx];
          MPI_Request & mpiReqSizeSend = arrMpiReqsSizeSend[sendIdx];

          std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
          std::vector<LBlock<T> >& Lcol = this->L( LBj(ksup, grid_) );

          // Count the blocks destined for this processor column.
          Int count = 0;
          if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
            for( Int ib = 1; ib < Lcol.size(); ib++ ){
              if( Lcol[ib].blockIdx > ksup && (Lcol[ib].blockIdx % grid_->numProcCol) == dstCol ){
                count++;
              }
            }
          }
          else{
            for( Int ib = 0; ib < Lcol.size(); ib++ ){
              if( Lcol[ib].blockIdx > ksup && (Lcol[ib].blockIdx % grid_->numProcCol) == dstCol ){
                count++;
              }
            }
          }

          serialize( (Int)count, sstm, NO_MASK );

          for( Int ib = 0; ib < Lcol.size(); ib++ ){
            if( Lcol[ib].blockIdx > ksup && (Lcol[ib].blockIdx % grid_->numProcCol) == dstCol ){
#if ( _DEBUGlevel_ >= 1 )
              statusOFS << "["<<ksup<<"] SEND contains "<<Lcol[ib].blockIdx<<" which corresponds to "<<GBj(ib,grid_)<<std::endl;
#endif
              serialize( Lcol[ib], sstm, mask );
            }
          }

          sstrLcolSend.resize( Size(sstm) );
          sstm.read( &sstrLcolSend[0], sstrLcolSend.size() );
          sstrSize = sstrLcolSend.size();

#if ( _DEBUGlevel_ >= 1 )
          statusOFS << "["<<ksup<<"] P"<<MYPROC(grid_)<<" ("<<MYROW(grid_)<<","<<MYCOL(grid_)<<") ---> LBj("<<ksup<<")="<<LBj(ksup,grid_)<<" ---> P"<<dst<<" ("<<ksupProcRow<<","<<dstCol<<")"<<std::endl;
#endif
          MPI_Isend( &sstrSize, 1, MPI_INT, dst, SELINV_TAG_D_SIZE, grid_->comm, &mpiReqSizeSend );
          MPI_Isend( (void*)&sstrLcolSend[0], sstrSize, MPI_BYTE, dst, SELINV_TAG_D_CONTENT, grid_->comm, &mpiReqSend );
        }
      }
    }

    // Receive the corresponding L blocks from across the diagonal and use
    // them to fill the local U blocks of the block row ksup.
    if( isRecvFromCrossDiagonal_(grid_->numProcRow,ksup) ){
#if ( _DEBUGlevel_ >= 1 )
      statusOFS << "["<<ksup<<"] P"<<MYPROC(grid_)<<" should receive from "<<CountRecvFromCrossDiagonal(ksup)<<" processors"<<std::endl;
#endif

      std::vector<UBlock<T> >& Urow = this->U( LBi( ksup, grid_ ) );
      std::vector<bool> isBlockFound(Urow.size(), false);

      // First exchange the message sizes.
      for(Int srcRow = 0; srcRow<grid_->numProcRow; srcRow++){
        if(isRecvFromCrossDiagonal_(srcRow,ksup) ){
          std::vector<LBlock<T> > LcolRecv;
          Int src = PNUM(srcRow, PCOL(ksup,grid_), grid_);
          MPI_Request & mpiReqSizeRecv = arrMpiReqsSizeRecv[recvIdx];
          Int & sstrSize = arrSstrLcolSizeRecv[recvIdx];

          MPI_Irecv( &sstrSize, 1, MPI_INT, src, SELINV_TAG_D_SIZE, grid_->comm, &mpiReqSizeRecv );
        }
      }

      mpi::Waitall(arrMpiReqsSizeRecv);

      // Then post the receives for the contents.
      for(Int srcRow = 0; srcRow<grid_->numProcRow; srcRow++){
        if(isRecvFromCrossDiagonal_(srcRow,ksup) ){
          std::vector<LBlock<T> > LcolRecv;
          Int src = PNUM(srcRow, PCOL(ksup,grid_), grid_);
          MPI_Request & mpiReqRecv = arrMpiReqsRecv[recvIdx];
          Int & sstrSize = arrSstrLcolSizeRecv[recvIdx];
          std::vector<char> & sstrLcolRecv = arrSstrLcolRecv[recvIdx];
          sstrLcolRecv.resize(sstrSize);

          MPI_Irecv( &sstrLcolRecv[0], sstrSize, MPI_BYTE, src, SELINV_TAG_D_CONTENT, grid_->comm, &mpiReqRecv );
        }
      }

      mpi::Waitall(arrMpiReqsRecv);

      // Unpack the received L blocks (or use the local ones) and fill the
      // matching U blocks of block row ksup with their transposes.
      for(Int srcRow = 0; srcRow<grid_->numProcRow; srcRow++){
        if(isRecvFromCrossDiagonal_(srcRow,ksup) ){
          std::vector<LBlock<T> > LcolRecv;
          Int src = PNUM(srcRow, PCOL(ksup,grid_), grid_);
          if( MYPROC( grid_ ) != src ){
            Int & sstrSize = arrSstrLcolSizeRecv[recvIdx];
            std::vector<char> & sstrLcolRecv = arrSstrLcolRecv[recvIdx];
            std::stringstream sstm;

#if ( _DEBUGlevel_ >= 1 )
            statusOFS << "["<<ksup<<"] P"<<MYPROC(grid_)<<" ("<<MYROW(grid_)<<","<<MYCOL(grid_)<<") <--- LBj("<<ksup<<") <--- P"<<src<<" ("<<srcRow<<","<<ksupProcCol<<")"<<std::endl;
#endif
            sstm.write( &sstrLcolRecv[0], sstrSize );

            std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
            deserialize( numLBlock, sstm, NO_MASK );
            LcolRecv.resize(numLBlock);
            for( Int ib = 0; ib < numLBlock; ib++ ){
              deserialize( LcolRecv[ib], sstm, mask );
#if ( _DEBUGlevel_ >= 1 )
              statusOFS << "["<<ksup<<"] RECV contains "<<LcolRecv[ib].blockIdx<<" which corresponds to "<< ib * grid_->numProcRow + srcRow;
              statusOFS << " L is on row "<< srcRow <<" whereas U is on col "<< LcolRecv[ib].blockIdx % grid_->numProcCol <<std::endl;
#endif
            }
          }
          else{
            // The data is local: copy the block column of L, skipping the
            // diagonal block when it is owned by this process.
            std::vector<LBlock<T> >& Lcol = this->L( LBj( ksup, grid_ ) );
            if( MYROW( grid_ ) != PROW( ksup, grid_ ) ){
              LcolRecv.resize( Lcol.size() );
              for( Int ib = 0; ib < Lcol.size(); ib++ ){
                LcolRecv[ib] = Lcol[ib];
              }
            }
            else{
              LcolRecv.resize( Lcol.size() - 1 );
              for( Int ib = 0; ib < Lcol.size() - 1; ib++ ){
                LcolRecv[ib] = Lcol[ib+1];
              }
            }
          }

          for( Int ib = 0; ib < LcolRecv.size(); ib++ ){
            LBlock<T> & LB = LcolRecv[ib];
            if( LB.blockIdx <= ksup ){
              throw std::logic_error( "LcolRecv contains the wrong blocks." );
            }
            for( Int jb = 0; jb < Urow.size(); jb++ ){
              UBlock<T> & UB = Urow[jb];
              if( UB.blockIdx == LB.blockIdx ){
                if( LB.numRow != UB.numCol || LB.numCol != UB.numRow ){
                  std::ostringstream msg;
                  msg << "LB(" << LB.blockIdx << ", " << ksup << ") and UB("
                      << ksup << ", " << UB.blockIdx << ") do not share the same size." << std::endl
                      << "LB: " << LB.numRow << " x " << LB.numCol << std::endl
                      << "UB: " << UB.numRow << " x " << UB.numCol << std::endl;
                  throw std::runtime_error( msg.str().c_str() );
                }

                // Fill UB with the transpose of LB.

#if ( _DEBUGlevel_ >= 1 )
                statusOFS << "["<<ksup<<"] USING "<<LB.blockIdx<< std::endl;
#endif
                isBlockFound[jb] = true;
              }
            }
          }
        }
      }

      for( Int jb = 0; jb < Urow.size(); jb++ ){
        if( !isBlockFound[jb] ){
          throw std::logic_error( "UBlock cannot find its update. Something is seriously wrong." );
        }
      }
    }

    mpi::Waitall(arrMpiReqsSizeSend);
    mpi::Waitall(arrMpiReqsSend);
  }

  PushCallStack("L(i,i) <- [L(k,k) * U(k,k)]^{-1} ");
#if ( _DEBUGlevel_ >= 1 )
  statusOFS << std::endl << "L(i,i) <- [L(k,k) * U(k,k)]^{-1}" << std::endl << std::endl;
#endif

  for( Int ksup = 0; ksup < numSuper; ksup++ ){
    if( MYROW( grid_ ) == PROW( ksup, grid_ ) &&
        MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
      // The diagonal block already stores the LU factors of A(k,k); invert it
      // in place using the identity permutation.
      LBlock<T> & LB = this->L( LBj( ksup, grid_ ) )[0];
      IntNumVec ipiv( SuperSize( ksup, super_ ) );
      for(Int i = 0; i < SuperSize( ksup, super_ ); i++){
        ipiv[i] = i + 1;
      }
#if ( _DEBUGlevel_ >= 2 )
      statusOFS << "Factorized A (" << ksup << ", " << ksup << "): " << LB.nzval << std::endl;
#endif
      lapack::Getri( SuperSize( ksup, super_ ), LB.nzval.Data(),
          SuperSize( ksup, super_ ), ipiv.Data() );

      Symmetrize( LB.nzval );
#if ( _DEBUGlevel_ >= 2 )
      statusOFS << "Inversed A (" << ksup << ", " << ksup << "): " << LB.nzval << std::endl;
#endif
    }
  }
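
// GetDiagonal extracts the diagonal of the selected inverse: each process
// contributes the diagonal entries of the supernodes it owns (mapped back to
// the original ordering through permInv) and the result is summed over the
// whole grid.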
template<typename T>
void PMatrix<T>::GetDiagonal( NumVec<T>& diag )
{
  PushCallStack("PMatrix::GetDiagonal");

  Int numCol = this->NumCol();
  const IntNumVec& permInv = super_->permInv;

  diag.Resize( numCol );

  for( Int ksup = 0; ksup < numSuper; ksup++ ){
    // For each supernode owned by this process, copy the diagonal entries of
    // the diagonal block back to the original ordering.
    if( MYROW( grid_ ) == PROW( ksup, grid_ ) &&
        MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
      LBlock<T> & LB = this->L( LBj( ksup, grid_ ) )[0];
      for( Int i = 0; i < LB.numRow; i++ ){
        diagLocal( permInv( LB.rows(i) ) ) = LB.nzval( i, i );
      }
    }
  }

  // Sum the local contributions over all processes.
  mpi::Allreduce( diagLocal.Data(), diag.Data(), numCol, MPI_SUM, grid_->comm );
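
// PMatrixToDistSparseMatrix converts the distributed block structure back to a
// column-wise DistSparseMatrix: every (row, column, value) triplet is routed
// to the process owning the destination column range, using an all-to-all
// exchange after the per-destination counts and displacements are computed.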
template< typename T >
void PMatrix<T>::PMatrixToDistSparseMatrix ( DistSparseMatrix<T>& A )
{
  PushCallStack( "PMatrix::PMatrixToDistSparseMatrix" );
#if ( _DEBUGlevel_ >= 1 )
  statusOFS << std::endl << "Converting PMatrix to DistSparseMatrix." << std::endl;
#endif
  Int mpirank = grid_->mpirank;
  Int mpisize = grid_->mpisize;

  std::vector<Int> rowSend( mpisize );
  std::vector<Int> colSend( mpisize );
  std::vector<T> valSend( mpisize );
  std::vector<Int> sizeSend( mpisize, 0 );
  std::vector<Int> displsSend( mpisize, 0 );

  std::vector<Int> rowRecv( mpisize );
  std::vector<Int> colRecv( mpisize );
  std::vector<T> valRecv( mpisize );
  std::vector<Int> sizeRecv( mpisize, 0 );
  std::vector<Int> displsRecv( mpisize, 0 );

  Int numSuper = this->NumSuper();
  const IntNumVec& permInv = super_->permInv;

  // The output matrix is distributed by blocks of consecutive columns.
  Int numColFirst = this->NumCol() / mpisize;
  // Count how many nonzeros this processor sends to each destination processor.
  for( Int ksup = 0; ksup < numSuper; ksup++ ){
    // L blocks
    if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
      std::vector<LBlock<T> >& Lcol = this->L( LBj( ksup, grid_ ) );
      for( Int ib = 0; ib < Lcol.size(); ib++ ){
        for( Int j = 0; j < Lcol[ib].numCol; j++ ){
          Int jcol = permInv( j + FirstBlockCol( ksup, super_ ) );
          Int dest = std::min( jcol / numColFirst, mpisize - 1 );
          sizeSend[dest] += Lcol[ib].numRow;
        }
      }
    }

    // U blocks
    if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
      std::vector<UBlock<T> >& Urow = this->U( LBi( ksup, grid_ ) );
      for( Int jb = 0; jb < Urow.size(); jb++ ){
        IntNumVec& cols = Urow[jb].cols;
        for( Int j = 0; j < cols.m(); j++ ){
          Int jcol = permInv( cols(j) );
          Int dest = std::min( jcol / numColFirst, mpisize - 1 );
          sizeSend[dest] += Urow[jb].numRow;
        }
      }
    }
  }

  // Exchange the counts so that every processor knows how much it will receive.
  MPI_Alltoall(
      &sizeSend[0], 1, MPI_INT,
      &sizeRecv[0], 1, MPI_INT, grid_->comm );
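// Illustrative sketch (not from the library): the 1-D column block rule used
// above to decide which processor owns a global column jcol of the output
// matrix; the last processor also absorbs the remainder columns.
#if 0
inline int columnOwner( int jcol, int numColFirst, int mpisize )
{
  int dest = jcol / numColFirst;   // same as std::min( jcol/numColFirst, mpisize-1 )
  return dest < mpisize - 1 ? dest : mpisize - 1;
}
#endif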
  // The displacements are an exclusive prefix sum of the counts.
  for( Int ip = 0; ip < mpisize; ip++ ){
    if( ip == 0 ){
      displsSend[ip] = 0;
      displsRecv[ip] = 0;
    }
    else{
      displsSend[ip] = displsSend[ip-1] + sizeSend[ip-1];
      displsRecv[ip] = displsRecv[ip-1] + sizeRecv[ip-1];
    }
  }
  Int sizeSendTotal = displsSend[mpisize-1] + sizeSend[mpisize-1];
  Int sizeRecvTotal = displsRecv[mpisize-1] + sizeRecv[mpisize-1];

  rowSend.resize( sizeSendTotal );
  colSend.resize( sizeSendTotal );
  valSend.resize( sizeSendTotal );

  rowRecv.resize( sizeRecvTotal );
  colRecv.resize( sizeRecvTotal );
  valRecv.resize( sizeRecvTotal );

#if ( _DEBUGlevel_ >= 1 )
  statusOFS << "displsSend = " << displsSend << std::endl;
  statusOFS << "displsRecv = " << displsRecv << std::endl;
#endif
  std::vector<Int> cntSize( mpisize, 0 );

  // Fill the send buffers with (row, col, val) triplets in the permuted ordering.
  for( Int ksup = 0; ksup < numSuper; ksup++ ){
    // L blocks
    if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
      std::vector<LBlock<T> >& Lcol = this->L( LBj( ksup, grid_ ) );
      for( Int ib = 0; ib < Lcol.size(); ib++ ){
        IntNumVec& rows = Lcol[ib].rows;
        NumMat<T>& nzval = Lcol[ib].nzval;
        for( Int j = 0; j < Lcol[ib].numCol; j++ ){
          Int jcol = permInv( j + FirstBlockCol( ksup, super_ ) );
          Int dest = std::min( jcol / numColFirst, mpisize - 1 );
          for( Int i = 0; i < rows.m(); i++ ){
            rowSend[displsSend[dest] + cntSize[dest]] = permInv( rows(i) );
            colSend[displsSend[dest] + cntSize[dest]] = jcol;
            valSend[displsSend[dest] + cntSize[dest]] = nzval( i, j );
            cntSize[dest]++;
          }
        }
      }
    }

    // U blocks
    if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
      std::vector<UBlock<T> >& Urow = this->U( LBi( ksup, grid_ ) );
      for( Int jb = 0; jb < Urow.size(); jb++ ){
        IntNumVec& cols = Urow[jb].cols;
        NumMat<T>& nzval = Urow[jb].nzval;
        for( Int j = 0; j < cols.m(); j++ ){
          Int jcol = permInv( cols(j) );
          Int dest = std::min( jcol / numColFirst, mpisize - 1 );
          for( Int i = 0; i < Urow[jb].numRow; i++ ){
            rowSend[displsSend[dest] + cntSize[dest]] =
              permInv( i + FirstBlockCol( ksup, super_ ) );
            colSend[displsSend[dest] + cntSize[dest]] = jcol;
            valSend[displsSend[dest] + cntSize[dest]] = nzval( i, j );
            cntSize[dest]++;
          }
        }
      }
    }
  }

  // Consistency check between the counting pass and the filling pass.
  for( Int ip = 0; ip < mpisize; ip++ ){
    if( cntSize[ip] != sizeSend[ip] )
      throw std::runtime_error( "Sizes of the sending information do not match." );
  }
  // Redistribute the (row, col, val) triplets with three Alltoallv exchanges.
  mpi::Alltoallv(
      &rowSend[0], &sizeSend[0], &displsSend[0],
      &rowRecv[0], &sizeRecv[0], &displsRecv[0], grid_->comm );
  mpi::Alltoallv(
      &colSend[0], &sizeSend[0], &displsSend[0],
      &colRecv[0], &sizeRecv[0], &displsRecv[0], grid_->comm );
  mpi::Alltoallv(
      &valSend[0], &sizeSend[0], &displsSend[0],
      &valRecv[0], &sizeRecv[0], &displsRecv[0], grid_->comm );

#if ( _DEBUGlevel_ >= 1 )
  statusOFS << "Alltoallv communication finished." << std::endl;
#endif
  // Organize the received data into the local columns owned by this processor.
  Int firstCol = mpirank * numColFirst;
  Int numColLocal;
  if( mpirank == mpisize-1 )
    numColLocal = this->NumCol() - numColFirst * (mpisize-1);
  else
    numColLocal = numColFirst;

  std::vector<std::vector<Int> > rows( numColLocal );
  std::vector<std::vector<T> > vals( numColLocal );

  for( Int ip = 0; ip < mpisize; ip++ ){
    Int* rowRecvCur = &rowRecv[displsRecv[ip]];
    Int* colRecvCur = &colRecv[displsRecv[ip]];
    T* valRecvCur = &valRecv[displsRecv[ip]];
    for( Int i = 0; i < sizeRecv[ip]; i++ ){
      rows[colRecvCur[i]-firstCol].push_back( rowRecvCur[i] );
      vals[colRecvCur[i]-firstCol].push_back( valRecvCur[i] );
    }
  }

  // Sort the row indices within each local column through an index permutation.
  std::vector<std::vector<Int> > sortIndex( numColLocal );
  for( Int j = 0; j < numColLocal; j++ ){
    sortIndex[j].resize( rows[j].size() );
    for( Int i = 0; i < sortIndex[j].size(); i++ )
      sortIndex[j][i] = i;
    std::sort( sortIndex[j].begin(), sortIndex[j].end(),
        IndexComp<std::vector<Int>& > ( rows[j] ) );
  }
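// Illustrative sketch (not from the library): sorting by an index permutation,
// as done above with the IndexComp helper.  A C++11 lambda achieves the same:
#if 0
#include <algorithm>
#include <vector>

void sort_by_key( const std::vector<int>& keys, std::vector<int>& index )
{
  // index is a permutation of 0..keys.size()-1; after the call,
  // keys[index[0]] <= keys[index[1]] <= ...
  std::sort( index.begin(), index.end(),
      [&keys]( int a, int b ){ return keys[a] < keys[b]; } );
}
#endif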
  // Form A.colptrLocal from the number of nonzeros collected in each local column.
  A.colptrLocal.Resize( numColLocal + 1 );
  A.colptrLocal(0) = 1;
  for( Int j = 0; j < numColLocal; j++ ){
    A.colptrLocal(j+1) = A.colptrLocal(j) + rows[j].size();
  }
  A.nnzLocal = A.colptrLocal(numColLocal) - 1;
  A.size = this->NumCol();
  A.rowindLocal.Resize( A.nnzLocal );
  A.nzvalLocal.Resize( A.nnzLocal );
  A.comm = grid_->comm;

#if ( _DEBUGlevel_ >= 1 )
  statusOFS << "nnzLocal = " << A.nnzLocal << std::endl;
  statusOFS << "nnz = " << A.Nnz() << std::endl;
#endif

  // Fill the row indices (1-based) and values, sorted within each column.
  Int* rowPtr = A.rowindLocal.Data();
  T* nzvalPtr = A.nzvalLocal.Data();
  for( Int j = 0; j < numColLocal; j++ ){
    std::vector<Int>& rowsCur = rows[j];
    std::vector<Int>& sortIndexCur = sortIndex[j];
    std::vector<T>& valsCur = vals[j];
    for( Int i = 0; i < rows[j].size(); i++ ){
      *(rowPtr++) = rowsCur[sortIndexCur[i]] + 1;
      *(nzvalPtr++) = valsCur[sortIndexCur[i]];
    }
  }

#if ( _DEBUGlevel_ >= 1 )
  statusOFS << "A.colptrLocal[end] = " << A.colptrLocal(numColLocal) << std::endl;
  statusOFS << "A.rowindLocal.size() = " << A.rowindLocal.m() << std::endl;
#endif
}
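// Illustrative usage sketch (hypothetical object name PMloc): converting the
// selected-inverse PMatrix back to a distributed CSC matrix after SelInv.
#if 0
DistSparseMatrix<Complex> Ainv;
PMloc.PMatrixToDistSparseMatrix( Ainv );
// Ainv holds, in CSC format and in the original ordering, exactly the entries
// stored in the L and U blocks of the PMatrix.
#endif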
template< typename T >
void PMatrix<T>::PMatrixToDistSparseMatrix ( const DistSparseMatrix<T>& A, DistSparseMatrix<T>& B )
{
  PushCallStack( "PMatrix::PMatrixToDistSparseMatrix" );
#if ( _DEBUGlevel_ >= 1 )
  statusOFS << std::endl << "Converting PMatrix to DistSparseMatrix (2nd format)." << std::endl;
#endif
  Int mpirank = grid_->mpirank;
  Int mpisize = grid_->mpisize;

  std::vector<Int> rowSend( mpisize );
  std::vector<Int> colSend( mpisize );
  std::vector<T> valSend( mpisize );
  std::vector<Int> sizeSend( mpisize, 0 );
  std::vector<Int> displsSend( mpisize, 0 );

  std::vector<Int> rowRecv( mpisize );
  std::vector<Int> colRecv( mpisize );
  std::vector<T> valRecv( mpisize );
  std::vector<Int> sizeRecv( mpisize, 0 );
  std::vector<Int> displsRecv( mpisize, 0 );

  Int numSuper = this->NumSuper();
  const IntNumVec& permInv = super_->permInv;

  Int numColFirst = this->NumCol() / mpisize;
  // Count how many nonzeros this processor sends to each destination processor.
  for( Int ksup = 0; ksup < numSuper; ksup++ ){
    // L blocks
    if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
      std::vector<LBlock<T> >& Lcol = this->L( LBj( ksup, grid_ ) );
      for( Int ib = 0; ib < Lcol.size(); ib++ ){
        for( Int j = 0; j < Lcol[ib].numCol; j++ ){
          Int jcol = permInv( j + FirstBlockCol( ksup, super_ ) );
          Int dest = std::min( jcol / numColFirst, mpisize - 1 );
          sizeSend[dest] += Lcol[ib].numRow;
        }
      }
    }

    // U blocks
    if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
      std::vector<UBlock<T> >& Urow = this->U( LBi( ksup, grid_ ) );
      for( Int jb = 0; jb < Urow.size(); jb++ ){
        IntNumVec& cols = Urow[jb].cols;
        for( Int j = 0; j < cols.m(); j++ ){
          Int jcol = permInv( cols(j) );
          Int dest = std::min( jcol / numColFirst, mpisize - 1 );
          sizeSend[dest] += Urow[jb].numRow;
        }
      }
    }
  }

  // Exchange the counts.
  MPI_Alltoall(
      &sizeSend[0], 1, MPI_INT,
      &sizeRecv[0], 1, MPI_INT, grid_->comm );
  // Displacements as an exclusive prefix sum of the counts.
  for( Int ip = 0; ip < mpisize; ip++ ){
    if( ip == 0 ){
      displsSend[ip] = 0;
      displsRecv[ip] = 0;
    }
    else{
      displsSend[ip] = displsSend[ip-1] + sizeSend[ip-1];
      displsRecv[ip] = displsRecv[ip-1] + sizeRecv[ip-1];
    }
  }
  Int sizeSendTotal = displsSend[mpisize-1] + sizeSend[mpisize-1];
  Int sizeRecvTotal = displsRecv[mpisize-1] + sizeRecv[mpisize-1];

  rowSend.resize( sizeSendTotal );
  colSend.resize( sizeSendTotal );
  valSend.resize( sizeSendTotal );

  rowRecv.resize( sizeRecvTotal );
  colRecv.resize( sizeRecvTotal );
  valRecv.resize( sizeRecvTotal );

#if ( _DEBUGlevel_ >= 1 )
  statusOFS << "displsSend = " << displsSend << std::endl;
  statusOFS << "displsRecv = " << displsRecv << std::endl;
#endif
  std::vector<Int> cntSize( mpisize, 0 );

  // Fill the send buffers with (row, col, val) triplets in the permuted ordering.
  for( Int ksup = 0; ksup < numSuper; ksup++ ){
    // L blocks
    if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
      std::vector<LBlock<T> >& Lcol = this->L( LBj( ksup, grid_ ) );
      for( Int ib = 0; ib < Lcol.size(); ib++ ){
        IntNumVec& rows = Lcol[ib].rows;
        NumMat<T>& nzval = Lcol[ib].nzval;
        for( Int j = 0; j < Lcol[ib].numCol; j++ ){
          Int jcol = permInv( j + FirstBlockCol( ksup, super_ ) );
          Int dest = std::min( jcol / numColFirst, mpisize - 1 );
          for( Int i = 0; i < rows.m(); i++ ){
            rowSend[displsSend[dest] + cntSize[dest]] = permInv( rows(i) );
            colSend[displsSend[dest] + cntSize[dest]] = jcol;
            valSend[displsSend[dest] + cntSize[dest]] = nzval( i, j );
            cntSize[dest]++;
          }
        }
      }
    }

    // U blocks
    if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
      std::vector<UBlock<T> >& Urow = this->U( LBi( ksup, grid_ ) );
      for( Int jb = 0; jb < Urow.size(); jb++ ){
        IntNumVec& cols = Urow[jb].cols;
        NumMat<T>& nzval = Urow[jb].nzval;
        for( Int j = 0; j < cols.m(); j++ ){
          Int jcol = permInv( cols(j) );
          Int dest = std::min( jcol / numColFirst, mpisize - 1 );
          for( Int i = 0; i < Urow[jb].numRow; i++ ){
            rowSend[displsSend[dest] + cntSize[dest]] =
              permInv( i + FirstBlockCol( ksup, super_ ) );
            colSend[displsSend[dest] + cntSize[dest]] = jcol;
            valSend[displsSend[dest] + cntSize[dest]] = nzval( i, j );
            cntSize[dest]++;
          }
        }
      }
    }
  }

  // Consistency check between the counting pass and the filling pass.
  for( Int ip = 0; ip < mpisize; ip++ ){
    if( cntSize[ip] != sizeSend[ip] )
      throw std::runtime_error( "Sizes of the sending information do not match." );
  }
  // Redistribute the (row, col, val) triplets with three Alltoallv exchanges.
  mpi::Alltoallv(
      &rowSend[0], &sizeSend[0], &displsSend[0],
      &rowRecv[0], &sizeRecv[0], &displsRecv[0], grid_->comm );
  mpi::Alltoallv(
      &colSend[0], &sizeSend[0], &displsSend[0],
      &colRecv[0], &sizeRecv[0], &displsRecv[0], grid_->comm );
  mpi::Alltoallv(
      &valSend[0], &sizeSend[0], &displsSend[0],
      &valRecv[0], &sizeRecv[0], &displsRecv[0], grid_->comm );

#if ( _DEBUGlevel_ >= 1 )
  statusOFS << "Alltoallv communication finished." << std::endl;
#endif
  // Organize the received data into the local columns owned by this processor.
  Int firstCol = mpirank * numColFirst;
  Int numColLocal;
  if( mpirank == mpisize-1 )
    numColLocal = this->NumCol() - numColFirst * (mpisize-1);
  else
    numColLocal = numColFirst;

  std::vector<std::vector<Int> > rows( numColLocal );
  std::vector<std::vector<T> > vals( numColLocal );

  for( Int ip = 0; ip < mpisize; ip++ ){
    Int* rowRecvCur = &rowRecv[displsRecv[ip]];
    Int* colRecvCur = &colRecv[displsRecv[ip]];
    T* valRecvCur = &valRecv[displsRecv[ip]];
    for( Int i = 0; i < sizeRecv[ip]; i++ ){
      rows[colRecvCur[i]-firstCol].push_back( rowRecvCur[i] );
      vals[colRecvCur[i]-firstCol].push_back( valRecvCur[i] );
    }
  }

  // Sort the row indices within each local column through an index permutation.
  std::vector<std::vector<Int> > sortIndex( numColLocal );
  for( Int j = 0; j < numColLocal; j++ ){
    sortIndex[j].resize( rows[j].size() );
    for( Int i = 0; i < sortIndex[j].size(); i++ )
      sortIndex[j][i] = i;
    std::sort( sortIndex[j].begin(), sortIndex[j].end(),
        IndexComp<std::vector<Int>& > ( rows[j] ) );
  }
  // B inherits the sparsity pattern of A; only the numerical values are filled in below.
  if( A.size != this->NumCol() ){
    throw std::runtime_error(
        "The DistSparseMatrix providing the pattern has a different size from PMatrix." );
  }
  if( A.colptrLocal.m() - 1 != numColLocal ){
    throw std::runtime_error(
        "The DistSparseMatrix providing the pattern has a different number of local columns from PMatrix." );
  }

  B.size = A.size;
  B.nnz = A.nnz;
  B.nnzLocal = A.nnzLocal;
  B.colptrLocal = A.colptrLocal;
  B.rowindLocal = A.rowindLocal;
  B.nzvalLocal.Resize( B.nnzLocal );
  SetValue( B.nzvalLocal, ZERO<T>() );
  B.comm = grid_->comm;

  Int* rowPtr = B.rowindLocal.Data();
  T* nzvalPtr = B.nzvalLocal.Data();

  for( Int j = 0; j < numColLocal; j++ ){
    std::vector<Int>& rowsCur = rows[j];
    std::vector<Int>& sortIndexCur = sortIndex[j];
    std::vector<T>& valsCur = vals[j];
    // 1-based row indices, sorted, to be matched against the pattern of B.
    std::vector<Int> rowsCurSorted( rowsCur.size() );
    for( Int i = 0; i < rowsCurSorted.size(); i++ ){
      rowsCurSorted[i] = rowsCur[sortIndexCur[i]] + 1;
    }

    // Match each row index of the pattern against the rows received for this column.
    std::vector<Int>::iterator it;
    for( Int i = B.colptrLocal(j) - 1; i < B.colptrLocal(j+1) - 1; i++ ){
      it = std::lower_bound( rowsCurSorted.begin(), rowsCurSorted.end(), *(rowPtr++) );
      if( it == rowsCurSorted.end() ){
        // The entry is not stored in the PMatrix: set it to zero.
        *(nzvalPtr++) = ZERO<T>();
      }
      else{
        *(nzvalPtr++) = valsCur[ sortIndexCur[it-rowsCurSorted.begin()] ];
      }
    }
  }

#if ( _DEBUGlevel_ >= 1 )
  statusOFS << "B.colptrLocal[end] = " << B.colptrLocal(numColLocal) << std::endl;
  statusOFS << "B.rowindLocal.size() = " << B.rowindLocal.m() << std::endl;
#endif
}
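// Illustrative sketch (not from the library): looking up a value for a 1-based
// row index in a column whose received rows have been sorted through an index
// permutation, mirroring the std::lower_bound matching above.
#if 0
#include <algorithm>
#include <vector>

double value_or_zero( int row1Based, const std::vector<int>& rowsSorted,
                      const std::vector<int>& sortIndex, const std::vector<double>& vals )
{
  std::vector<int>::const_iterator it =
      std::lower_bound( rowsSorted.begin(), rowsSorted.end(), row1Based );
  if( it == rowsSorted.end() || *it != row1Based )
    return 0.0;                                   // entry absent from the PMatrix
  return vals[ sortIndex[ it - rowsSorted.begin() ] ];
}
#endif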
template< typename T >
void PMatrix<T>::PMatrixToDistSparseMatrix2 ( const DistSparseMatrix<T>& A, DistSparseMatrix<T>& B )
{
  PushCallStack( "PMatrix::PMatrixToDistSparseMatrix2" );
#if ( _DEBUGlevel_ >= 1 )
  statusOFS << std::endl << "Converting PMatrix to DistSparseMatrix (2nd format)." << std::endl;
#endif
  Int mpirank = grid_->mpirank;
  Int mpisize = grid_->mpisize;

  std::vector<Int> rowSend( mpisize );
  std::vector<Int> colSend( mpisize );
  std::vector<T> valSend( mpisize );
  std::vector<Int> sizeSend( mpisize, 0 );
  std::vector<Int> displsSend( mpisize, 0 );

  std::vector<Int> rowRecv( mpisize );
  std::vector<Int> colRecv( mpisize );
  std::vector<T> valRecv( mpisize );
  std::vector<Int> sizeRecv( mpisize, 0 );
  std::vector<Int> displsRecv( mpisize, 0 );

  const IntNumVec& perm = super_->perm;
  const IntNumVec& permInv = super_->permInv;

  // This routine works directly on the pattern of A, in the original ordering.
  Int numColFirst = this->NumCol() / mpisize;
  Int firstCol = mpirank * numColFirst;
  Int numColLocal;
  if( mpirank == mpisize-1 )
    numColLocal = this->NumCol() - numColFirst * (mpisize-1);
  else
    numColLocal = numColFirst;
  // First pass over the pattern of A: count how many entries are requested from
  // each processor, determined by the 2-D process grid owner of the block that
  // stores the permuted entry.
  const Int* rowPtr = A.rowindLocal.Data();
  const Int* colPtr = A.colptrLocal.Data();

  for( Int j = 0; j < numColLocal; j++ ){
    Int col = perm( firstCol + j );
    Int blockColIdx = BlockIdx( col, super_ );
    Int procCol = PCOL( blockColIdx, grid_ );
    for( Int i = colPtr[j] - 1; i < colPtr[j+1] - 1; i++ ){
      Int row = perm( *(rowPtr++) - 1 );
      Int blockRowIdx = BlockIdx( row, super_ );
      Int procRow = PROW( blockRowIdx, grid_ );
      Int dest = PNUM( procRow, procCol, grid_ );
#if ( _DEBUGlevel_ >= 1 )
      statusOFS << "BlockIdx = " << blockRowIdx << ", " << blockColIdx << std::endl;
      statusOFS << procRow << ", " << procCol << ", " << dest << std::endl;
#endif
      sizeSend[dest]++;
    }
  }

  // Exchange the counts.
  MPI_Alltoall(
      &sizeSend[0], 1, MPI_INT,
      &sizeRecv[0], 1, MPI_INT, grid_->comm );
#if ( _DEBUGlevel_ >= 1 )
  statusOFS << std::endl << "sizeSend: " << sizeSend << std::endl;
  statusOFS << std::endl << "sizeRecv: " << sizeRecv << std::endl;
#endif
  // Displacements as an exclusive prefix sum of the counts.
  for( Int ip = 0; ip < mpisize; ip++ ){
    if( ip == 0 ){
      displsSend[ip] = 0;
      displsRecv[ip] = 0;
    }
    else{
      displsSend[ip] = displsSend[ip-1] + sizeSend[ip-1];
      displsRecv[ip] = displsRecv[ip-1] + sizeRecv[ip-1];
    }
  }

  Int sizeSendTotal = displsSend[mpisize-1] + sizeSend[mpisize-1];
  Int sizeRecvTotal = displsRecv[mpisize-1] + sizeRecv[mpisize-1];

  rowSend.resize( sizeSendTotal );
  colSend.resize( sizeSendTotal );
  valSend.resize( sizeSendTotal );

  rowRecv.resize( sizeRecvTotal );
  colRecv.resize( sizeRecvTotal );
  valRecv.resize( sizeRecvTotal );

#if ( _DEBUGlevel_ >= 1 )
  statusOFS << "displsSend = " << displsSend << std::endl;
  statusOFS << "displsRecv = " << displsRecv << std::endl;
#endif
  std::vector<Int> cntSize( mpisize, 0 );

  // Second pass: record the permuted (row, col) pairs to request from each owner.
  rowPtr = A.rowindLocal.Data();
  colPtr = A.colptrLocal.Data();
  for( Int j = 0; j < numColLocal; j++ ){
    Int col = perm( firstCol + j );
    Int blockColIdx = BlockIdx( col, super_ );
    Int procCol = PCOL( blockColIdx, grid_ );
    for( Int i = colPtr[j] - 1; i < colPtr[j+1] - 1; i++ ){
      Int row = perm( *(rowPtr++) - 1 );
      Int blockRowIdx = BlockIdx( row, super_ );
      Int procRow = PROW( blockRowIdx, grid_ );
      Int dest = PNUM( procRow, procCol, grid_ );
      rowSend[displsSend[dest] + cntSize[dest]] = row;
      colSend[displsSend[dest] + cntSize[dest]] = col;
      cntSize[dest]++;
    }
  }

  for( Int ip = 0; ip < mpisize; ip++ ){
    if( cntSize[ip] != sizeSend[ip] )
      throw std::runtime_error( "Sizes of the sending information do not match." );
  }
  // Exchange the requested index pairs.
  mpi::Alltoallv(
      &rowSend[0], &sizeSend[0], &displsSend[0],
      &rowRecv[0], &sizeRecv[0], &displsRecv[0], grid_->comm );
  mpi::Alltoallv(
      &colSend[0], &sizeSend[0], &displsSend[0],
      &colRecv[0], &sizeRecv[0], &displsRecv[0], grid_->comm );

#if ( _DEBUGlevel_ >= 1 )
  statusOFS << "Alltoallv communication of nonzero indices finished." << std::endl;
#endif

#if ( _DEBUGlevel_ >= 1 )
  for( Int ip = 0; ip < mpisize; ip++ ){
    statusOFS << "rowSend[" << ip << "] = " << rowSend[ip] << std::endl;
    statusOFS << "rowRecv[" << ip << "] = " << rowRecv[ip] << std::endl;
    statusOFS << "colSend[" << ip << "] = " << colSend[ip] << std::endl;
    statusOFS << "colRecv[" << ip << "] = " << colRecv[ip] << std::endl;
  }
#endif
  // Look up each requested (row, col) pair, in the permuted ordering, among the
  // locally stored blocks and record the corresponding value.
  for( Int g = 0; g < sizeRecvTotal; g++ ){
    Int row = rowRecv[g];
    Int col = colRecv[g];

    Int blockRowIdx = BlockIdx( row, super_ );
    Int blockColIdx = BlockIdx( col, super_ );

    bool isFound = false;

    if( blockColIdx <= blockRowIdx ){
      // Lower triangular part: search the L blocks of the block column.
      std::vector<LBlock<T> >& Lcol = this->L( LBj( blockColIdx, grid_ ) );

      for( Int ib = 0; ib < Lcol.size(); ib++ ){
#if ( _DEBUGlevel_ >= 1 )
        statusOFS << "blockRowIdx = " << blockRowIdx
          << ", Lcol[ib].blockIdx = " << Lcol[ib].blockIdx
          << ", blockColIdx = " << blockColIdx << std::endl;
#endif
        if( Lcol[ib].blockIdx == blockRowIdx ){
          IntNumVec& rows = Lcol[ib].rows;
          for( int iloc = 0; iloc < Lcol[ib].numRow; iloc++ ){
            if( rows[iloc] == row ){
              Int jloc = col - FirstBlockCol( blockColIdx, super_ );
              valRecv[g] = Lcol[ib].nzval( iloc, jloc );
              isFound = true;
              break;
            }
          }
        }
        if( isFound == true ) break;
      }
    }
    else{
      // Upper triangular part: search the U blocks of the block row.
      std::vector<UBlock<T> >& Urow = this->U( LBi( blockRowIdx, grid_ ) );

      for( Int jb = 0; jb < Urow.size(); jb++ ){
        if( Urow[jb].blockIdx == blockColIdx ){
          IntNumVec& cols = Urow[jb].cols;
          for( int jloc = 0; jloc < Urow[jb].numCol; jloc++ ){
            if( cols[jloc] == col ){
              Int iloc = row - FirstBlockCol( blockRowIdx, super_ );
              valRecv[g] = Urow[jb].nzval( iloc, jloc );
              isFound = true;
              break;
            }
          }
        }
        if( isFound == true ) break;
      }
    }

    if( isFound == false ){
      statusOFS << "In the permutated order, (" << row << ", " << col
        << ") is not found in PMatrix." << std::endl;
      valRecv[g] = ZERO<T>();
    }
  }
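// Illustrative sketch (not from the library): locating a block with a given
// supernodal index in a block list, i.e. the linear search performed above,
// written with std::find_if and a C++11 lambda.
#if 0
#include <algorithm>
#include <vector>

template <typename Block>
const Block* find_block( const std::vector<Block>& blocks, int blockIdx )
{
  typename std::vector<Block>::const_iterator it = std::find_if(
      blocks.begin(), blocks.end(),
      [blockIdx]( const Block& b ){ return b.blockIdx == blockIdx; } );
  return it == blocks.end() ? nullptr : &*it;
}
#endif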
  // Send the looked-up values back to the processors that requested them.
  mpi::Alltoallv(
      &valRecv[0], &sizeRecv[0], &displsRecv[0],
      &valSend[0], &sizeSend[0], &displsSend[0], grid_->comm );

#if ( _DEBUGlevel_ >= 1 )
  statusOFS << "Alltoallv communication of nonzero values finished." << std::endl;
#endif
  // Assemble B on the pattern of A, filling in the values received back.
  B.size = A.size;
  B.nnz = A.nnz;
  B.nnzLocal = A.nnzLocal;
  B.colptrLocal = A.colptrLocal;
  B.rowindLocal = A.rowindLocal;
  B.nzvalLocal.Resize( B.nnzLocal );
  B.comm = grid_->comm;

  T* valPtr = B.nzvalLocal.Data();
  for( Int i = 0; i < mpisize; i++ )
    cntSize[i] = 0;

  // Traverse the pattern of A again, in the same order as before, so that the
  // returned values are consumed in exactly the order they were requested.
  rowPtr = A.rowindLocal.Data();
  colPtr = A.colptrLocal.Data();
  for( Int j = 0; j < numColLocal; j++ ){
    Int col = perm( firstCol + j );
    Int blockColIdx = BlockIdx( col, super_ );
    Int procCol = PCOL( blockColIdx, grid_ );
    for( Int i = colPtr[j] - 1; i < colPtr[j+1] - 1; i++ ){
      Int row = perm( *(rowPtr++) - 1 );
      Int blockRowIdx = BlockIdx( row, super_ );
      Int procRow = PROW( blockRowIdx, grid_ );
      Int dest = PNUM( procRow, procCol, grid_ );
      *(valPtr++) = valSend[displsSend[dest] + cntSize[dest]];
      cntSize[dest]++;
    }
  }

  for( Int ip = 0; ip < mpisize; ip++ ){
    if( cntSize[ip] != sizeSend[ip] )
      throw std::runtime_error( "Sizes of the sending information do not match." );
  }
}
template< typename T >
Int PMatrix<T>::NnzLocal ( )
{
  PushCallStack( "PMatrix::NnzLocal" );

  Int numSuper = this->NumSuper();
  Int nnzLocal = 0;
  for( Int ksup = 0; ksup < numSuper; ksup++ ){
    if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
      std::vector<LBlock<T> >& Lcol = this->L( LBj( ksup, grid_ ) );
      for( Int ib = 0; ib < Lcol.size(); ib++ ){
        nnzLocal += Lcol[ib].numRow * Lcol[ib].numCol;
      }
    }
    if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
      std::vector<UBlock<T> >& Urow = this->U( LBi( ksup, grid_ ) );
      for( Int jb = 0; jb < Urow.size(); jb++ ){
        nnzLocal += Urow[jb].numRow * Urow[jb].numCol;
      }
    }
  }

  return nnzLocal;
}
template< typename T >
LongInt PMatrix<T>::Nnz ( )
{
  PushCallStack( "PMatrix::Nnz" );

  LongInt nnzLocal = LongInt( this->NnzLocal() );
  LongInt nnz = 0;

  MPI_Allreduce( &nnzLocal, &nnz, 1, MPI_LONG_LONG, MPI_SUM, grid_->comm );

  return nnz;
}
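// Illustrative sketch (not from the library): summing per-process 64-bit counts
// with MPI_Allreduce, the pattern used by Nnz() above.  MPI_LONG_LONG matching a
// signed 64-bit integer is an assumption about the target platform.
#if 0
#include <mpi.h>

long long global_count( long long localCount, MPI_Comm comm )
{
  long long total = 0;
  MPI_Allreduce( &localCount, &total, 1, MPI_LONG_LONG, MPI_SUM, comm );
  return total;
}
#endif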
template< >
inline void PMatrix<Real>::GetNegativeInertia ( Real& inertia )
{
  PushCallStack( "PMatrix::GetNegativeInertia" );

  Int numSuper = this->NumSuper();

  Real inertiaLocal = 0.0;
  inertia = 0.0;

  for( Int ksup = 0; ksup < numSuper; ksup++ ){
    // Only the processor owning the diagonal block of ksup contributes.
    if( MYROW( grid_ ) == PROW( ksup, grid_ ) &&
        MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
      LBlock<Real>& LB = this->L( LBj( ksup, grid_ ) )[0];
      for( Int i = 0; i < LB.numRow; i++ ){
        if( LB.nzval(i, i) < 0 )
          inertiaLocal++;
      }
    }
  }

  mpi::Allreduce( &inertiaLocal, &inertia, 1, MPI_SUM, grid_->comm );
}
template< >
inline void PMatrix<Complex>::GetNegativeInertia ( Real& inertia )
{
  PushCallStack( "PMatrix::GetNegativeInertia" );

  Int numSuper = this->NumSuper();

  Real inertiaLocal = 0.0;
  inertia = 0.0;

  for( Int ksup = 0; ksup < numSuper; ksup++ ){
    if( MYROW( grid_ ) == PROW( ksup, grid_ ) &&
        MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
      LBlock<Complex>& LB = this->L( LBj( ksup, grid_ ) )[0];
      for( Int i = 0; i < LB.numRow; i++ ){
        if( LB.nzval(i, i).real() < 0 )
          inertiaLocal++;
      }
    }
  }

  mpi::Allreduce( &inertiaLocal, &inertia, 1, MPI_SUM, grid_->comm );
}
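// Illustrative usage sketch (assumed usage, hypothetical object name PMloc):
// the negative inertia counts the negative diagonal entries of the factored
// matrix, which by Sylvester's law of inertia equals the number of eigenvalues
// of the (shifted) matrix below zero — the quantity used in inertia-counting
// bisection.
#if 0
Real inertia = 0.0;
PMloc.GetNegativeInertia( inertia );
// inertia == number of eigenvalues below the shift used in the factorization.
#endif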
5204 #endif //_PEXSI_PSELINV_IMPL_HPP_