46 #ifndef _PEXSI_PSELINV_UNSYM_IMPL_HPP_
47 #define _PEXSI_PSELINV_UNSYM_IMPL_HPP_
58 PMatrixUnsym<T>::PMatrixUnsym (
60 const SuperNodeType* s,
61 const PSelInvOptions * o,
62 const SuperLUOptions * oLU
66 PushCallStack(
"PMatrixUnsym::PMatrixUnsym");
69 this->Setup( g, s, o, oLU );
79 void PMatrixUnsym<T>::Setup(
81 const SuperNodeType* s,
82 const PSelInvOptions * o,
83 const SuperLUOptions * oLU
86 PushCallStack(
"PMatrixUnsym::Setup");
89 PMatrix<T>::Setup(g,s,o,oLU);
95 Lrow_.resize( this->NumLocalBlockRow() );
96 Ucol_.resize( this->NumLocalBlockCol() );
100 LrowSize_.resize( this->NumLocalBlockRow() );
101 UcolSize_.resize( this->NumLocalBlockCol() );
122 TIMER_START(Compute_Sinv_LT_Lookup_Indexes);
128 TIMER_START(Build_colptr_rowptr);
130 #if ( _DEBUGlevel_ >= 2 )
131 statusOFS<<
"UrowRecv blockIdx: "<<std::endl;
132 for( Int jb = 0; jb < UrowRecv.size(); jb++ ){
133 statusOFS<<UrowRecv[jb].blockIdx<<
" ";
135 statusOFS<<std::endl;
137 statusOFS<<
"UcolRecv blockIdx: "<<std::endl;
138 for( Int jb = 0; jb < UcolRecv.size(); jb++ ){
139 statusOFS<<UcolRecv[jb].blockIdx<<
" ";
141 statusOFS<<std::endl;
143 statusOFS<<
"LcolRecv blockIdx: "<<std::endl;
144 for( Int jb = 0; jb < LcolRecv.size(); jb++ ){
145 statusOFS<<LcolRecv[jb].blockIdx<<
" ";
147 statusOFS<<std::endl;
150 statusOFS<<
"LrowRecv blockIdx: "<<std::endl;
151 for( Int jb = 0; jb < LrowRecv.size(); jb++ ){
152 statusOFS<<LrowRecv[jb].blockIdx<<
" ";
154 statusOFS<<std::endl;
157 TIMER_START(Sort_LrowRecv_UcolRecv);
159 std::sort(LrowRecv.begin(),LrowRecv.end(),LBlockComparator<T>);
160 std::sort(UcolRecv.begin(),UcolRecv.end(),UBlockComparator<T>);
161 TIMER_STOP(Sort_LrowRecv_UcolRecv);
166 std::vector<Int> rowPtrL(LcolRecv.size() + 1);
168 for( Int ib = 0; ib < LcolRecv.size(); ib++ ){
170 rowPtrL[ib+1] = rowPtrL[ib] + LB.
numRow;
177 std::vector<Int> colPtrL(LrowRecv.size() + 1);
178 std::vector<Int> colPtrU(UcolRecv.size() + 1,-1);
180 for( Int jb = 0; jb < LrowRecv.size(); jb++ ){
182 colPtrL[jb+1] = colPtrL[jb] + LB.
numCol;
186 for( Int jb = 0; jb < UcolRecv.size(); jb++ ){
188 colPtrU[jb+1] = colPtrU[jb] + UB.
numCol;
191 #if ( _DEBUGlevel_ >= 1 )
192 statusOFS<<rowPtrL<<std::endl;
193 statusOFS<<colPtrL<<std::endl;
194 statusOFS<<colPtrU<<std::endl;
198 Int numRowAinvBuf = *colPtrU.rbegin();
200 Int numColAinvBuf = *colPtrL.rbegin();
201 TIMER_STOP(Build_colptr_rowptr);
203 TIMER_START(Allocate_lookup);
205 AinvBuf.Resize( numRowAinvBuf, numColAinvBuf );
206 LBuf.Resize(
SuperSize( snode.Index, this->super_ ), numColAinvBuf );
207 UBuf.Resize(
SuperSize( snode.Index, this->super_ ), numRowAinvBuf );
208 TIMER_STOP(Allocate_lookup);
212 TIMER_START(Fill_LBuf);
214 for( Int jb = 0; jb < LrowRecv.size(); jb++ ){
219 statusOFS<<
"The size of LB is not right. Something is seriously wrong."<<std::endl;
222 throw std::logic_error(
223 "The size of LB is not right. Something is seriously wrong." );
226 LB.
numRow, LBuf.VecData( colPtrL[jb] ), LBuf.m() );
229 TIMER_STOP(Fill_LBuf);
231 TIMER_START(Fill_UBuf);
234 for( Int jb = 0; jb < UcolRecv.size(); jb++ ){
240 statusOFS<<
"The size of UB is not right. Something is seriously wrong."<<std::endl;
243 throw std::logic_error(
244 "The size of UB is not right. Something is seriously wrong." );
249 UB.
numRow, UBuf.VecData( colPtrU[jb] ), UBuf.m() );
252 TIMER_STOP(Fill_UBuf);
256 TIMER_START(JB_Loop);
258 for( Int jb = 0; jb < LrowRecv.size(); jb++ ){
265 for( Int ib = 0; ib < LcolRecv.size(); ib++ ){
269 T* nzvalAinv = &AinvBuf( rowPtrL[ib], colPtrL[jb] );
270 Int ldAinv = numRowAinvBuf;
275 std::vector<Int> relCols( LrowB.
numCol );
276 for( Int j = 0; j < LrowB.
numCol; j++ ){
277 relCols[j] = LrowB.
rows[j] - SinvColsSta;
280 std::vector<LBlock<T> >& LcolSinv = this->L(
LBj(jsup, this->grid_ ));
281 bool isBlockFound =
false;
282 TIMER_START(PARSING_ROW_BLOCKIDX);
283 for( Int ibSinv = 0; ibSinv < LcolSinv.size(); ibSinv++ ){
285 if( LcolSinv[ibSinv].blockIdx == isup ){
289 std::vector<Int> relRows( LB.
numRow );
290 Int* rowsLBPtr = LB.
rows.Data();
291 Int* rowsSinvBPtr = SinvB.
rows.Data();
293 TIMER_START(STDFIND_ROW);
294 Int * pos =&rowsSinvBPtr[0];
295 Int * last =&rowsSinvBPtr[SinvB.
numRow];
296 for( Int i = 0; i < LB.
numRow; i++ ){
297 pos = std::upper_bound(pos, last, rowsLBPtr[i])-1;
299 relRows[i] = (Int)(pos - rowsSinvBPtr);
302 std::ostringstream msg;
303 msg <<
"Row " << rowsLBPtr[i] <<
304 " in LB cannot find the corresponding row in SinvB"
306 <<
"LB.rows = " << LB.
rows << std::endl
307 <<
"SinvB.rows = " << SinvB.
rows << std::endl;
311 throw std::runtime_error( msg.str().c_str() );
314 TIMER_STOP(STDFIND_ROW);
316 TIMER_START(Copy_Sinv_to_Ainv);
318 T* nzvalSinv = SinvB.
nzval.Data();
319 Int ldSinv = SinvB.
numRow;
320 for( Int j = 0; j < LrowB.
numCol; j++ ){
321 for( Int i = 0; i < LB.
numRow; i++ ){
324 nzvalAinv[i + j*ldAinv] =
325 nzvalSinv[relRows[i]+relCols[j]*ldSinv];
328 TIMER_STOP(Copy_Sinv_to_Ainv);
334 TIMER_STOP(PARSING_ROW_BLOCKIDX);
335 if( isBlockFound ==
false ){
336 std::ostringstream msg;
337 msg <<
"["<<snode.Index<<
"] "<<
"Block(" << isup <<
", " << jsup
338 <<
") did not find a matching block in Sinv." << std::endl;
340 statusOFS<<msg.str();
350 std::vector<Int> relRows( LB.
numRow );
351 for( Int i = 0; i < LB.
numRow; i++ ){
352 relRows[i] = LB.
rows[i] - SinvRowsSta;
355 std::vector<UBlock<T> >& UrowSinv =
356 this->U(
LBi( isup, this->grid_ ) );
357 bool isBlockFound =
false;
358 TIMER_START(PARSING_COL_BLOCKIDX);
359 for( Int jbSinv = 0; jbSinv < UrowSinv.size(); jbSinv++ ){
361 if( UrowSinv[jbSinv].blockIdx == jsup ){
365 std::vector<Int> relCols( LrowB.
numCol );
366 Int* rowsLrowBPtr = LrowB.
rows.Data();
367 Int* colsSinvBPtr = SinvB.
cols.Data();
369 TIMER_START(STDFIND_COL);
370 Int * pos =&colsSinvBPtr[0];
371 Int * last =&colsSinvBPtr[SinvB.
numCol];
372 for( Int j = 0; j < LrowB.
numCol; j++ ){
374 pos = std::upper_bound(pos, last, rowsLrowBPtr[j])-1;
376 relCols[j] = (Int)(pos - colsSinvBPtr);
379 std::ostringstream msg;
380 msg <<
"Col " << rowsLrowBPtr[j] <<
381 " in UB cannot find the corresponding row in SinvB"
383 <<
"LrowB.rows = " << LrowB.
rows << std::endl
384 <<
"UinvB.cols = " << SinvB.
cols << std::endl;
388 throw std::runtime_error( msg.str().c_str() );
391 TIMER_STOP(STDFIND_COL);
393 TIMER_START(Copy_Sinv_to_Ainv);
395 T* nzvalSinv = SinvB.
nzval.Data();
396 Int ldSinv = SinvB.
numRow;
397 for( Int j = 0; j < LrowB.
numCol; j++ ){
398 for( Int i = 0; i < LB.
numRow; i++ ){
402 nzvalAinv[i + j*ldAinv] =
403 nzvalSinv[relRows[i]+relCols[j]*ldSinv];
406 TIMER_STOP(Copy_Sinv_to_Ainv);
412 TIMER_STOP(PARSING_COL_BLOCKIDX);
413 if( isBlockFound ==
false ){
414 std::ostringstream msg;
415 msg <<
"["<<snode.Index<<
"] "<<
"Block(" << isup <<
", " << jsup
416 <<
") did not find a matching block in Sinv." << std::endl;
417 statusOFS<<msg.str();
429 TIMER_STOP(Compute_Sinv_LT_Lookup_Indexes);
436 std::vector<MPI_Request > arrMpiReqsSendLCD;
437 std::vector<MPI_Request > arrMpiReqsSizeSendLCD;
438 std::vector<MPI_Request > arrMpiReqsRecvLCD;
439 std::vector<MPI_Request > arrMpiReqsSizeRecvLCD;
440 std::vector<std::vector<char> > arrSstrLcolSendCD;
441 std::vector<int > arrSstrLcolSizeSendCD;
442 std::vector<std::vector<char> > arrSstrLrowRecvCD;
443 std::vector<int > arrSstrLrowSizeRecvCD;
446 std::vector<MPI_Request > arrMpiReqsSendUCD;
447 std::vector<MPI_Request > arrMpiReqsSizeSendUCD;
448 std::vector<MPI_Request > arrMpiReqsRecvUCD;
449 std::vector<MPI_Request > arrMpiReqsSizeRecvUCD;
450 std::vector<std::vector<char> > arrSstrUrowSendCD;
451 std::vector<int > arrSstrUrowSizeSendCD;
452 std::vector<std::vector<char> > arrSstrUcolRecvCD;
453 std::vector<int > arrSstrUcolSizeRecvCD;
455 std::vector<Int>sendOffset;
456 std::vector<Int>recvOffset;
458 void resize(Int sendCount, Int recvCount){
460 arrMpiReqsSendLCD.assign(sendCount, MPI_REQUEST_NULL );
461 arrMpiReqsSizeSendLCD.assign(sendCount, MPI_REQUEST_NULL );
462 arrMpiReqsRecvLCD.assign(recvCount, MPI_REQUEST_NULL );
463 arrMpiReqsSizeRecvLCD.assign(recvCount, MPI_REQUEST_NULL );
464 arrSstrLcolSendCD.resize(sendCount);
465 arrSstrLcolSizeSendCD.resize(sendCount);
466 arrSstrLrowRecvCD.resize(recvCount);
467 arrSstrLrowSizeRecvCD.resize(recvCount);
470 arrMpiReqsSendUCD.assign(recvCount, MPI_REQUEST_NULL );
471 arrMpiReqsSizeSendUCD.assign(recvCount, MPI_REQUEST_NULL );
472 arrMpiReqsRecvUCD.assign(sendCount, MPI_REQUEST_NULL );
473 arrMpiReqsSizeRecvUCD.assign(sendCount, MPI_REQUEST_NULL );
474 arrSstrUrowSendCD.resize(recvCount);
475 arrSstrUrowSizeSendCD.resize(recvCount);
476 arrSstrUcolRecvCD.resize(sendCount);
477 arrSstrUcolSizeRecvCD.resize(sendCount);
481 mpi::Waitall(arrMpiReqsSizeSendLCD);
482 mpi::Waitall(arrMpiReqsSendLCD);
483 mpi::Waitall(arrMpiReqsSizeSendUCD);
484 mpi::Waitall(arrMpiReqsSendUCD);
505 std::vector<Int > & arrSuperNodes,
508 TIMER_START(SendRecvSizesCD);
512 buffers.sendOffset.resize(stepSuper);
513 buffers.recvOffset.resize(stepSuper);
516 for (Int supidx=0; supidx<stepSuper; supidx++){
517 Int snode_index = arrSuperNodes[supidx];
518 buffers.sendOffset[supidx]=sendCount;
519 buffers.recvOffset[supidx]=recvCount;
520 sendCount+= this->CountSendToCrossDiagonal(snode_index);
521 recvCount+= this->CountRecvFromCrossDiagonal(snode_index);
524 buffers.resize(sendCount,recvCount);
528 for (Int supidx=0; supidx<stepSuper; supidx++){
529 Int snode_index = arrSuperNodes[supidx];
531 TIMER_START(Send_L_Recv_Size_U_CrossDiag);
533 if(
MYCOL( this->grid_ ) ==
PCOL( snode_index, this->grid_ )
534 && this->isSendToCrossDiagonal_(this->grid_->numProcCol, snode_index ) ){
538 for(Int dstCol = 0; dstCol<this->grid_->numProcCol; dstCol++){
539 if(this->isSendToCrossDiagonal_(dstCol,snode_index) ){
540 Int dest =
PNUM(
PROW(snode_index,this->grid_),dstCol,this->grid_);
542 if(
MYPROC( this->grid_ ) != dest ){
544 MPI_Request & mpiReqSizeSendL =
545 buffers.arrMpiReqsSizeSendLCD[buffers.sendOffset[supidx]+sendIdxL];
546 MPI_Request & mpiReqSendL =
547 buffers.arrMpiReqsSendLCD[buffers.sendOffset[supidx]+sendIdxL];
548 std::vector<char> & sstrLcolSend =
549 buffers.arrSstrLcolSendCD[buffers.sendOffset[supidx]+sendIdxL];
551 buffers.arrSstrLcolSizeSendCD[buffers.sendOffset[supidx]+sendIdxL];
554 std::stringstream sstm;
555 std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
556 std::vector<LBlock<T> >& Lcol = this->L(
LBj(snode_index, this->grid_) );
561 Int startIdx = (
MYROW( this->grid_ ) ==
PROW( snode_index, this->grid_ ) ) ? 1:0;
563 for( Int ib = startIdx; ib < Lcol.size(); ib++ ){
564 if( Lcol[ib].blockIdx > snode_index &&
565 (Lcol[ib].blockIdx % this->grid_->numProcCol) == dstCol ){
570 serialize( (Int)count, sstm, NO_MASK );
572 for( Int ib = startIdx; ib < Lcol.size(); ib++ ){
573 if( Lcol[ib].blockIdx > snode_index &&
574 (Lcol[ib].blockIdx % this->grid_->numProcCol) == dstCol ){
575 #if ( _DEBUGlevel_ >= 1 )
576 statusOFS <<
"["<<snode_index<<
"] SEND contains "<<Lcol[ib].blockIdx
577 <<
" which corresponds to "<<
GBj(ib,this->grid_)
581 serialize( Lcol[ib], sstm, mask );
585 sstrLcolSend.resize( Size(sstm) );
586 sstm.read( &sstrLcolSend[0], sstrLcolSend.size() );
587 sstrSizeL = sstrLcolSend.size();
591 MPI_Isend( &sstrSizeL, 1, MPI_INT, dest,
592 IDX_TO_TAG2(snode_index,supidx,SELINV_TAG_L_SIZE_CD),
593 this->grid_->comm, &mpiReqSizeSendL );
594 MPI_Isend( (
void*)&sstrLcolSend[0], sstrSizeL, MPI_BYTE, dest,
595 IDX_TO_TAG2(snode_index,supidx,SELINV_TAG_L_CONTENT_CD),
596 this->grid_->comm, &mpiReqSendL );
601 buffers.arrSstrUcolSizeRecvCD[buffers.sendOffset[supidx]+recvIdxU];
602 MPI_Request & mpiReqSizeRecvU =
603 buffers.arrMpiReqsSizeRecvUCD[buffers.sendOffset[supidx]+recvIdxU];
605 MPI_Irecv( &sstrSizeU, 1, MPI_INT, dest,
606 IDX_TO_TAG2(snode_index,supidx,SELINV_TAG_U_SIZE_CD),
607 this->grid_->comm, &mpiReqSizeRecvU );
613 TIMER_STOP(Send_L_Recv_Size_U_CrossDiag);
618 for (Int supidx=0; supidx<stepSuper; supidx++){
619 Int snode_index = arrSuperNodes[supidx];
620 TIMER_START(Send_U_Recv_Size_L_CrossDiag);
622 if(
MYROW( this->grid_ ) ==
PROW( snode_index, this->grid_ )
623 && this->isRecvFromCrossDiagonal_(this->grid_->numProcRow, snode_index ) ){
626 for(Int srcRow = 0; srcRow<this->grid_->numProcRow; srcRow++){
627 if(this->isRecvFromCrossDiagonal_(srcRow,snode_index) ){
628 Int src =
PNUM(srcRow,
PCOL(snode_index,this->grid_),this->grid_);
629 if(
MYPROC( this->grid_ ) != src ){
632 buffers.arrSstrLrowSizeRecvCD[buffers.recvOffset[supidx]+recvIdxL];
633 MPI_Request & mpiReqSizeRecvL =
634 buffers.arrMpiReqsSizeRecvLCD[buffers.recvOffset[supidx]+recvIdxL];
636 MPI_Irecv( &sstrSizeL, 1, MPI_INT, src,
637 IDX_TO_TAG2(snode_index,supidx,SELINV_TAG_L_SIZE_CD),
638 this->grid_->comm, &mpiReqSizeRecvL );
643 MPI_Request & mpiReqSizeSendU =
644 buffers.arrMpiReqsSizeSendUCD[buffers.recvOffset[supidx]+sendIdxU];
645 MPI_Request & mpiReqSendU =
646 buffers.arrMpiReqsSendUCD[buffers.recvOffset[supidx]+sendIdxU];
647 std::vector<char> & sstrUrowSend =
648 buffers.arrSstrUrowSendCD[buffers.recvOffset[supidx]+sendIdxU];
650 buffers.arrSstrUrowSizeSendCD[buffers.recvOffset[supidx]+sendIdxU];
653 std::stringstream sstm;
654 std::vector<Int> mask( UBlockMask::TOTAL_NUMBER, 1 );
655 std::vector<UBlock<T> >& Urow = this->U(
LBi(snode_index, this->grid_) );
660 for( Int jb = 0; jb < Urow.size(); jb++ ){
661 if( Urow[jb].blockIdx > snode_index
662 && (Urow[jb].blockIdx % this->grid_->numProcRow) == srcRow ){
667 serialize( (Int)count, sstm, NO_MASK );
669 for( Int jb = 0; jb < Urow.size(); jb++ ){
670 if( Urow[jb].blockIdx > snode_index
671 && (Urow[jb].blockIdx % this->grid_->numProcRow) == srcRow ){
672 #if ( _DEBUGlevel_ >= 1 )
673 statusOFS <<
"["<<snode_index<<
"] SEND contains "<<Urow[jb].blockIdx
674 <<
" which corresponds to "<<
GBi(jb,this->grid_)
678 serialize( Urow[jb], sstm, mask );
682 sstrUrowSend.resize( Size(sstm) );
683 sstm.read( &sstrUrowSend[0], sstrUrowSend.size() );
684 sstrSizeU = sstrUrowSend.size();
687 MPI_Isend( &sstrSizeU, 1, MPI_INT, src,
688 IDX_TO_TAG2(snode_index,supidx,SELINV_TAG_U_SIZE_CD),
689 this->grid_->comm, &mpiReqSizeSendU );
690 MPI_Isend( (
void*)&sstrUrowSend[0], sstrSizeU, MPI_BYTE, src,
691 IDX_TO_TAG2(snode_index,supidx,SELINV_TAG_U_CONTENT_CD),
692 this->grid_->comm, &mpiReqSendU );
698 TIMER_STOP(Send_U_Recv_Size_L_CrossDiag);
700 TIMER_STOP(SendRecvSizesCD);
704 inline void PMatrixUnsym<T>::IRecvContentCD(
705 std::vector<Int > & arrSuperNodes,
706 Int stepSuper, CDBuffers & buffers) {
707 TIMER_START(IrecvContentCD);
711 TIMER_START(Wait_Size_L_CrossDiag);
712 mpi::Waitall(buffers.arrMpiReqsSizeRecvLCD);
713 TIMER_STOP(Wait_Size_L_CrossDiag);
718 for (Int supidx=0; supidx<stepSuper; supidx++){
719 Int snode_index = arrSuperNodes[supidx];
721 TIMER_START(Recv_L_CrossDiag);
722 if(
MYROW( this->grid_ ) ==
PROW( snode_index, this->grid_ )
723 && this->isRecvFromCrossDiagonal_(this->grid_->numProcRow, snode_index ) ){
725 for(Int srcRow = 0; srcRow<this->grid_->numProcRow; srcRow++){
726 if(this->isRecvFromCrossDiagonal_(srcRow,snode_index) ){
727 Int src =
PNUM(srcRow,
PCOL(snode_index,this->grid_),this->grid_);
728 if(
MYPROC( this->grid_ ) != src ){
730 buffers.arrSstrLrowSizeRecvCD[buffers.recvOffset[supidx]+recvIdxL];
731 std::vector<char> & sstrLrowRecv =
732 buffers.arrSstrLrowRecvCD[buffers.recvOffset[supidx]+recvIdxL];
733 MPI_Request & mpiReqRecvL =
734 buffers.arrMpiReqsRecvLCD[buffers.recvOffset[supidx]+recvIdxL];
735 sstrLrowRecv.resize( sstrSizeL);
737 MPI_Irecv( (
void*)&sstrLrowRecv[0], sstrSizeL, MPI_BYTE, src,
738 IDX_TO_TAG2(snode_index,supidx,SELINV_TAG_L_CONTENT_CD),
739 this->grid_->comm, &mpiReqRecvL );
746 TIMER_STOP(Recv_L_CrossDiag);
750 TIMER_START(Wait_Size_U_CrossDiag);
751 mpi::Waitall(buffers.arrMpiReqsSizeRecvUCD);
752 TIMER_STOP(Wait_Size_U_CrossDiag);
755 for (Int supidx=0; supidx<stepSuper; supidx++){
756 Int snode_index = arrSuperNodes[supidx];
757 TIMER_START(Recv_U_CrossDiag);
758 if(
MYCOL( this->grid_ ) ==
PCOL( snode_index, this->grid_ )
759 && this->isSendToCrossDiagonal_(this->grid_->numProcCol, snode_index ) ){
761 for(Int srcCol = 0; srcCol<this->grid_->numProcCol; srcCol++){
762 if(this->isSendToCrossDiagonal_(srcCol,snode_index) ){
763 Int src =
PNUM(
PROW(snode_index,this->grid_),srcCol,this->grid_);
764 if(
MYPROC( this->grid_ ) != src ){
766 buffers.arrSstrUcolSizeRecvCD[buffers.sendOffset[supidx]+recvIdxU];
767 std::vector<char> & sstrUcolRecv =
768 buffers.arrSstrUcolRecvCD[buffers.sendOffset[supidx]+recvIdxU];
769 MPI_Request & mpiReqRecvU =
770 buffers.arrMpiReqsRecvUCD[buffers.sendOffset[supidx]+recvIdxU];
771 sstrUcolRecv.resize( sstrSizeU );
773 MPI_Irecv( (
void*)&sstrUcolRecv[0], sstrSizeU, MPI_BYTE, src,
774 IDX_TO_TAG2(snode_index,supidx,SELINV_TAG_U_CONTENT_CD),
775 this->grid_->comm, &mpiReqRecvU );
781 TIMER_STOP(Recv_U_CrossDiag);
784 TIMER_START(IrecvContentCD);
789 inline void PMatrixUnsym<T>::WaitContentLCD(
790 std::vector<Int > & arrSuperNodes,
791 Int stepSuper, CDBuffers & buffers) {
792 TIMER_START(WaitContentLCD);
794 TIMER_START(Wait_L_CrossDiag);
795 mpi::Waitall(buffers.arrMpiReqsRecvLCD);
796 TIMER_STOP(Wait_L_CrossDiag);
802 for (Int supidx=0; supidx<stepSuper; supidx++){
803 Int snode_index = arrSuperNodes[supidx];
804 Int ksupProcRow =
PROW( snode_index, this->grid_ );
805 Int ksupProcCol =
PCOL( snode_index, this->grid_ );
807 if(
MYROW( this->grid_ ) ==
PROW( snode_index, this->grid_ ) &&
808 this->isRecvFromCrossDiagonal_(this->grid_->numProcRow, snode_index ) ){
810 #if ( _DEBUGlevel_ >= 1 )
811 statusOFS << std::endl <<
" ["<<snode_index<<
"] "
812 <<
"Update the upper triangular block"
813 << std::endl << std::endl;
814 statusOFS << std::endl <<
" ["<<snode_index<<
"] "
816 << std::endl << std::endl;
817 statusOFS << std::endl <<
" ["<<snode_index<<
"] "
819 << std::endl << std::endl;
822 std::vector<LBlock<T> >& Lcol = this->L(
LBj(snode_index, this->grid_) );
823 std::vector<LBlock<T> >& Lrow = this->Lrow(
LBi( snode_index, this->grid_ ) );
830 std::vector<Int> isBlockFound(Lrow.size(),
false);
833 for(Int srcRow = 0; srcRow<this->grid_->numProcRow; srcRow++){
834 if(this->isRecvFromCrossDiagonal_(srcRow,snode_index) ){
835 Int src =
PNUM(srcRow,
PCOL(snode_index,this->grid_),this->grid_);
836 TIMER_START(Recv_L_CrossDiag);
839 std::vector<LBlock<T> > * pLcol;
840 std::vector<LBlock<T> > LcolRecv;
841 if(
MYPROC(this->grid_)!= src){
843 buffers.arrSstrLrowSizeRecvCD[buffers.recvOffset[supidx]+recvIdxL];
844 std::vector<char> & sstrLrowRecv =
845 buffers.arrSstrLrowRecvCD[buffers.recvOffset[supidx]+recvIdxL];
847 std::stringstream sstm;
849 #if ( _DEBUGlevel_ >= 1 )
850 statusOFS <<
"["<<snode_index<<
"] P"<<
MYPROC(this->grid_)<<
" ("<<
MYROW(this->grid_)
851 <<
","<<
MYCOL(this->grid_)<<
") <--- LBj("<<snode_index<<
") <--- P"
852 << src <<
" ("<<srcRow<<
","<<ksupProcCol<<
")"
855 sstm.write( &sstrLrowRecv[0], sstrSizeL );
859 std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
860 deserialize( numLBlock, sstm, NO_MASK );
861 LcolRecv.resize(numLBlock);
862 for( Int ib = 0; ib < numLBlock; ib++ ){
863 deserialize( LcolRecv[ib], sstm, mask );
864 #if ( _DEBUGlevel_ >= 1 )
865 statusOFS <<
"["<<snode_index<<
"] RECV contains "
866 << LcolRecv[ib].blockIdx<<
" which corresponds to "
867 << ib * this->grid_->numProcRow + srcRow;
868 statusOFS <<
" L is on row "<< srcRow
869 <<
" whereas Lrow is on col "
870 << LcolRecv[ib].blockIdx % this->grid_->numProcCol
883 TIMER_STOP(Recv_L_CrossDiag);
888 Int startIdx = (
MYPROC( this->grid_ ) == src &&
MYPROC(this->grid_) ==
PNUM(ksupProcRow,ksupProcCol, this->grid_) ) ? 1:0;
889 for( Int ib = startIdx; ib < pLcol->size(); ib++ ){
890 LBlock<T> & LB = (*pLcol)[ib];
891 if( LB.blockIdx <= snode_index ){
895 throw std::logic_error(
"LcolRecv contains the wrong blocks." );
899 if(
PCOL(LB.blockIdx, this->grid_) !=
MYCOL(this->grid_)){
906 for( Int iib = 0; iib < Lrow.size(); iib++ ){
907 LBlock<T> & LrowB = Lrow[ iib ];
910 if( LrowB.blockIdx == -1){
911 LrowB.blockIdx = LB.blockIdx;
914 if( LB.blockIdx == LrowB.blockIdx ){
919 LrowB.rows = LB.rows;
921 Transpose(LB.nzval, LrowB.nzval);
922 LrowB.numCol = LB.numRow;
923 LrowB.numRow = LB.numCol;
927 #if ( _DEBUGlevel_ >= 1 )
928 statusOFS<<
"["<<snode_index<<
"] USING LB "<<LB.blockIdx<< std::endl;
930 isBlockFound[iib] = 1;
938 for( Int ib = 0; ib < Lrow.size(); ib++ ){
939 LBlock<T> & LB = Lrow[ib];
940 if( !isBlockFound[ib] ){
942 std::vector<LBlock<T> >& Lcol = this->L(
LBj( snode_index, this->grid_ ) );
943 std::vector<UBlock<T> >& Urow = this->U(
LBi( snode_index, this->grid_ ) );
945 for( Int ib = 0; ib < Lcol.size(); ib++ ){ statusOFS<<Lcol[ib].blockIdx<<
" "; }statusOFS<<endl;
946 for( Int jb = 0; jb < Urow.size(); jb++ ){ statusOFS<<Urow[jb].blockIdx; }statusOFS<<endl;
947 for( Int jb = 0; jb < Lrow.size(); jb++ ){ statusOFS<<Lrow[jb].blockIdx<<
" "; }statusOFS<<endl;
950 throw std::logic_error(
951 "LBlock cannot find its update. Something is seriously wrong."
958 TIMER_STOP(WaitContentLCD);
964 inline void PMatrixUnsym<T>::WaitContentUCD(
965 std::vector<Int > & arrSuperNodes,
966 Int stepSuper, CDBuffers & buffers) {
968 TIMER_START(WaitContentUCD);
970 TIMER_START(Wait_U_CrossDiag);
971 mpi::Waitall(buffers.arrMpiReqsRecvUCD);
972 TIMER_STOP(Wait_U_CrossDiag);
976 for (Int supidx=0; supidx<stepSuper; supidx++){
977 Int snode_index = arrSuperNodes[supidx];
978 Int ksupProcRow =
PROW( snode_index, this->grid_ );
979 Int ksupProcCol =
PCOL( snode_index, this->grid_ );
982 if(
MYCOL( this->grid_ ) ==
PCOL( snode_index, this->grid_ ) &&
983 this->isSendToCrossDiagonal_(this->grid_->numProcCol, snode_index ) ){
985 std::vector<UBlock<T> >& Ucol = this->Ucol(
LBj( snode_index, this->grid_ ) );
992 std::vector<Int> isBlockFound(Ucol.size(),
false);
995 for(Int srcCol = 0; srcCol<this->grid_->numProcCol; srcCol++){
996 if(this->isSendToCrossDiagonal_(srcCol,snode_index) ){
997 Int src =
PNUM(
PROW(snode_index,this->grid_),srcCol,this->grid_);
998 TIMER_START(Recv_U_CrossDiag);
1001 std::vector<UBlock<T> > * pUrow;
1002 std::vector<UBlock<T> > UrowRecv;
1003 if(
MYPROC(this->grid_)!= src){
1005 buffers.arrSstrUcolSizeRecvCD[buffers.sendOffset[supidx]+recvIdxU];
1006 std::vector<char> & sstrUcolRecv =
1007 buffers.arrSstrUcolRecvCD[buffers.sendOffset[supidx]+recvIdxU];
1010 std::stringstream sstm;
1012 #if ( _DEBUGlevel_ >= 1 )
1013 statusOFS <<
"["<<snode_index<<
"] P"<<
MYPROC(this->grid_)<<
" ("<<
MYROW(this->grid_)
1014 <<
","<<
MYCOL(this->grid_)<<
") <--- LBi("<<snode_index<<
") <--- P"
1015 << src<<
" ("<<ksupProcCol<<
","<<srcCol<<
")"<<std::endl;
1017 sstm.write( &sstrUcolRecv[0], sstrSizeU );
1021 std::vector<Int> mask( UBlockMask::TOTAL_NUMBER, 1 );
1022 deserialize( numUBlock, sstm, NO_MASK );
1023 UrowRecv.resize(numUBlock);
1024 for( Int jb = 0; jb < numUBlock; jb++ ){
1025 deserialize( UrowRecv[jb], sstm, mask );
1026 #if ( _DEBUGlevel_ >= 1 )
1027 statusOFS <<
"["<<snode_index<<
"] RECV contains "
1028 << UrowRecv[jb].blockIdx<<
" which corresponds to "
1029 << jb * this->grid_->numProcCol + srcCol;
1030 statusOFS <<
" Urow is on col "<< srcCol
1031 <<
" whereas U is on row "
1032 << UrowRecv[jb].blockIdx % this->grid_->numProcRow
1041 std::vector<UBlock<T> >& Urow = this->U(
LBi( snode_index, this->grid_ ) );
1046 TIMER_STOP(Recv_U_CrossDiag);
1049 for( Int jb = 0; jb < pUrow->size(); jb++ ){
1050 UBlock<T> & UB = (*pUrow)[jb];
1051 if( UB.blockIdx <= snode_index ){
1055 throw std::logic_error(
"UrowRecv contains the wrong blocks." );
1059 if(
PROW(UB.blockIdx, this->grid_) !=
MYROW(this->grid_)){
1065 for( Int jjb = 0; jjb < Ucol.size(); jjb++ ){
1066 UBlock<T> & UcolB = Ucol[jjb];
1070 if( UcolB.blockIdx == -1){
1071 UcolB.blockIdx = UB.blockIdx;
1075 if( UB.blockIdx == UcolB.blockIdx ){
1083 #if ( _DEBUGlevel_ >= 1 )
1084 statusOFS<<
"["<<snode_index<<
"] USING UB "<<UB.blockIdx<< std::endl;
1086 isBlockFound[jjb] = 1;
1095 for( Int jb = 0; jb < Ucol.size(); jb++ ){
1096 UBlock<T> & UB = Ucol[jb];
1097 if( !isBlockFound[jb] ){
1100 std::vector<LBlock<T> >& Lcol = this->L(
LBj( snode_index, this->grid_ ) );
1101 std::vector<UBlock<T> >& Urow = this->U(
LBi( snode_index, this->grid_ ) );
1103 for( Int ib = 0; ib < Lcol.size(); ib++ ){ statusOFS<<Lcol[ib].blockIdx<<
" "; }statusOFS<<endl;
1104 for( Int jb = 0; jb < Urow.size(); jb++ ){ statusOFS<<Urow[jb].blockIdx; }statusOFS<<endl;
1105 for( Int jb = 0; jb < Ucol.size(); jb++ ){ statusOFS<<Ucol[jb].blockIdx<<
" "; }statusOFS<<endl;
1108 throw std::logic_error(
1109 "UBlock cannot find its update. Something is seriously wrong."
1117 TIMER_STOP(WaitContentUCD);
1123 template<
typename T>
1125 std::vector<SuperNodeBufferTypeUnsym > & arrSuperNodes,
1129 TIMER_START(Send_CD);
1133 Int sendOffset[stepSuper];
1134 Int recvOffset[stepSuper];
1137 for (Int supidx=0; supidx<stepSuper; supidx++){
1139 sendOffset[supidx]=sendCount;
1140 recvOffset[supidx]=recvCount;
1141 sendCount+= this->CountSendToCrossDiagonal(snode.Index);
1142 recvCount+= this->CountRecvFromCrossDiagonal(snode.Index);
1146 std::vector<MPI_Request > arrMpiReqsSendLCD(sendCount, MPI_REQUEST_NULL );
1147 std::vector<MPI_Request > arrMpiReqsSizeSendLCD(sendCount, MPI_REQUEST_NULL );
1148 std::vector<MPI_Request > arrMpiReqsRecvLCD(recvCount, MPI_REQUEST_NULL );
1149 std::vector<MPI_Request > arrMpiReqsSizeRecvLCD(recvCount, MPI_REQUEST_NULL );
1150 std::vector<std::vector<char> > arrSstrLcolSendCD(sendCount);
1151 std::vector<int > arrSstrLcolSizeSendCD(sendCount);
1152 std::vector<std::vector<char> > arrSstrLrowRecvCD(recvCount);
1153 std::vector<int > arrSstrLrowSizeRecvCD(recvCount);
1156 std::vector<MPI_Request > arrMpiReqsSendUCD(recvCount, MPI_REQUEST_NULL );
1157 std::vector<MPI_Request > arrMpiReqsSizeSendUCD(recvCount, MPI_REQUEST_NULL );
1158 std::vector<MPI_Request > arrMpiReqsRecvUCD(sendCount, MPI_REQUEST_NULL );
1159 std::vector<MPI_Request > arrMpiReqsSizeRecvUCD(sendCount, MPI_REQUEST_NULL );
1160 std::vector<std::vector<char> > arrSstrUrowSendCD(recvCount);
1161 std::vector<int > arrSstrUrowSizeSendCD(recvCount);
1162 std::vector<std::vector<char> > arrSstrUcolRecvCD(sendCount);
1163 std::vector<int > arrSstrUcolSizeRecvCD(sendCount);
1168 for (Int supidx=0; supidx<stepSuper; supidx++){
1171 TIMER_START(Send_L_Recv_Size_U_CrossDiag);
1173 if(
MYCOL( this->grid_ ) ==
PCOL( snode.Index, this->grid_ )
1174 && this->isSendToCrossDiagonal_(this->grid_->numProcCol, snode.Index ) ){
1178 for(Int dstCol = 0; dstCol<this->grid_->numProcCol; dstCol++){
1179 if(this->isSendToCrossDiagonal_(dstCol,snode.Index) ){
1180 Int dest =
PNUM(
PROW(snode.Index,this->grid_),dstCol,this->grid_);
1182 if(
MYPROC( this->grid_ ) != dest ){
1184 MPI_Request & mpiReqSizeSendL =
1185 arrMpiReqsSizeSendLCD[sendOffset[supidx]+sendIdxL];
1186 MPI_Request & mpiReqSendL =
1187 arrMpiReqsSendLCD[sendOffset[supidx]+sendIdxL];
1188 std::vector<char> & sstrLcolSend =
1189 arrSstrLcolSendCD[sendOffset[supidx]+sendIdxL];
1191 arrSstrLcolSizeSendCD[sendOffset[supidx]+sendIdxL];
1194 std::stringstream sstm;
1195 std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
1196 std::vector<LBlock<T> >& Lcol = this->L(
LBj(snode.Index, this->grid_) );
1201 Int startIdx = (
MYROW( this->grid_ ) ==
PROW( snode.Index, this->grid_ ) ) ? 1:0;
1203 for( Int ib = startIdx; ib < Lcol.size(); ib++ ){
1204 if( Lcol[ib].blockIdx > snode.Index &&
1205 (Lcol[ib].blockIdx % this->grid_->numProcCol) == dstCol ){
1210 serialize( (Int)count, sstm, NO_MASK );
1212 for( Int ib = startIdx; ib < Lcol.size(); ib++ ){
1213 if( Lcol[ib].blockIdx > snode.Index &&
1214 (Lcol[ib].blockIdx % this->grid_->numProcCol) == dstCol ){
1215 #if ( _DEBUGlevel_ >= 1 )
1216 statusOFS <<
"["<<snode.Index<<
"] SEND contains "<<Lcol[ib].blockIdx
1217 <<
" which corresponds to "<<
GBj(ib,this->grid_)
1221 serialize( Lcol[ib], sstm, mask );
1225 sstrLcolSend.resize( Size(sstm) );
1226 sstm.read( &sstrLcolSend[0], sstrLcolSend.size() );
1227 sstrSizeL = sstrLcolSend.size();
1231 MPI_Isend( &sstrSizeL, 1, MPI_INT, dest,
1232 IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_L_SIZE_CD),
1233 this->grid_->comm, &mpiReqSizeSendL );
1234 MPI_Isend( (
void*)&sstrLcolSend[0], sstrSizeL, MPI_BYTE, dest,
1235 IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_L_CONTENT_CD),
1236 this->grid_->comm, &mpiReqSendL );
1241 arrSstrUcolSizeRecvCD[sendOffset[supidx]+recvIdxU];
1242 MPI_Request & mpiReqSizeRecvU =
1243 arrMpiReqsSizeRecvUCD[sendOffset[supidx]+recvIdxU];
1245 MPI_Irecv( &sstrSizeU, 1, MPI_INT, dest,
1246 IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_U_SIZE_CD),
1247 this->grid_->comm, &mpiReqSizeRecvU );
1253 TIMER_STOP(Send_L_Recv_Size_U_CrossDiag);
1258 for (Int supidx=0; supidx<stepSuper; supidx++){
1260 TIMER_START(Send_U_Recv_Size_L_CrossDiag);
1262 if(
MYROW( this->grid_ ) ==
PROW( snode.Index, this->grid_ )
1263 && this->isRecvFromCrossDiagonal_(this->grid_->numProcRow, snode.Index ) ){
1266 for(Int srcRow = 0; srcRow<this->grid_->numProcRow; srcRow++){
1267 if(this->isRecvFromCrossDiagonal_(srcRow,snode.Index) ){
1268 Int src =
PNUM(srcRow,
PCOL(snode.Index,this->grid_),this->grid_);
1269 if(
MYPROC( this->grid_ ) != src ){
1272 arrSstrLrowSizeRecvCD[recvOffset[supidx]+recvIdxL];
1273 MPI_Request & mpiReqSizeRecvL =
1274 arrMpiReqsSizeRecvLCD[recvOffset[supidx]+recvIdxL];
1276 MPI_Irecv( &sstrSizeL, 1, MPI_INT, src,
1277 IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_L_SIZE_CD),
1278 this->grid_->comm, &mpiReqSizeRecvL );
1283 MPI_Request & mpiReqSizeSendU =
1284 arrMpiReqsSizeSendUCD[recvOffset[supidx]+sendIdxU];
1285 MPI_Request & mpiReqSendU =
1286 arrMpiReqsSendUCD[recvOffset[supidx]+sendIdxU];
1287 std::vector<char> & sstrUrowSend =
1288 arrSstrUrowSendCD[recvOffset[supidx]+sendIdxU];
1290 arrSstrUrowSizeSendCD[recvOffset[supidx]+sendIdxU];
1293 std::stringstream sstm;
1294 std::vector<Int> mask( UBlockMask::TOTAL_NUMBER, 1 );
1295 std::vector<UBlock<T> >& Urow = this->U(
LBi(snode.Index, this->grid_) );
1300 for( Int jb = 0; jb < Urow.size(); jb++ ){
1301 if( Urow[jb].blockIdx > snode.Index
1302 && (Urow[jb].blockIdx % this->grid_->numProcRow) == srcRow ){
1307 serialize( (Int)count, sstm, NO_MASK );
1309 for( Int jb = 0; jb < Urow.size(); jb++ ){
1310 if( Urow[jb].blockIdx > snode.Index
1311 && (Urow[jb].blockIdx % this->grid_->numProcRow) == srcRow ){
1312 #if ( _DEBUGlevel_ >= 1 )
1313 statusOFS <<
"["<<snode.Index<<
"] SEND contains "<<Urow[jb].blockIdx
1314 <<
" which corresponds to "<<
GBi(jb,this->grid_)
1318 serialize( Urow[jb], sstm, mask );
1322 sstrUrowSend.resize( Size(sstm) );
1323 sstm.read( &sstrUrowSend[0], sstrUrowSend.size() );
1324 sstrSizeU = sstrUrowSend.size();
1327 MPI_Isend( &sstrSizeU, 1, MPI_INT, src,
1328 IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_U_SIZE_CD),
1329 this->grid_->comm, &mpiReqSizeSendU );
1330 MPI_Isend( (
void*)&sstrUrowSend[0], sstrSizeU, MPI_BYTE, src,
1331 IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_U_CONTENT_CD),
1332 this->grid_->comm, &mpiReqSendU );
1338 TIMER_STOP(Send_U_Recv_Size_L_CrossDiag);
1342 TIMER_START(Wait_Size_L_CrossDiag);
1343 mpi::Waitall(arrMpiReqsSizeRecvLCD);
1344 TIMER_STOP(Wait_Size_L_CrossDiag);
1349 for (Int supidx=0; supidx<stepSuper; supidx++){
1352 TIMER_START(Recv_L_CrossDiag);
1353 if(
MYROW( this->grid_ ) ==
PROW( snode.Index, this->grid_ )
1354 && this->isRecvFromCrossDiagonal_(this->grid_->numProcRow, snode.Index ) ){
1356 for(Int srcRow = 0; srcRow<this->grid_->numProcRow; srcRow++){
1357 if(this->isRecvFromCrossDiagonal_(srcRow,snode.Index) ){
1358 Int src =
PNUM(srcRow,
PCOL(snode.Index,this->grid_),this->grid_);
1359 if(
MYPROC( this->grid_ ) != src ){
1361 arrSstrLrowSizeRecvCD[recvOffset[supidx]+recvIdxL];
1362 std::vector<char> & sstrLrowRecv =
1363 arrSstrLrowRecvCD[recvOffset[supidx]+recvIdxL];
1364 MPI_Request & mpiReqRecvL =
1365 arrMpiReqsRecvLCD[recvOffset[supidx]+recvIdxL];
1366 sstrLrowRecv.resize( sstrSizeL);
1368 MPI_Irecv( (
void*)&sstrLrowRecv[0], sstrSizeL, MPI_BYTE, src,
1369 IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_L_CONTENT_CD),
1370 this->grid_->comm, &mpiReqRecvL );
1377 TIMER_STOP(Recv_L_CrossDiag);
1381 TIMER_START(Wait_Size_U_CrossDiag);
1382 mpi::Waitall(arrMpiReqsSizeRecvUCD);
1383 TIMER_STOP(Wait_Size_U_CrossDiag);
1386 for (Int supidx=0; supidx<stepSuper; supidx++){
1388 TIMER_START(Recv_U_CrossDiag);
1389 if(
MYCOL( this->grid_ ) ==
PCOL( snode.Index, this->grid_ )
1390 && this->isSendToCrossDiagonal_(this->grid_->numProcCol, snode.Index ) ){
1392 for(Int srcCol = 0; srcCol<this->grid_->numProcCol; srcCol++){
1393 if(this->isSendToCrossDiagonal_(srcCol,snode.Index) ){
1394 Int src =
PNUM(
PROW(snode.Index,this->grid_),srcCol,this->grid_);
1395 if(
MYPROC( this->grid_ ) != src ){
1397 arrSstrUcolSizeRecvCD[sendOffset[supidx]+recvIdxU];
1398 std::vector<char> & sstrUcolRecv =
1399 arrSstrUcolRecvCD[sendOffset[supidx]+recvIdxU];
1400 MPI_Request & mpiReqRecvU =
1401 arrMpiReqsRecvUCD[sendOffset[supidx]+recvIdxU];
1402 sstrUcolRecv.resize( sstrSizeU );
1404 MPI_Irecv( (
void*)&sstrUcolRecv[0], sstrSizeU, MPI_BYTE, src,
1405 IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_U_CONTENT_CD),
1406 this->grid_->comm, &mpiReqRecvU );
1412 TIMER_STOP(Recv_U_CrossDiag);
1416 TIMER_START(Wait_L_CrossDiag);
1417 mpi::Waitall(arrMpiReqsRecvLCD);
1418 TIMER_STOP(Wait_L_CrossDiag);
1424 for (Int supidx=0; supidx<stepSuper; supidx++){
1426 Int ksupProcRow =
PROW( snode.Index, this->grid_ );
1427 Int ksupProcCol =
PCOL( snode.Index, this->grid_ );
1429 if(
MYROW( this->grid_ ) ==
PROW( snode.Index, this->grid_ ) &&
1430 this->isRecvFromCrossDiagonal_(this->grid_->numProcRow, snode.Index ) ){
1432 #if ( _DEBUGlevel_ >= 1 )
1433 statusOFS << std::endl <<
" ["<<snode.Index<<
"] "
1434 <<
"Update the upper triangular block"
1435 << std::endl << std::endl;
1436 statusOFS << std::endl <<
" ["<<snode.Index<<
"] "
1437 <<
"blockIdxLocal:" << snode.BlockIdxLocal
1438 << std::endl << std::endl;
1439 statusOFS << std::endl <<
" ["<<snode.Index<<
"] "
1440 <<
"rowLocalPtr:" << snode.RowLocalPtr
1441 << std::endl << std::endl;
1444 std::vector<LBlock<T> >& Lcol = this->L(
LBj(snode.Index, this->grid_) );
1445 std::vector<LBlock<T> >& Lrow = this->Lrow(
LBi( snode.Index, this->grid_ ) );
1452 std::vector<Int> isBlockFound(Lrow.size(),
false);
1455 for(Int srcRow = 0; srcRow<this->grid_->numProcRow; srcRow++){
1456 if(this->isRecvFromCrossDiagonal_(srcRow,snode.Index) ){
1457 Int src =
PNUM(srcRow,
PCOL(snode.Index,this->grid_),this->grid_);
1458 TIMER_START(Recv_L_CrossDiag);
1461 std::vector<LBlock<T> > * pLcol;
1462 std::vector<LBlock<T> > LcolRecv;
1463 if(
MYPROC(this->grid_)!= src){
1465 arrSstrLrowSizeRecvCD[recvOffset[supidx]+recvIdxL];
1466 std::vector<char> & sstrLrowRecv =
1467 arrSstrLrowRecvCD[recvOffset[supidx]+recvIdxL];
1469 std::stringstream sstm;
1471 #if ( _DEBUGlevel_ >= 1 )
1472 statusOFS <<
"["<<snode.Index<<
"] P"<<
MYPROC(this->grid_)<<
" ("<<
MYROW(this->grid_)
1473 <<
","<<
MYCOL(this->grid_)<<
") <--- LBj("<<snode.Index<<
") <--- P"
1474 << src <<
" ("<<srcRow<<
","<<ksupProcCol<<
")"
1477 sstm.write( &sstrLrowRecv[0], sstrSizeL );
1481 std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
1482 deserialize( numLBlock, sstm, NO_MASK );
1483 LcolRecv.resize(numLBlock);
1484 for( Int ib = 0; ib < numLBlock; ib++ ){
1485 deserialize( LcolRecv[ib], sstm, mask );
1486 #if ( _DEBUGlevel_ >= 1 )
1487 statusOFS <<
"["<<snode.Index<<
"] RECV contains "
1488 << LcolRecv[ib].blockIdx<<
" which corresponds to "
1489 << ib * this->grid_->numProcRow + srcRow;
1490 statusOFS <<
" L is on row "<< srcRow
1491 <<
" whereas Lrow is on col "
1492 << LcolRecv[ib].blockIdx % this->grid_->numProcCol
1505 TIMER_STOP(Recv_L_CrossDiag);
1510 Int startIdx = (
MYPROC( this->grid_ ) == src &&
MYPROC(this->grid_) ==
PNUM(ksupProcRow,ksupProcCol, this->grid_) ) ? 1:0;
1511 for( Int ib = startIdx; ib < pLcol->size(); ib++ ){
1517 throw std::logic_error(
"LcolRecv contains the wrong blocks." );
1528 for( Int iib = 0; iib < Lrow.size(); iib++ ){
1549 #if ( _DEBUGlevel_ >= 1 )
1550 statusOFS<<
"["<<snode.Index<<
"] USING LB "<<LB.
blockIdx<< std::endl;
1552 isBlockFound[iib] = 1;
1560 for( Int ib = 0; ib < Lrow.size(); ib++ ){
1562 if( !isBlockFound[ib] ){
1566 throw std::logic_error(
1567 "LBlock cannot find its update. Something is seriously wrong."
1575 TIMER_START(Wait_U_CrossDiag);
1576 mpi::Waitall(arrMpiReqsRecvUCD);
1577 TIMER_STOP(Wait_U_CrossDiag);
1581 for (Int supidx=0; supidx<stepSuper; supidx++){
1583 Int ksupProcRow =
PROW( snode.Index, this->grid_ );
1584 Int ksupProcCol =
PCOL( snode.Index, this->grid_ );
1587 if(
MYCOL( this->grid_ ) ==
PCOL( snode.Index, this->grid_ ) &&
1588 this->isSendToCrossDiagonal_(this->grid_->numProcCol, snode.Index ) ){
1590 std::vector<UBlock<T> >& Ucol = this->Ucol(
LBj( snode.Index, this->grid_ ) );
1597 std::vector<Int> isBlockFound(Ucol.size(),
false);
1600 for(Int srcCol = 0; srcCol<this->grid_->numProcCol; srcCol++){
1601 if(this->isSendToCrossDiagonal_(srcCol,snode.Index) ){
1602 Int src =
PNUM(
PROW(snode.Index,this->grid_),srcCol,this->grid_);
1603 TIMER_START(Recv_U_CrossDiag);
1606 std::vector<UBlock<T> > * pUrow;
1607 std::vector<UBlock<T> > UrowRecv;
1608 if(
MYPROC(this->grid_)!= src){
1610 arrSstrUcolSizeRecvCD[sendOffset[supidx]+recvIdxU];
1611 std::vector<char> & sstrUcolRecv =
1612 arrSstrUcolRecvCD[sendOffset[supidx]+recvIdxU];
1615 std::stringstream sstm;
1617 #if ( _DEBUGlevel_ >= 1 )
1618 statusOFS <<
"["<<snode.Index<<
"] P"<<
MYPROC(this->grid_)<<
" ("<<
MYROW(this->grid_)
1619 <<
","<<
MYCOL(this->grid_)<<
") <--- LBi("<<snode.Index<<
") <--- P"
1620 << src<<
" ("<<ksupProcCol<<
","<<srcCol<<
")"<<std::endl;
1622 sstm.write( &sstrUcolRecv[0], sstrSizeU );
1626 std::vector<Int> mask( UBlockMask::TOTAL_NUMBER, 1 );
1627 deserialize( numUBlock, sstm, NO_MASK );
1628 UrowRecv.resize(numUBlock);
1629 for( Int jb = 0; jb < numUBlock; jb++ ){
1630 deserialize( UrowRecv[jb], sstm, mask );
1631 #if ( _DEBUGlevel_ >= 1 )
1632 statusOFS <<
"["<<snode.Index<<
"] RECV contains "
1633 << UrowRecv[jb].blockIdx<<
" which corresponds to "
1634 << jb * this->grid_->numProcCol + srcCol;
1635 statusOFS <<
" Urow is on col "<< srcCol
1636 <<
" whereas U is on row "
1637 << UrowRecv[jb].blockIdx % this->grid_->numProcRow
1646 std::vector<UBlock<T> >& Urow = this->U(
LBi( snode.Index, this->grid_ ) );
1651 TIMER_STOP(Recv_U_CrossDiag);
1654 for( Int jb = 0; jb < pUrow->size(); jb++ ){
1660 throw std::logic_error(
"UrowRecv contains the wrong blocks." );
1670 for( Int jjb = 0; jjb < Ucol.size(); jjb++ ){
1688 #if ( _DEBUGlevel_ >= 1 )
1689 statusOFS<<
"["<<snode.Index<<
"] USING UB "<<UB.
blockIdx<< std::endl;
1691 isBlockFound[jjb] = 1;
1700 for( Int jb = 0; jb < Ucol.size(); jb++ ){
1702 if( !isBlockFound[jb] ){
1706 throw std::logic_error(
1707 "UBlock cannot find its update. Something is seriously wrong."
1715 TIMER_STOP(Send_CD);
1717 mpi::Waitall(arrMpiReqsSizeSendLCD);
1718 mpi::Waitall(arrMpiReqsSendLCD);
1720 mpi::Waitall(arrMpiReqsSizeSendUCD);
1721 mpi::Waitall(arrMpiReqsSendUCD);
1726 template<
typename T>
1736 TIMER_START(Unpack_data);
1741 #if ( _DEBUGlevel_ >= 1 )
1742 statusOFS << std::endl <<
"["<<snode.Index<<
"] "<<
"Unpack the received data for processors participate in Gemm. " << std::endl << std::endl;
1745 if(
MYROW( this->grid_ ) !=
PROW( snode.Index, this->grid_ ) ){
1746 std::stringstream sstm;
1747 sstm.write( &snode.SstrLrowRecv[0], snode.SstrLrowRecv.size() );
1748 std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
1750 deserialize( numLBlock, sstm, NO_MASK );
1751 LrowRecv.resize( numLBlock );
1752 for( Int ib = 0; ib < numLBlock; ib++ ){
1753 deserialize( LrowRecv[ib], sstm, mask );
1756 #if ( _DEBUGlevel_ >= 1 )
1757 statusOFS<<
"["<<snode.Index<<
"] "<<
"Lrow RECV "<<std::endl;
1758 for(Int ib=0;ib<LrowRecv.size();++ib){statusOFS<<LrowRecv[ib].blockIdx<<
" ";}
1759 statusOFS<<std::endl;
1767 LrowRecv.resize(this->Lrow(
LBi( snode.Index, this->grid_ ) ).size());
1768 std::copy(this->Lrow(
LBi( snode.Index, this->grid_ ) ).begin(),this->Lrow(
LBi( snode.Index, this->grid_ )).end(),LrowRecv.begin());
1773 if(
MYCOL( this->grid_ ) !=
PCOL( snode.Index, this->grid_ ) ){
1774 std::stringstream sstm;
1775 sstm.write( &snode.SstrLcolRecv[0], snode.SstrLcolRecv.size() );
1776 std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
1777 mask[LBlockMask::NZVAL] = 0;
1779 deserialize( numLBlock, sstm, NO_MASK );
1780 LcolRecv.resize( numLBlock );
1781 for( Int ib = 0; ib < numLBlock; ib++ ){
1782 deserialize( LcolRecv[ib], sstm, mask );
1786 #if ( _DEBUGlevel_ >= 1 )
1787 statusOFS<<
"["<<snode.Index<<
"] "<<
"Lcol RECV "<<std::endl;
1788 for(Int ib=0;ib<LcolRecv.size();++ib){statusOFS<<LcolRecv[ib].blockIdx<<
" ";}
1789 statusOFS<<std::endl;
1796 std::vector<LBlock<T> >& Lcol = this->L(
LBj( snode.Index, this->grid_ ) );
1797 Int startIdx = (
MYROW( this->grid_ ) ==
PROW( snode.Index, this->grid_ ) )?1:0;
1798 LcolRecv.resize( Lcol.size() - startIdx );
1799 std::copy(Lcol.begin()+startIdx,Lcol.end(),LcolRecv.begin());
1803 if(
MYROW( this->grid_ ) !=
PROW( snode.Index, this->grid_ ) ){
1804 std::stringstream sstm;
1805 sstm.write( &snode.SstrUrowRecv[0], snode.SstrUrowRecv.size() );
1806 std::vector<Int> mask( UBlockMask::TOTAL_NUMBER, 1 );
1807 mask[UBlockMask::NZVAL] = 0;
1809 deserialize( numUBlock, sstm, NO_MASK );
1810 UrowRecv.resize( numUBlock );
1811 for( Int jb = 0; jb < numUBlock; jb++ ){
1812 deserialize( UrowRecv[jb], sstm, mask );
1815 #if ( _DEBUGlevel_ >= 1 )
1816 statusOFS<<
"["<<snode.Index<<
"] "<<
"Urow RECV "<<std::endl;
1817 for(Int ib=0;ib<UrowRecv.size();++ib){statusOFS<<UrowRecv[ib].blockIdx<<
" ";}
1818 statusOFS<<std::endl;
1826 UrowRecv.resize(this->U(
LBi( snode.Index, this->grid_ ) ).size());
1827 std::copy(this->U(
LBi( snode.Index, this->grid_ ) ).begin(),this->U(
LBi( snode.Index, this->grid_ )).end(),UrowRecv.begin());
1832 if(
MYCOL( this->grid_ ) !=
PCOL( snode.Index, this->grid_ ) ){
1833 std::stringstream sstm;
1834 sstm.write( &snode.SstrUcolRecv[0], snode.SstrUcolRecv.size() );
1835 std::vector<Int> mask( UBlockMask::TOTAL_NUMBER, 1 );
1837 deserialize( numUBlock, sstm, NO_MASK );
1838 UcolRecv.resize( numUBlock );
1839 for( Int jb = 0; jb < numUBlock; jb++ ){
1840 deserialize( UcolRecv[jb], sstm, mask );
1843 #if ( _DEBUGlevel_ >= 1 )
1844 statusOFS<<
"["<<snode.Index<<
"] "<<
"Ucol RECV "<<std::endl;
1845 for(Int ib=0;ib<UcolRecv.size();++ib){statusOFS<<UcolRecv[ib].blockIdx<<
" ";}
1846 statusOFS<<std::endl;
1852 std::vector<UBlock<T> >& Ucol = this->Ucol(
LBj( snode.Index, this->grid_ ) );
1853 UcolRecv.resize( Ucol.size() );
1854 std::copy(Ucol.begin(),Ucol.end(),UcolRecv.begin());
1857 TIMER_STOP(Unpack_data);
1861 template<
typename T>
1865 TIMER_START(ComputeDiagUpdate);
1868 if(
MYCOL( this->grid_ ) ==
PCOL( snode.Index, this->grid_ ) ){
1869 #if ( _DEBUGlevel_ >= 1 )
1870 statusOFS << std::endl <<
"["<<snode.Index<<
"] "
1871 <<
"Updating the diagonal block" << std::endl << std::endl;
1873 std::vector<LBlock<T> >& Lcol = this->L(
LBj( snode.Index, this->grid_ ) );
1874 std::vector<UBlock<T> >& Ucol = this->Ucol(
LBj( snode.Index, this->grid_ ) );
1877 snode.DiagBuf.Resize(
SuperSize( snode.Index, this->super_ ),
1878 SuperSize( snode.Index, this->super_ ));
1879 SetValue(snode.DiagBuf, ZERO<T>());
1882 Int startIb = (
MYROW( this->grid_ ) ==
PROW( snode.Index, this->grid_ ))?1:0;
1883 for( Int ib = startIb; ib < Lcol.size(); ib++ ){
1887 for(jb = 0; jb < Ucol.size(); ++jb){
1888 if(Ucol[jb].blockIdx == LcolB.
blockIdx){
1894 assert(jb < Ucol.size());
1898 if(1 || (LcolB.
numRow>0 && LcolB.
numCol>0 && UcolB.numRow>0 && UcolB.numCol>0)){
1900 blas::Gemm(
'N',
'N', snode.DiagBuf.m(), snode.DiagBuf.n(),
1901 LcolB.
numRow, MINUS_ONE<T>(),
1902 UcolB.nzval.Data(), UcolB.nzval.m(),
1903 &snode.LUpdateBuf( snode.RowLocalPtr[ib-startIb], 0 ),
1904 snode.LUpdateBuf.m(),
1905 ONE<T>(), snode.DiagBuf.Data(), snode.DiagBuf.m() );
1909 #if ( _DEBUGlevel_ >= 1 )
1910 statusOFS << std::endl <<
"["<<snode.Index<<
"] "
1911 <<
"Updated the diagonal block" << std::endl << std::endl;
1914 TIMER_STOP(ComputeDiagUpdate);
1924 template<
typename T>
1928 #if defined (PROFILE) || defined(PMPI) || defined(USE_TAU)
1929 Real begin_SendULWaitContentFirst, end_SendULWaitContentFirst, time_SendULWaitContentFirst = 0;
1932 std::vector<std::vector<Int> > & superList = this->WorkingSet();
1933 Int numSteps = superList.size();
1934 Int stepSuper = superList[lidx].size();
1936 TIMER_START(AllocateBuffer);
1939 std::vector<std::vector<MPI_Request> > arrMpireqsSendLToBelow;
1940 arrMpireqsSendLToBelow.resize( stepSuper, std::vector<MPI_Request>( 2 * this->grid_->numProcRow, MPI_REQUEST_NULL ));
1941 std::vector<std::vector<MPI_Request> > arrMpireqsSendLToRight;
1942 arrMpireqsSendLToRight.resize(stepSuper, std::vector<MPI_Request>( 2 * this->grid_->numProcCol, MPI_REQUEST_NULL ));
1944 std::vector<std::vector<MPI_Request> > arrMpireqsSendUToBelow;
1945 arrMpireqsSendUToBelow.resize( stepSuper, std::vector<MPI_Request>( 2 * this->grid_->numProcRow, MPI_REQUEST_NULL ));
1946 std::vector<std::vector<MPI_Request> > arrMpireqsSendUToRight;
1947 arrMpireqsSendUToRight.resize(stepSuper, std::vector<MPI_Request>( 2 * this->grid_->numProcCol, MPI_REQUEST_NULL ));
1950 std::vector<MPI_Request> arrMpireqsSendLToLeft;
1951 arrMpireqsSendLToLeft.resize(stepSuper, MPI_REQUEST_NULL );
1954 std::vector<MPI_Request> arrMpireqsSendUToAbove;
1955 arrMpireqsSendUToAbove.resize(stepSuper, MPI_REQUEST_NULL );
1958 std::vector<MPI_Request> arrMpireqsSendToAbove;
1959 arrMpireqsSendToAbove.resize(stepSuper, MPI_REQUEST_NULL );
1964 std::vector<MPI_Request> arrMpireqsRecvLSizeFromAny;
1965 arrMpireqsRecvLSizeFromAny.resize(stepSuper*2 , MPI_REQUEST_NULL);
1966 std::vector<MPI_Request> arrMpireqsRecvLContentFromAny;
1967 arrMpireqsRecvLContentFromAny.resize(stepSuper*2 , MPI_REQUEST_NULL);
1969 std::vector<MPI_Request> arrMpireqsRecvUSizeFromAny;
1970 arrMpireqsRecvUSizeFromAny.resize(stepSuper*2 , MPI_REQUEST_NULL);
1971 std::vector<MPI_Request> arrMpireqsRecvUContentFromAny;
1972 arrMpireqsRecvUContentFromAny.resize(stepSuper*2 , MPI_REQUEST_NULL);
1978 std::vector<SuperNodeBufferTypeUnsym> arrSuperNodes(stepSuper);
1979 for (Int supidx=0; supidx<stepSuper; supidx++){
1980 arrSuperNodes[supidx].Index = superList[lidx][supidx];
1985 TIMER_STOP(AllocateBuffer);
1989 Int next_lidx = lidx+1;
1993 PushCallStack(
"PMatrixUnsym::SelInv_P2p::SendRecvCD");
1999 Int next_lidx = lidx;
2002 bool doSend =
false;
2003 for (Int supidx=0; supidx<superList[next_lidx].size(); supidx++){
2004 Int snode_index = superList[next_lidx][supidx];
2005 if(
MYROW( this->grid_ ) ==
PROW( snode_index, this->grid_ ) ){
2006 std::vector<LBlock<T> >& Lrow = this->Lrow(
LBi(snode_index, this->grid_) );
2007 Int& LrowSize = this->LrowSize_[
LBi(snode_index, this->grid_) ];
2008 if(Lrow.size()!=LrowSize){
2014 if(
MYCOL( this->grid_ ) ==
PCOL( snode_index, this->grid_ ) ){
2015 std::vector<UBlock<T> >& Ucol = this->Ucol(
LBj(snode_index, this->grid_) );
2016 Int& UcolSize = this->UcolSize_[
LBj(snode_index, this->grid_) ];
2017 if(Ucol.size()!=UcolSize){
2027 SendRecvSizesCD(superList[next_lidx], superList[next_lidx].size(),buffers);
2028 IRecvContentCD(superList[next_lidx], superList[next_lidx].size(),buffers);
2029 WaitContentLCD(superList[next_lidx], superList[next_lidx].size(),buffers);
2030 WaitContentUCD(superList[next_lidx], superList[next_lidx].size(),buffers);
2031 buffers.WaitAllSend();
2036 if(next_lidx < superList.size()){
2038 for (Int supidx=0; supidx<superList[next_lidx].size(); supidx++){
2039 Int snode_index = superList[next_lidx][supidx];
2040 if(
MYROW( this->grid_ ) ==
PROW( snode_index, this->grid_ ) ){
2041 std::vector<LBlock<T> >& Lrow = this->Lrow(
LBi(snode_index, this->grid_) );
2042 Int& LrowSize = this->LrowSize_[
LBi(snode_index, this->grid_) ];
2046 if(
MYCOL( this->grid_ ) ==
PCOL( snode_index, this->grid_ ) ){
2047 std::vector<UBlock<T> >& Ucol = this->Ucol(
LBj(snode_index, this->grid_) );
2048 Int& UcolSize = this->UcolSize_[
LBj(snode_index, this->grid_) ];
2053 SendRecvSizesCD(superList[next_lidx],superList[next_lidx].size(),nextCDBuffers);
2061 PushCallStack(
"PMatrixUnsym::SelInv_P2p::UpdateLU");
2063 #if ( _DEBUGlevel_ >= 1 )
2064 statusOFS << std::endl <<
"Communication to the Schur complement." << std::endl << std::endl;
2068 for (Int supidx=0; supidx<stepSuper; supidx++){
2070 std::vector<MPI_Request> & mpireqsSendLToBelow = arrMpireqsSendLToBelow[supidx];
2071 std::vector<MPI_Request> & mpireqsSendLToRight = arrMpireqsSendLToRight[supidx];
2072 std::vector<MPI_Request> & mpireqsSendUToBelow = arrMpireqsSendUToBelow[supidx];
2073 std::vector<MPI_Request> & mpireqsSendUToRight = arrMpireqsSendUToRight[supidx];
2075 #if ( _DEBUGlevel_ >= 1 )
2076 statusOFS << std::endl <<
"["<<snode.Index<<
"] "
2077 <<
"Communication for the Lrow part." << std::endl << std::endl;
2080 if(
MYROW( this->grid_ ) ==
PROW( snode.Index, this->grid_ ) ){
2082 TIMER_START(Serialize_LrowL);
2083 std::stringstream sstm;
2084 std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
2085 std::vector<LBlock<T> >& Lrow = this->Lrow(
LBi(snode.Index, this->grid_) );
2086 std::vector<UBlock<T> >& Urow = this->U(
LBi(snode.Index, this->grid_) );
2090 serialize( (Int)Lrow.size(), sstm, NO_MASK );
2091 for( Int ib = 0; ib < Lrow.size(); ib++ ){
2092 assert( Lrow[ib].blockIdx > snode.Index );
2093 serialize( Lrow[ib], sstm, mask );
2095 snode.SstrLrowSend.resize( Size( sstm ) );
2096 sstm.read( &snode.SstrLrowSend[0], snode.SstrLrowSend.size() );
2097 snode.SizeSstrLrowSend = snode.SstrLrowSend.size();
2098 TIMER_STOP(Serialize_LrowL);
2100 for( Int iProcRow = 0; iProcRow < this->grid_->numProcRow; iProcRow++ ){
2101 if(
MYROW( this->grid_ ) != iProcRow &&
2102 this->isSendToBelow_( iProcRow,snode.Index ) ==
true ){
2105 MPI_Isend( &snode.SizeSstrLrowSend, 1, MPI_INT,
2106 iProcRow, IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_LROW_SIZE), this->grid_->colComm, &mpireqsSendLToBelow[2*iProcRow] );
2107 MPI_Isend( (
void*)&snode.SstrLrowSend[0], snode.SizeSstrLrowSend, MPI_BYTE,
2108 iProcRow, IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_LROW_CONTENT),
2109 this->grid_->colComm, &mpireqsSendLToBelow[2*iProcRow+1] );
2110 #if ( _DEBUGlevel_ >= 1 )
2111 statusOFS<<
"["<<snode.Index<<
"] "<<
"Lrow SENT "<<std::endl;
2112 for(Int ib=0;ib<Lrow.size();++ib){statusOFS<<Lrow[ib].blockIdx<<
" ";}
2113 statusOFS<<std::endl;
2114 statusOFS <<
"["<<snode.Index<<
"] "<<
"Sending Lrow "
2115 << snode.SizeSstrLrowSend <<
" BYTES to P" <<
PNUM(iProcRow,
MYCOL(this->grid_),this->grid_)
2116 << std::endl << std::endl;
2122 #if ( _DEBUGlevel_ >= 1 )
2123 statusOFS << std::endl <<
"["<<snode.Index<<
"] "<<
"Communication for the L part." << std::endl << std::endl;
2126 if(
MYCOL( this->grid_ ) ==
PCOL( snode.Index, this->grid_ ) ){
2128 TIMER_START(Serialize_LcolL);
2130 std::stringstream sstm;
2131 std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
2132 mask[LBlockMask::NZVAL] = 0;
2134 std::vector<LBlock<T> >& Lcol = this->L(
LBj(snode.Index, this->grid_) );
2138 Int startIdx = (
MYROW( this->grid_ ) ==
PROW( snode.Index, this->grid_ ) )?1:0;
2139 serialize( (Int)Lcol.size() - startIdx, sstm, NO_MASK );
2140 for( Int ib = startIdx; ib < Lcol.size(); ib++ ){
2141 assert( Lcol[ib].blockIdx > snode.Index );
2143 #if ( _DEBUGlevel_ >= 2 )
2146 serialize( Lcol[ib], sstm, mask );
2148 snode.SstrLcolSend.resize( Size( sstm ) );
2149 sstm.read( &snode.SstrLcolSend[0], snode.SstrLcolSend.size() );
2150 snode.SizeSstrLcolSend = snode.SstrLcolSend.size();
2151 TIMER_STOP(Serialize_LcolL);
2153 for( Int iProcCol = 0; iProcCol < this->grid_->numProcCol ; iProcCol++ ){
2154 if(
MYCOL( this->grid_ ) != iProcCol &&
2155 this->isSendToRight_( iProcCol, snode.Index ) ==
true ){
2157 MPI_Isend( &snode.SizeSstrLcolSend, 1, MPI_INT,
2158 iProcCol, IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_L_SIZE),
2159 this->grid_->rowComm, &mpireqsSendLToRight[2*iProcCol] );
2160 MPI_Isend( (
void*)&snode.SstrLcolSend[0], snode.SizeSstrLcolSend, MPI_BYTE,
2161 iProcCol, IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_L_CONTENT),
2162 this->grid_->rowComm, &mpireqsSendLToRight[2*iProcCol+1] );
2163 #if ( _DEBUGlevel_ >= 1 )
2164 statusOFS<<
"["<<snode.Index<<
"] "<<
"L SENT "<<std::endl;
2165 for(Int ib=startIdx;ib<Lcol.size();++ib){statusOFS<<Lcol[ib].blockIdx<<
" ";}
2166 statusOFS<<std::endl;
2167 statusOFS <<
"["<<snode.Index<<
"] "<<
"Sending L "
2168 << snode.SizeSstrLcolSend <<
" BYTES to P" <<
PNUM(
MYROW(this->grid_),iProcCol,this->grid_)
2169 << std::endl << std::endl;
2175 #if ( _DEBUGlevel_ >= 1 )
2176 statusOFS << std::endl <<
"["<<snode.Index<<
"] "
2177 <<
"Communication for the U part." << std::endl << std::endl;
2180 if(
MYROW( this->grid_ ) ==
PROW( snode.Index, this->grid_ ) ){
2182 TIMER_START(Serialize_UcolU);
2184 std::stringstream sstm;
2185 std::vector<Int> mask( UBlockMask::TOTAL_NUMBER, 1 );
2186 mask[UBlockMask::NZVAL] = 0;
2188 std::vector<UBlock<T> >& Urow = this->U(
LBi(snode.Index, this->grid_) );
2191 serialize( (Int)Urow.size(), sstm, NO_MASK );
2192 for( Int jb = 0; jb < Urow.size(); jb++ ){
2193 assert( Urow[jb].blockIdx > snode.Index );
2194 #if ( _DEBUGlevel_ >= 2 )
2197 serialize( Urow[jb], sstm, mask );
2199 snode.SstrUrowSend.resize( Size( sstm ) );
2200 sstm.read( &snode.SstrUrowSend[0], snode.SstrUrowSend.size() );
2201 snode.SizeSstrUrowSend = snode.SstrUrowSend.size();
2202 TIMER_STOP(Serialize_UcolU);
2204 for( Int iProcRow = 0; iProcRow < this->grid_->numProcRow; iProcRow++ ){
2205 if(
MYROW( this->grid_ ) != iProcRow &&
2206 this->isSendToBelow_( iProcRow,snode.Index ) ==
true ){
2208 MPI_Isend( &snode.SizeSstrUrowSend, 1, MPI_INT,
2209 iProcRow, IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_U_SIZE), this->grid_->colComm, &mpireqsSendUToBelow[2*iProcRow] );
2210 MPI_Isend( (
void*)&snode.SstrLrowSend[0], snode.SizeSstrUrowSend, MPI_BYTE,
2211 iProcRow, IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_U_CONTENT),
2212 this->grid_->colComm, &mpireqsSendUToBelow[2*iProcRow+1] );
2213 #if ( _DEBUGlevel_ >= 1 )
2214 statusOFS << std::endl <<
"["<<snode.Index<<
"] "<<
"Sending U " << snode.SizeSstrUrowSend <<
" BYTES"<< std::endl << std::endl;
2215 statusOFS<<
"["<<snode.Index<<
"] "<<
"U SENT "<<std::endl;
2216 for(Int ib=0;ib<Urow.size();++ib){statusOFS<<Urow[ib].blockIdx<<
" ";}
2217 statusOFS<<std::endl;
2218 statusOFS <<
"["<<snode.Index<<
"] "<<
"Sending U "
2219 << snode.SizeSstrUrowSend <<
" BYTES to P" <<
PNUM(iProcRow,
MYCOL(this->grid_),this->grid_)
2220 << std::endl << std::endl;
2226 #if ( _DEBUGlevel_ >= 1 )
2227 statusOFS <<
"["<<snode.Index<<
"] "
2228 <<
"Communication for the Ucol part." << std::endl
2232 if(
MYCOL( this->grid_ ) ==
PCOL( snode.Index, this->grid_ ) ){
2234 TIMER_START(Serialize_UcolU);
2235 std::stringstream sstm;
2236 std::vector<Int> mask( UBlockMask::TOTAL_NUMBER, 1 );
2237 std::vector<UBlock<T> >& Ucol =
2238 this->Ucol(
LBj(snode.Index, this->grid_) );
2240 serialize( (Int)Ucol.size(), sstm, NO_MASK );
2241 for( Int jb = 0; jb < Ucol.size(); jb++ ){
2243 assert( UB.
blockIdx > snode.Index );
2244 serialize( UB, sstm, mask );
2246 snode.SstrUcolSend.resize( Size( sstm ) );
2247 sstm.read( &snode.SstrUcolSend[0], snode.SstrUcolSend.size() );
2248 snode.SizeSstrUcolSend = snode.SstrUcolSend.size();
2249 TIMER_STOP(Serialize_UcolU);
2251 for( Int iProcCol = 0;
2252 iProcCol < this->grid_->numProcCol ; iProcCol++ ){
2253 if(
MYCOL( this->grid_ ) != iProcCol &&
2254 this->isSendToRight_( iProcCol, snode.Index ) ==
true ){
2256 MPI_Isend( &snode.SizeSstrUcolSend, 1, MPI_INT,
2257 iProcCol, IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_UCOL_SIZE),
2258 this->grid_->rowComm, &mpireqsSendUToRight[2*iProcCol] );
2259 MPI_Isend( &snode.SstrUcolSend[0],snode.SizeSstrUcolSend,
2261 IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_UCOL_CONTENT),
2262 this->grid_->rowComm,
2263 &mpireqsSendUToRight[2*iProcCol+1] );
2264 #if ( _DEBUGlevel_ >= 2 )
2265 statusOFS<<
"["<<snode.Index<<
"] "<<
"Ucol SENT "<<std::endl;
2266 for(Int ib=0;ib<Ucol.size();++ib){
2267 statusOFS<<Ucol[ib].blockIdx<<
" ";
2269 statusOFS<<std::endl;
2271 #if ( _DEBUGlevel_ >= 1 )
2272 statusOFS <<
"["<<snode.Index<<
"] "<<
"Sending Ucol "
2273 << snode.SizeSstrUcolSend <<
" BYTES to P"
2274 <<
PNUM(
MYROW(this->grid_),iProcCol,this->grid_)
2275 << std::endl << std::endl;
2288 TIMER_START(WaitContentLU);
2290 for (Int supidx=0; supidx<stepSuper ; supidx++){
2292 MPI_Request * mpireqsRecvLFromAbove =
2293 &arrMpireqsRecvLSizeFromAny[supidx*2];
2294 MPI_Request * mpireqsRecvLFromLeft =
2295 &arrMpireqsRecvLSizeFromAny[supidx*2+1];
2296 MPI_Request * mpireqsRecvUFromAbove =
2297 &arrMpireqsRecvUSizeFromAny[supidx*2];
2298 MPI_Request * mpireqsRecvUFromLeft =
2299 &arrMpireqsRecvUSizeFromAny[supidx*2+1];
2302 if( this->isRecvFromAbove_( snode.Index ) &&
2303 MYROW( this->grid_ ) !=
PROW( snode.Index, this->grid_ ) ){
2304 Int sender =
PROW( snode.Index, this->grid_ );
2305 MPI_Irecv( &snode.SizeSstrLrowRecv, 1, MPI_INT, sender,
2306 IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_LROW_SIZE),
2307 this->grid_->colComm, mpireqsRecvLFromAbove );
2308 #if ( _DEBUGlevel_ >= 1 )
2309 statusOFS <<
"["<<snode.Index<<
"] "<<
"Receiving Lrow"
2310 <<
" size on tag "<<IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_LROW_SIZE)
2311 <<
" from P" <<
PNUM(sender,
MYCOL(this->grid_),this->grid_)
2312 << std::endl << std::endl;
2314 MPI_Irecv( &snode.SizeSstrUrowRecv, 1, MPI_INT, sender,
2315 IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_U_SIZE),
2316 this->grid_->colComm, mpireqsRecvUFromAbove );
2317 #if ( _DEBUGlevel_ >= 1 )
2318 statusOFS <<
"["<<snode.Index<<
"] "<<
"Receiving U"
2319 <<
" size on tag "<<IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_U_SIZE)
2320 <<
" from P" <<
PNUM(sender,
MYCOL(this->grid_),this->grid_)
2321 << std::endl << std::endl;
2326 if( this->isRecvFromLeft_( snode.Index ) &&
2327 MYCOL( this->grid_ ) !=
PCOL( snode.Index, this->grid_ ) ){
2328 Int sender =
PCOL( snode.Index, this->grid_ );
2329 MPI_Irecv( &snode.SizeSstrLcolRecv, 1, MPI_INT, sender,
2330 IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_L_SIZE),
2331 this->grid_->rowComm, mpireqsRecvLFromLeft );
2332 #if ( _DEBUGlevel_ >= 1 )
2333 statusOFS <<
"["<<snode.Index<<
"] "<<
"Receiving L"
2334 <<
" size on tag "<<IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_L_SIZE)
2335 <<
" from P" <<
PNUM(
MYROW(this->grid_),sender,this->grid_)
2336 << std::endl << std::endl;
2338 MPI_Irecv( &snode.SizeSstrUcolRecv, 1, MPI_INT, sender,
2339 IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_UCOL_SIZE),
2340 this->grid_->rowComm, mpireqsRecvUFromLeft );
2341 #if ( _DEBUGlevel_ >= 1 )
2342 statusOFS <<
"["<<snode.Index<<
"] "<<
"Receiving Ucol"
2343 <<
" size on tag "<<IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_UCOL_SIZE)
2344 <<
" from P" <<
PNUM(
MYROW(this->grid_),sender,this->grid_)
2345 << std::endl << std::endl;
2349 TIMER_STOP(WaitContentLU);
2352 TIMER_START(WaitContentLU);
2353 TIMER_START(WaitSize_LrowL);
2354 mpi::Waitall(arrMpireqsRecvLSizeFromAny);
2355 TIMER_STOP(WaitSize_LrowL);
2358 for (Int supidx=0; supidx<stepSuper ; supidx++){
2361 MPI_Request * mpireqsRecvFromAbove =
2362 &arrMpireqsRecvLContentFromAny[supidx*2];
2363 MPI_Request * mpireqsRecvFromLeft =
2364 &arrMpireqsRecvLContentFromAny[supidx*2+1];
2366 TIMER_START(Alloc_Buffer_Recv_LrowL);
2367 if( this->isRecvFromAbove_( snode.Index ) &&
2368 MYROW( this->grid_ ) !=
PROW( snode.Index, this->grid_ ) ){
2369 snode.SstrLrowRecv.resize( snode.SizeSstrLrowRecv );
2370 Int sender =
PROW( snode.Index, this->grid_ );
2371 MPI_Irecv( &snode.SstrLrowRecv[0], snode.SizeSstrLrowRecv, MPI_BYTE,
2372 sender, IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_LROW_CONTENT),
2373 this->grid_->colComm, mpireqsRecvFromAbove );
2374 #if ( _DEBUGlevel_ >= 1 )
2375 statusOFS <<
"["<<snode.Index<<
"] "<<
"Receiving Lrow "
2376 << snode.SizeSstrLrowRecv <<
" BYTES from P"
2377 <<
PNUM(sender,
MYCOL(this->grid_),this->grid_)
2378 << std::endl << std::endl;
2381 TIMER_STOP(Alloc_Buffer_Recv_LrowL);
2383 TIMER_START(Alloc_Buffer_Recv_LcolL);
2384 if( this->isRecvFromLeft_( snode.Index ) &&
2385 MYCOL( this->grid_ ) !=
PCOL( snode.Index, this->grid_ ) ){
2386 snode.SstrLcolRecv.resize( snode.SizeSstrLcolRecv );
2387 Int sender =
PCOL( snode.Index, this->grid_ );
2388 MPI_Irecv( &snode.SstrLcolRecv[0], snode.SizeSstrLcolRecv, MPI_BYTE,
2389 sender, IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_L_CONTENT),
2390 this->grid_->rowComm,
2391 mpireqsRecvFromLeft );
2392 #if ( _DEBUGlevel_ >= 1 )
2393 statusOFS <<
"["<<snode.Index<<
"] "<<
"Receiving L "
2394 << snode.SizeSstrLcolRecv <<
" BYTES from P"
2395 <<
PNUM(
MYROW(this->grid_),sender,this->grid_)
2396 << std::endl << std::endl;
2399 TIMER_STOP(Alloc_Buffer_Recv_LcolL);
2401 TIMER_STOP(WaitContentLU);
2405 TIMER_START(WaitContentLU);
2406 TIMER_START(WaitSize_UcolU);
2407 mpi::Waitall(arrMpireqsRecvUSizeFromAny);
2408 TIMER_STOP(WaitSize_UcolU);
2411 for (Int supidx=0; supidx<stepSuper ; supidx++){
2414 MPI_Request * mpireqsRecvFromAbove =
2415 &arrMpireqsRecvUContentFromAny[supidx*2];
2416 MPI_Request * mpireqsRecvFromLeft =
2417 &arrMpireqsRecvUContentFromAny[supidx*2+1];
2419 TIMER_START(Alloc_Buffer_Recv_UrowL);
2420 if( this->isRecvFromAbove_( snode.Index ) &&
2421 MYROW( this->grid_ ) !=
PROW( snode.Index, this->grid_ ) ){
2422 snode.SstrUrowRecv.resize( snode.SizeSstrUrowRecv );
2423 Int sender =
PROW( snode.Index, this->grid_ );
2424 MPI_Irecv( &snode.SstrUrowRecv[0], snode.SizeSstrUrowRecv, MPI_BYTE,
2425 sender, IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_U_CONTENT),
2426 this->grid_->colComm, mpireqsRecvFromAbove );
2427 #if ( _DEBUGlevel_ >= 1 )
2428 statusOFS <<
"["<<snode.Index<<
"] "<<
"Receiving U "
2429 << snode.SizeSstrUrowRecv <<
" BYTES from P"
2430 <<
PNUM(sender,
MYCOL(this->grid_),this->grid_)
2431 << std::endl << std::endl;
2434 TIMER_STOP(Alloc_Buffer_Recv_UrowL);
2436 TIMER_START(Alloc_Buffer_Recv_UcolL);
2437 if( this->isRecvFromLeft_( snode.Index ) &&
2438 MYCOL( this->grid_ ) !=
PCOL( snode.Index, this->grid_ ) ){
2439 snode.SstrUcolRecv.resize( snode.SizeSstrUcolRecv );
2440 Int sender =
PCOL( snode.Index, this->grid_ );
2441 MPI_Irecv( &snode.SstrUcolRecv[0], snode.SizeSstrUcolRecv, MPI_BYTE,
2442 sender, IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_UCOL_CONTENT),
2443 this->grid_->rowComm,
2444 mpireqsRecvFromLeft );
2445 #if ( _DEBUGlevel_ >= 1 )
2446 statusOFS <<
"["<<snode.Index<<
"] "<<
"Receiving Ucol "
2447 << snode.SizeSstrUcolRecv <<
" BYTES from P"
2448 <<
PNUM(
MYROW(this->grid_),sender,this->grid_)
2449 << std::endl << std::endl;
2452 TIMER_STOP(Alloc_Buffer_Recv_UcolL);
2455 TIMER_STOP(WaitContentLU);
2458 TIMER_START(Compute_Sinv_LU);
2460 Int gemmProcessed = 0;
2464 std::vector<Int> readySupidx;
2466 for(Int supidx = 0;supidx<stepSuper;supidx++){
2468 if( this->isRecvFromAbove_( snode.Index )
2469 && this->isRecvFromLeft_( snode.Index )){
2471 if(
MYCOL( this->grid_ ) ==
PCOL( snode.Index, this->grid_ ) ){
2475 if(
MYROW( this->grid_ ) ==
PROW( snode.Index, this->grid_ ) ){
2479 if(snode.isReady==4){
2480 readySupidx.push_back(supidx);
2481 #if ( _DEBUGlevel_ >= 1 )
2482 statusOFS<<
"Locally processing ["<<snode.Index<<
"]"<<std::endl;
2487 TIMER_START(Reduce_Sinv_L_Send);
2488 if( this->isRecvFromLeft_( snode.Index )
2489 &&
MYCOL( this->grid_ ) !=
PCOL( snode.Index, this->grid_ ) ){
2490 MPI_Request & mpireqsSendToLeft = arrMpireqsSendLToLeft[supidx];
2493 MPI_Isend( NULL, 0, MPI_BYTE,
PCOL(snode.Index,this->grid_),
2494 IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_L_REDUCE),
2495 this->grid_->rowComm, &mpireqsSendToLeft );
2497 #if ( _DEBUGlevel_ >= 1 )
2499 PCOL(snode.Index,this->grid_),this->grid_);
2500 statusOFS <<
"["<<snode.Index<<
"] "<<
" LReduce P"
2501 <<
MYPROC(this->grid_) <<
" has sent "
2502 << 0 <<
" bytes to "
2503 << dst << std::endl;
2506 TIMER_STOP(Reduce_Sinv_L_Send);
2513 #if ( _DEBUGlevel_ >= 1 )
2514 statusOFS<<std::endl<<
"gemmToDo ="<<gemmToDo<<std::endl;
2518 #if defined (PROFILE)
2519 end_SendULWaitContentFirst=0;
2520 begin_SendULWaitContentFirst=0;
2523 while(gemmProcessed<gemmToDo)
2525 Int reqidx = MPI_UNDEFINED;
2528 TIMER_START(WaitContentLU);
2531 int reqIndicesL[arrMpireqsRecvLContentFromAny.size()];
2536 #if defined(PROFILE)
2537 if(begin_SendULWaitContentFirst==0){
2538 begin_SendULWaitContentFirst=1;
2539 TIMER_START(WaitContent_LrowL_First);
2543 TIMER_START(WaitContent_LrowL);
2544 MPI_Waitsome(2*stepSuper, &arrMpireqsRecvLContentFromAny[0],
2545 &numRecv, reqIndicesL, MPI_STATUSES_IGNORE);
2547 for(
int i =0;i<numRecv;i++){
2548 reqidx = reqIndicesL[i];
2550 if(reqidx!=MPI_UNDEFINED){
2556 #if ( _DEBUGlevel_ >= 1 )
2557 statusOFS <<
"["<<snode.Index<<
"] "<<
"Received data for L"
2558 <<
" reqidx%2=" << reqidx%2
2559 <<
" is ready ?"<<snode.isReady<<std::endl;
2562 if(snode.isReady==4){
2563 readySupidx.push_back(supidx);
2564 #if defined(PROFILE)
2565 if(end_SendULWaitContentFirst==0){
2566 TIMER_STOP(WaitContent_LrowL_First);
2567 end_SendULWaitContentFirst=1;
2574 TIMER_STOP(WaitContent_LrowL);
2577 int reqIndicesU[arrMpireqsRecvUContentFromAny.size()];
2581 TIMER_START(WaitContent_UcolU);
2582 MPI_Waitsome(2*stepSuper, &arrMpireqsRecvUContentFromAny[0],
2583 &numRecv, reqIndicesU, MPI_STATUSES_IGNORE);
2585 for(
int i =0;i<numRecv;i++){
2586 reqidx = reqIndicesU[i];
2588 if(reqidx!=MPI_UNDEFINED){
2593 #if ( _DEBUGlevel_ >= 1 )
2594 statusOFS <<
"["<<snode.Index<<
"] "<<
"Received data for U"
2595 <<
" reqidx%2=" << reqidx%2
2596 <<
" is ready ?"<<snode.isReady<<std::endl;
2599 if(snode.isReady==4){
2600 readySupidx.push_back(supidx);
2601 #if defined(PROFILE)
2602 if(end_SendULWaitContentFirst==0){
2603 TIMER_STOP(WaitContent_LrowL_First);
2604 end_SendULWaitContentFirst=1;
2611 TIMER_STOP(WaitContent_UcolU);
2613 }
while(readySupidx.size()==0);
2614 TIMER_STOP(WaitContentLU);
2617 if(readySupidx.size()>0)
2619 supidx = readySupidx.back();
2620 readySupidx.pop_back();
2625 if( this->isRecvFromAbove_( snode.Index )
2626 && this->isRecvFromLeft_( snode.Index ) ){
2627 std::vector<LBlock<T> > LcolRecv;
2628 std::vector<LBlock<T> > LrowRecv;
2629 std::vector<UBlock<T> > UcolRecv;
2630 std::vector<UBlock<T> > UrowRecv;
2640 UnpackData(snode, LcolRecv, LrowRecv, UcolRecv, UrowRecv);
2642 SelInv_lookup_indexes(snode, LcolRecv, LrowRecv,
2643 UcolRecv, UrowRecv,AinvBuf,LBuf, UBuf);
2646 #if ( _DEBUGlevel_ >= 2 )
2647 statusOFS <<
"["<<snode.Index<<
"] " <<
"LBuf: ";
2648 statusOFS << LBuf << std::endl;
2649 statusOFS <<
"["<<snode.Index<<
"] " <<
"UBuf: ";
2650 statusOFS << UBuf << std::endl;
2654 LUpdateBuf.Resize( AinvBuf.m(),
2655 SuperSize( snode.Index, this->super_ ) );
2659 TIMER_START(Compute_Sinv_L_Resize);
2660 snode.LUpdateBuf.Resize( AinvBuf.m(),
2661 SuperSize( snode.Index, this->super_ ) );
2662 TIMER_STOP(Compute_Sinv_L_Resize);
2664 TIMER_START(Compute_Sinv_LT_GEMM);
2665 blas::Gemm(
'N',
'T', AinvBuf.m(), LBuf.m(), AinvBuf.n(),
2666 MINUS_ONE<T>(), AinvBuf.Data(), AinvBuf.m(),
2667 LBuf.Data(), LBuf.m(), ZERO<T>(),
2668 snode.LUpdateBuf.Data(), snode.LUpdateBuf.m() );
2669 TIMER_STOP(Compute_Sinv_LT_GEMM);
2671 TIMER_START(Compute_Sinv_U_Resize);
2672 snode.UUpdateBuf.Resize(
SuperSize( snode.Index, this->super_ ),
2674 TIMER_STOP(Compute_Sinv_U_Resize);
2676 TIMER_START(Compute_Sinv_U_GEMM);
2677 blas::Gemm(
'N',
'N', UBuf.m(), AinvBuf.n(), AinvBuf.m(),
2678 MINUS_ONE<T>(), UBuf.Data(), UBuf.m(),
2679 AinvBuf.Data(), AinvBuf.m(), ZERO<T>(),
2680 snode.UUpdateBuf.Data(), snode.UUpdateBuf.m() );
2681 TIMER_STOP(Compute_Sinv_U_GEMM);
2683 #if ( _DEBUGlevel_ >= 2 )
2684 statusOFS <<
"["<<snode.Index<<
"] " <<
"snode.LUpdateBuf: ";
2685 statusOFS << snode.LUpdateBuf << std::endl;
2686 statusOFS <<
"["<<snode.Index<<
"] " <<
"snode.UUpdateBuf: ";
2687 statusOFS << snode.UUpdateBuf << std::endl;
2699 TIMER_START(Reduce_Sinv_L_Send);
2702 if( this->isRecvFromAbove_( snode.Index ) ){
2703 if( this->isRecvFromLeft_( snode.Index )
2704 &&
MYCOL( this->grid_ ) !=
PCOL( snode.Index, this->grid_ ) ){
2705 MPI_Request & reqsSendToLeft = arrMpireqsSendLToLeft[supidx];
2706 MPI_Isend( snode.LUpdateBuf.Data(), snode.LUpdateBuf.ByteSize(),
2707 MPI_BYTE,
PCOL(snode.Index,this->grid_),
2708 IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_L_REDUCE),
2709 this->grid_->rowComm, &reqsSendToLeft );
2711 #if ( _DEBUGlevel_ >= 1 )
2713 PCOL(snode.Index,this->grid_),this->grid_);
2714 statusOFS <<
"["<<snode.Index<<
"] "<<
" LReduce P"
2715 <<
MYPROC(this->grid_) <<
" has sent "
2716 << snode.LUpdateBuf.ByteSize() <<
" bytes to "
2717 << dst << std::endl;
2721 TIMER_STOP(Reduce_Sinv_L_Send);
2724 #if ( _DEBUGlevel_ >= 1 )
2725 statusOFS <<
"["<<snode.Index<<
"] "<<
"gemmProcessed = "
2726 << gemmProcessed<<
"/"<<gemmToDo<<std::endl;
2732 TIMER_STOP(Compute_Sinv_LU);
2741 TIMER_START(Reduce_Sinv_L);
2743 for (Int supidx=0; supidx<stepSuper; supidx++){
2745 if(
MYCOL( this->grid_ ) ==
PCOL( snode.Index, this->grid_ ) ){
2747 Int numRowLUpdateBuf;
2748 std::vector<LBlock<T> >& Lcol =
2749 this->L(
LBj( snode.Index, this->grid_ ) );
2753 (
MYROW( this->grid_ )==
PROW(snode.Index,this->grid_))?1:0;
2754 snode.RowLocalPtr.resize( Lcol.size() + 1 - offset );
2755 snode.BlockIdxLocal.resize( Lcol.size() - offset );
2756 snode.RowLocalPtr[0] = 0;
2757 for( Int ib = 0; ib < snode.BlockIdxLocal.size(); ib++ ){
2758 snode.RowLocalPtr[ib+1] = snode.RowLocalPtr[ib]
2759 + Lcol[ib+offset].numRow;
2760 snode.BlockIdxLocal[ib] = Lcol[ib+offset].blockIdx;
2762 numRowLUpdateBuf = *snode.RowLocalPtr.rbegin();
2765 if( numRowLUpdateBuf > 0 ){
2766 if( snode.LUpdateBuf.m() == 0 && snode.LUpdateBuf.n() == 0 ){
2767 snode.LUpdateBuf.Resize( numRowLUpdateBuf,
2768 SuperSize( snode.Index, this->super_ ) );
2770 SetValue( snode.LUpdateBuf, ZERO<T>() );
2774 #if ( _DEBUGlevel_ >= 2 )
2775 statusOFS <<
"["<<snode.Index<<
"] "<<
"LUpdateBuf Before Reduction: "
2776 << snode.LUpdateBuf << std::endl << std::endl;
2779 Int totCountRecv = 0;
2780 Int numRecv = this->CountSendToRight(snode.Index);
2781 NumMat<T> LUpdateBufRecv(numRowLUpdateBuf,
2782 SuperSize( snode.Index, this->super_ ) );
2783 for( Int countRecv = 0; countRecv < numRecv ; ++countRecv ){
2787 TIMER_START(L_RECV);
2788 MPI_Recv(LUpdateBufRecv.Data(), LUpdateBufRecv.ByteSize(),
2789 MPI_BYTE, MPI_ANY_SOURCE,
2790 IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_L_REDUCE),
2791 this->grid_->rowComm,&stat);
2793 MPI_Get_count(&stat, MPI_BYTE, &size);
2796 #if ( _DEBUGlevel_ >= 1 )
2797 Int src =
PNUM(
MYROW(this->grid_),stat.MPI_SOURCE,this->grid_);
2798 statusOFS <<
"["<<snode.Index<<
"] "<<
" LReduce P"
2799 <<
MYPROC(this->grid_)<<
" has received "<< size
2800 <<
" bytes from " << src << std::endl;
2802 #if ( _DEBUGlevel_ >= 2 )
2803 statusOFS <<
"["<<snode.Index<<
"] "<<
"LUpdateBufRecv: "
2804 << LUpdateBufRecv << std::endl << std::endl;
2807 blas::Axpy(snode.LUpdateBuf.Size(), ONE<T>(),
2808 LUpdateBufRecv.Data(), 1,
2809 snode.LUpdateBuf.Data(), 1 );
2812 #if ( _DEBUGlevel_ >= 2 )
2813 statusOFS <<
"["<<snode.Index<<
"] "<<
"LUpdateBuf After Reduction: "
2814 << snode.LUpdateBuf << std::endl << std::endl;
2819 TIMER_STOP(Reduce_Sinv_L);
2823 mpi::Waitall( arrMpireqsSendLToLeft );
2826 if(next_lidx < superList.size()){
2827 IRecvContentCD(superList[next_lidx], superList[next_lidx].size(),nextCDBuffers);
2834 PushCallStack(
"PMatrixUnsym::SelInv_P2p::UpdateD");
2837 TIMER_START(Update_Diagonal);
2838 for (Int supidx=0; supidx<stepSuper; supidx++){
2841 ComputeDiagUpdate(snode);
2843 if(
MYCOL( this->grid_ ) ==
PCOL( snode.Index, this->grid_ ) ){
2844 if(
MYROW( this->grid_ ) !=
PROW( snode.Index, this->grid_ ) ){
2845 if(this->isSendToDiagonal_(snode.Index)){
2847 MPI_Request & mpireqsSendToAbove = arrMpireqsSendToAbove[supidx];
2848 MPI_Isend( snode.DiagBuf.Data(), snode.DiagBuf.ByteSize(),
2849 MPI_BYTE,
PROW(snode.Index,this->grid_) ,
2850 IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_D_REDUCE),
2851 this->grid_->colComm, &mpireqsSendToAbove );
2853 #if ( _DEBUGlevel_ >= 1 )
2854 statusOFS <<
"["<<snode.Index<<
"] "<<
" P"<<
MYROW(this->grid_)
2855 <<
" has sent "<< snode.DiagBuf.ByteSize()
2856 <<
" bytes of DiagBuf to "
2857 <<
PROW(snode.Index,this->grid_) << std::endl;
2864 TIMER_STOP(Update_Diagonal);
2868 TIMER_START(Reduce_Diagonal);
2870 for (Int supidx=0; supidx<stepSuper; supidx++){
2872 if(
MYCOL( this->grid_ ) ==
PCOL( snode.Index, this->grid_ ) ){
2873 if(
MYROW( this->grid_ ) ==
PROW( snode.Index, this->grid_ ) ){
2874 if(snode.DiagBuf.Size()==0){
2875 snode.DiagBuf.Resize(
SuperSize( snode.Index, this->super_ ),
2876 SuperSize( snode.Index, this->super_ ));
2877 SetValue(snode.DiagBuf, ZERO<T>());
2880 Int totCountRecv = 0;
2881 Int numRecv = this->CountRecvFromBelow(snode.Index);
2882 NumMat<T> DiagBufRecv(snode.DiagBuf.m(),snode.DiagBuf.n());
2884 for( Int countRecv = 0; countRecv < numRecv ; ++countRecv ){
2888 TIMER_START(D_RECV);
2889 MPI_Recv(DiagBufRecv.Data(), DiagBufRecv.ByteSize(), MPI_BYTE,
2890 MPI_ANY_SOURCE,IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_D_REDUCE),
2891 this->grid_->colComm,&stat);
2893 MPI_Get_count(&stat, MPI_BYTE, &size);
2897 blas::Axpy(snode.DiagBuf.Size(), ONE<T>(),
2898 DiagBufRecv.Data(), 1,
2899 snode.DiagBuf.Data(), 1 );
2902 LBlock<T> & LB = this->L(
LBj( snode.Index, this->grid_ ) )[0];
2904 snode.DiagBuf.Data(), 1,
2905 LB.
nzval.Data(), 1 );
2912 TIMER_STOP(Reduce_Diagonal);
2920 TIMER_START(Reduce_Sinv_U);
2922 for (Int supidx=0; supidx<stepSuper; supidx++){
2926 if(
MYROW( this->grid_ ) ==
PROW( snode.Index, this->grid_ ) ){
2928 Int numColUUpdateBuf;
2930 std::vector<UBlock<T> >& Urow =
2931 this->U(
LBi( snode.Index, this->grid_ ) );
2933 snode.ColLocalPtr.resize( Urow.size() + 1 );
2934 snode.BlockIdxLocalU.resize( Urow.size() );
2935 snode.ColLocalPtr[0] = 0;
2969 Int urowsize = Urow.size();
2970 snode.ColLocalPtr[0] = 0;
2971 for( Int jb = 0; jb < Urow.size(); jb++ ){
2973 snode.ColLocalPtr[jb+1] = snode.ColLocalPtr[jb]+UB.
numCol;
2974 snode.BlockIdxLocalU[jb] = UB.
blockIdx;
2978 #if ( _DEBUGlevel_ >= 2 )
2979 statusOFS <<
"["<<snode.Index<<
"] "<<
"UReduce collocalptr "
2980 << snode.ColLocalPtr << std::endl;
2981 statusOFS <<
"["<<snode.Index<<
"] "<<
"UReduce blockidxlocalU "
2982 << snode.BlockIdxLocalU << std::endl;
2985 numColUUpdateBuf = *snode.ColLocalPtr.rbegin();
2987 if( numColUUpdateBuf > 0 ){
2988 if( snode.UUpdateBuf.m() == 0 && snode.UUpdateBuf.n() == 0 ){
2989 snode.UUpdateBuf.Resize(
SuperSize( snode.Index, this->super_ ),
2992 SetValue( snode.UUpdateBuf, ZERO<T>() );
2996 #if ( _DEBUGlevel_ >= 2 )
2997 statusOFS <<
"["<<snode.Index<<
"] "<<
"UUpdateBuf Before Reduction: "
2998 << snode.UUpdateBuf << std::endl << std::endl;
3001 Int totCountRecv = 0;
3003 Int numRecv = this->CountSendToBelow(snode.Index);
3008 #if ( _DEBUGlevel_ >= 1 )
3009 statusOFS <<
"["<<snode.Index<<
"] "<<
" UReduce P"
3010 <<
MYPROC(this->grid_) <<
" can receive at most "
3011 << UUpdateBufRecv.ByteSize() <<
" bytes" << std::endl;
3013 for( Int countRecv = 0; countRecv < numRecv ; ++countRecv ){
3017 TIMER_START(U_RECV);
3018 MPI_Recv(UUpdateBufRecv.Data(), UUpdateBufRecv.ByteSize(),
3019 MPI_BYTE, MPI_ANY_SOURCE,
3020 IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_U_REDUCE),
3021 this->grid_->colComm,&stat);
3023 MPI_Get_count(&stat, MPI_BYTE, &size);
3027 #if ( _DEBUGlevel_ >= 1 )
3028 Int src =
PNUM(stat.MPI_SOURCE,
MYCOL(this->grid_),this->grid_);
3029 statusOFS <<
"["<<snode.Index<<
"] "<<
" UReduce P"
3030 <<
MYPROC(this->grid_)<<
" has received "
3031 << size <<
" bytes from P" << src << std::endl;
3034 #if ( _DEBUGlevel_ >= 2 )
3035 statusOFS <<
"["<<snode.Index<<
"] "<<
"UUpdateBufRecv: "
3036 << UUpdateBufRecv << std::endl << std::endl;
3039 blas::Axpy(snode.UUpdateBuf.Size(), ONE<T>(),
3040 UUpdateBufRecv.Data(), 1,
3041 snode.UUpdateBuf.Data(), 1);
3044 #if ( _DEBUGlevel_ >= 2 )
3045 statusOFS <<
"["<<snode.Index<<
"] "<<
"UUpdateBuf After Reduction: "
3046 << snode.UUpdateBuf << std::endl << std::endl;
3052 TIMER_START(Reduce_Sinv_U_Send);
3053 if(!this->isRecvFromLeft_( snode.Index ) && this->isRecvFromAbove_( snode.Index ) ){
3054 MPI_Request & mpireqsSendToAbove = arrMpireqsSendUToAbove[supidx];
3057 MPI_Isend( NULL, 0, MPI_BYTE,
PROW(snode.Index,this->grid_),
3058 IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_U_REDUCE),
3059 this->grid_->colComm, &mpireqsSendToAbove );
3061 #if ( _DEBUGlevel_ >= 1 )
3062 Int dst =
PNUM(
PROW(snode.Index,this->grid_),
3063 MYCOL(this->grid_),this->grid_);
3064 statusOFS <<
"["<<snode.Index<<
"] "<<
" UReduce P"
3065 <<
MYPROC(this->grid_) <<
" has sent "
3066 << 0 <<
" bytes to "
3067 << dst << std::endl;
3070 TIMER_STOP(Reduce_Sinv_U_Send);
3073 TIMER_START(Reduce_Sinv_U_Send);
3076 if( this->isRecvFromAbove_( snode.Index ) && this->isRecvFromLeft_( snode.Index ) ){
3077 MPI_Request & reqsSendToAbove = arrMpireqsSendUToAbove[supidx];
3078 TIMER_START(Reduce_Sinv_U_Send_Isend);
3079 MPI_Isend( snode.UUpdateBuf.Data(), snode.UUpdateBuf.ByteSize(),
3080 MPI_BYTE,
PROW(snode.Index,this->grid_),
3081 IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_U_REDUCE),
3082 this->grid_->colComm, &reqsSendToAbove );
3083 TIMER_STOP(Reduce_Sinv_U_Send_Isend);
3084 #if ( _DEBUGlevel_ >= 1 )
3085 Int dst =
PNUM(
PROW(snode.Index,this->grid_),
3086 MYCOL(this->grid_),this->grid_);
3087 statusOFS <<
"["<<snode.Index<<
"] "<<
" UReduce P"
3088 <<
MYPROC(this->grid_) <<
" has sent "
3089 << snode.UUpdateBuf.ByteSize() <<
" bytes to "
3090 << dst << std::endl;
3093 TIMER_STOP(Reduce_Sinv_U_Send);
3099 TIMER_STOP(Reduce_Sinv_U);
3102 mpi::Waitall( arrMpireqsSendUToAbove );
3110 for (Int supidx=0; supidx<stepSuper; supidx++){
3112 if(
MYROW( this->grid_ ) ==
PROW( snode.Index, this->grid_ )){
3113 std::vector<LBlock<T> >& Lrow = this->Lrow(
LBi(snode.Index, this->grid_) );
3116 if(
MYCOL( this->grid_ ) ==
PCOL( snode.Index, this->grid_ )){
3117 std::vector<UBlock<T> >& Ucol = this->Ucol(
LBj(snode.Index, this->grid_) );
3123 PushCallStack(
"PMatrixUnsym::SelInv_P2p::UpdateLFinal");
3126 TIMER_START(Update_L);
3128 for (Int supidx=0; supidx<stepSuper; supidx++){
3131 #if ( _DEBUGlevel_ >= 1 )
3132 statusOFS << std::endl <<
"["<<snode.Index<<
"] "
3133 <<
"Finish updating the L part by filling LUpdateBufReduced"
3134 <<
" back to L" << std::endl << std::endl;
3137 if(
MYCOL( this->grid_ ) ==
PCOL( snode.Index, this->grid_ )
3138 && snode.LUpdateBuf.m() > 0 ){
3139 std::vector<LBlock<T> >& Lcol = this->L(
LBj(snode.Index,this->grid_));
3142 (
MYROW( this->grid_ ) ==
PROW( snode.Index, this->grid_ ))?1:0;
3143 for( Int ib = startBlock; ib < Lcol.size(); ib++ ){
3147 &snode.LUpdateBuf(snode.RowLocalPtr[ib-startBlock], 0),
3148 snode.LUpdateBuf.m(), LB.
nzval.Data(), LB.
numRow );
3155 TIMER_STOP(Update_L);
3163 PushCallStack(
"PMatrixUnsym::SelInv_P2p::UpdateUFinal");
3166 TIMER_START(Update_U);
3168 for (Int supidx=0; supidx<stepSuper; supidx++){
3171 #if ( _DEBUGlevel_ >= 1 )
3172 statusOFS <<
"["<<snode.Index<<
"] "
3173 <<
"Finish updating the U part by filling UUpdateBufReduced"
3174 <<
" back to U" << std::endl << std::endl;
3177 if(
MYROW( this->grid_ ) ==
PROW( snode.Index, this->grid_ )
3178 && snode.UUpdateBuf.m() > 0 ){
3179 std::vector<UBlock<T> >& Urow = this->U(
LBi(snode.Index,this->grid_));
3181 for( Int jb = 0; jb < Urow.size(); jb++ ){
3184 #if ( _DEBUGlevel_ >= 1 )
3185 statusOFS <<
"["<<snode.Index<<
"] "<<
"Putting colptr "
3186 << snode.ColLocalPtr[jb] <<
" of UUpdateBuf "
3187 << snode.ColLocalPtr <<
" "
3188 <<
"in UB "<<UB.
blockIdx<<std::endl;
3192 &snode.UUpdateBuf( 0, snode.ColLocalPtr[jb] ),
3193 snode.UUpdateBuf.m(), UB.
nzval.Data(), UB.
numRow );
3194 #if ( _DEBUGlevel_ >= 2 )
3195 statusOFS<<
"["<<snode.Index<<
"] "<<
"UB: "<<UB.
nzval<<std::endl;
3203 TIMER_STOP(Update_U);
3211 if(next_lidx < superList.size()){
3212 WaitContentLCD(superList[next_lidx], superList[next_lidx].size(),nextCDBuffers);
3213 WaitContentUCD(superList[next_lidx], superList[next_lidx].size(),nextCDBuffers);
3218 TIMER_START(Barrier);
3219 if(next_lidx < superList.size()){
3220 nextCDBuffers.WaitAllSend();
3222 mpi::Waitall(arrMpireqsRecvLContentFromAny);
3223 mpi::Waitall(arrMpireqsRecvUContentFromAny);
3227 mpi::Waitall(arrMpireqsSendToAbove);
3229 for (Int supidx=0; supidx<stepSuper; supidx++){
3230 Int ksup = superList[lidx][supidx];
3231 std::vector<MPI_Request> & mpireqsSendLToRight = arrMpireqsSendLToRight[supidx];
3232 std::vector<MPI_Request> & mpireqsSendLToBelow = arrMpireqsSendLToBelow[supidx];
3233 std::vector<MPI_Request> & mpireqsSendUToRight = arrMpireqsSendUToRight[supidx];
3234 std::vector<MPI_Request> & mpireqsSendUToBelow = arrMpireqsSendUToBelow[supidx];
3236 mpi::Waitall( mpireqsSendLToRight );
3237 mpi::Waitall( mpireqsSendLToBelow );
3238 mpi::Waitall( mpireqsSendUToRight );
3239 mpi::Waitall( mpireqsSendUToBelow );
3242 TIMER_STOP(Barrier);
3248 if (this->options_->maxPipelineDepth!=-1)
3251 MPI_Barrier(this->grid_->comm);
3257 template<
typename T>
3260 this->SelInv_P2p ( );
3263 template<
typename T>
3266 TIMER_START(SelInv_P2p);
3269 PushCallStack(
"PMatrixUnsym::SelInv_P2p");
3276 std::vector<std::vector<Int> > & superList = this->WorkingSet();
3277 Int numSteps = superList.size();
3279 for (Int lidx=0; lidx<numSteps ; lidx++){
3280 Int stepSuper = superList[lidx].size();
3281 this->SelInvIntra_P2p(lidx);
3288 TIMER_STOP(SelInv_P2p);
3305 template<
typename T>
3309 PushCallStack(
"PMatrixUnsym::PreSelInv");
3315 PushCallStack(
"L(i,k) <- L(i,k) * L(k,k)^{-1}");
3317 #if ( _DEBUGlevel_ >= 1 )
3318 statusOFS << std::endl <<
"L(i,k) <- L(i,k) * L(k,k)^{-1}"
3319 << std::endl << std::endl;
3322 for( Int ksup = 0; ksup < numSuper; ksup++ ){
3323 if(
MYCOL( this->grid_ ) ==
PCOL( ksup, this->grid_ ) ){
3326 std::vector<LBlock<T> >& Lcol = this->L(
LBj( ksup, this->grid_ ) );
3327 if(
MYROW( this->grid_ ) ==
PROW( ksup, this->grid_ ) ){
3328 nzvalLDiag = Lcol[0].nzval;
3329 if( nzvalLDiag.m() !=
SuperSize(ksup, this->super_) ||
3330 nzvalLDiag.n() !=
SuperSize(ksup, this->super_) ){
3334 throw std::runtime_error(
3335 "The size of the diagonal block of L is wrong." );
3341 MPI_Bcast( (
void*)nzvalLDiag.Data(), nzvalLDiag.ByteSize(),
3342 MPI_BYTE,
PROW( ksup, this->grid_ ), this->grid_->colComm );
3345 for( Int ib = 0; ib < Lcol.size(); ib++ ){
3348 #if ( _DEBUGlevel_ >= 2 )
3358 #if defined( CHECK_NORM )
3362 ONE<T>(), nzvalLDiag.Data(), LB.
numCol,
3364 #if defined( CHECK_NORM )
3367 blas::Trmm(
'R',
'L',
'N',
'U',res.m(),res.n(),ONE<T>(),nzvalLDiag.Data(),nzvalLDiag.m(),res.Data(),res.m());
3368 blas::Axpy(res.Size(), MINUS_ONE<T>(), Ljk.Data(), 1, res.Data(), 1 );
3369 double norm = lapack::Lange(
'F',res.m(),res.n(),res.Data(),res.m(),Ljk.Data());
3371 statusOFS <<
"After solve norm of residual of L(" << LB.
blockIdx <<
", " << ksup
3372 <<
"): " << norm << std::endl;
3398 PushCallStack(
"U(k,j) <- U(k,k)^{-1} * U(k,j)");
3400 #if ( _DEBUGlevel_ >= 1 )
3401 statusOFS << std::endl <<
"U(k,j) <- U(k,k)^{-1} * U(k,j)"
3402 << std::endl << std::endl;
3404 for( Int ksup = 0; ksup < numSuper; ksup++ ){
3405 if(
MYROW( this->grid_ ) ==
PROW( ksup, this->grid_ ) ){
3408 std::vector<UBlock<T> >& Urow = this->U(
LBi( ksup, this->grid_ ) );
3409 if(
MYCOL( this->grid_ ) ==
PCOL( ksup, this->grid_ ) ){
3410 std::vector<LBlock<T> >& Lcol = this->L(
LBj( ksup, this->grid_ ) );
3411 nzvalUDiag = Lcol[0].nzval;
3412 if( nzvalUDiag.m() !=
SuperSize(ksup, this->super_) ||
3413 nzvalUDiag.n() !=
SuperSize(ksup, this->super_) ){
3417 throw std::runtime_error(
3418 "The size of the diagonal block of U is wrong." );
3424 MPI_Bcast( (
void*)nzvalUDiag.Data(), nzvalUDiag.ByteSize(),
3425 MPI_BYTE,
PCOL( ksup, this->grid_ ), this->grid_->rowComm );
3428 for( Int jb = 0; jb < Urow.size(); jb++ ){
3431 #if ( _DEBUGlevel_ >= 2 )
3440 #if defined( CHECK_NORM )
3444 ONE<T>(), nzvalUDiag.Data(), UB.
numRow,
3446 #if defined( CHECK_NORM )
3449 blas::Trmm(
'L',
'U',
'N',
'N',res.m(),res.n(),ONE<T>(),nzvalUDiag.Data(),nzvalUDiag.m(),res.Data(),res.m());
3450 blas::Axpy(res.Size(), MINUS_ONE<T>(), Ukj.Data(), 1, res.Data(), 1 );
3451 double norm = lapack::Lange(
'F',res.m(),res.n(),res.Data(),res.m(),Ukj.Data());
3452 statusOFS <<
"After solve, norm of residual of U(" << ksup <<
", " << UB.
blockIdx
3453 <<
"): " << norm << std::endl;
3476 PushCallStack(
"L(i,i) <- [L(k,k) * U(k,k)]^{-1} ");
3478 #if ( _DEBUGlevel_ >= 1 )
3479 statusOFS << std::endl <<
"L(i,i) <- [L(k,k) * U(k,k)]^{-1}" << std::endl
3483 for( Int ksup = 0; ksup < numSuper; ksup++ ){
3484 if(
MYROW( this->grid_ ) ==
PROW( ksup, this->grid_ ) &&
3485 MYCOL( this->grid_ ) ==
PCOL( ksup, this->grid_ ) ){
3489 for(Int i = 0; i <
SuperSize( ksup, this->super_ ); i++){
3492 LBlock<T> & LB = (this->L(
LBj( ksup, this->grid_ ) ))[0];
3493 #if ( _DEBUGlevel_ >= 2 )
3500 #if defined( CHECK_NORM )
3504 SuperSize( ksup, this->super_ ), ipiv.Data() );
3506 #if defined( CHECK_NORM )
3512 for(Int i = 0; i<Akk.m();++i){
3513 for(Int j = i; j<Akk.n();++j){
3517 blas::Trmm(
'L',
'L',
'N',
'U',Akk.m(),Akk.n(),ONE<T>(),Lkk.Data(),Lkk.m(),Akk.Data(),Akk.m());
3523 blas::Gemm(
'N',
'N',res.m(),res.n(),res.m(),ONE<T>(),LB.
nzval.Data(),res.m(),Akk.Data(),res.m(),ZERO<T>(),res.Data(),res.m());
3524 for(Int i = 0; i<res.m();++i){
3528 double norm = lapack::Lange(
'F',res.m(),res.n(),res.Data(),res.m(),Lkk.Data());
3529 statusOFS <<
"After inversion, norm of residual of A(" << ksup <<
", " << ksup
3530 <<
"): " << norm << std::endl;
3556 template<
typename T>
3559 ConstructCommunicationPattern_P2p();
3564 template<
typename T>
3568 PushCallStack(
"PMatrixUnsym::ConstructCommunicationPattern_P2p");
3571 TIMER_START(ConstructCommunicationPattern);
3575 TIMER_START(Allocate);
3578 PushCallStack(
"Initialize the communication pattern" );
3580 this->isSendToBelow_.Resize(this->grid_->numProcRow, numSuper);
3581 this->isSendToRight_.Resize(this->grid_->numProcCol, numSuper);
3582 this->isSendToDiagonal_.Resize( numSuper );
3583 SetValue( this->isSendToBelow_,
false );
3584 SetValue( this->isSendToRight_,
false );
3585 SetValue( this->isSendToDiagonal_,
false );
3587 this->isSendToCrossDiagonal_.Resize(this->grid_->numProcCol+1, numSuper );
3588 SetValue( this->isSendToCrossDiagonal_,
false );
3589 this->isRecvFromCrossDiagonal_.Resize(this->grid_->numProcRow+1, numSuper );
3590 SetValue( this->isRecvFromCrossDiagonal_,
false );
3592 this->isRecvFromAbove_.Resize( numSuper );
3593 this->isRecvFromLeft_.Resize( numSuper );
3594 this->isRecvFromBelow_.Resize( this->grid_->numProcRow, numSuper );
3595 SetValue( this->isRecvFromAbove_,
false );
3596 SetValue( this->isRecvFromBelow_,
false );
3597 SetValue( this->isRecvFromLeft_,
false );
3602 TIMER_STOP(Allocate);
3605 TIMER_START(GetEtree);
3606 std::vector<Int> snodeEtree(this->
NumSuper());
3607 this->GetEtree(snodeEtree);
3608 TIMER_STOP(GetEtree);
3611 PushCallStack(
"Local column communication" );
3616 std::vector<std::vector<LBlock<T> > * > colBlockRowBuf;
3617 colBlockRowBuf.resize( numSuper );
3618 std::vector<std::vector<UBlock<T> > * > rowBlockColBuf;
3619 rowBlockColBuf.resize( numSuper );
3620 for( Int ksup = 0; ksup < numSuper; ksup++ ){
3621 if(
MYCOL( this->grid_ ) ==
PCOL( ksup, this->grid_ )
3622 ||
MYROW( this->grid_ ) ==
PROW( ksup, this->grid_ ) ){
3623 colBlockRowBuf[ksup] =
new std::vector<LBlock<T> >();
3624 if(
MYROW( this->grid_ ) ==
PROW( ksup, this->grid_ ) ){
3625 rowBlockColBuf[ksup] =
new std::vector<UBlock<T> >();
3633 TIMER_START(Column_communication);
3634 for( Int ksup = 0; ksup < numSuper; ksup++ ){
3636 if(
MYCOL( this->grid_ ) ==
PCOL( ksup, this->grid_ ) ){
3638 std::vector< LBlock<T> > & Lcol = this->L(
LBj(ksup, this->grid_ ) );
3639 std::vector< LBlock<T> > & LcolRecv = *colBlockRowBuf[ksup];
3640 std::vector<char> sendBuf;
3641 std::vector<char> recvBuf;
3644 std::stringstream sstms,sstmr;
3645 std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
3646 mask[LBlockMask::NZVAL] = 0;
3649 Int numLBlock, totalLBlock;
3650 numLBlock = Lcol.size();
3651 mpi::Allreduce ( &numLBlock, &totalLBlock, 1, MPI_SUM, this->grid_->colComm);
3654 for( Int ib = 0; ib < Lcol.size(); ib++ ){
3655 serialize( Lcol[ib], sstms, mask );
3657 sendBuf.resize( Size( sstms ) );
3658 sstms.read( &sendBuf[0], sendBuf.size() );
3659 localSize = sendBuf.size();
3662 TIMER_START(Allgatherv_Column_communication);
3663 if( this->grid_ -> mpisize != 1 )
3665 mpi::Allgatherv( sendBuf, recvBuf, this->grid_->colComm );
3670 TIMER_STOP(Allgatherv_Column_communication);
3673 sstmr.write( &recvBuf[0], recvBuf.size() );
3675 LcolRecv.resize(totalLBlock);
3679 for( Int ib = 0; ib < totalLBlock; ib++ ){
3680 deserialize( LcolRecv[ib], sstmr, mask );
3685 std::sort(LcolRecv.begin(),LcolRecv.end(),LBlockComparator<T>);
3688 #if ( _DEBUGlevel_ >= 1 )
3689 statusOFS<<
"["<<ksup<<
"] "<<
"Lcol1: ";
for(
auto it = Lcol.begin();it != Lcol.end();++it){statusOFS<<it->blockIdx<<
" ";}statusOFS<<endl;
3690 statusOFS<<
"["<<ksup<<
"] "<<
"LcolRecv1: ";
for(
auto it = LcolRecv.begin();it != LcolRecv.end();++it){statusOFS<<it->blockIdx<<
" ";}statusOFS<<endl;
3694 TIMER_STOP(Column_communication);
3701 PushCallStack(
"Local row communication" );
3703 TIMER_START(Row_communication);
3704 for( Int ksup = 0; ksup < numSuper; ksup++ ){
3706 if(
MYROW( this->grid_ ) ==
PROW( ksup, this->grid_ ) ){
3708 std::vector< UBlock<T> > & Urow = this->U(
LBi(ksup, this->grid_ ) );
3709 std::vector< UBlock<T> > & UrowRecv = *rowBlockColBuf[ksup];
3710 std::vector<char> sendBuf;
3711 std::vector<char> recvBuf;
3714 std::stringstream sstms,sstmr;
3715 std::vector<Int> mask( UBlockMask::TOTAL_NUMBER, 1 );
3716 mask[UBlockMask::NZVAL] = 0;
3718 Int numUBlock, totalUBlock;
3719 numUBlock = Urow.size();
3720 mpi::Allreduce ( &numUBlock, &totalUBlock, 1, MPI_SUM, this->grid_->rowComm);
3723 for( Int jb = 0; jb < Urow.size(); jb++ ){
3724 serialize( Urow[jb], sstms, mask );
3726 sendBuf.resize( Size( sstms ) );
3727 sstms.read( &sendBuf[0], sendBuf.size() );
3728 localSize = sendBuf.size();
3731 TIMER_START(Allgatherv_Row_communication);
3732 if( this->grid_ -> mpisize != 1 )
3733 mpi::Allgatherv( sendBuf, recvBuf, this->grid_->rowComm );
3736 TIMER_STOP(Allgatherv_Row_communication);
3738 sstmr.write( &recvBuf[0], recvBuf.size() );
3742 UrowRecv.resize(totalUBlock);
3743 for( Int jb = 0; jb < totalUBlock; jb++ ){
3744 deserialize( UrowRecv[jb], sstmr, mask );
3748 std::sort(UrowRecv.begin(),UrowRecv.end(),UBlockComparator<T>);
3752 #if ( _DEBUGlevel_ >= 1 )
3753 statusOFS<<
"["<<ksup<<
"] "<<
"Urow1: ";
for(
auto it = Urow.begin();it != Urow.end();++it){statusOFS<<it->blockIdx<<
" ";}statusOFS<<endl;
3754 statusOFS<<
"["<<ksup<<
"] "<<
"UrowRecv1: ";
for(
auto it = UrowRecv.begin();it != UrowRecv.end();++it){statusOFS<<it->blockIdx<<
" ";}statusOFS<<endl;
3759 TIMER_STOP(Row_communication);
3765 PushCallStack(
"Local row communication" );
3768 TIMER_START(Row_communication);
3769 for( Int ksup = 0; ksup < numSuper; ksup++ ){
3771 if(
MYROW( this->grid_ ) ==
PROW( ksup, this->grid_ ) ){
3773 std::vector< LBlock<T> > * pLcolRecv;
3774 std::vector< LBlock<T> > & LcolSend = *colBlockRowBuf[ksup];
3775 std::vector< UBlock<T> > & UrowRecv = *rowBlockColBuf[ksup];
3776 std::vector< LBlock<T> > Union;
3777 std::vector<char> sendBuf;
3779 std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
3780 mask[LBlockMask::NZVAL] = 0;
3781 if( this->grid_ -> mpisize != 1 ){
3782 pLcolRecv =
new std::vector<LBlock<T> >();
3784 if(
MYCOL( this->grid_ ) ==
PCOL( ksup, this->grid_ ) ){
3786 std::stringstream sstms;
3788 serialize( (Int)LcolSend.size(), sstms, NO_MASK );
3789 for( Int ib = 0; ib < LcolSend.size(); ib++ ){
3790 serialize( LcolSend[ib], sstms, mask );
3792 sendBuf.resize( Size( sstms ) );
3793 sstms.read( &sendBuf[0], sendBuf.size() );
3794 localSize = sendBuf.size();
3796 TIMER_START(Bcast_Row_communication);
3797 mpi::Bcast( sendBuf,
PCOL(ksup,this->grid_), this->grid_->rowComm );
3798 TIMER_STOP(Bcast_Row_communication);
3800 std::stringstream sstmr;
3801 sstmr.write( &sendBuf[0], sendBuf.size() );
3804 deserialize( numLBlock, sstmr, NO_MASK );
3805 pLcolRecv->resize(numLBlock);
3806 for( Int ib = 0; ib < numLBlock; ib++ ){
3807 deserialize( (*pLcolRecv)[ib], sstmr, mask );
3811 pLcolRecv = &LcolSend;
3817 auto result = back_inserter(Union);
3818 auto firstUrow = UrowRecv.begin();
3819 auto lastUrow = UrowRecv.end();
3820 auto firstLcol = pLcolRecv->begin();
3821 auto lastLcol = pLcolRecv->end();
3824 if (firstUrow==lastUrow){
3825 result = std::copy(firstLcol,lastLcol,result);
3828 if (firstLcol==lastLcol){
3829 for(
auto it = firstUrow; it != lastUrow; ++it){
3841 if (firstUrow->blockIdx<firstLcol->blockIdx) {
3844 LB.
numRow = firstUrow->numCol;
3845 LB.
numCol = firstUrow->numRow;
3846 LB.
rows = firstUrow->cols;
3850 else if (firstLcol->blockIdx<firstUrow->blockIdx) {
3851 *result = *firstLcol;
3856 std::set<Int> uRowCol;
3857 uRowCol.insert(&firstLcol->rows[0],
3858 &firstLcol->rows[0]+firstLcol->numRow);
3859 uRowCol.insert(&firstUrow->cols[0],
3860 &firstUrow->cols[0]+firstUrow->numCol);
3864 LB.
numRow = uRowCol.size();
3865 LB.
numCol = firstUrow->numRow;
3866 LB.
rows.Resize(uRowCol.size());
3867 std::copy(uRowCol.begin(),uRowCol.end(),&LB.
rows[0]);
3875 #if ( _DEBUGlevel_ >= 1 )
3876 statusOFS<<
"["<<ksup<<
"] "<<
"Lcol: ";
for(
auto it = pLcolRecv->begin();it != pLcolRecv->end();++it){statusOFS<<it->blockIdx<<
" ";}statusOFS<<endl;
3877 statusOFS<<
"["<<ksup<<
"] "<<
"Urow: ";
for(
auto it = UrowRecv.begin();it != UrowRecv.end();++it){statusOFS<<it->blockIdx<<
" ";}statusOFS<<endl;
3878 statusOFS<<
"["<<ksup<<
"] "<<
"Union: ";
for(
auto it = Union.begin();it != Union.end();++it){statusOFS<<it->blockIdx<<
" ";}statusOFS<<endl;
3882 LcolSend.swap(Union);
3884 if( this->grid_ -> mpisize != 1 ){
3889 TIMER_STOP(Row_communication);
3896 PushCallStack(
"Local col communication" );
3898 TIMER_START(Col_communication);
3899 for( Int ksup = 0; ksup < numSuper; ksup++ ){
3901 if(
MYCOL( this->grid_ ) ==
PCOL( ksup, this->grid_ ) ){
3903 std::vector< LBlock<T> > * pUnionRecv;
3904 std::vector< LBlock<T> > & UnionSend = *colBlockRowBuf[ksup];
3906 std::vector<char> sendBuf;
3908 std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
3909 mask[LBlockMask::NZVAL] = 0;
3910 if( this->grid_ -> mpisize != 1 ){
3911 pUnionRecv =
new std::vector<LBlock<T> >();
3913 if(
MYROW( this->grid_ ) ==
PROW( ksup, this->grid_ ) ){
3915 std::stringstream sstms;
3917 serialize( (Int)UnionSend.size(), sstms, NO_MASK );
3918 for( Int ib = 0; ib < UnionSend.size(); ib++ ){
3919 serialize( UnionSend[ib], sstms, mask );
3921 sendBuf.resize( Size( sstms ) );
3922 sstms.read( &sendBuf[0], sendBuf.size() );
3923 localSize = sendBuf.size();
3925 TIMER_START(Bcast_Col_communication);
3926 mpi::Bcast( sendBuf,
PROW(ksup,this->grid_), this->grid_->colComm );
3927 TIMER_STOP(Bcast_Col_communication);
3929 std::stringstream sstmr;
3930 sstmr.write( &sendBuf[0], sendBuf.size() );
3933 deserialize( numLBlock, sstmr, NO_MASK );
3934 pUnionRecv->resize(numLBlock);
3935 for( Int ib = 0; ib < numLBlock; ib++ ){
3936 deserialize( (*pUnionRecv)[ib], sstmr, mask );
3940 UnionSend.resize(pUnionRecv->size());
3941 std::copy(pUnionRecv->begin(),pUnionRecv->end(),UnionSend.begin());
3944 pUnionRecv = &UnionSend;
3947 #if ( _DEBUGlevel_ >= 1 )
3948 statusOFS<<
"["<<ksup<<
"] "<<
"Union: ";
for(
auto it = UnionSend.begin();it != UnionSend.end();++it){statusOFS<<it->blockIdx<<
" ";}statusOFS<<endl;
3951 if( this->grid_ -> mpisize != 1 ){
3956 TIMER_STOP(Col_communication);
3962 PushCallStack(
"Extending_UL");
3965 for( Int ksup = 0; ksup < numSuper; ksup++ ){
3967 if(
MYROW( this->grid_ ) ==
PROW( ksup, this->grid_ ) ){
3969 std::vector< LBlock<T> > & Union = *colBlockRowBuf[ksup];
3970 std::vector< UBlock<T> > & Urow = this->U(
LBi(ksup, this->grid_ ) );
3971 Int & LrowSize = this->LrowSize_[
LBi(ksup, this->grid_ ) ];
3972 TIMER_START(Computing_Lrow_size);
3975 for(
auto it = Union.begin(); it!=Union.end();++it){
3976 Int Idx = it->blockIdx;
3977 if(Idx > ksup && (Idx % this->grid_->numProcCol) ==
MYCOL(this->grid_) ){
3981 TIMER_STOP(Computing_Lrow_size);
3983 TIMER_START(Extending_Urow);
3984 Urow.reserve(LrowSize);
3986 for(
auto it = Union.begin(); it!=Union.end();++it){
3987 Int Idx = it->blockIdx;
3988 if(Idx > ksup && (Idx % this->grid_->numProcCol) ==
MYCOL(this->grid_) ){
3989 bool isFound =
false;
3991 for(Int jb = 0; jb < Urow.size(); ++jb){
4018 assert(UB.
numRow == it->numCol);
4019 if( UB.
numCol != it->numRow ){
4031 for(Int j = 0; j<UB.
numCol; ++j){
4032 Int newCol = UB.
cols[j];
4033 if(jOldCols<tmpCols.m()){
4034 Int oldCol = tmpCols[jOldCols];
4035 if(newCol == oldCol){
4036 T * nzcolPtr = tmpNzval.VecData(jOldCols);
4037 std::copy(nzcolPtr,nzcolPtr+UB.
numRow,UB.
nzval.VecData(j));
4045 assert(jOldCols>=tmpCols.m());
4052 #if ( _DEBUGlevel_ >= 1 )
4053 statusOFS<<
"["<<ksup<<
"] "<<
"LrowSize = "<<LrowSize<<std::endl;
4055 std::sort(Urow.begin(),Urow.end(),UBlockComparator<T>);
4056 #if ( _DEBUGlevel_ >= 1 )
4057 statusOFS<<
"["<<ksup<<
"] "<<
"Urow extended: ";
for(
auto it = Urow.begin();it != Urow.end();++it){statusOFS<<it->blockIdx<<
" ";}statusOFS<<endl;
4059 TIMER_STOP(Extending_Urow);
4063 for( Int ksup = 0; ksup < numSuper; ksup++ ){
4065 if(
MYCOL( this->grid_ ) ==
PCOL( ksup, this->grid_ ) ){
4067 std::vector< LBlock<T> > & Union = *colBlockRowBuf[ksup];
4068 std::vector< LBlock<T> > & Lcol = this->L(
LBj(ksup, this->grid_ ) );
4069 Int & UcolSize = this->UcolSize_[
LBj(ksup, this->grid_ ) ];
4070 TIMER_START(Computing_Ucol_size);
4073 for(
auto it = Union.begin(); it!=Union.end();++it){
4074 Int Idx = it->blockIdx;
4075 if(Idx > ksup && (Idx % this->grid_->numProcRow) ==
MYROW(this->grid_) ){
4079 TIMER_STOP(Computing_Ucol_size);
4081 TIMER_START(Extending_Lcol);
4082 Lcol.reserve(UcolSize);
4084 for(
auto it = Union.begin(); it!=Union.end();++it){
4085 Int Idx = it->blockIdx;
4086 if(Idx > ksup && (Idx % this->grid_->numProcRow) ==
MYROW(this->grid_) ){
4087 bool isFound =
false;
4089 for(Int ib = 0; ib < Lcol.size(); ++ib){
4103 Lcol.push_back(*it);
4111 assert(LB.
numCol == it->numCol);
4112 if( LB.
numRow != it->numRow ){
4124 for(Int i = 0; i<LB.
numRow; ++i){
4125 Int newRow = LB.
rows[i];
4126 if(iOldRows<tmpRows.m()){
4127 Int oldRow = tmpRows[iOldRows];
4128 if(newRow == oldRow){
4129 for(Int j = 0; j<LB.
numCol; ++j){
4130 LB.
nzval(i,j) = tmpNzval(iOldRows,j);
4139 assert(iOldRows>=tmpRows.m());
4146 #if ( _DEBUGlevel_ >= 1 )
4147 statusOFS<<
"["<<ksup<<
"] "<<
"UcolSize = "<<UcolSize<<std::endl;
4149 std::sort(Lcol.begin(),Lcol.end(),LBlockComparator<T>);
4150 #if ( _DEBUGlevel_ >= 1 )
4151 statusOFS<<
"["<<ksup<<
"] "<<
"Lcol extended: ";
for(
auto it = Lcol.begin();it != Lcol.end();++it){statusOFS<<it->blockIdx<<
" ";}statusOFS<<endl;
4153 TIMER_STOP(Extending_Lcol);
4164 PushCallStack(
"Compute_full_row_struct");
4168 std::vector<Int> nextNZColBlockRow;
4169 nextNZColBlockRow.resize( this->NumLocalBlockCol(), 0 );
4171 std::vector<Int> nextNZRowBlockCol;
4172 nextNZRowBlockCol.resize( this->NumLocalBlockRow() , 0);
4175 for( Int ksup = 0; ksup < numSuper; ksup++ ){
4177 std::vector<Int> tAllBlockRowIdx;
4178 if(
MYCOL( this->grid_ ) ==
PCOL( ksup, this->grid_ ) ){
4180 std::vector< LBlock<T> > & Union = *colBlockRowBuf[ksup];
4182 std::vector<Int> tlocalBlockRowIdx;
4185 for( Int jsup = 0; jsup < ksup; jsup++ ){
4187 Int pjsup = snodeEtree[jsup];
4188 if(pjsup<=ksup &&
MYROW( this->grid_ ) ==
PROW( jsup, this->grid_ ) ){
4190 std::vector< UBlock<T> > & Urow = this->U(
LBi(jsup, this->grid_ ) );
4194 #if ( _DEBUGlevel_ >= 1 )
4195 statusOFS<<
"["<<ksup<<
"] "<<
"Urow["<<jsup<<
"]: ";
for(
auto it = Urow.begin(); it != Urow.end(); ++it){ statusOFS<<it->blockIdx<<
" "; } statusOFS<<endl;
4198 Int & nextIdx = nextNZRowBlockCol[
LBi(jsup, this->grid_) ];
4199 while(nextIdx<Urow.size()){
4200 if(Urow[nextIdx].blockIdx < ksup){
4207 if(nextIdx<Urow.size()){
4208 if(Urow[nextIdx].blockIdx == ksup){
4209 tlocalBlockRowIdx.push_back(jsup);
4220 TIMER_START(Allgatherv_Column_communication);
4221 if( this->grid_ -> mpisize != 1 )
4222 mpi::Allgatherv( tlocalBlockRowIdx, tAllBlockRowIdx, this->grid_->colComm );
4224 tAllBlockRowIdx = tlocalBlockRowIdx;
4226 TIMER_STOP(Allgatherv_Column_communication);
4231 for(
auto it = tAllBlockRowIdx.begin(); it != tAllBlockRowIdx.end(); ++it ){
4236 Union.push_back(LB);
4239 std::sort(Union.begin(),Union.end(),LBlockComparator<T>);
4241 #if ( _DEBUGlevel_ >= 1 )
4242 statusOFS<<
"["<<ksup<<
"] "<<
"NEW Union: ";
4243 for(
auto it = Union.begin(); it != Union.end(); ++it){ statusOFS<<it->blockIdx<<
" "; }
4252 for( Int ksup = 0; ksup < numSuper; ksup++ ){
4254 std::vector<Int> tAllBlockColIdx;
4255 if(
MYROW( this->grid_ ) ==
PROW( ksup, this->grid_ ) ){
4257 std::vector< LBlock<T> > & Union = *colBlockRowBuf[ksup];
4258 std::vector<Int> tlocalBlockColIdx;
4260 for( Int jsup = 0; jsup < ksup; jsup++ ){
4262 Int pjsup = snodeEtree[jsup];
4263 if(pjsup<=ksup &&
MYCOL( this->grid_ ) ==
PCOL( jsup, this->grid_ ) ){
4264 std::vector< LBlock<T> > & Lcol = this->L(
LBj(jsup, this->grid_ ) );
4268 #if ( _DEBUGlevel_ >= 1 )
4269 statusOFS<<
"["<<ksup<<
"] "<<
"Lcol["<<jsup<<
"]: ";
for(
auto it = Lcol.begin(); it != Lcol.end(); ++it){ statusOFS<<it->blockIdx<<
" "; } statusOFS<<endl;
4272 Int & nextIdx = nextNZColBlockRow[
LBj(jsup, this->grid_) ];
4273 while(nextIdx<Lcol.size()){
4274 if(Lcol[nextIdx].blockIdx < ksup){
4281 if(nextIdx<Lcol.size()){
4282 if(Lcol[nextIdx].blockIdx == ksup){
4283 tlocalBlockColIdx.push_back(jsup);
4298 TIMER_START(Allgatherv_Row_communication);
4299 if( this->grid_ -> mpisize != 1 )
4300 mpi::Allgatherv( tlocalBlockColIdx, tAllBlockColIdx, this->grid_->rowComm );
4302 tAllBlockColIdx = tlocalBlockColIdx;
4304 TIMER_STOP(Allgatherv_Row_communication);
4309 for(
auto it = tAllBlockColIdx.begin(); it != tAllBlockColIdx.end(); ++it ){
4314 Union.push_back(LB);
4317 std::sort(Union.begin(),Union.end(),LBlockComparator<T>);
4318 auto last = std::unique(Union.begin(),Union.end(),LBlockEqualComparator<T>);
4319 Union.erase(last,Union.end());
4321 #if ( _DEBUGlevel_ >= 1 )
4322 statusOFS<<
"["<<ksup<<
"] "<<
"NEW Union: ";
4323 for(
auto it = Union.begin(); it != Union.end(); ++it){ statusOFS<<it->blockIdx<<
" "; }
4337 TIMER_START(STB_RFA);
4339 PushCallStack(
"SendToBelow / RecvFromAbove");
4341 for( Int ksup = 0; ksup < numSuper - 1; ksup++ ){
4345 Int jsup = snodeEtree[ksup];
4346 while(jsup<numSuper){
4347 Int jsupLocalBlockCol =
LBj( jsup, this->grid_ );
4348 Int jsupProcCol =
PCOL( jsup, this->grid_ );
4349 if(
MYCOL( this->grid_ ) == jsupProcCol ){
4350 std::vector< LBlock<T> > & Union = *colBlockRowBuf[jsup];
4354 for(
auto it = Union.begin();it!=Union.end();++it){
4355 if(it->blockIdx == ksup){
4359 else if(it->blockIdx > ksup){
4364 for(
auto si = Union.begin(); si != Union.end(); si++ ){
4365 Int isup = si->blockIdx;
4366 Int isupProcRow =
PROW( isup, this->grid_ );
4368 if(
MYROW( this->grid_ ) == isupProcRow ){
4369 this->isRecvFromAbove_(ksup) =
true;
4371 if(
MYROW( this->grid_ ) ==
PROW( ksup, this->grid_ ) ){
4372 this->isSendToBelow_( isupProcRow, ksup ) =
true;
4379 jsup = snodeEtree[jsup];
4383 #if ( _DEBUGlevel_ >= 1 )
4384 statusOFS << std::endl <<
"this->isSendToBelow:" << std::endl;
4385 for(
int j = 0;j< this->isSendToBelow_.n();j++){
4386 statusOFS <<
"["<<j<<
"] ";
4387 for(
int i =0; i < this->isSendToBelow_.m();i++){
4388 statusOFS<< this->isSendToBelow_(i,j) <<
" ";
4390 statusOFS<<std::endl;
4393 statusOFS << std::endl <<
"this->isRecvFromAbove:" << std::endl;
4394 for(
int j = 0;j< this->isRecvFromAbove_.m();j++){
4395 statusOFS <<
"["<<j<<
"] "<< this->isRecvFromAbove_(j)<<std::endl;
4404 TIMER_STOP(STB_RFA);
4413 TIMER_START(STR_RFL_RFB);
4417 PushCallStack(
"SendToRight / RecvFromLeft");
4419 for( Int ksup = 0; ksup < numSuper - 1; ksup++ ){
4422 Int isup = snodeEtree[ksup];
4423 while(isup<numSuper){
4424 Int isupLocalBlockRow =
LBi( isup, this->grid_ );
4425 Int isupProcRow =
PROW( isup, this->grid_ );
4426 if(
MYROW( this->grid_ ) == isupProcRow ){
4427 std::vector< LBlock<T> > & Union = *colBlockRowBuf[isup];
4431 for(
auto it = Union.begin();it!=Union.end();++it){
4432 if(it->blockIdx == ksup){
4436 else if(it->blockIdx > ksup){
4441 for(
auto si = Union.begin(); si != Union.end(); si++ ){
4442 Int jsup = si->blockIdx;
4443 Int jsupProcCol =
PCOL( jsup, this->grid_ );
4445 if(
MYCOL( this->grid_ ) == jsupProcCol ){
4446 this->isRecvFromLeft_(ksup) =
true;
4448 if(
MYCOL( this->grid_ ) ==
PCOL( ksup, this->grid_ ) ){
4449 this->isSendToRight_( jsupProcCol, ksup ) =
true;
4456 if(
MYCOL( this->grid_ ) ==
PCOL(ksup, this->grid_) ){
4457 if(
MYROW( this->grid_ ) ==
PROW( ksup, this->grid_ ) ){
4458 this->isRecvFromBelow_(isupProcRow,ksup) =
true;
4460 else if (
MYROW(this->grid_) == isupProcRow){
4461 this->isSendToDiagonal_(ksup)=
true;
4464 isup = snodeEtree[isup];
4468 #if ( _DEBUGlevel_ >= 1 )
4469 statusOFS << std::endl <<
"this->isSendToRight:" << std::endl;
4470 for(
int j = 0;j< this->isSendToRight_.n();j++){
4471 statusOFS <<
"["<<j<<
"] ";
4472 for(
int i =0; i < this->isSendToRight_.m();i++){
4473 statusOFS<< this->isSendToRight_(i,j) <<
" ";
4475 statusOFS<<std::endl;
4478 statusOFS << std::endl <<
"this->isRecvFromLeft:" << std::endl;
4479 for(
int j = 0;j< this->isRecvFromLeft_.m();j++){
4480 statusOFS <<
"["<<j<<
"] "<< this->isRecvFromLeft_(j)<<std::endl;
4483 statusOFS << std::endl <<
"this->isRecvFromBelow:" << std::endl;
4484 for(
int j = 0;j< this->isRecvFromBelow_.n();j++){
4485 statusOFS <<
"["<<j<<
"] ";
4486 for(
int i =0; i < this->isRecvFromBelow_.m();i++){
4487 statusOFS<< this->isRecvFromBelow_(i,j) <<
" ";
4489 statusOFS<<std::endl;
4497 TIMER_STOP(STR_RFL_RFB);
4502 TIMER_START(STCD_RFCD);
4506 PushCallStack(
"SendToCrossDiagonal / RecvFromCrossDiagonal");
4508 for( Int ksup = 0; ksup < numSuper - 1; ksup++ ){
4509 if(
MYCOL( this->grid_ ) ==
PCOL( ksup, this->grid_ ) ){
4510 std::vector< LBlock<T> > & Union = *colBlockRowBuf[ksup];
4511 for(
auto si = Union.begin(); si != Union.end(); si++ ){
4512 Int isup = si->blockIdx;
4513 Int isupProcRow =
PROW( isup, this->grid_ );
4514 Int isupProcCol =
PCOL( isup, this->grid_ );
4515 if( isup > ksup &&
MYROW( this->grid_ ) == isupProcRow ){
4516 this->isSendToCrossDiagonal_(this->grid_->numProcCol, ksup ) =
true;
4517 this->isSendToCrossDiagonal_(isupProcCol, ksup ) =
true;
4523 for( Int ksup = 0; ksup < numSuper - 1; ksup++ ){
4524 if(
MYROW( this->grid_ ) ==
PROW( ksup, this->grid_ ) ){
4525 std::vector< LBlock<T> > & Union = *colBlockRowBuf[ksup];
4526 for(
auto si = Union.begin(); si != Union.end(); si++ ){
4527 Int jsup = si->blockIdx;
4528 Int jsupProcCol =
PCOL( jsup, this->grid_ );
4529 Int jsupProcRow =
PROW( jsup, this->grid_ );
4530 if( jsup > ksup &&
MYCOL(this->grid_) == jsupProcCol ){
4531 this->isRecvFromCrossDiagonal_(this->grid_->numProcRow, ksup ) =
true;
4532 this->isRecvFromCrossDiagonal_(jsupProcRow, ksup ) =
true;
4537 #if ( _DEBUGlevel_ >= 1 )
4538 statusOFS << std::endl <<
"this->isSendToCrossDiagonal:" << std::endl;
4539 for(
int j =0; j < this->isSendToCrossDiagonal_.n();j++){
4540 if(this->isSendToCrossDiagonal_(this->grid_->numProcCol,j)){
4541 statusOFS <<
"["<<j<<
"] ";
4542 for(
int i =0; i < this->isSendToCrossDiagonal_.m()-1;i++){
4543 if(this->isSendToCrossDiagonal_(i,j))
4545 statusOFS<<
PNUM(
PROW(j,this->grid_),i,this->grid_)<<
" ";
4548 statusOFS<<std::endl;
4552 statusOFS << std::endl <<
"this->isRecvFromCrossDiagonal:" << std::endl;
4553 for(
int j =0; j < this->isRecvFromCrossDiagonal_.n();j++){
4554 if(this->isRecvFromCrossDiagonal_(this->grid_->numProcRow,j)){
4555 statusOFS <<
"["<<j<<
"] ";
4556 for(
int i =0; i < this->isRecvFromCrossDiagonal_.m()-1;i++){
4557 if(this->isRecvFromCrossDiagonal_(i,j))
4559 statusOFS<<
PNUM(i,
PCOL(j,this->grid_),this->grid_)<<
" ";
4562 statusOFS<<std::endl;
4573 TIMER_STOP(STCD_RFCD);
4575 for( Int ksup = 0; ksup < numSuper; ksup++ ){
4576 if(
MYCOL( this->grid_ ) ==
PCOL( ksup, this->grid_ )
4577 ||
MYROW( this->grid_ ) ==
PROW( ksup, this->grid_ ) ){
4578 delete colBlockRowBuf[ksup];
4579 if(
MYROW( this->grid_ ) ==
PROW( ksup, this->grid_ ) ){
4580 delete rowBlockColBuf[ksup];
4586 this->GetWorkSet(snodeEtree,this->WorkingSet());
4588 TIMER_STOP(ConstructCommunicationPattern);
4600 #endif //_PEXSI_PSELINV_UNSYM_IMPL_HPP_
void UnpackData(SuperNodeBufferTypeUnsym &snode, std::vector< LBlock< T > > &LcolRecv, std::vector< LBlock< T > > &LrowRecv, std::vector< UBlock< T > > &UcolRecv, std::vector< UBlock< T > > &UrowRecv)
UnpackData.
Definition: pselinv_unsym_impl.hpp:1727
void SelInvIntra_P2p(Int lidx)
SelInvIntra_P2p.
Definition: pselinv_unsym_impl.hpp:1925
Int blockIdx
Block index (supernodal index)
Definition: pselinv.hpp:237
void ComputeDiagUpdate(SuperNodeBufferTypeUnsym &snode)
ComputeDiagUpdate.
Definition: pselinv_unsym_impl.hpp:1862
Interface with SuperLU_Dist (version 3.0 and later)
Int PROW(Int bnum, const GridType *g)
PROW returns the processor row that the bnum-th block (supernode) belongs to.
Definition: pselinv.hpp:293
Int numRow
Number of nonzero rows.
Definition: pselinv.hpp:188
void SelInv_lookup_indexes(SuperNodeBufferTypeUnsym &snode, std::vector< LBlock< T > > &LcolRecv, std::vector< LBlock< T > > &LrowRecv, std::vector< UBlock< T > > &UcolRecv, std::vector< UBlock< T > > &UrowRecv, NumMat< T > &AinvBuf, NumMat< T > &LBuf, NumMat< T > &UBuf)
SelInv_lookup_indexes.
Definition: pselinv_unsym_impl.hpp:112
Int MYPROC(const GridType *g)
MYPROC returns the current processor rank.
Definition: pselinv.hpp:280
Int PCOL(Int bnum, const GridType *g)
PCOL returns the processor column that the bnum-th block (supernode) belongs to.
Definition: pselinv.hpp:298
Int LBi(Int bnum, const GridType *g)
LBi returns the local block number on the processor at processor row PROW( bnum, g )...
Definition: pselinv.hpp:308
Profiling and timing using TAU.
Int SuperSize(Int bnum, const SuperNodeType *s)
SuperSize returns the size of the block bnum.
Definition: pselinv.hpp:349
Int LBj(Int bnum, const GridType *g)
LBj returns the local block number on the processor at processor column PCOL( bnum, g ).
Definition: pselinv.hpp:313
Int GBj(Int jLocal, const GridType *g)
GBj returns the global block number from a local block number in the column direction.
Definition: pselinv.hpp:323
void SetValue(NumMat< F > &M, F val)
SetValue sets a numerical matrix to a constant val.
Definition: NumMat_impl.hpp:171
Int numCol
Number of nonzero columns.
Definition: pselinv.hpp:191
Definition: pselinv_unsym_impl.hpp:434
LBlock stores a nonzero block in the lower triangular part or the diagonal part in PSelInv...
Definition: pselinv.hpp:182
Int PNUM(Int i, Int j, const GridType *g)
PNUM returns the processor rank that the bnum-th block (supernode) belongs to.
Definition: pselinv.hpp:303
Int GBi(Int iLocal, const GridType *g)
GBi returns the global block number from a local block number in the row direction.
Definition: pselinv.hpp:318
Int MYCOL(const GridType *g)
MYCOL returns my processor column.
Definition: pselinv.hpp:288
virtual void ConstructCommunicationPattern()
ConstructCommunicationPattern constructs the communication pattern to be used later in the selected i...
Definition: pselinv_unsym_impl.hpp:3557
IntNumVec rows
Dimension numRow * 1, index (0-based) for the number of nonzero rows.
Definition: pselinv.hpp:194
Int numCol
Number of nonzero columns.
Definition: pselinv.hpp:243
UBlock stores a nonzero block in the upper triangular part in PSelInv.
Definition: pselinv.hpp:234
Definition: pselinv_unsym.hpp:198
Int FirstBlockCol(Int bnum, const SuperNodeType *s)
FirstBlockCol returns the first column of a block bnum.
Definition: pselinv.hpp:337
Int blockIdx
Block index (supernodal index)
Definition: pselinv.hpp:185
void SendRecvCD(std::vector< SuperNodeBufferTypeUnsym > &arrSuperNodes, Int stepSuper)
SendRecvCD_UpdateU.
Definition: pselinv_unsym_impl.hpp:1124
Int numRow
Number of nonzero rows.
Definition: pselinv.hpp:240
virtual void SelInv()
SelInv is the main function for the selected inversion.
Definition: pselinv_unsym_impl.hpp:3258
void ConstructCommunicationPattern_P2p()
ConstructCommunicationPattern_P2p constructs the communication pattern to be used later in the select...
Definition: pselinv_unsym_impl.hpp:3565
NumMat< T > nzval
Dimension numRow * numCol, nonzero elements.
Definition: pselinv.hpp:249
Int NumSuper(const SuperNodeType *s)
NumSuper returns the total number of supernodes.
Definition: pselinv.hpp:353
NumMat< T > nzval
Dimension numRow * numCol, nonzero elements.
Definition: pselinv.hpp:198
virtual void PreSelInv()
PreSelInv prepares the structure in L_ and U_ so that SelInv only involves matrix-matrix multiplicati...
Definition: pselinv_unsym_impl.hpp:3306
IntNumVec cols
Dimension numCol * 1, index (0-based) for the number of nonzero rows.
Definition: pselinv.hpp:246
void SelInv_P2p()
Point-to-point version of the selected inversion.
Definition: pselinv_unsym_impl.hpp:3264
PMatrixUnsym contains the main data structure and the computational routine for the parallel selected...
Definition: pselinv.hpp:991
Int MYROW(const GridType *g)
MYROW returns my processor row.
Definition: pselinv.hpp:284