46 #ifndef _PEXSI_PSELINV_UNSYM_IMPL_HPP_
47 #define _PEXSI_PSELINV_UNSYM_IMPL_HPP_
58 PMatrixUnsym<T>::PMatrixUnsym (
60 const SuperNodeType* s,
61 const PSelInvOptions * o,
62 const FactorizationOptions * oFact
66 this->Setup( g, s, o, oFact );
73 void PMatrixUnsym<T>::Setup(
75 const SuperNodeType* s,
76 const PSelInvOptions * o,
77 const FactorizationOptions * oFact
80 PMatrix<T>::Setup(g,s,o,oFact);
86 Lrow_.resize( this->NumLocalBlockRow() );
87 Ucol_.resize( this->NumLocalBlockCol() );
91 LrowSize_.resize( this->NumLocalBlockRow() );
92 UcolSize_.resize( this->NumLocalBlockCol() );
110 TIMER_START(Compute_Sinv_LT_Lookup_Indexes);
116 TIMER_START(Build_colptr_rowptr);
118 #if ( _DEBUGlevel_ >= 2 )
119 statusOFS<<
"UrowRecv blockIdx: "<<std::endl;
120 for( Int jb = 0; jb < UrowRecv.size(); jb++ ){
121 statusOFS<<UrowRecv[jb].blockIdx<<
" ";
123 statusOFS<<std::endl;
125 statusOFS<<
"UcolRecv blockIdx: "<<std::endl;
126 for( Int jb = 0; jb < UcolRecv.size(); jb++ ){
127 statusOFS<<UcolRecv[jb].blockIdx<<
" ";
129 statusOFS<<std::endl;
131 statusOFS<<
"LcolRecv blockIdx: "<<std::endl;
132 for( Int jb = 0; jb < LcolRecv.size(); jb++ ){
133 statusOFS<<LcolRecv[jb].blockIdx<<
" ";
135 statusOFS<<std::endl;
138 statusOFS<<
"LrowRecv blockIdx: "<<std::endl;
139 for( Int jb = 0; jb < LrowRecv.size(); jb++ ){
140 statusOFS<<LrowRecv[jb].blockIdx<<
" ";
142 statusOFS<<std::endl;
145 TIMER_START(Sort_LrowRecv_UcolRecv);
147 std::sort(LrowRecv.begin(),LrowRecv.end(),LBlockComparator<T>);
148 std::sort(UcolRecv.begin(),UcolRecv.end(),UBlockComparator<T>);
149 TIMER_STOP(Sort_LrowRecv_UcolRecv);
154 std::vector<Int> rowPtrL(LcolRecv.size() + 1);
156 for( Int ib = 0; ib < LcolRecv.size(); ib++ ){
158 rowPtrL[ib+1] = rowPtrL[ib] + LB.
numRow;
165 std::vector<Int> colPtrL(LrowRecv.size() + 1);
166 std::vector<Int> colPtrU(UcolRecv.size() + 1,-1);
168 for( Int jb = 0; jb < LrowRecv.size(); jb++ ){
170 colPtrL[jb+1] = colPtrL[jb] + LB.
numCol;
174 for( Int jb = 0; jb < UcolRecv.size(); jb++ ){
176 colPtrU[jb+1] = colPtrU[jb] + UB.
numCol;
179 #if ( _DEBUGlevel_ >= 1 )
180 statusOFS<<rowPtrL<<std::endl;
181 statusOFS<<colPtrL<<std::endl;
182 statusOFS<<colPtrU<<std::endl;
186 Int numRowAinvBuf = *colPtrU.rbegin();
188 Int numColAinvBuf = *colPtrL.rbegin();
189 TIMER_STOP(Build_colptr_rowptr);
191 TIMER_START(Allocate_lookup);
193 AinvBuf.Resize( numRowAinvBuf, numColAinvBuf );
194 LBuf.Resize(
SuperSize( snode.Index, this->super_ ), numColAinvBuf );
195 UBuf.Resize(
SuperSize( snode.Index, this->super_ ), numRowAinvBuf );
196 TIMER_STOP(Allocate_lookup);
200 TIMER_START(Fill_LBuf);
202 for( Int jb = 0; jb < LrowRecv.size(); jb++ ){
207 "The size of LB is not right. Something is seriously wrong." );
210 LB.
numRow, LBuf.VecData( colPtrL[jb] ), LBuf.m() );
213 TIMER_STOP(Fill_LBuf);
215 TIMER_START(Fill_UBuf);
218 for( Int jb = 0; jb < UcolRecv.size(); jb++ ){
224 "The size of UB is not right. Something is seriously wrong." );
229 UB.
numRow, UBuf.VecData( colPtrU[jb] ), UBuf.m() );
232 TIMER_STOP(Fill_UBuf);
236 TIMER_START(JB_Loop);
238 for( Int jb = 0; jb < LrowRecv.size(); jb++ ){
245 for( Int ib = 0; ib < LcolRecv.size(); ib++ ){
249 T* nzvalAinv = &AinvBuf( rowPtrL[ib], colPtrL[jb] );
250 Int ldAinv = numRowAinvBuf;
255 std::vector<Int> relCols( LrowB.
numCol );
256 for( Int j = 0; j < LrowB.
numCol; j++ ){
257 relCols[j] = LrowB.
rows[j] - SinvColsSta;
260 std::vector<LBlock<T> >& LcolSinv = this->L(
LBj(jsup, this->grid_ ));
261 bool isBlockFound =
false;
262 TIMER_START(PARSING_ROW_BLOCKIDX);
263 for( Int ibSinv = 0; ibSinv < LcolSinv.size(); ibSinv++ ){
265 if( LcolSinv[ibSinv].blockIdx == isup ){
269 std::vector<Int> relRows( LB.
numRow );
270 Int* rowsLBPtr = LB.
rows.Data();
271 Int* rowsSinvBPtr = SinvB.
rows.Data();
273 TIMER_START(STDFIND_ROW);
274 Int * pos =&rowsSinvBPtr[0];
275 Int * last =&rowsSinvBPtr[SinvB.
numRow];
276 for( Int i = 0; i < LB.
numRow; i++ ){
277 pos = std::upper_bound(pos, last, rowsLBPtr[i])-1;
279 relRows[i] = (Int)(pos - rowsSinvBPtr);
282 std::ostringstream msg;
283 msg <<
"Row " << rowsLBPtr[i] <<
284 " in LB cannot find the corresponding row in SinvB"
286 <<
"LB.rows = " << LB.
rows << std::endl
287 <<
"SinvB.rows = " << SinvB.
rows << std::endl;
288 ErrorHandling( msg.str().c_str() );
291 TIMER_STOP(STDFIND_ROW);
293 TIMER_START(Copy_Sinv_to_Ainv);
295 T* nzvalSinv = SinvB.
nzval.Data();
296 Int ldSinv = SinvB.
numRow;
297 for( Int j = 0; j < LrowB.
numCol; j++ ){
298 for( Int i = 0; i < LB.
numRow; i++ ){
301 nzvalAinv[i + j*ldAinv] =
302 nzvalSinv[relRows[i]+relCols[j]*ldSinv];
305 TIMER_STOP(Copy_Sinv_to_Ainv);
311 TIMER_STOP(PARSING_ROW_BLOCKIDX);
312 if( isBlockFound ==
false ){
313 std::ostringstream msg;
314 msg <<
"["<<snode.Index<<
"] "<<
"Block(" << isup <<
", " << jsup
315 <<
") did not find a matching block in Sinv." << std::endl;
317 statusOFS<<msg.str();
324 std::vector<Int> relRows( LB.
numRow );
325 for( Int i = 0; i < LB.
numRow; i++ ){
326 relRows[i] = LB.
rows[i] - SinvRowsSta;
329 std::vector<UBlock<T> >& UrowSinv =
330 this->U(
LBi( isup, this->grid_ ) );
331 bool isBlockFound =
false;
332 TIMER_START(PARSING_COL_BLOCKIDX);
333 for( Int jbSinv = 0; jbSinv < UrowSinv.size(); jbSinv++ ){
335 if( UrowSinv[jbSinv].blockIdx == jsup ){
339 std::vector<Int> relCols( LrowB.
numCol );
340 Int* rowsLrowBPtr = LrowB.
rows.Data();
341 Int* colsSinvBPtr = SinvB.
cols.Data();
343 TIMER_START(STDFIND_COL);
344 Int * pos =&colsSinvBPtr[0];
345 Int * last =&colsSinvBPtr[SinvB.
numCol];
346 for( Int j = 0; j < LrowB.
numCol; j++ ){
348 pos = std::upper_bound(pos, last, rowsLrowBPtr[j])-1;
350 relCols[j] = (Int)(pos - colsSinvBPtr);
353 std::ostringstream msg;
354 msg <<
"Col " << rowsLrowBPtr[j] <<
355 " in UB cannot find the corresponding row in SinvB"
357 <<
"LrowB.rows = " << LrowB.
rows << std::endl
358 <<
"UinvB.cols = " << SinvB.
cols << std::endl;
359 ErrorHandling( msg.str().c_str() );
362 TIMER_STOP(STDFIND_COL);
364 TIMER_START(Copy_Sinv_to_Ainv);
366 T* nzvalSinv = SinvB.
nzval.Data();
367 Int ldSinv = SinvB.
numRow;
368 for( Int j = 0; j < LrowB.
numCol; j++ ){
369 for( Int i = 0; i < LB.
numRow; i++ ){
373 nzvalAinv[i + j*ldAinv] =
374 nzvalSinv[relRows[i]+relCols[j]*ldSinv];
377 TIMER_STOP(Copy_Sinv_to_Ainv);
383 TIMER_STOP(PARSING_COL_BLOCKIDX);
384 if( isBlockFound ==
false ){
385 std::ostringstream msg;
386 msg <<
"["<<snode.Index<<
"] "<<
"Block(" << isup <<
", " << jsup
387 <<
") did not find a matching block in Sinv." << std::endl;
388 statusOFS<<msg.str();
397 TIMER_STOP(Compute_Sinv_LT_Lookup_Indexes);
404 std::vector<MPI_Request > arrMpiReqsSendLCD;
405 std::vector<MPI_Request > arrMpiReqsSizeSendLCD;
406 std::vector<MPI_Request > arrMpiReqsRecvLCD;
407 std::vector<MPI_Request > arrMpiReqsSizeRecvLCD;
408 std::vector<std::vector<char> > arrSstrLcolSendCD;
409 std::vector<int > arrSstrLcolSizeSendCD;
410 std::vector<std::vector<char> > arrSstrLrowRecvCD;
411 std::vector<int > arrSstrLrowSizeRecvCD;
414 std::vector<MPI_Request > arrMpiReqsSendUCD;
415 std::vector<MPI_Request > arrMpiReqsSizeSendUCD;
416 std::vector<MPI_Request > arrMpiReqsRecvUCD;
417 std::vector<MPI_Request > arrMpiReqsSizeRecvUCD;
418 std::vector<std::vector<char> > arrSstrUrowSendCD;
419 std::vector<int > arrSstrUrowSizeSendCD;
420 std::vector<std::vector<char> > arrSstrUcolRecvCD;
421 std::vector<int > arrSstrUcolSizeRecvCD;
423 std::vector<Int>sendOffset;
424 std::vector<Int>recvOffset;
426 void resize(Int sendCount, Int recvCount){
428 arrMpiReqsSendLCD.assign(sendCount, MPI_REQUEST_NULL );
429 arrMpiReqsSizeSendLCD.assign(sendCount, MPI_REQUEST_NULL );
430 arrMpiReqsRecvLCD.assign(recvCount, MPI_REQUEST_NULL );
431 arrMpiReqsSizeRecvLCD.assign(recvCount, MPI_REQUEST_NULL );
432 arrSstrLcolSendCD.resize(sendCount);
433 arrSstrLcolSizeSendCD.resize(sendCount);
434 arrSstrLrowRecvCD.resize(recvCount);
435 arrSstrLrowSizeRecvCD.resize(recvCount);
438 arrMpiReqsSendUCD.assign(recvCount, MPI_REQUEST_NULL );
439 arrMpiReqsSizeSendUCD.assign(recvCount, MPI_REQUEST_NULL );
440 arrMpiReqsRecvUCD.assign(sendCount, MPI_REQUEST_NULL );
441 arrMpiReqsSizeRecvUCD.assign(sendCount, MPI_REQUEST_NULL );
442 arrSstrUrowSendCD.resize(recvCount);
443 arrSstrUrowSizeSendCD.resize(recvCount);
444 arrSstrUcolRecvCD.resize(sendCount);
445 arrSstrUcolSizeRecvCD.resize(sendCount);
449 mpi::Waitall(arrMpiReqsSizeSendLCD);
450 mpi::Waitall(arrMpiReqsSendLCD);
451 mpi::Waitall(arrMpiReqsSizeSendUCD);
452 mpi::Waitall(arrMpiReqsSendUCD);
473 std::vector<Int > & arrSuperNodes,
476 TIMER_START(SendRecvSizesCD);
480 buffers.sendOffset.resize(stepSuper);
481 buffers.recvOffset.resize(stepSuper);
484 for (Int supidx=0; supidx<stepSuper; supidx++){
485 Int snode_index = arrSuperNodes[supidx];
486 buffers.sendOffset[supidx]=sendCount;
487 buffers.recvOffset[supidx]=recvCount;
488 sendCount+= this->CountSendToCrossDiagonal(snode_index);
489 recvCount+= this->CountRecvFromCrossDiagonal(snode_index);
492 buffers.resize(sendCount,recvCount);
496 for (Int supidx=0; supidx<stepSuper; supidx++){
497 Int snode_index = arrSuperNodes[supidx];
499 TIMER_START(Send_L_Recv_Size_U_CrossDiag);
501 if(
MYCOL( this->grid_ ) ==
PCOL( snode_index, this->grid_ )
502 && this->isSendToCrossDiagonal_(this->grid_->numProcCol, snode_index ) ){
506 for(Int dstCol = 0; dstCol<this->grid_->numProcCol; dstCol++){
507 if(this->isSendToCrossDiagonal_(dstCol,snode_index) ){
508 Int dest =
PNUM(
PROW(snode_index,this->grid_),dstCol,this->grid_);
510 if(
MYPROC( this->grid_ ) != dest ){
512 MPI_Request & mpiReqSizeSendL =
513 buffers.arrMpiReqsSizeSendLCD[buffers.sendOffset[supidx]+sendIdxL];
514 MPI_Request & mpiReqSendL =
515 buffers.arrMpiReqsSendLCD[buffers.sendOffset[supidx]+sendIdxL];
516 std::vector<char> & sstrLcolSend =
517 buffers.arrSstrLcolSendCD[buffers.sendOffset[supidx]+sendIdxL];
519 buffers.arrSstrLcolSizeSendCD[buffers.sendOffset[supidx]+sendIdxL];
522 std::stringstream sstm;
523 std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
524 std::vector<LBlock<T> >& Lcol = this->L(
LBj(snode_index, this->grid_) );
529 Int startIdx = (
MYROW( this->grid_ ) ==
PROW( snode_index, this->grid_ ) ) ? 1:0;
531 for( Int ib = startIdx; ib < Lcol.size(); ib++ ){
532 if( Lcol[ib].blockIdx > snode_index &&
533 (Lcol[ib].blockIdx % this->grid_->numProcCol) == dstCol ){
538 serialize( (Int)count, sstm, NO_MASK );
540 for( Int ib = startIdx; ib < Lcol.size(); ib++ ){
541 if( Lcol[ib].blockIdx > snode_index &&
542 (Lcol[ib].blockIdx % this->grid_->numProcCol) == dstCol ){
543 #if ( _DEBUGlevel_ >= 1 )
544 statusOFS <<
"["<<snode_index<<
"] SEND contains "<<Lcol[ib].blockIdx
545 <<
" which corresponds to "<<
GBj(ib,this->grid_)
549 serialize( Lcol[ib], sstm, mask );
553 sstrLcolSend.resize( Size(sstm) );
554 sstm.read( &sstrLcolSend[0], sstrLcolSend.size() );
555 sstrSizeL = sstrLcolSend.size();
559 MPI_Isend( &sstrSizeL, 1, MPI_INT, dest,
560 IDX_TO_TAG2(snode_index,supidx,SELINV_TAG_L_SIZE_CD),
561 this->grid_->comm, &mpiReqSizeSendL );
562 MPI_Isend( (
void*)&sstrLcolSend[0], sstrSizeL, MPI_BYTE, dest,
563 IDX_TO_TAG2(snode_index,supidx,SELINV_TAG_L_CONTENT_CD),
564 this->grid_->comm, &mpiReqSendL );
569 buffers.arrSstrUcolSizeRecvCD[buffers.sendOffset[supidx]+recvIdxU];
570 MPI_Request & mpiReqSizeRecvU =
571 buffers.arrMpiReqsSizeRecvUCD[buffers.sendOffset[supidx]+recvIdxU];
573 MPI_Irecv( &sstrSizeU, 1, MPI_INT, dest,
574 IDX_TO_TAG2(snode_index,supidx,SELINV_TAG_U_SIZE_CD),
575 this->grid_->comm, &mpiReqSizeRecvU );
581 TIMER_STOP(Send_L_Recv_Size_U_CrossDiag);
586 for (Int supidx=0; supidx<stepSuper; supidx++){
587 Int snode_index = arrSuperNodes[supidx];
588 TIMER_START(Send_U_Recv_Size_L_CrossDiag);
590 if(
MYROW( this->grid_ ) ==
PROW( snode_index, this->grid_ )
591 && this->isRecvFromCrossDiagonal_(this->grid_->numProcRow, snode_index ) ){
594 for(Int srcRow = 0; srcRow<this->grid_->numProcRow; srcRow++){
595 if(this->isRecvFromCrossDiagonal_(srcRow,snode_index) ){
596 Int src =
PNUM(srcRow,
PCOL(snode_index,this->grid_),this->grid_);
597 if(
MYPROC( this->grid_ ) != src ){
600 buffers.arrSstrLrowSizeRecvCD[buffers.recvOffset[supidx]+recvIdxL];
601 MPI_Request & mpiReqSizeRecvL =
602 buffers.arrMpiReqsSizeRecvLCD[buffers.recvOffset[supidx]+recvIdxL];
604 MPI_Irecv( &sstrSizeL, 1, MPI_INT, src,
605 IDX_TO_TAG2(snode_index,supidx,SELINV_TAG_L_SIZE_CD),
606 this->grid_->comm, &mpiReqSizeRecvL );
611 MPI_Request & mpiReqSizeSendU =
612 buffers.arrMpiReqsSizeSendUCD[buffers.recvOffset[supidx]+sendIdxU];
613 MPI_Request & mpiReqSendU =
614 buffers.arrMpiReqsSendUCD[buffers.recvOffset[supidx]+sendIdxU];
615 std::vector<char> & sstrUrowSend =
616 buffers.arrSstrUrowSendCD[buffers.recvOffset[supidx]+sendIdxU];
618 buffers.arrSstrUrowSizeSendCD[buffers.recvOffset[supidx]+sendIdxU];
621 std::stringstream sstm;
622 std::vector<Int> mask( UBlockMask::TOTAL_NUMBER, 1 );
623 std::vector<UBlock<T> >& Urow = this->U(
LBi(snode_index, this->grid_) );
628 for( Int jb = 0; jb < Urow.size(); jb++ ){
629 if( Urow[jb].blockIdx > snode_index
630 && (Urow[jb].blockIdx % this->grid_->numProcRow) == srcRow ){
635 serialize( (Int)count, sstm, NO_MASK );
637 for( Int jb = 0; jb < Urow.size(); jb++ ){
638 if( Urow[jb].blockIdx > snode_index
639 && (Urow[jb].blockIdx % this->grid_->numProcRow) == srcRow ){
640 #if ( _DEBUGlevel_ >= 1 )
641 statusOFS <<
"["<<snode_index<<
"] SEND contains "<<Urow[jb].blockIdx
642 <<
" which corresponds to "<<
GBi(jb,this->grid_)
646 serialize( Urow[jb], sstm, mask );
650 sstrUrowSend.resize( Size(sstm) );
651 sstm.read( &sstrUrowSend[0], sstrUrowSend.size() );
652 sstrSizeU = sstrUrowSend.size();
655 MPI_Isend( &sstrSizeU, 1, MPI_INT, src,
656 IDX_TO_TAG2(snode_index,supidx,SELINV_TAG_U_SIZE_CD),
657 this->grid_->comm, &mpiReqSizeSendU );
658 MPI_Isend( (
void*)&sstrUrowSend[0], sstrSizeU, MPI_BYTE, src,
659 IDX_TO_TAG2(snode_index,supidx,SELINV_TAG_U_CONTENT_CD),
660 this->grid_->comm, &mpiReqSendU );
666 TIMER_STOP(Send_U_Recv_Size_L_CrossDiag);
668 TIMER_STOP(SendRecvSizesCD);
672 inline void PMatrixUnsym<T>::IRecvContentCD(
673 std::vector<Int > & arrSuperNodes,
674 Int stepSuper, CDBuffers & buffers) {
675 TIMER_START(IrecvContentCD);
679 TIMER_START(Wait_Size_L_CrossDiag);
680 mpi::Waitall(buffers.arrMpiReqsSizeRecvLCD);
681 TIMER_STOP(Wait_Size_L_CrossDiag);
686 for (Int supidx=0; supidx<stepSuper; supidx++){
687 Int snode_index = arrSuperNodes[supidx];
689 TIMER_START(Recv_L_CrossDiag);
690 if(
MYROW( this->grid_ ) ==
PROW( snode_index, this->grid_ )
691 && this->isRecvFromCrossDiagonal_(this->grid_->numProcRow, snode_index ) ){
693 for(Int srcRow = 0; srcRow<this->grid_->numProcRow; srcRow++){
694 if(this->isRecvFromCrossDiagonal_(srcRow,snode_index) ){
695 Int src =
PNUM(srcRow,
PCOL(snode_index,this->grid_),this->grid_);
696 if(
MYPROC( this->grid_ ) != src ){
698 buffers.arrSstrLrowSizeRecvCD[buffers.recvOffset[supidx]+recvIdxL];
699 std::vector<char> & sstrLrowRecv =
700 buffers.arrSstrLrowRecvCD[buffers.recvOffset[supidx]+recvIdxL];
701 MPI_Request & mpiReqRecvL =
702 buffers.arrMpiReqsRecvLCD[buffers.recvOffset[supidx]+recvIdxL];
703 sstrLrowRecv.resize( sstrSizeL);
705 MPI_Irecv( (
void*)&sstrLrowRecv[0], sstrSizeL, MPI_BYTE, src,
706 IDX_TO_TAG2(snode_index,supidx,SELINV_TAG_L_CONTENT_CD),
707 this->grid_->comm, &mpiReqRecvL );
714 TIMER_STOP(Recv_L_CrossDiag);
718 TIMER_START(Wait_Size_U_CrossDiag);
719 mpi::Waitall(buffers.arrMpiReqsSizeRecvUCD);
720 TIMER_STOP(Wait_Size_U_CrossDiag);
723 for (Int supidx=0; supidx<stepSuper; supidx++){
724 Int snode_index = arrSuperNodes[supidx];
725 TIMER_START(Recv_U_CrossDiag);
726 if(
MYCOL( this->grid_ ) ==
PCOL( snode_index, this->grid_ )
727 && this->isSendToCrossDiagonal_(this->grid_->numProcCol, snode_index ) ){
729 for(Int srcCol = 0; srcCol<this->grid_->numProcCol; srcCol++){
730 if(this->isSendToCrossDiagonal_(srcCol,snode_index) ){
731 Int src =
PNUM(
PROW(snode_index,this->grid_),srcCol,this->grid_);
732 if(
MYPROC( this->grid_ ) != src ){
734 buffers.arrSstrUcolSizeRecvCD[buffers.sendOffset[supidx]+recvIdxU];
735 std::vector<char> & sstrUcolRecv =
736 buffers.arrSstrUcolRecvCD[buffers.sendOffset[supidx]+recvIdxU];
737 MPI_Request & mpiReqRecvU =
738 buffers.arrMpiReqsRecvUCD[buffers.sendOffset[supidx]+recvIdxU];
739 sstrUcolRecv.resize( sstrSizeU );
741 MPI_Irecv( (
void*)&sstrUcolRecv[0], sstrSizeU, MPI_BYTE, src,
742 IDX_TO_TAG2(snode_index,supidx,SELINV_TAG_U_CONTENT_CD),
743 this->grid_->comm, &mpiReqRecvU );
749 TIMER_STOP(Recv_U_CrossDiag);
752 TIMER_START(IrecvContentCD);
757 inline void PMatrixUnsym<T>::WaitContentLCD(
758 std::vector<Int > & arrSuperNodes,
759 Int stepSuper, CDBuffers & buffers) {
760 TIMER_START(WaitContentLCD);
762 TIMER_START(Wait_L_CrossDiag);
763 mpi::Waitall(buffers.arrMpiReqsRecvLCD);
764 TIMER_STOP(Wait_L_CrossDiag);
770 for (Int supidx=0; supidx<stepSuper; supidx++){
771 Int snode_index = arrSuperNodes[supidx];
772 Int ksupProcRow =
PROW( snode_index, this->grid_ );
773 Int ksupProcCol =
PCOL( snode_index, this->grid_ );
775 if(
MYROW( this->grid_ ) ==
PROW( snode_index, this->grid_ ) &&
776 this->isRecvFromCrossDiagonal_(this->grid_->numProcRow, snode_index ) ){
778 #if ( _DEBUGlevel_ >= 1 )
779 statusOFS << std::endl <<
" ["<<snode_index<<
"] "
780 <<
"Update the upper triangular block"
781 << std::endl << std::endl;
782 statusOFS << std::endl <<
" ["<<snode_index<<
"] "
784 << std::endl << std::endl;
785 statusOFS << std::endl <<
" ["<<snode_index<<
"] "
787 << std::endl << std::endl;
790 std::vector<LBlock<T> >& Lcol = this->L(
LBj(snode_index, this->grid_) );
791 std::vector<LBlock<T> >& Lrow = this->Lrow(
LBi( snode_index, this->grid_ ) );
798 std::vector<Int> isBlockFound(Lrow.size(),
false);
801 for(Int srcRow = 0; srcRow<this->grid_->numProcRow; srcRow++){
802 if(this->isRecvFromCrossDiagonal_(srcRow,snode_index) ){
803 Int src =
PNUM(srcRow,
PCOL(snode_index,this->grid_),this->grid_);
804 TIMER_START(Recv_L_CrossDiag);
807 std::vector<LBlock<T> > * pLcol;
808 std::vector<LBlock<T> > LcolRecv;
809 if(
MYPROC(this->grid_)!= src){
811 buffers.arrSstrLrowSizeRecvCD[buffers.recvOffset[supidx]+recvIdxL];
812 std::vector<char> & sstrLrowRecv =
813 buffers.arrSstrLrowRecvCD[buffers.recvOffset[supidx]+recvIdxL];
815 std::stringstream sstm;
817 #if ( _DEBUGlevel_ >= 1 )
818 statusOFS <<
"["<<snode_index<<
"] P"<<
MYPROC(this->grid_)<<
" ("<<
MYROW(this->grid_)
819 <<
","<<
MYCOL(this->grid_)<<
") <--- LBj("<<snode_index<<
") <--- P"
820 << src <<
" ("<<srcRow<<
","<<ksupProcCol<<
")"
823 sstm.write( &sstrLrowRecv[0], sstrSizeL );
827 std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
828 deserialize( numLBlock, sstm, NO_MASK );
829 LcolRecv.resize(numLBlock);
830 for( Int ib = 0; ib < numLBlock; ib++ ){
831 deserialize( LcolRecv[ib], sstm, mask );
832 #if ( _DEBUGlevel_ >= 1 )
833 statusOFS <<
"["<<snode_index<<
"] RECV contains "
834 << LcolRecv[ib].blockIdx<<
" which corresponds to "
835 << ib * this->grid_->numProcRow + srcRow;
836 statusOFS <<
" L is on row "<< srcRow
837 <<
" whereas Lrow is on col "
838 << LcolRecv[ib].blockIdx % this->grid_->numProcCol
851 TIMER_STOP(Recv_L_CrossDiag);
856 Int startIdx = (
MYPROC( this->grid_ ) == src &&
MYPROC(this->grid_) ==
PNUM(ksupProcRow,ksupProcCol, this->grid_) ) ? 1:0;
857 for( Int ib = startIdx; ib < pLcol->size(); ib++ ){
858 LBlock<T> & LB = (*pLcol)[ib];
859 if( LB.blockIdx <= snode_index ){
860 ErrorHandling(
"LcolRecv contains the wrong blocks." );
864 if(
PCOL(LB.blockIdx, this->grid_) !=
MYCOL(this->grid_)){
871 for( Int iib = 0; iib < Lrow.size(); iib++ ){
872 LBlock<T> & LrowB = Lrow[ iib ];
875 if( LrowB.blockIdx == -1){
876 LrowB.blockIdx = LB.blockIdx;
879 if( LB.blockIdx == LrowB.blockIdx ){
884 LrowB.rows = LB.rows;
886 Transpose(LB.nzval, LrowB.nzval);
887 LrowB.numCol = LB.numRow;
888 LrowB.numRow = LB.numCol;
892 #if ( _DEBUGlevel_ >= 1 )
893 statusOFS<<
"["<<snode_index<<
"] USING LB "<<LB.blockIdx<< std::endl;
895 isBlockFound[iib] = 1;
903 for( Int ib = 0; ib < Lrow.size(); ib++ ){
904 LBlock<T> & LB = Lrow[ib];
905 if( !isBlockFound[ib] ){
907 "LBlock cannot find its update. Something is seriously wrong."
914 TIMER_STOP(WaitContentLCD);
920 inline void PMatrixUnsym<T>::WaitContentUCD(
921 std::vector<Int > & arrSuperNodes,
922 Int stepSuper, CDBuffers & buffers) {
924 TIMER_START(WaitContentUCD);
926 TIMER_START(Wait_U_CrossDiag);
927 mpi::Waitall(buffers.arrMpiReqsRecvUCD);
928 TIMER_STOP(Wait_U_CrossDiag);
932 for (Int supidx=0; supidx<stepSuper; supidx++){
933 Int snode_index = arrSuperNodes[supidx];
934 Int ksupProcRow =
PROW( snode_index, this->grid_ );
935 Int ksupProcCol =
PCOL( snode_index, this->grid_ );
938 if(
MYCOL( this->grid_ ) ==
PCOL( snode_index, this->grid_ ) &&
939 this->isSendToCrossDiagonal_(this->grid_->numProcCol, snode_index ) ){
941 std::vector<UBlock<T> >& Ucol = this->Ucol(
LBj( snode_index, this->grid_ ) );
948 std::vector<Int> isBlockFound(Ucol.size(),
false);
951 for(Int srcCol = 0; srcCol<this->grid_->numProcCol; srcCol++){
952 if(this->isSendToCrossDiagonal_(srcCol,snode_index) ){
953 Int src =
PNUM(
PROW(snode_index,this->grid_),srcCol,this->grid_);
954 TIMER_START(Recv_U_CrossDiag);
957 std::vector<UBlock<T> > * pUrow;
958 std::vector<UBlock<T> > UrowRecv;
959 if(
MYPROC(this->grid_)!= src){
961 buffers.arrSstrUcolSizeRecvCD[buffers.sendOffset[supidx]+recvIdxU];
962 std::vector<char> & sstrUcolRecv =
963 buffers.arrSstrUcolRecvCD[buffers.sendOffset[supidx]+recvIdxU];
966 std::stringstream sstm;
968 #if ( _DEBUGlevel_ >= 1 )
969 statusOFS <<
"["<<snode_index<<
"] P"<<
MYPROC(this->grid_)<<
" ("<<
MYROW(this->grid_)
970 <<
","<<
MYCOL(this->grid_)<<
") <--- LBi("<<snode_index<<
") <--- P"
971 << src<<
" ("<<ksupProcCol<<
","<<srcCol<<
")"<<std::endl;
973 sstm.write( &sstrUcolRecv[0], sstrSizeU );
977 std::vector<Int> mask( UBlockMask::TOTAL_NUMBER, 1 );
978 deserialize( numUBlock, sstm, NO_MASK );
979 UrowRecv.resize(numUBlock);
980 for( Int jb = 0; jb < numUBlock; jb++ ){
981 deserialize( UrowRecv[jb], sstm, mask );
982 #if ( _DEBUGlevel_ >= 1 )
983 statusOFS <<
"["<<snode_index<<
"] RECV contains "
984 << UrowRecv[jb].blockIdx<<
" which corresponds to "
985 << jb * this->grid_->numProcCol + srcCol;
986 statusOFS <<
" Urow is on col "<< srcCol
987 <<
" whereas U is on row "
988 << UrowRecv[jb].blockIdx % this->grid_->numProcRow
997 std::vector<UBlock<T> >& Urow = this->U(
LBi( snode_index, this->grid_ ) );
1002 TIMER_STOP(Recv_U_CrossDiag);
1005 for( Int jb = 0; jb < pUrow->size(); jb++ ){
1006 UBlock<T> & UB = (*pUrow)[jb];
1007 if( UB.blockIdx <= snode_index ){
1008 ErrorHandling(
"UrowRecv contains the wrong blocks." );
1012 if(
PROW(UB.blockIdx, this->grid_) !=
MYROW(this->grid_)){
1018 for( Int jjb = 0; jjb < Ucol.size(); jjb++ ){
1019 UBlock<T> & UcolB = Ucol[jjb];
1023 if( UcolB.blockIdx == -1){
1024 UcolB.blockIdx = UB.blockIdx;
1028 if( UB.blockIdx == UcolB.blockIdx ){
1036 #if ( _DEBUGlevel_ >= 1 )
1037 statusOFS<<
"["<<snode_index<<
"] USING UB "<<UB.blockIdx<< std::endl;
1039 isBlockFound[jjb] = 1;
1048 for( Int jb = 0; jb < Ucol.size(); jb++ ){
1049 UBlock<T> & UB = Ucol[jb];
1050 if( !isBlockFound[jb] ){
1053 "UBlock cannot find its update. Something is seriously wrong."
1061 TIMER_STOP(WaitContentUCD);
1067 template<
typename T>
1069 std::vector<SuperNodeBufferTypeUnsym > & arrSuperNodes,
1073 TIMER_START(Send_CD);
1077 Int sendOffset[stepSuper];
1078 Int recvOffset[stepSuper];
1081 for (Int supidx=0; supidx<stepSuper; supidx++){
1083 sendOffset[supidx]=sendCount;
1084 recvOffset[supidx]=recvCount;
1085 sendCount+= this->CountSendToCrossDiagonal(snode.Index);
1086 recvCount+= this->CountRecvFromCrossDiagonal(snode.Index);
1090 std::vector<MPI_Request > arrMpiReqsSendLCD(sendCount, MPI_REQUEST_NULL );
1091 std::vector<MPI_Request > arrMpiReqsSizeSendLCD(sendCount, MPI_REQUEST_NULL );
1092 std::vector<MPI_Request > arrMpiReqsRecvLCD(recvCount, MPI_REQUEST_NULL );
1093 std::vector<MPI_Request > arrMpiReqsSizeRecvLCD(recvCount, MPI_REQUEST_NULL );
1094 std::vector<std::vector<char> > arrSstrLcolSendCD(sendCount);
1095 std::vector<int > arrSstrLcolSizeSendCD(sendCount);
1096 std::vector<std::vector<char> > arrSstrLrowRecvCD(recvCount);
1097 std::vector<int > arrSstrLrowSizeRecvCD(recvCount);
1100 std::vector<MPI_Request > arrMpiReqsSendUCD(recvCount, MPI_REQUEST_NULL );
1101 std::vector<MPI_Request > arrMpiReqsSizeSendUCD(recvCount, MPI_REQUEST_NULL );
1102 std::vector<MPI_Request > arrMpiReqsRecvUCD(sendCount, MPI_REQUEST_NULL );
1103 std::vector<MPI_Request > arrMpiReqsSizeRecvUCD(sendCount, MPI_REQUEST_NULL );
1104 std::vector<std::vector<char> > arrSstrUrowSendCD(recvCount);
1105 std::vector<int > arrSstrUrowSizeSendCD(recvCount);
1106 std::vector<std::vector<char> > arrSstrUcolRecvCD(sendCount);
1107 std::vector<int > arrSstrUcolSizeRecvCD(sendCount);
1112 for (Int supidx=0; supidx<stepSuper; supidx++){
1115 TIMER_START(Send_L_Recv_Size_U_CrossDiag);
1117 if(
MYCOL( this->grid_ ) ==
PCOL( snode.Index, this->grid_ )
1118 && this->isSendToCrossDiagonal_(this->grid_->numProcCol, snode.Index ) ){
1122 for(Int dstCol = 0; dstCol<this->grid_->numProcCol; dstCol++){
1123 if(this->isSendToCrossDiagonal_(dstCol,snode.Index) ){
1124 Int dest =
PNUM(
PROW(snode.Index,this->grid_),dstCol,this->grid_);
1126 if(
MYPROC( this->grid_ ) != dest ){
1128 MPI_Request & mpiReqSizeSendL =
1129 arrMpiReqsSizeSendLCD[sendOffset[supidx]+sendIdxL];
1130 MPI_Request & mpiReqSendL =
1131 arrMpiReqsSendLCD[sendOffset[supidx]+sendIdxL];
1132 std::vector<char> & sstrLcolSend =
1133 arrSstrLcolSendCD[sendOffset[supidx]+sendIdxL];
1135 arrSstrLcolSizeSendCD[sendOffset[supidx]+sendIdxL];
1138 std::stringstream sstm;
1139 std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
1140 std::vector<LBlock<T> >& Lcol = this->L(
LBj(snode.Index, this->grid_) );
1145 Int startIdx = (
MYROW( this->grid_ ) ==
PROW( snode.Index, this->grid_ ) ) ? 1:0;
1147 for( Int ib = startIdx; ib < Lcol.size(); ib++ ){
1148 if( Lcol[ib].blockIdx > snode.Index &&
1149 (Lcol[ib].blockIdx % this->grid_->numProcCol) == dstCol ){
1154 serialize( (Int)count, sstm, NO_MASK );
1156 for( Int ib = startIdx; ib < Lcol.size(); ib++ ){
1157 if( Lcol[ib].blockIdx > snode.Index &&
1158 (Lcol[ib].blockIdx % this->grid_->numProcCol) == dstCol ){
1159 #if ( _DEBUGlevel_ >= 1 )
1160 statusOFS <<
"["<<snode.Index<<
"] SEND contains "<<Lcol[ib].blockIdx
1161 <<
" which corresponds to "<<
GBj(ib,this->grid_)
1165 serialize( Lcol[ib], sstm, mask );
1169 sstrLcolSend.resize( Size(sstm) );
1170 sstm.read( &sstrLcolSend[0], sstrLcolSend.size() );
1171 sstrSizeL = sstrLcolSend.size();
1175 MPI_Isend( &sstrSizeL, 1, MPI_INT, dest,
1176 IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_L_SIZE_CD),
1177 this->grid_->comm, &mpiReqSizeSendL );
1178 MPI_Isend( (
void*)&sstrLcolSend[0], sstrSizeL, MPI_BYTE, dest,
1179 IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_L_CONTENT_CD),
1180 this->grid_->comm, &mpiReqSendL );
1185 arrSstrUcolSizeRecvCD[sendOffset[supidx]+recvIdxU];
1186 MPI_Request & mpiReqSizeRecvU =
1187 arrMpiReqsSizeRecvUCD[sendOffset[supidx]+recvIdxU];
1189 MPI_Irecv( &sstrSizeU, 1, MPI_INT, dest,
1190 IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_U_SIZE_CD),
1191 this->grid_->comm, &mpiReqSizeRecvU );
1197 TIMER_STOP(Send_L_Recv_Size_U_CrossDiag);
1202 for (Int supidx=0; supidx<stepSuper; supidx++){
1204 TIMER_START(Send_U_Recv_Size_L_CrossDiag);
1206 if(
MYROW( this->grid_ ) ==
PROW( snode.Index, this->grid_ )
1207 && this->isRecvFromCrossDiagonal_(this->grid_->numProcRow, snode.Index ) ){
1210 for(Int srcRow = 0; srcRow<this->grid_->numProcRow; srcRow++){
1211 if(this->isRecvFromCrossDiagonal_(srcRow,snode.Index) ){
1212 Int src =
PNUM(srcRow,
PCOL(snode.Index,this->grid_),this->grid_);
1213 if(
MYPROC( this->grid_ ) != src ){
1216 arrSstrLrowSizeRecvCD[recvOffset[supidx]+recvIdxL];
1217 MPI_Request & mpiReqSizeRecvL =
1218 arrMpiReqsSizeRecvLCD[recvOffset[supidx]+recvIdxL];
1220 MPI_Irecv( &sstrSizeL, 1, MPI_INT, src,
1221 IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_L_SIZE_CD),
1222 this->grid_->comm, &mpiReqSizeRecvL );
1227 MPI_Request & mpiReqSizeSendU =
1228 arrMpiReqsSizeSendUCD[recvOffset[supidx]+sendIdxU];
1229 MPI_Request & mpiReqSendU =
1230 arrMpiReqsSendUCD[recvOffset[supidx]+sendIdxU];
1231 std::vector<char> & sstrUrowSend =
1232 arrSstrUrowSendCD[recvOffset[supidx]+sendIdxU];
1234 arrSstrUrowSizeSendCD[recvOffset[supidx]+sendIdxU];
1237 std::stringstream sstm;
1238 std::vector<Int> mask( UBlockMask::TOTAL_NUMBER, 1 );
1239 std::vector<UBlock<T> >& Urow = this->U(
LBi(snode.Index, this->grid_) );
1244 for( Int jb = 0; jb < Urow.size(); jb++ ){
1245 if( Urow[jb].blockIdx > snode.Index
1246 && (Urow[jb].blockIdx % this->grid_->numProcRow) == srcRow ){
1251 serialize( (Int)count, sstm, NO_MASK );
1253 for( Int jb = 0; jb < Urow.size(); jb++ ){
1254 if( Urow[jb].blockIdx > snode.Index
1255 && (Urow[jb].blockIdx % this->grid_->numProcRow) == srcRow ){
1256 #if ( _DEBUGlevel_ >= 1 )
1257 statusOFS <<
"["<<snode.Index<<
"] SEND contains "<<Urow[jb].blockIdx
1258 <<
" which corresponds to "<<
GBi(jb,this->grid_)
1262 serialize( Urow[jb], sstm, mask );
1266 sstrUrowSend.resize( Size(sstm) );
1267 sstm.read( &sstrUrowSend[0], sstrUrowSend.size() );
1268 sstrSizeU = sstrUrowSend.size();
1271 MPI_Isend( &sstrSizeU, 1, MPI_INT, src,
1272 IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_U_SIZE_CD),
1273 this->grid_->comm, &mpiReqSizeSendU );
1274 MPI_Isend( (
void*)&sstrUrowSend[0], sstrSizeU, MPI_BYTE, src,
1275 IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_U_CONTENT_CD),
1276 this->grid_->comm, &mpiReqSendU );
1282 TIMER_STOP(Send_U_Recv_Size_L_CrossDiag);
1286 TIMER_START(Wait_Size_L_CrossDiag);
1287 mpi::Waitall(arrMpiReqsSizeRecvLCD);
1288 TIMER_STOP(Wait_Size_L_CrossDiag);
1293 for (Int supidx=0; supidx<stepSuper; supidx++){
1296 TIMER_START(Recv_L_CrossDiag);
1297 if(
MYROW( this->grid_ ) ==
PROW( snode.Index, this->grid_ )
1298 && this->isRecvFromCrossDiagonal_(this->grid_->numProcRow, snode.Index ) ){
1300 for(Int srcRow = 0; srcRow<this->grid_->numProcRow; srcRow++){
1301 if(this->isRecvFromCrossDiagonal_(srcRow,snode.Index) ){
1302 Int src =
PNUM(srcRow,
PCOL(snode.Index,this->grid_),this->grid_);
1303 if(
MYPROC( this->grid_ ) != src ){
1305 arrSstrLrowSizeRecvCD[recvOffset[supidx]+recvIdxL];
1306 std::vector<char> & sstrLrowRecv =
1307 arrSstrLrowRecvCD[recvOffset[supidx]+recvIdxL];
1308 MPI_Request & mpiReqRecvL =
1309 arrMpiReqsRecvLCD[recvOffset[supidx]+recvIdxL];
1310 sstrLrowRecv.resize( sstrSizeL);
1312 MPI_Irecv( (
void*)&sstrLrowRecv[0], sstrSizeL, MPI_BYTE, src,
1313 IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_L_CONTENT_CD),
1314 this->grid_->comm, &mpiReqRecvL );
1321 TIMER_STOP(Recv_L_CrossDiag);
1325 TIMER_START(Wait_Size_U_CrossDiag);
1326 mpi::Waitall(arrMpiReqsSizeRecvUCD);
1327 TIMER_STOP(Wait_Size_U_CrossDiag);
1330 for (Int supidx=0; supidx<stepSuper; supidx++){
1332 TIMER_START(Recv_U_CrossDiag);
1333 if(
MYCOL( this->grid_ ) ==
PCOL( snode.Index, this->grid_ )
1334 && this->isSendToCrossDiagonal_(this->grid_->numProcCol, snode.Index ) ){
1336 for(Int srcCol = 0; srcCol<this->grid_->numProcCol; srcCol++){
1337 if(this->isSendToCrossDiagonal_(srcCol,snode.Index) ){
1338 Int src =
PNUM(
PROW(snode.Index,this->grid_),srcCol,this->grid_);
1339 if(
MYPROC( this->grid_ ) != src ){
1341 arrSstrUcolSizeRecvCD[sendOffset[supidx]+recvIdxU];
1342 std::vector<char> & sstrUcolRecv =
1343 arrSstrUcolRecvCD[sendOffset[supidx]+recvIdxU];
1344 MPI_Request & mpiReqRecvU =
1345 arrMpiReqsRecvUCD[sendOffset[supidx]+recvIdxU];
1346 sstrUcolRecv.resize( sstrSizeU );
1348 MPI_Irecv( (
void*)&sstrUcolRecv[0], sstrSizeU, MPI_BYTE, src,
1349 IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_U_CONTENT_CD),
1350 this->grid_->comm, &mpiReqRecvU );
1356 TIMER_STOP(Recv_U_CrossDiag);
1360 TIMER_START(Wait_L_CrossDiag);
1361 mpi::Waitall(arrMpiReqsRecvLCD);
1362 TIMER_STOP(Wait_L_CrossDiag);
1368 for (Int supidx=0; supidx<stepSuper; supidx++){
1370 Int ksupProcRow =
PROW( snode.Index, this->grid_ );
1371 Int ksupProcCol =
PCOL( snode.Index, this->grid_ );
1373 if(
MYROW( this->grid_ ) ==
PROW( snode.Index, this->grid_ ) &&
1374 this->isRecvFromCrossDiagonal_(this->grid_->numProcRow, snode.Index ) ){
1376 #if ( _DEBUGlevel_ >= 1 )
1377 statusOFS << std::endl <<
" ["<<snode.Index<<
"] "
1378 <<
"Update the upper triangular block"
1379 << std::endl << std::endl;
1380 statusOFS << std::endl <<
" ["<<snode.Index<<
"] "
1381 <<
"blockIdxLocal:" << snode.BlockIdxLocal
1382 << std::endl << std::endl;
1383 statusOFS << std::endl <<
" ["<<snode.Index<<
"] "
1384 <<
"rowLocalPtr:" << snode.RowLocalPtr
1385 << std::endl << std::endl;
1388 std::vector<LBlock<T> >& Lcol = this->L(
LBj(snode.Index, this->grid_) );
1389 std::vector<LBlock<T> >& Lrow = this->Lrow(
LBi( snode.Index, this->grid_ ) );
1396 std::vector<Int> isBlockFound(Lrow.size(),
false);
1399 for(Int srcRow = 0; srcRow<this->grid_->numProcRow; srcRow++){
1400 if(this->isRecvFromCrossDiagonal_(srcRow,snode.Index) ){
1401 Int src =
PNUM(srcRow,
PCOL(snode.Index,this->grid_),this->grid_);
1402 TIMER_START(Recv_L_CrossDiag);
1405 std::vector<LBlock<T> > * pLcol;
1406 std::vector<LBlock<T> > LcolRecv;
1407 if(
MYPROC(this->grid_)!= src){
1409 arrSstrLrowSizeRecvCD[recvOffset[supidx]+recvIdxL];
1410 std::vector<char> & sstrLrowRecv =
1411 arrSstrLrowRecvCD[recvOffset[supidx]+recvIdxL];
1413 std::stringstream sstm;
1415 #if ( _DEBUGlevel_ >= 1 )
1416 statusOFS <<
"["<<snode.Index<<
"] P"<<
MYPROC(this->grid_)<<
" ("<<
MYROW(this->grid_)
1417 <<
","<<
MYCOL(this->grid_)<<
") <--- LBj("<<snode.Index<<
") <--- P"
1418 << src <<
" ("<<srcRow<<
","<<ksupProcCol<<
")"
1421 sstm.write( &sstrLrowRecv[0], sstrSizeL );
1425 std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
1426 deserialize( numLBlock, sstm, NO_MASK );
1427 LcolRecv.resize(numLBlock);
1428 for( Int ib = 0; ib < numLBlock; ib++ ){
1429 deserialize( LcolRecv[ib], sstm, mask );
1430 #if ( _DEBUGlevel_ >= 1 )
1431 statusOFS <<
"["<<snode.Index<<
"] RECV contains "
1432 << LcolRecv[ib].blockIdx<<
" which corresponds to "
1433 << ib * this->grid_->numProcRow + srcRow;
1434 statusOFS <<
" L is on row "<< srcRow
1435 <<
" whereas Lrow is on col "
1436 << LcolRecv[ib].blockIdx % this->grid_->numProcCol
1449 TIMER_STOP(Recv_L_CrossDiag);
1454 Int startIdx = (
MYPROC( this->grid_ ) == src &&
MYPROC(this->grid_) ==
PNUM(ksupProcRow,ksupProcCol, this->grid_) ) ? 1:0;
1455 for( Int ib = startIdx; ib < pLcol->size(); ib++ ){
1458 ErrorHandling(
"LcolRecv contains the wrong blocks." );
1469 for( Int iib = 0; iib < Lrow.size(); iib++ ){
1490 #if ( _DEBUGlevel_ >= 1 )
1491 statusOFS<<
"["<<snode.Index<<
"] USING LB "<<LB.
blockIdx<< std::endl;
1493 isBlockFound[iib] = 1;
1501 for( Int ib = 0; ib < Lrow.size(); ib++ ){
1503 if( !isBlockFound[ib] ){
1505 "LBlock cannot find its update. Something is seriously wrong."
1513 TIMER_START(Wait_U_CrossDiag);
1514 mpi::Waitall(arrMpiReqsRecvUCD);
1515 TIMER_STOP(Wait_U_CrossDiag);
1519 for (Int supidx=0; supidx<stepSuper; supidx++){
1521 Int ksupProcRow =
PROW( snode.Index, this->grid_ );
1522 Int ksupProcCol =
PCOL( snode.Index, this->grid_ );
1525 if(
MYCOL( this->grid_ ) ==
PCOL( snode.Index, this->grid_ ) &&
1526 this->isSendToCrossDiagonal_(this->grid_->numProcCol, snode.Index ) ){
1528 std::vector<UBlock<T> >& Ucol = this->Ucol(
LBj( snode.Index, this->grid_ ) );
1535 std::vector<Int> isBlockFound(Ucol.size(),
false);
1538 for(Int srcCol = 0; srcCol<this->grid_->numProcCol; srcCol++){
1539 if(this->isSendToCrossDiagonal_(srcCol,snode.Index) ){
1540 Int src =
PNUM(
PROW(snode.Index,this->grid_),srcCol,this->grid_);
1541 TIMER_START(Recv_U_CrossDiag);
1544 std::vector<UBlock<T> > * pUrow;
1545 std::vector<UBlock<T> > UrowRecv;
1546 if(
MYPROC(this->grid_)!= src){
1548 arrSstrUcolSizeRecvCD[sendOffset[supidx]+recvIdxU];
1549 std::vector<char> & sstrUcolRecv =
1550 arrSstrUcolRecvCD[sendOffset[supidx]+recvIdxU];
1553 std::stringstream sstm;
1555 #if ( _DEBUGlevel_ >= 1 )
1556 statusOFS <<
"["<<snode.Index<<
"] P"<<
MYPROC(this->grid_)<<
" ("<<
MYROW(this->grid_)
1557 <<
","<<
MYCOL(this->grid_)<<
") <--- LBi("<<snode.Index<<
") <--- P"
1558 << src<<
" ("<<ksupProcCol<<
","<<srcCol<<
")"<<std::endl;
1560 sstm.write( &sstrUcolRecv[0], sstrSizeU );
1564 std::vector<Int> mask( UBlockMask::TOTAL_NUMBER, 1 );
1565 deserialize( numUBlock, sstm, NO_MASK );
1566 UrowRecv.resize(numUBlock);
1567 for( Int jb = 0; jb < numUBlock; jb++ ){
1568 deserialize( UrowRecv[jb], sstm, mask );
1569 #if ( _DEBUGlevel_ >= 1 )
1570 statusOFS <<
"["<<snode.Index<<
"] RECV contains "
1571 << UrowRecv[jb].blockIdx<<
" which corresponds to "
1572 << jb * this->grid_->numProcCol + srcCol;
1573 statusOFS <<
" Urow is on col "<< srcCol
1574 <<
" whereas U is on row "
1575 << UrowRecv[jb].blockIdx % this->grid_->numProcRow
1584 std::vector<UBlock<T> >& Urow = this->U(
LBi( snode.Index, this->grid_ ) );
1589 TIMER_STOP(Recv_U_CrossDiag);
1592 for( Int jb = 0; jb < pUrow->size(); jb++ ){
1595 ErrorHandling(
"UrowRecv contains the wrong blocks." );
1605 for( Int jjb = 0; jjb < Ucol.size(); jjb++ ){
1623 #if ( _DEBUGlevel_ >= 1 )
1624 statusOFS<<
"["<<snode.Index<<
"] USING UB "<<UB.
blockIdx<< std::endl;
1626 isBlockFound[jjb] = 1;
1635 for( Int jb = 0; jb < Ucol.size(); jb++ ){
1637 if( !isBlockFound[jb] ){
1639 "UBlock cannot find its update. Something is seriously wrong."
1647 TIMER_STOP(Send_CD);
1649 mpi::Waitall(arrMpiReqsSizeSendLCD);
1650 mpi::Waitall(arrMpiReqsSendLCD);
1652 mpi::Waitall(arrMpiReqsSizeSendUCD);
1653 mpi::Waitall(arrMpiReqsSendUCD);
1658 template<
typename T>
1668 TIMER_START(Unpack_data);
1673 #if ( _DEBUGlevel_ >= 1 )
1674 statusOFS << std::endl <<
"["<<snode.Index<<
"] "<<
"Unpack the received data for processors participate in Gemm. " << std::endl << std::endl;
1677 if(
MYROW( this->grid_ ) !=
PROW( snode.Index, this->grid_ ) ){
1678 std::stringstream sstm;
1679 sstm.write( &snode.SstrLrowRecv[0], snode.SstrLrowRecv.size() );
1680 std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
1682 deserialize( numLBlock, sstm, NO_MASK );
1683 LrowRecv.resize( numLBlock );
1684 for( Int ib = 0; ib < numLBlock; ib++ ){
1685 deserialize( LrowRecv[ib], sstm, mask );
1688 #if ( _DEBUGlevel_ >= 1 )
1689 statusOFS<<
"["<<snode.Index<<
"] "<<
"Lrow RECV "<<std::endl;
1690 for(Int ib=0;ib<LrowRecv.size();++ib){statusOFS<<LrowRecv[ib].blockIdx<<
" ";}
1691 statusOFS<<std::endl;
1699 LrowRecv.resize(this->Lrow(
LBi( snode.Index, this->grid_ ) ).size());
1700 std::copy(this->Lrow(
LBi( snode.Index, this->grid_ ) ).begin(),this->Lrow(
LBi( snode.Index, this->grid_ )).end(),LrowRecv.begin());
1705 if(
MYCOL( this->grid_ ) !=
PCOL( snode.Index, this->grid_ ) ){
1706 std::stringstream sstm;
1707 sstm.write( &snode.SstrLcolRecv[0], snode.SstrLcolRecv.size() );
1708 std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
1709 mask[LBlockMask::NZVAL] = 0;
1711 deserialize( numLBlock, sstm, NO_MASK );
1712 LcolRecv.resize( numLBlock );
1713 for( Int ib = 0; ib < numLBlock; ib++ ){
1714 deserialize( LcolRecv[ib], sstm, mask );
1718 #if ( _DEBUGlevel_ >= 1 )
1719 statusOFS<<
"["<<snode.Index<<
"] "<<
"Lcol RECV "<<std::endl;
1720 for(Int ib=0;ib<LcolRecv.size();++ib){statusOFS<<LcolRecv[ib].blockIdx<<
" ";}
1721 statusOFS<<std::endl;
1728 std::vector<LBlock<T> >& Lcol = this->L(
LBj( snode.Index, this->grid_ ) );
1729 Int startIdx = (
MYROW( this->grid_ ) ==
PROW( snode.Index, this->grid_ ) )?1:0;
1730 LcolRecv.resize( Lcol.size() - startIdx );
1731 std::copy(Lcol.begin()+startIdx,Lcol.end(),LcolRecv.begin());
1735 if(
MYROW( this->grid_ ) !=
PROW( snode.Index, this->grid_ ) ){
1736 std::stringstream sstm;
1737 sstm.write( &snode.SstrUrowRecv[0], snode.SstrUrowRecv.size() );
1738 std::vector<Int> mask( UBlockMask::TOTAL_NUMBER, 1 );
1739 mask[UBlockMask::NZVAL] = 0;
1741 deserialize( numUBlock, sstm, NO_MASK );
1742 UrowRecv.resize( numUBlock );
1743 for( Int jb = 0; jb < numUBlock; jb++ ){
1744 deserialize( UrowRecv[jb], sstm, mask );
1747 #if ( _DEBUGlevel_ >= 1 )
1748 statusOFS<<
"["<<snode.Index<<
"] "<<
"Urow RECV "<<std::endl;
1749 for(Int ib=0;ib<UrowRecv.size();++ib){statusOFS<<UrowRecv[ib].blockIdx<<
" ";}
1750 statusOFS<<std::endl;
1758 UrowRecv.resize(this->U(
LBi( snode.Index, this->grid_ ) ).size());
1759 std::copy(this->U(
LBi( snode.Index, this->grid_ ) ).begin(),this->U(
LBi( snode.Index, this->grid_ )).end(),UrowRecv.begin());
1764 if(
MYCOL( this->grid_ ) !=
PCOL( snode.Index, this->grid_ ) ){
1765 std::stringstream sstm;
1766 sstm.write( &snode.SstrUcolRecv[0], snode.SstrUcolRecv.size() );
1767 std::vector<Int> mask( UBlockMask::TOTAL_NUMBER, 1 );
1769 deserialize( numUBlock, sstm, NO_MASK );
1770 UcolRecv.resize( numUBlock );
1771 for( Int jb = 0; jb < numUBlock; jb++ ){
1772 deserialize( UcolRecv[jb], sstm, mask );
1775 #if ( _DEBUGlevel_ >= 1 )
1776 statusOFS<<
"["<<snode.Index<<
"] "<<
"Ucol RECV "<<std::endl;
1777 for(Int ib=0;ib<UcolRecv.size();++ib){statusOFS<<UcolRecv[ib].blockIdx<<
" ";}
1778 statusOFS<<std::endl;
1784 std::vector<UBlock<T> >& Ucol = this->Ucol(
LBj( snode.Index, this->grid_ ) );
1785 UcolRecv.resize( Ucol.size() );
1786 std::copy(Ucol.begin(),Ucol.end(),UcolRecv.begin());
1789 TIMER_STOP(Unpack_data);
1793 template<
typename T>
1797 TIMER_START(ComputeDiagUpdate);
1800 if(
MYCOL( this->grid_ ) ==
PCOL( snode.Index, this->grid_ ) ){
1801 #if ( _DEBUGlevel_ >= 1 )
1802 statusOFS << std::endl <<
"["<<snode.Index<<
"] "
1803 <<
"Updating the diagonal block" << std::endl << std::endl;
1805 std::vector<LBlock<T> >& Lcol = this->L(
LBj( snode.Index, this->grid_ ) );
1806 std::vector<UBlock<T> >& Ucol = this->Ucol(
LBj( snode.Index, this->grid_ ) );
1809 snode.DiagBuf.Resize(
SuperSize( snode.Index, this->super_ ),
1810 SuperSize( snode.Index, this->super_ ));
1811 SetValue(snode.DiagBuf, ZERO<T>());
1814 Int startIb = (
MYROW( this->grid_ ) ==
PROW( snode.Index, this->grid_ ))?1:0;
1815 for( Int ib = startIb; ib < Lcol.size(); ib++ ){
1819 for(jb = 0; jb < Ucol.size(); ++jb){
1820 if(Ucol[jb].blockIdx == LcolB.
blockIdx){
1826 assert(jb < Ucol.size());
1830 if(1 || (LcolB.
numRow>0 && LcolB.
numCol>0 && UcolB.numRow>0 && UcolB.numCol>0)){
1832 blas::Gemm(
'N',
'N', snode.DiagBuf.m(), snode.DiagBuf.n(),
1833 LcolB.
numRow, MINUS_ONE<T>(),
1834 UcolB.nzval.Data(), UcolB.nzval.m(),
1835 &snode.LUpdateBuf( snode.RowLocalPtr[ib-startIb], 0 ),
1836 snode.LUpdateBuf.m(),
1837 ONE<T>(), snode.DiagBuf.Data(), snode.DiagBuf.m() );
1841 #if ( _DEBUGlevel_ >= 1 )
1842 statusOFS << std::endl <<
"["<<snode.Index<<
"] "
1843 <<
"Updated the diagonal block" << std::endl << std::endl;
1846 TIMER_STOP(ComputeDiagUpdate);
1856 template<
typename T>
1860 #if defined (PROFILE) || defined(PMPI) || defined(USE_TAU)
1861 Real begin_SendULWaitContentFirst, end_SendULWaitContentFirst, time_SendULWaitContentFirst = 0;
1864 std::vector<std::vector<Int> > & superList = this->WorkingSet();
1865 Int numSteps = superList.size();
1866 Int stepSuper = superList[lidx].size();
1868 TIMER_START(AllocateBuffer);
1871 std::vector<std::vector<MPI_Request> > arrMpireqsSendLToBelow;
1872 arrMpireqsSendLToBelow.resize( stepSuper, std::vector<MPI_Request>( 2 * this->grid_->numProcRow, MPI_REQUEST_NULL ));
1873 std::vector<std::vector<MPI_Request> > arrMpireqsSendLToRight;
1874 arrMpireqsSendLToRight.resize(stepSuper, std::vector<MPI_Request>( 2 * this->grid_->numProcCol, MPI_REQUEST_NULL ));
1876 std::vector<std::vector<MPI_Request> > arrMpireqsSendUToBelow;
1877 arrMpireqsSendUToBelow.resize( stepSuper, std::vector<MPI_Request>( 2 * this->grid_->numProcRow, MPI_REQUEST_NULL ));
1878 std::vector<std::vector<MPI_Request> > arrMpireqsSendUToRight;
1879 arrMpireqsSendUToRight.resize(stepSuper, std::vector<MPI_Request>( 2 * this->grid_->numProcCol, MPI_REQUEST_NULL ));
1882 std::vector<MPI_Request> arrMpireqsSendLToLeft;
1883 arrMpireqsSendLToLeft.resize(stepSuper, MPI_REQUEST_NULL );
1886 std::vector<MPI_Request> arrMpireqsSendUToAbove;
1887 arrMpireqsSendUToAbove.resize(stepSuper, MPI_REQUEST_NULL );
1890 std::vector<MPI_Request> arrMpireqsSendToAbove;
1891 arrMpireqsSendToAbove.resize(stepSuper, MPI_REQUEST_NULL );
1896 std::vector<MPI_Request> arrMpireqsRecvLSizeFromAny;
1897 arrMpireqsRecvLSizeFromAny.resize(stepSuper*2 , MPI_REQUEST_NULL);
1898 std::vector<MPI_Request> arrMpireqsRecvLContentFromAny;
1899 arrMpireqsRecvLContentFromAny.resize(stepSuper*2 , MPI_REQUEST_NULL);
1901 std::vector<MPI_Request> arrMpireqsRecvUSizeFromAny;
1902 arrMpireqsRecvUSizeFromAny.resize(stepSuper*2 , MPI_REQUEST_NULL);
1903 std::vector<MPI_Request> arrMpireqsRecvUContentFromAny;
1904 arrMpireqsRecvUContentFromAny.resize(stepSuper*2 , MPI_REQUEST_NULL);
1910 std::vector<SuperNodeBufferTypeUnsym> arrSuperNodes(stepSuper);
1911 for (Int supidx=0; supidx<stepSuper; supidx++){
1912 arrSuperNodes[supidx].Index = superList[lidx][supidx];
1917 TIMER_STOP(AllocateBuffer);
1921 Int next_lidx = lidx+1;
1928 Int next_lidx = lidx;
1931 bool doSend =
false;
1932 for (Int supidx=0; supidx<superList[next_lidx].size(); supidx++){
1933 Int snode_index = superList[next_lidx][supidx];
1934 if(
MYROW( this->grid_ ) ==
PROW( snode_index, this->grid_ ) ){
1935 std::vector<LBlock<T> >& Lrow = this->Lrow(
LBi(snode_index, this->grid_) );
1936 Int& LrowSize = this->LrowSize_[
LBi(snode_index, this->grid_) ];
1937 if(Lrow.size()!=LrowSize){
1943 if(
MYCOL( this->grid_ ) ==
PCOL( snode_index, this->grid_ ) ){
1944 std::vector<UBlock<T> >& Ucol = this->Ucol(
LBj(snode_index, this->grid_) );
1945 Int& UcolSize = this->UcolSize_[
LBj(snode_index, this->grid_) ];
1946 if(Ucol.size()!=UcolSize){
1956 SendRecvSizesCD(superList[next_lidx], superList[next_lidx].size(),buffers);
1957 IRecvContentCD(superList[next_lidx], superList[next_lidx].size(),buffers);
1958 WaitContentLCD(superList[next_lidx], superList[next_lidx].size(),buffers);
1959 WaitContentUCD(superList[next_lidx], superList[next_lidx].size(),buffers);
1960 buffers.WaitAllSend();
1965 if(next_lidx < superList.size()){
1967 for (Int supidx=0; supidx<superList[next_lidx].size(); supidx++){
1968 Int snode_index = superList[next_lidx][supidx];
1969 if(
MYROW( this->grid_ ) ==
PROW( snode_index, this->grid_ ) ){
1970 std::vector<LBlock<T> >& Lrow = this->Lrow(
LBi(snode_index, this->grid_) );
1971 Int& LrowSize = this->LrowSize_[
LBi(snode_index, this->grid_) ];
1975 if(
MYCOL( this->grid_ ) ==
PCOL( snode_index, this->grid_ ) ){
1976 std::vector<UBlock<T> >& Ucol = this->Ucol(
LBj(snode_index, this->grid_) );
1977 Int& UcolSize = this->UcolSize_[
LBj(snode_index, this->grid_) ];
1982 SendRecvSizesCD(superList[next_lidx],superList[next_lidx].size(),nextCDBuffers);
1986 #if ( _DEBUGlevel_ >= 1 )
1987 statusOFS << std::endl <<
"Communication to the Schur complement." << std::endl << std::endl;
1991 for (Int supidx=0; supidx<stepSuper; supidx++){
1993 std::vector<MPI_Request> & mpireqsSendLToBelow = arrMpireqsSendLToBelow[supidx];
1994 std::vector<MPI_Request> & mpireqsSendLToRight = arrMpireqsSendLToRight[supidx];
1995 std::vector<MPI_Request> & mpireqsSendUToBelow = arrMpireqsSendUToBelow[supidx];
1996 std::vector<MPI_Request> & mpireqsSendUToRight = arrMpireqsSendUToRight[supidx];
1998 #if ( _DEBUGlevel_ >= 1 )
1999 statusOFS << std::endl <<
"["<<snode.Index<<
"] "
2000 <<
"Communication for the Lrow part." << std::endl << std::endl;
2003 if(
MYROW( this->grid_ ) ==
PROW( snode.Index, this->grid_ ) ){
2005 TIMER_START(Serialize_LrowL);
2006 std::stringstream sstm;
2007 std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
2008 std::vector<LBlock<T> >& Lrow = this->Lrow(
LBi(snode.Index, this->grid_) );
2009 std::vector<UBlock<T> >& Urow = this->U(
LBi(snode.Index, this->grid_) );
2013 serialize( (Int)Lrow.size(), sstm, NO_MASK );
2014 for( Int ib = 0; ib < Lrow.size(); ib++ ){
2015 assert( Lrow[ib].blockIdx > snode.Index );
2016 serialize( Lrow[ib], sstm, mask );
2018 snode.SstrLrowSend.resize( Size( sstm ) );
2019 sstm.read( &snode.SstrLrowSend[0], snode.SstrLrowSend.size() );
2020 snode.SizeSstrLrowSend = snode.SstrLrowSend.size();
2021 TIMER_STOP(Serialize_LrowL);
2023 for( Int iProcRow = 0; iProcRow < this->grid_->numProcRow; iProcRow++ ){
2024 if(
MYROW( this->grid_ ) != iProcRow &&
2025 this->isSendToBelow_( iProcRow,snode.Index ) ==
true ){
2028 MPI_Isend( &snode.SizeSstrLrowSend, 1, MPI_INT,
2029 iProcRow, IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_LROW_SIZE), this->grid_->colComm, &mpireqsSendLToBelow[2*iProcRow] );
2030 MPI_Isend( (
void*)&snode.SstrLrowSend[0], snode.SizeSstrLrowSend, MPI_BYTE,
2031 iProcRow, IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_LROW_CONTENT),
2032 this->grid_->colComm, &mpireqsSendLToBelow[2*iProcRow+1] );
2033 #if ( _DEBUGlevel_ >= 1 )
2034 statusOFS<<
"["<<snode.Index<<
"] "<<
"Lrow SENT "<<std::endl;
2035 for(Int ib=0;ib<Lrow.size();++ib){statusOFS<<Lrow[ib].blockIdx<<
" ";}
2036 statusOFS<<std::endl;
2037 statusOFS <<
"["<<snode.Index<<
"] "<<
"Sending Lrow "
2038 << snode.SizeSstrLrowSend <<
" BYTES to P" <<
PNUM(iProcRow,
MYCOL(this->grid_),this->grid_)
2039 << std::endl << std::endl;
2045 #if ( _DEBUGlevel_ >= 1 )
2046 statusOFS << std::endl <<
"["<<snode.Index<<
"] "<<
"Communication for the L part." << std::endl << std::endl;
2049 if(
MYCOL( this->grid_ ) ==
PCOL( snode.Index, this->grid_ ) ){
2051 TIMER_START(Serialize_LcolL);
2053 std::stringstream sstm;
2054 std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
2055 mask[LBlockMask::NZVAL] = 0;
2057 std::vector<LBlock<T> >& Lcol = this->L(
LBj(snode.Index, this->grid_) );
2061 Int startIdx = (
MYROW( this->grid_ ) ==
PROW( snode.Index, this->grid_ ) )?1:0;
2062 serialize( (Int)Lcol.size() - startIdx, sstm, NO_MASK );
2063 for( Int ib = startIdx; ib < Lcol.size(); ib++ ){
2064 assert( Lcol[ib].blockIdx > snode.Index );
2066 #if ( _DEBUGlevel_ >= 2 )
2069 serialize( Lcol[ib], sstm, mask );
2071 snode.SstrLcolSend.resize( Size( sstm ) );
2072 sstm.read( &snode.SstrLcolSend[0], snode.SstrLcolSend.size() );
2073 snode.SizeSstrLcolSend = snode.SstrLcolSend.size();
2074 TIMER_STOP(Serialize_LcolL);
2076 for( Int iProcCol = 0; iProcCol < this->grid_->numProcCol ; iProcCol++ ){
2077 if(
MYCOL( this->grid_ ) != iProcCol &&
2078 this->isSendToRight_( iProcCol, snode.Index ) ==
true ){
2080 MPI_Isend( &snode.SizeSstrLcolSend, 1, MPI_INT,
2081 iProcCol, IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_L_SIZE),
2082 this->grid_->rowComm, &mpireqsSendLToRight[2*iProcCol] );
2083 MPI_Isend( (
void*)&snode.SstrLcolSend[0], snode.SizeSstrLcolSend, MPI_BYTE,
2084 iProcCol, IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_L_CONTENT),
2085 this->grid_->rowComm, &mpireqsSendLToRight[2*iProcCol+1] );
2086 #if ( _DEBUGlevel_ >= 1 )
2087 statusOFS<<
"["<<snode.Index<<
"] "<<
"L SENT "<<std::endl;
2088 for(Int ib=startIdx;ib<Lcol.size();++ib){statusOFS<<Lcol[ib].blockIdx<<
" ";}
2089 statusOFS<<std::endl;
2090 statusOFS <<
"["<<snode.Index<<
"] "<<
"Sending L "
2091 << snode.SizeSstrLcolSend <<
" BYTES to P" <<
PNUM(
MYROW(this->grid_),iProcCol,this->grid_)
2092 << std::endl << std::endl;
2098 #if ( _DEBUGlevel_ >= 1 )
2099 statusOFS << std::endl <<
"["<<snode.Index<<
"] "
2100 <<
"Communication for the U part." << std::endl << std::endl;
2103 if(
MYROW( this->grid_ ) ==
PROW( snode.Index, this->grid_ ) ){
2105 TIMER_START(Serialize_UcolU);
2107 std::stringstream sstm;
2108 std::vector<Int> mask( UBlockMask::TOTAL_NUMBER, 1 );
2109 mask[UBlockMask::NZVAL] = 0;
2111 std::vector<UBlock<T> >& Urow = this->U(
LBi(snode.Index, this->grid_) );
2114 serialize( (Int)Urow.size(), sstm, NO_MASK );
2115 for( Int jb = 0; jb < Urow.size(); jb++ ){
2116 assert( Urow[jb].blockIdx > snode.Index );
2117 #if ( _DEBUGlevel_ >= 2 )
2120 serialize( Urow[jb], sstm, mask );
2122 snode.SstrUrowSend.resize( Size( sstm ) );
2123 sstm.read( &snode.SstrUrowSend[0], snode.SstrUrowSend.size() );
2124 snode.SizeSstrUrowSend = snode.SstrUrowSend.size();
2125 TIMER_STOP(Serialize_UcolU);
2127 for( Int iProcRow = 0; iProcRow < this->grid_->numProcRow; iProcRow++ ){
2128 if(
MYROW( this->grid_ ) != iProcRow &&
2129 this->isSendToBelow_( iProcRow,snode.Index ) ==
true ){
2131 MPI_Isend( &snode.SizeSstrUrowSend, 1, MPI_INT,
2132 iProcRow, IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_U_SIZE), this->grid_->colComm, &mpireqsSendUToBelow[2*iProcRow] );
2133 MPI_Isend( (
void*)&snode.SstrLrowSend[0], snode.SizeSstrUrowSend, MPI_BYTE,
2134 iProcRow, IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_U_CONTENT),
2135 this->grid_->colComm, &mpireqsSendUToBelow[2*iProcRow+1] );
2136 #if ( _DEBUGlevel_ >= 1 )
2137 statusOFS << std::endl <<
"["<<snode.Index<<
"] "<<
"Sending U " << snode.SizeSstrUrowSend <<
" BYTES"<< std::endl << std::endl;
2138 statusOFS<<
"["<<snode.Index<<
"] "<<
"U SENT "<<std::endl;
2139 for(Int ib=0;ib<Urow.size();++ib){statusOFS<<Urow[ib].blockIdx<<
" ";}
2140 statusOFS<<std::endl;
2141 statusOFS <<
"["<<snode.Index<<
"] "<<
"Sending U "
2142 << snode.SizeSstrUrowSend <<
" BYTES to P" <<
PNUM(iProcRow,
MYCOL(this->grid_),this->grid_)
2143 << std::endl << std::endl;
2149 #if ( _DEBUGlevel_ >= 1 )
2150 statusOFS <<
"["<<snode.Index<<
"] "
2151 <<
"Communication for the Ucol part." << std::endl
2155 if(
MYCOL( this->grid_ ) ==
PCOL( snode.Index, this->grid_ ) ){
2157 TIMER_START(Serialize_UcolU);
2158 std::stringstream sstm;
2159 std::vector<Int> mask( UBlockMask::TOTAL_NUMBER, 1 );
2160 std::vector<UBlock<T> >& Ucol =
2161 this->Ucol(
LBj(snode.Index, this->grid_) );
2163 serialize( (Int)Ucol.size(), sstm, NO_MASK );
2164 for( Int jb = 0; jb < Ucol.size(); jb++ ){
2166 assert( UB.
blockIdx > snode.Index );
2167 serialize( UB, sstm, mask );
2169 snode.SstrUcolSend.resize( Size( sstm ) );
2170 sstm.read( &snode.SstrUcolSend[0], snode.SstrUcolSend.size() );
2171 snode.SizeSstrUcolSend = snode.SstrUcolSend.size();
2172 TIMER_STOP(Serialize_UcolU);
2174 for( Int iProcCol = 0;
2175 iProcCol < this->grid_->numProcCol ; iProcCol++ ){
2176 if(
MYCOL( this->grid_ ) != iProcCol &&
2177 this->isSendToRight_( iProcCol, snode.Index ) ==
true ){
2179 MPI_Isend( &snode.SizeSstrUcolSend, 1, MPI_INT,
2180 iProcCol, IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_UCOL_SIZE),
2181 this->grid_->rowComm, &mpireqsSendUToRight[2*iProcCol] );
2182 MPI_Isend( &snode.SstrUcolSend[0],snode.SizeSstrUcolSend,
2184 IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_UCOL_CONTENT),
2185 this->grid_->rowComm,
2186 &mpireqsSendUToRight[2*iProcCol+1] );
2187 #if ( _DEBUGlevel_ >= 2 )
2188 statusOFS<<
"["<<snode.Index<<
"] "<<
"Ucol SENT "<<std::endl;
2189 for(Int ib=0;ib<Ucol.size();++ib){
2190 statusOFS<<Ucol[ib].blockIdx<<
" ";
2192 statusOFS<<std::endl;
2194 #if ( _DEBUGlevel_ >= 1 )
2195 statusOFS <<
"["<<snode.Index<<
"] "<<
"Sending Ucol "
2196 << snode.SizeSstrUcolSend <<
" BYTES to P"
2197 <<
PNUM(
MYROW(this->grid_),iProcCol,this->grid_)
2198 << std::endl << std::endl;
2211 TIMER_START(WaitContentLU);
2213 for (Int supidx=0; supidx<stepSuper ; supidx++){
2215 MPI_Request * mpireqsRecvLFromAbove =
2216 &arrMpireqsRecvLSizeFromAny[supidx*2];
2217 MPI_Request * mpireqsRecvLFromLeft =
2218 &arrMpireqsRecvLSizeFromAny[supidx*2+1];
2219 MPI_Request * mpireqsRecvUFromAbove =
2220 &arrMpireqsRecvUSizeFromAny[supidx*2];
2221 MPI_Request * mpireqsRecvUFromLeft =
2222 &arrMpireqsRecvUSizeFromAny[supidx*2+1];
2225 if( this->isRecvFromAbove_( snode.Index ) &&
2226 MYROW( this->grid_ ) !=
PROW( snode.Index, this->grid_ ) ){
2227 Int sender =
PROW( snode.Index, this->grid_ );
2228 MPI_Irecv( &snode.SizeSstrLrowRecv, 1, MPI_INT, sender,
2229 IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_LROW_SIZE),
2230 this->grid_->colComm, mpireqsRecvLFromAbove );
2231 #if ( _DEBUGlevel_ >= 1 )
2232 statusOFS <<
"["<<snode.Index<<
"] "<<
"Receiving Lrow"
2233 <<
" size on tag "<<IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_LROW_SIZE)
2234 <<
" from P" <<
PNUM(sender,
MYCOL(this->grid_),this->grid_)
2235 << std::endl << std::endl;
2237 MPI_Irecv( &snode.SizeSstrUrowRecv, 1, MPI_INT, sender,
2238 IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_U_SIZE),
2239 this->grid_->colComm, mpireqsRecvUFromAbove );
2240 #if ( _DEBUGlevel_ >= 1 )
2241 statusOFS <<
"["<<snode.Index<<
"] "<<
"Receiving U"
2242 <<
" size on tag "<<IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_U_SIZE)
2243 <<
" from P" <<
PNUM(sender,
MYCOL(this->grid_),this->grid_)
2244 << std::endl << std::endl;
2249 if( this->isRecvFromLeft_( snode.Index ) &&
2250 MYCOL( this->grid_ ) !=
PCOL( snode.Index, this->grid_ ) ){
2251 Int sender =
PCOL( snode.Index, this->grid_ );
2252 MPI_Irecv( &snode.SizeSstrLcolRecv, 1, MPI_INT, sender,
2253 IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_L_SIZE),
2254 this->grid_->rowComm, mpireqsRecvLFromLeft );
2255 #if ( _DEBUGlevel_ >= 1 )
2256 statusOFS <<
"["<<snode.Index<<
"] "<<
"Receiving L"
2257 <<
" size on tag "<<IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_L_SIZE)
2258 <<
" from P" <<
PNUM(
MYROW(this->grid_),sender,this->grid_)
2259 << std::endl << std::endl;
2261 MPI_Irecv( &snode.SizeSstrUcolRecv, 1, MPI_INT, sender,
2262 IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_UCOL_SIZE),
2263 this->grid_->rowComm, mpireqsRecvUFromLeft );
2264 #if ( _DEBUGlevel_ >= 1 )
2265 statusOFS <<
"["<<snode.Index<<
"] "<<
"Receiving Ucol"
2266 <<
" size on tag "<<IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_UCOL_SIZE)
2267 <<
" from P" <<
PNUM(
MYROW(this->grid_),sender,this->grid_)
2268 << std::endl << std::endl;
2272 TIMER_STOP(WaitContentLU);
2275 TIMER_START(WaitContentLU);
2276 TIMER_START(WaitSize_LrowL);
2277 mpi::Waitall(arrMpireqsRecvLSizeFromAny);
2278 TIMER_STOP(WaitSize_LrowL);
2281 for (Int supidx=0; supidx<stepSuper ; supidx++){
2284 MPI_Request * mpireqsRecvFromAbove =
2285 &arrMpireqsRecvLContentFromAny[supidx*2];
2286 MPI_Request * mpireqsRecvFromLeft =
2287 &arrMpireqsRecvLContentFromAny[supidx*2+1];
2289 TIMER_START(Alloc_Buffer_Recv_LrowL);
2290 if( this->isRecvFromAbove_( snode.Index ) &&
2291 MYROW( this->grid_ ) !=
PROW( snode.Index, this->grid_ ) ){
2292 snode.SstrLrowRecv.resize( snode.SizeSstrLrowRecv );
2293 Int sender =
PROW( snode.Index, this->grid_ );
2294 MPI_Irecv( &snode.SstrLrowRecv[0], snode.SizeSstrLrowRecv, MPI_BYTE,
2295 sender, IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_LROW_CONTENT),
2296 this->grid_->colComm, mpireqsRecvFromAbove );
2297 #if ( _DEBUGlevel_ >= 1 )
2298 statusOFS <<
"["<<snode.Index<<
"] "<<
"Receiving Lrow "
2299 << snode.SizeSstrLrowRecv <<
" BYTES from P"
2300 <<
PNUM(sender,
MYCOL(this->grid_),this->grid_)
2301 << std::endl << std::endl;
2304 TIMER_STOP(Alloc_Buffer_Recv_LrowL);
2306 TIMER_START(Alloc_Buffer_Recv_LcolL);
2307 if( this->isRecvFromLeft_( snode.Index ) &&
2308 MYCOL( this->grid_ ) !=
PCOL( snode.Index, this->grid_ ) ){
2309 snode.SstrLcolRecv.resize( snode.SizeSstrLcolRecv );
2310 Int sender =
PCOL( snode.Index, this->grid_ );
2311 MPI_Irecv( &snode.SstrLcolRecv[0], snode.SizeSstrLcolRecv, MPI_BYTE,
2312 sender, IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_L_CONTENT),
2313 this->grid_->rowComm,
2314 mpireqsRecvFromLeft );
2315 #if ( _DEBUGlevel_ >= 1 )
2316 statusOFS <<
"["<<snode.Index<<
"] "<<
"Receiving L "
2317 << snode.SizeSstrLcolRecv <<
" BYTES from P"
2318 <<
PNUM(
MYROW(this->grid_),sender,this->grid_)
2319 << std::endl << std::endl;
2322 TIMER_STOP(Alloc_Buffer_Recv_LcolL);
2324 TIMER_STOP(WaitContentLU);
2328 TIMER_START(WaitContentLU);
2329 TIMER_START(WaitSize_UcolU);
2330 mpi::Waitall(arrMpireqsRecvUSizeFromAny);
2331 TIMER_STOP(WaitSize_UcolU);
2334 for (Int supidx=0; supidx<stepSuper ; supidx++){
2337 MPI_Request * mpireqsRecvFromAbove =
2338 &arrMpireqsRecvUContentFromAny[supidx*2];
2339 MPI_Request * mpireqsRecvFromLeft =
2340 &arrMpireqsRecvUContentFromAny[supidx*2+1];
2342 TIMER_START(Alloc_Buffer_Recv_UrowL);
2343 if( this->isRecvFromAbove_( snode.Index ) &&
2344 MYROW( this->grid_ ) !=
PROW( snode.Index, this->grid_ ) ){
2345 snode.SstrUrowRecv.resize( snode.SizeSstrUrowRecv );
2346 Int sender =
PROW( snode.Index, this->grid_ );
2347 MPI_Irecv( &snode.SstrUrowRecv[0], snode.SizeSstrUrowRecv, MPI_BYTE,
2348 sender, IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_U_CONTENT),
2349 this->grid_->colComm, mpireqsRecvFromAbove );
2350 #if ( _DEBUGlevel_ >= 1 )
2351 statusOFS <<
"["<<snode.Index<<
"] "<<
"Receiving U "
2352 << snode.SizeSstrUrowRecv <<
" BYTES from P"
2353 <<
PNUM(sender,
MYCOL(this->grid_),this->grid_)
2354 << std::endl << std::endl;
2357 TIMER_STOP(Alloc_Buffer_Recv_UrowL);
2359 TIMER_START(Alloc_Buffer_Recv_UcolL);
2360 if( this->isRecvFromLeft_( snode.Index ) &&
2361 MYCOL( this->grid_ ) !=
PCOL( snode.Index, this->grid_ ) ){
2362 snode.SstrUcolRecv.resize( snode.SizeSstrUcolRecv );
2363 Int sender =
PCOL( snode.Index, this->grid_ );
2364 MPI_Irecv( &snode.SstrUcolRecv[0], snode.SizeSstrUcolRecv, MPI_BYTE,
2365 sender, IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_UCOL_CONTENT),
2366 this->grid_->rowComm,
2367 mpireqsRecvFromLeft );
2368 #if ( _DEBUGlevel_ >= 1 )
2369 statusOFS <<
"["<<snode.Index<<
"] "<<
"Receiving Ucol "
2370 << snode.SizeSstrUcolRecv <<
" BYTES from P"
2371 <<
PNUM(
MYROW(this->grid_),sender,this->grid_)
2372 << std::endl << std::endl;
2375 TIMER_STOP(Alloc_Buffer_Recv_UcolL);
2378 TIMER_STOP(WaitContentLU);
2381 TIMER_START(Compute_Sinv_LU);
2383 Int gemmProcessed = 0;
2387 std::vector<Int> readySupidx;
2389 for(Int supidx = 0;supidx<stepSuper;supidx++){
2391 if( this->isRecvFromAbove_( snode.Index )
2392 && this->isRecvFromLeft_( snode.Index )){
2394 if(
MYCOL( this->grid_ ) ==
PCOL( snode.Index, this->grid_ ) ){
2398 if(
MYROW( this->grid_ ) ==
PROW( snode.Index, this->grid_ ) ){
2402 if(snode.isReady==4){
2403 readySupidx.push_back(supidx);
2404 #if ( _DEBUGlevel_ >= 1 )
2405 statusOFS<<
"Locally processing ["<<snode.Index<<
"]"<<std::endl;
2410 TIMER_START(Reduce_Sinv_L_Send);
2411 if( this->isRecvFromLeft_( snode.Index )
2412 &&
MYCOL( this->grid_ ) !=
PCOL( snode.Index, this->grid_ ) ){
2413 MPI_Request & mpireqsSendToLeft = arrMpireqsSendLToLeft[supidx];
2416 MPI_Isend( NULL, 0, MPI_BYTE,
PCOL(snode.Index,this->grid_),
2417 IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_L_REDUCE),
2418 this->grid_->rowComm, &mpireqsSendToLeft );
2420 #if ( _DEBUGlevel_ >= 1 )
2422 PCOL(snode.Index,this->grid_),this->grid_);
2423 statusOFS <<
"["<<snode.Index<<
"] "<<
" LReduce P"
2424 <<
MYPROC(this->grid_) <<
" has sent "
2425 << 0 <<
" bytes to "
2426 << dst << std::endl;
2429 TIMER_STOP(Reduce_Sinv_L_Send);
2436 #if ( _DEBUGlevel_ >= 1 )
2437 statusOFS<<std::endl<<
"gemmToDo ="<<gemmToDo<<std::endl;
2441 #if defined (PROFILE)
2442 end_SendULWaitContentFirst=0;
2443 begin_SendULWaitContentFirst=0;
2446 while(gemmProcessed<gemmToDo)
2448 Int reqidx = MPI_UNDEFINED;
2451 TIMER_START(WaitContentLU);
2454 int reqIndicesL[arrMpireqsRecvLContentFromAny.size()];
2459 #if defined(PROFILE)
2460 if(begin_SendULWaitContentFirst==0){
2461 begin_SendULWaitContentFirst=1;
2462 TIMER_START(WaitContent_LrowL_First);
2466 TIMER_START(WaitContent_LrowL);
2467 MPI_Waitsome(2*stepSuper, &arrMpireqsRecvLContentFromAny[0],
2468 &numRecv, reqIndicesL, MPI_STATUSES_IGNORE);
2470 for(
int i =0;i<numRecv;i++){
2471 reqidx = reqIndicesL[i];
2473 if(reqidx!=MPI_UNDEFINED){
2479 #if ( _DEBUGlevel_ >= 1 )
2480 statusOFS <<
"["<<snode.Index<<
"] "<<
"Received data for L"
2481 <<
" reqidx%2=" << reqidx%2
2482 <<
" is ready ?"<<snode.isReady<<std::endl;
2485 if(snode.isReady==4){
2486 readySupidx.push_back(supidx);
2487 #if defined(PROFILE)
2488 if(end_SendULWaitContentFirst==0){
2489 TIMER_STOP(WaitContent_LrowL_First);
2490 end_SendULWaitContentFirst=1;
2497 TIMER_STOP(WaitContent_LrowL);
2500 int reqIndicesU[arrMpireqsRecvUContentFromAny.size()];
2504 TIMER_START(WaitContent_UcolU);
2505 MPI_Waitsome(2*stepSuper, &arrMpireqsRecvUContentFromAny[0],
2506 &numRecv, reqIndicesU, MPI_STATUSES_IGNORE);
2508 for(
int i =0;i<numRecv;i++){
2509 reqidx = reqIndicesU[i];
2511 if(reqidx!=MPI_UNDEFINED){
2516 #if ( _DEBUGlevel_ >= 1 )
2517 statusOFS <<
"["<<snode.Index<<
"] "<<
"Received data for U"
2518 <<
" reqidx%2=" << reqidx%2
2519 <<
" is ready ?"<<snode.isReady<<std::endl;
2522 if(snode.isReady==4){
2523 readySupidx.push_back(supidx);
2524 #if defined(PROFILE)
2525 if(end_SendULWaitContentFirst==0){
2526 TIMER_STOP(WaitContent_LrowL_First);
2527 end_SendULWaitContentFirst=1;
2534 TIMER_STOP(WaitContent_UcolU);
2536 }
while(readySupidx.size()==0);
2537 TIMER_STOP(WaitContentLU);
2540 if(readySupidx.size()>0)
2542 supidx = readySupidx.back();
2543 readySupidx.pop_back();
2548 if( this->isRecvFromAbove_( snode.Index )
2549 && this->isRecvFromLeft_( snode.Index ) ){
2550 std::vector<LBlock<T> > LcolRecv;
2551 std::vector<LBlock<T> > LrowRecv;
2552 std::vector<UBlock<T> > UcolRecv;
2553 std::vector<UBlock<T> > UrowRecv;
2563 UnpackData(snode, LcolRecv, LrowRecv, UcolRecv, UrowRecv);
2565 SelInv_lookup_indexes(snode, LcolRecv, LrowRecv,
2566 UcolRecv, UrowRecv,AinvBuf,LBuf, UBuf);
2569 #if ( _DEBUGlevel_ >= 2 )
2570 statusOFS <<
"["<<snode.Index<<
"] " <<
"LBuf: ";
2571 statusOFS << LBuf << std::endl;
2572 statusOFS <<
"["<<snode.Index<<
"] " <<
"UBuf: ";
2573 statusOFS << UBuf << std::endl;
2577 LUpdateBuf.Resize( AinvBuf.m(),
2578 SuperSize( snode.Index, this->super_ ) );
2582 TIMER_START(Compute_Sinv_L_Resize);
2583 snode.LUpdateBuf.Resize( AinvBuf.m(),
2584 SuperSize( snode.Index, this->super_ ) );
2585 TIMER_STOP(Compute_Sinv_L_Resize);
2587 TIMER_START(Compute_Sinv_LT_GEMM);
2588 blas::Gemm(
'N',
'T', AinvBuf.m(), LBuf.m(), AinvBuf.n(),
2589 MINUS_ONE<T>(), AinvBuf.Data(), AinvBuf.m(),
2590 LBuf.Data(), LBuf.m(), ZERO<T>(),
2591 snode.LUpdateBuf.Data(), snode.LUpdateBuf.m() );
2592 TIMER_STOP(Compute_Sinv_LT_GEMM);
2594 TIMER_START(Compute_Sinv_U_Resize);
2595 snode.UUpdateBuf.Resize(
SuperSize( snode.Index, this->super_ ),
2597 TIMER_STOP(Compute_Sinv_U_Resize);
2599 TIMER_START(Compute_Sinv_U_GEMM);
2600 blas::Gemm(
'N',
'N', UBuf.m(), AinvBuf.n(), AinvBuf.m(),
2601 MINUS_ONE<T>(), UBuf.Data(), UBuf.m(),
2602 AinvBuf.Data(), AinvBuf.m(), ZERO<T>(),
2603 snode.UUpdateBuf.Data(), snode.UUpdateBuf.m() );
2604 TIMER_STOP(Compute_Sinv_U_GEMM);
2606 #if ( _DEBUGlevel_ >= 2 )
2607 statusOFS <<
"["<<snode.Index<<
"] " <<
"snode.LUpdateBuf: ";
2608 statusOFS << snode.LUpdateBuf << std::endl;
2609 statusOFS <<
"["<<snode.Index<<
"] " <<
"snode.UUpdateBuf: ";
2610 statusOFS << snode.UUpdateBuf << std::endl;
2622 TIMER_START(Reduce_Sinv_L_Send);
2625 if( this->isRecvFromAbove_( snode.Index ) ){
2626 if( this->isRecvFromLeft_( snode.Index )
2627 &&
MYCOL( this->grid_ ) !=
PCOL( snode.Index, this->grid_ ) ){
2628 MPI_Request & reqsSendToLeft = arrMpireqsSendLToLeft[supidx];
2629 MPI_Isend( snode.LUpdateBuf.Data(), snode.LUpdateBuf.ByteSize(),
2630 MPI_BYTE,
PCOL(snode.Index,this->grid_),
2631 IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_L_REDUCE),
2632 this->grid_->rowComm, &reqsSendToLeft );
2634 #if ( _DEBUGlevel_ >= 1 )
2636 PCOL(snode.Index,this->grid_),this->grid_);
2637 statusOFS <<
"["<<snode.Index<<
"] "<<
" LReduce P"
2638 <<
MYPROC(this->grid_) <<
" has sent "
2639 << snode.LUpdateBuf.ByteSize() <<
" bytes to "
2640 << dst << std::endl;
2644 TIMER_STOP(Reduce_Sinv_L_Send);
2647 #if ( _DEBUGlevel_ >= 1 )
2648 statusOFS <<
"["<<snode.Index<<
"] "<<
"gemmProcessed = "
2649 << gemmProcessed<<
"/"<<gemmToDo<<std::endl;
2655 TIMER_STOP(Compute_Sinv_LU);
2664 TIMER_START(Reduce_Sinv_L);
2666 for (Int supidx=0; supidx<stepSuper; supidx++){
2668 if(
MYCOL( this->grid_ ) ==
PCOL( snode.Index, this->grid_ ) ){
2670 Int numRowLUpdateBuf;
2671 std::vector<LBlock<T> >& Lcol =
2672 this->L(
LBj( snode.Index, this->grid_ ) );
2676 (
MYROW( this->grid_ )==
PROW(snode.Index,this->grid_))?1:0;
2677 snode.RowLocalPtr.resize( Lcol.size() + 1 - offset );
2678 snode.BlockIdxLocal.resize( Lcol.size() - offset );
2679 snode.RowLocalPtr[0] = 0;
2680 for( Int ib = 0; ib < snode.BlockIdxLocal.size(); ib++ ){
2681 snode.RowLocalPtr[ib+1] = snode.RowLocalPtr[ib]
2682 + Lcol[ib+offset].numRow;
2683 snode.BlockIdxLocal[ib] = Lcol[ib+offset].blockIdx;
2685 numRowLUpdateBuf = *snode.RowLocalPtr.rbegin();
2688 if( numRowLUpdateBuf > 0 ){
2689 if( snode.LUpdateBuf.m() == 0 && snode.LUpdateBuf.n() == 0 ){
2690 snode.LUpdateBuf.Resize( numRowLUpdateBuf,
2691 SuperSize( snode.Index, this->super_ ) );
2693 SetValue( snode.LUpdateBuf, ZERO<T>() );
2697 #if ( _DEBUGlevel_ >= 2 )
2698 statusOFS <<
"["<<snode.Index<<
"] "<<
"LUpdateBuf Before Reduction: "
2699 << snode.LUpdateBuf << std::endl << std::endl;
2702 Int totCountRecv = 0;
2703 Int numRecv = this->CountSendToRight(snode.Index);
2704 NumMat<T> LUpdateBufRecv(numRowLUpdateBuf,
2705 SuperSize( snode.Index, this->super_ ) );
2706 for( Int countRecv = 0; countRecv < numRecv ; ++countRecv ){
2710 TIMER_START(L_RECV);
2711 MPI_Recv(LUpdateBufRecv.Data(), LUpdateBufRecv.ByteSize(),
2712 MPI_BYTE, MPI_ANY_SOURCE,
2713 IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_L_REDUCE),
2714 this->grid_->rowComm,&stat);
2716 MPI_Get_count(&stat, MPI_BYTE, &size);
2719 #if ( _DEBUGlevel_ >= 1 )
2720 Int src =
PNUM(
MYROW(this->grid_),stat.MPI_SOURCE,this->grid_);
2721 statusOFS <<
"["<<snode.Index<<
"] "<<
" LReduce P"
2722 <<
MYPROC(this->grid_)<<
" has received "<< size
2723 <<
" bytes from " << src << std::endl;
2725 #if ( _DEBUGlevel_ >= 2 )
2726 statusOFS <<
"["<<snode.Index<<
"] "<<
"LUpdateBufRecv: "
2727 << LUpdateBufRecv << std::endl << std::endl;
2730 blas::Axpy(snode.LUpdateBuf.Size(), ONE<T>(),
2731 LUpdateBufRecv.Data(), 1,
2732 snode.LUpdateBuf.Data(), 1 );
2735 #if ( _DEBUGlevel_ >= 2 )
2736 statusOFS <<
"["<<snode.Index<<
"] "<<
"LUpdateBuf After Reduction: "
2737 << snode.LUpdateBuf << std::endl << std::endl;
2742 TIMER_STOP(Reduce_Sinv_L);
2746 mpi::Waitall( arrMpireqsSendLToLeft );
2749 if(next_lidx < superList.size()){
2750 IRecvContentCD(superList[next_lidx], superList[next_lidx].size(),nextCDBuffers);
2757 TIMER_START(Update_Diagonal);
2758 for (Int supidx=0; supidx<stepSuper; supidx++){
2761 ComputeDiagUpdate(snode);
2763 if(
MYCOL( this->grid_ ) ==
PCOL( snode.Index, this->grid_ ) ){
2764 if(
MYROW( this->grid_ ) !=
PROW( snode.Index, this->grid_ ) ){
2765 if(this->isSendToDiagonal_(snode.Index)){
2767 MPI_Request & mpireqsSendToAbove = arrMpireqsSendToAbove[supidx];
2768 MPI_Isend( snode.DiagBuf.Data(), snode.DiagBuf.ByteSize(),
2769 MPI_BYTE,
PROW(snode.Index,this->grid_) ,
2770 IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_D_REDUCE),
2771 this->grid_->colComm, &mpireqsSendToAbove );
2773 #if ( _DEBUGlevel_ >= 1 )
2774 statusOFS <<
"["<<snode.Index<<
"] "<<
" P"<<
MYROW(this->grid_)
2775 <<
" has sent "<< snode.DiagBuf.ByteSize()
2776 <<
" bytes of DiagBuf to "
2777 <<
PROW(snode.Index,this->grid_) << std::endl;
2784 TIMER_STOP(Update_Diagonal);
2788 TIMER_START(Reduce_Diagonal);
2790 for (Int supidx=0; supidx<stepSuper; supidx++){
2792 if(
MYCOL( this->grid_ ) ==
PCOL( snode.Index, this->grid_ ) ){
2793 if(
MYROW( this->grid_ ) ==
PROW( snode.Index, this->grid_ ) ){
2794 if(snode.DiagBuf.Size()==0){
2795 snode.DiagBuf.Resize(
SuperSize( snode.Index, this->super_ ),
2796 SuperSize( snode.Index, this->super_ ));
2797 SetValue(snode.DiagBuf, ZERO<T>());
2800 Int totCountRecv = 0;
2801 Int numRecv = this->CountRecvFromBelow(snode.Index);
2802 NumMat<T> DiagBufRecv(snode.DiagBuf.m(),snode.DiagBuf.n());
2804 for( Int countRecv = 0; countRecv < numRecv ; ++countRecv ){
2808 TIMER_START(D_RECV);
2809 MPI_Recv(DiagBufRecv.Data(), DiagBufRecv.ByteSize(), MPI_BYTE,
2810 MPI_ANY_SOURCE,IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_D_REDUCE),
2811 this->grid_->colComm,&stat);
2813 MPI_Get_count(&stat, MPI_BYTE, &size);
2817 blas::Axpy(snode.DiagBuf.Size(), ONE<T>(),
2818 DiagBufRecv.Data(), 1,
2819 snode.DiagBuf.Data(), 1 );
2822 LBlock<T> & LB = this->L(
LBj( snode.Index, this->grid_ ) )[0];
2824 snode.DiagBuf.Data(), 1,
2825 LB.
nzval.Data(), 1 );
2832 TIMER_STOP(Reduce_Diagonal);
2837 TIMER_START(Reduce_Sinv_U);
2839 for (Int supidx=0; supidx<stepSuper; supidx++){
2843 if(
MYROW( this->grid_ ) ==
PROW( snode.Index, this->grid_ ) ){
2845 Int numColUUpdateBuf;
2847 std::vector<UBlock<T> >& Urow =
2848 this->U(
LBi( snode.Index, this->grid_ ) );
2850 snode.ColLocalPtr.resize( Urow.size() + 1 );
2851 snode.BlockIdxLocalU.resize( Urow.size() );
2852 snode.ColLocalPtr[0] = 0;
2886 Int urowsize = Urow.size();
2887 snode.ColLocalPtr[0] = 0;
2888 for( Int jb = 0; jb < Urow.size(); jb++ ){
2890 snode.ColLocalPtr[jb+1] = snode.ColLocalPtr[jb]+UB.
numCol;
2891 snode.BlockIdxLocalU[jb] = UB.
blockIdx;
2895 #if ( _DEBUGlevel_ >= 2 )
2896 statusOFS <<
"["<<snode.Index<<
"] "<<
"UReduce collocalptr "
2897 << snode.ColLocalPtr << std::endl;
2898 statusOFS <<
"["<<snode.Index<<
"] "<<
"UReduce blockidxlocalU "
2899 << snode.BlockIdxLocalU << std::endl;
2902 numColUUpdateBuf = *snode.ColLocalPtr.rbegin();
2904 if( numColUUpdateBuf > 0 ){
2905 if( snode.UUpdateBuf.m() == 0 && snode.UUpdateBuf.n() == 0 ){
2906 snode.UUpdateBuf.Resize(
SuperSize( snode.Index, this->super_ ),
2909 SetValue( snode.UUpdateBuf, ZERO<T>() );
2913 #if ( _DEBUGlevel_ >= 2 )
2914 statusOFS <<
"["<<snode.Index<<
"] "<<
"UUpdateBuf Before Reduction: "
2915 << snode.UUpdateBuf << std::endl << std::endl;
2918 Int totCountRecv = 0;
2920 Int numRecv = this->CountSendToBelow(snode.Index);
2925 #if ( _DEBUGlevel_ >= 1 )
2926 statusOFS <<
"["<<snode.Index<<
"] "<<
" UReduce P"
2927 <<
MYPROC(this->grid_) <<
" can receive at most "
2928 << UUpdateBufRecv.ByteSize() <<
" bytes" << std::endl;
2930 for( Int countRecv = 0; countRecv < numRecv ; ++countRecv ){
2934 TIMER_START(U_RECV);
2935 MPI_Recv(UUpdateBufRecv.Data(), UUpdateBufRecv.ByteSize(),
2936 MPI_BYTE, MPI_ANY_SOURCE,
2937 IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_U_REDUCE),
2938 this->grid_->colComm,&stat);
2940 MPI_Get_count(&stat, MPI_BYTE, &size);
2944 #if ( _DEBUGlevel_ >= 1 )
2945 Int src =
PNUM(stat.MPI_SOURCE,
MYCOL(this->grid_),this->grid_);
2946 statusOFS <<
"["<<snode.Index<<
"] "<<
" UReduce P"
2947 <<
MYPROC(this->grid_)<<
" has received "
2948 << size <<
" bytes from P" << src << std::endl;
2951 #if ( _DEBUGlevel_ >= 2 )
2952 statusOFS <<
"["<<snode.Index<<
"] "<<
"UUpdateBufRecv: "
2953 << UUpdateBufRecv << std::endl << std::endl;
2956 blas::Axpy(snode.UUpdateBuf.Size(), ONE<T>(),
2957 UUpdateBufRecv.Data(), 1,
2958 snode.UUpdateBuf.Data(), 1);
2961 #if ( _DEBUGlevel_ >= 2 )
2962 statusOFS <<
"["<<snode.Index<<
"] "<<
"UUpdateBuf After Reduction: "
2963 << snode.UUpdateBuf << std::endl << std::endl;
2969 TIMER_START(Reduce_Sinv_U_Send);
2970 if(!this->isRecvFromLeft_( snode.Index ) && this->isRecvFromAbove_( snode.Index ) ){
2971 MPI_Request & mpireqsSendToAbove = arrMpireqsSendUToAbove[supidx];
2974 MPI_Isend( NULL, 0, MPI_BYTE,
PROW(snode.Index,this->grid_),
2975 IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_U_REDUCE),
2976 this->grid_->colComm, &mpireqsSendToAbove );
2978 #if ( _DEBUGlevel_ >= 1 )
2979 Int dst =
PNUM(
PROW(snode.Index,this->grid_),
2980 MYCOL(this->grid_),this->grid_);
2981 statusOFS <<
"["<<snode.Index<<
"] "<<
" UReduce P"
2982 <<
MYPROC(this->grid_) <<
" has sent "
2983 << 0 <<
" bytes to "
2984 << dst << std::endl;
2987 TIMER_STOP(Reduce_Sinv_U_Send);
2990 TIMER_START(Reduce_Sinv_U_Send);
2993 if( this->isRecvFromAbove_( snode.Index ) && this->isRecvFromLeft_( snode.Index ) ){
2994 MPI_Request & reqsSendToAbove = arrMpireqsSendUToAbove[supidx];
2995 TIMER_START(Reduce_Sinv_U_Send_Isend);
2996 MPI_Isend( snode.UUpdateBuf.Data(), snode.UUpdateBuf.ByteSize(),
2997 MPI_BYTE,
PROW(snode.Index,this->grid_),
2998 IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_U_REDUCE),
2999 this->grid_->colComm, &reqsSendToAbove );
3000 TIMER_STOP(Reduce_Sinv_U_Send_Isend);
3001 #if ( _DEBUGlevel_ >= 1 )
3002 Int dst =
PNUM(
PROW(snode.Index,this->grid_),
3003 MYCOL(this->grid_),this->grid_);
3004 statusOFS <<
"["<<snode.Index<<
"] "<<
" UReduce P"
3005 <<
MYPROC(this->grid_) <<
" has sent "
3006 << snode.UUpdateBuf.ByteSize() <<
" bytes to "
3007 << dst << std::endl;
3010 TIMER_STOP(Reduce_Sinv_U_Send);
3016 TIMER_STOP(Reduce_Sinv_U);
3019 mpi::Waitall( arrMpireqsSendUToAbove );
3027 for (Int supidx=0; supidx<stepSuper; supidx++){
3029 if(
MYROW( this->grid_ ) ==
PROW( snode.Index, this->grid_ )){
3030 std::vector<LBlock<T> >& Lrow = this->Lrow(
LBi(snode.Index, this->grid_) );
3033 if(
MYCOL( this->grid_ ) ==
PCOL( snode.Index, this->grid_ )){
3034 std::vector<UBlock<T> >& Ucol = this->Ucol(
LBj(snode.Index, this->grid_) );
3040 TIMER_START(Update_L);
3042 for (Int supidx=0; supidx<stepSuper; supidx++){
3045 #if ( _DEBUGlevel_ >= 1 )
3046 statusOFS << std::endl <<
"["<<snode.Index<<
"] "
3047 <<
"Finish updating the L part by filling LUpdateBufReduced"
3048 <<
" back to L" << std::endl << std::endl;
3051 if(
MYCOL( this->grid_ ) ==
PCOL( snode.Index, this->grid_ )
3052 && snode.LUpdateBuf.m() > 0 ){
3053 std::vector<LBlock<T> >& Lcol = this->L(
LBj(snode.Index,this->grid_));
3056 (
MYROW( this->grid_ ) ==
PROW( snode.Index, this->grid_ ))?1:0;
3057 for( Int ib = startBlock; ib < Lcol.size(); ib++ ){
3061 &snode.LUpdateBuf(snode.RowLocalPtr[ib-startBlock], 0),
3062 snode.LUpdateBuf.m(), LB.
nzval.Data(), LB.
numRow );
3069 TIMER_STOP(Update_L);
3074 TIMER_START(Update_U);
3076 for (Int supidx=0; supidx<stepSuper; supidx++){
3079 #if ( _DEBUGlevel_ >= 1 )
3080 statusOFS <<
"["<<snode.Index<<
"] "
3081 <<
"Finish updating the U part by filling UUpdateBufReduced"
3082 <<
" back to U" << std::endl << std::endl;
3085 if(
MYROW( this->grid_ ) ==
PROW( snode.Index, this->grid_ )
3086 && snode.UUpdateBuf.m() > 0 ){
3087 std::vector<UBlock<T> >& Urow = this->U(
LBi(snode.Index,this->grid_));
3089 for( Int jb = 0; jb < Urow.size(); jb++ ){
3092 #if ( _DEBUGlevel_ >= 1 )
3093 statusOFS <<
"["<<snode.Index<<
"] "<<
"Putting colptr "
3094 << snode.ColLocalPtr[jb] <<
" of UUpdateBuf "
3095 << snode.ColLocalPtr <<
" "
3096 <<
"in UB "<<UB.
blockIdx<<std::endl;
3100 &snode.UUpdateBuf( 0, snode.ColLocalPtr[jb] ),
3101 snode.UUpdateBuf.m(), UB.
nzval.Data(), UB.
numRow );
3102 #if ( _DEBUGlevel_ >= 2 )
3103 statusOFS<<
"["<<snode.Index<<
"] "<<
"UB: "<<UB.
nzval<<std::endl;
3111 TIMER_STOP(Update_U);
3116 if(next_lidx < superList.size()){
3117 WaitContentLCD(superList[next_lidx], superList[next_lidx].size(),nextCDBuffers);
3118 WaitContentUCD(superList[next_lidx], superList[next_lidx].size(),nextCDBuffers);
3123 TIMER_START(Barrier);
3124 if(next_lidx < superList.size()){
3125 nextCDBuffers.WaitAllSend();
3127 mpi::Waitall(arrMpireqsRecvLContentFromAny);
3128 mpi::Waitall(arrMpireqsRecvUContentFromAny);
3132 mpi::Waitall(arrMpireqsSendToAbove);
3134 for (Int supidx=0; supidx<stepSuper; supidx++){
3135 Int ksup = superList[lidx][supidx];
3136 std::vector<MPI_Request> & mpireqsSendLToRight = arrMpireqsSendLToRight[supidx];
3137 std::vector<MPI_Request> & mpireqsSendLToBelow = arrMpireqsSendLToBelow[supidx];
3138 std::vector<MPI_Request> & mpireqsSendUToRight = arrMpireqsSendUToRight[supidx];
3139 std::vector<MPI_Request> & mpireqsSendUToBelow = arrMpireqsSendUToBelow[supidx];
3141 mpi::Waitall( mpireqsSendLToRight );
3142 mpi::Waitall( mpireqsSendLToBelow );
3143 mpi::Waitall( mpireqsSendUToRight );
3144 mpi::Waitall( mpireqsSendUToBelow );
3147 TIMER_STOP(Barrier);
3153 if (this->options_->maxPipelineDepth!=-1)
3156 MPI_Barrier(this->grid_->comm);
3162 template<
typename T>
3165 this->SelInv_P2p ( );
3168 template<
typename T>
3171 TIMER_START(SelInv_P2p);
3178 std::vector<std::vector<Int> > & superList = this->WorkingSet();
3179 Int numSteps = superList.size();
3181 for (Int lidx=0; lidx<numSteps ; lidx++){
3182 Int stepSuper = superList[lidx].size();
3183 this->SelInvIntra_P2p(lidx);
3187 TIMER_STOP(SelInv_P2p);
3204 template<
typename T>
3210 #if ( _DEBUGlevel_ >= 1 )
3211 statusOFS << std::endl <<
"L(i,k) <- L(i,k) * L(k,k)^{-1}"
3212 << std::endl << std::endl;
3215 for( Int ksup = 0; ksup < numSuper; ksup++ ){
3216 if(
MYCOL( this->grid_ ) ==
PCOL( ksup, this->grid_ ) ){
3219 std::vector<LBlock<T> >& Lcol = this->L(
LBj( ksup, this->grid_ ) );
3220 if(
MYROW( this->grid_ ) ==
PROW( ksup, this->grid_ ) ){
3221 nzvalLDiag = Lcol[0].nzval;
3222 if( nzvalLDiag.m() !=
SuperSize(ksup, this->super_) ||
3223 nzvalLDiag.n() !=
SuperSize(ksup, this->super_) ){
3225 "The size of the diagonal block of L is wrong." );
3231 MPI_Bcast( (
void*)nzvalLDiag.Data(), nzvalLDiag.ByteSize(),
3232 MPI_BYTE,
PROW( ksup, this->grid_ ), this->grid_->colComm );
3235 for( Int ib = 0; ib < Lcol.size(); ib++ ){
3238 #if ( _DEBUGlevel_ >= 2 )
3248 #if defined( CHECK_NORM )
3252 ONE<T>(), nzvalLDiag.Data(), LB.
numCol,
3254 #if defined( CHECK_NORM )
3257 blas::Trmm(
'R',
'L',
'N',
'U',res.m(),res.n(),ONE<T>(),nzvalLDiag.Data(),nzvalLDiag.m(),res.Data(),res.m());
3258 blas::Axpy(res.Size(), MINUS_ONE<T>(), Ljk.Data(), 1, res.Data(), 1 );
3259 double norm = lapack::Lange(
'F',res.m(),res.n(),res.Data(),res.m(),Ljk.Data());
3261 statusOFS <<
"After solve norm of residual of L(" << LB.
blockIdx <<
", " << ksup
3262 <<
"): " << norm << std::endl;
3284 #if ( _DEBUGlevel_ >= 1 )
3285 statusOFS << std::endl <<
"U(k,j) <- U(k,k)^{-1} * U(k,j)"
3286 << std::endl << std::endl;
3288 for( Int ksup = 0; ksup < numSuper; ksup++ ){
3289 if(
MYROW( this->grid_ ) ==
PROW( ksup, this->grid_ ) ){
3292 std::vector<UBlock<T> >& Urow = this->U(
LBi( ksup, this->grid_ ) );
3293 if(
MYCOL( this->grid_ ) ==
PCOL( ksup, this->grid_ ) ){
3294 std::vector<LBlock<T> >& Lcol = this->L(
LBj( ksup, this->grid_ ) );
3295 nzvalUDiag = Lcol[0].nzval;
3296 if( nzvalUDiag.m() !=
SuperSize(ksup, this->super_) ||
3297 nzvalUDiag.n() !=
SuperSize(ksup, this->super_) ){
3299 "The size of the diagonal block of U is wrong." );
3305 MPI_Bcast( (
void*)nzvalUDiag.Data(), nzvalUDiag.ByteSize(),
3306 MPI_BYTE,
PCOL( ksup, this->grid_ ), this->grid_->rowComm );
3309 for( Int jb = 0; jb < Urow.size(); jb++ ){
3312 #if ( _DEBUGlevel_ >= 2 )
3321 #if defined( CHECK_NORM )
3325 ONE<T>(), nzvalUDiag.Data(), UB.
numRow,
3327 #if defined( CHECK_NORM )
3330 blas::Trmm(
'L',
'U',
'N',
'N',res.m(),res.n(),ONE<T>(),nzvalUDiag.Data(),nzvalUDiag.m(),res.Data(),res.m());
3331 blas::Axpy(res.Size(), MINUS_ONE<T>(), Ukj.Data(), 1, res.Data(), 1 );
3332 double norm = lapack::Lange(
'F',res.m(),res.n(),res.Data(),res.m(),Ukj.Data());
3333 statusOFS <<
"After solve, norm of residual of U(" << ksup <<
", " << UB.
blockIdx
3334 <<
"): " << norm << std::endl;
3353 #if ( _DEBUGlevel_ >= 1 )
3354 statusOFS << std::endl <<
"L(i,i) <- [L(k,k) * U(k,k)]^{-1}" << std::endl
3358 for( Int ksup = 0; ksup < numSuper; ksup++ ){
3359 if(
MYROW( this->grid_ ) ==
PROW( ksup, this->grid_ ) &&
3360 MYCOL( this->grid_ ) ==
PCOL( ksup, this->grid_ ) ){
3364 for(Int i = 0; i <
SuperSize( ksup, this->super_ ); i++){
3367 LBlock<T> & LB = (this->L(
LBj( ksup, this->grid_ ) ))[0];
3368 #if ( _DEBUGlevel_ >= 2 )
3375 #if defined( CHECK_NORM )
3379 SuperSize( ksup, this->super_ ), ipiv.Data() );
3381 #if defined( CHECK_NORM )
3387 for(Int i = 0; i<Akk.m();++i){
3388 for(Int j = i; j<Akk.n();++j){
3392 blas::Trmm(
'L',
'L',
'N',
'U',Akk.m(),Akk.n(),ONE<T>(),Lkk.Data(),Lkk.m(),Akk.Data(),Akk.m());
3398 blas::Gemm(
'N',
'N',res.m(),res.n(),res.m(),ONE<T>(),LB.
nzval.Data(),res.m(),Akk.Data(),res.m(),ZERO<T>(),res.Data(),res.m());
3399 for(Int i = 0; i<res.m();++i){
3403 double norm = lapack::Lange(
'F',res.m(),res.n(),res.Data(),res.m(),Lkk.Data());
3404 statusOFS <<
"After inversion, norm of residual of A(" << ksup <<
", " << ksup
3405 <<
"): " << norm << std::endl;
3425 template<
typename T>
3428 ConstructCommunicationPattern_P2p();
3433 template<
typename T>
3437 TIMER_START(ConstructCommunicationPattern);
3441 TIMER_START(Allocate);
3443 this->isSendToBelow_.Resize(this->grid_->numProcRow, numSuper);
3444 this->isSendToRight_.Resize(this->grid_->numProcCol, numSuper);
3445 this->isSendToDiagonal_.Resize( numSuper );
3446 SetValue( this->isSendToBelow_,
false );
3447 SetValue( this->isSendToRight_,
false );
3448 SetValue( this->isSendToDiagonal_,
false );
3450 this->isSendToCrossDiagonal_.Resize(this->grid_->numProcCol+1, numSuper );
3451 SetValue( this->isSendToCrossDiagonal_,
false );
3452 this->isRecvFromCrossDiagonal_.Resize(this->grid_->numProcRow+1, numSuper );
3453 SetValue( this->isRecvFromCrossDiagonal_,
false );
3455 this->isRecvFromAbove_.Resize( numSuper );
3456 this->isRecvFromLeft_.Resize( numSuper );
3457 this->isRecvFromBelow_.Resize( this->grid_->numProcRow, numSuper );
3458 SetValue( this->isRecvFromAbove_,
false );
3459 SetValue( this->isRecvFromBelow_,
false );
3460 SetValue( this->isRecvFromLeft_,
false );
3462 TIMER_STOP(Allocate);
3465 TIMER_START(GetEtree);
3466 std::vector<Int> snodeEtree(this->
NumSuper());
3467 this->GetEtree(snodeEtree);
3468 TIMER_STOP(GetEtree);
3473 std::vector<std::vector<LBlock<T> > * > colBlockRowBuf;
3474 colBlockRowBuf.resize( numSuper );
3475 std::vector<std::vector<UBlock<T> > * > rowBlockColBuf;
3476 rowBlockColBuf.resize( numSuper );
3477 for( Int ksup = 0; ksup < numSuper; ksup++ ){
3478 if(
MYCOL( this->grid_ ) ==
PCOL( ksup, this->grid_ )
3479 ||
MYROW( this->grid_ ) ==
PROW( ksup, this->grid_ ) ){
3480 colBlockRowBuf[ksup] =
new std::vector<LBlock<T> >();
3481 if(
MYROW( this->grid_ ) ==
PROW( ksup, this->grid_ ) ){
3482 rowBlockColBuf[ksup] =
new std::vector<UBlock<T> >();
3490 TIMER_START(Column_communication);
3491 for( Int ksup = 0; ksup < numSuper; ksup++ ){
3493 if(
MYCOL( this->grid_ ) ==
PCOL( ksup, this->grid_ ) ){
3495 std::vector< LBlock<T> > & Lcol = this->L(
LBj(ksup, this->grid_ ) );
3496 std::vector< LBlock<T> > & LcolRecv = *colBlockRowBuf[ksup];
3497 std::vector<char> sendBuf;
3498 std::vector<char> recvBuf;
3501 std::stringstream sstms,sstmr;
3502 std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
3503 mask[LBlockMask::NZVAL] = 0;
3506 Int numLBlock, totalLBlock;
3507 numLBlock = Lcol.size();
3508 mpi::Allreduce ( &numLBlock, &totalLBlock, 1, MPI_SUM, this->grid_->colComm);
3511 for( Int ib = 0; ib < Lcol.size(); ib++ ){
3512 serialize( Lcol[ib], sstms, mask );
3514 sendBuf.resize( Size( sstms ) );
3515 sstms.read( &sendBuf[0], sendBuf.size() );
3516 localSize = sendBuf.size();
3519 TIMER_START(Allgatherv_Column_communication);
3520 if( this->grid_ -> mpisize != 1 )
3522 mpi::Allgatherv( sendBuf, recvBuf, this->grid_->colComm );
3527 TIMER_STOP(Allgatherv_Column_communication);
3530 sstmr.write( &recvBuf[0], recvBuf.size() );
3532 LcolRecv.resize(totalLBlock);
3536 for( Int ib = 0; ib < totalLBlock; ib++ ){
3537 deserialize( LcolRecv[ib], sstmr, mask );
3542 std::sort(LcolRecv.begin(),LcolRecv.end(),LBlockComparator<T>);
3545 #if ( _DEBUGlevel_ >= 1 )
3546 statusOFS<<
"["<<ksup<<
"] "<<
"Lcol1: ";
for(
auto it = Lcol.begin();it != Lcol.end();++it){statusOFS<<it->blockIdx<<
" ";}statusOFS<<endl;
3547 statusOFS<<
"["<<ksup<<
"] "<<
"LcolRecv1: ";
for(
auto it = LcolRecv.begin();it != LcolRecv.end();++it){statusOFS<<it->blockIdx<<
" ";}statusOFS<<endl;
3551 TIMER_STOP(Column_communication);
3554 TIMER_START(Row_communication);
3555 for( Int ksup = 0; ksup < numSuper; ksup++ ){
3557 if(
MYROW( this->grid_ ) ==
PROW( ksup, this->grid_ ) ){
3559 std::vector< UBlock<T> > & Urow = this->U(
LBi(ksup, this->grid_ ) );
3560 std::vector< UBlock<T> > & UrowRecv = *rowBlockColBuf[ksup];
3561 std::vector<char> sendBuf;
3562 std::vector<char> recvBuf;
3565 std::stringstream sstms,sstmr;
3566 std::vector<Int> mask( UBlockMask::TOTAL_NUMBER, 1 );
3567 mask[UBlockMask::NZVAL] = 0;
3569 Int numUBlock, totalUBlock;
3570 numUBlock = Urow.size();
3571 mpi::Allreduce ( &numUBlock, &totalUBlock, 1, MPI_SUM, this->grid_->rowComm);
3574 for( Int jb = 0; jb < Urow.size(); jb++ ){
3575 serialize( Urow[jb], sstms, mask );
3577 sendBuf.resize( Size( sstms ) );
3578 sstms.read( &sendBuf[0], sendBuf.size() );
3579 localSize = sendBuf.size();
3582 TIMER_START(Allgatherv_Row_communication);
3583 if( this->grid_ -> mpisize != 1 )
3584 mpi::Allgatherv( sendBuf, recvBuf, this->grid_->rowComm );
3587 TIMER_STOP(Allgatherv_Row_communication);
3589 sstmr.write( &recvBuf[0], recvBuf.size() );
3593 UrowRecv.resize(totalUBlock);
3594 for( Int jb = 0; jb < totalUBlock; jb++ ){
3595 deserialize( UrowRecv[jb], sstmr, mask );
3599 std::sort(UrowRecv.begin(),UrowRecv.end(),UBlockComparator<T>);
3603 #if ( _DEBUGlevel_ >= 1 )
3604 statusOFS<<
"["<<ksup<<
"] "<<
"Urow1: ";
for(
auto it = Urow.begin();it != Urow.end();++it){statusOFS<<it->blockIdx<<
" ";}statusOFS<<endl;
3605 statusOFS<<
"["<<ksup<<
"] "<<
"UrowRecv1: ";
for(
auto it = UrowRecv.begin();it != UrowRecv.end();++it){statusOFS<<it->blockIdx<<
" ";}statusOFS<<endl;
3610 TIMER_STOP(Row_communication);
3613 TIMER_START(Row_communication);
3614 for( Int ksup = 0; ksup < numSuper; ksup++ ){
3616 if(
MYROW( this->grid_ ) ==
PROW( ksup, this->grid_ ) ){
3618 std::vector< LBlock<T> > * pLcolRecv;
3619 std::vector< LBlock<T> > & LcolSend = *colBlockRowBuf[ksup];
3620 std::vector< UBlock<T> > & UrowRecv = *rowBlockColBuf[ksup];
3621 std::vector< LBlock<T> > Union;
3622 std::vector<char> sendBuf;
3624 std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
3625 mask[LBlockMask::NZVAL] = 0;
3626 if( this->grid_ -> mpisize != 1 ){
3627 pLcolRecv =
new std::vector<LBlock<T> >();
3629 if(
MYCOL( this->grid_ ) ==
PCOL( ksup, this->grid_ ) ){
3631 std::stringstream sstms;
3633 serialize( (Int)LcolSend.size(), sstms, NO_MASK );
3634 for( Int ib = 0; ib < LcolSend.size(); ib++ ){
3635 serialize( LcolSend[ib], sstms, mask );
3637 sendBuf.resize( Size( sstms ) );
3638 sstms.read( &sendBuf[0], sendBuf.size() );
3639 localSize = sendBuf.size();
3641 TIMER_START(Bcast_Row_communication);
3642 mpi::Bcast( sendBuf,
PCOL(ksup,this->grid_), this->grid_->rowComm );
3643 TIMER_STOP(Bcast_Row_communication);
3645 std::stringstream sstmr;
3646 sstmr.write( &sendBuf[0], sendBuf.size() );
3649 deserialize( numLBlock, sstmr, NO_MASK );
3650 pLcolRecv->resize(numLBlock);
3651 for( Int ib = 0; ib < numLBlock; ib++ ){
3652 deserialize( (*pLcolRecv)[ib], sstmr, mask );
3656 pLcolRecv = &LcolSend;
3662 auto result = back_inserter(Union);
3663 auto firstUrow = UrowRecv.begin();
3664 auto lastUrow = UrowRecv.end();
3665 auto firstLcol = pLcolRecv->begin();
3666 auto lastLcol = pLcolRecv->end();
3669 if (firstUrow==lastUrow){
3670 result = std::copy(firstLcol,lastLcol,result);
3673 if (firstLcol==lastLcol){
3674 for(
auto it = firstUrow; it != lastUrow; ++it){
3686 if (firstUrow->blockIdx<firstLcol->blockIdx) {
3689 LB.
numRow = firstUrow->numCol;
3690 LB.
numCol = firstUrow->numRow;
3691 LB.
rows = firstUrow->cols;
3695 else if (firstLcol->blockIdx<firstUrow->blockIdx) {
3696 *result = *firstLcol;
3701 std::set<Int> uRowCol;
3702 uRowCol.insert(&firstLcol->rows[0],
3703 &firstLcol->rows[0]+firstLcol->numRow);
3704 uRowCol.insert(&firstUrow->cols[0],
3705 &firstUrow->cols[0]+firstUrow->numCol);
3709 LB.
numRow = uRowCol.size();
3710 LB.
numCol = firstUrow->numRow;
3711 LB.
rows.Resize(uRowCol.size());
3712 std::copy(uRowCol.begin(),uRowCol.end(),&LB.
rows[0]);
3720 #if ( _DEBUGlevel_ >= 1 )
3721 statusOFS<<
"["<<ksup<<
"] "<<
"Lcol: ";
for(
auto it = pLcolRecv->begin();it != pLcolRecv->end();++it){statusOFS<<it->blockIdx<<
" ";}statusOFS<<endl;
3722 statusOFS<<
"["<<ksup<<
"] "<<
"Urow: ";
for(
auto it = UrowRecv.begin();it != UrowRecv.end();++it){statusOFS<<it->blockIdx<<
" ";}statusOFS<<endl;
3723 statusOFS<<
"["<<ksup<<
"] "<<
"Union: ";
for(
auto it = Union.begin();it != Union.end();++it){statusOFS<<it->blockIdx<<
" ";}statusOFS<<endl;
3727 LcolSend.swap(Union);
3729 if( this->grid_ -> mpisize != 1 ){
3734 TIMER_STOP(Row_communication);
3737 TIMER_START(Col_communication);
3738 for( Int ksup = 0; ksup < numSuper; ksup++ ){
3740 if(
MYCOL( this->grid_ ) ==
PCOL( ksup, this->grid_ ) ){
3742 std::vector< LBlock<T> > * pUnionRecv;
3743 std::vector< LBlock<T> > & UnionSend = *colBlockRowBuf[ksup];
3745 std::vector<char> sendBuf;
3747 std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
3748 mask[LBlockMask::NZVAL] = 0;
3749 if( this->grid_ -> mpisize != 1 ){
3750 pUnionRecv =
new std::vector<LBlock<T> >();
3752 if(
MYROW( this->grid_ ) ==
PROW( ksup, this->grid_ ) ){
3754 std::stringstream sstms;
3756 serialize( (Int)UnionSend.size(), sstms, NO_MASK );
3757 for( Int ib = 0; ib < UnionSend.size(); ib++ ){
3758 serialize( UnionSend[ib], sstms, mask );
3760 sendBuf.resize( Size( sstms ) );
3761 sstms.read( &sendBuf[0], sendBuf.size() );
3762 localSize = sendBuf.size();
3764 TIMER_START(Bcast_Col_communication);
3765 mpi::Bcast( sendBuf,
PROW(ksup,this->grid_), this->grid_->colComm );
3766 TIMER_STOP(Bcast_Col_communication);
3768 std::stringstream sstmr;
3769 sstmr.write( &sendBuf[0], sendBuf.size() );
3772 deserialize( numLBlock, sstmr, NO_MASK );
3773 pUnionRecv->resize(numLBlock);
3774 for( Int ib = 0; ib < numLBlock; ib++ ){
3775 deserialize( (*pUnionRecv)[ib], sstmr, mask );
3779 UnionSend.resize(pUnionRecv->size());
3780 std::copy(pUnionRecv->begin(),pUnionRecv->end(),UnionSend.begin());
3783 pUnionRecv = &UnionSend;
3786 #if ( _DEBUGlevel_ >= 1 )
3787 statusOFS<<
"["<<ksup<<
"] "<<
"Union: ";
for(
auto it = UnionSend.begin();it != UnionSend.end();++it){statusOFS<<it->blockIdx<<
" ";}statusOFS<<endl;
3790 if( this->grid_ -> mpisize != 1 ){
3795 TIMER_STOP(Col_communication);
3798 for( Int ksup = 0; ksup < numSuper; ksup++ ){
3800 if(
MYROW( this->grid_ ) ==
PROW( ksup, this->grid_ ) ){
3802 std::vector< LBlock<T> > & Union = *colBlockRowBuf[ksup];
3803 std::vector< UBlock<T> > & Urow = this->U(
LBi(ksup, this->grid_ ) );
3804 Int & LrowSize = this->LrowSize_[
LBi(ksup, this->grid_ ) ];
3805 TIMER_START(Computing_Lrow_size);
3808 for(
auto it = Union.begin(); it!=Union.end();++it){
3809 Int Idx = it->blockIdx;
3810 if(Idx > ksup && (Idx % this->grid_->numProcCol) ==
MYCOL(this->grid_) ){
3814 TIMER_STOP(Computing_Lrow_size);
3816 TIMER_START(Extending_Urow);
3817 Urow.reserve(LrowSize);
3819 for(
auto it = Union.begin(); it!=Union.end();++it){
3820 Int Idx = it->blockIdx;
3821 if(Idx > ksup && (Idx % this->grid_->numProcCol) ==
MYCOL(this->grid_) ){
3822 bool isFound =
false;
3824 for(Int jb = 0; jb < Urow.size(); ++jb){
3851 assert(UB.
numRow == it->numCol);
3852 if( UB.
numCol != it->numRow ){
3864 for(Int j = 0; j<UB.
numCol; ++j){
3865 Int newCol = UB.
cols[j];
3866 if(jOldCols<tmpCols.m()){
3867 Int oldCol = tmpCols[jOldCols];
3868 if(newCol == oldCol){
3869 T * nzcolPtr = tmpNzval.VecData(jOldCols);
3870 std::copy(nzcolPtr,nzcolPtr+UB.
numRow,UB.
nzval.VecData(j));
3878 assert(jOldCols>=tmpCols.m());
3885 #if ( _DEBUGlevel_ >= 1 )
3886 statusOFS<<
"["<<ksup<<
"] "<<
"LrowSize = "<<LrowSize<<std::endl;
3888 std::sort(Urow.begin(),Urow.end(),UBlockComparator<T>);
3889 #if ( _DEBUGlevel_ >= 1 )
3890 statusOFS<<
"["<<ksup<<
"] "<<
"Urow extended: ";
for(
auto it = Urow.begin();it != Urow.end();++it){statusOFS<<it->blockIdx<<
" ";}statusOFS<<endl;
3892 TIMER_STOP(Extending_Urow);
3896 for( Int ksup = 0; ksup < numSuper; ksup++ ){
3898 if(
MYCOL( this->grid_ ) ==
PCOL( ksup, this->grid_ ) ){
3900 std::vector< LBlock<T> > & Union = *colBlockRowBuf[ksup];
3901 std::vector< LBlock<T> > & Lcol = this->L(
LBj(ksup, this->grid_ ) );
3902 Int & UcolSize = this->UcolSize_[
LBj(ksup, this->grid_ ) ];
3903 TIMER_START(Computing_Ucol_size);
3906 for(
auto it = Union.begin(); it!=Union.end();++it){
3907 Int Idx = it->blockIdx;
3908 if(Idx > ksup && (Idx % this->grid_->numProcRow) ==
MYROW(this->grid_) ){
3912 TIMER_STOP(Computing_Ucol_size);
3914 TIMER_START(Extending_Lcol);
3915 Lcol.reserve(UcolSize);
3917 for(
auto it = Union.begin(); it!=Union.end();++it){
3918 Int Idx = it->blockIdx;
3919 if(Idx > ksup && (Idx % this->grid_->numProcRow) ==
MYROW(this->grid_) ){
3920 bool isFound =
false;
3922 for(Int ib = 0; ib < Lcol.size(); ++ib){
3936 Lcol.push_back(*it);
3944 assert(LB.
numCol == it->numCol);
3945 if( LB.
numRow != it->numRow ){
3957 for(Int i = 0; i<LB.
numRow; ++i){
3958 Int newRow = LB.
rows[i];
3959 if(iOldRows<tmpRows.m()){
3960 Int oldRow = tmpRows[iOldRows];
3961 if(newRow == oldRow){
3962 for(Int j = 0; j<LB.
numCol; ++j){
3963 LB.
nzval(i,j) = tmpNzval(iOldRows,j);
3972 assert(iOldRows>=tmpRows.m());
3979 #if ( _DEBUGlevel_ >= 1 )
3980 statusOFS<<
"["<<ksup<<
"] "<<
"UcolSize = "<<UcolSize<<std::endl;
3982 std::sort(Lcol.begin(),Lcol.end(),LBlockComparator<T>);
3983 #if ( _DEBUGlevel_ >= 1 )
3984 statusOFS<<
"["<<ksup<<
"] "<<
"Lcol extended: ";
for(
auto it = Lcol.begin();it != Lcol.end();++it){statusOFS<<it->blockIdx<<
" ";}statusOFS<<endl;
3986 TIMER_STOP(Extending_Lcol);
3995 std::vector<Int> nextNZColBlockRow;
3996 nextNZColBlockRow.resize( this->NumLocalBlockCol(), 0 );
3998 std::vector<Int> nextNZRowBlockCol;
3999 nextNZRowBlockCol.resize( this->NumLocalBlockRow() , 0);
4002 for( Int ksup = 0; ksup < numSuper; ksup++ ){
4004 std::vector<Int> tAllBlockRowIdx;
4005 if(
MYCOL( this->grid_ ) ==
PCOL( ksup, this->grid_ ) ){
4007 std::vector< LBlock<T> > & Union = *colBlockRowBuf[ksup];
4009 std::vector<Int> tlocalBlockRowIdx;
4012 for( Int jsup = 0; jsup < ksup; jsup++ ){
4014 Int pjsup = snodeEtree[jsup];
4015 if(pjsup<=ksup &&
MYROW( this->grid_ ) ==
PROW( jsup, this->grid_ ) ){
4017 std::vector< UBlock<T> > & Urow = this->U(
LBi(jsup, this->grid_ ) );
4021 #if ( _DEBUGlevel_ >= 1 )
4022 statusOFS<<
"["<<ksup<<
"] "<<
"Urow["<<jsup<<
"]: ";
for(
auto it = Urow.begin(); it != Urow.end(); ++it){ statusOFS<<it->blockIdx<<
" "; } statusOFS<<endl;
4025 Int & nextIdx = nextNZRowBlockCol[
LBi(jsup, this->grid_) ];
4026 while(nextIdx<Urow.size()){
4027 if(Urow[nextIdx].blockIdx < ksup){
4034 if(nextIdx<Urow.size()){
4035 if(Urow[nextIdx].blockIdx == ksup){
4036 tlocalBlockRowIdx.push_back(jsup);
4047 TIMER_START(Allgatherv_Column_communication);
4048 if( this->grid_ -> mpisize != 1 )
4049 mpi::Allgatherv( tlocalBlockRowIdx, tAllBlockRowIdx, this->grid_->colComm );
4051 tAllBlockRowIdx = tlocalBlockRowIdx;
4053 TIMER_STOP(Allgatherv_Column_communication);
4058 for(
auto it = tAllBlockRowIdx.begin(); it != tAllBlockRowIdx.end(); ++it ){
4063 Union.push_back(LB);
4066 std::sort(Union.begin(),Union.end(),LBlockComparator<T>);
4068 #if ( _DEBUGlevel_ >= 1 )
4069 statusOFS<<
"["<<ksup<<
"] "<<
"NEW Union: ";
4070 for(
auto it = Union.begin(); it != Union.end(); ++it){ statusOFS<<it->blockIdx<<
" "; }
4079 for( Int ksup = 0; ksup < numSuper; ksup++ ){
4081 std::vector<Int> tAllBlockColIdx;
4082 if(
MYROW( this->grid_ ) ==
PROW( ksup, this->grid_ ) ){
4084 std::vector< LBlock<T> > & Union = *colBlockRowBuf[ksup];
4085 std::vector<Int> tlocalBlockColIdx;
4087 for( Int jsup = 0; jsup < ksup; jsup++ ){
4089 Int pjsup = snodeEtree[jsup];
4090 if(pjsup<=ksup &&
MYCOL( this->grid_ ) ==
PCOL( jsup, this->grid_ ) ){
4091 std::vector< LBlock<T> > & Lcol = this->L(
LBj(jsup, this->grid_ ) );
4095 #if ( _DEBUGlevel_ >= 1 )
4096 statusOFS<<
"["<<ksup<<
"] "<<
"Lcol["<<jsup<<
"]: ";
for(
auto it = Lcol.begin(); it != Lcol.end(); ++it){ statusOFS<<it->blockIdx<<
" "; } statusOFS<<endl;
4099 Int & nextIdx = nextNZColBlockRow[
LBj(jsup, this->grid_) ];
4100 while(nextIdx<Lcol.size()){
4101 if(Lcol[nextIdx].blockIdx < ksup){
4108 if(nextIdx<Lcol.size()){
4109 if(Lcol[nextIdx].blockIdx == ksup){
4110 tlocalBlockColIdx.push_back(jsup);
4125 TIMER_START(Allgatherv_Row_communication);
4126 if( this->grid_ -> mpisize != 1 )
4127 mpi::Allgatherv( tlocalBlockColIdx, tAllBlockColIdx, this->grid_->rowComm );
4129 tAllBlockColIdx = tlocalBlockColIdx;
4131 TIMER_STOP(Allgatherv_Row_communication);
4136 for(
auto it = tAllBlockColIdx.begin(); it != tAllBlockColIdx.end(); ++it ){
4141 Union.push_back(LB);
4144 std::sort(Union.begin(),Union.end(),LBlockComparator<T>);
4145 auto last = std::unique(Union.begin(),Union.end(),LBlockEqualComparator<T>);
4146 Union.erase(last,Union.end());
4148 #if ( _DEBUGlevel_ >= 1 )
4149 statusOFS<<
"["<<ksup<<
"] "<<
"NEW Union: ";
4150 for(
auto it = Union.begin(); it != Union.end(); ++it){ statusOFS<<it->blockIdx<<
" "; }
4161 TIMER_START(STB_RFA);
4162 for( Int ksup = 0; ksup < numSuper - 1; ksup++ ){
4166 Int jsup = snodeEtree[ksup];
4167 while(jsup<numSuper){
4168 Int jsupLocalBlockCol =
LBj( jsup, this->grid_ );
4169 Int jsupProcCol =
PCOL( jsup, this->grid_ );
4170 if(
MYCOL( this->grid_ ) == jsupProcCol ){
4171 std::vector< LBlock<T> > & Union = *colBlockRowBuf[jsup];
4175 for(
auto it = Union.begin();it!=Union.end();++it){
4176 if(it->blockIdx == ksup){
4180 else if(it->blockIdx > ksup){
4185 for(
auto si = Union.begin(); si != Union.end(); si++ ){
4186 Int isup = si->blockIdx;
4187 Int isupProcRow =
PROW( isup, this->grid_ );
4189 if(
MYROW( this->grid_ ) == isupProcRow ){
4190 this->isRecvFromAbove_(ksup) =
true;
4192 if(
MYROW( this->grid_ ) ==
PROW( ksup, this->grid_ ) ){
4193 this->isSendToBelow_( isupProcRow, ksup ) =
true;
4200 jsup = snodeEtree[jsup];
4204 #if ( _DEBUGlevel_ >= 1 )
4205 statusOFS << std::endl <<
"this->isSendToBelow:" << std::endl;
4206 for(
int j = 0;j< this->isSendToBelow_.n();j++){
4207 statusOFS <<
"["<<j<<
"] ";
4208 for(
int i =0; i < this->isSendToBelow_.m();i++){
4209 statusOFS<< this->isSendToBelow_(i,j) <<
" ";
4211 statusOFS<<std::endl;
4214 statusOFS << std::endl <<
"this->isRecvFromAbove:" << std::endl;
4215 for(
int j = 0;j< this->isRecvFromAbove_.m();j++){
4216 statusOFS <<
"["<<j<<
"] "<< this->isRecvFromAbove_(j)<<std::endl;
4222 TIMER_STOP(STB_RFA);
4231 TIMER_START(STR_RFL_RFB);
4234 for( Int ksup = 0; ksup < numSuper - 1; ksup++ ){
4237 Int isup = snodeEtree[ksup];
4238 while(isup<numSuper){
4239 Int isupLocalBlockRow =
LBi( isup, this->grid_ );
4240 Int isupProcRow =
PROW( isup, this->grid_ );
4241 if(
MYROW( this->grid_ ) == isupProcRow ){
4242 std::vector< LBlock<T> > & Union = *colBlockRowBuf[isup];
4246 for(
auto it = Union.begin();it!=Union.end();++it){
4247 if(it->blockIdx == ksup){
4251 else if(it->blockIdx > ksup){
4256 for(
auto si = Union.begin(); si != Union.end(); si++ ){
4257 Int jsup = si->blockIdx;
4258 Int jsupProcCol =
PCOL( jsup, this->grid_ );
4260 if(
MYCOL( this->grid_ ) == jsupProcCol ){
4261 this->isRecvFromLeft_(ksup) =
true;
4263 if(
MYCOL( this->grid_ ) ==
PCOL( ksup, this->grid_ ) ){
4264 this->isSendToRight_( jsupProcCol, ksup ) =
true;
4271 if(
MYCOL( this->grid_ ) ==
PCOL(ksup, this->grid_) ){
4272 if(
MYROW( this->grid_ ) ==
PROW( ksup, this->grid_ ) ){
4273 this->isRecvFromBelow_(isupProcRow,ksup) =
true;
4275 else if (
MYROW(this->grid_) == isupProcRow){
4276 this->isSendToDiagonal_(ksup)=
true;
4279 isup = snodeEtree[isup];
4283 #if ( _DEBUGlevel_ >= 1 )
4284 statusOFS << std::endl <<
"this->isSendToRight:" << std::endl;
4285 for(
int j = 0;j< this->isSendToRight_.n();j++){
4286 statusOFS <<
"["<<j<<
"] ";
4287 for(
int i =0; i < this->isSendToRight_.m();i++){
4288 statusOFS<< this->isSendToRight_(i,j) <<
" ";
4290 statusOFS<<std::endl;
4293 statusOFS << std::endl <<
"this->isRecvFromLeft:" << std::endl;
4294 for(
int j = 0;j< this->isRecvFromLeft_.m();j++){
4295 statusOFS <<
"["<<j<<
"] "<< this->isRecvFromLeft_(j)<<std::endl;
4298 statusOFS << std::endl <<
"this->isRecvFromBelow:" << std::endl;
4299 for(
int j = 0;j< this->isRecvFromBelow_.n();j++){
4300 statusOFS <<
"["<<j<<
"] ";
4301 for(
int i =0; i < this->isRecvFromBelow_.m();i++){
4302 statusOFS<< this->isRecvFromBelow_(i,j) <<
" ";
4304 statusOFS<<std::endl;
4309 TIMER_STOP(STR_RFL_RFB);
4314 TIMER_START(STCD_RFCD);
4317 for( Int ksup = 0; ksup < numSuper - 1; ksup++ ){
4318 if(
MYCOL( this->grid_ ) ==
PCOL( ksup, this->grid_ ) ){
4319 std::vector< LBlock<T> > & Union = *colBlockRowBuf[ksup];
4320 for(
auto si = Union.begin(); si != Union.end(); si++ ){
4321 Int isup = si->blockIdx;
4322 Int isupProcRow =
PROW( isup, this->grid_ );
4323 Int isupProcCol =
PCOL( isup, this->grid_ );
4324 if( isup > ksup &&
MYROW( this->grid_ ) == isupProcRow ){
4325 this->isSendToCrossDiagonal_(this->grid_->numProcCol, ksup ) =
true;
4326 this->isSendToCrossDiagonal_(isupProcCol, ksup ) =
true;
4332 for( Int ksup = 0; ksup < numSuper - 1; ksup++ ){
4333 if(
MYROW( this->grid_ ) ==
PROW( ksup, this->grid_ ) ){
4334 std::vector< LBlock<T> > & Union = *colBlockRowBuf[ksup];
4335 for(
auto si = Union.begin(); si != Union.end(); si++ ){
4336 Int jsup = si->blockIdx;
4337 Int jsupProcCol =
PCOL( jsup, this->grid_ );
4338 Int jsupProcRow =
PROW( jsup, this->grid_ );
4339 if( jsup > ksup &&
MYCOL(this->grid_) == jsupProcCol ){
4340 this->isRecvFromCrossDiagonal_(this->grid_->numProcRow, ksup ) =
true;
4341 this->isRecvFromCrossDiagonal_(jsupProcRow, ksup ) =
true;
4346 #if ( _DEBUGlevel_ >= 1 )
4347 statusOFS << std::endl <<
"this->isSendToCrossDiagonal:" << std::endl;
4348 for(
int j =0; j < this->isSendToCrossDiagonal_.n();j++){
4349 if(this->isSendToCrossDiagonal_(this->grid_->numProcCol,j)){
4350 statusOFS <<
"["<<j<<
"] ";
4351 for(
int i =0; i < this->isSendToCrossDiagonal_.m()-1;i++){
4352 if(this->isSendToCrossDiagonal_(i,j))
4354 statusOFS<<
PNUM(
PROW(j,this->grid_),i,this->grid_)<<
" ";
4357 statusOFS<<std::endl;
4361 statusOFS << std::endl <<
"this->isRecvFromCrossDiagonal:" << std::endl;
4362 for(
int j =0; j < this->isRecvFromCrossDiagonal_.n();j++){
4363 if(this->isRecvFromCrossDiagonal_(this->grid_->numProcRow,j)){
4364 statusOFS <<
"["<<j<<
"] ";
4365 for(
int i =0; i < this->isRecvFromCrossDiagonal_.m()-1;i++){
4366 if(this->isRecvFromCrossDiagonal_(i,j))
4368 statusOFS<<
PNUM(i,
PCOL(j,this->grid_),this->grid_)<<
" ";
4371 statusOFS<<std::endl;
4379 TIMER_STOP(STCD_RFCD);
4381 for( Int ksup = 0; ksup < numSuper; ksup++ ){
4382 if(
MYCOL( this->grid_ ) ==
PCOL( ksup, this->grid_ )
4383 ||
MYROW( this->grid_ ) ==
PROW( ksup, this->grid_ ) ){
4384 delete colBlockRowBuf[ksup];
4385 if(
MYROW( this->grid_ ) ==
PROW( ksup, this->grid_ ) ){
4386 delete rowBlockColBuf[ksup];
4392 this->GetWorkSet(snodeEtree,this->WorkingSet());
4394 TIMER_STOP(ConstructCommunicationPattern);
4403 #endif //_PEXSI_PSELINV_UNSYM_IMPL_HPP_
void UnpackData(SuperNodeBufferTypeUnsym &snode, std::vector< LBlock< T > > &LcolRecv, std::vector< LBlock< T > > &LrowRecv, std::vector< UBlock< T > > &UcolRecv, std::vector< UBlock< T > > &UrowRecv)
UnpackData.
Definition: pselinv_unsym_impl.hpp:1659
void SelInvIntra_P2p(Int lidx)
SelInvIntra_P2p.
Definition: pselinv_unsym_impl.hpp:1857
Int blockIdx
Block index (supernodal index)
Definition: pselinv.hpp:248
void ComputeDiagUpdate(SuperNodeBufferTypeUnsym &snode)
ComputeDiagUpdate.
Definition: pselinv_unsym_impl.hpp:1794
Interface with SuperLU_Dist (version 3.0 and later)
Int PROW(Int bnum, const GridType *g)
PROW returns the processor row that the bnum-th block (supernode) belongs to.
Definition: pselinv.hpp:304
Int numRow
Number of nonzero rows.
Definition: pselinv.hpp:199
void SelInv_lookup_indexes(SuperNodeBufferTypeUnsym &snode, std::vector< LBlock< T > > &LcolRecv, std::vector< LBlock< T > > &LrowRecv, std::vector< UBlock< T > > &UcolRecv, std::vector< UBlock< T > > &UrowRecv, NumMat< T > &AinvBuf, NumMat< T > &LBuf, NumMat< T > &UBuf)
SelInv_lookup_indexes.
Definition: pselinv_unsym_impl.hpp:100
Int MYPROC(const GridType *g)
MYPROC returns the current processor rank.
Definition: pselinv.hpp:291
Int PCOL(Int bnum, const GridType *g)
PCOL returns the processor column that the bnum-th block (supernode) belongs to.
Definition: pselinv.hpp:309
Int LBi(Int bnum, const GridType *g)
LBi returns the local block number on the processor at processor row PROW( bnum, g )...
Definition: pselinv.hpp:319
Profiling and timing using TAU.
Int SuperSize(Int bnum, const SuperNodeType *s)
SuperSize returns the size of the block bnum.
Definition: pselinv.hpp:360
Int LBj(Int bnum, const GridType *g)
LBj returns the local block number on the processor at processor column PCOL( bnum, g ).
Definition: pselinv.hpp:324
Int GBj(Int jLocal, const GridType *g)
GBj returns the global block number from a local block number in the column direction.
Definition: pselinv.hpp:334
void SetValue(NumMat< F > &M, F val)
SetValue sets a numerical matrix to a constant val.
Definition: NumMat_impl.hpp:153
Int numCol
Number of nonzero columns.
Definition: pselinv.hpp:202
Definition: pselinv_unsym_impl.hpp:402
LBlock stores a nonzero block in the lower triangular part or the diagonal part in PSelInv...
Definition: pselinv.hpp:193
Int PNUM(Int i, Int j, const GridType *g)
PNUM returns the processor rank that the bnum-th block (supernode) belongs to.
Definition: pselinv.hpp:314
Int GBi(Int iLocal, const GridType *g)
GBi returns the global block number from a local block number in the row direction.
Definition: pselinv.hpp:329
Int MYCOL(const GridType *g)
MYCOL returns my processor column.
Definition: pselinv.hpp:299
virtual void ConstructCommunicationPattern()
ConstructCommunicationPattern constructs the communication pattern to be used later in the selected i...
Definition: pselinv_unsym_impl.hpp:3426
IntNumVec rows
Dimension numRow * 1, index (0-based) for the number of nonzero rows.
Definition: pselinv.hpp:205
Int numCol
Number of nonzero columns.
Definition: pselinv.hpp:254
UBlock stores a nonzero block in the upper triangular part in PSelInv.
Definition: pselinv.hpp:245
Definition: pselinv_unsym.hpp:198
Int FirstBlockCol(Int bnum, const SuperNodeType *s)
FirstBlockCol returns the first column of a block bnum.
Definition: pselinv.hpp:348
Int blockIdx
Block index (supernodal index)
Definition: pselinv.hpp:196
void SendRecvCD(std::vector< SuperNodeBufferTypeUnsym > &arrSuperNodes, Int stepSuper)
SendRecvCD_UpdateU.
Definition: pselinv_unsym_impl.hpp:1068
Int numRow
Number of nonzero rows.
Definition: pselinv.hpp:251
virtual void SelInv()
SelInv is the main function for the selected inversion.
Definition: pselinv_unsym_impl.hpp:3163
void ConstructCommunicationPattern_P2p()
ConstructCommunicationPattern_P2p constructs the communication pattern to be used later in the select...
Definition: pselinv_unsym_impl.hpp:3434
NumMat< T > nzval
Dimension numRow * numCol, nonzero elements.
Definition: pselinv.hpp:260
Int NumSuper(const SuperNodeType *s)
NumSuper returns the total number of supernodes.
Definition: pselinv.hpp:364
NumMat< T > nzval
Dimension numRow * numCol, nonzero elements.
Definition: pselinv.hpp:209
virtual void PreSelInv()
PreSelInv prepares the structure in L_ and U_ so that SelInv only involves matrix-matrix multiplicati...
Definition: pselinv_unsym_impl.hpp:3205
IntNumVec cols
Dimension numCol * 1, index (0-based) for the number of nonzero rows.
Definition: pselinv.hpp:257
void SelInv_P2p()
Point-to-point version of the selected inversion.
Definition: pselinv_unsym_impl.hpp:3169
PMatrixUnsym contains the main data structure and the computational routine for the parallel selected...
Definition: pselinv.hpp:1003
Int MYROW(const GridType *g)
MYROW returns my processor row.
Definition: pselinv.hpp:295