PEXSI
pselinv_unsym_impl.hpp
1 /*
2  Copyright (c) 2012 The Regents of the University of California,
3  through Lawrence Berkeley National Laboratory.
4 
5 Authors: Lin Lin and Mathias Jacquelin
6 
7 This file is part of PEXSI. All rights reserved.
8 
9 Redistribution and use in source and binary forms, with or without
10 modification, are permitted provided that the following conditions are met:
11 
12 (1) Redistributions of source code must retain the above copyright notice, this
13 list of conditions and the following disclaimer.
14 (2) Redistributions in binary form must reproduce the above copyright notice,
15 this list of conditions and the following disclaimer in the documentation
16 and/or other materials provided with the distribution.
17 (3) Neither the name of the University of California, Lawrence Berkeley
18 National Laboratory, U.S. Dept. of Energy nor the names of its contributors may
19 be used to endorse or promote products derived from this software without
20 specific prior written permission.
21 
22 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
23 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
24 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
25 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
26 ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
27 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
29 ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 
33 You are under no obligation whatsoever to provide any bug fixes, patches, or
34 upgrades to the features, functionality or performance of the source code
35 ("Enhancements") to anyone; however, if you choose to make your Enhancements
36 available either publicly, or directly to Lawrence Berkeley National
37 Laboratory, without imposing a separate written license agreement for such
38 Enhancements, then you hereby grant the following license: a non-exclusive,
39 royalty-free perpetual license to install, use, modify, prepare derivative
40 works, incorporate into other computer software, distribute, and sublicense
41 such enhancements or derivative works thereof, in binary and source code form.
42  */
46 #ifndef _PEXSI_PSELINV_UNSYM_IMPL_HPP_
47 #define _PEXSI_PSELINV_UNSYM_IMPL_HPP_
48 
49 #include "pexsi/timer.h"
51 #include <algorithm>
52 
53 
54 
55 
56 namespace PEXSI{
57 template<typename T>
58  PMatrixUnsym<T>::PMatrixUnsym (
59  const GridType* g,
60  const SuperNodeType* s,
61  const PSelInvOptions * o,
62  const FactorizationOptions * oFact
63  )
64  {
65 
66  this->Setup( g, s, o, oFact );
67 
68  return ;
69  } // ----- end of method PMatrixUnsym::PMatrixUnsym -----
70 
71 
72 template<typename T>
73  void PMatrixUnsym<T>::Setup(
74  const GridType* g,
75  const SuperNodeType* s,
76  const PSelInvOptions * o,
77  const FactorizationOptions * oFact
78  ) {
79 
80  PMatrix<T>::Setup(g,s,o,oFact);
81 
82 
83 
84  Lrow_.clear();
85  Ucol_.clear();
86  Lrow_.resize( this->NumLocalBlockRow() );
87  Ucol_.resize( this->NumLocalBlockCol() );
88 
89  LrowSize_.clear();
90  UcolSize_.clear();
91  LrowSize_.resize( this->NumLocalBlockRow() );
92  UcolSize_.resize( this->NumLocalBlockCol() );
93 
94  return ;
95  } // ----- end of method PMatrixUnsym::Setup -----
96 
97 
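 // Overview: sort the received L/U blocks, build row/column pointers into
 // the work buffers, copy the factors into LBuf and UBuf, and gather the
 // matching blocks of Sinv into AinvBuf via relative row/column indices.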
99 template<typename T>
100  inline void PMatrixUnsym<T>::Selinv_lookup_indexes(
101  SuperNodeBufferTypeUnsym & snode,
102  std::vector<LBlock<T> > & LcolRecv,
103  std::vector<LBlock<T> > & LrowRecv,
104  std::vector<UBlock<T> > & UcolRecv,
105  std::vector<UBlock<T> > & UrowRecv, /* unused so far */
106  NumMat<T> & AinvBuf,
107  NumMat<T> & LBuf,
108  NumMat<T> & UBuf )
109  {
110  TIMER_START(Compute_Sinv_LT_Lookup_Indexes);
111 
112 
113 
114 
115 
116  TIMER_START(Build_colptr_rowptr);
117 
118 #if ( _DEBUGlevel_ >= 2 )
119  statusOFS<<"UrowRecv blockIdx: "<<std::endl;
120  for( Int jb = 0; jb < UrowRecv.size(); jb++ ){
121  statusOFS<<UrowRecv[jb].blockIdx<<" ";
122  }
123  statusOFS<<std::endl;
124 
125  statusOFS<<"UcolRecv blockIdx: "<<std::endl;
126  for( Int jb = 0; jb < UcolRecv.size(); jb++ ){
127  statusOFS<<UcolRecv[jb].blockIdx<<" ";
128  }
129  statusOFS<<std::endl;
130 
131  statusOFS<<"LcolRecv blockIdx: "<<std::endl;
132  for( Int jb = 0; jb < LcolRecv.size(); jb++ ){
133  statusOFS<<LcolRecv[jb].blockIdx<<" ";
134  }
135  statusOFS<<std::endl;
136 
137 
138  statusOFS<<"LrowRecv blockIdx: "<<std::endl;
139  for( Int jb = 0; jb < LrowRecv.size(); jb++ ){
140  statusOFS<<LrowRecv[jb].blockIdx<<" ";
141  }
142  statusOFS<<std::endl;
143 #endif
144 
145  TIMER_START(Sort_LrowRecv_UcolRecv);
146  //LrowRecv should be sorted by blockIdx, same for UcolRecv
147  std::sort(LrowRecv.begin(),LrowRecv.end(),LBlockComparator<T>);
148  std::sort(UcolRecv.begin(),UcolRecv.end(),UBlockComparator<T>);
149  TIMER_STOP(Sort_LrowRecv_UcolRecv);
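 // Sorting by blockIdx guarantees that the colPtrL/colPtrU offsets computed
 // below lay the blocks out contiguously in increasing supernode order.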
150 
151  // rowPtrL[ib] gives the row index in snode.LUpdateBuf for the first
152  // nonzero row in LcolRecv[ib]. The total number of rows in
153  // snode.LUpdateBuf is given by the last entry, rowPtrL.back()
154  std::vector<Int> rowPtrL(LcolRecv.size() + 1);
155  rowPtrL[0] = 0;
156  for( Int ib = 0; ib < LcolRecv.size(); ib++ ){
157  LBlock<T> & LB = LcolRecv[ib];
158  rowPtrL[ib+1] = rowPtrL[ib] + LB.numRow;
159  }
160 
161  // colPtrL[jb] gives the column index in LBuf for the first
162  // nonzero column in LrowRecv[jb]. The total number of columns in
163  // LBuf is given by the last entry, colPtrL.back()
164 
165  std::vector<Int> colPtrL(LrowRecv.size() + 1);
166  std::vector<Int> colPtrU(UcolRecv.size() + 1,-1);
167  colPtrL[0] = 0;
168  for( Int jb = 0; jb < LrowRecv.size(); jb++ ){
169  LBlock<T> & LB = LrowRecv[jb];
170  colPtrL[jb+1] = colPtrL[jb] + LB.numCol;
171  }
172 
173  colPtrU[0] = 0;
174  for( Int jb = 0; jb < UcolRecv.size(); jb++ ){
175  UBlock<T> & UB = UcolRecv[jb];
176  colPtrU[jb+1] = colPtrU[jb] + UB.numCol;
177  }
178 
179 #if ( _DEBUGlevel_ >= 1 )
180  statusOFS<<rowPtrL<<std::endl;
181  statusOFS<<colPtrL<<std::endl;
182  statusOFS<<colPtrU<<std::endl;
183 #endif
184 
185 
186  Int numRowAinvBuf = *colPtrU.rbegin();
187  //Int numRowAinvBuf = *rowPtrL.rbegin();
188  Int numColAinvBuf = *colPtrL.rbegin();
189  TIMER_STOP(Build_colptr_rowptr);
190 
191  TIMER_START(Allocate_lookup);
192  // Allocate for the computational storage
193  AinvBuf.Resize( numRowAinvBuf, numColAinvBuf );
194  LBuf.Resize( SuperSize( snode.Index, this->super_ ), numColAinvBuf );
195  UBuf.Resize( SuperSize( snode.Index, this->super_ ), numRowAinvBuf );
196  TIMER_STOP(Allocate_lookup);
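 // Buffer shapes: AinvBuf is numRowAinvBuf x numColAinvBuf, while LBuf and
 // UBuf both have SuperSize(snode.Index) rows spanning the current supernode.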
197 
198 
199 
200  TIMER_START(Fill_LBuf);
201  // Fill LBuf first. Make the transpose later in the Gemm phase.
202  for( Int jb = 0; jb < LrowRecv.size(); jb++ ){
203  LBlock<T>& LB = LrowRecv[jb];
204  if(1 || (LB.numRow>0 && LB.numCol>0)){
205  if( LB.numRow != SuperSize(snode.Index, this->super_) ){
206  ErrorHandling(
207  "The size of LB is not right. Something is seriously wrong." );
208  }
209  lapack::Lacpy( 'A', LB.numRow, LB.numCol, LB.nzval.Data(),
210  LB.numRow, LBuf.VecData( colPtrL[jb] ), LBuf.m() );
211  }
212  }
213  TIMER_STOP(Fill_LBuf);
214 
215  TIMER_START(Fill_UBuf);
216  // Fill UBuf first.
217  SetValue(UBuf, ZERO<T>());
218  for( Int jb = 0; jb < UcolRecv.size(); jb++ ){
219  UBlock<T>& UB = UcolRecv[jb];
220 
221  if(1||(UB.numRow>0 && UB.numCol>0)){
222  if( UB.numRow != SuperSize(snode.Index, this->super_) ){
223  ErrorHandling(
224  "The size of UB is not right. Something is seriously wrong." );
225  }
226 
227 
228  lapack::Lacpy( 'A', UB.numRow, UB.numCol, UB.nzval.Data(),
229  UB.numRow, UBuf.VecData( colPtrU[jb] ), UBuf.m() );
230  }
231  }
232  TIMER_STOP(Fill_UBuf);
233 
234  // Calculate the relative indices for (isup, jsup)
235  // Fill AinvBuf with the information in L or U block.
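 // If isup >= jsup the block lives in the lower-triangular (L) part of Sinv
 // and is located by matching row indices; otherwise it lives in the
 // upper-triangular (U) part and is located by matching column indices.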
236  TIMER_START(JB_Loop);
237  // for( Int jb = 0; jb < UrowRecv.size(); jb++ ){
238  for( Int jb = 0; jb < LrowRecv.size(); jb++ ){
239  LBlock<T>& LrowB = LrowRecv[jb];
240  Int jsup = LrowB.blockIdx;
241 
242  Int SinvColsSta = FirstBlockCol( jsup, this->super_ );
243 
244 
245  for( Int ib = 0; ib < LcolRecv.size(); ib++ ){
246  LBlock<T>& LB = LcolRecv[ib];
247  Int isup = LB.blockIdx;
248  Int SinvRowsSta = FirstBlockCol( isup, this->super_ );
249  T* nzvalAinv = &AinvBuf( rowPtrL[ib], colPtrL[jb] );
250  Int ldAinv = numRowAinvBuf;
251 
252  // Pin down the corresponding block in the part of Sinv.
253  if( isup >= jsup ){
254  // Column relative indices
255  std::vector<Int> relCols( LrowB.numCol );
256  for( Int j = 0; j < LrowB.numCol; j++ ){
257  relCols[j] = LrowB.rows[j] - SinvColsSta;
258  }
259 
260  std::vector<LBlock<T> >& LcolSinv = this->L( LBj(jsup, this->grid_ ));
261  bool isBlockFound = false;
262  TIMER_START(PARSING_ROW_BLOCKIDX);
263  for( Int ibSinv = 0; ibSinv < LcolSinv.size(); ibSinv++ ){
264  // Found the (isup, jsup) block in Sinv
265  if( LcolSinv[ibSinv].blockIdx == isup ){
266  LBlock<T>& SinvB = LcolSinv[ibSinv];
267 
268  // Row relative indices
269  std::vector<Int> relRows( LB.numRow );
270  Int* rowsLBPtr = LB.rows.Data();
271  Int* rowsSinvBPtr = SinvB.rows.Data();
272 
273  TIMER_START(STDFIND_ROW);
274  Int * pos =&rowsSinvBPtr[0];
275  Int * last =&rowsSinvBPtr[SinvB.numRow];
276  for( Int i = 0; i < LB.numRow; i++ ){
277  pos = std::upper_bound(pos, last, rowsLBPtr[i])-1;
278  if(pos != last){
279  relRows[i] = (Int)(pos - rowsSinvBPtr);
280  }
281  else{
282  std::ostringstream msg;
283  msg << "Row " << rowsLBPtr[i] <<
284  " in LB cannot find the corresponding row in SinvB"
285  << std::endl
286  << "LB.rows = " << LB.rows << std::endl
287  << "SinvB.rows = " << SinvB.rows << std::endl;
288  ErrorHandling( msg.str().c_str() );
289  }
290  }
291  TIMER_STOP(STDFIND_ROW);
292 
293  TIMER_START(Copy_Sinv_to_Ainv);
294  // Transfer the values from Sinv to AinvBlock
295  T* nzvalSinv = SinvB.nzval.Data();
296  Int ldSinv = SinvB.numRow;
297  for( Int j = 0; j < LrowB.numCol; j++ ){
298  for( Int i = 0; i < LB.numRow; i++ ){
299  // AinvBuf( rowPtrL[ib] + i, colPtrL[jb] + j ) =
300  // SinvB.nzval(relRows[i],relCols[j]);
301  nzvalAinv[i + j*ldAinv] =
302  nzvalSinv[relRows[i]+relCols[j]*ldSinv];
303  }
304  }
305  TIMER_STOP(Copy_Sinv_to_Ainv);
306 
307  isBlockFound = true;
308  break;
309  }
310  } // for (ibSinv )
311  TIMER_STOP(PARSING_ROW_BLOCKIDX);
312  if( isBlockFound == false ){
313  std::ostringstream msg;
314  msg << "["<<snode.Index<<"] "<<"Block(" << isup << ", " << jsup
315  << ") did not find a matching block in Sinv." << std::endl;
316  //#if ( _DEBUGlevel_ >= 1 )
317  statusOFS<<msg.str();
318  //#endif
319  // ErrorHandling( msg.str().c_str() );
320  }
321  } // if (isup, jsup) is in L
322  else{
323  // Row relative indices
324  std::vector<Int> relRows( LB.numRow );
325  for( Int i = 0; i < LB.numRow; i++ ){
326  relRows[i] = LB.rows[i] - SinvRowsSta;
327  }
328 
329  std::vector<UBlock<T> >& UrowSinv =
330  this->U( LBi( isup, this->grid_ ) );
331  bool isBlockFound = false;
332  TIMER_START(PARSING_COL_BLOCKIDX);
333  for( Int jbSinv = 0; jbSinv < UrowSinv.size(); jbSinv++ ){
334  // Found the (isup, jsup) block in Sinv
335  if( UrowSinv[jbSinv].blockIdx == jsup ){
336  UBlock<T>& SinvB = UrowSinv[jbSinv];
337 
338  // Column relative indices
339  std::vector<Int> relCols( LrowB.numCol );
340  Int* rowsLrowBPtr = LrowB.rows.Data();
341  Int* colsSinvBPtr = SinvB.cols.Data();
342 
343  TIMER_START(STDFIND_COL);
344  Int * pos =&colsSinvBPtr[0];
345  Int * last =&colsSinvBPtr[SinvB.numCol];
346  for( Int j = 0; j < LrowB.numCol; j++ ){
347  //rowsLrowB is sorted
348  pos = std::upper_bound(pos, last, rowsLrowBPtr[j])-1;
349  if(pos !=last){
350  relCols[j] = (Int)(pos - colsSinvBPtr);
351  }
352  else{
353  std::ostringstream msg;
354  msg << "Col " << rowsLrowBPtr[j] <<
355  " in LrowB cannot find the corresponding column in SinvB"
356  << std::endl
357  << "LrowB.rows = " << LrowB.rows << std::endl
358  << "SinvB.cols = " << SinvB.cols << std::endl;
359  ErrorHandling( msg.str().c_str() );
360  }
361  }
362  TIMER_STOP(STDFIND_COL);
363 
364  TIMER_START(Copy_Sinv_to_Ainv);
365  // Transfer the values from Sinv to AinvBlock
366  T* nzvalSinv = SinvB.nzval.Data();
367  Int ldSinv = SinvB.numRow;
368  for( Int j = 0; j < LrowB.numCol; j++ ){
369  for( Int i = 0; i < LB.numRow; i++ ){
370  // AinvBuf( rowPtrL[ib] + i, colPtrL[jb] + j ) =
371  // SinvB.nzval(relRows[i],relCols[j]);
372 
373  nzvalAinv[i + j*ldAinv] =
374  nzvalSinv[relRows[i]+relCols[j]*ldSinv];
375  }
376  }
377  TIMER_STOP(Copy_Sinv_to_Ainv);
378 
379  isBlockFound = true;
380  break;
381  }
382  } // for (jbSinv)
383  TIMER_STOP(PARSING_COL_BLOCKIDX);
384  if( isBlockFound == false ){
385  std::ostringstream msg;
386  msg << "["<<snode.Index<<"] "<< "Block(" << isup << ", " << jsup
387  << ") did not find a matching block in Sinv." << std::endl;
388  statusOFS<<msg.str();
389  // ErrorHandling( msg.str().c_str() );
390  }
391  } // if (isup, jsup) is in U
392 
393  } // for( ib )
394  } // for ( jb )
395  TIMER_STOP(JB_Loop);
396 
397  TIMER_STOP(Compute_Sinv_LT_Lookup_Indexes);
398  } // End of method PMatrixUnsym::Selinv_lookup_indexes
399 
400 
401 
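 // CDBuffers bundles the MPI requests and serialized byte buffers used for
 // the cross-diagonal exchange: L column data travels one way while U row
 // data travels the other, with per-supernode offsets into the flat arrays.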
402  struct CDBuffers{
403  //Buffers for L
404  std::vector<MPI_Request > arrMpiReqsSendLCD;
405  std::vector<MPI_Request > arrMpiReqsSizeSendLCD;
406  std::vector<MPI_Request > arrMpiReqsRecvLCD;
407  std::vector<MPI_Request > arrMpiReqsSizeRecvLCD;
408  std::vector<std::vector<char> > arrSstrLcolSendCD;
409  std::vector<int > arrSstrLcolSizeSendCD;
410  std::vector<std::vector<char> > arrSstrLrowRecvCD;
411  std::vector<int > arrSstrLrowSizeRecvCD;
412 
413  //Buffers for U
414  std::vector<MPI_Request > arrMpiReqsSendUCD;
415  std::vector<MPI_Request > arrMpiReqsSizeSendUCD;
416  std::vector<MPI_Request > arrMpiReqsRecvUCD;
417  std::vector<MPI_Request > arrMpiReqsSizeRecvUCD;
418  std::vector<std::vector<char> > arrSstrUrowSendCD;
419  std::vector<int > arrSstrUrowSizeSendCD;
420  std::vector<std::vector<char> > arrSstrUcolRecvCD;
421  std::vector<int > arrSstrUcolSizeRecvCD;
422 
423  std::vector<Int>sendOffset;
424  std::vector<Int>recvOffset;
425 
426  void resize(Int sendCount, Int recvCount){
427  //Buffers for L
428  arrMpiReqsSendLCD.assign(sendCount, MPI_REQUEST_NULL );
429  arrMpiReqsSizeSendLCD.assign(sendCount, MPI_REQUEST_NULL );
430  arrMpiReqsRecvLCD.assign(recvCount, MPI_REQUEST_NULL );
431  arrMpiReqsSizeRecvLCD.assign(recvCount, MPI_REQUEST_NULL );
432  arrSstrLcolSendCD.resize(sendCount);
433  arrSstrLcolSizeSendCD.resize(sendCount);
434  arrSstrLrowRecvCD.resize(recvCount);
435  arrSstrLrowSizeRecvCD.resize(recvCount);
436 
437  //Buffers for U
438  arrMpiReqsSendUCD.assign(recvCount, MPI_REQUEST_NULL );
439  arrMpiReqsSizeSendUCD.assign(recvCount, MPI_REQUEST_NULL );
440  arrMpiReqsRecvUCD.assign(sendCount, MPI_REQUEST_NULL );
441  arrMpiReqsSizeRecvUCD.assign(sendCount, MPI_REQUEST_NULL );
442  arrSstrUrowSendCD.resize(recvCount);
443  arrSstrUrowSizeSendCD.resize(recvCount);
444  arrSstrUcolRecvCD.resize(sendCount);
445  arrSstrUcolSizeRecvCD.resize(sendCount);
446  }
447 
448  void WaitAllSend(){
449  mpi::Waitall(arrMpiReqsSizeSendLCD);
450  mpi::Waitall(arrMpiReqsSendLCD);
451  mpi::Waitall(arrMpiReqsSizeSendUCD);
452  mpi::Waitall(arrMpiReqsSendUCD);
453 
454  // MPI_Waitall(arrMpiReqsSizeSendLCD.size(), &arrMpiReqsSizeSendLCD[0], MPI_STATUSES_IGNORE);
455  // MPI_Waitall(arrMpiReqsSendLCD.size(), &arrMpiReqsSendLCD[0], MPI_STATUSES_IGNORE);
456  // MPI_Waitall(arrMpiReqsSizeSendUCD.size(), &arrMpiReqsSizeSendUCD[0], MPI_STATUSES_IGNORE);
457  // MPI_Waitall(arrMpiReqsSendUCD.size(), &arrMpiReqsSendUCD[0], MPI_STATUSES_IGNORE);
458  }
459 
460  void WaitAllRecv(){
461  // mpi::Waitall(arrMpiReqsSizeRecvLCD);
462  // mpi::Waitall(arrMpiReqsRecvLCD);
463  // mpi::Waitall(arrMpiReqsSizeRecvUCD);
464  // mpi::Waitall(arrMpiReqsRecvUCD);
465  }
466 
467 
468  };
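 // A minimal lifecycle sketch (hypothetical driver code; arrSuperNodes and
 // stepSuper are assumed to come from the scheduling loop):
 //
 //   CDBuffers buffers;
 //   this->SendRecvSizesCD(arrSuperNodes, stepSuper, buffers); // Isend L, sizes
 //   this->IRecvContentCD(arrSuperNodes, stepSuper, buffers);  // Irecv payloads
 //   this->WaitContentLCD(arrSuperNodes, stepSuper, buffers);  // assemble Lrow
 //   this->WaitContentUCD(arrSuperNodes, stepSuper, buffers);  // assemble Ucol
 //   buffers.WaitAllSend();                                    // drain sends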
469 
470 
471  template<typename T>
472  inline void PMatrixUnsym<T>::SendRecvSizesCD(
473  std::vector<Int > & arrSuperNodes,
474  Int stepSuper, CDBuffers & buffers)
475  {
476  TIMER_START(SendRecvSizesCD);
477  //compute the number of requests
478  Int sendCount = 0;
479  Int recvCount = 0;
480  buffers.sendOffset.resize(stepSuper);
481  buffers.recvOffset.resize(stepSuper);
482  Int recvIdxL=0;
483  Int recvIdxU=0;
484  for (Int supidx=0; supidx<stepSuper; supidx++){
485  Int snode_index = arrSuperNodes[supidx];
486  buffers.sendOffset[supidx]=sendCount;
487  buffers.recvOffset[supidx]=recvCount;
488  sendCount+= this->CountSendToCrossDiagonal(snode_index);
489  recvCount+= this->CountRecvFromCrossDiagonal(snode_index);
490  }
491 
492  buffers.resize(sendCount,recvCount);
493 
494 
495  //Do Isend for size and content of L and Irecv for sizes of U
496  for (Int supidx=0; supidx<stepSuper; supidx++){
497  Int snode_index = arrSuperNodes[supidx];
498 
499  TIMER_START(Send_L_Recv_Size_U_CrossDiag);
500 
501  if( MYCOL( this->grid_ ) == PCOL( snode_index, this->grid_ )
502  && this->isSendToCrossDiagonal_(this->grid_->numProcCol, snode_index ) ){
503 
504  Int sendIdxL = 0;
505  Int recvIdxU = 0;
506  for(Int dstCol = 0; dstCol<this->grid_->numProcCol; dstCol++){
507  if(this->isSendToCrossDiagonal_(dstCol,snode_index) ){
508  Int dest = PNUM(PROW(snode_index,this->grid_),dstCol,this->grid_);
509 
510  if( MYPROC( this->grid_ ) != dest ){
511  //Send size and content of L
512  MPI_Request & mpiReqSizeSendL =
513  buffers.arrMpiReqsSizeSendLCD[buffers.sendOffset[supidx]+sendIdxL];
514  MPI_Request & mpiReqSendL =
515  buffers.arrMpiReqsSendLCD[buffers.sendOffset[supidx]+sendIdxL];
516  std::vector<char> & sstrLcolSend =
517  buffers.arrSstrLcolSendCD[buffers.sendOffset[supidx]+sendIdxL];
518  Int & sstrSizeL =
519  buffers.arrSstrLcolSizeSendCD[buffers.sendOffset[supidx]+sendIdxL];
520 
521  // Pack L data
522  std::stringstream sstm;
523  std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
524  std::vector<LBlock<T> >& Lcol = this->L( LBj(snode_index, this->grid_) );
525  // All blocks except for the diagonal block are to be sent right
526  //TODO not true > this is a scatter operation ! Can we know the destination ?
527 
528  //Skip the diagonal block if necessary
529  Int startIdx = ( MYROW( this->grid_ ) == PROW( snode_index, this->grid_ ) ) ? 1:0;
530  Int count = 0;
531  for( Int ib = startIdx; ib < Lcol.size(); ib++ ){
532  if( Lcol[ib].blockIdx > snode_index &&
533  (Lcol[ib].blockIdx % this->grid_->numProcCol) == dstCol ){
534  count++;
535  }
536  }
537 
538  serialize( (Int)count, sstm, NO_MASK );
539 
540  for( Int ib = startIdx; ib < Lcol.size(); ib++ ){
541  if( Lcol[ib].blockIdx > snode_index &&
542  (Lcol[ib].blockIdx % this->grid_->numProcCol) == dstCol ){
543 #if ( _DEBUGlevel_ >= 1 )
544  statusOFS << "["<<snode_index<<"] SEND contains "<<Lcol[ib].blockIdx
545  << " which corresponds to "<<GBj(ib,this->grid_)
546  << " to P"<<dest
547  << std::endl;
548 #endif
549  serialize( Lcol[ib], sstm, mask );
550  }
551  }
552 
553  sstrLcolSend.resize( Size(sstm) );
554  sstm.read( &sstrLcolSend[0], sstrLcolSend.size() );
555  sstrSizeL = sstrLcolSend.size();
556 
557 
558 
559  MPI_Isend( &sstrSizeL, 1, MPI_INT, dest,
560  IDX_TO_TAG2(snode_index,supidx,SELINV_TAG_L_SIZE_CD),
561  this->grid_->comm, &mpiReqSizeSendL );
562  MPI_Isend( (void*)&sstrLcolSend[0], sstrSizeL, MPI_BYTE, dest,
563  IDX_TO_TAG2(snode_index,supidx,SELINV_TAG_L_CONTENT_CD),
564  this->grid_->comm, &mpiReqSendL );
565  sendIdxL++;
566 
567  //Recv for U size
568  Int & sstrSizeU =
569  buffers.arrSstrUcolSizeRecvCD[buffers.sendOffset[supidx]+recvIdxU];
570  MPI_Request & mpiReqSizeRecvU =
571  buffers.arrMpiReqsSizeRecvUCD[buffers.sendOffset[supidx]+recvIdxU];
572 
573  MPI_Irecv( &sstrSizeU, 1, MPI_INT, dest,
574  IDX_TO_TAG2(snode_index,supidx,SELINV_TAG_U_SIZE_CD),
575  this->grid_->comm, &mpiReqSizeRecvU );
576  recvIdxU++;
577  }
578  }
579  }
580  } // sender
581  TIMER_STOP(Send_L_Recv_Size_U_CrossDiag);
582  }
583 
584 
585  //Do Irecv for sizes of L and Isend for size and content of U
586  for (Int supidx=0; supidx<stepSuper; supidx++){
587  Int snode_index = arrSuperNodes[supidx];
588  TIMER_START(Send_U_Recv_Size_L_CrossDiag);
589  //If I'm a receiver
590  if( MYROW( this->grid_ ) == PROW( snode_index, this->grid_ )
591  && this->isRecvFromCrossDiagonal_(this->grid_->numProcRow, snode_index ) ){
592  Int recvIdxL=0;
593  Int sendIdxU=0;
594  for(Int srcRow = 0; srcRow<this->grid_->numProcRow; srcRow++){
595  if(this->isRecvFromCrossDiagonal_(srcRow,snode_index) ){
596  Int src = PNUM(srcRow,PCOL(snode_index,this->grid_),this->grid_);
597  if( MYPROC( this->grid_ ) != src ){
598  //Recv size of L
599  Int & sstrSizeL =
600  buffers.arrSstrLrowSizeRecvCD[buffers.recvOffset[supidx]+recvIdxL];
601  MPI_Request & mpiReqSizeRecvL =
602  buffers.arrMpiReqsSizeRecvLCD[buffers.recvOffset[supidx]+recvIdxL];
603 
604  MPI_Irecv( &sstrSizeL, 1, MPI_INT, src,
605  IDX_TO_TAG2(snode_index,supidx,SELINV_TAG_L_SIZE_CD),
606  this->grid_->comm, &mpiReqSizeRecvL );
607  recvIdxL++;
608 
609 
610  //Send size and content of U
611  MPI_Request & mpiReqSizeSendU =
612  buffers.arrMpiReqsSizeSendUCD[buffers.recvOffset[supidx]+sendIdxU];
613  MPI_Request & mpiReqSendU =
614  buffers.arrMpiReqsSendUCD[buffers.recvOffset[supidx]+sendIdxU];
615  std::vector<char> & sstrUrowSend =
616  buffers.arrSstrUrowSendCD[buffers.recvOffset[supidx]+sendIdxU];
617  Int & sstrSizeU =
618  buffers.arrSstrUrowSizeSendCD[buffers.recvOffset[supidx]+sendIdxU];
619 
620  // Pack U data
621  std::stringstream sstm;
622  std::vector<Int> mask( UBlockMask::TOTAL_NUMBER, 1 );
623  std::vector<UBlock<T> >& Urow = this->U( LBi(snode_index, this->grid_) );
624  // All blocks except for the diagonal block are to be sent right
625  //TODO not true > this is a scatter operation ! Can we know the destination ?
626 
627  Int count = 0;
628  for( Int jb = 0; jb < Urow.size(); jb++ ){
629  if( Urow[jb].blockIdx > snode_index
630  && (Urow[jb].blockIdx % this->grid_->numProcRow) == srcRow ){
631  count++;
632  }
633  }
634 
635  serialize( (Int)count, sstm, NO_MASK );
636 
637  for( Int jb = 0; jb < Urow.size(); jb++ ){
638  if( Urow[jb].blockIdx > snode_index
639  && (Urow[jb].blockIdx % this->grid_->numProcRow) == srcRow ){
640 #if ( _DEBUGlevel_ >= 1 )
641  statusOFS << "["<<snode_index<<"] SEND contains "<<Urow[jb].blockIdx
642  << " which corresponds to "<<GBi(jb,this->grid_)
643  << " to P"<<src
644  << std::endl;
645 #endif
646  serialize( Urow[jb], sstm, mask );
647  }
648  }
649 
650  sstrUrowSend.resize( Size(sstm) );
651  sstm.read( &sstrUrowSend[0], sstrUrowSend.size() );
652  sstrSizeU = sstrUrowSend.size();
653 
654 
655  MPI_Isend( &sstrSizeU, 1, MPI_INT, src,
656  IDX_TO_TAG2(snode_index,supidx,SELINV_TAG_U_SIZE_CD),
657  this->grid_->comm, &mpiReqSizeSendU );
658  MPI_Isend( (void*)&sstrUrowSend[0], sstrSizeU, MPI_BYTE, src,
659  IDX_TO_TAG2(snode_index,supidx,SELINV_TAG_U_CONTENT_CD),
660  this->grid_->comm, &mpiReqSendU );
661  sendIdxU++;
662  }
663  }
664  }
665  }//end if I'm a receiver
666  TIMER_STOP(Send_U_Recv_Size_L_CrossDiag);
667  }
668  TIMER_STOP(SendRecvSizesCD);
669  }
670 
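 // IRecvContentCD posts the non-blocking receives for the L and U payloads
 // once the corresponding size messages have completed.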
671  template<typename T>
672  inline void PMatrixUnsym<T>::IRecvContentCD(
673  std::vector<Int > & arrSuperNodes,
674  Int stepSuper, CDBuffers & buffers) {
675  TIMER_START(IrecvContentCD);
676 
677 
678  //waitall sizes of L
679  TIMER_START(Wait_Size_L_CrossDiag);
680  mpi::Waitall(buffers.arrMpiReqsSizeRecvLCD);
681  TIMER_STOP(Wait_Size_L_CrossDiag);
682 
683  //if(this->grid_->mpirank==0){gdb_lock();}
684 
685  //Allocate content and do Irecv for content of L
686  for (Int supidx=0; supidx<stepSuper; supidx++){
687  Int snode_index = arrSuperNodes[supidx];
688  //If I'm a receiver
689  TIMER_START(Recv_L_CrossDiag);
690  if( MYROW( this->grid_ ) == PROW( snode_index, this->grid_ )
691  && this->isRecvFromCrossDiagonal_(this->grid_->numProcRow, snode_index ) ){
692  Int recvIdxL=0;
693  for(Int srcRow = 0; srcRow<this->grid_->numProcRow; srcRow++){
694  if(this->isRecvFromCrossDiagonal_(srcRow,snode_index) ){
695  Int src = PNUM(srcRow,PCOL(snode_index,this->grid_),this->grid_);
696  if( MYPROC( this->grid_ ) != src ){
697  Int & sstrSizeL =
698  buffers.arrSstrLrowSizeRecvCD[buffers.recvOffset[supidx]+recvIdxL];
699  std::vector<char> & sstrLrowRecv =
700  buffers.arrSstrLrowRecvCD[buffers.recvOffset[supidx]+recvIdxL];
701  MPI_Request & mpiReqRecvL =
702  buffers.arrMpiReqsRecvLCD[buffers.recvOffset[supidx]+recvIdxL];
703  sstrLrowRecv.resize( sstrSizeL);
704 
705  MPI_Irecv( (void*)&sstrLrowRecv[0], sstrSizeL, MPI_BYTE, src,
706  IDX_TO_TAG2(snode_index,supidx,SELINV_TAG_L_CONTENT_CD),
707  this->grid_->comm, &mpiReqRecvL );
708  recvIdxL++;
709 
710  }
711  }
712  }
713  }//end if I'm a receiver
714  TIMER_STOP(Recv_L_CrossDiag);
715  }
716 
717  //waitall sizes of U
718  TIMER_START(Wait_Size_U_CrossDiag);
719  mpi::Waitall(buffers.arrMpiReqsSizeRecvUCD);
720  TIMER_STOP(Wait_Size_U_CrossDiag);
721 
722  //Allocate content and do Irecv for content of U
723  for (Int supidx=0; supidx<stepSuper; supidx++){
724  Int snode_index = arrSuperNodes[supidx];
725  TIMER_START(Recv_U_CrossDiag);
726  if( MYCOL( this->grid_ ) == PCOL( snode_index, this->grid_ )
727  && this->isSendToCrossDiagonal_(this->grid_->numProcCol, snode_index ) ){
728  Int recvIdxU = 0;
729  for(Int srcCol = 0; srcCol<this->grid_->numProcCol; srcCol++){
730  if(this->isSendToCrossDiagonal_(srcCol,snode_index) ){
731  Int src = PNUM(PROW(snode_index,this->grid_),srcCol,this->grid_);
732  if( MYPROC( this->grid_ ) != src ){
733  Int & sstrSizeU =
734  buffers.arrSstrUcolSizeRecvCD[buffers.sendOffset[supidx]+recvIdxU];
735  std::vector<char> & sstrUcolRecv =
736  buffers.arrSstrUcolRecvCD[buffers.sendOffset[supidx]+recvIdxU];
737  MPI_Request & mpiReqRecvU =
738  buffers.arrMpiReqsRecvUCD[buffers.sendOffset[supidx]+recvIdxU];
739  sstrUcolRecv.resize( sstrSizeU );
740 
741  MPI_Irecv( (void*)&sstrUcolRecv[0], sstrSizeU, MPI_BYTE, src,
742  IDX_TO_TAG2(snode_index,supidx,SELINV_TAG_U_CONTENT_CD),
743  this->grid_->comm, &mpiReqRecvU );
744  recvIdxU++;
745  }
746  }
747  }
748  }//end if I'm a receiver
749  TIMER_STOP(Recv_U_CrossDiag);
750  }
751 
752  TIMER_STOP(IrecvContentCD);
753  }
754 
755 
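 // WaitContentLCD waits for the L payloads and transposes each received
 // L block into the matching Lrow block on the cross-diagonal side.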
756  template<typename T>
757  inline void PMatrixUnsym<T>::WaitContentLCD(
758  std::vector<Int > & arrSuperNodes,
759  Int stepSuper, CDBuffers & buffers) {
760  TIMER_START(WaitContentLCD);
761  //waitall content of L
762  TIMER_START(Wait_L_CrossDiag);
763  mpi::Waitall(buffers.arrMpiReqsRecvLCD);
764  TIMER_STOP(Wait_L_CrossDiag);
765 
766 
767 
768  //Do the work for Lrow
769  NumMat<T> Ltmp;
770  for (Int supidx=0; supidx<stepSuper; supidx++){
771  Int snode_index = arrSuperNodes[supidx];
772  Int ksupProcRow = PROW( snode_index, this->grid_ );
773  Int ksupProcCol = PCOL( snode_index, this->grid_ );
774 
775  if( MYROW( this->grid_ ) == PROW( snode_index, this->grid_ ) &&
776  this->isRecvFromCrossDiagonal_(this->grid_->numProcRow, snode_index ) ){
777 
778 #if ( _DEBUGlevel_ >= 1 )
779  statusOFS << std::endl << " ["<<snode_index<<"] "
780  << "Update the upper triangular block"
781  << std::endl << std::endl;
782  statusOFS << std::endl << " ["<<snode_index<<"] "
783  // << "blockIdxLocal:" << snode.BlockIdxLocal
784  << std::endl << std::endl;
785  statusOFS << std::endl << " ["<<snode_index<<"] "
786  // << "rowLocalPtr:" << snode.RowLocalPtr
787  << std::endl << std::endl;
788 #endif
789 
790  std::vector<LBlock<T> >& Lcol = this->L( LBj(snode_index, this->grid_) );
791  std::vector<LBlock<T> >& Lrow = this->Lrow( LBi( snode_index, this->grid_ ) );
792 
793  // if(Lrow.size() == 0){
794  // Int& LrowSize = this->LrowSize_[ LBi( snode_index, this->grid_ ) ];
795  // Lrow.resize(LrowSize);
796  // }
797 
798  std::vector<Int> isBlockFound(Lrow.size(),false);
799  Int recvIdxL=0;
800 
801  for(Int srcRow = 0; srcRow<this->grid_->numProcRow; srcRow++){
802  if(this->isRecvFromCrossDiagonal_(srcRow,snode_index) ){
803  Int src = PNUM(srcRow,PCOL(snode_index,this->grid_),this->grid_);
804  TIMER_START(Recv_L_CrossDiag);
805 
806  //Declaring some pointers to avoid copy if data is local
807  std::vector<LBlock<T> > * pLcol;
808  std::vector<LBlock<T> > LcolRecv;
809  if(MYPROC(this->grid_)!= src){
810  Int & sstrSizeL =
811  buffers.arrSstrLrowSizeRecvCD[buffers.recvOffset[supidx]+recvIdxL];
812  std::vector<char> & sstrLrowRecv =
813  buffers.arrSstrLrowRecvCD[buffers.recvOffset[supidx]+recvIdxL];
814 
815  std::stringstream sstm;
816 
817 #if ( _DEBUGlevel_ >= 1 )
818  statusOFS << "["<<snode_index<<"] P"<<MYPROC(this->grid_)<<" ("<<MYROW(this->grid_)
819  << ","<<MYCOL(this->grid_)<<") <--- LBj("<<snode_index<<") <--- P"
820  << src <<" ("<<srcRow<<","<<ksupProcCol<<")"
821  << std::endl;
822 #endif
823  sstm.write( &sstrLrowRecv[0], sstrSizeL );
824 
825  // Unpack L data.
826  Int numLBlock;
827  std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
828  deserialize( numLBlock, sstm, NO_MASK );
829  LcolRecv.resize(numLBlock);
830  for( Int ib = 0; ib < numLBlock; ib++ ){
831  deserialize( LcolRecv[ib], sstm, mask );
832 #if ( _DEBUGlevel_ >= 1 )
833  statusOFS << "["<<snode_index<<"] RECV contains "
834  << LcolRecv[ib].blockIdx<< " which corresponds to "
835  << ib * this->grid_->numProcRow + srcRow;
836  statusOFS << " L is on row "<< srcRow
837  << " whereas Lrow is on col "
838  << LcolRecv[ib].blockIdx % this->grid_->numProcCol
839  << std::endl;
840 #endif
841  }
842  recvIdxL++;
843  pLcol = &LcolRecv;
844  } // sender is not the same as receiver
845  else{
846  // L is obtained locally, just make a copy. Do not include the diagonal block
847  pLcol = &Lcol;
848  } // sender is the same as receiver
849 
850 
851  TIMER_STOP(Recv_L_CrossDiag);
852 
853  //We can directly put LcolRecv in Lrow (still need to transpose)
854 
855  //Skip the diagonal block if necessary
856  Int startIdx = ( MYPROC( this->grid_ ) == src && MYPROC(this->grid_) == PNUM(ksupProcRow,ksupProcCol, this->grid_) ) ? 1:0;
857  for( Int ib = startIdx; ib < pLcol->size(); ib++ ){
858  LBlock<T> & LB = (*pLcol)[ib];
859  if( LB.blockIdx <= snode_index ){
860  ErrorHandling( "LcolRecv contains the wrong blocks." );
861  }
862 
863  //check that LB would be on this proc if it was a UB
864  if(PCOL(LB.blockIdx, this->grid_) != MYCOL(this->grid_)){
865  continue;
866  }
867 
868 
869 
870  //std::vector<LBlock<T> > & LrowCD = this->Lrow( LBi( ksup, super_ ) );
871  for( Int iib = 0; iib < Lrow.size(); iib++ ){
872  LBlock<T> & LrowB = Lrow[ iib ];
873 
874  //If not initialized, set it to be the one
875  if( LrowB.blockIdx == -1){
876  LrowB.blockIdx = LB.blockIdx;
877  }
878 
879  if( LB.blockIdx == LrowB.blockIdx ){
880  // Note that the order of the column indices of the U
881  // block may not follow the order of the row indices,
882  // overwrite the information in U.
883  // LrowB = LB;
884  LrowB.rows = LB.rows;
885  //Store in "row major" format, i.e. the transpose is in column-major
886  Transpose(LB.nzval, LrowB.nzval);
887  LrowB.numCol = LB.numRow;
888  LrowB.numRow = LB.numCol;
889 
890  // SetValue(LrowB.nzval,ZERO<T>());
891 
892 #if ( _DEBUGlevel_ >= 1 )
893  statusOFS<<"["<<snode_index<<"] USING LB "<<LB.blockIdx<< std::endl;
894 #endif
895  isBlockFound[iib] = 1;//true;
896  break;
897  } // if( LB.blockIdx == LrowB.blockIdx )
898  } // for (iib)
899  } // for (ib)
900  }
901  }
902 
903  for( Int ib = 0; ib < Lrow.size(); ib++ ){
904  LBlock<T> & LB = Lrow[ib];
905  if( !isBlockFound[ib] ){
906  ErrorHandling(
907  "LBlock cannot find its update. Something is seriously wrong."
908  );
909  }
910  }
911  } // Done copying L
912  }
913 
914  TIMER_STOP(WaitContentLCD);
915  }
916 
917 
918 
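 // WaitContentUCD mirrors WaitContentLCD for the U payloads: each received
 // U block is copied directly into the matching Ucol block (no transpose).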
919  template<typename T>
920  inline void PMatrixUnsym<T>::WaitContentUCD(
921  std::vector<Int > & arrSuperNodes,
922  Int stepSuper, CDBuffers & buffers) {
923 
924  TIMER_START(WaitContentUCD);
925  //waitall content of U
926  TIMER_START(Wait_U_CrossDiag);
927  mpi::Waitall(buffers.arrMpiReqsRecvUCD);
928  TIMER_STOP(Wait_U_CrossDiag);
929 
930 
931  //Do the work for U
932  for (Int supidx=0; supidx<stepSuper; supidx++){
933  Int snode_index = arrSuperNodes[supidx];
934  Int ksupProcRow = PROW( snode_index, this->grid_ );
935  Int ksupProcCol = PCOL( snode_index, this->grid_ );
936 
937 
938  if( MYCOL( this->grid_ ) == PCOL( snode_index, this->grid_ ) &&
939  this->isSendToCrossDiagonal_(this->grid_->numProcCol, snode_index ) ){
940 
941  std::vector<UBlock<T> >& Ucol = this->Ucol( LBj( snode_index, this->grid_ ) );
942 
943  // if(Ucol.size() == 0){
944  // Int& UcolSize = this->UcolSize_[ LBj( snode_index, this->grid_ ) ];
945  // Ucol.resize(UcolSize);
946  // }
947 
948  std::vector<Int> isBlockFound(Ucol.size(),false);
949  Int recvIdxU=0;
950 
951  for(Int srcCol = 0; srcCol<this->grid_->numProcCol; srcCol++){
952  if(this->isSendToCrossDiagonal_(srcCol,snode_index) ){
953  Int src = PNUM(PROW(snode_index,this->grid_),srcCol,this->grid_);
954  TIMER_START(Recv_U_CrossDiag);
955 
956  //Declaring some pointers to avoid copy if data is local
957  std::vector<UBlock<T> > * pUrow;
958  std::vector<UBlock<T> > UrowRecv;
959  if(MYPROC(this->grid_)!= src){
960  Int & sstrSizeU =
961  buffers.arrSstrUcolSizeRecvCD[buffers.sendOffset[supidx]+recvIdxU];
962  std::vector<char> & sstrUcolRecv =
963  buffers.arrSstrUcolRecvCD[buffers.sendOffset[supidx]+recvIdxU];
964 
965 
966  std::stringstream sstm;
967 
968 #if ( _DEBUGlevel_ >= 1 )
969  statusOFS << "["<<snode_index<<"] P"<<MYPROC(this->grid_)<<" ("<<MYROW(this->grid_)
970  << ","<<MYCOL(this->grid_)<<") <--- LBi("<<snode_index<<") <--- P"
971  << src<<" ("<<ksupProcCol<<","<<srcCol<<")"<<std::endl;
972 #endif
973  sstm.write( &sstrUcolRecv[0], sstrSizeU );
974 
975  // Unpack U data.
976  Int numUBlock;
977  std::vector<Int> mask( UBlockMask::TOTAL_NUMBER, 1 );
978  deserialize( numUBlock, sstm, NO_MASK );
979  UrowRecv.resize(numUBlock);
980  for( Int jb = 0; jb < numUBlock; jb++ ){
981  deserialize( UrowRecv[jb], sstm, mask );
982 #if ( _DEBUGlevel_ >= 1 )
983  statusOFS << "["<<snode_index<<"] RECV contains "
984  << UrowRecv[jb].blockIdx<< " which corresponds to "
985  << jb * this->grid_->numProcCol + srcCol;
986  statusOFS << " Urow is on col "<< srcCol
987  << " whereas U is on row "
988  << UrowRecv[jb].blockIdx % this->grid_->numProcRow
989  << std::endl;
990 #endif
991  }
992  pUrow = &UrowRecv;
993  recvIdxU++;
994  } // sender is not the same as receiver
995  else{
996  // U is obtained locally, just make a copy.
997  std::vector<UBlock<T> >& Urow = this->U( LBi( snode_index, this->grid_ ) );
998  pUrow = &Urow;
999  } // sender is the same as receiver
1000 
1001 
1002  TIMER_STOP(Recv_U_CrossDiag);
1003 
1004  //We can directly put UrowRecv in Ucol
1005  for( Int jb = 0; jb < pUrow->size(); jb++ ){
1006  UBlock<T> & UB = (*pUrow)[jb];
1007  if( UB.blockIdx <= snode_index ){
1008  ErrorHandling( "UrowRecv contains the wrong blocks." );
1009  }
1010 
1011  //check that UB would be on this proc if it was a LB
1012  if(PROW(UB.blockIdx, this->grid_) != MYROW(this->grid_)){
1013  continue;
1014  }
1015 
1016 
1017  //std::vector<UBlock<T> > & UcolCD = this->Ucol( LBi( ksup, super_ ) );
1018  for( Int jjb = 0; jjb < Ucol.size(); jjb++ ){
1019  UBlock<T> & UcolB = Ucol[jjb];
1020 
1021 
1022  //If not initialized, set it to be the one
1023  if( UcolB.blockIdx == -1){
1024  UcolB.blockIdx = UB.blockIdx;
1025  }
1026 
1027 
1028  if( UB.blockIdx == UcolB.blockIdx ){
1029  // Compare size
1030  // Note that the order of the column indices of the U
1031  // block may not follow the order of the row indices,
1032  // overwrite the information in U.
1033  UcolB = UB;
1034 
1035  //SetValue(UcolB.nzval,ZERO<T>());
1036 #if ( _DEBUGlevel_ >= 1 )
1037  statusOFS<<"["<<snode_index<<"] USING UB "<<UB.blockIdx<< std::endl;
1038 #endif
1039  isBlockFound[jjb] = 1;
1040  break;
1041  } // if( UB.blockIdx == UcolB.blockIdx )
1042  } // for (jjb)
1043  } // for (jb)
1044 
1045  }
1046  }
1047 
1048  for( Int jb = 0; jb < Ucol.size(); jb++ ){
1049  UBlock<T> & UB = Ucol[jb];
1050  if( !isBlockFound[jb] ){
1051 
1052  ErrorHandling(
1053  "UBlock cannot find its update. Something is seriously wrong."
1054  );
1055  }
1056  } // end for (jb)
1057  } // Done copying U
1058 
1059  }
1060 
1061  TIMER_STOP(WaitContentUCD);
1062  }
1063 
1064 
1065 
1066 
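 // SendRecvCD performs the same cross-diagonal exchange as the split-phase
 // routines above, but as a single monolithic method operating on
 // SuperNodeBufferTypeUnsym entries directly.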
1067  template<typename T>
1068  inline void PMatrixUnsym<T>::SendRecvCD(
1069  std::vector<SuperNodeBufferTypeUnsym > & arrSuperNodes,
1070  Int stepSuper)
1071  {
1072 
1073  TIMER_START(Send_CD);
1074  //compute the number of requests
1075  Int sendCount = 0;
1076  Int recvCount = 0;
1077  std::vector<Int> sendOffset(stepSuper);
1078  std::vector<Int> recvOffset(stepSuper);
1079  Int recvIdxL=0;
1080  Int recvIdxU=0;
1081  for (Int supidx=0; supidx<stepSuper; supidx++){
1082  SuperNodeBufferTypeUnsym & snode = arrSuperNodes[supidx];
1083  sendOffset[supidx]=sendCount;
1084  recvOffset[supidx]=recvCount;
1085  sendCount+= this->CountSendToCrossDiagonal(snode.Index);
1086  recvCount+= this->CountRecvFromCrossDiagonal(snode.Index);
1087  }
1088 
1089  //Buffers for L
1090  std::vector<MPI_Request > arrMpiReqsSendLCD(sendCount, MPI_REQUEST_NULL );
1091  std::vector<MPI_Request > arrMpiReqsSizeSendLCD(sendCount, MPI_REQUEST_NULL );
1092  std::vector<MPI_Request > arrMpiReqsRecvLCD(recvCount, MPI_REQUEST_NULL );
1093  std::vector<MPI_Request > arrMpiReqsSizeRecvLCD(recvCount, MPI_REQUEST_NULL );
1094  std::vector<std::vector<char> > arrSstrLcolSendCD(sendCount);
1095  std::vector<int > arrSstrLcolSizeSendCD(sendCount);
1096  std::vector<std::vector<char> > arrSstrLrowRecvCD(recvCount);
1097  std::vector<int > arrSstrLrowSizeRecvCD(recvCount);
1098 
1099  //Buffers for U
1100  std::vector<MPI_Request > arrMpiReqsSendUCD(recvCount, MPI_REQUEST_NULL );
1101  std::vector<MPI_Request > arrMpiReqsSizeSendUCD(recvCount, MPI_REQUEST_NULL );
1102  std::vector<MPI_Request > arrMpiReqsRecvUCD(sendCount, MPI_REQUEST_NULL );
1103  std::vector<MPI_Request > arrMpiReqsSizeRecvUCD(sendCount, MPI_REQUEST_NULL );
1104  std::vector<std::vector<char> > arrSstrUrowSendCD(recvCount);
1105  std::vector<int > arrSstrUrowSizeSendCD(recvCount);
1106  std::vector<std::vector<char> > arrSstrUcolRecvCD(sendCount);
1107  std::vector<int > arrSstrUcolSizeRecvCD(sendCount);
1108 
1109 
1110 
1111  //Do Isend for size and content of L and Irecv for sizes of U
1112  for (Int supidx=0; supidx<stepSuper; supidx++){
1113  SuperNodeBufferTypeUnsym & snode = arrSuperNodes[supidx];
1114 
1115  TIMER_START(Send_L_Recv_Size_U_CrossDiag);
1116 
1117  if( MYCOL( this->grid_ ) == PCOL( snode.Index, this->grid_ )
1118  && this->isSendToCrossDiagonal_(this->grid_->numProcCol, snode.Index ) ){
1119 
1120  Int sendIdxL = 0;
1121  Int recvIdxU = 0;
1122  for(Int dstCol = 0; dstCol<this->grid_->numProcCol; dstCol++){
1123  if(this->isSendToCrossDiagonal_(dstCol,snode.Index) ){
1124  Int dest = PNUM(PROW(snode.Index,this->grid_),dstCol,this->grid_);
1125 
1126  if( MYPROC( this->grid_ ) != dest ){
1127  //Send size and content of L
1128  MPI_Request & mpiReqSizeSendL =
1129  arrMpiReqsSizeSendLCD[sendOffset[supidx]+sendIdxL];
1130  MPI_Request & mpiReqSendL =
1131  arrMpiReqsSendLCD[sendOffset[supidx]+sendIdxL];
1132  std::vector<char> & sstrLcolSend =
1133  arrSstrLcolSendCD[sendOffset[supidx]+sendIdxL];
1134  Int & sstrSizeL =
1135  arrSstrLcolSizeSendCD[sendOffset[supidx]+sendIdxL];
1136 
1137  // Pack L data
1138  std::stringstream sstm;
1139  std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
1140  std::vector<LBlock<T> >& Lcol = this->L( LBj(snode.Index, this->grid_) );
1141  // All blocks except for the diagonal block are to be sent right
1142  //TODO not true > this is a scatter operation ! Can we know the destination ?
1143 
1144  //Skip the diagonal block if necessary
1145  Int startIdx = ( MYROW( this->grid_ ) == PROW( snode.Index, this->grid_ ) ) ? 1:0;
1146  Int count = 0;
1147  for( Int ib = startIdx; ib < Lcol.size(); ib++ ){
1148  if( Lcol[ib].blockIdx > snode.Index &&
1149  (Lcol[ib].blockIdx % this->grid_->numProcCol) == dstCol ){
1150  count++;
1151  }
1152  }
1153 
1154  serialize( (Int)count, sstm, NO_MASK );
1155 
1156  for( Int ib = startIdx; ib < Lcol.size(); ib++ ){
1157  if( Lcol[ib].blockIdx > snode.Index &&
1158  (Lcol[ib].blockIdx % this->grid_->numProcCol) == dstCol ){
1159 #if ( _DEBUGlevel_ >= 1 )
1160  statusOFS << "["<<snode.Index<<"] SEND contains "<<Lcol[ib].blockIdx
1161  << " which corresponds to "<<GBj(ib,this->grid_)
1162  << " to P"<<dest
1163  << std::endl;
1164 #endif
1165  serialize( Lcol[ib], sstm, mask );
1166  }
1167  }
1168 
1169  sstrLcolSend.resize( Size(sstm) );
1170  sstm.read( &sstrLcolSend[0], sstrLcolSend.size() );
1171  sstrSizeL = sstrLcolSend.size();
1172 
1173 
1174 
1175  MPI_Isend( &sstrSizeL, 1, MPI_INT, dest,
1176  IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_L_SIZE_CD),
1177  this->grid_->comm, &mpiReqSizeSendL );
1178  MPI_Isend( (void*)&sstrLcolSend[0], sstrSizeL, MPI_BYTE, dest,
1179  IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_L_CONTENT_CD),
1180  this->grid_->comm, &mpiReqSendL );
1181  sendIdxL++;
1182 
1183  //Recv for U size
1184  Int & sstrSizeU =
1185  arrSstrUcolSizeRecvCD[sendOffset[supidx]+recvIdxU];
1186  MPI_Request & mpiReqSizeRecvU =
1187  arrMpiReqsSizeRecvUCD[sendOffset[supidx]+recvIdxU];
1188 
1189  MPI_Irecv( &sstrSizeU, 1, MPI_INT, dest,
1190  IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_U_SIZE_CD),
1191  this->grid_->comm, &mpiReqSizeRecvU );
1192  recvIdxU++;
1193  }
1194  }
1195  }
1196  } // sender
1197  TIMER_STOP(Send_L_Recv_Size_U_CrossDiag);
1198  }
1199 
1200 
1201  //Do Irecv for sizes of L and Isend for size and content of U
1202  for (Int supidx=0; supidx<stepSuper; supidx++){
1203  SuperNodeBufferTypeUnsym & snode = arrSuperNodes[supidx];
1204  TIMER_START(Send_U_Recv_Size_L_CrossDiag);
1205  //If I'm a receiver
1206  if( MYROW( this->grid_ ) == PROW( snode.Index, this->grid_ )
1207  && this->isRecvFromCrossDiagonal_(this->grid_->numProcRow, snode.Index ) ){
1208  Int recvIdxL=0;
1209  Int sendIdxU=0;
1210  for(Int srcRow = 0; srcRow<this->grid_->numProcRow; srcRow++){
1211  if(this->isRecvFromCrossDiagonal_(srcRow,snode.Index) ){
1212  Int src = PNUM(srcRow,PCOL(snode.Index,this->grid_),this->grid_);
1213  if( MYPROC( this->grid_ ) != src ){
1214  //Recv size of L
1215  Int & sstrSizeL =
1216  arrSstrLrowSizeRecvCD[recvOffset[supidx]+recvIdxL];
1217  MPI_Request & mpiReqSizeRecvL =
1218  arrMpiReqsSizeRecvLCD[recvOffset[supidx]+recvIdxL];
1219 
1220  MPI_Irecv( &sstrSizeL, 1, MPI_INT, src,
1221  IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_L_SIZE_CD),
1222  this->grid_->comm, &mpiReqSizeRecvL );
1223  recvIdxL++;
1224 
1225 
1226  //Send size and content of U
1227  MPI_Request & mpiReqSizeSendU =
1228  arrMpiReqsSizeSendUCD[recvOffset[supidx]+sendIdxU];
1229  MPI_Request & mpiReqSendU =
1230  arrMpiReqsSendUCD[recvOffset[supidx]+sendIdxU];
1231  std::vector<char> & sstrUrowSend =
1232  arrSstrUrowSendCD[recvOffset[supidx]+sendIdxU];
1233  Int & sstrSizeU =
1234  arrSstrUrowSizeSendCD[recvOffset[supidx]+sendIdxU];
1235 
1236  // Pack U data
1237  std::stringstream sstm;
1238  std::vector<Int> mask( UBlockMask::TOTAL_NUMBER, 1 );
1239  std::vector<UBlock<T> >& Urow = this->U( LBi(snode.Index, this->grid_) );
1240  // All blocks except for the diagonal block are to be sent right
1241  //TODO not true > this is a scatter operation ! Can we know the destination ?
1242 
1243  Int count = 0;
1244  for( Int jb = 0; jb < Urow.size(); jb++ ){
1245  if( Urow[jb].blockIdx > snode.Index
1246  && (Urow[jb].blockIdx % this->grid_->numProcRow) == srcRow ){
1247  count++;
1248  }
1249  }
1250 
1251  serialize( (Int)count, sstm, NO_MASK );
1252 
1253  for( Int jb = 0; jb < Urow.size(); jb++ ){
1254  if( Urow[jb].blockIdx > snode.Index
1255  && (Urow[jb].blockIdx % this->grid_->numProcRow) == srcRow ){
1256 #if ( _DEBUGlevel_ >= 1 )
1257  statusOFS << "["<<snode.Index<<"] SEND contains "<<Urow[jb].blockIdx
1258  << " which corresponds to "<<GBi(jb,this->grid_)
1259  << " to P"<<src
1260  << std::endl;
1261 #endif
1262  serialize( Urow[jb], sstm, mask );
1263  }
1264  }
1265 
1266  sstrUrowSend.resize( Size(sstm) );
1267  sstm.read( &sstrUrowSend[0], sstrUrowSend.size() );
1268  sstrSizeU = sstrUrowSend.size();
1269 
1270 
1271  MPI_Isend( &sstrSizeU, 1, MPI_INT, src,
1272  IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_U_SIZE_CD),
1273  this->grid_->comm, &mpiReqSizeSendU );
1274  MPI_Isend( (void*)&sstrUrowSend[0], sstrSizeU, MPI_BYTE, src,
1275  IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_U_CONTENT_CD),
1276  this->grid_->comm, &mpiReqSendU );
1277  sendIdxU++;
1278  }
1279  }
1280  }
1281  }//end if I'm a receiver
1282  TIMER_STOP(Send_U_Recv_Size_L_CrossDiag);
1283  }
1284 
1285  //waitall sizes of L
1286  TIMER_START(Wait_Size_L_CrossDiag);
1287  mpi::Waitall(arrMpiReqsSizeRecvLCD);
1288  TIMER_STOP(Wait_Size_L_CrossDiag);
1289 
1290  //if(this->grid_->mpirank==0){gdb_lock();}
1291 
1292  //Allocate content and do Irecv for content of L
1293  for (Int supidx=0; supidx<stepSuper; supidx++){
1294  SuperNodeBufferTypeUnsym & snode = arrSuperNodes[supidx];
1295  //If I'm a receiver
1296  TIMER_START(Recv_L_CrossDiag);
1297  if( MYROW( this->grid_ ) == PROW( snode.Index, this->grid_ )
1298  && this->isRecvFromCrossDiagonal_(this->grid_->numProcRow, snode.Index ) ){
1299  Int recvIdxL=0;
1300  for(Int srcRow = 0; srcRow<this->grid_->numProcRow; srcRow++){
1301  if(this->isRecvFromCrossDiagonal_(srcRow,snode.Index) ){
1302  Int src = PNUM(srcRow,PCOL(snode.Index,this->grid_),this->grid_);
1303  if( MYPROC( this->grid_ ) != src ){
1304  Int & sstrSizeL =
1305  arrSstrLrowSizeRecvCD[recvOffset[supidx]+recvIdxL];
1306  std::vector<char> & sstrLrowRecv =
1307  arrSstrLrowRecvCD[recvOffset[supidx]+recvIdxL];
1308  MPI_Request & mpiReqRecvL =
1309  arrMpiReqsRecvLCD[recvOffset[supidx]+recvIdxL];
1310  sstrLrowRecv.resize( sstrSizeL);
1311 
1312  MPI_Irecv( (void*)&sstrLrowRecv[0], sstrSizeL, MPI_BYTE, src,
1313  IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_L_CONTENT_CD),
1314  this->grid_->comm, &mpiReqRecvL );
1315  recvIdxL++;
1316 
1317  }
1318  }
1319  }
1320  }//end if I'm a receiver
1321  TIMER_STOP(Recv_L_CrossDiag);
1322  }
1323 
1324  //waitall sizes of U
1325  TIMER_START(Wait_Size_U_CrossDiag);
1326  mpi::Waitall(arrMpiReqsSizeRecvUCD);
1327  TIMER_STOP(Wait_Size_U_CrossDiag);
1328 
1329  //Allocate content and do Irecv for content of U
1330  for (Int supidx=0; supidx<stepSuper; supidx++){
1331  SuperNodeBufferTypeUnsym & snode = arrSuperNodes[supidx];
1332  TIMER_START(Recv_U_CrossDiag);
1333  if( MYCOL( this->grid_ ) == PCOL( snode.Index, this->grid_ )
1334  && this->isSendToCrossDiagonal_(this->grid_->numProcCol, snode.Index ) ){
1335  Int recvIdxU = 0;
1336  for(Int srcCol = 0; srcCol<this->grid_->numProcCol; srcCol++){
1337  if(this->isSendToCrossDiagonal_(srcCol,snode.Index) ){
1338  Int src = PNUM(PROW(snode.Index,this->grid_),srcCol,this->grid_);
1339  if( MYPROC( this->grid_ ) != src ){
1340  Int & sstrSizeU =
1341  arrSstrUcolSizeRecvCD[sendOffset[supidx]+recvIdxU];
1342  std::vector<char> & sstrUcolRecv =
1343  arrSstrUcolRecvCD[sendOffset[supidx]+recvIdxU];
1344  MPI_Request & mpiReqRecvU =
1345  arrMpiReqsRecvUCD[sendOffset[supidx]+recvIdxU];
1346  sstrUcolRecv.resize( sstrSizeU );
1347 
1348  MPI_Irecv( (void*)&sstrUcolRecv[0], sstrSizeU, MPI_BYTE, src,
1349  IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_U_CONTENT_CD),
1350  this->grid_->comm, &mpiReqRecvU );
1351  recvIdxU++;
1352  }
1353  }
1354  }
1355  }//end if I'm a receiver
1356  TIMER_STOP(Recv_U_CrossDiag);
1357  }
1358 
1359  //waitall content of L
1360  TIMER_START(Wait_L_CrossDiag);
1361  mpi::Waitall(arrMpiReqsRecvLCD);
1362  TIMER_STOP(Wait_L_CrossDiag);
1363 
1364 
1365 
1366  //Do the work for Lrow
1367  NumMat<T> Ltmp;
1368  for (Int supidx=0; supidx<stepSuper; supidx++){
1369  SuperNodeBufferTypeUnsym & snode = arrSuperNodes[supidx];
1370  Int ksupProcRow = PROW( snode.Index, this->grid_ );
1371  Int ksupProcCol = PCOL( snode.Index, this->grid_ );
1372 
1373  if( MYROW( this->grid_ ) == PROW( snode.Index, this->grid_ ) &&
1374  this->isRecvFromCrossDiagonal_(this->grid_->numProcRow, snode.Index ) ){
1375 
1376 #if ( _DEBUGlevel_ >= 1 )
1377  statusOFS << std::endl << " ["<<snode.Index<<"] "
1378  << "Update the upper triangular block"
1379  << std::endl << std::endl;
1380  statusOFS << std::endl << " ["<<snode.Index<<"] "
1381  << "blockIdxLocal:" << snode.BlockIdxLocal
1382  << std::endl << std::endl;
1383  statusOFS << std::endl << " ["<<snode.Index<<"] "
1384  << "rowLocalPtr:" << snode.RowLocalPtr
1385  << std::endl << std::endl;
1386 #endif
1387 
1388  std::vector<LBlock<T> >& Lcol = this->L( LBj(snode.Index, this->grid_) );
1389  std::vector<LBlock<T> >& Lrow = this->Lrow( LBi( snode.Index, this->grid_ ) );
1390 
1391  // if(Lrow.size() == 0){
1392  // Int& LrowSize = this->LrowSize_[ LBi( snode.Index, this->grid_ ) ];
1393  // Lrow.resize(LrowSize);
1394  // }
1395 
1396  std::vector<Int> isBlockFound(Lrow.size(),false);
1397  Int recvIdxL=0;
1398 
1399  for(Int srcRow = 0; srcRow<this->grid_->numProcRow; srcRow++){
1400  if(this->isRecvFromCrossDiagonal_(srcRow,snode.Index) ){
1401  Int src = PNUM(srcRow,PCOL(snode.Index,this->grid_),this->grid_);
1402  TIMER_START(Recv_L_CrossDiag);
1403 
1404  //Declaring some pointers to avoid copy if data is local
1405  std::vector<LBlock<T> > * pLcol;
1406  std::vector<LBlock<T> > LcolRecv;
1407  if(MYPROC(this->grid_)!= src){
1408  Int & sstrSizeL =
1409  arrSstrLrowSizeRecvCD[recvOffset[supidx]+recvIdxL];
1410  std::vector<char> & sstrLrowRecv =
1411  arrSstrLrowRecvCD[recvOffset[supidx]+recvIdxL];
1412 
1413  std::stringstream sstm;
1414 
1415 #if ( _DEBUGlevel_ >= 1 )
1416  statusOFS << "["<<snode.Index<<"] P"<<MYPROC(this->grid_)<<" ("<<MYROW(this->grid_)
1417  << ","<<MYCOL(this->grid_)<<") <--- LBj("<<snode.Index<<") <--- P"
1418  << src <<" ("<<srcRow<<","<<ksupProcCol<<")"
1419  << std::endl;
1420 #endif
1421  sstm.write( &sstrLrowRecv[0], sstrSizeL );
1422 
1423  // Unpack L data.
1424  Int numLBlock;
1425  std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
1426  deserialize( numLBlock, sstm, NO_MASK );
1427  LcolRecv.resize(numLBlock);
1428  for( Int ib = 0; ib < numLBlock; ib++ ){
1429  deserialize( LcolRecv[ib], sstm, mask );
1430 #if ( _DEBUGlevel_ >= 1 )
1431  statusOFS << "["<<snode.Index<<"] RECV contains "
1432  << LcolRecv[ib].blockIdx<< " which corresponds to "
1433  << ib * this->grid_->numProcRow + srcRow;
1434  statusOFS << " L is on row "<< srcRow
1435  << " whereas Lrow is on col "
1436  << LcolRecv[ib].blockIdx % this->grid_->numProcCol
1437  << std::endl;
1438 #endif
1439  }
1440  recvIdxL++;
1441  pLcol = &LcolRecv;
1442  } // sender is not the same as receiver
1443  else{
1444  // L is obtained locally, just make a copy. Do not include the diagonal block
1445  pLcol = &Lcol;
1446  } // sender is the same as receiver
1447 
1448 
1449  TIMER_STOP(Recv_L_CrossDiag);
1450 
1451  //We can directly put LcolRecv in Lrow (still need to transpose)
1452 
1453  //Skip the diagonal block if necessary
1454  Int startIdx = ( MYPROC( this->grid_ ) == src && MYPROC(this->grid_) == PNUM(ksupProcRow,ksupProcCol, this->grid_) ) ? 1:0;
1455  for( Int ib = startIdx; ib < pLcol->size(); ib++ ){
1456  LBlock<T> & LB = (*pLcol)[ib];
1457  if( LB.blockIdx <= snode.Index ){
1458  ErrorHandling( "LcolRecv contains the wrong blocks." );
1459  }
1460 
1461  //check that LB would be on this proc if it was a UB
1462  if(PCOL(LB.blockIdx, this->grid_) != MYCOL(this->grid_)){
1463  continue;
1464  }
1465 
1466 
1467 
1468  //std::vector<LBlock<T> > & LrowCD = this->Lrow( LBi( ksup, super_ ) );
1469  for( Int iib = 0; iib < Lrow.size(); iib++ ){
1470  LBlock<T> & LrowB = Lrow[ iib ];
1471 
1472  //If not initialized, set it to be the one
1473  if( LrowB.blockIdx == -1){
1474  LrowB.blockIdx = LB.blockIdx;
1475  }
1476 
1477  if( LB.blockIdx == LrowB.blockIdx ){
1478  // Note that the order of the column indices of the U
1479  // block may not follow the order of the row indices,
1480  // overwrite the information in U.
1481  // LrowB = LB;
1482  LrowB.rows = LB.rows;
1483  //Store in "row major" format, i.e. the transpose is in column-major
1484  Transpose(LB.nzval, LrowB.nzval);
1485  LrowB.numCol = LB.numRow;
1486  LrowB.numRow = LB.numCol;
1487 
1488  // SetValue(LrowB.nzval,ZERO<T>());
1489 
1490 #if ( _DEBUGlevel_ >= 1 )
1491  statusOFS<<"["<<snode.Index<<"] USING LB "<<LB.blockIdx<< std::endl;
1492 #endif
1493  isBlockFound[iib] = 1;//true;
1494  break;
1495  } // if( LB.blockIdx == LrowB.blockIdx )
1496  } // for (iib)
1497  } // for (ib)
1498  }
1499  }
1500 
1501  for( Int ib = 0; ib < Lrow.size(); ib++ ){
1502  LBlock<T> & LB = Lrow[ib];
1503  if( !isBlockFound[ib] ){
1504  ErrorHandling(
1505  "LBlock cannot find its update. Something is seriously wrong."
1506  );
1507  }
1508  }
1509  } // Done copying L
1510  }
1511 
1512  //waitall content of U
1513  TIMER_START(Wait_U_CrossDiag);
1514  mpi::Waitall(arrMpiReqsRecvUCD);
1515  TIMER_STOP(Wait_U_CrossDiag);
1516 
1517 
1518  //Do the work for U
1519  for (Int supidx=0; supidx<stepSuper; supidx++){
1520  SuperNodeBufferTypeUnsym & snode = arrSuperNodes[supidx];
1521  Int ksupProcRow = PROW( snode.Index, this->grid_ );
1522  Int ksupProcCol = PCOL( snode.Index, this->grid_ );
1523 
1524 
1525  if( MYCOL( this->grid_ ) == PCOL( snode.Index, this->grid_ ) &&
1526  this->isSendToCrossDiagonal_(this->grid_->numProcCol, snode.Index ) ){
1527 
1528  std::vector<UBlock<T> >& Ucol = this->Ucol( LBj( snode.Index, this->grid_ ) );
1529 
1530  // if(Ucol.size() == 0){
1531  // Int& UcolSize = this->UcolSize_[ LBj( snode.Index, this->grid_ ) ];
1532  // Ucol.resize(UcolSize);
1533  // }
1534 
1535  std::vector<Int> isBlockFound(Ucol.size(),false);
1536  Int recvIdxU=0;
1537 
1538  for(Int srcCol = 0; srcCol<this->grid_->numProcCol; srcCol++){
1539  if(this->isSendToCrossDiagonal_(srcCol,snode.Index) ){
1540  Int src = PNUM(PROW(snode.Index,this->grid_),srcCol,this->grid_);
1541  TIMER_START(Recv_U_CrossDiag);
1542 
1543  //Declaring some pointers to avoid copy if data is local
1544  std::vector<UBlock<T> > * pUrow;
1545  std::vector<UBlock<T> > UrowRecv;
1546  if(MYPROC(this->grid_)!= src){
1547  Int & sstrSizeU =
1548  arrSstrUcolSizeRecvCD[sendOffset[supidx]+recvIdxU];
1549  std::vector<char> & sstrUcolRecv =
1550  arrSstrUcolRecvCD[sendOffset[supidx]+recvIdxU];
1551 
1552 
1553  std::stringstream sstm;
1554 
1555 #if ( _DEBUGlevel_ >= 1 )
1556  statusOFS << "["<<snode.Index<<"] P"<<MYPROC(this->grid_)<<" ("<<MYROW(this->grid_)
1557  << ","<<MYCOL(this->grid_)<<") <--- LBi("<<snode.Index<<") <--- P"
1558  << src<<" ("<<ksupProcCol<<","<<srcCol<<")"<<std::endl;
1559 #endif
1560  sstm.write( &sstrUcolRecv[0], sstrSizeU );
1561 
1562  // Unpack U data.
1563  Int numUBlock;
1564  std::vector<Int> mask( UBlockMask::TOTAL_NUMBER, 1 );
1565  deserialize( numUBlock, sstm, NO_MASK );
1566  UrowRecv.resize(numUBlock);
1567  for( Int jb = 0; jb < numUBlock; jb++ ){
1568  deserialize( UrowRecv[jb], sstm, mask );
1569 #if ( _DEBUGlevel_ >= 1 )
1570  statusOFS << "["<<snode.Index<<"] RECV contains "
1571  << UrowRecv[jb].blockIdx<< " which corresponds to "
1572  << jb * this->grid_->numProcCol + srcCol;
1573  statusOFS << " Urow is on col "<< srcCol
1574  << " whereas U is on row "
1575  << UrowRecv[jb].blockIdx % this->grid_->numProcRow
1576  << std::endl;
1577 #endif
1578  }
1579  pUrow = &UrowRecv;
1580  recvIdxU++;
1581  } // sender is not the same as receiver
1582  else{
1583  // U is obtained locally, just make a copy.
1584  std::vector<UBlock<T> >& Urow = this->U( LBi( snode.Index, this->grid_ ) );
1585  pUrow = &Urow;
1586  } // sender is the same as receiver
1587 
1588 
1589  TIMER_STOP(Recv_U_CrossDiag);
1590 
1591  //We can directly put UrowRecv in Ucol
1592  for( Int jb = 0; jb < pUrow->size(); jb++ ){
1593  UBlock<T> & UB = (*pUrow)[jb];
1594  if( UB.blockIdx <= snode.Index ){
1595  ErrorHandling( "UrowRecv contains the wrong blocks." );
1596  }
1597 
1598  //check that UB would be on this proc if it was a LB
1599  if(PROW(UB.blockIdx, this->grid_) != MYROW(this->grid_)){
1600  continue;
1601  }
1602 
1603 
1604  //std::vector<UBlock<T> > & UcolCD = this->Ucol( LBi( ksup, super_ ) );
1605  for( Int jjb = 0; jjb < Ucol.size(); jjb++ ){
1606  UBlock<T> & UcolB = Ucol[jjb];
1607 
1608 
1609  //If not initialized, set it to be the one
1610  if( UcolB.blockIdx == -1){
1611  UcolB.blockIdx = UB.blockIdx;
1612  }
1613 
1614 
1615  if( UB.blockIdx == UcolB.blockIdx ){
1616  // Overwrite the local block with the received one.
1617  // Note that the order of the column indices of the U
1618  // block may not follow the order of the row indices,
1619  // so the whole block description in U is overwritten.
1620  UcolB = UB;
1621 
1622  //SetValue(UcolB.nzval,ZERO<T>());
1623 #if ( _DEBUGlevel_ >= 1 )
1624  statusOFS<<"["<<snode.Index<<"] USING UB "<<UB.blockIdx<< std::endl;
1625 #endif
1626  isBlockFound[jjb] = 1;
1627  break;
1628  } // if( UB.blockIdx == UcolB.blockIdx )
1629  } // for (jjb)
1630  } // for (jb)
1631 
1632  }
1633  }
1634 
1635  for( Int jb = 0; jb < Ucol.size(); jb++ ){
1636  UBlock<T> & UB = Ucol[jb];
1637  if( !isBlockFound[jb] ){
1638  ErrorHandling(
1639  "UBlock cannot find its update. Something is seriously wrong."
1640  );
1641  }
1642  } // end for (jb)
1643  } // Done copying U
1644 
1645  }
1646 
1647  TIMER_STOP(Send_CD);
1648 
1649  mpi::Waitall(arrMpiReqsSizeSendLCD);
1650  mpi::Waitall(arrMpiReqsSendLCD);
1651 
1652  mpi::Waitall(arrMpiReqsSizeSendUCD);
1653  mpi::Waitall(arrMpiReqsSendUCD);
1654 
1655 
1656  } // End of method PMatrixUnsym::SendRecvCD
1657 
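 // UnpackData deserializes the received panels of one supernode into
 // LcolRecv/LrowRecv/UcolRecv/UrowRecv: Lrow and Urow come from the
 // process row owning snode.Index, Lcol and Ucol from the owning
 // process column. When this process is itself the owner, the local
 // blocks are copied directly (skipping the diagonal block for Lcol).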
1658  template<typename T>
1659  inline void PMatrixUnsym<T>::UnpackData(
1660  SuperNodeBufferTypeUnsym & snode,
1661  std::vector<LBlock<T> > & LcolRecv,
1662  std::vector<LBlock<T> > & LrowRecv,
1663  std::vector<UBlock<T> > & UcolRecv,
1664  std::vector<UBlock<T> > & UrowRecv
1665  )
1666  {
1667 
1668  TIMER_START(Unpack_data);
1669 
1670 
1671 
1672 
1673 #if ( _DEBUGlevel_ >= 1 )
1674  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Unpack the received data for the processors participating in the Gemm. " << std::endl << std::endl;
1675 #endif
1676  // Lrow part
1677  if( MYROW( this->grid_ ) != PROW( snode.Index, this->grid_ ) ){
1678  std::stringstream sstm;
1679  sstm.write( &snode.SstrLrowRecv[0], snode.SstrLrowRecv.size() );
1680  std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
1681  Int numLBlock;
1682  deserialize( numLBlock, sstm, NO_MASK );
1683  LrowRecv.resize( numLBlock );
1684  for( Int ib = 0; ib < numLBlock; ib++ ){
1685  deserialize( LrowRecv[ib], sstm, mask );
1686  }
1687 
1688 #if ( _DEBUGlevel_ >= 1 )
1689  statusOFS<< "["<<snode.Index<<"] "<<"Lrow RECV "<<std::endl;
1690  for(Int ib=0;ib<LrowRecv.size();++ib){statusOFS<<LrowRecv[ib].blockIdx<<" ";}
1691  statusOFS<<std::endl;
1692 #endif
1693  } // sender is not the same as receiver
1694  else{
1695  // Lrow is obtained locally, just make a copy. Include everything
1696  // (there is no diagonal block).
1697  // Is it a copy ? LL: YES. Maybe we should replace the copy by
1698  // something more efficient, especially for mpisize == 1
1699  LrowRecv.resize(this->Lrow( LBi( snode.Index, this->grid_ ) ).size());
1700  std::copy(this->Lrow( LBi( snode.Index, this->grid_ ) ).begin(),this->Lrow( LBi( snode.Index, this->grid_ )).end(),LrowRecv.begin());
1701  } // sender is the same as receiver
1702 
1703 
1704  //L part
1705  if( MYCOL( this->grid_ ) != PCOL( snode.Index, this->grid_ ) ){
1706  std::stringstream sstm;
1707  sstm.write( &snode.SstrLcolRecv[0], snode.SstrLcolRecv.size() );
1708  std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
1709  mask[LBlockMask::NZVAL] = 0; // nzval is excluded
1710  Int numLBlock;
1711  deserialize( numLBlock, sstm, NO_MASK );
1712  LcolRecv.resize( numLBlock );
1713  for( Int ib = 0; ib < numLBlock; ib++ ){
1714  deserialize( LcolRecv[ib], sstm, mask );
1715  }
1716 
1717 
1718 #if ( _DEBUGlevel_ >= 1 )
1719  statusOFS<< "["<<snode.Index<<"] "<<"Lcol RECV "<<std::endl;
1720  for(Int ib=0;ib<LcolRecv.size();++ib){statusOFS<<LcolRecv[ib].blockIdx<<" ";}
1721  statusOFS<<std::endl;
1722 #endif
1723 
1724  } // sender is not the same as receiver
1725  else{
1726  // L is obtained locally, just make a copy.
1727  // Do not include the diagonal block
1728  std::vector<LBlock<T> >& Lcol = this->L( LBj( snode.Index, this->grid_ ) );
1729  Int startIdx = ( MYROW( this->grid_ ) == PROW( snode.Index, this->grid_ ) )?1:0;
1730  LcolRecv.resize( Lcol.size() - startIdx );
1731  std::copy(Lcol.begin()+startIdx,Lcol.end(),LcolRecv.begin());
1732  } // sender is the same as receiver
1733 
1734  // U part
1735  if( MYROW( this->grid_ ) != PROW( snode.Index, this->grid_ ) ){
1736  std::stringstream sstm;
1737  sstm.write( &snode.SstrUrowRecv[0], snode.SstrUrowRecv.size() );
1738  std::vector<Int> mask( UBlockMask::TOTAL_NUMBER, 1 );
1739  mask[UBlockMask::NZVAL] = 0; // nzval is excluded
1740  Int numUBlock;
1741  deserialize( numUBlock, sstm, NO_MASK );
1742  UrowRecv.resize( numUBlock );
1743  for( Int jb = 0; jb < numUBlock; jb++ ){
1744  deserialize( UrowRecv[jb], sstm, mask );
1745  }
1746 
1747 #if ( _DEBUGlevel_ >= 1 )
1748  statusOFS<< "["<<snode.Index<<"] "<<"Urow RECV "<<std::endl;
1749  for(Int ib=0;ib<UrowRecv.size();++ib){statusOFS<<UrowRecv[ib].blockIdx<<" ";}
1750  statusOFS<<std::endl;
1751 #endif
1752  } // sender is not the same as receiver
1753  else{
1754  // U is obtained locally, just make a copy. Include everything
1755  // (there is no diagonal block)
1756  // Is it a copy ? LL: YES. Maybe we should replace the copy by
1757  // something more efficient, especially for mpisize == 1
1758  UrowRecv.resize(this->U( LBi( snode.Index, this->grid_ ) ).size());
1759  std::copy(this->U( LBi( snode.Index, this->grid_ ) ).begin(),this->U( LBi( snode.Index, this->grid_ )).end(),UrowRecv.begin());
1760  } // sender is the same as receiver
1761 
1762 
1763  // Ucol part
1764  if( MYCOL( this->grid_ ) != PCOL( snode.Index, this->grid_ ) ){
1765  std::stringstream sstm;
1766  sstm.write( &snode.SstrUcolRecv[0], snode.SstrUcolRecv.size() );
1767  std::vector<Int> mask( UBlockMask::TOTAL_NUMBER, 1 );
1768  Int numUBlock;
1769  deserialize( numUBlock, sstm, NO_MASK );
1770  UcolRecv.resize( numUBlock );
1771  for( Int jb = 0; jb < numUBlock; jb++ ){
1772  deserialize( UcolRecv[jb], sstm, mask );
1773  }
1774 
1775 #if ( _DEBUGlevel_ >= 1 )
1776  statusOFS<< "["<<snode.Index<<"] "<<"Ucol RECV "<<std::endl;
1777  for(Int ib=0;ib<UcolRecv.size();++ib){statusOFS<<UcolRecv[ib].blockIdx<<" ";}
1778  statusOFS<<std::endl;
1779 #endif
1780  } // sender is not the same as receiver
1781  else{
1782  // Ucol is obtained locally, just make a copy.
1783  // Do not include the diagonal block
1784  std::vector<UBlock<T> >& Ucol = this->Ucol( LBj( snode.Index, this->grid_ ) );
1785  UcolRecv.resize( Ucol.size() );
1786  std::copy(Ucol.begin(),Ucol.end(),UcolRecv.begin());
1787  } // sender is the same as receiver
1788 
1789  TIMER_STOP(Unpack_data);
1790 
1791  } // End of method PMatrixUnsym<T>::UnpackData
1792 
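 // ComputeDiagUpdate accumulates, over all off-diagonal blocks ib of the
 // supernode column,
 //   DiagBuf += - U(k,ib) * (S^{-1} L)(ib,k),
 // where the (S^{-1} L)(ib,k) rows live in snode.LUpdateBuf at the
 // offsets recorded in snode.RowLocalPtr.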
1793  template<typename T>
1794  inline void PMatrixUnsym<T>::ComputeDiagUpdate(SuperNodeBufferTypeUnsym & snode)
1795  {
1796 
1797  TIMER_START(ComputeDiagUpdate);
1798  //--------- Computing the diagonal block; all processors in the column
1799  //--------- participate for all pipelined supernodes
1800  if( MYCOL( this->grid_ ) == PCOL( snode.Index, this->grid_ ) ){
1801 #if ( _DEBUGlevel_ >= 1 )
1802  statusOFS << std::endl << "["<<snode.Index<<"] "
1803  << "Updating the diagonal block" << std::endl << std::endl;
1804 #endif
1805  std::vector<LBlock<T> >& Lcol = this->L( LBj( snode.Index, this->grid_ ) );
1806  std::vector<UBlock<T> >& Ucol = this->Ucol( LBj( snode.Index, this->grid_ ) );
1807 
1808  //Allocate DiagBuf even if Lcol.size() == 0
1809  snode.DiagBuf.Resize(SuperSize( snode.Index, this->super_ ),
1810  SuperSize( snode.Index, this->super_ ));
1811  SetValue(snode.DiagBuf, ZERO<T>());
1812 
1813  // Do I own the diagonal block ?
1814  Int startIb = (MYROW( this->grid_ ) == PROW( snode.Index, this->grid_ ))?1:0;
1815  for( Int ib = startIb; ib < Lcol.size(); ib++ ){
1816  LBlock<T> & LcolB = Lcol[ib];
1817  //find corresponding U block
1818  Int jb = 0;
1819  for(jb = 0; jb < Ucol.size(); ++jb){
1820  if(Ucol[jb].blockIdx == LcolB.blockIdx){
1821  break;
1822  }
1823  }
1824 
1825 
1826  assert(jb < Ucol.size());
1827 
1828  UBlock<T> & UcolB = Ucol[jb];
1829 
1830  if(1 || (LcolB.numRow>0 && LcolB.numCol>0 && UcolB.numRow>0 && UcolB.numCol>0)){
1831  //Compute U S-1 L
1832  blas::Gemm( 'N', 'N', snode.DiagBuf.m(), snode.DiagBuf.n(),
1833  LcolB.numRow, MINUS_ONE<T>(),
1834  UcolB.nzval.Data(), UcolB.nzval.m(),
1835  &snode.LUpdateBuf( snode.RowLocalPtr[ib-startIb], 0 ),
1836  snode.LUpdateBuf.m(),
1837  ONE<T>(), snode.DiagBuf.Data(), snode.DiagBuf.m() );
1838  }
1839  }
1840 
1841 #if ( _DEBUGlevel_ >= 1 )
1842  statusOFS << std::endl << "["<<snode.Index<<"] "
1843  << "Updated the diagonal block" << std::endl << std::endl;
1844 #endif
1845  }
1846  TIMER_STOP(ComputeDiagUpdate);
1847  } // End of method PMatrixUnsym<T>::ComputeDiagUpdate
1848 
1849 
1850 
1851 
1852 
1853 
1854 
1855 
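 // SelInvIntra_P2p processes one level lidx of the working set using
 // point-to-point messages. Outline of the phases below:
 //  1. serialize and Isend the Lrow/Lcol/Urow/Ucol panels of every
 //     supernode of the step;
 //  2. Irecv the panel sizes, then the panel contents;
 //  3. as supernodes become ready, compute
 //       LUpdateBuf = - AinvBuf * LBuf^T   and
 //       UUpdateBuf = - UBuf * AinvBuf;
 //  4. reduce LUpdateBuf to the owner column, the diagonal update to the
 //     diagonal owner, and UUpdateBuf to the owner row;
 //  5. copy the reduced buffers back into L and U, then release the
 //     temporary Lrow/Ucol structures.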
1856  template<typename T>
1857  inline void PMatrixUnsym<T>::SelInvIntra_P2p(Int lidx)
1858  {
1859 
1860 #if defined (PROFILE) || defined(PMPI) || defined(USE_TAU)
1861  Real begin_SendULWaitContentFirst, end_SendULWaitContentFirst, time_SendULWaitContentFirst = 0;
1862 #endif
1863  Int numSuper = this->NumSuper();
1864  std::vector<std::vector<Int> > & superList = this->WorkingSet();
1865  Int numSteps = superList.size();
1866  Int stepSuper = superList[lidx].size();
1867 
1868  TIMER_START(AllocateBuffer);
1869 
1870  //This is required to send the size and content of U/L
1871  std::vector<std::vector<MPI_Request> > arrMpireqsSendLToBelow;
1872  arrMpireqsSendLToBelow.resize( stepSuper, std::vector<MPI_Request>( 2 * this->grid_->numProcRow, MPI_REQUEST_NULL ));
1873  std::vector<std::vector<MPI_Request> > arrMpireqsSendLToRight;
1874  arrMpireqsSendLToRight.resize(stepSuper, std::vector<MPI_Request>( 2 * this->grid_->numProcCol, MPI_REQUEST_NULL ));
1875 
1876  std::vector<std::vector<MPI_Request> > arrMpireqsSendUToBelow;
1877  arrMpireqsSendUToBelow.resize( stepSuper, std::vector<MPI_Request>( 2 * this->grid_->numProcRow, MPI_REQUEST_NULL ));
1878  std::vector<std::vector<MPI_Request> > arrMpireqsSendUToRight;
1879  arrMpireqsSendUToRight.resize(stepSuper, std::vector<MPI_Request>( 2 * this->grid_->numProcCol, MPI_REQUEST_NULL ));
1880 
1881  //This is required to reduce L
1882  std::vector<MPI_Request> arrMpireqsSendLToLeft;
1883  arrMpireqsSendLToLeft.resize(stepSuper, MPI_REQUEST_NULL );
1884 
1885  //This is required to reduce U
1886  std::vector<MPI_Request> arrMpireqsSendUToAbove;
1887  arrMpireqsSendUToAbove.resize(stepSuper, MPI_REQUEST_NULL );
1888 
1889  //This is required to reduce D
1890  std::vector<MPI_Request> arrMpireqsSendToAbove;
1891  arrMpireqsSendToAbove.resize(stepSuper, MPI_REQUEST_NULL );
1892 
1893  //------------------------------------------------------------------------
1894 
1895  //This is required to receive the size and content of U/L
1896  std::vector<MPI_Request> arrMpireqsRecvLSizeFromAny;
1897  arrMpireqsRecvLSizeFromAny.resize(stepSuper*2 , MPI_REQUEST_NULL);
1898  std::vector<MPI_Request> arrMpireqsRecvLContentFromAny;
1899  arrMpireqsRecvLContentFromAny.resize(stepSuper*2 , MPI_REQUEST_NULL);
1900 
1901  std::vector<MPI_Request> arrMpireqsRecvUSizeFromAny;
1902  arrMpireqsRecvUSizeFromAny.resize(stepSuper*2 , MPI_REQUEST_NULL);
1903  std::vector<MPI_Request> arrMpireqsRecvUContentFromAny;
1904  arrMpireqsRecvUContentFromAny.resize(stepSuper*2 , MPI_REQUEST_NULL);
1905 
1906 
1907 
1908 
1909  //allocate the buffers for this supernode
1910  std::vector<SuperNodeBufferTypeUnsym> arrSuperNodes(stepSuper);
1911  for (Int supidx=0; supidx<stepSuper; supidx++){
1912  arrSuperNodes[supidx].Index = superList[lidx][supidx];
1913  }
1914 
1915  NumMat<T> AinvBuf, UBuf, LBuf;
1916 
1917  TIMER_STOP(AllocateBuffer);
1918 
1919 
1920 
1921  Int next_lidx = lidx+1;
1922  CDBuffers nextCDBuffers;
1923 
1924 
1925  // Ideally this would be done for the next step's supernodes in a non-blocking way
1926  if(lidx==0){
1927  CDBuffers buffers;
1928  Int next_lidx = lidx; // shadows the outer next_lidx: exchange for the current level itself
1929 
1930  //Resize L and U first
1931  bool doSend = false;
1932  for (Int supidx=0; supidx<superList[next_lidx].size(); supidx++){
1933  Int snode_index = superList[next_lidx][supidx];
1934  if( MYROW( this->grid_ ) == PROW( snode_index, this->grid_ ) ){
1935  std::vector<LBlock<T> >& Lrow = this->Lrow( LBi(snode_index, this->grid_) );
1936  Int& LrowSize = this->LrowSize_[ LBi(snode_index, this->grid_) ];
1937  if(Lrow.size()!=LrowSize){
1938  doSend= true;
1939  Lrow.resize(LrowSize,LBlock<T>());
1940  }
1941  }
1942 
1943  if( MYCOL( this->grid_ ) == PCOL( snode_index, this->grid_ ) ){
1944  std::vector<UBlock<T> >& Ucol = this->Ucol( LBj(snode_index, this->grid_) );
1945  Int& UcolSize = this->UcolSize_[ LBj(snode_index, this->grid_) ];
1946  if(Ucol.size()!=UcolSize){
1947  doSend= true;
1948  Ucol.resize(UcolSize,UBlock<T>());
1949  }
1950  }
1951  }
1952 
1953 
1954  if(doSend){
1955 
1956  SendRecvSizesCD(superList[next_lidx], superList[next_lidx].size(),buffers);
1957  IRecvContentCD(superList[next_lidx], superList[next_lidx].size(),buffers);
1958  WaitContentLCD(superList[next_lidx], superList[next_lidx].size(),buffers);
1959  WaitContentUCD(superList[next_lidx], superList[next_lidx].size(),buffers);
1960  buffers.WaitAllSend();
1961 
1962  }
1963  }
1964 
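 // Prefetch for the next level: Lrow/Ucol are pre-sized from the cached
 // LrowSize_/UcolSize_ counters and the cross-diagonal size exchange is
 // started now, so that its content can be received and waited on while
 // the current level is being processed.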
1965  if(next_lidx < superList.size()){
1966  //Resize L and U first
1967  for (Int supidx=0; supidx<superList[next_lidx].size(); supidx++){
1968  Int snode_index = superList[next_lidx][supidx];
1969  if( MYROW( this->grid_ ) == PROW( snode_index, this->grid_ ) ){
1970  std::vector<LBlock<T> >& Lrow = this->Lrow( LBi(snode_index, this->grid_) );
1971  Int& LrowSize = this->LrowSize_[ LBi(snode_index, this->grid_) ];
1972  Lrow.resize(LrowSize,LBlock<T>());
1973  }
1974 
1975  if( MYCOL( this->grid_ ) == PCOL( snode_index, this->grid_ ) ){
1976  std::vector<UBlock<T> >& Ucol = this->Ucol( LBj(snode_index, this->grid_) );
1977  Int& UcolSize = this->UcolSize_[ LBj(snode_index, this->grid_) ];
1978  Ucol.resize(UcolSize,UBlock<T>());
1979  }
1980  }
1981 
1982  SendRecvSizesCD(superList[next_lidx],superList[next_lidx].size(),nextCDBuffers);
1983  }
1984 
1985 
1986 #if ( _DEBUGlevel_ >= 1 )
1987  statusOFS << std::endl << "Communication to the Schur complement." << std::endl << std::endl;
1988 #endif
1989 
1990  // Senders
1991  for (Int supidx=0; supidx<stepSuper; supidx++){
1992  SuperNodeBufferTypeUnsym & snode = arrSuperNodes[supidx];
1993  std::vector<MPI_Request> & mpireqsSendLToBelow = arrMpireqsSendLToBelow[supidx];
1994  std::vector<MPI_Request> & mpireqsSendLToRight = arrMpireqsSendLToRight[supidx];
1995  std::vector<MPI_Request> & mpireqsSendUToBelow = arrMpireqsSendUToBelow[supidx];
1996  std::vector<MPI_Request> & mpireqsSendUToRight = arrMpireqsSendUToRight[supidx];
1997 
1998 #if ( _DEBUGlevel_ >= 1 )
1999  statusOFS << std::endl << "["<<snode.Index<<"] "
2000  << "Communication for the Lrow part." << std::endl << std::endl;
2001 #endif
2002  // Communication for the Lrow part.
2003  if( MYROW( this->grid_ ) == PROW( snode.Index, this->grid_ ) ){
2004  // Pack the data in Lrow
2005  TIMER_START(Serialize_LrowL);
2006  std::stringstream sstm;
2007  std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
2008  std::vector<LBlock<T> >& Lrow = this->Lrow( LBi(snode.Index, this->grid_) );
2009  std::vector<UBlock<T> >& Urow = this->U( LBi(snode.Index, this->grid_) );
2010 
2011 
2012  // All blocks are to be sent down.
2013  serialize( (Int)Lrow.size(), sstm, NO_MASK );
2014  for( Int ib = 0; ib < Lrow.size(); ib++ ){
2015  assert( Lrow[ib].blockIdx > snode.Index );
2016  serialize( Lrow[ib], sstm, mask );
2017  }
2018  snode.SstrLrowSend.resize( Size( sstm ) );
2019  sstm.read( &snode.SstrLrowSend[0], snode.SstrLrowSend.size() );
2020  snode.SizeSstrLrowSend = snode.SstrLrowSend.size();
2021  TIMER_STOP(Serialize_LrowL);
2022 
2023  for( Int iProcRow = 0; iProcRow < this->grid_->numProcRow; iProcRow++ ){
2024  if( MYROW( this->grid_ ) != iProcRow &&
2025  this->isSendToBelow_( iProcRow,snode.Index ) == true ){
2026 
2027  // Use Isend to send to multiple targets
2028  MPI_Isend( &snode.SizeSstrLrowSend, 1, MPI_INT,
2029  iProcRow, IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_LROW_SIZE), this->grid_->colComm, &mpireqsSendLToBelow[2*iProcRow] );
2030  MPI_Isend( (void*)&snode.SstrLrowSend[0], snode.SizeSstrLrowSend, MPI_BYTE,
2031  iProcRow, IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_LROW_CONTENT),
2032  this->grid_->colComm, &mpireqsSendLToBelow[2*iProcRow+1] );
2033 #if ( _DEBUGlevel_ >= 1 )
2034  statusOFS<< "["<<snode.Index<<"] "<<"Lrow SENT "<<std::endl;
2035  for(Int ib=0;ib<Lrow.size();++ib){statusOFS<<Lrow[ib].blockIdx<<" ";}
2036  statusOFS<<std::endl;
2037  statusOFS << "["<<snode.Index<<"] "<< "Sending Lrow "
2038  << snode.SizeSstrLrowSend << " BYTES to P" << PNUM(iProcRow,MYCOL(this->grid_),this->grid_)
2039  << std::endl << std::endl;
2040 #endif
2041  } // Send
2042  } // for (iProcRow)
2043  } // if I am the sender
2044 
2045 #if ( _DEBUGlevel_ >= 1 )
2046  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Communication for the L part." << std::endl << std::endl;
2047 #endif
2048  // Communication for the L (Lcol) part.
2049  if( MYCOL( this->grid_ ) == PCOL( snode.Index, this->grid_ ) ){
2050 
2051  TIMER_START(Serialize_LcolL);
2052  // Pack the data in L
2053  std::stringstream sstm;
2054  std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
2055  mask[LBlockMask::NZVAL] = 0; // nzval is excluded
2056 
2057  std::vector<LBlock<T> >& Lcol = this->L( LBj(snode.Index, this->grid_) );
2058  // All blocks except for the diagonal block are to be sent right
2059 
2060 
2061  Int startIdx = ( MYROW( this->grid_ ) == PROW( snode.Index, this->grid_ ) )?1:0;
2062  serialize( (Int)Lcol.size() - startIdx, sstm, NO_MASK );
2063  for( Int ib = startIdx; ib < Lcol.size(); ib++ ){
2064  assert( Lcol[ib].blockIdx > snode.Index );
2065 
2066 #if ( _DEBUGlevel_ >= 2 )
2067  // statusOFS << std::endl << "["<<snode.Index<<"] "<< "Serializing Block index " << Lcol[ib].blockIdx << std::endl;
2068 #endif
2069  serialize( Lcol[ib], sstm, mask );
2070  }
2071  snode.SstrLcolSend.resize( Size( sstm ) );
2072  sstm.read( &snode.SstrLcolSend[0], snode.SstrLcolSend.size() );
2073  snode.SizeSstrLcolSend = snode.SstrLcolSend.size();
2074  TIMER_STOP(Serialize_LcolL);
2075 
2076  for( Int iProcCol = 0; iProcCol < this->grid_->numProcCol ; iProcCol++ ){
2077  if( MYCOL( this->grid_ ) != iProcCol &&
2078  this->isSendToRight_( iProcCol, snode.Index ) == true ){
2079  // Use Isend to send to multiple targets
2080  MPI_Isend( &snode.SizeSstrLcolSend, 1, MPI_INT,
2081  iProcCol, IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_L_SIZE),
2082  this->grid_->rowComm, &mpireqsSendLToRight[2*iProcCol] );
2083  MPI_Isend( (void*)&snode.SstrLcolSend[0], snode.SizeSstrLcolSend, MPI_BYTE,
2084  iProcCol, IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_L_CONTENT),
2085  this->grid_->rowComm, &mpireqsSendLToRight[2*iProcCol+1] );
2086 #if ( _DEBUGlevel_ >= 1 )
2087  statusOFS<< "["<<snode.Index<<"] "<<"L SENT "<<std::endl;
2088  for(Int ib=startIdx;ib<Lcol.size();++ib){statusOFS<<Lcol[ib].blockIdx<<" ";}
2089  statusOFS<<std::endl;
2090  statusOFS << "["<<snode.Index<<"] "<< "Sending L "
2091  << snode.SizeSstrLcolSend << " BYTES to P" << PNUM(MYROW(this->grid_),iProcCol,this->grid_)
2092  << std::endl << std::endl;
2093 #endif
2094  } // Send
2095  } // for (iProcCol)
2096  } // if I am the sender
2097 
2098 #if ( _DEBUGlevel_ >= 1 )
2099  statusOFS << std::endl << "["<<snode.Index<<"] "
2100  << "Communication for the U part." << std::endl << std::endl;
2101 #endif
2102  // Communication for the U (Urow) part.
2103  if( MYROW( this->grid_ ) == PROW( snode.Index, this->grid_ ) ){
2104 
2105  TIMER_START(Serialize_UrowU);
2106  // Pack the data in U
2107  std::stringstream sstm;
2108  std::vector<Int> mask( UBlockMask::TOTAL_NUMBER, 1 );
2109  mask[UBlockMask::NZVAL] = 0; // nzval is excluded
2110 
2111  std::vector<UBlock<T> >& Urow = this->U( LBi(snode.Index, this->grid_) );
2112  // All blocks are to be sent down (U stores no diagonal block)
2113 
2114  serialize( (Int)Urow.size(), sstm, NO_MASK );
2115  for( Int jb = 0; jb < Urow.size(); jb++ ){
2116  assert( Urow[jb].blockIdx > snode.Index );
2117 #if ( _DEBUGlevel_ >= 2 )
2118  // statusOFS << std::endl << "["<<snode.Index<<"] "<< "Serializing Block index " << Urow[jb].blockIdx << std::endl;
2119 #endif
2120  serialize( Urow[jb], sstm, mask );
2121  }
2122  snode.SstrUrowSend.resize( Size( sstm ) );
2123  sstm.read( &snode.SstrUrowSend[0], snode.SstrUrowSend.size() );
2124  snode.SizeSstrUrowSend = snode.SstrUrowSend.size();
2125  TIMER_STOP(Serialize_UrowU);
2126 
2127  for( Int iProcRow = 0; iProcRow < this->grid_->numProcRow; iProcRow++ ){
2128  if( MYROW( this->grid_ ) != iProcRow &&
2129  this->isSendToBelow_( iProcRow,snode.Index ) == true ){
2130  // Use Isend to send to multiple targets
2131  MPI_Isend( &snode.SizeSstrUrowSend, 1, MPI_INT,
2132  iProcRow, IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_U_SIZE), this->grid_->colComm, &mpireqsSendUToBelow[2*iProcRow] );
2133  MPI_Isend( (void*)&snode.SstrUrowSend[0], snode.SizeSstrUrowSend, MPI_BYTE,
2134  iProcRow, IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_U_CONTENT),
2135  this->grid_->colComm, &mpireqsSendUToBelow[2*iProcRow+1] );
2136 #if ( _DEBUGlevel_ >= 1 )
2138  statusOFS<< "["<<snode.Index<<"] "<<"U SENT "<<std::endl;
2139  for(Int ib=0;ib<Urow.size();++ib){statusOFS<<Urow[ib].blockIdx<<" ";}
2140  statusOFS<<std::endl;
2141  statusOFS << "["<<snode.Index<<"] "<< "Sending U "
2142  << snode.SizeSstrUrowSend << " BYTES to P" << PNUM(iProcRow,MYCOL(this->grid_),this->grid_)
2143  << std::endl << std::endl;
2144 #endif
2145  } // Send
2146  } // for (iProcRow)
2147  } // if I am the sender
2148 
2149 #if ( _DEBUGlevel_ >= 1 )
2150  statusOFS << "["<<snode.Index<<"] "
2151  << "Communication for the Ucol part." << std::endl
2152  << std::endl;
2153 #endif
2154  // Communication for the Ucol part.
2155  if( MYCOL( this->grid_ ) == PCOL( snode.Index, this->grid_ ) ){
2156  // Pack the data in Ucol
2157  TIMER_START(Serialize_UcolU);
2158  std::stringstream sstm;
2159  std::vector<Int> mask( UBlockMask::TOTAL_NUMBER, 1 );
2160  std::vector<UBlock<T> >& Ucol =
2161  this->Ucol( LBj(snode.Index, this->grid_) );
2162  // All blocks are to be sent down.
2163  serialize( (Int)Ucol.size(), sstm, NO_MASK );
2164  for( Int jb = 0; jb < Ucol.size(); jb++ ){
2165  UBlock<T> & UB = Ucol[jb];
2166  assert( UB.blockIdx > snode.Index );
2167  serialize( UB, sstm, mask );
2168  }
2169  snode.SstrUcolSend.resize( Size( sstm ) );
2170  sstm.read( &snode.SstrUcolSend[0], snode.SstrUcolSend.size() );
2171  snode.SizeSstrUcolSend = snode.SstrUcolSend.size();
2172  TIMER_STOP(Serialize_UcolU);
2173 
2174  for( Int iProcCol = 0;
2175  iProcCol < this->grid_->numProcCol ; iProcCol++ ){
2176  if( MYCOL( this->grid_ ) != iProcCol &&
2177  this->isSendToRight_( iProcCol, snode.Index ) == true ){
2178  // Use Isend to send to multiple targets
2179  MPI_Isend( &snode.SizeSstrUcolSend, 1, MPI_INT,
2180  iProcCol, IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_UCOL_SIZE),
2181  this->grid_->rowComm, &mpireqsSendUToRight[2*iProcCol] );
2182  MPI_Isend( &snode.SstrUcolSend[0],snode.SizeSstrUcolSend,
2183  MPI_BYTE, iProcCol,
2184  IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_UCOL_CONTENT),
2185  this->grid_->rowComm,
2186  &mpireqsSendUToRight[2*iProcCol+1] );
2187 #if ( _DEBUGlevel_ >= 2 )
2188  statusOFS<< "["<<snode.Index<<"] "<<"Ucol SENT "<<std::endl;
2189  for(Int ib=0;ib<Ucol.size();++ib){
2190  statusOFS<<Ucol[ib].blockIdx<<" ";
2191  }
2192  statusOFS<<std::endl;
2193 #endif
2194 #if ( _DEBUGlevel_ >= 1 )
2195  statusOFS << "["<<snode.Index<<"] "<< "Sending Ucol "
2196  << snode.SizeSstrUcolSend << " BYTES to P"
2197  << PNUM(MYROW(this->grid_),iProcCol,this->grid_)
2198  << std::endl << std::endl;
2199 #endif
2200  } // Send
2201  } // for (iProcCol)
2202  } // if I am the sender
2203 
2204 
2205 
2206  } //Senders
2207 
2208  //TODO Ideally, we should not receive data in sequence
2209  // but in any order with ksup packed with the data
2210 
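 // Two-phase receive: a one-integer size message (SELINV_TAG_*_SIZE)
 // arrives first so the buffer can be sized, followed by the serialized
 // content (SELINV_TAG_*_CONTENT). IDX_TO_TAG2(snode.Index, supidx, tag)
 // keeps messages of different supernodes within a step distinct.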
2211  TIMER_START(WaitContentLU);
2212  // Receivers (Size)
2213  for (Int supidx=0; supidx<stepSuper ; supidx++){
2214  SuperNodeBufferTypeUnsym & snode = arrSuperNodes[supidx];
2215  MPI_Request * mpireqsRecvLFromAbove =
2216  &arrMpireqsRecvLSizeFromAny[supidx*2];
2217  MPI_Request * mpireqsRecvLFromLeft =
2218  &arrMpireqsRecvLSizeFromAny[supidx*2+1];
2219  MPI_Request * mpireqsRecvUFromAbove =
2220  &arrMpireqsRecvUSizeFromAny[supidx*2];
2221  MPI_Request * mpireqsRecvUFromLeft =
2222  &arrMpireqsRecvUSizeFromAny[supidx*2+1];
2223 
2224  // Receive the size first
2225  if( this->isRecvFromAbove_( snode.Index ) &&
2226  MYROW( this->grid_ ) != PROW( snode.Index, this->grid_ ) ){
2227  Int sender = PROW( snode.Index, this->grid_ );
2228  MPI_Irecv( &snode.SizeSstrLrowRecv, 1, MPI_INT, sender,
2229  IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_LROW_SIZE),
2230  this->grid_->colComm, mpireqsRecvLFromAbove );
2231 #if ( _DEBUGlevel_ >= 1 )
2232  statusOFS << "["<<snode.Index<<"] "<< "Receiving Lrow"
2233  <<" size on tag "<<IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_LROW_SIZE)
2234  <<" from P" << PNUM(sender,MYCOL(this->grid_),this->grid_)
2235  << std::endl << std::endl;
2236 #endif
2237  MPI_Irecv( &snode.SizeSstrUrowRecv, 1, MPI_INT, sender,
2238  IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_U_SIZE),
2239  this->grid_->colComm, mpireqsRecvUFromAbove );
2240 #if ( _DEBUGlevel_ >= 1 )
2241  statusOFS << "["<<snode.Index<<"] "<< "Receiving U"
2242  <<" size on tag "<<IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_U_SIZE)
2243  <<" from P" << PNUM(sender,MYCOL(this->grid_),this->grid_)
2244  << std::endl << std::endl;
2245 #endif
2246  } // if I need to receive from up
2247 
2248 
2249  if( this->isRecvFromLeft_( snode.Index ) &&
2250  MYCOL( this->grid_ ) != PCOL( snode.Index, this->grid_ ) ){
2251  Int sender = PCOL( snode.Index, this->grid_ );
2252  MPI_Irecv( &snode.SizeSstrLcolRecv, 1, MPI_INT, sender,
2253  IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_L_SIZE),
2254  this->grid_->rowComm, mpireqsRecvLFromLeft );
2255 #if ( _DEBUGlevel_ >= 1 )
2256  statusOFS << "["<<snode.Index<<"] "<< "Receiving L"
2257  <<" size on tag "<<IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_L_SIZE)
2258  <<" from P" << PNUM(MYROW(this->grid_),sender,this->grid_)
2259  << std::endl << std::endl;
2260 #endif
2261  MPI_Irecv( &snode.SizeSstrUcolRecv, 1, MPI_INT, sender,
2262  IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_UCOL_SIZE),
2263  this->grid_->rowComm, mpireqsRecvUFromLeft );
2264 #if ( _DEBUGlevel_ >= 1 )
2265  statusOFS << "["<<snode.Index<<"] "<< "Receiving Ucol"
2266  <<" size on tag "<<IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_UCOL_SIZE)
2267  <<" from P" << PNUM(MYROW(this->grid_),sender,this->grid_)
2268  << std::endl << std::endl;
2269 #endif
2270  } // if I need to receive from left
2271  }
2272  TIMER_STOP(WaitContentLU);
2273 
2274  //Wait to receive all the sizes for L
2275  TIMER_START(WaitContentLU);
2276  TIMER_START(WaitSize_LrowL);
2277  mpi::Waitall(arrMpireqsRecvLSizeFromAny);
2278  TIMER_STOP(WaitSize_LrowL);
2279 
2280  // Receivers (Content)
2281  for (Int supidx=0; supidx<stepSuper ; supidx++){
2282  SuperNodeBufferTypeUnsym & snode = arrSuperNodes[supidx];
2283 
2284  MPI_Request * mpireqsRecvFromAbove =
2285  &arrMpireqsRecvLContentFromAny[supidx*2];
2286  MPI_Request * mpireqsRecvFromLeft =
2287  &arrMpireqsRecvLContentFromAny[supidx*2+1];
2288 
2289  TIMER_START(Alloc_Buffer_Recv_LrowL);
2290  if( this->isRecvFromAbove_( snode.Index ) &&
2291  MYROW( this->grid_ ) != PROW( snode.Index, this->grid_ ) ){
2292  snode.SstrLrowRecv.resize( snode.SizeSstrLrowRecv );
2293  Int sender = PROW( snode.Index, this->grid_ );
2294  MPI_Irecv( &snode.SstrLrowRecv[0], snode.SizeSstrLrowRecv, MPI_BYTE,
2295  sender, IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_LROW_CONTENT),
2296  this->grid_->colComm, mpireqsRecvFromAbove );
2297 #if ( _DEBUGlevel_ >= 1 )
2298  statusOFS << "["<<snode.Index<<"] "<< "Receiving Lrow "
2299  << snode.SizeSstrLrowRecv << " BYTES from P"
2300  << PNUM(sender,MYCOL(this->grid_),this->grid_)
2301  << std::endl << std::endl;
2302 #endif
2303  } // if I need to receive from up
2304  TIMER_STOP(Alloc_Buffer_Recv_LrowL);
2305 
2306  TIMER_START(Alloc_Buffer_Recv_LcolL);
2307  if( this->isRecvFromLeft_( snode.Index ) &&
2308  MYCOL( this->grid_ ) != PCOL( snode.Index, this->grid_ ) ){
2309  snode.SstrLcolRecv.resize( snode.SizeSstrLcolRecv );
2310  Int sender = PCOL( snode.Index, this->grid_ );
2311  MPI_Irecv( &snode.SstrLcolRecv[0], snode.SizeSstrLcolRecv, MPI_BYTE,
2312  sender, IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_L_CONTENT),
2313  this->grid_->rowComm,
2314  mpireqsRecvFromLeft );
2315 #if ( _DEBUGlevel_ >= 1 )
2316  statusOFS << "["<<snode.Index<<"] "<< "Receiving L "
2317  << snode.SizeSstrLcolRecv << " BYTES from P"
2318  << PNUM(MYROW(this->grid_),sender,this->grid_)
2319  << std::endl << std::endl;
2320 #endif
2321  } // if I need to receive from left
2322  TIMER_STOP(Alloc_Buffer_Recv_LcolL);
2323  }
2324  TIMER_STOP(WaitContentLU);
2325 
2326 
2327  //Wait to receive all the sizes for U
2328  TIMER_START(WaitContentLU);
2329  TIMER_START(WaitSize_UcolU);
2330  mpi::Waitall(arrMpireqsRecvUSizeFromAny);
2331  TIMER_STOP(WaitSize_UcolU);
2332 
2333  // Receivers (Content)
2334  for (Int supidx=0; supidx<stepSuper ; supidx++){
2335  SuperNodeBufferTypeUnsym & snode = arrSuperNodes[supidx];
2336 
2337  MPI_Request * mpireqsRecvFromAbove =
2338  &arrMpireqsRecvUContentFromAny[supidx*2];
2339  MPI_Request * mpireqsRecvFromLeft =
2340  &arrMpireqsRecvUContentFromAny[supidx*2+1];
2341 
2342  TIMER_START(Alloc_Buffer_Recv_UrowL);
2343  if( this->isRecvFromAbove_( snode.Index ) &&
2344  MYROW( this->grid_ ) != PROW( snode.Index, this->grid_ ) ){
2345  snode.SstrUrowRecv.resize( snode.SizeSstrUrowRecv );
2346  Int sender = PROW( snode.Index, this->grid_ );
2347  MPI_Irecv( &snode.SstrUrowRecv[0], snode.SizeSstrUrowRecv, MPI_BYTE,
2348  sender, IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_U_CONTENT),
2349  this->grid_->colComm, mpireqsRecvFromAbove );
2350 #if ( _DEBUGlevel_ >= 1 )
2351  statusOFS << "["<<snode.Index<<"] "<< "Receiving U "
2352  << snode.SizeSstrUrowRecv << " BYTES from P"
2353  << PNUM(sender,MYCOL(this->grid_),this->grid_)
2354  << std::endl << std::endl;
2355 #endif
2356  } // if I need to receive from up
2357  TIMER_STOP(Alloc_Buffer_Recv_UrowL);
2358 
2359  TIMER_START(Alloc_Buffer_Recv_UcolL);
2360  if( this->isRecvFromLeft_( snode.Index ) &&
2361  MYCOL( this->grid_ ) != PCOL( snode.Index, this->grid_ ) ){
2362  snode.SstrUcolRecv.resize( snode.SizeSstrUcolRecv );
2363  Int sender = PCOL( snode.Index, this->grid_ );
2364  MPI_Irecv( &snode.SstrUcolRecv[0], snode.SizeSstrUcolRecv, MPI_BYTE,
2365  sender, IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_UCOL_CONTENT),
2366  this->grid_->rowComm,
2367  mpireqsRecvFromLeft );
2368 #if ( _DEBUGlevel_ >= 1 )
2369  statusOFS << "["<<snode.Index<<"] "<< "Receiving Ucol "
2370  << snode.SizeSstrUcolRecv << " BYTES from P"
2371  << PNUM(MYROW(this->grid_),sender,this->grid_)
2372  << std::endl << std::endl;
2373 #endif
2374  } // if I need to receive from left
2375  TIMER_STOP(Alloc_Buffer_Recv_UcolL);
2376  }
2377 
2378  TIMER_STOP(WaitContentLU);
2379 
2380 
2381  TIMER_START(Compute_Sinv_LU);
2382  {
2383  Int gemmProcessed = 0;
2384  Int gemmToDo = 0;
2385  // Int toRecvGemm = 0;
2386  // List of supernodes that are ready to be processed
2387  std::vector<Int> readySupidx;
2388  //find local things to do
2389  for(Int supidx = 0;supidx<stepSuper;supidx++){
2390  SuperNodeBufferTypeUnsym & snode = arrSuperNodes[supidx];
2391  if( this->isRecvFromAbove_( snode.Index )
2392  && this->isRecvFromLeft_( snode.Index )){
2393  gemmToDo+=2;
2394  if( MYCOL( this->grid_ ) == PCOL( snode.Index, this->grid_ ) ){
2395  snode.isReady+=2;
2396  }
2397 
2398  if( MYROW( this->grid_ ) == PROW( snode.Index, this->grid_ ) ){
2399  snode.isReady+=2;
2400  }
2401 
2402  if(snode.isReady==4){
2403  readySupidx.push_back(supidx);
2404 #if ( _DEBUGlevel_ >= 1 )
2405  statusOFS<<"Locally processing ["<<snode.Index<<"]"<<std::endl;
2406 #endif
2407  }
2408  }
2409  else{
2410  TIMER_START(Reduce_Sinv_L_Send);
2411  if( this->isRecvFromLeft_( snode.Index )
2412  && MYCOL( this->grid_ ) != PCOL( snode.Index, this->grid_ ) ){
2413  MPI_Request & mpireqsSendToLeft = arrMpireqsSendLToLeft[supidx];
2414  // Dummy 0-byte send: a receiver with no local Gemm work still needs
2415  // to signal the proc in the column of snode.Index
2416  MPI_Isend( NULL, 0, MPI_BYTE, PCOL(snode.Index,this->grid_),
2417  IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_L_REDUCE),
2418  this->grid_->rowComm, &mpireqsSendToLeft );
2419 
2420 #if ( _DEBUGlevel_ >= 1 )
2421  Int dst = PNUM(MYROW(this->grid_),
2422  PCOL(snode.Index,this->grid_),this->grid_);
2423  statusOFS << "["<<snode.Index<<"] "<< " LReduce P"
2424  << MYPROC(this->grid_) << " has sent "
2425  << 0 << " bytes to "
2426  << dst << std::endl;
2427 #endif
2428  }// if( isRecvFromLeft_( snode.Index ))
2429  TIMER_STOP(Reduce_Sinv_L_Send);
2430 
2431 
2432 
2433  }
2434  }
2435 
2436 #if ( _DEBUGlevel_ >= 1 )
2437  statusOFS<<std::endl<<"gemmToDo ="<<gemmToDo<<std::endl;
2438 #endif
2439 
2440 
2441 #if defined (PROFILE)
2442  end_SendULWaitContentFirst=0;
2443  begin_SendULWaitContentFirst=0;
2444 #endif
2445 
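 // Event loop: repeatedly MPI_Waitsome on the outstanding L and U
 // content requests; a completed request with index reqidx belongs to
 // supernode supidx = reqidx/2. Counting the locally available pieces
 // (pre-seeded above) and the received ones, isReady == 4 means all of
 // Lrow, Lcol, Urow and Ucol are present and the supernode can be
 // processed.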
2446  while(gemmProcessed<gemmToDo)
2447  {
2448  Int reqidx = MPI_UNDEFINED;
2449  Int supidx = -1;
2450 
2451  TIMER_START(WaitContentLU);
2452  //while I don't have anything to do, wait for data to arrive
2453  do{
2454  std::vector<int> reqIndicesL(arrMpireqsRecvLContentFromAny.size()); // avoid a non-standard VLA
2455  int numRecv = 0;
2456 
2457  //then process with the remote ones
2458 
2459 #if defined(PROFILE)
2460  if(begin_SendULWaitContentFirst==0){
2461  begin_SendULWaitContentFirst=1;
2462  TIMER_START(WaitContent_LrowL_First);
2463  }
2464 #endif
2465 
2466  TIMER_START(WaitContent_LrowL);
2467  MPI_Waitsome(2*stepSuper, &arrMpireqsRecvLContentFromAny[0],
2468  &numRecv, reqIndicesL.data(), MPI_STATUSES_IGNORE);
2469 
2470  for(int i =0;i<numRecv;i++){
2471  reqidx = reqIndicesL[i];
2472  //I've received something
2473  if(reqidx!=MPI_UNDEFINED){
2474 
2475  supidx = reqidx/2;
2476  SuperNodeBufferTypeUnsym & snode = arrSuperNodes[supidx];
2477  snode.isReady++;
2478 
2479 #if ( _DEBUGlevel_ >= 1 )
2480  statusOFS << "["<<snode.Index<<"] "<<"Received data for L"
2481  << " reqidx%2=" << reqidx%2
2482  << " is ready ?"<<snode.isReady<<std::endl;
2483 #endif
2484  //if we received both L and U, the supernode is ready
2485  if(snode.isReady==4){
2486  readySupidx.push_back(supidx);
2487 #if defined(PROFILE)
2488  if(end_SendULWaitContentFirst==0){
2489  TIMER_STOP(WaitContent_LrowL_First);
2490  end_SendULWaitContentFirst=1;
2491  }
2492 #endif
2493  }
2494  }
2495 
2496  }//end for waitsome
2497  TIMER_STOP(WaitContent_LrowL);
2498 
2499 
2500  std::vector<int> reqIndicesU(arrMpireqsRecvUContentFromAny.size()); // avoid a non-standard VLA
2501  numRecv = 0;
2502 
2503 
2504  TIMER_START(WaitContent_UcolU);
2505  MPI_Waitsome(2*stepSuper, &arrMpireqsRecvUContentFromAny[0],
2506  &numRecv, reqIndicesU.data(), MPI_STATUSES_IGNORE);
2507 
2508  for(int i =0;i<numRecv;i++){
2509  reqidx = reqIndicesU[i];
2510  //I've received something
2511  if(reqidx!=MPI_UNDEFINED){
2512  supidx = reqidx/2;
2513  SuperNodeBufferTypeUnsym & snode = arrSuperNodes[supidx];
2514  snode.isReady++;
2515 
2516 #if ( _DEBUGlevel_ >= 1 )
2517  statusOFS << "["<<snode.Index<<"] "<<"Received data for U"
2518  << " reqidx%2=" << reqidx%2
2519  << " is ready ?"<<snode.isReady<<std::endl;
2520 #endif
2521  //if we received both L and U, the supernode is ready
2522  if(snode.isReady==4){
2523  readySupidx.push_back(supidx);
2524 #if defined(PROFILE)
2525  if(end_SendULWaitContentFirst==0){
2526  TIMER_STOP(WaitContent_LrowL_First);
2527  end_SendULWaitContentFirst=1;
2528  }
2529 #endif
2530  }
2531  }
2532 
2533  }//end for waitsome
2534  TIMER_STOP(WaitContent_UcolU);
2535 
2536  } while(readySupidx.size()==0);
2537  TIMER_STOP(WaitContentLU);
2538 
2539  //If I have some work to do
2540  if(readySupidx.size()>0)
2541  {
2542  supidx = readySupidx.back();
2543  readySupidx.pop_back();
2544  SuperNodeBufferTypeUnsym & snode = arrSuperNodes[supidx];
2545 
2546 
2547  // Only the processors that received information participate in the Gemm
2548  if( this->isRecvFromAbove_( snode.Index )
2549  && this->isRecvFromLeft_( snode.Index ) ){
2550  std::vector<LBlock<T> > LcolRecv;
2551  std::vector<LBlock<T> > LrowRecv;
2552  std::vector<UBlock<T> > UcolRecv;
2553  std::vector<UBlock<T> > UrowRecv;
2554 
2555  //TODO Remove this; it is only for debugging purposes
2556  NumMat<T> * pAinvBuf = new NumMat<T>();
2557  NumMat<T> * pUBuf = new NumMat<T>();
2558  NumMat<T> * pLBuf = new NumMat<T>();
2559  NumMat<T> & AinvBuf = *pAinvBuf;
2560  NumMat<T> & UBuf = *pUBuf;
2561  NumMat<T> & LBuf = *pLBuf;
2562 
2563  UnpackData(snode, LcolRecv, LrowRecv, UcolRecv, UrowRecv);
2564 
2565  SelInv_lookup_indexes(snode, LcolRecv, LrowRecv,
2566  UcolRecv, UrowRecv,AinvBuf,LBuf, UBuf);
2567 
2568 
2569 #if ( _DEBUGlevel_ >= 2 )
2570  statusOFS << "["<<snode.Index<<"] " << "LBuf: ";
2571  statusOFS << LBuf << std::endl;
2572  statusOFS << "["<<snode.Index<<"] " << "UBuf: ";
2573  statusOFS << UBuf << std::endl;
2574 #endif
2575 
2579 
2580 
2581 
2582  TIMER_START(Compute_Sinv_L_Resize);
2583  snode.LUpdateBuf.Resize( AinvBuf.m(),
2584  SuperSize( snode.Index, this->super_ ) );
2585  TIMER_STOP(Compute_Sinv_L_Resize);
2586 
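 // LUpdateBuf := - AinvBuf * LBuf^T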
2587  TIMER_START(Compute_Sinv_LT_GEMM);
2588  blas::Gemm('N', 'T', AinvBuf.m(), LBuf.m(), AinvBuf.n(),
2589  MINUS_ONE<T>(), AinvBuf.Data(), AinvBuf.m(),
2590  LBuf.Data(), LBuf.m(), ZERO<T>(),
2591  snode.LUpdateBuf.Data(), snode.LUpdateBuf.m() );
2592  TIMER_STOP(Compute_Sinv_LT_GEMM);
2593 
2594  TIMER_START(Compute_Sinv_U_Resize);
2595  snode.UUpdateBuf.Resize( SuperSize( snode.Index, this->super_ ),
2596  AinvBuf.n() );
2597  TIMER_STOP(Compute_Sinv_U_Resize);
2598 
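 // UUpdateBuf := - UBuf * AinvBuf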
2599  TIMER_START(Compute_Sinv_U_GEMM);
2600  blas::Gemm('N', 'N', UBuf.m(), AinvBuf.n(), AinvBuf.m(),
2601  MINUS_ONE<T>(), UBuf.Data(), UBuf.m(),
2602  AinvBuf.Data(), AinvBuf.m(), ZERO<T>(),
2603  snode.UUpdateBuf.Data(), snode.UUpdateBuf.m() );
2604  TIMER_STOP(Compute_Sinv_U_GEMM);
2605 
2606 #if ( _DEBUGlevel_ >= 2 )
2607  statusOFS << "["<<snode.Index<<"] " << "snode.LUpdateBuf: ";
2608  statusOFS << snode.LUpdateBuf << std::endl;
2609  statusOFS << "["<<snode.Index<<"] " << "snode.UUpdateBuf: ";
2610  statusOFS << snode.UUpdateBuf << std::endl;
2611 #endif
2612 
2613  //TODO Remove this; it is only for debugging purposes
2614  delete pLBuf;
2615  delete pUBuf;
2616  delete pAinvBuf;
2617 
2618 
2619  } // if Gemm is to be done locally
2620 
2621 
2622  TIMER_START(Reduce_Sinv_L_Send);
2623  // If I was a receiver, I need to send my data to proc in column
2624  // of snode.Index
2625  if( this->isRecvFromAbove_( snode.Index ) ){
2626  if( this->isRecvFromLeft_( snode.Index )
2627  && MYCOL( this->grid_ ) != PCOL( snode.Index, this->grid_ ) ){
2628  MPI_Request & reqsSendToLeft = arrMpireqsSendLToLeft[supidx];
2629  MPI_Isend( snode.LUpdateBuf.Data(), snode.LUpdateBuf.ByteSize(),
2630  MPI_BYTE, PCOL(snode.Index,this->grid_),
2631  IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_L_REDUCE),
2632  this->grid_->rowComm, &reqsSendToLeft );
2633 
2634 #if ( _DEBUGlevel_ >= 1 )
2635  Int dst = PNUM(MYROW(this->grid_),
2636  PCOL(snode.Index,this->grid_),this->grid_);
2637  statusOFS << "["<<snode.Index<<"] "<< " LReduce P"
2638  << MYPROC(this->grid_) << " has sent "
2639  << snode.LUpdateBuf.ByteSize() << " bytes to "
2640  << dst << std::endl;
2641 #endif
2642  }//Sender
2643  }
2644  TIMER_STOP(Reduce_Sinv_L_Send);
2645 
2646  gemmProcessed+=2;
2647 #if ( _DEBUGlevel_ >= 1 )
2648  statusOFS << "["<<snode.Index<<"] "<<"gemmProcessed = "
2649  << gemmProcessed<<"/"<<gemmToDo<<std::endl;
2650 #endif
2651  }
2652  }
2653 
2654  }
2655  TIMER_STOP(Compute_Sinv_LU);
2656 
2657 
2658 
2659 
2660 
2661 
2662 
2663  //Reduce Sinv L to the processors in PCOL(ksup,this->grid_)
2664  TIMER_START(Reduce_Sinv_L);
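 // Owner processors in the column of each supernode first rebuild the
 // row layout (RowLocalPtr/BlockIdxLocal) of LUpdateBuf, then sum the
 // contributions arriving from their process row: every sender posts one
 // message per supernode (possibly 0 bytes), so exactly
 // CountSendToRight(ksup) blocking receives are issued and non-empty
 // payloads are accumulated with an Axpy.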
2665 
2666  for (Int supidx=0; supidx<stepSuper; supidx++){
2667  SuperNodeBufferTypeUnsym & snode = arrSuperNodes[supidx];
2668  if( MYCOL( this->grid_ ) == PCOL( snode.Index, this->grid_ ) ){
2669  //determine the number of rows in LUpdateBufReduced
2670  Int numRowLUpdateBuf;
2671  std::vector<LBlock<T> >& Lcol =
2672  this->L( LBj( snode.Index, this->grid_ ) );
2673 
2674  // If I own the diagonal block, skip the diagonal block
2675  Int offset =
2676  (MYROW( this->grid_ )==PROW(snode.Index,this->grid_))?1:0;
2677  snode.RowLocalPtr.resize( Lcol.size() + 1 - offset );
2678  snode.BlockIdxLocal.resize( Lcol.size() - offset );
2679  snode.RowLocalPtr[0] = 0;
2680  for( Int ib = 0; ib < snode.BlockIdxLocal.size(); ib++ ){
2681  snode.RowLocalPtr[ib+1] = snode.RowLocalPtr[ib]
2682  + Lcol[ib+offset].numRow;
2683  snode.BlockIdxLocal[ib] = Lcol[ib+offset].blockIdx;
2684  }
2685  numRowLUpdateBuf = *snode.RowLocalPtr.rbegin();
2686 
2687 
2688  if( numRowLUpdateBuf > 0 ){
2689  if( snode.LUpdateBuf.m() == 0 && snode.LUpdateBuf.n() == 0 ){
2690  snode.LUpdateBuf.Resize( numRowLUpdateBuf,
2691  SuperSize( snode.Index, this->super_ ) );
2692  // Filling with zeros is important
2693  SetValue( snode.LUpdateBuf, ZERO<T>() );
2694  }
2695  }
2696 
2697 #if ( _DEBUGlevel_ >= 2 )
2698  statusOFS << "["<<snode.Index<<"] "<<"LUpdateBuf Before Reduction: "
2699  << snode.LUpdateBuf << std::endl << std::endl;
2700 #endif
2701 
2702  Int totCountRecv = 0;
2703  Int numRecv = this->CountSendToRight(snode.Index);
2704  NumMat<T> LUpdateBufRecv(numRowLUpdateBuf,
2705  SuperSize( snode.Index, this->super_ ) );
2706  for( Int countRecv = 0; countRecv < numRecv ; ++countRecv ){
2707  //Do the blocking recv
2708  MPI_Status stat;
2709  Int size = 0;
2710  TIMER_START(L_RECV);
2711  MPI_Recv(LUpdateBufRecv.Data(), LUpdateBufRecv.ByteSize(),
2712  MPI_BYTE, MPI_ANY_SOURCE,
2713  IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_L_REDUCE),
2714  this->grid_->rowComm,&stat);
2715  TIMER_STOP(L_RECV);
2716  MPI_Get_count(&stat, MPI_BYTE, &size);
2717  //if the processor contributes
2718  if(size>0){
2719 #if ( _DEBUGlevel_ >= 1 )
2720  Int src = PNUM(MYROW(this->grid_),stat.MPI_SOURCE,this->grid_);
2721  statusOFS << "["<<snode.Index<<"] "<< " LReduce P"
2722  << MYPROC(this->grid_)<<" has received "<< size
2723  << " bytes from " << src << std::endl;
2724 #endif
2725 #if ( _DEBUGlevel_ >= 2 )
2726  statusOFS << "["<<snode.Index<<"] "<< "LUpdateBufRecv: "
2727  << LUpdateBufRecv << std::endl << std::endl;
2728 #endif
2729  //do the sum
2730  blas::Axpy(snode.LUpdateBuf.Size(), ONE<T>(),
2731  LUpdateBufRecv.Data(), 1,
2732  snode.LUpdateBuf.Data(), 1 );
2733  }
2734  } // for (countRecv)
2735 #if ( _DEBUGlevel_ >= 2 )
2736  statusOFS << "["<<snode.Index<<"] "<< "LUpdateBuf After Reduction: "
2737  << snode.LUpdateBuf << std::endl << std::endl;
2738 #endif
2739  } // Receiver
2740  }
2741 
2742  TIMER_STOP(Reduce_Sinv_L);
2743  //--------------------- End of reduce of LUpdateBuf-------------------------
2744 
2745 
2746  mpi::Waitall( arrMpireqsSendLToLeft );
2747 
2748 
2749  if(next_lidx < superList.size()){
2750  IRecvContentCD(superList[next_lidx], superList[next_lidx].size(),nextCDBuffers);
2751  }
2752 
2753 
2754 
2755 
2756 
2757  TIMER_START(Update_Diagonal);
2758  for (Int supidx=0; supidx<stepSuper; supidx++){
2759  SuperNodeBufferTypeUnsym & snode = arrSuperNodes[supidx];
2760 
2761  ComputeDiagUpdate(snode);
2762 
2763  if( MYCOL( this->grid_ ) == PCOL( snode.Index, this->grid_ ) ){
2764  if( MYROW( this->grid_ ) != PROW( snode.Index, this->grid_ ) ){
2765  if(this->isSendToDiagonal_(snode.Index)){
2766  //send to above
2767  MPI_Request & mpireqsSendToAbove = arrMpireqsSendToAbove[supidx];
2768  MPI_Isend( snode.DiagBuf.Data(), snode.DiagBuf.ByteSize(),
2769  MPI_BYTE, PROW(snode.Index,this->grid_) ,
2770  IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_D_REDUCE),
2771  this->grid_->colComm, &mpireqsSendToAbove );
2772 
2773 #if ( _DEBUGlevel_ >= 1 )
2774  statusOFS << "["<<snode.Index<<"] "<< " P"<<MYROW(this->grid_)
2775  <<" has sent "<< snode.DiagBuf.ByteSize()
2776  << " bytes of DiagBuf to "
2777  << PROW(snode.Index,this->grid_) << std::endl;
2778 #endif
2779  }
2780  }
2781  }
2782  }
2783 
2784  TIMER_STOP(Update_Diagonal);
2785 
2786 
2787 
2788  TIMER_START(Reduce_Diagonal);
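 // The diagonal owner posts CountRecvFromBelow(ksup) blocking receives
 // on its process column, accumulates the non-empty DiagBuf
 // contributions, and finally adds the total onto the nzval of the
 // diagonal LBlock.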
2789 
2790  for (Int supidx=0; supidx<stepSuper; supidx++){
2791  SuperNodeBufferTypeUnsym & snode = arrSuperNodes[supidx];
2792  if( MYCOL( this->grid_ ) == PCOL( snode.Index, this->grid_ ) ){
2793  if( MYROW( this->grid_ ) == PROW( snode.Index, this->grid_ ) ){
2794  if(snode.DiagBuf.Size()==0){
2795  snode.DiagBuf.Resize( SuperSize( snode.Index, this->super_ ),
2796  SuperSize( snode.Index, this->super_ ));
2797  SetValue(snode.DiagBuf, ZERO<T>());
2798  }
2799  //receive from below
2800  Int totCountRecv = 0;
2801  Int numRecv = this->CountRecvFromBelow(snode.Index);
2802  NumMat<T> DiagBufRecv(snode.DiagBuf.m(),snode.DiagBuf.n());
2803 
2804  for( Int countRecv = 0; countRecv < numRecv ; ++countRecv ){
2805  //Do the blocking recv
2806  MPI_Status stat;
2807  Int size = 0;
2808  TIMER_START(D_RECV);
2809  MPI_Recv(DiagBufRecv.Data(), DiagBufRecv.ByteSize(), MPI_BYTE,
2810  MPI_ANY_SOURCE,IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_D_REDUCE),
2811  this->grid_->colComm,&stat);
2812  TIMER_STOP(D_RECV);
2813  MPI_Get_count(&stat, MPI_BYTE, &size);
2814  //if the processor contributes
2815  if(size>0){
2816  // Add DiagBufRecv to diagonal block.
2817  blas::Axpy(snode.DiagBuf.Size(), ONE<T>(),
2818  DiagBufRecv.Data(), 1,
2819  snode.DiagBuf.Data(), 1 );
2820  }
2821  }
2822  LBlock<T> & LB = this->L( LBj( snode.Index, this->grid_ ) )[0];
2823  blas::Axpy( LB.numRow * LB.numCol, ONE<T>(),
2824  snode.DiagBuf.Data(), 1,
2825  LB.nzval.Data(), 1 );
2826  }
2827 
2828  }
2829  }
2830 
2831 
2832  TIMER_STOP(Reduce_Diagonal);
2833 
2834 
2835 
2836  //Reduce U Sinv to the processors in PROW(ksup,this->grid_)
2837  TIMER_START(Reduce_Sinv_U);
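 // Counterpart of the L reduction: the owner row of each supernode
 // rebuilds the column layout (ColLocalPtr/BlockIdxLocalU) of UUpdateBuf
 // and accumulates CountSendToBelow(ksup) messages received along its
 // process column; every non-owner sends either its UUpdateBuf or a
 // 0-byte placeholder so that the receive count always matches.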
2838 
2839  for (Int supidx=0; supidx<stepSuper; supidx++){
2840  SuperNodeBufferTypeUnsym & snode = arrSuperNodes[supidx];
2841 
2842 
2843  if( MYROW( this->grid_ ) == PROW( snode.Index, this->grid_ ) ){
2844  //determine the number of rows in UUpdateBufReduced
2845  Int numColUUpdateBuf;
2846  //FIXME U must be revised to store the same structure as L ?
2847  std::vector<UBlock<T> >& Urow =
2848  this->U( LBi( snode.Index, this->grid_ ) );
2849 
2850  snode.ColLocalPtr.resize( Urow.size() + 1 );
2851  snode.BlockIdxLocalU.resize( Urow.size() );
2852  snode.ColLocalPtr[0] = 0;
2853 
2854  // std::vector<LBlock<T> >& Lrow = this->Lrow( LBi( snode.Index, this->grid_ ) );
2855  // if(Lrow.size()>=Urow.size()){
2856  // statusOFS<<"UReduce first case"<<std::endl;
2857  // std::vector<Int> colPtrL(Lrow.size()+1);
2858  // colPtrL[0] = 0;
2859  // for( Int ib = 0; ib < Lrow.size(); ib++ ){
2860  // colPtrL[ib+1] = colPtrL[ib] + Lrow[ib].numCol;
2861  // }
2862  //
2863  //
2864  //
2865  // for( Int jb = 0; jb < Urow.size(); jb++ ){
2866  // Int indexL =0;
2867  // for( Int ib = 0; ib < Lrow.size(); ib++ ){
2868  // if(Lrow[ib].blockIdx == Urow[jb].blockIdx){
2869  // indexL = ib;
2870  // break;
2871  // }
2872  // }
2873  // statusOFS<<jb<<" vs "<<indexL<<std::endl;
2874  //
2875  // snode.ColLocalPtr[jb] = colPtrL[indexL];
2876  // //snode.ColLocalPtr[jb+1] = snode.ColLocalPtr[jb] + Urow[jb].numCol;
2877  // snode.BlockIdxLocalU[jb] = Lrow[indexL].blockIdx;
2878  // //snode.BlockIdxLocalU[jb] = Urow[jb].blockIdx;
2879  // }
2880  // snode.ColLocalPtr.back()=colPtrL.back();
2881  //
2882  // statusOFS<<colPtrL<<std::endl;
2883  // }
2884  // else{
2885  // statusOFS<<"UReduce second case"<<std::endl;
2886  Int urowsize = Urow.size();
2887  snode.ColLocalPtr[0] = 0;
2888  for( Int jb = 0; jb < Urow.size(); jb++ ){
2889  UBlock<T> & UB = Urow[jb];
2890  snode.ColLocalPtr[jb+1] = snode.ColLocalPtr[jb]+UB.numCol;
2891  snode.BlockIdxLocalU[jb] = UB.blockIdx;
2892  }
2893  // }
2894 
2895 #if ( _DEBUGlevel_ >= 2 )
2896  statusOFS << "["<<snode.Index<<"] "<<"UReduce collocalptr "
2897  << snode.ColLocalPtr << std::endl;
2898  statusOFS << "["<<snode.Index<<"] "<<"UReduce blockidxlocalU "
2899  << snode.BlockIdxLocalU << std::endl;
2900 #endif
2901 
2902  numColUUpdateBuf = *snode.ColLocalPtr.rbegin();
2903 
2904  if( numColUUpdateBuf > 0 ){
2905  if( snode.UUpdateBuf.m() == 0 && snode.UUpdateBuf.n() == 0 ){
2906  snode.UUpdateBuf.Resize( SuperSize( snode.Index, this->super_ ),
2907  numColUUpdateBuf );
2908  // Filling with zeros is important
2909  SetValue( snode.UUpdateBuf, ZERO<T>() );
2910  }
2911  }
2912 
2913 #if ( _DEBUGlevel_ >= 2 )
2914  statusOFS << "["<<snode.Index<<"] "<<"UUpdateBuf Before Reduction: "
2915  << snode.UUpdateBuf << std::endl << std::endl;
2916 #endif
2917 
2918  Int totCountRecv = 0;
2919 
2920  Int numRecv = this->CountSendToBelow(snode.Index);
2921 
2922  NumMat<T> UUpdateBufRecv(SuperSize( snode.Index, this->super_ ),
2923  numColUUpdateBuf );
2924 
2925 #if ( _DEBUGlevel_ >= 1 )
2926  statusOFS << "["<<snode.Index<<"] "<< " UReduce P"
2927  << MYPROC(this->grid_) << " can receive at most "
2928  << UUpdateBufRecv.ByteSize() << " bytes" << std::endl;
2929 #endif
2930  for( Int countRecv = 0; countRecv < numRecv ; ++countRecv ){
2931  //Do the blocking recv
2932  MPI_Status stat;
2933  Int size = 0;
2934  TIMER_START(U_RECV);
2935  MPI_Recv(UUpdateBufRecv.Data(), UUpdateBufRecv.ByteSize(),
2936  MPI_BYTE, MPI_ANY_SOURCE,
2937  IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_U_REDUCE),
2938  this->grid_->colComm,&stat);
2939  TIMER_STOP(U_RECV);
2940  MPI_Get_count(&stat, MPI_BYTE, &size);
2941  //if the processor contributes
2942  if(size>0){
2943 
2944 #if ( _DEBUGlevel_ >= 1 )
2945  Int src = PNUM(stat.MPI_SOURCE,MYCOL(this->grid_),this->grid_);
2946  statusOFS << "["<<snode.Index<<"] "<< " UReduce P"
2947  << MYPROC(this->grid_)<<" has received "
2948  << size << " bytes from P" << src << std::endl;
2949 #endif
2950 
2951 #if ( _DEBUGlevel_ >= 2 )
2952  statusOFS << "["<<snode.Index<<"] "<< "UUpdateBufRecv: "
2953  << UUpdateBufRecv << std::endl << std::endl;
2954 #endif
2955  //do the sum
2956  blas::Axpy(snode.UUpdateBuf.Size(), ONE<T>(),
2957  UUpdateBufRecv.Data(), 1,
2958  snode.UUpdateBuf.Data(), 1);
2959  }
2960  } // for (countRecv)
2961 #if ( _DEBUGlevel_ >= 2 )
2962  statusOFS << "["<<snode.Index<<"] "<<"UUpdateBuf After Reduction: "
2963  << snode.UUpdateBuf << std::endl << std::endl;
2964 #endif
2965  } // Receiver
2966  else{
2967 
2968 
2969  TIMER_START(Reduce_Sinv_U_Send);
2970  if(!this->isRecvFromLeft_( snode.Index ) && this->isRecvFromAbove_( snode.Index ) ){
2971  MPI_Request & mpireqsSendToAbove = arrMpireqsSendUToAbove[supidx];
2972  // Dummy 0-byte send: a receiver with no contribution still needs to
2973  // signal the proc in the row of snode.Index
2974  MPI_Isend( NULL, 0, MPI_BYTE, PROW(snode.Index,this->grid_),
2975  IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_U_REDUCE),
2976  this->grid_->colComm, &mpireqsSendToAbove );
2977 
2978 #if ( _DEBUGlevel_ >= 1 )
2979  Int dst = PNUM(PROW(snode.Index,this->grid_),
2980  MYCOL(this->grid_),this->grid_);
2981  statusOFS << "["<<snode.Index<<"] "<< " UReduce P"
2982  << MYPROC(this->grid_) << " has sent "
2983  << 0 << " bytes to "
2984  << dst << std::endl;
2985 #endif
2986  }// if( isRecvFromAbove_( snode.Index ) )
2987  TIMER_STOP(Reduce_Sinv_U_Send);
2988 
2989 
2990  TIMER_START(Reduce_Sinv_U_Send);
2991  // If I was a receiver, I need to send my data to proc in row
2992  // of snode.Index
2993  if( this->isRecvFromAbove_( snode.Index ) && this->isRecvFromLeft_( snode.Index ) ){
2994  MPI_Request & reqsSendToAbove = arrMpireqsSendUToAbove[supidx];
2995  TIMER_START(Reduce_Sinv_U_Send_Isend);
2996  MPI_Isend( snode.UUpdateBuf.Data(), snode.UUpdateBuf.ByteSize(),
2997  MPI_BYTE, PROW(snode.Index,this->grid_),
2998  IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_U_REDUCE),
2999  this->grid_->colComm, &reqsSendToAbove );
3000  TIMER_STOP(Reduce_Sinv_U_Send_Isend);
3001 #if ( _DEBUGlevel_ >= 1 )
3002  Int dst = PNUM(PROW(snode.Index,this->grid_),
3003  MYCOL(this->grid_),this->grid_);
3004  statusOFS << "["<<snode.Index<<"] "<< " UReduce P"
3005  << MYPROC(this->grid_) << " has sent "
3006  << snode.UUpdateBuf.ByteSize() << " bytes to "
3007  << dst << std::endl;
3008 #endif
3009  }//Sender
3010  TIMER_STOP(Reduce_Sinv_U_Send);
3011 
3012 
3013  }
3014  }
3015 
3016  TIMER_STOP(Reduce_Sinv_U);
3017  //--------------------- End of reduce of UUpdateBuf-------------------------
3018 
3019  mpi::Waitall( arrMpireqsSendUToAbove );
3020 
3021 
3022 
3023 
3024 
3025 
3026  //Deallocate Lrow and Ucol
3027  for (Int supidx=0; supidx<stepSuper; supidx++){
3028  SuperNodeBufferTypeUnsym & snode = arrSuperNodes[supidx];
3029  if( MYROW( this->grid_ ) == PROW( snode.Index, this->grid_ )){
3030  std::vector<LBlock<T> >& Lrow = this->Lrow( LBi(snode.Index, this->grid_) );
3031  Lrow.clear();
3032  }
3033  if( MYCOL( this->grid_ ) == PCOL( snode.Index, this->grid_ )){
3034  std::vector<UBlock<T> >& Ucol = this->Ucol( LBj(snode.Index, this->grid_) );
3035  Ucol.clear();
3036  }
3037  }
3038 
3039 
3040  TIMER_START(Update_L);
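 // Write-back: the owner column copies the reduced LUpdateBuf into its
 // local L blocks with Lacpy, locating each block's rows through
 // RowLocalPtr and skipping the diagonal block when owned locally.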
3041 
3042  for (Int supidx=0; supidx<stepSuper; supidx++){
3043  SuperNodeBufferTypeUnsym & snode = arrSuperNodes[supidx];
3044 
3045 #if ( _DEBUGlevel_ >= 1 )
3046  statusOFS << std::endl << "["<<snode.Index<<"] "
3047  << "Finish updating the L part by filling LUpdateBufReduced"
3048  << " back to L" << std::endl << std::endl;
3049 #endif
3050 
3051  if( MYCOL( this->grid_ ) == PCOL( snode.Index, this->grid_ )
3052  && snode.LUpdateBuf.m() > 0 ){
3053  std::vector<LBlock<T> >& Lcol = this->L(LBj(snode.Index,this->grid_));
3054  //Need to skip the diagonal block if present
3055  Int startBlock =
3056  (MYROW( this->grid_ ) == PROW( snode.Index, this->grid_ ))?1:0;
3057  for( Int ib = startBlock; ib < Lcol.size(); ib++ ){
3058  LBlock<T> & LB = Lcol[ib];
3059  if(1|| (LB.numRow>0 && LB.numCol>0)){
3060  lapack::Lacpy( 'A', LB.numRow, LB.numCol,
3061  &snode.LUpdateBuf(snode.RowLocalPtr[ib-startBlock], 0),
3062  snode.LUpdateBuf.m(), LB.nzval.Data(), LB.numRow );
3063  }
3064  }
3065  } // Finish updating L
3066  } // for (snode.Index) : Main loop
3067 
3068 
3069  TIMER_STOP(Update_L);
3070 
3071 
3072 
3073 
3074  TIMER_START(Update_U);
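 // Same write-back for U: each local Urow block takes its columns from
 // UUpdateBuf at offset ColLocalPtr[jb].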
3075 
3076  for (Int supidx=0; supidx<stepSuper; supidx++){
3077  SuperNodeBufferTypeUnsym & snode = arrSuperNodes[supidx];
3078 
3079 #if ( _DEBUGlevel_ >= 1 )
3080  statusOFS << "["<<snode.Index<<"] "
3081  << "Finish updating the U part by filling UUpdateBufReduced"
3082  << " back to U" << std::endl << std::endl;
3083 #endif
3084 
3085  if( MYROW( this->grid_ ) == PROW( snode.Index, this->grid_ )
3086  && snode.UUpdateBuf.m() > 0 ){
3087  std::vector<UBlock<T> >& Urow = this->U(LBi(snode.Index,this->grid_));
3088 
3089  for( Int jb = 0; jb < Urow.size(); jb++ ){
3090  UBlock<T> & UB = Urow[jb];
3091  if(1|| (UB.numRow>0 && UB.numCol>0)){
3092 #if ( _DEBUGlevel_ >= 1 )
3093  statusOFS << "["<<snode.Index<<"] "<<"Putting colptr "
3094  << snode.ColLocalPtr[jb] << " of UUpdateBuf "
3095  << snode.ColLocalPtr << " "
3096  << "in UB "<<UB.blockIdx<<std::endl;
3097 #endif
3098  //Indices follow L order... look at Lrow
3099  lapack::Lacpy( 'A', UB.numRow, UB.numCol,
3100  &snode.UUpdateBuf( 0, snode.ColLocalPtr[jb] ),
3101  snode.UUpdateBuf.m(), UB.nzval.Data(), UB.numRow );
3102 #if ( _DEBUGlevel_ >= 2 )
3103  statusOFS<< "["<<snode.Index<<"] "<<"UB: "<<UB.nzval<<std::endl;
3104 #endif
3105  }
3106  }
3107  } // Finish updating U
3108  } // for (snode.Index) : Main loop
3109 
3110 
3111  TIMER_STOP(Update_U);
3112 
3113 
3114 
3115 
3116  if(next_lidx < superList.size()){
3117  WaitContentLCD(superList[next_lidx], superList[next_lidx].size(),nextCDBuffers);
3118  WaitContentUCD(superList[next_lidx], superList[next_lidx].size(),nextCDBuffers);
3119  }
3120 
3121 
3122 
3123  TIMER_START(Barrier);
3124  if(next_lidx < superList.size()){
3125  nextCDBuffers.WaitAllSend();
3126  }
3127  mpi::Waitall(arrMpireqsRecvLContentFromAny);
3128  mpi::Waitall(arrMpireqsRecvUContentFromAny);
3129  //Sync for reduce L
3130  // mpi::Waitall( arrMpireqsSendToLeft );
3131  //Sync for reduce D
3132  mpi::Waitall(arrMpireqsSendToAbove);
3133 
3134  for (Int supidx=0; supidx<stepSuper; supidx++){
3135  Int ksup = superList[lidx][supidx];
3136  std::vector<MPI_Request> & mpireqsSendLToRight = arrMpireqsSendLToRight[supidx];
3137  std::vector<MPI_Request> & mpireqsSendLToBelow = arrMpireqsSendLToBelow[supidx];
3138  std::vector<MPI_Request> & mpireqsSendUToRight = arrMpireqsSendUToRight[supidx];
3139  std::vector<MPI_Request> & mpireqsSendUToBelow = arrMpireqsSendUToBelow[supidx];
3140 
3141  mpi::Waitall( mpireqsSendLToRight );
3142  mpi::Waitall( mpireqsSendLToBelow );
3143  mpi::Waitall( mpireqsSendUToRight );
3144  mpi::Waitall( mpireqsSendUToBelow );
3145 
3146  }
3147  TIMER_STOP(Barrier);
3148 
3149 
3150 
3151 #ifdef LIST_BARRIER
3152 #ifndef ALL_BARRIER
3153  if (this->options_->maxPipelineDepth!=-1)
3154 #endif
3155  {
3156  MPI_Barrier(this->grid_->comm);
3157  }
3158 #endif
3159 
3160  }
3161 
3162  template<typename T>
3163  void PMatrixUnsym<T>::SelInv ( )
3164  {
3165  this->SelInv_P2p ( );
3166  } // ----- end of method PMatrixUnsym::SelInv -----
3167 
3168  template<typename T>
3169  void PMatrixUnsym<T>::SelInv_P2p ( )
3170  {
3171  TIMER_START(SelInv_P2p);
3172 
3173 
3174 
3175  Int numSuper = this->NumSuper();
3176 
3177  // Main loop
3178  std::vector<std::vector<Int> > & superList = this->WorkingSet();
3179  Int numSteps = superList.size();
3180 
3181  for (Int lidx=0; lidx<numSteps ; lidx++){
3182  Int stepSuper = superList[lidx].size();
3183  this->SelInvIntra_P2p(lidx);
3184  }
3185 
3186 
3187  TIMER_STOP(SelInv_P2p);
3188 
3189  return ;
3190  } // ----- end of method PMatrixUnsym::SelInv_P2p -----
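// A minimal sketch of the intended calling sequence (hypothetical driver
// code; the grid/supernode/options objects are assumed to be set up from
// the factorization):
//
//   PMatrixUnsym<Complex> PMloc( &grid, &super, &selinvOpt, &factOpt );
//   PMloc.ConstructCommunicationPattern(); // one-time structural analysis
//   PMloc.PreSelInv();  // invert diagonal blocks, scale L and U panels
//   PMloc.SelInv();     // the selected inversion proper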
3191 
3192 
3193 
3194 
3195 
3196 
3197 
3198 
3199 
3200 
3201 
3202 
3203 
3204  template<typename T>
3205  void PMatrixUnsym<T>::PreSelInv ( )
3206  {
3207 
3208  Int numSuper = this->NumSuper();
3209 
3210 #if ( _DEBUGlevel_ >= 1 )
3211  statusOFS << std::endl << "L(i,k) <- L(i,k) * L(k,k)^{-1}"
3212  << std::endl << std::endl;
3213 #endif
3214 
3215  for( Int ksup = 0; ksup < numSuper; ksup++ ){
3216  if( MYCOL( this->grid_ ) == PCOL( ksup, this->grid_ ) ){
3217  // Broadcast the diagonal L block within the processor column
3218  NumMat<T> nzvalLDiag;
3219  std::vector<LBlock<T> >& Lcol = this->L( LBj( ksup, this->grid_ ) );
3220  if( MYROW( this->grid_ ) == PROW( ksup, this->grid_ ) ){
3221  nzvalLDiag = Lcol[0].nzval;
3222  if( nzvalLDiag.m() != SuperSize(ksup, this->super_) ||
3223  nzvalLDiag.n() != SuperSize(ksup, this->super_) ){
3224  ErrorHandling(
3225  "The size of the diagonal block of L is wrong." );
3226  }
3227  } // Owns the diagonal block
3228  else {
3229  nzvalLDiag.Resize(SuperSize(ksup, this->super_), SuperSize(ksup, this->super_));
3230  }
3231  MPI_Bcast( (void*)nzvalLDiag.Data(), nzvalLDiag.ByteSize(),
3232  MPI_BYTE, PROW( ksup, this->grid_ ), this->grid_->colComm );
3233 
3234  // Triangular solve
3235  for( Int ib = 0; ib < Lcol.size(); ib++ ){
3236  LBlock<T> & LB = Lcol[ib];
3237  if( LB.blockIdx > ksup && LB.numCol>0 && LB.numRow>0 ){
3238 #if ( _DEBUGlevel_ >= 2 )
3239  // Check the correctness of the triangular solve
3240  //for the first local column
3241  // if( LBj( ksup, this->grid_ ) == 0 ){
3242  // statusOFS << "Diag L(" << ksup << ", " << ksup << "): "
3243  // << nzvalLDiag << std::endl;
3244  // statusOFS << "Before solve L(" << LB.blockIdx << ", " << ksup
3245  // << "): " << LB.nzval << std::endl;
3246  // }
3247 #endif
3248 #if defined( CHECK_NORM )
3249  NumMat<T> Ljk = LB.nzval;
3250 #endif
3251  blas::Trsm( 'R', 'L', 'N', 'U', LB.numRow, LB.numCol,
3252  ONE<T>(), nzvalLDiag.Data(), LB.numCol,
3253  LB.nzval.Data(), LB.numRow );
3254 #if defined( CHECK_NORM )
3255  NumMat<T> res = LB.nzval;
3256  // Compute L'(j,k) * L(k,k), which should equal the original L(j,k)
3257  blas::Trmm('R','L','N','U',res.m(),res.n(),ONE<T>(),nzvalLDiag.Data(),nzvalLDiag.m(),res.Data(),res.m());
3258  blas::Axpy(res.Size(), MINUS_ONE<T>(), Ljk.Data(), 1, res.Data(), 1 );
3259  double norm = lapack::Lange('F',res.m(),res.n(),res.Data(),res.m(),Ljk.Data());
3260 
3261  statusOFS << "After solve norm of residual of L(" << LB.blockIdx << ", " << ksup
3262  << "): " << norm << std::endl;
3263 
3264 
3265 
3266 
3267 
3268 
3269  // Check the correctness of the triangular solve
3270  // for the first local column
3271  // if( LBj( ksup, this->grid_ ) == 0 ){
3272  // statusOFS << "After solve L(" << LB.blockIdx << ", " << ksup
3273  // << "): " << LB.nzval << std::endl;
3274  // }
3275 #endif
3276  }
3277  }
3278  } // if( MYCOL( this->grid_ ) == PCOL( ksup, this->grid_ ) )
3279  } // for (ksup)
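// In matrix terms the loop above performs, for every off-diagonal block
// in the supernodal column ksup,
//
//   L(i,k) <- L(i,k) * L(k,k)^{-1},   L(k,k) unit lower triangular,
//
// which is exactly the blas::Trsm call with side 'R', uplo 'L', diag 'U'.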
3280 
3281 
3282 
3283 
3284 #if ( _DEBUGlevel_ >= 1 )
3285  statusOFS << std::endl << "U(k,j) <- U(k,k)^{-1} * U(k,j)"
3286  << std::endl << std::endl;
3287 #endif
3288  for( Int ksup = 0; ksup < numSuper; ksup++ ){
3289  if( MYROW( this->grid_ ) == PROW( ksup, this->grid_ ) ){
3290  // Broadcast the diagonal block (stored in L) within the processor row
3291  NumMat<T> nzvalUDiag;
3292  std::vector<UBlock<T> >& Urow = this->U( LBi( ksup, this->grid_ ) );
3293  if( MYCOL( this->grid_ ) == PCOL( ksup, this->grid_ ) ){
3294  std::vector<LBlock<T> >& Lcol = this->L( LBj( ksup, this->grid_ ) );
3295  nzvalUDiag = Lcol[0].nzval;
3296  if( nzvalUDiag.m() != SuperSize(ksup, this->super_) ||
3297  nzvalUDiag.n() != SuperSize(ksup, this->super_) ){
3298  ErrorHandling(
3299  "The size of the diagonal block of U is wrong." );
3300  }
3301  } // Owns the diagonal block
3302  else {
3303  nzvalUDiag.Resize(SuperSize(ksup, this->super_), SuperSize(ksup, this->super_));
3304  }
3305  MPI_Bcast( (void*)nzvalUDiag.Data(), nzvalUDiag.ByteSize(),
3306  MPI_BYTE, PCOL( ksup, this->grid_ ), this->grid_->rowComm );
3307 
3308  // Triangular solve
3309  for( Int jb = 0; jb < Urow.size(); jb++ ){
3310  UBlock<T> & UB = Urow[jb];
3311  if( UB.blockIdx > ksup && UB.numCol>0 && UB.numRow>0 ){
3312 #if ( _DEBUGlevel_ >= 2 )
3313  // Check the correctness of the triangular solve for the first local column
3314  // if( LBi( ksup, this->grid_ ) == 0 ){
3315  // statusOFS << "Diag U(" << ksup << ", " << ksup << "): "
3316  // << nzvalUDiag << std::endl;
3317  // statusOFS << "Before solve U(" << ksup << ", " << UB.blockIdx
3318  // << "): " << UB.nzval << std::endl;
3319  // }
3320 #endif
3321 #if defined( CHECK_NORM )
3322  NumMat<T> Ukj = UB.nzval;
3323 #endif
3324  blas::Trsm( 'L', 'U', 'N', 'N', UB.numRow, UB.numCol,
3325  ONE<T>(), nzvalUDiag.Data(), UB.numRow,
3326  UB.nzval.Data(), UB.numRow );
3327 #if defined( CHECK_NORM )
3328  NumMat<T> res = UB.nzval;
3329  // Compute U(k,k) * U'(k,j), which should equal the original U(k,j)
3330  blas::Trmm('L','U','N','N',res.m(),res.n(),ONE<T>(),nzvalUDiag.Data(),nzvalUDiag.m(),res.Data(),res.m());
3331  blas::Axpy(res.Size(), MINUS_ONE<T>(), Ukj.Data(), 1, res.Data(), 1 );
3332  double norm = lapack::Lange('F',res.m(),res.n(),res.Data(),res.m(),Ukj.Data());
3333  statusOFS << "After solve, norm of residual of U(" << ksup << ", " << UB.blockIdx
3334  << "): " << norm << std::endl;
3335 
3336 
3337  // Check the correctness of the triangular solve for the first local column
3338  // if( LBi( ksup, this->grid_ ) == 0 ){
3339  // statusOFS << "After solve U(" << ksup << ", " << UB.blockIdx
3340  // << "): " << UB.nzval << std::endl;
3341  // }
3342 #endif
3343  }
3344  }
3345  } // if( MYROW( this->grid_ ) == PROW( ksup, this->grid_ ) )
3346  } // for (ksup)
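// Symmetrically, the loop above performs, for every block in the
// supernodal row ksup,
//
//   U(k,j) <- U(k,k)^{-1} * U(k,j),   U(k,k) non-unit upper triangular,
//
// via blas::Trsm with side 'L', uplo 'U', diag 'N'. The diagonal block is
// read from L, where the factorization stores it.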
3347 
3348 
3349 
3350 
3351 
3352 
3353 #if ( _DEBUGlevel_ >= 1 )
3354  statusOFS << std::endl << "L(i,i) <- [L(k,k) * U(k,k)]^{-1}" << std::endl
3355  << std::endl;
3356 #endif
3357 
3358  for( Int ksup = 0; ksup < numSuper; ksup++ ){
3359  if( MYROW( this->grid_ ) == PROW( ksup, this->grid_ ) &&
3360  MYCOL( this->grid_ ) == PCOL( ksup, this->grid_ ) ){
3361  IntNumVec ipiv( SuperSize( ksup, this->super_ ) );
3362  // Note that the pivot vector ipiv follows the FORTRAN convention,
3363  // i.e. it is 1-based (hence the +1)
3364  for(Int i = 0; i < SuperSize( ksup, this->super_ ); i++){
3365  ipiv[i] = i + 1;
3366  }
3367  LBlock<T> & LB = (this->L( LBj( ksup, this->grid_ ) ))[0];
3368 #if ( _DEBUGlevel_ >= 2 )
3369  // Check the correctness of the matrix inversion
3370  // for the first local column
3371  // statusOFS << "Factorized A (" << ksup << ", " << ksup << "): "
3372  // << LB.nzval << std::endl;
3373 #endif
3374 
3375 #if defined( CHECK_NORM )
3376  NumMat<T> Lkk = LB.nzval;
3377 #endif
3378  lapack::Getri( SuperSize( ksup, this->super_ ), LB.nzval.Data(),
3379  SuperSize( ksup, this->super_ ), ipiv.Data() );
3380 
3381 #if defined( CHECK_NORM )
3382  NumMat<T> res (LB.nzval.m(), LB.nzval.n());
3383  NumMat<T> Akk (LB.nzval.m(), LB.nzval.n());
3384  //rebuild A
3385  SetValue(Akk,ZERO<T>());
3386  // copy the U factor (upper triangle of the packed LU) into Akk
3387  for(Int i = 0; i<Akk.m();++i){
3388  for(Int j = i; j<Akk.n();++j){
3389  Akk(i,j)= Lkk(i,j);
3390  }
3391  }
3392  blas::Trmm('L','L','N','U',Akk.m(),Akk.n(),ONE<T>(),Lkk.Data(),Lkk.m(),Akk.Data(),Akk.m());
3393 
3394  // statusOFS << "After inversion, original A(" << ksup << ", " << ksup
3395  // << "): " << Akk << std::endl;
3396 
3397  // Compute A(k,k)^{-1} * A(k,k), which should be the identity
3398  blas::Gemm('N','N',res.m(),res.n(),res.m(),ONE<T>(),LB.nzval.Data(),res.m(),Akk.Data(),res.m(),ZERO<T>(),res.Data(),res.m());
3399  for(Int i = 0; i<res.m();++i){
3400  res(i,i)-=ONE<T>();
3401  }
3402 
3403  double norm = lapack::Lange('F',res.m(),res.n(),res.Data(),res.m(),Lkk.Data());
3404  statusOFS << "After inversion, norm of residual of A(" << ksup << ", " << ksup
3405  << "): " << norm << std::endl;
3406 
3407 
3408 
3409 
3410  // Check the correctness of the matrix inversion
3411  // for the first local column
3412  // statusOFS << "Inverted A (" << ksup << ", " << ksup << "): "
3413  // << LB.nzval << std::endl;
3414 #endif
3415  } // if I need to invert the diagonal block
3416  } // for (ksup)
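// The diagonal block holds the packed LU factors of A(k,k). Since SuperLU
// has already applied the row permutation, the pivot vector passed to
// lapack::Getri is the identity (1-based). Sketch of the effect
// (illustration only):
//
//   // before: LB.nzval = [ L(k,k) \ U(k,k) ]   (packed LU factors)
//   lapack::Getri( n, LB.nzval.Data(), n, ipiv.Data() );
//   // after:  LB.nzval = [ L(k,k) * U(k,k) ]^{-1} = A(k,k)^{-1}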
3417 
3418 
3419 
3420 
3421  return ;
3422  } // ----- end of method PMatrixUnsym::PreSelInv -----
3423 
3424 
3425  template<typename T>
3426  void PMatrixUnsym<T>::ConstructCommunicationPattern ( )
3427  {
3428  ConstructCommunicationPattern_P2p();
3429  } // ----- end of method PMatrixUnsym::ConstructCommunicationPattern -----
3430 
3431 
3432 
3433  template<typename T>
3434  void PMatrixUnsym<T>::ConstructCommunicationPattern_P2p ( )
3435  {
3436 
3437  TIMER_START(ConstructCommunicationPattern);
3438 
3439  Int numSuper = this->NumSuper();
3440 
3441  TIMER_START(Allocate);
3442 
3443  this->isSendToBelow_.Resize(this->grid_->numProcRow, numSuper);
3444  this->isSendToRight_.Resize(this->grid_->numProcCol, numSuper);
3445  this->isSendToDiagonal_.Resize( numSuper );
3446  SetValue( this->isSendToBelow_, false );
3447  SetValue( this->isSendToRight_, false );
3448  SetValue( this->isSendToDiagonal_, false );
3449 
3450  this->isSendToCrossDiagonal_.Resize(this->grid_->numProcCol+1, numSuper );
3451  SetValue( this->isSendToCrossDiagonal_, false );
3452  this->isRecvFromCrossDiagonal_.Resize(this->grid_->numProcRow+1, numSuper );
3453  SetValue( this->isRecvFromCrossDiagonal_, false );
3454 
3455  this->isRecvFromAbove_.Resize( numSuper );
3456  this->isRecvFromLeft_.Resize( numSuper );
3457  this->isRecvFromBelow_.Resize( this->grid_->numProcRow, numSuper );
3458  SetValue( this->isRecvFromAbove_, false );
3459  SetValue( this->isRecvFromBelow_, false );
3460  SetValue( this->isRecvFromLeft_, false );
3461 
3462  TIMER_STOP(Allocate);
3463 
3464 
3465  TIMER_START(GetEtree);
3466  std::vector<Int> snodeEtree(this->NumSuper());
3467  this->GetEtree(snodeEtree);
3468  TIMER_STOP(GetEtree);
3469 
3470  // colBlockRowBuf stores the nonzero block structure for each local block column.
3471  // The nonzero block indices include contributions from both L and U.
3472  // Dimension: numLocalBlockCol x numNonzeroBlock
3473  std::vector<std::vector<LBlock<T> > * > colBlockRowBuf;
3474  colBlockRowBuf.resize( numSuper );
3475  std::vector<std::vector<UBlock<T> > * > rowBlockColBuf;
3476  rowBlockColBuf.resize( numSuper );
3477  for( Int ksup = 0; ksup < numSuper; ksup++ ){
3478  if( MYCOL( this->grid_ ) == PCOL( ksup, this->grid_ )
3479  || MYROW( this->grid_ ) == PROW( ksup, this->grid_ ) ){
3480  colBlockRowBuf[ksup] = new std::vector<LBlock<T> >();
3481  if( MYROW( this->grid_ ) == PROW( ksup, this->grid_ ) ){
3482  rowBlockColBuf[ksup] = new std::vector<UBlock<T> >();
3483  }
3484  }
3485  }
3486 
3487 
3488 
3489 
3490  TIMER_START(Column_communication);
3491  for( Int ksup = 0; ksup < numSuper; ksup++ ){
3492  // All block columns perform independently
3493  if( MYCOL( this->grid_ ) == PCOL( ksup, this->grid_ ) ){
3494  // Communication
3495  std::vector< LBlock<T> > & Lcol = this->L( LBj(ksup, this->grid_ ) );
3496  std::vector< LBlock<T> > & LcolRecv = *colBlockRowBuf[ksup];
3497  std::vector<char> sendBuf;
3498  std::vector<char> recvBuf;
3499  Int localSize = 0;
3500 
3501  std::stringstream sstms,sstmr;
3502  std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
3503  mask[LBlockMask::NZVAL] = 0; // nzval is excluded
3504 
3505  // All blocks are allgathered within the processor column.
3506  Int numLBlock, totalLBlock;
3507  numLBlock = Lcol.size();
3508  mpi::Allreduce ( &numLBlock, &totalLBlock, 1, MPI_SUM, this->grid_->colComm);
3509 
3510  //serialize( (Int)Lcol.size(), sstms, NO_MASK );
3511  for( Int ib = 0; ib < Lcol.size(); ib++ ){
3512  serialize( Lcol[ib], sstms, mask );
3513  }
3514  sendBuf.resize( Size( sstms ) );
3515  sstms.read( &sendBuf[0], sendBuf.size() );
3516  localSize = sendBuf.size();
3517 
3518  // allgather to get the local sizes and displacements
3519  TIMER_START(Allgatherv_Column_communication);
3520  if( this->grid_ -> mpisize != 1 )
3521  {
3522  mpi::Allgatherv( sendBuf, recvBuf, this->grid_->colComm );
3523  }
3524  else{
3525  recvBuf = sendBuf;
3526  }
3527  TIMER_STOP(Allgatherv_Column_communication);
3528 
3529 
3530  sstmr.write( &recvBuf[0], recvBuf.size() );
3531 
3532  LcolRecv.resize(totalLBlock);
3533 
3534  //deserialize( numLBlock, sstmr, NO_MASK );
3535 
3536  for( Int ib = 0; ib < totalLBlock; ib++ ){
3537  deserialize( LcolRecv[ib], sstmr, mask );
3538  }
3539 
3540 
3541  // Sort by block index; duplicates are merged in the union step below
3542  std::sort(LcolRecv.begin(),LcolRecv.end(),LBlockComparator<T>);
3543  //auto last = std::unique(LcolRecv.begin(),LcolRecv.end(),LBlockComparator<T>);
3544  //LcolRecv.resize(last - LcolRecv.begin());
3545 #if ( _DEBUGlevel_ >= 1 )
3546  statusOFS<<"["<<ksup<<"] "<<"Lcol1: ";for(auto it = Lcol.begin();it != Lcol.end();++it){statusOFS<<it->blockIdx<<" ";}statusOFS<<endl;
3547  statusOFS<<"["<<ksup<<"] "<<"LcolRecv1: ";for(auto it = LcolRecv.begin();it != LcolRecv.end();++it){statusOFS<<it->blockIdx<<" ";}statusOFS<<endl;
3548 #endif
3549  } // if( MYCOL( this->grid_ ) == PCOL( ksup, this->grid_ ) )
3550  }
3551  TIMER_STOP(Column_communication);
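// Only the block structure travels in the exchange above: the mask zeroes
// LBlockMask::NZVAL, so serialize() skips the numerical values. A minimal
// sketch of the pattern, reusing the calls from the loop above:
//
//   std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
//   mask[LBlockMask::NZVAL] = 0;               // structure only
//   serialize( Lcol[ib], sstms, mask );        // cheap, index data only
//   mpi::Allgatherv( sendBuf, recvBuf, comm ); // sizes handled internally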
3552 
3553 
3554  TIMER_START(Row_communication);
3555  for( Int ksup = 0; ksup < numSuper; ksup++ ){
3556  // All block rows perform independently
3557  if( MYROW( this->grid_ ) == PROW( ksup, this->grid_ ) ){
3558  // Communication
3559  std::vector< UBlock<T> > & Urow = this->U( LBi(ksup, this->grid_ ) );
3560  std::vector< UBlock<T> > & UrowRecv = *rowBlockColBuf[ksup];
3561  std::vector<char> sendBuf;
3562  std::vector<char> recvBuf;
3563  Int localSize = 0;
3564 
3565  std::stringstream sstms,sstmr;
3566  std::vector<Int> mask( UBlockMask::TOTAL_NUMBER, 1 );
3567  mask[UBlockMask::NZVAL] = 0; // nzval is excluded
3568 
3569  Int numUBlock, totalUBlock;
3570  numUBlock = Urow.size();
3571  mpi::Allreduce ( &numUBlock, &totalUBlock, 1, MPI_SUM, this->grid_->rowComm);
3572  // All blocks are allgathered within the processor row.
3573  //serialize( (Int)Urow.size(), sstms, NO_MASK );
3574  for( Int jb = 0; jb < Urow.size(); jb++ ){
3575  serialize( Urow[jb], sstms, mask );
3576  }
3577  sendBuf.resize( Size( sstms ) );
3578  sstms.read( &sendBuf[0], sendBuf.size() );
3579  localSize = sendBuf.size();
3580 
3581  // allgather to get the local sizes and displacements
3582  TIMER_START(Allgatherv_Row_communication);
3583  if( this->grid_ -> mpisize != 1 )
3584  mpi::Allgatherv( sendBuf, recvBuf, this->grid_->rowComm );
3585  else
3586  recvBuf = sendBuf;
3587  TIMER_STOP(Allgatherv_Row_communication);
3588 
3589  sstmr.write( &recvBuf[0], recvBuf.size() );
3590 
3591  //deserialize( numUBlock, sstmr, NO_MASK );
3592 
3593  UrowRecv.resize(totalUBlock);
3594  for( Int jb = 0; jb < totalUBlock; jb++ ){
3595  deserialize( UrowRecv[jb], sstmr, mask );
3596  }
3597 
3598  // Sort by block index; duplicates are merged in the union step below
3599  std::sort(UrowRecv.begin(),UrowRecv.end(),UBlockComparator<T>);
3600  // auto last = std::unique(UrowRecv.begin(),UrowRecv.end(),UBlockComparator<T>);
3601  // UrowRecv.resize(last - UrowRecv.begin());
3602 
3603 #if ( _DEBUGlevel_ >= 1 )
3604  statusOFS<<"["<<ksup<<"] "<<"Urow1: ";for(auto it = Urow.begin();it != Urow.end();++it){statusOFS<<it->blockIdx<<" ";}statusOFS<<endl;
3605  statusOFS<<"["<<ksup<<"] "<<"UrowRecv1: ";for(auto it = UrowRecv.begin();it != UrowRecv.end();++it){statusOFS<<it->blockIdx<<" ";}statusOFS<<endl;
3606 #endif
3607 
3608  } // if( MYROW( this->grid_ ) == PROW( ksup, this->grid_ ) )
3609  }
3610  TIMER_STOP(Row_communication);
3611 
3612  //Broadcast from diagonal processor and merge
3613  TIMER_START(Row_communication);
3614  for( Int ksup = 0; ksup < numSuper; ksup++ ){
3615  // All block rows perform independently
3616  if( MYROW( this->grid_ ) == PROW( ksup, this->grid_ ) ){
3617  // Communication
3618  std::vector< LBlock<T> > * pLcolRecv;
3619  std::vector< LBlock<T> > & LcolSend = *colBlockRowBuf[ksup];
3620  std::vector< UBlock<T> > & UrowRecv = *rowBlockColBuf[ksup];
3621  std::vector< LBlock<T> > Union;
3622  std::vector<char> sendBuf;
3623 
3624  std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
3625  mask[LBlockMask::NZVAL] = 0; // nzval is excluded
3626  if( this->grid_ -> mpisize != 1 ){
3627  pLcolRecv = new std::vector<LBlock<T> >();
3628  //diagonal processor
3629  if( MYCOL( this->grid_ ) == PCOL( ksup, this->grid_ ) ){
3630  Int localSize = 0;
3631  std::stringstream sstms;
3632 
3633  serialize( (Int)LcolSend.size(), sstms, NO_MASK );
3634  for( Int ib = 0; ib < LcolSend.size(); ib++ ){
3635  serialize( LcolSend[ib], sstms, mask );
3636  }
3637  sendBuf.resize( Size( sstms ) );
3638  sstms.read( &sendBuf[0], sendBuf.size() );
3639  localSize = sendBuf.size();
3640  }
3641  TIMER_START(Bcast_Row_communication);
3642  mpi::Bcast( sendBuf, PCOL(ksup,this->grid_), this->grid_->rowComm );
3643  TIMER_STOP(Bcast_Row_communication);
3644 
3645  std::stringstream sstmr;
3646  sstmr.write( &sendBuf[0], sendBuf.size() );
3647 
3648  Int numLBlock;
3649  deserialize( numLBlock, sstmr, NO_MASK );
3650  pLcolRecv->resize(numLBlock);
3651  for( Int ib = 0; ib < numLBlock; ib++ ){
3652  deserialize( (*pLcolRecv)[ib], sstmr, mask );
3653  }
3654  }
3655  else{
3656  pLcolRecv = &LcolSend;
3657  }
3658 
3659  // pLcolRecv is sorted with unique block indices
3660  // UrowRecv is sorted with unique block indices
3661 
3662  auto result = back_inserter(Union);
3663  auto firstUrow = UrowRecv.begin();
3664  auto lastUrow = UrowRecv.end();
3665  auto firstLcol = pLcolRecv->begin();
3666  auto lastLcol = pLcolRecv->end();
3667  while (true)
3668  {
3669  if (firstUrow==lastUrow){
3670  result = std::copy(firstLcol,lastLcol,result);
3671  break;
3672  }
3673  if (firstLcol==lastLcol){
3674  for(auto it = firstUrow; it != lastUrow; ++it){
3675  LBlock<T> LB;
3676  LB.blockIdx = it->blockIdx;
3677  LB.numRow = it->numCol;
3678  LB.numCol = it->numRow;
3679  LB.rows = it->cols;
3680  *result = LB;
3681  ++result;
3682  }
3683  break;
3684  }
3685 
3686  if (firstUrow->blockIdx<firstLcol->blockIdx) {
3687  LBlock<T> LB;
3688  LB.blockIdx = firstUrow->blockIdx;
3689  LB.numRow = firstUrow->numCol;
3690  LB.numCol = firstUrow->numRow;
3691  LB.rows = firstUrow->cols;
3692  *result = LB;
3693  ++firstUrow;
3694  }
3695  else if (firstLcol->blockIdx<firstUrow->blockIdx) {
3696  *result = *firstLcol;
3697  ++firstLcol;
3698  }
3699  else {
3700  // compute the set union of the L row indices and the U column indices
3701  std::set<Int> uRowCol;
3702  uRowCol.insert(&firstLcol->rows[0],
3703  &firstLcol->rows[0]+firstLcol->numRow);
3704  uRowCol.insert(&firstUrow->cols[0],
3705  &firstUrow->cols[0]+firstUrow->numCol);
3706 
3707  LBlock<T> LB;
3708  LB.blockIdx = firstUrow->blockIdx;
3709  LB.numRow = uRowCol.size();
3710  LB.numCol = firstUrow->numRow;
3711  LB.rows.Resize(uRowCol.size());
3712  std::copy(uRowCol.begin(),uRowCol.end(),&LB.rows[0]);
3713  *result = LB;
3714  ++firstUrow;
3715  ++firstLcol;
3716  }
3717  ++result;
3718  }
3719 
3720 #if ( _DEBUGlevel_ >= 1 )
3721  statusOFS<<"["<<ksup<<"] "<<"Lcol: ";for(auto it = pLcolRecv->begin();it != pLcolRecv->end();++it){statusOFS<<it->blockIdx<<" ";}statusOFS<<endl;
3722  statusOFS<<"["<<ksup<<"] "<<"Urow: ";for(auto it = UrowRecv.begin();it != UrowRecv.end();++it){statusOFS<<it->blockIdx<<" ";}statusOFS<<endl;
3723  statusOFS<<"["<<ksup<<"] "<<"Union: ";for(auto it = Union.begin();it != Union.end();++it){statusOFS<<it->blockIdx<<" ";}statusOFS<<endl;
3724 #endif
3725 
3726  //replace LcolSend by the union
3727  LcolSend.swap(Union);
3728 
3729  if( this->grid_ -> mpisize != 1 ){
3730  delete pLcolRecv;
3731  }
3732  } // if( MYROW( this->grid_ ) == PROW( ksup, this->grid_ ) )
3733  }
3734  TIMER_STOP(Row_communication);
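// The loop above is a two-pointer merge of two sorted sequences: the L
// column structure and the transposed U row structure. A U block (ksup,j)
// contributes an LBlock with numRow/numCol and rows/cols swapped; when
// both sides carry the same block index, the row index sets are merged
// through a std::set, which keeps them sorted and unique. The output is
// therefore again sorted by blockIdx, so LcolSend.swap(Union) preserves
// the invariant relied upon by the later phases.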
3735 
3736 
3737  TIMER_START(Col_communication);
3738  for( Int ksup = 0; ksup < numSuper; ksup++ ){
3739  // All block columns perform independently
3740  if( MYCOL( this->grid_ ) == PCOL( ksup, this->grid_ ) ){
3741  // Communication
3742  std::vector< LBlock<T> > * pUnionRecv;
3743  std::vector< LBlock<T> > & UnionSend = *colBlockRowBuf[ksup];
3744 
3745  std::vector<char> sendBuf;
3746 
3747  std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
3748  mask[LBlockMask::NZVAL] = 0; // nzval is excluded
3749  if( this->grid_ -> mpisize != 1 ){
3750  pUnionRecv = new std::vector<LBlock<T> >();
3751  //diagonal processor
3752  if( MYROW( this->grid_ ) == PROW( ksup, this->grid_ ) ){
3753  Int localSize = 0;
3754  std::stringstream sstms;
3755 
3756  serialize( (Int)UnionSend.size(), sstms, NO_MASK );
3757  for( Int ib = 0; ib < UnionSend.size(); ib++ ){
3758  serialize( UnionSend[ib], sstms, mask );
3759  }
3760  sendBuf.resize( Size( sstms ) );
3761  sstms.read( &sendBuf[0], sendBuf.size() );
3762  localSize = sendBuf.size();
3763  }
3764  TIMER_START(Bcast_Col_communication);
3765  mpi::Bcast( sendBuf, PROW(ksup,this->grid_), this->grid_->colComm );
3766  TIMER_STOP(Bcast_Col_communication);
3767 
3768  std::stringstream sstmr;
3769  sstmr.write( &sendBuf[0], sendBuf.size() );
3770 
3771  Int numLBlock;
3772  deserialize( numLBlock, sstmr, NO_MASK );
3773  pUnionRecv->resize(numLBlock);
3774  for( Int ib = 0; ib < numLBlock; ib++ ){
3775  deserialize( (*pUnionRecv)[ib], sstmr, mask );
3776  }
3777 
3778 
3779  UnionSend.resize(pUnionRecv->size());
3780  std::copy(pUnionRecv->begin(),pUnionRecv->end(),UnionSend.begin());
3781  }
3782  else{
3783  pUnionRecv = &UnionSend;
3784  }
3785 
3786 #if ( _DEBUGlevel_ >= 1 )
3787  statusOFS<<"["<<ksup<<"] "<<"Union: ";for(auto it = UnionSend.begin();it != UnionSend.end();++it){statusOFS<<it->blockIdx<<" ";}statusOFS<<endl;
3788 #endif
3789 
3790  if( this->grid_ -> mpisize != 1 ){
3791  delete pUnionRecv;
3792  }
3793  } // if( MYCOL( this->grid_ ) == PCOL( ksup, this->grid_ ) )
3794  }
3795  TIMER_STOP(Col_communication);
3796 
3797 
3798  for( Int ksup = 0; ksup < numSuper; ksup++ ){
3799  // All block rows perform independently
3800  if( MYROW( this->grid_ ) == PROW( ksup, this->grid_ ) ){
3801 
3802  std::vector< LBlock<T> > & Union = *colBlockRowBuf[ksup];
3803  std::vector< UBlock<T> > & Urow = this->U( LBi(ksup, this->grid_ ) );
3804  Int & LrowSize = this->LrowSize_[ LBi(ksup, this->grid_ ) ];
3805  TIMER_START(Computing_Lrow_size);
3806  //Allocate Lrow and extend Urow
3807  LrowSize = 0;
3808  for(auto it = Union.begin(); it!=Union.end();++it){
3809  Int Idx = it->blockIdx;
3810  if(Idx > ksup && (Idx % this->grid_->numProcCol) == MYCOL(this->grid_) ){
3811  ++LrowSize;
3812  }
3813  }
3814  TIMER_STOP(Computing_Lrow_size);
3815 
3816  TIMER_START(Extending_Urow);
3817  Urow.reserve(LrowSize);
3818 
3819  for(auto it = Union.begin(); it!=Union.end();++it){
3820  Int Idx = it->blockIdx;
3821  if(Idx > ksup && (Idx % this->grid_->numProcCol) == MYCOL(this->grid_) ){
3822  bool isFound = false;
3823  Int nextIdx = -1;
3824  for(Int jb = 0; jb < Urow.size(); ++jb){
3825  UBlock<T> & UB = Urow[jb];
3826  if(UB.blockIdx == Idx){
3827  isFound = true;
3828  nextIdx = jb;
3829  break;
3830  }
3831  if(UB.blockIdx > Idx){
3832  nextIdx = jb;
3833  break;
3834  }
3835  }
3836  if(!isFound){
3837  //push_back
3838  UBlock<T> UB;
3839  UB.blockIdx = Idx;
3840  UB.numRow = it->numCol;
3841  UB.numCol = it->numRow;
3842  UB.cols = it->rows;
3843  Urow.push_back(UB);
3844  UBlock<T> & UBl = Urow.back();
3845  UBl.nzval.Resize(UB.numRow,UB.numCol);
3846  SetValue(UBl.nzval,ZERO<T>());
3847  }
3848  else{
3849  //make sure blocks are the same size
3850  UBlock<T> & UB = Urow[nextIdx];
3851  assert(UB.numRow == it->numCol);
3852  if( UB.numCol != it->numRow ){
3853  NumMat<T> tmpNzval = UB.nzval;
3854  IntNumVec tmpCols = UB.cols;
3855 
3856  UB.numRow = it->numCol;
3857  UB.numCol = it->numRow;
3858  UB.cols = it->rows;
3859  UB.nzval.Resize(UB.numRow,UB.numCol);
3860  SetValue(UB.nzval,ZERO<T>());
3861 
3862  //now put nzvals back in place
3863  Int jOldCols = 0;
3864  for(Int j = 0; j<UB.numCol; ++j){
3865  Int newCol = UB.cols[j];
3866  if(jOldCols<tmpCols.m()){
3867  Int oldCol = tmpCols[jOldCols];
3868  if(newCol == oldCol){
3869  T * nzcolPtr = tmpNzval.VecData(jOldCols);
3870  std::copy(nzcolPtr,nzcolPtr+UB.numRow,UB.nzval.VecData(j));
3871  jOldCols++;
3872  }
3873  }
3874  else{
3875  break;
3876  }
3877  }
3878  assert(jOldCols>=tmpCols.m());
3879 
3880  }
3881 
3882  }
3883  }
3884  }
3885 #if ( _DEBUGlevel_ >= 1 )
3886  statusOFS<<"["<<ksup<<"] "<<"LrowSize = "<<LrowSize<<std::endl;
3887 #endif
3888  std::sort(Urow.begin(),Urow.end(),UBlockComparator<T>);
3889 #if ( _DEBUGlevel_ >= 1 )
3890  statusOFS<<"["<<ksup<<"] "<<"Urow extended: ";for(auto it = Urow.begin();it != Urow.end();++it){statusOFS<<it->blockIdx<<" ";}statusOFS<<endl;
3891 #endif
3892  TIMER_STOP(Extending_Urow);
3893  } // if( MYROW( this->grid_ ) == PROW( ksup, this->grid_ ) )
3894  } // for(ksup)
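// Resizing path in the loop above: when a U block already exists but the
// union reveals extra columns, the block is re-allocated and old values
// are re-inserted at their new positions. Both column lists are sorted,
// so a single forward cursor (jOldCols) suffices, and the trailing assert
// verifies that the new structure is a superset of the old one.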
3895 
3896  for( Int ksup = 0; ksup < numSuper; ksup++ ){
3897  // All block columns perform independently
3898  if( MYCOL( this->grid_ ) == PCOL( ksup, this->grid_ ) ){
3899 
3900  std::vector< LBlock<T> > & Union = *colBlockRowBuf[ksup];
3901  std::vector< LBlock<T> > & Lcol = this->L( LBj(ksup, this->grid_ ) );
3902  Int & UcolSize = this->UcolSize_[ LBj(ksup, this->grid_ ) ];
3903  TIMER_START(Computing_Ucol_size);
3904  //Allocate Ucol and extend Lcol
3905  UcolSize = 0;
3906  for(auto it = Union.begin(); it!=Union.end();++it){
3907  Int Idx = it->blockIdx;
3908  if(Idx > ksup && (Idx % this->grid_->numProcRow) == MYROW(this->grid_) ){
3909  ++UcolSize;
3910  }
3911  }
3912  TIMER_STOP(Computing_Ucol_size);
3913 
3914  TIMER_START(Extending_Lcol);
3915  Lcol.reserve(UcolSize);
3916 
3917  for(auto it = Union.begin(); it!=Union.end();++it){
3918  Int Idx = it->blockIdx;
3919  if(Idx > ksup && (Idx % this->grid_->numProcRow) == MYROW(this->grid_) ){
3920  bool isFound = false;
3921  Int nextIdx = -1;
3922  for(Int ib = 0; ib < Lcol.size(); ++ib){
3923  LBlock<T> & LB = Lcol[ib];
3924  if(LB.blockIdx == Idx){
3925  isFound = true;
3926  nextIdx = ib;
3927  break;
3928  }
3929  if(LB.blockIdx > Idx){
3930  nextIdx = ib;
3931  break;
3932  }
3933  }
3934  if(!isFound){
3935  //push_back
3936  Lcol.push_back(*it);
3937  LBlock<T> & LB = Lcol.back();
3938  LB.nzval.Resize(LB.numRow,LB.numCol);
3939  SetValue(LB.nzval,ZERO<T>());
3940  }
3941  else{
3942  //make sure blocks are the same size
3943  LBlock<T> & LB = Lcol[nextIdx];
3944  assert(LB.numCol == it->numCol);
3945  if( LB.numRow != it->numRow ){
3946  NumMat<T> tmpNzval = LB.nzval;
3947  IntNumVec tmpRows = LB.rows;
3948 
3949  LB.numRow = it->numRow;
3950  LB.numCol = it->numCol;
3951  LB.rows = it->rows;
3952  LB.nzval.Resize(LB.numRow,LB.numCol);
3953  SetValue(LB.nzval,ZERO<T>());
3954 
3955  //now put nzvals back in place
3956  Int iOldRows = 0;
3957  for(Int i = 0; i<LB.numRow; ++i){
3958  Int newRow = LB.rows[i];
3959  if(iOldRows<tmpRows.m()){
3960  Int oldRow = tmpRows[iOldRows];
3961  if(newRow == oldRow){
3962  for(Int j = 0; j<LB.numCol; ++j){
3963  LB.nzval(i,j) = tmpNzval(iOldRows,j);
3964  }
3965  iOldRows++;
3966  }
3967  }
3968  else{
3969  break;
3970  }
3971  }
3972  assert(iOldRows>=tmpRows.m());
3973 
3974  }
3975 
3976  }
3977  }
3978  }
3979 #if ( _DEBUGlevel_ >= 1 )
3980  statusOFS<<"["<<ksup<<"] "<<"UcolSize = "<<UcolSize<<std::endl;
3981 #endif
3982  std::sort(Lcol.begin(),Lcol.end(),LBlockComparator<T>);
3983 #if ( _DEBUGlevel_ >= 1 )
3984  statusOFS<<"["<<ksup<<"] "<<"Lcol extended: ";for(auto it = Lcol.begin();it != Lcol.end();++it){statusOFS<<it->blockIdx<<" ";}statusOFS<<endl;
3985 #endif
3986  TIMER_STOP(Extending_Lcol);
3987  } // if( MYCOL( this->grid_ ) == PCOL( ksup, this->grid_ ) )
3988  } // for(ksup)
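// Symmetric to the Urow extension above, with rows in place of columns.
// A tiny worked example of the realignment (illustration only): if the
// old rows are {2,5} and the new rows are {2,4,5}, then
//
//   new row 0 (global row 2) <- old row 0
//   new row 1 (global row 4) <- zero (newly added row)
//   new row 2 (global row 5) <- old row 1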
3989 
3990 
3991 
3992 
3993 
3994  // cursors to the next nonzero block in each local block row/column
3995  std::vector<Int> nextNZColBlockRow;
3996  nextNZColBlockRow.resize( this->NumLocalBlockCol(), 0 );
3997 
3998  std::vector<Int> nextNZRowBlockCol;
3999  nextNZRowBlockCol.resize( this->NumLocalBlockRow() , 0);
4000 
4001  { // scope: extend the unions with indices discovered through U
4002  for( Int ksup = 0; ksup < numSuper; ksup++ ){
4003  // All block columns perform independently
4004  std::vector<Int> tAllBlockRowIdx;
4005  if( MYCOL( this->grid_ ) == PCOL( ksup, this->grid_ ) ){
4006  // Communication
4007  std::vector< LBlock<T> > & Union = *colBlockRowBuf[ksup];
4008 
4009  std::vector<Int> tlocalBlockRowIdx;
4010  //look at previous supernodes (ideally look at children in the etree only)
4011 
4012  for( Int jsup = 0; jsup < ksup; jsup++ ){
4013  //look into U
4014  Int pjsup = snodeEtree[jsup];
4015  if(pjsup<=ksup && MYROW( this->grid_ ) == PROW( jsup, this->grid_ ) ){
4016  // if( MYCOL( this->grid_ ) == PCOL( jsup, this->grid_ ) ){
4017  std::vector< UBlock<T> > & Urow = this->U( LBi(jsup, this->grid_ ) );
4018 
4019 
4020 
4021 #if ( _DEBUGlevel_ >= 1 )
4022  statusOFS<<"["<<ksup<<"] "<<"Urow["<<jsup<<"]: "; for(auto it = Urow.begin(); it != Urow.end(); ++it){ statusOFS<<it->blockIdx<<" "; } statusOFS<<endl;
4023 #endif
4024 
4025  Int & nextIdx = nextNZRowBlockCol[ LBi(jsup, this->grid_) ];
4026  while(nextIdx<Urow.size()){
4027  if(Urow[nextIdx].blockIdx < ksup){
4028  ++nextIdx;
4029  }
4030  else{
4031  break;
4032  }
4033  }
4034  if(nextIdx<Urow.size()){
4035  if(Urow[nextIdx].blockIdx == ksup){
4036  tlocalBlockRowIdx.push_back(jsup);
4037  }
4038  }
4039  // }
4040  }
4041  }
4042 
4043 
4044 
4045 
4046 
4047  TIMER_START(Allgatherv_Column_communication);
4048  if( this->grid_ -> mpisize != 1 )
4049  mpi::Allgatherv( tlocalBlockRowIdx, tAllBlockRowIdx, this->grid_->colComm );
4050  else
4051  tAllBlockRowIdx = tlocalBlockRowIdx;
4052 
4053  TIMER_STOP(Allgatherv_Column_communication);
4054 
4055  //add dummy LBlocks to Union
4056 
4057 
4058  for(auto it = tAllBlockRowIdx.begin(); it != tAllBlockRowIdx.end(); ++it ){
4059  LBlock<T> LB;
4060  LB.numCol = 0;
4061  LB.numRow = 0;
4062  LB.blockIdx = *it;
4063  Union.push_back(LB);
4064  }
4065 
4066  std::sort(Union.begin(),Union.end(),LBlockComparator<T>);
4067 
4068 #if ( _DEBUGlevel_ >= 1 )
4069  statusOFS<<"["<<ksup<<"] "<<"NEW Union: ";
4070  for(auto it = Union.begin(); it != Union.end(); ++it){ statusOFS<<it->blockIdx<<" "; }
4071  statusOFS<<endl;
4072 #endif
4073 
4074  } // if( MYCOL( this->grid_ ) == PCOL( ksup, this->grid_ ) )
4075  }
4076 
4077  }
4078 
4079  for( Int ksup = 0; ksup < numSuper; ksup++ ){
4080  // All block rows perform independently
4081  std::vector<Int> tAllBlockColIdx;
4082  if( MYROW( this->grid_ ) == PROW( ksup, this->grid_ ) ){
4083  // Communication
4084  std::vector< LBlock<T> > & Union = *colBlockRowBuf[ksup];
4085  std::vector<Int> tlocalBlockColIdx;
4086  //look at previous supernodes (ideally look at children in the etree only)
4087  for( Int jsup = 0; jsup < ksup; jsup++ ){
4088  //look into U
4089  Int pjsup = snodeEtree[jsup];
4090  if(pjsup<=ksup && MYCOL( this->grid_ ) == PCOL( jsup, this->grid_ ) ){
4091  std::vector< LBlock<T> > & Lcol = this->L( LBj(jsup, this->grid_ ) );
4092 
4093 
4094 
4095 #if ( _DEBUGlevel_ >= 1 )
4096  statusOFS<<"["<<ksup<<"] "<<"Lcol["<<jsup<<"]: "; for(auto it = Lcol.begin(); it != Lcol.end(); ++it){ statusOFS<<it->blockIdx<<" "; } statusOFS<<endl;
4097 #endif
4098 
4099  Int & nextIdx = nextNZColBlockRow[ LBj(jsup, this->grid_) ];
4100  while(nextIdx<Lcol.size()){
4101  if(Lcol[nextIdx].blockIdx < ksup){
4102  ++nextIdx;
4103  }
4104  else{
4105  break;
4106  }
4107  }
4108  if(nextIdx<Lcol.size()){
4109  if(Lcol[nextIdx].blockIdx == ksup){
4110  tlocalBlockColIdx.push_back(jsup);
4111  }
4112  }
4113  }
4114  }
4115 
4116 
4117 
4118 
4119 
4120 
4121 
4122 
4123 
4124 
4125  TIMER_START(Allgatherv_Row_communication);
4126  if( this->grid_ -> mpisize != 1 )
4127  mpi::Allgatherv( tlocalBlockColIdx, tAllBlockColIdx, this->grid_->rowComm );
4128  else
4129  tAllBlockColIdx = tlocalBlockColIdx;
4130 
4131  TIMER_STOP(Allgatherv_Row_communication);
4132 
4133  //add dummy LBlocks to Union
4134 
4135 
4136  for(auto it = tAllBlockColIdx.begin(); it != tAllBlockColIdx.end(); ++it ){
4137  LBlock<T> LB;
4138  LB.numCol = 0;
4139  LB.numRow = 0;
4140  LB.blockIdx = *it;
4141  Union.push_back(LB);
4142  }
4143 
4144  std::sort(Union.begin(),Union.end(),LBlockComparator<T>);
4145  auto last = std::unique(Union.begin(),Union.end(),LBlockEqualComparator<T>);
4146  Union.erase(last,Union.end());
4147 
4148 #if ( _DEBUGlevel_ >= 1 )
4149  statusOFS<<"["<<ksup<<"] "<<"NEW Union: ";
4150  for(auto it = Union.begin(); it != Union.end(); ++it){ statusOFS<<it->blockIdx<<" "; }
4151  statusOFS<<endl;
4152 #endif
4153 
4154  } // if( MYROW( this->grid_ ) == PROW( ksup, this->grid_ ) )
4155  }
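// Both scans above keep a persistent cursor per local block row/column
// (nextNZRowBlockCol / nextNZColBlockRow). Since ksup increases and every
// block list is sorted by blockIdx, each cursor only moves forward, so
// the total work over all supernodes is linear in the number of local
// blocks rather than quadratic.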
4156 
4157 
4158 
4159 
4160 
4161  TIMER_START(STB_RFA);
4162  for( Int ksup = 0; ksup < numSuper - 1; ksup++ ){
4163  // Walk up the elimination tree: loop over the ancestors of ksup (to its right)
4164 
4165 
4166  Int jsup = snodeEtree[ksup];
4167  while(jsup<numSuper){
4168  Int jsupLocalBlockCol = LBj( jsup, this->grid_ );
4169  Int jsupProcCol = PCOL( jsup, this->grid_ );
4170  if( MYCOL( this->grid_ ) == jsupProcCol ){
4171  std::vector< LBlock<T> > & Union = *colBlockRowBuf[jsup];
4172 
4173  // SendToBelow / RecvFromAbove only if (ksup, jsup) is nonzero.
4174  Int isFound = 0;
4175  for(auto it = Union.begin();it!=Union.end();++it){
4176  if(it->blockIdx == ksup){
4177  isFound = 1;
4178  break;
4179  }
4180  else if(it->blockIdx > ksup){
4181  break;
4182  }
4183  }
4184  if( isFound ) {
4185  for( auto si = Union.begin(); si != Union.end(); si++ ){
4186  Int isup = si->blockIdx;
4187  Int isupProcRow = PROW( isup, this->grid_ );
4188  if( isup > ksup ){
4189  if( MYROW( this->grid_ ) == isupProcRow ){
4190  this->isRecvFromAbove_(ksup) = true;
4191  }
4192  if( MYROW( this->grid_ ) == PROW( ksup, this->grid_ ) ){
4193  this->isSendToBelow_( isupProcRow, ksup ) = true;
4194  }
4195  } // if( isup > ksup )
4196  } // for (si)
4197  } // if( isFound )
4198 
4199  } // if( MYCOL( this->grid_ ) == PCOL( jsup, this->grid_ ) )
4200  jsup = snodeEtree[jsup];
4201  } // for(jsup)
4202  } // for(ksup)
4203 
4204 #if ( _DEBUGlevel_ >= 1 )
4205  statusOFS << std::endl << "this->isSendToBelow:" << std::endl;
4206  for(int j = 0;j< this->isSendToBelow_.n();j++){
4207  statusOFS << "["<<j<<"] ";
4208  for(int i =0; i < this->isSendToBelow_.m();i++){
4209  statusOFS<< this->isSendToBelow_(i,j) << " ";
4210  }
4211  statusOFS<<std::endl;
4212  }
4213 
4214  statusOFS << std::endl << "this->isRecvFromAbove:" << std::endl;
4215  for(int j = 0;j< this->isRecvFromAbove_.m();j++){
4216  statusOFS << "["<<j<<"] "<< this->isRecvFromAbove_(j)<<std::endl;
4217  }
4218 #endif
4219 
4220 
4221 
4222  TIMER_STOP(STB_RFA);
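// Summary of the flags set above: within a processor column owning an
// ancestor jsup whose union contains ksup, the processor row of ksup
// sends its blocks of supernode ksup downwards
// ( isSendToBelow_(PROW(isup),ksup) ), and every processor row holding a
// block isup > ksup in that column expects them ( isRecvFromAbove_(ksup) ).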
4223 
4224 
4225 
4226 
4227 
4228 
4229 
4230 
4231  TIMER_START(STR_RFL_RFB);
4232 
4233 
4234  for( Int ksup = 0; ksup < numSuper - 1; ksup++ ){
4235  // Walk up the elimination tree: loop over the ancestors of ksup (below it)
4236 
4237  Int isup = snodeEtree[ksup];
4238  while(isup<numSuper){
4239  Int isupLocalBlockRow = LBi( isup, this->grid_ );
4240  Int isupProcRow = PROW( isup, this->grid_ );
4241  if( MYROW( this->grid_ ) == isupProcRow ){
4242  std::vector< LBlock<T> > & Union = *colBlockRowBuf[isup];
4243 
4244  // SendToRight / RecvFromLeft only if (isup, ksup) is nonzero.
4245  Int isFound = 0;
4246  for(auto it = Union.begin();it!=Union.end();++it){
4247  if(it->blockIdx == ksup){
4248  isFound = 1;
4249  break;
4250  }
4251  else if(it->blockIdx > ksup){
4252  break;
4253  }
4254  }
4255  if( isFound ) {
4256  for( auto si = Union.begin(); si != Union.end(); si++ ){
4257  Int jsup = si->blockIdx;
4258  Int jsupProcCol = PCOL( jsup, this->grid_ );
4259  if( jsup > ksup ){
4260  if( MYCOL( this->grid_ ) == jsupProcCol ){
4261  this->isRecvFromLeft_(ksup) = true;
4262  }
4263  if( MYCOL( this->grid_ ) == PCOL( ksup, this->grid_ ) ){
4264  this->isSendToRight_( jsupProcCol, ksup ) = true;
4265  }
4266  }
4267  } // for (si)
4268  } // if( isFound )
4269  } // if( MYROW( this->grid_ ) == isupProcRow )
4270 
4271  if( MYCOL( this->grid_ ) == PCOL(ksup, this->grid_) ){
4272  if( MYROW( this->grid_ ) == PROW( ksup, this->grid_ ) ){
4273  this->isRecvFromBelow_(isupProcRow,ksup) = true;
4274  }
4275  else if (MYROW(this->grid_) == isupProcRow){
4276  this->isSendToDiagonal_(ksup)=true;
4277  }
4278  } // if( MYCOL( this->grid_ ) == PCOL(ksup, this->grid_) )
4279  isup = snodeEtree[isup];
4280  } // for (isup)
4281  } // for (ksup)
4282 
4283 #if ( _DEBUGlevel_ >= 1 )
4284  statusOFS << std::endl << "this->isSendToRight:" << std::endl;
4285  for(int j = 0;j< this->isSendToRight_.n();j++){
4286  statusOFS << "["<<j<<"] ";
4287  for(int i =0; i < this->isSendToRight_.m();i++){
4288  statusOFS<< this->isSendToRight_(i,j) << " ";
4289  }
4290  statusOFS<<std::endl;
4291  }
4292 
4293  statusOFS << std::endl << "this->isRecvFromLeft:" << std::endl;
4294  for(int j = 0;j< this->isRecvFromLeft_.m();j++){
4295  statusOFS << "["<<j<<"] "<< this->isRecvFromLeft_(j)<<std::endl;
4296  }
4297 
4298  statusOFS << std::endl << "this->isRecvFromBelow:" << std::endl;
4299  for(int j = 0;j< this->isRecvFromBelow_.n();j++){
4300  statusOFS << "["<<j<<"] ";
4301  for(int i =0; i < this->isRecvFromBelow_.m();i++){
4302  statusOFS<< this->isRecvFromBelow_(i,j) << " ";
4303  }
4304  statusOFS<<std::endl;
4305  }
4306 #endif
4307 
4308 
4309  TIMER_STOP(STR_RFL_RFB);
4310 
4311 
4312 
4313 
4314  TIMER_START(STCD_RFCD);
4315 
4316 
4317  for( Int ksup = 0; ksup < numSuper - 1; ksup++ ){
4318  if( MYCOL( this->grid_ ) == PCOL( ksup, this->grid_ ) ){
4319  std::vector< LBlock<T> > & Union = *colBlockRowBuf[ksup];
4320  for( auto si = Union.begin(); si != Union.end(); si++ ){
4321  Int isup = si->blockIdx;
4322  Int isupProcRow = PROW( isup, this->grid_ );
4323  Int isupProcCol = PCOL( isup, this->grid_ );
4324  if( isup > ksup && MYROW( this->grid_ ) == isupProcRow ){
4325  this->isSendToCrossDiagonal_(this->grid_->numProcCol, ksup ) = true;
4326  this->isSendToCrossDiagonal_(isupProcCol, ksup ) = true;
4327  }
4328  } // for (si)
4329  } // if( MYCOL( this->grid_ ) == PCOL( ksup, this->grid_ ) )
4330  } // for (ksup)
4331 
4332  for( Int ksup = 0; ksup < numSuper - 1; ksup++ ){
4333  if( MYROW( this->grid_ ) == PROW( ksup, this->grid_ ) ){
4334  std::vector< LBlock<T> > & Union = *colBlockRowBuf[ksup];
4335  for( auto si = Union.begin(); si != Union.end(); si++ ){
4336  Int jsup = si->blockIdx;
4337  Int jsupProcCol = PCOL( jsup, this->grid_ );
4338  Int jsupProcRow = PROW( jsup, this->grid_ );
4339  if( jsup > ksup && MYCOL(this->grid_) == jsupProcCol ){
4340  this->isRecvFromCrossDiagonal_(this->grid_->numProcRow, ksup ) = true;
4341  this->isRecvFromCrossDiagonal_(jsupProcRow, ksup ) = true;
4342  }
4343  } // for (si)
4344  } // if( MYROW( this->grid_ ) == PROW( ksup, this->grid_ ) )
4345  } // for (ksup)
4346 #if ( _DEBUGlevel_ >= 1 )
4347  statusOFS << std::endl << "this->isSendToCrossDiagonal:" << std::endl;
4348  for(int j =0; j < this->isSendToCrossDiagonal_.n();j++){
4349  if(this->isSendToCrossDiagonal_(this->grid_->numProcCol,j)){
4350  statusOFS << "["<<j<<"] ";
4351  for(int i =0; i < this->isSendToCrossDiagonal_.m()-1;i++){
4352  if(this->isSendToCrossDiagonal_(i,j))
4353  {
4354  statusOFS<< PNUM(PROW(j,this->grid_),i,this->grid_)<<" ";
4355  }
4356  }
4357  statusOFS<<std::endl;
4358  }
4359  }
4360 
4361  statusOFS << std::endl << "this->isRecvFromCrossDiagonal:" << std::endl;
4362  for(int j =0; j < this->isRecvFromCrossDiagonal_.n();j++){
4363  if(this->isRecvFromCrossDiagonal_(this->grid_->numProcRow,j)){
4364  statusOFS << "["<<j<<"] ";
4365  for(int i =0; i < this->isRecvFromCrossDiagonal_.m()-1;i++){
4366  if(this->isRecvFromCrossDiagonal_(i,j))
4367  {
4368  statusOFS<< PNUM(i,PCOL(j,this->grid_),this->grid_)<<" ";
4369  }
4370  }
4371  statusOFS<<std::endl;
4372  }
4373  }
4374 
4375 
4376 #endif
4377 
4378 
4379  TIMER_STOP(STCD_RFCD);
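// Cross-diagonal exchange: a block L(isup,ksup) held at processor
// ( PROW(isup), PCOL(ksup) ) is needed to build U(ksup,isup) at the
// mirrored processor ( PROW(ksup), PCOL(isup) ). The extra column
// numProcCol (resp. row numProcRow) of the flag arrays serves as an
// aggregate "participates at all" flag for the whole supernode.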
4380 
4381  for( Int ksup = 0; ksup < numSuper; ksup++ ){
4382  if( MYCOL( this->grid_ ) == PCOL( ksup, this->grid_ )
4383  || MYROW( this->grid_ ) == PROW( ksup, this->grid_ ) ){
4384  delete colBlockRowBuf[ksup];
4385  if( MYROW( this->grid_ ) == PROW( ksup, this->grid_ ) ){
4386  delete rowBlockColBuf[ksup];
4387  }
4388  }
4389  }
4390 
4391  //Build the list of supernodes based on the elimination tree from SuperLU
4392  this->GetWorkSet(snodeEtree,this->WorkingSet());
4393 
4394  TIMER_STOP(ConstructCommunicationPattern);
4395 
4396 
4397  return ;
4398  } // ----- end of method PMatrixUnsym::ConstructCommunicationPattern_P2p -----
4399 
4400 
4401  } // namespace PEXSI
4402 
4403 #endif //_PEXSI_PSELINV_UNSYM_IMPL_HPP_
void UnpackData(SuperNodeBufferTypeUnsym &snode, std::vector< LBlock< T > > &LcolRecv, std::vector< LBlock< T > > &LrowRecv, std::vector< UBlock< T > > &UcolRecv, std::vector< UBlock< T > > &UrowRecv)
UnpackData.
Definition: pselinv_unsym_impl.hpp:1659
void SelInvIntra_P2p(Int lidx)
SelInvIntra_P2p.
Definition: pselinv_unsym_impl.hpp:1857
Int blockIdx
Block index (supernodal index)
Definition: pselinv.hpp:248
void ComputeDiagUpdate(SuperNodeBufferTypeUnsym &snode)
ComputeDiagUpdate.
Definition: pselinv_unsym_impl.hpp:1794
Interface with SuperLU_Dist (version 3.0 and later)
Int PROW(Int bnum, const GridType *g)
PROW returns the processor row that the bnum-th block (supernode) belongs to.
Definition: pselinv.hpp:304
Int numRow
Number of nonzero rows.
Definition: pselinv.hpp:199
void SelInv_lookup_indexes(SuperNodeBufferTypeUnsym &snode, std::vector< LBlock< T > > &LcolRecv, std::vector< LBlock< T > > &LrowRecv, std::vector< UBlock< T > > &UcolRecv, std::vector< UBlock< T > > &UrowRecv, NumMat< T > &AinvBuf, NumMat< T > &LBuf, NumMat< T > &UBuf)
SelInv_lookup_indexes.
Definition: pselinv_unsym_impl.hpp:100
Int MYPROC(const GridType *g)
MYPROC returns the current processor rank.
Definition: pselinv.hpp:291
Int PCOL(Int bnum, const GridType *g)
PCOL returns the processor column that the bnum-th block (supernode) belongs to.
Definition: pselinv.hpp:309
Int LBi(Int bnum, const GridType *g)
LBi returns the local block number on the processor at processor row PROW( bnum, g ).
Definition: pselinv.hpp:319
Profiling and timing using TAU.
Int SuperSize(Int bnum, const SuperNodeType *s)
SuperSize returns the size of the block bnum.
Definition: pselinv.hpp:360
Int LBj(Int bnum, const GridType *g)
LBj returns the local block number on the processor at processor column PCOL( bnum, g ).
Definition: pselinv.hpp:324
Int GBj(Int jLocal, const GridType *g)
GBj returns the global block number from a local block number in the column direction.
Definition: pselinv.hpp:334
void SetValue(NumMat< F > &M, F val)
SetValue sets a numerical matrix to a constant val.
Definition: NumMat_impl.hpp:153
Int numCol
Number of nonzero columns.
Definition: pselinv.hpp:202
Definition: pselinv_unsym_impl.hpp:402
LBlock stores a nonzero block in the lower triangular part or the diagonal part in PSelInv...
Definition: pselinv.hpp:193
Int PNUM(Int i, Int j, const GridType *g)
PNUM returns the processor rank that the bnum-th block (supernode) belongs to.
Definition: pselinv.hpp:314
Int GBi(Int iLocal, const GridType *g)
GBi returns the global block number from a local block number in the row direction.
Definition: pselinv.hpp:329
Int MYCOL(const GridType *g)
MYCOL returns my processor column.
Definition: pselinv.hpp:299
virtual void ConstructCommunicationPattern()
ConstructCommunicationPattern constructs the communication pattern to be used later in the selected inversion.
Definition: pselinv_unsym_impl.hpp:3426
IntNumVec rows
Dimension numRow * 1, indices (0-based) of the nonzero rows.
Definition: pselinv.hpp:205
Int numCol
Number of nonzero columns.
Definition: pselinv.hpp:254
UBlock stores a nonzero block in the upper triangular part in PSelInv.
Definition: pselinv.hpp:245
Definition: pselinv_unsym.hpp:198
Int FirstBlockCol(Int bnum, const SuperNodeType *s)
FirstBlockCol returns the first column of a block bnum.
Definition: pselinv.hpp:348
Int blockIdx
Block index (supernodal index)
Definition: pselinv.hpp:196
void SendRecvCD(std::vector< SuperNodeBufferTypeUnsym > &arrSuperNodes, Int stepSuper)
SendRecvCD.
Definition: pselinv_unsym_impl.hpp:1068
Int numRow
Number of nonzero rows.
Definition: pselinv.hpp:251
virtual void SelInv()
SelInv is the main function for the selected inversion.
Definition: pselinv_unsym_impl.hpp:3163
void ConstructCommunicationPattern_P2p()
ConstructCommunicationPattern_P2p constructs the communication pattern to be used later in the select...
Definition: pselinv_unsym_impl.hpp:3434
NumMat< T > nzval
Dimension numRow * numCol, nonzero elements.
Definition: pselinv.hpp:260
Int NumSuper(const SuperNodeType *s)
NumSuper returns the total number of supernodes.
Definition: pselinv.hpp:364
NumMat< T > nzval
Dimension numRow * numCol, nonzero elements.
Definition: pselinv.hpp:209
virtual void PreSelInv()
PreSelInv prepares the structure in L_ and U_ so that SelInv only involves matrix-matrix multiplications.
Definition: pselinv_unsym_impl.hpp:3205
IntNumVec cols
Dimension numCol * 1, indices (0-based) of the nonzero columns.
Definition: pselinv.hpp:257
void SelInv_P2p()
Point-to-point version of the selected inversion.
Definition: pselinv_unsym_impl.hpp:3169
PMatrixUnsym contains the main data structure and the computational routine for the parallel selected...
Definition: pselinv.hpp:1003
Int MYROW(const GridType *g)
MYROW returns my processor row.
Definition: pselinv.hpp:295