PEXSI
 All Classes Namespaces Files Functions Variables Typedefs Pages
pselinv_unsym_impl.hpp
Go to the documentation of this file.
1 /*
2  Copyright (c) 2012 The Regents of the University of California,
3  through Lawrence Berkeley National Laboratory.
4 
5 Authors: Lin Lin and Mathias Jacquelin
6 
7 This file is part of PEXSI. All rights reserved.
8 
9 Redistribution and use in source and binary forms, with or without
10 modification, are permitted provided that the following conditions are met:
11 
12 (1) Redistributions of source code must retain the above copyright notice, this
13 list of conditions and the following disclaimer.
14 (2) Redistributions in binary form must reproduce the above copyright notice,
15 this list of conditions and the following disclaimer in the documentation
16 and/or other materials provided with the distribution.
17 (3) Neither the name of the University of California, Lawrence Berkeley
18 National Laboratory, U.S. Dept. of Energy nor the names of its contributors may
19 be used to endorse or promote products derived from this software without
20 specific prior written permission.
21 
22 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
23 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
24 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
25 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
26 ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
27 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
29 ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 
33 You are under no obligation whatsoever to provide any bug fixes, patches, or
34 upgrades to the features, functionality or performance of the source code
35 ("Enhancements") to anyone; however, if you choose to make your Enhancements
36 available either publicly, or directly to Lawrence Berkeley National
37 Laboratory, without imposing a separate written license agreement for such
38 Enhancements, then you hereby grant the following license: a non-exclusive,
39 royalty-free perpetual license to install, use, modify, prepare derivative
40 works, incorporate into other computer software, distribute, and sublicense
41 such enhancements or derivative works thereof, in binary and source code form.
42  */
46 #ifndef _PEXSI_PSELINV_UNSYM_IMPL_HPP_
47 #define _PEXSI_PSELINV_UNSYM_IMPL_HPP_
48 
49 #include "pexsi/timer.h"
51 #include <algorithm>
52 
53 
54 
55 
56 namespace PEXSI{
57  template<typename T>
58  PMatrixUnsym<T>::PMatrixUnsym (
59  const GridType* g,
60  const SuperNodeType* s,
61  const PSelInvOptions * o,
62  const SuperLUOptions * oLU
63  )
64  {
65 #ifndef _RELEASE_
66  PushCallStack("PMatrixUnsym::PMatrixUnsym");
67 #endif
68 
69  this->Setup( g, s, o, oLU );
70 
71 #ifndef _RELEASE_
72  PopCallStack();
73 #endif
74  return ;
75  } // ----- end of method PMatrixUnsym::PMatrixUnsym -----
76 
77 
78  template<typename T>
79  void PMatrixUnsym<T>::Setup(
80  const GridType* g,
81  const SuperNodeType* s,
82  const PSelInvOptions * o,
83  const SuperLUOptions * oLU
84  ) {
85 #ifndef _RELEASE_
86  PushCallStack("PMatrixUnsym::Setup");
87 #endif
88 
89  PMatrix<T>::Setup(g,s,o,oLU);
90 
91 
92 
93  Lrow_.clear();
94  Ucol_.clear();
95  Lrow_.resize( this->NumLocalBlockRow() );
96  Ucol_.resize( this->NumLocalBlockCol() );
97 
98  LrowSize_.clear();
99  UcolSize_.clear();
100  LrowSize_.resize( this->NumLocalBlockRow() );
101  UcolSize_.resize( this->NumLocalBlockCol() );
102 
103 #ifndef _RELEASE_
104  PopCallStack();
105 #endif
106  return ;
107  } // ----- end of method PMatrixUnsym::Setup -----
108 
109 
111  template<typename T>
113  SuperNodeBufferTypeUnsym & snode,
114  std::vector<LBlock<T> > & LcolRecv,
115  std::vector<LBlock<T> > & LrowRecv,
116  std::vector<UBlock<T> > & UcolRecv,
117  std::vector<UBlock<T> > & UrowRecv, /*useless so far*/
118  NumMat<T> & AinvBuf,
119  NumMat<T> & LBuf,
120  NumMat<T> & UBuf )
121  {
122  TIMER_START(Compute_Sinv_LT_Lookup_Indexes);
123 
124 
125 
126 
127 
128  TIMER_START(Build_colptr_rowptr);
129 
130 #if ( _DEBUGlevel_ >= 2 )
131  statusOFS<<"UrowRecv blockIdx: "<<std::endl;
132  for( Int jb = 0; jb < UrowRecv.size(); jb++ ){
133  statusOFS<<UrowRecv[jb].blockIdx<<" ";
134  }
135  statusOFS<<std::endl;
136 
137  statusOFS<<"UcolRecv blockIdx: "<<std::endl;
138  for( Int jb = 0; jb < UcolRecv.size(); jb++ ){
139  statusOFS<<UcolRecv[jb].blockIdx<<" ";
140  }
141  statusOFS<<std::endl;
142 
143  statusOFS<<"LcolRecv blockIdx: "<<std::endl;
144  for( Int jb = 0; jb < LcolRecv.size(); jb++ ){
145  statusOFS<<LcolRecv[jb].blockIdx<<" ";
146  }
147  statusOFS<<std::endl;
148 
149 
150  statusOFS<<"LrowRecv blockIdx: "<<std::endl;
151  for( Int jb = 0; jb < LrowRecv.size(); jb++ ){
152  statusOFS<<LrowRecv[jb].blockIdx<<" ";
153  }
154  statusOFS<<std::endl;
155 #endif
156 
157  TIMER_START(Sort_LrowRecv_UcolRecv);
158  //LrowRecv should be sorted by blockIdx, same for UcolRecv
159  std::sort(LrowRecv.begin(),LrowRecv.end(),LBlockComparator<T>);
160  std::sort(UcolRecv.begin(),UcolRecv.end(),UBlockComparator<T>);
161  TIMER_STOP(Sort_LrowRecv_UcolRecv);
162 
163  // rowPtrL[ib] gives the row index in snode.LUpdateBuf for the first
164  // nonzero row in LcolRecv[ib]. The total number of rows in
165  // snode.LUpdateBuf is given by rowPtr[end]-1
166  std::vector<Int> rowPtrL(LcolRecv.size() + 1);
167  rowPtrL[0] = 0;
168  for( Int ib = 0; ib < LcolRecv.size(); ib++ ){
169  LBlock<T> & LB = LcolRecv[ib];
170  rowPtrL[ib+1] = rowPtrL[ib] + LB.numRow;
171  }
172 
173  // colPtrL[jb] gives the column index in LBuf for the first
174  // nonzero column in UrowRecv[jb]. The total number of rows in
175  // LBuf is given by colPtr[end]-1
176 
177  std::vector<Int> colPtrL(LrowRecv.size() + 1);
178  std::vector<Int> colPtrU(UcolRecv.size() + 1,-1);
179  colPtrL[0] = 0;
180  for( Int jb = 0; jb < LrowRecv.size(); jb++ ){
181  LBlock<T> & LB = LrowRecv[jb];
182  colPtrL[jb+1] = colPtrL[jb] + LB.numCol;
183  }
184 
185  colPtrU[0] = 0;
186  for( Int jb = 0; jb < UcolRecv.size(); jb++ ){
187  UBlock<T> & UB = UcolRecv[jb];
188  colPtrU[jb+1] = colPtrU[jb] + UB.numCol;
189  }
190 
191 #if ( _DEBUGlevel_ >= 1 )
192  statusOFS<<rowPtrL<<std::endl;
193  statusOFS<<colPtrL<<std::endl;
194  statusOFS<<colPtrU<<std::endl;
195 #endif
196 
197 
198  Int numRowAinvBuf = *colPtrU.rbegin();
199  //Int numRowAinvBuf = *rowPtrL.rbegin();
200  Int numColAinvBuf = *colPtrL.rbegin();
201  TIMER_STOP(Build_colptr_rowptr);
202 
203  TIMER_START(Allocate_lookup);
204  // Allocate for the computational storage
205  AinvBuf.Resize( numRowAinvBuf, numColAinvBuf );
206  LBuf.Resize( SuperSize( snode.Index, this->super_ ), numColAinvBuf );
207  UBuf.Resize( SuperSize( snode.Index, this->super_ ), numRowAinvBuf );
208  TIMER_STOP(Allocate_lookup);
209 
210 
211 
212  TIMER_START(Fill_LBuf);
213  // Fill LBuf first. Make the transpose later in the Gemm phase.
214  for( Int jb = 0; jb < LrowRecv.size(); jb++ ){
215  LBlock<T>& LB = LrowRecv[jb];
216  if(1 || (LB.numRow>0 && LB.numCol>0)){
217  if( LB.numRow != SuperSize(snode.Index, this->super_) ){
218 #ifdef USE_ABORT
219  statusOFS<<"The size of LB is not right. Something is seriously wrong."<<std::endl;
220  abort();
221 #endif
222  throw std::logic_error(
223  "The size of LB is not right. Something is seriously wrong." );
224  }
225  lapack::Lacpy( 'A', LB.numRow, LB.numCol, LB.nzval.Data(),
226  LB.numRow, LBuf.VecData( colPtrL[jb] ), LBuf.m() );
227  }
228  }
229  TIMER_STOP(Fill_LBuf);
230 
231  TIMER_START(Fill_UBuf);
232  // Fill UBuf first.
233  SetValue(UBuf, ZERO<T>());
234  for( Int jb = 0; jb < UcolRecv.size(); jb++ ){
235  UBlock<T>& UB = UcolRecv[jb];
236 
237  if(1||(UB.numRow>0 && UB.numCol>0)){
238  if( UB.numRow != SuperSize(snode.Index, this->super_) ){
239 #ifdef USE_ABORT
240  statusOFS<<"The size of UB is not right. Something is seriously wrong."<<std::endl;
241  abort();
242 #endif
243  throw std::logic_error(
244  "The size of UB is not right. Something is seriously wrong." );
245  }
246 
247 
248  lapack::Lacpy( 'A', UB.numRow, UB.numCol, UB.nzval.Data(),
249  UB.numRow, UBuf.VecData( colPtrU[jb] ), UBuf.m() );
250  }
251  }
252  TIMER_STOP(Fill_UBuf);
253 
254  // Calculate the relative indices for (isup, jsup)
255  // Fill AinvBuf with the information in L or U block.
256  TIMER_START(JB_Loop);
257 // for( Int jb = 0; jb < UrowRecv.size(); jb++ ){
258  for( Int jb = 0; jb < LrowRecv.size(); jb++ ){
259  LBlock<T>& LrowB = LrowRecv[jb];
260  Int jsup = LrowB.blockIdx;
261 
262  Int SinvColsSta = FirstBlockCol( jsup, this->super_ );
263 
264 
265  for( Int ib = 0; ib < LcolRecv.size(); ib++ ){
266  LBlock<T>& LB = LcolRecv[ib];
267  Int isup = LB.blockIdx;
268  Int SinvRowsSta = FirstBlockCol( isup, this->super_ );
269  T* nzvalAinv = &AinvBuf( rowPtrL[ib], colPtrL[jb] );
270  Int ldAinv = numRowAinvBuf;
271 
272  // Pin down the corresponding block in the part of Sinv.
273  if( isup >= jsup ){
274  // Column relative indicies
275  std::vector<Int> relCols( LrowB.numCol );
276  for( Int j = 0; j < LrowB.numCol; j++ ){
277  relCols[j] = LrowB.rows[j] - SinvColsSta;
278  }
279 
280  std::vector<LBlock<T> >& LcolSinv = this->L( LBj(jsup, this->grid_ ));
281  bool isBlockFound = false;
282  TIMER_START(PARSING_ROW_BLOCKIDX);
283  for( Int ibSinv = 0; ibSinv < LcolSinv.size(); ibSinv++ ){
284  // Found the (isup, jsup) block in Sinv
285  if( LcolSinv[ibSinv].blockIdx == isup ){
286  LBlock<T>& SinvB = LcolSinv[ibSinv];
287 
288  // Row relative indices
289  std::vector<Int> relRows( LB.numRow );
290  Int* rowsLBPtr = LB.rows.Data();
291  Int* rowsSinvBPtr = SinvB.rows.Data();
292 
293  TIMER_START(STDFIND_ROW);
294  Int * pos =&rowsSinvBPtr[0];
295  Int * last =&rowsSinvBPtr[SinvB.numRow];
296  for( Int i = 0; i < LB.numRow; i++ ){
297  pos = std::upper_bound(pos, last, rowsLBPtr[i])-1;
298  if(pos != last){
299  relRows[i] = (Int)(pos - rowsSinvBPtr);
300  }
301  else{
302  std::ostringstream msg;
303  msg << "Row " << rowsLBPtr[i] <<
304  " in LB cannot find the corresponding row in SinvB"
305  << std::endl
306  << "LB.rows = " << LB.rows << std::endl
307  << "SinvB.rows = " << SinvB.rows << std::endl;
308 #ifdef USE_ABORT
309  abort();
310 #endif
311  throw std::runtime_error( msg.str().c_str() );
312  }
313  }
314  TIMER_STOP(STDFIND_ROW);
315 
316  TIMER_START(Copy_Sinv_to_Ainv);
317  // Transfer the values from Sinv to AinvBlock
318  T* nzvalSinv = SinvB.nzval.Data();
319  Int ldSinv = SinvB.numRow;
320  for( Int j = 0; j < LrowB.numCol; j++ ){
321  for( Int i = 0; i < LB.numRow; i++ ){
322 // AinvBuf( rowPtrL[ib] + i, colPtrL[jb] + j ) =
323 // SinvB.nzval(relRows[i],relCols[j]);
324  nzvalAinv[i + j*ldAinv] =
325  nzvalSinv[relRows[i]+relCols[j]*ldSinv];
326  }
327  }
328  TIMER_STOP(Copy_Sinv_to_Ainv);
329 
330  isBlockFound = true;
331  break;
332  }
333  } // for (ibSinv )
334  TIMER_STOP(PARSING_ROW_BLOCKIDX);
335  if( isBlockFound == false ){
336  std::ostringstream msg;
337  msg << "["<<snode.Index<<"] "<<"Block(" << isup << ", " << jsup
338  << ") did not find a matching block in Sinv." << std::endl;
339 //#if ( _DEBUGlevel_ >= 1 )
340  statusOFS<<msg.str();
341 //#endif
342 //#ifdef USE_ABORT
343 // abort();
344 //#endif
345 // throw std::runtime_error( msg.str().c_str() );
346  }
347  } // if (isup, jsup) is in L
348  else{
349  // Row relative indices
350  std::vector<Int> relRows( LB.numRow );
351  for( Int i = 0; i < LB.numRow; i++ ){
352  relRows[i] = LB.rows[i] - SinvRowsSta;
353  }
354 
355  std::vector<UBlock<T> >& UrowSinv =
356  this->U( LBi( isup, this->grid_ ) );
357  bool isBlockFound = false;
358  TIMER_START(PARSING_COL_BLOCKIDX);
359  for( Int jbSinv = 0; jbSinv < UrowSinv.size(); jbSinv++ ){
360  // Found the (isup, jsup) block in Sinv
361  if( UrowSinv[jbSinv].blockIdx == jsup ){
362  UBlock<T>& SinvB = UrowSinv[jbSinv];
363 
364  // Column relative indices
365  std::vector<Int> relCols( LrowB.numCol );
366  Int* rowsLrowBPtr = LrowB.rows.Data();
367  Int* colsSinvBPtr = SinvB.cols.Data();
368 
369  TIMER_START(STDFIND_COL);
370  Int * pos =&colsSinvBPtr[0];
371  Int * last =&colsSinvBPtr[SinvB.numCol];
372  for( Int j = 0; j < LrowB.numCol; j++ ){
373  //rowsLrowB is sorted
374  pos = std::upper_bound(pos, last, rowsLrowBPtr[j])-1;
375  if(pos !=last){
376  relCols[j] = (Int)(pos - colsSinvBPtr);
377  }
378  else{
379  std::ostringstream msg;
380  msg << "Col " << rowsLrowBPtr[j] <<
381  " in UB cannot find the corresponding row in SinvB"
382  << std::endl
383  << "LrowB.rows = " << LrowB.rows << std::endl
384  << "UinvB.cols = " << SinvB.cols << std::endl;
385 #ifdef USE_ABORT
386  abort();
387 #endif
388  throw std::runtime_error( msg.str().c_str() );
389  }
390  }
391  TIMER_STOP(STDFIND_COL);
392 
393  TIMER_START(Copy_Sinv_to_Ainv);
394  // Transfer the values from Sinv to AinvBlock
395  T* nzvalSinv = SinvB.nzval.Data();
396  Int ldSinv = SinvB.numRow;
397  for( Int j = 0; j < LrowB.numCol; j++ ){
398  for( Int i = 0; i < LB.numRow; i++ ){
399 // AinvBuf( rowPtrL[ib] + i, colPtrL[jb] + j ) =
400 // SinvB.nzval(relRows[i],relCols[j]);
401 
402  nzvalAinv[i + j*ldAinv] =
403  nzvalSinv[relRows[i]+relCols[j]*ldSinv];
404  }
405  }
406  TIMER_STOP(Copy_Sinv_to_Ainv);
407 
408  isBlockFound = true;
409  break;
410  }
411  } // for (jbSinv)
412  TIMER_STOP(PARSING_COL_BLOCKIDX);
413  if( isBlockFound == false ){
414  std::ostringstream msg;
415  msg << "["<<snode.Index<<"] "<< "Block(" << isup << ", " << jsup
416  << ") did not find a matching block in Sinv." << std::endl;
417  statusOFS<<msg.str();
418 //#ifdef USE_ABORT
419 // abort();
420 //#endif
421 // throw std::runtime_error( msg.str().c_str() );
422  }
423  } // if (isup, jsup) is in U
424 
425  } // for( ib )
426  } // for ( jb )
427  TIMER_STOP(JB_Loop);
428 
429  TIMER_STOP(Compute_Sinv_LT_Lookup_Indexes);
430  } // End of method PMatrixUnsym::Selinv_lookup_indexes
431 
432 
433 
434 struct CDBuffers{
435  //Buffers for L
436  std::vector<MPI_Request > arrMpiReqsSendLCD;
437  std::vector<MPI_Request > arrMpiReqsSizeSendLCD;
438  std::vector<MPI_Request > arrMpiReqsRecvLCD;
439  std::vector<MPI_Request > arrMpiReqsSizeRecvLCD;
440  std::vector<std::vector<char> > arrSstrLcolSendCD;
441  std::vector<int > arrSstrLcolSizeSendCD;
442  std::vector<std::vector<char> > arrSstrLrowRecvCD;
443  std::vector<int > arrSstrLrowSizeRecvCD;
444 
445  //Buffers for U
446  std::vector<MPI_Request > arrMpiReqsSendUCD;
447  std::vector<MPI_Request > arrMpiReqsSizeSendUCD;
448  std::vector<MPI_Request > arrMpiReqsRecvUCD;
449  std::vector<MPI_Request > arrMpiReqsSizeRecvUCD;
450  std::vector<std::vector<char> > arrSstrUrowSendCD;
451  std::vector<int > arrSstrUrowSizeSendCD;
452  std::vector<std::vector<char> > arrSstrUcolRecvCD;
453  std::vector<int > arrSstrUcolSizeRecvCD;
454 
455  std::vector<Int>sendOffset;
456  std::vector<Int>recvOffset;
457 
458  void resize(Int sendCount, Int recvCount){
459  //Buffers for L
460  arrMpiReqsSendLCD.assign(sendCount, MPI_REQUEST_NULL );
461  arrMpiReqsSizeSendLCD.assign(sendCount, MPI_REQUEST_NULL );
462  arrMpiReqsRecvLCD.assign(recvCount, MPI_REQUEST_NULL );
463  arrMpiReqsSizeRecvLCD.assign(recvCount, MPI_REQUEST_NULL );
464  arrSstrLcolSendCD.resize(sendCount);
465  arrSstrLcolSizeSendCD.resize(sendCount);
466  arrSstrLrowRecvCD.resize(recvCount);
467  arrSstrLrowSizeRecvCD.resize(recvCount);
468 
469  //Buffers for U
470  arrMpiReqsSendUCD.assign(recvCount, MPI_REQUEST_NULL );
471  arrMpiReqsSizeSendUCD.assign(recvCount, MPI_REQUEST_NULL );
472  arrMpiReqsRecvUCD.assign(sendCount, MPI_REQUEST_NULL );
473  arrMpiReqsSizeRecvUCD.assign(sendCount, MPI_REQUEST_NULL );
474  arrSstrUrowSendCD.resize(recvCount);
475  arrSstrUrowSizeSendCD.resize(recvCount);
476  arrSstrUcolRecvCD.resize(sendCount);
477  arrSstrUcolSizeRecvCD.resize(sendCount);
478  }
479 
480  void WaitAllSend(){
481  mpi::Waitall(arrMpiReqsSizeSendLCD);
482  mpi::Waitall(arrMpiReqsSendLCD);
483  mpi::Waitall(arrMpiReqsSizeSendUCD);
484  mpi::Waitall(arrMpiReqsSendUCD);
485 
486 // MPI_Waitall(arrMpiReqsSizeSendLCD.size(), &arrMpiReqsSizeSendLCD[0], MPI_STATUSES_IGNORE);
487 // MPI_Waitall(arrMpiReqsSendLCD.size(), &arrMpiReqsSendLCD[0], MPI_STATUSES_IGNORE);
488 // MPI_Waitall(arrMpiReqsSizeSendUCD.size(), &arrMpiReqsSizeSendUCD[0], MPI_STATUSES_IGNORE);
489 // MPI_Waitall(arrMpiReqsSendUCD.size(), &arrMpiReqsSendUCD[0], MPI_STATUSES_IGNORE);
490  }
491 
492  void WaitAllRecv(){
493 // mpi::Waitall(arrMpiReqsSizeRecvLCD);
494 // mpi::Waitall(arrMpiReqsRecvLCD);
495 // mpi::Waitall(arrMpiReqsSizeRecvUCD);
496 // mpi::Waitall(arrMpiReqsRecvUCD);
497  }
498 
499 
500 };
501 
502 
503  template<typename T>
505  std::vector<Int > & arrSuperNodes,
506  Int stepSuper, CDBuffers & buffers)
507  {
508  TIMER_START(SendRecvSizesCD);
509  //compute the number of requests
510  Int sendCount = 0;
511  Int recvCount = 0;
512  buffers.sendOffset.resize(stepSuper);
513  buffers.recvOffset.resize(stepSuper);
514  Int recvIdxL=0;
515  Int recvIdxU=0;
516  for (Int supidx=0; supidx<stepSuper; supidx++){
517  Int snode_index = arrSuperNodes[supidx];
518  buffers.sendOffset[supidx]=sendCount;
519  buffers.recvOffset[supidx]=recvCount;
520  sendCount+= this->CountSendToCrossDiagonal(snode_index);
521  recvCount+= this->CountRecvFromCrossDiagonal(snode_index);
522  }
523 
524  buffers.resize(sendCount,recvCount);
525 
526 
527  //Do Isend for size and content of L and Irecv for sizes of U
528  for (Int supidx=0; supidx<stepSuper; supidx++){
529  Int snode_index = arrSuperNodes[supidx];
530 
531  TIMER_START(Send_L_Recv_Size_U_CrossDiag);
532 
533  if( MYCOL( this->grid_ ) == PCOL( snode_index, this->grid_ )
534  && this->isSendToCrossDiagonal_(this->grid_->numProcCol, snode_index ) ){
535 
536  Int sendIdxL = 0;
537  Int recvIdxU = 0;
538  for(Int dstCol = 0; dstCol<this->grid_->numProcCol; dstCol++){
539  if(this->isSendToCrossDiagonal_(dstCol,snode_index) ){
540  Int dest = PNUM(PROW(snode_index,this->grid_),dstCol,this->grid_);
541 
542  if( MYPROC( this->grid_ ) != dest ){
543  //Send size and content of L
544  MPI_Request & mpiReqSizeSendL =
545  buffers.arrMpiReqsSizeSendLCD[buffers.sendOffset[supidx]+sendIdxL];
546  MPI_Request & mpiReqSendL =
547  buffers.arrMpiReqsSendLCD[buffers.sendOffset[supidx]+sendIdxL];
548  std::vector<char> & sstrLcolSend =
549  buffers.arrSstrLcolSendCD[buffers.sendOffset[supidx]+sendIdxL];
550  Int & sstrSizeL =
551  buffers.arrSstrLcolSizeSendCD[buffers.sendOffset[supidx]+sendIdxL];
552 
553  // Pack L data
554  std::stringstream sstm;
555  std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
556  std::vector<LBlock<T> >& Lcol = this->L( LBj(snode_index, this->grid_) );
557  // All blocks except for the diagonal block are to be sent right
558  //TODO not true > this is a scatter operation ! Can we know the destination ?
559 
560  //Skip the diagonal block if necessary
561  Int startIdx = ( MYROW( this->grid_ ) == PROW( snode_index, this->grid_ ) ) ? 1:0;
562  Int count = 0;
563  for( Int ib = startIdx; ib < Lcol.size(); ib++ ){
564  if( Lcol[ib].blockIdx > snode_index &&
565  (Lcol[ib].blockIdx % this->grid_->numProcCol) == dstCol ){
566  count++;
567  }
568  }
569 
570  serialize( (Int)count, sstm, NO_MASK );
571 
572  for( Int ib = startIdx; ib < Lcol.size(); ib++ ){
573  if( Lcol[ib].blockIdx > snode_index &&
574  (Lcol[ib].blockIdx % this->grid_->numProcCol) == dstCol ){
575 #if ( _DEBUGlevel_ >= 1 )
576  statusOFS << "["<<snode_index<<"] SEND contains "<<Lcol[ib].blockIdx
577  << " which corresponds to "<<GBj(ib,this->grid_)
578  << " to P"<<dest
579  << std::endl;
580 #endif
581  serialize( Lcol[ib], sstm, mask );
582  }
583  }
584 
585  sstrLcolSend.resize( Size(sstm) );
586  sstm.read( &sstrLcolSend[0], sstrLcolSend.size() );
587  sstrSizeL = sstrLcolSend.size();
588 
589 
590 
591  MPI_Isend( &sstrSizeL, 1, MPI_INT, dest,
592  IDX_TO_TAG2(snode_index,supidx,SELINV_TAG_L_SIZE_CD),
593  this->grid_->comm, &mpiReqSizeSendL );
594  MPI_Isend( (void*)&sstrLcolSend[0], sstrSizeL, MPI_BYTE, dest,
595  IDX_TO_TAG2(snode_index,supidx,SELINV_TAG_L_CONTENT_CD),
596  this->grid_->comm, &mpiReqSendL );
597  sendIdxL++;
598 
599  //Recv for U size
600  Int & sstrSizeU =
601  buffers.arrSstrUcolSizeRecvCD[buffers.sendOffset[supidx]+recvIdxU];
602  MPI_Request & mpiReqSizeRecvU =
603  buffers.arrMpiReqsSizeRecvUCD[buffers.sendOffset[supidx]+recvIdxU];
604 
605  MPI_Irecv( &sstrSizeU, 1, MPI_INT, dest,
606  IDX_TO_TAG2(snode_index,supidx,SELINV_TAG_U_SIZE_CD),
607  this->grid_->comm, &mpiReqSizeRecvU );
608  recvIdxU++;
609  }
610  }
611  }
612  } // sender
613  TIMER_STOP(Send_L_Recv_Size_U_CrossDiag);
614  }
615 
616 
617  //Do Irecv for sizes of L and Isend for size and content of U
618  for (Int supidx=0; supidx<stepSuper; supidx++){
619  Int snode_index = arrSuperNodes[supidx];
620  TIMER_START(Send_U_Recv_Size_L_CrossDiag);
621  //If I'm a receiver
622  if( MYROW( this->grid_ ) == PROW( snode_index, this->grid_ )
623  && this->isRecvFromCrossDiagonal_(this->grid_->numProcRow, snode_index ) ){
624  Int recvIdxL=0;
625  Int sendIdxU=0;
626  for(Int srcRow = 0; srcRow<this->grid_->numProcRow; srcRow++){
627  if(this->isRecvFromCrossDiagonal_(srcRow,snode_index) ){
628  Int src = PNUM(srcRow,PCOL(snode_index,this->grid_),this->grid_);
629  if( MYPROC( this->grid_ ) != src ){
630  //Recv size of L
631  Int & sstrSizeL =
632  buffers.arrSstrLrowSizeRecvCD[buffers.recvOffset[supidx]+recvIdxL];
633  MPI_Request & mpiReqSizeRecvL =
634  buffers.arrMpiReqsSizeRecvLCD[buffers.recvOffset[supidx]+recvIdxL];
635 
636  MPI_Irecv( &sstrSizeL, 1, MPI_INT, src,
637  IDX_TO_TAG2(snode_index,supidx,SELINV_TAG_L_SIZE_CD),
638  this->grid_->comm, &mpiReqSizeRecvL );
639  recvIdxL++;
640 
641 
642  //Send size and content of U
643  MPI_Request & mpiReqSizeSendU =
644  buffers.arrMpiReqsSizeSendUCD[buffers.recvOffset[supidx]+sendIdxU];
645  MPI_Request & mpiReqSendU =
646  buffers.arrMpiReqsSendUCD[buffers.recvOffset[supidx]+sendIdxU];
647  std::vector<char> & sstrUrowSend =
648  buffers.arrSstrUrowSendCD[buffers.recvOffset[supidx]+sendIdxU];
649  Int & sstrSizeU =
650  buffers.arrSstrUrowSizeSendCD[buffers.recvOffset[supidx]+sendIdxU];
651 
652  // Pack U data
653  std::stringstream sstm;
654  std::vector<Int> mask( UBlockMask::TOTAL_NUMBER, 1 );
655  std::vector<UBlock<T> >& Urow = this->U( LBi(snode_index, this->grid_) );
656  // All blocks except for the diagonal block are to be sent right
657  //TODO not true > this is a scatter operation ! Can we know the destination ?
658 
659  Int count = 0;
660  for( Int jb = 0; jb < Urow.size(); jb++ ){
661  if( Urow[jb].blockIdx > snode_index
662  && (Urow[jb].blockIdx % this->grid_->numProcRow) == srcRow ){
663  count++;
664  }
665  }
666 
667  serialize( (Int)count, sstm, NO_MASK );
668 
669  for( Int jb = 0; jb < Urow.size(); jb++ ){
670  if( Urow[jb].blockIdx > snode_index
671  && (Urow[jb].blockIdx % this->grid_->numProcRow) == srcRow ){
672 #if ( _DEBUGlevel_ >= 1 )
673  statusOFS << "["<<snode_index<<"] SEND contains "<<Urow[jb].blockIdx
674  << " which corresponds to "<<GBi(jb,this->grid_)
675  << " to P"<<src
676  << std::endl;
677 #endif
678  serialize( Urow[jb], sstm, mask );
679  }
680  }
681 
682  sstrUrowSend.resize( Size(sstm) );
683  sstm.read( &sstrUrowSend[0], sstrUrowSend.size() );
684  sstrSizeU = sstrUrowSend.size();
685 
686 
687  MPI_Isend( &sstrSizeU, 1, MPI_INT, src,
688  IDX_TO_TAG2(snode_index,supidx,SELINV_TAG_U_SIZE_CD),
689  this->grid_->comm, &mpiReqSizeSendU );
690  MPI_Isend( (void*)&sstrUrowSend[0], sstrSizeU, MPI_BYTE, src,
691  IDX_TO_TAG2(snode_index,supidx,SELINV_TAG_U_CONTENT_CD),
692  this->grid_->comm, &mpiReqSendU );
693  sendIdxU++;
694  }
695  }
696  }
697  }//end if I'm a receiver
698  TIMER_STOP(Send_U_Recv_Size_L_CrossDiag);
699  }
700  TIMER_STOP(SendRecvSizesCD);
701  }
702 
703  template<typename T>
704  inline void PMatrixUnsym<T>::IRecvContentCD(
705  std::vector<Int > & arrSuperNodes,
706  Int stepSuper, CDBuffers & buffers) {
707  TIMER_START(IrecvContentCD);
708 
709 
710  //waitall sizes of L
711  TIMER_START(Wait_Size_L_CrossDiag);
712  mpi::Waitall(buffers.arrMpiReqsSizeRecvLCD);
713  TIMER_STOP(Wait_Size_L_CrossDiag);
714 
715 //if(this->grid_->mpirank==0){gdb_lock();}
716 
717  //Allocate content and do Irecv for content of L
718  for (Int supidx=0; supidx<stepSuper; supidx++){
719  Int snode_index = arrSuperNodes[supidx];
720  //If I'm a receiver
721  TIMER_START(Recv_L_CrossDiag);
722  if( MYROW( this->grid_ ) == PROW( snode_index, this->grid_ )
723  && this->isRecvFromCrossDiagonal_(this->grid_->numProcRow, snode_index ) ){
724  Int recvIdxL=0;
725  for(Int srcRow = 0; srcRow<this->grid_->numProcRow; srcRow++){
726  if(this->isRecvFromCrossDiagonal_(srcRow,snode_index) ){
727  Int src = PNUM(srcRow,PCOL(snode_index,this->grid_),this->grid_);
728  if( MYPROC( this->grid_ ) != src ){
729  Int & sstrSizeL =
730  buffers.arrSstrLrowSizeRecvCD[buffers.recvOffset[supidx]+recvIdxL];
731  std::vector<char> & sstrLrowRecv =
732  buffers.arrSstrLrowRecvCD[buffers.recvOffset[supidx]+recvIdxL];
733  MPI_Request & mpiReqRecvL =
734  buffers.arrMpiReqsRecvLCD[buffers.recvOffset[supidx]+recvIdxL];
735  sstrLrowRecv.resize( sstrSizeL);
736 
737  MPI_Irecv( (void*)&sstrLrowRecv[0], sstrSizeL, MPI_BYTE, src,
738  IDX_TO_TAG2(snode_index,supidx,SELINV_TAG_L_CONTENT_CD),
739  this->grid_->comm, &mpiReqRecvL );
740  recvIdxL++;
741 
742  }
743  }
744  }
745  }//end if I'm a receiver
746  TIMER_STOP(Recv_L_CrossDiag);
747  }
748 
749  //waitall sizes of U
750  TIMER_START(Wait_Size_U_CrossDiag);
751  mpi::Waitall(buffers.arrMpiReqsSizeRecvUCD);
752  TIMER_STOP(Wait_Size_U_CrossDiag);
753 
754  //Allocate content and do Irecv for content of U
755  for (Int supidx=0; supidx<stepSuper; supidx++){
756  Int snode_index = arrSuperNodes[supidx];
757  TIMER_START(Recv_U_CrossDiag);
758  if( MYCOL( this->grid_ ) == PCOL( snode_index, this->grid_ )
759  && this->isSendToCrossDiagonal_(this->grid_->numProcCol, snode_index ) ){
760  Int recvIdxU = 0;
761  for(Int srcCol = 0; srcCol<this->grid_->numProcCol; srcCol++){
762  if(this->isSendToCrossDiagonal_(srcCol,snode_index) ){
763  Int src = PNUM(PROW(snode_index,this->grid_),srcCol,this->grid_);
764  if( MYPROC( this->grid_ ) != src ){
765  Int & sstrSizeU =
766  buffers.arrSstrUcolSizeRecvCD[buffers.sendOffset[supidx]+recvIdxU];
767  std::vector<char> & sstrUcolRecv =
768  buffers.arrSstrUcolRecvCD[buffers.sendOffset[supidx]+recvIdxU];
769  MPI_Request & mpiReqRecvU =
770  buffers.arrMpiReqsRecvUCD[buffers.sendOffset[supidx]+recvIdxU];
771  sstrUcolRecv.resize( sstrSizeU );
772 
773  MPI_Irecv( (void*)&sstrUcolRecv[0], sstrSizeU, MPI_BYTE, src,
774  IDX_TO_TAG2(snode_index,supidx,SELINV_TAG_U_CONTENT_CD),
775  this->grid_->comm, &mpiReqRecvU );
776  recvIdxU++;
777  }
778  }
779  }
780  }//end if I'm a receiver
781  TIMER_STOP(Recv_U_CrossDiag);
782  }
783 
784  TIMER_START(IrecvContentCD);
785  }
786 
787 
788  template<typename T>
789  inline void PMatrixUnsym<T>::WaitContentLCD(
790  std::vector<Int > & arrSuperNodes,
791  Int stepSuper, CDBuffers & buffers) {
792  TIMER_START(WaitContentLCD);
793  //waitall content of L
794  TIMER_START(Wait_L_CrossDiag);
795  mpi::Waitall(buffers.arrMpiReqsRecvLCD);
796  TIMER_STOP(Wait_L_CrossDiag);
797 
798 
799 
800  //Do the work for Lrow
801  NumMat<T> Ltmp;
802  for (Int supidx=0; supidx<stepSuper; supidx++){
803  Int snode_index = arrSuperNodes[supidx];
804  Int ksupProcRow = PROW( snode_index, this->grid_ );
805  Int ksupProcCol = PCOL( snode_index, this->grid_ );
806 
807  if( MYROW( this->grid_ ) == PROW( snode_index, this->grid_ ) &&
808  this->isRecvFromCrossDiagonal_(this->grid_->numProcRow, snode_index ) ){
809 
810 #if ( _DEBUGlevel_ >= 1 )
811  statusOFS << std::endl << " ["<<snode_index<<"] "
812  << "Update the upper triangular block"
813  << std::endl << std::endl;
814  statusOFS << std::endl << " ["<<snode_index<<"] "
815 // << "blockIdxLocal:" << snode.BlockIdxLocal
816  << std::endl << std::endl;
817  statusOFS << std::endl << " ["<<snode_index<<"] "
818 // << "rowLocalPtr:" << snode.RowLocalPtr
819  << std::endl << std::endl;
820 #endif
821 
822  std::vector<LBlock<T> >& Lcol = this->L( LBj(snode_index, this->grid_) );
823  std::vector<LBlock<T> >& Lrow = this->Lrow( LBi( snode_index, this->grid_ ) );
824 
825 // if(Lrow.size() == 0){
826 // Int& LrowSize = this->LrowSize_[ LBi( snode_index, this->grid_ ) ];
827 // Lrow.resize(LrowSize);
828 // }
829 
830  std::vector<Int> isBlockFound(Lrow.size(),false);
831  Int recvIdxL=0;
832 
833  for(Int srcRow = 0; srcRow<this->grid_->numProcRow; srcRow++){
834  if(this->isRecvFromCrossDiagonal_(srcRow,snode_index) ){
835  Int src = PNUM(srcRow,PCOL(snode_index,this->grid_),this->grid_);
836  TIMER_START(Recv_L_CrossDiag);
837 
838  //Declaring some pointers to avoid copy if data is local
839  std::vector<LBlock<T> > * pLcol;
840  std::vector<LBlock<T> > LcolRecv;
841  if(MYPROC(this->grid_)!= src){
842  Int & sstrSizeL =
843  buffers.arrSstrLrowSizeRecvCD[buffers.recvOffset[supidx]+recvIdxL];
844  std::vector<char> & sstrLrowRecv =
845  buffers.arrSstrLrowRecvCD[buffers.recvOffset[supidx]+recvIdxL];
846 
847  std::stringstream sstm;
848 
849 #if ( _DEBUGlevel_ >= 1 )
850  statusOFS << "["<<snode_index<<"] P"<<MYPROC(this->grid_)<<" ("<<MYROW(this->grid_)
851  << ","<<MYCOL(this->grid_)<<") <--- LBj("<<snode_index<<") <--- P"
852  << src <<" ("<<srcRow<<","<<ksupProcCol<<")"
853  << std::endl;
854 #endif
855  sstm.write( &sstrLrowRecv[0], sstrSizeL );
856 
857  // Unpack L data.
858  Int numLBlock;
859  std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
860  deserialize( numLBlock, sstm, NO_MASK );
861  LcolRecv.resize(numLBlock);
862  for( Int ib = 0; ib < numLBlock; ib++ ){
863  deserialize( LcolRecv[ib], sstm, mask );
864 #if ( _DEBUGlevel_ >= 1 )
865  statusOFS << "["<<snode_index<<"] RECV contains "
866  << LcolRecv[ib].blockIdx<< " which corresponds to "
867  << ib * this->grid_->numProcRow + srcRow;
868  statusOFS << " L is on row "<< srcRow
869  << " whereas Lrow is on col "
870  << LcolRecv[ib].blockIdx % this->grid_->numProcCol
871  << std::endl;
872 #endif
873  }
874  recvIdxL++;
875  pLcol = &LcolRecv;
876  } // sender is not the same as receiver
877  else{
878  // L is obtained locally, just make a copy. Do not include the diagonal block
879  pLcol = &Lcol;
880  } // sender is the same as receiver
881 
882 
883  TIMER_STOP(Recv_L_CrossDiag);
884 
885  //We can directly put LcolRecv in Lrow (stil need to transpose)
886 
887  //Skip the diagonal block if necessary
888  Int startIdx = ( MYPROC( this->grid_ ) == src && MYPROC(this->grid_) == PNUM(ksupProcRow,ksupProcCol, this->grid_) ) ? 1:0;
889  for( Int ib = startIdx; ib < pLcol->size(); ib++ ){
890  LBlock<T> & LB = (*pLcol)[ib];
891  if( LB.blockIdx <= snode_index ){
892 #ifdef USE_ABORT
893  abort();
894 #endif
895  throw std::logic_error( "LcolRecv contains the wrong blocks." );
896  }
897 
898  //check that LB would be on this proc if it was a UB
899  if(PCOL(LB.blockIdx, this->grid_) != MYCOL(this->grid_)){
900  continue;
901  }
902 
903 
904 
905  //std::vector<LBlock<T> > & LrowCD = this->Lrow( LBi( ksup, super_ ) );
906  for( Int iib = 0; iib < Lrow.size(); iib++ ){
907  LBlock<T> & LrowB = Lrow[ iib ];
908 
909  //If not initialized, set it to be the one
910  if( LrowB.blockIdx == -1){
911  LrowB.blockIdx = LB.blockIdx;
912  }
913 
914  if( LB.blockIdx == LrowB.blockIdx ){
915  // Note that the order of the column indices of the U
916  // block may not follow the order of the row indices,
917  // overwrite the information in U.
918 // LrowB = LB;
919  LrowB.rows = LB.rows;
920  //Store in "row major" format / i.e transpose is in col-major
921  Transpose(LB.nzval, LrowB.nzval);
922  LrowB.numCol = LB.numRow;
923  LrowB.numRow = LB.numCol;
924 
925 // SetValue(LrowB.nzval,ZERO<T>());
926 
927 #if ( _DEBUGlevel_ >= 1 )
928  statusOFS<<"["<<snode_index<<"] USING LB "<<LB.blockIdx<< std::endl;
929 #endif
930  isBlockFound[iib] = 1;//true;
931  break;
932  } // if( LB.blockIdx == LrowB.blockIdx )
933  } // for (iib)
934  } // for (ib)
935  }
936  }
937 
938  for( Int ib = 0; ib < Lrow.size(); ib++ ){
939  LBlock<T> & LB = Lrow[ib];
940  if( !isBlockFound[ib] ){
941 #ifdef USE_ABORT
942  std::vector<LBlock<T> >& Lcol = this->L( LBj( snode_index, this->grid_ ) );
943  std::vector<UBlock<T> >& Urow = this->U( LBi( snode_index, this->grid_ ) );
944 
945  for( Int ib = 0; ib < Lcol.size(); ib++ ){ statusOFS<<Lcol[ib].blockIdx<<" "; }statusOFS<<endl;
946  for( Int jb = 0; jb < Urow.size(); jb++ ){ statusOFS<<Urow[jb].blockIdx; }statusOFS<<endl;
947  for( Int jb = 0; jb < Lrow.size(); jb++ ){ statusOFS<<Lrow[jb].blockIdx<<" "; }statusOFS<<endl;
948  abort();
949 #endif
950  throw std::logic_error(
951  "LBlock cannot find its update. Something is seriously wrong."
952  );
953  }
954  }
955  } // Done copying L
956  }
957 
958  TIMER_STOP(WaitContentLCD);
959 }
960 
961 
962 
963  template<typename T>
964  inline void PMatrixUnsym<T>::WaitContentUCD(
965  std::vector<Int > & arrSuperNodes,
966  Int stepSuper, CDBuffers & buffers) {
967 
968  TIMER_START(WaitContentUCD);
969  //waitall content of U
970  TIMER_START(Wait_U_CrossDiag);
971  mpi::Waitall(buffers.arrMpiReqsRecvUCD);
972  TIMER_STOP(Wait_U_CrossDiag);
973 
974 
975  //Do the work for U
976  for (Int supidx=0; supidx<stepSuper; supidx++){
977  Int snode_index = arrSuperNodes[supidx];
978  Int ksupProcRow = PROW( snode_index, this->grid_ );
979  Int ksupProcCol = PCOL( snode_index, this->grid_ );
980 
981 
982  if( MYCOL( this->grid_ ) == PCOL( snode_index, this->grid_ ) &&
983  this->isSendToCrossDiagonal_(this->grid_->numProcCol, snode_index ) ){
984 
985  std::vector<UBlock<T> >& Ucol = this->Ucol( LBj( snode_index, this->grid_ ) );
986 
987 // if(Ucol.size() == 0){
988 // Int& UcolSize = this->UcolSize_[ LBj( snode_index, this->grid_ ) ];
989 // Ucol.resize(UcolSize);
990 // }
991 
992  std::vector<Int> isBlockFound(Ucol.size(),false);
993  Int recvIdxU=0;
994 
995  for(Int srcCol = 0; srcCol<this->grid_->numProcCol; srcCol++){
996  if(this->isSendToCrossDiagonal_(srcCol,snode_index) ){
997  Int src = PNUM(PROW(snode_index,this->grid_),srcCol,this->grid_);
998  TIMER_START(Recv_U_CrossDiag);
999 
1000  //Declaring some pointers to avoid copy if data is local
1001  std::vector<UBlock<T> > * pUrow;
1002  std::vector<UBlock<T> > UrowRecv;
1003  if(MYPROC(this->grid_)!= src){
1004  Int & sstrSizeU =
1005  buffers.arrSstrUcolSizeRecvCD[buffers.sendOffset[supidx]+recvIdxU];
1006  std::vector<char> & sstrUcolRecv =
1007  buffers.arrSstrUcolRecvCD[buffers.sendOffset[supidx]+recvIdxU];
1008 
1009 
1010  std::stringstream sstm;
1011 
1012 #if ( _DEBUGlevel_ >= 1 )
1013  statusOFS << "["<<snode_index<<"] P"<<MYPROC(this->grid_)<<" ("<<MYROW(this->grid_)
1014  << ","<<MYCOL(this->grid_)<<") <--- LBi("<<snode_index<<") <--- P"
1015  << src<<" ("<<ksupProcCol<<","<<srcCol<<")"<<std::endl;
1016 #endif
1017  sstm.write( &sstrUcolRecv[0], sstrSizeU );
1018 
1019  // Unpack U data.
1020  Int numUBlock;
1021  std::vector<Int> mask( UBlockMask::TOTAL_NUMBER, 1 );
1022  deserialize( numUBlock, sstm, NO_MASK );
1023  UrowRecv.resize(numUBlock);
1024  for( Int jb = 0; jb < numUBlock; jb++ ){
1025  deserialize( UrowRecv[jb], sstm, mask );
1026 #if ( _DEBUGlevel_ >= 1 )
1027  statusOFS << "["<<snode_index<<"] RECV contains "
1028  << UrowRecv[jb].blockIdx<< " which corresponds to "
1029  << jb * this->grid_->numProcCol + srcCol;
1030  statusOFS << " Urow is on col "<< srcCol
1031  << " whereas U is on row "
1032  << UrowRecv[jb].blockIdx % this->grid_->numProcRow
1033  << std::endl;
1034 #endif
1035  }
1036  pUrow = &UrowRecv;
1037  recvIdxU++;
1038  } // sender is not the same as receiver
1039  else{
1040  // U is obtained locally, just make a copy.
1041  std::vector<UBlock<T> >& Urow = this->U( LBi( snode_index, this->grid_ ) );
1042  pUrow = &Urow;
1043  } // sender is the same as receiver
1044 
1045 
1046  TIMER_STOP(Recv_U_CrossDiag);
1047 
1048  //We can directly put UrowRecv in Ucol
1049  for( Int jb = 0; jb < pUrow->size(); jb++ ){
1050  UBlock<T> & UB = (*pUrow)[jb];
1051  if( UB.blockIdx <= snode_index ){
1052 #ifdef USE_ABORT
1053  abort();
1054 #endif
1055  throw std::logic_error( "UrowRecv contains the wrong blocks." );
1056  }
1057 
1058  //check that UB would be on this proc if it was a LB
1059  if(PROW(UB.blockIdx, this->grid_) != MYROW(this->grid_)){
1060  continue;
1061  }
1062 
1063 
1064  //std::vector<UBlock<T> > & UcolCD = this->Ucol( LBi( ksup, super_ ) );
1065  for( Int jjb = 0; jjb < Ucol.size(); jjb++ ){
1066  UBlock<T> & UcolB = Ucol[jjb];
1067 
1068 
1069  //If not initialized, set it to be the one
1070  if( UcolB.blockIdx == -1){
1071  UcolB.blockIdx = UB.blockIdx;
1072  }
1073 
1074 
1075  if( UB.blockIdx == UcolB.blockIdx ){
1076  // Compare size
1077  // Note that the order of the column indices of the U
1078  // block may not follow the order of the row indices,
1079  // overwrite the information in U.
1080  UcolB = UB;
1081 
1082 //SetValue(UcolB.nzval,ZERO<T>());
1083 #if ( _DEBUGlevel_ >= 1 )
1084  statusOFS<<"["<<snode_index<<"] USING UB "<<UB.blockIdx<< std::endl;
1085 #endif
1086  isBlockFound[jjb] = 1;
1087  break;
1088  } // if( UB.blockIdx == UcolB.blockIdx )
1089  } // for (jjb)
1090  } // for (jb)
1091 
1092  }
1093  }
1094 
1095  for( Int jb = 0; jb < Ucol.size(); jb++ ){
1096  UBlock<T> & UB = Ucol[jb];
1097  if( !isBlockFound[jb] ){
1098 
1099 #ifdef USE_ABORT
1100  std::vector<LBlock<T> >& Lcol = this->L( LBj( snode_index, this->grid_ ) );
1101  std::vector<UBlock<T> >& Urow = this->U( LBi( snode_index, this->grid_ ) );
1102 
1103  for( Int ib = 0; ib < Lcol.size(); ib++ ){ statusOFS<<Lcol[ib].blockIdx<<" "; }statusOFS<<endl;
1104  for( Int jb = 0; jb < Urow.size(); jb++ ){ statusOFS<<Urow[jb].blockIdx; }statusOFS<<endl;
1105  for( Int jb = 0; jb < Ucol.size(); jb++ ){ statusOFS<<Ucol[jb].blockIdx<<" "; }statusOFS<<endl;
1106  abort();
1107 #endif
1108  throw std::logic_error(
1109  "UBlock cannot find its update. Something is seriously wrong."
1110  );
1111  }
1112  } // end for (jb)
1113  } // Done copying U
1114 
1115  }
1116 
1117  TIMER_STOP(WaitContentUCD);
1118 }
1119 
1120 
1121 
1122 
  // -------------------------------------------------------------------
  // SendRecvCD: cross-diagonal exchange of L and U for one pipeline
  // step. Column owners of L serialize and Isend their L blocks to the
  // cross-diagonal processes, which assemble them (transposed) into
  // Lrow; symmetrically, row owners of U Isend their U blocks, which
  // are assembled into Ucol. Sizes are exchanged first, then contents,
  // all via non-blocking MPI so the stepSuper supernodes overlap.
  // Throws std::logic_error on any inconsistency between the received
  // blocks and the local Lrow/Ucol structure.
  template<typename T>
      std::vector<SuperNodeBufferTypeUnsym > & arrSuperNodes,
      Int stepSuper)
  {

    TIMER_START(Send_CD);
    //compute the number of requests
    Int sendCount = 0;
    Int recvCount = 0;
    // NOTE(review): variable-length arrays are a compiler extension,
    // not standard C++ — a std::vector<Int> would be portable.
    Int sendOffset[stepSuper];
    Int recvOffset[stepSuper];
    // NOTE(review): these function-scope counters are never used; the
    // loops below declare their own local recvIdxL / recvIdxU.
    Int recvIdxL=0;
    Int recvIdxU=0;
    for (Int supidx=0; supidx<stepSuper; supidx++){
      SuperNodeBufferTypeUnsym & snode = arrSuperNodes[supidx];
      sendOffset[supidx]=sendCount;
      recvOffset[supidx]=recvCount;
      sendCount+= this->CountSendToCrossDiagonal(snode.Index);
      recvCount+= this->CountRecvFromCrossDiagonal(snode.Index);
    }

    //Buffers for L
    std::vector<MPI_Request > arrMpiReqsSendLCD(sendCount, MPI_REQUEST_NULL );
    std::vector<MPI_Request > arrMpiReqsSizeSendLCD(sendCount, MPI_REQUEST_NULL );
    std::vector<MPI_Request > arrMpiReqsRecvLCD(recvCount, MPI_REQUEST_NULL );
    std::vector<MPI_Request > arrMpiReqsSizeRecvLCD(recvCount, MPI_REQUEST_NULL );
    std::vector<std::vector<char> > arrSstrLcolSendCD(sendCount);
    std::vector<int > arrSstrLcolSizeSendCD(sendCount);
    std::vector<std::vector<char> > arrSstrLrowRecvCD(recvCount);
    std::vector<int > arrSstrLrowSizeRecvCD(recvCount);

    //Buffers for U
    // Note the mirrored sizing: wherever this process SENDS L it
    // RECEIVES U (sendCount), and vice versa (recvCount).
    std::vector<MPI_Request > arrMpiReqsSendUCD(recvCount, MPI_REQUEST_NULL );
    std::vector<MPI_Request > arrMpiReqsSizeSendUCD(recvCount, MPI_REQUEST_NULL );
    std::vector<MPI_Request > arrMpiReqsRecvUCD(sendCount, MPI_REQUEST_NULL );
    std::vector<MPI_Request > arrMpiReqsSizeRecvUCD(sendCount, MPI_REQUEST_NULL );
    std::vector<std::vector<char> > arrSstrUrowSendCD(recvCount);
    std::vector<int > arrSstrUrowSizeSendCD(recvCount);
    std::vector<std::vector<char> > arrSstrUcolRecvCD(sendCount);
    std::vector<int > arrSstrUcolSizeRecvCD(sendCount);



    //Do Isend for size and content of L and Irecv for sizes of U
    for (Int supidx=0; supidx<stepSuper; supidx++){
      SuperNodeBufferTypeUnsym & snode = arrSuperNodes[supidx];

      TIMER_START(Send_L_Recv_Size_U_CrossDiag);

      if( MYCOL( this->grid_ ) == PCOL( snode.Index, this->grid_ )
          && this->isSendToCrossDiagonal_(this->grid_->numProcCol, snode.Index ) ){

        Int sendIdxL = 0;
        Int recvIdxU = 0;
        for(Int dstCol = 0; dstCol<this->grid_->numProcCol; dstCol++){
          if(this->isSendToCrossDiagonal_(dstCol,snode.Index) ){
            Int dest = PNUM(PROW(snode.Index,this->grid_),dstCol,this->grid_);

            if( MYPROC( this->grid_ ) != dest ){
              //Send size and content of L
              MPI_Request & mpiReqSizeSendL =
                arrMpiReqsSizeSendLCD[sendOffset[supidx]+sendIdxL];
              MPI_Request & mpiReqSendL =
                arrMpiReqsSendLCD[sendOffset[supidx]+sendIdxL];
              std::vector<char> & sstrLcolSend =
                arrSstrLcolSendCD[sendOffset[supidx]+sendIdxL];
              Int & sstrSizeL =
                arrSstrLcolSizeSendCD[sendOffset[supidx]+sendIdxL];

              // Pack L data
              std::stringstream sstm;
              std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
              std::vector<LBlock<T> >& Lcol = this->L( LBj(snode.Index, this->grid_) );
              // All blocks except for the diagonal block are to be sent right
              //TODO not true > this is a scatter operation ! Can we know the destination ?

              //Skip the diagonal block if necessary
              Int startIdx = ( MYROW( this->grid_ ) == PROW( snode.Index, this->grid_ ) ) ? 1:0;
              // First pass: count the blocks bound for dstCol so the
              // count can be serialized ahead of the blocks themselves.
              Int count = 0;
              for( Int ib = startIdx; ib < Lcol.size(); ib++ ){
                if( Lcol[ib].blockIdx > snode.Index &&
                    (Lcol[ib].blockIdx % this->grid_->numProcCol) == dstCol ){
                  count++;
                }
              }

              serialize( (Int)count, sstm, NO_MASK );

              // Second pass: serialize exactly those blocks.
              for( Int ib = startIdx; ib < Lcol.size(); ib++ ){
                if( Lcol[ib].blockIdx > snode.Index &&
                    (Lcol[ib].blockIdx % this->grid_->numProcCol) == dstCol ){
#if ( _DEBUGlevel_ >= 1 )
                  statusOFS << "["<<snode.Index<<"] SEND contains "<<Lcol[ib].blockIdx
                    << " which corresponds to "<<GBj(ib,this->grid_)
                    << " to P"<<dest
                    << std::endl;
#endif
                  serialize( Lcol[ib], sstm, mask );
                }
              }

              sstrLcolSend.resize( Size(sstm) );
              sstm.read( &sstrLcolSend[0], sstrLcolSend.size() );
              sstrSizeL = sstrLcolSend.size();



              MPI_Isend( &sstrSizeL, 1, MPI_INT, dest,
                  IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_L_SIZE_CD),
                  this->grid_->comm, &mpiReqSizeSendL );
              MPI_Isend( (void*)&sstrLcolSend[0], sstrSizeL, MPI_BYTE, dest,
                  IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_L_CONTENT_CD),
                  this->grid_->comm, &mpiReqSendL );
              sendIdxL++;

              //Recv for U size
              Int & sstrSizeU =
                arrSstrUcolSizeRecvCD[sendOffset[supidx]+recvIdxU];
              MPI_Request & mpiReqSizeRecvU =
                arrMpiReqsSizeRecvUCD[sendOffset[supidx]+recvIdxU];

              MPI_Irecv( &sstrSizeU, 1, MPI_INT, dest,
                  IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_U_SIZE_CD),
                  this->grid_->comm, &mpiReqSizeRecvU );
              recvIdxU++;
            }
          }
        }
      } // sender
      TIMER_STOP(Send_L_Recv_Size_U_CrossDiag);
    }


    //Do Irecv for sizes of L and Isend for size and content of U
    for (Int supidx=0; supidx<stepSuper; supidx++){
      SuperNodeBufferTypeUnsym & snode = arrSuperNodes[supidx];
      TIMER_START(Send_U_Recv_Size_L_CrossDiag);
      //If I'm a receiver
      if( MYROW( this->grid_ ) == PROW( snode.Index, this->grid_ )
          && this->isRecvFromCrossDiagonal_(this->grid_->numProcRow, snode.Index ) ){
        Int recvIdxL=0;
        Int sendIdxU=0;
        for(Int srcRow = 0; srcRow<this->grid_->numProcRow; srcRow++){
          if(this->isRecvFromCrossDiagonal_(srcRow,snode.Index) ){
            Int src = PNUM(srcRow,PCOL(snode.Index,this->grid_),this->grid_);
            if( MYPROC( this->grid_ ) != src ){
              //Recv size of L
              Int & sstrSizeL =
                arrSstrLrowSizeRecvCD[recvOffset[supidx]+recvIdxL];
              MPI_Request & mpiReqSizeRecvL =
                arrMpiReqsSizeRecvLCD[recvOffset[supidx]+recvIdxL];

              MPI_Irecv( &sstrSizeL, 1, MPI_INT, src,
                  IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_L_SIZE_CD),
                  this->grid_->comm, &mpiReqSizeRecvL );
              recvIdxL++;


              //Send size and content of U
              MPI_Request & mpiReqSizeSendU =
                arrMpiReqsSizeSendUCD[recvOffset[supidx]+sendIdxU];
              MPI_Request & mpiReqSendU =
                arrMpiReqsSendUCD[recvOffset[supidx]+sendIdxU];
              std::vector<char> & sstrUrowSend =
                arrSstrUrowSendCD[recvOffset[supidx]+sendIdxU];
              Int & sstrSizeU =
                arrSstrUrowSizeSendCD[recvOffset[supidx]+sendIdxU];

              // Pack U data
              std::stringstream sstm;
              std::vector<Int> mask( UBlockMask::TOTAL_NUMBER, 1 );
              std::vector<UBlock<T> >& Urow = this->U( LBi(snode.Index, this->grid_) );
              // All blocks except for the diagonal block are to be sent right
              //TODO not true > this is a scatter operation ! Can we know the destination ?

              // Count the U blocks bound for srcRow, then serialize
              // the count followed by those blocks.
              Int count = 0;
              for( Int jb = 0; jb < Urow.size(); jb++ ){
                if( Urow[jb].blockIdx > snode.Index
                    && (Urow[jb].blockIdx % this->grid_->numProcRow) == srcRow ){
                  count++;
                }
              }

              serialize( (Int)count, sstm, NO_MASK );

              for( Int jb = 0; jb < Urow.size(); jb++ ){
                if( Urow[jb].blockIdx > snode.Index
                    && (Urow[jb].blockIdx % this->grid_->numProcRow) == srcRow ){
#if ( _DEBUGlevel_ >= 1 )
                  statusOFS << "["<<snode.Index<<"] SEND contains "<<Urow[jb].blockIdx
                    << " which corresponds to "<<GBi(jb,this->grid_)
                    << " to P"<<src
                    << std::endl;
#endif
                  serialize( Urow[jb], sstm, mask );
                }
              }

              sstrUrowSend.resize( Size(sstm) );
              sstm.read( &sstrUrowSend[0], sstrUrowSend.size() );
              sstrSizeU = sstrUrowSend.size();


              MPI_Isend( &sstrSizeU, 1, MPI_INT, src,
                  IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_U_SIZE_CD),
                  this->grid_->comm, &mpiReqSizeSendU );
              MPI_Isend( (void*)&sstrUrowSend[0], sstrSizeU, MPI_BYTE, src,
                  IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_U_CONTENT_CD),
                  this->grid_->comm, &mpiReqSendU );
              sendIdxU++;
            }
          }
        }
      }//end if I'm a receiver
      TIMER_STOP(Send_U_Recv_Size_L_CrossDiag);
    }

    //waitall sizes of L
    TIMER_START(Wait_Size_L_CrossDiag);
    mpi::Waitall(arrMpiReqsSizeRecvLCD);
    TIMER_STOP(Wait_Size_L_CrossDiag);

//if(this->grid_->mpirank==0){gdb_lock();}

    //Allocate content and do Irecv for content of L
    // (sizes are known now, so the byte buffers can be resized).
    for (Int supidx=0; supidx<stepSuper; supidx++){
      SuperNodeBufferTypeUnsym & snode = arrSuperNodes[supidx];
      //If I'm a receiver
      TIMER_START(Recv_L_CrossDiag);
      if( MYROW( this->grid_ ) == PROW( snode.Index, this->grid_ )
          && this->isRecvFromCrossDiagonal_(this->grid_->numProcRow, snode.Index ) ){
        Int recvIdxL=0;
        for(Int srcRow = 0; srcRow<this->grid_->numProcRow; srcRow++){
          if(this->isRecvFromCrossDiagonal_(srcRow,snode.Index) ){
            Int src = PNUM(srcRow,PCOL(snode.Index,this->grid_),this->grid_);
            if( MYPROC( this->grid_ ) != src ){
              Int & sstrSizeL =
                arrSstrLrowSizeRecvCD[recvOffset[supidx]+recvIdxL];
              std::vector<char> & sstrLrowRecv =
                arrSstrLrowRecvCD[recvOffset[supidx]+recvIdxL];
              MPI_Request & mpiReqRecvL =
                arrMpiReqsRecvLCD[recvOffset[supidx]+recvIdxL];
              sstrLrowRecv.resize( sstrSizeL);

              MPI_Irecv( (void*)&sstrLrowRecv[0], sstrSizeL, MPI_BYTE, src,
                  IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_L_CONTENT_CD),
                  this->grid_->comm, &mpiReqRecvL );
              recvIdxL++;

            }
          }
        }
      }//end if I'm a receiver
      TIMER_STOP(Recv_L_CrossDiag);
    }

    //waitall sizes of U
    TIMER_START(Wait_Size_U_CrossDiag);
    mpi::Waitall(arrMpiReqsSizeRecvUCD);
    TIMER_STOP(Wait_Size_U_CrossDiag);

    //Allocate content and do Irecv for content of U
    for (Int supidx=0; supidx<stepSuper; supidx++){
      SuperNodeBufferTypeUnsym & snode = arrSuperNodes[supidx];
      TIMER_START(Recv_U_CrossDiag);
      if( MYCOL( this->grid_ ) == PCOL( snode.Index, this->grid_ )
          && this->isSendToCrossDiagonal_(this->grid_->numProcCol, snode.Index ) ){
        Int recvIdxU = 0;
        for(Int srcCol = 0; srcCol<this->grid_->numProcCol; srcCol++){
          if(this->isSendToCrossDiagonal_(srcCol,snode.Index) ){
            Int src = PNUM(PROW(snode.Index,this->grid_),srcCol,this->grid_);
            if( MYPROC( this->grid_ ) != src ){
              Int & sstrSizeU =
                arrSstrUcolSizeRecvCD[sendOffset[supidx]+recvIdxU];
              std::vector<char> & sstrUcolRecv =
                arrSstrUcolRecvCD[sendOffset[supidx]+recvIdxU];
              MPI_Request & mpiReqRecvU =
                arrMpiReqsRecvUCD[sendOffset[supidx]+recvIdxU];
              sstrUcolRecv.resize( sstrSizeU );

              MPI_Irecv( (void*)&sstrUcolRecv[0], sstrSizeU, MPI_BYTE, src,
                  IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_U_CONTENT_CD),
                  this->grid_->comm, &mpiReqRecvU );
              recvIdxU++;
            }
          }
        }
      }//end if I'm a receiver
      TIMER_STOP(Recv_U_CrossDiag);
    }

    //waitall content of L
    TIMER_START(Wait_L_CrossDiag);
    mpi::Waitall(arrMpiReqsRecvLCD);
    TIMER_STOP(Wait_L_CrossDiag);



    //Do the work for Lrow: transpose each received L block into the
    //matching Lrow block of this process.
    NumMat<T> Ltmp;
    for (Int supidx=0; supidx<stepSuper; supidx++){
      SuperNodeBufferTypeUnsym & snode = arrSuperNodes[supidx];
      Int ksupProcRow = PROW( snode.Index, this->grid_ );
      Int ksupProcCol = PCOL( snode.Index, this->grid_ );

      if( MYROW( this->grid_ ) == PROW( snode.Index, this->grid_ ) &&
          this->isRecvFromCrossDiagonal_(this->grid_->numProcRow, snode.Index ) ){

#if ( _DEBUGlevel_ >= 1 )
        statusOFS << std::endl << " ["<<snode.Index<<"] "
          << "Update the upper triangular block"
          << std::endl << std::endl;
        statusOFS << std::endl << " ["<<snode.Index<<"] "
          << "blockIdxLocal:" << snode.BlockIdxLocal
          << std::endl << std::endl;
        statusOFS << std::endl << " ["<<snode.Index<<"] "
          << "rowLocalPtr:" << snode.RowLocalPtr
          << std::endl << std::endl;
#endif

        std::vector<LBlock<T> >& Lcol = this->L( LBj(snode.Index, this->grid_) );
        std::vector<LBlock<T> >& Lrow = this->Lrow( LBi( snode.Index, this->grid_ ) );

// if(Lrow.size() == 0){
// Int& LrowSize = this->LrowSize_[ LBi( snode.Index, this->grid_ ) ];
// Lrow.resize(LrowSize);
// }

        // One flag per Lrow block; each must be matched exactly once.
        std::vector<Int> isBlockFound(Lrow.size(),false);
        Int recvIdxL=0;

        for(Int srcRow = 0; srcRow<this->grid_->numProcRow; srcRow++){
          if(this->isRecvFromCrossDiagonal_(srcRow,snode.Index) ){
            Int src = PNUM(srcRow,PCOL(snode.Index,this->grid_),this->grid_);
            TIMER_START(Recv_L_CrossDiag);

            //Declaring some pointers to avoid copy if data is local
            std::vector<LBlock<T> > * pLcol;
            std::vector<LBlock<T> > LcolRecv;
            if(MYPROC(this->grid_)!= src){
              Int & sstrSizeL =
                arrSstrLrowSizeRecvCD[recvOffset[supidx]+recvIdxL];
              std::vector<char> & sstrLrowRecv =
                arrSstrLrowRecvCD[recvOffset[supidx]+recvIdxL];

              std::stringstream sstm;

#if ( _DEBUGlevel_ >= 1 )
              statusOFS << "["<<snode.Index<<"] P"<<MYPROC(this->grid_)<<" ("<<MYROW(this->grid_)
                << ","<<MYCOL(this->grid_)<<") <--- LBj("<<snode.Index<<") <--- P"
                << src <<" ("<<srcRow<<","<<ksupProcCol<<")"
                << std::endl;
#endif
              sstm.write( &sstrLrowRecv[0], sstrSizeL );

              // Unpack L data.
              Int numLBlock;
              std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
              deserialize( numLBlock, sstm, NO_MASK );
              LcolRecv.resize(numLBlock);
              for( Int ib = 0; ib < numLBlock; ib++ ){
                deserialize( LcolRecv[ib], sstm, mask );
#if ( _DEBUGlevel_ >= 1 )
                statusOFS << "["<<snode.Index<<"] RECV contains "
                  << LcolRecv[ib].blockIdx<< " which corresponds to "
                  << ib * this->grid_->numProcRow + srcRow;
                statusOFS << " L is on row "<< srcRow
                  << " whereas Lrow is on col "
                  << LcolRecv[ib].blockIdx % this->grid_->numProcCol
                  << std::endl;
#endif
              }
              recvIdxL++;
              pLcol = &LcolRecv;
            } // sender is not the same as receiver
            else{
              // L is obtained locally, just make a copy. Do not include the diagonal block
              pLcol = &Lcol;
            } // sender is the same as receiver


            TIMER_STOP(Recv_L_CrossDiag);

            //We can directly put LcolRecv in Lrow (still needs a transpose)

            //Skip the diagonal block if necessary
            Int startIdx = ( MYPROC( this->grid_ ) == src && MYPROC(this->grid_) == PNUM(ksupProcRow,ksupProcCol, this->grid_) ) ? 1:0;
            for( Int ib = startIdx; ib < pLcol->size(); ib++ ){
              LBlock<T> & LB = (*pLcol)[ib];
              if( LB.blockIdx <= snode.Index ){
#ifdef USE_ABORT
                abort();
#endif
                throw std::logic_error( "LcolRecv contains the wrong blocks." );
              }

              //check that LB would be on this proc if it was a UB
              if(PCOL(LB.blockIdx, this->grid_) != MYCOL(this->grid_)){
                continue;
              }



              //std::vector<LBlock<T> > & LrowCD = this->Lrow( LBi( ksup, super_ ) );
              for( Int iib = 0; iib < Lrow.size(); iib++ ){
                LBlock<T> & LrowB = Lrow[ iib ];

                //If not initialized, set it to be the one
                if( LrowB.blockIdx == -1){
                  LrowB.blockIdx = LB.blockIdx;
                }

                if( LB.blockIdx == LrowB.blockIdx ){
                  // Note that the order of the column indices of the U
                  // block may not follow the order of the row indices,
                  // overwrite the information in U.
// LrowB = LB;
                  LrowB.rows = LB.rows;
                  //Store in "row major" format / i.e transpose is in col-major
                  Transpose(LB.nzval, LrowB.nzval);
                  LrowB.numCol = LB.numRow;
                  LrowB.numRow = LB.numCol;

// SetValue(LrowB.nzval,ZERO<T>());

#if ( _DEBUGlevel_ >= 1 )
                  statusOFS<<"["<<snode.Index<<"] USING LB "<<LB.blockIdx<< std::endl;
#endif
                  isBlockFound[iib] = 1;//true;
                  break;
                } // if( LB.blockIdx == LrowB.blockIdx )
              } // for (iib)
            } // for (ib)
          }
        }

        // Sanity check: every Lrow block must have been updated.
        for( Int ib = 0; ib < Lrow.size(); ib++ ){
          LBlock<T> & LB = Lrow[ib];
          if( !isBlockFound[ib] ){
#ifdef USE_ABORT
            abort();
#endif
            throw std::logic_error(
                "LBlock cannot find its update. Something is seriously wrong."
                );
          }
        }
      } // Done copying L
    }

    //waitall content of U
    TIMER_START(Wait_U_CrossDiag);
    mpi::Waitall(arrMpiReqsRecvUCD);
    TIMER_STOP(Wait_U_CrossDiag);


    //Do the work for U: copy each received U block into Ucol.
    for (Int supidx=0; supidx<stepSuper; supidx++){
      SuperNodeBufferTypeUnsym & snode = arrSuperNodes[supidx];
      Int ksupProcRow = PROW( snode.Index, this->grid_ );
      Int ksupProcCol = PCOL( snode.Index, this->grid_ );


      if( MYCOL( this->grid_ ) == PCOL( snode.Index, this->grid_ ) &&
          this->isSendToCrossDiagonal_(this->grid_->numProcCol, snode.Index ) ){

        std::vector<UBlock<T> >& Ucol = this->Ucol( LBj( snode.Index, this->grid_ ) );

// if(Ucol.size() == 0){
// Int& UcolSize = this->UcolSize_[ LBj( snode.Index, this->grid_ ) ];
// Ucol.resize(UcolSize);
// }

        std::vector<Int> isBlockFound(Ucol.size(),false);
        Int recvIdxU=0;

        for(Int srcCol = 0; srcCol<this->grid_->numProcCol; srcCol++){
          if(this->isSendToCrossDiagonal_(srcCol,snode.Index) ){
            Int src = PNUM(PROW(snode.Index,this->grid_),srcCol,this->grid_);
            TIMER_START(Recv_U_CrossDiag);

            //Declaring some pointers to avoid copy if data is local
            std::vector<UBlock<T> > * pUrow;
            std::vector<UBlock<T> > UrowRecv;
            if(MYPROC(this->grid_)!= src){
              Int & sstrSizeU =
                arrSstrUcolSizeRecvCD[sendOffset[supidx]+recvIdxU];
              std::vector<char> & sstrUcolRecv =
                arrSstrUcolRecvCD[sendOffset[supidx]+recvIdxU];


              std::stringstream sstm;

#if ( _DEBUGlevel_ >= 1 )
              // NOTE(review): src sits at (ksupProcRow, srcCol) by the
              // PNUM call above; printing ksupProcCol as the row
              // coordinate looks like a copy/paste slip — confirm.
              statusOFS << "["<<snode.Index<<"] P"<<MYPROC(this->grid_)<<" ("<<MYROW(this->grid_)
                << ","<<MYCOL(this->grid_)<<") <--- LBi("<<snode.Index<<") <--- P"
                << src<<" ("<<ksupProcCol<<","<<srcCol<<")"<<std::endl;
#endif
              sstm.write( &sstrUcolRecv[0], sstrSizeU );

              // Unpack U data.
              Int numUBlock;
              std::vector<Int> mask( UBlockMask::TOTAL_NUMBER, 1 );
              deserialize( numUBlock, sstm, NO_MASK );
              UrowRecv.resize(numUBlock);
              for( Int jb = 0; jb < numUBlock; jb++ ){
                deserialize( UrowRecv[jb], sstm, mask );
#if ( _DEBUGlevel_ >= 1 )
                statusOFS << "["<<snode.Index<<"] RECV contains "
                  << UrowRecv[jb].blockIdx<< " which corresponds to "
                  << jb * this->grid_->numProcCol + srcCol;
                statusOFS << " Urow is on col "<< srcCol
                  << " whereas U is on row "
                  << UrowRecv[jb].blockIdx % this->grid_->numProcRow
                  << std::endl;
#endif
              }
              pUrow = &UrowRecv;
              recvIdxU++;
            } // sender is not the same as receiver
            else{
              // U is obtained locally, just make a copy.
              std::vector<UBlock<T> >& Urow = this->U( LBi( snode.Index, this->grid_ ) );
              pUrow = &Urow;
            } // sender is the same as receiver


            TIMER_STOP(Recv_U_CrossDiag);

            //We can directly put UrowRecv in Ucol
            for( Int jb = 0; jb < pUrow->size(); jb++ ){
              UBlock<T> & UB = (*pUrow)[jb];
              if( UB.blockIdx <= snode.Index ){
#ifdef USE_ABORT
                abort();
#endif
                throw std::logic_error( "UrowRecv contains the wrong blocks." );
              }

              //check that UB would be on this proc if it was a LB
              if(PROW(UB.blockIdx, this->grid_) != MYROW(this->grid_)){
                continue;
              }


              //std::vector<UBlock<T> > & UcolCD = this->Ucol( LBi( ksup, super_ ) );
              for( Int jjb = 0; jjb < Ucol.size(); jjb++ ){
                UBlock<T> & UcolB = Ucol[jjb];


                //If not initialized, set it to be the one
                if( UcolB.blockIdx == -1){
                  UcolB.blockIdx = UB.blockIdx;
                }


                if( UB.blockIdx == UcolB.blockIdx ){
                  // Compare size
                  // Note that the order of the column indices of the U
                  // block may not follow the order of the row indices,
                  // overwrite the information in U.
                  UcolB = UB;

//SetValue(UcolB.nzval,ZERO<T>());
#if ( _DEBUGlevel_ >= 1 )
                  statusOFS<<"["<<snode.Index<<"] USING UB "<<UB.blockIdx<< std::endl;
#endif
                  isBlockFound[jjb] = 1;
                  break;
                } // if( UB.blockIdx == UcolB.blockIdx )
              } // for (jjb)
            } // for (jb)

          }
        }

        // Sanity check: every Ucol block must have been updated.
        for( Int jb = 0; jb < Ucol.size(); jb++ ){
          UBlock<T> & UB = Ucol[jb];
          if( !isBlockFound[jb] ){
#ifdef USE_ABORT
            abort();
#endif
            throw std::logic_error(
                "UBlock cannot find its update. Something is seriously wrong."
                );
          }
        } // end for (jb)
      } // Done copying U

    }

    TIMER_STOP(Send_CD);

    // Complete all outstanding sends before the local send buffers
    // (arrSstr*SendCD) go out of scope.
    mpi::Waitall(arrMpiReqsSizeSendLCD);
    mpi::Waitall(arrMpiReqsSendLCD);

    mpi::Waitall(arrMpiReqsSizeSendUCD);
    mpi::Waitall(arrMpiReqsSendUCD);


  } // End of method PMatrixUnsym::SendRecvCD
1725 
  // -------------------------------------------------------------------
  // UnpackData: reconstitute the four block lists needed by the GEMM
  // update of supernode snode. For each of Lrow / Lcol / Urow / Ucol:
  // when this process does not own the corresponding row/column of the
  // supernode, the serialized bytes received into snode are
  // deserialized; otherwise the locally stored blocks are copied.
  // Note the masks: Lcol and Urow are deserialized WITHOUT nzval
  // (structure only), while Lrow and Ucol include the numerical values.
  template<typename T>
      SuperNodeBufferTypeUnsym & snode,
      std::vector<LBlock<T> > & LcolRecv,
      std::vector<LBlock<T> > & LrowRecv,
      std::vector<UBlock<T> > & UcolRecv,
      std::vector<UBlock<T> > & UrowRecv
      )
  {

    TIMER_START(Unpack_data);




#if ( _DEBUGlevel_ >= 1 )
    statusOFS << std::endl << "["<<snode.Index<<"] "<< "Unpack the received data for processors participate in Gemm. " << std::endl << std::endl;
#endif
    // Lrow part — full blocks including nzval (mask all ones).
    if( MYROW( this->grid_ ) != PROW( snode.Index, this->grid_ ) ){
      std::stringstream sstm;
      sstm.write( &snode.SstrLrowRecv[0], snode.SstrLrowRecv.size() );
      std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
      Int numLBlock;
      deserialize( numLBlock, sstm, NO_MASK );
      LrowRecv.resize( numLBlock );
      for( Int ib = 0; ib < numLBlock; ib++ ){
        deserialize( LrowRecv[ib], sstm, mask );
      }

#if ( _DEBUGlevel_ >= 1 )
      statusOFS<< "["<<snode.Index<<"] "<<"Lrow RECV "<<std::endl;
      for(Int ib=0;ib<LrowRecv.size();++ib){statusOFS<<LrowRecv[ib].blockIdx<<" ";}
      statusOFS<<std::endl;
#endif
    } // sender is not the same as receiver
    else{
      // U is obtained locally, just make a copy. Include everything
      // (there is no diagonal block)
      // Is it a copy ? LL: YES. Maybe we should replace the copy by
      // something more efficient especially for mpisize == 1
      LrowRecv.resize(this->Lrow( LBi( snode.Index, this->grid_ ) ).size());
      std::copy(this->Lrow( LBi( snode.Index, this->grid_ ) ).begin(),this->Lrow( LBi( snode.Index, this->grid_ )).end(),LrowRecv.begin());
    } // sender is the same as receiver


    // L part — structure only: nzval is excluded from the mask.
    if( MYCOL( this->grid_ ) != PCOL( snode.Index, this->grid_ ) ){
      std::stringstream sstm;
      sstm.write( &snode.SstrLcolRecv[0], snode.SstrLcolRecv.size() );
      std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
      mask[LBlockMask::NZVAL] = 0; // nzval is excluded
      Int numLBlock;
      deserialize( numLBlock, sstm, NO_MASK );
      LcolRecv.resize( numLBlock );
      for( Int ib = 0; ib < numLBlock; ib++ ){
        deserialize( LcolRecv[ib], sstm, mask );
      }


#if ( _DEBUGlevel_ >= 1 )
      statusOFS<< "["<<snode.Index<<"] "<<"Lcol RECV "<<std::endl;
      for(Int ib=0;ib<LcolRecv.size();++ib){statusOFS<<LcolRecv[ib].blockIdx<<" ";}
      statusOFS<<std::endl;
#endif

    } // sender is not the same as receiver
    else{
      // L is obtained locally, just make a copy.
      // Do not include the diagonal block
      std::vector<LBlock<T> >& Lcol = this->L( LBj( snode.Index, this->grid_ ) );
      // Skip the leading diagonal block when this process owns it.
      Int startIdx = ( MYROW( this->grid_ ) == PROW( snode.Index, this->grid_ ) )?1:0;
      LcolRecv.resize( Lcol.size() - startIdx );
      std::copy(Lcol.begin()+startIdx,Lcol.end(),LcolRecv.begin());
    } // sender is the same as receiver

    // U part — structure only: nzval is excluded from the mask.
    if( MYROW( this->grid_ ) != PROW( snode.Index, this->grid_ ) ){
      std::stringstream sstm;
      sstm.write( &snode.SstrUrowRecv[0], snode.SstrUrowRecv.size() );
      std::vector<Int> mask( UBlockMask::TOTAL_NUMBER, 1 );
      mask[UBlockMask::NZVAL] = 0; // nzval is excluded
      Int numUBlock;
      deserialize( numUBlock, sstm, NO_MASK );
      UrowRecv.resize( numUBlock );
      for( Int jb = 0; jb < numUBlock; jb++ ){
        deserialize( UrowRecv[jb], sstm, mask );
      }

#if ( _DEBUGlevel_ >= 1 )
      statusOFS<< "["<<snode.Index<<"] "<<"Urow RECV "<<std::endl;
      for(Int ib=0;ib<UrowRecv.size();++ib){statusOFS<<UrowRecv[ib].blockIdx<<" ";}
      statusOFS<<std::endl;
#endif
    } // sender is not the same as receiver
    else{
      // U is obtained locally, just make a copy. Include everything
      // (there is no diagonal block)
      // Is it a copy ? LL: YES. Maybe we should replace the copy by
      // something more efficient especially for mpisize == 1
      UrowRecv.resize(this->U( LBi( snode.Index, this->grid_ ) ).size());
      std::copy(this->U( LBi( snode.Index, this->grid_ ) ).begin(),this->U( LBi( snode.Index, this->grid_ )).end(),UrowRecv.begin());
    } // sender is the same as receiver


    // Ucol part — full blocks including nzval (mask all ones).
    if( MYCOL( this->grid_ ) != PCOL( snode.Index, this->grid_ ) ){
      std::stringstream sstm;
      sstm.write( &snode.SstrUcolRecv[0], snode.SstrUcolRecv.size() );
      std::vector<Int> mask( UBlockMask::TOTAL_NUMBER, 1 );
      Int numUBlock;
      deserialize( numUBlock, sstm, NO_MASK );
      UcolRecv.resize( numUBlock );
      for( Int jb = 0; jb < numUBlock; jb++ ){
        deserialize( UcolRecv[jb], sstm, mask );
      }

#if ( _DEBUGlevel_ >= 1 )
      statusOFS<< "["<<snode.Index<<"] "<<"Ucol RECV "<<std::endl;
      for(Int ib=0;ib<UcolRecv.size();++ib){statusOFS<<UcolRecv[ib].blockIdx<<" ";}
      statusOFS<<std::endl;
#endif
    } // sender is not the same as receiver
    else{
      // Ucol is obtained locally, just make a copy.
      // Do not include the diagonal block
      std::vector<UBlock<T> >& Ucol = this->Ucol( LBj( snode.Index, this->grid_ ) );
      UcolRecv.resize( Ucol.size() );
      std::copy(Ucol.begin(),Ucol.end(),UcolRecv.begin());
    } // sender is the same as receiver

    TIMER_STOP(Unpack_data);

  } // End of method PMatrixUnsym<T>::UnpackData
1860 
1861  template<typename T>
1863  {
1864 
1865  TIMER_START(ComputeDiagUpdate);
1866  //--------- Computing Diagonal block, all processors in the column
1867  //--------- are participating to all pipelined supernodes
1868  if( MYCOL( this->grid_ ) == PCOL( snode.Index, this->grid_ ) ){
1869 #if ( _DEBUGlevel_ >= 1 )
1870  statusOFS << std::endl << "["<<snode.Index<<"] "
1871  << "Updating the diagonal block" << std::endl << std::endl;
1872 #endif
1873  std::vector<LBlock<T> >& Lcol = this->L( LBj( snode.Index, this->grid_ ) );
1874  std::vector<UBlock<T> >& Ucol = this->Ucol( LBj( snode.Index, this->grid_ ) );
1875 
1876  //Allocate DiagBuf even if Lcol.size() == 0
1877  snode.DiagBuf.Resize(SuperSize( snode.Index, this->super_ ),
1878  SuperSize( snode.Index, this->super_ ));
1879  SetValue(snode.DiagBuf, ZERO<T>());
1880 
1881  // Do I own the diagonal block ?
1882  Int startIb = (MYROW( this->grid_ ) == PROW( snode.Index, this->grid_ ))?1:0;
1883  for( Int ib = startIb; ib < Lcol.size(); ib++ ){
1884  LBlock<T> & LcolB = Lcol[ib];
1885  //find corresponding U block
1886  Int jb = 0;
1887  for(jb = 0; jb < Ucol.size(); ++jb){
1888  if(Ucol[jb].blockIdx == LcolB.blockIdx){
1889  break;
1890  }
1891  }
1892 
1893 
1894  assert(jb < Ucol.size());
1895 
1896  UBlock<T> & UcolB = Ucol[jb];
1897 
1898  if(1 || (LcolB.numRow>0 && LcolB.numCol>0 && UcolB.numRow>0 && UcolB.numCol>0)){
1899  //Compute U S-1 L
1900  blas::Gemm( 'N', 'N', snode.DiagBuf.m(), snode.DiagBuf.n(),
1901  LcolB.numRow, MINUS_ONE<T>(),
1902  UcolB.nzval.Data(), UcolB.nzval.m(),
1903  &snode.LUpdateBuf( snode.RowLocalPtr[ib-startIb], 0 ),
1904  snode.LUpdateBuf.m(),
1905  ONE<T>(), snode.DiagBuf.Data(), snode.DiagBuf.m() );
1906  }
1907  }
1908 
1909 #if ( _DEBUGlevel_ >= 1 )
1910  statusOFS << std::endl << "["<<snode.Index<<"] "
1911  << "Updated the diagonal block" << std::endl << std::endl;
1912 #endif
1913  }
1914  TIMER_STOP(ComputeDiagUpdate);
1915  } // End of method PMatrixUnsym<T>::ComputeDiagUpdate
1916 
1917 
1918 
1919 
1920 
1921 
1922 
1923 
1924  template<typename T>
1925  inline void PMatrixUnsym<T>::SelInvIntra_P2p(Int lidx)
1926  {
1927 
1928 #if defined (PROFILE) || defined(PMPI) || defined(USE_TAU)
1929  Real begin_SendULWaitContentFirst, end_SendULWaitContentFirst, time_SendULWaitContentFirst = 0;
1930 #endif
1931  Int numSuper = this->NumSuper();
1932  std::vector<std::vector<Int> > & superList = this->WorkingSet();
1933  Int numSteps = superList.size();
1934  Int stepSuper = superList[lidx].size();
1935 
1936  TIMER_START(AllocateBuffer);
1937 
1938  //This is required to send the size and content of U/L
1939  std::vector<std::vector<MPI_Request> > arrMpireqsSendLToBelow;
1940  arrMpireqsSendLToBelow.resize( stepSuper, std::vector<MPI_Request>( 2 * this->grid_->numProcRow, MPI_REQUEST_NULL ));
1941  std::vector<std::vector<MPI_Request> > arrMpireqsSendLToRight;
1942  arrMpireqsSendLToRight.resize(stepSuper, std::vector<MPI_Request>( 2 * this->grid_->numProcCol, MPI_REQUEST_NULL ));
1943 
1944  std::vector<std::vector<MPI_Request> > arrMpireqsSendUToBelow;
1945  arrMpireqsSendUToBelow.resize( stepSuper, std::vector<MPI_Request>( 2 * this->grid_->numProcRow, MPI_REQUEST_NULL ));
1946  std::vector<std::vector<MPI_Request> > arrMpireqsSendUToRight;
1947  arrMpireqsSendUToRight.resize(stepSuper, std::vector<MPI_Request>( 2 * this->grid_->numProcCol, MPI_REQUEST_NULL ));
1948 
1949  //This is required to reduce L
1950  std::vector<MPI_Request> arrMpireqsSendLToLeft;
1951  arrMpireqsSendLToLeft.resize(stepSuper, MPI_REQUEST_NULL );
1952 
1953  //This is required to reduce U
1954  std::vector<MPI_Request> arrMpireqsSendUToAbove;
1955  arrMpireqsSendUToAbove.resize(stepSuper, MPI_REQUEST_NULL );
1956 
1957  //This is required to reduce D
1958  std::vector<MPI_Request> arrMpireqsSendToAbove;
1959  arrMpireqsSendToAbove.resize(stepSuper, MPI_REQUEST_NULL );
1960 
1961  //------------------------------------------------------------------------
1962 
1963  //This is required to receive the size and content of U/L
1964  std::vector<MPI_Request> arrMpireqsRecvLSizeFromAny;
1965  arrMpireqsRecvLSizeFromAny.resize(stepSuper*2 , MPI_REQUEST_NULL);
1966  std::vector<MPI_Request> arrMpireqsRecvLContentFromAny;
1967  arrMpireqsRecvLContentFromAny.resize(stepSuper*2 , MPI_REQUEST_NULL);
1968 
1969  std::vector<MPI_Request> arrMpireqsRecvUSizeFromAny;
1970  arrMpireqsRecvUSizeFromAny.resize(stepSuper*2 , MPI_REQUEST_NULL);
1971  std::vector<MPI_Request> arrMpireqsRecvUContentFromAny;
1972  arrMpireqsRecvUContentFromAny.resize(stepSuper*2 , MPI_REQUEST_NULL);
1973 
1974 
1975 
1976 
1977  //allocate the buffers for this supernode
1978  std::vector<SuperNodeBufferTypeUnsym> arrSuperNodes(stepSuper);
1979  for (Int supidx=0; supidx<stepSuper; supidx++){
1980  arrSuperNodes[supidx].Index = superList[lidx][supidx];
1981  }
1982 
1983  NumMat<T> AinvBuf, UBuf, LBuf;
1984 
1985  TIMER_STOP(AllocateBuffer);
1986 
1987 
1988 
1989  Int next_lidx = lidx+1;
1990  CDBuffers nextCDBuffers;
1991 
1992 #ifndef _RELEASE_
1993  PushCallStack("PMatrixUnsym::SelInv_P2p::SendRecvCD");
1994 #endif
1995 
1996 //Perhaps this should be done for the next step super in a non blocking way
1997  if(lidx==0){
1998  CDBuffers buffers;
1999  Int next_lidx = lidx;
2000 
2001  //Resize L and U first
2002  bool doSend = false;
2003  for (Int supidx=0; supidx<superList[next_lidx].size(); supidx++){
2004  Int snode_index = superList[next_lidx][supidx];
2005  if( MYROW( this->grid_ ) == PROW( snode_index, this->grid_ ) ){
2006  std::vector<LBlock<T> >& Lrow = this->Lrow( LBi(snode_index, this->grid_) );
2007  Int& LrowSize = this->LrowSize_[ LBi(snode_index, this->grid_) ];
2008  if(Lrow.size()!=LrowSize){
2009  doSend= true;
2010  Lrow.resize(LrowSize,LBlock<T>());
2011  }
2012  }
2013 
2014  if( MYCOL( this->grid_ ) == PCOL( snode_index, this->grid_ ) ){
2015  std::vector<UBlock<T> >& Ucol = this->Ucol( LBj(snode_index, this->grid_) );
2016  Int& UcolSize = this->UcolSize_[ LBj(snode_index, this->grid_) ];
2017  if(Ucol.size()!=UcolSize){
2018  doSend= true;
2019  Ucol.resize(UcolSize,UBlock<T>());
2020  }
2021  }
2022  }
2023 
2024 
2025  if(doSend){
2026 
2027  SendRecvSizesCD(superList[next_lidx], superList[next_lidx].size(),buffers);
2028  IRecvContentCD(superList[next_lidx], superList[next_lidx].size(),buffers);
2029  WaitContentLCD(superList[next_lidx], superList[next_lidx].size(),buffers);
2030  WaitContentUCD(superList[next_lidx], superList[next_lidx].size(),buffers);
2031  buffers.WaitAllSend();
2032 
2033  }
2034  }
2035 
2036  if(next_lidx < superList.size()){
2037  //Resize L and U first
2038  for (Int supidx=0; supidx<superList[next_lidx].size(); supidx++){
2039  Int snode_index = superList[next_lidx][supidx];
2040  if( MYROW( this->grid_ ) == PROW( snode_index, this->grid_ ) ){
2041  std::vector<LBlock<T> >& Lrow = this->Lrow( LBi(snode_index, this->grid_) );
2042  Int& LrowSize = this->LrowSize_[ LBi(snode_index, this->grid_) ];
2043  Lrow.resize(LrowSize,LBlock<T>());
2044  }
2045 
2046  if( MYCOL( this->grid_ ) == PCOL( snode_index, this->grid_ ) ){
2047  std::vector<UBlock<T> >& Ucol = this->Ucol( LBj(snode_index, this->grid_) );
2048  Int& UcolSize = this->UcolSize_[ LBj(snode_index, this->grid_) ];
2049  Ucol.resize(UcolSize,UBlock<T>());
2050  }
2051  }
2052 
2053  SendRecvSizesCD(superList[next_lidx],superList[next_lidx].size(),nextCDBuffers);
2054  }
2055 #ifndef _RELEASE_
2056  PopCallStack();
2057 #endif
2058 
2059 
2060 #ifndef _RELEASE_
2061  PushCallStack("PMatrixUnsym::SelInv_P2p::UpdateLU");
2062 #endif
2063 #if ( _DEBUGlevel_ >= 1 )
2064  statusOFS << std::endl << "Communication to the Schur complement." << std::endl << std::endl;
2065 #endif
2066 
2067  // Senders
2068  for (Int supidx=0; supidx<stepSuper; supidx++){
2069  SuperNodeBufferTypeUnsym & snode = arrSuperNodes[supidx];
2070  std::vector<MPI_Request> & mpireqsSendLToBelow = arrMpireqsSendLToBelow[supidx];
2071  std::vector<MPI_Request> & mpireqsSendLToRight = arrMpireqsSendLToRight[supidx];
2072  std::vector<MPI_Request> & mpireqsSendUToBelow = arrMpireqsSendUToBelow[supidx];
2073  std::vector<MPI_Request> & mpireqsSendUToRight = arrMpireqsSendUToRight[supidx];
2074 
2075 #if ( _DEBUGlevel_ >= 1 )
2076  statusOFS << std::endl << "["<<snode.Index<<"] "
2077  << "Communication for the Lrow part." << std::endl << std::endl;
2078 #endif
2079  // Communication for the Lrow part.
2080  if( MYROW( this->grid_ ) == PROW( snode.Index, this->grid_ ) ){
2081  // Pack the data in Lrow
2082  TIMER_START(Serialize_LrowL);
2083  std::stringstream sstm;
2084  std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
2085  std::vector<LBlock<T> >& Lrow = this->Lrow( LBi(snode.Index, this->grid_) );
2086  std::vector<UBlock<T> >& Urow = this->U( LBi(snode.Index, this->grid_) );
2087 
2088 
2089  // All blocks are to be sent down.
2090  serialize( (Int)Lrow.size(), sstm, NO_MASK );
2091  for( Int ib = 0; ib < Lrow.size(); ib++ ){
2092  assert( Lrow[ib].blockIdx > snode.Index );
2093  serialize( Lrow[ib], sstm, mask );
2094  }
2095  snode.SstrLrowSend.resize( Size( sstm ) );
2096  sstm.read( &snode.SstrLrowSend[0], snode.SstrLrowSend.size() );
2097  snode.SizeSstrLrowSend = snode.SstrLrowSend.size();
2098  TIMER_STOP(Serialize_LrowL);
2099 
2100  for( Int iProcRow = 0; iProcRow < this->grid_->numProcRow; iProcRow++ ){
2101  if( MYROW( this->grid_ ) != iProcRow &&
2102  this->isSendToBelow_( iProcRow,snode.Index ) == true ){
2103 
2104  // Use Isend to send to multiple targets
2105  MPI_Isend( &snode.SizeSstrLrowSend, 1, MPI_INT,
2106  iProcRow, IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_LROW_SIZE), this->grid_->colComm, &mpireqsSendLToBelow[2*iProcRow] );
2107  MPI_Isend( (void*)&snode.SstrLrowSend[0], snode.SizeSstrLrowSend, MPI_BYTE,
2108  iProcRow, IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_LROW_CONTENT),
2109  this->grid_->colComm, &mpireqsSendLToBelow[2*iProcRow+1] );
2110 #if ( _DEBUGlevel_ >= 1 )
2111  statusOFS<< "["<<snode.Index<<"] "<<"Lrow SENT "<<std::endl;
2112  for(Int ib=0;ib<Lrow.size();++ib){statusOFS<<Lrow[ib].blockIdx<<" ";}
2113  statusOFS<<std::endl;
2114  statusOFS << "["<<snode.Index<<"] "<< "Sending Lrow "
2115  << snode.SizeSstrLrowSend << " BYTES to P" << PNUM(iProcRow,MYCOL(this->grid_),this->grid_)
2116  << std::endl << std::endl;
2117 #endif
2118  } // Send
2119  } // for (iProcRow)
2120  } // if I am the sender
2121 
2122 #if ( _DEBUGlevel_ >= 1 )
2123  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Communication for the L part." << std::endl << std::endl;
2124 #endif
2125  // Communication for the L (Lcol) part.
2126  if( MYCOL( this->grid_ ) == PCOL( snode.Index, this->grid_ ) ){
2127 
2128  TIMER_START(Serialize_LcolL);
2129  // Pack the data in L
2130  std::stringstream sstm;
2131  std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
2132  mask[LBlockMask::NZVAL] = 0; // nzval is excluded
2133 
2134  std::vector<LBlock<T> >& Lcol = this->L( LBj(snode.Index, this->grid_) );
2135  // All blocks except for the diagonal block are to be sent right
2136 
2137 
2138  Int startIdx = ( MYROW( this->grid_ ) == PROW( snode.Index, this->grid_ ) )?1:0;
2139  serialize( (Int)Lcol.size() - startIdx, sstm, NO_MASK );
2140  for( Int ib = startIdx; ib < Lcol.size(); ib++ ){
2141  assert( Lcol[ib].blockIdx > snode.Index );
2142 
2143 #if ( _DEBUGlevel_ >= 2 )
2144 // statusOFS << std::endl << "["<<snode.Index<<"] "<< "Serializing Block index " << Lcol[ib].blockIdx << std::endl;
2145 #endif
2146  serialize( Lcol[ib], sstm, mask );
2147  }
2148  snode.SstrLcolSend.resize( Size( sstm ) );
2149  sstm.read( &snode.SstrLcolSend[0], snode.SstrLcolSend.size() );
2150  snode.SizeSstrLcolSend = snode.SstrLcolSend.size();
2151  TIMER_STOP(Serialize_LcolL);
2152 
2153  for( Int iProcCol = 0; iProcCol < this->grid_->numProcCol ; iProcCol++ ){
2154  if( MYCOL( this->grid_ ) != iProcCol &&
2155  this->isSendToRight_( iProcCol, snode.Index ) == true ){
2156  // Use Isend to send to multiple targets
2157  MPI_Isend( &snode.SizeSstrLcolSend, 1, MPI_INT,
2158  iProcCol, IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_L_SIZE),
2159  this->grid_->rowComm, &mpireqsSendLToRight[2*iProcCol] );
2160  MPI_Isend( (void*)&snode.SstrLcolSend[0], snode.SizeSstrLcolSend, MPI_BYTE,
2161  iProcCol, IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_L_CONTENT),
2162  this->grid_->rowComm, &mpireqsSendLToRight[2*iProcCol+1] );
2163 #if ( _DEBUGlevel_ >= 1 )
2164  statusOFS<< "["<<snode.Index<<"] "<<"L SENT "<<std::endl;
2165  for(Int ib=startIdx;ib<Lcol.size();++ib){statusOFS<<Lcol[ib].blockIdx<<" ";}
2166  statusOFS<<std::endl;
2167  statusOFS << "["<<snode.Index<<"] "<< "Sending L "
2168  << snode.SizeSstrLcolSend << " BYTES to P" << PNUM(MYROW(this->grid_),iProcCol,this->grid_)
2169  << std::endl << std::endl;
2170 #endif
2171  } // Send
2172  } // for (iProcCol)
2173  } // if I am the sender
2174 
2175 #if ( _DEBUGlevel_ >= 1 )
2176  statusOFS << std::endl << "["<<snode.Index<<"] "
2177  << "Communication for the U part." << std::endl << std::endl;
2178 #endif
 2179  // Communication for the U (Urow) part: the owner row serializes its
 2179  // Urow blocks (without nzval) and Isends size + content to the process
 2179  // rows selected by isSendToBelow_.
 2180  if( MYROW( this->grid_ ) == PROW( snode.Index, this->grid_ ) ){
 2181 
 2182  TIMER_START(Serialize_UcolU);
 2183  // Pack the data in U
 2184  std::stringstream sstm;
 2185  std::vector<Int> mask( UBlockMask::TOTAL_NUMBER, 1 );
 2186  mask[UBlockMask::NZVAL] = 0; // nzval is excluded
 2187 
 2188  std::vector<UBlock<T> >& Urow = this->U( LBi(snode.Index, this->grid_) );
 2189  // All blocks except for the diagonal block are to be sent right
 2190 
 2191  serialize( (Int)Urow.size(), sstm, NO_MASK );
 2192  for( Int jb = 0; jb < Urow.size(); jb++ ){
 2193  assert( Urow[jb].blockIdx > snode.Index );
 2194 #if ( _DEBUGlevel_ >= 2 )
 2195 // statusOFS << std::endl << "["<<snode.Index<<"] "<< "Serializing Block index " << Urow[jb].blockIdx << std::endl;
 2196 #endif
 2197  serialize( Urow[jb], sstm, mask );
 2198  }
 2199  snode.SstrUrowSend.resize( Size( sstm ) );
 2200  sstm.read( &snode.SstrUrowSend[0], snode.SstrUrowSend.size() );
 2201  snode.SizeSstrUrowSend = snode.SstrUrowSend.size();
 2202  TIMER_STOP(Serialize_UcolU);
 2203 
 2204  for( Int iProcRow = 0; iProcRow < this->grid_->numProcRow; iProcRow++ ){
 2205  if( MYROW( this->grid_ ) != iProcRow &&
 2206  this->isSendToBelow_( iProcRow,snode.Index ) == true ){
 2207  // Use Isend to send to multiple targets
 2208  MPI_Isend( &snode.SizeSstrUrowSend, 1, MPI_INT,
 2209  iProcRow, IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_U_SIZE), this->grid_->colComm, &mpireqsSendUToBelow[2*iProcRow] );
 2210  // BUG(review): this payload Isend passes &snode.SstrLrowSend[0] (the
 2210  // Lrow buffer) while the byte count (SizeSstrUrowSend) and the tag
 2210  // (SELINV_TAG_U_CONTENT) refer to the Urow data serialized just above
 2210  // into snode.SstrUrowSend. The matching Irecv deserializes these bytes
 2210  // as U blocks, so this should send &snode.SstrUrowSend[0]; as written
 2210  // it transmits Lrow bytes (and reads out of bounds whenever
 2210  // SstrLrowSend is shorter than SizeSstrUrowSend).
 2210  MPI_Isend( (void*)&snode.SstrLrowSend[0], snode.SizeSstrUrowSend, MPI_BYTE,
 2211  iProcRow, IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_U_CONTENT),
 2212  this->grid_->colComm, &mpireqsSendUToBelow[2*iProcRow+1] );
 2213 #if ( _DEBUGlevel_ >= 1 )
 2214  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Sending U " << snode.SizeSstrUrowSend << " BYTES"<< std::endl << std::endl;
 2215  statusOFS<< "["<<snode.Index<<"] "<<"U SENT "<<std::endl;
 2216  for(Int ib=0;ib<Urow.size();++ib){statusOFS<<Urow[ib].blockIdx<<" ";}
 2217  statusOFS<<std::endl;
 2218  statusOFS << "["<<snode.Index<<"] "<< "Sending U "
 2219  << snode.SizeSstrUrowSend << " BYTES to P" << PNUM(iProcRow,MYCOL(this->grid_),this->grid_)
 2220  << std::endl << std::endl;
 2221 #endif
 2222  } // Send
 2223  } // for (iProcRow)
 2224  } // if I am the sender
2225 
2226 #if ( _DEBUGlevel_ >= 1 )
2227  statusOFS << "["<<snode.Index<<"] "
2228  << "Communication for the Ucol part." << std::endl
2229  << std::endl;
2230 #endif
2231  // Communication for the Ucol part.
2232  if( MYCOL( this->grid_ ) == PCOL( snode.Index, this->grid_ ) ){
2233  // Pack the data in Ucol
2234  TIMER_START(Serialize_UcolU);
2235  std::stringstream sstm;
2236  std::vector<Int> mask( UBlockMask::TOTAL_NUMBER, 1 );
2237  std::vector<UBlock<T> >& Ucol =
2238  this->Ucol( LBj(snode.Index, this->grid_) );
2239  // All blocks are to be sent down.
2240  serialize( (Int)Ucol.size(), sstm, NO_MASK );
2241  for( Int jb = 0; jb < Ucol.size(); jb++ ){
2242  UBlock<T> & UB = Ucol[jb];
2243  assert( UB.blockIdx > snode.Index );
2244  serialize( UB, sstm, mask );
2245  }
2246  snode.SstrUcolSend.resize( Size( sstm ) );
2247  sstm.read( &snode.SstrUcolSend[0], snode.SstrUcolSend.size() );
2248  snode.SizeSstrUcolSend = snode.SstrUcolSend.size();
2249  TIMER_STOP(Serialize_UcolU);
2250 
2251  for( Int iProcCol = 0;
2252  iProcCol < this->grid_->numProcCol ; iProcCol++ ){
2253  if( MYCOL( this->grid_ ) != iProcCol &&
2254  this->isSendToRight_( iProcCol, snode.Index ) == true ){
2255  // Use Isend to send to multiple targets
2256  MPI_Isend( &snode.SizeSstrUcolSend, 1, MPI_INT,
2257  iProcCol, IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_UCOL_SIZE),
2258  this->grid_->rowComm, &mpireqsSendUToRight[2*iProcCol] );
2259  MPI_Isend( &snode.SstrUcolSend[0],snode.SizeSstrUcolSend,
2260  MPI_BYTE, iProcCol,
2261  IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_UCOL_CONTENT),
2262  this->grid_->rowComm,
2263  &mpireqsSendUToRight[2*iProcCol+1] );
2264 #if ( _DEBUGlevel_ >= 2 )
2265  statusOFS<< "["<<snode.Index<<"] "<<"Ucol SENT "<<std::endl;
2266  for(Int ib=0;ib<Ucol.size();++ib){
2267  statusOFS<<Ucol[ib].blockIdx<<" ";
2268  }
2269  statusOFS<<std::endl;
2270 #endif
2271 #if ( _DEBUGlevel_ >= 1 )
2272  statusOFS << "["<<snode.Index<<"] "<< "Sending Ucol "
2273  << snode.SizeSstrUcolSend << " BYTES to P"
2274  << PNUM(MYROW(this->grid_),iProcCol,this->grid_)
2275  << std::endl << std::endl;
2276 #endif
2277  } // Send
2278  } // for (iProcCol)
2279  } // if I am the sender
2280 
2281 
2282 
2283  } //Senders
2284 
2285  //TODO Ideally, we should not receive data in sequence
2286  // but in any order with ksup packed with the data
2287 
2288  TIMER_START(WaitContentLU);
2289  // Receivers (Size)
2290  for (Int supidx=0; supidx<stepSuper ; supidx++){
2291  SuperNodeBufferTypeUnsym & snode = arrSuperNodes[supidx];
2292  MPI_Request * mpireqsRecvLFromAbove =
2293  &arrMpireqsRecvLSizeFromAny[supidx*2];
2294  MPI_Request * mpireqsRecvLFromLeft =
2295  &arrMpireqsRecvLSizeFromAny[supidx*2+1];
2296  MPI_Request * mpireqsRecvUFromAbove =
2297  &arrMpireqsRecvUSizeFromAny[supidx*2];
2298  MPI_Request * mpireqsRecvUFromLeft =
2299  &arrMpireqsRecvUSizeFromAny[supidx*2+1];
2300 
2301  // Receive the size first
2302  if( this->isRecvFromAbove_( snode.Index ) &&
2303  MYROW( this->grid_ ) != PROW( snode.Index, this->grid_ ) ){
2304  Int sender = PROW( snode.Index, this->grid_ );
2305  MPI_Irecv( &snode.SizeSstrLrowRecv, 1, MPI_INT, sender,
2306  IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_LROW_SIZE),
2307  this->grid_->colComm, mpireqsRecvLFromAbove );
2308 #if ( _DEBUGlevel_ >= 1 )
2309  statusOFS << "["<<snode.Index<<"] "<< "Receiving Lrow"
2310  <<" size on tag "<<IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_LROW_SIZE)
2311  <<" from P" << PNUM(sender,MYCOL(this->grid_),this->grid_)
2312  << std::endl << std::endl;
2313 #endif
2314  MPI_Irecv( &snode.SizeSstrUrowRecv, 1, MPI_INT, sender,
2315  IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_U_SIZE),
2316  this->grid_->colComm, mpireqsRecvUFromAbove );
2317 #if ( _DEBUGlevel_ >= 1 )
2318  statusOFS << "["<<snode.Index<<"] "<< "Receiving U"
2319  <<" size on tag "<<IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_U_SIZE)
2320  <<" from P" << PNUM(sender,MYCOL(this->grid_),this->grid_)
2321  << std::endl << std::endl;
2322 #endif
2323  } // if I need to receive from up
2324 
2325 
2326  if( this->isRecvFromLeft_( snode.Index ) &&
2327  MYCOL( this->grid_ ) != PCOL( snode.Index, this->grid_ ) ){
2328  Int sender = PCOL( snode.Index, this->grid_ );
2329  MPI_Irecv( &snode.SizeSstrLcolRecv, 1, MPI_INT, sender,
2330  IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_L_SIZE),
2331  this->grid_->rowComm, mpireqsRecvLFromLeft );
2332 #if ( _DEBUGlevel_ >= 1 )
2333  statusOFS << "["<<snode.Index<<"] "<< "Receiving L"
2334  <<" size on tag "<<IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_L_SIZE)
2335  <<" from P" << PNUM(MYROW(this->grid_),sender,this->grid_)
2336  << std::endl << std::endl;
2337 #endif
2338  MPI_Irecv( &snode.SizeSstrUcolRecv, 1, MPI_INT, sender,
2339  IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_UCOL_SIZE),
2340  this->grid_->rowComm, mpireqsRecvUFromLeft );
2341 #if ( _DEBUGlevel_ >= 1 )
2342  statusOFS << "["<<snode.Index<<"] "<< "Receiving Ucol"
2343  <<" size on tag "<<IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_UCOL_SIZE)
2344  <<" from P" << PNUM(MYROW(this->grid_),sender,this->grid_)
2345  << std::endl << std::endl;
2346 #endif
2347  } // if I need to receive from left
2348  }
2349  TIMER_STOP(WaitContentLU);
2350 
2351  //Wait to receive all the sizes for L
2352  TIMER_START(WaitContentLU);
2353  TIMER_START(WaitSize_LrowL);
2354  mpi::Waitall(arrMpireqsRecvLSizeFromAny);
2355  TIMER_STOP(WaitSize_LrowL);
2356 
2357  // Receivers (Content)
2358  for (Int supidx=0; supidx<stepSuper ; supidx++){
2359  SuperNodeBufferTypeUnsym & snode = arrSuperNodes[supidx];
2360 
2361  MPI_Request * mpireqsRecvFromAbove =
2362  &arrMpireqsRecvLContentFromAny[supidx*2];
2363  MPI_Request * mpireqsRecvFromLeft =
2364  &arrMpireqsRecvLContentFromAny[supidx*2+1];
2365 
2366  TIMER_START(Alloc_Buffer_Recv_LrowL);
2367  if( this->isRecvFromAbove_( snode.Index ) &&
2368  MYROW( this->grid_ ) != PROW( snode.Index, this->grid_ ) ){
2369  snode.SstrLrowRecv.resize( snode.SizeSstrLrowRecv );
2370  Int sender = PROW( snode.Index, this->grid_ );
2371  MPI_Irecv( &snode.SstrLrowRecv[0], snode.SizeSstrLrowRecv, MPI_BYTE,
2372  sender, IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_LROW_CONTENT),
2373  this->grid_->colComm, mpireqsRecvFromAbove );
2374 #if ( _DEBUGlevel_ >= 1 )
2375  statusOFS << "["<<snode.Index<<"] "<< "Receiving Lrow "
2376  << snode.SizeSstrLrowRecv << " BYTES from P"
2377  << PNUM(sender,MYCOL(this->grid_),this->grid_)
2378  << std::endl << std::endl;
2379 #endif
2380  } // if I need to receive from up
2381  TIMER_STOP(Alloc_Buffer_Recv_LrowL);
2382 
2383  TIMER_START(Alloc_Buffer_Recv_LcolL);
2384  if( this->isRecvFromLeft_( snode.Index ) &&
2385  MYCOL( this->grid_ ) != PCOL( snode.Index, this->grid_ ) ){
2386  snode.SstrLcolRecv.resize( snode.SizeSstrLcolRecv );
2387  Int sender = PCOL( snode.Index, this->grid_ );
2388  MPI_Irecv( &snode.SstrLcolRecv[0], snode.SizeSstrLcolRecv, MPI_BYTE,
2389  sender, IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_L_CONTENT),
2390  this->grid_->rowComm,
2391  mpireqsRecvFromLeft );
2392 #if ( _DEBUGlevel_ >= 1 )
2393  statusOFS << "["<<snode.Index<<"] "<< "Receiving L "
2394  << snode.SizeSstrLcolRecv << " BYTES from P"
2395  << PNUM(MYROW(this->grid_),sender,this->grid_)
2396  << std::endl << std::endl;
2397 #endif
2398  } // if I need to receive from left
2399  TIMER_STOP(Alloc_Buffer_Recv_LcolL);
2400  }
2401  TIMER_STOP(WaitContentLU);
2402 
2403 
2404  //Wait to receive all the sizes for U
2405  TIMER_START(WaitContentLU);
2406  TIMER_START(WaitSize_UcolU);
2407  mpi::Waitall(arrMpireqsRecvUSizeFromAny);
2408  TIMER_STOP(WaitSize_UcolU);
2409 
2410  // Receivers (Content)
2411  for (Int supidx=0; supidx<stepSuper ; supidx++){
2412  SuperNodeBufferTypeUnsym & snode = arrSuperNodes[supidx];
2413 
2414  MPI_Request * mpireqsRecvFromAbove =
2415  &arrMpireqsRecvUContentFromAny[supidx*2];
2416  MPI_Request * mpireqsRecvFromLeft =
2417  &arrMpireqsRecvUContentFromAny[supidx*2+1];
2418 
2419  TIMER_START(Alloc_Buffer_Recv_UrowL);
2420  if( this->isRecvFromAbove_( snode.Index ) &&
2421  MYROW( this->grid_ ) != PROW( snode.Index, this->grid_ ) ){
2422  snode.SstrUrowRecv.resize( snode.SizeSstrUrowRecv );
2423  Int sender = PROW( snode.Index, this->grid_ );
2424  MPI_Irecv( &snode.SstrUrowRecv[0], snode.SizeSstrUrowRecv, MPI_BYTE,
2425  sender, IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_U_CONTENT),
2426  this->grid_->colComm, mpireqsRecvFromAbove );
2427 #if ( _DEBUGlevel_ >= 1 )
2428  statusOFS << "["<<snode.Index<<"] "<< "Receiving U "
2429  << snode.SizeSstrUrowRecv << " BYTES from P"
2430  << PNUM(sender,MYCOL(this->grid_),this->grid_)
2431  << std::endl << std::endl;
2432 #endif
2433  } // if I need to receive from up
2434  TIMER_STOP(Alloc_Buffer_Recv_UrowL);
2435 
2436  TIMER_START(Alloc_Buffer_Recv_UcolL);
2437  if( this->isRecvFromLeft_( snode.Index ) &&
2438  MYCOL( this->grid_ ) != PCOL( snode.Index, this->grid_ ) ){
2439  snode.SstrUcolRecv.resize( snode.SizeSstrUcolRecv );
2440  Int sender = PCOL( snode.Index, this->grid_ );
2441  MPI_Irecv( &snode.SstrUcolRecv[0], snode.SizeSstrUcolRecv, MPI_BYTE,
2442  sender, IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_UCOL_CONTENT),
2443  this->grid_->rowComm,
2444  mpireqsRecvFromLeft );
2445 #if ( _DEBUGlevel_ >= 1 )
2446  statusOFS << "["<<snode.Index<<"] "<< "Receiving Ucol "
2447  << snode.SizeSstrUcolRecv << " BYTES from P"
2448  << PNUM(MYROW(this->grid_),sender,this->grid_)
2449  << std::endl << std::endl;
2450 #endif
2451  } // if I need to receive from left
2452  TIMER_STOP(Alloc_Buffer_Recv_UcolL);
2453  }
2454 
2455  TIMER_STOP(WaitContentLU);
2456 
2457 
2458  TIMER_START(Compute_Sinv_LU);
2459  {
2460  Int gemmProcessed = 0;
2461  Int gemmToDo = 0;
2462  // Int toRecvGemm = 0;
2463  //copy the list of supernodes we need to process
2464  std::vector<Int> readySupidx;
2465  //find local things to do
2466  for(Int supidx = 0;supidx<stepSuper;supidx++){
2467  SuperNodeBufferTypeUnsym & snode = arrSuperNodes[supidx];
2468  if( this->isRecvFromAbove_( snode.Index )
2469  && this->isRecvFromLeft_( snode.Index )){
2470  gemmToDo+=2;
2471  if( MYCOL( this->grid_ ) == PCOL( snode.Index, this->grid_ ) ){
2472  snode.isReady+=2;
2473  }
2474 
2475  if( MYROW( this->grid_ ) == PROW( snode.Index, this->grid_ ) ){
2476  snode.isReady+=2;
2477  }
2478 
2479  if(snode.isReady==4){
2480  readySupidx.push_back(supidx);
2481 #if ( _DEBUGlevel_ >= 1 )
2482  statusOFS<<"Locally processing ["<<snode.Index<<"]"<<std::endl;
2483 #endif
2484  }
2485  }
2486  else{
2487  TIMER_START(Reduce_Sinv_L_Send);
2488  if( this->isRecvFromLeft_( snode.Index )
2489  && MYCOL( this->grid_ ) != PCOL( snode.Index, this->grid_ ) ){
2490  MPI_Request & mpireqsSendToLeft = arrMpireqsSendLToLeft[supidx];
2491  // Dummy 0-b send If I was a receiver, I need to send my data to
2492  // proc in column of snode.Index
2493  MPI_Isend( NULL, 0, MPI_BYTE, PCOL(snode.Index,this->grid_),
2494  IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_L_REDUCE),
2495  this->grid_->rowComm, &mpireqsSendToLeft );
2496 
2497 #if ( _DEBUGlevel_ >= 1 )
2498  Int dst = PNUM(MYROW(this->grid_),
2499  PCOL(snode.Index,this->grid_),this->grid_);
2500  statusOFS << "["<<snode.Index<<"] "<< " LReduce P"
2501  << MYPROC(this->grid_) << " has sent "
2502  << 0 << " bytes to "
2503  << dst << std::endl;
2504 #endif
2505  }// if( isRecvFromLeft_( snode.Index ))
2506  TIMER_STOP(Reduce_Sinv_L_Send);
2507 
2508 
2509 
2510  }
2511  }
2512 
2513 #if ( _DEBUGlevel_ >= 1 )
2514  statusOFS<<std::endl<<"gemmToDo ="<<gemmToDo<<std::endl;
2515 #endif
2516 
2517 
2518 #if defined (PROFILE)
2519  end_SendULWaitContentFirst=0;
2520  begin_SendULWaitContentFirst=0;
2521 #endif
2522 
2523  while(gemmProcessed<gemmToDo)
2524  {
2525  Int reqidx = MPI_UNDEFINED;
2526  Int supidx = -1;
2527 
2528  TIMER_START(WaitContentLU);
2529  //while I don't have anything to do, wait for data to arrive
2530  do{
2531  int reqIndicesL[arrMpireqsRecvLContentFromAny.size()];
2532  int numRecv = 0;
2533 
2534  //then process with the remote ones
2535 
2536 #if defined(PROFILE)
2537  if(begin_SendULWaitContentFirst==0){
2538  begin_SendULWaitContentFirst=1;
2539  TIMER_START(WaitContent_LrowL_First);
2540  }
2541 #endif
2542 
2543  TIMER_START(WaitContent_LrowL);
2544  MPI_Waitsome(2*stepSuper, &arrMpireqsRecvLContentFromAny[0],
2545  &numRecv, reqIndicesL, MPI_STATUSES_IGNORE);
2546 
2547  for(int i =0;i<numRecv;i++){
2548  reqidx = reqIndicesL[i];
2549  //I've received something
2550  if(reqidx!=MPI_UNDEFINED){
2551 
2552  supidx = reqidx/2;
2553  SuperNodeBufferTypeUnsym & snode = arrSuperNodes[supidx];
2554  snode.isReady++;
2555 
2556 #if ( _DEBUGlevel_ >= 1 )
2557  statusOFS << "["<<snode.Index<<"] "<<"Received data for L"
2558  << " reqidx%2=" << reqidx%2
2559  << " is ready ?"<<snode.isReady<<std::endl;
2560 #endif
2561  //if we received both L and U, the supernode is ready
2562  if(snode.isReady==4){
2563  readySupidx.push_back(supidx);
2564 #if defined(PROFILE)
2565  if(end_SendULWaitContentFirst==0){
2566  TIMER_STOP(WaitContent_LrowL_First);
2567  end_SendULWaitContentFirst=1;
2568  }
2569 #endif
2570  }
2571  }
2572 
2573  }//end for waitsome
2574  TIMER_STOP(WaitContent_LrowL);
2575 
2576 
2577  int reqIndicesU[arrMpireqsRecvUContentFromAny.size()];
2578  numRecv = 0;
2579 
2580 
2581  TIMER_START(WaitContent_UcolU);
2582  MPI_Waitsome(2*stepSuper, &arrMpireqsRecvUContentFromAny[0],
2583  &numRecv, reqIndicesU, MPI_STATUSES_IGNORE);
2584 
2585  for(int i =0;i<numRecv;i++){
2586  reqidx = reqIndicesU[i];
2587  //I've received something
2588  if(reqidx!=MPI_UNDEFINED){
2589  supidx = reqidx/2;
2590  SuperNodeBufferTypeUnsym & snode = arrSuperNodes[supidx];
2591  snode.isReady++;
2592 
2593 #if ( _DEBUGlevel_ >= 1 )
2594  statusOFS << "["<<snode.Index<<"] "<<"Received data for U"
2595  << " reqidx%2=" << reqidx%2
2596  << " is ready ?"<<snode.isReady<<std::endl;
2597 #endif
2598  //if we received both L and U, the supernode is ready
2599  if(snode.isReady==4){
2600  readySupidx.push_back(supidx);
2601 #if defined(PROFILE)
2602  if(end_SendULWaitContentFirst==0){
2603  TIMER_STOP(WaitContent_LrowL_First);
2604  end_SendULWaitContentFirst=1;
2605  }
2606 #endif
2607  }
2608  }
2609 
2610  }//end for waitsome
2611  TIMER_STOP(WaitContent_UcolU);
2612 
2613  } while(readySupidx.size()==0);
2614  TIMER_STOP(WaitContentLU);
2615 
2616  //If I have some work to do
2617  if(readySupidx.size()>0)
2618  {
2619  supidx = readySupidx.back();
2620  readySupidx.pop_back();
2621  SuperNodeBufferTypeUnsym & snode = arrSuperNodes[supidx];
2622 
2623 
2624  // Only the processors received information participate in the Gemm
2625  if( this->isRecvFromAbove_( snode.Index )
2626  && this->isRecvFromLeft_( snode.Index ) ){
2627  std::vector<LBlock<T> > LcolRecv;
2628  std::vector<LBlock<T> > LrowRecv;
2629  std::vector<UBlock<T> > UcolRecv;
2630  std::vector<UBlock<T> > UrowRecv;
2631 
2632 //TODO REMOVE THIS THIS IS ONLY FOR DEBUGING PURPOSE
2633 NumMat<T> * pAinvBuf = new NumMat<T>();
2634 NumMat<T> * pUBuf = new NumMat<T>();
2635 NumMat<T> * pLBuf = new NumMat<T>();
2636 NumMat<T> & AinvBuf = *pAinvBuf;
2637 NumMat<T> & UBuf = *pUBuf;
2638 NumMat<T> & LBuf = *pLBuf;
2639 
2640  UnpackData(snode, LcolRecv, LrowRecv, UcolRecv, UrowRecv);
2641 
2642  SelInv_lookup_indexes(snode, LcolRecv, LrowRecv,
2643  UcolRecv, UrowRecv,AinvBuf,LBuf, UBuf);
2644 
2645 
2646 #if ( _DEBUGlevel_ >= 2 )
2647  statusOFS << "["<<snode.Index<<"] " << "LBuf: ";
2648  statusOFS << LBuf << std::endl;
2649  statusOFS << "["<<snode.Index<<"] " << "UBuf: ";
2650  statusOFS << UBuf << std::endl;
2651 #endif
2652 
2653  NumMat<T> LUpdateBuf;
2654  LUpdateBuf.Resize( AinvBuf.m(),
2655  SuperSize( snode.Index, this->super_ ) );
2656 
2657 
2658 
2659  TIMER_START(Compute_Sinv_L_Resize);
2660  snode.LUpdateBuf.Resize( AinvBuf.m(),
2661  SuperSize( snode.Index, this->super_ ) );
2662  TIMER_STOP(Compute_Sinv_L_Resize);
2663 
2664  TIMER_START(Compute_Sinv_LT_GEMM);
2665  blas::Gemm('N', 'T', AinvBuf.m(), LBuf.m(), AinvBuf.n(),
2666  MINUS_ONE<T>(), AinvBuf.Data(), AinvBuf.m(),
2667  LBuf.Data(), LBuf.m(), ZERO<T>(),
2668  snode.LUpdateBuf.Data(), snode.LUpdateBuf.m() );
2669  TIMER_STOP(Compute_Sinv_LT_GEMM);
2670 
2671  TIMER_START(Compute_Sinv_U_Resize);
2672  snode.UUpdateBuf.Resize( SuperSize( snode.Index, this->super_ ),
2673  AinvBuf.n() );
2674  TIMER_STOP(Compute_Sinv_U_Resize);
2675 
2676  TIMER_START(Compute_Sinv_U_GEMM);
2677  blas::Gemm('N', 'N', UBuf.m(), AinvBuf.n(), AinvBuf.m(),
2678  MINUS_ONE<T>(), UBuf.Data(), UBuf.m(),
2679  AinvBuf.Data(), AinvBuf.m(), ZERO<T>(),
2680  snode.UUpdateBuf.Data(), snode.UUpdateBuf.m() );
2681  TIMER_STOP(Compute_Sinv_U_GEMM);
2682 
2683 #if ( _DEBUGlevel_ >= 2 )
2684  statusOFS << "["<<snode.Index<<"] " << "snode.LUpdateBuf: ";
2685  statusOFS << snode.LUpdateBuf << std::endl;
2686  statusOFS << "["<<snode.Index<<"] " << "snode.UUpdateBuf: ";
2687  statusOFS << snode.UUpdateBuf << std::endl;
2688 #endif
2689 
2690 //TODO REMOVE THIS THIS IS ONLY FOR DEBUGING PURPOSE
2691 delete pLBuf;
2692 delete pUBuf;
2693 delete pAinvBuf;
2694 
2695 
2696  } // if Gemm is to be done locally
2697 
2698 
2699  TIMER_START(Reduce_Sinv_L_Send);
2700  // If I was a receiver, I need to send my data to proc in column
2701  // of snode.Index
2702  if( this->isRecvFromAbove_( snode.Index ) ){
2703  if( this->isRecvFromLeft_( snode.Index )
2704  && MYCOL( this->grid_ ) != PCOL( snode.Index, this->grid_ ) ){
2705  MPI_Request & reqsSendToLeft = arrMpireqsSendLToLeft[supidx];
2706  MPI_Isend( snode.LUpdateBuf.Data(), snode.LUpdateBuf.ByteSize(),
2707  MPI_BYTE, PCOL(snode.Index,this->grid_),
2708  IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_L_REDUCE),
2709  this->grid_->rowComm, &reqsSendToLeft );
2710 
2711 #if ( _DEBUGlevel_ >= 1 )
2712  Int dst = PNUM(MYROW(this->grid_),
2713  PCOL(snode.Index,this->grid_),this->grid_);
2714  statusOFS << "["<<snode.Index<<"] "<< " LReduce P"
2715  << MYPROC(this->grid_) << " has sent "
2716  << snode.LUpdateBuf.ByteSize() << " bytes to "
2717  << dst << std::endl;
2718 #endif
2719  }//Sender
2720  }
2721  TIMER_STOP(Reduce_Sinv_L_Send);
2722 
2723  gemmProcessed+=2;
2724 #if ( _DEBUGlevel_ >= 1 )
2725  statusOFS << "["<<snode.Index<<"] "<<"gemmProcessed = "
2726  << gemmProcessed<<"/"<<gemmToDo<<std::endl;
2727 #endif
2728  }
2729  }
2730 
2731  }
2732  TIMER_STOP(Compute_Sinv_LU);
2733 
2734 
2735 
2736 
2737 
2738 
2739 
2740  //Reduce Sinv L to the processors in PCOL(ksup,this->grid_)
2741  TIMER_START(Reduce_Sinv_L);
2742 
2743  for (Int supidx=0; supidx<stepSuper; supidx++){
2744  SuperNodeBufferTypeUnsym & snode = arrSuperNodes[supidx];
2745  if( MYCOL( this->grid_ ) == PCOL( snode.Index, this->grid_ ) ){
2746  //determine the number of rows in LUpdateBufReduced
2747  Int numRowLUpdateBuf;
2748  std::vector<LBlock<T> >& Lcol =
2749  this->L( LBj( snode.Index, this->grid_ ) );
2750 
2751  // If I own the diagonal block, skip the diagonal block
2752  Int offset =
2753  (MYROW( this->grid_ )==PROW(snode.Index,this->grid_))?1:0;
2754  snode.RowLocalPtr.resize( Lcol.size() + 1 - offset );
2755  snode.BlockIdxLocal.resize( Lcol.size() - offset );
2756  snode.RowLocalPtr[0] = 0;
2757  for( Int ib = 0; ib < snode.BlockIdxLocal.size(); ib++ ){
2758  snode.RowLocalPtr[ib+1] = snode.RowLocalPtr[ib]
2759  + Lcol[ib+offset].numRow;
2760  snode.BlockIdxLocal[ib] = Lcol[ib+offset].blockIdx;
2761  }
2762  numRowLUpdateBuf = *snode.RowLocalPtr.rbegin();
2763 
2764 
2765  if( numRowLUpdateBuf > 0 ){
2766  if( snode.LUpdateBuf.m() == 0 && snode.LUpdateBuf.n() == 0 ){
2767  snode.LUpdateBuf.Resize( numRowLUpdateBuf,
2768  SuperSize( snode.Index, this->super_ ) );
2769  // Fill zero is important
2770  SetValue( snode.LUpdateBuf, ZERO<T>() );
2771  }
2772  }
2773 
2774 #if ( _DEBUGlevel_ >= 2 )
2775  statusOFS << "["<<snode.Index<<"] "<<"LUpdateBuf Before Reduction: "
2776  << snode.LUpdateBuf << std::endl << std::endl;
2777 #endif
2778 
2779  Int totCountRecv = 0;
2780  Int numRecv = this->CountSendToRight(snode.Index);
2781  NumMat<T> LUpdateBufRecv(numRowLUpdateBuf,
2782  SuperSize( snode.Index, this->super_ ) );
2783  for( Int countRecv = 0; countRecv < numRecv ; ++countRecv ){
2784  //Do the blocking recv
2785  MPI_Status stat;
2786  Int size = 0;
2787  TIMER_START(L_RECV);
2788  MPI_Recv(LUpdateBufRecv.Data(), LUpdateBufRecv.ByteSize(),
2789  MPI_BYTE, MPI_ANY_SOURCE,
2790  IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_L_REDUCE),
2791  this->grid_->rowComm,&stat);
2792  TIMER_STOP(L_RECV);
2793  MPI_Get_count(&stat, MPI_BYTE, &size);
2794  //if the processor contributes
2795  if(size>0){
2796 #if ( _DEBUGlevel_ >= 1 )
2797  Int src = PNUM(MYROW(this->grid_),stat.MPI_SOURCE,this->grid_);
2798  statusOFS << "["<<snode.Index<<"] "<< " LReduce P"
2799  << MYPROC(this->grid_)<<" has received "<< size
2800  << " bytes from " << src << std::endl;
2801 #endif
2802 #if ( _DEBUGlevel_ >= 2 )
2803  statusOFS << "["<<snode.Index<<"] "<< "LUpdateBufRecv: "
2804  << LUpdateBufRecv << std::endl << std::endl;
2805 #endif
2806  //do the sum
2807  blas::Axpy(snode.LUpdateBuf.Size(), ONE<T>(),
2808  LUpdateBufRecv.Data(), 1,
2809  snode.LUpdateBuf.Data(), 1 );
2810  }
2811  } // for (iProcCol)
2812 #if ( _DEBUGlevel_ >= 2 )
2813  statusOFS << "["<<snode.Index<<"] "<< "LUpdateBuf After Reduction: "
2814  << snode.LUpdateBuf << std::endl << std::endl;
2815 #endif
2816  } // Receiver
2817  }
2818 
2819  TIMER_STOP(Reduce_Sinv_L);
2820  //--------------------- End of reduce of LUpdateBuf-------------------------
2821 
2822 
2823  mpi::Waitall( arrMpireqsSendLToLeft );
2824 
2825 
2826  if(next_lidx < superList.size()){
2827  IRecvContentCD(superList[next_lidx], superList[next_lidx].size(),nextCDBuffers);
2828  }
2829 
2830 
2831 
2832 
2833 #ifndef _RELEASE_
2834  PushCallStack("PMatrixUnsym::SelInv_P2p::UpdateD");
2835 #endif
2836 
2837  TIMER_START(Update_Diagonal);
2838  for (Int supidx=0; supidx<stepSuper; supidx++){
2839  SuperNodeBufferTypeUnsym & snode = arrSuperNodes[supidx];
2840 
2841  ComputeDiagUpdate(snode);
2842 
2843  if( MYCOL( this->grid_ ) == PCOL( snode.Index, this->grid_ ) ){
2844  if( MYROW( this->grid_ ) != PROW( snode.Index, this->grid_ ) ){
2845  if(this->isSendToDiagonal_(snode.Index)){
2846  //send to above
2847  MPI_Request & mpireqsSendToAbove = arrMpireqsSendToAbove[supidx];
2848  MPI_Isend( snode.DiagBuf.Data(), snode.DiagBuf.ByteSize(),
2849  MPI_BYTE, PROW(snode.Index,this->grid_) ,
2850  IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_D_REDUCE),
2851  this->grid_->colComm, &mpireqsSendToAbove );
2852 
2853 #if ( _DEBUGlevel_ >= 1 )
2854  statusOFS << "["<<snode.Index<<"] "<< " P"<<MYROW(this->grid_)
2855  <<" has sent "<< snode.DiagBuf.ByteSize()
2856  << " bytes of DiagBuf to "
2857  << PROW(snode.Index,this->grid_) << std::endl;
2858 #endif
2859  }
2860  }
2861  }
2862  }
2863 
2864  TIMER_STOP(Update_Diagonal);
2865 
2866 
2867 
2868  TIMER_START(Reduce_Diagonal);
2869 
2870  for (Int supidx=0; supidx<stepSuper; supidx++){
2871  SuperNodeBufferTypeUnsym & snode = arrSuperNodes[supidx];
2872  if( MYCOL( this->grid_ ) == PCOL( snode.Index, this->grid_ ) ){
2873  if( MYROW( this->grid_ ) == PROW( snode.Index, this->grid_ ) ){
2874  if(snode.DiagBuf.Size()==0){
2875  snode.DiagBuf.Resize( SuperSize( snode.Index, this->super_ ),
2876  SuperSize( snode.Index, this->super_ ));
2877  SetValue(snode.DiagBuf, ZERO<T>());
2878  }
2879  //receive from below
2880  Int totCountRecv = 0;
2881  Int numRecv = this->CountRecvFromBelow(snode.Index);
2882  NumMat<T> DiagBufRecv(snode.DiagBuf.m(),snode.DiagBuf.n());
2883 
2884  for( Int countRecv = 0; countRecv < numRecv ; ++countRecv ){
2885  //Do the blocking recv
2886  MPI_Status stat;
2887  Int size = 0;
2888  TIMER_START(D_RECV);
2889  MPI_Recv(DiagBufRecv.Data(), DiagBufRecv.ByteSize(), MPI_BYTE,
2890  MPI_ANY_SOURCE,IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_D_REDUCE),
2891  this->grid_->colComm,&stat);
2892  TIMER_STOP(D_RECV);
2893  MPI_Get_count(&stat, MPI_BYTE, &size);
2894  //if the processor contributes
2895  if(size>0){
2896  // Add DiagBufRecv to diagonal block.
2897  blas::Axpy(snode.DiagBuf.Size(), ONE<T>(),
2898  DiagBufRecv.Data(), 1,
2899  snode.DiagBuf.Data(), 1 );
2900  }
2901  }
2902  LBlock<T> & LB = this->L( LBj( snode.Index, this->grid_ ) )[0];
2903  blas::Axpy( LB.numRow * LB.numCol, ONE<T>(),
2904  snode.DiagBuf.Data(), 1,
2905  LB.nzval.Data(), 1 );
2906  }
2907 
2908  }
2909  }
2910 
2911 
2912  TIMER_STOP(Reduce_Diagonal);
2913 
2914 #ifndef _RELEASE_
2915  PopCallStack();
2916 #endif
2917 
2918 
2919  //Reduce U Sinv to the processors in PROW(ksup,this->grid_)
2920  TIMER_START(Reduce_Sinv_U);
2921 
2922  for (Int supidx=0; supidx<stepSuper; supidx++){
2923  SuperNodeBufferTypeUnsym & snode = arrSuperNodes[supidx];
2924 
2925 
2926  if( MYROW( this->grid_ ) == PROW( snode.Index, this->grid_ ) ){
2927  //determine the number of rows in UUpdateBufReduced
2928  Int numColUUpdateBuf;
2929  //FIXME U must be revised to store the same structure as L ?
2930  std::vector<UBlock<T> >& Urow =
2931  this->U( LBi( snode.Index, this->grid_ ) );
2932 
2933  snode.ColLocalPtr.resize( Urow.size() + 1 );
2934  snode.BlockIdxLocalU.resize( Urow.size() );
2935  snode.ColLocalPtr[0] = 0;
2936 
2937 // std::vector<LBlock<T> >& Lrow = this->Lrow( LBi( snode.Index, this->grid_ ) );
2938 // if(Lrow.size()>=Urow.size()){
2939 // statusOFS<<"UReduce first case"<<std::endl;
2940 // std::vector<Int> colPtrL(Lrow.size()+1);
2941 // colPtrL[0] = 0;
2942 // for( Int ib = 0; ib < Lrow.size(); ib++ ){
2943 // colPtrL[ib+1] = colPtrL[ib] + Lrow[ib].numCol;
2944 // }
2945 //
2946 //
2947 //
2948 // for( Int jb = 0; jb < Urow.size(); jb++ ){
2949 // Int indexL =0;
2950 // for( Int ib = 0; ib < Lrow.size(); ib++ ){
2951 // if(Lrow[ib].blockIdx == Urow[jb].blockIdx){
2952 // indexL = ib;
2953 // break;
2954 // }
2955 // }
2956 // statusOFS<<jb<<" vs "<<indexL<<std::endl;
2957 //
2958 // snode.ColLocalPtr[jb] = colPtrL[indexL];
2959 // //snode.ColLocalPtr[jb+1] = snode.ColLocalPtr[jb] + Urow[jb].numCol;
2960 // snode.BlockIdxLocalU[jb] = Lrow[indexL].blockIdx;
2961 // //snode.BlockIdxLocalU[jb] = Urow[jb].blockIdx;
2962 // }
2963 // snode.ColLocalPtr.back()=colPtrL.back();
2964 //
2965 // statusOFS<<colPtrL<<std::endl;
2966 // }
2967 // else{
2968 // statusOFS<<"UReduce second case"<<std::endl;
2969  Int urowsize = Urow.size();
2970  snode.ColLocalPtr[0] = 0;
2971  for( Int jb = 0; jb < Urow.size(); jb++ ){
2972  UBlock<T> & UB = Urow[jb];
2973  snode.ColLocalPtr[jb+1] = snode.ColLocalPtr[jb]+UB.numCol;
2974  snode.BlockIdxLocalU[jb] = UB.blockIdx;
2975  }
2976 // }
2977 
2978 #if ( _DEBUGlevel_ >= 2 )
2979  statusOFS << "["<<snode.Index<<"] "<<"UReduce collocalptr "
2980  << snode.ColLocalPtr << std::endl;
2981  statusOFS << "["<<snode.Index<<"] "<<"UReduce blockidxlocalU "
2982  << snode.BlockIdxLocalU << std::endl;
2983 #endif
2984 
2985  numColUUpdateBuf = *snode.ColLocalPtr.rbegin();
2986 
2987  if( numColUUpdateBuf > 0 ){
2988  if( snode.UUpdateBuf.m() == 0 && snode.UUpdateBuf.n() == 0 ){
2989  snode.UUpdateBuf.Resize( SuperSize( snode.Index, this->super_ ),
2990  numColUUpdateBuf );
2991  // Fill zero is important
2992  SetValue( snode.UUpdateBuf, ZERO<T>() );
2993  }
2994  }
2995 
2996 #if ( _DEBUGlevel_ >= 2 )
2997  statusOFS << "["<<snode.Index<<"] "<<"UUpdateBuf Before Reduction: "
2998  << snode.UUpdateBuf << std::endl << std::endl;
2999 #endif
3000 
3001  Int totCountRecv = 0;
3002 
3003  Int numRecv = this->CountSendToBelow(snode.Index);
3004 
3005  NumMat<T> UUpdateBufRecv(SuperSize( snode.Index, this->super_ ),
3006  numColUUpdateBuf );
3007 
3008 #if ( _DEBUGlevel_ >= 1 )
3009  statusOFS << "["<<snode.Index<<"] "<< " UReduce P"
3010  << MYPROC(this->grid_) << " can receive at most "
3011  << UUpdateBufRecv.ByteSize() << " bytes" << std::endl;
3012 #endif
3013  for( Int countRecv = 0; countRecv < numRecv ; ++countRecv ){
3014  //Do the blocking recv
3015  MPI_Status stat;
3016  Int size = 0;
3017  TIMER_START(U_RECV);
3018  MPI_Recv(UUpdateBufRecv.Data(), UUpdateBufRecv.ByteSize(),
3019  MPI_BYTE, MPI_ANY_SOURCE,
3020  IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_U_REDUCE),
3021  this->grid_->colComm,&stat);
3022  TIMER_STOP(U_RECV);
3023  MPI_Get_count(&stat, MPI_BYTE, &size);
3024  //if the processor contributes
3025  if(size>0){
3026 
3027 #if ( _DEBUGlevel_ >= 1 )
3028  Int src = PNUM(stat.MPI_SOURCE,MYCOL(this->grid_),this->grid_);
3029  statusOFS << "["<<snode.Index<<"] "<< " UReduce P"
3030  << MYPROC(this->grid_)<<" has received "
3031  << size << " bytes from P" << src << std::endl;
3032 #endif
3033 
3034 #if ( _DEBUGlevel_ >= 2 )
3035  statusOFS << "["<<snode.Index<<"] "<< "UUpdateBufRecv: "
3036  << UUpdateBufRecv << std::endl << std::endl;
3037 #endif
3038  //do the sum
3039  blas::Axpy(snode.UUpdateBuf.Size(), ONE<T>(),
3040  UUpdateBufRecv.Data(), 1,
3041  snode.UUpdateBuf.Data(), 1);
3042  }
3043  } // for (iProcRow)
3044 #if ( _DEBUGlevel_ >= 2 )
3045  statusOFS << "["<<snode.Index<<"] "<<"UUpdateBuf After Reduction: "
3046  << snode.UUpdateBuf << std::endl << std::endl;
3047 #endif
3048  } // Receiver
3049  else{
3050 
3051 
3052  TIMER_START(Reduce_Sinv_U_Send);
3053  if(!this->isRecvFromLeft_( snode.Index ) && this->isRecvFromAbove_( snode.Index ) ){
3054  MPI_Request & mpireqsSendToAbove = arrMpireqsSendUToAbove[supidx];
3055  // Dummy 0-b send If I was a receiver, I need to send my data to
3056  // proc in row of snode.Index
3057  MPI_Isend( NULL, 0, MPI_BYTE, PROW(snode.Index,this->grid_),
3058  IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_U_REDUCE),
3059  this->grid_->colComm, &mpireqsSendToAbove );
3060 
3061 #if ( _DEBUGlevel_ >= 1 )
3062  Int dst = PNUM(PROW(snode.Index,this->grid_),
3063  MYCOL(this->grid_),this->grid_);
3064  statusOFS << "["<<snode.Index<<"] "<< " UReduce P"
3065  << MYPROC(this->grid_) << " has sent "
3066  << 0 << " bytes to "
3067  << dst << std::endl;
3068 #endif
3069  }// if( isRecvFromAbove_( snode.Index ) )
3070  TIMER_STOP(Reduce_Sinv_U_Send);
3071 
3072 
3073  TIMER_START(Reduce_Sinv_U_Send);
3074  // If I was a receiver, I need to send my data to proc in row
3075  // of snode.Index
3076  if( this->isRecvFromAbove_( snode.Index ) && this->isRecvFromLeft_( snode.Index ) ){
3077  MPI_Request & reqsSendToAbove = arrMpireqsSendUToAbove[supidx];
3078  TIMER_START(Reduce_Sinv_U_Send_Isend);
3079  MPI_Isend( snode.UUpdateBuf.Data(), snode.UUpdateBuf.ByteSize(),
3080  MPI_BYTE, PROW(snode.Index,this->grid_),
3081  IDX_TO_TAG2(snode.Index,supidx,SELINV_TAG_U_REDUCE),
3082  this->grid_->colComm, &reqsSendToAbove );
3083  TIMER_STOP(Reduce_Sinv_U_Send_Isend);
3084 #if ( _DEBUGlevel_ >= 1 )
3085  Int dst = PNUM(PROW(snode.Index,this->grid_),
3086  MYCOL(this->grid_),this->grid_);
3087  statusOFS << "["<<snode.Index<<"] "<< " UReduce P"
3088  << MYPROC(this->grid_) << " has sent "
3089  << snode.UUpdateBuf.ByteSize() << " bytes to "
3090  << dst << std::endl;
3091 #endif
3092  }//Sender
3093  TIMER_STOP(Reduce_Sinv_U_Send);
3094 
3095 
3096  }
3097  }
3098 
3099  TIMER_STOP(Reduce_Sinv_U);
3100  //--------------------- End of reduce of UUpdateBuf-------------------------
3101 
3102  mpi::Waitall( arrMpireqsSendUToAbove );
3103 
3104 
3105 
3106 
3107 
3108 
3109  //Deallocate Lrow and Ucol
3110  for (Int supidx=0; supidx<stepSuper; supidx++){
3111  SuperNodeBufferTypeUnsym & snode = arrSuperNodes[supidx];
3112  if( MYROW( this->grid_ ) == PROW( snode.Index, this->grid_ )){
3113  std::vector<LBlock<T> >& Lrow = this->Lrow( LBi(snode.Index, this->grid_) );
3114  Lrow.clear();
3115  }
3116  if( MYCOL( this->grid_ ) == PCOL( snode.Index, this->grid_ )){
3117  std::vector<UBlock<T> >& Ucol = this->Ucol( LBj(snode.Index, this->grid_) );
3118  Ucol.clear();
3119  }
3120  }
3121 
3122 #ifndef _RELEASE_
3123  PushCallStack("PMatrixUnsym::SelInv_P2p::UpdateLFinal");
3124 #endif
3125 
3126  TIMER_START(Update_L);
3127 
3128  for (Int supidx=0; supidx<stepSuper; supidx++){
3129  SuperNodeBufferTypeUnsym & snode = arrSuperNodes[supidx];
3130 
3131 #if ( _DEBUGlevel_ >= 1 )
3132  statusOFS << std::endl << "["<<snode.Index<<"] "
3133  << "Finish updating the L part by filling LUpdateBufReduced"
3134  << " back to L" << std::endl << std::endl;
3135 #endif
3136 
3137  if( MYCOL( this->grid_ ) == PCOL( snode.Index, this->grid_ )
3138  && snode.LUpdateBuf.m() > 0 ){
3139  std::vector<LBlock<T> >& Lcol = this->L(LBj(snode.Index,this->grid_));
3140  //Need to skip the diagonal block if present
3141  Int startBlock =
3142  (MYROW( this->grid_ ) == PROW( snode.Index, this->grid_ ))?1:0;
3143  for( Int ib = startBlock; ib < Lcol.size(); ib++ ){
3144  LBlock<T> & LB = Lcol[ib];
3145  if(1|| (LB.numRow>0 && LB.numCol>0)){
3146  lapack::Lacpy( 'A', LB.numRow, LB.numCol,
3147  &snode.LUpdateBuf(snode.RowLocalPtr[ib-startBlock], 0),
3148  snode.LUpdateBuf.m(), LB.nzval.Data(), LB.numRow );
3149  }
3150  }
3151  } // Finish updating L
3152  } // for (snode.Index) : Main loop
3153 
3154 
3155  TIMER_STOP(Update_L);
3156 
3157 #ifndef _RELEASE_
3158  PopCallStack();
3159 #endif
3160 
3161 
3162 #ifndef _RELEASE_
3163  PushCallStack("PMatrixUnsym::SelInv_P2p::UpdateUFinal");
3164 #endif
3165 
3166  TIMER_START(Update_U);
3167 
3168  for (Int supidx=0; supidx<stepSuper; supidx++){
3169  SuperNodeBufferTypeUnsym & snode = arrSuperNodes[supidx];
3170 
3171 #if ( _DEBUGlevel_ >= 1 )
3172  statusOFS << "["<<snode.Index<<"] "
3173  << "Finish updating the U part by filling UUpdateBufReduced"
3174  << " back to U" << std::endl << std::endl;
3175 #endif
3176 
3177  if( MYROW( this->grid_ ) == PROW( snode.Index, this->grid_ )
3178  && snode.UUpdateBuf.m() > 0 ){
3179  std::vector<UBlock<T> >& Urow = this->U(LBi(snode.Index,this->grid_));
3180 
3181  for( Int jb = 0; jb < Urow.size(); jb++ ){
3182  UBlock<T> & UB = Urow[jb];
3183  if(1|| (UB.numRow>0 && UB.numCol>0)){
3184 #if ( _DEBUGlevel_ >= 1 )
3185  statusOFS << "["<<snode.Index<<"] "<<"Putting colptr "
3186  << snode.ColLocalPtr[jb] << " of UUpdateBuf "
3187  << snode.ColLocalPtr << " "
3188  << "in UB "<<UB.blockIdx<<std::endl;
3189 #endif
3190  //Indices follow L order... look at Lrow
3191  lapack::Lacpy( 'A', UB.numRow, UB.numCol,
3192  &snode.UUpdateBuf( 0, snode.ColLocalPtr[jb] ),
3193  snode.UUpdateBuf.m(), UB.nzval.Data(), UB.numRow );
3194 #if ( _DEBUGlevel_ >= 2 )
3195  statusOFS<< "["<<snode.Index<<"] "<<"UB: "<<UB.nzval<<std::endl;
3196 #endif
3197  }
3198  }
3199  } // Finish updating U
3200  } // for (snode.Index) : Main loop
3201 
3202 
3203  TIMER_STOP(Update_U);
3204 
3205 #ifndef _RELEASE_
3206  PopCallStack();
3207 #endif
3208 
3209 
3210 
3211  if(next_lidx < superList.size()){
3212  WaitContentLCD(superList[next_lidx], superList[next_lidx].size(),nextCDBuffers);
3213  WaitContentUCD(superList[next_lidx], superList[next_lidx].size(),nextCDBuffers);
3214  }
3215 
3216 
3217 
3218  TIMER_START(Barrier);
3219  if(next_lidx < superList.size()){
3220  nextCDBuffers.WaitAllSend();
3221  }
3222  mpi::Waitall(arrMpireqsRecvLContentFromAny);
3223  mpi::Waitall(arrMpireqsRecvUContentFromAny);
3224  //Sync for reduce L
3225  // mpi::Waitall( arrMpireqsSendToLeft );
3226  //Sync for reduce D
3227  mpi::Waitall(arrMpireqsSendToAbove);
3228 
3229  for (Int supidx=0; supidx<stepSuper; supidx++){
3230  Int ksup = superList[lidx][supidx];
3231  std::vector<MPI_Request> & mpireqsSendLToRight = arrMpireqsSendLToRight[supidx];
3232  std::vector<MPI_Request> & mpireqsSendLToBelow = arrMpireqsSendLToBelow[supidx];
3233  std::vector<MPI_Request> & mpireqsSendUToRight = arrMpireqsSendUToRight[supidx];
3234  std::vector<MPI_Request> & mpireqsSendUToBelow = arrMpireqsSendUToBelow[supidx];
3235 
3236  mpi::Waitall( mpireqsSendLToRight );
3237  mpi::Waitall( mpireqsSendLToBelow );
3238  mpi::Waitall( mpireqsSendUToRight );
3239  mpi::Waitall( mpireqsSendUToBelow );
3240 
3241  }
3242  TIMER_STOP(Barrier);
3243 
3244 
3245 
3246 #ifdef LIST_BARRIER
3247 #ifndef ALL_BARRIER
3248  if (this->options_->maxPipelineDepth!=-1)
3249 #endif
3250  {
3251  MPI_Barrier(this->grid_->comm);
3252  }
3253 #endif
3254 
3255  }
3256 
 template<typename T>
 // NOTE(review): the qualified signature line (PMatrixUnsym<T>::SelInv)
 // between "template<typename T>" and the brace was lost when this file
 // was extracted from its documentation rendering -- confirm against the
 // repository source.
 // Public entry point for the selected inversion of an unsymmetric
 // matrix: simply delegates to the point-to-point implementation.
 {
 this->SelInv_P2p ( );
 } // ----- end of method PMatrixUnsym::SelInv -----
3262 
 template<typename T>
 // NOTE(review): the qualified signature line (PMatrixUnsym<T>::SelInv_P2p)
 // was lost in extraction -- confirm against the repository source.
 //
 // Driver for the point-to-point selected inversion. The working set
 // partitions the supernodes into levels; levels are processed
 // sequentially here, and SelInvIntra_P2p performs the pipelined
 // communication/computation for all supernodes of one level.
 {
 TIMER_START(SelInv_P2p);

#ifndef _RELEASE_
 PushCallStack("PMatrixUnsym::SelInv_P2p");
#endif


 Int numSuper = this->NumSuper();

 // Main loop
 // superList[lidx] holds the supernode indices handled at level lidx.
 std::vector<std::vector<Int> > & superList = this->WorkingSet();
 Int numSteps = superList.size();

 for (Int lidx=0; lidx<numSteps ; lidx++){
 // stepSuper is currently unused here; the per-level supernode count is
 // re-derived inside SelInvIntra_P2p from superList[lidx].
 Int stepSuper = superList[lidx].size();
 this->SelInvIntra_P2p(lidx);
 }

#ifndef _RELEASE_
 PopCallStack();
#endif

 TIMER_STOP(SelInv_P2p);

 return ;
 } // ----- end of method PMatrixUnsym::SelInv_P2p -----
3292 
3293 
3294 
3295 
3296 
3297 
3298 
3299 
3300 
3301 
3302 
3303 
3304 
 template<typename T>
 // NOTE(review): the qualified signature line (PMatrixUnsym<T>::PreSelInv)
 // was lost in extraction -- confirm against the repository source.
 //
 // Prepares the distributed LU factors for the selected inversion.
 // Three sweeps over all supernodes ksup:
 //   (1) L(i,k) <- L(i,k) * L(k,k)^{-1}   : off-diagonal L panels,
 //       diagonal block broadcast down the process column;
 //   (2) U(k,j) <- U(k,k)^{-1} * U(k,j)   : off-diagonal U panels,
 //       diagonal block broadcast along the process row;
 //   (3) L(k,k) <- [L(k,k) * U(k,k)]^{-1} : diagonal blocks inverted in
 //       place with Getri on the owning process.
 // Optional CHECK_NORM sections recompute each solve/inversion and log
 // the residual norm to statusOFS.
 {
#ifndef _RELEASE_
 PushCallStack("PMatrixUnsym::PreSelInv");
#endif

 Int numSuper = this->NumSuper();

#ifndef _RELEASE_
 PushCallStack("L(i,k) <- L(i,k) * L(k,k)^{-1}");
#endif
#if ( _DEBUGlevel_ >= 1 )
 statusOFS << std::endl << "L(i,k) <- L(i,k) * L(k,k)^{-1}"
 << std::endl << std::endl;
#endif

 for( Int ksup = 0; ksup < numSuper; ksup++ ){
 if( MYCOL( this->grid_ ) == PCOL( ksup, this->grid_ ) ){
 // Broadcast the diagonal L block
 NumMat<T> nzvalLDiag;
 std::vector<LBlock<T> >& Lcol = this->L( LBj( ksup, this->grid_ ) );
 if( MYROW( this->grid_ ) == PROW( ksup, this->grid_ ) ){
 // The owner's first block of the column is the diagonal block.
 nzvalLDiag = Lcol[0].nzval;
 if( nzvalLDiag.m() != SuperSize(ksup, this->super_) ||
 nzvalLDiag.n() != SuperSize(ksup, this->super_) ){
#ifdef USE_ABORT
 abort();
#endif
 throw std::runtime_error(
 "The size of the diagonal block of L is wrong." );
 }
 } // Owns the diagonal block
 else {
 // Non-owners allocate a receive buffer of the broadcast size.
 nzvalLDiag.Resize(SuperSize(ksup, this->super_), SuperSize(ksup, this->super_));
 }
 MPI_Bcast( (void*)nzvalLDiag.Data(), nzvalLDiag.ByteSize(),
 MPI_BYTE, PROW( ksup, this->grid_ ), this->grid_->colComm );

 // Triangular solve
 for( Int ib = 0; ib < Lcol.size(); ib++ ){
 LBlock<T> & LB = Lcol[ib];
 // blockIdx > ksup skips the diagonal block; the "(1 || ...)" guard
 // deliberately disables the nonzero-size check.
 if( LB.blockIdx > ksup && (1 || (LB.numCol>0 && LB.numRow>0))){
#if ( _DEBUGlevel_ >= 2 )
 // Check the correctness of the triangular solve
 //for the first local column
// if( LBj( ksup, this->grid_ ) == 0 ){
// statusOFS << "Diag L(" << ksup << ", " << ksup << "): "
// << nzvalLDiag << std::endl;
// statusOFS << "Before solve L(" << LB.blockIdx << ", " << ksup
// << "): " << LB.nzval << std::endl;
// }
#endif
#if defined( CHECK_NORM )
 NumMat<T> Ljk = LB.nzval;
#endif
 // Right-solve against the unit lower-triangular diagonal factor:
 // LB <- LB * L(k,k)^{-1}.
 blas::Trsm( 'R', 'L', 'N', 'U', LB.numRow, LB.numCol,
 ONE<T>(), nzvalLDiag.Data(), LB.numCol,
 LB.nzval.Data(), LB.numRow );
#if defined( CHECK_NORM )
 NumMat<T> res = LB.nzval;
 //Compute L'(jk)U(kk) which should be Ljk
 blas::Trmm('R','L','N','U',res.m(),res.n(),ONE<T>(),nzvalLDiag.Data(),nzvalLDiag.m(),res.Data(),res.m());
 blas::Axpy(res.Size(), MINUS_ONE<T>(), Ljk.Data(), 1, res.Data(), 1 );
 double norm = lapack::Lange('F',res.m(),res.n(),res.Data(),res.m(),Ljk.Data());

 statusOFS << "After solve norm of residual of L(" << LB.blockIdx << ", " << ksup
 << "): " << norm << std::endl;

 // Check the correctness of the triangular solve
 // for the first local column
// if( LBj( ksup, this->grid_ ) == 0 ){
// statusOFS << "After solve L(" << LB.blockIdx << ", " << ksup
// << "): " << LB.nzval << std::endl;
// }
#endif
 }
 }
 } // if( MYCOL( this->grid_ ) == PCOL( ksup, this->grid_ ) )
 } // for (ksup)

#ifndef _RELEASE_
 PopCallStack();
#endif

#ifndef _RELEASE_
 PushCallStack("U(k,j) <- U(k,k)^{-1} * U(k,j)");
#endif
#if ( _DEBUGlevel_ >= 1 )
 statusOFS << std::endl << "U(k,j) <- U(k,k)^{-1} * U(k,j)"
 << std::endl << std::endl;
#endif
 for( Int ksup = 0; ksup < numSuper; ksup++ ){
 if( MYROW( this->grid_ ) == PROW( ksup, this->grid_ ) ){
 // Broadcast the diagonal L block
 // (U(k,k) is stored in the nzval of the first L block of the column).
 NumMat<T> nzvalUDiag;
 std::vector<UBlock<T> >& Urow = this->U( LBi( ksup, this->grid_ ) );
 if( MYCOL( this->grid_ ) == PCOL( ksup, this->grid_ ) ){
 std::vector<LBlock<T> >& Lcol = this->L( LBj( ksup, this->grid_ ) );
 nzvalUDiag = Lcol[0].nzval;
 if( nzvalUDiag.m() != SuperSize(ksup, this->super_) ||
 nzvalUDiag.n() != SuperSize(ksup, this->super_) ){
#ifdef USE_ABORT
 abort();
#endif
 throw std::runtime_error(
 "The size of the diagonal block of U is wrong." );
 }
 } // Owns the diagonal block
 else {
 // Non-owners allocate a receive buffer of the broadcast size.
 nzvalUDiag.Resize(SuperSize(ksup, this->super_), SuperSize(ksup, this->super_));
 }
 MPI_Bcast( (void*)nzvalUDiag.Data(), nzvalUDiag.ByteSize(),
 MPI_BYTE, PCOL( ksup, this->grid_ ), this->grid_->rowComm );

 // Triangular solve
 for( Int jb = 0; jb < Urow.size(); jb++ ){
 UBlock<T> & UB = Urow[jb];
 // Same deliberate "(1 || ...)" disabling of the size guard as in L.
 if( UB.blockIdx > ksup && (1||(UB.numCol>0 && UB.numRow>0))){
#if ( _DEBUGlevel_ >= 2 )
 // Check the correctness of the triangular solve for the first local column
// if( LBi( ksup, this->grid_ ) == 0 ){
// statusOFS << "Diag U(" << ksup << ", " << ksup << "): "
// << nzvalUDiag << std::endl;
// statusOFS << "Before solve U(" << ksup << ", " << UB.blockIdx
// << "): " << UB.nzval << std::endl;
// }
#endif
#if defined( CHECK_NORM )
 NumMat<T> Ukj = UB.nzval;
#endif
 // Left-solve against the non-unit upper-triangular factor:
 // UB <- U(k,k)^{-1} * UB.
 blas::Trsm( 'L', 'U', 'N', 'N', UB.numRow, UB.numCol,
 ONE<T>(), nzvalUDiag.Data(), UB.numRow,
 UB.nzval.Data(), UB.numRow );
#if defined( CHECK_NORM )
 NumMat<T> res = UB.nzval;
 //Compute U(kk) * U^(kj) which should be Ukj
 blas::Trmm('L','U','N','N',res.m(),res.n(),ONE<T>(),nzvalUDiag.Data(),nzvalUDiag.m(),res.Data(),res.m());
 blas::Axpy(res.Size(), MINUS_ONE<T>(), Ukj.Data(), 1, res.Data(), 1 );
 double norm = lapack::Lange('F',res.m(),res.n(),res.Data(),res.m(),Ukj.Data());
 statusOFS << "After solve, norm of residual of U(" << ksup << ", " << UB.blockIdx
 << "): " << norm << std::endl;

 // Check the correctness of the triangular solve for the first local column
// if( LBi( ksup, this->grid_ ) == 0 ){
// statusOFS << "After solve U(" << ksup << ", " << UB.blockIdx
// << "): " << UB.nzval << std::endl;
// }
#endif
 }
 }
 } // if( MYROW( this->grid_ ) == PROW( ksup, this->grid_ ) )
 } // for (ksup)

#ifndef _RELEASE_
 PopCallStack();
#endif

#ifndef _RELEASE_
 PushCallStack("L(i,i) <- [L(k,k) * U(k,k)]^{-1} ");
#endif
#if ( _DEBUGlevel_ >= 1 )
 statusOFS << std::endl << "L(i,i) <- [L(k,k) * U(k,k)]^{-1}" << std::endl
 << std::endl;
#endif

 for( Int ksup = 0; ksup < numSuper; ksup++ ){
 // Only the single process owning the diagonal block participates.
 if( MYROW( this->grid_ ) == PROW( ksup, this->grid_ ) &&
 MYCOL( this->grid_ ) == PCOL( ksup, this->grid_ ) ){
 IntNumVec ipiv( SuperSize( ksup, this->super_ ) );
 // Note that the pivoting vector ipiv should follow the FORTRAN
 // notation by adding the +1
 // (identity permutation 1..n: the stored factorization needs no
 // further row interchanges here).
 for(Int i = 0; i < SuperSize( ksup, this->super_ ); i++){
 ipiv[i] = i + 1;
 }
 LBlock<T> & LB = (this->L( LBj( ksup, this->grid_ ) ))[0];
#if ( _DEBUGlevel_ >= 2 )
 // Check the correctness of the matrix inversion
 // for the first local column
// statusOFS << "Factorized A (" << ksup << ", " << ksup << "): "
// << LB.nzval << std::endl;
#endif

#if defined( CHECK_NORM )
 NumMat<T> Lkk = LB.nzval;
#endif
 // In-place inversion of the packed LU diagonal block.
 lapack::Getri( SuperSize( ksup, this->super_ ), LB.nzval.Data(),
 SuperSize( ksup, this->super_ ), ipiv.Data() );

#if defined( CHECK_NORM )
 NumMat<T> res (LB.nzval.m(), LB.nzval.n());
 NumMat<T> Akk (LB.nzval.m(), LB.nzval.n());
 //rebuild A
 SetValue(Akk,ZERO<T>());
 //copy u into A
 for(Int i = 0; i<Akk.m();++i){
 for(Int j = i; j<Akk.n();++j){
 Akk(i,j)= Lkk(i,j);
 }
 }
 blas::Trmm('L','L','N','U',Akk.m(),Akk.n(),ONE<T>(),Lkk.Data(),Lkk.m(),Akk.Data(),Akk.m());

// statusOFS << "After inversion, original A(" << ksup << ", " << ksup
// << "): " << Akk << std::endl;

 //Compute U(kk) * U'(kj) which should be Ukj
 blas::Gemm('N','N',res.m(),res.n(),res.m(),ONE<T>(),LB.nzval.Data(),res.m(),Akk.Data(),res.m(),ZERO<T>(),res.Data(),res.m());
 for(Int i = 0; i<res.m();++i){
 res(i,i)-=ONE<T>();
 }

 double norm = lapack::Lange('F',res.m(),res.n(),res.Data(),res.m(),Lkk.Data());
 statusOFS << "After inversion, norm of residual of A(" << ksup << ", " << ksup
 << "): " << norm << std::endl;

 // Check the correctness of the matrix inversion
 // for the first local column
// statusOFS << "Inverted A (" << ksup << ", " << ksup << "): "
// << LB.nzval << std::endl;
#endif
 } // if I need to invert the diagonal block
 } // for (ksup)

#ifndef _RELEASE_
 PopCallStack();
#endif

#ifndef _RELEASE_
 PopCallStack();
#endif

 return ;
 } // ----- end of method PMatrixUnsym::PreSelInv -----
3554 
3555 
 template<typename T>
 // NOTE(review): the qualified signature line
 // (PMatrixUnsym<T>::ConstructCommunicationPattern) was lost in
 // extraction -- confirm against the repository source.
 // Thin forwarding wrapper: the communication pattern is built by the
 // point-to-point variant.
 {
 ConstructCommunicationPattern_P2p();
 } // ----- end of method PMatrixUnsym::ConstructCommunicationPattern -----
3561 
3562 
3563 
3564  template<typename T>
3566  {
3567 #ifndef _RELEASE_
3568  PushCallStack("PMatrixUnsym::ConstructCommunicationPattern_P2p");
3569 #endif
3570 
3571  TIMER_START(ConstructCommunicationPattern);
3572 
3573  Int numSuper = this->NumSuper();
3574 
3575  TIMER_START(Allocate);
3576 
3577 #ifndef _RELEASE_
3578  PushCallStack( "Initialize the communication pattern" );
3579 #endif
3580  this->isSendToBelow_.Resize(this->grid_->numProcRow, numSuper);
3581  this->isSendToRight_.Resize(this->grid_->numProcCol, numSuper);
3582  this->isSendToDiagonal_.Resize( numSuper );
3583  SetValue( this->isSendToBelow_, false );
3584  SetValue( this->isSendToRight_, false );
3585  SetValue( this->isSendToDiagonal_, false );
3586 
3587  this->isSendToCrossDiagonal_.Resize(this->grid_->numProcCol+1, numSuper );
3588  SetValue( this->isSendToCrossDiagonal_, false );
3589  this->isRecvFromCrossDiagonal_.Resize(this->grid_->numProcRow+1, numSuper );
3590  SetValue( this->isRecvFromCrossDiagonal_, false );
3591 
3592  this->isRecvFromAbove_.Resize( numSuper );
3593  this->isRecvFromLeft_.Resize( numSuper );
3594  this->isRecvFromBelow_.Resize( this->grid_->numProcRow, numSuper );
3595  SetValue( this->isRecvFromAbove_, false );
3596  SetValue( this->isRecvFromBelow_, false );
3597  SetValue( this->isRecvFromLeft_, false );
3598 #ifndef _RELEASE_
3599  PopCallStack();
3600 #endif
3601 
3602  TIMER_STOP(Allocate);
3603 
3604 
3605  TIMER_START(GetEtree);
3606  std::vector<Int> snodeEtree(this->NumSuper());
3607  this->GetEtree(snodeEtree);
3608  TIMER_STOP(GetEtree);
3609 
3610 #ifndef _RELEASE_
3611  PushCallStack( "Local column communication" );
3612 #endif
3613  // localColBlockRowIdx stores the nonzero block indices for each local block column.
3614  // The nonzero block indices including contribution from both L and U.
3615  // Dimension: numLocalBlockCol x numNonzeroBlock
3616  std::vector<std::vector<LBlock<T> > * > colBlockRowBuf;
3617  colBlockRowBuf.resize( numSuper );
3618  std::vector<std::vector<UBlock<T> > * > rowBlockColBuf;
3619  rowBlockColBuf.resize( numSuper );
3620  for( Int ksup = 0; ksup < numSuper; ksup++ ){
3621  if( MYCOL( this->grid_ ) == PCOL( ksup, this->grid_ )
3622  || MYROW( this->grid_ ) == PROW( ksup, this->grid_ ) ){
3623  colBlockRowBuf[ksup] = new std::vector<LBlock<T> >();
3624  if( MYROW( this->grid_ ) == PROW( ksup, this->grid_ ) ){
3625  rowBlockColBuf[ksup] = new std::vector<UBlock<T> >();
3626  }
3627  }
3628  }
3629 
3630 
3631 
3632 
3633  TIMER_START(Column_communication);
3634  for( Int ksup = 0; ksup < numSuper; ksup++ ){
3635  // All block columns perform independently
3636  if( MYCOL( this->grid_ ) == PCOL( ksup, this->grid_ ) ){
3637  // Communication
3638  std::vector< LBlock<T> > & Lcol = this->L( LBj(ksup, this->grid_ ) );
3639  std::vector< LBlock<T> > & LcolRecv = *colBlockRowBuf[ksup];
3640  std::vector<char> sendBuf;
3641  std::vector<char> recvBuf;
3642  Int localSize = 0;
3643 
3644  std::stringstream sstms,sstmr;
3645  std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
3646  mask[LBlockMask::NZVAL] = 0; // nzval is excluded
3647 
3648  // All blocks are to be allgathered within column.
3649  Int numLBlock, totalLBlock;
3650  numLBlock = Lcol.size();
3651  mpi::Allreduce ( &numLBlock, &totalLBlock, 1, MPI_SUM, this->grid_->colComm);
3652 
3653  //serialize( (Int)Lcol.size(), sstms, NO_MASK );
3654  for( Int ib = 0; ib < Lcol.size(); ib++ ){
3655  serialize( Lcol[ib], sstms, mask );
3656  }
3657  sendBuf.resize( Size( sstms ) );
3658  sstms.read( &sendBuf[0], sendBuf.size() );
3659  localSize = sendBuf.size();
3660 
3661  //allgather to get the localsizes and displacements
3662  TIMER_START(Allgatherv_Column_communication);
3663  if( this->grid_ -> mpisize != 1 )
3664  {
3665  mpi::Allgatherv( sendBuf, recvBuf, this->grid_->colComm );
3666  }
3667  else{
3668  recvBuf = sendBuf;
3669  }
3670  TIMER_STOP(Allgatherv_Column_communication);
3671 
3672 
3673  sstmr.write( &recvBuf[0], recvBuf.size() );
3674 
3675  LcolRecv.resize(totalLBlock);
3676 
3677  //deserialize( numLBlock, sstmr, NO_MASK );
3678 
3679  for( Int ib = 0; ib < totalLBlock; ib++ ){
3680  deserialize( LcolRecv[ib], sstmr, mask );
3681  }
3682 
3683 
3684  //Sort and make it unique
3685  std::sort(LcolRecv.begin(),LcolRecv.end(),LBlockComparator<T>);
3686  //auto last = std::unique(LcolRecv.begin(),LcolRecv.end(),LBlockComparator<T>);
3687  //LcolRecv.resize(last - LcolRecv.begin());
3688 #if ( _DEBUGlevel_ >= 1 )
3689  statusOFS<<"["<<ksup<<"] "<<"Lcol1: ";for(auto it = Lcol.begin();it != Lcol.end();++it){statusOFS<<it->blockIdx<<" ";}statusOFS<<endl;
3690  statusOFS<<"["<<ksup<<"] "<<"LcolRecv1: ";for(auto it = LcolRecv.begin();it != LcolRecv.end();++it){statusOFS<<it->blockIdx<<" ";}statusOFS<<endl;
3691 #endif
3692  } // if( MYCOL( this->grid_ ) == PCOL( ksup, this->grid_ ) )
3693  }
3694  TIMER_STOP(Column_communication);
3695 
3696 #ifndef _RELEASE_
3697  PopCallStack();
3698 #endif
3699 
3700 #ifndef _RELEASE_
3701  PushCallStack( "Local row communication" );
3702 #endif
3703  TIMER_START(Row_communication);
3704  for( Int ksup = 0; ksup < numSuper; ksup++ ){
3705  // All block columns perform independently
3706  if( MYROW( this->grid_ ) == PROW( ksup, this->grid_ ) ){
3707  // Communication
3708  std::vector< UBlock<T> > & Urow = this->U( LBi(ksup, this->grid_ ) );
3709  std::vector< UBlock<T> > & UrowRecv = *rowBlockColBuf[ksup];
3710  std::vector<char> sendBuf;
3711  std::vector<char> recvBuf;
3712  Int localSize = 0;
3713 
3714  std::stringstream sstms,sstmr;
3715  std::vector<Int> mask( UBlockMask::TOTAL_NUMBER, 1 );
3716  mask[UBlockMask::NZVAL] = 0; // nzval is excluded
3717 
3718  Int numUBlock, totalUBlock;
3719  numUBlock = Urow.size();
3720  mpi::Allreduce ( &numUBlock, &totalUBlock, 1, MPI_SUM, this->grid_->rowComm);
3721  // All blocks are to be allgathered within column.
3722  //serialize( (Int)Urow.size(), sstms, NO_MASK );
3723  for( Int jb = 0; jb < Urow.size(); jb++ ){
3724  serialize( Urow[jb], sstms, mask );
3725  }
3726  sendBuf.resize( Size( sstms ) );
3727  sstms.read( &sendBuf[0], sendBuf.size() );
3728  localSize = sendBuf.size();
3729 
3730  //allgather to get the localsizes and displacements
3731  TIMER_START(Allgatherv_Row_communication);
3732  if( this->grid_ -> mpisize != 1 )
3733  mpi::Allgatherv( sendBuf, recvBuf, this->grid_->rowComm );
3734  else
3735  recvBuf = sendBuf;
3736  TIMER_STOP(Allgatherv_Row_communication);
3737 
3738  sstmr.write( &recvBuf[0], recvBuf.size() );
3739 
3740  //deserialize( numUBlock, sstmr, NO_MASK );
3741 
3742  UrowRecv.resize(totalUBlock);
3743  for( Int jb = 0; jb < totalUBlock; jb++ ){
3744  deserialize( UrowRecv[jb], sstmr, mask );
3745  }
3746 
3747  //Sort and make it unique
3748  std::sort(UrowRecv.begin(),UrowRecv.end(),UBlockComparator<T>);
3749 // auto last = std::unique(UrowRecv.begin(),UrowRecv.end(),UBlockComparator<T>);
3750 // UrowRecv.resize(last - UrowRecv.begin());
3751 
3752 #if ( _DEBUGlevel_ >= 1 )
3753  statusOFS<<"["<<ksup<<"] "<<"Urow1: ";for(auto it = Urow.begin();it != Urow.end();++it){statusOFS<<it->blockIdx<<" ";}statusOFS<<endl;
3754  statusOFS<<"["<<ksup<<"] "<<"UrowRecv1: ";for(auto it = UrowRecv.begin();it != UrowRecv.end();++it){statusOFS<<it->blockIdx<<" ";}statusOFS<<endl;
3755 #endif
3756 
3757  } // if( MYROW( this->grid_ ) == PROW( ksup, this->grid_ ) )
3758  }
3759  TIMER_STOP(Row_communication);
3760 
3761 #ifndef _RELEASE_
3762  PopCallStack();
3763 #endif
3764 #ifndef _RELEASE_
3765  PushCallStack( "Local row communication" );
3766 #endif
3767  //Broadcast from diagonal processor and merge
3768  TIMER_START(Row_communication);
3769  for( Int ksup = 0; ksup < numSuper; ksup++ ){
3770  // All block columns perform independently
3771  if( MYROW( this->grid_ ) == PROW( ksup, this->grid_ ) ){
3772  // Communication
3773  std::vector< LBlock<T> > * pLcolRecv;
3774  std::vector< LBlock<T> > & LcolSend = *colBlockRowBuf[ksup];
3775  std::vector< UBlock<T> > & UrowRecv = *rowBlockColBuf[ksup];
3776  std::vector< LBlock<T> > Union;
3777  std::vector<char> sendBuf;
3778 
3779  std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
3780  mask[LBlockMask::NZVAL] = 0; // nzval is excluded
3781  if( this->grid_ -> mpisize != 1 ){
3782  pLcolRecv = new std::vector<LBlock<T> >();
3783  //diagonal processor
3784  if( MYCOL( this->grid_ ) == PCOL( ksup, this->grid_ ) ){
3785  Int localSize = 0;
3786  std::stringstream sstms;
3787 
3788  serialize( (Int)LcolSend.size(), sstms, NO_MASK );
3789  for( Int ib = 0; ib < LcolSend.size(); ib++ ){
3790  serialize( LcolSend[ib], sstms, mask );
3791  }
3792  sendBuf.resize( Size( sstms ) );
3793  sstms.read( &sendBuf[0], sendBuf.size() );
3794  localSize = sendBuf.size();
3795  }
3796  TIMER_START(Bcast_Row_communication);
3797  mpi::Bcast( sendBuf, PCOL(ksup,this->grid_), this->grid_->rowComm );
3798  TIMER_STOP(Bcast_Row_communication);
3799 
3800  std::stringstream sstmr;
3801  sstmr.write( &sendBuf[0], sendBuf.size() );
3802 
3803  Int numLBlock;
3804  deserialize( numLBlock, sstmr, NO_MASK );
3805  pLcolRecv->resize(numLBlock);
3806  for( Int ib = 0; ib < numLBlock; ib++ ){
3807  deserialize( (*pLcolRecv)[ib], sstmr, mask );
3808  }
3809  }
3810  else{
3811  pLcolRecv = &LcolSend;
3812  }
3813 
3814  //LcolRecv is sorted and with unique elements
3815  //UrowRecv is sorted and with unique elements
3816 
3817  auto result = back_inserter(Union);
3818  auto firstUrow = UrowRecv.begin();
3819  auto lastUrow = UrowRecv.end();
3820  auto firstLcol = pLcolRecv->begin();
3821  auto lastLcol = pLcolRecv->end();
3822  while (true)
3823  {
3824  if (firstUrow==lastUrow){
3825  result = std::copy(firstLcol,lastLcol,result);
3826  break;
3827  }
3828  if (firstLcol==lastLcol){
3829  for(auto it = firstUrow; it != lastUrow; ++it){
3830  LBlock<T> LB;
3831  LB.blockIdx = it->blockIdx;
3832  LB.numRow = it->numCol;
3833  LB.numCol = it->numRow;
3834  LB.rows = it->cols;
3835  *result = LB;
3836  ++result;
3837  }
3838  break;
3839  }
3840 
3841  if (firstUrow->blockIdx<firstLcol->blockIdx) {
3842  LBlock<T> LB;
3843  LB.blockIdx = firstUrow->blockIdx;
3844  LB.numRow = firstUrow->numCol;
3845  LB.numCol = firstUrow->numRow;
3846  LB.rows = firstUrow->cols;
3847  *result = LB;
3848  ++firstUrow;
3849  }
3850  else if (firstLcol->blockIdx<firstUrow->blockIdx) {
3851  *result = *firstLcol;
3852  ++firstLcol;
3853  }
3854  else {
3855  //compute set union between rows and cols
3856  std::set<Int> uRowCol;
3857  uRowCol.insert(&firstLcol->rows[0],
3858  &firstLcol->rows[0]+firstLcol->numRow);
3859  uRowCol.insert(&firstUrow->cols[0],
3860  &firstUrow->cols[0]+firstUrow->numCol);
3861 
3862  LBlock<T> LB;
3863  LB.blockIdx = firstUrow->blockIdx;
3864  LB.numRow = uRowCol.size();
3865  LB.numCol = firstUrow->numRow;
3866  LB.rows.Resize(uRowCol.size());
3867  std::copy(uRowCol.begin(),uRowCol.end(),&LB.rows[0]);
3868  *result = LB;
3869  ++firstUrow;
3870  ++firstLcol;
3871  }
3872  ++result;
3873  }
3874 
3875 #if ( _DEBUGlevel_ >= 1 )
3876  statusOFS<<"["<<ksup<<"] "<<"Lcol: ";for(auto it = pLcolRecv->begin();it != pLcolRecv->end();++it){statusOFS<<it->blockIdx<<" ";}statusOFS<<endl;
3877  statusOFS<<"["<<ksup<<"] "<<"Urow: ";for(auto it = UrowRecv.begin();it != UrowRecv.end();++it){statusOFS<<it->blockIdx<<" ";}statusOFS<<endl;
3878  statusOFS<<"["<<ksup<<"] "<<"Union: ";for(auto it = Union.begin();it != Union.end();++it){statusOFS<<it->blockIdx<<" ";}statusOFS<<endl;
3879 #endif
3880 
3881  //replace LcolSend by the union
3882  LcolSend.swap(Union);
3883 
3884  if( this->grid_ -> mpisize != 1 ){
3885  delete pLcolRecv;
3886  }
3887  } // if( MYROW( this->grid_ ) == PROW( ksup, this->grid_ ) )
3888  }
3889  TIMER_STOP(Row_communication);
3890 #ifndef _RELEASE_
3891  PopCallStack();
3892 #endif
3893 
3894 
3895 #ifndef _RELEASE_
3896  PushCallStack( "Local col communication" );
3897 #endif
3898  TIMER_START(Col_communication);
3899  for( Int ksup = 0; ksup < numSuper; ksup++ ){
3900  // All block columns perform independently
3901  if( MYCOL( this->grid_ ) == PCOL( ksup, this->grid_ ) ){
3902  // Communication
3903  std::vector< LBlock<T> > * pUnionRecv;
3904  std::vector< LBlock<T> > & UnionSend = *colBlockRowBuf[ksup];
3905 
3906  std::vector<char> sendBuf;
3907 
3908  std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
3909  mask[LBlockMask::NZVAL] = 0; // nzval is excluded
3910  if( this->grid_ -> mpisize != 1 ){
3911  pUnionRecv = new std::vector<LBlock<T> >();
3912  //diagonal processor
3913  if( MYROW( this->grid_ ) == PROW( ksup, this->grid_ ) ){
3914  Int localSize = 0;
3915  std::stringstream sstms;
3916 
3917  serialize( (Int)UnionSend.size(), sstms, NO_MASK );
3918  for( Int ib = 0; ib < UnionSend.size(); ib++ ){
3919  serialize( UnionSend[ib], sstms, mask );
3920  }
3921  sendBuf.resize( Size( sstms ) );
3922  sstms.read( &sendBuf[0], sendBuf.size() );
3923  localSize = sendBuf.size();
3924  }
3925  TIMER_START(Bcast_Col_communication);
3926  mpi::Bcast( sendBuf, PROW(ksup,this->grid_), this->grid_->colComm );
3927  TIMER_STOP(Bcast_Col_communication);
3928 
3929  std::stringstream sstmr;
3930  sstmr.write( &sendBuf[0], sendBuf.size() );
3931 
3932  Int numLBlock;
3933  deserialize( numLBlock, sstmr, NO_MASK );
3934  pUnionRecv->resize(numLBlock);
3935  for( Int ib = 0; ib < numLBlock; ib++ ){
3936  deserialize( (*pUnionRecv)[ib], sstmr, mask );
3937  }
3938 
3939 
3940  UnionSend.resize(pUnionRecv->size());
3941  std::copy(pUnionRecv->begin(),pUnionRecv->end(),UnionSend.begin());
3942  }
3943  else{
3944  pUnionRecv = &UnionSend;
3945  }
3946 
3947 #if ( _DEBUGlevel_ >= 1 )
3948  statusOFS<<"["<<ksup<<"] "<<"Union: ";for(auto it = UnionSend.begin();it != UnionSend.end();++it){statusOFS<<it->blockIdx<<" ";}statusOFS<<endl;
3949 #endif
3950 
3951  if( this->grid_ -> mpisize != 1 ){
3952  delete pUnionRecv;
3953  }
3954  } // if( MYCOL( this->grid_ ) == PCOL( ksup, this->grid_ ) )
3955  }
3956  TIMER_STOP(Col_communication);
3957 #ifndef _RELEASE_
3958  PopCallStack();
3959 #endif
3960 
3961 #ifndef _RELEASE_
3962  PushCallStack("Extending_UL");
3963 #endif
3964 
3965  for( Int ksup = 0; ksup < numSuper; ksup++ ){
3966  // All block columns perform independently
3967  if( MYROW( this->grid_ ) == PROW( ksup, this->grid_ ) ){
3968 
3969  std::vector< LBlock<T> > & Union = *colBlockRowBuf[ksup];
3970  std::vector< UBlock<T> > & Urow = this->U( LBi(ksup, this->grid_ ) );
3971  Int & LrowSize = this->LrowSize_[ LBi(ksup, this->grid_ ) ];
3972  TIMER_START(Computing_Lrow_size);
3973  //Allocate Lrow and extend Urow
3974  LrowSize = 0;
3975  for(auto it = Union.begin(); it!=Union.end();++it){
3976  Int Idx = it->blockIdx;
3977  if(Idx > ksup && (Idx % this->grid_->numProcCol) == MYCOL(this->grid_) ){
3978  ++LrowSize;
3979  }
3980  }
3981  TIMER_STOP(Computing_Lrow_size);
3982 
3983  TIMER_START(Extending_Urow);
3984  Urow.reserve(LrowSize);
3985 
3986  for(auto it = Union.begin(); it!=Union.end();++it){
3987  Int Idx = it->blockIdx;
3988  if(Idx > ksup && (Idx % this->grid_->numProcCol) == MYCOL(this->grid_) ){
3989  bool isFound = false;
3990  Int nextIdx = -1;
3991  for(Int jb = 0; jb < Urow.size(); ++jb){
3992  UBlock<T> & UB = Urow[jb];
3993  if(UB.blockIdx == Idx){
3994  isFound = true;
3995  nextIdx = jb;
3996  break;
3997  }
3998  if(UB.blockIdx > Idx){
3999  nextIdx = jb;
4000  break;
4001  }
4002  }
4003  if(!isFound){
4004  //push_back
4005  UBlock<T> UB;
4006  UB.blockIdx = Idx;
4007  UB.numRow = it->numCol;
4008  UB.numCol = it->numRow;
4009  UB.cols = it->rows;
4010  Urow.push_back(UB);
4011  UBlock<T> & UBl = Urow.back();
4012  UBl.nzval.Resize(UB.numRow,UB.numCol);
4013  SetValue(UBl.nzval,ZERO<T>());
4014  }
4015  else{
4016  //make sure blocks are the same size
4017  UBlock<T> & UB = Urow[nextIdx];
4018  assert(UB.numRow == it->numCol);
4019  if( UB.numCol != it->numRow ){
4020  NumMat<T> tmpNzval = UB.nzval;
4021  IntNumVec tmpCols = UB.cols;
4022 
4023  UB.numRow = it->numCol;
4024  UB.numCol = it->numRow;
4025  UB.cols = it->rows;
4026  UB.nzval.Resize(UB.numRow,UB.numCol);
4027  SetValue(UB.nzval,ZERO<T>());
4028 
4029  //now put nzvals back in place
4030  Int jOldCols = 0;
4031  for(Int j = 0; j<UB.numCol; ++j){
4032  Int newCol = UB.cols[j];
4033  if(jOldCols<tmpCols.m()){
4034  Int oldCol = tmpCols[jOldCols];
4035  if(newCol == oldCol){
4036  T * nzcolPtr = tmpNzval.VecData(jOldCols);
4037  std::copy(nzcolPtr,nzcolPtr+UB.numRow,UB.nzval.VecData(j));
4038  jOldCols++;
4039  }
4040  }
4041  else{
4042  break;
4043  }
4044  }
4045  assert(jOldCols>=tmpCols.m());
4046 
4047  }
4048 
4049  }
4050  }
4051  }
4052 #if ( _DEBUGlevel_ >= 1 )
4053  statusOFS<<"["<<ksup<<"] "<<"LrowSize = "<<LrowSize<<std::endl;
4054 #endif
4055  std::sort(Urow.begin(),Urow.end(),UBlockComparator<T>);
4056 #if ( _DEBUGlevel_ >= 1 )
4057  statusOFS<<"["<<ksup<<"] "<<"Urow extended: ";for(auto it = Urow.begin();it != Urow.end();++it){statusOFS<<it->blockIdx<<" ";}statusOFS<<endl;
4058 #endif
4059  TIMER_STOP(Extending_Urow);
4060  } // if( MYROW( this->grid_ ) == PROW( ksup, this->grid_ ) )
4061  } // for(ksup)
4062 
4063  for( Int ksup = 0; ksup < numSuper; ksup++ ){
4064  // All block columns perform independently
4065  if( MYCOL( this->grid_ ) == PCOL( ksup, this->grid_ ) ){
4066 
4067  std::vector< LBlock<T> > & Union = *colBlockRowBuf[ksup];
4068  std::vector< LBlock<T> > & Lcol = this->L( LBj(ksup, this->grid_ ) );
4069  Int & UcolSize = this->UcolSize_[ LBj(ksup, this->grid_ ) ];
4070  TIMER_START(Computing_Ucol_size);
4071  //Allocate Ucol and extend Lcol
4072  UcolSize = 0;
4073  for(auto it = Union.begin(); it!=Union.end();++it){
4074  Int Idx = it->blockIdx;
4075  if(Idx > ksup && (Idx % this->grid_->numProcRow) == MYROW(this->grid_) ){
4076  ++UcolSize;
4077  }
4078  }
4079  TIMER_STOP(Computing_Ucol_size);
4080 
4081  TIMER_START(Extending_Lcol);
4082  Lcol.reserve(UcolSize);
4083 
4084  for(auto it = Union.begin(); it!=Union.end();++it){
4085  Int Idx = it->blockIdx;
4086  if(Idx > ksup && (Idx % this->grid_->numProcRow) == MYROW(this->grid_) ){
4087  bool isFound = false;
4088  Int nextIdx = -1;
4089  for(Int ib = 0; ib < Lcol.size(); ++ib){
4090  LBlock<T> & LB = Lcol[ib];
4091  if(LB.blockIdx == Idx){
4092  isFound = true;
4093  nextIdx = ib;
4094  break;
4095  }
4096  if(LB.blockIdx > Idx){
4097  nextIdx = ib;
4098  break;
4099  }
4100  }
4101  if(!isFound){
4102  //push_back
4103  Lcol.push_back(*it);
4104  LBlock<T> & LB = Lcol.back();
4105  LB.nzval.Resize(LB.numRow,LB.numCol);
4106  SetValue(LB.nzval,ZERO<T>());
4107  }
4108  else{
4109  //make sure blocks are the same size
4110  LBlock<T> & LB = Lcol[nextIdx];
4111  assert(LB.numCol == it->numCol);
4112  if( LB.numRow != it->numRow ){
4113  NumMat<T> tmpNzval = LB.nzval;
4114  IntNumVec tmpRows = LB.rows;
4115 
4116  LB.numRow = it->numRow;
4117  LB.numCol = it->numCol;
4118  LB.rows = it->rows;
4119  LB.nzval.Resize(LB.numRow,LB.numCol);
4120  SetValue(LB.nzval,ZERO<T>());
4121 
4122  //now put nzvals back in place
4123  Int iOldRows = 0;
4124  for(Int i = 0; i<LB.numRow; ++i){
4125  Int newRow = LB.rows[i];
4126  if(iOldRows<tmpRows.m()){
4127  Int oldRow = tmpRows[iOldRows];
4128  if(newRow == oldRow){
4129  for(Int j = 0; j<LB.numCol; ++j){
4130  LB.nzval(i,j) = tmpNzval(iOldRows,j);
4131  }
4132  iOldRows++;
4133  }
4134  }
4135  else{
4136  break;
4137  }
4138  }
4139  assert(iOldRows>=tmpRows.m());
4140 
4141  }
4142 
4143  }
4144  }
4145  }
4146 #if ( _DEBUGlevel_ >= 1 )
4147  statusOFS<<"["<<ksup<<"] "<<"UcolSize = "<<UcolSize<<std::endl;
4148 #endif
4149  std::sort(Lcol.begin(),Lcol.end(),LBlockComparator<T>);
4150 #if ( _DEBUGlevel_ >= 1 )
4151  statusOFS<<"["<<ksup<<"] "<<"Lcol extended: ";for(auto it = Lcol.begin();it != Lcol.end();++it){statusOFS<<it->blockIdx<<" ";}statusOFS<<endl;
4152 #endif
4153  TIMER_STOP(Extending_Lcol);
4154  } // if( MYCOL( this->grid_ ) == PCOL( ksup, this->grid_ ) )
4155  } // for(ksup)
4156 
4157 #ifndef _RELEASE_
4158  PopCallStack();
4159 #endif
4160 
4161 
4162 
4163 #ifndef _RELEASE_
4164  PushCallStack("Compute_full_row_struct");
4165 #endif
4166 
4167  //pointers to next non zero block
4168  std::vector<Int> nextNZColBlockRow;
4169  nextNZColBlockRow.resize( this->NumLocalBlockCol(), 0 );
4170 
4171  std::vector<Int> nextNZRowBlockCol;
4172  nextNZRowBlockCol.resize( this->NumLocalBlockRow() , 0);
4173 
4174  if(1){
4175  for( Int ksup = 0; ksup < numSuper; ksup++ ){
4176  // All block columns perform independently
4177  std::vector<Int> tAllBlockRowIdx;
4178  if( MYCOL( this->grid_ ) == PCOL( ksup, this->grid_ ) ){
4179  // Communication
4180  std::vector< LBlock<T> > & Union = *colBlockRowBuf[ksup];
4181 
4182  std::vector<Int> tlocalBlockRowIdx;
4183  //look at previous supernodes (ideally look at children in the etree only)
4184 
4185  for( Int jsup = 0; jsup < ksup; jsup++ ){
4186  //look into U
4187  Int pjsup = snodeEtree[jsup];
4188  if(pjsup<=ksup && MYROW( this->grid_ ) == PROW( jsup, this->grid_ ) ){
4189  // if( MYCOL( this->grid_ ) == PCOL( jsup, this->grid_ ) ){
4190  std::vector< UBlock<T> > & Urow = this->U( LBi(jsup, this->grid_ ) );
4191 
4192 
4193 
4194 #if ( _DEBUGlevel_ >= 1 )
4195  statusOFS<<"["<<ksup<<"] "<<"Urow["<<jsup<<"]: "; for(auto it = Urow.begin(); it != Urow.end(); ++it){ statusOFS<<it->blockIdx<<" "; } statusOFS<<endl;
4196 #endif
4197 
4198  Int & nextIdx = nextNZRowBlockCol[ LBi(jsup, this->grid_) ];
4199  while(nextIdx<Urow.size()){
4200  if(Urow[nextIdx].blockIdx < ksup){
4201  ++nextIdx;
4202  }
4203  else{
4204  break;
4205  }
4206  }
4207  if(nextIdx<Urow.size()){
4208  if(Urow[nextIdx].blockIdx == ksup){
4209  tlocalBlockRowIdx.push_back(jsup);
4210  }
4211  }
4212  // }
4213  }
4214  }
4215 
4216 
4217 
4218 
4219 
4220  TIMER_START(Allgatherv_Column_communication);
4221  if( this->grid_ -> mpisize != 1 )
4222  mpi::Allgatherv( tlocalBlockRowIdx, tAllBlockRowIdx, this->grid_->colComm );
4223  else
4224  tAllBlockRowIdx = tlocalBlockRowIdx;
4225 
4226  TIMER_STOP(Allgatherv_Column_communication);
4227 
4228  //add dummy LBlocks to Union
4229 
4230 
4231  for(auto it = tAllBlockRowIdx.begin(); it != tAllBlockRowIdx.end(); ++it ){
4232  LBlock<T> LB;
4233  LB.numCol = 0;
4234  LB.numRow = 0;
4235  LB.blockIdx = *it;
4236  Union.push_back(LB);
4237  }
4238 
4239  std::sort(Union.begin(),Union.end(),LBlockComparator<T>);
4240 
4241 #if ( _DEBUGlevel_ >= 1 )
4242  statusOFS<<"["<<ksup<<"] "<<"NEW Union: ";
4243  for(auto it = Union.begin(); it != Union.end(); ++it){ statusOFS<<it->blockIdx<<" "; }
4244  statusOFS<<endl;
4245 #endif
4246 
4247  } // if( MYCOL( this->grid_ ) == PCOL( ksup, this->grid_ ) )
4248  }
4249 
4250  }
4251 
4252  for( Int ksup = 0; ksup < numSuper; ksup++ ){
4253  // All block columns perform independently
4254  std::vector<Int> tAllBlockColIdx;
4255  if( MYROW( this->grid_ ) == PROW( ksup, this->grid_ ) ){
4256  // Communication
4257  std::vector< LBlock<T> > & Union = *colBlockRowBuf[ksup];
4258  std::vector<Int> tlocalBlockColIdx;
4259  //look at previous supernodes (ideally look at children in the etree only)
4260  for( Int jsup = 0; jsup < ksup; jsup++ ){
4261  //look into U
4262  Int pjsup = snodeEtree[jsup];
4263  if(pjsup<=ksup && MYCOL( this->grid_ ) == PCOL( jsup, this->grid_ ) ){
4264  std::vector< LBlock<T> > & Lcol = this->L( LBj(jsup, this->grid_ ) );
4265 
4266 
4267 
4268 #if ( _DEBUGlevel_ >= 1 )
4269  statusOFS<<"["<<ksup<<"] "<<"Lcol["<<jsup<<"]: "; for(auto it = Lcol.begin(); it != Lcol.end(); ++it){ statusOFS<<it->blockIdx<<" "; } statusOFS<<endl;
4270 #endif
4271 
4272  Int & nextIdx = nextNZColBlockRow[ LBj(jsup, this->grid_) ];
4273  while(nextIdx<Lcol.size()){
4274  if(Lcol[nextIdx].blockIdx < ksup){
4275  ++nextIdx;
4276  }
4277  else{
4278  break;
4279  }
4280  }
4281  if(nextIdx<Lcol.size()){
4282  if(Lcol[nextIdx].blockIdx == ksup){
4283  tlocalBlockColIdx.push_back(jsup);
4284  }
4285  }
4286  }
4287  }
4288 
4289 
4290 
4291 
4292 
4293 
4294 
4295 
4296 
4297 
4298  TIMER_START(Allgatherv_Row_communication);
4299  if( this->grid_ -> mpisize != 1 )
4300  mpi::Allgatherv( tlocalBlockColIdx, tAllBlockColIdx, this->grid_->rowComm );
4301  else
4302  tAllBlockColIdx = tlocalBlockColIdx;
4303 
4304  TIMER_STOP(Allgatherv_Row_communication);
4305 
4306  //add dummy LBlocks to Union
4307 
4308 
4309  for(auto it = tAllBlockColIdx.begin(); it != tAllBlockColIdx.end(); ++it ){
4310  LBlock<T> LB;
4311  LB.numCol = 0;
4312  LB.numRow = 0;
4313  LB.blockIdx = *it;
4314  Union.push_back(LB);
4315  }
4316 
4317  std::sort(Union.begin(),Union.end(),LBlockComparator<T>);
4318  auto last = std::unique(Union.begin(),Union.end(),LBlockEqualComparator<T>);
4319  Union.erase(last,Union.end());
4320 
4321 #if ( _DEBUGlevel_ >= 1 )
4322  statusOFS<<"["<<ksup<<"] "<<"NEW Union: ";
4323  for(auto it = Union.begin(); it != Union.end(); ++it){ statusOFS<<it->blockIdx<<" "; }
4324  statusOFS<<endl;
4325 #endif
4326 
4327  } // if( MYROW( this->grid_ ) == PROW( ksup, this->grid_ ) )
4328  }
4329 
4330 
4331 #ifndef _RELEASE_
4332  PopCallStack();
4333 #endif
4334 
4335 
4336 
4337  TIMER_START(STB_RFA);
4338 #ifndef _RELEASE_
4339  PushCallStack("SendToBelow / RecvFromAbove");
4340 #endif
4341  for( Int ksup = 0; ksup < numSuper - 1; ksup++ ){
4342  // Loop over all the supernodes to the right of ksup
4343 
4344 
4345  Int jsup = snodeEtree[ksup];
4346  while(jsup<numSuper){
4347  Int jsupLocalBlockCol = LBj( jsup, this->grid_ );
4348  Int jsupProcCol = PCOL( jsup, this->grid_ );
4349  if( MYCOL( this->grid_ ) == jsupProcCol ){
4350  std::vector< LBlock<T> > & Union = *colBlockRowBuf[jsup];
4351 
4352  // SendToBelow / RecvFromAbove only if (ksup, jsup) is nonzero.
4353  Int isFound = 0;
4354  for(auto it = Union.begin();it!=Union.end();++it){
4355  if(it->blockIdx == ksup){
4356  isFound = 1;
4357  break;
4358  }
4359  else if(it->blockIdx > ksup){
4360  break;
4361  }
4362  }
4363  if( isFound ) {
4364  for( auto si = Union.begin(); si != Union.end(); si++ ){
4365  Int isup = si->blockIdx;
4366  Int isupProcRow = PROW( isup, this->grid_ );
4367  if( isup > ksup ){
4368  if( MYROW( this->grid_ ) == isupProcRow ){
4369  this->isRecvFromAbove_(ksup) = true;
4370  }
4371  if( MYROW( this->grid_ ) == PROW( ksup, this->grid_ ) ){
4372  this->isSendToBelow_( isupProcRow, ksup ) = true;
4373  }
4374  } // if( isup > ksup )
4375  } // for (si)
4376  } // if( isFound )
4377 
4378  } // if( MYCOL( this->grid_ ) == PCOL( jsup, this->grid_ ) )
4379  jsup = snodeEtree[jsup];
4380  } // for(jsup)
4381  } // for(ksup)
4382 
4383 #if ( _DEBUGlevel_ >= 1 )
4384  statusOFS << std::endl << "this->isSendToBelow:" << std::endl;
4385  for(int j = 0;j< this->isSendToBelow_.n();j++){
4386  statusOFS << "["<<j<<"] ";
4387  for(int i =0; i < this->isSendToBelow_.m();i++){
4388  statusOFS<< this->isSendToBelow_(i,j) << " ";
4389  }
4390  statusOFS<<std::endl;
4391  }
4392 
4393  statusOFS << std::endl << "this->isRecvFromAbove:" << std::endl;
4394  for(int j = 0;j< this->isRecvFromAbove_.m();j++){
4395  statusOFS << "["<<j<<"] "<< this->isRecvFromAbove_(j)<<std::endl;
4396  }
4397 #endif
4398 
4399 #ifndef _RELEASE_
4400  PopCallStack();
4401 #endif
4402 
4403 
4404  TIMER_STOP(STB_RFA);
4405 
4406 
4407 
4408 
4409 
4410 
4411 
4412 
4413  TIMER_START(STR_RFL_RFB);
4414 
4415 
4416 #ifndef _RELEASE_
4417  PushCallStack("SendToRight / RecvFromLeft");
4418 #endif
4419  for( Int ksup = 0; ksup < numSuper - 1; ksup++ ){
4420  // Loop over all the supernodes below ksup
4421 
4422  Int isup = snodeEtree[ksup];
4423  while(isup<numSuper){
4424  Int isupLocalBlockRow = LBi( isup, this->grid_ );
4425  Int isupProcRow = PROW( isup, this->grid_ );
4426  if( MYROW( this->grid_ ) == isupProcRow ){
4427  std::vector< LBlock<T> > & Union = *colBlockRowBuf[isup];
4428 
4429  // SendToRight / RecvFromLeft only if (isup, ksup) is nonzero.
4430  Int isFound = 0;
4431  for(auto it = Union.begin();it!=Union.end();++it){
4432  if(it->blockIdx == ksup){
4433  isFound = 1;
4434  break;
4435  }
4436  else if(it->blockIdx > ksup){
4437  break;
4438  }
4439  }
4440  if( isFound ) {
4441  for( auto si = Union.begin(); si != Union.end(); si++ ){
4442  Int jsup = si->blockIdx;
4443  Int jsupProcCol = PCOL( jsup, this->grid_ );
4444  if( jsup > ksup ){
4445  if( MYCOL( this->grid_ ) == jsupProcCol ){
4446  this->isRecvFromLeft_(ksup) = true;
4447  }
4448  if( MYCOL( this->grid_ ) == PCOL( ksup, this->grid_ ) ){
4449  this->isSendToRight_( jsupProcCol, ksup ) = true;
4450  }
4451  }
4452  } // for (si)
4453  } // if( isFound )
4454  } // if( MYROW( this->grid_ ) == isupProcRow )
4455 
4456  if( MYCOL( this->grid_ ) == PCOL(ksup, this->grid_) ){
4457  if( MYROW( this->grid_ ) == PROW( ksup, this->grid_ ) ){
4458  this->isRecvFromBelow_(isupProcRow,ksup) = true;
4459  }
4460  else if (MYROW(this->grid_) == isupProcRow){
4461  this->isSendToDiagonal_(ksup)=true;
4462  }
4463  } // if( MYCOL( this->grid_ ) == PCOL(ksup, this->grid_) )
4464  isup = snodeEtree[isup];
4465  } // for (isup)
4466  } // for (ksup)
4467 
4468 #if ( _DEBUGlevel_ >= 1 )
4469  statusOFS << std::endl << "this->isSendToRight:" << std::endl;
4470  for(int j = 0;j< this->isSendToRight_.n();j++){
4471  statusOFS << "["<<j<<"] ";
4472  for(int i =0; i < this->isSendToRight_.m();i++){
4473  statusOFS<< this->isSendToRight_(i,j) << " ";
4474  }
4475  statusOFS<<std::endl;
4476  }
4477 
4478  statusOFS << std::endl << "this->isRecvFromLeft:" << std::endl;
4479  for(int j = 0;j< this->isRecvFromLeft_.m();j++){
4480  statusOFS << "["<<j<<"] "<< this->isRecvFromLeft_(j)<<std::endl;
4481  }
4482 
4483  statusOFS << std::endl << "this->isRecvFromBelow:" << std::endl;
4484  for(int j = 0;j< this->isRecvFromBelow_.n();j++){
4485  statusOFS << "["<<j<<"] ";
4486  for(int i =0; i < this->isRecvFromBelow_.m();i++){
4487  statusOFS<< this->isRecvFromBelow_(i,j) << " ";
4488  }
4489  statusOFS<<std::endl;
4490  }
4491 #endif
4492 
4493 #ifndef _RELEASE_
4494  PopCallStack();
4495 #endif
4496 
4497  TIMER_STOP(STR_RFL_RFB);
4498 
4499 
4500 
4501 
4502  TIMER_START(STCD_RFCD);
4503 
4504 
4505 #ifndef _RELEASE_
4506  PushCallStack("SendToCrossDiagonal / RecvFromCrossDiagonal");
4507 #endif
4508  for( Int ksup = 0; ksup < numSuper - 1; ksup++ ){
4509  if( MYCOL( this->grid_ ) == PCOL( ksup, this->grid_ ) ){
4510  std::vector< LBlock<T> > & Union = *colBlockRowBuf[ksup];
4511  for( auto si = Union.begin(); si != Union.end(); si++ ){
4512  Int isup = si->blockIdx;
4513  Int isupProcRow = PROW( isup, this->grid_ );
4514  Int isupProcCol = PCOL( isup, this->grid_ );
4515  if( isup > ksup && MYROW( this->grid_ ) == isupProcRow ){
4516  this->isSendToCrossDiagonal_(this->grid_->numProcCol, ksup ) = true;
4517  this->isSendToCrossDiagonal_(isupProcCol, ksup ) = true;
4518  }
4519  } // for (si)
4520  } // if( MYCOL( this->grid_ ) == PCOL( ksup, this->grid_ ) )
4521  } // for (ksup)
4522 
4523  for( Int ksup = 0; ksup < numSuper - 1; ksup++ ){
4524  if( MYROW( this->grid_ ) == PROW( ksup, this->grid_ ) ){
4525  std::vector< LBlock<T> > & Union = *colBlockRowBuf[ksup];
4526  for( auto si = Union.begin(); si != Union.end(); si++ ){
4527  Int jsup = si->blockIdx;
4528  Int jsupProcCol = PCOL( jsup, this->grid_ );
4529  Int jsupProcRow = PROW( jsup, this->grid_ );
4530  if( jsup > ksup && MYCOL(this->grid_) == jsupProcCol ){
4531  this->isRecvFromCrossDiagonal_(this->grid_->numProcRow, ksup ) = true;
4532  this->isRecvFromCrossDiagonal_(jsupProcRow, ksup ) = true;
4533  }
4534  } // for (si)
4535  } // if( MYROW( this->grid_ ) == PROW( ksup, this->grid_ ) )
4536  } // for (ksup)
4537 #if ( _DEBUGlevel_ >= 1 )
4538  statusOFS << std::endl << "this->isSendToCrossDiagonal:" << std::endl;
4539  for(int j =0; j < this->isSendToCrossDiagonal_.n();j++){
4540  if(this->isSendToCrossDiagonal_(this->grid_->numProcCol,j)){
4541  statusOFS << "["<<j<<"] ";
4542  for(int i =0; i < this->isSendToCrossDiagonal_.m()-1;i++){
4543  if(this->isSendToCrossDiagonal_(i,j))
4544  {
4545  statusOFS<< PNUM(PROW(j,this->grid_),i,this->grid_)<<" ";
4546  }
4547  }
4548  statusOFS<<std::endl;
4549  }
4550  }
4551 
4552  statusOFS << std::endl << "this->isRecvFromCrossDiagonal:" << std::endl;
4553  for(int j =0; j < this->isRecvFromCrossDiagonal_.n();j++){
4554  if(this->isRecvFromCrossDiagonal_(this->grid_->numProcRow,j)){
4555  statusOFS << "["<<j<<"] ";
4556  for(int i =0; i < this->isRecvFromCrossDiagonal_.m()-1;i++){
4557  if(this->isRecvFromCrossDiagonal_(i,j))
4558  {
4559  statusOFS<< PNUM(i,PCOL(j,this->grid_),this->grid_)<<" ";
4560  }
4561  }
4562  statusOFS<<std::endl;
4563  }
4564  }
4565 
4566 
4567 #endif
4568 
4569 #ifndef _RELEASE_
4570  PopCallStack();
4571 #endif
4572 
4573  TIMER_STOP(STCD_RFCD);
4574 
4575  for( Int ksup = 0; ksup < numSuper; ksup++ ){
4576  if( MYCOL( this->grid_ ) == PCOL( ksup, this->grid_ )
4577  || MYROW( this->grid_ ) == PROW( ksup, this->grid_ ) ){
4578  delete colBlockRowBuf[ksup];
4579  if( MYROW( this->grid_ ) == PROW( ksup, this->grid_ ) ){
4580  delete rowBlockColBuf[ksup];
4581  }
4582  }
4583  }
4584 
4585  //Build the list of supernodes based on the elimination tree from SuperLU
4586  this->GetWorkSet(snodeEtree,this->WorkingSet());
4587 
4588  TIMER_STOP(ConstructCommunicationPattern);
4589 
4590 #ifndef _RELEASE_
4591  PopCallStack();
4592 #endif
4593 
4594  return ;
4595  } // ----- end of method PMatrixUnsym::ConstructCommunicationPattern_P2p -----
4596 
4597 
4598 } // namespace PEXSI
4599 
4600 #endif //_PEXSI_PSELINV_UNSYM_IMPL_HPP_
void UnpackData(SuperNodeBufferTypeUnsym &snode, std::vector< LBlock< T > > &LcolRecv, std::vector< LBlock< T > > &LrowRecv, std::vector< UBlock< T > > &UcolRecv, std::vector< UBlock< T > > &UrowRecv)
UnpackData.
Definition: pselinv_unsym_impl.hpp:1727
void SelInvIntra_P2p(Int lidx)
SelInvIntra_P2p.
Definition: pselinv_unsym_impl.hpp:1925
Int blockIdx
Block index (supernodal index)
Definition: pselinv.hpp:237
void ComputeDiagUpdate(SuperNodeBufferTypeUnsym &snode)
ComputeDiagUpdate.
Definition: pselinv_unsym_impl.hpp:1862
Interface with SuperLU_Dist (version 3.0 and later)
Int PROW(Int bnum, const GridType *g)
PROW returns the processor row that the bnum-th block (supernode) belongs to.
Definition: pselinv.hpp:293
Int numRow
Number of nonzero rows.
Definition: pselinv.hpp:188
void SelInv_lookup_indexes(SuperNodeBufferTypeUnsym &snode, std::vector< LBlock< T > > &LcolRecv, std::vector< LBlock< T > > &LrowRecv, std::vector< UBlock< T > > &UcolRecv, std::vector< UBlock< T > > &UrowRecv, NumMat< T > &AinvBuf, NumMat< T > &LBuf, NumMat< T > &UBuf)
SelInv_lookup_indexes.
Definition: pselinv_unsym_impl.hpp:112
Int MYPROC(const GridType *g)
MYPROC returns the current processor rank.
Definition: pselinv.hpp:280
Int PCOL(Int bnum, const GridType *g)
PCOL returns the processor column that the bnum-th block (supernode) belongs to.
Definition: pselinv.hpp:298
Int LBi(Int bnum, const GridType *g)
LBi returns the local block number on the processor at processor row PROW( bnum, g )...
Definition: pselinv.hpp:308
Profiling and timing using TAU.
Int SuperSize(Int bnum, const SuperNodeType *s)
SuperSize returns the size of the block bnum.
Definition: pselinv.hpp:349
Int LBj(Int bnum, const GridType *g)
LBj returns the local block number on the processor at processor column PCOL( bnum, g ).
Definition: pselinv.hpp:313
Int GBj(Int jLocal, const GridType *g)
GBj returns the global block number from a local block number in the column direction.
Definition: pselinv.hpp:323
void SetValue(NumMat< F > &M, F val)
SetValue sets a numerical matrix to a constant val.
Definition: NumMat_impl.hpp:171
Int numCol
Number of nonzero columns.
Definition: pselinv.hpp:191
Definition: pselinv_unsym_impl.hpp:434
LBlock stores a nonzero block in the lower triangular part or the diagonal part in PSelInv...
Definition: pselinv.hpp:182
Int PNUM(Int i, Int j, const GridType *g)
PNUM returns the processor rank of the process at processor row i and processor column j of the grid.
Definition: pselinv.hpp:303
Int GBi(Int iLocal, const GridType *g)
GBi returns the global block number from a local block number in the row direction.
Definition: pselinv.hpp:318
Int MYCOL(const GridType *g)
MYCOL returns my processor column.
Definition: pselinv.hpp:288
virtual void ConstructCommunicationPattern()
ConstructCommunicationPattern constructs the communication pattern to be used later in the selected i...
Definition: pselinv_unsym_impl.hpp:3557
IntNumVec rows
Dimension numRow * 1, indices (0-based) of the nonzero rows.
Definition: pselinv.hpp:194
Int numCol
Number of nonzero columns.
Definition: pselinv.hpp:243
UBlock stores a nonzero block in the upper triangular part in PSelInv.
Definition: pselinv.hpp:234
Definition: pselinv_unsym.hpp:198
Int FirstBlockCol(Int bnum, const SuperNodeType *s)
FirstBlockCol returns the first column of a block bnum.
Definition: pselinv.hpp:337
Int blockIdx
Block index (supernodal index)
Definition: pselinv.hpp:185
void SendRecvCD(std::vector< SuperNodeBufferTypeUnsym > &arrSuperNodes, Int stepSuper)
SendRecvCD.
Definition: pselinv_unsym_impl.hpp:1124
Int numRow
Number of nonzero rows.
Definition: pselinv.hpp:240
virtual void SelInv()
SelInv is the main function for the selected inversion.
Definition: pselinv_unsym_impl.hpp:3258
void ConstructCommunicationPattern_P2p()
ConstructCommunicationPattern_P2p constructs the communication pattern to be used later in the select...
Definition: pselinv_unsym_impl.hpp:3565
NumMat< T > nzval
Dimension numRow * numCol, nonzero elements.
Definition: pselinv.hpp:249
Int NumSuper(const SuperNodeType *s)
NumSuper returns the total number of supernodes.
Definition: pselinv.hpp:353
NumMat< T > nzval
Dimension numRow * numCol, nonzero elements.
Definition: pselinv.hpp:198
virtual void PreSelInv()
PreSelInv prepares the structure in L_ and U_ so that SelInv only involves matrix-matrix multiplicati...
Definition: pselinv_unsym_impl.hpp:3306
IntNumVec cols
Dimension numCol * 1, indices (0-based) of the nonzero columns.
Definition: pselinv.hpp:246
void SelInv_P2p()
Point-to-point version of the selected inversion.
Definition: pselinv_unsym_impl.hpp:3264
PMatrixUnsym contains the main data structure and the computational routine for the parallel selected...
Definition: pselinv.hpp:991
Int MYROW(const GridType *g)
MYROW returns my processor row.
Definition: pselinv.hpp:284