PEXSI
pselinv_impl.hpp
1 /*
2  Copyright (c) 2012 The Regents of the University of California,
3  through Lawrence Berkeley National Laboratory.
4 
5  Authors: Lin Lin and Mathias Jacquelin
6 
7  This file is part of PEXSI. All rights reserved.
8 
9  Redistribution and use in source and binary forms, with or without
10  modification, are permitted provided that the following conditions are met:
11 
12  (1) Redistributions of source code must retain the above copyright notice, this
13  list of conditions and the following disclaimer.
14  (2) Redistributions in binary form must reproduce the above copyright notice,
15  this list of conditions and the following disclaimer in the documentation
16  and/or other materials provided with the distribution.
17  (3) Neither the name of the University of California, Lawrence Berkeley
18  National Laboratory, U.S. Dept. of Energy nor the names of its contributors may
19  be used to endorse or promote products derived from this software without
20  specific prior written permission.
21 
22  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
23  ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
24  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
25  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
26  ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
27  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
29  ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 
33  You are under no obligation whatsoever to provide any bug fixes, patches, or
34  upgrades to the features, functionality or performance of the source code
35  ("Enhancements") to anyone; however, if you choose to make your Enhancements
36  available either publicly, or directly to Lawrence Berkeley National
37  Laboratory, without imposing a separate written license agreement for such
38  Enhancements, then you hereby grant the following license: a non-exclusive,
39  royalty-free perpetual license to install, use, modify, prepare derivative
40  works, incorporate into other computer software, distribute, and sublicense
41  such enhancements or derivative works thereof, in binary and source code form.
42 */
46 #ifndef _PEXSI_PSELINV_IMPL_HPP_
47 #define _PEXSI_PSELINV_IMPL_HPP_
48 
49 #include "timer.h"
50 #include "superlu_dist_interf.hpp"
51 
52 #define IDX_TO_TAG(lidx,tag) (SELINV_TAG_COUNT*(lidx)+(tag))
53 #define TAG_TO_IDX(tag,typetag) (((tag)-(typetag))/SELINV_TAG_COUNT)
54 
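 // Illustration (added note, not part of the original source): SELINV_TAG_COUNT
 // tags are reserved per supernode slot, so IDX_TO_TAG(lidx,tag) yields a distinct
 // MPI tag for every (slot, message-type) pair, and TAG_TO_IDX inverts it, e.g.
 //
 //   Int t    = IDX_TO_TAG( 3, SELINV_TAG_L_SIZE );   // tag of the "L size" message of slot 3
 //   Int slot = TAG_TO_IDX( t, SELINV_TAG_L_SIZE );   // recovers 3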
55 #define MPI_MAX_COMM (1024)
56 #define BCAST_THRESHOLD 20
57 
58 #define MOD(a,b) \
59  ( ((a)%(b)+(b))%(b))
60 
61 #define LIST_BARRIER
62 //#define ALL_BARRIER
63 
64 namespace PEXSI{
65  inline GridType::GridType ( MPI_Comm Bcomm, int nprow, int npcol )
66  {
67 #ifndef _RELEASE_
68  PushCallStack("GridType::GridType");
69 #endif
70  Int info;
71  MPI_Initialized( &info );
72  if( !info ){
73  throw std::logic_error( "MPI has not been initialized." );
74  }
75  MPI_Group comm_group;
76  MPI_Comm_group( Bcomm, &comm_group );
77  MPI_Comm_create( Bcomm, comm_group, &comm );
78  // comm = Bcomm;
79 
80  MPI_Comm_rank( comm, &mpirank );
81  MPI_Comm_size( comm, &mpisize );
82  if( mpisize != nprow * npcol ){
83  throw std::logic_error( "mpisize != nprow * npcol." );
84  }
85 
86  numProcRow = nprow;
87  numProcCol = npcol;
88 
89  Int myrow = mpirank / npcol;
90  Int mycol = mpirank % npcol;
91 
92  MPI_Comm_split( comm, myrow, mycol, &rowComm );
93  MPI_Comm_split( comm, mycol, myrow, &colComm );
94 
95  MPI_Group_free( &comm_group );
96 
97 #ifndef _RELEASE_
98  PopCallStack();
99 #endif
100 
101  return ;
102  } // ----- end of method GridType::GridType -----
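 // A minimal construction sketch (added illustration; the 2 x 3 grid and the use
 // of MPI_COMM_WORLD are arbitrary choices). Rank r is mapped to process row
 // r / npcol and process column r % npcol, and the base communicator is split
 // accordingly:
 //
 //   // assumes MPI_Init has been called and the communicator has 6 ranks
 //   GridType grid( MPI_COMM_WORLD, 2, 3 );
 //   // grid.rowComm connects the 3 ranks sharing a process row,
 //   // grid.colComm connects the 2 ranks sharing a process column.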
103 
104 
105  inline GridType::~GridType ( )
106  {
107 #ifndef _RELEASE_
108  PushCallStack("GridType::~GridType");
109 #endif
 110  // Do not free grid.comm which is not generated by GridType().
111 
112  MPI_Comm_free( &rowComm );
113  MPI_Comm_free( &colComm );
114  MPI_Comm_free( &comm );
115 
116 #ifndef _RELEASE_
117  PopCallStack();
118 #endif
119  return ;
120  } // ----- end of method GridType::~GridType -----
121 }
122 
123 
124 namespace PEXSI{
125  template<typename T>
126  PMatrix<T>::PMatrix (
127  const GridType* g,
128  const SuperNodeType* s,
129  const PEXSI::SuperLUOptions * o
130  )
131  {
132 #ifndef _RELEASE_
133  PushCallStack("PMatrix::PMatrix");
134 #endif
135 
136  this->Setup( g, s, o );
137 
138 #ifndef _RELEASE_
139  PopCallStack();
140 #endif
141  return ;
142  } // ----- end of method PMatrix::PMatrix -----
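 // A construction sketch (added illustration; gridPtr, superPtr and luOpt are
 // hypothetical objects assumed to have been prepared elsewhere, e.g. from a
 // SuperLU_DIST factorization). The constructor simply forwards to Setup():
 //
 //   PMatrix<Scalar> PMloc( gridPtr, superPtr, &luOpt );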
143 
144 
145  template<typename T>
146  void PMatrix<T>::Setup(
147  const GridType* g,
148  const SuperNodeType* s,
149  const PEXSI::SuperLUOptions * o
150  ) {
151 #ifndef _RELEASE_
152  PushCallStack("PMatrix::Setup");
153 #endif
154 
155  grid_ = g;
156  super_ = s;
157  options_ = o;
158 
159  // if( grid_->numProcRow != grid_->numProcCol ){
160  // throw std::runtime_error( "The current version of SelInv only works for square processor grids." ); }
161 
162  L_.clear();
163  U_.clear();
164  ColBlockIdx_.clear();
165  RowBlockIdx_.clear();
166 
167  L_.resize( this->NumLocalBlockCol() );
168  U_.resize( this->NumLocalBlockRow() );
169 
170  ColBlockIdx_.resize( this->NumLocalBlockCol() );
171  RowBlockIdx_.resize( this->NumLocalBlockRow() );
172 
173  //workingSet_.resize(this->NumSuper());
174 #if ( _DEBUGlevel_ >= 1 )
175  statusOFS << std::endl << "PMatrix is constructed. The grid information: " << std::endl;
176  statusOFS << "mpirank = " << MYPROC(grid_) << std::endl;
177  statusOFS << "myrow = " << MYROW(grid_) << std::endl;
178  statusOFS << "mycol = " << MYCOL(grid_) << std::endl;
179 #endif
180 
181 #ifndef _RELEASE_
182  PopCallStack();
183 #endif
184  return ;
185  } // ----- end of method PMatrix::Setup -----
186 
187 
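 // Added documentation note: getMaxCommunicatorSizes scans every level of the
 // working set, counts how many processes are flagged in the send-to-right,
 // send-to-below and recv-from-below masks of each supernode of that level,
 // keeps the per-level maximum, and finally takes the global maximum over all
 // processes with MPI_Allreduce into maxCommSizes_.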
189  template<typename T>
190  void PMatrix<T>::getMaxCommunicatorSizes()
191  {
192  IntNumVec maxSizes(this->WorkingSet().size());
193  SetValue(maxSizes,I_ONE);
194  for (Int lidx=0; lidx<this->WorkingSet().size() ; lidx++){
195  Int stepSuper = this->WorkingSet()[lidx].size();
196  for(Int supidx = 0;supidx<stepSuper;supidx++){
197  Int ksup = this->WorkingSet()[lidx][supidx];
198 
199  Int count= std::count(commSendToRightMask_[ksup].begin(), commSendToRightMask_[ksup].end(), true);
200  maxSizes[lidx] = std::max(maxSizes[lidx],count);
201 
202  count= std::count(commSendToBelowMask_[ksup].begin(), commSendToBelowMask_[ksup].end(), true);
203  maxSizes[lidx] = std::max(maxSizes[lidx],count);
204 
205  count= std::count(commRecvFromBelowMask_[ksup].begin(), commRecvFromBelowMask_[ksup].end(), true);
206  maxSizes[lidx] = std::max(maxSizes[lidx],count);
207  }
208  }
209 
210 
211  maxCommSizes_.Resize(maxSizes.m());
212  SetValue(maxCommSizes_,I_ONE);
213  MPI_Allreduce( maxSizes.Data(), maxCommSizes_.Data(), maxSizes.m(), MPI_INT, MPI_MAX, grid_->comm );
214  }
215 
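 // Added documentation note: SelInv_lookup_indexes assembles the dense buffers
 // used by the update of supernode snode.Index. The received U row blocks are
 // copied into UBuf, and for every pair (LcolRecv[ib], UrowRecv[jb]) the matching
 // block of the already inverted part Sinv is located in the local L storage
 // (isup >= jsup) or U storage (isup < jsup) and copied into AinvBuf at offset
 // (rowPtr[ib], colPtr[jb]). The caller then typically forms
 // snode.LUpdateBuf = -AinvBuf * UBuf^T with a single GEMM.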
216  template<typename T>
217  inline void PMatrix<T>::SelInv_lookup_indexes(
218  SuperNodeBufferType & snode,
219  std::vector<LBlock<T> > & LcolRecv,
220  std::vector<UBlock<T> > & UrowRecv,
221  NumMat<T> & AinvBuf,
222  NumMat<T> & UBuf )
223  {
224  TIMER_START(Compute_Sinv_LT_Lookup_Indexes);
225 
226  TIMER_START(Build_colptr_rowptr);
 227  // rowPtr[ib] gives the row index in snode.LUpdateBuf for the first
 228  // nonzero row in LcolRecv[ib]. The total number of rows in
 229  // snode.LUpdateBuf is given by the last entry, rowPtr[LcolRecv.size()].
 230  std::vector<Int> rowPtr(LcolRecv.size() + 1);
 231  // colPtr[jb] gives the column index in UBuf for the first
 232  // nonzero column in UrowRecv[jb]. The total number of columns in
 233  // UBuf is given by the last entry, colPtr[UrowRecv.size()].
 234  std::vector<Int> colPtr(UrowRecv.size() + 1);
235 
236  rowPtr[0] = 0;
237  for( Int ib = 0; ib < LcolRecv.size(); ib++ ){
238  rowPtr[ib+1] = rowPtr[ib] + LcolRecv[ib].numRow;
239  }
240  colPtr[0] = 0;
241  for( Int jb = 0; jb < UrowRecv.size(); jb++ ){
242  colPtr[jb+1] = colPtr[jb] + UrowRecv[jb].numCol;
243  }
244 
245  Int numRowAinvBuf = *rowPtr.rbegin();
246  Int numColAinvBuf = *colPtr.rbegin();
247  TIMER_STOP(Build_colptr_rowptr);
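 // Worked illustration (added, with made-up sizes): if LcolRecv holds two blocks
 // with 3 and 5 rows and UrowRecv holds two blocks with 4 and 2 columns, then
 // rowPtr = {0,3,8} and colPtr = {0,4,6}, AinvBuf is 8 x 6, and the Sinv block
 // corresponding to (ib,jb) is copied into AinvBuf starting at (rowPtr[ib], colPtr[jb]).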
248 
249  TIMER_START(Allocate_lookup);
250  // Allocate for the computational storage
251  AinvBuf.Resize( numRowAinvBuf, numColAinvBuf );
252  UBuf.Resize( SuperSize( snode.Index, super_ ), numColAinvBuf );
253  // TIMER_START(SetValue_lookup);
254  // SetValue( AinvBuf, ZERO<T>() );
255  //SetValue( snode.LUpdateBuf, ZERO<T>() );
256  // SetValue( UBuf, ZERO<T>() );
257  // TIMER_STOP(SetValue_lookup);
258  TIMER_STOP(Allocate_lookup);
259 
260  TIMER_START(Fill_UBuf);
261  // Fill UBuf first. Make the transpose later in the Gemm phase.
262  for( Int jb = 0; jb < UrowRecv.size(); jb++ ){
263  UBlock<T>& UB = UrowRecv[jb];
264  if( UB.numRow != SuperSize(snode.Index, super_) ){
265  throw std::logic_error( "The size of UB is not right. Something is seriously wrong." );
266  }
267  lapack::Lacpy( 'A', UB.numRow, UB.numCol, UB.nzval.Data(),
268  UB.numRow, UBuf.VecData( colPtr[jb] ), SuperSize( snode.Index, super_ ) );
269  }
270  TIMER_STOP(Fill_UBuf);
271 
272  // Calculate the relative indices for (isup, jsup)
273  // Fill AinvBuf with the information in L or U block.
274  TIMER_START(JB_Loop);
275 
276 #ifdef STDFIND
277 // for( Int jb = 0; jb < UrowRecv.size(); jb++ ){
278 //
279 // UBlock& UB = UrowRecv[jb];
280 // Int jsup = UB.blockIdx;
281 // Int SinvColsSta = FirstBlockCol( jsup, super_ );
282 //
 283 // // Column relative indices
284 // std::vector<Int> relCols( UB.numCol );
285 // for( Int j = 0; j < UB.numCol; j++ ){
286 // relCols[j] = UB.cols[j] - SinvColsSta;
287 // }
288 //
289 //
290 //
291 //
292 // for( Int ib = 0; ib < LcolRecv.size(); ib++ ){
293 // LBlock& LB = LcolRecv[ib];
294 // Int isup = LB.blockIdx;
295 // Int SinvRowsSta = FirstBlockCol( isup, super_ );
296 // Scalar* nzvalAinv = &AinvBuf( rowPtr[ib], colPtr[jb] );
297 // Int ldAinv = numRowAinvBuf;
298 //
299 // // Pin down the corresponding block in the part of Sinv.
300 // if( isup >= jsup ){
301 // std::vector<LBlock>& LcolSinv = this->L( LBj(jsup, grid_ ) );
302 // bool isBlockFound = false;
303 // TIMER_START(PARSING_ROW_BLOCKIDX);
304 // for( Int ibSinv = 0; ibSinv < LcolSinv.size(); ibSinv++ ){
305 // // Found the (isup, jsup) block in Sinv
306 // if( LcolSinv[ibSinv].blockIdx == isup ){
307 // LBlock& SinvB = LcolSinv[ibSinv];
308 //
309 // // Row relative indices
310 // std::vector<Int> relRows( LB.numRow );
311 // Int* rowsLBPtr = LB.rows.Data();
312 // Int* rowsSinvBPtr = SinvB.rows.Data();
313 //
314 // TIMER_START(STDFIND_ROW);
315 // Int * pos =&rowsSinvBPtr[0];
316 // Int * last =&rowsSinvBPtr[SinvB.numRow];
317 // for( Int i = 0; i < LB.numRow; i++ ){
318 // // pos = std::find(pos, &rowsSinvBPtr[SinvB.numRow-1], rowsLBPtr[i]);
319 // pos = std::find(rowsSinvBPtr, last, rowsLBPtr[i]);
320 // if(pos != last){
321 // relRows[i] = (Int)(pos - rowsSinvBPtr);
322 // }
323 // else{
324 // std::ostringstream msg;
325 // msg << "Row " << rowsLBPtr[i] <<
326 // " in LB cannot find the corresponding row in SinvB" << std::endl
327 // << "LB.rows = " << LB.rows << std::endl
328 // << "SinvB.rows = " << SinvB.rows << std::endl;
329 // throw std::runtime_error( msg.str().c_str() );
330 // }
331 // }
332 // TIMER_STOP(STDFIND_ROW);
333 //
334 // TIMER_START(Copy_Sinv_to_Ainv);
335 // // Transfer the values from Sinv to AinvBlock
336 // Scalar* nzvalSinv = SinvB.nzval.Data();
337 // Int ldSinv = SinvB.numRow;
338 // for( Int j = 0; j < UB.numCol; j++ ){
339 // for( Int i = 0; i < LB.numRow; i++ ){
340 // nzvalAinv[i+j*ldAinv] =
341 // nzvalSinv[relRows[i] + relCols[j] * ldSinv];
342 // }
343 // }
344 // TIMER_STOP(Copy_Sinv_to_Ainv);
345 //
346 // isBlockFound = true;
347 // break;
348 // }
349 // } // for (ibSinv )
350 // TIMER_STOP(PARSING_ROW_BLOCKIDX);
351 // if( isBlockFound == false ){
352 // std::ostringstream msg;
353 // msg << "Block(" << isup << ", " << jsup
354 // << ") did not find a matching block in Sinv." << std::endl;
355 // throw std::runtime_error( msg.str().c_str() );
356 // }
357 // } // if (isup, jsup) is in L
358 // else{
359 // // Row relative indices
360 // std::vector<Int> relRows( LB.numRow );
361 // Int SinvRowsSta = FirstBlockCol( isup, super_ );
362 // for( Int i = 0; i < LB.numRow; i++ ){
363 // relRows[i] = LB.rows[i] - SinvRowsSta;
364 // }
365 // std::vector<UBlock>& UrowSinv = this->U( LBi( isup, grid_ ) );
366 // bool isBlockFound = false;
367 // TIMER_START(PARSING_COL_BLOCKIDX);
368 // for( Int jbSinv = 0; jbSinv < UrowSinv.size(); jbSinv++ ){
369 // // Found the (isup, jsup) block in Sinv
370 // if( UrowSinv[jbSinv].blockIdx == jsup ){
371 // UBlock& SinvB = UrowSinv[jbSinv];
372 //
373 //
374 //
375 // // Column relative indices
376 // std::vector<Int> relCols( UB.numCol );
377 // Int* colsUBPtr = UB.cols.Data();
378 // Int* colsSinvBPtr = SinvB.cols.Data();
379 // TIMER_START(STDFIND_COL);
380 // Int * pos =&colsSinvBPtr[0];
381 // Int * last =&colsSinvBPtr[SinvB.numCol];
382 // for( Int j = 0; j < UB.numCol; j++ ){
383 // //colsUB is sorted
384 // pos = std::find(colsSinvBPtr, last, colsUBPtr[j]);
385 // if(pos !=last){
386 // relCols[j] = (Int)(pos - colsSinvBPtr);
387 // }
388 // else{
389 // std::ostringstream msg;
390 // msg << "Col " << colsUBPtr[j] <<
391 // " in UB cannot find the corresponding row in SinvB" << std::endl
392 // << "UB.cols = " << UB.cols << std::endl
393 // << "UinvB.cols = " << SinvB.cols << std::endl;
394 // throw std::runtime_error( msg.str().c_str() );
395 // }
396 // }
397 // TIMER_STOP(STDFIND_COL);
398 //
399 //
400 // TIMER_START(Copy_Sinv_to_Ainv);
401 // // Transfer the values from Sinv to AinvBlock
402 // Scalar* nzvalSinv = SinvB.nzval.Data();
403 // Int ldSinv = SinvB.numRow;
404 // for( Int j = 0; j < UB.numCol; j++ ){
405 // for( Int i = 0; i < LB.numRow; i++ ){
406 // nzvalAinv[i+j*ldAinv] =
407 // nzvalSinv[relRows[i] + relCols[j] * ldSinv];
408 // }
409 // }
410 // TIMER_STOP(Copy_Sinv_to_Ainv);
411 //
412 // isBlockFound = true;
413 // break;
414 // }
415 // } // for (jbSinv)
416 // TIMER_STOP(PARSING_COL_BLOCKIDX);
417 // if( isBlockFound == false ){
418 // std::ostringstream msg;
419 // msg << "Block(" << isup << ", " << jsup
420 // << ") did not find a matching block in Sinv." << std::endl;
421 // throw std::runtime_error( msg.str().c_str() );
422 // }
423 // } // if (isup, jsup) is in U
424 //
425 // } // for( ib )
426 // } // for ( jb )
427 #else
428  for( Int jb = 0; jb < UrowRecv.size(); jb++ ){
429  for( Int ib = 0; ib < LcolRecv.size(); ib++ ){
430  LBlock<T>& LB = LcolRecv[ib];
431  UBlock<T>& UB = UrowRecv[jb];
432  Int isup = LB.blockIdx;
433  Int jsup = UB.blockIdx;
434  T* nzvalAinv = &AinvBuf( rowPtr[ib], colPtr[jb] );
435  Int ldAinv = AinvBuf.m();
436 
437  // Pin down the corresponding block in the part of Sinv.
438  if( isup >= jsup ){
439  std::vector<LBlock<T> >& LcolSinv = this->L( LBj(jsup, grid_ ) );
440  bool isBlockFound = false;
441  TIMER_START(PARSING_ROW_BLOCKIDX);
442  for( Int ibSinv = 0; ibSinv < LcolSinv.size(); ibSinv++ ){
443  // Found the (isup, jsup) block in Sinv
444  if( LcolSinv[ibSinv].blockIdx == isup ){
445  LBlock<T> & SinvB = LcolSinv[ibSinv];
446 
447  // Row relative indices
448  std::vector<Int> relRows( LB.numRow );
449  Int* rowsLBPtr = LB.rows.Data();
450  Int* rowsSinvBPtr = SinvB.rows.Data();
451  for( Int i = 0; i < LB.numRow; i++ ){
452  bool isRowFound = false;
453  for( Int i1 = 0; i1 < SinvB.numRow; i1++ ){
454  if( rowsLBPtr[i] == rowsSinvBPtr[i1] ){
455  isRowFound = true;
456  relRows[i] = i1;
457  break;
458  }
459  }
460  if( isRowFound == false ){
461  std::ostringstream msg;
462  msg << "Row " << rowsLBPtr[i] <<
463  " in LB cannot find the corresponding row in SinvB" << std::endl
464  << "LB.rows = " << LB.rows << std::endl
465  << "SinvB.rows = " << SinvB.rows << std::endl;
466  throw std::runtime_error( msg.str().c_str() );
467  }
468  }
469 
 470  // Column relative indices
471  std::vector<Int> relCols( UB.numCol );
472  Int SinvColsSta = FirstBlockCol( jsup, super_ );
473  for( Int j = 0; j < UB.numCol; j++ ){
474  relCols[j] = UB.cols[j] - SinvColsSta;
475  }
476 
477  // Transfer the values from Sinv to AinvBlock
478  T* nzvalSinv = SinvB.nzval.Data();
479  Int ldSinv = SinvB.numRow;
480  for( Int j = 0; j < UB.numCol; j++ ){
481  for( Int i = 0; i < LB.numRow; i++ ){
482  nzvalAinv[i+j*ldAinv] =
483  nzvalSinv[relRows[i] + relCols[j] * ldSinv];
484  }
485  }
486 
487  isBlockFound = true;
488  break;
489  }
490  } // for (ibSinv )
491  TIMER_STOP(PARSING_ROW_BLOCKIDX);
492  if( isBlockFound == false ){
493  std::ostringstream msg;
494  msg << "Block(" << isup << ", " << jsup
495  << ") did not find a matching block in Sinv." << std::endl;
496  throw std::runtime_error( msg.str().c_str() );
497  }
498  } // if (isup, jsup) is in L
499  else{
500  std::vector<UBlock<T> >& UrowSinv = this->U( LBi( isup, grid_ ) );
501  bool isBlockFound = false;
502  TIMER_START(PARSING_COL_BLOCKIDX);
503  for( Int jbSinv = 0; jbSinv < UrowSinv.size(); jbSinv++ ){
504  // Found the (isup, jsup) block in Sinv
505  if( UrowSinv[jbSinv].blockIdx == jsup ){
506  UBlock<T> & SinvB = UrowSinv[jbSinv];
507 
508  // Row relative indices
509  std::vector<Int> relRows( LB.numRow );
510  Int SinvRowsSta = FirstBlockCol( isup, super_ );
511  for( Int i = 0; i < LB.numRow; i++ ){
512  relRows[i] = LB.rows[i] - SinvRowsSta;
513  }
514 
515  // Column relative indices
516  std::vector<Int> relCols( UB.numCol );
517  Int* colsUBPtr = UB.cols.Data();
518  Int* colsSinvBPtr = SinvB.cols.Data();
519  for( Int j = 0; j < UB.numCol; j++ ){
520  bool isColFound = false;
521  for( Int j1 = 0; j1 < SinvB.numCol; j1++ ){
522  if( colsUBPtr[j] == colsSinvBPtr[j1] ){
523  isColFound = true;
524  relCols[j] = j1;
525  break;
526  }
527  }
528  if( isColFound == false ){
529  std::ostringstream msg;
530  msg << "Col " << colsUBPtr[j] <<
 531  " in UB cannot find the corresponding column in SinvB" << std::endl
 532  << "UB.cols = " << UB.cols << std::endl
 533  << "SinvB.cols = " << SinvB.cols << std::endl;
534  throw std::runtime_error( msg.str().c_str() );
535  }
536  }
537 
538 
539  // Transfer the values from Sinv to AinvBlock
540  T* nzvalSinv = SinvB.nzval.Data();
541  Int ldSinv = SinvB.numRow;
542  for( Int j = 0; j < UB.numCol; j++ ){
543  for( Int i = 0; i < LB.numRow; i++ ){
544  nzvalAinv[i+j*ldAinv] =
545  nzvalSinv[relRows[i] + relCols[j] * ldSinv];
546  }
547  }
548 
549  isBlockFound = true;
550  break;
551  }
552  } // for (jbSinv)
553  TIMER_STOP(PARSING_COL_BLOCKIDX);
554  if( isBlockFound == false ){
555  std::ostringstream msg;
556  msg << "Block(" << isup << ", " << jsup
557  << ") did not find a matching block in Sinv." << std::endl;
558  throw std::runtime_error( msg.str().c_str() );
559  }
560  } // if (isup, jsup) is in U
561 
562  } // for( ib )
563  } // for ( jb )
564 
565 
566 #endif
567  TIMER_STOP(JB_Loop);
568 
569 
570  TIMER_STOP(Compute_Sinv_LT_Lookup_Indexes);
571  }
572 
573 
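 // Added documentation note: SendRecvCD_UpdateU moves each supernode's reduced
 // L update buffer from the owning processor column to the processors holding
 // the cross-diagonal U blocks. Sizes and serialized contents are exchanged with
 // nonblocking MPI_Isend/MPI_Irecv (tags SELINV_TAG_L_SIZE / SELINV_TAG_L_CONTENT),
 // and the received pieces are transposed into the matching U blocks.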
574  template<typename T>
575  inline void PMatrix<T>::SendRecvCD_UpdateU(
576  std::vector<SuperNodeBufferType > & arrSuperNodes,
577  Int stepSuper)
578  {
579 
580  TIMER_START(Send_CD_Update_U);
581  //compute the number of requests
582  Int sendCount = 0;
583  Int recvCount = 0;
584  Int sendOffset[stepSuper];
585  Int recvOffset[stepSuper];
586  Int recvIdx=0;
587  for (Int supidx=0; supidx<stepSuper; supidx++){
588  SuperNodeBufferType & snode = arrSuperNodes[supidx];
589  sendOffset[supidx]=sendCount;
590  recvOffset[supidx]=recvCount;
591  sendCount+= CountSendToCrossDiagonal(snode.Index);
592  recvCount+= CountRecvFromCrossDiagonal(snode.Index);
593  }
594 
595 
596  std::vector<MPI_Request > arrMpiReqsSendCD(sendCount, MPI_REQUEST_NULL );
597  std::vector<MPI_Request > arrMpiReqsSizeSendCD(sendCount, MPI_REQUEST_NULL );
598 
599  std::vector<MPI_Request > arrMpiReqsRecvCD(recvCount, MPI_REQUEST_NULL );
600  std::vector<MPI_Request > arrMpiReqsSizeRecvCD(recvCount, MPI_REQUEST_NULL );
601  std::vector<std::vector<char> > arrSstrLcolSendCD(sendCount);
602  std::vector<int > arrSstrLcolSizeSendCD(sendCount);
603  std::vector<std::vector<char> > arrSstrLcolRecvCD(recvCount);
604  std::vector<int > arrSstrLcolSizeRecvCD(recvCount);
605 
606  for (Int supidx=0; supidx<stepSuper; supidx++){
607  SuperNodeBufferType & snode = arrSuperNodes[supidx];
608 
609  // Send LUpdateBufReduced to the cross diagonal blocks.
 610  // NOTE: This assumes a square processor grid
611 
612  TIMER_START(Send_L_CrossDiag);
613 
614  if( MYCOL( grid_ ) == PCOL( snode.Index, grid_ ) && isSendToCrossDiagonal_(grid_->numProcCol, snode.Index ) ){
615 
616  Int sendIdx = 0;
617  for(Int dstCol = 0; dstCol<grid_->numProcCol; dstCol++){
618  if(isSendToCrossDiagonal_(dstCol,snode.Index) ){
619  Int dest = PNUM(PROW(snode.Index,grid_),dstCol,grid_);
620 
621  if( MYPROC( grid_ ) != dest ){
622  MPI_Request & mpiReqSizeSend = arrMpiReqsSizeSendCD[sendOffset[supidx]+sendIdx];
623  MPI_Request & mpiReqSend = arrMpiReqsSendCD[sendOffset[supidx]+sendIdx];
624 
625 
626  std::stringstream sstm;
627  std::vector<char> & sstrLcolSend = arrSstrLcolSendCD[sendOffset[supidx]+sendIdx];
628  Int & sstrSize = arrSstrLcolSizeSendCD[sendOffset[supidx]+sendIdx];
629 
630  serialize( snode.RowLocalPtr, sstm, NO_MASK );
631  serialize( snode.BlockIdxLocal, sstm, NO_MASK );
632  serialize( snode.LUpdateBuf, sstm, NO_MASK );
633 
634  sstrLcolSend.resize( Size(sstm) );
635  sstm.read( &sstrLcolSend[0], sstrLcolSend.size() );
636  sstrSize = sstrLcolSend.size();
637 
638 
639  MPI_Isend( &sstrSize, 1, MPI_INT, dest, IDX_TO_TAG(supidx,SELINV_TAG_L_SIZE), grid_->comm, &mpiReqSizeSend );
640  MPI_Isend( (void*)&sstrLcolSend[0], sstrSize, MPI_BYTE, dest, IDX_TO_TAG(supidx,SELINV_TAG_L_CONTENT), grid_->comm, &mpiReqSend );
641  sendIdx++;
642  }
643  }
644  }
645 
646 
647  } // sender
648  TIMER_STOP(Send_L_CrossDiag);
649  }
650 
651 
652  //Do Irecv for sizes
653  for (Int supidx=0; supidx<stepSuper; supidx++){
654  SuperNodeBufferType & snode = arrSuperNodes[supidx];
655  //If I'm a receiver
656  if( MYROW( grid_ ) == PROW( snode.Index, grid_ ) && isRecvFromCrossDiagonal_(grid_->numProcRow, snode.Index ) ){
657  Int recvIdx=0;
658  for(Int srcRow = 0; srcRow<grid_->numProcRow; srcRow++){
659  if(isRecvFromCrossDiagonal_(srcRow,snode.Index) ){
660  Int src = PNUM(srcRow,PCOL(snode.Index,grid_),grid_);
661  if( MYPROC( grid_ ) != src ){
662  Int & sstrSize = arrSstrLcolSizeRecvCD[recvOffset[supidx]+recvIdx];
663  MPI_Request & mpiReqSizeRecv = arrMpiReqsSizeRecvCD[recvOffset[supidx]+recvIdx];
664  MPI_Irecv( &sstrSize, 1, MPI_INT, src, IDX_TO_TAG(supidx,SELINV_TAG_L_SIZE), grid_->comm, &mpiReqSizeRecv );
665  recvIdx++;
666  }
667  }
668  }
669  }//end if I'm a receiver
670  }
671 
672  //waitall sizes
673  mpi::Waitall(arrMpiReqsSizeRecvCD);
674  //Allocate content and do Irecv
675  for (Int supidx=0; supidx<stepSuper; supidx++){
676  SuperNodeBufferType & snode = arrSuperNodes[supidx];
677  //If I'm a receiver
678  if( MYROW( grid_ ) == PROW( snode.Index, grid_ ) && isRecvFromCrossDiagonal_(grid_->numProcRow, snode.Index ) ){
679  Int recvIdx=0;
680  for(Int srcRow = 0; srcRow<grid_->numProcRow; srcRow++){
681  if(isRecvFromCrossDiagonal_(srcRow,snode.Index) ){
682  Int src = PNUM(srcRow,PCOL(snode.Index,grid_),grid_);
683  if( MYPROC( grid_ ) != src ){
684  Int & sstrSize = arrSstrLcolSizeRecvCD[recvOffset[supidx]+recvIdx];
685  std::vector<char> & sstrLcolRecv = arrSstrLcolRecvCD[recvOffset[supidx]+recvIdx];
686  MPI_Request & mpiReqRecv = arrMpiReqsRecvCD[recvOffset[supidx]+recvIdx];
687  sstrLcolRecv.resize( sstrSize);
688  MPI_Irecv( (void*)&sstrLcolRecv[0], sstrSize, MPI_BYTE, src, IDX_TO_TAG(supidx,SELINV_TAG_L_CONTENT), grid_->comm, &mpiReqRecv );
689  //statusOFS<<"P"<<MYPROC(grid_)<<" received "<<sstrSize<<" bytes of L/U from CD P"<<src<<std::endl;
690  recvIdx++;
691  }
692  }
693  }
694  }//end if I'm a receiver
695  }
696 
697  //waitall content
698  mpi::Waitall(arrMpiReqsRecvCD);
699  //Do the work
700  for (Int supidx=0; supidx<stepSuper; supidx++){
701  SuperNodeBufferType & snode = arrSuperNodes[supidx];
702 
 703  // Receive LUpdateBufReduced from the cross diagonal blocks and update U.
 704  // NOTE: This assumes a square processor grid
705  if( MYROW( grid_ ) == PROW( snode.Index, grid_ ) && isRecvFromCrossDiagonal_(grid_->numProcRow, snode.Index ) ){
706 
707 #if ( _DEBUGlevel_ >= 1 )
708  statusOFS << std::endl << " ["<<snode.Index<<"] "<< "Update the upper triangular block" << std::endl << std::endl;
709  statusOFS << std::endl << " ["<<snode.Index<<"] "<< "blockIdxLocal:" << snode.BlockIdxLocal << std::endl << std::endl;
710  statusOFS << std::endl << " ["<<snode.Index<<"] "<< "rowLocalPtr:" << snode.RowLocalPtr << std::endl << std::endl;
711 #endif
712 
713  std::vector<UBlock<T> >& Urow = this->U( LBi( snode.Index, grid_ ) );
714  std::vector<bool> isBlockFound(Urow.size(),false);
715 
716  recvIdx=0;
717  for(Int srcRow = 0; srcRow<grid_->numProcRow; srcRow++){
718  if(isRecvFromCrossDiagonal_(srcRow,snode.Index) ){
719  Int src = PNUM(srcRow,PCOL(snode.Index,grid_),grid_);
720  TIMER_START(Recv_L_CrossDiag);
721 
722  std::vector<Int> rowLocalPtrRecv;
723  std::vector<Int> blockIdxLocalRecv;
724  NumMat<T> UUpdateBuf;
725 
726  if( MYPROC( grid_ ) != src ){
727  std::stringstream sstm;
728  Int & sstrSize = arrSstrLcolSizeRecvCD[recvOffset[supidx]+recvIdx];
729  std::vector<char> & sstrLcolRecv = arrSstrLcolRecvCD[recvOffset[supidx]+recvIdx];
730  sstm.write( &sstrLcolRecv[0], sstrSize );
731 
732  deserialize( rowLocalPtrRecv, sstm, NO_MASK );
733  deserialize( blockIdxLocalRecv, sstm, NO_MASK );
734  deserialize( UUpdateBuf, sstm, NO_MASK );
735 
736  recvIdx++;
737 
738  } // sender is not the same as receiver
739  else{
740  rowLocalPtrRecv = snode.RowLocalPtr;
741  blockIdxLocalRecv = snode.BlockIdxLocal;
742  UUpdateBuf = snode.LUpdateBuf;
743  } // sender is the same as receiver
744 
745 
746 
747  TIMER_STOP(Recv_L_CrossDiag);
748 
749 #if ( _DEBUGlevel_ >= 1 )
750  statusOFS<<" ["<<snode.Index<<"] P"<<MYPROC(grid_)<<" ("<<MYROW(grid_)<<","<<MYCOL(grid_)<<") <--- LBj("<<snode.Index<<") <--- P"<<src<<std::endl;
751  statusOFS << std::endl << " ["<<snode.Index<<"] "<< "rowLocalPtrRecv:" << rowLocalPtrRecv << std::endl << std::endl;
752  statusOFS << std::endl << " ["<<snode.Index<<"] "<< "blockIdxLocalRecv:" << blockIdxLocalRecv << std::endl << std::endl;
753 #endif
754 
755 
756  // Update U
757  for( Int ib = 0; ib < blockIdxLocalRecv.size(); ib++ ){
758  for( Int jb = 0; jb < Urow.size(); jb++ ){
759  UBlock<T>& UB = Urow[jb];
760  if( UB.blockIdx == blockIdxLocalRecv[ib] ){
761  NumMat<T> Ltmp ( UB.numCol, UB.numRow );
762  lapack::Lacpy( 'A', Ltmp.m(), Ltmp.n(),
763  &UUpdateBuf( rowLocalPtrRecv[ib], 0 ),
764  UUpdateBuf.m(), Ltmp.Data(), Ltmp.m() );
765  isBlockFound[jb] = true;
766  Transpose( Ltmp, UB.nzval );
767  break;
768  }
769  }
770  }
771  }
772  }
773 
774  for( Int jb = 0; jb < Urow.size(); jb++ ){
775  UBlock<T>& UB = Urow[jb];
776  if( !isBlockFound[jb] ){
777  throw std::logic_error( "UBlock cannot find its update. Something is seriously wrong." );
778  }
779  }
780  } // receiver
781  }
782 
783  TIMER_STOP(Send_CD_Update_U);
784 
785  mpi::Waitall(arrMpiReqsSizeSendCD);
786  mpi::Waitall(arrMpiReqsSendCD);
787  };
788 
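 // Added documentation note: UnpackData reconstructs the U row blocks and L
 // column blocks needed by the Gemm phase. If the data came from another
 // processor it is deserialized from snode.SstrUrowRecv / snode.SstrLcolRecv
 // (with nzval excluded for L); otherwise the local U row is copied entirely and
 // the local L column is copied without its diagonal block.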
789  template<typename T>
790  inline void PMatrix<T>::UnpackData(
791  SuperNodeBufferType & snode,
792  std::vector<LBlock<T> > & LcolRecv,
793  std::vector<UBlock<T> > & UrowRecv )
794  {
795 
796 #if ( _DEBUGlevel_ >= 1 )
 797  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Unpack the received data for processors participating in Gemm." << std::endl << std::endl;
798 #endif
799  // U part
800  if( MYROW( grid_ ) != PROW( snode.Index, grid_ ) ){
801  std::stringstream sstm;
802  sstm.write( &snode.SstrUrowRecv[0], snode.SstrUrowRecv.size() );
803  std::vector<Int> mask( UBlockMask::TOTAL_NUMBER, 1 );
804  Int numUBlock;
805  deserialize( numUBlock, sstm, NO_MASK );
806  UrowRecv.resize( numUBlock );
807  for( Int jb = 0; jb < numUBlock; jb++ ){
808  deserialize( UrowRecv[jb], sstm, mask );
809  }
810  } // sender is not the same as receiver
811  else{
 812  // U is obtained locally, so just make a copy. Include everything
 813  // (there is no diagonal block).
 814  // Is it a copy? LL: YES. Maybe we should replace the copy with
 815  // something more efficient, especially for mpisize == 1.
816  UrowRecv.resize(this->U( LBi( snode.Index, grid_ ) ).size());
817  std::copy(this->U( LBi( snode.Index, grid_ ) ).begin(),this->U( LBi( snode.Index, grid_ )).end(),UrowRecv.begin());
818  } // sender is the same as receiver
819 
820 
821  //L part
822  if( MYCOL( grid_ ) != PCOL( snode.Index, grid_ ) ){
823  std::stringstream sstm;
824  sstm.write( &snode.SstrLcolRecv[0], snode.SstrLcolRecv.size() );
825  std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
826  mask[LBlockMask::NZVAL] = 0; // nzval is excluded
827  Int numLBlock;
828  deserialize( numLBlock, sstm, NO_MASK );
829  LcolRecv.resize( numLBlock );
830  for( Int ib = 0; ib < numLBlock; ib++ ){
831  deserialize( LcolRecv[ib], sstm, mask );
832  }
833  } // sender is not the same as receiver
834  else{
835  // L is obtained locally, just make a copy.
836  // Do not include the diagonal block
837  std::vector<LBlock<T> >& Lcol = this->L( LBj( snode.Index, grid_ ) );
838  Int startIdx = ( MYROW( grid_ ) == PROW( snode.Index, grid_ ) )?1:0;
839  LcolRecv.resize( Lcol.size() - startIdx );
840  std::copy(Lcol.begin()+startIdx,Lcol.end(),LcolRecv.begin());
841  } // sender is the same as receiver
842  }
843 
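 // Added documentation note: ComputeDiagUpdate accumulates, on every processor
 // of the owning column, the local contribution
 //
 //   DiagBuf = - sum_ib ( LUpdateBuf_ib )^T * L_ib
 //
 // over the off-diagonal L blocks it stores; the column-wise reduction of
 // DiagBuf and its addition to the diagonal block are performed by the caller.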
844  template<typename T>
845  inline void PMatrix<T>::ComputeDiagUpdate(SuperNodeBufferType & snode)
846  {
847 
 848  //--------- Compute the diagonal block; all processors in the column participate in all pipelined supernodes
849  if( MYCOL( grid_ ) == PCOL( snode.Index, grid_ ) ){
850 #if ( _DEBUGlevel_ >= 1 )
851  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Updating the diagonal block" << std::endl << std::endl;
852 #endif
853  std::vector<LBlock<T> >& Lcol = this->L( LBj( snode.Index, grid_ ) );
854 
855  //Allocate DiagBuf even if Lcol.size() == 0
856  snode.DiagBuf.Resize(SuperSize( snode.Index, super_ ), SuperSize( snode.Index, super_ ));
857  SetValue(snode.DiagBuf, ZERO<T>());
858 
859  // Do I own the diagonal block ?
860  Int startIb = (MYROW( grid_ ) == PROW( snode.Index, grid_ ))?1:0;
861  for( Int ib = startIb; ib < Lcol.size(); ib++ ){
862  blas::Gemm( 'T', 'N', snode.DiagBuf.m(), snode.DiagBuf.n(), Lcol[ib].numRow,
863  MINUS_ONE<T>(), &snode.LUpdateBuf( snode.RowLocalPtr[ib-startIb], 0 ), snode.LUpdateBuf.m(),
864  Lcol[ib].nzval.Data(), Lcol[ib].nzval.m(), ONE<T>(), snode.DiagBuf.Data(), snode.DiagBuf.m() );
865  }
866 
867 #if ( _DEBUGlevel_ >= 1 )
868  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Updated the diagonal block" << std::endl << std::endl;
869 #endif
870  }
871  }
872 
873 
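 // Added documentation note: GetEtree returns a supernodal elimination tree.
 // For orderings other than PARMETIS it contracts the column elimination tree of
 // the serial symbolic factorization using superIdx; otherwise the parent of
 // supernode ksup is taken as the smallest block row index below ksup found in
 // the local L factor, combined across processes with an MPI_MIN reduction.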
874  template<typename T>
875  void PMatrix<T>::GetEtree(std::vector<Int> & etree_supno )
876  {
877 
878 #ifndef _RELEASE_
879  PushCallStack("PMatrix::GetEtree");
880  double begin = MPI_Wtime( );
881 #endif
882  Int nsupers = this->NumSuper();
883 
884  if( options_->ColPerm != "PARMETIS" ) {
 885  /* Use the etree computed from the serial symbolic factorization, and turn it
 886  into a supernodal tree. */
887  const SuperNodeType * superNode = this->SuperNode();
888 
889 
 890  // translate the column etree into a supernodal etree using superIdx
891  etree_supno.resize(this->NumSuper());
892  for(Int i = 0; i < superNode->etree.m(); ++i){
893  Int curSnode = superNode->superIdx[i];
894  Int parentSnode = (superNode->etree[i]>= superNode->etree.m()) ?this->NumSuper():superNode->superIdx[superNode->etree[i]];
895  if( curSnode != parentSnode){
896  etree_supno[curSnode] = parentSnode;
897  }
898  }
899 
900  } else { /* ParSymbFACT==YES and SymPattern==YES and RowPerm == NOROWPERM */
901  /* Compute an "etree" based on struct(L),
902  assuming struct(U) = struct(L'). */
903 
904  /* find the first block in each supernodal-column of local L-factor */
905  std::vector<Int> etree_supno_l( nsupers, nsupers );
906  for( Int ksup = 0; ksup < nsupers; ksup++ ){
907  if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
908  // L part
909  std::vector<LBlock<T> >& Lcol = this->L( LBj(ksup, grid_) );
910  if(Lcol.size()>0){
911  Int firstBlk = 0;
912  if( MYROW( grid_ ) == PROW( ksup, grid_ ) )
913  firstBlk=1;
914 
915  for( Int ib = firstBlk; ib < Lcol.size(); ib++ ){
916  etree_supno_l[ksup] = std::min(etree_supno_l[ksup] , Lcol[ib].blockIdx);
917  }
918  }
919  }
920  }
921 
922 
923 #if ( _DEBUGlevel_ >= 1 )
924  statusOFS << std::endl << " Local supernodal elimination tree is " << etree_supno_l <<std::endl<<std::endl;
925 
926 #endif
927  /* form global e-tree */
928  etree_supno.resize( nsupers );
929  mpi::Allreduce( (Int*) &etree_supno_l[0],(Int *) &etree_supno[0], nsupers, MPI_MIN, grid_->comm );
930  etree_supno[nsupers-1]=nsupers;
931  }
932 
933 #ifndef _RELEASE_
934  double end = MPI_Wtime( );
935  statusOFS<<"Building the list took "<<end-begin<<"s"<<std::endl;
936 #endif
937 #ifndef _RELEASE_
938  PopCallStack();
939 #endif
940  } // ----- end of method PMatrix::GetEtree -----
941 
942 
943 
944 
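 // Added documentation note: GetWorkSet groups supernodes into levels of the
 // (postordered) supernodal etree; each level becomes one pipelined step, and a
 // level longer than min(MPI_MAX_COMM, maxPipelineDepth) is split into several
 // steps. With maxPipelineDepth == 1 every supernode gets its own step.
 // Hypothetical illustration: with 5 supernodes and parent pointers {2,2,4,4,root},
 // the levels relative to the root are {2,2,1,1,0}, so supernodes {3,2} form one
 // step and {1,0} the next.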
945  template<typename T>
946  inline void PMatrix<T>::GetWorkSet(
947  std::vector<Int> & snodeEtree,
948  std::vector<std::vector<Int> > & WSet)
949  {
950  TIMER_START(Compute_WorkSet);
951  Int numSuper = this->NumSuper();
952 
953 
954  if (options_->maxPipelineDepth!=1){
955  //find roots in the supernode etree (it must be postordered)
956  //initialize the parent we are looking at
957  Int rootParent = snodeEtree[numSuper-2];
958 
959  //compute the level of each supernode and the total number of levels
960  IntNumVec level(numSuper);
961  level(rootParent)=0;
962  Int numLevel = 0;
963  for(Int i=rootParent-1; i>=0; i-- ){ level(i) = level(snodeEtree[i])+1; numLevel = std::max(numLevel, level(i)); }
964  numLevel++;
965 
966  //Compute the number of supernodes at each level
967  IntNumVec levelSize(numLevel);
968  SetValue(levelSize,I_ZERO);
969  for(Int i=rootParent-1; i>=0; i-- ){ levelSize(level(i))++; }
970 
971  //Allocate memory
972  WSet.resize(numLevel,std::vector<Int>());
973  for(Int i=0; i<numLevel; i++ ){WSet[i].reserve(levelSize(i));}
974 
975  //Fill the worklist based on the level of each supernode
976  for(Int i=rootParent-1; i>=0; i-- ){
977  WSet[level(i)].push_back(i);
978  }
979 
980  //Constrain the size of each list to be min(MPI_MAX_COMM,options_->maxPipelineDepth)
981  Int limit = (options_->maxPipelineDepth>0)?std::min(MPI_MAX_COMM,options_->maxPipelineDepth):MPI_MAX_COMM;
982  for (Int lidx=0; lidx<WSet.size() ; lidx++){
983  if(WSet[lidx].size()>limit)
984  {
985  std::vector<std::vector<Int> >::iterator pos = WSet.begin()+lidx+1;
986  WSet.insert(pos,std::vector<Int>());
987  WSet[lidx+1].insert(WSet[lidx+1].begin(),WSet[lidx].begin() + limit ,WSet[lidx].end());
988  WSet[lidx].erase(WSet[lidx].begin()+limit,WSet[lidx].end());
989  }
990  }
991 
992 
993 
994  }
995  else{
996  for( Int ksup = numSuper - 2; ksup >= 0; ksup-- ){
997  WSet.push_back(std::vector<Int>());
998  WSet.back().push_back(ksup);
999  }
1000 
1001  }
1002 
1003 
1004 
1005 
1006  TIMER_STOP(Compute_WorkSet);
1007 #if ( _DEBUGlevel_ >= 1 )
1008  for (Int lidx=0; lidx<WSet.size() ; lidx++){
1009  statusOFS << std::endl << "L"<< lidx << " is: {";
1010  for (Int supidx=0; supidx<WSet[lidx].size() ; supidx++){
1011  statusOFS << WSet[lidx][supidx] << " ["<<snodeEtree[WSet[lidx][supidx]]<<"] ";
1012  }
1013  statusOFS << " }"<< std::endl;
1014  }
1015 #endif
1016  }
1017 
1018 /*
1019  template<typename T>
1020  void PMatrix<T>::ConstructCommunicators_Collectives(Int lidx)
1021  {
1022 
1023  Int numSuper = this->NumSuper();
1024  std::vector<std::vector<Int> > & superList = this->WorkingSet();
1025  Int numSteps = superList.size();
1026  Int stepSuper = superList[lidx].size();
1027 
1028  MPI_Group rowCommGroup;
1029  MPI_Comm_group(grid_->rowComm,&rowCommGroup);
1030  MPI_Group colCommGroup;
1031  MPI_Comm_group(grid_->colComm,&colCommGroup);
1032 
1033 
1034  //create the communicators
1035  commSendToRight_.resize(stepSuper,MPI_COMM_NULL);
1036  commSendToBelow_.resize(stepSuper,MPI_COMM_NULL);
1037  commRecvFromBelow_.resize(stepSuper,MPI_COMM_NULL);
1038  for(Int supidx = 0;supidx<stepSuper;supidx++){
1039  Int ksup = superList[lidx][supidx];
1040 
1041  Int count= std::count(commSendToRightMask_[ksup].begin(), commSendToRightMask_[ksup].end(), true);
1042 
1043 
1044  if(count>1){
1045  MPI_Group commGroup;
1046 
1047  Int color = commSendToRightMask_[ksup][MYCOL(grid_)];
1048 
1049  MPI_Comm_split(grid_->rowComm, color ,MYCOL(grid_) , &commSendToRight_[supidx]);
1050  MPI_Comm_group(commSendToRight_[supidx],&commGroup);
 1051  //now for each supernode, we need to store the pointer to the communicator and the rank of the root
1052  Int curRoot = PCOL(ksup,grid_);
1053  Int newRank = -1;
1054  if(color>0){
1055  MPI_Group_translate_ranks(rowCommGroup, 1,&curRoot,commGroup, &newRank);
1056 
1057  if(newRank==MPI_UNDEFINED){
1058  statusOFS<<"["<<ksup<<"] Root COL "<<curRoot<<" has no matching rank"<<std::endl;
1059  }
1060  }
1061 
1062  commSendToRightRoot_[ksup] = newRank;
1063  }
1064 
1065  count= std::count(commSendToBelowMask_[ksup].begin(), commSendToBelowMask_[ksup].end(), true);
1066 
1067  if(count>1){
1068  MPI_Group commGroup;
1069 
1070  Int color = commSendToBelowMask_[ksup][MYROW(grid_)];
1071 
1072  MPI_Comm_split(grid_->colComm, color ,MYROW(grid_) , &commSendToBelow_[supidx]);
1073  MPI_Comm_group(commSendToBelow_[supidx],&commGroup);
 1074  //now for each supernode, we need to store the pointer to the communicator and the rank of the root
1075  Int curRoot = PROW(ksup,grid_);
1076  Int newRank = -1;
1077  if(color>0){
1078  MPI_Group_translate_ranks(colCommGroup, 1,&curRoot,commGroup, &newRank);
1079 
1080  if(newRank==MPI_UNDEFINED){
1081  statusOFS<<"["<<ksup<<"] Root ROW "<<curRoot<<" has no matching rank"<<std::endl;
1082  }
1083  }
1084 
1085  commSendToBelowRoot_[ksup] = newRank;
1086  }
1087 
1088  count= std::count(commRecvFromBelowMask_[ksup].begin(), commRecvFromBelowMask_[ksup].end(), true);
1089  if(count>1){
1090  MPI_Group commGroup;
1091 
1092  Int color = commRecvFromBelowMask_[ksup][MYROW(grid_)];
1093 
1094  MPI_Comm_split(grid_->colComm, color ,MYROW(grid_) , &commRecvFromBelow_[supidx]);
1095  MPI_Comm_group(commRecvFromBelow_[supidx],&commGroup);
 1096  //now for each supernode, we need to store the pointer to the communicator and the rank of the root
1097  Int curRoot = PROW(ksup,grid_);
1098  Int newRank = -1;
1099  if(color>0){
1100  MPI_Group_translate_ranks(colCommGroup, 1,&curRoot,commGroup, &newRank);
1101 
1102  if(newRank==MPI_UNDEFINED){
1103  statusOFS<<"["<<ksup<<"] Root ROW "<<curRoot<<" has no matching rank"<<std::endl;
1104  }
1105  }
1106 
1107  commRecvFromBelowRoot_[ksup] = newRank;
1108  }
1109 
1110  }
1111  };
1112 
1113 
1114 
1115  void PMatrix::DestructCommunicators_Collectives ( )
1116  {
1117 #ifndef _RELEASE_
1118  PushCallStack("Destructing communicators");
1119 #endif
1120  {
1121  for(int i = 0; i< commSendToBelow_.size(); ++i){
1122  if(commSendToBelow_[i]!=MPI_COMM_NULL){
1123  MPI_Comm_free(&commSendToBelow_[i]);
1124  }
1125  }
1126 
1127  for(int i = 0; i< commRecvFromBelow_.size(); ++i){
1128  if(commRecvFromBelow_[i]!=MPI_COMM_NULL){
1129  MPI_Comm_free(&commRecvFromBelow_[i]);
1130  }
1131  }
1132 
1133  for(int i = 0; i< commSendToRight_.size(); ++i){
1134  if(commSendToRight_[i]!=MPI_COMM_NULL){
1135  MPI_Comm_free(&commSendToRight_[i]);
1136  }
1137  }
1138  }
1139 #ifndef _RELEASE_
1140  PopCallStack();
1141 #endif
1142 
1143 
1144  } // ----- end of method PMatrix::DestructCommunicators_Collectives -----
1145 
1146 
1147  inline void PMatrix::SelInvIntra_Collectives(Int lidx)
1148  {
1149 
1150 #if defined (PROFILE) || defined(PMPI) || defined(USE_TAU)
1151  Real begin_SendULWaitContentFirst, end_SendULWaitContentFirst, time_SendULWaitContentFirst = 0;
1152 #endif
1153  Int numSuper = this->NumSuper();
1154  std::vector<std::vector<Int> > & superList = this->WorkingSet();
1155  Int numSteps = superList.size();
1156  Int stepSuper = superList[lidx].size();
1157  TIMER_START(AllocateBuffer);
1158 
1159  //allocate the buffers for this supernode
1160  std::vector<SuperNodeBufferType> arrSuperNodes(stepSuper);
1161  for (Int supidx=0; supidx<stepSuper; supidx++){
1162  arrSuperNodes[supidx].Index = superList[lidx][supidx];
1163  }
1164 
1165  NumMat<Scalar> AinvBuf, UBuf;
1166  TIMER_STOP(AllocateBuffer);
1167 
1168 #ifndef _RELEASE_
1169  PushCallStack("PMatrix::SelInv_Collectives::UpdateL");
1170 #endif
1171 
1172 
1173  TIMER_START(WaitContent_UL);
1174  TIMER_START(WaitContent_UL_Bcast);
1175 
1176  for (Int supidx=0; supidx<stepSuper ; supidx++){
1177  SuperNodeBufferType & snode = arrSuperNodes[supidx];
1178 
1179 
1180  // Communication for the U part.
1181  if(countSendToBelow_(snode.Index)>1){
1182  MPI_Comm * colComm = &commSendToBelow_[supidx];
1183  Int root = commSendToBelowRoot_[snode.Index];
1184  bool isRecvFromAbove = isRecvFromAbove_( snode.Index );
1185  if( MYROW( grid_ ) == PROW( snode.Index, grid_ ) ){
1186 
1187 #if ( _DEBUGlevel_ >= 1 )
1188  statusOFS << std::endl <<"BCAST ["<<snode.Index<<"] Communication to the Schur complement. (Sending)" << std::endl << std::endl;
1189 #endif
1190  // Pack the data in U
1191  std::stringstream sstm;
1192  std::vector<Int> mask( UBlockMask::TOTAL_NUMBER, 1 );
1193  std::vector<UBlock>& Urow = this->U( LBi(snode.Index, grid_) );
1194  // All blocks are to be sent down.
1195  serialize( (Int)Urow.size(), sstm, NO_MASK );
1196  for( Int jb = 0; jb < Urow.size(); jb++ ){
1197  serialize( Urow[jb], sstm, mask );
1198  }
1199  snode.SstrUrowSend.resize( Size( sstm ) );
1200  sstm.read( &snode.SstrUrowSend[0], snode.SstrUrowSend.size() );
1201  snode.SizeSstrUrowSend = snode.SstrUrowSend.size();
1202 
1203  //send size
1204  MPI_Bcast(&snode.SizeSstrUrowSend, 1 , MPI_INT, root, *colComm );
1205  //send content
1206  MPI_Bcast(&snode.SstrUrowSend[0], snode.SizeSstrUrowSend , MPI_BYTE, root, *colComm );
1207  }
1208  else if( isRecvFromAbove &&
1209  MYROW( grid_ ) != PROW( snode.Index, grid_ ) ){
1210 
1211 #if ( _DEBUGlevel_ >= 1 )
1212  statusOFS << std::endl <<"BCAST ["<<snode.Index<<"] Communication to the Schur complement. (Receiving)" << std::endl << std::endl;
1213 #endif
1214  //size
1215  MPI_Bcast(&snode.SizeSstrUrowRecv, 1 , MPI_INT, root, *colComm );
1216  //content
1217  snode.SstrUrowRecv.resize( snode.SizeSstrUrowRecv );
1218  MPI_Bcast(&snode.SstrUrowRecv[0], snode.SizeSstrUrowRecv , MPI_BYTE, root, *colComm );
1219  }
1220  }
1221  // Communication for the L part.
1222  if(countSendToRight_(snode.Index)>1){
1223  MPI_Comm * rowComm = &commSendToRight_[supidx];
1224  Int root = commSendToRightRoot_[snode.Index];
1225  bool isRecvFromLeft = isRecvFromLeft_( snode.Index );
1226  if( MYCOL( grid_ ) == PCOL( snode.Index, grid_ ) ){
1227 
1228 #if ( _DEBUGlevel_ >= 1 )
1229  statusOFS << std::endl <<"BCAST ["<<snode.Index<<"] Communication to the Schur complement. (Sending)" << std::endl << std::endl;
1230 #endif
1231  // Pack the data in L
1232  std::stringstream sstm;
1233  std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
1234  mask[LBlockMask::NZVAL] = 0; // nzval is excluded
1235 
1236  std::vector<LBlock>& Lcol = this->L( LBj(snode.Index, grid_) );
1237  // All blocks except for the diagonal block are to be sent right
1238  if( MYROW( grid_ ) == PROW( snode.Index, grid_ ) )
1239  serialize( (Int)Lcol.size() - 1, sstm, NO_MASK );
1240  else
1241  serialize( (Int)Lcol.size(), sstm, NO_MASK );
1242 
1243  for( Int ib = 0; ib < Lcol.size(); ib++ ){
1244  if( Lcol[ib].blockIdx > snode.Index ){
1245  serialize( Lcol[ib], sstm, mask );
1246  }
1247  }
1248  snode.SstrLcolSend.resize( Size( sstm ) );
1249  sstm.read( &snode.SstrLcolSend[0], snode.SstrLcolSend.size() );
1250  snode.SizeSstrLcolSend = snode.SstrLcolSend.size();
1251  //send size
1252  MPI_Bcast(&snode.SizeSstrLcolSend, 1 , MPI_INT, root, *rowComm );
1253  //send content
1254  MPI_Bcast(&snode.SstrLcolSend[0], snode.SizeSstrLcolSend , MPI_BYTE, root, *rowComm );
1255  }
1256  else if( isRecvFromLeft && MYCOL( grid_ ) != PCOL( snode.Index, grid_ ) ){
1257 #if ( _DEBUGlevel_ >= 1 )
1258  statusOFS << std::endl <<"BCAST ["<<snode.Index<<"] Communication to the Schur complement. (Receiving)" << std::endl << std::endl;
1259 #endif
1260  //size
1261  MPI_Bcast(&snode.SizeSstrLcolRecv, 1 , MPI_INT, root, *rowComm );
1262  //content
1263  snode.SstrLcolRecv.resize( snode.SizeSstrLcolRecv );
1264  MPI_Bcast(&snode.SstrLcolRecv[0], snode.SizeSstrLcolRecv , MPI_BYTE, root, *rowComm );
1265  }
1266  }
1267 
1268  }
1269 
1270  TIMER_STOP(WaitContent_UL_Bcast);
1271  TIMER_STOP(WaitContent_UL);
1272 
1273  TIMER_START(Compute_Sinv_LT);
1274 
1275  for (Int supidx=0; supidx<stepSuper; supidx++){
1276  SuperNodeBufferType & snode = arrSuperNodes[supidx];
1277  // Overlap the communication with computation. All processors move
1278  // to Gemm phase when ready
1279  // Only the processors received information participate in the Gemm
1280  if( isRecvFromAbove_( snode.Index ) && isRecvFromLeft_( snode.Index ) ){
1281 
1282  std::vector<LBlock> LcolRecv;
1283  std::vector<UBlock> UrowRecv;
1284 #if ( _DEBUGlevel_ >= 1 )
1285  statusOFS << std::endl << "BCAST ["<<snode.Index<<"] "<< "Main work: Gemm" << std::endl << std::endl;
1286 #endif
1287 
1288  // Save all the data to be updated for { L( isup, snode.Index ) | isup > snode.Index }.
1289  // The size will be updated in the Gemm phase and the reduce phase
1290  UnpackData(snode, LcolRecv, UrowRecv);
1291 
1292  SelInv_lookup_indexes(snode,LcolRecv, UrowRecv,AinvBuf,UBuf);
1293 
1294  snode.LUpdateBuf.Resize( AinvBuf.m(), SuperSize( snode.Index, super_ ) );
1295 
1296  TIMER_START(Compute_Sinv_LT_GEMM);
1297 
1298 
1299 #if ( _DEBUGlevel_ >= 2 )
1300  statusOFS << std::endl << "BCAST ["<<snode.Index<<"] "<< "AinvBuf: " << AinvBuf << std::endl;
1301  statusOFS << std::endl << "BCAST ["<<snode.Index<<"] "<< "UBuf: " << UBuf << std::endl;
1302 #endif
1303  // Gemm for snode.LUpdateBuf = -AinvBuf * UBuf^T
1304  blas::Gemm( 'N', 'T', AinvBuf.m(), UBuf.m(), AinvBuf.n(), MINUS_ONE<T>(),
1305  AinvBuf.Data(), AinvBuf.m(),
1306  UBuf.Data(), UBuf.m(), ZERO<T>(),
1307  snode.LUpdateBuf.Data(), snode.LUpdateBuf.m() );
1308 
1309  TIMER_STOP(Compute_Sinv_LT_GEMM);
1310 
1311 #if ( _DEBUGlevel_ >= 2 )
1312  statusOFS << std::endl << "BCAST ["<<snode.Index<<"] "<< "snode.LUpdateBuf: " << snode.LUpdateBuf << std::endl;
1313 #endif
1314  } // if Gemm is to be done locally
1315 
1316  }
1317 
1318  TIMER_STOP(Compute_Sinv_LT);
1319 
1320 
1321  //Reduce Sinv L^T to the processors in PCOL(snode.Index,grid_)
1322  TIMER_START(Reduce_Sinv_LT);
1323  for (Int supidx=0; supidx<stepSuper; supidx++){
1324  SuperNodeBufferType & snode = arrSuperNodes[supidx];
1325 
1326  // Processor column of snode.Index collects the symbolic data for snode.LUpdateBuf.
1327  Int numRowLUpdateBuf;
1328 
1329 
1330  if( MYCOL( grid_ ) == PCOL( snode.Index, grid_ ) ){
1331  std::vector<LBlock>& Lcol = this->L( LBj( snode.Index, grid_ ) );
1332  if( MYROW( grid_ ) != PROW( snode.Index, grid_ ) ){
1333  snode.RowLocalPtr.resize( Lcol.size() + 1 );
1334  snode.BlockIdxLocal.resize( Lcol.size() );
1335  snode.RowLocalPtr[0] = 0;
1336  for( Int ib = 0; ib < Lcol.size(); ib++ ){
1337  snode.RowLocalPtr[ib+1] = snode.RowLocalPtr[ib] + Lcol[ib].numRow;
1338  snode.BlockIdxLocal[ib] = Lcol[ib].blockIdx;
1339  }
1340  } // I do not own the diagonal block
1341  else{
1342  snode.RowLocalPtr.resize( Lcol.size() );
1343  snode.BlockIdxLocal.resize( Lcol.size() - 1 );
1344  snode.RowLocalPtr[0] = 0;
1345  for( Int ib = 1; ib < Lcol.size(); ib++ ){
1346  snode.RowLocalPtr[ib] = snode.RowLocalPtr[ib-1] + Lcol[ib].numRow;
1347  snode.BlockIdxLocal[ib-1] = Lcol[ib].blockIdx;
1348  }
 1349  } // I own the diagonal block; skip the diagonal block
1350  numRowLUpdateBuf = *snode.RowLocalPtr.rbegin();
1351  if( numRowLUpdateBuf > 0 ){
1352  if( snode.LUpdateBuf.m() == 0 && snode.LUpdateBuf.n() == 0 ){
1353  snode.LUpdateBuf.Resize( numRowLUpdateBuf, SuperSize( snode.Index, super_ ) );
1354  // Fill zero is important
1355  SetValue( snode.LUpdateBuf, ZERO<T>() );
1356  }
1357  }
1358  }
1359 
1360 
1361  if(countSendToRight_(snode.Index)>1){
1362 
1363  MPI_Comm * rowComm = &commSendToRight_[supidx];
1364  Int root = commSendToRightRoot_[snode.Index];
1365 
1366  // Processor column sends the total row dimension to all processors
1367  // in the same row to prepare for reduce
1368  MPI_Bcast( &numRowLUpdateBuf, 1, MPI_INT, root, *rowComm );
1369 
 1370  // If LUpdateBuf has not been constructed, resize and fill with zero
1371  if( numRowLUpdateBuf > 0 ){
1372  if( snode.LUpdateBuf.m() == 0 && snode.LUpdateBuf.n() == 0 ){
1373  snode.LUpdateBuf.Resize( numRowLUpdateBuf, SuperSize( snode.Index, super_ ) );
1374  // Fill zero is important
1375  SetValue( snode.LUpdateBuf, ZERO<T>() );
1376  }
1377 
1378  if( MYCOL( grid_ ) == PCOL( snode.Index, grid_ ) ){
1379  mpi::Reduce( (Scalar*)MPI_IN_PLACE, snode.LUpdateBuf.Data(),
1380  numRowLUpdateBuf * SuperSize( snode.Index, super_ ), MPI_SUM,
1381  root, *rowComm );
1382  }
1383  else{
1384  mpi::Reduce( snode.LUpdateBuf.Data(), NULL,
1385  numRowLUpdateBuf * SuperSize( snode.Index, super_ ), MPI_SUM,
1386  root, *rowComm );
1387  }
1388 
1389  if( MYCOL( grid_ ) == PCOL( snode.Index, grid_ ) ){
1390 
1391 #if ( _DEBUGlevel_ >= 2 )
1392  statusOFS << std::endl << "BCAST ["<<snode.Index<<"] "<< "LUpdateBuf: " << snode.LUpdateBuf << std::endl << std::endl;
1393 #endif
1394 
1395  }
1396 
1397  } // Perform reduce for nonzero block rows in the column of snode.Index
1398 
1399 
1400  }
1401 
1402 
1403  }
1404 
1405  TIMER_STOP(Reduce_Sinv_LT);
1406  //--------------------- End of reduce of snode.LUpdateBuf-------------------------
1407 
1408 #ifndef _RELEASE_
1409  PopCallStack();
1410 #endif
1411 
1412 #ifndef _RELEASE_
1413  PushCallStack("PMatrix::SelInv_Collectives::UpdateD");
1414 #endif
1415 
1416  TIMER_START(Update_Diagonal);
1417 
1418  for (Int supidx=0; supidx<stepSuper; supidx++){
1419  SuperNodeBufferType & snode = arrSuperNodes[supidx];
1420 
1421  ComputeDiagUpdate(snode);
1422 
1423  } //end for
1424  TIMER_STOP(Update_Diagonal);
1425 
1426 
1427  TIMER_START(Reduce_Diagonal);
1428 
1429 
1430  for (Int supidx=0; supidx<stepSuper; supidx++){
1431  SuperNodeBufferType & snode = arrSuperNodes[supidx];
1432 
1433  if( MYCOL( grid_ ) == PCOL( snode.Index, grid_ ) ){
1434 
1435 
1436 #if ( _DEBUGlevel_ >= 2 )
1437  statusOFS << std::endl << "BCAST ["<<snode.Index<<"] "<< "snode.DiagBuf: " << snode.DiagBuf << std::endl << std::endl;
1438 #endif
1439 
1440 
1441 
1442  if(countRecvFromBelow_(snode.Index)>1)
1443  {
1444  MPI_Comm * colComm = &commRecvFromBelow_[supidx];
1445  Int root = commRecvFromBelowRoot_[snode.Index];
1446 
1447  if( MYROW( grid_ ) == PROW( snode.Index, grid_ ) ){
1448  if(snode.DiagBuf.m()==0 && snode.DiagBuf.n()==0){
1449  snode.DiagBuf.Resize( SuperSize( snode.Index, super_ ), SuperSize( snode.Index, super_ ));
1450  SetValue(snode.DiagBuf, ZERO<T>());
1451  }
1452 
1453  mpi::Reduce((Scalar*) MPI_IN_PLACE, snode.DiagBuf.Data(),
1454  SuperSize( snode.Index, super_ ) * SuperSize( snode.Index, super_ ),
1455  MPI_SUM, root, *colComm );
1456  }
1457  else{
1458 
1459  mpi::Reduce( snode.DiagBuf.Data(), NULL,
1460  SuperSize( snode.Index, super_ ) * SuperSize( snode.Index, super_ ),
1461  MPI_SUM, root, *colComm );
1462  }
1463 
1464  }
1465 
1466  // Add DiagBufReduced to diagonal block.
1467  if( MYROW( grid_ ) == PROW( snode.Index, grid_ ) ){
1468 
1469  if(snode.DiagBuf.m()>0 && snode.DiagBuf.n()>0){
1470  LBlock& LB = this->L( LBj( snode.Index, grid_ ) )[0];
1471  // Symmetrize LB
1472  blas::Axpy( LB.numRow * LB.numCol, ONE<T>(), snode.DiagBuf.Data(),
1473  1, LB.nzval.Data(), 1 );
1474  Symmetrize( LB.nzval );
1475  }
1476  }
1477  }
1478  }
1479 
1480  TIMER_STOP(Reduce_Diagonal);
1481 
1482 #ifndef _RELEASE_
1483  PopCallStack();
1484 #endif
1485 
1486 
1487 #ifndef _RELEASE_
1488  PushCallStack("PMatrix::SelInv_Collectives::UpdateU");
1489 #endif
1490 
1491  SendRecvCD_UpdateU(arrSuperNodes, stepSuper);
1492 
1493 #ifndef _RELEASE_
1494  PopCallStack();
1495 #endif
1496 
1497 
1498 #ifndef _RELEASE_
1499  PushCallStack("PMatrix::SelInv_Collectives::UpdateLFinal");
1500 #endif
1501 
1502  TIMER_START(Update_L);
1503 
1504  for (Int supidx=0; supidx<stepSuper; supidx++){
1505  SuperNodeBufferType & snode = arrSuperNodes[supidx];
1506 
1507 #if ( _DEBUGlevel_ >= 1 )
1508  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Finish updating the L part by filling LUpdateBufReduced back to L" << std::endl << std::endl;
1509 #endif
1510  if( MYCOL( grid_ ) == PCOL( snode.Index, grid_ ) && snode.LUpdateBuf.m() > 0 ){
1511  std::vector<LBlock>& Lcol = this->L( LBj( snode.Index, grid_ ) );
1512  //Need to skip the diagonal block if present
1513  Int startBlock = (MYROW( grid_ ) == PROW( snode.Index, grid_ ))?1:0;
1514  for( Int ib = startBlock; ib < Lcol.size(); ib++ ){
1515  LBlock& LB = Lcol[ib];
1516  lapack::Lacpy( 'A', LB.numRow, LB.numCol, &snode.LUpdateBuf( snode.RowLocalPtr[ib-startBlock], 0 ),
1517  snode.LUpdateBuf.m(), LB.nzval.Data(), LB.numRow );
1518  }
1519  } // Finish updating L
1520  } // for (snode.Index) : Main loop
1521 
1522 
1523  TIMER_STOP(Update_L);
1524 
1525 #ifdef LIST_BARRIER
1526 #ifndef ALL_BARRIER
1527  if (options_->maxPipelineDepth!=-1)
1528 #endif
1529  {
1530  MPI_Barrier(grid_->comm);
1531  }
1532 #endif
1533 
1534 #ifndef _RELEASE_
1535  PopCallStack();
1536 #endif
1537  }
1538 
1539 void PMatrix::ConstructCommunicationPattern_Collectives ( )
1540  {
1541 #if defined (PROFILE) || defined(PMPI) || defined(USE_TAU)
1542 // TAU_PROFILE_SET_CONTEXT(grid_->comm);
1543 #endif
1544 
1545 
1546  TIMER_START(ConstructCommunicationPattern_Collectives);
1547 #ifndef _RELEASE_
1548  PushCallStack("PMatrix::ConstructCommunicationPattern_Collectives");
1549 #endif
1550  Int numSuper = this->NumSuper();
1551 #ifndef _RELEASE_
1552  PushCallStack( "Initialize the communication pattern" );
1553 #endif
1554  isSendToBelow_.Resize(grid_->numProcRow, numSuper);
1555  isSendToRight_.Resize(grid_->numProcCol, numSuper);
1556  SetValue( isSendToBelow_, false );
1557  SetValue( isSendToRight_, false );
1558 
1559 
1560  isRecvFromAbove_.Resize( numSuper );
1561  isRecvFromLeft_.Resize( numSuper );
1562  SetValue( isRecvFromAbove_, false );
1563  SetValue( isRecvFromLeft_, false );
1564 
1565  isSendToCrossDiagonal_.Resize(grid_->numProcCol+1, numSuper );
1566  SetValue( isSendToCrossDiagonal_, false );
1567  isRecvFromCrossDiagonal_.Resize(grid_->numProcRow+1, numSuper );
1568  SetValue( isRecvFromCrossDiagonal_, false );
1569 
1570  countSendToBelow_.Resize(numSuper);
1571  countSendToRight_.Resize(numSuper);
1572  countRecvFromBelow_.Resize( numSuper );
1573  SetValue( countSendToBelow_, 0 );
1574  SetValue( countSendToRight_, 0 );
1575  SetValue( countRecvFromBelow_, 0 );
1576 
1577 
1578 
1579  commSendToRightMask_.resize(numSuper);
1580  commSendToBelowMask_.resize(numSuper);
1581  commRecvFromBelowMask_.resize(numSuper);
1582 
1583 
1584 #ifndef _RELEASE_
1585  PopCallStack();
1586 #endif
1587 
1588 
1589 
1590  TIMER_START(Get_ETREE);
1591 
1592  std::vector<Int> snodeEtree(this->NumSuper());
1593  GetEtree(snodeEtree);
1594 #if ( _DEBUGlevel_ >= 1 )
1595  statusOFS << std::endl << " Supernodal elimination tree is " << snodeEtree <<std::endl<<std::endl;
1596 #endif
1597 
1598  TIMER_STOP(Get_ETREE);
1599 
1600 
1601 
1602 
1603 
1604 
1605 
1606 
1607 #ifndef _RELEASE_
1608  PushCallStack( "Local column communication" );
1609 #endif
1610 #if ( _DEBUGlevel_ >= 1 )
1611  statusOFS << std::endl << "Local column communication" << std::endl;
1612 #endif
1613 
1614 
1615  TIMER_START(Local_Column_Communication);
1616 
1617  // localColBlockRowIdx stores the nonzero block indices for each local block column.
 1618  // The nonzero block indices include contributions from both L and U.
1619  // Dimension: numLocalBlockCol x numNonzeroBlock
1620  std::vector<std::set<Int> > localColBlockRowIdx;
1621  localColBlockRowIdx.resize( this->NumLocalBlockCol() );
1622 
1623  for( Int ksup = 0; ksup < numSuper; ksup++ ){
1624  // All block columns perform independently
1625  if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
1626  std::vector<Int> tBlockRowIdx;
1627  tBlockRowIdx.clear();
1628 
1629  // L part
1630  std::vector<LBlock>& Lcol = this->L( LBj(ksup, grid_) );
1631  for( Int ib = 0; ib < Lcol.size(); ib++ ){
1632  tBlockRowIdx.push_back( Lcol[ib].blockIdx );
1633  }
1634 
1635  // U part
1636  for( Int ib = 0; ib < this->NumLocalBlockRow(); ib++ ){
1637  std::vector<UBlock>& Urow = this->U(ib);
1638  for( Int jb = 0; jb < Urow.size(); jb++ ){
1639  if( Urow[jb].blockIdx == ksup ){
1640  tBlockRowIdx.push_back( GBi( ib, grid_ ) );
1641  }
1642  }
1643  }
1644 
1645  // Communication
1646  std::vector<Int> tAllBlockRowIdx;
1647  mpi::Allgatherv( tBlockRowIdx, tAllBlockRowIdx, grid_->colComm );
1648 
1649  localColBlockRowIdx[LBj( ksup, grid_ )].insert(
1650  tAllBlockRowIdx.begin(), tAllBlockRowIdx.end() );
1651 
1652 #if ( _DEBUGlevel_ >= 1 )
1653  statusOFS
1654  << " Column block " << ksup
1655  << " has the following nonzero block rows" << std::endl;
1656  for( std::set<Int>::iterator si = localColBlockRowIdx[LBj( ksup, grid_ )].begin();
1657  si != localColBlockRowIdx[LBj( ksup, grid_ )].end();
1658  si++ ){
1659  statusOFS << *si << " ";
1660  }
1661  statusOFS << std::endl;
1662 #endif
1663 
1664  } // if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) )
1665  } // for(ksup)
1666 
1667  TIMER_STOP(Local_Column_Communication);
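  // The loop above gathers, for every locally owned block column, the nonzero
  // block-row indices contributed by all processors in the same process column:
  // the local L and U indices are concatenated, exchanged with mpi::Allgatherv
  // over colComm, and merged through a std::set. A stand-alone sketch of what
  // such a variable-length all-gather does, written directly against the MPI C
  // API (mpi::Allgatherv is assumed to wrap this count/displacement bookkeeping;
  // gatherIndices and the use of int instead of Int are illustrative only):
#if 0
  void gatherIndices( const std::vector<int>& localIdx,
                      std::vector<int>& allIdx, MPI_Comm comm )
  {
    int mpisize;  MPI_Comm_size( comm, &mpisize );
    int localCount = (int)localIdx.size();
    std::vector<int> counts( mpisize ), displs( mpisize, 0 );
    MPI_Allgather( &localCount, 1, MPI_INT, &counts[0], 1, MPI_INT, comm );
    for( int p = 1; p < mpisize; p++ ) displs[p] = displs[p-1] + counts[p-1];
    int total = displs[mpisize-1] + counts[mpisize-1];
    allIdx.resize( total );
    if( total == 0 ) return;
    MPI_Allgatherv( localCount ? const_cast<int*>(&localIdx[0]) : NULL, localCount, MPI_INT,
                    &allIdx[0], &counts[0], &displs[0], MPI_INT, comm );
  }
#endif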
1668 
1669 #ifndef _RELEASE_
1670  PopCallStack();
1671 #endif
1672 
1673 
1674 #ifndef _RELEASE_
1675  PushCallStack( "Local row communication" );
1676 #endif
1677 #if ( _DEBUGlevel_ >= 1 )
1678  statusOFS << std::endl << "Local row communication" << std::endl;
1679 #endif
1680 
1681  TIMER_START(Local_Row_Communication);
1682 
1683  // localRowBlockColIdx stores the nonzero block indices for each local block row.
1684  // The nonzero block indices include contributions from both L and U.
1685  // Dimension: numLocalBlockRow x numNonzeroBlock
1686  std::vector<std::set<Int> > localRowBlockColIdx;
1687 
1688  localRowBlockColIdx.resize( this->NumLocalBlockRow() );
1689 
1690  for( Int ksup = 0; ksup < numSuper; ksup++ ){
1691  // All block rows perform independently
1692  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
1693  std::vector<Int> tBlockColIdx;
1694  tBlockColIdx.clear();
1695 
1696  // U part
1697  std::vector<UBlock>& Urow = this->U( LBi(ksup, grid_) );
1698  for( Int jb = 0; jb < Urow.size(); jb++ ){
1699  tBlockColIdx.push_back( Urow[jb].blockIdx );
1700  }
1701 
1702  // L part
1703  for( Int jb = 0; jb < this->NumLocalBlockCol(); jb++ ){
1704  std::vector<LBlock>& Lcol = this->L(jb);
1705  for( Int ib = 0; ib < Lcol.size(); ib++ ){
1706  if( Lcol[ib].blockIdx == ksup ){
1707  tBlockColIdx.push_back( GBj( jb, grid_ ) );
1708  }
1709  }
1710  }
1711 
1712  // Communication
1713  std::vector<Int> tAllBlockColIdx;
1714  mpi::Allgatherv( tBlockColIdx, tAllBlockColIdx, grid_->rowComm );
1715 
1716  localRowBlockColIdx[LBi( ksup, grid_ )].insert(
1717  tAllBlockColIdx.begin(), tAllBlockColIdx.end() );
1718 
1719 #if ( _DEBUGlevel_ >= 1 )
1720  statusOFS
1721  << " Row block " << ksup
1722  << " has the following nonzero block columns" << std::endl;
1723  for( std::set<Int>::iterator si = localRowBlockColIdx[LBi( ksup, grid_ )].begin();
1724  si != localRowBlockColIdx[LBi( ksup, grid_ )].end();
1725  si++ ){
1726  statusOFS << *si << " ";
1727  }
1728  statusOFS << std::endl;
1729 #endif
1730 
1731  } // if( MYROW( grid_ ) == PROW( ksup, grid_ ) )
1732  } // for(ksup)
1733 
1734  TIMER_STOP(Local_Row_Communication);
1735 #ifndef _RELEASE_
1736  PopCallStack();
1737 #endif
1738 
1739 
1740 #ifndef _RELEASE_
1741  PushCallStack("SendToBelow / RecvFromAbove");
1742 #endif
1743 
1744  TIMER_START(SendToBelow_RecvFromAbove);
1745 
1746  for( Int ksup = 0; ksup < numSuper - 1; ksup++ ){
1747  // Loop over all the supernodes to the right of ksup
1748 
1749  std::vector<bool> sTB(grid_->numProcRow,false);
1750 
1751  //Explore the ancestors of ksup
1752  Int jsup = snodeEtree[ksup];
1753  while(jsup<numSuper)
1754  {
1755  Int jsupLocalBlockCol = LBj( jsup, grid_ );
1756  Int jsupProcCol = PCOL( jsup, grid_ );
1757  if( MYCOL( grid_ ) == jsupProcCol ){
1758 
1759  // SendToBelow / RecvFromAbove only if (ksup, jsup) is nonzero.
1760  if( localColBlockRowIdx[jsupLocalBlockCol].count( ksup ) > 0 ) {
1761  for( std::set<Int>::iterator si = localColBlockRowIdx[jsupLocalBlockCol].begin();
1762  si != localColBlockRowIdx[jsupLocalBlockCol].end(); si++ ){
1763  Int isup = *si;
1764  Int isupProcRow = PROW( isup, grid_ );
1765  if( isup > ksup ){
1766  if( MYROW( grid_ ) == isupProcRow ){
1767  isRecvFromAbove_(ksup) = true;
1768  }
1769  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
1770  isSendToBelow_( isupProcRow, ksup ) = true;
1771  }
1772  } // if( isup > ksup )
1773  } // for (si)
1774  } // if( localColBlockRowIdx[jsupLocalBlockCol].count( ksup ) > 0 )
1775 
1776  sTB[ PROW(ksup,grid_) ] = true;
1777  if( localColBlockRowIdx[jsupLocalBlockCol].count( ksup ) > 0 ) {
1778  for( std::set<Int>::iterator si = localColBlockRowIdx[jsupLocalBlockCol].begin();
1779  si != localColBlockRowIdx[jsupLocalBlockCol].end(); si++ ){
1780  Int isup = *si;
1781  Int isupProcRow = PROW( isup, grid_ );
1782  if( isup > ksup ){
1783  sTB[isupProcRow] = true;
1784  } // if( isup > ksup )
1785  } // for (si)
1786  }
1787  } // if( MYCOL( grid_ ) == PCOL( jsup, grid_ ) )
1788 
1789  jsup = snodeEtree[jsup];
1790  } // while(jsup)
1791 
1792 #if ( _DEBUGlevel_ >= 1 )
1793  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
1794  statusOFS<<"["<<ksup<<"] sTB is: "; for(int curi=0; curi<sTB.size();curi++){statusOFS<<sTB[curi]<<" ";} statusOFS<<std::endl;
1795  statusOFS<<"["<<ksup<<"] isSendToBelow_ is: "; for(int curi=0; curi<sTB.size();curi++){statusOFS<<isSendToBelow_(curi,ksup)<<" ";} statusOFS<<std::endl;
1796  }
1797 #endif
1798  TIMER_START(Creating_MaskSendToBelow);
1799  Int count= std::count(sTB.begin(), sTB.end(), true);
1800  Int color = sTB[MYROW(grid_)];
1801  if(count>1){
1802  commSendToBelowMask_[ksup] = sTB;
1803  }
1804  countSendToBelow_(ksup) = count * color;
1805  TIMER_STOP(Creating_MaskSendToBelow);
1806  } // for(ksup)
1807 
1808 #if ( _DEBUGlevel_ >= 1 )
1809  statusOFS << std::endl << "isSendToBelow:" << std::endl;
1810  for(int j = 0;j< isSendToBelow_.n();j++){
1811  statusOFS << "["<<j<<"] ";
1812  for(int i =0; i < isSendToBelow_.m();i++){
1813  statusOFS<< isSendToBelow_(i,j) << " ";
1814  }
1815  statusOFS<<std::endl;
1816  }
1817 
1818  statusOFS << std::endl << "isRecvFromAbove:" << std::endl;
1819  for(int j = 0;j< isRecvFromAbove_.m();j++){
1820  statusOFS << "["<<j<<"] "<< isRecvFromAbove_(j)<<std::endl;
1821  }
1822 #endif
1823 #ifdef PRINT_COMMUNICATOR_STAT
1824  {
1825  statusOFS << std::endl << "countSendToBelow:" << countSendToBelow_ << std::endl;
1826  }
1827 #endif
1828 #ifndef _RELEASE_
1829  PopCallStack();
1830 #endif
1831 
1832 
1833 
1834  TIMER_STOP(SendToBelow_RecvFromAbove);
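  // For every supernode the participation mask sTB is condensed into count (how
  // many process rows hold relevant blocks) and color (whether this process row
  // is one of them): the mask is stored only when count > 1, i.e. when an actual
  // broadcast communicator will be needed later, and countSendToBelow_ is
  // count * color so that non-participating rows record zero. A toy numeric
  // example (all values made up):
#if 0
  // A process column with 4 rows where rows 0, 2 and 3 own nonzero blocks of the
  // current supernode.
  const bool maskVals[4] = { true, false, true, true };
  std::vector<bool> sTB( maskVals, maskVals + 4 );
  Int count = std::count( sTB.begin(), sTB.end(), true );  // == 3
  Int color = sTB[1] ? 1 : 0;                              // pretend MYROW(grid_) == 1
  // commSendToBelowMask_ would be stored (count > 1), but this row records
  // countSendToBelow_ == count * color == 0 and never joins the broadcast.
#endif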
1835 
1836 
1837 #ifndef _RELEASE_
1838  PushCallStack("SendToRight / RecvFromLeft");
1839 #endif
1840 
1841 
1842  TIMER_START(SendToRight_RecvFromLeft);
1843 
1844 
1845  for( Int ksup = 0; ksup < numSuper - 1; ksup++ ){
1846  // Loop over all the supernodes below ksup
1847  std::vector<bool> sTR(grid_->numProcCol,false);
1848  std::vector<bool> rFB(grid_->numProcRow,false);
1849 
1850  Int isup = snodeEtree[ksup];
1851  while(isup<numSuper)
1852  {
1853  Int isupLocalBlockRow = LBi( isup, grid_ );
1854  Int isupProcRow = PROW( isup, grid_ );
1855  if( MYROW( grid_ ) == isupProcRow ){
1856  // SendToRight / RecvFromLeft only if (isup, ksup) is nonzero.
1857  if( localRowBlockColIdx[isupLocalBlockRow].count( ksup ) > 0 ){
1858  for( std::set<Int>::iterator sj = localRowBlockColIdx[isupLocalBlockRow].begin();
1859  sj != localRowBlockColIdx[isupLocalBlockRow].end(); sj++ ){
1860  Int jsup = *sj;
1861  Int jsupProcCol = PCOL( jsup, grid_ );
1862  if( jsup > ksup ){
1863 
1864  if( MYCOL( grid_ ) == jsupProcCol ){
1865  isRecvFromLeft_(ksup) = true;
1866  }
1867  if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
1868  isSendToRight_( jsupProcCol, ksup ) = true;
1869  }
1870  }
1871  } // for (sj)
1872  } // if( localRowBlockColIdx[isupLocalBlockRow].count( ksup ) > 0 )
1873 
1874 
1875  sTR[ PCOL(ksup,grid_) ] = true;
1876  if( localRowBlockColIdx[isupLocalBlockRow].count( ksup ) > 0 ){
1877  for( std::set<Int>::iterator sj = localRowBlockColIdx[isupLocalBlockRow].begin();
1878  sj != localRowBlockColIdx[isupLocalBlockRow].end(); sj++ ){
1879  Int jsup = *sj;
1880  Int jsupProcCol = PCOL( jsup, grid_ );
1881  if( jsup > ksup ){
1882  sTR[ jsupProcCol ] = true;
1883  } // if( jsup > ksup )
1884  } // for (sj)
1885  }
1886  } // if( MYROW( grid_ ) == isupProcRow )
1887 
1888 
1889  rFB[ PROW(ksup,grid_) ] = true;
1890  if( MYCOL( grid_ ) == PCOL(ksup, grid_) ){
1891  rFB[ isupProcRow ] = true;
1892 
1893  } // if( MYCOL( grid_ ) == PCOL(ksup, grid_) )
1894 
1895  isup = snodeEtree[isup];
1896  } // while(isup)
1897 #if ( _DEBUGlevel_ >= 1 )
1898  if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
1899  statusOFS<<"["<<ksup<<"] sTR is: "; for(int curi=0; curi<sTR.size();curi++){statusOFS<<sTR[curi]<<" ";} statusOFS<<std::endl;
1900  statusOFS<<"["<<ksup<<"] rFB is: "; for(int curi=0; curi<rFB.size();curi++){statusOFS<<rFB[curi]<<" ";} statusOFS<<std::endl;
1901  }
1902 #endif
1903  // std::vector<bool> mask( sTR.Data(), sTR.Data() + sTR.m() );
1904  TIMER_START(Creating_MaskSendToRight_RecvFromBelow);
1905  Int count= std::count(sTR.begin(), sTR.end(), true);
1906  Int color = sTR[MYCOL(grid_)];
1907  if(count>1){
1908  commSendToRightMask_[ksup] = sTR;
1909  }
1910  countSendToRight_(ksup) = count * color;
1911 
1912 
1913 
1914  count= std::count(rFB.begin(), rFB.end(), true);
1915  color = rFB[MYROW(grid_)];
1916  if(count>1){
1917  commRecvFromBelowMask_[ksup] = rFB;
1918  }
1919  countRecvFromBelow_(ksup) = count * color;
1920  TIMER_STOP(Creating_MaskSendToRight_RecvFromBelow);
1921 
1922  } // for (ksup)
1923 
1924 
1925 #if ( _DEBUGlevel_ >= 1 )
1926  statusOFS << std::endl << "isSendToRight:" << std::endl;
1927  for(int j = 0;j< isSendToRight_.n();j++){
1928  statusOFS << "["<<j<<"] ";
1929  for(int i =0; i < isSendToRight_.m();i++){
1930  statusOFS<< isSendToRight_(i,j) << " ";
1931  }
1932  statusOFS<<std::endl;
1933  }
1934 
1935  statusOFS << std::endl << "isRecvFromLeft:" << std::endl;
1936  for(int j = 0;j< isRecvFromLeft_.m();j++){
1937  statusOFS << "["<<j<<"] "<< isRecvFromLeft_(j)<<std::endl;
1938  }
1939 
1940  statusOFS << std::endl << "isRecvFromBelow:" << std::endl;
1941  for(int j = 0;j< isRecvFromBelow_.n();j++){
1942  statusOFS << "["<<j<<"] ";
1943  for(int i =0; i < isRecvFromBelow_.m();i++){
1944  statusOFS<< isRecvFromBelow_(i,j) << " ";
1945  }
1946  statusOFS<<std::endl;
1947  }
1948 #endif
1949 
1950 #ifdef PRINT_COMMUNICATOR_STAT
1951  {
1952  statusOFS << std::endl << "countSendToRight:" << countSendToRight_ << std::endl;
1953 
1954 
1955  statusOFS << std::endl << "countRecvFromBelow:" << countRecvFromBelow_ << std::endl;
1956  }
1957 #endif
1958 
1959 #ifndef _RELEASE_
1960  PopCallStack();
1961 #endif
1962 
1963 
1964 
1965 
1966 
1967  TIMER_STOP(SendToRight_RecvFromLeft);
1968 
1969 
1970 
1971 #ifndef _RELEASE_
1972  PushCallStack("SendToCrossDiagonal / RecvFromCrossDiagonal");
1973 #endif
1974 
1975  TIMER_START(SendToCD);
1976 
1977  for( Int ksup = 0; ksup < numSuper - 1; ksup++ ){
1978  if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
1979  for( std::set<Int>::iterator si = localColBlockRowIdx[LBj( ksup, grid_ )].begin();
1980  si != localColBlockRowIdx[LBj( ksup, grid_ )].end(); si++ ){
1981  Int isup = *si;
1982  Int isupProcRow = PROW( isup, grid_ );
1983  Int isupProcCol = PCOL( isup, grid_ );
1984  if( isup > ksup ){
1985  if( MYROW( grid_ ) == isupProcRow){
1986 #if ( _DEBUGlevel_ >= 1 )
1987  statusOFS<<"["<<ksup<<"] should send "<<isup<<" to "<<PNUM(PROW(ksup,grid_),isupProcCol,grid_)<<std::endl;
1988 #endif
1989  isSendToCrossDiagonal_(grid_->numProcCol, ksup ) = true;
1990  isSendToCrossDiagonal_(isupProcCol, ksup ) = true;
1991  }
1992  }
1993  } // for (si)
1994  } // if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) )
1995 
1996  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
1997  for( std::set<Int>::iterator si = localRowBlockColIdx[ LBi(ksup, grid_) ].begin();
1998  si != localRowBlockColIdx[ LBi(ksup, grid_) ].end(); si++ ){
1999  Int jsup = *si;
2000  Int jsupProcCol = PCOL( jsup, grid_ );
2001  Int jsupProcRow = PROW( jsup, grid_ );
2002  if( jsup > ksup){
2003  if( MYCOL(grid_) == jsupProcCol ){
2004 #if ( _DEBUGlevel_ >= 1 )
2005  statusOFS<<"["<<ksup<<"] should receive "<<jsup<<" from "<<PNUM(jsupProcRow,PCOL(ksup,grid_),grid_)<<std::endl;
2006 #endif
2007  isRecvFromCrossDiagonal_(grid_->numProcRow, ksup ) = true;
2008  isRecvFromCrossDiagonal_(jsupProcRow, ksup ) = true;
2009  }
2010  }
2011  } // for (si)
2012  } // if( MYROW( grid_ ) == PROW( ksup, grid_ ) )
2013 
2014  } // for (ksup)
2015 
2016 
2017 
2018 
2019 #if ( _DEBUGlevel_ >= 1 )
2020  statusOFS << std::endl << "isSendToCrossDiagonal:" << std::endl;
2021  for(int j =0; j < isSendToCrossDiagonal_.n();j++){
2022  if(isSendToCrossDiagonal_(grid_->numProcCol,j)){
2023  statusOFS << "["<<j<<"] ";
2024  for(int i =0; i < isSendToCrossDiagonal_.m()-1;i++){
2025  if(isSendToCrossDiagonal_(i,j))
2026  {
2027  statusOFS<< PNUM(PROW(j,grid_),i,grid_)<<" ";
2028  }
2029  }
2030  statusOFS<<std::endl;
2031  }
2032  }
2033 
2034  statusOFS << std::endl << "isRecvFromCrossDiagonal:" << std::endl;
2035  for(int j =0; j < isRecvFromCrossDiagonal_.n();j++){
2036  if(isRecvFromCrossDiagonal_(grid_->numProcRow,j)){
2037  statusOFS << "["<<j<<"] ";
2038  for(int i =0; i < isRecvFromCrossDiagonal_.m()-1;i++){
2039  if(isRecvFromCrossDiagonal_(i,j))
2040  {
2041  statusOFS<< PNUM(i,PCOL(j,grid_),grid_)<<" ";
2042  }
2043  }
2044  statusOFS<<std::endl;
2045  }
2046  }
2047 #endif
2048 
2049 
2050  TIMER_STOP(SendToCD);
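  // The cross-diagonal pattern above pairs the owner of a nonzero block
  // L(isup, ksup), which lives on process ( PROW(isup), PCOL(ksup) ), with the
  // owner of the mirrored block U(ksup, isup) on process ( PROW(ksup),
  // PCOL(isup) ); the extra column numProcCol (resp. row numProcRow) of the mask
  // serves as a per-supernode "any cross-diagonal traffic" flag. A small sketch
  // of the pairing, using the grid macros already used in this file (the block
  // indices here are placeholders):
#if 0
  Int ksup = 0, isup = 1;   // placeholder block indices with isup > ksup
  Int senderRank   = PNUM( PROW( isup, grid_ ), PCOL( ksup, grid_ ), grid_ );  // owns L(isup,ksup)
  Int receiverRank = PNUM( PROW( ksup, grid_ ), PCOL( isup, grid_ ), grid_ );  // owns U(ksup,isup)
#endif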
2051 
2052 #ifndef _RELEASE_
2053  PopCallStack();
2054 #endif
2055 
2056 
2057 #ifndef _RELEASE_
2058  PopCallStack();
2059 #endif
2060 
2061 
2062 
2063  //Build the list of supernodes based on the elimination tree from SuperLU
2064  GetWorkSet(snodeEtree,this->WorkingSet());
2065 
2066  TIMER_STOP(ConstructCommunicationPattern_Collectives);
2067  return ;
2068  } // ----- end of method PMatrix::ConstructCommunicationPattern_Collectives -----
2069 
2070 
2071 
2072 
2073  void PMatrix::ConstructCommunicationPattern_Hybrid ( )
2074  {
2075  TIMER_START(ConstructCommunicationPattern_Hybrid);
2076  #ifndef _RELEASE_
2077  PushCallStack("PMatrix::ConstructCommunicationPattern_Hybrid");
2078  #endif
2079  Int numSuper = this->NumSuper();
2080  #ifndef _RELEASE_
2081  PushCallStack( "Initialize the communication pattern" );
2082  #endif
2083  isSendToBelow_.Resize(grid_->numProcRow, numSuper);
2084  isSendToRight_.Resize(grid_->numProcCol, numSuper);
2085  SetValue( isSendToBelow_, false );
2086  SetValue( isSendToRight_, false );
2087 
2088 
2089  isRecvFromBelow_.Resize(grid_->numProcRow, numSuper);
2090  SetValue( isRecvFromBelow_, false );
2091  isSendToDiagonal_.Resize( numSuper );
2092  SetValue( isSendToDiagonal_, false );
2093 
2094  isRecvFromAbove_.Resize( numSuper );
2095  isRecvFromLeft_.Resize( numSuper );
2096  SetValue( isRecvFromAbove_, false );
2097  SetValue( isRecvFromLeft_, false );
2098 
2099  isSendToCrossDiagonal_.Resize(grid_->numProcCol+1, numSuper );
2100  SetValue( isSendToCrossDiagonal_, false );
2101  isRecvFromCrossDiagonal_.Resize(grid_->numProcRow+1, numSuper );
2102  SetValue( isRecvFromCrossDiagonal_, false );
2103 
2104  countSendToBelow_.Resize(numSuper);
2105  countSendToRight_.Resize(numSuper);
2106  countRecvFromBelow_.Resize( numSuper );
2107  SetValue( countSendToBelow_, 0 );
2108  SetValue( countSendToRight_, 0 );
2109  SetValue( countRecvFromBelow_, 0 );
2110 
2111 
2112 
2113  commSendToRightMask_.resize(numSuper);
2114  commSendToBelowMask_.resize(numSuper);
2115  commRecvFromBelowMask_.resize(numSuper);
2116 
2117 
2118  #ifndef _RELEASE_
2119  PopCallStack();
2120  #endif
2121 
2122 
2123 
2124  TIMER_START(Get_ETREE);
2125 
2126  std::vector<Int> snodeEtree(this->NumSuper());
2127  GetEtree(snodeEtree);
2128  #if ( _DEBUGlevel_ >= 1 )
2129  statusOFS << std::endl << " Supernodal elimination tree is " << snodeEtree <<std::endl<<std::endl;
2130  #endif
2131 
2132  TIMER_STOP(Get_ETREE);
2133 
2134 
2135 
2136 
2137 
2138 
2139 
2140 
2141  #ifndef _RELEASE_
2142  PushCallStack( "Local column communication" );
2143  #endif
2144  #if ( _DEBUGlevel_ >= 1 )
2145  statusOFS << std::endl << "Local column communication" << std::endl;
2146  #endif
2147 
2148 
2149  TIMER_START(Local_Column_Communication);
2150 
2151  // localColBlockRowIdx stores the nonzero block indices for each local block column.
2152  // The nonzero block indices include contributions from both L and U.
2153  // Dimension: numLocalBlockCol x numNonzeroBlock
2154  std::vector<std::set<Int> > localColBlockRowIdx;
2155  localColBlockRowIdx.resize( this->NumLocalBlockCol() );
2156 
2157  for( Int ksup = 0; ksup < numSuper; ksup++ ){
2158  // All block columns perform independently
2159  if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
2160  std::vector<Int> tBlockRowIdx;
2161  tBlockRowIdx.clear();
2162 
2163  // L part
2164  std::vector<LBlock>& Lcol = this->L( LBj(ksup, grid_) );
2165  for( Int ib = 0; ib < Lcol.size(); ib++ ){
2166  tBlockRowIdx.push_back( Lcol[ib].blockIdx );
2167  }
2168 
2169  // U part
2170  for( Int ib = 0; ib < this->NumLocalBlockRow(); ib++ ){
2171  std::vector<UBlock>& Urow = this->U(ib);
2172  for( Int jb = 0; jb < Urow.size(); jb++ ){
2173  if( Urow[jb].blockIdx == ksup ){
2174  tBlockRowIdx.push_back( GBi( ib, grid_ ) );
2175  }
2176  }
2177  }
2178 
2179  // Communication
2180  std::vector<Int> tAllBlockRowIdx;
2181  mpi::Allgatherv( tBlockRowIdx, tAllBlockRowIdx, grid_->colComm );
2182 
2183  localColBlockRowIdx[LBj( ksup, grid_ )].insert(
2184  tAllBlockRowIdx.begin(), tAllBlockRowIdx.end() );
2185 
2186  #if ( _DEBUGlevel_ >= 1 )
2187  statusOFS
2188  << " Column block " << ksup
2189  << " has the following nonzero block rows" << std::endl;
2190  for( std::set<Int>::iterator si = localColBlockRowIdx[LBj( ksup, grid_ )].begin();
2191  si != localColBlockRowIdx[LBj( ksup, grid_ )].end();
2192  si++ ){
2193  statusOFS << *si << " ";
2194  }
2195  statusOFS << std::endl;
2196  #endif
2197 
2198  } // if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) )
2199  } // for(ksup)
2200 
2201  TIMER_STOP(Local_Column_Communication);
2202 
2203  #ifndef _RELEASE_
2204  PopCallStack();
2205  #endif
2206 
2207 
2208  #ifndef _RELEASE_
2209  PushCallStack( "Local row communication" );
2210  #endif
2211  #if ( _DEBUGlevel_ >= 1 )
2212  statusOFS << std::endl << "Local row communication" << std::endl;
2213  #endif
2214 
2215  TIMER_START(Local_Row_Communication);
2216 
2217  // localRowBlockColIdx stores the nonzero block indices for each local block row.
2218  // The nonzero block indices include contributions from both L and U.
2219  // Dimension: numLocalBlockRow x numNonzeroBlock
2220  std::vector<std::set<Int> > localRowBlockColIdx;
2221 
2222  localRowBlockColIdx.resize( this->NumLocalBlockRow() );
2223 
2224  for( Int ksup = 0; ksup < numSuper; ksup++ ){
2225  // All block rows perform independently
2226  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
2227  std::vector<Int> tBlockColIdx;
2228  tBlockColIdx.clear();
2229 
2230  // U part
2231  std::vector<UBlock>& Urow = this->U( LBi(ksup, grid_) );
2232  for( Int jb = 0; jb < Urow.size(); jb++ ){
2233  tBlockColIdx.push_back( Urow[jb].blockIdx );
2234  }
2235 
2236  // L part
2237  for( Int jb = 0; jb < this->NumLocalBlockCol(); jb++ ){
2238  std::vector<LBlock>& Lcol = this->L(jb);
2239  for( Int ib = 0; ib < Lcol.size(); ib++ ){
2240  if( Lcol[ib].blockIdx == ksup ){
2241  tBlockColIdx.push_back( GBj( jb, grid_ ) );
2242  }
2243  }
2244  }
2245 
2246  // Communication
2247  std::vector<Int> tAllBlockColIdx;
2248  mpi::Allgatherv( tBlockColIdx, tAllBlockColIdx, grid_->rowComm );
2249 
2250  localRowBlockColIdx[LBi( ksup, grid_ )].insert(
2251  tAllBlockColIdx.begin(), tAllBlockColIdx.end() );
2252 
2253  #if ( _DEBUGlevel_ >= 1 )
2254  statusOFS
2255  << " Row block " << ksup
2256  << " has the following nonzero block columns" << std::endl;
2257  for( std::set<Int>::iterator si = localRowBlockColIdx[LBi( ksup, grid_ )].begin();
2258  si != localRowBlockColIdx[LBi( ksup, grid_ )].end();
2259  si++ ){
2260  statusOFS << *si << " ";
2261  }
2262  statusOFS << std::endl;
2263  #endif
2264 
2265  } // if( MYROW( grid_ ) == PROW( ksup, grid_ ) )
2266  } // for(ksup)
2267 
2268  TIMER_STOP(Local_Row_Communication);
2269  #ifndef _RELEASE_
2270  PopCallStack();
2271  #endif
2272 
2273 
2274  #ifndef _RELEASE_
2275  PushCallStack("SendToBelow / RecvFromAbove");
2276  #endif
2277 
2278  TIMER_START(SendToBelow_RecvFromAbove);
2279 
2280  for( Int ksup = 0; ksup < numSuper - 1; ksup++ ){
2281  // Loop over all the supernodes to the right of ksup
2282 
2283  std::vector<bool> sTB(grid_->numProcRow,false);
2284 
2285  //Explore the ancestors of ksup
2286  Int jsup = snodeEtree[ksup];
2287  while(jsup<numSuper)
2288  {
2289  Int jsupLocalBlockCol = LBj( jsup, grid_ );
2290  Int jsupProcCol = PCOL( jsup, grid_ );
2291  if( MYCOL( grid_ ) == jsupProcCol ){
2292 
2293  // SendToBelow / RecvFromAbove only if (ksup, jsup) is nonzero.
2294  if( localColBlockRowIdx[jsupLocalBlockCol].count( ksup ) > 0 ) {
2295  for( std::set<Int>::iterator si = localColBlockRowIdx[jsupLocalBlockCol].begin();
2296  si != localColBlockRowIdx[jsupLocalBlockCol].end(); si++ ){
2297  Int isup = *si;
2298  Int isupProcRow = PROW( isup, grid_ );
2299  if( isup > ksup ){
2300  if( MYROW( grid_ ) == isupProcRow ){
2301  isRecvFromAbove_(ksup) = true;
2302  }
2303  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
2304  isSendToBelow_( isupProcRow, ksup ) = true;
2305  }
2306  } // if( isup > ksup )
2307  } // for (si)
2308  } // if( localColBlockRowIdx[jsupLocalBlockCol].count( ksup ) > 0 )
2309 
2310  sTB[ PROW(ksup,grid_) ] = true;
2311  if( localColBlockRowIdx[jsupLocalBlockCol].count( ksup ) > 0 ) {
2312  for( std::set<Int>::iterator si = localColBlockRowIdx[jsupLocalBlockCol].begin();
2313  si != localColBlockRowIdx[jsupLocalBlockCol].end(); si++ ){
2314  Int isup = *si;
2315  Int isupProcRow = PROW( isup, grid_ );
2316  if( isup > ksup ){
2317  sTB[isupProcRow] = true;
2318  } // if( isup > ksup )
2319  } // for (si)
2320  }
2321  } // if( MYCOL( grid_ ) == PCOL( jsup, grid_ ) )
2322 
2323  jsup = snodeEtree[jsup];
2324  } // while(jsup)
2325 
2326  #if ( _DEBUGlevel_ >= 1 )
2327  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
2328  statusOFS<<"["<<ksup<<"] sTB is: "; for(int curi=0; curi<sTB.size();curi++){statusOFS<<sTB[curi]<<" ";} statusOFS<<std::endl;
2329  statusOFS<<"["<<ksup<<"] isSendToBelow_ is: "; for(int curi=0; curi<sTB.size();curi++){statusOFS<<isSendToBelow_(curi,ksup)<<" ";} statusOFS<<std::endl;
2330  }
2331  #endif
2332  TIMER_START(Creating_MaskSendToBelow);
2333  Int count= std::count(sTB.begin(), sTB.end(), true);
2334  Int color = sTB[MYROW(grid_)];
2335  if(count>1){
2336  commSendToBelowMask_[ksup] = sTB;
2337  }
2338  countSendToBelow_(ksup) = count * color;
2339  TIMER_STOP(Creating_MaskSendToBelow);
2340  } // for(ksup)
2341 
2342  #if ( _DEBUGlevel_ >= 1 )
2343  statusOFS << std::endl << "isSendToBelow:" << std::endl;
2344  for(int j = 0;j< isSendToBelow_.n();j++){
2345  statusOFS << "["<<j<<"] ";
2346  for(int i =0; i < isSendToBelow_.m();i++){
2347  statusOFS<< isSendToBelow_(i,j) << " ";
2348  }
2349  statusOFS<<std::endl;
2350  }
2351 
2352  statusOFS << std::endl << "isRecvFromAbove:" << std::endl;
2353  for(int j = 0;j< isRecvFromAbove_.m();j++){
2354  statusOFS << "["<<j<<"] "<< isRecvFromAbove_(j)<<std::endl;
2355  }
2356  #endif
2357  #ifdef PRINT_COMMUNICATOR_STAT
2358  {
2359  statusOFS << std::endl << "countSendToBelow:" << countSendToBelow_ << std::endl;
2360  }
2361  #endif
2362  #ifndef _RELEASE_
2363  PopCallStack();
2364  #endif
2365 
2366 
2367 
2368  TIMER_STOP(SendToBelow_RecvFromAbove);
2369 
2370 
2371  #ifndef _RELEASE_
2372  PushCallStack("SendToRight / RecvFromLeft");
2373  #endif
2374 
2375 
2376  TIMER_START(SendToRight_RecvFromLeft);
2377 
2378 
2379  for( Int ksup = 0; ksup < numSuper - 1; ksup++ ){
2380  // Loop over all the supernodes below ksup
2381  std::vector<bool> sTR(grid_->numProcCol,false);
2382  std::vector<bool> rFB(grid_->numProcRow,false);
2383 
2384  Int isup = snodeEtree[ksup];
2385  while(isup<numSuper)
2386  {
2387  Int isupLocalBlockRow = LBi( isup, grid_ );
2388  Int isupProcRow = PROW( isup, grid_ );
2389  if( MYROW( grid_ ) == isupProcRow ){
2390  // SendToRight / RecvFromLeft only if (isup, ksup) is nonzero.
2391  if( localRowBlockColIdx[isupLocalBlockRow].count( ksup ) > 0 ){
2392  for( std::set<Int>::iterator sj = localRowBlockColIdx[isupLocalBlockRow].begin();
2393  sj != localRowBlockColIdx[isupLocalBlockRow].end(); sj++ ){
2394  Int jsup = *sj;
2395  Int jsupProcCol = PCOL( jsup, grid_ );
2396  if( jsup > ksup ){
2397 
2398  if( MYCOL( grid_ ) == jsupProcCol ){
2399  isRecvFromLeft_(ksup) = true;
2400  }
2401  if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
2402  isSendToRight_( jsupProcCol, ksup ) = true;
2403  }
2404  }
2405  } // for (sj)
2406  } // if( localRowBlockColIdx[isupLocalBlockRow].count( ksup ) > 0 )
2407 
2408 
2409  sTR[ PCOL(ksup,grid_) ] = true;
2410  if( localRowBlockColIdx[isupLocalBlockRow].count( ksup ) > 0 ){
2411  for( std::set<Int>::iterator sj = localRowBlockColIdx[isupLocalBlockRow].begin();
2412  sj != localRowBlockColIdx[isupLocalBlockRow].end(); sj++ ){
2413  Int jsup = *sj;
2414  Int jsupProcCol = PCOL( jsup, grid_ );
2415  if( jsup > ksup ){
2416  sTR[ jsupProcCol ] = true;
2417  } // if( jsup > ksup )
2418  } // for (sj)
2419  }
2420  } // if( MYROW( grid_ ) == isupProcRow )
2421 
2422 
2423  rFB[ PROW(ksup,grid_) ] = true;
2424  if( MYCOL( grid_ ) == PCOL(ksup, grid_) ){
2425  rFB[ isupProcRow ] = true;
2426 
2427  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
2428  isRecvFromBelow_(isupProcRow,ksup) = true;
2429  }
2430  else if (MYROW(grid_) == isupProcRow){
2431  isSendToDiagonal_(ksup)=true;
2432  }
2433  } // if( MYCOL( grid_ ) == PCOL(ksup, grid_) )
2434 
2435  isup = snodeEtree[isup];
2436  } // while(isup)
2437  #if ( _DEBUGlevel_ >= 1 )
2438  if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
2439  statusOFS<<"["<<ksup<<"] sTR is: "; for(int curi=0; curi<sTR.size();curi++){statusOFS<<sTR[curi]<<" ";} statusOFS<<std::endl;
2440  statusOFS<<"["<<ksup<<"] rFB is: "; for(int curi=0; curi<rFB.size();curi++){statusOFS<<rFB[curi]<<" ";} statusOFS<<std::endl;
2441  }
2442  #endif
2443  // std::vector<bool> mask( sTR.Data(), sTR.Data() + sTR.m() );
2444  TIMER_START(Creating_MaskSendToRight_RecvFromBelow);
2445  Int count= std::count(sTR.begin(), sTR.end(), true);
2446  Int color = sTR[MYCOL(grid_)];
2447  if(count>1){
2448  commSendToRightMask_[ksup] = sTR;
2449  }
2450  countSendToRight_(ksup) = count * color;
2451 
2452 
2453 
2454  count= std::count(rFB.begin(), rFB.end(), true);
2455  color = rFB[MYROW(grid_)];
2456  if(count>1){
2457  commRecvFromBelowMask_[ksup] = rFB;
2458  }
2459  countRecvFromBelow_(ksup) = count * color;
2460  TIMER_STOP(Creating_MaskSendToRight_RecvFromBelow);
2461 
2462  } // for (ksup)
2463 
2464 
2465  #if ( _DEBUGlevel_ >= 1 )
2466  statusOFS << std::endl << "isSendToRight:" << std::endl;
2467  for(int j = 0;j< isSendToRight_.n();j++){
2468  statusOFS << "["<<j<<"] ";
2469  for(int i =0; i < isSendToRight_.m();i++){
2470  statusOFS<< isSendToRight_(i,j) << " ";
2471  }
2472  statusOFS<<std::endl;
2473  }
2474 
2475  statusOFS << std::endl << "isRecvFromLeft:" << std::endl;
2476  for(int j = 0;j< isRecvFromLeft_.m();j++){
2477  statusOFS << "["<<j<<"] "<< isRecvFromLeft_(j)<<std::endl;
2478  }
2479 
2480  statusOFS << std::endl << "isRecvFromBelow:" << std::endl;
2481  for(int j = 0;j< isRecvFromBelow_.n();j++){
2482  statusOFS << "["<<j<<"] ";
2483  for(int i =0; i < isRecvFromBelow_.m();i++){
2484  statusOFS<< isRecvFromBelow_(i,j) << " ";
2485  }
2486  statusOFS<<std::endl;
2487  }
2488  #endif
2489 
2490  #ifdef PRINT_COMMUNICATOR_STAT
2491  {
2492  statusOFS << std::endl << "countSendToRight:" << countSendToRight_ << std::endl;
2493 
2494 
2495  statusOFS << std::endl << "countRecvFromBelow:" << countRecvFromBelow_ << std::endl;
2496  }
2497  #endif
2498 
2499  #ifndef _RELEASE_
2500  PopCallStack();
2501  #endif
2502 
2503 
2504 
2505 
2506 
2507  TIMER_STOP(SendToRight_RecvFromLeft);
2508 
2509 
2510 
2511  #ifndef _RELEASE_
2512  PushCallStack("SendToCrossDiagonal / RecvFromCrossDiagonal");
2513  #endif
2514 
2515  TIMER_START(SendToCD);
2516 
2517  for( Int ksup = 0; ksup < numSuper - 1; ksup++ ){
2518  if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
2519  for( std::set<Int>::iterator si = localColBlockRowIdx[LBj( ksup, grid_ )].begin();
2520  si != localColBlockRowIdx[LBj( ksup, grid_ )].end(); si++ ){
2521  Int isup = *si;
2522  Int isupProcRow = PROW( isup, grid_ );
2523  Int isupProcCol = PCOL( isup, grid_ );
2524  if( isup > ksup ){
2525  if( MYROW( grid_ ) == isupProcRow){
2526  #if ( _DEBUGlevel_ >= 1 )
2527  statusOFS<<"["<<ksup<<"] should send "<<isup<<" to "<<PNUM(PROW(ksup,grid_),isupProcCol,grid_)<<std::endl;
2528  #endif
2529  isSendToCrossDiagonal_(grid_->numProcCol, ksup ) = true;
2530  isSendToCrossDiagonal_(isupProcCol, ksup ) = true;
2531  }
2532  }
2533  } // for (si)
2534  } // if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) )
2535 
2536  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
2537  for( std::set<Int>::iterator si = localRowBlockColIdx[ LBi(ksup, grid_) ].begin();
2538  si != localRowBlockColIdx[ LBi(ksup, grid_) ].end(); si++ ){
2539  Int jsup = *si;
2540  Int jsupProcCol = PCOL( jsup, grid_ );
2541  Int jsupProcRow = PROW( jsup, grid_ );
2542  if( jsup > ksup){
2543  if( MYCOL(grid_) == jsupProcCol ){
2544  #if ( _DEBUGlevel_ >= 1 )
2545  statusOFS<<"["<<ksup<<"] should receive "<<jsup<<" from "<<PNUM(jsupProcRow,PCOL(ksup,grid_),grid_)<<std::endl;
2546  #endif
2547  isRecvFromCrossDiagonal_(grid_->numProcRow, ksup ) = true;
2548  isRecvFromCrossDiagonal_(jsupProcRow, ksup ) = true;
2549  }
2550  }
2551  } // for (si)
2552  } // if( MYROW( grid_ ) == PROW( ksup, grid_ ) )
2553 
2554  } // for (ksup)
2555 
2556 
2557 
2558 
2559  #if ( _DEBUGlevel_ >= 1 )
2560  statusOFS << std::endl << "isSendToCrossDiagonal:" << std::endl;
2561  for(int j =0; j < isSendToCrossDiagonal_.n();j++){
2562  if(isSendToCrossDiagonal_(grid_->numProcCol,j)){
2563  statusOFS << "["<<j<<"] ";
2564  for(int i =0; i < isSendToCrossDiagonal_.m()-1;i++){
2565  if(isSendToCrossDiagonal_(i,j))
2566  {
2567  statusOFS<< PNUM(PROW(j,grid_),i,grid_)<<" ";
2568  }
2569  }
2570  statusOFS<<std::endl;
2571  }
2572  }
2573 
2574  statusOFS << std::endl << "isRecvFromCrossDiagonal:" << std::endl;
2575  for(int j =0; j < isRecvFromCrossDiagonal_.n();j++){
2576  if(isRecvFromCrossDiagonal_(grid_->numProcRow,j)){
2577  statusOFS << "["<<j<<"] ";
2578  for(int i =0; i < isRecvFromCrossDiagonal_.m()-1;i++){
2579  if(isRecvFromCrossDiagonal_(i,j))
2580  {
2581  statusOFS<< PNUM(i,PCOL(j,grid_),grid_)<<" ";
2582  }
2583  }
2584  statusOFS<<std::endl;
2585  }
2586  }
2587  #endif
2588 
2589 
2590  TIMER_STOP(SendToCD);
2591 
2592  #ifndef _RELEASE_
2593  PopCallStack();
2594  #endif
2595 
2596 
2597  #ifndef _RELEASE_
2598  PopCallStack();
2599  #endif
2600 
2601 
2602 
2603  //Build the list of supernodes based on the elimination tree from SuperLU
2604  GetWorkSet(snodeEtree,this->WorkingSet());
2605 
2606  TIMER_STOP(ConstructCommunicationPattern_Hybrid);
2607  return ;
2608  } // ----- end of method PMatrix::ConstructCommunicationPattern_Hybrid -----
2609 
2610 
2611  void PMatrix::SelInv_Collectives ( )
2612  {
2613  TIMER_START(SelInv_Collectives);
2614 
2615 #ifndef _RELEASE_
2616  PushCallStack("PMatrix::SelInv_Collectives");
2617 #endif
2618 
2619 
2620 
2621  Int numSuper = this->NumSuper();
2622 
2623  commSendToRightRoot_.resize(numSuper);
2624  commSendToBelowRoot_.resize(numSuper);
2625  commRecvFromBelowRoot_.resize(numSuper);
2626 
2627 
2628  // Main loop
2629  std::vector<std::vector<Int> > & superList = this->WorkingSet();
2630  Int numSteps = superList.size();
2631 
2632  for (Int lidx=0; lidx<numSteps ; lidx++){
2633  Int stepSuper = superList[lidx].size();
2634  ConstructCommunicators_Collectives(lidx);
2635  SelInvIntra_Collectives(lidx);
2636  DestructCommunicators_Collectives ( );
2637  }
2638 
2639 #ifndef _RELEASE_
2640  PopCallStack();
2641 #endif
2642 
2643  TIMER_STOP(SelInv_Collectives);
2644 
2645 
2646  return ;
2647  } // ----- end of method PMatrix::SelInv_Collectives -----
2648 
2649 
2650  void PMatrix::SelInv_Hybrid (Int threshold = BCAST_THRESHOLD)
2651  {
2652  TIMER_START(SelInv_Hybrid);
2653 
2654 #ifndef _RELEASE_
2655  PushCallStack("PMatrix::SelInv_Hybrid");
2656 #endif
2657 
2658 
2659 
2660  Int numSuper = this->NumSuper();
2661 
2662  commSendToRightRoot_.resize(numSuper);
2663  commSendToBelowRoot_.resize(numSuper);
2664  commRecvFromBelowRoot_.resize(numSuper);
2665 
2666  getMaxCommunicatorSizes();
2667 
2668  // Main loop
2669  std::vector<std::vector<Int> > & superList = this->WorkingSet();
2670  Int numSteps = superList.size();
2671 
2672  for (Int lidx=0; lidx<numSteps ; lidx++){
2673  Int stepSuper = superList[lidx].size();
2674 
2675  Int logMaxSize = std::ceil(log2(maxCommSizes_[lidx]));
2676 
2677  if(maxCommSizes_[lidx]>1 && maxCommSizes_[lidx] > logMaxSize*threshold){
2678  ConstructCommunicators_Collectives(lidx);
2679  SelInvIntra_Collectives(lidx);
2680  DestructCommunicators_Collectives ( );
2681  }
2682  else
2683  {
2684  SelInvIntra_P2p(lidx);
2685  }
2686  }
2687 
2688 #ifndef _RELEASE_
2689  PopCallStack();
2690 #endif
2691 
2692  TIMER_STOP(SelInv_Hybrid);
2693 
2694 
2695  return ;
2696  } // ----- end of method PMatrix::SelInv_Hybrid -----
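  // Numeric illustration of the dispatch rule above, assuming the default
  // threshold of 20 (the values of maxCommSizes_ are made up):
  //   maxCommSizes_[lidx] == 64  -> logMaxSize == 6,  64 <= 6*20 = 120  -> point-to-point path
  //   maxCommSizes_[lidx] == 256 -> logMaxSize == 8,  256 > 8*20 = 160  -> collective path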
2697 */
2698 
2699  template<typename T>
2700  inline void PMatrix<T>::SelInvIntra_P2p(Int lidx)
2701  {
2702 
2703 #if defined (PROFILE) || defined(PMPI) || defined(USE_TAU)
2704  Real begin_SendULWaitContentFirst, end_SendULWaitContentFirst, time_SendULWaitContentFirst = 0;
2705 #endif
2706  Int numSuper = this->NumSuper();
2707  std::vector<std::vector<Int> > & superList = this->WorkingSet();
2708  Int numSteps = superList.size();
2709  Int stepSuper = superList[lidx].size();
2710 
2711  TIMER_START(AllocateBuffer);
2712 
2713  //This is required to send the size and content of U/L
2714  std::vector<std::vector<MPI_Request> > arrMpireqsSendToBelow;
2715  arrMpireqsSendToBelow.resize( stepSuper, std::vector<MPI_Request>( 2 * grid_->numProcRow, MPI_REQUEST_NULL ));
2716  std::vector<std::vector<MPI_Request> > arrMpireqsSendToRight;
2717  arrMpireqsSendToRight.resize(stepSuper, std::vector<MPI_Request>( 2 * grid_->numProcCol, MPI_REQUEST_NULL ));
2718 
2719  //This is required to reduce L
2720  std::vector<MPI_Request> arrMpireqsSendToLeft;
2721  arrMpireqsSendToLeft.resize(stepSuper, MPI_REQUEST_NULL );
2722  //This is required to reduce D
2723  std::vector<MPI_Request> arrMpireqsSendToAbove;
2724  arrMpireqsSendToAbove.resize(stepSuper, MPI_REQUEST_NULL );
2725 
2726  //This is required to receive the size and content of U/L
2727  std::vector<MPI_Request> arrMpireqsRecvSizeFromAny;
2728  arrMpireqsRecvSizeFromAny.resize(stepSuper*2 , MPI_REQUEST_NULL);
2729  std::vector<MPI_Request> arrMpireqsRecvContentFromAny;
2730  arrMpireqsRecvContentFromAny.resize(stepSuper*2 , MPI_REQUEST_NULL);
2731 
2732 
2733  // Allocate the buffers for the supernodes processed in this step
2734  std::vector<SuperNodeBufferType> arrSuperNodes(stepSuper);
2735  for (Int supidx=0; supidx<stepSuper; supidx++){
2736  arrSuperNodes[supidx].Index = superList[lidx][supidx];
2737  }
2738 
2739  NumMat<T> AinvBuf, UBuf;
2740 
2741  TIMER_STOP(AllocateBuffer);
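  // The request arrays above hold two slots per supernode (size and content) and
  // are initialized to MPI_REQUEST_NULL, so the later Waitall / Waitsome calls can
  // always be issued over the full arrays: MPI treats null requests as already
  // completed. A minimal stand-alone sketch of that idiom (standard MPI semantics,
  // not PEXSI-specific; the self-message on MPI_COMM_SELF only makes the example
  // complete):
#if 0
  std::vector<MPI_Request> reqs( 4, MPI_REQUEST_NULL );
  int sendVal = 42, recvVal = 0, tag = 7;
  MPI_Irecv( &recvVal, 1, MPI_INT, MPI_ANY_SOURCE, tag, MPI_COMM_SELF, &reqs[0] );
  MPI_Isend( &sendVal, 1, MPI_INT, 0, tag, MPI_COMM_SELF, &reqs[1] );
  MPI_Waitall( (int)reqs.size(), &reqs[0], MPI_STATUSES_IGNORE );  // slots 2 and 3 are null
#endif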
2742 
2743 #ifndef _RELEASE_
2744  PushCallStack("PMatrix::SelInv_P2p::UpdateL");
2745 #endif
2746 #if ( _DEBUGlevel_ >= 1 )
2747  statusOFS << std::endl << "Communication to the Schur complement." << std::endl << std::endl;
2748 #endif
2749 
2750  {
2751  // Senders
2752  for (Int supidx=0; supidx<stepSuper; supidx++){
2753  SuperNodeBufferType & snode = arrSuperNodes[supidx];
2754  std::vector<MPI_Request> & mpireqsSendToBelow = arrMpireqsSendToBelow[supidx];
2755  std::vector<MPI_Request> & mpireqsSendToRight = arrMpireqsSendToRight[supidx];
2756 
2757 #if ( _DEBUGlevel_ >= 1 )
2758  statusOFS << std::endl << "["<<snode.Index<<"] "
2759  << "Communication for the U part." << std::endl << std::endl;
2760 #endif
2761 
2762  // Communication for the U part.
2763  if( MYROW( grid_ ) == PROW( snode.Index, grid_ ) ){
2764  // Pack the data in U
2765  TIMER_START(Serialize_UL);
2766  std::stringstream sstm;
2767  std::vector<Int> mask( UBlockMask::TOTAL_NUMBER, 1 );
2768  std::vector<UBlock<T> >& Urow = this->U( LBi(snode.Index, grid_) );
2769  // All blocks are to be sent down.
2770  serialize( (Int)Urow.size(), sstm, NO_MASK );
2771  for( Int jb = 0; jb < Urow.size(); jb++ ){
2772  serialize( Urow[jb], sstm, mask );
2773  }
2774  snode.SstrUrowSend.resize( Size( sstm ) );
2775  sstm.read( &snode.SstrUrowSend[0], snode.SstrUrowSend.size() );
2776  snode.SizeSstrUrowSend = snode.SstrUrowSend.size();
2777  TIMER_STOP(Serialize_UL);
2778 
2779  for( Int iProcRow = 0; iProcRow < grid_->numProcRow; iProcRow++ ){
2780  if( MYROW( grid_ ) != iProcRow &&
2781  isSendToBelow_( iProcRow,snode.Index ) == true ){
2782  // Use Isend to send to multiple targets
2783  MPI_Isend( &snode.SizeSstrUrowSend, 1, MPI_INT,
2784  iProcRow, IDX_TO_TAG(supidx,SELINV_TAG_U_SIZE), grid_->colComm, &mpireqsSendToBelow[2*iProcRow] );
2785  MPI_Isend( (void*)&snode.SstrUrowSend[0], snode.SizeSstrUrowSend, MPI_BYTE,
2786  iProcRow, IDX_TO_TAG(supidx,SELINV_TAG_U_CONTENT),
2787  grid_->colComm, &mpireqsSendToBelow[2*iProcRow+1] );
2788 #if ( _DEBUGlevel_ >= 1 )
2789  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Sending U " << snode.SizeSstrUrowSend << " BYTES"<< std::endl << std::endl;
2790 #endif
2791  } // Send
2792  } // for (iProcRow)
2793  } // if I am the sender
2794 
2795 #if ( _DEBUGlevel_ >= 1 )
2796  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Communication for the L part." << std::endl << std::endl;
2797 #endif
2798 
2799 
2800 
2801  // Communication for the L part.
2802  if( MYCOL( grid_ ) == PCOL( snode.Index, grid_ ) ){
2803 
2804  TIMER_START(Serialize_UL);
2805  // Pack the data in L
2806  std::stringstream sstm;
2807  std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
2808  mask[LBlockMask::NZVAL] = 0; // nzval is excluded
2809 
2810  std::vector<LBlock<T> >& Lcol = this->L( LBj(snode.Index, grid_) );
2811  // All blocks except for the diagonal block are to be sent right
2812 
2813  if( MYROW( grid_ ) == PROW( snode.Index, grid_ ) )
2814  serialize( (Int)Lcol.size() - 1, sstm, NO_MASK );
2815  else
2816  serialize( (Int)Lcol.size(), sstm, NO_MASK );
2817 
2818  for( Int ib = 0; ib < Lcol.size(); ib++ ){
2819  if( Lcol[ib].blockIdx > snode.Index ){
2820 #if ( _DEBUGlevel_ >= 2 )
2821  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Serializing Block index " << Lcol[ib].blockIdx << std::endl;
2822 #endif
2823  serialize( Lcol[ib], sstm, mask );
2824  }
2825  }
2826  snode.SstrLcolSend.resize( Size( sstm ) );
2827  sstm.read( &snode.SstrLcolSend[0], snode.SstrLcolSend.size() );
2828  snode.SizeSstrLcolSend = snode.SstrLcolSend.size();
2829  TIMER_STOP(Serialize_UL);
2830 
2831  for( Int iProcCol = 0; iProcCol < grid_->numProcCol ; iProcCol++ ){
2832  if( MYCOL( grid_ ) != iProcCol &&
2833  isSendToRight_( iProcCol, snode.Index ) == true ){
2834  // Use Isend to send to multiple targets
2835  MPI_Isend( &snode.SizeSstrLcolSend, 1, MPI_INT,
2836  iProcCol, IDX_TO_TAG(supidx,SELINV_TAG_L_SIZE),
2837  grid_->rowComm, &mpireqsSendToRight[2*iProcCol] );
2838  MPI_Isend( (void*)&snode.SstrLcolSend[0], snode.SizeSstrLcolSend, MPI_BYTE,
2839  iProcCol, IDX_TO_TAG(supidx,SELINV_TAG_L_CONTENT),
2840  grid_->rowComm, &mpireqsSendToRight[2*iProcCol+1] );
2841 #if ( _DEBUGlevel_ >= 1 )
2842  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Sending L " << snode.SizeSstrLcolSend<< " BYTES" << std::endl << std::endl;
2843 #endif
2844  } // Send
2845  } // for (iProcCol)
2846  } // if I am the sender
2847  } //Senders
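  // Each sender above serializes the variable-sized U row (or the L column
  // skeleton, with nzval masked out) into a std::stringstream and then posts two
  // non-blocking sends per target: the byte count under SELINV_TAG_*_SIZE and the
  // payload under SELINV_TAG_*_CONTENT, with the per-supernode offset produced by
  // IDX_TO_TAG. A minimal sketch of the size-then-content pattern, using a plain
  // string in place of the serialized blocks and illustrative tags, closed over
  // MPI_COMM_SELF so it is self-contained:
#if 0
  std::string sendBuf = "serialized blocks would go here";
  int sendSize = (int)sendBuf.size(), recvSize = 0;
  MPI_Request req[2];
  MPI_Irecv( &recvSize, 1, MPI_INT, 0, 100, MPI_COMM_SELF, &req[0] );
  MPI_Isend( &sendSize, 1, MPI_INT, 0, 100, MPI_COMM_SELF, &req[1] );
  MPI_Waitall( 2, req, MPI_STATUSES_IGNORE );
  std::vector<char> recvBuf( recvSize );
  MPI_Irecv( &recvBuf[0], recvSize, MPI_BYTE, 0, 101, MPI_COMM_SELF, &req[0] );
  MPI_Isend( (void*)sendBuf.data(), sendSize, MPI_BYTE, 0, 101, MPI_COMM_SELF, &req[1] );
  MPI_Waitall( 2, req, MPI_STATUSES_IGNORE );
  // The receiver-side resize-then-receive step mirrors what is done below for
  // SizeSstrUrowRecv / SstrUrowRecv.
#endif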
2848 
2849  // TODO: Ideally, data should not be received in a fixed order; if ksup were packed together with the data, messages could be processed in any order of arrival.
2850  // Receivers (Size)
2851  for (Int supidx=0; supidx<stepSuper ; supidx++){
2852  SuperNodeBufferType & snode = arrSuperNodes[supidx];
2853  MPI_Request * mpireqsRecvFromAbove = &arrMpireqsRecvSizeFromAny[supidx*2];
2854  MPI_Request * mpireqsRecvFromLeft = &arrMpireqsRecvSizeFromAny[supidx*2+1];
2855 
2856  // Receive the size first
2857  if( isRecvFromAbove_( snode.Index ) &&
2858  MYROW( grid_ ) != PROW( snode.Index, grid_ ) ){
2859  MPI_Irecv( &snode.SizeSstrUrowRecv, 1, MPI_INT, PROW( snode.Index, grid_ ),
2860  IDX_TO_TAG(supidx,SELINV_TAG_U_SIZE),
2861  grid_->colComm, mpireqsRecvFromAbove );
2862 #if ( _DEBUGlevel_ >= 1 )
2863  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Receiving U size on tag " << IDX_TO_TAG(supidx,SELINV_TAG_U_SIZE)<< std::endl << std::endl;
2864 #endif
2865  } // if I need to receive from up
2866 
2867 
2868  if( isRecvFromLeft_( snode.Index ) &&
2869  MYCOL( grid_ ) != PCOL( snode.Index, grid_ ) ){
2870  MPI_Irecv( &snode.SizeSstrLcolRecv, 1, MPI_INT, PCOL( snode.Index, grid_ ),
2871  IDX_TO_TAG(supidx,SELINV_TAG_L_SIZE),
2872  grid_->rowComm, mpireqsRecvFromLeft );
2873 #if ( _DEBUGlevel_ >= 1 )
2874  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Receiving L size on tag " << IDX_TO_TAG(supidx,SELINV_TAG_L_SIZE)<< std::endl << std::endl;
2875 #endif
2876  } // if I need to receive from left
2877  }
2878 
2879  //Wait to receive all the sizes
2880  TIMER_START(WaitSize_UL);
2881  mpi::Waitall(arrMpireqsRecvSizeFromAny);
2882  TIMER_STOP(WaitSize_UL);
2883 
2884  // Receivers (Content)
2885  for (Int supidx=0; supidx<stepSuper ; supidx++){
2886  SuperNodeBufferType & snode = arrSuperNodes[supidx];
2887 
2888  MPI_Request * mpireqsRecvFromAbove = &arrMpireqsRecvContentFromAny[supidx*2];
2889  MPI_Request * mpireqsRecvFromLeft = &arrMpireqsRecvContentFromAny[supidx*2+1];
2890 
2891  if( isRecvFromAbove_( snode.Index ) &&
2892  MYROW( grid_ ) != PROW( snode.Index, grid_ ) ){
2893  snode.SstrUrowRecv.resize( snode.SizeSstrUrowRecv );
2894  MPI_Irecv( &snode.SstrUrowRecv[0], snode.SizeSstrUrowRecv, MPI_BYTE,
2895  PROW( snode.Index, grid_ ), IDX_TO_TAG(supidx,SELINV_TAG_U_CONTENT),
2896  grid_->colComm, mpireqsRecvFromAbove );
2897 #if ( _DEBUGlevel_ >= 1 )
2898  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Receiving U " << snode.SizeSstrUrowRecv << " BYTES"<< std::endl << std::endl;
2899 #endif
2900  } // if I need to receive from up
2901 
2902  if( isRecvFromLeft_( snode.Index ) &&
2903  MYCOL( grid_ ) != PCOL( snode.Index, grid_ ) ){
2904  snode.SstrLcolRecv.resize( snode.SizeSstrLcolRecv );
2905  MPI_Irecv( &snode.SstrLcolRecv[0], snode.SizeSstrLcolRecv, MPI_BYTE,
2906  PCOL( snode.Index, grid_ ), IDX_TO_TAG(supidx,SELINV_TAG_L_CONTENT),
2907  grid_->rowComm,
2908  mpireqsRecvFromLeft );
2909 #if ( _DEBUGlevel_ >= 1 )
2910  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Receiving L " << snode.SizeSstrLcolRecv << " BYTES"<< std::endl << std::endl;
2911 #endif
2912  } // if I need to receive from left
2913  }
2914 
2915  }
2916 
2917 
2918 
2919  TIMER_START(Compute_Sinv_LT);
2920  {
2921  Int gemmProcessed = 0;
2922  Int gemmToDo = 0;
2923  // Int toRecvGemm = 0;
2924  // List of the supernodes that become ready to be processed locally
2925  std::vector<Int> readySupidx;
2926  // First find the supernodes that can be processed with local data only
2927  for(Int supidx = 0;supidx<stepSuper;supidx++){
2928  SuperNodeBufferType & snode = arrSuperNodes[supidx];
2929 
2930 
2931 
2932  // if( isRecvFromAbove_( snode.Index ) &&
2933  // MYROW( grid_ ) != PROW( snode.Index, grid_ ) ){
2934  // toRecvGemm++;
2935  // }
2936  //
2937  // if( isRecvFromLeft_( snode.Index ) &&
2938  // MYCOL( grid_ ) != PCOL( snode.Index, grid_ ) ){
2939  // toRecvGemm++;
2940  // }
2941 
2942  if( isRecvFromAbove_( snode.Index ) && isRecvFromLeft_( snode.Index )){
2943  gemmToDo++;
2944  if( MYCOL( grid_ ) == PCOL( snode.Index, grid_ ) ){
2945  snode.isReady++;
2946  }
2947 
2948  if( MYROW( grid_ ) == PROW( snode.Index, grid_ ) ){
2949  snode.isReady++;
2950  }
2951 
2952  if(snode.isReady==2){
2953  readySupidx.push_back(supidx);
2954 #if ( _DEBUGlevel_ >= 1 )
2955  statusOFS<<std::endl<<"Locally processing ["<<snode.Index<<"]"<<std::endl;
2956 #endif
2957  }
2958  }
2959  else if( (isRecvFromLeft_( snode.Index ) ) && MYCOL( grid_ ) != PCOL( snode.Index, grid_ ) )
2960  {
2961  MPI_Request & mpireqsSendToLeft = arrMpireqsSendToLeft[supidx];
2962  // Dummy 0-byte send: this process takes part in the L reduction for snode.Index but has no local GEMM contribution, so it still notifies the processor column owning snode.Index.
2963  MPI_Isend( NULL, 0, MPI_BYTE, PCOL(snode.Index,grid_) ,IDX_TO_TAG(supidx,SELINV_TAG_L_REDUCE), grid_->rowComm, &mpireqsSendToLeft );
2964 
2965 #if ( _DEBUGlevel_ >= 1 )
2966  statusOFS << std::endl << "["<<snode.Index<<"] "<< " P"<<MYPROC(grid_)<<" has sent "<< 0 << " bytes to " << PNUM(MYROW(grid_),PCOL(snode.Index,grid_),grid_) << std::endl;
2967 #endif
2968  }// if( isRecvFromAbove_( snode.Index ) && isRecvFromLeft_( snode.Index ))
2969  }
2970 
2971 #if ( _DEBUGlevel_ >= 1 )
2972  statusOFS<<std::endl<<"gemmToDo ="<<gemmToDo<<std::endl;
2973  // statusOFS<<std::endl<<"toRecvGemm ="<<toRecvGemm<<std::endl;
2974 #endif
2975 
2976 
2977 #if defined (PROFILE)
2978  end_SendULWaitContentFirst=0;
2979  begin_SendULWaitContentFirst=0;
2980 #endif
2981 
2982  while(gemmProcessed<gemmToDo)
2983  {
2984  Int reqidx = MPI_UNDEFINED;
2985  Int supidx = -1;
2986 
2987 
2988  //while I don't have anything to do, wait for data to arrive
2989  do{
2990 
2991 
2992  std::vector<int> reqIndices( arrMpireqsRecvContentFromAny.size() );
2993  int numRecv = 0;
2994 
2995  // Then process the data arriving from remote processors
2996 
2997  TIMER_START(WaitContent_UL);
2998 #if defined(PROFILE)
2999  if(begin_SendULWaitContentFirst==0){
3000  begin_SendULWaitContentFirst=1;
3001  TIMER_START(WaitContent_UL_First);
3002  }
3003 #endif
3004 
3005 
3006  // MPI_Waitany(2*stepSuper, &arrMpireqsRecvContentFromAny[0], &reqidx, MPI_STATUS_IGNORE);
3007  // TIMER_STOP(WaitContent_UL);
3008  numRecv = 0;
3009  MPI_Waitsome(2*stepSuper, &arrMpireqsRecvContentFromAny[0], &numRecv, &reqIndices[0], MPI_STATUSES_IGNORE);
3010 
3011  for(int i =0;i<numRecv;i++){
3012  reqidx = reqIndices[i];
3013  //I've received something
3014  if(reqidx!=MPI_UNDEFINED)
3015  {
3016 
3017  supidx = reqidx/2;
3018  SuperNodeBufferType & snode = arrSuperNodes[supidx];
3019  snode.isReady++;
3020 
3021 #if ( _DEBUGlevel_ >= 1 )
3022  statusOFS<<std::endl<<"Received data for ["<<snode.Index<<"] reqidx%2="<<reqidx%2<<" is ready ?"<<snode.isReady<<std::endl;
3023 #endif
3024  //if we received both L and U, the supernode is ready
3025  if(snode.isReady==2){
3026  readySupidx.push_back(supidx);
3027 
3028 #if defined(PROFILE)
3029  if(end_SendULWaitContentFirst==0){
3030  TIMER_STOP(WaitContent_UL_First);
3031  end_SendULWaitContentFirst=1;
3032  }
3033 #endif
3034  }
3035  }
3036 
3037  }//end for waitsome
3038 
3039  TIMER_STOP(WaitContent_UL);
3040 
3041  } while(readySupidx.size()==0);
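  // The loop above drains completed receives with MPI_Waitsome: each returned
  // request index encodes the supernode slot (reqidx/2) and whether the message
  // carried the U or the L content (reqidx%2), and a supernode joins readySupidx
  // once both halves have arrived. A stand-alone sketch of the draining pattern
  // (pending is assumed to have been filled with MPI_Irecv requests beforehand):
#if 0
  std::vector<MPI_Request> pending;              // assumed filled with MPI_Irecv requests
  std::vector<int> doneIdx( pending.size() );
  int numDone = 0;
  MPI_Waitsome( (int)pending.size(), &pending[0], &numDone,
                &doneIdx[0], MPI_STATUSES_IGNORE );
  for( int i = 0; i < numDone; i++ ){
    int idx = doneIdx[i];
    if( idx != MPI_UNDEFINED ){
      int slot = idx / 2;                        // which supernode buffer
      int kind = idx % 2;                        // 0: U content, 1: L content
      // ... record (slot, kind); push slot to the ready list when both are in ...
    }
  }
#endif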
3042 
3043 
3044 
3045 
3046 
3047 
3048  //If I have some work to do
3049  //while(readySupidx.size()>0)
3050  if(readySupidx.size()>0)
3051  {
3052  supidx = readySupidx.back();
3053  readySupidx.pop_back();
3054  SuperNodeBufferType & snode = arrSuperNodes[supidx];
3055 
3056 
3057  // Only the processors that received information participate in the Gemm
3058  if( isRecvFromAbove_( snode.Index ) && isRecvFromLeft_( snode.Index ) ){
3059 
3060  std::vector<LBlock<T> > LcolRecv;
3061  std::vector<UBlock<T> > UrowRecv;
3062  // Save all the data to be updated for { L( isup, snode.Index ) | isup > snode.Index }.
3063  // The size will be updated in the Gemm phase and the reduce phase
3064 
3065  UnpackData(snode, LcolRecv, UrowRecv);
3066 
3067  //NumMat<T> AinvBuf, UBuf;
3068  SelInv_lookup_indexes(snode,LcolRecv, UrowRecv,AinvBuf,UBuf);
3069 
3070  snode.LUpdateBuf.Resize( AinvBuf.m(), SuperSize( snode.Index, super_ ) );
3071  TIMER_START(Compute_Sinv_LT_GEMM);
3072  blas::Gemm( 'N', 'T', AinvBuf.m(), UBuf.m(), AinvBuf.n(), MINUS_ONE<T>(),
3073  AinvBuf.Data(), AinvBuf.m(),
3074  UBuf.Data(), UBuf.m(), ZERO<T>(),
3075  snode.LUpdateBuf.Data(), snode.LUpdateBuf.m() );
3076  TIMER_STOP(Compute_Sinv_LT_GEMM);
3077 
3078 
3079 #if ( _DEBUGlevel_ >= 2 )
3080  statusOFS << std::endl << "["<<snode.Index<<"] "<< "snode.LUpdateBuf: " << snode.LUpdateBuf << std::endl;
3081 #endif
3082  } // if Gemm is to be done locally
3083 
3084 
3085  // If I was a receiver, I need to send my contribution to the processor in the column owning snode.Index
3086  if( isRecvFromAbove_( snode.Index ) ){
3087  if( isRecvFromLeft_( snode.Index ) && MYCOL( grid_ ) != PCOL( snode.Index, grid_ ) )
3088  {
3089  MPI_Request & mpireqsSendToLeft = arrMpireqsSendToLeft[supidx];
3090 
3091  MPI_Isend( snode.LUpdateBuf.Data(), snode.LUpdateBuf.ByteSize(), MPI_BYTE, PCOL(snode.Index,grid_) ,IDX_TO_TAG(supidx,SELINV_TAG_L_REDUCE), grid_->rowComm, &mpireqsSendToLeft );
3092 
3093 #if ( _DEBUGlevel_ >= 1 )
3094  statusOFS << std::endl << "["<<snode.Index<<"] "<< " P"<<MYCOL(grid_)<<" has sent "<< snode.LUpdateBuf.ByteSize() << " bytes to " << PCOL(snode.Index,grid_) << std::endl;
3095 #endif
3096 
3097  }//Sender
3098  }
3099  gemmProcessed++;
3100 
3101 
3102 #if ( _DEBUGlevel_ >= 1 )
3103  statusOFS<<std::endl<<"gemmProcessed ="<<gemmProcessed<<"/"<<gemmToDo<<std::endl;
3104 #endif
3105 
3106 
3107  }
3108  }
3109 
3110  }
3111  TIMER_STOP(Compute_Sinv_LT);
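  // The GEMM above computes snode.LUpdateBuf = -AinvBuf * UBuf^T, i.e. the local
  // contribution that is reduced onto the owners of L(:, snode.Index) in the
  // "Reduce Sinv L^T" step below. A real-valued CBLAS equivalent of the templated
  // blas::Gemm call, shown only as a hedged sketch (it would require a CBLAS
  // header; m, n, k and the *Data pointers are placeholders for AinvBuf.m(),
  // UBuf.m(), AinvBuf.n() and the corresponding Data() pointers):
#if 0
  cblas_dgemm( CblasColMajor, CblasNoTrans, CblasTrans,
               m, n, k,
               -1.0,                  // MINUS_ONE<T>() for T == double
               AinvBufData, m,        // AinvBuf, leading dimension m
               UBufData,    n,        // UBuf is n-by-k, leading dimension n
               0.0,                   // ZERO<T>()
               LUpdateBufData, m );   // LUpdateBuf, leading dimension m
#endif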
3112 
3113  //Reduce Sinv L^T to the processors in PCOL(ksup,grid_)
3114  TIMER_START(Reduce_Sinv_LT);
3115 
3116  for (Int supidx=0; supidx<stepSuper; supidx++){
3117  SuperNodeBufferType & snode = arrSuperNodes[supidx];
3118 
3119 
3120  if( MYCOL( grid_ ) == PCOL( snode.Index, grid_ ) ){
3121 
3122  //determine the number of rows in LUpdateBufReduced
3123  Int numRowLUpdateBuf;
3124  std::vector<LBlock<T> >& Lcol = this->L( LBj( snode.Index, grid_ ) );
3125  if( MYROW( grid_ ) != PROW( snode.Index, grid_ ) ){
3126  snode.RowLocalPtr.resize( Lcol.size() + 1 );
3127  snode.BlockIdxLocal.resize( Lcol.size() );
3128  snode.RowLocalPtr[0] = 0;
3129  for( Int ib = 0; ib < Lcol.size(); ib++ ){
3130  snode.RowLocalPtr[ib+1] = snode.RowLocalPtr[ib] + Lcol[ib].numRow;
3131  snode.BlockIdxLocal[ib] = Lcol[ib].blockIdx;
3132  }
3133  } // I do not own the diagonal block
3134  else{
3135  snode.RowLocalPtr.resize( Lcol.size() );
3136  snode.BlockIdxLocal.resize( Lcol.size() - 1 );
3137  snode.RowLocalPtr[0] = 0;
3138  for( Int ib = 1; ib < Lcol.size(); ib++ ){
3139  snode.RowLocalPtr[ib] = snode.RowLocalPtr[ib-1] + Lcol[ib].numRow;
3140  snode.BlockIdxLocal[ib-1] = Lcol[ib].blockIdx;
3141  }
3142  } // I own the diagonal block, skip the diagonal block
3143  numRowLUpdateBuf = *snode.RowLocalPtr.rbegin();
3144 
3145 
3146  if( numRowLUpdateBuf > 0 ){
3147  if( snode.LUpdateBuf.m() == 0 && snode.LUpdateBuf.n() == 0 ){
3148  snode.LUpdateBuf.Resize( numRowLUpdateBuf,SuperSize( snode.Index, super_ ) );
3149  // Filling with zeros is important: the received contributions are accumulated into this buffer below
3150  SetValue( snode.LUpdateBuf, ZERO<T>() );
3151  }
3152  }
3153 
3154 #if ( _DEBUGlevel_ >= 2 )
3155  statusOFS << std::endl << "["<<snode.Index<<"] "<< "LUpdateBuf Before Reduction: " << snode.LUpdateBuf << std::endl << std::endl;
3156 #endif
3157 
3158  Int totCountRecv = 0;
3159  Int numRecv = CountSendToRight(snode.Index);
3160  NumMat<T> LUpdateBufRecv(numRowLUpdateBuf,SuperSize( snode.Index, super_ ) );
3161  for( Int countRecv = 0; countRecv < numRecv ; ++countRecv ){
3162  //Do the blocking recv
3163  MPI_Status stat;
3164  Int size = 0;
3165  TIMER_START(L_RECV);
3166  MPI_Recv(LUpdateBufRecv.Data(), LUpdateBufRecv.ByteSize(), MPI_BYTE, MPI_ANY_SOURCE,IDX_TO_TAG(supidx,SELINV_TAG_L_REDUCE), grid_->rowComm,&stat);
3167  TIMER_STOP(L_RECV);
3168  MPI_Get_count(&stat, MPI_BYTE, &size);
3169  //if the processor contributes
3170  if(size>0){
3171 
3172 #if ( _DEBUGlevel_ >= 1 )
3173  statusOFS << std::endl << "["<<snode.Index<<"] "<< " P"<<MYCOL(grid_)<<" has received "<< size << " bytes from " << stat.MPI_SOURCE << std::endl;
3174 #endif
3175 #if ( _DEBUGlevel_ >= 2 )
3176  statusOFS << std::endl << "["<<snode.Index<<"] "<< "LUpdateBufRecv: " << LUpdateBufRecv << std::endl << std::endl;
3177 #endif
3178  //do the sum
3179  blas::Axpy(snode.LUpdateBuf.Size(), ONE<T>(), LUpdateBufRecv.Data(), 1, snode.LUpdateBuf.Data(), 1 );
3180  }
3181  } // for (iProcCol)
3182 
3183 #if ( _DEBUGlevel_ >= 2 )
3184  statusOFS << std::endl << "["<<snode.Index<<"] "<< "LUpdateBuf After Reduction: " << snode.LUpdateBuf << std::endl << std::endl;
3185 #endif
3186  } // Receiver
3187  }
3188 
3189  TIMER_STOP(Reduce_Sinv_LT);
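  // The reduction of LUpdateBuf above is done with point-to-point messages: every
  // contributor in the process row ships its buffer (or a zero-byte dummy, posted
  // in the Schur-complement phase) to the owner column, which accumulates the
  // arrivals with an Axpy. For comparison only, and not what the code does: with
  // every process in rowComm contributing a buffer of identical length
  // (zero-filled when it has nothing to add), the accumulation is logically a sum
  // reduction onto the owner column. rootCol and bufLen are placeholders.
#if 0
  int rootCol = 0;                               // PCOL( snode.Index, grid_ ) in the real code
  int bufLen  = 0;                               // snode.LUpdateBuf.Size() in the real code
  std::vector<double> localBuf( bufLen, 0.0 ), reducedBuf( bufLen, 0.0 );
  MPI_Reduce( bufLen ? &localBuf[0] : NULL, bufLen ? &reducedBuf[0] : NULL,
              bufLen, MPI_DOUBLE, MPI_SUM, rootCol, grid_->rowComm );
#endif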
3190 
3191  mpi::Waitall( arrMpireqsSendToLeft );
3192 
3193 
3194  //--------------------- End of reduce of LUpdateBuf-------------------------
3195 #ifndef _RELEASE_
3196  PushCallStack("PMatrix::SelInv_P2p::UpdateD");
3197 #endif
3198 
3199  TIMER_START(Update_Diagonal);
3200  for (Int supidx=0; supidx<stepSuper; supidx++){
3201  SuperNodeBufferType & snode = arrSuperNodes[supidx];
3202 
3203  ComputeDiagUpdate(snode);
3204 
3205  if( MYCOL( grid_ ) == PCOL( snode.Index, grid_ ) ){
3206  if( MYROW( grid_ ) != PROW( snode.Index, grid_ ) ){
3207  if(isSendToDiagonal_(snode.Index)){
3208  //send to above
3209  MPI_Request & mpireqsSendToAbove = arrMpireqsSendToAbove[supidx];
3210  MPI_Isend( snode.DiagBuf.Data(), snode.DiagBuf.ByteSize(), MPI_BYTE,
3211  PROW(snode.Index,grid_) ,IDX_TO_TAG(supidx,SELINV_TAG_D_REDUCE), grid_->colComm, &mpireqsSendToAbove );
3212 
3213 #if ( _DEBUGlevel_ >= 1 )
3214  statusOFS << std::endl << "["<<snode.Index<<"] "<< " P"<<MYROW(grid_)<<" has sent "<< snode.DiagBuf.ByteSize() << " bytes of DiagBuf to " << PROW(snode.Index,grid_) << " isSendToDiagonal = "<< isSendToDiagonal_(snode.Index) << std::endl;
3215 #endif
3216  }
3217  }
3218  }
3219  }
3220 
3221  TIMER_STOP(Update_Diagonal);
3222 
3223 
3224 
3225  TIMER_START(Reduce_Diagonal);
3226 
3227  for (Int supidx=0; supidx<stepSuper; supidx++){
3228  SuperNodeBufferType & snode = arrSuperNodes[supidx];
3229  if( MYCOL( grid_ ) == PCOL( snode.Index, grid_ ) ){
3230  if( MYROW( grid_ ) == PROW( snode.Index, grid_ ) ){
3231  if(snode.DiagBuf.Size()==0){
3232  snode.DiagBuf.Resize( SuperSize( snode.Index, super_ ), SuperSize( snode.Index, super_ ));
3233  SetValue(snode.DiagBuf, ZERO<T>());
3234  }
3235  //receive from below
3236  Int totCountRecv = 0;
3237  Int numRecv = CountRecvFromBelow(snode.Index);
3238  NumMat<T> DiagBufRecv(snode.DiagBuf.m(),snode.DiagBuf.n());
3239 
3240  for( Int countRecv = 0; countRecv < numRecv ; ++countRecv ){
3241  //Do the blocking recv
3242  MPI_Status stat;
3243  Int size = 0;
3244  TIMER_START(D_RECV);
3245  MPI_Recv(DiagBufRecv.Data(), DiagBufRecv.ByteSize(), MPI_BYTE, MPI_ANY_SOURCE,IDX_TO_TAG(supidx,SELINV_TAG_D_REDUCE), grid_->colComm,&stat);
3246  TIMER_STOP(D_RECV);
3247  MPI_Get_count(&stat, MPI_BYTE, &size);
3248  //if the processor contributes
3249  if(size>0){
3250  // Add DiagBufRecv to diagonal block.
3251  blas::Axpy(snode.DiagBuf.Size(), ONE<T>(), DiagBufRecv.Data(),
3252  1, snode.DiagBuf.Data(), 1 );
3253  }
3254  }
3255  LBlock<T> & LB = this->L( LBj( snode.Index, grid_ ) )[0];
3256  // Symmetrize LB
3257  blas::Axpy( LB.numRow * LB.numCol, ONE<T>(), snode.DiagBuf.Data(), 1, LB.nzval.Data(), 1 );
3258  Symmetrize( LB.nzval );
3259  }
3260 
3261  }
3262  }
3263 
3264 
3265  TIMER_STOP(Reduce_Diagonal);
3266 
3267 #ifndef _RELEASE_
3268  PopCallStack();
3269 #endif
3270 
3271 
3272 #ifndef _RELEASE_
3273  PushCallStack("PMatrix::SelInv_P2p::UpdateU");
3274 #endif
3275 
3276  SendRecvCD_UpdateU(arrSuperNodes, stepSuper);
3277 
3278 #ifndef _RELEASE_
3279  PopCallStack();
3280 #endif
3281 
3282 #ifndef _RELEASE_
3283  PushCallStack("PMatrix::SelInv_P2p::UpdateLFinal");
3284 #endif
3285 
3286  TIMER_START(Update_L);
3287 
3288  for (Int supidx=0; supidx<stepSuper; supidx++){
3289  SuperNodeBufferType & snode = arrSuperNodes[supidx];
3290 
3291 #if ( _DEBUGlevel_ >= 1 )
3292  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Finish updating the L part by filling LUpdateBufReduced back to L" << std::endl << std::endl;
3293 #endif
3294 
3295 
3296 
3297  if( MYCOL( grid_ ) == PCOL( snode.Index, grid_ ) && snode.LUpdateBuf.m() > 0 ){
3298  std::vector<LBlock<T> >& Lcol = this->L( LBj( snode.Index, grid_ ) );
3299  //Need to skip the diagonal block if present
3300  Int startBlock = (MYROW( grid_ ) == PROW( snode.Index, grid_ ))?1:0;
3301  for( Int ib = startBlock; ib < Lcol.size(); ib++ ){
3302  LBlock<T> & LB = Lcol[ib];
3303  lapack::Lacpy( 'A', LB.numRow, LB.numCol, &snode.LUpdateBuf( snode.RowLocalPtr[ib-startBlock], 0 ),
3304  snode.LUpdateBuf.m(), LB.nzval.Data(), LB.numRow );
3305  }
3306  } // Finish updating L
3307  } // for (snode.Index) : Main loop
3308 
3309 
3310  TIMER_STOP(Update_L);
3311 
3312 
3313 
3314 
3315 #ifndef _RELEASE_
3316  PopCallStack();
3317 #endif
3318 
3319 
3320  TIMER_START(Barrier);
3321  mpi::Waitall(arrMpireqsRecvContentFromAny);
3322  //Sync for reduce L
3323  // mpi::Waitall( arrMpireqsSendToLeft );
3324  //Sync for reduce D
3325  mpi::Waitall(arrMpireqsSendToAbove);
3326 
3327  for (Int supidx=0; supidx<stepSuper; supidx++){
3328  Int ksup = superList[lidx][supidx];
3329  std::vector<MPI_Request> & mpireqsSendToRight = arrMpireqsSendToRight[supidx];
3330  std::vector<MPI_Request> & mpireqsSendToBelow = arrMpireqsSendToBelow[supidx];
3331 
3332  if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
3333  MPI_Barrier(grid_->colComm);
3334  }
3335 
3336  mpi::Waitall( mpireqsSendToRight );
3337  mpi::Waitall( mpireqsSendToBelow );
3338 
3339  }
3340 
3341  TIMER_STOP(Barrier);
3342 
3343 
3344 #ifdef LIST_BARRIER
3345 #ifndef ALL_BARRIER
3346  if (options_->maxPipelineDepth!=-1)
3347 #endif
3348  {
3349  MPI_Barrier(grid_->comm);
3350  }
3351 #endif
3352 
3353  }
3354 
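 // --- Illustrative sketch (not part of PEXSI) -------------------------------
 // The diagonal reduction in SelInvIntra_P2p above accumulates the DiagBuf
 // contributions received within a processor column (blas::Axpy) and then
 // symmetrizes the diagonal block.  Assuming, as the name suggests, that
 // Symmetrize replaces a square matrix A by (A + A^T)/2, a plain column-major
 // version of that step looks as follows; the helper name and its arguments
 // are made up for this sketch.
 static inline void IllustrateAccumulateAndSymmetrize
 ( double* diag, const double* contrib, int n )
 {
   // Accumulate one received contribution (the role of blas::Axpy above).
   for( int k = 0; k < n * n; ++k )
     diag[k] += contrib[k];
   // Symmetrize: diag <- (diag + diag^T) / 2, column-major storage.
   for( int j = 0; j < n; ++j )
     for( int i = 0; i < j; ++i ){
       double avg = 0.5 * ( diag[i + j*n] + diag[j + i*n] );
       diag[i + j*n] = avg;
       diag[j + i*n] = avg;
     }
 }
 // ----------------------------------------------------------------------------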
3355  template<typename T>
3356  void PMatrix<T>::ConstructCommunicationPattern ( )
3357  {
3358  ConstructCommunicationPattern_P2p();
3359  } // ----- end of method PMatrix::ConstructCommunicationPattern -----
3360 
3361 
3362 
3363  template<typename T>
3364  void PMatrix<T>::ConstructCommunicationPattern_P2p ( )
3365  {
3366 #ifndef _RELEASE_
3367  PushCallStack("PMatrix::ConstructCommunicationPattern_P2p");
3368 #endif
3369 
3370 
3371  Int numSuper = this->NumSuper();
3372 
3373  TIMER_START(Allocate);
3374 
3375 #ifndef _RELEASE_
3376  PushCallStack( "Initialize the communication pattern" );
3377 #endif
3378  isSendToBelow_.Resize(grid_->numProcRow, numSuper);
3379  isSendToRight_.Resize(grid_->numProcCol, numSuper);
3380  isSendToDiagonal_.Resize( numSuper );
3381  SetValue( isSendToBelow_, false );
3382  SetValue( isSendToRight_, false );
3383  SetValue( isSendToDiagonal_, false );
3384 
3385  isSendToCrossDiagonal_.Resize(grid_->numProcCol+1, numSuper );
3386  SetValue( isSendToCrossDiagonal_, false );
3387  isRecvFromCrossDiagonal_.Resize(grid_->numProcRow+1, numSuper );
3388  SetValue( isRecvFromCrossDiagonal_, false );
3389 
3390  isRecvFromAbove_.Resize( numSuper );
3391  isRecvFromLeft_.Resize( numSuper );
3392  isRecvFromBelow_.Resize( grid_->numProcRow, numSuper );
3393  SetValue( isRecvFromAbove_, false );
3394  SetValue( isRecvFromBelow_, false );
3395  SetValue( isRecvFromLeft_, false );
3396 #ifndef _RELEASE_
3397  PopCallStack();
3398 #endif
3399 
3400  TIMER_STOP(Allocate);
3401 
3402  // Skip the communication pattern for one single processor.
3403  // This does not work for now.
3404 // if( grid_ -> mpisize == 1 ){
3405 //#if ( _DEBUGlevel_ >= 0 )
3406 // statusOFS << "Skip the construction of communication pattern for "
3407 // << "a single processor" << std::endl;
3408 // statusOFS << "The values of all communication variables are set to be true, "
3409 // << "but no MPI process will be involved in the selected inversion phase."
3410 // << std::endl;
3411 //#endif
3412 // SetValue( isSendToBelow_, true );
3413 // SetValue( isSendToRight_, true );
3414 // SetValue( isSendToDiagonal_, true );
3415 // SetValue( isSendToCrossDiagonal_, true );
3416 // SetValue( isRecvFromCrossDiagonal_, true );
3417 // SetValue( isRecvFromAbove_, true );
3418 // SetValue( isRecvFromBelow_, true );
3419 // SetValue( isRecvFromLeft_, true );
3420 // return;
3421 // }
3422 
3423 
3424 
3425 TIMER_START(GetEtree);
3426  std::vector<Int> snodeEtree(this->NumSuper());
3427  GetEtree(snodeEtree);
3428 TIMER_STOP(GetEtree);
3429 
3430 
3431 
3432 #ifndef _RELEASE_
3433  PushCallStack( "Local column communication" );
3434 #endif
3435 #if ( _DEBUGlevel_ >= 1 )
3436  statusOFS << std::endl << "Local column communication" << std::endl;
3437 #endif
3438  // localColBlockRowIdx stores the nonzero block indices for each local block column.
3439  // The nonzero block indices include contributions from both L and U.
3440  // Dimension: numLocalBlockCol x numNonzeroBlock
3441  std::vector<std::set<Int> > localColBlockRowIdx;
3442 
3443  localColBlockRowIdx.resize( this->NumLocalBlockCol() );
3444 
3445 
3446 TIMER_START(Column_communication);
3447 
3448 
3449  for( Int ksup = 0; ksup < numSuper; ksup++ ){
3450  // All block columns perform independently
3451  if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
3452 
3453 
3454  // Communication
3455  std::vector<Int> tAllBlockRowIdx;
3456  std::vector<Int> & colBlockIdx = ColBlockIdx(LBj(ksup, grid_));
3457 TIMER_START(Allgatherv_Column_communication);
3458  if( grid_ -> mpisize != 1 )
3459  mpi::Allgatherv( colBlockIdx, tAllBlockRowIdx, grid_->colComm );
3460  else
3461  tAllBlockRowIdx = colBlockIdx;
3462 
3463 TIMER_STOP(Allgatherv_Column_communication);
3464 
3465  localColBlockRowIdx[LBj( ksup, grid_ )].insert(
3466  tAllBlockRowIdx.begin(), tAllBlockRowIdx.end() );
3467 
3468 #if ( _DEBUGlevel_ >= 1 )
3469  statusOFS
3470  << " Column block " << ksup
3471  << " has the following nonzero block rows" << std::endl;
3472  for( std::set<Int>::iterator si = localColBlockRowIdx[LBj( ksup, grid_ )].begin();
3473  si != localColBlockRowIdx[LBj( ksup, grid_ )].end();
3474  si++ ){
3475  statusOFS << *si << " ";
3476  }
3477  statusOFS << std::endl;
3478 #endif
3479 
3480  } // if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) )
3481  } // for(ksup)
3482 
3483 
3484 #ifndef _RELEASE_
3485  PopCallStack();
3486 #endif
3487 
3488 TIMER_STOP(Column_communication);
3489 
3490 TIMER_START(Row_communication);
3491 #ifndef _RELEASE_
3492  PushCallStack( "Local row communication" );
3493 #endif
3494 #if ( _DEBUGlevel_ >= 1 )
3495  statusOFS << std::endl << "Local row communication" << std::endl;
3496 #endif
3497  // localRowBlockColIdx stores the nonzero block indices for each local block row.
3498  // The nonzero block indices include contributions from both L and U.
3499  // Dimension: numLocalBlockRow x numNonzeroBlock
3500  std::vector<std::set<Int> > localRowBlockColIdx;
3501 
3502  localRowBlockColIdx.resize( this->NumLocalBlockRow() );
3503 
3504  for( Int ksup = 0; ksup < numSuper; ksup++ ){
3505  // All block rows perform independently
3506  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
3507 
3508  // Communication
3509  std::vector<Int> tAllBlockColIdx;
3510  std::vector<Int> & rowBlockIdx = RowBlockIdx(LBi(ksup, grid_));
3511 TIMER_START(Allgatherv_Row_communication);
3512  if( grid_ -> mpisize != 1 )
3513  mpi::Allgatherv( rowBlockIdx, tAllBlockColIdx, grid_->rowComm );
3514  else
3515  tAllBlockColIdx = rowBlockIdx;
3516 
3517 TIMER_STOP(Allgatherv_Row_communication);
3518 
3519  localRowBlockColIdx[LBi( ksup, grid_ )].insert(
3520  tAllBlockColIdx.begin(), tAllBlockColIdx.end() );
3521 
3522 #if ( _DEBUGlevel_ >= 1 )
3523  statusOFS
3524  << " Row block " << ksup
3525  << " has the following nonzero block columns" << std::endl;
3526  for( std::set<Int>::iterator si = localRowBlockColIdx[LBi( ksup, grid_ )].begin();
3527  si != localRowBlockColIdx[LBi( ksup, grid_ )].end();
3528  si++ ){
3529  statusOFS << *si << " ";
3530  }
3531  statusOFS << std::endl;
3532 #endif
3533 
3534  } // if( MYROW( grid_ ) == PROW( ksup, grid_ ) )
3535  } // for(ksup)
3536 
3537 #ifndef _RELEASE_
3538  PopCallStack();
3539 #endif
3540 
3541 TIMER_STOP(Row_communication);
3542 
3543 TIMER_START(STB_RFA);
3544 #ifndef _RELEASE_
3545  PushCallStack("SendToBelow / RecvFromAbove");
3546 #endif
3547  for( Int ksup = 0; ksup < numSuper - 1; ksup++ ){
3548  // Loop over all the supernodes to the right of ksup
3549 
3550 
3551  Int jsup = snodeEtree[ksup];
3552  while(jsup<numSuper){
3553  Int jsupLocalBlockCol = LBj( jsup, grid_ );
3554  Int jsupProcCol = PCOL( jsup, grid_ );
3555  if( MYCOL( grid_ ) == jsupProcCol ){
3556 
3557  // SendToBelow / RecvFromAbove only if (ksup, jsup) is nonzero.
3558  if( localColBlockRowIdx[jsupLocalBlockCol].count( ksup ) > 0 ) {
3559  for( std::set<Int>::iterator si = localColBlockRowIdx[jsupLocalBlockCol].begin();
3560  si != localColBlockRowIdx[jsupLocalBlockCol].end(); si++ ){
3561  Int isup = *si;
3562  Int isupProcRow = PROW( isup, grid_ );
3563  if( isup > ksup ){
3564  if( MYROW( grid_ ) == isupProcRow ){
3565  isRecvFromAbove_(ksup) = true;
3566  }
3567  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
3568  isSendToBelow_( isupProcRow, ksup ) = true;
3569  }
3570  } // if( isup > ksup )
3571  } // for (si)
3572  } // if( localColBlockRowIdx[jsupLocalBlockCol].count( ksup ) > 0 )
3573 
3574  } // if( MYCOL( grid_ ) == PCOL( jsup, grid_ ) )
3575  jsup = snodeEtree[jsup];
3576 
3577  } // while (jsup)
3578  } // for(ksup)
3579 
3580 #if ( _DEBUGlevel_ >= 1 )
3581  statusOFS << std::endl << "isSendToBelow:" << std::endl;
3582  for(int j = 0;j< isSendToBelow_.n();j++){
3583  statusOFS << "["<<j<<"] ";
3584  for(int i =0; i < isSendToBelow_.m();i++){
3585  statusOFS<< isSendToBelow_(i,j) << " ";
3586  }
3587  statusOFS<<std::endl;
3588  }
3589 
3590  statusOFS << std::endl << "isRecvFromAbove:" << std::endl;
3591  for(int j = 0;j< isRecvFromAbove_.m();j++){
3592  statusOFS << "["<<j<<"] "<< isRecvFromAbove_(j)<<std::endl;
3593  }
3594 #endif
3595 
3596 #ifndef _RELEASE_
3597  PopCallStack();
3598 #endif
3599 
3600 
3601 TIMER_STOP(STB_RFA);
3602 
3603 
3604 
3605 
3606 
3607 
3608 
3609 
3610 TIMER_START(STR_RFL_RFB);
3611 
3612 
3613 #ifndef _RELEASE_
3614  PushCallStack("SendToRight / RecvFromLeft");
3615 #endif
3616  for( Int ksup = 0; ksup < numSuper - 1; ksup++ ){
3617  // Loop over all the supernodes below ksup
3618 
3619  Int isup = snodeEtree[ksup];
3620  while(isup<numSuper){
3621  Int isupLocalBlockRow = LBi( isup, grid_ );
3622  Int isupProcRow = PROW( isup, grid_ );
3623  if( MYROW( grid_ ) == isupProcRow ){
3624  // SendToRight / RecvFromLeft only if (isup, ksup) is nonzero.
3625  if( localRowBlockColIdx[isupLocalBlockRow].count( ksup ) > 0 ){
3626  for( std::set<Int>::iterator si = localRowBlockColIdx[isupLocalBlockRow].begin();
3627  si != localRowBlockColIdx[isupLocalBlockRow].end(); si++ ){
3628  Int jsup = *si;
3629  Int jsupProcCol = PCOL( jsup, grid_ );
3630  if( jsup > ksup ){
3631 
3632  if( MYCOL( grid_ ) == jsupProcCol ){
3633  isRecvFromLeft_(ksup) = true;
3634  }
3635  if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
3636  isSendToRight_( jsupProcCol, ksup ) = true;
3637  }
3638  }
3639  } // for (si)
3640  } // if( localRowBlockColIdx[isupLocalBlockRow].count( ksup ) > 0 )
3641  } // if( MYROW( grid_ ) == isupProcRow )
3642 
3643 
3644  if( MYCOL( grid_ ) == PCOL(ksup, grid_) ){
3645 
3646  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
3647  isRecvFromBelow_(isupProcRow,ksup) = true;
3648  }
3649  else if (MYROW(grid_) == isupProcRow){
3650  isSendToDiagonal_(ksup)=true;
3651  }
3652  } // if( MYCOL( grid_ ) == PCOL(ksup, grid_) )
3653  isup = snodeEtree[isup];
3654 
3655  } // while (isup)
3656  } // for (ksup)
3657 
3658 
3659 #if ( _DEBUGlevel_ >= 1 )
3660  statusOFS << std::endl << "isSendToRight:" << std::endl;
3661  for(int j = 0;j< isSendToRight_.n();j++){
3662  statusOFS << "["<<j<<"] ";
3663  for(int i =0; i < isSendToRight_.m();i++){
3664  statusOFS<< isSendToRight_(i,j) << " ";
3665  }
3666  statusOFS<<std::endl;
3667  }
3668 
3669  statusOFS << std::endl << "isRecvFromLeft:" << std::endl;
3670  for(int j = 0;j< isRecvFromLeft_.m();j++){
3671  statusOFS << "["<<j<<"] "<< isRecvFromLeft_(j)<<std::endl;
3672  }
3673 
3674  statusOFS << std::endl << "isRecvFromBelow:" << std::endl;
3675  for(int j = 0;j< isRecvFromBelow_.n();j++){
3676  statusOFS << "["<<j<<"] ";
3677  for(int i =0; i < isRecvFromBelow_.m();i++){
3678  statusOFS<< isRecvFromBelow_(i,j) << " ";
3679  }
3680  statusOFS<<std::endl;
3681  }
3682 #endif
3683 
3684 #ifndef _RELEASE_
3685  PopCallStack();
3686 #endif
3687 
3688 
3689 TIMER_STOP(STR_RFL_RFB);
3690 
3691 
3692 
3693 
3694 TIMER_START(STCD_RFCD);
3695 
3696 
3697 #ifndef _RELEASE_
3698  PushCallStack("SendToCrossDiagonal / RecvFromCrossDiagonal");
3699 #endif
3700  for( Int ksup = 0; ksup < numSuper - 1; ksup++ ){
3701  if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
3702  for( std::set<Int>::iterator si = localColBlockRowIdx[LBj( ksup, grid_ )].begin();
3703  si != localColBlockRowIdx[LBj( ksup, grid_ )].end(); si++ ){
3704  Int isup = *si;
3705  Int isupProcRow = PROW( isup, grid_ );
3706  Int isupProcCol = PCOL( isup, grid_ );
3707  if( isup > ksup && MYROW( grid_ ) == isupProcRow ){
3708  isSendToCrossDiagonal_(grid_->numProcCol, ksup ) = true;
3709  isSendToCrossDiagonal_(isupProcCol, ksup ) = true;
3710  }
3711  } // for (si)
3712  } // if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) )
3713  } // for (ksup)
3714 
3715  for( Int ksup = 0; ksup < numSuper - 1; ksup++ ){
3716  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
3717  for( std::set<Int>::iterator si = localRowBlockColIdx[ LBi(ksup, grid_) ].begin();
3718  si != localRowBlockColIdx[ LBi(ksup, grid_) ].end(); si++ ){
3719  Int jsup = *si;
3720  Int jsupProcCol = PCOL( jsup, grid_ );
3721  Int jsupProcRow = PROW( jsup, grid_ );
3722  if( jsup > ksup && MYCOL(grid_) == jsupProcCol ){
3723  isRecvFromCrossDiagonal_(grid_->numProcRow, ksup ) = true;
3724  isRecvFromCrossDiagonal_(jsupProcRow, ksup ) = true;
3725  }
3726  } // for (si)
3727  } // if( MYROW( grid_ ) == PROW( ksup, grid_ ) )
3728  } // for (ksup)
3729 #if ( _DEBUGlevel_ >= 1 )
3730  statusOFS << std::endl << "isSendToCrossDiagonal:" << std::endl;
3731  for(int j =0; j < isSendToCrossDiagonal_.n();j++){
3732  if(isSendToCrossDiagonal_(grid_->numProcCol,j)){
3733  statusOFS << "["<<j<<"] ";
3734  for(int i =0; i < isSendToCrossDiagonal_.m()-1;i++){
3735  if(isSendToCrossDiagonal_(i,j))
3736  {
3737  statusOFS<< PNUM(PROW(j,grid_),i,grid_)<<" ";
3738  }
3739  }
3740  statusOFS<<std::endl;
3741  }
3742  }
3743 
3744  statusOFS << std::endl << "isRecvFromCrossDiagonal:" << std::endl;
3745  for(int j =0; j < isRecvFromCrossDiagonal_.n();j++){
3746  if(isRecvFromCrossDiagonal_(grid_->numProcRow,j)){
3747  statusOFS << "["<<j<<"] ";
3748  for(int i =0; i < isRecvFromCrossDiagonal_.m()-1;i++){
3749  if(isRecvFromCrossDiagonal_(i,j))
3750  {
3751  statusOFS<< PNUM(i,PCOL(j,grid_),grid_)<<" ";
3752  }
3753  }
3754  statusOFS<<std::endl;
3755  }
3756  }
3757 
3758 
3759 #endif
3760 
3761 #ifndef _RELEASE_
3762  PopCallStack();
3763 #endif
3764 
3765 TIMER_STOP(STCD_RFCD);
3766 
3767 #ifndef _RELEASE_
3768  PopCallStack();
3769 #endif
3770 
3771  //Build the list of supernodes based on the elimination tree from SuperLU
3772  GetWorkSet(snodeEtree,this->WorkingSet());
3773 
3774  return ;
3775  } // ----- end of method PMatrix::ConstructCommunicationPattern_P2p -----
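 // --- Illustrative sketch (not part of PEXSI) -------------------------------
 // ConstructCommunicationPattern_P2p above repeatedly walks the supernodal
 // elimination tree: starting from jsup = snodeEtree[ksup], it visits the
 // ancestors of ksup until the index reaches numSuper, which serves as the
 // "no parent" sentinel.  A minimal standalone version of that traversal
 // (the helper name is made up for this sketch):
 static inline int IllustrateCountEtreeAncestors
 ( const int* etree, int numSuper, int ksup )
 {
   int count = 0;
   for( int jsup = etree[ksup]; jsup < numSuper; jsup = etree[jsup] )
     ++count;   // each visited jsup is a supernode coupled to ksup in the tree
   return count;
 }
 // ----------------------------------------------------------------------------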
3776 
3777  template<typename T>
3778  void PMatrix<T>::SelInv ( )
3779  {
3780  SelInv_P2p ( );
3781  } // ----- end of method PMatrix::SelInv -----
3782 
3783 
3784 
3785  template<typename T>
3786  void PMatrix<T>::SelInv_P2p ( )
3787  {
3788  TIMER_START(SelInv_P2p);
3789 
3790 #ifndef _RELEASE_
3791  PushCallStack("PMatrix::SelInv_P2p");
3792 #endif
3793 
3794 
3795  Int numSuper = this->NumSuper();
3796 
3797  // Main loop
3798  std::vector<std::vector<Int> > & superList = this->WorkingSet();
3799  Int numSteps = superList.size();
3800 
3801  for (Int lidx=0; lidx<numSteps ; lidx++){
3802  Int stepSuper = superList[lidx].size();
3803 
3804  SelInvIntra_P2p(lidx);
3805  }
3806 
3807 #ifndef _RELEASE_
3808  PopCallStack();
3809 #endif
3810 
3811  TIMER_STOP(SelInv_P2p);
3812 
3813  return ;
3814  } // ----- end of method PMatrix::SelInv_P2p -----
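 // --- Illustrative sketch (not part of PEXSI) -------------------------------
 // Throughout this file, ownership of a block row/column on the 2D processor
 // grid is tested with MYROW/MYCOL against PROW/PCOL.  Judging from
 // expressions such as "Lcol[ib].blockIdx % grid_->numProcCol" used below in
 // PreSelInv, the mapping is block-cyclic: block index b belongs to processor
 // column b % npcol (and, analogously, to processor row b % nprow).  A minimal
 // ownership test under that assumption (names are made up for this sketch):
 static inline bool IllustrateOwnsBlockColumn
 ( int blockIdx, int myProcCol, int npcol )
 {
   return ( blockIdx % npcol ) == myProcCol;
 }
 // ----------------------------------------------------------------------------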
3815 
3816 
3817 
3818  template<typename T>
3819  void PMatrix<T>::PreSelInv ( )
3820  {
3821  #ifndef _RELEASE_
3822  PushCallStack("PMatrix::PreSelInv");
3823  #endif
3824 
3825  Int numSuper = this->NumSuper();
3826 
3827  #ifndef _RELEASE_
3828  PushCallStack("L(i,k) <- L(i,k) * L(k,k)^{-1}");
3829  #endif
3830  #if ( _DEBUGlevel_ >= 1 )
3831  statusOFS << std::endl << "L(i,k) <- L(i,k) * L(k,k)^{-1}" << std::endl << std::endl;
3832  #endif
3833  for( Int ksup = 0; ksup < numSuper; ksup++ ){
3834  if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
3835  // Broadcast the diagonal L block
3836  NumMat<T> nzvalLDiag;
3837  std::vector<LBlock<T> >& Lcol = this->L( LBj( ksup, grid_ ) );
3838  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
3839  nzvalLDiag = Lcol[0].nzval;
3840  if( nzvalLDiag.m() != SuperSize(ksup, super_) ||
3841  nzvalLDiag.n() != SuperSize(ksup, super_) ){
3842  throw std::runtime_error( "The size of the diagonal block of L is wrong." );
3843  }
3844  } // Owns the diagonal block
3845  else
3846  {
3847  nzvalLDiag.Resize( SuperSize(ksup, super_), SuperSize(ksup, super_) );
3848  }
3849  MPI_Bcast( (void*)nzvalLDiag.Data(), nzvalLDiag.ByteSize(),
3850  MPI_BYTE, PROW( ksup, grid_ ), grid_->colComm );
3851 
3852  // Triangular solve
3853  for( Int ib = 0; ib < Lcol.size(); ib++ ){
3854  LBlock<T> & LB = Lcol[ib];
3855  if( LB.blockIdx > ksup ){
3856  #if ( _DEBUGlevel_ >= 2 )
3857  // Check the correctness of the triangular solve for the first local column
3858  if( LBj( ksup, grid_ ) == 0 ){
3859  statusOFS << "Diag L(" << ksup << ", " << ksup << "): " << nzvalLDiag << std::endl;
3860  statusOFS << "Before solve L(" << LB.blockIdx << ", " << ksup << "): " << LB.nzval << std::endl;
3861  }
3862  #endif
3863  blas::Trsm( 'R', 'L', 'N', 'U', LB.numRow, LB.numCol, ONE<T>(),
3864  nzvalLDiag.Data(), LB.numCol, LB.nzval.Data(), LB.numRow );
3865  #if ( _DEBUGlevel_ >= 2 )
3866  // Check the correctness of the triangular solve for the first local column
3867  if( LBj( ksup, grid_ ) == 0 ){
3868  statusOFS << "After solve L(" << LB.blockIdx << ", " << ksup << "): " << LB.nzval << std::endl;
3869  }
3870  #endif
3871  }
3872  }
3873  } // if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) )
3874  } // for (ksup)
3875 
3876 
3877  #ifndef _RELEASE_
3878  PopCallStack();
3879  #endif
3880 
3881 
3882  #ifndef _RELEASE_
3883  PushCallStack("U(k,i) <- L(i,k)");
3884  #endif
3885  #if ( _DEBUGlevel_ >= 1 )
3886  statusOFS << std::endl << "U(k,i) <- L(i,k)" << std::endl << std::endl;
3887  #endif
3888 
3889  for( Int ksup = 0; ksup < numSuper; ksup++ ){
3890  Int ksupProcRow = PROW( ksup, grid_ );
3891  Int ksupProcCol = PCOL( ksup, grid_ );
3892 
3893  Int sendCount = CountSendToCrossDiagonal(ksup);
3894  Int recvCount = CountRecvFromCrossDiagonal(ksup);
3895 
3896  std::vector<MPI_Request > arrMpiReqsSend(sendCount, MPI_REQUEST_NULL );
3897  std::vector<MPI_Request > arrMpiReqsSizeSend(sendCount, MPI_REQUEST_NULL );
3898  std::vector<std::vector<char> > arrSstrLcolSend(sendCount);
3899  std::vector<Int > arrSstrLcolSizeSend(sendCount);
3900 
3901  std::vector<MPI_Request > arrMpiReqsRecv(recvCount, MPI_REQUEST_NULL );
3902  std::vector<MPI_Request > arrMpiReqsSizeRecv(recvCount, MPI_REQUEST_NULL );
3903  std::vector<std::vector<char> > arrSstrLcolRecv(recvCount);
3904  std::vector<Int > arrSstrLcolSizeRecv(recvCount);
3905 
3906 
3907 
3908  // Sender
3909  if( isSendToCrossDiagonal_(grid_->numProcCol,ksup) ){
3910  #if ( _DEBUGlevel_ >= 1 )
3911  statusOFS<<"["<<ksup<<"] P"<<MYPROC(grid_)<<" should send to "<<CountSendToCrossDiagonal(ksup)<<" processors"<<std::endl;
3912  #endif
3913 
3914  Int sendIdx = 0;
3915  for(Int dstCol = 0; dstCol<grid_->numProcCol; dstCol++){
3916  if(isSendToCrossDiagonal_(dstCol,ksup) ){
3917  Int dst = PNUM(PROW(ksup,grid_),dstCol,grid_);
3918  if(MYPROC(grid_)!= dst){
3919  // Pack L data
3920  std::stringstream sstm;
3921  std::vector<char> & sstrLcolSend = arrSstrLcolSend[sendIdx];
3922  Int & sstrSize = arrSstrLcolSizeSend[sendIdx];
3923  MPI_Request & mpiReqSend = arrMpiReqsSend[sendIdx];
3924  MPI_Request & mpiReqSizeSend = arrMpiReqsSizeSend[sendIdx];
3925 
3926  std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
3927  std::vector<LBlock<T> >& Lcol = this->L( LBj(ksup, grid_) );
3928  // All blocks except for the diagonal block are to be sent right
3929  //TODO not true > this is a scatter operation ! Can we know the destination ?
3930 
3931  Int count = 0;
3932  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
3933  for( Int ib = 1; ib < Lcol.size(); ib++ ){
3934  if( Lcol[ib].blockIdx > ksup && (Lcol[ib].blockIdx % grid_->numProcCol) == dstCol ){
3935  count++;
3936  }
3937  }
3938  }
3939  else{
3940 
3941  for( Int ib = 0; ib < Lcol.size(); ib++ ){
3942  if( Lcol[ib].blockIdx > ksup && (Lcol[ib].blockIdx % grid_->numProcCol) == dstCol ){
3943  count++;
3944  }
3945  }
3946  }
3947 
3948  serialize( (Int)count, sstm, NO_MASK );
3949 
3950  for( Int ib = 0; ib < Lcol.size(); ib++ ){
3951  if( Lcol[ib].blockIdx > ksup && (Lcol[ib].blockIdx % grid_->numProcCol) == dstCol ){
3952  #if ( _DEBUGlevel_ >= 1 )
3953  statusOFS<<"["<<ksup<<"] SEND contains "<<Lcol[ib].blockIdx<< " which corresponds to "<<GBj(ib,grid_)<<std::endl;
3954  #endif
3955  serialize( Lcol[ib], sstm, mask );
3956  }
3957  }
3958 
3959  sstrLcolSend.resize( Size(sstm) );
3960  sstm.read( &sstrLcolSend[0], sstrLcolSend.size() );
3961  sstrSize = sstrLcolSend.size();
3962 
3963 
3964 
3965  // Send/Recv is possible here due to the one-to-one correspondence
3966  // in the case of a square processor grid
3967 
3968  #if ( _DEBUGlevel_ >= 1 )
3969  statusOFS<<"["<<ksup<<"] P"<<MYPROC(grid_)<<" ("<<MYROW(grid_)<<","<<MYCOL(grid_)<<") ---> LBj("<<ksup<<")="<<LBj(ksup,grid_)<<" ---> P"<<dst<<" ("<<ksupProcRow<<","<<dstCol<<")"<<std::endl;
3970  #endif
3971  MPI_Isend( &sstrSize, 1, MPI_INT, dst, SELINV_TAG_D_SIZE, grid_->comm, &mpiReqSizeSend );
3972  MPI_Isend( (void*)&sstrLcolSend[0], sstrSize, MPI_BYTE, dst, SELINV_TAG_D_CONTENT, grid_->comm, &mpiReqSend );
3973  //mpi::Send( sstm, dst,SELINV_TAG_D_SIZE, SELINV_TAG_D_CONTENT, grid_->comm );
3974 
3975  sendIdx++;
3976  } // if I am a sender
3977  }
3978  }
3979  }
3980 
3981 
3982 
3983 
3984 
3985  // Receiver
3986  if( isRecvFromCrossDiagonal_(grid_->numProcRow,ksup) ){
3987 
3988 
3989  #if ( _DEBUGlevel_ >= 1 )
3990  statusOFS<<"["<<ksup<<"] P"<<MYPROC(grid_)<<" should receive from "<<CountRecvFromCrossDiagonal(ksup)<<" processors"<<std::endl;
3991  #endif
3992 
3993 
3994  std::vector<UBlock<T> >& Urow = this->U( LBi( ksup, grid_ ) );
3995  std::vector<bool> isBlockFound(Urow.size(),false);
3996 
3997 
3998  Int recvIdx = 0;
3999  //receive size first
4000  for(Int srcRow = 0; srcRow<grid_->numProcRow; srcRow++){
4001  if(isRecvFromCrossDiagonal_(srcRow,ksup) ){
4002  std::vector<LBlock<T> > LcolRecv;
4003  Int src = PNUM(srcRow,PCOL(ksup,grid_),grid_);
4004  if(MYPROC(grid_)!= src){
4005  MPI_Request & mpiReqSizeRecv = arrMpiReqsSizeRecv[recvIdx];
4006  Int & sstrSize = arrSstrLcolSizeRecv[recvIdx];
4007 
4008  MPI_Irecv( &sstrSize, 1, MPI_INT, src, SELINV_TAG_D_SIZE, grid_->comm, &mpiReqSizeRecv );
4009 
4010  recvIdx++;
4011  }
4012  }
4013  }
4014 
4015  mpi::Waitall(arrMpiReqsSizeRecv);
4016 
4017 
4018 
4019  //receive content
4020  recvIdx = 0;
4021  for(Int srcRow = 0; srcRow<grid_->numProcRow; srcRow++){
4022  if(isRecvFromCrossDiagonal_(srcRow,ksup) ){
4023  std::vector<LBlock<T> > LcolRecv;
4024  Int src = PNUM(srcRow,PCOL(ksup,grid_),grid_);
4025  if(MYPROC(grid_)!= src){
4026  MPI_Request & mpiReqRecv = arrMpiReqsRecv[recvIdx];
4027  Int & sstrSize = arrSstrLcolSizeRecv[recvIdx];
4028  std::vector<char> & sstrLcolRecv = arrSstrLcolRecv[recvIdx];
4029  sstrLcolRecv.resize(sstrSize);
4030 
4031  MPI_Irecv( &sstrLcolRecv[0], sstrSize, MPI_BYTE, src, SELINV_TAG_D_CONTENT, grid_->comm, &mpiReqRecv );
4032 
4033  recvIdx++;
4034  }
4035  }
4036  }
4037 
4038  mpi::Waitall(arrMpiReqsRecv);
4039 
4040 
4041 
4042  //Process the content
4043  recvIdx = 0;
4044  for(Int srcRow = 0; srcRow<grid_->numProcRow; srcRow++){
4045  if(isRecvFromCrossDiagonal_(srcRow,ksup) ){
4046  std::vector<LBlock<T> > LcolRecv;
4047  Int src = PNUM(srcRow,PCOL(ksup,grid_),grid_);
4048  if(MYPROC(grid_)!= src){
4049 
4050  Int & sstrSize = arrSstrLcolSizeRecv[recvIdx];
4051  std::vector<char> & sstrLcolRecv = arrSstrLcolRecv[recvIdx];
4052  std::stringstream sstm;
4053 
4054  #if ( _DEBUGlevel_ >= 1 )
4055  statusOFS<<"["<<ksup<<"] P"<<MYPROC(grid_)<<" ("<<MYROW(grid_)<<","<<MYCOL(grid_)<<") <--- LBj("<<ksup<<") <--- P"<<src<<" ("<<srcRow<<","<<ksupProcCol<<")"<<std::endl;
4056  #endif
4057 
4058 
4059  sstm.write( &sstrLcolRecv[0], sstrSize );
4060 
4061  // Unpack L data.
4062  Int numLBlock;
4063  std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
4064  deserialize( numLBlock, sstm, NO_MASK );
4065  LcolRecv.resize(numLBlock);
4066  for( Int ib = 0; ib < numLBlock; ib++ ){
4067  deserialize( LcolRecv[ib], sstm, mask );
4068  #if ( _DEBUGlevel_ >= 1 )
4069  statusOFS<<"["<<ksup<<"] RECV contains "<<LcolRecv[ib].blockIdx<< " which corresponds to "<< ib * grid_->numProcRow + srcRow; // <<std::endl;
4070  // statusOFS<<" L is on row "<< srcRow <<" whereas U is on col "<<((ib * grid_->numProcRow + srcRow)/grid_->numProcCol)%grid_->numProcCol <<std::endl;
4071  statusOFS<<" L is on row "<< srcRow <<" whereas U is on col "<< LcolRecv[ib].blockIdx % grid_->numProcCol <<std::endl;
4072  #endif
4073  }
4074 
4075 
4076  recvIdx++;
4077 
4078  } // sender is not the same as receiver
4079  else{
4080  // L is obtained locally, just make a copy. Do not include the diagonal block
4081  std::vector<LBlock<T> >& Lcol = this->L( LBj( ksup, grid_ ) );
4082  if( MYROW( grid_ ) != PROW( ksup, grid_ ) ){
4083  LcolRecv.resize( Lcol.size() );
4084  for( Int ib = 0; ib < Lcol.size(); ib++ ){
4085  LcolRecv[ib] = Lcol[ib];
4086  }
4087  }
4088  else{
4089  LcolRecv.resize( Lcol.size() - 1 );
4090  for( Int ib = 0; ib < Lcol.size() - 1; ib++ ){
4091  LcolRecv[ib] = Lcol[ib+1];
4092  }
4093  }
4094  } // sender is the same as receiver
4095 
4096  // Update U
4097  // Make sure that the sizes of the L and the corresponding U blocks match.
4098  for( Int ib = 0; ib < LcolRecv.size(); ib++ ){
4099  LBlock<T> & LB = LcolRecv[ib];
4100  if( LB.blockIdx <= ksup ){
4101  throw std::logic_error( "LcolRecv contains the wrong blocks." );
4102  }
4103  for( Int jb = 0; jb < Urow.size(); jb++ ){
4104  UBlock<T> & UB = Urow[jb];
4105  if( LB.blockIdx == UB.blockIdx ){
4106  // Compare size
4107  if( LB.numRow != UB.numCol || LB.numCol != UB.numRow ){
4108  std::ostringstream msg;
4109  msg << "LB(" << LB.blockIdx << ", " << ksup << ") and UB("
4110  << ksup << ", " << UB.blockIdx << ") do not share the same size." << std::endl
4111  << "LB: " << LB.numRow << " x " << LB.numCol << std::endl
4112  << "UB: " << UB.numRow << " x " << UB.numCol << std::endl;
4113  throw std::runtime_error( msg.str().c_str() );
4114  }
4115 
4116  // Note that the order of the column indices of the U
4117  // block may not follow the order of the row indices of L,
4118  // so overwrite the column index information in U.
4119  UB.cols = LB.rows;
4120  Transpose( LB.nzval, UB.nzval );
4121 
4122  #if ( _DEBUGlevel_ >= 1 )
4123  statusOFS<<"["<<ksup<<"] USING "<<LB.blockIdx<< std::endl;
4124  #endif
4125  isBlockFound[jb] = true;
4126  break;
4127  } // if( LB.blockIdx == UB.blockIdx )
4128  } // for (jb)
4129  } // for (ib)
4130  }
4131  }
4132 
4133  for( Int jb = 0; jb < Urow.size(); jb++ ){
4134  UBlock<T> & UB = Urow[jb];
4135  if( !isBlockFound[jb] ){
4136  throw std::logic_error( "UBlock cannot find its update. Something is seriously wrong." );
4137  }
4138  }
4139 
4140 
4141 
4142  } // if I am a receiver
4143 
4144 
4145  //Wait for all outstanding sends to complete
4146  mpi::Waitall(arrMpiReqsSizeSend);
4147  mpi::Waitall(arrMpiReqsSend);
4148 
4149 
4150  } // for (ksup)
4151 
4152  #ifndef _RELEASE_
4153  PopCallStack();
4154  #endif
4155 
4156  #ifndef _RELEASE_
4157  PushCallStack("L(i,i) <- [L(k,k) * U(k,k)]^{-1} ");
4158  #endif
4159  #if ( _DEBUGlevel_ >= 1 )
4160  statusOFS << std::endl << "L(i,i) <- [L(k,k) * U(k,k)]^{-1}" << std::endl << std::endl;
4161  #endif
4162 
4163  for( Int ksup = 0; ksup < numSuper; ksup++ ){
4164  if( MYROW( grid_ ) == PROW( ksup, grid_ ) &&
4165  MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
4166  IntNumVec ipiv( SuperSize( ksup, super_ ) );
4167  // Note that the pivoting vector ipiv follows the FORTRAN
4168  // convention (1-based), hence the +1.
4169  for(Int i = 0; i < SuperSize( ksup, super_ ); i++){
4170  ipiv[i] = i + 1;
4171  }
4172  LBlock<T> & LB = (this->L( LBj( ksup, grid_ ) ))[0];
4173  #if ( _DEBUGlevel_ >= 2 )
4174  // Check the correctness of the matrix inversion for the first local column
4175  statusOFS << "Factorized A (" << ksup << ", " << ksup << "): " << LB.nzval << std::endl;
4176  #endif
4177  lapack::Getri( SuperSize( ksup, super_ ), LB.nzval.Data(),
4178  SuperSize( ksup, super_ ), ipiv.Data() );
4179 
4180  // Symmetrize the diagonal block
4181  Symmetrize( LB.nzval );
4182  #if ( _DEBUGlevel_ >= 2 )
4183  // Check the correctness of the matrix inversion for the first local column
4184  statusOFS << "Inverted A (" << ksup << ", " << ksup << "): " << LB.nzval << std::endl;
4185  #endif
4186  } // if I need to inverse the diagonal block
4187  } // for (ksup)
4188 
4189 
4190  #ifndef _RELEASE_
4191  PopCallStack();
4192  #endif
4193 
4194 
4195 
4196  #ifndef _RELEASE_
4197  PopCallStack();
4198  #endif
4199 
4200  return ;
4201  } // ----- end of method PMatrix::PreSelInv -----
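 // --- Illustrative sketch (not part of PEXSI) -------------------------------
 // The first stage of PreSelInv above computes L(i,k) <- L(i,k) * L(k,k)^{-1}
 // via blas::Trsm('R','L','N','U',...), i.e. a right-hand side solve against
 // the unit lower triangular factor L(k,k).  A naive column-major version of
 // that solve, written out for clarity (the helper name is made up for this
 // sketch; the real code of course calls the optimized BLAS routine):
 static inline void IllustrateTrsmRightLowerUnit
 ( int m, int n, const double* L, int ldl, double* B, int ldb )
 {
   // Solve X * L = B in place (B is overwritten by X); L is n x n unit lower
   // triangular, B is m x n, both stored column-major.
   for( int j = n - 1; j >= 0; --j )
     for( int k = j + 1; k < n; ++k )
       for( int i = 0; i < m; ++i )
         B[i + j*ldb] -= B[i + k*ldb] * L[k + j*ldl];
 }
 // ----------------------------------------------------------------------------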
4202 
4203  template<typename T>
4204  void PMatrix<T>::GetDiagonal ( NumVec<T>& diag )
4205  {
4206 #ifndef _RELEASE_
4207  PushCallStack("PMatrix::GetDiagonal");
4208 #endif
4209  Int numSuper = this->NumSuper();
4210 
4211  Int numCol = this->NumCol();
4212  const IntNumVec& permInv = super_->permInv;
4213 
4214  NumVec<T> diagLocal( numCol );
4215  SetValue( diagLocal, ZERO<T>() );
4216 
4217  diag.Resize( numCol );
4218  SetValue( diag, ZERO<T>() );
4219 
4220 
4221  for( Int ksup = 0; ksup < numSuper; ksup++ ){
4222  // I own the diagonal block
4223  if( MYROW( grid_ ) == PROW( ksup, grid_ ) &&
4224  MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
4225  LBlock<T> & LB = this->L( LBj( ksup, grid_ ) )[0];
4226  for( Int i = 0; i < LB.numRow; i++ ){
4227  diagLocal( permInv( LB.rows(i) ) ) = LB.nzval( i, i );
4228  }
4229  }
4230  }
4231 
4232  // All processors own diag
4233  mpi::Allreduce( diagLocal.Data(), diag.Data(), numCol, MPI_SUM, grid_->comm );
4234 
4235 #ifndef _RELEASE_
4236  PopCallStack();
4237 #endif
4238 
4239  return ;
4240  } // ----- end of method PMatrix::GetDiagonal -----
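 // --- Illustrative sketch (not part of PEXSI) -------------------------------
 // GetDiagonal above lets each processor write the diagonal entries it owns
 // (mapped back to the original ordering through permInv) into a zero-filled
 // local buffer and then sums the buffers over all processors, so that every
 // rank ends up with the complete diagonal.  For a real-valued matrix the
 // reduction is simply (helper name made up for this sketch):
 static inline void IllustrateSumDiagonal
 ( double* diagLocal, double* diag, int numCol, MPI_Comm comm )
 {
   // Entries not owned by this rank are zero in diagLocal, so a global sum
   // reproduces the full diagonal on every processor.
   MPI_Allreduce( diagLocal, diag, numCol, MPI_DOUBLE, MPI_SUM, comm );
 }
 // ----------------------------------------------------------------------------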
4241 
4242 
4243 
4244  template<typename T>
4245  void PMatrix<T>::PMatrixToDistSparseMatrix ( DistSparseMatrix<T>& A )
4246  {
4247  #ifndef _RELEASE_
4248  PushCallStack("PMatrix::PMatrixToDistSparseMatrix");
4249  #endif
4250  #if ( _DEBUGlevel_ >= 1 )
4251  statusOFS << std::endl << "Converting PMatrix to DistSparseMatrix." << std::endl;
4252  #endif
4253  Int mpirank = grid_->mpirank;
4254  Int mpisize = grid_->mpisize;
4255 
4256  std::vector<Int> rowSend( mpisize );
4257  std::vector<Int> colSend( mpisize );
4258  std::vector<T> valSend( mpisize );
4259  std::vector<Int> sizeSend( mpisize, 0 );
4260  std::vector<Int> displsSend( mpisize, 0 );
4261 
4262  std::vector<Int> rowRecv( mpisize );
4263  std::vector<Int> colRecv( mpisize );
4264  std::vector<T> valRecv( mpisize );
4265  std::vector<Int> sizeRecv( mpisize, 0 );
4266  std::vector<Int> displsRecv( mpisize, 0 );
4267 
4268  Int numSuper = this->NumSuper();
4269  const IntNumVec& permInv = super_->permInv;
4270 
4271  // The number of local columns in DistSparseMatrix format for the
4272  // processor with rank 0. This number is the same for processors
4273  // with rank ranging from 0 to mpisize - 2, and may or may not differ
4274  // from the number of local columns for processor with rank mpisize -
4275  // 1.
4276  Int numColFirst = this->NumCol() / mpisize;
4277 
4278  // Count the size first.
4279  for( Int ksup = 0; ksup < numSuper; ksup++ ){
4280  // L blocks
4281  if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
4282  std::vector<LBlock<T> >& Lcol = this->L( LBj( ksup, grid_ ) );
4283  for( Int ib = 0; ib < Lcol.size(); ib++ ){
4284  for( Int j = 0; j < Lcol[ib].numCol; j++ ){
4285  Int jcol = permInv( j + FirstBlockCol( ksup, super_ ) );
4286  Int dest = std::min( jcol / numColFirst, mpisize - 1 );
4287  sizeSend[dest] += Lcol[ib].numRow;
4288  }
4289  }
4290  } // I own the column of ksup
4291 
4292  // U blocks
4293  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
4294  std::vector<UBlock<T> >& Urow = this->U( LBi( ksup, grid_ ) );
4295  for( Int jb = 0; jb < Urow.size(); jb++ ){
4296  IntNumVec& cols = Urow[jb].cols;
4297  for( Int j = 0; j < cols.m(); j++ ){
4298  Int jcol = permInv( cols(j) );
4299  Int dest = std::min( jcol / numColFirst, mpisize - 1 );
4300  sizeSend[dest] += Urow[jb].numRow;
4301  }
4302  }
4303  } // I own the row of ksup
4304  } // for (ksup)
4305 
4306  // All-to-all exchange of size information
4307  MPI_Alltoall(
4308  &sizeSend[0], 1, MPI_INT,
4309  &sizeRecv[0], 1, MPI_INT, grid_->comm );
4310 
4311 
4312 
4313  // Reserve the space
4314  for( Int ip = 0; ip < mpisize; ip++ ){
4315  if( ip == 0 ){
4316  displsSend[ip] = 0;
4317  }
4318  else{
4319  displsSend[ip] = displsSend[ip-1] + sizeSend[ip-1];
4320  }
4321 
4322  if( ip == 0 ){
4323  displsRecv[ip] = 0;
4324  }
4325  else{
4326  displsRecv[ip] = displsRecv[ip-1] + sizeRecv[ip-1];
4327  }
4328  }
4329  Int sizeSendTotal = displsSend[mpisize-1] + sizeSend[mpisize-1];
4330  Int sizeRecvTotal = displsRecv[mpisize-1] + sizeRecv[mpisize-1];
4331 
4332  rowSend.resize( sizeSendTotal );
4333  colSend.resize( sizeSendTotal );
4334  valSend.resize( sizeSendTotal );
4335 
4336  rowRecv.resize( sizeRecvTotal );
4337  colRecv.resize( sizeRecvTotal );
4338  valRecv.resize( sizeRecvTotal );
4339 
4340  #if ( _DEBUGlevel_ >= 1 )
4341  statusOFS << "displsSend = " << displsSend << std::endl;
4342  statusOFS << "displsRecv = " << displsRecv << std::endl;
4343  #endif
4344 
4345  // Put (row, col, val) to the sending buffer
4346  std::vector<Int> cntSize( mpisize, 0 );
4347 
4348  for( Int ksup = 0; ksup < numSuper; ksup++ ){
4349  // L blocks
4350  if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
4351  std::vector<LBlock<T> >& Lcol = this->L( LBj( ksup, grid_ ) );
4352  for( Int ib = 0; ib < Lcol.size(); ib++ ){
4353  IntNumVec& rows = Lcol[ib].rows;
4354  NumMat<T>& nzval = Lcol[ib].nzval;
4355  for( Int j = 0; j < Lcol[ib].numCol; j++ ){
4356  Int jcol = permInv( j + FirstBlockCol( ksup, super_ ) );
4357  Int dest = std::min( jcol / numColFirst, mpisize - 1 );
4358  for( Int i = 0; i < rows.m(); i++ ){
4359  rowSend[displsSend[dest] + cntSize[dest]] = permInv( rows(i) );
4360  colSend[displsSend[dest] + cntSize[dest]] = jcol;
4361  valSend[displsSend[dest] + cntSize[dest]] = nzval( i, j );
4362  cntSize[dest]++;
4363  }
4364  }
4365  }
4366  } // I own the column of ksup
4367 
4368  // U blocks
4369  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
4370  std::vector<UBlock<T> >& Urow = this->U( LBi( ksup, grid_ ) );
4371  for( Int jb = 0; jb < Urow.size(); jb++ ){
4372  IntNumVec& cols = Urow[jb].cols;
4373  NumMat<T>& nzval = Urow[jb].nzval;
4374  for( Int j = 0; j < cols.m(); j++ ){
4375  Int jcol = permInv( cols(j) );
4376  Int dest = std::min( jcol / numColFirst, mpisize - 1 );
4377  for( Int i = 0; i < Urow[jb].numRow; i++ ){
4378  rowSend[displsSend[dest] + cntSize[dest]] =
4379  permInv( i + FirstBlockCol( ksup, super_ ) );
4380  colSend[displsSend[dest] + cntSize[dest]] = jcol;
4381  valSend[displsSend[dest] + cntSize[dest]] = nzval( i, j );
4382  cntSize[dest]++;
4383  }
4384  }
4385  }
4386  } // I own the row of ksup
4387  }
4388 
4389  // Check sizes match
4390  for( Int ip = 0; ip < mpisize; ip++ ){
4391  if( cntSize[ip] != sizeSend[ip] )
4392  throw std::runtime_error( "Sizes of the sending information do not match." );
4393  }
4394 
4395 
4396  // Alltoallv to exchange information
4397  mpi::Alltoallv(
4398  &rowSend[0], &sizeSend[0], &displsSend[0],
4399  &rowRecv[0], &sizeRecv[0], &displsRecv[0],
4400  grid_->comm );
4401  mpi::Alltoallv(
4402  &colSend[0], &sizeSend[0], &displsSend[0],
4403  &colRecv[0], &sizeRecv[0], &displsRecv[0],
4404  grid_->comm );
4405  mpi::Alltoallv(
4406  &valSend[0], &sizeSend[0], &displsSend[0],
4407  &valRecv[0], &sizeRecv[0], &displsRecv[0],
4408  grid_->comm );
4409 
4410  #if ( _DEBUGlevel_ >= 1 )
4411  statusOFS << "Alltoallv communication finished." << std::endl;
4412  #endif
4413 
4414  //#if ( _DEBUGlevel_ >= 1 )
4415  // for( Int ip = 0; ip < mpisize; ip++ ){
4416  // statusOFS << "rowSend[" << ip << "] = " << rowSend[ip] << std::endl;
4417  // statusOFS << "rowRecv[" << ip << "] = " << rowRecv[ip] << std::endl;
4418  // statusOFS << "colSend[" << ip << "] = " << colSend[ip] << std::endl;
4419  // statusOFS << "colRecv[" << ip << "] = " << colRecv[ip] << std::endl;
4420  // statusOFS << "valSend[" << ip << "] = " << valSend[ip] << std::endl;
4421  // statusOFS << "valRecv[" << ip << "] = " << valRecv[ip] << std::endl;
4422  // }
4423  //#endif
4424 
4425  // Organize the received message.
4426  Int firstCol = mpirank * numColFirst;
4427  Int numColLocal;
4428  if( mpirank == mpisize-1 )
4429  numColLocal = this->NumCol() - numColFirst * (mpisize-1);
4430  else
4431  numColLocal = numColFirst;
4432 
4433  std::vector<std::vector<Int> > rows( numColLocal );
4434  std::vector<std::vector<T> > vals( numColLocal );
4435 
4436  for( Int ip = 0; ip < mpisize; ip++ ){
4437  Int* rowRecvCur = &rowRecv[displsRecv[ip]];
4438  Int* colRecvCur = &colRecv[displsRecv[ip]];
4439  T* valRecvCur = &valRecv[displsRecv[ip]];
4440  for( Int i = 0; i < sizeRecv[ip]; i++ ){
4441  rows[colRecvCur[i]-firstCol].push_back( rowRecvCur[i] );
4442  vals[colRecvCur[i]-firstCol].push_back( valRecvCur[i] );
4443  } // for (i)
4444  } // for (ip)
4445 
4446  // Sort the rows
4447  std::vector<std::vector<Int> > sortIndex( numColLocal );
4448  for( Int j = 0; j < numColLocal; j++ ){
4449  sortIndex[j].resize( rows[j].size() );
4450  for( Int i = 0; i < sortIndex[j].size(); i++ )
4451  sortIndex[j][i] = i;
4452  std::sort( sortIndex[j].begin(), sortIndex[j].end(),
4453  IndexComp<std::vector<Int>& > ( rows[j] ) );
4454  } // for (j)
4455 
4456  // Form DistSparseMatrix according to the received message
4457  // NOTE: for indices, DistSparseMatrix follows the FORTRAN
4458  // convention (1 based) while PMatrix follows the C convention (0
4459  // based)
4460  A.size = this->NumCol();
4461  A.nnzLocal = 0;
4462  A.colptrLocal.Resize( numColLocal + 1 );
4463  // Note that the +1 is important since the index follows the FORTRAN convention
4464  A.colptrLocal(0) = 1;
4465  for( Int j = 0; j < numColLocal; j++ ){
4466  A.nnzLocal += rows[j].size();
4467  A.colptrLocal(j+1) = A.colptrLocal(j) + rows[j].size();
4468  }
4469 
4470  A.comm = grid_->comm;
4471 
4472  #if ( _DEBUGlevel_ >= 1 )
4473  statusOFS << "nnzLocal = " << A.nnzLocal << std::endl;
4474  statusOFS << "nnz = " << A.Nnz() << std::endl;
4475  #endif
4476 
4477 
4478  A.rowindLocal.Resize( A.nnzLocal );
4479  A.nzvalLocal.Resize( A.nnzLocal );
4480 
4481  Int* rowPtr = A.rowindLocal.Data();
4482  T* nzvalPtr = A.nzvalLocal.Data();
4483  for( Int j = 0; j < numColLocal; j++ ){
4484  std::vector<Int>& rowsCur = rows[j];
4485  std::vector<Int>& sortIndexCur = sortIndex[j];
4486  std::vector<T>& valsCur = vals[j];
4487  for( Int i = 0; i < rows[j].size(); i++ ){
4488  // Note that the +1 is important since the index follows the FORTRAN convention
4489  *(rowPtr++) = rowsCur[sortIndexCur[i]] + 1;
4490  *(nzvalPtr++) = valsCur[sortIndexCur[i]];
4491  }
4492  }
4493 
4494  #if ( _DEBUGlevel_ >= 1 )
4495  statusOFS << "A.colptrLocal[end] = " << A.colptrLocal(numColLocal) << std::endl;
4496  statusOFS << "A.rowindLocal.size() = " << A.rowindLocal.m() << std::endl;
4497  statusOFS << "A.rowindLocal[end] = " << A.rowindLocal(A.nnzLocal-1) << std::endl;
4498  statusOFS << "A.nzvalLocal[end] = " << A.nzvalLocal(A.nnzLocal-1) << std::endl;
4499  #endif
4500 
4501 
4502  #ifndef _RELEASE_
4503  PopCallStack();
4504  #endif
4505 
4506  return ;
4507  } // ----- end of method PMatrix::PMatrixToDistSparseMatrix -----
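 // --- Illustrative sketch (not part of PEXSI) -------------------------------
 // The Alltoallv exchanges above need, besides the per-destination counts, the
 // offset at which each destination's data starts in the packed send/receive
 // buffers.  As in the "Reserve the space" loop above, the displacements are
 // the exclusive prefix sum of the counts, and the total buffer length is the
 // sum of all counts (helper name made up for this sketch):
 static inline int IllustrateExclusivePrefixSum
 ( const int* sizes, int* displs, int mpisize )
 {
   int total = 0;
   for( int ip = 0; ip < mpisize; ++ip ){
     displs[ip] = total;   // offset of destination ip in the packed buffer
     total += sizes[ip];
   }
   return total;           // total number of packed entries
 }
 // ----------------------------------------------------------------------------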
4508 
4509 
4510 
4511  template<typename T>
4512  void PMatrix<T>::PMatrixToDistSparseMatrix ( const DistSparseMatrix<T>& A, DistSparseMatrix<T>& B )
4513  {
4514  #ifndef _RELEASE_
4515  PushCallStack("PMatrix::PMatrixToDistSparseMatrix");
4516  #endif
4517  #if ( _DEBUGlevel_ >= 1 )
4518  statusOFS << std::endl << "Converting PMatrix to DistSparseMatrix (2nd format)." << std::endl;
4519  #endif
4520  Int mpirank = grid_->mpirank;
4521  Int mpisize = grid_->mpisize;
4522 
4523  std::vector<Int> rowSend( mpisize );
4524  std::vector<Int> colSend( mpisize );
4525  std::vector<T> valSend( mpisize );
4526  std::vector<Int> sizeSend( mpisize, 0 );
4527  std::vector<Int> displsSend( mpisize, 0 );
4528 
4529  std::vector<Int> rowRecv( mpisize );
4530  std::vector<Int> colRecv( mpisize );
4531  std::vector<T> valRecv( mpisize );
4532  std::vector<Int> sizeRecv( mpisize, 0 );
4533  std::vector<Int> displsRecv( mpisize, 0 );
4534 
4535  Int numSuper = this->NumSuper();
4536  const IntNumVec& permInv = super_->permInv;
4537 
4538  // The number of local columns in DistSparseMatrix format for the
4539  // processor with rank 0. This number is the same for processors
4540  // with rank ranging from 0 to mpisize - 2, and may or may not differ
4541  // from the number of local columns for processor with rank mpisize -
4542  // 1.
4543  Int numColFirst = this->NumCol() / mpisize;
4544 
4545  // Count the size first.
4546  for( Int ksup = 0; ksup < numSuper; ksup++ ){
4547  // L blocks
4548  if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
4549  std::vector<LBlock<T> >& Lcol = this->L( LBj( ksup, grid_ ) );
4550  for( Int ib = 0; ib < Lcol.size(); ib++ ){
4551  for( Int j = 0; j < Lcol[ib].numCol; j++ ){
4552  Int jcol = permInv( j + FirstBlockCol( ksup, super_ ) );
4553  Int dest = std::min( jcol / numColFirst, mpisize - 1 );
4554  sizeSend[dest] += Lcol[ib].numRow;
4555  }
4556  }
4557  } // I own the column of ksup
4558 
4559  // U blocks
4560  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
4561  std::vector<UBlock<T> >& Urow = this->U( LBi( ksup, grid_ ) );
4562  for( Int jb = 0; jb < Urow.size(); jb++ ){
4563  IntNumVec& cols = Urow[jb].cols;
4564  for( Int j = 0; j < cols.m(); j++ ){
4565  Int jcol = permInv( cols(j) );
4566  Int dest = std::min( jcol / numColFirst, mpisize - 1 );
4567  sizeSend[dest] += Urow[jb].numRow;
4568  }
4569  }
4570  } // I own the row of ksup
4571  } // for (ksup)
4572 
4573  // All-to-all exchange of size information
4574  MPI_Alltoall(
4575  &sizeSend[0], 1, MPI_INT,
4576  &sizeRecv[0], 1, MPI_INT, grid_->comm );
4577 
4578 
4579 
4580  // Reserve the space
4581  for( Int ip = 0; ip < mpisize; ip++ ){
4582  if( ip == 0 ){
4583  displsSend[ip] = 0;
4584  }
4585  else{
4586  displsSend[ip] = displsSend[ip-1] + sizeSend[ip-1];
4587  }
4588 
4589  if( ip == 0 ){
4590  displsRecv[ip] = 0;
4591  }
4592  else{
4593  displsRecv[ip] = displsRecv[ip-1] + sizeRecv[ip-1];
4594  }
4595  }
4596  Int sizeSendTotal = displsSend[mpisize-1] + sizeSend[mpisize-1];
4597  Int sizeRecvTotal = displsRecv[mpisize-1] + sizeRecv[mpisize-1];
4598 
4599  rowSend.resize( sizeSendTotal );
4600  colSend.resize( sizeSendTotal );
4601  valSend.resize( sizeSendTotal );
4602 
4603  rowRecv.resize( sizeRecvTotal );
4604  colRecv.resize( sizeRecvTotal );
4605  valRecv.resize( sizeRecvTotal );
4606 
4607  #if ( _DEBUGlevel_ >= 1 )
4608  statusOFS << "displsSend = " << displsSend << std::endl;
4609  statusOFS << "displsRecv = " << displsRecv << std::endl;
4610  #endif
4611 
4612  // Put (row, col, val) to the sending buffer
4613  std::vector<Int> cntSize( mpisize, 0 );
4614 
4615 
4616  for( Int ksup = 0; ksup < numSuper; ksup++ ){
4617  // L blocks
4618  if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
4619  std::vector<LBlock<T> >& Lcol = this->L( LBj( ksup, grid_ ) );
4620  for( Int ib = 0; ib < Lcol.size(); ib++ ){
4621  IntNumVec& rows = Lcol[ib].rows;
4622  NumMat<T>& nzval = Lcol[ib].nzval;
4623  for( Int j = 0; j < Lcol[ib].numCol; j++ ){
4624  Int jcol = permInv( j + FirstBlockCol( ksup, super_ ) );
4625  Int dest = std::min( jcol / numColFirst, mpisize - 1 );
4626  for( Int i = 0; i < rows.m(); i++ ){
4627  rowSend[displsSend[dest] + cntSize[dest]] = permInv( rows(i) );
4628  colSend[displsSend[dest] + cntSize[dest]] = jcol;
4629  valSend[displsSend[dest] + cntSize[dest]] = nzval( i, j );
4630  cntSize[dest]++;
4631  }
4632  }
4633  }
4634  } // I own the column of ksup
4635 
4636 
4637  // U blocks
4638  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
4639  std::vector<UBlock<T> >& Urow = this->U( LBi( ksup, grid_ ) );
4640  for( Int jb = 0; jb < Urow.size(); jb++ ){
4641  IntNumVec& cols = Urow[jb].cols;
4642  NumMat<T>& nzval = Urow[jb].nzval;
4643  for( Int j = 0; j < cols.m(); j++ ){
4644  Int jcol = permInv( cols(j) );
4645  Int dest = std::min( jcol / numColFirst, mpisize - 1 );
4646  for( Int i = 0; i < Urow[jb].numRow; i++ ){
4647  rowSend[displsSend[dest] + cntSize[dest]] =
4648  permInv( i + FirstBlockCol( ksup, super_ ) );
4649  colSend[displsSend[dest] + cntSize[dest]] = jcol;
4650  valSend[displsSend[dest] + cntSize[dest]] = nzval( i, j );
4651  cntSize[dest]++;
4652  }
4653  }
4654  }
4655  } // I own the row of ksup
4656  }
4657 
4658 
4659 
4660  // Check sizes match
4661  for( Int ip = 0; ip < mpisize; ip++ ){
4662  if( cntSize[ip] != sizeSend[ip] )
4663  throw std::runtime_error( "Sizes of the sending information do not match." );
4664  }
4665 
4666  // Alltoallv to exchange information
4667  mpi::Alltoallv(
4668  &rowSend[0], &sizeSend[0], &displsSend[0],
4669  &rowRecv[0], &sizeRecv[0], &displsRecv[0],
4670  grid_->comm );
4671  mpi::Alltoallv(
4672  &colSend[0], &sizeSend[0], &displsSend[0],
4673  &colRecv[0], &sizeRecv[0], &displsRecv[0],
4674  grid_->comm );
4675  mpi::Alltoallv(
4676  &valSend[0], &sizeSend[0], &displsSend[0],
4677  &valRecv[0], &sizeRecv[0], &displsRecv[0],
4678  grid_->comm );
4679 
4680  #if ( _DEBUGlevel_ >= 1 )
4681  statusOFS << "Alltoallv communication finished." << std::endl;
4682  #endif
4683 
4684  //#if ( _DEBUGlevel_ >= 1 )
4685  // for( Int ip = 0; ip < mpisize; ip++ ){
4686  // statusOFS << "rowSend[" << ip << "] = " << rowSend[ip] << std::endl;
4687  // statusOFS << "rowRecv[" << ip << "] = " << rowRecv[ip] << std::endl;
4688  // statusOFS << "colSend[" << ip << "] = " << colSend[ip] << std::endl;
4689  // statusOFS << "colRecv[" << ip << "] = " << colRecv[ip] << std::endl;
4690  // statusOFS << "valSend[" << ip << "] = " << valSend[ip] << std::endl;
4691  // statusOFS << "valRecv[" << ip << "] = " << valRecv[ip] << std::endl;
4692  // }
4693  //#endif
4694 
4695  // Organize the received message.
4696  Int firstCol = mpirank * numColFirst;
4697  Int numColLocal;
4698  if( mpirank == mpisize-1 )
4699  numColLocal = this->NumCol() - numColFirst * (mpisize-1);
4700  else
4701  numColLocal = numColFirst;
4702 
4703  std::vector<std::vector<Int> > rows( numColLocal );
4704  std::vector<std::vector<T> > vals( numColLocal );
4705 
4706  for( Int ip = 0; ip < mpisize; ip++ ){
4707  Int* rowRecvCur = &rowRecv[displsRecv[ip]];
4708  Int* colRecvCur = &colRecv[displsRecv[ip]];
4709  T* valRecvCur = &valRecv[displsRecv[ip]];
4710  for( Int i = 0; i < sizeRecv[ip]; i++ ){
4711  rows[colRecvCur[i]-firstCol].push_back( rowRecvCur[i] );
4712  vals[colRecvCur[i]-firstCol].push_back( valRecvCur[i] );
4713  } // for (i)
4714  } // for (ip)
4715 
4716  // Sort the rows
4717  std::vector<std::vector<Int> > sortIndex( numColLocal );
4718  for( Int j = 0; j < numColLocal; j++ ){
4719  sortIndex[j].resize( rows[j].size() );
4720  for( Int i = 0; i < sortIndex[j].size(); i++ )
4721  sortIndex[j][i] = i;
4722  std::sort( sortIndex[j].begin(), sortIndex[j].end(),
4723  IndexComp<std::vector<Int>& > ( rows[j] ) );
4724  } // for (j)
4725 
4726  // Form DistSparseMatrix according to the received message
4727  // NOTE: for indices, DistSparseMatrix follows the FORTRAN
4728  // convention (1 based) while PMatrix follows the C convention (0
4729  // based)
4730  if( A.size != this->NumCol() ){
4731  throw std::runtime_error( "The DistSparseMatrix providing the pattern has a different size from PMatrix." );
4732  }
4733  if( A.colptrLocal.m() != numColLocal + 1 ){
4734  throw std::runtime_error( "The DistSparseMatrix providing the pattern has a different number of local columns from PMatrix." );
4735  }
4736 
4737  B.size = A.size;
4738  B.nnz = A.nnz;
4739  B.nnzLocal = A.nnzLocal;
4740  B.colptrLocal = A.colptrLocal;
4741  B.rowindLocal = A.rowindLocal;
4742  B.nzvalLocal.Resize( B.nnzLocal );
4743  SetValue( B.nzvalLocal, ZERO<T>() );
4744  // Make sure that the communicators of A and B are the same.
4745  // FIXME Find a better way to compare the communicators
4746  // if( grid_->comm != A.comm ){
4747  // throw std::runtime_error( "The DistSparseMatrix providing the pattern has a different communicator from PMatrix." );
4748  // }
4749  B.comm = grid_->comm;
4750 
4751  Int* rowPtr = B.rowindLocal.Data();
4752  T* nzvalPtr = B.nzvalLocal.Data();
4753  for( Int j = 0; j < numColLocal; j++ ){
4754  std::vector<Int>& rowsCur = rows[j];
4755  std::vector<Int>& sortIndexCur = sortIndex[j];
4756  std::vector<T>& valsCur = vals[j];
4757  std::vector<Int> rowsCurSorted( rowsCur.size() );
4758  // Note that the +1 is important since the index follows the FORTRAN convention
4759  for( Int i = 0; i < rowsCurSorted.size(); i++ ){
4760  rowsCurSorted[i] = rowsCur[sortIndexCur[i]] + 1;
4761  }
4762 
4763  // Search and match the indices
4764  std::vector<Int>::iterator it;
4765  for( Int i = B.colptrLocal(j) - 1;
4766  i < B.colptrLocal(j+1) - 1; i++ ){
4767  it = std::lower_bound( rowsCurSorted.begin(), rowsCurSorted.end(),
4768  *(rowPtr++) );
4769  if( it == rowsCurSorted.end() ){
4770  // Did not find the row, set it to zero
4771  *(nzvalPtr++) = ZERO<T>();
4772  }
4773  else{
4774  // Found the row, set it according to the received value
4775  *(nzvalPtr++) = valsCur[ sortIndexCur[it-rowsCurSorted.begin()] ];
4776  }
4777  } // for (i)
4778  } // for (j)
4779 
4780  #if ( _DEBUGlevel_ >= 1 )
4781  statusOFS << "B.colptrLocal[end] = " << B.colptrLocal(numColLocal) << std::endl;
4782  statusOFS << "B.rowindLocal.size() = " << B.rowindLocal.m() << std::endl;
4783  statusOFS << "B.rowindLocal[end] = " << B.rowindLocal(B.nnzLocal-1) << std::endl;
4784  statusOFS << "B.nzvalLocal[end] = " << B.nzvalLocal(B.nnzLocal-1) << std::endl;
4785  #endif
4786 
4787 
4788  #ifndef _RELEASE_
4789  PopCallStack();
4790  #endif
4791 
4792  return ;
4793  } // ----- end of method PMatrix::PMatrixToDistSparseMatrix -----
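 // --- Illustrative sketch (not part of PEXSI) -------------------------------
 // The index matching above locates, for every row index in the sparsity
 // pattern of B, the corresponding entry among the received row indices using
 // std::lower_bound on a sorted copy, and falls back to zero when the row was
 // not received.  The same lookup in isolation, with an explicit equality test
 // added for robustness in this sketch (names are made up):
 static inline double IllustrateLookupRowValue
 ( const std::vector<int>&    sortedRows,   // ascending (1-based) row indices
   const std::vector<double>& sortedVals,   // values in the same order
   int                        row )
 {
   std::vector<int>::const_iterator it =
     std::lower_bound( sortedRows.begin(), sortedRows.end(), row );
   if( it == sortedRows.end() || *it != row )
     return 0.0;                             // row not received: treat as zero
   return sortedVals[ it - sortedRows.begin() ];
 }
 // ----------------------------------------------------------------------------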
4794 
4795 
4796  // A (possibly) more memory-efficient way of converting the PMatrix to a
4797  // DistSparseMatrix structure.
4798  //
4799  // FIXME NOTE: This routine assumes the matrix to be symmetric!
4800  template<typename T>
4801  void PMatrix<T>::PMatrixToDistSparseMatrix2 ( const DistSparseMatrix<T>& A, DistSparseMatrix<T>& B )
4802  {
4803  #ifndef _RELEASE_
4804  PushCallStack("PMatrix::PMatrixToDistSparseMatrix2");
4805  #endif
4806  #if ( _DEBUGlevel_ >= 1 )
4807  statusOFS << std::endl << "Converting PMatrix to DistSparseMatrix (2nd format)." << std::endl;
4808  #endif
4809  Int mpirank = grid_->mpirank;
4810  Int mpisize = grid_->mpisize;
4811 
4812  std::vector<Int> rowSend( mpisize );
4813  std::vector<Int> colSend( mpisize );
4814  std::vector<T> valSend( mpisize );
4815  std::vector<Int> sizeSend( mpisize, 0 );
4816  std::vector<Int> displsSend( mpisize, 0 );
4817 
4818  std::vector<Int> rowRecv( mpisize );
4819  std::vector<Int> colRecv( mpisize );
4820  std::vector<T> valRecv( mpisize );
4821  std::vector<Int> sizeRecv( mpisize, 0 );
4822  std::vector<Int> displsRecv( mpisize, 0 );
4823 
4824  Int numSuper = this->NumSuper();
4825  const IntNumVec& perm = super_->perm;
4826  const IntNumVec& permInv = super_->permInv;
4827 
4828 
4829  // Count the sizes from the A matrix first
4830  Int numColFirst = this->NumCol() / mpisize;
4831  Int firstCol = mpirank * numColFirst;
4832  Int numColLocal;
4833  if( mpirank == mpisize-1 )
4834  numColLocal = this->NumCol() - numColFirst * (mpisize-1);
4835  else
4836  numColLocal = numColFirst;
4837 
4838  Int* rowPtr = A.rowindLocal.Data();
4839  Int* colPtr = A.colptrLocal.Data();
4840 
4841  for( Int j = 0; j < numColLocal; j++ ){
4842  Int col = perm( firstCol + j );
4843  Int blockColIdx = BlockIdx( col, super_ );
4844  Int procCol = PCOL( blockColIdx, grid_ );
4845  for( Int i = colPtr[j] - 1; i < colPtr[j+1] - 1; i++ ){
4846  Int row = perm( *(rowPtr++) - 1 );
4847  Int blockRowIdx = BlockIdx( row, super_ );
4848  Int procRow = PROW( blockRowIdx, grid_ );
4849  Int dest = PNUM( procRow, procCol, grid_ );
4850  #if ( _DEBUGlevel_ >= 1 )
4851  statusOFS << "BlockIdx = " << blockRowIdx << ", " <<blockColIdx << std::endl;
4852  statusOFS << procRow << ", " << procCol << ", "
4853  << dest << std::endl;
4854  #endif
4855  sizeSend[dest]++;
4856  } // for (i)
4857  } // for (j)
4858 
4859  // All-to-all exchange of size information
4860  MPI_Alltoall(
4861  &sizeSend[0], 1, MPI_INT,
4862  &sizeRecv[0], 1, MPI_INT, grid_->comm );
4863 
4864  #if ( _DEBUGlevel_ >= 1 )
4865  statusOFS << std::endl << "sizeSend: " << sizeSend << std::endl;
4866  statusOFS << std::endl << "sizeRecv: " << sizeRecv << std::endl;
4867  #endif
4868 
4869 
4870 
4871  // Reserve the space
4872  for( Int ip = 0; ip < mpisize; ip++ ){
4873  if( ip == 0 ){
4874  displsSend[ip] = 0;
4875  }
4876  else{
4877  displsSend[ip] = displsSend[ip-1] + sizeSend[ip-1];
4878  }
4879 
4880  if( ip == 0 ){
4881  displsRecv[ip] = 0;
4882  }
4883  else{
4884  displsRecv[ip] = displsRecv[ip-1] + sizeRecv[ip-1];
4885  }
4886  }
4887 
4888  Int sizeSendTotal = displsSend[mpisize-1] + sizeSend[mpisize-1];
4889  Int sizeRecvTotal = displsRecv[mpisize-1] + sizeRecv[mpisize-1];
4890 
4891  rowSend.resize( sizeSendTotal );
4892  colSend.resize( sizeSendTotal );
4893  valSend.resize( sizeSendTotal );
4894 
4895  rowRecv.resize( sizeRecvTotal );
4896  colRecv.resize( sizeRecvTotal );
4897  valRecv.resize( sizeRecvTotal );
4898 
4899  #if ( _DEBUGlevel_ >= 1 )
4900  statusOFS << "displsSend = " << displsSend << std::endl;
4901  statusOFS << "displsRecv = " << displsRecv << std::endl;
4902  #endif
4903 
4904  // Put (row, col) to the sending buffer
4905  std::vector<Int> cntSize( mpisize, 0 );
4906 
4907  rowPtr = A.rowindLocal.Data();
4908  colPtr = A.colptrLocal.Data();
4909 
4910  for( Int j = 0; j < numColLocal; j++ ){
4911  Int col = perm( firstCol + j );
4912  Int blockColIdx = BlockIdx( col, super_ );
4913  Int procCol = PCOL( blockColIdx, grid_ );
4914  for( Int i = colPtr[j] - 1; i < colPtr[j+1] - 1; i++ ){
4915  Int row = perm( *(rowPtr++) - 1 );
4916  Int blockRowIdx = BlockIdx( row, super_ );
4917  Int procRow = PROW( blockRowIdx, grid_ );
4918  Int dest = PNUM( procRow, procCol, grid_ );
4919  rowSend[displsSend[dest] + cntSize[dest]] = row;
4920  colSend[displsSend[dest] + cntSize[dest]] = col;
4921  cntSize[dest]++;
4922  } // for (i)
4923  } // for (j)
4924 
4925 
4926  // Check sizes match
4927  for( Int ip = 0; ip < mpisize; ip++ ){
4928  if( cntSize[ip] != sizeSend[ip] )
4929  throw std::runtime_error( "Sizes of the sending information do not match." );
4930  }
4931 
4932  // Alltoallv to exchange information
4933  mpi::Alltoallv(
4934  &rowSend[0], &sizeSend[0], &displsSend[0],
4935  &rowRecv[0], &sizeRecv[0], &displsRecv[0],
4936  grid_->comm );
4937  mpi::Alltoallv(
4938  &colSend[0], &sizeSend[0], &displsSend[0],
4939  &colRecv[0], &sizeRecv[0], &displsRecv[0],
4940  grid_->comm );
4941 
4942  #if ( _DEBUGlevel_ >= 1 )
4943  statusOFS << "Alltoallv communication of nonzero indices finished." << std::endl;
4944  #endif
4945 
4946 
4947  #if ( _DEBUGlevel_ >= 1 )
4948  for( Int ip = 0; ip < mpisize; ip++ ){
4949  statusOFS << "rowSend[" << ip << "] = " << rowSend[ip] << std::endl;
4950  statusOFS << "rowRecv[" << ip << "] = " << rowRecv[ip] << std::endl;
4951  statusOFS << "colSend[" << ip << "] = " << colSend[ip] << std::endl;
4952  statusOFS << "colRecv[" << ip << "] = " << colRecv[ip] << std::endl;
4953  }
4954  #endif
4955 
4956  // For each (row, col), fill the nonzero values to valRecv locally.
4957  for( Int g = 0; g < sizeRecvTotal; g++ ){
4958  Int row = rowRecv[g];
4959  Int col = colRecv[g];
4960 
4961  Int blockRowIdx = BlockIdx( row, super_ );
4962  Int blockColIdx = BlockIdx( col, super_ );
4963 
4964  // Search for the nzval
4965  bool isFound = false;
4966 
4967  if( blockColIdx <= blockRowIdx ){
4968  // Data on the L side
4969 
4970  std::vector<LBlock<T> >& Lcol = this->L( LBj( blockColIdx, grid_ ) );
4971 
4972  for( Int ib = 0; ib < Lcol.size(); ib++ ){
4973  #if ( _DEBUGlevel_ >= 1 )
4974  statusOFS << "blockRowIdx = " << blockRowIdx << ", Lcol[ib].blockIdx = " << Lcol[ib].blockIdx << ", blockColIdx = " << blockColIdx << std::endl;
4975  #endif
4976  if( Lcol[ib].blockIdx == blockRowIdx ){
4977  IntNumVec& rows = Lcol[ib].rows;
4978  for( int iloc = 0; iloc < Lcol[ib].numRow; iloc++ ){
4979  if( rows[iloc] == row ){
4980  Int jloc = col - FirstBlockCol( blockColIdx, super_ );
4981  valRecv[g] = Lcol[ib].nzval( iloc, jloc );
4982  isFound = true;
4983  break;
4984  } // found the corresponding row
4985  }
4986  }
4987  if( isFound == true ) break;
4988  } // for (ib)
4989  }
4990  else{
4991  // Data on the U side
4992 
4993  std::vector<UBlock<T> >& Urow = this->U( LBi( blockRowIdx, grid_ ) );
4994 
4995  for( Int jb = 0; jb < Urow.size(); jb++ ){
4996  if( Urow[jb].blockIdx == blockColIdx ){
4997  IntNumVec& cols = Urow[jb].cols;
4998  for( int jloc = 0; jloc < Urow[jb].numCol; jloc++ ){
4999  if( cols[jloc] == col ){
5000  Int iloc = row - FirstBlockRow( blockRowIdx, super_ );
5001  valRecv[g] = Urow[jb].nzval( iloc, jloc );
5002  isFound = true;
5003  break;
5004  } // found the corresponding col
5005  }
5006  }
5007  if( isFound == true ) break;
5008  } // for (jb)
5009  } // if( blockColIdx <= blockRowIdx )
5010 
5011  // Did not find the corresponding row, set the value to zero.
 5012  // Did not find the corresponding (row, col) entry; set the value to zero.
 5013  statusOFS << "In the permuted order, (" << row << ", " << col <<
5014  ") is not found in PMatrix." << std::endl;
5015  valRecv[g] = ZERO<T>();
5016  }
5017 
5018  } // for (g)
5019 
5020 
5021  // Feed back valRecv to valSend through Alltoallv. NOTE: for the
5022  // values, the roles of "send" and "recv" are swapped.
5023  mpi::Alltoallv(
5024  &valRecv[0], &sizeRecv[0], &displsRecv[0],
5025  &valSend[0], &sizeSend[0], &displsSend[0],
5026  grid_->comm );
5027 
5028  #if ( _DEBUGlevel_ >= 1 )
5029  statusOFS << "Alltoallv communication of nonzero values finished." << std::endl;
5030  #endif
5031 
5032  // Put the nonzero values from valSend to the matrix B.
5033  B.size = A.size;
5034  B.nnz = A.nnz;
5035  B.nnzLocal = A.nnzLocal;
5036  B.colptrLocal = A.colptrLocal;
5037  B.rowindLocal = A.rowindLocal;
5038  B.nzvalLocal.Resize( B.nnzLocal );
5039  SetValue( B.nzvalLocal, ZERO<T>() );
5040  // Make sure that the communicator of A and B are the same.
5041  // FIXME Find a better way to compare the communicators
5042  // if( grid_->comm != A.comm ){
5043  // throw std::runtime_error( "The DistSparseMatrix providing the pattern has a different communicator from PMatrix." );
5044  // }
5045  B.comm = grid_->comm;
5046 
5047  for( Int i = 0; i < mpisize; i++ )
5048  cntSize[i] = 0;
5049 
5050  rowPtr = B.rowindLocal.Data();
5051  colPtr = B.colptrLocal.Data();
5052  T* valPtr = B.nzvalLocal.Data();
5053 
5054  for( Int j = 0; j < numColLocal; j++ ){
5055  Int col = perm( firstCol + j );
5056  Int blockColIdx = BlockIdx( col, super_ );
5057  Int procCol = PCOL( blockColIdx, grid_ );
5058  for( Int i = colPtr[j] - 1; i < colPtr[j+1] - 1; i++ ){
5059  Int row = perm( *(rowPtr++) - 1 );
5060  Int blockRowIdx = BlockIdx( row, super_ );
5061  Int procRow = PROW( blockRowIdx, grid_ );
5062  Int dest = PNUM( procRow, procCol, grid_ );
5063  *(valPtr++) = valSend[displsSend[dest] + cntSize[dest]];
5064  cntSize[dest]++;
5065  } // for (i)
5066  } // for (j)
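  // The unpacking loop above traverses the local columns and rows in exactly
  // the same order as the packing loop earlier, so valSend[displsSend[dest] +
  // cntSize[dest]] is precisely the value requested for the (row, col) pair
  // that was packed at that position.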
5067 
5068  // Check sizes match
5069  for( Int ip = 0; ip < mpisize; ip++ ){
5070  if( cntSize[ip] != sizeSend[ip] )
5071  throw std::runtime_error( "Sizes of the sending information do not match." );
5072  }
5073 
5074 
5075  #ifndef _RELEASE_
5076  PopCallStack();
5077  #endif
5078 
5079  return ;
5080  } // ----- end of method PMatrix::PMatrixToDistSparseMatrix2 -----
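  // A minimal usage sketch of the conversion above (illustrative only; the
  // names PMloc, AMat and invAMat are placeholders, and PMloc is assumed to
  // already contain the selected inverse):
  //
  //   DistSparseMatrix<Complex> invAMat;
  //   PMloc.PMatrixToDistSparseMatrix2( AMat, invAMat );
  //   // invAMat now shares the pattern and distribution of AMat; its
  //   // communicator is taken from the PMatrix grid and nzvalLocal is
  //   // filled from the selected inverse stored in PMloc.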
5081 
5082 
5083 
5084 
5085  template<typename T>
 5086  Int PMatrix<T>::NnzLocal ( )
 5087  {
5088  #ifndef _RELEASE_
5089  PushCallStack("PMatrix::NnzLocal");
5090  #endif
5091  Int numSuper = this->NumSuper();
5092  Int nnzLocal = 0;
5093  for( Int ksup = 0; ksup < numSuper; ksup++ ){
5094  if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
5095  std::vector<LBlock<T> >& Lcol = this->L( LBj( ksup, grid_ ) );
5096  for( Int ib = 0; ib < Lcol.size(); ib++ ){
5097  nnzLocal += Lcol[ib].numRow * Lcol[ib].numCol;
5098  }
5099  } // if I own the column of ksup
5100  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
5101  std::vector<UBlock<T> >& Urow = this->U( LBi( ksup, grid_ ) );
5102  for( Int jb = 0; jb < Urow.size(); jb++ ){
5103  nnzLocal += Urow[jb].numRow * Urow[jb].numCol;
5104  }
5105  } // if I own the row of ksup
5106  }
5107 
5108  #ifndef _RELEASE_
5109  PopCallStack();
5110  #endif
5111 
5112  return nnzLocal;
5113  } // ----- end of method PMatrix::NnzLocal -----
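  // NnzLocal counts every entry of the locally owned dense L and U blocks
  // (the diagonal block of each supernode is stored once, on the L side), so
  // the value reflects the supernodal block storage rather than the
  // structural nonzero count of the original sparse matrix.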
5114 
5115 
5116  template<typename T>
5117  LongInt PMatrix<T>::Nnz ( )
5118  {
5119  #ifndef _RELEASE_
5120  PushCallStack("PMatrix::Nnz");
5121  #endif
5122  LongInt nnzLocal = LongInt( this->NnzLocal() );
5123  LongInt nnz;
5124 
5125  MPI_Allreduce( &nnzLocal, &nnz, 1, MPI_LONG_LONG, MPI_SUM, grid_->comm );
5126 
5127  #ifndef _RELEASE_
5128  PopCallStack();
5129  #endif
5130 
5131  return nnz;
5132  } // ----- end of method PMatrix::Nnz -----
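  // The reduction uses LongInt / MPI_LONG_LONG because the global block
  // storage count can exceed the range of a 32-bit Int for large problems.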
5133 
5134  template< >
5135  inline void PMatrix<Real>::GetNegativeInertia ( Real& inertia )
5136  {
5137  #ifndef _RELEASE_
5138  PushCallStack("PMatrix::GetNegativeInertia");
5139  #endif
5140  Int numSuper = this->NumSuper();
5141 
5142  Real inertiaLocal = 0.0;
5143  inertia = 0.0;
5144 
5145  for( Int ksup = 0; ksup < numSuper; ksup++ ){
5146  // I own the diagonal block
5147  if( MYROW( grid_ ) == PROW( ksup, grid_ ) &&
5148  MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
5149  LBlock<Real> & LB = this->L( LBj( ksup, grid_ ) )[0];
5150  for( Int i = 0; i < LB.numRow; i++ ){
5151  if( LB.nzval(i, i) < 0 )
5152  inertiaLocal++;
5153  }
5154  }
5155  }
5156 
 5157  // Sum the local contributions over all processors.
5158  mpi::Allreduce( &inertiaLocal, &inertia, 1, MPI_SUM, grid_->comm );
5159 
5160  #ifndef _RELEASE_
5161  PopCallStack();
5162  #endif
5163 
5164  return ;
5165  } // ----- end of method PMatrix::GetNegativeInertia -----
5166 
5167 
5168  template< >
5169  inline void PMatrix<Complex>::GetNegativeInertia ( Real& inertia )
5170  {
5171  #ifndef _RELEASE_
5172  PushCallStack("PMatrix::GetNegativeInertia");
5173  #endif
5174  Int numSuper = this->NumSuper();
5175 
5176  Real inertiaLocal = 0.0;
5177  inertia = 0.0;
5178 
5179  for( Int ksup = 0; ksup < numSuper; ksup++ ){
5180  // I own the diagonal block
5181  if( MYROW( grid_ ) == PROW( ksup, grid_ ) &&
5182  MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
5183  LBlock<Complex> & LB = this->L( LBj( ksup, grid_ ) )[0];
5184  for( Int i = 0; i < LB.numRow; i++ ){
5185  if( LB.nzval(i, i).real() < 0 )
5186  inertiaLocal++;
5187  }
5188  }
5189  }
5190 
 5191  // Sum the local contributions over all processors.
5192  mpi::Allreduce( &inertiaLocal, &inertia, 1, MPI_SUM, grid_->comm );
5193 
5194  #ifndef _RELEASE_
5195  PopCallStack();
5196  #endif
5197 
5198  return ;
5199  } // ----- end of method PMatrix::GetNegativeInertia -----
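  // A minimal usage sketch for inertia counting (illustrative only; PMloc is
  // a placeholder for a PMatrix holding the factorization of A - mu * S for
  // a real shift mu; by Sylvester's law of inertia, the count below
  // estimates the number of eigenvalues of the pencil (A, S) below mu):
  //
  //   Real inertia;
  //   PMloc.GetNegativeInertia( inertia );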
5200 
5201 
5202 } // namespace PEXSI
5203 
5204 #endif //_PEXSI_PSELINV_IMPL_HPP_