pselinv_impl.hpp
1 /*
2  Copyright (c) 2012 The Regents of the University of California,
3  through Lawrence Berkeley National Laboratory.
4 
5 Authors: Lin Lin and Mathias Jacquelin
6 
7 This file is part of PEXSI. All rights reserved.
8 
9 Redistribution and use in source and binary forms, with or without
10 modification, are permitted provided that the following conditions are met:
11 
12 (1) Redistributions of source code must retain the above copyright notice, this
13 list of conditions and the following disclaimer.
14 (2) Redistributions in binary form must reproduce the above copyright notice,
15 this list of conditions and the following disclaimer in the documentation
16 and/or other materials provided with the distribution.
17 (3) Neither the name of the University of California, Lawrence Berkeley
18 National Laboratory, U.S. Dept. of Energy nor the names of its contributors may
19 be used to endorse or promote products derived from this software without
20 specific prior written permission.
21 
22 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
23 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
24 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
25 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
26 ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
27 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
29 ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 
33 You are under no obligation whatsoever to provide any bug fixes, patches, or
34 upgrades to the features, functionality or performance of the source code
35 ("Enhancements") to anyone; however, if you choose to make your Enhancements
36 available either publicly, or directly to Lawrence Berkeley National
37 Laboratory, without imposing a separate written license agreement for such
38 Enhancements, then you hereby grant the following license: a non-exclusive,
39 royalty-free perpetual license to install, use, modify, prepare derivative
40 works, incorporate into other computer software, distribute, and sublicense
41 such enhancements or derivative works thereof, in binary and source code form.
42  */
46 #ifndef _PEXSI_PSELINV_IMPL_HPP_
47 #define _PEXSI_PSELINV_IMPL_HPP_
48 
49 #include <list>
50 
51 #include "pexsi/timer.h"
53 
54 
55 #define MPI_MAX_COMM (1024)
56 #define BCAST_THRESHOLD 16
57 
58 #define MOD(a,b) \
59  ( ((a)%(b)+(b))%(b))
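// Example: MOD(-1, 4) == 3, whereas (-1) % 4 == -1 in C++; the macro always
// returns the nonnegative remainder, which the cyclic block-to-process
// mappings below rely on.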
60 
61 
62 
63  namespace PEXSI{
64  inline GridType::GridType ( MPI_Comm Bcomm, int nprow, int npcol )
65  {
66 #ifndef _RELEASE_
67  PushCallStack("GridType::GridType");
68 #endif
69  Int info;
70  MPI_Initialized( &info );
71  if( !info ){
72 #ifdef USE_ABORT
73  abort();
74 #endif
75  throw std::logic_error( "MPI has not been initialized." );
76  }
77  MPI_Group comm_group;
78  MPI_Comm_group( Bcomm, &comm_group );
79  MPI_Comm_create( Bcomm, comm_group, &comm );
80  // comm = Bcomm;
81 
82  MPI_Comm_rank( comm, &mpirank );
83  MPI_Comm_size( comm, &mpisize );
84  if( mpisize != nprow * npcol ){
85 #ifdef USE_ABORT
86  abort();
87 #endif
88  throw std::logic_error( "mpisize != nprow * npcol." );
89  }
90 
91  numProcRow = nprow;
92  numProcCol = npcol;
93 
94  Int myrow = mpirank / npcol;
95  Int mycol = mpirank % npcol;
96 
97  MPI_Comm_split( comm, myrow, mycol, &rowComm );
98  MPI_Comm_split( comm, mycol, myrow, &colComm );
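 // Usage sketch (hypothetical caller, not part of this file): with
 //   GridType g( MPI_COMM_WORLD, 2, 3 );   // requires exactly 6 MPI ranks
 // rank r is mapped row-major to (myrow, mycol) = (r / npcol, r % npcol), so
 // ranks {0,1,2} form processor row 0 and ranks {0,3} form processor column 0.
 // rowComm groups the ranks sharing myrow; colComm groups those sharing mycol.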
99 
100  MPI_Group_free( &comm_group );
101 
102 #ifndef _RELEASE_
103  PopCallStack();
104 #endif
105 
106  return ;
107  } // ----- end of method GridType::GridType -----
108 
109 
110  inline GridType::~GridType ( )
111  {
112 #ifndef _RELEASE_
113  PushCallStack("GridType::~GridType");
114 #endif
115  // Do not free grid.comm, which is not generated by GridType().
116 
117  MPI_Comm_free( &rowComm );
118  MPI_Comm_free( &colComm );
119  MPI_Comm_free( &comm );
120 
121 #ifndef _RELEASE_
122  PopCallStack();
123 #endif
124  return ;
125  } // ----- end of method GridType::~GridType -----
126  }
127 
128 
129 namespace PEXSI{
130  template<typename T>
131  void PMatrix<T>::deallocate(){
132 
133  grid_ = NULL;
134  super_ = NULL;
135  options_ = NULL;
136 
137  ColBlockIdx_.clear();
138  RowBlockIdx_.clear();
139  U_.clear();
140  L_.clear();
141  workingSet_.clear();
142 
143  // Communication variables
144  isSendToBelow_.Clear();
145  isSendToRight_.Clear();
146  isSendToDiagonal_.Clear();
147  isSendToCrossDiagonal_.Clear();
148 
149  isRecvFromBelow_.Clear();
150  isRecvFromAbove_.Clear();
151  isRecvFromLeft_.Clear();
152  isRecvFromCrossDiagonal_.Clear();
153 
154 
155  //Cleanup tree information
156  for(int i =0;i<fwdToBelowTree_.size();++i){
157  if(fwdToBelowTree_[i]!=NULL){
158  delete fwdToBelowTree_[i];
159  }
160  }
161 
162  for(int i =0;i<fwdToRightTree_.size();++i){
163  if(fwdToRightTree_[i]!=NULL){
164  delete fwdToRightTree_[i];
165  }
166  }
167 
168  for(int i =0;i<redToLeftTree_.size();++i){
169  if(redToLeftTree_[i]!=NULL){
170  delete redToLeftTree_[i];
171  }
172  }
173  for(int i =0;i<redToAboveTree_.size();++i){
174  if(redToAboveTree_[i]!=NULL){
175  delete redToAboveTree_[i];
176  }
177  }
178  }
179 
180  template<typename T>
181  PMatrix<T> & PMatrix<T>::operator = ( const PMatrix<T> & C){
182  if(&C!=this){
183  //If we have some memory allocated, delete it
184  deallocate();
185 
186  grid_ = C.grid_;
187  super_ = C.super_;
188  options_ = C.options_;
189 
190 
191  ColBlockIdx_ = C.ColBlockIdx_;
192  RowBlockIdx_ = C.RowBlockIdx_;
193  L_ = C.L_;
194  U_ = C.U_;
195 
196  workingSet_ = C.workingSet_;
197 
198 
199  // Communication variables
200  isSendToBelow_ = C.isSendToBelow_;
201  isSendToRight_ = C.isSendToRight_;
202  isSendToDiagonal_ = C.isSendToDiagonal_;
203  isSendToCrossDiagonal_ = C.isSendToCrossDiagonal_;
204 
205  isRecvFromBelow_ = C.isRecvFromBelow_;
206  isRecvFromAbove_ = C.isRecvFromAbove_;
207  isRecvFromLeft_ = C.isRecvFromLeft_;
208  isRecvFromCrossDiagonal_ = C.isRecvFromCrossDiagonal_;
209 
210  fwdToBelowTree_.resize(C.fwdToBelowTree_.size());
211  for(int i = 0 ; i< C.fwdToBelowTree_.size();++i){
212  if(C.fwdToBelowTree_[i]!=NULL){
213  fwdToBelowTree_[i] = C.fwdToBelowTree_[i]->clone();
214  }
215  }
216 
217  fwdToRightTree_.resize(C.fwdToRightTree_.size());
218  for(int i = 0 ; i< C.fwdToRightTree_.size();++i){
219  if(C.fwdToRightTree_[i]!=NULL){
220  fwdToRightTree_[i] = C.fwdToRightTree_[i]->clone();
221  }
222  }
223 
224  redToLeftTree_.resize(C.redToLeftTree_.size());
225  for(int i = 0 ; i< C.redToLeftTree_.size();++i){
226  if(C.redToLeftTree_[i]!=NULL){
227  redToLeftTree_[i] = C.redToLeftTree_[i]->clone();
228  }
229  }
230  redToAboveTree_.resize(C.redToAboveTree_.size());
231  for(int i = 0 ; i< C.redToAboveTree_.size();++i){
232  if(C.redToAboveTree_[i]!=NULL){
233  redToAboveTree_[i] = C.redToAboveTree_[i]->clone();
234  }
235  }
236 
237  }
238 
239  return *this;
240  }
241 
242  template<typename T>
243  PMatrix<T>::PMatrix( const PMatrix<T> & C){
244  //If we have some memory allocated, delete it
245  deallocate();
246 
247  grid_ = C.grid_;
248  super_ = C.super_;
249  options_ = C.options_;
250 
251 
252  ColBlockIdx_ = C.ColBlockIdx_;
253  RowBlockIdx_ = C.RowBlockIdx_;
254  L_ = C.L_;
255  U_ = C.U_;
256 
257  workingSet_ = C.workingSet_;
258 
259 
260  // Communication variables
261  isSendToBelow_ = C.isSendToBelow_;
262  isSendToRight_ = C.isSendToRight_;
263  isSendToDiagonal_ = C.isSendToDiagonal_;
264  isSendToCrossDiagonal_ = C.isSendToCrossDiagonal_;
265 
266  isRecvFromBelow_ = C.isRecvFromBelow_;
267  isRecvFromAbove_ = C.isRecvFromAbove_;
268  isRecvFromLeft_ = C.isRecvFromLeft_;
269  isRecvFromCrossDiagonal_ = C.isRecvFromCrossDiagonal_;
270 
271 
272  fwdToBelowTree_.resize(C.fwdToBelowTree_.size());
273  for(int i = 0 ; i< C.fwdToBelowTree_.size();++i){
274  if(C.fwdToBelowTree_[i]!=NULL){
275  fwdToBelowTree_[i] = C.fwdToBelowTree_[i]->clone();
276  }
277  }
278 
279  fwdToRightTree_.resize(C.fwdToRightTree_.size());
280  for(int i = 0 ; i< C.fwdToRightTree_.size();++i){
281  if(C.fwdToRightTree_[i]!=NULL){
282  fwdToRightTree_[i] = C.fwdToRightTree_[i]->clone();
283  }
284  }
285 
286  redToLeftTree_.resize(C.redToLeftTree_.size());
287  for(int i = 0 ; i< C.redToLeftTree_.size();++i){
288  if(C.redToLeftTree_[i]!=NULL){
289  redToLeftTree_[i] = C.redToLeftTree_[i]->clone();
290  }
291  }
292  redToAboveTree_.resize(C.redToAboveTree_.size());
293  for(int i = 0 ; i< C.redToAboveTree_.size();++i){
294  if(C.redToAboveTree_[i]!=NULL){
295  redToAboveTree_[i] = C.redToAboveTree_[i]->clone();
296  }
297  }
298 
299  }
300 
301  template<typename T>
302  PMatrix<T>::PMatrix (
303  const GridType* g,
304  const SuperNodeType* s,
305  const PEXSI::SuperLUOptions * o
306  )
307  {
308 #ifndef _RELEASE_
309  PushCallStack("PMatrix::PMatrix");
310 #endif
311 
312  this->Setup( g, s, o );
313 
314 #ifndef _RELEASE_
315  PopCallStack();
316 #endif
317  return ;
318  } // ----- end of method PMatrix::PMatrix -----
319 
320  template<typename T>
321  PMatrix<T>::~PMatrix ( )
322  {
323 
324 #ifndef _RELEASE_
325  PushCallStack("PMatrix::~PMatrix");
326 #endif
327 
328  deallocate();
329 
330 #ifndef _RELEASE_
331  PopCallStack();
332 #endif
333  return ;
334  } // ----- end of method PMatrix::~PMatrix -----
335 
336  template<typename T>
337  void PMatrix<T>::Setup(
338  const GridType* g,
339  const SuperNodeType* s,
340  const PEXSI::SuperLUOptions * o
341  )
342  {
343 #ifndef _RELEASE_
344  PushCallStack("PMatrix::Setup");
345 #endif
346 
347  grid_ = g;
348  super_ = s;
349  options_ = o;
350 
351  // if( grid_->numProcRow != grid_->numProcCol ){
352  // #ifdef USE_ABORT
353  //abort();
354  //#endif
355  //throw std::runtime_error( "The current version of SelInv only works for square processor grids." ); }
356 
357  L_.clear();
358  U_.clear();
359  ColBlockIdx_.clear();
360  RowBlockIdx_.clear();
361 
362  L_.resize( this->NumLocalBlockCol() );
363  U_.resize( this->NumLocalBlockRow() );
364 
365  ColBlockIdx_.resize( this->NumLocalBlockCol() );
366  RowBlockIdx_.resize( this->NumLocalBlockRow() );
367 
368  //workingSet_.resize(this->NumSuper());
369 #if ( _DEBUGlevel_ >= 1 )
370  statusOFS << std::endl << "PMatrix is constructed. The grid information: " << std::endl;
371  statusOFS << "mpirank = " << MYPROC(grid_) << std::endl;
372  statusOFS << "myrow = " << MYROW(grid_) << std::endl;
373  statusOFS << "mycol = " << MYCOL(grid_) << std::endl;
374 #endif
375 
376 #ifndef _RELEASE_
377  PopCallStack();
378 #endif
379  return ;
380  } // ----- end of method PMatrix::Setup -----
381 
382 
384  template<typename T>
385  inline void PMatrix<T>::SelInv_lookup_indexes(
386  SuperNodeBufferType & snode,
387  std::vector<LBlock<T> > & LcolRecv,
388  std::vector<UBlock<T> > & UrowRecv,
389  NumMat<T> & AinvBuf,
390  NumMat<T> & UBuf )
391  {
392  TIMER_START(Compute_Sinv_LT_Lookup_Indexes);
393 
394  TIMER_START(Build_colptr_rowptr);
395  // rowPtr[ib] gives the row index in snode.LUpdateBuf of the first
396  // nonzero row of LcolRecv[ib]. The total number of rows in
397  // snode.LUpdateBuf is given by the last entry, rowPtr[LcolRecv.size()].
398  std::vector<Int> rowPtr(LcolRecv.size() + 1);
399  // colPtr[jb] gives the column index in UBuf of the first
400  // nonzero column of UrowRecv[jb]. The total number of columns in
401  // UBuf is given by the last entry, colPtr[UrowRecv.size()].
402  std::vector<Int> colPtr(UrowRecv.size() + 1);
403 
404  rowPtr[0] = 0;
405  for( Int ib = 0; ib < LcolRecv.size(); ib++ ){
406  rowPtr[ib+1] = rowPtr[ib] + LcolRecv[ib].numRow;
407  }
408  colPtr[0] = 0;
409  for( Int jb = 0; jb < UrowRecv.size(); jb++ ){
410  colPtr[jb+1] = colPtr[jb] + UrowRecv[jb].numCol;
411  }
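 // Small worked example (illustration only): if LcolRecv holds blocks with
 // numRow = {4, 2, 3}, then rowPtr = {0, 4, 6, 9} and AinvBuf has 9 rows;
 // block ib starts at row rowPtr[ib]. colPtr plays the same role for the
 // columns contributed by UrowRecv.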
412 
413  Int numRowAinvBuf = *rowPtr.rbegin();
414  Int numColAinvBuf = *colPtr.rbegin();
415  TIMER_STOP(Build_colptr_rowptr);
416 
417  TIMER_START(Allocate_lookup);
418  // Allocate for the computational storage
419  AinvBuf.Resize( numRowAinvBuf, numColAinvBuf );
420  UBuf.Resize( SuperSize( snode.Index, super_ ), numColAinvBuf );
421  // TIMER_START(SetValue_lookup);
422  // SetValue( AinvBuf, ZERO<T>() );
423  //SetValue( snode.LUpdateBuf, ZERO<T>() );
424  // SetValue( UBuf, ZERO<T>() );
425  // TIMER_STOP(SetValue_lookup);
426  TIMER_STOP(Allocate_lookup);
427 
428  TIMER_START(Fill_UBuf);
429  // Fill UBuf first; the transpose is taken later, in the Gemm phase.
430  for( Int jb = 0; jb < UrowRecv.size(); jb++ ){
431  UBlock<T>& UB = UrowRecv[jb];
432  if( UB.numRow != SuperSize(snode.Index, super_) ){
433  throw std::logic_error( "The size of UB is not right. Something is seriously wrong." );
434  }
435  lapack::Lacpy( 'A', UB.numRow, UB.numCol, UB.nzval.Data(),
436  UB.numRow, UBuf.VecData( colPtr[jb] ), SuperSize( snode.Index, super_ ) );
437  }
438  TIMER_STOP(Fill_UBuf);
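 // Note: each U block is copied into UBuf as a SuperSize(snode.Index) x
 // UB.numCol panel starting at column colPtr[jb], so UBuf holds the row block
 // U(snode.Index, :) stored contiguously; it enters the later Gemm in
 // transposed form ('T').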
439 
440  // Calculate the relative indices for (isup, jsup)
441  // Fill AinvBuf with the information in L or U block.
442  TIMER_START(JB_Loop);
443 
444 #ifdef STDFIND
445  // for( Int jb = 0; jb < UrowRecv.size(); jb++ ){
446  //
447  // UBlock& UB = UrowRecv[jb];
448  // Int jsup = UB.blockIdx;
449  // Int SinvColsSta = FirstBlockCol( jsup, super_ );
450  //
451  // // Column relative indicies
452  // std::vector<Int> relCols( UB.numCol );
453  // for( Int j = 0; j < UB.numCol; j++ ){
454  // relCols[j] = UB.cols[j] - SinvColsSta;
455  // }
456  //
457  //
458  //
459  //
460  // for( Int ib = 0; ib < LcolRecv.size(); ib++ ){
461  // LBlock& LB = LcolRecv[ib];
462  // Int isup = LB.blockIdx;
463  // Int SinvRowsSta = FirstBlockCol( isup, super_ );
464  // Scalar* nzvalAinv = &AinvBuf( rowPtr[ib], colPtr[jb] );
465  // Int ldAinv = numRowAinvBuf;
466  //
467  // // Pin down the corresponding block in the part of Sinv.
468  // if( isup >= jsup ){
469  // std::vector<LBlock>& LcolSinv = this->L( LBj(jsup, grid_ ) );
470  // bool isBlockFound = false;
471  // TIMER_START(PARSING_ROW_BLOCKIDX);
472  // for( Int ibSinv = 0; ibSinv < LcolSinv.size(); ibSinv++ ){
473  // // Found the (isup, jsup) block in Sinv
474  // if( LcolSinv[ibSinv].blockIdx == isup ){
475  // LBlock& SinvB = LcolSinv[ibSinv];
476  //
477  // // Row relative indices
478  // std::vector<Int> relRows( LB.numRow );
479  // Int* rowsLBPtr = LB.rows.Data();
480  // Int* rowsSinvBPtr = SinvB.rows.Data();
481  //
482  // TIMER_START(STDFIND_ROW);
483  // Int * pos =&rowsSinvBPtr[0];
484  // Int * last =&rowsSinvBPtr[SinvB.numRow];
485  // for( Int i = 0; i < LB.numRow; i++ ){
486  // // pos = std::find(pos, &rowsSinvBPtr[SinvB.numRow-1], rowsLBPtr[i]);
487  // pos = std::find(rowsSinvBPtr, last, rowsLBPtr[i]);
488  // if(pos != last){
489  // relRows[i] = (Int)(pos - rowsSinvBPtr);
490  // }
491  // else{
492  // std::ostringstream msg;
493  // msg << "Row " << rowsLBPtr[i] <<
494  // " in LB cannot find the corresponding row in SinvB" << std::endl
495  // << "LB.rows = " << LB.rows << std::endl
496  // << "SinvB.rows = " << SinvB.rows << std::endl;
497  // throw std::runtime_error( msg.str().c_str() );
498  // }
499  // }
500  // TIMER_STOP(STDFIND_ROW);
501  //
502  // TIMER_START(Copy_Sinv_to_Ainv);
503  // // Transfer the values from Sinv to AinvBlock
504  // Scalar* nzvalSinv = SinvB.nzval.Data();
505  // Int ldSinv = SinvB.numRow;
506  // for( Int j = 0; j < UB.numCol; j++ ){
507  // for( Int i = 0; i < LB.numRow; i++ ){
508  // nzvalAinv[i+j*ldAinv] =
509  // nzvalSinv[relRows[i] + relCols[j] * ldSinv];
510  // }
511  // }
512  // TIMER_STOP(Copy_Sinv_to_Ainv);
513  //
514  // isBlockFound = true;
515  // break;
516  // }
517  // } // for (ibSinv )
518  // TIMER_STOP(PARSING_ROW_BLOCKIDX);
519  // if( isBlockFound == false ){
520  // std::ostringstream msg;
521  // msg << "Block(" << isup << ", " << jsup
522  // << ") did not find a matching block in Sinv." << std::endl;
523  // throw std::runtime_error( msg.str().c_str() );
524  // }
525  // } // if (isup, jsup) is in L
526  // else{
527  // // Row relative indices
528  // std::vector<Int> relRows( LB.numRow );
529  // Int SinvRowsSta = FirstBlockCol( isup, super_ );
530  // for( Int i = 0; i < LB.numRow; i++ ){
531  // relRows[i] = LB.rows[i] - SinvRowsSta;
532  // }
533  // std::vector<UBlock>& UrowSinv = this->U( LBi( isup, grid_ ) );
534  // bool isBlockFound = false;
535  // TIMER_START(PARSING_COL_BLOCKIDX);
536  // for( Int jbSinv = 0; jbSinv < UrowSinv.size(); jbSinv++ ){
537  // // Found the (isup, jsup) block in Sinv
538  // if( UrowSinv[jbSinv].blockIdx == jsup ){
539  // UBlock& SinvB = UrowSinv[jbSinv];
540  //
541  //
542  //
543  // // Column relative indices
544  // std::vector<Int> relCols( UB.numCol );
545  // Int* colsUBPtr = UB.cols.Data();
546  // Int* colsSinvBPtr = SinvB.cols.Data();
547  // TIMER_START(STDFIND_COL);
548  // Int * pos =&colsSinvBPtr[0];
549  // Int * last =&colsSinvBPtr[SinvB.numCol];
550  // for( Int j = 0; j < UB.numCol; j++ ){
551  // //colsUB is sorted
552  // pos = std::find(colsSinvBPtr, last, colsUBPtr[j]);
553  // if(pos !=last){
554  // relCols[j] = (Int)(pos - colsSinvBPtr);
555  // }
556  // else{
557  // std::ostringstream msg;
558  // msg << "Col " << colsUBPtr[j] <<
559  // " in UB cannot find the corresponding row in SinvB" << std::endl
560  // << "UB.cols = " << UB.cols << std::endl
561  // << "UinvB.cols = " << SinvB.cols << std::endl;
562  // throw std::runtime_error( msg.str().c_str() );
563  // }
564  // }
565  // TIMER_STOP(STDFIND_COL);
566  //
567  //
568  // TIMER_START(Copy_Sinv_to_Ainv);
569  // // Transfer the values from Sinv to AinvBlock
570  // Scalar* nzvalSinv = SinvB.nzval.Data();
571  // Int ldSinv = SinvB.numRow;
572  // for( Int j = 0; j < UB.numCol; j++ ){
573  // for( Int i = 0; i < LB.numRow; i++ ){
574  // nzvalAinv[i+j*ldAinv] =
575  // nzvalSinv[relRows[i] + relCols[j] * ldSinv];
576  // }
577  // }
578  // TIMER_STOP(Copy_Sinv_to_Ainv);
579  //
580  // isBlockFound = true;
581  // break;
582  // }
583  // } // for (jbSinv)
584  // TIMER_STOP(PARSING_COL_BLOCKIDX);
585  // if( isBlockFound == false ){
586  // std::ostringstream msg;
587  // msg << "Block(" << isup << ", " << jsup
588  // << ") did not find a matching block in Sinv." << std::endl;
589  // throw std::runtime_error( msg.str().c_str() );
590  // }
591  // } // if (isup, jsup) is in U
592  //
593  // } // for( ib )
594  // } // for ( jb )
595 #else
596  for( Int jb = 0; jb < UrowRecv.size(); jb++ ){
597  for( Int ib = 0; ib < LcolRecv.size(); ib++ ){
598  LBlock<T>& LB = LcolRecv[ib];
599  UBlock<T>& UB = UrowRecv[jb];
600  Int isup = LB.blockIdx;
601  Int jsup = UB.blockIdx;
602  T* nzvalAinv = &AinvBuf( rowPtr[ib], colPtr[jb] );
603  Int ldAinv = AinvBuf.m();
604 
605  // Pin down the corresponding block in the part of Sinv.
606  if( isup >= jsup ){
607  std::vector<LBlock<T> >& LcolSinv = this->L( LBj(jsup, grid_ ) );
608  bool isBlockFound = false;
609  TIMER_START(PARSING_ROW_BLOCKIDX);
610  for( Int ibSinv = 0; ibSinv < LcolSinv.size(); ibSinv++ ){
611  // Found the (isup, jsup) block in Sinv
612  if( LcolSinv[ibSinv].blockIdx == isup ){
613  LBlock<T> & SinvB = LcolSinv[ibSinv];
614 
615  // Row relative indices
616  std::vector<Int> relRows( LB.numRow );
617  Int* rowsLBPtr = LB.rows.Data();
618  Int* rowsSinvBPtr = SinvB.rows.Data();
619  for( Int i = 0; i < LB.numRow; i++ ){
620  bool isRowFound = false;
621  for( Int i1 = 0; i1 < SinvB.numRow; i1++ ){
622  if( rowsLBPtr[i] == rowsSinvBPtr[i1] ){
623  isRowFound = true;
624  relRows[i] = i1;
625  break;
626  }
627  }
628  if( isRowFound == false ){
629  std::ostringstream msg;
630  msg << "Row " << rowsLBPtr[i] <<
631  " in LB cannot find the corresponding row in SinvB" << std::endl
632  << "LB.rows = " << LB.rows << std::endl
633  << "SinvB.rows = " << SinvB.rows << std::endl;
634  throw std::runtime_error( msg.str().c_str() );
635  }
636  }
637 
638  // Column relative indices
639  std::vector<Int> relCols( UB.numCol );
640  Int SinvColsSta = FirstBlockCol( jsup, super_ );
641  for( Int j = 0; j < UB.numCol; j++ ){
642  relCols[j] = UB.cols[j] - SinvColsSta;
643  }
644 
645  // Transfer the values from Sinv to AinvBlock
646  T* nzvalSinv = SinvB.nzval.Data();
647  Int ldSinv = SinvB.numRow;
648  for( Int j = 0; j < UB.numCol; j++ ){
649  for( Int i = 0; i < LB.numRow; i++ ){
650  nzvalAinv[i+j*ldAinv] =
651  nzvalSinv[relRows[i] + relCols[j] * ldSinv];
652  }
653  }
654 
655  isBlockFound = true;
656  break;
657  }
658  } // for (ibSinv )
659  TIMER_STOP(PARSING_ROW_BLOCKIDX);
660  if( isBlockFound == false ){
661  std::ostringstream msg;
662  msg << "Block(" << isup << ", " << jsup
663  << ") did not find a matching block in Sinv." << std::endl;
664  throw std::runtime_error( msg.str().c_str() );
665  }
666  } // if (isup, jsup) is in L
667  else{
668  std::vector<UBlock<T> >& UrowSinv = this->U( LBi( isup, grid_ ) );
669  bool isBlockFound = false;
670  TIMER_START(PARSING_COL_BLOCKIDX);
671  for( Int jbSinv = 0; jbSinv < UrowSinv.size(); jbSinv++ ){
672  // Found the (isup, jsup) block in Sinv
673  if( UrowSinv[jbSinv].blockIdx == jsup ){
674  UBlock<T> & SinvB = UrowSinv[jbSinv];
675 
676  // Row relative indices
677  std::vector<Int> relRows( LB.numRow );
678  Int SinvRowsSta = FirstBlockCol( isup, super_ );
679  for( Int i = 0; i < LB.numRow; i++ ){
680  relRows[i] = LB.rows[i] - SinvRowsSta;
681  }
682 
683  // Column relative indices
684  std::vector<Int> relCols( UB.numCol );
685  Int* colsUBPtr = UB.cols.Data();
686  Int* colsSinvBPtr = SinvB.cols.Data();
687  for( Int j = 0; j < UB.numCol; j++ ){
688  bool isColFound = false;
689  for( Int j1 = 0; j1 < SinvB.numCol; j1++ ){
690  if( colsUBPtr[j] == colsSinvBPtr[j1] ){
691  isColFound = true;
692  relCols[j] = j1;
693  break;
694  }
695  }
696  if( isColFound == false ){
697  std::ostringstream msg;
698  msg << "Col " << colsUBPtr[j] <<
699  " in UB cannot find the corresponding column in SinvB" << std::endl
700  << "UB.cols = " << UB.cols << std::endl
701  << "SinvB.cols = " << SinvB.cols << std::endl;
702  throw std::runtime_error( msg.str().c_str() );
703  }
704  }
705 
706 
707  // Transfer the values from Sinv to AinvBlock
708  T* nzvalSinv = SinvB.nzval.Data();
709  Int ldSinv = SinvB.numRow;
710  for( Int j = 0; j < UB.numCol; j++ ){
711  for( Int i = 0; i < LB.numRow; i++ ){
712  nzvalAinv[i+j*ldAinv] =
713  nzvalSinv[relRows[i] + relCols[j] * ldSinv];
714  }
715  }
716 
717  isBlockFound = true;
718  break;
719  }
720  } // for (jbSinv)
721  TIMER_STOP(PARSING_COL_BLOCKIDX);
722  if( isBlockFound == false ){
723  std::ostringstream msg;
724  msg << "Block(" << isup << ", " << jsup
725  << ") did not find a matching block in Sinv." << std::endl;
726  throw std::runtime_error( msg.str().c_str() );
727  }
728  } // if (isup, jsup) is in U
729 
730  } // for( ib )
731  } // for ( jb )
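 // At this point AinvBuf holds, block by block, the entries of the
 // already-computed part of the inverse (Sinv) matching the sparsity pattern
 // of L(:, snode.Index) times U(snode.Index, :): entry
 // AinvBuf( rowPtr[ib]+i, colPtr[jb]+j ) corresponds to global row LB.rows[i]
 // and global column UB.cols[j].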
732 
733 
734 #endif
735  TIMER_STOP(JB_Loop);
736 
737 
738  TIMER_STOP(Compute_Sinv_LT_Lookup_Indexes);
739  }
740 
741  template<typename T>
742  inline void PMatrix<T>::SendRecvCD_UpdateU(
743  std::vector<SuperNodeBufferType > & arrSuperNodes,
744  Int stepSuper)
745  {
746 
747  TIMER_START(Send_CD_Update_U);
748  //compute the number of requests
749  Int sendCount = 0;
750  Int recvCount = 0;
751  Int sendOffset[stepSuper];
752  Int recvOffset[stepSuper];
753  Int recvIdx=0;
754  for (Int supidx=0; supidx<stepSuper; supidx++){
755  SuperNodeBufferType & snode = arrSuperNodes[supidx];
756  sendOffset[supidx]=sendCount;
757  recvOffset[supidx]=recvCount;
758  sendCount+= CountSendToCrossDiagonal(snode.Index);
759  recvCount+= CountRecvFromCrossDiagonal(snode.Index);
760  }
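 // sendOffset[supidx] / recvOffset[supidx] are running prefix sums: e.g. if
 // the three supernodes of this step contribute 2, 0 and 1 cross-diagonal
 // sends, then sendOffset = {0, 2, 2} and sendCount == 3, so each supernode
 // indexes a disjoint slice of the request/buffer arrays below.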
761 
762 
763  std::vector<MPI_Request > arrMpiReqsSendCD(sendCount, MPI_REQUEST_NULL );
764  std::vector<MPI_Request > arrMpiReqsSizeSendCD(sendCount, MPI_REQUEST_NULL );
765 
766  std::vector<MPI_Request > arrMpiReqsRecvCD(recvCount, MPI_REQUEST_NULL );
767  std::vector<MPI_Request > arrMpiReqsSizeRecvCD(recvCount, MPI_REQUEST_NULL );
768  std::vector<std::vector<char> > arrSstrLcolSendCD(sendCount);
769  std::vector<int > arrSstrLcolSizeSendCD(sendCount);
770  std::vector<std::vector<char> > arrSstrLcolRecvCD(recvCount);
771  std::vector<int > arrSstrLcolSizeRecvCD(recvCount);
772 
773  for (Int supidx=0; supidx<stepSuper; supidx++){
774  SuperNodeBufferType & snode = arrSuperNodes[supidx];
775 
776  // Send LUpdateBufReduced to the cross diagonal blocks.
777  // NOTE: This assumes square processor grid
778 
779  TIMER_START(Send_L_CrossDiag);
780 
781  if( MYCOL( grid_ ) == PCOL( snode.Index, grid_ ) && isSendToCrossDiagonal_(grid_->numProcCol, snode.Index ) ){
782 
783  Int sendIdx = 0;
784  for(Int dstCol = 0; dstCol<grid_->numProcCol; dstCol++){
785  if(isSendToCrossDiagonal_(dstCol,snode.Index) ){
786  Int dest = PNUM(PROW(snode.Index,grid_),dstCol,grid_);
787 
788  if( MYPROC( grid_ ) != dest ){
789  MPI_Request & mpiReqSizeSend = arrMpiReqsSizeSendCD[sendOffset[supidx]+sendIdx];
790  MPI_Request & mpiReqSend = arrMpiReqsSendCD[sendOffset[supidx]+sendIdx];
791 
792 
793  std::stringstream sstm;
794  std::vector<char> & sstrLcolSend = arrSstrLcolSendCD[sendOffset[supidx]+sendIdx];
795  Int & sstrSize = arrSstrLcolSizeSendCD[sendOffset[supidx]+sendIdx];
796 
797  serialize( snode.RowLocalPtr, sstm, NO_MASK );
798  serialize( snode.BlockIdxLocal, sstm, NO_MASK );
799  serialize( snode.LUpdateBuf, sstm, NO_MASK );
800 
801  sstrLcolSend.resize( Size(sstm) );
802  sstm.read( &sstrLcolSend[0], sstrLcolSend.size() );
803  sstrSize = sstrLcolSend.size();
804 
805 
806  MPI_Isend( &sstrSize, sizeof(sstrSize), MPI_BYTE, dest, IDX_TO_TAG(snode.Index,SELINV_TAG_L_SIZE_CD), grid_->comm, &mpiReqSizeSend );
807  MPI_Isend( (void*)&sstrLcolSend[0], sstrSize, MPI_BYTE, dest, IDX_TO_TAG(snode.Index,SELINV_TAG_L_CONTENT_CD), grid_->comm, &mpiReqSend );
808 
809  PROFILE_COMM(MYPROC(this->grid_),dest,IDX_TO_TAG(snode.Index,SELINV_TAG_L_SIZE_CD),sizeof(sstrSize));
810  PROFILE_COMM(MYPROC(this->grid_),dest,IDX_TO_TAG(snode.Index,SELINV_TAG_L_CONTENT_CD),sstrSize);
811 
812 
813  sendIdx++;
814  }
815  }
816  }
817 
818 
819  } // sender
820  TIMER_STOP(Send_L_CrossDiag);
821  }
822 
823 
824  //Do Irecv for sizes
825  for (Int supidx=0; supidx<stepSuper; supidx++){
826  SuperNodeBufferType & snode = arrSuperNodes[supidx];
827  //If I'm a receiver
828  if( MYROW( grid_ ) == PROW( snode.Index, grid_ ) && isRecvFromCrossDiagonal_(grid_->numProcRow, snode.Index ) ){
829  Int recvIdx=0;
830  for(Int srcRow = 0; srcRow<grid_->numProcRow; srcRow++){
831  if(isRecvFromCrossDiagonal_(srcRow,snode.Index) ){
832  Int src = PNUM(srcRow,PCOL(snode.Index,grid_),grid_);
833  if( MYPROC( grid_ ) != src ){
834  Int & sstrSize = arrSstrLcolSizeRecvCD[recvOffset[supidx]+recvIdx];
835  MPI_Request & mpiReqSizeRecv = arrMpiReqsSizeRecvCD[recvOffset[supidx]+recvIdx];
836  MPI_Irecv( &sstrSize, 1, MPI_INT, src, IDX_TO_TAG(snode.Index,SELINV_TAG_L_SIZE_CD), grid_->comm, &mpiReqSizeRecv );
837  recvIdx++;
838  }
839  }
840  }
841  }//end if I'm a receiver
842  }
843 
844  //waitall sizes
845  mpi::Waitall(arrMpiReqsSizeRecvCD);
846  //Allocate content and do Irecv
847  for (Int supidx=0; supidx<stepSuper; supidx++){
848  SuperNodeBufferType & snode = arrSuperNodes[supidx];
849  //If I'm a receiver
850  if( MYROW( grid_ ) == PROW( snode.Index, grid_ ) && isRecvFromCrossDiagonal_(grid_->numProcRow, snode.Index ) ){
851  Int recvIdx=0;
852  for(Int srcRow = 0; srcRow<grid_->numProcRow; srcRow++){
853  if(isRecvFromCrossDiagonal_(srcRow,snode.Index) ){
854  Int src = PNUM(srcRow,PCOL(snode.Index,grid_),grid_);
855  if( MYPROC( grid_ ) != src ){
856  Int & sstrSize = arrSstrLcolSizeRecvCD[recvOffset[supidx]+recvIdx];
857  std::vector<char> & sstrLcolRecv = arrSstrLcolRecvCD[recvOffset[supidx]+recvIdx];
858  MPI_Request & mpiReqRecv = arrMpiReqsRecvCD[recvOffset[supidx]+recvIdx];
859  sstrLcolRecv.resize( sstrSize);
860  MPI_Irecv( (void*)&sstrLcolRecv[0], sstrSize, MPI_BYTE, src, IDX_TO_TAG(snode.Index,SELINV_TAG_L_CONTENT_CD), grid_->comm, &mpiReqRecv );
861  //statusOFS<<"P"<<MYPROC(grid_)<<" received "<<sstrSize<<" bytes of L/U from CD P"<<src<<std::endl;
862  recvIdx++;
863  }
864  }
865  }
866  }//end if I'm a receiver
867  }
868 
869  //waitall content
870  mpi::Waitall(arrMpiReqsRecvCD);
871  //Do the work
872  for (Int supidx=0; supidx<stepSuper; supidx++){
873  SuperNodeBufferType & snode = arrSuperNodes[supidx];
874 
875  // Receive LUpdateBufReduced from the cross-diagonal blocks and update U.
876  // NOTE: This assumes a square processor grid
877  if( MYROW( grid_ ) == PROW( snode.Index, grid_ ) && isRecvFromCrossDiagonal_(grid_->numProcRow, snode.Index ) ){
878 
879 #if ( _DEBUGlevel_ >= 1 )
880  statusOFS << std::endl << " ["<<snode.Index<<"] "<< "Update the upper triangular block" << std::endl << std::endl;
881  statusOFS << std::endl << " ["<<snode.Index<<"] "<< "blockIdxLocal:" << snode.BlockIdxLocal << std::endl << std::endl;
882  statusOFS << std::endl << " ["<<snode.Index<<"] "<< "rowLocalPtr:" << snode.RowLocalPtr << std::endl << std::endl;
883 #endif
884 
885  std::vector<UBlock<T> >& Urow = this->U( LBi( snode.Index, grid_ ) );
886  std::vector<bool> isBlockFound(Urow.size(),false);
887 
888  recvIdx=0;
889  for(Int srcRow = 0; srcRow<grid_->numProcRow; srcRow++){
890  if(isRecvFromCrossDiagonal_(srcRow,snode.Index) ){
891  Int src = PNUM(srcRow,PCOL(snode.Index,grid_),grid_);
892  TIMER_START(Recv_L_CrossDiag);
893 
894  std::vector<Int> rowLocalPtrRecv;
895  std::vector<Int> blockIdxLocalRecv;
896  NumMat<T> UUpdateBuf;
897 
898  if( MYPROC( grid_ ) != src ){
899  std::stringstream sstm;
900  Int & sstrSize = arrSstrLcolSizeRecvCD[recvOffset[supidx]+recvIdx];
901  std::vector<char> & sstrLcolRecv = arrSstrLcolRecvCD[recvOffset[supidx]+recvIdx];
902  sstm.write( &sstrLcolRecv[0], sstrSize );
903 
904  deserialize( rowLocalPtrRecv, sstm, NO_MASK );
905  deserialize( blockIdxLocalRecv, sstm, NO_MASK );
906  deserialize( UUpdateBuf, sstm, NO_MASK );
907 
908  recvIdx++;
909 
910  } // sender is not the same as receiver
911  else{
912  rowLocalPtrRecv = snode.RowLocalPtr;
913  blockIdxLocalRecv = snode.BlockIdxLocal;
914  UUpdateBuf = snode.LUpdateBuf;
915  } // sender is the same as receiver
916 
917 
918 
919  TIMER_STOP(Recv_L_CrossDiag);
920 
921 #if ( _DEBUGlevel_ >= 1 )
922  statusOFS<<" ["<<snode.Index<<"] P"<<MYPROC(grid_)<<" ("<<MYROW(grid_)<<","<<MYCOL(grid_)<<") <--- LBj("<<snode.Index<<") <--- P"<<src<<std::endl;
923  statusOFS << std::endl << " ["<<snode.Index<<"] "<< "rowLocalPtrRecv:" << rowLocalPtrRecv << std::endl << std::endl;
924  statusOFS << std::endl << " ["<<snode.Index<<"] "<< "blockIdxLocalRecv:" << blockIdxLocalRecv << std::endl << std::endl;
925 #endif
926 
927 
928  // Update U
929  for( Int ib = 0; ib < blockIdxLocalRecv.size(); ib++ ){
930  for( Int jb = 0; jb < Urow.size(); jb++ ){
931  UBlock<T>& UB = Urow[jb];
932  if( UB.blockIdx == blockIdxLocalRecv[ib] ){
933  NumMat<T> Ltmp ( UB.numCol, UB.numRow );
934  lapack::Lacpy( 'A', Ltmp.m(), Ltmp.n(),
935  &UUpdateBuf( rowLocalPtrRecv[ib], 0 ),
936  UUpdateBuf.m(), Ltmp.Data(), Ltmp.m() );
937  isBlockFound[jb] = true;
938  Transpose( Ltmp, UB.nzval );
939  break;
940  }
941  }
942  }
943  }
944  }
945 
946  for( Int jb = 0; jb < Urow.size(); jb++ ){
947  UBlock<T>& UB = Urow[jb];
948  if( !isBlockFound[jb] ){
949 #ifdef USE_ABORT
950  abort();
951 #endif
952  throw std::logic_error( "UBlock cannot find its update. Something is seriously wrong." );
953  }
954  }
955  } // receiver
956  }
957 
958  TIMER_STOP(Send_CD_Update_U);
959 
960  mpi::Waitall(arrMpiReqsSizeSendCD);
961  mpi::Waitall(arrMpiReqsSendCD);
962  };
963 
964  template<typename T>
965  inline void PMatrix<T>::UnpackData(
966  SuperNodeBufferType & snode,
967  std::vector<LBlock<T> > & LcolRecv,
968  std::vector<UBlock<T> > & UrowRecv )
969  {
970 #if ( _DEBUGlevel_ >= 1 )
971  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Unpack the received data for processors that participate in Gemm. " << std::endl << std::endl;
972 #endif
973  // U part
974  if( MYROW( grid_ ) != PROW( snode.Index, grid_ ) ){
975  std::stringstream sstm;
976  sstm.write( &snode.SstrUrowRecv[0], snode.SstrUrowRecv.size() );
977  std::vector<Int> mask( UBlockMask::TOTAL_NUMBER, 1 );
978  Int numUBlock;
979  deserialize( numUBlock, sstm, NO_MASK );
980  UrowRecv.resize( numUBlock );
981  for( Int jb = 0; jb < numUBlock; jb++ ){
982  deserialize( UrowRecv[jb], sstm, mask );
983  }
984  } // sender is not the same as receiver
985  else{
986  // U is obtained locally, just make a copy. Include everything
987  // (there is no diagonal block)
988  // Is it a copy ? LL: YES. Maybe we should replace the copy by
989  // something more efficient especially for mpisize == 1
990  UrowRecv.resize(this->U( LBi( snode.Index, grid_ ) ).size());
991  std::copy(this->U( LBi( snode.Index, grid_ ) ).begin(),this->U( LBi( snode.Index, grid_ )).end(),UrowRecv.begin());
992  } // sender is the same as receiver
993 
994 
995  //L part
996  if( MYCOL( grid_ ) != PCOL( snode.Index, grid_ ) ){
997  std::stringstream sstm;
998  sstm.write( &snode.SstrLcolRecv[0], snode.SstrLcolRecv.size() );
999  std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
1000  mask[LBlockMask::NZVAL] = 0; // nzval is excluded
1001  Int numLBlock;
1002  deserialize( numLBlock, sstm, NO_MASK );
1003  LcolRecv.resize( numLBlock );
1004  for( Int ib = 0; ib < numLBlock; ib++ ){
1005  deserialize( LcolRecv[ib], sstm, mask );
1006  }
1007  } // sender is not the same as receiver
1008  else{
1009  // L is obtained locally, just make a copy.
1010  // Do not include the diagonal block
1011  std::vector<LBlock<T> >& Lcol = this->L( LBj( snode.Index, grid_ ) );
1012  Int startIdx = ( MYROW( grid_ ) == PROW( snode.Index, grid_ ) )?1:0;
1013  LcolRecv.resize( Lcol.size() - startIdx );
1014  std::copy(Lcol.begin()+startIdx,Lcol.end(),LcolRecv.begin());
1015  } // sender is the same as receiver
1016  }
1017 
1018  template<typename T>
1019  inline void PMatrix<T>::ComputeDiagUpdate(SuperNodeBufferType & snode)
1020  {
1021 
1022  //---------Computing the diagonal block; all processors in the column participate in all pipelined supernodes
1023  if( MYCOL( grid_ ) == PCOL( snode.Index, grid_ ) ){
1024 #if ( _DEBUGlevel_ >= 2 )
1025  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Updating the diagonal block" << std::endl << std::endl;
1026 #endif
1027  std::vector<LBlock<T> >& Lcol = this->L( LBj( snode.Index, grid_ ) );
1028 
1029  //Allocate DiagBuf even if Lcol.size() == 0
1030  snode.DiagBuf.Resize(SuperSize( snode.Index, super_ ), SuperSize( snode.Index, super_ ));
1031  SetValue(snode.DiagBuf, ZERO<T>());
1032 
1033  // Do I own the diagonal block ?
1034  Int startIb = (MYROW( grid_ ) == PROW( snode.Index, grid_ ))?1:0;
1035  for( Int ib = startIb; ib < Lcol.size(); ib++ ){
1036 
1037 #ifdef GEMM_PROFILE
1038  gemm_stat.push_back(snode.DiagBuf.m());
1039  gemm_stat.push_back(snode.DiagBuf.n());
1040  gemm_stat.push_back(Lcol[ib].numRow);
1041 #endif
1042 
1043  LBlock<T> & LB = Lcol[ib];
1044 
1045  blas::Gemm( 'T', 'N', snode.DiagBuf.m(), snode.DiagBuf.n(), LB.numRow,
1046  MINUS_ONE<T>(), &snode.LUpdateBuf( snode.RowLocalPtr[ib-startIb], 0 ), snode.LUpdateBuf.m(),
1047  LB.nzval.Data(), LB.nzval.m(), ONE<T>(), snode.DiagBuf.Data(), snode.DiagBuf.m() );
1048  }
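 // Dimension check for the Gemm above: the LUpdateBuf slice starting at row
 // RowLocalPtr[ib-startIb] is LB.numRow x SuperSize(snode.Index), so its
 // transpose times the LB.numRow x SuperSize(snode.Index) block LB.nzval
 // accumulates a SuperSize x SuperSize contribution into DiagBuf.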
1049 
1050 #if ( _DEBUGlevel_ >= 1 )
1051  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Updated the diagonal block" << std::endl << std::endl;
1052 #endif
1053  }
1054  }
1055 
1056 
1057  template<typename T>
1058  void PMatrix<T>::GetEtree(std::vector<Int> & etree_supno )
1059  {
1060 #ifndef _RELEASE_
1061  PushCallStack("PMatrix::GetEtree");
1062  double begin = MPI_Wtime( );
1063 #endif
1064  Int nsupers = this->NumSuper();
1065 
1066  if( options_->ColPerm != "PARMETIS" ) {
1067  /* Use the etree computed from the serial symbolic factorization, and turn it
1068  into a supernodal tree. */
1069  const SuperNodeType * superNode = this->SuperNode();
1070 
1071 
1072  //translate the column etree into a supernodal etree using superIdx
1073  etree_supno.resize(this->NumSuper());
1074  for(Int i = 0; i < superNode->etree.m(); ++i){
1075  Int curSnode = superNode->superIdx[i];
1076  Int parentSnode = (superNode->etree[i]>= superNode->etree.m()) ?this->NumSuper():superNode->superIdx[superNode->etree[i]];
1077  if( curSnode != parentSnode){
1078  etree_supno[curSnode] = parentSnode;
1079  }
1080  }
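 // Illustration (hypothetical sizes): if columns {0,1} belong to supernode 0
 // and columns {2,3} to supernode 1, and the column etree gives etree[1] = 2,
 // then superIdx[1] = 0 and superIdx[2] = 1, so etree_supno[0] is set to 1.
 // A column whose parent lies past the last column maps its supernode's
 // parent to NumSuper(), the virtual root.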
1081 
1082  } else { /* ParSymbFACT==YES and SymPattern==YES and RowPerm == NOROWPERM */
1083  /* Compute an "etree" based on struct(L),
1084  assuming struct(U) = struct(L'). */
1085 
1086  /* find the first block in each supernodal-column of local L-factor */
1087  std::vector<Int> etree_supno_l( nsupers, nsupers );
1088  for( Int ksup = 0; ksup < nsupers; ksup++ ){
1089  if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
1090  // L part
1091  std::vector<LBlock<T> >& Lcol = this->L( LBj(ksup, grid_) );
1092  if(Lcol.size()>0){
1093  Int firstBlk = 0;
1094  if( MYROW( grid_ ) == PROW( ksup, grid_ ) )
1095  firstBlk=1;
1096 
1097  for( Int ib = firstBlk; ib < Lcol.size(); ib++ ){
1098  etree_supno_l[ksup] = std::min(etree_supno_l[ksup] , Lcol[ib].blockIdx);
1099  }
1100  }
1101  }
1102  }
1103 
1104 
1105 #if ( _DEBUGlevel_ >= 1 )
1106  statusOFS << std::endl << " Local supernodal elimination tree is " << etree_supno_l <<std::endl<<std::endl;
1107 
1108 #endif
1109  /* form global e-tree */
1110  etree_supno.resize( nsupers );
1111  mpi::Allreduce( (Int*) &etree_supno_l[0],(Int *) &etree_supno[0], nsupers, MPI_MIN, grid_->comm );
1112  etree_supno[nsupers-1]=nsupers;
1113  }
1114 
1115 #ifndef _RELEASE_
1116  double end = MPI_Wtime( );
1117  statusOFS<<"Building the list took "<<end-begin<<"s"<<std::endl;
1118 #endif
1119 #ifndef _RELEASE_
1120  PopCallStack();
1121 #endif
1122  } // ----- end of method PMatrix::GetEtree -----
1123 
1124  template<typename T>
1125  inline void PMatrix<T>::GetWorkSet(
1126  std::vector<Int> & snodeEtree,
1127  std::vector<std::vector<Int> > & WSet)
1128  {
1129  TIMER_START(Compute_WorkSet);
1130  Int numSuper = this->NumSuper();
1131 
1132 
1133  if (options_->maxPipelineDepth!=1){
1134  //find roots in the supernode etree (it must be postordered)
1135  //initialize the parent we are looking at
1136  //Int rootParent = snodeEtree[numSuper-2];
1137  Int rootParent = numSuper;
1138 
1139  //compute the level of each supernode and the total number of levels
1140  //IntNumVec level(numSuper);
1141  //level(rootParent)=0;
1142  IntNumVec level(numSuper+1);
1143  level(rootParent)=-1;
1144  Int numLevel = 0;
1145  for(Int i=rootParent-1; i>=0; i-- ){ level(i) = level(snodeEtree[i])+1; numLevel = std::max(numLevel, level(i)); }
1146  numLevel++;
1147 
1148  //Compute the number of supernodes at each level
1149  IntNumVec levelSize(numLevel);
1150  SetValue(levelSize,I_ZERO);
1151  //for(Int i=rootParent-1; i>=0; i-- ){ levelSize(level(i))++; }
1152  for(Int i=rootParent-1; i>=0; i-- ){ if(level[i]>=0){ levelSize(level(i))++; } }
1153 
1154  //Allocate memory
1155  WSet.resize(numLevel,std::vector<Int>());
1156  for(Int i=0; i<numLevel; i++ ){WSet[i].reserve(levelSize(i));}
1157 
1158  //Fill the worklist based on the level of each supernode
1159  for(Int i=rootParent-1; i>=0; i-- ){
1160  WSet[level(i)].push_back(i);
1161  }
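 // Example (illustrative): for the supernodal etree 0->2, 1->2, 2->3, 3->root
 // the levels are {2, 2, 1, 0}, so WSet becomes { {3}, {2}, {1, 0} } and the
 // supernodes of a given level can be pipelined together.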
1162 
1163  //Constrain the size of each list to be min(MPI_MAX_COMM,options_->maxPipelineDepth)
1164  Int limit = (options_->maxPipelineDepth>0)?std::min(MPI_MAX_COMM,options_->maxPipelineDepth):MPI_MAX_COMM;
1165  for (Int lidx=0; lidx<WSet.size() ; lidx++){
1166  if(WSet[lidx].size()>limit)
1167  {
1168  std::vector<std::vector<Int> >::iterator pos = WSet.begin()+lidx+1;
1169  WSet.insert(pos,std::vector<Int>());
1170  WSet[lidx+1].insert(WSet[lidx+1].begin(),WSet[lidx].begin() + limit ,WSet[lidx].end());
1171  WSet[lidx].erase(WSet[lidx].begin()+limit,WSet[lidx].end());
1172  }
1173  }
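 // If a level exceeds the limit it is split in place: with limit = 2 a level
 // {5, 6, 7} becomes two consecutive levels {5, 6} and {7}, so at most
 // min(MPI_MAX_COMM, maxPipelineDepth) supernodes are pipelined at once.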
1174 
1175 
1176 
1177  }
1178  else{
1179  for( Int ksup = numSuper - 2; ksup >= 0; ksup-- ){
1180  WSet.push_back(std::vector<Int>());
1181  WSet.back().push_back(ksup);
1182  }
1183 
1184  }
1185 
1186 
1187 
1188 
1189  TIMER_STOP(Compute_WorkSet);
1190 #if ( _DEBUGlevel_ >= 1 )
1191  for (Int lidx=0; lidx<WSet.size() ; lidx++){
1192  statusOFS << std::endl << "L"<< lidx << " is: {";
1193  for (Int supidx=0; supidx<WSet[lidx].size() ; supidx++){
1194  statusOFS << WSet[lidx][supidx] << " ["<<snodeEtree[WSet[lidx][supidx]]<<"] ";
1195  }
1196  statusOFS << " }"<< std::endl;
1197  }
1198 #endif
1199  }
1200 
1201  template<typename T>
1202  inline void PMatrix<T>::SelInvIntra_P2p(Int lidx)
1203  {
1204 
1205 #if defined (PROFILE) || defined(PMPI) || defined(USE_TAU)
1206  Real begin_SendULWaitContentFirst, end_SendULWaitContentFirst, time_SendULWaitContentFirst = 0;
1207 #endif
1208  Int numSuper = this->NumSuper();
1209  std::vector<std::vector<Int> > & superList = this->WorkingSet();
1210  Int numSteps = superList.size();
1211  Int stepSuper = superList[lidx].size();
1212 
1213  TIMER_START(AllocateBuffer);
1214 
1215  //This is required to send the size and content of U/L
1216  std::vector<std::vector<MPI_Request> > arrMpireqsSendToBelow;
1217  arrMpireqsSendToBelow.resize( stepSuper, std::vector<MPI_Request>( 2 * grid_->numProcRow, MPI_REQUEST_NULL ));
1218  std::vector<std::vector<MPI_Request> > arrMpireqsSendToRight;
1219  arrMpireqsSendToRight.resize(stepSuper, std::vector<MPI_Request>( 2 * grid_->numProcCol, MPI_REQUEST_NULL ));
1220 
1221  //This is required to reduce L
1222  std::vector<MPI_Request> arrMpireqsSendToLeft;
1223  arrMpireqsSendToLeft.resize(stepSuper, MPI_REQUEST_NULL );
1224  //This is required to reduce D
1225  std::vector<MPI_Request> arrMpireqsSendToAbove;
1226  arrMpireqsSendToAbove.resize(stepSuper, MPI_REQUEST_NULL );
1227 
1228  //This is required to receive the size and content of U/L
1229  std::vector<MPI_Request> arrMpireqsRecvSizeFromAny;
1230  arrMpireqsRecvSizeFromAny.resize(stepSuper*2 , MPI_REQUEST_NULL);
1231  std::vector<MPI_Request> arrMpireqsRecvContentFromAny;
1232  arrMpireqsRecvContentFromAny.resize(stepSuper*2 , MPI_REQUEST_NULL);
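 // Note: these arrays hold two requests per supernode; index 2*supidx is the
 // receive of the U content (from above, along colComm) and 2*supidx+1 is the
 // receive of the L content (from the left, along rowComm). This is why the
 // completion loop below decodes supidx = reqidx/2 and the block type from
 // reqidx%2.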
1233 
1234 
1235  //allocate the buffers for this supernode
1236  std::vector<SuperNodeBufferType> arrSuperNodes(stepSuper);
1237  for (Int supidx=0; supidx<stepSuper; supidx++){
1238  arrSuperNodes[supidx].Index = superList[lidx][supidx];
1239  }
1240 
1241 
1242 
1243  int numSentToLeft = 0;
1244  std::vector<int> reqSentToLeft;
1245 
1246 
1247  NumMat<T> AinvBuf, UBuf;
1248 
1249  TIMER_STOP(AllocateBuffer);
1250 
1251 #ifndef _RELEASE_
1252  PushCallStack("PMatrix::SelInv_P2p::UpdateL");
1253 #endif
1254 #if ( _DEBUGlevel_ >= 1 )
1255  statusOFS << std::endl << "Communication to the Schur complement." << std::endl << std::endl;
1256 #endif
1257 
1258  {
1259  // Senders
1260  //Receivers have to resize their buffers
1261  TIMER_START(IRecv_Content_UL);
1262  // Receivers (Content)
1263  for (Int supidx=0; supidx<stepSuper ; supidx++){
1264  SuperNodeBufferType & snode = arrSuperNodes[supidx];
1265  MPI_Request * mpireqsRecvFromAbove = &arrMpireqsRecvContentFromAny[supidx*2];
1266  MPI_Request * mpireqsRecvFromLeft = &arrMpireqsRecvContentFromAny[supidx*2+1];
1267 
1268  if( isRecvFromAbove_( snode.Index ) &&
1269  MYROW( grid_ ) != PROW( snode.Index, grid_ ) ){
1270 
1271  TreeBcast * bcastUTree = fwdToBelowTree_[snode.Index];
1272  if(bcastUTree!=NULL){
1273  Int myRoot = bcastUTree->GetRoot();
1274  snode.SizeSstrUrowRecv = bcastUTree->GetMsgSize();
1275  snode.SstrUrowRecv.resize( snode.SizeSstrUrowRecv);
1276  MPI_Irecv( &snode.SstrUrowRecv[0], snode.SizeSstrUrowRecv, MPI_BYTE,
1277  myRoot, IDX_TO_TAG(snode.Index,SELINV_TAG_U_CONTENT),
1278  grid_->colComm, mpireqsRecvFromAbove );
1279 #if ( _DEBUGlevel_ >= 1 )
1280  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Receiving U " << snode.SizeSstrUrowRecv << " BYTES from "<<myRoot<<" on tag "<<IDX_TO_TAG(snode.Index,SELINV_TAG_U_CONTENT)<< std::endl << std::endl;
1281 #endif
1282  }
1283  } // if I need to receive from up
1284 
1285  if( isRecvFromLeft_( snode.Index ) &&
1286  MYCOL( grid_ ) != PCOL( snode.Index, grid_ ) ){
1287  TreeBcast * bcastLTree = fwdToRightTree_[snode.Index];
1288  if(bcastLTree!=NULL){
1289  Int myRoot = bcastLTree->GetRoot();
1290  snode.SizeSstrLcolRecv = bcastLTree->GetMsgSize();
1291  snode.SstrLcolRecv.resize(snode.SizeSstrLcolRecv);
1292  MPI_Irecv( &snode.SstrLcolRecv[0], snode.SizeSstrLcolRecv, MPI_BYTE,
1293  myRoot, IDX_TO_TAG(snode.Index,SELINV_TAG_L_CONTENT),
1294  grid_->rowComm, mpireqsRecvFromLeft );
1295 #if ( _DEBUGlevel_ >= 1 )
1296  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Receiving L " << snode.SizeSstrLcolRecv << " BYTES from "<<myRoot<<" on tag "<<IDX_TO_TAG(snode.Index,SELINV_TAG_L_CONTENT)<< std::endl << std::endl;
1297 #endif
1298  }
1299  } // if I need to receive from left
1300  }
1301  TIMER_STOP(IRecv_Content_UL);
1302 
1303  // Senders
1304  TIMER_START(ISend_Content_UL);
1305  for (Int supidx=0; supidx<stepSuper; supidx++){
1306  SuperNodeBufferType & snode = arrSuperNodes[supidx];
1307  std::vector<MPI_Request> & mpireqsSendToBelow = arrMpireqsSendToBelow[supidx];
1308  std::vector<MPI_Request> & mpireqsSendToRight = arrMpireqsSendToRight[supidx];
1309 
1310 #if ( _DEBUGlevel_ >= 1 )
1311  statusOFS << std::endl << "["<<snode.Index<<"] "
1312  << "Communication for the U part." << std::endl << std::endl;
1313 #endif
1314  // Communication for the U part.
1315  if( MYROW( grid_ ) == PROW( snode.Index, grid_ ) ){
1316  std::vector<UBlock<T> >& Urow = this->U( LBi(snode.Index, grid_) );
1317  // Pack the data in U
1318  TIMER_START(Serialize_UL);
1319  std::stringstream sstm;
1320 
1321  std::vector<Int> mask( UBlockMask::TOTAL_NUMBER, 1 );
1322  // All blocks are to be sent down.
1323  serialize( (Int)Urow.size(), sstm, NO_MASK );
1324  for( Int jb = 0; jb < Urow.size(); jb++ ){
1325  serialize( Urow[jb], sstm, mask );
1326  }
1327  snode.SstrUrowSend.resize( Size( sstm ) );
1328  sstm.read( &snode.SstrUrowSend[0], snode.SstrUrowSend.size() );
1329  snode.SizeSstrUrowSend = snode.SstrUrowSend.size();
1330  TIMER_STOP(Serialize_UL);
1331  TreeBcast * bcastUTree = fwdToBelowTree_[snode.Index];
1332  if(bcastUTree!=NULL){
1333  bcastUTree->ForwardMessage((char*)&snode.SstrUrowSend[0], snode.SizeSstrUrowSend,
1334  IDX_TO_TAG(snode.Index,SELINV_TAG_U_CONTENT), &mpireqsSendToBelow[0] );
1335  for( Int idxRecv = 0; idxRecv < bcastUTree->GetDestCount(); ++idxRecv ){
1336  Int iProcRow = bcastUTree->GetDest(idxRecv);
1337 #if ( _DEBUGlevel_ >= 1 )
1338  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Sending U " << snode.SizeSstrUrowSend << " BYTES on tag "<<IDX_TO_TAG(snode.Index,SELINV_TAG_U_CONTENT) << std::endl << std::endl;
1339 #endif
1340  }
1341  }
1342  } // if I am the sender
1343 
1344 
1345 #if ( _DEBUGlevel_ >= 1 )
1346  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Communication for the L part." << std::endl << std::endl;
1347 #endif
1348 
1349 
1350 
1351  // Communication for the L part.
1352  if( MYCOL( grid_ ) == PCOL( snode.Index, grid_ ) ){
1353  std::vector<LBlock<T> >& Lcol = this->L( LBj(snode.Index, grid_) );
1354  TIMER_START(Serialize_UL);
1355  // Pack the data in L
1356  std::stringstream sstm;
1357  std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
1358  mask[LBlockMask::NZVAL] = 0; // nzval is excluded
1359 
1360  // All blocks except for the diagonal block are to be sent right
1361 
1362  if( MYROW( grid_ ) == PROW( snode.Index, grid_ ) )
1363  serialize( (Int)Lcol.size() - 1, sstm, NO_MASK );
1364  else
1365  serialize( (Int)Lcol.size(), sstm, NO_MASK );
1366 
1367  for( Int ib = 0; ib < Lcol.size(); ib++ ){
1368  if( Lcol[ib].blockIdx > snode.Index ){
1369 #if ( _DEBUGlevel_ >= 2 )
1370  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Serializing Block index " << Lcol[ib].blockIdx << std::endl;
1371 #endif
1372  serialize( Lcol[ib], sstm, mask );
1373  }
1374  }
1375  snode.SstrLcolSend.resize( Size( sstm ) );
1376  sstm.read( &snode.SstrLcolSend[0], snode.SstrLcolSend.size() );
1377  snode.SizeSstrLcolSend = snode.SstrLcolSend.size();
1378  TIMER_STOP(Serialize_UL);
1379 
1380 
1381  TreeBcast * bcastLTree = fwdToRightTree_[snode.Index];
1382  if(bcastLTree!=NULL){
1383  bcastLTree->ForwardMessage((char*)&snode.SstrLcolSend[0], snode.SizeSstrLcolSend,
1384  IDX_TO_TAG(snode.Index,SELINV_TAG_L_CONTENT), &mpireqsSendToRight[0] );
1385 
1386  for( Int idxRecv = 0; idxRecv < bcastLTree->GetDestCount(); ++idxRecv ){
1387  Int iProcCol = bcastLTree->GetDest(idxRecv);
1388 #if ( _DEBUGlevel_ >= 1 )
1389  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Sending L " << snode.SizeSstrLcolSend << " BYTES on tag "<<IDX_TO_TAG(snode.Index,SELINV_TAG_L_CONTENT) << std::endl << std::endl;
1390 #endif
1391  }
1392  }
1393  } // if I am the sender
1394  } //Senders
1395  TIMER_STOP(ISend_Content_UL);
1396  }
1397 
1398  std::vector<char> redLdone(stepSuper,0);
1399  for (Int supidx=0; supidx<stepSuper; supidx++){
1400  SuperNodeBufferType & snode = arrSuperNodes[supidx];
1401  TreeReduce<T> * redLTree = redToLeftTree_[snode.Index];
1402 
1403  if(redLTree != NULL){
1404  redLTree->SetTag(IDX_TO_TAG(snode.Index,SELINV_TAG_L_REDUCE));
1405  //Initialize the tree
1406  redLTree->AllocRecvBuffers();
1407  //Post All Recv requests;
1408  redLTree->PostFirstRecv();
1409  }
1410  }
1411 
1412 
1413  TIMER_START(Compute_Sinv_LT);
1414  {
1415  Int msgForwarded = 0;
1416  Int msgToFwd = 0;
1417  Int gemmProcessed = 0;
1418  Int gemmToDo = 0;
1419  // Int toRecvGemm = 0;
1420  //copy the list of supernodes we need to process
1421  std::list<Int> readySupidx;
1422  //find local things to do
1423  for(Int supidx = 0;supidx<stepSuper;supidx++){
1424  SuperNodeBufferType & snode = arrSuperNodes[supidx];
1425 
1426 
1427  if( isRecvFromAbove_( snode.Index ) && isRecvFromLeft_( snode.Index )){
1428  gemmToDo++;
1429  if( MYCOL( grid_ ) == PCOL( snode.Index, grid_ ) ){
1430  snode.isReady++;
1431  }
1432 
1433  if( MYROW( grid_ ) == PROW( snode.Index, grid_ ) ){
1434  snode.isReady++;
1435  }
1436 
1437  if(snode.isReady==2){
1438  readySupidx.push_back(supidx);
1439 #if ( _DEBUGlevel_ >= 1 )
1440  statusOFS<<std::endl<<"Locally processing ["<<snode.Index<<"]"<<std::endl;
1441 #endif
1442  }
1443  }
1444  else if( (isRecvFromLeft_( snode.Index ) ) && MYCOL( grid_ ) != PCOL( snode.Index, grid_ ) )
1445  {
1446  //Get the reduction tree
1447  TreeReduce<T> * redLTree = redToLeftTree_[snode.Index];
1448 
1449  if(redLTree != NULL){
1450  TIMER_START(Reduce_Sinv_LT_Isend);
1451  //send the data to NULL to ensure 0 byte send
1452  redLTree->SetLocalBuffer(NULL);
1453  redLTree->SetDataReady(true);
1454  bool done = redLTree->Progress();
1455  TIMER_STOP(Reduce_Sinv_LT_Isend);
1456  }
1457  }// if( isRecvFromAbove_( snode.Index ) && isRecvFromLeft_( snode.Index ))
1458 
1459  if(MYROW(grid_)!=PROW(snode.Index,grid_)){
1460  TreeBcast * bcastUTree = fwdToBelowTree_[snode.Index];
1461  if(bcastUTree != NULL){
1462  if(bcastUTree->GetDestCount()>0){
1463  msgToFwd++;
1464  }
1465  }
1466  }
1467 
1468  if(MYCOL(grid_)!=PCOL(snode.Index,grid_)){
1469  TreeBcast * bcastLTree = fwdToRightTree_[snode.Index];
1470  if(bcastLTree != NULL){
1471  if(bcastLTree->GetDestCount()>0){
1472  msgToFwd++;
1473  }
1474  }
1475  }
1476  }
1477 
1478 #if ( _DEBUGlevel_ >= 1 )
1479  statusOFS<<std::endl<<"gemmToDo ="<<gemmToDo<<std::endl;
1480  statusOFS<<std::endl<<"msgToFwd ="<<msgToFwd<<std::endl;
1481 #endif
1482 
1483 
1484 #if defined (PROFILE)
1485  end_SendULWaitContentFirst=0;
1486  begin_SendULWaitContentFirst=0;
1487 #endif
1488 
1489  while(gemmProcessed<gemmToDo || msgForwarded < msgToFwd)
1490  {
1491  Int reqidx = MPI_UNDEFINED;
1492  Int supidx = -1;
1493 
1494 
1495  //while I don't have anything to do, wait for data to arrive
1496  do{
1497 
1498 
1499  int reqIndices[arrMpireqsRecvContentFromAny.size()];
1500  int numRecv = 0;
1501 
1502  //then process with the remote ones
1503 
1504  TIMER_START(WaitContent_UL);
1505 #if defined(PROFILE)
1506  if(begin_SendULWaitContentFirst==0){
1507  begin_SendULWaitContentFirst=1;
1508  TIMER_START(WaitContent_UL_First);
1509  }
1510 #endif
1511 
1512  numRecv = 0;
1513  MPI_Waitsome(2*stepSuper, &arrMpireqsRecvContentFromAny[0], &numRecv, reqIndices, MPI_STATUSES_IGNORE);
1514 
1515  for(int i =0;i<numRecv;i++){
1516  reqidx = reqIndices[i];
1517  //I've received something
1518  if(reqidx!=MPI_UNDEFINED)
1519  {
1520  //this stays true
1521  supidx = reqidx/2;
1522  SuperNodeBufferType & snode = arrSuperNodes[supidx];
1523 
1524  //If it's a U block
1525  if(reqidx%2==0){
1526  TreeBcast * bcastUTree = fwdToBelowTree_[snode.Index];
1527  if(bcastUTree != NULL){
1528  if(bcastUTree->GetDestCount()>0){
1529 
1530  std::vector<MPI_Request> & mpireqsSendToBelow = arrMpireqsSendToBelow[supidx];
1531 #if ( _DEBUGlevel_ >= 1 )
1532  for( Int idxRecv = 0; idxRecv < bcastUTree->GetDestCount(); ++idxRecv ){
1533  Int iProcRow = bcastUTree->GetDest(idxRecv);
1534  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Forwarding U " << snode.SizeSstrUrowRecv << " BYTES to "<<iProcRow<<" on tag "<<IDX_TO_TAG(snode.Index,SELINV_TAG_U_CONTENT)<< std::endl << std::endl;
1535  }
1536 #endif
1537 
1538  bcastUTree->ForwardMessage( (char*)&snode.SstrUrowRecv[0], snode.SizeSstrUrowRecv,
1539  IDX_TO_TAG(snode.Index,SELINV_TAG_U_CONTENT), &mpireqsSendToBelow[0] );
1540 #if ( _DEBUGlevel_ >= 1 )
1541  for( Int idxRecv = 0; idxRecv < bcastUTree->GetDestCount(); ++idxRecv ){
1542  Int iProcRow = bcastUTree->GetDest(idxRecv);
1543  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Forwarded U " << snode.SizeSstrUrowRecv << " BYTES to "<<iProcRow<<" on tag "<<IDX_TO_TAG(snode.Index,SELINV_TAG_U_CONTENT)<< std::endl << std::endl;
1544  }
1545 #endif
1546  msgForwarded++;
1547  }
1548  }
1549  }
1550  //If it's a L block
1551  else if(reqidx%2==1){
1552  TreeBcast * bcastLTree = fwdToRightTree_[snode.Index];
1553  if(bcastLTree != NULL){
1554  if(bcastLTree->GetDestCount()>0){
1555 
1556  std::vector<MPI_Request> & mpireqsSendToRight = arrMpireqsSendToRight[supidx];
1557 #if ( _DEBUGlevel_ >= 1 )
1558  for( Int idxRecv = 0; idxRecv < bcastLTree->GetDestCount(); ++idxRecv ){
1559  Int iProcCol = bcastLTree->GetDest(idxRecv);
1560  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Forwarding L " << snode.SizeSstrLcolRecv << " BYTES to "<<iProcCol<<" on tag "<<IDX_TO_TAG(snode.Index,SELINV_TAG_L_CONTENT)<< std::endl << std::endl;
1561  }
1562 #endif
1563 
1564  bcastLTree->ForwardMessage( (char*)&snode.SstrLcolRecv[0], snode.SizeSstrLcolRecv,
1565  IDX_TO_TAG(snode.Index,SELINV_TAG_L_CONTENT), &mpireqsSendToRight[0] );
1566 
1567  // for( Int idxRecv = 0; idxRecv < bcastLTree->GetDestCount(); ++idxRecv ){
1568  // Int iProcCol = bcastLTree->GetDest(idxRecv);
1569  // PROFILE_COMM(MYPROC(this->grid_),PNUM(MYROW(this->grid_),iProcCol,this->grid_),IDX_TO_TAG(snode.Index,SELINV_TAG_L_CONTENT),snode.SizeSstrLcolRecv);
1570  // }
1571 #if ( _DEBUGlevel_ >= 1 )
1572  for( Int idxRecv = 0; idxRecv < bcastLTree->GetDestCount(); ++idxRecv ){
1573  Int iProcCol = bcastLTree->GetDest(idxRecv);
1574  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Forwarded L " << snode.SizeSstrLcolRecv << " BYTES to "<<iProcCol<<" on tag "<<IDX_TO_TAG(snode.Index,SELINV_TAG_L_CONTENT)<< std::endl << std::endl;
1575  }
1576 #endif
1577  msgForwarded++;
1578  }
1579  }
1580  }
1581 
1582 
1583 #if ( _DEBUGlevel_ >= 1 )
1584  statusOFS<<std::endl<<"Received data for ["<<snode.Index<<"] reqidx%2="<<reqidx%2<<" is ready ?"<<snode.isReady<<std::endl;
1585 #endif
1586 
1587  if( isRecvFromAbove_( snode.Index ) && isRecvFromLeft_( snode.Index )){
1588  snode.isReady++;
1589 
1590  //if we received both L and U, the supernode is ready
1591  if(snode.isReady==2){
1592  readySupidx.push_back(supidx);
1593 
1594 #if defined(PROFILE)
1595  if(end_SendULWaitContentFirst==0){
1596  TIMER_STOP(WaitContent_UL_First);
1597  end_SendULWaitContentFirst=1;
1598  }
1599 #endif
1600  }
1601  }
1602  }
1603 
1604  }//end for waitsome
1605 
1606  TIMER_STOP(WaitContent_UL);
1607 
1608  } while( (gemmProcessed<gemmToDo && readySupidx.size()==0) || (gemmProcessed==gemmToDo && msgForwarded<msgToFwd) );
1609 
1610  //If I have some work to do
1611  if(readySupidx.size()>0)
1612  {
1613  supidx = readySupidx.back();
1614  readySupidx.pop_back();
1615  SuperNodeBufferType & snode = arrSuperNodes[supidx];
1616 
1617 
1618  // Only the processors that received information participate in the Gemm
1619  if( isRecvFromAbove_( snode.Index ) && isRecvFromLeft_( snode.Index ) ){
1620 
1621  std::vector<LBlock<T> > LcolRecv;
1622  std::vector<UBlock<T> > UrowRecv;
1623  // Save all the data to be updated for { L( isup, snode.Index ) | isup > snode.Index }.
1624  // The size will be updated in the Gemm phase and the reduce phase
1625 
1626  UnpackData(snode, LcolRecv, UrowRecv);
1627 
1628  //NumMat<T> AinvBuf, UBuf;
1629  SelInv_lookup_indexes(snode,LcolRecv, UrowRecv,AinvBuf,UBuf);
1630 
1631  snode.LUpdateBuf.Resize( AinvBuf.m(), SuperSize( snode.Index, super_ ) );
1632 
1633 #ifdef GEMM_PROFILE
1634  gemm_stat.push_back(AinvBuf.m());
1635  gemm_stat.push_back(UBuf.m());
1636  gemm_stat.push_back(AinvBuf.n());
1637 #endif
1638 
1639  TIMER_START(Compute_Sinv_LT_GEMM);
1640  blas::Gemm( 'N', 'T', AinvBuf.m(), UBuf.m(), AinvBuf.n(), MINUS_ONE<T>(),
1641  AinvBuf.Data(), AinvBuf.m(),
1642  UBuf.Data(), UBuf.m(), ZERO<T>(),
1643  snode.LUpdateBuf.Data(), snode.LUpdateBuf.m() );
1644  TIMER_STOP(Compute_Sinv_LT_GEMM);
1645 
1646 
1647 #if ( _DEBUGlevel_ >= 2 )
1648  statusOFS << std::endl << "["<<snode.Index<<"] "<< "snode.LUpdateBuf: " << snode.LUpdateBuf << std::endl;
1649 #endif
1650  } // if Gemm is to be done locally
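  // In scalar terms the GEMM above computes
  //   snode.LUpdateBuf(i,j) = - sum_k AinvBuf(i,k) * UBuf(j,k),
  // i.e. LUpdateBuf = -AinvBuf * UBuf^T, where AinvBuf and UBuf are the
  // packed Ainv and U panels assembled by SelInv_lookup_indexes. A naive
  // reference version, for illustration only (assumes LUpdateBuf is zeroed
  // first; the actual call relies on beta = 0 instead):
  //
  //   for( Int i = 0; i < AinvBuf.m(); i++ )
  //     for( Int j = 0; j < UBuf.m(); j++ )
  //       for( Int k = 0; k < AinvBuf.n(); k++ )
  //         snode.LUpdateBuf(i,j) -= AinvBuf(i,k) * UBuf(j,k);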
1651 
1652  //Get the reduction tree
1653  TreeReduce<T> * redLTree = redToLeftTree_[snode.Index];
1654  if(redLTree != NULL){
1655  TIMER_START(Reduce_Sinv_LT_Isend);
1656  //send the data
1657  redLTree->SetLocalBuffer(snode.LUpdateBuf.Data());
1658  redLTree->SetDataReady(true);
1659  bool done = redLTree->Progress();
1660  TIMER_STOP(Reduce_Sinv_LT_Isend);
1661  }
1662  gemmProcessed++;
1663 
1664 #if ( _DEBUGlevel_ >= 1 )
1665  statusOFS<<std::endl<<"gemmProcessed ="<<gemmProcessed<<"/"<<gemmToDo<<std::endl;
1666 #endif
1667 
1668  //advance reductions
1669  for (Int supidx=0; supidx<stepSuper; supidx++){
1670  SuperNodeBufferType & snode = arrSuperNodes[supidx];
1671  TreeReduce<T> * redLTree = redToLeftTree_[snode.Index];
1672  if(redLTree != NULL && !redLdone[supidx]){
1673  bool done = redLTree->Progress();
1674  }
1675  }
1676  }
1677  }
1678 
1679  }
1680  TIMER_STOP(Compute_Sinv_LT);
1681 
1682  //Reduce Sinv L^T to the processors in PCOL(ksup,grid_)
1683 
1684 
1685  TIMER_START(Reduce_Sinv_LT);
1686  //blocking wait for the reduction
1687  bool all_done = false;
1688  while(!all_done)
1689  {
1690  all_done = true;
1691 
1692  for (Int supidx=0; supidx<stepSuper; supidx++){
1693  SuperNodeBufferType & snode = arrSuperNodes[supidx];
1694 
1695  TreeReduce<T> * redLTree = redToLeftTree_[snode.Index];
1696 
1697  if(redLTree != NULL && !redLdone[supidx]){
1698  bool done = redLTree->Progress();
1699  if(done){
1700  if( MYCOL( grid_ ) == PCOL( snode.Index, grid_ ) ){
1701  //determine the number of rows in LUpdateBufReduced
1702  Int numRowLUpdateBuf;
1703  std::vector<LBlock<T> >& Lcol = this->L( LBj( snode.Index, grid_ ) );
1704  if( MYROW( grid_ ) != PROW( snode.Index, grid_ ) ){
1705  snode.RowLocalPtr.resize( Lcol.size() + 1 );
1706  snode.BlockIdxLocal.resize( Lcol.size() );
1707  snode.RowLocalPtr[0] = 0;
1708  for( Int ib = 0; ib < Lcol.size(); ib++ ){
1709  snode.RowLocalPtr[ib+1] = snode.RowLocalPtr[ib] + Lcol[ib].numRow;
1710  snode.BlockIdxLocal[ib] = Lcol[ib].blockIdx;
1711  }
1712  } // I do not own the diagonal block
1713  else{
1714  snode.RowLocalPtr.resize( Lcol.size() );
1715  snode.BlockIdxLocal.resize( Lcol.size() - 1 );
1716  snode.RowLocalPtr[0] = 0;
1717  for( Int ib = 1; ib < Lcol.size(); ib++ ){
1718  snode.RowLocalPtr[ib] = snode.RowLocalPtr[ib-1] + Lcol[ib].numRow;
1719  snode.BlockIdxLocal[ib-1] = Lcol[ib].blockIdx;
1720  }
1721  } // I own the diagonal block, skip the diagonal block
1722  numRowLUpdateBuf = *snode.RowLocalPtr.rbegin();
1723 
1724  if( numRowLUpdateBuf > 0 ){
1725  if( snode.LUpdateBuf.m() == 0 && snode.LUpdateBuf.n() == 0 ){
1726  snode.LUpdateBuf.Resize( numRowLUpdateBuf,SuperSize( snode.Index, super_ ) );
1727  }
1728  }
1729 
1730  //copy the buffer from the reduce tree
1731  redLTree->SetLocalBuffer(snode.LUpdateBuf.Data());
1732 
1733  }
1734  redLdone[supidx]=1;
1735  redLTree->CleanupBuffers();
1736  }
1737 
1738  all_done = all_done && (done || redLdone[supidx]);
1739  }
1740  }
1741  }
1742  TIMER_STOP(Reduce_Sinv_LT);
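  // The loop above polls Progress() on every active row-wise reduction tree
  // until all of them report completion. On the process column that owns
  // block column snode.Index (the root of each reduction), the reduced
  // update is copied back into snode.LUpdateBuf, which is sized from the
  // local L structure if the GEMM was not performed locally.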
1743 
1744 
1745 
1746 #ifndef _RELEASE_
1747  PopCallStack();
1748 #endif
1749  //--------------------- End of reduce of LUpdateBuf-------------------------
1750 #ifndef _RELEASE_
1751  PushCallStack("PMatrix::SelInv_P2p::UpdateD");
1752 #endif
1753 
1754  TIMER_START(Update_Diagonal);
1755 
1756  for (Int supidx=0; supidx<stepSuper; supidx++){
1757  SuperNodeBufferType & snode = arrSuperNodes[supidx];
1758 
1759  ComputeDiagUpdate(snode);
1760 
1761  //Get the reduction tree
1762  TreeReduce<T> * redDTree = redToAboveTree_[snode.Index];
1763 
1764  if(redDTree != NULL){
1765  //send the data
1766  if( MYROW( grid_ ) == PROW( snode.Index, grid_ ) ){
1767  if(snode.DiagBuf.Size()==0){
1768  snode.DiagBuf.Resize( SuperSize( snode.Index, super_ ), SuperSize( snode.Index, super_ ));
1769  SetValue(snode.DiagBuf, ZERO<T>());
1770  }
1771  }
1772 
1773 
1774  redDTree->SetLocalBuffer(snode.DiagBuf.Data());
1775  if(!redDTree->IsAllocated()){
1776  redDTree->SetTag(IDX_TO_TAG(snode.Index,SELINV_TAG_D_REDUCE));
1777  redDTree->AllocRecvBuffers();
1778  //Post All Recv requests;
1779  redDTree->PostFirstRecv();
1780  }
1781 
1782  redDTree->SetDataReady(true);
1783  bool done = redDTree->Progress();
1784  }
1785 
1786  //advance reductions
1787  for (Int supidx=0; supidx<stepSuper; supidx++){
1788  SuperNodeBufferType & snode = arrSuperNodes[supidx];
1789  TreeReduce<T> * redDTree = redToAboveTree_[snode.Index];
1790  if(redDTree != NULL){
1791  if(redDTree->IsAllocated()){
1792  bool done = redDTree->Progress();
1793  }
1794  }
1795  }
1796  }
1797  TIMER_STOP(Update_Diagonal);
1798 
1799  TIMER_START(Reduce_Diagonal);
1800  //blocking wait for the reduction
1801  {
1802  vector<char> is_done(stepSuper,0);
1803  bool all_done = false;
1804  while(!all_done)
1805  {
1806  all_done = true;
1807 
1808  for (Int supidx=0; supidx<stepSuper; supidx++){
1809  SuperNodeBufferType & snode = arrSuperNodes[supidx];
1810  TreeReduce<T> * redDTree = redToAboveTree_[snode.Index];
1811 
1812  if(redDTree != NULL){
1813  bool done = redDTree->Progress();
1814  if(done && !is_done[supidx]){
1815  if( MYCOL( grid_ ) == PCOL( snode.Index, grid_ ) ){
1816  if( MYROW( grid_ ) == PROW( snode.Index, grid_ ) ){
1817  LBlock<T> & LB = this->L( LBj( snode.Index, grid_ ) )[0];
1818  // Symmetrize LB
1819  blas::Axpy( LB.numRow * LB.numCol, ONE<T>(), snode.DiagBuf.Data(), 1, LB.nzval.Data(), 1 );
1820  Symmetrize( LB.nzval );
1821  }
1822  }
1823 
1824  is_done[supidx]=1;
1825  redDTree->CleanupBuffers();
1826  }
1827 
1828  all_done = all_done && (done || is_done[supidx]);
1829  }
1830  }
1831  }
1832  }
1833  TIMER_STOP(Reduce_Diagonal);
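  // At the owner of the diagonal block the reduced contribution DiagBuf is
  // added onto the diagonal block and the result is symmetrized. Roughly, in
  // block form and assuming PreSelInv has already replaced the diagonal
  // block by [L(k,k) U(k,k)]^{-1}:
  //   Ainv(k,k) <- [L(k,k) U(k,k)]^{-1} + DiagBuf, followed by Symmetrize.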
1834 
1835 #ifndef _RELEASE_
1836  PopCallStack();
1837 #endif
1838 
1839 
1840 #ifndef _RELEASE_
1841  PushCallStack("PMatrix::SelInv_P2p::UpdateU");
1842 #endif
1843 
1844 
1845 
1846  SendRecvCD_UpdateU(arrSuperNodes, stepSuper);
1847 
1848 #ifndef _RELEASE_
1849  PopCallStack();
1850 #endif
1851 
1852 #ifndef _RELEASE_
1853  PushCallStack("PMatrix::SelInv_P2p::UpdateLFinal");
1854 #endif
1855 
1856  TIMER_START(Update_L);
1857  for (Int supidx=0; supidx<stepSuper; supidx++){
1858  SuperNodeBufferType & snode = arrSuperNodes[supidx];
1859 
1860 #if ( _DEBUGlevel_ >= 1 )
1861  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Finish updating the L part by filling LUpdateBufReduced back to L" << std::endl << std::endl;
1862 #endif
1863 
1864  if( MYCOL( grid_ ) == PCOL( snode.Index, grid_ ) && snode.LUpdateBuf.m() > 0 ){
1865  std::vector<LBlock<T> >& Lcol = this->L( LBj( snode.Index, grid_ ) );
1866  //Need to skip the diagonal block if present
1867  Int startBlock = (MYROW( grid_ ) == PROW( snode.Index, grid_ ))?1:0;
1868  for( Int ib = startBlock; ib < Lcol.size(); ib++ ){
1869  LBlock<T> & LB = Lcol[ib];
1870  lapack::Lacpy( 'A', LB.numRow, LB.numCol, &snode.LUpdateBuf( snode.RowLocalPtr[ib-startBlock], 0 ),
1871  snode.LUpdateBuf.m(), LB.nzval.Data(), LB.numRow );
1872  }
1873  } // Finish updating L
1874  } // for (snode.Index) : Main loop
1875  TIMER_STOP(Update_L);
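  // The Lacpy calls above scatter the dense update buffer back into the
  // individual L blocks: rows [ RowLocalPtr[ib-startBlock],
  // RowLocalPtr[ib-startBlock+1] ) of snode.LUpdateBuf overwrite
  // Lcol[ib].nzval, with startBlock skipping the diagonal block when this
  // process row owns it.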
1876 
1877 #ifndef _RELEASE_
1878  PopCallStack();
1879 #endif
1880 
1881 
1882  TIMER_START(Barrier);
1883 
1884 
1885  for (Int supidx=0; supidx<stepSuper; supidx++){
1886  SuperNodeBufferType & snode = arrSuperNodes[supidx];
1887  TreeReduce<T> * redLTree = redToLeftTree_[snode.Index];
1888 
1889  if(redLTree != NULL){
1890  redLTree->Wait();
1891  redLTree->CleanupBuffers();
1892  }
1893  }
1894 
1895 
1896  for (Int supidx=0; supidx<stepSuper; supidx++){
1897  SuperNodeBufferType & snode = arrSuperNodes[supidx];
1898  TreeReduce<T> * redDTree = redToAboveTree_[snode.Index];
1899 
1900  if(redDTree != NULL){
1901  redDTree->Wait();
1902  redDTree->CleanupBuffers();
1903  }
1904  }
1905 
1906  mpi::Waitall(arrMpireqsRecvContentFromAny);
1907 
1908  for (Int supidx=0; supidx<stepSuper; supidx++){
1909  Int ksup = superList[lidx][supidx];
1910  std::vector<MPI_Request> & mpireqsSendToRight = arrMpireqsSendToRight[supidx];
1911  std::vector<MPI_Request> & mpireqsSendToBelow = arrMpireqsSendToBelow[supidx];
1912 
1913 #if ( _DEBUGlevel_ >= 1 )
1914  statusOFS<<"["<<ksup<<"] mpireqsSendToRight"<<std::endl;
1915 #endif
1916  mpi::Waitall( mpireqsSendToRight );
1917 #if ( _DEBUGlevel_ >= 1 )
1918  statusOFS<<"["<<ksup<<"] mpireqsSendToBelow"<<std::endl;
1919 #endif
1920  mpi::Waitall( mpireqsSendToBelow );
1921 
1922  }
1923 
1924 #if ( _DEBUGlevel_ >= 1 )
1925  statusOFS<<"barrier done"<<std::endl;
1926 #endif
1927  TIMER_STOP(Barrier);
1928 
1929 
1930 #ifdef LIST_BARRIER
1931 #ifndef ALL_BARRIER
1932  if (options_->maxPipelineDepth!=-1)
1933 #endif
1934  {
1935  MPI_Barrier(grid_->comm);
1936  }
1937 #endif
1938 
1939  }
1940 
1941  template<typename T>
1942  void PMatrix<T>::ConstructCommunicationPattern ( )
1943  {
1944  ConstructCommunicationPattern_P2p();
1945  } // ----- end of method PMatrix::ConstructCommunicationPattern -----
1946 
1947 
1948 
1949  template<typename T>
1950  void PMatrix<T>::ConstructCommunicationPattern_P2p ( )
1951  {
1952 #ifndef _RELEASE_
1953  PushCallStack("PMatrix::ConstructCommunicationPattern_P2p");
1954 #endif
1955 
1956 
1957  Int numSuper = this->NumSuper();
1958 
1959  TIMER_START(Allocate);
1960 
1961 #ifndef _RELEASE_
1962  PushCallStack( "Initialize the communication pattern" );
1963 #endif
1964 
1965 
1966  fwdToBelowTree_.resize(numSuper, NULL );
1967  fwdToRightTree_.resize(numSuper, NULL );
1968  redToLeftTree_.resize(numSuper, NULL );
1969  redToAboveTree_.resize(numSuper, NULL );
1970 
1971  isSendToBelow_.Resize(grid_->numProcRow, numSuper);
1972  isSendToRight_.Resize(grid_->numProcCol, numSuper);
1973  isSendToDiagonal_.Resize( numSuper );
1974  SetValue( isSendToBelow_, false );
1975  SetValue( isSendToRight_, false );
1976  SetValue( isSendToDiagonal_, false );
1977 
1978  isSendToCrossDiagonal_.Resize(grid_->numProcCol+1, numSuper );
1979  SetValue( isSendToCrossDiagonal_, false );
1980  isRecvFromCrossDiagonal_.Resize(grid_->numProcRow+1, numSuper );
1981  SetValue( isRecvFromCrossDiagonal_, false );
1982 
1983  isRecvFromAbove_.Resize( numSuper );
1984  isRecvFromLeft_.Resize( numSuper );
1985  isRecvFromBelow_.Resize( grid_->numProcRow, numSuper );
1986  SetValue( isRecvFromAbove_, false );
1987  SetValue( isRecvFromBelow_, false );
1988  SetValue( isRecvFromLeft_, false );
1989 #ifndef _RELEASE_
1990  PopCallStack();
1991 #endif
1992 
1993  TIMER_STOP(Allocate);
1994 
1995  TIMER_START(GetEtree);
1996  std::vector<Int> snodeEtree(this->NumSuper());
1997  GetEtree(snodeEtree);
1998  TIMER_STOP(GetEtree);
1999 
2000 
2001 
2002 #ifndef _RELEASE_
2003  PushCallStack( "Local column communication" );
2004 #endif
2005 #if ( _DEBUGlevel_ >= 1 )
2006  statusOFS << std::endl << "Local column communication" << std::endl;
2007 #endif
2008  // localColBlockRowIdx stores the nonzero block indices for each local block column.
2009  // The nonzero block indices include contributions from both L and U.
2010  // Dimension: numLocalBlockCol x numNonzeroBlock
2011  std::vector<std::set<Int> > localColBlockRowIdx;
2012 
2013  localColBlockRowIdx.resize( this->NumLocalBlockCol() );
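  // A small hypothetical example of the resulting structure: on a 2 x 2
  // process grid, if this process contributes block rows {4, 7, 9} for the
  // block column of supernode ksup = 4 and the other process in the column
  // contributes {1, 5}, then after the Allgatherv below
  //   localColBlockRowIdx[ LBj(4, grid_) ] == { 1, 4, 5, 7, 9 }.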
2014 
2015 
2016  TIMER_START(Column_communication);
2017 
2018 
2019  for( Int ksup = 0; ksup < numSuper; ksup++ ){
2020  // All block columns perform independently
2021  if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
2022 
2023 
2024  // Communication
2025  std::vector<Int> tAllBlockRowIdx;
2026  std::vector<Int> & colBlockIdx = ColBlockIdx(LBj(ksup, grid_));
2027  TIMER_START(Allgatherv_Column_communication);
2028  if( grid_ -> mpisize != 1 )
2029  mpi::Allgatherv( colBlockIdx, tAllBlockRowIdx, grid_->colComm );
2030  else
2031  tAllBlockRowIdx = colBlockIdx;
2032 
2033  TIMER_STOP(Allgatherv_Column_communication);
2034 
2035  localColBlockRowIdx[LBj( ksup, grid_ )].insert(
2036  tAllBlockRowIdx.begin(), tAllBlockRowIdx.end() );
2037 
2038 #if ( _DEBUGlevel_ >= 1 )
2039  statusOFS
2040  << " Column block " << ksup
2041  << " has the following nonzero block rows" << std::endl;
2042  for( std::set<Int>::iterator si = localColBlockRowIdx[LBj( ksup, grid_ )].begin();
2043  si != localColBlockRowIdx[LBj( ksup, grid_ )].end();
2044  si++ ){
2045  statusOFS << *si << " ";
2046  }
2047  statusOFS << std::endl;
2048 #endif
2049 
2050  } // if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) )
2051  } // for(ksup)
2052 
2053 
2054 #ifndef _RELEASE_
2055  PopCallStack();
2056 #endif
2057 
2058  TIMER_STOP(Column_communication);
2059 
2060  TIMER_START(Row_communication);
2061 #ifndef _RELEASE_
2062  PushCallStack( "Local row communication" );
2063 #endif
2064 #if ( _DEBUGlevel_ >= 1 )
2065  statusOFS << std::endl << "Local row communication" << std::endl;
2066 #endif
2067  // localRowBlockColIdx stores the nonzero block indices for each local block row.
2068  // The nonzero block indices include contributions from both L and U.
2069  // Dimension: numLocalBlockRow x numNonzeroBlock
2070  std::vector<std::set<Int> > localRowBlockColIdx;
2071 
2072  localRowBlockColIdx.resize( this->NumLocalBlockRow() );
2073 
2074  for( Int ksup = 0; ksup < numSuper; ksup++ ){
2075  // All block rows perform independently
2076  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
2077 
2078  // Communication
2079  std::vector<Int> tAllBlockColIdx;
2080  std::vector<Int> & rowBlockIdx = RowBlockIdx(LBi(ksup, grid_));
2081  TIMER_START(Allgatherv_Row_communication);
2082  if( grid_ -> mpisize != 1 )
2083  mpi::Allgatherv( rowBlockIdx, tAllBlockColIdx, grid_->rowComm );
2084  else
2085  tAllBlockColIdx = rowBlockIdx;
2086 
2087  TIMER_STOP(Allgatherv_Row_communication);
2088 
2089  localRowBlockColIdx[LBi( ksup, grid_ )].insert(
2090  tAllBlockColIdx.begin(), tAllBlockColIdx.end() );
2091 
2092 #if ( _DEBUGlevel_ >= 1 )
2093  statusOFS
2094  << " Row block " << ksup
2095  << " has the following nonzero block columns" << std::endl;
2096  for( std::set<Int>::iterator si = localRowBlockColIdx[LBi( ksup, grid_ )].begin();
2097  si != localRowBlockColIdx[LBi( ksup, grid_ )].end();
2098  si++ ){
2099  statusOFS << *si << " ";
2100  }
2101  statusOFS << std::endl;
2102 #endif
2103 
2104  } // if( MYROW( grid_ ) == PROW( ksup, grid_ ) )
2105  } // for(ksup)
2106 
2107 #ifndef _RELEASE_
2108  PopCallStack();
2109 #endif
2110 
2111  TIMER_STOP(Row_communication);
2112 
2113  TIMER_START(STB_RFA);
2114 #ifndef _RELEASE_
2115  PushCallStack("SendToBelow / RecvFromAbove");
2116 #endif
2117  for( Int ksup = 0; ksup < numSuper - 1; ksup++ ){
2118  // Loop over all the supernodes to the right of ksup
2119  Int jsup = snodeEtree[ksup];
2120  while(jsup<numSuper){
2121  Int jsupLocalBlockCol = LBj( jsup, grid_ );
2122  Int jsupProcCol = PCOL( jsup, grid_ );
2123  if( MYCOL( grid_ ) == jsupProcCol ){
2124  // SendToBelow / RecvFromAbove only if (ksup, jsup) is nonzero.
2125  if( localColBlockRowIdx[jsupLocalBlockCol].count( ksup ) > 0 ) {
2126  for( std::set<Int>::iterator si = localColBlockRowIdx[jsupLocalBlockCol].begin();
2127  si != localColBlockRowIdx[jsupLocalBlockCol].end(); si++ ){
2128  Int isup = *si;
2129  Int isupProcRow = PROW( isup, grid_ );
2130  if( isup > ksup ){
2131  if( MYROW( grid_ ) == isupProcRow ){
2132  isRecvFromAbove_(ksup) = true;
2133  }
2134  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
2135  isSendToBelow_( isupProcRow, ksup ) = true;
2136  }
2137  } // if( isup > ksup )
2138  } // for (si)
2139  } // if( localColBlockRowIdx[jsupLocalBlockCol].count( ksup ) > 0 )
2140  } // if( MYCOL( grid_ ) == PCOL( jsup, grid_ ) )
2141  jsup = snodeEtree[jsup];
2142  } // for(jsup)
2143  } // for(ksup)
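  // The inner while loop walks up the supernodal elimination tree: jsup =
  // snodeEtree[ksup] is the parent of ksup, snodeEtree[jsup] its grandparent,
  // and so on, so only ancestor block columns of ksup are visited. Whenever
  // such an ancestor column also contains block row ksup, the owners of its
  // nonzero blocks below ksup must receive data from the process row of
  // ksup, which is exactly what isRecvFromAbove_ and isSendToBelow_ record.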
2144 #ifndef _RELEASE_
2145  PopCallStack();
2146 #endif
2147  TIMER_STOP(STB_RFA);
2148 
2149  TIMER_START(STR_RFL_RFB);
2150 #ifndef _RELEASE_
2151  PushCallStack("SendToRight / RecvFromLeft");
2152 #endif
2153  for( Int ksup = 0; ksup < numSuper - 1; ksup++ ){
2154  // Loop over all the supernodes below ksup
2155  Int isup = snodeEtree[ksup];
2156  while(isup<numSuper){
2157  Int isupLocalBlockRow = LBi( isup, grid_ );
2158  Int isupProcRow = PROW( isup, grid_ );
2159  if( MYROW( grid_ ) == isupProcRow ){
2160  // SendToRight / RecvFromLeft only if (isup, ksup) is nonzero.
2161  if( localRowBlockColIdx[isupLocalBlockRow].count( ksup ) > 0 ){
2162  for( std::set<Int>::iterator si = localRowBlockColIdx[isupLocalBlockRow].begin();
2163  si != localRowBlockColIdx[isupLocalBlockRow].end(); si++ ){
2164  Int jsup = *si;
2165  Int jsupProcCol = PCOL( jsup, grid_ );
2166  if( jsup > ksup ){
2167  if( MYCOL( grid_ ) == jsupProcCol ){
2168  isRecvFromLeft_(ksup) = true;
2169  }
2170  if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
2171  isSendToRight_( jsupProcCol, ksup ) = true;
2172  }
2173  }
2174  } // for (si)
2175  } // if( localRowBlockColIdx[isupLocalBlockRow].count( ksup ) > 0 )
2176  } // if( MYROW( grid_ ) == isupProcRow )
2177 
2178  if( MYCOL( grid_ ) == PCOL(ksup, grid_) ){
2179  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
2180  isRecvFromBelow_(isupProcRow,ksup) = true;
2181  }
2182  else if (MYROW(grid_) == isupProcRow){
2183  isSendToDiagonal_(ksup)=true;
2184  }
2185  } // if( MYCOL( grid_ ) == PCOL(ksup, grid_) )
2186  isup = snodeEtree[isup];
2187  } // for (isup)
2188  } // for (ksup)
2189 
2190 #ifndef _RELEASE_
2191  PopCallStack();
2192 #endif
2193  TIMER_STOP(STR_RFL_RFB);
2194 
2195 
2196  TIMER_START(BUILD_BCAST_TREES);
2197  //Allgather RFL values within each processor row
2198 
2199  vector<double> SeedRFL(numSuper,0.0);
2200  vector<Int> aggRFL(numSuper);
2201  vector<Int> globalAggRFL(numSuper*grid_->numProcCol);
2202  for( Int ksup = 0; ksup < numSuper ; ksup++ ){
2203  if(MYCOL(grid_)==PCOL(ksup,grid_)){
2204  std::vector<LBlock<T> >& Lcol = this->L( LBj(ksup, grid_) );
2205  // All blocks except for the diagonal block are to be sent right
2206 
2207  Int totalSize = 0;
2208  //one integer holding the number of Lblocks
2209  totalSize+=sizeof(Int);
2210  for( Int ib = 0; ib < Lcol.size(); ib++ ){
2211  if( Lcol[ib].blockIdx > ksup ){
2212  //three indices + one IntNumVec
2213  totalSize+=3*sizeof(Int);
2214  totalSize+= sizeof(Int)+Lcol[ib].rows.ByteSize();
2215  }
2216  }
2217 
2218 
2219  aggRFL[ksup]=totalSize;
2220  SeedRFL[ksup]=rand();
2221 
2222  }
2223  else if(isRecvFromLeft_(ksup)){
2224  aggRFL[ksup]=1;
2225  }
2226  else{
2227  aggRFL[ksup]=0;
2228  }
2229  }
2230  //allgather
2231  MPI_Allgather(&aggRFL[0],numSuper*sizeof(Int),MPI_BYTE,
2232  &globalAggRFL[0],numSuper*sizeof(Int),MPI_BYTE,
2233  grid_->rowComm);
2234  MPI_Allreduce(MPI_IN_PLACE,&SeedRFL[0],numSuper,MPI_DOUBLE,MPI_MAX,grid_->rowComm);
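  // Encoding used here: aggRFL[ksup] holds the byte size of the packed L
  // panel if this process column owns ksup, the sentinel value 1 if this
  // process merely receives that panel (isRecvFromLeft_), and 0 otherwise.
  // After the row-wise Allgather every process in the row can deduce both
  // the participants of the broadcast tree for ksup and the message size at
  // its root, while the MAX-Allreduce makes the tree seed SeedRFL[ksup]
  // identical on all of them (only the owner generated a nonzero seed).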
2235 
2236  vector<double> SeedRFA(numSuper,0.0);
2237  vector<Int> aggRFA(numSuper);
2238  vector<Int> globalAggRFA(numSuper*grid_->numProcRow);
2239  for( Int ksup = 0; ksup < numSuper ; ksup++ ){
2240  if(MYROW(grid_)==PROW(ksup,grid_)){
2241  std::vector<UBlock<T> >& Urow = this->U( LBi(ksup, grid_) );
2242  // All U blocks in this block row are to be sent down
2243 
2244  Int totalSize = 0;
2245  //one integer holding the number of Ublocks
2246  totalSize+=sizeof(Int);
2247  for( Int jb = 0; jb < Urow.size(); jb++ ){
2248  if( Urow[jb].blockIdx >= ksup ){
2249  //three indices + one IntNumVec + one NumMat<T>
2250  totalSize+=3*sizeof(Int);
2251  totalSize+= sizeof(Int)+Urow[jb].cols.ByteSize();
2252  totalSize+= 2*sizeof(Int)+Urow[jb].nzval.ByteSize();
2253  }
2254  }
2255  aggRFA[ksup]=totalSize;
2256  SeedRFA[ksup]=rand();
2257  }
2258  else if(isRecvFromAbove_(ksup)){
2259  aggRFA[ksup]=1;
2260  }
2261  else{
2262  aggRFA[ksup]=0;
2263  }
2264  }
2265 
2266 
2267  //allgather
2268  MPI_Allgather(&aggRFA[0],numSuper*sizeof(Int),MPI_BYTE,
2269  &globalAggRFA[0],numSuper*sizeof(Int),MPI_BYTE,
2270  grid_->colComm);
2271  MPI_Allreduce(MPI_IN_PLACE,&SeedRFA[0],numSuper,MPI_DOUBLE,MPI_MAX,grid_->colComm);
2272 
2273  for( Int ksup = 0; ksup < numSuper; ksup++ ){
2274  set<Int> set_ranks;
2275  Int msgSize = 0;
2276  for( Int iProcRow = 0; iProcRow < grid_->numProcRow; iProcRow++ ){
2277  Int isRFA = globalAggRFA[iProcRow*numSuper + ksup];
2278  if(isRFA>0){
2279  if( iProcRow != PROW( ksup, grid_ ) ){
2280  set_ranks.insert(iProcRow);
2281  }
2282  else{
2283  msgSize = isRFA;
2284  }
2285  }
2286  }
2287 
2288  if( isRecvFromAbove_(ksup) || CountSendToBelow(ksup)>0 ){
2289  vector<Int> tree_ranks;
2290  tree_ranks.push_back(PROW(ksup,grid_));
2291  tree_ranks.insert(tree_ranks.end(),set_ranks.begin(),set_ranks.end());
2292  TreeBcast * & BcastUTree = fwdToBelowTree_[ksup];
2293  BcastUTree = TreeBcast::Create(this->grid_->colComm,&tree_ranks[0],tree_ranks.size(),msgSize,SeedRFA[ksup]);
2294 #ifdef COMM_PROFILE
2295  BcastUTree->SetGlobalComm(grid_->comm);
2296 #endif
2297  }
2298 
2299  set_ranks.clear();
2300  for( Int iProcCol = 0; iProcCol < grid_->numProcCol; iProcCol++ ){
2301  Int isRFL = globalAggRFL[iProcCol*numSuper + ksup];
2302  if(isRFL>0){
2303  if( iProcCol != PCOL( ksup, grid_ ) ){
2304  set_ranks.insert(iProcCol);
2305  }
2306  else{
2307  msgSize = isRFL;
2308  }
2309  }
2310  }
2311 
2312  if( isRecvFromLeft_(ksup) || CountSendToRight(ksup)>0 ){
2313  vector<Int> tree_ranks;
2314  tree_ranks.push_back(PCOL(ksup,grid_));
2315  tree_ranks.insert(tree_ranks.end(),set_ranks.begin(),set_ranks.end());
2316 
2317  TreeBcast * & BcastLTree = fwdToRightTree_[ksup];
2318  BcastLTree = TreeBcast::Create(this->grid_->rowComm,&tree_ranks[0],tree_ranks.size(),msgSize,SeedRFL[ksup]);
2319 #ifdef COMM_PROFILE
2320  BcastLTree->SetGlobalComm(grid_->comm);
2321 #endif
2322  }
2323  }
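  // Each broadcast tree is created with the root listed first in tree_ranks
  // (the process row/column owning ksup), followed by the other
  // participants, the message size at the root, and the shared seed, so
  // that, presumably, every participant can rebuild the same tree shape
  // without any further communication.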
2324 
2325  //do the same for the other arrays
2326  TIMER_STOP(BUILD_BCAST_TREES);
2327  TIMER_START(BUILD_REDUCE_D_TREE);
2328  vector<double> SeedSTD(numSuper,0.0);
2329  vector<Int> aggSTD(numSuper);
2330  vector<Int> globalAggSTD(numSuper*grid_->numProcRow);
2331  for( Int ksup = 0; ksup < numSuper; ksup++ ){
2332  if( MYCOL( grid_ ) == PCOL(ksup, grid_) && MYROW(grid_)==PROW(ksup,grid_)){
2333  Int totalSize = sizeof(T)*SuperSize( ksup, super_ )*SuperSize( ksup, super_ );
2334  aggSTD[ksup]=totalSize;
2335  SeedSTD[ksup]=rand();
2336  }
2337  else if(isSendToDiagonal_(ksup)){
2338  aggSTD[ksup]=1;
2339  }
2340  else{
2341  aggSTD[ksup]=0;
2342  }
2343  }
2344 
2345 
2346  //allgather
2347  MPI_Allgather(&aggSTD[0],numSuper*sizeof(Int),MPI_BYTE,
2348  &globalAggSTD[0],numSuper*sizeof(Int),MPI_BYTE,
2349  grid_->colComm);
2350  MPI_Allreduce(MPI_IN_PLACE,&SeedSTD[0],numSuper,MPI_DOUBLE,MPI_MAX,grid_->colComm);
2351 
2352 
2353  for( Int ksup = 0; ksup < numSuper; ksup++ ){
2354  if( MYCOL( grid_ ) == PCOL(ksup, grid_) ){
2355  set<Int> set_ranks;
2356  Int msgSize = 0;
2357  for( Int iProcRow = 0; iProcRow < grid_->numProcRow; iProcRow++ ){
2358  Int isSTD = globalAggSTD[iProcRow*numSuper + ksup];
2359  if(isSTD>0){
2360  if( iProcRow != PROW( ksup, grid_ ) ){
2361  set_ranks.insert(iProcRow);
2362  }
2363  else{
2364  msgSize = isSTD;
2365  }
2366  }
2367  }
2368 
2369  Int amISTD = globalAggSTD[MYROW(grid_)*numSuper + ksup];
2370 
2371  // if( MYCOL( grid_ ) == PCOL(ksup, grid_) && MYROW(grid_)==PROW(ksup,grid_)){
2372  // assert(amISTD>0);
2373  // }
2374 
2375  if( amISTD ){
2376  vector<Int> tree_ranks;
2377  tree_ranks.push_back(PROW(ksup,grid_));
2378  tree_ranks.insert(tree_ranks.end(),set_ranks.begin(),set_ranks.end());
2379 
2380  //assert(set_ranks.find(MYROW(grid_))!= set_ranks.end() || MYROW(grid_)==tree_ranks[0]);
2381 
2382  TreeReduce<T> * & redDTree = redToAboveTree_[ksup];
2383 
2384 
2385  redDTree = TreeReduce<T>::Create(this->grid_->colComm,&tree_ranks[0],tree_ranks.size(),msgSize,SeedSTD[ksup]);
2386 #ifdef COMM_PROFILE
2387  redDTree->SetGlobalComm(grid_->comm);
2388 #endif
2389  }
2390  }
2391  }
2392 
2393  TIMER_STOP(BUILD_REDUCE_D_TREE);
2394 
2395 
2396 
2397  TIMER_START(BUILD_REDUCE_L_TREE);
2398  vector<double> SeedRTL(numSuper,0.0);
2399  vector<Int> aggRTL(numSuper);
2400  vector<Int> globalAggRTL(numSuper*grid_->numProcCol);
2401  for( Int ksup = 0; ksup < numSuper ; ksup++ ){
2402  if(MYCOL(grid_)==PCOL(ksup,grid_)){
2403  std::vector<LBlock<T> >& Lcol = this->L( LBj(ksup, grid_) );
2404  // Size of the L update buffer (LUpdateBuf) to be reduced to the owner process column
2405 
2406  Int totalSize = 0;
2407 
2408  //determine the number of rows in LUpdateBufReduced
2409  Int numRowLUpdateBuf = 0;
2410  if( MYROW( grid_ ) != PROW( ksup, grid_ ) ){
2411  for( Int ib = 0; ib < Lcol.size(); ib++ ){
2412  numRowLUpdateBuf += Lcol[ib].numRow;
2413  }
2414  } // I do not own the diagonal block
2415  else{
2416  for( Int ib = 1; ib < Lcol.size(); ib++ ){
2417  numRowLUpdateBuf += Lcol[ib].numRow;
2418  }
2419  } // I own the diagonal block, skip the diagonal block
2420 
2421  //if(ksup==297){gdb_lock();}
2422  totalSize = numRowLUpdateBuf*SuperSize( ksup, super_ )*sizeof(T);
2423 
2424  aggRTL[ksup]=totalSize;
2425 
2426  SeedRTL[ksup]=rand();
2427  }
2428  else if(isRecvFromLeft_(ksup)){
2429  aggRTL[ksup]=1;
2430  }
2431  else{
2432  aggRTL[ksup]=0;
2433  }
2434  }
2435  //allgather
2436  MPI_Allgather(&aggRTL[0],numSuper*sizeof(Int),MPI_BYTE,
2437  &globalAggRTL[0],numSuper*sizeof(Int),MPI_BYTE,
2438  grid_->rowComm);
2439  MPI_Allreduce(MPI_IN_PLACE,&SeedRTL[0],numSuper,MPI_DOUBLE,MPI_MAX,grid_->rowComm);
2440 
2441 
2442  for( Int ksup = 0; ksup < numSuper ; ksup++ ){
2443  set<Int> set_ranks;
2444  Int msgSize = 0;
2445  for( Int iProcCol = 0; iProcCol < grid_->numProcCol; iProcCol++ ){
2446  Int isRTL = globalAggRTL[iProcCol*numSuper + ksup];
2447  if(isRTL>0){
2448  if( iProcCol != PCOL( ksup, grid_ ) ){
2449  set_ranks.insert(iProcCol);
2450  }
2451  else{
2452  msgSize = isRTL;
2453  }
2454  }
2455  }
2456 
2457  if( isRecvFromLeft_(ksup) || CountSendToRight(ksup)>0 ){
2458  vector<Int> tree_ranks;
2459  tree_ranks.push_back(PCOL(ksup,grid_));
2460  tree_ranks.insert(tree_ranks.end(),set_ranks.begin(),set_ranks.end());
2461 
2462  TreeReduce<T> * & redLTree = redToLeftTree_[ksup];
2463  redLTree = TreeReduce<T>::Create(this->grid_->rowComm,&tree_ranks[0],tree_ranks.size(),msgSize,SeedRTL[ksup]);
2464 #ifdef COMM_PROFILE
2465  redLTree->SetGlobalComm(grid_->comm);
2466 #endif
2467  }
2468  }
2469 
2470  TIMER_STOP(BUILD_REDUCE_L_TREE);
2471 
2472 
2473 
2474 
2475 
2476 
2477 
2478 #if ( _DEBUGlevel_ >= 1 )
2479  statusOFS << std::endl << "isSendToBelow:" << std::endl;
2480  for(int j = 0;j< isSendToBelow_.n();j++){
2481  statusOFS << "["<<j<<"] ";
2482  for(int i =0; i < isSendToBelow_.m();i++){
2483  statusOFS<< isSendToBelow_(i,j) << " ";
2484  }
2485  statusOFS<<std::endl;
2486  }
2487 
2488  statusOFS << std::endl << "isRecvFromAbove:" << std::endl;
2489  for(int j = 0;j< isRecvFromAbove_.m();j++){
2490  statusOFS << "["<<j<<"] "<< isRecvFromAbove_(j)<<std::endl;
2491  }
2492 #endif
2493 #if ( _DEBUGlevel_ >= 1 )
2494  statusOFS << std::endl << "isSendToRight:" << std::endl;
2495  for(int j = 0;j< isSendToRight_.n();j++){
2496  statusOFS << "["<<j<<"] ";
2497  for(int i =0; i < isSendToRight_.m();i++){
2498  statusOFS<< isSendToRight_(i,j) << " ";
2499  }
2500  statusOFS<<std::endl;
2501  }
2502 
2503  statusOFS << std::endl << "isRecvFromLeft:" << std::endl;
2504  for(int j = 0;j< isRecvFromLeft_.m();j++){
2505  statusOFS << "["<<j<<"] "<< isRecvFromLeft_(j)<<std::endl;
2506  }
2507 
2508  statusOFS << std::endl << "isRecvFromBelow:" << std::endl;
2509  for(int j = 0;j< isRecvFromBelow_.n();j++){
2510  statusOFS << "["<<j<<"] ";
2511  for(int i =0; i < isRecvFromBelow_.m();i++){
2512  statusOFS<< isRecvFromBelow_(i,j) << " ";
2513  }
2514  statusOFS<<std::endl;
2515  }
2516 #endif
2517 
2518 
2519 
2520 
2521 
2522 
2523 
2524  TIMER_START(STCD_RFCD);
2525 
2526 
2527 #ifndef _RELEASE_
2528  PushCallStack("SendToCrossDiagonal / RecvFromCrossDiagonal");
2529 #endif
2530  for( Int ksup = 0; ksup < numSuper - 1; ksup++ ){
2531  if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
2532  for( std::set<Int>::iterator si = localColBlockRowIdx[LBj( ksup, grid_ )].begin();
2533  si != localColBlockRowIdx[LBj( ksup, grid_ )].end(); si++ ){
2534  Int isup = *si;
2535  Int isupProcRow = PROW( isup, grid_ );
2536  Int isupProcCol = PCOL( isup, grid_ );
2537  if( isup > ksup && MYROW( grid_ ) == isupProcRow ){
2538  isSendToCrossDiagonal_(grid_->numProcCol, ksup ) = true;
2539  isSendToCrossDiagonal_(isupProcCol, ksup ) = true;
2540  }
2541  } // for (si)
2542  } // if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) )
2543  } // for (ksup)
2544 
2545  for( Int ksup = 0; ksup < numSuper - 1; ksup++ ){
2546  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
2547  for( std::set<Int>::iterator si = localRowBlockColIdx[ LBi(ksup, grid_) ].begin();
2548  si != localRowBlockColIdx[ LBi(ksup, grid_) ].end(); si++ ){
2549  Int jsup = *si;
2550  Int jsupProcCol = PCOL( jsup, grid_ );
2551  Int jsupProcRow = PROW( jsup, grid_ );
2552  if( jsup > ksup && MYCOL(grid_) == jsupProcCol ){
2553  isRecvFromCrossDiagonal_(grid_->numProcRow, ksup ) = true;
2554  isRecvFromCrossDiagonal_(jsupProcRow, ksup ) = true;
2555  }
2556  } // for (si)
2557  } // if( MYROW( grid_ ) == PROW( ksup, grid_ ) )
2558  } // for (ksup)
2559 #if ( _DEBUGlevel_ >= 1 )
2560  statusOFS << std::endl << "isSendToCrossDiagonal:" << std::endl;
2561  for(int j =0; j < isSendToCrossDiagonal_.n();j++){
2562  if(isSendToCrossDiagonal_(grid_->numProcCol,j)){
2563  statusOFS << "["<<j<<"] ";
2564  for(int i =0; i < isSendToCrossDiagonal_.m()-1;i++){
2565  if(isSendToCrossDiagonal_(i,j))
2566  {
2567  statusOFS<< PNUM(PROW(j,grid_),i,grid_)<<" ";
2568  }
2569  }
2570  statusOFS<<std::endl;
2571  }
2572  }
2573 
2574  statusOFS << std::endl << "isRecvFromCrossDiagonal:" << std::endl;
2575  for(int j =0; j < isRecvFromCrossDiagonal_.n();j++){
2576  if(isRecvFromCrossDiagonal_(grid_->numProcRow,j)){
2577  statusOFS << "["<<j<<"] ";
2578  for(int i =0; i < isRecvFromCrossDiagonal_.m()-1;i++){
2579  if(isRecvFromCrossDiagonal_(i,j))
2580  {
2581  statusOFS<< PNUM(i,PCOL(j,grid_),grid_)<<" ";
2582  }
2583  }
2584  statusOFS<<std::endl;
2585  }
2586  }
2587 
2588 
2589 #endif
2590 
2591 #ifndef _RELEASE_
2592  PopCallStack();
2593 #endif
2594 
2595  TIMER_STOP(STCD_RFCD);
2596 
2597 #ifndef _RELEASE_
2598  PopCallStack();
2599 #endif
2600 
2601  //Build the list of supernodes based on the elimination tree from SuperLU
2602  GetWorkSet(snodeEtree,this->WorkingSet());
2603 
2604  return ;
2605  } // ----- end of method PMatrix::ConstructCommunicationPattern_P2p -----
2606 
2607  template<typename T>
2608  void PMatrix<T>::SelInv ( )
2609  {
2610 
2611  SelInv_P2p ( );
2612 
2613 
2614 #ifdef GEMM_PROFILE
2615  statOFS<<"m"<<"\t"<<"n"<<"\t"<<"z"<<std::endl;
2616  for(auto it = gemm_stat.begin(); it!=gemm_stat.end(); it+=3){
2617  statOFS<<*it<<"\t"<<*(it+1)<<"\t"<<*(it+2)<<std::endl;
2618  }
2619 #endif
2620 
2621 #ifdef COMM_PROFILE
2622  //std::cout<<"DUMPING COMM STATS "<<comm_stat.size()<<" "<<std::endl;
2623  commOFS<<HEADER_COMM<<std::endl;
2624  for(auto it = comm_stat.begin(); it!=comm_stat.end(); it+=4){
2625  commOFS<<LINE_COMM(it)<<std::endl;
2626  }
2627 #endif
2628  } // ----- end of method PMatrix::SelInv -----
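  // Sketch of the intended calling sequence around this driver, assuming a
  // PMatrix<T> object PMloc that has already been filled from the LU
  // factorization (construction and conversion steps omitted):
  //
  //   PMloc.ConstructCommunicationPattern(); // build the broadcast/reduce trees
  //   PMloc.PreSelInv();                     // prepare L, U and diagonal blocks
  //   PMloc.SelInv();                        // selected inversion proper
  //   NumVec<T> diag;
  //   PMloc.GetDiagonal( diag );             // extract the diagonal of the inverse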
2629 
2630 
2631 
2632  template<typename T>
2633  void PMatrix<T>::SelInv_P2p ( )
2634  {
2635  TIMER_START(SelInv_P2p);
2636 
2637 #ifndef _RELEASE_
2638  PushCallStack("PMatrix::SelInv_P2p");
2639 #endif
2640 
2641 
2642  Int numSuper = this->NumSuper();
2643 
2644  // Main loop
2645  std::vector<std::vector<Int> > & superList = this->WorkingSet();
2646  Int numSteps = superList.size();
2647 
2648  for (Int lidx=0; lidx<numSteps ; lidx++){
2649  Int stepSuper = superList[lidx].size();
2650 
2651  SelInvIntra_P2p(lidx);
2652 
2653  // if(lidx==1){ return;};
2654  }
2655 
2656 #ifndef _RELEASE_
2657  PopCallStack();
2658 #endif
2659 
2660  TIMER_STOP(SelInv_P2p);
2661 
2662  return ;
2663  } // ----- end of method PMatrix::SelInv_P2p -----
2664 
2665 
2666 
2667  template<typename T>
2668  void PMatrix<T>::PreSelInv ( )
2669  {
2670 #ifndef _RELEASE_
2671  PushCallStack("PMatrix::PreSelInv");
2672 #endif
2673 
2674  Int numSuper = this->NumSuper();
2675 
2676 #ifndef _RELEASE_
2677  PushCallStack("L(i,k) <- L(i,k) * L(k,k)^{-1}");
2678 #endif
2679 #if ( _DEBUGlevel_ >= 1 )
2680  statusOFS << std::endl << "L(i,k) <- L(i,k) * L(k,k)^{-1}" << std::endl << std::endl;
2681 #endif
2682  for( Int ksup = 0; ksup < numSuper; ksup++ ){
2683  if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
2684  // Broadcast the diagonal L block
2685  NumMat<T> nzvalLDiag;
2686  std::vector<LBlock<T> >& Lcol = this->L( LBj( ksup, grid_ ) );
2687  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
2688  nzvalLDiag = Lcol[0].nzval;
2689  if( nzvalLDiag.m() != SuperSize(ksup, super_) ||
2690  nzvalLDiag.n() != SuperSize(ksup, super_) ){
2691 #ifdef USE_ABORT
2692  abort();
2693 #endif
2694  throw std::runtime_error( "The size of the diagonal block of L is wrong." );
2695  }
2696  } // Owns the diagonal block
2697  else
2698  {
2699  nzvalLDiag.Resize( SuperSize(ksup, super_), SuperSize(ksup, super_) );
2700  }
2701  MPI_Bcast( (void*)nzvalLDiag.Data(), nzvalLDiag.ByteSize(),
2702  MPI_BYTE, PROW( ksup, grid_ ), grid_->colComm );
2703 
2704  // Triangular solve
2705  for( Int ib = 0; ib < Lcol.size(); ib++ ){
2706  LBlock<T> & LB = Lcol[ib];
2707  if( LB.blockIdx > ksup ){
2708 #if ( _DEBUGlevel_ >= 2 )
2709  // Check the correctness of the triangular solve for the first local column
2710  if( LBj( ksup, grid_ ) == 0 ){
2711  statusOFS << "Diag L(" << ksup << ", " << ksup << "): " << nzvalLDiag << std::endl;
2712  statusOFS << "Before solve L(" << LB.blockIdx << ", " << ksup << "): " << LB.nzval << std::endl;
2713  }
2714 #endif
2715  blas::Trsm( 'R', 'L', 'N', 'U', LB.numRow, LB.numCol, ONE<T>(),
2716  nzvalLDiag.Data(), LB.numCol, LB.nzval.Data(), LB.numRow );
2717 #if ( _DEBUGlevel_ >= 2 )
2718  // Check the correctness of the triangular solve for the first local column
2719  if( LBj( ksup, grid_ ) == 0 ){
2720  statusOFS << "After solve L(" << LB.blockIdx << ", " << ksup << "): " << LB.nzval << std::endl;
2721  }
2722 #endif
2723  }
2724  }
2725  } // if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) )
2726  } // for (ksup)
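  // The triangular solve above applies L(k,k)^{-1} from the right
  // ('R','L','N','U' = right side, lower triangular, no transpose, unit
  // diagonal), i.e. each off-diagonal block is overwritten with
  //   L(i,k) <- L(i,k) * L(k,k)^{-1}
  // after the unit-diagonal block L(k,k) has been broadcast down the process
  // column owning ksup.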
2727 
2728 
2729 #ifndef _RELEASE_
2730  PopCallStack();
2731 #endif
2732 
2733 
2734 #ifndef _RELEASE_
2735  PushCallStack("U(k,i) <- L(i,k)");
2736 #endif
2737 #if ( _DEBUGlevel_ >= 1 )
2738  statusOFS << std::endl << "U(k,i) <- L(i,k)" << std::endl << std::endl;
2739 #endif
2740 
2741  for( Int ksup = 0; ksup < numSuper; ksup++ ){
2742  Int ksupProcRow = PROW( ksup, grid_ );
2743  Int ksupProcCol = PCOL( ksup, grid_ );
2744 
2745  Int sendCount = CountSendToCrossDiagonal(ksup);
2746  Int recvCount = CountRecvFromCrossDiagonal(ksup);
2747 
2748  std::vector<MPI_Request > arrMpiReqsSend(sendCount, MPI_REQUEST_NULL );
2749  std::vector<MPI_Request > arrMpiReqsSizeSend(sendCount, MPI_REQUEST_NULL );
2750  std::vector<std::vector<char> > arrSstrLcolSend(sendCount);
2751  std::vector<Int > arrSstrLcolSizeSend(sendCount);
2752 
2753  std::vector<MPI_Request > arrMpiReqsRecv(recvCount, MPI_REQUEST_NULL );
2754  std::vector<MPI_Request > arrMpiReqsSizeRecv(recvCount, MPI_REQUEST_NULL );
2755  std::vector<std::vector<char> > arrSstrLcolRecv(recvCount);
2756  std::vector<Int > arrSstrLcolSizeRecv(recvCount);
2757 
2758 
2759 
2760  // Sender
2761  if( isSendToCrossDiagonal_(grid_->numProcCol,ksup) ){
2762 #if ( _DEBUGlevel_ >= 1 )
2763  statusOFS<<"["<<ksup<<"] P"<<MYPROC(grid_)<<" should send to "<<CountSendToCrossDiagonal(ksup)<<" processors"<<std::endl;
2764 #endif
2765 
2766  Int sendIdx = 0;
2767  for(Int dstCol = 0; dstCol<grid_->numProcCol; dstCol++){
2768  if(isSendToCrossDiagonal_(dstCol,ksup) ){
2769  Int dst = PNUM(PROW(ksup,grid_),dstCol,grid_);
2770  if(MYPROC(grid_)!= dst){
2771  // Pack L data
2772  std::stringstream sstm;
2773  std::vector<char> & sstrLcolSend = arrSstrLcolSend[sendIdx];
2774  Int & sstrSize = arrSstrLcolSizeSend[sendIdx];
2775  MPI_Request & mpiReqSend = arrMpiReqsSend[sendIdx];
2776  MPI_Request & mpiReqSizeSend = arrMpiReqsSizeSend[sendIdx];
2777 
2778  std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
2779  std::vector<LBlock<T> >& Lcol = this->L( LBj(ksup, grid_) );
2780  // All blocks except for the diagonal block are to be sent right
2781  //TODO not true: this is a scatter operation! Can we know the destination?
2782 
2783  Int count = 0;
2784  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
2785  for( Int ib = 1; ib < Lcol.size(); ib++ ){
2786  if( Lcol[ib].blockIdx > ksup && (Lcol[ib].blockIdx % grid_->numProcCol) == dstCol ){
2787  count++;
2788  }
2789  }
2790  }
2791  else{
2792 
2793  for( Int ib = 0; ib < Lcol.size(); ib++ ){
2794  if( Lcol[ib].blockIdx > ksup && (Lcol[ib].blockIdx % grid_->numProcCol) == dstCol ){
2795  count++;
2796  }
2797  }
2798  }
2799 
2800  serialize( (Int)count, sstm, NO_MASK );
2801 
2802  for( Int ib = 0; ib < Lcol.size(); ib++ ){
2803  if( Lcol[ib].blockIdx > ksup && (Lcol[ib].blockIdx % grid_->numProcCol) == dstCol ){
2804 #if ( _DEBUGlevel_ >= 1 )
2805  statusOFS<<"["<<ksup<<"] SEND contains "<<Lcol[ib].blockIdx<< " which corresponds to "<<GBj(ib,grid_)<<std::endl;
2806 #endif
2807  serialize( Lcol[ib], sstm, mask );
2808  }
2809  }
2810 
2811  sstrLcolSend.resize( Size(sstm) );
2812  sstm.read( &sstrLcolSend[0], sstrLcolSend.size() );
2813  sstrSize = sstrLcolSend.size();
2814 
2815 
2816 
2817  // Send/Recv is possible here due to the one-to-one correspondence
2818  // in the case of a square processor grid
2819 
2820 #if ( _DEBUGlevel_ >= 1 )
2821  statusOFS<<"["<<ksup<<"] P"<<MYPROC(grid_)<<" ("<<MYROW(grid_)<<","<<MYCOL(grid_)<<") ---> LBj("<<ksup<<")="<<LBj(ksup,grid_)<<" ---> P"<<dst<<" ("<<ksupProcRow<<","<<dstCol<<")"<<std::endl;
2822 #endif
2823  MPI_Isend( &sstrSize, sizeof(sstrSize), MPI_BYTE, dst, SELINV_TAG_D_SIZE, grid_->comm, &mpiReqSizeSend );
2824  MPI_Isend( (void*)&sstrLcolSend[0], sstrSize, MPI_BYTE, dst, SELINV_TAG_D_CONTENT, grid_->comm, &mpiReqSend );
2825 
2826 
2827  PROFILE_COMM(MYPROC(this->grid_),dst,SELINV_TAG_D_SIZE,sizeof(sstrSize));
2828  PROFILE_COMM(MYPROC(this->grid_),dst,SELINV_TAG_D_CONTENT,sstrSize);
2829 
2830  //mpi::Send( sstm, dst,SELINV_TAG_D_SIZE, SELINV_TAG_D_CONTENT, grid_->comm );
2831 
2832  sendIdx++;
2833  } // if I am a sender
2834  }
2835  }
2836  }
2837 
2838 
2839 
2840 
2841 
2842  // Receiver
2843  if( isRecvFromCrossDiagonal_(grid_->numProcRow,ksup) ){
2844 
2845 
2846 #if ( _DEBUGlevel_ >= 1 )
2847  statusOFS<<"["<<ksup<<"] P"<<MYPROC(grid_)<<" should receive from "<<CountRecvFromCrossDiagonal(ksup)<<" processors"<<std::endl;
2848 #endif
2849 
2850 
2851  std::vector<UBlock<T> >& Urow = this->U( LBi( ksup, grid_ ) );
2852  std::vector<bool> isBlockFound(Urow.size(),false);
2853 
2854 
2855  Int recvIdx = 0;
2856  //receive size first
2857  for(Int srcRow = 0; srcRow<grid_->numProcRow; srcRow++){
2858  if(isRecvFromCrossDiagonal_(srcRow,ksup) ){
2859  std::vector<LBlock<T> > LcolRecv;
2860  Int src = PNUM(srcRow,PCOL(ksup,grid_),grid_);
2861  if(MYPROC(grid_)!= src){
2862  MPI_Request & mpiReqSizeRecv = arrMpiReqsSizeRecv[recvIdx];
2863  Int & sstrSize = arrSstrLcolSizeRecv[recvIdx];
2864 
2865  MPI_Irecv( &sstrSize, 1, MPI_INT, src, SELINV_TAG_D_SIZE, grid_->comm, &mpiReqSizeRecv );
2866 
2867  recvIdx++;
2868  }
2869  }
2870  }
2871 
2872  mpi::Waitall(arrMpiReqsSizeRecv);
2873 
2874 
2875 
2876  //receive content
2877  recvIdx = 0;
2878  for(Int srcRow = 0; srcRow<grid_->numProcRow; srcRow++){
2879  if(isRecvFromCrossDiagonal_(srcRow,ksup) ){
2880  std::vector<LBlock<T> > LcolRecv;
2881  Int src = PNUM(srcRow,PCOL(ksup,grid_),grid_);
2882  if(MYPROC(grid_)!= src){
2883  MPI_Request & mpiReqRecv = arrMpiReqsRecv[recvIdx];
2884  Int & sstrSize = arrSstrLcolSizeRecv[recvIdx];
2885  std::vector<char> & sstrLcolRecv = arrSstrLcolRecv[recvIdx];
2886  sstrLcolRecv.resize(sstrSize);
2887 
2888  MPI_Irecv( &sstrLcolRecv[0], sstrSize, MPI_BYTE, src, SELINV_TAG_D_CONTENT, grid_->comm, &mpiReqRecv );
2889 
2890  recvIdx++;
2891  }
2892  }
2893  }
2894 
2895  mpi::Waitall(arrMpiReqsRecv);
2896 
2897 
2898 
2899  //Process the content
2900  recvIdx = 0;
2901  for(Int srcRow = 0; srcRow<grid_->numProcRow; srcRow++){
2902  if(isRecvFromCrossDiagonal_(srcRow,ksup) ){
2903  std::vector<LBlock<T> > LcolRecv;
2904  Int src = PNUM(srcRow,PCOL(ksup,grid_),grid_);
2905  if(MYPROC(grid_)!= src){
2906 
2907  Int & sstrSize = arrSstrLcolSizeRecv[recvIdx];
2908  std::vector<char> & sstrLcolRecv = arrSstrLcolRecv[recvIdx];
2909  std::stringstream sstm;
2910 
2911 #if ( _DEBUGlevel_ >= 1 )
2912  statusOFS<<"["<<ksup<<"] P"<<MYPROC(grid_)<<" ("<<MYROW(grid_)<<","<<MYCOL(grid_)<<") <--- LBj("<<ksup<<") <--- P"<<src<<" ("<<srcRow<<","<<ksupProcCol<<")"<<std::endl;
2913 #endif
2914 
2915 
2916  sstm.write( &sstrLcolRecv[0], sstrSize );
2917 
2918  // Unpack L data.
2919  Int numLBlock;
2920  std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
2921  deserialize( numLBlock, sstm, NO_MASK );
2922  LcolRecv.resize(numLBlock);
2923  for( Int ib = 0; ib < numLBlock; ib++ ){
2924  deserialize( LcolRecv[ib], sstm, mask );
2925 #if ( _DEBUGlevel_ >= 1 )
2926  statusOFS<<"["<<ksup<<"] RECV contains "<<LcolRecv[ib].blockIdx<< " which corresponds to "<< ib * grid_->numProcRow + srcRow; // <<std::endl;
2927  // statusOFS<<" L is on row "<< srcRow <<" whereas U is on col "<<((ib * grid_->numProcRow + srcRow)/grid_->numProcCol)%grid_->numProcCol <<std::endl;
2928  statusOFS<<" L is on row "<< srcRow <<" whereas U is on col "<< LcolRecv[ib].blockIdx % grid_->numProcCol <<std::endl;
2929 #endif
2930  }
2931 
2932 
2933  recvIdx++;
2934 
2935  } // sender is not the same as receiver
2936  else{
2937  // L is obtained locally, just make a copy. Do not include the diagonal block
2938  std::vector<LBlock<T> >& Lcol = this->L( LBj( ksup, grid_ ) );
2939  if( MYROW( grid_ ) != PROW( ksup, grid_ ) ){
2940  LcolRecv.resize( Lcol.size() );
2941  for( Int ib = 0; ib < Lcol.size(); ib++ ){
2942  LcolRecv[ib] = Lcol[ib];
2943  }
2944  }
2945  else{
2946  LcolRecv.resize( Lcol.size() - 1 );
2947  for( Int ib = 0; ib < Lcol.size() - 1; ib++ ){
2948  LcolRecv[ib] = Lcol[ib+1];
2949  }
2950  }
2951  } // sender is the same as receiver
2952 
2953  // Update U
2954  // Make sure that the size of L and the corresponding U blocks match.
2955  for( Int ib = 0; ib < LcolRecv.size(); ib++ ){
2956  LBlock<T> & LB = LcolRecv[ib];
2957  if( LB.blockIdx <= ksup ){
2958 #ifdef USE_ABORT
2959  abort();
2960 #endif
2961  throw std::logic_error( "LcolRecv contains the wrong blocks." );
2962  }
2963  for( Int jb = 0; jb < Urow.size(); jb++ ){
2964  UBlock<T> & UB = Urow[jb];
2965  if( LB.blockIdx == UB.blockIdx ){
2966  // Compare size
2967  if( LB.numRow != UB.numCol || LB.numCol != UB.numRow ){
2968  std::ostringstream msg;
2969  msg << "LB(" << LB.blockIdx << ", " << ksup << ") and UB("
2970  << ksup << ", " << UB.blockIdx << ") do not share the same size." << std::endl
2971  << "LB: " << LB.numRow << " x " << LB.numCol << std::endl
2972  << "UB: " << UB.numRow << " x " << UB.numCol << std::endl;
2973 #ifdef USE_ABORT
2974  abort();
2975 #endif
2976  throw std::runtime_error( msg.str().c_str() );
2977  }
2978 
2979  // Note that the order of the column indices of the U
2980  // block may not follow the order of the row indices,
2981  // so overwrite the information in U.
2982  UB.cols = LB.rows;
2983  Transpose( LB.nzval, UB.nzval );
2984 
2985 #if ( _DEBUGlevel_ >= 1 )
2986  statusOFS<<"["<<ksup<<"] USING "<<LB.blockIdx<< std::endl;
2987 #endif
2988  isBlockFound[jb] = true;
2989  break;
2990  } // if( LB.blockIdx == UB.blockIdx )
2991  } // for (jb)
2992  } // for (ib)
2993  }
2994  }
2995 
2996  for( Int jb = 0; jb < Urow.size(); jb++ ){
2997  UBlock<T> & UB = Urow[jb];
2998  if( !isBlockFound[jb] ){
2999 #ifdef USE_ABORT
3000  abort();
3001 #endif
3002  throw std::logic_error( "UBlock cannot find its update. Something is seriously wrong." );
3003  }
3004  }
3005 
3006 
3007 
3008  } // if I am a receiver
3009 
3010 
3011  //Wait until all the sends to the cross-diagonal processors have completed
3012  mpi::Waitall(arrMpiReqsSizeSend);
3013  mpi::Waitall(arrMpiReqsSend);
3014 
3015 
3016  } // for (ksup)
3017 
3018 #ifndef _RELEASE_
3019  PopCallStack();
3020 #endif
3021 
3022 #ifndef _RELEASE_
3023  PushCallStack("L(i,i) <- [L(k,k) * U(k,k)]^{-1} ");
3024 #endif
3025 #if ( _DEBUGlevel_ >= 1 )
3026  statusOFS << std::endl << "L(i,i) <- [L(k,k) * U(k,k)]^{-1}" << std::endl << std::endl;
3027 #endif
3028 
3029  for( Int ksup = 0; ksup < numSuper; ksup++ ){
3030  if( MYROW( grid_ ) == PROW( ksup, grid_ ) &&
3031  MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
3032  IntNumVec ipiv( SuperSize( ksup, super_ ) );
3033  // Note that the pivoting vector ipiv follows the FORTRAN
3034  // convention, i.e. it is 1-based (hence the +1)
3035  for(Int i = 0; i < SuperSize( ksup, super_ ); i++){
3036  ipiv[i] = i + 1;
3037  }
3038  LBlock<T> & LB = (this->L( LBj( ksup, grid_ ) ))[0];
3039 #if ( _DEBUGlevel_ >= 2 )
3040  // Check the correctness of the matrix inversion for the first local column
3041  statusOFS << "Factorized A (" << ksup << ", " << ksup << "): " << LB.nzval << std::endl;
3042 #endif
3043  lapack::Getri( SuperSize( ksup, super_ ), LB.nzval.Data(),
3044  SuperSize( ksup, super_ ), ipiv.Data() );
3045 
3046  // Symmetrize the diagonal block
3047  Symmetrize( LB.nzval );
3048 
3049 #if ( _DEBUGlevel_ >= 2 )
3050  // Check the correctness of the matrix inversion for the first local column
3051  statusOFS << "Inverted A (" << ksup << ", " << ksup << "): " << LB.nzval << std::endl;
3052 #endif
3053  } // if I need to inverse the diagonal block
3054  } // for (ksup)
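  // Because the LU factors of the diagonal block are stored in place and no
  // row exchanges are needed here, the pivot vector is simply the identity
  // (1-based), and Getri replaces the packed factors by
  //   L(k,k) <- [ L(k,k) U(k,k) ]^{-1},
  // which is then symmetrized explicitly.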
3055 
3056 
3057 #ifndef _RELEASE_
3058  PopCallStack();
3059 #endif
3060 
3061 
3062 
3063 #ifndef _RELEASE_
3064  PopCallStack();
3065 #endif
3066 
3067  return ;
3068  } // ----- end of method PMatrix::PreSelInv -----
3069 
3070  template<typename T>
3071  void PMatrix<T>::GetDiagonal ( NumVec<T>& diag )
3072  {
3073 #ifndef _RELEASE_
3074  PushCallStack("PMatrix::GetDiagonal");
3075 #endif
3076  Int numSuper = this->NumSuper();
3077 
3078  Int numCol = this->NumCol();
3079  const IntNumVec& perm = super_->perm;
3080  const IntNumVec& permInv = super_->permInv;
3081 
3082  const IntNumVec * pPerm_r;
3083  const IntNumVec * pPermInv_r;
3084 
3085  if(options_->RowPerm=="NOROWPERM"){
3086  pPerm_r = &super_->perm;
3087  pPermInv_r = &super_->permInv;
3088  }
3089  else{
3090  pPerm_r = &super_->perm_r;
3091  pPermInv_r = &super_->permInv_r;
3092  }
3093 
3094  const IntNumVec& perm_r = *pPerm_r;
3095  const IntNumVec& permInv_r = *pPermInv_r;
3096 
3097 
3098  NumVec<T> diagLocal( numCol );
3099  SetValue( diagLocal, ZERO<T>() );
3100 
3101  diag.Resize( numCol );
3102  SetValue( diag, ZERO<T>() );
3103 
3104  //TODO This doesn't work with row permutation
3105  for( Int ksup = 0; ksup < numSuper; ksup++ ){
3106  // I own the diagonal block
3107  if( MYROW( grid_ ) == PROW( ksup, grid_ ) &&
3108  MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
3109  LBlock<T> & LB = this->L( LBj( ksup, grid_ ) )[0];
3110  for( Int i = 0; i < LB.numRow; i++ ){
3111  diagLocal( permInv_r( LB.rows(i) ) ) = LB.nzval( i, perm(permInv_r(i)) );
3112  }
3113  }
3114  }
3115 
3116  // All processors own diag
3117  mpi::Allreduce( diagLocal.Data(), diag.Data(), numCol, MPI_SUM, grid_->comm );
3118 
3119 #ifndef _RELEASE_
3120  PopCallStack();
3121 #endif
3122 
3123  return ;
3124  } // ----- end of method PMatrix::GetDiagonal -----
3125 
3126 
3127 
3128  template<typename T>
3129  void PMatrix<T>::PMatrixToDistSparseMatrix ( DistSparseMatrix<T>& A )
3130  {
3131 #ifndef _RELEASE_
3132  PushCallStack("PMatrix::PMatrixToDistSparseMatrix");
3133 #endif
3134 #if ( _DEBUGlevel_ >= 1 )
3135  statusOFS << std::endl << "Converting PMatrix to DistSparseMatrix." << std::endl;
3136 #endif
3137  Int mpirank = grid_->mpirank;
3138  Int mpisize = grid_->mpisize;
3139 
3140  std::vector<Int> rowSend( mpisize );
3141  std::vector<Int> colSend( mpisize );
3142  std::vector<T> valSend( mpisize );
3143  std::vector<Int> sizeSend( mpisize, 0 );
3144  std::vector<Int> displsSend( mpisize, 0 );
3145 
3146  std::vector<Int> rowRecv( mpisize );
3147  std::vector<Int> colRecv( mpisize );
3148  std::vector<T> valRecv( mpisize );
3149  std::vector<Int> sizeRecv( mpisize, 0 );
3150  std::vector<Int> displsRecv( mpisize, 0 );
3151 
3152  Int numSuper = this->NumSuper();
3153  const IntNumVec& permInv = super_->permInv;
3154  const IntNumVec * pPermInv_r;
3155 
3156  if(options_->RowPerm=="NOROWPERM"){
3157  pPermInv_r = &super_->permInv;
3158  }
3159  else{
3160  pPermInv_r = &super_->permInv_r;
3161  }
3162 
3163  const IntNumVec& permInv_r = *pPermInv_r;
3164 
3165 
3166 
3167  // The number of local columns in DistSparseMatrix format for the
3168  // processor with rank 0. This number is the same for processors
3169  // with rank ranging from 0 to mpisize - 2, and may or may not differ
3170  // from the number of local columns for processor with rank mpisize -
3171  // 1.
3172  Int numColFirst = this->NumCol() / mpisize;
3173 
3174  // Count the size first.
3175  for( Int ksup = 0; ksup < numSuper; ksup++ ){
3176  // L blocks
3177  if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
3178  std::vector<LBlock<T> >& Lcol = this->L( LBj( ksup, grid_ ) );
3179  for( Int ib = 0; ib < Lcol.size(); ib++ ){
3180  for( Int j = 0; j < Lcol[ib].numCol; j++ ){
3181  Int jcol = permInv( j + FirstBlockCol( ksup, super_ ) );
3182  Int dest = std::min( jcol / numColFirst, mpisize - 1 );
3183  sizeSend[dest] += Lcol[ib].numRow;
3184  }
3185  }
3186  } // I own the column of ksup
3187 
3188  // U blocks
3189  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
3190  std::vector<UBlock<T> >& Urow = this->U( LBi( ksup, grid_ ) );
3191  for( Int jb = 0; jb < Urow.size(); jb++ ){
3192  IntNumVec& cols = Urow[jb].cols;
3193  for( Int j = 0; j < cols.m(); j++ ){
3194  Int jcol = permInv( cols(j) );
3195  Int dest = std::min( jcol / numColFirst, mpisize - 1 );
3196  sizeSend[dest] += Urow[jb].numRow;
3197  }
3198  }
3199  } // I own the row of ksup
3200  } // for (ksup)
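  // The destination rank follows a 1-D block distribution of the permuted
  // columns: dest = min( jcol / numColFirst, mpisize - 1 ), so every rank
  // receives numColFirst consecutive columns and the last rank absorbs the
  // remainder. For example (hypothetical sizes), with NumCol() = 10 and
  // mpisize = 4 one gets numColFirst = 2, and columns 0-1, 2-3, 4-5, 6-9 go
  // to ranks 0, 1, 2 and 3 respectively.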
3201 
3202  // All-to-all exchange of size information
3203  MPI_Alltoall(
3204  &sizeSend[0], 1, MPI_INT,
3205  &sizeRecv[0], 1, MPI_INT, grid_->comm );
3206 
3207 
3208 
3209  // Reserve the space
3210  for( Int ip = 0; ip < mpisize; ip++ ){
3211  if( ip == 0 ){
3212  displsSend[ip] = 0;
3213  }
3214  else{
3215  displsSend[ip] = displsSend[ip-1] + sizeSend[ip-1];
3216  }
3217 
3218  if( ip == 0 ){
3219  displsRecv[ip] = 0;
3220  }
3221  else{
3222  displsRecv[ip] = displsRecv[ip-1] + sizeRecv[ip-1];
3223  }
3224  }
3225  Int sizeSendTotal = displsSend[mpisize-1] + sizeSend[mpisize-1];
3226  Int sizeRecvTotal = displsRecv[mpisize-1] + sizeRecv[mpisize-1];
3227 
3228  rowSend.resize( sizeSendTotal );
3229  colSend.resize( sizeSendTotal );
3230  valSend.resize( sizeSendTotal );
3231 
3232  rowRecv.resize( sizeRecvTotal );
3233  colRecv.resize( sizeRecvTotal );
3234  valRecv.resize( sizeRecvTotal );
3235 
3236 #if ( _DEBUGlevel_ >= 1 )
3237  statusOFS << "displsSend = " << displsSend << std::endl;
3238  statusOFS << "displsRecv = " << displsRecv << std::endl;
3239 #endif
3240 
3241  // Put (row, col, val) into the sending buffer
3242  std::vector<Int> cntSize( mpisize, 0 );
3243 
3244  for( Int ksup = 0; ksup < numSuper; ksup++ ){
3245  // L blocks
3246  if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
3247  std::vector<LBlock<T> >& Lcol = this->L( LBj( ksup, grid_ ) );
3248  for( Int ib = 0; ib < Lcol.size(); ib++ ){
3249  IntNumVec& rows = Lcol[ib].rows;
3250  NumMat<T>& nzval = Lcol[ib].nzval;
3251  for( Int j = 0; j < Lcol[ib].numCol; j++ ){
3252  Int jcol = permInv( j + FirstBlockCol( ksup, super_ ) );
3253  Int dest = std::min( jcol / numColFirst, mpisize - 1 );
3254  for( Int i = 0; i < rows.m(); i++ ){
3255  rowSend[displsSend[dest] + cntSize[dest]] = permInv_r( rows(i) );
3256  colSend[displsSend[dest] + cntSize[dest]] = jcol;
3257  valSend[displsSend[dest] + cntSize[dest]] = nzval( i, j );
3258  cntSize[dest]++;
3259  }
3260  }
3261  }
3262  } // I own the column of ksup
3263 
3264  // U blocks
3265  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
3266  std::vector<UBlock<T> >& Urow = this->U( LBi( ksup, grid_ ) );
3267  for( Int jb = 0; jb < Urow.size(); jb++ ){
3268  IntNumVec& cols = Urow[jb].cols;
3269  NumMat<T>& nzval = Urow[jb].nzval;
3270  for( Int j = 0; j < cols.m(); j++ ){
3271  Int jcol = permInv( cols(j) );
3272  Int dest = std::min( jcol / numColFirst, mpisize - 1 );
3273  for( Int i = 0; i < Urow[jb].numRow; i++ ){
3274  rowSend[displsSend[dest] + cntSize[dest]] =
3275  permInv_r( i + FirstBlockCol( ksup, super_ ) );
3276  colSend[displsSend[dest] + cntSize[dest]] = jcol;
3277  valSend[displsSend[dest] + cntSize[dest]] = nzval( i, j );
3278  cntSize[dest]++;
3279  }
3280  }
3281  }
3282  } // I own the row of ksup
3283  }
3284 
3285  // Check sizes match
3286  for( Int ip = 0; ip < mpisize; ip++ ){
3287  if( cntSize[ip] != sizeSend[ip] )
3288  {
3289 #ifdef USE_ABORT
3290  abort();
3291 #endif
3292  throw std::runtime_error( "Sizes of the sending information do not match." );
3293  }
3294  }
3295 
3296 
3297  // Alltoallv to exchange information
3298  mpi::Alltoallv(
3299  &rowSend[0], &sizeSend[0], &displsSend[0],
3300  &rowRecv[0], &sizeRecv[0], &displsRecv[0],
3301  grid_->comm );
3302  mpi::Alltoallv(
3303  &colSend[0], &sizeSend[0], &displsSend[0],
3304  &colRecv[0], &sizeRecv[0], &displsRecv[0],
3305  grid_->comm );
3306  mpi::Alltoallv(
3307  &valSend[0], &sizeSend[0], &displsSend[0],
3308  &valRecv[0], &sizeRecv[0], &displsRecv[0],
3309  grid_->comm );
3310 
3311 #if ( _DEBUGlevel_ >= 1 )
3312  statusOFS << "Alltoallv communication finished." << std::endl;
3313 #endif
3314 
3315  //#if ( _DEBUGlevel_ >= 1 )
3316  // for( Int ip = 0; ip < mpisize; ip++ ){
3317  // statusOFS << "rowSend[" << ip << "] = " << rowSend[ip] << std::endl;
3318  // statusOFS << "rowRecv[" << ip << "] = " << rowRecv[ip] << std::endl;
3319  // statusOFS << "colSend[" << ip << "] = " << colSend[ip] << std::endl;
3320  // statusOFS << "colRecv[" << ip << "] = " << colRecv[ip] << std::endl;
3321  // statusOFS << "valSend[" << ip << "] = " << valSend[ip] << std::endl;
3322  // statusOFS << "valRecv[" << ip << "] = " << valRecv[ip] << std::endl;
3323  // }
3324  //#endif
3325 
3326  // Organize the received message.
3327  Int firstCol = mpirank * numColFirst;
3328  Int numColLocal;
3329  if( mpirank == mpisize-1 )
3330  numColLocal = this->NumCol() - numColFirst * (mpisize-1);
3331  else
3332  numColLocal = numColFirst;
3333 
3334  std::vector<std::vector<Int> > rows( numColLocal );
3335  std::vector<std::vector<T> > vals( numColLocal );
3336 
3337  for( Int ip = 0; ip < mpisize; ip++ ){
3338  Int* rowRecvCur = &rowRecv[displsRecv[ip]];
3339  Int* colRecvCur = &colRecv[displsRecv[ip]];
3340  T* valRecvCur = &valRecv[displsRecv[ip]];
3341  for( Int i = 0; i < sizeRecv[ip]; i++ ){
3342  rows[colRecvCur[i]-firstCol].push_back( rowRecvCur[i] );
3343  vals[colRecvCur[i]-firstCol].push_back( valRecvCur[i] );
3344  } // for (i)
3345  } // for (ip)
3346 
3347  // Sort the rows
3348  std::vector<std::vector<Int> > sortIndex( numColLocal );
3349  for( Int j = 0; j < numColLocal; j++ ){
3350  sortIndex[j].resize( rows[j].size() );
3351  for( Int i = 0; i < sortIndex[j].size(); i++ )
3352  sortIndex[j][i] = i;
3353  std::sort( sortIndex[j].begin(), sortIndex[j].end(),
3354  IndexComp<std::vector<Int>& > ( rows[j] ) );
3355  } // for (j)
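  // The sort is indirect: sortIndex[j] is permuted so that
  // rows[j][ sortIndex[j][i] ] is nondecreasing in i, which lets the row
  // indices and the values be written out below in the same sorted order
  // without ever moving the (row, value) pairs themselves.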
3356 
3357  // Form DistSparseMatrix according to the received message
3358  // NOTE: for indices, DistSparseMatrix follows the FORTRAN
3359  // convention (1 based) while PMatrix follows the C convention (0
3360  // based)
3361  A.size = this->NumCol();
3362  A.nnzLocal = 0;
3363  A.colptrLocal.Resize( numColLocal + 1 );
3364  // Note that 1 is important since the index follows the FORTRAN convention
3365  A.colptrLocal(0) = 1;
3366  for( Int j = 0; j < numColLocal; j++ ){
3367  A.nnzLocal += rows[j].size();
3368  A.colptrLocal(j+1) = A.colptrLocal(j) + rows[j].size();
3369  }
3370 
3371  A.comm = grid_->comm;
3372 
3373 #if ( _DEBUGlevel_ >= 1 )
3374  statusOFS << "nnzLocal = " << A.nnzLocal << std::endl;
3375  statusOFS << "nnz = " << A.Nnz() << std::endl;
3376 #endif
3377 
3378 
3379  A.rowindLocal.Resize( A.nnzLocal );
3380  A.nzvalLocal.Resize( A.nnzLocal );
3381 
3382  Int* rowPtr = A.rowindLocal.Data();
3383  T* nzvalPtr = A.nzvalLocal.Data();
3384  for( Int j = 0; j < numColLocal; j++ ){
3385  std::vector<Int>& rowsCur = rows[j];
3386  std::vector<Int>& sortIndexCur = sortIndex[j];
3387  std::vector<T>& valsCur = vals[j];
3388  for( Int i = 0; i < rows[j].size(); i++ ){
3389  // Note that 1 is important since the index follows the FORTRAN convention
3390  *(rowPtr++) = rowsCur[sortIndexCur[i]] + 1;
3391  *(nzvalPtr++) = valsCur[sortIndexCur[i]];
3392  }
3393  }
3394 
3395 #if ( _DEBUGlevel_ >= 1 )
3396  statusOFS << "A.colptrLocal[end] = " << A.colptrLocal(numColLocal) << std::endl;
3397  statusOFS << "A.rowindLocal.size() = " << A.rowindLocal.m() << std::endl;
3398  statusOFS << "A.rowindLocal[end] = " << A.rowindLocal(A.nnzLocal-1) << std::endl;
3399  statusOFS << "A.nzvalLocal[end] = " << A.nzvalLocal(A.nnzLocal-1) << std::endl;
3400 #endif
3401 
3402 
3403 #ifndef _RELEASE_
3404  PopCallStack();
3405 #endif
3406 
3407  return ;
3408  } // ----- end of method PMatrix::PMatrixToDistSparseMatrix -----
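  // Usage sketch (hypothetical names): given a PMatrix<Complex> PMloc that has
  // gone through selected inversion, the conversion above would be invoked as
  //
  //   DistSparseMatrix<Complex> Ainv;
  //   PMloc.PMatrixToDistSparseMatrix( Ainv );
  //
  // after which Ainv holds, in 1-based CSC format, all entries stored in the
  // L and U blocks of PMloc, distributed by blocks of columns over grid_->comm.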
3409 
3410 
3411 
3412  template<typename T>
3413  void PMatrix<T>::PMatrixToDistSparseMatrix ( const DistSparseMatrix<T>& A, DistSparseMatrix<T>& B )
3414  {
3415 #ifndef _RELEASE_
3416  PushCallStack("PMatrix::PMatrixToDistSparseMatrix");
3417 #endif
3418 #if ( _DEBUGlevel_ >= 1 )
3419  statusOFS << std::endl << "Converting PMatrix to DistSparseMatrix (2nd format)." << std::endl;
3420 #endif
3421  Int mpirank = grid_->mpirank;
3422  Int mpisize = grid_->mpisize;
3423 
3424  std::vector<Int> rowSend( mpisize );
3425  std::vector<Int> colSend( mpisize );
3426  std::vector<T> valSend( mpisize );
3427  std::vector<Int> sizeSend( mpisize, 0 );
3428  std::vector<Int> displsSend( mpisize, 0 );
3429 
3430  std::vector<Int> rowRecv( mpisize );
3431  std::vector<Int> colRecv( mpisize );
3432  std::vector<T> valRecv( mpisize );
3433  std::vector<Int> sizeRecv( mpisize, 0 );
3434  std::vector<Int> displsRecv( mpisize, 0 );
3435 
3436  Int numSuper = this->NumSuper();
3437  const IntNumVec& permInv = super_->permInv;
3438 
3439 
3440 
3441  const IntNumVec * pPermInv_r;
3442 
3443  if(options_->RowPerm=="NOROWPERM"){
3444  pPermInv_r = &super_->permInv;
3445  }
3446  else{
3447  pPermInv_r = &super_->permInv_r;
3448  }
3449 
3450  const IntNumVec& permInv_r = *pPermInv_r;
3451 
3452 
3453 
3454 
3455 
3456 
3457  // The number of local columns in the DistSparseMatrix format for
3458  // the processor with rank 0. Processors with ranks 0 through
3459  // mpisize - 2 all own this many local columns; the last processor,
3460  // with rank mpisize - 1, owns the remaining columns, which may or
3461  // may not be the same number.
3462  Int numColFirst = this->NumCol() / mpisize;
3463 
3464  // Count the size first.
3465  for( Int ksup = 0; ksup < numSuper; ksup++ ){
3466  // L blocks
3467  if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
3468  std::vector<LBlock<T> >& Lcol = this->L( LBj( ksup, grid_ ) );
3469  for( Int ib = 0; ib < Lcol.size(); ib++ ){
3470  for( Int j = 0; j < Lcol[ib].numCol; j++ ){
3471  Int jcol = permInv( j + FirstBlockCol( ksup, super_ ) );
3472  Int dest = std::min( jcol / numColFirst, mpisize - 1 );
3473  sizeSend[dest] += Lcol[ib].numRow;
3474  }
3475  }
3476  } // I own the column of ksup
3477 
3478  // U blocks
3479  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
3480  std::vector<UBlock<T> >& Urow = this->U( LBi( ksup, grid_ ) );
3481  for( Int jb = 0; jb < Urow.size(); jb++ ){
3482  IntNumVec& cols = Urow[jb].cols;
3483  for( Int j = 0; j < cols.m(); j++ ){
3484  Int jcol = permInv( cols(j) );
3485  Int dest = std::min( jcol / numColFirst, mpisize - 1 );
3486  sizeSend[dest] += Urow[jb].numRow;
3487  }
3488  }
3489  } // I own the row of ksup
3490  } // for (ksup)
3491 
3492  // All-to-all exchange of size information
3493  MPI_Alltoall(
3494  &sizeSend[0], 1, MPI_INT,
3495  &sizeRecv[0], 1, MPI_INT, grid_->comm );
3496 
3497 
3498 
3499  // Reserve the space
3500  for( Int ip = 0; ip < mpisize; ip++ ){
3501  if( ip == 0 ){
3502  displsSend[ip] = 0;
3503  }
3504  else{
3505  displsSend[ip] = displsSend[ip-1] + sizeSend[ip-1];
3506  }
3507 
3508  if( ip == 0 ){
3509  displsRecv[ip] = 0;
3510  }
3511  else{
3512  displsRecv[ip] = displsRecv[ip-1] + sizeRecv[ip-1];
3513  }
3514  }
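  // displsSend / displsRecv are the exclusive prefix sums of sizeSend /
  // sizeRecv, i.e. the displacement arrays expected by Alltoallv below.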
3515  Int sizeSendTotal = displsSend[mpisize-1] + sizeSend[mpisize-1];
3516  Int sizeRecvTotal = displsRecv[mpisize-1] + sizeRecv[mpisize-1];
3517 
3518  rowSend.resize( sizeSendTotal );
3519  colSend.resize( sizeSendTotal );
3520  valSend.resize( sizeSendTotal );
3521 
3522  rowRecv.resize( sizeRecvTotal );
3523  colRecv.resize( sizeRecvTotal );
3524  valRecv.resize( sizeRecvTotal );
3525 
3526 #if ( _DEBUGlevel_ >= 1 )
3527  statusOFS << "displsSend = " << displsSend << std::endl;
3528  statusOFS << "displsRecv = " << displsRecv << std::endl;
3529 #endif
3530 
3531  // Put (row, col, val) into the sending buffer
3532  std::vector<Int> cntSize( mpisize, 0 );
3533 
3534 
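  // Each (row, col, val) triplet is translated from the permuted ordering used
  // by the factorization back to the original ordering (permInv for columns,
  // permInv_r for rows) and routed to the process that owns column jcol in the
  // 1D block-column distribution of the DistSparseMatrix.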
3535  for( Int ksup = 0; ksup < numSuper; ksup++ ){
3536  // L blocks
3537  if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
3538  std::vector<LBlock<T> >& Lcol = this->L( LBj( ksup, grid_ ) );
3539  for( Int ib = 0; ib < Lcol.size(); ib++ ){
3540  IntNumVec& rows = Lcol[ib].rows;
3541  NumMat<T>& nzval = Lcol[ib].nzval;
3542  for( Int j = 0; j < Lcol[ib].numCol; j++ ){
3543  Int jcol = permInv( j + FirstBlockCol( ksup, super_ ) );
3544  Int dest = std::min( jcol / numColFirst, mpisize - 1 );
3545  for( Int i = 0; i < rows.m(); i++ ){
3546  rowSend[displsSend[dest] + cntSize[dest]] = permInv_r( rows(i) );
3547  colSend[displsSend[dest] + cntSize[dest]] = jcol;
3548  valSend[displsSend[dest] + cntSize[dest]] = nzval( i, j );
3549  cntSize[dest]++;
3550  }
3551  }
3552  }
3553  } // I own the column of ksup
3554 
3555 
3556  // U blocks
3557  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
3558  std::vector<UBlock<T> >& Urow = this->U( LBi( ksup, grid_ ) );
3559  for( Int jb = 0; jb < Urow.size(); jb++ ){
3560  IntNumVec& cols = Urow[jb].cols;
3561  NumMat<T>& nzval = Urow[jb].nzval;
3562  for( Int j = 0; j < cols.m(); j++ ){
3563  Int jcol = permInv( cols(j) );
3564  Int dest = std::min( jcol / numColFirst, mpisize - 1 );
3565  for( Int i = 0; i < Urow[jb].numRow; i++ ){
3566  rowSend[displsSend[dest] + cntSize[dest]] =
3567  permInv_r( i + FirstBlockCol( ksup, super_ ) );
3568  colSend[displsSend[dest] + cntSize[dest]] = jcol;
3569  valSend[displsSend[dest] + cntSize[dest]] = nzval( i, j );
3570  cntSize[dest]++;
3571  }
3572  }
3573  }
3574  } // I own the row of ksup
3575  }
3576 
3577 
3578 
3579  // Check sizes match
3580  for( Int ip = 0; ip < mpisize; ip++ ){
3581  if( cntSize[ip] != sizeSend[ip] ){
3582 #ifdef USE_ABORT
3583  abort();
3584 #endif
3585  throw std::runtime_error( "Sizes of the sending information do not match." );
3586  }
3587  }
3588 
3589  // Alltoallv to exchange information
3590  mpi::Alltoallv(
3591  &rowSend[0], &sizeSend[0], &displsSend[0],
3592  &rowRecv[0], &sizeRecv[0], &displsRecv[0],
3593  grid_->comm );
3594  mpi::Alltoallv(
3595  &colSend[0], &sizeSend[0], &displsSend[0],
3596  &colRecv[0], &sizeRecv[0], &displsRecv[0],
3597  grid_->comm );
3598  mpi::Alltoallv(
3599  &valSend[0], &sizeSend[0], &displsSend[0],
3600  &valRecv[0], &sizeRecv[0], &displsRecv[0],
3601  grid_->comm );
3602 
3603 #if ( _DEBUGlevel_ >= 1 )
3604  statusOFS << "Alltoallv communication finished." << std::endl;
3605 #endif
3606 
3607  //#if ( _DEBUGlevel_ >= 1 )
3608  // for( Int ip = 0; ip < mpisize; ip++ ){
3609  // statusOFS << "rowSend[" << ip << "] = " << rowSend[ip] << std::endl;
3610  // statusOFS << "rowRecv[" << ip << "] = " << rowRecv[ip] << std::endl;
3611  // statusOFS << "colSend[" << ip << "] = " << colSend[ip] << std::endl;
3612  // statusOFS << "colRecv[" << ip << "] = " << colRecv[ip] << std::endl;
3613  // statusOFS << "valSend[" << ip << "] = " << valSend[ip] << std::endl;
3614  // statusOFS << "valRecv[" << ip << "] = " << valRecv[ip] << std::endl;
3615  // }
3616  //#endif
3617 
3618  // Organize the received message.
3619  Int firstCol = mpirank * numColFirst;
3620  Int numColLocal;
3621  if( mpirank == mpisize-1 )
3622  numColLocal = this->NumCol() - numColFirst * (mpisize-1);
3623  else
3624  numColLocal = numColFirst;
3625 
3626  std::vector<std::vector<Int> > rows( numColLocal );
3627  std::vector<std::vector<T> > vals( numColLocal );
3628 
3629  for( Int ip = 0; ip < mpisize; ip++ ){
3630  Int* rowRecvCur = &rowRecv[displsRecv[ip]];
3631  Int* colRecvCur = &colRecv[displsRecv[ip]];
3632  T* valRecvCur = &valRecv[displsRecv[ip]];
3633  for( Int i = 0; i < sizeRecv[ip]; i++ ){
3634  rows[colRecvCur[i]-firstCol].push_back( rowRecvCur[i] );
3635  vals[colRecvCur[i]-firstCol].push_back( valRecvCur[i] );
3636  } // for (i)
3637  } // for (ip)
3638 
3639  // Sort the rows
3640  std::vector<std::vector<Int> > sortIndex( numColLocal );
3641  for( Int j = 0; j < numColLocal; j++ ){
3642  sortIndex[j].resize( rows[j].size() );
3643  for( Int i = 0; i < sortIndex[j].size(); i++ )
3644  sortIndex[j][i] = i;
3645  std::sort( sortIndex[j].begin(), sortIndex[j].end(),
3646  IndexComp<std::vector<Int>& > ( rows[j] ) );
3647  } // for (j)
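  // The received row indices of each local column are sorted so that they can
  // be matched against the fixed pattern of A below via std::lower_bound.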
3648 
3649  // Form DistSparseMatrix according to the received message
3650  // NOTE: for indices, DistSparseMatrix follows the FORTRAN
3651  // convention (1 based) while PMatrix follows the C convention (0
3652  // based)
3653  if( A.size != this->NumCol() ){
3654 #ifdef USE_ABORT
3655  abort();
3656 #endif
3657  throw std::runtime_error( "The DistSparseMatrix providing the pattern has a different size from PMatrix." );
3658  }
3659  if( A.colptrLocal.m() != numColLocal + 1 ){
3660 #ifdef USE_ABORT
3661  abort();
3662 #endif
3663  throw std::runtime_error( "The DistSparseMatrix providing the pattern has a different number of local columns from PMatrix." );
3664  }
3665 
3666  B.size = A.size;
3667  B.nnz = A.nnz;
3668  B.nnzLocal = A.nnzLocal;
3669  B.colptrLocal = A.colptrLocal;
3670  B.rowindLocal = A.rowindLocal;
3671  B.nzvalLocal.Resize( B.nnzLocal );
3672  SetValue( B.nzvalLocal, ZERO<T>() );
3673  // Make sure that the communicator of A and B are the same.
3674  // FIXME Find a better way to compare the communicators
3675  // if( grid_->comm != A.comm ){
3676  // #ifdef USE_ABORT
3677  // abort();
3678  //#endif
3679  //throw std::runtime_error( "The DistSparseMatrix providing the pattern has a different communicator from PMatrix." );
3680  // }
3681  B.comm = grid_->comm;
3682 
3683  Int* rowPtr = B.rowindLocal.Data();
3684  T* nzvalPtr = B.nzvalLocal.Data();
3685  for( Int j = 0; j < numColLocal; j++ ){
3686  std::vector<Int>& rowsCur = rows[j];
3687  std::vector<Int>& sortIndexCur = sortIndex[j];
3688  std::vector<T>& valsCur = vals[j];
3689  std::vector<Int> rowsCurSorted( rowsCur.size() );
3690  // Note that 1 is important since the index follows the FORTRAN convention
3691  for( Int i = 0; i < rowsCurSorted.size(); i++ ){
3692  rowsCurSorted[i] = rowsCur[sortIndexCur[i]] + 1;
3693  }
3694 
3695  // Search and match the indices
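  // For each row index in the pattern of B (copied from A), locate the
  // corresponding received row by binary search and copy its value; if the
  // search runs past the end of the received rows, the entry is set to zero.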
3696  std::vector<Int>::iterator it;
3697  for( Int i = B.colptrLocal(j) - 1;
3698  i < B.colptrLocal(j+1) - 1; i++ ){
3699  it = std::lower_bound( rowsCurSorted.begin(), rowsCurSorted.end(),
3700  *(rowPtr++) );
3701  if( it == rowsCurSorted.end() ){
3702  // Did not find the row, set it to zero
3703  *(nzvalPtr++) = ZERO<T>();
3704  }
3705  else{
3706  // Found the row, set it according to the received value
3707  *(nzvalPtr++) = valsCur[ sortIndexCur[it-rowsCurSorted.begin()] ];
3708  }
3709  } // for (i)
3710  } // for (j)
3711 
3712 #if ( _DEBUGlevel_ >= 1 )
3713  statusOFS << "B.colptrLocal[end] = " << B.colptrLocal(numColLocal) << std::endl;
3714  statusOFS << "B.rowindLocal.size() = " << B.rowindLocal.m() << std::endl;
3715  statusOFS << "B.rowindLocal[end] = " << B.rowindLocal(B.nnzLocal-1) << std::endl;
3716  statusOFS << "B.nzvalLocal[end] = " << B.nzvalLocal(B.nnzLocal-1) << std::endl;
3717 #endif
3718 
3719 
3720 #ifndef _RELEASE_
3721  PopCallStack();
3722 #endif
3723 
3724  return ;
3725  } // ----- end of method PMatrix::PMatrixToDistSparseMatrix -----
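  // Usage sketch (hypothetical names): with AMat the DistSparseMatrix whose
  // pattern is reused and PMloc a selected-inverted PMatrix on the same grid,
  //
  //   DistSparseMatrix<Complex> AinvMat;
  //   PMloc.PMatrixToDistSparseMatrix( AMat, AinvMat );
  //
  // fills AinvMat with the size, pattern and communicator of AMat and the
  // values taken from the blocks stored in PMloc.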
3726 
3727 
3728  // A (possibly) more memory-efficient way of converting the PMatrix to a
3729  // DistSparseMatrix structure: only the index pairs in the sparsity pattern
3730  // of A are communicated, and the owning processes send back the values.
3731  template<typename T>
3732  void PMatrix<T>::PMatrixToDistSparseMatrix2 ( const DistSparseMatrix<T>& A, DistSparseMatrix<T>& B )
3733  {
3734 #ifndef _RELEASE_
3735  PushCallStack("PMatrix::PMatrixToDistSparseMatrix2");
3736 #endif
3737 #if ( _DEBUGlevel_ >= 1 )
3738  statusOFS << std::endl << "Converting PMatrix to DistSparseMatrix (2nd format)." << std::endl;
3739 #endif
3740  Int mpirank = grid_->mpirank;
3741  Int mpisize = grid_->mpisize;
3742 
3743  std::vector<Int> rowSend( mpisize );
3744  std::vector<Int> colSend( mpisize );
3745  std::vector<T> valSend( mpisize );
3746  std::vector<Int> sizeSend( mpisize, 0 );
3747  std::vector<Int> displsSend( mpisize, 0 );
3748 
3749  std::vector<Int> rowRecv( mpisize );
3750  std::vector<Int> colRecv( mpisize );
3751  std::vector<T> valRecv( mpisize );
3752  std::vector<Int> sizeRecv( mpisize, 0 );
3753  std::vector<Int> displsRecv( mpisize, 0 );
3754 
3755  Int numSuper = this->NumSuper();
3756  const IntNumVec& perm = super_->perm;
3757  const IntNumVec& permInv = super_->permInv;
3758 
3759  const IntNumVec * pPerm_r;
3760  const IntNumVec * pPermInv_r;
3761 
3762  if(options_->RowPerm=="NOROWPERM"){
3763  pPerm_r = &super_->perm;
3764  pPermInv_r = &super_->permInv;
3765  }
3766  else{
3767  pPerm_r = &super_->perm_r;
3768  pPermInv_r = &super_->permInv_r;
3769  }
3770 
3771  const IntNumVec& perm_r = *pPerm_r;
3772  const IntNumVec& permInv_r = *pPermInv_r;
3773 
3774  // Count the sizes from the A matrix first
3775  Int numColFirst = this->NumCol() / mpisize;
3776  Int firstCol = mpirank * numColFirst;
3777  Int numColLocal;
3778  if( mpirank == mpisize-1 )
3779  numColLocal = this->NumCol() - numColFirst * (mpisize-1);
3780  else
3781  numColLocal = numColFirst;
3782 
3783  Int* rowPtr = A.rowindLocal.Data();
3784  Int* colPtr = A.colptrLocal.Data();
3785 
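  // Each nonzero (i, j) of the local part of A is mapped to the permuted
  // ordering (perm for columns, perm_r for rows); its destination is the
  // process on the 2D grid that owns the supernodal block containing the
  // permuted entry. This first pass only counts the sizes per destination.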
3786  for( Int j = 0; j < numColLocal; j++ ){
3787  Int col = perm( firstCol + j );
3788  Int blockColIdx = BlockIdx( col, super_ );
3789  Int procCol = PCOL( blockColIdx, grid_ );
3790  for( Int i = colPtr[j] - 1; i < colPtr[j+1] - 1; i++ ){
3791  Int row = perm_r( *(rowPtr++) - 1 );
3792  Int blockRowIdx = BlockIdx( row, super_ );
3793  Int procRow = PROW( blockRowIdx, grid_ );
3794  Int dest = PNUM( procRow, procCol, grid_ );
3795 #if ( _DEBUGlevel_ >= 1 )
3796  statusOFS << "BlockIdx = " << blockRowIdx << ", " <<blockColIdx << std::endl;
3797  statusOFS << procRow << ", " << procCol << ", "
3798  << dest << std::endl;
3799 #endif
3800  sizeSend[dest]++;
3801  } // for (i)
3802  } // for (j)
3803 
3804  // All-to-all exchange of size information
3805  MPI_Alltoall(
3806  &sizeSend[0], 1, MPI_INT,
3807  &sizeRecv[0], 1, MPI_INT, grid_->comm );
3808 
3809 #if ( _DEBUGlevel_ >= 1 )
3810  statusOFS << std::endl << "sizeSend: " << sizeSend << std::endl;
3811  statusOFS << std::endl << "sizeRecv: " << sizeRecv << std::endl;
3812 #endif
3813 
3814 
3815 
3816  // Reserve the space
3817  for( Int ip = 0; ip < mpisize; ip++ ){
3818  if( ip == 0 ){
3819  displsSend[ip] = 0;
3820  }
3821  else{
3822  displsSend[ip] = displsSend[ip-1] + sizeSend[ip-1];
3823  }
3824 
3825  if( ip == 0 ){
3826  displsRecv[ip] = 0;
3827  }
3828  else{
3829  displsRecv[ip] = displsRecv[ip-1] + sizeRecv[ip-1];
3830  }
3831  }
3832 
3833  Int sizeSendTotal = displsSend[mpisize-1] + sizeSend[mpisize-1];
3834  Int sizeRecvTotal = displsRecv[mpisize-1] + sizeRecv[mpisize-1];
3835 
3836  rowSend.resize( sizeSendTotal );
3837  colSend.resize( sizeSendTotal );
3838  valSend.resize( sizeSendTotal );
3839 
3840  rowRecv.resize( sizeRecvTotal );
3841  colRecv.resize( sizeRecvTotal );
3842  valRecv.resize( sizeRecvTotal );
3843 
3844 #if ( _DEBUGlevel_ >= 1 )
3845  statusOFS << "displsSend = " << displsSend << std::endl;
3846  statusOFS << "displsRecv = " << displsRecv << std::endl;
3847 #endif
3848 
3849  // Put (row, col) into the sending buffer
3850  std::vector<Int> cntSize( mpisize, 0 );
3851 
3852  rowPtr = A.rowindLocal.Data();
3853  colPtr = A.colptrLocal.Data();
3854 
3855  for( Int j = 0; j < numColLocal; j++ ){
3856  Int col = perm( firstCol + j );
3857  Int blockColIdx = BlockIdx( col, super_ );
3858  Int procCol = PCOL( blockColIdx, grid_ );
3859  for( Int i = colPtr[j] - 1; i < colPtr[j+1] - 1; i++ ){
3860  Int row = perm_r( *(rowPtr++) - 1 );
3861  Int blockRowIdx = BlockIdx( row, super_ );
3862  Int procRow = PROW( blockRowIdx, grid_ );
3863  Int dest = PNUM( procRow, procCol, grid_ );
3864  rowSend[displsSend[dest] + cntSize[dest]] = row;
3865  colSend[displsSend[dest] + cntSize[dest]] = col;
3866  cntSize[dest]++;
3867  } // for (i)
3868  } // for (j)
3869 
3870 
3871  // Check sizes match
3872  for( Int ip = 0; ip < mpisize; ip++ ){
3873  if( cntSize[ip] != sizeSend[ip] ){
3874 #ifdef USE_ABORT
3875  abort();
3876 #endif
3877  throw std::runtime_error( "Sizes of the sending information do not match." );
3878  }
3879  }
3880 
3881  // Alltoallv to exchange information
3882  mpi::Alltoallv(
3883  &rowSend[0], &sizeSend[0], &displsSend[0],
3884  &rowRecv[0], &sizeRecv[0], &displsRecv[0],
3885  grid_->comm );
3886  mpi::Alltoallv(
3887  &colSend[0], &sizeSend[0], &displsSend[0],
3888  &colRecv[0], &sizeRecv[0], &displsRecv[0],
3889  grid_->comm );
3890 
3891 #if ( _DEBUGlevel_ >= 1 )
3892  statusOFS << "Alltoallv communication of nonzero indices finished." << std::endl;
3893 #endif
3894 
3895 
3896 #if ( _DEBUGlevel_ >= 1 )
3897  for( Int ip = 0; ip < mpisize; ip++ ){
3898  statusOFS << "rowSend[" << ip << "] = " << rowSend[ip] << std::endl;
3899  statusOFS << "rowRecv[" << ip << "] = " << rowRecv[ip] << std::endl;
3900  statusOFS << "colSend[" << ip << "] = " << colSend[ip] << std::endl;
3901  statusOFS << "colRecv[" << ip << "] = " << colRecv[ip] << std::endl;
3902  }
3903 #endif
3904 
3905  // For each received (row, col) pair, look up the corresponding nonzero value locally and store it in valRecv.
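  // Requests with blockColIdx <= blockRowIdx (lower triangle and diagonal) are
  // served from the L blocks, the remaining requests from the U blocks;
  // entries that cannot be located are reported and returned as zero.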
3906  for( Int g = 0; g < sizeRecvTotal; g++ ){
3907  Int row = rowRecv[g];
3908  Int col = colRecv[g];
3909 
3910  Int blockRowIdx = BlockIdx( row, super_ );
3911  Int blockColIdx = BlockIdx( col, super_ );
3912 
3913  // Search for the nzval
3914  bool isFound = false;
3915 
3916  if( blockColIdx <= blockRowIdx ){
3917  // Data on the L side
3918 
3919  std::vector<LBlock<T> >& Lcol = this->L( LBj( blockColIdx, grid_ ) );
3920 
3921  for( Int ib = 0; ib < Lcol.size(); ib++ ){
3922 #if ( _DEBUGlevel_ >= 1 )
3923  statusOFS << "blockRowIdx = " << blockRowIdx << ", Lcol[ib].blockIdx = " << Lcol[ib].blockIdx << ", blockColIdx = " << blockColIdx << std::endl;
3924 #endif
3925  if( Lcol[ib].blockIdx == blockRowIdx ){
3926  IntNumVec& rows = Lcol[ib].rows;
3927  for( int iloc = 0; iloc < Lcol[ib].numRow; iloc++ ){
3928  if( rows[iloc] == row ){
3929  Int jloc = col - FirstBlockCol( blockColIdx, super_ );
3930  valRecv[g] = Lcol[ib].nzval( iloc, jloc );
3931  isFound = true;
3932  break;
3933  } // found the corresponding row
3934  }
3935  }
3936  if( isFound == true ) break;
3937  } // for (ib)
3938  }
3939  else{
3940  // Data on the U side
3941 
3942  std::vector<UBlock<T> >& Urow = this->U( LBi( blockRowIdx, grid_ ) );
3943 
3944  for( Int jb = 0; jb < Urow.size(); jb++ ){
3945  if( Urow[jb].blockIdx == blockColIdx ){
3946  IntNumVec& cols = Urow[jb].cols;
3947  for( int jloc = 0; jloc < Urow[jb].numCol; jloc++ ){
3948  if( cols[jloc] == col ){
3949  Int iloc = row - FirstBlockRow( blockRowIdx, super_ );
3950  valRecv[g] = Urow[jb].nzval( iloc, jloc );
3951  isFound = true;
3952  break;
3953  } // found the corresponding col
3954  }
3955  }
3956  if( isFound == true ) break;
3957  } // for (jb)
3958  } // if( blockColIdx <= blockRowIdx )
3959 
3960  // Did not find the corresponding row, set the value to zero.
3961  if( isFound == false ){
3962  statusOFS << "In the permuted order, (" << row << ", " << col <<
3963  ") is not found in PMatrix." << std::endl;
3964  valRecv[g] = ZERO<T>();
3965  }
3966 
3967  } // for (g)
3968 
3969 
3970  // Feed back valRecv to valSend through Alltoallv. NOTE: for the
3971  // values, the roles of "send" and "recv" are swapped.
3972  mpi::Alltoallv(
3973  &valRecv[0], &sizeRecv[0], &displsRecv[0],
3974  &valSend[0], &sizeSend[0], &displsSend[0],
3975  grid_->comm );
3976 
3977 #if ( _DEBUGlevel_ >= 1 )
3978  statusOFS << "Alltoallv communication of nonzero values finished." << std::endl;
3979 #endif
3980 
3981  // Put the nonzero values from valSend into the matrix B.
3982  B.size = A.size;
3983  B.nnz = A.nnz;
3984  B.nnzLocal = A.nnzLocal;
3985  B.colptrLocal = A.colptrLocal;
3986  B.rowindLocal = A.rowindLocal;
3987  B.nzvalLocal.Resize( B.nnzLocal );
3988  SetValue( B.nzvalLocal, ZERO<T>() );
3989  // Make sure that the communicator of A and B are the same.
3990  // FIXME Find a better way to compare the communicators
3991  // if( grid_->comm != A.comm ){
3992  // #ifdef USE_ABORT
3993  //abort();
3994  //#endif
3995  //throw std::runtime_error( "The DistSparseMatrix providing the pattern has a different communicator from PMatrix." );
3996  // }
3997  B.comm = grid_->comm;
3998 
3999  for( Int i = 0; i < mpisize; i++ )
4000  cntSize[i] = 0;
4001 
4002  rowPtr = B.rowindLocal.Data();
4003  colPtr = B.colptrLocal.Data();
4004  T* valPtr = B.nzvalLocal.Data();
4005 
4006  for( Int j = 0; j < numColLocal; j++ ){
4007  Int col = perm( firstCol + j );
4008  Int blockColIdx = BlockIdx( col, super_ );
4009  Int procCol = PCOL( blockColIdx, grid_ );
4010  for( Int i = colPtr[j] - 1; i < colPtr[j+1] - 1; i++ ){
4011  Int row = perm_r( *(rowPtr++) - 1 );
4012  Int blockRowIdx = BlockIdx( row, super_ );
4013  Int procRow = PROW( blockRowIdx, grid_ );
4014  Int dest = PNUM( procRow, procCol, grid_ );
4015  *(valPtr++) = valSend[displsSend[dest] + cntSize[dest]];
4016  cntSize[dest]++;
4017  } // for (i)
4018  } // for (j)
4019 
4020  // Check sizes match
4021  for( Int ip = 0; ip < mpisize; ip++ ){
4022  if( cntSize[ip] != sizeSend[ip] ){
4023 #ifdef USE_ABORT
4024  abort();
4025 #endif
4026  throw std::runtime_error( "Sizes of the sending information do not match." );
4027  }
4028  }
4029 
4030 
4031 #ifndef _RELEASE_
4032  PopCallStack();
4033 #endif
4034 
4035  return ;
4036  } // ----- end of method PMatrix::PMatrixToDistSparseMatrix2 -----
4037 
4038 
4039 
4040 
4041  template<typename T>
4042  Int PMatrix<T>::NnzLocal ( )
4043  {
4044 #ifndef _RELEASE_
4045  PushCallStack("PMatrix::NnzLocal");
4046 #endif
4047  Int numSuper = this->NumSuper();
4048  Int nnzLocal = 0;
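  // Sum the dense sizes (numRow * numCol) of all L and U blocks stored
  // locally; the diagonal block of each supernode is stored among the L
  // blocks and is therefore counted once.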
4049  for( Int ksup = 0; ksup < numSuper; ksup++ ){
4050  if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
4051  std::vector<LBlock<T> >& Lcol = this->L( LBj( ksup, grid_ ) );
4052  for( Int ib = 0; ib < Lcol.size(); ib++ ){
4053  nnzLocal += Lcol[ib].numRow * Lcol[ib].numCol;
4054  }
4055  } // if I own the column of ksup
4056  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
4057  std::vector<UBlock<T> >& Urow = this->U( LBi( ksup, grid_ ) );
4058  for( Int jb = 0; jb < Urow.size(); jb++ ){
4059  nnzLocal += Urow[jb].numRow * Urow[jb].numCol;
4060  }
4061  } // if I own the row of ksup
4062  }
4063 
4064 #ifndef _RELEASE_
4065  PopCallStack();
4066 #endif
4067 
4068  return nnzLocal;
4069  } // ----- end of method PMatrix::NnzLocal -----
4070 
4071 
4072  template<typename T>
4073  LongInt PMatrix<T>::Nnz ( )
4074  {
4075 #ifndef _RELEASE_
4076  PushCallStack("PMatrix::Nnz");
4077 #endif
4078  LongInt nnzLocal = LongInt( this->NnzLocal() );
4079  LongInt nnz;
4080 
4081  MPI_Allreduce( &nnzLocal, &nnz, 1, MPI_LONG_LONG, MPI_SUM, grid_->comm );
4082 
4083 #ifndef _RELEASE_
4084  PopCallStack();
4085 #endif
4086 
4087  return nnz;
4088  } // ----- end of method PMatrix::Nnz -----
4089 
4090  template< >
4091  inline void PMatrix<Real>::GetNegativeInertia ( Real& inertia )
4092  {
4093 #ifndef _RELEASE_
4094  PushCallStack("PMatrix::GetNegativeInertia");
4095 #endif
4096  Int numSuper = this->NumSuper();
4097 
4098  Real inertiaLocal = 0.0;
4099  inertia = 0.0;
4100 
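  // Count the negative diagonal entries of the factor stored in the diagonal
  // blocks. For a symmetric matrix this count is the negative inertia, i.e.
  // the number of negative eigenvalues (Sylvester's law of inertia).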
4101  for( Int ksup = 0; ksup < numSuper; ksup++ ){
4102  // I own the diagonal block
4103  if( MYROW( grid_ ) == PROW( ksup, grid_ ) &&
4104  MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
4105  LBlock<Real> & LB = this->L( LBj( ksup, grid_ ) )[0];
4106  for( Int i = 0; i < LB.numRow; i++ ){
4107  if( LB.nzval(i, i) < 0 )
4108  inertiaLocal++;
4109  }
4110  }
4111  }
4112 
4113  // Sum the local counts so that every processor obtains the total inertia
4114  mpi::Allreduce( &inertiaLocal, &inertia, 1, MPI_SUM, grid_->comm );
4115 
4116 #ifndef _RELEASE_
4117  PopCallStack();
4118 #endif
4119 
4120  return ;
4121  } // ----- end of method PMatrix::GetNegativeInertia -----
4122 
4123 
4124  template< >
4125  inline void PMatrix<Complex>::GetNegativeInertia ( Real& inertia )
4126  {
4127 #ifndef _RELEASE_
4128  PushCallStack("PMatrix::GetNegativeInertia");
4129 #endif
4130  Int numSuper = this->NumSuper();
4131 
4132  Real inertiaLocal = 0.0;
4133  inertia = 0.0;
4134 
4135  for( Int ksup = 0; ksup < numSuper; ksup++ ){
4136  // I own the diagonal block
4137  if( MYROW( grid_ ) == PROW( ksup, grid_ ) &&
4138  MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
4139  LBlock<Complex> & LB = this->L( LBj( ksup, grid_ ) )[0];
4140  for( Int i = 0; i < LB.numRow; i++ ){
4141  if( LB.nzval(i, i).real() < 0 )
4142  inertiaLocal++;
4143  }
4144  }
4145  }
4146 
4147  // Sum the local counts so that every processor obtains the total inertia
4148  mpi::Allreduce( &inertiaLocal, &inertia, 1, MPI_SUM, grid_->comm );
4149 
4150 #ifndef _RELEASE_
4151  PopCallStack();
4152 #endif
4153 
4154  return ;
4155  } // ----- end of method PMatrix::GetNegativeInertia -----
4156 
4157  template<typename T>
4158  inline PMatrix<T> * PMatrix<T>::Create(const GridType * pGridType, const SuperNodeType * pSuper, const SuperLUOptions * pLuOpt)
4159  {
4160  PMatrix<T> * pMat = NULL;
4161  pMat = new PMatrix<T>( pGridType, pSuper, pLuOpt );
4162 
4163  return pMat;
4164  } // ----- end of factory method PMatrix::Create -----
4165 
4166  template<typename T>
4167  inline PMatrix<T> * PMatrix<T>::Create(const SuperLUOptions * pLuOpt)
4168  {
4169  PMatrix<T> * pMat = NULL;
4170  pMat = new PMatrix<T>();
4171 
4172  return pMat;
4173  } // ----- end of factory method PMatrix::Create -----
4174 } // namespace PEXSI
4175 
4176 
4177 
4178 #endif //_PEXSI_PSELINV_IMPL_HPP_