PEXSI
pselinv_impl.hpp
1 /*
2  Copyright (c) 2012 The Regents of the University of California,
3  through Lawrence Berkeley National Laboratory.
4 
5 Authors: Lin Lin and Mathias Jacquelin
6 
7 This file is part of PEXSI. All rights reserved.
8 
9 Redistribution and use in source and binary forms, with or without
10 modification, are permitted provided that the following conditions are met:
11 
12 (1) Redistributions of source code must retain the above copyright notice, this
13 list of conditions and the following disclaimer.
14 (2) Redistributions in binary form must reproduce the above copyright notice,
15 this list of conditions and the following disclaimer in the documentation
16 and/or other materials provided with the distribution.
17 (3) Neither the name of the University of California, Lawrence Berkeley
18 National Laboratory, U.S. Dept. of Energy nor the names of its contributors may
19 be used to endorse or promote products derived from this software without
20 specific prior written permission.
21 
22 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
23 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
24 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
25 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
26 ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
27 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
29 ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 
33 You are under no obligation whatsoever to provide any bug fixes, patches, or
34 upgrades to the features, functionality or performance of the source code
35 ("Enhancements") to anyone; however, if you choose to make your Enhancements
36 available either publicly, or directly to Lawrence Berkeley National
37 Laboratory, without imposing a separate written license agreement for such
38 Enhancements, then you hereby grant the following license: a non-exclusive,
39 royalty-free perpetual license to install, use, modify, prepare derivative
40 works, incorporate into other computer software, distribute, and sublicense
41 such enhancements or derivative works thereof, in binary and source code form.
42  */
46 #ifndef _PEXSI_PSELINV_IMPL_HPP_
47 #define _PEXSI_PSELINV_IMPL_HPP_
48 
49 #include <list>
50 #include <limits>
51 
52 #include "pexsi/timer.h"
54 
55 
56 #define MPI_MAX_COMM (1024)
57 #define BCAST_THRESHOLD 16
58 
59 #define MOD(a,b) \
60  ( ((a)%(b)+(b))%(b))
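 // Note: the double modulo keeps the result in [0, b) even for negative a,
 // e.g. MOD(-3, 5) == 2, whereas the plain C++ expression (-3) % 5 yields -3.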
61 
62 #ifdef COMMUNICATOR_PROFILE
63 namespace PEXSI{
64  class CommunicatorStat{
65  public:
66  typedef std::map<std::vector<bool> , std::vector<Int> > bitMaskSet;
67  //Communicators for the Bcast variant
68  NumVec<Int> countSendToBelow_;
69  bitMaskSet maskSendToBelow_;
70 
71  NumVec<Int> countRecvFromBelow_;
72  bitMaskSet maskRecvFromBelow_;
73 
74  NumVec<Int> countSendToRight_;
75  bitMaskSet maskSendToRight_;
76  };
77 }
78 #endif
79 
80 
81 
82 namespace PEXSI{
83  inline GridType::GridType ( MPI_Comm Bcomm, int nprow, int npcol )
84  {
85 #ifndef _RELEASE_
86  PushCallStack("GridType::GridType");
87 #endif
88  Int info;
89  MPI_Initialized( &info );
90  if( !info ){
91 #ifdef USE_ABORT
92  abort();
93 #endif
94  throw std::logic_error( "MPI has not been initialized." );
95  }
96  MPI_Group comm_group;
97  MPI_Comm_group( Bcomm, &comm_group );
98  MPI_Comm_create( Bcomm, comm_group, &comm );
99  // comm = Bcomm;
100 
101  MPI_Comm_rank( comm, &mpirank );
102  MPI_Comm_size( comm, &mpisize );
103  if( mpisize != nprow * npcol ){
104 #ifdef USE_ABORT
105  abort();
106 #endif
107  throw std::logic_error( "mpisize != nprow * npcol." );
108  }
109 
110 
111  numProcRow = nprow;
112  numProcCol = npcol;
113 #ifdef SWAP_ROWS_COLS
114  Int myrow = mpirank % npcol;
115  Int mycol = mpirank / npcol;
116 #else
117  Int myrow = mpirank / npcol;
118  Int mycol = mpirank % npcol;
119 #endif
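 // For example, with nprow = 2, npcol = 3 and the default (non-swapped) mapping,
 // ranks are laid out row-major, so rank 4 maps to (myrow, mycol) = (4/3, 4%3) = (1, 1).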
120 
121  MPI_Comm_split( comm, myrow, mycol, &rowComm );
122  MPI_Comm_split( comm, mycol, myrow, &colComm );
123 
124  MPI_Group_free( &comm_group );
125 
126 #ifndef _RELEASE_
127  PopCallStack();
128 #endif
129 
130  return ;
131  } // ----- end of method GridType::GridType -----
132 
133 
134  inline GridType::~GridType ( )
135  {
136 #ifndef _RELEASE_
137  PushCallStack("GridType::~GridType");
138 #endif
139  // Do not free the external communicator Bcomm; only the communicators
140  // created in GridType() (comm, rowComm, colComm) are freed here.
140 
141  MPI_Comm_free( &rowComm );
142  MPI_Comm_free( &colComm );
143  MPI_Comm_free( &comm );
144 
145 #ifndef _RELEASE_
146  PopCallStack();
147 #endif
148  return ;
149  } // ----- end of method GridType::~GridType -----
150 }
151 
152 
153 namespace PEXSI{
154  template<typename T>
155  void PMatrix<T>::deallocate(){
156 
157  grid_ = NULL;
158  super_ = NULL;
159  options_ = NULL;
160  optionsLU_ = NULL;
161 
162  ColBlockIdx_.clear();
163  RowBlockIdx_.clear();
164  U_.clear();
165  L_.clear();
166  workingSet_.clear();
167 
168  // Communication variables
169  isSendToBelow_.Clear();
170  isSendToRight_.Clear();
171  isSendToDiagonal_.Clear();
172  isSendToCrossDiagonal_.Clear();
173 
174  isRecvFromBelow_.Clear();
175  isRecvFromAbove_.Clear();
176  isRecvFromLeft_.Clear();
177  isRecvFromCrossDiagonal_.Clear();
178 
179 
180  //Cleanup tree information
181 #ifdef NEW_BCAST
182  for(int i =0;i<fwdToBelowTree2_.size();++i){
183  if(fwdToBelowTree2_[i]!=NULL){
184  delete fwdToBelowTree2_[i];
185  fwdToBelowTree2_[i] = NULL;
186  }
187  }
188  for(int i =0;i<fwdToRightTree2_.size();++i){
189  if(fwdToRightTree2_[i]!=NULL){
190  delete fwdToRightTree2_[i];
191  fwdToRightTree2_[i] = NULL;
192  }
193  }
194 #else
195  for(int i =0;i<fwdToBelowTree_.size();++i){
196  if(fwdToBelowTree_[i]!=NULL){
197  delete fwdToBelowTree_[i];
198  fwdToBelowTree_[i] = NULL;
199  }
200  }
201 
202  for(int i =0;i<fwdToRightTree_.size();++i){
203  if(fwdToRightTree_[i]!=NULL){
204  delete fwdToRightTree_[i];
205  fwdToRightTree_[i] = NULL;
206  }
207  }
208 #endif
209 
210  for(int i =0;i<redToLeftTree_.size();++i){
211  if(redToLeftTree_[i]!=NULL){
212  delete redToLeftTree_[i];
213  redToLeftTree_[i] = NULL;
214  }
215  }
216  for(int i =0;i<redToAboveTree_.size();++i){
217  if(redToAboveTree_[i]!=NULL){
218  delete redToAboveTree_[i];
219  redToAboveTree_[i] = NULL;
220  }
221  }
222 
223 //dump comm_profile info
224 #if defined(COMM_PROFILE) || defined(COMM_PROFILE_BCAST)
225  if(commOFS.is_open()){
226  commOFS.close();
227  }
228 #endif
229 
230 
231  }
232 
233  template<typename T>
234  PMatrix<T> & PMatrix<T>::operator = ( const PMatrix<T> & C){
235  if(&C!=this){
236  //If we have some memory allocated, delete it
237  deallocate();
238 
239  grid_ = C.grid_;
240  super_ = C.super_;
241  options_ = C.options_;
242  optionsLU_ = C.optionsLU_;
243 
244 
245  maxTag_ = C.maxTag_;
246  limIndex_ = C.limIndex_;
247 
248  ColBlockIdx_ = C.ColBlockIdx_;
249  RowBlockIdx_ = C.RowBlockIdx_;
250  L_ = C.L_;
251  U_ = C.U_;
252 
253  workingSet_ = C.workingSet_;
254 
255 
256  // Communication variables
257  isSendToBelow_ = C.isSendToBelow_;
258  isSendToRight_ = C.isSendToRight_;
259  isSendToDiagonal_ = C.isSendToDiagonal_;
260  isSendToCrossDiagonal_ = C.isSendToCrossDiagonal_;
261 
262  isRecvFromBelow_ = C.isRecvFromBelow_;
263  isRecvFromAbove_ = C.isRecvFromAbove_;
264  isRecvFromLeft_ = C.isRecvFromLeft_;
265  isRecvFromCrossDiagonal_ = C.isRecvFromCrossDiagonal_;
266 
267 #ifdef NEW_BCAST
268  fwdToBelowTree2_.resize(C.fwdToBelowTree2_.size());
269  for(int i = 0 ; i< C.fwdToBelowTree2_.size();++i){
270  if(C.fwdToBelowTree2_[i]!=NULL){
271  fwdToBelowTree2_[i] = C.fwdToBelowTree2_[i]->clone();
272  }
273  }
274  fwdToRightTree2_.resize(C.fwdToRightTree2_.size());
275  for(int i = 0 ; i< C.fwdToRightTree2_.size();++i){
276  if(C.fwdToRightTree2_[i]!=NULL){
277  fwdToRightTree2_[i] = C.fwdToRightTree2_[i]->clone();
278  }
279  }
280 #else
281  fwdToBelowTree_.resize(C.fwdToBelowTree_.size());
282  for(int i = 0 ; i< C.fwdToBelowTree_.size();++i){
283  if(C.fwdToBelowTree_[i]!=NULL){
284  fwdToBelowTree_[i] = C.fwdToBelowTree_[i]->clone();
285  }
286  }
287  fwdToRightTree_.resize(C.fwdToRightTree_.size());
288  for(int i = 0 ; i< C.fwdToRightTree_.size();++i){
289  if(C.fwdToRightTree_[i]!=NULL){
290  fwdToRightTree_[i] = C.fwdToRightTree_[i]->clone();
291  }
292  }
293 #endif
294 
295  redToLeftTree_.resize(C.redToLeftTree_.size());
296  for(int i = 0 ; i< C.redToLeftTree_.size();++i){
297  if(C.redToLeftTree_[i]!=NULL){
298  redToLeftTree_[i] = C.redToLeftTree_[i]->clone();
299  }
300  }
301  redToAboveTree_.resize(C.redToAboveTree_.size());
302  for(int i = 0 ; i< C.redToAboveTree_.size();++i){
303  if(C.redToAboveTree_[i]!=NULL){
304  redToAboveTree_[i] = C.redToAboveTree_[i]->clone();
305  }
306  }
307 
308  }
309 
310  return *this;
311  }
312 
313  template<typename T>
314  PMatrix<T>::PMatrix( const PMatrix<T> & C):PMatrix(){
315  //If we have some memory allocated, delete it
316  deallocate();
317 
318  grid_ = C.grid_;
319  super_ = C.super_;
320  options_ = C.options_;
321  optionsLU_ = C.optionsLU_;
322 
323 
324  limIndex_ = C.limIndex_;
325  maxTag_ = C.maxTag_;
326  ColBlockIdx_ = C.ColBlockIdx_;
327  RowBlockIdx_ = C.RowBlockIdx_;
328  L_ = C.L_;
329  U_ = C.U_;
330 
331  workingSet_ = C.workingSet_;
332 
333 
334  // Communication variables
335  isSendToBelow_ = C.isSendToBelow_;
336  isSendToRight_ = C.isSendToRight_;
337  isSendToDiagonal_ = C.isSendToDiagonal_;
338  isSendToCrossDiagonal_ = C.isSendToCrossDiagonal_;
339 
340  isRecvFromBelow_ = C.isRecvFromBelow_;
341  isRecvFromAbove_ = C.isRecvFromAbove_;
342  isRecvFromLeft_ = C.isRecvFromLeft_;
343  isRecvFromCrossDiagonal_ = C.isRecvFromCrossDiagonal_;
344 
345 
346 
347 #ifdef NEW_BCAST
348  fwdToBelowTree2_.resize(C.fwdToBelowTree2_.size());
349  for(int i = 0 ; i< C.fwdToBelowTree2_.size();++i){
350  if(C.fwdToBelowTree2_[i]!=NULL){
351  fwdToBelowTree2_[i] = C.fwdToBelowTree2_[i]->clone();
352  }
353  }
354  fwdToRightTree2_.resize(C.fwdToRightTree2_.size());
355  for(int i = 0 ; i< C.fwdToRightTree2_.size();++i){
356  if(C.fwdToRightTree2_[i]!=NULL){
357  fwdToRightTree2_[i] = C.fwdToRightTree2_[i]->clone();
358  }
359  }
360 #else
361  fwdToBelowTree_.resize(C.fwdToBelowTree_.size());
362  for(int i = 0 ; i< C.fwdToBelowTree_.size();++i){
363  if(C.fwdToBelowTree_[i]!=NULL){
364  fwdToBelowTree_[i] = C.fwdToBelowTree_[i]->clone();
365  }
366  }
367  fwdToRightTree_.resize(C.fwdToRightTree_.size());
368  for(int i = 0 ; i< C.fwdToRightTree_.size();++i){
369  if(C.fwdToRightTree_[i]!=NULL){
370  fwdToRightTree_[i] = C.fwdToRightTree_[i]->clone();
371  }
372  }
373 #endif
374 
375  redToLeftTree_.resize(C.redToLeftTree_.size());
376  for(int i = 0 ; i< C.redToLeftTree_.size();++i){
377  if(C.redToLeftTree_[i]!=NULL){
378  redToLeftTree_[i] = C.redToLeftTree_[i]->clone();
379  }
380  }
381  redToAboveTree_.resize(C.redToAboveTree_.size());
382  for(int i = 0 ; i< C.redToAboveTree_.size();++i){
383  if(C.redToAboveTree_[i]!=NULL){
384  redToAboveTree_[i] = C.redToAboveTree_[i]->clone();
385  }
386  }
387 
388  }
389 
390 
391  template<typename T>
392  PMatrix<T>::PMatrix ( ){
393 
394  }
395 
396  template<typename T>
397  PMatrix<T>::PMatrix (
398  const GridType* g,
399  const SuperNodeType* s,
400  const PEXSI::PSelInvOptions * o,
401  const PEXSI::SuperLUOptions * oLU
402  ):PMatrix()
403  {
404 #ifndef _RELEASE_
405  PushCallStack("PMatrix::PMatrix");
406 #endif
407 
408  this->Setup( g, s, o, oLU );
409 
410 
411 #ifndef _RELEASE_
412  PopCallStack();
413 #endif
414  return ;
415  } // ----- end of method PMatrix::PMatrix -----
416 
417  template<typename T>
418  PMatrix<T>::~PMatrix ( )
419  {
420 
421 #ifndef _RELEASE_
422  PushCallStack("PMatrix::~PMatrix");
423 #endif
424 
425  deallocate();
426 
427 #ifndef _RELEASE_
428  PopCallStack();
429 #endif
430  return ;
431  } // ----- end of method PMatrix::~PMatrix -----
432 
433  template<typename T>
434  void PMatrix<T>::Setup(
435  const GridType* g,
436  const SuperNodeType* s,
437  const PEXSI::PSelInvOptions * o,
438  const PEXSI::SuperLUOptions * oLU
439  )
440  {
441 #ifndef _RELEASE_
442  PushCallStack("PMatrix::Setup");
443 #endif
444 
445  grid_ = g;
446  super_ = s;
447  options_ = o;
448  optionsLU_ = oLU;
449 
450  // if( grid_->numProcRow != grid_->numProcCol ){
451  // #ifdef USE_ABORT
452  //abort();
453  //#endif
454  //throw std::runtime_error( "The current version of SelInv only works for square processor grids." ); }
455 
456  L_.clear();
457  U_.clear();
458  ColBlockIdx_.clear();
459  RowBlockIdx_.clear();
460 
461  L_.resize( this->NumLocalBlockCol() );
462  U_.resize( this->NumLocalBlockRow() );
463 
464  ColBlockIdx_.resize( this->NumLocalBlockCol() );
465  RowBlockIdx_.resize( this->NumLocalBlockRow() );
466 
467  //workingSet_.resize(this->NumSuper());
468 #if ( _DEBUGlevel_ >= 1 )
469  statusOFS << std::endl << "PMatrix is constructed. The grid information: " << std::endl;
470  statusOFS << "mpirank = " << MYPROC(grid_) << std::endl;
471  statusOFS << "myrow = " << MYROW(grid_) << std::endl;
472  statusOFS << "mycol = " << MYCOL(grid_) << std::endl;
473 #endif
474 
475  //Get maxTag value and compute the max depth we can use
476  //int * pmaxTag;
477  int pmaxTag = 4000;
478 // int flag;
479 // MPI_Comm_get_attr(grid_->comm, MPI_TAG_UB, (void*)&pmaxTag, &flag);
480  //maxTag_ = *pmaxTag;
481  maxTag_ = pmaxTag;
482  limIndex_ = (Int)maxTag_/(Int)SELINV_TAG_COUNT -1;
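 // Each supernode consumes SELINV_TAG_COUNT distinct MPI tags, so limIndex_ bounds how many
 // supernode ranks fit under maxTag_ before IDX_TO_TAG must wrap around.
 // Illustrative arithmetic only (SELINV_TAG_COUNT = 10 is assumed purely for the example):
 // limIndex_ = 4000/10 - 1 = 399.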
483 
484 
485 #if defined(COMM_PROFILE) || defined(COMM_PROFILE_BCAST)
486  std::stringstream ss3;
487  ss3 << "comm_stat" << MYPROC(this->grid_);
488  if(!commOFS.is_open()){
489  commOFS.open( ss3.str().c_str());
490  }
491 #endif
492 
493 
494 #ifndef _RELEASE_
495  PopCallStack();
496 #endif
497 
498  return ;
499  } // ----- end of method PMatrix::Setup -----
500 
501 
503  template<typename T>
504  inline void PMatrix<T>::SelInv_lookup_indexes(
505  SuperNodeBufferType & snode,
506  std::vector<LBlock<T> > & LcolRecv,
507  std::vector<UBlock<T> > & UrowRecv,
508  NumMat<T> & AinvBuf,
509  NumMat<T> & UBuf )
510  {
511  TIMER_START(Compute_Sinv_LT_Lookup_Indexes);
512 
513  TIMER_START(Build_colptr_rowptr);
514  // rowPtr[ib] gives the row index in snode.LUpdateBuf for the first
515  // nonzero row in LcolRecv[ib]. The total number of rows in
516  // snode.LUpdateBuf is given by the last entry, rowPtr[LcolRecv.size()]
517  std::vector<Int> rowPtr(LcolRecv.size() + 1);
518  // colPtr[jb] gives the column index in UBuf for the first
519  // nonzero column in UrowRecv[jb]. The total number of columns in
520  // UBuf is given by the last entry, colPtr[UrowRecv.size()]
521  std::vector<Int> colPtr(UrowRecv.size() + 1);
522 
523  rowPtr[0] = 0;
524  for( Int ib = 0; ib < LcolRecv.size(); ib++ ){
525  rowPtr[ib+1] = rowPtr[ib] + LcolRecv[ib].numRow;
526  }
527  colPtr[0] = 0;
528  for( Int jb = 0; jb < UrowRecv.size(); jb++ ){
529  colPtr[jb+1] = colPtr[jb] + UrowRecv[jb].numCol;
530  }
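 // Worked example: if the received L blocks have numRow = {3, 5, 2}, then
 // rowPtr = {0, 3, 8, 10}, block ib occupies rows [rowPtr[ib], rowPtr[ib+1]) of
 // AinvBuf, and AinvBuf gets 10 rows; colPtr plays the same role for the columns.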
531 
532  Int numRowAinvBuf = *rowPtr.rbegin();
533  Int numColAinvBuf = *colPtr.rbegin();
534  TIMER_STOP(Build_colptr_rowptr);
535 
536  TIMER_START(Allocate_lookup);
537  // Allocate for the computational storage
538  AinvBuf.Resize( numRowAinvBuf, numColAinvBuf );
539  UBuf.Resize( SuperSize( snode.Index, super_ ), numColAinvBuf );
540  // TIMER_START(SetValue_lookup);
541  // SetValue( AinvBuf, ZERO<T>() );
542  //SetValue( snode.LUpdateBuf, ZERO<T>() );
543  // SetValue( UBuf, ZERO<T>() );
544  // TIMER_STOP(SetValue_lookup);
545  TIMER_STOP(Allocate_lookup);
546 
547  TIMER_START(Fill_UBuf);
548  // Fill UBuf first. Make the transpose later in the Gemm phase.
549  for( Int jb = 0; jb < UrowRecv.size(); jb++ ){
550  UBlock<T>& UB = UrowRecv[jb];
551  if( UB.numRow != SuperSize(snode.Index, super_) ){
552  throw std::logic_error( "The size of UB is not right. Something is seriously wrong." );
553  }
554  lapack::Lacpy( 'A', UB.numRow, UB.numCol, UB.nzval.Data(),
555  UB.numRow, UBuf.VecData( colPtr[jb] ), SuperSize( snode.Index, super_ ) );
556  }
557  TIMER_STOP(Fill_UBuf);
558 
559  // Calculate the relative indices for (isup, jsup)
560  // Fill AinvBuf with the information in L or U block.
561  TIMER_START(JB_Loop);
562 
563 #ifdef STDFIND
564  // for( Int jb = 0; jb < UrowRecv.size(); jb++ ){
565  //
566  // UBlock& UB = UrowRecv[jb];
567  // Int jsup = UB.blockIdx;
568  // Int SinvColsSta = FirstBlockCol( jsup, super_ );
569  //
570  // // Column relative indicies
571  // std::vector<Int> relCols( UB.numCol );
572  // for( Int j = 0; j < UB.numCol; j++ ){
573  // relCols[j] = UB.cols[j] - SinvColsSta;
574  // }
575  //
576  //
577  //
578  //
579  // for( Int ib = 0; ib < LcolRecv.size(); ib++ ){
580  // LBlock& LB = LcolRecv[ib];
581  // Int isup = LB.blockIdx;
582  // Int SinvRowsSta = FirstBlockCol( isup, super_ );
583  // Scalar* nzvalAinv = &AinvBuf( rowPtr[ib], colPtr[jb] );
584  // Int ldAinv = numRowAinvBuf;
585  //
586  // // Pin down the corresponding block in the part of Sinv.
587  // if( isup >= jsup ){
588  // std::vector<LBlock>& LcolSinv = this->L( LBj(jsup, grid_ ) );
589  // bool isBlockFound = false;
590  // TIMER_START(PARSING_ROW_BLOCKIDX);
591  // for( Int ibSinv = 0; ibSinv < LcolSinv.size(); ibSinv++ ){
592  // // Found the (isup, jsup) block in Sinv
593  // if( LcolSinv[ibSinv].blockIdx == isup ){
594  // LBlock& SinvB = LcolSinv[ibSinv];
595  //
596  // // Row relative indices
597  // std::vector<Int> relRows( LB.numRow );
598  // Int* rowsLBPtr = LB.rows.Data();
599  // Int* rowsSinvBPtr = SinvB.rows.Data();
600  //
601  // TIMER_START(STDFIND_ROW);
602  // Int * pos =&rowsSinvBPtr[0];
603  // Int * last =&rowsSinvBPtr[SinvB.numRow];
604  // for( Int i = 0; i < LB.numRow; i++ ){
605  // // pos = std::find(pos, &rowsSinvBPtr[SinvB.numRow-1], rowsLBPtr[i]);
606  // pos = std::find(rowsSinvBPtr, last, rowsLBPtr[i]);
607  // if(pos != last){
608  // relRows[i] = (Int)(pos - rowsSinvBPtr);
609  // }
610  // else{
611  // std::ostringstream msg;
612  // msg << "Row " << rowsLBPtr[i] <<
613  // " in LB cannot find the corresponding row in SinvB" << std::endl
614  // << "LB.rows = " << LB.rows << std::endl
615  // << "SinvB.rows = " << SinvB.rows << std::endl;
616  // throw std::runtime_error( msg.str().c_str() );
617  // }
618  // }
619  // TIMER_STOP(STDFIND_ROW);
620  //
621  // TIMER_START(Copy_Sinv_to_Ainv);
622  // // Transfer the values from Sinv to AinvBlock
623  // Scalar* nzvalSinv = SinvB.nzval.Data();
624  // Int ldSinv = SinvB.numRow;
625  // for( Int j = 0; j < UB.numCol; j++ ){
626  // for( Int i = 0; i < LB.numRow; i++ ){
627  // nzvalAinv[i+j*ldAinv] =
628  // nzvalSinv[relRows[i] + relCols[j] * ldSinv];
629  // }
630  // }
631  // TIMER_STOP(Copy_Sinv_to_Ainv);
632  //
633  // isBlockFound = true;
634  // break;
635  // }
636  // } // for (ibSinv )
637  // TIMER_STOP(PARSING_ROW_BLOCKIDX);
638  // if( isBlockFound == false ){
639  // std::ostringstream msg;
640  // msg << "Block(" << isup << ", " << jsup
641  // << ") did not find a matching block in Sinv." << std::endl;
642  // throw std::runtime_error( msg.str().c_str() );
643  // }
644  // } // if (isup, jsup) is in L
645  // else{
646  // // Row relative indices
647  // std::vector<Int> relRows( LB.numRow );
648  // Int SinvRowsSta = FirstBlockCol( isup, super_ );
649  // for( Int i = 0; i < LB.numRow; i++ ){
650  // relRows[i] = LB.rows[i] - SinvRowsSta;
651  // }
652  // std::vector<UBlock>& UrowSinv = this->U( LBi( isup, grid_ ) );
653  // bool isBlockFound = false;
654  // TIMER_START(PARSING_COL_BLOCKIDX);
655  // for( Int jbSinv = 0; jbSinv < UrowSinv.size(); jbSinv++ ){
656  // // Found the (isup, jsup) block in Sinv
657  // if( UrowSinv[jbSinv].blockIdx == jsup ){
658  // UBlock& SinvB = UrowSinv[jbSinv];
659  //
660  //
661  //
662  // // Column relative indices
663  // std::vector<Int> relCols( UB.numCol );
664  // Int* colsUBPtr = UB.cols.Data();
665  // Int* colsSinvBPtr = SinvB.cols.Data();
666  // TIMER_START(STDFIND_COL);
667  // Int * pos =&colsSinvBPtr[0];
668  // Int * last =&colsSinvBPtr[SinvB.numCol];
669  // for( Int j = 0; j < UB.numCol; j++ ){
670  // //colsUB is sorted
671  // pos = std::find(colsSinvBPtr, last, colsUBPtr[j]);
672  // if(pos !=last){
673  // relCols[j] = (Int)(pos - colsSinvBPtr);
674  // }
675  // else{
676  // std::ostringstream msg;
677  // msg << "Col " << colsUBPtr[j] <<
678  // " in UB cannot find the corresponding row in SinvB" << std::endl
679  // << "UB.cols = " << UB.cols << std::endl
680  // << "UinvB.cols = " << SinvB.cols << std::endl;
681  // throw std::runtime_error( msg.str().c_str() );
682  // }
683  // }
684  // TIMER_STOP(STDFIND_COL);
685  //
686  //
687  // TIMER_START(Copy_Sinv_to_Ainv);
688  // // Transfer the values from Sinv to AinvBlock
689  // Scalar* nzvalSinv = SinvB.nzval.Data();
690  // Int ldSinv = SinvB.numRow;
691  // for( Int j = 0; j < UB.numCol; j++ ){
692  // for( Int i = 0; i < LB.numRow; i++ ){
693  // nzvalAinv[i+j*ldAinv] =
694  // nzvalSinv[relRows[i] + relCols[j] * ldSinv];
695  // }
696  // }
697  // TIMER_STOP(Copy_Sinv_to_Ainv);
698  //
699  // isBlockFound = true;
700  // break;
701  // }
702  // } // for (jbSinv)
703  // TIMER_STOP(PARSING_COL_BLOCKIDX);
704  // if( isBlockFound == false ){
705  // std::ostringstream msg;
706  // msg << "Block(" << isup << ", " << jsup
707  // << ") did not find a matching block in Sinv." << std::endl;
708  // throw std::runtime_error( msg.str().c_str() );
709  // }
710  // } // if (isup, jsup) is in U
711  //
712  // } // for( ib )
713  // } // for ( jb )
714 #else
715  for( Int jb = 0; jb < UrowRecv.size(); jb++ ){
716  for( Int ib = 0; ib < LcolRecv.size(); ib++ ){
717  LBlock<T>& LB = LcolRecv[ib];
718  UBlock<T>& UB = UrowRecv[jb];
719  Int isup = LB.blockIdx;
720  Int jsup = UB.blockIdx;
721  T* nzvalAinv = &AinvBuf( rowPtr[ib], colPtr[jb] );
722  Int ldAinv = AinvBuf.m();
723 
724  // Pin down the corresponding block in the part of Sinv.
725  if( isup >= jsup ){
726  std::vector<LBlock<T> >& LcolSinv = this->L( LBj(jsup, grid_ ) );
727  bool isBlockFound = false;
728  TIMER_START(PARSING_ROW_BLOCKIDX);
729  for( Int ibSinv = 0; ibSinv < LcolSinv.size(); ibSinv++ ){
730  // Found the (isup, jsup) block in Sinv
731  if( LcolSinv[ibSinv].blockIdx == isup ){
732  LBlock<T> & SinvB = LcolSinv[ibSinv];
733 
734  // Row relative indices
735  std::vector<Int> relRows( LB.numRow );
736  Int* rowsLBPtr = LB.rows.Data();
737  Int* rowsSinvBPtr = SinvB.rows.Data();
738  for( Int i = 0; i < LB.numRow; i++ ){
739  bool isRowFound = false;
740  for( Int i1 = 0; i1 < SinvB.numRow; i1++ ){
741  if( rowsLBPtr[i] == rowsSinvBPtr[i1] ){
742  isRowFound = true;
743  relRows[i] = i1;
744  break;
745  }
746  }
747  if( isRowFound == false ){
748  std::ostringstream msg;
749  msg << "Row " << rowsLBPtr[i] <<
750  " in LB cannot find the corresponding row in SinvB" << std::endl
751  << "LB.rows = " << LB.rows << std::endl
752  << "SinvB.rows = " << SinvB.rows << std::endl;
753  throw std::runtime_error( msg.str().c_str() );
754  }
755  }
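 // E.g. if LB.rows = {12, 17} and SinvB.rows = {10, 12, 15, 17}, the search above
 // yields relRows = {1, 3}, i.e. the positions of the L rows inside the Sinv block.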
756 
757  // Column relative indices
758  std::vector<Int> relCols( UB.numCol );
759  Int SinvColsSta = FirstBlockCol( jsup, super_ );
760  for( Int j = 0; j < UB.numCol; j++ ){
761  relCols[j] = UB.cols[j] - SinvColsSta;
762  }
763 
764  // Transfer the values from Sinv to AinvBlock
765  T* nzvalSinv = SinvB.nzval.Data();
766  Int ldSinv = SinvB.numRow;
767  for( Int j = 0; j < UB.numCol; j++ ){
768  for( Int i = 0; i < LB.numRow; i++ ){
769  nzvalAinv[i+j*ldAinv] =
770  nzvalSinv[relRows[i] + relCols[j] * ldSinv];
771  }
772  }
773 
774  isBlockFound = true;
775  break;
776  }
777  } // for (ibSinv )
778  TIMER_STOP(PARSING_ROW_BLOCKIDX);
779  if( isBlockFound == false ){
780  std::ostringstream msg;
781  msg << "Block(" << isup << ", " << jsup
782  << ") did not find a matching block in Sinv." << std::endl;
783  throw std::runtime_error( msg.str().c_str() );
784  }
785  } // if (isup, jsup) is in L
786  else{
787  std::vector<UBlock<T> >& UrowSinv = this->U( LBi( isup, grid_ ) );
788  bool isBlockFound = false;
789  TIMER_START(PARSING_COL_BLOCKIDX);
790  for( Int jbSinv = 0; jbSinv < UrowSinv.size(); jbSinv++ ){
791  // Found the (isup, jsup) block in Sinv
792  if( UrowSinv[jbSinv].blockIdx == jsup ){
793  UBlock<T> & SinvB = UrowSinv[jbSinv];
794 
795  // Row relative indices
796  std::vector<Int> relRows( LB.numRow );
797  Int SinvRowsSta = FirstBlockCol( isup, super_ );
798  for( Int i = 0; i < LB.numRow; i++ ){
799  relRows[i] = LB.rows[i] - SinvRowsSta;
800  }
801 
802  // Column relative indices
803  std::vector<Int> relCols( UB.numCol );
804  Int* colsUBPtr = UB.cols.Data();
805  Int* colsSinvBPtr = SinvB.cols.Data();
806  for( Int j = 0; j < UB.numCol; j++ ){
807  bool isColFound = false;
808  for( Int j1 = 0; j1 < SinvB.numCol; j1++ ){
809  if( colsUBPtr[j] == colsSinvBPtr[j1] ){
810  isColFound = true;
811  relCols[j] = j1;
812  break;
813  }
814  }
815  if( isColFound == false ){
816  std::ostringstream msg;
817  msg << "Col " << colsUBPtr[j] <<
818  " in UB cannot find the corresponding row in SinvB" << std::endl
819  << "UB.cols = " << UB.cols << std::endl
820  << "UinvB.cols = " << SinvB.cols << std::endl;
821  throw std::runtime_error( msg.str().c_str() );
822  }
823  }
824 
825 
826  // Transfer the values from Sinv to AinvBlock
827  T* nzvalSinv = SinvB.nzval.Data();
828  Int ldSinv = SinvB.numRow;
829  for( Int j = 0; j < UB.numCol; j++ ){
830  for( Int i = 0; i < LB.numRow; i++ ){
831  nzvalAinv[i+j*ldAinv] =
832  nzvalSinv[relRows[i] + relCols[j] * ldSinv];
833  }
834  }
835 
836  isBlockFound = true;
837  break;
838  }
839  } // for (jbSinv)
840  TIMER_STOP(PARSING_COL_BLOCKIDX);
841  if( isBlockFound == false ){
842  std::ostringstream msg;
843  msg << "Block(" << isup << ", " << jsup
844  << ") did not find a matching block in Sinv." << std::endl;
845  throw std::runtime_error( msg.str().c_str() );
846  }
847  } // if (isup, jsup) is in U
848 
849  } // for( ib )
850  } // for ( jb )
851 
852 
853 #endif
854  TIMER_STOP(JB_Loop);
855 
856 
857  TIMER_STOP(Compute_Sinv_LT_Lookup_Indexes);
858  }
859 
860  template<typename T>
861  inline void PMatrix<T>::SendRecvCD_UpdateU(
862  std::vector<SuperNodeBufferType > & arrSuperNodes,
863  Int stepSuper)
864  {
865 
866  TIMER_START(Send_CD_Update_U);
867  //compute the number of requests
868  Int sendCount = 0;
869  Int recvCount = 0;
870  Int sendOffset[stepSuper];
871  Int recvOffset[stepSuper];
872  Int recvIdx=0;
873  for (Int supidx=0; supidx<stepSuper; supidx++){
874  SuperNodeBufferType & snode = arrSuperNodes[supidx];
875  sendOffset[supidx]=sendCount;
876  recvOffset[supidx]=recvCount;
877  sendCount+= CountSendToCrossDiagonal(snode.Index);
878  recvCount+= CountRecvFromCrossDiagonal(snode.Index);
879  }
880 
881 
882  std::vector<MPI_Request > arrMpiReqsSendCD(sendCount, MPI_REQUEST_NULL );
883  std::vector<MPI_Request > arrMpiReqsSizeSendCD(sendCount, MPI_REQUEST_NULL );
884 
885  std::vector<MPI_Request > arrMpiReqsRecvCD(recvCount, MPI_REQUEST_NULL );
886  std::vector<MPI_Request > arrMpiReqsSizeRecvCD(recvCount, MPI_REQUEST_NULL );
887  std::vector<std::vector<char> > arrSstrLcolSendCD(sendCount);
888  std::vector<int > arrSstrLcolSizeSendCD(sendCount);
889  std::vector<std::vector<char> > arrSstrLcolRecvCD(recvCount);
890  std::vector<int > arrSstrLcolSizeRecvCD(recvCount);
891 
892  for (Int supidx=0; supidx<stepSuper; supidx++){
893  SuperNodeBufferType & snode = arrSuperNodes[supidx];
894 
895  // Send LUpdateBufReduced to the cross diagonal blocks.
896  // NOTE: This assumes square processor grid
897 
898  TIMER_START(Send_L_CrossDiag);
899 
900  if( MYCOL( grid_ ) == PCOL( snode.Index, grid_ ) && isSendToCrossDiagonal_(grid_->numProcCol, snode.Index ) ){
901 
902  Int sendIdx = 0;
903  for(Int dstCol = 0; dstCol<grid_->numProcCol; dstCol++){
904  if(isSendToCrossDiagonal_(dstCol,snode.Index) ){
905  Int dest = PNUM(PROW(snode.Index,grid_),dstCol,grid_);
906 
907  if( MYPROC( grid_ ) != dest ){
908  MPI_Request & mpiReqSizeSend = arrMpiReqsSizeSendCD[sendOffset[supidx]+sendIdx];
909  MPI_Request & mpiReqSend = arrMpiReqsSendCD[sendOffset[supidx]+sendIdx];
910 
911 
912  std::stringstream sstm;
913  std::vector<char> & sstrLcolSend = arrSstrLcolSendCD[sendOffset[supidx]+sendIdx];
914  Int & sstrSize = arrSstrLcolSizeSendCD[sendOffset[supidx]+sendIdx];
915 
916  serialize( snode.RowLocalPtr, sstm, NO_MASK );
917  serialize( snode.BlockIdxLocal, sstm, NO_MASK );
918  serialize( snode.LUpdateBuf, sstm, NO_MASK );
919 
920  sstrLcolSend.resize( Size(sstm) );
921  sstm.read( &sstrLcolSend[0], sstrLcolSend.size() );
922  sstrSize = sstrLcolSend.size();
923 
924 #if ( _DEBUGlevel_ >= 1 )
925  assert(IDX_TO_TAG(snode.Rank,SELINV_TAG_L_SIZE_CD,limIndex_)<=maxTag_);
926  assert(IDX_TO_TAG(snode.Rank,SELINV_TAG_L_CONTENT_CD,limIndex_)<=maxTag_);
927 #endif
928 
929  MPI_Isend( &sstrSize, sizeof(sstrSize), MPI_BYTE, dest, IDX_TO_TAG(snode.Rank,SELINV_TAG_L_SIZE_CD,limIndex_), grid_->comm, &mpiReqSizeSend );
930  MPI_Isend( (void*)&sstrLcolSend[0], sstrSize, MPI_BYTE, dest, IDX_TO_TAG(snode.Rank,SELINV_TAG_L_CONTENT_CD,limIndex_), grid_->comm, &mpiReqSend );
931 
932  PROFILE_COMM(MYPROC(this->grid_),dest,IDX_TO_TAG(snode.Rank,SELINV_TAG_L_SIZE_CD,limIndex_),sizeof(sstrSize));
933  PROFILE_COMM(MYPROC(this->grid_),dest,IDX_TO_TAG(snode.Rank,SELINV_TAG_L_CONTENT_CD,limIndex_),sstrSize);
934 
935 
936  sendIdx++;
937  }
938  }
939  }
940 
941 
942  } // sender
943  TIMER_STOP(Send_L_CrossDiag);
944  }
945 
946 
947  //Do Irecv for sizes
948  for (Int supidx=0; supidx<stepSuper; supidx++){
949  SuperNodeBufferType & snode = arrSuperNodes[supidx];
950  //If I'm a receiver
951  if( MYROW( grid_ ) == PROW( snode.Index, grid_ ) && isRecvFromCrossDiagonal_(grid_->numProcRow, snode.Index ) ){
952  Int recvIdx=0;
953  for(Int srcRow = 0; srcRow<grid_->numProcRow; srcRow++){
954  if(isRecvFromCrossDiagonal_(srcRow,snode.Index) ){
955  Int src = PNUM(srcRow,PCOL(snode.Index,grid_),grid_);
956  if( MYPROC( grid_ ) != src ){
957  Int & sstrSize = arrSstrLcolSizeRecvCD[recvOffset[supidx]+recvIdx];
958  MPI_Request & mpiReqSizeRecv = arrMpiReqsSizeRecvCD[recvOffset[supidx]+recvIdx];
959 
960 #if ( _DEBUGlevel_ >= 1 )
961  assert(IDX_TO_TAG(snode.Rank,SELINV_TAG_L_SIZE_CD,limIndex_)<=maxTag_);
962 #endif
963  MPI_Irecv( &sstrSize, 1, MPI_INT, src, IDX_TO_TAG(snode.Rank,SELINV_TAG_L_SIZE_CD,limIndex_), grid_->comm, &mpiReqSizeRecv);
964  recvIdx++;
965  }
966  }
967  }
968  }//end if I'm a receiver
969  }
970 
971  //waitall sizes
972  mpi::Waitall(arrMpiReqsSizeRecvCD);
973  //Allocate content and do Irecv
974  for (Int supidx=0; supidx<stepSuper; supidx++){
975  SuperNodeBufferType & snode = arrSuperNodes[supidx];
976  //If I'm a receiver
977  if( MYROW( grid_ ) == PROW( snode.Index, grid_ ) && isRecvFromCrossDiagonal_(grid_->numProcRow, snode.Index ) ){
978  Int recvIdx=0;
979  for(Int srcRow = 0; srcRow<grid_->numProcRow; srcRow++){
980  if(isRecvFromCrossDiagonal_(srcRow,snode.Index) ){
981  Int src = PNUM(srcRow,PCOL(snode.Index,grid_),grid_);
982  if( MYPROC( grid_ ) != src ){
983  Int & sstrSize = arrSstrLcolSizeRecvCD[recvOffset[supidx]+recvIdx];
984  std::vector<char> & sstrLcolRecv = arrSstrLcolRecvCD[recvOffset[supidx]+recvIdx];
985  MPI_Request & mpiReqRecv = arrMpiReqsRecvCD[recvOffset[supidx]+recvIdx];
986  sstrLcolRecv.resize( sstrSize);
987 
988 #if ( _DEBUGlevel_ >= 1 )
989  assert(IDX_TO_TAG(snode.Rank,SELINV_TAG_L_CONTENT_CD,limIndex_)<=maxTag_);
990 #endif
991  MPI_Irecv( (void*)&sstrLcolRecv[0], sstrSize, MPI_BYTE, src, IDX_TO_TAG(snode.Rank,SELINV_TAG_L_CONTENT_CD,limIndex_), grid_->comm, &mpiReqRecv );
992  //statusOFS<<"P"<<MYPROC(grid_)<<" received "<<sstrSize<<" bytes of L/U from CD P"<<src<<std::endl;
993  recvIdx++;
994  }
995  }
996  }
997  }//end if I'm a receiver
998  }
999 
1000  //waitall content
1001  mpi::Waitall(arrMpiReqsRecvCD);
1002  //Do the work
1003  for (Int supidx=0; supidx<stepSuper; supidx++){
1004  SuperNodeBufferType & snode = arrSuperNodes[supidx];
1005 
1006  // Receive LUpdateBufReduced from the cross-diagonal blocks and update the U part.
1007  // NOTE: This assumes square processor grid
1008  if( MYROW( grid_ ) == PROW( snode.Index, grid_ ) && isRecvFromCrossDiagonal_(grid_->numProcRow, snode.Index ) ){
1009 
1010 #if ( _DEBUGlevel_ >= 1 )
1011  statusOFS << std::endl << " ["<<snode.Index<<"] "<< "Update the upper triangular block" << std::endl << std::endl;
1012  statusOFS << std::endl << " ["<<snode.Index<<"] "<< "blockIdxLocal:" << snode.BlockIdxLocal << std::endl << std::endl;
1013  statusOFS << std::endl << " ["<<snode.Index<<"] "<< "rowLocalPtr:" << snode.RowLocalPtr << std::endl << std::endl;
1014 #endif
1015 
1016  std::vector<UBlock<T> >& Urow = this->U( LBi( snode.Index, grid_ ) );
1017  std::vector<bool> isBlockFound(Urow.size(),false);
1018 
1019  recvIdx=0;
1020  for(Int srcRow = 0; srcRow<grid_->numProcRow; srcRow++){
1021  if(isRecvFromCrossDiagonal_(srcRow,snode.Index) ){
1022  Int src = PNUM(srcRow,PCOL(snode.Index,grid_),grid_);
1023  TIMER_START(Recv_L_CrossDiag);
1024 
1025  std::vector<Int> rowLocalPtrRecv;
1026  std::vector<Int> blockIdxLocalRecv;
1027  NumMat<T> UUpdateBuf;
1028 
1029  if( MYPROC( grid_ ) != src ){
1030  std::stringstream sstm;
1031  Int & sstrSize = arrSstrLcolSizeRecvCD[recvOffset[supidx]+recvIdx];
1032  std::vector<char> & sstrLcolRecv = arrSstrLcolRecvCD[recvOffset[supidx]+recvIdx];
1033  sstm.write( &sstrLcolRecv[0], sstrSize );
1034 
1035  deserialize( rowLocalPtrRecv, sstm, NO_MASK );
1036  deserialize( blockIdxLocalRecv, sstm, NO_MASK );
1037  deserialize( UUpdateBuf, sstm, NO_MASK );
1038 
1039  recvIdx++;
1040 
1041  } // sender is not the same as receiver
1042  else{
1043  rowLocalPtrRecv = snode.RowLocalPtr;
1044  blockIdxLocalRecv = snode.BlockIdxLocal;
1045  UUpdateBuf = snode.LUpdateBuf;
1046  } // sender is the same as receiver
1047 
1048 
1049 
1050  TIMER_STOP(Recv_L_CrossDiag);
1051 
1052 #if ( _DEBUGlevel_ >= 1 )
1053  statusOFS<<" ["<<snode.Index<<"] P"<<MYPROC(grid_)<<" ("<<MYROW(grid_)<<","<<MYCOL(grid_)<<") <--- LBj("<<snode.Index<<") <--- P"<<src<<std::endl;
1054  statusOFS << std::endl << " ["<<snode.Index<<"] "<< "rowLocalPtrRecv:" << rowLocalPtrRecv << std::endl << std::endl;
1055  statusOFS << std::endl << " ["<<snode.Index<<"] "<< "blockIdxLocalRecv:" << blockIdxLocalRecv << std::endl << std::endl;
1056 #endif
1057 
1058 
1059  // Update U
1060  for( Int ib = 0; ib < blockIdxLocalRecv.size(); ib++ ){
1061  for( Int jb = 0; jb < Urow.size(); jb++ ){
1062  UBlock<T>& UB = Urow[jb];
1063  if( UB.blockIdx == blockIdxLocalRecv[ib] ){
1064  NumMat<T> Ltmp ( UB.numCol, UB.numRow );
1065  lapack::Lacpy( 'A', Ltmp.m(), Ltmp.n(),
1066  &UUpdateBuf( rowLocalPtrRecv[ib], 0 ),
1067  UUpdateBuf.m(), Ltmp.Data(), Ltmp.m() );
1068  isBlockFound[jb] = true;
1069  Transpose( Ltmp, UB.nzval );
1070  break;
1071  }
1072  }
1073  }
1074  }
1075  }
1076 
1077  for( Int jb = 0; jb < Urow.size(); jb++ ){
1078  UBlock<T>& UB = Urow[jb];
1079  if( !isBlockFound[jb] ){
1080 #ifdef USE_ABORT
1081  abort();
1082 #endif
1083  throw std::logic_error( "UBlock cannot find its update. Something is seriously wrong." );
1084  }
1085  }
1086  } // receiver
1087  }
1088 
1089  TIMER_STOP(Send_CD_Update_U);
1090 
1091  mpi::Waitall(arrMpiReqsSizeSendCD);
1092  mpi::Waitall(arrMpiReqsSendCD);
1093  };
1094 
1095  template<typename T>
1096  inline void PMatrix<T>::UnpackData(
1097  SuperNodeBufferType & snode,
1098  std::vector<LBlock<T> > & LcolRecv,
1099  std::vector<UBlock<T> > & UrowRecv )
1100  {
1101 #if ( _DEBUGlevel_ >= 1 )
1102  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Unpack the received data for processors participating in Gemm. " << std::endl << std::endl;
1103 #endif
1104  // U part
1105  if( MYROW( grid_ ) != PROW( snode.Index, grid_ ) ){
1106  std::stringstream sstm;
1107 #ifndef NEW_BCAST
1108  sstm.write( &snode.SstrUrowRecv[0], snode.SstrUrowRecv.size() );
1109 #else
1110  //sstm.write( &snode.SstrUrowRecv[0], snode.SstrUrowRecv.size() );
1111  TreeBcast2<T> * bcastUTree2 = fwdToBelowTree2_[snode.Index];
1112  sstm.write( (char*)bcastUTree2->GetLocalBuffer(),bcastUTree2->GetMsgSize());
1113 #endif
1114  std::vector<Int> mask( UBlockMask::TOTAL_NUMBER, 1 );
1115  Int numUBlock;
1116  deserialize( numUBlock, sstm, NO_MASK );
1117  UrowRecv.resize( numUBlock );
1118  for( Int jb = 0; jb < numUBlock; jb++ ){
1119  deserialize( UrowRecv[jb], sstm, mask );
1120  }
1121  } // sender is not the same as receiver
1122  else{
1123  // U is obtained locally, just make a copy. Include everything
1124  // (there is no diagonal block)
1125  // Is it a copy ? LL: YES. Maybe we should replace the copy by
1126  // something more efficient especially for mpisize == 1
1127  UrowRecv.resize(this->U( LBi( snode.Index, grid_ ) ).size());
1128  std::copy(this->U( LBi( snode.Index, grid_ ) ).begin(),this->U( LBi( snode.Index, grid_ )).end(),UrowRecv.begin());
1129  } // sender is the same as receiver
1130 
1131 
1132  //L part
1133  if( MYCOL( grid_ ) != PCOL( snode.Index, grid_ ) ){
1134  std::stringstream sstm;
1135 #ifndef NEW_BCAST
1136  sstm.write( &snode.SstrLcolRecv[0], snode.SstrLcolRecv.size() );
1137 #else
1138  //sstm.write( &snode.SstrLcolRecv[0], snode.SstrLcolRecv.size() );
1139  TreeBcast2<T> * bcastLTree2 = fwdToRightTree2_[snode.Index];
1140  sstm.write( (char*)bcastLTree2->GetLocalBuffer(),bcastLTree2->GetMsgSize());
1141 #endif
1142  std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
1143  mask[LBlockMask::NZVAL] = 0; // nzval is excluded
1144  Int numLBlock;
1145  deserialize( numLBlock, sstm, NO_MASK );
1146  LcolRecv.resize( numLBlock );
1147  for( Int ib = 0; ib < numLBlock; ib++ ){
1148  deserialize( LcolRecv[ib], sstm, mask );
1149  }
1150  } // sender is not the same as receiver
1151  else{
1152  // L is obtained locally, just make a copy.
1153  // Do not include the diagonal block
1154  std::vector<LBlock<T> >& Lcol = this->L( LBj( snode.Index, grid_ ) );
1155  Int startIdx = ( MYROW( grid_ ) == PROW( snode.Index, grid_ ) )?1:0;
1156  LcolRecv.resize( Lcol.size() - startIdx );
1157  std::copy(Lcol.begin()+startIdx,Lcol.end(),LcolRecv.begin());
1158  } // sender is the same as receiver
1159  }
1160 
1161  template<typename T>
1162  inline void PMatrix<T>::ComputeDiagUpdate(SuperNodeBufferType & snode)
1163  {
1164 
1165  //---------Compute the diagonal block; all processors in the column participate in all pipelined supernodes
1166  if( MYCOL( grid_ ) == PCOL( snode.Index, grid_ ) ){
1167 #if ( _DEBUGlevel_ >= 2 )
1168  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Updating the diagonal block" << std::endl << std::endl;
1169 #endif
1170  std::vector<LBlock<T> >& Lcol = this->L( LBj( snode.Index, grid_ ) );
1171 
1172  //Allocate DiagBuf even if Lcol.size() == 0
1173  snode.DiagBuf.Resize(SuperSize( snode.Index, super_ ), SuperSize( snode.Index, super_ ));
1174  SetValue(snode.DiagBuf, ZERO<T>());
1175 
1176  // Do I own the diagonal block ?
1177  Int startIb = (MYROW( grid_ ) == PROW( snode.Index, grid_ ))?1:0;
1178  for( Int ib = startIb; ib < Lcol.size(); ib++ ){
1179 
1180 #ifdef GEMM_PROFILE
1181  gemm_stat.push_back(snode.DiagBuf.m());
1182  gemm_stat.push_back(snode.DiagBuf.n());
1183  gemm_stat.push_back(Lcol[ib].numRow);
1184 #endif
1185 
1186  LBlock<T> & LB = Lcol[ib];
1187 
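 // The Gemm below accumulates DiagBuf -= LUpdateBuf(RowLocalPtr[ib-startIb]:, :)^T * LB.nzval,
 // i.e. each off-diagonal block of this supernodal column adds its contribution to the
 // diagonal block update.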
1188  blas::Gemm( 'T', 'N', snode.DiagBuf.m(), snode.DiagBuf.n(), LB.numRow,
1189  MINUS_ONE<T>(), &snode.LUpdateBuf( snode.RowLocalPtr[ib-startIb], 0 ), snode.LUpdateBuf.m(),
1190  LB.nzval.Data(), LB.nzval.m(), ONE<T>(), snode.DiagBuf.Data(), snode.DiagBuf.m() );
1191  }
1192 
1193 #if ( _DEBUGlevel_ >= 1 )
1194  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Updated the diagonal block" << std::endl << std::endl;
1195 #endif
1196  }
1197  }
1198 
1199 
1200 
1201  template<typename T>
1202  void PMatrix<T>::GetEtree(std::vector<Int> & etree_supno )
1203  {
1204 
1205 #ifndef _RELEASE_
1206  PushCallStack("PMatrix::GetEtree");
1207  double begin = MPI_Wtime( );
1208 #endif
1209  Int nsupers = this->NumSuper();
1210 
1211  if( optionsLU_->ColPerm != "PARMETIS" ) {
1212  /* Use the etree computed from the serial symbolic factorization, and turn it
1213  into a supernodal tree. */
1214  const SuperNodeType * superNode = this->SuperNode();
1215 
1216 
1217  // translate the column etree into a supernodal etree using superIdx
1218  etree_supno.resize(this->NumSuper());
1219  for(Int i = 0; i < superNode->etree.m(); ++i){
1220  Int curSnode = superNode->superIdx[i];
1221  Int parentSnode = (superNode->etree[i]>= superNode->etree.m()) ?this->NumSuper():superNode->superIdx[superNode->etree[i]];
1222  if( curSnode != parentSnode){
1223  etree_supno[curSnode] = parentSnode;
1224  }
1225  }
1226 
1227  } else { /* ParSymbFACT==YES and SymPattern==YES and RowPerm == NOROWPERM */
1228  /* Compute an "etree" based on struct(L),
1229  assuming struct(U) = struct(L'). */
1230 
1231  /* find the first block in each supernodal-column of local L-factor */
1232  std::vector<Int> etree_supno_l( nsupers, nsupers );
1233  for( Int ksup = 0; ksup < nsupers; ksup++ ){
1234  if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
1235  // L part
1236  std::vector<LBlock<T> >& Lcol = this->L( LBj(ksup, grid_) );
1237  if(Lcol.size()>0){
1238  Int firstBlk = 0;
1239  if( MYROW( grid_ ) == PROW( ksup, grid_ ) )
1240  firstBlk=1;
1241 
1242  for( Int ib = firstBlk; ib < Lcol.size(); ib++ ){
1243  etree_supno_l[ksup] = std::min(etree_supno_l[ksup] , Lcol[ib].blockIdx);
1244  }
1245  }
1246  }
1247  }
1248 
1249 
1250 #if ( _DEBUGlevel_ >= 1 )
1251  statusOFS << std::endl << " Local supernodal elimination tree is " << etree_supno_l <<std::endl<<std::endl;
1252 
1253 #endif
1254  /* form global e-tree */
1255  etree_supno.resize( nsupers );
1256  mpi::Allreduce( (Int*) &etree_supno_l[0],(Int *) &etree_supno[0], nsupers, MPI_MIN, grid_->comm );
1257  etree_supno[nsupers-1]=nsupers;
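 // In effect, each process reports the smallest off-diagonal block row it owns in
 // column ksup, and the MIN-reduction selects the first such block over all processes,
 // which is taken as the parent of supernode ksup in this structural etree.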
1258  }
1259 
1260 #ifndef _RELEASE_
1261  double end = MPI_Wtime( );
1262  statusOFS<<"Building the list took "<<end-begin<<"s"<<std::endl;
1263 #endif
1264 #ifndef _RELEASE_
1265  PopCallStack();
1266 #endif
1267  } // ----- end of method PMatrix::GetEtree -----
1268 
1269 
1270  template<typename T>
1271  inline void PMatrix<T>::GetWorkSet(
1272  std::vector<Int> & snodeEtree,
1273  std::vector<std::vector<Int> > & WSet)
1274  {
1275  TIMER_START(Compute_WorkSet);
1276  Int numSuper = this->NumSuper();
1277 
1278 
1279  if (options_->maxPipelineDepth!=1){
1280 
1281  Int maxDepth = options_->maxPipelineDepth;
1282  maxDepth=maxDepth==-1?std::numeric_limits<Int>::max():maxDepth;
1283 #if ( _DEBUGlevel_ >= 1 )
1284  statusOFS<<"MaxDepth is "<<maxDepth<<endl;
1285 #endif
1286 
1287  //find roots in the supernode etree (it must be postordered)
1288  //initialize the parent we are looking at
1289  //Int rootParent = snodeEtree[numSuper-2];
1290  Int rootParent = numSuper;
1291 
1292  //compute the level of each supernode and the total number of levels
1293  //IntNumVec level(numSuper);
1294  //level(rootParent)=0;
1295  IntNumVec level(numSuper+1);
1296  level(rootParent)=-1;
1297  Int numLevel = 0;
1298  for(Int i=rootParent-1; i>=0; i-- ){
1299  level[i] = level[snodeEtree[i]]+1;
1300  numLevel = std::max(numLevel, level[i]);
1301  }
1302  numLevel++;
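 // Small example: for 3 supernodes with snodeEtree = {2, 2, 3} (3 being the virtual root),
 // level = {1, 1, 0}, numLevel = 2, and the worklist built below is WSet = {{2}, {1, 0}}.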
1303 
1304  //Compute the number of supernodes at each level
1305  IntNumVec levelSize(numLevel);
1306  SetValue(levelSize,I_ZERO);
1307  //for(Int i=rootParent-1; i>=0; i-- ){ levelSize(level(i))++; }
1308  for(Int i=rootParent-1; i>=0; i-- ){
1309  if(level[i]>=0){
1310  levelSize[level[i]]++;
1311  }
1312  }
1313 
1314  //Allocate memory
1315  WSet.resize(numLevel,std::vector<Int>());
1316  for(Int i=0; i<numLevel; i++ ){
1317  WSet[i].reserve(levelSize(i));
1318  }
1319 
1320  //Fill the worklist based on the level of each supernode
1321  for(Int i=rootParent-1; i>=0; i-- ){
1323  // TreeBcast * bcastLTree = fwdToRightTree_[i];
1324  // TreeBcast * bcastUTree = fwdToBelowTree_[i];
1325  // TreeReduce<T> * redLTree = redToLeftTree_[i];
1326  // TreeReduce<T> * redDTree = redToAboveTree_[i];
1327  //
1328  //bool participating = MYROW( grid_ ) == PROW( i, grid_ ) || MYCOL( grid_ ) == PCOL( i, grid_ )
1329  // || CountSendToRight(i) > 0
1330  // || CountSendToBelow(i) > 0
1331  // || CountSendToCrossDiagonal(i) > 0
1332  // || CountRecvFromCrossDiagonal(i) >0
1333  // || ( isRecvFromLeft_( i ) )
1334  // || ( isRecvFromAbove_( i ) )
1335  // || isSendToDiagonal_(i)
1336  // || (bcastUTree!=NULL)
1337  // || (bcastLTree!=NULL)
1338  // || (redLTree!=NULL)
1339  // || (redDTree!=NULL) ;
1340  //participating = true;
1341  //if( participating){
1342  WSet[level[i]].push_back(i);
1343  //}
1344 
1345  }
1346 
1347 #if 1
1348  //Constrain the size of each list to be min(MPI_MAX_COMM,options_->maxPipelineDepth)
1349  Int limit = maxDepth; //(options_->maxPipelineDepth>0)?std::min(MPI_MAX_COMM,options_->maxPipelineDepth):MPI_MAX_COMM;
1350  Int rank = 0;
1351  for (Int lidx=0; lidx<WSet.size() ; lidx++){
1352 
1353 
1354  //Assign a rank in the order they are processed ?
1355 
1356  bool split = false;
1357  Int splitIdx = 0;
1358 
1359  Int orank = rank;
1360  Int maxRank = rank + WSet[lidx].size()-1;
1361  // Int maxRank = WSet[lidx].back();
1362 #if ( _DEBUGlevel_ >= 1 )
1363  statusOFS<< (Int)(rank/limIndex_) << " vs2 "<<(Int)(maxRank/limIndex_)<<std::endl;
1364 #endif
1365  if( (Int)(rank/limIndex_) != (Int)(maxRank/limIndex_)){
1366  split = true;
1367  splitIdx = maxRank - maxRank%limIndex_ - rank;
1368  }
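 // For instance, with limIndex_ = 100, a level covering ranks 95..110 crosses a tag-wrap
 // boundary: splitIdx = 110 - 110%100 - 95 = 5, so the level is cut after its 6th entry
 // (ranks 95..100) and the remaining entries are moved to a newly inserted level below.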
1369 
1370  Int splitPoint = min((Int)WSet[lidx].size()-1,splitIdx>0?min(limit-1,splitIdx):limit-1);
1371 #if ( _DEBUGlevel_ >= 1 )
1372  if(split){
1373  statusOFS<<"TEST SPLIT at "<<splitIdx<<" "<<std::endl;
1374  }
1375 #endif
1376  split = split || (WSet[lidx].size()>limit);
1377 
1378  rank += splitPoint+1;
1379 
1380  if( split && splitPoint>0)
1381  {
1382 #if ( _DEBUGlevel_ >= 1 )
1383  statusOFS<<"SPLITPOINT is "<<splitPoint<<" "<<std::endl;
1384 #endif
1385 
1386 #if ( _DEBUGlevel_ >= 1 )
1387  if(splitPoint != limit-1){
1388  statusOFS<<"-----------------------"<<std::endl;
1389  statusOFS<<lidx<<": "<<orank<<" -- "<<maxRank<<std::endl;
1390  statusOFS<<" is NOW "<<std::endl;
1391  statusOFS<<lidx<<": "<<orank<<" -- "<<rank-1<<std::endl;
1392  statusOFS<<lidx+1<<": "<<rank<<" -- "<<maxRank<<std::endl;
1393  statusOFS<<"-----------------------"<<std::endl;
1394  }
1395 #endif
1396  std::vector<std::vector<Int> >::iterator pos = WSet.begin()+lidx+1;
1397  WSet.insert(pos,std::vector<Int>());
1398  WSet[lidx+1].insert(WSet[lidx+1].begin(),WSet[lidx].begin() + splitPoint+1 ,WSet[lidx].end());
1399  WSet[lidx].erase(WSet[lidx].begin()+splitPoint+1,WSet[lidx].end());
1400 
1401 #if ( _DEBUGlevel_ >= 1 )
1402  if(splitPoint != limit-1){
1403  assert((orank+WSet[lidx].size()-1)%limIndex_==0);
1404  }
1405 #endif
1406 
1407 
1408 
1409 
1410  //statusOFS<<"-----------------------"<<std::endl;
1411  //for(auto it = WSet[lidx].begin();it !=WSet[lidx].end(); it++){statusOFS<<*it<<" ";}statusOFS<<std::endl;
1412  //for(auto it = WSet[lidx+1].begin();it !=WSet[lidx+1].end(); it++){statusOFS<<*it<<" ";}statusOFS<<std::endl;
1413  //statusOFS<<"-----------------------"<<std::endl;
1414 
1427 
1428  // //THERE IS A SPECIAL CASE: list longer than limit AND splitidx in between: we have to check if we don't have to do multiple splits
1429  // if(splitPoint<splitIdx){
1430  // Int newSplitIdx = splitIdx - splitPoint;
1431  //
1432  // pos = WSet.begin()+lidx+2;
1433  // WSet.insert(pos,std::vector<Int>());
1434  // WSet[lidx+2].insert(WSet[lidx+2].begin(),WSet[lidx+1].begin() + newSplitIdx+1 ,WSet[lidx+1].end());
1435  // WSet[lidx+1].erase(WSet[lidx+1].begin()+newSplitIdx+1,WSet[lidx+1].end());
1436  //
1437  // pos2 = workingRanks_.begin()+lidx+2;
1438  // workingRanks_.insert(pos2,std::vector<Int>());
1439  // workingRanks_[lidx+2].insert(workingRanks_[lidx+2].begin(),workingRanks_[lidx+1].begin() + newSplitIdx+1 ,workingRanks_[lidx+1].end());
1440  //
1441  //statusOFS<<"SECOND SPLITPOINT IS "<<newSplitIdx<<std::endl;
1442  //statusOFS<<"-----------------------"<<std::endl;
1443  //statusOFS<<workingRanks_[lidx+1].front()<<" -- "<<workingRanks_[lidx+1].back()<<std::endl;
1444  // workingRanks_[lidx+1].erase(workingRanks_[lidx+1].begin()+newSplitIdx+1,workingRanks_[lidx+1].end());
1445  //statusOFS<<" is NOW "<<std::endl;
1446  //statusOFS<<workingRanks_[lidx+1].front()<<" -- "<<workingRanks_[lidx+1].back()<<std::endl;
1447  //statusOFS<<workingRanks_[lidx+2].front()<<" -- "<<workingRanks_[lidx+2].back()<<std::endl;
1448  //statusOFS<<"-----------------------"<<std::endl;
1449  // }
1450 
1451  }
1452 
1453 
1454  }
1455 
1456  //filter out non participating ?
1457  // for(Int i=rootParent-1; i>=0; i-- ){
1458  // ////TODO do I have something to do here ?
1459  // // TreeBcast * bcastLTree = fwdToRightTree_[i];
1460  // // TreeBcast * bcastUTree = fwdToBelowTree_[i];
1461  // // TreeReduce<T> * redLTree = redToLeftTree_[i];
1462  // // TreeReduce<T> * redDTree = redToAboveTree_[i];
1463  // //
1464  // //bool participating = MYROW( grid_ ) == PROW( i, grid_ ) || MYCOL( grid_ ) == PCOL( i, grid_ )
1465  // // || CountSendToRight(i) > 0
1466  // // || CountSendToBelow(i) > 0
1467  // // || CountSendToCrossDiagonal(i) > 0
1468  // // || CountRecvFromCrossDiagonal(i) >0
1469  // // || ( isRecvFromLeft_( i ) )
1470  // // || ( isRecvFromAbove_( i ) )
1471  // // || isSendToDiagonal_(i)
1472  // // || (bcastUTree!=NULL)
1473  // // || (bcastLTree!=NULL)
1474  // // || (redLTree!=NULL)
1475  // // || (redDTree!=NULL) ;
1476  // //participating = true;
1477  // //if(! participating){
1478  // // WSet[level[i]].erase(i);
1479  // //}
1480 
1481  // }
1482 #endif
1483 
1484 
1485 
1486  }
1487  else{
1488  for( Int ksup = numSuper - 2; ksup >= 0; ksup-- ){
1489  WSet.push_back(std::vector<Int>());
1490  WSet.back().push_back(ksup);
1491  }
1492 
1493  }
1494 
1495 
1496 
1497 
1498  TIMER_STOP(Compute_WorkSet);
1499 #if ( _DEBUGlevel_ >= 1 )
1500  for (Int lidx=0; lidx<WSet.size() ; lidx++){
1501  statusOFS << std::endl << "L"<< lidx << " is: {";
1502  for (Int supidx=0; supidx<WSet[lidx].size() ; supidx++){
1503  statusOFS << WSet[lidx][supidx] << " ["<<snodeEtree[WSet[lidx][supidx]]<<"] ";
1504  }
1505  statusOFS << " }"<< std::endl;
1506  }
1507 #endif
1508  }
1509 
1510  template<typename T>
1511  inline void PMatrix<T>::SelInvIntra_P2p(Int lidx,Int & rank)
1512  {
1513 
1514 #if defined (PROFILE) || defined(PMPI) || defined(USE_TAU)
1515  Real begin_SendULWaitContentFirst, end_SendULWaitContentFirst, time_SendULWaitContentFirst = 0;
1516 #endif
1517  Int numSuper = this->NumSuper();
1518  std::vector<std::vector<Int> > & superList = this->WorkingSet();
1519  Int numSteps = superList.size();
1520  Int stepSuper = superList[lidx].size();
1521 
1522 
1523 
1524 
1525  TIMER_START(AllocateBuffer);
1526 
1527  stepSuper = 0;
1528  for (Int supidx=0; supidx<superList[lidx].size(); supidx++){
1529  Int snodeIdx = superList[lidx][supidx];
1530 #ifndef NEW_BCAST
1531  TreeBcast * bcastLTree = fwdToRightTree_[snodeIdx];
1532  TreeBcast * bcastUTree = fwdToBelowTree_[snodeIdx];
1533 #else
1534  TreeBcast2<T> * bcastLTree = fwdToRightTree2_[snodeIdx];
1535  TreeBcast2<T> * bcastUTree = fwdToBelowTree2_[snodeIdx];
1536 #endif
1537  TreeReduce<T> * redLTree = redToLeftTree_[snodeIdx];
1538  TreeReduce<T> * redDTree = redToAboveTree_[snodeIdx];
1539  bool participating = MYROW( grid_ ) == PROW( snodeIdx, grid_ )
1540  || MYCOL( grid_ ) == PCOL( snodeIdx, grid_ )
1541  || CountSendToRight(snodeIdx) > 0
1542  || CountSendToBelow(snodeIdx) > 0
1543  || CountSendToCrossDiagonal(snodeIdx) > 0
1544  || CountRecvFromCrossDiagonal(snodeIdx) >0
1545  || ( isRecvFromLeft_( snodeIdx ) )
1546  || ( isRecvFromAbove_( snodeIdx ) )
1547  || isSendToDiagonal_(snodeIdx)
1548  || (bcastUTree!=NULL)
1549  || (bcastLTree!=NULL)
1550  || (redLTree!=NULL)
1551  || (redDTree!=NULL) ;
1552  if(participating){
1553  stepSuper++;
1554  }
1555  }
1556 
1557 
1558 
1559 #ifndef NEW_BCAST
1560  //This is required to send the size and content of U/L
1561  std::vector<std::vector<MPI_Request> > arrMpireqsSendToBelow;
1562  arrMpireqsSendToBelow.resize( stepSuper, std::vector<MPI_Request>( 2 * grid_->numProcRow, MPI_REQUEST_NULL ));
1563  std::vector<std::vector<MPI_Request> > arrMpireqsSendToRight;
1564  arrMpireqsSendToRight.resize(stepSuper, std::vector<MPI_Request>( 2 * grid_->numProcCol, MPI_REQUEST_NULL ));
1565 #endif
1566 
1567 #if 0
1568  //This is required to reduce L
1569  std::vector<MPI_Request> arrMpireqsSendToLeft;
1570  arrMpireqsSendToLeft.resize(stepSuper, MPI_REQUEST_NULL );
1571  //This is required to reduce D
1572  std::vector<MPI_Request> arrMpireqsSendToAbove;
1573  arrMpireqsSendToAbove.resize(stepSuper, MPI_REQUEST_NULL );
1574 #endif
1575 
1576  //This is required to receive the size and content of U/L
1577 #ifndef NEW_BCAST
1578  std::vector<MPI_Request> arrMpireqsRecvSizeFromAny;
1579  arrMpireqsRecvSizeFromAny.resize(stepSuper*2 , MPI_REQUEST_NULL);
1580  std::vector<MPI_Request> arrMpireqsRecvContentFromAny;
1581  arrMpireqsRecvContentFromAny.resize(stepSuper*2 , MPI_REQUEST_NULL);
1582 #endif
1583 
1584  //allocate the buffers for the supernodes of this step
1585  std::vector<SuperNodeBufferType> arrSuperNodes(stepSuper);
1586  Int pos = 0;
1587  for (Int supidx=0; supidx<superList[lidx].size(); supidx++){
1588  Int snodeIdx = superList[lidx][supidx];
1589 #ifndef NEW_BCAST
1590  TreeBcast * bcastLTree = fwdToRightTree_[snodeIdx];
1591  TreeBcast * bcastUTree = fwdToBelowTree_[snodeIdx];
1592 #else
1593  TreeBcast2<T> * bcastLTree = fwdToRightTree2_[snodeIdx];
1594  TreeBcast2<T> * bcastUTree = fwdToBelowTree2_[snodeIdx];
1595 #endif
1596  TreeReduce<T> * redLTree = redToLeftTree_[snodeIdx];
1597  TreeReduce<T> * redDTree = redToAboveTree_[snodeIdx];
1598  bool participating = MYROW( grid_ ) == PROW( snodeIdx, grid_ )
1599  || MYCOL( grid_ ) == PCOL( snodeIdx, grid_ )
1600  || CountSendToRight(snodeIdx) > 0
1601  || CountSendToBelow(snodeIdx) > 0
1602  || CountSendToCrossDiagonal(snodeIdx) > 0
1603  || CountRecvFromCrossDiagonal(snodeIdx) >0
1604  || ( isRecvFromLeft_( snodeIdx ) )
1605  || ( isRecvFromAbove_( snodeIdx ) )
1606  || isSendToDiagonal_(snodeIdx)
1607  || (bcastUTree!=NULL)
1608  || (bcastLTree!=NULL)
1609  || (redLTree!=NULL)
1610  || (redDTree!=NULL) ;
1611 
1612  if(participating){
1613  arrSuperNodes[pos].Index = superList[lidx][supidx];
1614  arrSuperNodes[pos].Rank = rank;
1615 
1616 
1617  SuperNodeBufferType & snode = arrSuperNodes[pos];
1618 
1619  pos++;
1620  }
1621  rank++;
1622  }
1623 
1624 
1625 
1626  int numSentToLeft = 0;
1627  std::vector<int> reqSentToLeft;
1628 
1629 
1630  NumMat<T> AinvBuf, UBuf;
1631 
1632  TIMER_STOP(AllocateBuffer);
1633 
1634 #ifndef _RELEASE_
1635  PushCallStack("PMatrix::SelInv_P2p::UpdateL");
1636 #endif
1637 #if ( _DEBUGlevel_ >= 1 )
1638  statusOFS << std::endl << "Communication to the Schur complement." << std::endl << std::endl;
1639 #endif
1640 
1641 #ifdef NEW_BCAST
1642  vector<char> bcastUready(stepSuper,0);
1643  // vector<char> bcastUdone(stepSuper,0);
1644  vector<char> bcastLready(stepSuper,0);
1645  // vector<char> bcastLdone(stepSuper,0);
1646 #endif
1647 
1648  {
1649  //Receivers have to resize their buffers
1650  TIMER_START(IRecv_Content_UL);
1651 
1652 
1653 
1654  // Receivers (Content)
1655  for (Int supidx=0; supidx<stepSuper ; supidx++){
1656  SuperNodeBufferType & snode = arrSuperNodes[supidx];
1657 #ifndef NEW_BCAST
1658  MPI_Request * mpireqsRecvFromAbove = &arrMpireqsRecvContentFromAny[supidx*2];
1659  MPI_Request * mpireqsRecvFromLeft = &arrMpireqsRecvContentFromAny[supidx*2+1];
1660 #endif
1661 
1662  if( isRecvFromAbove_( snode.Index ) &&
1663  MYROW( grid_ ) != PROW( snode.Index, grid_ ) ){
1664 
1665 #ifdef NEW_BCAST
1666  TreeBcast2<T> * bcastUTree2 = fwdToBelowTree2_[snode.Index];
1667 
1668  if(bcastUTree2 != NULL){
1669  bcastUTree2->SetTag(IDX_TO_TAG(snode.Rank,SELINV_TAG_U_CONTENT,limIndex_));
1670  //Initialize the tree
1671  //bcastUTree2->AllocRecvBuffer();
1672  //Post Recv request;
1673  bool done = bcastUTree2->Progress();
1674 
1675 #if ( _DEBUGlevel_ >= 1 )
1676  statusOFS<<"["<<snode.Index<<"] "<<" trying to progress bcast U "<<done<<std::endl;
1677 #endif
1678  }
1679 #else
1680  TreeBcast * bcastUTree = fwdToBelowTree_[snode.Index];
1681  if(bcastUTree!=NULL){
1682  Int myRoot = bcastUTree->GetRoot();
1683  snode.SizeSstrUrowRecv = bcastUTree->GetMsgSize();
1684  snode.SstrUrowRecv.resize( snode.SizeSstrUrowRecv);
1685  MPI_Irecv( &snode.SstrUrowRecv[0], snode.SizeSstrUrowRecv, MPI_BYTE,
1686  myRoot, IDX_TO_TAG(snode.Rank,SELINV_TAG_U_CONTENT,limIndex_),
1687  grid_->colComm, mpireqsRecvFromAbove );
1688 #if ( _DEBUGlevel_ >= 1 )
1689  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Receiving U " << snode.SizeSstrUrowRecv << " BYTES from "<<myRoot<<" on tag "<<IDX_TO_TAG(snode.Rank,SELINV_TAG_U_CONTENT,limIndex_)<< std::endl << std::endl;
1690 #endif
1691  }
1692 #endif
1693  } // if I need to receive from up
1694 
1695  if( isRecvFromLeft_( snode.Index ) &&
1696  MYCOL( grid_ ) != PCOL( snode.Index, grid_ ) ){
1697 #ifdef NEW_BCAST
1698  TreeBcast2<T> * bcastLTree2 = fwdToRightTree2_[snode.Index];
1699 
1700  if(bcastLTree2 != NULL){
1701  bcastLTree2->SetTag(IDX_TO_TAG(snode.Rank,SELINV_TAG_L_CONTENT,limIndex_));
1702  //Initialize the tree
1703  //bcastUTree2->AllocRecvBuffer();
1704  //Post Recv request;
1705  bool done = bcastLTree2->Progress();
1706 
1707 #if ( _DEBUGlevel_ >= 1 )
1708  statusOFS<<"["<<snode.Index<<"] "<<" trying to progress bcast L "<<done<<std::endl;
1709 #endif
1710  }
1711 #else
1712 
1713  TreeBcast * bcastLTree = fwdToRightTree_[snode.Index];
1714  if(bcastLTree!=NULL){
1715  Int myRoot = bcastLTree->GetRoot();
1716  snode.SizeSstrLcolRecv = bcastLTree->GetMsgSize();
1717  snode.SstrLcolRecv.resize(snode.SizeSstrLcolRecv);
1718  MPI_Irecv( &snode.SstrLcolRecv[0], snode.SizeSstrLcolRecv, MPI_BYTE,
1719  myRoot, IDX_TO_TAG(snode.Rank,SELINV_TAG_L_CONTENT,limIndex_),
1720  grid_->rowComm, mpireqsRecvFromLeft );
1721 #if ( _DEBUGlevel_ >= 1 )
1722  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Receiving L " << snode.SizeSstrLcolRecv << " BYTES from "<<myRoot<<" on tag "<<IDX_TO_TAG(snode.Rank,SELINV_TAG_L_CONTENT,limIndex_)<< std::endl << std::endl;
1723 #endif
1724  }
1725 #endif
1726  } // if I need to receive from left
1727  }
1728  TIMER_STOP(IRecv_Content_UL);
1729 
1730  // Senders
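  // Processes on the owner row (for U) and the owner column (for L) of each
  // supernode serialize their blocks and push them into the corresponding
  // broadcast tree.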
1731  TIMER_START(ISend_Content_UL);
1732  for (Int supidx=0; supidx<stepSuper; supidx++){
1733  SuperNodeBufferType & snode = arrSuperNodes[supidx];
1734 #ifndef NEW_BCAST
1735  std::vector<MPI_Request> & mpireqsSendToBelow = arrMpireqsSendToBelow[supidx];
1736  std::vector<MPI_Request> & mpireqsSendToRight = arrMpireqsSendToRight[supidx];
1737 #endif
1738 
1739 #if ( _DEBUGlevel_ >= 1 )
1740  statusOFS << std::endl << "["<<snode.Index<<"] "
1741  << "Communication for the U part." << std::endl << std::endl;
1742 #endif
1743  // Communication for the U part.
1744 
1745  if( MYROW( grid_ ) == PROW( snode.Index, grid_ ) ){
1746  std::vector<UBlock<T> >& Urow = this->U( LBi(snode.Index, grid_) );
1747  // Pack the data in U
1748  TIMER_START(Serialize_UL);
1749  std::stringstream sstm;
1750 
1751  std::vector<Int> mask( UBlockMask::TOTAL_NUMBER, 1 );
1752  // All blocks are to be sent down.
1753  serialize( (Int)Urow.size(), sstm, NO_MASK );
1754  for( Int jb = 0; jb < Urow.size(); jb++ ){
1755  serialize( Urow[jb], sstm, mask );
1756  }
1757  snode.SstrUrowSend.resize( Size( sstm ) );
1758  sstm.read( &snode.SstrUrowSend[0], snode.SstrUrowSend.size() );
1759  snode.SizeSstrUrowSend = snode.SstrUrowSend.size();
1760  TIMER_STOP(Serialize_UL);
1761 
1762 #ifdef NEW_BCAST
1763  TreeBcast2<T> * bcastUTree2 = fwdToBelowTree2_[snode.Index];
1764  if(bcastUTree2!=NULL){
1765  bcastUTree2->SetTag(IDX_TO_TAG(snode.Rank,SELINV_TAG_U_CONTENT,limIndex_));
1766  bcastUTree2->SetLocalBuffer((T*)&snode.SstrUrowSend[0]);
1767  bcastUTree2->SetDataReady(true);
1768  bool done = bcastUTree2->Progress();
1769 #if ( _DEBUGlevel_ >= 1 )
1770  statusOFS<<"["<<snode.Index<<"] "<<" trying to progress bcast U "<<done<<std::endl;
1771 #endif
1772 
1773  //progress should do the send
1774 
1775 #if ( _DEBUGlevel_ >= 1 )
1776  for( Int idxRecv = 0; idxRecv < bcastUTree2->GetDestCount(); ++idxRecv ){
1777  Int iProcRow = bcastUTree2->GetDest(idxRecv);
1778  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Sending U " << snode.SizeSstrUrowSend << " BYTES on tag "<<IDX_TO_TAG(snode.Rank,SELINV_TAG_U_CONTENT,limIndex_) << std::endl << std::endl;
1779  }
1780 #endif
1781 
1782  }
1783 #else
1784  TreeBcast * bcastUTree = fwdToBelowTree_[snode.Index];
1785  if(bcastUTree!=NULL){
1786  bcastUTree->ForwardMessage((char*)&snode.SstrUrowSend[0], snode.SizeSstrUrowSend,
1787  IDX_TO_TAG(snode.Rank,SELINV_TAG_U_CONTENT,limIndex_), &mpireqsSendToBelow[0]);
1788  for( Int idxRecv = 0; idxRecv < bcastUTree->GetDestCount(); ++idxRecv ){
1789  Int iProcRow = bcastUTree->GetDest(idxRecv);
1790 #if ( _DEBUGlevel_ >= 1 )
1791  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Sending U " << snode.SizeSstrUrowSend << " BYTES on tag "<<IDX_TO_TAG(snode.Rank,SELINV_TAG_U_CONTENT,limIndex_) << std::endl << std::endl;
1792 #endif
1793  }
1794  }
1795 
1796 #endif
1797  } // if I am the sender
1798 
1799 
1800 #if ( _DEBUGlevel_ >= 1 )
1801  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Communication for the L part." << std::endl << std::endl;
1802 #endif
1803 
1804 
1805 
1806  // Communication for the L part.
1807  if( MYCOL( grid_ ) == PCOL( snode.Index, grid_ ) ){
1808  std::vector<LBlock<T> >& Lcol = this->L( LBj(snode.Index, grid_) );
1809  TIMER_START(Serialize_UL);
1810  // Pack the data in L
1811  std::stringstream sstm;
1812  std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
1813  mask[LBlockMask::NZVAL] = 0; // nzval is excluded
1814 
1815  // All blocks except for the diagonal block are to be sent right
1816 
1817  if( MYROW( grid_ ) == PROW( snode.Index, grid_ ) )
1818  serialize( (Int)Lcol.size() - 1, sstm, NO_MASK );
1819  else
1820  serialize( (Int)Lcol.size(), sstm, NO_MASK );
1821 
1822  for( Int ib = 0; ib < Lcol.size(); ib++ ){
1823  if( Lcol[ib].blockIdx > snode.Index ){
1824 #if ( _DEBUGlevel_ >= 2 )
1825  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Serializing Block index " << Lcol[ib].blockIdx << std::endl;
1826 #endif
1827  serialize( Lcol[ib], sstm, mask );
1828  }
1829  }
1830  snode.SstrLcolSend.resize( Size( sstm ) );
1831  sstm.read( &snode.SstrLcolSend[0], snode.SstrLcolSend.size() );
1832  snode.SizeSstrLcolSend = snode.SstrLcolSend.size();
1833  TIMER_STOP(Serialize_UL);
1834 
1835 #ifdef NEW_BCAST
1836  TreeBcast2<T> * bcastLTree2 = fwdToRightTree2_[snode.Index];
1837  if(bcastLTree2!=NULL){
1838  bcastLTree2->SetTag(IDX_TO_TAG(snode.Rank,SELINV_TAG_L_CONTENT,limIndex_));
1839  bcastLTree2->SetLocalBuffer((T*)&snode.SstrLcolSend[0]);
1840  bcastLTree2->SetDataReady(true);
1841  bool done = bcastLTree2->Progress();
1842 #if ( _DEBUGlevel_ >= 1 )
1843  statusOFS<<"["<<snode.Index<<"] "<<" trying to progress bcast L "<<done<<std::endl;
1844 #endif
1845 
1846  //progress should do the send
1847 
1848 #if ( _DEBUGlevel_ >= 1 )
1849  for( Int idxRecv = 0; idxRecv < bcastLTree2->GetDestCount(); ++idxRecv ){
1850  Int iProcRow = bcastLTree2->GetDest(idxRecv);
1851  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Sending L " << snode.SizeSstrLcolSend << " BYTES on tag "<<IDX_TO_TAG(snode.Rank,SELINV_TAG_L_CONTENT,limIndex_) << std::endl << std::endl;
1852  }
1853 #endif
1854 
1855  }
1856 #else
1857 
1858  TreeBcast * bcastLTree = fwdToRightTree_[snode.Index];
1859  if(bcastLTree!=NULL){
1860  bcastLTree->ForwardMessage((char*)&snode.SstrLcolSend[0], snode.SizeSstrLcolSend,
1861  IDX_TO_TAG(snode.Rank,SELINV_TAG_L_CONTENT,limIndex_), &mpireqsSendToRight[0]);
1862 
1863  for( Int idxRecv = 0; idxRecv < bcastLTree->GetDestCount(); ++idxRecv ){
1864  Int iProcCol = bcastLTree->GetDest(idxRecv);
1865 #if ( _DEBUGlevel_ >= 1 )
1866  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Sending L " << snode.SizeSstrLcolSend << " BYTES on tag "<<IDX_TO_TAG(snode.Rank,SELINV_TAG_L_CONTENT,limIndex_) << std::endl << std::endl;
1867 #endif
1868  }
1869  }
1870 #endif
1871  } // if I am the sender
1872  } //Senders
1873  TIMER_STOP(ISend_Content_UL);
1874  }
1875 
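  // Post the receive side of the row-wise reduction trees that will accumulate
  // the L updates towards the owner column of each supernode.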
1876  vector<char> redLdone(stepSuper,0);
1877  for (Int supidx=0; supidx<stepSuper; supidx++){
1878  SuperNodeBufferType & snode = arrSuperNodes[supidx];
1879  TreeReduce<T> * redLTree = redToLeftTree_[snode.Index];
1880 
1881  if(redLTree != NULL){
1882  redLTree->SetTag(IDX_TO_TAG(snode.Rank,SELINV_TAG_L_REDUCE,limIndex_));
1883  //Initialize the tree
1884  redLTree->AllocRecvBuffers();
1885  //Post All Recv requests;
1886  redLTree->PostFirstRecv();
1887  }
1888  }
1889 
1890 
1891  TIMER_START(Compute_Sinv_LT);
1892  {
1893  Int msgForwarded = 0;
1894  Int msgToFwd = 0;
1895  Int gemmProcessed = 0;
1896  Int gemmToDo = 0;
1897  // Int toRecvGemm = 0;
1898  //copy the list of supernodes we need to process
1899  std::list<Int> readySupidx;
1900  //find local things to do
1901  for(Int supidx = 0;supidx<stepSuper;supidx++){
1902  SuperNodeBufferType & snode = arrSuperNodes[supidx];
1903 
1904 
1905  if( isRecvFromAbove_( snode.Index ) && isRecvFromLeft_( snode.Index )){
1906  gemmToDo++;
1907  if( MYCOL( grid_ ) == PCOL( snode.Index, grid_ ) ){
1908  snode.isReady++;
1909  }
1910 
1911  if( MYROW( grid_ ) == PROW( snode.Index, grid_ ) ){
1912  snode.isReady++;
1913  }
1914 
1915  if(snode.isReady==2){
1916  readySupidx.push_back(supidx);
1917 #if ( _DEBUGlevel_ >= 1 )
1918  statusOFS<<std::endl<<"Locally processing ["<<snode.Index<<"]"<<std::endl;
1919 #endif
1920  }
1921  }
1922  else if( (isRecvFromLeft_( snode.Index ) ) && MYCOL( grid_ ) != PCOL( snode.Index, grid_ ) )
1923  {
1924  //Get the reduction tree
1925  TreeReduce<T> * redLTree = redToLeftTree_[snode.Index];
1926 
1927  if(redLTree != NULL){
1928  TIMER_START(Reduce_Sinv_LT_Isend);
1929  //send the data to NULL to ensure 0 byte send
1930  redLTree->SetLocalBuffer(NULL);
1931  redLTree->SetDataReady(true);
1932 
1933  bool done = redLTree->Progress();
1934 
1935 #if ( _DEBUGlevel_ >= 1 )
1936  statusOFS<<"["<<snode.Index<<"] "<<" trying to progress reduce L "<<done<<std::endl;
1937 #endif
1938  TIMER_STOP(Reduce_Sinv_LT_Isend);
1939  }
 1940  }// else if( isRecvFromLeft_( snode.Index ) && MYCOL( grid_ ) != PCOL( snode.Index, grid_ ) )
1941 
1942  if(MYROW(grid_)!=PROW(snode.Index,grid_)){
1943 #ifdef NEW_BCAST
1944  TreeBcast2<T> * bcastUTree2 = fwdToBelowTree2_[snode.Index];
1945  if(bcastUTree2 != NULL){
1946  if(bcastUTree2->GetDestCount()>0){
1947  msgToFwd++;
1948  }
1949  }
1950 #else
1951  TreeBcast * bcastUTree = fwdToBelowTree_[snode.Index];
1952  if(bcastUTree != NULL){
1953  if(bcastUTree->GetDestCount()>0){
1954  msgToFwd++;
1955  }
1956  }
1957 #endif
1958  }
1959 
1960  if(MYCOL(grid_)!=PCOL(snode.Index,grid_)){
1961 #ifdef NEW_BCAST
1962  TreeBcast2<T> * bcastLTree2 = fwdToRightTree2_[snode.Index];
1963  if(bcastLTree2 != NULL){
1964  if(bcastLTree2->GetDestCount()>0){
1965  msgToFwd++;
1966  }
1967  }
1968 #else
1969 
1970  TreeBcast * bcastLTree = fwdToRightTree_[snode.Index];
1971  if(bcastLTree != NULL){
1972  if(bcastLTree->GetDestCount()>0){
1973  msgToFwd++;
1974  }
1975  }
1976 #endif
1977  }
1978  }
1979 
1980 #if ( _DEBUGlevel_ >= 1 )
1981  statusOFS<<std::endl<<"gemmToDo ="<<gemmToDo<<std::endl;
1982  statusOFS<<std::endl<<"msgToFwd ="<<msgToFwd<<std::endl;
1983 #endif
1984 
1985 
1986 #if defined (PROFILE)
1987  end_SendULWaitContentFirst=0;
1988  begin_SendULWaitContentFirst=0;
1989 #endif
1990 
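  // Event loop: keep polling until every local GEMM has been performed and
  // every broadcast message that must transit through this process has been
  // forwarded.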
1991  while(gemmProcessed<gemmToDo || msgForwarded < msgToFwd)
1992  {
1993  Int reqidx = MPI_UNDEFINED;
1994  Int supidx = -1;
1995 
1996 
1997  //while I don't have anything to do, wait for data to arrive
1998  do{
1999 
2000 
2001 
2002  //then process with the remote ones
2003 
2004  TIMER_START(WaitContent_UL);
2005 #if defined(PROFILE)
2006  if(begin_SendULWaitContentFirst==0){
2007  begin_SendULWaitContentFirst=1;
2008  TIMER_START(WaitContent_UL_First);
2009  }
2010 #endif
2011 
2012 
2013 
2014 
2015 
2016 #ifdef NEW_BCAST
 2017  //loop through the tree arrays and make progress on each of them
2018  for (Int supidx2=0; supidx2<stepSuper; supidx2++){
2019  SuperNodeBufferType & snode = arrSuperNodes[supidx2];
2020 
2021  TreeBcast2<T> * bcastUTree2 = fwdToBelowTree2_[snode.Index];
2022 
2023  if(bcastUTree2 != NULL /*&& (!bcastUdone[supidx2])*/){
2024  if(!bcastUready[supidx2]){
2025  if(!bcastUTree2->IsRoot()){
2026  bool ready = bcastUTree2->IsDataReceived();
2027 
2028 
2029  if(ready){
2030  msgForwarded++;
 2031  //TODO (temporary): snode could hold a pointer to the tree, so that the local buffer of the tree is used directly instead of this LUpdateBuf?
2032 
2033  //snode.SizeSstrUrowRecv = bcastUTree2->GetMsgSize();
2034  //snode.SstrUrowRecv.resize( snode.SizeSstrUrowRecv);
2035  //bcastUTree2->SetLocalBuffer((T*)&snode.SstrUrowRecv[0]);
2036  //assert( isRecvFromAbove_( snode.Index ) && isRecvFromLeft_( snode.Index ));
2037 
2038  if( isRecvFromAbove_( snode.Index ) && isRecvFromLeft_( snode.Index )){
2039  snode.isReady++;
2040  //if we received both L and U, the supernode is ready
2041  if(snode.isReady==2){
2042  readySupidx.push_back(supidx2);
2043 #if defined(PROFILE)
2044  if(end_SendULWaitContentFirst==0){
2045  TIMER_STOP(WaitContent_UL_First);
2046  end_SendULWaitContentFirst=1;
2047  }
2048 #endif
2049  }
2050  }
2051 
2052  bcastUready[supidx2]=1;
2053  //#if ( _DEBUGlevel_ >= 1 )
2054  for( Int idxRecv = 0; idxRecv < bcastUTree2->GetDestCount(); ++idxRecv ){
2055  Int iProcRow = bcastUTree2->GetDest(idxRecv);
2056  statusOFS << std::endl << "MATDEBUG ["<<snode.Index<<"] "<< "Forwarded U " << snode.SizeSstrUrowRecv << " BYTES to "<<iProcRow<<" on tag "<<IDX_TO_TAG(snode.Rank,SELINV_TAG_U_CONTENT,limIndex_)<< std::endl << std::endl;
2057  }
2058  //#endif
2059 
2060  }
2061 
2062  }
2063  else{
2064  bcastUready[supidx2]=1;
2065  }
2066  }
 2067  //Progress() is not strictly needed here: we only need to know whether the data has been received
2068  bool done = bcastUTree2->Progress();
2069 
2070 #if ( _DEBUGlevel_ >= 1 )
2071  statusOFS<<"["<<snode.Index<<"] "<<" trying to progress bcast U "<<done<<std::endl;
2072 #endif
2073  }
2074 
2075 
2076 
2077 
2078 
2079 
2080 
2081 
2082 
2083  TreeBcast2<T> * bcastLTree2 = fwdToRightTree2_[snode.Index];
2084 
2085  if(bcastLTree2 != NULL /*&& (!bcastLdone[supidx2])*/){
2086  if(!bcastLready[supidx2]){
2087  if(!bcastLTree2->IsRoot()){
2088  bool ready = bcastLTree2->IsDataReceived();
2089 
2090 
2091  if(ready){
2092  msgForwarded++;
2093  //snode.SizeSstrLcolRecv = bcastLTree2->GetMsgSize();
2094  //snode.SstrLcolRecv.resize( snode.SizeSstrLcolRecv);
2095  //bcastLTree2->SetLocalBuffer((T*)&snode.SstrLcolRecv[0]);
2096  //assert( isRecvFromAbove_( snode.Index ) && isRecvFromLeft_( snode.Index ));
2097 
2098  if( isRecvFromAbove_( snode.Index ) && isRecvFromLeft_( snode.Index )){
2099  snode.isReady++;
 2100  //if we received both L and U, the supernode is ready
2101  if(snode.isReady==2){
2102  readySupidx.push_back(supidx2);
2103 #if defined(PROFILE)
2104  if(end_SendULWaitContentFirst==0){
2105  TIMER_STOP(WaitContent_UL_First);
2106  end_SendULWaitContentFirst=1;
2107  }
2108 #endif
2109  }
2110  }
2111 
2112  bcastLready[supidx2]=1;
 2113  //#if ( _DEBUGlevel_ >= 1 )
2114  for( Int idxRecv = 0; idxRecv < bcastLTree2->GetDestCount(); ++idxRecv ){
2115  Int iProcRow = bcastLTree2->GetDest(idxRecv);
2116  statusOFS << std::endl << "MATDEBUG ["<<snode.Index<<"] "<< "Forwarded L " << snode.SizeSstrLcolRecv << " BYTES to "<<iProcRow<<" on tag "<<IDX_TO_TAG(snode.Rank,SELINV_TAG_L_CONTENT,limIndex_)<< std::endl << std::endl;
2117  }
2118  //#endif
2119 
2120  }
2121 
2122  }
2123  else{
2124  bcastLready[supidx2]=1;
2125  }
2126  }
 2127  //Progress() is not strictly needed here: we only need to know whether the data has been received
2128  bool done = bcastLTree2->Progress();
2129 
2130 #if ( _DEBUGlevel_ >= 1 )
2131  statusOFS<<"["<<snode.Index<<"] "<<" trying to progress bcast L "<<done<<std::endl;
2132 #endif
2133  }
2134 
2135 
2136 
2137 
2138 
2139 
2140 
2141 
2142 
2143 
2144 
2145 
2146 
2147 
2148 
2149  }
2150 #else
 2151  std::vector<int> reqIndices(arrMpireqsRecvContentFromAny.size());
 2152  int numRecv = 0;
 2154  int err = MPI_Waitsome(2*stepSuper, &arrMpireqsRecvContentFromAny[0], &numRecv, reqIndices.data(), MPI_STATUSES_IGNORE);
2155  assert(err==MPI_SUCCESS);
2156 
2157  for(int i =0;i<numRecv;i++){
2158  reqidx = reqIndices[i];
2159  //I've received something
2160  if(reqidx!=MPI_UNDEFINED)
2161  {
 2162  //two requests per supernode: reqidx maps back to the supernode buffer index
2163  supidx = reqidx/2;
2164  SuperNodeBufferType & snode = arrSuperNodes[supidx];
2165 
2166  //If it's a U block
2167  if(reqidx%2==0){
2168 
2169  TreeBcast * bcastUTree = fwdToBelowTree_[snode.Index];
2170  if(bcastUTree != NULL){
2171  if(bcastUTree->GetDestCount()>0){
2172 
2173  std::vector<MPI_Request> & mpireqsSendToBelow = arrMpireqsSendToBelow[supidx];
2174 #if ( _DEBUGlevel_ >= 1 )
2175  for( Int idxRecv = 0; idxRecv < bcastUTree->GetDestCount(); ++idxRecv ){
2176  Int iProcRow = bcastUTree->GetDest(idxRecv);
2177  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Forwarding U " << snode.SizeSstrUrowRecv << " BYTES to "<<iProcRow<<" on tag "<<IDX_TO_TAG(snode.Rank,SELINV_TAG_U_CONTENT,limIndex_)<< std::endl << std::endl;
2178  }
2179 #endif
2180 
2181  bcastUTree->ForwardMessage( (char*)&snode.SstrUrowRecv[0], snode.SizeSstrUrowRecv,
2182  IDX_TO_TAG(snode.Rank,SELINV_TAG_U_CONTENT,limIndex_), &mpireqsSendToBelow[0] );
2183 #if ( _DEBUGlevel_ >= 1 )
2184  for( Int idxRecv = 0; idxRecv < bcastUTree->GetDestCount(); ++idxRecv ){
2185  Int iProcRow = bcastUTree->GetDest(idxRecv);
2186  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Forwarded U " << snode.SizeSstrUrowRecv << " BYTES to "<<iProcRow<<" on tag "<<IDX_TO_TAG(snode.Rank,SELINV_TAG_U_CONTENT,limIndex_)<< std::endl << std::endl;
2187  }
2188 #endif
2189 
2190  msgForwarded++;
2191  }
2192  }
2193  }
2194  //If it's a L block
2195  else if(reqidx%2==1){
2196  TreeBcast * bcastLTree = fwdToRightTree_[snode.Index];
2197  if(bcastLTree != NULL){
2198  if(bcastLTree->GetDestCount()>0){
2199 
2200  std::vector<MPI_Request> & mpireqsSendToRight = arrMpireqsSendToRight[supidx];
2201 #if ( _DEBUGlevel_ >= 1 )
2202  for( Int idxRecv = 0; idxRecv < bcastLTree->GetDestCount(); ++idxRecv ){
2203  Int iProcCol = bcastLTree->GetDest(idxRecv);
2204  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Forwarding L " << snode.SizeSstrLcolRecv << " BYTES to "<<iProcCol<<" on tag "<<IDX_TO_TAG(snode.Rank,SELINV_TAG_L_CONTENT,limIndex_)<< std::endl << std::endl;
2205  }
2206 #endif
2207 
2208  bcastLTree->ForwardMessage( (char*)&snode.SstrLcolRecv[0], snode.SizeSstrLcolRecv,
2209  IDX_TO_TAG(snode.Rank,SELINV_TAG_L_CONTENT,limIndex_), &mpireqsSendToRight[0]);
2210 
2211  // for( Int idxRecv = 0; idxRecv < bcastLTree->GetDestCount(); ++idxRecv ){
2212  // Int iProcCol = bcastLTree->GetDest(idxRecv);
2213  // PROFILE_COMM(MYPROC(this->grid_),PNUM(MYROW(this->grid_),iProcCol,this->grid_),IDX_TO_TAG(snode.Index,SELINV_TAG_L_CONTENT),snode.SizeSstrLcolRecv);
2214  // }
2215 #if ( _DEBUGlevel_ >= 1 )
2216  for( Int idxRecv = 0; idxRecv < bcastLTree->GetDestCount(); ++idxRecv ){
2217  Int iProcCol = bcastLTree->GetDest(idxRecv);
2218  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Forwarded L " << snode.SizeSstrLcolRecv << " BYTES to "<<iProcCol<<" on tag "<<IDX_TO_TAG(snode.Rank,SELINV_TAG_L_CONTENT,limIndex_)<< std::endl << std::endl;
2219  }
2220 #endif
2221  msgForwarded++;
2222  }
2223  }
2224  }
2225 
2226 
2227 #if ( _DEBUGlevel_ >= 1 )
2228  statusOFS<<std::endl<<"Received data for ["<<snode.Index<<"] reqidx%2="<<reqidx%2<<" is ready ?"<<snode.isReady<<std::endl;
2229 #endif
2230 
2231  if( isRecvFromAbove_( snode.Index ) && isRecvFromLeft_( snode.Index )){
2232  snode.isReady++;
2233 
2234  //if we received both L and U, the supernode is ready
2235  if(snode.isReady==2){
2236  readySupidx.push_back(supidx);
2237 
2238 #if defined(PROFILE)
2239  if(end_SendULWaitContentFirst==0){
2240  TIMER_STOP(WaitContent_UL_First);
2241  end_SendULWaitContentFirst=1;
2242  }
2243 #endif
2244  }
2245  }
2246  }
2247 
2248  }//end for waitsome
2249 #endif
2250 
2251  TIMER_STOP(WaitContent_UL);
2252 
2253  } while( (gemmProcessed<gemmToDo && readySupidx.size()==0) || (gemmProcessed==gemmToDo && msgForwarded<msgToFwd) );
2254 
2255  //If I have some work to do
2256  if(readySupidx.size()>0)
2257  {
2258  supidx = readySupidx.back();
2259  readySupidx.pop_back();
2260  SuperNodeBufferType & snode = arrSuperNodes[supidx];
2261 
2262 
 2263  // Only the processors that received information participate in the Gemm
2264  if( isRecvFromAbove_( snode.Index ) && isRecvFromLeft_( snode.Index ) ){
2265 
2266  std::vector<LBlock<T> > LcolRecv;
2267  std::vector<UBlock<T> > UrowRecv;
2268  // Save all the data to be updated for { L( isup, snode.Index ) | isup > snode.Index }.
2269  // The size will be updated in the Gemm phase and the reduce phase
2270 
2271  UnpackData(snode, LcolRecv, UrowRecv);
2272 #ifdef NEW_BCAST
2273  //cleanup corresponding tree
2274  TreeBcast2<T> * bcastUTree2 = fwdToBelowTree2_[snode.Index];
2275  TreeBcast2<T> * bcastLTree2 = fwdToRightTree2_[snode.Index];
2276  if(bcastUTree2->IsDone()){
2277  bcastUTree2->CleanupBuffers();
2278  }
2279  if(bcastLTree2->IsDone()){
2280  bcastLTree2->CleanupBuffers();
2281  }
2282 #endif
2283 
2284  //NumMat<T> AinvBuf, UBuf;
2285  SelInv_lookup_indexes(snode,LcolRecv, UrowRecv,AinvBuf,UBuf);
2286 
2287  snode.LUpdateBuf.Resize( AinvBuf.m(), SuperSize( snode.Index, super_ ) );
2288 
2289 #ifdef GEMM_PROFILE
2290  gemm_stat.push_back(AinvBuf.m());
2291  gemm_stat.push_back(UBuf.m());
2292  gemm_stat.push_back(AinvBuf.n());
2293 #endif
2294 
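  // LUpdateBuf <- - AinvBuf * UBuf^T : local contribution of this supernode to
  // the selected inverse of the L part.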
2295  TIMER_START(Compute_Sinv_LT_GEMM);
2296  blas::Gemm( 'N', 'T', AinvBuf.m(), UBuf.m(), AinvBuf.n(), MINUS_ONE<T>(),
2297  AinvBuf.Data(), AinvBuf.m(),
2298  UBuf.Data(), UBuf.m(), ZERO<T>(),
2299  snode.LUpdateBuf.Data(), snode.LUpdateBuf.m() );
2300  TIMER_STOP(Compute_Sinv_LT_GEMM);
2301 
2302 
2303 #if ( _DEBUGlevel_ >= 2 )
2304  statusOFS << std::endl << "["<<snode.Index<<"] "<< "snode.LUpdateBuf: " << snode.LUpdateBuf << std::endl;
2305 #endif
2306  } // if Gemm is to be done locally
2307 
2308  //Get the reduction tree
2309  TreeReduce<T> * redLTree = redToLeftTree_[snode.Index];
2310  if(redLTree != NULL){
2311  assert( snode.LUpdateBuf.m() != 0 && snode.LUpdateBuf.n() != 0 );
2312  TIMER_START(Reduce_Sinv_LT_Isend);
2313  //send the data
2314  redLTree->SetLocalBuffer(snode.LUpdateBuf.Data());
2315  redLTree->SetDataReady(true);
2316 
2317  bool done = redLTree->Progress();
2318 #if ( _DEBUGlevel_ >= 1 )
2319  statusOFS<<"["<<snode.Index<<"] "<<" trying to progress reduce L "<<done<<std::endl;
2320 #endif
2321  TIMER_STOP(Reduce_Sinv_LT_Isend);
2322  }
2323 
2324  gemmProcessed++;
2325 
2326 #if ( _DEBUGlevel_ >= 1 )
2327  statusOFS<<std::endl<<"gemmProcessed ="<<gemmProcessed<<"/"<<gemmToDo<<std::endl;
2328 #endif
2329 
2330  //advance reductions
2331  for (Int supidx=0; supidx<stepSuper; supidx++){
2332  SuperNodeBufferType & snode = arrSuperNodes[supidx];
2333  TreeReduce<T> * redLTree = redToLeftTree_[snode.Index];
2334  if(redLTree != NULL && !redLdone[supidx]){
2335  bool done = redLTree->Progress();
2336 
2337 #if ( _DEBUGlevel_ >= 1 )
2338  statusOFS<<"["<<snode.Index<<"] "<<" trying to progress reduce L "<<done<<std::endl;
2339 #endif
2340  }
2341  }
2342  }
2343  }
2344 
2345  }
2346  TIMER_STOP(Compute_Sinv_LT);
2347 
2348  //Reduce Sinv L^T to the processors in PCOL(ksup,grid_)
2349 
2350 
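  // Poll every reduce-to-left tree until it completes; at the root (the owner
  // column of the supernode) the reduced result is copied into LUpdateBuf.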
2351  TIMER_START(Reduce_Sinv_LT);
2352  //blocking wait for the reduction
2353  bool all_done = false;
2354  while(!all_done)
2355  {
2356  all_done = true;
2357 
2358  for (Int supidx=0; supidx<stepSuper; supidx++){
2359  SuperNodeBufferType & snode = arrSuperNodes[supidx];
2360 
2361  TreeReduce<T> * redLTree = redToLeftTree_[snode.Index];
2362 
2363 #ifdef NEW_BCAST
2364  if(redLTree != NULL /*&& !redLdone[supidx]*/)
2365 #else
2366  if(redLTree != NULL && !redLdone[supidx])
2367 #endif
2368  {
2369 
2370  bool done = redLTree->Progress();
2371 #if ( _DEBUGlevel_ >= 1 )
2372  statusOFS<<"["<<snode.Index<<"] "<<" trying to progress reduce L "<<done<<std::endl;
2373 #endif
2374  redLdone[supidx]=done?1:0;
2375  if(done){
2376 
2377 #if ( _DEBUGlevel_ >= 1 )
2378  statusOFS<<"["<<snode.Index<<"] "<<" DONE reduce L"<<std::endl;
2379 #endif
2380 
2381 //if(redLTree->GetTag() == 2344 && MYPROC(grid_)==0){gdb_lock();}
2382  if( MYCOL( grid_ ) == PCOL( snode.Index, grid_ ) ){
2383  //determine the number of rows in LUpdateBufReduced
2384  Int numRowLUpdateBuf;
2385  std::vector<LBlock<T> >& Lcol = this->L( LBj( snode.Index, grid_ ) );
2386  if( MYROW( grid_ ) != PROW( snode.Index, grid_ ) ){
2387  snode.RowLocalPtr.resize( Lcol.size() + 1 );
2388  snode.BlockIdxLocal.resize( Lcol.size() );
2389  snode.RowLocalPtr[0] = 0;
2390  for( Int ib = 0; ib < Lcol.size(); ib++ ){
2391  snode.RowLocalPtr[ib+1] = snode.RowLocalPtr[ib] + Lcol[ib].numRow;
2392  snode.BlockIdxLocal[ib] = Lcol[ib].blockIdx;
2393  }
2394  } // I do not own the diagonal block
2395  else{
2396  snode.RowLocalPtr.resize( Lcol.size() );
2397  snode.BlockIdxLocal.resize( Lcol.size() - 1 );
2398  snode.RowLocalPtr[0] = 0;
2399  for( Int ib = 1; ib < Lcol.size(); ib++ ){
2400  snode.RowLocalPtr[ib] = snode.RowLocalPtr[ib-1] + Lcol[ib].numRow;
2401  snode.BlockIdxLocal[ib-1] = Lcol[ib].blockIdx;
2402  }
2403  } // I own the diagonal block, skip the diagonal block
2404  numRowLUpdateBuf = *snode.RowLocalPtr.rbegin();
2405 
2406  if( numRowLUpdateBuf > 0 ){
2407  if( snode.LUpdateBuf.m() == 0 && snode.LUpdateBuf.n() == 0 ){
2408  snode.LUpdateBuf.Resize( numRowLUpdateBuf,SuperSize( snode.Index, super_ ) );
2409  SetValue(snode.LUpdateBuf, ZERO<T>());
2410  }
2411  }
2412 
2413  //copy the buffer from the reduce tree
2414  redLTree->SetLocalBuffer(snode.LUpdateBuf.Data());
2415  }
2416 #ifndef NEW_BCAST
2417  redLdone[supidx]=1;
2418 #endif
2419  redLTree->CleanupBuffers();
2420  }
2421 
2422  all_done = all_done && (done || redLdone[supidx]);
2423  }
2424  }
2425  }
2426  TIMER_STOP(Reduce_Sinv_LT);
2427 
2428 
2429 
2430 #ifndef _RELEASE_
2431  PopCallStack();
2432 #endif
2433  //--------------------- End of reduce of LUpdateBuf-------------------------
2434 #ifndef _RELEASE_
2435  PushCallStack("PMatrix::SelInv_P2p::UpdateD");
2436 #endif
2437 
2438  TIMER_START(Update_Diagonal);
2439 
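  // Each supernode computes its local contribution to the diagonal block and
  // feeds it into the column-wise reduction tree rooted at the diagonal owner.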
2440  for (Int supidx=0; supidx<stepSuper; supidx++){
2441  SuperNodeBufferType & snode = arrSuperNodes[supidx];
2442 
2443  ComputeDiagUpdate(snode);
2444 
2445  //Get the reduction tree
2446  TreeReduce<T> * redDTree = redToAboveTree_[snode.Index];
2447 
2448  if(redDTree != NULL){
2449  //send the data
2450  if( MYROW( grid_ ) == PROW( snode.Index, grid_ ) ){
2451  if(snode.DiagBuf.Size()==0){
2452  snode.DiagBuf.Resize( SuperSize( snode.Index, super_ ), SuperSize( snode.Index, super_ ));
2453  SetValue(snode.DiagBuf, ZERO<T>());
2454  }
2455  }
2456 
2457 
2458  redDTree->SetLocalBuffer(snode.DiagBuf.Data());
2459  if(!redDTree->IsAllocated()){
2460  redDTree->SetTag(IDX_TO_TAG(snode.Rank,SELINV_TAG_D_REDUCE,limIndex_));
2461  redDTree->AllocRecvBuffers();
2462  //Post All Recv requests;
2463  redDTree->PostFirstRecv();
2464  }
2465 
2466  redDTree->SetDataReady(true);
2467  bool done = redDTree->Progress();
2468 #if ( _DEBUGlevel_ >= 1 )
2469  statusOFS<<"["<<snode.Index<<"] "<<" trying to progress reduce D "<<done<<std::endl;
2470 #endif
2471  }
2472 
2473  //advance reductions
2474  for (Int supidx=0; supidx<stepSuper; supidx++){
2475  SuperNodeBufferType & snode = arrSuperNodes[supidx];
2476  TreeReduce<T> * redDTree = redToAboveTree_[snode.Index];
2477  if(redDTree != NULL){
2478 #ifndef NEW_BCAST
2479  if(redDTree->IsAllocated())
2480 #endif
2481  {
2482  bool done = redDTree->Progress();
2483 #if ( _DEBUGlevel_ >= 1 )
2484  statusOFS<<"["<<snode.Index<<"] "<<" trying to progress reduce D "<<done<<std::endl;
2485 #endif
2486  }
2487  }
2488  }
2489  }
2490  TIMER_STOP(Update_Diagonal);
2491 
2492  TIMER_START(Reduce_Diagonal);
2493  //blocking wait for the reduction
2494  {
2495  vector<char> is_done(stepSuper,0);
2496  bool all_done = false;
2497  while(!all_done)
2498  {
2499  all_done = true;
2500 
2501  for (Int supidx=0; supidx<stepSuper; supidx++){
2502  SuperNodeBufferType & snode = arrSuperNodes[supidx];
2503  TreeReduce<T> * redDTree = redToAboveTree_[snode.Index];
2504 
2505 #ifdef NEW_BCAST
2506  if(redDTree != NULL /*&& !is_done[supidx]*/)
2507 #else
2508  if(redDTree != NULL && !is_done[supidx])
2509 #endif
2510  {
2511 
2512  bool done = redDTree->Progress();
2513 #if ( _DEBUGlevel_ >= 1 )
2514  statusOFS<<"["<<snode.Index<<"] "<<" trying to progress reduce D "<<done<<std::endl;
2515 #endif
2516  is_done[supidx]=done?1:0;
2517  if(done){
2518 
2519 #if ( _DEBUGlevel_ >= 1 )
2520  statusOFS<<"["<<snode.Index<<"] "<<" DONE reduce D"<<std::endl;
2521 #endif
2522  if( MYCOL( grid_ ) == PCOL( snode.Index, grid_ ) ){
2523  if( MYROW( grid_ ) == PROW( snode.Index, grid_ ) ){
2524  LBlock<T> & LB = this->L( LBj( snode.Index, grid_ ) )[0];
2525  // Symmetrize LB
2526  blas::Axpy( LB.numRow * LB.numCol, ONE<T>(), snode.DiagBuf.Data(), 1, LB.nzval.Data(), 1 );
2527  Symmetrize( LB.nzval );
2528  }
2529  }
2530 #ifndef NEW_BCAST
2531  is_done[supidx]=1;
2532 #endif
2533  redDTree->CleanupBuffers();
2534  }
2535 
2536  all_done = all_done && (done || is_done[supidx]);
2537  }
2538  }
2539  }
2540  }
2541  TIMER_STOP(Reduce_Diagonal);
2542 
2543 #ifndef _RELEASE_
2544  PopCallStack();
2545 #endif
2546 
2547 
2548 #ifndef _RELEASE_
2549  PushCallStack("PMatrix::SelInv_P2p::UpdateU");
2550 #endif
2551 
2552 
2553 
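  // Cross-diagonal exchange: the reduced L updates are sent across the
  // diagonal so that the corresponding U blocks can be updated.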
2554  SendRecvCD_UpdateU(arrSuperNodes, stepSuper);
2555 
2556 #ifndef _RELEASE_
2557  PopCallStack();
2558 #endif
2559 
2560 #ifndef _RELEASE_
2561  PushCallStack("PMatrix::SelInv_P2p::UpdateLFinal");
2562 #endif
2563 
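  // Copy the reduced LUpdateBuf back into the local L blocks; the diagonal
  // block, when owned, has already been updated in the reduce-D phase above.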
2564  TIMER_START(Update_L);
2565  for (Int supidx=0; supidx<stepSuper; supidx++){
2566  SuperNodeBufferType & snode = arrSuperNodes[supidx];
2567 
2568 #if ( _DEBUGlevel_ >= 1 )
2569  statusOFS << std::endl << "["<<snode.Index<<"] "<< "Finish updating the L part by filling LUpdateBufReduced back to L" << std::endl << std::endl;
2570 #endif
2571 
2572  if( MYCOL( grid_ ) == PCOL( snode.Index, grid_ ) && snode.LUpdateBuf.m() > 0 ){
2573  std::vector<LBlock<T> >& Lcol = this->L( LBj( snode.Index, grid_ ) );
2574  //Need to skip the diagonal block if present
2575  Int startBlock = (MYROW( grid_ ) == PROW( snode.Index, grid_ ))?1:0;
2576  for( Int ib = startBlock; ib < Lcol.size(); ib++ ){
2577  LBlock<T> & LB = Lcol[ib];
2578  lapack::Lacpy( 'A', LB.numRow, LB.numCol, &snode.LUpdateBuf( snode.RowLocalPtr[ib-startBlock], 0 ),
2579  snode.LUpdateBuf.m(), LB.nzval.Data(), LB.numRow );
2580  }
2581  } // Finish updating L
2582  } // for (snode.Index) : Main loop
2583  TIMER_STOP(Update_L);
2584 
2585 #ifndef _RELEASE_
2586  PopCallStack();
2587 #endif
2588 
2589 
2590  TIMER_START(Barrier);
2591 
2592 #ifdef NEW_BCAST
 2593  //block on the broadcast trees (U and L)
2594  for (Int supidx=0; supidx<stepSuper; supidx++){
2595  SuperNodeBufferType & snode = arrSuperNodes[supidx];
2596  TreeBcast2<T> * &bcastUTree2 = fwdToBelowTree2_[snode.Index];
2597 
2598  if(bcastUTree2 != NULL){
2599  bcastUTree2->Wait();
2600  bcastUTree2->CleanupBuffers();
2601  // delete bcastUTree2;
2602  // bcastUTree2 = NULL;
2603  }
2604 
2605  TreeBcast2<T> * &bcastLTree2 = fwdToRightTree2_[snode.Index];
2606 
2607  if(bcastLTree2 != NULL){
2608  bcastLTree2->Wait();
2609  bcastLTree2->CleanupBuffers();
2610  // delete bcastLTree2;
2611  // bcastLTree2 = NULL;
2612  }
2613 
2614  }
2615 #endif
2616 
2617  for (Int supidx=0; supidx<stepSuper; supidx++){
2618  SuperNodeBufferType & snode = arrSuperNodes[supidx];
2619 
2620  TreeReduce<T> * &redLTree = redToLeftTree_[snode.Index];
2621 
2622  if(redLTree != NULL){
2623  redLTree->Wait();
2624  redLTree->CleanupBuffers();
2625  // delete redLTree;
2626  // redLTree = NULL;
2627  }
2628  }
2629 
2630  for (Int supidx=0; supidx<stepSuper; supidx++){
2631  SuperNodeBufferType & snode = arrSuperNodes[supidx];
2632  TreeReduce<T> * &redDTree = redToAboveTree_[snode.Index];
2633 
2634  if(redDTree != NULL){
2635  redDTree->Wait();
2636  redDTree->CleanupBuffers();
2637  // delete redDTree;
2638  // redDTree = NULL;
2639  }
2640  }
2641 
2642 #ifndef NEW_BCAST
2643  mpi::Waitall(arrMpireqsRecvContentFromAny);
2644  for (Int supidx=0; supidx<stepSuper; supidx++){
2645  SuperNodeBufferType & snode = arrSuperNodes[supidx];
2646  Int ksup = snode.Index;
2647  std::vector<MPI_Request> & mpireqsSendToRight = arrMpireqsSendToRight[supidx];
2648  std::vector<MPI_Request> & mpireqsSendToBelow = arrMpireqsSendToBelow[supidx];
2649 
2650 #if ( _DEBUGlevel_ >= 1 )
2651  statusOFS<<"["<<ksup<<"] mpireqsSendToRight"<<std::endl;
2652 #endif
2653  mpi::Waitall( mpireqsSendToRight );
2654 #if ( _DEBUGlevel_ >= 1 )
2655  statusOFS<<"["<<ksup<<"] mpireqsSendToBelow"<<std::endl;
2656 #endif
2657 
2658  mpi::Waitall( mpireqsSendToBelow );
2659  }
2660 #endif
2661 
2662 #if ( _DEBUGlevel_ >= 1 )
2663  statusOFS<<"barrier done"<<std::endl;
2664 #endif
2665  TIMER_STOP(Barrier);
2666 
2667  }
2668 
 2669  template<typename T>
 2670  void PMatrix<T>::ConstructCommunicationPattern ( )
 2671  {
2672  ConstructCommunicationPattern_P2p();
2673  } // ----- end of method PMatrix::ConstructCommunicationPattern -----
2674 
2675 
2676 
 2677  template<typename T>
 2678  void PMatrix<T>::ConstructCommunicationPattern_P2p ( )
 2679  {
  Int numSuper = this->NumSuper(); // needed by the COMMUNICATOR_PROFILE statistics below as well
 2680 #ifdef COMMUNICATOR_PROFILE
2681  CommunicatorStat stats;
2682  stats.countSendToBelow_.Resize(numSuper);
2683  stats.countSendToRight_.Resize(numSuper);
2684  stats.countRecvFromBelow_.Resize( numSuper );
2685  SetValue( stats.countSendToBelow_, 0 );
2686  SetValue( stats.countSendToRight_, 0 );
2687  SetValue( stats.countRecvFromBelow_, 0 );
2688 #endif
2689 
2690 
2691 #ifndef _RELEASE_
2692  PushCallStack("PMatrix::ConstructCommunicationPattern_P2p");
2693 #endif
2694 
2695 
2697 
2698  TIMER_START(Allocate);
2699 
2700 #ifndef _RELEASE_
2701  PushCallStack( "Initialize the communication pattern" );
2702 #endif
2703 
2704 
2705 #ifdef NEW_BCAST
2706  fwdToBelowTree2_.resize(numSuper, NULL );
2707  fwdToRightTree2_.resize(numSuper, NULL );
2708 #else
2709  fwdToBelowTree_.resize(numSuper, NULL );
2710  fwdToRightTree_.resize(numSuper, NULL );
2711 #endif
2712  redToLeftTree_.resize(numSuper, NULL );
2713  redToAboveTree_.resize(numSuper, NULL );
2714 
2715  isSendToBelow_.Resize(grid_->numProcRow, numSuper);
2716  isSendToRight_.Resize(grid_->numProcCol, numSuper);
2717  isSendToDiagonal_.Resize( numSuper );
2718  SetValue( isSendToBelow_, false );
2719  SetValue( isSendToRight_, false );
2720  SetValue( isSendToDiagonal_, false );
2721 
2722  isSendToCrossDiagonal_.Resize(grid_->numProcCol+1, numSuper );
2723  SetValue( isSendToCrossDiagonal_, false );
2724  isRecvFromCrossDiagonal_.Resize(grid_->numProcRow+1, numSuper );
2725  SetValue( isRecvFromCrossDiagonal_, false );
2726 
2727  isRecvFromAbove_.Resize( numSuper );
2728  isRecvFromLeft_.Resize( numSuper );
2729  isRecvFromBelow_.Resize( grid_->numProcRow, numSuper );
2730  SetValue( isRecvFromAbove_, false );
2731  SetValue( isRecvFromBelow_, false );
2732  SetValue( isRecvFromLeft_, false );
2733 #ifndef _RELEASE_
2734  PopCallStack();
2735 #endif
2736 
2737  TIMER_STOP(Allocate);
2738 
2739  TIMER_START(GetEtree);
2740  std::vector<Int> snodeEtree(this->NumSuper());
2741  GetEtree(snodeEtree);
2742  TIMER_STOP(GetEtree);
2743 
2744 
2745 
2746 #ifndef _RELEASE_
2747  PushCallStack( "Local column communication" );
2748 #endif
2749 #if ( _DEBUGlevel_ >= 1 )
2750  statusOFS << std::endl << "Local column communication" << std::endl;
2751 #endif
2752  // localColBlockRowIdx stores the nonzero block indices for each local block column.
 2753  // The nonzero block indices include contributions from both L and U.
2754  // Dimension: numLocalBlockCol x numNonzeroBlock
2755  std::vector<std::set<Int> > localColBlockRowIdx;
2756 
2757  localColBlockRowIdx.resize( this->NumLocalBlockCol() );
2758 
2759 
2760  TIMER_START(Column_communication);
2761 
2762 
2763  for( Int ksup = 0; ksup < numSuper; ksup++ ){
2764  // All block columns perform independently
2765  if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
2766 
2767 
2768  // Communication
2769  std::vector<Int> tAllBlockRowIdx;
2770  std::vector<Int> & colBlockIdx = ColBlockIdx(LBj(ksup, grid_));
2771  TIMER_START(Allgatherv_Column_communication);
2772  if( grid_ -> mpisize != 1 )
2773  mpi::Allgatherv( colBlockIdx, tAllBlockRowIdx, grid_->colComm );
2774  else
2775  tAllBlockRowIdx = colBlockIdx;
2776 
2777  TIMER_STOP(Allgatherv_Column_communication);
2778 
2779  localColBlockRowIdx[LBj( ksup, grid_ )].insert(
2780  tAllBlockRowIdx.begin(), tAllBlockRowIdx.end() );
2781 
2782 #if ( _DEBUGlevel_ >= 1 )
2783  statusOFS
2784  << " Column block " << ksup
2785  << " has the following nonzero block rows" << std::endl;
2786  for( std::set<Int>::iterator si = localColBlockRowIdx[LBj( ksup, grid_ )].begin();
2787  si != localColBlockRowIdx[LBj( ksup, grid_ )].end();
2788  si++ ){
2789  statusOFS << *si << " ";
2790  }
2791  statusOFS << std::endl;
2792 #endif
2793 
2794  } // if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) )
2795  } // for(ksup)
2796 
2797 
2798 #ifndef _RELEASE_
2799  PopCallStack();
2800 #endif
2801 
2802  TIMER_STOP(Column_communication);
2803 
2804  TIMER_START(Row_communication);
2805 #ifndef _RELEASE_
2806  PushCallStack( "Local row communication" );
2807 #endif
2808 #if ( _DEBUGlevel_ >= 1 )
2809  statusOFS << std::endl << "Local row communication" << std::endl;
2810 #endif
2811  // localRowBlockColIdx stores the nonzero block indices for each local block row.
 2812  // The nonzero block indices include contributions from both L and U.
2813  // Dimension: numLocalBlockRow x numNonzeroBlock
2814  std::vector<std::set<Int> > localRowBlockColIdx;
2815 
2816  localRowBlockColIdx.resize( this->NumLocalBlockRow() );
2817 
2818  for( Int ksup = 0; ksup < numSuper; ksup++ ){
 2819  // All block rows perform independently
2820  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
2821 
2822  // Communication
2823  std::vector<Int> tAllBlockColIdx;
2824  std::vector<Int> & rowBlockIdx = RowBlockIdx(LBi(ksup, grid_));
2825  TIMER_START(Allgatherv_Row_communication);
2826  if( grid_ -> mpisize != 1 )
2827  mpi::Allgatherv( rowBlockIdx, tAllBlockColIdx, grid_->rowComm );
2828  else
2829  tAllBlockColIdx = rowBlockIdx;
2830 
2831  TIMER_STOP(Allgatherv_Row_communication);
2832 
2833  localRowBlockColIdx[LBi( ksup, grid_ )].insert(
2834  tAllBlockColIdx.begin(), tAllBlockColIdx.end() );
2835 
2836 #if ( _DEBUGlevel_ >= 1 )
2837  statusOFS
2838  << " Row block " << ksup
2839  << " has the following nonzero block columns" << std::endl;
2840  for( std::set<Int>::iterator si = localRowBlockColIdx[LBi( ksup, grid_ )].begin();
2841  si != localRowBlockColIdx[LBi( ksup, grid_ )].end();
2842  si++ ){
2843  statusOFS << *si << " ";
2844  }
2845  statusOFS << std::endl;
2846 #endif
2847 
2848  } // if( MYROW( grid_ ) == PROW( ksup, grid_ ) )
2849  } // for(ksup)
2850 
2851 #ifndef _RELEASE_
2852  PopCallStack();
2853 #endif
2854 
2855  TIMER_STOP(Row_communication);
2856 
2857  TIMER_START(STB_RFA);
2858 #ifndef _RELEASE_
2859  PushCallStack("SendToBelow / RecvFromAbove");
2860 #endif
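  // A nonzero block (isup, jsup) with isup > ksup, in a column jsup reachable
  // from ksup in the supernodal etree, forces the owner row of ksup to send its
  // U row down to the process row holding isup.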
2861  for( Int ksup = 0; ksup < numSuper - 1; ksup++ ){
2862 
2863 #ifdef COMMUNICATOR_PROFILE
2864  std::vector<bool> sTB(grid_->numProcRow,false);
2865 #endif
2866  // Loop over all the supernodes to the right of ksup
2867  Int jsup = snodeEtree[ksup];
2868  while(jsup<numSuper){
2869  Int jsupLocalBlockCol = LBj( jsup, grid_ );
2870  Int jsupProcCol = PCOL( jsup, grid_ );
2871  if( MYCOL( grid_ ) == jsupProcCol ){
2872  // SendToBelow / RecvFromAbove only if (ksup, jsup) is nonzero.
2873  if( localColBlockRowIdx[jsupLocalBlockCol].count( ksup ) > 0 ) {
2874  for( std::set<Int>::iterator si = localColBlockRowIdx[jsupLocalBlockCol].begin();
2875  si != localColBlockRowIdx[jsupLocalBlockCol].end(); si++ ){
2876  Int isup = *si;
2877  Int isupProcRow = PROW( isup, grid_ );
2878  if( isup > ksup ){
2879  if( MYROW( grid_ ) == isupProcRow ){
2880  isRecvFromAbove_(ksup) = true;
2881  }
2882  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
2883  isSendToBelow_( isupProcRow, ksup ) = true;
2884  }
2885  } // if( isup > ksup )
2886  } // for (si)
2887  } // if( localColBlockRowIdx[jsupLocalBlockCol].count( ksup ) > 0 )
2888 #ifdef COMMUNICATOR_PROFILE
2889  sTB[ PROW(ksup,grid_) ] = true;
2890  if( localColBlockRowIdx[jsupLocalBlockCol].count( ksup ) > 0 ) {
2891  for( std::set<Int>::iterator si = localColBlockRowIdx[jsupLocalBlockCol].begin();
2892  si != localColBlockRowIdx[jsupLocalBlockCol].end(); si++ ){
2893  Int isup = *si;
2894  Int isupProcRow = PROW( isup, grid_ );
2895  if( isup > ksup ){
2896  sTB[isupProcRow] = true;
2897  } // if( isup > ksup )
2898  } // for (si)
2899  }
2900 #endif
2901 
2902 
2903 
2904  } // if( MYCOL( grid_ ) == PCOL( jsup, grid_ ) )
2905  jsup = snodeEtree[jsup];
2906  } // for(jsup)
2907 
2908 #ifdef COMMUNICATOR_PROFILE
2909  Int count= std::count(sTB.begin(), sTB.end(), true);
2910  Int color = sTB[MYROW(grid_)];
2911  if(count>1){
2912  std::vector<Int> & snodeList = stats.maskSendToBelow_[sTB];
2913  snodeList.push_back(ksup);
2914  }
2915  stats.countSendToBelow_(ksup) = count * color;
2916 #endif
2917 
2918 
2919  } // for(ksup)
2920 #ifndef _RELEASE_
2921  PopCallStack();
2922 #endif
2923  TIMER_STOP(STB_RFA);
2924 
2925  TIMER_START(STR_RFL_RFB);
2926 #ifndef _RELEASE_
2927  PushCallStack("SendToRight / RecvFromLeft");
2928 #endif
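  // Symmetric argument for the L part: a nonzero block (isup, jsup) with
  // jsup > ksup forces the owner column of ksup to send its L column to the
  // right, towards the process column holding jsup.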
2929  for( Int ksup = 0; ksup < numSuper - 1; ksup++ ){
2930 
2931 #ifdef COMMUNICATOR_PROFILE
2932  std::vector<bool> sTR(grid_->numProcCol,false);
2933  std::vector<bool> rFB(grid_->numProcRow,false);
2934 #endif
2935  // Loop over all the supernodes below ksup
2936  Int isup = snodeEtree[ksup];
2937  while(isup<numSuper){
2938  Int isupLocalBlockRow = LBi( isup, grid_ );
2939  Int isupProcRow = PROW( isup, grid_ );
2940  if( MYROW( grid_ ) == isupProcRow ){
2941  // SendToRight / RecvFromLeft only if (isup, ksup) is nonzero.
2942  if( localRowBlockColIdx[isupLocalBlockRow].count( ksup ) > 0 ){
2943  for( std::set<Int>::iterator si = localRowBlockColIdx[isupLocalBlockRow].begin();
2944  si != localRowBlockColIdx[isupLocalBlockRow].end(); si++ ){
2945  Int jsup = *si;
2946  Int jsupProcCol = PCOL( jsup, grid_ );
2947  if( jsup > ksup ){
2948  if( MYCOL( grid_ ) == jsupProcCol ){
2949  isRecvFromLeft_(ksup) = true;
2950  }
2951  if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
2952  isSendToRight_( jsupProcCol, ksup ) = true;
2953  }
2954  }
2955  } // for (si)
2956  } // if( localRowBlockColIdx[isupLocalBlockRow].count( ksup ) > 0 )
2957 
2958 #ifdef COMMUNICATOR_PROFILE
2959  sTR[ PCOL(ksup,grid_) ] = true;
2960  if( localRowBlockColIdx[isupLocalBlockRow].count( ksup ) > 0 ){
2961  for( std::set<Int>::iterator si = localRowBlockColIdx[isupLocalBlockRow].begin();
2962  si != localRowBlockColIdx[isupLocalBlockRow].end(); si++ ){
2963  Int jsup = *si;
2964  Int jsupProcCol = PCOL( jsup, grid_ );
2965  if( jsup > ksup ){
2966  sTR[ jsupProcCol ] = true;
2967  } // if( jsup > ksup )
2968  } // for (si)
2969  }
2970 #endif
2971 
2972 
2973  } // if( MYROW( grid_ ) == isupProcRow )
2974 
2975  if( MYCOL( grid_ ) == PCOL(ksup, grid_) ){
2976  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
2977  isRecvFromBelow_(isupProcRow,ksup) = true;
2978  }
2979  else if (MYROW(grid_) == isupProcRow){
2980  isSendToDiagonal_(ksup)=true;
2981  }
2982  } // if( MYCOL( grid_ ) == PCOL(ksup, grid_) )
2983 
2984 #ifdef COMMUNICATOR_PROFILE
2985  rFB[ PROW(ksup,grid_) ] = true;
2986  if( MYCOL( grid_ ) == PCOL(ksup, grid_) ){
2987  rFB[ isupProcRow ] = true;
2988  } // if( MYCOL( grid_ ) == PCOL(ksup, grid_) )
2989 #endif
2990 
2991 
2992  isup = snodeEtree[isup];
2993  } // for (isup)
2994 
2995 #ifdef COMMUNICATOR_PROFILE
2996  Int count= std::count(rFB.begin(), rFB.end(), true);
2997  Int color = rFB[MYROW(grid_)];
2998  if(count>1){
2999  std::vector<Int> & snodeList = stats.maskRecvFromBelow_[rFB];
3000  snodeList.push_back(ksup);
3001  }
3002  stats.countRecvFromBelow_(ksup) = count * color;
3003 
3004  count= std::count(sTR.begin(), sTR.end(), true);
3005  color = sTR[MYCOL(grid_)];
3006  if(count>1){
3007  std::vector<Int> & snodeList = stats.maskSendToRight_[sTR];
3008  snodeList.push_back(ksup);
3009  }
3010  stats.countSendToRight_(ksup) = count * color;
3011 #endif
3012 
3013 
3014  } // for (ksup)
3015 
3016 #ifndef _RELEASE_
3017  PopCallStack();
3018 #endif
3019  TIMER_STOP(STR_RFL_RFB);
3020 
3021 #ifdef COMMUNICATOR_PROFILE
3022  {
3023 
3024 
3025  // statusOFS << std::endl << "stats.countSendToBelow:" << stats.countSendToBelow_ << std::endl;
3026  stats.maskSendToBelow_.insert(stats.maskRecvFromBelow_.begin(),stats.maskRecvFromBelow_.end());
3027  statusOFS << std::endl << "stats.maskSendToBelow_:" << stats.maskSendToBelow_.size() <<std::endl;
3028  // bitMaskSet::iterator it;
3029  // for(it = stats.maskSendToBelow_.begin(); it != stats.maskSendToBelow_.end(); it++){
3030  // //print the involved processors
3031  // for(int curi = 0; curi < it->first.size(); curi++){
3032  // statusOFS << it->first[curi] << " ";
3033  // }
3034  //
3035  // statusOFS<< " ( ";
3036  // //print the involved supernode indexes
3037  // for(int curi = 0; curi < it->second.size(); curi++){
3038  // statusOFS<< it->second[curi]<<" ";
3039  // }
3040  //
3041  // statusOFS << ")"<< std::endl;
3042  // }
3043  // statusOFS << std::endl << "stats.countRecvFromBelow:" << stats.countRecvFromBelow_ << std::endl;
3044  // statusOFS << std::endl << "stats.maskRecvFromBelow_:" << stats.maskRecvFromBelow_.size() <<std::endl;
3045  // statusOFS << std::endl << "stats.countSendToRight:" << stats.countSendToRight_ << std::endl;
3046  statusOFS << std::endl << "stats.maskSendToRight_:" << stats.maskSendToRight_.size() <<std::endl;
3047  }
3048 #endif
3049 
3050 
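  // For each supernode, every process in the row (resp. column) advertises
  // whether it participates in the broadcast of the L column (resp. U row),
  // and the owner advertises the serialized message size; a broadcast tree is
  // then built over the participating ranks only.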
3051  TIMER_START(BUILD_BCAST_TREES);
3052  //Allgather RFL values within column
3053  {
3054  vector<double> SeedRFL(numSuper,0.0);
3055  vector<Int> aggRFL(numSuper);
3056  vector<Int> globalAggRFL(numSuper*grid_->numProcCol);
3057  for( Int ksup = 0; ksup < numSuper ; ksup++ ){
3058 #if ( _DEBUGlevel_ >= 1 )
3059  statusOFS<<"1 "<<ksup<<std::endl;
3060 #endif
3061  if(MYCOL(grid_)==PCOL(ksup,grid_)){
3062  std::vector<LBlock<T> >& Lcol = this->L( LBj(ksup, grid_) );
3063  // All blocks except for the diagonal block are to be sent right
3064 
3065  Int totalSize = 0;
3066  //one integer holding the number of Lblocks
3067  totalSize+=sizeof(Int);
3068  for( Int ib = 0; ib < Lcol.size(); ib++ ){
3069  if( Lcol[ib].blockIdx > ksup ){
3070  //three indices + one IntNumVec
3071  totalSize+=3*sizeof(Int);
3072  totalSize+= sizeof(Int)+Lcol[ib].rows.ByteSize();
3073  }
3074  }
3075 
3076 
3077  aggRFL[ksup]=totalSize;
3078  SeedRFL[ksup]=rand();
3079 
3080  }
3081  else if(isRecvFromLeft_(ksup)){
3082  aggRFL[ksup]=1;
3083  }
3084  else{
3085  aggRFL[ksup]=0;
3086  }
3087  }
 3088  //allgather
3089  MPI_Allgather(&aggRFL[0],numSuper*sizeof(Int),MPI_BYTE,
3090  &globalAggRFL[0],numSuper*sizeof(Int),MPI_BYTE,
3091  grid_->rowComm);
3092  MPI_Allreduce(MPI_IN_PLACE,&SeedRFL[0],numSuper,MPI_DOUBLE,MPI_MAX,grid_->rowComm);
3093 
3094 
3095  for( Int ksup = 0; ksup < numSuper; ksup++ ){
3096 #if ( _DEBUGlevel_ >= 1 )
3097  statusOFS<<"3 "<<ksup<<std::endl;
3098 #endif
3099  if( isRecvFromLeft_(ksup) || CountSendToRight(ksup)>0 ){
3100  Int msgSize = 0;
3101  vector<Int> tree_ranks;
3102  Int countRFL= 0;
3103  for( Int iProcCol = 0; iProcCol < grid_->numProcCol; iProcCol++ ){
3104  if( iProcCol != PCOL( ksup, grid_ ) ){
3105  Int isRFL = globalAggRFL[iProcCol*numSuper + ksup];
3106  if(isRFL>0){
3107  ++countRFL;
3108  }
3109  }
3110  }
3111 
3112  tree_ranks.reserve(countRFL+1);
3113  tree_ranks.push_back(PCOL(ksup,grid_));
3114 
3115  for( Int iProcCol = 0; iProcCol < grid_->numProcCol; iProcCol++ ){
3116  Int isRFL = globalAggRFL[iProcCol*numSuper + ksup];
3117  if(isRFL>0){
3118  if( iProcCol != PCOL( ksup, grid_ ) ){
3119  tree_ranks.push_back(iProcCol);
3120  }
3121  else{
3122  msgSize = isRFL;
3123  }
3124  }
3125  }
3126 
3127 #ifdef NEW_BCAST
3128  TreeBcast2<T> * & BcastLTree2 = fwdToRightTree2_[ksup];
3129  if(BcastLTree2!=NULL){delete BcastLTree2; BcastLTree2=NULL;}
3130  BcastLTree2 = TreeBcast2<T>::Create(this->grid_->rowComm,&tree_ranks[0],tree_ranks.size(),msgSize,SeedRFL[ksup]);
3131 
3132 #ifdef COMM_PROFILE_BCAST
3133  BcastLTree2->SetGlobalComm(grid_->comm);
3134 #endif
3135 #else
3136  TreeBcast * & BcastLTree = fwdToRightTree_[ksup];
3137  if(BcastLTree!=NULL){delete BcastLTree; BcastLTree=NULL;}
3138  BcastLTree = TreeBcast::Create(this->grid_->rowComm,&tree_ranks[0],tree_ranks.size(),msgSize,SeedRFL[ksup]);
3139 #ifdef COMM_PROFILE_BCAST
3140  BcastLTree->SetGlobalComm(grid_->comm);
3141 #endif
3142 #endif
3143 
3144  }
3145  }
3146 
3147 
3148 
3149 
3150 
3151  }
3152  {
3153  vector<double> SeedRFA(numSuper,0.0);
3154  vector<Int> aggRFA(numSuper);
3155  vector<Int> globalAggRFA(numSuper*grid_->numProcRow);
3156  for( Int ksup = 0; ksup < numSuper ; ksup++ ){
3157 #if ( _DEBUGlevel_ >= 1 )
3158  statusOFS<<"2 "<<ksup<<std::endl;
3159 #endif
3160  if(MYROW(grid_)==PROW(ksup,grid_)){
3161  std::vector<UBlock<T> >& Urow = this->U( LBi(ksup, grid_) );
 3162  // All blocks in the U row are to be sent down
3163 
3164  Int totalSize = 0;
 3165  //one integer holding the number of Ublocks
3166  totalSize+=sizeof(Int);
3167  for( Int jb = 0; jb < Urow.size(); jb++ ){
3168  if( Urow[jb].blockIdx >= ksup ){
3169  //three indices + one IntNumVec + one NumMat<T>
3170  totalSize+=3*sizeof(Int);
3171  totalSize+= sizeof(Int)+Urow[jb].cols.ByteSize();
3172  totalSize+= 2*sizeof(Int)+Urow[jb].nzval.ByteSize();
3173  }
3174  }
3175  aggRFA[ksup]=totalSize;
3176  SeedRFA[ksup]=rand();
3177  }
3178  else if(isRecvFromAbove_(ksup)){
3179  aggRFA[ksup]=1;
3180  }
3181  else{
3182  aggRFA[ksup]=0;
3183  }
3184  }
3185 
3186 
3187  //allgather
3188  MPI_Allgather(&aggRFA[0],numSuper*sizeof(Int),MPI_BYTE,
3189  &globalAggRFA[0],numSuper*sizeof(Int),MPI_BYTE,
3190  grid_->colComm);
3191  MPI_Allreduce(MPI_IN_PLACE,&SeedRFA[0],numSuper,MPI_DOUBLE,MPI_MAX,grid_->colComm);
3192 
3193  for( Int ksup = 0; ksup < numSuper; ksup++ ){
3194 #if ( _DEBUGlevel_ >= 1 )
3195  statusOFS<<"4 "<<ksup<<std::endl;
3196 #endif
3197  if( isRecvFromAbove_(ksup) || CountSendToBelow(ksup)>0 ){
3198  vector<Int> tree_ranks;
3199  Int msgSize = 0;
3200  Int countRFA= 0;
3201  for( Int iProcRow = 0; iProcRow < grid_->numProcRow; iProcRow++ ){
3202  if( iProcRow != PROW( ksup, grid_ ) ){
3203  Int isRFA = globalAggRFA[iProcRow*numSuper + ksup];
3204  if(isRFA>0){
3205  ++countRFA;
3206  }
3207  }
3208  }
3209 
3210  tree_ranks.reserve(countRFA+1);
3211  tree_ranks.push_back(PROW(ksup,grid_));
3212 
3213  for( Int iProcRow = 0; iProcRow < grid_->numProcRow; iProcRow++ ){
3214  Int isRFA = globalAggRFA[iProcRow*numSuper + ksup];
3215  if(isRFA>0){
3216  if( iProcRow != PROW( ksup, grid_ ) ){
3217  tree_ranks.push_back(iProcRow);
3218  }
3219  else{
3220  msgSize = isRFA;
3221  }
3222  }
3223  }
3224 
3225 #ifdef NEW_BCAST
3226  TreeBcast2<T> * & BcastUTree2 = fwdToBelowTree2_[ksup];
3227  if(BcastUTree2!=NULL){delete BcastUTree2; BcastUTree2=NULL;}
3228  BcastUTree2 = TreeBcast2<T>::Create(this->grid_->colComm,&tree_ranks[0],tree_ranks.size(),msgSize,SeedRFA[ksup]);
3229 
3230 #ifdef COMM_PROFILE_BCAST
3231  BcastUTree2->SetGlobalComm(grid_->comm);
3232 #endif
3233 #else
3234  TreeBcast * & BcastUTree = fwdToBelowTree_[ksup];
3235  if(BcastUTree!=NULL){delete BcastUTree; BcastUTree=NULL;}
3236  BcastUTree = TreeBcast::Create(this->grid_->colComm,&tree_ranks[0],tree_ranks.size(),msgSize,SeedRFA[ksup]);
3237 #ifdef COMM_PROFILE_BCAST
3238  BcastUTree->SetGlobalComm(grid_->comm);
3239 #endif
3240 #endif
3241  }
3242 
3243  }
3244  }
 3245  //build the reduction trees in a similar fashion
3246  TIMER_STOP(BUILD_BCAST_TREES);
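  // Reduction tree for the diagonal update: contributions flow up the process
  // column to the owner of the diagonal block of each supernode.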
3247  TIMER_START(BUILD_REDUCE_D_TREE);
3248  {
3249  vector<double> SeedSTD(numSuper,0.0);
3250  vector<Int> aggSTD(numSuper);
3251  vector<Int> globalAggSTD(numSuper*grid_->numProcRow);
3252  for( Int ksup = 0; ksup < numSuper; ksup++ ){
3253  if( MYCOL( grid_ ) == PCOL(ksup, grid_) && MYROW(grid_)==PROW(ksup,grid_)){
3254  Int totalSize = sizeof(T)*SuperSize( ksup, super_ )*SuperSize( ksup, super_ );
3255  aggSTD[ksup]=totalSize;
3256  SeedSTD[ksup]=rand();
3257  }
3258  else if(isSendToDiagonal_(ksup)){
3259  aggSTD[ksup]=1;
3260  }
3261  else{
3262  aggSTD[ksup]=0;
3263  }
3264  }
3265 
3266 
3267  //allgather
3268  MPI_Allgather(&aggSTD[0],numSuper*sizeof(Int),MPI_BYTE,
3269  &globalAggSTD[0],numSuper*sizeof(Int),MPI_BYTE,
3270  grid_->colComm);
3271  MPI_Allreduce(MPI_IN_PLACE,&SeedSTD[0],numSuper,MPI_DOUBLE,MPI_MAX,grid_->colComm);
3272 
3273 
3274  for( Int ksup = 0; ksup < numSuper; ksup++ ){
3275 #if ( _DEBUGlevel_ >= 1 )
3276  statusOFS<<"5 "<<ksup<<std::endl;
3277 #endif
3278  if( MYCOL( grid_ ) == PCOL(ksup, grid_) ){
3279 
3280  Int amISTD = globalAggSTD[MYROW(grid_)*numSuper + ksup];
3281  // if( MYCOL( grid_ ) == PCOL(ksup, grid_) && MYROW(grid_)==PROW(ksup,grid_)){
3282  // assert(amISTD>0);
3283  // }
3284 
3285  if( amISTD ){
3286  vector<Int> tree_ranks;
3287  Int msgSize = 0;
3288 #if 0
3289  set<Int> set_ranks;
3290  for( Int iProcRow = 0; iProcRow < grid_->numProcRow; iProcRow++ ){
3291  Int isSTD = globalAggSTD[iProcRow*numSuper + ksup];
3292  if(isSTD>0){
3293  if( iProcRow != PROW( ksup, grid_ ) ){
3294  set_ranks.insert(iProcRow);
3295  }
3296  else{
3297  msgSize = isSTD;
3298  }
3299  }
3300  }
3301 
3302 
3303  tree_ranks.push_back(PROW(ksup,grid_));
3304  tree_ranks.insert(tree_ranks.end(),set_ranks.begin(),set_ranks.end());
3305 #else
3306  Int countSTD= 0;
3307  for( Int iProcRow = 0; iProcRow < grid_->numProcRow; iProcRow++ ){
3308  if( iProcRow != PROW( ksup, grid_ ) ){
3309  Int isSTD = globalAggSTD[iProcRow*numSuper + ksup];
3310  if(isSTD>0){
3311  ++countSTD;
3312  }
3313  }
3314  }
3315 
3316  tree_ranks.reserve(countSTD+1);
3317  tree_ranks.push_back(PROW(ksup,grid_));
3318 
3319  for( Int iProcRow = 0; iProcRow < grid_->numProcRow; iProcRow++ ){
3320  Int isSTD = globalAggSTD[iProcRow*numSuper + ksup];
3321  if(isSTD>0){
3322  if( iProcRow != PROW( ksup, grid_ ) ){
3323  tree_ranks.push_back(iProcRow);
3324  }
3325  else{
3326  msgSize = isSTD;
3327  }
3328  }
3329  }
3330 #endif
3331  //assert(set_ranks.find(MYROW(grid_))!= set_ranks.end() || MYROW(grid_)==tree_ranks[0]);
3332 
3333  TreeReduce<T> * & redDTree = redToAboveTree_[ksup];
3334 
3335 
3336  if(redDTree!=NULL){delete redDTree; redDTree=NULL;}
3337  redDTree = TreeReduce<T>::Create(this->grid_->colComm,&tree_ranks[0],tree_ranks.size(),msgSize,SeedSTD[ksup]);
3338 #ifdef COMM_PROFILE
3339  redDTree->SetGlobalComm(grid_->comm);
3340 #endif
3341  }
3342  }
3343  }
3344  }
3345  TIMER_STOP(BUILD_REDUCE_D_TREE);
3346 
3347 
3348 
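  // Reduction tree for the L updates: contributions flow along the process row
  // back to the owner column of each supernode.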
3349  TIMER_START(BUILD_REDUCE_L_TREE);
3350  {
3351  vector<double> SeedRTL(numSuper,0.0);
3352  vector<Int> aggRTL(numSuper);
3353  vector<Int> globalAggRTL(numSuper*grid_->numProcCol);
3354  for( Int ksup = 0; ksup < numSuper ; ksup++ ){
3355  if(MYCOL(grid_)==PCOL(ksup,grid_)){
3356  std::vector<LBlock<T> >& Lcol = this->L( LBj(ksup, grid_) );
 3357  // All blocks except for the diagonal block contribute rows to LUpdateBuf
3358 
3359  Int totalSize = 0;
3360 
3361  //determine the number of rows in LUpdateBufReduced
3362  Int numRowLUpdateBuf = 0;
3363  if( MYROW( grid_ ) != PROW( ksup, grid_ ) ){
3364  for( Int ib = 0; ib < Lcol.size(); ib++ ){
3365  numRowLUpdateBuf += Lcol[ib].numRow;
3366  }
3367  } // I do not own the diagonal block
3368  else{
3369  for( Int ib = 1; ib < Lcol.size(); ib++ ){
3370  numRowLUpdateBuf += Lcol[ib].numRow;
3371  }
3372  } // I own the diagonal block, skip the diagonal block
3373 
3374  //if(ksup==297){gdb_lock();}
3375  totalSize = numRowLUpdateBuf*SuperSize( ksup, super_ )*sizeof(T);
3376 
3377  aggRTL[ksup]=totalSize;
3378 
3379  SeedRTL[ksup]=rand();
3380  }
3381  else if(isRecvFromLeft_(ksup)){
3382  aggRTL[ksup]=1;
3383  }
3384  else{
3385  aggRTL[ksup]=0;
3386  }
3387  }
 3388  //allgather
3389  MPI_Allgather(&aggRTL[0],numSuper*sizeof(Int),MPI_BYTE,
3390  &globalAggRTL[0],numSuper*sizeof(Int),MPI_BYTE,
3391  grid_->rowComm);
3392  MPI_Allreduce(MPI_IN_PLACE,&SeedRTL[0],numSuper,MPI_DOUBLE,MPI_MAX,grid_->rowComm);
3393 
3394 
3395  for( Int ksup = 0; ksup < numSuper ; ksup++ ){
3396 #if ( _DEBUGlevel_ >= 1 )
3397  statusOFS<<"6 "<<ksup<<std::endl;
3398 #endif
3399  if( isRecvFromLeft_(ksup) || CountSendToRight(ksup)>0 ){
3400  vector<Int> tree_ranks;
3401  Int msgSize = 0;
3402 #if 0
3403  set<Int> set_ranks;
3404  for( Int iProcCol = 0; iProcCol < grid_->numProcCol; iProcCol++ ){
3405  Int isRTL = globalAggRTL[iProcCol*numSuper + ksup];
3406  if(isRTL>0){
3407  if( iProcCol != PCOL( ksup, grid_ ) ){
3408  set_ranks.insert(iProcCol);
3409  }
3410  else{
3411  msgSize = isRTL;
3412  }
3413  }
3414  }
3415  tree_ranks.push_back(PCOL(ksup,grid_));
3416  tree_ranks.insert(tree_ranks.end(),set_ranks.begin(),set_ranks.end());
3417 #else
3418  Int countRTL= 0;
3419  for( Int iProcCol = 0; iProcCol < grid_->numProcCol; iProcCol++ ){
3420  if( iProcCol != PCOL( ksup, grid_ ) ){
3421  Int isRTL = globalAggRTL[iProcCol*numSuper + ksup];
3422  if(isRTL>0){
3423  ++countRTL;
3424  }
3425  }
3426  }
3427 
3428  tree_ranks.reserve(countRTL+1);
3429  tree_ranks.push_back(PCOL(ksup,grid_));
3430 
3431  for( Int iProcCol = 0; iProcCol < grid_->numProcCol; iProcCol++ ){
3432  Int isRTL = globalAggRTL[iProcCol*numSuper + ksup];
3433  if(isRTL>0){
3434  if( iProcCol != PCOL( ksup, grid_ ) ){
3435  tree_ranks.push_back(iProcCol);
3436  }
3437  else{
3438  msgSize = isRTL;
3439  }
3440  }
3441  }
3442 
3443 #endif
3444 
3445 
3446  TreeReduce<T> * & redLTree = redToLeftTree_[ksup];
3447  if(redLTree!=NULL){delete redLTree; redLTree=NULL;}
3448  redLTree = TreeReduce<T>::Create(this->grid_->rowComm,&tree_ranks[0],tree_ranks.size(),msgSize,SeedRTL[ksup]);
3449 #ifdef COMM_PROFILE
3450  redLTree->SetGlobalComm(grid_->comm);
3451 #endif
3452  }
3453  }
3454  }
3455  TIMER_STOP(BUILD_REDUCE_L_TREE);
3456 
3457 
3458 
3459 
3460 
3461 
3462 
3463 #if ( _DEBUGlevel_ >= 1 )
3464  statusOFS << std::endl << "isSendToBelow:" << std::endl;
3465  for(int j = 0;j< isSendToBelow_.n();j++){
3466  statusOFS << "["<<j<<"] ";
3467  for(int i =0; i < isSendToBelow_.m();i++){
3468  statusOFS<< isSendToBelow_(i,j) << " ";
3469  }
3470  statusOFS<<std::endl;
3471  }
3472 
3473  statusOFS << std::endl << "isRecvFromAbove:" << std::endl;
3474  for(int j = 0;j< isRecvFromAbove_.m();j++){
3475  statusOFS << "["<<j<<"] "<< isRecvFromAbove_(j)<<std::endl;
3476  }
3477 #endif
3478 #if ( _DEBUGlevel_ >= 1 )
3479  statusOFS << std::endl << "isSendToRight:" << std::endl;
3480  for(int j = 0;j< isSendToRight_.n();j++){
3481  statusOFS << "["<<j<<"] ";
3482  for(int i =0; i < isSendToRight_.m();i++){
3483  statusOFS<< isSendToRight_(i,j) << " ";
3484  }
3485  statusOFS<<std::endl;
3486  }
3487 
3488  statusOFS << std::endl << "isRecvFromLeft:" << std::endl;
3489  for(int j = 0;j< isRecvFromLeft_.m();j++){
3490  statusOFS << "["<<j<<"] "<< isRecvFromLeft_(j)<<std::endl;
3491  }
3492 
3493  statusOFS << std::endl << "isRecvFromBelow:" << std::endl;
3494  for(int j = 0;j< isRecvFromBelow_.n();j++){
3495  statusOFS << "["<<j<<"] ";
3496  for(int i =0; i < isRecvFromBelow_.m();i++){
3497  statusOFS<< isRecvFromBelow_(i,j) << " ";
3498  }
3499  statusOFS<<std::endl;
3500  }
3501 #endif
3502 
3503 
3504 
3505 
3506 
3507 
3508 
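  // Determine the cross-diagonal communication pattern used by PreSelInv:
  // a process owning an off-diagonal block L(isup,ksup) must send it
  // across the diagonal to the process on row PROW(ksup) that owns the
  // corresponding U blocks, where it is transposed into U(ksup,isup).
  // isSendToCrossDiagonal_(pc,ksup) marks the destination processor
  // columns, with the last entry (index numProcCol) acting as an
  // aggregate "this process sends for ksup" flag;
  // isRecvFromCrossDiagonal_ is the mirror image for the receiving side.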
3509  TIMER_START(STCD_RFCD);
3510 
3511 
3512 #ifndef _RELEASE_
3513  PushCallStack("SendToCrossDiagonal / RecvFromCrossDiagonal");
3514 #endif
3515  for( Int ksup = 0; ksup < numSuper - 1; ksup++ ){
3516  if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
3517  for( std::set<Int>::iterator si = localColBlockRowIdx[LBj( ksup, grid_ )].begin();
3518  si != localColBlockRowIdx[LBj( ksup, grid_ )].end(); si++ ){
3519  Int isup = *si;
3520  Int isupProcRow = PROW( isup, grid_ );
3521  Int isupProcCol = PCOL( isup, grid_ );
3522  if( isup > ksup && MYROW( grid_ ) == isupProcRow ){
3523  isSendToCrossDiagonal_(grid_->numProcCol, ksup ) = true;
3524  isSendToCrossDiagonal_(isupProcCol, ksup ) = true;
3525  }
3526  } // for (si)
3527  } // if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) )
3528  } // for (ksup)
3529 
3530  for( Int ksup = 0; ksup < numSuper - 1; ksup++ ){
3531  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
3532  for( std::set<Int>::iterator si = localRowBlockColIdx[ LBi(ksup, grid_) ].begin();
3533  si != localRowBlockColIdx[ LBi(ksup, grid_) ].end(); si++ ){
3534  Int jsup = *si;
3535  Int jsupProcCol = PCOL( jsup, grid_ );
3536  Int jsupProcRow = PROW( jsup, grid_ );
3537  if( jsup > ksup && MYCOL(grid_) == jsupProcCol ){
3538  isRecvFromCrossDiagonal_(grid_->numProcRow, ksup ) = true;
3539  isRecvFromCrossDiagonal_(jsupProcRow, ksup ) = true;
3540  }
3541  } // for (si)
3542  } // if( MYROW( grid_ ) == PROW( ksup, grid_ ) )
3543  } // for (ksup)
3544 #if ( _DEBUGlevel_ >= 1 )
3545  statusOFS << std::endl << "isSendToCrossDiagonal:" << std::endl;
3546  for(int j =0; j < isSendToCrossDiagonal_.n();j++){
3547  if(isSendToCrossDiagonal_(grid_->numProcCol,j)){
3548  statusOFS << "["<<j<<"] ";
3549  for(int i =0; i < isSendToCrossDiagonal_.m()-1;i++){
3550  if(isSendToCrossDiagonal_(i,j))
3551  {
3552  statusOFS<< PNUM(PROW(j,grid_),i,grid_)<<" ";
3553  }
3554  }
3555  statusOFS<<std::endl;
3556  }
3557  }
3558 
3559  statusOFS << std::endl << "isRecvFromCrossDiagonal:" << std::endl;
3560  for(int j =0; j < isRecvFromCrossDiagonal_.n();j++){
3561  if(isRecvFromCrossDiagonal_(grid_->numProcRow,j)){
3562  statusOFS << "["<<j<<"] ";
3563  for(int i =0; i < isRecvFromCrossDiagonal_.m()-1;i++){
3564  if(isRecvFromCrossDiagonal_(i,j))
3565  {
3566  statusOFS<< PNUM(i,PCOL(j,grid_),grid_)<<" ";
3567  }
3568  }
3569  statusOFS<<std::endl;
3570  }
3571  }
3572 
3573 
3574 #endif
3575 
3576 #ifndef _RELEASE_
3577  PopCallStack();
3578 #endif
3579 
3580  TIMER_STOP(STCD_RFCD);
3581 
3582 #ifndef _RELEASE_
3583  PopCallStack();
3584 #endif
3585 
3586  //Build the list of supernodes based on the elimination tree from SuperLU
3587  GetWorkSet(snodeEtree,this->WorkingSet());
3588 
3589  return ;
3590  } // ----- end of method PMatrix::ConstructCommunicationPattern_P2p -----
3591 
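  // SelInv: driver for the selected inversion.  It only handles the
  // symmetric case (optionsLU_->Symmetric must be nonzero), delegates the
  // computation to SelInv_P2p, optionally dumps GEMM and communication
  // statistics, and finally resets the broadcast and reduction trees so
  // that the communication pattern can be reused by a later inversion.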
3592  template<typename T>
3593  void PMatrix<T>::SelInv ( )
3594  {
3595 
3596  if(optionsLU_->Symmetric == 0){
3597  throw std::logic_error( "The matrix is not symmetric, this routine can't handle it !" );
3598  }
3599  SelInv_P2p ( );
3600 
3601 
3602 #ifdef GEMM_PROFILE
3603  statOFS<<"m"<<"\t"<<"n"<<"\t"<<"z"<<std::endl;
3604  for(std::deque<int >::iterator it = gemm_stat.begin(); it!=gemm_stat.end(); it+=3){
3605  statOFS<<*it<<"\t"<<*(it+1)<<"\t"<<*(it+2)<<std::endl;
3606  }
3607 #endif
3608 
3609 #if defined(COMM_PROFILE) || defined(COMM_PROFILE_BCAST)
3610  //std::cout<<"DUMPING COMM STATS "<<comm_stat.size()<<" "<<std::endl;
3611  commOFS<<HEADER_COMM<<std::endl;
3612  for(std::deque<int>::iterator it = comm_stat.begin(); it!=comm_stat.end(); it+=4){
3613  commOFS<<LINE_COMM(it)<<std::endl;
3614  }
3615 #endif
3616 
3617  //reset the trees
3618  for(int i = 0 ; i< fwdToBelowTree_.size();++i){
3619  if(fwdToBelowTree_[i]!=NULL){
3620  fwdToBelowTree_[i]->Reset();
3621  }
3622  }
3623  for(int i = 0 ; i< fwdToRightTree_.size();++i){
3624  if(fwdToRightTree_[i]!=NULL){
3625  fwdToRightTree_[i]->Reset();
3626  }
3627  }
3628  for(int i = 0 ; i< redToLeftTree_.size();++i){
3629  if(redToLeftTree_[i]!=NULL){
3630  redToLeftTree_[i]->Reset();
3631  }
3632  }
3633  for(int i = 0 ; i< redToAboveTree_.size();++i){
3634  if(redToAboveTree_[i]!=NULL){
3635  redToAboveTree_[i]->Reset();
3636  }
3637  }
3638 
3639 
3640 
3641  } // ----- end of method PMatrix::SelInv -----
3642 
3643 
3644 
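  // SelInv_P2p: point-to-point selected inversion.  The supernodes are
  // processed level by level following the working set built from the
  // elimination tree (this->WorkingSet()); SelInvIntra_P2p does the work
  // within each level.  The periodic MPI_Barrier triggered when
  // (rank-1) % limIndex_ == 0 appears to guard against MPI tag
  // wrap-around (tags are bounded by maxTag_), ensuring messages posted
  // with earlier tag values have completed before those tags are reused.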
3645  template<typename T>
3646  void PMatrix<T>::SelInv_P2p ( )
3647  {
3648  TIMER_START(SelInv_P2p);
3649 
3650 #ifndef _RELEASE_
3651  PushCallStack("PMatrix::SelInv_P2p");
3652 #endif
3653 
3654 #if ( _DEBUGlevel_ >= 1 )
3655  statusOFS<<"maxTag value: "<<maxTag_<<std::endl;
3656 #endif
3657 
3658  Int numSuper = this->NumSuper();
3659 
3660  // Main loop
3661  std::vector<std::vector<Int> > & superList = this->WorkingSet();
3662  Int numSteps = superList.size();
3663 
3664  Int rank = 0;
3665  for (Int lidx=0; lidx<numSteps ; lidx++){
3666  Int stepSuper = superList[lidx].size();
3667  //statusOFS<<"IN "<<lidx<<"/"<<numSteps<<std::endl;
3668  SelInvIntra_P2p(lidx,rank);
3669 #if ( _DEBUGlevel_ >= 1 )
3670  statusOFS<<"OUT "<<lidx<<"/"<<numSteps<<" "<<limIndex_<<std::endl;
3671 #endif
3672 
3673 #ifdef LIST_BARRIER
3674 #ifndef ALL_BARRIER
3675  if (options_->maxPipelineDepth!=-1)
3676 #endif
3677  {
3678  MPI_Barrier(grid_->comm);
3679  }
3680 #endif
3681 
3682 
3683 
3684  // assert(workingRanks_[lidx].size()==stepSuper);
3685 
3686  //find max snode.Index
3687  if(lidx>0 && (rank-1)%limIndex_==0){
3688  //#if ( _DEBUGlevel_ >= 1 )
3689  // statusOFS<<rank-1<<" Barrier "<<std::endl;
3690  //#endif
3691  MPI_Barrier(grid_->comm);
3692  }
3693 
3694 
3695  // if(lidx==1){ return;};
3696  }
3697 
3698  MPI_Barrier(grid_->comm);
3699 #ifndef _RELEASE_
3700  PopCallStack();
3701 #endif
3702 
3703  TIMER_STOP(SelInv_P2p);
3704 
3705  return ;
3706  } // ----- end of method PMatrix::SelInv_P2p -----
3707 
3708 
3709 
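  // PreSelInv prepares the factored matrix for the selected inversion in
  // three passes over the supernodes:
  //   (1) L(i,k) <- L(i,k) * L(k,k)^{-1}      (triangular solves below the diagonal),
  //   (2) U(k,i) <- L(i,k)                    (cross-diagonal exchange; the L blocks
  //       are sent across the diagonal and transposed into U),
  //   (3) L(k,k) <- [L(k,k) * U(k,k)]^{-1}    (inversion of the diagonal blocks).
  // A typical driver is expected to call, schematically (PMloc denotes a
  // PMatrix<T> instance; this is a sketch, not verbatim driver code):
  //   PMloc.PreSelInv();
  //   PMloc.SelInv();
  //   PMloc.GetDiagonal( diag );   // or PMloc.PMatrixToDistSparseMatrix( Ainv )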
3710  template<typename T>
3711  void PMatrix<T>::PreSelInv ( )
3712  {
3713 #ifndef _RELEASE_
3714  PushCallStack("PMatrix::PreSelInv");
3715 #endif
3716 
3717  Int numSuper = this->NumSuper();
3718 
3719 #ifndef _RELEASE_
3720  PushCallStack("L(i,k) <- L(i,k) * L(k,k)^{-1}");
3721 #endif
3722 #if ( _DEBUGlevel_ >= 1 )
3723  statusOFS << std::endl << "L(i,k) <- L(i,k) * L(k,k)^{-1}" << std::endl << std::endl;
3724 #endif
3725  for( Int ksup = 0; ksup < numSuper; ksup++ ){
3726  if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
3727  // Broadcast the diagonal L block
3728  NumMat<T> nzvalLDiag;
3729  std::vector<LBlock<T> >& Lcol = this->L( LBj( ksup, grid_ ) );
3730  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
3731  nzvalLDiag = Lcol[0].nzval;
3732  if( nzvalLDiag.m() != SuperSize(ksup, super_) ||
3733  nzvalLDiag.n() != SuperSize(ksup, super_) ){
3734 #ifdef USE_ABORT
3735  abort();
3736 #endif
3737  throw std::runtime_error( "The size of the diagonal block of L is wrong." );
3738  }
3739  } // Owns the diagonal block
3740  else
3741  {
3742  nzvalLDiag.Resize( SuperSize(ksup, super_), SuperSize(ksup, super_) );
3743  }
3744  MPI_Bcast( (void*)nzvalLDiag.Data(), nzvalLDiag.ByteSize(),
3745  MPI_BYTE, PROW( ksup, grid_ ), grid_->colComm );
3746 
3747  // Triangular solve
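  // Each off-diagonal block LB (blockIdx > ksup) is multiplied on the
  // right by L(ksup,ksup)^{-1}: Trsm with side 'R', lower triangular 'L',
  // no transpose 'N', and unit diagonal 'U'.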
3748  for( Int ib = 0; ib < Lcol.size(); ib++ ){
3749  LBlock<T> & LB = Lcol[ib];
3750  if( LB.blockIdx > ksup ){
3751 #if ( _DEBUGlevel_ >= 2 )
3752  // Check the correctness of the triangular solve for the first local column
3753  if( LBj( ksup, grid_ ) == 0 ){
3754  statusOFS << "Diag L(" << ksup << ", " << ksup << "): " << nzvalLDiag << std::endl;
3755  statusOFS << "Before solve L(" << LB.blockIdx << ", " << ksup << "): " << LB.nzval << std::endl;
3756  }
3757 #endif
3758  blas::Trsm( 'R', 'L', 'N', 'U', LB.numRow, LB.numCol, ONE<T>(),
3759  nzvalLDiag.Data(), LB.numCol, LB.nzval.Data(), LB.numRow );
3760 #if ( _DEBUGlevel_ >= 2 )
3761  // Check the correctness of the triangular solve for the first local column
3762  if( LBj( ksup, grid_ ) == 0 ){
3763  statusOFS << "After solve L(" << LB.blockIdx << ", " << ksup << "): " << LB.nzval << std::endl;
3764  }
3765 #endif
3766  }
3767  }
3768  } // if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) )
3769  } // for (ksup)
3770 
3771 
3772 #ifndef _RELEASE_
3773  PopCallStack();
3774 #endif
3775 
3776 
3777 #ifndef _RELEASE_
3778  PushCallStack("U(k,i) <- L(i,k)");
3779 #endif
3780 #if ( _DEBUGlevel_ >= 1 )
3781  statusOFS << std::endl << "U(k,i) <- L(i,k)" << std::endl << std::endl;
3782 #endif
3783 
3784  for( Int ksup = 0; ksup < numSuper; ksup++ ){
3785  Int ksupProcRow = PROW( ksup, grid_ );
3786  Int ksupProcCol = PCOL( ksup, grid_ );
3787 
3788  Int sendCount = CountSendToCrossDiagonal(ksup);
3789  Int recvCount = CountRecvFromCrossDiagonal(ksup);
3790 
3791  std::vector<MPI_Request > arrMpiReqsSend(sendCount, MPI_REQUEST_NULL );
3792  std::vector<MPI_Request > arrMpiReqsSizeSend(sendCount, MPI_REQUEST_NULL );
3793  std::vector<std::vector<char> > arrSstrLcolSend(sendCount);
3794  std::vector<Int > arrSstrLcolSizeSend(sendCount);
3795 
3796  std::vector<MPI_Request > arrMpiReqsRecv(recvCount, MPI_REQUEST_NULL );
3797  std::vector<MPI_Request > arrMpiReqsSizeRecv(recvCount, MPI_REQUEST_NULL );
3798  std::vector<std::vector<char> > arrSstrLcolRecv(recvCount);
3799  std::vector<Int > arrSstrLcolSizeRecv(recvCount);
3800 
3801 
3802 
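  // Cross-diagonal exchange for supernode ksup: each owner of column ksup
  // serializes the off-diagonal L blocks destined for every processor
  // column and sends them (size first, then content) to the process on
  // row PROW(ksup) that owns the corresponding part of the U row; the
  // receiver deserializes the blocks and transposes them into U below.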
3803  // Sender
3804  if( isSendToCrossDiagonal_(grid_->numProcCol,ksup) ){
3805 #if ( _DEBUGlevel_ >= 1 )
3806  statusOFS<<"["<<ksup<<"] P"<<MYPROC(grid_)<<" should send to "<<CountSendToCrossDiagonal(ksup)<<" processors"<<std::endl;
3807 #endif
3808 
3809  Int sendIdx = 0;
3810  for(Int dstCol = 0; dstCol<grid_->numProcCol; dstCol++){
3811  if(isSendToCrossDiagonal_(dstCol,ksup) ){
3812  Int dst = PNUM(PROW(ksup,grid_),dstCol,grid_);
3813  if(MYPROC(grid_)!= dst){
3814  // Pack L data
3815  std::stringstream sstm;
3816  std::vector<char> & sstrLcolSend = arrSstrLcolSend[sendIdx];
3817  Int & sstrSize = arrSstrLcolSizeSend[sendIdx];
3818  MPI_Request & mpiReqSend = arrMpiReqsSend[sendIdx];
3819  MPI_Request & mpiReqSizeSend = arrMpiReqsSizeSend[sendIdx];
3820 
3821  std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
3822  std::vector<LBlock<T> >& Lcol = this->L( LBj(ksup, grid_) );
3823  // All blocks except for the diagonal block are to be sent right
3824  // TODO: not entirely true -- this is a scatter operation. Can the destination be determined in advance?
3825 
3826  Int count = 0;
3827  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
3828  for( Int ib = 1; ib < Lcol.size(); ib++ ){
3829  if( Lcol[ib].blockIdx > ksup && (Lcol[ib].blockIdx % grid_->numProcCol) == dstCol ){
3830  count++;
3831  }
3832  }
3833  }
3834  else{
3835 
3836  for( Int ib = 0; ib < Lcol.size(); ib++ ){
3837  if( Lcol[ib].blockIdx > ksup && (Lcol[ib].blockIdx % grid_->numProcCol) == dstCol ){
3838  count++;
3839  }
3840  }
3841  }
3842 
3843  serialize( (Int)count, sstm, NO_MASK );
3844 
3845  for( Int ib = 0; ib < Lcol.size(); ib++ ){
3846  if( Lcol[ib].blockIdx > ksup && (Lcol[ib].blockIdx % grid_->numProcCol) == dstCol ){
3847 #if ( _DEBUGlevel_ >= 1 )
3848  statusOFS<<"["<<ksup<<"] SEND contains "<<Lcol[ib].blockIdx<< " which corresponds to "<<GBj(ib,grid_)<<std::endl;
3849 #endif
3850  serialize( Lcol[ib], sstm, mask );
3851  }
3852  }
3853 
3854  sstrLcolSend.resize( Size(sstm) );
3855  sstm.read( &sstrLcolSend[0], sstrLcolSend.size() );
3856  sstrSize = sstrLcolSend.size();
3857 
3858 
3859 
3860  // Send/Recv is possible here due to the one-to-one correspondence
3861  // in the case of a square processor grid
3862 
3863 #if ( _DEBUGlevel_ >= 1 )
3864  statusOFS<<"["<<ksup<<"] P"<<MYPROC(grid_)<<" ("<<MYROW(grid_)<<","<<MYCOL(grid_)<<") ---> LBj("<<ksup<<")="<<LBj(ksup,grid_)<<" ---> P"<<dst<<" ("<<ksupProcRow<<","<<dstCol<<")"<<std::endl;
3865 #endif
3866  MPI_Isend( &sstrSize, sizeof(sstrSize), MPI_BYTE, dst, SELINV_TAG_L_SIZE_CD, grid_->comm, &mpiReqSizeSend );
3867  MPI_Isend( (void*)&sstrLcolSend[0], sstrSize, MPI_BYTE, dst, SELINV_TAG_L_CONTENT_CD, grid_->comm, &mpiReqSend );
3868 
3869 
3870  PROFILE_COMM(MYPROC(this->grid_),dst,SELINV_TAG_L_SIZE_CD,sizeof(sstrSize));
3871  PROFILE_COMM(MYPROC(this->grid_),dst,SELINV_TAG_L_CONTENT_CD,sstrSize);
3872 
3873  //mpi::Send( sstm, dst,SELINV_TAG_L_SIZE_CD, SELINV_TAG_L_CONTENT_CD, grid_->comm );
3874 
3875  sendIdx++;
3876  } // if I am a sender
3877  }
3878  }
3879  }
3880 
3881 
3882 
3883 
3884 
3885  // Receiver
3886  if( isRecvFromCrossDiagonal_(grid_->numProcRow,ksup) ){
3887 
3888 
3889 #if ( _DEBUGlevel_ >= 1 )
3890  statusOFS<<"["<<ksup<<"] P"<<MYPROC(grid_)<<" should receive from "<<CountRecvFromCrossDiagonal(ksup)<<" processors"<<std::endl;
3891 #endif
3892 
3893 
3894  std::vector<UBlock<T> >& Urow = this->U( LBi( ksup, grid_ ) );
3895  std::vector<bool> isBlockFound(Urow.size(),false);
3896 
3897 
3898  Int recvIdx = 0;
3899  //receive size first
3900  for(Int srcRow = 0; srcRow<grid_->numProcRow; srcRow++){
3901  if(isRecvFromCrossDiagonal_(srcRow,ksup) ){
3902  std::vector<LBlock<T> > LcolRecv;
3903  Int src = PNUM(srcRow,PCOL(ksup,grid_),grid_);
3904  if(MYPROC(grid_)!= src){
3905  MPI_Request & mpiReqSizeRecv = arrMpiReqsSizeRecv[recvIdx];
3906  Int & sstrSize = arrSstrLcolSizeRecv[recvIdx];
3907 
3908  MPI_Irecv( &sstrSize, sizeof(sstrSize), MPI_BYTE, src, SELINV_TAG_L_SIZE_CD, grid_->comm, &mpiReqSizeRecv );
3909 
3910  recvIdx++;
3911  }
3912  }
3913  }
3914 
3915  mpi::Waitall(arrMpiReqsSizeRecv);
3916 
3917 
3918 
3919  //receive content
3920  recvIdx = 0;
3921  for(Int srcRow = 0; srcRow<grid_->numProcRow; srcRow++){
3922  if(isRecvFromCrossDiagonal_(srcRow,ksup) ){
3923  std::vector<LBlock<T> > LcolRecv;
3924  Int src = PNUM(srcRow,PCOL(ksup,grid_),grid_);
3925  if(MYPROC(grid_)!= src){
3926  MPI_Request & mpiReqRecv = arrMpiReqsRecv[recvIdx];
3927  Int & sstrSize = arrSstrLcolSizeRecv[recvIdx];
3928  std::vector<char> & sstrLcolRecv = arrSstrLcolRecv[recvIdx];
3929  sstrLcolRecv.resize(sstrSize);
3930 
3931  MPI_Irecv( &sstrLcolRecv[0], sstrSize, MPI_BYTE, src, SELINV_TAG_L_CONTENT_CD, grid_->comm, &mpiReqRecv );
3932 
3933  recvIdx++;
3934  }
3935  }
3936  }
3937 
3938  mpi::Waitall(arrMpiReqsRecv);
3939 
3940 
3941 
3942  //Process the content
3943  recvIdx = 0;
3944  for(Int srcRow = 0; srcRow<grid_->numProcRow; srcRow++){
3945  if(isRecvFromCrossDiagonal_(srcRow,ksup) ){
3946  std::vector<LBlock<T> > LcolRecv;
3947  Int src = PNUM(srcRow,PCOL(ksup,grid_),grid_);
3948  if(MYPROC(grid_)!= src){
3949 
3950  Int & sstrSize = arrSstrLcolSizeRecv[recvIdx];
3951  std::vector<char> & sstrLcolRecv = arrSstrLcolRecv[recvIdx];
3952  std::stringstream sstm;
3953 
3954 #if ( _DEBUGlevel_ >= 1 )
3955  statusOFS<<"["<<ksup<<"] P"<<MYPROC(grid_)<<" ("<<MYROW(grid_)<<","<<MYCOL(grid_)<<") <--- LBj("<<ksup<<") <--- P"<<src<<" ("<<srcRow<<","<<ksupProcCol<<")"<<std::endl;
3956 #endif
3957 
3958 
3959  sstm.write( &sstrLcolRecv[0], sstrSize );
3960 
3961  // Unpack L data.
3962  Int numLBlock;
3963  std::vector<Int> mask( LBlockMask::TOTAL_NUMBER, 1 );
3964  deserialize( numLBlock, sstm, NO_MASK );
3965  LcolRecv.resize(numLBlock);
3966  for( Int ib = 0; ib < numLBlock; ib++ ){
3967  deserialize( LcolRecv[ib], sstm, mask );
3968 #if ( _DEBUGlevel_ >= 1 )
3969  statusOFS<<"["<<ksup<<"] RECV contains "<<LcolRecv[ib].blockIdx<< " which corresponds to "<< ib * grid_->numProcRow + srcRow; // <<std::endl;
3970  // statusOFS<<" L is on row "<< srcRow <<" whereas U is on col "<<((ib * grid_->numProcRow + srcRow)/grid_->numProcCol)%grid_->numProcCol <<std::endl;
3971  statusOFS<<" L is on row "<< srcRow <<" whereas U is on col "<< LcolRecv[ib].blockIdx % grid_->numProcCol <<std::endl;
3972 #endif
3973  }
3974 
3975 
3976  recvIdx++;
3977 
3978  } // sender is not the same as receiver
3979  else{
3980  // L is obtained locally, just make a copy. Do not include the diagonal block
3981  std::vector<LBlock<T> >& Lcol = this->L( LBj( ksup, grid_ ) );
3982  if( MYROW( grid_ ) != PROW( ksup, grid_ ) ){
3983  LcolRecv.resize( Lcol.size() );
3984  for( Int ib = 0; ib < Lcol.size(); ib++ ){
3985  LcolRecv[ib] = Lcol[ib];
3986  }
3987  }
3988  else{
3989  LcolRecv.resize( Lcol.size() - 1 );
3990  for( Int ib = 0; ib < Lcol.size() - 1; ib++ ){
3991  LcolRecv[ib] = Lcol[ib+1];
3992  }
3993  }
3994  } // sender is the same as receiver
3995 
3996  // Update U
3997  // Make sure that the size of L and the corresponding U blocks match.
3998  for( Int ib = 0; ib < LcolRecv.size(); ib++ ){
3999  LBlock<T> & LB = LcolRecv[ib];
4000  if( LB.blockIdx <= ksup ){
4001 #ifdef USE_ABORT
4002  abort();
4003 #endif
4004  throw std::logic_error( "LcolRecv contains the wrong blocks." );
4005  }
4006  for( Int jb = 0; jb < Urow.size(); jb++ ){
4007  UBlock<T> & UB = Urow[jb];
4008  if( LB.blockIdx == UB.blockIdx ){
4009  // Compare size
4010  if( LB.numRow != UB.numCol || LB.numCol != UB.numRow ){
4011  std::ostringstream msg;
4012  msg << "LB(" << LB.blockIdx << ", " << ksup << ") and UB("
4013  << ksup << ", " << UB.blockIdx << ") do not share the same size." << std::endl
4014  << "LB: " << LB.numRow << " x " << LB.numCol << std::endl
4015  << "UB: " << UB.numRow << " x " << UB.numCol << std::endl;
4016 #ifdef USE_ABORT
4017  abort();
4018 #endif
4019  throw std::runtime_error( msg.str().c_str() );
4020  }
4021 
4022  // Note that the order of the column indices of the U
4023  // block may not follow the order of the row indices of L,
4024  // so overwrite the column index information in U.
4025  UB.cols = LB.rows;
4026  Transpose( LB.nzval, UB.nzval );
4027 
4028 #if ( _DEBUGlevel_ >= 1 )
4029  statusOFS<<"["<<ksup<<"] USING "<<LB.blockIdx<< std::endl;
4030 #endif
4031  isBlockFound[jb] = true;
4032  break;
4033  } // if( LB.blockIdx == UB.blockIdx )
4034  } // for (jb)
4035  } // for (ib)
4036  }
4037  }
4038 
4039  for( Int jb = 0; jb < Urow.size(); jb++ ){
4040  UBlock<T> & UB = Urow[jb];
4041  if( !isBlockFound[jb] ){
4042 #ifdef USE_ABORT
4043  abort();
4044 #endif
4045  throw std::logic_error( "UBlock cannot find its update. Something is seriously wrong." );
4046  }
4047  }
4048 
4049 
4050 
4051  } // if I am a receiver
4052 
4053 
4054  //Wait until every receiver is done
4055  mpi::Waitall(arrMpiReqsSizeSend);
4056  mpi::Waitall(arrMpiReqsSend);
4057 
4058 
4059  } // for (ksup)
4060 
4061 #ifndef _RELEASE_
4062  PopCallStack();
4063 #endif
4064 
4065 #ifndef _RELEASE_
4066  PushCallStack("L(i,i) <- [L(k,k) * U(k,k)]^{-1} ");
4067 #endif
4068 #if ( _DEBUGlevel_ >= 1 )
4069  statusOFS << std::endl << "L(i,i) <- [L(k,k) * U(k,k)]^{-1}" << std::endl << std::endl;
4070 #endif
4071 
4072  for( Int ksup = 0; ksup < numSuper; ksup++ ){
4073  if( MYROW( grid_ ) == PROW( ksup, grid_ ) &&
4074  MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
4075  IntNumVec ipiv( SuperSize( ksup, super_ ) );
4076  // Note that the pivoting vector ipiv follows the FORTRAN
4077  // convention, i.e. the indices are 1-based.
4078  for(Int i = 0; i < SuperSize( ksup, super_ ); i++){
4079  ipiv[i] = i + 1;
4080  }
4081  LBlock<T> & LB = (this->L( LBj( ksup, grid_ ) ))[0];
4082 #if ( _DEBUGlevel_ >= 2 )
4083  // Check the correctness of the matrix inversion for the first local column
4084  statusOFS << "Factorized A (" << ksup << ", " << ksup << "): " << LB.nzval << std::endl;
4085 #endif
4086  lapack::Getri( SuperSize( ksup, super_ ), LB.nzval.Data(),
4087  SuperSize( ksup, super_ ), ipiv.Data() );
4088 
4089  // Symmetrize the diagonal block
4090  Symmetrize( LB.nzval );
4091 
4092 #if ( _DEBUGlevel_ >= 2 )
4093  // Check the correctness of the matrix inversion for the first local column
4094  statusOFS << "Inverted A (" << ksup << ", " << ksup << "): " << LB.nzval << std::endl;
4095 #endif
4096  } // if I need to inverse the diagonal block
4097  } // for (ksup)
4098 
4099 
4100 #ifndef _RELEASE_
4101  PopCallStack();
4102 #endif
4103 
4104 
4105 
4106 #ifndef _RELEASE_
4107  PopCallStack();
4108 #endif
4109 
4110  return ;
4111  } // ----- end of method PMatrix::PreSelInv -----
4112 
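  // GetDiagonal extracts the diagonal of the matrix currently stored in
  // PMatrix (typically the selected inverse after SelInv), returned in
  // the original ordering: for each original index the owning process
  // applies the column and row permutations, looks the entry up in its
  // local L or U blocks, and an Allreduce sums the local contributions
  // into diag on every process.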
4113  template<typename T>
4114  void PMatrix<T>::GetDiagonal ( NumVec<T>& diag )
4115  {
4116 #ifndef _RELEASE_
4117  PushCallStack("PMatrix::GetDiagonal");
4118 #endif
4119  Int numSuper = this->NumSuper();
4120 
4121  Int numCol = this->NumCol();
4122  const IntNumVec& perm = super_->perm;
4123  const IntNumVec& permInv = super_->permInv;
4124 
4125  const IntNumVec * pPerm_r;
4126  const IntNumVec * pPermInv_r;
4127 
4128  pPerm_r = &super_->perm_r;
4129  pPermInv_r = &super_->permInv_r;
4130 
4131  const IntNumVec& perm_r = *pPerm_r;
4132  const IntNumVec& permInv_r = *pPermInv_r;
4133 
4134 
4135  NumVec<T> diagLocal( numCol );
4136  SetValue( diagLocal, ZERO<T>() );
4137 
4138  diag.Resize( numCol );
4139  SetValue( diag, ZERO<T>() );
4140 
4141  for( Int orow = 0; orow < numCol; orow++){
4142  //row index in the permuted order
4143  Int row = perm[ orow ];
4144  //col index in the permuted order
4145  Int col = perm[ perm_r[ orow] ];
4146 
4147  Int blockColIdx = BlockIdx( col, super_ );
4148  Int blockRowIdx = BlockIdx( row, super_ );
4149 
4150 
4151  // I own the column
4152  if( MYROW( grid_ ) == PROW( blockRowIdx, grid_ ) &&
4153  MYCOL( grid_ ) == PCOL( blockColIdx, grid_ ) ){
4154  // Search for the nzval
4155  bool isFound = false;
4156 
4157  if( blockColIdx <= blockRowIdx ){
4158  // Data on the L side
4159 
4160  std::vector<LBlock<T> >& Lcol = this->L( LBj( blockColIdx, grid_ ) );
4161 
4162  for( Int ib = 0; ib < Lcol.size(); ib++ ){
4163 #if ( _DEBUGlevel_ >= 1 )
4164  statusOFS << "blockRowIdx = " << blockRowIdx << ", Lcol[ib].blockIdx = " << Lcol[ib].blockIdx << ", blockColIdx = " << blockColIdx << std::endl;
4165 #endif
4166  if( Lcol[ib].blockIdx == blockRowIdx ){
4167  IntNumVec& rows = Lcol[ib].rows;
4168  for( int iloc = 0; iloc < Lcol[ib].numRow; iloc++ ){
4169  if( rows[iloc] == row ){
4170  Int jloc = col - FirstBlockCol( blockColIdx, super_ );
4171 
4172  diagLocal[ orow ] = Lcol[ib].nzval( iloc, jloc );
4173 
4174 
4175  isFound = true;
4176  break;
4177  } // found the corresponding row
4178  }
4179  }
4180  if( isFound == true ) break;
4181  } // for (ib)
4182  }
4183  else{
4184  // Data on the U side
4185 
4186  std::vector<UBlock<T> >& Urow = this->U( LBi( blockRowIdx, grid_ ) );
4187 
4188  for( Int jb = 0; jb < Urow.size(); jb++ ){
4189  if( Urow[jb].blockIdx == blockColIdx ){
4190  IntNumVec& cols = Urow[jb].cols;
4191  for( int jloc = 0; jloc < Urow[jb].numCol; jloc++ ){
4192  if( cols[jloc] == col ){
4193  Int iloc = row - FirstBlockRow( blockRowIdx, super_ );
4194 
4195  diagLocal[ orow ] = Urow[jb].nzval( iloc, jloc );
4196 
4197  isFound = true;
4198  break;
4199  } // found the corresponding col
4200  }
4201  }
4202  if( isFound == true ) break;
4203  } // for (jb)
4204  } // if( blockColIdx <= blockRowIdx )
4205 
4206  // Did not find the corresponding row, set the value to zero.
4207  if( isFound == false ){
4208  statusOFS << "In the permuted order, (" << row << ", " << col <<
4209  ") is not found in PMatrix." << std::endl;
4210  diagLocal[orow] = ZERO<T>();
4211  }
4212  }
4213  }
4214 
4215  // //TODO This doesn't work with row permutation
4216  // for( Int ksup = 0; ksup < numSuper; ksup++ ){
4217  // Int numRows = SuperSize(ksup, this->super_);
4218  //
4219  // // I own the diagonal block
4220  // if( MYROW( grid_ ) == PROW( ksup, grid_ ) &&
4221  // MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
4222  // LBlock<T> & LB = this->L( LBj( ksup, grid_ ) )[0];
4223  // for( Int i = 0; i < LB.numRow; i++ ){
4224  // diagLocal( permInv[ LB.rows(i) ] ) = LB.nzval( i, perm[permInv_r[i]] );
4225  // }
4226  // }
4227  // }
4228 
4229  // All processors own diag
4230  mpi::Allreduce( diagLocal.Data(), diag.Data(), numCol, MPI_SUM, grid_->comm );
4231 
4232 #ifndef _RELEASE_
4233  PopCallStack();
4234 #endif
4235 
4236  return ;
4237  } // ----- end of method PMatrix::GetDiagonal -----
4238 
4239 
4240 
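  // Convert the 2D block-cyclic PMatrix into a DistSparseMatrix A
  // distributed by blocks of columns: each process counts the entries it
  // owns per destination rank, exchanges the counts with an Alltoall,
  // exchanges the (row, col, value) triplets with Alltoallv, and then
  // sorts the rows within each local column to build the 1-based
  // compressed column storage.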
4241  template<typename T>
4242  void PMatrix<T>::PMatrixToDistSparseMatrix ( DistSparseMatrix<T>& A )
4243  {
4244 #ifndef _RELEASE_
4245  PushCallStack("PMatrix::PMatrixToDistSparseMatrix");
4246 #endif
4247 #if ( _DEBUGlevel_ >= 1 )
4248  statusOFS << std::endl << "Converting PMatrix to DistSparseMatrix." << std::endl;
4249 #endif
4250  Int mpirank = grid_->mpirank;
4251  Int mpisize = grid_->mpisize;
4252 
4253  std::vector<Int> rowSend( mpisize );
4254  std::vector<Int> colSend( mpisize );
4255  std::vector<T> valSend( mpisize );
4256  std::vector<Int> sizeSend( mpisize, 0 );
4257  std::vector<Int> displsSend( mpisize, 0 );
4258 
4259  std::vector<Int> rowRecv( mpisize );
4260  std::vector<Int> colRecv( mpisize );
4261  std::vector<T> valRecv( mpisize );
4262  std::vector<Int> sizeRecv( mpisize, 0 );
4263  std::vector<Int> displsRecv( mpisize, 0 );
4264 
4265  Int numSuper = this->NumSuper();
4266  const IntNumVec& perm = super_->perm;
4267  const IntNumVec& permInv = super_->permInv;
4268 
4269  const IntNumVec * pPerm_r;
4270  const IntNumVec * pPermInv_r;
4271 
4272  pPerm_r = &super_->perm_r;
4273  pPermInv_r = &super_->permInv_r;
4274 
4275  const IntNumVec& perm_r = *pPerm_r;
4276  const IntNumVec& permInv_r = *pPermInv_r;
4277 
4278  // The number of local columns in DistSparseMatrix format for the
4279  // processor with rank 0. This number is the same for processors
4280  // with rank ranging from 0 to mpisize - 2, and may or may not differ
4281  // from the number of local columns for processor with rank mpisize -
4282  // 1.
4283  Int numColFirst = this->NumCol() / mpisize;
4284 
4285  // Count the size first.
4286  for( Int ksup = 0; ksup < numSuper; ksup++ ){
4287  // L blocks
4288  if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
4289  std::vector<LBlock<T> >& Lcol = this->L( LBj( ksup, grid_ ) );
4290  for( Int ib = 0; ib < Lcol.size(); ib++ ){
4291  for( Int j = 0; j < Lcol[ib].numCol; j++ ){
4292  Int jcol = permInv[ permInv_r[ j + FirstBlockCol( ksup, super_ ) ] ];
4293  Int dest = std::min( jcol / numColFirst, mpisize - 1 );
4294  sizeSend[dest] += Lcol[ib].numRow;
4295  }
4296  }
4297  } // I own the column of ksup
4298 
4299  // U blocks
4300  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
4301  std::vector<UBlock<T> >& Urow = this->U( LBi( ksup, grid_ ) );
4302  for( Int jb = 0; jb < Urow.size(); jb++ ){
4303  IntNumVec& cols = Urow[jb].cols;
4304  for( Int j = 0; j < cols.m(); j++ ){
4305  Int jcol = permInv[ permInv_r[ cols[j] ] ];
4306  Int dest = std::min( jcol / numColFirst, mpisize - 1 );
4307  sizeSend[dest] += Urow[jb].numRow;
4308  }
4309  }
4310  } // I own the row of ksup
4311  } // for (ksup)
4312 
4313  // All-to-all exchange of size information
4314  MPI_Alltoall(
4315  &sizeSend[0], 1, MPI_INT,
4316  &sizeRecv[0], 1, MPI_INT, grid_->comm );
4317 
4318 
4319 
4320  // Reserve the space
4321  for( Int ip = 0; ip < mpisize; ip++ ){
4322  if( ip == 0 ){
4323  displsSend[ip] = 0;
4324  }
4325  else{
4326  displsSend[ip] = displsSend[ip-1] + sizeSend[ip-1];
4327  }
4328 
4329  if( ip == 0 ){
4330  displsRecv[ip] = 0;
4331  }
4332  else{
4333  displsRecv[ip] = displsRecv[ip-1] + sizeRecv[ip-1];
4334  }
4335  }
4336  Int sizeSendTotal = displsSend[mpisize-1] + sizeSend[mpisize-1];
4337  Int sizeRecvTotal = displsRecv[mpisize-1] + sizeRecv[mpisize-1];
4338 
4339  rowSend.resize( sizeSendTotal );
4340  colSend.resize( sizeSendTotal );
4341  valSend.resize( sizeSendTotal );
4342 
4343  rowRecv.resize( sizeRecvTotal );
4344  colRecv.resize( sizeRecvTotal );
4345  valRecv.resize( sizeRecvTotal );
4346 
4347 #if ( _DEBUGlevel_ >= 1 )
4348  statusOFS << "displsSend = " << displsSend << std::endl;
4349  statusOFS << "displsRecv = " << displsRecv << std::endl;
4350 #endif
4351 
4352  // Put (row, col, val) to the sending buffer
4353  std::vector<Int> cntSize( mpisize, 0 );
4354 
4355  for( Int ksup = 0; ksup < numSuper; ksup++ ){
4356  // L blocks
4357  if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
4358  std::vector<LBlock<T> >& Lcol = this->L( LBj( ksup, grid_ ) );
4359  for( Int ib = 0; ib < Lcol.size(); ib++ ){
4360  IntNumVec& rows = Lcol[ib].rows;
4361  NumMat<T>& nzval = Lcol[ib].nzval;
4362  for( Int j = 0; j < Lcol[ib].numCol; j++ ){
4363  Int jcol = permInv[ permInv_r[ j + FirstBlockCol( ksup, super_ ) ] ];
4364  Int dest = std::min( jcol / numColFirst, mpisize - 1 );
4365  for( Int i = 0; i < rows.m(); i++ ){
4366  rowSend[displsSend[dest] + cntSize[dest]] = permInv[ rows[i] ];
4367  colSend[displsSend[dest] + cntSize[dest]] = jcol;
4368  valSend[displsSend[dest] + cntSize[dest]] = nzval( i, j );
4369  cntSize[dest]++;
4370  }
4371  }
4372  }
4373  } // I own the column of ksup
4374 
4375  // U blocks
4376  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
4377  std::vector<UBlock<T> >& Urow = this->U( LBi( ksup, grid_ ) );
4378  for( Int jb = 0; jb < Urow.size(); jb++ ){
4379  IntNumVec& cols = Urow[jb].cols;
4380  NumMat<T>& nzval = Urow[jb].nzval;
4381  for( Int j = 0; j < cols.m(); j++ ){
4382  Int jcol = permInv[ permInv_r[ cols[j] ] ];
4383  Int dest = std::min( jcol / numColFirst, mpisize - 1 );
4384  for( Int i = 0; i < Urow[jb].numRow; i++ ){
4385  rowSend[displsSend[dest] + cntSize[dest]] = permInv[ i + FirstBlockCol( ksup, super_ ) ];
4386  colSend[displsSend[dest] + cntSize[dest]] = jcol;
4387  valSend[displsSend[dest] + cntSize[dest]] = nzval( i, j );
4388  cntSize[dest]++;
4389  }
4390  }
4391  }
4392  } // I own the row of ksup
4393  }
4394 
4395  // Check sizes match
4396  for( Int ip = 0; ip < mpisize; ip++ ){
4397  if( cntSize[ip] != sizeSend[ip] )
4398  {
4399 #ifdef USE_ABORT
4400  abort();
4401 #endif
4402  throw std::runtime_error( "Sizes of the sending information do not match." );
4403  }
4404  }
4405 
4406 
4407  // Alltoallv to exchange information
4408  mpi::Alltoallv(
4409  &rowSend[0], &sizeSend[0], &displsSend[0],
4410  &rowRecv[0], &sizeRecv[0], &displsRecv[0],
4411  grid_->comm );
4412  mpi::Alltoallv(
4413  &colSend[0], &sizeSend[0], &displsSend[0],
4414  &colRecv[0], &sizeRecv[0], &displsRecv[0],
4415  grid_->comm );
4416  mpi::Alltoallv(
4417  &valSend[0], &sizeSend[0], &displsSend[0],
4418  &valRecv[0], &sizeRecv[0], &displsRecv[0],
4419  grid_->comm );
4420 
4421 #if ( _DEBUGlevel_ >= 1 )
4422  statusOFS << "Alltoallv communication finished." << std::endl;
4423 #endif
4424 
4425  //#if ( _DEBUGlevel_ >= 1 )
4426  // for( Int ip = 0; ip < mpisize; ip++ ){
4427  // statusOFS << "rowSend[" << ip << "] = " << rowSend[ip] << std::endl;
4428  // statusOFS << "rowRecv[" << ip << "] = " << rowRecv[ip] << std::endl;
4429  // statusOFS << "colSend[" << ip << "] = " << colSend[ip] << std::endl;
4430  // statusOFS << "colRecv[" << ip << "] = " << colRecv[ip] << std::endl;
4431  // statusOFS << "valSend[" << ip << "] = " << valSend[ip] << std::endl;
4432  // statusOFS << "valRecv[" << ip << "] = " << valRecv[ip] << std::endl;
4433  // }
4434  //#endif
4435 
4436  // Organize the received message.
4437  Int firstCol = mpirank * numColFirst;
4438  Int numColLocal;
4439  if( mpirank == mpisize-1 )
4440  numColLocal = this->NumCol() - numColFirst * (mpisize-1);
4441  else
4442  numColLocal = numColFirst;
4443 
4444  std::vector<std::vector<Int> > rows( numColLocal );
4445  std::vector<std::vector<T> > vals( numColLocal );
4446 
4447  for( Int ip = 0; ip < mpisize; ip++ ){
4448  Int* rowRecvCur = &rowRecv[displsRecv[ip]];
4449  Int* colRecvCur = &colRecv[displsRecv[ip]];
4450  T* valRecvCur = &valRecv[displsRecv[ip]];
4451  for( Int i = 0; i < sizeRecv[ip]; i++ ){
4452  rows[colRecvCur[i]-firstCol].push_back( rowRecvCur[i] );
4453  vals[colRecvCur[i]-firstCol].push_back( valRecvCur[i] );
4454  } // for (i)
4455  } // for (ip)
4456 
4457  // Sort the rows
4458  std::vector<std::vector<Int> > sortIndex( numColLocal );
4459  for( Int j = 0; j < numColLocal; j++ ){
4460  sortIndex[j].resize( rows[j].size() );
4461  for( Int i = 0; i < sortIndex[j].size(); i++ )
4462  sortIndex[j][i] = i;
4463  std::sort( sortIndex[j].begin(), sortIndex[j].end(),
4464  IndexComp<std::vector<Int>& > ( rows[j] ) );
4465  } // for (j)
4466 
4467  // Form DistSparseMatrix according to the received message
4468  // NOTE: for indices, DistSparseMatrix follows the FORTRAN
4469  // convention (1 based) while PMatrix follows the C convention (0
4470  // based)
4471  A.size = this->NumCol();
4472  A.nnzLocal = 0;
4473  A.colptrLocal.Resize( numColLocal + 1 );
4474  // Note that the +1 is needed since the index follows the FORTRAN convention
4475  A.colptrLocal(0) = 1;
4476  for( Int j = 0; j < numColLocal; j++ ){
4477  A.nnzLocal += rows[j].size();
4478  A.colptrLocal(j+1) = A.colptrLocal(j) + rows[j].size();
4479  }
4480 
4481  A.comm = grid_->comm;
4482 
4483 #if ( _DEBUGlevel_ >= 1 )
4484  statusOFS << "nnzLocal = " << A.nnzLocal << std::endl;
4485  statusOFS << "nnz = " << A.Nnz() << std::endl;
4486 #endif
4487 
4488 
4489  A.rowindLocal.Resize( A.nnzLocal );
4490  A.nzvalLocal.Resize( A.nnzLocal );
4491 
4492  Int* rowPtr = A.rowindLocal.Data();
4493  T* nzvalPtr = A.nzvalLocal.Data();
4494  for( Int j = 0; j < numColLocal; j++ ){
4495  std::vector<Int>& rowsCur = rows[j];
4496  std::vector<Int>& sortIndexCur = sortIndex[j];
4497  std::vector<T>& valsCur = vals[j];
4498  for( Int i = 0; i < rows[j].size(); i++ ){
4499  // Note that the +1 is needed since the index follows the FORTRAN convention
4500  *(rowPtr++) = rowsCur[sortIndexCur[i]] + 1;
4501  *(nzvalPtr++) = valsCur[sortIndexCur[i]];
4502  }
4503  }
4504 
4505 #if ( _DEBUGlevel_ >= 1 )
4506  statusOFS << "A.colptrLocal[end] = " << A.colptrLocal(numColLocal) << std::endl;
4507  statusOFS << "A.rowindLocal.size() = " << A.rowindLocal.m() << std::endl;
4508  statusOFS << "A.rowindLocal[end] = " << A.rowindLocal(A.nnzLocal-1) << std::endl;
4509  statusOFS << "A.nzvalLocal[end] = " << A.nzvalLocal(A.nnzLocal-1) << std::endl;
4510 #endif
4511 
4512 
4513 #ifndef _RELEASE_
4514  PopCallStack();
4515 #endif
4516 
4517  return ;
4518  } // ----- end of method PMatrix::PMatrixToDistSparseMatrix -----
4519 
4520 
4521 
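  // Older conversion routine: same push-style exchange as above, except
  // that the output B reuses the sparsity pattern of the input matrix A.
  // Pattern entries that are not found among the received triplets are
  // set to zero.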
4522  template<typename T>
4523  void PMatrix<T>::PMatrixToDistSparseMatrix_OLD ( const DistSparseMatrix<T>& A, DistSparseMatrix<T>& B )
4524  {
4525 #ifndef _RELEASE_
4526  PushCallStack("PMatrix::PMatrixToDistSparseMatrix_OLD");
4527 #endif
4528 #if ( _DEBUGlevel_ >= 1 )
4529  statusOFS << std::endl << "Converting PMatrix to DistSparseMatrix (2nd format)." << std::endl;
4530 #endif
4531  Int mpirank = grid_->mpirank;
4532  Int mpisize = grid_->mpisize;
4533 
4534  std::vector<Int> rowSend( mpisize );
4535  std::vector<Int> colSend( mpisize );
4536  std::vector<T> valSend( mpisize );
4537  std::vector<Int> sizeSend( mpisize, 0 );
4538  std::vector<Int> displsSend( mpisize, 0 );
4539 
4540  std::vector<Int> rowRecv( mpisize );
4541  std::vector<Int> colRecv( mpisize );
4542  std::vector<T> valRecv( mpisize );
4543  std::vector<Int> sizeRecv( mpisize, 0 );
4544  std::vector<Int> displsRecv( mpisize, 0 );
4545 
4546  Int numSuper = this->NumSuper();
4547  const IntNumVec& permInv = super_->permInv;
4548 
4549 
4550 
4551  const IntNumVec * pPermInv_r;
4552 
4553  if(optionsLU_->RowPerm=="NOROWPERM"){
4554  pPermInv_r = &super_->permInv;
4555  }
4556  else{
4557  pPermInv_r = &super_->permInv_r;
4558  }
4559 
4560  const IntNumVec& permInv_r = *pPermInv_r;
4561 
4562 
4563 
4564 
4565 
4566 
4567  // The number of local columns in DistSparseMatrix format for the
4568  // processor with rank 0. This number is the same for processors
4569  // with rank ranging from 0 to mpisize - 2, and may or may not differ
4570  // from the number of local columns for processor with rank mpisize -
4571  // 1.
4572  Int numColFirst = this->NumCol() / mpisize;
4573 
4574  // Count the size first.
4575  for( Int ksup = 0; ksup < numSuper; ksup++ ){
4576  // L blocks
4577  if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
4578  std::vector<LBlock<T> >& Lcol = this->L( LBj( ksup, grid_ ) );
4579  for( Int ib = 0; ib < Lcol.size(); ib++ ){
4580  for( Int j = 0; j < Lcol[ib].numCol; j++ ){
4581  Int jcol = permInv( j + FirstBlockCol( ksup, super_ ) );
4582  Int dest = std::min( jcol / numColFirst, mpisize - 1 );
4583  sizeSend[dest] += Lcol[ib].numRow;
4584  }
4585  }
4586  } // I own the column of ksup
4587 
4588  // U blocks
4589  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
4590  std::vector<UBlock<T> >& Urow = this->U( LBi( ksup, grid_ ) );
4591  for( Int jb = 0; jb < Urow.size(); jb++ ){
4592  IntNumVec& cols = Urow[jb].cols;
4593  for( Int j = 0; j < cols.m(); j++ ){
4594  Int jcol = permInv( cols(j) );
4595  Int dest = std::min( jcol / numColFirst, mpisize - 1 );
4596  sizeSend[dest] += Urow[jb].numRow;
4597  }
4598  }
4599  } // I own the row of ksup
4600  } // for (ksup)
4601 
4602  // All-to-all exchange of size information
4603  MPI_Alltoall(
4604  &sizeSend[0], 1, MPI_INT,
4605  &sizeRecv[0], 1, MPI_INT, grid_->comm );
4606 
4607 
4608 
4609  // Reserve the space
4610  for( Int ip = 0; ip < mpisize; ip++ ){
4611  if( ip == 0 ){
4612  displsSend[ip] = 0;
4613  }
4614  else{
4615  displsSend[ip] = displsSend[ip-1] + sizeSend[ip-1];
4616  }
4617 
4618  if( ip == 0 ){
4619  displsRecv[ip] = 0;
4620  }
4621  else{
4622  displsRecv[ip] = displsRecv[ip-1] + sizeRecv[ip-1];
4623  }
4624  }
4625  Int sizeSendTotal = displsSend[mpisize-1] + sizeSend[mpisize-1];
4626  Int sizeRecvTotal = displsRecv[mpisize-1] + sizeRecv[mpisize-1];
4627 
4628  rowSend.resize( sizeSendTotal );
4629  colSend.resize( sizeSendTotal );
4630  valSend.resize( sizeSendTotal );
4631 
4632  rowRecv.resize( sizeRecvTotal );
4633  colRecv.resize( sizeRecvTotal );
4634  valRecv.resize( sizeRecvTotal );
4635 
4636 #if ( _DEBUGlevel_ >= 1 )
4637  statusOFS << "displsSend = " << displsSend << std::endl;
4638  statusOFS << "displsRecv = " << displsRecv << std::endl;
4639 #endif
4640 
4641  // Put (row, col, val) to the sending buffer
4642  std::vector<Int> cntSize( mpisize, 0 );
4643 
4644 
4645  for( Int ksup = 0; ksup < numSuper; ksup++ ){
4646  // L blocks
4647  if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
4648  std::vector<LBlock<T> >& Lcol = this->L( LBj( ksup, grid_ ) );
4649  for( Int ib = 0; ib < Lcol.size(); ib++ ){
4650  IntNumVec& rows = Lcol[ib].rows;
4651  NumMat<T>& nzval = Lcol[ib].nzval;
4652  for( Int j = 0; j < Lcol[ib].numCol; j++ ){
4653  Int jcol = permInv( j + FirstBlockCol( ksup, super_ ) );
4654  Int dest = std::min( jcol / numColFirst, mpisize - 1 );
4655  for( Int i = 0; i < rows.m(); i++ ){
4656  rowSend[displsSend[dest] + cntSize[dest]] = permInv_r( rows(i) );
4657  colSend[displsSend[dest] + cntSize[dest]] = jcol;
4658  valSend[displsSend[dest] + cntSize[dest]] = nzval( i, j );
4659  cntSize[dest]++;
4660  }
4661  }
4662  }
4663  } // I own the column of ksup
4664 
4665 
4666  // U blocks
4667  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
4668  std::vector<UBlock<T> >& Urow = this->U( LBi( ksup, grid_ ) );
4669  for( Int jb = 0; jb < Urow.size(); jb++ ){
4670  IntNumVec& cols = Urow[jb].cols;
4671  NumMat<T>& nzval = Urow[jb].nzval;
4672  for( Int j = 0; j < cols.m(); j++ ){
4673  Int jcol = permInv( cols(j) );
4674  Int dest = std::min( jcol / numColFirst, mpisize - 1 );
4675  for( Int i = 0; i < Urow[jb].numRow; i++ ){
4676  rowSend[displsSend[dest] + cntSize[dest]] =
4677  permInv_r( i + FirstBlockCol( ksup, super_ ) );
4678  colSend[displsSend[dest] + cntSize[dest]] = jcol;
4679  valSend[displsSend[dest] + cntSize[dest]] = nzval( i, j );
4680  cntSize[dest]++;
4681  }
4682  }
4683  }
4684  } // I own the row of ksup
4685  }
4686 
4687 
4688 
4689  // Check sizes match
4690  for( Int ip = 0; ip < mpisize; ip++ ){
4691  if( cntSize[ip] != sizeSend[ip] ){
4692 #ifdef USE_ABORT
4693  abort();
4694 #endif
4695  throw std::runtime_error( "Sizes of the sending information do not match." );
4696  }
4697  }
4698 
4699  // Alltoallv to exchange information
4700  mpi::Alltoallv(
4701  &rowSend[0], &sizeSend[0], &displsSend[0],
4702  &rowRecv[0], &sizeRecv[0], &displsRecv[0],
4703  grid_->comm );
4704  mpi::Alltoallv(
4705  &colSend[0], &sizeSend[0], &displsSend[0],
4706  &colRecv[0], &sizeRecv[0], &displsRecv[0],
4707  grid_->comm );
4708  mpi::Alltoallv(
4709  &valSend[0], &sizeSend[0], &displsSend[0],
4710  &valRecv[0], &sizeRecv[0], &displsRecv[0],
4711  grid_->comm );
4712 
4713 #if ( _DEBUGlevel_ >= 1 )
4714  statusOFS << "Alltoallv communication finished." << std::endl;
4715 #endif
4716 
4717  //#if ( _DEBUGlevel_ >= 1 )
4718  // for( Int ip = 0; ip < mpisize; ip++ ){
4719  // statusOFS << "rowSend[" << ip << "] = " << rowSend[ip] << std::endl;
4720  // statusOFS << "rowRecv[" << ip << "] = " << rowRecv[ip] << std::endl;
4721  // statusOFS << "colSend[" << ip << "] = " << colSend[ip] << std::endl;
4722  // statusOFS << "colRecv[" << ip << "] = " << colRecv[ip] << std::endl;
4723  // statusOFS << "valSend[" << ip << "] = " << valSend[ip] << std::endl;
4724  // statusOFS << "valRecv[" << ip << "] = " << valRecv[ip] << std::endl;
4725  // }
4726  //#endif
4727 
4728  // Organize the received message.
4729  Int firstCol = mpirank * numColFirst;
4730  Int numColLocal;
4731  if( mpirank == mpisize-1 )
4732  numColLocal = this->NumCol() - numColFirst * (mpisize-1);
4733  else
4734  numColLocal = numColFirst;
4735 
4736  std::vector<std::vector<Int> > rows( numColLocal );
4737  std::vector<std::vector<T> > vals( numColLocal );
4738 
4739  for( Int ip = 0; ip < mpisize; ip++ ){
4740  Int* rowRecvCur = &rowRecv[displsRecv[ip]];
4741  Int* colRecvCur = &colRecv[displsRecv[ip]];
4742  T* valRecvCur = &valRecv[displsRecv[ip]];
4743  for( Int i = 0; i < sizeRecv[ip]; i++ ){
4744  rows[colRecvCur[i]-firstCol].push_back( rowRecvCur[i] );
4745  vals[colRecvCur[i]-firstCol].push_back( valRecvCur[i] );
4746  } // for (i)
4747  } // for (ip)
4748 
4749  // Sort the rows
4750  std::vector<std::vector<Int> > sortIndex( numColLocal );
4751  for( Int j = 0; j < numColLocal; j++ ){
4752  sortIndex[j].resize( rows[j].size() );
4753  for( Int i = 0; i < sortIndex[j].size(); i++ )
4754  sortIndex[j][i] = i;
4755  std::sort( sortIndex[j].begin(), sortIndex[j].end(),
4756  IndexComp<std::vector<Int>& > ( rows[j] ) );
4757  } // for (j)
4758 
4759  // Form DistSparseMatrix according to the received message
4760  // NOTE: for indices, DistSparseMatrix follows the FORTRAN
4761  // convention (1 based) while PMatrix follows the C convention (0
4762  // based)
4763  if( A.size != this->NumCol() ){
4764 #ifdef USE_ABORT
4765  abort();
4766 #endif
4767  throw std::runtime_error( "The DistSparseMatrix providing the pattern has a different size from PMatrix." );
4768  }
4769  if( A.colptrLocal.m() != numColLocal + 1 ){
4770 #ifdef USE_ABORT
4771  abort();
4772 #endif
4773  throw std::runtime_error( "The DistSparseMatrix providing the pattern has a different number of local columns from PMatrix." );
4774  }
4775 
4776  B.size = A.size;
4777  B.nnz = A.nnz;
4778  B.nnzLocal = A.nnzLocal;
4779  B.colptrLocal = A.colptrLocal;
4780  B.rowindLocal = A.rowindLocal;
4781  B.nzvalLocal.Resize( B.nnzLocal );
4782  SetValue( B.nzvalLocal, ZERO<T>() );
4783  // Make sure that the communicators of A and B are the same.
4784  // FIXME Find a better way to compare the communicators
4785  // if( grid_->comm != A.comm ){
4786  // #ifdef USE_ABORT
4787  // abort();
4788  //#endif
4789  //throw std::runtime_error( "The DistSparseMatrix providing the pattern has a different communicator from PMatrix." );
4790  // }
4791  B.comm = grid_->comm;
4792 
4793  Int* rowPtr = B.rowindLocal.Data();
4794  T* nzvalPtr = B.nzvalLocal.Data();
4795  for( Int j = 0; j < numColLocal; j++ ){
4796  std::vector<Int>& rowsCur = rows[j];
4797  std::vector<Int>& sortIndexCur = sortIndex[j];
4798  std::vector<T>& valsCur = vals[j];
4799  std::vector<Int> rowsCurSorted( rowsCur.size() );
4800  // Note that the +1 is needed since the index follows the FORTRAN convention
4801  for( Int i = 0; i < rowsCurSorted.size(); i++ ){
4802  rowsCurSorted[i] = rowsCur[sortIndexCur[i]] + 1;
4803  }
4804 
4805  // Search and match the indices
4806  std::vector<Int>::iterator it;
4807  for( Int i = B.colptrLocal(j) - 1;
4808  i < B.colptrLocal(j+1) - 1; i++ ){
4809  it = std::lower_bound( rowsCurSorted.begin(), rowsCurSorted.end(),
4810  *(rowPtr++) );
4811  if( it == rowsCurSorted.end() ){
4812  // Did not find the row, set it to zero
4813  *(nzvalPtr++) = ZERO<T>();
4814  }
4815  else{
4816  // Found the row, set it according to the received value
4817  *(nzvalPtr++) = valsCur[ sortIndexCur[it-rowsCurSorted.begin()] ];
4818  }
4819  } // for (i)
4820  } // for (j)
4821 
4822 #if ( _DEBUGlevel_ >= 1 )
4823  statusOFS << "B.colptrLocal[end] = " << B.colptrLocal(numColLocal) << std::endl;
4824  statusOFS << "B.rowindLocal.size() = " << B.rowindLocal.m() << std::endl;
4825  statusOFS << "B.rowindLocal[end] = " << B.rowindLocal(B.nnzLocal-1) << std::endl;
4826  statusOFS << "B.nzvalLocal[end] = " << B.nzvalLocal(B.nnzLocal-1) << std::endl;
4827 #endif
4828 
4829 
4830 #ifndef _RELEASE_
4831  PopCallStack();
4832 #endif
4833 
4834  return ;
4835  } // ----- end of method PMatrix::PMatrixToDistSparseMatrix_OLD -----
4836 
4837 
4838  // A (possibly) more memory-efficient way of converting the PMatrix to a
4839  // DistSparseMatrix structure.
4840  //
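  // Instead of pushing every stored entry of PMatrix to its destination,
  // this version works pull-style: each process sends the (row, col)
  // index pairs of its local part of the pattern of A to the process of
  // the 2D grid that owns the corresponding block, the owner fills in the
  // values locally, and the values travel back through a final Alltoallv
  // in which the roles of "send" and "recv" are swapped.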
4841  template<typename T>
4842  void PMatrix<T>::PMatrixToDistSparseMatrix ( const DistSparseMatrix<T>& A, DistSparseMatrix<T>& B )
4843  {
4844 
4845 #ifndef _RELEASE_
4846  PushCallStack("PMatrix::PMatrixToDistSparseMatrix");
4847 #endif
4848 #if ( _DEBUGlevel_ >= 1 )
4849  statusOFS << std::endl << "Converting PMatrix to DistSparseMatrix (2nd format)." << std::endl;
4850 #endif
4851  Int mpirank = grid_->mpirank;
4852  Int mpisize = grid_->mpisize;
4853 
4854  std::vector<Int> rowSend( mpisize );
4855  std::vector<Int> colSend( mpisize );
4856  std::vector<T> valSend( mpisize );
4857  std::vector<Int> sizeSend( mpisize, 0 );
4858  std::vector<Int> displsSend( mpisize, 0 );
4859 
4860  std::vector<Int> rowRecv( mpisize );
4861  std::vector<Int> colRecv( mpisize );
4862  std::vector<T> valRecv( mpisize );
4863  std::vector<Int> sizeRecv( mpisize, 0 );
4864  std::vector<Int> displsRecv( mpisize, 0 );
4865 
4866  Int numSuper = this->NumSuper();
4867  const IntNumVec& perm = super_->perm;
4868  const IntNumVec& permInv = super_->permInv;
4869 
4870  const IntNumVec * pPerm_r;
4871  const IntNumVec * pPermInv_r;
4872 
4873  pPerm_r = &super_->perm_r;
4874  pPermInv_r = &super_->permInv_r;
4875 
4876  const IntNumVec& perm_r = *pPerm_r;
4877  const IntNumVec& permInv_r = *pPermInv_r;
4878 
4879 
4880  // Count the sizes from the A matrix first
4881  Int numColFirst = this->NumCol() / mpisize;
4882  Int firstCol = mpirank * numColFirst;
4883  Int numColLocal;
4884  if( mpirank == mpisize-1 )
4885  numColLocal = this->NumCol() - numColFirst * (mpisize-1);
4886  else
4887  numColLocal = numColFirst;
4888 
4889 
4890 
4891 
4892 
4893  Int* rowPtr = A.rowindLocal.Data();
4894  Int* colPtr = A.colptrLocal.Data();
4895 
4896  for( Int j = 0; j < numColLocal; j++ ){
4897  Int ocol = firstCol + j;
4898  Int col = perm[ perm_r[ ocol] ];
4899  Int blockColIdx = BlockIdx( col, super_ );
4900  Int procCol = PCOL( blockColIdx, grid_ );
4901  for( Int i = colPtr[j] - 1; i < colPtr[j+1] - 1; i++ ){
4902  Int orow = rowPtr[i]-1;
4903  Int row = perm[ orow ];
4904  Int blockRowIdx = BlockIdx( row, super_ );
4905  Int procRow = PROW( blockRowIdx, grid_ );
4906  Int dest = PNUM( procRow, procCol, grid_ );
4907 #if ( _DEBUGlevel_ >= 1 )
4908  statusOFS << "("<< orow<<", "<<ocol<<") == "<< "("<< row<<", "<<col<<")"<< std::endl;
4909  statusOFS << "BlockIdx = " << blockRowIdx << ", " <<blockColIdx << std::endl;
4910  statusOFS << procRow << ", " << procCol << ", "
4911  << dest << std::endl;
4912 #endif
4913  sizeSend[dest]++;
4914  } // for (i)
4915  } // for (j)
4916 
4917  // All-to-all exchange of size information
4918  MPI_Alltoall(
4919  &sizeSend[0], 1, MPI_INT,
4920  &sizeRecv[0], 1, MPI_INT, grid_->comm );
4921 
4922 #if ( _DEBUGlevel_ >= 1 )
4923  statusOFS << std::endl << "sizeSend: " << sizeSend << std::endl;
4924  statusOFS << std::endl << "sizeRecv: " << sizeRecv << std::endl;
4925 #endif
4926 
4927 
4928 
4929  // Reserve the space
4930  for( Int ip = 0; ip < mpisize; ip++ ){
4931  if( ip == 0 ){
4932  displsSend[ip] = 0;
4933  }
4934  else{
4935  displsSend[ip] = displsSend[ip-1] + sizeSend[ip-1];
4936  }
4937 
4938  if( ip == 0 ){
4939  displsRecv[ip] = 0;
4940  }
4941  else{
4942  displsRecv[ip] = displsRecv[ip-1] + sizeRecv[ip-1];
4943  }
4944  }
4945 
4946  Int sizeSendTotal = displsSend[mpisize-1] + sizeSend[mpisize-1];
4947  Int sizeRecvTotal = displsRecv[mpisize-1] + sizeRecv[mpisize-1];
4948 
4949  rowSend.resize( sizeSendTotal );
4950  colSend.resize( sizeSendTotal );
4951  valSend.resize( sizeSendTotal );
4952 
4953  rowRecv.resize( sizeRecvTotal );
4954  colRecv.resize( sizeRecvTotal );
4955  valRecv.resize( sizeRecvTotal );
4956 
4957 #if ( _DEBUGlevel_ >= 1 )
4958  statusOFS << "displsSend = " << displsSend << std::endl;
4959  statusOFS << "displsRecv = " << displsRecv << std::endl;
4960 #endif
4961 
4962  // Put (row, col) to the sending buffer
4963  std::vector<Int> cntSize( mpisize, 0 );
4964 
4965  rowPtr = A.rowindLocal.Data();
4966  colPtr = A.colptrLocal.Data();
4967 
4968  for( Int j = 0; j < numColLocal; j++ ){
4969 
4970  Int ocol = firstCol + j;
4971  Int col = perm[ perm_r[ ocol] ];
4972  Int blockColIdx = BlockIdx( col, super_ );
4973  Int procCol = PCOL( blockColIdx, grid_ );
4974  for( Int i = colPtr[j] - 1; i < colPtr[j+1] - 1; i++ ){
4975  Int orow = rowPtr[i]-1;
4976  Int row = perm[ orow ];
4977  Int blockRowIdx = BlockIdx( row, super_ );
4978  Int procRow = PROW( blockRowIdx, grid_ );
4979  Int dest = PNUM( procRow, procCol, grid_ );
4980  rowSend[displsSend[dest] + cntSize[dest]] = row;
4981  colSend[displsSend[dest] + cntSize[dest]] = col;
4982  cntSize[dest]++;
4983  } // for (i)
4984  } // for (j)
4985 
4986 
4987  // Check sizes match
4988  for( Int ip = 0; ip < mpisize; ip++ ){
4989  if( cntSize[ip] != sizeSend[ip] ){
4990 #ifdef USE_ABORT
4991  abort();
4992 #endif
4993  throw std::runtime_error( "Sizes of the sending information do not match." );
4994  }
4995  }
4996 
4997  // Alltoallv to exchange information
4998  mpi::Alltoallv(
4999  &rowSend[0], &sizeSend[0], &displsSend[0],
5000  &rowRecv[0], &sizeRecv[0], &displsRecv[0],
5001  grid_->comm );
5002  mpi::Alltoallv(
5003  &colSend[0], &sizeSend[0], &displsSend[0],
5004  &colRecv[0], &sizeRecv[0], &displsRecv[0],
5005  grid_->comm );
5006 
5007 #if ( _DEBUGlevel_ >= 1 )
5008  statusOFS << "Alltoallv communication of nonzero indices finished." << std::endl;
5009 #endif
5010 
5011 
5012 #if ( _DEBUGlevel_ >= 1 )
5013  for( Int ip = 0; ip < mpisize; ip++ ){
5014  statusOFS << "rowSend[" << ip << "] = " << rowSend[ip] << std::endl;
5015  statusOFS << "rowRecv[" << ip << "] = " << rowRecv[ip] << std::endl;
5016  statusOFS << "colSend[" << ip << "] = " << colSend[ip] << std::endl;
5017  statusOFS << "colRecv[" << ip << "] = " << colRecv[ip] << std::endl;
5018  }
5019 
5020 
5021  //DumpLU();
5022 
5023 
5024 
5025 #endif
5026 
5027  // For each (row, col), fill the nonzero values to valRecv locally.
5028  for( Int g = 0; g < sizeRecvTotal; g++ ){
5029  Int row = rowRecv[g];
5030  Int col = colRecv[g];
5031 
5032  Int blockRowIdx = BlockIdx( row, super_ );
5033  Int blockColIdx = BlockIdx( col, super_ );
5034 
5035  // Search for the nzval
5036  bool isFound = false;
5037 
5038  if( blockColIdx <= blockRowIdx ){
5039  // Data on the L side
5040 
5041  std::vector<LBlock<T> >& Lcol = this->L( LBj( blockColIdx, grid_ ) );
5042 
5043  for( Int ib = 0; ib < Lcol.size(); ib++ ){
5044 #if ( _DEBUGlevel_ >= 1 )
5045  statusOFS << "blockRowIdx = " << blockRowIdx << ", Lcol[ib].blockIdx = " << Lcol[ib].blockIdx << ", blockColIdx = " << blockColIdx << std::endl;
5046 #endif
5047  if( Lcol[ib].blockIdx == blockRowIdx ){
5048  IntNumVec& rows = Lcol[ib].rows;
5049  for( int iloc = 0; iloc < Lcol[ib].numRow; iloc++ ){
5050  if( rows[iloc] == row ){
5051  Int jloc = col - FirstBlockCol( blockColIdx, super_ );
5052  valRecv[g] = Lcol[ib].nzval( iloc, jloc );
5053  isFound = true;
5054  break;
5055  } // found the corresponding row
5056  }
5057  }
5058  if( isFound == true ) break;
5059  } // for (ib)
5060  }
5061  else{
5062  // Data on the U side
5063 
5064  std::vector<UBlock<T> >& Urow = this->U( LBi( blockRowIdx, grid_ ) );
5065 
5066  for( Int jb = 0; jb < Urow.size(); jb++ ){
5067  if( Urow[jb].blockIdx == blockColIdx ){
5068  IntNumVec& cols = Urow[jb].cols;
5069  for( int jloc = 0; jloc < Urow[jb].numCol; jloc++ ){
5070  if( cols[jloc] == col ){
5071  Int iloc = row - FirstBlockRow( blockRowIdx, super_ );
5072  valRecv[g] = Urow[jb].nzval( iloc, jloc );
5073  isFound = true;
5074  break;
5075  } // found the corresponding col
5076  }
5077  }
5078  if( isFound == true ) break;
5079  } // for (jb)
5080  } // if( blockColIdx <= blockRowIdx )
5081 
5082  // Did not find the corresponding row, set the value to zero.
5083  if( isFound == false ){
5084  statusOFS << "In the permuted order, (" << row << ", " << col <<
5085  ") is not found in PMatrix." << std::endl;
5086  valRecv[g] = ZERO<T>();
5087  }
5088 
5089  } // for (g)
5090 
5091 
5092  // Feed back valRecv to valSend through Alltoallv. NOTE: for the
5093  // values, the roles of "send" and "recv" are swapped.
5094  mpi::Alltoallv(
5095  &valRecv[0], &sizeRecv[0], &displsRecv[0],
5096  &valSend[0], &sizeSend[0], &displsSend[0],
5097  grid_->comm );
5098 
5099 #if ( _DEBUGlevel_ >= 1 )
5100  statusOFS << "Alltoallv communication of nonzero values finished." << std::endl;
5101 #endif
5102 
5103  // Put the nonzero values from valSend to the matrix B.
5104  B.size = A.size;
5105  B.nnz = A.nnz;
5106  B.nnzLocal = A.nnzLocal;
5107  B.colptrLocal = A.colptrLocal;
5108  B.rowindLocal = A.rowindLocal;
5109  B.nzvalLocal.Resize( B.nnzLocal );
5110  SetValue( B.nzvalLocal, ZERO<T>() );
5111  // Make sure that the communicators of A and B are the same.
5112  // FIXME Find a better way to compare the communicators
5113  // if( grid_->comm != A.comm ){
5114  // #ifdef USE_ABORT
5115  //abort();
5116  //#endif
5117  //throw std::runtime_error( "The DistSparseMatrix providing the pattern has a different communicator from PMatrix." );
5118  // }
5119  B.comm = grid_->comm;
5120 
5121  for( Int i = 0; i < mpisize; i++ )
5122  cntSize[i] = 0;
5123 
5124  rowPtr = B.rowindLocal.Data();
5125  colPtr = B.colptrLocal.Data();
5126  T* valPtr = B.nzvalLocal.Data();
5127 
5128  for( Int j = 0; j < numColLocal; j++ ){
5129  Int ocol = firstCol + j;
5130  Int col = perm[ perm_r[ ocol] ];
5131  Int blockColIdx = BlockIdx( col, super_ );
5132  Int procCol = PCOL( blockColIdx, grid_ );
5133  for( Int i = colPtr[j] - 1; i < colPtr[j+1] - 1; i++ ){
5134  Int orow = rowPtr[i]-1;
5135  Int row = perm[ orow ];
5136  Int blockRowIdx = BlockIdx( row, super_ );
5137  Int procRow = PROW( blockRowIdx, grid_ );
5138  Int dest = PNUM( procRow, procCol, grid_ );
5139  valPtr[i] = valSend[displsSend[dest] + cntSize[dest]];
5140  cntSize[dest]++;
5141  } // for (i)
5142  } // for (j)
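  // Editorial note: this scatter assumes that, for each destination process, the
  // values in valSend are grouped in the same column-major traversal order that
  // was used to build sizeSend and displsSend, so the running counter
  // cntSize[dest] is sufficient to locate each value.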
5143 
5144  // Check sizes match
5145  for( Int ip = 0; ip < mpisize; ip++ ){
5146  if( cntSize[ip] != sizeSend[ip] ){
5147 #ifdef USE_ABORT
5148  abort();
5149 #endif
5150  throw std::runtime_error( "Sizes of the sending information do not match." );
5151  }
5152  }
5153 
5154 
5155 #ifndef _RELEASE_
5156  PopCallStack();
5157 #endif
5158 
5159  return ;
5160  } // ----- end of method PMatrix::PMatrixToDistSparseMatrix -----
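A minimal usage sketch of this conversion, not part of the source: the matrix names and the template type are assumptions, and the selected inversion is assumed to have already been carried out on the PMatrix.

  // Hedged sketch (names assumed): AMat supplies the sparsity pattern and the
  // communicator; AinvMat receives the selected elements of the inverse.
  DistSparseMatrix<Complex> AMat;     // original matrix (pattern + values)
  DistSparseMatrix<Complex> AinvMat;  // output in distributed CSC format
  PMat.PMatrixToDistSparseMatrix( AMat, AinvMat );
  // AinvMat now shares AMat's size, nnz, colptrLocal, rowindLocal and communicator.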
5161 
5162 
5163 
5164 
 5165  template<typename T>
 5166  Int PMatrix<T>::NnzLocal ( )
 5167  {
5168 #ifndef _RELEASE_
5169  PushCallStack("PMatrix::NnzLocal");
5170 #endif
5171  Int numSuper = this->NumSuper();
5172  Int nnzLocal = 0;
5173  for( Int ksup = 0; ksup < numSuper; ksup++ ){
5174  if( MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
5175  std::vector<LBlock<T> >& Lcol = this->L( LBj( ksup, grid_ ) );
5176  for( Int ib = 0; ib < Lcol.size(); ib++ ){
5177  nnzLocal += Lcol[ib].numRow * Lcol[ib].numCol;
5178  }
5179  } // if I own the column of ksup
5180  if( MYROW( grid_ ) == PROW( ksup, grid_ ) ){
5181  std::vector<UBlock<T> >& Urow = this->U( LBi( ksup, grid_ ) );
5182  for( Int jb = 0; jb < Urow.size(); jb++ ){
5183  nnzLocal += Urow[jb].numRow * Urow[jb].numCol;
5184  }
5185  } // if I own the row of ksup
5186  }
5187 
5188 #ifndef _RELEASE_
5189  PopCallStack();
5190 #endif
5191 
5192  return nnzLocal;
5193  } // ----- end of method PMatrix::NnzLocal -----
5194 
5195 
5196  template<typename T>
5197  LongInt PMatrix<T>::Nnz ( )
5198  {
5199 #ifndef _RELEASE_
5200  PushCallStack("PMatrix::Nnz");
5201 #endif
5202  LongInt nnzLocal = LongInt( this->NnzLocal() );
5203  LongInt nnz;
5204 
5205  MPI_Allreduce( &nnzLocal, &nnz, 1, MPI_LONG_LONG, MPI_SUM, grid_->comm );
5206 
5207 #ifndef _RELEASE_
5208  PopCallStack();
5209 #endif
5210 
5211  return nnz;
5212  } // ----- end of method PMatrix::Nnz -----
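A small sketch of how these two counters differ (not from the source; statusOFS and PMat are assumed to be set up as elsewhere in PEXSI): NnzLocal only sums the sizes of the locally owned L and U blocks and involves no communication, whereas Nnz performs a collective reduction and must therefore be called by every process in the grid.

  Int     nnzMine  = PMat.NnzLocal();   // purely local, no communication
  LongInt nnzTotal = PMat.Nnz();        // collective MPI_Allreduce over the grid communicator
  statusOFS << "This process stores " << nnzMine << " of "
            << nnzTotal << " nonzeros of the selected inverse." << std::endl;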
5213 
5214  template< >
5215  inline void PMatrix<Real>::GetNegativeInertia ( Real& inertia )
5216  {
5217 #ifndef _RELEASE_
5218  PushCallStack("PMatrix::GetNegativeInertia");
5219 #endif
5220  Int numSuper = this->NumSuper();
5221 
5222  Real inertiaLocal = 0.0;
5223  inertia = 0.0;
5224 
5225  for( Int ksup = 0; ksup < numSuper; ksup++ ){
5226  // I own the diagonal block
5227  if( MYROW( grid_ ) == PROW( ksup, grid_ ) &&
5228  MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
5229  LBlock<Real> & LB = this->L( LBj( ksup, grid_ ) )[0];
5230  for( Int i = 0; i < LB.numRow; i++ ){
5231  if( LB.nzval(i, i) < 0 )
5232  inertiaLocal++;
5233  }
5234  }
5235  }
5236 
 5237  // Sum the local counts over all processors
5238  mpi::Allreduce( &inertiaLocal, &inertia, 1, MPI_SUM, grid_->comm );
5239 
5240 #ifndef _RELEASE_
5241  PopCallStack();
5242 #endif
5243 
5244  return ;
5245  } // ----- end of method PMatrix::GetNegativeInertia -----
5246 
5247 
5248  template< >
5249  inline void PMatrix<Complex>::GetNegativeInertia ( Real& inertia )
5250  {
5251 #ifndef _RELEASE_
5252  PushCallStack("PMatrix::GetNegativeInertia");
5253 #endif
5254  Int numSuper = this->NumSuper();
5255 
5256  Real inertiaLocal = 0.0;
5257  inertia = 0.0;
5258 
5259  for( Int ksup = 0; ksup < numSuper; ksup++ ){
5260  // I own the diagonal block
5261  if( MYROW( grid_ ) == PROW( ksup, grid_ ) &&
5262  MYCOL( grid_ ) == PCOL( ksup, grid_ ) ){
5263  LBlock<Complex> & LB = this->L( LBj( ksup, grid_ ) )[0];
5264  for( Int i = 0; i < LB.numRow; i++ ){
5265  if( LB.nzval(i, i).real() < 0 )
5266  inertiaLocal++;
5267  }
5268  }
5269  }
5270 
 5271  // Sum the local counts over all processors
5272  mpi::Allreduce( &inertiaLocal, &inertia, 1, MPI_SUM, grid_->comm );
5273 
5274 #ifndef _RELEASE_
5275  PopCallStack();
5276 #endif
5277 
5278  return ;
5279  } // ----- end of method PMatrix::GetNegativeInertia -----
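Both specializations count the negative (real parts of the) diagonal entries of the diagonal blocks, which, by Sylvester's law of inertia, equals the number of eigenvalues of the factored matrix that lie below the shift. A hedged usage sketch, assuming PMat holds the factorization of a shifted matrix such as H - sigma*S:

  Real inertia;
  PMat.GetNegativeInertia( inertia );   // collective Allreduce over the grid communicator
  statusOFS << "Eigenvalues below the shift: " << inertia << std::endl;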
5280 
5281  template<typename T>
5282  inline PMatrix<T> * PMatrix<T>::Create(const GridType * pGridType, const SuperNodeType * pSuper, const PSelInvOptions * pSelInvOpt , const SuperLUOptions * pLuOpt)
5283  {
5284  PMatrix<T> * pMat = NULL;
5285  if(pLuOpt->Symmetric == 0){
5286  pMat = new PMatrixUnsym<T>( pGridType, pSuper, pSelInvOpt, pLuOpt );
5287  }
5288  else{
5289  pMat = new PMatrix<T>( pGridType, pSuper, pSelInvOpt, pLuOpt );
5290  }
5291 
5292  return pMat;
5293  } // ----- end of factory method PMatrix::Create -----
5294 
5295  template<typename T>
5296  inline PMatrix<T> * PMatrix<T>::Create(const SuperLUOptions * pLuOpt)
5297  {
5298  PMatrix<T> * pMat = NULL;
5299  if(pLuOpt->Symmetric == 0){
5300  pMat = new PMatrixUnsym<T>();
5301  }
5302  else{
5303  pMat = new PMatrix<T>();
5304  }
5305 
5306  return pMat;
5307  } // ----- end of factory method PMatrix::Create -----
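The two factory overloads dispatch on SuperLUOptions::Symmetric: a nonzero value yields the symmetric PMatrix<T>, while zero yields PMatrixUnsym<T>. A minimal sketch of the fully specified overload; gridType and superNode are assumed to have been constructed elsewhere:

  SuperLUOptions luOpt;
  luOpt.Symmetric = 1;                 // 0 would select PMatrixUnsym<T> instead
  PSelInvOptions selInvOpt;
  PMatrix<Complex>* pMat =
      PMatrix<Complex>::Create( &gridType, &superNode, &selInvOpt, &luOpt );
  // ... fill and use pMat ...
  delete pMat;                         // the caller owns the returned object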
5308 
5309 
5310 
5311  template<typename T>
5312  inline void PMatrix<T>::DumpLU()
5313  {
5314 //#if ( _DEBUGlevel_ >= 1 )
5315  const IntNumVec& perm = super_->perm;
5316  const IntNumVec& permInv = super_->permInv;
5317 
5318  const IntNumVec * pPerm_r;
5319  const IntNumVec * pPermInv_r;
5320 
5321  if(optionsLU_->RowPerm=="NOROWPERM"){
5322  pPerm_r = &super_->perm;
5323  pPermInv_r = &super_->permInv;
5324  }
5325  else{
5326  pPerm_r = &super_->perm_r;
5327  pPermInv_r = &super_->permInv_r;
5328  }
5329 
5330  const IntNumVec& perm_r = *pPerm_r;
5331  const IntNumVec& permInv_r = *pPermInv_r;
5332 
5333   statusOFS << "Col perm is " << perm << std::endl;
5334   statusOFS << "Row perm is " << perm_r << std::endl;
5335 
5336   statusOFS << "Inv Col perm is " << permInv << std::endl;
5337   statusOFS << "Inv Row perm is " << permInv_r << std::endl;
5338 
5339 
5340 
5341 
5342   statusOFS << "Content of L" << std::endl;
5343   // Dump L
5344  for(Int j = 0;j<this->L_.size()-1;++j){
5345  std::vector<LBlock<T> >& Lcol = this->L( j );
5346  Int blockColIdx = GBj( j, this->grid_ );
5347  Int fc = FirstBlockCol( blockColIdx, this->super_ );
5348 
5349 
5350  for( Int ib = 0; ib < Lcol.size(); ib++ ){
5351  for(Int ir = 0; ir< Lcol[ib].rows.m(); ++ir){
5352  Int row = Lcol[ib].rows[ir];
5353  for(Int col = fc; col<fc+Lcol[ib].numCol;col++){
5354  Int ocol = permInv_r[permInv[col]];
5355  Int orow = permInv[row];
5356  Int jloc = col - FirstBlockCol( blockColIdx, this->super_ );
5357  Int iloc = ir;
5358  T val = Lcol[ib].nzval( iloc, jloc );
5359  statusOFS << "("<< orow<<", "<<ocol<<") == "<< "("<< row<<", "<<col<<") = "<<val<< std::endl;
5360  }
5361  }
5362  }
5363  }
5364 
5365   statusOFS << "Content of U" << std::endl;
5366 
5367   // Dump U
5368  for(Int i = 0;i<this->U_.size()-1;++i){
5369  std::vector<UBlock<T> >& Urow = this->U( i );
5370  Int blockRowIdx = GBi( i, this->grid_ );
5371  Int fr = FirstBlockRow( blockRowIdx, this->super_ );
5372  for( Int jb = 0; jb < Urow.size(); jb++ ){
5373  for(Int row = fr; row<fr+Urow[jb].numRow;row++){
5374  for(Int ic = 0; ic< Urow[jb].cols.m(); ++ic){
5375  Int col = Urow[jb].cols[ic];
5376  Int ocol = permInv_r[permInv[col]];
5377  Int orow = permInv[row];
5378  Int iloc = row - FirstBlockRow( blockRowIdx, this->super_ );
5379  Int jloc = ic;
5380  T val = Urow[jb].nzval( iloc, jloc );
5381  statusOFS << "("<< orow<<", "<<ocol<<") == "<< "("<< row<<", "<<col<<") = "<<val<< std::endl;
5382 
5383  }
5384  }
5385 
5386  }
5387  }
5388 //#endif
5389 
5390  }
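DumpLU is purely a debugging aid: it walks every locally stored L and U block and prints each entry in both the permuted ordering used internally and, through permInv and permInv_r, in the original ordering of the input matrix. A one-line usage sketch (statusOFS is assumed to be an open per-process log stream):

  PMat.DumpLU();   // prints one line per stored entry: (orow, ocol) == (row, col) = value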
5391 
5392 
5393  template<typename T>
5394  inline void PMatrix<T>::CopyLU( const PMatrix<T> & C){
5395  ColBlockIdx_ = C.ColBlockIdx_;
5396  RowBlockIdx_ = C.RowBlockIdx_;
5397  L_ = C.L_;
5398  U_ = C.U_;
5399  }
5400 
5401 
5402 } // namespace PEXSI
5403 
5404 
5405 
5406 #endif //_PEXSI_PSELINV_IMPL_HPP_